Merge remote-tracking branch 'upstream/develop' into feature/small-fixes
commit 02247cccaf

--- a/Makefile
+++ b/Makefile
@@ -1,7 +1,7 @@
 SHELL := /bin/bash

 ifndef SPACY_EXTRAS
-override SPACY_EXTRAS = spacy-lookups-data==0.4.0.dev0 jieba pkuseg==0.0.25 sudachipy sudachidict_core
+override SPACY_EXTRAS = spacy-lookups-data==1.0.0rc0 jieba pkuseg==0.0.25 pickle5 sudachipy sudachidict_core
 endif

 ifndef PYVER
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -6,7 +6,7 @@ requires = [
     "cymem>=2.0.2,<2.1.0",
     "preshed>=3.0.2,<3.1.0",
     "murmurhash>=0.28.0,<1.1.0",
-    "thinc>=8.0.0a41,<8.0.0a50",
+    "thinc>=8.0.0a43,<8.0.0a50",
     "blis>=0.4.0,<0.5.0",
     "pytokenizations",
     "pathy"
--- a/requirements.txt
+++ b/requirements.txt
@@ -1,12 +1,12 @@
 # Our libraries
 cymem>=2.0.2,<2.1.0
 preshed>=3.0.2,<3.1.0
-thinc>=8.0.0a41,<8.0.0a50
+thinc>=8.0.0a43,<8.0.0a50
 blis>=0.4.0,<0.5.0
 ml_datasets==0.2.0a0
 murmurhash>=0.28.0,<1.1.0
 wasabi>=0.8.0,<1.1.0
-srsly>=2.1.0,<3.0.0
+srsly>=2.3.0,<3.0.0
 catalogue>=2.0.1,<2.1.0
 typer>=0.3.0,<0.4.0
 pathy
--- a/setup.cfg
+++ b/setup.cfg
@@ -34,16 +34,16 @@ setup_requires =
     cymem>=2.0.2,<2.1.0
     preshed>=3.0.2,<3.1.0
     murmurhash>=0.28.0,<1.1.0
-    thinc>=8.0.0a41,<8.0.0a50
+    thinc>=8.0.0a43,<8.0.0a50
 install_requires =
     # Our libraries
     murmurhash>=0.28.0,<1.1.0
     cymem>=2.0.2,<2.1.0
     preshed>=3.0.2,<3.1.0
-    thinc>=8.0.0a41,<8.0.0a50
+    thinc>=8.0.0a43,<8.0.0a50
     blis>=0.4.0,<0.5.0
     wasabi>=0.8.0,<1.1.0
-    srsly>=2.1.0,<3.0.0
+    srsly>=2.3.0,<3.0.0
     catalogue>=2.0.1,<2.1.0
     typer>=0.3.0,<0.4.0
     pathy
@@ -65,7 +65,7 @@ console_scripts =

 [options.extras_require]
 lookups =
-    spacy_lookups_data==0.4.0.dev0
+    spacy_lookups_data==1.0.0rc0
 cuda =
     cupy>=5.0.0b4,<9.0.0
 cuda80 =
@@ -84,7 +84,7 @@ cuda102 =
     cupy-cuda102>=5.0.0b4,<9.0.0
 # Language tokenizers with external dependencies
 ja =
-    sudachipy>=0.4.5
+    sudachipy>=0.4.9
     sudachidict_core>=20200330
 ko =
     natto-py==0.9.0
@@ -98,7 +98,7 @@ universal = false
 formats = gztar

 [flake8]
-ignore = E203, E266, E501, E731, W503
+ignore = E203, E266, E501, E731, W503, E741
 max-line-length = 80
 select = B,C,E,F,W,T4,B9
 exclude =
--- a/spacy/about.py
+++ b/spacy/about.py
@@ -1,6 +1,6 @@
 # fmt: off
 __title__ = "spacy-nightly"
-__version__ = "3.0.0a26"
+__version__ = "3.0.0a29"
 __release__ = True
 __download_url__ = "https://github.com/explosion/spacy-models/releases/download"
 __compatibility__ = "https://raw.githubusercontent.com/explosion/spacy-models/master/compatibility.json"
--- a/spacy/cli/__init__.py
+++ b/spacy/cli/__init__.py
@@ -15,7 +15,7 @@ from .debug_config import debug_config  # noqa: F401
 from .debug_model import debug_model  # noqa: F401
 from .evaluate import evaluate  # noqa: F401
 from .convert import convert  # noqa: F401
-from .init_model import init_model  # noqa: F401
+from .init_pipeline import init_pipeline_cli  # noqa: F401
 from .init_config import init_config, fill_config  # noqa: F401
 from .validate import validate  # noqa: F401
 from .project.clone import project_clone  # noqa: F401
--- a/spacy/cli/_util.py
+++ b/spacy/cli/_util.py
@@ -10,12 +10,13 @@ from click import NoSuchOption
 from click.parser import split_arg_string
 from typer.main import get_command
 from contextlib import contextmanager
-from thinc.api import Config, ConfigValidationError
+from thinc.api import Config, ConfigValidationError, require_gpu
 from configparser import InterpolationError
 import os

 from ..schemas import ProjectConfigSchema, validate
 from ..util import import_file, run_command, make_tempdir, registry, logger
+from ..util import ENV_VARS

 if TYPE_CHECKING:
     from pathy import Pathy  # noqa: F401
@@ -39,7 +40,6 @@ commands to check and validate your config files, training and evaluation data,
 and custom model implementations.
 """
 INIT_HELP = """Commands for initializing configs and pipeline packages."""
-OVERRIDES_ENV_VAR = "SPACY_CONFIG_OVERRIDES"

 # Wrappers for Typer's annotations. Initially created to set defaults and to
 # keep the names short, but not needed at the moment.
@@ -65,7 +65,7 @@ def setup_cli() -> None:


 def parse_config_overrides(
-    args: List[str], env_var: Optional[str] = OVERRIDES_ENV_VAR
+    args: List[str], env_var: Optional[str] = ENV_VARS.CONFIG_OVERRIDES
 ) -> Dict[str, Any]:
     """Generate a dictionary of config overrides based on the extra arguments
     provided on the CLI, e.g. --training.batch_size to override
@@ -275,18 +275,6 @@ def import_code(code_path: Optional[Union[Path, str]]) -> None:
         msg.fail(f"Couldn't load Python code: {code_path}", e, exits=1)


-def get_sourced_components(config: Union[Dict[str, Any], Config]) -> List[str]:
-    """RETURNS (List[str]): All sourced components in the original config,
-    e.g. {"source": "en_core_web_sm"}. If the config contains a key
-    "factory", we assume it refers to a component factory.
-    """
-    return [
-        name
-        for name, cfg in config.get("components", {}).items()
-        if "factory" not in cfg and "source" in cfg
-    ]
-
-
 def upload_file(src: Path, dest: Union[str, "Pathy"]) -> None:
     """Upload a file.

@@ -458,3 +446,12 @@ def string_to_list(value: str, intify: bool = False) -> Union[List[str], List[in
             p = int(p)
         result.append(p)
     return result
+
+
+def setup_gpu(use_gpu: int) -> None:
+    """Configure the GPU and log info."""
+    if use_gpu >= 0:
+        msg.info(f"Using GPU: {use_gpu}")
+        require_gpu(use_gpu)
+    else:
+        msg.info("Using CPU")
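Note: the new setup_gpu helper centralizes the GPU/CPU branching that several commands previously duplicated. A minimal illustrative sketch of how a command is expected to call it (the use_gpu value mirrors the --gpu-id/-g option; the sketch itself is not part of the diff):

    # Illustrative only: how CLI commands use the shared helper added above.
    from spacy.cli._util import setup_gpu

    use_gpu = -1  # value of the --gpu-id/-g option; -1 means CPU
    setup_gpu(use_gpu)  # logs "Using CPU", or logs "Using GPU: N" and calls require_gpu(N)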
--- a/spacy/cli/convert.py
+++ b/spacy/cli/convert.py
@@ -9,7 +9,8 @@ import sys
 from ._util import app, Arg, Opt
 from ..training import docs_to_json
 from ..tokens import DocBin
-from ..training.converters import iob_to_docs, conll_ner_to_docs, json_to_docs, conllu_to_docs
+from ..training.converters import iob_to_docs, conll_ner_to_docs, json_to_docs
+from ..training.converters import conllu_to_docs


 # Converters are matched by file extension except for ner/iob, which are
--- a/spacy/cli/debug_config.py
+++ b/spacy/cli/debug_config.py
@@ -1,12 +1,14 @@
 from typing import Optional, Dict, Any, Union, List
 from pathlib import Path
 from wasabi import msg, table
-from thinc.api import Config, ConfigValidationError
+from thinc.api import Config
 from thinc.config import VARIABLE_RE
 import typer

 from ._util import Arg, Opt, show_validation_error, parse_config_overrides
 from ._util import import_code, debug_cli
+from ..schemas import ConfigSchemaTraining
+from ..util import registry
 from .. import util


@@ -52,10 +54,10 @@ def debug_config(
     with show_validation_error(config_path):
         config = util.load_config(config_path, overrides=overrides)
         nlp = util.load_model_from_config(config)
-        # Use the resolved config here in case user has one function returning
-        # a dict of corpora etc.
-        resolved = util.resolve_training_config(nlp.config)
-        check_section_refs(resolved, ["training.dev_corpus", "training.train_corpus"])
+        config = nlp.config.interpolate()
+        T = registry.resolve(config["training"], schema=ConfigSchemaTraining)
+        dot_names = [T["train_corpus"], T["dev_corpus"]]
+        util.resolve_dot_names(config, dot_names)
     msg.good("Config is valid")
     if show_vars:
         variables = get_variables(config)
@@ -97,23 +99,3 @@ def get_variables(config: Config) -> Dict[str, Any]:
         value = util.dot_to_object(config, path)
         result[variable] = repr(value)
     return result
-
-
-def check_section_refs(config: Config, fields: List[str]) -> None:
-    """Validate fields in the config that refer to other sections or values
-    (e.g. in the corpora) and make sure that those references exist.
-    """
-    errors = []
-    for field in fields:
-        # If the field doesn't exist in the config, we ignore it
-        try:
-            value = util.dot_to_object(config, field)
-        except KeyError:
-            continue
-        try:
-            util.dot_to_object(config, value)
-        except KeyError:
-            msg = f"not a valid section reference: {value}"
-            errors.append({"loc": field.split("."), "msg": msg})
-    if errors:
-        raise ConfigValidationError(config=config, errors=errors)
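Note: debug config now checks the corpora references by resolving the interpolated config directly rather than via the removed check_section_refs helper. A minimal sketch of that pattern, assuming a standard training config on disk (the file name is illustrative):

    from spacy import util
    from spacy.schemas import ConfigSchemaTraining

    config = util.load_config("config.cfg", interpolate=True)  # illustrative path
    T = util.registry.resolve(config["training"], schema=ConfigSchemaTraining)
    # Resolving the dot names surfaces a config validation error if, for example,
    # training.train_corpus points at a section that doesn't exist.
    train_corpus, dev_corpus = util.resolve_dot_names(
        config, [T["train_corpus"], T["dev_corpus"]]
    )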
--- a/spacy/cli/debug_data.py
+++ b/spacy/cli/debug_data.py
@@ -7,10 +7,13 @@ from wasabi import Printer, MESSAGES, msg
 import typer

 from ._util import app, Arg, Opt, show_validation_error, parse_config_overrides
-from ._util import import_code, debug_cli, get_sourced_components
-from ..training import Corpus, Example
+from ._util import import_code, debug_cli
+from ..training import Example
+from ..training.initialize import get_sourced_components
+from ..schemas import ConfigSchemaTraining
 from ..pipeline._parser_internals import nonproj
 from ..language import Language
+from ..util import registry, resolve_dot_names
 from .. import util


@@ -24,7 +27,7 @@ BLANK_MODEL_THRESHOLD = 2000


 @debug_cli.command(
-    "data", context_settings={"allow_extra_args": True, "ignore_unknown_options": True},
+    "data", context_settings={"allow_extra_args": True, "ignore_unknown_options": True}
 )
 @app.command(
     "debug-data",
@@ -34,8 +37,6 @@ BLANK_MODEL_THRESHOLD = 2000
 def debug_data_cli(
     # fmt: off
     ctx: typer.Context,  # This is only used to read additional arguments
-    train_path: Path = Arg(..., help="Location of JSON-formatted training data", exists=True),
-    dev_path: Path = Arg(..., help="Location of JSON-formatted development data", exists=True),
     config_path: Path = Arg(..., help="Path to config file", exists=True),
     code_path: Optional[Path] = Opt(None, "--code-path", "-c", help="Path to Python file with additional code (registered functions) to be imported"),
     ignore_warnings: bool = Opt(False, "--ignore-warnings", "-IW", help="Ignore warnings, only show stats and errors"),
@@ -59,8 +60,6 @@ def debug_data_cli(
     overrides = parse_config_overrides(ctx.args)
     import_code(code_path)
     debug_data(
-        train_path,
-        dev_path,
         config_path,
         config_overrides=overrides,
         ignore_warnings=ignore_warnings,
@@ -71,8 +70,6 @@ def debug_data_cli(


 def debug_data(
-    train_path: Path,
-    dev_path: Path,
     config_path: Path,
     *,
     config_overrides: Dict[str, Any] = {},
@@ -85,57 +82,29 @@ def debug_data(
         no_print=silent, pretty=not no_format, ignore_warnings=ignore_warnings
     )
     # Make sure all files and paths exists if they are needed
-    if not train_path.exists():
-        msg.fail("Training data not found", train_path, exits=1)
-    if not dev_path.exists():
-        msg.fail("Development data not found", dev_path, exits=1)
-    if not config_path.exists():
-        msg.fail("Config file not found", config_path, exists=1)
     with show_validation_error(config_path):
         cfg = util.load_config(config_path, overrides=config_overrides)
         nlp = util.load_model_from_config(cfg)
-        C = util.resolve_training_config(nlp.config)
+        config = nlp.config.interpolate()
+        T = registry.resolve(config["training"], schema=ConfigSchemaTraining)
     # Use original config here, not resolved version
     sourced_components = get_sourced_components(cfg)
-    frozen_components = C["training"]["frozen_components"]
+    frozen_components = T["frozen_components"]
     resume_components = [p for p in sourced_components if p not in frozen_components]
     pipeline = nlp.pipe_names
     factory_names = [nlp.get_pipe_meta(pipe).factory for pipe in nlp.pipe_names]
-    tag_map_path = util.ensure_path(C["training"]["tag_map"])
-    tag_map = {}
-    if tag_map_path is not None:
-        tag_map = srsly.read_json(tag_map_path)
-    morph_rules_path = util.ensure_path(C["training"]["morph_rules"])
-    morph_rules = {}
-    if morph_rules_path is not None:
-        morph_rules = srsly.read_json(morph_rules_path)
-    # Replace tag map with provided mapping
-    nlp.vocab.morphology.load_tag_map(tag_map)
-    # Load morph rules
-    nlp.vocab.morphology.load_morph_exceptions(morph_rules)

     msg.divider("Data file validation")

     # Create the gold corpus to be able to better analyze data
-    loading_train_error_message = ""
-    loading_dev_error_message = ""
-    with msg.loading("Loading corpus..."):
-        try:
-            train_dataset = list(Corpus(train_path)(nlp))
-        except ValueError as e:
-            loading_train_error_message = f"Training data cannot be loaded: {e}"
-        try:
-            dev_dataset = list(Corpus(dev_path)(nlp))
-        except ValueError as e:
-            loading_dev_error_message = f"Development data cannot be loaded: {e}"
-    if loading_train_error_message or loading_dev_error_message:
-        if loading_train_error_message:
-            msg.fail(loading_train_error_message)
-        if loading_dev_error_message:
-            msg.fail(loading_dev_error_message)
-        sys.exit(1)
+    dot_names = [T["train_corpus"], T["dev_corpus"]]
+    train_corpus, dev_corpus = resolve_dot_names(config, dot_names)
+    train_dataset = list(train_corpus(nlp))
+    dev_dataset = list(dev_corpus(nlp))
     msg.good("Corpus is loadable")
+    nlp.initialize(lambda: train_dataset)
+    msg.good("Pipeline can be initialized with data")

     # Create all gold data here to avoid iterating over the train_dataset constantly
     gold_train_data = _compile_gold(train_dataset, factory_names, nlp, make_proj=True)
     gold_train_unpreprocessed_data = _compile_gold(
@@ -145,10 +114,10 @@ def debug_data(

     train_texts = gold_train_data["texts"]
     dev_texts = gold_dev_data["texts"]
-    frozen_components = C["training"]["frozen_components"]
+    frozen_components = T["frozen_components"]

     msg.divider("Training stats")
-    msg.text(f"Language: {C['nlp']['lang']}")
+    msg.text(f"Language: {nlp.lang}")
     msg.text(f"Training pipeline: {', '.join(pipeline)}")
     if resume_components:
         msg.text(f"Components from other pipelines: {', '.join(resume_components)}")
@@ -355,17 +324,12 @@ def debug_data(
     if "tagger" in factory_names:
         msg.divider("Part-of-speech Tagging")
         labels = [label for label in gold_train_data["tags"]]
-        tag_map = nlp.vocab.morphology.tag_map
-        msg.info(f"{len(labels)} label(s) in data ({len(tag_map)} label(s) in tag map)")
+        # TODO: does this need to be updated?
+        msg.info(f"{len(labels)} label(s) in data")
         labels_with_counts = _format_labels(
             gold_train_data["tags"].most_common(), counts=True
         )
         msg.text(labels_with_counts, show=verbose)
-        non_tagmap = [l for l in labels if l not in tag_map]
-        if not non_tagmap:
-            msg.good(f"All labels present in tag map for language '{nlp.lang}'")
-        for label in non_tagmap:
-            msg.fail(f"Label '{label}' not found in tag map for language '{nlp.lang}'")

     if "parser" in factory_names:
         has_low_data_warning = False
--- a/spacy/cli/debug_model.py
+++ b/spacy/cli/debug_model.py
@@ -2,18 +2,23 @@ from typing import Dict, Any, Optional, Iterable
 from pathlib import Path

 from spacy.training import Example
-from spacy.util import dot_to_object
+from spacy.util import resolve_dot_names
 from wasabi import msg
-from thinc.api import require_gpu, fix_random_seed, set_dropout_rate, Adam
+from thinc.api import fix_random_seed, set_dropout_rate, Adam
 from thinc.api import Model, data_validation, set_gpu_allocator
 import typer

 from ._util import Arg, Opt, debug_cli, show_validation_error
-from ._util import parse_config_overrides, string_to_list
+from ._util import parse_config_overrides, string_to_list, setup_gpu
+from ..schemas import ConfigSchemaTraining
+from ..util import registry
 from .. import util


-@debug_cli.command("model")
+@debug_cli.command(
+    "model",
+    context_settings={"allow_extra_args": True, "ignore_unknown_options": True},
+)
 def debug_model_cli(
     # fmt: off
     ctx: typer.Context,  # This is only used to read additional arguments
@@ -37,11 +42,7 @@ def debug_model_cli(

     DOCS: https://nightly.spacy.io/api/cli#debug-model
     """
-    if use_gpu >= 0:
-        msg.info("Using GPU")
-        require_gpu(use_gpu)
-    else:
-        msg.info("Using CPU")
+    setup_gpu(use_gpu)
     layers = string_to_list(layers, intify=True)
     print_settings = {
         "dimensions": dimensions,
@@ -59,14 +60,15 @@ def debug_model_cli(
         raw_config = util.load_config(
             config_path, overrides=config_overrides, interpolate=False
         )
-    config = raw_config.iterpolate()
+    config = raw_config.interpolate()
     allocator = config["training"]["gpu_allocator"]
     if use_gpu >= 0 and allocator:
         set_gpu_allocator(allocator)
     with show_validation_error(config_path):
         nlp = util.load_model_from_config(raw_config)
-        C = util.resolve_training_config(nlp.config)
-        seed = C["training"]["seed"]
+        config = nlp.config.interpolate()
+        T = registry.resolve(config["training"], schema=ConfigSchemaTraining)
+        seed = T["seed"]
     if seed is not None:
         msg.info(f"Fixing random seed: {seed}")
         fix_random_seed(seed)
@@ -77,11 +79,16 @@ def debug_model_cli(
             exits=1,
         )
     model = pipe.model
-    debug_model(C, nlp, model, print_settings=print_settings)
+    debug_model(config, T, nlp, model, print_settings=print_settings)


 def debug_model(
-    config, nlp, model: Model, *, print_settings: Optional[Dict[str, Any]] = None
+    config,
+    resolved_train_config,
+    nlp,
+    model: Model,
+    *,
+    print_settings: Optional[Dict[str, Any]] = None,
 ):
     if not isinstance(model, Model):
         msg.fail(
@@ -102,13 +109,16 @@ def debug_model(
     # The output vector might differ from the official type of the output layer
     with data_validation(False):
         try:
-            train_corpus = dot_to_object(config, config["training"]["train_corpus"])
-            nlp.begin_training(lambda: train_corpus(nlp))
+            dot_names = [resolved_train_config["train_corpus"]]
+            with show_validation_error():
+                (train_corpus,) = resolve_dot_names(config, dot_names)
+                nlp.initialize(lambda: train_corpus(nlp))
             msg.info("Initialized the model with the training corpus.")
         except ValueError:
             try:
                 _set_output_dim(nO=7, model=model)
-                nlp.begin_training(lambda: [Example.from_dict(x, {}) for x in X])
+                with show_validation_error():
+                    nlp.initialize(lambda: [Example.from_dict(x, {}) for x in X])
                 msg.info("Initialized the model with dummy data.")
             except Exception:
                 msg.fail(
--- a/spacy/cli/evaluate.py
+++ b/spacy/cli/evaluate.py
@@ -3,11 +3,11 @@ from wasabi import Printer
 from pathlib import Path
 import re
 import srsly
-from thinc.api import require_gpu, fix_random_seed
+from thinc.api import fix_random_seed

 from ..training import Corpus
 from ..tokens import Doc
-from ._util import app, Arg, Opt
+from ._util import app, Arg, Opt, setup_gpu, import_code
 from ..scorer import Scorer
 from .. import util
 from .. import displacy
@@ -19,6 +19,7 @@ def evaluate_cli(
     model: str = Arg(..., help="Model name or path"),
     data_path: Path = Arg(..., help="Location of binary evaluation data in .spacy format", exists=True),
     output: Optional[Path] = Opt(None, "--output", "-o", help="Output JSON file for metrics", dir_okay=False),
+    code_path: Optional[Path] = Opt(None, "--code", "-c", help="Path to Python file with additional code (registered functions) to be imported"),
     use_gpu: int = Opt(-1, "--gpu-id", "-g", help="GPU ID or -1 for CPU"),
     gold_preproc: bool = Opt(False, "--gold-preproc", "-G", help="Use gold preprocessing"),
     displacy_path: Optional[Path] = Opt(None, "--displacy-path", "-dp", help="Directory to output rendered parses as HTML", exists=True, file_okay=False),
@@ -37,6 +38,7 @@ def evaluate_cli(

     DOCS: https://nightly.spacy.io/api/cli#evaluate
     """
+    import_code(code_path)
     evaluate(
         model,
         data_path,
@@ -61,8 +63,7 @@ def evaluate(
 ) -> Scorer:
     msg = Printer(no_print=silent, pretty=not silent)
     fix_random_seed()
-    if use_gpu >= 0:
-        require_gpu(use_gpu)
+    setup_gpu(use_gpu)
     data_path = util.ensure_path(data_path)
     output_path = util.ensure_path(output)
     displacy_path = util.ensure_path(displacy_path)
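Note: evaluate now accepts --code and runs import_code before loading the model, in line with the other commands. A hypothetical example of such a file (the component name and its no-op logic are made up purely for illustration):

    # functions.py -- hypothetical file passed via --code / -c
    from spacy.language import Language

    @Language.component("my_debug_component")  # name is illustrative
    def my_debug_component(doc):
        # No-op custom component, registered so a pipeline that references it can load.
        return doc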
--- a/spacy/cli/init_model.py
+++ /dev/null
@@ -1,360 +0,0 @@
-from typing import Optional, List, Dict, Any, Union, IO
-import math
-from tqdm import tqdm
-import numpy
-from ast import literal_eval
-from pathlib import Path
-from preshed.counter import PreshCounter
-import tarfile
-import gzip
-import zipfile
-import srsly
-import warnings
-from wasabi import msg, Printer
-import typer
-
-from ._util import app, init_cli, Arg, Opt
-from ..vectors import Vectors
-from ..errors import Errors, Warnings
-from ..language import Language
-from ..util import ensure_path, get_lang_class, load_model, OOV_RANK
-
-try:
-    import ftfy
-except ImportError:
-    ftfy = None
-
-
-DEFAULT_OOV_PROB = -20
-
-
-@init_cli.command("vocab")
-@app.command(
-    "init-model",
-    context_settings={"allow_extra_args": True, "ignore_unknown_options": True},
-    hidden=True,  # hide this from main CLI help but still allow it to work with warning
-)
-def init_model_cli(
-    # fmt: off
-    ctx: typer.Context,  # This is only used to read additional arguments
-    lang: str = Arg(..., help="Pipeline language"),
-    output_dir: Path = Arg(..., help="Pipeline output directory"),
-    freqs_loc: Optional[Path] = Arg(None, help="Location of words frequencies file", exists=True),
-    clusters_loc: Optional[Path] = Opt(None, "--clusters-loc", "-c", help="Optional location of brown clusters data", exists=True),
-    jsonl_loc: Optional[Path] = Opt(None, "--jsonl-loc", "-j", help="Location of JSONL-formatted attributes file", exists=True),
-    vectors_loc: Optional[Path] = Opt(None, "--vectors-loc", "-v", help="Optional vectors file in Word2Vec format", exists=True),
-    prune_vectors: int = Opt(-1, "--prune-vectors", "-V", help="Optional number of vectors to prune to"),
-    truncate_vectors: int = Opt(0, "--truncate-vectors", "-t", help="Optional number of vectors to truncate to when reading in vectors file"),
-    vectors_name: Optional[str] = Opt(None, "--vectors-name", "-vn", help="Optional name for the word vectors, e.g. en_core_web_lg.vectors"),
-    model_name: Optional[str] = Opt(None, "--meta-name", "-mn", help="Optional name of the package for the pipeline meta"),
-    base_model: Optional[str] = Opt(None, "--base", "-b", help="Name of or path to base pipeline to start with (mostly relevant for pipelines with custom tokenizers)")
-    # fmt: on
-):
-    """
-    Create a new blank pipeline directory with vocab and vectors from raw data.
-    If vectors are provided in Word2Vec format, they can be either a .txt or
-    zipped as a .zip or .tar.gz.
-
-    DOCS: https://nightly.spacy.io/api/cli#init-vocab
-    """
-    if ctx.command.name == "init-model":
-        msg.warn(
-            "The init-model command is now called 'init vocab'. You can run "
-            "'python -m spacy init --help' for an overview of the other "
-            "available initialization commands."
-        )
-    init_model(
-        lang,
-        output_dir,
-        freqs_loc=freqs_loc,
-        clusters_loc=clusters_loc,
-        jsonl_loc=jsonl_loc,
-        vectors_loc=vectors_loc,
-        prune_vectors=prune_vectors,
-        truncate_vectors=truncate_vectors,
-        vectors_name=vectors_name,
-        model_name=model_name,
-        base_model=base_model,
-        silent=False,
-    )
-
-
-def init_model(
-    lang: str,
-    output_dir: Path,
-    freqs_loc: Optional[Path] = None,
-    clusters_loc: Optional[Path] = None,
-    jsonl_loc: Optional[Path] = None,
-    vectors_loc: Optional[Path] = None,
-    prune_vectors: int = -1,
-    truncate_vectors: int = 0,
-    vectors_name: Optional[str] = None,
-    model_name: Optional[str] = None,
-    base_model: Optional[str] = None,
-    silent: bool = True,
-) -> Language:
-    msg = Printer(no_print=silent, pretty=not silent)
-    if jsonl_loc is not None:
-        if freqs_loc is not None or clusters_loc is not None:
-            settings = ["-j"]
-            if freqs_loc:
-                settings.append("-f")
-            if clusters_loc:
-                settings.append("-c")
-            msg.warn(
-                "Incompatible arguments",
-                "The -f and -c arguments are deprecated, and not compatible "
-                "with the -j argument, which should specify the same "
-                "information. Either merge the frequencies and clusters data "
-                "into the JSONL-formatted file (recommended), or use only the "
-                "-f and -c files, without the other lexical attributes.",
-            )
-        jsonl_loc = ensure_path(jsonl_loc)
-        lex_attrs = srsly.read_jsonl(jsonl_loc)
-    else:
-        clusters_loc = ensure_path(clusters_loc)
-        freqs_loc = ensure_path(freqs_loc)
-        if freqs_loc is not None and not freqs_loc.exists():
-            msg.fail("Can't find words frequencies file", freqs_loc, exits=1)
-        lex_attrs = read_attrs_from_deprecated(msg, freqs_loc, clusters_loc)
-
-    with msg.loading("Creating blank pipeline..."):
-        nlp = create_model(lang, lex_attrs, name=model_name, base_model=base_model)
-
-    msg.good("Successfully created blank pipeline")
-    if vectors_loc is not None:
-        add_vectors(
-            msg, nlp, vectors_loc, truncate_vectors, prune_vectors, vectors_name
-        )
-    vec_added = len(nlp.vocab.vectors)
-    lex_added = len(nlp.vocab)
-    msg.good(
-        "Sucessfully compiled vocab", f"{lex_added} entries, {vec_added} vectors",
-    )
-    if not output_dir.exists():
-        output_dir.mkdir()
-    nlp.to_disk(output_dir)
-    return nlp
-
-
-def open_file(loc: Union[str, Path]) -> IO:
-    """Handle .gz, .tar.gz or unzipped files"""
-    loc = ensure_path(loc)
-    if tarfile.is_tarfile(str(loc)):
-        return tarfile.open(str(loc), "r:gz")
-    elif loc.parts[-1].endswith("gz"):
-        return (line.decode("utf8") for line in gzip.open(str(loc), "r"))
-    elif loc.parts[-1].endswith("zip"):
-        zip_file = zipfile.ZipFile(str(loc))
-        names = zip_file.namelist()
-        file_ = zip_file.open(names[0])
-        return (line.decode("utf8") for line in file_)
-    else:
-        return loc.open("r", encoding="utf8")
-
-
-def read_attrs_from_deprecated(
-    msg: Printer, freqs_loc: Optional[Path], clusters_loc: Optional[Path]
-) -> List[Dict[str, Any]]:
-    if freqs_loc is not None:
-        with msg.loading("Counting frequencies..."):
-            probs, _ = read_freqs(freqs_loc)
-        msg.good("Counted frequencies")
-    else:
-        probs, _ = ({}, DEFAULT_OOV_PROB)  # noqa: F841
-    if clusters_loc:
-        with msg.loading("Reading clusters..."):
-            clusters = read_clusters(clusters_loc)
-        msg.good("Read clusters")
-    else:
-        clusters = {}
-    lex_attrs = []
-    sorted_probs = sorted(probs.items(), key=lambda item: item[1], reverse=True)
-    if len(sorted_probs):
-        for i, (word, prob) in tqdm(enumerate(sorted_probs)):
-            attrs = {"orth": word, "id": i, "prob": prob}
-            # Decode as a little-endian string, so that we can do & 15 to get
-            # the first 4 bits. See _parse_features.pyx
-            if word in clusters:
-                attrs["cluster"] = int(clusters[word][::-1], 2)
-            else:
-                attrs["cluster"] = 0
-            lex_attrs.append(attrs)
-    return lex_attrs
-
-
-def create_model(
-    lang: str,
-    lex_attrs: List[Dict[str, Any]],
-    name: Optional[str] = None,
-    base_model: Optional[Union[str, Path]] = None,
-) -> Language:
-    if base_model:
-        nlp = load_model(base_model)
-        # keep the tokenizer but remove any existing pipeline components due to
-        # potentially conflicting vectors
-        for pipe in nlp.pipe_names:
-            nlp.remove_pipe(pipe)
-    else:
-        lang_class = get_lang_class(lang)
-        nlp = lang_class()
-    for lexeme in nlp.vocab:
-        lexeme.rank = OOV_RANK
-    for attrs in lex_attrs:
-        if "settings" in attrs:
-            continue
-        lexeme = nlp.vocab[attrs["orth"]]
-        lexeme.set_attrs(**attrs)
-    if len(nlp.vocab):
-        oov_prob = min(lex.prob for lex in nlp.vocab) - 1
-    else:
-        oov_prob = DEFAULT_OOV_PROB
-    nlp.vocab.cfg.update({"oov_prob": oov_prob})
-    if name:
-        nlp.meta["name"] = name
-    return nlp
-
-
-def add_vectors(
-    msg: Printer,
-    nlp: Language,
-    vectors_loc: Optional[Path],
-    truncate_vectors: int,
-    prune_vectors: int,
-    name: Optional[str] = None,
-) -> None:
-    vectors_loc = ensure_path(vectors_loc)
-    if vectors_loc and vectors_loc.parts[-1].endswith(".npz"):
-        nlp.vocab.vectors = Vectors(data=numpy.load(vectors_loc.open("rb")))
-        for lex in nlp.vocab:
-            if lex.rank and lex.rank != OOV_RANK:
-                nlp.vocab.vectors.add(lex.orth, row=lex.rank)
-    else:
-        if vectors_loc:
-            with msg.loading(f"Reading vectors from {vectors_loc}"):
-                vectors_data, vector_keys = read_vectors(
-                    msg, vectors_loc, truncate_vectors
-                )
-            msg.good(f"Loaded vectors from {vectors_loc}")
-        else:
-            vectors_data, vector_keys = (None, None)
-        if vector_keys is not None:
-            for word in vector_keys:
-                if word not in nlp.vocab:
-                    nlp.vocab[word]
-        if vectors_data is not None:
-            nlp.vocab.vectors = Vectors(data=vectors_data, keys=vector_keys)
-    if name is None:
-        # TODO: Is this correct? Does this matter?
-        nlp.vocab.vectors.name = f"{nlp.meta['lang']}_{nlp.meta['name']}.vectors"
-    else:
-        nlp.vocab.vectors.name = name
-    nlp.meta["vectors"]["name"] = nlp.vocab.vectors.name
-    if prune_vectors >= 1:
-        nlp.vocab.prune_vectors(prune_vectors)
-
-
-def read_vectors(msg: Printer, vectors_loc: Path, truncate_vectors: int):
-    f = open_file(vectors_loc)
-    f = ensure_shape(f)
-    shape = tuple(int(size) for size in next(f).split())
-    if truncate_vectors >= 1:
-        shape = (truncate_vectors, shape[1])
-    vectors_data = numpy.zeros(shape=shape, dtype="f")
-    vectors_keys = []
-    for i, line in enumerate(tqdm(f)):
-        line = line.rstrip()
-        pieces = line.rsplit(" ", vectors_data.shape[1])
-        word = pieces.pop(0)
-        if len(pieces) != vectors_data.shape[1]:
-            msg.fail(Errors.E094.format(line_num=i, loc=vectors_loc), exits=1)
-        vectors_data[i] = numpy.asarray(pieces, dtype="f")
-        vectors_keys.append(word)
-        if i == truncate_vectors - 1:
-            break
-    return vectors_data, vectors_keys
-
-
-def ensure_shape(lines):
-    """Ensure that the first line of the data is the vectors shape.
-
-    If it's not, we read in the data and output the shape as the first result,
-    so that the reader doesn't have to deal with the problem.
-    """
-    first_line = next(lines)
-    try:
-        shape = tuple(int(size) for size in first_line.split())
-    except ValueError:
-        shape = None
-    if shape is not None:
-        # All good, give the data
-        yield first_line
-        yield from lines
-    else:
-        # Figure out the shape, make it the first value, and then give the
-        # rest of the data.
-        width = len(first_line.split()) - 1
-        captured = [first_line] + list(lines)
-        length = len(captured)
-        yield f"{length} {width}"
-        yield from captured
-
-
-def read_freqs(
-    freqs_loc: Path, max_length: int = 100, min_doc_freq: int = 5, min_freq: int = 50
-):
-    counts = PreshCounter()
-    total = 0
-    with freqs_loc.open() as f:
-        for i, line in enumerate(f):
-            freq, doc_freq, key = line.rstrip().split("\t", 2)
-            freq = int(freq)
-            counts.inc(i + 1, freq)
-            total += freq
-    counts.smooth()
-    log_total = math.log(total)
-    probs = {}
-    with freqs_loc.open() as f:
-        for line in tqdm(f):
-            freq, doc_freq, key = line.rstrip().split("\t", 2)
-            doc_freq = int(doc_freq)
-            freq = int(freq)
-            if doc_freq >= min_doc_freq and freq >= min_freq and len(key) < max_length:
-                try:
-                    word = literal_eval(key)
-                except SyntaxError:
-                    # Take odd strings literally.
-                    word = literal_eval(f"'{key}'")
-                smooth_count = counts.smoother(int(freq))
-                probs[word] = math.log(smooth_count) - log_total
-    oov_prob = math.log(counts.smoother(0)) - log_total
-    return probs, oov_prob
-
-
-def read_clusters(clusters_loc: Path) -> dict:
-    clusters = {}
-    if ftfy is None:
-        warnings.warn(Warnings.W004)
-    with clusters_loc.open() as f:
-        for line in tqdm(f):
-            try:
-                cluster, word, freq = line.split()
-                if ftfy is not None:
-                    word = ftfy.fix_text(word)
-            except ValueError:
-                continue
-            # If the clusterer has only seen the word a few times, its
-            # cluster is unreliable.
-            if int(freq) >= 3:
-                clusters[word] = cluster
-            else:
-                clusters[word] = "0"
-    # Expand clusters with re-casing
-    for word, cluster in list(clusters.items()):
-        if word.lower() not in clusters:
-            clusters[word.lower()] = cluster
-        if word.title() not in clusters:
-            clusters[word.title()] = cluster
-        if word.upper() not in clusters:
-            clusters[word.upper()] = cluster
-    return clusters
--- /dev/null
+++ b/spacy/cli/init_pipeline.py (new file, 117 lines)
@@ -0,0 +1,117 @@
+from typing import Optional
+import logging
+from pathlib import Path
+from wasabi import msg
+import typer
+import srsly
+
+from .. import util
+from ..training.initialize import init_nlp, convert_vectors
+from ..language import Language
+from ._util import init_cli, Arg, Opt, parse_config_overrides, show_validation_error
+from ._util import import_code, setup_gpu
+
+
+@init_cli.command("vectors")
+def init_vectors_cli(
+    # fmt: off
+    lang: str = Arg(..., help="The language of the nlp object to create"),
+    vectors_loc: Path = Arg(..., help="Vectors file in Word2Vec format", exists=True),
+    output_dir: Path = Arg(..., help="Pipeline output directory"),
+    prune: int = Opt(-1, "--prune", "-p", help="Optional number of vectors to prune to"),
+    truncate: int = Opt(0, "--truncate", "-t", help="Optional number of vectors to truncate to when reading in vectors file"),
+    name: Optional[str] = Opt(None, "--name", "-n", help="Optional name for the word vectors, e.g. en_core_web_lg.vectors"),
+    verbose: bool = Opt(False, "--verbose", "-V", "-VV", help="Display more information for debugging purposes"),
+    jsonl_loc: Optional[Path] = Opt(None, "--lexemes-jsonl", "-j", help="Location of JSONL-formatted attributes file", hidden=True),
+    # fmt: on
+):
+    """Convert word vectors for use with spaCy. Will export an nlp object that
+    you can use in the [initialize] block of your config to initialize
+    a model with vectors.
+    """
+    util.logger.setLevel(logging.DEBUG if verbose else logging.INFO)
+    msg.info(f"Creating blank nlp object for language '{lang}'")
+    nlp = util.get_lang_class(lang)()
+    if jsonl_loc is not None:
+        update_lexemes(nlp, jsonl_loc)
+    convert_vectors(nlp, vectors_loc, truncate=truncate, prune=prune, name=name)
+    msg.good(f"Successfully converted {len(nlp.vocab.vectors)} vectors")
+    nlp.to_disk(output_dir)
+    msg.good(
+        "Saved nlp object with vectors to output directory. You can now use the "
+        "path to it in your config as the 'vectors' setting in [initialize.vocab].",
+        output_dir.resolve(),
+    )
+
+
+def update_lexemes(nlp: Language, jsonl_loc: Path) -> None:
+    # Mostly used for backwards-compatibility and may be removed in the future
+    lex_attrs = srsly.read_jsonl(jsonl_loc)
+    for attrs in lex_attrs:
+        if "settings" in attrs:
+            continue
+        lexeme = nlp.vocab[attrs["orth"]]
+        lexeme.set_attrs(**attrs)
+
+
+@init_cli.command(
+    "nlp",
+    context_settings={"allow_extra_args": True, "ignore_unknown_options": True},
+    hidden=True,
+)
+def init_pipeline_cli(
+    # fmt: off
+    ctx: typer.Context,  # This is only used to read additional arguments
+    config_path: Path = Arg(..., help="Path to config file", exists=True),
+    output_path: Path = Arg(..., help="Output directory for the prepared data"),
+    code_path: Optional[Path] = Opt(None, "--code", "-c", help="Path to Python file with additional code (registered functions) to be imported"),
+    verbose: bool = Opt(False, "--verbose", "-V", "-VV", help="Display more information for debugging purposes"),
+    use_gpu: int = Opt(-1, "--gpu-id", "-g", help="GPU ID or -1 for CPU")
+    # fmt: on
+):
+    util.logger.setLevel(logging.DEBUG if verbose else logging.INFO)
+    overrides = parse_config_overrides(ctx.args)
+    import_code(code_path)
+    setup_gpu(use_gpu)
+    with show_validation_error(config_path):
+        config = util.load_config(config_path, overrides=overrides)
+    with show_validation_error(hint_fill=False):
+        nlp = init_nlp(config, use_gpu=use_gpu)
+    nlp.to_disk(output_path)
+    msg.good(f"Saved initialized pipeline to {output_path}")
+
+
+@init_cli.command(
+    "labels",
+    context_settings={"allow_extra_args": True, "ignore_unknown_options": True},
+)
+def init_labels_cli(
+    # fmt: off
+    ctx: typer.Context,  # This is only used to read additional arguments
+    config_path: Path = Arg(..., help="Path to config file", exists=True),
+    output_path: Path = Arg(..., help="Output directory for the labels"),
+    code_path: Optional[Path] = Opt(None, "--code", "-c", help="Path to Python file with additional code (registered functions) to be imported"),
+    verbose: bool = Opt(False, "--verbose", "-V", "-VV", help="Display more information for debugging purposes"),
+    use_gpu: int = Opt(-1, "--gpu-id", "-g", help="GPU ID or -1 for CPU")
+    # fmt: on
+):
+    """Generate JSON files for the labels in the data. This helps speed up the
+    training process, since spaCy won't have to preprocess the data to
+    extract the labels."""
+    util.logger.setLevel(logging.DEBUG if verbose else logging.INFO)
+    if not output_path.exists():
+        output_path.mkdir()
+    overrides = parse_config_overrides(ctx.args)
+    import_code(code_path)
+    setup_gpu(use_gpu)
+    with show_validation_error(config_path):
+        config = util.load_config(config_path, overrides=overrides)
+    with show_validation_error(hint_fill=False):
+        nlp = init_nlp(config, use_gpu=use_gpu)
+    for name, component in nlp.pipeline:
+        if getattr(component, "label_data", None) is not None:
+            output_file = output_path / f"{name}.json"
+            srsly.write_json(output_file, component.label_data)
+            msg.good(f"Saving {name} labels to {output_file}")
+        else:
+            msg.info(f"No labels found for {name}")
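Note: the new init vectors / init nlp / init labels commands are thin wrappers around helpers in spacy.training.initialize. A rough sketch of the equivalent programmatic calls, with illustrative file paths that are not taken from the diff:

    from spacy import util
    from spacy.training.initialize import init_nlp, convert_vectors

    # Roughly what `python -m spacy init vectors en vectors.txt ./output` does:
    nlp = util.get_lang_class("en")()
    convert_vectors(nlp, "vectors.txt", truncate=0, prune=-1, name=None)  # hypothetical path
    nlp.to_disk("./output")

    # Roughly what `python -m spacy init nlp config.cfg ./initialized` does:
    config = util.load_config("config.cfg")  # hypothetical path
    nlp = init_nlp(config, use_gpu=-1)
    nlp.to_disk("./initialized")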
@ -1,25 +1,13 @@
|
||||||
from typing import Optional
|
from typing import Optional
|
||||||
import numpy
|
|
||||||
import time
|
|
||||||
import re
|
|
||||||
from collections import Counter
|
|
||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
from thinc.api import require_gpu, set_gpu_allocator
|
|
||||||
from thinc.api import set_dropout_rate, to_categorical, fix_random_seed
|
|
||||||
from thinc.api import Config, CosineDistance, L2Distance
|
|
||||||
from wasabi import msg
|
from wasabi import msg
|
||||||
import srsly
|
|
||||||
from functools import partial
|
|
||||||
import typer
|
import typer
|
||||||
|
import re
|
||||||
|
|
||||||
from ._util import app, Arg, Opt, parse_config_overrides, show_validation_error
|
from ._util import app, Arg, Opt, parse_config_overrides, show_validation_error
|
||||||
from ._util import import_code
|
from ._util import import_code, setup_gpu
|
||||||
from ..ml.models.multi_task import build_cloze_multi_task_model
|
from ..training.pretrain import pretrain
|
||||||
from ..ml.models.multi_task import build_cloze_characters_multi_task_model
|
from ..util import load_config
|
||||||
from ..tokens import Doc
|
|
||||||
from ..attrs import ID
|
|
||||||
from .. import util
|
|
||||||
from ..util import dot_to_object
|
|
||||||
|
|
||||||
|
|
||||||
@app.command(
|
@app.command(
|
||||||
|
@ -61,15 +49,11 @@ def pretrain_cli(
|
||||||
config_overrides = parse_config_overrides(ctx.args)
|
config_overrides = parse_config_overrides(ctx.args)
|
||||||
import_code(code_path)
|
import_code(code_path)
|
||||||
verify_cli_args(config_path, output_dir, resume_path, epoch_resume)
|
verify_cli_args(config_path, output_dir, resume_path, epoch_resume)
|
||||||
if use_gpu >= 0:
|
setup_gpu(use_gpu)
|
||||||
msg.info("Using GPU")
|
|
||||||
require_gpu(use_gpu)
|
|
||||||
else:
|
|
||||||
msg.info("Using CPU")
|
|
||||||
msg.info(f"Loading config from: {config_path}")
|
msg.info(f"Loading config from: {config_path}")
|
||||||
|
|
||||||
with show_validation_error(config_path):
|
with show_validation_error(config_path):
|
||||||
raw_config = util.load_config(
|
raw_config = load_config(
|
||||||
config_path, overrides=config_overrides, interpolate=False
|
config_path, overrides=config_overrides, interpolate=False
|
||||||
)
|
)
|
||||||
config = raw_config.interpolate()
|
config = raw_config.interpolate()
|
||||||
|
@ -89,250 +73,11 @@ def pretrain_cli(
|
||||||
resume_path=resume_path,
|
resume_path=resume_path,
|
||||||
epoch_resume=epoch_resume,
|
epoch_resume=epoch_resume,
|
||||||
use_gpu=use_gpu,
|
use_gpu=use_gpu,
|
||||||
|
silent=False,
|
||||||
)
|
)
|
||||||
|
|
||||||
|
|
||||||
def pretrain(
|
|
||||||
config: Config,
|
|
||||||
output_dir: Path,
|
|
||||||
resume_path: Optional[Path] = None,
|
|
||||||
epoch_resume: Optional[int] = None,
|
|
||||||
use_gpu: int = -1,
|
|
||||||
):
|
|
||||||
if config["training"]["seed"] is not None:
|
|
||||||
fix_random_seed(config["training"]["seed"])
|
|
||||||
allocator = config["training"]["gpu_allocator"]
|
|
||||||
if use_gpu >= 0 and allocator:
|
|
||||||
set_gpu_allocator(allocator)
|
|
||||||
nlp = util.load_model_from_config(config)
|
|
||||||
C = util.resolve_training_config(nlp.config)
|
|
||||||
P_cfg = C["pretraining"]
|
|
||||||
corpus = dot_to_object(C, P_cfg["corpus"])
|
|
||||||
batcher = P_cfg["batcher"]
|
|
||||||
model = create_pretraining_model(nlp, C["pretraining"])
|
|
||||||
optimizer = C["pretraining"]["optimizer"]
|
|
||||||
# Load in pretrained weights to resume from
|
|
||||||
if resume_path is not None:
|
|
||||||
_resume_model(model, resume_path, epoch_resume)
|
|
||||||
else:
|
|
||||||
# Without '--resume-path' the '--epoch-resume' argument is ignored
|
|
||||||
epoch_resume = 0
|
|
||||||
|
|
||||||
tracker = ProgressTracker(frequency=10000)
|
|
||||||
msg.divider(f"Pre-training tok2vec layer - starting at epoch {epoch_resume}")
|
|
||||||
row_settings = {"widths": (3, 10, 10, 6, 4), "aligns": ("r", "r", "r", "r", "r")}
|
|
||||||
msg.row(("#", "# Words", "Total Loss", "Loss", "w/s"), **row_settings)
|
|
||||||
|
|
||||||
def _save_model(epoch, is_temp=False):
|
|
||||||
is_temp_str = ".temp" if is_temp else ""
|
|
||||||
with model.use_params(optimizer.averages):
|
|
||||||
with (output_dir / f"model{epoch}{is_temp_str}.bin").open("wb") as file_:
|
|
||||||
file_.write(model.get_ref("tok2vec").to_bytes())
|
|
||||||
log = {
|
|
||||||
"nr_word": tracker.nr_word,
|
|
||||||
"loss": tracker.loss,
|
|
||||||
"epoch_loss": tracker.epoch_loss,
|
|
||||||
"epoch": epoch,
|
|
||||||
}
|
|
||||||
with (output_dir / "log.jsonl").open("a") as file_:
|
|
||||||
file_.write(srsly.json_dumps(log) + "\n")
|
|
||||||
|
|
||||||
objective = create_objective(P_cfg["objective"])
|
|
||||||
# TODO: I think we probably want this to look more like the
|
|
||||||
# 'create_train_batches' function?
|
|
||||||
for epoch in range(epoch_resume, P_cfg["max_epochs"]):
|
|
||||||
for batch_id, batch in enumerate(batcher(corpus(nlp))):
|
|
||||||
docs = ensure_docs(batch)
|
|
||||||
loss = make_update(model, docs, optimizer, objective)
|
|
||||||
progress = tracker.update(epoch, loss, docs)
|
|
||||||
if progress:
|
|
||||||
msg.row(progress, **row_settings)
|
|
||||||
if P_cfg["n_save_every"] and (batch_id % P_cfg["n_save_every"] == 0):
|
|
||||||
_save_model(epoch, is_temp=True)
|
|
||||||
_save_model(epoch)
|
|
||||||
tracker.epoch_loss = 0.0
|
|
||||||
msg.good("Successfully finished pretrain")
|
msg.good("Successfully finished pretrain")
|
||||||
|
|
||||||
|
|
||||||
def ensure_docs(examples_or_docs):
|
|
||||||
docs = []
|
|
||||||
for eg_or_doc in examples_or_docs:
|
|
||||||
if isinstance(eg_or_doc, Doc):
|
|
||||||
docs.append(eg_or_doc)
|
|
||||||
else:
|
|
||||||
docs.append(eg_or_doc.reference)
|
|
||||||
return docs
|
|
||||||
|
|
||||||
|
|
||||||
def _resume_model(model, resume_path, epoch_resume):
|
|
||||||
msg.info(f"Resume training tok2vec from: {resume_path}")
|
|
||||||
with resume_path.open("rb") as file_:
|
|
||||||
weights_data = file_.read()
|
|
||||||
model.get_ref("tok2vec").from_bytes(weights_data)
|
|
||||||
# Parse the epoch number from the given weight file
|
|
||||||
model_name = re.search(r"model\d+\.bin", str(resume_path))
|
|
||||||
if model_name:
|
|
||||||
# Default weight file name, so read the starting epoch from it by cutting off 'model' and '.bin'
|
|
||||||
epoch_resume = int(model_name.group(0)[5:][:-4]) + 1
|
|
||||||
msg.info(f"Resuming from epoch: {epoch_resume}")
|
|
||||||
else:
|
|
||||||
msg.info(f"Resuming from epoch: {epoch_resume}")
|
|
||||||
|
|
||||||
|
|
||||||
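A quick worked example of the epoch parsing in `_resume_model` above; the file name is only an illustration:

# 'model' (5 characters) is cut from the front and '.bin' (4 characters) from
# the end of the matched name, leaving the epoch number; training resumes one
# epoch later.
import re

match = re.search(r"model\d+\.bin", "output/model5.bin")
epoch_resume = int(match.group(0)[5:][:-4]) + 1  # "model5.bin" -> 5 -> resume at 6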
def make_update(model, docs, optimizer, objective_func):
|
|
||||||
"""Perform an update over a single batch of documents.
|
|
||||||
|
|
||||||
docs (iterable): A batch of `Doc` objects.
|
|
||||||
optimizer (callable): An optimizer.
|
|
||||||
RETURNS loss: A float for the loss.
|
|
||||||
"""
|
|
||||||
predictions, backprop = model.begin_update(docs)
|
|
||||||
loss, gradients = objective_func(model.ops, docs, predictions)
|
|
||||||
backprop(gradients)
|
|
||||||
model.finish_update(optimizer)
|
|
||||||
# Don't want to return a cupy object here
|
|
||||||
# The gradients are modified in-place by the BERT MLM,
|
|
||||||
# so we get an accurate loss
|
|
||||||
return float(loss)
|
|
||||||
|
|
||||||
|
|
||||||
def create_objective(config):
|
|
||||||
"""Create the objective for pretraining.
|
|
||||||
|
|
||||||
We'd like to replace this with a registry function but it's tricky because
|
|
||||||
we're also making a model choice based on this. For now we hard-code support
|
|
||||||
for two types (characters, vectors). For characters you can specify
|
|
||||||
n_characters, for vectors you can specify the loss.
|
|
||||||
|
|
||||||
Bleh.
|
|
||||||
"""
|
|
||||||
objective_type = config["type"]
|
|
||||||
if objective_type == "characters":
|
|
||||||
return partial(get_characters_loss, nr_char=config["n_characters"])
|
|
||||||
elif objective_type == "vectors":
|
|
||||||
if config["loss"] == "cosine":
|
|
||||||
return partial(
|
|
||||||
get_vectors_loss,
|
|
||||||
distance=CosineDistance(normalize=True, ignore_zeros=True),
|
|
||||||
)
|
|
||||||
elif config["loss"] == "L2":
|
|
||||||
return partial(
|
|
||||||
get_vectors_loss, distance=L2Distance(normalize=True, ignore_zeros=True)
|
|
||||||
)
|
|
||||||
else:
|
|
||||||
raise ValueError("Unexpected loss type", config["loss"])
|
|
||||||
else:
|
|
||||||
raise ValueError("Unexpected objective_type", objective_type)
|
|
||||||
|
|
||||||
|
|
||||||
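To make the two hard-coded branches in `create_objective` concrete, here is a small sketch of the config dicts it expects and what each one returns; the values are illustrative rather than defaults:

# "characters" maps to get_characters_loss with the configured character window:
char_objective = create_objective({"type": "characters", "n_characters": 4})
# equivalent to partial(get_characters_loss, nr_char=4)

# "vectors" maps to get_vectors_loss with a cosine or L2 distance:
vec_objective = create_objective({"type": "vectors", "loss": "cosine"})
# equivalent to partial(get_vectors_loss, distance=CosineDistance(normalize=True, ignore_zeros=True))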
def get_vectors_loss(ops, docs, prediction, distance):
|
|
||||||
"""Compute a loss based on a distance between the documents' vectors and
|
|
||||||
the prediction.
|
|
||||||
"""
|
|
||||||
# The simplest way to implement this would be to vstack the
|
|
||||||
# token.vector values, but that's a bit inefficient, especially on GPU.
|
|
||||||
# Instead we fetch the index into the vectors table for each of our tokens,
|
|
||||||
# and look them up all at once. This prevents data copying.
|
|
||||||
ids = ops.flatten([doc.to_array(ID).ravel() for doc in docs])
|
|
||||||
target = docs[0].vocab.vectors.data[ids]
|
|
||||||
d_target, loss = distance(prediction, target)
|
|
||||||
return loss, d_target
|
|
||||||
|
|
||||||
|
|
||||||
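The batched lookup described in the comments of `get_vectors_loss` is ordinary integer-array indexing; a self-contained numpy illustration with a made-up vectors table:

import numpy

vectors_data = numpy.arange(12, dtype="f").reshape(4, 3)  # 4 vectors of width 3
ids = numpy.array([2, 0, 3])                              # one vectors-table row id per token
target = vectors_data[ids]                                # shape (3, 3), fetched in one operation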
def get_characters_loss(ops, docs, prediction, nr_char):
|
|
||||||
"""Compute a loss based on a number of characters predicted from the docs."""
|
|
||||||
target_ids = numpy.vstack([doc.to_utf8_array(nr_char=nr_char) for doc in docs])
|
|
||||||
target_ids = target_ids.reshape((-1,))
|
|
||||||
target = ops.asarray(to_categorical(target_ids, n_classes=256), dtype="f")
|
|
||||||
target = target.reshape((-1, 256 * nr_char))
|
|
||||||
diff = prediction - target
|
|
||||||
loss = (diff ** 2).sum()
|
|
||||||
d_target = diff / float(prediction.shape[0])
|
|
||||||
return loss, d_target
|
|
||||||
|
|
||||||
|
|
||||||
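The characters loss is a plain sum of squared errors with the gradient scaled by the number of rows; a tiny numpy check of that arithmetic on made-up shapes:

import numpy

prediction = numpy.array([[0.2, 0.9], [0.8, 0.1]], dtype="f")
target = numpy.array([[0.0, 1.0], [1.0, 0.0]], dtype="f")
diff = prediction - target                    # [[0.2, -0.1], [-0.2, 0.1]]
loss = (diff ** 2).sum()                      # approximately 0.10
d_target = diff / float(prediction.shape[0])  # gradient averaged over the 2 rows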
def create_pretraining_model(nlp, pretrain_config):
|
|
||||||
"""Define a network for the pretraining. We simply add an output layer onto
|
|
||||||
the tok2vec input model. The tok2vec input model needs to be a model that
|
|
||||||
takes a batch of Doc objects (as a list), and returns a list of arrays.
|
|
||||||
Each array in the output needs to have one row per token in the doc.
|
|
||||||
The actual tok2vec layer is stored as a reference, and only this bit will be
|
|
||||||
serialized to file and read back in when calling the 'train' command.
|
|
||||||
"""
|
|
||||||
component = nlp.get_pipe(pretrain_config["component"])
|
|
||||||
if pretrain_config.get("layer"):
|
|
||||||
tok2vec = component.model.get_ref(pretrain_config["layer"])
|
|
||||||
else:
|
|
||||||
tok2vec = component.model
|
|
||||||
|
|
||||||
# TODO
|
|
||||||
maxout_pieces = 3
|
|
||||||
hidden_size = 300
|
|
||||||
if pretrain_config["objective"]["type"] == "vectors":
|
|
||||||
model = build_cloze_multi_task_model(
|
|
||||||
nlp.vocab, tok2vec, hidden_size=hidden_size, maxout_pieces=maxout_pieces
|
|
||||||
)
|
|
||||||
elif pretrain_config["objective"]["type"] == "characters":
|
|
||||||
model = build_cloze_characters_multi_task_model(
|
|
||||||
nlp.vocab,
|
|
||||||
tok2vec,
|
|
||||||
hidden_size=hidden_size,
|
|
||||||
maxout_pieces=maxout_pieces,
|
|
||||||
nr_char=pretrain_config["objective"]["n_characters"],
|
|
||||||
)
|
|
||||||
model.initialize(X=[nlp.make_doc("Give it a doc to infer shapes")])
|
|
||||||
set_dropout_rate(model, pretrain_config["dropout"])
|
|
||||||
return model
|
|
||||||
|
|
||||||
|
|
||||||
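A sketch of the pretraining settings `create_pretraining_model` reads, written out as a plain dict rather than a loaded config; the values are illustrative:

pretrain_config = {
    "component": "tok2vec",  # pipe whose model (or named sub-layer) gets wrapped
    "layer": "",             # empty: use component.model directly, else model.get_ref(layer)
    "objective": {"type": "characters", "n_characters": 4},
    "dropout": 0.2,
}
# model = create_pretraining_model(nlp, pretrain_config) would then put a
# character-prediction output layer on top of nlp.get_pipe("tok2vec").model.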
class ProgressTracker:
|
|
||||||
def __init__(self, frequency=1000000):
|
|
||||||
self.loss = 0.0
|
|
||||||
self.prev_loss = 0.0
|
|
||||||
self.nr_word = 0
|
|
||||||
self.words_per_epoch = Counter()
|
|
||||||
self.frequency = frequency
|
|
||||||
self.last_time = time.time()
|
|
||||||
self.last_update = 0
|
|
||||||
self.epoch_loss = 0.0
|
|
||||||
|
|
||||||
def update(self, epoch, loss, docs):
|
|
||||||
self.loss += loss
|
|
||||||
self.epoch_loss += loss
|
|
||||||
words_in_batch = sum(len(doc) for doc in docs)
|
|
||||||
self.words_per_epoch[epoch] += words_in_batch
|
|
||||||
self.nr_word += words_in_batch
|
|
||||||
words_since_update = self.nr_word - self.last_update
|
|
||||||
if words_since_update >= self.frequency:
|
|
||||||
wps = words_since_update / (time.time() - self.last_time)
|
|
||||||
self.last_update = self.nr_word
|
|
||||||
self.last_time = time.time()
|
|
||||||
loss_per_word = self.loss - self.prev_loss
|
|
||||||
status = (
|
|
||||||
epoch,
|
|
||||||
self.nr_word,
|
|
||||||
_smart_round(self.loss, width=10),
|
|
||||||
_smart_round(loss_per_word, width=6),
|
|
||||||
int(wps),
|
|
||||||
)
|
|
||||||
self.prev_loss = float(self.loss)
|
|
||||||
return status
|
|
||||||
else:
|
|
||||||
return None
|
|
||||||
|
|
||||||
|
|
||||||
def _smart_round(figure, width=10, max_decimal=4):
|
|
||||||
"""Round large numbers as integers, smaller numbers as decimals."""
|
|
||||||
n_digits = len(str(int(figure)))
|
|
||||||
n_decimal = width - (n_digits + 1)
|
|
||||||
if n_decimal <= 1:
|
|
||||||
return str(int(figure))
|
|
||||||
else:
|
|
||||||
n_decimal = min(n_decimal, max_decimal)
|
|
||||||
format_str = "%." + str(n_decimal) + "f"
|
|
||||||
return format_str % figure
|
|
||||||
|
|
||||||
|
|
||||||
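A few worked values for `_smart_round` showing the switch between integer and decimal formatting (default width=10, max_decimal=4):

_smart_round(0.123456)      # n_decimal capped at 4        -> "0.1235"
_smart_round(123456.789)    # n_decimal = 10 - 7 = 3       -> "123456.789"
_smart_round(1234567890.0)  # n_decimal <= 1, integer path -> "1234567890"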
def verify_cli_args(config_path, output_dir, resume_path, epoch_resume):
|
def verify_cli_args(config_path, output_dir, resume_path, epoch_resume):
|
||||||
if not config_path or not config_path.exists():
|
if not config_path or not config_path.exists():
|
||||||
msg.fail("Config file not found", config_path, exits=1)
|
msg.fail("Config file not found", config_path, exits=1)
|
||||||
|
|
|
@ -134,7 +134,7 @@ def update_dvc_config(
|
||||||
|
|
||||||
|
|
||||||
def run_dvc_commands(
|
def run_dvc_commands(
|
||||||
commands: Iterable[str] = SimpleFrozenList(), flags: Dict[str, bool] = {},
|
commands: Iterable[str] = SimpleFrozenList(), flags: Dict[str, bool] = {}
|
||||||
) -> None:
|
) -> None:
|
||||||
"""Run a sequence of DVC commands in a subprocess, in order.
|
"""Run a sequence of DVC commands in a subprocess, in order.
|
||||||
|
|
||||||
|
|
|
@ -4,8 +4,8 @@ can help generate the best possible configuration, given a user's requirements.
|
||||||
{%- set use_transformer = (transformer_data and hardware != "cpu") -%}
|
{%- set use_transformer = (transformer_data and hardware != "cpu") -%}
|
||||||
{%- set transformer = transformer_data[optimize] if use_transformer else {} -%}
|
{%- set transformer = transformer_data[optimize] if use_transformer else {} -%}
|
||||||
[paths]
|
[paths]
|
||||||
train = ""
|
train = null
|
||||||
dev = ""
|
dev = null
|
||||||
|
|
||||||
[system]
|
[system]
|
||||||
{% if use_transformer -%}
|
{% if use_transformer -%}
|
||||||
|
@ -37,6 +37,22 @@ tokenizer_config = {"use_fast": true}
|
||||||
window = 128
|
window = 128
|
||||||
stride = 96
|
stride = 96
|
||||||
|
|
||||||
|
{% if "morphologizer" in components %}
|
||||||
|
[components.morphologizer]
|
||||||
|
factory = "morphologizer"
|
||||||
|
|
||||||
|
[components.morphologizer.model]
|
||||||
|
@architectures = "spacy.Tagger.v1"
|
||||||
|
nO = null
|
||||||
|
|
||||||
|
[components.morphologizer.model.tok2vec]
|
||||||
|
@architectures = "spacy-transformers.TransformerListener.v1"
|
||||||
|
grad_factor = 1.0
|
||||||
|
|
||||||
|
[components.morphologizer.model.tok2vec.pooling]
|
||||||
|
@layers = "reduce_mean.v1"
|
||||||
|
{%- endif %}
|
||||||
|
|
||||||
{% if "tagger" in components %}
|
{% if "tagger" in components %}
|
||||||
[components.tagger]
|
[components.tagger]
|
||||||
factory = "tagger"
|
factory = "tagger"
|
||||||
|
@ -166,6 +182,19 @@ depth = {{ 4 if optimize == "efficiency" else 8 }}
|
||||||
window_size = 1
|
window_size = 1
|
||||||
maxout_pieces = 3
|
maxout_pieces = 3
|
||||||
|
|
||||||
|
{% if "morphologizer" in components %}
|
||||||
|
[components.morphologizer]
|
||||||
|
factory = "morphologizer"
|
||||||
|
|
||||||
|
[components.morphologizer.model]
|
||||||
|
@architectures = "spacy.Tagger.v1"
|
||||||
|
nO = null
|
||||||
|
|
||||||
|
[components.morphologizer.model.tok2vec]
|
||||||
|
@architectures = "spacy.Tok2VecListener.v1"
|
||||||
|
width = ${components.tok2vec.model.encode.width}
|
||||||
|
{%- endif %}
|
||||||
|
|
||||||
{% if "tagger" in components %}
|
{% if "tagger" in components %}
|
||||||
[components.tagger]
|
[components.tagger]
|
||||||
factory = "tagger"
|
factory = "tagger"
|
||||||
|
@ -257,7 +286,7 @@ no_output_layer = false
|
||||||
{% endif %}
|
{% endif %}
|
||||||
|
|
||||||
{% for pipe in components %}
|
{% for pipe in components %}
|
||||||
{% if pipe not in ["tagger", "parser", "ner", "textcat", "entity_linker"] %}
|
{% if pipe not in ["tagger", "morphologizer", "parser", "ner", "textcat", "entity_linker"] %}
|
||||||
{# Other components defined by the user: we just assume they're factories #}
|
{# Other components defined by the user: we just assume they're factories #}
|
||||||
[components.{{ pipe }}]
|
[components.{{ pipe }}]
|
||||||
factory = "{{ pipe }}"
|
factory = "{{ pipe }}"
|
||||||
|
@ -270,7 +299,6 @@ factory = "{{ pipe }}"
|
||||||
@readers = "spacy.Corpus.v1"
|
@readers = "spacy.Corpus.v1"
|
||||||
path = ${paths.train}
|
path = ${paths.train}
|
||||||
max_length = {{ 500 if hardware == "gpu" else 2000 }}
|
max_length = {{ 500 if hardware == "gpu" else 2000 }}
|
||||||
augmenter = {"@augmenters": "spacy.orth_variants.v1", "level": 0.1, "lower": 0.5}
|
|
||||||
|
|
||||||
[corpora.dev]
|
[corpora.dev]
|
||||||
@readers = "spacy.Corpus.v1"
|
@readers = "spacy.Corpus.v1"
|
||||||
|
@ -278,11 +306,6 @@ path = ${paths.dev}
|
||||||
max_length = 0
|
max_length = 0
|
||||||
|
|
||||||
[training]
|
[training]
|
||||||
{% if use_transformer or optimize == "efficiency" or not word_vectors -%}
|
|
||||||
vectors = null
|
|
||||||
{% else -%}
|
|
||||||
vectors = "{{ word_vectors }}"
|
|
||||||
{% endif -%}
|
|
||||||
{% if use_transformer -%}
|
{% if use_transformer -%}
|
||||||
accumulate_gradient = {{ transformer["size_factor"] }}
|
accumulate_gradient = {{ transformer["size_factor"] }}
|
||||||
{% endif -%}
|
{% endif -%}
|
||||||
|
@ -318,3 +341,10 @@ start = 100
|
||||||
stop = 1000
|
stop = 1000
|
||||||
compound = 1.001
|
compound = 1.001
|
||||||
{% endif %}
|
{% endif %}
|
||||||
|
|
||||||
|
[initialize]
|
||||||
|
{% if use_transformer or optimize == "efficiency" or not word_vectors -%}
|
||||||
|
vectors = null
|
||||||
|
{% else -%}
|
||||||
|
vectors = "{{ word_vectors }}"
|
||||||
|
{% endif -%}
|
||||||
|
|
|
@ -1,23 +1,14 @@
|
||||||
from typing import Optional, Dict, Any, Tuple, Union, Callable, List
|
from typing import Optional
|
||||||
from timeit import default_timer as timer
|
|
||||||
import srsly
|
|
||||||
import tqdm
|
|
||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
from wasabi import msg
|
from wasabi import msg
|
||||||
import thinc
|
|
||||||
import thinc.schedules
|
|
||||||
from thinc.api import Config, Optimizer, require_gpu, fix_random_seed, set_gpu_allocator
|
|
||||||
import random
|
|
||||||
import typer
|
import typer
|
||||||
import logging
|
import logging
|
||||||
|
|
||||||
from ._util import app, Arg, Opt, parse_config_overrides, show_validation_error
|
from ._util import app, Arg, Opt, parse_config_overrides, show_validation_error
|
||||||
from ._util import import_code, get_sourced_components
|
from ._util import import_code, setup_gpu
|
||||||
from ..language import Language
|
from ..training.loop import train
|
||||||
|
from ..training.initialize import init_nlp
|
||||||
from .. import util
|
from .. import util
|
||||||
from ..training.example import Example
|
|
||||||
from ..errors import Errors
|
|
||||||
from ..util import dot_to_object
|
|
||||||
|
|
||||||
|
|
||||||
@app.command(
|
@app.command(
|
||||||
|
@ -30,8 +21,7 @@ def train_cli(
|
||||||
output_path: Optional[Path] = Opt(None, "--output", "--output-path", "-o", help="Output directory to store trained pipeline in"),
|
output_path: Optional[Path] = Opt(None, "--output", "--output-path", "-o", help="Output directory to store trained pipeline in"),
|
||||||
code_path: Optional[Path] = Opt(None, "--code", "-c", help="Path to Python file with additional code (registered functions) to be imported"),
|
code_path: Optional[Path] = Opt(None, "--code", "-c", help="Path to Python file with additional code (registered functions) to be imported"),
|
||||||
verbose: bool = Opt(False, "--verbose", "-V", "-VV", help="Display more information for debugging purposes"),
|
verbose: bool = Opt(False, "--verbose", "-V", "-VV", help="Display more information for debugging purposes"),
|
||||||
use_gpu: int = Opt(-1, "--gpu-id", "-g", help="GPU ID or -1 for CPU"),
|
use_gpu: int = Opt(-1, "--gpu-id", "-g", help="GPU ID or -1 for CPU")
|
||||||
resume: bool = Opt(False, "--resume", "-R", help="Resume training"),
|
|
||||||
# fmt: on
|
# fmt: on
|
||||||
):
|
):
|
||||||
"""
|
"""
|
||||||
|
@ -48,393 +38,19 @@ def train_cli(
|
||||||
|
|
||||||
DOCS: https://nightly.spacy.io/api/cli#train
|
DOCS: https://nightly.spacy.io/api/cli#train
|
||||||
"""
|
"""
|
||||||
util.logger.setLevel(logging.DEBUG if verbose else logging.ERROR)
|
util.logger.setLevel(logging.DEBUG if verbose else logging.INFO)
|
||||||
verify_cli_args(config_path, output_path)
|
verify_cli_args(config_path, output_path)
|
||||||
overrides = parse_config_overrides(ctx.args)
|
overrides = parse_config_overrides(ctx.args)
|
||||||
import_code(code_path)
|
import_code(code_path)
|
||||||
train(
|
setup_gpu(use_gpu)
|
||||||
config_path,
|
|
||||||
output_path=output_path,
|
|
||||||
config_overrides=overrides,
|
|
||||||
use_gpu=use_gpu,
|
|
||||||
resume_training=resume,
|
|
||||||
)
|
|
||||||
|
|
||||||
|
|
||||||
def train(
|
|
||||||
config_path: Path,
|
|
||||||
output_path: Optional[Path] = None,
|
|
||||||
config_overrides: Dict[str, Any] = {},
|
|
||||||
use_gpu: int = -1,
|
|
||||||
resume_training: bool = False,
|
|
||||||
) -> None:
|
|
||||||
if use_gpu >= 0:
|
|
||||||
msg.info(f"Using GPU: {use_gpu}")
|
|
||||||
require_gpu(use_gpu)
|
|
||||||
else:
|
|
||||||
msg.info("Using CPU")
|
|
||||||
msg.info(f"Loading config and nlp from: {config_path}")
|
|
||||||
with show_validation_error(config_path):
|
with show_validation_error(config_path):
|
||||||
# Keep an un-interpolated config so we can preserve variables in
|
config = util.load_config(config_path, overrides=overrides, interpolate=False)
|
||||||
# the final nlp object we train and serialize
|
msg.divider("Initializing pipeline")
|
||||||
raw_config = util.load_config(
|
with show_validation_error(config_path, hint_fill=False):
|
||||||
config_path, overrides=config_overrides, interpolate=False
|
nlp = init_nlp(config, use_gpu=use_gpu)
|
||||||
)
|
msg.good("Initialized pipeline")
|
||||||
config = raw_config.interpolate()
|
msg.divider("Training pipeline")
|
||||||
if config["training"]["seed"] is not None:
|
train(nlp, output_path, use_gpu=use_gpu, silent=False)
|
||||||
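The refactored command is now a thin wrapper: load the config, build and initialize the pipeline with `init_nlp`, and hand it to the shared training loop. A condensed sketch of that flow; the module paths and keyword names follow the imports above, so treat them as assumptions about the nightly internals:

from pathlib import Path
from spacy import util
from spacy.training.initialize import init_nlp
from spacy.training.loop import train

config = util.load_config("config.cfg", overrides={}, interpolate=False)
nlp = init_nlp(config, use_gpu=-1)                    # build + initialize the pipeline
train(nlp, Path("output"), use_gpu=-1, silent=False)  # run the training loop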
fix_random_seed(config["training"]["seed"])
|
|
||||||
allocator = config["training"]["gpu_allocator"]
|
|
||||||
if use_gpu >= 0 and allocator:
|
|
||||||
set_gpu_allocator(allocator)
|
|
||||||
# Use original config here before it's resolved to functions
|
|
||||||
sourced_components = get_sourced_components(config)
|
|
||||||
with show_validation_error(config_path):
|
|
||||||
nlp = util.load_model_from_config(raw_config)
|
|
||||||
# Resolve all training-relevant sections using the filled nlp config
|
|
||||||
C = util.resolve_training_config(nlp.config)
|
|
||||||
util.load_vocab_data_into_model(nlp, lookups=C["training"]["lookups"])
|
|
||||||
if C["training"]["vectors"] is not None:
|
|
||||||
add_vectors(nlp, C["training"]["vectors"])
|
|
||||||
raw_text, tag_map, morph_rules, weights_data = load_from_paths(C)
|
|
||||||
T_cfg = C["training"]
|
|
||||||
optimizer = T_cfg["optimizer"]
|
|
||||||
train_corpus = dot_to_object(C, T_cfg["train_corpus"])
|
|
||||||
dev_corpus = dot_to_object(C, T_cfg["dev_corpus"])
|
|
||||||
batcher = T_cfg["batcher"]
|
|
||||||
train_logger = T_cfg["logger"]
|
|
||||||
before_to_disk = create_before_to_disk_callback(T_cfg["before_to_disk"])
|
|
||||||
# Components that shouldn't be updated during training
|
|
||||||
frozen_components = T_cfg["frozen_components"]
|
|
||||||
# Sourced components that require resume_training
|
|
||||||
resume_components = [p for p in sourced_components if p not in frozen_components]
|
|
||||||
msg.info(f"Pipeline: {nlp.pipe_names}")
|
|
||||||
if resume_components:
|
|
||||||
with nlp.select_pipes(enable=resume_components):
|
|
||||||
msg.info(f"Resuming training for: {resume_components}")
|
|
||||||
nlp.resume_training(sgd=optimizer)
|
|
||||||
with nlp.select_pipes(disable=[*frozen_components, *resume_components]):
|
|
||||||
nlp.begin_training(lambda: train_corpus(nlp), sgd=optimizer)
|
|
||||||
# Verify the config after calling 'begin_training' to ensure labels are properly initialized
|
|
||||||
verify_config(nlp)
|
|
||||||
|
|
||||||
if tag_map:
|
|
||||||
# Replace tag map with provided mapping
|
|
||||||
nlp.vocab.morphology.load_tag_map(tag_map)
|
|
||||||
if morph_rules:
|
|
||||||
# Load morph rules
|
|
||||||
nlp.vocab.morphology.load_morph_exceptions(morph_rules)
|
|
||||||
|
|
||||||
# Load pretrained tok2vec weights - cf. CLI command 'pretrain'
|
|
||||||
if weights_data is not None:
|
|
||||||
tok2vec_component = C["pretraining"]["component"]
|
|
||||||
if tok2vec_component is None:
|
|
||||||
msg.fail(
|
|
||||||
f"To use pretrained tok2vec weights, [pretraining.component] "
|
|
||||||
f"needs to specify the component that should load them.",
|
|
||||||
exits=1,
|
|
||||||
)
|
|
||||||
layer = nlp.get_pipe(tok2vec_component).model
|
|
||||||
tok2vec_layer = C["pretraining"]["layer"]
|
|
||||||
if tok2vec_layer:
|
|
||||||
layer = layer.get_ref(tok2vec_layer)
|
|
||||||
layer.from_bytes(weights_data)
|
|
||||||
msg.info(f"Loaded pretrained weights into component '{tok2vec_component}'")
|
|
||||||
|
|
||||||
# Create iterator, which yields out info after each optimization step.
|
|
||||||
msg.info("Start training")
|
|
||||||
score_weights = T_cfg["score_weights"]
|
|
||||||
training_step_iterator = train_while_improving(
|
|
||||||
nlp,
|
|
||||||
optimizer,
|
|
||||||
create_train_batches(train_corpus(nlp), batcher, T_cfg["max_epochs"]),
|
|
||||||
create_evaluation_callback(nlp, dev_corpus, score_weights),
|
|
||||||
dropout=T_cfg["dropout"],
|
|
||||||
accumulate_gradient=T_cfg["accumulate_gradient"],
|
|
||||||
patience=T_cfg["patience"],
|
|
||||||
max_steps=T_cfg["max_steps"],
|
|
||||||
eval_frequency=T_cfg["eval_frequency"],
|
|
||||||
raw_text=None,
|
|
||||||
exclude=frozen_components,
|
|
||||||
)
|
|
||||||
msg.info(f"Training. Initial learn rate: {optimizer.learn_rate}")
|
|
||||||
with nlp.select_pipes(disable=frozen_components):
|
|
||||||
print_row, finalize_logger = train_logger(nlp)
|
|
||||||
|
|
||||||
try:
|
|
||||||
progress = tqdm.tqdm(total=T_cfg["eval_frequency"], leave=False)
|
|
||||||
progress.set_description(f"Epoch 1")
|
|
||||||
for batch, info, is_best_checkpoint in training_step_iterator:
|
|
||||||
progress.update(1)
|
|
||||||
if is_best_checkpoint is not None:
|
|
||||||
progress.close()
|
|
||||||
print_row(info)
|
|
||||||
if is_best_checkpoint and output_path is not None:
|
|
||||||
with nlp.select_pipes(disable=frozen_components):
|
|
||||||
update_meta(T_cfg, nlp, info)
|
|
||||||
with nlp.use_params(optimizer.averages):
|
|
||||||
nlp = before_to_disk(nlp)
|
|
||||||
nlp.to_disk(output_path / "model-best")
|
|
||||||
progress = tqdm.tqdm(total=T_cfg["eval_frequency"], leave=False)
|
|
||||||
progress.set_description(f"Epoch {info['epoch']}")
|
|
||||||
except Exception as e:
|
|
||||||
finalize_logger()
|
|
||||||
if output_path is not None:
|
|
||||||
# We don't want to swallow the traceback if we don't have a
|
|
||||||
# specific error.
|
|
||||||
msg.warn(
|
|
||||||
f"Aborting and saving the final best model. "
|
|
||||||
f"Encountered exception: {str(e)}"
|
|
||||||
)
|
|
||||||
nlp = before_to_disk(nlp)
|
|
||||||
nlp.to_disk(output_path / "model-final")
|
|
||||||
raise e
|
|
||||||
finally:
|
|
||||||
finalize_logger()
|
|
||||||
if output_path is not None:
|
|
||||||
final_model_path = output_path / "model-final"
|
|
||||||
if optimizer.averages:
|
|
||||||
with nlp.use_params(optimizer.averages):
|
|
||||||
nlp.to_disk(final_model_path)
|
|
||||||
else:
|
|
||||||
nlp.to_disk(final_model_path)
|
|
||||||
msg.good(f"Saved pipeline to output directory {final_model_path}")
|
|
||||||
|
|
||||||
|
|
||||||
def add_vectors(nlp: Language, vectors: str) -> None:
|
|
||||||
title = f"Config validation error for vectors {vectors}"
|
|
||||||
desc = (
|
|
||||||
"This typically means that there's a problem in the config.cfg included "
|
|
||||||
"with the packaged vectors. Make sure that the vectors package you're "
|
|
||||||
"loading is compatible with the current version of spaCy."
|
|
||||||
)
|
|
||||||
with show_validation_error(
|
|
||||||
title=title, desc=desc, hint_fill=False, show_config=False
|
|
||||||
):
|
|
||||||
util.load_vectors_into_model(nlp, vectors)
|
|
||||||
|
|
||||||
|
|
||||||
def create_train_batches(iterator, batcher, max_epochs: int):
|
|
||||||
epoch = 0
|
|
||||||
examples = list(iterator)
|
|
||||||
if not examples:
|
|
||||||
# Raise error if no data
|
|
||||||
raise ValueError(Errors.E986)
|
|
||||||
while max_epochs < 1 or epoch != max_epochs:
|
|
||||||
random.shuffle(examples)
|
|
||||||
for batch in batcher(examples):
|
|
||||||
yield epoch, batch
|
|
||||||
epoch += 1
|
|
||||||
|
|
||||||
|
|
||||||
def create_evaluation_callback(
|
|
||||||
nlp: Language, dev_corpus: Callable, weights: Dict[str, float]
|
|
||||||
) -> Callable[[], Tuple[float, Dict[str, float]]]:
|
|
||||||
weights = {key: value for key, value in weights.items() if value is not None}
|
|
||||||
|
|
||||||
def evaluate() -> Tuple[float, Dict[str, float]]:
|
|
||||||
dev_examples = list(dev_corpus(nlp))
|
|
||||||
scores = nlp.evaluate(dev_examples)
|
|
||||||
# Calculate a weighted sum based on score_weights for the main score.
|
|
||||||
# We can only consider scores that are ints/floats, not dicts like
|
|
||||||
# entity scores per type etc.
|
|
||||||
for key, value in scores.items():
|
|
||||||
if key in weights and not isinstance(value, (int, float)):
|
|
||||||
raise ValueError(Errors.E915.format(name=key, score_type=type(value)))
|
|
||||||
try:
|
|
||||||
weighted_score = sum(
|
|
||||||
scores.get(s, 0.0) * weights.get(s, 0.0) for s in weights
|
|
||||||
)
|
|
||||||
except KeyError as e:
|
|
||||||
keys = list(scores.keys())
|
|
||||||
err = Errors.E983.format(dict="score_weights", key=str(e), keys=keys)
|
|
||||||
raise KeyError(err) from None
|
|
||||||
return weighted_score, scores
|
|
||||||
|
|
||||||
return evaluate
|
|
||||||
|
|
||||||
|
|
||||||
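The weighted main score is just a dot product of the configured weights with the matching scalar scores; a toy calculation with hypothetical score names:

weights = {"dep_uas": 0.5, "tag_acc": 0.5}
scores = {"dep_uas": 0.90, "tag_acc": 0.96, "speed": 12000.0}
weighted_score = sum(scores.get(s, 0.0) * weights.get(s, 0.0) for s in weights)
# 0.5 * 0.90 + 0.5 * 0.96 = 0.93; scores not named in the weights are ignored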
def create_before_to_disk_callback(
|
|
||||||
callback: Optional[Callable[[Language], Language]]
|
|
||||||
) -> Callable[[Language], Language]:
|
|
||||||
def before_to_disk(nlp: Language) -> Language:
|
|
||||||
if not callback:
|
|
||||||
return nlp
|
|
||||||
modified_nlp = callback(nlp)
|
|
||||||
if not isinstance(modified_nlp, Language):
|
|
||||||
err = Errors.E914.format(name="before_to_disk", value=type(modified_nlp))
|
|
||||||
raise ValueError(err)
|
|
||||||
return modified_nlp
|
|
||||||
|
|
||||||
return before_to_disk
|
|
||||||
|
|
||||||
|
|
||||||
def train_while_improving(
|
|
||||||
nlp: Language,
|
|
||||||
optimizer: Optimizer,
|
|
||||||
train_data,
|
|
||||||
evaluate,
|
|
||||||
*,
|
|
||||||
dropout: float,
|
|
||||||
eval_frequency: int,
|
|
||||||
accumulate_gradient: int,
|
|
||||||
patience: int,
|
|
||||||
max_steps: int,
|
|
||||||
raw_text: List[Dict[str, str]],
|
|
||||||
exclude: List[str],
|
|
||||||
):
|
|
||||||
"""Train until an evaluation stops improving. Works as a generator,
|
|
||||||
with each iteration yielding a tuple `(batch, info, is_best_checkpoint)`,
|
|
||||||
where info is a dict, and is_best_checkpoint is in [True, False, None] --
|
|
||||||
None indicating that the iteration was not evaluated as a checkpoint.
|
|
||||||
The evaluation is conducted by calling the evaluate callback.
|
|
||||||
|
|
||||||
Positional arguments:
|
|
||||||
nlp: The spaCy pipeline to evaluate.
|
|
||||||
optimizer: The optimizer callable.
|
|
||||||
train_data (Iterable[Batch]): A generator of batches, with the training
|
|
||||||
data. Each batch should be a Sized[Tuple[Input, Annot]]. The training
|
|
||||||
data iterable needs to take care of iterating over the epochs and
|
|
||||||
shuffling.
|
|
||||||
evaluate (Callable[[], Tuple[float, Any]]): A callback to perform evaluation.
|
|
||||||
The callback should take no arguments and return a tuple
|
|
||||||
`(main_score, other_scores)`. The main_score should be a float where
|
|
||||||
higher is better. other_scores can be any object.
|
|
||||||
|
|
||||||
Every iteration, the function yields out a tuple with:
|
|
||||||
|
|
||||||
* batch: A list of Example objects.
|
|
||||||
* info: A dict with various information about the last update (see below).
|
|
||||||
* is_best_checkpoint: A value in None, False, True, indicating whether this
|
|
||||||
was the best evaluation so far. You should use this to save the model
|
|
||||||
checkpoints during training. If None, evaluation was not conducted on
|
|
||||||
that iteration. False means evaluation was conducted, but a previous
|
|
||||||
evaluation was better.
|
|
||||||
|
|
||||||
The info dict provides the following information:
|
|
||||||
|
|
||||||
epoch (int): How many passes over the data have been completed.
|
|
||||||
step (int): How many steps have been completed.
|
|
||||||
score (float): The main score from the last evaluation.
|
|
||||||
other_scores: The other scores from the last evaluation.
|
|
||||||
losses: The accumulated losses throughout training.
|
|
||||||
checkpoints: A list of previous results, where each result is a
|
|
||||||
(score, step, epoch) tuple.
|
|
||||||
"""
|
|
||||||
if isinstance(dropout, float):
|
|
||||||
dropouts = thinc.schedules.constant(dropout)
|
|
||||||
else:
|
|
||||||
dropouts = dropout
|
|
||||||
results = []
|
|
||||||
losses = {}
|
|
||||||
if raw_text:
|
|
||||||
random.shuffle(raw_text)
|
|
||||||
raw_examples = [
|
|
||||||
Example.from_dict(nlp.make_doc(rt["text"]), {}) for rt in raw_text
|
|
||||||
]
|
|
||||||
raw_batches = util.minibatch(raw_examples, size=8)
|
|
||||||
|
|
||||||
words_seen = 0
|
|
||||||
start_time = timer()
|
|
||||||
for step, (epoch, batch) in enumerate(train_data):
|
|
||||||
dropout = next(dropouts)
|
|
||||||
for subbatch in subdivide_batch(batch, accumulate_gradient):
|
|
||||||
|
|
||||||
nlp.update(
|
|
||||||
subbatch, drop=dropout, losses=losses, sgd=False, exclude=exclude
|
|
||||||
)
|
|
||||||
if raw_text:
|
|
||||||
# If raw text is available, perform 'rehearsal' updates,
|
|
||||||
# which use unlabelled data to reduce overfitting.
|
|
||||||
raw_batch = list(next(raw_batches))
|
|
||||||
nlp.rehearse(raw_batch, sgd=optimizer, losses=losses, exclude=exclude)
|
|
||||||
# TODO: refactor this so we don't have to run it separately in here
|
|
||||||
for name, proc in nlp.pipeline:
|
|
||||||
if (
|
|
||||||
name not in exclude
|
|
||||||
and hasattr(proc, "model")
|
|
||||||
and proc.model not in (True, False, None)
|
|
||||||
):
|
|
||||||
proc.model.finish_update(optimizer)
|
|
||||||
optimizer.step_schedules()
|
|
||||||
if not (step % eval_frequency):
|
|
||||||
if optimizer.averages:
|
|
||||||
with nlp.use_params(optimizer.averages):
|
|
||||||
score, other_scores = evaluate()
|
|
||||||
else:
|
|
||||||
score, other_scores = evaluate()
|
|
||||||
results.append((score, step))
|
|
||||||
is_best_checkpoint = score == max(results)[0]
|
|
||||||
else:
|
|
||||||
score, other_scores = (None, None)
|
|
||||||
is_best_checkpoint = None
|
|
||||||
words_seen += sum(len(eg) for eg in batch)
|
|
||||||
info = {
|
|
||||||
"epoch": epoch,
|
|
||||||
"step": step,
|
|
||||||
"score": score,
|
|
||||||
"other_scores": other_scores,
|
|
||||||
"losses": losses,
|
|
||||||
"checkpoints": results,
|
|
||||||
"seconds": int(timer() - start_time),
|
|
||||||
"words": words_seen,
|
|
||||||
}
|
|
||||||
yield batch, info, is_best_checkpoint
|
|
||||||
if is_best_checkpoint is not None:
|
|
||||||
losses = {}
|
|
||||||
# Stop if no improvement in `patience` updates (if specified)
|
|
||||||
best_score, best_step = max(results)
|
|
||||||
if patience and (step - best_step) >= patience:
|
|
||||||
break
|
|
||||||
# Stop if we've exhausted our max steps (if specified)
|
|
||||||
if max_steps and step >= max_steps:
|
|
||||||
break
|
|
||||||
|
|
||||||
|
|
||||||
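A minimal consumer of the generator above, assuming `nlp`, `optimizer`, `batches` and `evaluate` have already been set up (all of them hypothetical placeholders here):

for batch, info, is_best_checkpoint in train_while_improving(
    nlp, optimizer, batches, evaluate,
    dropout=0.1, eval_frequency=200, accumulate_gradient=1,
    patience=1600, max_steps=0, raw_text=None, exclude=[],
):
    if is_best_checkpoint:  # None = not evaluated, False = evaluated but not the best
        # this is the point to serialize a "model-best" checkpoint
        print(info["epoch"], info["step"], info["score"])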
def subdivide_batch(batch, accumulate_gradient):
|
|
||||||
batch = list(batch)
|
|
||||||
batch.sort(key=lambda eg: len(eg.predicted))
|
|
||||||
sub_len = len(batch) // accumulate_gradient
|
|
||||||
start = 0
|
|
||||||
for i in range(accumulate_gradient):
|
|
||||||
subbatch = batch[start : start + sub_len]
|
|
||||||
if subbatch:
|
|
||||||
yield subbatch
|
|
||||||
start += len(subbatch)
|
|
||||||
subbatch = batch[start:]
|
|
||||||
if subbatch:
|
|
||||||
yield subbatch
|
|
||||||
|
|
||||||
|
|
||||||
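The splitting in `subdivide_batch` is easiest to see with plain strings standing in for Example objects: sort by length, cut into `accumulate_gradient` even chunks, then yield the remainder:

batch = ["dddd", "a", "bb", "ccc", "e"]
batch.sort(key=len)         # ["a", "e", "bb", "ccc", "dddd"]
sub_len = len(batch) // 2   # accumulate_gradient = 2 -> sub_len = 2
# chunks yielded: ["a", "e"], ["bb", "ccc"], then the remainder ["dddd"]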
def update_meta(
|
|
||||||
training: Union[Dict[str, Any], Config], nlp: Language, info: Dict[str, Any]
|
|
||||||
) -> None:
|
|
||||||
nlp.meta["performance"] = {}
|
|
||||||
for metric in training["score_weights"]:
|
|
||||||
if metric is not None:
|
|
||||||
nlp.meta["performance"][metric] = info["other_scores"].get(metric, 0.0)
|
|
||||||
for pipe_name in nlp.pipe_names:
|
|
||||||
nlp.meta["performance"][f"{pipe_name}_loss"] = info["losses"][pipe_name]
|
|
||||||
|
|
||||||
|
|
||||||
def load_from_paths(
|
|
||||||
config: Config,
|
|
||||||
) -> Tuple[List[Dict[str, str]], Dict[str, dict], bytes]:
|
|
||||||
# TODO: separate checks from loading
|
|
||||||
raw_text = util.ensure_path(config["training"]["raw_text"])
|
|
||||||
if raw_text is not None:
|
|
||||||
if not raw_text.exists():
|
|
||||||
msg.fail("Can't find raw text", raw_text, exits=1)
|
|
||||||
raw_text = list(srsly.read_jsonl(config["training"]["raw_text"]))
|
|
||||||
tag_map = {}
|
|
||||||
morph_rules = {}
|
|
||||||
weights_data = None
|
|
||||||
init_tok2vec = util.ensure_path(config["training"]["init_tok2vec"])
|
|
||||||
if init_tok2vec is not None:
|
|
||||||
if not init_tok2vec.exists():
|
|
||||||
msg.fail("Can't find pretrained tok2vec", init_tok2vec, exits=1)
|
|
||||||
with init_tok2vec.open("rb") as file_:
|
|
||||||
weights_data = file_.read()
|
|
||||||
return raw_text, tag_map, morph_rules, weights_data
|
|
||||||
|
|
||||||
|
|
||||||
def verify_cli_args(config_path: Path, output_path: Optional[Path] = None) -> None:
|
def verify_cli_args(config_path: Path, output_path: Optional[Path] = None) -> None:
|
||||||
|
@ -445,30 +61,3 @@ def verify_cli_args(config_path: Path, output_path: Optional[Path] = None) -> No
|
||||||
if not output_path.exists():
|
if not output_path.exists():
|
||||||
output_path.mkdir()
|
output_path.mkdir()
|
||||||
msg.good(f"Created output directory: {output_path}")
|
msg.good(f"Created output directory: {output_path}")
|
||||||
|
|
||||||
|
|
||||||
def verify_config(nlp: Language) -> None:
|
|
||||||
"""Perform additional checks based on the config, loaded nlp object and training data."""
|
|
||||||
# TODO: maybe we should validate based on the actual components, the list
|
|
||||||
# in config["nlp"]["pipeline"] instead?
|
|
||||||
for pipe_config in nlp.config["components"].values():
|
|
||||||
# We can't assume that the component name == the factory
|
|
||||||
factory = pipe_config["factory"]
|
|
||||||
if factory == "textcat":
|
|
||||||
verify_textcat_config(nlp, pipe_config)
|
|
||||||
|
|
||||||
|
|
||||||
def verify_textcat_config(nlp: Language, pipe_config: Dict[str, Any]) -> None:
|
|
||||||
# if 'positive_label' is provided: double check whether it's in the data and
|
|
||||||
# the task is binary
|
|
||||||
if pipe_config.get("positive_label"):
|
|
||||||
textcat_labels = nlp.get_pipe("textcat").labels
|
|
||||||
pos_label = pipe_config.get("positive_label")
|
|
||||||
if pos_label not in textcat_labels:
|
|
||||||
raise ValueError(
|
|
||||||
Errors.E920.format(pos_label=pos_label, labels=textcat_labels)
|
|
||||||
)
|
|
||||||
if len(list(textcat_labels)) != 2:
|
|
||||||
raise ValueError(
|
|
||||||
Errors.E919.format(pos_label=pos_label, labels=textcat_labels)
|
|
||||||
)
|
|
||||||
|
|
|
@ -1,7 +1,7 @@
|
||||||
[paths]
|
[paths]
|
||||||
train = ""
|
train = null
|
||||||
dev = ""
|
dev = null
|
||||||
raw = null
|
vectors = null
|
||||||
init_tok2vec = null
|
init_tok2vec = null
|
||||||
|
|
||||||
[system]
|
[system]
|
||||||
|
@ -10,8 +10,13 @@ gpu_allocator = null
|
||||||
|
|
||||||
[nlp]
|
[nlp]
|
||||||
lang = null
|
lang = null
|
||||||
|
# List of pipeline component names, in order. The names should correspond to
|
||||||
|
# components defined in the [components block]
|
||||||
pipeline = []
|
pipeline = []
|
||||||
|
# Components that are loaded but disabled by default
|
||||||
disabled = []
|
disabled = []
|
||||||
|
# Optional callbacks to modify the nlp object before it's initialized, after
|
||||||
|
# it's created and after the pipeline has been set up
|
||||||
before_creation = null
|
before_creation = null
|
||||||
after_creation = null
|
after_creation = null
|
||||||
after_pipeline_creation = null
|
after_pipeline_creation = null
|
||||||
|
@ -19,6 +24,7 @@ after_pipeline_creation = null
|
||||||
[nlp.tokenizer]
|
[nlp.tokenizer]
|
||||||
@tokenizers = "spacy.Tokenizer.v1"
|
@tokenizers = "spacy.Tokenizer.v1"
|
||||||
|
|
||||||
|
# The pipeline components and their models
|
||||||
[components]
|
[components]
|
||||||
|
|
||||||
# Readers for corpora like dev and train.
|
# Readers for corpora like dev and train.
|
||||||
|
@ -37,9 +43,8 @@ max_length = 0
|
||||||
limit = 0
|
limit = 0
|
||||||
# Apply some simple data augmentation, where we replace tokens with variations.
|
# Apply some simple data augmentation, where we replace tokens with variations.
|
||||||
# This is especially useful for punctuation and case replacement, to help
|
# This is especially useful for punctuation and case replacement, to help
|
||||||
# generalize beyond corpora that don't have smart-quotes, or only have smart
|
# generalize beyond corpora that don't/only have smart quotes etc.
|
||||||
# quotes, etc.
|
augmenter = null
|
||||||
augmenter = {"@augmenters": "spacy.orth_variants.v1", "level": 0.1, "lower": 0.5}
|
|
||||||
|
|
||||||
[corpora.dev]
|
[corpora.dev]
|
||||||
@readers = "spacy.Corpus.v1"
|
@readers = "spacy.Corpus.v1"
|
||||||
|
@ -52,6 +57,8 @@ gold_preproc = false
|
||||||
max_length = 0
|
max_length = 0
|
||||||
# Limitation on number of training examples
|
# Limitation on number of training examples
|
||||||
limit = 0
|
limit = 0
|
||||||
|
# Optional callback for data augmentation
|
||||||
|
augmenter = null
|
||||||
|
|
||||||
# Training hyper-parameters and additional features.
|
# Training hyper-parameters and additional features.
|
||||||
[training]
|
[training]
|
||||||
|
@ -59,11 +66,6 @@ seed = ${system.seed}
|
||||||
gpu_allocator = ${system.gpu_allocator}
|
gpu_allocator = ${system.gpu_allocator}
|
||||||
dropout = 0.1
|
dropout = 0.1
|
||||||
accumulate_gradient = 1
|
accumulate_gradient = 1
|
||||||
# Extra resources for transfer-learning or pseudo-rehearsal
|
|
||||||
init_tok2vec = ${paths.init_tok2vec}
|
|
||||||
raw_text = ${paths.raw}
|
|
||||||
vectors = null
|
|
||||||
lookups = null
|
|
||||||
# Controls early-stopping. 0 or -1 mean unlimited.
|
# Controls early-stopping. 0 or -1 mean unlimited.
|
||||||
patience = 1600
|
patience = 1600
|
||||||
max_epochs = 0
|
max_epochs = 0
|
||||||
|
@ -104,3 +106,19 @@ grad_clip = 1.0
|
||||||
use_averages = false
|
use_averages = false
|
||||||
eps = 1e-8
|
eps = 1e-8
|
||||||
learn_rate = 0.001
|
learn_rate = 0.001
|
||||||
|
|
||||||
|
# These settings are used when nlp.initialize() is called (typically before
|
||||||
|
# training or pretraining). Components and the tokenizer can each define their
|
||||||
|
# own arguments via their initialize methods that are populated by the config.
|
||||||
|
# This lets them gather data resources, build label sets etc.
|
||||||
|
[initialize]
|
||||||
|
vectors = ${paths.vectors}
|
||||||
|
# Extra resources for transfer-learning or pseudo-rehearsal
|
||||||
|
init_tok2vec = ${paths.init_tok2vec}
|
||||||
|
# Data and lookups for vocabulary
|
||||||
|
vocab_data = null
|
||||||
|
lookups = null
|
||||||
|
# Arguments passed to the tokenizer's initialize method
|
||||||
|
tokenizer = {}
|
||||||
|
# Arguments for initialize methods of the components (keyed by component)
|
||||||
|
components = {}
|
||||||
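The new [initialize] block mostly forwards values from [paths] via variable interpolation. A trimmed-down sketch using thinc's Config to show how that resolves; the two-section config here is only for illustration:

from thinc.api import Config

CFG = """
[paths]
vectors = null
init_tok2vec = null

[initialize]
vectors = ${paths.vectors}
init_tok2vec = ${paths.init_tok2vec}
"""
config = Config().from_str(CFG, interpolate=False)  # keep the ${...} variables
resolved = config.interpolate()
# resolved["initialize"]["vectors"] -> None, the null from [paths] flows through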
|
|
|
@ -1,3 +1,6 @@
|
||||||
|
[paths]
|
||||||
|
raw_text = null
|
||||||
|
|
||||||
[pretraining]
|
[pretraining]
|
||||||
max_epochs = 1000
|
max_epochs = 1000
|
||||||
dropout = 0.2
|
dropout = 0.2
|
||||||
|
@ -31,8 +34,8 @@ learn_rate = 0.001
|
||||||
[corpora]
|
[corpora]
|
||||||
|
|
||||||
[corpora.pretrain]
|
[corpora.pretrain]
|
||||||
@readers = "spacy.JsonlReader.v1"
|
@readers = "spacy.JsonlCorpus.v1"
|
||||||
path = ${paths.raw}
|
path = ${paths.raw_text}
|
||||||
min_length = 5
|
min_length = 5
|
||||||
max_length = 500
|
max_length = 500
|
||||||
limit = 0
|
limit = 0
|
||||||
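The JsonlCorpus reader consumes newline-delimited JSON with a "text" field per record; a tiny sketch of producing such a file with srsly (the file name is arbitrary):

import srsly

texts = [
    {"text": "Raw text for pretraining goes here."},
    {"text": "One JSON object per line."},
]
srsly.write_jsonl("raw_text.jsonl", texts)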
|
|
|
@ -85,6 +85,7 @@ class Warnings:
|
||||||
"attribute or operator.")
|
"attribute or operator.")
|
||||||
|
|
||||||
# TODO: fix numbering after merging develop into master
|
# TODO: fix numbering after merging develop into master
|
||||||
|
W089 = ("The nlp.begin_training method has been renamed to nlp.initialize.")
|
||||||
W090 = ("Could not locate any {format} files in path '{path}'.")
|
W090 = ("Could not locate any {format} files in path '{path}'.")
|
||||||
W091 = ("Could not clean/remove the temp directory at {dir}: {msg}.")
|
W091 = ("Could not clean/remove the temp directory at {dir}: {msg}.")
|
||||||
W092 = ("Ignoring annotations for sentence starts, as dependency heads are set.")
|
W092 = ("Ignoring annotations for sentence starts, as dependency heads are set.")
|
||||||
|
@ -306,7 +307,7 @@ class Errors:
|
||||||
"settings: {opts}")
|
"settings: {opts}")
|
||||||
E107 = ("Value of doc._.{attr} is not JSON-serializable: {value}")
|
E107 = ("Value of doc._.{attr} is not JSON-serializable: {value}")
|
||||||
E109 = ("Component '{name}' could not be run. Did you forget to "
|
E109 = ("Component '{name}' could not be run. Did you forget to "
|
||||||
"call begin_training()?")
|
"call initialize()?")
|
||||||
E110 = ("Invalid displaCy render wrapper. Expected callable, got: {obj}")
|
E110 = ("Invalid displaCy render wrapper. Expected callable, got: {obj}")
|
||||||
E111 = ("Pickling a token is not supported, because tokens are only views "
|
E111 = ("Pickling a token is not supported, because tokens are only views "
|
||||||
"of the parent Doc and can't exist on their own. A pickled token "
|
"of the parent Doc and can't exist on their own. A pickled token "
|
||||||
|
@ -376,7 +377,7 @@ class Errors:
|
||||||
"provided {found}.")
|
"provided {found}.")
|
||||||
E143 = ("Labels for component '{name}' not initialized. This can be fixed "
|
E143 = ("Labels for component '{name}' not initialized. This can be fixed "
|
||||||
"by calling add_label, or by providing a representative batch of "
|
"by calling add_label, or by providing a representative batch of "
|
||||||
"examples to the component's begin_training method.")
|
"examples to the component's initialize method.")
|
||||||
E145 = ("Error reading `{param}` from input file.")
|
E145 = ("Error reading `{param}` from input file.")
|
||||||
E146 = ("Could not access `{path}`.")
|
E146 = ("Could not access `{path}`.")
|
||||||
E147 = ("Unexpected error in the {method} functionality of the "
|
E147 = ("Unexpected error in the {method} functionality of the "
|
||||||
|
@ -418,7 +419,7 @@ class Errors:
|
||||||
E164 = ("x is neither increasing nor decreasing: {}.")
|
E164 = ("x is neither increasing nor decreasing: {}.")
|
||||||
E165 = ("Only one class present in y_true. ROC AUC score is not defined in "
|
E165 = ("Only one class present in y_true. ROC AUC score is not defined in "
|
||||||
"that case.")
|
"that case.")
|
||||||
E166 = ("Can only merge DocBins with the same pre-defined attributes.\n"
|
E166 = ("Can only merge DocBins with the same value for '{param}'.\n"
|
||||||
"Current DocBin: {current}\nOther DocBin: {other}")
|
"Current DocBin: {current}\nOther DocBin: {other}")
|
||||||
E169 = ("Can't find module: {module}")
|
E169 = ("Can't find module: {module}")
|
||||||
E170 = ("Cannot apply transition {name}: invalid for the current state.")
|
E170 = ("Cannot apply transition {name}: invalid for the current state.")
|
||||||
|
@ -476,6 +477,10 @@ class Errors:
|
||||||
E201 = ("Span index out of range.")
|
E201 = ("Span index out of range.")
|
||||||
|
|
||||||
# TODO: fix numbering after merging develop into master
|
# TODO: fix numbering after merging develop into master
|
||||||
|
E912 = ("Failed to initialize lemmatizer. Missing lemmatizer table(s) found "
|
||||||
|
"for mode '{mode}'. Required tables: {tables}. Found: {found}.")
|
||||||
|
E913 = ("Corpus path can't be None. Maybe you forgot to define it in your "
|
||||||
|
"config.cfg or override it on the CLI?")
|
||||||
E914 = ("Executing {name} callback failed. Expected the function to "
|
E914 = ("Executing {name} callback failed. Expected the function to "
|
||||||
"return the nlp object but got: {value}. Maybe you forgot to return "
|
"return the nlp object but got: {value}. Maybe you forgot to return "
|
||||||
"the modified object in your function?")
|
"the modified object in your function?")
|
||||||
|
@ -517,7 +522,7 @@ class Errors:
|
||||||
"but the provided argument {loc} points to a file.")
|
"but the provided argument {loc} points to a file.")
|
||||||
E929 = ("A 'KnowledgeBase' could not be read from {loc} - the path does "
|
E929 = ("A 'KnowledgeBase' could not be read from {loc} - the path does "
|
||||||
"not seem to exist.")
|
"not seem to exist.")
|
||||||
E930 = ("Received invalid get_examples callback in {name}.begin_training. "
|
E930 = ("Received invalid get_examples callback in {name}.initialize. "
|
||||||
"Expected function that returns an iterable of Example objects but "
|
"Expected function that returns an iterable of Example objects but "
|
||||||
"got: {obj}")
|
"got: {obj}")
|
||||||
E931 = ("Encountered Pipe subclass without Pipe.{method} method in component "
|
E931 = ("Encountered Pipe subclass without Pipe.{method} method in component "
|
||||||
|
@ -553,7 +558,10 @@ class Errors:
|
||||||
E953 = ("Mismatched IDs received by the Tok2Vec listener: {id1} vs. {id2}")
|
E953 = ("Mismatched IDs received by the Tok2Vec listener: {id1} vs. {id2}")
|
||||||
E954 = ("The Tok2Vec listener did not receive any valid input from an upstream "
|
E954 = ("The Tok2Vec listener did not receive any valid input from an upstream "
|
||||||
"component.")
|
"component.")
|
||||||
E955 = ("Can't find table(s) '{table}' for language '{lang}' in spacy-lookups-data.")
|
E955 = ("Can't find table(s) {table} for language '{lang}' in "
|
||||||
|
"spacy-lookups-data. Make sure you have the package installed or "
|
||||||
|
"provide your own lookup tables if no default lookups are available "
|
||||||
|
"for your language.")
|
||||||
E956 = ("Can't find component '{name}' in [components] block in the config. "
|
E956 = ("Can't find component '{name}' in [components] block in the config. "
|
||||||
"Available components: {opts}")
|
"Available components: {opts}")
|
||||||
E957 = ("Writing directly to Language.factories isn't needed anymore in "
|
E957 = ("Writing directly to Language.factories isn't needed anymore in "
|
||||||
|
@ -670,18 +678,17 @@ class Errors:
|
||||||
"'{token_attrs}'.")
|
"'{token_attrs}'.")
|
||||||
E999 = ("Unable to merge the `Doc` objects because they do not all share "
|
E999 = ("Unable to merge the `Doc` objects because they do not all share "
|
||||||
"the same `Vocab`.")
|
"the same `Vocab`.")
|
||||||
E1000 = ("No pkuseg model available. Provide a pkuseg model when "
|
E1000 = ("The Chinese word segmenter is pkuseg but no pkuseg model was "
|
||||||
"initializing the pipeline:\n"
|
"loaded. Provide the name of a pretrained model or the path to "
|
||||||
'cfg = {"tokenizer": {"segmenter": "pkuseg", "pkuseg_model": name_or_path}}\n'
|
"a model and initialize the pipeline:\n\n"
|
||||||
'nlp = Chinese(config=cfg)')
|
'nlp.tokenizer.initialize(pkuseg_model="default")')
|
||||||
E1001 = ("Target token outside of matched span for match with tokens "
|
E1001 = ("Target token outside of matched span for match with tokens "
|
||||||
"'{span}' and offset '{index}' matched by patterns '{patterns}'.")
|
"'{span}' and offset '{index}' matched by patterns '{patterns}'.")
|
||||||
E1002 = ("Span index out of range.")
|
E1002 = ("Span index out of range.")
|
||||||
E1003 = ("Unsupported lemmatizer mode '{mode}'.")
|
E1003 = ("Unsupported lemmatizer mode '{mode}'.")
|
||||||
E1004 = ("Missing lemmatizer table(s) found for lemmatizer mode '{mode}'. "
|
E1004 = ("Missing lemmatizer table(s) found for lemmatizer mode '{mode}'. "
|
||||||
"Required tables '{tables}', found '{found}'. If you are not "
|
"Required tables: {tables}. Found: {found}. Maybe you forgot to "
|
||||||
"providing custom lookups, make sure you have the package "
|
"call nlp.initialize() to load in the data?")
|
||||||
"spacy-lookups-data installed.")
|
|
||||||
E1005 = ("Unable to set attribute '{attr}' in tokenizer exception for "
|
E1005 = ("Unable to set attribute '{attr}' in tokenizer exception for "
|
||||||
"'{chunk}'. Tokenizer exceptions are only allowed to specify "
|
"'{chunk}'. Tokenizer exceptions are only allowed to specify "
|
||||||
"`ORTH` and `NORM`.")
|
"`ORTH` and `NORM`.")
|
||||||
|
@ -698,6 +705,9 @@ class Errors:
|
||||||
"options: {modes}")
|
"options: {modes}")
|
||||||
E1012 = ("Entity spans and blocked/missing/outside spans should be "
|
E1012 = ("Entity spans and blocked/missing/outside spans should be "
|
||||||
"provided to doc.set_ents as lists of `Span` objects.")
|
"provided to doc.set_ents as lists of `Span` objects.")
|
||||||
|
E1013 = ("Invalid morph: the MorphAnalysis must have the same vocab as the "
|
||||||
|
"token itself. To set the morph from this MorphAnalysis, set from "
|
||||||
|
"the string value with: `token.set_morph(str(other_morph))`.")
|
||||||
|
|
||||||
|
|
||||||
@add_codes
|
@add_codes
|
||||||
|
|
|
@ -4,7 +4,6 @@ from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
|
||||||
from .punctuation import TOKENIZER_PREFIXES, TOKENIZER_SUFFIXES, TOKENIZER_INFIXES
|
from .punctuation import TOKENIZER_PREFIXES, TOKENIZER_SUFFIXES, TOKENIZER_INFIXES
|
||||||
from .stop_words import STOP_WORDS
|
from .stop_words import STOP_WORDS
|
||||||
from ...language import Language
|
from ...language import Language
|
||||||
from ...lookups import Lookups
|
|
||||||
from ...pipeline import Lemmatizer
|
from ...pipeline import Lemmatizer
|
||||||
|
|
||||||
|
|
||||||
|
@ -24,18 +23,11 @@ class Bengali(Language):
|
||||||
@Bengali.factory(
|
@Bengali.factory(
|
||||||
"lemmatizer",
|
"lemmatizer",
|
||||||
assigns=["token.lemma"],
|
assigns=["token.lemma"],
|
||||||
default_config={"model": None, "mode": "rule", "lookups": None},
|
default_config={"model": None, "mode": "rule"},
|
||||||
default_score_weights={"lemma_acc": 1.0},
|
default_score_weights={"lemma_acc": 1.0},
|
||||||
)
|
)
|
||||||
def make_lemmatizer(
|
def make_lemmatizer(nlp: Language, model: Optional[Model], name: str, mode: str):
|
||||||
nlp: Language,
|
return Lemmatizer(nlp.vocab, model, name, mode=mode)
|
||||||
model: Optional[Model],
|
|
||||||
name: str,
|
|
||||||
mode: str,
|
|
||||||
lookups: Optional[Lookups],
|
|
||||||
):
|
|
||||||
lookups = Lemmatizer.load_lookups(nlp.lang, mode, lookups)
|
|
||||||
return Lemmatizer(nlp.vocab, model, name, mode=mode, lookups=lookups)
|
|
||||||
|
|
||||||
|
|
||||||
__all__ = ["Bengali"]
|
__all__ = ["Bengali"]
|
||||||
|
|
|
@ -7,7 +7,6 @@ from .lex_attrs import LEX_ATTRS
|
||||||
from .syntax_iterators import SYNTAX_ITERATORS
|
from .syntax_iterators import SYNTAX_ITERATORS
|
||||||
from .punctuation import TOKENIZER_PREFIXES, TOKENIZER_SUFFIXES, TOKENIZER_INFIXES
|
from .punctuation import TOKENIZER_PREFIXES, TOKENIZER_SUFFIXES, TOKENIZER_INFIXES
|
||||||
from .lemmatizer import GreekLemmatizer
|
from .lemmatizer import GreekLemmatizer
|
||||||
from ...lookups import Lookups
|
|
||||||
from ...language import Language
|
from ...language import Language
|
||||||
|
|
||||||
|
|
||||||
|
@ -29,18 +28,11 @@ class Greek(Language):
|
||||||
@Greek.factory(
|
@Greek.factory(
|
||||||
"lemmatizer",
|
"lemmatizer",
|
||||||
assigns=["token.lemma"],
|
assigns=["token.lemma"],
|
||||||
default_config={"model": None, "mode": "rule", "lookups": None},
|
default_config={"model": None, "mode": "rule"},
|
||||||
default_score_weights={"lemma_acc": 1.0},
|
default_score_weights={"lemma_acc": 1.0},
|
||||||
)
|
)
|
||||||
def make_lemmatizer(
|
def make_lemmatizer(nlp: Language, model: Optional[Model], name: str, mode: str):
|
||||||
nlp: Language,
|
return GreekLemmatizer(nlp.vocab, model, name, mode=mode)
|
||||||
model: Optional[Model],
|
|
||||||
name: str,
|
|
||||||
mode: str,
|
|
||||||
lookups: Optional[Lookups],
|
|
||||||
):
|
|
||||||
lookups = GreekLemmatizer.load_lookups(nlp.lang, mode, lookups)
|
|
||||||
return GreekLemmatizer(nlp.vocab, model, name, mode=mode, lookups=lookups)
|
|
||||||
|
|
||||||
|
|
||||||
__all__ = ["Greek"]
|
__all__ = ["Greek"]
|
||||||
|
|
|
@ -1,5 +1,4 @@
|
||||||
from typing import Optional
|
from typing import Optional
|
||||||
|
|
||||||
from thinc.api import Model
|
from thinc.api import Model
|
||||||
|
|
||||||
from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
|
from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
|
||||||
|
@ -9,7 +8,6 @@ from .syntax_iterators import SYNTAX_ITERATORS
|
||||||
from .punctuation import TOKENIZER_INFIXES
|
from .punctuation import TOKENIZER_INFIXES
|
||||||
from .lemmatizer import EnglishLemmatizer
|
from .lemmatizer import EnglishLemmatizer
|
||||||
from ...language import Language
|
from ...language import Language
|
||||||
from ...lookups import Lookups
|
|
||||||
|
|
||||||
|
|
||||||
class EnglishDefaults(Language.Defaults):
|
class EnglishDefaults(Language.Defaults):
|
||||||
|
@ -28,18 +26,11 @@ class English(Language):
|
||||||
@English.factory(
|
@English.factory(
|
||||||
"lemmatizer",
|
"lemmatizer",
|
||||||
assigns=["token.lemma"],
|
assigns=["token.lemma"],
|
||||||
default_config={"model": None, "mode": "rule", "lookups": None},
|
default_config={"model": None, "mode": "rule"},
|
||||||
default_score_weights={"lemma_acc": 1.0},
|
default_score_weights={"lemma_acc": 1.0},
|
||||||
)
|
)
|
||||||
def make_lemmatizer(
|
def make_lemmatizer(nlp: Language, model: Optional[Model], name: str, mode: str):
|
||||||
nlp: Language,
|
return EnglishLemmatizer(nlp.vocab, model, name, mode=mode)
|
||||||
model: Optional[Model],
|
|
||||||
name: str,
|
|
||||||
mode: str,
|
|
||||||
lookups: Optional[Lookups],
|
|
||||||
):
|
|
||||||
lookups = EnglishLemmatizer.load_lookups(nlp.lang, mode, lookups)
|
|
||||||
return EnglishLemmatizer(nlp.vocab, model, name, mode=mode, lookups=lookups)
|
|
||||||
|
|
||||||
|
|
||||||
__all__ = ["English"]
|
__all__ = ["English"]
|
||||||
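With the slimmed-down factory, lookup tables are no longer passed at construction time. A rough usage sketch; the API calls follow the nightly docs, so treat the details as assumptions:

import spacy

nlp = spacy.blank("en")
nlp.add_pipe("lemmatizer", config={"mode": "rule"})
# Tables are loaded when the pipeline is initialized (typically from the
# spacy-lookups-data package) rather than via a "lookups" factory argument:
nlp.initialize()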
|
|
|
@ -3,8 +3,7 @@ from ...tokens import Token
|
||||||
|
|
||||||
|
|
||||||
class EnglishLemmatizer(Lemmatizer):
|
class EnglishLemmatizer(Lemmatizer):
|
||||||
"""English lemmatizer. Only overrides is_base_form.
|
"""English lemmatizer. Only overrides is_base_form."""
|
||||||
"""
|
|
||||||
|
|
||||||
def is_base_form(self, token: Token) -> bool:
|
def is_base_form(self, token: Token) -> bool:
|
||||||
"""
|
"""
|
||||||
|
|
|
@ -58,7 +58,7 @@ def noun_bounds(
|
||||||
doc, token, np_left_deps, np_right_deps, stop_deps
|
doc, token, np_left_deps, np_right_deps, stop_deps
|
||||||
)
|
)
|
||||||
filter_func = lambda t: is_verb_token(t) or t.dep in stop_deps
|
filter_func = lambda t: is_verb_token(t) or t.dep in stop_deps
|
||||||
if list(filter(filter_func, doc[left_bound.i : right.i],)):
|
if list(filter(filter_func, doc[left_bound.i : right.i])):
|
||||||
break
|
break
|
||||||
else:
|
else:
|
||||||
right_bound = right
|
right_bound = right
|
||||||
|
|
|
@@ -6,7 +6,6 @@ from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
 from .punctuation import TOKENIZER_SUFFIXES
 from .syntax_iterators import SYNTAX_ITERATORS
 from ...language import Language
-from ...lookups import Lookups
 from ...pipeline import Lemmatizer
 
 
@@ -27,18 +26,11 @@ class Persian(Language):
 @Persian.factory(
     "lemmatizer",
     assigns=["token.lemma"],
-    default_config={"model": None, "mode": "rule", "lookups": None},
+    default_config={"model": None, "mode": "rule"},
     default_score_weights={"lemma_acc": 1.0},
 )
-def make_lemmatizer(
-    nlp: Language,
-    model: Optional[Model],
-    name: str,
-    mode: str,
-    lookups: Optional[Lookups],
-):
-    lookups = Lemmatizer.load_lookups(nlp.lang, mode, lookups)
-    return Lemmatizer(nlp.vocab, model, name, mode=mode, lookups=lookups)
+def make_lemmatizer(nlp: Language, model: Optional[Model], name: str, mode: str):
+    return Lemmatizer(nlp.vocab, model, name, mode=mode)
 
 
 __all__ = ["Persian"]

@@ -9,7 +9,6 @@ from .stop_words import STOP_WORDS
 from .lex_attrs import LEX_ATTRS
 from .syntax_iterators import SYNTAX_ITERATORS
 from .lemmatizer import FrenchLemmatizer
-from ...lookups import Lookups
 from ...language import Language
 
 
@@ -32,18 +31,11 @@ class French(Language):
 @French.factory(
     "lemmatizer",
     assigns=["token.lemma"],
-    default_config={"model": None, "mode": "rule", "lookups": None},
+    default_config={"model": None, "mode": "rule"},
     default_score_weights={"lemma_acc": 1.0},
 )
-def make_lemmatizer(
-    nlp: Language,
-    model: Optional[Model],
-    name: str,
-    mode: str,
-    lookups: Optional[Lookups],
-):
-    lookups = FrenchLemmatizer.load_lookups(nlp.lang, mode, lookups)
-    return FrenchLemmatizer(nlp.vocab, model, name, mode=mode, lookups=lookups)
+def make_lemmatizer(nlp: Language, model: Optional[Model], name: str, mode: str):
+    return FrenchLemmatizer(nlp.vocab, model, name, mode=mode)
 
 
 __all__ = ["French"]

@@ -2,7 +2,6 @@ from typing import Optional, Union, Dict, Any
 from pathlib import Path
 import srsly
 from collections import namedtuple
-from thinc.api import Config
 
 from .stop_words import STOP_WORDS
 from .syntax_iterators import SYNTAX_ITERATORS
@@ -16,7 +15,7 @@ from ...scorer import Scorer
 from ...symbols import POS
 from ...tokens import Doc
 from ...training import validate_examples
-from ...util import DummyTokenizer, registry
+from ...util import DummyTokenizer, registry, load_config_from_str
 from ... import util
 
 
@@ -166,7 +165,7 @@ class JapaneseTokenizer(DummyTokenizer):
 
 
 class JapaneseDefaults(Language.Defaults):
-    config = Config().from_str(DEFAULT_CONFIG)
+    config = load_config_from_str(DEFAULT_CONFIG)
     stop_words = STOP_WORDS
     syntax_iterators = SYNTAX_ITERATORS
     writing_system = {"direction": "ltr", "has_case": False, "has_letters": False}

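[Editor's note] Several language defaults above and below replace Config().from_str(DEFAULT_CONFIG) with load_config_from_str(DEFAULT_CONFIG), imported from spacy.util alongside DummyTokenizer and registry. As a rough sketch, such a helper can be little more than a thin wrapper around thinc's Config; the exact signature and defaults in spacy.util may differ:

    from thinc.api import Config


    def load_config_from_str(text: str, overrides: dict = {}, interpolate: bool = False):
        # parse a config string, as used for the DEFAULT_CONFIG strings above
        return Config().from_str(text, overrides=overrides, interpolate=interpolate)
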
@@ -1,5 +1,4 @@
 from typing import Optional, Any, Dict
-from thinc.api import Config
 
 from .stop_words import STOP_WORDS
 from .tag_map import TAG_MAP
@@ -10,7 +9,7 @@ from ...compat import copy_reg
 from ...scorer import Scorer
 from ...symbols import POS
 from ...training import validate_examples
-from ...util import DummyTokenizer, registry
+from ...util import DummyTokenizer, registry, load_config_from_str
 
 
 DEFAULT_CONFIG = """
@@ -70,7 +69,7 @@ class KoreanTokenizer(DummyTokenizer):
 
 
 class KoreanDefaults(Language.Defaults):
-    config = Config().from_str(DEFAULT_CONFIG)
+    config = load_config_from_str(DEFAULT_CONFIG)
     lex_attr_getters = LEX_ATTRS
     stop_words = STOP_WORDS
     writing_system = {"direction": "ltr", "has_case": False, "has_letters": False}

@@ -6,7 +6,6 @@ from .punctuation import TOKENIZER_SUFFIXES
 from .stop_words import STOP_WORDS
 from .syntax_iterators import SYNTAX_ITERATORS
 from ...language import Language
-from ...lookups import Lookups
 from ...pipeline import Lemmatizer
 
 
@@ -27,18 +26,11 @@ class Norwegian(Language):
 @Norwegian.factory(
     "lemmatizer",
     assigns=["token.lemma"],
-    default_config={"model": None, "mode": "rule", "lookups": None},
+    default_config={"model": None, "mode": "rule"},
     default_score_weights={"lemma_acc": 1.0},
 )
-def make_lemmatizer(
-    nlp: Language,
-    model: Optional[Model],
-    name: str,
-    mode: str,
-    lookups: Optional[Lookups],
-):
-    lookups = Lemmatizer.load_lookups(nlp.lang, mode, lookups)
-    return Lemmatizer(nlp.vocab, model, name, mode=mode, lookups=lookups)
+def make_lemmatizer(nlp: Language, model: Optional[Model], name: str, mode: str):
+    return Lemmatizer(nlp.vocab, model, name, mode=mode)
 
 
 __all__ = ["Norwegian"]

@@ -1,5 +1,4 @@
 from typing import Optional
-
 from thinc.api import Model
 
 from .stop_words import STOP_WORDS
@@ -8,7 +7,6 @@ from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
 from .punctuation import TOKENIZER_PREFIXES, TOKENIZER_INFIXES
 from .punctuation import TOKENIZER_SUFFIXES
 from .lemmatizer import DutchLemmatizer
-from ...lookups import Lookups
 from ...language import Language
 
 
@@ -29,18 +27,11 @@ class Dutch(Language):
 @Dutch.factory(
     "lemmatizer",
     assigns=["token.lemma"],
-    default_config={"model": None, "mode": "rule", "lookups": None},
+    default_config={"model": None, "mode": "rule"},
     default_score_weights={"lemma_acc": 1.0},
 )
-def make_lemmatizer(
-    nlp: Language,
-    model: Optional[Model],
-    name: str,
-    mode: str,
-    lookups: Optional[Lookups],
-):
-    lookups = DutchLemmatizer.load_lookups(nlp.lang, mode, lookups)
-    return DutchLemmatizer(nlp.vocab, model, name, mode=mode, lookups=lookups)
+def make_lemmatizer(nlp: Language, model: Optional[Model], name: str, mode: str):
+    return DutchLemmatizer(nlp.vocab, model, name, mode=mode)
 
 
 __all__ = ["Dutch"]

@@ -34,18 +34,11 @@ class Polish(Language):
 @Polish.factory(
     "lemmatizer",
     assigns=["token.lemma"],
-    default_config={"model": None, "mode": "pos_lookup", "lookups": None},
+    default_config={"model": None, "mode": "pos_lookup"},
     default_score_weights={"lemma_acc": 1.0},
 )
-def make_lemmatizer(
-    nlp: Language,
-    model: Optional[Model],
-    name: str,
-    mode: str,
-    lookups: Optional[Lookups],
-):
-    lookups = PolishLemmatizer.load_lookups(nlp.lang, mode, lookups)
-    return PolishLemmatizer(nlp.vocab, model, name, mode=mode, lookups=lookups)
+def make_lemmatizer(nlp: Language, model: Optional[Model], name: str, mode: str):
+    return PolishLemmatizer(nlp.vocab, model, name, mode=mode)
 
 
 __all__ = ["Polish"]

@@ -1,5 +1,4 @@
 from typing import Optional
-
 from thinc.api import Model
 
 from .stop_words import STOP_WORDS
@@ -7,7 +6,6 @@ from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
 from .lex_attrs import LEX_ATTRS
 from .lemmatizer import RussianLemmatizer
 from ...language import Language
-from ...lookups import Lookups
 
 
 class RussianDefaults(Language.Defaults):
@@ -24,17 +22,11 @@ class Russian(Language):
 @Russian.factory(
     "lemmatizer",
     assigns=["token.lemma"],
-    default_config={"model": None, "mode": "pymorphy2", "lookups": None},
+    default_config={"model": None, "mode": "pymorphy2"},
     default_score_weights={"lemma_acc": 1.0},
 )
-def make_lemmatizer(
-    nlp: Language,
-    model: Optional[Model],
-    name: str,
-    mode: str,
-    lookups: Optional[Lookups],
-):
-    return RussianLemmatizer(nlp.vocab, model, name, mode=mode, lookups=lookups)
+def make_lemmatizer(nlp: Language, model: Optional[Model], name: str, mode: str):
+    return RussianLemmatizer(nlp.vocab, model, name, mode=mode)
 
 
 __all__ = ["Russian"]

@@ -108,8 +108,8 @@ _num_words = [
 
 def like_num(text):
     """
     Check if text resembles a number
     """
     if text.startswith(("+", "-", "±", "~")):
         text = text[1:]
     text = text.replace(",", "").replace(".", "")

@@ -5,7 +5,6 @@ from .stop_words import STOP_WORDS
 from .lex_attrs import LEX_ATTRS
 from .syntax_iterators import SYNTAX_ITERATORS
 from ...language import Language
-from ...lookups import Lookups
 from ...pipeline import Lemmatizer
 
 
@@ -30,18 +29,11 @@ class Swedish(Language):
 @Swedish.factory(
     "lemmatizer",
     assigns=["token.lemma"],
-    default_config={"model": None, "mode": "rule", "lookups": None},
+    default_config={"model": None, "mode": "rule"},
     default_score_weights={"lemma_acc": 1.0},
 )
-def make_lemmatizer(
-    nlp: Language,
-    model: Optional[Model],
-    name: str,
-    mode: str,
-    lookups: Optional[Lookups],
-):
-    lookups = Lemmatizer.load_lookups(nlp.lang, mode, lookups)
-    return Lemmatizer(nlp.vocab, model, name, mode=mode, lookups=lookups)
+def make_lemmatizer(nlp: Language, model: Optional[Model], name: str, mode: str):
+    return Lemmatizer(nlp.vocab, model, name, mode=mode)
 
 
 __all__ = ["Swedish"]

@@ -1,10 +1,8 @@
-from thinc.api import Config
-
 from .stop_words import STOP_WORDS
 from .lex_attrs import LEX_ATTRS
 from ...language import Language
 from ...tokens import Doc
-from ...util import DummyTokenizer, registry
+from ...util import DummyTokenizer, registry, load_config_from_str
 
 
 DEFAULT_CONFIG = """
@@ -42,7 +40,7 @@ class ThaiTokenizer(DummyTokenizer):
 
 
 class ThaiDefaults(Language.Defaults):
-    config = Config().from_str(DEFAULT_CONFIG)
+    config = load_config_from_str(DEFAULT_CONFIG)
     lex_attr_getters = LEX_ATTRS
     stop_words = STOP_WORDS
 

@@ -7,7 +7,6 @@ from .stop_words import STOP_WORDS
 from .lex_attrs import LEX_ATTRS
 from .lemmatizer import UkrainianLemmatizer
 from ...language import Language
-from ...lookups import Lookups
 
 
 class UkrainianDefaults(Language.Defaults):
@@ -24,17 +23,11 @@ class Ukrainian(Language):
 @Ukrainian.factory(
     "lemmatizer",
     assigns=["token.lemma"],
-    default_config={"model": None, "mode": "pymorphy2", "lookups": None},
+    default_config={"model": None, "mode": "pymorphy2"},
     default_score_weights={"lemma_acc": 1.0},
 )
-def make_lemmatizer(
-    nlp: Language,
-    model: Optional[Model],
-    name: str,
-    mode: str,
-    lookups: Optional[Lookups],
-):
-    return UkrainianLemmatizer(nlp.vocab, model, name, mode=mode, lookups=lookups)
+def make_lemmatizer(nlp: Language, model: Optional[Model], name: str, mode: str):
+    return UkrainianLemmatizer(nlp.vocab, model, name, mode=mode)
 
 
 __all__ = ["Ukrainian"]

@@ -1,10 +1,8 @@
-from thinc.api import Config
-
+from .stop_words import STOP_WORDS
+from .lex_attrs import LEX_ATTRS
 from ...language import Language
 from ...tokens import Doc
-from .stop_words import STOP_WORDS
-from ...util import DummyTokenizer, registry
-from .lex_attrs import LEX_ATTRS
+from ...util import DummyTokenizer, registry, load_config_from_str
 
 
 DEFAULT_CONFIG = """
@@ -17,7 +15,7 @@ use_pyvi = true
 
 
 @registry.tokenizers("spacy.vi.VietnameseTokenizer")
-def create_vietnamese_tokenizer(use_pyvi: bool = True,):
+def create_vietnamese_tokenizer(use_pyvi: bool = True):
     def vietnamese_tokenizer_factory(nlp):
         return VietnameseTokenizer(nlp, use_pyvi=use_pyvi)
 
@@ -55,7 +53,7 @@ class VietnameseTokenizer(DummyTokenizer):
 
 
 class VietnameseDefaults(Language.Defaults):
-    config = Config().from_str(DEFAULT_CONFIG)
+    config = load_config_from_str(DEFAULT_CONFIG)
     lex_attr_getters = LEX_ATTRS
     stop_words = STOP_WORDS
 

@@ -1,23 +1,25 @@
-from typing import Optional, List, Dict, Any
+from typing import Optional, List, Dict, Any, Callable, Iterable
 from enum import Enum
 import tempfile
 import srsly
 import warnings
 from pathlib import Path
-from thinc.api import Config
 
 from ...errors import Warnings, Errors
 from ...language import Language
 from ...scorer import Scorer
 from ...tokens import Doc
-from ...training import validate_examples
-from ...util import DummyTokenizer, registry
+from ...training import validate_examples, Example
+from ...util import DummyTokenizer, registry, load_config_from_str
 from .lex_attrs import LEX_ATTRS
 from .stop_words import STOP_WORDS
 from ... import util
 
 
-_PKUSEG_INSTALL_MSG = "install it with `pip install pkuseg==0.0.25` or from https://github.com/lancopku/pkuseg-python"
+# fmt: off
+_PKUSEG_INSTALL_MSG = "install pkuseg and pickle5 with `pip install pkuseg==0.0.25 pickle5`"
+_PKUSEG_PICKLE_WARNING = "Failed to force pkuseg model to use pickle protocol 4. If you're saving this model with python 3.8, it may not work with python 3.6-3.7."
+# fmt: on
 
 DEFAULT_CONFIG = """
 [nlp]
@@ -25,6 +27,10 @@ DEFAULT_CONFIG = """
 [nlp.tokenizer]
 @tokenizers = "spacy.zh.ChineseTokenizer"
 segmenter = "char"
+
+[initialize]
+
+[initialize.tokenizer]
 pkuseg_model = null
 pkuseg_user_dict = "default"
 """
@@ -41,41 +47,23 @@ class Segmenter(str, Enum):
 
 
 @registry.tokenizers("spacy.zh.ChineseTokenizer")
-def create_chinese_tokenizer(
-    segmenter: Segmenter = Segmenter.char,
-    pkuseg_model: Optional[str] = None,
-    pkuseg_user_dict: Optional[str] = "default",
-):
+def create_chinese_tokenizer(segmenter: Segmenter = Segmenter.char,):
     def chinese_tokenizer_factory(nlp):
-        return ChineseTokenizer(
-            nlp,
-            segmenter=segmenter,
-            pkuseg_model=pkuseg_model,
-            pkuseg_user_dict=pkuseg_user_dict,
-        )
+        return ChineseTokenizer(nlp, segmenter=segmenter)
 
     return chinese_tokenizer_factory
 
 
 class ChineseTokenizer(DummyTokenizer):
     def __init__(
-        self,
-        nlp: Language,
-        segmenter: Segmenter = Segmenter.char,
-        pkuseg_model: Optional[str] = None,
-        pkuseg_user_dict: Optional[str] = None,
+        self, nlp: Language, segmenter: Segmenter = Segmenter.char,
     ):
         self.vocab = nlp.vocab
-        if isinstance(segmenter, Segmenter):  # we might have the Enum here
+        if isinstance(segmenter, Segmenter):
            segmenter = segmenter.value
         self.segmenter = segmenter
-        self.pkuseg_model = pkuseg_model
-        self.pkuseg_user_dict = pkuseg_user_dict
         self.pkuseg_seg = None
         self.jieba_seg = None
-        self.configure_segmenter(segmenter)
-
-    def configure_segmenter(self, segmenter: str):
        if segmenter not in Segmenter.values():
            warn_msg = Warnings.W103.format(
                lang="Chinese",
@@ -85,12 +73,21 @@ class ChineseTokenizer(DummyTokenizer):
            )
            warnings.warn(warn_msg)
            self.segmenter = Segmenter.char
-        self.jieba_seg = try_jieba_import(self.segmenter)
-        self.pkuseg_seg = try_pkuseg_import(
-            self.segmenter,
-            pkuseg_model=self.pkuseg_model,
-            pkuseg_user_dict=self.pkuseg_user_dict,
-        )
+        if segmenter == Segmenter.jieba:
+            self.jieba_seg = try_jieba_import()
+
+    def initialize(
+        self,
+        get_examples: Optional[Callable[[], Iterable[Example]]] = None,
+        *,
+        nlp: Optional[Language] = None,
+        pkuseg_model: Optional[str] = None,
+        pkuseg_user_dict: str = "default",
+    ):
+        if self.segmenter == Segmenter.pkuseg:
+            self.pkuseg_seg = try_pkuseg_import(
+                pkuseg_model=pkuseg_model, pkuseg_user_dict=pkuseg_user_dict,
+            )
 
     def __call__(self, text: str) -> Doc:
         if self.segmenter == Segmenter.jieba:
@@ -145,14 +142,10 @@ class ChineseTokenizer(DummyTokenizer):
     def _get_config(self) -> Dict[str, Any]:
         return {
             "segmenter": self.segmenter,
-            "pkuseg_model": self.pkuseg_model,
-            "pkuseg_user_dict": self.pkuseg_user_dict,
         }
 
     def _set_config(self, config: Dict[str, Any] = {}) -> None:
         self.segmenter = config.get("segmenter", Segmenter.char)
-        self.pkuseg_model = config.get("pkuseg_model", None)
-        self.pkuseg_user_dict = config.get("pkuseg_user_dict", "default")
 
     def to_bytes(self, **kwargs):
         pkuseg_features_b = b""
@@ -163,6 +156,22 @@ class ChineseTokenizer(DummyTokenizer):
                 self.pkuseg_seg.feature_extractor.save(tempdir)
                 self.pkuseg_seg.model.save(tempdir)
                 tempdir = Path(tempdir)
+                # pkuseg saves features.pkl with pickle.HIGHEST_PROTOCOL, which
+                # means that it will be saved with pickle protocol 5 with
+                # python 3.8, which can't be reloaded with python 3.6-3.7.
+                # To try to make the model compatible with python 3.6+, reload
+                # the data with pickle5 and convert it back to protocol 4.
+                try:
+                    import pickle5
+
+                    with open(tempdir / "features.pkl", "rb") as fileh:
+                        features = pickle5.load(fileh)
+                    with open(tempdir / "features.pkl", "wb") as fileh:
+                        pickle5.dump(features, fileh, protocol=4)
+                except ImportError as e:
+                    raise e
+                except Exception:
+                    warnings.warn(_PKUSEG_PICKLE_WARNING)
                 with open(tempdir / "features.pkl", "rb") as fileh:
                     pkuseg_features_b = fileh.read()
                 with open(tempdir / "weights.npz", "rb") as fileh:
@@ -235,6 +244,18 @@ class ChineseTokenizer(DummyTokenizer):
             path.mkdir(parents=True)
             self.pkuseg_seg.model.save(path)
             self.pkuseg_seg.feature_extractor.save(path)
+            # try to convert features.pkl to pickle protocol 4
+            try:
+                import pickle5
+
+                with open(path / "features.pkl", "rb") as fileh:
+                    features = pickle5.load(fileh)
+                with open(path / "features.pkl", "wb") as fileh:
+                    pickle5.dump(features, fileh, protocol=4)
+            except ImportError as e:
+                raise e
+            except Exception:
+                warnings.warn(_PKUSEG_PICKLE_WARNING)
 
         def save_pkuseg_processors(path):
             if self.pkuseg_seg:
@@ -291,7 +312,7 @@ class ChineseTokenizer(DummyTokenizer):
 
 
 class ChineseDefaults(Language.Defaults):
-    config = Config().from_str(DEFAULT_CONFIG)
+    config = load_config_from_str(DEFAULT_CONFIG)
     lex_attr_getters = LEX_ATTRS
     stop_words = STOP_WORDS
     writing_system = {"direction": "ltr", "has_case": False, "has_letters": False}
@@ -302,47 +323,33 @@ class Chinese(Language):
     Defaults = ChineseDefaults
 
 
-def try_jieba_import(segmenter: str) -> None:
+def try_jieba_import() -> None:
     try:
         import jieba
 
-        if segmenter == Segmenter.jieba:
-            # segment a short text to have jieba initialize its cache in advance
-            list(jieba.cut("作为", cut_all=False))
+        # segment a short text to have jieba initialize its cache in advance
+        list(jieba.cut("作为", cut_all=False))
 
         return jieba
     except ImportError:
-        if segmenter == Segmenter.jieba:
-            msg = (
-                "Jieba not installed. To use jieba, install it with `pip "
-                " install jieba` or from https://github.com/fxsjy/jieba"
-            )
-            raise ImportError(msg) from None
+        msg = (
+            "Jieba not installed. To use jieba, install it with `pip "
+            " install jieba` or from https://github.com/fxsjy/jieba"
+        )
+        raise ImportError(msg) from None
 
 
-def try_pkuseg_import(segmenter: str, pkuseg_model: str, pkuseg_user_dict: str) -> None:
+def try_pkuseg_import(pkuseg_model: str, pkuseg_user_dict: str) -> None:
     try:
         import pkuseg
 
-        if pkuseg_model:
-            return pkuseg.pkuseg(pkuseg_model, pkuseg_user_dict)
-        elif segmenter == Segmenter.pkuseg:
-            msg = (
-                "The Chinese word segmenter is 'pkuseg' but no pkuseg model "
-                "was specified. Please provide the name of a pretrained model "
-                "or the path to a model with:\n"
-                'cfg = {"nlp": {"tokenizer": {"segmenter": "pkuseg", "pkuseg_model": name_or_path }}\n'
-                "nlp = Chinese.from_config(cfg)"
-            )
-            raise ValueError(msg)
+        return pkuseg.pkuseg(pkuseg_model, pkuseg_user_dict)
     except ImportError:
-        if segmenter == Segmenter.pkuseg:
-            msg = "pkuseg not installed. To use pkuseg, " + _PKUSEG_INSTALL_MSG
-            raise ImportError(msg) from None
+        msg = "pkuseg not installed. To use pkuseg, " + _PKUSEG_INSTALL_MSG
+        raise ImportError(msg) from None
    except FileNotFoundError:
-        if segmenter == Segmenter.pkuseg:
-            msg = "Unable to load pkuseg model from: " + pkuseg_model
-            raise FileNotFoundError(msg) from None
+        msg = "Unable to load pkuseg model from: " + pkuseg_model
+        raise FileNotFoundError(msg) from None
 
 
 def _get_pkuseg_trie_data(node, path=""):

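[Editor's note] With the changes above, ChineseTokenizer no longer takes pkuseg_model/pkuseg_user_dict in its constructor or serialized config; they are supplied through the new [initialize.tokenizer] block and only applied when the pipeline is initialized. A hedged sketch of the corresponding user config, mirroring the DEFAULT_CONFIG string above; the variable name and the model path are placeholders:

    PKUSEG_CONFIG = """
    [nlp.tokenizer]
    @tokenizers = "spacy.zh.ChineseTokenizer"
    segmenter = "pkuseg"

    [initialize.tokenizer]
    pkuseg_model = "/path/to/pkuseg_model"
    pkuseg_user_dict = "default"
    """
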
@@ -8,7 +8,7 @@ from contextlib import contextmanager
 from copy import deepcopy
 from pathlib import Path
 import warnings
-from thinc.api import Model, get_current_ops, Config, require_gpu, Optimizer
+from thinc.api import Model, get_current_ops, Config, Optimizer
 import srsly
 import multiprocessing as mp
 from itertools import chain, cycle
@@ -18,8 +18,9 @@ from .tokens.underscore import Underscore
 from .vocab import Vocab, create_vocab
 from .pipe_analysis import validate_attrs, analyze_pipes, print_pipe_analysis
 from .training import Example, validate_examples
+from .training.initialize import init_vocab, init_tok2vec
 from .scorer import Scorer
-from .util import create_default_optimizer, registry, SimpleFrozenList
+from .util import registry, SimpleFrozenList
 from .util import SimpleFrozenDict, combine_score_weights, CONFIG_SECTION_ORDER
 from .lang.tokenizer_exceptions import URL_MATCH, BASE_EXCEPTIONS
 from .lang.punctuation import TOKENIZER_PREFIXES, TOKENIZER_SUFFIXES
@@ -27,7 +28,8 @@ from .lang.punctuation import TOKENIZER_INFIXES
 from .tokens import Doc
 from .tokenizer import Tokenizer
 from .errors import Errors, Warnings
-from .schemas import ConfigSchema, ConfigSchemaNlp
+from .schemas import ConfigSchema, ConfigSchemaNlp, ConfigSchemaInit
+from .schemas import ConfigSchemaPretrain, validate_init_settings
 from .git_info import GIT_VERSION
 from . import util
 from . import about
@@ -1066,7 +1068,7 @@ class Language:
         validate_examples(examples, "Language.update")
         if sgd is None:
             if self._optimizer is None:
-                self._optimizer = create_default_optimizer()
+                self._optimizer = self.create_optimizer()
             sgd = self._optimizer
         if component_cfg is None:
             component_cfg = {}
@@ -1124,7 +1126,7 @@ class Language:
         validate_examples(examples, "Language.rehearse")
         if sgd is None:
             if self._optimizer is None:
-                self._optimizer = create_default_optimizer()
+                self._optimizer = self.create_optimizer()
             sgd = self._optimizer
         pipes = list(self.pipeline)
         random.shuffle(pipes)
@@ -1154,61 +1156,73 @@ class Language:
         get_examples: Optional[Callable[[], Iterable[Example]]] = None,
         *,
         sgd: Optional[Optimizer] = None,
-        device: int = -1,
+    ) -> Optimizer:
+        warnings.warn(Warnings.W089, DeprecationWarning)
+        return self.initialize(get_examples, sgd=sgd)
+
+    def initialize(
+        self,
+        get_examples: Optional[Callable[[], Iterable[Example]]] = None,
+        *,
+        sgd: Optional[Optimizer] = None,
     ) -> Optimizer:
         """Initialize the pipe for training, using data examples if available.
 
         get_examples (Callable[[], Iterable[Example]]): Optional function that
             returns gold-standard Example objects.
-        sgd (thinc.api.Optimizer): Optional optimizer. Will be created with
-            create_optimizer if it doesn't exist.
+        sgd (Optional[Optimizer]): An optimizer to use for updates. If not
+            provided, will be created using the .create_optimizer() method.
         RETURNS (thinc.api.Optimizer): The optimizer.
 
-        DOCS: https://nightly.spacy.io/api/language#begin_training
+        DOCS: https://nightly.spacy.io/api/language#initialize
         """
         if get_examples is None:
             util.logger.debug(
-                "No 'get_examples' callback provided to 'Language.begin_training', creating dummy examples"
+                "No 'get_examples' callback provided to 'Language.initialize', creating dummy examples"
             )
             doc = Doc(self.vocab, words=["x", "y", "z"])
             get_examples = lambda: [Example.from_dict(doc, {})]
-        # Populate vocab
         if not hasattr(get_examples, "__call__"):
             err = Errors.E930.format(name="Language", obj=type(get_examples))
             raise ValueError(err)
-        valid_examples = False
-        for example in get_examples():
-            if not isinstance(example, Example):
-                err = Errors.E978.format(
-                    name="Language.begin_training", types=type(example)
-                )
-                raise ValueError(err)
-            else:
-                valid_examples = True
-            for word in [t.text for t in example.reference]:
-                _ = self.vocab[word]  # noqa: F841
-        if not valid_examples:
-            err = Errors.E930.format(name="Language", obj="empty list")
-            raise ValueError(err)
-        if device >= 0:  # TODO: do we need this here?
-            require_gpu(device)
+        # Make sure the config is interpolated so we can resolve subsections
+        config = self.config.interpolate()
+        # These are the settings provided in the [initialize] block in the config
+        I = registry.resolve(config["initialize"], schema=ConfigSchemaInit)
+        init_vocab(
+            self, data=I["vocab_data"], lookups=I["lookups"], vectors=I["vectors"]
+        )
+        pretrain_cfg = config.get("pretraining")
+        if pretrain_cfg:
+            P = registry.resolve(pretrain_cfg, schema=ConfigSchemaPretrain)
+            init_tok2vec(self, P, I)
         if self.vocab.vectors.data.shape[1] >= 1:
             ops = get_current_ops()
             self.vocab.vectors.data = ops.asarray(self.vocab.vectors.data)
-        if sgd is None:
-            sgd = create_default_optimizer()
-        self._optimizer = sgd
+        if hasattr(self.tokenizer, "initialize"):
+            tok_settings = validate_init_settings(
+                self.tokenizer.initialize,
+                I["tokenizer"],
+                section="tokenizer",
+                name="tokenizer",
+            )
+            self.tokenizer.initialize(get_examples, nlp=self, **tok_settings)
         for name, proc in self.pipeline:
-            if hasattr(proc, "begin_training"):
-                proc.begin_training(
-                    get_examples, pipeline=self.pipeline, sgd=self._optimizer
-                )
+            if hasattr(proc, "initialize"):
+                p_settings = I["components"].get(name, {})
+                p_settings = validate_init_settings(
+                    proc.initialize, p_settings, section="components", name=name
+                )
+                proc.initialize(get_examples, nlp=self, **p_settings)
         self._link_components()
+        self._optimizer = sgd
+        if sgd is not None:
+            self._optimizer = sgd
+        elif self._optimizer is None:
+            self._optimizer = self.create_optimizer()
         return self._optimizer
 
-    def resume_training(
-        self, *, sgd: Optional[Optimizer] = None, device: int = -1
-    ) -> Optimizer:
+    def resume_training(self, *, sgd: Optional[Optimizer] = None) -> Optimizer:
         """Continue training a pretrained model.
 
         Create and return an optimizer, and initialize "rehearsal" for any pipeline
@@ -1217,22 +1231,20 @@ class Language:
         rehearsal, collect samples of text you want the models to retain performance
         on, and call nlp.rehearse() with a batch of Example objects.
 
-        sgd (Optional[Optimizer]): An optimizer.
         RETURNS (Optimizer): The optimizer.
 
         DOCS: https://nightly.spacy.io/api/language#resume_training
         """
-        if device >= 0:  # TODO: do we need this here?
-            require_gpu(device)
-        ops = get_current_ops()
-        if self.vocab.vectors.data.shape[1] >= 1:
-            self.vocab.vectors.data = ops.asarray(self.vocab.vectors.data)
-        if sgd is None:
-            sgd = create_default_optimizer()
-        self._optimizer = sgd
+        ops = get_current_ops()
+        if self.vocab.vectors.data.shape[1] >= 1:
+            self.vocab.vectors.data = ops.asarray(self.vocab.vectors.data)
         for name, proc in self.pipeline:
             if hasattr(proc, "_rehearsal_model"):
                 proc._rehearsal_model = deepcopy(proc.model)
+        if sgd is not None:
+            self._optimizer = sgd
+        elif self._optimizer is None:
+            self._optimizer = self.create_optimizer()
         return self._optimizer
 
     def evaluate(
@@ -1294,6 +1306,11 @@ class Language:
             results["speed"] = n_words / (end_time - start_time)
         return results
 
+    def create_optimizer(self):
+        """Create an optimizer, usually using the [training.optimizer] config."""
+        subconfig = {"optimizer": self.config["training"]["optimizer"]}
+        return registry.resolve(subconfig)["optimizer"]
+
     @contextmanager
     def use_params(self, params: Optional[dict]):
         """Replace weights of models in the pipeline with those provided in the
@@ -1502,7 +1519,7 @@ class Language:
         ).merge(config)
         if "nlp" not in config:
             raise ValueError(Errors.E985.format(config=config))
-        config_lang = config["nlp"]["lang"]
+        config_lang = config["nlp"].get("lang")
         if config_lang is not None and config_lang != cls.lang:
             raise ValueError(
                 Errors.E958.format(

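[Editor's note] In the language.py hunks above, begin_training becomes a thin deprecated wrapper (warning W089) around the new initialize method, which interpolates the config, resolves the [initialize] block, and calls initialize(get_examples, nlp=...) on the tokenizer and each component; the optimizer now comes from create_optimizer and the [training.optimizer] config. A minimal usage sketch:

    import spacy

    nlp = spacy.blank("en")
    # components that need data can be passed a callable returning Example objects:
    # optimizer = nlp.initialize(lambda: examples)
    optimizer = nlp.initialize()  # replaces the deprecated nlp.begin_training()
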
spacy/ml/featureextractor.py (new file, 28 lines)

@@ -0,0 +1,28 @@
+from typing import List, Union, Callable, Tuple
+from thinc.types import Ints2d
+from thinc.api import Model, registry
+
+from ..tokens import Doc
+
+
+@registry.layers("spacy.FeatureExtractor.v1")
+def FeatureExtractor(columns: List[Union[int, str]]) -> Model[List[Doc], List[Ints2d]]:
+    return Model("extract_features", forward, attrs={"columns": columns})
+
+
+def forward(
+    model: Model[List[Doc], List[Ints2d]], docs, is_train: bool
+) -> Tuple[List[Ints2d], Callable]:
+    columns = model.attrs["columns"]
+    features: List[Ints2d] = []
+    for doc in docs:
+        if hasattr(doc, "to_array"):
+            attrs = doc.to_array(columns)
+        else:
+            attrs = doc.doc.to_array(columns)[doc.start : doc.end]
+        if attrs.ndim == 1:
+            attrs = attrs.reshape((attrs.shape[0], 1))
+        features.append(model.ops.asarray2i(attrs, dtype="uint64"))
+
+    backprop: Callable[[List[Ints2d]], List] = lambda d_features: []
+    return features, backprop

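[Editor's note] FeatureExtractor now lives in spaCy itself (registered as "spacy.FeatureExtractor.v1") instead of being imported from thinc.api. A short usage sketch of the layer added above:

    import spacy
    from spacy.attrs import LOWER, SHAPE
    from spacy.ml.featureextractor import FeatureExtractor

    nlp = spacy.blank("en")
    docs = [nlp("A short example"), nlp("Another one")]
    extractor = FeatureExtractor([LOWER, SHAPE])
    features, backprop = extractor(docs, is_train=False)
    # one uint64 array per Doc, one column per requested attribute
    print([f.shape for f in features])
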
@@ -3,12 +3,13 @@ from thinc.api import Model, reduce_mean, Linear, list2ragged, Logistic
 from thinc.api import chain, concatenate, clone, Dropout, ParametricAttention
 from thinc.api import SparseLinear, Softmax, softmax_activation, Maxout, reduce_sum
 from thinc.api import HashEmbed, with_array, with_cpu, uniqued
-from thinc.api import Relu, residual, expand_window, FeatureExtractor
+from thinc.api import Relu, residual, expand_window
 
 from ...attrs import ID, ORTH, PREFIX, SUFFIX, SHAPE, LOWER
 from ...util import registry
 from ..extract_ngrams import extract_ngrams
 from ..staticvectors import StaticVectors
+from ..featureextractor import FeatureExtractor
 
 
 @registry.architectures.register("spacy.TextCatCNN.v1")

@@ -1,16 +1,16 @@
-from typing import Optional, List
-from thinc.api import chain, clone, concatenate, with_array, with_padded
-from thinc.api import Model, noop, list2ragged, ragged2list
-from thinc.api import FeatureExtractor, HashEmbed
-from thinc.api import expand_window, residual, Maxout, Mish, PyTorchLSTM
+from typing import Optional, List, Union
 from thinc.types import Floats2d
+from thinc.api import chain, clone, concatenate, with_array, with_padded
+from thinc.api import Model, noop, list2ragged, ragged2list, HashEmbed
+from thinc.api import expand_window, residual, Maxout, Mish, PyTorchLSTM
 
 from ...tokens import Doc
 from ...util import registry
 from ...ml import _character_embed
 from ..staticvectors import StaticVectors
+from ..featureextractor import FeatureExtractor
 from ...pipeline.tok2vec import Tok2VecListener
-from ...attrs import ORTH, NORM, PREFIX, SUFFIX, SHAPE
+from ...attrs import ORTH, LOWER, PREFIX, SUFFIX, SHAPE, intify_attr
 
 
 @registry.architectures.register("spacy.Tok2VecListener.v1")
@@ -98,7 +98,7 @@ def MultiHashEmbed(
     attributes using hash embedding, concatenates the results, and passes it
     through a feed-forward subnetwork to build a mixed representations.
 
-    The features used are the NORM, PREFIX, SUFFIX and SHAPE, which can have
+    The features used are the LOWER, PREFIX, SUFFIX and SHAPE, which can have
     varying definitions depending on the Vocab of the Doc object passed in.
     Vectors from pretrained static vectors can also be incorporated into the
     concatenated representation.
@@ -115,7 +115,7 @@ def MultiHashEmbed(
     also_use_static_vectors (bool): Whether to also use static word vectors.
         Requires a vectors table to be loaded in the Doc objects' vocab.
     """
-    cols = [NORM, PREFIX, SUFFIX, SHAPE, ORTH]
+    cols = [LOWER, PREFIX, SUFFIX, SHAPE, ORTH]
     seed = 7
 
     def make_hash_embed(feature):
@@ -123,7 +123,7 @@ def MultiHashEmbed(
         seed += 1
         return HashEmbed(
             width,
-            rows if feature == NORM else rows // 2,
+            rows if feature == LOWER else rows // 2,
             column=cols.index(feature),
             seed=seed,
             dropout=0.0,
@@ -131,13 +131,13 @@ def MultiHashEmbed(
 
     if also_embed_subwords:
         embeddings = [
-            make_hash_embed(NORM),
+            make_hash_embed(LOWER),
             make_hash_embed(PREFIX),
             make_hash_embed(SUFFIX),
             make_hash_embed(SHAPE),
         ]
     else:
-        embeddings = [make_hash_embed(NORM)]
+        embeddings = [make_hash_embed(LOWER)]
     concat_size = width * (len(embeddings) + also_use_static_vectors)
     if also_use_static_vectors:
         model = chain(
@@ -165,7 +165,8 @@ def MultiHashEmbed(
 
 @registry.architectures.register("spacy.CharacterEmbed.v1")
 def CharacterEmbed(
-    width: int, rows: int, nM: int, nC: int, also_use_static_vectors: bool
+    width: int, rows: int, nM: int, nC: int, also_use_static_vectors: bool,
+    feature: Union[int, str]="LOWER"
 ) -> Model[List[Doc], List[Floats2d]]:
     """Construct an embedded representation based on character embeddings, using
     a feed-forward network. A fixed number of UTF-8 byte characters are used for
@@ -179,12 +180,13 @@ def CharacterEmbed(
     of being in an arbitrary position depending on the word length.
 
     The characters are embedded in a embedding table with a given number of rows,
-    and the vectors concatenated. A hash-embedded vector of the NORM of the word is
+    and the vectors concatenated. A hash-embedded vector of the LOWER of the word is
     also concatenated on, and the result is then passed through a feed-forward
     network to construct a single vector to represent the information.
 
-    width (int): The width of the output vector and the NORM hash embedding.
-    rows (int): The number of rows in the NORM hash embedding table.
+    feature (int or str): An attribute to embed, to concatenate with the characters.
+    width (int): The width of the output vector and the feature embedding.
+    rows (int): The number of rows in the LOWER hash embedding table.
     nM (int): The dimensionality of the character embeddings. Recommended values
         are between 16 and 64.
     nC (int): The number of UTF-8 bytes to embed per word. Recommended values
@@ -193,12 +195,15 @@ def CharacterEmbed(
     also_use_static_vectors (bool): Whether to also use static word vectors.
         Requires a vectors table to be loaded in the Doc objects' vocab.
     """
+    feature = intify_attr(feature)
+    if feature is None:
+        raise ValueError("Invalid feature: Must be a token attribute.")
     if also_use_static_vectors:
         model = chain(
             concatenate(
                 chain(_character_embed.CharacterEmbed(nM=nM, nC=nC), list2ragged()),
                 chain(
-                    FeatureExtractor([NORM]),
+                    FeatureExtractor([feature]),
                     list2ragged(),
                     with_array(HashEmbed(nO=width, nV=rows, column=0, seed=5)),
                 ),
@@ -214,7 +219,7 @@ def CharacterEmbed(
             concatenate(
                 chain(_character_embed.CharacterEmbed(nM=nM, nC=nC), list2ragged()),
                 chain(
-                    FeatureExtractor([NORM]),
+                    FeatureExtractor([feature]),
                     list2ragged(),
                     with_array(HashEmbed(nO=width, nV=rows, column=0, seed=5)),
                 ),

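[Editor's note] MultiHashEmbed switches its primary feature from NORM to LOWER, and CharacterEmbed gains a feature argument (default "LOWER", normalized via intify_attr). A hedged config sketch for the new CharacterEmbed signature; the variable name and the sizes are illustrative only:

    CHARACTER_EMBED_CONFIG = """
    [model]
    @architectures = "spacy.CharacterEmbed.v1"
    width = 128
    rows = 7000
    nM = 64
    nC = 8
    also_use_static_vectors = false
    feature = "LOWER"
    """
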
@@ -78,7 +78,7 @@ def get_attr_info(nlp: "Language", attr: str) -> Dict[str, List[str]]:
 
 
 def analyze_pipes(
-    nlp: "Language", *, keys: List[str] = DEFAULT_KEYS,
+    nlp: "Language", *, keys: List[str] = DEFAULT_KEYS
 ) -> Dict[str, Union[List[str], Dict[str, List[str]]]]:
     """Print a formatted summary for the current nlp object's pipeline. Shows
     a table with the pipeline components and why they assign and require, as

@@ -82,8 +82,7 @@ class AttributeRuler(Pipe):
         matches = self.matcher(doc, allow_missing=True)
         # Sort by the attribute ID, so that later rules have precendence
         matches = [
-            (int(self.vocab.strings[m_id]), m_id, s, e)
-            for m_id, s, e in matches
+            (int(self.vocab.strings[m_id]), m_id, s, e) for m_id, s, e in matches
         ]
         matches.sort()
         for attr_id, match_id, start, end in matches:
@@ -93,7 +92,7 @@ class AttributeRuler(Pipe):
             try:
                 # The index can be negative, which makes it annoying to do
                 # the boundscheck. Let Span do it instead.
-                token = span[index]
+                token = span[index]  # noqa: F841
             except IndexError:
                 # The original exception is just our conditional logic, so we
                 # raise from.
@@ -103,7 +102,7 @@ class AttributeRuler(Pipe):
                         span=[t.text for t in span],
                         index=index,
                     )
                 ) from None
             set_token_attrs(span[index], attrs)
         return doc
 

@@ -126,13 +126,13 @@ cdef class DependencyParser(Parser):
     def add_multitask_objective(self, mt_component):
         self._multitasks.append(mt_component)
 
-    def init_multitask_objectives(self, get_examples, pipeline, sgd=None, **cfg):
+    def init_multitask_objectives(self, get_examples, nlp=None, **cfg):
         # TODO: transfer self.model.get_ref("tok2vec") to the multitask's model ?
         for labeller in self._multitasks:
             labeller.model.set_dim("nO", len(self.labels))
             if labeller.model.has_ref("output_layer"):
                 labeller.model.get_ref("output_layer").set_dim("nO", len(self.labels))
-            labeller.begin_training(get_examples, pipeline=pipeline, sgd=sgd)
+            labeller.initialize(get_examples, nlp=nlp)
 
     @property
     def labels(self):

@@ -1,5 +1,5 @@
 from itertools import islice
-from typing import Optional, Iterable, Callable, Dict, Iterator, Union, List, Tuple
+from typing import Optional, Iterable, Callable, Dict, Iterator, Union, List
 from pathlib import Path
 import srsly
 import random
@@ -140,26 +140,20 @@ class EntityLinker(Pipe):
         if len(self.kb) == 0:
             raise ValueError(Errors.E139.format(name=self.name))

-    def begin_training(
+    def initialize(
         self,
         get_examples: Callable[[], Iterable[Example]],
         *,
-        pipeline: Optional[List[Tuple[str, Callable[[Doc], Doc]]]] = None,
-        sgd: Optional[Optimizer] = None,
-    ) -> Optimizer:
+        nlp: Optional[Language] = None,
+    ):
         """Initialize the pipe for training, using a representative set
         of data examples.

         get_examples (Callable[[], Iterable[Example]]): Function that
             returns a representative sample of gold-standard Example objects.
-        pipeline (List[Tuple[str, Callable]]): Optional list of pipeline
-            components that this component is part of. Corresponds to
-            nlp.pipeline.
-        sgd (thinc.api.Optimizer): Optional optimizer. Will be created with
-            create_optimizer if it doesn't exist.
-        RETURNS (thinc.api.Optimizer): The optimizer.
+        nlp (Language): The current nlp object the component is part of.

-        DOCS: https://nightly.spacy.io/api/entitylinker#begin_training
+        DOCS: https://nightly.spacy.io/api/entitylinker#initialize
         """
         self._ensure_examples(get_examples)
         self._require_kb()
@@ -174,9 +168,6 @@ class EntityLinker(Pipe):
         self.model.initialize(
             X=doc_sample, Y=self.model.ops.asarray(vector_sample, dtype="float32")
         )
-        if sgd is None:
-            sgd = self.create_optimizer()
-        return sgd

     def update(
         self,
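The hunks above are part of a broader rename: `begin_training(get_examples, pipeline=..., sgd=...)` becomes `initialize(get_examples, nlp=...)`, and components no longer create or return an optimizer. A minimal sketch of the new calling convention, assuming a spaCy 3.x-style API; the blank pipeline, the "ner" component and the toy annotation below are illustrative, not part of this diff:

import spacy
from spacy.training import Example

nlp = spacy.blank("en")
ner = nlp.add_pipe("ner")
ner.add_label("ANIMAL")

# Toy example data, purely illustrative
examples = [
    Example.from_dict(nlp.make_doc("I saw a cat"), {"entities": [(8, 11, "ANIMAL")]})
]

# nlp.initialize() calls each component's initialize(get_examples, nlp=nlp)
# under the hood and returns the optimizer, which is now handled by the
# training loop rather than created and returned by each component.
optimizer = nlp.initialize(lambda: examples)
losses = {}
nlp.update(examples, sgd=optimizer, losses=losses)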
@@ -1,26 +1,25 @@
-from typing import Optional, List, Dict, Any
+from typing import Optional, List, Dict, Any, Callable, Iterable, Iterator, Union
+from typing import Tuple
 from thinc.api import Model
+from pathlib import Path

 from .pipe import Pipe
 from ..errors import Errors
 from ..language import Language
+from ..training import Example
 from ..lookups import Lookups, load_lookups
 from ..scorer import Scorer
 from ..tokens import Doc, Token
 from ..vocab import Vocab
 from ..training import validate_examples
+from ..util import logger, SimpleFrozenList
 from .. import util


 @Language.factory(
     "lemmatizer",
     assigns=["token.lemma"],
-    default_config={
-        "model": None,
-        "mode": "lookup",
-        "lookups": None,
-        "overwrite": False,
-    },
+    default_config={"model": None, "mode": "lookup", "overwrite": False},
     default_score_weights={"lemma_acc": 1.0},
 )
 def make_lemmatizer(
@@ -28,13 +27,9 @@ def make_lemmatizer(
     model: Optional[Model],
     name: str,
     mode: str,
-    lookups: Optional[Lookups],
     overwrite: bool = False,
 ):
-    lookups = Lemmatizer.load_lookups(nlp.lang, mode, lookups)
-    return Lemmatizer(
-        nlp.vocab, model, name, mode=mode, lookups=lookups, overwrite=overwrite
-    )
+    return Lemmatizer(nlp.vocab, model, name, mode=mode, overwrite=overwrite)


 class Lemmatizer(Pipe):
@@ -46,59 +41,19 @@ class Lemmatizer(Pipe):
     """

     @classmethod
-    def get_lookups_config(cls, mode: str) -> Dict:
+    def get_lookups_config(cls, mode: str) -> Tuple[List[str], List[str]]:
         """Returns the lookups configuration settings for a given mode for use
         in Lemmatizer.load_lookups.

         mode (str): The lemmatizer mode.
-        RETURNS (dict): The lookups configuration settings for this mode.
-        DOCS: https://nightly.spacy.io/api/lemmatizer#get_lookups_config
+        RETURNS (Tuple[List[str], List[str]]): The required and optional
+            lookup tables for this mode.
         """
         if mode == "lookup":
-            return {
-                "required_tables": ["lemma_lookup"],
-            }
+            return (["lemma_lookup"], [])
         elif mode == "rule":
-            return {
-                "required_tables": ["lemma_rules"],
-                "optional_tables": ["lemma_exc", "lemma_index"],
-            }
-        return {}
-
-    @classmethod
-    def load_lookups(cls, lang: str, mode: str, lookups: Optional[Lookups],) -> Lookups:
-        """Load and validate lookups tables. If the provided lookups is None,
-        load the default lookups tables according to the language and mode
-        settings. Confirm that all required tables for the language and mode
-        are present.
-
-        lang (str): The language code.
-        mode (str): The lemmatizer mode.
-        lookups (Lookups): The provided lookups, may be None if the default
-            lookups should be loaded.
-        RETURNS (Lookups): The Lookups object.
-
-        DOCS: https://nightly.spacy.io/api/lemmatizer#get_lookups_config
-        """
-        config = cls.get_lookups_config(mode)
-        required_tables = config.get("required_tables", [])
-        optional_tables = config.get("optional_tables", [])
-        if lookups is None:
-            lookups = load_lookups(lang=lang, tables=required_tables)
-            optional_lookups = load_lookups(
-                lang=lang, tables=optional_tables, strict=False
-            )
-            for table in optional_lookups.tables:
-                lookups.set_table(table, optional_lookups.get_table(table))
-        for table in required_tables:
-            if table not in lookups:
-                raise ValueError(
-                    Errors.E1004.format(
-                        mode=mode, tables=required_tables, found=lookups.tables
-                    )
-                )
-        return lookups
+            return (["lemma_rules"], ["lemma_exc", "lemma_index"])
+        return ([], [])

     def __init__(
         self,
@@ -107,7 +62,6 @@ class Lemmatizer(Pipe):
         name: str = "lemmatizer",
         *,
         mode: str = "lookup",
-        lookups: Optional[Lookups] = None,
         overwrite: bool = False,
     ) -> None:
         """Initialize a Lemmatizer.
@@ -116,9 +70,6 @@ class Lemmatizer(Pipe):
         model (Model): A model (not yet implemented).
         name (str): The component name. Defaults to "lemmatizer".
         mode (str): The lemmatizer mode: "lookup", "rule". Defaults to "lookup".
-        lookups (Lookups): The lookups object containing the (optional) tables
-            such as "lemma_rules", "lemma_index", "lemma_exc" and
-            "lemma_lookup". Defaults to None
         overwrite (bool): Whether to overwrite existing lemmas. Defaults to
             `False`.

@@ -128,8 +79,9 @@ class Lemmatizer(Pipe):
         self.model = model
         self.name = name
         self._mode = mode
-        self.lookups = lookups if lookups is not None else Lookups()
+        self.lookups = Lookups()
         self.overwrite = overwrite
+        self._validated = False
         if self.mode == "lookup":
             self.lemmatize = self.lookup_lemmatize
         elif self.mode == "rule":
@@ -153,12 +105,56 @@ class Lemmatizer(Pipe):

         DOCS: https://nightly.spacy.io/api/lemmatizer#call
         """
+        if not self._validated:
+            self._validate_tables(Errors.E1004)
         for token in doc:
             if self.overwrite or token.lemma == 0:
                 token.lemma_ = self.lemmatize(token)[0]
         return doc

-    def pipe(self, stream, *, batch_size=128):
+    def initialize(
+        self,
+        get_examples: Optional[Callable[[], Iterable[Example]]] = None,
+        *,
+        nlp: Optional[Language] = None,
+        lookups: Optional[Lookups] = None,
+    ):
+        """Initialize the lemmatizer and load in data.
+
+        get_examples (Callable[[], Iterable[Example]]): Function that
+            returns a representative sample of gold-standard Example objects.
+        nlp (Language): The current nlp object the component is part of.
+        lookups (Lookups): The lookups object containing the (optional) tables
+            such as "lemma_rules", "lemma_index", "lemma_exc" and
+            "lemma_lookup". Defaults to None.
+        """
+        required_tables, optional_tables = self.get_lookups_config(self.mode)
+        if lookups is None:
+            logger.debug("Lemmatizer: loading tables from spacy-lookups-data")
+            lookups = load_lookups(lang=self.vocab.lang, tables=required_tables)
+            optional_lookups = load_lookups(
+                lang=self.vocab.lang, tables=optional_tables, strict=False
+            )
+            for table in optional_lookups.tables:
+                lookups.set_table(table, optional_lookups.get_table(table))
+        self.lookups = lookups
+        self._validate_tables(Errors.E1004)
+
+    def _validate_tables(self, error_message: str = Errors.E912) -> None:
+        """Check that the lookups are correct for the current mode."""
+        required_tables, optional_tables = self.get_lookups_config(self.mode)
+        for table in required_tables:
+            if table not in self.lookups:
+                raise ValueError(
+                    error_message.format(
+                        mode=self.mode,
+                        tables=required_tables,
+                        found=self.lookups.tables,
+                    )
+                )
+        self._validated = True
+
+    def pipe(self, stream: Iterable[Doc], *, batch_size: int = 128) -> Iterator[Doc]:
         """Apply the pipe to a stream of documents. This usually happens under
         the hood when the nlp object is called on a text and all components are
         applied to the Doc.
@@ -263,7 +259,7 @@ class Lemmatizer(Pipe):
         """
         return False

-    def score(self, examples, **kwargs) -> Dict[str, Any]:
+    def score(self, examples: Iterable[Example], **kwargs) -> Dict[str, Any]:
         """Score a batch of examples.

         examples (Iterable[Example]): The examples to score.
@@ -274,58 +270,66 @@ class Lemmatizer(Pipe):
         validate_examples(examples, "Lemmatizer.score")
         return Scorer.score_token_attr(examples, "lemma", **kwargs)

-    def to_disk(self, path, *, exclude=tuple()):
-        """Save the current state to a directory.
+    def to_disk(
+        self, path: Union[str, Path], *, exclude: Iterable[str] = SimpleFrozenList()
+    ):
+        """Serialize the pipe to disk.

-        path (unicode or Path): A path to a directory, which will be created if
-            it doesn't exist.
-        exclude (list): String names of serialization fields to exclude.
+        path (str / Path): Path to a directory.
+        exclude (Iterable[str]): String names of serialization fields to exclude.

-        DOCS: https://nightly.spacy.io/api/vocab#to_disk
+        DOCS: https://nightly.spacy.io/api/lemmatizer#to_disk
         """
         serialize = {}
         serialize["vocab"] = lambda p: self.vocab.to_disk(p)
         serialize["lookups"] = lambda p: self.lookups.to_disk(p)
         util.to_disk(path, serialize, exclude)

-    def from_disk(self, path, *, exclude=tuple()):
-        """Loads state from a directory. Modifies the object in place and
-        returns it.
+    def from_disk(
+        self, path: Union[str, Path], *, exclude: Iterable[str] = SimpleFrozenList()
+    ) -> "Lemmatizer":
+        """Load the pipe from disk. Modifies the object in place and returns it.

-        path (unicode or Path): A path to a directory.
-        exclude (list): String names of serialization fields to exclude.
-        RETURNS (Vocab): The modified `Vocab` object.
+        path (str / Path): Path to a directory.
+        exclude (Iterable[str]): String names of serialization fields to exclude.
+        RETURNS (Lemmatizer): The modified Lemmatizer object.

-        DOCS: https://nightly.spacy.io/api/vocab#to_disk
+        DOCS: https://nightly.spacy.io/api/lemmatizer#from_disk
         """
         deserialize = {}
         deserialize["vocab"] = lambda p: self.vocab.from_disk(p)
         deserialize["lookups"] = lambda p: self.lookups.from_disk(p)
         util.from_disk(path, deserialize, exclude)
+        self._validate_tables()
+        return self

-    def to_bytes(self, *, exclude=tuple()) -> bytes:
-        """Serialize the current state to a binary string.
+    def to_bytes(self, *, exclude: Iterable[str] = SimpleFrozenList()) -> bytes:
+        """Serialize the pipe to a bytestring.

-        exclude (list): String names of serialization fields to exclude.
-        RETURNS (bytes): The serialized form of the `Vocab` object.
+        exclude (Iterable[str]): String names of serialization fields to exclude.
+        RETURNS (bytes): The serialized object.

-        DOCS: https://nightly.spacy.io/api/vocab#to_bytes
+        DOCS: https://nightly.spacy.io/api/lemmatizer#to_bytes
         """
         serialize = {}
         serialize["vocab"] = self.vocab.to_bytes
         serialize["lookups"] = self.lookups.to_bytes
         return util.to_bytes(serialize, exclude)

-    def from_bytes(self, bytes_data: bytes, *, exclude=tuple()):
-        """Load state from a binary string.
+    def from_bytes(
+        self, bytes_data: bytes, *, exclude: Iterable[str] = SimpleFrozenList()
+    ) -> "Lemmatizer":
+        """Load the pipe from a bytestring.

-        bytes_data (bytes): The data to load from.
-        exclude (list): String names of serialization fields to exclude.
-        RETURNS (Vocab): The `Vocab` object.
+        bytes_data (bytes): The serialized pipe.
+        exclude (Iterable[str]): String names of serialization fields to exclude.
+        RETURNS (Lemmatizer): The loaded Lemmatizer.

-        DOCS: https://nightly.spacy.io/api/vocab#from_bytes
+        DOCS: https://nightly.spacy.io/api/lemmatizer#from_bytes
         """
         deserialize = {}
         deserialize["vocab"] = lambda b: self.vocab.from_bytes(b)
         deserialize["lookups"] = lambda b: self.lookups.from_bytes(b)
         util.from_bytes(bytes_data, deserialize, exclude)
+        self._validate_tables()
+        return self
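The lookups handling above moves table loading out of `__init__` and the factory into `Lemmatizer.initialize`, where the tables are validated against the mode. A small sketch of supplying a table by hand instead of relying on spacy-lookups-data, assuming a spaCy 3.x-style API; the toy table contents are illustrative:

import spacy
from spacy.lookups import Lookups

nlp = spacy.blank("en")
lemmatizer = nlp.add_pipe("lemmatizer", config={"mode": "lookup"})

# Provide the required "lemma_lookup" table directly instead of loading it
# from spacy-lookups-data.
lookups = Lookups()
lookups.add_table("lemma_lookup", {"cats": "cat", "ran": "run"})
lemmatizer.initialize(lookups=lookups)

doc = nlp("cats ran")
print([token.lemma_ for token in doc])  # expected: ['cat', 'run'] given the toy table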
@@ -1,5 +1,5 @@
 # cython: infer_types=True, profile=True, binding=True
-from typing import Optional
+from typing import Optional, Union, Dict
 import srsly
 from thinc.api import SequenceCategoricalCrossentropy, Model, Config
 from itertools import islice
@@ -101,6 +101,11 @@ class Morphologizer(Tagger):
         """RETURNS (Tuple[str]): The labels currently added to the component."""
         return tuple(self.cfg["labels_morph"].keys())

+    @property
+    def label_data(self) -> Dict[str, Dict[str, Union[str, float, int, None]]]:
+        """A dictionary with all labels data."""
+        return {"morph": self.cfg["labels_morph"], "pos": self.cfg["labels_pos"]}
+
     def add_label(self, label):
         """Add a new label to the pipe.

@@ -129,27 +134,22 @@ class Morphologizer(Tagger):
         self.cfg["labels_pos"][norm_label] = POS_IDS[pos]
         return 1

-    def begin_training(self, get_examples, *, pipeline=None, sgd=None):
+    def initialize(self, get_examples, *, nlp=None):
         """Initialize the pipe for training, using a representative set
         of data examples.

         get_examples (Callable[[], Iterable[Example]]): Function that
             returns a representative sample of gold-standard Example objects.
-        pipeline (List[Tuple[str, Callable]]): Optional list of pipeline
-            components that this component is part of. Corresponds to
-            nlp.pipeline.
-        sgd (thinc.api.Optimizer): Optional optimizer. Will be created with
-            create_optimizer if it doesn't exist.
-        RETURNS (thinc.api.Optimizer): The optimizer.
+        nlp (Language): The current nlp object the component is part of.

-        DOCS: https://nightly.spacy.io/api/morphologizer#begin_training
+        DOCS: https://nightly.spacy.io/api/morphologizer#initialize
         """
         self._ensure_examples(get_examples)
         # First, fetch all labels from the data
         for example in get_examples():
             for i, token in enumerate(example.reference):
                 pos = token.pos_
-                morph = token.morph_
+                morph = str(token.morph)
                 # create and add the combined morph+POS label
                 morph_dict = Morphology.feats_to_dict(morph)
                 if pos:
@@ -167,7 +167,7 @@ class Morphologizer(Tagger):
             gold_array = []
             for i, token in enumerate(example.reference):
                 pos = token.pos_
-                morph = token.morph_
+                morph = str(token.morph)
                 morph_dict = Morphology.feats_to_dict(morph)
                 if pos:
                     morph_dict[self.POS_FEAT] = pos
@@ -178,9 +178,6 @@ class Morphologizer(Tagger):
         assert len(doc_sample) > 0, Errors.E923.format(name=self.name)
         assert len(label_sample) > 0, Errors.E923.format(name=self.name)
         self.model.initialize(X=doc_sample, Y=label_sample)
-        if sgd is None:
-            sgd = self.create_optimizer()
-        return sgd

     def set_annotations(self, docs, batch_tag_ids):
         """Modify a batch of documents, using pre-computed scores.
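Note the `token.morph_` to `str(token.morph)` change above: the FEATS string is now obtained by stringifying `token.morph`. A tiny sketch, assuming a spaCy 3.x build where `Token.set_morph` is available (the example sentence and features are made up):

import spacy

nlp = spacy.blank("en")
doc = nlp("cats")
doc[0].set_morph("Number=Plur")

# Old: doc[0].morph_      New: str(doc[0].morph)
print(str(doc[0].morph))           # 'Number=Plur'
print(doc[0].morph.get("Number"))  # ['Plur']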
@@ -81,7 +81,7 @@ class MultitaskObjective(Tagger):
     def set_annotations(self, docs, dep_ids):
         pass

-    def begin_training(self, get_examples, pipeline=None, sgd=None):
+    def initialize(self, get_examples, nlp=None):
         if not hasattr(get_examples, "__call__"):
             err = Errors.E930.format(name="MultitaskObjective", obj=type(get_examples))
             raise ValueError(err)
@@ -91,9 +91,6 @@ class MultitaskObjective(Tagger):
             if label is not None and label not in self.labels:
                 self.labels[label] = len(self.labels)
         self.model.initialize()  # TODO: fix initialization by defining X and Y
-        if sgd is None:
-            sgd = self.create_optimizer()
-        return sgd

     def predict(self, docs):
         tokvecs = self.model.get_ref("tok2vec")(docs)
@@ -177,13 +174,10 @@ class ClozeMultitask(Pipe):
     def set_annotations(self, docs, dep_ids):
         pass

-    def begin_training(self, get_examples, pipeline=None, sgd=None):
+    def initialize(self, get_examples, nlp=None):
         self.model.initialize()  # TODO: fix initialization by defining X and Y
         X = self.model.ops.alloc((5, self.model.get_ref("tok2vec").get_dim("nO")))
-        self.model.output_layer.begin_training(X)
-        if sgd is None:
-            sgd = self.create_optimizer()
-        return sgd
+        self.model.output_layer.initialize(X)

     def predict(self, docs):
         tokvecs = self.model.get_ref("tok2vec")(docs)
@@ -96,14 +96,14 @@ cdef class EntityRecognizer(Parser):
         """Register another component as a multi-task objective. Experimental."""
         self._multitasks.append(mt_component)

-    def init_multitask_objectives(self, get_examples, pipeline, sgd=None, **cfg):
+    def init_multitask_objectives(self, get_examples, nlp=None, **cfg):
         """Setup multi-task objective components. Experimental and internal."""
         # TODO: transfer self.model.get_ref("tok2vec") to the multitask's model ?
         for labeller in self._multitasks:
             labeller.model.set_dim("nO", len(self.labels))
             if labeller.model.has_ref("output_layer"):
                 labeller.model.get_ref("output_layer").set_dim("nO", len(self.labels))
-            labeller.begin_training(get_examples, pipeline=pipeline)
+            labeller.initialize(get_examples, nlp=nlp)

     @property
     def labels(self):
@@ -1,4 +1,5 @@
 # cython: infer_types=True, profile=True
+from typing import Optional, Tuple
 import srsly
 from thinc.api import set_dropout_rate, Model

@@ -32,6 +33,17 @@ cdef class Pipe:
         self.name = name
         self.cfg = dict(cfg)

+    @property
+    def labels(self) -> Optional[Tuple[str]]:
+        return []
+
+    @property
+    def label_data(self):
+        """Optional JSON-serializable data that would be sufficient to recreate
+        the label set if provided to the `pipe.initialize()` method.
+        """
+        return None
+
     def __call__(self, Doc doc):
         """Apply the pipe to one document. The document is modified in place,
         and returned. This usually happens under the hood when the nlp object
@@ -183,7 +195,7 @@ cdef class Pipe:
         """
         return util.create_default_optimizer()

-    def begin_training(self, get_examples, *, pipeline=None, sgd=None):
+    def initialize(self, get_examples, *, nlp=None):
         """Initialize the pipe for training, using data examples if available.
         This method needs to be implemented by each Pipe component,
         ensuring the internal model (if available) is initialized properly
@@ -191,16 +203,11 @@ cdef class Pipe:

         get_examples (Callable[[], Iterable[Example]]): Function that
             returns a representative sample of gold-standard Example objects.
-        pipeline (List[Tuple[str, Callable]]): Optional list of pipeline
-            components that this component is part of. Corresponds to
-            nlp.pipeline.
-        sgd (thinc.api.Optimizer): Optional optimizer. Will be created with
-            create_optimizer if it doesn't exist.
-        RETURNS (thinc.api.Optimizer): The optimizer.
+        nlp (Language): The current nlp object the component is part of.

-        DOCS: https://nightly.spacy.io/api/pipe#begin_training
+        DOCS: https://nightly.spacy.io/api/pipe#initialize
         """
-        raise NotImplementedError(Errors.E931.format(method="add_label", name=self.name))
+        pass

     def _ensure_examples(self, get_examples):
         if get_examples is None or not hasattr(get_examples, "__call__"):
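The base-class change above spells out the contract every component follows: `initialize(get_examples, *, nlp=None)` for setup, plus optional `labels`/`label_data` for the label set. A hedged sketch of a user-defined component following that contract; the component name, class and attributes here are hypothetical and not part of this diff:

from spacy.language import Language

class AnimalFlagger:
    def __init__(self, name):
        self.name = name
        self.animals = set()

    @property
    def label_data(self):
        # JSON-serializable data sufficient to recreate the "label" set
        return sorted(self.animals)

    def initialize(self, get_examples=None, *, nlp=None, labels=None):
        # Accept labels directly, or derive them from the example data
        if labels is not None:
            self.animals.update(labels)
        elif get_examples is not None:
            for eg in get_examples():
                self.animals.update(ent.text for ent in eg.reference.ents)

    def __call__(self, doc):
        doc.user_data["has_animal"] = any(t.text in self.animals for t in doc)
        return doc

@Language.factory("animal_flagger")
def create_animal_flagger(nlp, name):
    return AnimalFlagger(name)

# usage: nlp.add_pipe("animal_flagger"); nlp.initialize(...) will call
# the component's initialize() with the nlp object.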
@@ -58,7 +58,7 @@ class Sentencizer(Pipe):
         else:
             self.punct_chars = set(self.default_punct_chars)

-    def begin_training(self, get_examples, pipeline=None, sgd=None):
+    def initialize(self, get_examples, nlp=None):
         pass

     def __call__(self, doc):
@@ -71,6 +71,10 @@ class SentenceRecognizer(Tagger):
         # are 0
         return tuple(["I", "S"])

+    @property
+    def label_data(self):
+        return self.labels
+
     def set_annotations(self, docs, batch_tag_ids):
         """Modify a batch of documents, using pre-computed scores.

@@ -124,20 +128,15 @@ class SentenceRecognizer(Tagger):
             raise ValueError("nan value when computing loss")
         return float(loss), d_scores

-    def begin_training(self, get_examples, *, pipeline=None, sgd=None):
+    def initialize(self, get_examples, *, nlp=None):
         """Initialize the pipe for training, using a representative set
         of data examples.

         get_examples (Callable[[], Iterable[Example]]): Function that
             returns a representative sample of gold-standard Example objects.
-        pipeline (List[Tuple[str, Callable]]): Optional list of pipeline
-            components that this component is part of. Corresponds to
-            nlp.pipeline.
-        sgd (thinc.api.Optimizer): Optional optimizer. Will be created with
-            create_optimizer if it doesn't exist.
-        RETURNS (thinc.api.Optimizer): The optimizer.
+        nlp (Language): The current nlp object the component is part of.

-        DOCS: https://nightly.spacy.io/api/sentencerecognizer#begin_training
+        DOCS: https://nightly.spacy.io/api/sentencerecognizer#initialize
         """
         self._ensure_examples(get_examples)
         doc_sample = []
@@ -151,9 +150,6 @@ class SentenceRecognizer(Tagger):
         assert len(doc_sample) > 0, Errors.E923.format(name=self.name)
         assert len(label_sample) > 0, Errors.E923.format(name=self.name)
         self.model.initialize(X=doc_sample, Y=label_sample)
-        if sgd is None:
-            sgd = self.create_optimizer()
-        return sgd

     def add_label(self, label, values=None):
         raise NotImplementedError
@@ -90,6 +90,11 @@ class Tagger(Pipe):
         """
         return tuple(self.cfg["labels"])

+    @property
+    def label_data(self):
+        """Data about the labels currently added to the component."""
+        return tuple(self.cfg["labels"])
+
     def __call__(self, doc):
         """Apply the pipe to a Doc.

@@ -256,31 +261,33 @@ class Tagger(Pipe):
             raise ValueError("nan value when computing loss")
         return float(loss), d_scores

-    def begin_training(self, get_examples, *, pipeline=None, sgd=None):
+    def initialize(self, get_examples, *, nlp=None, labels=None):
         """Initialize the pipe for training, using a representative set
         of data examples.

         get_examples (Callable[[], Iterable[Example]]): Function that
             returns a representative sample of gold-standard Example objects..
-        pipeline (List[Tuple[str, Callable]]): Optional list of pipeline
-            components that this component is part of. Corresponds to
-            nlp.pipeline.
-        sgd (thinc.api.Optimizer): Optional optimizer. Will be created with
-            create_optimizer if it doesn't exist.
-        RETURNS (thinc.api.Optimizer): The optimizer.
+        nlp (Language): The current nlp object the component is part of.
+        labels: The labels to add to the component, typically generated by the
+            `init labels` command. If no labels are provided, the get_examples
+            callback is used to extract the labels from the data.

-        DOCS: https://nightly.spacy.io/api/tagger#begin_training
+        DOCS: https://nightly.spacy.io/api/tagger#initialize
         """
         self._ensure_examples(get_examples)
+        if labels is not None:
+            for tag in labels:
+                self.add_label(tag)
+        else:
+            tags = set()
+            for example in get_examples():
+                for token in example.y:
+                    if token.tag_:
+                        tags.add(token.tag_)
+            for tag in sorted(tags):
+                self.add_label(tag)
         doc_sample = []
         label_sample = []
-        tags = set()
-        for example in get_examples():
-            for token in example.y:
-                if token.tag_:
-                    tags.add(token.tag_)
-        for tag in sorted(tags):
-            self.add_label(tag)
         for example in islice(get_examples(), 10):
             doc_sample.append(example.x)
             gold_tags = example.get_aligned("TAG", as_string=True)
@@ -289,9 +296,6 @@ class Tagger(Pipe):
         assert len(doc_sample) > 0, Errors.E923.format(name=self.name)
         assert len(label_sample) > 0, Errors.E923.format(name=self.name)
         self.model.initialize(X=doc_sample, Y=label_sample)
-        if sgd is None:
-            sgd = self.create_optimizer()
-        return sgd

     def add_label(self, label):
         """Add a new label to the pipe.
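The new `labels` argument lets the tag set be supplied up front (for example from another pipeline's `label_data`) instead of being re-extracted from the training corpus. A small sketch, assuming a spaCy 3.x-style API; the sentence and tag values are illustrative:

import spacy
from spacy.training import Example

nlp = spacy.blank("en")
tagger = nlp.add_pipe("tagger")

examples = [
    Example.from_dict(nlp.make_doc("I like cats"), {"tags": ["PRP", "VBP", "NNS"]})
]

# Pass the labels explicitly; get_examples is then only used to sample docs
# for shape inference, not to collect the tag set.
tagger.initialize(lambda: examples, nlp=nlp, labels=["PRP", "VBP", "NNS"])
print(tagger.labels)  # expected: ('PRP', 'VBP', 'NNS')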
@@ -154,8 +154,16 @@ class TextCategorizer(Pipe):

     @labels.setter
     def labels(self, value: List[str]) -> None:
+        # TODO: This really shouldn't be here. I had a look and I added it when
+        # I added the labels property, but it's pretty nasty to have this, and
+        # will lead to problems.
         self.cfg["labels"] = tuple(value)

+    @property
+    def label_data(self) -> List[str]:
+        """RETURNS (List[str]): Information about the component's labels."""
+        return self.labels
+
     def pipe(self, stream: Iterable[Doc], *, batch_size: int = 128) -> Iterator[Doc]:
         """Apply the pipe to a stream of documents. This usually happens under
         the hood when the nlp object is called on a text and all components are

@@ -334,43 +342,40 @@ class TextCategorizer(Pipe):
         self.labels = tuple(list(self.labels) + [label])
         return 1

-    def begin_training(
+    def initialize(
         self,
         get_examples: Callable[[], Iterable[Example]],
         *,
-        pipeline: Optional[List[Tuple[str, Callable[[Doc], Doc]]]] = None,
-        sgd: Optional[Optimizer] = None,
-    ) -> Optimizer:
+        nlp: Optional[Language] = None,
+        labels: Optional[Dict] = None,
+    ):
         """Initialize the pipe for training, using a representative set
         of data examples.

         get_examples (Callable[[], Iterable[Example]]): Function that
             returns a representative sample of gold-standard Example objects.
-        pipeline (List[Tuple[str, Callable]]): Optional list of pipeline
-            components that this component is part of. Corresponds to
-            nlp.pipeline.
-        sgd (thinc.api.Optimizer): Optional optimizer. Will be created with
-            create_optimizer if it doesn't exist.
-        RETURNS (thinc.api.Optimizer): The optimizer.
+        nlp (Language): The current nlp object the component is part of.
+        labels: The labels to add to the component, typically generated by the
+            `init labels` command. If no labels are provided, the get_examples
+            callback is used to extract the labels from the data.

-        DOCS: https://nightly.spacy.io/api/textcategorizer#begin_training
+        DOCS: https://nightly.spacy.io/api/textcategorizer#initialize
         """
         self._ensure_examples(get_examples)
-        subbatch = []  # Select a subbatch of examples to initialize the model
-        for example in islice(get_examples(), 10):
-            if len(subbatch) < 2:
-                subbatch.append(example)
-            for cat in example.y.cats:
-                self.add_label(cat)
+        if labels is None:
+            for example in get_examples():
+                for cat in example.y.cats:
+                    self.add_label(cat)
+        else:
+            for label in labels:
+                self.add_label(label)
+        subbatch = list(islice(get_examples(), 10))
         doc_sample = [eg.reference for eg in subbatch]
         label_sample, _ = self._examples_to_truth(subbatch)
         self._require_labels()
         assert len(doc_sample) > 0, Errors.E923.format(name=self.name)
         assert len(label_sample) > 0, Errors.E923.format(name=self.name)
         self.model.initialize(X=doc_sample, Y=label_sample)
-        if sgd is None:
-            sgd = self.create_optimizer()
-        return sgd

     def score(self, examples: Iterable[Example], **kwargs) -> Dict[str, Any]:
         """Score a batch of examples.
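Combined with the new `label_data` property, the category set of one pipeline can be exported and fed straight into `initialize` of another, so the training data does not have to be re-scanned. A hedged sketch, assuming a spaCy 3.x-style API; the texts and category names are made up:

import spacy
from spacy.training import Example

nlp = spacy.blank("en")
textcat = nlp.add_pipe("textcat")
examples = [
    Example.from_dict(nlp.make_doc("so good"), {"cats": {"POS": 1.0, "NEG": 0.0}}),
    Example.from_dict(nlp.make_doc("so bad"), {"cats": {"POS": 0.0, "NEG": 1.0}}),
]

# Labels can come from the data (labels=None) or be passed explicitly,
# e.g. from another pipeline's textcat.label_data.
textcat.initialize(lambda: examples, nlp=nlp, labels=["POS", "NEG"])
print(textcat.label_data)  # expected: ('POS', 'NEG')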
@@ -1,4 +1,4 @@
-from typing import Iterator, Sequence, Iterable, Optional, Dict, Callable, List, Tuple
+from typing import Iterator, Sequence, Iterable, Optional, Dict, Callable, List
 from thinc.api import Model, set_dropout_rate, Optimizer, Config
 from itertools import islice

@@ -203,26 +203,20 @@ class Tok2Vec(Pipe):
     def get_loss(self, examples, scores) -> None:
         pass

-    def begin_training(
+    def initialize(
         self,
         get_examples: Callable[[], Iterable[Example]],
         *,
-        pipeline: Optional[List[Tuple[str, Callable[[Doc], Doc]]]] = None,
-        sgd: Optional[Optimizer] = None,
+        nlp: Optional[Language] = None,
     ):
         """Initialize the pipe for training, using a representative set
         of data examples.

         get_examples (Callable[[], Iterable[Example]]): Function that
             returns a representative sample of gold-standard Example objects.
-        pipeline (List[Tuple[str, Callable]]): Optional list of pipeline
-            components that this component is part of. Corresponds to
-            nlp.pipeline.
-        sgd (thinc.api.Optimizer): Optional optimizer. Will be created with
-            create_optimizer if it doesn't exist.
-        RETURNS (thinc.api.Optimizer): The optimizer.
+        nlp (Language): The current nlp object the component is part of.

-        DOCS: https://nightly.spacy.io/api/tok2vec#begin_training
+        DOCS: https://nightly.spacy.io/api/tok2vec#initialize
         """
         self._ensure_examples(get_examples)
         doc_sample = []
@@ -1,4 +1,4 @@
-# cython: infer_types=True, cdivision=True, boundscheck=False
+# cython: infer_types=True, cdivision=True, boundscheck=False, binding=True
 from __future__ import print_function
 from cymem.cymem cimport Pool
 cimport numpy as np
@@ -7,6 +7,7 @@ from libcpp.vector cimport vector
 from libc.string cimport memset
 from libc.stdlib cimport calloc, free
 import random
+from typing import Optional

 import srsly
 from thinc.api import set_dropout_rate
@@ -95,6 +96,10 @@ cdef class Parser(Pipe):
         class_names = [self.moves.get_class_name(i) for i in range(self.moves.n_moves)]
         return class_names

+    @property
+    def label_data(self):
+        return self.moves.labels
+
     @property
     def tok2vec(self):
         """Return the embedding and convolutional layer of the model."""
@@ -354,7 +359,7 @@ cdef class Parser(Pipe):
             # If all weights for an output are 0 in the original model, don't
             # supervise that output. This allows us to add classes.
             loss += (d_scores**2).sum()
-            backprop(d_scores, sgd=sgd)
+            backprop(d_scores)
             # Follow the predicted action
             self.transition_states(states, guesses)
             states = [state for state in states if not state.is_final()]
@@ -405,18 +410,20 @@ cdef class Parser(Pipe):
     def set_output(self, nO):
         self.model.attrs["resize_output"](self.model, nO)

-    def begin_training(self, get_examples, pipeline=None, sgd=None, **kwargs):
+    def initialize(self, get_examples, nlp=None, labels=None):
         self._ensure_examples(get_examples)
-        self.cfg.update(kwargs)
         lexeme_norms = self.vocab.lookups.get_table("lexeme_norm", {})
         if len(lexeme_norms) == 0 and self.vocab.lang in util.LEXEME_NORM_LANGS:
             langs = ", ".join(util.LEXEME_NORM_LANGS)
             util.logger.debug(Warnings.W033.format(model="parser or NER", langs=langs))
-        actions = self.moves.get_actions(
-            examples=get_examples(),
-            min_freq=self.cfg['min_action_freq'],
-            learn_tokens=self.cfg["learn_tokens"]
-        )
+        if labels is not None:
+            actions = dict(labels)
+        else:
+            actions = self.moves.get_actions(
+                examples=get_examples(),
+                min_freq=self.cfg['min_action_freq'],
+                learn_tokens=self.cfg["learn_tokens"]
+            )
         for action, labels in self.moves.labels.items():
             actions.setdefault(action, {})
             for label, freq in labels.items():
@@ -425,11 +432,9 @@ cdef class Parser(Pipe):
         self.moves.initialize_actions(actions)
         # make sure we resize so we have an appropriate upper layer
         self._resize()
-        if sgd is None:
-            sgd = self.create_optimizer()
        doc_sample = []
-        if pipeline is not None:
-            for name, component in pipeline:
+        if nlp is not None:
+            for name, component in nlp.pipeline:
                 if component is self:
                     break
                 if hasattr(component, "pipe"):
@@ -441,9 +446,8 @@ cdef class Parser(Pipe):
                 doc_sample.append(example.predicted)
         assert len(doc_sample) > 0, Errors.E923.format(name=self.name)
         self.model.initialize(doc_sample)
-        if pipeline is not None:
-            self.init_multitask_objectives(get_examples, pipeline, sgd=sgd, **self.cfg)
-        return sgd
+        if nlp is not None:
+            self.init_multitask_objectives(get_examples, nlp.pipeline)

     def to_disk(self, path, exclude=tuple()):
         serializers = {
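Because the parser's `label_data` returns `self.moves.labels` in exactly the shape that `initialize` now accepts via `labels` (it becomes the actions dict through `actions = dict(labels)`), a trained pipeline's action/label frequencies can be reused when setting up a new one. A hedged sketch, assuming a spaCy 3.x-style API and that a trained pipeline such as en_core_web_sm is installed; the toy sentence and dependency annotations are illustrative:

import spacy
from spacy.training import Example

source_nlp = spacy.load("en_core_web_sm")  # assumed installed
label_data = source_nlp.get_pipe("parser").label_data

nlp = spacy.blank("en")
parser = nlp.add_pipe("parser")

doc = nlp.make_doc("I like cats")
examples = [
    Example.from_dict(doc, {"heads": [1, 1, 1], "deps": ["nsubj", "ROOT", "dobj"]})
]

# The action/label frequencies from the source pipeline are passed straight
# through instead of being re-counted from the data.
parser.initialize(lambda: examples, nlp=nlp, labels=label_data)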
144
spacy/schemas.py
144
spacy/schemas.py
|
@ -1,15 +1,17 @@
|
||||||
from typing import Dict, List, Union, Optional, Any, Callable, Type, Tuple
|
from typing import Dict, List, Union, Optional, Any, Callable, Type, Tuple
|
||||||
from typing import Iterable, TypeVar, TYPE_CHECKING
|
from typing import Iterable, TypeVar, TYPE_CHECKING
|
||||||
from enum import Enum
|
from enum import Enum
|
||||||
from pydantic import BaseModel, Field, ValidationError, validator
|
from pydantic import BaseModel, Field, ValidationError, validator, create_model
|
||||||
from pydantic import StrictStr, StrictInt, StrictFloat, StrictBool
|
from pydantic import StrictStr, StrictInt, StrictFloat, StrictBool
|
||||||
from pydantic import root_validator
|
from pydantic.main import ModelMetaclass
|
||||||
|
from thinc.api import Optimizer, ConfigValidationError
|
||||||
from thinc.config import Promise
|
from thinc.config import Promise
|
||||||
from collections import defaultdict
|
from collections import defaultdict
|
||||||
from thinc.api import Optimizer
|
import inspect
|
||||||
|
|
||||||
from .attrs import NAMES
|
from .attrs import NAMES
|
||||||
from .lookups import Lookups
|
from .lookups import Lookups
|
||||||
|
from .util import is_cython_func
|
||||||
|
|
||||||
if TYPE_CHECKING:
|
if TYPE_CHECKING:
|
||||||
# This lets us add type hints for mypy etc. without causing circular imports
|
# This lets us add type hints for mypy etc. without causing circular imports
|
||||||
|
@ -44,6 +46,96 @@ def validate(schema: Type[BaseModel], obj: Dict[str, Any]) -> List[str]:
|
||||||
return [f"[{loc}] {', '.join(msg)}" for loc, msg in data.items()]
|
return [f"[{loc}] {', '.join(msg)}" for loc, msg in data.items()]
|
||||||
|
|
||||||
|
|
||||||
|
# Initialization
|
||||||
|
|
||||||
|
|
||||||
|
class ArgSchemaConfig:
|
||||||
|
extra = "forbid"
|
||||||
|
arbitrary_types_allowed = True
|
||||||
|
|
||||||
|
|
||||||
|
class ArgSchemaConfigExtra:
|
||||||
|
extra = "forbid"
|
||||||
|
arbitrary_types_allowed = True
|
||||||
|
|
||||||
|
|
||||||
|
def get_arg_model(
|
||||||
|
func: Callable,
|
||||||
|
*,
|
||||||
|
exclude: Iterable[str] = tuple(),
|
||||||
|
name: str = "ArgModel",
|
||||||
|
strict: bool = True,
|
||||||
|
) -> ModelMetaclass:
|
||||||
|
"""Generate a pydantic model for function arguments.
|
||||||
|
|
||||||
|
func (Callable): The function to generate the schema for.
|
||||||
|
exclude (Iterable[str]): Parameter names to ignore.
|
||||||
|
name (str): Name of created model class.
|
||||||
|
strict (bool): Don't allow extra arguments if no variable keyword arguments
|
||||||
|
are allowed on the function.
|
||||||
|
RETURNS (ModelMetaclass): A pydantic model.
|
||||||
|
"""
|
||||||
|
sig_args = {}
|
||||||
|
try:
|
||||||
|
sig = inspect.signature(func)
|
||||||
|
except ValueError:
|
||||||
|
# Typically happens if the method is part of a Cython module without
|
||||||
|
# binding=True. Here we just use an empty model that allows everything.
|
||||||
|
return create_model(name, __config__=ArgSchemaConfigExtra)
|
||||||
|
has_variable = False
|
||||||
|
for param in sig.parameters.values():
|
||||||
|
if param.name in exclude:
|
||||||
|
continue
|
||||||
|
if param.kind == param.VAR_KEYWORD:
|
||||||
|
# The function allows variable keyword arguments so we shouldn't
|
||||||
|
# include **kwargs etc. in the schema and switch to non-strict
|
||||||
|
# mode and pass through all other values
|
||||||
|
has_variable = True
|
||||||
|
continue
|
||||||
|
# If no annotation is specified assume it's anything
|
||||||
|
annotation = param.annotation if param.annotation != param.empty else Any
|
||||||
|
# If no default value is specified assume that it's required. Cython
|
||||||
|
# functions/methods will have param.empty for default value None so we
|
||||||
|
# need to treat them differently
|
||||||
|
default_empty = None if is_cython_func(func) else ...
|
||||||
|
default = param.default if param.default != param.empty else default_empty
|
||||||
|
sig_args[param.name] = (annotation, default)
|
||||||
|
is_strict = strict and not has_variable
|
||||||
|
sig_args["__config__"] = ArgSchemaConfig if is_strict else ArgSchemaConfigExtra
|
||||||
|
return create_model(name, **sig_args)
|
||||||
|
|
||||||
|
|
||||||
|
def validate_init_settings(
|
||||||
|
func: Callable,
|
||||||
|
settings: Dict[str, Any],
|
||||||
|
*,
|
||||||
|
section: Optional[str] = None,
|
||||||
|
name: str = "",
|
||||||
|
exclude: Iterable[str] = ("get_examples", "nlp"),
|
||||||
|
) -> Dict[str, Any]:
|
||||||
|
"""Validate initialization settings against the expected arguments in
|
||||||
|
the method signature. Will parse values if possible (e.g. int to string)
|
||||||
|
and return the updated settings dict. Will raise a ConfigValidationError
|
||||||
|
if types don't match or required values are missing.
|
||||||
|
|
||||||
|
func (Callable): The initialize method of a given component etc.
|
||||||
|
settings (Dict[str, Any]): The settings from the repsective [initialize] block.
|
||||||
|
section (str): Initialize section, for error message.
|
||||||
|
name (str): Name of the block in the section.
|
||||||
|
exclude (Iterable[str]): Parameter names to exclude from schema.
|
||||||
|
RETURNS (Dict[str, Any]): The validated settings.
|
||||||
|
"""
|
||||||
|
schema = get_arg_model(func, exclude=exclude, name="InitArgModel")
|
||||||
|
try:
|
||||||
|
return schema(**settings).dict()
|
||||||
|
except ValidationError as e:
|
||||||
|
block = "initialize" if not section else f"initialize.{section}"
|
||||||
|
title = f"Error validating initialization settings in [{block}]"
|
||||||
|
raise ConfigValidationError(
|
||||||
|
title=title, errors=e.errors(), config=settings, parent=name
|
||||||
|
) from None
|
||||||
|
|
||||||
|
|
||||||
# Matcher token patterns
|
# Matcher token patterns
|
||||||
|
|
||||||
|
|
||||||
|
@ -190,7 +282,7 @@ class ModelMetaSchema(BaseModel):
|
||||||
sources: Optional[Union[List[StrictStr], List[Dict[str, str]]]] = Field(None, title="Training data sources")
|
sources: Optional[Union[List[StrictStr], List[Dict[str, str]]]] = Field(None, title="Training data sources")
|
||||||
vectors: Dict[str, Any] = Field({}, title="Included word vectors")
|
vectors: Dict[str, Any] = Field({}, title="Included word vectors")
|
||||||
labels: Dict[str, List[str]] = Field({}, title="Component labels, keyed by component name")
|
labels: Dict[str, List[str]] = Field({}, title="Component labels, keyed by component name")
|
||||||
performance: Dict[str, Union[float, Dict[str, float]]] = Field({}, title="Accuracy and speed numbers")
|
performance: Dict[str, Union[float, Dict[str, Union[float, dict]]]] = Field({}, title="Accuracy and speed numbers")
|
||||||
spacy_git_version: StrictStr = Field("", title="Commit of spaCy version used")
|
spacy_git_version: StrictStr = Field("", title="Commit of spaCy version used")
|
||||||
# fmt: on
|
# fmt: on
|
||||||
|
|
||||||
|
@ -205,8 +297,6 @@ class ModelMetaSchema(BaseModel):
|
||||||
|
|
||||||
class ConfigSchemaTraining(BaseModel):
|
class ConfigSchemaTraining(BaseModel):
|
||||||
# fmt: off
|
# fmt: off
|
||||||
vectors: Optional[StrictStr] = Field(..., title="Path to vectors")
|
|
||||||
lookups: Optional[Lookups] = Field(..., title="Vocab lookups")
|
|
||||||
dev_corpus: StrictStr = Field(..., title="Path in the config to the dev data")
|
dev_corpus: StrictStr = Field(..., title="Path in the config to the dev data")
|
||||||
train_corpus: StrictStr = Field(..., title="Path in the config to the training data")
|
train_corpus: StrictStr = Field(..., title="Path in the config to the training data")
|
||||||
batcher: Batcher = Field(..., title="Batcher for the training data")
|
batcher: Batcher = Field(..., title="Batcher for the training data")
|
||||||
|
@ -219,8 +309,6 @@ class ConfigSchemaTraining(BaseModel):
|
||||||
gpu_allocator: Optional[StrictStr] = Field(..., title="Memory allocator when running on GPU")
|
gpu_allocator: Optional[StrictStr] = Field(..., title="Memory allocator when running on GPU")
|
||||||
accumulate_gradient: StrictInt = Field(..., title="Whether to divide the batch up into substeps")
|
accumulate_gradient: StrictInt = Field(..., title="Whether to divide the batch up into substeps")
|
||||||
score_weights: Dict[StrictStr, Optional[Union[StrictFloat, StrictInt]]] = Field(..., title="Scores to report and their weights for selecting final model")
|
score_weights: Dict[StrictStr, Optional[Union[StrictFloat, StrictInt]]] = Field(..., title="Scores to report and their weights for selecting final model")
|
||||||
init_tok2vec: Optional[StrictStr] = Field(..., title="Path to pretrained tok2vec weights")
|
|
||||||
raw_text: Optional[StrictStr] = Field(default=None, title="Raw text")
|
|
||||||
optimizer: Optimizer = Field(..., title="The optimizer to use")
|
optimizer: Optimizer = Field(..., title="The optimizer to use")
|
||||||
logger: Logger = Field(..., title="The logger to track training progress")
|
logger: Logger = Field(..., title="The logger to track training progress")
|
||||||
frozen_components: List[str] = Field(..., title="Pipeline components that shouldn't be updated during training")
|
frozen_components: List[str] = Field(..., title="Pipeline components that shouldn't be updated during training")
|
||||||
|
@ -273,36 +361,40 @@ class ConfigSchemaPretrain(BaseModel):
|
||||||
arbitrary_types_allowed = True
|
arbitrary_types_allowed = True
|
||||||
|
|
||||||
|
|
||||||
|
class ConfigSchemaInit(BaseModel):
|
||||||
|
# fmt: off
|
||||||
|
vocab_data: Optional[StrictStr] = Field(..., title="Path to JSON-formatted vocabulary file")
|
||||||
|
lookups: Optional[Lookups] = Field(..., title="Vocabulary lookups, e.g. lexeme normalization")
|
||||||
|
vectors: Optional[StrictStr] = Field(..., title="Path to vectors")
|
||||||
|
init_tok2vec: Optional[StrictStr] = Field(..., title="Path to pretrained tok2vec weights")
|
||||||
|
tokenizer: Dict[StrictStr, Any] = Field(..., help="Arguments to be passed into Tokenizer.initialize")
|
||||||
|
components: Dict[StrictStr, Dict[StrictStr, Any]] = Field(..., help="Arguments for Pipe.initialize methods of pipeline components, keyed by component")
|
||||||
|
# fmt: on
|
||||||
|
|
||||||
|
class Config:
|
||||||
|
extra = "forbid"
|
||||||
|
arbitrary_types_allowed = True
|
||||||
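The new ConfigSchemaInit class models the [initialize] section of a training config. A minimal standalone sketch of the same idea using only pydantic; the InitSchema model below is an illustrative stand-in, not spaCy's own class:

from typing import Any, Dict, Optional
from pydantic import BaseModel, StrictStr, ValidationError

class InitSchema(BaseModel):
    # Field names mirror the diff above; defaults are simplified for the sketch.
    vocab_data: Optional[StrictStr] = None
    vectors: Optional[StrictStr] = None
    init_tok2vec: Optional[StrictStr] = None
    tokenizer: Dict[StrictStr, Any] = {}
    components: Dict[StrictStr, Dict[StrictStr, Any]] = {}

    class Config:
        extra = "forbid"

InitSchema(vectors="en_vectors", components={"ner": {}})  # validates
try:
    InitSchema(unknown_key=True)  # extra = "forbid" rejects unknown settings
except ValidationError as err:
    print(err)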
|
|
||||||
|
|
||||||
class ConfigSchema(BaseModel):
|
class ConfigSchema(BaseModel):
|
||||||
training: ConfigSchemaTraining
|
training: ConfigSchemaTraining
|
||||||
nlp: ConfigSchemaNlp
|
nlp: ConfigSchemaNlp
|
||||||
pretraining: Union[ConfigSchemaPretrain, ConfigSchemaPretrainEmpty] = {}
|
pretraining: Union[ConfigSchemaPretrain, ConfigSchemaPretrainEmpty] = {}
|
||||||
components: Dict[str, Dict[str, Any]]
|
components: Dict[str, Dict[str, Any]]
|
||||||
corpora: Dict[str, Reader]
|
corpora: Dict[str, Reader]
|
||||||
|
initialize: ConfigSchemaInit
|
||||||
@root_validator(allow_reuse=True)
|
|
||||||
def validate_config(cls, values):
|
|
||||||
"""Perform additional validation for settings with dependencies."""
|
|
||||||
pt = values.get("pretraining")
|
|
||||||
if pt and not isinstance(pt, ConfigSchemaPretrainEmpty):
|
|
||||||
if pt.objective.get("type") == "vectors" and not values["nlp"].vectors:
|
|
||||||
err = "Need nlp.vectors if pretraining.objective.type is vectors"
|
|
||||||
raise ValueError(err)
|
|
||||||
return values
|
|
||||||
|
|
||||||
class Config:
|
class Config:
|
||||||
extra = "allow"
|
extra = "allow"
|
||||||
arbitrary_types_allowed = True
|
arbitrary_types_allowed = True
|
||||||
|
|
||||||
|
|
||||||
class TrainingSchema(BaseModel):
|
CONFIG_SCHEMAS = {
|
||||||
training: ConfigSchemaTraining
|
"nlp": ConfigSchemaNlp,
|
||||||
pretraining: Union[ConfigSchemaPretrain, ConfigSchemaPretrainEmpty] = {}
|
"training": ConfigSchemaTraining,
|
||||||
corpora: Dict[str, Reader]
|
"pretraining": ConfigSchemaPretrain,
|
||||||
|
"initialize": ConfigSchemaInit,
|
||||||
class Config:
|
}
|
||||||
extra = "allow"
|
|
||||||
arbitrary_types_allowed = True
|
|
||||||
|
|
||||||
|
|
||||||
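The hunk above replaces the old top-level TrainingSchema with a CONFIG_SCHEMAS mapping from section name to schema class, so each config section can be validated on its own. A hedged sketch of that lookup pattern; the helper name and toy models below are hypothetical, not spaCy's:

from typing import Any, Dict, Type
from pydantic import BaseModel, StrictInt, StrictStr

class ToyTraining(BaseModel):
    seed: StrictInt = 0

class ToyNlp(BaseModel):
    lang: StrictStr = "en"

TOY_SCHEMAS: Dict[str, Type[BaseModel]] = {"training": ToyTraining, "nlp": ToyNlp}

def validate_section(name: str, data: Dict[str, Any]) -> BaseModel:
    # Look up the schema registered for the section and validate its data.
    return TOY_SCHEMAS[name](**data)

print(validate_section("training", {"seed": 1}))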
# Project config Schema
|
# Project config Schema
|
||||||
|
|
|
@ -32,9 +32,7 @@ class PRFScore:
|
||||||
|
|
||||||
def __add__(self, other):
|
def __add__(self, other):
|
||||||
return PRFScore(
|
return PRFScore(
|
||||||
tp=self.tp+other.tp,
|
tp=self.tp + other.tp, fp=self.fp + other.fp, fn=self.fn + other.fn
|
||||||
fp=self.fp+other.fp,
|
|
||||||
fn=self.fn+other.fn
|
|
||||||
)
|
)
|
||||||
|
|
||||||
def score_set(self, cand: set, gold: set) -> None:
|
def score_set(self, cand: set, gold: set) -> None:
|
||||||
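PRFScore keeps raw true positive, false positive and false negative counts, so __add__ is element-wise addition of those counts and precision/recall/F follow from the totals. A standalone stand-in with the usual formulas; the epsilon guard is illustrative, not spaCy's exact code:

from dataclasses import dataclass

@dataclass
class PRF:
    tp: int = 0
    fp: int = 0
    fn: int = 0

    def __add__(self, other: "PRF") -> "PRF":
        return PRF(self.tp + other.tp, self.fp + other.fp, self.fn + other.fn)

    @property
    def precision(self) -> float:
        return self.tp / (self.tp + self.fp + 1e-100)

    @property
    def recall(self) -> float:
        return self.tp / (self.tp + self.fn + 1e-100)

    @property
    def fscore(self) -> float:
        p, r = self.precision, self.recall
        return 2 * p * r / (p + r + 1e-100)

print((PRF(tp=3, fp=1, fn=2) + PRF(tp=1, fp=0, fn=1)).fscore)  # 2*4 / (2*4 + 1 + 3) ≈ 0.667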
|
@ -485,7 +483,7 @@ class Scorer:
|
||||||
(pred_ent.start_char, pred_ent.end_char), None
|
(pred_ent.start_char, pred_ent.end_char), None
|
||||||
)
|
)
|
||||||
label = gold_span.label_
|
label = gold_span.label_
|
||||||
if not label in f_per_type:
|
if label not in f_per_type:
|
||||||
f_per_type[label] = PRFScore()
|
f_per_type[label] = PRFScore()
|
||||||
gold = gold_span.kb_id_
|
gold = gold_span.kb_id_
|
||||||
# only evaluating entities that overlap between gold and pred,
|
# only evaluating entities that overlap between gold and pred,
|
||||||
|
@ -632,7 +630,6 @@ def get_ner_prf(examples: Iterable[Example]) -> Dict[str, PRFScore]:
|
||||||
continue
|
continue
|
||||||
golds = {(e.label_, e.start, e.end) for e in eg.y.ents}
|
golds = {(e.label_, e.start, e.end) for e in eg.y.ents}
|
||||||
align_x2y = eg.alignment.x2y
|
align_x2y = eg.alignment.x2y
|
||||||
preds = set()
|
|
||||||
for pred_ent in eg.x.ents:
|
for pred_ent in eg.x.ents:
|
||||||
if pred_ent.label_ not in scores:
|
if pred_ent.label_ not in scores:
|
||||||
scores[pred_ent.label_] = PRFScore()
|
scores[pred_ent.label_] = PRFScore()
|
||||||
|
|
|
@ -272,22 +272,35 @@ def zh_tokenizer_char():
|
||||||
def zh_tokenizer_jieba():
|
def zh_tokenizer_jieba():
|
||||||
pytest.importorskip("jieba")
|
pytest.importorskip("jieba")
|
||||||
config = {
|
config = {
|
||||||
"@tokenizers": "spacy.zh.ChineseTokenizer",
|
"nlp": {
|
||||||
"segmenter": "jieba",
|
"tokenizer": {
|
||||||
|
"@tokenizers": "spacy.zh.ChineseTokenizer",
|
||||||
|
"segmenter": "jieba",
|
||||||
|
}
|
||||||
|
}
|
||||||
}
|
}
|
||||||
nlp = get_lang_class("zh").from_config({"nlp": {"tokenizer": config}})
|
nlp = get_lang_class("zh").from_config(config)
|
||||||
return nlp.tokenizer
|
return nlp.tokenizer
|
||||||
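The jieba fixture above now builds the tokenizer from a full nested config ({"nlp": {"tokenizer": ...}}) instead of wrapping a bare tokenizer block at call time. A standalone sketch of the same construction, assuming spaCy and jieba are installed; the example sentence is arbitrary:

from spacy.util import get_lang_class

config = {
    "nlp": {
        "tokenizer": {
            "@tokenizers": "spacy.zh.ChineseTokenizer",
            "segmenter": "jieba",
        }
    }
}
nlp = get_lang_class("zh").from_config(config)
print([t.text for t in nlp.tokenizer("西门子将努力参与中国的三峡工程建设。")])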
|
|
||||||
|
|
||||||
@pytest.fixture(scope="session")
|
@pytest.fixture(scope="session")
|
||||||
def zh_tokenizer_pkuseg():
|
def zh_tokenizer_pkuseg():
|
||||||
pytest.importorskip("pkuseg")
|
pytest.importorskip("pkuseg")
|
||||||
|
pytest.importorskip("pickle5")
|
||||||
config = {
|
config = {
|
||||||
"@tokenizers": "spacy.zh.ChineseTokenizer",
|
"nlp": {
|
||||||
"segmenter": "pkuseg",
|
"tokenizer": {
|
||||||
"pkuseg_model": "default",
|
"@tokenizers": "spacy.zh.ChineseTokenizer",
|
||||||
|
"segmenter": "pkuseg",
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"initialize": {"tokenizer": {
|
||||||
|
"pkuseg_model": "default",
|
||||||
|
}
|
||||||
|
},
|
||||||
}
|
}
|
||||||
nlp = get_lang_class("zh").from_config({"nlp": {"tokenizer": config}})
|
nlp = get_lang_class("zh").from_config(config)
|
||||||
|
nlp.initialize()
|
||||||
return nlp.tokenizer
|
return nlp.tokenizer
|
||||||
|
|
||||||
|
|
||||||
|
|
|
@ -26,7 +26,7 @@ def test_doc_add_entities_set_ents_iob(en_vocab):
|
||||||
cfg = {"model": DEFAULT_NER_MODEL}
|
cfg = {"model": DEFAULT_NER_MODEL}
|
||||||
model = registry.resolve(cfg, validate=True)["model"]
|
model = registry.resolve(cfg, validate=True)["model"]
|
||||||
ner = EntityRecognizer(en_vocab, model, **config)
|
ner = EntityRecognizer(en_vocab, model, **config)
|
||||||
ner.begin_training(lambda: [_ner_example(ner)])
|
ner.initialize(lambda: [_ner_example(ner)])
|
||||||
ner(doc)
|
ner(doc)
|
||||||
|
|
||||||
doc.ents = [("ANIMAL", 3, 4)]
|
doc.ents = [("ANIMAL", 3, 4)]
|
||||||
|
@ -48,7 +48,7 @@ def test_ents_reset(en_vocab):
|
||||||
cfg = {"model": DEFAULT_NER_MODEL}
|
cfg = {"model": DEFAULT_NER_MODEL}
|
||||||
model = registry.resolve(cfg, validate=True)["model"]
|
model = registry.resolve(cfg, validate=True)["model"]
|
||||||
ner = EntityRecognizer(en_vocab, model, **config)
|
ner = EntityRecognizer(en_vocab, model, **config)
|
||||||
ner.begin_training(lambda: [_ner_example(ner)])
|
ner.initialize(lambda: [_ner_example(ner)])
|
||||||
ner(doc)
|
ner(doc)
|
||||||
orig_iobs = [t.ent_iob_ for t in doc]
|
orig_iobs = [t.ent_iob_ for t in doc]
|
||||||
doc.ents = list(doc.ents)
|
doc.ents = list(doc.ents)
|
||||||
|
|
|
@ -46,9 +46,9 @@ def test_doc_array_morph(en_vocab):
|
||||||
words = ["Eat", "blue", "ham"]
|
words = ["Eat", "blue", "ham"]
|
||||||
morph = ["Feat=V", "Feat=J", "Feat=N"]
|
morph = ["Feat=V", "Feat=J", "Feat=N"]
|
||||||
doc = Doc(en_vocab, words=words, morphs=morph)
|
doc = Doc(en_vocab, words=words, morphs=morph)
|
||||||
assert morph[0] == doc[0].morph_
|
assert morph[0] == str(doc[0].morph)
|
||||||
assert morph[1] == doc[1].morph_
|
assert morph[1] == str(doc[1].morph)
|
||||||
assert morph[2] == doc[2].morph_
|
assert morph[2] == str(doc[2].morph)
|
||||||
|
|
||||||
feats_array = doc.to_array((ORTH, MORPH))
|
feats_array = doc.to_array((ORTH, MORPH))
|
||||||
assert feats_array[0][1] == doc[0].morph.key
|
assert feats_array[0][1] == doc[0].morph.key
|
||||||
|
|
|
@ -19,7 +19,7 @@ def test_doc_api_init(en_vocab):
|
||||||
assert [t.is_sent_start for t in doc] == [True, False, True, False]
|
assert [t.is_sent_start for t in doc] == [True, False, True, False]
|
||||||
# heads override sent_starts
|
# heads override sent_starts
|
||||||
doc = Doc(
|
doc = Doc(
|
||||||
en_vocab, words=words, sent_starts=[True] * 4, heads=heads, deps=["dep"] * 4,
|
en_vocab, words=words, sent_starts=[True] * 4, heads=heads, deps=["dep"] * 4
|
||||||
)
|
)
|
||||||
assert [t.is_sent_start for t in doc] == [True, False, True, False]
|
assert [t.is_sent_start for t in doc] == [True, False, True, False]
|
||||||
|
|
||||||
|
@ -319,15 +319,13 @@ def test_doc_from_array_morph(en_vocab):
|
||||||
words = ["I", "live", "in", "New", "York", "."]
|
words = ["I", "live", "in", "New", "York", "."]
|
||||||
morphs = ["Feat1=A", "Feat1=B", "Feat1=C", "Feat1=A|Feat2=D", "Feat2=E", "Feat3=F"]
|
morphs = ["Feat1=A", "Feat1=B", "Feat1=C", "Feat1=A|Feat2=D", "Feat2=E", "Feat3=F"]
|
||||||
# fmt: on
|
# fmt: on
|
||||||
doc = Doc(en_vocab, words=words)
|
doc = Doc(en_vocab, words=words, morphs=morphs)
|
||||||
for i, morph in enumerate(morphs):
|
|
||||||
doc[i].morph_ = morph
|
|
||||||
attrs = [MORPH]
|
attrs = [MORPH]
|
||||||
arr = doc.to_array(attrs)
|
arr = doc.to_array(attrs)
|
||||||
new_doc = Doc(en_vocab, words=words)
|
new_doc = Doc(en_vocab, words=words)
|
||||||
new_doc.from_array(attrs, arr)
|
new_doc.from_array(attrs, arr)
|
||||||
assert [t.morph_ for t in new_doc] == morphs
|
assert [str(t.morph) for t in new_doc] == morphs
|
||||||
assert [t.morph_ for t in doc] == [t.morph_ for t in new_doc]
|
assert [str(t.morph) for t in doc] == [str(t.morph) for t in new_doc]
|
||||||
|
|
||||||
|
|
||||||
def test_doc_api_from_docs(en_tokenizer, de_tokenizer):
|
def test_doc_api_from_docs(en_tokenizer, de_tokenizer):
|
||||||
|
@ -423,7 +421,7 @@ def test_has_annotation(en_vocab):
|
||||||
|
|
||||||
doc[0].tag_ = "A"
|
doc[0].tag_ = "A"
|
||||||
doc[0].pos_ = "X"
|
doc[0].pos_ = "X"
|
||||||
doc[0].morph_ = "Feat=Val"
|
doc[0].set_morph("Feat=Val")
|
||||||
doc[0].lemma_ = "a"
|
doc[0].lemma_ = "a"
|
||||||
doc[0].dep_ = "dep"
|
doc[0].dep_ = "dep"
|
||||||
doc[0].head = doc[1]
|
doc[0].head = doc[1]
|
||||||
|
@ -435,7 +433,7 @@ def test_has_annotation(en_vocab):
|
||||||
|
|
||||||
doc[1].tag_ = "A"
|
doc[1].tag_ = "A"
|
||||||
doc[1].pos_ = "X"
|
doc[1].pos_ = "X"
|
||||||
doc[1].morph_ = ""
|
doc[1].set_morph("")
|
||||||
doc[1].lemma_ = "a"
|
doc[1].lemma_ = "a"
|
||||||
doc[1].dep_ = "dep"
|
doc[1].dep_ = "dep"
|
||||||
doc.ents = [Span(doc, 0, 2, label="HELLO")]
|
doc.ents = [Span(doc, 0, 2, label="HELLO")]
|
||||||
|
@ -533,5 +531,78 @@ def test_doc_ents_setter():
|
||||||
assert [e.label_ for e in doc.ents] == ["HELLO", "WORLD"]
|
assert [e.label_ for e in doc.ents] == ["HELLO", "WORLD"]
|
||||||
vocab = Vocab()
|
vocab = Vocab()
|
||||||
ents = [("HELLO", 0, 2), (vocab.strings.add("WORLD"), 3, 5)]
|
ents = [("HELLO", 0, 2), (vocab.strings.add("WORLD"), 3, 5)]
|
||||||
|
ents = ["B-HELLO", "I-HELLO", "O", "B-WORLD", "I-WORLD"]
|
||||||
doc = Doc(vocab, words=words, ents=ents)
|
doc = Doc(vocab, words=words, ents=ents)
|
||||||
assert [e.label_ for e in doc.ents] == ["HELLO", "WORLD"]
|
assert [e.label_ for e in doc.ents] == ["HELLO", "WORLD"]
|
||||||
|
|
||||||
|
|
||||||
|
def test_doc_morph_setter(en_tokenizer, de_tokenizer):
|
||||||
|
doc1 = en_tokenizer("a b")
|
||||||
|
doc1b = en_tokenizer("c d")
|
||||||
|
doc2 = de_tokenizer("a b")
|
||||||
|
|
||||||
|
# unset values can be copied
|
||||||
|
doc1[0].morph = doc1[1].morph
|
||||||
|
assert doc1[0].morph.key == 0
|
||||||
|
assert doc1[1].morph.key == 0
|
||||||
|
|
||||||
|
# morph values from the same vocab can be copied
|
||||||
|
doc1[0].set_morph("Feat=Val")
|
||||||
|
doc1[1].morph = doc1[0].morph
|
||||||
|
assert doc1[0].morph == doc1[1].morph
|
||||||
|
|
||||||
|
# ... also across docs
|
||||||
|
doc1b[0].morph = doc1[0].morph
|
||||||
|
assert doc1[0].morph == doc1b[0].morph
|
||||||
|
|
||||||
|
doc2[0].set_morph("Feat2=Val2")
|
||||||
|
|
||||||
|
# the morph value must come from the same vocab
|
||||||
|
with pytest.raises(ValueError):
|
||||||
|
doc1[0].morph = doc2[0].morph
|
||||||
|
|
||||||
|
|
||||||
|
def test_doc_init_iob():
|
||||||
|
"""Test ents validation/normalization in Doc.__init__"""
|
||||||
|
words = ["a", "b", "c", "d", "e"]
|
||||||
|
ents = ["O"] * len(words)
|
||||||
|
doc = Doc(Vocab(), words=words, ents=ents)
|
||||||
|
assert doc.ents == ()
|
||||||
|
|
||||||
|
ents = ["B-PERSON", "I-PERSON", "O", "I-PERSON", "I-PERSON"]
|
||||||
|
doc = Doc(Vocab(), words=words, ents=ents)
|
||||||
|
assert len(doc.ents) == 2
|
||||||
|
|
||||||
|
ents = ["B-PERSON", "I-PERSON", "O", "I-PERSON", "I-GPE"]
|
||||||
|
doc = Doc(Vocab(), words=words, ents=ents)
|
||||||
|
assert len(doc.ents) == 3
|
||||||
|
|
||||||
|
# None is missing
|
||||||
|
ents = ["B-PERSON", "I-PERSON", "O", None, "I-GPE"]
|
||||||
|
doc = Doc(Vocab(), words=words, ents=ents)
|
||||||
|
assert len(doc.ents) == 2
|
||||||
|
|
||||||
|
# empty tag is missing
|
||||||
|
ents = ["", "B-PERSON", "O", "B-PERSON", "I-PERSON"]
|
||||||
|
doc = Doc(Vocab(), words=words, ents=ents)
|
||||||
|
assert len(doc.ents) == 2
|
||||||
|
|
||||||
|
# invalid IOB
|
||||||
|
ents = ["Q-PERSON", "I-PERSON", "O", "I-PERSON", "I-GPE"]
|
||||||
|
with pytest.raises(ValueError):
|
||||||
|
doc = Doc(Vocab(), words=words, ents=ents)
|
||||||
|
|
||||||
|
# no dash
|
||||||
|
ents = ["OPERSON", "I-PERSON", "O", "I-PERSON", "I-GPE"]
|
||||||
|
with pytest.raises(ValueError):
|
||||||
|
doc = Doc(Vocab(), words=words, ents=ents)
|
||||||
|
|
||||||
|
# no ent type
|
||||||
|
ents = ["O", "B-", "O", "I-PERSON", "I-GPE"]
|
||||||
|
with pytest.raises(ValueError):
|
||||||
|
doc = Doc(Vocab(), words=words, ents=ents)
|
||||||
|
|
||||||
|
# not strings or None
|
||||||
|
ents = [0, "B-", "O", "I-PERSON", "I-GPE"]
|
||||||
|
with pytest.raises(ValueError):
|
||||||
|
doc = Doc(Vocab(), words=words, ents=ents)
|
||||||
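The new test_doc_init_iob exercises the IOB-style ents argument that Doc.__init__ validates and normalizes. A minimal usage sketch; the words and labels are illustrative:

from spacy.tokens import Doc
from spacy.vocab import Vocab

words = ["Ann", "lives", "in", "New", "York"]
ents = ["B-PERSON", "O", "O", "B-GPE", "I-GPE"]
doc = Doc(Vocab(), words=words, ents=ents)
print([(e.text, e.label_) for e in doc.ents])  # [('Ann', 'PERSON'), ('New York', 'GPE')]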
|
|
|
@ -4,13 +4,13 @@ import pytest
|
||||||
@pytest.fixture
|
@pytest.fixture
|
||||||
def i_has(en_tokenizer):
|
def i_has(en_tokenizer):
|
||||||
doc = en_tokenizer("I has")
|
doc = en_tokenizer("I has")
|
||||||
doc[0].morph_ = {"PronType": "prs"}
|
doc[0].set_morph({"PronType": "prs"})
|
||||||
doc[1].morph_ = {
|
doc[1].set_morph({
|
||||||
"VerbForm": "fin",
|
"VerbForm": "fin",
|
||||||
"Tense": "pres",
|
"Tense": "pres",
|
||||||
"Number": "sing",
|
"Number": "sing",
|
||||||
"Person": "three",
|
"Person": "three",
|
||||||
}
|
})
|
||||||
|
|
||||||
return doc
|
return doc
|
||||||
|
|
||||||
|
@ -47,20 +47,20 @@ def test_morph_get(i_has):
|
||||||
def test_morph_set(i_has):
|
def test_morph_set(i_has):
|
||||||
assert i_has[0].morph.get("PronType") == ["prs"]
|
assert i_has[0].morph.get("PronType") == ["prs"]
|
||||||
# set by string
|
# set by string
|
||||||
i_has[0].morph_ = "PronType=unk"
|
i_has[0].set_morph("PronType=unk")
|
||||||
assert i_has[0].morph.get("PronType") == ["unk"]
|
assert i_has[0].morph.get("PronType") == ["unk"]
|
||||||
# set by string, fields are alphabetized
|
# set by string, fields are alphabetized
|
||||||
i_has[0].morph_ = "PronType=123|NounType=unk"
|
i_has[0].set_morph("PronType=123|NounType=unk")
|
||||||
assert i_has[0].morph_ == "NounType=unk|PronType=123"
|
assert str(i_has[0].morph) == "NounType=unk|PronType=123"
|
||||||
# set by dict
|
# set by dict
|
||||||
i_has[0].morph_ = {"AType": "123", "BType": "unk"}
|
i_has[0].set_morph({"AType": "123", "BType": "unk"})
|
||||||
assert i_has[0].morph_ == "AType=123|BType=unk"
|
assert str(i_has[0].morph) == "AType=123|BType=unk"
|
||||||
# set by string with multiple values, fields and values are alphabetized
|
# set by string with multiple values, fields and values are alphabetized
|
||||||
i_has[0].morph_ = "BType=c|AType=b,a"
|
i_has[0].set_morph("BType=c|AType=b,a")
|
||||||
assert i_has[0].morph_ == "AType=a,b|BType=c"
|
assert str(i_has[0].morph) == "AType=a,b|BType=c"
|
||||||
# set by dict with multiple values, fields and values are alphabetized
|
# set by dict with multiple values, fields and values are alphabetized
|
||||||
i_has[0].morph_ = {"AType": "b,a", "BType": "c"}
|
i_has[0].set_morph({"AType": "b,a", "BType": "c"})
|
||||||
assert i_has[0].morph_ == "AType=a,b|BType=c"
|
assert str(i_has[0].morph) == "AType=a,b|BType=c"
|
||||||
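Throughout these tests the removed morph_ setter is replaced by Token.set_morph, which accepts a FEATS string, a dict, None or an existing hash, while reads go through str(token.morph). A short sketch with illustrative features:

from spacy.tokens import Doc
from spacy.vocab import Vocab

doc = Doc(Vocab(), words=["I", "run"])
doc[1].set_morph("Person=3|Number=Sing")  # set from a FEATS string
print(str(doc[1].morph))                  # Number=Sing|Person=3 (fields alphabetized)
doc[1].set_morph({"Tense": "Pres"})       # set from a dict
doc[1].set_morph(None)                    # unset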
|
|
||||||
|
|
||||||
def test_morph_str(i_has):
|
def test_morph_str(i_has):
|
||||||
|
@ -72,25 +72,25 @@ def test_morph_property(tokenizer):
|
||||||
doc = tokenizer("a dog")
|
doc = tokenizer("a dog")
|
||||||
|
|
||||||
# set through token.morph_
|
# set through token.morph_
|
||||||
doc[0].morph_ = "PronType=prs"
|
doc[0].set_morph("PronType=prs")
|
||||||
assert doc[0].morph_ == "PronType=prs"
|
assert str(doc[0].morph) == "PronType=prs"
|
||||||
assert doc.to_array(["MORPH"])[0] != 0
|
assert doc.to_array(["MORPH"])[0] != 0
|
||||||
|
|
||||||
# unset with token.morph
|
# unset with token.morph
|
||||||
doc[0].morph = 0
|
doc[0].set_morph(None)
|
||||||
assert doc.to_array(["MORPH"])[0] == 0
|
assert doc.to_array(["MORPH"])[0] == 0
|
||||||
|
|
||||||
# empty morph is equivalent to "_"
|
# empty morph is equivalent to "_"
|
||||||
doc[0].morph_ = ""
|
doc[0].set_morph("")
|
||||||
assert doc[0].morph_ == ""
|
assert str(doc[0].morph) == ""
|
||||||
assert doc.to_array(["MORPH"])[0] == tokenizer.vocab.strings["_"]
|
assert doc.to_array(["MORPH"])[0] == tokenizer.vocab.strings["_"]
|
||||||
|
|
||||||
# "_" morph is also equivalent to empty morph
|
# "_" morph is also equivalent to empty morph
|
||||||
doc[0].morph_ = "_"
|
doc[0].set_morph("_")
|
||||||
assert doc[0].morph_ == ""
|
assert str(doc[0].morph) == ""
|
||||||
assert doc.to_array(["MORPH"])[0] == tokenizer.vocab.strings["_"]
|
assert doc.to_array(["MORPH"])[0] == tokenizer.vocab.strings["_"]
|
||||||
|
|
||||||
# set through existing hash with token.morph
|
# set through existing hash with token.morph
|
||||||
tokenizer.vocab.strings.add("Feat=Val")
|
tokenizer.vocab.strings.add("Feat=Val")
|
||||||
doc[0].morph = tokenizer.vocab.strings.add("Feat=Val")
|
doc[0].set_morph(tokenizer.vocab.strings.add("Feat=Val"))
|
||||||
assert doc[0].morph_ == "Feat=Val"
|
assert str(doc[0].morph) == "Feat=Val"
|
||||||
|
|
|
@ -21,11 +21,11 @@ def test_doc_retokenize_merge(en_tokenizer):
|
||||||
assert doc[4].text == "the beach boys"
|
assert doc[4].text == "the beach boys"
|
||||||
assert doc[4].text_with_ws == "the beach boys "
|
assert doc[4].text_with_ws == "the beach boys "
|
||||||
assert doc[4].tag_ == "NAMED"
|
assert doc[4].tag_ == "NAMED"
|
||||||
assert doc[4].morph_ == "Number=Plur"
|
assert str(doc[4].morph) == "Number=Plur"
|
||||||
assert doc[5].text == "all night"
|
assert doc[5].text == "all night"
|
||||||
assert doc[5].text_with_ws == "all night"
|
assert doc[5].text_with_ws == "all night"
|
||||||
assert doc[5].tag_ == "NAMED"
|
assert doc[5].tag_ == "NAMED"
|
||||||
assert doc[5].morph_ == "Number=Plur"
|
assert str(doc[5].morph) == "Number=Plur"
|
||||||
|
|
||||||
|
|
||||||
def test_doc_retokenize_merge_children(en_tokenizer):
|
def test_doc_retokenize_merge_children(en_tokenizer):
|
||||||
|
@ -201,6 +201,12 @@ def test_doc_retokenize_spans_entity_merge(en_tokenizer):
|
||||||
heads = [1, 2, 2, 4, 6, 4, 2, 8, 6, 8, 9, 8, 8, 14, 12, 2, 15]
|
heads = [1, 2, 2, 4, 6, 4, 2, 8, 6, 8, 9, 8, 8, 14, 12, 2, 15]
|
||||||
tags = ["NNP", "NNP", "VBZ", "DT", "VB", "RP", "NN", "WP", "VBZ", "IN", "NNP", "CC", "VBZ", "NNP", "NNP", ".", "SP"]
|
tags = ["NNP", "NNP", "VBZ", "DT", "VB", "RP", "NN", "WP", "VBZ", "IN", "NNP", "CC", "VBZ", "NNP", "NNP", ".", "SP"]
|
||||||
ents = [("PERSON", 0, 2), ("GPE", 10, 11), ("PERSON", 13, 15)]
|
ents = [("PERSON", 0, 2), ("GPE", 10, 11), ("PERSON", 13, 15)]
|
||||||
|
ents = ["O"] * len(heads)
|
||||||
|
ents[0] = "B-PERSON"
|
||||||
|
ents[1] = "I-PERSON"
|
||||||
|
ents[10] = "B-GPE"
|
||||||
|
ents[13] = "B-PERSON"
|
||||||
|
ents[14] = "I-PERSON"
|
||||||
# fmt: on
|
# fmt: on
|
||||||
tokens = en_tokenizer(text)
|
tokens = en_tokenizer(text)
|
||||||
doc = Doc(
|
doc = Doc(
|
||||||
|
@ -269,7 +275,11 @@ def test_doc_retokenize_spans_entity_merge_iob(en_vocab):
|
||||||
# if there is a parse, span.root provides default values
|
# if there is a parse, span.root provides default values
|
||||||
words = ["a", "b", "c", "d", "e", "f", "g", "h", "i"]
|
words = ["a", "b", "c", "d", "e", "f", "g", "h", "i"]
|
||||||
heads = [0, 0, 3, 0, 0, 0, 5, 0, 0]
|
heads = [0, 0, 3, 0, 0, 0, 5, 0, 0]
|
||||||
ents = [("ent-de", 3, 5), ("ent-fg", 5, 7)]
|
ents = ["O"] * len(words)
|
||||||
|
ents[3] = "B-ent-de"
|
||||||
|
ents[4] = "I-ent-de"
|
||||||
|
ents[5] = "B-ent-fg"
|
||||||
|
ents[6] = "I-ent-fg"
|
||||||
deps = ["dep"] * len(words)
|
deps = ["dep"] * len(words)
|
||||||
en_vocab.strings.add("ent-de")
|
en_vocab.strings.add("ent-de")
|
||||||
en_vocab.strings.add("ent-fg")
|
en_vocab.strings.add("ent-fg")
|
||||||
|
@ -292,7 +302,11 @@ def test_doc_retokenize_spans_entity_merge_iob(en_vocab):
|
||||||
# check that B is preserved if span[start] is B
|
# check that B is preserved if span[start] is B
|
||||||
words = ["a", "b", "c", "d", "e", "f", "g", "h", "i"]
|
words = ["a", "b", "c", "d", "e", "f", "g", "h", "i"]
|
||||||
heads = [0, 0, 3, 4, 0, 0, 5, 0, 0]
|
heads = [0, 0, 3, 4, 0, 0, 5, 0, 0]
|
||||||
ents = [("ent-de", 3, 5), ("ent-de", 5, 7)]
|
ents = ["O"] * len(words)
|
||||||
|
ents[3] = "B-ent-de"
|
||||||
|
ents[4] = "I-ent-de"
|
||||||
|
ents[5] = "B-ent-de"
|
||||||
|
ents[6] = "I-ent-de"
|
||||||
deps = ["dep"] * len(words)
|
deps = ["dep"] * len(words)
|
||||||
doc = Doc(en_vocab, words=words, heads=heads, deps=deps, ents=ents)
|
doc = Doc(en_vocab, words=words, heads=heads, deps=deps, ents=ents)
|
||||||
with doc.retokenize() as retokenizer:
|
with doc.retokenize() as retokenizer:
|
||||||
|
|
|
@ -27,11 +27,11 @@ def test_doc_retokenize_split(en_vocab):
|
||||||
assert doc[0].text == "Los"
|
assert doc[0].text == "Los"
|
||||||
assert doc[0].head.text == "Angeles"
|
assert doc[0].head.text == "Angeles"
|
||||||
assert doc[0].idx == 0
|
assert doc[0].idx == 0
|
||||||
assert doc[0].morph_ == "Number=Sing"
|
assert str(doc[0].morph) == "Number=Sing"
|
||||||
assert doc[1].idx == 3
|
assert doc[1].idx == 3
|
||||||
assert doc[1].text == "Angeles"
|
assert doc[1].text == "Angeles"
|
||||||
assert doc[1].head.text == "start"
|
assert doc[1].head.text == "start"
|
||||||
assert doc[1].morph_ == "Number=Sing"
|
assert str(doc[1].morph) == "Number=Sing"
|
||||||
assert doc[2].text == "start"
|
assert doc[2].text == "start"
|
||||||
assert doc[2].head.text == "."
|
assert doc[2].head.text == "."
|
||||||
assert doc[3].text == "."
|
assert doc[3].text == "."
|
||||||
|
|
|
@ -9,7 +9,7 @@ def doc(en_vocab):
|
||||||
tags = ["VBP", "NN", "NN"]
|
tags = ["VBP", "NN", "NN"]
|
||||||
heads = [0, 0, 0]
|
heads = [0, 0, 0]
|
||||||
deps = ["ROOT", "dobj", "dobj"]
|
deps = ["ROOT", "dobj", "dobj"]
|
||||||
ents = [("ORG", 1, 2)]
|
ents = ["O", "B-ORG", "O"]
|
||||||
return Doc(
|
return Doc(
|
||||||
en_vocab, words=words, pos=pos, tags=tags, heads=heads, deps=deps, ents=ents
|
en_vocab, words=words, pos=pos, tags=tags, heads=heads, deps=deps, ents=ents
|
||||||
)
|
)
|
||||||
|
|
|
@ -2,8 +2,7 @@ import pytest
|
||||||
|
|
||||||
|
|
||||||
def test_noun_chunks_is_parsed_de(de_tokenizer):
|
def test_noun_chunks_is_parsed_de(de_tokenizer):
|
||||||
"""Test that noun_chunks raises Value Error for 'de' language if Doc is not parsed.
|
"""Test that noun_chunks raises Value Error for 'de' language if Doc is not parsed."""
|
||||||
"""
|
|
||||||
doc = de_tokenizer("Er lag auf seinem")
|
doc = de_tokenizer("Er lag auf seinem")
|
||||||
with pytest.raises(ValueError):
|
with pytest.raises(ValueError):
|
||||||
list(doc.noun_chunks)
|
list(doc.noun_chunks)
|
||||||
|
|
|
@ -2,8 +2,7 @@ import pytest
|
||||||
|
|
||||||
|
|
||||||
def test_noun_chunks_is_parsed_el(el_tokenizer):
|
def test_noun_chunks_is_parsed_el(el_tokenizer):
|
||||||
"""Test that noun_chunks raises Value Error for 'el' language if Doc is not parsed.
|
"""Test that noun_chunks raises Value Error for 'el' language if Doc is not parsed."""
|
||||||
"""
|
|
||||||
doc = el_tokenizer("είναι χώρα της νοτιοανατολικής")
|
doc = el_tokenizer("είναι χώρα της νοτιοανατολικής")
|
||||||
with pytest.raises(ValueError):
|
with pytest.raises(ValueError):
|
||||||
list(doc.noun_chunks)
|
list(doc.noun_chunks)
|
||||||
|
|
|
@ -7,8 +7,7 @@ import pytest
|
||||||
|
|
||||||
|
|
||||||
def test_noun_chunks_is_parsed(en_tokenizer):
|
def test_noun_chunks_is_parsed(en_tokenizer):
|
||||||
"""Test that noun_chunks raises Value Error for 'en' language if Doc is not parsed.
|
"""Test that noun_chunks raises Value Error for 'en' language if Doc is not parsed."""
|
||||||
"""
|
|
||||||
doc = en_tokenizer("This is a sentence")
|
doc = en_tokenizer("This is a sentence")
|
||||||
with pytest.raises(ValueError):
|
with pytest.raises(ValueError):
|
||||||
list(doc.noun_chunks)
|
list(doc.noun_chunks)
|
||||||
|
|
|
@ -2,8 +2,7 @@ import pytest
|
||||||
|
|
||||||
|
|
||||||
def test_noun_chunks_is_parsed_es(es_tokenizer):
|
def test_noun_chunks_is_parsed_es(es_tokenizer):
|
||||||
"""Test that noun_chunks raises Value Error for 'es' language if Doc is not parsed.
|
"""Test that noun_chunks raises Value Error for 'es' language if Doc is not parsed."""
|
||||||
"""
|
|
||||||
doc = es_tokenizer("en Oxford este verano")
|
doc = es_tokenizer("en Oxford este verano")
|
||||||
with pytest.raises(ValueError):
|
with pytest.raises(ValueError):
|
||||||
list(doc.noun_chunks)
|
list(doc.noun_chunks)
|
||||||
|
|
|
@ -2,8 +2,7 @@ import pytest
|
||||||
|
|
||||||
|
|
||||||
def test_noun_chunks_is_parsed_fa(fa_tokenizer):
|
def test_noun_chunks_is_parsed_fa(fa_tokenizer):
|
||||||
"""Test that noun_chunks raises Value Error for 'fa' language if Doc is not parsed.
|
"""Test that noun_chunks raises Value Error for 'fa' language if Doc is not parsed."""
|
||||||
"""
|
|
||||||
|
|
||||||
doc = fa_tokenizer("این یک جمله نمونه می باشد.")
|
doc = fa_tokenizer("این یک جمله نمونه می باشد.")
|
||||||
with pytest.raises(ValueError):
|
with pytest.raises(ValueError):
|
||||||
|
|
|
@ -36,9 +36,7 @@ def test_fr_tokenizer_infix_exceptions(fr_tokenizer, text):
|
||||||
assert len(tokens) == 1
|
assert len(tokens) == 1
|
||||||
|
|
||||||
|
|
||||||
@pytest.mark.parametrize(
|
@pytest.mark.parametrize("text", ["janv.", "juill.", "Dr.", "av.", "sept."])
|
||||||
"text", ["janv.", "juill.", "Dr.", "av.", "sept."],
|
|
||||||
)
|
|
||||||
def test_fr_tokenizer_handles_abbr(fr_tokenizer, text):
|
def test_fr_tokenizer_handles_abbr(fr_tokenizer, text):
|
||||||
tokens = fr_tokenizer(text)
|
tokens = fr_tokenizer(text)
|
||||||
assert len(tokens) == 1
|
assert len(tokens) == 1
|
||||||
|
|
|
@ -2,8 +2,7 @@ import pytest
|
||||||
|
|
||||||
|
|
||||||
def test_noun_chunks_is_parsed_fr(fr_tokenizer):
|
def test_noun_chunks_is_parsed_fr(fr_tokenizer):
|
||||||
"""Test that noun_chunks raises Value Error for 'fr' language if Doc is not parsed.
|
"""Test that noun_chunks raises Value Error for 'fr' language if Doc is not parsed."""
|
||||||
"""
|
|
||||||
doc = fr_tokenizer("trouver des travaux antérieurs")
|
doc = fr_tokenizer("trouver des travaux antérieurs")
|
||||||
with pytest.raises(ValueError):
|
with pytest.raises(ValueError):
|
||||||
list(doc.noun_chunks)
|
list(doc.noun_chunks)
|
||||||
|
|
|
@ -2,8 +2,7 @@ import pytest
|
||||||
|
|
||||||
|
|
||||||
def test_noun_chunks_is_parsed_id(id_tokenizer):
|
def test_noun_chunks_is_parsed_id(id_tokenizer):
|
||||||
"""Test that noun_chunks raises Value Error for 'id' language if Doc is not parsed.
|
"""Test that noun_chunks raises Value Error for 'id' language if Doc is not parsed."""
|
||||||
"""
|
|
||||||
doc = id_tokenizer("sebelas")
|
doc = id_tokenizer("sebelas")
|
||||||
with pytest.raises(ValueError):
|
with pytest.raises(ValueError):
|
||||||
list(doc.noun_chunks)
|
list(doc.noun_chunks)
|
||||||
|
|
|
@ -112,7 +112,7 @@ def test_ja_tokenizer_split_modes(ja_tokenizer, text, len_a, len_b, len_c):
|
||||||
|
|
||||||
|
|
||||||
@pytest.mark.parametrize(
|
@pytest.mark.parametrize(
|
||||||
"text,sub_tokens_list_a,sub_tokens_list_b,sub_tokens_list_c", SUB_TOKEN_TESTS,
|
"text,sub_tokens_list_a,sub_tokens_list_b,sub_tokens_list_c", SUB_TOKEN_TESTS
|
||||||
)
|
)
|
||||||
def test_ja_tokenizer_sub_tokens(
|
def test_ja_tokenizer_sub_tokens(
|
||||||
ja_tokenizer, text, sub_tokens_list_a, sub_tokens_list_b, sub_tokens_list_c
|
ja_tokenizer, text, sub_tokens_list_a, sub_tokens_list_b, sub_tokens_list_c
|
||||||
|
|
|
@ -2,8 +2,7 @@ import pytest
|
||||||
|
|
||||||
|
|
||||||
def test_noun_chunks_is_parsed_nb(nb_tokenizer):
|
def test_noun_chunks_is_parsed_nb(nb_tokenizer):
|
||||||
"""Test that noun_chunks raises Value Error for 'nb' language if Doc is not parsed.
|
"""Test that noun_chunks raises Value Error for 'nb' language if Doc is not parsed."""
|
||||||
"""
|
|
||||||
doc = nb_tokenizer("Smørsausen brukes bl.a. til")
|
doc = nb_tokenizer("Smørsausen brukes bl.a. til")
|
||||||
with pytest.raises(ValueError):
|
with pytest.raises(ValueError):
|
||||||
list(doc.noun_chunks)
|
list(doc.noun_chunks)
|
||||||
|
|
|
@ -8,7 +8,7 @@ def test_ne_tokenizer_handlers_long_text(ne_tokenizer):
|
||||||
|
|
||||||
|
|
||||||
@pytest.mark.parametrize(
|
@pytest.mark.parametrize(
|
||||||
"text,length", [("समय जान कति पनि बेर लाग्दैन ।", 7), ("म ठूलो हुँदै थिएँ ।", 5)],
|
"text,length", [("समय जान कति पनि बेर लाग्दैन ।", 7), ("म ठूलो हुँदै थिएँ ।", 5)]
|
||||||
)
|
)
|
||||||
def test_ne_tokenizer_handles_cnts(ne_tokenizer, text, length):
|
def test_ne_tokenizer_handles_cnts(ne_tokenizer, text, length):
|
||||||
tokens = ne_tokenizer(text)
|
tokens = ne_tokenizer(text)
|
||||||
|
|
|
@ -10,7 +10,7 @@ def test_sa_tokenizer_handles_long_text(sa_tokenizer):
|
||||||
@pytest.mark.parametrize(
|
@pytest.mark.parametrize(
|
||||||
"text,length",
|
"text,length",
|
||||||
[
|
[
|
||||||
("श्री भगवानुवाच पश्य मे पार्थ रूपाणि शतशोऽथ सहस्रशः।", 9,),
|
("श्री भगवानुवाच पश्य मे पार्थ रूपाणि शतशोऽथ सहस्रशः।", 9),
|
||||||
("गुणान् सर्वान् स्वभावो मूर्ध्नि वर्तते ।", 6),
|
("गुणान् सर्वान् स्वभावो मूर्ध्नि वर्तते ।", 6),
|
||||||
],
|
],
|
||||||
)
|
)
|
||||||
|
|
|
@ -3,8 +3,7 @@ from spacy.tokens import Doc
|
||||||
|
|
||||||
|
|
||||||
def test_noun_chunks_is_parsed_sv(sv_tokenizer):
|
def test_noun_chunks_is_parsed_sv(sv_tokenizer):
|
||||||
"""Test that noun_chunks raises Value Error for 'sv' language if Doc is not parsed.
|
"""Test that noun_chunks raises Value Error for 'sv' language if Doc is not parsed."""
|
||||||
"""
|
|
||||||
doc = sv_tokenizer("Studenten läste den bästa boken")
|
doc = sv_tokenizer("Studenten läste den bästa boken")
|
||||||
with pytest.raises(ValueError):
|
with pytest.raises(ValueError):
|
||||||
list(doc.noun_chunks)
|
list(doc.noun_chunks)
|
||||||
|
|
|
@ -17,16 +17,31 @@ def test_lemmatizer_initialize(lang, capfd):
|
||||||
@registry.misc("lemmatizer_init_lookups")
|
@registry.misc("lemmatizer_init_lookups")
|
||||||
def lemmatizer_init_lookups():
|
def lemmatizer_init_lookups():
|
||||||
lookups = Lookups()
|
lookups = Lookups()
|
||||||
lookups.add_table("lemma_lookup", {"cope": "cope"})
|
lookups.add_table("lemma_lookup", {"cope": "cope", "x": "y"})
|
||||||
lookups.add_table("lemma_index", {"verb": ("cope", "cop")})
|
lookups.add_table("lemma_index", {"verb": ("cope", "cop")})
|
||||||
lookups.add_table("lemma_exc", {"verb": {"coping": ("cope",)}})
|
lookups.add_table("lemma_exc", {"verb": {"coping": ("cope",)}})
|
||||||
lookups.add_table("lemma_rules", {"verb": [["ing", ""]]})
|
lookups.add_table("lemma_rules", {"verb": [["ing", ""]]})
|
||||||
return lookups
|
return lookups
|
||||||
|
|
||||||
"""Test that languages can be initialized."""
|
# Test that languages can be initialized
|
||||||
nlp = get_lang_class(lang)()
|
nlp = get_lang_class(lang)()
|
||||||
nlp.add_pipe("lemmatizer", config={"lookups": {"@misc": "lemmatizer_init_lookups"}})
|
lemmatizer = nlp.add_pipe("lemmatizer", config={"mode": "lookup"})
|
||||||
|
assert not lemmatizer.lookups.tables
|
||||||
|
nlp.config["initialize"]["components"]["lemmatizer"] = {
|
||||||
|
"lookups": {"@misc": "lemmatizer_init_lookups"}
|
||||||
|
}
|
||||||
|
with pytest.raises(ValueError):
|
||||||
|
nlp("x")
|
||||||
|
nlp.initialize()
|
||||||
|
assert lemmatizer.lookups.tables
|
||||||
|
doc = nlp("x")
|
||||||
# Check for stray print statements (see #3342)
|
# Check for stray print statements (see #3342)
|
||||||
doc = nlp("test") # noqa: F841
|
|
||||||
captured = capfd.readouterr()
|
captured = capfd.readouterr()
|
||||||
assert not captured.out
|
assert not captured.out
|
||||||
|
assert doc[0].lemma_ == "y"
|
||||||
|
|
||||||
|
# Test initialization by calling .initialize() directly
|
||||||
|
nlp = get_lang_class(lang)()
|
||||||
|
lemmatizer = nlp.add_pipe("lemmatizer", config={"mode": "lookup"})
|
||||||
|
lemmatizer.initialize(lookups=lemmatizer_init_lookups())
|
||||||
|
assert nlp("x")[0].lemma_ == "y"
|
||||||
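The lemmatizer test now supplies lookups either through the [initialize] block of the config or by calling Lemmatizer.initialize directly. A hedged standalone version of the direct call with a toy lookup table:

from spacy.lang.en import English
from spacy.lookups import Lookups

nlp = English()
lemmatizer = nlp.add_pipe("lemmatizer", config={"mode": "lookup"})
lookups = Lookups()
lookups.add_table("lemma_lookup", {"running": "run"})  # toy table
lemmatizer.initialize(lookups=lookups)
print(nlp("running")[0].lemma_)  # run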
|
|
|
@ -27,9 +27,18 @@ def test_zh_tokenizer_serialize_jieba(zh_tokenizer_jieba):
|
||||||
|
|
||||||
@pytest.mark.slow
|
@pytest.mark.slow
|
||||||
def test_zh_tokenizer_serialize_pkuseg_with_processors(zh_tokenizer_pkuseg):
|
def test_zh_tokenizer_serialize_pkuseg_with_processors(zh_tokenizer_pkuseg):
|
||||||
nlp = Chinese(
|
config = {
|
||||||
meta={
|
"nlp": {
|
||||||
"tokenizer": {"config": {"segmenter": "pkuseg", "pkuseg_model": "medicine"}}
|
"tokenizer": {
|
||||||
}
|
"@tokenizers": "spacy.zh.ChineseTokenizer",
|
||||||
)
|
"segmenter": "pkuseg",
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"initialize": {"tokenizer": {
|
||||||
|
"pkuseg_model": "medicine",
|
||||||
|
}
|
||||||
|
},
|
||||||
|
}
|
||||||
|
nlp = Chinese.from_config(config)
|
||||||
|
nlp.initialize()
|
||||||
zh_tokenizer_serialize(nlp.tokenizer)
|
zh_tokenizer_serialize(nlp.tokenizer)
|
||||||
|
|
|
@ -236,13 +236,13 @@ def test_matcher_subset_value_operator(en_vocab):
|
||||||
matcher.add("M", [pattern])
|
matcher.add("M", [pattern])
|
||||||
doc = Doc(en_vocab, words=["a", "b", "c"])
|
doc = Doc(en_vocab, words=["a", "b", "c"])
|
||||||
assert len(matcher(doc)) == 3
|
assert len(matcher(doc)) == 3
|
||||||
doc[0].morph_ = "Feat=Val"
|
doc[0].set_morph("Feat=Val")
|
||||||
assert len(matcher(doc)) == 3
|
assert len(matcher(doc)) == 3
|
||||||
doc[0].morph_ = "Feat=Val|Feat2=Val2"
|
doc[0].set_morph("Feat=Val|Feat2=Val2")
|
||||||
assert len(matcher(doc)) == 3
|
assert len(matcher(doc)) == 3
|
||||||
doc[0].morph_ = "Feat=Val|Feat2=Val2|Feat3=Val3"
|
doc[0].set_morph("Feat=Val|Feat2=Val2|Feat3=Val3")
|
||||||
assert len(matcher(doc)) == 2
|
assert len(matcher(doc)) == 2
|
||||||
doc[0].morph_ = "Feat=Val|Feat2=Val2|Feat3=Val3|Feat4=Val4"
|
doc[0].set_morph("Feat=Val|Feat2=Val2|Feat3=Val3|Feat4=Val4")
|
||||||
assert len(matcher(doc)) == 2
|
assert len(matcher(doc)) == 2
|
||||||
|
|
||||||
# IS_SUBSET acts like "IN" for attrs other than MORPH
|
# IS_SUBSET acts like "IN" for attrs other than MORPH
|
||||||
|
@ -268,11 +268,11 @@ def test_matcher_superset_value_operator(en_vocab):
|
||||||
matcher.add("M", [pattern])
|
matcher.add("M", [pattern])
|
||||||
doc = Doc(en_vocab, words=["a", "b", "c"])
|
doc = Doc(en_vocab, words=["a", "b", "c"])
|
||||||
assert len(matcher(doc)) == 0
|
assert len(matcher(doc)) == 0
|
||||||
doc[0].morph_ = "Feat=Val|Feat2=Val2"
|
doc[0].set_morph("Feat=Val|Feat2=Val2")
|
||||||
assert len(matcher(doc)) == 0
|
assert len(matcher(doc)) == 0
|
||||||
doc[0].morph_ = "Feat=Val|Feat2=Val2|Feat3=Val3"
|
doc[0].set_morph("Feat=Val|Feat2=Val2|Feat3=Val3")
|
||||||
assert len(matcher(doc)) == 1
|
assert len(matcher(doc)) == 1
|
||||||
doc[0].morph_ = "Feat=Val|Feat2=Val2|Feat3=Val3|Feat4=Val4"
|
doc[0].set_morph("Feat=Val|Feat2=Val2|Feat3=Val3|Feat4=Val4")
|
||||||
assert len(matcher(doc)) == 1
|
assert len(matcher(doc)) == 1
|
||||||
|
|
||||||
# IS_SUPERSET with more than one value only matches for MORPH
|
# IS_SUPERSET with more than one value only matches for MORPH
|
||||||
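These matcher tests switch to set_morph but keep the IS_SUPERSET semantics: a token matches once its morph features include every value in the pattern. A minimal sketch with illustrative feature names:

import spacy
from spacy.matcher import Matcher

nlp = spacy.blank("en")
matcher = Matcher(nlp.vocab)
pattern = [{"MORPH": {"IS_SUPERSET": ["Feat=Val", "Feat2=Val2", "Feat3=Val3"]}}]
matcher.add("M", [pattern])
doc = nlp("a b c")
print(len(matcher(doc)))  # 0: no token carries the required features yet
doc[0].set_morph("Feat=Val|Feat2=Val2|Feat3=Val3|Feat4=Val4")
print(len(matcher(doc)))  # 1: token 0 now has a superset of the pattern's features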
|
@ -310,9 +310,9 @@ def test_matcher_morph_handling(en_vocab):
|
||||||
doc = Doc(en_vocab, words=["a", "b", "c"])
|
doc = Doc(en_vocab, words=["a", "b", "c"])
|
||||||
assert len(matcher(doc)) == 0
|
assert len(matcher(doc)) == 0
|
||||||
|
|
||||||
doc[0].morph_ = "Feat2=Val2|Feat1=Val1"
|
doc[0].set_morph("Feat2=Val2|Feat1=Val1")
|
||||||
assert len(matcher(doc)) == 2
|
assert len(matcher(doc)) == 2
|
||||||
doc[0].morph_ = "Feat1=Val1|Feat2=Val2"
|
doc[0].set_morph("Feat1=Val1|Feat2=Val2")
|
||||||
assert len(matcher(doc)) == 2
|
assert len(matcher(doc)) == 2
|
||||||
|
|
||||||
# multiple values are split
|
# multiple values are split
|
||||||
|
@ -324,9 +324,9 @@ def test_matcher_morph_handling(en_vocab):
|
||||||
doc = Doc(en_vocab, words=["a", "b", "c"])
|
doc = Doc(en_vocab, words=["a", "b", "c"])
|
||||||
assert len(matcher(doc)) == 0
|
assert len(matcher(doc)) == 0
|
||||||
|
|
||||||
doc[0].morph_ = "Feat2=Val2,Val3|Feat1=Val1"
|
doc[0].set_morph("Feat2=Val2,Val3|Feat1=Val1")
|
||||||
assert len(matcher(doc)) == 1
|
assert len(matcher(doc)) == 1
|
||||||
doc[0].morph_ = "Feat1=Val1,Val3|Feat2=Val2"
|
doc[0].set_morph("Feat1=Val1,Val3|Feat2=Val2")
|
||||||
assert len(matcher(doc)) == 2
|
assert len(matcher(doc)) == 2
|
||||||
|
|
||||||
|
|
||||||
|
@ -405,7 +405,7 @@ def test_attr_pipeline_checks(en_vocab):
|
||||||
doc2 = Doc(en_vocab, words=["Test"])
|
doc2 = Doc(en_vocab, words=["Test"])
|
||||||
doc2[0].tag_ = "TAG"
|
doc2[0].tag_ = "TAG"
|
||||||
doc2[0].pos_ = "X"
|
doc2[0].pos_ = "X"
|
||||||
doc2[0].morph_ = "Feat=Val"
|
doc2[0].set_morph("Feat=Val")
|
||||||
doc2[0].lemma_ = "LEMMA"
|
doc2[0].lemma_ = "LEMMA"
|
||||||
doc3 = Doc(en_vocab, words=["Test"])
|
doc3 = Doc(en_vocab, words=["Test"])
|
||||||
# DEP requires DEP
|
# DEP requires DEP
|
||||||
|
|
|
@ -190,7 +190,7 @@ def test_phrase_matcher_validation(en_vocab):
|
||||||
doc2 = Doc(en_vocab, words=["Test"])
|
doc2 = Doc(en_vocab, words=["Test"])
|
||||||
doc2[0].tag_ = "TAG"
|
doc2[0].tag_ = "TAG"
|
||||||
doc2[0].pos_ = "X"
|
doc2[0].pos_ = "X"
|
||||||
doc2[0].morph_ = "Feat=Val"
|
doc2[0].set_morph("Feat=Val")
|
||||||
doc3 = Doc(en_vocab, words=["Test"])
|
doc3 = Doc(en_vocab, words=["Test"])
|
||||||
matcher = PhraseMatcher(en_vocab, validate=True)
|
matcher = PhraseMatcher(en_vocab, validate=True)
|
||||||
with pytest.warns(UserWarning):
|
with pytest.warns(UserWarning):
|
||||||
|
@ -217,7 +217,7 @@ def test_attr_pipeline_checks(en_vocab):
|
||||||
doc2 = Doc(en_vocab, words=["Test"])
|
doc2 = Doc(en_vocab, words=["Test"])
|
||||||
doc2[0].tag_ = "TAG"
|
doc2[0].tag_ = "TAG"
|
||||||
doc2[0].pos_ = "X"
|
doc2[0].pos_ = "X"
|
||||||
doc2[0].morph_ = "Feat=Val"
|
doc2[0].set_morph("Feat=Val")
|
||||||
doc2[0].lemma_ = "LEMMA"
|
doc2[0].lemma_ = "LEMMA"
|
||||||
doc3 = Doc(en_vocab, words=["Test"])
|
doc3 = Doc(en_vocab, words=["Test"])
|
||||||
# DEP requires DEP
|
# DEP requires DEP
|
||||||
|
|
|
@ -35,7 +35,7 @@ def test_init_parser(parser):
|
||||||
def _train_parser(parser):
|
def _train_parser(parser):
|
||||||
fix_random_seed(1)
|
fix_random_seed(1)
|
||||||
parser.add_label("left")
|
parser.add_label("left")
|
||||||
parser.begin_training(lambda: [_parser_example(parser)], **parser.cfg)
|
parser.initialize(lambda: [_parser_example(parser)])
|
||||||
sgd = Adam(0.001)
|
sgd = Adam(0.001)
|
||||||
|
|
||||||
for i in range(5):
|
for i in range(5):
|
||||||
|
@ -87,7 +87,7 @@ def test_add_label_deserializes_correctly():
|
||||||
ner1.add_label("C")
|
ner1.add_label("C")
|
||||||
ner1.add_label("B")
|
ner1.add_label("B")
|
||||||
ner1.add_label("A")
|
ner1.add_label("A")
|
||||||
ner1.begin_training(lambda: [_ner_example(ner1)])
|
ner1.initialize(lambda: [_ner_example(ner1)])
|
||||||
ner2 = EntityRecognizer(Vocab(), model, **config)
|
ner2 = EntityRecognizer(Vocab(), model, **config)
|
||||||
|
|
||||||
# the second model needs to be resized before we can call from_bytes
|
# the second model needs to be resized before we can call from_bytes
|
||||||
|
|
|
@ -202,7 +202,7 @@ def test_train_empty():
|
||||||
train_examples.append(Example.from_dict(nlp.make_doc(t[0]), t[1]))
|
train_examples.append(Example.from_dict(nlp.make_doc(t[0]), t[1]))
|
||||||
ner = nlp.add_pipe("ner", last=True)
|
ner = nlp.add_pipe("ner", last=True)
|
||||||
ner.add_label("PERSON")
|
ner.add_label("PERSON")
|
||||||
nlp.begin_training()
|
nlp.initialize()
|
||||||
for itn in range(2):
|
for itn in range(2):
|
||||||
losses = {}
|
losses = {}
|
||||||
batches = util.minibatch(train_examples, size=8)
|
batches = util.minibatch(train_examples, size=8)
|
||||||
|
@ -213,7 +213,7 @@ def test_train_empty():
|
||||||
def test_overwrite_token():
|
def test_overwrite_token():
|
||||||
nlp = English()
|
nlp = English()
|
||||||
nlp.add_pipe("ner")
|
nlp.add_pipe("ner")
|
||||||
nlp.begin_training()
|
nlp.initialize()
|
||||||
# The untrained NER will predict O for each token
|
# The untrained NER will predict O for each token
|
||||||
doc = nlp("I live in New York")
|
doc = nlp("I live in New York")
|
||||||
assert [token.ent_iob_ for token in doc] == ["O", "O", "O", "O", "O"]
|
assert [token.ent_iob_ for token in doc] == ["O", "O", "O", "O", "O"]
|
||||||
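begin_training is renamed to initialize across these pipeline tests; for the simple no-argument case the call is otherwise unchanged. A minimal sketch on an untrained pipeline:

from spacy.lang.en import English

nlp = English()
ner = nlp.add_pipe("ner")
ner.add_label("PERSON")
nlp.initialize()  # formerly nlp.begin_training()
doc = nlp("I live in New York")
print([t.ent_iob_ for t in doc])  # ['O', 'O', 'O', 'O', 'O'] for an untrained NER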
|
@ -235,7 +235,7 @@ def test_empty_ner():
|
||||||
nlp = English()
|
nlp = English()
|
||||||
ner = nlp.add_pipe("ner")
|
ner = nlp.add_pipe("ner")
|
||||||
ner.add_label("MY_LABEL")
|
ner.add_label("MY_LABEL")
|
||||||
nlp.begin_training()
|
nlp.initialize()
|
||||||
doc = nlp("John is watching the news about Croatia's elections")
|
doc = nlp("John is watching the news about Croatia's elections")
|
||||||
# if this goes wrong, the initialization of the parser's upper layer is probably broken
|
# if this goes wrong, the initialization of the parser's upper layer is probably broken
|
||||||
result = ["O", "O", "O", "O", "O", "O", "O", "O", "O"]
|
result = ["O", "O", "O", "O", "O", "O", "O", "O", "O"]
|
||||||
|
@ -254,7 +254,7 @@ def test_ruler_before_ner():
|
||||||
# 2: untrained NER - should set everything else to O
|
# 2: untrained NER - should set everything else to O
|
||||||
untrained_ner = nlp.add_pipe("ner")
|
untrained_ner = nlp.add_pipe("ner")
|
||||||
untrained_ner.add_label("MY_LABEL")
|
untrained_ner.add_label("MY_LABEL")
|
||||||
nlp.begin_training()
|
nlp.initialize()
|
||||||
doc = nlp("This is Antti Korhonen speaking in Finland")
|
doc = nlp("This is Antti Korhonen speaking in Finland")
|
||||||
expected_iobs = ["B", "O", "O", "O", "O", "O", "O"]
|
expected_iobs = ["B", "O", "O", "O", "O", "O", "O"]
|
||||||
expected_types = ["THING", "", "", "", "", "", ""]
|
expected_types = ["THING", "", "", "", "", "", ""]
|
||||||
|
@ -269,7 +269,7 @@ def test_ner_before_ruler():
|
||||||
# 1: untrained NER - should set everything to O
|
# 1: untrained NER - should set everything to O
|
||||||
untrained_ner = nlp.add_pipe("ner", name="uner")
|
untrained_ner = nlp.add_pipe("ner", name="uner")
|
||||||
untrained_ner.add_label("MY_LABEL")
|
untrained_ner.add_label("MY_LABEL")
|
||||||
nlp.begin_training()
|
nlp.initialize()
|
||||||
|
|
||||||
# 2 : Entity Ruler - should set "this" to B and keep everything else O
|
# 2 : Entity Ruler - should set "this" to B and keep everything else O
|
||||||
patterns = [{"label": "THING", "pattern": "This"}]
|
patterns = [{"label": "THING", "pattern": "This"}]
|
||||||
|
@ -290,7 +290,7 @@ def test_block_ner():
|
||||||
nlp.add_pipe("blocker", config={"start": 2, "end": 5})
|
nlp.add_pipe("blocker", config={"start": 2, "end": 5})
|
||||||
untrained_ner = nlp.add_pipe("ner")
|
untrained_ner = nlp.add_pipe("ner")
|
||||||
untrained_ner.add_label("MY_LABEL")
|
untrained_ner.add_label("MY_LABEL")
|
||||||
nlp.begin_training()
|
nlp.initialize()
|
||||||
doc = nlp("This is Antti L Korhonen speaking in Finland")
|
doc = nlp("This is Antti L Korhonen speaking in Finland")
|
||||||
expected_iobs = ["O", "O", "B", "B", "B", "O", "O", "O"]
|
expected_iobs = ["O", "O", "B", "B", "B", "O", "O", "O"]
|
||||||
expected_types = ["", "", "", "", "", "", "", ""]
|
expected_types = ["", "", "", "", "", "", "", ""]
|
||||||
|
@ -307,7 +307,7 @@ def test_overfitting_IO():
|
||||||
train_examples.append(Example.from_dict(nlp.make_doc(text), annotations))
|
train_examples.append(Example.from_dict(nlp.make_doc(text), annotations))
|
||||||
for ent in annotations.get("entities"):
|
for ent in annotations.get("entities"):
|
||||||
ner.add_label(ent[2])
|
ner.add_label(ent[2])
|
||||||
optimizer = nlp.begin_training()
|
optimizer = nlp.initialize()
|
||||||
|
|
||||||
for i in range(50):
|
for i in range(50):
|
||||||
losses = {}
|
losses = {}
|
||||||
|
@ -340,13 +340,13 @@ def test_ner_warns_no_lookups(caplog):
|
||||||
assert not len(nlp.vocab.lookups)
|
assert not len(nlp.vocab.lookups)
|
||||||
nlp.add_pipe("ner")
|
nlp.add_pipe("ner")
|
||||||
with caplog.at_level(logging.DEBUG):
|
with caplog.at_level(logging.DEBUG):
|
||||||
nlp.begin_training()
|
nlp.initialize()
|
||||||
assert "W033" in caplog.text
|
assert "W033" in caplog.text
|
||||||
caplog.clear()
|
caplog.clear()
|
||||||
nlp.vocab.lookups.add_table("lexeme_norm")
|
nlp.vocab.lookups.add_table("lexeme_norm")
|
||||||
nlp.vocab.lookups.get_table("lexeme_norm")["a"] = "A"
|
nlp.vocab.lookups.get_table("lexeme_norm")["a"] = "A"
|
||||||
with caplog.at_level(logging.DEBUG):
|
with caplog.at_level(logging.DEBUG):
|
||||||
nlp.begin_training()
|
nlp.initialize()
|
||||||
assert "W033" not in caplog.text
|
assert "W033" not in caplog.text
|
||||||
|
|
||||||
|
|
||||||
|
@ -358,5 +358,5 @@ class BlockerComponent1:
|
||||||
self.name = name
|
self.name = name
|
||||||
|
|
||||||
def __call__(self, doc):
|
def __call__(self, doc):
|
||||||
doc.set_ents([], blocked=[doc[self.start:self.end]], default="unmodified")
|
doc.set_ents([], blocked=[doc[self.start : self.end]], default="unmodified")
|
||||||
return doc
|
return doc
|
||||||
|
|
|
@ -191,7 +191,7 @@ def test_overfitting_IO():
|
||||||
train_examples.append(Example.from_dict(nlp.make_doc(text), annotations))
|
train_examples.append(Example.from_dict(nlp.make_doc(text), annotations))
|
||||||
for dep in annotations.get("deps", []):
|
for dep in annotations.get("deps", []):
|
||||||
parser.add_label(dep)
|
parser.add_label(dep)
|
||||||
optimizer = nlp.begin_training()
|
optimizer = nlp.initialize()
|
||||||
for i in range(100):
|
for i in range(100):
|
||||||
losses = {}
|
losses = {}
|
||||||
nlp.update(train_examples, sgd=optimizer, losses=losses)
|
nlp.update(train_examples, sgd=optimizer, losses=losses)
|
||||||
|
|
|
@ -34,7 +34,7 @@ def parser(vocab):
|
||||||
parser.cfg["hidden_width"] = 32
|
parser.cfg["hidden_width"] = 32
|
||||||
# parser.add_label('right')
|
# parser.add_label('right')
|
||||||
parser.add_label("left")
|
parser.add_label("left")
|
||||||
parser.begin_training(lambda: [_parser_example(parser)], **parser.cfg)
|
parser.initialize(lambda: [_parser_example(parser)])
|
||||||
sgd = Adam(0.001)
|
sgd = Adam(0.001)
|
||||||
|
|
||||||
for i in range(10):
|
for i in range(10):
|
||||||
|
|
|
@ -69,9 +69,9 @@ def test_attributeruler_init(nlp, pattern_dicts):
|
||||||
a.add(**p)
|
a.add(**p)
|
||||||
doc = nlp("This is a test.")
|
doc = nlp("This is a test.")
|
||||||
assert doc[2].lemma_ == "the"
|
assert doc[2].lemma_ == "the"
|
||||||
assert doc[2].morph_ == "Case=Nom|Number=Plur"
|
assert str(doc[2].morph) == "Case=Nom|Number=Plur"
|
||||||
assert doc[3].lemma_ == "cat"
|
assert doc[3].lemma_ == "cat"
|
||||||
assert doc[3].morph_ == "Case=Nom|Number=Sing"
|
assert str(doc[3].morph) == "Case=Nom|Number=Sing"
|
||||||
assert doc.has_annotation("LEMMA")
|
assert doc.has_annotation("LEMMA")
|
||||||
assert doc.has_annotation("MORPH")
|
assert doc.has_annotation("MORPH")
|
||||||
|
|
||||||
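The attribute_ruler assertions above now read morphology via str(token.morph). A hedged sketch of a single ruler pattern that sets LEMMA and MORPH, in the spirit of the fixture patterns; the values are illustrative:

import spacy

nlp = spacy.blank("en")
ruler = nlp.add_pipe("attribute_ruler")
ruler.add([[{"ORTH": "a"}]], {"LEMMA": "the", "MORPH": "Case=Nom|Number=Plur"})
doc = nlp("This is a test.")
print(doc[2].lemma_, str(doc[2].morph))  # the Case=Nom|Number=Plur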
|
@ -81,9 +81,9 @@ def test_attributeruler_init_patterns(nlp, pattern_dicts):
|
||||||
nlp.add_pipe("attribute_ruler", config={"pattern_dicts": pattern_dicts})
|
nlp.add_pipe("attribute_ruler", config={"pattern_dicts": pattern_dicts})
|
||||||
doc = nlp("This is a test.")
|
doc = nlp("This is a test.")
|
||||||
assert doc[2].lemma_ == "the"
|
assert doc[2].lemma_ == "the"
|
||||||
assert doc[2].morph_ == "Case=Nom|Number=Plur"
|
assert str(doc[2].morph) == "Case=Nom|Number=Plur"
|
||||||
assert doc[3].lemma_ == "cat"
|
assert doc[3].lemma_ == "cat"
|
||||||
assert doc[3].morph_ == "Case=Nom|Number=Sing"
|
assert str(doc[3].morph) == "Case=Nom|Number=Sing"
|
||||||
assert doc.has_annotation("LEMMA")
|
assert doc.has_annotation("LEMMA")
|
||||||
assert doc.has_annotation("MORPH")
|
assert doc.has_annotation("MORPH")
|
||||||
nlp.remove_pipe("attribute_ruler")
|
nlp.remove_pipe("attribute_ruler")
|
||||||
|
@ -94,9 +94,9 @@ def test_attributeruler_init_patterns(nlp, pattern_dicts):
|
||||||
)
|
)
|
||||||
doc = nlp("This is a test.")
|
doc = nlp("This is a test.")
|
||||||
assert doc[2].lemma_ == "the"
|
assert doc[2].lemma_ == "the"
|
||||||
assert doc[2].morph_ == "Case=Nom|Number=Plur"
|
assert str(doc[2].morph) == "Case=Nom|Number=Plur"
|
||||||
assert doc[3].lemma_ == "cat"
|
assert doc[3].lemma_ == "cat"
|
||||||
assert doc[3].morph_ == "Case=Nom|Number=Sing"
|
assert str(doc[3].morph) == "Case=Nom|Number=Sing"
|
||||||
assert doc.has_annotation("LEMMA")
|
assert doc.has_annotation("LEMMA")
|
||||||
assert doc.has_annotation("MORPH")
|
assert doc.has_annotation("MORPH")
|
||||||
|
|
||||||
|
@ -106,9 +106,9 @@ def test_attributeruler_score(nlp, pattern_dicts):
|
||||||
nlp.add_pipe("attribute_ruler", config={"pattern_dicts": pattern_dicts})
|
nlp.add_pipe("attribute_ruler", config={"pattern_dicts": pattern_dicts})
|
||||||
doc = nlp("This is a test.")
|
doc = nlp("This is a test.")
|
||||||
assert doc[2].lemma_ == "the"
|
assert doc[2].lemma_ == "the"
|
||||||
assert doc[2].morph_ == "Case=Nom|Number=Plur"
|
assert str(doc[2].morph) == "Case=Nom|Number=Plur"
|
||||||
assert doc[3].lemma_ == "cat"
|
assert doc[3].lemma_ == "cat"
|
||||||
assert doc[3].morph_ == "Case=Nom|Number=Sing"
|
assert str(doc[3].morph) == "Case=Nom|Number=Sing"
|
||||||
|
|
||||||
dev_examples = [
|
dev_examples = [
|
||||||
Example.from_dict(
|
Example.from_dict(
|
||||||
|
@ -150,10 +150,10 @@ def test_attributeruler_tag_map(nlp, tag_map):
|
||||||
for i in range(len(doc)):
|
for i in range(len(doc)):
|
||||||
if i == 4:
|
if i == 4:
|
||||||
assert doc[i].pos_ == "PUNCT"
|
assert doc[i].pos_ == "PUNCT"
|
||||||
assert doc[i].morph_ == "PunctType=peri"
|
assert str(doc[i].morph) == "PunctType=peri"
|
||||||
else:
|
else:
|
||||||
assert doc[i].pos_ == ""
|
assert doc[i].pos_ == ""
|
||||||
assert doc[i].morph_ == ""
|
assert str(doc[i].morph) == ""
|
||||||
|
|
||||||
|
|
||||||
def test_attributeruler_morph_rules(nlp, morph_rules):
|
def test_attributeruler_morph_rules(nlp, morph_rules):
|
||||||
|
@ -168,11 +168,11 @@ def test_attributeruler_morph_rules(nlp, morph_rules):
|
||||||
for i in range(len(doc)):
|
for i in range(len(doc)):
|
||||||
if i != 2:
|
if i != 2:
|
||||||
assert doc[i].pos_ == ""
|
assert doc[i].pos_ == ""
|
||||||
assert doc[i].morph_ == ""
|
assert str(doc[i].morph) == ""
|
||||||
else:
|
else:
|
||||||
assert doc[2].pos_ == "DET"
|
assert doc[2].pos_ == "DET"
|
||||||
assert doc[2].lemma_ == "a"
|
assert doc[2].lemma_ == "a"
|
||||||
assert doc[2].morph_ == "Case=Nom"
|
assert str(doc[2].morph) == "Case=Nom"
|
||||||
|
|
||||||
|
|
||||||
def test_attributeruler_indices(nlp):
|
def test_attributeruler_indices(nlp):
|
||||||
|
@ -194,14 +194,14 @@ def test_attributeruler_indices(nlp):
|
||||||
for i in range(len(doc)):
|
for i in range(len(doc)):
|
||||||
if i == 1:
|
if i == 1:
|
||||||
assert doc[i].lemma_ == "was"
|
assert doc[i].lemma_ == "was"
|
||||||
assert doc[i].morph_ == "Case=Nom|Number=Sing"
|
assert str(doc[i].morph) == "Case=Nom|Number=Sing"
|
||||||
elif i == 2:
|
elif i == 2:
|
||||||
assert doc[i].lemma_ == "the"
|
assert doc[i].lemma_ == "the"
|
||||||
assert doc[i].morph_ == "Case=Nom|Number=Plur"
|
assert str(doc[i].morph) == "Case=Nom|Number=Plur"
|
||||||
elif i == 3:
|
elif i == 3:
|
||||||
assert doc[i].lemma_ == "cat"
|
assert doc[i].lemma_ == "cat"
|
||||||
else:
|
else:
|
||||||
assert doc[i].morph_ == ""
|
assert str(doc[i].morph) == ""
|
||||||
# raises an error when trying to modify a token outside of the match
|
# raises an error when trying to modify a token outside of the match
|
||||||
a.add([[{"ORTH": "a"}, {"ORTH": "test"}]], {"LEMMA": "cat"}, index=2)
|
a.add([[{"ORTH": "a"}, {"ORTH": "test"}]], {"LEMMA": "cat"}, index=2)
|
||||||
with pytest.raises(ValueError):
|
with pytest.raises(ValueError):
|
||||||
|
|
|
@ -134,7 +134,7 @@ def test_kb_undefined(nlp):
|
||||||
"""Test that the EL can't train without defining a KB"""
|
"""Test that the EL can't train without defining a KB"""
|
||||||
entity_linker = nlp.add_pipe("entity_linker", config={})
|
entity_linker = nlp.add_pipe("entity_linker", config={})
|
||||||
with pytest.raises(ValueError):
|
with pytest.raises(ValueError):
|
||||||
entity_linker.begin_training(lambda: [])
|
entity_linker.initialize(lambda: [])
|
||||||
|
|
||||||
|
|
||||||
def test_kb_empty(nlp):
|
def test_kb_empty(nlp):
|
||||||
|
@ -143,7 +143,7 @@ def test_kb_empty(nlp):
|
||||||
entity_linker = nlp.add_pipe("entity_linker", config=config)
|
entity_linker = nlp.add_pipe("entity_linker", config=config)
|
||||||
assert len(entity_linker.kb) == 0
|
assert len(entity_linker.kb) == 0
|
||||||
with pytest.raises(ValueError):
|
with pytest.raises(ValueError):
|
||||||
entity_linker.begin_training(lambda: [])
|
entity_linker.initialize(lambda: [])
|
||||||
|
|
||||||
|
|
||||||
def test_kb_serialize(nlp):
|
def test_kb_serialize(nlp):
|
||||||
|
@ -254,14 +254,12 @@ def test_vocab_serialization(nlp):
|
||||||
mykb = KnowledgeBase(nlp.vocab, entity_vector_length=1)
|
mykb = KnowledgeBase(nlp.vocab, entity_vector_length=1)
|
||||||
|
|
||||||
# adding entities
|
# adding entities
|
||||||
q1_hash = mykb.add_entity(entity="Q1", freq=27, entity_vector=[1])
|
mykb.add_entity(entity="Q1", freq=27, entity_vector=[1])
|
||||||
q2_hash = mykb.add_entity(entity="Q2", freq=12, entity_vector=[2])
|
q2_hash = mykb.add_entity(entity="Q2", freq=12, entity_vector=[2])
|
||||||
q3_hash = mykb.add_entity(entity="Q3", freq=5, entity_vector=[3])
|
mykb.add_entity(entity="Q3", freq=5, entity_vector=[3])
|
||||||
|
|
||||||
# adding aliases
|
# adding aliases
|
||||||
douglas_hash = mykb.add_alias(
|
mykb.add_alias(alias="douglas", entities=["Q2", "Q3"], probabilities=[0.4, 0.1])
|
||||||
alias="douglas", entities=["Q2", "Q3"], probabilities=[0.4, 0.1]
|
|
||||||
)
|
|
||||||
adam_hash = mykb.add_alias(alias="adam", entities=["Q2"], probabilities=[0.9])
|
adam_hash = mykb.add_alias(alias="adam", entities=["Q2"], probabilities=[0.9])
|
||||||
|
|
||||||
candidates = mykb.get_alias_candidates("adam")
|
candidates = mykb.get_alias_candidates("adam")
|
||||||
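The KB test above drops unused hash variables but keeps the same add_entity/add_alias/get_alias_candidates flow. A tiny standalone sketch with toy data:

from spacy.kb import KnowledgeBase
from spacy.vocab import Vocab

kb = KnowledgeBase(Vocab(), entity_vector_length=1)
kb.add_entity(entity="Q2", freq=12, entity_vector=[2])
kb.add_alias(alias="adam", entities=["Q2"], probabilities=[0.9])
print([c.entity_ for c in kb.get_alias_candidates("adam")])  # ['Q2']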
|
@ -360,7 +358,7 @@ def test_preserving_links_asdoc(nlp):
|
||||||
ruler.add_patterns(patterns)
|
ruler.add_patterns(patterns)
|
||||||
el_config = {"kb_loader": {"@misc": "myLocationsKB.v1"}, "incl_prior": False}
|
el_config = {"kb_loader": {"@misc": "myLocationsKB.v1"}, "incl_prior": False}
|
||||||
entity_linker = nlp.add_pipe("entity_linker", config=el_config, last=True)
|
entity_linker = nlp.add_pipe("entity_linker", config=el_config, last=True)
|
||||||
nlp.begin_training()
|
nlp.initialize()
|
||||||
assert entity_linker.model.get_dim("nO") == vector_length
|
assert entity_linker.model.get_dim("nO") == vector_length
|
||||||
|
|
||||||
# test whether the entity links are preserved by the `as_doc()` function
|
# test whether the entity links are preserved by the `as_doc()` function
|
||||||
|
@ -463,7 +461,7 @@ def test_overfitting_IO():
|
||||||
)
|
)
|
||||||
|
|
||||||
# train the NEL pipe
|
# train the NEL pipe
|
||||||
optimizer = nlp.begin_training(get_examples=lambda: train_examples)
|
optimizer = nlp.initialize(get_examples=lambda: train_examples)
|
||||||
assert entity_linker.model.get_dim("nO") == vector_length
|
assert entity_linker.model.get_dim("nO") == vector_length
|
||||||
assert entity_linker.model.get_dim("nO") == entity_linker.kb.entity_vector_length
|
assert entity_linker.model.get_dim("nO") == entity_linker.kb.entity_vector_length
|
||||||
|
|
||||||
|
|
69
spacy/tests/pipeline/test_initialize.py
Normal file
|
@ -0,0 +1,69 @@
|
||||||
|
import pytest
|
||||||
|
from spacy.language import Language
|
||||||
|
from spacy.lang.en import English
|
||||||
|
from spacy.training import Example
|
||||||
|
from thinc.api import ConfigValidationError
|
||||||
|
from pydantic import StrictBool
|
||||||
|
|
||||||
|
|
||||||
|
def test_initialize_arguments():
|
||||||
|
name = "test_initialize_arguments"
|
||||||
|
|
||||||
|
class CustomTokenizer:
|
||||||
|
def __init__(self, tokenizer):
|
||||||
|
self.tokenizer = tokenizer
|
||||||
|
self.from_initialize = None
|
||||||
|
|
||||||
|
def __call__(self, text):
|
||||||
|
return self.tokenizer(text)
|
||||||
|
|
||||||
|
def initialize(self, get_examples, nlp, custom: int):
|
||||||
|
self.from_initialize = custom
|
||||||
|
|
||||||
|
class Component:
|
||||||
|
def __init__(self):
|
||||||
|
self.from_initialize = None
|
||||||
|
|
||||||
|
def initialize(
|
||||||
|
self, get_examples, nlp, custom1: str, custom2: StrictBool = False
|
||||||
|
):
|
||||||
|
self.from_initialize = (custom1, custom2)
|
||||||
|
|
||||||
|
Language.factory(name, func=lambda nlp, name: Component())
|
||||||
|
|
||||||
|
nlp = English()
|
||||||
|
nlp.tokenizer = CustomTokenizer(nlp.tokenizer)
|
||||||
|
example = Example.from_dict(nlp("x"), {})
|
||||||
|
get_examples = lambda: [example]
|
||||||
|
nlp.add_pipe(name)
|
||||||
|
# The settings here will typically come from the [initialize] block
|
||||||
|
init_cfg = {"tokenizer": {"custom": 1}, "components": {name: {}}}
|
||||||
|
nlp.config["initialize"].update(init_cfg)
|
||||||
|
with pytest.raises(ConfigValidationError) as e:
|
||||||
|
# Empty config for component, no required custom1 argument
|
||||||
|
nlp.initialize(get_examples)
|
||||||
|
errors = e.value.errors
|
||||||
|
assert len(errors) == 1
|
||||||
|
assert errors[0]["loc"] == ("custom1",)
|
||||||
|
assert errors[0]["type"] == "value_error.missing"
|
||||||
|
init_cfg = {
|
||||||
|
"tokenizer": {"custom": 1},
|
||||||
|
"components": {name: {"custom1": "x", "custom2": 1}},
|
||||||
|
}
|
||||||
|
nlp.config["initialize"].update(init_cfg)
|
||||||
|
with pytest.raises(ConfigValidationError) as e:
|
||||||
|
# Wrong type of custom 2
|
||||||
|
nlp.initialize(get_examples)
|
||||||
|
errors = e.value.errors
|
||||||
|
assert len(errors) == 1
|
||||||
|
assert errors[0]["loc"] == ("custom2",)
|
||||||
|
assert errors[0]["type"] == "value_error.strictbool"
|
||||||
|
init_cfg = {
|
||||||
|
"tokenizer": {"custom": 1},
|
||||||
|
"components": {name: {"custom1": "x"}},
|
||||||
|
}
|
||||||
|
nlp.config["initialize"].update(init_cfg)
|
||||||
|
nlp.initialize(get_examples)
|
||||||
|
assert nlp.tokenizer.from_initialize == 1
|
||||||
|
pipe = nlp.get_pipe(name)
|
||||||
|
assert pipe.from_initialize == ("x", False)
|
|
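The new test_initialize.py above pins down how per-component settings from the config's [initialize] block are forwarded as extra keyword arguments to each component's (and the tokenizer's) initialize method and validated against its signature. A minimal sketch of a user-defined component that relies on this, following the pattern in the test (the factory name "noop_component" and the "label_set" argument are assumptions for illustration, not part of the diff):

from spacy.lang.en import English
from spacy.language import Language


class NoopComponent:
    def __init__(self):
        self.label_set = None

    def __call__(self, doc):
        return doc

    def initialize(self, get_examples, nlp, label_set: str = "default"):
        # Arguments beyond get_examples/nlp are filled from the [initialize]
        # block (nlp.config["initialize"]["components"][<name>]) and type-checked.
        self.label_set = label_set


Language.factory("noop_component", func=lambda nlp, name: NoopComponent())

nlp = English()
nlp.add_pipe("noop_component")
nlp.config["initialize"].update(
    {"components": {"noop_component": {"label_set": "fine_grained"}}}
)
nlp.initialize()
assert nlp.get_pipe("noop_component").label_set == "fine_grained"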
@@ -8,61 +8,52 @@ from ..util import make_tempdir

 @pytest.fixture
 def nlp():
-    return English()
-
-
-@pytest.fixture
-def lemmatizer(nlp):
     @registry.misc("cope_lookups")
     def cope_lookups():
         lookups = Lookups()
-        lookups.add_table("lemma_lookup", {"cope": "cope"})
+        lookups.add_table("lemma_lookup", {"cope": "cope", "coped": "cope"})
         lookups.add_table("lemma_index", {"verb": ("cope", "cop")})
         lookups.add_table("lemma_exc", {"verb": {"coping": ("cope",)}})
         lookups.add_table("lemma_rules", {"verb": [["ing", ""]]})
         return lookups

-    lemmatizer = nlp.add_pipe(
-        "lemmatizer", config={"mode": "rule", "lookups": {"@misc": "cope_lookups"}}
-    )
-    return lemmatizer
+    nlp = English()
+    nlp.config["initialize"]["components"]["lemmatizer"] = {
+        "lookups": {"@misc": "cope_lookups"}
+    }
+    return nlp


 def test_lemmatizer_init(nlp):
-    @registry.misc("cope_lookups")
-    def cope_lookups():
-        lookups = Lookups()
-        lookups.add_table("lemma_lookup", {"cope": "cope"})
-        lookups.add_table("lemma_index", {"verb": ("cope", "cop")})
-        lookups.add_table("lemma_exc", {"verb": {"coping": ("cope",)}})
-        lookups.add_table("lemma_rules", {"verb": [["ing", ""]]})
-        return lookups
-
-    lemmatizer = nlp.add_pipe(
-        "lemmatizer", config={"mode": "lookup", "lookups": {"@misc": "cope_lookups"}}
-    )
+    lemmatizer = nlp.add_pipe("lemmatizer", config={"mode": "lookup"})
     assert isinstance(lemmatizer.lookups, Lookups)
+    assert not lemmatizer.lookups.tables
     assert lemmatizer.mode == "lookup"
+    with pytest.raises(ValueError):
+        nlp("test")
+    nlp.initialize()
+    assert lemmatizer.lookups.tables
+    assert nlp("cope")[0].lemma_ == "cope"
+    assert nlp("coped")[0].lemma_ == "cope"
     # replace any tables from spacy-lookups-data
     lemmatizer.lookups = Lookups()
-    doc = nlp("coping")
     # lookup with no tables sets text as lemma
-    assert doc[0].lemma_ == "coping"
+    assert nlp("cope")[0].lemma_ == "cope"
+    assert nlp("coped")[0].lemma_ == "coped"
     nlp.remove_pipe("lemmatizer")
-    @registry.misc("empty_lookups")
-    def empty_lookups():
-        return Lookups()
-
+    lemmatizer = nlp.add_pipe("lemmatizer", config={"mode": "lookup"})
     with pytest.raises(ValueError):
-        nlp.add_pipe(
-            "lemmatizer",
-            config={"mode": "lookup", "lookups": {"@misc": "empty_lookups"}},
-        )
+        # Can't initialize without required tables
+        lemmatizer.initialize(lookups=Lookups())
+    lookups = Lookups()
+    lookups.add_table("lemma_lookup", {})
+    lemmatizer.initialize(lookups=lookups)


-def test_lemmatizer_config(nlp, lemmatizer):
+def test_lemmatizer_config(nlp):
+    lemmatizer = nlp.add_pipe("lemmatizer", config={"mode": "rule"})
+    nlp.initialize()

     doc = nlp.make_doc("coping")
     doc[0].pos_ = "VERB"
     assert doc[0].lemma_ == ""
@@ -78,20 +69,21 @@ def test_lemmatizer_config(nlp, lemmatizer):
     assert doc[0].lemma_ == "cope"


-def test_lemmatizer_serialize(nlp, lemmatizer):
-    @registry.misc("cope_lookups")
+def test_lemmatizer_serialize(nlp):
+    lemmatizer = nlp.add_pipe("lemmatizer", config={"mode": "rule"})
+    nlp.initialize()
+
     def cope_lookups():
         lookups = Lookups()
-        lookups.add_table("lemma_lookup", {"cope": "cope"})
+        lookups.add_table("lemma_lookup", {"cope": "cope", "coped": "cope"})
         lookups.add_table("lemma_index", {"verb": ("cope", "cop")})
         lookups.add_table("lemma_exc", {"verb": {"coping": ("cope",)}})
         lookups.add_table("lemma_rules", {"verb": [["ing", ""]]})
         return lookups

     nlp2 = English()
-    lemmatizer2 = nlp2.add_pipe(
-        "lemmatizer", config={"mode": "rule", "lookups": {"@misc": "cope_lookups"}}
-    )
+    lemmatizer2 = nlp2.add_pipe("lemmatizer", config={"mode": "rule"})
+    lemmatizer2.initialize(lookups=cope_lookups())
     lemmatizer2.from_bytes(lemmatizer.to_bytes())
     assert lemmatizer.to_bytes() == lemmatizer2.to_bytes()
     assert lemmatizer.lookups.tables == lemmatizer2.lookups.tables
@@ -100,9 +92,9 @@ def test_lemmatizer_serialize(nlp, lemmatizer):
     with make_tempdir() as tmp_dir:
         nlp.to_disk(tmp_dir)
         nlp2 = util.load_model_from_path(tmp_dir)
         doc2 = nlp2.make_doc("coping")
         doc2[0].pos_ = "VERB"
         assert doc2[0].lemma_ == ""
         doc2 = lemmatizer(doc2)
         assert doc2[0].text == "coping"
         assert doc2[0].lemma_ == "cope"

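In the rewritten lemmatizer tests, lookup tables are no longer injected through the pipe config; they come either from the [initialize] block (as in the updated nlp fixture) or are passed directly to initialize(lookups=...). A brief sketch of the direct form, reusing the table names from the test (the words themselves are arbitrary example data):

from spacy.lang.en import English
from spacy.lookups import Lookups

nlp = English()
lemmatizer = nlp.add_pipe("lemmatizer", config={"mode": "rule"})

lookups = Lookups()
lookups.add_table("lemma_lookup", {"cope": "cope", "coped": "cope"})
lookups.add_table("lemma_index", {"verb": ("cope", "cop")})
lookups.add_table("lemma_exc", {"verb": {"coping": ("cope",)}})
lookups.add_table("lemma_rules", {"verb": [["ing", ""]]})

# Tables are handed over at initialization time instead of via the config
lemmatizer.initialize(lookups=lookups)

doc = nlp.make_doc("coping")
doc[0].pos_ = "VERB"  # the rule-based mode needs coarse POS tags
doc = lemmatizer(doc)
assert doc[0].lemma_ == "cope"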
@@ -33,7 +33,7 @@ def test_no_label():
     nlp = Language()
     nlp.add_pipe("morphologizer")
     with pytest.raises(ValueError):
-        nlp.begin_training()
+        nlp.initialize()


 def test_implicit_label():
@@ -42,7 +42,7 @@ def test_implicit_label():
     train_examples = []
     for t in TRAIN_DATA:
         train_examples.append(Example.from_dict(nlp.make_doc(t[0]), t[1]))
-    nlp.begin_training(get_examples=lambda: train_examples)
+    nlp.initialize(get_examples=lambda: train_examples)


 def test_no_resize():
@@ -50,13 +50,13 @@ def test_no_resize():
     morphologizer = nlp.add_pipe("morphologizer")
     morphologizer.add_label("POS" + Morphology.FIELD_SEP + "NOUN")
     morphologizer.add_label("POS" + Morphology.FIELD_SEP + "VERB")
-    nlp.begin_training()
+    nlp.initialize()
     # this throws an error because the morphologizer can't be resized after initialization
     with pytest.raises(ValueError):
         morphologizer.add_label("POS" + Morphology.FIELD_SEP + "ADJ")


-def test_begin_training_examples():
+def test_initialize_examples():
     nlp = Language()
     morphologizer = nlp.add_pipe("morphologizer")
     morphologizer.add_label("POS" + Morphology.FIELD_SEP + "NOUN")
@@ -64,12 +64,12 @@ def test_begin_training_examples():
     for t in TRAIN_DATA:
         train_examples.append(Example.from_dict(nlp.make_doc(t[0]), t[1]))
     # you shouldn't really call this more than once, but for testing it should be fine
-    nlp.begin_training()
-    nlp.begin_training(get_examples=lambda: train_examples)
-    with pytest.raises(TypeError):
-        nlp.begin_training(get_examples=lambda: None)
+    nlp.initialize()
+    nlp.initialize(get_examples=lambda: train_examples)
     with pytest.raises(ValueError):
-        nlp.begin_training(get_examples=train_examples)
+        nlp.initialize(get_examples=lambda: None)
+    with pytest.raises(ValueError):
+        nlp.initialize(get_examples=train_examples)


 def test_overfitting_IO():
@@ -79,7 +79,7 @@ def test_overfitting_IO():
     train_examples = []
     for inst in TRAIN_DATA:
         train_examples.append(Example.from_dict(nlp.make_doc(inst[0]), inst[1]))
-    optimizer = nlp.begin_training(get_examples=lambda: train_examples)
+    optimizer = nlp.initialize(get_examples=lambda: train_examples)

     for i in range(50):
         losses = {}
@@ -91,7 +91,7 @@ def test_overfitting_IO():
     doc = nlp(test_text)
     gold_morphs = ["Feat=N", "Feat=V", "", ""]
     gold_pos_tags = ["NOUN", "VERB", "ADJ", ""]
-    assert [t.morph_ for t in doc] == gold_morphs
+    assert [str(t.morph) for t in doc] == gold_morphs
     assert [t.pos_ for t in doc] == gold_pos_tags

     # Also test the results are still the same after IO
@@ -99,5 +99,5 @@ def test_overfitting_IO():
         nlp.to_disk(tmp_dir)
         nlp2 = util.load_model_from_path(tmp_dir)
         doc2 = nlp2(test_text)
-        assert [t.morph_ for t in doc2] == gold_morphs
+        assert [str(t.morph) for t in doc2] == gold_morphs
         assert [t.pos_ for t in doc2] == gold_pos_tags

@@ -31,19 +31,19 @@ TRAIN_DATA = [
 ]


-def test_begin_training_examples():
+def test_initialize_examples():
     nlp = Language()
     nlp.add_pipe("senter")
     train_examples = []
     for t in TRAIN_DATA:
         train_examples.append(Example.from_dict(nlp.make_doc(t[0]), t[1]))
     # you shouldn't really call this more than once, but for testing it should be fine
-    nlp.begin_training()
-    nlp.begin_training(get_examples=lambda: train_examples)
-    with pytest.raises(TypeError):
-        nlp.begin_training(get_examples=lambda: None)
+    nlp.initialize()
+    nlp.initialize(get_examples=lambda: train_examples)
     with pytest.raises(ValueError):
-        nlp.begin_training(get_examples=train_examples)
+        nlp.initialize(get_examples=lambda: None)
+    with pytest.raises(ValueError):
+        nlp.initialize(get_examples=train_examples)


 def test_overfitting_IO():
@@ -58,7 +58,7 @@ def test_overfitting_IO():
     train_examples[1].reference[11].is_sent_start = False

     nlp.add_pipe("senter")
-    optimizer = nlp.begin_training()
+    optimizer = nlp.initialize()

     for i in range(200):
         losses = {}

@@ -15,14 +15,14 @@ def test_label_types():
         tagger.add_label(9)


-def test_tagger_begin_training_tag_map():
-    """Test that Tagger.begin_training() without gold tuples does not clobber
+def test_tagger_initialize_tag_map():
+    """Test that Tagger.initialize() without gold tuples does not clobber
     the tag map."""
     nlp = Language()
     tagger = nlp.add_pipe("tagger")
     orig_tag_count = len(tagger.labels)
     tagger.add_label("A")
-    nlp.begin_training()
+    nlp.initialize()
     assert orig_tag_count + 1 == len(nlp.get_pipe("tagger").labels)


@@ -38,7 +38,7 @@ def test_no_label():
     nlp = Language()
     nlp.add_pipe("tagger")
     with pytest.raises(ValueError):
-        nlp.begin_training()
+        nlp.initialize()


 def test_no_resize():
@@ -47,7 +47,7 @@ def test_no_resize():
     tagger.add_label("N")
     tagger.add_label("V")
     assert tagger.labels == ("N", "V")
-    nlp.begin_training()
+    nlp.initialize()
     assert tagger.model.get_dim("nO") == 2
     # this throws an error because the tagger can't be resized after initialization
     with pytest.raises(ValueError):
@@ -60,10 +60,10 @@ def test_implicit_label():
     train_examples = []
     for t in TRAIN_DATA:
         train_examples.append(Example.from_dict(nlp.make_doc(t[0]), t[1]))
-    nlp.begin_training(get_examples=lambda: train_examples)
+    nlp.initialize(get_examples=lambda: train_examples)


-def test_begin_training_examples():
+def test_initialize_examples():
     nlp = Language()
     tagger = nlp.add_pipe("tagger")
     train_examples = []
@@ -72,16 +72,16 @@ def test_begin_training_examples():
     for t in TRAIN_DATA:
         train_examples.append(Example.from_dict(nlp.make_doc(t[0]), t[1]))
     # you shouldn't really call this more than once, but for testing it should be fine
-    nlp.begin_training()
-    nlp.begin_training(get_examples=lambda: train_examples)
-    with pytest.raises(TypeError):
-        nlp.begin_training(get_examples=lambda: None)
-    with pytest.raises(TypeError):
-        nlp.begin_training(get_examples=lambda: train_examples[0])
+    nlp.initialize()
+    nlp.initialize(get_examples=lambda: train_examples)
     with pytest.raises(ValueError):
-        nlp.begin_training(get_examples=lambda: [])
+        nlp.initialize(get_examples=lambda: None)
+    with pytest.raises(TypeError):
+        nlp.initialize(get_examples=lambda: train_examples[0])
     with pytest.raises(ValueError):
-        nlp.begin_training(get_examples=train_examples)
+        nlp.initialize(get_examples=lambda: [])
+    with pytest.raises(ValueError):
+        nlp.initialize(get_examples=train_examples)


 def test_overfitting_IO():
@@ -91,7 +91,7 @@ def test_overfitting_IO():
     train_examples = []
     for t in TRAIN_DATA:
         train_examples.append(Example.from_dict(nlp.make_doc(t[0]), t[1]))
-    optimizer = nlp.begin_training(get_examples=lambda: train_examples)
+    optimizer = nlp.initialize(get_examples=lambda: train_examples)
     assert tagger.model.get_dim("nO") == len(TAGS)

     for i in range(50):
@@ -122,4 +122,4 @@ def test_tagger_requires_labels():
     nlp = English()
     nlp.add_pipe("tagger")
     with pytest.raises(ValueError):
-        nlp.begin_training()
+        nlp.initialize()

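The renamed test_initialize_examples above also fixes the validation contract for get_examples: it must be a zero-argument callable returning a non-empty iterable of Example objects. A compact summary of that contract as a sketch (the sample sentence and tags are placeholder assumptions):

from spacy.lang.en import English
from spacy.training import Example

nlp = English()
nlp.add_pipe("tagger")
examples = [Example.from_dict(nlp.make_doc("I like dogs"), {"tags": ["PRP", "VBP", "NNS"]})]

# Valid: a zero-argument callable returning a non-empty list of Example objects
nlp.initialize(get_examples=lambda: examples)

# Invalid, per the assertions above:
#   nlp.initialize(get_examples=examples)             # not a callable        -> ValueError
#   nlp.initialize(get_examples=lambda: None)         # not an iterable       -> ValueError
#   nlp.initialize(get_examples=lambda: [])           # empty                 -> ValueError
#   nlp.initialize(get_examples=lambda: examples[0])  # not a list of Example -> TypeError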
@@ -9,10 +9,10 @@ from spacy.pipeline import TextCategorizer
 from spacy.tokens import Doc
 from spacy.pipeline.tok2vec import DEFAULT_TOK2VEC_MODEL
 from spacy.scorer import Scorer
+from spacy.training import Example
+from spacy.training.initialize import verify_textcat_config

 from ..util import make_tempdir
-from ...cli.train import verify_textcat_config
-from ...training import Example


 TRAIN_DATA = [
@@ -26,7 +26,7 @@ def test_simple_train():
     nlp = Language()
     textcat = nlp.add_pipe("textcat")
     textcat.add_label("answer")
-    nlp.begin_training()
+    nlp.initialize()
     for i in range(5):
         for text, answer in [
             ("aaaa", 1.0),
@@ -56,7 +56,7 @@ def test_textcat_learns_multilabel():
     textcat = TextCategorizer(nlp.vocab, width=8)
     for letter in letters:
         textcat.add_label(letter)
-    optimizer = textcat.begin_training(lambda: [])
+    optimizer = textcat.initialize(lambda: [])
     for i in range(30):
         losses = {}
         examples = [Example.from_dict(doc, {"cats": cats}) for doc, cat in docs]
@@ -86,7 +86,7 @@ def test_no_label():
     nlp = Language()
     nlp.add_pipe("textcat")
     with pytest.raises(ValueError):
-        nlp.begin_training()
+        nlp.initialize()


 def test_implicit_label():
@@ -95,7 +95,7 @@ def test_implicit_label():
     train_examples = []
     for t in TRAIN_DATA:
         train_examples.append(Example.from_dict(nlp.make_doc(t[0]), t[1]))
-    nlp.begin_training(get_examples=lambda: train_examples)
+    nlp.initialize(get_examples=lambda: train_examples)


 def test_no_resize():
@@ -103,14 +103,14 @@ def test_no_resize():
     textcat = nlp.add_pipe("textcat")
     textcat.add_label("POSITIVE")
     textcat.add_label("NEGATIVE")
-    nlp.begin_training()
+    nlp.initialize()
     assert textcat.model.get_dim("nO") == 2
     # this throws an error because the textcat can't be resized after initialization
     with pytest.raises(ValueError):
         textcat.add_label("NEUTRAL")


-def test_begin_training_examples():
+def test_initialize_examples():
     nlp = Language()
     textcat = nlp.add_pipe("textcat")
     train_examples = []
@@ -119,12 +119,12 @@ def test_begin_training_examples():
         for label, value in annotations.get("cats").items():
             textcat.add_label(label)
     # you shouldn't really call this more than once, but for testing it should be fine
-    nlp.begin_training()
-    nlp.begin_training(get_examples=lambda: train_examples)
-    with pytest.raises(TypeError):
-        nlp.begin_training(get_examples=lambda: None)
+    nlp.initialize()
+    nlp.initialize(get_examples=lambda: train_examples)
     with pytest.raises(ValueError):
-        nlp.begin_training(get_examples=train_examples)
+        nlp.initialize(get_examples=lambda: None)
+    with pytest.raises(ValueError):
+        nlp.initialize(get_examples=train_examples)


 def test_overfitting_IO():
@@ -139,7 +139,7 @@ def test_overfitting_IO():
     train_examples = []
     for text, annotations in TRAIN_DATA:
         train_examples.append(Example.from_dict(nlp.make_doc(text), annotations))
-    optimizer = nlp.begin_training(get_examples=lambda: train_examples)
+    optimizer = nlp.initialize(get_examples=lambda: train_examples)
     assert textcat.model.get_dim("nO") == 2

     for i in range(50):
@@ -195,7 +195,7 @@ def test_textcat_configs(textcat_config):
         train_examples.append(Example.from_dict(nlp.make_doc(text), annotations))
         for label, value in annotations.get("cats").items():
             textcat.add_label(label)
-    optimizer = nlp.begin_training()
+    optimizer = nlp.initialize()
     for i in range(5):
         losses = {}
         nlp.update(train_examples, sgd=optimizer, losses=losses)
@@ -226,6 +226,7 @@ def test_positive_class_not_binary():
     with pytest.raises(ValueError):
         verify_textcat_config(nlp, pipe_config)

+
 def test_textcat_evaluation():
     train_examples = []
     nlp = English()
@@ -241,15 +242,17 @@ def test_textcat_evaluation():
     pred2.cats = {"winter": 1.0, "summer": 0.0, "spring": 0.0, "autumn": 1.0}
     train_examples.append(Example(pred2, ref2))

-    scores = Scorer().score_cats(train_examples, "cats", labels=["winter", "summer", "spring", "autumn"])
-    assert scores["cats_f_per_type"]["winter"]["p"] == 1/2
-    assert scores["cats_f_per_type"]["winter"]["r"] == 1/1
+    scores = Scorer().score_cats(
+        train_examples, "cats", labels=["winter", "summer", "spring", "autumn"]
+    )
+    assert scores["cats_f_per_type"]["winter"]["p"] == 1 / 2
+    assert scores["cats_f_per_type"]["winter"]["r"] == 1 / 1
     assert scores["cats_f_per_type"]["summer"]["p"] == 0
-    assert scores["cats_f_per_type"]["summer"]["r"] == 0/1
-    assert scores["cats_f_per_type"]["spring"]["p"] == 1/1
-    assert scores["cats_f_per_type"]["spring"]["r"] == 1/2
-    assert scores["cats_f_per_type"]["autumn"]["p"] == 2/2
-    assert scores["cats_f_per_type"]["autumn"]["r"] == 2/2
+    assert scores["cats_f_per_type"]["summer"]["r"] == 0 / 1
+    assert scores["cats_f_per_type"]["spring"]["p"] == 1 / 1
+    assert scores["cats_f_per_type"]["spring"]["r"] == 1 / 2
+    assert scores["cats_f_per_type"]["autumn"]["p"] == 2 / 2
+    assert scores["cats_f_per_type"]["autumn"]["r"] == 2 / 2

-    assert scores["cats_micro_p"] == 4/5
-    assert scores["cats_micro_r"] == 4/6
+    assert scores["cats_micro_p"] == 4 / 5
+    assert scores["cats_micro_r"] == 4 / 6

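The reflowed assertions in test_textcat_evaluation follow from simple pooled counts: across the two documents there are 4 true positive, 1 false positive and 2 false negative category predictions (counts inferred from the per-type scores above), which is exactly where 4 / 5 and 4 / 6 come from:

# Micro-averaged scores pool TP/FP/FN over all labels before dividing.
tp, fp, fn = 4, 1, 2          # implied by the per-type assertions above
micro_p = tp / (tp + fp)      # 4 / 5
micro_r = tp / (tp + fn)      # 4 / 6
assert micro_p == 4 / 5 and micro_r == 4 / 6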
@@ -73,8 +73,7 @@ def test_tok2vec_configs(width, embed_arch, embed_config, encode_arch, encode_co
     encode_config["width"] = width
     docs = get_batch(3)
     tok2vec = build_Tok2Vec_model(
-        embed_arch(**embed_config),
-        encode_arch(**encode_config)
+        embed_arch(**embed_config), encode_arch(**encode_config)
     )
     tok2vec.initialize(docs)
     vectors, backprop = tok2vec.begin_update(docs)
@@ -88,7 +87,7 @@ def test_init_tok2vec():
     nlp = English()
     tok2vec = nlp.add_pipe("tok2vec")
     assert tok2vec.listeners == []
-    nlp.begin_training()
+    nlp.initialize()
     assert tok2vec.model.get_dim("nO")


@@ -154,7 +153,7 @@ def test_tok2vec_listener():

     # Check that the Tok2Vec component finds it listeners
     assert tok2vec.listeners == []
-    optimizer = nlp.begin_training(lambda: train_examples)
+    optimizer = nlp.initialize(lambda: train_examples)
     assert tok2vec.listeners == [tagger_tok2vec]

     for i in range(5):

@@ -428,7 +428,7 @@ def test_issue999():
     for _, offsets in TRAIN_DATA:
         for start, end, label in offsets:
             ner.add_label(label)
-    nlp.begin_training()
+    nlp.initialize()
     for itn in range(20):
         random.shuffle(TRAIN_DATA)
         for raw_text, entity_offsets in TRAIN_DATA:

Some files were not shown because too many files have changed in this diff.