Merge remote-tracking branch 'upstream/develop' into feature/small-fixes

svlandeg 2020-10-02 20:48:11 +02:00
commit 02247cccaf
175 changed files with 4000 additions and 2934 deletions

View File

@@ -1,7 +1,7 @@
 SHELL := /bin/bash
 ifndef SPACY_EXTRAS
-override SPACY_EXTRAS = spacy-lookups-data==0.4.0.dev0 jieba pkuseg==0.0.25 sudachipy sudachidict_core
+override SPACY_EXTRAS = spacy-lookups-data==1.0.0rc0 jieba pkuseg==0.0.25 pickle5 sudachipy sudachidict_core
 endif
 ifndef PYVER

View File

@@ -6,7 +6,7 @@ requires = [
     "cymem>=2.0.2,<2.1.0",
     "preshed>=3.0.2,<3.1.0",
     "murmurhash>=0.28.0,<1.1.0",
-    "thinc>=8.0.0a41,<8.0.0a50",
+    "thinc>=8.0.0a43,<8.0.0a50",
     "blis>=0.4.0,<0.5.0",
     "pytokenizations",
     "pathy"

View File

@@ -1,12 +1,12 @@
 # Our libraries
 cymem>=2.0.2,<2.1.0
 preshed>=3.0.2,<3.1.0
-thinc>=8.0.0a41,<8.0.0a50
+thinc>=8.0.0a43,<8.0.0a50
 blis>=0.4.0,<0.5.0
 ml_datasets==0.2.0a0
 murmurhash>=0.28.0,<1.1.0
 wasabi>=0.8.0,<1.1.0
-srsly>=2.1.0,<3.0.0
+srsly>=2.3.0,<3.0.0
 catalogue>=2.0.1,<2.1.0
 typer>=0.3.0,<0.4.0
 pathy

View File

@@ -34,16 +34,16 @@ setup_requires =
     cymem>=2.0.2,<2.1.0
     preshed>=3.0.2,<3.1.0
     murmurhash>=0.28.0,<1.1.0
-    thinc>=8.0.0a41,<8.0.0a50
+    thinc>=8.0.0a43,<8.0.0a50
 install_requires =
     # Our libraries
     murmurhash>=0.28.0,<1.1.0
     cymem>=2.0.2,<2.1.0
     preshed>=3.0.2,<3.1.0
-    thinc>=8.0.0a41,<8.0.0a50
+    thinc>=8.0.0a43,<8.0.0a50
     blis>=0.4.0,<0.5.0
     wasabi>=0.8.0,<1.1.0
-    srsly>=2.1.0,<3.0.0
+    srsly>=2.3.0,<3.0.0
     catalogue>=2.0.1,<2.1.0
     typer>=0.3.0,<0.4.0
     pathy
@@ -65,7 +65,7 @@ console_scripts =
 [options.extras_require]
 lookups =
-    spacy_lookups_data==0.4.0.dev0
+    spacy_lookups_data==1.0.0rc0
 cuda =
     cupy>=5.0.0b4,<9.0.0
 cuda80 =
@@ -84,7 +84,7 @@ cuda102 =
     cupy-cuda102>=5.0.0b4,<9.0.0
 # Language tokenizers with external dependencies
 ja =
-    sudachipy>=0.4.5
+    sudachipy>=0.4.9
     sudachidict_core>=20200330
 ko =
     natto-py==0.9.0
@@ -98,7 +98,7 @@ universal = false
 formats = gztar
 [flake8]
-ignore = E203, E266, E501, E731, W503
+ignore = E203, E266, E501, E731, W503, E741
 max-line-length = 80
 select = B,C,E,F,W,T4,B9
 exclude =

View File

@@ -1,6 +1,6 @@
 # fmt: off
 __title__ = "spacy-nightly"
-__version__ = "3.0.0a26"
+__version__ = "3.0.0a29"
 __release__ = True
 __download_url__ = "https://github.com/explosion/spacy-models/releases/download"
 __compatibility__ = "https://raw.githubusercontent.com/explosion/spacy-models/master/compatibility.json"

View File

@@ -15,7 +15,7 @@ from .debug_config import debug_config  # noqa: F401
 from .debug_model import debug_model  # noqa: F401
 from .evaluate import evaluate  # noqa: F401
 from .convert import convert  # noqa: F401
-from .init_model import init_model  # noqa: F401
+from .init_pipeline import init_pipeline_cli  # noqa: F401
 from .init_config import init_config, fill_config  # noqa: F401
 from .validate import validate  # noqa: F401
 from .project.clone import project_clone  # noqa: F401

View File

@@ -10,12 +10,13 @@ from click import NoSuchOption
 from click.parser import split_arg_string
 from typer.main import get_command
 from contextlib import contextmanager
-from thinc.api import Config, ConfigValidationError
+from thinc.api import Config, ConfigValidationError, require_gpu
 from configparser import InterpolationError
 import os
 from ..schemas import ProjectConfigSchema, validate
 from ..util import import_file, run_command, make_tempdir, registry, logger
+from ..util import ENV_VARS
 if TYPE_CHECKING:
     from pathy import Pathy  # noqa: F401
@@ -39,7 +40,6 @@ commands to check and validate your config files, training and evaluation data,
 and custom model implementations.
 """
 INIT_HELP = """Commands for initializing configs and pipeline packages."""
-OVERRIDES_ENV_VAR = "SPACY_CONFIG_OVERRIDES"
 # Wrappers for Typer's annotations. Initially created to set defaults and to
 # keep the names short, but not needed at the moment.
@@ -65,7 +65,7 @@ def setup_cli() -> None:
 def parse_config_overrides(
-    args: List[str], env_var: Optional[str] = OVERRIDES_ENV_VAR
+    args: List[str], env_var: Optional[str] = ENV_VARS.CONFIG_OVERRIDES
 ) -> Dict[str, Any]:
     """Generate a dictionary of config overrides based on the extra arguments
     provided on the CLI, e.g. --training.batch_size to override
@@ -275,18 +275,6 @@ def import_code(code_path: Optional[Union[Path, str]]) -> None:
         msg.fail(f"Couldn't load Python code: {code_path}", e, exits=1)
-def get_sourced_components(config: Union[Dict[str, Any], Config]) -> List[str]:
-    """RETURNS (List[str]): All sourced components in the original config,
-    e.g. {"source": "en_core_web_sm"}. If the config contains a key
-    "factory", we assume it refers to a component factory.
-    """
-    return [
-        name
-        for name, cfg in config.get("components", {}).items()
-        if "factory" not in cfg and "source" in cfg
-    ]
 def upload_file(src: Path, dest: Union[str, "Pathy"]) -> None:
     """Upload a file.
@@ -458,3 +446,12 @@ def string_to_list(value: str, intify: bool = False) -> Union[List[str], List[in
             p = int(p)
         result.append(p)
     return result
+def setup_gpu(use_gpu: int) -> None:
+    """Configure the GPU and log info."""
+    if use_gpu >= 0:
+        msg.info(f"Using GPU: {use_gpu}")
+        require_gpu(use_gpu)
+    else:
+        msg.info("Using CPU")

View File

@@ -9,7 +9,8 @@ import sys
 from ._util import app, Arg, Opt
 from ..training import docs_to_json
 from ..tokens import DocBin
-from ..training.converters import iob_to_docs, conll_ner_to_docs, json_to_docs, conllu_to_docs
+from ..training.converters import iob_to_docs, conll_ner_to_docs, json_to_docs
+from ..training.converters import conllu_to_docs
 # Converters are matched by file extension except for ner/iob, which are

View File

@@ -1,12 +1,14 @@
 from typing import Optional, Dict, Any, Union, List
 from pathlib import Path
 from wasabi import msg, table
-from thinc.api import Config, ConfigValidationError
+from thinc.api import Config
 from thinc.config import VARIABLE_RE
 import typer
 from ._util import Arg, Opt, show_validation_error, parse_config_overrides
 from ._util import import_code, debug_cli
+from ..schemas import ConfigSchemaTraining
+from ..util import registry
 from .. import util
@@ -52,10 +54,10 @@ def debug_config(
     with show_validation_error(config_path):
         config = util.load_config(config_path, overrides=overrides)
         nlp = util.load_model_from_config(config)
-        # Use the resolved config here in case user has one function returning
-        # a dict of corpora etc.
-        resolved = util.resolve_training_config(nlp.config)
-        check_section_refs(resolved, ["training.dev_corpus", "training.train_corpus"])
+        config = nlp.config.interpolate()
+        T = registry.resolve(config["training"], schema=ConfigSchemaTraining)
+        dot_names = [T["train_corpus"], T["dev_corpus"]]
+        util.resolve_dot_names(config, dot_names)
     msg.good("Config is valid")
     if show_vars:
         variables = get_variables(config)
@@ -97,23 +99,3 @@ def get_variables(config: Config) -> Dict[str, Any]:
         value = util.dot_to_object(config, path)
         result[variable] = repr(value)
     return result
-def check_section_refs(config: Config, fields: List[str]) -> None:
-    """Validate fields in the config that refer to other sections or values
-    (e.g. in the corpora) and make sure that those references exist.
-    """
-    errors = []
-    for field in fields:
-        # If the field doesn't exist in the config, we ignore it
-        try:
-            value = util.dot_to_object(config, field)
-        except KeyError:
-            continue
-        try:
-            util.dot_to_object(config, value)
-        except KeyError:
-            msg = f"not a valid section reference: {value}"
-            errors.append({"loc": field.split("."), "msg": msg})
-    if errors:
-        raise ConfigValidationError(config=config, errors=errors)
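For illustration, the replacement validation flow above can be reproduced outside the CLI. A minimal sketch (not part of this diff); the config path is hypothetical and must point at a valid spaCy v3 training config:

from spacy import util
from spacy.schemas import ConfigSchemaTraining
from spacy.util import registry, resolve_dot_names

config = util.load_config("config.cfg")  # hypothetical path
nlp = util.load_model_from_config(config)

# Interpolate variables, then resolve only the [training] block against its schema.
filled = nlp.config.interpolate()
T = registry.resolve(filled["training"], schema=ConfigSchemaTraining)

# Raises a validation error if e.g. "corpora.train" is not defined in the config.
train_corpus, dev_corpus = resolve_dot_names(filled, [T["train_corpus"], T["dev_corpus"]])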

View File

@@ -7,10 +7,13 @@ from wasabi import Printer, MESSAGES, msg
 import typer
 from ._util import app, Arg, Opt, show_validation_error, parse_config_overrides
-from ._util import import_code, debug_cli, get_sourced_components
-from ..training import Corpus, Example
+from ._util import import_code, debug_cli
+from ..training import Example
+from ..training.initialize import get_sourced_components
+from ..schemas import ConfigSchemaTraining
 from ..pipeline._parser_internals import nonproj
 from ..language import Language
+from ..util import registry, resolve_dot_names
 from .. import util
@@ -24,7 +27,7 @@ BLANK_MODEL_THRESHOLD = 2000
 @debug_cli.command(
-    "data", context_settings={"allow_extra_args": True, "ignore_unknown_options": True},
+    "data", context_settings={"allow_extra_args": True, "ignore_unknown_options": True}
 )
 @app.command(
     "debug-data",
@@ -34,8 +37,6 @@ BLANK_MODEL_THRESHOLD = 2000
 def debug_data_cli(
     # fmt: off
     ctx: typer.Context,  # This is only used to read additional arguments
-    train_path: Path = Arg(..., help="Location of JSON-formatted training data", exists=True),
-    dev_path: Path = Arg(..., help="Location of JSON-formatted development data", exists=True),
     config_path: Path = Arg(..., help="Path to config file", exists=True),
     code_path: Optional[Path] = Opt(None, "--code-path", "-c", help="Path to Python file with additional code (registered functions) to be imported"),
     ignore_warnings: bool = Opt(False, "--ignore-warnings", "-IW", help="Ignore warnings, only show stats and errors"),
@@ -59,8 +60,6 @@ def debug_data_cli(
     overrides = parse_config_overrides(ctx.args)
     import_code(code_path)
     debug_data(
-        train_path,
-        dev_path,
         config_path,
         config_overrides=overrides,
         ignore_warnings=ignore_warnings,
@@ -71,8 +70,6 @@ def debug_data(
 def debug_data(
-    train_path: Path,
-    dev_path: Path,
     config_path: Path,
     *,
     config_overrides: Dict[str, Any] = {},
@@ -85,57 +82,29 @@ def debug_data(
         no_print=silent, pretty=not no_format, ignore_warnings=ignore_warnings
     )
     # Make sure all files and paths exists if they are needed
-    if not train_path.exists():
-        msg.fail("Training data not found", train_path, exits=1)
-    if not dev_path.exists():
-        msg.fail("Development data not found", dev_path, exits=1)
-    if not config_path.exists():
-        msg.fail("Config file not found", config_path, exists=1)
     with show_validation_error(config_path):
         cfg = util.load_config(config_path, overrides=config_overrides)
         nlp = util.load_model_from_config(cfg)
-        C = util.resolve_training_config(nlp.config)
+        config = nlp.config.interpolate()
+        T = registry.resolve(config["training"], schema=ConfigSchemaTraining)
     # Use original config here, not resolved version
     sourced_components = get_sourced_components(cfg)
-    frozen_components = C["training"]["frozen_components"]
+    frozen_components = T["frozen_components"]
     resume_components = [p for p in sourced_components if p not in frozen_components]
     pipeline = nlp.pipe_names
     factory_names = [nlp.get_pipe_meta(pipe).factory for pipe in nlp.pipe_names]
-    tag_map_path = util.ensure_path(C["training"]["tag_map"])
-    tag_map = {}
-    if tag_map_path is not None:
-        tag_map = srsly.read_json(tag_map_path)
-    morph_rules_path = util.ensure_path(C["training"]["morph_rules"])
-    morph_rules = {}
-    if morph_rules_path is not None:
-        morph_rules = srsly.read_json(morph_rules_path)
-    # Replace tag map with provided mapping
-    nlp.vocab.morphology.load_tag_map(tag_map)
-    # Load morph rules
-    nlp.vocab.morphology.load_morph_exceptions(morph_rules)
     msg.divider("Data file validation")
     # Create the gold corpus to be able to better analyze data
-    loading_train_error_message = ""
-    loading_dev_error_message = ""
-    with msg.loading("Loading corpus..."):
-        try:
-            train_dataset = list(Corpus(train_path)(nlp))
-        except ValueError as e:
-            loading_train_error_message = f"Training data cannot be loaded: {e}"
-        try:
-            dev_dataset = list(Corpus(dev_path)(nlp))
-        except ValueError as e:
-            loading_dev_error_message = f"Development data cannot be loaded: {e}"
-    if loading_train_error_message or loading_dev_error_message:
-        if loading_train_error_message:
-            msg.fail(loading_train_error_message)
-        if loading_dev_error_message:
-            msg.fail(loading_dev_error_message)
-        sys.exit(1)
+    dot_names = [T["train_corpus"], T["dev_corpus"]]
+    train_corpus, dev_corpus = resolve_dot_names(config, dot_names)
+    train_dataset = list(train_corpus(nlp))
+    dev_dataset = list(dev_corpus(nlp))
     msg.good("Corpus is loadable")
+    nlp.initialize(lambda: train_dataset)
+    msg.good("Pipeline can be initialized with data")
     # Create all gold data here to avoid iterating over the train_dataset constantly
     gold_train_data = _compile_gold(train_dataset, factory_names, nlp, make_proj=True)
     gold_train_unpreprocessed_data = _compile_gold(
@@ -145,10 +114,10 @@ def debug_data(
     train_texts = gold_train_data["texts"]
     dev_texts = gold_dev_data["texts"]
-    frozen_components = C["training"]["frozen_components"]
+    frozen_components = T["frozen_components"]
     msg.divider("Training stats")
-    msg.text(f"Language: {C['nlp']['lang']}")
+    msg.text(f"Language: {nlp.lang}")
     msg.text(f"Training pipeline: {', '.join(pipeline)}")
     if resume_components:
         msg.text(f"Components from other pipelines: {', '.join(resume_components)}")
@@ -355,17 +324,12 @@ def debug_data(
     if "tagger" in factory_names:
         msg.divider("Part-of-speech Tagging")
         labels = [label for label in gold_train_data["tags"]]
-        tag_map = nlp.vocab.morphology.tag_map
-        msg.info(f"{len(labels)} label(s) in data ({len(tag_map)} label(s) in tag map)")
+        # TODO: does this need to be updated?
+        msg.info(f"{len(labels)} label(s) in data")
         labels_with_counts = _format_labels(
             gold_train_data["tags"].most_common(), counts=True
         )
         msg.text(labels_with_counts, show=verbose)
-        non_tagmap = [l for l in labels if l not in tag_map]
-        if not non_tagmap:
-            msg.good(f"All labels present in tag map for language '{nlp.lang}'")
-        for label in non_tagmap:
-            msg.fail(f"Label '{label}' not found in tag map for language '{nlp.lang}'")
     if "parser" in factory_names:
         has_low_data_warning = False
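The new corpus check above also verifies that the pipeline can be initialized from the training examples. A rough standalone sketch of that pattern (not part of this diff), using a blank pipeline and made-up examples instead of corpora resolved from a config:

import spacy
from spacy.training import Example

nlp = spacy.blank("en")
nlp.add_pipe("textcat")  # any trainable component works for the check

examples = [
    Example.from_dict(nlp.make_doc("I loved it"), {"cats": {"POS": 1.0, "NEG": 0.0}}),
    Example.from_dict(nlp.make_doc("I hated it"), {"cats": {"POS": 0.0, "NEG": 1.0}}),
]

# Same pattern as debug-data: initialize from a callable that returns examples.
nlp.initialize(lambda: examples)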

View File

@@ -2,18 +2,23 @@ from typing import Dict, Any, Optional, Iterable
 from pathlib import Path
 from spacy.training import Example
-from spacy.util import dot_to_object
+from spacy.util import resolve_dot_names
 from wasabi import msg
-from thinc.api import require_gpu, fix_random_seed, set_dropout_rate, Adam
+from thinc.api import fix_random_seed, set_dropout_rate, Adam
 from thinc.api import Model, data_validation, set_gpu_allocator
 import typer
 from ._util import Arg, Opt, debug_cli, show_validation_error
-from ._util import parse_config_overrides, string_to_list
+from ._util import parse_config_overrides, string_to_list, setup_gpu
+from ..schemas import ConfigSchemaTraining
+from ..util import registry
 from .. import util
-@debug_cli.command("model")
+@debug_cli.command(
+    "model",
+    context_settings={"allow_extra_args": True, "ignore_unknown_options": True},
+)
 def debug_model_cli(
     # fmt: off
     ctx: typer.Context,  # This is only used to read additional arguments
@@ -37,11 +42,7 @@ def debug_model_cli(
     DOCS: https://nightly.spacy.io/api/cli#debug-model
     """
-    if use_gpu >= 0:
-        msg.info("Using GPU")
-        require_gpu(use_gpu)
-    else:
-        msg.info("Using CPU")
+    setup_gpu(use_gpu)
     layers = string_to_list(layers, intify=True)
     print_settings = {
         "dimensions": dimensions,
@@ -59,14 +60,15 @@ def debug_model_cli(
     raw_config = util.load_config(
         config_path, overrides=config_overrides, interpolate=False
     )
-    config = raw_config.iterpolate()
+    config = raw_config.interpolate()
     allocator = config["training"]["gpu_allocator"]
     if use_gpu >= 0 and allocator:
         set_gpu_allocator(allocator)
     with show_validation_error(config_path):
         nlp = util.load_model_from_config(raw_config)
-        C = util.resolve_training_config(nlp.config)
-        seed = C["training"]["seed"]
+        config = nlp.config.interpolate()
+        T = registry.resolve(config["training"], schema=ConfigSchemaTraining)
+        seed = T["seed"]
     if seed is not None:
         msg.info(f"Fixing random seed: {seed}")
         fix_random_seed(seed)
@@ -77,11 +79,16 @@ def debug_model_cli(
             exits=1,
         )
     model = pipe.model
-    debug_model(C, nlp, model, print_settings=print_settings)
+    debug_model(config, T, nlp, model, print_settings=print_settings)
 def debug_model(
-    config, nlp, model: Model, *, print_settings: Optional[Dict[str, Any]] = None
+    config,
+    resolved_train_config,
+    nlp,
+    model: Model,
+    *,
+    print_settings: Optional[Dict[str, Any]] = None,
 ):
     if not isinstance(model, Model):
         msg.fail(
@@ -102,13 +109,16 @@ def debug_model(
     # The output vector might differ from the official type of the output layer
     with data_validation(False):
         try:
-            train_corpus = dot_to_object(config, config["training"]["train_corpus"])
-            nlp.begin_training(lambda: train_corpus(nlp))
+            dot_names = [resolved_train_config["train_corpus"]]
+            with show_validation_error():
+                (train_corpus,) = resolve_dot_names(config, dot_names)
+                nlp.initialize(lambda: train_corpus(nlp))
             msg.info("Initialized the model with the training corpus.")
         except ValueError:
             try:
                 _set_output_dim(nO=7, model=model)
-                nlp.begin_training(lambda: [Example.from_dict(x, {}) for x in X])
+                with show_validation_error():
+                    nlp.initialize(lambda: [Example.from_dict(x, {}) for x in X])
                 msg.info("Initialized the model with dummy data.")
             except Exception:
                 msg.fail(
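As in the hunk above, the memory allocator comes from training.gpu_allocator and is activated before any model weights are allocated. A compressed sketch of that sequence (not part of this diff); the config path is hypothetical and the GPU ID assumes a CUDA device is available:

from thinc.api import require_gpu, set_gpu_allocator
from spacy import util

use_gpu = 0  # set to -1 on a CPU-only machine
config = util.load_config("config.cfg", interpolate=True)  # hypothetical path

# Share one memory pool between cupy and the backing framework (e.g. "pytorch").
allocator = config["training"]["gpu_allocator"]
if use_gpu >= 0 and allocator:
    set_gpu_allocator(allocator)
    require_gpu(use_gpu)

nlp = util.load_model_from_config(config)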

View File

@@ -3,11 +3,11 @@ from wasabi import Printer
 from pathlib import Path
 import re
 import srsly
-from thinc.api import require_gpu, fix_random_seed
+from thinc.api import fix_random_seed
 from ..training import Corpus
 from ..tokens import Doc
-from ._util import app, Arg, Opt
+from ._util import app, Arg, Opt, setup_gpu, import_code
 from ..scorer import Scorer
 from .. import util
 from .. import displacy
@@ -19,6 +19,7 @@ def evaluate_cli(
     model: str = Arg(..., help="Model name or path"),
     data_path: Path = Arg(..., help="Location of binary evaluation data in .spacy format", exists=True),
     output: Optional[Path] = Opt(None, "--output", "-o", help="Output JSON file for metrics", dir_okay=False),
+    code_path: Optional[Path] = Opt(None, "--code", "-c", help="Path to Python file with additional code (registered functions) to be imported"),
     use_gpu: int = Opt(-1, "--gpu-id", "-g", help="GPU ID or -1 for CPU"),
     gold_preproc: bool = Opt(False, "--gold-preproc", "-G", help="Use gold preprocessing"),
     displacy_path: Optional[Path] = Opt(None, "--displacy-path", "-dp", help="Directory to output rendered parses as HTML", exists=True, file_okay=False),
@@ -37,6 +38,7 @@ def evaluate_cli(
     DOCS: https://nightly.spacy.io/api/cli#evaluate
     """
+    import_code(code_path)
     evaluate(
         model,
         data_path,
@@ -61,8 +63,7 @@ def evaluate(
 ) -> Scorer:
     msg = Printer(no_print=silent, pretty=not silent)
     fix_random_seed()
-    if use_gpu >= 0:
-        require_gpu(use_gpu)
+    setup_gpu(use_gpu)
     data_path = util.ensure_path(data_path)
     output_path = util.ensure_path(output)
     displacy_path = util.ensure_path(displacy_path)

View File

@@ -1,360 +0,0 @@
from typing import Optional, List, Dict, Any, Union, IO
import math
from tqdm import tqdm
import numpy
from ast import literal_eval
from pathlib import Path
from preshed.counter import PreshCounter
import tarfile
import gzip
import zipfile
import srsly
import warnings
from wasabi import msg, Printer
import typer
from ._util import app, init_cli, Arg, Opt
from ..vectors import Vectors
from ..errors import Errors, Warnings
from ..language import Language
from ..util import ensure_path, get_lang_class, load_model, OOV_RANK
try:
import ftfy
except ImportError:
ftfy = None
DEFAULT_OOV_PROB = -20
@init_cli.command("vocab")
@app.command(
"init-model",
context_settings={"allow_extra_args": True, "ignore_unknown_options": True},
hidden=True, # hide this from main CLI help but still allow it to work with warning
)
def init_model_cli(
# fmt: off
ctx: typer.Context, # This is only used to read additional arguments
lang: str = Arg(..., help="Pipeline language"),
output_dir: Path = Arg(..., help="Pipeline output directory"),
freqs_loc: Optional[Path] = Arg(None, help="Location of words frequencies file", exists=True),
clusters_loc: Optional[Path] = Opt(None, "--clusters-loc", "-c", help="Optional location of brown clusters data", exists=True),
jsonl_loc: Optional[Path] = Opt(None, "--jsonl-loc", "-j", help="Location of JSONL-formatted attributes file", exists=True),
vectors_loc: Optional[Path] = Opt(None, "--vectors-loc", "-v", help="Optional vectors file in Word2Vec format", exists=True),
prune_vectors: int = Opt(-1, "--prune-vectors", "-V", help="Optional number of vectors to prune to"),
truncate_vectors: int = Opt(0, "--truncate-vectors", "-t", help="Optional number of vectors to truncate to when reading in vectors file"),
vectors_name: Optional[str] = Opt(None, "--vectors-name", "-vn", help="Optional name for the word vectors, e.g. en_core_web_lg.vectors"),
model_name: Optional[str] = Opt(None, "--meta-name", "-mn", help="Optional name of the package for the pipeline meta"),
base_model: Optional[str] = Opt(None, "--base", "-b", help="Name of or path to base pipeline to start with (mostly relevant for pipelines with custom tokenizers)")
# fmt: on
):
"""
Create a new blank pipeline directory with vocab and vectors from raw data.
If vectors are provided in Word2Vec format, they can be either a .txt or
zipped as a .zip or .tar.gz.
DOCS: https://nightly.spacy.io/api/cli#init-vocab
"""
if ctx.command.name == "init-model":
msg.warn(
"The init-model command is now called 'init vocab'. You can run "
"'python -m spacy init --help' for an overview of the other "
"available initialization commands."
)
init_model(
lang,
output_dir,
freqs_loc=freqs_loc,
clusters_loc=clusters_loc,
jsonl_loc=jsonl_loc,
vectors_loc=vectors_loc,
prune_vectors=prune_vectors,
truncate_vectors=truncate_vectors,
vectors_name=vectors_name,
model_name=model_name,
base_model=base_model,
silent=False,
)
def init_model(
lang: str,
output_dir: Path,
freqs_loc: Optional[Path] = None,
clusters_loc: Optional[Path] = None,
jsonl_loc: Optional[Path] = None,
vectors_loc: Optional[Path] = None,
prune_vectors: int = -1,
truncate_vectors: int = 0,
vectors_name: Optional[str] = None,
model_name: Optional[str] = None,
base_model: Optional[str] = None,
silent: bool = True,
) -> Language:
msg = Printer(no_print=silent, pretty=not silent)
if jsonl_loc is not None:
if freqs_loc is not None or clusters_loc is not None:
settings = ["-j"]
if freqs_loc:
settings.append("-f")
if clusters_loc:
settings.append("-c")
msg.warn(
"Incompatible arguments",
"The -f and -c arguments are deprecated, and not compatible "
"with the -j argument, which should specify the same "
"information. Either merge the frequencies and clusters data "
"into the JSONL-formatted file (recommended), or use only the "
"-f and -c files, without the other lexical attributes.",
)
jsonl_loc = ensure_path(jsonl_loc)
lex_attrs = srsly.read_jsonl(jsonl_loc)
else:
clusters_loc = ensure_path(clusters_loc)
freqs_loc = ensure_path(freqs_loc)
if freqs_loc is not None and not freqs_loc.exists():
msg.fail("Can't find words frequencies file", freqs_loc, exits=1)
lex_attrs = read_attrs_from_deprecated(msg, freqs_loc, clusters_loc)
with msg.loading("Creating blank pipeline..."):
nlp = create_model(lang, lex_attrs, name=model_name, base_model=base_model)
msg.good("Successfully created blank pipeline")
if vectors_loc is not None:
add_vectors(
msg, nlp, vectors_loc, truncate_vectors, prune_vectors, vectors_name
)
vec_added = len(nlp.vocab.vectors)
lex_added = len(nlp.vocab)
msg.good(
"Sucessfully compiled vocab", f"{lex_added} entries, {vec_added} vectors",
)
if not output_dir.exists():
output_dir.mkdir()
nlp.to_disk(output_dir)
return nlp
def open_file(loc: Union[str, Path]) -> IO:
"""Handle .gz, .tar.gz or unzipped files"""
loc = ensure_path(loc)
if tarfile.is_tarfile(str(loc)):
return tarfile.open(str(loc), "r:gz")
elif loc.parts[-1].endswith("gz"):
return (line.decode("utf8") for line in gzip.open(str(loc), "r"))
elif loc.parts[-1].endswith("zip"):
zip_file = zipfile.ZipFile(str(loc))
names = zip_file.namelist()
file_ = zip_file.open(names[0])
return (line.decode("utf8") for line in file_)
else:
return loc.open("r", encoding="utf8")
def read_attrs_from_deprecated(
msg: Printer, freqs_loc: Optional[Path], clusters_loc: Optional[Path]
) -> List[Dict[str, Any]]:
if freqs_loc is not None:
with msg.loading("Counting frequencies..."):
probs, _ = read_freqs(freqs_loc)
msg.good("Counted frequencies")
else:
probs, _ = ({}, DEFAULT_OOV_PROB) # noqa: F841
if clusters_loc:
with msg.loading("Reading clusters..."):
clusters = read_clusters(clusters_loc)
msg.good("Read clusters")
else:
clusters = {}
lex_attrs = []
sorted_probs = sorted(probs.items(), key=lambda item: item[1], reverse=True)
if len(sorted_probs):
for i, (word, prob) in tqdm(enumerate(sorted_probs)):
attrs = {"orth": word, "id": i, "prob": prob}
# Decode as a little-endian string, so that we can do & 15 to get
# the first 4 bits. See _parse_features.pyx
if word in clusters:
attrs["cluster"] = int(clusters[word][::-1], 2)
else:
attrs["cluster"] = 0
lex_attrs.append(attrs)
return lex_attrs
def create_model(
lang: str,
lex_attrs: List[Dict[str, Any]],
name: Optional[str] = None,
base_model: Optional[Union[str, Path]] = None,
) -> Language:
if base_model:
nlp = load_model(base_model)
# keep the tokenizer but remove any existing pipeline components due to
# potentially conflicting vectors
for pipe in nlp.pipe_names:
nlp.remove_pipe(pipe)
else:
lang_class = get_lang_class(lang)
nlp = lang_class()
for lexeme in nlp.vocab:
lexeme.rank = OOV_RANK
for attrs in lex_attrs:
if "settings" in attrs:
continue
lexeme = nlp.vocab[attrs["orth"]]
lexeme.set_attrs(**attrs)
if len(nlp.vocab):
oov_prob = min(lex.prob for lex in nlp.vocab) - 1
else:
oov_prob = DEFAULT_OOV_PROB
nlp.vocab.cfg.update({"oov_prob": oov_prob})
if name:
nlp.meta["name"] = name
return nlp
def add_vectors(
msg: Printer,
nlp: Language,
vectors_loc: Optional[Path],
truncate_vectors: int,
prune_vectors: int,
name: Optional[str] = None,
) -> None:
vectors_loc = ensure_path(vectors_loc)
if vectors_loc and vectors_loc.parts[-1].endswith(".npz"):
nlp.vocab.vectors = Vectors(data=numpy.load(vectors_loc.open("rb")))
for lex in nlp.vocab:
if lex.rank and lex.rank != OOV_RANK:
nlp.vocab.vectors.add(lex.orth, row=lex.rank)
else:
if vectors_loc:
with msg.loading(f"Reading vectors from {vectors_loc}"):
vectors_data, vector_keys = read_vectors(
msg, vectors_loc, truncate_vectors
)
msg.good(f"Loaded vectors from {vectors_loc}")
else:
vectors_data, vector_keys = (None, None)
if vector_keys is not None:
for word in vector_keys:
if word not in nlp.vocab:
nlp.vocab[word]
if vectors_data is not None:
nlp.vocab.vectors = Vectors(data=vectors_data, keys=vector_keys)
if name is None:
# TODO: Is this correct? Does this matter?
nlp.vocab.vectors.name = f"{nlp.meta['lang']}_{nlp.meta['name']}.vectors"
else:
nlp.vocab.vectors.name = name
nlp.meta["vectors"]["name"] = nlp.vocab.vectors.name
if prune_vectors >= 1:
nlp.vocab.prune_vectors(prune_vectors)
def read_vectors(msg: Printer, vectors_loc: Path, truncate_vectors: int):
f = open_file(vectors_loc)
f = ensure_shape(f)
shape = tuple(int(size) for size in next(f).split())
if truncate_vectors >= 1:
shape = (truncate_vectors, shape[1])
vectors_data = numpy.zeros(shape=shape, dtype="f")
vectors_keys = []
for i, line in enumerate(tqdm(f)):
line = line.rstrip()
pieces = line.rsplit(" ", vectors_data.shape[1])
word = pieces.pop(0)
if len(pieces) != vectors_data.shape[1]:
msg.fail(Errors.E094.format(line_num=i, loc=vectors_loc), exits=1)
vectors_data[i] = numpy.asarray(pieces, dtype="f")
vectors_keys.append(word)
if i == truncate_vectors - 1:
break
return vectors_data, vectors_keys
def ensure_shape(lines):
"""Ensure that the first line of the data is the vectors shape.
If it's not, we read in the data and output the shape as the first result,
so that the reader doesn't have to deal with the problem.
"""
first_line = next(lines)
try:
shape = tuple(int(size) for size in first_line.split())
except ValueError:
shape = None
if shape is not None:
# All good, give the data
yield first_line
yield from lines
else:
# Figure out the shape, make it the first value, and then give the
# rest of the data.
width = len(first_line.split()) - 1
captured = [first_line] + list(lines)
length = len(captured)
yield f"{length} {width}"
yield from captured
def read_freqs(
freqs_loc: Path, max_length: int = 100, min_doc_freq: int = 5, min_freq: int = 50
):
counts = PreshCounter()
total = 0
with freqs_loc.open() as f:
for i, line in enumerate(f):
freq, doc_freq, key = line.rstrip().split("\t", 2)
freq = int(freq)
counts.inc(i + 1, freq)
total += freq
counts.smooth()
log_total = math.log(total)
probs = {}
with freqs_loc.open() as f:
for line in tqdm(f):
freq, doc_freq, key = line.rstrip().split("\t", 2)
doc_freq = int(doc_freq)
freq = int(freq)
if doc_freq >= min_doc_freq and freq >= min_freq and len(key) < max_length:
try:
word = literal_eval(key)
except SyntaxError:
# Take odd strings literally.
word = literal_eval(f"'{key}'")
smooth_count = counts.smoother(int(freq))
probs[word] = math.log(smooth_count) - log_total
oov_prob = math.log(counts.smoother(0)) - log_total
return probs, oov_prob
def read_clusters(clusters_loc: Path) -> dict:
clusters = {}
if ftfy is None:
warnings.warn(Warnings.W004)
with clusters_loc.open() as f:
for line in tqdm(f):
try:
cluster, word, freq = line.split()
if ftfy is not None:
word = ftfy.fix_text(word)
except ValueError:
continue
# If the clusterer has only seen the word a few times, its
# cluster is unreliable.
if int(freq) >= 3:
clusters[word] = cluster
else:
clusters[word] = "0"
# Expand clusters with re-casing
for word, cluster in list(clusters.items()):
if word.lower() not in clusters:
clusters[word.lower()] = cluster
if word.title() not in clusters:
clusters[word.title()] = cluster
if word.upper() not in clusters:
clusters[word.upper()] = cluster
return clusters

spacy/cli/init_pipeline.py (new file, 117 lines)
View File

@@ -0,0 +1,117 @@
from typing import Optional
import logging
from pathlib import Path
from wasabi import msg
import typer
import srsly
from .. import util
from ..training.initialize import init_nlp, convert_vectors
from ..language import Language
from ._util import init_cli, Arg, Opt, parse_config_overrides, show_validation_error
from ._util import import_code, setup_gpu
@init_cli.command("vectors")
def init_vectors_cli(
# fmt: off
lang: str = Arg(..., help="The language of the nlp object to create"),
vectors_loc: Path = Arg(..., help="Vectors file in Word2Vec format", exists=True),
output_dir: Path = Arg(..., help="Pipeline output directory"),
prune: int = Opt(-1, "--prune", "-p", help="Optional number of vectors to prune to"),
truncate: int = Opt(0, "--truncate", "-t", help="Optional number of vectors to truncate to when reading in vectors file"),
name: Optional[str] = Opt(None, "--name", "-n", help="Optional name for the word vectors, e.g. en_core_web_lg.vectors"),
verbose: bool = Opt(False, "--verbose", "-V", "-VV", help="Display more information for debugging purposes"),
jsonl_loc: Optional[Path] = Opt(None, "--lexemes-jsonl", "-j", help="Location of JSONL-formatted attributes file", hidden=True),
# fmt: on
):
"""Convert word vectors for use with spaCy. Will export an nlp object that
you can use in the [initialize] block of your config to initialize
a model with vectors.
"""
util.logger.setLevel(logging.DEBUG if verbose else logging.INFO)
msg.info(f"Creating blank nlp object for language '{lang}'")
nlp = util.get_lang_class(lang)()
if jsonl_loc is not None:
update_lexemes(nlp, jsonl_loc)
convert_vectors(nlp, vectors_loc, truncate=truncate, prune=prune, name=name)
msg.good(f"Successfully converted {len(nlp.vocab.vectors)} vectors")
nlp.to_disk(output_dir)
msg.good(
"Saved nlp object with vectors to output directory. You can now use the "
"path to it in your config as the 'vectors' setting in [initialize.vocab].",
output_dir.resolve(),
)
def update_lexemes(nlp: Language, jsonl_loc: Path) -> None:
# Mostly used for backwards-compatibility and may be removed in the future
lex_attrs = srsly.read_jsonl(jsonl_loc)
for attrs in lex_attrs:
if "settings" in attrs:
continue
lexeme = nlp.vocab[attrs["orth"]]
lexeme.set_attrs(**attrs)
@init_cli.command(
"nlp",
context_settings={"allow_extra_args": True, "ignore_unknown_options": True},
hidden=True,
)
def init_pipeline_cli(
# fmt: off
ctx: typer.Context, # This is only used to read additional arguments
config_path: Path = Arg(..., help="Path to config file", exists=True),
output_path: Path = Arg(..., help="Output directory for the prepared data"),
code_path: Optional[Path] = Opt(None, "--code", "-c", help="Path to Python file with additional code (registered functions) to be imported"),
verbose: bool = Opt(False, "--verbose", "-V", "-VV", help="Display more information for debugging purposes"),
use_gpu: int = Opt(-1, "--gpu-id", "-g", help="GPU ID or -1 for CPU")
# fmt: on
):
util.logger.setLevel(logging.DEBUG if verbose else logging.INFO)
overrides = parse_config_overrides(ctx.args)
import_code(code_path)
setup_gpu(use_gpu)
with show_validation_error(config_path):
config = util.load_config(config_path, overrides=overrides)
with show_validation_error(hint_fill=False):
nlp = init_nlp(config, use_gpu=use_gpu)
nlp.to_disk(output_path)
msg.good(f"Saved initialized pipeline to {output_path}")
@init_cli.command(
"labels",
context_settings={"allow_extra_args": True, "ignore_unknown_options": True},
)
def init_labels_cli(
# fmt: off
ctx: typer.Context, # This is only used to read additional arguments
config_path: Path = Arg(..., help="Path to config file", exists=True),
output_path: Path = Arg(..., help="Output directory for the labels"),
code_path: Optional[Path] = Opt(None, "--code", "-c", help="Path to Python file with additional code (registered functions) to be imported"),
verbose: bool = Opt(False, "--verbose", "-V", "-VV", help="Display more information for debugging purposes"),
use_gpu: int = Opt(-1, "--gpu-id", "-g", help="GPU ID or -1 for CPU")
# fmt: on
):
"""Generate JSON files for the labels in the data. This helps speed up the
training process, since spaCy won't have to preprocess the data to
extract the labels."""
util.logger.setLevel(logging.DEBUG if verbose else logging.INFO)
if not output_path.exists():
output_path.mkdir()
overrides = parse_config_overrides(ctx.args)
import_code(code_path)
setup_gpu(use_gpu)
with show_validation_error(config_path):
config = util.load_config(config_path, overrides=overrides)
with show_validation_error(hint_fill=False):
nlp = init_nlp(config, use_gpu=use_gpu)
for name, component in nlp.pipeline:
if getattr(component, "label_data", None) is not None:
output_file = output_path / f"{name}.json"
srsly.write_json(output_file, component.label_data)
msg.good(f"Saving {name} labels to {output_file}")
else:
msg.info(f"No labels found for {name}")

View File

@@ -1,25 +1,13 @@
 from typing import Optional
-import numpy
-import time
-import re
-from collections import Counter
 from pathlib import Path
-from thinc.api import require_gpu, set_gpu_allocator
-from thinc.api import set_dropout_rate, to_categorical, fix_random_seed
-from thinc.api import Config, CosineDistance, L2Distance
 from wasabi import msg
-import srsly
-from functools import partial
 import typer
+import re
 from ._util import app, Arg, Opt, parse_config_overrides, show_validation_error
-from ._util import import_code
-from ..ml.models.multi_task import build_cloze_multi_task_model
-from ..ml.models.multi_task import build_cloze_characters_multi_task_model
-from ..tokens import Doc
-from ..attrs import ID
-from .. import util
-from ..util import dot_to_object
+from ._util import import_code, setup_gpu
+from ..training.pretrain import pretrain
+from ..util import load_config
 @app.command(
@@ -61,15 +49,11 @@ def pretrain_cli(
     config_overrides = parse_config_overrides(ctx.args)
     import_code(code_path)
     verify_cli_args(config_path, output_dir, resume_path, epoch_resume)
-    if use_gpu >= 0:
-        msg.info("Using GPU")
-        require_gpu(use_gpu)
-    else:
-        msg.info("Using CPU")
+    setup_gpu(use_gpu)
     msg.info(f"Loading config from: {config_path}")
     with show_validation_error(config_path):
-        raw_config = util.load_config(
+        raw_config = load_config(
             config_path, overrides=config_overrides, interpolate=False
         )
     config = raw_config.interpolate()
@@ -89,250 +73,11 @@ def pretrain_cli(
         resume_path=resume_path,
         epoch_resume=epoch_resume,
         use_gpu=use_gpu,
+        silent=False,
     )
def pretrain(
config: Config,
output_dir: Path,
resume_path: Optional[Path] = None,
epoch_resume: Optional[int] = None,
use_gpu: int = -1,
):
if config["training"]["seed"] is not None:
fix_random_seed(config["training"]["seed"])
allocator = config["training"]["gpu_allocator"]
if use_gpu >= 0 and allocator:
set_gpu_allocator(allocator)
nlp = util.load_model_from_config(config)
C = util.resolve_training_config(nlp.config)
P_cfg = C["pretraining"]
corpus = dot_to_object(C, P_cfg["corpus"])
batcher = P_cfg["batcher"]
model = create_pretraining_model(nlp, C["pretraining"])
optimizer = C["pretraining"]["optimizer"]
# Load in pretrained weights to resume from
if resume_path is not None:
_resume_model(model, resume_path, epoch_resume)
else:
# Without '--resume-path' the '--epoch-resume' argument is ignored
epoch_resume = 0
tracker = ProgressTracker(frequency=10000)
msg.divider(f"Pre-training tok2vec layer - starting at epoch {epoch_resume}")
row_settings = {"widths": (3, 10, 10, 6, 4), "aligns": ("r", "r", "r", "r", "r")}
msg.row(("#", "# Words", "Total Loss", "Loss", "w/s"), **row_settings)
def _save_model(epoch, is_temp=False):
is_temp_str = ".temp" if is_temp else ""
with model.use_params(optimizer.averages):
with (output_dir / f"model{epoch}{is_temp_str}.bin").open("wb") as file_:
file_.write(model.get_ref("tok2vec").to_bytes())
log = {
"nr_word": tracker.nr_word,
"loss": tracker.loss,
"epoch_loss": tracker.epoch_loss,
"epoch": epoch,
}
with (output_dir / "log.jsonl").open("a") as file_:
file_.write(srsly.json_dumps(log) + "\n")
objective = create_objective(P_cfg["objective"])
# TODO: I think we probably want this to look more like the
# 'create_train_batches' function?
for epoch in range(epoch_resume, P_cfg["max_epochs"]):
for batch_id, batch in enumerate(batcher(corpus(nlp))):
docs = ensure_docs(batch)
loss = make_update(model, docs, optimizer, objective)
progress = tracker.update(epoch, loss, docs)
if progress:
msg.row(progress, **row_settings)
if P_cfg["n_save_every"] and (batch_id % P_cfg["n_save_every"] == 0):
_save_model(epoch, is_temp=True)
_save_model(epoch)
tracker.epoch_loss = 0.0
msg.good("Successfully finished pretrain") msg.good("Successfully finished pretrain")
def ensure_docs(examples_or_docs):
docs = []
for eg_or_doc in examples_or_docs:
if isinstance(eg_or_doc, Doc):
docs.append(eg_or_doc)
else:
docs.append(eg_or_doc.reference)
return docs
def _resume_model(model, resume_path, epoch_resume):
msg.info(f"Resume training tok2vec from: {resume_path}")
with resume_path.open("rb") as file_:
weights_data = file_.read()
model.get_ref("tok2vec").from_bytes(weights_data)
# Parse the epoch number from the given weight file
model_name = re.search(r"model\d+\.bin", str(resume_path))
if model_name:
# Default weight file name so read epoch_start from it by cutting off 'model' and '.bin'
epoch_resume = int(model_name.group(0)[5:][:-4]) + 1
msg.info(f"Resuming from epoch: {epoch_resume}")
else:
msg.info(f"Resuming from epoch: {epoch_resume}")
def make_update(model, docs, optimizer, objective_func):
"""Perform an update over a single batch of documents.
docs (iterable): A batch of `Doc` objects.
optimizer (callable): An optimizer.
RETURNS loss: A float for the loss.
"""
predictions, backprop = model.begin_update(docs)
loss, gradients = objective_func(model.ops, docs, predictions)
backprop(gradients)
model.finish_update(optimizer)
# Don't want to return a cupy object here
# The gradients are modified in-place by the BERT MLM,
# so we get an accurate loss
return float(loss)
def create_objective(config):
"""Create the objective for pretraining.
We'd like to replace this with a registry function but it's tricky because
we're also making a model choice based on this. For now we hard-code support
for two types (characters, vectors). For characters you can specify
n_characters, for vectors you can specify the loss.
Bleh.
"""
objective_type = config["type"]
if objective_type == "characters":
return partial(get_characters_loss, nr_char=config["n_characters"])
elif objective_type == "vectors":
if config["loss"] == "cosine":
return partial(
get_vectors_loss,
distance=CosineDistance(normalize=True, ignore_zeros=True),
)
elif config["loss"] == "L2":
return partial(
get_vectors_loss, distance=L2Distance(normalize=True, ignore_zeros=True)
)
else:
raise ValueError("Unexpected loss type", config["loss"])
else:
raise ValueError("Unexpected objective_type", objective_type)
def get_vectors_loss(ops, docs, prediction, distance):
"""Compute a loss based on a distance between the documents' vectors and
the prediction.
"""
# The simplest way to implement this would be to vstack the
# token.vector values, but that's a bit inefficient, especially on GPU.
# Instead we fetch the index into the vectors table for each of our tokens,
# and look them up all at once. This prevents data copying.
ids = ops.flatten([doc.to_array(ID).ravel() for doc in docs])
target = docs[0].vocab.vectors.data[ids]
d_target, loss = distance(prediction, target)
return loss, d_target
def get_characters_loss(ops, docs, prediction, nr_char):
"""Compute a loss based on a number of characters predicted from the docs."""
target_ids = numpy.vstack([doc.to_utf8_array(nr_char=nr_char) for doc in docs])
target_ids = target_ids.reshape((-1,))
target = ops.asarray(to_categorical(target_ids, n_classes=256), dtype="f")
target = target.reshape((-1, 256 * nr_char))
diff = prediction - target
loss = (diff ** 2).sum()
d_target = diff / float(prediction.shape[0])
return loss, d_target
def create_pretraining_model(nlp, pretrain_config):
"""Define a network for the pretraining. We simply add an output layer onto
the tok2vec input model. The tok2vec input model needs to be a model that
takes a batch of Doc objects (as a list), and returns a list of arrays.
Each array in the output needs to have one row per token in the doc.
The actual tok2vec layer is stored as a reference, and only this bit will be
serialized to file and read back in when calling the 'train' command.
"""
component = nlp.get_pipe(pretrain_config["component"])
if pretrain_config.get("layer"):
tok2vec = component.model.get_ref(pretrain_config["layer"])
else:
tok2vec = component.model
# TODO
maxout_pieces = 3
hidden_size = 300
if pretrain_config["objective"]["type"] == "vectors":
model = build_cloze_multi_task_model(
nlp.vocab, tok2vec, hidden_size=hidden_size, maxout_pieces=maxout_pieces
)
elif pretrain_config["objective"]["type"] == "characters":
model = build_cloze_characters_multi_task_model(
nlp.vocab,
tok2vec,
hidden_size=hidden_size,
maxout_pieces=maxout_pieces,
nr_char=pretrain_config["objective"]["n_characters"],
)
model.initialize(X=[nlp.make_doc("Give it a doc to infer shapes")])
set_dropout_rate(model, pretrain_config["dropout"])
return model
class ProgressTracker:
def __init__(self, frequency=1000000):
self.loss = 0.0
self.prev_loss = 0.0
self.nr_word = 0
self.words_per_epoch = Counter()
self.frequency = frequency
self.last_time = time.time()
self.last_update = 0
self.epoch_loss = 0.0
def update(self, epoch, loss, docs):
self.loss += loss
self.epoch_loss += loss
words_in_batch = sum(len(doc) for doc in docs)
self.words_per_epoch[epoch] += words_in_batch
self.nr_word += words_in_batch
words_since_update = self.nr_word - self.last_update
if words_since_update >= self.frequency:
wps = words_since_update / (time.time() - self.last_time)
self.last_update = self.nr_word
self.last_time = time.time()
loss_per_word = self.loss - self.prev_loss
status = (
epoch,
self.nr_word,
_smart_round(self.loss, width=10),
_smart_round(loss_per_word, width=6),
int(wps),
)
self.prev_loss = float(self.loss)
return status
else:
return None
def _smart_round(figure, width=10, max_decimal=4):
"""Round large numbers as integers, smaller numbers as decimals."""
n_digits = len(str(int(figure)))
n_decimal = width - (n_digits + 1)
if n_decimal <= 1:
return str(int(figure))
else:
n_decimal = min(n_decimal, max_decimal)
format_str = "%." + str(n_decimal) + "f"
return format_str % figure
def verify_cli_args(config_path, output_dir, resume_path, epoch_resume):
    if not config_path or not config_path.exists():
        msg.fail("Config file not found", config_path, exits=1)

View File

@@ -134,7 +134,7 @@ def update_dvc_config(
 def run_dvc_commands(
-    commands: Iterable[str] = SimpleFrozenList(), flags: Dict[str, bool] = {},
+    commands: Iterable[str] = SimpleFrozenList(), flags: Dict[str, bool] = {}
 ) -> None:
     """Run a sequence of DVC commands in a subprocess, in order.

View File

@@ -4,8 +4,8 @@ can help generate the best possible configuration, given a user's requirements.
 {%- set use_transformer = (transformer_data and hardware != "cpu") -%}
 {%- set transformer = transformer_data[optimize] if use_transformer else {} -%}
 [paths]
-train = ""
-dev = ""
+train = null
+dev = null
 [system]
 {% if use_transformer -%}
@@ -37,6 +37,22 @@ tokenizer_config = {"use_fast": true}
 window = 128
 stride = 96
+{% if "morphologizer" in components %}
+[components.morphologizer]
+factory = "morphologizer"
+
+[components.morphologizer.model]
+@architectures = "spacy.Tagger.v1"
+nO = null
+
+[components.morphologizer.model.tok2vec]
+@architectures = "spacy-transformers.TransformerListener.v1"
+grad_factor = 1.0
+
+[components.morphologizer.model.tok2vec.pooling]
+@layers = "reduce_mean.v1"
+{%- endif %}
 {% if "tagger" in components %}
 [components.tagger]
 factory = "tagger"
@@ -166,6 +182,19 @@ depth = {{ 4 if optimize == "efficiency" else 8 }}
 window_size = 1
 maxout_pieces = 3
+{% if "morphologizer" in components %}
+[components.morphologizer]
+factory = "morphologizer"
+
+[components.morphologizer.model]
+@architectures = "spacy.Tagger.v1"
+nO = null
+
+[components.morphologizer.model.tok2vec]
+@architectures = "spacy.Tok2VecListener.v1"
+width = ${components.tok2vec.model.encode.width}
+{%- endif %}
 {% if "tagger" in components %}
 [components.tagger]
 factory = "tagger"
@@ -257,7 +286,7 @@ no_output_layer = false
 {% endif %}
 {% for pipe in components %}
-{% if pipe not in ["tagger", "parser", "ner", "textcat", "entity_linker"] %}
+{% if pipe not in ["tagger", "morphologizer", "parser", "ner", "textcat", "entity_linker"] %}
 {# Other components defined by the user: we just assume they're factories #}
 [components.{{ pipe }}]
 factory = "{{ pipe }}"
@@ -270,7 +299,6 @@ factory = "{{ pipe }}"
 @readers = "spacy.Corpus.v1"
 path = ${paths.train}
 max_length = {{ 500 if hardware == "gpu" else 2000 }}
-augmenter = {"@augmenters": "spacy.orth_variants.v1", "level": 0.1, "lower": 0.5}
 [corpora.dev]
 @readers = "spacy.Corpus.v1"
@@ -278,11 +306,6 @@ path = ${paths.dev}
 max_length = 0
 [training]
-{% if use_transformer or optimize == "efficiency" or not word_vectors -%}
-vectors = null
-{% else -%}
-vectors = "{{ word_vectors }}"
-{% endif -%}
 {% if use_transformer -%}
 accumulate_gradient = {{ transformer["size_factor"] }}
 {% endif -%}
@@ -318,3 +341,10 @@ start = 100
 stop = 1000
 compound = 1.001
 {% endif %}
+
+[initialize]
+{% if use_transformer or optimize == "efficiency" or not word_vectors -%}
+vectors = null
+{% else -%}
+vectors = "{{ word_vectors }}"
+{% endif -%}

View File

@@ -1,23 +1,14 @@
-from typing import Optional, Dict, Any, Tuple, Union, Callable, List
-from timeit import default_timer as timer
-import srsly
-import tqdm
+from typing import Optional
 from pathlib import Path
 from wasabi import msg
-import thinc
-import thinc.schedules
-from thinc.api import Config, Optimizer, require_gpu, fix_random_seed, set_gpu_allocator
-import random
 import typer
 import logging
 from ._util import app, Arg, Opt, parse_config_overrides, show_validation_error
-from ._util import import_code, get_sourced_components
-from ..language import Language
+from ._util import import_code, setup_gpu
+from ..training.loop import train
+from ..training.initialize import init_nlp
 from .. import util
-from ..training.example import Example
-from ..errors import Errors
-from ..util import dot_to_object
 @app.command(
@@ -30,8 +21,7 @@ def train_cli(
     output_path: Optional[Path] = Opt(None, "--output", "--output-path", "-o", help="Output directory to store trained pipeline in"),
     code_path: Optional[Path] = Opt(None, "--code", "-c", help="Path to Python file with additional code (registered functions) to be imported"),
     verbose: bool = Opt(False, "--verbose", "-V", "-VV", help="Display more information for debugging purposes"),
-    use_gpu: int = Opt(-1, "--gpu-id", "-g", help="GPU ID or -1 for CPU"),
-    resume: bool = Opt(False, "--resume", "-R", help="Resume training"),
+    use_gpu: int = Opt(-1, "--gpu-id", "-g", help="GPU ID or -1 for CPU")
     # fmt: on
 ):
     """
@@ -48,393 +38,19 @@ def train_cli(
     DOCS: https://nightly.spacy.io/api/cli#train
     """
-    util.logger.setLevel(logging.DEBUG if verbose else logging.ERROR)
+    util.logger.setLevel(logging.DEBUG if verbose else logging.INFO)
     verify_cli_args(config_path, output_path)
     overrides = parse_config_overrides(ctx.args)
     import_code(code_path)
-    train(
-        config_path,
-        output_path=output_path,
-        config_overrides=overrides,
-        use_gpu=use_gpu,
-        resume_training=resume,
-    )
-def train(
-    config_path: Path,
-    output_path: Optional[Path] = None,
-    config_overrides: Dict[str, Any] = {},
-    use_gpu: int = -1,
-    resume_training: bool = False,
-) -> None:
-    if use_gpu >= 0:
-        msg.info(f"Using GPU: {use_gpu}")
-        require_gpu(use_gpu)
-    else:
-        msg.info("Using CPU")
-    msg.info(f"Loading config and nlp from: {config_path}")
+    setup_gpu(use_gpu)
     with show_validation_error(config_path):
-        # Keep an un-interpolated config so we can preserve variables in
-        # the final nlp object we train and serialize
-        raw_config = util.load_config(
-            config_path, overrides=config_overrides, interpolate=False
-        )
-    config = raw_config.interpolate()
-    if config["training"]["seed"] is not None:
-        fix_random_seed(config["training"]["seed"])
-    allocator = config["training"]["gpu_allocator"]
-    if use_gpu >= 0 and allocator:
-        set_gpu_allocator(allocator)
-    # Use original config here before it's resolved to functions
-    sourced_components = get_sourced_components(config)
-    with show_validation_error(config_path):
-        nlp = util.load_model_from_config(raw_config)
-    # Resolve all training-relevant sections using the filled nlp config
-    C = util.resolve_training_config(nlp.config)
-    util.load_vocab_data_into_model(nlp, lookups=C["training"]["lookups"])
+        config = util.load_config(config_path, overrides=overrides, interpolate=False)
+    msg.divider("Initializing pipeline")
+    with show_validation_error(config_path, hint_fill=False):
+        nlp = init_nlp(config, use_gpu=use_gpu)
+    msg.good("Initialized pipeline")
+    msg.divider("Training pipeline")
+    train(nlp, output_path, use_gpu=use_gpu, silent=False)
if C["training"]["vectors"] is not None:
add_vectors(nlp, C["training"]["vectors"])
raw_text, tag_map, morph_rules, weights_data = load_from_paths(C)
T_cfg = C["training"]
optimizer = T_cfg["optimizer"]
train_corpus = dot_to_object(C, T_cfg["train_corpus"])
dev_corpus = dot_to_object(C, T_cfg["dev_corpus"])
batcher = T_cfg["batcher"]
train_logger = T_cfg["logger"]
before_to_disk = create_before_to_disk_callback(T_cfg["before_to_disk"])
# Components that shouldn't be updated during training
frozen_components = T_cfg["frozen_components"]
# Sourced components that require resume_training
resume_components = [p for p in sourced_components if p not in frozen_components]
msg.info(f"Pipeline: {nlp.pipe_names}")
if resume_components:
with nlp.select_pipes(enable=resume_components):
msg.info(f"Resuming training for: {resume_components}")
nlp.resume_training(sgd=optimizer)
with nlp.select_pipes(disable=[*frozen_components, *resume_components]):
nlp.begin_training(lambda: train_corpus(nlp), sgd=optimizer)
# Verify the config after calling 'begin_training' to ensure labels are properly initialized
verify_config(nlp)
if tag_map:
# Replace tag map with provided mapping
nlp.vocab.morphology.load_tag_map(tag_map)
if morph_rules:
# Load morph rules
nlp.vocab.morphology.load_morph_exceptions(morph_rules)
# Load pretrained tok2vec weights - cf. CLI command 'pretrain'
if weights_data is not None:
tok2vec_component = C["pretraining"]["component"]
if tok2vec_component is None:
msg.fail(
f"To use pretrained tok2vec weights, [pretraining.component] "
f"needs to specify the component that should load them.",
exits=1,
)
layer = nlp.get_pipe(tok2vec_component).model
tok2vec_layer = C["pretraining"]["layer"]
if tok2vec_layer:
layer = layer.get_ref(tok2vec_layer)
layer.from_bytes(weights_data)
msg.info(f"Loaded pretrained weights into component '{tok2vec_component}'")
# Create iterator, which yields out info after each optimization step.
msg.info("Start training")
score_weights = T_cfg["score_weights"]
training_step_iterator = train_while_improving(
nlp,
optimizer,
create_train_batches(train_corpus(nlp), batcher, T_cfg["max_epochs"]),
create_evaluation_callback(nlp, dev_corpus, score_weights),
dropout=T_cfg["dropout"],
accumulate_gradient=T_cfg["accumulate_gradient"],
patience=T_cfg["patience"],
max_steps=T_cfg["max_steps"],
eval_frequency=T_cfg["eval_frequency"],
raw_text=None,
exclude=frozen_components,
)
msg.info(f"Training. Initial learn rate: {optimizer.learn_rate}")
with nlp.select_pipes(disable=frozen_components):
print_row, finalize_logger = train_logger(nlp)
try:
progress = tqdm.tqdm(total=T_cfg["eval_frequency"], leave=False)
progress.set_description(f"Epoch 1")
for batch, info, is_best_checkpoint in training_step_iterator:
progress.update(1)
if is_best_checkpoint is not None:
progress.close()
print_row(info)
if is_best_checkpoint and output_path is not None:
with nlp.select_pipes(disable=frozen_components):
update_meta(T_cfg, nlp, info)
with nlp.use_params(optimizer.averages):
nlp = before_to_disk(nlp)
nlp.to_disk(output_path / "model-best")
progress = tqdm.tqdm(total=T_cfg["eval_frequency"], leave=False)
progress.set_description(f"Epoch {info['epoch']}")
except Exception as e:
finalize_logger()
if output_path is not None:
# We don't want to swallow the traceback if we don't have a
# specific error.
msg.warn(
f"Aborting and saving the final best model. "
f"Encountered exception: {str(e)}"
)
nlp = before_to_disk(nlp)
nlp.to_disk(output_path / "model-final")
raise e
finally:
finalize_logger()
if output_path is not None:
final_model_path = output_path / "model-final"
if optimizer.averages:
with nlp.use_params(optimizer.averages):
nlp.to_disk(final_model_path)
else:
nlp.to_disk(final_model_path)
msg.good(f"Saved pipeline to output directory {final_model_path}")
def add_vectors(nlp: Language, vectors: str) -> None:
title = f"Config validation error for vectors {vectors}"
desc = (
"This typically means that there's a problem in the config.cfg included "
"with the packaged vectors. Make sure that the vectors package you're "
"loading is compatible with the current version of spaCy."
)
with show_validation_error(
title=title, desc=desc, hint_fill=False, show_config=False
):
util.load_vectors_into_model(nlp, vectors)
def create_train_batches(iterator, batcher, max_epochs: int):
epoch = 0
examples = list(iterator)
if not examples:
# Raise error if no data
raise ValueError(Errors.E986)
while max_epochs < 1 or epoch != max_epochs:
random.shuffle(examples)
for batch in batcher(examples):
yield epoch, batch
epoch += 1
def create_evaluation_callback(
nlp: Language, dev_corpus: Callable, weights: Dict[str, float]
) -> Callable[[], Tuple[float, Dict[str, float]]]:
weights = {key: value for key, value in weights.items() if value is not None}
def evaluate() -> Tuple[float, Dict[str, float]]:
dev_examples = list(dev_corpus(nlp))
scores = nlp.evaluate(dev_examples)
# Calculate a weighted sum based on score_weights for the main score.
# We can only consider scores that are ints/floats, not dicts like
# entity scores per type etc.
for key, value in scores.items():
if key in weights and not isinstance(value, (int, float)):
raise ValueError(Errors.E915.format(name=key, score_type=type(value)))
try:
weighted_score = sum(
scores.get(s, 0.0) * weights.get(s, 0.0) for s in weights
)
except KeyError as e:
keys = list(scores.keys())
err = Errors.E983.format(dict="score_weights", key=str(e), keys=keys)
raise KeyError(err) from None
return weighted_score, scores
return evaluate
def create_before_to_disk_callback(
callback: Optional[Callable[[Language], Language]]
) -> Callable[[Language], Language]:
def before_to_disk(nlp: Language) -> Language:
if not callback:
return nlp
modified_nlp = callback(nlp)
if not isinstance(modified_nlp, Language):
err = Errors.E914.format(name="before_to_disk", value=type(modified_nlp))
raise ValueError(err)
return modified_nlp
return before_to_disk
def train_while_improving(
nlp: Language,
optimizer: Optimizer,
train_data,
evaluate,
*,
dropout: float,
eval_frequency: int,
accumulate_gradient: int,
patience: int,
max_steps: int,
raw_text: List[Dict[str, str]],
exclude: List[str],
):
"""Train until an evaluation stops improving. Works as a generator,
with each iteration yielding a tuple `(batch, info, is_best_checkpoint)`,
where info is a dict, and is_best_checkpoint is in [True, False, None] --
None indicating that the iteration was not evaluated as a checkpoint.
The evaluation is conducted by calling the evaluate callback.
Positional arguments:
nlp: The spaCy pipeline to evaluate.
optimizer: The optimizer callable.
train_data (Iterable[Batch]): A generator of batches, with the training
data. Each batch should be a Sized[Tuple[Input, Annot]]. The training
data iterable needs to take care of iterating over the epochs and
shuffling.
evaluate (Callable[[], Tuple[float, Any]]): A callback to perform evaluation.
The callback should take no arguments and return a tuple
`(main_score, other_scores)`. The main_score should be a float where
higher is better. other_scores can be any object.
Every iteration, the function yields out a tuple with:
* batch: A list of Example objects.
* info: A dict with various information about the last update (see below).
* is_best_checkpoint: A value in None, False, True, indicating whether this
was the best evaluation so far. You should use this to save the model
checkpoints during training. If None, evaluation was not conducted on
that iteration. False means evaluation was conducted, but a previous
evaluation was better.
The info dict provides the following information:
epoch (int): How many passes over the data have been completed.
step (int): How many steps have been completed.
score (float): The main score from the last evaluation.
other_scores: : The other scores from the last evaluation.
losses: The accumulated losses throughout training.
checkpoints: A list of previous results, where each result is a
(score, step, epoch) tuple.
"""
if isinstance(dropout, float):
dropouts = thinc.schedules.constant(dropout)
else:
dropouts = dropout
results = []
losses = {}
if raw_text:
random.shuffle(raw_text)
raw_examples = [
Example.from_dict(nlp.make_doc(rt["text"]), {}) for rt in raw_text
]
raw_batches = util.minibatch(raw_examples, size=8)
words_seen = 0
start_time = timer()
for step, (epoch, batch) in enumerate(train_data):
dropout = next(dropouts)
for subbatch in subdivide_batch(batch, accumulate_gradient):
nlp.update(
subbatch, drop=dropout, losses=losses, sgd=False, exclude=exclude
)
if raw_text:
# If raw text is available, perform 'rehearsal' updates,
# which use unlabelled data to reduce overfitting.
raw_batch = list(next(raw_batches))
nlp.rehearse(raw_batch, sgd=optimizer, losses=losses, exclude=exclude)
# TODO: refactor this so we don't have to run it separately in here
for name, proc in nlp.pipeline:
if (
name not in exclude
and hasattr(proc, "model")
and proc.model not in (True, False, None)
):
proc.model.finish_update(optimizer)
optimizer.step_schedules()
if not (step % eval_frequency):
if optimizer.averages:
with nlp.use_params(optimizer.averages):
score, other_scores = evaluate()
else:
score, other_scores = evaluate()
results.append((score, step))
is_best_checkpoint = score == max(results)[0]
else:
score, other_scores = (None, None)
is_best_checkpoint = None
words_seen += sum(len(eg) for eg in batch)
info = {
"epoch": epoch,
"step": step,
"score": score,
"other_scores": other_scores,
"losses": losses,
"checkpoints": results,
"seconds": int(timer() - start_time),
"words": words_seen,
}
yield batch, info, is_best_checkpoint
if is_best_checkpoint is not None:
losses = {}
# Stop if no improvement in `patience` updates (if specified)
best_score, best_step = max(results)
if patience and (step - best_step) >= patience:
break
# Stop if we've exhausted our max steps (if specified)
if max_steps and step >= max_steps:
break
def subdivide_batch(batch, accumulate_gradient):
batch = list(batch)
batch.sort(key=lambda eg: len(eg.predicted))
sub_len = len(batch) // accumulate_gradient
start = 0
for i in range(accumulate_gradient):
subbatch = batch[start : start + sub_len]
if subbatch:
yield subbatch
start += len(subbatch)
subbatch = batch[start:]
if subbatch:
yield subbatch
def update_meta(
training: Union[Dict[str, Any], Config], nlp: Language, info: Dict[str, Any]
) -> None:
nlp.meta["performance"] = {}
for metric in training["score_weights"]:
if metric is not None:
nlp.meta["performance"][metric] = info["other_scores"].get(metric, 0.0)
for pipe_name in nlp.pipe_names:
nlp.meta["performance"][f"{pipe_name}_loss"] = info["losses"][pipe_name]
def load_from_paths(
config: Config,
) -> Tuple[List[Dict[str, str]], Dict[str, dict], bytes]:
# TODO: separate checks from loading
raw_text = util.ensure_path(config["training"]["raw_text"])
if raw_text is not None:
if not raw_text.exists():
msg.fail("Can't find raw text", raw_text, exits=1)
raw_text = list(srsly.read_jsonl(config["training"]["raw_text"]))
tag_map = {}
morph_rules = {}
weights_data = None
init_tok2vec = util.ensure_path(config["training"]["init_tok2vec"])
if init_tok2vec is not None:
if not init_tok2vec.exists():
msg.fail("Can't find pretrained tok2vec", init_tok2vec, exits=1)
with init_tok2vec.open("rb") as file_:
weights_data = file_.read()
return raw_text, tag_map, morph_rules, weights_data
def verify_cli_args(config_path: Path, output_path: Optional[Path] = None) -> None: def verify_cli_args(config_path: Path, output_path: Optional[Path] = None) -> None:
@ -445,30 +61,3 @@ def verify_cli_args(config_path: Path, output_path: Optional[Path] = None) -> None:
if not output_path.exists(): if not output_path.exists():
output_path.mkdir() output_path.mkdir()
msg.good(f"Created output directory: {output_path}") msg.good(f"Created output directory: {output_path}")
def verify_config(nlp: Language) -> None:
"""Perform additional checks based on the config, loaded nlp object and training data."""
# TODO: maybe we should validate based on the actual components, the list
# in config["nlp"]["pipeline"] instead?
for pipe_config in nlp.config["components"].values():
# We can't assume that the component name == the factory
factory = pipe_config["factory"]
if factory == "textcat":
verify_textcat_config(nlp, pipe_config)
def verify_textcat_config(nlp: Language, pipe_config: Dict[str, Any]) -> None:
# if 'positive_label' is provided: double check whether it's in the data and
# the task is binary
if pipe_config.get("positive_label"):
textcat_labels = nlp.get_pipe("textcat").labels
pos_label = pipe_config.get("positive_label")
if pos_label not in textcat_labels:
raise ValueError(
Errors.E920.format(pos_label=pos_label, labels=textcat_labels)
)
if len(list(textcat_labels)) != 2:
raise ValueError(
Errors.E919.format(pos_label=pos_label, labels=textcat_labels)
)
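After this refactor the CLI is a thin wrapper: loading the config, initializing the pipeline and running the loop are delegated to the helpers imported at the top of the file. A minimal programmatic sketch mirroring the new train_cli body (file and directory names are placeholders):

from pathlib import Path
from spacy import util
from spacy.training.initialize import init_nlp
from spacy.training.loop import train

config = util.load_config(Path("config.cfg"), overrides={}, interpolate=False)
nlp = init_nlp(config, use_gpu=-1)                             # build and initialize the pipeline
train(nlp, Path("training_output"), use_gpu=-1, silent=False)  # run the training loop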

View File

@ -1,7 +1,7 @@
[paths] [paths]
train = "" train = null
dev = "" dev = null
raw = null vectors = null
init_tok2vec = null init_tok2vec = null
[system] [system]
@ -10,8 +10,13 @@ gpu_allocator = null
[nlp] [nlp]
lang = null lang = null
# List of pipeline component names, in order. The names should correspond to
# components defined in the [components block]
pipeline = [] pipeline = []
# Components that are loaded but disabled by default
disabled = [] disabled = []
# Optional callbacks to modify the nlp object before it's initialized, after
# it's created and after the pipeline has been set up
before_creation = null before_creation = null
after_creation = null after_creation = null
after_pipeline_creation = null after_pipeline_creation = null
@ -19,6 +24,7 @@ after_pipeline_creation = null
[nlp.tokenizer] [nlp.tokenizer]
@tokenizers = "spacy.Tokenizer.v1" @tokenizers = "spacy.Tokenizer.v1"
# The pipeline components and their models
[components] [components]
# Readers for corpora like dev and train. # Readers for corpora like dev and train.
@ -37,9 +43,8 @@ max_length = 0
limit = 0 limit = 0
# Apply some simply data augmentation, where we replace tokens with variations. # Apply some simply data augmentation, where we replace tokens with variations.
# This is especially useful for punctuation and case replacement, to help # This is especially useful for punctuation and case replacement, to help
# generalize beyond corpora that don't have smart-quotes, or only have smart # generalize beyond corpora that don't/only have smart quotes etc.
# quotes, etc. augmenter = null
augmenter = {"@augmenters": "spacy.orth_variants.v1", "level": 0.1, "lower": 0.5}
augmenter = {"@augmenters": "spacy.orth_variants.v1", "level": 0.1, "lower": 0.5}
[corpora.dev] [corpora.dev]
@readers = "spacy.Corpus.v1" @readers = "spacy.Corpus.v1"
@ -52,6 +57,8 @@ gold_preproc = false
max_length = 0 max_length = 0
# Limitation on number of training examples # Limitation on number of training examples
limit = 0 limit = 0
# Optional callback for data augmentation
augmenter = null
# Training hyper-parameters and additional features. # Training hyper-parameters and additional features.
[training] [training]
@ -59,11 +66,6 @@ seed = ${system.seed}
gpu_allocator = ${system.gpu_allocator} gpu_allocator = ${system.gpu_allocator}
dropout = 0.1 dropout = 0.1
accumulate_gradient = 1 accumulate_gradient = 1
# Extra resources for transfer-learning or pseudo-rehearsal
init_tok2vec = ${paths.init_tok2vec}
raw_text = ${paths.raw}
vectors = null
lookups = null
# Controls early-stopping. 0 or -1 mean unlimited. # Controls early-stopping. 0 or -1 mean unlimited.
patience = 1600 patience = 1600
max_epochs = 0 max_epochs = 0
@ -104,3 +106,19 @@ grad_clip = 1.0
use_averages = false use_averages = false
eps = 1e-8 eps = 1e-8
learn_rate = 0.001 learn_rate = 0.001
# These settings are used when nlp.initialize() is called (typically before
# training or pretraining). Components and the tokenizer can each define their
# own arguments via their initialize methods that are populated by the config.
# This lets them gather data resources, build label sets etc.
[initialize]
vectors = ${paths.vectors}
# Extra resources for transfer-learning or pseudo-rehearsal
init_tok2vec = ${paths.init_tok2vec}
# Data and lookups for vocabulary
vocab_data = null
lookups = null
# Arguments passed to the tokenizer's initialize method
tokenizer = {}
# Arguments for initialize methods of the components (keyed by component)
components = {}
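A hedged sketch of how this block is consumed: nothing under [initialize] is touched at runtime; it is only read when nlp.initialize() runs before training or pretraining. Assuming config.cfg is a filled config for a concrete pipeline that keeps this section (the path is a placeholder):

from spacy import util

config = util.load_config("config.cfg")
nlp = util.load_model_from_config(config)
# Reads [initialize]: vectors, init_tok2vec, vocab_data, lookups, plus the
# tokenizer and per-component argument blocks.
nlp.initialize()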

View File

@ -1,3 +1,6 @@
[paths]
raw_text = null
[pretraining] [pretraining]
max_epochs = 1000 max_epochs = 1000
dropout = 0.2 dropout = 0.2
@ -31,8 +34,8 @@ learn_rate = 0.001
[corpora] [corpora]
[corpora.pretrain] [corpora.pretrain]
@readers = "spacy.JsonlReader.v1" @readers = "spacy.JsonlCorpus.v1"
path = ${paths.raw} path = ${paths.raw_text}
min_length = 5 min_length = 5
max_length = 500 max_length = 500
limit = 0 limit = 0
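With the path variable renamed from paths.raw to paths.raw_text, the raw data presumably has to be supplied as --paths.raw_text (or set in the config) when running pretraining. The renamed reader can also be resolved straight from the registry; a hedged sketch with a placeholder JSONL file, mirroring the parameters above:

from spacy.util import registry

make_jsonl_corpus = registry.readers.get("spacy.JsonlCorpus.v1")
corpus = make_jsonl_corpus("raw_text.jsonl", min_length=5, max_length=500, limit=0)
# `corpus` is a callable that takes an nlp object and yields unannotated Example objects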

View File

@ -85,6 +85,7 @@ class Warnings:
"attribute or operator.") "attribute or operator.")
# TODO: fix numbering after merging develop into master # TODO: fix numbering after merging develop into master
W089 = ("The nlp.begin_training method has been renamed to nlp.initialize.")
W090 = ("Could not locate any {format} files in path '{path}'.") W090 = ("Could not locate any {format} files in path '{path}'.")
W091 = ("Could not clean/remove the temp directory at {dir}: {msg}.") W091 = ("Could not clean/remove the temp directory at {dir}: {msg}.")
W092 = ("Ignoring annotations for sentence starts, as dependency heads are set.") W092 = ("Ignoring annotations for sentence starts, as dependency heads are set.")
@ -306,7 +307,7 @@ class Errors:
"settings: {opts}") "settings: {opts}")
E107 = ("Value of doc._.{attr} is not JSON-serializable: {value}") E107 = ("Value of doc._.{attr} is not JSON-serializable: {value}")
E109 = ("Component '{name}' could not be run. Did you forget to " E109 = ("Component '{name}' could not be run. Did you forget to "
"call begin_training()?") "call initialize()?")
E110 = ("Invalid displaCy render wrapper. Expected callable, got: {obj}") E110 = ("Invalid displaCy render wrapper. Expected callable, got: {obj}")
E111 = ("Pickling a token is not supported, because tokens are only views " E111 = ("Pickling a token is not supported, because tokens are only views "
"of the parent Doc and can't exist on their own. A pickled token " "of the parent Doc and can't exist on their own. A pickled token "
@ -376,7 +377,7 @@ class Errors:
"provided {found}.") "provided {found}.")
E143 = ("Labels for component '{name}' not initialized. This can be fixed " E143 = ("Labels for component '{name}' not initialized. This can be fixed "
"by calling add_label, or by providing a representative batch of " "by calling add_label, or by providing a representative batch of "
"examples to the component's begin_training method.") "examples to the component's initialize method.")
E145 = ("Error reading `{param}` from input file.") E145 = ("Error reading `{param}` from input file.")
E146 = ("Could not access `{path}`.") E146 = ("Could not access `{path}`.")
E147 = ("Unexpected error in the {method} functionality of the " E147 = ("Unexpected error in the {method} functionality of the "
@ -418,7 +419,7 @@ class Errors:
E164 = ("x is neither increasing nor decreasing: {}.") E164 = ("x is neither increasing nor decreasing: {}.")
E165 = ("Only one class present in y_true. ROC AUC score is not defined in " E165 = ("Only one class present in y_true. ROC AUC score is not defined in "
"that case.") "that case.")
E166 = ("Can only merge DocBins with the same pre-defined attributes.\n" E166 = ("Can only merge DocBins with the same value for '{param}'.\n"
"Current DocBin: {current}\nOther DocBin: {other}") "Current DocBin: {current}\nOther DocBin: {other}")
E169 = ("Can't find module: {module}") E169 = ("Can't find module: {module}")
E170 = ("Cannot apply transition {name}: invalid for the current state.") E170 = ("Cannot apply transition {name}: invalid for the current state.")
@ -476,6 +477,10 @@ class Errors:
E201 = ("Span index out of range.") E201 = ("Span index out of range.")
# TODO: fix numbering after merging develop into master # TODO: fix numbering after merging develop into master
E912 = ("Failed to initialize lemmatizer. Missing lemmatizer table(s) found "
"for mode '{mode}'. Required tables: {tables}. Found: {found}.")
E913 = ("Corpus path can't be None. Maybe you forgot to define it in your "
"config.cfg or override it on the CLI?")
E914 = ("Executing {name} callback failed. Expected the function to " E914 = ("Executing {name} callback failed. Expected the function to "
"return the nlp object but got: {value}. Maybe you forgot to return " "return the nlp object but got: {value}. Maybe you forgot to return "
"the modified object in your function?") "the modified object in your function?")
@ -517,7 +522,7 @@ class Errors:
"but the provided argument {loc} points to a file.") "but the provided argument {loc} points to a file.")
E929 = ("A 'KnowledgeBase' could not be read from {loc} - the path does " E929 = ("A 'KnowledgeBase' could not be read from {loc} - the path does "
"not seem to exist.") "not seem to exist.")
E930 = ("Received invalid get_examples callback in {name}.begin_training. " E930 = ("Received invalid get_examples callback in {name}.initialize. "
"Expected function that returns an iterable of Example objects but " "Expected function that returns an iterable of Example objects but "
"got: {obj}") "got: {obj}")
E931 = ("Encountered Pipe subclass without Pipe.{method} method in component " E931 = ("Encountered Pipe subclass without Pipe.{method} method in component "
@ -553,7 +558,10 @@ class Errors:
E953 = ("Mismatched IDs received by the Tok2Vec listener: {id1} vs. {id2}") E953 = ("Mismatched IDs received by the Tok2Vec listener: {id1} vs. {id2}")
E954 = ("The Tok2Vec listener did not receive any valid input from an upstream " E954 = ("The Tok2Vec listener did not receive any valid input from an upstream "
"component.") "component.")
E955 = ("Can't find table(s) '{table}' for language '{lang}' in spacy-lookups-data.") E955 = ("Can't find table(s) {table} for language '{lang}' in "
"spacy-lookups-data. Make sure you have the package installed or "
"provide your own lookup tables if no default lookups are available "
"for your language.")
E956 = ("Can't find component '{name}' in [components] block in the config. " E956 = ("Can't find component '{name}' in [components] block in the config. "
"Available components: {opts}") "Available components: {opts}")
E957 = ("Writing directly to Language.factories isn't needed anymore in " E957 = ("Writing directly to Language.factories isn't needed anymore in "
@ -670,18 +678,17 @@ class Errors:
"'{token_attrs}'.") "'{token_attrs}'.")
E999 = ("Unable to merge the `Doc` objects because they do not all share " E999 = ("Unable to merge the `Doc` objects because they do not all share "
"the same `Vocab`.") "the same `Vocab`.")
E1000 = ("No pkuseg model available. Provide a pkuseg model when " E1000 = ("The Chinese word segmenter is pkuseg but no pkuseg model was "
"initializing the pipeline:\n" "loaded. Provide the name of a pretrained model or the path to "
'cfg = {"tokenizer": {"segmenter": "pkuseg", "pkuseg_model": name_or_path}}\n' "a model and initialize the pipeline:\n\n"
'nlp = Chinese(config=cfg)') 'nlp.tokenizer.initialize(pkuseg_model="default")')
E1001 = ("Target token outside of matched span for match with tokens " E1001 = ("Target token outside of matched span for match with tokens "
"'{span}' and offset '{index}' matched by patterns '{patterns}'.") "'{span}' and offset '{index}' matched by patterns '{patterns}'.")
E1002 = ("Span index out of range.") E1002 = ("Span index out of range.")
E1003 = ("Unsupported lemmatizer mode '{mode}'.") E1003 = ("Unsupported lemmatizer mode '{mode}'.")
E1004 = ("Missing lemmatizer table(s) found for lemmatizer mode '{mode}'. " E1004 = ("Missing lemmatizer table(s) found for lemmatizer mode '{mode}'. "
"Required tables '{tables}', found '{found}'. If you are not " "Required tables: {tables}. Found: {found}. Maybe you forgot to "
"providing custom lookups, make sure you have the package " "call nlp.initialize() to load in the data?")
"spacy-lookups-data installed.")
E1005 = ("Unable to set attribute '{attr}' in tokenizer exception for " E1005 = ("Unable to set attribute '{attr}' in tokenizer exception for "
"'{chunk}'. Tokenizer exceptions are only allowed to specify " "'{chunk}'. Tokenizer exceptions are only allowed to specify "
"`ORTH` and `NORM`.") "`ORTH` and `NORM`.")
@ -698,6 +705,9 @@ class Errors:
"options: {modes}") "options: {modes}")
E1012 = ("Entity spans and blocked/missing/outside spans should be " E1012 = ("Entity spans and blocked/missing/outside spans should be "
"provided to doc.set_ents as lists of `Span` objects.") "provided to doc.set_ents as lists of `Span` objects.")
E1013 = ("Invalid morph: the MorphAnalysis must have the same vocab as the "
"token itself. To set the morph from this MorphAnalysis, set from "
"the string value with: `token.set_morph(str(other_morph))`.")
@add_codes @add_codes
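Several of the reworded messages above track the begin_training-to-initialize rename (W089, E109, E143, E930), and the new E1013 spells out how to transfer morphology between tokens from different vocabs. A hedged, self-contained illustration of that workaround:

import spacy

nlp_a = spacy.blank("en")
nlp_b = spacy.blank("en")          # a second pipeline, hence a second vocab
doc_a = nlp_a("She runs")
doc_b = nlp_b("She runs")
doc_a[1].set_morph("Number=Sing|Person=3|Tense=Pres|VerbForm=Fin")
doc_b[1].set_morph(str(doc_a[1].morph))   # copy via the string value, as E1013 advises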

View File

@ -4,7 +4,6 @@ from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
from .punctuation import TOKENIZER_PREFIXES, TOKENIZER_SUFFIXES, TOKENIZER_INFIXES from .punctuation import TOKENIZER_PREFIXES, TOKENIZER_SUFFIXES, TOKENIZER_INFIXES
from .stop_words import STOP_WORDS from .stop_words import STOP_WORDS
from ...language import Language from ...language import Language
from ...lookups import Lookups
from ...pipeline import Lemmatizer from ...pipeline import Lemmatizer
@ -24,18 +23,11 @@ class Bengali(Language):
@Bengali.factory( @Bengali.factory(
"lemmatizer", "lemmatizer",
assigns=["token.lemma"], assigns=["token.lemma"],
default_config={"model": None, "mode": "rule", "lookups": None}, default_config={"model": None, "mode": "rule"},
default_score_weights={"lemma_acc": 1.0}, default_score_weights={"lemma_acc": 1.0},
) )
def make_lemmatizer( def make_lemmatizer(nlp: Language, model: Optional[Model], name: str, mode: str):
nlp: Language, return Lemmatizer(nlp.vocab, model, name, mode=mode)
model: Optional[Model],
name: str,
mode: str,
lookups: Optional[Lookups],
):
lookups = Lemmatizer.load_lookups(nlp.lang, mode, lookups)
return Lemmatizer(nlp.vocab, model, name, mode=mode, lookups=lookups)
__all__ = ["Bengali"] __all__ = ["Bengali"]

View File

@ -7,7 +7,6 @@ from .lex_attrs import LEX_ATTRS
from .syntax_iterators import SYNTAX_ITERATORS from .syntax_iterators import SYNTAX_ITERATORS
from .punctuation import TOKENIZER_PREFIXES, TOKENIZER_SUFFIXES, TOKENIZER_INFIXES from .punctuation import TOKENIZER_PREFIXES, TOKENIZER_SUFFIXES, TOKENIZER_INFIXES
from .lemmatizer import GreekLemmatizer from .lemmatizer import GreekLemmatizer
from ...lookups import Lookups
from ...language import Language from ...language import Language
@ -29,18 +28,11 @@ class Greek(Language):
@Greek.factory( @Greek.factory(
"lemmatizer", "lemmatizer",
assigns=["token.lemma"], assigns=["token.lemma"],
default_config={"model": None, "mode": "rule", "lookups": None}, default_config={"model": None, "mode": "rule"},
default_score_weights={"lemma_acc": 1.0}, default_score_weights={"lemma_acc": 1.0},
) )
def make_lemmatizer( def make_lemmatizer(nlp: Language, model: Optional[Model], name: str, mode: str):
nlp: Language, return GreekLemmatizer(nlp.vocab, model, name, mode=mode)
model: Optional[Model],
name: str,
mode: str,
lookups: Optional[Lookups],
):
lookups = GreekLemmatizer.load_lookups(nlp.lang, mode, lookups)
return GreekLemmatizer(nlp.vocab, model, name, mode=mode, lookups=lookups)
__all__ = ["Greek"] __all__ = ["Greek"]

View File

@ -1,5 +1,4 @@
from typing import Optional from typing import Optional
from thinc.api import Model from thinc.api import Model
from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
@ -9,7 +8,6 @@ from .syntax_iterators import SYNTAX_ITERATORS
from .punctuation import TOKENIZER_INFIXES from .punctuation import TOKENIZER_INFIXES
from .lemmatizer import EnglishLemmatizer from .lemmatizer import EnglishLemmatizer
from ...language import Language from ...language import Language
from ...lookups import Lookups
class EnglishDefaults(Language.Defaults): class EnglishDefaults(Language.Defaults):
@ -28,18 +26,11 @@ class English(Language):
@English.factory( @English.factory(
"lemmatizer", "lemmatizer",
assigns=["token.lemma"], assigns=["token.lemma"],
default_config={"model": None, "mode": "rule", "lookups": None}, default_config={"model": None, "mode": "rule"},
default_score_weights={"lemma_acc": 1.0}, default_score_weights={"lemma_acc": 1.0},
) )
def make_lemmatizer( def make_lemmatizer(nlp: Language, model: Optional[Model], name: str, mode: str):
nlp: Language, return EnglishLemmatizer(nlp.vocab, model, name, mode=mode)
model: Optional[Model],
name: str,
mode: str,
lookups: Optional[Lookups],
):
lookups = EnglishLemmatizer.load_lookups(nlp.lang, mode, lookups)
return EnglishLemmatizer(nlp.vocab, model, name, mode=mode, lookups=lookups)
__all__ = ["English"] __all__ = ["English"]
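The same simplification is applied to the remaining language-specific factories below: the lookups argument disappears from the factory signature, and the tables are loaded when the pipeline is initialized instead. A hedged sketch of the new flow for English (assumes the spacy-lookups-data package is installed so the default tables can be found):

import spacy

nlp = spacy.blank("en")
nlp.add_pipe("lemmatizer", config={"mode": "lookup"})   # no "lookups" key anymore
nlp.initialize()    # fetches the required tables, cf. the updated E1004 message
print([token.lemma_ for token in nlp("feet were")])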

View File

@ -3,8 +3,7 @@ from ...tokens import Token
class EnglishLemmatizer(Lemmatizer): class EnglishLemmatizer(Lemmatizer):
"""English lemmatizer. Only overrides is_base_form. """English lemmatizer. Only overrides is_base_form."""
"""
def is_base_form(self, token: Token) -> bool: def is_base_form(self, token: Token) -> bool:
""" """

View File

@ -58,7 +58,7 @@ def noun_bounds(
doc, token, np_left_deps, np_right_deps, stop_deps doc, token, np_left_deps, np_right_deps, stop_deps
) )
filter_func = lambda t: is_verb_token(t) or t.dep in stop_deps filter_func = lambda t: is_verb_token(t) or t.dep in stop_deps
if list(filter(filter_func, doc[left_bound.i : right.i],)): if list(filter(filter_func, doc[left_bound.i : right.i])):
break break
else: else:
right_bound = right right_bound = right

View File

@ -6,7 +6,6 @@ from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
from .punctuation import TOKENIZER_SUFFIXES from .punctuation import TOKENIZER_SUFFIXES
from .syntax_iterators import SYNTAX_ITERATORS from .syntax_iterators import SYNTAX_ITERATORS
from ...language import Language from ...language import Language
from ...lookups import Lookups
from ...pipeline import Lemmatizer from ...pipeline import Lemmatizer
@ -27,18 +26,11 @@ class Persian(Language):
@Persian.factory( @Persian.factory(
"lemmatizer", "lemmatizer",
assigns=["token.lemma"], assigns=["token.lemma"],
default_config={"model": None, "mode": "rule", "lookups": None}, default_config={"model": None, "mode": "rule"},
default_score_weights={"lemma_acc": 1.0}, default_score_weights={"lemma_acc": 1.0},
) )
def make_lemmatizer( def make_lemmatizer(nlp: Language, model: Optional[Model], name: str, mode: str):
nlp: Language, return Lemmatizer(nlp.vocab, model, name, mode=mode)
model: Optional[Model],
name: str,
mode: str,
lookups: Optional[Lookups],
):
lookups = Lemmatizer.load_lookups(nlp.lang, mode, lookups)
return Lemmatizer(nlp.vocab, model, name, mode=mode, lookups=lookups)
__all__ = ["Persian"] __all__ = ["Persian"]

View File

@ -9,7 +9,6 @@ from .stop_words import STOP_WORDS
from .lex_attrs import LEX_ATTRS from .lex_attrs import LEX_ATTRS
from .syntax_iterators import SYNTAX_ITERATORS from .syntax_iterators import SYNTAX_ITERATORS
from .lemmatizer import FrenchLemmatizer from .lemmatizer import FrenchLemmatizer
from ...lookups import Lookups
from ...language import Language from ...language import Language
@ -32,18 +31,11 @@ class French(Language):
@French.factory( @French.factory(
"lemmatizer", "lemmatizer",
assigns=["token.lemma"], assigns=["token.lemma"],
default_config={"model": None, "mode": "rule", "lookups": None}, default_config={"model": None, "mode": "rule"},
default_score_weights={"lemma_acc": 1.0}, default_score_weights={"lemma_acc": 1.0},
) )
def make_lemmatizer( def make_lemmatizer(nlp: Language, model: Optional[Model], name: str, mode: str):
nlp: Language, return FrenchLemmatizer(nlp.vocab, model, name, mode=mode)
model: Optional[Model],
name: str,
mode: str,
lookups: Optional[Lookups],
):
lookups = FrenchLemmatizer.load_lookups(nlp.lang, mode, lookups)
return FrenchLemmatizer(nlp.vocab, model, name, mode=mode, lookups=lookups)
__all__ = ["French"] __all__ = ["French"]

View File

@ -2,7 +2,6 @@ from typing import Optional, Union, Dict, Any
from pathlib import Path from pathlib import Path
import srsly import srsly
from collections import namedtuple from collections import namedtuple
from thinc.api import Config
from .stop_words import STOP_WORDS from .stop_words import STOP_WORDS
from .syntax_iterators import SYNTAX_ITERATORS from .syntax_iterators import SYNTAX_ITERATORS
@ -16,7 +15,7 @@ from ...scorer import Scorer
from ...symbols import POS from ...symbols import POS
from ...tokens import Doc from ...tokens import Doc
from ...training import validate_examples from ...training import validate_examples
from ...util import DummyTokenizer, registry from ...util import DummyTokenizer, registry, load_config_from_str
from ... import util from ... import util
@ -166,7 +165,7 @@ class JapaneseTokenizer(DummyTokenizer):
class JapaneseDefaults(Language.Defaults): class JapaneseDefaults(Language.Defaults):
config = Config().from_str(DEFAULT_CONFIG) config = load_config_from_str(DEFAULT_CONFIG)
stop_words = STOP_WORDS stop_words = STOP_WORDS
syntax_iterators = SYNTAX_ITERATORS syntax_iterators = SYNTAX_ITERATORS
writing_system = {"direction": "ltr", "has_case": False, "has_letters": False} writing_system = {"direction": "ltr", "has_case": False, "has_letters": False}

View File

@ -1,5 +1,4 @@
from typing import Optional, Any, Dict from typing import Optional, Any, Dict
from thinc.api import Config
from .stop_words import STOP_WORDS from .stop_words import STOP_WORDS
from .tag_map import TAG_MAP from .tag_map import TAG_MAP
@ -10,7 +9,7 @@ from ...compat import copy_reg
from ...scorer import Scorer from ...scorer import Scorer
from ...symbols import POS from ...symbols import POS
from ...training import validate_examples from ...training import validate_examples
from ...util import DummyTokenizer, registry from ...util import DummyTokenizer, registry, load_config_from_str
DEFAULT_CONFIG = """ DEFAULT_CONFIG = """
@ -70,7 +69,7 @@ class KoreanTokenizer(DummyTokenizer):
class KoreanDefaults(Language.Defaults): class KoreanDefaults(Language.Defaults):
config = Config().from_str(DEFAULT_CONFIG) config = load_config_from_str(DEFAULT_CONFIG)
lex_attr_getters = LEX_ATTRS lex_attr_getters = LEX_ATTRS
stop_words = STOP_WORDS stop_words = STOP_WORDS
writing_system = {"direction": "ltr", "has_case": False, "has_letters": False} writing_system = {"direction": "ltr", "has_case": False, "has_letters": False}

View File

@ -6,7 +6,6 @@ from .punctuation import TOKENIZER_SUFFIXES
from .stop_words import STOP_WORDS from .stop_words import STOP_WORDS
from .syntax_iterators import SYNTAX_ITERATORS from .syntax_iterators import SYNTAX_ITERATORS
from ...language import Language from ...language import Language
from ...lookups import Lookups
from ...pipeline import Lemmatizer from ...pipeline import Lemmatizer
@ -27,18 +26,11 @@ class Norwegian(Language):
@Norwegian.factory( @Norwegian.factory(
"lemmatizer", "lemmatizer",
assigns=["token.lemma"], assigns=["token.lemma"],
default_config={"model": None, "mode": "rule", "lookups": None}, default_config={"model": None, "mode": "rule"},
default_score_weights={"lemma_acc": 1.0}, default_score_weights={"lemma_acc": 1.0},
) )
def make_lemmatizer( def make_lemmatizer(nlp: Language, model: Optional[Model], name: str, mode: str):
nlp: Language, return Lemmatizer(nlp.vocab, model, name, mode=mode)
model: Optional[Model],
name: str,
mode: str,
lookups: Optional[Lookups],
):
lookups = Lemmatizer.load_lookups(nlp.lang, mode, lookups)
return Lemmatizer(nlp.vocab, model, name, mode=mode, lookups=lookups)
__all__ = ["Norwegian"] __all__ = ["Norwegian"]

View File

@ -1,5 +1,4 @@
from typing import Optional from typing import Optional
from thinc.api import Model from thinc.api import Model
from .stop_words import STOP_WORDS from .stop_words import STOP_WORDS
@ -8,7 +7,6 @@ from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
from .punctuation import TOKENIZER_PREFIXES, TOKENIZER_INFIXES from .punctuation import TOKENIZER_PREFIXES, TOKENIZER_INFIXES
from .punctuation import TOKENIZER_SUFFIXES from .punctuation import TOKENIZER_SUFFIXES
from .lemmatizer import DutchLemmatizer from .lemmatizer import DutchLemmatizer
from ...lookups import Lookups
from ...language import Language from ...language import Language
@ -29,18 +27,11 @@ class Dutch(Language):
@Dutch.factory( @Dutch.factory(
"lemmatizer", "lemmatizer",
assigns=["token.lemma"], assigns=["token.lemma"],
default_config={"model": None, "mode": "rule", "lookups": None}, default_config={"model": None, "mode": "rule"},
default_score_weights={"lemma_acc": 1.0}, default_score_weights={"lemma_acc": 1.0},
) )
def make_lemmatizer( def make_lemmatizer(nlp: Language, model: Optional[Model], name: str, mode: str):
nlp: Language, return DutchLemmatizer(nlp.vocab, model, name, mode=mode)
model: Optional[Model],
name: str,
mode: str,
lookups: Optional[Lookups],
):
lookups = DutchLemmatizer.load_lookups(nlp.lang, mode, lookups)
return DutchLemmatizer(nlp.vocab, model, name, mode=mode, lookups=lookups)
__all__ = ["Dutch"] __all__ = ["Dutch"]

View File

@ -34,18 +34,11 @@ class Polish(Language):
@Polish.factory( @Polish.factory(
"lemmatizer", "lemmatizer",
assigns=["token.lemma"], assigns=["token.lemma"],
default_config={"model": None, "mode": "pos_lookup", "lookups": None}, default_config={"model": None, "mode": "pos_lookup"},
default_score_weights={"lemma_acc": 1.0}, default_score_weights={"lemma_acc": 1.0},
) )
def make_lemmatizer( def make_lemmatizer(nlp: Language, model: Optional[Model], name: str, mode: str):
nlp: Language, return PolishLemmatizer(nlp.vocab, model, name, mode=mode)
model: Optional[Model],
name: str,
mode: str,
lookups: Optional[Lookups],
):
lookups = PolishLemmatizer.load_lookups(nlp.lang, mode, lookups)
return PolishLemmatizer(nlp.vocab, model, name, mode=mode, lookups=lookups)
__all__ = ["Polish"] __all__ = ["Polish"]

View File

@ -1,5 +1,4 @@
from typing import Optional from typing import Optional
from thinc.api import Model from thinc.api import Model
from .stop_words import STOP_WORDS from .stop_words import STOP_WORDS
@ -7,7 +6,6 @@ from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
from .lex_attrs import LEX_ATTRS from .lex_attrs import LEX_ATTRS
from .lemmatizer import RussianLemmatizer from .lemmatizer import RussianLemmatizer
from ...language import Language from ...language import Language
from ...lookups import Lookups
class RussianDefaults(Language.Defaults): class RussianDefaults(Language.Defaults):
@ -24,17 +22,11 @@ class Russian(Language):
@Russian.factory( @Russian.factory(
"lemmatizer", "lemmatizer",
assigns=["token.lemma"], assigns=["token.lemma"],
default_config={"model": None, "mode": "pymorphy2", "lookups": None}, default_config={"model": None, "mode": "pymorphy2"},
default_score_weights={"lemma_acc": 1.0}, default_score_weights={"lemma_acc": 1.0},
) )
def make_lemmatizer( def make_lemmatizer(nlp: Language, model: Optional[Model], name: str, mode: str):
nlp: Language, return RussianLemmatizer(nlp.vocab, model, name, mode=mode)
model: Optional[Model],
name: str,
mode: str,
lookups: Optional[Lookups],
):
return RussianLemmatizer(nlp.vocab, model, name, mode=mode, lookups=lookups)
__all__ = ["Russian"] __all__ = ["Russian"]

View File

@ -108,8 +108,8 @@ _num_words = [
def like_num(text): def like_num(text):
""" """
Check if text resembles a number Check if text resembles a number
""" """
if text.startswith(("+", "-", "±", "~")): if text.startswith(("+", "-", "±", "~")):
text = text[1:] text = text[1:]
text = text.replace(",", "").replace(".", "") text = text.replace(",", "").replace(".", "")

View File

@ -5,7 +5,6 @@ from .stop_words import STOP_WORDS
from .lex_attrs import LEX_ATTRS from .lex_attrs import LEX_ATTRS
from .syntax_iterators import SYNTAX_ITERATORS from .syntax_iterators import SYNTAX_ITERATORS
from ...language import Language from ...language import Language
from ...lookups import Lookups
from ...pipeline import Lemmatizer from ...pipeline import Lemmatizer
@ -30,18 +29,11 @@ class Swedish(Language):
@Swedish.factory( @Swedish.factory(
"lemmatizer", "lemmatizer",
assigns=["token.lemma"], assigns=["token.lemma"],
default_config={"model": None, "mode": "rule", "lookups": None}, default_config={"model": None, "mode": "rule"},
default_score_weights={"lemma_acc": 1.0}, default_score_weights={"lemma_acc": 1.0},
) )
def make_lemmatizer( def make_lemmatizer(nlp: Language, model: Optional[Model], name: str, mode: str):
nlp: Language, return Lemmatizer(nlp.vocab, model, name, mode=mode)
model: Optional[Model],
name: str,
mode: str,
lookups: Optional[Lookups],
):
lookups = Lemmatizer.load_lookups(nlp.lang, mode, lookups)
return Lemmatizer(nlp.vocab, model, name, mode=mode, lookups=lookups)
__all__ = ["Swedish"] __all__ = ["Swedish"]

View File

@ -1,10 +1,8 @@
from thinc.api import Config
from .stop_words import STOP_WORDS from .stop_words import STOP_WORDS
from .lex_attrs import LEX_ATTRS from .lex_attrs import LEX_ATTRS
from ...language import Language from ...language import Language
from ...tokens import Doc from ...tokens import Doc
from ...util import DummyTokenizer, registry from ...util import DummyTokenizer, registry, load_config_from_str
DEFAULT_CONFIG = """ DEFAULT_CONFIG = """
@ -42,7 +40,7 @@ class ThaiTokenizer(DummyTokenizer):
class ThaiDefaults(Language.Defaults): class ThaiDefaults(Language.Defaults):
config = Config().from_str(DEFAULT_CONFIG) config = load_config_from_str(DEFAULT_CONFIG)
lex_attr_getters = LEX_ATTRS lex_attr_getters = LEX_ATTRS
stop_words = STOP_WORDS stop_words = STOP_WORDS

View File

@ -7,7 +7,6 @@ from .stop_words import STOP_WORDS
from .lex_attrs import LEX_ATTRS from .lex_attrs import LEX_ATTRS
from .lemmatizer import UkrainianLemmatizer from .lemmatizer import UkrainianLemmatizer
from ...language import Language from ...language import Language
from ...lookups import Lookups
class UkrainianDefaults(Language.Defaults): class UkrainianDefaults(Language.Defaults):
@ -24,17 +23,11 @@ class Ukrainian(Language):
@Ukrainian.factory( @Ukrainian.factory(
"lemmatizer", "lemmatizer",
assigns=["token.lemma"], assigns=["token.lemma"],
default_config={"model": None, "mode": "pymorphy2", "lookups": None}, default_config={"model": None, "mode": "pymorphy2"},
default_score_weights={"lemma_acc": 1.0}, default_score_weights={"lemma_acc": 1.0},
) )
def make_lemmatizer( def make_lemmatizer(nlp: Language, model: Optional[Model], name: str, mode: str):
nlp: Language, return UkrainianLemmatizer(nlp.vocab, model, name, mode=mode)
model: Optional[Model],
name: str,
mode: str,
lookups: Optional[Lookups],
):
return UkrainianLemmatizer(nlp.vocab, model, name, mode=mode, lookups=lookups)
__all__ = ["Ukrainian"] __all__ = ["Ukrainian"]

View File

@ -1,10 +1,8 @@
from thinc.api import Config from .stop_words import STOP_WORDS
from .lex_attrs import LEX_ATTRS
from ...language import Language from ...language import Language
from ...tokens import Doc from ...tokens import Doc
from .stop_words import STOP_WORDS from ...util import DummyTokenizer, registry, load_config_from_str
from ...util import DummyTokenizer, registry
from .lex_attrs import LEX_ATTRS
DEFAULT_CONFIG = """ DEFAULT_CONFIG = """
@ -17,7 +15,7 @@ use_pyvi = true
@registry.tokenizers("spacy.vi.VietnameseTokenizer") @registry.tokenizers("spacy.vi.VietnameseTokenizer")
def create_vietnamese_tokenizer(use_pyvi: bool = True,): def create_vietnamese_tokenizer(use_pyvi: bool = True):
def vietnamese_tokenizer_factory(nlp): def vietnamese_tokenizer_factory(nlp):
return VietnameseTokenizer(nlp, use_pyvi=use_pyvi) return VietnameseTokenizer(nlp, use_pyvi=use_pyvi)
@ -55,7 +53,7 @@ class VietnameseTokenizer(DummyTokenizer):
class VietnameseDefaults(Language.Defaults): class VietnameseDefaults(Language.Defaults):
config = Config().from_str(DEFAULT_CONFIG) config = load_config_from_str(DEFAULT_CONFIG)
lex_attr_getters = LEX_ATTRS lex_attr_getters = LEX_ATTRS
stop_words = STOP_WORDS stop_words = STOP_WORDS

View File

@ -1,23 +1,25 @@
from typing import Optional, List, Dict, Any from typing import Optional, List, Dict, Any, Callable, Iterable
from enum import Enum from enum import Enum
import tempfile import tempfile
import srsly import srsly
import warnings import warnings
from pathlib import Path from pathlib import Path
from thinc.api import Config
from ...errors import Warnings, Errors from ...errors import Warnings, Errors
from ...language import Language from ...language import Language
from ...scorer import Scorer from ...scorer import Scorer
from ...tokens import Doc from ...tokens import Doc
from ...training import validate_examples from ...training import validate_examples, Example
from ...util import DummyTokenizer, registry from ...util import DummyTokenizer, registry, load_config_from_str
from .lex_attrs import LEX_ATTRS from .lex_attrs import LEX_ATTRS
from .stop_words import STOP_WORDS from .stop_words import STOP_WORDS
from ... import util from ... import util
_PKUSEG_INSTALL_MSG = "install it with `pip install pkuseg==0.0.25` or from https://github.com/lancopku/pkuseg-python" # fmt: off
_PKUSEG_INSTALL_MSG = "install pkuseg and pickle5 with `pip install pkuseg==0.0.25 pickle5`"
_PKUSEG_PICKLE_WARNING = "Failed to force pkuseg model to use pickle protocol 4. If you're saving this model with python 3.8, it may not work with python 3.6-3.7."
# fmt: on
DEFAULT_CONFIG = """ DEFAULT_CONFIG = """
[nlp] [nlp]
@ -25,6 +27,10 @@ DEFAULT_CONFIG = """
[nlp.tokenizer] [nlp.tokenizer]
@tokenizers = "spacy.zh.ChineseTokenizer" @tokenizers = "spacy.zh.ChineseTokenizer"
segmenter = "char" segmenter = "char"
[initialize]
[initialize.tokenizer]
pkuseg_model = null pkuseg_model = null
pkuseg_user_dict = "default" pkuseg_user_dict = "default"
""" """
@ -41,41 +47,23 @@ class Segmenter(str, Enum):
@registry.tokenizers("spacy.zh.ChineseTokenizer") @registry.tokenizers("spacy.zh.ChineseTokenizer")
def create_chinese_tokenizer( def create_chinese_tokenizer(segmenter: Segmenter = Segmenter.char,):
segmenter: Segmenter = Segmenter.char,
pkuseg_model: Optional[str] = None,
pkuseg_user_dict: Optional[str] = "default",
):
def chinese_tokenizer_factory(nlp): def chinese_tokenizer_factory(nlp):
return ChineseTokenizer( return ChineseTokenizer(nlp, segmenter=segmenter)
nlp,
segmenter=segmenter,
pkuseg_model=pkuseg_model,
pkuseg_user_dict=pkuseg_user_dict,
)
return chinese_tokenizer_factory return chinese_tokenizer_factory
class ChineseTokenizer(DummyTokenizer): class ChineseTokenizer(DummyTokenizer):
def __init__( def __init__(
self, self, nlp: Language, segmenter: Segmenter = Segmenter.char,
nlp: Language,
segmenter: Segmenter = Segmenter.char,
pkuseg_model: Optional[str] = None,
pkuseg_user_dict: Optional[str] = None,
): ):
self.vocab = nlp.vocab self.vocab = nlp.vocab
if isinstance(segmenter, Segmenter): # we might have the Enum here if isinstance(segmenter, Segmenter):
segmenter = segmenter.value segmenter = segmenter.value
self.segmenter = segmenter self.segmenter = segmenter
self.pkuseg_model = pkuseg_model
self.pkuseg_user_dict = pkuseg_user_dict
self.pkuseg_seg = None self.pkuseg_seg = None
self.jieba_seg = None self.jieba_seg = None
self.configure_segmenter(segmenter)
def configure_segmenter(self, segmenter: str):
if segmenter not in Segmenter.values(): if segmenter not in Segmenter.values():
warn_msg = Warnings.W103.format( warn_msg = Warnings.W103.format(
lang="Chinese", lang="Chinese",
@ -85,12 +73,21 @@ class ChineseTokenizer(DummyTokenizer):
) )
warnings.warn(warn_msg) warnings.warn(warn_msg)
self.segmenter = Segmenter.char self.segmenter = Segmenter.char
self.jieba_seg = try_jieba_import(self.segmenter) if segmenter == Segmenter.jieba:
self.pkuseg_seg = try_pkuseg_import( self.jieba_seg = try_jieba_import()
self.segmenter,
pkuseg_model=self.pkuseg_model, def initialize(
pkuseg_user_dict=self.pkuseg_user_dict, self,
) get_examples: Optional[Callable[[], Iterable[Example]]] = None,
*,
nlp: Optional[Language] = None,
pkuseg_model: Optional[str] = None,
pkuseg_user_dict: str = "default",
):
if self.segmenter == Segmenter.pkuseg:
self.pkuseg_seg = try_pkuseg_import(
pkuseg_model=pkuseg_model, pkuseg_user_dict=pkuseg_user_dict,
)
def __call__(self, text: str) -> Doc: def __call__(self, text: str) -> Doc:
if self.segmenter == Segmenter.jieba: if self.segmenter == Segmenter.jieba:
@ -145,14 +142,10 @@ class ChineseTokenizer(DummyTokenizer):
def _get_config(self) -> Dict[str, Any]: def _get_config(self) -> Dict[str, Any]:
return { return {
"segmenter": self.segmenter, "segmenter": self.segmenter,
"pkuseg_model": self.pkuseg_model,
"pkuseg_user_dict": self.pkuseg_user_dict,
} }
def _set_config(self, config: Dict[str, Any] = {}) -> None: def _set_config(self, config: Dict[str, Any] = {}) -> None:
self.segmenter = config.get("segmenter", Segmenter.char) self.segmenter = config.get("segmenter", Segmenter.char)
self.pkuseg_model = config.get("pkuseg_model", None)
self.pkuseg_user_dict = config.get("pkuseg_user_dict", "default")
def to_bytes(self, **kwargs): def to_bytes(self, **kwargs):
pkuseg_features_b = b"" pkuseg_features_b = b""
@ -163,6 +156,22 @@ class ChineseTokenizer(DummyTokenizer):
self.pkuseg_seg.feature_extractor.save(tempdir) self.pkuseg_seg.feature_extractor.save(tempdir)
self.pkuseg_seg.model.save(tempdir) self.pkuseg_seg.model.save(tempdir)
tempdir = Path(tempdir) tempdir = Path(tempdir)
# pkuseg saves features.pkl with pickle.HIGHEST_PROTOCOL, which
# means that it will be saved with pickle protocol 5 with
# python 3.8, which can't be reloaded with python 3.6-3.7.
# To try to make the model compatible with python 3.6+, reload
# the data with pickle5 and convert it back to protocol 4.
try:
import pickle5
with open(tempdir / "features.pkl", "rb") as fileh:
features = pickle5.load(fileh)
with open(tempdir / "features.pkl", "wb") as fileh:
pickle5.dump(features, fileh, protocol=4)
except ImportError as e:
raise e
except Exception:
warnings.warn(_PKUSEG_PICKLE_WARNING)
with open(tempdir / "features.pkl", "rb") as fileh: with open(tempdir / "features.pkl", "rb") as fileh:
pkuseg_features_b = fileh.read() pkuseg_features_b = fileh.read()
with open(tempdir / "weights.npz", "rb") as fileh: with open(tempdir / "weights.npz", "rb") as fileh:
@ -235,6 +244,18 @@ class ChineseTokenizer(DummyTokenizer):
path.mkdir(parents=True) path.mkdir(parents=True)
self.pkuseg_seg.model.save(path) self.pkuseg_seg.model.save(path)
self.pkuseg_seg.feature_extractor.save(path) self.pkuseg_seg.feature_extractor.save(path)
# try to convert features.pkl to pickle protocol 4
try:
import pickle5
with open(path / "features.pkl", "rb") as fileh:
features = pickle5.load(fileh)
with open(path / "features.pkl", "wb") as fileh:
pickle5.dump(features, fileh, protocol=4)
except ImportError as e:
raise e
except Exception:
warnings.warn(_PKUSEG_PICKLE_WARNING)
def save_pkuseg_processors(path): def save_pkuseg_processors(path):
if self.pkuseg_seg: if self.pkuseg_seg:
@ -291,7 +312,7 @@ class ChineseTokenizer(DummyTokenizer):
class ChineseDefaults(Language.Defaults): class ChineseDefaults(Language.Defaults):
config = Config().from_str(DEFAULT_CONFIG) config = load_config_from_str(DEFAULT_CONFIG)
lex_attr_getters = LEX_ATTRS lex_attr_getters = LEX_ATTRS
stop_words = STOP_WORDS stop_words = STOP_WORDS
writing_system = {"direction": "ltr", "has_case": False, "has_letters": False} writing_system = {"direction": "ltr", "has_case": False, "has_letters": False}
@ -302,47 +323,33 @@ class Chinese(Language):
Defaults = ChineseDefaults Defaults = ChineseDefaults
def try_jieba_import(segmenter: str) -> None: def try_jieba_import() -> None:
try: try:
import jieba import jieba
if segmenter == Segmenter.jieba: # segment a short text to have jieba initialize its cache in advance
# segment a short text to have jieba initialize its cache in advance list(jieba.cut("作为", cut_all=False))
list(jieba.cut("作为", cut_all=False))
return jieba return jieba
except ImportError: except ImportError:
if segmenter == Segmenter.jieba: msg = (
msg = ( "Jieba not installed. To use jieba, install it with `pip "
"Jieba not installed. To use jieba, install it with `pip " " install jieba` or from https://github.com/fxsjy/jieba"
" install jieba` or from https://github.com/fxsjy/jieba" )
) raise ImportError(msg) from None
raise ImportError(msg) from None
def try_pkuseg_import(segmenter: str, pkuseg_model: str, pkuseg_user_dict: str) -> None: def try_pkuseg_import(pkuseg_model: str, pkuseg_user_dict: str) -> None:
try: try:
import pkuseg import pkuseg
if pkuseg_model: return pkuseg.pkuseg(pkuseg_model, pkuseg_user_dict)
return pkuseg.pkuseg(pkuseg_model, pkuseg_user_dict)
elif segmenter == Segmenter.pkuseg:
msg = (
"The Chinese word segmenter is 'pkuseg' but no pkuseg model "
"was specified. Please provide the name of a pretrained model "
"or the path to a model with:\n"
'cfg = {"nlp": {"tokenizer": {"segmenter": "pkuseg", "pkuseg_model": name_or_path }}\n'
"nlp = Chinese.from_config(cfg)"
)
raise ValueError(msg)
except ImportError: except ImportError:
if segmenter == Segmenter.pkuseg: msg = "pkuseg not installed. To use pkuseg, " + _PKUSEG_INSTALL_MSG
msg = "pkuseg not installed. To use pkuseg, " + _PKUSEG_INSTALL_MSG raise ImportError(msg) from None
raise ImportError(msg) from None
except FileNotFoundError: except FileNotFoundError:
if segmenter == Segmenter.pkuseg: msg = "Unable to load pkuseg model from: " + pkuseg_model
msg = "Unable to load pkuseg model from: " + pkuseg_model raise FileNotFoundError(msg) from None
raise FileNotFoundError(msg) from None
def _get_pkuseg_trie_data(node, path=""): def _get_pkuseg_trie_data(node, path=""):

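For reference, the segmenter is still selected through the tokenizer config, as quoted in the removed error message above; a minimal sketch, assuming jieba is installed:

from spacy.lang.zh import Chinese

# Request the jieba segmenter via the tokenizer config and tokenize a sentence.
cfg = {"nlp": {"tokenizer": {"segmenter": "jieba"}}}
nlp = Chinese.from_config(cfg)
print([t.text for t in nlp("西门子将努力参与中国的三峡工程")])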
View File

@ -8,7 +8,7 @@ from contextlib import contextmanager
from copy import deepcopy from copy import deepcopy
from pathlib import Path from pathlib import Path
import warnings import warnings
from thinc.api import Model, get_current_ops, Config, require_gpu, Optimizer from thinc.api import Model, get_current_ops, Config, Optimizer
import srsly import srsly
import multiprocessing as mp import multiprocessing as mp
from itertools import chain, cycle from itertools import chain, cycle
@ -18,8 +18,9 @@ from .tokens.underscore import Underscore
from .vocab import Vocab, create_vocab from .vocab import Vocab, create_vocab
from .pipe_analysis import validate_attrs, analyze_pipes, print_pipe_analysis from .pipe_analysis import validate_attrs, analyze_pipes, print_pipe_analysis
from .training import Example, validate_examples from .training import Example, validate_examples
from .training.initialize import init_vocab, init_tok2vec
from .scorer import Scorer from .scorer import Scorer
from .util import create_default_optimizer, registry, SimpleFrozenList from .util import registry, SimpleFrozenList
from .util import SimpleFrozenDict, combine_score_weights, CONFIG_SECTION_ORDER from .util import SimpleFrozenDict, combine_score_weights, CONFIG_SECTION_ORDER
from .lang.tokenizer_exceptions import URL_MATCH, BASE_EXCEPTIONS from .lang.tokenizer_exceptions import URL_MATCH, BASE_EXCEPTIONS
from .lang.punctuation import TOKENIZER_PREFIXES, TOKENIZER_SUFFIXES from .lang.punctuation import TOKENIZER_PREFIXES, TOKENIZER_SUFFIXES
@ -27,7 +28,8 @@ from .lang.punctuation import TOKENIZER_INFIXES
from .tokens import Doc from .tokens import Doc
from .tokenizer import Tokenizer from .tokenizer import Tokenizer
from .errors import Errors, Warnings from .errors import Errors, Warnings
from .schemas import ConfigSchema, ConfigSchemaNlp from .schemas import ConfigSchema, ConfigSchemaNlp, ConfigSchemaInit
from .schemas import ConfigSchemaPretrain, validate_init_settings
from .git_info import GIT_VERSION from .git_info import GIT_VERSION
from . import util from . import util
from . import about from . import about
@ -1066,7 +1068,7 @@ class Language:
validate_examples(examples, "Language.update") validate_examples(examples, "Language.update")
if sgd is None: if sgd is None:
if self._optimizer is None: if self._optimizer is None:
self._optimizer = create_default_optimizer() self._optimizer = self.create_optimizer()
sgd = self._optimizer sgd = self._optimizer
if component_cfg is None: if component_cfg is None:
component_cfg = {} component_cfg = {}
@ -1124,7 +1126,7 @@ class Language:
validate_examples(examples, "Language.rehearse") validate_examples(examples, "Language.rehearse")
if sgd is None: if sgd is None:
if self._optimizer is None: if self._optimizer is None:
self._optimizer = create_default_optimizer() self._optimizer = self.create_optimizer()
sgd = self._optimizer sgd = self._optimizer
pipes = list(self.pipeline) pipes = list(self.pipeline)
random.shuffle(pipes) random.shuffle(pipes)
@ -1154,61 +1156,73 @@ class Language:
get_examples: Optional[Callable[[], Iterable[Example]]] = None, get_examples: Optional[Callable[[], Iterable[Example]]] = None,
*, *,
sgd: Optional[Optimizer] = None, sgd: Optional[Optimizer] = None,
device: int = -1, ) -> Optimizer:
warnings.warn(Warnings.W089, DeprecationWarning)
return self.initialize(get_examples, sgd=sgd)
def initialize(
self,
get_examples: Optional[Callable[[], Iterable[Example]]] = None,
*,
sgd: Optional[Optimizer] = None,
) -> Optimizer: ) -> Optimizer:
"""Initialize the pipe for training, using data examples if available. """Initialize the pipe for training, using data examples if available.
get_examples (Callable[[], Iterable[Example]]): Optional function that get_examples (Callable[[], Iterable[Example]]): Optional function that
returns gold-standard Example objects. returns gold-standard Example objects.
sgd (thinc.api.Optimizer): Optional optimizer. Will be created with sgd (Optional[Optimizer]): An optimizer to use for updates. If not
create_optimizer if it doesn't exist. provided, will be created using the .create_optimizer() method.
RETURNS (thinc.api.Optimizer): The optimizer. RETURNS (thinc.api.Optimizer): The optimizer.
DOCS: https://nightly.spacy.io/api/language#begin_training DOCS: https://nightly.spacy.io/api/language#initialize
""" """
if get_examples is None: if get_examples is None:
util.logger.debug( util.logger.debug(
"No 'get_examples' callback provided to 'Language.begin_training', creating dummy examples" "No 'get_examples' callback provided to 'Language.initialize', creating dummy examples"
) )
doc = Doc(self.vocab, words=["x", "y", "z"]) doc = Doc(self.vocab, words=["x", "y", "z"])
get_examples = lambda: [Example.from_dict(doc, {})] get_examples = lambda: [Example.from_dict(doc, {})]
# Populate vocab
if not hasattr(get_examples, "__call__"): if not hasattr(get_examples, "__call__"):
err = Errors.E930.format(name="Language", obj=type(get_examples)) err = Errors.E930.format(name="Language", obj=type(get_examples))
raise ValueError(err) raise ValueError(err)
valid_examples = False # Make sure the config is interpolated so we can resolve subsections
for example in get_examples(): config = self.config.interpolate()
if not isinstance(example, Example): # These are the settings provided in the [initialize] block in the config
err = Errors.E978.format( I = registry.resolve(config["initialize"], schema=ConfigSchemaInit)
name="Language.begin_training", types=type(example) init_vocab(
) self, data=I["vocab_data"], lookups=I["lookups"], vectors=I["vectors"]
raise ValueError(err) )
else: pretrain_cfg = config.get("pretraining")
valid_examples = True if pretrain_cfg:
for word in [t.text for t in example.reference]: P = registry.resolve(pretrain_cfg, schema=ConfigSchemaPretrain)
_ = self.vocab[word] # noqa: F841 init_tok2vec(self, P, I)
if not valid_examples: if self.vocab.vectors.data.shape[1] >= 1:
err = Errors.E930.format(name="Language", obj="empty list") ops = get_current_ops()
raise ValueError(err) self.vocab.vectors.data = ops.asarray(self.vocab.vectors.data)
if device >= 0: # TODO: do we need this here? if hasattr(self.tokenizer, "initialize"):
require_gpu(device) tok_settings = validate_init_settings(
if self.vocab.vectors.data.shape[1] >= 1: self.tokenizer.initialize,
ops = get_current_ops() I["tokenizer"],
self.vocab.vectors.data = ops.asarray(self.vocab.vectors.data) section="tokenizer",
if sgd is None: name="tokenizer",
sgd = create_default_optimizer() )
self._optimizer = sgd self.tokenizer.initialize(get_examples, nlp=self, **tok_settings)
for name, proc in self.pipeline: for name, proc in self.pipeline:
if hasattr(proc, "begin_training"): if hasattr(proc, "initialize"):
proc.begin_training( p_settings = I["components"].get(name, {})
get_examples, pipeline=self.pipeline, sgd=self._optimizer p_settings = validate_init_settings(
proc.initialize, p_settings, section="components", name=name
) )
proc.initialize(get_examples, nlp=self, **p_settings)
self._link_components() self._link_components()
self._optimizer = sgd
if sgd is not None:
self._optimizer = sgd
elif self._optimizer is None:
self._optimizer = self.create_optimizer()
return self._optimizer return self._optimizer
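For orientation, a minimal sketch of the renamed entry point with toy data and a blank English pipeline; the [initialize] block of nlp.config (vocab data, vectors, tokenizer and per-component settings) is resolved inside the call, and the optimizer falls back to create_optimizer() when no sgd is passed:

import spacy
from spacy.training import Example

nlp = spacy.blank("en")
nlp.add_pipe("tagger")
train_examples = [
    Example.from_dict(nlp.make_doc("I like cats"), {"tags": ["PRP", "VBP", "NNS"]})
]
optimizer = nlp.initialize(lambda: train_examples)
losses = {}
nlp.update(train_examples, sgd=optimizer, losses=losses)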
def resume_training( def resume_training(self, *, sgd: Optional[Optimizer] = None) -> Optimizer:
self, *, sgd: Optional[Optimizer] = None, device: int = -1
) -> Optimizer:
"""Continue training a pretrained model. """Continue training a pretrained model.
Create and return an optimizer, and initialize "rehearsal" for any pipeline Create and return an optimizer, and initialize "rehearsal" for any pipeline
@ -1217,22 +1231,20 @@ class Language:
rehearsal, collect samples of text you want the models to retain performance rehearsal, collect samples of text you want the models to retain performance
on, and call nlp.rehearse() with a batch of Example objects. on, and call nlp.rehearse() with a batch of Example objects.
sgd (Optional[Optimizer]): An optimizer.
RETURNS (Optimizer): The optimizer. RETURNS (Optimizer): The optimizer.
DOCS: https://nightly.spacy.io/api/language#resume_training DOCS: https://nightly.spacy.io/api/language#resume_training
""" """
if device >= 0: # TODO: do we need this here? ops = get_current_ops()
require_gpu(device) if self.vocab.vectors.data.shape[1] >= 1:
ops = get_current_ops() self.vocab.vectors.data = ops.asarray(self.vocab.vectors.data)
if self.vocab.vectors.data.shape[1] >= 1:
self.vocab.vectors.data = ops.asarray(self.vocab.vectors.data)
if sgd is None:
sgd = create_default_optimizer()
self._optimizer = sgd
for name, proc in self.pipeline: for name, proc in self.pipeline:
if hasattr(proc, "_rehearsal_model"): if hasattr(proc, "_rehearsal_model"):
proc._rehearsal_model = deepcopy(proc.model) proc._rehearsal_model = deepcopy(proc.model)
if sgd is not None:
self._optimizer = sgd
elif self._optimizer is None:
self._optimizer = self.create_optimizer()
return self._optimizer return self._optimizer
def evaluate( def evaluate(
@ -1294,6 +1306,11 @@ class Language:
results["speed"] = n_words / (end_time - start_time) results["speed"] = n_words / (end_time - start_time)
return results return results
def create_optimizer(self):
"""Create an optimizer, usually using the [training.optimizer] config."""
subconfig = {"optimizer": self.config["training"]["optimizer"]}
return registry.resolve(subconfig)["optimizer"]
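A hedged usage sketch of the new helper, assuming a blank pipeline whose default config carries a [training.optimizer] block:

import spacy

nlp = spacy.blank("en")
# The optimizer is resolved from nlp.config["training"]["optimizer"] rather
# than a hard-coded default.
optimizer = nlp.create_optimizer()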
@contextmanager @contextmanager
def use_params(self, params: Optional[dict]): def use_params(self, params: Optional[dict]):
"""Replace weights of models in the pipeline with those provided in the """Replace weights of models in the pipeline with those provided in the
@ -1502,7 +1519,7 @@ class Language:
).merge(config) ).merge(config)
if "nlp" not in config: if "nlp" not in config:
raise ValueError(Errors.E985.format(config=config)) raise ValueError(Errors.E985.format(config=config))
config_lang = config["nlp"]["lang"] config_lang = config["nlp"].get("lang")
if config_lang is not None and config_lang != cls.lang: if config_lang is not None and config_lang != cls.lang:
raise ValueError( raise ValueError(
Errors.E958.format( Errors.E958.format(

View File

@ -0,0 +1,28 @@
from typing import List, Union, Callable, Tuple
from thinc.types import Ints2d
from thinc.api import Model, registry
from ..tokens import Doc
@registry.layers("spacy.FeatureExtractor.v1")
def FeatureExtractor(columns: List[Union[int, str]]) -> Model[List[Doc], List[Ints2d]]:
return Model("extract_features", forward, attrs={"columns": columns})
def forward(
model: Model[List[Doc], List[Ints2d]], docs, is_train: bool
) -> Tuple[List[Ints2d], Callable]:
columns = model.attrs["columns"]
features: List[Ints2d] = []
for doc in docs:
if hasattr(doc, "to_array"):
attrs = doc.to_array(columns)
else:
attrs = doc.doc.to_array(columns)[doc.start : doc.end]
if attrs.ndim == 1:
attrs = attrs.reshape((attrs.shape[0], 1))
features.append(model.ops.asarray2i(attrs, dtype="uint64"))
backprop: Callable[[List[Ints2d]], List] = lambda d_features: []
return features, backprop

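A minimal sketch of calling the new layer directly on Doc objects; the import path is assumed from the relative imports in the model files below, and the attribute choice is arbitrary:

import spacy
from spacy.attrs import ORTH, SHAPE
from spacy.ml.featureextractor import FeatureExtractor  # assumed module path

nlp = spacy.blank("en")
docs = [nlp("a small sketch"), nlp("another one")]
extractor = FeatureExtractor([ORTH, SHAPE])
# One (n_tokens, 2) uint64 array per Doc, one column per requested attribute.
features = extractor.predict(docs)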
View File

@ -3,12 +3,13 @@ from thinc.api import Model, reduce_mean, Linear, list2ragged, Logistic
from thinc.api import chain, concatenate, clone, Dropout, ParametricAttention from thinc.api import chain, concatenate, clone, Dropout, ParametricAttention
from thinc.api import SparseLinear, Softmax, softmax_activation, Maxout, reduce_sum from thinc.api import SparseLinear, Softmax, softmax_activation, Maxout, reduce_sum
from thinc.api import HashEmbed, with_array, with_cpu, uniqued from thinc.api import HashEmbed, with_array, with_cpu, uniqued
from thinc.api import Relu, residual, expand_window, FeatureExtractor from thinc.api import Relu, residual, expand_window
from ...attrs import ID, ORTH, PREFIX, SUFFIX, SHAPE, LOWER from ...attrs import ID, ORTH, PREFIX, SUFFIX, SHAPE, LOWER
from ...util import registry from ...util import registry
from ..extract_ngrams import extract_ngrams from ..extract_ngrams import extract_ngrams
from ..staticvectors import StaticVectors from ..staticvectors import StaticVectors
from ..featureextractor import FeatureExtractor
@registry.architectures.register("spacy.TextCatCNN.v1") @registry.architectures.register("spacy.TextCatCNN.v1")

View File

@ -1,16 +1,16 @@
from typing import Optional, List from typing import Optional, List, Union
from thinc.api import chain, clone, concatenate, with_array, with_padded
from thinc.api import Model, noop, list2ragged, ragged2list
from thinc.api import FeatureExtractor, HashEmbed
from thinc.api import expand_window, residual, Maxout, Mish, PyTorchLSTM
from thinc.types import Floats2d from thinc.types import Floats2d
from thinc.api import chain, clone, concatenate, with_array, with_padded
from thinc.api import Model, noop, list2ragged, ragged2list, HashEmbed
from thinc.api import expand_window, residual, Maxout, Mish, PyTorchLSTM
from ...tokens import Doc from ...tokens import Doc
from ...util import registry from ...util import registry
from ...ml import _character_embed from ...ml import _character_embed
from ..staticvectors import StaticVectors from ..staticvectors import StaticVectors
from ..featureextractor import FeatureExtractor
from ...pipeline.tok2vec import Tok2VecListener from ...pipeline.tok2vec import Tok2VecListener
from ...attrs import ORTH, NORM, PREFIX, SUFFIX, SHAPE from ...attrs import ORTH, LOWER, PREFIX, SUFFIX, SHAPE, intify_attr
@registry.architectures.register("spacy.Tok2VecListener.v1") @registry.architectures.register("spacy.Tok2VecListener.v1")
@ -98,7 +98,7 @@ def MultiHashEmbed(
attributes using hash embedding, concatenates the results, and passes it attributes using hash embedding, concatenates the results, and passes it
through a feed-forward subnetwork to build a mixed representation. through a feed-forward subnetwork to build a mixed representation.
The features used are the NORM, PREFIX, SUFFIX and SHAPE, which can have The features used are the LOWER, PREFIX, SUFFIX and SHAPE, which can have
varying definitions depending on the Vocab of the Doc object passed in. varying definitions depending on the Vocab of the Doc object passed in.
Vectors from pretrained static vectors can also be incorporated into the Vectors from pretrained static vectors can also be incorporated into the
concatenated representation. concatenated representation.
@ -115,7 +115,7 @@ def MultiHashEmbed(
also_use_static_vectors (bool): Whether to also use static word vectors. also_use_static_vectors (bool): Whether to also use static word vectors.
Requires a vectors table to be loaded in the Doc objects' vocab. Requires a vectors table to be loaded in the Doc objects' vocab.
""" """
cols = [NORM, PREFIX, SUFFIX, SHAPE, ORTH] cols = [LOWER, PREFIX, SUFFIX, SHAPE, ORTH]
seed = 7 seed = 7
def make_hash_embed(feature): def make_hash_embed(feature):
@ -123,7 +123,7 @@ def MultiHashEmbed(
seed += 1 seed += 1
return HashEmbed( return HashEmbed(
width, width,
rows if feature == NORM else rows // 2, rows if feature == LOWER else rows // 2,
column=cols.index(feature), column=cols.index(feature),
seed=seed, seed=seed,
dropout=0.0, dropout=0.0,
@ -131,13 +131,13 @@ def MultiHashEmbed(
if also_embed_subwords: if also_embed_subwords:
embeddings = [ embeddings = [
make_hash_embed(NORM), make_hash_embed(LOWER),
make_hash_embed(PREFIX), make_hash_embed(PREFIX),
make_hash_embed(SUFFIX), make_hash_embed(SUFFIX),
make_hash_embed(SHAPE), make_hash_embed(SHAPE),
] ]
else: else:
embeddings = [make_hash_embed(NORM)] embeddings = [make_hash_embed(LOWER)]
concat_size = width * (len(embeddings) + also_use_static_vectors) concat_size = width * (len(embeddings) + also_use_static_vectors)
if also_use_static_vectors: if also_use_static_vectors:
model = chain( model = chain(
@ -165,7 +165,8 @@ def MultiHashEmbed(
@registry.architectures.register("spacy.CharacterEmbed.v1") @registry.architectures.register("spacy.CharacterEmbed.v1")
def CharacterEmbed( def CharacterEmbed(
width: int, rows: int, nM: int, nC: int, also_use_static_vectors: bool width: int, rows: int, nM: int, nC: int, also_use_static_vectors: bool,
feature: Union[int, str]="LOWER"
) -> Model[List[Doc], List[Floats2d]]: ) -> Model[List[Doc], List[Floats2d]]:
"""Construct an embedded representation based on character embeddings, using """Construct an embedded representation based on character embeddings, using
a feed-forward network. A fixed number of UTF-8 byte characters are used for a feed-forward network. A fixed number of UTF-8 byte characters are used for
@ -179,12 +180,13 @@ def CharacterEmbed(
of being in an arbitrary position depending on the word length. of being in an arbitrary position depending on the word length.
The characters are embedded in a embedding table with a given number of rows, The characters are embedded in a embedding table with a given number of rows,
and the vectors concatenated. A hash-embedded vector of the NORM of the word is and the vectors concatenated. A hash-embedded vector of the LOWER of the word is
also concatenated on, and the result is then passed through a feed-forward also concatenated on, and the result is then passed through a feed-forward
network to construct a single vector to represent the information. network to construct a single vector to represent the information.
width (int): The width of the output vector and the NORM hash embedding. feature (int or str): An attribute to embed, to concatenate with the characters.
rows (int): The number of rows in the NORM hash embedding table. width (int): The width of the output vector and the feature embedding.
rows (int): The number of rows in the LOWER hash embedding table.
nM (int): The dimensionality of the character embeddings. Recommended values nM (int): The dimensionality of the character embeddings. Recommended values
are between 16 and 64. are between 16 and 64.
nC (int): The number of UTF-8 bytes to embed per word. Recommended values nC (int): The number of UTF-8 bytes to embed per word. Recommended values
@ -193,12 +195,15 @@ def CharacterEmbed(
also_use_static_vectors (bool): Whether to also use static word vectors. also_use_static_vectors (bool): Whether to also use static word vectors.
Requires a vectors table to be loaded in the Doc objects' vocab. Requires a vectors table to be loaded in the Doc objects' vocab.
""" """
feature = intify_attr(feature)
if feature is None:
raise ValueError("Invalid feature: Must be a token attribute.")
if also_use_static_vectors: if also_use_static_vectors:
model = chain( model = chain(
concatenate( concatenate(
chain(_character_embed.CharacterEmbed(nM=nM, nC=nC), list2ragged()), chain(_character_embed.CharacterEmbed(nM=nM, nC=nC), list2ragged()),
chain( chain(
FeatureExtractor([NORM]), FeatureExtractor([feature]),
list2ragged(), list2ragged(),
with_array(HashEmbed(nO=width, nV=rows, column=0, seed=5)), with_array(HashEmbed(nO=width, nV=rows, column=0, seed=5)),
), ),
@ -214,7 +219,7 @@ def CharacterEmbed(
concatenate( concatenate(
chain(_character_embed.CharacterEmbed(nM=nM, nC=nC), list2ragged()), chain(_character_embed.CharacterEmbed(nM=nM, nC=nC), list2ragged()),
chain( chain(
FeatureExtractor([NORM]), FeatureExtractor([feature]),
list2ragged(), list2ragged(),
with_array(HashEmbed(nO=width, nV=rows, column=0, seed=5)), with_array(HashEmbed(nO=width, nV=rows, column=0, seed=5)),
), ),

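A hedged construction sketch for the extended CharacterEmbed signature; the import path and the parameter values are assumptions, and feature accepts anything intify_attr() understands:

from spacy.ml.models.tok2vec import CharacterEmbed  # assumed import path

embed = CharacterEmbed(
    width=128,   # output width, also the width of the feature hash embedding
    rows=2000,   # rows in the feature (e.g. LOWER) hash table
    nM=64,       # character embedding dimensionality
    nC=8,        # number of UTF-8 bytes embedded per word
    also_use_static_vectors=False,
    feature="LOWER",
)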
View File

@ -78,7 +78,7 @@ def get_attr_info(nlp: "Language", attr: str) -> Dict[str, List[str]]:
def analyze_pipes( def analyze_pipes(
nlp: "Language", *, keys: List[str] = DEFAULT_KEYS, nlp: "Language", *, keys: List[str] = DEFAULT_KEYS
) -> Dict[str, Union[List[str], Dict[str, List[str]]]]: ) -> Dict[str, Union[List[str], Dict[str, List[str]]]]:
"""Print a formatted summary for the current nlp object's pipeline. Shows """Print a formatted summary for the current nlp object's pipeline. Shows
a table with the pipeline components and why they assign and require, as a table with the pipeline components and why they assign and require, as

View File

@ -82,8 +82,7 @@ class AttributeRuler(Pipe):
matches = self.matcher(doc, allow_missing=True) matches = self.matcher(doc, allow_missing=True)
# Sort by the attribute ID, so that later rules have precedence # Sort by the attribute ID, so that later rules have precedence
matches = [ matches = [
(int(self.vocab.strings[m_id]), m_id, s, e) (int(self.vocab.strings[m_id]), m_id, s, e) for m_id, s, e in matches
for m_id, s, e in matches
] ]
matches.sort() matches.sort()
for attr_id, match_id, start, end in matches: for attr_id, match_id, start, end in matches:
@ -93,7 +92,7 @@ class AttributeRuler(Pipe):
try: try:
# The index can be negative, which makes it annoying to do # The index can be negative, which makes it annoying to do
# the boundscheck. Let Span do it instead. # the boundscheck. Let Span do it instead.
token = span[index] token = span[index] # noqa: F841
except IndexError: except IndexError:
# The original exception is just our conditional logic, so we # The original exception is just our conditional logic, so we
# raise from. # raise from.
@ -103,7 +102,7 @@ class AttributeRuler(Pipe):
span=[t.text for t in span], span=[t.text for t in span],
index=index, index=index,
) )
) from None ) from None
set_token_attrs(span[index], attrs) set_token_attrs(span[index], attrs)
return doc return doc

View File

@ -126,13 +126,13 @@ cdef class DependencyParser(Parser):
def add_multitask_objective(self, mt_component): def add_multitask_objective(self, mt_component):
self._multitasks.append(mt_component) self._multitasks.append(mt_component)
def init_multitask_objectives(self, get_examples, pipeline, sgd=None, **cfg): def init_multitask_objectives(self, get_examples, nlp=None, **cfg):
# TODO: transfer self.model.get_ref("tok2vec") to the multitask's model ? # TODO: transfer self.model.get_ref("tok2vec") to the multitask's model ?
for labeller in self._multitasks: for labeller in self._multitasks:
labeller.model.set_dim("nO", len(self.labels)) labeller.model.set_dim("nO", len(self.labels))
if labeller.model.has_ref("output_layer"): if labeller.model.has_ref("output_layer"):
labeller.model.get_ref("output_layer").set_dim("nO", len(self.labels)) labeller.model.get_ref("output_layer").set_dim("nO", len(self.labels))
labeller.begin_training(get_examples, pipeline=pipeline, sgd=sgd) labeller.initialize(get_examples, nlp=nlp)
@property @property
def labels(self): def labels(self):

View File

@ -1,5 +1,5 @@
from itertools import islice from itertools import islice
from typing import Optional, Iterable, Callable, Dict, Iterator, Union, List, Tuple from typing import Optional, Iterable, Callable, Dict, Iterator, Union, List
from pathlib import Path from pathlib import Path
import srsly import srsly
import random import random
@ -140,26 +140,20 @@ class EntityLinker(Pipe):
if len(self.kb) == 0: if len(self.kb) == 0:
raise ValueError(Errors.E139.format(name=self.name)) raise ValueError(Errors.E139.format(name=self.name))
def begin_training( def initialize(
self, self,
get_examples: Callable[[], Iterable[Example]], get_examples: Callable[[], Iterable[Example]],
*, *,
pipeline: Optional[List[Tuple[str, Callable[[Doc], Doc]]]] = None, nlp: Optional[Language] = None,
sgd: Optional[Optimizer] = None, ):
) -> Optimizer:
"""Initialize the pipe for training, using a representative set """Initialize the pipe for training, using a representative set
of data examples. of data examples.
get_examples (Callable[[], Iterable[Example]]): Function that get_examples (Callable[[], Iterable[Example]]): Function that
returns a representative sample of gold-standard Example objects. returns a representative sample of gold-standard Example objects.
pipeline (List[Tuple[str, Callable]]): Optional list of pipeline nlp (Language): The current nlp object the component is part of.
components that this component is part of. Corresponds to
nlp.pipeline.
sgd (thinc.api.Optimizer): Optional optimizer. Will be created with
create_optimizer if it doesn't exist.
RETURNS (thinc.api.Optimizer): The optimizer.
DOCS: https://nightly.spacy.io/api/entitylinker#begin_training DOCS: https://nightly.spacy.io/api/entitylinker#initialize
""" """
self._ensure_examples(get_examples) self._ensure_examples(get_examples)
self._require_kb() self._require_kb()
@ -174,9 +168,6 @@ class EntityLinker(Pipe):
self.model.initialize( self.model.initialize(
X=doc_sample, Y=self.model.ops.asarray(vector_sample, dtype="float32") X=doc_sample, Y=self.model.ops.asarray(vector_sample, dtype="float32")
) )
if sgd is None:
sgd = self.create_optimizer()
return sgd
def update( def update(
self, self,

View File

@ -1,26 +1,25 @@
from typing import Optional, List, Dict, Any from typing import Optional, List, Dict, Any, Callable, Iterable, Iterator, Union
from typing import Tuple
from thinc.api import Model from thinc.api import Model
from pathlib import Path
from .pipe import Pipe from .pipe import Pipe
from ..errors import Errors from ..errors import Errors
from ..language import Language from ..language import Language
from ..training import Example
from ..lookups import Lookups, load_lookups from ..lookups import Lookups, load_lookups
from ..scorer import Scorer from ..scorer import Scorer
from ..tokens import Doc, Token from ..tokens import Doc, Token
from ..vocab import Vocab from ..vocab import Vocab
from ..training import validate_examples from ..training import validate_examples
from ..util import logger, SimpleFrozenList
from .. import util from .. import util
@Language.factory( @Language.factory(
"lemmatizer", "lemmatizer",
assigns=["token.lemma"], assigns=["token.lemma"],
default_config={ default_config={"model": None, "mode": "lookup", "overwrite": False},
"model": None,
"mode": "lookup",
"lookups": None,
"overwrite": False,
},
default_score_weights={"lemma_acc": 1.0}, default_score_weights={"lemma_acc": 1.0},
) )
def make_lemmatizer( def make_lemmatizer(
@ -28,13 +27,9 @@ def make_lemmatizer(
model: Optional[Model], model: Optional[Model],
name: str, name: str,
mode: str, mode: str,
lookups: Optional[Lookups],
overwrite: bool = False, overwrite: bool = False,
): ):
lookups = Lemmatizer.load_lookups(nlp.lang, mode, lookups) return Lemmatizer(nlp.vocab, model, name, mode=mode, overwrite=overwrite)
return Lemmatizer(
nlp.vocab, model, name, mode=mode, lookups=lookups, overwrite=overwrite
)
class Lemmatizer(Pipe): class Lemmatizer(Pipe):
@ -46,59 +41,19 @@ class Lemmatizer(Pipe):
""" """
@classmethod @classmethod
def get_lookups_config(cls, mode: str) -> Dict: def get_lookups_config(cls, mode: str) -> Tuple[List[str], List[str]]:
"""Returns the lookups configuration settings for a given mode for use """Returns the lookups configuration settings for a given mode for use
in Lemmatizer.load_lookups. in Lemmatizer.load_lookups.
mode (str): The lemmatizer mode. mode (str): The lemmatizer mode.
RETURNS (dict): The lookups configuration settings for this mode. RETURNS (Tuple[List[str], List[str]]): The required and optional
lookup tables for this mode.
DOCS: https://nightly.spacy.io/api/lemmatizer#get_lookups_config
""" """
if mode == "lookup": if mode == "lookup":
return { return (["lemma_lookup"], [])
"required_tables": ["lemma_lookup"],
}
elif mode == "rule": elif mode == "rule":
return { return (["lemma_rules"], ["lemma_exc", "lemma_index"])
"required_tables": ["lemma_rules"], return ([], [])
"optional_tables": ["lemma_exc", "lemma_index"],
}
return {}
@classmethod
def load_lookups(cls, lang: str, mode: str, lookups: Optional[Lookups],) -> Lookups:
"""Load and validate lookups tables. If the provided lookups is None,
load the default lookups tables according to the language and mode
settings. Confirm that all required tables for the language and mode
are present.
lang (str): The language code.
mode (str): The lemmatizer mode.
lookups (Lookups): The provided lookups, may be None if the default
lookups should be loaded.
RETURNS (Lookups): The Lookups object.
DOCS: https://nightly.spacy.io/api/lemmatizer#get_lookups_config
"""
config = cls.get_lookups_config(mode)
required_tables = config.get("required_tables", [])
optional_tables = config.get("optional_tables", [])
if lookups is None:
lookups = load_lookups(lang=lang, tables=required_tables)
optional_lookups = load_lookups(
lang=lang, tables=optional_tables, strict=False
)
for table in optional_lookups.tables:
lookups.set_table(table, optional_lookups.get_table(table))
for table in required_tables:
if table not in lookups:
raise ValueError(
Errors.E1004.format(
mode=mode, tables=required_tables, found=lookups.tables
)
)
return lookups
def __init__( def __init__(
self, self,
@ -107,7 +62,6 @@ class Lemmatizer(Pipe):
name: str = "lemmatizer", name: str = "lemmatizer",
*, *,
mode: str = "lookup", mode: str = "lookup",
lookups: Optional[Lookups] = None,
overwrite: bool = False, overwrite: bool = False,
) -> None: ) -> None:
"""Initialize a Lemmatizer. """Initialize a Lemmatizer.
@ -116,9 +70,6 @@ class Lemmatizer(Pipe):
model (Model): A model (not yet implemented). model (Model): A model (not yet implemented).
name (str): The component name. Defaults to "lemmatizer". name (str): The component name. Defaults to "lemmatizer".
mode (str): The lemmatizer mode: "lookup", "rule". Defaults to "lookup". mode (str): The lemmatizer mode: "lookup", "rule". Defaults to "lookup".
lookups (Lookups): The lookups object containing the (optional) tables
such as "lemma_rules", "lemma_index", "lemma_exc" and
"lemma_lookup". Defaults to None
overwrite (bool): Whether to overwrite existing lemmas. Defaults to overwrite (bool): Whether to overwrite existing lemmas. Defaults to
`False`. `False`.
@ -128,8 +79,9 @@ class Lemmatizer(Pipe):
self.model = model self.model = model
self.name = name self.name = name
self._mode = mode self._mode = mode
self.lookups = lookups if lookups is not None else Lookups() self.lookups = Lookups()
self.overwrite = overwrite self.overwrite = overwrite
self._validated = False
if self.mode == "lookup": if self.mode == "lookup":
self.lemmatize = self.lookup_lemmatize self.lemmatize = self.lookup_lemmatize
elif self.mode == "rule": elif self.mode == "rule":
@ -153,12 +105,56 @@ class Lemmatizer(Pipe):
DOCS: https://nightly.spacy.io/api/lemmatizer#call DOCS: https://nightly.spacy.io/api/lemmatizer#call
""" """
if not self._validated:
self._validate_tables(Errors.E1004)
for token in doc: for token in doc:
if self.overwrite or token.lemma == 0: if self.overwrite or token.lemma == 0:
token.lemma_ = self.lemmatize(token)[0] token.lemma_ = self.lemmatize(token)[0]
return doc return doc
def pipe(self, stream, *, batch_size=128): def initialize(
self,
get_examples: Optional[Callable[[], Iterable[Example]]] = None,
*,
nlp: Optional[Language] = None,
lookups: Optional[Lookups] = None,
):
"""Initialize the lemmatizer and load in data.
get_examples (Callable[[], Iterable[Example]]): Function that
returns a representative sample of gold-standard Example objects.
nlp (Language): The current nlp object the component is part of.
lookups (Lookups): The lookups object containing the (optional) tables
such as "lemma_rules", "lemma_index", "lemma_exc" and
"lemma_lookup". Defaults to None.
"""
required_tables, optional_tables = self.get_lookups_config(self.mode)
if lookups is None:
logger.debug("Lemmatizer: loading tables from spacy-lookups-data")
lookups = load_lookups(lang=self.vocab.lang, tables=required_tables)
optional_lookups = load_lookups(
lang=self.vocab.lang, tables=optional_tables, strict=False
)
for table in optional_lookups.tables:
lookups.set_table(table, optional_lookups.get_table(table))
self.lookups = lookups
self._validate_tables(Errors.E1004)
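A minimal sketch of the new initialization path with tables supplied directly instead of being loaded from spacy-lookups-data; the table contents are toy data:

import spacy
from spacy.lookups import Lookups

nlp = spacy.blank("en")
lemmatizer = nlp.add_pipe("lemmatizer", config={"mode": "lookup"})
lookups = Lookups()
lookups.add_table("lemma_lookup", {"dogs": "dog", "ran": "run"})
lemmatizer.initialize(lookups=lookups)
# Looks up "dogs" -> "dog" and "ran" -> "run" from the toy table.
print([t.lemma_ for t in nlp("dogs ran")])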
def _validate_tables(self, error_message: str = Errors.E912) -> None:
"""Check that the lookups are correct for the current mode."""
required_tables, optional_tables = self.get_lookups_config(self.mode)
for table in required_tables:
if table not in self.lookups:
raise ValueError(
error_message.format(
mode=self.mode,
tables=required_tables,
found=self.lookups.tables,
)
)
self._validated = True
def pipe(self, stream: Iterable[Doc], *, batch_size: int = 128) -> Iterator[Doc]:
"""Apply the pipe to a stream of documents. This usually happens under """Apply the pipe to a stream of documents. This usually happens under
the hood when the nlp object is called on a text and all components are the hood when the nlp object is called on a text and all components are
applied to the Doc. applied to the Doc.
@ -263,7 +259,7 @@ class Lemmatizer(Pipe):
""" """
return False return False
def score(self, examples, **kwargs) -> Dict[str, Any]: def score(self, examples: Iterable[Example], **kwargs) -> Dict[str, Any]:
"""Score a batch of examples. """Score a batch of examples.
examples (Iterable[Example]): The examples to score. examples (Iterable[Example]): The examples to score.
@ -274,58 +270,66 @@ class Lemmatizer(Pipe):
validate_examples(examples, "Lemmatizer.score") validate_examples(examples, "Lemmatizer.score")
return Scorer.score_token_attr(examples, "lemma", **kwargs) return Scorer.score_token_attr(examples, "lemma", **kwargs)
def to_disk(self, path, *, exclude=tuple()): def to_disk(
"""Save the current state to a directory. self, path: Union[str, Path], *, exclude: Iterable[str] = SimpleFrozenList()
):
"""Serialize the pipe to disk.
path (unicode or Path): A path to a directory, which will be created if path (str / Path): Path to a directory.
it doesn't exist. exclude (Iterable[str]): String names of serialization fields to exclude.
exclude (list): String names of serialization fields to exclude.
DOCS: https://nightly.spacy.io/api/vocab#to_disk DOCS: https://nightly.spacy.io/api/lemmatizer#to_disk
""" """
serialize = {} serialize = {}
serialize["vocab"] = lambda p: self.vocab.to_disk(p) serialize["vocab"] = lambda p: self.vocab.to_disk(p)
serialize["lookups"] = lambda p: self.lookups.to_disk(p) serialize["lookups"] = lambda p: self.lookups.to_disk(p)
util.to_disk(path, serialize, exclude) util.to_disk(path, serialize, exclude)
def from_disk(self, path, *, exclude=tuple()): def from_disk(
"""Loads state from a directory. Modifies the object in place and self, path: Union[str, Path], *, exclude: Iterable[str] = SimpleFrozenList()
returns it. ) -> "Lemmatizer":
"""Load the pipe from disk. Modifies the object in place and returns it.
path (unicode or Path): A path to a directory. path (str / Path): Path to a directory.
exclude (list): String names of serialization fields to exclude. exclude (Iterable[str]): String names of serialization fields to exclude.
RETURNS (Vocab): The modified `Vocab` object. RETURNS (Lemmatizer): The modified Lemmatizer object.
DOCS: https://nightly.spacy.io/api/vocab#to_disk DOCS: https://nightly.spacy.io/api/lemmatizer#from_disk
""" """
deserialize = {} deserialize = {}
deserialize["vocab"] = lambda p: self.vocab.from_disk(p) deserialize["vocab"] = lambda p: self.vocab.from_disk(p)
deserialize["lookups"] = lambda p: self.lookups.from_disk(p) deserialize["lookups"] = lambda p: self.lookups.from_disk(p)
util.from_disk(path, deserialize, exclude) util.from_disk(path, deserialize, exclude)
self._validate_tables()
return self
def to_bytes(self, *, exclude=tuple()) -> bytes: def to_bytes(self, *, exclude: Iterable[str] = SimpleFrozenList()) -> bytes:
"""Serialize the current state to a binary string. """Serialize the pipe to a bytestring.
exclude (list): String names of serialization fields to exclude. exclude (Iterable[str]): String names of serialization fields to exclude.
RETURNS (bytes): The serialized form of the `Vocab` object. RETURNS (bytes): The serialized object.
DOCS: https://nightly.spacy.io/api/vocab#to_bytes DOCS: https://nightly.spacy.io/api/lemmatizer#to_bytes
""" """
serialize = {} serialize = {}
serialize["vocab"] = self.vocab.to_bytes serialize["vocab"] = self.vocab.to_bytes
serialize["lookups"] = self.lookups.to_bytes serialize["lookups"] = self.lookups.to_bytes
return util.to_bytes(serialize, exclude) return util.to_bytes(serialize, exclude)
def from_bytes(self, bytes_data: bytes, *, exclude=tuple()): def from_bytes(
"""Load state from a binary string. self, bytes_data: bytes, *, exclude: Iterable[str] = SimpleFrozenList()
) -> "Lemmatizer":
"""Load the pipe from a bytestring.
bytes_data (bytes): The data to load from. bytes_data (bytes): The serialized pipe.
exclude (list): String names of serialization fields to exclude. exclude (Iterable[str]): String names of serialization fields to exclude.
RETURNS (Vocab): The `Vocab` object. RETURNS (Lemmatizer): The loaded Lemmatizer.
DOCS: https://nightly.spacy.io/api/vocab#from_bytes DOCS: https://nightly.spacy.io/api/lemmatizer#from_bytes
""" """
deserialize = {} deserialize = {}
deserialize["vocab"] = lambda b: self.vocab.from_bytes(b) deserialize["vocab"] = lambda b: self.vocab.from_bytes(b)
deserialize["lookups"] = lambda b: self.lookups.from_bytes(b) deserialize["lookups"] = lambda b: self.lookups.from_bytes(b)
util.from_bytes(bytes_data, deserialize, exclude) util.from_bytes(bytes_data, deserialize, exclude)
self._validate_tables()
return self

View File

@ -1,5 +1,5 @@
# cython: infer_types=True, profile=True, binding=True # cython: infer_types=True, profile=True, binding=True
from typing import Optional from typing import Optional, Union, Dict
import srsly import srsly
from thinc.api import SequenceCategoricalCrossentropy, Model, Config from thinc.api import SequenceCategoricalCrossentropy, Model, Config
from itertools import islice from itertools import islice
@ -101,6 +101,11 @@ class Morphologizer(Tagger):
"""RETURNS (Tuple[str]): The labels currently added to the component.""" """RETURNS (Tuple[str]): The labels currently added to the component."""
return tuple(self.cfg["labels_morph"].keys()) return tuple(self.cfg["labels_morph"].keys())
@property
def label_data(self) -> Dict[str, Dict[str, Union[str, float, int, None]]]:
"""A dictionary with all labels data."""
return {"morph": self.cfg["labels_morph"], "pos": self.cfg["labels_pos"]}
def add_label(self, label): def add_label(self, label):
"""Add a new label to the pipe. """Add a new label to the pipe.
@ -129,27 +134,22 @@ class Morphologizer(Tagger):
self.cfg["labels_pos"][norm_label] = POS_IDS[pos] self.cfg["labels_pos"][norm_label] = POS_IDS[pos]
return 1 return 1
def begin_training(self, get_examples, *, pipeline=None, sgd=None): def initialize(self, get_examples, *, nlp=None):
"""Initialize the pipe for training, using a representative set """Initialize the pipe for training, using a representative set
of data examples. of data examples.
get_examples (Callable[[], Iterable[Example]]): Function that get_examples (Callable[[], Iterable[Example]]): Function that
returns a representative sample of gold-standard Example objects. returns a representative sample of gold-standard Example objects.
pipeline (List[Tuple[str, Callable]]): Optional list of pipeline nlp (Language): The current nlp object the component is part of.
components that this component is part of. Corresponds to
nlp.pipeline.
sgd (thinc.api.Optimizer): Optional optimizer. Will be created with
create_optimizer if it doesn't exist.
RETURNS (thinc.api.Optimizer): The optimizer.
DOCS: https://nightly.spacy.io/api/morphologizer#begin_training DOCS: https://nightly.spacy.io/api/morphologizer#initialize
""" """
self._ensure_examples(get_examples) self._ensure_examples(get_examples)
# First, fetch all labels from the data # First, fetch all labels from the data
for example in get_examples(): for example in get_examples():
for i, token in enumerate(example.reference): for i, token in enumerate(example.reference):
pos = token.pos_ pos = token.pos_
morph = token.morph_ morph = str(token.morph)
# create and add the combined morph+POS label # create and add the combined morph+POS label
morph_dict = Morphology.feats_to_dict(morph) morph_dict = Morphology.feats_to_dict(morph)
if pos: if pos:
@ -167,7 +167,7 @@ class Morphologizer(Tagger):
gold_array = [] gold_array = []
for i, token in enumerate(example.reference): for i, token in enumerate(example.reference):
pos = token.pos_ pos = token.pos_
morph = token.morph_ morph = str(token.morph)
morph_dict = Morphology.feats_to_dict(morph) morph_dict = Morphology.feats_to_dict(morph)
if pos: if pos:
morph_dict[self.POS_FEAT] = pos morph_dict[self.POS_FEAT] = pos
@ -178,9 +178,6 @@ class Morphologizer(Tagger):
assert len(doc_sample) > 0, Errors.E923.format(name=self.name) assert len(doc_sample) > 0, Errors.E923.format(name=self.name)
assert len(label_sample) > 0, Errors.E923.format(name=self.name) assert len(label_sample) > 0, Errors.E923.format(name=self.name)
self.model.initialize(X=doc_sample, Y=label_sample) self.model.initialize(X=doc_sample, Y=label_sample)
if sgd is None:
sgd = self.create_optimizer()
return sgd
def set_annotations(self, docs, batch_tag_ids): def set_annotations(self, docs, batch_tag_ids):
"""Modify a batch of documents, using pre-computed scores. """Modify a batch of documents, using pre-computed scores.

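A small sketch of the attribute change picked up here: token.morph_ is replaced by the MorphAnalysis object, read as str(token.morph). The helper name is hypothetical:

from spacy.morphology import Morphology
from spacy.tokens import Doc

def morph_label_dicts(doc: Doc):
    # str(token.morph) gives the FEATS string previously exposed as token.morph_
    return [Morphology.feats_to_dict(str(token.morph)) for token in doc]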
View File

@ -81,7 +81,7 @@ class MultitaskObjective(Tagger):
def set_annotations(self, docs, dep_ids): def set_annotations(self, docs, dep_ids):
pass pass
def begin_training(self, get_examples, pipeline=None, sgd=None): def initialize(self, get_examples, nlp=None):
if not hasattr(get_examples, "__call__"): if not hasattr(get_examples, "__call__"):
err = Errors.E930.format(name="MultitaskObjective", obj=type(get_examples)) err = Errors.E930.format(name="MultitaskObjective", obj=type(get_examples))
raise ValueError(err) raise ValueError(err)
@ -91,9 +91,6 @@ class MultitaskObjective(Tagger):
if label is not None and label not in self.labels: if label is not None and label not in self.labels:
self.labels[label] = len(self.labels) self.labels[label] = len(self.labels)
self.model.initialize() # TODO: fix initialization by defining X and Y self.model.initialize() # TODO: fix initialization by defining X and Y
if sgd is None:
sgd = self.create_optimizer()
return sgd
def predict(self, docs): def predict(self, docs):
tokvecs = self.model.get_ref("tok2vec")(docs) tokvecs = self.model.get_ref("tok2vec")(docs)
@ -177,13 +174,10 @@ class ClozeMultitask(Pipe):
def set_annotations(self, docs, dep_ids): def set_annotations(self, docs, dep_ids):
pass pass
def begin_training(self, get_examples, pipeline=None, sgd=None): def initialize(self, get_examples, nlp=None):
self.model.initialize() # TODO: fix initialization by defining X and Y self.model.initialize() # TODO: fix initialization by defining X and Y
X = self.model.ops.alloc((5, self.model.get_ref("tok2vec").get_dim("nO"))) X = self.model.ops.alloc((5, self.model.get_ref("tok2vec").get_dim("nO")))
self.model.output_layer.begin_training(X) self.model.output_layer.initialize(X)
if sgd is None:
sgd = self.create_optimizer()
return sgd
def predict(self, docs): def predict(self, docs):
tokvecs = self.model.get_ref("tok2vec")(docs) tokvecs = self.model.get_ref("tok2vec")(docs)

View File

@ -96,14 +96,14 @@ cdef class EntityRecognizer(Parser):
"""Register another component as a multi-task objective. Experimental.""" """Register another component as a multi-task objective. Experimental."""
self._multitasks.append(mt_component) self._multitasks.append(mt_component)
def init_multitask_objectives(self, get_examples, pipeline, sgd=None, **cfg): def init_multitask_objectives(self, get_examples, nlp=None, **cfg):
"""Setup multi-task objective components. Experimental and internal.""" """Setup multi-task objective components. Experimental and internal."""
# TODO: transfer self.model.get_ref("tok2vec") to the multitask's model ? # TODO: transfer self.model.get_ref("tok2vec") to the multitask's model ?
for labeller in self._multitasks: for labeller in self._multitasks:
labeller.model.set_dim("nO", len(self.labels)) labeller.model.set_dim("nO", len(self.labels))
if labeller.model.has_ref("output_layer"): if labeller.model.has_ref("output_layer"):
labeller.model.get_ref("output_layer").set_dim("nO", len(self.labels)) labeller.model.get_ref("output_layer").set_dim("nO", len(self.labels))
labeller.begin_training(get_examples, pipeline=pipeline) labeller.initialize(get_examples, nlp=nlp)
@property @property
def labels(self): def labels(self):

View File

@ -1,4 +1,5 @@
# cython: infer_types=True, profile=True # cython: infer_types=True, profile=True
from typing import Optional, Tuple
import srsly import srsly
from thinc.api import set_dropout_rate, Model from thinc.api import set_dropout_rate, Model
@ -32,6 +33,17 @@ cdef class Pipe:
self.name = name self.name = name
self.cfg = dict(cfg) self.cfg = dict(cfg)
@property
def labels(self) -> Optional[Tuple[str]]:
return []
@property
def label_data(self):
"""Optional JSON-serializable data that would be sufficient to recreate
the label set if provided to the `pipe.initialize()` method.
"""
return None
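A hedged sketch of how the new labels / label_data properties might be consumed, for example to snapshot label sets for later initialize(..., labels=...) calls; the helper name is hypothetical:

def collect_label_data(nlp):
    # Gather JSON-serializable label data from every component that exposes it.
    return {
        name: pipe.label_data
        for name, pipe in nlp.pipeline
        if getattr(pipe, "label_data", None) is not None
    }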
def __call__(self, Doc doc): def __call__(self, Doc doc):
"""Apply the pipe to one document. The document is modified in place, """Apply the pipe to one document. The document is modified in place,
and returned. This usually happens under the hood when the nlp object and returned. This usually happens under the hood when the nlp object
@ -183,7 +195,7 @@ cdef class Pipe:
""" """
return util.create_default_optimizer() return util.create_default_optimizer()
def begin_training(self, get_examples, *, pipeline=None, sgd=None): def initialize(self, get_examples, *, nlp=None):
"""Initialize the pipe for training, using data examples if available. """Initialize the pipe for training, using data examples if available.
This method needs to be implemented by each Pipe component, This method needs to be implemented by each Pipe component,
ensuring the internal model (if available) is initialized properly ensuring the internal model (if available) is initialized properly
@ -191,16 +203,11 @@ cdef class Pipe:
get_examples (Callable[[], Iterable[Example]]): Function that get_examples (Callable[[], Iterable[Example]]): Function that
returns a representative sample of gold-standard Example objects. returns a representative sample of gold-standard Example objects.
pipeline (List[Tuple[str, Callable]]): Optional list of pipeline nlp (Language): The current nlp object the component is part of.
components that this component is part of. Corresponds to
nlp.pipeline.
sgd (thinc.api.Optimizer): Optional optimizer. Will be created with
create_optimizer if it doesn't exist.
RETURNS (thinc.api.Optimizer): The optimizer.
DOCS: https://nightly.spacy.io/api/pipe#begin_training DOCS: https://nightly.spacy.io/api/pipe#initialize
""" """
raise NotImplementedError(Errors.E931.format(method="add_label", name=self.name)) pass
def _ensure_examples(self, get_examples): def _ensure_examples(self, get_examples):
if get_examples is None or not hasattr(get_examples, "__call__"): if get_examples is None or not hasattr(get_examples, "__call__"):

View File

@ -58,7 +58,7 @@ class Sentencizer(Pipe):
else: else:
self.punct_chars = set(self.default_punct_chars) self.punct_chars = set(self.default_punct_chars)
def begin_training(self, get_examples, pipeline=None, sgd=None): def initialize(self, get_examples, nlp=None):
pass pass
def __call__(self, doc): def __call__(self, doc):

View File

@ -71,6 +71,10 @@ class SentenceRecognizer(Tagger):
# are 0 # are 0
return tuple(["I", "S"]) return tuple(["I", "S"])
@property
def label_data(self):
return self.labels
def set_annotations(self, docs, batch_tag_ids): def set_annotations(self, docs, batch_tag_ids):
"""Modify a batch of documents, using pre-computed scores. """Modify a batch of documents, using pre-computed scores.
@ -124,20 +128,15 @@ class SentenceRecognizer(Tagger):
raise ValueError("nan value when computing loss") raise ValueError("nan value when computing loss")
return float(loss), d_scores return float(loss), d_scores
def begin_training(self, get_examples, *, pipeline=None, sgd=None): def initialize(self, get_examples, *, nlp=None):
"""Initialize the pipe for training, using a representative set """Initialize the pipe for training, using a representative set
of data examples. of data examples.
get_examples (Callable[[], Iterable[Example]]): Function that get_examples (Callable[[], Iterable[Example]]): Function that
returns a representative sample of gold-standard Example objects. returns a representative sample of gold-standard Example objects.
pipeline (List[Tuple[str, Callable]]): Optional list of pipeline nlp (Language): The current nlp object the component is part of.
components that this component is part of. Corresponds to
nlp.pipeline.
sgd (thinc.api.Optimizer): Optional optimizer. Will be created with
create_optimizer if it doesn't exist.
RETURNS (thinc.api.Optimizer): The optimizer.
DOCS: https://nightly.spacy.io/api/sentencerecognizer#begin_training DOCS: https://nightly.spacy.io/api/sentencerecognizer#initialize
""" """
self._ensure_examples(get_examples) self._ensure_examples(get_examples)
doc_sample = [] doc_sample = []
@ -151,9 +150,6 @@ class SentenceRecognizer(Tagger):
assert len(doc_sample) > 0, Errors.E923.format(name=self.name) assert len(doc_sample) > 0, Errors.E923.format(name=self.name)
assert len(label_sample) > 0, Errors.E923.format(name=self.name) assert len(label_sample) > 0, Errors.E923.format(name=self.name)
self.model.initialize(X=doc_sample, Y=label_sample) self.model.initialize(X=doc_sample, Y=label_sample)
if sgd is None:
sgd = self.create_optimizer()
return sgd
def add_label(self, label, values=None): def add_label(self, label, values=None):
raise NotImplementedError raise NotImplementedError

View File

@ -90,6 +90,11 @@ class Tagger(Pipe):
""" """
return tuple(self.cfg["labels"]) return tuple(self.cfg["labels"])
@property
def label_data(self):
"""Data about the labels currently added to the component."""
return tuple(self.cfg["labels"])
def __call__(self, doc): def __call__(self, doc):
"""Apply the pipe to a Doc. """Apply the pipe to a Doc.
@ -256,31 +261,33 @@ class Tagger(Pipe):
raise ValueError("nan value when computing loss") raise ValueError("nan value when computing loss")
return float(loss), d_scores return float(loss), d_scores
def begin_training(self, get_examples, *, pipeline=None, sgd=None): def initialize(self, get_examples, *, nlp=None, labels=None):
"""Initialize the pipe for training, using a representative set """Initialize the pipe for training, using a representative set
of data examples. of data examples.
get_examples (Callable[[], Iterable[Example]]): Function that get_examples (Callable[[], Iterable[Example]]): Function that
returns a representative sample of gold-standard Example objects. returns a representative sample of gold-standard Example objects.
pipeline (List[Tuple[str, Callable]]): Optional list of pipeline nlp (Language): The current nlp object the component is part of.
components that this component is part of. Corresponds to labels: The labels to add to the component, typically generated by the
nlp.pipeline. `init labels` command. If no labels are provided, the get_examples
sgd (thinc.api.Optimizer): Optional optimizer. Will be created with callback is used to extract the labels from the data.
create_optimizer if it doesn't exist.
RETURNS (thinc.api.Optimizer): The optimizer.
DOCS: https://nightly.spacy.io/api/tagger#begin_training DOCS: https://nightly.spacy.io/api/tagger#initialize
""" """
self._ensure_examples(get_examples) self._ensure_examples(get_examples)
if labels is not None:
for tag in labels:
self.add_label(tag)
else:
tags = set()
for example in get_examples():
for token in example.y:
if token.tag_:
tags.add(token.tag_)
for tag in sorted(tags):
self.add_label(tag)
doc_sample = [] doc_sample = []
label_sample = [] label_sample = []
tags = set()
for example in get_examples():
for token in example.y:
if token.tag_:
tags.add(token.tag_)
for tag in sorted(tags):
self.add_label(tag)
for example in islice(get_examples(), 10): for example in islice(get_examples(), 10):
doc_sample.append(example.x) doc_sample.append(example.x)
gold_tags = example.get_aligned("TAG", as_string=True) gold_tags = example.get_aligned("TAG", as_string=True)
@ -289,9 +296,6 @@ class Tagger(Pipe):
assert len(doc_sample) > 0, Errors.E923.format(name=self.name) assert len(doc_sample) > 0, Errors.E923.format(name=self.name)
assert len(label_sample) > 0, Errors.E923.format(name=self.name) assert len(label_sample) > 0, Errors.E923.format(name=self.name)
self.model.initialize(X=doc_sample, Y=label_sample) self.model.initialize(X=doc_sample, Y=label_sample)
if sgd is None:
sgd = self.create_optimizer()
return sgd
def add_label(self, label): def add_label(self, label):
"""Add a new label to the pipe. """Add a new label to the pipe.

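A minimal sketch of the new Tagger.initialize signature with toy data; labels can be passed up front (for example from an init labels style dump) or induced from get_examples():

import spacy
from spacy.training import Example

nlp = spacy.blank("en")
tagger = nlp.add_pipe("tagger")
train_examples = [
    Example.from_dict(nlp.make_doc("I like cats"), {"tags": ["PRP", "VBP", "NNS"]})
]
tagger.initialize(lambda: train_examples, nlp=nlp, labels=["PRP", "VBP", "NNS"])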
View File

@ -154,8 +154,16 @@ class TextCategorizer(Pipe):
@labels.setter @labels.setter
def labels(self, value: List[str]) -> None: def labels(self, value: List[str]) -> None:
# TODO: This really shouldn't be here. I had a look and I added it when
# I added the labels property, but it's pretty nasty to have this, and
# will lead to problems.
self.cfg["labels"] = tuple(value) self.cfg["labels"] = tuple(value)
@property
def label_data(self) -> List[str]:
"""RETURNS (List[str]): Information about the component's labels."""
return self.labels
def pipe(self, stream: Iterable[Doc], *, batch_size: int = 128) -> Iterator[Doc]: def pipe(self, stream: Iterable[Doc], *, batch_size: int = 128) -> Iterator[Doc]:
"""Apply the pipe to a stream of documents. This usually happens under """Apply the pipe to a stream of documents. This usually happens under
the hood when the nlp object is called on a text and all components are the hood when the nlp object is called on a text and all components are
@ -334,43 +342,40 @@ class TextCategorizer(Pipe):
self.labels = tuple(list(self.labels) + [label]) self.labels = tuple(list(self.labels) + [label])
return 1 return 1
def begin_training( def initialize(
self, self,
get_examples: Callable[[], Iterable[Example]], get_examples: Callable[[], Iterable[Example]],
*, *,
pipeline: Optional[List[Tuple[str, Callable[[Doc], Doc]]]] = None, nlp: Optional[Language] = None,
sgd: Optional[Optimizer] = None, labels: Optional[Dict] = None,
) -> Optimizer: ):
"""Initialize the pipe for training, using a representative set """Initialize the pipe for training, using a representative set
of data examples. of data examples.
get_examples (Callable[[], Iterable[Example]]): Function that get_examples (Callable[[], Iterable[Example]]): Function that
returns a representative sample of gold-standard Example objects. returns a representative sample of gold-standard Example objects.
pipeline (List[Tuple[str, Callable]]): Optional list of pipeline nlp (Language): The current nlp object the component is part of.
components that this component is part of. Corresponds to labels: The labels to add to the component, typically generated by the
nlp.pipeline. `init labels` command. If no labels are provided, the get_examples
sgd (thinc.api.Optimizer): Optional optimizer. Will be created with callback is used to extract the labels from the data.
create_optimizer if it doesn't exist.
RETURNS (thinc.api.Optimizer): The optimizer.
DOCS: https://nightly.spacy.io/api/textcategorizer#begin_training DOCS: https://nightly.spacy.io/api/textcategorizer#initialize
""" """
self._ensure_examples(get_examples) self._ensure_examples(get_examples)
subbatch = [] # Select a subbatch of examples to initialize the model if labels is None:
for example in islice(get_examples(), 10): for example in get_examples():
if len(subbatch) < 2: for cat in example.y.cats:
subbatch.append(example) self.add_label(cat)
for cat in example.y.cats: else:
self.add_label(cat) for label in labels:
self.add_label(label)
subbatch = list(islice(get_examples(), 10))
doc_sample = [eg.reference for eg in subbatch] doc_sample = [eg.reference for eg in subbatch]
label_sample, _ = self._examples_to_truth(subbatch) label_sample, _ = self._examples_to_truth(subbatch)
self._require_labels() self._require_labels()
assert len(doc_sample) > 0, Errors.E923.format(name=self.name) assert len(doc_sample) > 0, Errors.E923.format(name=self.name)
assert len(label_sample) > 0, Errors.E923.format(name=self.name) assert len(label_sample) > 0, Errors.E923.format(name=self.name)
self.model.initialize(X=doc_sample, Y=label_sample) self.model.initialize(X=doc_sample, Y=label_sample)
if sgd is None:
sgd = self.create_optimizer()
return sgd
def score(self, examples: Iterable[Example], **kwargs) -> Dict[str, Any]: def score(self, examples: Iterable[Example], **kwargs) -> Dict[str, Any]:
"""Score a batch of examples. """Score a batch of examples.

View File

@ -1,4 +1,4 @@
from typing import Iterator, Sequence, Iterable, Optional, Dict, Callable, List, Tuple from typing import Iterator, Sequence, Iterable, Optional, Dict, Callable, List
from thinc.api import Model, set_dropout_rate, Optimizer, Config from thinc.api import Model, set_dropout_rate, Optimizer, Config
from itertools import islice from itertools import islice
@ -203,26 +203,20 @@ class Tok2Vec(Pipe):
def get_loss(self, examples, scores) -> None: def get_loss(self, examples, scores) -> None:
pass pass
def begin_training( def initialize(
self, self,
get_examples: Callable[[], Iterable[Example]], get_examples: Callable[[], Iterable[Example]],
*, *,
pipeline: Optional[List[Tuple[str, Callable[[Doc], Doc]]]] = None, nlp: Optional[Language] = None,
sgd: Optional[Optimizer] = None,
): ):
"""Initialize the pipe for training, using a representative set """Initialize the pipe for training, using a representative set
of data examples. of data examples.
get_examples (Callable[[], Iterable[Example]]): Function that get_examples (Callable[[], Iterable[Example]]): Function that
returns a representative sample of gold-standard Example objects. returns a representative sample of gold-standard Example objects.
pipeline (List[Tuple[str, Callable]]): Optional list of pipeline nlp (Language): The current nlp object the component is part of.
components that this component is part of. Corresponds to
nlp.pipeline.
sgd (thinc.api.Optimizer): Optional optimizer. Will be created with
create_optimizer if it doesn't exist.
RETURNS (thinc.api.Optimizer): The optimizer.
DOCS: https://nightly.spacy.io/api/tok2vec#begin_training DOCS: https://nightly.spacy.io/api/tok2vec#initialize
""" """
self._ensure_examples(get_examples) self._ensure_examples(get_examples)
doc_sample = [] doc_sample = []

View File

@ -1,4 +1,4 @@
# cython: infer_types=True, cdivision=True, boundscheck=False # cython: infer_types=True, cdivision=True, boundscheck=False, binding=True
from __future__ import print_function from __future__ import print_function
from cymem.cymem cimport Pool from cymem.cymem cimport Pool
cimport numpy as np cimport numpy as np
@ -7,6 +7,7 @@ from libcpp.vector cimport vector
from libc.string cimport memset from libc.string cimport memset
from libc.stdlib cimport calloc, free from libc.stdlib cimport calloc, free
import random import random
from typing import Optional
import srsly import srsly
from thinc.api import set_dropout_rate from thinc.api import set_dropout_rate
@ -95,6 +96,10 @@ cdef class Parser(Pipe):
class_names = [self.moves.get_class_name(i) for i in range(self.moves.n_moves)] class_names = [self.moves.get_class_name(i) for i in range(self.moves.n_moves)]
return class_names return class_names
@property
def label_data(self):
return self.moves.labels
@property @property
def tok2vec(self): def tok2vec(self):
"""Return the embedding and convolutional layer of the model.""" """Return the embedding and convolutional layer of the model."""
@ -354,7 +359,7 @@ cdef class Parser(Pipe):
# If all weights for an output are 0 in the original model, don't # If all weights for an output are 0 in the original model, don't
# supervise that output. This allows us to add classes. # supervise that output. This allows us to add classes.
loss += (d_scores**2).sum() loss += (d_scores**2).sum()
backprop(d_scores, sgd=sgd) backprop(d_scores)
# Follow the predicted action # Follow the predicted action
self.transition_states(states, guesses) self.transition_states(states, guesses)
states = [state for state in states if not state.is_final()] states = [state for state in states if not state.is_final()]
@@ -405,18 +410,20 @@ cdef class Parser(Pipe):
     def set_output(self, nO):
         self.model.attrs["resize_output"](self.model, nO)
 
-    def begin_training(self, get_examples, pipeline=None, sgd=None, **kwargs):
+    def initialize(self, get_examples, nlp=None, labels=None):
         self._ensure_examples(get_examples)
-        self.cfg.update(kwargs)
         lexeme_norms = self.vocab.lookups.get_table("lexeme_norm", {})
         if len(lexeme_norms) == 0 and self.vocab.lang in util.LEXEME_NORM_LANGS:
             langs = ", ".join(util.LEXEME_NORM_LANGS)
             util.logger.debug(Warnings.W033.format(model="parser or NER", langs=langs))
-        actions = self.moves.get_actions(
-            examples=get_examples(),
-            min_freq=self.cfg['min_action_freq'],
-            learn_tokens=self.cfg["learn_tokens"]
-        )
+        if labels is not None:
+            actions = dict(labels)
+        else:
+            actions = self.moves.get_actions(
+                examples=get_examples(),
+                min_freq=self.cfg['min_action_freq'],
+                learn_tokens=self.cfg["learn_tokens"]
+            )
         for action, labels in self.moves.labels.items():
             actions.setdefault(action, {})
             for label, freq in labels.items():
@@ -425,11 +432,9 @@ cdef class Parser(Pipe):
         self.moves.initialize_actions(actions)
         # make sure we resize so we have an appropriate upper layer
         self._resize()
-        if sgd is None:
-            sgd = self.create_optimizer()
         doc_sample = []
-        if pipeline is not None:
-            for name, component in pipeline:
+        if nlp is not None:
+            for name, component in nlp.pipeline:
                 if component is self:
                     break
                 if hasattr(component, "pipe"):
@@ -441,9 +446,8 @@
                 doc_sample.append(example.predicted)
         assert len(doc_sample) > 0, Errors.E923.format(name=self.name)
         self.model.initialize(doc_sample)
-        if pipeline is not None:
-            self.init_multitask_objectives(get_examples, pipeline, sgd=sgd, **self.cfg)
-        return sgd
+        if nlp is not None:
+            self.init_multitask_objectives(get_examples, nlp.pipeline)
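As a usage sketch for the transition-based pipes: actions are derived from the training examples unless a label dict, such as the one exposed by the new label_data property above or written out by `init labels`, is passed in. The entity label, text and the saved_label_data placeholder below are illustrative, not from this commit.

# Sketch: initializing an untrained NER pipe with the new signature.
import spacy
from spacy.training import Example

nlp = spacy.blank("en")
ner = nlp.add_pipe("ner")
doc = nlp.make_doc("I like London.")
examples = [Example.from_dict(doc, {"entities": [(7, 13, "LOC")]})]

# Derive the actions from the data (the previous begin_training behaviour):
ner.initialize(lambda: examples, nlp=nlp)
# Or reuse a previously saved action/label dict (e.g. some_ner.label_data):
# ner.initialize(lambda: examples, nlp=nlp, labels=saved_label_data)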
def to_disk(self, path, exclude=tuple()): def to_disk(self, path, exclude=tuple()):
serializers = { serializers = {

View File

@ -1,15 +1,17 @@
from typing import Dict, List, Union, Optional, Any, Callable, Type, Tuple from typing import Dict, List, Union, Optional, Any, Callable, Type, Tuple
from typing import Iterable, TypeVar, TYPE_CHECKING from typing import Iterable, TypeVar, TYPE_CHECKING
from enum import Enum from enum import Enum
from pydantic import BaseModel, Field, ValidationError, validator from pydantic import BaseModel, Field, ValidationError, validator, create_model
from pydantic import StrictStr, StrictInt, StrictFloat, StrictBool from pydantic import StrictStr, StrictInt, StrictFloat, StrictBool
from pydantic import root_validator from pydantic.main import ModelMetaclass
from thinc.api import Optimizer, ConfigValidationError
from thinc.config import Promise from thinc.config import Promise
from collections import defaultdict from collections import defaultdict
from thinc.api import Optimizer import inspect
from .attrs import NAMES from .attrs import NAMES
from .lookups import Lookups from .lookups import Lookups
from .util import is_cython_func
if TYPE_CHECKING: if TYPE_CHECKING:
# This lets us add type hints for mypy etc. without causing circular imports # This lets us add type hints for mypy etc. without causing circular imports
@ -44,6 +46,96 @@ def validate(schema: Type[BaseModel], obj: Dict[str, Any]) -> List[str]:
return [f"[{loc}] {', '.join(msg)}" for loc, msg in data.items()] return [f"[{loc}] {', '.join(msg)}" for loc, msg in data.items()]
# Initialization
class ArgSchemaConfig:
extra = "forbid"
arbitrary_types_allowed = True
class ArgSchemaConfigExtra:
extra = "forbid"
arbitrary_types_allowed = True
def get_arg_model(
func: Callable,
*,
exclude: Iterable[str] = tuple(),
name: str = "ArgModel",
strict: bool = True,
) -> ModelMetaclass:
"""Generate a pydantic model for function arguments.
func (Callable): The function to generate the schema for.
exclude (Iterable[str]): Parameter names to ignore.
name (str): Name of created model class.
strict (bool): Don't allow extra arguments if no variable keyword arguments
are allowed on the function.
RETURNS (ModelMetaclass): A pydantic model.
"""
sig_args = {}
try:
sig = inspect.signature(func)
except ValueError:
# Typically happens if the method is part of a Cython module without
# binding=True. Here we just use an empty model that allows everything.
return create_model(name, __config__=ArgSchemaConfigExtra)
has_variable = False
for param in sig.parameters.values():
if param.name in exclude:
continue
if param.kind == param.VAR_KEYWORD:
# The function allows variable keyword arguments, so we shouldn't
# include **kwargs etc. in the schema; instead, switch to non-strict
# mode and pass through all other values
has_variable = True
continue
# If no annotation is specified assume it's anything
annotation = param.annotation if param.annotation != param.empty else Any
# If no default value is specified assume that it's required. Cython
# functions/methods will have param.empty for default value None so we
# need to treat them differently
default_empty = None if is_cython_func(func) else ...
default = param.default if param.default != param.empty else default_empty
sig_args[param.name] = (annotation, default)
is_strict = strict and not has_variable
sig_args["__config__"] = ArgSchemaConfig if is_strict else ArgSchemaConfigExtra
return create_model(name, **sig_args)
def validate_init_settings(
func: Callable,
settings: Dict[str, Any],
*,
section: Optional[str] = None,
name: str = "",
exclude: Iterable[str] = ("get_examples", "nlp"),
) -> Dict[str, Any]:
"""Validate initialization settings against the expected arguments in
the method signature. Will parse values if possible (e.g. int to string)
and return the updated settings dict. Will raise a ConfigValidationError
if types don't match or required values are missing.
func (Callable): The initialize method of a given component etc.
settings (Dict[str, Any]): The settings from the respective [initialize] block.
section (str): Initialize section, for error message.
name (str): Name of the block in the section.
exclude (Iterable[str]): Parameter names to exclude from schema.
RETURNS (Dict[str, Any]): The validated settings.
"""
schema = get_arg_model(func, exclude=exclude, name="InitArgModel")
try:
return schema(**settings).dict()
except ValidationError as e:
block = "initialize" if not section else f"initialize.{section}"
title = f"Error validating initialization settings in [{block}]"
raise ConfigValidationError(
title=title, errors=e.errors(), config=settings, parent=name
) from None
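A brief sketch of how this helper is meant to be called. The initialize function and the settings below are hypothetical; only validate_init_settings and its default exclude list come from the code above, and the import assumes the helper stays in spacy.schemas, the module shown here.

from typing import List, Optional
from spacy.schemas import validate_init_settings

def initialize(get_examples, *, nlp=None, labels: Optional[List[str]] = None):
    ...

# "get_examples" and "nlp" are excluded by default, so only the remaining
# keyword arguments are validated and returned:
settings = validate_init_settings(initialize, {"labels": ["A", "B"]})
assert settings == {"labels": ["A", "B"]}
# An unknown or mistyped setting raises a ConfigValidationError instead:
# validate_init_settings(initialize, {"lables": ["A", "B"]})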
# Matcher token patterns # Matcher token patterns
@ -190,7 +282,7 @@ class ModelMetaSchema(BaseModel):
sources: Optional[Union[List[StrictStr], List[Dict[str, str]]]] = Field(None, title="Training data sources") sources: Optional[Union[List[StrictStr], List[Dict[str, str]]]] = Field(None, title="Training data sources")
vectors: Dict[str, Any] = Field({}, title="Included word vectors") vectors: Dict[str, Any] = Field({}, title="Included word vectors")
labels: Dict[str, List[str]] = Field({}, title="Component labels, keyed by component name") labels: Dict[str, List[str]] = Field({}, title="Component labels, keyed by component name")
performance: Dict[str, Union[float, Dict[str, float]]] = Field({}, title="Accuracy and speed numbers") performance: Dict[str, Union[float, Dict[str, Union[float, dict]]]] = Field({}, title="Accuracy and speed numbers")
spacy_git_version: StrictStr = Field("", title="Commit of spaCy version used") spacy_git_version: StrictStr = Field("", title="Commit of spaCy version used")
# fmt: on # fmt: on
@ -205,8 +297,6 @@ class ModelMetaSchema(BaseModel):
class ConfigSchemaTraining(BaseModel): class ConfigSchemaTraining(BaseModel):
# fmt: off # fmt: off
vectors: Optional[StrictStr] = Field(..., title="Path to vectors")
lookups: Optional[Lookups] = Field(..., title="Vocab lookups")
dev_corpus: StrictStr = Field(..., title="Path in the config to the dev data") dev_corpus: StrictStr = Field(..., title="Path in the config to the dev data")
train_corpus: StrictStr = Field(..., title="Path in the config to the training data") train_corpus: StrictStr = Field(..., title="Path in the config to the training data")
batcher: Batcher = Field(..., title="Batcher for the training data") batcher: Batcher = Field(..., title="Batcher for the training data")
@ -219,8 +309,6 @@ class ConfigSchemaTraining(BaseModel):
gpu_allocator: Optional[StrictStr] = Field(..., title="Memory allocator when running on GPU") gpu_allocator: Optional[StrictStr] = Field(..., title="Memory allocator when running on GPU")
accumulate_gradient: StrictInt = Field(..., title="Whether to divide the batch up into substeps") accumulate_gradient: StrictInt = Field(..., title="Whether to divide the batch up into substeps")
score_weights: Dict[StrictStr, Optional[Union[StrictFloat, StrictInt]]] = Field(..., title="Scores to report and their weights for selecting final model") score_weights: Dict[StrictStr, Optional[Union[StrictFloat, StrictInt]]] = Field(..., title="Scores to report and their weights for selecting final model")
init_tok2vec: Optional[StrictStr] = Field(..., title="Path to pretrained tok2vec weights")
raw_text: Optional[StrictStr] = Field(default=None, title="Raw text")
optimizer: Optimizer = Field(..., title="The optimizer to use") optimizer: Optimizer = Field(..., title="The optimizer to use")
logger: Logger = Field(..., title="The logger to track training progress") logger: Logger = Field(..., title="The logger to track training progress")
frozen_components: List[str] = Field(..., title="Pipeline components that shouldn't be updated during training") frozen_components: List[str] = Field(..., title="Pipeline components that shouldn't be updated during training")
@ -273,36 +361,40 @@ class ConfigSchemaPretrain(BaseModel):
arbitrary_types_allowed = True arbitrary_types_allowed = True
class ConfigSchemaInit(BaseModel):
# fmt: off
vocab_data: Optional[StrictStr] = Field(..., title="Path to JSON-formatted vocabulary file")
lookups: Optional[Lookups] = Field(..., title="Vocabulary lookups, e.g. lexeme normalization")
vectors: Optional[StrictStr] = Field(..., title="Path to vectors")
init_tok2vec: Optional[StrictStr] = Field(..., title="Path to pretrained tok2vec weights")
tokenizer: Dict[StrictStr, Any] = Field(..., help="Arguments to be passed into Tokenizer.initialize")
components: Dict[StrictStr, Dict[StrictStr, Any]] = Field(..., help="Arguments for Pipe.initialize methods of pipeline components, keyed by component")
# fmt: on
class Config:
extra = "forbid"
arbitrary_types_allowed = True
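In practice, the new [initialize] block is filled in on the config and consumed by nlp.initialize(), mirroring the lemmatizer test further down in this commit. The component name, labels and training text below are illustrative assumptions.

# Sketch: per-component settings under initialize.components are validated
# against the component's initialize() signature and forwarded as kwargs.
import spacy
from spacy.training import Example

nlp = spacy.blank("en")
nlp.add_pipe("textcat")
nlp.config["initialize"]["components"]["textcat"] = {
    "labels": ["POSITIVE", "NEGATIVE"],
}
examples = [
    Example.from_dict(nlp.make_doc("so good"), {"cats": {"POSITIVE": 1.0, "NEGATIVE": 0.0}})
]
# Validates the block and calls textcat.initialize(..., labels=["POSITIVE", "NEGATIVE"]):
nlp.initialize(lambda: examples)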
class ConfigSchema(BaseModel): class ConfigSchema(BaseModel):
training: ConfigSchemaTraining training: ConfigSchemaTraining
nlp: ConfigSchemaNlp nlp: ConfigSchemaNlp
pretraining: Union[ConfigSchemaPretrain, ConfigSchemaPretrainEmpty] = {} pretraining: Union[ConfigSchemaPretrain, ConfigSchemaPretrainEmpty] = {}
components: Dict[str, Dict[str, Any]] components: Dict[str, Dict[str, Any]]
corpora: Dict[str, Reader] corpora: Dict[str, Reader]
initialize: ConfigSchemaInit
@root_validator(allow_reuse=True)
def validate_config(cls, values):
"""Perform additional validation for settings with dependencies."""
pt = values.get("pretraining")
if pt and not isinstance(pt, ConfigSchemaPretrainEmpty):
if pt.objective.get("type") == "vectors" and not values["nlp"].vectors:
err = "Need nlp.vectors if pretraining.objective.type is vectors"
raise ValueError(err)
return values
class Config: class Config:
extra = "allow" extra = "allow"
arbitrary_types_allowed = True arbitrary_types_allowed = True
class TrainingSchema(BaseModel): CONFIG_SCHEMAS = {
training: ConfigSchemaTraining "nlp": ConfigSchemaNlp,
pretraining: Union[ConfigSchemaPretrain, ConfigSchemaPretrainEmpty] = {} "training": ConfigSchemaTraining,
corpora: Dict[str, Reader] "pretraining": ConfigSchemaPretrain,
"initialize": ConfigSchemaInit,
class Config: }
extra = "allow"
arbitrary_types_allowed = True
# Project config Schema # Project config Schema

View File

@ -32,9 +32,7 @@ class PRFScore:
def __add__(self, other): def __add__(self, other):
return PRFScore( return PRFScore(
tp=self.tp+other.tp, tp=self.tp + other.tp, fp=self.fp + other.fp, fn=self.fn + other.fn
fp=self.fp+other.fp,
fn=self.fn+other.fn
) )
def score_set(self, cand: set, gold: set) -> None: def score_set(self, cand: set, gold: set) -> None:
@ -485,7 +483,7 @@ class Scorer:
(pred_ent.start_char, pred_ent.end_char), None (pred_ent.start_char, pred_ent.end_char), None
) )
label = gold_span.label_ label = gold_span.label_
if not label in f_per_type: if label not in f_per_type:
f_per_type[label] = PRFScore() f_per_type[label] = PRFScore()
gold = gold_span.kb_id_ gold = gold_span.kb_id_
# only evaluating entities that overlap between gold and pred, # only evaluating entities that overlap between gold and pred,
@ -632,7 +630,6 @@ def get_ner_prf(examples: Iterable[Example]) -> Dict[str, PRFScore]:
continue continue
golds = {(e.label_, e.start, e.end) for e in eg.y.ents} golds = {(e.label_, e.start, e.end) for e in eg.y.ents}
align_x2y = eg.alignment.x2y align_x2y = eg.alignment.x2y
preds = set()
for pred_ent in eg.x.ents: for pred_ent in eg.x.ents:
if pred_ent.label_ not in scores: if pred_ent.label_ not in scores:
scores[pred_ent.label_] = PRFScore() scores[pred_ent.label_] = PRFScore()

View File

@ -272,22 +272,35 @@ def zh_tokenizer_char():
def zh_tokenizer_jieba(): def zh_tokenizer_jieba():
pytest.importorskip("jieba") pytest.importorskip("jieba")
config = { config = {
"@tokenizers": "spacy.zh.ChineseTokenizer", "nlp": {
"segmenter": "jieba", "tokenizer": {
"@tokenizers": "spacy.zh.ChineseTokenizer",
"segmenter": "jieba",
}
}
} }
nlp = get_lang_class("zh").from_config({"nlp": {"tokenizer": config}}) nlp = get_lang_class("zh").from_config(config)
return nlp.tokenizer return nlp.tokenizer
@pytest.fixture(scope="session") @pytest.fixture(scope="session")
def zh_tokenizer_pkuseg(): def zh_tokenizer_pkuseg():
pytest.importorskip("pkuseg") pytest.importorskip("pkuseg")
pytest.importorskip("pickle5")
config = { config = {
"@tokenizers": "spacy.zh.ChineseTokenizer", "nlp": {
"segmenter": "pkuseg", "tokenizer": {
"pkuseg_model": "default", "@tokenizers": "spacy.zh.ChineseTokenizer",
"segmenter": "pkuseg",
}
},
"initialize": {"tokenizer": {
"pkuseg_model": "default",
}
},
} }
nlp = get_lang_class("zh").from_config({"nlp": {"tokenizer": config}}) nlp = get_lang_class("zh").from_config(config)
nlp.initialize()
return nlp.tokenizer return nlp.tokenizer

View File

@ -26,7 +26,7 @@ def test_doc_add_entities_set_ents_iob(en_vocab):
cfg = {"model": DEFAULT_NER_MODEL} cfg = {"model": DEFAULT_NER_MODEL}
model = registry.resolve(cfg, validate=True)["model"] model = registry.resolve(cfg, validate=True)["model"]
ner = EntityRecognizer(en_vocab, model, **config) ner = EntityRecognizer(en_vocab, model, **config)
ner.begin_training(lambda: [_ner_example(ner)]) ner.initialize(lambda: [_ner_example(ner)])
ner(doc) ner(doc)
doc.ents = [("ANIMAL", 3, 4)] doc.ents = [("ANIMAL", 3, 4)]
@ -48,7 +48,7 @@ def test_ents_reset(en_vocab):
cfg = {"model": DEFAULT_NER_MODEL} cfg = {"model": DEFAULT_NER_MODEL}
model = registry.resolve(cfg, validate=True)["model"] model = registry.resolve(cfg, validate=True)["model"]
ner = EntityRecognizer(en_vocab, model, **config) ner = EntityRecognizer(en_vocab, model, **config)
ner.begin_training(lambda: [_ner_example(ner)]) ner.initialize(lambda: [_ner_example(ner)])
ner(doc) ner(doc)
orig_iobs = [t.ent_iob_ for t in doc] orig_iobs = [t.ent_iob_ for t in doc]
doc.ents = list(doc.ents) doc.ents = list(doc.ents)

View File

@ -46,9 +46,9 @@ def test_doc_array_morph(en_vocab):
words = ["Eat", "blue", "ham"] words = ["Eat", "blue", "ham"]
morph = ["Feat=V", "Feat=J", "Feat=N"] morph = ["Feat=V", "Feat=J", "Feat=N"]
doc = Doc(en_vocab, words=words, morphs=morph) doc = Doc(en_vocab, words=words, morphs=morph)
assert morph[0] == doc[0].morph_ assert morph[0] == str(doc[0].morph)
assert morph[1] == doc[1].morph_ assert morph[1] == str(doc[1].morph)
assert morph[2] == doc[2].morph_ assert morph[2] == str(doc[2].morph)
feats_array = doc.to_array((ORTH, MORPH)) feats_array = doc.to_array((ORTH, MORPH))
assert feats_array[0][1] == doc[0].morph.key assert feats_array[0][1] == doc[0].morph.key

View File

@ -19,7 +19,7 @@ def test_doc_api_init(en_vocab):
assert [t.is_sent_start for t in doc] == [True, False, True, False] assert [t.is_sent_start for t in doc] == [True, False, True, False]
# heads override sent_starts # heads override sent_starts
doc = Doc( doc = Doc(
en_vocab, words=words, sent_starts=[True] * 4, heads=heads, deps=["dep"] * 4, en_vocab, words=words, sent_starts=[True] * 4, heads=heads, deps=["dep"] * 4
) )
assert [t.is_sent_start for t in doc] == [True, False, True, False] assert [t.is_sent_start for t in doc] == [True, False, True, False]
@ -319,15 +319,13 @@ def test_doc_from_array_morph(en_vocab):
words = ["I", "live", "in", "New", "York", "."] words = ["I", "live", "in", "New", "York", "."]
morphs = ["Feat1=A", "Feat1=B", "Feat1=C", "Feat1=A|Feat2=D", "Feat2=E", "Feat3=F"] morphs = ["Feat1=A", "Feat1=B", "Feat1=C", "Feat1=A|Feat2=D", "Feat2=E", "Feat3=F"]
# fmt: on # fmt: on
doc = Doc(en_vocab, words=words) doc = Doc(en_vocab, words=words, morphs=morphs)
for i, morph in enumerate(morphs):
doc[i].morph_ = morph
attrs = [MORPH] attrs = [MORPH]
arr = doc.to_array(attrs) arr = doc.to_array(attrs)
new_doc = Doc(en_vocab, words=words) new_doc = Doc(en_vocab, words=words)
new_doc.from_array(attrs, arr) new_doc.from_array(attrs, arr)
assert [t.morph_ for t in new_doc] == morphs assert [str(t.morph) for t in new_doc] == morphs
assert [t.morph_ for t in doc] == [t.morph_ for t in new_doc] assert [str(t.morph) for t in doc] == [str(t.morph) for t in new_doc]
def test_doc_api_from_docs(en_tokenizer, de_tokenizer): def test_doc_api_from_docs(en_tokenizer, de_tokenizer):
@ -423,7 +421,7 @@ def test_has_annotation(en_vocab):
doc[0].tag_ = "A" doc[0].tag_ = "A"
doc[0].pos_ = "X" doc[0].pos_ = "X"
doc[0].morph_ = "Feat=Val" doc[0].set_morph("Feat=Val")
doc[0].lemma_ = "a" doc[0].lemma_ = "a"
doc[0].dep_ = "dep" doc[0].dep_ = "dep"
doc[0].head = doc[1] doc[0].head = doc[1]
@ -435,7 +433,7 @@ def test_has_annotation(en_vocab):
doc[1].tag_ = "A" doc[1].tag_ = "A"
doc[1].pos_ = "X" doc[1].pos_ = "X"
doc[1].morph_ = "" doc[1].set_morph("")
doc[1].lemma_ = "a" doc[1].lemma_ = "a"
doc[1].dep_ = "dep" doc[1].dep_ = "dep"
doc.ents = [Span(doc, 0, 2, label="HELLO")] doc.ents = [Span(doc, 0, 2, label="HELLO")]
@ -533,5 +531,78 @@ def test_doc_ents_setter():
assert [e.label_ for e in doc.ents] == ["HELLO", "WORLD"] assert [e.label_ for e in doc.ents] == ["HELLO", "WORLD"]
vocab = Vocab() vocab = Vocab()
ents = [("HELLO", 0, 2), (vocab.strings.add("WORLD"), 3, 5)] ents = [("HELLO", 0, 2), (vocab.strings.add("WORLD"), 3, 5)]
ents = ["B-HELLO", "I-HELLO", "O", "B-WORLD", "I-WORLD"]
doc = Doc(vocab, words=words, ents=ents) doc = Doc(vocab, words=words, ents=ents)
assert [e.label_ for e in doc.ents] == ["HELLO", "WORLD"] assert [e.label_ for e in doc.ents] == ["HELLO", "WORLD"]
def test_doc_morph_setter(en_tokenizer, de_tokenizer):
doc1 = en_tokenizer("a b")
doc1b = en_tokenizer("c d")
doc2 = de_tokenizer("a b")
# unset values can be copied
doc1[0].morph = doc1[1].morph
assert doc1[0].morph.key == 0
assert doc1[1].morph.key == 0
# morph values from the same vocab can be copied
doc1[0].set_morph("Feat=Val")
doc1[1].morph = doc1[0].morph
assert doc1[0].morph == doc1[1].morph
# ... also across docs
doc1b[0].morph = doc1[0].morph
assert doc1[0].morph == doc1b[0].morph
doc2[0].set_morph("Feat2=Val2")
# the morph value must come from the same vocab
with pytest.raises(ValueError):
doc1[0].morph = doc2[0].morph
def test_doc_init_iob():
"""Test ents validation/normalization in Doc.__init__"""
words = ["a", "b", "c", "d", "e"]
ents = ["O"] * len(words)
doc = Doc(Vocab(), words=words, ents=ents)
assert doc.ents == ()
ents = ["B-PERSON", "I-PERSON", "O", "I-PERSON", "I-PERSON"]
doc = Doc(Vocab(), words=words, ents=ents)
assert len(doc.ents) == 2
ents = ["B-PERSON", "I-PERSON", "O", "I-PERSON", "I-GPE"]
doc = Doc(Vocab(), words=words, ents=ents)
assert len(doc.ents) == 3
# None is missing
ents = ["B-PERSON", "I-PERSON", "O", None, "I-GPE"]
doc = Doc(Vocab(), words=words, ents=ents)
assert len(doc.ents) == 2
# empty tag is missing
ents = ["", "B-PERSON", "O", "B-PERSON", "I-PERSON"]
doc = Doc(Vocab(), words=words, ents=ents)
assert len(doc.ents) == 2
# invalid IOB
ents = ["Q-PERSON", "I-PERSON", "O", "I-PERSON", "I-GPE"]
with pytest.raises(ValueError):
doc = Doc(Vocab(), words=words, ents=ents)
# no dash
ents = ["OPERSON", "I-PERSON", "O", "I-PERSON", "I-GPE"]
with pytest.raises(ValueError):
doc = Doc(Vocab(), words=words, ents=ents)
# no ent type
ents = ["O", "B-", "O", "I-PERSON", "I-GPE"]
with pytest.raises(ValueError):
doc = Doc(Vocab(), words=words, ents=ents)
# not strings or None
ents = [0, "B-", "O", "I-PERSON", "I-GPE"]
with pytest.raises(ValueError):
doc = Doc(Vocab(), words=words, ents=ents)

View File

@ -4,13 +4,13 @@ import pytest
@pytest.fixture @pytest.fixture
def i_has(en_tokenizer): def i_has(en_tokenizer):
doc = en_tokenizer("I has") doc = en_tokenizer("I has")
doc[0].morph_ = {"PronType": "prs"} doc[0].set_morph({"PronType": "prs"})
doc[1].morph_ = { doc[1].set_morph({
"VerbForm": "fin", "VerbForm": "fin",
"Tense": "pres", "Tense": "pres",
"Number": "sing", "Number": "sing",
"Person": "three", "Person": "three",
} })
return doc return doc
@ -47,20 +47,20 @@ def test_morph_get(i_has):
def test_morph_set(i_has): def test_morph_set(i_has):
assert i_has[0].morph.get("PronType") == ["prs"] assert i_has[0].morph.get("PronType") == ["prs"]
# set by string # set by string
i_has[0].morph_ = "PronType=unk" i_has[0].set_morph("PronType=unk")
assert i_has[0].morph.get("PronType") == ["unk"] assert i_has[0].morph.get("PronType") == ["unk"]
# set by string, fields are alphabetized # set by string, fields are alphabetized
i_has[0].morph_ = "PronType=123|NounType=unk" i_has[0].set_morph("PronType=123|NounType=unk")
assert i_has[0].morph_ == "NounType=unk|PronType=123" assert str(i_has[0].morph) == "NounType=unk|PronType=123"
# set by dict # set by dict
i_has[0].morph_ = {"AType": "123", "BType": "unk"} i_has[0].set_morph({"AType": "123", "BType": "unk"})
assert i_has[0].morph_ == "AType=123|BType=unk" assert str(i_has[0].morph) == "AType=123|BType=unk"
# set by string with multiple values, fields and values are alphabetized # set by string with multiple values, fields and values are alphabetized
i_has[0].morph_ = "BType=c|AType=b,a" i_has[0].set_morph("BType=c|AType=b,a")
assert i_has[0].morph_ == "AType=a,b|BType=c" assert str(i_has[0].morph) == "AType=a,b|BType=c"
# set by dict with multiple values, fields and values are alphabetized # set by dict with multiple values, fields and values are alphabetized
i_has[0].morph_ = {"AType": "b,a", "BType": "c"} i_has[0].set_morph({"AType": "b,a", "BType": "c"})
assert i_has[0].morph_ == "AType=a,b|BType=c" assert str(i_has[0].morph) == "AType=a,b|BType=c"
def test_morph_str(i_has): def test_morph_str(i_has):
@ -72,25 +72,25 @@ def test_morph_property(tokenizer):
doc = tokenizer("a dog") doc = tokenizer("a dog")
# set through token.morph_ # set through token.morph_
doc[0].morph_ = "PronType=prs" doc[0].set_morph("PronType=prs")
assert doc[0].morph_ == "PronType=prs" assert str(doc[0].morph) == "PronType=prs"
assert doc.to_array(["MORPH"])[0] != 0 assert doc.to_array(["MORPH"])[0] != 0
# unset with token.morph # unset with token.morph
doc[0].morph = 0 doc[0].set_morph(None)
assert doc.to_array(["MORPH"])[0] == 0 assert doc.to_array(["MORPH"])[0] == 0
# empty morph is equivalent to "_" # empty morph is equivalent to "_"
doc[0].morph_ = "" doc[0].set_morph("")
assert doc[0].morph_ == "" assert str(doc[0].morph) == ""
assert doc.to_array(["MORPH"])[0] == tokenizer.vocab.strings["_"] assert doc.to_array(["MORPH"])[0] == tokenizer.vocab.strings["_"]
# "_" morph is also equivalent to empty morph # "_" morph is also equivalent to empty morph
doc[0].morph_ = "_" doc[0].set_morph("_")
assert doc[0].morph_ == "" assert str(doc[0].morph) == ""
assert doc.to_array(["MORPH"])[0] == tokenizer.vocab.strings["_"] assert doc.to_array(["MORPH"])[0] == tokenizer.vocab.strings["_"]
# set through existing hash with token.morph # set through existing hash with token.morph
tokenizer.vocab.strings.add("Feat=Val") tokenizer.vocab.strings.add("Feat=Val")
doc[0].morph = tokenizer.vocab.strings.add("Feat=Val") doc[0].set_morph(tokenizer.vocab.strings.add("Feat=Val"))
assert doc[0].morph_ == "Feat=Val" assert str(doc[0].morph) == "Feat=Val"
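The pattern these tests migrate to, in short (a sketch; the feature values are arbitrary):

# Sketch: morph is now written via Token.set_morph() and read back via
# str(token.morph) instead of assigning to and reading token.morph_.
import spacy

nlp = spacy.blank("en")
doc = nlp("a dog")
doc[0].set_morph("PronType=prs")            # was: doc[0].morph_ = "PronType=prs"
assert str(doc[0].morph) == "PronType=prs"  # was: doc[0].morph_
doc[0].set_morph(None)                      # unset; was: doc[0].morph = 0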

View File

@ -21,11 +21,11 @@ def test_doc_retokenize_merge(en_tokenizer):
assert doc[4].text == "the beach boys" assert doc[4].text == "the beach boys"
assert doc[4].text_with_ws == "the beach boys " assert doc[4].text_with_ws == "the beach boys "
assert doc[4].tag_ == "NAMED" assert doc[4].tag_ == "NAMED"
assert doc[4].morph_ == "Number=Plur" assert str(doc[4].morph) == "Number=Plur"
assert doc[5].text == "all night" assert doc[5].text == "all night"
assert doc[5].text_with_ws == "all night" assert doc[5].text_with_ws == "all night"
assert doc[5].tag_ == "NAMED" assert doc[5].tag_ == "NAMED"
assert doc[5].morph_ == "Number=Plur" assert str(doc[5].morph) == "Number=Plur"
def test_doc_retokenize_merge_children(en_tokenizer): def test_doc_retokenize_merge_children(en_tokenizer):
@ -201,6 +201,12 @@ def test_doc_retokenize_spans_entity_merge(en_tokenizer):
heads = [1, 2, 2, 4, 6, 4, 2, 8, 6, 8, 9, 8, 8, 14, 12, 2, 15] heads = [1, 2, 2, 4, 6, 4, 2, 8, 6, 8, 9, 8, 8, 14, 12, 2, 15]
tags = ["NNP", "NNP", "VBZ", "DT", "VB", "RP", "NN", "WP", "VBZ", "IN", "NNP", "CC", "VBZ", "NNP", "NNP", ".", "SP"] tags = ["NNP", "NNP", "VBZ", "DT", "VB", "RP", "NN", "WP", "VBZ", "IN", "NNP", "CC", "VBZ", "NNP", "NNP", ".", "SP"]
ents = [("PERSON", 0, 2), ("GPE", 10, 11), ("PERSON", 13, 15)] ents = [("PERSON", 0, 2), ("GPE", 10, 11), ("PERSON", 13, 15)]
ents = ["O"] * len(heads)
ents[0] = "B-PERSON"
ents[1] = "I-PERSON"
ents[10] = "B-GPE"
ents[13] = "B-PERSON"
ents[14] = "I-PERSON"
# fmt: on # fmt: on
tokens = en_tokenizer(text) tokens = en_tokenizer(text)
doc = Doc( doc = Doc(
@ -269,7 +275,11 @@ def test_doc_retokenize_spans_entity_merge_iob(en_vocab):
# if there is a parse, span.root provides default values # if there is a parse, span.root provides default values
words = ["a", "b", "c", "d", "e", "f", "g", "h", "i"] words = ["a", "b", "c", "d", "e", "f", "g", "h", "i"]
heads = [0, 0, 3, 0, 0, 0, 5, 0, 0] heads = [0, 0, 3, 0, 0, 0, 5, 0, 0]
ents = [("ent-de", 3, 5), ("ent-fg", 5, 7)] ents = ["O"] * len(words)
ents[3] = "B-ent-de"
ents[4] = "I-ent-de"
ents[5] = "B-ent-fg"
ents[6] = "I-ent-fg"
deps = ["dep"] * len(words) deps = ["dep"] * len(words)
en_vocab.strings.add("ent-de") en_vocab.strings.add("ent-de")
en_vocab.strings.add("ent-fg") en_vocab.strings.add("ent-fg")
@ -292,7 +302,11 @@ def test_doc_retokenize_spans_entity_merge_iob(en_vocab):
# check that B is preserved if span[start] is B # check that B is preserved if span[start] is B
words = ["a", "b", "c", "d", "e", "f", "g", "h", "i"] words = ["a", "b", "c", "d", "e", "f", "g", "h", "i"]
heads = [0, 0, 3, 4, 0, 0, 5, 0, 0] heads = [0, 0, 3, 4, 0, 0, 5, 0, 0]
ents = [("ent-de", 3, 5), ("ent-de", 5, 7)] ents = ["O"] * len(words)
ents[3] = "B-ent-de"
ents[4] = "I-ent-de"
ents[5] = "B-ent-de"
ents[6] = "I-ent-de"
deps = ["dep"] * len(words) deps = ["dep"] * len(words)
doc = Doc(en_vocab, words=words, heads=heads, deps=deps, ents=ents) doc = Doc(en_vocab, words=words, heads=heads, deps=deps, ents=ents)
with doc.retokenize() as retokenizer: with doc.retokenize() as retokenizer:

View File

@ -27,11 +27,11 @@ def test_doc_retokenize_split(en_vocab):
assert doc[0].text == "Los" assert doc[0].text == "Los"
assert doc[0].head.text == "Angeles" assert doc[0].head.text == "Angeles"
assert doc[0].idx == 0 assert doc[0].idx == 0
assert doc[0].morph_ == "Number=Sing" assert str(doc[0].morph) == "Number=Sing"
assert doc[1].idx == 3 assert doc[1].idx == 3
assert doc[1].text == "Angeles" assert doc[1].text == "Angeles"
assert doc[1].head.text == "start" assert doc[1].head.text == "start"
assert doc[1].morph_ == "Number=Sing" assert str(doc[1].morph) == "Number=Sing"
assert doc[2].text == "start" assert doc[2].text == "start"
assert doc[2].head.text == "." assert doc[2].head.text == "."
assert doc[3].text == "." assert doc[3].text == "."

View File

@ -9,7 +9,7 @@ def doc(en_vocab):
tags = ["VBP", "NN", "NN"] tags = ["VBP", "NN", "NN"]
heads = [0, 0, 0] heads = [0, 0, 0]
deps = ["ROOT", "dobj", "dobj"] deps = ["ROOT", "dobj", "dobj"]
ents = [("ORG", 1, 2)] ents = ["O", "B-ORG", "O"]
return Doc( return Doc(
en_vocab, words=words, pos=pos, tags=tags, heads=heads, deps=deps, ents=ents en_vocab, words=words, pos=pos, tags=tags, heads=heads, deps=deps, ents=ents
) )

View File

@ -2,8 +2,7 @@ import pytest
def test_noun_chunks_is_parsed_de(de_tokenizer): def test_noun_chunks_is_parsed_de(de_tokenizer):
"""Test that noun_chunks raises Value Error for 'de' language if Doc is not parsed. """Test that noun_chunks raises Value Error for 'de' language if Doc is not parsed."""
"""
doc = de_tokenizer("Er lag auf seinem") doc = de_tokenizer("Er lag auf seinem")
with pytest.raises(ValueError): with pytest.raises(ValueError):
list(doc.noun_chunks) list(doc.noun_chunks)

View File

@ -2,8 +2,7 @@ import pytest
def test_noun_chunks_is_parsed_el(el_tokenizer): def test_noun_chunks_is_parsed_el(el_tokenizer):
"""Test that noun_chunks raises Value Error for 'el' language if Doc is not parsed. """Test that noun_chunks raises Value Error for 'el' language if Doc is not parsed."""
"""
doc = el_tokenizer("είναι χώρα της νοτιοανατολικής") doc = el_tokenizer("είναι χώρα της νοτιοανατολικής")
with pytest.raises(ValueError): with pytest.raises(ValueError):
list(doc.noun_chunks) list(doc.noun_chunks)

View File

@ -7,8 +7,7 @@ import pytest
def test_noun_chunks_is_parsed(en_tokenizer): def test_noun_chunks_is_parsed(en_tokenizer):
"""Test that noun_chunks raises Value Error for 'en' language if Doc is not parsed. """Test that noun_chunks raises Value Error for 'en' language if Doc is not parsed."""
"""
doc = en_tokenizer("This is a sentence") doc = en_tokenizer("This is a sentence")
with pytest.raises(ValueError): with pytest.raises(ValueError):
list(doc.noun_chunks) list(doc.noun_chunks)

View File

@ -2,8 +2,7 @@ import pytest
def test_noun_chunks_is_parsed_es(es_tokenizer): def test_noun_chunks_is_parsed_es(es_tokenizer):
"""Test that noun_chunks raises Value Error for 'es' language if Doc is not parsed. """Test that noun_chunks raises Value Error for 'es' language if Doc is not parsed."""
"""
doc = es_tokenizer("en Oxford este verano") doc = es_tokenizer("en Oxford este verano")
with pytest.raises(ValueError): with pytest.raises(ValueError):
list(doc.noun_chunks) list(doc.noun_chunks)

View File

@ -2,8 +2,7 @@ import pytest
def test_noun_chunks_is_parsed_fa(fa_tokenizer): def test_noun_chunks_is_parsed_fa(fa_tokenizer):
"""Test that noun_chunks raises Value Error for 'fa' language if Doc is not parsed. """Test that noun_chunks raises Value Error for 'fa' language if Doc is not parsed."""
"""
doc = fa_tokenizer("این یک جمله نمونه می باشد.") doc = fa_tokenizer("این یک جمله نمونه می باشد.")
with pytest.raises(ValueError): with pytest.raises(ValueError):

View File

@ -36,9 +36,7 @@ def test_fr_tokenizer_infix_exceptions(fr_tokenizer, text):
assert len(tokens) == 1 assert len(tokens) == 1
@pytest.mark.parametrize( @pytest.mark.parametrize("text", ["janv.", "juill.", "Dr.", "av.", "sept."])
"text", ["janv.", "juill.", "Dr.", "av.", "sept."],
)
def test_fr_tokenizer_handles_abbr(fr_tokenizer, text): def test_fr_tokenizer_handles_abbr(fr_tokenizer, text):
tokens = fr_tokenizer(text) tokens = fr_tokenizer(text)
assert len(tokens) == 1 assert len(tokens) == 1

View File

@ -2,8 +2,7 @@ import pytest
def test_noun_chunks_is_parsed_fr(fr_tokenizer): def test_noun_chunks_is_parsed_fr(fr_tokenizer):
"""Test that noun_chunks raises Value Error for 'fr' language if Doc is not parsed. """Test that noun_chunks raises Value Error for 'fr' language if Doc is not parsed."""
"""
doc = fr_tokenizer("trouver des travaux antérieurs") doc = fr_tokenizer("trouver des travaux antérieurs")
with pytest.raises(ValueError): with pytest.raises(ValueError):
list(doc.noun_chunks) list(doc.noun_chunks)

View File

@ -2,8 +2,7 @@ import pytest
def test_noun_chunks_is_parsed_id(id_tokenizer): def test_noun_chunks_is_parsed_id(id_tokenizer):
"""Test that noun_chunks raises Value Error for 'id' language if Doc is not parsed. """Test that noun_chunks raises Value Error for 'id' language if Doc is not parsed."""
"""
doc = id_tokenizer("sebelas") doc = id_tokenizer("sebelas")
with pytest.raises(ValueError): with pytest.raises(ValueError):
list(doc.noun_chunks) list(doc.noun_chunks)

View File

@ -112,7 +112,7 @@ def test_ja_tokenizer_split_modes(ja_tokenizer, text, len_a, len_b, len_c):
@pytest.mark.parametrize( @pytest.mark.parametrize(
"text,sub_tokens_list_a,sub_tokens_list_b,sub_tokens_list_c", SUB_TOKEN_TESTS, "text,sub_tokens_list_a,sub_tokens_list_b,sub_tokens_list_c", SUB_TOKEN_TESTS
) )
def test_ja_tokenizer_sub_tokens( def test_ja_tokenizer_sub_tokens(
ja_tokenizer, text, sub_tokens_list_a, sub_tokens_list_b, sub_tokens_list_c ja_tokenizer, text, sub_tokens_list_a, sub_tokens_list_b, sub_tokens_list_c

View File

@ -2,8 +2,7 @@ import pytest
def test_noun_chunks_is_parsed_nb(nb_tokenizer): def test_noun_chunks_is_parsed_nb(nb_tokenizer):
"""Test that noun_chunks raises Value Error for 'nb' language if Doc is not parsed. """Test that noun_chunks raises Value Error for 'nb' language if Doc is not parsed."""
"""
doc = nb_tokenizer("Smørsausen brukes bl.a. til") doc = nb_tokenizer("Smørsausen brukes bl.a. til")
with pytest.raises(ValueError): with pytest.raises(ValueError):
list(doc.noun_chunks) list(doc.noun_chunks)

View File

@ -8,7 +8,7 @@ def test_ne_tokenizer_handlers_long_text(ne_tokenizer):
@pytest.mark.parametrize( @pytest.mark.parametrize(
"text,length", [("समय जान कति पनि बेर लाग्दैन ।", 7), ("म ठूलो हुँदै थिएँ ।", 5)], "text,length", [("समय जान कति पनि बेर लाग्दैन ।", 7), ("म ठूलो हुँदै थिएँ ।", 5)]
) )
def test_ne_tokenizer_handles_cnts(ne_tokenizer, text, length): def test_ne_tokenizer_handles_cnts(ne_tokenizer, text, length):
tokens = ne_tokenizer(text) tokens = ne_tokenizer(text)

View File

@ -10,7 +10,7 @@ def test_sa_tokenizer_handles_long_text(sa_tokenizer):
@pytest.mark.parametrize( @pytest.mark.parametrize(
"text,length", "text,length",
[ [
("श्री भगवानुवाच पश्य मे पार्थ रूपाणि शतशोऽथ सहस्रशः।", 9,), ("श्री भगवानुवाच पश्य मे पार्थ रूपाणि शतशोऽथ सहस्रशः।", 9),
("गुणान् सर्वान् स्वभावो मूर्ध्नि वर्तते ।", 6), ("गुणान् सर्वान् स्वभावो मूर्ध्नि वर्तते ।", 6),
], ],
) )

View File

@ -3,8 +3,7 @@ from spacy.tokens import Doc
def test_noun_chunks_is_parsed_sv(sv_tokenizer): def test_noun_chunks_is_parsed_sv(sv_tokenizer):
"""Test that noun_chunks raises Value Error for 'sv' language if Doc is not parsed. """Test that noun_chunks raises Value Error for 'sv' language if Doc is not parsed."""
"""
doc = sv_tokenizer("Studenten läste den bästa boken") doc = sv_tokenizer("Studenten läste den bästa boken")
with pytest.raises(ValueError): with pytest.raises(ValueError):
list(doc.noun_chunks) list(doc.noun_chunks)

View File

@ -17,16 +17,31 @@ def test_lemmatizer_initialize(lang, capfd):
@registry.misc("lemmatizer_init_lookups") @registry.misc("lemmatizer_init_lookups")
def lemmatizer_init_lookups(): def lemmatizer_init_lookups():
lookups = Lookups() lookups = Lookups()
lookups.add_table("lemma_lookup", {"cope": "cope"}) lookups.add_table("lemma_lookup", {"cope": "cope", "x": "y"})
lookups.add_table("lemma_index", {"verb": ("cope", "cop")}) lookups.add_table("lemma_index", {"verb": ("cope", "cop")})
lookups.add_table("lemma_exc", {"verb": {"coping": ("cope",)}}) lookups.add_table("lemma_exc", {"verb": {"coping": ("cope",)}})
lookups.add_table("lemma_rules", {"verb": [["ing", ""]]}) lookups.add_table("lemma_rules", {"verb": [["ing", ""]]})
return lookups return lookups
"""Test that languages can be initialized.""" # Test that languages can be initialized
nlp = get_lang_class(lang)() nlp = get_lang_class(lang)()
nlp.add_pipe("lemmatizer", config={"lookups": {"@misc": "lemmatizer_init_lookups"}}) lemmatizer = nlp.add_pipe("lemmatizer", config={"mode": "lookup"})
assert not lemmatizer.lookups.tables
nlp.config["initialize"]["components"]["lemmatizer"] = {
"lookups": {"@misc": "lemmatizer_init_lookups"}
}
with pytest.raises(ValueError):
nlp("x")
nlp.initialize()
assert lemmatizer.lookups.tables
doc = nlp("x")
# Check for stray print statements (see #3342) # Check for stray print statements (see #3342)
doc = nlp("test") # noqa: F841
captured = capfd.readouterr() captured = capfd.readouterr()
assert not captured.out assert not captured.out
assert doc[0].lemma_ == "y"
# Test initialization by calling .initialize() directly
nlp = get_lang_class(lang)()
lemmatizer = nlp.add_pipe("lemmatizer", config={"mode": "lookup"})
lemmatizer.initialize(lookups=lemmatizer_init_lookups())
assert nlp("x")[0].lemma_ == "y"

View File

@ -27,9 +27,18 @@ def test_zh_tokenizer_serialize_jieba(zh_tokenizer_jieba):
@pytest.mark.slow @pytest.mark.slow
def test_zh_tokenizer_serialize_pkuseg_with_processors(zh_tokenizer_pkuseg): def test_zh_tokenizer_serialize_pkuseg_with_processors(zh_tokenizer_pkuseg):
nlp = Chinese( config = {
meta={ "nlp": {
"tokenizer": {"config": {"segmenter": "pkuseg", "pkuseg_model": "medicine"}} "tokenizer": {
} "@tokenizers": "spacy.zh.ChineseTokenizer",
) "segmenter": "pkuseg",
}
},
"initialize": {"tokenizer": {
"pkuseg_model": "medicine",
}
},
}
nlp = Chinese.from_config(config)
nlp.initialize()
zh_tokenizer_serialize(nlp.tokenizer) zh_tokenizer_serialize(nlp.tokenizer)

View File

@ -236,13 +236,13 @@ def test_matcher_subset_value_operator(en_vocab):
matcher.add("M", [pattern]) matcher.add("M", [pattern])
doc = Doc(en_vocab, words=["a", "b", "c"]) doc = Doc(en_vocab, words=["a", "b", "c"])
assert len(matcher(doc)) == 3 assert len(matcher(doc)) == 3
doc[0].morph_ = "Feat=Val" doc[0].set_morph("Feat=Val")
assert len(matcher(doc)) == 3 assert len(matcher(doc)) == 3
doc[0].morph_ = "Feat=Val|Feat2=Val2" doc[0].set_morph("Feat=Val|Feat2=Val2")
assert len(matcher(doc)) == 3 assert len(matcher(doc)) == 3
doc[0].morph_ = "Feat=Val|Feat2=Val2|Feat3=Val3" doc[0].set_morph("Feat=Val|Feat2=Val2|Feat3=Val3")
assert len(matcher(doc)) == 2 assert len(matcher(doc)) == 2
doc[0].morph_ = "Feat=Val|Feat2=Val2|Feat3=Val3|Feat4=Val4" doc[0].set_morph("Feat=Val|Feat2=Val2|Feat3=Val3|Feat4=Val4")
assert len(matcher(doc)) == 2 assert len(matcher(doc)) == 2
# IS_SUBSET acts like "IN" for attrs other than MORPH # IS_SUBSET acts like "IN" for attrs other than MORPH
@ -268,11 +268,11 @@ def test_matcher_superset_value_operator(en_vocab):
matcher.add("M", [pattern]) matcher.add("M", [pattern])
doc = Doc(en_vocab, words=["a", "b", "c"]) doc = Doc(en_vocab, words=["a", "b", "c"])
assert len(matcher(doc)) == 0 assert len(matcher(doc)) == 0
doc[0].morph_ = "Feat=Val|Feat2=Val2" doc[0].set_morph("Feat=Val|Feat2=Val2")
assert len(matcher(doc)) == 0 assert len(matcher(doc)) == 0
doc[0].morph_ = "Feat=Val|Feat2=Val2|Feat3=Val3" doc[0].set_morph("Feat=Val|Feat2=Val2|Feat3=Val3")
assert len(matcher(doc)) == 1 assert len(matcher(doc)) == 1
doc[0].morph_ = "Feat=Val|Feat2=Val2|Feat3=Val3|Feat4=Val4" doc[0].set_morph("Feat=Val|Feat2=Val2|Feat3=Val3|Feat4=Val4")
assert len(matcher(doc)) == 1 assert len(matcher(doc)) == 1
# IS_SUPERSET with more than one value only matches for MORPH # IS_SUPERSET with more than one value only matches for MORPH
@ -310,9 +310,9 @@ def test_matcher_morph_handling(en_vocab):
doc = Doc(en_vocab, words=["a", "b", "c"]) doc = Doc(en_vocab, words=["a", "b", "c"])
assert len(matcher(doc)) == 0 assert len(matcher(doc)) == 0
doc[0].morph_ = "Feat2=Val2|Feat1=Val1" doc[0].set_morph("Feat2=Val2|Feat1=Val1")
assert len(matcher(doc)) == 2 assert len(matcher(doc)) == 2
doc[0].morph_ = "Feat1=Val1|Feat2=Val2" doc[0].set_morph("Feat1=Val1|Feat2=Val2")
assert len(matcher(doc)) == 2 assert len(matcher(doc)) == 2
# multiple values are split # multiple values are split
@ -324,9 +324,9 @@ def test_matcher_morph_handling(en_vocab):
doc = Doc(en_vocab, words=["a", "b", "c"]) doc = Doc(en_vocab, words=["a", "b", "c"])
assert len(matcher(doc)) == 0 assert len(matcher(doc)) == 0
doc[0].morph_ = "Feat2=Val2,Val3|Feat1=Val1" doc[0].set_morph("Feat2=Val2,Val3|Feat1=Val1")
assert len(matcher(doc)) == 1 assert len(matcher(doc)) == 1
doc[0].morph_ = "Feat1=Val1,Val3|Feat2=Val2" doc[0].set_morph("Feat1=Val1,Val3|Feat2=Val2")
assert len(matcher(doc)) == 2 assert len(matcher(doc)) == 2
@ -405,7 +405,7 @@ def test_attr_pipeline_checks(en_vocab):
doc2 = Doc(en_vocab, words=["Test"]) doc2 = Doc(en_vocab, words=["Test"])
doc2[0].tag_ = "TAG" doc2[0].tag_ = "TAG"
doc2[0].pos_ = "X" doc2[0].pos_ = "X"
doc2[0].morph_ = "Feat=Val" doc2[0].set_morph("Feat=Val")
doc2[0].lemma_ = "LEMMA" doc2[0].lemma_ = "LEMMA"
doc3 = Doc(en_vocab, words=["Test"]) doc3 = Doc(en_vocab, words=["Test"])
# DEP requires DEP # DEP requires DEP

View File

@ -190,7 +190,7 @@ def test_phrase_matcher_validation(en_vocab):
doc2 = Doc(en_vocab, words=["Test"]) doc2 = Doc(en_vocab, words=["Test"])
doc2[0].tag_ = "TAG" doc2[0].tag_ = "TAG"
doc2[0].pos_ = "X" doc2[0].pos_ = "X"
doc2[0].morph_ = "Feat=Val" doc2[0].set_morph("Feat=Val")
doc3 = Doc(en_vocab, words=["Test"]) doc3 = Doc(en_vocab, words=["Test"])
matcher = PhraseMatcher(en_vocab, validate=True) matcher = PhraseMatcher(en_vocab, validate=True)
with pytest.warns(UserWarning): with pytest.warns(UserWarning):
@ -217,7 +217,7 @@ def test_attr_pipeline_checks(en_vocab):
doc2 = Doc(en_vocab, words=["Test"]) doc2 = Doc(en_vocab, words=["Test"])
doc2[0].tag_ = "TAG" doc2[0].tag_ = "TAG"
doc2[0].pos_ = "X" doc2[0].pos_ = "X"
doc2[0].morph_ = "Feat=Val" doc2[0].set_morph("Feat=Val")
doc2[0].lemma_ = "LEMMA" doc2[0].lemma_ = "LEMMA"
doc3 = Doc(en_vocab, words=["Test"]) doc3 = Doc(en_vocab, words=["Test"])
# DEP requires DEP # DEP requires DEP

View File

@ -35,7 +35,7 @@ def test_init_parser(parser):
def _train_parser(parser): def _train_parser(parser):
fix_random_seed(1) fix_random_seed(1)
parser.add_label("left") parser.add_label("left")
parser.begin_training(lambda: [_parser_example(parser)], **parser.cfg) parser.initialize(lambda: [_parser_example(parser)])
sgd = Adam(0.001) sgd = Adam(0.001)
for i in range(5): for i in range(5):
@ -87,7 +87,7 @@ def test_add_label_deserializes_correctly():
ner1.add_label("C") ner1.add_label("C")
ner1.add_label("B") ner1.add_label("B")
ner1.add_label("A") ner1.add_label("A")
ner1.begin_training(lambda: [_ner_example(ner1)]) ner1.initialize(lambda: [_ner_example(ner1)])
ner2 = EntityRecognizer(Vocab(), model, **config) ner2 = EntityRecognizer(Vocab(), model, **config)
# the second model needs to be resized before we can call from_bytes # the second model needs to be resized before we can call from_bytes

View File

@ -202,7 +202,7 @@ def test_train_empty():
train_examples.append(Example.from_dict(nlp.make_doc(t[0]), t[1])) train_examples.append(Example.from_dict(nlp.make_doc(t[0]), t[1]))
ner = nlp.add_pipe("ner", last=True) ner = nlp.add_pipe("ner", last=True)
ner.add_label("PERSON") ner.add_label("PERSON")
nlp.begin_training() nlp.initialize()
for itn in range(2): for itn in range(2):
losses = {} losses = {}
batches = util.minibatch(train_examples, size=8) batches = util.minibatch(train_examples, size=8)
@ -213,7 +213,7 @@ def test_train_empty():
def test_overwrite_token(): def test_overwrite_token():
nlp = English() nlp = English()
nlp.add_pipe("ner") nlp.add_pipe("ner")
nlp.begin_training() nlp.initialize()
# The untrained NER will predict O for each token # The untrained NER will predict O for each token
doc = nlp("I live in New York") doc = nlp("I live in New York")
assert [token.ent_iob_ for token in doc] == ["O", "O", "O", "O", "O"] assert [token.ent_iob_ for token in doc] == ["O", "O", "O", "O", "O"]
@ -235,7 +235,7 @@ def test_empty_ner():
nlp = English() nlp = English()
ner = nlp.add_pipe("ner") ner = nlp.add_pipe("ner")
ner.add_label("MY_LABEL") ner.add_label("MY_LABEL")
nlp.begin_training() nlp.initialize()
doc = nlp("John is watching the news about Croatia's elections") doc = nlp("John is watching the news about Croatia's elections")
# if this goes wrong, the initialization of the parser's upper layer is probably broken # if this goes wrong, the initialization of the parser's upper layer is probably broken
result = ["O", "O", "O", "O", "O", "O", "O", "O", "O"] result = ["O", "O", "O", "O", "O", "O", "O", "O", "O"]
@ -254,7 +254,7 @@ def test_ruler_before_ner():
# 2: untrained NER - should set everything else to O # 2: untrained NER - should set everything else to O
untrained_ner = nlp.add_pipe("ner") untrained_ner = nlp.add_pipe("ner")
untrained_ner.add_label("MY_LABEL") untrained_ner.add_label("MY_LABEL")
nlp.begin_training() nlp.initialize()
doc = nlp("This is Antti Korhonen speaking in Finland") doc = nlp("This is Antti Korhonen speaking in Finland")
expected_iobs = ["B", "O", "O", "O", "O", "O", "O"] expected_iobs = ["B", "O", "O", "O", "O", "O", "O"]
expected_types = ["THING", "", "", "", "", "", ""] expected_types = ["THING", "", "", "", "", "", ""]
@ -269,7 +269,7 @@ def test_ner_before_ruler():
# 1: untrained NER - should set everything to O # 1: untrained NER - should set everything to O
untrained_ner = nlp.add_pipe("ner", name="uner") untrained_ner = nlp.add_pipe("ner", name="uner")
untrained_ner.add_label("MY_LABEL") untrained_ner.add_label("MY_LABEL")
nlp.begin_training() nlp.initialize()
# 2 : Entity Ruler - should set "this" to B and keep everything else O # 2 : Entity Ruler - should set "this" to B and keep everything else O
patterns = [{"label": "THING", "pattern": "This"}] patterns = [{"label": "THING", "pattern": "This"}]
@ -290,7 +290,7 @@ def test_block_ner():
nlp.add_pipe("blocker", config={"start": 2, "end": 5}) nlp.add_pipe("blocker", config={"start": 2, "end": 5})
untrained_ner = nlp.add_pipe("ner") untrained_ner = nlp.add_pipe("ner")
untrained_ner.add_label("MY_LABEL") untrained_ner.add_label("MY_LABEL")
nlp.begin_training() nlp.initialize()
doc = nlp("This is Antti L Korhonen speaking in Finland") doc = nlp("This is Antti L Korhonen speaking in Finland")
expected_iobs = ["O", "O", "B", "B", "B", "O", "O", "O"] expected_iobs = ["O", "O", "B", "B", "B", "O", "O", "O"]
expected_types = ["", "", "", "", "", "", "", ""] expected_types = ["", "", "", "", "", "", "", ""]
@ -307,7 +307,7 @@ def test_overfitting_IO():
train_examples.append(Example.from_dict(nlp.make_doc(text), annotations)) train_examples.append(Example.from_dict(nlp.make_doc(text), annotations))
for ent in annotations.get("entities"): for ent in annotations.get("entities"):
ner.add_label(ent[2]) ner.add_label(ent[2])
optimizer = nlp.begin_training() optimizer = nlp.initialize()
for i in range(50): for i in range(50):
losses = {} losses = {}
@ -340,13 +340,13 @@ def test_ner_warns_no_lookups(caplog):
assert not len(nlp.vocab.lookups) assert not len(nlp.vocab.lookups)
nlp.add_pipe("ner") nlp.add_pipe("ner")
with caplog.at_level(logging.DEBUG): with caplog.at_level(logging.DEBUG):
nlp.begin_training() nlp.initialize()
assert "W033" in caplog.text assert "W033" in caplog.text
caplog.clear() caplog.clear()
nlp.vocab.lookups.add_table("lexeme_norm") nlp.vocab.lookups.add_table("lexeme_norm")
nlp.vocab.lookups.get_table("lexeme_norm")["a"] = "A" nlp.vocab.lookups.get_table("lexeme_norm")["a"] = "A"
with caplog.at_level(logging.DEBUG): with caplog.at_level(logging.DEBUG):
nlp.begin_training() nlp.initialize()
assert "W033" not in caplog.text assert "W033" not in caplog.text
@ -358,5 +358,5 @@ class BlockerComponent1:
self.name = name self.name = name
def __call__(self, doc): def __call__(self, doc):
doc.set_ents([], blocked=[doc[self.start:self.end]], default="unmodified") doc.set_ents([], blocked=[doc[self.start : self.end]], default="unmodified")
return doc return doc

View File

@ -191,7 +191,7 @@ def test_overfitting_IO():
train_examples.append(Example.from_dict(nlp.make_doc(text), annotations)) train_examples.append(Example.from_dict(nlp.make_doc(text), annotations))
for dep in annotations.get("deps", []): for dep in annotations.get("deps", []):
parser.add_label(dep) parser.add_label(dep)
optimizer = nlp.begin_training() optimizer = nlp.initialize()
for i in range(100): for i in range(100):
losses = {} losses = {}
nlp.update(train_examples, sgd=optimizer, losses=losses) nlp.update(train_examples, sgd=optimizer, losses=losses)

View File

@ -34,7 +34,7 @@ def parser(vocab):
parser.cfg["hidden_width"] = 32 parser.cfg["hidden_width"] = 32
# parser.add_label('right') # parser.add_label('right')
parser.add_label("left") parser.add_label("left")
parser.begin_training(lambda: [_parser_example(parser)], **parser.cfg) parser.initialize(lambda: [_parser_example(parser)])
sgd = Adam(0.001) sgd = Adam(0.001)
for i in range(10): for i in range(10):

View File

@@ -69,9 +69,9 @@ def test_attributeruler_init(nlp, pattern_dicts):
         a.add(**p)
     doc = nlp("This is a test.")
     assert doc[2].lemma_ == "the"
-    assert doc[2].morph_ == "Case=Nom|Number=Plur"
+    assert str(doc[2].morph) == "Case=Nom|Number=Plur"
     assert doc[3].lemma_ == "cat"
-    assert doc[3].morph_ == "Case=Nom|Number=Sing"
+    assert str(doc[3].morph) == "Case=Nom|Number=Sing"
     assert doc.has_annotation("LEMMA")
     assert doc.has_annotation("MORPH")
@@ -81,9 +81,9 @@ def test_attributeruler_init_patterns(nlp, pattern_dicts):
     nlp.add_pipe("attribute_ruler", config={"pattern_dicts": pattern_dicts})
     doc = nlp("This is a test.")
     assert doc[2].lemma_ == "the"
-    assert doc[2].morph_ == "Case=Nom|Number=Plur"
+    assert str(doc[2].morph) == "Case=Nom|Number=Plur"
     assert doc[3].lemma_ == "cat"
-    assert doc[3].morph_ == "Case=Nom|Number=Sing"
+    assert str(doc[3].morph) == "Case=Nom|Number=Sing"
     assert doc.has_annotation("LEMMA")
     assert doc.has_annotation("MORPH")
     nlp.remove_pipe("attribute_ruler")
@@ -94,9 +94,9 @@ def test_attributeruler_init_patterns(nlp, pattern_dicts):
     )
     doc = nlp("This is a test.")
     assert doc[2].lemma_ == "the"
-    assert doc[2].morph_ == "Case=Nom|Number=Plur"
+    assert str(doc[2].morph) == "Case=Nom|Number=Plur"
     assert doc[3].lemma_ == "cat"
-    assert doc[3].morph_ == "Case=Nom|Number=Sing"
+    assert str(doc[3].morph) == "Case=Nom|Number=Sing"
     assert doc.has_annotation("LEMMA")
     assert doc.has_annotation("MORPH")
@@ -106,9 +106,9 @@ def test_attributeruler_score(nlp, pattern_dicts):
     nlp.add_pipe("attribute_ruler", config={"pattern_dicts": pattern_dicts})
     doc = nlp("This is a test.")
     assert doc[2].lemma_ == "the"
-    assert doc[2].morph_ == "Case=Nom|Number=Plur"
+    assert str(doc[2].morph) == "Case=Nom|Number=Plur"
     assert doc[3].lemma_ == "cat"
-    assert doc[3].morph_ == "Case=Nom|Number=Sing"
+    assert str(doc[3].morph) == "Case=Nom|Number=Sing"

     dev_examples = [
         Example.from_dict(
@@ -150,10 +150,10 @@ def test_attributeruler_tag_map(nlp, tag_map):
     for i in range(len(doc)):
         if i == 4:
             assert doc[i].pos_ == "PUNCT"
-            assert doc[i].morph_ == "PunctType=peri"
+            assert str(doc[i].morph) == "PunctType=peri"
         else:
             assert doc[i].pos_ == ""
-            assert doc[i].morph_ == ""
+            assert str(doc[i].morph) == ""


 def test_attributeruler_morph_rules(nlp, morph_rules):
@@ -168,11 +168,11 @@ def test_attributeruler_morph_rules(nlp, morph_rules):
     for i in range(len(doc)):
         if i != 2:
             assert doc[i].pos_ == ""
-            assert doc[i].morph_ == ""
+            assert str(doc[i].morph) == ""
         else:
             assert doc[2].pos_ == "DET"
             assert doc[2].lemma_ == "a"
-            assert doc[2].morph_ == "Case=Nom"
+            assert str(doc[2].morph) == "Case=Nom"


 def test_attributeruler_indices(nlp):
@@ -194,14 +194,14 @@ def test_attributeruler_indices(nlp):
     for i in range(len(doc)):
         if i == 1:
             assert doc[i].lemma_ == "was"
-            assert doc[i].morph_ == "Case=Nom|Number=Sing"
+            assert str(doc[i].morph) == "Case=Nom|Number=Sing"
         elif i == 2:
             assert doc[i].lemma_ == "the"
-            assert doc[i].morph_ == "Case=Nom|Number=Plur"
+            assert str(doc[i].morph) == "Case=Nom|Number=Plur"
         elif i == 3:
             assert doc[i].lemma_ == "cat"
         else:
-            assert doc[i].morph_ == ""
+            assert str(doc[i].morph) == ""
     # raises an error when trying to modify a token outside of the match
     a.add([[{"ORTH": "a"}, {"ORTH": "test"}]], {"LEMMA": "cat"}, index=2)
     with pytest.raises(ValueError):
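The `morph_` to `str(token.morph)` change above applies wherever a morph string is compared. A minimal sketch of reading the `MorphAnalysis` object, assuming an attribute ruler pattern similar to the fixtures in this test file:

    from spacy.lang.en import English

    nlp = English()
    ruler = nlp.add_pipe("attribute_ruler")
    ruler.add([[{"ORTH": "a"}]], {"MORPH": "Case=Nom|Number=Sing"})
    doc = nlp("This is a test.")
    # Token.morph_ is gone; stringify the MorphAnalysis explicitly
    assert str(doc[2].morph) == "Case=Nom|Number=Sing"
    # individual fields can also be read without string parsing
    assert doc[2].morph.get("Case") == ["Nom"]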

View File

@@ -134,7 +134,7 @@ def test_kb_undefined(nlp):
     """Test that the EL can't train without defining a KB"""
     entity_linker = nlp.add_pipe("entity_linker", config={})
     with pytest.raises(ValueError):
-        entity_linker.begin_training(lambda: [])
+        entity_linker.initialize(lambda: [])


 def test_kb_empty(nlp):
@@ -143,7 +143,7 @@ def test_kb_empty(nlp):
     entity_linker = nlp.add_pipe("entity_linker", config=config)
     assert len(entity_linker.kb) == 0
     with pytest.raises(ValueError):
-        entity_linker.begin_training(lambda: [])
+        entity_linker.initialize(lambda: [])


 def test_kb_serialize(nlp):
@@ -254,14 +254,12 @@ def test_vocab_serialization(nlp):
     mykb = KnowledgeBase(nlp.vocab, entity_vector_length=1)

     # adding entities
-    q1_hash = mykb.add_entity(entity="Q1", freq=27, entity_vector=[1])
+    mykb.add_entity(entity="Q1", freq=27, entity_vector=[1])
     q2_hash = mykb.add_entity(entity="Q2", freq=12, entity_vector=[2])
-    q3_hash = mykb.add_entity(entity="Q3", freq=5, entity_vector=[3])
+    mykb.add_entity(entity="Q3", freq=5, entity_vector=[3])

     # adding aliases
-    douglas_hash = mykb.add_alias(
-        alias="douglas", entities=["Q2", "Q3"], probabilities=[0.4, 0.1]
-    )
+    mykb.add_alias(alias="douglas", entities=["Q2", "Q3"], probabilities=[0.4, 0.1])
     adam_hash = mykb.add_alias(alias="adam", entities=["Q2"], probabilities=[0.9])

     candidates = mykb.get_alias_candidates("adam")
@@ -360,7 +358,7 @@ def test_preserving_links_asdoc(nlp):
     ruler.add_patterns(patterns)
     el_config = {"kb_loader": {"@misc": "myLocationsKB.v1"}, "incl_prior": False}
     entity_linker = nlp.add_pipe("entity_linker", config=el_config, last=True)
-    nlp.begin_training()
+    nlp.initialize()
     assert entity_linker.model.get_dim("nO") == vector_length

     # test whether the entity links are preserved by the `as_doc()` function
@@ -463,7 +461,7 @@ def test_overfitting_IO():
     )

     # train the NEL pipe
-    optimizer = nlp.begin_training(get_examples=lambda: train_examples)
+    optimizer = nlp.initialize(get_examples=lambda: train_examples)
     assert entity_linker.model.get_dim("nO") == vector_length
     assert entity_linker.model.get_dim("nO") == entity_linker.kb.entity_vector_length
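Apart from the `initialize` rename and dropping unused return values, the KnowledgeBase calls above are unchanged. A minimal sketch of the API these tests rely on (entity IDs, frequencies and vectors are illustrative only):

    from spacy.kb import KnowledgeBase
    from spacy.lang.en import English

    nlp = English()
    kb = KnowledgeBase(nlp.vocab, entity_vector_length=1)
    kb.add_entity(entity="Q1", freq=27, entity_vector=[1])
    kb.add_entity(entity="Q2", freq=12, entity_vector=[2])
    # an alias maps a surface form to candidate entities with prior probabilities
    kb.add_alias(alias="douglas", entities=["Q1", "Q2"], probabilities=[0.4, 0.1])
    candidates = kb.get_alias_candidates("douglas")
    assert {c.entity_ for c in candidates} == {"Q1", "Q2"}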

View File

@@ -0,0 +1,69 @@
+import pytest
+from spacy.language import Language
+from spacy.lang.en import English
+from spacy.training import Example
+from thinc.api import ConfigValidationError
+from pydantic import StrictBool
+
+
+def test_initialize_arguments():
+    name = "test_initialize_arguments"
+
+    class CustomTokenizer:
+        def __init__(self, tokenizer):
+            self.tokenizer = tokenizer
+            self.from_initialize = None
+
+        def __call__(self, text):
+            return self.tokenizer(text)
+
+        def initialize(self, get_examples, nlp, custom: int):
+            self.from_initialize = custom
+
+    class Component:
+        def __init__(self):
+            self.from_initialize = None
+
+        def initialize(
+            self, get_examples, nlp, custom1: str, custom2: StrictBool = False
+        ):
+            self.from_initialize = (custom1, custom2)
+
+    Language.factory(name, func=lambda nlp, name: Component())
+    nlp = English()
+    nlp.tokenizer = CustomTokenizer(nlp.tokenizer)
+    example = Example.from_dict(nlp("x"), {})
+    get_examples = lambda: [example]
+    nlp.add_pipe(name)
+    # The settings here will typically come from the [initialize] block
+    init_cfg = {"tokenizer": {"custom": 1}, "components": {name: {}}}
+    nlp.config["initialize"].update(init_cfg)
+    with pytest.raises(ConfigValidationError) as e:
+        # Empty config for component, no required custom1 argument
+        nlp.initialize(get_examples)
+    errors = e.value.errors
+    assert len(errors) == 1
+    assert errors[0]["loc"] == ("custom1",)
+    assert errors[0]["type"] == "value_error.missing"
+    init_cfg = {
+        "tokenizer": {"custom": 1},
+        "components": {name: {"custom1": "x", "custom2": 1}},
+    }
+    nlp.config["initialize"].update(init_cfg)
+    with pytest.raises(ConfigValidationError) as e:
+        # Wrong type of custom 2
+        nlp.initialize(get_examples)
+    errors = e.value.errors
+    assert len(errors) == 1
+    assert errors[0]["loc"] == ("custom2",)
+    assert errors[0]["type"] == "value_error.strictbool"
+    init_cfg = {
+        "tokenizer": {"custom": 1},
+        "components": {name: {"custom1": "x"}},
+    }
+    nlp.config["initialize"].update(init_cfg)
+    nlp.initialize(get_examples)
+    assert nlp.tokenizer.from_initialize == 1
+    pipe = nlp.get_pipe(name)
+    assert pipe.from_initialize == ("x", False)
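The new test above pins down how extra `initialize` arguments are filled in from the config and validated against their type annotations. A minimal sketch of the same wiring for a user-defined component; the names `my_component` and `threshold` are hypothetical and only illustrate the mechanism:

    from spacy.lang.en import English
    from spacy.language import Language

    class MyComponent:
        def __init__(self):
            self.threshold = None

        def __call__(self, doc):
            return doc

        def initialize(self, get_examples=None, nlp=None, threshold: float = 0.5):
            # filled in from the [initialize.components.my_component] settings
            self.threshold = threshold

    @Language.factory("my_component")
    def create_my_component(nlp, name):
        return MyComponent()

    nlp = English()
    nlp.add_pipe("my_component")
    nlp.config["initialize"].update({"components": {"my_component": {"threshold": 0.75}}})
    nlp.initialize()
    assert nlp.get_pipe("my_component").threshold == 0.75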

View File

@@ -8,61 +8,52 @@ from ..util import make_tempdir

 @pytest.fixture
 def nlp():
-    return English()
-
-
-@pytest.fixture
-def lemmatizer(nlp):
     @registry.misc("cope_lookups")
     def cope_lookups():
         lookups = Lookups()
-        lookups.add_table("lemma_lookup", {"cope": "cope"})
+        lookups.add_table("lemma_lookup", {"cope": "cope", "coped": "cope"})
         lookups.add_table("lemma_index", {"verb": ("cope", "cop")})
         lookups.add_table("lemma_exc", {"verb": {"coping": ("cope",)}})
         lookups.add_table("lemma_rules", {"verb": [["ing", ""]]})
         return lookups

-    lemmatizer = nlp.add_pipe(
-        "lemmatizer", config={"mode": "rule", "lookups": {"@misc": "cope_lookups"}}
-    )
-    return lemmatizer
+    nlp = English()
+    nlp.config["initialize"]["components"]["lemmatizer"] = {
+        "lookups": {"@misc": "cope_lookups"}
+    }
+    return nlp


 def test_lemmatizer_init(nlp):
-    @registry.misc("cope_lookups")
-    def cope_lookups():
-        lookups = Lookups()
-        lookups.add_table("lemma_lookup", {"cope": "cope"})
-        lookups.add_table("lemma_index", {"verb": ("cope", "cop")})
-        lookups.add_table("lemma_exc", {"verb": {"coping": ("cope",)}})
-        lookups.add_table("lemma_rules", {"verb": [["ing", ""]]})
-        return lookups
-
-    lemmatizer = nlp.add_pipe(
-        "lemmatizer", config={"mode": "lookup", "lookups": {"@misc": "cope_lookups"}}
-    )
+    lemmatizer = nlp.add_pipe("lemmatizer", config={"mode": "lookup"})
     assert isinstance(lemmatizer.lookups, Lookups)
+    assert not lemmatizer.lookups.tables
     assert lemmatizer.mode == "lookup"
+    with pytest.raises(ValueError):
+        nlp("test")
+    nlp.initialize()
+    assert lemmatizer.lookups.tables
+    assert nlp("cope")[0].lemma_ == "cope"
+    assert nlp("coped")[0].lemma_ == "cope"
     # replace any tables from spacy-lookups-data
     lemmatizer.lookups = Lookups()
-    doc = nlp("coping")
     # lookup with no tables sets text as lemma
-    assert doc[0].lemma_ == "coping"
-
+    assert nlp("cope")[0].lemma_ == "cope"
+    assert nlp("coped")[0].lemma_ == "coped"
     nlp.remove_pipe("lemmatizer")
-
-    @registry.misc("empty_lookups")
-    def empty_lookups():
-        return Lookups()
-
+    lemmatizer = nlp.add_pipe("lemmatizer", config={"mode": "lookup"})
     with pytest.raises(ValueError):
-        nlp.add_pipe(
-            "lemmatizer",
-            config={"mode": "lookup", "lookups": {"@misc": "empty_lookups"}},
-        )
+        # Can't initialize without required tables
+        lemmatizer.initialize(lookups=Lookups())
+    lookups = Lookups()
+    lookups.add_table("lemma_lookup", {})
+    lemmatizer.initialize(lookups=lookups)


-def test_lemmatizer_config(nlp, lemmatizer):
+def test_lemmatizer_config(nlp):
+    lemmatizer = nlp.add_pipe("lemmatizer", config={"mode": "rule"})
+    nlp.initialize()
     doc = nlp.make_doc("coping")
     doc[0].pos_ = "VERB"
     assert doc[0].lemma_ == ""
@@ -78,20 +69,21 @@ def test_lemmatizer_config(nlp, lemmatizer):
     assert doc[0].lemma_ == "cope"


-def test_lemmatizer_serialize(nlp, lemmatizer):
-    @registry.misc("cope_lookups")
+def test_lemmatizer_serialize(nlp):
+    lemmatizer = nlp.add_pipe("lemmatizer", config={"mode": "rule"})
+    nlp.initialize()
+
     def cope_lookups():
         lookups = Lookups()
-        lookups.add_table("lemma_lookup", {"cope": "cope"})
+        lookups.add_table("lemma_lookup", {"cope": "cope", "coped": "cope"})
         lookups.add_table("lemma_index", {"verb": ("cope", "cop")})
         lookups.add_table("lemma_exc", {"verb": {"coping": ("cope",)}})
         lookups.add_table("lemma_rules", {"verb": [["ing", ""]]})
         return lookups

     nlp2 = English()
-    lemmatizer2 = nlp2.add_pipe(
-        "lemmatizer", config={"mode": "rule", "lookups": {"@misc": "cope_lookups"}}
-    )
+    lemmatizer2 = nlp2.add_pipe("lemmatizer", config={"mode": "rule"})
+    lemmatizer2.initialize(lookups=cope_lookups())
     lemmatizer2.from_bytes(lemmatizer.to_bytes())
     assert lemmatizer.to_bytes() == lemmatizer2.to_bytes()
     assert lemmatizer.lookups.tables == lemmatizer2.lookups.tables
@@ -100,9 +92,9 @@ def test_lemmatizer_serialize(nlp, lemmatizer):
     with make_tempdir() as tmp_dir:
         nlp.to_disk(tmp_dir)
         nlp2 = util.load_model_from_path(tmp_dir)
         doc2 = nlp2.make_doc("coping")
         doc2[0].pos_ = "VERB"
         assert doc2[0].lemma_ == ""
         doc2 = lemmatizer(doc2)
         assert doc2[0].text == "coping"
         assert doc2[0].lemma_ == "cope"
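The lemmatizer tests above reflect the new data flow: lookup tables are no longer passed through the pipe config but are provided at initialization, either via the `[initialize]` block (as in the `nlp` fixture) or directly. A minimal sketch of the direct call, with a toy single-entry table:

    from spacy.lang.en import English
    from spacy.lookups import Lookups

    nlp = English()
    lemmatizer = nlp.add_pipe("lemmatizer", config={"mode": "lookup"})
    lookups = Lookups()
    lookups.add_table("lemma_lookup", {"coped": "cope"})
    # tables are handed over at initialization time, not in the pipe config
    lemmatizer.initialize(lookups=lookups)
    assert nlp("coped")[0].lemma_ == "cope"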

View File

@@ -33,7 +33,7 @@ def test_no_label():
     nlp = Language()
     nlp.add_pipe("morphologizer")
     with pytest.raises(ValueError):
-        nlp.begin_training()
+        nlp.initialize()


 def test_implicit_label():
@@ -42,7 +42,7 @@ def test_implicit_label():
     train_examples = []
     for t in TRAIN_DATA:
         train_examples.append(Example.from_dict(nlp.make_doc(t[0]), t[1]))
-    nlp.begin_training(get_examples=lambda: train_examples)
+    nlp.initialize(get_examples=lambda: train_examples)


 def test_no_resize():
@@ -50,13 +50,13 @@ def test_no_resize():
     morphologizer = nlp.add_pipe("morphologizer")
     morphologizer.add_label("POS" + Morphology.FIELD_SEP + "NOUN")
     morphologizer.add_label("POS" + Morphology.FIELD_SEP + "VERB")
-    nlp.begin_training()
+    nlp.initialize()
     # this throws an error because the morphologizer can't be resized after initialization
     with pytest.raises(ValueError):
         morphologizer.add_label("POS" + Morphology.FIELD_SEP + "ADJ")


-def test_begin_training_examples():
+def test_initialize_examples():
     nlp = Language()
     morphologizer = nlp.add_pipe("morphologizer")
     morphologizer.add_label("POS" + Morphology.FIELD_SEP + "NOUN")
@@ -64,12 +64,12 @@ def test_begin_training_examples():
     for t in TRAIN_DATA:
         train_examples.append(Example.from_dict(nlp.make_doc(t[0]), t[1]))
     # you shouldn't really call this more than once, but for testing it should be fine
-    nlp.begin_training()
-    nlp.begin_training(get_examples=lambda: train_examples)
-    with pytest.raises(TypeError):
-        nlp.begin_training(get_examples=lambda: None)
+    nlp.initialize()
+    nlp.initialize(get_examples=lambda: train_examples)
     with pytest.raises(ValueError):
-        nlp.begin_training(get_examples=train_examples)
+        nlp.initialize(get_examples=lambda: None)
+    with pytest.raises(ValueError):
+        nlp.initialize(get_examples=train_examples)


 def test_overfitting_IO():
@@ -79,7 +79,7 @@ def test_overfitting_IO():
     train_examples = []
     for inst in TRAIN_DATA:
         train_examples.append(Example.from_dict(nlp.make_doc(inst[0]), inst[1]))
-    optimizer = nlp.begin_training(get_examples=lambda: train_examples)
+    optimizer = nlp.initialize(get_examples=lambda: train_examples)

     for i in range(50):
         losses = {}
@@ -91,7 +91,7 @@ def test_overfitting_IO():
     doc = nlp(test_text)
     gold_morphs = ["Feat=N", "Feat=V", "", ""]
     gold_pos_tags = ["NOUN", "VERB", "ADJ", ""]
-    assert [t.morph_ for t in doc] == gold_morphs
+    assert [str(t.morph) for t in doc] == gold_morphs
     assert [t.pos_ for t in doc] == gold_pos_tags

     # Also test the results are still the same after IO
@@ -99,5 +99,5 @@ def test_overfitting_IO():
         nlp.to_disk(tmp_dir)
         nlp2 = util.load_model_from_path(tmp_dir)
         doc2 = nlp2(test_text)
-        assert [t.morph_ for t in doc2] == gold_morphs
+        assert [str(t.morph) for t in doc2] == gold_morphs
         assert [t.pos_ for t in doc2] == gold_pos_tags

View File

@@ -31,19 +31,19 @@ TRAIN_DATA = [
 ]


-def test_begin_training_examples():
+def test_initialize_examples():
     nlp = Language()
     nlp.add_pipe("senter")
     train_examples = []
     for t in TRAIN_DATA:
         train_examples.append(Example.from_dict(nlp.make_doc(t[0]), t[1]))
     # you shouldn't really call this more than once, but for testing it should be fine
-    nlp.begin_training()
-    nlp.begin_training(get_examples=lambda: train_examples)
-    with pytest.raises(TypeError):
-        nlp.begin_training(get_examples=lambda: None)
+    nlp.initialize()
+    nlp.initialize(get_examples=lambda: train_examples)
     with pytest.raises(ValueError):
-        nlp.begin_training(get_examples=train_examples)
+        nlp.initialize(get_examples=lambda: None)
+    with pytest.raises(ValueError):
+        nlp.initialize(get_examples=train_examples)


 def test_overfitting_IO():
@@ -58,7 +58,7 @@ def test_overfitting_IO():
     train_examples[1].reference[11].is_sent_start = False
     nlp.add_pipe("senter")
-    optimizer = nlp.begin_training()
+    optimizer = nlp.initialize()

     for i in range(200):
         losses = {}

View File

@@ -15,14 +15,14 @@ def test_label_types():
         tagger.add_label(9)


-def test_tagger_begin_training_tag_map():
-    """Test that Tagger.begin_training() without gold tuples does not clobber
+def test_tagger_initialize_tag_map():
+    """Test that Tagger.initialize() without gold tuples does not clobber
     the tag map."""
     nlp = Language()
     tagger = nlp.add_pipe("tagger")
     orig_tag_count = len(tagger.labels)
     tagger.add_label("A")
-    nlp.begin_training()
+    nlp.initialize()
     assert orig_tag_count + 1 == len(nlp.get_pipe("tagger").labels)
@@ -38,7 +38,7 @@ def test_no_label():
     nlp = Language()
     nlp.add_pipe("tagger")
     with pytest.raises(ValueError):
-        nlp.begin_training()
+        nlp.initialize()


 def test_no_resize():
@@ -47,7 +47,7 @@ def test_no_resize():
     tagger.add_label("N")
     tagger.add_label("V")
     assert tagger.labels == ("N", "V")
-    nlp.begin_training()
+    nlp.initialize()
     assert tagger.model.get_dim("nO") == 2
     # this throws an error because the tagger can't be resized after initialization
     with pytest.raises(ValueError):
@@ -60,10 +60,10 @@ def test_implicit_label():
     train_examples = []
     for t in TRAIN_DATA:
         train_examples.append(Example.from_dict(nlp.make_doc(t[0]), t[1]))
-    nlp.begin_training(get_examples=lambda: train_examples)
+    nlp.initialize(get_examples=lambda: train_examples)


-def test_begin_training_examples():
+def test_initialize_examples():
     nlp = Language()
     tagger = nlp.add_pipe("tagger")
     train_examples = []
@@ -72,16 +72,16 @@ def test_begin_training_examples():
     for t in TRAIN_DATA:
         train_examples.append(Example.from_dict(nlp.make_doc(t[0]), t[1]))
     # you shouldn't really call this more than once, but for testing it should be fine
-    nlp.begin_training()
-    nlp.begin_training(get_examples=lambda: train_examples)
-    with pytest.raises(TypeError):
-        nlp.begin_training(get_examples=lambda: None)
-    with pytest.raises(TypeError):
-        nlp.begin_training(get_examples=lambda: train_examples[0])
+    nlp.initialize()
+    nlp.initialize(get_examples=lambda: train_examples)
     with pytest.raises(ValueError):
-        nlp.begin_training(get_examples=lambda: [])
+        nlp.initialize(get_examples=lambda: None)
+    with pytest.raises(TypeError):
+        nlp.initialize(get_examples=lambda: train_examples[0])
     with pytest.raises(ValueError):
-        nlp.begin_training(get_examples=train_examples)
+        nlp.initialize(get_examples=lambda: [])
+    with pytest.raises(ValueError):
+        nlp.initialize(get_examples=train_examples)


 def test_overfitting_IO():
@@ -91,7 +91,7 @@ def test_overfitting_IO():
     train_examples = []
     for t in TRAIN_DATA:
         train_examples.append(Example.from_dict(nlp.make_doc(t[0]), t[1]))
-    optimizer = nlp.begin_training(get_examples=lambda: train_examples)
+    optimizer = nlp.initialize(get_examples=lambda: train_examples)
     assert tagger.model.get_dim("nO") == len(TAGS)

     for i in range(50):
@@ -122,4 +122,4 @@ def test_tagger_requires_labels():
     nlp = English()
     nlp.add_pipe("tagger")
     with pytest.raises(ValueError):
-        nlp.begin_training()
+        nlp.initialize()

View File

@@ -9,10 +9,10 @@ from spacy.pipeline import TextCategorizer
 from spacy.tokens import Doc
 from spacy.pipeline.tok2vec import DEFAULT_TOK2VEC_MODEL
 from spacy.scorer import Scorer
+from spacy.training import Example
+from spacy.training.initialize import verify_textcat_config

 from ..util import make_tempdir
-from ...cli.train import verify_textcat_config
-from ...training import Example


 TRAIN_DATA = [
@@ -26,7 +26,7 @@ def test_simple_train():
     nlp = Language()
     textcat = nlp.add_pipe("textcat")
     textcat.add_label("answer")
-    nlp.begin_training()
+    nlp.initialize()
     for i in range(5):
         for text, answer in [
             ("aaaa", 1.0),
@@ -56,7 +56,7 @@ def test_textcat_learns_multilabel():
     textcat = TextCategorizer(nlp.vocab, width=8)
     for letter in letters:
         textcat.add_label(letter)
-    optimizer = textcat.begin_training(lambda: [])
+    optimizer = textcat.initialize(lambda: [])
     for i in range(30):
         losses = {}
         examples = [Example.from_dict(doc, {"cats": cats}) for doc, cat in docs]
@@ -86,7 +86,7 @@ def test_no_label():
     nlp = Language()
     nlp.add_pipe("textcat")
     with pytest.raises(ValueError):
-        nlp.begin_training()
+        nlp.initialize()


 def test_implicit_label():
@@ -95,7 +95,7 @@ def test_implicit_label():
     train_examples = []
     for t in TRAIN_DATA:
         train_examples.append(Example.from_dict(nlp.make_doc(t[0]), t[1]))
-    nlp.begin_training(get_examples=lambda: train_examples)
+    nlp.initialize(get_examples=lambda: train_examples)


 def test_no_resize():
@@ -103,14 +103,14 @@ def test_no_resize():
     textcat = nlp.add_pipe("textcat")
     textcat.add_label("POSITIVE")
     textcat.add_label("NEGATIVE")
-    nlp.begin_training()
+    nlp.initialize()
     assert textcat.model.get_dim("nO") == 2
     # this throws an error because the textcat can't be resized after initialization
     with pytest.raises(ValueError):
         textcat.add_label("NEUTRAL")


-def test_begin_training_examples():
+def test_initialize_examples():
     nlp = Language()
     textcat = nlp.add_pipe("textcat")
     train_examples = []
@@ -119,12 +119,12 @@ def test_begin_training_examples():
         for label, value in annotations.get("cats").items():
             textcat.add_label(label)
     # you shouldn't really call this more than once, but for testing it should be fine
-    nlp.begin_training()
-    nlp.begin_training(get_examples=lambda: train_examples)
-    with pytest.raises(TypeError):
-        nlp.begin_training(get_examples=lambda: None)
+    nlp.initialize()
+    nlp.initialize(get_examples=lambda: train_examples)
     with pytest.raises(ValueError):
-        nlp.begin_training(get_examples=train_examples)
+        nlp.initialize(get_examples=lambda: None)
+    with pytest.raises(ValueError):
+        nlp.initialize(get_examples=train_examples)


 def test_overfitting_IO():
@@ -139,7 +139,7 @@ def test_overfitting_IO():
     train_examples = []
     for text, annotations in TRAIN_DATA:
         train_examples.append(Example.from_dict(nlp.make_doc(text), annotations))
-    optimizer = nlp.begin_training(get_examples=lambda: train_examples)
+    optimizer = nlp.initialize(get_examples=lambda: train_examples)
     assert textcat.model.get_dim("nO") == 2

     for i in range(50):
@@ -195,7 +195,7 @@ def test_textcat_configs(textcat_config):
         train_examples.append(Example.from_dict(nlp.make_doc(text), annotations))
         for label, value in annotations.get("cats").items():
             textcat.add_label(label)
-    optimizer = nlp.begin_training()
+    optimizer = nlp.initialize()
     for i in range(5):
         losses = {}
         nlp.update(train_examples, sgd=optimizer, losses=losses)
@@ -226,6 +226,7 @@ def test_positive_class_not_binary():
     with pytest.raises(ValueError):
         verify_textcat_config(nlp, pipe_config)

+
 def test_textcat_evaluation():
     train_examples = []
     nlp = English()
@@ -241,15 +242,17 @@ def test_textcat_evaluation():
     pred2.cats = {"winter": 1.0, "summer": 0.0, "spring": 0.0, "autumn": 1.0}
     train_examples.append(Example(pred2, ref2))

-    scores = Scorer().score_cats(train_examples, "cats", labels=["winter", "summer", "spring", "autumn"])
-    assert scores["cats_f_per_type"]["winter"]["p"] == 1/2
-    assert scores["cats_f_per_type"]["winter"]["r"] == 1/1
+    scores = Scorer().score_cats(
+        train_examples, "cats", labels=["winter", "summer", "spring", "autumn"]
+    )
+    assert scores["cats_f_per_type"]["winter"]["p"] == 1 / 2
+    assert scores["cats_f_per_type"]["winter"]["r"] == 1 / 1
     assert scores["cats_f_per_type"]["summer"]["p"] == 0
-    assert scores["cats_f_per_type"]["summer"]["r"] == 0/1
-    assert scores["cats_f_per_type"]["spring"]["p"] == 1/1
-    assert scores["cats_f_per_type"]["spring"]["r"] == 1/2
-    assert scores["cats_f_per_type"]["autumn"]["p"] == 2/2
-    assert scores["cats_f_per_type"]["autumn"]["r"] == 2/2
+    assert scores["cats_f_per_type"]["summer"]["r"] == 0 / 1
+    assert scores["cats_f_per_type"]["spring"]["p"] == 1 / 1
+    assert scores["cats_f_per_type"]["spring"]["r"] == 1 / 2
+    assert scores["cats_f_per_type"]["autumn"]["p"] == 2 / 2
+    assert scores["cats_f_per_type"]["autumn"]["r"] == 2 / 2

-    assert scores["cats_micro_p"] == 4/5
-    assert scores["cats_micro_r"] == 4/6
+    assert scores["cats_micro_p"] == 4 / 5
+    assert scores["cats_micro_r"] == 4 / 6

View File

@@ -73,8 +73,7 @@ def test_tok2vec_configs(width, embed_arch, embed_config, encode_arch, encode_config):
     encode_config["width"] = width
     docs = get_batch(3)
     tok2vec = build_Tok2Vec_model(
-        embed_arch(**embed_config),
-        encode_arch(**encode_config)
+        embed_arch(**embed_config), encode_arch(**encode_config)
     )
     tok2vec.initialize(docs)
     vectors, backprop = tok2vec.begin_update(docs)
@@ -88,7 +87,7 @@ def test_init_tok2vec():
     nlp = English()
     tok2vec = nlp.add_pipe("tok2vec")
     assert tok2vec.listeners == []
-    nlp.begin_training()
+    nlp.initialize()
     assert tok2vec.model.get_dim("nO")

@@ -154,7 +153,7 @@ def test_tok2vec_listener():
     # Check that the Tok2Vec component finds it listeners
     assert tok2vec.listeners == []
-    optimizer = nlp.begin_training(lambda: train_examples)
+    optimizer = nlp.initialize(lambda: train_examples)
     assert tok2vec.listeners == [tagger_tok2vec]

     for i in range(5):

View File

@@ -428,7 +428,7 @@ def test_issue999():
     for _, offsets in TRAIN_DATA:
         for start, end, label in offsets:
             ner.add_label(label)
-    nlp.begin_training()
+    nlp.initialize()
     for itn in range(20):
         random.shuffle(TRAIN_DATA)
         for raw_text, entity_offsets in TRAIN_DATA:

Some files were not shown because too many files have changed in this diff.