Refactor CLI

This commit is contained in:
Ines Montani 2020-09-28 15:09:59 +02:00
parent a89e0ff7cb
commit 822ea4ef61
18 changed files with 917 additions and 913 deletions

View File

@ -15,8 +15,7 @@ from .debug_config import debug_config # noqa: F401
from .debug_model import debug_model # noqa: F401
from .evaluate import evaluate # noqa: F401
from .convert import convert # noqa: F401
#from .init_model import init_model # noqa: F401
from .init_pipeline import init_pipeline # noqa: F401
from .init_pipeline import init_pipeline_cli # noqa: F401
from .init_config import init_config, fill_config # noqa: F401
from .validate import validate # noqa: F401
from .project.clone import project_clone # noqa: F401

View File

@ -10,13 +10,12 @@ from click import NoSuchOption
from click.parser import split_arg_string
from typer.main import get_command
from contextlib import contextmanager
from thinc.api import Config, ConfigValidationError
from thinc.api import Config, ConfigValidationError, require_gpu
from configparser import InterpolationError
import os
from ..schemas import ProjectConfigSchema, validate
from ..util import import_file, run_command, make_tempdir, registry, logger
from ..util import ensure_path
if TYPE_CHECKING:
from pathy import Pathy # noqa: F401
@ -276,18 +275,6 @@ def import_code(code_path: Optional[Union[Path, str]]) -> None:
msg.fail(f"Couldn't load Python code: {code_path}", e, exits=1)
def get_sourced_components(config: Union[Dict[str, Any], Config]) -> List[str]:
"""RETURNS (List[str]): All sourced components in the original config,
e.g. {"source": "en_core_web_sm"}. If the config contains a key
"factory", we assume it refers to a component factory.
"""
return [
name
for name, cfg in config.get("components", {}).items()
if "factory" not in cfg and "source" in cfg
]
def upload_file(src: Path, dest: Union[str, "Pathy"]) -> None:
"""Upload a file.
@ -459,3 +446,23 @@ def string_to_list(value: str, intify: bool = False) -> Union[List[str], List[in
p = int(p)
result.append(p)
return result
class CliLogger:
"""Helper mocking up the most commonly used logger methods. Can be passed
into functions like train() to make them output pretty-printed messages
on the CLI and regular logging if used from within Python.
"""
debug = msg.text
info = msg.info
warn = msg.info
error = msg.fail
def setup_gpu(use_gpu: int):
if use_gpu >= 0:
msg.info(f"Using GPU: {use_gpu}")
require_gpu(use_gpu)
else:
msg.info("Using CPU")

View File

@ -1,7 +1,7 @@
from typing import Optional, Dict, Any, Union, List
from pathlib import Path
from wasabi import msg, table
from thinc.api import Config, ConfigValidationError
from thinc.api import Config
from thinc.config import VARIABLE_RE
import typer
@ -52,10 +52,8 @@ def debug_config(
with show_validation_error(config_path):
config = util.load_config(config_path, overrides=overrides)
nlp = util.load_model_from_config(config)
# Use the resolved config here in case user has one function returning
# a dict of corpora etc.
resolved = util.resolve_training_config(nlp.config)
check_section_refs(resolved, ["training.dev_corpus", "training.train_corpus"])
dot_names = ["training.dev_corpus", "training.train_corpus"]
util.resolve_dot_names(nlp.config, dot_names)
msg.good("Config is valid")
if show_vars:
variables = get_variables(config)
@ -97,23 +95,3 @@ def get_variables(config: Config) -> Dict[str, Any]:
value = util.dot_to_object(config, path)
result[variable] = repr(value)
return result
def check_section_refs(config: Config, fields: List[str]) -> None:
"""Validate fields in the config that refer to other sections or values
(e.g. in the corpora) and make sure that those references exist.
"""
errors = []
for field in fields:
# If the field doesn't exist in the config, we ignore it
try:
value = util.dot_to_object(config, field)
except KeyError:
continue
try:
util.dot_to_object(config, value)
except KeyError:
msg = f"not a valid section reference: {value}"
errors.append({"loc": field.split("."), "msg": msg})
if errors:
raise ConfigValidationError(config=config, errors=errors)

View File

@ -7,10 +7,13 @@ from wasabi import Printer, MESSAGES, msg
import typer
from ._util import app, Arg, Opt, show_validation_error, parse_config_overrides
from ._util import import_code, debug_cli, get_sourced_components
from ._util import import_code, debug_cli
from ..training import Corpus, Example
from ..training.initialize import get_sourced_components
from ..schemas import ConfigSchemaTraining
from ..pipeline._parser_internals import nonproj
from ..language import Language
from ..util import registry
from .. import util
@ -94,26 +97,13 @@ def debug_data(
with show_validation_error(config_path):
cfg = util.load_config(config_path, overrides=config_overrides)
nlp = util.load_model_from_config(cfg)
C = util.resolve_training_config(nlp.config)
T = registry.resolve(nlp.config["training"], schema=ConfigSchemaTraining)
# Use original config here, not resolved version
sourced_components = get_sourced_components(cfg)
frozen_components = C["training"]["frozen_components"]
frozen_components = T["frozen_components"]
resume_components = [p for p in sourced_components if p not in frozen_components]
pipeline = nlp.pipe_names
factory_names = [nlp.get_pipe_meta(pipe).factory for pipe in nlp.pipe_names]
tag_map_path = util.ensure_path(C["training"]["tag_map"])
tag_map = {}
if tag_map_path is not None:
tag_map = srsly.read_json(tag_map_path)
morph_rules_path = util.ensure_path(C["training"]["morph_rules"])
morph_rules = {}
if morph_rules_path is not None:
morph_rules = srsly.read_json(morph_rules_path)
# Replace tag map with provided mapping
nlp.vocab.morphology.load_tag_map(tag_map)
# Load morph rules
nlp.vocab.morphology.load_morph_exceptions(morph_rules)
msg.divider("Data file validation")
# Create the gold corpus to be able to better analyze data
@ -145,10 +135,10 @@ def debug_data(
train_texts = gold_train_data["texts"]
dev_texts = gold_dev_data["texts"]
frozen_components = C["training"]["frozen_components"]
frozen_components = T["frozen_components"]
msg.divider("Training stats")
msg.text(f"Language: {C['nlp']['lang']}")
msg.text(f"Language: {nlp.lang}")
msg.text(f"Training pipeline: {', '.join(pipeline)}")
if resume_components:
msg.text(f"Components from other pipelines: {', '.join(resume_components)}")
@ -355,6 +345,7 @@ def debug_data(
if "tagger" in factory_names:
msg.divider("Part-of-speech Tagging")
labels = [label for label in gold_train_data["tags"]]
# TODO: does this need to be updated?
tag_map = nlp.vocab.morphology.tag_map
msg.info(f"{len(labels)} label(s) in data ({len(tag_map)} label(s) in tag map)")
labels_with_counts = _format_labels(

View File

@ -4,12 +4,14 @@ from pathlib import Path
from spacy.training import Example
from spacy.util import dot_to_object
from wasabi import msg
from thinc.api import require_gpu, fix_random_seed, set_dropout_rate, Adam
from thinc.api import fix_random_seed, set_dropout_rate, Adam
from thinc.api import Model, data_validation, set_gpu_allocator
import typer
from ._util import Arg, Opt, debug_cli, show_validation_error
from ._util import parse_config_overrides, string_to_list
from ._util import parse_config_overrides, string_to_list, setup_gpu
from ..schemas import ConfigSchemaTraining
from ..util import registry
from .. import util
@ -37,11 +39,7 @@ def debug_model_cli(
DOCS: https://nightly.spacy.io/api/cli#debug-model
"""
if use_gpu >= 0:
msg.info("Using GPU")
require_gpu(use_gpu)
else:
msg.info("Using CPU")
setup_gpu(use_gpu)
layers = string_to_list(layers, intify=True)
print_settings = {
"dimensions": dimensions,
@ -65,8 +63,8 @@ def debug_model_cli(
set_gpu_allocator(allocator)
with show_validation_error(config_path):
nlp = util.load_model_from_config(raw_config)
C = util.resolve_training_config(nlp.config)
seed = C["training"]["seed"]
T = registry.resolve(nlp.config["training"], schema=ConfigSchemaTraining)
seed = T["seed"]
if seed is not None:
msg.info(f"Fixing random seed: {seed}")
fix_random_seed(seed)
@ -77,7 +75,7 @@ def debug_model_cli(
exits=1,
)
model = pipe.model
debug_model(C, nlp, model, print_settings=print_settings)
debug_model(T, nlp, model, print_settings=print_settings)
def debug_model(

View File

@ -3,11 +3,11 @@ from wasabi import Printer
from pathlib import Path
import re
import srsly
from thinc.api import require_gpu, fix_random_seed
from thinc.api import fix_random_seed
from ..training import Corpus
from ..tokens import Doc
from ._util import app, Arg, Opt
from ._util import app, Arg, Opt, setup_gpu
from ..scorer import Scorer
from .. import util
from .. import displacy
@ -61,8 +61,7 @@ def evaluate(
) -> Scorer:
msg = Printer(no_print=silent, pretty=not silent)
fix_random_seed()
if use_gpu >= 0:
require_gpu(use_gpu)
setup_gpu(use_gpu)
data_path = util.ensure_path(data_path)
output_path = util.ensure_path(output)
displacy_path = util.ensure_path(displacy_path)

View File

@ -1,22 +1,13 @@
from typing import Optional, Dict, Callable, Any
from typing import Optional
import logging
from pathlib import Path
from wasabi import msg
import typer
from thinc.api import Config, fix_random_seed, set_gpu_allocator
import srsly
from .. import util
from ..util import registry, resolve_dot_names, OOV_RANK
from ..schemas import ConfigSchemaTraining, ConfigSchemaPretrain, ConfigSchemaInit
from ..language import Language
from ..lookups import Lookups
from ..errors import Errors
from ..training.initialize import init_nlp
from ._util import init_cli, Arg, Opt, parse_config_overrides, show_validation_error
from ._util import import_code, get_sourced_components
DEFAULT_OOV_PROB = -20
from ._util import import_code, CliLogger, setup_gpu
@init_cli.command(
@ -31,178 +22,16 @@ def init_pipeline_cli(
output_path: Path = Arg(..., help="Output directory for the prepared data"),
code_path: Optional[Path] = Opt(None, "--code", "-c", help="Path to Python file with additional code (registered functions) to be imported"),
verbose: bool = Opt(False, "--verbose", "-V", "-VV", help="Display more information for debugging purposes"),
use_gpu: int = Opt(-1, "--gpu-id", "-g", help="GPU ID or -1 for CPU")
# fmt: on
):
util.logger.setLevel(logging.DEBUG if verbose else logging.ERROR)
overrides = parse_config_overrides(ctx.args)
import_code(code_path)
setup_gpu(use_gpu)
with show_validation_error(config_path):
config = util.load_config(config_path, overrides=overrides)
nlp = init_pipeline(config)
with show_validation_error(hint_fill=False):
nlp = init_nlp(config, use_gpu=use_gpu, logger=CliLogger, on_succcess=msg.good)
nlp.to_disk(output_path)
msg.good(f"Saved initialized pipeline to {output_path}")
def init_pipeline(config: Config, use_gpu: int = -1) -> Language:
raw_config = config
config = raw_config.interpolate()
if config["training"]["seed"] is not None:
fix_random_seed(config["training"]["seed"])
allocator = config["training"]["gpu_allocator"]
if use_gpu >= 0 and allocator:
set_gpu_allocator(allocator)
# Use original config here before it's resolved to functions
sourced_components = get_sourced_components(config)
with show_validation_error():
nlp = util.load_model_from_config(raw_config, auto_fill=True)
msg.good("Set up nlp object from config")
config = nlp.config.interpolate()
# Resolve all training-relevant sections using the filled nlp config
T = registry.resolve(config["training"], schema=ConfigSchemaTraining)
dot_names = [T["train_corpus"], T["dev_corpus"]]
train_corpus, dev_corpus = resolve_dot_names(config, dot_names)
I = registry.resolve(config["initialize"], schema=ConfigSchemaInit)
V = I["vocab"]
init_vocab(nlp, data=V["data"], lookups=V["lookups"], vectors=V["vectors"])
optimizer = T["optimizer"]
before_to_disk = create_before_to_disk_callback(T["before_to_disk"])
# Components that shouldn't be updated during training
frozen_components = T["frozen_components"]
# Sourced components that require resume_training
resume_components = [p for p in sourced_components if p not in frozen_components]
msg.info(f"Pipeline: {nlp.pipe_names}")
if resume_components:
with nlp.select_pipes(enable=resume_components):
msg.info(f"Resuming training for: {resume_components}")
nlp.resume_training(sgd=optimizer)
with nlp.select_pipes(disable=[*frozen_components, *resume_components]):
nlp.begin_training(lambda: train_corpus(nlp), sgd=optimizer)
msg.good(f"Initialized pipeline components")
# Verify the config after calling 'begin_training' to ensure labels
# are properly initialized
verify_config(nlp)
if "pretraining" in config and config["pretraining"]:
P = registry.resolve(config["pretraining"], schema=ConfigSchemaPretrain)
add_tok2vec_weights(nlp, P, I)
# TODO: this should be handled better?
nlp = before_to_disk(nlp)
return nlp
def init_vocab(
nlp: Language,
*,
data: Optional[Path] = None,
lookups: Optional[Lookups] = None,
vectors: Optional[str] = None,
) -> Language:
if lookups:
nlp.vocab.lookups = lookups
msg.good(f"Added vocab lookups: {', '.join(lookups.tables)}")
data_path = util.ensure_path(data)
if data_path is not None:
lex_attrs = srsly.read_jsonl(data_path)
for lexeme in nlp.vocab:
lexeme.rank = OOV_RANK
for attrs in lex_attrs:
if "settings" in attrs:
continue
lexeme = nlp.vocab[attrs["orth"]]
lexeme.set_attrs(**attrs)
if len(nlp.vocab):
oov_prob = min(lex.prob for lex in nlp.vocab) - 1
else:
oov_prob = DEFAULT_OOV_PROB
nlp.vocab.cfg.update({"oov_prob": oov_prob})
msg.good(f"Added {len(nlp.vocab)} lexical entries to the vocab")
msg.good("Created vocabulary")
if vectors is not None:
add_vectors(nlp, vectors)
msg.good(f"Added vectors: {vectors}")
def add_tok2vec_weights(
nlp: Language, pretrain_config: Dict[str, Any], vocab_config: Dict[str, Any]
) -> None:
# Load pretrained tok2vec weights - cf. CLI command 'pretrain'
P = pretrain_config
V = vocab_config
weights_data = None
init_tok2vec = util.ensure_path(V["init_tok2vec"])
if init_tok2vec is not None:
if P["objective"].get("type") == "vectors" and not V["vectors"]:
err = "Need initialize.vectors if pretraining.objective.type is vectors"
msg.fail(err, exits=1)
if not init_tok2vec.exists():
msg.fail("Can't find pretrained tok2vec", init_tok2vec, exits=1)
with init_tok2vec.open("rb") as file_:
weights_data = file_.read()
if weights_data is not None:
tok2vec_component = P["component"]
if tok2vec_component is None:
msg.fail(
f"To use pretrained tok2vec weights, [pretraining.component] "
f"needs to specify the component that should load them.",
exits=1,
)
layer = nlp.get_pipe(tok2vec_component).model
if P["layer"]:
layer = layer.get_ref(P["layer"])
layer.from_bytes(weights_data)
msg.good(f"Loaded pretrained weights into component '{tok2vec_component}'")
def add_vectors(nlp: Language, vectors: str) -> None:
title = f"Config validation error for vectors {vectors}"
desc = (
"This typically means that there's a problem in the config.cfg included "
"with the packaged vectors. Make sure that the vectors package you're "
"loading is compatible with the current version of spaCy."
)
with show_validation_error(
title=title, desc=desc, hint_fill=False, show_config=False
):
util.load_vectors_into_model(nlp, vectors)
msg(f"Added {len(nlp.vocab.vectors)} vectors from {vectors}")
def verify_config(nlp: Language) -> None:
"""Perform additional checks based on the config, loaded nlp object and training data."""
# TODO: maybe we should validate based on the actual components, the list
# in config["nlp"]["pipeline"] instead?
for pipe_config in nlp.config["components"].values():
# We can't assume that the component name == the factory
factory = pipe_config["factory"]
if factory == "textcat":
verify_textcat_config(nlp, pipe_config)
def verify_textcat_config(nlp: Language, pipe_config: Dict[str, Any]) -> None:
# if 'positive_label' is provided: double check whether it's in the data and
# the task is binary
if pipe_config.get("positive_label"):
textcat_labels = nlp.get_pipe("textcat").labels
pos_label = pipe_config.get("positive_label")
if pos_label not in textcat_labels:
raise ValueError(
Errors.E920.format(pos_label=pos_label, labels=textcat_labels)
)
if len(list(textcat_labels)) != 2:
raise ValueError(
Errors.E919.format(pos_label=pos_label, labels=textcat_labels)
)
def create_before_to_disk_callback(
callback: Optional[Callable[[Language], Language]]
) -> Callable[[Language], Language]:
def before_to_disk(nlp: Language) -> Language:
if not callback:
return nlp
modified_nlp = callback(nlp)
if not isinstance(modified_nlp, Language):
err = Errors.E914.format(name="before_to_disk", value=type(modified_nlp))
raise ValueError(err)
return modified_nlp
return before_to_disk

View File

@ -1,25 +1,13 @@
from typing import Optional
import numpy
import time
import re
from collections import Counter
from pathlib import Path
from thinc.api import require_gpu, set_gpu_allocator
from thinc.api import set_dropout_rate, to_categorical, fix_random_seed
from thinc.api import Config, CosineDistance, L2Distance
from wasabi import msg
import srsly
from functools import partial
import typer
import re
from ._util import app, Arg, Opt, parse_config_overrides, show_validation_error
from ._util import import_code
from ..ml.models.multi_task import build_cloze_multi_task_model
from ..ml.models.multi_task import build_cloze_characters_multi_task_model
from ..tokens import Doc
from ..attrs import ID
from .. import util
from ..util import dot_to_object
from ._util import import_code, setup_gpu, CliLogger
from ..training.pretrain import pretrain
from ..util import load_config
@app.command(
@ -61,15 +49,11 @@ def pretrain_cli(
config_overrides = parse_config_overrides(ctx.args)
import_code(code_path)
verify_cli_args(config_path, output_dir, resume_path, epoch_resume)
if use_gpu >= 0:
msg.info("Using GPU")
require_gpu(use_gpu)
else:
msg.info("Using CPU")
setup_gpu(use_gpu)
msg.info(f"Loading config from: {config_path}")
with show_validation_error(config_path):
raw_config = util.load_config(
raw_config = load_config(
config_path, overrides=config_overrides, interpolate=False
)
config = raw_config.interpolate()
@ -89,250 +73,11 @@ def pretrain_cli(
resume_path=resume_path,
epoch_resume=epoch_resume,
use_gpu=use_gpu,
logger=CliLogger,
)
def pretrain(
config: Config,
output_dir: Path,
resume_path: Optional[Path] = None,
epoch_resume: Optional[int] = None,
use_gpu: int = -1,
):
if config["training"]["seed"] is not None:
fix_random_seed(config["training"]["seed"])
allocator = config["training"]["gpu_allocator"]
if use_gpu >= 0 and allocator:
set_gpu_allocator(allocator)
nlp = util.load_model_from_config(config)
C = util.resolve_training_config(nlp.config)
P_cfg = C["pretraining"]
corpus = dot_to_object(C, P_cfg["corpus"])
batcher = P_cfg["batcher"]
model = create_pretraining_model(nlp, C["pretraining"])
optimizer = C["pretraining"]["optimizer"]
# Load in pretrained weights to resume from
if resume_path is not None:
_resume_model(model, resume_path, epoch_resume)
else:
# Without '--resume-path' the '--epoch-resume' argument is ignored
epoch_resume = 0
tracker = ProgressTracker(frequency=10000)
msg.divider(f"Pre-training tok2vec layer - starting at epoch {epoch_resume}")
row_settings = {"widths": (3, 10, 10, 6, 4), "aligns": ("r", "r", "r", "r", "r")}
msg.row(("#", "# Words", "Total Loss", "Loss", "w/s"), **row_settings)
def _save_model(epoch, is_temp=False):
is_temp_str = ".temp" if is_temp else ""
with model.use_params(optimizer.averages):
with (output_dir / f"model{epoch}{is_temp_str}.bin").open("wb") as file_:
file_.write(model.get_ref("tok2vec").to_bytes())
log = {
"nr_word": tracker.nr_word,
"loss": tracker.loss,
"epoch_loss": tracker.epoch_loss,
"epoch": epoch,
}
with (output_dir / "log.jsonl").open("a") as file_:
file_.write(srsly.json_dumps(log) + "\n")
objective = create_objective(P_cfg["objective"])
# TODO: I think we probably want this to look more like the
# 'create_train_batches' function?
for epoch in range(epoch_resume, P_cfg["max_epochs"]):
for batch_id, batch in enumerate(batcher(corpus(nlp))):
docs = ensure_docs(batch)
loss = make_update(model, docs, optimizer, objective)
progress = tracker.update(epoch, loss, docs)
if progress:
msg.row(progress, **row_settings)
if P_cfg["n_save_every"] and (batch_id % P_cfg["n_save_every"] == 0):
_save_model(epoch, is_temp=True)
_save_model(epoch)
tracker.epoch_loss = 0.0
msg.good("Successfully finished pretrain")
def ensure_docs(examples_or_docs):
docs = []
for eg_or_doc in examples_or_docs:
if isinstance(eg_or_doc, Doc):
docs.append(eg_or_doc)
else:
docs.append(eg_or_doc.reference)
return docs
def _resume_model(model, resume_path, epoch_resume):
msg.info(f"Resume training tok2vec from: {resume_path}")
with resume_path.open("rb") as file_:
weights_data = file_.read()
model.get_ref("tok2vec").from_bytes(weights_data)
# Parse the epoch number from the given weight file
model_name = re.search(r"model\d+\.bin", str(resume_path))
if model_name:
# Default weight file name so read epoch_start from it by cutting off 'model' and '.bin'
epoch_resume = int(model_name.group(0)[5:][:-4]) + 1
msg.info(f"Resuming from epoch: {epoch_resume}")
else:
msg.info(f"Resuming from epoch: {epoch_resume}")
def make_update(model, docs, optimizer, objective_func):
"""Perform an update over a single batch of documents.
docs (iterable): A batch of `Doc` objects.
optimizer (callable): An optimizer.
RETURNS loss: A float for the loss.
"""
predictions, backprop = model.begin_update(docs)
loss, gradients = objective_func(model.ops, docs, predictions)
backprop(gradients)
model.finish_update(optimizer)
# Don't want to return a cupy object here
# The gradients are modified in-place by the BERT MLM,
# so we get an accurate loss
return float(loss)
def create_objective(config):
"""Create the objective for pretraining.
We'd like to replace this with a registry function but it's tricky because
we're also making a model choice based on this. For now we hard-code support
for two types (characters, vectors). For characters you can specify
n_characters, for vectors you can specify the loss.
Bleh.
"""
objective_type = config["type"]
if objective_type == "characters":
return partial(get_characters_loss, nr_char=config["n_characters"])
elif objective_type == "vectors":
if config["loss"] == "cosine":
return partial(
get_vectors_loss,
distance=CosineDistance(normalize=True, ignore_zeros=True),
)
elif config["loss"] == "L2":
return partial(
get_vectors_loss, distance=L2Distance(normalize=True, ignore_zeros=True)
)
else:
raise ValueError("Unexpected loss type", config["loss"])
else:
raise ValueError("Unexpected objective_type", objective_type)
def get_vectors_loss(ops, docs, prediction, distance):
"""Compute a loss based on a distance between the documents' vectors and
the prediction.
"""
# The simplest way to implement this would be to vstack the
# token.vector values, but that's a bit inefficient, especially on GPU.
# Instead we fetch the index into the vectors table for each of our tokens,
# and look them up all at once. This prevents data copying.
ids = ops.flatten([doc.to_array(ID).ravel() for doc in docs])
target = docs[0].vocab.vectors.data[ids]
d_target, loss = distance(prediction, target)
return loss, d_target
def get_characters_loss(ops, docs, prediction, nr_char):
"""Compute a loss based on a number of characters predicted from the docs."""
target_ids = numpy.vstack([doc.to_utf8_array(nr_char=nr_char) for doc in docs])
target_ids = target_ids.reshape((-1,))
target = ops.asarray(to_categorical(target_ids, n_classes=256), dtype="f")
target = target.reshape((-1, 256 * nr_char))
diff = prediction - target
loss = (diff ** 2).sum()
d_target = diff / float(prediction.shape[0])
return loss, d_target
def create_pretraining_model(nlp, pretrain_config):
"""Define a network for the pretraining. We simply add an output layer onto
the tok2vec input model. The tok2vec input model needs to be a model that
takes a batch of Doc objects (as a list), and returns a list of arrays.
Each array in the output needs to have one row per token in the doc.
The actual tok2vec layer is stored as a reference, and only this bit will be
serialized to file and read back in when calling the 'train' command.
"""
component = nlp.get_pipe(pretrain_config["component"])
if pretrain_config.get("layer"):
tok2vec = component.model.get_ref(pretrain_config["layer"])
else:
tok2vec = component.model
# TODO
maxout_pieces = 3
hidden_size = 300
if pretrain_config["objective"]["type"] == "vectors":
model = build_cloze_multi_task_model(
nlp.vocab, tok2vec, hidden_size=hidden_size, maxout_pieces=maxout_pieces
)
elif pretrain_config["objective"]["type"] == "characters":
model = build_cloze_characters_multi_task_model(
nlp.vocab,
tok2vec,
hidden_size=hidden_size,
maxout_pieces=maxout_pieces,
nr_char=pretrain_config["objective"]["n_characters"],
)
model.initialize(X=[nlp.make_doc("Give it a doc to infer shapes")])
set_dropout_rate(model, pretrain_config["dropout"])
return model
class ProgressTracker:
def __init__(self, frequency=1000000):
self.loss = 0.0
self.prev_loss = 0.0
self.nr_word = 0
self.words_per_epoch = Counter()
self.frequency = frequency
self.last_time = time.time()
self.last_update = 0
self.epoch_loss = 0.0
def update(self, epoch, loss, docs):
self.loss += loss
self.epoch_loss += loss
words_in_batch = sum(len(doc) for doc in docs)
self.words_per_epoch[epoch] += words_in_batch
self.nr_word += words_in_batch
words_since_update = self.nr_word - self.last_update
if words_since_update >= self.frequency:
wps = words_since_update / (time.time() - self.last_time)
self.last_update = self.nr_word
self.last_time = time.time()
loss_per_word = self.loss - self.prev_loss
status = (
epoch,
self.nr_word,
_smart_round(self.loss, width=10),
_smart_round(loss_per_word, width=6),
int(wps),
)
self.prev_loss = float(self.loss)
return status
else:
return None
def _smart_round(figure, width=10, max_decimal=4):
"""Round large numbers as integers, smaller numbers as decimals."""
n_digits = len(str(int(figure)))
n_decimal = width - (n_digits + 1)
if n_decimal <= 1:
return str(int(figure))
else:
n_decimal = min(n_decimal, max_decimal)
format_str = "%." + str(n_decimal) + "f"
return format_str % figure
def verify_cli_args(config_path, output_dir, resume_path, epoch_resume):
if not config_path or not config_path.exists():
msg.fail("Config file not found", config_path, exits=1)

View File

@ -1,24 +1,16 @@
from typing import Optional, Dict, Any, Tuple, Union, Callable, List
from timeit import default_timer as timer
import tqdm
from typing import Optional
from pathlib import Path
from wasabi import msg
import thinc
import thinc.schedules
from thinc.api import Config, Optimizer, require_gpu, fix_random_seed, set_gpu_allocator
import random
from thinc.api import Config
import typer
import logging
from .init_pipeline import init_pipeline
from .init_pipeline import create_before_to_disk_callback
from ._util import app, Arg, Opt, parse_config_overrides, show_validation_error
from ._util import import_code
from ._util import import_code, CliLogger, setup_gpu
from ..language import Language
from ..training.loop import train
from ..training.initialize import init_nlp, must_reinitialize
from .. import util
from ..errors import Errors
from ..util import resolve_dot_names, registry
from ..schemas import ConfigSchemaTraining
@app.command(
@ -52,31 +44,33 @@ def train_cli(
verify_cli_args(config_path, output_path)
overrides = parse_config_overrides(ctx.args)
import_code(code_path)
if use_gpu >= 0:
msg.info(f"Using GPU: {use_gpu}")
require_gpu(use_gpu)
else:
msg.info("Using CPU")
setup_gpu(use_gpu)
with show_validation_error(config_path):
config = util.load_config(config_path, overrides=overrides, interpolate=False)
msg.divider("Initializing pipeline")
nlp = init_nlp(config, output_path)
nlp = init_pipeline(config, output_path, use_gpu=use_gpu)
msg.divider("Training pipeline")
train(nlp, output_path, use_gpu=use_gpu)
final_path = train(nlp, output_path, use_gpu=use_gpu, logger=CliLogger)
if final_path:
msg.good(f"Saved pipeline to output directory", final_path)
def init_nlp(config: Config, output_path: Optional[Path]) -> Language:
def init_pipeline(
config: Config, output_path: Optional[Path], *, use_gpu: int = -1
) -> Language:
init_kwargs = {"use_gpu": use_gpu, "logger": CliLogger, "on_success": msg.good}
if output_path is not None:
init_path = output_path / "model-initial"
if not init_path.exists():
msg.info(f"Initializing the pipeline in {init_path}")
nlp = init_pipeline(config)
nlp = init_nlp(config, **init_kwargs)
nlp.to_disk(init_path)
msg.good(f"Saved initialized pipeline to {init_path}")
else:
nlp = util.load_model(init_path)
if must_reinitialize(config, nlp.config):
msg.warn("Config has changed: need to re-initialize pipeline")
nlp = init_pipeline(config)
nlp = init_nlp(config, **init_kwargs)
nlp.to_disk(init_path)
msg.good(f"Re-initialized pipeline in {init_path}")
else:
@ -88,279 +82,7 @@ def init_nlp(config: Config, output_path: Optional[Path]) -> Language:
"the vocabulary, vectors and label scheme. To take advantage of this, "
"provide an output directory."
)
return init_pipeline(config)
def train(
nlp: Language, output_path: Optional[Path] = None, *, use_gpu: int = -1
) -> None:
# Create iterator, which yields out info after each optimization step.
config = nlp.config.interpolate()
if config["training"]["seed"] is not None:
fix_random_seed(config["training"]["seed"])
allocator = config["training"]["gpu_allocator"]
if use_gpu >= 0 and allocator:
set_gpu_allocator(allocator)
T = registry.resolve(config["training"], schema=ConfigSchemaTraining)
dot_names = [T["train_corpus"], T["dev_corpus"]]
train_corpus, dev_corpus = resolve_dot_names(config, dot_names)
optimizer = T["optimizer"]
score_weights = T["score_weights"]
batcher = T["batcher"]
train_logger = T["logger"]
before_to_disk = create_before_to_disk_callback(T["before_to_disk"])
# Components that shouldn't be updated during training
frozen_components = T["frozen_components"]
# Create iterator, which yields out info after each optimization step.
training_step_iterator = train_while_improving(
nlp,
optimizer,
create_train_batches(train_corpus(nlp), batcher, T["max_epochs"]),
create_evaluation_callback(nlp, dev_corpus, score_weights),
dropout=T["dropout"],
accumulate_gradient=T["accumulate_gradient"],
patience=T["patience"],
max_steps=T["max_steps"],
eval_frequency=T["eval_frequency"],
exclude=frozen_components,
)
msg.info(f"Pipeline: {nlp.pipe_names}")
if frozen_components:
msg.info(f"Frozen components: {frozen_components}")
msg.info(f"Initial learn rate: {optimizer.learn_rate}")
with nlp.select_pipes(disable=frozen_components):
print_row, finalize_logger = train_logger(nlp)
try:
progress = tqdm.tqdm(total=T["eval_frequency"], leave=False)
progress.set_description(f"Epoch 1")
for batch, info, is_best_checkpoint in training_step_iterator:
progress.update(1)
if is_best_checkpoint is not None:
progress.close()
print_row(info)
if is_best_checkpoint and output_path is not None:
with nlp.select_pipes(disable=frozen_components):
update_meta(T, nlp, info)
with nlp.use_params(optimizer.averages):
nlp = before_to_disk(nlp)
nlp.to_disk(output_path / "model-best")
progress = tqdm.tqdm(total=T["eval_frequency"], leave=False)
progress.set_description(f"Epoch {info['epoch']}")
except Exception as e:
finalize_logger()
if output_path is not None:
# We don't want to swallow the traceback if we don't have a
# specific error.
msg.warn(
f"Aborting and saving the final best model. "
f"Encountered exception: {str(e)}"
)
nlp = before_to_disk(nlp)
nlp.to_disk(output_path / "model-final")
raise e
finally:
finalize_logger()
if output_path is not None:
final_model_path = output_path / "model-final"
if optimizer.averages:
with nlp.use_params(optimizer.averages):
nlp.to_disk(final_model_path)
else:
nlp.to_disk(final_model_path)
msg.good(f"Saved pipeline to output directory {final_model_path}")
def must_reinitialize(train_config: Config, init_config: Config) -> bool:
# TODO: do this better and more fine-grained
return train_config.interpolate().to_str() == init_config.interpolate().to_str()
def add_vectors(nlp: Language, vectors: str) -> None:
title = f"Config validation error for vectors {vectors}"
desc = (
"This typically means that there's a problem in the config.cfg included "
"with the packaged vectors. Make sure that the vectors package you're "
"loading is compatible with the current version of spaCy."
)
with show_validation_error(
title=title, desc=desc, hint_fill=False, show_config=False
):
util.load_vectors_into_model(nlp, vectors)
def create_train_batches(iterator, batcher, max_epochs: int):
epoch = 0
examples = list(iterator)
if not examples:
# Raise error if no data
raise ValueError(Errors.E986)
while max_epochs < 1 or epoch != max_epochs:
random.shuffle(examples)
for batch in batcher(examples):
yield epoch, batch
epoch += 1
def create_evaluation_callback(
nlp: Language, dev_corpus: Callable, weights: Dict[str, float]
) -> Callable[[], Tuple[float, Dict[str, float]]]:
weights = {key: value for key, value in weights.items() if value is not None}
def evaluate() -> Tuple[float, Dict[str, float]]:
dev_examples = list(dev_corpus(nlp))
scores = nlp.evaluate(dev_examples)
# Calculate a weighted sum based on score_weights for the main score.
# We can only consider scores that are ints/floats, not dicts like
# entity scores per type etc.
for key, value in scores.items():
if key in weights and not isinstance(value, (int, float)):
raise ValueError(Errors.E915.format(name=key, score_type=type(value)))
try:
weighted_score = sum(
scores.get(s, 0.0) * weights.get(s, 0.0) for s in weights
)
except KeyError as e:
keys = list(scores.keys())
err = Errors.E983.format(dict="score_weights", key=str(e), keys=keys)
raise KeyError(err) from None
return weighted_score, scores
return evaluate
def train_while_improving(
nlp: Language,
optimizer: Optimizer,
train_data,
evaluate,
*,
dropout: float,
eval_frequency: int,
accumulate_gradient: int,
patience: int,
max_steps: int,
exclude: List[str],
):
"""Train until an evaluation stops improving. Works as a generator,
with each iteration yielding a tuple `(batch, info, is_best_checkpoint)`,
where info is a dict, and is_best_checkpoint is in [True, False, None] --
None indicating that the iteration was not evaluated as a checkpoint.
The evaluation is conducted by calling the evaluate callback.
Positional arguments:
nlp: The spaCy pipeline to evaluate.
optimizer: The optimizer callable.
train_data (Iterable[Batch]): A generator of batches, with the training
data. Each batch should be a Sized[Tuple[Input, Annot]]. The training
data iterable needs to take care of iterating over the epochs and
shuffling.
evaluate (Callable[[], Tuple[float, Any]]): A callback to perform evaluation.
The callback should take no arguments and return a tuple
`(main_score, other_scores)`. The main_score should be a float where
higher is better. other_scores can be any object.
Every iteration, the function yields out a tuple with:
* batch: A list of Example objects.
* info: A dict with various information about the last update (see below).
* is_best_checkpoint: A value in None, False, True, indicating whether this
was the best evaluation so far. You should use this to save the model
checkpoints during training. If None, evaluation was not conducted on
that iteration. False means evaluation was conducted, but a previous
evaluation was better.
The info dict provides the following information:
epoch (int): How many passes over the data have been completed.
step (int): How many steps have been completed.
score (float): The main score from the last evaluation.
other_scores: : The other scores from the last evaluation.
losses: The accumulated losses throughout training.
checkpoints: A list of previous results, where each result is a
(score, step, epoch) tuple.
"""
if isinstance(dropout, float):
dropouts = thinc.schedules.constant(dropout)
else:
dropouts = dropout
results = []
losses = {}
words_seen = 0
start_time = timer()
for step, (epoch, batch) in enumerate(train_data):
dropout = next(dropouts)
for subbatch in subdivide_batch(batch, accumulate_gradient):
nlp.update(
subbatch, drop=dropout, losses=losses, sgd=False, exclude=exclude
)
# TODO: refactor this so we don't have to run it separately in here
for name, proc in nlp.pipeline:
if (
name not in exclude
and hasattr(proc, "model")
and proc.model not in (True, False, None)
):
proc.model.finish_update(optimizer)
optimizer.step_schedules()
if not (step % eval_frequency):
if optimizer.averages:
with nlp.use_params(optimizer.averages):
score, other_scores = evaluate()
else:
score, other_scores = evaluate()
results.append((score, step))
is_best_checkpoint = score == max(results)[0]
else:
score, other_scores = (None, None)
is_best_checkpoint = None
words_seen += sum(len(eg) for eg in batch)
info = {
"epoch": epoch,
"step": step,
"score": score,
"other_scores": other_scores,
"losses": losses,
"checkpoints": results,
"seconds": int(timer() - start_time),
"words": words_seen,
}
yield batch, info, is_best_checkpoint
if is_best_checkpoint is not None:
losses = {}
# Stop if no improvement in `patience` updates (if specified)
best_score, best_step = max(results)
if patience and (step - best_step) >= patience:
break
# Stop if we've exhausted our max steps (if specified)
if max_steps and step >= max_steps:
break
def subdivide_batch(batch, accumulate_gradient):
batch = list(batch)
batch.sort(key=lambda eg: len(eg.predicted))
sub_len = len(batch) // accumulate_gradient
start = 0
for i in range(accumulate_gradient):
subbatch = batch[start : start + sub_len]
if subbatch:
yield subbatch
start += len(subbatch)
subbatch = batch[start:]
if subbatch:
yield subbatch
def update_meta(
training: Union[Dict[str, Any], Config], nlp: Language, info: Dict[str, Any]
) -> None:
nlp.meta["performance"] = {}
for metric in training["score_weights"]:
if metric is not None:
nlp.meta["performance"][metric] = info["other_scores"].get(metric, 0.0)
for pipe_name in nlp.pipe_names:
nlp.meta["performance"][f"{pipe_name}_loss"] = info["losses"][pipe_name]
return init_nlp(config, **init_kwargs)
def verify_cli_args(config_path: Path, output_path: Optional[Path] = None) -> None:
@ -371,17 +93,3 @@ def verify_cli_args(config_path: Path, output_path: Optional[Path] = None) -> No
if not output_path.exists():
output_path.mkdir()
msg.good(f"Created output directory: {output_path}")
# TODO: this is currently imported by the ray extension and not used otherwise
def load_from_paths(
config: Config,
) -> Tuple[List[Dict[str, str]], Dict[str, dict], bytes]:
weights_data = None
init_tok2vec = util.ensure_path(config["training"]["init_tok2vec"])
if init_tok2vec is not None:
if not init_tok2vec.exists():
msg.fail("Can't find pretrained tok2vec", init_tok2vec, exits=1)
with init_tok2vec.open("rb") as file_:
weights_data = file_.read()
return None, {}, {}, weights_data

View File

@ -9,10 +9,10 @@ from spacy.pipeline import TextCategorizer
from spacy.tokens import Doc
from spacy.pipeline.tok2vec import DEFAULT_TOK2VEC_MODEL
from spacy.scorer import Scorer
from spacy.training import Example
from spacy.training.initialize import verify_textcat_config
from ..util import make_tempdir
from ...cli.train import verify_textcat_config
from ...training import Example
TRAIN_DATA = [

View File

@ -7,7 +7,6 @@ from spacy.cli.init_config import init_config, RECOMMENDATIONS
from spacy.cli._util import validate_project_commands, parse_config_overrides
from spacy.cli._util import load_project_config, substitute_project_variables
from spacy.cli._util import string_to_list, OVERRIDES_ENV_VAR
from spacy.cli.debug_config import check_section_refs
from thinc.api import ConfigValidationError, Config
import srsly
import os
@ -414,15 +413,3 @@ def test_string_to_list(value):
def test_string_to_list_intify(value):
assert string_to_list(value, intify=False) == ["1", "2", "3"]
assert string_to_list(value, intify=True) == [1, 2, 3]
def test_check_section_refs():
config = {"a": {"b": {"c": "a.d.e"}, "d": {"e": 1}}, "f": {"g": "d.f"}}
config = Config(config)
# Valid section reference
check_section_refs(config, ["a.b.c"])
# Section that doesn't exist in this config
check_section_refs(config, ["x.y.z"])
# Invalid section reference
with pytest.raises(ConfigValidationError):
check_section_refs(config, ["a.b.c", "f.g"])

View File

@ -7,7 +7,6 @@ from spacy import util
from spacy import prefer_gpu, require_gpu
from spacy.ml._precomputable_affine import PrecomputableAffine
from spacy.ml._precomputable_affine import _backprop_precomputable_affine_padding
from thinc.api import Optimizer
@pytest.fixture
@ -158,16 +157,3 @@ def test_dot_to_dict(dot_notation, expected):
result = util.dot_to_dict(dot_notation)
assert result == expected
assert util.dict_to_dot(result) == dot_notation
def test_resolve_training_config():
config = {
"nlp": {"lang": "en", "disabled": []},
"training": {"dropout": 0.1, "optimizer": {"@optimizers": "Adam.v1"}},
"corpora": {},
}
resolved = util.resolve_training_config(config)
assert resolved["training"]["dropout"] == 0.1
assert isinstance(resolved["training"]["optimizer"], Optimizer)
assert resolved["corpora"] == {}
assert "nlp" not in resolved

View File

@ -1,14 +1,15 @@
import pytest
from .util import get_random_doc
from spacy import util
from spacy.util import dot_to_object, SimpleFrozenList
from thinc.api import Config, Optimizer
from thinc.api import Config, Optimizer, ConfigValidationError
from spacy.training.batchers import minibatch_by_words
from ..lang.en import English
from ..lang.nl import Dutch
from ..language import DEFAULT_CONFIG_PATH
from spacy.lang.en import English
from spacy.lang.nl import Dutch
from spacy.language import DEFAULT_CONFIG_PATH
from spacy.schemas import ConfigSchemaTraining
from .util import get_random_doc
@pytest.mark.parametrize(
@ -101,8 +102,8 @@ def test_util_dot_section():
dot_to_object(en_nlp.config, "nlp.pipeline.tagger")
with pytest.raises(KeyError):
dot_to_object(en_nlp.config, "nlp.unknownattribute")
resolved = util.resolve_training_config(nl_nlp.config)
assert isinstance(dot_to_object(resolved, "training.optimizer"), Optimizer)
T = util.registry.resolve(nl_nlp.config["training"], schema=ConfigSchemaTraining)
assert isinstance(dot_to_object({"training": T}, "training.optimizer"), Optimizer)
def test_simple_frozen_list():
@ -120,3 +121,17 @@ def test_simple_frozen_list():
t = SimpleFrozenList(["foo", "bar"], error="Error!")
with pytest.raises(NotImplementedError):
t.append("baz")
def test_resolve_dot_names():
config = {
"training": {"optimizer": {"@optimizers": "Adam.v1"}},
"foo": {"bar": "training.optimizer", "baz": "training.xyz"},
}
result = util.resolve_dot_names(config, ["foo.bar"])
assert isinstance(result[0], Optimizer)
with pytest.raises(ConfigValidationError) as e:
util.resolve_dot_names(config, ["foo.baz", "foo.bar"])
errors = e.value.errors
assert len(errors) == 1
assert errors[0]["loc"] == ["training", "xyz"]

View File

@ -2,8 +2,8 @@ from typing import Dict, Iterable, Callable
import pytest
from thinc.api import Config
from spacy import Language
from spacy.util import load_model_from_config, registry, dot_to_object
from spacy.util import resolve_training_config
from spacy.util import load_model_from_config, registry, resolve_dot_names
from spacy.schemas import ConfigSchemaTraining
from spacy.training import Example
@ -39,21 +39,21 @@ def test_readers():
config = Config().from_str(config_string)
nlp = load_model_from_config(config, auto_fill=True)
resolved = resolve_training_config(nlp.config)
train_corpus = dot_to_object(resolved, resolved["training"]["train_corpus"])
dot_names = ["training.train_corpus", "training.dev_corpus"]
train_corpus, dev_corpus = resolve_dot_names(nlp.config, dot_names)
assert isinstance(train_corpus, Callable)
optimizer = resolved["training"]["optimizer"]
T = registry.resolve(nlp.config["training"], schema=ConfigSchemaTraining)
optimizer = T["optimizer"]
# simulate a training loop
nlp.begin_training(lambda: train_corpus(nlp), sgd=optimizer)
for example in train_corpus(nlp):
nlp.update([example], sgd=optimizer)
dev_corpus = dot_to_object(resolved, resolved["training"]["dev_corpus"])
scores = nlp.evaluate(list(dev_corpus(nlp)))
assert scores["cats_score"]
# ensure the pipeline runs
doc = nlp("Quick test")
assert doc.cats
extra_corpus = resolved["corpora"]["extra"]
extra_corpus = registry.resolve(nlp.config["corpora"])["extra"]
assert isinstance(extra_corpus, Callable)
@ -89,9 +89,10 @@ def test_cat_readers(reader, additional_config):
config["corpora"]["@readers"] = reader
config["corpora"].update(additional_config)
nlp = load_model_from_config(config, auto_fill=True)
resolved = resolve_training_config(nlp.config)
train_corpus = dot_to_object(resolved, resolved["training"]["train_corpus"])
optimizer = resolved["training"]["optimizer"]
dot_names = ["training.train_corpus", "training.dev_corpus"]
train_corpus, dev_corpus = resolve_dot_names(nlp.config, dot_names)
T = registry.resolve(nlp.config["training"], schema=ConfigSchemaTraining)
optimizer = T["optimizer"]
# simulate a training loop
nlp.begin_training(lambda: train_corpus(nlp), sgd=optimizer)
for example in train_corpus(nlp):
@ -100,7 +101,6 @@ def test_cat_readers(reader, additional_config):
assert sorted(list(set(example.y.cats.values()))) == [0.0, 1.0]
nlp.update([example], sgd=optimizer)
# simulate performance benchmark on dev corpus
dev_corpus = dot_to_object(resolved, resolved["training"]["dev_corpus"])
dev_examples = list(dev_corpus(nlp))
for example in dev_examples:
# this shouldn't fail if each dev example has at least one positive label

View File

@ -0,0 +1,205 @@
from typing import Union, Dict, Optional, Any, List, Callable
from thinc.api import Config, fix_random_seed, set_gpu_allocator
from thinc.api import ConfigValidationError
from pathlib import Path
import srsly
from .loop import create_before_to_disk_callback
from ..language import Language
from ..lookups import Lookups
from ..errors import Errors
from ..schemas import ConfigSchemaTraining, ConfigSchemaInit, ConfigSchemaPretrain
from ..util import registry, load_model_from_config, resolve_dot_names
from ..util import load_model, ensure_path, logger, OOV_RANK, DEFAULT_OOV_PROB
def init_nlp(
config: Config,
*,
use_gpu: int = -1,
logger: Callable[[Any], Any] = logger,
on_success: Callable[[str], None] = lambda x: None,
) -> Language:
raw_config = config
config = raw_config.interpolate()
if config["training"]["seed"] is not None:
fix_random_seed(config["training"]["seed"])
allocator = config["training"]["gpu_allocator"]
if use_gpu >= 0 and allocator:
set_gpu_allocator(allocator)
# Use original config here before it's resolved to functions
sourced_components = get_sourced_components(config)
nlp = load_model_from_config(raw_config, auto_fill=True)
on_success("Set up nlp object from config")
config = nlp.config.interpolate()
# Resolve all training-relevant sections using the filled nlp config
T = registry.resolve(config["training"], schema=ConfigSchemaTraining)
dot_names = [T["train_corpus"], T["dev_corpus"]]
train_corpus, dev_corpus = resolve_dot_names(config, dot_names)
I = registry.resolve(config["initialize"], schema=ConfigSchemaInit)
V = I["vocab"]
init_vocab(nlp, data=V["data"], lookups=V["lookups"], vectors=V["vectors"])
optimizer = T["optimizer"]
before_to_disk = create_before_to_disk_callback(T["before_to_disk"])
# Components that shouldn't be updated during training
frozen_components = T["frozen_components"]
# Sourced components that require resume_training
resume_components = [p for p in sourced_components if p not in frozen_components]
logger.info(f"Pipeline: {nlp.pipe_names}")
if resume_components:
with nlp.select_pipes(enable=resume_components):
logger.info(f"Resuming training for: {resume_components}")
nlp.resume_training(sgd=optimizer)
with nlp.select_pipes(disable=[*frozen_components, *resume_components]):
nlp.begin_training(lambda: train_corpus(nlp), sgd=optimizer)
on_success(f"Initialized pipeline components")
# Verify the config after calling 'begin_training' to ensure labels
# are properly initialized
verify_config(nlp)
if "pretraining" in config and config["pretraining"]:
P = registry.resolve(config["pretraining"], schema=ConfigSchemaPretrain)
loaded = add_tok2vec_weights(nlp, P, I)
if loaded and P["component"]:
on_success(f"Loaded pretrained weights into component '{P['component']}'")
nlp = before_to_disk(nlp)
return nlp
def must_reinitialize(train_config: Config, init_config: Config) -> bool:
# TODO: do this better and more fine-grained
return train_config.interpolate().to_str() == init_config.interpolate().to_str()
def init_vocab(
nlp: Language,
*,
data: Optional[Path] = None,
lookups: Optional[Lookups] = None,
vectors: Optional[str] = None,
on_success: Callable[[str], None] = lambda x: None,
) -> Language:
if lookups:
nlp.vocab.lookups = lookups
on_success(f"Added vocab lookups: {', '.join(lookups.tables)}")
data_path = ensure_path(data)
if data_path is not None:
lex_attrs = srsly.read_jsonl(data_path)
for lexeme in nlp.vocab:
lexeme.rank = OOV_RANK
for attrs in lex_attrs:
if "settings" in attrs:
continue
lexeme = nlp.vocab[attrs["orth"]]
lexeme.set_attrs(**attrs)
if len(nlp.vocab):
oov_prob = min(lex.prob for lex in nlp.vocab) - 1
else:
oov_prob = DEFAULT_OOV_PROB
nlp.vocab.cfg.update({"oov_prob": oov_prob})
on_success(f"Added {len(nlp.vocab)} lexical entries to the vocab")
on_success("Created vocabulary")
if vectors is not None:
load_vectors_into_model(nlp, vectors)
on_success(f"Added vectors: {vectors}")
def load_vectors_into_model(
nlp: "Language", name: Union[str, Path], *, add_strings: bool = True
) -> None:
"""Load word vectors from an installed model or path into a model instance."""
try:
vectors_nlp = load_model(name)
except ConfigValidationError as e:
title = f"Config validation error for vectors {name}"
desc = (
"This typically means that there's a problem in the config.cfg included "
"with the packaged vectors. Make sure that the vectors package you're "
"loading is compatible with the current version of spaCy."
)
err = ConfigValidationError.from_error(config=None, title=title, desc=desc)
raise err from None
nlp.vocab.vectors = vectors_nlp.vocab.vectors
if add_strings:
# I guess we should add the strings from the vectors_nlp model?
# E.g. if someone does a similarity query, they might expect the strings.
for key in nlp.vocab.vectors.key2row:
if key in vectors_nlp.vocab.strings:
nlp.vocab.strings.add(vectors_nlp.vocab.strings[key])
def add_tok2vec_weights(
nlp: Language, pretrain_config: Dict[str, Any], vocab_config: Dict[str, Any]
) -> bool:
# Load pretrained tok2vec weights - cf. CLI command 'pretrain'
P = pretrain_config
V = vocab_config
weights_data = None
init_tok2vec = ensure_path(V["init_tok2vec"])
if init_tok2vec is not None:
if P["objective"].get("type") == "vectors" and not V["vectors"]:
err = 'need initialize.vectors if pretraining.objective.type is "vectors"'
errors = [{"loc": ["initialize", "vectors"], "msg": err}]
raise ConfigValidationError(config=nlp.config, errors=errors)
if not init_tok2vec.exists():
err = f"can't find pretrained tok2vec: {init_tok2vec}"
errors = [{"loc": ["initialize", "vectors", "init_tok2vec"], "msg": err}]
raise ConfigValidationError(config=nlp.config, errors=errors)
with init_tok2vec.open("rb") as file_:
weights_data = file_.read()
if weights_data is not None:
tok2vec_component = P["component"]
if tok2vec_component is None:
desc = (
f"To use pretrained tok2vec weights, [pretraining.component] "
f"needs to specify the component that should load them."
)
err = "component can't be null"
errors = [{"loc": ["pretraining", "component"], "msg": err}]
raise ConfigValidationError(
config=nlp.config["pretraining"], errors=errors, desc=desc
)
layer = nlp.get_pipe(tok2vec_component).model
if P["layer"]:
layer = layer.get_ref(P["layer"])
layer.from_bytes(weights_data)
return True
return False
def verify_config(nlp: Language) -> None:
"""Perform additional checks based on the config, loaded nlp object and training data."""
# TODO: maybe we should validate based on the actual components, the list
# in config["nlp"]["pipeline"] instead?
for pipe_config in nlp.config["components"].values():
# We can't assume that the component name == the factory
factory = pipe_config["factory"]
if factory == "textcat":
verify_textcat_config(nlp, pipe_config)
def verify_textcat_config(nlp: Language, pipe_config: Dict[str, Any]) -> None:
# if 'positive_label' is provided: double check whether it's in the data and
# the task is binary
if pipe_config.get("positive_label"):
textcat_labels = nlp.get_pipe("textcat").labels
pos_label = pipe_config.get("positive_label")
if pos_label not in textcat_labels:
raise ValueError(
Errors.E920.format(pos_label=pos_label, labels=textcat_labels)
)
if len(list(textcat_labels)) != 2:
raise ValueError(
Errors.E919.format(pos_label=pos_label, labels=textcat_labels)
)
def get_sourced_components(config: Union[Dict[str, Any], Config]) -> List[str]:
"""RETURNS (List[str]): All sourced components in the original config,
e.g. {"source": "en_core_web_sm"}. If the config contains a key
"factory", we assume it refers to a component factory.
"""
return [
name
for name, cfg in config.get("components", {}).items()
if "factory" not in cfg and "source" in cfg
]

301
spacy/training/loop.py Normal file
View File

@ -0,0 +1,301 @@
from typing import List, Callable, Tuple, Dict, Iterable, Iterator, Union, Any
from typing import Optional
from pathlib import Path
from timeit import default_timer as timer
from thinc.api import Optimizer, Config, constant, fix_random_seed, set_gpu_allocator
import random
import tqdm
from .example import Example
from ..schemas import ConfigSchemaTraining
from ..language import Language
from ..errors import Errors
from ..util import resolve_dot_names, registry, logger
def train(
nlp: Language,
output_path: Optional[Path] = None,
*,
use_gpu: int = -1,
logger: Callable[[Any], Any] = logger,
) -> Optional[Path]:
"""Train a pipeline.
nlp (Language): The initialized nlp object with the full config.
output_path (Path): Optional output path to save trained model to.
use_gpu (int): Whether to train on GPU. Make sure to call require_gpu
before calling this function.
logger (Callable[[Any], Any]): Optional logger exposing the methods info,
error, debug and warn. Defaults to regular spaCy logger but can be
swapped for CLI logger.
RETURNS (Path / None): The path to the final exported model.
"""
# Create iterator, which yields out info after each optimization step.
config = nlp.config.interpolate()
if config["training"]["seed"] is not None:
fix_random_seed(config["training"]["seed"])
allocator = config["training"]["gpu_allocator"]
if use_gpu >= 0 and allocator:
set_gpu_allocator(allocator)
T = registry.resolve(config["training"], schema=ConfigSchemaTraining)
dot_names = [T["train_corpus"], T["dev_corpus"]]
train_corpus, dev_corpus = resolve_dot_names(config, dot_names)
optimizer = T["optimizer"]
score_weights = T["score_weights"]
batcher = T["batcher"]
train_logger = T["logger"]
before_to_disk = create_before_to_disk_callback(T["before_to_disk"])
# Components that shouldn't be updated during training
frozen_components = T["frozen_components"]
# Create iterator, which yields out info after each optimization step.
training_step_iterator = train_while_improving(
nlp,
optimizer,
create_train_batches(train_corpus(nlp), batcher, T["max_epochs"]),
create_evaluation_callback(nlp, dev_corpus, score_weights),
dropout=T["dropout"],
accumulate_gradient=T["accumulate_gradient"],
patience=T["patience"],
max_steps=T["max_steps"],
eval_frequency=T["eval_frequency"],
exclude=frozen_components,
)
logger.info(f"Pipeline: {nlp.pipe_names}")
if frozen_components:
logger.info(f"Frozen components: {frozen_components}")
logger.info(f"Initial learn rate: {optimizer.learn_rate}")
with nlp.select_pipes(disable=frozen_components):
print_row, finalize_logger = train_logger(nlp)
try:
progress = tqdm.tqdm(total=T["eval_frequency"], leave=False)
progress.set_description(f"Epoch 1")
for batch, info, is_best_checkpoint in training_step_iterator:
progress.update(1)
if is_best_checkpoint is not None:
progress.close()
print_row(info)
if is_best_checkpoint and output_path is not None:
with nlp.select_pipes(disable=frozen_components):
update_meta(T, nlp, info)
with nlp.use_params(optimizer.averages):
nlp = before_to_disk(nlp)
nlp.to_disk(output_path / "model-best")
progress = tqdm.tqdm(total=T["eval_frequency"], leave=False)
progress.set_description(f"Epoch {info['epoch']}")
except Exception as e:
finalize_logger()
if output_path is not None:
# We don't want to swallow the traceback if we don't have a
# specific error.
logger.warn(
f"Aborting and saving the final best model. "
f"Encountered exception: {str(e)}"
)
nlp = before_to_disk(nlp)
nlp.to_disk(output_path / "model-final")
raise e
finally:
finalize_logger()
if output_path is not None:
final_model_path = output_path / "model-final"
if optimizer.averages:
with nlp.use_params(optimizer.averages):
nlp.to_disk(final_model_path)
else:
nlp.to_disk(final_model_path)
return final_model_path
def train_while_improving(
nlp: Language,
optimizer: Optimizer,
train_data,
evaluate,
*,
dropout: float,
eval_frequency: int,
accumulate_gradient: int,
patience: int,
max_steps: int,
exclude: List[str],
):
"""Train until an evaluation stops improving. Works as a generator,
with each iteration yielding a tuple `(batch, info, is_best_checkpoint)`,
where info is a dict, and is_best_checkpoint is in [True, False, None] --
None indicating that the iteration was not evaluated as a checkpoint.
The evaluation is conducted by calling the evaluate callback.
Positional arguments:
nlp: The spaCy pipeline to evaluate.
optimizer: The optimizer callable.
train_data (Iterable[Batch]): A generator of batches, with the training
data. Each batch should be a Sized[Tuple[Input, Annot]]. The training
data iterable needs to take care of iterating over the epochs and
shuffling.
evaluate (Callable[[], Tuple[float, Any]]): A callback to perform evaluation.
The callback should take no arguments and return a tuple
`(main_score, other_scores)`. The main_score should be a float where
higher is better. other_scores can be any object.
Every iteration, the function yields out a tuple with:
* batch: A list of Example objects.
* info: A dict with various information about the last update (see below).
* is_best_checkpoint: A value in None, False, True, indicating whether this
was the best evaluation so far. You should use this to save the model
checkpoints during training. If None, evaluation was not conducted on
that iteration. False means evaluation was conducted, but a previous
evaluation was better.
The info dict provides the following information:
epoch (int): How many passes over the data have been completed.
step (int): How many steps have been completed.
score (float): The main score from the last evaluation.
other_scores: : The other scores from the last evaluation.
losses: The accumulated losses throughout training.
checkpoints: A list of previous results, where each result is a
(score, step, epoch) tuple.
"""
if isinstance(dropout, float):
dropouts = constant(dropout)
else:
dropouts = dropout
results = []
losses = {}
words_seen = 0
start_time = timer()
for step, (epoch, batch) in enumerate(train_data):
dropout = next(dropouts)
for subbatch in subdivide_batch(batch, accumulate_gradient):
nlp.update(
subbatch, drop=dropout, losses=losses, sgd=False, exclude=exclude
)
# TODO: refactor this so we don't have to run it separately in here
for name, proc in nlp.pipeline:
if (
name not in exclude
and hasattr(proc, "model")
and proc.model not in (True, False, None)
):
proc.model.finish_update(optimizer)
optimizer.step_schedules()
if not (step % eval_frequency):
if optimizer.averages:
with nlp.use_params(optimizer.averages):
score, other_scores = evaluate()
else:
score, other_scores = evaluate()
results.append((score, step))
is_best_checkpoint = score == max(results)[0]
else:
score, other_scores = (None, None)
is_best_checkpoint = None
words_seen += sum(len(eg) for eg in batch)
info = {
"epoch": epoch,
"step": step,
"score": score,
"other_scores": other_scores,
"losses": losses,
"checkpoints": results,
"seconds": int(timer() - start_time),
"words": words_seen,
}
yield batch, info, is_best_checkpoint
if is_best_checkpoint is not None:
losses = {}
# Stop if no improvement in `patience` updates (if specified)
best_score, best_step = max(results)
if patience and (step - best_step) >= patience:
break
# Stop if we've exhausted our max steps (if specified)
if max_steps and step >= max_steps:
break
def subdivide_batch(batch, accumulate_gradient):
batch = list(batch)
batch.sort(key=lambda eg: len(eg.predicted))
sub_len = len(batch) // accumulate_gradient
start = 0
for i in range(accumulate_gradient):
subbatch = batch[start : start + sub_len]
if subbatch:
yield subbatch
start += len(subbatch)
subbatch = batch[start:]
if subbatch:
yield subbatch
def create_evaluation_callback(
nlp: Language, dev_corpus: Callable, weights: Dict[str, float]
) -> Callable[[], Tuple[float, Dict[str, float]]]:
weights = {key: value for key, value in weights.items() if value is not None}
def evaluate() -> Tuple[float, Dict[str, float]]:
dev_examples = list(dev_corpus(nlp))
scores = nlp.evaluate(dev_examples)
# Calculate a weighted sum based on score_weights for the main score.
# We can only consider scores that are ints/floats, not dicts like
# entity scores per type etc.
for key, value in scores.items():
if key in weights and not isinstance(value, (int, float)):
raise ValueError(Errors.E915.format(name=key, score_type=type(value)))
try:
weighted_score = sum(
scores.get(s, 0.0) * weights.get(s, 0.0) for s in weights
)
except KeyError as e:
keys = list(scores.keys())
err = Errors.E983.format(dict="score_weights", key=str(e), keys=keys)
raise KeyError(err) from None
return weighted_score, scores
return evaluate
def create_train_batches(
iterator: Iterator[Example],
batcher: Callable[[Iterable[Example]], Iterable[Example]],
max_epochs: int,
):
epoch = 0
examples = list(iterator)
if not examples:
# Raise error if no data
raise ValueError(Errors.E986)
while max_epochs < 1 or epoch != max_epochs:
random.shuffle(examples)
for batch in batcher(examples):
yield epoch, batch
epoch += 1
def update_meta(
training: Union[Dict[str, Any], Config], nlp: Language, info: Dict[str, Any]
) -> None:
nlp.meta["performance"] = {}
for metric in training["score_weights"]:
if metric is not None:
nlp.meta["performance"][metric] = info["other_scores"].get(metric, 0.0)
for pipe_name in nlp.pipe_names:
nlp.meta["performance"][f"{pipe_name}_loss"] = info["losses"][pipe_name]
def create_before_to_disk_callback(
callback: Optional[Callable[[Language], Language]]
) -> Callable[[Language], Language]:
def before_to_disk(nlp: Language) -> Language:
if not callback:
return nlp
modified_nlp = callback(nlp)
if not isinstance(modified_nlp, Language):
err = Errors.E914.format(name="before_to_disk", value=type(modified_nlp))
raise ValueError(err)
return modified_nlp
return before_to_disk

267
spacy/training/pretrain.py Normal file
View File

@ -0,0 +1,267 @@
from typing import Optional, Callable, Any, Iterable, Union, List
from thinc.api import Config, fix_random_seed, set_gpu_allocator, Model, Optimizer
from thinc.api import set_dropout_rate, to_categorical, CosineDistance, L2Distance
from pathlib import Path
from functools import partial
from collections import Counter
import srsly
import numpy
import time
import re
from wasabi import msg
from .example import Example
from ..tokens import Doc
from ..attrs import ID
from ..ml.models.multi_task import build_cloze_multi_task_model
from ..ml.models.multi_task import build_cloze_characters_multi_task_model
from ..schemas import ConfigSchemaTraining, ConfigSchemaPretrain
from ..util import registry, load_model_from_config, dot_to_object, logger
def pretrain(
config: Config,
output_dir: Path,
resume_path: Optional[Path] = None,
epoch_resume: Optional[int] = None,
use_gpu: int = -1,
logger: Callable[[Any], Any] = logger,
):
if config["training"]["seed"] is not None:
fix_random_seed(config["training"]["seed"])
allocator = config["training"]["gpu_allocator"]
if use_gpu >= 0 and allocator:
set_gpu_allocator(allocator)
nlp = load_model_from_config(config)
T = registry.resolve(nlp.config["training"], schema=ConfigSchemaTraining)
P = registry.resolve(nlp.config["pretraining"], schema=ConfigSchemaPretrain)
corpus = dot_to_object(T, P["corpus"])
batcher = P["batcher"]
model = create_pretraining_model(nlp, P)
optimizer = P["optimizer"]
# Load in pretrained weights to resume from
if resume_path is not None:
_resume_model(model, resume_path, epoch_resume)
else:
# Without '--resume-path' the '--epoch-resume' argument is ignored
epoch_resume = 0
# TODO: move this to logger function?
tracker = ProgressTracker(frequency=10000)
msg.divider(f"Pre-training tok2vec layer - starting at epoch {epoch_resume}")
row_settings = {"widths": (3, 10, 10, 6, 4), "aligns": ("r", "r", "r", "r", "r")}
msg.row(("#", "# Words", "Total Loss", "Loss", "w/s"), **row_settings)
def _save_model(epoch, is_temp=False):
is_temp_str = ".temp" if is_temp else ""
with model.use_params(optimizer.averages):
with (output_dir / f"model{epoch}{is_temp_str}.bin").open("wb") as file_:
file_.write(model.get_ref("tok2vec").to_bytes())
log = {
"nr_word": tracker.nr_word,
"loss": tracker.loss,
"epoch_loss": tracker.epoch_loss,
"epoch": epoch,
}
with (output_dir / "log.jsonl").open("a") as file_:
file_.write(srsly.json_dumps(log) + "\n")
objective = create_objective(P["objective"])
# TODO: I think we probably want this to look more like the
# 'create_train_batches' function?
for epoch in range(epoch_resume, P["max_epochs"]):
for batch_id, batch in enumerate(batcher(corpus(nlp))):
docs = ensure_docs(batch)
loss = make_update(model, docs, optimizer, objective)
progress = tracker.update(epoch, loss, docs)
if progress:
msg.row(progress, **row_settings)
if P["n_save_every"] and (batch_id % P["n_save_every"] == 0):
_save_model(epoch, is_temp=True)
_save_model(epoch)
tracker.epoch_loss = 0.0
def ensure_docs(examples_or_docs: Iterable[Union[Doc, Example]]) -> List[Doc]:
docs = []
for eg_or_doc in examples_or_docs:
if isinstance(eg_or_doc, Doc):
docs.append(eg_or_doc)
else:
docs.append(eg_or_doc.reference)
return docs
def _resume_model(
model: Model,
resume_path: Path,
epoch_resume: int,
logger: Callable[[Any], Any] = logger,
) -> None:
logger.info(f"Resume training tok2vec from: {resume_path}")
with resume_path.open("rb") as file_:
weights_data = file_.read()
model.get_ref("tok2vec").from_bytes(weights_data)
# Parse the epoch number from the given weight file
model_name = re.search(r"model\d+\.bin", str(resume_path))
if model_name:
# Default weight file name so read epoch_start from it by cutting off 'model' and '.bin'
epoch_resume = int(model_name.group(0)[5:][:-4]) + 1
logger.info(f"Resuming from epoch: {epoch_resume}")
else:
logger.info(f"Resuming from epoch: {epoch_resume}")
def make_update(
model: Model, docs: Iterable[Doc], optimizer: Optimizer, objective_func: Callable
) -> float:
"""Perform an update over a single batch of documents.
docs (iterable): A batch of `Doc` objects.
optimizer (callable): An optimizer.
RETURNS loss: A float for the loss.
"""
predictions, backprop = model.begin_update(docs)
loss, gradients = objective_func(model.ops, docs, predictions)
backprop(gradients)
model.finish_update(optimizer)
# Don't want to return a cupy object here
# The gradients are modified in-place by the BERT MLM,
# so we get an accurate loss
return float(loss)
def create_objective(config: Config):
"""Create the objective for pretraining.
We'd like to replace this with a registry function but it's tricky because
we're also making a model choice based on this. For now we hard-code support
for two types (characters, vectors). For characters you can specify
n_characters, for vectors you can specify the loss.
Bleh.
"""
objective_type = config["type"]
if objective_type == "characters":
return partial(get_characters_loss, nr_char=config["n_characters"])
elif objective_type == "vectors":
if config["loss"] == "cosine":
distance = CosineDistance(normalize=True, ignore_zeros=True)
return partial(get_vectors_loss, distance=distance)
elif config["loss"] == "L2":
distance = L2Distance(normalize=True, ignore_zeros=True)
return partial(get_vectors_loss, distance=distance)
else:
raise ValueError("Unexpected loss type", config["loss"])
else:
raise ValueError("Unexpected objective_type", objective_type)
def get_vectors_loss(ops, docs, prediction, distance):
"""Compute a loss based on a distance between the documents' vectors and
the prediction.
"""
# The simplest way to implement this would be to vstack the
# token.vector values, but that's a bit inefficient, especially on GPU.
# Instead we fetch the index into the vectors table for each of our tokens,
# and look them up all at once. This prevents data copying.
ids = ops.flatten([doc.to_array(ID).ravel() for doc in docs])
target = docs[0].vocab.vectors.data[ids]
d_target, loss = distance(prediction, target)
return loss, d_target
def get_characters_loss(ops, docs, prediction, nr_char):
"""Compute a loss based on a number of characters predicted from the docs."""
target_ids = numpy.vstack([doc.to_utf8_array(nr_char=nr_char) for doc in docs])
target_ids = target_ids.reshape((-1,))
target = ops.asarray(to_categorical(target_ids, n_classes=256), dtype="f")
target = target.reshape((-1, 256 * nr_char))
diff = prediction - target
loss = (diff ** 2).sum()
d_target = diff / float(prediction.shape[0])
return loss, d_target
def create_pretraining_model(nlp, pretrain_config):
"""Define a network for the pretraining. We simply add an output layer onto
the tok2vec input model. The tok2vec input model needs to be a model that
takes a batch of Doc objects (as a list), and returns a list of arrays.
Each array in the output needs to have one row per token in the doc.
The actual tok2vec layer is stored as a reference, and only this bit will be
serialized to file and read back in when calling the 'train' command.
"""
component = nlp.get_pipe(pretrain_config["component"])
if pretrain_config.get("layer"):
tok2vec = component.model.get_ref(pretrain_config["layer"])
else:
tok2vec = component.model
# TODO
maxout_pieces = 3
hidden_size = 300
if pretrain_config["objective"]["type"] == "vectors":
model = build_cloze_multi_task_model(
nlp.vocab, tok2vec, hidden_size=hidden_size, maxout_pieces=maxout_pieces
)
elif pretrain_config["objective"]["type"] == "characters":
model = build_cloze_characters_multi_task_model(
nlp.vocab,
tok2vec,
hidden_size=hidden_size,
maxout_pieces=maxout_pieces,
nr_char=pretrain_config["objective"]["n_characters"],
)
model.initialize(X=[nlp.make_doc("Give it a doc to infer shapes")])
set_dropout_rate(model, pretrain_config["dropout"])
return model
class ProgressTracker:
def __init__(self, frequency=1000000):
self.loss = 0.0
self.prev_loss = 0.0
self.nr_word = 0
self.words_per_epoch = Counter()
self.frequency = frequency
self.last_time = time.time()
self.last_update = 0
self.epoch_loss = 0.0
def update(self, epoch, loss, docs):
self.loss += loss
self.epoch_loss += loss
words_in_batch = sum(len(doc) for doc in docs)
self.words_per_epoch[epoch] += words_in_batch
self.nr_word += words_in_batch
words_since_update = self.nr_word - self.last_update
if words_since_update >= self.frequency:
wps = words_since_update / (time.time() - self.last_time)
self.last_update = self.nr_word
self.last_time = time.time()
loss_per_word = self.loss - self.prev_loss
status = (
epoch,
self.nr_word,
_smart_round(self.loss, width=10),
_smart_round(loss_per_word, width=6),
int(wps),
)
self.prev_loss = float(self.loss)
return status
else:
return None
def _smart_round(
figure: Union[float, int], width: int = 10, max_decimal: int = 4
) -> str:
"""Round large numbers as integers, smaller numbers as decimals."""
n_digits = len(str(int(figure)))
n_decimal = width - (n_digits + 1)
if n_decimal <= 1:
return str(int(figure))
else:
n_decimal = min(n_decimal, max_decimal)
format_str = "%." + str(n_decimal) + "f"
return format_str % figure

View File

@ -8,6 +8,7 @@ import re
from pathlib import Path
import thinc
from thinc.api import NumpyOps, get_current_ops, Adam, Config, Optimizer
from thinc.api import ConfigValidationError
import functools
import itertools
import numpy.random
@ -56,6 +57,7 @@ if TYPE_CHECKING:
OOV_RANK = numpy.iinfo(numpy.uint64).max
DEFAULT_OOV_PROB = -20
LEXEME_NORM_LANGS = ["da", "de", "el", "en", "id", "lb", "pt", "ru", "sr", "ta", "th"]
# Default order of sections in the config.cfg. Not all sections needs to exist,
@ -239,20 +241,6 @@ def get_module_path(module: ModuleType) -> Path:
return Path(sys.modules[module.__module__].__file__).parent
def load_vectors_into_model(
nlp: "Language", name: Union[str, Path], *, add_strings=True
) -> None:
"""Load word vectors from an installed model or path into a model instance."""
vectors_nlp = load_model(name)
nlp.vocab.vectors = vectors_nlp.vocab.vectors
if add_strings:
# I guess we should add the strings from the vectors_nlp model?
# E.g. if someone does a similarity query, they might expect the strings.
for key in nlp.vocab.vectors.key2row:
if key in vectors_nlp.vocab.strings:
nlp.vocab.strings.add(vectors_nlp.vocab.strings[key])
def load_model(
name: Union[str, Path],
*,
@ -391,32 +379,9 @@ def load_model_from_config(
return nlp
def resolve_training_config(
config: Config,
exclude: Iterable[str] = ("nlp", "components"),
validate: bool = True,
) -> Dict[str, Any]:
"""Resolve the config sections relevant for trainig and create all objects.
Mostly used in the CLI to separate training config (not resolved by default
because not runtime-relevant an nlp object should load fine even if it's
[training] block refers to functions that are not available etc.).
config (Config): The config to resolve.
exclude (Iterable[str]): The config blocks to exclude. Those blocks won't
be available in the final resolved config.
validate (bool): Whether to validate the config.
RETURNS (Dict[str, Any]): The resolved config.
"""
config = config.copy()
for key in exclude:
if key in config:
config.pop(key)
return registry.resolve(config, validate=validate)
def resolve_dot_names(
config: Config, dot_names: List[Optional[str]]
) -> List[Optional[Callable]]:
) -> Tuple[Any]:
"""Resolve one or more "dot notation" names, e.g. corpora.train.
The paths could point anywhere into the config, so we don't know which
top-level section we'll be looking within.
@ -424,18 +389,42 @@ def resolve_dot_names(
We resolve the whole top-level section, although we could resolve less --
we could find the lowest part of the tree.
"""
# TODO: include schema?
# TODO: clean this up and avoid duplication
resolved = {}
output = []
errors = []
for name in dot_names:
if name is None:
output.append(name)
else:
section = name.split(".")[0]
# We want to avoid resolving the same thing twice.
# We want to avoid resolving the same thing twice
if section not in resolved:
resolved[section] = registry.resolve(config[section])
try:
output.append(dot_to_object(resolved, name))
return output
except KeyError:
msg = f"not a valid section reference: {name}"
errors.append({"loc": name.split("."), "msg": msg})
objects = []
for ref in output:
if not isinstance(ref, str):
msg = f"not a valid section reference: {ref} ({type(ref)})"
errors.append({"loc": ref.split("."), "msg": msg})
continue
section = ref.split(".")[0]
# We want to avoid resolving the same thing twice
if section not in resolved:
resolved[section] = registry.resolve(config[section])
try:
objects.append(dot_to_object(resolved, ref))
except KeyError:
msg = f"not a valid section reference: {name}"
errors.append({"loc": ref.split("."), "msg": msg})
if errors:
raise ConfigValidationError(config=config, errors=errors)
return tuple(objects)
def load_model_from_init_py(