mirror of https://github.com/explosion/spaCy.git (synced 2025-01-11 17:56:30 +03:00)

Refactor CLI

commit 822ea4ef61 (parent a89e0ff7cb)
@@ -15,8 +15,7 @@ from .debug_config import debug_config  # noqa: F401
 from .debug_model import debug_model  # noqa: F401
 from .evaluate import evaluate  # noqa: F401
 from .convert import convert  # noqa: F401
-#from .init_model import init_model  # noqa: F401
-from .init_pipeline import init_pipeline  # noqa: F401
+from .init_pipeline import init_pipeline_cli  # noqa: F401
 from .init_config import init_config, fill_config  # noqa: F401
 from .validate import validate  # noqa: F401
 from .project.clone import project_clone  # noqa: F401
@@ -10,13 +10,12 @@ from click import NoSuchOption
 from click.parser import split_arg_string
 from typer.main import get_command
 from contextlib import contextmanager
-from thinc.api import Config, ConfigValidationError
+from thinc.api import Config, ConfigValidationError, require_gpu
 from configparser import InterpolationError
 import os

 from ..schemas import ProjectConfigSchema, validate
 from ..util import import_file, run_command, make_tempdir, registry, logger
-from ..util import ensure_path

 if TYPE_CHECKING:
     from pathy import Pathy  # noqa: F401
@@ -276,18 +275,6 @@ def import_code(code_path: Optional[Union[Path, str]]) -> None:
         msg.fail(f"Couldn't load Python code: {code_path}", e, exits=1)


-def get_sourced_components(config: Union[Dict[str, Any], Config]) -> List[str]:
-    """RETURNS (List[str]): All sourced components in the original config,
-    e.g. {"source": "en_core_web_sm"}. If the config contains a key
-    "factory", we assume it refers to a component factory.
-    """
-    return [
-        name
-        for name, cfg in config.get("components", {}).items()
-        if "factory" not in cfg and "source" in cfg
-    ]
-
-
 def upload_file(src: Path, dest: Union[str, "Pathy"]) -> None:
     """Upload a file.
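The helper moves verbatim into the new spacy/training/initialize.py (last hunk below). As a quick illustration of what it computes (component names here are made up):

```python
# A toy config: "ner" is sourced from a packaged pipeline, while "tagger"
# is built from a factory, so only "ner" counts as a sourced component.
config = {
    "components": {
        "ner": {"source": "en_core_web_sm"},
        "tagger": {"factory": "tagger"},
    }
}
sourced = [
    name
    for name, cfg in config.get("components", {}).items()
    if "factory" not in cfg and "source" in cfg
]
assert sourced == ["ner"]
```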
@@ -459,3 +446,23 @@ def string_to_list(value: str, intify: bool = False) -> Union[List[str], List[int]]:
             p = int(p)
         result.append(p)
     return result
+
+
+class CliLogger:
+    """Helper mocking up the most commonly used logger methods. Can be passed
+    into functions like train() to make them output pretty-printed messages
+    on the CLI and regular logging if used from within Python.
+    """
+
+    debug = msg.text
+    info = msg.info
+    warn = msg.warn
+    error = msg.fail
+
+
+def setup_gpu(use_gpu: int):
+    if use_gpu >= 0:
+        msg.info(f"Using GPU: {use_gpu}")
+        require_gpu(use_gpu)
+    else:
+        msg.info("Using CPU")
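CliLogger exists so that functions like train() can report progress through an injected logger-like object: the CLI passes this shim (backed by wasabi's msg) for pretty-printed output, while library callers pass a regular logger. A minimal sketch of the duck-typing involved; train_loop is a hypothetical consumer:

```python
import logging

def train_loop(logger=logging.getLogger(__name__)):
    # Only logger.debug/info/warn/error are called, so any object with
    # those four attributes works -- a stdlib logger or CliLogger alike.
    logger.info("Starting training")

train_loop()                    # plain logging when used from Python
# train_loop(logger=CliLogger)  # pretty CLI output when run as a command
```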
@@ -1,7 +1,7 @@
 from typing import Optional, Dict, Any, Union, List
 from pathlib import Path
 from wasabi import msg, table
-from thinc.api import Config, ConfigValidationError
+from thinc.api import Config
 from thinc.config import VARIABLE_RE
 import typer

@@ -52,10 +52,8 @@ def debug_config(
     with show_validation_error(config_path):
         config = util.load_config(config_path, overrides=overrides)
         nlp = util.load_model_from_config(config)
-        # Use the resolved config here in case user has one function returning
-        # a dict of corpora etc.
-        resolved = util.resolve_training_config(nlp.config)
-        check_section_refs(resolved, ["training.dev_corpus", "training.train_corpus"])
+        dot_names = ["training.dev_corpus", "training.train_corpus"]
+        util.resolve_dot_names(nlp.config, dot_names)
     msg.good("Config is valid")
     if show_vars:
         variables = get_variables(config)
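resolve_dot_names subsumes the check_section_refs helper removed in the next hunk: it both validates that each dot-path exists and instantiates whatever registered function it points to. A sketch of the new call, mirroring the test added further down:

```python
from spacy.util import resolve_dot_names

config = {
    "training": {"optimizer": {"@optimizers": "Adam.v1"}},
    "foo": {"bar": "training.optimizer"},
}
# "foo.bar" holds the dot-path string "training.optimizer"; resolve_dot_names
# follows it and instantiates that section via the registry (here: an Adam
# optimizer). A dangling reference raises ConfigValidationError instead.
(optimizer,) = resolve_dot_names(config, ["foo.bar"])
```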
@@ -97,23 +95,3 @@ def get_variables(config: Config) -> Dict[str, Any]:
         value = util.dot_to_object(config, path)
         result[variable] = repr(value)
     return result
-
-
-def check_section_refs(config: Config, fields: List[str]) -> None:
-    """Validate fields in the config that refer to other sections or values
-    (e.g. in the corpora) and make sure that those references exist.
-    """
-    errors = []
-    for field in fields:
-        # If the field doesn't exist in the config, we ignore it
-        try:
-            value = util.dot_to_object(config, field)
-        except KeyError:
-            continue
-        try:
-            util.dot_to_object(config, value)
-        except KeyError:
-            msg = f"not a valid section reference: {value}"
-            errors.append({"loc": field.split("."), "msg": msg})
-    if errors:
-        raise ConfigValidationError(config=config, errors=errors)
@@ -7,10 +7,13 @@ from wasabi import Printer, MESSAGES, msg
 import typer

 from ._util import app, Arg, Opt, show_validation_error, parse_config_overrides
-from ._util import import_code, debug_cli, get_sourced_components
+from ._util import import_code, debug_cli
 from ..training import Corpus, Example
+from ..training.initialize import get_sourced_components
+from ..schemas import ConfigSchemaTraining
 from ..pipeline._parser_internals import nonproj
 from ..language import Language
+from ..util import registry
 from .. import util

@@ -94,26 +97,13 @@ def debug_data(
     with show_validation_error(config_path):
         cfg = util.load_config(config_path, overrides=config_overrides)
         nlp = util.load_model_from_config(cfg)
-        C = util.resolve_training_config(nlp.config)
+        T = registry.resolve(nlp.config["training"], schema=ConfigSchemaTraining)
     # Use original config here, not resolved version
     sourced_components = get_sourced_components(cfg)
-    frozen_components = C["training"]["frozen_components"]
+    frozen_components = T["frozen_components"]
     resume_components = [p for p in sourced_components if p not in frozen_components]
     pipeline = nlp.pipe_names
     factory_names = [nlp.get_pipe_meta(pipe).factory for pipe in nlp.pipe_names]
-    tag_map_path = util.ensure_path(C["training"]["tag_map"])
-    tag_map = {}
-    if tag_map_path is not None:
-        tag_map = srsly.read_json(tag_map_path)
-    morph_rules_path = util.ensure_path(C["training"]["morph_rules"])
-    morph_rules = {}
-    if morph_rules_path is not None:
-        morph_rules = srsly.read_json(morph_rules_path)
-    # Replace tag map with provided mapping
-    nlp.vocab.morphology.load_tag_map(tag_map)
-    # Load morph rules
-    nlp.vocab.morphology.load_morph_exceptions(morph_rules)
-
     msg.divider("Data file validation")

     # Create the gold corpus to be able to better analyze data
@@ -145,10 +135,10 @@ def debug_data(

     train_texts = gold_train_data["texts"]
     dev_texts = gold_dev_data["texts"]
-    frozen_components = C["training"]["frozen_components"]
+    frozen_components = T["frozen_components"]

     msg.divider("Training stats")
-    msg.text(f"Language: {C['nlp']['lang']}")
+    msg.text(f"Language: {nlp.lang}")
     msg.text(f"Training pipeline: {', '.join(pipeline)}")
     if resume_components:
         msg.text(f"Components from other pipelines: {', '.join(resume_components)}")
@@ -355,6 +345,7 @@ def debug_data(
     if "tagger" in factory_names:
         msg.divider("Part-of-speech Tagging")
         labels = [label for label in gold_train_data["tags"]]
+        # TODO: does this need to be updated?
         tag_map = nlp.vocab.morphology.tag_map
         msg.info(f"{len(labels)} label(s) in data ({len(tag_map)} label(s) in tag map)")
         labels_with_counts = _format_labels(
@@ -4,12 +4,14 @@ from pathlib import Path
 from spacy.training import Example
 from spacy.util import dot_to_object
 from wasabi import msg
-from thinc.api import require_gpu, fix_random_seed, set_dropout_rate, Adam
+from thinc.api import fix_random_seed, set_dropout_rate, Adam
 from thinc.api import Model, data_validation, set_gpu_allocator
 import typer

 from ._util import Arg, Opt, debug_cli, show_validation_error
-from ._util import parse_config_overrides, string_to_list
+from ._util import parse_config_overrides, string_to_list, setup_gpu
+from ..schemas import ConfigSchemaTraining
+from ..util import registry
 from .. import util

@@ -37,11 +39,7 @@ def debug_model_cli(

     DOCS: https://nightly.spacy.io/api/cli#debug-model
     """
-    if use_gpu >= 0:
-        msg.info("Using GPU")
-        require_gpu(use_gpu)
-    else:
-        msg.info("Using CPU")
+    setup_gpu(use_gpu)
     layers = string_to_list(layers, intify=True)
     print_settings = {
         "dimensions": dimensions,
@@ -65,8 +63,8 @@ def debug_model_cli(
         set_gpu_allocator(allocator)
     with show_validation_error(config_path):
         nlp = util.load_model_from_config(raw_config)
-    C = util.resolve_training_config(nlp.config)
-    seed = C["training"]["seed"]
+    T = registry.resolve(nlp.config["training"], schema=ConfigSchemaTraining)
+    seed = T["seed"]
     if seed is not None:
         msg.info(f"Fixing random seed: {seed}")
         fix_random_seed(seed)
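This is the pattern that replaces util.resolve_training_config throughout the commit: rather than resolving the whole config, each command resolves just the section it needs against its schema. A self-contained sketch (a blank pipeline's config already carries the default [training] block):

```python
import spacy
from spacy.schemas import ConfigSchemaTraining
from spacy.util import registry

nlp = spacy.blank("en")
# Resolve only the [training] section; T is a plain dict of validated,
# instantiated values, so T["seed"], T["optimizer"] etc. are ready to use.
T = registry.resolve(nlp.config["training"], schema=ConfigSchemaTraining)
print(T["seed"], type(T["optimizer"]))
```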
@@ -77,7 +75,7 @@ def debug_model_cli(
             exits=1,
         )
     model = pipe.model
-    debug_model(C, nlp, model, print_settings=print_settings)
+    debug_model(T, nlp, model, print_settings=print_settings)


 def debug_model(
@@ -3,11 +3,11 @@ from wasabi import Printer
 from pathlib import Path
 import re
 import srsly
-from thinc.api import require_gpu, fix_random_seed
+from thinc.api import fix_random_seed

 from ..training import Corpus
 from ..tokens import Doc
-from ._util import app, Arg, Opt
+from ._util import app, Arg, Opt, setup_gpu
 from ..scorer import Scorer
 from .. import util
 from .. import displacy
@@ -61,8 +61,7 @@ def evaluate(
 ) -> Scorer:
     msg = Printer(no_print=silent, pretty=not silent)
     fix_random_seed()
-    if use_gpu >= 0:
-        require_gpu(use_gpu)
+    setup_gpu(use_gpu)
     data_path = util.ensure_path(data_path)
     output_path = util.ensure_path(output)
     displacy_path = util.ensure_path(displacy_path)
@@ -1,22 +1,13 @@
-from typing import Optional, Dict, Callable, Any
+from typing import Optional
 import logging
 from pathlib import Path
 from wasabi import msg
 import typer
-from thinc.api import Config, fix_random_seed, set_gpu_allocator
-import srsly

 from .. import util
-from ..util import registry, resolve_dot_names, OOV_RANK
-from ..schemas import ConfigSchemaTraining, ConfigSchemaPretrain, ConfigSchemaInit
-from ..language import Language
-from ..lookups import Lookups
-from ..errors import Errors
+from ..training.initialize import init_nlp
 from ._util import init_cli, Arg, Opt, parse_config_overrides, show_validation_error
-from ._util import import_code, get_sourced_components
-
-
-DEFAULT_OOV_PROB = -20
+from ._util import import_code, CliLogger, setup_gpu


 @init_cli.command(
@@ -31,178 +22,16 @@ def init_pipeline_cli(
     output_path: Path = Arg(..., help="Output directory for the prepared data"),
     code_path: Optional[Path] = Opt(None, "--code", "-c", help="Path to Python file with additional code (registered functions) to be imported"),
     verbose: bool = Opt(False, "--verbose", "-V", "-VV", help="Display more information for debugging purposes"),
     use_gpu: int = Opt(-1, "--gpu-id", "-g", help="GPU ID or -1 for CPU")
     # fmt: on
 ):
     util.logger.setLevel(logging.DEBUG if verbose else logging.ERROR)
     overrides = parse_config_overrides(ctx.args)
     import_code(code_path)
+    setup_gpu(use_gpu)
     with show_validation_error(config_path):
         config = util.load_config(config_path, overrides=overrides)
-    nlp = init_pipeline(config)
+    with show_validation_error(hint_fill=False):
+        nlp = init_nlp(config, use_gpu=use_gpu, logger=CliLogger, on_success=msg.good)
     nlp.to_disk(output_path)
     msg.good(f"Saved initialized pipeline to {output_path}")
-
-
-def init_pipeline(config: Config, use_gpu: int = -1) -> Language:
-    raw_config = config
-    config = raw_config.interpolate()
-    if config["training"]["seed"] is not None:
-        fix_random_seed(config["training"]["seed"])
-    allocator = config["training"]["gpu_allocator"]
-    if use_gpu >= 0 and allocator:
-        set_gpu_allocator(allocator)
-    # Use original config here before it's resolved to functions
-    sourced_components = get_sourced_components(config)
-    with show_validation_error():
-        nlp = util.load_model_from_config(raw_config, auto_fill=True)
-    msg.good("Set up nlp object from config")
-    config = nlp.config.interpolate()
-    # Resolve all training-relevant sections using the filled nlp config
-    T = registry.resolve(config["training"], schema=ConfigSchemaTraining)
-    dot_names = [T["train_corpus"], T["dev_corpus"]]
-    train_corpus, dev_corpus = resolve_dot_names(config, dot_names)
-    I = registry.resolve(config["initialize"], schema=ConfigSchemaInit)
-    V = I["vocab"]
-    init_vocab(nlp, data=V["data"], lookups=V["lookups"], vectors=V["vectors"])
-    optimizer = T["optimizer"]
-    before_to_disk = create_before_to_disk_callback(T["before_to_disk"])
-    # Components that shouldn't be updated during training
-    frozen_components = T["frozen_components"]
-    # Sourced components that require resume_training
-    resume_components = [p for p in sourced_components if p not in frozen_components]
-    msg.info(f"Pipeline: {nlp.pipe_names}")
-    if resume_components:
-        with nlp.select_pipes(enable=resume_components):
-            msg.info(f"Resuming training for: {resume_components}")
-            nlp.resume_training(sgd=optimizer)
-    with nlp.select_pipes(disable=[*frozen_components, *resume_components]):
-        nlp.begin_training(lambda: train_corpus(nlp), sgd=optimizer)
-    msg.good(f"Initialized pipeline components")
-    # Verify the config after calling 'begin_training' to ensure labels
-    # are properly initialized
-    verify_config(nlp)
-    if "pretraining" in config and config["pretraining"]:
-        P = registry.resolve(config["pretraining"], schema=ConfigSchemaPretrain)
-        add_tok2vec_weights(nlp, P, I)
-    # TODO: this should be handled better?
-    nlp = before_to_disk(nlp)
-    return nlp
-
-
-def init_vocab(
-    nlp: Language,
-    *,
-    data: Optional[Path] = None,
-    lookups: Optional[Lookups] = None,
-    vectors: Optional[str] = None,
-) -> Language:
-    if lookups:
-        nlp.vocab.lookups = lookups
-        msg.good(f"Added vocab lookups: {', '.join(lookups.tables)}")
-    data_path = util.ensure_path(data)
-    if data_path is not None:
-        lex_attrs = srsly.read_jsonl(data_path)
-        for lexeme in nlp.vocab:
-            lexeme.rank = OOV_RANK
-        for attrs in lex_attrs:
-            if "settings" in attrs:
-                continue
-            lexeme = nlp.vocab[attrs["orth"]]
-            lexeme.set_attrs(**attrs)
-        if len(nlp.vocab):
-            oov_prob = min(lex.prob for lex in nlp.vocab) - 1
-        else:
-            oov_prob = DEFAULT_OOV_PROB
-        nlp.vocab.cfg.update({"oov_prob": oov_prob})
-        msg.good(f"Added {len(nlp.vocab)} lexical entries to the vocab")
-    msg.good("Created vocabulary")
-    if vectors is not None:
-        add_vectors(nlp, vectors)
-        msg.good(f"Added vectors: {vectors}")
-
-
-def add_tok2vec_weights(
-    nlp: Language, pretrain_config: Dict[str, Any], vocab_config: Dict[str, Any]
-) -> None:
-    # Load pretrained tok2vec weights - cf. CLI command 'pretrain'
-    P = pretrain_config
-    V = vocab_config
-    weights_data = None
-    init_tok2vec = util.ensure_path(V["init_tok2vec"])
-    if init_tok2vec is not None:
-        if P["objective"].get("type") == "vectors" and not V["vectors"]:
-            err = "Need initialize.vectors if pretraining.objective.type is vectors"
-            msg.fail(err, exits=1)
-        if not init_tok2vec.exists():
-            msg.fail("Can't find pretrained tok2vec", init_tok2vec, exits=1)
-        with init_tok2vec.open("rb") as file_:
-            weights_data = file_.read()
-    if weights_data is not None:
-        tok2vec_component = P["component"]
-        if tok2vec_component is None:
-            msg.fail(
-                f"To use pretrained tok2vec weights, [pretraining.component] "
-                f"needs to specify the component that should load them.",
-                exits=1,
-            )
-        layer = nlp.get_pipe(tok2vec_component).model
-        if P["layer"]:
-            layer = layer.get_ref(P["layer"])
-        layer.from_bytes(weights_data)
-        msg.good(f"Loaded pretrained weights into component '{tok2vec_component}'")
-
-
-def add_vectors(nlp: Language, vectors: str) -> None:
-    title = f"Config validation error for vectors {vectors}"
-    desc = (
-        "This typically means that there's a problem in the config.cfg included "
-        "with the packaged vectors. Make sure that the vectors package you're "
-        "loading is compatible with the current version of spaCy."
-    )
-    with show_validation_error(
-        title=title, desc=desc, hint_fill=False, show_config=False
-    ):
-        util.load_vectors_into_model(nlp, vectors)
-    msg(f"Added {len(nlp.vocab.vectors)} vectors from {vectors}")
-
-
-def verify_config(nlp: Language) -> None:
-    """Perform additional checks based on the config, loaded nlp object and training data."""
-    # TODO: maybe we should validate based on the actual components, the list
-    # in config["nlp"]["pipeline"] instead?
-    for pipe_config in nlp.config["components"].values():
-        # We can't assume that the component name == the factory
-        factory = pipe_config["factory"]
-        if factory == "textcat":
-            verify_textcat_config(nlp, pipe_config)
-
-
-def verify_textcat_config(nlp: Language, pipe_config: Dict[str, Any]) -> None:
-    # if 'positive_label' is provided: double check whether it's in the data and
-    # the task is binary
-    if pipe_config.get("positive_label"):
-        textcat_labels = nlp.get_pipe("textcat").labels
-        pos_label = pipe_config.get("positive_label")
-        if pos_label not in textcat_labels:
-            raise ValueError(
-                Errors.E920.format(pos_label=pos_label, labels=textcat_labels)
-            )
-        if len(list(textcat_labels)) != 2:
-            raise ValueError(
-                Errors.E919.format(pos_label=pos_label, labels=textcat_labels)
-            )
-
-
-def create_before_to_disk_callback(
-    callback: Optional[Callable[[Language], Language]]
-) -> Callable[[Language], Language]:
-    def before_to_disk(nlp: Language) -> Language:
-        if not callback:
-            return nlp
-        modified_nlp = callback(nlp)
-        if not isinstance(modified_nlp, Language):
-            err = Errors.E914.format(name="before_to_disk", value=type(modified_nlp))
-            raise ValueError(err)
-        return modified_nlp
-
-    return before_to_disk
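With the body moved to spacy/training/initialize.py, initialization is now callable from Python without going through the CLI; a rough sketch against the API introduced in this commit (paths are illustrative):

```python
from spacy.training.initialize import init_nlp
from spacy.util import load_config

config = load_config("config.cfg", interpolate=False)
# logger and on_success default to the regular spaCy logger and a no-op,
# so the same code path serves the CLI (CliLogger, msg.good) and scripts.
nlp = init_nlp(config, use_gpu=-1)
nlp.to_disk("output/model-initial")
```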
@@ -1,25 +1,13 @@
 from typing import Optional
-import numpy
-import time
-import re
-from collections import Counter
 from pathlib import Path
-from thinc.api import require_gpu, set_gpu_allocator
-from thinc.api import set_dropout_rate, to_categorical, fix_random_seed
-from thinc.api import Config, CosineDistance, L2Distance
 from wasabi import msg
-import srsly
-from functools import partial
 import typer
+import re

 from ._util import app, Arg, Opt, parse_config_overrides, show_validation_error
-from ._util import import_code
-from ..ml.models.multi_task import build_cloze_multi_task_model
-from ..ml.models.multi_task import build_cloze_characters_multi_task_model
-from ..tokens import Doc
-from ..attrs import ID
-from .. import util
-from ..util import dot_to_object
+from ._util import import_code, setup_gpu, CliLogger
+from ..training.pretrain import pretrain
+from ..util import load_config


 @app.command(
@@ -61,15 +49,11 @@ def pretrain_cli(
     config_overrides = parse_config_overrides(ctx.args)
     import_code(code_path)
     verify_cli_args(config_path, output_dir, resume_path, epoch_resume)
-    if use_gpu >= 0:
-        msg.info("Using GPU")
-        require_gpu(use_gpu)
-    else:
-        msg.info("Using CPU")
+    setup_gpu(use_gpu)
     msg.info(f"Loading config from: {config_path}")

     with show_validation_error(config_path):
-        raw_config = util.load_config(
+        raw_config = load_config(
             config_path, overrides=config_overrides, interpolate=False
         )
     config = raw_config.interpolate()
@@ -89,250 +73,11 @@ def pretrain_cli(
         resume_path=resume_path,
         epoch_resume=epoch_resume,
         use_gpu=use_gpu,
+        logger=CliLogger,
     )


-def pretrain(
-    config: Config,
-    output_dir: Path,
-    resume_path: Optional[Path] = None,
-    epoch_resume: Optional[int] = None,
-    use_gpu: int = -1,
-):
-    if config["training"]["seed"] is not None:
-        fix_random_seed(config["training"]["seed"])
-    allocator = config["training"]["gpu_allocator"]
-    if use_gpu >= 0 and allocator:
-        set_gpu_allocator(allocator)
-    nlp = util.load_model_from_config(config)
-    C = util.resolve_training_config(nlp.config)
-    P_cfg = C["pretraining"]
-    corpus = dot_to_object(C, P_cfg["corpus"])
-    batcher = P_cfg["batcher"]
-    model = create_pretraining_model(nlp, C["pretraining"])
-    optimizer = C["pretraining"]["optimizer"]
-    # Load in pretrained weights to resume from
-    if resume_path is not None:
-        _resume_model(model, resume_path, epoch_resume)
-    else:
-        # Without '--resume-path' the '--epoch-resume' argument is ignored
-        epoch_resume = 0
-
-    tracker = ProgressTracker(frequency=10000)
-    msg.divider(f"Pre-training tok2vec layer - starting at epoch {epoch_resume}")
-    row_settings = {"widths": (3, 10, 10, 6, 4), "aligns": ("r", "r", "r", "r", "r")}
-    msg.row(("#", "# Words", "Total Loss", "Loss", "w/s"), **row_settings)
-
-    def _save_model(epoch, is_temp=False):
-        is_temp_str = ".temp" if is_temp else ""
-        with model.use_params(optimizer.averages):
-            with (output_dir / f"model{epoch}{is_temp_str}.bin").open("wb") as file_:
-                file_.write(model.get_ref("tok2vec").to_bytes())
-            log = {
-                "nr_word": tracker.nr_word,
-                "loss": tracker.loss,
-                "epoch_loss": tracker.epoch_loss,
-                "epoch": epoch,
-            }
-            with (output_dir / "log.jsonl").open("a") as file_:
-                file_.write(srsly.json_dumps(log) + "\n")
-
-    objective = create_objective(P_cfg["objective"])
-    # TODO: I think we probably want this to look more like the
-    # 'create_train_batches' function?
-    for epoch in range(epoch_resume, P_cfg["max_epochs"]):
-        for batch_id, batch in enumerate(batcher(corpus(nlp))):
-            docs = ensure_docs(batch)
-            loss = make_update(model, docs, optimizer, objective)
-            progress = tracker.update(epoch, loss, docs)
-            if progress:
-                msg.row(progress, **row_settings)
-            if P_cfg["n_save_every"] and (batch_id % P_cfg["n_save_every"] == 0):
-                _save_model(epoch, is_temp=True)
-        _save_model(epoch)
-        tracker.epoch_loss = 0.0
-    msg.good("Successfully finished pretrain")
-
-
-def ensure_docs(examples_or_docs):
-    docs = []
-    for eg_or_doc in examples_or_docs:
-        if isinstance(eg_or_doc, Doc):
-            docs.append(eg_or_doc)
-        else:
-            docs.append(eg_or_doc.reference)
-    return docs
-
-
-def _resume_model(model, resume_path, epoch_resume):
-    msg.info(f"Resume training tok2vec from: {resume_path}")
-    with resume_path.open("rb") as file_:
-        weights_data = file_.read()
-        model.get_ref("tok2vec").from_bytes(weights_data)
-    # Parse the epoch number from the given weight file
-    model_name = re.search(r"model\d+\.bin", str(resume_path))
-    if model_name:
-        # Default weight file name so read epoch_start from it by cutting off 'model' and '.bin'
-        epoch_resume = int(model_name.group(0)[5:][:-4]) + 1
-        msg.info(f"Resuming from epoch: {epoch_resume}")
-    else:
-        msg.info(f"Resuming from epoch: {epoch_resume}")
-
-
-def make_update(model, docs, optimizer, objective_func):
-    """Perform an update over a single batch of documents.
-
-    docs (iterable): A batch of `Doc` objects.
-    optimizer (callable): An optimizer.
-    RETURNS loss: A float for the loss.
-    """
-    predictions, backprop = model.begin_update(docs)
-    loss, gradients = objective_func(model.ops, docs, predictions)
-    backprop(gradients)
-    model.finish_update(optimizer)
-    # Don't want to return a cupy object here
-    # The gradients are modified in-place by the BERT MLM,
-    # so we get an accurate loss
-    return float(loss)
-
-
-def create_objective(config):
-    """Create the objective for pretraining.
-
-    We'd like to replace this with a registry function but it's tricky because
-    we're also making a model choice based on this. For now we hard-code support
-    for two types (characters, vectors). For characters you can specify
-    n_characters, for vectors you can specify the loss.
-
-    Bleh.
-    """
-    objective_type = config["type"]
-    if objective_type == "characters":
-        return partial(get_characters_loss, nr_char=config["n_characters"])
-    elif objective_type == "vectors":
-        if config["loss"] == "cosine":
-            return partial(
-                get_vectors_loss,
-                distance=CosineDistance(normalize=True, ignore_zeros=True),
-            )
-        elif config["loss"] == "L2":
-            return partial(
-                get_vectors_loss, distance=L2Distance(normalize=True, ignore_zeros=True)
-            )
-        else:
-            raise ValueError("Unexpected loss type", config["loss"])
-    else:
-        raise ValueError("Unexpected objective_type", objective_type)
-
-
-def get_vectors_loss(ops, docs, prediction, distance):
-    """Compute a loss based on a distance between the documents' vectors and
-    the prediction.
-    """
-    # The simplest way to implement this would be to vstack the
-    # token.vector values, but that's a bit inefficient, especially on GPU.
-    # Instead we fetch the index into the vectors table for each of our tokens,
-    # and look them up all at once. This prevents data copying.
-    ids = ops.flatten([doc.to_array(ID).ravel() for doc in docs])
-    target = docs[0].vocab.vectors.data[ids]
-    d_target, loss = distance(prediction, target)
-    return loss, d_target
-
-
-def get_characters_loss(ops, docs, prediction, nr_char):
-    """Compute a loss based on a number of characters predicted from the docs."""
-    target_ids = numpy.vstack([doc.to_utf8_array(nr_char=nr_char) for doc in docs])
-    target_ids = target_ids.reshape((-1,))
-    target = ops.asarray(to_categorical(target_ids, n_classes=256), dtype="f")
-    target = target.reshape((-1, 256 * nr_char))
-    diff = prediction - target
-    loss = (diff ** 2).sum()
-    d_target = diff / float(prediction.shape[0])
-    return loss, d_target
-
-
-def create_pretraining_model(nlp, pretrain_config):
-    """Define a network for the pretraining. We simply add an output layer onto
-    the tok2vec input model. The tok2vec input model needs to be a model that
-    takes a batch of Doc objects (as a list), and returns a list of arrays.
-    Each array in the output needs to have one row per token in the doc.
-    The actual tok2vec layer is stored as a reference, and only this bit will be
-    serialized to file and read back in when calling the 'train' command.
-    """
-    component = nlp.get_pipe(pretrain_config["component"])
-    if pretrain_config.get("layer"):
-        tok2vec = component.model.get_ref(pretrain_config["layer"])
-    else:
-        tok2vec = component.model
-
-    # TODO
-    maxout_pieces = 3
-    hidden_size = 300
-    if pretrain_config["objective"]["type"] == "vectors":
-        model = build_cloze_multi_task_model(
-            nlp.vocab, tok2vec, hidden_size=hidden_size, maxout_pieces=maxout_pieces
-        )
-    elif pretrain_config["objective"]["type"] == "characters":
-        model = build_cloze_characters_multi_task_model(
-            nlp.vocab,
-            tok2vec,
-            hidden_size=hidden_size,
-            maxout_pieces=maxout_pieces,
-            nr_char=pretrain_config["objective"]["n_characters"],
-        )
-    model.initialize(X=[nlp.make_doc("Give it a doc to infer shapes")])
-    set_dropout_rate(model, pretrain_config["dropout"])
-    return model
-
-
-class ProgressTracker:
-    def __init__(self, frequency=1000000):
-        self.loss = 0.0
-        self.prev_loss = 0.0
-        self.nr_word = 0
-        self.words_per_epoch = Counter()
-        self.frequency = frequency
-        self.last_time = time.time()
-        self.last_update = 0
-        self.epoch_loss = 0.0
-
-    def update(self, epoch, loss, docs):
-        self.loss += loss
-        self.epoch_loss += loss
-        words_in_batch = sum(len(doc) for doc in docs)
-        self.words_per_epoch[epoch] += words_in_batch
-        self.nr_word += words_in_batch
-        words_since_update = self.nr_word - self.last_update
-        if words_since_update >= self.frequency:
-            wps = words_since_update / (time.time() - self.last_time)
-            self.last_update = self.nr_word
-            self.last_time = time.time()
-            loss_per_word = self.loss - self.prev_loss
-            status = (
-                epoch,
-                self.nr_word,
-                _smart_round(self.loss, width=10),
-                _smart_round(loss_per_word, width=6),
-                int(wps),
-            )
-            self.prev_loss = float(self.loss)
-            return status
-        else:
-            return None
-
-
-def _smart_round(figure, width=10, max_decimal=4):
-    """Round large numbers as integers, smaller numbers as decimals."""
-    n_digits = len(str(int(figure)))
-    n_decimal = width - (n_digits + 1)
-    if n_decimal <= 1:
-        return str(int(figure))
-    else:
-        n_decimal = min(n_decimal, max_decimal)
-        format_str = "%." + str(n_decimal) + "f"
-        return format_str % figure
-
-
 def verify_cli_args(config_path, output_dir, resume_path, epoch_resume):
     if not config_path or not config_path.exists():
         msg.fail("Config file not found", config_path, exits=1)
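As a worked example of the row formatting used above: _smart_round keeps each numeric column at a fixed width by trading decimal places for integer digits.

```python
def _smart_round(figure, width=10, max_decimal=4):
    """Round large numbers as integers, smaller numbers as decimals."""
    n_digits = len(str(int(figure)))
    n_decimal = width - (n_digits + 1)
    if n_decimal <= 1:
        return str(int(figure))
    n_decimal = min(n_decimal, max_decimal)
    return ("%." + str(n_decimal) + "f") % figure

assert _smart_round(123456789.123) == "123456789"  # 9 digits leave no room
assert _smart_round(12.345) == "12.3450"           # capped at max_decimal
assert _smart_round(12.345, width=6) == "12.345"   # width 6 -> 3 decimals
```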
@@ -1,24 +1,16 @@
-from typing import Optional, Dict, Any, Tuple, Union, Callable, List
-from timeit import default_timer as timer
-import tqdm
+from typing import Optional
 from pathlib import Path
 from wasabi import msg
-import thinc
-import thinc.schedules
-from thinc.api import Config, Optimizer, require_gpu, fix_random_seed, set_gpu_allocator
-import random
+from thinc.api import Config
 import typer
 import logging

-from .init_pipeline import init_pipeline
-from .init_pipeline import create_before_to_disk_callback
 from ._util import app, Arg, Opt, parse_config_overrides, show_validation_error
-from ._util import import_code
+from ._util import import_code, CliLogger, setup_gpu
 from ..language import Language
+from ..training.loop import train
+from ..training.initialize import init_nlp, must_reinitialize
 from .. import util
-from ..errors import Errors
-from ..util import resolve_dot_names, registry
-from ..schemas import ConfigSchemaTraining


 @app.command(
@@ -52,31 +44,33 @@ def train_cli(
     verify_cli_args(config_path, output_path)
     overrides = parse_config_overrides(ctx.args)
     import_code(code_path)
-    if use_gpu >= 0:
-        msg.info(f"Using GPU: {use_gpu}")
-        require_gpu(use_gpu)
-    else:
-        msg.info("Using CPU")
-    config = util.load_config(config_path, overrides=overrides, interpolate=False)
+    setup_gpu(use_gpu)
+    with show_validation_error(config_path):
+        config = util.load_config(config_path, overrides=overrides, interpolate=False)
     msg.divider("Initializing pipeline")
-    nlp = init_nlp(config, output_path)
+    nlp = init_pipeline(config, output_path, use_gpu=use_gpu)
     msg.divider("Training pipeline")
-    train(nlp, output_path, use_gpu=use_gpu)
+    final_path = train(nlp, output_path, use_gpu=use_gpu, logger=CliLogger)
+    if final_path:
+        msg.good(f"Saved pipeline to output directory", final_path)


-def init_nlp(config: Config, output_path: Optional[Path]) -> Language:
+def init_pipeline(
+    config: Config, output_path: Optional[Path], *, use_gpu: int = -1
+) -> Language:
+    init_kwargs = {"use_gpu": use_gpu, "logger": CliLogger, "on_success": msg.good}
     if output_path is not None:
         init_path = output_path / "model-initial"
         if not init_path.exists():
             msg.info(f"Initializing the pipeline in {init_path}")
-            nlp = init_pipeline(config)
+            nlp = init_nlp(config, **init_kwargs)
             nlp.to_disk(init_path)
             msg.good(f"Saved initialized pipeline to {init_path}")
         else:
             nlp = util.load_model(init_path)
             if must_reinitialize(config, nlp.config):
                 msg.warn("Config has changed: need to re-initialize pipeline")
-                nlp = init_pipeline(config)
+                nlp = init_nlp(config, **init_kwargs)
                 nlp.to_disk(init_path)
                 msg.good(f"Re-initialized pipeline in {init_path}")
             else:
@@ -88,279 +82,7 @@ def init_nlp(config: Config, output_path: Optional[Path]) -> Language:
         "the vocabulary, vectors and label scheme. To take advantage of this, "
         "provide an output directory."
     )
-    return init_pipeline(config)
-
-
-def train(
-    nlp: Language, output_path: Optional[Path] = None, *, use_gpu: int = -1
-) -> None:
-    # Create iterator, which yields out info after each optimization step.
-    config = nlp.config.interpolate()
-    if config["training"]["seed"] is not None:
-        fix_random_seed(config["training"]["seed"])
-    allocator = config["training"]["gpu_allocator"]
-    if use_gpu >= 0 and allocator:
-        set_gpu_allocator(allocator)
-    T = registry.resolve(config["training"], schema=ConfigSchemaTraining)
-    dot_names = [T["train_corpus"], T["dev_corpus"]]
-    train_corpus, dev_corpus = resolve_dot_names(config, dot_names)
-    optimizer = T["optimizer"]
-    score_weights = T["score_weights"]
-    batcher = T["batcher"]
-    train_logger = T["logger"]
-    before_to_disk = create_before_to_disk_callback(T["before_to_disk"])
-    # Components that shouldn't be updated during training
-    frozen_components = T["frozen_components"]
-    # Create iterator, which yields out info after each optimization step.
-    training_step_iterator = train_while_improving(
-        nlp,
-        optimizer,
-        create_train_batches(train_corpus(nlp), batcher, T["max_epochs"]),
-        create_evaluation_callback(nlp, dev_corpus, score_weights),
-        dropout=T["dropout"],
-        accumulate_gradient=T["accumulate_gradient"],
-        patience=T["patience"],
-        max_steps=T["max_steps"],
-        eval_frequency=T["eval_frequency"],
-        exclude=frozen_components,
-    )
-    msg.info(f"Pipeline: {nlp.pipe_names}")
-    if frozen_components:
-        msg.info(f"Frozen components: {frozen_components}")
-    msg.info(f"Initial learn rate: {optimizer.learn_rate}")
-    with nlp.select_pipes(disable=frozen_components):
-        print_row, finalize_logger = train_logger(nlp)
-
-    try:
-        progress = tqdm.tqdm(total=T["eval_frequency"], leave=False)
-        progress.set_description(f"Epoch 1")
-        for batch, info, is_best_checkpoint in training_step_iterator:
-            progress.update(1)
-            if is_best_checkpoint is not None:
-                progress.close()
-                print_row(info)
-                if is_best_checkpoint and output_path is not None:
-                    with nlp.select_pipes(disable=frozen_components):
-                        update_meta(T, nlp, info)
-                    with nlp.use_params(optimizer.averages):
-                        nlp = before_to_disk(nlp)
-                        nlp.to_disk(output_path / "model-best")
-                progress = tqdm.tqdm(total=T["eval_frequency"], leave=False)
-                progress.set_description(f"Epoch {info['epoch']}")
-    except Exception as e:
-        finalize_logger()
-        if output_path is not None:
-            # We don't want to swallow the traceback if we don't have a
-            # specific error.
-            msg.warn(
-                f"Aborting and saving the final best model. "
-                f"Encountered exception: {str(e)}"
-            )
-            nlp = before_to_disk(nlp)
-            nlp.to_disk(output_path / "model-final")
-        raise e
-    finally:
-        finalize_logger()
-    if output_path is not None:
-        final_model_path = output_path / "model-final"
-        if optimizer.averages:
-            with nlp.use_params(optimizer.averages):
-                nlp.to_disk(final_model_path)
-        else:
-            nlp.to_disk(final_model_path)
-        msg.good(f"Saved pipeline to output directory {final_model_path}")
-
-
-def must_reinitialize(train_config: Config, init_config: Config) -> bool:
-    # TODO: do this better and more fine-grained
-    return train_config.interpolate().to_str() == init_config.interpolate().to_str()
-
-
-def add_vectors(nlp: Language, vectors: str) -> None:
-    title = f"Config validation error for vectors {vectors}"
-    desc = (
-        "This typically means that there's a problem in the config.cfg included "
-        "with the packaged vectors. Make sure that the vectors package you're "
-        "loading is compatible with the current version of spaCy."
-    )
-    with show_validation_error(
-        title=title, desc=desc, hint_fill=False, show_config=False
-    ):
-        util.load_vectors_into_model(nlp, vectors)
-
-
-def create_train_batches(iterator, batcher, max_epochs: int):
-    epoch = 0
-    examples = list(iterator)
-    if not examples:
-        # Raise error if no data
-        raise ValueError(Errors.E986)
-    while max_epochs < 1 or epoch != max_epochs:
-        random.shuffle(examples)
-        for batch in batcher(examples):
-            yield epoch, batch
-        epoch += 1
-
-
-def create_evaluation_callback(
-    nlp: Language, dev_corpus: Callable, weights: Dict[str, float]
-) -> Callable[[], Tuple[float, Dict[str, float]]]:
-    weights = {key: value for key, value in weights.items() if value is not None}
-
-    def evaluate() -> Tuple[float, Dict[str, float]]:
-        dev_examples = list(dev_corpus(nlp))
-        scores = nlp.evaluate(dev_examples)
-        # Calculate a weighted sum based on score_weights for the main score.
-        # We can only consider scores that are ints/floats, not dicts like
-        # entity scores per type etc.
-        for key, value in scores.items():
-            if key in weights and not isinstance(value, (int, float)):
-                raise ValueError(Errors.E915.format(name=key, score_type=type(value)))
-        try:
-            weighted_score = sum(
-                scores.get(s, 0.0) * weights.get(s, 0.0) for s in weights
-            )
-        except KeyError as e:
-            keys = list(scores.keys())
-            err = Errors.E983.format(dict="score_weights", key=str(e), keys=keys)
-            raise KeyError(err) from None
-        return weighted_score, scores
-
-    return evaluate
-
-
-def train_while_improving(
-    nlp: Language,
-    optimizer: Optimizer,
-    train_data,
-    evaluate,
-    *,
-    dropout: float,
-    eval_frequency: int,
-    accumulate_gradient: int,
-    patience: int,
-    max_steps: int,
-    exclude: List[str],
-):
-    """Train until an evaluation stops improving. Works as a generator,
-    with each iteration yielding a tuple `(batch, info, is_best_checkpoint)`,
-    where info is a dict, and is_best_checkpoint is in [True, False, None] --
-    None indicating that the iteration was not evaluated as a checkpoint.
-    The evaluation is conducted by calling the evaluate callback.
-
-    Positional arguments:
-    nlp: The spaCy pipeline to evaluate.
-    optimizer: The optimizer callable.
-    train_data (Iterable[Batch]): A generator of batches, with the training
-        data. Each batch should be a Sized[Tuple[Input, Annot]]. The training
-        data iterable needs to take care of iterating over the epochs and
-        shuffling.
-    evaluate (Callable[[], Tuple[float, Any]]): A callback to perform evaluation.
-        The callback should take no arguments and return a tuple
-        `(main_score, other_scores)`. The main_score should be a float where
-        higher is better. other_scores can be any object.
-
-    Every iteration, the function yields out a tuple with:
-
-    * batch: A list of Example objects.
-    * info: A dict with various information about the last update (see below).
-    * is_best_checkpoint: A value in None, False, True, indicating whether this
-        was the best evaluation so far. You should use this to save the model
-        checkpoints during training. If None, evaluation was not conducted on
-        that iteration. False means evaluation was conducted, but a previous
-        evaluation was better.
-
-    The info dict provides the following information:
-
-    epoch (int): How many passes over the data have been completed.
-    step (int): How many steps have been completed.
-    score (float): The main score from the last evaluation.
-    other_scores: The other scores from the last evaluation.
-    losses: The accumulated losses throughout training.
-    checkpoints: A list of previous results, where each result is a
-        (score, step, epoch) tuple.
-    """
-    if isinstance(dropout, float):
-        dropouts = thinc.schedules.constant(dropout)
-    else:
-        dropouts = dropout
-    results = []
-    losses = {}
-    words_seen = 0
-    start_time = timer()
-    for step, (epoch, batch) in enumerate(train_data):
-        dropout = next(dropouts)
-        for subbatch in subdivide_batch(batch, accumulate_gradient):
-            nlp.update(
-                subbatch, drop=dropout, losses=losses, sgd=False, exclude=exclude
-            )
-        # TODO: refactor this so we don't have to run it separately in here
-        for name, proc in nlp.pipeline:
-            if (
-                name not in exclude
-                and hasattr(proc, "model")
-                and proc.model not in (True, False, None)
-            ):
-                proc.model.finish_update(optimizer)
-        optimizer.step_schedules()
-        if not (step % eval_frequency):
-            if optimizer.averages:
-                with nlp.use_params(optimizer.averages):
-                    score, other_scores = evaluate()
-            else:
-                score, other_scores = evaluate()
-            results.append((score, step))
-            is_best_checkpoint = score == max(results)[0]
-        else:
-            score, other_scores = (None, None)
-            is_best_checkpoint = None
-        words_seen += sum(len(eg) for eg in batch)
-        info = {
-            "epoch": epoch,
-            "step": step,
-            "score": score,
-            "other_scores": other_scores,
-            "losses": losses,
-            "checkpoints": results,
-            "seconds": int(timer() - start_time),
-            "words": words_seen,
-        }
-        yield batch, info, is_best_checkpoint
-        if is_best_checkpoint is not None:
-            losses = {}
-        # Stop if no improvement in `patience` updates (if specified)
-        best_score, best_step = max(results)
-        if patience and (step - best_step) >= patience:
-            break
-        # Stop if we've exhausted our max steps (if specified)
-        if max_steps and step >= max_steps:
-            break
-
-
-def subdivide_batch(batch, accumulate_gradient):
-    batch = list(batch)
-    batch.sort(key=lambda eg: len(eg.predicted))
-    sub_len = len(batch) // accumulate_gradient
-    start = 0
-    for i in range(accumulate_gradient):
-        subbatch = batch[start : start + sub_len]
-        if subbatch:
-            yield subbatch
-        start += len(subbatch)
-    subbatch = batch[start:]
-    if subbatch:
-        yield subbatch
-
-
-def update_meta(
-    training: Union[Dict[str, Any], Config], nlp: Language, info: Dict[str, Any]
-) -> None:
-    nlp.meta["performance"] = {}
-    for metric in training["score_weights"]:
-        if metric is not None:
-            nlp.meta["performance"][metric] = info["other_scores"].get(metric, 0.0)
-    for pipe_name in nlp.pipe_names:
-        nlp.meta["performance"][f"{pipe_name}_loss"] = info["losses"][pipe_name]
+    return init_nlp(config, **init_kwargs)


 def verify_cli_args(config_path: Path, output_path: Optional[Path] = None) -> None:
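The docstring above fixes the generator's contract; a toy stand-in showing how a caller is expected to drive it (the scores are fabricated):

```python
def fake_training_steps():
    best = 0.0
    for step in range(6):
        if step % 2 == 0:  # pretend every second step is an evaluation
            score = 0.5 + step / 100
            is_best = score > best
            best = max(best, score)
        else:
            score, is_best = None, None  # not evaluated this step
        yield ["<batch>"], {"step": step, "score": score}, is_best

for batch, info, is_best_checkpoint in fake_training_steps():
    if is_best_checkpoint:  # save checkpoints only on new best evaluations
        print("new best at step", info["step"], "->", info["score"])
```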
@@ -371,17 +93,3 @@ def verify_cli_args(config_path: Path, output_path: Optional[Path] = None) -> None:
         if not output_path.exists():
             output_path.mkdir()
             msg.good(f"Created output directory: {output_path}")
-
-
-# TODO: this is currently imported by the ray extension and not used otherwise
-def load_from_paths(
-    config: Config,
-) -> Tuple[List[Dict[str, str]], Dict[str, dict], bytes]:
-    weights_data = None
-    init_tok2vec = util.ensure_path(config["training"]["init_tok2vec"])
-    if init_tok2vec is not None:
-        if not init_tok2vec.exists():
-            msg.fail("Can't find pretrained tok2vec", init_tok2vec, exits=1)
-        with init_tok2vec.open("rb") as file_:
-            weights_data = file_.read()
-    return None, {}, {}, weights_data
@@ -9,10 +9,10 @@ from spacy.pipeline import TextCategorizer
 from spacy.tokens import Doc
 from spacy.pipeline.tok2vec import DEFAULT_TOK2VEC_MODEL
 from spacy.scorer import Scorer
+from spacy.training import Example
+from spacy.training.initialize import verify_textcat_config

 from ..util import make_tempdir
-from ...cli.train import verify_textcat_config
-from ...training import Example


 TRAIN_DATA = [
@@ -7,7 +7,6 @@ from spacy.cli.init_config import init_config, RECOMMENDATIONS
 from spacy.cli._util import validate_project_commands, parse_config_overrides
 from spacy.cli._util import load_project_config, substitute_project_variables
 from spacy.cli._util import string_to_list, OVERRIDES_ENV_VAR
-from spacy.cli.debug_config import check_section_refs
 from thinc.api import ConfigValidationError, Config
 import srsly
 import os
@@ -414,15 +413,3 @@ def test_string_to_list(value):
 def test_string_to_list_intify(value):
     assert string_to_list(value, intify=False) == ["1", "2", "3"]
     assert string_to_list(value, intify=True) == [1, 2, 3]
-
-
-def test_check_section_refs():
-    config = {"a": {"b": {"c": "a.d.e"}, "d": {"e": 1}}, "f": {"g": "d.f"}}
-    config = Config(config)
-    # Valid section reference
-    check_section_refs(config, ["a.b.c"])
-    # Section that doesn't exist in this config
-    check_section_refs(config, ["x.y.z"])
-    # Invalid section reference
-    with pytest.raises(ConfigValidationError):
-        check_section_refs(config, ["a.b.c", "f.g"])
@@ -7,7 +7,6 @@ from spacy import util
 from spacy import prefer_gpu, require_gpu
 from spacy.ml._precomputable_affine import PrecomputableAffine
 from spacy.ml._precomputable_affine import _backprop_precomputable_affine_padding
-from thinc.api import Optimizer


 @pytest.fixture
@@ -158,16 +157,3 @@ def test_dot_to_dict(dot_notation, expected):
     result = util.dot_to_dict(dot_notation)
     assert result == expected
     assert util.dict_to_dot(result) == dot_notation
-
-
-def test_resolve_training_config():
-    config = {
-        "nlp": {"lang": "en", "disabled": []},
-        "training": {"dropout": 0.1, "optimizer": {"@optimizers": "Adam.v1"}},
-        "corpora": {},
-    }
-    resolved = util.resolve_training_config(config)
-    assert resolved["training"]["dropout"] == 0.1
-    assert isinstance(resolved["training"]["optimizer"], Optimizer)
-    assert resolved["corpora"] == {}
-    assert "nlp" not in resolved
@@ -1,14 +1,15 @@
 import pytest

-from .util import get_random_doc
-
 from spacy import util
 from spacy.util import dot_to_object, SimpleFrozenList
-from thinc.api import Config, Optimizer
+from thinc.api import Config, Optimizer, ConfigValidationError
 from spacy.training.batchers import minibatch_by_words
-from ..lang.en import English
-from ..lang.nl import Dutch
-from ..language import DEFAULT_CONFIG_PATH
+from spacy.lang.en import English
+from spacy.lang.nl import Dutch
+from spacy.language import DEFAULT_CONFIG_PATH
+from spacy.schemas import ConfigSchemaTraining
+
+from .util import get_random_doc


 @pytest.mark.parametrize(
@@ -101,8 +102,8 @@ def test_util_dot_section():
         dot_to_object(en_nlp.config, "nlp.pipeline.tagger")
     with pytest.raises(KeyError):
         dot_to_object(en_nlp.config, "nlp.unknownattribute")
-    resolved = util.resolve_training_config(nl_nlp.config)
-    assert isinstance(dot_to_object(resolved, "training.optimizer"), Optimizer)
+    T = util.registry.resolve(nl_nlp.config["training"], schema=ConfigSchemaTraining)
+    assert isinstance(dot_to_object({"training": T}, "training.optimizer"), Optimizer)


 def test_simple_frozen_list():
@@ -120,3 +121,17 @@ def test_simple_frozen_list():
     t = SimpleFrozenList(["foo", "bar"], error="Error!")
     with pytest.raises(NotImplementedError):
         t.append("baz")
+
+
+def test_resolve_dot_names():
+    config = {
+        "training": {"optimizer": {"@optimizers": "Adam.v1"}},
+        "foo": {"bar": "training.optimizer", "baz": "training.xyz"},
+    }
+    result = util.resolve_dot_names(config, ["foo.bar"])
+    assert isinstance(result[0], Optimizer)
+    with pytest.raises(ConfigValidationError) as e:
+        util.resolve_dot_names(config, ["foo.baz", "foo.bar"])
+    errors = e.value.errors
+    assert len(errors) == 1
+    assert errors[0]["loc"] == ["training", "xyz"]
@@ -2,8 +2,8 @@ from typing import Dict, Iterable, Callable
 import pytest
 from thinc.api import Config
 from spacy import Language
-from spacy.util import load_model_from_config, registry, dot_to_object
-from spacy.util import resolve_training_config
+from spacy.util import load_model_from_config, registry, resolve_dot_names
+from spacy.schemas import ConfigSchemaTraining
 from spacy.training import Example

@@ -39,21 +39,21 @@ def test_readers():

     config = Config().from_str(config_string)
     nlp = load_model_from_config(config, auto_fill=True)
-    resolved = resolve_training_config(nlp.config)
-    train_corpus = dot_to_object(resolved, resolved["training"]["train_corpus"])
+    dot_names = ["training.train_corpus", "training.dev_corpus"]
+    train_corpus, dev_corpus = resolve_dot_names(nlp.config, dot_names)
     assert isinstance(train_corpus, Callable)
-    optimizer = resolved["training"]["optimizer"]
+    T = registry.resolve(nlp.config["training"], schema=ConfigSchemaTraining)
+    optimizer = T["optimizer"]
     # simulate a training loop
     nlp.begin_training(lambda: train_corpus(nlp), sgd=optimizer)
     for example in train_corpus(nlp):
         nlp.update([example], sgd=optimizer)
-    dev_corpus = dot_to_object(resolved, resolved["training"]["dev_corpus"])
     scores = nlp.evaluate(list(dev_corpus(nlp)))
     assert scores["cats_score"]
     # ensure the pipeline runs
     doc = nlp("Quick test")
     assert doc.cats
-    extra_corpus = resolved["corpora"]["extra"]
+    extra_corpus = registry.resolve(nlp.config["corpora"])["extra"]
     assert isinstance(extra_corpus, Callable)

@@ -89,9 +89,10 @@ def test_cat_readers(reader, additional_config):
     config["corpora"]["@readers"] = reader
     config["corpora"].update(additional_config)
     nlp = load_model_from_config(config, auto_fill=True)
-    resolved = resolve_training_config(nlp.config)
-    train_corpus = dot_to_object(resolved, resolved["training"]["train_corpus"])
-    optimizer = resolved["training"]["optimizer"]
+    dot_names = ["training.train_corpus", "training.dev_corpus"]
+    train_corpus, dev_corpus = resolve_dot_names(nlp.config, dot_names)
+    T = registry.resolve(nlp.config["training"], schema=ConfigSchemaTraining)
+    optimizer = T["optimizer"]
     # simulate a training loop
     nlp.begin_training(lambda: train_corpus(nlp), sgd=optimizer)
     for example in train_corpus(nlp):
@@ -100,7 +101,6 @@ def test_cat_readers(reader, additional_config):
         assert sorted(list(set(example.y.cats.values()))) == [0.0, 1.0]
         nlp.update([example], sgd=optimizer)
     # simulate performance benchmark on dev corpus
-    dev_corpus = dot_to_object(resolved, resolved["training"]["dev_corpus"])
     dev_examples = list(dev_corpus(nlp))
     for example in dev_examples:
         # this shouldn't fail if each dev example has at least one positive label
@ -0,0 +1,205 @@
from typing import Union, Dict, Optional, Any, List, Callable
from thinc.api import Config, fix_random_seed, set_gpu_allocator
from thinc.api import ConfigValidationError
from pathlib import Path
import srsly

from .loop import create_before_to_disk_callback
from ..language import Language
from ..lookups import Lookups
from ..errors import Errors
from ..schemas import ConfigSchemaTraining, ConfigSchemaInit, ConfigSchemaPretrain
from ..util import registry, load_model_from_config, resolve_dot_names
from ..util import load_model, ensure_path, logger, OOV_RANK, DEFAULT_OOV_PROB


def init_nlp(
    config: Config,
    *,
    use_gpu: int = -1,
    logger: Callable[[Any], Any] = logger,
    on_success: Callable[[str], None] = lambda x: None,
) -> Language:
    raw_config = config
    config = raw_config.interpolate()
    if config["training"]["seed"] is not None:
        fix_random_seed(config["training"]["seed"])
    allocator = config["training"]["gpu_allocator"]
    if use_gpu >= 0 and allocator:
        set_gpu_allocator(allocator)
    # Use original config here before it's resolved to functions
    sourced_components = get_sourced_components(config)
    nlp = load_model_from_config(raw_config, auto_fill=True)
    on_success("Set up nlp object from config")
    config = nlp.config.interpolate()
    # Resolve all training-relevant sections using the filled nlp config
    T = registry.resolve(config["training"], schema=ConfigSchemaTraining)
    dot_names = [T["train_corpus"], T["dev_corpus"]]
    train_corpus, dev_corpus = resolve_dot_names(config, dot_names)
    I = registry.resolve(config["initialize"], schema=ConfigSchemaInit)
    V = I["vocab"]
    init_vocab(nlp, data=V["data"], lookups=V["lookups"], vectors=V["vectors"])
    optimizer = T["optimizer"]
    before_to_disk = create_before_to_disk_callback(T["before_to_disk"])
    # Components that shouldn't be updated during training
    frozen_components = T["frozen_components"]
    # Sourced components that require resume_training
    resume_components = [p for p in sourced_components if p not in frozen_components]
    logger.info(f"Pipeline: {nlp.pipe_names}")
    if resume_components:
        with nlp.select_pipes(enable=resume_components):
            logger.info(f"Resuming training for: {resume_components}")
            nlp.resume_training(sgd=optimizer)
    with nlp.select_pipes(disable=[*frozen_components, *resume_components]):
        nlp.begin_training(lambda: train_corpus(nlp), sgd=optimizer)
    on_success(f"Initialized pipeline components")
    # Verify the config after calling 'begin_training' to ensure labels
    # are properly initialized
    verify_config(nlp)
    if "pretraining" in config and config["pretraining"]:
        P = registry.resolve(config["pretraining"], schema=ConfigSchemaPretrain)
        loaded = add_tok2vec_weights(nlp, P, I)
        if loaded and P["component"]:
            on_success(f"Loaded pretrained weights into component '{P['component']}'")
    nlp = before_to_disk(nlp)
    return nlp

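For orientation, this is roughly how init_nlp() is meant to be driven, e.g. from the refactored init_pipeline CLI command. A minimal sketch only: the config path, the assumed module location and the use of print as the on_success callback are illustrative, not part of this diff.

from spacy import util
from spacy.training.initialize import init_nlp  # assumed final module path

config = util.load_config("config.cfg", overrides={"training.seed": 0})
nlp = init_nlp(config, use_gpu=-1, on_success=print)
nlp.to_disk("output/initialized")
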
def must_reinitialize(train_config: Config, init_config: Config) -> bool:
    # TODO: do this better and more fine-grained
    return train_config.interpolate().to_str() == init_config.interpolate().to_str()

def init_vocab(
    nlp: Language,
    *,
    data: Optional[Path] = None,
    lookups: Optional[Lookups] = None,
    vectors: Optional[str] = None,
    on_success: Callable[[str], None] = lambda x: None,
) -> Language:
    if lookups:
        nlp.vocab.lookups = lookups
        on_success(f"Added vocab lookups: {', '.join(lookups.tables)}")
    data_path = ensure_path(data)
    if data_path is not None:
        lex_attrs = srsly.read_jsonl(data_path)
        for lexeme in nlp.vocab:
            lexeme.rank = OOV_RANK
        for attrs in lex_attrs:
            if "settings" in attrs:
                continue
            lexeme = nlp.vocab[attrs["orth"]]
            lexeme.set_attrs(**attrs)
        if len(nlp.vocab):
            oov_prob = min(lex.prob for lex in nlp.vocab) - 1
        else:
            oov_prob = DEFAULT_OOV_PROB
        nlp.vocab.cfg.update({"oov_prob": oov_prob})
        on_success(f"Added {len(nlp.vocab)} lexical entries to the vocab")
    on_success("Created vocabulary")
    if vectors is not None:
        load_vectors_into_model(nlp, vectors)
        on_success(f"Added vectors: {vectors}")

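The data argument is read with srsly.read_jsonl: one JSON object per line, where rows with a "settings" key are skipped, "orth" selects the lexeme, and the remaining keys are passed to lexeme.set_attrs(). A sketch of such a file and call (attribute values invented):

# vocab_data.jsonl -- one object per line, e.g.:
# {"orth": "the", "prob": -3.53}
# {"orth": "apple", "prob": -10.58}
init_vocab(nlp, data=Path("vocab_data.jsonl"), on_success=print)
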
def load_vectors_into_model(
    nlp: "Language", name: Union[str, Path], *, add_strings: bool = True
) -> None:
    """Load word vectors from an installed model or path into a model instance."""
    try:
        vectors_nlp = load_model(name)
    except ConfigValidationError as e:
        title = f"Config validation error for vectors {name}"
        desc = (
            "This typically means that there's a problem in the config.cfg included "
            "with the packaged vectors. Make sure that the vectors package you're "
            "loading is compatible with the current version of spaCy."
        )
        err = ConfigValidationError.from_error(e, title=title, desc=desc)
        raise err from None
    nlp.vocab.vectors = vectors_nlp.vocab.vectors
    if add_strings:
        # I guess we should add the strings from the vectors_nlp model?
        # E.g. if someone does a similarity query, they might expect the strings.
        for key in nlp.vocab.vectors.key2row:
            if key in vectors_nlp.vocab.strings:
                nlp.vocab.strings.add(vectors_nlp.vocab.strings[key])

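Usage is a one-liner; the donor can be any installed package or saved pipeline directory that ships vectors. The package name below is only an example:

import spacy

nlp = spacy.blank("en")
# copies the vectors table and, with add_strings=True, the matching strings
load_vectors_into_model(nlp, "en_core_web_md")
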
def add_tok2vec_weights(
    nlp: Language, pretrain_config: Dict[str, Any], vocab_config: Dict[str, Any]
) -> bool:
    # Load pretrained tok2vec weights - cf. CLI command 'pretrain'
    P = pretrain_config
    V = vocab_config
    weights_data = None
    init_tok2vec = ensure_path(V["init_tok2vec"])
    if init_tok2vec is not None:
        if P["objective"].get("type") == "vectors" and not V["vectors"]:
            err = 'need initialize.vectors if pretraining.objective.type is "vectors"'
            errors = [{"loc": ["initialize", "vectors"], "msg": err}]
            raise ConfigValidationError(config=nlp.config, errors=errors)
        if not init_tok2vec.exists():
            err = f"can't find pretrained tok2vec: {init_tok2vec}"
            errors = [{"loc": ["initialize", "vectors", "init_tok2vec"], "msg": err}]
            raise ConfigValidationError(config=nlp.config, errors=errors)
        with init_tok2vec.open("rb") as file_:
            weights_data = file_.read()
    if weights_data is not None:
        tok2vec_component = P["component"]
        if tok2vec_component is None:
            desc = (
                f"To use pretrained tok2vec weights, [pretraining.component] "
                f"needs to specify the component that should load them."
            )
            err = "component can't be null"
            errors = [{"loc": ["pretraining", "component"], "msg": err}]
            raise ConfigValidationError(
                config=nlp.config["pretraining"], errors=errors, desc=desc
            )
        layer = nlp.get_pipe(tok2vec_component).model
        if P["layer"]:
            layer = layer.get_ref(P["layer"])
        layer.from_bytes(weights_data)
        return True
    return False

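The inputs mirror the resolved [pretraining] and [initialize] config sections. A sketch with plain dicts standing in for the resolved blocks; the keys match the lookups above, the paths and names are invented:

P = {
    "component": "tok2vec",
    "layer": "",
    "objective": {"type": "characters", "n_characters": 4},
}
I = {"vectors": "en_core_web_md", "init_tok2vec": "pretrain-output/model9.bin"}
loaded = add_tok2vec_weights(nlp, P, I)  # True if weights were read and applied
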
def verify_config(nlp: Language) -> None:
    """Perform additional checks based on the config, loaded nlp object and training data."""
    # TODO: maybe we should validate based on the actual components, the list
    # in config["nlp"]["pipeline"] instead?
    for pipe_config in nlp.config["components"].values():
        # We can't assume that the component name == the factory
        factory = pipe_config["factory"]
        if factory == "textcat":
            verify_textcat_config(nlp, pipe_config)


def verify_textcat_config(nlp: Language, pipe_config: Dict[str, Any]) -> None:
    # if 'positive_label' is provided: double check whether it's in the data and
    # the task is binary
    if pipe_config.get("positive_label"):
        textcat_labels = nlp.get_pipe("textcat").labels
        pos_label = pipe_config.get("positive_label")
        if pos_label not in textcat_labels:
            raise ValueError(
                Errors.E920.format(pos_label=pos_label, labels=textcat_labels)
            )
        if len(list(textcat_labels)) != 2:
            raise ValueError(
                Errors.E919.format(pos_label=pos_label, labels=textcat_labels)
            )

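In plain terms: if positive_label is set, it must be one of the component's labels and the task must be binary. A minimal mirror of the rule (labels invented):

pipe_config = {"factory": "textcat", "positive_label": "POS"}
textcat_labels = ("POS", "NEG")
pos_label = pipe_config.get("positive_label")
assert pos_label in textcat_labels  # otherwise E920
assert len(textcat_labels) == 2     # otherwise E919
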
def get_sourced_components(config: Union[Dict[str, Any], Config]) -> List[str]:
    """RETURNS (List[str]): All sourced components in the original config,
    e.g. {"source": "en_core_web_sm"}. If the config contains a key
    "factory", we assume it refers to a component factory.
    """
    return [
        name
        for name, cfg in config.get("components", {}).items()
        if "factory" not in cfg and "source" in cfg
    ]

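For example, given a config where one component is built from a factory and one is copied from an installed pipeline (package name illustrative):

config = {
    "components": {
        "tagger": {"factory": "tagger"},
        "ner": {"source": "en_core_web_sm"},
    }
}
assert get_sourced_components(config) == ["ner"]
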
301  spacy/training/loop.py  Normal file

@ -0,0 +1,301 @@
from typing import List, Callable, Tuple, Dict, Iterable, Iterator, Union, Any
from typing import Optional
from pathlib import Path
from timeit import default_timer as timer
from thinc.api import Optimizer, Config, constant, fix_random_seed, set_gpu_allocator
import random
import tqdm

from .example import Example
from ..schemas import ConfigSchemaTraining
from ..language import Language
from ..errors import Errors
from ..util import resolve_dot_names, registry, logger


def train(
    nlp: Language,
    output_path: Optional[Path] = None,
    *,
    use_gpu: int = -1,
    logger: Callable[[Any], Any] = logger,
) -> Optional[Path]:
    """Train a pipeline.

    nlp (Language): The initialized nlp object with the full config.
    output_path (Path): Optional output path to save trained model to.
    use_gpu (int): GPU ID, or -1 for CPU. Make sure to call require_gpu
        before calling this function.
    logger (Callable[[Any], Any]): Optional logger exposing the methods info,
        error, debug and warn. Defaults to the regular spaCy logger but can be
        swapped for the CLI logger.
    RETURNS (Path / None): The path to the final exported model.
    """
    config = nlp.config.interpolate()
    if config["training"]["seed"] is not None:
        fix_random_seed(config["training"]["seed"])
    allocator = config["training"]["gpu_allocator"]
    if use_gpu >= 0 and allocator:
        set_gpu_allocator(allocator)
    T = registry.resolve(config["training"], schema=ConfigSchemaTraining)
    dot_names = [T["train_corpus"], T["dev_corpus"]]
    train_corpus, dev_corpus = resolve_dot_names(config, dot_names)
    optimizer = T["optimizer"]
    score_weights = T["score_weights"]
    batcher = T["batcher"]
    train_logger = T["logger"]
    before_to_disk = create_before_to_disk_callback(T["before_to_disk"])
    # Components that shouldn't be updated during training
    frozen_components = T["frozen_components"]
    # Create iterator, which yields out info after each optimization step.
    training_step_iterator = train_while_improving(
        nlp,
        optimizer,
        create_train_batches(train_corpus(nlp), batcher, T["max_epochs"]),
        create_evaluation_callback(nlp, dev_corpus, score_weights),
        dropout=T["dropout"],
        accumulate_gradient=T["accumulate_gradient"],
        patience=T["patience"],
        max_steps=T["max_steps"],
        eval_frequency=T["eval_frequency"],
        exclude=frozen_components,
    )
    logger.info(f"Pipeline: {nlp.pipe_names}")
    if frozen_components:
        logger.info(f"Frozen components: {frozen_components}")
    logger.info(f"Initial learn rate: {optimizer.learn_rate}")
    with nlp.select_pipes(disable=frozen_components):
        print_row, finalize_logger = train_logger(nlp)
    try:
        progress = tqdm.tqdm(total=T["eval_frequency"], leave=False)
        progress.set_description(f"Epoch 1")
        for batch, info, is_best_checkpoint in training_step_iterator:
            progress.update(1)
            if is_best_checkpoint is not None:
                progress.close()
                print_row(info)
                if is_best_checkpoint and output_path is not None:
                    with nlp.select_pipes(disable=frozen_components):
                        update_meta(T, nlp, info)
                    with nlp.use_params(optimizer.averages):
                        nlp = before_to_disk(nlp)
                        nlp.to_disk(output_path / "model-best")
                progress = tqdm.tqdm(total=T["eval_frequency"], leave=False)
                progress.set_description(f"Epoch {info['epoch']}")
    except Exception as e:
        finalize_logger()
        if output_path is not None:
            # We don't want to swallow the traceback if we don't have a
            # specific error.
            logger.warn(
                f"Aborting and saving the final best model. "
                f"Encountered exception: {str(e)}"
            )
            nlp = before_to_disk(nlp)
            nlp.to_disk(output_path / "model-final")
        raise e
    finally:
        finalize_logger()
    if output_path is not None:
        final_model_path = output_path / "model-final"
        if optimizer.averages:
            with nlp.use_params(optimizer.averages):
                nlp.to_disk(final_model_path)
        else:
            nlp.to_disk(final_model_path)
        return final_model_path

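Together with init_nlp() above, the intended flow from the CLI is roughly the following sketch; the module paths and file names are assumptions about where this refactor puts things:

from pathlib import Path
from spacy import util
from spacy.training.initialize import init_nlp  # assumed module path
from spacy.training.loop import train           # assumed module path

config = util.load_config("config.cfg")
nlp = init_nlp(config, use_gpu=-1)
final_path = train(nlp, output_path=Path("output"), use_gpu=-1)
# leaves output/model-best and output/model-final on disk
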
def train_while_improving(
    nlp: Language,
    optimizer: Optimizer,
    train_data,
    evaluate,
    *,
    dropout: float,
    eval_frequency: int,
    accumulate_gradient: int,
    patience: int,
    max_steps: int,
    exclude: List[str],
):
    """Train until an evaluation stops improving. Works as a generator,
    with each iteration yielding a tuple `(batch, info, is_best_checkpoint)`,
    where info is a dict, and is_best_checkpoint is in [True, False, None] --
    None indicating that the iteration was not evaluated as a checkpoint.
    The evaluation is conducted by calling the evaluate callback.

    Positional arguments:
    nlp: The spaCy pipeline to train.
    optimizer: The optimizer callable.
    train_data (Iterable[Batch]): A generator of batches, with the training
        data. Each batch should be a Sized[Tuple[Input, Annot]]. The training
        data iterable needs to take care of iterating over the epochs and
        shuffling.
    evaluate (Callable[[], Tuple[float, Any]]): A callback to perform evaluation.
        The callback should take no arguments and return a tuple
        `(main_score, other_scores)`. The main_score should be a float where
        higher is better. other_scores can be any object.

    Every iteration, the function yields out a tuple with:

    * batch: A list of Example objects.
    * info: A dict with various information about the last update (see below).
    * is_best_checkpoint: A value in None, False, True, indicating whether this
        was the best evaluation so far. You should use this to save the model
        checkpoints during training. If None, evaluation was not conducted on
        that iteration. False means evaluation was conducted, but a previous
        evaluation was better.

    The info dict provides the following information:

    epoch (int): How many passes over the data have been completed.
    step (int): How many steps have been completed.
    score (float): The main score from the last evaluation.
    other_scores: The other scores from the last evaluation.
    losses: The accumulated losses throughout training.
    checkpoints: A list of previous results, where each result is a
        (score, step, epoch) tuple.
    """
    if isinstance(dropout, float):
        dropouts = constant(dropout)
    else:
        dropouts = dropout
    results = []
    losses = {}
    words_seen = 0
    start_time = timer()
    for step, (epoch, batch) in enumerate(train_data):
        dropout = next(dropouts)
        for subbatch in subdivide_batch(batch, accumulate_gradient):
            nlp.update(
                subbatch, drop=dropout, losses=losses, sgd=False, exclude=exclude
            )
        # TODO: refactor this so we don't have to run it separately in here
        for name, proc in nlp.pipeline:
            if (
                name not in exclude
                and hasattr(proc, "model")
                and proc.model not in (True, False, None)
            ):
                proc.model.finish_update(optimizer)
        optimizer.step_schedules()
        if not (step % eval_frequency):
            if optimizer.averages:
                with nlp.use_params(optimizer.averages):
                    score, other_scores = evaluate()
            else:
                score, other_scores = evaluate()
            results.append((score, step))
            is_best_checkpoint = score == max(results)[0]
        else:
            score, other_scores = (None, None)
            is_best_checkpoint = None
        words_seen += sum(len(eg) for eg in batch)
        info = {
            "epoch": epoch,
            "step": step,
            "score": score,
            "other_scores": other_scores,
            "losses": losses,
            "checkpoints": results,
            "seconds": int(timer() - start_time),
            "words": words_seen,
        }
        yield batch, info, is_best_checkpoint
        if is_best_checkpoint is not None:
            losses = {}
        # Stop if no improvement in `patience` updates (if specified)
        best_score, best_step = max(results)
        if patience and (step - best_step) >= patience:
            break
        # Stop if we've exhausted our max steps (if specified)
        if max_steps and step >= max_steps:
            break

def subdivide_batch(batch, accumulate_gradient):
    batch = list(batch)
    batch.sort(key=lambda eg: len(eg.predicted))
    sub_len = len(batch) // accumulate_gradient
    start = 0
    for i in range(accumulate_gradient):
        subbatch = batch[start : start + sub_len]
        if subbatch:
            yield subbatch
        start += len(subbatch)
    subbatch = batch[start:]
    if subbatch:
        yield subbatch

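For example, ten examples with accumulate_gradient=3 yield sub-batches of sizes 3, 3, 3 and 1 (the remainder), after sorting by the length of each predicted doc. A self-contained check with a stand-in Example type:

class FakeEg:  # stand-in: only needs a sized .predicted attribute
    def __init__(self, n):
        self.predicted = "x" * n

batch = [FakeEg(n) for n in (5, 1, 3, 2, 4, 6, 8, 7, 9, 10)]
sizes = [len(sub) for sub in subdivide_batch(batch, 3)]
assert sizes == [3, 3, 3, 1]
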
def create_evaluation_callback(
    nlp: Language, dev_corpus: Callable, weights: Dict[str, float]
) -> Callable[[], Tuple[float, Dict[str, float]]]:
    weights = {key: value for key, value in weights.items() if value is not None}

    def evaluate() -> Tuple[float, Dict[str, float]]:
        dev_examples = list(dev_corpus(nlp))
        scores = nlp.evaluate(dev_examples)
        # Calculate a weighted sum based on score_weights for the main score.
        # We can only consider scores that are ints/floats, not dicts like
        # entity scores per type etc.
        for key, value in scores.items():
            if key in weights and not isinstance(value, (int, float)):
                raise ValueError(Errors.E915.format(name=key, score_type=type(value)))
        try:
            weighted_score = sum(
                scores.get(s, 0.0) * weights.get(s, 0.0) for s in weights
            )
        except KeyError as e:
            keys = list(scores.keys())
            err = Errors.E983.format(dict="score_weights", key=str(e), keys=keys)
            raise KeyError(err) from None
        return weighted_score, scores

    return evaluate

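The weighted main score is a plain dot product between score_weights and the numeric scores, with missing entries counted as 0.0. A worked example with invented numbers:

weights = {"dep_uas": 0.5, "dep_las": 0.5}
scores = {"dep_uas": 0.75, "dep_las": 0.25, "ents_f": 0.7}  # ents_f has no weight
weighted = sum(scores.get(s, 0.0) * weights.get(s, 0.0) for s in weights)
assert weighted == 0.5
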
def create_train_batches(
    iterator: Iterator[Example],
    batcher: Callable[[Iterable[Example]], Iterable[Example]],
    max_epochs: int,
):
    epoch = 0
    examples = list(iterator)
    if not examples:
        # Raise error if no data
        raise ValueError(Errors.E986)
    while max_epochs < 1 or epoch != max_epochs:
        random.shuffle(examples)
        for batch in batcher(examples):
            yield epoch, batch
        epoch += 1

def update_meta(
    training: Union[Dict[str, Any], Config], nlp: Language, info: Dict[str, Any]
) -> None:
    nlp.meta["performance"] = {}
    for metric in training["score_weights"]:
        if metric is not None:
            nlp.meta["performance"][metric] = info["other_scores"].get(metric, 0.0)
    for pipe_name in nlp.pipe_names:
        nlp.meta["performance"][f"{pipe_name}_loss"] = info["losses"][pipe_name]

def create_before_to_disk_callback(
    callback: Optional[Callable[[Language], Language]]
) -> Callable[[Language], Language]:
    def before_to_disk(nlp: Language) -> Language:
        if not callback:
            return nlp
        modified_nlp = callback(nlp)
        if not isinstance(modified_nlp, Language):
            err = Errors.E914.format(name="before_to_disk", value=type(modified_nlp))
            raise ValueError(err)
        return modified_nlp

    return before_to_disk

267  spacy/training/pretrain.py  Normal file

@ -0,0 +1,267 @@
from typing import Optional, Callable, Any, Iterable, Union, List
from thinc.api import Config, fix_random_seed, set_gpu_allocator, Model, Optimizer
from thinc.api import set_dropout_rate, to_categorical, CosineDistance, L2Distance
from pathlib import Path
from functools import partial
from collections import Counter
import srsly
import numpy
import time
import re
from wasabi import msg

from .example import Example
from ..tokens import Doc
from ..attrs import ID
from ..ml.models.multi_task import build_cloze_multi_task_model
from ..ml.models.multi_task import build_cloze_characters_multi_task_model
from ..schemas import ConfigSchemaTraining, ConfigSchemaPretrain
from ..util import registry, load_model_from_config, dot_to_object, logger


def pretrain(
    config: Config,
    output_dir: Path,
    resume_path: Optional[Path] = None,
    epoch_resume: Optional[int] = None,
    use_gpu: int = -1,
    logger: Callable[[Any], Any] = logger,
):
    if config["training"]["seed"] is not None:
        fix_random_seed(config["training"]["seed"])
    allocator = config["training"]["gpu_allocator"]
    if use_gpu >= 0 and allocator:
        set_gpu_allocator(allocator)
    nlp = load_model_from_config(config)
    T = registry.resolve(nlp.config["training"], schema=ConfigSchemaTraining)
    P = registry.resolve(nlp.config["pretraining"], schema=ConfigSchemaPretrain)
    corpus = dot_to_object(T, P["corpus"])
    batcher = P["batcher"]
    model = create_pretraining_model(nlp, P)
    optimizer = P["optimizer"]
    # Load in pretrained weights to resume from
    if resume_path is not None:
        _resume_model(model, resume_path, epoch_resume)
    else:
        # Without '--resume-path' the '--epoch-resume' argument is ignored
        epoch_resume = 0

    # TODO: move this to logger function?
    tracker = ProgressTracker(frequency=10000)
    msg.divider(f"Pre-training tok2vec layer - starting at epoch {epoch_resume}")
    row_settings = {"widths": (3, 10, 10, 6, 4), "aligns": ("r", "r", "r", "r", "r")}
    msg.row(("#", "# Words", "Total Loss", "Loss", "w/s"), **row_settings)

    def _save_model(epoch, is_temp=False):
        is_temp_str = ".temp" if is_temp else ""
        with model.use_params(optimizer.averages):
            with (output_dir / f"model{epoch}{is_temp_str}.bin").open("wb") as file_:
                file_.write(model.get_ref("tok2vec").to_bytes())
            log = {
                "nr_word": tracker.nr_word,
                "loss": tracker.loss,
                "epoch_loss": tracker.epoch_loss,
                "epoch": epoch,
            }
            with (output_dir / "log.jsonl").open("a") as file_:
                file_.write(srsly.json_dumps(log) + "\n")

    objective = create_objective(P["objective"])
    # TODO: I think we probably want this to look more like the
    # 'create_train_batches' function?
    for epoch in range(epoch_resume, P["max_epochs"]):
        for batch_id, batch in enumerate(batcher(corpus(nlp))):
            docs = ensure_docs(batch)
            loss = make_update(model, docs, optimizer, objective)
            progress = tracker.update(epoch, loss, docs)
            if progress:
                msg.row(progress, **row_settings)
            if P["n_save_every"] and (batch_id % P["n_save_every"] == 0):
                _save_model(epoch, is_temp=True)
        _save_model(epoch)
        tracker.epoch_loss = 0.0

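A sketch of driving pretrain(), e.g. from the 'spacy pretrain' CLI; the paths are illustrative and the config needs a filled [pretraining] block:

from pathlib import Path
from spacy import util

config = util.load_config("config.cfg", interpolate=True)
pretrain(config, output_dir=Path("pretrain-output"))
# resuming: _resume_model() below parses the epoch from default names like 'model4.bin'
pretrain(config, output_dir=Path("pretrain-output"),
         resume_path=Path("pretrain-output/model4.bin"))
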
def ensure_docs(examples_or_docs: Iterable[Union[Doc, Example]]) -> List[Doc]:
    docs = []
    for eg_or_doc in examples_or_docs:
        if isinstance(eg_or_doc, Doc):
            docs.append(eg_or_doc)
        else:
            docs.append(eg_or_doc.reference)
    return docs

def _resume_model(
    model: Model,
    resume_path: Path,
    epoch_resume: int,
    logger: Callable[[Any], Any] = logger,
) -> None:
    logger.info(f"Resume training tok2vec from: {resume_path}")
    with resume_path.open("rb") as file_:
        weights_data = file_.read()
        model.get_ref("tok2vec").from_bytes(weights_data)
    # Parse the epoch number from the given weight file
    model_name = re.search(r"model\d+\.bin", str(resume_path))
    if model_name:
        # Default weight file name, so read the epoch to start from by
        # cutting off 'model' and '.bin'
        epoch_resume = int(model_name.group(0)[5:][:-4]) + 1
        logger.info(f"Resuming from epoch: {epoch_resume}")
    else:
        logger.info(f"Resuming from epoch: {epoch_resume}")

def make_update(
    model: Model, docs: Iterable[Doc], optimizer: Optimizer, objective_func: Callable
) -> float:
    """Perform an update over a single batch of documents.

    model (Model): The pretraining model.
    docs (iterable): A batch of `Doc` objects.
    optimizer (callable): An optimizer.
    objective_func (callable): Callable computing the loss and the gradients.
    RETURNS loss: A float for the loss.
    """
    predictions, backprop = model.begin_update(docs)
    loss, gradients = objective_func(model.ops, docs, predictions)
    backprop(gradients)
    model.finish_update(optimizer)
    # Don't want to return a cupy object here
    # The gradients are modified in-place by the BERT MLM,
    # so we get an accurate loss
    return float(loss)

def create_objective(config: Config):
    """Create the objective for pretraining.

    We'd like to replace this with a registry function but it's tricky because
    we're also making a model choice based on this. For now we hard-code support
    for two types (characters, vectors). For characters you can specify
    n_characters, for vectors you can specify the loss.

    Bleh.
    """
    objective_type = config["type"]
    if objective_type == "characters":
        return partial(get_characters_loss, nr_char=config["n_characters"])
    elif objective_type == "vectors":
        if config["loss"] == "cosine":
            distance = CosineDistance(normalize=True, ignore_zeros=True)
            return partial(get_vectors_loss, distance=distance)
        elif config["loss"] == "L2":
            distance = L2Distance(normalize=True, ignore_zeros=True)
            return partial(get_vectors_loss, distance=distance)
        else:
            raise ValueError("Unexpected loss type", config["loss"])
    else:
        raise ValueError("Unexpected objective_type", objective_type)

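Concretely, the two objective shapes this accepts, with plain dicts standing in for the resolved [pretraining.objective] section:

chars = create_objective({"type": "characters", "n_characters": 4})
# -> partial(get_characters_loss, nr_char=4)
vecs = create_objective({"type": "vectors", "loss": "cosine"})  # or "L2"
# -> partial(get_vectors_loss, distance=CosineDistance(...))
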
def get_vectors_loss(ops, docs, prediction, distance):
    """Compute a loss based on a distance between the documents' vectors and
    the prediction.
    """
    # The simplest way to implement this would be to vstack the
    # token.vector values, but that's a bit inefficient, especially on GPU.
    # Instead we fetch the index into the vectors table for each of our tokens,
    # and look them up all at once. This prevents data copying.
    ids = ops.flatten([doc.to_array(ID).ravel() for doc in docs])
    target = docs[0].vocab.vectors.data[ids]
    d_target, loss = distance(prediction, target)
    return loss, d_target

def get_characters_loss(ops, docs, prediction, nr_char):
    """Compute a loss based on a number of characters predicted from the docs."""
    target_ids = numpy.vstack([doc.to_utf8_array(nr_char=nr_char) for doc in docs])
    target_ids = target_ids.reshape((-1,))
    target = ops.asarray(to_categorical(target_ids, n_classes=256), dtype="f")
    target = target.reshape((-1, 256 * nr_char))
    diff = prediction - target
    loss = (diff ** 2).sum()
    d_target = diff / float(prediction.shape[0])
    return loss, d_target

def create_pretraining_model(nlp, pretrain_config):
    """Define a network for the pretraining. We simply add an output layer onto
    the tok2vec input model. The tok2vec input model needs to be a model that
    takes a batch of Doc objects (as a list), and returns a list of arrays.
    Each array in the output needs to have one row per token in the doc.
    The actual tok2vec layer is stored as a reference, and only this bit will be
    serialized to file and read back in when calling the 'train' command.
    """
    component = nlp.get_pipe(pretrain_config["component"])
    if pretrain_config.get("layer"):
        tok2vec = component.model.get_ref(pretrain_config["layer"])
    else:
        tok2vec = component.model

    # TODO
    maxout_pieces = 3
    hidden_size = 300
    if pretrain_config["objective"]["type"] == "vectors":
        model = build_cloze_multi_task_model(
            nlp.vocab, tok2vec, hidden_size=hidden_size, maxout_pieces=maxout_pieces
        )
    elif pretrain_config["objective"]["type"] == "characters":
        model = build_cloze_characters_multi_task_model(
            nlp.vocab,
            tok2vec,
            hidden_size=hidden_size,
            maxout_pieces=maxout_pieces,
            nr_char=pretrain_config["objective"]["n_characters"],
        )
    model.initialize(X=[nlp.make_doc("Give it a doc to infer shapes")])
    set_dropout_rate(model, pretrain_config["dropout"])
    return model

class ProgressTracker:
    def __init__(self, frequency=1000000):
        self.loss = 0.0
        self.prev_loss = 0.0
        self.nr_word = 0
        self.words_per_epoch = Counter()
        self.frequency = frequency
        self.last_time = time.time()
        self.last_update = 0
        self.epoch_loss = 0.0

    def update(self, epoch, loss, docs):
        self.loss += loss
        self.epoch_loss += loss
        words_in_batch = sum(len(doc) for doc in docs)
        self.words_per_epoch[epoch] += words_in_batch
        self.nr_word += words_in_batch
        words_since_update = self.nr_word - self.last_update
        if words_since_update >= self.frequency:
            wps = words_since_update / (time.time() - self.last_time)
            self.last_update = self.nr_word
            self.last_time = time.time()
            loss_per_word = self.loss - self.prev_loss
            status = (
                epoch,
                self.nr_word,
                _smart_round(self.loss, width=10),
                _smart_round(loss_per_word, width=6),
                int(wps),
            )
            self.prev_loss = float(self.loss)
            return status
        else:
            return None

def _smart_round(
    figure: Union[float, int], width: int = 10, max_decimal: int = 4
) -> str:
    """Round large numbers as integers, smaller numbers as decimals."""
    n_digits = len(str(int(figure)))
    n_decimal = width - (n_digits + 1)
    if n_decimal <= 1:
        return str(int(figure))
    else:
        n_decimal = min(n_decimal, max_decimal)
        format_str = "%." + str(n_decimal) + "f"
        return format_str % figure

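Worked examples of the rounding rule: the decimal budget is whatever is left of width after the integer digits and the decimal point, capped at max_decimal:

assert _smart_round(3.14159) == "3.1416"        # 8 decimals would fit, capped at 4
assert _smart_round(123456.789) == "123456.789" # 10 - (6 + 1) = 3 decimals
assert _smart_round(123456789.5) == "123456789" # no room left: printed as an integer
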
@ -8,6 +8,7 @@ import re
from pathlib import Path
import thinc
from thinc.api import NumpyOps, get_current_ops, Adam, Config, Optimizer
from thinc.api import ConfigValidationError
import functools
import itertools
import numpy.random

@ -56,6 +57,7 @@ if TYPE_CHECKING:

OOV_RANK = numpy.iinfo(numpy.uint64).max
DEFAULT_OOV_PROB = -20
LEXEME_NORM_LANGS = ["da", "de", "el", "en", "id", "lb", "pt", "ru", "sr", "ta", "th"]

# Default order of sections in the config.cfg. Not all sections need to exist,

@ -239,20 +241,6 @@ def get_module_path(module: ModuleType) -> Path:
    return Path(sys.modules[module.__module__].__file__).parent


def load_vectors_into_model(
    nlp: "Language", name: Union[str, Path], *, add_strings=True
) -> None:
    """Load word vectors from an installed model or path into a model instance."""
    vectors_nlp = load_model(name)
    nlp.vocab.vectors = vectors_nlp.vocab.vectors
    if add_strings:
        # I guess we should add the strings from the vectors_nlp model?
        # E.g. if someone does a similarity query, they might expect the strings.
        for key in nlp.vocab.vectors.key2row:
            if key in vectors_nlp.vocab.strings:
                nlp.vocab.strings.add(vectors_nlp.vocab.strings[key])


def load_model(
    name: Union[str, Path],
    *,

@ -391,32 +379,9 @@ def load_model_from_config(
    return nlp


def resolve_training_config(
    config: Config,
    exclude: Iterable[str] = ("nlp", "components"),
    validate: bool = True,
) -> Dict[str, Any]:
    """Resolve the config sections relevant for training and create all objects.
    Mostly used in the CLI to separate training config (not resolved by default
    because not runtime-relevant – an nlp object should load fine even if its
    [training] block refers to functions that are not available etc.).

    config (Config): The config to resolve.
    exclude (Iterable[str]): The config blocks to exclude. Those blocks won't
        be available in the final resolved config.
    validate (bool): Whether to validate the config.
    RETURNS (Dict[str, Any]): The resolved config.
    """
    config = config.copy()
    for key in exclude:
        if key in config:
            config.pop(key)
    return registry.resolve(config, validate=validate)

def resolve_dot_names(
    config: Config, dot_names: List[Optional[str]]
) -> List[Optional[Callable]]:
) -> Tuple[Any]:
    """Resolve one or more "dot notation" names, e.g. corpora.train.
    The paths could point anywhere into the config, so we don't know which
    top-level section we'll be looking within.

@ -424,18 +389,42 @@ def resolve_dot_names(
    We resolve the whole top-level section, although we could resolve less --
    we could find the lowest part of the tree.
    """
    # TODO: include schema?
    # TODO: clean this up and avoid duplication
    resolved = {}
    output = []
    errors = []
    for name in dot_names:
        if name is None:
            output.append(name)
        else:
            section = name.split(".")[0]
            # We want to avoid resolving the same thing twice.
            # We want to avoid resolving the same thing twice
            if section not in resolved:
                resolved[section] = registry.resolve(config[section])
            output.append(dot_to_object(resolved, name))
    return output
            try:
                output.append(dot_to_object(resolved, name))
            except KeyError:
                msg = f"not a valid section reference: {name}"
                errors.append({"loc": name.split("."), "msg": msg})
    objects = []
    for ref in output:
        if not isinstance(ref, str):
            msg = f"not a valid section reference: {ref} ({type(ref)})"
            errors.append({"loc": ref.split("."), "msg": msg})
            continue
        section = ref.split(".")[0]
        # We want to avoid resolving the same thing twice
        if section not in resolved:
            resolved[section] = registry.resolve(config[section])
        try:
            objects.append(dot_to_object(resolved, ref))
        except KeyError:
            msg = f"not a valid section reference: {ref}"
            errors.append({"loc": ref.split("."), "msg": msg})
    if errors:
        raise ConfigValidationError(config=config, errors=errors)
    return tuple(objects)

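Typical call, matching how the new training loop uses it; the dot names come from config values like training.train_corpus = "corpora.train":

dot_names = ["corpora.train", "corpora.dev"]
train_corpus, dev_corpus = resolve_dot_names(config, dot_names)
# each result is the resolved object at that path (here: corpus callables);
# invalid references are collected and raised as one ConfigValidationError
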
def load_model_from_init_py(