Mirror of https://github.com/explosion/spaCy.git (synced 2025-01-12 02:06:31 +03:00)

Refactor CLI

commit 822ea4ef61
parent a89e0ff7cb
@@ -15,8 +15,7 @@ from .debug_config import debug_config  # noqa: F401
 from .debug_model import debug_model  # noqa: F401
 from .evaluate import evaluate  # noqa: F401
 from .convert import convert  # noqa: F401
-#from .init_model import init_model  # noqa: F401
-from .init_pipeline import init_pipeline  # noqa: F401
+from .init_pipeline import init_pipeline_cli  # noqa: F401
 from .init_config import init_config, fill_config  # noqa: F401
 from .validate import validate  # noqa: F401
 from .project.clone import project_clone  # noqa: F401
@@ -10,13 +10,12 @@ from click import NoSuchOption
 from click.parser import split_arg_string
 from typer.main import get_command
 from contextlib import contextmanager
-from thinc.api import Config, ConfigValidationError
+from thinc.api import Config, ConfigValidationError, require_gpu
 from configparser import InterpolationError
 import os
 
 from ..schemas import ProjectConfigSchema, validate
 from ..util import import_file, run_command, make_tempdir, registry, logger
-from ..util import ensure_path
 
 if TYPE_CHECKING:
     from pathy import Pathy  # noqa: F401
@@ -276,18 +275,6 @@ def import_code(code_path: Optional[Union[Path, str]]) -> None:
         msg.fail(f"Couldn't load Python code: {code_path}", e, exits=1)
 
 
-def get_sourced_components(config: Union[Dict[str, Any], Config]) -> List[str]:
-    """RETURNS (List[str]): All sourced components in the original config,
-    e.g. {"source": "en_core_web_sm"}. If the config contains a key
-    "factory", we assume it refers to a component factory.
-    """
-    return [
-        name
-        for name, cfg in config.get("components", {}).items()
-        if "factory" not in cfg and "source" in cfg
-    ]
-
-
 def upload_file(src: Path, dest: Union[str, "Pathy"]) -> None:
     """Upload a file.
@@ -459,3 +446,23 @@ def string_to_list(value: str, intify: bool = False) -> Union[List[str], List[int]]:
             p = int(p)
         result.append(p)
     return result
+
+
+class CliLogger:
+    """Helper mocking up the most commonly used logger methods. Can be passed
+    into functions like train() to make them output pretty-printed messages
+    on the CLI and regular logging if used from within Python.
+    """
+
+    debug = msg.text
+    info = msg.info
+    warn = msg.info
+    error = msg.fail
+
+
+def setup_gpu(use_gpu: int):
+    if use_gpu >= 0:
+        msg.info(f"Using GPU: {use_gpu}")
+        require_gpu(use_gpu)
+    else:
+        msg.info("Using CPU")
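The two helpers added above let command implementations stay agnostic about where their output goes. A minimal sketch of the duck-typing this enables (run_step is a hypothetical consumer, not part of the diff):

    import logging

    from wasabi import msg


    class CliLogger:
        # same shape as the helper above: logger-style methods mapped to wasabi
        debug = msg.text
        info = msg.info
        warn = msg.info
        error = msg.fail


    def run_step(logger) -> None:
        # library code only depends on the logger interface, so it accepts
        # either the stdlib logger or the CLI helper
        logger.info("Loading corpus")


    run_step(CliLogger)                    # pretty-printed CLI output
    run_step(logging.getLogger("spacy"))   # plain logging from within Python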
@@ -1,7 +1,7 @@
 from typing import Optional, Dict, Any, Union, List
 from pathlib import Path
 from wasabi import msg, table
-from thinc.api import Config, ConfigValidationError
+from thinc.api import Config
 from thinc.config import VARIABLE_RE
 import typer
 
@@ -52,10 +52,8 @@ def debug_config(
     with show_validation_error(config_path):
         config = util.load_config(config_path, overrides=overrides)
         nlp = util.load_model_from_config(config)
-    # Use the resolved config here in case user has one function returning
-    # a dict of corpora etc.
-    resolved = util.resolve_training_config(nlp.config)
-    check_section_refs(resolved, ["training.dev_corpus", "training.train_corpus"])
+    dot_names = ["training.dev_corpus", "training.train_corpus"]
+    util.resolve_dot_names(nlp.config, dot_names)
     msg.good("Config is valid")
     if show_vars:
         variables = get_variables(config)
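For context, a dot name such as "training.train_corpus" is a dotted path into the config, and resolving it builds whatever the referenced section describes. A toy illustration of the lookup half of that idea (not spaCy's implementation):

    from typing import Any, Dict


    def dot_to_object(config: Dict[str, Any], path: str) -> Any:
        # walk nested dicts along a dotted path like "training.train_corpus"
        obj = config
        for key in path.split("."):
            obj = obj[key]
        return obj


    config = {
        "training": {"train_corpus": "corpora.train"},
        "corpora": {"train": {"@readers": "spacy.Corpus.v1"}},
    }
    # the value stored under training.train_corpus is itself a dot name
    section = dot_to_object(config, dot_to_object(config, "training.train_corpus"))
    print(section)  # {'@readers': 'spacy.Corpus.v1'}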
@@ -97,23 +95,3 @@ def get_variables(config: Config) -> Dict[str, Any]:
         value = util.dot_to_object(config, path)
         result[variable] = repr(value)
     return result
-
-
-def check_section_refs(config: Config, fields: List[str]) -> None:
-    """Validate fields in the config that refer to other sections or values
-    (e.g. in the corpora) and make sure that those references exist.
-    """
-    errors = []
-    for field in fields:
-        # If the field doesn't exist in the config, we ignore it
-        try:
-            value = util.dot_to_object(config, field)
-        except KeyError:
-            continue
-        try:
-            util.dot_to_object(config, value)
-        except KeyError:
-            msg = f"not a valid section reference: {value}"
-            errors.append({"loc": field.split("."), "msg": msg})
-    if errors:
-        raise ConfigValidationError(config=config, errors=errors)
@@ -7,10 +7,13 @@ from wasabi import Printer, MESSAGES, msg
 import typer
 
 from ._util import app, Arg, Opt, show_validation_error, parse_config_overrides
-from ._util import import_code, debug_cli, get_sourced_components
+from ._util import import_code, debug_cli
 from ..training import Corpus, Example
+from ..training.initialize import get_sourced_components
+from ..schemas import ConfigSchemaTraining
 from ..pipeline._parser_internals import nonproj
 from ..language import Language
+from ..util import registry
 from .. import util
 
@@ -94,26 +97,13 @@ def debug_data(
     with show_validation_error(config_path):
         cfg = util.load_config(config_path, overrides=config_overrides)
         nlp = util.load_model_from_config(cfg)
-    C = util.resolve_training_config(nlp.config)
+    T = registry.resolve(nlp.config["training"], schema=ConfigSchemaTraining)
     # Use original config here, not resolved version
     sourced_components = get_sourced_components(cfg)
-    frozen_components = C["training"]["frozen_components"]
+    frozen_components = T["frozen_components"]
     resume_components = [p for p in sourced_components if p not in frozen_components]
     pipeline = nlp.pipe_names
     factory_names = [nlp.get_pipe_meta(pipe).factory for pipe in nlp.pipe_names]
-    tag_map_path = util.ensure_path(C["training"]["tag_map"])
-    tag_map = {}
-    if tag_map_path is not None:
-        tag_map = srsly.read_json(tag_map_path)
-    morph_rules_path = util.ensure_path(C["training"]["morph_rules"])
-    morph_rules = {}
-    if morph_rules_path is not None:
-        morph_rules = srsly.read_json(morph_rules_path)
-    # Replace tag map with provided mapping
-    nlp.vocab.morphology.load_tag_map(tag_map)
-    # Load morph rules
-    nlp.vocab.morphology.load_morph_exceptions(morph_rules)
-
     msg.divider("Data file validation")
 
     # Create the gold corpus to be able to better analyze data
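The `T = registry.resolve(...)` pattern above validates one config block against a pydantic schema and resolves any registered functions inside it. Roughly, with a made-up schema standing in for spaCy's ConfigSchemaTraining:

    from pydantic import BaseModel
    from thinc.api import Config, registry


    class TrainingSchema(BaseModel):
        seed: int = 0
        frozen_components: list = []


    cfg = Config().from_str("""
    [training]
    seed = 42
    frozen_components = ["ner"]
    """)
    T = registry.resolve(cfg["training"], schema=TrainingSchema)
    print(T["seed"], T["frozen_components"])  # 42 ['ner']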
@@ -145,10 +135,10 @@ def debug_data(
 
     train_texts = gold_train_data["texts"]
     dev_texts = gold_dev_data["texts"]
-    frozen_components = C["training"]["frozen_components"]
+    frozen_components = T["frozen_components"]
 
     msg.divider("Training stats")
-    msg.text(f"Language: {C['nlp']['lang']}")
+    msg.text(f"Language: {nlp.lang}")
     msg.text(f"Training pipeline: {', '.join(pipeline)}")
     if resume_components:
         msg.text(f"Components from other pipelines: {', '.join(resume_components)}")
@@ -355,6 +345,7 @@ def debug_data(
     if "tagger" in factory_names:
         msg.divider("Part-of-speech Tagging")
         labels = [label for label in gold_train_data["tags"]]
+        # TODO: does this need to be updated?
         tag_map = nlp.vocab.morphology.tag_map
         msg.info(f"{len(labels)} label(s) in data ({len(tag_map)} label(s) in tag map)")
         labels_with_counts = _format_labels(
@@ -4,12 +4,14 @@ from pathlib import Path
 from spacy.training import Example
 from spacy.util import dot_to_object
 from wasabi import msg
-from thinc.api import require_gpu, fix_random_seed, set_dropout_rate, Adam
+from thinc.api import fix_random_seed, set_dropout_rate, Adam
 from thinc.api import Model, data_validation, set_gpu_allocator
 import typer
 
 from ._util import Arg, Opt, debug_cli, show_validation_error
-from ._util import parse_config_overrides, string_to_list
+from ._util import parse_config_overrides, string_to_list, setup_gpu
+from ..schemas import ConfigSchemaTraining
+from ..util import registry
 from .. import util
 
@@ -37,11 +39,7 @@ def debug_model_cli(
 
     DOCS: https://nightly.spacy.io/api/cli#debug-model
     """
-    if use_gpu >= 0:
-        msg.info("Using GPU")
-        require_gpu(use_gpu)
-    else:
-        msg.info("Using CPU")
+    setup_gpu(use_gpu)
     layers = string_to_list(layers, intify=True)
     print_settings = {
         "dimensions": dimensions,
@@ -65,8 +63,8 @@ def debug_model_cli(
         set_gpu_allocator(allocator)
     with show_validation_error(config_path):
         nlp = util.load_model_from_config(raw_config)
-    C = util.resolve_training_config(nlp.config)
-    seed = C["training"]["seed"]
+    T = registry.resolve(nlp.config["training"], schema=ConfigSchemaTraining)
+    seed = T["seed"]
     if seed is not None:
         msg.info(f"Fixing random seed: {seed}")
         fix_random_seed(seed)
@@ -77,7 +75,7 @@ def debug_model_cli(
             exits=1,
         )
     model = pipe.model
-    debug_model(C, nlp, model, print_settings=print_settings)
+    debug_model(T, nlp, model, print_settings=print_settings)
 
 
 def debug_model(
@@ -3,11 +3,11 @@ from wasabi import Printer
 from pathlib import Path
 import re
 import srsly
-from thinc.api import require_gpu, fix_random_seed
+from thinc.api import fix_random_seed
 
 from ..training import Corpus
 from ..tokens import Doc
-from ._util import app, Arg, Opt
+from ._util import app, Arg, Opt, setup_gpu
 from ..scorer import Scorer
 from .. import util
 from .. import displacy
@@ -61,8 +61,7 @@ def evaluate(
 ) -> Scorer:
     msg = Printer(no_print=silent, pretty=not silent)
     fix_random_seed()
-    if use_gpu >= 0:
-        require_gpu(use_gpu)
+    setup_gpu(use_gpu)
     data_path = util.ensure_path(data_path)
     output_path = util.ensure_path(output)
     displacy_path = util.ensure_path(displacy_path)
@@ -1,22 +1,13 @@
-from typing import Optional, Dict, Callable, Any
+from typing import Optional
 import logging
 from pathlib import Path
 from wasabi import msg
 import typer
-from thinc.api import Config, fix_random_seed, set_gpu_allocator
-import srsly
 
 from .. import util
-from ..util import registry, resolve_dot_names, OOV_RANK
-from ..schemas import ConfigSchemaTraining, ConfigSchemaPretrain, ConfigSchemaInit
-from ..language import Language
-from ..lookups import Lookups
-from ..errors import Errors
+from ..training.initialize import init_nlp
 from ._util import init_cli, Arg, Opt, parse_config_overrides, show_validation_error
-from ._util import import_code, get_sourced_components
+from ._util import import_code, CliLogger, setup_gpu
 
 
-DEFAULT_OOV_PROB = -20
-
-
 @init_cli.command(
@@ -31,178 +22,16 @@ def init_pipeline_cli(
     output_path: Path = Arg(..., help="Output directory for the prepared data"),
     code_path: Optional[Path] = Opt(None, "--code", "-c", help="Path to Python file with additional code (registered functions) to be imported"),
     verbose: bool = Opt(False, "--verbose", "-V", "-VV", help="Display more information for debugging purposes"),
+    use_gpu: int = Opt(-1, "--gpu-id", "-g", help="GPU ID or -1 for CPU")
     # fmt: on
 ):
     util.logger.setLevel(logging.DEBUG if verbose else logging.ERROR)
     overrides = parse_config_overrides(ctx.args)
     import_code(code_path)
+    setup_gpu(use_gpu)
     with show_validation_error(config_path):
         config = util.load_config(config_path, overrides=overrides)
-    nlp = init_pipeline(config)
+    with show_validation_error(hint_fill=False):
+        nlp = init_nlp(config, use_gpu=use_gpu, logger=CliLogger, on_succcess=msg.good)
     nlp.to_disk(output_path)
     msg.good(f"Saved initialized pipeline to {output_path}")
-
-
-def init_pipeline(config: Config, use_gpu: int = -1) -> Language:
-    raw_config = config
-    config = raw_config.interpolate()
-    if config["training"]["seed"] is not None:
-        fix_random_seed(config["training"]["seed"])
-    allocator = config["training"]["gpu_allocator"]
-    if use_gpu >= 0 and allocator:
-        set_gpu_allocator(allocator)
-    # Use original config here before it's resolved to functions
-    sourced_components = get_sourced_components(config)
-    with show_validation_error():
-        nlp = util.load_model_from_config(raw_config, auto_fill=True)
-    msg.good("Set up nlp object from config")
-    config = nlp.config.interpolate()
-    # Resolve all training-relevant sections using the filled nlp config
-    T = registry.resolve(config["training"], schema=ConfigSchemaTraining)
-    dot_names = [T["train_corpus"], T["dev_corpus"]]
-    train_corpus, dev_corpus = resolve_dot_names(config, dot_names)
-    I = registry.resolve(config["initialize"], schema=ConfigSchemaInit)
-    V = I["vocab"]
-    init_vocab(nlp, data=V["data"], lookups=V["lookups"], vectors=V["vectors"])
-    optimizer = T["optimizer"]
-    before_to_disk = create_before_to_disk_callback(T["before_to_disk"])
-    # Components that shouldn't be updated during training
-    frozen_components = T["frozen_components"]
-    # Sourced components that require resume_training
-    resume_components = [p for p in sourced_components if p not in frozen_components]
-    msg.info(f"Pipeline: {nlp.pipe_names}")
-    if resume_components:
-        with nlp.select_pipes(enable=resume_components):
-            msg.info(f"Resuming training for: {resume_components}")
-            nlp.resume_training(sgd=optimizer)
-    with nlp.select_pipes(disable=[*frozen_components, *resume_components]):
-        nlp.begin_training(lambda: train_corpus(nlp), sgd=optimizer)
-    msg.good(f"Initialized pipeline components")
-    # Verify the config after calling 'begin_training' to ensure labels
-    # are properly initialized
-    verify_config(nlp)
-    if "pretraining" in config and config["pretraining"]:
-        P = registry.resolve(config["pretraining"], schema=ConfigSchemaPretrain)
-        add_tok2vec_weights(nlp, P, I)
-    # TODO: this should be handled better?
-    nlp = before_to_disk(nlp)
-    return nlp
-
-
-def init_vocab(
-    nlp: Language,
-    *,
-    data: Optional[Path] = None,
-    lookups: Optional[Lookups] = None,
-    vectors: Optional[str] = None,
-) -> Language:
-    if lookups:
-        nlp.vocab.lookups = lookups
-        msg.good(f"Added vocab lookups: {', '.join(lookups.tables)}")
-    data_path = util.ensure_path(data)
-    if data_path is not None:
-        lex_attrs = srsly.read_jsonl(data_path)
-        for lexeme in nlp.vocab:
-            lexeme.rank = OOV_RANK
-        for attrs in lex_attrs:
-            if "settings" in attrs:
-                continue
-            lexeme = nlp.vocab[attrs["orth"]]
-            lexeme.set_attrs(**attrs)
-        if len(nlp.vocab):
-            oov_prob = min(lex.prob for lex in nlp.vocab) - 1
-        else:
-            oov_prob = DEFAULT_OOV_PROB
-        nlp.vocab.cfg.update({"oov_prob": oov_prob})
-        msg.good(f"Added {len(nlp.vocab)} lexical entries to the vocab")
-    msg.good("Created vocabulary")
-    if vectors is not None:
-        add_vectors(nlp, vectors)
-        msg.good(f"Added vectors: {vectors}")
-
-
-def add_tok2vec_weights(
-    nlp: Language, pretrain_config: Dict[str, Any], vocab_config: Dict[str, Any]
-) -> None:
-    # Load pretrained tok2vec weights - cf. CLI command 'pretrain'
-    P = pretrain_config
-    V = vocab_config
-    weights_data = None
-    init_tok2vec = util.ensure_path(V["init_tok2vec"])
-    if init_tok2vec is not None:
-        if P["objective"].get("type") == "vectors" and not V["vectors"]:
-            err = "Need initialize.vectors if pretraining.objective.type is vectors"
-            msg.fail(err, exits=1)
-        if not init_tok2vec.exists():
-            msg.fail("Can't find pretrained tok2vec", init_tok2vec, exits=1)
-        with init_tok2vec.open("rb") as file_:
-            weights_data = file_.read()
-    if weights_data is not None:
-        tok2vec_component = P["component"]
-        if tok2vec_component is None:
-            msg.fail(
-                f"To use pretrained tok2vec weights, [pretraining.component] "
-                f"needs to specify the component that should load them.",
-                exits=1,
-            )
-        layer = nlp.get_pipe(tok2vec_component).model
-        if P["layer"]:
-            layer = layer.get_ref(P["layer"])
-        layer.from_bytes(weights_data)
-        msg.good(f"Loaded pretrained weights into component '{tok2vec_component}'")
-
-
-def add_vectors(nlp: Language, vectors: str) -> None:
-    title = f"Config validation error for vectors {vectors}"
-    desc = (
-        "This typically means that there's a problem in the config.cfg included "
-        "with the packaged vectors. Make sure that the vectors package you're "
-        "loading is compatible with the current version of spaCy."
-    )
-    with show_validation_error(
-        title=title, desc=desc, hint_fill=False, show_config=False
-    ):
-        util.load_vectors_into_model(nlp, vectors)
-    msg(f"Added {len(nlp.vocab.vectors)} vectors from {vectors}")
-
-
-def verify_config(nlp: Language) -> None:
-    """Perform additional checks based on the config, loaded nlp object and training data."""
-    # TODO: maybe we should validate based on the actual components, the list
-    # in config["nlp"]["pipeline"] instead?
-    for pipe_config in nlp.config["components"].values():
-        # We can't assume that the component name == the factory
-        factory = pipe_config["factory"]
-        if factory == "textcat":
-            verify_textcat_config(nlp, pipe_config)
-
-
-def verify_textcat_config(nlp: Language, pipe_config: Dict[str, Any]) -> None:
-    # if 'positive_label' is provided: double check whether it's in the data and
-    # the task is binary
-    if pipe_config.get("positive_label"):
-        textcat_labels = nlp.get_pipe("textcat").labels
-        pos_label = pipe_config.get("positive_label")
-        if pos_label not in textcat_labels:
-            raise ValueError(
-                Errors.E920.format(pos_label=pos_label, labels=textcat_labels)
-            )
-        if len(list(textcat_labels)) != 2:
-            raise ValueError(
-                Errors.E919.format(pos_label=pos_label, labels=textcat_labels)
-            )
-
-
-def create_before_to_disk_callback(
-    callback: Optional[Callable[[Language], Language]]
-) -> Callable[[Language], Language]:
-    def before_to_disk(nlp: Language) -> Language:
-        if not callback:
-            return nlp
-        modified_nlp = callback(nlp)
-        if not isinstance(modified_nlp, Language):
-            err = Errors.E914.format(name="before_to_disk", value=type(modified_nlp))
-            raise ValueError(err)
-        return modified_nlp
-
-    return before_to_disk
-
-
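Everything removed above (vocab initialization, pretrained tok2vec weights, config verification, the before_to_disk callback) now lives in spacy/training/initialize.py, and the command keeps only argument handling. The division of labour, as a simplified runnable sketch (names and signatures abbreviated, not the real APIs):

    from pathlib import Path
    from typing import Any, Dict, Optional


    def init_nlp(config: Dict[str, Any], *, use_gpu: int = -1, logger: Any = None) -> str:
        # library side: would build the vocab, resume sourced components and
        # call begin_training; here it only reports what it was asked to do
        mode = "GPU" if use_gpu >= 0 else "CPU"
        if logger is not None:
            logger.info(f"Initializing on {mode}")
        return f"pipeline({config.get('lang', 'xx')})"


    def init_pipeline_cli(config: Dict[str, Any], output_path: Optional[Path],
                          use_gpu: int = -1) -> None:
        # CLI side: GPU setup, validation errors, saving - no training logic
        nlp = init_nlp(config, use_gpu=use_gpu)
        print(f"Would save {nlp} to {output_path}")


    init_pipeline_cli({"lang": "en"}, Path("output"))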
@@ -1,25 +1,13 @@
 from typing import Optional
-import numpy
-import time
-import re
-from collections import Counter
 from pathlib import Path
-from thinc.api import require_gpu, set_gpu_allocator
-from thinc.api import set_dropout_rate, to_categorical, fix_random_seed
-from thinc.api import Config, CosineDistance, L2Distance
 from wasabi import msg
-import srsly
-from functools import partial
 import typer
+import re
 
 from ._util import app, Arg, Opt, parse_config_overrides, show_validation_error
-from ._util import import_code
-from ..ml.models.multi_task import build_cloze_multi_task_model
-from ..ml.models.multi_task import build_cloze_characters_multi_task_model
-from ..tokens import Doc
-from ..attrs import ID
-from .. import util
-from ..util import dot_to_object
+from ._util import import_code, setup_gpu, CliLogger
+from ..training.pretrain import pretrain
+from ..util import load_config
 
 
 @app.command(
@@ -61,15 +49,11 @@ def pretrain_cli(
     config_overrides = parse_config_overrides(ctx.args)
     import_code(code_path)
     verify_cli_args(config_path, output_dir, resume_path, epoch_resume)
-    if use_gpu >= 0:
-        msg.info("Using GPU")
-        require_gpu(use_gpu)
-    else:
-        msg.info("Using CPU")
+    setup_gpu(use_gpu)
     msg.info(f"Loading config from: {config_path}")
 
     with show_validation_error(config_path):
-        raw_config = util.load_config(
+        raw_config = load_config(
             config_path, overrides=config_overrides, interpolate=False
         )
         config = raw_config.interpolate()
@@ -89,250 +73,11 @@ def pretrain_cli(
         resume_path=resume_path,
         epoch_resume=epoch_resume,
         use_gpu=use_gpu,
+        logger=CliLogger,
     )
-
-
-def pretrain(
-    config: Config,
-    output_dir: Path,
-    resume_path: Optional[Path] = None,
-    epoch_resume: Optional[int] = None,
-    use_gpu: int = -1,
-):
-    if config["training"]["seed"] is not None:
-        fix_random_seed(config["training"]["seed"])
-    allocator = config["training"]["gpu_allocator"]
-    if use_gpu >= 0 and allocator:
-        set_gpu_allocator(allocator)
-    nlp = util.load_model_from_config(config)
-    C = util.resolve_training_config(nlp.config)
-    P_cfg = C["pretraining"]
-    corpus = dot_to_object(C, P_cfg["corpus"])
-    batcher = P_cfg["batcher"]
-    model = create_pretraining_model(nlp, C["pretraining"])
-    optimizer = C["pretraining"]["optimizer"]
-    # Load in pretrained weights to resume from
-    if resume_path is not None:
-        _resume_model(model, resume_path, epoch_resume)
-    else:
-        # Without '--resume-path' the '--epoch-resume' argument is ignored
-        epoch_resume = 0
-
-    tracker = ProgressTracker(frequency=10000)
-    msg.divider(f"Pre-training tok2vec layer - starting at epoch {epoch_resume}")
-    row_settings = {"widths": (3, 10, 10, 6, 4), "aligns": ("r", "r", "r", "r", "r")}
-    msg.row(("#", "# Words", "Total Loss", "Loss", "w/s"), **row_settings)
-
-    def _save_model(epoch, is_temp=False):
-        is_temp_str = ".temp" if is_temp else ""
-        with model.use_params(optimizer.averages):
-            with (output_dir / f"model{epoch}{is_temp_str}.bin").open("wb") as file_:
-                file_.write(model.get_ref("tok2vec").to_bytes())
-            log = {
-                "nr_word": tracker.nr_word,
-                "loss": tracker.loss,
-                "epoch_loss": tracker.epoch_loss,
-                "epoch": epoch,
-            }
-            with (output_dir / "log.jsonl").open("a") as file_:
-                file_.write(srsly.json_dumps(log) + "\n")
-
-    objective = create_objective(P_cfg["objective"])
-    # TODO: I think we probably want this to look more like the
-    # 'create_train_batches' function?
-    for epoch in range(epoch_resume, P_cfg["max_epochs"]):
-        for batch_id, batch in enumerate(batcher(corpus(nlp))):
-            docs = ensure_docs(batch)
-            loss = make_update(model, docs, optimizer, objective)
-            progress = tracker.update(epoch, loss, docs)
-            if progress:
-                msg.row(progress, **row_settings)
-            if P_cfg["n_save_every"] and (batch_id % P_cfg["n_save_every"] == 0):
-                _save_model(epoch, is_temp=True)
-        _save_model(epoch)
-        tracker.epoch_loss = 0.0
     msg.good("Successfully finished pretrain")
-
-
-def ensure_docs(examples_or_docs):
-    docs = []
-    for eg_or_doc in examples_or_docs:
-        if isinstance(eg_or_doc, Doc):
-            docs.append(eg_or_doc)
-        else:
-            docs.append(eg_or_doc.reference)
-    return docs
-
-
-def _resume_model(model, resume_path, epoch_resume):
-    msg.info(f"Resume training tok2vec from: {resume_path}")
-    with resume_path.open("rb") as file_:
-        weights_data = file_.read()
-        model.get_ref("tok2vec").from_bytes(weights_data)
-    # Parse the epoch number from the given weight file
-    model_name = re.search(r"model\d+\.bin", str(resume_path))
-    if model_name:
-        # Default weight file name so read epoch_start from it by cutting off 'model' and '.bin'
-        epoch_resume = int(model_name.group(0)[5:][:-4]) + 1
-        msg.info(f"Resuming from epoch: {epoch_resume}")
-    else:
-        msg.info(f"Resuming from epoch: {epoch_resume}")
-
-
-def make_update(model, docs, optimizer, objective_func):
-    """Perform an update over a single batch of documents.
-
-    docs (iterable): A batch of `Doc` objects.
-    optimizer (callable): An optimizer.
-    RETURNS loss: A float for the loss.
-    """
-    predictions, backprop = model.begin_update(docs)
-    loss, gradients = objective_func(model.ops, docs, predictions)
-    backprop(gradients)
-    model.finish_update(optimizer)
-    # Don't want to return a cupy object here
-    # The gradients are modified in-place by the BERT MLM,
-    # so we get an accurate loss
-    return float(loss)
-
-
-def create_objective(config):
-    """Create the objective for pretraining.
-
-    We'd like to replace this with a registry function but it's tricky because
-    we're also making a model choice based on this. For now we hard-code support
-    for two types (characters, vectors). For characters you can specify
-    n_characters, for vectors you can specify the loss.
-
-    Bleh.
-    """
-    objective_type = config["type"]
-    if objective_type == "characters":
-        return partial(get_characters_loss, nr_char=config["n_characters"])
-    elif objective_type == "vectors":
-        if config["loss"] == "cosine":
-            return partial(
-                get_vectors_loss,
-                distance=CosineDistance(normalize=True, ignore_zeros=True),
-            )
-        elif config["loss"] == "L2":
-            return partial(
-                get_vectors_loss, distance=L2Distance(normalize=True, ignore_zeros=True)
-            )
-        else:
-            raise ValueError("Unexpected loss type", config["loss"])
-    else:
-        raise ValueError("Unexpected objective_type", objective_type)
-
-
-def get_vectors_loss(ops, docs, prediction, distance):
-    """Compute a loss based on a distance between the documents' vectors and
-    the prediction.
-    """
-    # The simplest way to implement this would be to vstack the
-    # token.vector values, but that's a bit inefficient, especially on GPU.
-    # Instead we fetch the index into the vectors table for each of our tokens,
-    # and look them up all at once. This prevents data copying.
-    ids = ops.flatten([doc.to_array(ID).ravel() for doc in docs])
-    target = docs[0].vocab.vectors.data[ids]
-    d_target, loss = distance(prediction, target)
-    return loss, d_target
-
-
-def get_characters_loss(ops, docs, prediction, nr_char):
-    """Compute a loss based on a number of characters predicted from the docs."""
-    target_ids = numpy.vstack([doc.to_utf8_array(nr_char=nr_char) for doc in docs])
-    target_ids = target_ids.reshape((-1,))
-    target = ops.asarray(to_categorical(target_ids, n_classes=256), dtype="f")
-    target = target.reshape((-1, 256 * nr_char))
-    diff = prediction - target
-    loss = (diff ** 2).sum()
-    d_target = diff / float(prediction.shape[0])
-    return loss, d_target
-
-
-def create_pretraining_model(nlp, pretrain_config):
-    """Define a network for the pretraining. We simply add an output layer onto
-    the tok2vec input model. The tok2vec input model needs to be a model that
-    takes a batch of Doc objects (as a list), and returns a list of arrays.
-    Each array in the output needs to have one row per token in the doc.
-    The actual tok2vec layer is stored as a reference, and only this bit will be
-    serialized to file and read back in when calling the 'train' command.
-    """
-    component = nlp.get_pipe(pretrain_config["component"])
-    if pretrain_config.get("layer"):
-        tok2vec = component.model.get_ref(pretrain_config["layer"])
-    else:
-        tok2vec = component.model
-
-    # TODO
-    maxout_pieces = 3
-    hidden_size = 300
-    if pretrain_config["objective"]["type"] == "vectors":
-        model = build_cloze_multi_task_model(
-            nlp.vocab, tok2vec, hidden_size=hidden_size, maxout_pieces=maxout_pieces
-        )
-    elif pretrain_config["objective"]["type"] == "characters":
-        model = build_cloze_characters_multi_task_model(
-            nlp.vocab,
-            tok2vec,
-            hidden_size=hidden_size,
-            maxout_pieces=maxout_pieces,
-            nr_char=pretrain_config["objective"]["n_characters"],
-        )
-    model.initialize(X=[nlp.make_doc("Give it a doc to infer shapes")])
-    set_dropout_rate(model, pretrain_config["dropout"])
-    return model
-
-
-class ProgressTracker:
-    def __init__(self, frequency=1000000):
-        self.loss = 0.0
-        self.prev_loss = 0.0
-        self.nr_word = 0
-        self.words_per_epoch = Counter()
-        self.frequency = frequency
-        self.last_time = time.time()
-        self.last_update = 0
-        self.epoch_loss = 0.0
-
-    def update(self, epoch, loss, docs):
-        self.loss += loss
-        self.epoch_loss += loss
-        words_in_batch = sum(len(doc) for doc in docs)
-        self.words_per_epoch[epoch] += words_in_batch
-        self.nr_word += words_in_batch
-        words_since_update = self.nr_word - self.last_update
-        if words_since_update >= self.frequency:
-            wps = words_since_update / (time.time() - self.last_time)
-            self.last_update = self.nr_word
-            self.last_time = time.time()
-            loss_per_word = self.loss - self.prev_loss
-            status = (
-                epoch,
-                self.nr_word,
-                _smart_round(self.loss, width=10),
-                _smart_round(loss_per_word, width=6),
-                int(wps),
-            )
-            self.prev_loss = float(self.loss)
-            return status
-        else:
-            return None
-
-
-def _smart_round(figure, width=10, max_decimal=4):
-    """Round large numbers as integers, smaller numbers as decimals."""
-    n_digits = len(str(int(figure)))
-    n_decimal = width - (n_digits + 1)
-    if n_decimal <= 1:
-        return str(int(figure))
-    else:
-        n_decimal = min(n_decimal, max_decimal)
-        format_str = "%." + str(n_decimal) + "f"
-        return format_str % figure
 
 
 def verify_cli_args(config_path, output_dir, resume_path, epoch_resume):
     if not config_path or not config_path.exists():
         msg.fail("Config file not found", config_path, exits=1)
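The pretraining internals removed here move to spacy/training/pretrain.py unchanged in spirit. One pattern worth noting from the removed code is the objective dispatch via functools.partial; in miniature, with a toy loss instead of spaCy's:

    from functools import partial


    def characters_loss(predictions, targets, nr_char):
        # toy squared-error loss, standing in for get_characters_loss
        return sum((p - t) ** 2 for p, t in zip(predictions, targets)) / nr_char


    def create_objective(config):
        # bind config values into the loss once, so the training loop can
        # call objective(predictions, targets) without seeing the config
        if config["type"] == "characters":
            return partial(characters_loss, nr_char=config["n_characters"])
        raise ValueError("Unexpected objective_type", config["type"])


    objective = create_objective({"type": "characters", "n_characters": 4})
    print(objective([1.0, 2.0], [1.5, 1.5]))  # 0.125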
@@ -1,24 +1,16 @@
-from typing import Optional, Dict, Any, Tuple, Union, Callable, List
-from timeit import default_timer as timer
-import tqdm
+from typing import Optional
 from pathlib import Path
 from wasabi import msg
-import thinc
-import thinc.schedules
-from thinc.api import Config, Optimizer, require_gpu, fix_random_seed, set_gpu_allocator
-import random
+from thinc.api import Config
 import typer
 import logging
 
-from .init_pipeline import init_pipeline
-from .init_pipeline import create_before_to_disk_callback
 from ._util import app, Arg, Opt, parse_config_overrides, show_validation_error
-from ._util import import_code
+from ._util import import_code, CliLogger, setup_gpu
 from ..language import Language
+from ..training.loop import train
+from ..training.initialize import init_nlp, must_reinitialize
 from .. import util
-from ..errors import Errors
-from ..util import resolve_dot_names, registry
-from ..schemas import ConfigSchemaTraining
 
 
 @app.command(
@@ -52,31 +44,33 @@ def train_cli(
     verify_cli_args(config_path, output_path)
     overrides = parse_config_overrides(ctx.args)
     import_code(code_path)
-    if use_gpu >= 0:
-        msg.info(f"Using GPU: {use_gpu}")
-        require_gpu(use_gpu)
-    else:
-        msg.info("Using CPU")
-    config = util.load_config(config_path, overrides=overrides, interpolate=False)
+    setup_gpu(use_gpu)
+    with show_validation_error(config_path):
+        config = util.load_config(config_path, overrides=overrides, interpolate=False)
     msg.divider("Initializing pipeline")
-    nlp = init_nlp(config, output_path)
+    nlp = init_pipeline(config, output_path, use_gpu=use_gpu)
     msg.divider("Training pipeline")
-    train(nlp, output_path, use_gpu=use_gpu)
+    final_path = train(nlp, output_path, use_gpu=use_gpu, logger=CliLogger)
+    if final_path:
+        msg.good(f"Saved pipeline to output directory", final_path)
 
 
-def init_nlp(config: Config, output_path: Optional[Path]) -> Language:
+def init_pipeline(
+    config: Config, output_path: Optional[Path], *, use_gpu: int = -1
+) -> Language:
+    init_kwargs = {"use_gpu": use_gpu, "logger": CliLogger, "on_success": msg.good}
     if output_path is not None:
         init_path = output_path / "model-initial"
         if not init_path.exists():
             msg.info(f"Initializing the pipeline in {init_path}")
-            nlp = init_pipeline(config)
+            nlp = init_nlp(config, **init_kwargs)
             nlp.to_disk(init_path)
             msg.good(f"Saved initialized pipeline to {init_path}")
         else:
             nlp = util.load_model(init_path)
             if must_reinitialize(config, nlp.config):
                 msg.warn("Config has changed: need to re-initialize pipeline")
-                nlp = init_pipeline(config)
+                nlp = init_nlp(config, **init_kwargs)
                 nlp.to_disk(init_path)
                 msg.good(f"Re-initialized pipeline in {init_path}")
             else:
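The reworked init_pipeline() above caches the initialized pipeline under output/model-initial and only redoes the work when must_reinitialize() says the config changed. The decision logic, reduced to a runnable toy (file contents stand in for the saved pipeline):

    import tempfile
    from pathlib import Path


    def init_with_cache(config: str, output_path: Path) -> str:
        init_path = output_path / "model-initial"
        marker = init_path / "config.cfg"
        if not init_path.exists():
            init_path.mkdir(parents=True)    # first run: initialize and save
            marker.write_text(config)
            return f"initialized({config})"
        if marker.read_text() != config:     # stands in for must_reinitialize()
            marker.write_text(config)        # config changed: re-initialize
            return f"re-initialized({config})"
        return f"loaded({config})"           # reuse the cached pipeline


    out = Path(tempfile.mkdtemp())
    print(init_with_cache("cfg-v1", out))  # initialized(cfg-v1)
    print(init_with_cache("cfg-v1", out))  # loaded(cfg-v1)
    print(init_with_cache("cfg-v2", out))  # re-initialized(cfg-v2)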
@@ -88,279 +82,7 @@ def init_nlp(config: Config, output_path: Optional[Path]) -> Language:
         "the vocabulary, vectors and label scheme. To take advantage of this, "
         "provide an output directory."
     )
-    return init_pipeline(config)
+    return init_nlp(config, **init_kwargs)
-
-
-def train(
-    nlp: Language, output_path: Optional[Path] = None, *, use_gpu: int = -1
-) -> None:
-    # Create iterator, which yields out info after each optimization step.
-    config = nlp.config.interpolate()
-    if config["training"]["seed"] is not None:
-        fix_random_seed(config["training"]["seed"])
-    allocator = config["training"]["gpu_allocator"]
-    if use_gpu >= 0 and allocator:
-        set_gpu_allocator(allocator)
-    T = registry.resolve(config["training"], schema=ConfigSchemaTraining)
-    dot_names = [T["train_corpus"], T["dev_corpus"]]
-    train_corpus, dev_corpus = resolve_dot_names(config, dot_names)
-    optimizer = T["optimizer"]
-    score_weights = T["score_weights"]
-    batcher = T["batcher"]
-    train_logger = T["logger"]
-    before_to_disk = create_before_to_disk_callback(T["before_to_disk"])
-    # Components that shouldn't be updated during training
-    frozen_components = T["frozen_components"]
-    # Create iterator, which yields out info after each optimization step.
-    training_step_iterator = train_while_improving(
-        nlp,
-        optimizer,
-        create_train_batches(train_corpus(nlp), batcher, T["max_epochs"]),
-        create_evaluation_callback(nlp, dev_corpus, score_weights),
-        dropout=T["dropout"],
-        accumulate_gradient=T["accumulate_gradient"],
-        patience=T["patience"],
-        max_steps=T["max_steps"],
-        eval_frequency=T["eval_frequency"],
-        exclude=frozen_components,
-    )
-    msg.info(f"Pipeline: {nlp.pipe_names}")
-    if frozen_components:
-        msg.info(f"Frozen components: {frozen_components}")
-    msg.info(f"Initial learn rate: {optimizer.learn_rate}")
-    with nlp.select_pipes(disable=frozen_components):
-        print_row, finalize_logger = train_logger(nlp)
-
-    try:
-        progress = tqdm.tqdm(total=T["eval_frequency"], leave=False)
-        progress.set_description(f"Epoch 1")
-        for batch, info, is_best_checkpoint in training_step_iterator:
-            progress.update(1)
-            if is_best_checkpoint is not None:
-                progress.close()
-                print_row(info)
-                if is_best_checkpoint and output_path is not None:
-                    with nlp.select_pipes(disable=frozen_components):
-                        update_meta(T, nlp, info)
-                    with nlp.use_params(optimizer.averages):
-                        nlp = before_to_disk(nlp)
-                        nlp.to_disk(output_path / "model-best")
-                progress = tqdm.tqdm(total=T["eval_frequency"], leave=False)
-                progress.set_description(f"Epoch {info['epoch']}")
-    except Exception as e:
-        finalize_logger()
-        if output_path is not None:
-            # We don't want to swallow the traceback if we don't have a
-            # specific error.
-            msg.warn(
-                f"Aborting and saving the final best model. "
-                f"Encountered exception: {str(e)}"
-            )
-            nlp = before_to_disk(nlp)
-            nlp.to_disk(output_path / "model-final")
-        raise e
-    finally:
-        finalize_logger()
-        if output_path is not None:
-            final_model_path = output_path / "model-final"
-            if optimizer.averages:
-                with nlp.use_params(optimizer.averages):
-                    nlp.to_disk(final_model_path)
-            else:
-                nlp.to_disk(final_model_path)
-            msg.good(f"Saved pipeline to output directory {final_model_path}")
-
-
-def must_reinitialize(train_config: Config, init_config: Config) -> bool:
-    # TODO: do this better and more fine-grained
-    return train_config.interpolate().to_str() == init_config.interpolate().to_str()
-
-
-def add_vectors(nlp: Language, vectors: str) -> None:
-    title = f"Config validation error for vectors {vectors}"
-    desc = (
-        "This typically means that there's a problem in the config.cfg included "
-        "with the packaged vectors. Make sure that the vectors package you're "
-        "loading is compatible with the current version of spaCy."
-    )
-    with show_validation_error(
-        title=title, desc=desc, hint_fill=False, show_config=False
-    ):
-        util.load_vectors_into_model(nlp, vectors)
-
-
-def create_train_batches(iterator, batcher, max_epochs: int):
-    epoch = 0
-    examples = list(iterator)
-    if not examples:
-        # Raise error if no data
-        raise ValueError(Errors.E986)
-    while max_epochs < 1 or epoch != max_epochs:
-        random.shuffle(examples)
-        for batch in batcher(examples):
-            yield epoch, batch
-        epoch += 1
-
-
-def create_evaluation_callback(
-    nlp: Language, dev_corpus: Callable, weights: Dict[str, float]
-) -> Callable[[], Tuple[float, Dict[str, float]]]:
-    weights = {key: value for key, value in weights.items() if value is not None}
-
-    def evaluate() -> Tuple[float, Dict[str, float]]:
-        dev_examples = list(dev_corpus(nlp))
-        scores = nlp.evaluate(dev_examples)
-        # Calculate a weighted sum based on score_weights for the main score.
-        # We can only consider scores that are ints/floats, not dicts like
-        # entity scores per type etc.
-        for key, value in scores.items():
-            if key in weights and not isinstance(value, (int, float)):
-                raise ValueError(Errors.E915.format(name=key, score_type=type(value)))
-        try:
-            weighted_score = sum(
-                scores.get(s, 0.0) * weights.get(s, 0.0) for s in weights
-            )
-        except KeyError as e:
-            keys = list(scores.keys())
-            err = Errors.E983.format(dict="score_weights", key=str(e), keys=keys)
-            raise KeyError(err) from None
-        return weighted_score, scores
-
-    return evaluate
-
-
-def train_while_improving(
-    nlp: Language,
-    optimizer: Optimizer,
-    train_data,
-    evaluate,
-    *,
-    dropout: float,
-    eval_frequency: int,
-    accumulate_gradient: int,
-    patience: int,
-    max_steps: int,
-    exclude: List[str],
-):
-    """Train until an evaluation stops improving. Works as a generator,
-    with each iteration yielding a tuple `(batch, info, is_best_checkpoint)`,
-    where info is a dict, and is_best_checkpoint is in [True, False, None] --
-    None indicating that the iteration was not evaluated as a checkpoint.
-    The evaluation is conducted by calling the evaluate callback.
-
-    Positional arguments:
-    nlp: The spaCy pipeline to evaluate.
-    optimizer: The optimizer callable.
-    train_data (Iterable[Batch]): A generator of batches, with the training
-        data. Each batch should be a Sized[Tuple[Input, Annot]]. The training
-        data iterable needs to take care of iterating over the epochs and
-        shuffling.
-    evaluate (Callable[[], Tuple[float, Any]]): A callback to perform evaluation.
-        The callback should take no arguments and return a tuple
-        `(main_score, other_scores)`. The main_score should be a float where
-        higher is better. other_scores can be any object.
-
-    Every iteration, the function yields out a tuple with:
-
-    * batch: A list of Example objects.
-    * info: A dict with various information about the last update (see below).
-    * is_best_checkpoint: A value in None, False, True, indicating whether this
-        was the best evaluation so far. You should use this to save the model
-        checkpoints during training. If None, evaluation was not conducted on
-        that iteration. False means evaluation was conducted, but a previous
-        evaluation was better.
-
-    The info dict provides the following information:
-
-    epoch (int): How many passes over the data have been completed.
-    step (int): How many steps have been completed.
-    score (float): The main score from the last evaluation.
-    other_scores: : The other scores from the last evaluation.
-    losses: The accumulated losses throughout training.
-    checkpoints: A list of previous results, where each result is a
-        (score, step, epoch) tuple.
-    """
-    if isinstance(dropout, float):
-        dropouts = thinc.schedules.constant(dropout)
-    else:
-        dropouts = dropout
-    results = []
-    losses = {}
-    words_seen = 0
-    start_time = timer()
-    for step, (epoch, batch) in enumerate(train_data):
-        dropout = next(dropouts)
-        for subbatch in subdivide_batch(batch, accumulate_gradient):
-            nlp.update(
-                subbatch, drop=dropout, losses=losses, sgd=False, exclude=exclude
-            )
-        # TODO: refactor this so we don't have to run it separately in here
-        for name, proc in nlp.pipeline:
-            if (
-                name not in exclude
-                and hasattr(proc, "model")
-                and proc.model not in (True, False, None)
-            ):
-                proc.model.finish_update(optimizer)
-        optimizer.step_schedules()
-        if not (step % eval_frequency):
-            if optimizer.averages:
-                with nlp.use_params(optimizer.averages):
-                    score, other_scores = evaluate()
-            else:
-                score, other_scores = evaluate()
-            results.append((score, step))
-            is_best_checkpoint = score == max(results)[0]
-        else:
-            score, other_scores = (None, None)
-            is_best_checkpoint = None
-        words_seen += sum(len(eg) for eg in batch)
-        info = {
-            "epoch": epoch,
-            "step": step,
-            "score": score,
-            "other_scores": other_scores,
-            "losses": losses,
-            "checkpoints": results,
-            "seconds": int(timer() - start_time),
-            "words": words_seen,
-        }
-        yield batch, info, is_best_checkpoint
-        if is_best_checkpoint is not None:
-            losses = {}
-        # Stop if no improvement in `patience` updates (if specified)
-        best_score, best_step = max(results)
-        if patience and (step - best_step) >= patience:
-            break
-        # Stop if we've exhausted our max steps (if specified)
-        if max_steps and step >= max_steps:
-            break
-
-
-def subdivide_batch(batch, accumulate_gradient):
-    batch = list(batch)
-    batch.sort(key=lambda eg: len(eg.predicted))
-    sub_len = len(batch) // accumulate_gradient
-    start = 0
-    for i in range(accumulate_gradient):
-        subbatch = batch[start : start + sub_len]
-        if subbatch:
-            yield subbatch
-        start += len(subbatch)
-    subbatch = batch[start:]
-    if subbatch:
-        yield subbatch
-
-
-def update_meta(
-    training: Union[Dict[str, Any], Config], nlp: Language, info: Dict[str, Any]
-) -> None:
-    nlp.meta["performance"] = {}
-    for metric in training["score_weights"]:
-        if metric is not None:
-            nlp.meta["performance"][metric] = info["other_scores"].get(metric, 0.0)
-    for pipe_name in nlp.pipe_names:
-        nlp.meta["performance"][f"{pipe_name}_loss"] = info["losses"][pipe_name]
 
 
 def verify_cli_args(config_path: Path, output_path: Optional[Path] = None) -> None:
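The training loop removed above (train(), train_while_improving() and friends) continues in spacy/training/loop.py. Its central idea — a generator that yields after every step and flags best checkpoints so the caller decides when to save — in miniature, with a random score in place of real evaluation:

    import random


    def train_while_improving(max_steps: int, eval_frequency: int, patience: int):
        results = []  # (score, step) checkpoints, like the real loop keeps
        for step in range(max_steps):
            if step % eval_frequency == 0:
                score = random.random()              # stands in for evaluate()
                results.append((score, step))
                is_best = score == max(results)[0]
            else:
                is_best = None                       # not an evaluation step
            yield step, is_best
            best_score, best_step = max(results)
            if patience and (step - best_step) >= patience:
                break                                # no improvement: stop early


    for step, is_best in train_while_improving(max_steps=100, eval_frequency=10, patience=30):
        if is_best:
            pass  # the caller would save "model-best" here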
@@ -371,17 +93,3 @@ def verify_cli_args(config_path: Path, output_path: Optional[Path] = None) -> None:
         if not output_path.exists():
             output_path.mkdir()
             msg.good(f"Created output directory: {output_path}")
-
-
-# TODO: this is currently imported by the ray extension and not used otherwise
-def load_from_paths(
-    config: Config,
-) -> Tuple[List[Dict[str, str]], Dict[str, dict], bytes]:
-    weights_data = None
-    init_tok2vec = util.ensure_path(config["training"]["init_tok2vec"])
-    if init_tok2vec is not None:
-        if not init_tok2vec.exists():
-            msg.fail("Can't find pretrained tok2vec", init_tok2vec, exits=1)
-        with init_tok2vec.open("rb") as file_:
-            weights_data = file_.read()
-    return None, {}, {}, weights_data
@@ -9,10 +9,10 @@ from spacy.pipeline import TextCategorizer
 from spacy.tokens import Doc
 from spacy.pipeline.tok2vec import DEFAULT_TOK2VEC_MODEL
 from spacy.scorer import Scorer
+from spacy.training import Example
+from spacy.training.initialize import verify_textcat_config
 from ..util import make_tempdir
-from ...cli.train import verify_textcat_config
-from ...training import Example


 TRAIN_DATA = [
@@ -7,7 +7,6 @@ from spacy.cli.init_config import init_config, RECOMMENDATIONS
 from spacy.cli._util import validate_project_commands, parse_config_overrides
 from spacy.cli._util import load_project_config, substitute_project_variables
 from spacy.cli._util import string_to_list, OVERRIDES_ENV_VAR
-from spacy.cli.debug_config import check_section_refs
 from thinc.api import ConfigValidationError, Config
 import srsly
 import os
@@ -414,15 +413,3 @@ def test_string_to_list(value):
 def test_string_to_list_intify(value):
     assert string_to_list(value, intify=False) == ["1", "2", "3"]
     assert string_to_list(value, intify=True) == [1, 2, 3]
-
-
-def test_check_section_refs():
-    config = {"a": {"b": {"c": "a.d.e"}, "d": {"e": 1}}, "f": {"g": "d.f"}}
-    config = Config(config)
-    # Valid section reference
-    check_section_refs(config, ["a.b.c"])
-    # Section that doesn't exist in this config
-    check_section_refs(config, ["x.y.z"])
-    # Invalid section reference
-    with pytest.raises(ConfigValidationError):
-        check_section_refs(config, ["a.b.c", "f.g"])
@@ -7,7 +7,6 @@ from spacy import util
 from spacy import prefer_gpu, require_gpu
 from spacy.ml._precomputable_affine import PrecomputableAffine
 from spacy.ml._precomputable_affine import _backprop_precomputable_affine_padding
-from thinc.api import Optimizer


 @pytest.fixture
@@ -158,16 +157,3 @@ def test_dot_to_dict(dot_notation, expected):
     result = util.dot_to_dict(dot_notation)
     assert result == expected
     assert util.dict_to_dot(result) == dot_notation
-
-
-def test_resolve_training_config():
-    config = {
-        "nlp": {"lang": "en", "disabled": []},
-        "training": {"dropout": 0.1, "optimizer": {"@optimizers": "Adam.v1"}},
-        "corpora": {},
-    }
-    resolved = util.resolve_training_config(config)
-    assert resolved["training"]["dropout"] == 0.1
-    assert isinstance(resolved["training"]["optimizer"], Optimizer)
-    assert resolved["corpora"] == {}
-    assert "nlp" not in resolved
@@ -1,14 +1,15 @@
 import pytest
-
-from .util import get_random_doc
-
 from spacy import util
 from spacy.util import dot_to_object, SimpleFrozenList
-from thinc.api import Config, Optimizer
+from thinc.api import Config, Optimizer, ConfigValidationError
 from spacy.training.batchers import minibatch_by_words
-from ..lang.en import English
+from spacy.lang.en import English
-from ..lang.nl import Dutch
+from spacy.lang.nl import Dutch
-from ..language import DEFAULT_CONFIG_PATH
+from spacy.language import DEFAULT_CONFIG_PATH
+from spacy.schemas import ConfigSchemaTraining
+
+from .util import get_random_doc


 @pytest.mark.parametrize(
@@ -101,8 +102,8 @@ def test_util_dot_section():
     dot_to_object(en_nlp.config, "nlp.pipeline.tagger")
     with pytest.raises(KeyError):
         dot_to_object(en_nlp.config, "nlp.unknownattribute")
-    resolved = util.resolve_training_config(nl_nlp.config)
+    T = util.registry.resolve(nl_nlp.config["training"], schema=ConfigSchemaTraining)
-    assert isinstance(dot_to_object(resolved, "training.optimizer"), Optimizer)
+    assert isinstance(dot_to_object({"training": T}, "training.optimizer"), Optimizer)


 def test_simple_frozen_list():
@@ -120,3 +121,17 @@ def test_simple_frozen_list():
     t = SimpleFrozenList(["foo", "bar"], error="Error!")
     with pytest.raises(NotImplementedError):
         t.append("baz")
+
+
+def test_resolve_dot_names():
+    config = {
+        "training": {"optimizer": {"@optimizers": "Adam.v1"}},
+        "foo": {"bar": "training.optimizer", "baz": "training.xyz"},
+    }
+    result = util.resolve_dot_names(config, ["foo.bar"])
+    assert isinstance(result[0], Optimizer)
+    with pytest.raises(ConfigValidationError) as e:
+        util.resolve_dot_names(config, ["foo.baz", "foo.bar"])
+    errors = e.value.errors
+    assert len(errors) == 1
+    assert errors[0]["loc"] == ["training", "xyz"]
@@ -2,8 +2,8 @@ from typing import Dict, Iterable, Callable
 import pytest
 from thinc.api import Config
 from spacy import Language
-from spacy.util import load_model_from_config, registry, dot_to_object
+from spacy.util import load_model_from_config, registry, resolve_dot_names
-from spacy.util import resolve_training_config
+from spacy.schemas import ConfigSchemaTraining
 from spacy.training import Example
@@ -39,21 +39,21 @@ def test_readers():
     config = Config().from_str(config_string)
     nlp = load_model_from_config(config, auto_fill=True)
-    resolved = resolve_training_config(nlp.config)
+    dot_names = ["training.train_corpus", "training.dev_corpus"]
-    train_corpus = dot_to_object(resolved, resolved["training"]["train_corpus"])
+    train_corpus, dev_corpus = resolve_dot_names(nlp.config, dot_names)
     assert isinstance(train_corpus, Callable)
-    optimizer = resolved["training"]["optimizer"]
+    T = registry.resolve(nlp.config["training"], schema=ConfigSchemaTraining)
+    optimizer = T["optimizer"]
     # simulate a training loop
     nlp.begin_training(lambda: train_corpus(nlp), sgd=optimizer)
     for example in train_corpus(nlp):
         nlp.update([example], sgd=optimizer)
-    dev_corpus = dot_to_object(resolved, resolved["training"]["dev_corpus"])
     scores = nlp.evaluate(list(dev_corpus(nlp)))
     assert scores["cats_score"]
     # ensure the pipeline runs
     doc = nlp("Quick test")
     assert doc.cats
-    extra_corpus = resolved["corpora"]["extra"]
+    extra_corpus = registry.resolve(nlp.config["corpora"])["extra"]
     assert isinstance(extra_corpus, Callable)
@@ -89,9 +89,10 @@ def test_cat_readers(reader, additional_config):
     config["corpora"]["@readers"] = reader
     config["corpora"].update(additional_config)
     nlp = load_model_from_config(config, auto_fill=True)
-    resolved = resolve_training_config(nlp.config)
+    dot_names = ["training.train_corpus", "training.dev_corpus"]
-    train_corpus = dot_to_object(resolved, resolved["training"]["train_corpus"])
+    train_corpus, dev_corpus = resolve_dot_names(nlp.config, dot_names)
-    optimizer = resolved["training"]["optimizer"]
+    T = registry.resolve(nlp.config["training"], schema=ConfigSchemaTraining)
+    optimizer = T["optimizer"]
     # simulate a training loop
     nlp.begin_training(lambda: train_corpus(nlp), sgd=optimizer)
     for example in train_corpus(nlp):
@@ -100,7 +101,6 @@ def test_cat_readers(reader, additional_config):
         assert sorted(list(set(example.y.cats.values()))) == [0.0, 1.0]
         nlp.update([example], sgd=optimizer)
     # simulate performance benchmark on dev corpus
-    dev_corpus = dot_to_object(resolved, resolved["training"]["dev_corpus"])
     dev_examples = list(dev_corpus(nlp))
     for example in dev_examples:
         # this shouldn't fail if each dev example has at least one positive label
spacy/training/initialize.py (new file, 205 lines)
@@ -0,0 +1,205 @@
+from typing import Union, Dict, Optional, Any, List, Callable
+from thinc.api import Config, fix_random_seed, set_gpu_allocator
+from thinc.api import ConfigValidationError
+from pathlib import Path
+import srsly
+
+from .loop import create_before_to_disk_callback
+from ..language import Language
+from ..lookups import Lookups
+from ..errors import Errors
+from ..schemas import ConfigSchemaTraining, ConfigSchemaInit, ConfigSchemaPretrain
+from ..util import registry, load_model_from_config, resolve_dot_names
+from ..util import load_model, ensure_path, logger, OOV_RANK, DEFAULT_OOV_PROB
+
+
+def init_nlp(
+    config: Config,
+    *,
+    use_gpu: int = -1,
+    logger: Callable[[Any], Any] = logger,
+    on_success: Callable[[str], None] = lambda x: None,
+) -> Language:
+    raw_config = config
+    config = raw_config.interpolate()
+    if config["training"]["seed"] is not None:
+        fix_random_seed(config["training"]["seed"])
+    allocator = config["training"]["gpu_allocator"]
+    if use_gpu >= 0 and allocator:
+        set_gpu_allocator(allocator)
+    # Use original config here before it's resolved to functions
+    sourced_components = get_sourced_components(config)
+    nlp = load_model_from_config(raw_config, auto_fill=True)
+    on_success("Set up nlp object from config")
+    config = nlp.config.interpolate()
+    # Resolve all training-relevant sections using the filled nlp config
+    T = registry.resolve(config["training"], schema=ConfigSchemaTraining)
+    dot_names = [T["train_corpus"], T["dev_corpus"]]
+    train_corpus, dev_corpus = resolve_dot_names(config, dot_names)
+    I = registry.resolve(config["initialize"], schema=ConfigSchemaInit)
+    V = I["vocab"]
+    init_vocab(nlp, data=V["data"], lookups=V["lookups"], vectors=V["vectors"])
+    optimizer = T["optimizer"]
+    before_to_disk = create_before_to_disk_callback(T["before_to_disk"])
+    # Components that shouldn't be updated during training
+    frozen_components = T["frozen_components"]
+    # Sourced components that require resume_training
+    resume_components = [p for p in sourced_components if p not in frozen_components]
+    logger.info(f"Pipeline: {nlp.pipe_names}")
+    if resume_components:
+        with nlp.select_pipes(enable=resume_components):
+            logger.info(f"Resuming training for: {resume_components}")
+            nlp.resume_training(sgd=optimizer)
+    with nlp.select_pipes(disable=[*frozen_components, *resume_components]):
+        nlp.begin_training(lambda: train_corpus(nlp), sgd=optimizer)
+    on_success(f"Initialized pipeline components")
+    # Verify the config after calling 'begin_training' to ensure labels
+    # are properly initialized
+    verify_config(nlp)
+    if "pretraining" in config and config["pretraining"]:
+        P = registry.resolve(config["pretraining"], schema=ConfigSchemaPretrain)
+        loaded = add_tok2vec_weights(nlp, P, I)
+        if loaded and P["component"]:
+            on_success(f"Loaded pretrained weights into component '{P['component']}'")
+    nlp = before_to_disk(nlp)
+    return nlp
+
+
+def must_reinitialize(train_config: Config, init_config: Config) -> bool:
+    # TODO: do this better and more fine-grained
+    return train_config.interpolate().to_str() == init_config.interpolate().to_str()
+
+
+def init_vocab(
+    nlp: Language,
+    *,
+    data: Optional[Path] = None,
+    lookups: Optional[Lookups] = None,
+    vectors: Optional[str] = None,
+    on_success: Callable[[str], None] = lambda x: None,
+) -> Language:
+    if lookups:
+        nlp.vocab.lookups = lookups
+        on_success(f"Added vocab lookups: {', '.join(lookups.tables)}")
+    data_path = ensure_path(data)
+    if data_path is not None:
+        lex_attrs = srsly.read_jsonl(data_path)
+        for lexeme in nlp.vocab:
+            lexeme.rank = OOV_RANK
+        for attrs in lex_attrs:
+            if "settings" in attrs:
+                continue
+            lexeme = nlp.vocab[attrs["orth"]]
+            lexeme.set_attrs(**attrs)
+        if len(nlp.vocab):
+            oov_prob = min(lex.prob for lex in nlp.vocab) - 1
+        else:
+            oov_prob = DEFAULT_OOV_PROB
+        nlp.vocab.cfg.update({"oov_prob": oov_prob})
+        on_success(f"Added {len(nlp.vocab)} lexical entries to the vocab")
+    on_success("Created vocabulary")
+    if vectors is not None:
+        load_vectors_into_model(nlp, vectors)
+        on_success(f"Added vectors: {vectors}")
+
+
+def load_vectors_into_model(
+    nlp: "Language", name: Union[str, Path], *, add_strings: bool = True
+) -> None:
+    """Load word vectors from an installed model or path into a model instance."""
+    try:
+        vectors_nlp = load_model(name)
+    except ConfigValidationError as e:
+        title = f"Config validation error for vectors {name}"
+        desc = (
+            "This typically means that there's a problem in the config.cfg included "
+            "with the packaged vectors. Make sure that the vectors package you're "
+            "loading is compatible with the current version of spaCy."
+        )
+        err = ConfigValidationError.from_error(config=None, title=title, desc=desc)
+        raise err from None
+    nlp.vocab.vectors = vectors_nlp.vocab.vectors
+    if add_strings:
+        # I guess we should add the strings from the vectors_nlp model?
+        # E.g. if someone does a similarity query, they might expect the strings.
+        for key in nlp.vocab.vectors.key2row:
+            if key in vectors_nlp.vocab.strings:
+                nlp.vocab.strings.add(vectors_nlp.vocab.strings[key])
+
+
+def add_tok2vec_weights(
+    nlp: Language, pretrain_config: Dict[str, Any], vocab_config: Dict[str, Any]
+) -> bool:
+    # Load pretrained tok2vec weights - cf. CLI command 'pretrain'
+    P = pretrain_config
+    V = vocab_config
+    weights_data = None
+    init_tok2vec = ensure_path(V["init_tok2vec"])
+    if init_tok2vec is not None:
+        if P["objective"].get("type") == "vectors" and not V["vectors"]:
+            err = 'need initialize.vectors if pretraining.objective.type is "vectors"'
+            errors = [{"loc": ["initialize", "vectors"], "msg": err}]
+            raise ConfigValidationError(config=nlp.config, errors=errors)
+        if not init_tok2vec.exists():
+            err = f"can't find pretrained tok2vec: {init_tok2vec}"
+            errors = [{"loc": ["initialize", "vectors", "init_tok2vec"], "msg": err}]
+            raise ConfigValidationError(config=nlp.config, errors=errors)
+        with init_tok2vec.open("rb") as file_:
+            weights_data = file_.read()
+    if weights_data is not None:
+        tok2vec_component = P["component"]
+        if tok2vec_component is None:
+            desc = (
+                f"To use pretrained tok2vec weights, [pretraining.component] "
+                f"needs to specify the component that should load them."
+            )
+            err = "component can't be null"
+            errors = [{"loc": ["pretraining", "component"], "msg": err}]
+            raise ConfigValidationError(
+                config=nlp.config["pretraining"], errors=errors, desc=desc
+            )
+        layer = nlp.get_pipe(tok2vec_component).model
+        if P["layer"]:
+            layer = layer.get_ref(P["layer"])
+        layer.from_bytes(weights_data)
+        return True
+    return False
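
For orientation, the resolved sections passed into `add_tok2vec_weights()` look roughly like the dicts below once `registry.resolve()` has run on the `[pretraining]` and `[initialize]` blocks. This is a sketch with invented values, not output of the function:

    P = {  # resolved [pretraining] section
        "component": "tok2vec",  # pipe that should load the weights
        "layer": "",             # optional named ref inside that pipe's model
        "objective": {"type": "characters", "n_characters": 4},
    }
    V = {  # resolved [initialize] section, as passed for vocab_config
        "init_tok2vec": "pretrained/model99.bin",  # hypothetical weights path
        "vectors": None,
    }
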
+
+
+def verify_config(nlp: Language) -> None:
+    """Perform additional checks based on the config, loaded nlp object and training data."""
+    # TODO: maybe we should validate based on the actual components, the list
+    # in config["nlp"]["pipeline"] instead?
+    for pipe_config in nlp.config["components"].values():
+        # We can't assume that the component name == the factory
+        factory = pipe_config["factory"]
+        if factory == "textcat":
+            verify_textcat_config(nlp, pipe_config)
+
+
+def verify_textcat_config(nlp: Language, pipe_config: Dict[str, Any]) -> None:
+    # if 'positive_label' is provided: double check whether it's in the data and
+    # the task is binary
+    if pipe_config.get("positive_label"):
+        textcat_labels = nlp.get_pipe("textcat").labels
+        pos_label = pipe_config.get("positive_label")
+        if pos_label not in textcat_labels:
+            raise ValueError(
+                Errors.E920.format(pos_label=pos_label, labels=textcat_labels)
+            )
+        if len(list(textcat_labels)) != 2:
+            raise ValueError(
+                Errors.E919.format(pos_label=pos_label, labels=textcat_labels)
+            )
+
+
+def get_sourced_components(config: Union[Dict[str, Any], Config]) -> List[str]:
+    """RETURNS (List[str]): All sourced components in the original config,
+    e.g. {"source": "en_core_web_sm"}. If the config contains a key
+    "factory", we assume it refers to a component factory.
+    """
+    return [
+        name
+        for name, cfg in config.get("components", {}).items()
+        if "factory" not in cfg and "source" in cfg
+    ]
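
A quick sketch of the distinction this helper draws, using an invented components block -- only entries with `"source"` and no `"factory"` count as sourced:

    config = {
        "components": {
            "ner": {"source": "en_core_web_sm"},   # sourced from a pipeline
            "tagger": {"factory": "tagger"},       # created from a factory
        }
    }
    assert get_sourced_components(config) == ["ner"]
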
spacy/training/loop.py (new file, 301 lines)
@@ -0,0 +1,301 @@
+from typing import List, Callable, Tuple, Dict, Iterable, Iterator, Union, Any
+from typing import Optional
+from pathlib import Path
+from timeit import default_timer as timer
+from thinc.api import Optimizer, Config, constant, fix_random_seed, set_gpu_allocator
+import random
+import tqdm
+
+from .example import Example
+from ..schemas import ConfigSchemaTraining
+from ..language import Language
+from ..errors import Errors
+from ..util import resolve_dot_names, registry, logger
+
+
+def train(
+    nlp: Language,
+    output_path: Optional[Path] = None,
+    *,
+    use_gpu: int = -1,
+    logger: Callable[[Any], Any] = logger,
+) -> Optional[Path]:
+    """Train a pipeline.
+
+    nlp (Language): The initialized nlp object with the full config.
+    output_path (Path): Optional output path to save trained model to.
+    use_gpu (int): Whether to train on GPU. Make sure to call require_gpu
+        before calling this function.
+    logger (Callable[[Any], Any]): Optional logger exposing the methods info,
+        error, debug and warn. Defaults to regular spaCy logger but can be
+        swapped for CLI logger.
+    RETURNS (Path / None): The path to the final exported model.
+    """
+    config = nlp.config.interpolate()
+    if config["training"]["seed"] is not None:
+        fix_random_seed(config["training"]["seed"])
+    allocator = config["training"]["gpu_allocator"]
+    if use_gpu >= 0 and allocator:
+        set_gpu_allocator(allocator)
+    T = registry.resolve(config["training"], schema=ConfigSchemaTraining)
+    dot_names = [T["train_corpus"], T["dev_corpus"]]
+    train_corpus, dev_corpus = resolve_dot_names(config, dot_names)
+    optimizer = T["optimizer"]
+    score_weights = T["score_weights"]
+    batcher = T["batcher"]
+    train_logger = T["logger"]
+    before_to_disk = create_before_to_disk_callback(T["before_to_disk"])
+    # Components that shouldn't be updated during training
+    frozen_components = T["frozen_components"]
+    # Create iterator, which yields out info after each optimization step.
+    training_step_iterator = train_while_improving(
+        nlp,
+        optimizer,
+        create_train_batches(train_corpus(nlp), batcher, T["max_epochs"]),
+        create_evaluation_callback(nlp, dev_corpus, score_weights),
+        dropout=T["dropout"],
+        accumulate_gradient=T["accumulate_gradient"],
+        patience=T["patience"],
+        max_steps=T["max_steps"],
+        eval_frequency=T["eval_frequency"],
+        exclude=frozen_components,
+    )
+    logger.info(f"Pipeline: {nlp.pipe_names}")
+    if frozen_components:
+        logger.info(f"Frozen components: {frozen_components}")
+    logger.info(f"Initial learn rate: {optimizer.learn_rate}")
+    with nlp.select_pipes(disable=frozen_components):
+        print_row, finalize_logger = train_logger(nlp)
+    try:
+        progress = tqdm.tqdm(total=T["eval_frequency"], leave=False)
+        progress.set_description(f"Epoch 1")
+        for batch, info, is_best_checkpoint in training_step_iterator:
+            progress.update(1)
+            if is_best_checkpoint is not None:
+                progress.close()
+                print_row(info)
+                if is_best_checkpoint and output_path is not None:
+                    with nlp.select_pipes(disable=frozen_components):
+                        update_meta(T, nlp, info)
+                    with nlp.use_params(optimizer.averages):
+                        nlp = before_to_disk(nlp)
+                        nlp.to_disk(output_path / "model-best")
+                progress = tqdm.tqdm(total=T["eval_frequency"], leave=False)
+                progress.set_description(f"Epoch {info['epoch']}")
+    except Exception as e:
+        finalize_logger()
+        if output_path is not None:
+            # We don't want to swallow the traceback if we don't have a
+            # specific error.
+            logger.warn(
+                f"Aborting and saving the final best model. "
+                f"Encountered exception: {str(e)}"
+            )
+            nlp = before_to_disk(nlp)
+            nlp.to_disk(output_path / "model-final")
+        raise e
+    finally:
+        finalize_logger()
+    if output_path is not None:
+        final_model_path = output_path / "model-final"
+        if optimizer.averages:
+            with nlp.use_params(optimizer.averages):
+                nlp.to_disk(final_model_path)
+        else:
+            nlp.to_disk(final_model_path)
+        return final_model_path
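
Taken together with `init_nlp()` from spacy/training/initialize.py above, the intended call sequence is roughly the sketch below, assuming a config on disk and an output directory (the paths are illustrative, not a documented entry point):

    from pathlib import Path
    from thinc.api import Config
    from spacy.training.initialize import init_nlp
    from spacy.training.loop import train

    config = Config().from_disk("config.cfg")  # hypothetical path
    nlp = init_nlp(config)
    train(nlp, Path("output"))  # saves model-best / model-final checkpoints
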
+
+
+def train_while_improving(
+    nlp: Language,
+    optimizer: Optimizer,
+    train_data,
+    evaluate,
+    *,
+    dropout: float,
+    eval_frequency: int,
+    accumulate_gradient: int,
+    patience: int,
+    max_steps: int,
+    exclude: List[str],
+):
+    """Train until an evaluation stops improving. Works as a generator,
+    with each iteration yielding a tuple `(batch, info, is_best_checkpoint)`,
+    where info is a dict, and is_best_checkpoint is in [True, False, None] --
+    None indicating that the iteration was not evaluated as a checkpoint.
+    The evaluation is conducted by calling the evaluate callback.
+
+    Positional arguments:
+        nlp: The spaCy pipeline to evaluate.
+        optimizer: The optimizer callable.
+        train_data (Iterable[Batch]): A generator of batches, with the training
+            data. Each batch should be a Sized[Tuple[Input, Annot]]. The training
+            data iterable needs to take care of iterating over the epochs and
+            shuffling.
+        evaluate (Callable[[], Tuple[float, Any]]): A callback to perform evaluation.
+            The callback should take no arguments and return a tuple
+            `(main_score, other_scores)`. The main_score should be a float where
+            higher is better. other_scores can be any object.
+
+    Every iteration, the function yields out a tuple with:
+
+    * batch: A list of Example objects.
+    * info: A dict with various information about the last update (see below).
+    * is_best_checkpoint: A value in None, False, True, indicating whether this
+        was the best evaluation so far. You should use this to save the model
+        checkpoints during training. If None, evaluation was not conducted on
+        that iteration. False means evaluation was conducted, but a previous
+        evaluation was better.
+
+    The info dict provides the following information:
+
+        epoch (int): How many passes over the data have been completed.
+        step (int): How many steps have been completed.
+        score (float): The main score from the last evaluation.
+        other_scores: The other scores from the last evaluation.
+        losses: The accumulated losses throughout training.
+        checkpoints: A list of previous results, where each result is a
+            (score, step, epoch) tuple.
+    """
+    if isinstance(dropout, float):
+        dropouts = constant(dropout)
+    else:
+        dropouts = dropout
+    results = []
+    losses = {}
+    words_seen = 0
+    start_time = timer()
+    for step, (epoch, batch) in enumerate(train_data):
+        dropout = next(dropouts)
+        for subbatch in subdivide_batch(batch, accumulate_gradient):
+            nlp.update(
+                subbatch, drop=dropout, losses=losses, sgd=False, exclude=exclude
+            )
+            # TODO: refactor this so we don't have to run it separately in here
+            for name, proc in nlp.pipeline:
+                if (
+                    name not in exclude
+                    and hasattr(proc, "model")
+                    and proc.model not in (True, False, None)
+                ):
+                    proc.model.finish_update(optimizer)
+        optimizer.step_schedules()
+        if not (step % eval_frequency):
+            if optimizer.averages:
+                with nlp.use_params(optimizer.averages):
+                    score, other_scores = evaluate()
+            else:
+                score, other_scores = evaluate()
+            results.append((score, step))
+            is_best_checkpoint = score == max(results)[0]
+        else:
+            score, other_scores = (None, None)
+            is_best_checkpoint = None
+        words_seen += sum(len(eg) for eg in batch)
+        info = {
+            "epoch": epoch,
+            "step": step,
+            "score": score,
+            "other_scores": other_scores,
+            "losses": losses,
+            "checkpoints": results,
+            "seconds": int(timer() - start_time),
+            "words": words_seen,
+        }
+        yield batch, info, is_best_checkpoint
+        if is_best_checkpoint is not None:
+            losses = {}
+        # Stop if no improvement in `patience` updates (if specified)
+        best_score, best_step = max(results)
+        if patience and (step - best_step) >= patience:
+            break
+        # Stop if we've exhausted our max steps (if specified)
+        if max_steps and step >= max_steps:
+            break
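
A minimal sketch of driving this generator directly, assuming `nlp`, `optimizer`, a batch iterator and an `evaluate` callback are set up as in `train()` above (the hyperparameters are placeholders):

    for batch, info, is_best_checkpoint in train_while_improving(
        nlp,
        optimizer,
        train_data,
        evaluate,
        dropout=0.1,
        eval_frequency=200,
        accumulate_gradient=1,
        patience=1600,
        max_steps=20000,
        exclude=[],
    ):
        if is_best_checkpoint:
            print("new best score", info["score"], "at step", info["step"])
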
+
+
+def subdivide_batch(batch, accumulate_gradient):
+    batch = list(batch)
+    batch.sort(key=lambda eg: len(eg.predicted))
+    sub_len = len(batch) // accumulate_gradient
+    start = 0
+    for i in range(accumulate_gradient):
+        subbatch = batch[start : start + sub_len]
+        if subbatch:
+            yield subbatch
+        start += len(subbatch)
+    subbatch = batch[start:]
+    if subbatch:
+        yield subbatch
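
This is the half of gradient accumulation that lives outside the optimizer: each outer batch is length-sorted and cut into `accumulate_gradient` sub-batches, and `finish_update()` only runs once per outer batch. A sketch of the splitting behaviour, assuming `subdivide_batch` from above is in scope and using stand-in objects since only `len(eg.predicted)` is needed:

    from types import SimpleNamespace

    # Stand-ins for Example objects: the sort key only needs len(eg.predicted)
    egs = [SimpleNamespace(predicted="x" * n) for n in (5, 1, 4, 2, 3)]
    subs = list(subdivide_batch(egs, accumulate_gradient=2))
    # 5 examples with sub_len 5 // 2 == 2 yield sub-batches of sizes 2, 2 and 1
    assert [len(s) for s in subs] == [2, 2, 1]
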
+
+
+def create_evaluation_callback(
+    nlp: Language, dev_corpus: Callable, weights: Dict[str, float]
+) -> Callable[[], Tuple[float, Dict[str, float]]]:
+    weights = {key: value for key, value in weights.items() if value is not None}
+
+    def evaluate() -> Tuple[float, Dict[str, float]]:
+        dev_examples = list(dev_corpus(nlp))
+        scores = nlp.evaluate(dev_examples)
+        # Calculate a weighted sum based on score_weights for the main score.
+        # We can only consider scores that are ints/floats, not dicts like
+        # entity scores per type etc.
+        for key, value in scores.items():
+            if key in weights and not isinstance(value, (int, float)):
+                raise ValueError(Errors.E915.format(name=key, score_type=type(value)))
+        try:
+            weighted_score = sum(
+                scores.get(s, 0.0) * weights.get(s, 0.0) for s in weights
+            )
+        except KeyError as e:
+            keys = list(scores.keys())
+            err = Errors.E983.format(dict="score_weights", key=str(e), keys=keys)
+            raise KeyError(err) from None
+        return weighted_score, scores
+
+    return evaluate
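
The main score is just the dot product of the weights and the matching scalar scores; unweighted metrics are ignored. A worked sketch with made-up numbers:

    weights = {"dep_uas": 0.5, "dep_las": 0.5}
    scores = {"dep_uas": 0.9, "dep_las": 0.8, "ents_f": 0.7}  # ents_f unweighted
    weighted_score = sum(scores.get(s, 0.0) * weights.get(s, 0.0) for s in weights)
    assert abs(weighted_score - 0.85) < 1e-8  # 0.5 * 0.9 + 0.5 * 0.8
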
+
+
+def create_train_batches(
+    iterator: Iterator[Example],
+    batcher: Callable[[Iterable[Example]], Iterable[Example]],
+    max_epochs: int,
+):
+    epoch = 0
+    examples = list(iterator)
+    if not examples:
+        # Raise error if no data
+        raise ValueError(Errors.E986)
+    while max_epochs < 1 or epoch != max_epochs:
+        random.shuffle(examples)
+        for batch in batcher(examples):
+            yield epoch, batch
+        epoch += 1
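
Because each batch is yielded together with its epoch, and `max_epochs < 1` means "iterate forever", the consumer never needs its own epoch counter. A toy sketch with integers standing in for examples:

    def toy_batcher(examples):  # fixed batch size of two, for illustration
        for i in range(0, len(examples), 2):
            yield examples[i : i + 2]

    batches = list(create_train_batches(iter([1, 2, 3, 4]), toy_batcher, max_epochs=2))
    assert [epoch for epoch, batch in batches] == [0, 0, 1, 1]
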
+
+
+def update_meta(
+    training: Union[Dict[str, Any], Config], nlp: Language, info: Dict[str, Any]
+) -> None:
+    nlp.meta["performance"] = {}
+    for metric in training["score_weights"]:
+        if metric is not None:
+            nlp.meta["performance"][metric] = info["other_scores"].get(metric, 0.0)
+    for pipe_name in nlp.pipe_names:
+        nlp.meta["performance"][f"{pipe_name}_loss"] = info["losses"][pipe_name]
+
+
+def create_before_to_disk_callback(
+    callback: Optional[Callable[[Language], Language]]
+) -> Callable[[Language], Language]:
+    def before_to_disk(nlp: Language) -> Language:
+        if not callback:
+            return nlp
+        modified_nlp = callback(nlp)
+        if not isinstance(modified_nlp, Language):
+            err = Errors.E914.format(name="before_to_disk", value=type(modified_nlp))
+            raise ValueError(err)
+        return modified_nlp
+
+    return before_to_disk
spacy/training/pretrain.py (new file, 267 lines)
@@ -0,0 +1,267 @@
+from typing import Optional, Callable, Any, Iterable, Union, List
+from thinc.api import Config, fix_random_seed, set_gpu_allocator, Model, Optimizer
+from thinc.api import set_dropout_rate, to_categorical, CosineDistance, L2Distance
+from pathlib import Path
+from functools import partial
+from collections import Counter
+import srsly
+import numpy
+import time
+import re
+from wasabi import msg
+
+from .example import Example
+from ..tokens import Doc
+from ..attrs import ID
+from ..ml.models.multi_task import build_cloze_multi_task_model
+from ..ml.models.multi_task import build_cloze_characters_multi_task_model
+from ..schemas import ConfigSchemaTraining, ConfigSchemaPretrain
+from ..util import registry, load_model_from_config, dot_to_object, logger
+
+
+def pretrain(
+    config: Config,
+    output_dir: Path,
+    resume_path: Optional[Path] = None,
+    epoch_resume: Optional[int] = None,
+    use_gpu: int = -1,
+    logger: Callable[[Any], Any] = logger,
+):
+    if config["training"]["seed"] is not None:
+        fix_random_seed(config["training"]["seed"])
+    allocator = config["training"]["gpu_allocator"]
+    if use_gpu >= 0 and allocator:
+        set_gpu_allocator(allocator)
+    nlp = load_model_from_config(config)
+    T = registry.resolve(nlp.config["training"], schema=ConfigSchemaTraining)
+    P = registry.resolve(nlp.config["pretraining"], schema=ConfigSchemaPretrain)
+    corpus = dot_to_object(T, P["corpus"])
+    batcher = P["batcher"]
+    model = create_pretraining_model(nlp, P)
+    optimizer = P["optimizer"]
+    # Load in pretrained weights to resume from
+    if resume_path is not None:
+        _resume_model(model, resume_path, epoch_resume)
+    else:
+        # Without '--resume-path' the '--epoch-resume' argument is ignored
+        epoch_resume = 0
+
+    # TODO: move this to logger function?
+    tracker = ProgressTracker(frequency=10000)
+    msg.divider(f"Pre-training tok2vec layer - starting at epoch {epoch_resume}")
+    row_settings = {"widths": (3, 10, 10, 6, 4), "aligns": ("r", "r", "r", "r", "r")}
+    msg.row(("#", "# Words", "Total Loss", "Loss", "w/s"), **row_settings)
+
+    def _save_model(epoch, is_temp=False):
+        is_temp_str = ".temp" if is_temp else ""
+        with model.use_params(optimizer.averages):
+            with (output_dir / f"model{epoch}{is_temp_str}.bin").open("wb") as file_:
+                file_.write(model.get_ref("tok2vec").to_bytes())
+            log = {
+                "nr_word": tracker.nr_word,
+                "loss": tracker.loss,
+                "epoch_loss": tracker.epoch_loss,
+                "epoch": epoch,
+            }
+            with (output_dir / "log.jsonl").open("a") as file_:
+                file_.write(srsly.json_dumps(log) + "\n")
+
+    objective = create_objective(P["objective"])
+    # TODO: I think we probably want this to look more like the
+    # 'create_train_batches' function?
+    for epoch in range(epoch_resume, P["max_epochs"]):
+        for batch_id, batch in enumerate(batcher(corpus(nlp))):
+            docs = ensure_docs(batch)
+            loss = make_update(model, docs, optimizer, objective)
+            progress = tracker.update(epoch, loss, docs)
+            if progress:
+                msg.row(progress, **row_settings)
+            if P["n_save_every"] and (batch_id % P["n_save_every"] == 0):
+                _save_model(epoch, is_temp=True)
+        _save_model(epoch)
+        tracker.epoch_loss = 0.0
+
+
+def ensure_docs(examples_or_docs: Iterable[Union[Doc, Example]]) -> List[Doc]:
+    docs = []
+    for eg_or_doc in examples_or_docs:
+        if isinstance(eg_or_doc, Doc):
+            docs.append(eg_or_doc)
+        else:
+            docs.append(eg_or_doc.reference)
+    return docs
+
+
+def _resume_model(
+    model: Model,
+    resume_path: Path,
+    epoch_resume: int,
+    logger: Callable[[Any], Any] = logger,
+) -> None:
+    logger.info(f"Resume training tok2vec from: {resume_path}")
+    with resume_path.open("rb") as file_:
+        weights_data = file_.read()
+        model.get_ref("tok2vec").from_bytes(weights_data)
+    # Parse the epoch number from the given weight file
+    model_name = re.search(r"model\d+\.bin", str(resume_path))
+    if model_name:
+        # Default weight file name so read epoch_start from it by cutting off 'model' and '.bin'
+        epoch_resume = int(model_name.group(0)[5:][:-4]) + 1
+        logger.info(f"Resuming from epoch: {epoch_resume}")
+    else:
+        logger.info(f"Resuming from epoch: {epoch_resume}")
+
+
+def make_update(
+    model: Model, docs: Iterable[Doc], optimizer: Optimizer, objective_func: Callable
+) -> float:
+    """Perform an update over a single batch of documents.
+
+    docs (iterable): A batch of `Doc` objects.
+    optimizer (callable): An optimizer.
+    RETURNS loss: A float for the loss.
+    """
+    predictions, backprop = model.begin_update(docs)
+    loss, gradients = objective_func(model.ops, docs, predictions)
+    backprop(gradients)
+    model.finish_update(optimizer)
+    # Don't want to return a cupy object here
+    # The gradients are modified in-place by the BERT MLM,
+    # so we get an accurate loss
+    return float(loss)
+
+
+def create_objective(config: Config):
+    """Create the objective for pretraining.
+
+    We'd like to replace this with a registry function but it's tricky because
+    we're also making a model choice based on this. For now we hard-code support
+    for two types (characters, vectors). For characters you can specify
+    n_characters, for vectors you can specify the loss.
+
+    Bleh.
+    """
+    objective_type = config["type"]
+    if objective_type == "characters":
+        return partial(get_characters_loss, nr_char=config["n_characters"])
+    elif objective_type == "vectors":
+        if config["loss"] == "cosine":
+            distance = CosineDistance(normalize=True, ignore_zeros=True)
+            return partial(get_vectors_loss, distance=distance)
+        elif config["loss"] == "L2":
+            distance = L2Distance(normalize=True, ignore_zeros=True)
+            return partial(get_vectors_loss, distance=distance)
+        else:
+            raise ValueError("Unexpected loss type", config["loss"])
+    else:
+        raise ValueError("Unexpected objective_type", objective_type)
+
+
+def get_vectors_loss(ops, docs, prediction, distance):
+    """Compute a loss based on a distance between the documents' vectors and
+    the prediction.
+    """
+    # The simplest way to implement this would be to vstack the
+    # token.vector values, but that's a bit inefficient, especially on GPU.
+    # Instead we fetch the index into the vectors table for each of our tokens,
+    # and look them up all at once. This prevents data copying.
+    ids = ops.flatten([doc.to_array(ID).ravel() for doc in docs])
+    target = docs[0].vocab.vectors.data[ids]
+    d_target, loss = distance(prediction, target)
+    return loss, d_target
+
+
+def get_characters_loss(ops, docs, prediction, nr_char):
+    """Compute a loss based on a number of characters predicted from the docs."""
+    target_ids = numpy.vstack([doc.to_utf8_array(nr_char=nr_char) for doc in docs])
+    target_ids = target_ids.reshape((-1,))
+    target = ops.asarray(to_categorical(target_ids, n_classes=256), dtype="f")
+    target = target.reshape((-1, 256 * nr_char))
+    diff = prediction - target
+    loss = (diff ** 2).sum()
+    d_target = diff / float(prediction.shape[0])
+    return loss, d_target
+
+
+def create_pretraining_model(nlp, pretrain_config):
+    """Define a network for the pretraining. We simply add an output layer onto
+    the tok2vec input model. The tok2vec input model needs to be a model that
+    takes a batch of Doc objects (as a list), and returns a list of arrays.
+    Each array in the output needs to have one row per token in the doc.
+    The actual tok2vec layer is stored as a reference, and only this bit will be
+    serialized to file and read back in when calling the 'train' command.
+    """
+    component = nlp.get_pipe(pretrain_config["component"])
+    if pretrain_config.get("layer"):
+        tok2vec = component.model.get_ref(pretrain_config["layer"])
+    else:
+        tok2vec = component.model
+
+    # TODO
+    maxout_pieces = 3
+    hidden_size = 300
+    if pretrain_config["objective"]["type"] == "vectors":
+        model = build_cloze_multi_task_model(
+            nlp.vocab, tok2vec, hidden_size=hidden_size, maxout_pieces=maxout_pieces
+        )
+    elif pretrain_config["objective"]["type"] == "characters":
+        model = build_cloze_characters_multi_task_model(
+            nlp.vocab,
+            tok2vec,
+            hidden_size=hidden_size,
+            maxout_pieces=maxout_pieces,
+            nr_char=pretrain_config["objective"]["n_characters"],
+        )
+    model.initialize(X=[nlp.make_doc("Give it a doc to infer shapes")])
+    set_dropout_rate(model, pretrain_config["dropout"])
+    return model
+
+
+class ProgressTracker:
+    def __init__(self, frequency=1000000):
+        self.loss = 0.0
+        self.prev_loss = 0.0
+        self.nr_word = 0
+        self.words_per_epoch = Counter()
+        self.frequency = frequency
+        self.last_time = time.time()
+        self.last_update = 0
+        self.epoch_loss = 0.0
+
+    def update(self, epoch, loss, docs):
+        self.loss += loss
+        self.epoch_loss += loss
+        words_in_batch = sum(len(doc) for doc in docs)
+        self.words_per_epoch[epoch] += words_in_batch
+        self.nr_word += words_in_batch
+        words_since_update = self.nr_word - self.last_update
+        if words_since_update >= self.frequency:
+            wps = words_since_update / (time.time() - self.last_time)
+            self.last_update = self.nr_word
+            self.last_time = time.time()
+            loss_per_word = self.loss - self.prev_loss
+            status = (
+                epoch,
+                self.nr_word,
+                _smart_round(self.loss, width=10),
+                _smart_round(loss_per_word, width=6),
+                int(wps),
+            )
+            self.prev_loss = float(self.loss)
+            return status
+        else:
+            return None
+
+
+def _smart_round(
+    figure: Union[float, int], width: int = 10, max_decimal: int = 4
+) -> str:
+    """Round large numbers as integers, smaller numbers as decimals."""
+    n_digits = len(str(int(figure)))
+    n_decimal = width - (n_digits + 1)
+    if n_decimal <= 1:
+        return str(int(figure))
+    else:
+        n_decimal = min(n_decimal, max_decimal)
+        format_str = "%." + str(n_decimal) + "f"
+        return format_str % figure
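
For example, a width of 10 leaves large figures with few or no decimals and caps small figures at `max_decimal` places:

    assert _smart_round(123456.789, width=10) == "123456.789"
    assert _smart_round(0.123456, width=10) == "0.1235"  # capped at 4 decimals
    assert _smart_round(1234567890.0, width=10) == "1234567890"
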
spacy/util.py
@@ -8,6 +8,7 @@ import re
 from pathlib import Path
 import thinc
 from thinc.api import NumpyOps, get_current_ops, Adam, Config, Optimizer
+from thinc.api import ConfigValidationError
 import functools
 import itertools
 import numpy.random
@@ -56,6 +57,7 @@ if TYPE_CHECKING:
 OOV_RANK = numpy.iinfo(numpy.uint64).max
+DEFAULT_OOV_PROB = -20
 LEXEME_NORM_LANGS = ["da", "de", "el", "en", "id", "lb", "pt", "ru", "sr", "ta", "th"]

 # Default order of sections in the config.cfg. Not all sections needs to exist,
@@ -239,20 +241,6 @@ def get_module_path(module: ModuleType) -> Path:
     return Path(sys.modules[module.__module__].__file__).parent


-def load_vectors_into_model(
-    nlp: "Language", name: Union[str, Path], *, add_strings=True
-) -> None:
-    """Load word vectors from an installed model or path into a model instance."""
-    vectors_nlp = load_model(name)
-    nlp.vocab.vectors = vectors_nlp.vocab.vectors
-    if add_strings:
-        # I guess we should add the strings from the vectors_nlp model?
-        # E.g. if someone does a similarity query, they might expect the strings.
-        for key in nlp.vocab.vectors.key2row:
-            if key in vectors_nlp.vocab.strings:
-                nlp.vocab.strings.add(vectors_nlp.vocab.strings[key])
-
-
 def load_model(
     name: Union[str, Path],
     *,
@@ -391,32 +379,9 @@ def load_model_from_config(
     return nlp


-def resolve_training_config(
-    config: Config,
-    exclude: Iterable[str] = ("nlp", "components"),
-    validate: bool = True,
-) -> Dict[str, Any]:
-    """Resolve the config sections relevant for training and create all objects.
-    Mostly used in the CLI to separate training config (not resolved by default
-    because not runtime-relevant – an nlp object should load fine even if its
-    [training] block refers to functions that are not available etc.).
-
-    config (Config): The config to resolve.
-    exclude (Iterable[str]): The config blocks to exclude. Those blocks won't
-        be available in the final resolved config.
-    validate (bool): Whether to validate the config.
-    RETURNS (Dict[str, Any]): The resolved config.
-    """
-    config = config.copy()
-    for key in exclude:
-        if key in config:
-            config.pop(key)
-    return registry.resolve(config, validate=validate)
-
-
 def resolve_dot_names(
     config: Config, dot_names: List[Optional[str]]
-) -> List[Optional[Callable]]:
+) -> Tuple[Any]:
     """Resolve one or more "dot notation" names, e.g. corpora.train.
     The paths could point anywhere into the config, so we don't know which
     top-level section we'll be looking within.
@@ -424,18 +389,42 @@ def resolve_dot_names(
     We resolve the whole top-level section, although we could resolve less --
     we could find the lowest part of the tree.
     """
+    # TODO: include schema?
+    # TODO: clean this up and avoid duplication
     resolved = {}
     output = []
+    errors = []
     for name in dot_names:
         if name is None:
             output.append(name)
         else:
             section = name.split(".")[0]
-            # We want to avoid resolving the same thing twice.
+            # We want to avoid resolving the same thing twice
             if section not in resolved:
                 resolved[section] = registry.resolve(config[section])
+            try:
                 output.append(dot_to_object(resolved, name))
-    return output
+            except KeyError:
+                msg = f"not a valid section reference: {name}"
+                errors.append({"loc": name.split("."), "msg": msg})
+    objects = []
+    for ref in output:
+        if not isinstance(ref, str):
+            msg = f"not a valid section reference: {ref} ({type(ref)})"
+            errors.append({"loc": ref.split("."), "msg": msg})
+            continue
+        section = ref.split(".")[0]
+        # We want to avoid resolving the same thing twice
+        if section not in resolved:
+            resolved[section] = registry.resolve(config[section])
+        try:
+            objects.append(dot_to_object(resolved, ref))
+        except KeyError:
+            msg = f"not a valid section reference: {name}"
+            errors.append({"loc": ref.split("."), "msg": msg})
+    if errors:
+        raise ConfigValidationError(config=config, errors=errors)
+    return tuple(objects)


 def load_model_from_init_py(
|
Loading…
Reference in New Issue
Block a user