Simplify config use in Language.initialize

Ines Montani 2020-09-29 16:05:48 +02:00
parent 56f8bc73ef
commit 63d1598137
4 changed files with 59 additions and 49 deletions
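In practice this changes how initialization settings reach the pipeline: instead of being passed into Language.initialize as a settings dict, they are now read from the [initialize] section of the nlp config and resolved internally. A minimal sketch of the new usage (a blank pipeline and placeholder values, inferred from the diff below):

import spacy

nlp = spacy.blank("en")
# Settings previously passed as nlp.initialize(settings=...) now live in
# the config's [initialize] block and are resolved inside initialize()
nlp.config["initialize"].update({"tokenizer": {}, "components": {}})
optimizer = nlp.initialize()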

View File

@@ -18,6 +18,7 @@ from .tokens.underscore import Underscore
 from .vocab import Vocab, create_vocab
 from .pipe_analysis import validate_attrs, analyze_pipes, print_pipe_analysis
 from .training import Example, validate_examples
+from .training.initialize import init_vocab, init_tok2vec
 from .scorer import Scorer
 from .util import registry, SimpleFrozenList
 from .util import SimpleFrozenDict, combine_score_weights, CONFIG_SECTION_ORDER
@@ -27,7 +28,8 @@ from .lang.punctuation import TOKENIZER_INFIXES
 from .tokens import Doc
 from .tokenizer import Tokenizer
 from .errors import Errors, Warnings
-from .schemas import ConfigSchema, ConfigSchemaNlp, validate_init_settings
+from .schemas import ConfigSchema, ConfigSchemaNlp, ConfigSchemaInit
+from .schemas import ConfigSchemaPretrain, validate_init_settings
 from .git_info import GIT_VERSION
 from . import util
 from . import about
@@ -1161,7 +1163,6 @@ class Language:
         self,
         get_examples: Optional[Callable[[], Iterable[Example]]] = None,
         *,
-        settings: Dict[str, Dict[str, Any]] = SimpleFrozenDict(),
         sgd: Optional[Optimizer] = None,
     ) -> Optimizer:
         """Initialize the pipe for training, using data examples if available.
@@ -1198,28 +1199,38 @@
         if not valid_examples:
             err = Errors.E930.format(name="Language", obj="empty list")
             raise ValueError(err)
+        # Make sure the config is interpolated so we can resolve subsections
+        config = self.config.interpolate()
+        # These are the settings provided in the [initialize] block in the config
+        I = registry.resolve(config["initialize"], schema=ConfigSchemaInit)
+        V = I["vocab"]
+        init_vocab(
+            self, data=V["data"], lookups=V["lookups"], vectors=V["vectors"],
+        )
+        pretrain_cfg = config.get("pretraining")
+        if pretrain_cfg:
+            P = registry.resolve(pretrain_cfg, schema=ConfigSchemaPretrain)
+            init_tok2vec(self, P, V)
         if self.vocab.vectors.data.shape[1] >= 1:
             ops = get_current_ops()
             self.vocab.vectors.data = ops.asarray(self.vocab.vectors.data)
+        self._optimizer = sgd
         if hasattr(self.tokenizer, "initialize"):
-            tok_settings = settings.get("tokenizer", {})
             tok_settings = validate_init_settings(
                 self.tokenizer.initialize,
-                tok_settings,
+                I["tokenizer"],
                 section="tokenizer",
                 name="tokenizer",
             )
             self.tokenizer.initialize(get_examples, nlp=self, **tok_settings)
-        proc_settings = settings.get("components", {})
         for name, proc in self.pipeline:
             if hasattr(proc, "initialize"):
-                p_settings = proc_settings.get(name, {})
+                p_settings = I["components"].get(name, {})
                 p_settings = validate_init_settings(
                     proc.initialize, p_settings, section="components", name=name
                 )
                 proc.initialize(get_examples, nlp=self, **p_settings)
         self._link_components()
-        self._optimizer = sgd
         if sgd is not None:
             self._optimizer = sgd
         elif self._optimizer is None:
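After registry.resolve, I is a plain dict holding the [initialize] block. Based on the keys accessed above, its resolved shape is roughly the following (a sketch with placeholder values; the exact fields are defined by ConfigSchemaInit):

# Rough shape of I = registry.resolve(config["initialize"], schema=ConfigSchemaInit)
I = {
    "vocab": {"data": None, "lookups": None, "vectors": None},  # consumed by init_vocab
    "tokenizer": {},   # validated against the tokenizer's initialize signature
    "components": {},  # per-component settings, keyed by pipe name
}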

View File

@@ -37,30 +37,33 @@ def test_initialize_arguments():
     get_examples = lambda: [example]
     nlp.add_pipe(name)
     # The settings here will typically come from the [initialize] block
+    init_cfg = {"tokenizer": {"custom": 1}, "components": {name: {}}}
+    nlp.config["initialize"].update(init_cfg)
     with pytest.raises(ConfigValidationError) as e:
-        # Empty settings, no required custom1 argument
-        settings = {"tokenizer": {"custom": 1}, "components": {name: {}}}
-        nlp.initialize(get_examples, settings=settings)
+        # Empty config for component, no required custom1 argument
+        nlp.initialize(get_examples)
     errors = e.value.errors
     assert len(errors) == 1
     assert errors[0]["loc"] == ("custom1",)
     assert errors[0]["type"] == "value_error.missing"
+    init_cfg = {
+        "tokenizer": {"custom": 1},
+        "components": {name: {"custom1": "x", "custom2": 1}},
+    }
+    nlp.config["initialize"].update(init_cfg)
     with pytest.raises(ConfigValidationError) as e:
-        # Wrong type
-        settings = {
-            "tokenizer": {"custom": 1},
-            "components": {name: {"custom1": "x", "custom2": 1}},
-        }
-        nlp.initialize(get_examples, settings=settings)
+        # Wrong type of custom 2
+        nlp.initialize(get_examples)
     errors = e.value.errors
     assert len(errors) == 1
     assert errors[0]["loc"] == ("custom2",)
     assert errors[0]["type"] == "value_error.strictbool"
-    settings = {
+    init_cfg = {
         "tokenizer": {"custom": 1},
         "components": {name: {"custom1": "x", "custom2": True}},
     }
-    nlp.initialize(get_examples, settings=settings)
+    nlp.config["initialize"].update(init_cfg)
+    nlp.initialize(get_examples)
     assert nlp.tokenizer.from_initialize == 1
     pipe = nlp.get_pipe(name)
     assert pipe.from_initialize == ("x", True)
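The test assumes a pipe whose initialize method accepts the custom arguments being validated; the real fixtures are defined earlier in the test file. A hypothetical component matching the assertions (required custom1, strict-bool custom2, recording from_initialize) could look like:

from spacy.language import Language

class CustomPipe:
    def __call__(self, doc):
        return doc

    def initialize(self, get_examples=None, *, nlp=None, custom1: str, custom2: bool = False):
        # validate_init_settings builds a strict pydantic model from this
        # signature, which produces the value_error.missing and
        # value_error.strictbool errors asserted above
        self.from_initialize = (custom1, custom2)

@Language.factory("custom_pipe")  # hypothetical factory name
def make_custom_pipe(nlp, name):
    return CustomPipe()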

View File

@@ -1,4 +1,4 @@
-from typing import Union, Dict, Optional, Any, List, IO
+from typing import Union, Dict, Optional, Any, List, IO, TYPE_CHECKING
 from thinc.api import Config, fix_random_seed, set_gpu_allocator
 from thinc.api import ConfigValidationError
 from pathlib import Path
@@ -11,16 +11,18 @@ import zipfile
 import tqdm
 
 from .loop import create_before_to_disk_callback
-from ..language import Language
 from ..lookups import Lookups
 from ..vectors import Vectors
 from ..errors import Errors
-from ..schemas import ConfigSchemaTraining, ConfigSchemaInit, ConfigSchemaPretrain
+from ..schemas import ConfigSchemaTraining, ConfigSchemaPretrain
 from ..util import registry, load_model_from_config, resolve_dot_names
 from ..util import load_model, ensure_path, OOV_RANK, DEFAULT_OOV_PROB
+
+if TYPE_CHECKING:
+    from ..language import Language  # noqa: F401
 
 
-def init_nlp(config: Config, *, use_gpu: int = -1, silent: bool = True) -> Language:
+def init_nlp(config: Config, *, use_gpu: int = -1, silent: bool = True) -> "Language":
     msg = Printer(no_print=silent)
     raw_config = config
     config = raw_config.interpolate()
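Moving the Language import under TYPE_CHECKING breaks the import cycle this commit would otherwise create: language.py now imports init_vocab and init_tok2vec from this module at import time, so this module can no longer import Language at runtime. The general form of the pattern (a generic sketch mirroring the diff; the function name is invented):

from typing import TYPE_CHECKING

if TYPE_CHECKING:
    # Only evaluated by static type checkers, never at runtime
    from ..language import Language  # noqa: F401

def uses_nlp(nlp: "Language") -> "Language":  # string annotation, resolved lazily
    return nlp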
@@ -38,11 +40,6 @@ def init_nlp(config: Config, *, use_gpu: int = -1, silent: bool = True) -> Langu
     T = registry.resolve(config["training"], schema=ConfigSchemaTraining)
     dot_names = [T["train_corpus"], T["dev_corpus"]]
     train_corpus, dev_corpus = resolve_dot_names(config, dot_names)
-    I = registry.resolve(config["initialize"], schema=ConfigSchemaInit)
-    V = I["vocab"]
-    init_vocab(
-        nlp, data=V["data"], lookups=V["lookups"], vectors=V["vectors"], silent=silent
-    )
     optimizer = T["optimizer"]
     before_to_disk = create_before_to_disk_callback(T["before_to_disk"])
     # Components that shouldn't be updated during training
@@ -55,16 +52,11 @@ def init_nlp(config: Config, *, use_gpu: int = -1, silent: bool = True) -> Langu
         msg.info(f"Resuming training for: {resume_components}")
         nlp.resume_training(sgd=optimizer)
     with nlp.select_pipes(disable=[*frozen_components, *resume_components]):
-        nlp.initialize(lambda: train_corpus(nlp), sgd=optimizer, settings=I)
+        nlp.initialize(lambda: train_corpus(nlp), sgd=optimizer)
     msg.good("Initialized pipeline components")
     # Verify the config after calling 'initialize' to ensure labels
     # are properly initialized
     verify_config(nlp)
-    if "pretraining" in config and config["pretraining"]:
-        P = registry.resolve(config["pretraining"], schema=ConfigSchemaPretrain)
-        loaded = add_tok2vec_weights(nlp, P, V)
-        if loaded and P["component"]:
-            msg.good(f"Loaded pretrained weights into component '{P['component']}'")
 
     nlp = before_to_disk(nlp)
     return nlp
@@ -75,13 +67,13 @@ def must_reinitialize(train_config: Config, init_config: Config) -> bool:
 
 
 def init_vocab(
-    nlp: Language,
+    nlp: "Language",
     *,
     data: Optional[Path] = None,
     lookups: Optional[Lookups] = None,
     vectors: Optional[str] = None,
     silent: bool = True,
-) -> Language:
+) -> "Language":
     msg = Printer(no_print=silent)
     if lookups:
         nlp.vocab.lookups = lookups
@@ -109,7 +101,7 @@ def init_vocab(
 
 
 def load_vectors_into_model(
-    nlp: Language, name: Union[str, Path], *, add_strings: bool = True
+    nlp: "Language", name: Union[str, Path], *, add_strings: bool = True
 ) -> None:
     """Load word vectors from an installed model or path into a model instance."""
     try:
@@ -132,8 +124,8 @@
                 nlp.vocab.strings.add(vectors_nlp.vocab.strings[key])
 
 
-def add_tok2vec_weights(
-    nlp: Language, pretrain_config: Dict[str, Any], vocab_config: Dict[str, Any]
+def init_tok2vec(
+    nlp: "Language", pretrain_config: Dict[str, Any], vocab_config: Dict[str, Any]
 ) -> bool:
     # Load pretrained tok2vec weights - cf. CLI command 'pretrain'
     P = pretrain_config
@@ -171,7 +163,7 @@
     return False
 
 
-def verify_config(nlp: Language) -> None:
+def verify_config(nlp: "Language") -> None:
     """Perform additional checks based on the config, loaded nlp object and training data."""
     # TODO: maybe we should validate based on the actual components, the list
     # in config["nlp"]["pipeline"] instead?
@@ -182,7 +174,7 @@ def verify_config(nlp: Language) -> None:
         verify_textcat_config(nlp, pipe_config)
 
 
-def verify_textcat_config(nlp: Language, pipe_config: Dict[str, Any]) -> None:
+def verify_textcat_config(nlp: "Language", pipe_config: Dict[str, Any]) -> None:
     # if 'positive_label' is provided: double check whether it's in the data and
     # the task is binary
     if pipe_config.get("positive_label"):
@@ -211,7 +203,7 @@ def get_sourced_components(config: Union[Dict[str, Any], Config]) -> List[str]:
 
 
 def convert_vectors(
-    nlp: Language,
+    nlp: "Language",
     vectors_loc: Optional[Path],
     *,
     truncate: int,

View File

@@ -1,5 +1,5 @@
 from typing import List, Callable, Tuple, Dict, Iterable, Iterator, Union, Any
-from typing import Optional
+from typing import Optional, TYPE_CHECKING
 from pathlib import Path
 from timeit import default_timer as timer
 from thinc.api import Optimizer, Config, constant, fix_random_seed, set_gpu_allocator
@@ -9,13 +9,15 @@ from wasabi import Printer
 from .example import Example
 from ..schemas import ConfigSchemaTraining
-from ..language import Language
 from ..errors import Errors
 from ..util import resolve_dot_names, registry
 
+if TYPE_CHECKING:
+    from ..language import Language  # noqa: F401
+
 
 def train(
-    nlp: Language,
+    nlp: "Language",
     output_path: Optional[Path] = None,
     *,
     use_gpu: int = -1,
@@ -110,7 +112,7 @@ def train(
 
 
 def train_while_improving(
-    nlp: Language,
+    nlp: "Language",
     optimizer: Optimizer,
     train_data,
     evaluate,
@@ -233,7 +235,7 @@ def subdivide_batch(batch, accumulate_gradient):
 
 
 def create_evaluation_callback(
-    nlp: Language, dev_corpus: Callable, weights: Dict[str, float]
+    nlp: "Language", dev_corpus: Callable, weights: Dict[str, float]
 ) -> Callable[[], Tuple[float, Dict[str, float]]]:
     weights = {key: value for key, value in weights.items() if value is not None}
@@ -277,7 +279,7 @@ def create_train_batches(
 
 
 def update_meta(
-    training: Union[Dict[str, Any], Config], nlp: Language, info: Dict[str, Any]
+    training: Union[Dict[str, Any], Config], nlp: "Language", info: Dict[str, Any]
 ) -> None:
     nlp.meta["performance"] = {}
     for metric in training["score_weights"]:
@@ -288,8 +290,10 @@
 
 
 def create_before_to_disk_callback(
-    callback: Optional[Callable[[Language], Language]]
-) -> Callable[[Language], Language]:
+    callback: Optional[Callable[["Language"], "Language"]]
+) -> Callable[["Language"], "Language"]:
+    from ..language import Language  # noqa: F811
+
     def before_to_disk(nlp: Language) -> Language:
         if not callback:
             return nlp
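For context, a sketch of how the wrapped callback ends up being used by the training loop (strip_meta_entry is an invented example callback, not part of this commit; the loop applies the returned function right before serializing the pipeline):

def strip_meta_entry(nlp):
    # Example user callback: drop a meta key before the model is saved
    nlp.meta.pop("temporary", None)
    return nlp

before_to_disk = create_before_to_disk_callback(strip_meta_entry)
nlp = before_to_disk(nlp)  # applied before nlp.to_disk(...) in the train loop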