Simplify config use in Language.initialize

This commit is contained in:
Ines Montani 2020-09-29 16:05:48 +02:00
parent 56f8bc73ef
commit 63d1598137
4 changed files with 59 additions and 49 deletions

View File

@ -18,6 +18,7 @@ from .tokens.underscore import Underscore
from .vocab import Vocab, create_vocab from .vocab import Vocab, create_vocab
from .pipe_analysis import validate_attrs, analyze_pipes, print_pipe_analysis from .pipe_analysis import validate_attrs, analyze_pipes, print_pipe_analysis
from .training import Example, validate_examples from .training import Example, validate_examples
from .training.initialize import init_vocab, init_tok2vec
from .scorer import Scorer from .scorer import Scorer
from .util import registry, SimpleFrozenList from .util import registry, SimpleFrozenList
from .util import SimpleFrozenDict, combine_score_weights, CONFIG_SECTION_ORDER from .util import SimpleFrozenDict, combine_score_weights, CONFIG_SECTION_ORDER
@ -27,7 +28,8 @@ from .lang.punctuation import TOKENIZER_INFIXES
from .tokens import Doc from .tokens import Doc
from .tokenizer import Tokenizer from .tokenizer import Tokenizer
from .errors import Errors, Warnings from .errors import Errors, Warnings
from .schemas import ConfigSchema, ConfigSchemaNlp, validate_init_settings from .schemas import ConfigSchema, ConfigSchemaNlp, ConfigSchemaInit
from .schemas import ConfigSchemaPretrain, validate_init_settings
from .git_info import GIT_VERSION from .git_info import GIT_VERSION
from . import util from . import util
from . import about from . import about
@ -1161,7 +1163,6 @@ class Language:
self, self,
get_examples: Optional[Callable[[], Iterable[Example]]] = None, get_examples: Optional[Callable[[], Iterable[Example]]] = None,
*, *,
settings: Dict[str, Dict[str, Any]] = SimpleFrozenDict(),
sgd: Optional[Optimizer] = None, sgd: Optional[Optimizer] = None,
) -> Optimizer: ) -> Optimizer:
"""Initialize the pipe for training, using data examples if available. """Initialize the pipe for training, using data examples if available.
@ -1198,28 +1199,38 @@ class Language:
if not valid_examples: if not valid_examples:
err = Errors.E930.format(name="Language", obj="empty list") err = Errors.E930.format(name="Language", obj="empty list")
raise ValueError(err) raise ValueError(err)
# Make sure the config is interpolated so we can resolve subsections
config = self.config.interpolate()
# These are the settings provided in the [initialize] block in the config
I = registry.resolve(config["initialize"], schema=ConfigSchemaInit)
V = I["vocab"]
init_vocab(
self, data=V["data"], lookups=V["lookups"], vectors=V["vectors"],
)
pretrain_cfg = config.get("pretraining")
if pretrain_cfg:
P = registry.resolve(pretrain_cfg, schema=ConfigSchemaPretrain)
init_tok2vec(self, P, V)
if self.vocab.vectors.data.shape[1] >= 1: if self.vocab.vectors.data.shape[1] >= 1:
ops = get_current_ops() ops = get_current_ops()
self.vocab.vectors.data = ops.asarray(self.vocab.vectors.data) self.vocab.vectors.data = ops.asarray(self.vocab.vectors.data)
self._optimizer = sgd
if hasattr(self.tokenizer, "initialize"): if hasattr(self.tokenizer, "initialize"):
tok_settings = settings.get("tokenizer", {})
tok_settings = validate_init_settings( tok_settings = validate_init_settings(
self.tokenizer.initialize, self.tokenizer.initialize,
tok_settings, I["tokenizer"],
section="tokenizer", section="tokenizer",
name="tokenizer", name="tokenizer",
) )
self.tokenizer.initialize(get_examples, nlp=self, **tok_settings) self.tokenizer.initialize(get_examples, nlp=self, **tok_settings)
proc_settings = settings.get("components", {})
for name, proc in self.pipeline: for name, proc in self.pipeline:
if hasattr(proc, "initialize"): if hasattr(proc, "initialize"):
p_settings = proc_settings.get(name, {}) p_settings = I["components"].get(name, {})
p_settings = validate_init_settings( p_settings = validate_init_settings(
proc.initialize, p_settings, section="components", name=name proc.initialize, p_settings, section="components", name=name
) )
proc.initialize(get_examples, nlp=self, **p_settings) proc.initialize(get_examples, nlp=self, **p_settings)
self._link_components() self._link_components()
self._optimizer = sgd
if sgd is not None: if sgd is not None:
self._optimizer = sgd self._optimizer = sgd
elif self._optimizer is None: elif self._optimizer is None:

View File

@ -37,30 +37,33 @@ def test_initialize_arguments():
get_examples = lambda: [example] get_examples = lambda: [example]
nlp.add_pipe(name) nlp.add_pipe(name)
# The settings here will typically come from the [initialize] block # The settings here will typically come from the [initialize] block
init_cfg = {"tokenizer": {"custom": 1}, "components": {name: {}}}
nlp.config["initialize"].update(init_cfg)
with pytest.raises(ConfigValidationError) as e: with pytest.raises(ConfigValidationError) as e:
# Empty settings, no required custom1 argument # Empty config for component, no required custom1 argument
settings = {"tokenizer": {"custom": 1}, "components": {name: {}}} nlp.initialize(get_examples)
nlp.initialize(get_examples, settings=settings)
errors = e.value.errors errors = e.value.errors
assert len(errors) == 1 assert len(errors) == 1
assert errors[0]["loc"] == ("custom1",) assert errors[0]["loc"] == ("custom1",)
assert errors[0]["type"] == "value_error.missing" assert errors[0]["type"] == "value_error.missing"
with pytest.raises(ConfigValidationError) as e: init_cfg = {
# Wrong type
settings = {
"tokenizer": {"custom": 1}, "tokenizer": {"custom": 1},
"components": {name: {"custom1": "x", "custom2": 1}}, "components": {name: {"custom1": "x", "custom2": 1}},
} }
nlp.initialize(get_examples, settings=settings) nlp.config["initialize"].update(init_cfg)
with pytest.raises(ConfigValidationError) as e:
# Wrong type of custom2
nlp.initialize(get_examples)
errors = e.value.errors errors = e.value.errors
assert len(errors) == 1 assert len(errors) == 1
assert errors[0]["loc"] == ("custom2",) assert errors[0]["loc"] == ("custom2",)
assert errors[0]["type"] == "value_error.strictbool" assert errors[0]["type"] == "value_error.strictbool"
settings = { init_cfg = {
"tokenizer": {"custom": 1}, "tokenizer": {"custom": 1},
"components": {name: {"custom1": "x", "custom2": True}}, "components": {name: {"custom1": "x", "custom2": True}},
} }
nlp.initialize(get_examples, settings=settings) nlp.config["initialize"].update(init_cfg)
nlp.initialize(get_examples)
assert nlp.tokenizer.from_initialize == 1 assert nlp.tokenizer.from_initialize == 1
pipe = nlp.get_pipe(name) pipe = nlp.get_pipe(name)
assert pipe.from_initialize == ("x", True) assert pipe.from_initialize == ("x", True)

View File

@ -1,4 +1,4 @@
from typing import Union, Dict, Optional, Any, List, IO from typing import Union, Dict, Optional, Any, List, IO, TYPE_CHECKING
from thinc.api import Config, fix_random_seed, set_gpu_allocator from thinc.api import Config, fix_random_seed, set_gpu_allocator
from thinc.api import ConfigValidationError from thinc.api import ConfigValidationError
from pathlib import Path from pathlib import Path
@ -11,16 +11,18 @@ import zipfile
import tqdm import tqdm
from .loop import create_before_to_disk_callback from .loop import create_before_to_disk_callback
from ..language import Language
from ..lookups import Lookups from ..lookups import Lookups
from ..vectors import Vectors from ..vectors import Vectors
from ..errors import Errors from ..errors import Errors
from ..schemas import ConfigSchemaTraining, ConfigSchemaInit, ConfigSchemaPretrain from ..schemas import ConfigSchemaTraining, ConfigSchemaPretrain
from ..util import registry, load_model_from_config, resolve_dot_names from ..util import registry, load_model_from_config, resolve_dot_names
from ..util import load_model, ensure_path, OOV_RANK, DEFAULT_OOV_PROB from ..util import load_model, ensure_path, OOV_RANK, DEFAULT_OOV_PROB
if TYPE_CHECKING:
from ..language import Language # noqa: F401
def init_nlp(config: Config, *, use_gpu: int = -1, silent: bool = True) -> Language:
def init_nlp(config: Config, *, use_gpu: int = -1, silent: bool = True) -> "Language":
msg = Printer(no_print=silent) msg = Printer(no_print=silent)
raw_config = config raw_config = config
config = raw_config.interpolate() config = raw_config.interpolate()
@ -38,11 +40,6 @@ def init_nlp(config: Config, *, use_gpu: int = -1, silent: bool = True) -> Langu
T = registry.resolve(config["training"], schema=ConfigSchemaTraining) T = registry.resolve(config["training"], schema=ConfigSchemaTraining)
dot_names = [T["train_corpus"], T["dev_corpus"]] dot_names = [T["train_corpus"], T["dev_corpus"]]
train_corpus, dev_corpus = resolve_dot_names(config, dot_names) train_corpus, dev_corpus = resolve_dot_names(config, dot_names)
I = registry.resolve(config["initialize"], schema=ConfigSchemaInit)
V = I["vocab"]
init_vocab(
nlp, data=V["data"], lookups=V["lookups"], vectors=V["vectors"], silent=silent
)
optimizer = T["optimizer"] optimizer = T["optimizer"]
before_to_disk = create_before_to_disk_callback(T["before_to_disk"]) before_to_disk = create_before_to_disk_callback(T["before_to_disk"])
# Components that shouldn't be updated during training # Components that shouldn't be updated during training
@ -55,16 +52,11 @@ def init_nlp(config: Config, *, use_gpu: int = -1, silent: bool = True) -> Langu
msg.info(f"Resuming training for: {resume_components}") msg.info(f"Resuming training for: {resume_components}")
nlp.resume_training(sgd=optimizer) nlp.resume_training(sgd=optimizer)
with nlp.select_pipes(disable=[*frozen_components, *resume_components]): with nlp.select_pipes(disable=[*frozen_components, *resume_components]):
nlp.initialize(lambda: train_corpus(nlp), sgd=optimizer, settings=I) nlp.initialize(lambda: train_corpus(nlp), sgd=optimizer)
msg.good("Initialized pipeline components") msg.good("Initialized pipeline components")
# Verify the config after calling 'initialize' to ensure labels # Verify the config after calling 'initialize' to ensure labels
# are properly initialized # are properly initialized
verify_config(nlp) verify_config(nlp)
if "pretraining" in config and config["pretraining"]:
P = registry.resolve(config["pretraining"], schema=ConfigSchemaPretrain)
loaded = add_tok2vec_weights(nlp, P, V)
if loaded and P["component"]:
msg.good(f"Loaded pretrained weights into component '{P['component']}'")
nlp = before_to_disk(nlp) nlp = before_to_disk(nlp)
return nlp return nlp
@ -75,13 +67,13 @@ def must_reinitialize(train_config: Config, init_config: Config) -> bool:
def init_vocab( def init_vocab(
nlp: Language, nlp: "Language",
*, *,
data: Optional[Path] = None, data: Optional[Path] = None,
lookups: Optional[Lookups] = None, lookups: Optional[Lookups] = None,
vectors: Optional[str] = None, vectors: Optional[str] = None,
silent: bool = True, silent: bool = True,
) -> Language: ) -> "Language":
msg = Printer(no_print=silent) msg = Printer(no_print=silent)
if lookups: if lookups:
nlp.vocab.lookups = lookups nlp.vocab.lookups = lookups
@ -109,7 +101,7 @@ def init_vocab(
def load_vectors_into_model( def load_vectors_into_model(
nlp: Language, name: Union[str, Path], *, add_strings: bool = True nlp: "Language", name: Union[str, Path], *, add_strings: bool = True
) -> None: ) -> None:
"""Load word vectors from an installed model or path into a model instance.""" """Load word vectors from an installed model or path into a model instance."""
try: try:
@ -132,8 +124,8 @@ def load_vectors_into_model(
nlp.vocab.strings.add(vectors_nlp.vocab.strings[key]) nlp.vocab.strings.add(vectors_nlp.vocab.strings[key])
def add_tok2vec_weights( def init_tok2vec(
nlp: Language, pretrain_config: Dict[str, Any], vocab_config: Dict[str, Any] nlp: "Language", pretrain_config: Dict[str, Any], vocab_config: Dict[str, Any]
) -> bool: ) -> bool:
# Load pretrained tok2vec weights - cf. CLI command 'pretrain' # Load pretrained tok2vec weights - cf. CLI command 'pretrain'
P = pretrain_config P = pretrain_config
@ -171,7 +163,7 @@ def add_tok2vec_weights(
return False return False
def verify_config(nlp: Language) -> None: def verify_config(nlp: "Language") -> None:
"""Perform additional checks based on the config, loaded nlp object and training data.""" """Perform additional checks based on the config, loaded nlp object and training data."""
# TODO: maybe we should validate based on the actual components, the list # TODO: maybe we should validate based on the actual components, the list
# in config["nlp"]["pipeline"] instead? # in config["nlp"]["pipeline"] instead?
@ -182,7 +174,7 @@ def verify_config(nlp: Language) -> None:
verify_textcat_config(nlp, pipe_config) verify_textcat_config(nlp, pipe_config)
def verify_textcat_config(nlp: Language, pipe_config: Dict[str, Any]) -> None: def verify_textcat_config(nlp: "Language", pipe_config: Dict[str, Any]) -> None:
# if 'positive_label' is provided: double check whether it's in the data and # if 'positive_label' is provided: double check whether it's in the data and
# the task is binary # the task is binary
if pipe_config.get("positive_label"): if pipe_config.get("positive_label"):
@ -211,7 +203,7 @@ def get_sourced_components(config: Union[Dict[str, Any], Config]) -> List[str]:
def convert_vectors( def convert_vectors(
nlp: Language, nlp: "Language",
vectors_loc: Optional[Path], vectors_loc: Optional[Path],
*, *,
truncate: int, truncate: int,

View File

@ -1,5 +1,5 @@
from typing import List, Callable, Tuple, Dict, Iterable, Iterator, Union, Any from typing import List, Callable, Tuple, Dict, Iterable, Iterator, Union, Any
from typing import Optional from typing import Optional, TYPE_CHECKING
from pathlib import Path from pathlib import Path
from timeit import default_timer as timer from timeit import default_timer as timer
from thinc.api import Optimizer, Config, constant, fix_random_seed, set_gpu_allocator from thinc.api import Optimizer, Config, constant, fix_random_seed, set_gpu_allocator
@ -9,13 +9,15 @@ from wasabi import Printer
from .example import Example from .example import Example
from ..schemas import ConfigSchemaTraining from ..schemas import ConfigSchemaTraining
from ..language import Language
from ..errors import Errors from ..errors import Errors
from ..util import resolve_dot_names, registry from ..util import resolve_dot_names, registry
if TYPE_CHECKING:
from ..language import Language # noqa: F401
def train( def train(
nlp: Language, nlp: "Language",
output_path: Optional[Path] = None, output_path: Optional[Path] = None,
*, *,
use_gpu: int = -1, use_gpu: int = -1,
@ -110,7 +112,7 @@ def train(
def train_while_improving( def train_while_improving(
nlp: Language, nlp: "Language",
optimizer: Optimizer, optimizer: Optimizer,
train_data, train_data,
evaluate, evaluate,
@ -233,7 +235,7 @@ def subdivide_batch(batch, accumulate_gradient):
def create_evaluation_callback( def create_evaluation_callback(
nlp: Language, dev_corpus: Callable, weights: Dict[str, float] nlp: "Language", dev_corpus: Callable, weights: Dict[str, float]
) -> Callable[[], Tuple[float, Dict[str, float]]]: ) -> Callable[[], Tuple[float, Dict[str, float]]]:
weights = {key: value for key, value in weights.items() if value is not None} weights = {key: value for key, value in weights.items() if value is not None}
@ -277,7 +279,7 @@ def create_train_batches(
def update_meta( def update_meta(
training: Union[Dict[str, Any], Config], nlp: Language, info: Dict[str, Any] training: Union[Dict[str, Any], Config], nlp: "Language", info: Dict[str, Any]
) -> None: ) -> None:
nlp.meta["performance"] = {} nlp.meta["performance"] = {}
for metric in training["score_weights"]: for metric in training["score_weights"]:
@ -288,8 +290,10 @@ def update_meta(
def create_before_to_disk_callback( def create_before_to_disk_callback(
callback: Optional[Callable[[Language], Language]] callback: Optional[Callable[["Language"], "Language"]]
) -> Callable[[Language], Language]: ) -> Callable[["Language"], "Language"]:
from ..language import Language # noqa: F811
def before_to_disk(nlp: Language) -> Language: def before_to_disk(nlp: Language) -> Language:
if not callback: if not callback:
return nlp return nlp