From 56f8bc73ef1880ded2abe9da5a5ff26ca6babc20 Mon Sep 17 00:00:00 2001
From: Ines Montani
Date: Tue, 29 Sep 2020 15:23:34 +0200
Subject: [PATCH 1/4] Add more tests

---
 spacy/tests/pipeline/test_initialize.py | 32 +++++++++++++++++++++----
 1 file changed, 28 insertions(+), 4 deletions(-)

diff --git a/spacy/tests/pipeline/test_initialize.py b/spacy/tests/pipeline/test_initialize.py
index 974556b1c..1d2e7e5a3 100644
--- a/spacy/tests/pipeline/test_initialize.py
+++ b/spacy/tests/pipeline/test_initialize.py
@@ -9,34 +9,58 @@ from pydantic import StrictBool
 
 def test_initialize_arguments():
     name = "test_initialize_arguments"
 
+    class CustomTokenizer:
+        def __init__(self, tokenizer):
+            self.tokenizer = tokenizer
+            self.from_initialize = None
+
+        def __call__(self, text):
+            return self.tokenizer(text)
+
+        def initialize(self, get_examples, nlp, custom: int):
+            self.from_initialize = custom
+
     class Component:
         def __init__(self):
-            ...
+            self.from_initialize = None
 
         def initialize(
             self, get_examples, nlp, custom1: str, custom2: StrictBool = False
         ):
-            ...
+            self.from_initialize = (custom1, custom2)
 
     Language.factory(name, func=lambda nlp, name: Component())
     nlp = English()
+    nlp.tokenizer = CustomTokenizer(nlp.tokenizer)
     example = Example.from_dict(nlp("x"), {})
     get_examples = lambda: [example]
     nlp.add_pipe(name)
     # The settings here will typically come from the [initialize] block
     with pytest.raises(ConfigValidationError) as e:
         # Empty settings, no required custom1 argument
-        nlp.initialize(get_examples, settings={"components": {name: {}}})
+        settings = {"tokenizer": {"custom": 1}, "components": {name: {}}}
+        nlp.initialize(get_examples, settings=settings)
     errors = e.value.errors
     assert len(errors) == 1
     assert errors[0]["loc"] == ("custom1",)
     assert errors[0]["type"] == "value_error.missing"
     with pytest.raises(ConfigValidationError) as e:
         # Wrong type
-        settings = {"components": {name: {"custom1": "x", "custom2": 1}}}
+        settings = {
+            "tokenizer": {"custom": 1},
+            "components": {name: {"custom1": "x", "custom2": 1}},
+        }
         nlp.initialize(get_examples, settings=settings)
     errors = e.value.errors
     assert len(errors) == 1
     assert errors[0]["loc"] == ("custom2",)
     assert errors[0]["type"] == "value_error.strictbool"
+    settings = {
+        "tokenizer": {"custom": 1},
+        "components": {name: {"custom1": "x", "custom2": True}},
+    }
+    nlp.initialize(get_examples, settings=settings)
+    assert nlp.tokenizer.from_initialize == 1
+    pipe = nlp.get_pipe(name)
+    assert pipe.from_initialize == ("x", True)
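
The test above exercises `validate_init_settings`, which checks each settings dict against the signature of the corresponding `initialize` method. A minimal sketch of how this kind of signature-based validation can be built with `inspect` and pydantic's `create_model` (an illustrative approximation, not spaCy's actual implementation; `validate_against_signature` is a hypothetical name):

    import inspect
    from typing import Any

    from pydantic import StrictBool, ValidationError, create_model

    def validate_against_signature(func, settings: dict):
        # Mirror the keyword arguments of `func` as pydantic model fields,
        # skipping the standard `get_examples` and `nlp` arguments.
        fields = {}
        for arg, param in inspect.signature(func).parameters.items():
            if arg in ("self", "get_examples", "nlp"):
                continue
            annotation = Any if param.annotation is param.empty else param.annotation
            default = ... if param.default is param.empty else param.default
            fields[arg] = (annotation, default)
        model = create_model("InitSettings", **fields)
        return model(**settings)  # raises ValidationError on bad settings

    def initialize(get_examples, nlp, custom1: str, custom2: StrictBool = False):
        ...

    try:
        validate_against_signature(initialize, {"custom2": 1})
    except ValidationError as e:
        print(e.errors())

This lines up with the error types asserted in the test: a missing required argument is reported as `value_error.missing`, and a non-bool value for a `StrictBool` parameter as `value_error.strictbool`.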
From 63d15981377aa207591380ba6eaf816c7696830c Mon Sep 17 00:00:00 2001
From: Ines Montani
Date: Tue, 29 Sep 2020 16:05:48 +0200
Subject: [PATCH 2/4] Simplify config use in Language.initialize

---
 spacy/language.py                       | 25 +++++++++++-----
 spacy/tests/pipeline/test_initialize.py | 25 +++++++++-------
 spacy/training/initialize.py            | 38 ++++++++++---------
 spacy/training/loop.py                  | 20 +++++++------
 4 files changed, 59 insertions(+), 49 deletions(-)

diff --git a/spacy/language.py b/spacy/language.py
index 8ef2f1d61..8d546529d 100644
--- a/spacy/language.py
+++ b/spacy/language.py
@@ -18,6 +18,7 @@ from .tokens.underscore import Underscore
 from .vocab import Vocab, create_vocab
 from .pipe_analysis import validate_attrs, analyze_pipes, print_pipe_analysis
 from .training import Example, validate_examples
+from .training.initialize import init_vocab, init_tok2vec
 from .scorer import Scorer
 from .util import registry, SimpleFrozenList
 from .util import SimpleFrozenDict, combine_score_weights, CONFIG_SECTION_ORDER
@@ -27,7 +28,8 @@ from .lang.punctuation import TOKENIZER_INFIXES
 from .tokens import Doc
 from .tokenizer import Tokenizer
 from .errors import Errors, Warnings
-from .schemas import ConfigSchema, ConfigSchemaNlp, validate_init_settings
+from .schemas import ConfigSchema, ConfigSchemaNlp, ConfigSchemaInit
+from .schemas import ConfigSchemaPretrain, validate_init_settings
 from .git_info import GIT_VERSION
 from . import util
 from . import about
@@ -1161,7 +1163,6 @@ class Language:
         self,
         get_examples: Optional[Callable[[], Iterable[Example]]] = None,
         *,
-        settings: Dict[str, Dict[str, Any]] = SimpleFrozenDict(),
         sgd: Optional[Optimizer] = None,
     ) -> Optimizer:
         """Initialize the pipe for training, using data examples if available.
@@ -1198,28 +1199,38 @@ class Language:
             if not valid_examples:
                 err = Errors.E930.format(name="Language", obj="empty list")
                 raise ValueError(err)
+        # Make sure the config is interpolated so we can resolve subsections
+        config = self.config.interpolate()
+        # These are the settings provided in the [initialize] block in the config
+        I = registry.resolve(config["initialize"], schema=ConfigSchemaInit)
+        V = I["vocab"]
+        init_vocab(
+            self, data=V["data"], lookups=V["lookups"], vectors=V["vectors"],
+        )
+        pretrain_cfg = config.get("pretraining")
+        if pretrain_cfg:
+            P = registry.resolve(pretrain_cfg, schema=ConfigSchemaPretrain)
+            init_tok2vec(self, P, V)
         if self.vocab.vectors.data.shape[1] >= 1:
             ops = get_current_ops()
             self.vocab.vectors.data = ops.asarray(self.vocab.vectors.data)
-        self._optimizer = sgd
         if hasattr(self.tokenizer, "initialize"):
-            tok_settings = settings.get("tokenizer", {})
             tok_settings = validate_init_settings(
                 self.tokenizer.initialize,
-                tok_settings,
+                I["tokenizer"],
                 section="tokenizer",
                 name="tokenizer",
             )
             self.tokenizer.initialize(get_examples, nlp=self, **tok_settings)
-        proc_settings = settings.get("components", {})
         for name, proc in self.pipeline:
             if hasattr(proc, "initialize"):
-                p_settings = proc_settings.get(name, {})
+                p_settings = I["components"].get(name, {})
                 p_settings = validate_init_settings(
                     proc.initialize, p_settings, section="components", name=name
                 )
                 proc.initialize(get_examples, nlp=self, **p_settings)
         self._link_components()
+        self._optimizer = sgd
         if sgd is not None:
             self._optimizer = sgd
         elif self._optimizer is None:
diff --git a/spacy/tests/pipeline/test_initialize.py b/spacy/tests/pipeline/test_initialize.py
index 1d2e7e5a3..b6c22ee09 100644
--- a/spacy/tests/pipeline/test_initialize.py
+++ b/spacy/tests/pipeline/test_initialize.py
@@ -37,30 +37,33 @@ def test_initialize_arguments():
     get_examples = lambda: [example]
     nlp.add_pipe(name)
     # The settings here will typically come from the [initialize] block
+    init_cfg = {"tokenizer": {"custom": 1}, "components": {name: {}}}
+    nlp.config["initialize"].update(init_cfg)
     with pytest.raises(ConfigValidationError) as e:
-        # Empty settings, no required custom1 argument
-        settings = {"tokenizer": {"custom": 1}, "components": {name: {}}}
-        nlp.initialize(get_examples, settings=settings)
+        # Empty config for component, no required custom1 argument
+        nlp.initialize(get_examples)
     errors = e.value.errors
     assert len(errors) == 1
     assert errors[0]["loc"] == ("custom1",)
     assert errors[0]["type"] == "value_error.missing"
+    init_cfg = {
+        "tokenizer": {"custom": 1},
+        "components": {name: {"custom1": "x", "custom2": 1}},
+    }
+    nlp.config["initialize"].update(init_cfg)
     with pytest.raises(ConfigValidationError) as e:
-        # Wrong type
-        settings = {
-            "tokenizer": {"custom": 1},
-            "components": {name: {"custom1": "x", "custom2": 1}},
-        }
-        nlp.initialize(get_examples, settings=settings)
+        # Wrong type of custom 2
+        nlp.initialize(get_examples)
     errors = e.value.errors
     assert len(errors) == 1
     assert errors[0]["loc"] == ("custom2",)
     assert errors[0]["type"] == "value_error.strictbool"
-    settings = {
+    init_cfg = {
         "tokenizer": {"custom": 1},
         "components": {name: {"custom1": "x", "custom2": True}},
     }
-    nlp.initialize(get_examples, settings=settings)
+    nlp.config["initialize"].update(init_cfg)
+    nlp.initialize(get_examples)
     assert nlp.tokenizer.from_initialize == 1
     pipe = nlp.get_pipe(name)
     assert pipe.from_initialize == ("x", True)
diff --git a/spacy/training/initialize.py b/spacy/training/initialize.py
index b42732d48..9517c6c48 100644
--- a/spacy/training/initialize.py
+++ b/spacy/training/initialize.py
@@ -1,4 +1,4 @@
-from typing import Union, Dict, Optional, Any, List, IO
+from typing import Union, Dict, Optional, Any, List, IO, TYPE_CHECKING
 from thinc.api import Config, fix_random_seed, set_gpu_allocator
 from thinc.api import ConfigValidationError
 from pathlib import Path
@@ -11,16 +11,18 @@ import zipfile
 import tqdm
 
 from .loop import create_before_to_disk_callback
-from ..language import Language
 from ..lookups import Lookups
 from ..vectors import Vectors
 from ..errors import Errors
-from ..schemas import ConfigSchemaTraining, ConfigSchemaInit, ConfigSchemaPretrain
+from ..schemas import ConfigSchemaTraining, ConfigSchemaPretrain
 from ..util import registry, load_model_from_config, resolve_dot_names
 from ..util import load_model, ensure_path, OOV_RANK, DEFAULT_OOV_PROB
 
+if TYPE_CHECKING:
+    from ..language import Language  # noqa: F401
 
-def init_nlp(config: Config, *, use_gpu: int = -1, silent: bool = True) -> Language:
+
+def init_nlp(config: Config, *, use_gpu: int = -1, silent: bool = True) -> "Language":
     msg = Printer(no_print=silent)
     raw_config = config
     config = raw_config.interpolate()
@@ -38,11 +40,6 @@ def init_nlp(config: Config, *, use_gpu: int = -1, silent: bool = True) -> Langu
     T = registry.resolve(config["training"], schema=ConfigSchemaTraining)
     dot_names = [T["train_corpus"], T["dev_corpus"]]
     train_corpus, dev_corpus = resolve_dot_names(config, dot_names)
-    I = registry.resolve(config["initialize"], schema=ConfigSchemaInit)
-    V = I["vocab"]
-    init_vocab(
-        nlp, data=V["data"], lookups=V["lookups"], vectors=V["vectors"], silent=silent
-    )
     optimizer = T["optimizer"]
     before_to_disk = create_before_to_disk_callback(T["before_to_disk"])
     # Components that shouldn't be updated during training
@@ -55,16 +52,11 @@ def init_nlp(config: Config, *, use_gpu: int = -1, silent: bool = True) -> Langu
             msg.info(f"Resuming training for: {resume_components}")
             nlp.resume_training(sgd=optimizer)
     with nlp.select_pipes(disable=[*frozen_components, *resume_components]):
-        nlp.initialize(lambda: train_corpus(nlp), sgd=optimizer, settings=I)
+        nlp.initialize(lambda: train_corpus(nlp), sgd=optimizer)
     msg.good("Initialized pipeline components")
     # Verify the config after calling 'initialize' to ensure labels
     # are properly initialized
     verify_config(nlp)
-    if "pretraining" in config and config["pretraining"]:
-        P = registry.resolve(config["pretraining"], schema=ConfigSchemaPretrain)
-        loaded = add_tok2vec_weights(nlp, P, V)
-        if loaded and P["component"]:
-            msg.good(f"Loaded pretrained weights into component '{P['component']}'")
     nlp = before_to_disk(nlp)
     return nlp
@@ -75,13 +67,13 @@ def must_reinitialize(train_config: Config, init_config: Config) -> bool:
 
 
 def init_vocab(
-    nlp: Language,
+    nlp: "Language",
     *,
     data: Optional[Path] = None,
     lookups: Optional[Lookups] = None,
     vectors: Optional[str] = None,
     silent: bool = True,
-) -> Language:
+) -> "Language":
     msg = Printer(no_print=silent)
     if lookups:
         nlp.vocab.lookups = lookups
@@ -109,7 +101,7 @@ def init_vocab(
 
 
 def load_vectors_into_model(
-    nlp: Language, name: Union[str, Path], *, add_strings: bool = True
+    nlp: "Language", name: Union[str, Path], *, add_strings: bool = True
 ) -> None:
     """Load word vectors from an installed model or path into a model instance."""
     try:
@@ -132,8 +124,8 @@ def load_vectors_into_model(
             nlp.vocab.strings.add(vectors_nlp.vocab.strings[key])
 
 
-def add_tok2vec_weights(
-    nlp: Language, pretrain_config: Dict[str, Any], vocab_config: Dict[str, Any]
+def init_tok2vec(
+    nlp: "Language", pretrain_config: Dict[str, Any], vocab_config: Dict[str, Any]
 ) -> bool:
     # Load pretrained tok2vec weights - cf. CLI command 'pretrain'
     P = pretrain_config
@@ -171,7 +163,7 @@
     return False
 
 
-def verify_config(nlp: Language) -> None:
+def verify_config(nlp: "Language") -> None:
     """Perform additional checks based on the config, loaded nlp object and training
     data."""
     # TODO: maybe we should validate based on the actual components, the list
     # in config["nlp"]["pipeline"] instead?
@@ -182,7 +174,7 @@
         verify_textcat_config(nlp, pipe_config)
 
 
-def verify_textcat_config(nlp: Language, pipe_config: Dict[str, Any]) -> None:
+def verify_textcat_config(nlp: "Language", pipe_config: Dict[str, Any]) -> None:
     # if 'positive_label' is provided: double check whether it's in the data and
     # the task is binary
     if pipe_config.get("positive_label"):
@@ -211,7 +203,7 @@ def get_sourced_components(config: Union[Dict[str, Any], Config]) -> List[str]:
 
 
 def convert_vectors(
-    nlp: Language,
+    nlp: "Language",
     vectors_loc: Optional[Path],
     *,
     truncate: int,
diff --git a/spacy/training/loop.py b/spacy/training/loop.py
index 5153be66c..41e6464e0 100644
--- a/spacy/training/loop.py
+++ b/spacy/training/loop.py
@@ -1,5 +1,5 @@
 from typing import List, Callable, Tuple, Dict, Iterable, Iterator, Union, Any
-from typing import Optional
+from typing import Optional, TYPE_CHECKING
 from pathlib import Path
 from timeit import default_timer as timer
 from thinc.api import Optimizer, Config, constant, fix_random_seed, set_gpu_allocator
@@ -9,13 +9,15 @@ from wasabi import Printer
 
 from .example import Example
 from ..schemas import ConfigSchemaTraining
-from ..language import Language
 from ..errors import Errors
 from ..util import resolve_dot_names, registry
 
+if TYPE_CHECKING:
+    from ..language import Language  # noqa: F401
+
 
 def train(
-    nlp: Language,
+    nlp: "Language",
     output_path: Optional[Path] = None,
     *,
     use_gpu: int = -1,
@@ -110,7 +112,7 @@ def train(
 
 
 def train_while_improving(
-    nlp: Language,
+    nlp: "Language",
     optimizer: Optimizer,
     train_data,
     evaluate,
@@ -233,7 +235,7 @@ def subdivide_batch(batch, accumulate_gradient):
 
 
 def create_evaluation_callback(
-    nlp: Language, dev_corpus: Callable, weights: Dict[str, float]
+    nlp: "Language", dev_corpus: Callable, weights: Dict[str, float]
 ) -> Callable[[], Tuple[float, Dict[str, float]]]:
     weights = {key: value for key, value in weights.items() if value is not None}
 
@@ -277,7 +279,7 @@ def create_train_batches(
 
 
 def update_meta(
-    training: Union[Dict[str, Any], Config], nlp: Language, info: Dict[str, Any]
+    training: Union[Dict[str, Any], Config], nlp: "Language", info: Dict[str, Any]
 ) -> None:
     nlp.meta["performance"] = {}
     for metric in training["score_weights"]:
@@ -288,8 +290,10 @@ def update_meta(
 
 
 def create_before_to_disk_callback(
-    callback: Optional[Callable[[Language], Language]]
-) -> Callable[[Language], Language]:
+    callback: Optional[Callable[["Language"], "Language"]]
+) -> Callable[["Language"], "Language"]:
+    from ..language import Language  # noqa: F811
+
     def before_to_disk(nlp: Language) -> Language:
         if not callback:
             return nlp
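
Both training modules in this patch move their `Language` imports behind the `typing.TYPE_CHECKING` guard: `spacy/language.py` now imports from `spacy/training/initialize.py`, so the old module-level `from ..language import Language` would have become a circular import. The guard keeps the name available to static type checkers while skipping the import at runtime, with string annotations standing in for the real class. A generic, self-contained sketch of the pattern (the module names `language.py` and `loop.py` here are stand-ins):

    # loop.py
    from typing import TYPE_CHECKING

    if TYPE_CHECKING:
        # Evaluated by static type checkers only, never at runtime, so
        # language.py can import loop.py without creating an import cycle.
        from language import Language  # noqa: F401

    def train(nlp: "Language") -> None:  # string annotation, resolved lazily
        ...

Where the class is genuinely needed at runtime, as in `create_before_to_disk_callback` above, the import is repeated locally inside the function body instead, after both modules have finished loading.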
From aa2a6882d064924165ee697cac0e431a92e64eb2 Mon Sep 17 00:00:00 2001
From: Ines Montani
Date: Tue, 29 Sep 2020 16:08:39 +0200
Subject: [PATCH 3/4] Fix logging

---
 spacy/cli/init_pipeline.py   | 11 ++++++++---
 spacy/training/initialize.py | 35 ++++++++++++++----------------------
 2 files changed, 22 insertions(+), 24 deletions(-)

diff --git a/spacy/cli/init_pipeline.py b/spacy/cli/init_pipeline.py
index 0e9de0eb4..ac1cdb7be 100644
--- a/spacy/cli/init_pipeline.py
+++ b/spacy/cli/init_pipeline.py
@@ -19,13 +19,18 @@ def init_vectors_cli(
     prune: int = Opt(-1, "--prune", "-p", help="Optional number of vectors to prune to"),
     truncate: int = Opt(0, "--truncate", "-t", help="Optional number of vectors to truncate to when reading in vectors file"),
     name: Optional[str] = Opt(None, "--name", "-n", help="Optional name for the word vectors, e.g. en_core_web_lg.vectors"),
+    verbose: bool = Opt(False, "--verbose", "-V", "-VV", help="Display more information for debugging purposes"),
     # fmt: on
 ):
+    """Convert word vectors for use with spaCy. Will export an nlp object that
+    you can use in the [initialize.vocab] block of your config to initialize
+    a model with vectors.
+    """
+    util.logger.setLevel(logging.DEBUG if verbose else logging.ERROR)
     msg.info(f"Creating blank nlp object for language '{lang}'")
     nlp = util.get_lang_class(lang)()
-    convert_vectors(
-        nlp, vectors_loc, truncate=truncate, prune=prune, name=name, silent=False
-    )
+    convert_vectors(nlp, vectors_loc, truncate=truncate, prune=prune, name=name)
+    msg.good(f"Successfully converted {len(nlp.vocab.vectors)} vectors")
     nlp.to_disk(output_dir)
     msg.good(
         "Saved nlp object with vectors to output directory. You can now use the "
diff --git a/spacy/training/initialize.py b/spacy/training/initialize.py
index 9517c6c48..ef0938321 100644
--- a/spacy/training/initialize.py
+++ b/spacy/training/initialize.py
@@ -2,7 +2,6 @@ from typing import Union, Dict, Optional, Any, List, IO, TYPE_CHECKING
 from thinc.api import Config, fix_random_seed, set_gpu_allocator
 from thinc.api import ConfigValidationError
 from pathlib import Path
-from wasabi import Printer
 import srsly
 import numpy
 import tarfile
@@ -14,16 +13,15 @@ import zipfile
 
 from .loop import create_before_to_disk_callback
 from ..lookups import Lookups
 from ..vectors import Vectors
 from ..errors import Errors
-from ..schemas import ConfigSchemaTraining, ConfigSchemaPretrain
+from ..schemas import ConfigSchemaTraining
-from ..util import registry, load_model_from_config, resolve_dot_names
+from ..util import registry, load_model_from_config, resolve_dot_names, logger
 from ..util import load_model, ensure_path, OOV_RANK, DEFAULT_OOV_PROB
 
 if TYPE_CHECKING:
     from ..language import Language  # noqa: F401
 
 
-def init_nlp(config: Config, *, use_gpu: int = -1, silent: bool = True) -> "Language":
-    msg = Printer(no_print=silent)
+def init_nlp(config: Config, *, use_gpu: int = -1) -> "Language":
     raw_config = config
     config = raw_config.interpolate()
     if config["training"]["seed"] is not None:
@@ -34,7 +32,7 @@ def init_nlp(config: Config, *, use_gpu: int = -1, silent: bool = True) -> "Lang
     # Use original config here before it's resolved to functions
     sourced_components = get_sourced_components(config)
     nlp = load_model_from_config(raw_config, auto_fill=True)
-    msg.good("Set up nlp object from config")
+    logger.info("Set up nlp object from config")
     config = nlp.config.interpolate()
     # Resolve all training-relevant sections using the filled nlp config
     T = registry.resolve(config["training"], schema=ConfigSchemaTraining)
@@ -46,14 +44,14 @@ def init_nlp(config: Config, *, use_gpu: int = -1, silent: bool = True) -> "Lang
     frozen_components = T["frozen_components"]
     # Sourced components that require resume_training
     resume_components = [p for p in sourced_components if p not in frozen_components]
-    msg.info(f"Pipeline: {nlp.pipe_names}")
+    logger.info(f"Pipeline: {nlp.pipe_names}")
     if resume_components:
         with nlp.select_pipes(enable=resume_components):
-            msg.info(f"Resuming training for: {resume_components}")
+            logger.info(f"Resuming training for: {resume_components}")
             nlp.resume_training(sgd=optimizer)
     with nlp.select_pipes(disable=[*frozen_components, *resume_components]):
         nlp.initialize(lambda: train_corpus(nlp), sgd=optimizer)
-    msg.good("Initialized pipeline components")
+    logger.good("Initialized pipeline components")
     # Verify the config after calling 'initialize' to ensure labels
     # are properly initialized
     verify_config(nlp)
@@ -72,12 +70,10 @@ def init_vocab(
     data: Optional[Path] = None,
     lookups: Optional[Lookups] = None,
     vectors: Optional[str] = None,
-    silent: bool = True,
 ) -> "Language":
-    msg = Printer(no_print=silent)
     if lookups:
         nlp.vocab.lookups = lookups
-        msg.good(f"Added vocab lookups: {', '.join(lookups.tables)}")
+        logger.info(f"Added vocab lookups: {', '.join(lookups.tables)}")
     data_path = ensure_path(data)
     if data_path is not None:
         lex_attrs = srsly.read_jsonl(data_path)
@@ -93,11 +89,11 @@ def init_vocab(
     else:
         oov_prob = DEFAULT_OOV_PROB
     nlp.vocab.cfg.update({"oov_prob": oov_prob})
-    msg.good(f"Added {len(nlp.vocab)} lexical entries to the vocab")
-    msg.good("Created vocabulary")
+    logger.good(f"Added {len(nlp.vocab)} lexical entries to the vocab")
+    logger.good("Created vocabulary")
     if vectors is not None:
         load_vectors_into_model(nlp, vectors)
-        msg.good(f"Added vectors: {vectors}")
+        logger.good(f"Added vectors: {vectors}")
 
 
 def load_vectors_into_model(
@@ -209,9 +205,7 @@ def convert_vectors(
     truncate: int,
     prune: int,
     name: Optional[str] = None,
-    silent: bool = True,
 ) -> None:
-    msg = Printer(no_print=silent)
     vectors_loc = ensure_path(vectors_loc)
     if vectors_loc and vectors_loc.parts[-1].endswith(".npz"):
         nlp.vocab.vectors = Vectors(data=numpy.load(vectors_loc.open("rb")))
@@ -220,9 +214,9 @@ def convert_vectors(
             nlp.vocab.vectors.add(lex.orth, row=lex.rank)
     else:
         if vectors_loc:
-            with msg.loading(f"Reading vectors from {vectors_loc}"):
-                vectors_data, vector_keys = read_vectors(vectors_loc, truncate)
-            msg.good(f"Loaded vectors from {vectors_loc}")
+            logger.info(f"Reading vectors from {vectors_loc}")
+            vectors_data, vector_keys = read_vectors(vectors_loc, truncate)
+            logger.info(f"Loaded vectors from {vectors_loc}")
         else:
             vectors_data, vector_keys = (None, None)
     if vector_keys is not None:
@@ -239,7 +233,6 @@ def convert_vectors(
         nlp.meta["vectors"]["name"] = nlp.vocab.vectors.name
     if prune >= 1:
         nlp.vocab.prune_vectors(prune)
-    msg.good(f"Successfully converted {len(nlp.vocab.vectors)} vectors")
 
 
 def read_vectors(vectors_loc: Path, truncate_vectors: int):
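
One wrinkle in this patch: wasabi's `Printer` has a `good()` method for success messages, but `util.logger`, a standard library logger (as the `util.logger.setLevel(...)` call in `init_vectors_cli` indicates), has no such method, so the `logger.good(...)` calls left in `init_nlp`, `init_vocab` and `convert_vectors` raise as soon as they execute. Assuming a plain `logging.Logger`:

    import logging

    logger = logging.getLogger("spacy")
    logger.info("Created vocabulary")  # fine: info/debug/warning/error exist
    logger.good("Created vocabulary")  # AttributeError: 'Logger' object has no attribute 'good'

[PATCH 4/4] below replaces the remaining `logger.good` calls with `logger.info`.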
From 978ab54a84262682f75b8bb0aa196cd4f93976aa Mon Sep 17 00:00:00 2001
From: Ines Montani
Date: Tue, 29 Sep 2020 16:22:41 +0200
Subject: [PATCH 4/4] Fix logging

---
 spacy/training/initialize.py | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/spacy/training/initialize.py b/spacy/training/initialize.py
index ef0938321..862c76448 100644
--- a/spacy/training/initialize.py
+++ b/spacy/training/initialize.py
@@ -51,7 +51,7 @@ def init_nlp(config: Config, *, use_gpu: int = -1) -> "Language":
             nlp.resume_training(sgd=optimizer)
     with nlp.select_pipes(disable=[*frozen_components, *resume_components]):
         nlp.initialize(lambda: train_corpus(nlp), sgd=optimizer)
-    logger.good("Initialized pipeline components")
+    logger.info("Initialized pipeline components")
     # Verify the config after calling 'initialize' to ensure labels
     # are properly initialized
     verify_config(nlp)
@@ -89,11 +89,11 @@ def init_vocab(
     else:
         oov_prob = DEFAULT_OOV_PROB
     nlp.vocab.cfg.update({"oov_prob": oov_prob})
-    logger.good(f"Added {len(nlp.vocab)} lexical entries to the vocab")
-    logger.good("Created vocabulary")
+    logger.info(f"Added {len(nlp.vocab)} lexical entries to the vocab")
+    logger.info("Created vocabulary")
     if vectors is not None:
         load_vectors_into_model(nlp, vectors)
-        logger.good(f"Added vectors: {vectors}")
+        logger.info(f"Added vectors: {vectors}")
 
 
 def load_vectors_into_model(
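
With the series applied, status output from `init_nlp`, `init_vocab` and `convert_vectors` flows through a single logger, so a CLI command can control verbosity in one place, as `init_vectors_cli` does with its new `--verbose` flag. A minimal sketch of that pattern, assuming the logger is `logging.getLogger("spacy")` (`setup_cli_logging` is a hypothetical helper, not part of the patches):

    import logging

    def setup_cli_logging(verbose: bool) -> None:
        # Emit bare messages to the console; surface DEBUG/INFO records only
        # when the user asked for verbose output.
        logging.basicConfig(format="%(message)s")
        logging.getLogger("spacy").setLevel(
            logging.DEBUG if verbose else logging.ERROR
        )

    setup_cli_logging(verbose=True)
    logging.getLogger("spacy").info("Reading vectors from vectors.txt")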