From 39b178999c67fc8512b93a8c83ca90676351d7c9 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Sun, 27 Sep 2020 20:13:38 +0200 Subject: [PATCH 01/66] Tmp notes --- spacy/cli/init_model.py | 295 +-------------------------------------- spacy/cli/train.py | 78 ++--------- spacy/default_config.cfg | 9 ++ 3 files changed, 21 insertions(+), 361 deletions(-) diff --git a/spacy/cli/init_model.py b/spacy/cli/init_model.py index 05bf99ccd..6decb6172 100644 --- a/spacy/cli/init_model.py +++ b/spacy/cli/init_model.py @@ -13,18 +13,6 @@ import warnings from wasabi import msg, Printer import typer -from ._util import app, init_cli, Arg, Opt -from ..vectors import Vectors -from ..errors import Errors, Warnings -from ..language import Language -from ..util import ensure_path, get_lang_class, load_model, OOV_RANK - -try: - import ftfy -except ImportError: - ftfy = None - - DEFAULT_OOV_PROB = -20 @@ -63,7 +51,7 @@ def init_model_cli( "'python -m spacy init --help' for an overview of the other " "available initialization commands." ) - init_model( + init_vocab( lang, output_dir, freqs_loc=freqs_loc, @@ -77,284 +65,3 @@ def init_model_cli( base_model=base_model, silent=False, ) - - -def init_model( - lang: str, - output_dir: Path, - freqs_loc: Optional[Path] = None, - clusters_loc: Optional[Path] = None, - jsonl_loc: Optional[Path] = None, - vectors_loc: Optional[Path] = None, - prune_vectors: int = -1, - truncate_vectors: int = 0, - vectors_name: Optional[str] = None, - model_name: Optional[str] = None, - base_model: Optional[str] = None, - silent: bool = True, -) -> Language: - msg = Printer(no_print=silent, pretty=not silent) - if jsonl_loc is not None: - if freqs_loc is not None or clusters_loc is not None: - settings = ["-j"] - if freqs_loc: - settings.append("-f") - if clusters_loc: - settings.append("-c") - msg.warn( - "Incompatible arguments", - "The -f and -c arguments are deprecated, and not compatible " - "with the -j argument, which should specify the same " - "information. 
Either merge the frequencies and clusters data " - "into the JSONL-formatted file (recommended), or use only the " - "-f and -c files, without the other lexical attributes.", - ) - jsonl_loc = ensure_path(jsonl_loc) - lex_attrs = srsly.read_jsonl(jsonl_loc) - else: - clusters_loc = ensure_path(clusters_loc) - freqs_loc = ensure_path(freqs_loc) - if freqs_loc is not None and not freqs_loc.exists(): - msg.fail("Can't find words frequencies file", freqs_loc, exits=1) - lex_attrs = read_attrs_from_deprecated(msg, freqs_loc, clusters_loc) - - with msg.loading("Creating blank pipeline..."): - nlp = create_model(lang, lex_attrs, name=model_name, base_model=base_model) - - msg.good("Successfully created blank pipeline") - if vectors_loc is not None: - add_vectors( - msg, nlp, vectors_loc, truncate_vectors, prune_vectors, vectors_name - ) - vec_added = len(nlp.vocab.vectors) - lex_added = len(nlp.vocab) - msg.good( - "Sucessfully compiled vocab", f"{lex_added} entries, {vec_added} vectors", - ) - if not output_dir.exists(): - output_dir.mkdir() - nlp.to_disk(output_dir) - return nlp - - -def open_file(loc: Union[str, Path]) -> IO: - """Handle .gz, .tar.gz or unzipped files""" - loc = ensure_path(loc) - if tarfile.is_tarfile(str(loc)): - return tarfile.open(str(loc), "r:gz") - elif loc.parts[-1].endswith("gz"): - return (line.decode("utf8") for line in gzip.open(str(loc), "r")) - elif loc.parts[-1].endswith("zip"): - zip_file = zipfile.ZipFile(str(loc)) - names = zip_file.namelist() - file_ = zip_file.open(names[0]) - return (line.decode("utf8") for line in file_) - else: - return loc.open("r", encoding="utf8") - - -def read_attrs_from_deprecated( - msg: Printer, freqs_loc: Optional[Path], clusters_loc: Optional[Path] -) -> List[Dict[str, Any]]: - if freqs_loc is not None: - with msg.loading("Counting frequencies..."): - probs, _ = read_freqs(freqs_loc) - msg.good("Counted frequencies") - else: - probs, _ = ({}, DEFAULT_OOV_PROB) # noqa: F841 - if clusters_loc: - with msg.loading("Reading clusters..."): - clusters = read_clusters(clusters_loc) - msg.good("Read clusters") - else: - clusters = {} - lex_attrs = [] - sorted_probs = sorted(probs.items(), key=lambda item: item[1], reverse=True) - if len(sorted_probs): - for i, (word, prob) in tqdm(enumerate(sorted_probs)): - attrs = {"orth": word, "id": i, "prob": prob} - # Decode as a little-endian string, so that we can do & 15 to get - # the first 4 bits. 
See _parse_features.pyx - if word in clusters: - attrs["cluster"] = int(clusters[word][::-1], 2) - else: - attrs["cluster"] = 0 - lex_attrs.append(attrs) - return lex_attrs - - -def create_model( - lang: str, - lex_attrs: List[Dict[str, Any]], - name: Optional[str] = None, - base_model: Optional[Union[str, Path]] = None, -) -> Language: - if base_model: - nlp = load_model(base_model) - # keep the tokenizer but remove any existing pipeline components due to - # potentially conflicting vectors - for pipe in nlp.pipe_names: - nlp.remove_pipe(pipe) - else: - lang_class = get_lang_class(lang) - nlp = lang_class() - for lexeme in nlp.vocab: - lexeme.rank = OOV_RANK - for attrs in lex_attrs: - if "settings" in attrs: - continue - lexeme = nlp.vocab[attrs["orth"]] - lexeme.set_attrs(**attrs) - if len(nlp.vocab): - oov_prob = min(lex.prob for lex in nlp.vocab) - 1 - else: - oov_prob = DEFAULT_OOV_PROB - nlp.vocab.cfg.update({"oov_prob": oov_prob}) - if name: - nlp.meta["name"] = name - return nlp - - -def add_vectors( - msg: Printer, - nlp: Language, - vectors_loc: Optional[Path], - truncate_vectors: int, - prune_vectors: int, - name: Optional[str] = None, -) -> None: - vectors_loc = ensure_path(vectors_loc) - if vectors_loc and vectors_loc.parts[-1].endswith(".npz"): - nlp.vocab.vectors = Vectors(data=numpy.load(vectors_loc.open("rb"))) - for lex in nlp.vocab: - if lex.rank and lex.rank != OOV_RANK: - nlp.vocab.vectors.add(lex.orth, row=lex.rank) - else: - if vectors_loc: - with msg.loading(f"Reading vectors from {vectors_loc}"): - vectors_data, vector_keys = read_vectors( - msg, vectors_loc, truncate_vectors - ) - msg.good(f"Loaded vectors from {vectors_loc}") - else: - vectors_data, vector_keys = (None, None) - if vector_keys is not None: - for word in vector_keys: - if word not in nlp.vocab: - nlp.vocab[word] - if vectors_data is not None: - nlp.vocab.vectors = Vectors(data=vectors_data, keys=vector_keys) - if name is None: - # TODO: Is this correct? Does this matter? - nlp.vocab.vectors.name = f"{nlp.meta['lang']}_{nlp.meta['name']}.vectors" - else: - nlp.vocab.vectors.name = name - nlp.meta["vectors"]["name"] = nlp.vocab.vectors.name - if prune_vectors >= 1: - nlp.vocab.prune_vectors(prune_vectors) - - -def read_vectors(msg: Printer, vectors_loc: Path, truncate_vectors: int): - f = open_file(vectors_loc) - f = ensure_shape(f) - shape = tuple(int(size) for size in next(f).split()) - if truncate_vectors >= 1: - shape = (truncate_vectors, shape[1]) - vectors_data = numpy.zeros(shape=shape, dtype="f") - vectors_keys = [] - for i, line in enumerate(tqdm(f)): - line = line.rstrip() - pieces = line.rsplit(" ", vectors_data.shape[1]) - word = pieces.pop(0) - if len(pieces) != vectors_data.shape[1]: - msg.fail(Errors.E094.format(line_num=i, loc=vectors_loc), exits=1) - vectors_data[i] = numpy.asarray(pieces, dtype="f") - vectors_keys.append(word) - if i == truncate_vectors - 1: - break - return vectors_data, vectors_keys - - -def ensure_shape(lines): - """Ensure that the first line of the data is the vectors shape. - - If it's not, we read in the data and output the shape as the first result, - so that the reader doesn't have to deal with the problem. - """ - first_line = next(lines) - try: - shape = tuple(int(size) for size in first_line.split()) - except ValueError: - shape = None - if shape is not None: - # All good, give the data - yield first_line - yield from lines - else: - # Figure out the shape, make it the first value, and then give the - # rest of the data. 
- width = len(first_line.split()) - 1 - captured = [first_line] + list(lines) - length = len(captured) - yield f"{length} {width}" - yield from captured - - -def read_freqs( - freqs_loc: Path, max_length: int = 100, min_doc_freq: int = 5, min_freq: int = 50 -): - counts = PreshCounter() - total = 0 - with freqs_loc.open() as f: - for i, line in enumerate(f): - freq, doc_freq, key = line.rstrip().split("\t", 2) - freq = int(freq) - counts.inc(i + 1, freq) - total += freq - counts.smooth() - log_total = math.log(total) - probs = {} - with freqs_loc.open() as f: - for line in tqdm(f): - freq, doc_freq, key = line.rstrip().split("\t", 2) - doc_freq = int(doc_freq) - freq = int(freq) - if doc_freq >= min_doc_freq and freq >= min_freq and len(key) < max_length: - try: - word = literal_eval(key) - except SyntaxError: - # Take odd strings literally. - word = literal_eval(f"'{key}'") - smooth_count = counts.smoother(int(freq)) - probs[word] = math.log(smooth_count) - log_total - oov_prob = math.log(counts.smoother(0)) - log_total - return probs, oov_prob - - -def read_clusters(clusters_loc: Path) -> dict: - clusters = {} - if ftfy is None: - warnings.warn(Warnings.W004) - with clusters_loc.open() as f: - for line in tqdm(f): - try: - cluster, word, freq = line.split() - if ftfy is not None: - word = ftfy.fix_text(word) - except ValueError: - continue - # If the clusterer has only seen the word a few times, its - # cluster is unreliable. - if int(freq) >= 3: - clusters[word] = cluster - else: - clusters[word] = "0" - # Expand clusters with re-casing - for word, cluster in list(clusters.items()): - if word.lower() not in clusters: - clusters[word.lower()] = cluster - if word.title() not in clusters: - clusters[word.title()] = cluster - if word.upper() not in clusters: - clusters[word.upper()] = cluster - return clusters diff --git a/spacy/cli/train.py b/spacy/cli/train.py index 5fc4ff035..bb1bba4d5 100644 --- a/spacy/cli/train.py +++ b/spacy/cli/train.py @@ -32,6 +32,7 @@ def train_cli( verbose: bool = Opt(False, "--verbose", "-V", "-VV", help="Display more information for debugging purposes"), use_gpu: int = Opt(-1, "--gpu-id", "-g", help="GPU ID or -1 for CPU"), resume: bool = Opt(False, "--resume", "-R", help="Resume training"), + dave_path: Optional[Path] = Opt(None, "--dave", "-D", help="etc etc"), # fmt: on ): """ @@ -52,9 +53,12 @@ def train_cli( verify_cli_args(config_path, output_path) overrides = parse_config_overrides(ctx.args) import_code(code_path) + if prepared is None: + prepare(config_path, output_path / "prepared", config_overrides=overrides) train( config_path, output_path=output_path, + dave_path=dave_path, config_overrides=overrides, use_gpu=use_gpu, resume_training=resume, @@ -62,8 +66,7 @@ def train_cli( def train( - config_path: Path, - output_path: Optional[Path] = None, + output_path: Path, config_overrides: Dict[str, Any] = {}, use_gpu: int = -1, resume_training: bool = False, @@ -74,73 +77,14 @@ def train( else: msg.info("Using CPU") msg.info(f"Loading config and nlp from: {config_path}") + # TODO: The details of this will change + dave_path = output_path / "dave" + config_path = dave_path / "config.cfg" with show_validation_error(config_path): - config = util.load_config( - config_path, overrides=config_overrides, interpolate=True - ) - # Keep a second un-interpolated config so we can preserve variables in - # the final nlp object we train and serialize - raw_config = util.load_config(config_path, overrides=config_overrides) - if config["training"]["seed"] is not None: - 
fix_random_seed(config["training"]["seed"]) - allocator = config["training"]["gpu_allocator"] - if use_gpu >= 0 and allocator: - set_gpu_allocator(allocator) - # Use original config here before it's resolved to functions - sourced_components = get_sourced_components(config) - with show_validation_error(config_path): - nlp, config = util.load_model_from_config(raw_config) - util.load_vocab_data_into_model(nlp, lookups=config["training"]["lookups"]) - if config["training"]["vectors"] is not None: - add_vectors(nlp, config["training"]["vectors"]) - raw_text, tag_map, morph_rules, weights_data = load_from_paths(config) - T_cfg = config["training"] - optimizer = T_cfg["optimizer"] - train_corpus = dot_to_object(config, T_cfg["train_corpus"]) - dev_corpus = dot_to_object(config, T_cfg["dev_corpus"]) - batcher = T_cfg["batcher"] - train_logger = T_cfg["logger"] - before_to_disk = create_before_to_disk_callback(T_cfg["before_to_disk"]) - # Components that shouldn't be updated during training - frozen_components = T_cfg["frozen_components"] - # Sourced components that require resume_training - resume_components = [p for p in sourced_components if p not in frozen_components] - msg.info(f"Pipeline: {nlp.pipe_names}") - if resume_components: - with nlp.select_pipes(enable=resume_components): - msg.info(f"Resuming training for: {resume_components}") - nlp.resume_training(sgd=optimizer) - with nlp.select_pipes(disable=[*frozen_components, *resume_components]): - nlp.begin_training(lambda: train_corpus(nlp), sgd=optimizer) - # Verify the config after calling 'begin_training' to ensure labels are properly initialized - verify_config(nlp) + config = fill_config_etc_etc(config_path) + nlp = make_and_load_nlp_etc_etc(config, dave_path) + optimizer, train_corpus, dev_corpus, score_weights, T_cfg = resolve_more_things_etc_etc(config) - if tag_map: - # Replace tag map with provided mapping - nlp.vocab.morphology.load_tag_map(tag_map) - if morph_rules: - # Load morph rules - nlp.vocab.morphology.load_morph_exceptions(morph_rules) - - # Load pretrained tok2vec weights - cf. CLI command 'pretrain' - if weights_data is not None: - tok2vec_component = config["pretraining"]["component"] - if tok2vec_component is None: - msg.fail( - f"To use pretrained tok2vec weights, [pretraining.component] " - f"needs to specify the component that should load them.", - exits=1, - ) - layer = nlp.get_pipe(tok2vec_component).model - tok2vec_layer = config["pretraining"]["layer"] - if tok2vec_layer: - layer = layer.get_ref(tok2vec_layer) - layer.from_bytes(weights_data) - msg.info(f"Loaded pretrained weights into component '{tok2vec_component}'") - - # Create iterator, which yields out info after each optimization step. - msg.info("Start training") - score_weights = T_cfg["score_weights"] training_step_iterator = train_while_improving( nlp, optimizer, diff --git a/spacy/default_config.cfg b/spacy/default_config.cfg index 6f8c0aa00..a8f4a9497 100644 --- a/spacy/default_config.cfg +++ b/spacy/default_config.cfg @@ -48,6 +48,15 @@ max_length = 0 # Limitation on number of training examples limit = 0 +[prepare] +# The 'prepare' step is run before training or pretraining. Components and +# the tokenizer can each define their own prepare step, giving them a chance +# to gather resources like lookup-tables, build label sets, construct vocabularies, +# etc. After 'prepare' is finished, the result will be saved out to disk, which +# will then be read in at the start of training. 
You can call the prepare step +# separately with the `spacy prepare` command, or you can let the train script +# do it for you. + # Training hyper-parameters and additional features. [training] seed = ${system.seed} From b5556093e251e4cfd31efda5f828fff98ba7f438 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Sun, 27 Sep 2020 23:59:44 +0200 Subject: [PATCH 02/66] Start updating train script --- spacy/cli/train.py | 55 +++++++++++++++++++++++----------------------- 1 file changed, 27 insertions(+), 28 deletions(-) diff --git a/spacy/cli/train.py b/spacy/cli/train.py index bb1bba4d5..ab71dac26 100644 --- a/spacy/cli/train.py +++ b/spacy/cli/train.py @@ -16,6 +16,7 @@ from ._util import import_code, get_sourced_components from ..language import Language from .. import util from ..training.example import Example +from ..training.initialize import must_initialize, init_pipeline from ..errors import Errors from ..util import dot_to_object @@ -31,8 +32,6 @@ def train_cli( code_path: Optional[Path] = Opt(None, "--code", "-c", help="Path to Python file with additional code (registered functions) to be imported"), verbose: bool = Opt(False, "--verbose", "-V", "-VV", help="Display more information for debugging purposes"), use_gpu: int = Opt(-1, "--gpu-id", "-g", help="GPU ID or -1 for CPU"), - resume: bool = Opt(False, "--resume", "-R", help="Resume training"), - dave_path: Optional[Path] = Opt(None, "--dave", "-D", help="etc etc"), # fmt: on ): """ @@ -53,38 +52,37 @@ def train_cli( verify_cli_args(config_path, output_path) overrides = parse_config_overrides(ctx.args) import_code(code_path) - if prepared is None: - prepare(config_path, output_path / "prepared", config_overrides=overrides) - train( - config_path, - output_path=output_path, - dave_path=dave_path, - config_overrides=overrides, - use_gpu=use_gpu, - resume_training=resume, - ) - - -def train( - output_path: Path, - config_overrides: Dict[str, Any] = {}, - use_gpu: int = -1, - resume_training: bool = False, -) -> None: if use_gpu >= 0: msg.info(f"Using GPU: {use_gpu}") require_gpu(use_gpu) else: msg.info("Using CPU") - msg.info(f"Loading config and nlp from: {config_path}") - # TODO: The details of this will change - dave_path = output_path / "dave" - config_path = dave_path / "config.cfg" - with show_validation_error(config_path): - config = fill_config_etc_etc(config_path) - nlp = make_and_load_nlp_etc_etc(config, dave_path) - optimizer, train_corpus, dev_corpus, score_weights, T_cfg = resolve_more_things_etc_etc(config) + config = util.load_config( + config_path, overrides=config_overrides, interpolate=True + ) + if output_path is None: + nlp = init_pipeline(config) + else: + init_path = output_path / "model-initial" + if must_reinitialize(config, init_path): + nlp = init_pipeline(config) + nlp.to_disk(init_path) + else: + nlp = spacy.load(output_path / "model-initial") + msg.info("Start training") + train(nlp, config, output_path) + +def train(nlp: Language, output_path: Optional[Path]=None) -> None: + # Create iterator, which yields out info after each optimization step. 
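+    # Training settings below are read from the nlp object's own config.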
+ config = nlp.config + T_cfg = config["training"] + score_weights = T_cfg["score_weights"] + optimizer = T_cfg["optimizer"] + train_corpus = dot_to_object(config, T_cfg["train_corpus"]) + dev_corpus = dot_to_object(config, T_cfg["dev_corpus"]) + batcher = T_cfg["batcher"] + training_step_iterator = train_while_improving( nlp, optimizer, @@ -142,6 +140,7 @@ def train( msg.good(f"Saved pipeline to output directory {final_model_path}") + def add_vectors(nlp: Language, vectors: str) -> None: title = f"Config validation error for vectors {vectors}" desc = ( From 13b1605ee6fddc527f703ed86715ef4f4cb24a50 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Mon, 28 Sep 2020 01:08:49 +0200 Subject: [PATCH 03/66] Add init script --- spacy/training/initialize.py | 378 +++++++++++++++++++++++++++++++++++ 1 file changed, 378 insertions(+) create mode 100644 spacy/training/initialize.py diff --git a/spacy/training/initialize.py b/spacy/training/initialize.py new file mode 100644 index 000000000..07bbced8d --- /dev/null +++ b/spacy/training/initialize.py @@ -0,0 +1,378 @@ +from pathlib import Path +from typing import Dict +from ._util import app, init_cli, Arg, Opt +from ..vectors import Vectors +from ..errors import Errors, Warnings +from ..language import Language +from ..util import ensure_path, get_lang_class, load_model, OOV_RANK + +try: + import ftfy +except ImportError: + ftfy = None + + +def must_initialize(init_path: Path, config_path: Path, overrides: Dict) -> bool: + config = util.load_config(config_path, overrides=overrides) + if not init_path.exists(): + return True + elif not (init_path / "config.cfg").exists(): + return True + else: + init_cfg = util.load_config(init_path / "config.cfg", interpolate=True) + if config.to_str() != init_cfg.to_str(): + return True + else: + return False + + +def init_pipeline(config: Config, use_gpu: int=-1): + raw_config = config + config = raw_config.interpolate() + if config["training"]["seed"] is not None: + fix_random_seed(config["training"]["seed"]) + allocator = config["training"]["gpu_allocator"] + if use_gpu >= 0 and allocator: + set_gpu_allocator(allocator) + # Use original config here before it's resolved to functions + sourced_components = get_sourced_components(config) + with show_validation_error(config_path): + nlp = util.load_model_from_config(raw_config) + # Resolve all training-relevant sections using the filled nlp config + T = registry.resolve( + config["training"], + schema=TrainingSchema, + validate=validate, + ) + # TODO: It might not be 'corpora' + corpora = registry.resolve(config["corpora"], validate=True) + raw_text, tag_map, morph_rules, weights_data = load_from_paths(config) + util.load_vocab_data_into_model(nlp, lookups=T["lookups"]) + if T["vectors"] is not None: + add_vectors(nlp, T["vectors"]) + score_weights = T["score_weights"] + optimizer = T["optimizer"] + train_corpus = dot_to_object({"corpora": corpora}, T["train_corpus"]) + dev_corpus = dot_to_object({"corpora": corpora}, T["dev_corpus"]) + batcher = T["batcher"] + train_logger = T["logger"] + before_to_disk = create_before_to_disk_callback(T["before_to_disk"]) + # Components that shouldn't be updated during training + frozen_components = T["frozen_components"] + # Sourced components that require resume_training + resume_components = [p for p in sourced_components if p not in frozen_components] + msg.info(f"Pipeline: {nlp.pipe_names}") + if resume_components: + with nlp.select_pipes(enable=resume_components): + msg.info(f"Resuming training for: {resume_components}") + 
nlp.resume_training(sgd=optimizer) + with nlp.select_pipes(disable=[*frozen_components, *resume_components]): + nlp.begin_training(lambda: train_corpus(nlp), sgd=optimizer) + # Verify the config after calling 'begin_training' to ensure labels + # are properly initialized + verify_config(nlp) + + if tag_map: + # Replace tag map with provided mapping + nlp.vocab.morphology.load_tag_map(tag_map) + if morph_rules: + # Load morph rules + nlp.vocab.morphology.load_morph_exceptions(morph_rules) + + # Load pretrained tok2vec weights - cf. CLI command 'pretrain' + if weights_data is not None: + tok2vec_component = C["pretraining"]["component"] + if tok2vec_component is None: + msg.fail( + f"To use pretrained tok2vec weights, [pretraining.component] " + f"needs to specify the component that should load them.", + exits=1, + ) + layer = nlp.get_pipe(tok2vec_component).model + tok2vec_layer = C["pretraining"]["layer"] + if tok2vec_layer: + layer = layer.get_ref(tok2vec_layer) + layer.from_bytes(weights_data) + msg.info(f"Loaded pretrained weights into component '{tok2vec_component}'") + return nlp + + +def init_vocab( + lang: str, + output_dir: Path, + freqs_loc: Optional[Path] = None, + clusters_loc: Optional[Path] = None, + jsonl_loc: Optional[Path] = None, + vectors_loc: Optional[Path] = None, + prune_vectors: int = -1, + truncate_vectors: int = 0, + vectors_name: Optional[str] = None, + model_name: Optional[str] = None, + base_model: Optional[str] = None, + silent: bool = True, +) -> Language: + msg = Printer(no_print=silent, pretty=not silent) + if jsonl_loc is not None: + if freqs_loc is not None or clusters_loc is not None: + settings = ["-j"] + if freqs_loc: + settings.append("-f") + if clusters_loc: + settings.append("-c") + msg.warn( + "Incompatible arguments", + "The -f and -c arguments are deprecated, and not compatible " + "with the -j argument, which should specify the same " + "information. 
Either merge the frequencies and clusters data " + "into the JSONL-formatted file (recommended), or use only the " + "-f and -c files, without the other lexical attributes.", + ) + jsonl_loc = ensure_path(jsonl_loc) + lex_attrs = srsly.read_jsonl(jsonl_loc) + else: + clusters_loc = ensure_path(clusters_loc) + freqs_loc = ensure_path(freqs_loc) + if freqs_loc is not None and not freqs_loc.exists(): + msg.fail("Can't find words frequencies file", freqs_loc, exits=1) + lex_attrs = read_attrs_from_deprecated(msg, freqs_loc, clusters_loc) + + with msg.loading("Creating blank pipeline..."): + nlp = create_model(lang, lex_attrs, name=model_name, base_model=base_model) + + msg.good("Successfully created blank pipeline") + if vectors_loc is not None: + add_vectors( + msg, nlp, vectors_loc, truncate_vectors, prune_vectors, vectors_name + ) + vec_added = len(nlp.vocab.vectors) + lex_added = len(nlp.vocab) + msg.good( + "Sucessfully compiled vocab", f"{lex_added} entries, {vec_added} vectors", + ) + if not output_dir.exists(): + output_dir.mkdir() + nlp.to_disk(output_dir) + return nlp + + +def open_file(loc: Union[str, Path]) -> IO: + """Handle .gz, .tar.gz or unzipped files""" + loc = ensure_path(loc) + if tarfile.is_tarfile(str(loc)): + return tarfile.open(str(loc), "r:gz") + elif loc.parts[-1].endswith("gz"): + return (line.decode("utf8") for line in gzip.open(str(loc), "r")) + elif loc.parts[-1].endswith("zip"): + zip_file = zipfile.ZipFile(str(loc)) + names = zip_file.namelist() + file_ = zip_file.open(names[0]) + return (line.decode("utf8") for line in file_) + else: + return loc.open("r", encoding="utf8") + + +def read_attrs_from_deprecated( + msg: Printer, freqs_loc: Optional[Path], clusters_loc: Optional[Path] +) -> List[Dict[str, Any]]: + if freqs_loc is not None: + with msg.loading("Counting frequencies..."): + probs, _ = read_freqs(freqs_loc) + msg.good("Counted frequencies") + else: + probs, _ = ({}, DEFAULT_OOV_PROB) # noqa: F841 + if clusters_loc: + with msg.loading("Reading clusters..."): + clusters = read_clusters(clusters_loc) + msg.good("Read clusters") + else: + clusters = {} + lex_attrs = [] + sorted_probs = sorted(probs.items(), key=lambda item: item[1], reverse=True) + if len(sorted_probs): + for i, (word, prob) in tqdm(enumerate(sorted_probs)): + attrs = {"orth": word, "id": i, "prob": prob} + # Decode as a little-endian string, so that we can do & 15 to get + # the first 4 bits. 
See _parse_features.pyx + if word in clusters: + attrs["cluster"] = int(clusters[word][::-1], 2) + else: + attrs["cluster"] = 0 + lex_attrs.append(attrs) + return lex_attrs + + +def create_model( + lang: str, + lex_attrs: List[Dict[str, Any]], + name: Optional[str] = None, + base_model: Optional[Union[str, Path]] = None, +) -> Language: + if base_model: + nlp = load_model(base_model) + # keep the tokenizer but remove any existing pipeline components due to + # potentially conflicting vectors + for pipe in nlp.pipe_names: + nlp.remove_pipe(pipe) + else: + lang_class = get_lang_class(lang) + nlp = lang_class() + for lexeme in nlp.vocab: + lexeme.rank = OOV_RANK + for attrs in lex_attrs: + if "settings" in attrs: + continue + lexeme = nlp.vocab[attrs["orth"]] + lexeme.set_attrs(**attrs) + if len(nlp.vocab): + oov_prob = min(lex.prob for lex in nlp.vocab) - 1 + else: + oov_prob = DEFAULT_OOV_PROB + nlp.vocab.cfg.update({"oov_prob": oov_prob}) + if name: + nlp.meta["name"] = name + return nlp + + +def add_vectors( + msg: Printer, + nlp: Language, + vectors_loc: Optional[Path], + truncate_vectors: int, + prune_vectors: int, + name: Optional[str] = None, +) -> None: + vectors_loc = ensure_path(vectors_loc) + if vectors_loc and vectors_loc.parts[-1].endswith(".npz"): + nlp.vocab.vectors = Vectors(data=numpy.load(vectors_loc.open("rb"))) + for lex in nlp.vocab: + if lex.rank and lex.rank != OOV_RANK: + nlp.vocab.vectors.add(lex.orth, row=lex.rank) + else: + if vectors_loc: + with msg.loading(f"Reading vectors from {vectors_loc}"): + vectors_data, vector_keys = read_vectors( + msg, vectors_loc, truncate_vectors + ) + msg.good(f"Loaded vectors from {vectors_loc}") + else: + vectors_data, vector_keys = (None, None) + if vector_keys is not None: + for word in vector_keys: + if word not in nlp.vocab: + nlp.vocab[word] + if vectors_data is not None: + nlp.vocab.vectors = Vectors(data=vectors_data, keys=vector_keys) + if name is None: + # TODO: Is this correct? Does this matter? + nlp.vocab.vectors.name = f"{nlp.meta['lang']}_{nlp.meta['name']}.vectors" + else: + nlp.vocab.vectors.name = name + nlp.meta["vectors"]["name"] = nlp.vocab.vectors.name + if prune_vectors >= 1: + nlp.vocab.prune_vectors(prune_vectors) + + +def read_vectors(msg: Printer, vectors_loc: Path, truncate_vectors: int): + f = open_file(vectors_loc) + f = ensure_shape(f) + shape = tuple(int(size) for size in next(f).split()) + if truncate_vectors >= 1: + shape = (truncate_vectors, shape[1]) + vectors_data = numpy.zeros(shape=shape, dtype="f") + vectors_keys = [] + for i, line in enumerate(tqdm(f)): + line = line.rstrip() + pieces = line.rsplit(" ", vectors_data.shape[1]) + word = pieces.pop(0) + if len(pieces) != vectors_data.shape[1]: + msg.fail(Errors.E094.format(line_num=i, loc=vectors_loc), exits=1) + vectors_data[i] = numpy.asarray(pieces, dtype="f") + vectors_keys.append(word) + if i == truncate_vectors - 1: + break + return vectors_data, vectors_keys + + +def ensure_shape(lines): + """Ensure that the first line of the data is the vectors shape. + + If it's not, we read in the data and output the shape as the first result, + so that the reader doesn't have to deal with the problem. + """ + first_line = next(lines) + try: + shape = tuple(int(size) for size in first_line.split()) + except ValueError: + shape = None + if shape is not None: + # All good, give the data + yield first_line + yield from lines + else: + # Figure out the shape, make it the first value, and then give the + # rest of the data. 
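+        # Each data line is "<word> <v1> ... <vd>", so the width is the token count minus one.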
+ width = len(first_line.split()) - 1 + captured = [first_line] + list(lines) + length = len(captured) + yield f"{length} {width}" + yield from captured + + +def read_freqs( + freqs_loc: Path, max_length: int = 100, min_doc_freq: int = 5, min_freq: int = 50 +): + counts = PreshCounter() + total = 0 + with freqs_loc.open() as f: + for i, line in enumerate(f): + freq, doc_freq, key = line.rstrip().split("\t", 2) + freq = int(freq) + counts.inc(i + 1, freq) + total += freq + counts.smooth() + log_total = math.log(total) + probs = {} + with freqs_loc.open() as f: + for line in tqdm(f): + freq, doc_freq, key = line.rstrip().split("\t", 2) + doc_freq = int(doc_freq) + freq = int(freq) + if doc_freq >= min_doc_freq and freq >= min_freq and len(key) < max_length: + try: + word = literal_eval(key) + except SyntaxError: + # Take odd strings literally. + word = literal_eval(f"'{key}'") + smooth_count = counts.smoother(int(freq)) + probs[word] = math.log(smooth_count) - log_total + oov_prob = math.log(counts.smoother(0)) - log_total + return probs, oov_prob + + +def read_clusters(clusters_loc: Path) -> dict: + clusters = {} + if ftfy is None: + warnings.warn(Warnings.W004) + with clusters_loc.open() as f: + for line in tqdm(f): + try: + cluster, word, freq = line.split() + if ftfy is not None: + word = ftfy.fix_text(word) + except ValueError: + continue + # If the clusterer has only seen the word a few times, its + # cluster is unreliable. + if int(freq) >= 3: + clusters[word] = cluster + else: + clusters[word] = "0" + # Expand clusters with re-casing + for word, cluster in list(clusters.items()): + if word.lower() not in clusters: + clusters[word.lower()] = cluster + if word.title() not in clusters: + clusters[word.title()] = cluster + if word.upper() not in clusters: + clusters[word.upper()] = cluster + return clusters From a023cf3ecc6e0b433250b56101c40e67eb58f735 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Mon, 28 Sep 2020 03:06:12 +0200 Subject: [PATCH 04/66] Add (untested) resolve_dot_names util --- spacy/util.py | 22 ++++++++++++++++++++++ 1 file changed, 22 insertions(+) diff --git a/spacy/util.py b/spacy/util.py index 01232f5c5..fb3381f55 100644 --- a/spacy/util.py +++ b/spacy/util.py @@ -422,6 +422,28 @@ def resolve_training_config( return registry.resolve(config, validate=validate) +def resolve_dot_names(config: Config, dot_names: List[Optional[str]]) -> List[Optional[Callable]]: + """Resolve one or more "dot notation" names, e.g. corpora.train. + The paths could point anywhere into the config, so we don't know which + top-level section we'll be looking within. + + We resolve the whole top-level section, although we could resolve less -- + we could find the lowest part of the tree. + """ + resolved = {} + output = [] + for name in dot_names: + if name is None: + output.append(name) + else: + section = name.split(".")[0] + # We want to avoid resolving the same thing twice. 
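+            # e.g. "corpora.train" and "corpora.dev" both live under [corpora], which is resolved only once.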
+ if section not in resolved: + resolved[section] = registry.resolve(config[section], schema=None) + output.append(dot_to_object(resolved, name)) + return output + + def load_model_from_init_py( init_file: Union[Path, str], *, From 3a0a3b8db684eb4cc67551814e7f8f8be1675362 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Mon, 28 Sep 2020 03:06:33 +0200 Subject: [PATCH 05/66] Dont hard-code for 'corpora' name --- spacy/cli/train.py | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) diff --git a/spacy/cli/train.py b/spacy/cli/train.py index e27a499a7..e7b36a38f 100644 --- a/spacy/cli/train.py +++ b/spacy/cli/train.py @@ -77,12 +77,10 @@ def train(nlp: Language, output_path: Optional[Path]=None) -> None: # Create iterator, which yields out info after each optimization step. config = nlp.config.interpolate() T = registry.resolve(config["training"], schema=ConfigSchemaTraining) + dot_names = [T["train_corpus"], T["dev_corpus"], T["raw_text"]] + train_corpus, dev_corpus, raw_text = resolve_dot_names(config, dot_names) optimizer T["optimizer"] score_weights = T["score_weights"] - # TODO: This might not be called corpora - corpora = registry.resolve(config["corpora"], schema=ConfigSchemaCorpora) - train_corpus = dot_to_object({"corpora": corpora}, T["train_corpus"]) - dev_corpus = dot_to_object({"corpora": corpora}, T["dev_corpus"]) batcher = T["batcher"] train_logger = T["logger"] before_to_disk = create_before_to_disk_callback(T["before_to_disk"]) @@ -101,7 +99,7 @@ def train(nlp: Language, output_path: Optional[Path]=None) -> None: patience=T["patience"], max_steps=T["max_steps"], eval_frequency=T["eval_frequency"], - raw_text=None, + raw_text=raw_text, exclude=frozen_components, ) msg.info(f"Training. Initial learn rate: {optimizer.learn_rate}") From ed2aff2db346d7be9d94e73e0e2e2921cf966ccf Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Mon, 28 Sep 2020 03:12:31 +0200 Subject: [PATCH 06/66] Remove unused train code --- spacy/cli/train.py | 23 +---------------------- 1 file changed, 1 insertion(+), 22 deletions(-) diff --git a/spacy/cli/train.py b/spacy/cli/train.py index e7b36a38f..468de583b 100644 --- a/spacy/cli/train.py +++ b/spacy/cli/train.py @@ -18,7 +18,7 @@ from .. 
import util from ..training.example import Example from ..training.initialize import must_initialize, init_pipeline from ..errors import Errors -from ..util import dot_to_object +from ..util import resolve_dot_names @app.command( @@ -363,27 +363,6 @@ def update_meta( nlp.meta["performance"][f"{pipe_name}_loss"] = info["losses"][pipe_name] -def load_from_paths( - config: Config, -) -> Tuple[List[Dict[str, str]], Dict[str, dict], bytes]: - # TODO: separate checks from loading - raw_text = util.ensure_path(config["training"]["raw_text"]) - if raw_text is not None: - if not raw_text.exists(): - msg.fail("Can't find raw text", raw_text, exits=1) - raw_text = list(srsly.read_jsonl(config["training"]["raw_text"])) - tag_map = {} - morph_rules = {} - weights_data = None - init_tok2vec = util.ensure_path(config["training"]["init_tok2vec"]) - if init_tok2vec is not None: - if not init_tok2vec.exists(): - msg.fail("Can't find pretrained tok2vec", init_tok2vec, exits=1) - with init_tok2vec.open("rb") as file_: - weights_data = file_.read() - return raw_text, tag_map, morph_rules, weights_data - - def verify_cli_args(config_path: Path, output_path: Optional[Path] = None) -> None: # Make sure all files and paths exists if they are needed if not config_path or not config_path.exists(): From b886f53c31204b3c71c5a5b42435e7de85ee7fbc Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Mon, 28 Sep 2020 03:42:47 +0200 Subject: [PATCH 07/66] init-pipeline runs (maybe doesnt work) --- spacy/cli/__init__.py | 3 ++- spacy/cli/init_model.py | 13 +++++++------ spacy/cli/train.py | 3 +-- 3 files changed, 10 insertions(+), 9 deletions(-) diff --git a/spacy/cli/__init__.py b/spacy/cli/__init__.py index 92cb76971..5569e630d 100644 --- a/spacy/cli/__init__.py +++ b/spacy/cli/__init__.py @@ -15,7 +15,8 @@ from .debug_config import debug_config # noqa: F401 from .debug_model import debug_model # noqa: F401 from .evaluate import evaluate # noqa: F401 from .convert import convert # noqa: F401 -from .init_model import init_model # noqa: F401 +#from .init_model import init_model # noqa: F401 +from .init_pipeline import init_pipeline # noqa: F401 from .init_config import init_config, fill_config # noqa: F401 from .validate import validate # noqa: F401 from .project.clone import project_clone # noqa: F401 diff --git a/spacy/cli/init_model.py b/spacy/cli/init_model.py index 6decb6172..4194f1bd0 100644 --- a/spacy/cli/init_model.py +++ b/spacy/cli/init_model.py @@ -12,16 +12,17 @@ import srsly import warnings from wasabi import msg, Printer import typer +from ._util import init_cli, Arg, Opt, parse_config_overrides, show_validation_error DEFAULT_OOV_PROB = -20 -@init_cli.command("vocab") -@app.command( - "init-model", - context_settings={"allow_extra_args": True, "ignore_unknown_options": True}, - hidden=True, # hide this from main CLI help but still allow it to work with warning -) +#@init_cli.command("vocab") +#@app.command( +# "init-model", +# context_settings={"allow_extra_args": True, "ignore_unknown_options": True}, +# hidden=True, # hide this from main CLI help but still allow it to work with warning +#) def init_model_cli( # fmt: off ctx: typer.Context, # This is only used to read additional arguments diff --git a/spacy/cli/train.py b/spacy/cli/train.py index 468de583b..8a360ad44 100644 --- a/spacy/cli/train.py +++ b/spacy/cli/train.py @@ -16,7 +16,6 @@ from ._util import import_code, get_sourced_components from ..language import Language from .. 
import util from ..training.example import Example -from ..training.initialize import must_initialize, init_pipeline from ..errors import Errors from ..util import resolve_dot_names @@ -79,7 +78,7 @@ def train(nlp: Language, output_path: Optional[Path]=None) -> None: T = registry.resolve(config["training"], schema=ConfigSchemaTraining) dot_names = [T["train_corpus"], T["dev_corpus"], T["raw_text"]] train_corpus, dev_corpus, raw_text = resolve_dot_names(config, dot_names) - optimizer T["optimizer"] + optimizer = T["optimizer"] score_weights = T["score_weights"] batcher = T["batcher"] train_logger = T["logger"] From 65448b2e34ab55291a52caaa950e9c427f85902c Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Mon, 28 Sep 2020 03:42:58 +0200 Subject: [PATCH 08/66] Remove schema=None until Optional --- spacy/util.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/spacy/util.py b/spacy/util.py index fb3381f55..90ae9cf20 100644 --- a/spacy/util.py +++ b/spacy/util.py @@ -439,7 +439,7 @@ def resolve_dot_names(config: Config, dot_names: List[Optional[str]]) -> List[Op section = name.split(".")[0] # We want to avoid resolving the same thing twice. if section not in resolved: - resolved[section] = registry.resolve(config[section], schema=None) + resolved[section] = registry.resolve(config[section]) output.append(dot_to_object(resolved, name)) return output From 44bad1474c7be6b6fce31aa7a69352b7288135ce Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Mon, 28 Sep 2020 09:47:34 +0200 Subject: [PATCH 09/66] Add init_pipeline file --- spacy/cli/init_pipeline.py | 111 +++++++++++++++++++++++++++++++++++++ 1 file changed, 111 insertions(+) create mode 100644 spacy/cli/init_pipeline.py diff --git a/spacy/cli/init_pipeline.py b/spacy/cli/init_pipeline.py new file mode 100644 index 000000000..34b396a50 --- /dev/null +++ b/spacy/cli/init_pipeline.py @@ -0,0 +1,111 @@ +from typing import Optional, Dict, Any, Tuple, Union, Callable, List +import logging +import srsly +from pathlib import Path +from wasabi import msg +import typer +from thinc.api import Config, fix_random_seed + +from .train import create_before_to_disk_callback +from .. 
import util +from ..util import registry +from ..schemas import ConfigSchemaTraining +from ._util import init_cli, Arg, Opt, parse_config_overrides, show_validation_error +from ._util import import_code, get_sourced_components +from ..util import resolve_dot_names + + +@init_cli.command( + "pipeline", + context_settings={"allow_extra_args": True, "ignore_unknown_options": True}, +) +def init_pipeline_cli( + # fmt: off + ctx: typer.Context, # This is only used to read additional arguments + config_path: Path = Arg(..., help="Path to config file", exists=True), + output_path: Path = Arg(..., help="Output directory for the prepared data"), + code_path: Optional[Path] = Opt(None, "--code", "-c", help="Path to Python file with additional code (registered functions) to be imported"), + verbose: bool = Opt(False, "--verbose", "-V", "-VV", help="Display more information for debugging purposes"), + # fmt: on +): + util.logger.setLevel(logging.DEBUG if verbose else logging.ERROR) + overrides = parse_config_overrides(ctx.args) + import_code(code_path) + config = util.load_config(config_path, overrides=overrides) + with show_validation_error(config_path): + nlp = init_pipeline(config) + nlp.to_disk(output_path) + + +def must_initialize(init_path: Path, config_path: Path, overrides: Dict) -> bool: + config = util.load_config(config_path, overrides=overrides) + if not init_path.exists(): + return True + elif not (init_path / "config.cfg").exists(): + return True + else: + init_cfg = util.load_config(init_path / "config.cfg", interpolate=True) + if config.to_str() != init_cfg.to_str(): + return True + else: + return False + + +def init_pipeline(config: Config, use_gpu=-1): + raw_config = config + config = raw_config.interpolate() + if config["training"]["seed"] is not None: + fix_random_seed(config["training"]["seed"]) + allocator = config["training"]["gpu_allocator"] + if use_gpu >= 0 and allocator: + set_gpu_allocator(allocator) + # Use original config here before it's resolved to functions + sourced_components = get_sourced_components(config) + nlp = util.load_model_from_config(raw_config) + # Resolve all training-relevant sections using the filled nlp config + T = registry.resolve( + config["training"], + schema=ConfigSchemaTraining, + validate=True, + ) + dot_names = [T["train_corpus"], T["dev_corpus"], T["raw_text"]] + train_corpus, dev_corpus, raw_text = resolve_dot_names(config, dot_names) + util.load_vocab_data_into_model(nlp, lookups=T["lookups"]) + if T["vectors"] is not None: + add_vectors(nlp, T["vectors"]) + score_weights = T["score_weights"] + optimizer = T["optimizer"] + batcher = T["batcher"] + train_logger = T["logger"] + before_to_disk = create_before_to_disk_callback(T["before_to_disk"]) + # Components that shouldn't be updated during training + frozen_components = T["frozen_components"] + # Sourced components that require resume_training + resume_components = [p for p in sourced_components if p not in frozen_components] + msg.info(f"Pipeline: {nlp.pipe_names}") + if resume_components: + with nlp.select_pipes(enable=resume_components): + msg.info(f"Resuming training for: {resume_components}") + nlp.resume_training(sgd=optimizer) + with nlp.select_pipes(disable=[*frozen_components, *resume_components]): + nlp.begin_training(lambda: train_corpus(nlp), sgd=optimizer) + # Verify the config after calling 'begin_training' to ensure labels + # are properly initialized + verify_config(nlp) + + # Load pretrained tok2vec weights - cf. 
CLI command 'pretrain' + if weights_data is not None: + tok2vec_component = C["pretraining"]["component"] + if tok2vec_component is None: + msg.fail( + f"To use pretrained tok2vec weights, [pretraining.component] " + f"needs to specify the component that should load them.", + exits=1, + ) + layer = nlp.get_pipe(tok2vec_component).model + tok2vec_layer = C["pretraining"]["layer"] + if tok2vec_layer: + layer = layer.get_ref(tok2vec_layer) + layer.from_bytes(weights_data) + msg.info(f"Loaded pretrained weights into component '{tok2vec_component}'") + return nlp From 553bfea6418e76c28b8786de35df7a3df0e0b56a Mon Sep 17 00:00:00 2001 From: Ines Montani Date: Mon, 28 Sep 2020 10:53:17 +0200 Subject: [PATCH 10/66] Fix commands --- spacy/cli/_util.py | 22 ++++++++ spacy/cli/init_model.py | 68 ----------------------- spacy/cli/init_pipeline.py | 110 +++++++++++++++++++++++++++++-------- spacy/cli/train.py | 84 +++++++++------------------- 4 files changed, 134 insertions(+), 150 deletions(-) delete mode 100644 spacy/cli/init_model.py diff --git a/spacy/cli/_util.py b/spacy/cli/_util.py index 68cb572ea..6eafee4df 100644 --- a/spacy/cli/_util.py +++ b/spacy/cli/_util.py @@ -16,6 +16,7 @@ import os from ..schemas import ProjectConfigSchema, validate from ..util import import_file, run_command, make_tempdir, registry, logger +from ..util import ensure_path if TYPE_CHECKING: from pathy import Pathy # noqa: F401 @@ -458,3 +459,24 @@ def string_to_list(value: str, intify: bool = False) -> Union[List[str], List[in p = int(p) result.append(p) return result + + +def load_from_paths( + config: Config, +) -> Tuple[List[Dict[str, str]], Dict[str, dict], bytes]: + # TODO: separate checks from loading + raw_text = ensure_path(config["training"]["raw_text"]) + if raw_text is not None: + if not raw_text.exists(): + msg.fail("Can't find raw text", raw_text, exits=1) + raw_text = list(srsly.read_jsonl(config["training"]["raw_text"])) + tag_map = {} + morph_rules = {} + weights_data = None + init_tok2vec = ensure_path(config["training"]["init_tok2vec"]) + if init_tok2vec is not None: + if not init_tok2vec.exists(): + msg.fail("Can't find pretrained tok2vec", init_tok2vec, exits=1) + with init_tok2vec.open("rb") as file_: + weights_data = file_.read() + return raw_text, tag_map, morph_rules, weights_data diff --git a/spacy/cli/init_model.py b/spacy/cli/init_model.py deleted file mode 100644 index 4194f1bd0..000000000 --- a/spacy/cli/init_model.py +++ /dev/null @@ -1,68 +0,0 @@ -from typing import Optional, List, Dict, Any, Union, IO -import math -from tqdm import tqdm -import numpy -from ast import literal_eval -from pathlib import Path -from preshed.counter import PreshCounter -import tarfile -import gzip -import zipfile -import srsly -import warnings -from wasabi import msg, Printer -import typer -from ._util import init_cli, Arg, Opt, parse_config_overrides, show_validation_error - -DEFAULT_OOV_PROB = -20 - - -#@init_cli.command("vocab") -#@app.command( -# "init-model", -# context_settings={"allow_extra_args": True, "ignore_unknown_options": True}, -# hidden=True, # hide this from main CLI help but still allow it to work with warning -#) -def init_model_cli( - # fmt: off - ctx: typer.Context, # This is only used to read additional arguments - lang: str = Arg(..., help="Pipeline language"), - output_dir: Path = Arg(..., help="Pipeline output directory"), - freqs_loc: Optional[Path] = Arg(None, help="Location of words frequencies file", exists=True), - clusters_loc: Optional[Path] = Opt(None, "--clusters-loc", "-c", 
help="Optional location of brown clusters data", exists=True), - jsonl_loc: Optional[Path] = Opt(None, "--jsonl-loc", "-j", help="Location of JSONL-formatted attributes file", exists=True), - vectors_loc: Optional[Path] = Opt(None, "--vectors-loc", "-v", help="Optional vectors file in Word2Vec format", exists=True), - prune_vectors: int = Opt(-1, "--prune-vectors", "-V", help="Optional number of vectors to prune to"), - truncate_vectors: int = Opt(0, "--truncate-vectors", "-t", help="Optional number of vectors to truncate to when reading in vectors file"), - vectors_name: Optional[str] = Opt(None, "--vectors-name", "-vn", help="Optional name for the word vectors, e.g. en_core_web_lg.vectors"), - model_name: Optional[str] = Opt(None, "--meta-name", "-mn", help="Optional name of the package for the pipeline meta"), - base_model: Optional[str] = Opt(None, "--base", "-b", help="Name of or path to base pipeline to start with (mostly relevant for pipelines with custom tokenizers)") - # fmt: on -): - """ - Create a new blank pipeline directory with vocab and vectors from raw data. - If vectors are provided in Word2Vec format, they can be either a .txt or - zipped as a .zip or .tar.gz. - - DOCS: https://nightly.spacy.io/api/cli#init-vocab - """ - if ctx.command.name == "init-model": - msg.warn( - "The init-model command is now called 'init vocab'. You can run " - "'python -m spacy init --help' for an overview of the other " - "available initialization commands." - ) - init_vocab( - lang, - output_dir, - freqs_loc=freqs_loc, - clusters_loc=clusters_loc, - jsonl_loc=jsonl_loc, - vectors_loc=vectors_loc, - prune_vectors=prune_vectors, - truncate_vectors=truncate_vectors, - vectors_name=vectors_name, - model_name=model_name, - base_model=base_model, - silent=False, - ) diff --git a/spacy/cli/init_pipeline.py b/spacy/cli/init_pipeline.py index 34b396a50..ca70b51d1 100644 --- a/spacy/cli/init_pipeline.py +++ b/spacy/cli/init_pipeline.py @@ -1,18 +1,17 @@ -from typing import Optional, Dict, Any, Tuple, Union, Callable, List +from typing import Optional, Dict, Callable, Any import logging -import srsly from pathlib import Path from wasabi import msg import typer -from thinc.api import Config, fix_random_seed +from thinc.api import Config, fix_random_seed, set_gpu_allocator -from .train import create_before_to_disk_callback from .. 
import util -from ..util import registry -from ..schemas import ConfigSchemaTraining +from ..util import registry, resolve_dot_names +from ..schemas import ConfigSchemaTraining, ConfigSchemaPretrain +from ..language import Language +from ..errors import Errors from ._util import init_cli, Arg, Opt, parse_config_overrides, show_validation_error -from ._util import import_code, get_sourced_components -from ..util import resolve_dot_names +from ._util import import_code, get_sourced_components, load_from_paths @init_cli.command( @@ -31,10 +30,12 @@ def init_pipeline_cli( util.logger.setLevel(logging.DEBUG if verbose else logging.ERROR) overrides = parse_config_overrides(ctx.args) import_code(code_path) - config = util.load_config(config_path, overrides=overrides) with show_validation_error(config_path): - nlp = init_pipeline(config) + config = util.load_config(config_path, overrides=overrides) + nlp = init_pipeline(config) nlp.to_disk(output_path) + # TODO: add more instructions + msg.good(f"Saved initialized pipeline to {output_path}") def must_initialize(init_path: Path, config_path: Path, overrides: Dict) -> bool: @@ -51,7 +52,7 @@ def must_initialize(init_path: Path, config_path: Path, overrides: Dict) -> bool return False -def init_pipeline(config: Config, use_gpu=-1): +def init_pipeline(config: Config, use_gpu: int = -1) -> Language: raw_config = config config = raw_config.interpolate() if config["training"]["seed"] is not None: @@ -61,22 +62,19 @@ def init_pipeline(config: Config, use_gpu=-1): set_gpu_allocator(allocator) # Use original config here before it's resolved to functions sourced_components = get_sourced_components(config) - nlp = util.load_model_from_config(raw_config) + with show_validation_error(): + nlp = util.load_model_from_config(raw_config) + msg.good("Set up nlp object from config") # Resolve all training-relevant sections using the filled nlp config - T = registry.resolve( - config["training"], - schema=ConfigSchemaTraining, - validate=True, - ) + T = registry.resolve(config["training"], schema=ConfigSchemaTraining) dot_names = [T["train_corpus"], T["dev_corpus"], T["raw_text"]] train_corpus, dev_corpus, raw_text = resolve_dot_names(config, dot_names) util.load_vocab_data_into_model(nlp, lookups=T["lookups"]) + msg.good("Created vocabulary") if T["vectors"] is not None: add_vectors(nlp, T["vectors"]) - score_weights = T["score_weights"] + msg.good(f"Added vectors: {T['vectors']}") optimizer = T["optimizer"] - batcher = T["batcher"] - train_logger = T["logger"] before_to_disk = create_before_to_disk_callback(T["before_to_disk"]) # Components that shouldn't be updated during training frozen_components = T["frozen_components"] @@ -89,13 +87,23 @@ def init_pipeline(config: Config, use_gpu=-1): nlp.resume_training(sgd=optimizer) with nlp.select_pipes(disable=[*frozen_components, *resume_components]): nlp.begin_training(lambda: train_corpus(nlp), sgd=optimizer) + msg.good(f"Initialized pipeline components") # Verify the config after calling 'begin_training' to ensure labels # are properly initialized verify_config(nlp) + if "pretraining" in config and config["pretraining"]: + P = registry.resolve(config["pretraining"], schema=ConfigSchemaPretrain) + add_tok2vec_weights({"training": T, "pretraining": P}, nlp) + # TODO: this should be handled better? + nlp = before_to_disk(nlp) + return nlp + +def add_tok2vec_weights(config: Config, nlp: Language) -> None: # Load pretrained tok2vec weights - cf. 
CLI command 'pretrain' + weights_data = load_from_paths(config) if weights_data is not None: - tok2vec_component = C["pretraining"]["component"] + tok2vec_component = config["pretraining"]["component"] if tok2vec_component is None: msg.fail( f"To use pretrained tok2vec weights, [pretraining.component] " @@ -103,9 +111,63 @@ def init_pipeline(config: Config, use_gpu=-1): exits=1, ) layer = nlp.get_pipe(tok2vec_component).model - tok2vec_layer = C["pretraining"]["layer"] + tok2vec_layer = config["pretraining"]["layer"] if tok2vec_layer: layer = layer.get_ref(tok2vec_layer) layer.from_bytes(weights_data) - msg.info(f"Loaded pretrained weights into component '{tok2vec_component}'") - return nlp + msg.good(f"Loaded pretrained weights into component '{tok2vec_component}'") + + +def add_vectors(nlp: Language, vectors: str) -> None: + title = f"Config validation error for vectors {vectors}" + desc = ( + "This typically means that there's a problem in the config.cfg included " + "with the packaged vectors. Make sure that the vectors package you're " + "loading is compatible with the current version of spaCy." + ) + with show_validation_error( + title=title, desc=desc, hint_fill=False, show_config=False + ): + util.load_vectors_into_model(nlp, vectors) + + +def verify_config(nlp: Language) -> None: + """Perform additional checks based on the config, loaded nlp object and training data.""" + # TODO: maybe we should validate based on the actual components, the list + # in config["nlp"]["pipeline"] instead? + for pipe_config in nlp.config["components"].values(): + # We can't assume that the component name == the factory + factory = pipe_config["factory"] + if factory == "textcat": + verify_textcat_config(nlp, pipe_config) + + +def verify_textcat_config(nlp: Language, pipe_config: Dict[str, Any]) -> None: + # if 'positive_label' is provided: double check whether it's in the data and + # the task is binary + if pipe_config.get("positive_label"): + textcat_labels = nlp.get_pipe("textcat").labels + pos_label = pipe_config.get("positive_label") + if pos_label not in textcat_labels: + raise ValueError( + Errors.E920.format(pos_label=pos_label, labels=textcat_labels) + ) + if len(list(textcat_labels)) != 2: + raise ValueError( + Errors.E919.format(pos_label=pos_label, labels=textcat_labels) + ) + + +def create_before_to_disk_callback( + callback: Optional[Callable[[Language], Language]] +) -> Callable[[Language], Language]: + def before_to_disk(nlp: Language) -> Language: + if not callback: + return nlp + modified_nlp = callback(nlp) + if not isinstance(modified_nlp, Language): + err = Errors.E914.format(name="before_to_disk", value=type(modified_nlp)) + raise ValueError(err) + return modified_nlp + + return before_to_disk diff --git a/spacy/cli/train.py b/spacy/cli/train.py index 8a360ad44..3476d5966 100644 --- a/spacy/cli/train.py +++ b/spacy/cli/train.py @@ -1,6 +1,5 @@ from typing import Optional, Dict, Any, Tuple, Union, Callable, List from timeit import default_timer as timer -import srsly import tqdm from pathlib import Path from wasabi import msg @@ -11,13 +10,17 @@ import random import typer import logging +from .init_pipeline import init_pipeline, must_initialize +from .init_pipeline import create_before_to_disk_callback from ._util import app, Arg, Opt, parse_config_overrides, show_validation_error -from ._util import import_code, get_sourced_components +from ._util import import_code +from ._util import load_from_paths # noqa: F401 (needed for Ray extension for now) from ..language import Language 
from .. import util from ..training.example import Example from ..errors import Errors -from ..util import resolve_dot_names +from ..util import resolve_dot_names, registry +from ..schemas import ConfigSchemaTraining @app.command( @@ -56,25 +59,35 @@ def train_cli( require_gpu(use_gpu) else: msg.info("Using CPU") - config = util.load_config( - config_path, overrides=config_overrides, interpolate=False - ) + config = util.load_config(config_path, overrides=overrides, interpolate=False) + msg.divider("Initializing pipeline") + # TODO: add warnings / --initialize (?) argument if output_path is None: nlp = init_pipeline(config) else: - init_path = output_path / "model-initial" - if must_reinitialize(config, init_path): + init_path = output_path / "model-initial" + if must_initialize(config, init_path): nlp = init_pipeline(config) nlp.to_disk(init_path) + msg.good(f"Saved initialized pipeline to {init_path}") else: - nlp = spacy.load(output_path / "model-initial") - msg.info("Start training") - train(nlp, config, output_path) + nlp = util.load_model(init_path) + msg.good(f"Loaded initialized pipeline from {init_path}") + msg.divider("Training pipeline") + train(nlp, output_path, use_gpu=use_gpu) -def train(nlp: Language, output_path: Optional[Path]=None) -> None: +def train( + nlp: Language, output_path: Optional[Path] = None, *, use_gpu: int = -1 +) -> None: + # TODO: random seed, GPU allocator # Create iterator, which yields out info after each optimization step. config = nlp.config.interpolate() + if config["training"]["seed"] is not None: + fix_random_seed(config["training"]["seed"]) + allocator = config["training"]["gpu_allocator"] + if use_gpu >= 0 and allocator: + set_gpu_allocator(allocator) T = registry.resolve(config["training"], schema=ConfigSchemaTraining) dot_names = [T["train_corpus"], T["dev_corpus"], T["raw_text"]] train_corpus, dev_corpus, raw_text = resolve_dot_names(config, dot_names) @@ -85,9 +98,7 @@ def train(nlp: Language, output_path: Optional[Path]=None) -> None: before_to_disk = create_before_to_disk_callback(T["before_to_disk"]) # Components that shouldn't be updated during training frozen_components = T["frozen_components"] - # Create iterator, which yields out info after each optimization step. - msg.info("Start training") training_step_iterator = train_while_improving( nlp, optimizer, @@ -101,7 +112,7 @@ def train(nlp: Language, output_path: Optional[Path]=None) -> None: raw_text=raw_text, exclude=frozen_components, ) - msg.info(f"Training. 
Initial learn rate: {optimizer.learn_rate}") + msg.info(f"Initial learn rate: {optimizer.learn_rate}") with nlp.select_pipes(disable=frozen_components): print_row, finalize_logger = train_logger(nlp) @@ -145,7 +156,6 @@ def train(nlp: Language, output_path: Optional[Path]=None) -> None: msg.good(f"Saved pipeline to output directory {final_model_path}") - def add_vectors(nlp: Language, vectors: str) -> None: title = f"Config validation error for vectors {vectors}" desc = ( @@ -199,21 +209,6 @@ def create_evaluation_callback( return evaluate -def create_before_to_disk_callback( - callback: Optional[Callable[[Language], Language]] -) -> Callable[[Language], Language]: - def before_to_disk(nlp: Language) -> Language: - if not callback: - return nlp - modified_nlp = callback(nlp) - if not isinstance(modified_nlp, Language): - err = Errors.E914.format(name="before_to_disk", value=type(modified_nlp)) - raise ValueError(err) - return modified_nlp - - return before_to_disk - - def train_while_improving( nlp: Language, optimizer: Optimizer, @@ -370,30 +365,3 @@ def verify_cli_args(config_path: Path, output_path: Optional[Path] = None) -> No if not output_path.exists(): output_path.mkdir() msg.good(f"Created output directory: {output_path}") - - -def verify_config(nlp: Language) -> None: - """Perform additional checks based on the config, loaded nlp object and training data.""" - # TODO: maybe we should validate based on the actual components, the list - # in config["nlp"]["pipeline"] instead? - for pipe_config in nlp.config["components"].values(): - # We can't assume that the component name == the factory - factory = pipe_config["factory"] - if factory == "textcat": - verify_textcat_config(nlp, pipe_config) - - -def verify_textcat_config(nlp: Language, pipe_config: Dict[str, Any]) -> None: - # if 'positive_label' is provided: double check whether it's in the data and - # the task is binary - if pipe_config.get("positive_label"): - textcat_labels = nlp.get_pipe("textcat").labels - pos_label = pipe_config.get("positive_label") - if pos_label not in textcat_labels: - raise ValueError( - Errors.E920.format(pos_label=pos_label, labels=textcat_labels) - ) - if len(list(textcat_labels)) != 2: - raise ValueError( - Errors.E919.format(pos_label=pos_label, labels=textcat_labels) - ) From 2fdb7285a02be4148610aaadd77861a2170dcbd5 Mon Sep 17 00:00:00 2001 From: Ines Montani Date: Mon, 28 Sep 2020 11:06:07 +0200 Subject: [PATCH 11/66] Update CLI --- spacy/cli/train.py | 44 ++++++++++++++++++++++++++++++-------------- 1 file changed, 30 insertions(+), 14 deletions(-) diff --git a/spacy/cli/train.py b/spacy/cli/train.py index 3476d5966..7a83646ef 100644 --- a/spacy/cli/train.py +++ b/spacy/cli/train.py @@ -32,8 +32,9 @@ def train_cli( config_path: Path = Arg(..., help="Path to config file", exists=True), output_path: Optional[Path] = Opt(None, "--output", "--output-path", "-o", help="Output directory to store trained pipeline in"), code_path: Optional[Path] = Opt(None, "--code", "-c", help="Path to Python file with additional code (registered functions) to be imported"), + init_path: Optional[Path] = Opt(None, "--init", "-i", help="Path to already initialized pipeline directory, e.g. 
created with 'spacy init pipeline' (will speed up training)"), verbose: bool = Opt(False, "--verbose", "-V", "-VV", help="Display more information for debugging purposes"), - use_gpu: int = Opt(-1, "--gpu-id", "-g", help="GPU ID or -1 for CPU"), + use_gpu: int = Opt(-1, "--gpu-id", "-g", help="GPU ID or -1 for CPU") # fmt: on ): """ @@ -61,26 +62,38 @@ def train_cli( msg.info("Using CPU") config = util.load_config(config_path, overrides=overrides, interpolate=False) msg.divider("Initializing pipeline") - # TODO: add warnings / --initialize (?) argument - if output_path is None: - nlp = init_pipeline(config) - else: - init_path = output_path / "model-initial" - if must_initialize(config, init_path): - nlp = init_pipeline(config) - nlp.to_disk(init_path) - msg.good(f"Saved initialized pipeline to {init_path}") - else: - nlp = util.load_model(init_path) - msg.good(f"Loaded initialized pipeline from {init_path}") + nlp = init_nlp(config, output_path, init_path) msg.divider("Training pipeline") train(nlp, output_path, use_gpu=use_gpu) +def init_nlp( + config: Config, output_path: Optional[Path], init_path: Optional[Path] +) -> None: + + if init_path is not None: + nlp = util.load_model(init_path) + # TODO: how to handle provided pipeline that needs to be reinitialized? + msg.good(f"Loaded initialized pipeline from {init_path}") + return nlp + if output_path is not None: + output_init_path = output_path / "model-initial" + if must_initialize(config, output_init_path): + msg.warn("TODO:") + nlp = init_pipeline(config) + nlp.to_disk(init_path) + msg.good(f"Saved initialized pipeline to {output_init_path}") + else: + nlp = util.load_model(output_init_path) + msg.good(f"Loaded initialized pipeline from {output_init_path}") + return nlp + msg.warn("TODO:") + return init_pipeline(config) + + def train( nlp: Language, output_path: Optional[Path] = None, *, use_gpu: int = -1 ) -> None: - # TODO: random seed, GPU allocator # Create iterator, which yields out info after each optimization step. 
config = nlp.config.interpolate() if config["training"]["seed"] is not None: @@ -112,6 +125,9 @@ def train( raw_text=raw_text, exclude=frozen_components, ) + msg.info(f"Pipeline: {nlp.pipe_names}") + if frozen_components: + msg.info(f"Frozen components: {frozen_components}") msg.info(f"Initial learn rate: {optimizer.learn_rate}") with nlp.select_pipes(disable=frozen_components): print_row, finalize_logger = train_logger(nlp) From 8b74fd19df8f7af566f6e657376e9f13bc189f36 Mon Sep 17 00:00:00 2001 From: Ines Montani Date: Mon, 28 Sep 2020 11:13:38 +0200 Subject: [PATCH 12/66] init pipeline -> init nlp --- spacy/cli/init_pipeline.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/spacy/cli/init_pipeline.py b/spacy/cli/init_pipeline.py index ca70b51d1..2dc7a741e 100644 --- a/spacy/cli/init_pipeline.py +++ b/spacy/cli/init_pipeline.py @@ -15,8 +15,7 @@ from ._util import import_code, get_sourced_components, load_from_paths @init_cli.command( - "pipeline", - context_settings={"allow_extra_args": True, "ignore_unknown_options": True}, + "nlp", context_settings={"allow_extra_args": True, "ignore_unknown_options": True}, ) def init_pipeline_cli( # fmt: off From d5155376fd7d913734507f0647ddd4d33c625bbe Mon Sep 17 00:00:00 2001 From: Ines Montani Date: Mon, 28 Sep 2020 11:30:18 +0200 Subject: [PATCH 13/66] Update vocab init --- spacy/cli/init_pipeline.py | 38 +++- spacy/training/initialize.py | 378 ----------------------------------- spacy/util.py | 16 +- 3 files changed, 41 insertions(+), 391 deletions(-) diff --git a/spacy/cli/init_pipeline.py b/spacy/cli/init_pipeline.py index 2dc7a741e..8ff47d4a8 100644 --- a/spacy/cli/init_pipeline.py +++ b/spacy/cli/init_pipeline.py @@ -4,16 +4,21 @@ from pathlib import Path from wasabi import msg import typer from thinc.api import Config, fix_random_seed, set_gpu_allocator +import srsly from .. 
import util -from ..util import registry, resolve_dot_names +from ..util import registry, resolve_dot_names, OOV_RANK from ..schemas import ConfigSchemaTraining, ConfigSchemaPretrain from ..language import Language +from ..lookups import Lookups from ..errors import Errors from ._util import init_cli, Arg, Opt, parse_config_overrides, show_validation_error from ._util import import_code, get_sourced_components, load_from_paths +DEFAULT_OOV_PROB = -20 + + @init_cli.command( "nlp", context_settings={"allow_extra_args": True, "ignore_unknown_options": True}, ) @@ -68,7 +73,8 @@ def init_pipeline(config: Config, use_gpu: int = -1) -> Language: T = registry.resolve(config["training"], schema=ConfigSchemaTraining) dot_names = [T["train_corpus"], T["dev_corpus"], T["raw_text"]] train_corpus, dev_corpus, raw_text = resolve_dot_names(config, dot_names) - util.load_vocab_data_into_model(nlp, lookups=T["lookups"]) + # TODO: move lookups to [initialize], add vocab data + init_vocab(nlp, lookups=T["lookups"]) msg.good("Created vocabulary") if T["vectors"] is not None: add_vectors(nlp, T["vectors"]) @@ -98,6 +104,33 @@ def init_pipeline(config: Config, use_gpu: int = -1) -> Language: return nlp +def init_vocab( + nlp: Language, + *, + vocab_data: Optional[Path] = None, + lookups: Optional[Lookups] = None, +) -> Language: + if lookups: + nlp.vocab.lookups = lookups + msg.good(f"Added vocab lookups: {', '.join(lookups.tables)}") + data_path = util.ensure_path(vocab_data) + if data_path is not None: + lex_attrs = srsly.read_jsonl(data_path) + for lexeme in nlp.vocab: + lexeme.rank = OOV_RANK + for attrs in lex_attrs: + if "settings" in attrs: + continue + lexeme = nlp.vocab[attrs["orth"]] + lexeme.set_attrs(**attrs) + if len(nlp.vocab): + oov_prob = min(lex.prob for lex in nlp.vocab) - 1 + else: + oov_prob = DEFAULT_OOV_PROB + nlp.vocab.cfg.update({"oov_prob": oov_prob}) + msg.good(f"Added {len(nlp.vocab)} lexical entries to the vocab") + + def add_tok2vec_weights(config: Config, nlp: Language) -> None: # Load pretrained tok2vec weights - cf. 
CLI command 'pretrain' weights_data = load_from_paths(config) @@ -128,6 +161,7 @@ def add_vectors(nlp: Language, vectors: str) -> None: title=title, desc=desc, hint_fill=False, show_config=False ): util.load_vectors_into_model(nlp, vectors) + msg(f"Added {len(nlp.vocab.vectors)} vectors from {vectors}") def verify_config(nlp: Language) -> None: diff --git a/spacy/training/initialize.py b/spacy/training/initialize.py index 07bbced8d..e69de29bb 100644 --- a/spacy/training/initialize.py +++ b/spacy/training/initialize.py @@ -1,378 +0,0 @@ -from pathlib import Path -from typing import Dict -from ._util import app, init_cli, Arg, Opt -from ..vectors import Vectors -from ..errors import Errors, Warnings -from ..language import Language -from ..util import ensure_path, get_lang_class, load_model, OOV_RANK - -try: - import ftfy -except ImportError: - ftfy = None - - -def must_initialize(init_path: Path, config_path: Path, overrides: Dict) -> bool: - config = util.load_config(config_path, overrides=overrides) - if not init_path.exists(): - return True - elif not (init_path / "config.cfg").exists(): - return True - else: - init_cfg = util.load_config(init_path / "config.cfg", interpolate=True) - if config.to_str() != init_cfg.to_str(): - return True - else: - return False - - -def init_pipeline(config: Config, use_gpu: int=-1): - raw_config = config - config = raw_config.interpolate() - if config["training"]["seed"] is not None: - fix_random_seed(config["training"]["seed"]) - allocator = config["training"]["gpu_allocator"] - if use_gpu >= 0 and allocator: - set_gpu_allocator(allocator) - # Use original config here before it's resolved to functions - sourced_components = get_sourced_components(config) - with show_validation_error(config_path): - nlp = util.load_model_from_config(raw_config) - # Resolve all training-relevant sections using the filled nlp config - T = registry.resolve( - config["training"], - schema=TrainingSchema, - validate=validate, - ) - # TODO: It might not be 'corpora' - corpora = registry.resolve(config["corpora"], validate=True) - raw_text, tag_map, morph_rules, weights_data = load_from_paths(config) - util.load_vocab_data_into_model(nlp, lookups=T["lookups"]) - if T["vectors"] is not None: - add_vectors(nlp, T["vectors"]) - score_weights = T["score_weights"] - optimizer = T["optimizer"] - train_corpus = dot_to_object({"corpora": corpora}, T["train_corpus"]) - dev_corpus = dot_to_object({"corpora": corpora}, T["dev_corpus"]) - batcher = T["batcher"] - train_logger = T["logger"] - before_to_disk = create_before_to_disk_callback(T["before_to_disk"]) - # Components that shouldn't be updated during training - frozen_components = T["frozen_components"] - # Sourced components that require resume_training - resume_components = [p for p in sourced_components if p not in frozen_components] - msg.info(f"Pipeline: {nlp.pipe_names}") - if resume_components: - with nlp.select_pipes(enable=resume_components): - msg.info(f"Resuming training for: {resume_components}") - nlp.resume_training(sgd=optimizer) - with nlp.select_pipes(disable=[*frozen_components, *resume_components]): - nlp.begin_training(lambda: train_corpus(nlp), sgd=optimizer) - # Verify the config after calling 'begin_training' to ensure labels - # are properly initialized - verify_config(nlp) - - if tag_map: - # Replace tag map with provided mapping - nlp.vocab.morphology.load_tag_map(tag_map) - if morph_rules: - # Load morph rules - nlp.vocab.morphology.load_morph_exceptions(morph_rules) - - # Load pretrained tok2vec weights - 
cf. CLI command 'pretrain' - if weights_data is not None: - tok2vec_component = C["pretraining"]["component"] - if tok2vec_component is None: - msg.fail( - f"To use pretrained tok2vec weights, [pretraining.component] " - f"needs to specify the component that should load them.", - exits=1, - ) - layer = nlp.get_pipe(tok2vec_component).model - tok2vec_layer = C["pretraining"]["layer"] - if tok2vec_layer: - layer = layer.get_ref(tok2vec_layer) - layer.from_bytes(weights_data) - msg.info(f"Loaded pretrained weights into component '{tok2vec_component}'") - return nlp - - -def init_vocab( - lang: str, - output_dir: Path, - freqs_loc: Optional[Path] = None, - clusters_loc: Optional[Path] = None, - jsonl_loc: Optional[Path] = None, - vectors_loc: Optional[Path] = None, - prune_vectors: int = -1, - truncate_vectors: int = 0, - vectors_name: Optional[str] = None, - model_name: Optional[str] = None, - base_model: Optional[str] = None, - silent: bool = True, -) -> Language: - msg = Printer(no_print=silent, pretty=not silent) - if jsonl_loc is not None: - if freqs_loc is not None or clusters_loc is not None: - settings = ["-j"] - if freqs_loc: - settings.append("-f") - if clusters_loc: - settings.append("-c") - msg.warn( - "Incompatible arguments", - "The -f and -c arguments are deprecated, and not compatible " - "with the -j argument, which should specify the same " - "information. Either merge the frequencies and clusters data " - "into the JSONL-formatted file (recommended), or use only the " - "-f and -c files, without the other lexical attributes.", - ) - jsonl_loc = ensure_path(jsonl_loc) - lex_attrs = srsly.read_jsonl(jsonl_loc) - else: - clusters_loc = ensure_path(clusters_loc) - freqs_loc = ensure_path(freqs_loc) - if freqs_loc is not None and not freqs_loc.exists(): - msg.fail("Can't find words frequencies file", freqs_loc, exits=1) - lex_attrs = read_attrs_from_deprecated(msg, freqs_loc, clusters_loc) - - with msg.loading("Creating blank pipeline..."): - nlp = create_model(lang, lex_attrs, name=model_name, base_model=base_model) - - msg.good("Successfully created blank pipeline") - if vectors_loc is not None: - add_vectors( - msg, nlp, vectors_loc, truncate_vectors, prune_vectors, vectors_name - ) - vec_added = len(nlp.vocab.vectors) - lex_added = len(nlp.vocab) - msg.good( - "Sucessfully compiled vocab", f"{lex_added} entries, {vec_added} vectors", - ) - if not output_dir.exists(): - output_dir.mkdir() - nlp.to_disk(output_dir) - return nlp - - -def open_file(loc: Union[str, Path]) -> IO: - """Handle .gz, .tar.gz or unzipped files""" - loc = ensure_path(loc) - if tarfile.is_tarfile(str(loc)): - return tarfile.open(str(loc), "r:gz") - elif loc.parts[-1].endswith("gz"): - return (line.decode("utf8") for line in gzip.open(str(loc), "r")) - elif loc.parts[-1].endswith("zip"): - zip_file = zipfile.ZipFile(str(loc)) - names = zip_file.namelist() - file_ = zip_file.open(names[0]) - return (line.decode("utf8") for line in file_) - else: - return loc.open("r", encoding="utf8") - - -def read_attrs_from_deprecated( - msg: Printer, freqs_loc: Optional[Path], clusters_loc: Optional[Path] -) -> List[Dict[str, Any]]: - if freqs_loc is not None: - with msg.loading("Counting frequencies..."): - probs, _ = read_freqs(freqs_loc) - msg.good("Counted frequencies") - else: - probs, _ = ({}, DEFAULT_OOV_PROB) # noqa: F841 - if clusters_loc: - with msg.loading("Reading clusters..."): - clusters = read_clusters(clusters_loc) - msg.good("Read clusters") - else: - clusters = {} - lex_attrs = [] - sorted_probs = 
sorted(probs.items(), key=lambda item: item[1], reverse=True) - if len(sorted_probs): - for i, (word, prob) in tqdm(enumerate(sorted_probs)): - attrs = {"orth": word, "id": i, "prob": prob} - # Decode as a little-endian string, so that we can do & 15 to get - # the first 4 bits. See _parse_features.pyx - if word in clusters: - attrs["cluster"] = int(clusters[word][::-1], 2) - else: - attrs["cluster"] = 0 - lex_attrs.append(attrs) - return lex_attrs - - -def create_model( - lang: str, - lex_attrs: List[Dict[str, Any]], - name: Optional[str] = None, - base_model: Optional[Union[str, Path]] = None, -) -> Language: - if base_model: - nlp = load_model(base_model) - # keep the tokenizer but remove any existing pipeline components due to - # potentially conflicting vectors - for pipe in nlp.pipe_names: - nlp.remove_pipe(pipe) - else: - lang_class = get_lang_class(lang) - nlp = lang_class() - for lexeme in nlp.vocab: - lexeme.rank = OOV_RANK - for attrs in lex_attrs: - if "settings" in attrs: - continue - lexeme = nlp.vocab[attrs["orth"]] - lexeme.set_attrs(**attrs) - if len(nlp.vocab): - oov_prob = min(lex.prob for lex in nlp.vocab) - 1 - else: - oov_prob = DEFAULT_OOV_PROB - nlp.vocab.cfg.update({"oov_prob": oov_prob}) - if name: - nlp.meta["name"] = name - return nlp - - -def add_vectors( - msg: Printer, - nlp: Language, - vectors_loc: Optional[Path], - truncate_vectors: int, - prune_vectors: int, - name: Optional[str] = None, -) -> None: - vectors_loc = ensure_path(vectors_loc) - if vectors_loc and vectors_loc.parts[-1].endswith(".npz"): - nlp.vocab.vectors = Vectors(data=numpy.load(vectors_loc.open("rb"))) - for lex in nlp.vocab: - if lex.rank and lex.rank != OOV_RANK: - nlp.vocab.vectors.add(lex.orth, row=lex.rank) - else: - if vectors_loc: - with msg.loading(f"Reading vectors from {vectors_loc}"): - vectors_data, vector_keys = read_vectors( - msg, vectors_loc, truncate_vectors - ) - msg.good(f"Loaded vectors from {vectors_loc}") - else: - vectors_data, vector_keys = (None, None) - if vector_keys is not None: - for word in vector_keys: - if word not in nlp.vocab: - nlp.vocab[word] - if vectors_data is not None: - nlp.vocab.vectors = Vectors(data=vectors_data, keys=vector_keys) - if name is None: - # TODO: Is this correct? Does this matter? - nlp.vocab.vectors.name = f"{nlp.meta['lang']}_{nlp.meta['name']}.vectors" - else: - nlp.vocab.vectors.name = name - nlp.meta["vectors"]["name"] = nlp.vocab.vectors.name - if prune_vectors >= 1: - nlp.vocab.prune_vectors(prune_vectors) - - -def read_vectors(msg: Printer, vectors_loc: Path, truncate_vectors: int): - f = open_file(vectors_loc) - f = ensure_shape(f) - shape = tuple(int(size) for size in next(f).split()) - if truncate_vectors >= 1: - shape = (truncate_vectors, shape[1]) - vectors_data = numpy.zeros(shape=shape, dtype="f") - vectors_keys = [] - for i, line in enumerate(tqdm(f)): - line = line.rstrip() - pieces = line.rsplit(" ", vectors_data.shape[1]) - word = pieces.pop(0) - if len(pieces) != vectors_data.shape[1]: - msg.fail(Errors.E094.format(line_num=i, loc=vectors_loc), exits=1) - vectors_data[i] = numpy.asarray(pieces, dtype="f") - vectors_keys.append(word) - if i == truncate_vectors - 1: - break - return vectors_data, vectors_keys - - -def ensure_shape(lines): - """Ensure that the first line of the data is the vectors shape. - - If it's not, we read in the data and output the shape as the first result, - so that the reader doesn't have to deal with the problem. 
- """ - first_line = next(lines) - try: - shape = tuple(int(size) for size in first_line.split()) - except ValueError: - shape = None - if shape is not None: - # All good, give the data - yield first_line - yield from lines - else: - # Figure out the shape, make it the first value, and then give the - # rest of the data. - width = len(first_line.split()) - 1 - captured = [first_line] + list(lines) - length = len(captured) - yield f"{length} {width}" - yield from captured - - -def read_freqs( - freqs_loc: Path, max_length: int = 100, min_doc_freq: int = 5, min_freq: int = 50 -): - counts = PreshCounter() - total = 0 - with freqs_loc.open() as f: - for i, line in enumerate(f): - freq, doc_freq, key = line.rstrip().split("\t", 2) - freq = int(freq) - counts.inc(i + 1, freq) - total += freq - counts.smooth() - log_total = math.log(total) - probs = {} - with freqs_loc.open() as f: - for line in tqdm(f): - freq, doc_freq, key = line.rstrip().split("\t", 2) - doc_freq = int(doc_freq) - freq = int(freq) - if doc_freq >= min_doc_freq and freq >= min_freq and len(key) < max_length: - try: - word = literal_eval(key) - except SyntaxError: - # Take odd strings literally. - word = literal_eval(f"'{key}'") - smooth_count = counts.smoother(int(freq)) - probs[word] = math.log(smooth_count) - log_total - oov_prob = math.log(counts.smoother(0)) - log_total - return probs, oov_prob - - -def read_clusters(clusters_loc: Path) -> dict: - clusters = {} - if ftfy is None: - warnings.warn(Warnings.W004) - with clusters_loc.open() as f: - for line in tqdm(f): - try: - cluster, word, freq = line.split() - if ftfy is not None: - word = ftfy.fix_text(word) - except ValueError: - continue - # If the clusterer has only seen the word a few times, its - # cluster is unreliable. - if int(freq) >= 3: - clusters[word] = cluster - else: - clusters[word] = "0" - # Expand clusters with re-casing - for word, cluster in list(clusters.items()): - if word.lower() not in clusters: - clusters[word.lower()] = cluster - if word.title() not in clusters: - clusters[word.title()] = cluster - if word.upper() not in clusters: - clusters[word.upper()] = cluster - return clusters diff --git a/spacy/util.py b/spacy/util.py index 90ae9cf20..de1fd7f81 100644 --- a/spacy/util.py +++ b/spacy/util.py @@ -253,14 +253,6 @@ def load_vectors_into_model( nlp.vocab.strings.add(vectors_nlp.vocab.strings[key]) -def load_vocab_data_into_model( - nlp: "Language", *, lookups: Optional["Lookups"] = None -) -> None: - """Load vocab data.""" - if lookups: - nlp.vocab.lookups = lookups - - def load_model( name: Union[str, Path], *, @@ -422,11 +414,13 @@ def resolve_training_config( return registry.resolve(config, validate=validate) -def resolve_dot_names(config: Config, dot_names: List[Optional[str]]) -> List[Optional[Callable]]: - """Resolve one or more "dot notation" names, e.g. corpora.train. +def resolve_dot_names( + config: Config, dot_names: List[Optional[str]] +) -> List[Optional[Callable]]: + """Resolve one or more "dot notation" names, e.g. corpora.train. The paths could point anywhere into the config, so we don't know which top-level section we'll be looking within. - + We resolve the whole top-level section, although we could resolve less -- we could find the lowest part of the tree. 
""" From e44a7519cdac903a64b0dec5e98b8b828952d4b9 Mon Sep 17 00:00:00 2001 From: Ines Montani Date: Mon, 28 Sep 2020 11:56:14 +0200 Subject: [PATCH 14/66] Update CLI and add [initialize] block --- setup.cfg | 2 +- spacy/cli/_util.py | 21 ---------------- spacy/cli/init_pipeline.py | 49 +++++++++++++++++++++++++------------- spacy/cli/train.py | 24 ++++++++++++++++++- spacy/default_config.cfg | 12 ++++++++++ spacy/schemas.py | 35 +++++++++++++++++++-------- spacy/util.py | 2 +- 7 files changed, 94 insertions(+), 51 deletions(-) diff --git a/setup.cfg b/setup.cfg index b55c0d376..9ce361bc1 100644 --- a/setup.cfg +++ b/setup.cfg @@ -98,7 +98,7 @@ universal = false formats = gztar [flake8] -ignore = E203, E266, E501, E731, W503 +ignore = E203, E266, E501, E731, W503, E741 max-line-length = 80 select = B,C,E,F,W,T4,B9 exclude = diff --git a/spacy/cli/_util.py b/spacy/cli/_util.py index 6eafee4df..7ff2c6199 100644 --- a/spacy/cli/_util.py +++ b/spacy/cli/_util.py @@ -459,24 +459,3 @@ def string_to_list(value: str, intify: bool = False) -> Union[List[str], List[in p = int(p) result.append(p) return result - - -def load_from_paths( - config: Config, -) -> Tuple[List[Dict[str, str]], Dict[str, dict], bytes]: - # TODO: separate checks from loading - raw_text = ensure_path(config["training"]["raw_text"]) - if raw_text is not None: - if not raw_text.exists(): - msg.fail("Can't find raw text", raw_text, exits=1) - raw_text = list(srsly.read_jsonl(config["training"]["raw_text"])) - tag_map = {} - morph_rules = {} - weights_data = None - init_tok2vec = ensure_path(config["training"]["init_tok2vec"]) - if init_tok2vec is not None: - if not init_tok2vec.exists(): - msg.fail("Can't find pretrained tok2vec", init_tok2vec, exits=1) - with init_tok2vec.open("rb") as file_: - weights_data = file_.read() - return raw_text, tag_map, morph_rules, weights_data diff --git a/spacy/cli/init_pipeline.py b/spacy/cli/init_pipeline.py index 8ff47d4a8..5ca565d88 100644 --- a/spacy/cli/init_pipeline.py +++ b/spacy/cli/init_pipeline.py @@ -8,12 +8,12 @@ import srsly from .. 
import util from ..util import registry, resolve_dot_names, OOV_RANK -from ..schemas import ConfigSchemaTraining, ConfigSchemaPretrain +from ..schemas import ConfigSchemaTraining, ConfigSchemaPretrain, ConfigSchemaInit from ..language import Language from ..lookups import Lookups from ..errors import Errors from ._util import init_cli, Arg, Opt, parse_config_overrides, show_validation_error -from ._util import import_code, get_sourced_components, load_from_paths +from ._util import import_code, get_sourced_components DEFAULT_OOV_PROB = -20 @@ -67,14 +67,15 @@ def init_pipeline(config: Config, use_gpu: int = -1) -> Language: # Use original config here before it's resolved to functions sourced_components = get_sourced_components(config) with show_validation_error(): - nlp = util.load_model_from_config(raw_config) + nlp = util.load_model_from_config(raw_config, auto_fill=True) msg.good("Set up nlp object from config") + config = nlp.config.interpolate() # Resolve all training-relevant sections using the filled nlp config T = registry.resolve(config["training"], schema=ConfigSchemaTraining) dot_names = [T["train_corpus"], T["dev_corpus"], T["raw_text"]] train_corpus, dev_corpus, raw_text = resolve_dot_names(config, dot_names) - # TODO: move lookups to [initialize], add vocab data - init_vocab(nlp, lookups=T["lookups"]) + I = registry.resolve(config["initialize"], schema=ConfigSchemaInit) + init_vocab(nlp, data=I["vocab"]["data"], lookups=I["vocab"]["lookups"]) msg.good("Created vocabulary") if T["vectors"] is not None: add_vectors(nlp, T["vectors"]) @@ -98,22 +99,19 @@ def init_pipeline(config: Config, use_gpu: int = -1) -> Language: verify_config(nlp) if "pretraining" in config and config["pretraining"]: P = registry.resolve(config["pretraining"], schema=ConfigSchemaPretrain) - add_tok2vec_weights({"training": T, "pretraining": P}, nlp) + add_tok2vec_weights(nlp, P, I) # TODO: this should be handled better? nlp = before_to_disk(nlp) return nlp def init_vocab( - nlp: Language, - *, - vocab_data: Optional[Path] = None, - lookups: Optional[Lookups] = None, + nlp: Language, *, data: Optional[Path] = None, lookups: Optional[Lookups] = None, ) -> Language: if lookups: nlp.vocab.lookups = lookups msg.good(f"Added vocab lookups: {', '.join(lookups.tables)}") - data_path = util.ensure_path(vocab_data) + data_path = util.ensure_path(data) if data_path is not None: lex_attrs = srsly.read_jsonl(data_path) for lexeme in nlp.vocab: @@ -131,11 +129,29 @@ def init_vocab( msg.good(f"Added {len(nlp.vocab)} lexical entries to the vocab") -def add_tok2vec_weights(config: Config, nlp: Language) -> None: +def add_tok2vec_weights( + nlp: Language, pretrain_config: Dict[str, Any], init_config: Dict[str, Any] +) -> None: # Load pretrained tok2vec weights - cf. 
CLI command 'pretrain' - weights_data = load_from_paths(config) + P = pretrain_config + I = init_config + raw_text = util.ensure_path(I["vocab"]["raw_text"]) + if raw_text is not None: + if not raw_text.exists(): + msg.fail("Can't find raw text", raw_text, exits=1) + raw_text = list(srsly.read_jsonl(raw_text)) + weights_data = None + init_tok2vec = util.ensure_path(I["vocab"]["init_tok2vec"]) + if init_tok2vec is not None: + if P["objective"].get("type") == "vectors" and not I["vectors"]: + err = "Need initialize.vectors if pretraining.objective.type is vectors" + msg.fail(err, exits=1) + if not init_tok2vec.exists(): + msg.fail("Can't find pretrained tok2vec", init_tok2vec, exits=1) + with init_tok2vec.open("rb") as file_: + weights_data = file_.read() if weights_data is not None: - tok2vec_component = config["pretraining"]["component"] + tok2vec_component = P["component"] if tok2vec_component is None: msg.fail( f"To use pretrained tok2vec weights, [pretraining.component] " @@ -143,9 +159,8 @@ def add_tok2vec_weights(config: Config, nlp: Language) -> None: exits=1, ) layer = nlp.get_pipe(tok2vec_component).model - tok2vec_layer = config["pretraining"]["layer"] - if tok2vec_layer: - layer = layer.get_ref(tok2vec_layer) + if P["layer"]: + layer = layer.get_ref(P["layer"]) layer.from_bytes(weights_data) msg.good(f"Loaded pretrained weights into component '{tok2vec_component}'") diff --git a/spacy/cli/train.py b/spacy/cli/train.py index 7a83646ef..d69b3bd36 100644 --- a/spacy/cli/train.py +++ b/spacy/cli/train.py @@ -14,7 +14,6 @@ from .init_pipeline import init_pipeline, must_initialize from .init_pipeline import create_before_to_disk_callback from ._util import app, Arg, Opt, parse_config_overrides, show_validation_error from ._util import import_code -from ._util import load_from_paths # noqa: F401 (needed for Ray extension for now) from ..language import Language from .. 
import util from ..training.example import Example @@ -381,3 +380,26 @@ def verify_cli_args(config_path: Path, output_path: Optional[Path] = None) -> No if not output_path.exists(): output_path.mkdir() msg.good(f"Created output directory: {output_path}") + + +# TODO: this is currently imported by the ray extension and not used otherwise +def load_from_paths( + config: Config, +) -> Tuple[List[Dict[str, str]], Dict[str, dict], bytes]: + import srsly + # TODO: separate checks from loading + raw_text = util.ensure_path(config["training"]["raw_text"]) + if raw_text is not None: + if not raw_text.exists(): + msg.fail("Can't find raw text", raw_text, exits=1) + raw_text = list(srsly.read_jsonl(config["training"]["raw_text"])) + tag_map = {} + morph_rules = {} + weights_data = None + init_tok2vec = util.ensure_path(config["training"]["init_tok2vec"]) + if init_tok2vec is not None: + if not init_tok2vec.exists(): + msg.fail("Can't find pretrained tok2vec", init_tok2vec, exits=1) + with init_tok2vec.open("rb") as file_: + weights_data = file_.read() + return raw_text, tag_map, morph_rules, weights_data diff --git a/spacy/default_config.cfg b/spacy/default_config.cfg index a8f4a9497..800a2b4a3 100644 --- a/spacy/default_config.cfg +++ b/spacy/default_config.cfg @@ -108,3 +108,15 @@ grad_clip = 1.0 use_averages = false eps = 1e-8 learn_rate = 0.001 + +[initialize] +tokenizer = {} +components = {} + +[initialize.vocab] +data = null +lookups = null +vectors = null +# Extra resources for transfer-learning or pseudo-rehearsal +init_tok2vec = ${paths.init_tok2vec} +raw_text = ${paths.raw} diff --git a/spacy/schemas.py b/spacy/schemas.py index 7951b851b..6553892d3 100644 --- a/spacy/schemas.py +++ b/spacy/schemas.py @@ -273,22 +273,37 @@ class ConfigSchemaPretrain(BaseModel): arbitrary_types_allowed = True +class ConfigSchemaInitVocab(BaseModel): + # fmt: off + data: Optional[str] = Field(..., title="Path to JSON-formatted vocabulary file") + lookups: Optional[Lookups] = Field(..., title="Vocabulary lookups, e.g. lexeme normalization") + vectors: Optional[StrictStr] = Field(..., title="Path to vectors") + init_tok2vec: Optional[StrictStr] = Field(..., title="Path to pretrained tok2vec weights") + raw_text: Optional[StrictStr] = Field(default=None, title="Raw text") + # fmt: on + + class Config: + extra = "forbid" + arbitrary_types_allowed = True + + +class ConfigSchemaInit(BaseModel): + vocab: ConfigSchemaInitVocab + tokenizer: Any + components: Dict[str, Any] + + class Config: + extra = "forbid" + arbitrary_types_allowed = True + + class ConfigSchema(BaseModel): training: ConfigSchemaTraining nlp: ConfigSchemaNlp pretraining: Union[ConfigSchemaPretrain, ConfigSchemaPretrainEmpty] = {} components: Dict[str, Dict[str, Any]] corpora: Dict[str, Reader] - - @root_validator(allow_reuse=True) - def validate_config(cls, values): - """Perform additional validation for settings with dependencies.""" - pt = values.get("pretraining") - if pt and not isinstance(pt, ConfigSchemaPretrainEmpty): - if pt.objective.get("type") == "vectors" and not values["nlp"].vectors: - err = "Need nlp.vectors if pretraining.objective.type is vectors" - raise ValueError(err) - return values + initialize: ConfigSchemaInit class Config: extra = "allow" diff --git a/spacy/util.py b/spacy/util.py index de1fd7f81..cab7af8fb 100644 --- a/spacy/util.py +++ b/spacy/util.py @@ -61,7 +61,7 @@ LEXEME_NORM_LANGS = ["da", "de", "el", "en", "id", "lb", "pt", "ru", "sr", "ta", # Default order of sections in the config.cfg. 
Not all sections needs to exist, # and additional sections are added at the end, in alphabetical order. # fmt: off -CONFIG_SECTION_ORDER = ["paths", "variables", "system", "nlp", "components", "corpora", "training", "pretraining"] +CONFIG_SECTION_ORDER = ["paths", "variables", "system", "nlp", "components", "corpora", "training", "pretraining", "initialize"] # fmt: on From 9f6ad06452cd389d68cc63f5ae9a88a9943d2d72 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Mon, 28 Sep 2020 12:00:23 +0200 Subject: [PATCH 15/66] Upd default config --- spacy/default_config.cfg | 16 +++++++--------- 1 file changed, 7 insertions(+), 9 deletions(-) diff --git a/spacy/default_config.cfg b/spacy/default_config.cfg index 800a2b4a3..0ab27f499 100644 --- a/spacy/default_config.cfg +++ b/spacy/default_config.cfg @@ -48,15 +48,6 @@ max_length = 0 # Limitation on number of training examples limit = 0 -[prepare] -# The 'prepare' step is run before training or pretraining. Components and -# the tokenizer can each define their own prepare step, giving them a chance -# to gather resources like lookup-tables, build label sets, construct vocabularies, -# etc. After 'prepare' is finished, the result will be saved out to disk, which -# will then be read in at the start of training. You can call the prepare step -# separately with the `spacy prepare` command, or you can let the train script -# do it for you. - # Training hyper-parameters and additional features. [training] seed = ${system.seed} @@ -109,6 +100,13 @@ use_averages = false eps = 1e-8 learn_rate = 0.001 +# The 'initialize' step is run before training or pretraining. Components and +# the tokenizer can each define their own prepare step, giving them a chance +# to gather resources like lookup-tables, build label sets, construct vocabularies, +# etc. After 'prepare' is finished, the result will be saved out to disk, which +# will then be read in at the start of training. You can call the prepare step +# separately with the `spacy prepare` command, or you can let the train script +# do it for you. 
[initialize] tokenizer = {} components = {} From 1590de11b1e794ac4c48b21e56c81b164de57ee7 Mon Sep 17 00:00:00 2001 From: Ines Montani Date: Mon, 28 Sep 2020 12:05:23 +0200 Subject: [PATCH 16/66] Update config --- spacy/cli/init_pipeline.py | 26 ++++++++----------- spacy/cli/templates/quickstart_training.jinja | 14 ++++++---- spacy/default_config.cfg | 11 +++----- spacy/default_config_pretraining.cfg | 2 +- spacy/schemas.py | 10 ++----- 5 files changed, 26 insertions(+), 37 deletions(-) diff --git a/spacy/cli/init_pipeline.py b/spacy/cli/init_pipeline.py index 5ca565d88..78d828719 100644 --- a/spacy/cli/init_pipeline.py +++ b/spacy/cli/init_pipeline.py @@ -72,14 +72,15 @@ def init_pipeline(config: Config, use_gpu: int = -1) -> Language: config = nlp.config.interpolate() # Resolve all training-relevant sections using the filled nlp config T = registry.resolve(config["training"], schema=ConfigSchemaTraining) - dot_names = [T["train_corpus"], T["dev_corpus"], T["raw_text"]] - train_corpus, dev_corpus, raw_text = resolve_dot_names(config, dot_names) + dot_names = [T["train_corpus"], T["dev_corpus"]] + train_corpus, dev_corpus = resolve_dot_names(config, dot_names) I = registry.resolve(config["initialize"], schema=ConfigSchemaInit) - init_vocab(nlp, data=I["vocab"]["data"], lookups=I["vocab"]["lookups"]) + V = I["vocab"] + init_vocab(nlp, data=V["data"], lookups=V["lookups"]) msg.good("Created vocabulary") - if T["vectors"] is not None: - add_vectors(nlp, T["vectors"]) - msg.good(f"Added vectors: {T['vectors']}") + if V["vectors"] is not None: + add_vectors(nlp, V["vectors"]) + msg.good(f"Added vectors: {V['vectors']}") optimizer = T["optimizer"] before_to_disk = create_before_to_disk_callback(T["before_to_disk"]) # Components that shouldn't be updated during training @@ -130,20 +131,15 @@ def init_vocab( def add_tok2vec_weights( - nlp: Language, pretrain_config: Dict[str, Any], init_config: Dict[str, Any] + nlp: Language, pretrain_config: Dict[str, Any], vocab_config: Dict[str, Any] ) -> None: # Load pretrained tok2vec weights - cf. 
CLI command 'pretrain' P = pretrain_config - I = init_config - raw_text = util.ensure_path(I["vocab"]["raw_text"]) - if raw_text is not None: - if not raw_text.exists(): - msg.fail("Can't find raw text", raw_text, exits=1) - raw_text = list(srsly.read_jsonl(raw_text)) + V = vocab_config weights_data = None - init_tok2vec = util.ensure_path(I["vocab"]["init_tok2vec"]) + init_tok2vec = util.ensure_path(V["init_tok2vec"]) if init_tok2vec is not None: - if P["objective"].get("type") == "vectors" and not I["vectors"]: + if P["objective"].get("type") == "vectors" and not V["vectors"]: err = "Need initialize.vectors if pretraining.objective.type is vectors" msg.fail(err, exits=1) if not init_tok2vec.exists(): diff --git a/spacy/cli/templates/quickstart_training.jinja b/spacy/cli/templates/quickstart_training.jinja index 9a8b9d1d7..5e990611e 100644 --- a/spacy/cli/templates/quickstart_training.jinja +++ b/spacy/cli/templates/quickstart_training.jinja @@ -277,11 +277,6 @@ path = ${paths.dev} max_length = 0 [training] -{% if use_transformer or optimize == "efficiency" or not word_vectors -%} -vectors = null -{% else -%} -vectors = "{{ word_vectors }}" -{% endif -%} {% if use_transformer -%} accumulate_gradient = {{ transformer["size_factor"] }} {% endif -%} @@ -317,3 +312,12 @@ start = 100 stop = 1000 compound = 1.001 {% endif %} + +[initialize] + +[initialize.vocab] +{% if use_transformer or optimize == "efficiency" or not word_vectors -%} +vectors = null +{% else -%} +vectors = "{{ word_vectors }}" +{% endif -%} diff --git a/spacy/default_config.cfg b/spacy/default_config.cfg index 0ab27f499..083b6a702 100644 --- a/spacy/default_config.cfg +++ b/spacy/default_config.cfg @@ -1,8 +1,9 @@ [paths] train = "" dev = "" -raw = null +raw_text = null init_tok2vec = null +vocab_data = null [system] seed = 0 @@ -54,11 +55,6 @@ seed = ${system.seed} gpu_allocator = ${system.gpu_allocator} dropout = 0.1 accumulate_gradient = 1 -# Extra resources for transfer-learning or pseudo-rehearsal -init_tok2vec = ${paths.init_tok2vec} -raw_text = ${paths.raw} -vectors = null -lookups = null # Controls early-stopping. 0 or -1 mean unlimited. 
patience = 1600 max_epochs = 0 @@ -112,9 +108,8 @@ tokenizer = {} components = {} [initialize.vocab] -data = null +data = ${paths.vocab_data} lookups = null vectors = null # Extra resources for transfer-learning or pseudo-rehearsal init_tok2vec = ${paths.init_tok2vec} -raw_text = ${paths.raw} diff --git a/spacy/default_config_pretraining.cfg b/spacy/default_config_pretraining.cfg index bbd595308..122a7803a 100644 --- a/spacy/default_config_pretraining.cfg +++ b/spacy/default_config_pretraining.cfg @@ -32,7 +32,7 @@ learn_rate = 0.001 [corpora.pretrain] @readers = "spacy.JsonlReader.v1" -path = ${paths.raw} +path = ${paths.raw_text} min_length = 5 max_length = 500 limit = 0 diff --git a/spacy/schemas.py b/spacy/schemas.py index 6553892d3..b98498b8b 100644 --- a/spacy/schemas.py +++ b/spacy/schemas.py @@ -3,7 +3,6 @@ from typing import Iterable, TypeVar, TYPE_CHECKING from enum import Enum from pydantic import BaseModel, Field, ValidationError, validator from pydantic import StrictStr, StrictInt, StrictFloat, StrictBool -from pydantic import root_validator from thinc.config import Promise from collections import defaultdict from thinc.api import Optimizer @@ -205,8 +204,6 @@ class ModelMetaSchema(BaseModel): class ConfigSchemaTraining(BaseModel): # fmt: off - vectors: Optional[StrictStr] = Field(..., title="Path to vectors") - lookups: Optional[Lookups] = Field(..., title="Vocab lookups") dev_corpus: StrictStr = Field(..., title="Path in the config to the dev data") train_corpus: StrictStr = Field(..., title="Path in the config to the training data") batcher: Batcher = Field(..., title="Batcher for the training data") @@ -219,8 +216,6 @@ class ConfigSchemaTraining(BaseModel): gpu_allocator: Optional[StrictStr] = Field(..., title="Memory allocator when running on GPU") accumulate_gradient: StrictInt = Field(..., title="Whether to divide the batch up into substeps") score_weights: Dict[StrictStr, Optional[Union[StrictFloat, StrictInt]]] = Field(..., title="Scores to report and their weights for selecting final model") - init_tok2vec: Optional[StrictStr] = Field(..., title="Path to pretrained tok2vec weights") - raw_text: Optional[StrictStr] = Field(default=None, title="Raw text") optimizer: Optimizer = Field(..., title="The optimizer to use") logger: Logger = Field(..., title="The logger to track training progress") frozen_components: List[str] = Field(..., title="Pipeline components that shouldn't be updated during training") @@ -275,11 +270,10 @@ class ConfigSchemaPretrain(BaseModel): class ConfigSchemaInitVocab(BaseModel): # fmt: off - data: Optional[str] = Field(..., title="Path to JSON-formatted vocabulary file") + data: Optional[StrictStr] = Field(..., title="Path to JSON-formatted vocabulary file") lookups: Optional[Lookups] = Field(..., title="Vocabulary lookups, e.g. 
lexeme normalization") vectors: Optional[StrictStr] = Field(..., title="Path to vectors") init_tok2vec: Optional[StrictStr] = Field(..., title="Path to pretrained tok2vec weights") - raw_text: Optional[StrictStr] = Field(default=None, title="Raw text") # fmt: on class Config: @@ -290,7 +284,7 @@ class ConfigSchemaInitVocab(BaseModel): class ConfigSchemaInit(BaseModel): vocab: ConfigSchemaInitVocab tokenizer: Any - components: Dict[str, Any] + components: Dict[StrictStr, Any] class Config: extra = "forbid" From a5f2cc05090a3fde472b7a61958cc08c86099a8f Mon Sep 17 00:00:00 2001 From: Ines Montani Date: Mon, 28 Sep 2020 12:30:13 +0200 Subject: [PATCH 17/66] Tidy up and remove raw text (rehearsal) for now --- spacy/cli/init_pipeline.py | 14 -------- spacy/cli/train.py | 67 ++++++++++++++++++-------------------- spacy/default_config.cfg | 1 - 3 files changed, 31 insertions(+), 51 deletions(-) diff --git a/spacy/cli/init_pipeline.py b/spacy/cli/init_pipeline.py index 78d828719..a2fd4c838 100644 --- a/spacy/cli/init_pipeline.py +++ b/spacy/cli/init_pipeline.py @@ -42,20 +42,6 @@ def init_pipeline_cli( msg.good(f"Saved initialized pipeline to {output_path}") -def must_initialize(init_path: Path, config_path: Path, overrides: Dict) -> bool: - config = util.load_config(config_path, overrides=overrides) - if not init_path.exists(): - return True - elif not (init_path / "config.cfg").exists(): - return True - else: - init_cfg = util.load_config(init_path / "config.cfg", interpolate=True) - if config.to_str() != init_cfg.to_str(): - return True - else: - return False - - def init_pipeline(config: Config, use_gpu: int = -1) -> Language: raw_config = config config = raw_config.interpolate() diff --git a/spacy/cli/train.py b/spacy/cli/train.py index d69b3bd36..e179a1e3d 100644 --- a/spacy/cli/train.py +++ b/spacy/cli/train.py @@ -10,13 +10,12 @@ import random import typer import logging -from .init_pipeline import init_pipeline, must_initialize +from .init_pipeline import init_pipeline from .init_pipeline import create_before_to_disk_callback from ._util import app, Arg, Opt, parse_config_overrides, show_validation_error from ._util import import_code from ..language import Language from .. import util -from ..training.example import Example from ..errors import Errors from ..util import resolve_dot_names, registry from ..schemas import ConfigSchemaTraining @@ -69,24 +68,39 @@ def train_cli( def init_nlp( config: Config, output_path: Optional[Path], init_path: Optional[Path] ) -> None: - if init_path is not None: nlp = util.load_model(init_path) - # TODO: how to handle provided pipeline that needs to be reinitialized? + if must_reinitialize(config, nlp.config): + msg.fail( + f"Config has changed: can't use initialized pipeline from " + f"{init_path}. 
Please re-run 'spacy init nlp'.", + exits=1, + ) msg.good(f"Loaded initialized pipeline from {init_path}") return nlp if output_path is not None: output_init_path = output_path / "model-initial" - if must_initialize(config, output_init_path): - msg.warn("TODO:") + if not output_init_path.exists(): + msg.info(f"Initializing the pipeline in {output_init_path}") nlp = init_pipeline(config) - nlp.to_disk(init_path) + nlp.to_disk(output_init_path) msg.good(f"Saved initialized pipeline to {output_init_path}") else: nlp = util.load_model(output_init_path) - msg.good(f"Loaded initialized pipeline from {output_init_path}") + if must_reinitialize(config, nlp.config): + msg.warn("Config has changed: need to re-initialize pipeline") + nlp = init_pipeline(config) + nlp.to_disk(output_init_path) + msg.good(f"Re-initialized pipeline in {output_init_path}") + else: + msg.good(f"Loaded initialized pipeline from {output_init_path}") return nlp - msg.warn("TODO:") + msg.warn( + "Not saving initialized model: no output directory specified. " + "To speed up training, spaCy can save the initialized nlp object with " + "the vocabulary, vectors and label scheme. To take advantage of this, " + "provide an output directory or use the 'spacy init nlp' command." + ) return init_pipeline(config) @@ -101,8 +115,8 @@ def train( if use_gpu >= 0 and allocator: set_gpu_allocator(allocator) T = registry.resolve(config["training"], schema=ConfigSchemaTraining) - dot_names = [T["train_corpus"], T["dev_corpus"], T["raw_text"]] - train_corpus, dev_corpus, raw_text = resolve_dot_names(config, dot_names) + dot_names = [T["train_corpus"], T["dev_corpus"]] + train_corpus, dev_corpus = resolve_dot_names(config, dot_names) optimizer = T["optimizer"] score_weights = T["score_weights"] batcher = T["batcher"] @@ -121,7 +135,6 @@ def train( patience=T["patience"], max_steps=T["max_steps"], eval_frequency=T["eval_frequency"], - raw_text=raw_text, exclude=frozen_components, ) msg.info(f"Pipeline: {nlp.pipe_names}") @@ -171,6 +184,11 @@ def train( msg.good(f"Saved pipeline to output directory {final_model_path}") +def must_reinitialize(train_config: Config, init_config: Config) -> bool: + # TODO: do this better and more fine-grained + return train_config.interpolate().to_str() == init_config.interpolate().to_str() + + def add_vectors(nlp: Language, vectors: str) -> None: title = f"Config validation error for vectors {vectors}" desc = ( @@ -235,7 +253,6 @@ def train_while_improving( accumulate_gradient: int, patience: int, max_steps: int, - raw_text: List[Dict[str, str]], exclude: List[str], ): """Train until an evaluation stops improving. Works as a generator, @@ -282,27 +299,14 @@ def train_while_improving( dropouts = dropout results = [] losses = {} - if raw_text: - random.shuffle(raw_text) - raw_examples = [ - Example.from_dict(nlp.make_doc(rt["text"]), {}) for rt in raw_text - ] - raw_batches = util.minibatch(raw_examples, size=8) - words_seen = 0 start_time = timer() for step, (epoch, batch) in enumerate(train_data): dropout = next(dropouts) for subbatch in subdivide_batch(batch, accumulate_gradient): - nlp.update( subbatch, drop=dropout, losses=losses, sgd=False, exclude=exclude ) - if raw_text: - # If raw text is available, perform 'rehearsal' updates, - # which use unlabelled data to reduce overfitting. 
- raw_batch = list(next(raw_batches)) - nlp.rehearse(raw_batch, sgd=optimizer, losses=losses, exclude=exclude) # TODO: refactor this so we don't have to run it separately in here for name, proc in nlp.pipeline: if ( @@ -386,15 +390,6 @@ def verify_cli_args(config_path: Path, output_path: Optional[Path] = None) -> No def load_from_paths( config: Config, ) -> Tuple[List[Dict[str, str]], Dict[str, dict], bytes]: - import srsly - # TODO: separate checks from loading - raw_text = util.ensure_path(config["training"]["raw_text"]) - if raw_text is not None: - if not raw_text.exists(): - msg.fail("Can't find raw text", raw_text, exits=1) - raw_text = list(srsly.read_jsonl(config["training"]["raw_text"])) - tag_map = {} - morph_rules = {} weights_data = None init_tok2vec = util.ensure_path(config["training"]["init_tok2vec"]) if init_tok2vec is not None: @@ -402,4 +397,4 @@ def load_from_paths( msg.fail("Can't find pretrained tok2vec", init_tok2vec, exits=1) with init_tok2vec.open("rb") as file_: weights_data = file_.read() - return raw_text, tag_map, morph_rules, weights_data + return None, {}, {}, weights_data diff --git a/spacy/default_config.cfg b/spacy/default_config.cfg index 083b6a702..86293fd40 100644 --- a/spacy/default_config.cfg +++ b/spacy/default_config.cfg @@ -1,7 +1,6 @@ [paths] train = "" dev = "" -raw_text = null init_tok2vec = null vocab_data = null From f49288ab81d9d2b2095eb5513b6fc79fcc68cac1 Mon Sep 17 00:00:00 2001 From: Ines Montani Date: Mon, 28 Sep 2020 12:31:54 +0200 Subject: [PATCH 18/66] Update default_config_pretraining.cfg --- spacy/default_config_pretraining.cfg | 3 +++ 1 file changed, 3 insertions(+) diff --git a/spacy/default_config_pretraining.cfg b/spacy/default_config_pretraining.cfg index 122a7803a..4011159a4 100644 --- a/spacy/default_config_pretraining.cfg +++ b/spacy/default_config_pretraining.cfg @@ -1,3 +1,6 @@ +[paths] +raw_text = null + [pretraining] max_epochs = 1000 dropout = 0.2 From c22ecc66bbed5a98242d0b8b45c145f6abc5598f Mon Sep 17 00:00:00 2001 From: Ines Montani Date: Mon, 28 Sep 2020 12:46:28 +0200 Subject: [PATCH 19/66] Don't support init path for now --- spacy/cli/init_pipeline.py | 5 +++-- spacy/cli/train.py | 37 ++++++++++++------------------------- 2 files changed, 15 insertions(+), 27 deletions(-) diff --git a/spacy/cli/init_pipeline.py b/spacy/cli/init_pipeline.py index a2fd4c838..e64683fe1 100644 --- a/spacy/cli/init_pipeline.py +++ b/spacy/cli/init_pipeline.py @@ -20,7 +20,9 @@ DEFAULT_OOV_PROB = -20 @init_cli.command( - "nlp", context_settings={"allow_extra_args": True, "ignore_unknown_options": True}, + "nlp", + context_settings={"allow_extra_args": True, "ignore_unknown_options": True}, + hidden=True, ) def init_pipeline_cli( # fmt: off @@ -38,7 +40,6 @@ def init_pipeline_cli( config = util.load_config(config_path, overrides=overrides) nlp = init_pipeline(config) nlp.to_disk(output_path) - # TODO: add more instructions msg.good(f"Saved initialized pipeline to {output_path}") diff --git a/spacy/cli/train.py b/spacy/cli/train.py index e179a1e3d..afaf230d1 100644 --- a/spacy/cli/train.py +++ b/spacy/cli/train.py @@ -30,7 +30,6 @@ def train_cli( config_path: Path = Arg(..., help="Path to config file", exists=True), output_path: Optional[Path] = Opt(None, "--output", "--output-path", "-o", help="Output directory to store trained pipeline in"), code_path: Optional[Path] = Opt(None, "--code", "-c", help="Path to Python file with additional code (registered functions) to be imported"), - init_path: Optional[Path] = Opt(None, "--init", "-i", 
help="Path to already initialized pipeline directory, e.g. created with 'spacy init pipeline' (will speed up training)"), verbose: bool = Opt(False, "--verbose", "-V", "-VV", help="Display more information for debugging purposes"), use_gpu: int = Opt(-1, "--gpu-id", "-g", help="GPU ID or -1 for CPU") # fmt: on @@ -60,46 +59,34 @@ def train_cli( msg.info("Using CPU") config = util.load_config(config_path, overrides=overrides, interpolate=False) msg.divider("Initializing pipeline") - nlp = init_nlp(config, output_path, init_path) + nlp = init_nlp(config, output_path) msg.divider("Training pipeline") train(nlp, output_path, use_gpu=use_gpu) -def init_nlp( - config: Config, output_path: Optional[Path], init_path: Optional[Path] -) -> None: - if init_path is not None: - nlp = util.load_model(init_path) - if must_reinitialize(config, nlp.config): - msg.fail( - f"Config has changed: can't use initialized pipeline from " - f"{init_path}. Please re-run 'spacy init nlp'.", - exits=1, - ) - msg.good(f"Loaded initialized pipeline from {init_path}") - return nlp +def init_nlp(config: Config, output_path: Optional[Path]) -> Language: if output_path is not None: - output_init_path = output_path / "model-initial" - if not output_init_path.exists(): - msg.info(f"Initializing the pipeline in {output_init_path}") + init_path = output_path / "model-initial" + if not init_path.exists(): + msg.info(f"Initializing the pipeline in {init_path}") nlp = init_pipeline(config) - nlp.to_disk(output_init_path) - msg.good(f"Saved initialized pipeline to {output_init_path}") + nlp.to_disk(init_path) + msg.good(f"Saved initialized pipeline to {init_path}") else: - nlp = util.load_model(output_init_path) + nlp = util.load_model(init_path) if must_reinitialize(config, nlp.config): msg.warn("Config has changed: need to re-initialize pipeline") nlp = init_pipeline(config) - nlp.to_disk(output_init_path) - msg.good(f"Re-initialized pipeline in {output_init_path}") + nlp.to_disk(init_path) + msg.good(f"Re-initialized pipeline in {init_path}") else: - msg.good(f"Loaded initialized pipeline from {output_init_path}") + msg.good(f"Loaded initialized pipeline from {init_path}") return nlp msg.warn( "Not saving initialized model: no output directory specified. " "To speed up training, spaCy can save the initialized nlp object with " "the vocabulary, vectors and label scheme. To take advantage of this, " - "provide an output directory or use the 'spacy init nlp' command." + "provide an output directory." 
) return init_pipeline(config) From a62337b3f381b061b2ec27e6d9e9ba718276131b Mon Sep 17 00:00:00 2001 From: Ines Montani Date: Mon, 28 Sep 2020 12:53:06 +0200 Subject: [PATCH 20/66] Tidy up vocab init --- spacy/cli/init_pipeline.py | 16 ++++++++++------ 1 file changed, 10 insertions(+), 6 deletions(-) diff --git a/spacy/cli/init_pipeline.py b/spacy/cli/init_pipeline.py index e64683fe1..28e314d0a 100644 --- a/spacy/cli/init_pipeline.py +++ b/spacy/cli/init_pipeline.py @@ -63,11 +63,7 @@ def init_pipeline(config: Config, use_gpu: int = -1) -> Language: train_corpus, dev_corpus = resolve_dot_names(config, dot_names) I = registry.resolve(config["initialize"], schema=ConfigSchemaInit) V = I["vocab"] - init_vocab(nlp, data=V["data"], lookups=V["lookups"]) - msg.good("Created vocabulary") - if V["vectors"] is not None: - add_vectors(nlp, V["vectors"]) - msg.good(f"Added vectors: {V['vectors']}") + init_vocab(nlp, data=V["data"], lookups=V["lookups"], vectors=V["vectors"]) optimizer = T["optimizer"] before_to_disk = create_before_to_disk_callback(T["before_to_disk"]) # Components that shouldn't be updated during training @@ -94,7 +90,11 @@ def init_pipeline(config: Config, use_gpu: int = -1) -> Language: def init_vocab( - nlp: Language, *, data: Optional[Path] = None, lookups: Optional[Lookups] = None, + nlp: Language, + *, + data: Optional[Path] = None, + lookups: Optional[Lookups] = None, + vectors: Optional[str] = None, ) -> Language: if lookups: nlp.vocab.lookups = lookups @@ -115,6 +115,10 @@ def init_vocab( oov_prob = DEFAULT_OOV_PROB nlp.vocab.cfg.update({"oov_prob": oov_prob}) msg.good(f"Added {len(nlp.vocab)} lexical entries to the vocab") + msg.good("Created vocabulary") + if vectors is not None: + add_vectors(nlp, vectors) + msg.good(f"Added vectors: {V['vectors']}") def add_tok2vec_weights( From a89e0ff7cb6cb120652ca7994e078778d2b8804a Mon Sep 17 00:00:00 2001 From: Ines Montani Date: Mon, 28 Sep 2020 12:55:21 +0200 Subject: [PATCH 21/66] Fix typo --- spacy/cli/init_pipeline.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/spacy/cli/init_pipeline.py b/spacy/cli/init_pipeline.py index 28e314d0a..0c4b6ec70 100644 --- a/spacy/cli/init_pipeline.py +++ b/spacy/cli/init_pipeline.py @@ -118,7 +118,7 @@ def init_vocab( msg.good("Created vocabulary") if vectors is not None: add_vectors(nlp, vectors) - msg.good(f"Added vectors: {V['vectors']}") + msg.good(f"Added vectors: {vectors}") def add_tok2vec_weights( From 822ea4ef619072a94ce565bf78add9f9ea9d2866 Mon Sep 17 00:00:00 2001 From: Ines Montani Date: Mon, 28 Sep 2020 15:09:59 +0200 Subject: [PATCH 22/66] Refactor CLI --- spacy/cli/__init__.py | 3 +- spacy/cli/_util.py | 35 +-- spacy/cli/debug_config.py | 28 +-- spacy/cli/debug_data.py | 27 +-- spacy/cli/debug_model.py | 18 +- spacy/cli/evaluate.py | 7 +- spacy/cli/init_pipeline.py | 185 +-------------- spacy/cli/pretrain.py | 269 +--------------------- spacy/cli/train.py | 330 ++------------------------- spacy/tests/pipeline/test_textcat.py | 4 +- spacy/tests/test_cli.py | 13 -- spacy/tests/test_misc.py | 14 -- spacy/tests/test_util.py | 31 ++- spacy/tests/training/test_readers.py | 22 +- spacy/training/initialize.py | 205 +++++++++++++++++ spacy/training/loop.py | 301 ++++++++++++++++++++++++ spacy/training/pretrain.py | 267 ++++++++++++++++++++++ spacy/util.py | 71 +++--- 18 files changed, 917 insertions(+), 913 deletions(-) create mode 100644 spacy/training/loop.py create mode 100644 spacy/training/pretrain.py diff --git a/spacy/cli/__init__.py b/spacy/cli/__init__.py 
index 5569e630d..7368bcef3 100644 --- a/spacy/cli/__init__.py +++ b/spacy/cli/__init__.py @@ -15,8 +15,7 @@ from .debug_config import debug_config # noqa: F401 from .debug_model import debug_model # noqa: F401 from .evaluate import evaluate # noqa: F401 from .convert import convert # noqa: F401 -#from .init_model import init_model # noqa: F401 -from .init_pipeline import init_pipeline # noqa: F401 +from .init_pipeline import init_pipeline_cli # noqa: F401 from .init_config import init_config, fill_config # noqa: F401 from .validate import validate # noqa: F401 from .project.clone import project_clone # noqa: F401 diff --git a/spacy/cli/_util.py b/spacy/cli/_util.py index 7ff2c6199..c41905970 100644 --- a/spacy/cli/_util.py +++ b/spacy/cli/_util.py @@ -10,13 +10,12 @@ from click import NoSuchOption from click.parser import split_arg_string from typer.main import get_command from contextlib import contextmanager -from thinc.api import Config, ConfigValidationError +from thinc.api import Config, ConfigValidationError, require_gpu from configparser import InterpolationError import os from ..schemas import ProjectConfigSchema, validate from ..util import import_file, run_command, make_tempdir, registry, logger -from ..util import ensure_path if TYPE_CHECKING: from pathy import Pathy # noqa: F401 @@ -276,18 +275,6 @@ def import_code(code_path: Optional[Union[Path, str]]) -> None: msg.fail(f"Couldn't load Python code: {code_path}", e, exits=1) -def get_sourced_components(config: Union[Dict[str, Any], Config]) -> List[str]: - """RETURNS (List[str]): All sourced components in the original config, - e.g. {"source": "en_core_web_sm"}. If the config contains a key - "factory", we assume it refers to a component factory. - """ - return [ - name - for name, cfg in config.get("components", {}).items() - if "factory" not in cfg and "source" in cfg - ] - - def upload_file(src: Path, dest: Union[str, "Pathy"]) -> None: """Upload a file. @@ -459,3 +446,23 @@ def string_to_list(value: str, intify: bool = False) -> Union[List[str], List[in p = int(p) result.append(p) return result + + +class CliLogger: + """Helper mocking up the most commonly used logger methods. Can be passed + into functions like train() to make them output pretty-printed messages + on the CLI and regular logging if used from within Python. + """ + + debug = msg.text + info = msg.info + warn = msg.info + error = msg.fail + + +def setup_gpu(use_gpu: int): + if use_gpu >= 0: + msg.info(f"Using GPU: {use_gpu}") + require_gpu(use_gpu) + else: + msg.info("Using CPU") diff --git a/spacy/cli/debug_config.py b/spacy/cli/debug_config.py index 131fecf6d..d1dcc45b9 100644 --- a/spacy/cli/debug_config.py +++ b/spacy/cli/debug_config.py @@ -1,7 +1,7 @@ from typing import Optional, Dict, Any, Union, List from pathlib import Path from wasabi import msg, table -from thinc.api import Config, ConfigValidationError +from thinc.api import Config from thinc.config import VARIABLE_RE import typer @@ -52,10 +52,8 @@ def debug_config( with show_validation_error(config_path): config = util.load_config(config_path, overrides=overrides) nlp = util.load_model_from_config(config) - # Use the resolved config here in case user has one function returning - # a dict of corpora etc. 
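Note: the dot-name handling this hunk switches to treats values like "training.dev_corpus" as dotted references into other config sections, which resolve_dot_names validates and resolves. Below is a standalone sketch of that lookup using plain dicts; the toy config is made up, and the helper here is a simplified stand-in for spacy.util.dot_to_object:

from functools import reduce

def dot_to_object(config: dict, path: str):
    # Walk a nested dict by a dotted path, e.g. "training.dev_corpus".
    return reduce(lambda obj, key: obj[key], path.split("."), config)

config = {
    "training": {"train_corpus": "corpora.train", "dev_corpus": "corpora.dev"},
    "corpora": {"train": {"path": "train.spacy"}, "dev": {"path": "dev.spacy"}},
}
for field in ("training.train_corpus", "training.dev_corpus"):
    ref = dot_to_object(config, field)       # e.g. "corpora.train"
    section = dot_to_object(config, ref)     # raises KeyError if the reference dangles
    print(field, "->", ref, "->", section)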
- resolved = util.resolve_training_config(nlp.config) - check_section_refs(resolved, ["training.dev_corpus", "training.train_corpus"]) + dot_names = ["training.dev_corpus", "training.train_corpus"] + util.resolve_dot_names(nlp.config, dot_names) msg.good("Config is valid") if show_vars: variables = get_variables(config) @@ -97,23 +95,3 @@ def get_variables(config: Config) -> Dict[str, Any]: value = util.dot_to_object(config, path) result[variable] = repr(value) return result - - -def check_section_refs(config: Config, fields: List[str]) -> None: - """Validate fields in the config that refer to other sections or values - (e.g. in the corpora) and make sure that those references exist. - """ - errors = [] - for field in fields: - # If the field doesn't exist in the config, we ignore it - try: - value = util.dot_to_object(config, field) - except KeyError: - continue - try: - util.dot_to_object(config, value) - except KeyError: - msg = f"not a valid section reference: {value}" - errors.append({"loc": field.split("."), "msg": msg}) - if errors: - raise ConfigValidationError(config=config, errors=errors) diff --git a/spacy/cli/debug_data.py b/spacy/cli/debug_data.py index 302bfd563..f0e76be2b 100644 --- a/spacy/cli/debug_data.py +++ b/spacy/cli/debug_data.py @@ -7,10 +7,13 @@ from wasabi import Printer, MESSAGES, msg import typer from ._util import app, Arg, Opt, show_validation_error, parse_config_overrides -from ._util import import_code, debug_cli, get_sourced_components +from ._util import import_code, debug_cli from ..training import Corpus, Example +from ..training.initialize import get_sourced_components +from ..schemas import ConfigSchemaTraining from ..pipeline._parser_internals import nonproj from ..language import Language +from ..util import registry from .. 
import util @@ -94,26 +97,13 @@ def debug_data( with show_validation_error(config_path): cfg = util.load_config(config_path, overrides=config_overrides) nlp = util.load_model_from_config(cfg) - C = util.resolve_training_config(nlp.config) + T = registry.resolve(nlp.config["training"], schema=ConfigSchemaTraining) # Use original config here, not resolved version sourced_components = get_sourced_components(cfg) - frozen_components = C["training"]["frozen_components"] + frozen_components = T["frozen_components"] resume_components = [p for p in sourced_components if p not in frozen_components] pipeline = nlp.pipe_names factory_names = [nlp.get_pipe_meta(pipe).factory for pipe in nlp.pipe_names] - tag_map_path = util.ensure_path(C["training"]["tag_map"]) - tag_map = {} - if tag_map_path is not None: - tag_map = srsly.read_json(tag_map_path) - morph_rules_path = util.ensure_path(C["training"]["morph_rules"]) - morph_rules = {} - if morph_rules_path is not None: - morph_rules = srsly.read_json(morph_rules_path) - # Replace tag map with provided mapping - nlp.vocab.morphology.load_tag_map(tag_map) - # Load morph rules - nlp.vocab.morphology.load_morph_exceptions(morph_rules) - msg.divider("Data file validation") # Create the gold corpus to be able to better analyze data @@ -145,10 +135,10 @@ def debug_data( train_texts = gold_train_data["texts"] dev_texts = gold_dev_data["texts"] - frozen_components = C["training"]["frozen_components"] + frozen_components = T["frozen_components"] msg.divider("Training stats") - msg.text(f"Language: {C['nlp']['lang']}") + msg.text(f"Language: {nlp.lang}") msg.text(f"Training pipeline: {', '.join(pipeline)}") if resume_components: msg.text(f"Components from other pipelines: {', '.join(resume_components)}") @@ -355,6 +345,7 @@ def debug_data( if "tagger" in factory_names: msg.divider("Part-of-speech Tagging") labels = [label for label in gold_train_data["tags"]] + # TODO: does this need to be updated? tag_map = nlp.vocab.morphology.tag_map msg.info(f"{len(labels)} label(s) in data ({len(tag_map)} label(s) in tag map)") labels_with_counts = _format_labels( diff --git a/spacy/cli/debug_model.py b/spacy/cli/debug_model.py index 6f554ed2d..f8fc687fa 100644 --- a/spacy/cli/debug_model.py +++ b/spacy/cli/debug_model.py @@ -4,12 +4,14 @@ from pathlib import Path from spacy.training import Example from spacy.util import dot_to_object from wasabi import msg -from thinc.api import require_gpu, fix_random_seed, set_dropout_rate, Adam +from thinc.api import fix_random_seed, set_dropout_rate, Adam from thinc.api import Model, data_validation, set_gpu_allocator import typer from ._util import Arg, Opt, debug_cli, show_validation_error -from ._util import parse_config_overrides, string_to_list +from ._util import parse_config_overrides, string_to_list, setup_gpu +from ..schemas import ConfigSchemaTraining +from ..util import registry from .. 
import util @@ -37,11 +39,7 @@ def debug_model_cli( DOCS: https://nightly.spacy.io/api/cli#debug-model """ - if use_gpu >= 0: - msg.info("Using GPU") - require_gpu(use_gpu) - else: - msg.info("Using CPU") + setup_gpu(use_gpu) layers = string_to_list(layers, intify=True) print_settings = { "dimensions": dimensions, @@ -65,8 +63,8 @@ def debug_model_cli( set_gpu_allocator(allocator) with show_validation_error(config_path): nlp = util.load_model_from_config(raw_config) - C = util.resolve_training_config(nlp.config) - seed = C["training"]["seed"] + T = registry.resolve(nlp.config["training"], schema=ConfigSchemaTraining) + seed = T["seed"] if seed is not None: msg.info(f"Fixing random seed: {seed}") fix_random_seed(seed) @@ -77,7 +75,7 @@ def debug_model_cli( exits=1, ) model = pipe.model - debug_model(C, nlp, model, print_settings=print_settings) + debug_model(T, nlp, model, print_settings=print_settings) def debug_model( diff --git a/spacy/cli/evaluate.py b/spacy/cli/evaluate.py index f9954d9ad..4c1eeb9e8 100644 --- a/spacy/cli/evaluate.py +++ b/spacy/cli/evaluate.py @@ -3,11 +3,11 @@ from wasabi import Printer from pathlib import Path import re import srsly -from thinc.api import require_gpu, fix_random_seed +from thinc.api import fix_random_seed from ..training import Corpus from ..tokens import Doc -from ._util import app, Arg, Opt +from ._util import app, Arg, Opt, setup_gpu from ..scorer import Scorer from .. import util from .. import displacy @@ -61,8 +61,7 @@ def evaluate( ) -> Scorer: msg = Printer(no_print=silent, pretty=not silent) fix_random_seed() - if use_gpu >= 0: - require_gpu(use_gpu) + setup_gpu(use_gpu) data_path = util.ensure_path(data_path) output_path = util.ensure_path(output) displacy_path = util.ensure_path(displacy_path) diff --git a/spacy/cli/init_pipeline.py b/spacy/cli/init_pipeline.py index 0c4b6ec70..de1dc8a46 100644 --- a/spacy/cli/init_pipeline.py +++ b/spacy/cli/init_pipeline.py @@ -1,22 +1,13 @@ -from typing import Optional, Dict, Callable, Any +from typing import Optional import logging from pathlib import Path from wasabi import msg import typer -from thinc.api import Config, fix_random_seed, set_gpu_allocator -import srsly from .. 
import util -from ..util import registry, resolve_dot_names, OOV_RANK -from ..schemas import ConfigSchemaTraining, ConfigSchemaPretrain, ConfigSchemaInit -from ..language import Language -from ..lookups import Lookups -from ..errors import Errors +from ..training.initialize import init_nlp from ._util import init_cli, Arg, Opt, parse_config_overrides, show_validation_error -from ._util import import_code, get_sourced_components - - -DEFAULT_OOV_PROB = -20 +from ._util import import_code, CliLogger, setup_gpu @init_cli.command( @@ -31,178 +22,16 @@ def init_pipeline_cli( output_path: Path = Arg(..., help="Output directory for the prepared data"), code_path: Optional[Path] = Opt(None, "--code", "-c", help="Path to Python file with additional code (registered functions) to be imported"), verbose: bool = Opt(False, "--verbose", "-V", "-VV", help="Display more information for debugging purposes"), + use_gpu: int = Opt(-1, "--gpu-id", "-g", help="GPU ID or -1 for CPU") # fmt: on ): util.logger.setLevel(logging.DEBUG if verbose else logging.ERROR) overrides = parse_config_overrides(ctx.args) import_code(code_path) + setup_gpu(use_gpu) with show_validation_error(config_path): config = util.load_config(config_path, overrides=overrides) - nlp = init_pipeline(config) + with show_validation_error(hint_fill=False): + nlp = init_nlp(config, use_gpu=use_gpu, logger=CliLogger, on_succcess=msg.good) nlp.to_disk(output_path) msg.good(f"Saved initialized pipeline to {output_path}") - - -def init_pipeline(config: Config, use_gpu: int = -1) -> Language: - raw_config = config - config = raw_config.interpolate() - if config["training"]["seed"] is not None: - fix_random_seed(config["training"]["seed"]) - allocator = config["training"]["gpu_allocator"] - if use_gpu >= 0 and allocator: - set_gpu_allocator(allocator) - # Use original config here before it's resolved to functions - sourced_components = get_sourced_components(config) - with show_validation_error(): - nlp = util.load_model_from_config(raw_config, auto_fill=True) - msg.good("Set up nlp object from config") - config = nlp.config.interpolate() - # Resolve all training-relevant sections using the filled nlp config - T = registry.resolve(config["training"], schema=ConfigSchemaTraining) - dot_names = [T["train_corpus"], T["dev_corpus"]] - train_corpus, dev_corpus = resolve_dot_names(config, dot_names) - I = registry.resolve(config["initialize"], schema=ConfigSchemaInit) - V = I["vocab"] - init_vocab(nlp, data=V["data"], lookups=V["lookups"], vectors=V["vectors"]) - optimizer = T["optimizer"] - before_to_disk = create_before_to_disk_callback(T["before_to_disk"]) - # Components that shouldn't be updated during training - frozen_components = T["frozen_components"] - # Sourced components that require resume_training - resume_components = [p for p in sourced_components if p not in frozen_components] - msg.info(f"Pipeline: {nlp.pipe_names}") - if resume_components: - with nlp.select_pipes(enable=resume_components): - msg.info(f"Resuming training for: {resume_components}") - nlp.resume_training(sgd=optimizer) - with nlp.select_pipes(disable=[*frozen_components, *resume_components]): - nlp.begin_training(lambda: train_corpus(nlp), sgd=optimizer) - msg.good(f"Initialized pipeline components") - # Verify the config after calling 'begin_training' to ensure labels - # are properly initialized - verify_config(nlp) - if "pretraining" in config and config["pretraining"]: - P = registry.resolve(config["pretraining"], schema=ConfigSchemaPretrain) - add_tok2vec_weights(nlp, 
P, I) - # TODO: this should be handled better? - nlp = before_to_disk(nlp) - return nlp - - -def init_vocab( - nlp: Language, - *, - data: Optional[Path] = None, - lookups: Optional[Lookups] = None, - vectors: Optional[str] = None, -) -> Language: - if lookups: - nlp.vocab.lookups = lookups - msg.good(f"Added vocab lookups: {', '.join(lookups.tables)}") - data_path = util.ensure_path(data) - if data_path is not None: - lex_attrs = srsly.read_jsonl(data_path) - for lexeme in nlp.vocab: - lexeme.rank = OOV_RANK - for attrs in lex_attrs: - if "settings" in attrs: - continue - lexeme = nlp.vocab[attrs["orth"]] - lexeme.set_attrs(**attrs) - if len(nlp.vocab): - oov_prob = min(lex.prob for lex in nlp.vocab) - 1 - else: - oov_prob = DEFAULT_OOV_PROB - nlp.vocab.cfg.update({"oov_prob": oov_prob}) - msg.good(f"Added {len(nlp.vocab)} lexical entries to the vocab") - msg.good("Created vocabulary") - if vectors is not None: - add_vectors(nlp, vectors) - msg.good(f"Added vectors: {vectors}") - - -def add_tok2vec_weights( - nlp: Language, pretrain_config: Dict[str, Any], vocab_config: Dict[str, Any] -) -> None: - # Load pretrained tok2vec weights - cf. CLI command 'pretrain' - P = pretrain_config - V = vocab_config - weights_data = None - init_tok2vec = util.ensure_path(V["init_tok2vec"]) - if init_tok2vec is not None: - if P["objective"].get("type") == "vectors" and not V["vectors"]: - err = "Need initialize.vectors if pretraining.objective.type is vectors" - msg.fail(err, exits=1) - if not init_tok2vec.exists(): - msg.fail("Can't find pretrained tok2vec", init_tok2vec, exits=1) - with init_tok2vec.open("rb") as file_: - weights_data = file_.read() - if weights_data is not None: - tok2vec_component = P["component"] - if tok2vec_component is None: - msg.fail( - f"To use pretrained tok2vec weights, [pretraining.component] " - f"needs to specify the component that should load them.", - exits=1, - ) - layer = nlp.get_pipe(tok2vec_component).model - if P["layer"]: - layer = layer.get_ref(P["layer"]) - layer.from_bytes(weights_data) - msg.good(f"Loaded pretrained weights into component '{tok2vec_component}'") - - -def add_vectors(nlp: Language, vectors: str) -> None: - title = f"Config validation error for vectors {vectors}" - desc = ( - "This typically means that there's a problem in the config.cfg included " - "with the packaged vectors. Make sure that the vectors package you're " - "loading is compatible with the current version of spaCy." - ) - with show_validation_error( - title=title, desc=desc, hint_fill=False, show_config=False - ): - util.load_vectors_into_model(nlp, vectors) - msg(f"Added {len(nlp.vocab.vectors)} vectors from {vectors}") - - -def verify_config(nlp: Language) -> None: - """Perform additional checks based on the config, loaded nlp object and training data.""" - # TODO: maybe we should validate based on the actual components, the list - # in config["nlp"]["pipeline"] instead? 
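Note: the verify_textcat_config check removed just below (and re-added in spacy/training/initialize.py later in this patch) enforces a simple invariant: positive_label must be one of the textcat labels and only makes sense for binary classification. A rough standalone sketch of that invariant, without spaCy's error codes:

def check_positive_label(labels, positive_label):
    # positive_label must be a known label, and the task must be binary.
    if positive_label is None:
        return
    if positive_label not in labels:
        raise ValueError(f"positive_label {positive_label!r} not in {sorted(labels)}")
    if len(set(labels)) != 2:
        raise ValueError(f"positive_label needs exactly 2 labels, got {sorted(labels)}")

check_positive_label({"POSITIVE", "NEGATIVE"}, "POSITIVE")   # passes
# check_positive_label({"A", "B", "C"}, "A")                 # would raise ValueError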
- for pipe_config in nlp.config["components"].values(): - # We can't assume that the component name == the factory - factory = pipe_config["factory"] - if factory == "textcat": - verify_textcat_config(nlp, pipe_config) - - -def verify_textcat_config(nlp: Language, pipe_config: Dict[str, Any]) -> None: - # if 'positive_label' is provided: double check whether it's in the data and - # the task is binary - if pipe_config.get("positive_label"): - textcat_labels = nlp.get_pipe("textcat").labels - pos_label = pipe_config.get("positive_label") - if pos_label not in textcat_labels: - raise ValueError( - Errors.E920.format(pos_label=pos_label, labels=textcat_labels) - ) - if len(list(textcat_labels)) != 2: - raise ValueError( - Errors.E919.format(pos_label=pos_label, labels=textcat_labels) - ) - - -def create_before_to_disk_callback( - callback: Optional[Callable[[Language], Language]] -) -> Callable[[Language], Language]: - def before_to_disk(nlp: Language) -> Language: - if not callback: - return nlp - modified_nlp = callback(nlp) - if not isinstance(modified_nlp, Language): - err = Errors.E914.format(name="before_to_disk", value=type(modified_nlp)) - raise ValueError(err) - return modified_nlp - - return before_to_disk diff --git a/spacy/cli/pretrain.py b/spacy/cli/pretrain.py index 29e220b95..6494486a9 100644 --- a/spacy/cli/pretrain.py +++ b/spacy/cli/pretrain.py @@ -1,25 +1,13 @@ from typing import Optional -import numpy -import time -import re -from collections import Counter from pathlib import Path -from thinc.api import require_gpu, set_gpu_allocator -from thinc.api import set_dropout_rate, to_categorical, fix_random_seed -from thinc.api import Config, CosineDistance, L2Distance from wasabi import msg -import srsly -from functools import partial import typer +import re from ._util import app, Arg, Opt, parse_config_overrides, show_validation_error -from ._util import import_code -from ..ml.models.multi_task import build_cloze_multi_task_model -from ..ml.models.multi_task import build_cloze_characters_multi_task_model -from ..tokens import Doc -from ..attrs import ID -from .. 
import util -from ..util import dot_to_object +from ._util import import_code, setup_gpu, CliLogger +from ..training.pretrain import pretrain +from ..util import load_config @app.command( @@ -61,15 +49,11 @@ def pretrain_cli( config_overrides = parse_config_overrides(ctx.args) import_code(code_path) verify_cli_args(config_path, output_dir, resume_path, epoch_resume) - if use_gpu >= 0: - msg.info("Using GPU") - require_gpu(use_gpu) - else: - msg.info("Using CPU") + setup_gpu(use_gpu) msg.info(f"Loading config from: {config_path}") with show_validation_error(config_path): - raw_config = util.load_config( + raw_config = load_config( config_path, overrides=config_overrides, interpolate=False ) config = raw_config.interpolate() @@ -89,250 +73,11 @@ def pretrain_cli( resume_path=resume_path, epoch_resume=epoch_resume, use_gpu=use_gpu, + logger=CliLogger, ) - - -def pretrain( - config: Config, - output_dir: Path, - resume_path: Optional[Path] = None, - epoch_resume: Optional[int] = None, - use_gpu: int = -1, -): - if config["training"]["seed"] is not None: - fix_random_seed(config["training"]["seed"]) - allocator = config["training"]["gpu_allocator"] - if use_gpu >= 0 and allocator: - set_gpu_allocator(allocator) - nlp = util.load_model_from_config(config) - C = util.resolve_training_config(nlp.config) - P_cfg = C["pretraining"] - corpus = dot_to_object(C, P_cfg["corpus"]) - batcher = P_cfg["batcher"] - model = create_pretraining_model(nlp, C["pretraining"]) - optimizer = C["pretraining"]["optimizer"] - # Load in pretrained weights to resume from - if resume_path is not None: - _resume_model(model, resume_path, epoch_resume) - else: - # Without '--resume-path' the '--epoch-resume' argument is ignored - epoch_resume = 0 - - tracker = ProgressTracker(frequency=10000) - msg.divider(f"Pre-training tok2vec layer - starting at epoch {epoch_resume}") - row_settings = {"widths": (3, 10, 10, 6, 4), "aligns": ("r", "r", "r", "r", "r")} - msg.row(("#", "# Words", "Total Loss", "Loss", "w/s"), **row_settings) - - def _save_model(epoch, is_temp=False): - is_temp_str = ".temp" if is_temp else "" - with model.use_params(optimizer.averages): - with (output_dir / f"model{epoch}{is_temp_str}.bin").open("wb") as file_: - file_.write(model.get_ref("tok2vec").to_bytes()) - log = { - "nr_word": tracker.nr_word, - "loss": tracker.loss, - "epoch_loss": tracker.epoch_loss, - "epoch": epoch, - } - with (output_dir / "log.jsonl").open("a") as file_: - file_.write(srsly.json_dumps(log) + "\n") - - objective = create_objective(P_cfg["objective"]) - # TODO: I think we probably want this to look more like the - # 'create_train_batches' function? 
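Note: the pretraining loop removed below moves to spacy/training/pretrain.py (see the diffstat above); its control flow is the usual epochs-over-batches shape with a temporary checkpoint every n_save_every batches and a full checkpoint per epoch. A schematic, spaCy-free sketch of just that flow; make_batches and the toy loss are stand-ins, not spaCy API:

def pretrain_loop(make_batches, epoch_resume=0, max_epochs=2, n_save_every=2):
    for epoch in range(epoch_resume, max_epochs):
        epoch_loss = 0.0
        for batch_id, batch in enumerate(make_batches(epoch)):
            epoch_loss += sum(batch)              # stands in for make_update(...)
            if n_save_every and batch_id % n_save_every == 0:
                print(f"temp checkpoint: epoch={epoch} batch={batch_id}")
        print(f"checkpoint: epoch={epoch} loss={epoch_loss:.1f}")

pretrain_loop(lambda epoch: [[1.0, 2.0], [0.5], [3.0]])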
- for epoch in range(epoch_resume, P_cfg["max_epochs"]): - for batch_id, batch in enumerate(batcher(corpus(nlp))): - docs = ensure_docs(batch) - loss = make_update(model, docs, optimizer, objective) - progress = tracker.update(epoch, loss, docs) - if progress: - msg.row(progress, **row_settings) - if P_cfg["n_save_every"] and (batch_id % P_cfg["n_save_every"] == 0): - _save_model(epoch, is_temp=True) - _save_model(epoch) - tracker.epoch_loss = 0.0 msg.good("Successfully finished pretrain") -def ensure_docs(examples_or_docs): - docs = [] - for eg_or_doc in examples_or_docs: - if isinstance(eg_or_doc, Doc): - docs.append(eg_or_doc) - else: - docs.append(eg_or_doc.reference) - return docs - - -def _resume_model(model, resume_path, epoch_resume): - msg.info(f"Resume training tok2vec from: {resume_path}") - with resume_path.open("rb") as file_: - weights_data = file_.read() - model.get_ref("tok2vec").from_bytes(weights_data) - # Parse the epoch number from the given weight file - model_name = re.search(r"model\d+\.bin", str(resume_path)) - if model_name: - # Default weight file name so read epoch_start from it by cutting off 'model' and '.bin' - epoch_resume = int(model_name.group(0)[5:][:-4]) + 1 - msg.info(f"Resuming from epoch: {epoch_resume}") - else: - msg.info(f"Resuming from epoch: {epoch_resume}") - - -def make_update(model, docs, optimizer, objective_func): - """Perform an update over a single batch of documents. - - docs (iterable): A batch of `Doc` objects. - optimizer (callable): An optimizer. - RETURNS loss: A float for the loss. - """ - predictions, backprop = model.begin_update(docs) - loss, gradients = objective_func(model.ops, docs, predictions) - backprop(gradients) - model.finish_update(optimizer) - # Don't want to return a cupy object here - # The gradients are modified in-place by the BERT MLM, - # so we get an accurate loss - return float(loss) - - -def create_objective(config): - """Create the objective for pretraining. - - We'd like to replace this with a registry function but it's tricky because - we're also making a model choice based on this. For now we hard-code support - for two types (characters, vectors). For characters you can specify - n_characters, for vectors you can specify the loss. - - Bleh. - """ - objective_type = config["type"] - if objective_type == "characters": - return partial(get_characters_loss, nr_char=config["n_characters"]) - elif objective_type == "vectors": - if config["loss"] == "cosine": - return partial( - get_vectors_loss, - distance=CosineDistance(normalize=True, ignore_zeros=True), - ) - elif config["loss"] == "L2": - return partial( - get_vectors_loss, distance=L2Distance(normalize=True, ignore_zeros=True) - ) - else: - raise ValueError("Unexpected loss type", config["loss"]) - else: - raise ValueError("Unexpected objective_type", objective_type) - - -def get_vectors_loss(ops, docs, prediction, distance): - """Compute a loss based on a distance between the documents' vectors and - the prediction. - """ - # The simplest way to implement this would be to vstack the - # token.vector values, but that's a bit inefficient, especially on GPU. - # Instead we fetch the index into the vectors table for each of our tokens, - # and look them up all at once. This prevents data copying. 
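Note: the comment above describes the trick behind get_vectors_loss: instead of stacking per-token vectors, collect each token's row index and index the embedding table once. A small numpy illustration of that pattern with an L2-style gradient; the shapes and values are toys, not spaCy's Vectors data:

import numpy as np

vectors_table = np.random.rand(100, 8).astype("f")    # (n_rows, width) embedding table
token_row_ids = np.array([3, 17, 42, 3], dtype="i")   # one row id per token, duplicates fine
prediction = np.random.rand(4, 8).astype("f")         # model output, one row per token

target = vectors_table[token_row_ids]                  # single fancy-index lookup, no vstack
d_target = prediction - target                         # gradient for a squared-L2 objective
loss = float((d_target ** 2).sum())
print(loss, d_target.shape)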
- ids = ops.flatten([doc.to_array(ID).ravel() for doc in docs]) - target = docs[0].vocab.vectors.data[ids] - d_target, loss = distance(prediction, target) - return loss, d_target - - -def get_characters_loss(ops, docs, prediction, nr_char): - """Compute a loss based on a number of characters predicted from the docs.""" - target_ids = numpy.vstack([doc.to_utf8_array(nr_char=nr_char) for doc in docs]) - target_ids = target_ids.reshape((-1,)) - target = ops.asarray(to_categorical(target_ids, n_classes=256), dtype="f") - target = target.reshape((-1, 256 * nr_char)) - diff = prediction - target - loss = (diff ** 2).sum() - d_target = diff / float(prediction.shape[0]) - return loss, d_target - - -def create_pretraining_model(nlp, pretrain_config): - """Define a network for the pretraining. We simply add an output layer onto - the tok2vec input model. The tok2vec input model needs to be a model that - takes a batch of Doc objects (as a list), and returns a list of arrays. - Each array in the output needs to have one row per token in the doc. - The actual tok2vec layer is stored as a reference, and only this bit will be - serialized to file and read back in when calling the 'train' command. - """ - component = nlp.get_pipe(pretrain_config["component"]) - if pretrain_config.get("layer"): - tok2vec = component.model.get_ref(pretrain_config["layer"]) - else: - tok2vec = component.model - - # TODO - maxout_pieces = 3 - hidden_size = 300 - if pretrain_config["objective"]["type"] == "vectors": - model = build_cloze_multi_task_model( - nlp.vocab, tok2vec, hidden_size=hidden_size, maxout_pieces=maxout_pieces - ) - elif pretrain_config["objective"]["type"] == "characters": - model = build_cloze_characters_multi_task_model( - nlp.vocab, - tok2vec, - hidden_size=hidden_size, - maxout_pieces=maxout_pieces, - nr_char=pretrain_config["objective"]["n_characters"], - ) - model.initialize(X=[nlp.make_doc("Give it a doc to infer shapes")]) - set_dropout_rate(model, pretrain_config["dropout"]) - return model - - -class ProgressTracker: - def __init__(self, frequency=1000000): - self.loss = 0.0 - self.prev_loss = 0.0 - self.nr_word = 0 - self.words_per_epoch = Counter() - self.frequency = frequency - self.last_time = time.time() - self.last_update = 0 - self.epoch_loss = 0.0 - - def update(self, epoch, loss, docs): - self.loss += loss - self.epoch_loss += loss - words_in_batch = sum(len(doc) for doc in docs) - self.words_per_epoch[epoch] += words_in_batch - self.nr_word += words_in_batch - words_since_update = self.nr_word - self.last_update - if words_since_update >= self.frequency: - wps = words_since_update / (time.time() - self.last_time) - self.last_update = self.nr_word - self.last_time = time.time() - loss_per_word = self.loss - self.prev_loss - status = ( - epoch, - self.nr_word, - _smart_round(self.loss, width=10), - _smart_round(loss_per_word, width=6), - int(wps), - ) - self.prev_loss = float(self.loss) - return status - else: - return None - - -def _smart_round(figure, width=10, max_decimal=4): - """Round large numbers as integers, smaller numbers as decimals.""" - n_digits = len(str(int(figure))) - n_decimal = width - (n_digits + 1) - if n_decimal <= 1: - return str(int(figure)) - else: - n_decimal = min(n_decimal, max_decimal) - format_str = "%." 
+ str(n_decimal) + "f" - return format_str % figure - - def verify_cli_args(config_path, output_dir, resume_path, epoch_resume): if not config_path or not config_path.exists(): msg.fail("Config file not found", config_path, exits=1) diff --git a/spacy/cli/train.py b/spacy/cli/train.py index afaf230d1..aa0e71b5a 100644 --- a/spacy/cli/train.py +++ b/spacy/cli/train.py @@ -1,24 +1,16 @@ -from typing import Optional, Dict, Any, Tuple, Union, Callable, List -from timeit import default_timer as timer -import tqdm +from typing import Optional from pathlib import Path from wasabi import msg -import thinc -import thinc.schedules -from thinc.api import Config, Optimizer, require_gpu, fix_random_seed, set_gpu_allocator -import random +from thinc.api import Config import typer import logging -from .init_pipeline import init_pipeline -from .init_pipeline import create_before_to_disk_callback from ._util import app, Arg, Opt, parse_config_overrides, show_validation_error -from ._util import import_code +from ._util import import_code, CliLogger, setup_gpu from ..language import Language +from ..training.loop import train +from ..training.initialize import init_nlp, must_reinitialize from .. import util -from ..errors import Errors -from ..util import resolve_dot_names, registry -from ..schemas import ConfigSchemaTraining @app.command( @@ -52,31 +44,33 @@ def train_cli( verify_cli_args(config_path, output_path) overrides = parse_config_overrides(ctx.args) import_code(code_path) - if use_gpu >= 0: - msg.info(f"Using GPU: {use_gpu}") - require_gpu(use_gpu) - else: - msg.info("Using CPU") - config = util.load_config(config_path, overrides=overrides, interpolate=False) + setup_gpu(use_gpu) + with show_validation_error(config_path): + config = util.load_config(config_path, overrides=overrides, interpolate=False) msg.divider("Initializing pipeline") - nlp = init_nlp(config, output_path) + nlp = init_pipeline(config, output_path, use_gpu=use_gpu) msg.divider("Training pipeline") - train(nlp, output_path, use_gpu=use_gpu) + final_path = train(nlp, output_path, use_gpu=use_gpu, logger=CliLogger) + if final_path: + msg.good(f"Saved pipeline to output directory", final_path) -def init_nlp(config: Config, output_path: Optional[Path]) -> Language: +def init_pipeline( + config: Config, output_path: Optional[Path], *, use_gpu: int = -1 +) -> Language: + init_kwargs = {"use_gpu": use_gpu, "logger": CliLogger, "on_success": msg.good} if output_path is not None: init_path = output_path / "model-initial" if not init_path.exists(): msg.info(f"Initializing the pipeline in {init_path}") - nlp = init_pipeline(config) + nlp = init_nlp(config, **init_kwargs) nlp.to_disk(init_path) msg.good(f"Saved initialized pipeline to {init_path}") else: nlp = util.load_model(init_path) if must_reinitialize(config, nlp.config): msg.warn("Config has changed: need to re-initialize pipeline") - nlp = init_pipeline(config) + nlp = init_nlp(config, **init_kwargs) nlp.to_disk(init_path) msg.good(f"Re-initialized pipeline in {init_path}") else: @@ -88,279 +82,7 @@ def init_nlp(config: Config, output_path: Optional[Path]) -> Language: "the vocabulary, vectors and label scheme. To take advantage of this, " "provide an output directory." ) - return init_pipeline(config) - - -def train( - nlp: Language, output_path: Optional[Path] = None, *, use_gpu: int = -1 -) -> None: - # Create iterator, which yields out info after each optimization step. 
- config = nlp.config.interpolate() - if config["training"]["seed"] is not None: - fix_random_seed(config["training"]["seed"]) - allocator = config["training"]["gpu_allocator"] - if use_gpu >= 0 and allocator: - set_gpu_allocator(allocator) - T = registry.resolve(config["training"], schema=ConfigSchemaTraining) - dot_names = [T["train_corpus"], T["dev_corpus"]] - train_corpus, dev_corpus = resolve_dot_names(config, dot_names) - optimizer = T["optimizer"] - score_weights = T["score_weights"] - batcher = T["batcher"] - train_logger = T["logger"] - before_to_disk = create_before_to_disk_callback(T["before_to_disk"]) - # Components that shouldn't be updated during training - frozen_components = T["frozen_components"] - # Create iterator, which yields out info after each optimization step. - training_step_iterator = train_while_improving( - nlp, - optimizer, - create_train_batches(train_corpus(nlp), batcher, T["max_epochs"]), - create_evaluation_callback(nlp, dev_corpus, score_weights), - dropout=T["dropout"], - accumulate_gradient=T["accumulate_gradient"], - patience=T["patience"], - max_steps=T["max_steps"], - eval_frequency=T["eval_frequency"], - exclude=frozen_components, - ) - msg.info(f"Pipeline: {nlp.pipe_names}") - if frozen_components: - msg.info(f"Frozen components: {frozen_components}") - msg.info(f"Initial learn rate: {optimizer.learn_rate}") - with nlp.select_pipes(disable=frozen_components): - print_row, finalize_logger = train_logger(nlp) - - try: - progress = tqdm.tqdm(total=T["eval_frequency"], leave=False) - progress.set_description(f"Epoch 1") - for batch, info, is_best_checkpoint in training_step_iterator: - progress.update(1) - if is_best_checkpoint is not None: - progress.close() - print_row(info) - if is_best_checkpoint and output_path is not None: - with nlp.select_pipes(disable=frozen_components): - update_meta(T, nlp, info) - with nlp.use_params(optimizer.averages): - nlp = before_to_disk(nlp) - nlp.to_disk(output_path / "model-best") - progress = tqdm.tqdm(total=T["eval_frequency"], leave=False) - progress.set_description(f"Epoch {info['epoch']}") - except Exception as e: - finalize_logger() - if output_path is not None: - # We don't want to swallow the traceback if we don't have a - # specific error. - msg.warn( - f"Aborting and saving the final best model. " - f"Encountered exception: {str(e)}" - ) - nlp = before_to_disk(nlp) - nlp.to_disk(output_path / "model-final") - raise e - finally: - finalize_logger() - if output_path is not None: - final_model_path = output_path / "model-final" - if optimizer.averages: - with nlp.use_params(optimizer.averages): - nlp.to_disk(final_model_path) - else: - nlp.to_disk(final_model_path) - msg.good(f"Saved pipeline to output directory {final_model_path}") - - -def must_reinitialize(train_config: Config, init_config: Config) -> bool: - # TODO: do this better and more fine-grained - return train_config.interpolate().to_str() == init_config.interpolate().to_str() - - -def add_vectors(nlp: Language, vectors: str) -> None: - title = f"Config validation error for vectors {vectors}" - desc = ( - "This typically means that there's a problem in the config.cfg included " - "with the packaged vectors. Make sure that the vectors package you're " - "loading is compatible with the current version of spaCy." 
- ) - with show_validation_error( - title=title, desc=desc, hint_fill=False, show_config=False - ): - util.load_vectors_into_model(nlp, vectors) - - -def create_train_batches(iterator, batcher, max_epochs: int): - epoch = 0 - examples = list(iterator) - if not examples: - # Raise error if no data - raise ValueError(Errors.E986) - while max_epochs < 1 or epoch != max_epochs: - random.shuffle(examples) - for batch in batcher(examples): - yield epoch, batch - epoch += 1 - - -def create_evaluation_callback( - nlp: Language, dev_corpus: Callable, weights: Dict[str, float] -) -> Callable[[], Tuple[float, Dict[str, float]]]: - weights = {key: value for key, value in weights.items() if value is not None} - - def evaluate() -> Tuple[float, Dict[str, float]]: - dev_examples = list(dev_corpus(nlp)) - scores = nlp.evaluate(dev_examples) - # Calculate a weighted sum based on score_weights for the main score. - # We can only consider scores that are ints/floats, not dicts like - # entity scores per type etc. - for key, value in scores.items(): - if key in weights and not isinstance(value, (int, float)): - raise ValueError(Errors.E915.format(name=key, score_type=type(value))) - try: - weighted_score = sum( - scores.get(s, 0.0) * weights.get(s, 0.0) for s in weights - ) - except KeyError as e: - keys = list(scores.keys()) - err = Errors.E983.format(dict="score_weights", key=str(e), keys=keys) - raise KeyError(err) from None - return weighted_score, scores - - return evaluate - - -def train_while_improving( - nlp: Language, - optimizer: Optimizer, - train_data, - evaluate, - *, - dropout: float, - eval_frequency: int, - accumulate_gradient: int, - patience: int, - max_steps: int, - exclude: List[str], -): - """Train until an evaluation stops improving. Works as a generator, - with each iteration yielding a tuple `(batch, info, is_best_checkpoint)`, - where info is a dict, and is_best_checkpoint is in [True, False, None] -- - None indicating that the iteration was not evaluated as a checkpoint. - The evaluation is conducted by calling the evaluate callback. - - Positional arguments: - nlp: The spaCy pipeline to evaluate. - optimizer: The optimizer callable. - train_data (Iterable[Batch]): A generator of batches, with the training - data. Each batch should be a Sized[Tuple[Input, Annot]]. The training - data iterable needs to take care of iterating over the epochs and - shuffling. - evaluate (Callable[[], Tuple[float, Any]]): A callback to perform evaluation. - The callback should take no arguments and return a tuple - `(main_score, other_scores)`. The main_score should be a float where - higher is better. other_scores can be any object. - - Every iteration, the function yields out a tuple with: - - * batch: A list of Example objects. - * info: A dict with various information about the last update (see below). - * is_best_checkpoint: A value in None, False, True, indicating whether this - was the best evaluation so far. You should use this to save the model - checkpoints during training. If None, evaluation was not conducted on - that iteration. False means evaluation was conducted, but a previous - evaluation was better. - - The info dict provides the following information: - - epoch (int): How many passes over the data have been completed. - step (int): How many steps have been completed. - score (float): The main score from the last evaluation. - other_scores: : The other scores from the last evaluation. - losses: The accumulated losses throughout training. 
- checkpoints: A list of previous results, where each result is a - (score, step, epoch) tuple. - """ - if isinstance(dropout, float): - dropouts = thinc.schedules.constant(dropout) - else: - dropouts = dropout - results = [] - losses = {} - words_seen = 0 - start_time = timer() - for step, (epoch, batch) in enumerate(train_data): - dropout = next(dropouts) - for subbatch in subdivide_batch(batch, accumulate_gradient): - nlp.update( - subbatch, drop=dropout, losses=losses, sgd=False, exclude=exclude - ) - # TODO: refactor this so we don't have to run it separately in here - for name, proc in nlp.pipeline: - if ( - name not in exclude - and hasattr(proc, "model") - and proc.model not in (True, False, None) - ): - proc.model.finish_update(optimizer) - optimizer.step_schedules() - if not (step % eval_frequency): - if optimizer.averages: - with nlp.use_params(optimizer.averages): - score, other_scores = evaluate() - else: - score, other_scores = evaluate() - results.append((score, step)) - is_best_checkpoint = score == max(results)[0] - else: - score, other_scores = (None, None) - is_best_checkpoint = None - words_seen += sum(len(eg) for eg in batch) - info = { - "epoch": epoch, - "step": step, - "score": score, - "other_scores": other_scores, - "losses": losses, - "checkpoints": results, - "seconds": int(timer() - start_time), - "words": words_seen, - } - yield batch, info, is_best_checkpoint - if is_best_checkpoint is not None: - losses = {} - # Stop if no improvement in `patience` updates (if specified) - best_score, best_step = max(results) - if patience and (step - best_step) >= patience: - break - # Stop if we've exhausted our max steps (if specified) - if max_steps and step >= max_steps: - break - - -def subdivide_batch(batch, accumulate_gradient): - batch = list(batch) - batch.sort(key=lambda eg: len(eg.predicted)) - sub_len = len(batch) // accumulate_gradient - start = 0 - for i in range(accumulate_gradient): - subbatch = batch[start : start + sub_len] - if subbatch: - yield subbatch - start += len(subbatch) - subbatch = batch[start:] - if subbatch: - yield subbatch - - -def update_meta( - training: Union[Dict[str, Any], Config], nlp: Language, info: Dict[str, Any] -) -> None: - nlp.meta["performance"] = {} - for metric in training["score_weights"]: - if metric is not None: - nlp.meta["performance"][metric] = info["other_scores"].get(metric, 0.0) - for pipe_name in nlp.pipe_names: - nlp.meta["performance"][f"{pipe_name}_loss"] = info["losses"][pipe_name] + return init_nlp(config, **init_kwargs) def verify_cli_args(config_path: Path, output_path: Optional[Path] = None) -> None: @@ -371,17 +93,3 @@ def verify_cli_args(config_path: Path, output_path: Optional[Path] = None) -> No if not output_path.exists(): output_path.mkdir() msg.good(f"Created output directory: {output_path}") - - -# TODO: this is currently imported by the ray extension and not used otherwise -def load_from_paths( - config: Config, -) -> Tuple[List[Dict[str, str]], Dict[str, dict], bytes]: - weights_data = None - init_tok2vec = util.ensure_path(config["training"]["init_tok2vec"]) - if init_tok2vec is not None: - if not init_tok2vec.exists(): - msg.fail("Can't find pretrained tok2vec", init_tok2vec, exits=1) - with init_tok2vec.open("rb") as file_: - weights_data = file_.read() - return None, {}, {}, weights_data diff --git a/spacy/tests/pipeline/test_textcat.py b/spacy/tests/pipeline/test_textcat.py index 232b53e1d..02e189834 100644 --- a/spacy/tests/pipeline/test_textcat.py +++ b/spacy/tests/pipeline/test_textcat.py 
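Note on the cli/train.py removals above, before the test changes: subdivide_batch implements gradient accumulation by sorting a batch shortest-first and yielding accumulate_gradient sub-batches plus any remainder. A standalone sketch with strings standing in for Example objects:

def subdivide_batch(batch, accumulate_gradient):
    # Sort shortest-first, then yield roughly equal sub-batches plus the remainder.
    batch = sorted(batch, key=len)
    sub_len = len(batch) // accumulate_gradient
    start = 0
    for _ in range(accumulate_gradient):
        subbatch = batch[start : start + sub_len]
        if subbatch:
            yield subbatch
        start += len(subbatch)
    if batch[start:]:
        yield batch[start:]

print(list(subdivide_batch(["ab", "a", "abcd", "abc", "abcde"], 2)))
# [['a', 'ab'], ['abc', 'abcd'], ['abcde']]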
@@ -9,10 +9,10 @@ from spacy.pipeline import TextCategorizer from spacy.tokens import Doc from spacy.pipeline.tok2vec import DEFAULT_TOK2VEC_MODEL from spacy.scorer import Scorer +from spacy.training import Example +from spacy.training.initialize import verify_textcat_config from ..util import make_tempdir -from ...cli.train import verify_textcat_config -from ...training import Example TRAIN_DATA = [ diff --git a/spacy/tests/test_cli.py b/spacy/tests/test_cli.py index caf4ea890..ee103208c 100644 --- a/spacy/tests/test_cli.py +++ b/spacy/tests/test_cli.py @@ -7,7 +7,6 @@ from spacy.cli.init_config import init_config, RECOMMENDATIONS from spacy.cli._util import validate_project_commands, parse_config_overrides from spacy.cli._util import load_project_config, substitute_project_variables from spacy.cli._util import string_to_list, OVERRIDES_ENV_VAR -from spacy.cli.debug_config import check_section_refs from thinc.api import ConfigValidationError, Config import srsly import os @@ -414,15 +413,3 @@ def test_string_to_list(value): def test_string_to_list_intify(value): assert string_to_list(value, intify=False) == ["1", "2", "3"] assert string_to_list(value, intify=True) == [1, 2, 3] - - -def test_check_section_refs(): - config = {"a": {"b": {"c": "a.d.e"}, "d": {"e": 1}}, "f": {"g": "d.f"}} - config = Config(config) - # Valid section reference - check_section_refs(config, ["a.b.c"]) - # Section that doesn't exist in this config - check_section_refs(config, ["x.y.z"]) - # Invalid section reference - with pytest.raises(ConfigValidationError): - check_section_refs(config, ["a.b.c", "f.g"]) diff --git a/spacy/tests/test_misc.py b/spacy/tests/test_misc.py index 4e079d29e..e6ef45f90 100644 --- a/spacy/tests/test_misc.py +++ b/spacy/tests/test_misc.py @@ -7,7 +7,6 @@ from spacy import util from spacy import prefer_gpu, require_gpu from spacy.ml._precomputable_affine import PrecomputableAffine from spacy.ml._precomputable_affine import _backprop_precomputable_affine_padding -from thinc.api import Optimizer @pytest.fixture @@ -158,16 +157,3 @@ def test_dot_to_dict(dot_notation, expected): result = util.dot_to_dict(dot_notation) assert result == expected assert util.dict_to_dot(result) == dot_notation - - -def test_resolve_training_config(): - config = { - "nlp": {"lang": "en", "disabled": []}, - "training": {"dropout": 0.1, "optimizer": {"@optimizers": "Adam.v1"}}, - "corpora": {}, - } - resolved = util.resolve_training_config(config) - assert resolved["training"]["dropout"] == 0.1 - assert isinstance(resolved["training"]["optimizer"], Optimizer) - assert resolved["corpora"] == {} - assert "nlp" not in resolved diff --git a/spacy/tests/test_util.py b/spacy/tests/test_util.py index 0647b8556..f48cfba00 100644 --- a/spacy/tests/test_util.py +++ b/spacy/tests/test_util.py @@ -1,14 +1,15 @@ import pytest -from .util import get_random_doc - from spacy import util from spacy.util import dot_to_object, SimpleFrozenList -from thinc.api import Config, Optimizer +from thinc.api import Config, Optimizer, ConfigValidationError from spacy.training.batchers import minibatch_by_words -from ..lang.en import English -from ..lang.nl import Dutch -from ..language import DEFAULT_CONFIG_PATH +from spacy.lang.en import English +from spacy.lang.nl import Dutch +from spacy.language import DEFAULT_CONFIG_PATH +from spacy.schemas import ConfigSchemaTraining + +from .util import get_random_doc @pytest.mark.parametrize( @@ -101,8 +102,8 @@ def test_util_dot_section(): dot_to_object(en_nlp.config, "nlp.pipeline.tagger") with 
pytest.raises(KeyError): dot_to_object(en_nlp.config, "nlp.unknownattribute") - resolved = util.resolve_training_config(nl_nlp.config) - assert isinstance(dot_to_object(resolved, "training.optimizer"), Optimizer) + T = util.registry.resolve(nl_nlp.config["training"], schema=ConfigSchemaTraining) + assert isinstance(dot_to_object({"training": T}, "training.optimizer"), Optimizer) def test_simple_frozen_list(): @@ -120,3 +121,17 @@ def test_simple_frozen_list(): t = SimpleFrozenList(["foo", "bar"], error="Error!") with pytest.raises(NotImplementedError): t.append("baz") + + +def test_resolve_dot_names(): + config = { + "training": {"optimizer": {"@optimizers": "Adam.v1"}}, + "foo": {"bar": "training.optimizer", "baz": "training.xyz"}, + } + result = util.resolve_dot_names(config, ["foo.bar"]) + assert isinstance(result[0], Optimizer) + with pytest.raises(ConfigValidationError) as e: + util.resolve_dot_names(config, ["foo.baz", "foo.bar"]) + errors = e.value.errors + assert len(errors) == 1 + assert errors[0]["loc"] == ["training", "xyz"] diff --git a/spacy/tests/training/test_readers.py b/spacy/tests/training/test_readers.py index c06c9d282..7d94d5ddc 100644 --- a/spacy/tests/training/test_readers.py +++ b/spacy/tests/training/test_readers.py @@ -2,8 +2,8 @@ from typing import Dict, Iterable, Callable import pytest from thinc.api import Config from spacy import Language -from spacy.util import load_model_from_config, registry, dot_to_object -from spacy.util import resolve_training_config +from spacy.util import load_model_from_config, registry, resolve_dot_names +from spacy.schemas import ConfigSchemaTraining from spacy.training import Example @@ -39,21 +39,21 @@ def test_readers(): config = Config().from_str(config_string) nlp = load_model_from_config(config, auto_fill=True) - resolved = resolve_training_config(nlp.config) - train_corpus = dot_to_object(resolved, resolved["training"]["train_corpus"]) + dot_names = ["training.train_corpus", "training.dev_corpus"] + train_corpus, dev_corpus = resolve_dot_names(nlp.config, dot_names) assert isinstance(train_corpus, Callable) - optimizer = resolved["training"]["optimizer"] + T = registry.resolve(nlp.config["training"], schema=ConfigSchemaTraining) + optimizer = T["optimizer"] # simulate a training loop nlp.begin_training(lambda: train_corpus(nlp), sgd=optimizer) for example in train_corpus(nlp): nlp.update([example], sgd=optimizer) - dev_corpus = dot_to_object(resolved, resolved["training"]["dev_corpus"]) scores = nlp.evaluate(list(dev_corpus(nlp))) assert scores["cats_score"] # ensure the pipeline runs doc = nlp("Quick test") assert doc.cats - extra_corpus = resolved["corpora"]["extra"] + extra_corpus = registry.resolve(nlp.config["corpora"])["extra"] assert isinstance(extra_corpus, Callable) @@ -89,9 +89,10 @@ def test_cat_readers(reader, additional_config): config["corpora"]["@readers"] = reader config["corpora"].update(additional_config) nlp = load_model_from_config(config, auto_fill=True) - resolved = resolve_training_config(nlp.config) - train_corpus = dot_to_object(resolved, resolved["training"]["train_corpus"]) - optimizer = resolved["training"]["optimizer"] + dot_names = ["training.train_corpus", "training.dev_corpus"] + train_corpus, dev_corpus = resolve_dot_names(nlp.config, dot_names) + T = registry.resolve(nlp.config["training"], schema=ConfigSchemaTraining) + optimizer = T["optimizer"] # simulate a training loop nlp.begin_training(lambda: train_corpus(nlp), sgd=optimizer) for example in train_corpus(nlp): @@ -100,7 +101,6 @@ def 
test_cat_readers(reader, additional_config): assert sorted(list(set(example.y.cats.values()))) == [0.0, 1.0] nlp.update([example], sgd=optimizer) # simulate performance benchmark on dev corpus - dev_corpus = dot_to_object(resolved, resolved["training"]["dev_corpus"]) dev_examples = list(dev_corpus(nlp)) for example in dev_examples: # this shouldn't fail if each dev example has at least one positive label diff --git a/spacy/training/initialize.py b/spacy/training/initialize.py index e69de29bb..8938886fe 100644 --- a/spacy/training/initialize.py +++ b/spacy/training/initialize.py @@ -0,0 +1,205 @@ +from typing import Union, Dict, Optional, Any, List, Callable +from thinc.api import Config, fix_random_seed, set_gpu_allocator +from thinc.api import ConfigValidationError +from pathlib import Path +import srsly + +from .loop import create_before_to_disk_callback +from ..language import Language +from ..lookups import Lookups +from ..errors import Errors +from ..schemas import ConfigSchemaTraining, ConfigSchemaInit, ConfigSchemaPretrain +from ..util import registry, load_model_from_config, resolve_dot_names +from ..util import load_model, ensure_path, logger, OOV_RANK, DEFAULT_OOV_PROB + + +def init_nlp( + config: Config, + *, + use_gpu: int = -1, + logger: Callable[[Any], Any] = logger, + on_success: Callable[[str], None] = lambda x: None, +) -> Language: + raw_config = config + config = raw_config.interpolate() + if config["training"]["seed"] is not None: + fix_random_seed(config["training"]["seed"]) + allocator = config["training"]["gpu_allocator"] + if use_gpu >= 0 and allocator: + set_gpu_allocator(allocator) + # Use original config here before it's resolved to functions + sourced_components = get_sourced_components(config) + nlp = load_model_from_config(raw_config, auto_fill=True) + on_success("Set up nlp object from config") + config = nlp.config.interpolate() + # Resolve all training-relevant sections using the filled nlp config + T = registry.resolve(config["training"], schema=ConfigSchemaTraining) + dot_names = [T["train_corpus"], T["dev_corpus"]] + train_corpus, dev_corpus = resolve_dot_names(config, dot_names) + I = registry.resolve(config["initialize"], schema=ConfigSchemaInit) + V = I["vocab"] + init_vocab(nlp, data=V["data"], lookups=V["lookups"], vectors=V["vectors"]) + optimizer = T["optimizer"] + before_to_disk = create_before_to_disk_callback(T["before_to_disk"]) + # Components that shouldn't be updated during training + frozen_components = T["frozen_components"] + # Sourced components that require resume_training + resume_components = [p for p in sourced_components if p not in frozen_components] + logger.info(f"Pipeline: {nlp.pipe_names}") + if resume_components: + with nlp.select_pipes(enable=resume_components): + logger.info(f"Resuming training for: {resume_components}") + nlp.resume_training(sgd=optimizer) + with nlp.select_pipes(disable=[*frozen_components, *resume_components]): + nlp.begin_training(lambda: train_corpus(nlp), sgd=optimizer) + on_success(f"Initialized pipeline components") + # Verify the config after calling 'begin_training' to ensure labels + # are properly initialized + verify_config(nlp) + if "pretraining" in config and config["pretraining"]: + P = registry.resolve(config["pretraining"], schema=ConfigSchemaPretrain) + loaded = add_tok2vec_weights(nlp, P, I) + if loaded and P["component"]: + on_success(f"Loaded pretrained weights into component '{P['component']}'") + nlp = before_to_disk(nlp) + return nlp + + +def must_reinitialize(train_config: Config, 
init_config: Config) -> bool: + # TODO: do this better and more fine-grained + return train_config.interpolate().to_str() == init_config.interpolate().to_str() + + +def init_vocab( + nlp: Language, + *, + data: Optional[Path] = None, + lookups: Optional[Lookups] = None, + vectors: Optional[str] = None, + on_success: Callable[[str], None] = lambda x: None, +) -> Language: + if lookups: + nlp.vocab.lookups = lookups + on_success(f"Added vocab lookups: {', '.join(lookups.tables)}") + data_path = ensure_path(data) + if data_path is not None: + lex_attrs = srsly.read_jsonl(data_path) + for lexeme in nlp.vocab: + lexeme.rank = OOV_RANK + for attrs in lex_attrs: + if "settings" in attrs: + continue + lexeme = nlp.vocab[attrs["orth"]] + lexeme.set_attrs(**attrs) + if len(nlp.vocab): + oov_prob = min(lex.prob for lex in nlp.vocab) - 1 + else: + oov_prob = DEFAULT_OOV_PROB + nlp.vocab.cfg.update({"oov_prob": oov_prob}) + on_success(f"Added {len(nlp.vocab)} lexical entries to the vocab") + on_success("Created vocabulary") + if vectors is not None: + load_vectors_into_model(nlp, vectors) + on_success(f"Added vectors: {vectors}") + + +def load_vectors_into_model( + nlp: "Language", name: Union[str, Path], *, add_strings: bool = True +) -> None: + """Load word vectors from an installed model or path into a model instance.""" + try: + vectors_nlp = load_model(name) + except ConfigValidationError as e: + title = f"Config validation error for vectors {name}" + desc = ( + "This typically means that there's a problem in the config.cfg included " + "with the packaged vectors. Make sure that the vectors package you're " + "loading is compatible with the current version of spaCy." + ) + err = ConfigValidationError.from_error(config=None, title=title, desc=desc) + raise err from None + nlp.vocab.vectors = vectors_nlp.vocab.vectors + if add_strings: + # I guess we should add the strings from the vectors_nlp model? + # E.g. if someone does a similarity query, they might expect the strings. + for key in nlp.vocab.vectors.key2row: + if key in vectors_nlp.vocab.strings: + nlp.vocab.strings.add(vectors_nlp.vocab.strings[key]) + + +def add_tok2vec_weights( + nlp: Language, pretrain_config: Dict[str, Any], vocab_config: Dict[str, Any] +) -> bool: + # Load pretrained tok2vec weights - cf. CLI command 'pretrain' + P = pretrain_config + V = vocab_config + weights_data = None + init_tok2vec = ensure_path(V["init_tok2vec"]) + if init_tok2vec is not None: + if P["objective"].get("type") == "vectors" and not V["vectors"]: + err = 'need initialize.vectors if pretraining.objective.type is "vectors"' + errors = [{"loc": ["initialize", "vectors"], "msg": err}] + raise ConfigValidationError(config=nlp.config, errors=errors) + if not init_tok2vec.exists(): + err = f"can't find pretrained tok2vec: {init_tok2vec}" + errors = [{"loc": ["initialize", "vectors", "init_tok2vec"], "msg": err}] + raise ConfigValidationError(config=nlp.config, errors=errors) + with init_tok2vec.open("rb") as file_: + weights_data = file_.read() + if weights_data is not None: + tok2vec_component = P["component"] + if tok2vec_component is None: + desc = ( + f"To use pretrained tok2vec weights, [pretraining.component] " + f"needs to specify the component that should load them." 
+ ) + err = "component can't be null" + errors = [{"loc": ["pretraining", "component"], "msg": err}] + raise ConfigValidationError( + config=nlp.config["pretraining"], errors=errors, desc=desc + ) + layer = nlp.get_pipe(tok2vec_component).model + if P["layer"]: + layer = layer.get_ref(P["layer"]) + layer.from_bytes(weights_data) + return True + return False + + +def verify_config(nlp: Language) -> None: + """Perform additional checks based on the config, loaded nlp object and training data.""" + # TODO: maybe we should validate based on the actual components, the list + # in config["nlp"]["pipeline"] instead? + for pipe_config in nlp.config["components"].values(): + # We can't assume that the component name == the factory + factory = pipe_config["factory"] + if factory == "textcat": + verify_textcat_config(nlp, pipe_config) + + +def verify_textcat_config(nlp: Language, pipe_config: Dict[str, Any]) -> None: + # if 'positive_label' is provided: double check whether it's in the data and + # the task is binary + if pipe_config.get("positive_label"): + textcat_labels = nlp.get_pipe("textcat").labels + pos_label = pipe_config.get("positive_label") + if pos_label not in textcat_labels: + raise ValueError( + Errors.E920.format(pos_label=pos_label, labels=textcat_labels) + ) + if len(list(textcat_labels)) != 2: + raise ValueError( + Errors.E919.format(pos_label=pos_label, labels=textcat_labels) + ) + + +def get_sourced_components(config: Union[Dict[str, Any], Config]) -> List[str]: + """RETURNS (List[str]): All sourced components in the original config, + e.g. {"source": "en_core_web_sm"}. If the config contains a key + "factory", we assume it refers to a component factory. + """ + return [ + name + for name, cfg in config.get("components", {}).items() + if "factory" not in cfg and "source" in cfg + ] diff --git a/spacy/training/loop.py b/spacy/training/loop.py new file mode 100644 index 000000000..3e3e9f5ce --- /dev/null +++ b/spacy/training/loop.py @@ -0,0 +1,301 @@ +from typing import List, Callable, Tuple, Dict, Iterable, Iterator, Union, Any +from typing import Optional +from pathlib import Path +from timeit import default_timer as timer +from thinc.api import Optimizer, Config, constant, fix_random_seed, set_gpu_allocator +import random +import tqdm + +from .example import Example +from ..schemas import ConfigSchemaTraining +from ..language import Language +from ..errors import Errors +from ..util import resolve_dot_names, registry, logger + + +def train( + nlp: Language, + output_path: Optional[Path] = None, + *, + use_gpu: int = -1, + logger: Callable[[Any], Any] = logger, +) -> Optional[Path]: + """Train a pipeline. + + nlp (Language): The initialized nlp object with the full config. + output_path (Path): Optional output path to save trained model to. + use_gpu (int): Whether to train on GPU. Make sure to call require_gpu + before calling this function. + logger (Callable[[Any], Any]): Optional logger exposing the methods info, + error, debug and warn. Defaults to regular spaCy logger but can be + swapped for CLI logger. + RETURNS (Path / None): The path to the final exported model. + """ + + # Create iterator, which yields out info after each optimization step. 
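Note: the loop below consumes train_while_improving, a generator yielding (batch, info, is_best_checkpoint) triples where is_best_checkpoint is None on steps without an evaluation. A toy, spaCy-free sketch of that producer/consumer protocol; the random score stands in for the real evaluate() callback:

import random

def training_steps(max_steps=10, eval_frequency=3):
    results = []
    for step in range(max_steps):
        if step % eval_frequency == 0:
            score = random.random()               # stands in for evaluate()
            results.append((score, step))
            is_best = score == max(results)[0]
        else:
            score, is_best = None, None
        yield [], {"step": step, "score": score}, is_best

random.seed(0)
for batch, info, is_best_checkpoint in training_steps():
    if is_best_checkpoint:
        print(f"step {info['step']}: new best {info['score']:.3f} -> save model-best")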
+ config = nlp.config.interpolate() + if config["training"]["seed"] is not None: + fix_random_seed(config["training"]["seed"]) + allocator = config["training"]["gpu_allocator"] + if use_gpu >= 0 and allocator: + set_gpu_allocator(allocator) + T = registry.resolve(config["training"], schema=ConfigSchemaTraining) + dot_names = [T["train_corpus"], T["dev_corpus"]] + train_corpus, dev_corpus = resolve_dot_names(config, dot_names) + optimizer = T["optimizer"] + score_weights = T["score_weights"] + batcher = T["batcher"] + train_logger = T["logger"] + before_to_disk = create_before_to_disk_callback(T["before_to_disk"]) + # Components that shouldn't be updated during training + frozen_components = T["frozen_components"] + # Create iterator, which yields out info after each optimization step. + training_step_iterator = train_while_improving( + nlp, + optimizer, + create_train_batches(train_corpus(nlp), batcher, T["max_epochs"]), + create_evaluation_callback(nlp, dev_corpus, score_weights), + dropout=T["dropout"], + accumulate_gradient=T["accumulate_gradient"], + patience=T["patience"], + max_steps=T["max_steps"], + eval_frequency=T["eval_frequency"], + exclude=frozen_components, + ) + logger.info(f"Pipeline: {nlp.pipe_names}") + if frozen_components: + logger.info(f"Frozen components: {frozen_components}") + logger.info(f"Initial learn rate: {optimizer.learn_rate}") + with nlp.select_pipes(disable=frozen_components): + print_row, finalize_logger = train_logger(nlp) + try: + progress = tqdm.tqdm(total=T["eval_frequency"], leave=False) + progress.set_description(f"Epoch 1") + for batch, info, is_best_checkpoint in training_step_iterator: + progress.update(1) + if is_best_checkpoint is not None: + progress.close() + print_row(info) + if is_best_checkpoint and output_path is not None: + with nlp.select_pipes(disable=frozen_components): + update_meta(T, nlp, info) + with nlp.use_params(optimizer.averages): + nlp = before_to_disk(nlp) + nlp.to_disk(output_path / "model-best") + progress = tqdm.tqdm(total=T["eval_frequency"], leave=False) + progress.set_description(f"Epoch {info['epoch']}") + except Exception as e: + finalize_logger() + if output_path is not None: + # We don't want to swallow the traceback if we don't have a + # specific error. + logger.warn( + f"Aborting and saving the final best model. " + f"Encountered exception: {str(e)}" + ) + nlp = before_to_disk(nlp) + nlp.to_disk(output_path / "model-final") + raise e + finally: + finalize_logger() + if output_path is not None: + final_model_path = output_path / "model-final" + if optimizer.averages: + with nlp.use_params(optimizer.averages): + nlp.to_disk(final_model_path) + else: + nlp.to_disk(final_model_path) + return final_model_path + + +def train_while_improving( + nlp: Language, + optimizer: Optimizer, + train_data, + evaluate, + *, + dropout: float, + eval_frequency: int, + accumulate_gradient: int, + patience: int, + max_steps: int, + exclude: List[str], +): + """Train until an evaluation stops improving. Works as a generator, + with each iteration yielding a tuple `(batch, info, is_best_checkpoint)`, + where info is a dict, and is_best_checkpoint is in [True, False, None] -- + None indicating that the iteration was not evaluated as a checkpoint. + The evaluation is conducted by calling the evaluate callback. + + Positional arguments: + nlp: The spaCy pipeline to evaluate. + optimizer: The optimizer callable. + train_data (Iterable[Batch]): A generator of batches, with the training + data. 
Each batch should be a Sized[Tuple[Input, Annot]]. The training + data iterable needs to take care of iterating over the epochs and + shuffling. + evaluate (Callable[[], Tuple[float, Any]]): A callback to perform evaluation. + The callback should take no arguments and return a tuple + `(main_score, other_scores)`. The main_score should be a float where + higher is better. other_scores can be any object. + + Every iteration, the function yields out a tuple with: + + * batch: A list of Example objects. + * info: A dict with various information about the last update (see below). + * is_best_checkpoint: A value in None, False, True, indicating whether this + was the best evaluation so far. You should use this to save the model + checkpoints during training. If None, evaluation was not conducted on + that iteration. False means evaluation was conducted, but a previous + evaluation was better. + + The info dict provides the following information: + + epoch (int): How many passes over the data have been completed. + step (int): How many steps have been completed. + score (float): The main score from the last evaluation. + other_scores: : The other scores from the last evaluation. + losses: The accumulated losses throughout training. + checkpoints: A list of previous results, where each result is a + (score, step, epoch) tuple. + """ + if isinstance(dropout, float): + dropouts = constant(dropout) + else: + dropouts = dropout + results = [] + losses = {} + words_seen = 0 + start_time = timer() + for step, (epoch, batch) in enumerate(train_data): + dropout = next(dropouts) + for subbatch in subdivide_batch(batch, accumulate_gradient): + nlp.update( + subbatch, drop=dropout, losses=losses, sgd=False, exclude=exclude + ) + # TODO: refactor this so we don't have to run it separately in here + for name, proc in nlp.pipeline: + if ( + name not in exclude + and hasattr(proc, "model") + and proc.model not in (True, False, None) + ): + proc.model.finish_update(optimizer) + optimizer.step_schedules() + if not (step % eval_frequency): + if optimizer.averages: + with nlp.use_params(optimizer.averages): + score, other_scores = evaluate() + else: + score, other_scores = evaluate() + results.append((score, step)) + is_best_checkpoint = score == max(results)[0] + else: + score, other_scores = (None, None) + is_best_checkpoint = None + words_seen += sum(len(eg) for eg in batch) + info = { + "epoch": epoch, + "step": step, + "score": score, + "other_scores": other_scores, + "losses": losses, + "checkpoints": results, + "seconds": int(timer() - start_time), + "words": words_seen, + } + yield batch, info, is_best_checkpoint + if is_best_checkpoint is not None: + losses = {} + # Stop if no improvement in `patience` updates (if specified) + best_score, best_step = max(results) + if patience and (step - best_step) >= patience: + break + # Stop if we've exhausted our max steps (if specified) + if max_steps and step >= max_steps: + break + + +def subdivide_batch(batch, accumulate_gradient): + batch = list(batch) + batch.sort(key=lambda eg: len(eg.predicted)) + sub_len = len(batch) // accumulate_gradient + start = 0 + for i in range(accumulate_gradient): + subbatch = batch[start : start + sub_len] + if subbatch: + yield subbatch + start += len(subbatch) + subbatch = batch[start:] + if subbatch: + yield subbatch + + +def create_evaluation_callback( + nlp: Language, dev_corpus: Callable, weights: Dict[str, float] +) -> Callable[[], Tuple[float, Dict[str, float]]]: + weights = {key: value for key, value in weights.items() if 
value is not None} + + def evaluate() -> Tuple[float, Dict[str, float]]: + dev_examples = list(dev_corpus(nlp)) + scores = nlp.evaluate(dev_examples) + # Calculate a weighted sum based on score_weights for the main score. + # We can only consider scores that are ints/floats, not dicts like + # entity scores per type etc. + for key, value in scores.items(): + if key in weights and not isinstance(value, (int, float)): + raise ValueError(Errors.E915.format(name=key, score_type=type(value))) + try: + weighted_score = sum( + scores.get(s, 0.0) * weights.get(s, 0.0) for s in weights + ) + except KeyError as e: + keys = list(scores.keys()) + err = Errors.E983.format(dict="score_weights", key=str(e), keys=keys) + raise KeyError(err) from None + return weighted_score, scores + + return evaluate + + +def create_train_batches( + iterator: Iterator[Example], + batcher: Callable[[Iterable[Example]], Iterable[Example]], + max_epochs: int, +): + epoch = 0 + examples = list(iterator) + if not examples: + # Raise error if no data + raise ValueError(Errors.E986) + while max_epochs < 1 or epoch != max_epochs: + random.shuffle(examples) + for batch in batcher(examples): + yield epoch, batch + epoch += 1 + + +def update_meta( + training: Union[Dict[str, Any], Config], nlp: Language, info: Dict[str, Any] +) -> None: + nlp.meta["performance"] = {} + for metric in training["score_weights"]: + if metric is not None: + nlp.meta["performance"][metric] = info["other_scores"].get(metric, 0.0) + for pipe_name in nlp.pipe_names: + nlp.meta["performance"][f"{pipe_name}_loss"] = info["losses"][pipe_name] + + +def create_before_to_disk_callback( + callback: Optional[Callable[[Language], Language]] +) -> Callable[[Language], Language]: + def before_to_disk(nlp: Language) -> Language: + if not callback: + return nlp + modified_nlp = callback(nlp) + if not isinstance(modified_nlp, Language): + err = Errors.E914.format(name="before_to_disk", value=type(modified_nlp)) + raise ValueError(err) + return modified_nlp + + return before_to_disk diff --git a/spacy/training/pretrain.py b/spacy/training/pretrain.py new file mode 100644 index 000000000..1e0f055ee --- /dev/null +++ b/spacy/training/pretrain.py @@ -0,0 +1,267 @@ +from typing import Optional, Callable, Any, Iterable, Union, List +from thinc.api import Config, fix_random_seed, set_gpu_allocator, Model, Optimizer +from thinc.api import set_dropout_rate, to_categorical, CosineDistance, L2Distance +from pathlib import Path +from functools import partial +from collections import Counter +import srsly +import numpy +import time +import re +from wasabi import msg + +from .example import Example +from ..tokens import Doc +from ..attrs import ID +from ..ml.models.multi_task import build_cloze_multi_task_model +from ..ml.models.multi_task import build_cloze_characters_multi_task_model +from ..schemas import ConfigSchemaTraining, ConfigSchemaPretrain +from ..util import registry, load_model_from_config, dot_to_object, logger + + +def pretrain( + config: Config, + output_dir: Path, + resume_path: Optional[Path] = None, + epoch_resume: Optional[int] = None, + use_gpu: int = -1, + logger: Callable[[Any], Any] = logger, +): + if config["training"]["seed"] is not None: + fix_random_seed(config["training"]["seed"]) + allocator = config["training"]["gpu_allocator"] + if use_gpu >= 0 and allocator: + set_gpu_allocator(allocator) + nlp = load_model_from_config(config) + T = registry.resolve(nlp.config["training"], schema=ConfigSchemaTraining) + P = registry.resolve(nlp.config["pretraining"], 
schema=ConfigSchemaPretrain) + corpus = dot_to_object(T, P["corpus"]) + batcher = P["batcher"] + model = create_pretraining_model(nlp, P) + optimizer = P["optimizer"] + # Load in pretrained weights to resume from + if resume_path is not None: + _resume_model(model, resume_path, epoch_resume) + else: + # Without '--resume-path' the '--epoch-resume' argument is ignored + epoch_resume = 0 + + # TODO: move this to logger function? + tracker = ProgressTracker(frequency=10000) + msg.divider(f"Pre-training tok2vec layer - starting at epoch {epoch_resume}") + row_settings = {"widths": (3, 10, 10, 6, 4), "aligns": ("r", "r", "r", "r", "r")} + msg.row(("#", "# Words", "Total Loss", "Loss", "w/s"), **row_settings) + + def _save_model(epoch, is_temp=False): + is_temp_str = ".temp" if is_temp else "" + with model.use_params(optimizer.averages): + with (output_dir / f"model{epoch}{is_temp_str}.bin").open("wb") as file_: + file_.write(model.get_ref("tok2vec").to_bytes()) + log = { + "nr_word": tracker.nr_word, + "loss": tracker.loss, + "epoch_loss": tracker.epoch_loss, + "epoch": epoch, + } + with (output_dir / "log.jsonl").open("a") as file_: + file_.write(srsly.json_dumps(log) + "\n") + + objective = create_objective(P["objective"]) + # TODO: I think we probably want this to look more like the + # 'create_train_batches' function? + for epoch in range(epoch_resume, P["max_epochs"]): + for batch_id, batch in enumerate(batcher(corpus(nlp))): + docs = ensure_docs(batch) + loss = make_update(model, docs, optimizer, objective) + progress = tracker.update(epoch, loss, docs) + if progress: + msg.row(progress, **row_settings) + if P["n_save_every"] and (batch_id % P["n_save_every"] == 0): + _save_model(epoch, is_temp=True) + _save_model(epoch) + tracker.epoch_loss = 0.0 + + +def ensure_docs(examples_or_docs: Iterable[Union[Doc, Example]]) -> List[Doc]: + docs = [] + for eg_or_doc in examples_or_docs: + if isinstance(eg_or_doc, Doc): + docs.append(eg_or_doc) + else: + docs.append(eg_or_doc.reference) + return docs + + +def _resume_model( + model: Model, + resume_path: Path, + epoch_resume: int, + logger: Callable[[Any], Any] = logger, +) -> None: + logger.info(f"Resume training tok2vec from: {resume_path}") + with resume_path.open("rb") as file_: + weights_data = file_.read() + model.get_ref("tok2vec").from_bytes(weights_data) + # Parse the epoch number from the given weight file + model_name = re.search(r"model\d+\.bin", str(resume_path)) + if model_name: + # Default weight file name so read epoch_start from it by cutting off 'model' and '.bin' + epoch_resume = int(model_name.group(0)[5:][:-4]) + 1 + logger.info(f"Resuming from epoch: {epoch_resume}") + else: + logger.info(f"Resuming from epoch: {epoch_resume}") + + +def make_update( + model: Model, docs: Iterable[Doc], optimizer: Optimizer, objective_func: Callable +) -> float: + """Perform an update over a single batch of documents. + + docs (iterable): A batch of `Doc` objects. + optimizer (callable): An optimizer. + RETURNS loss: A float for the loss. + """ + predictions, backprop = model.begin_update(docs) + loss, gradients = objective_func(model.ops, docs, predictions) + backprop(gradients) + model.finish_update(optimizer) + # Don't want to return a cupy object here + # The gradients are modified in-place by the BERT MLM, + # so we get an accurate loss + return float(loss) + + +def create_objective(config: Config): + """Create the objective for pretraining. 
+ + We'd like to replace this with a registry function but it's tricky because + we're also making a model choice based on this. For now we hard-code support + for two types (characters, vectors). For characters you can specify + n_characters, for vectors you can specify the loss. + + Bleh. + """ + objective_type = config["type"] + if objective_type == "characters": + return partial(get_characters_loss, nr_char=config["n_characters"]) + elif objective_type == "vectors": + if config["loss"] == "cosine": + distance = CosineDistance(normalize=True, ignore_zeros=True) + return partial(get_vectors_loss, distance=distance) + elif config["loss"] == "L2": + distance = L2Distance(normalize=True, ignore_zeros=True) + return partial(get_vectors_loss, distance=distance) + else: + raise ValueError("Unexpected loss type", config["loss"]) + else: + raise ValueError("Unexpected objective_type", objective_type) + + +def get_vectors_loss(ops, docs, prediction, distance): + """Compute a loss based on a distance between the documents' vectors and + the prediction. + """ + # The simplest way to implement this would be to vstack the + # token.vector values, but that's a bit inefficient, especially on GPU. + # Instead we fetch the index into the vectors table for each of our tokens, + # and look them up all at once. This prevents data copying. + ids = ops.flatten([doc.to_array(ID).ravel() for doc in docs]) + target = docs[0].vocab.vectors.data[ids] + d_target, loss = distance(prediction, target) + return loss, d_target + + +def get_characters_loss(ops, docs, prediction, nr_char): + """Compute a loss based on a number of characters predicted from the docs.""" + target_ids = numpy.vstack([doc.to_utf8_array(nr_char=nr_char) for doc in docs]) + target_ids = target_ids.reshape((-1,)) + target = ops.asarray(to_categorical(target_ids, n_classes=256), dtype="f") + target = target.reshape((-1, 256 * nr_char)) + diff = prediction - target + loss = (diff ** 2).sum() + d_target = diff / float(prediction.shape[0]) + return loss, d_target + + +def create_pretraining_model(nlp, pretrain_config): + """Define a network for the pretraining. We simply add an output layer onto + the tok2vec input model. The tok2vec input model needs to be a model that + takes a batch of Doc objects (as a list), and returns a list of arrays. + Each array in the output needs to have one row per token in the doc. + The actual tok2vec layer is stored as a reference, and only this bit will be + serialized to file and read back in when calling the 'train' command. 
+ """ + component = nlp.get_pipe(pretrain_config["component"]) + if pretrain_config.get("layer"): + tok2vec = component.model.get_ref(pretrain_config["layer"]) + else: + tok2vec = component.model + + # TODO + maxout_pieces = 3 + hidden_size = 300 + if pretrain_config["objective"]["type"] == "vectors": + model = build_cloze_multi_task_model( + nlp.vocab, tok2vec, hidden_size=hidden_size, maxout_pieces=maxout_pieces + ) + elif pretrain_config["objective"]["type"] == "characters": + model = build_cloze_characters_multi_task_model( + nlp.vocab, + tok2vec, + hidden_size=hidden_size, + maxout_pieces=maxout_pieces, + nr_char=pretrain_config["objective"]["n_characters"], + ) + model.initialize(X=[nlp.make_doc("Give it a doc to infer shapes")]) + set_dropout_rate(model, pretrain_config["dropout"]) + return model + + +class ProgressTracker: + def __init__(self, frequency=1000000): + self.loss = 0.0 + self.prev_loss = 0.0 + self.nr_word = 0 + self.words_per_epoch = Counter() + self.frequency = frequency + self.last_time = time.time() + self.last_update = 0 + self.epoch_loss = 0.0 + + def update(self, epoch, loss, docs): + self.loss += loss + self.epoch_loss += loss + words_in_batch = sum(len(doc) for doc in docs) + self.words_per_epoch[epoch] += words_in_batch + self.nr_word += words_in_batch + words_since_update = self.nr_word - self.last_update + if words_since_update >= self.frequency: + wps = words_since_update / (time.time() - self.last_time) + self.last_update = self.nr_word + self.last_time = time.time() + loss_per_word = self.loss - self.prev_loss + status = ( + epoch, + self.nr_word, + _smart_round(self.loss, width=10), + _smart_round(loss_per_word, width=6), + int(wps), + ) + self.prev_loss = float(self.loss) + return status + else: + return None + + +def _smart_round( + figure: Union[float, int], width: int = 10, max_decimal: int = 4 +) -> str: + """Round large numbers as integers, smaller numbers as decimals.""" + n_digits = len(str(int(figure))) + n_decimal = width - (n_digits + 1) + if n_decimal <= 1: + return str(int(figure)) + else: + n_decimal = min(n_decimal, max_decimal) + format_str = "%." + str(n_decimal) + "f" + return format_str % figure diff --git a/spacy/util.py b/spacy/util.py index cab7af8fb..9d7199d7f 100644 --- a/spacy/util.py +++ b/spacy/util.py @@ -8,6 +8,7 @@ import re from pathlib import Path import thinc from thinc.api import NumpyOps, get_current_ops, Adam, Config, Optimizer +from thinc.api import ConfigValidationError import functools import itertools import numpy.random @@ -56,6 +57,7 @@ if TYPE_CHECKING: OOV_RANK = numpy.iinfo(numpy.uint64).max +DEFAULT_OOV_PROB = -20 LEXEME_NORM_LANGS = ["da", "de", "el", "en", "id", "lb", "pt", "ru", "sr", "ta", "th"] # Default order of sections in the config.cfg. Not all sections needs to exist, @@ -239,20 +241,6 @@ def get_module_path(module: ModuleType) -> Path: return Path(sys.modules[module.__module__].__file__).parent -def load_vectors_into_model( - nlp: "Language", name: Union[str, Path], *, add_strings=True -) -> None: - """Load word vectors from an installed model or path into a model instance.""" - vectors_nlp = load_model(name) - nlp.vocab.vectors = vectors_nlp.vocab.vectors - if add_strings: - # I guess we should add the strings from the vectors_nlp model? - # E.g. if someone does a similarity query, they might expect the strings. 
- for key in nlp.vocab.vectors.key2row: - if key in vectors_nlp.vocab.strings: - nlp.vocab.strings.add(vectors_nlp.vocab.strings[key]) - - def load_model( name: Union[str, Path], *, @@ -391,32 +379,9 @@ def load_model_from_config( return nlp -def resolve_training_config( - config: Config, - exclude: Iterable[str] = ("nlp", "components"), - validate: bool = True, -) -> Dict[str, Any]: - """Resolve the config sections relevant for trainig and create all objects. - Mostly used in the CLI to separate training config (not resolved by default - because not runtime-relevant – an nlp object should load fine even if it's - [training] block refers to functions that are not available etc.). - - config (Config): The config to resolve. - exclude (Iterable[str]): The config blocks to exclude. Those blocks won't - be available in the final resolved config. - validate (bool): Whether to validate the config. - RETURNS (Dict[str, Any]): The resolved config. - """ - config = config.copy() - for key in exclude: - if key in config: - config.pop(key) - return registry.resolve(config, validate=validate) - - def resolve_dot_names( config: Config, dot_names: List[Optional[str]] -) -> List[Optional[Callable]]: +) -> Tuple[Any]: """Resolve one or more "dot notation" names, e.g. corpora.train. The paths could point anywhere into the config, so we don't know which top-level section we'll be looking within. @@ -424,18 +389,42 @@ def resolve_dot_names( We resolve the whole top-level section, although we could resolve less -- we could find the lowest part of the tree. """ + # TODO: include schema? + # TODO: clean this up and avoid duplication resolved = {} output = [] + errors = [] for name in dot_names: if name is None: output.append(name) else: section = name.split(".")[0] - # We want to avoid resolving the same thing twice. + # We want to avoid resolving the same thing twice if section not in resolved: resolved[section] = registry.resolve(config[section]) - output.append(dot_to_object(resolved, name)) - return output + try: + output.append(dot_to_object(resolved, name)) + except KeyError: + msg = f"not a valid section reference: {name}" + errors.append({"loc": name.split("."), "msg": msg}) + objects = [] + for ref in output: + if not isinstance(ref, str): + msg = f"not a valid section reference: {ref} ({type(ref)})" + errors.append({"loc": ref.split("."), "msg": msg}) + continue + section = ref.split(".")[0] + # We want to avoid resolving the same thing twice + if section not in resolved: + resolved[section] = registry.resolve(config[section]) + try: + objects.append(dot_to_object(resolved, ref)) + except KeyError: + msg = f"not a valid section reference: {name}" + errors.append({"loc": ref.split("."), "msg": msg}) + if errors: + raise ConfigValidationError(config=config, errors=errors) + return tuple(objects) def load_model_from_init_py( From 02838a1d470d08ab381524bb1d857a61366759ac Mon Sep 17 00:00:00 2001 From: Ines Montani Date: Mon, 28 Sep 2020 15:27:10 +0200 Subject: [PATCH 23/66] Fix resolve_dot_names --- spacy/util.py | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) diff --git a/spacy/util.py b/spacy/util.py index 9d7199d7f..f9d9e6495 100644 --- a/spacy/util.py +++ b/spacy/util.py @@ -379,9 +379,7 @@ def load_model_from_config( return nlp -def resolve_dot_names( - config: Config, dot_names: List[Optional[str]] -) -> Tuple[Any]: +def resolve_dot_names(config: Config, dot_names: List[Optional[str]]) -> Tuple[Any]: """Resolve one or more "dot notation" names, e.g. corpora.train. 
The paths could point anywhere into the config, so we don't know which top-level section we'll be looking within. @@ -410,8 +408,7 @@ def resolve_dot_names( objects = [] for ref in output: if not isinstance(ref, str): - msg = f"not a valid section reference: {ref} ({type(ref)})" - errors.append({"loc": ref.split("."), "msg": msg}) + objects.append(ref) continue section = ref.split(".")[0] # We want to avoid resolving the same thing twice From 2e9c9e74af52dc3f8effbd862f0b999f70d7c926 Mon Sep 17 00:00:00 2001 From: Ines Montani Date: Mon, 28 Sep 2020 15:34:00 +0200 Subject: [PATCH 24/66] Fix config resolution and interpolation TODO: auto-interpolate in Thinc if config is dict (i.e. likely subsection) --- spacy/cli/debug_data.py | 4 +++- spacy/cli/debug_model.py | 4 +++- spacy/tests/training/test_readers.py | 11 ++++++++--- spacy/training/pretrain.py | 5 +++-- spacy/util.py | 7 ++++++- 5 files changed, 23 insertions(+), 8 deletions(-) diff --git a/spacy/cli/debug_data.py b/spacy/cli/debug_data.py index f0e76be2b..c4d1069c0 100644 --- a/spacy/cli/debug_data.py +++ b/spacy/cli/debug_data.py @@ -97,7 +97,9 @@ def debug_data( with show_validation_error(config_path): cfg = util.load_config(config_path, overrides=config_overrides) nlp = util.load_model_from_config(cfg) - T = registry.resolve(nlp.config["training"], schema=ConfigSchemaTraining) + T = registry.resolve( + nlp.config.interpolate()["training"], schema=ConfigSchemaTraining + ) # Use original config here, not resolved version sourced_components = get_sourced_components(cfg) frozen_components = T["frozen_components"] diff --git a/spacy/cli/debug_model.py b/spacy/cli/debug_model.py index f8fc687fa..0b4db70b6 100644 --- a/spacy/cli/debug_model.py +++ b/spacy/cli/debug_model.py @@ -63,7 +63,9 @@ def debug_model_cli( set_gpu_allocator(allocator) with show_validation_error(config_path): nlp = util.load_model_from_config(raw_config) - T = registry.resolve(nlp.config["training"], schema=ConfigSchemaTraining) + T = registry.resolve( + nlp.config.interpolate()["training"], schema=ConfigSchemaTraining + ) seed = T["seed"] if seed is not None: msg.info(f"Fixing random seed: {seed}") diff --git a/spacy/tests/training/test_readers.py b/spacy/tests/training/test_readers.py index 7d94d5ddc..5c02aca36 100644 --- a/spacy/tests/training/test_readers.py +++ b/spacy/tests/training/test_readers.py @@ -42,7 +42,9 @@ def test_readers(): dot_names = ["training.train_corpus", "training.dev_corpus"] train_corpus, dev_corpus = resolve_dot_names(nlp.config, dot_names) assert isinstance(train_corpus, Callable) - T = registry.resolve(nlp.config["training"], schema=ConfigSchemaTraining) + T = registry.resolve( + nlp.config.interpolate()["training"], schema=ConfigSchemaTraining + ) optimizer = T["optimizer"] # simulate a training loop nlp.begin_training(lambda: train_corpus(nlp), sgd=optimizer) @@ -53,7 +55,8 @@ def test_readers(): # ensure the pipeline runs doc = nlp("Quick test") assert doc.cats - extra_corpus = registry.resolve(nlp.config["corpora"])["extra"] + corpora = {"corpora": nlp.config.interpolate()["corpora"]} + extra_corpus = registry.resolve(corpora)["corpora"]["extra"] assert isinstance(extra_corpus, Callable) @@ -91,7 +94,9 @@ def test_cat_readers(reader, additional_config): nlp = load_model_from_config(config, auto_fill=True) dot_names = ["training.train_corpus", "training.dev_corpus"] train_corpus, dev_corpus = resolve_dot_names(nlp.config, dot_names) - T = registry.resolve(nlp.config["training"], schema=ConfigSchemaTraining) + T = registry.resolve( + 
nlp.config["training"].interpolate(), schema=ConfigSchemaTraining + ) optimizer = T["optimizer"] # simulate a training loop nlp.begin_training(lambda: train_corpus(nlp), sgd=optimizer) diff --git a/spacy/training/pretrain.py b/spacy/training/pretrain.py index 1e0f055ee..e8dd9df30 100644 --- a/spacy/training/pretrain.py +++ b/spacy/training/pretrain.py @@ -33,8 +33,9 @@ def pretrain( if use_gpu >= 0 and allocator: set_gpu_allocator(allocator) nlp = load_model_from_config(config) - T = registry.resolve(nlp.config["training"], schema=ConfigSchemaTraining) - P = registry.resolve(nlp.config["pretraining"], schema=ConfigSchemaPretrain) + _config = nlp.config.interpolate() + T = registry.resolve(_config["training"], schema=ConfigSchemaTraining) + P = registry.resolve(_config["pretraining"], schema=ConfigSchemaPretrain) corpus = dot_to_object(T, P["corpus"]) batcher = P["batcher"] model = create_pretraining_model(nlp, P) diff --git a/spacy/util.py b/spacy/util.py index f9d9e6495..67c577927 100644 --- a/spacy/util.py +++ b/spacy/util.py @@ -413,7 +413,12 @@ def resolve_dot_names(config: Config, dot_names: List[Optional[str]]) -> Tuple[A section = ref.split(".")[0] # We want to avoid resolving the same thing twice if section not in resolved: - resolved[section] = registry.resolve(config[section]) + if registry.is_promise(config[section]): + # Otherwise we can't resolve [corpus] if it's a promise + result = registry.resolve({"config": config[section]})["config"] + else: + result = registry.resolve(config[section]) + resolved[section] = result try: objects.append(dot_to_object(resolved, ref)) except KeyError: From a139fe672bbf465a829bb2d73558fa61351dfc7e Mon Sep 17 00:00:00 2001 From: Ines Montani Date: Mon, 28 Sep 2020 21:17:10 +0200 Subject: [PATCH 25/66] Fix typos and refactor CLI logging --- spacy/cli/_util.py | 15 ++---------- spacy/cli/init_pipeline.py | 4 ++-- spacy/cli/pretrain.py | 4 ++-- spacy/cli/train.py | 8 +++---- spacy/training/initialize.py | 45 ++++++++++++++++++------------------ spacy/training/loop.py | 19 +++++++-------- spacy/training/pretrain.py | 24 +++++++++---------- 7 files changed, 52 insertions(+), 67 deletions(-) diff --git a/spacy/cli/_util.py b/spacy/cli/_util.py index c41905970..2c944bf3a 100644 --- a/spacy/cli/_util.py +++ b/spacy/cli/_util.py @@ -448,19 +448,8 @@ def string_to_list(value: str, intify: bool = False) -> Union[List[str], List[in return result -class CliLogger: - """Helper mocking up the most commonly used logger methods. Can be passed - into functions like train() to make them output pretty-printed messages - on the CLI and regular logging if used from within Python. - """ - - debug = msg.text - info = msg.info - warn = msg.info - error = msg.fail - - -def setup_gpu(use_gpu: int): +def setup_gpu(use_gpu: int) -> None: + """Configure the GPU and log info.""" if use_gpu >= 0: msg.info(f"Using GPU: {use_gpu}") require_gpu(use_gpu) diff --git a/spacy/cli/init_pipeline.py b/spacy/cli/init_pipeline.py index de1dc8a46..a92705cb0 100644 --- a/spacy/cli/init_pipeline.py +++ b/spacy/cli/init_pipeline.py @@ -7,7 +7,7 @@ import typer from .. 
import util from ..training.initialize import init_nlp from ._util import init_cli, Arg, Opt, parse_config_overrides, show_validation_error -from ._util import import_code, CliLogger, setup_gpu +from ._util import import_code, setup_gpu @init_cli.command( @@ -32,6 +32,6 @@ def init_pipeline_cli( with show_validation_error(config_path): config = util.load_config(config_path, overrides=overrides) with show_validation_error(hint_fill=False): - nlp = init_nlp(config, use_gpu=use_gpu, logger=CliLogger, on_succcess=msg.good) + nlp = init_nlp(config, use_gpu=use_gpu, silent=False) nlp.to_disk(output_path) msg.good(f"Saved initialized pipeline to {output_path}") diff --git a/spacy/cli/pretrain.py b/spacy/cli/pretrain.py index 6494486a9..de9341449 100644 --- a/spacy/cli/pretrain.py +++ b/spacy/cli/pretrain.py @@ -5,7 +5,7 @@ import typer import re from ._util import app, Arg, Opt, parse_config_overrides, show_validation_error -from ._util import import_code, setup_gpu, CliLogger +from ._util import import_code, setup_gpu from ..training.pretrain import pretrain from ..util import load_config @@ -73,7 +73,7 @@ def pretrain_cli( resume_path=resume_path, epoch_resume=epoch_resume, use_gpu=use_gpu, - logger=CliLogger, + silent=False, ) msg.good("Successfully finished pretrain") diff --git a/spacy/cli/train.py b/spacy/cli/train.py index aa0e71b5a..b0bd48ddb 100644 --- a/spacy/cli/train.py +++ b/spacy/cli/train.py @@ -6,7 +6,7 @@ import typer import logging from ._util import app, Arg, Opt, parse_config_overrides, show_validation_error -from ._util import import_code, CliLogger, setup_gpu +from ._util import import_code, setup_gpu from ..language import Language from ..training.loop import train from ..training.initialize import init_nlp, must_reinitialize @@ -50,15 +50,13 @@ def train_cli( msg.divider("Initializing pipeline") nlp = init_pipeline(config, output_path, use_gpu=use_gpu) msg.divider("Training pipeline") - final_path = train(nlp, output_path, use_gpu=use_gpu, logger=CliLogger) - if final_path: - msg.good(f"Saved pipeline to output directory", final_path) + train(nlp, output_path, use_gpu=use_gpu, silent=False) def init_pipeline( config: Config, output_path: Optional[Path], *, use_gpu: int = -1 ) -> Language: - init_kwargs = {"use_gpu": use_gpu, "logger": CliLogger, "on_success": msg.good} + init_kwargs = {"use_gpu": use_gpu, "silent": False} if output_path is not None: init_path = output_path / "model-initial" if not init_path.exists(): diff --git a/spacy/training/initialize.py b/spacy/training/initialize.py index 8938886fe..ecfc57ee9 100644 --- a/spacy/training/initialize.py +++ b/spacy/training/initialize.py @@ -1,7 +1,8 @@ -from typing import Union, Dict, Optional, Any, List, Callable +from typing import Union, Dict, Optional, Any, List from thinc.api import Config, fix_random_seed, set_gpu_allocator from thinc.api import ConfigValidationError from pathlib import Path +from wasabi import Printer import srsly from .loop import create_before_to_disk_callback @@ -10,16 +11,11 @@ from ..lookups import Lookups from ..errors import Errors from ..schemas import ConfigSchemaTraining, ConfigSchemaInit, ConfigSchemaPretrain from ..util import registry, load_model_from_config, resolve_dot_names -from ..util import load_model, ensure_path, logger, OOV_RANK, DEFAULT_OOV_PROB +from ..util import load_model, ensure_path, OOV_RANK, DEFAULT_OOV_PROB -def init_nlp( - config: Config, - *, - use_gpu: int = -1, - logger: Callable[[Any], Any] = logger, - on_success: Callable[[str], None] = lambda x: None, -) -> 
Language: +def init_nlp(config: Config, *, use_gpu: int = -1, silent: bool = True) -> Language: + msg = Printer(no_print=silent) raw_config = config config = raw_config.interpolate() if config["training"]["seed"] is not None: @@ -30,7 +26,7 @@ def init_nlp( # Use original config here before it's resolved to functions sourced_components = get_sourced_components(config) nlp = load_model_from_config(raw_config, auto_fill=True) - on_success("Set up nlp object from config") + msg.good("Set up nlp object from config") config = nlp.config.interpolate() # Resolve all training-relevant sections using the filled nlp config T = registry.resolve(config["training"], schema=ConfigSchemaTraining) @@ -38,29 +34,31 @@ def init_nlp( train_corpus, dev_corpus = resolve_dot_names(config, dot_names) I = registry.resolve(config["initialize"], schema=ConfigSchemaInit) V = I["vocab"] - init_vocab(nlp, data=V["data"], lookups=V["lookups"], vectors=V["vectors"]) + init_vocab( + nlp, data=V["data"], lookups=V["lookups"], vectors=V["vectors"], silent=silent + ) optimizer = T["optimizer"] before_to_disk = create_before_to_disk_callback(T["before_to_disk"]) # Components that shouldn't be updated during training frozen_components = T["frozen_components"] # Sourced components that require resume_training resume_components = [p for p in sourced_components if p not in frozen_components] - logger.info(f"Pipeline: {nlp.pipe_names}") + msg.info(f"Pipeline: {nlp.pipe_names}") if resume_components: with nlp.select_pipes(enable=resume_components): - logger.info(f"Resuming training for: {resume_components}") + msg.info(f"Resuming training for: {resume_components}") nlp.resume_training(sgd=optimizer) with nlp.select_pipes(disable=[*frozen_components, *resume_components]): nlp.begin_training(lambda: train_corpus(nlp), sgd=optimizer) - on_success(f"Initialized pipeline components") + msg.good(f"Initialized pipeline components") # Verify the config after calling 'begin_training' to ensure labels # are properly initialized verify_config(nlp) if "pretraining" in config and config["pretraining"]: P = registry.resolve(config["pretraining"], schema=ConfigSchemaPretrain) - loaded = add_tok2vec_weights(nlp, P, I) + loaded = add_tok2vec_weights(nlp, P, V) if loaded and P["component"]: - on_success(f"Loaded pretrained weights into component '{P['component']}'") + msg.good(f"Loaded pretrained weights into component '{P['component']}'") nlp = before_to_disk(nlp) return nlp @@ -76,11 +74,12 @@ def init_vocab( data: Optional[Path] = None, lookups: Optional[Lookups] = None, vectors: Optional[str] = None, - on_success: Callable[[str], None] = lambda x: None, + silent: bool = True, ) -> Language: + msg = Printer(no_print=silent) if lookups: nlp.vocab.lookups = lookups - on_success(f"Added vocab lookups: {', '.join(lookups.tables)}") + msg.good(f"Added vocab lookups: {', '.join(lookups.tables)}") data_path = ensure_path(data) if data_path is not None: lex_attrs = srsly.read_jsonl(data_path) @@ -96,11 +95,11 @@ def init_vocab( else: oov_prob = DEFAULT_OOV_PROB nlp.vocab.cfg.update({"oov_prob": oov_prob}) - on_success(f"Added {len(nlp.vocab)} lexical entries to the vocab") - on_success("Created vocabulary") + msg.good(f"Added {len(nlp.vocab)} lexical entries to the vocab") + msg.good("Created vocabulary") if vectors is not None: load_vectors_into_model(nlp, vectors) - on_success(f"Added vectors: {vectors}") + msg.good(f"Added vectors: {vectors}") def load_vectors_into_model( @@ -137,8 +136,8 @@ def add_tok2vec_weights( init_tok2vec = 
ensure_path(V["init_tok2vec"]) if init_tok2vec is not None: if P["objective"].get("type") == "vectors" and not V["vectors"]: - err = 'need initialize.vectors if pretraining.objective.type is "vectors"' - errors = [{"loc": ["initialize", "vectors"], "msg": err}] + err = 'need initialize.vocab.vectors if pretraining.objective.type is "vectors"' + errors = [{"loc": ["initialize", "vocab"], "msg": err}] raise ConfigValidationError(config=nlp.config, errors=errors) if not init_tok2vec.exists(): err = f"can't find pretrained tok2vec: {init_tok2vec}" diff --git a/spacy/training/loop.py b/spacy/training/loop.py index 3e3e9f5ce..5153be66c 100644 --- a/spacy/training/loop.py +++ b/spacy/training/loop.py @@ -5,12 +5,13 @@ from timeit import default_timer as timer from thinc.api import Optimizer, Config, constant, fix_random_seed, set_gpu_allocator import random import tqdm +from wasabi import Printer from .example import Example from ..schemas import ConfigSchemaTraining from ..language import Language from ..errors import Errors -from ..util import resolve_dot_names, registry, logger +from ..util import resolve_dot_names, registry def train( @@ -18,8 +19,8 @@ def train( output_path: Optional[Path] = None, *, use_gpu: int = -1, - logger: Callable[[Any], Any] = logger, -) -> Optional[Path]: + silent: bool = False, +) -> None: """Train a pipeline. nlp (Language): The initialized nlp object with the full config. @@ -31,7 +32,7 @@ def train( swapped for CLI logger. RETURNS (Path / None): The path to the final exported model. """ - + msg = Printer(no_print=silent) # Create iterator, which yields out info after each optimization step. config = nlp.config.interpolate() if config["training"]["seed"] is not None: @@ -62,10 +63,10 @@ def train( eval_frequency=T["eval_frequency"], exclude=frozen_components, ) - logger.info(f"Pipeline: {nlp.pipe_names}") + msg.info(f"Pipeline: {nlp.pipe_names}") if frozen_components: - logger.info(f"Frozen components: {frozen_components}") - logger.info(f"Initial learn rate: {optimizer.learn_rate}") + msg.info(f"Frozen components: {frozen_components}") + msg.info(f"Initial learn rate: {optimizer.learn_rate}") with nlp.select_pipes(disable=frozen_components): print_row, finalize_logger = train_logger(nlp) try: @@ -89,7 +90,7 @@ def train( if output_path is not None: # We don't want to swallow the traceback if we don't have a # specific error. - logger.warn( + msg.warn( f"Aborting and saving the final best model. 
" f"Encountered exception: {str(e)}" ) @@ -105,7 +106,7 @@ def train( nlp.to_disk(final_model_path) else: nlp.to_disk(final_model_path) - return final_model_path + msg.good(f"Saved pipeline to output directory", final_model_path) def train_while_improving( diff --git a/spacy/training/pretrain.py b/spacy/training/pretrain.py index e8dd9df30..5e136cdf1 100644 --- a/spacy/training/pretrain.py +++ b/spacy/training/pretrain.py @@ -1,4 +1,4 @@ -from typing import Optional, Callable, Any, Iterable, Union, List +from typing import Optional, Callable, Iterable, Union, List from thinc.api import Config, fix_random_seed, set_gpu_allocator, Model, Optimizer from thinc.api import set_dropout_rate, to_categorical, CosineDistance, L2Distance from pathlib import Path @@ -8,7 +8,7 @@ import srsly import numpy import time import re -from wasabi import msg +from wasabi import Printer from .example import Example from ..tokens import Doc @@ -16,7 +16,7 @@ from ..attrs import ID from ..ml.models.multi_task import build_cloze_multi_task_model from ..ml.models.multi_task import build_cloze_characters_multi_task_model from ..schemas import ConfigSchemaTraining, ConfigSchemaPretrain -from ..util import registry, load_model_from_config, dot_to_object, logger +from ..util import registry, load_model_from_config, dot_to_object def pretrain( @@ -25,8 +25,9 @@ def pretrain( resume_path: Optional[Path] = None, epoch_resume: Optional[int] = None, use_gpu: int = -1, - logger: Callable[[Any], Any] = logger, + silent: bool = True, ): + msg = Printer(no_print=silent) if config["training"]["seed"] is not None: fix_random_seed(config["training"]["seed"]) allocator = config["training"]["gpu_allocator"] @@ -42,11 +43,10 @@ def pretrain( optimizer = P["optimizer"] # Load in pretrained weights to resume from if resume_path is not None: - _resume_model(model, resume_path, epoch_resume) + _resume_model(model, resume_path, epoch_resume, silent=silent) else: # Without '--resume-path' the '--epoch-resume' argument is ignored epoch_resume = 0 - # TODO: move this to logger function? 
tracker = ProgressTracker(frequency=10000) msg.divider(f"Pre-training tok2vec layer - starting at epoch {epoch_resume}") @@ -94,12 +94,10 @@ def ensure_docs(examples_or_docs: Iterable[Union[Doc, Example]]) -> List[Doc]: def _resume_model( - model: Model, - resume_path: Path, - epoch_resume: int, - logger: Callable[[Any], Any] = logger, + model: Model, resume_path: Path, epoch_resume: int, silent: bool = True, ) -> None: - logger.info(f"Resume training tok2vec from: {resume_path}") + msg = Printer(no_print=silent) + msg.info(f"Resume training tok2vec from: {resume_path}") with resume_path.open("rb") as file_: weights_data = file_.read() model.get_ref("tok2vec").from_bytes(weights_data) @@ -108,9 +106,9 @@ def _resume_model( if model_name: # Default weight file name so read epoch_start from it by cutting off 'model' and '.bin' epoch_resume = int(model_name.group(0)[5:][:-4]) + 1 - logger.info(f"Resuming from epoch: {epoch_resume}") + msg.info(f"Resuming from epoch: {epoch_resume}") else: - logger.info(f"Resuming from epoch: {epoch_resume}") + msg.info(f"Resuming from epoch: {epoch_resume}") def make_update( From 046f655d860601b54265a24af04a7b3352209772 Mon Sep 17 00:00:00 2001 From: Ines Montani Date: Mon, 28 Sep 2020 21:17:45 +0200 Subject: [PATCH 26/66] Fix error --- spacy/training/initialize.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/spacy/training/initialize.py b/spacy/training/initialize.py index ecfc57ee9..24b00a764 100644 --- a/spacy/training/initialize.py +++ b/spacy/training/initialize.py @@ -141,7 +141,7 @@ def add_tok2vec_weights( raise ConfigValidationError(config=nlp.config, errors=errors) if not init_tok2vec.exists(): err = f"can't find pretrained tok2vec: {init_tok2vec}" - errors = [{"loc": ["initialize", "vectors", "init_tok2vec"], "msg": err}] + errors = [{"loc": ["initialize", "vocab", "init_tok2vec"], "msg": err}] raise ConfigValidationError(config=nlp.config, errors=errors) with init_tok2vec.open("rb") as file_: weights_data = file_.read() From ff9a63bfbd70b0fe140f352da22833c0109eaa2c Mon Sep 17 00:00:00 2001 From: Ines Montani Date: Mon, 28 Sep 2020 21:35:09 +0200 Subject: [PATCH 27/66] begin_training -> initialize --- spacy/cli/debug_model.py | 4 +- spacy/errors.py | 7 ++- spacy/language.py | 20 +++++-- spacy/pipeline/dep_parser.pyx | 2 +- spacy/pipeline/entity_linker.py | 4 +- spacy/pipeline/morphologizer.pyx | 4 +- spacy/pipeline/multitask.pyx | 6 +- spacy/pipeline/ner.pyx | 2 +- spacy/pipeline/pipe.pyx | 4 +- spacy/pipeline/sentencizer.pyx | 2 +- spacy/pipeline/senter.pyx | 4 +- spacy/pipeline/tagger.pyx | 4 +- spacy/pipeline/textcat.py | 4 +- spacy/pipeline/tok2vec.py | 4 +- spacy/pipeline/transition_parser.pyx | 2 +- spacy/tests/doc/test_add_entities.py | 4 +- spacy/tests/parser/test_add_label.py | 4 +- spacy/tests/parser/test_ner.py | 20 +++---- spacy/tests/parser/test_parse.py | 2 +- spacy/tests/parser/test_preset_sbd.py | 2 +- spacy/tests/pipeline/test_entity_linker.py | 8 +-- spacy/tests/pipeline/test_morphologizer.py | 18 +++--- spacy/tests/pipeline/test_senter.py | 12 ++-- spacy/tests/pipeline/test_tagger.py | 30 +++++----- spacy/tests/pipeline/test_textcat.py | 24 ++++---- spacy/tests/pipeline/test_tok2vec.py | 4 +- spacy/tests/regression/test_issue1-1000.py | 2 +- spacy/tests/regression/test_issue1501-2000.py | 2 +- spacy/tests/regression/test_issue2001-2500.py | 2 +- spacy/tests/regression/test_issue2501-3000.py | 4 +- spacy/tests/regression/test_issue3001-3500.py | 4 +- spacy/tests/regression/test_issue3501-4000.py | 8 +-- 
spacy/tests/regression/test_issue4001-4500.py | 12 ++-- spacy/tests/regression/test_issue4501-5000.py | 2 +- spacy/tests/regression/test_issue5230.py | 4 +- spacy/tests/regression/test_issue5551.py | 2 +- .../tests/serialize/test_serialize_config.py | 6 +- spacy/tests/test_language.py | 2 +- spacy/tests/training/test_readers.py | 4 +- spacy/tests/training/test_training.py | 2 +- spacy/training/initialize.py | 4 +- website/docs/api/architectures.md | 60 +++++++++---------- website/docs/api/dependencyparser.md | 21 ++++--- website/docs/api/entitylinker.md | 12 +++- website/docs/api/entityrecognizer.md | 21 ++++--- website/docs/api/language.md | 23 +++---- website/docs/api/morphologizer.md | 17 +++--- website/docs/api/pipe.md | 24 +++++--- website/docs/api/sentencerecognizer.md | 6 +- website/docs/api/tagger.md | 22 ++++--- website/docs/api/textcategorizer.md | 26 ++++---- website/docs/api/tok2vec.md | 6 +- website/docs/api/transformer.md | 6 +- website/docs/usage/layers-architectures.md | 6 +- website/docs/usage/processing-pipelines.md | 12 ++-- website/docs/usage/training.md | 6 +- website/docs/usage/v3.md | 25 ++++---- 57 files changed, 301 insertions(+), 253 deletions(-) diff --git a/spacy/cli/debug_model.py b/spacy/cli/debug_model.py index 0b4db70b6..eca85dc04 100644 --- a/spacy/cli/debug_model.py +++ b/spacy/cli/debug_model.py @@ -103,12 +103,12 @@ def debug_model( with data_validation(False): try: train_corpus = dot_to_object(config, config["training"]["train_corpus"]) - nlp.begin_training(lambda: train_corpus(nlp)) + nlp.initialize(lambda: train_corpus(nlp)) msg.info("Initialized the model with the training corpus.") except ValueError: try: _set_output_dim(nO=7, model=model) - nlp.begin_training(lambda: [Example.from_dict(x, {}) for x in X]) + nlp.initialize(lambda: [Example.from_dict(x, {}) for x in X]) msg.info("Initialized the model with dummy data.") except Exception: msg.fail( diff --git a/spacy/errors.py b/spacy/errors.py index 640419182..1f9bcb0ae 100644 --- a/spacy/errors.py +++ b/spacy/errors.py @@ -85,6 +85,7 @@ class Warnings: "attribute or operator.") # TODO: fix numbering after merging develop into master + W089 = ("The nlp.begin_training method has been renamed to nlp.initialize.") W090 = ("Could not locate any {format} files in path '{path}'.") W091 = ("Could not clean/remove the temp directory at {dir}: {msg}.") W092 = ("Ignoring annotations for sentence starts, as dependency heads are set.") @@ -306,7 +307,7 @@ class Errors: "settings: {opts}") E107 = ("Value of doc._.{attr} is not JSON-serializable: {value}") E109 = ("Component '{name}' could not be run. Did you forget to " - "call begin_training()?") + "call initialize()?") E110 = ("Invalid displaCy render wrapper. Expected callable, got: {obj}") E111 = ("Pickling a token is not supported, because tokens are only views " "of the parent Doc and can't exist on their own. A pickled token " @@ -376,7 +377,7 @@ class Errors: "provided {found}.") E143 = ("Labels for component '{name}' not initialized. 
This can be fixed " "by calling add_label, or by providing a representative batch of " - "examples to the component's begin_training method.") + "examples to the component's initialize method.") E145 = ("Error reading `{param}` from input file.") E146 = ("Could not access `{path}`.") E147 = ("Unexpected error in the {method} functionality of the " @@ -517,7 +518,7 @@ class Errors: "but the provided argument {loc} points to a file.") E929 = ("A 'KnowledgeBase' could not be read from {loc} - the path does " "not seem to exist.") - E930 = ("Received invalid get_examples callback in {name}.begin_training. " + E930 = ("Received invalid get_examples callback in {name}.initialize. " "Expected function that returns an iterable of Example objects but " "got: {obj}") E931 = ("Encountered Pipe subclass without Pipe.{method} method in component " diff --git a/spacy/language.py b/spacy/language.py index c1d2df026..a5b78b178 100644 --- a/spacy/language.py +++ b/spacy/language.py @@ -1154,6 +1154,16 @@ class Language: *, sgd: Optional[Optimizer] = None, device: int = -1, + ) -> Optimizer: + warnings.warn(Warnings.W089, DeprecationWarning) + return self.initialize(get_examples, sgd=sgd, device=device) + + def initialize( + self, + get_examples: Optional[Callable[[], Iterable[Example]]] = None, + *, + sgd: Optional[Optimizer] = None, + device: int = -1, ) -> Optimizer: """Initialize the pipe for training, using data examples if available. @@ -1163,11 +1173,11 @@ class Language: create_optimizer if it doesn't exist. RETURNS (thinc.api.Optimizer): The optimizer. - DOCS: https://nightly.spacy.io/api/language#begin_training + DOCS: https://nightly.spacy.io/api/language#initialize """ if get_examples is None: util.logger.debug( - "No 'get_examples' callback provided to 'Language.begin_training', creating dummy examples" + "No 'get_examples' callback provided to 'Language.initialize', creating dummy examples" ) doc = Doc(self.vocab, words=["x", "y", "z"]) get_examples = lambda: [Example.from_dict(doc, {})] @@ -1179,7 +1189,7 @@ class Language: for example in get_examples(): if not isinstance(example, Example): err = Errors.E978.format( - name="Language.begin_training", types=type(example) + name="Language.initialize", types=type(example) ) raise ValueError(err) else: @@ -1198,8 +1208,8 @@ class Language: sgd = create_default_optimizer() self._optimizer = sgd for name, proc in self.pipeline: - if hasattr(proc, "begin_training"): - proc.begin_training( + if hasattr(proc, "initialize"): + proc.initialize( get_examples, pipeline=self.pipeline, sgd=self._optimizer ) self._link_components() diff --git a/spacy/pipeline/dep_parser.pyx b/spacy/pipeline/dep_parser.pyx index a447434d2..95effac59 100644 --- a/spacy/pipeline/dep_parser.pyx +++ b/spacy/pipeline/dep_parser.pyx @@ -132,7 +132,7 @@ cdef class DependencyParser(Parser): labeller.model.set_dim("nO", len(self.labels)) if labeller.model.has_ref("output_layer"): labeller.model.get_ref("output_layer").set_dim("nO", len(self.labels)) - labeller.begin_training(get_examples, pipeline=pipeline, sgd=sgd) + labeller.initialize(get_examples, pipeline=pipeline, sgd=sgd) @property def labels(self): diff --git a/spacy/pipeline/entity_linker.py b/spacy/pipeline/entity_linker.py index 039e2a891..0f33378b4 100644 --- a/spacy/pipeline/entity_linker.py +++ b/spacy/pipeline/entity_linker.py @@ -140,7 +140,7 @@ class EntityLinker(Pipe): if len(self.kb) == 0: raise ValueError(Errors.E139.format(name=self.name)) - def begin_training( + def initialize( self, get_examples: Callable[[], 
Iterable[Example]], *, @@ -159,7 +159,7 @@ class EntityLinker(Pipe): create_optimizer if it doesn't exist. RETURNS (thinc.api.Optimizer): The optimizer. - DOCS: https://nightly.spacy.io/api/entitylinker#begin_training + DOCS: https://nightly.spacy.io/api/entitylinker#initialize """ self._ensure_examples(get_examples) self._require_kb() diff --git a/spacy/pipeline/morphologizer.pyx b/spacy/pipeline/morphologizer.pyx index 5fee9a900..d035172a8 100644 --- a/spacy/pipeline/morphologizer.pyx +++ b/spacy/pipeline/morphologizer.pyx @@ -129,7 +129,7 @@ class Morphologizer(Tagger): self.cfg["labels_pos"][norm_label] = POS_IDS[pos] return 1 - def begin_training(self, get_examples, *, pipeline=None, sgd=None): + def initialize(self, get_examples, *, pipeline=None, sgd=None): """Initialize the pipe for training, using a representative set of data examples. @@ -142,7 +142,7 @@ class Morphologizer(Tagger): create_optimizer if it doesn't exist. RETURNS (thinc.api.Optimizer): The optimizer. - DOCS: https://nightly.spacy.io/api/morphologizer#begin_training + DOCS: https://nightly.spacy.io/api/morphologizer#initialize """ self._ensure_examples(get_examples) # First, fetch all labels from the data diff --git a/spacy/pipeline/multitask.pyx b/spacy/pipeline/multitask.pyx index 2f8940124..3fd034b30 100644 --- a/spacy/pipeline/multitask.pyx +++ b/spacy/pipeline/multitask.pyx @@ -81,7 +81,7 @@ class MultitaskObjective(Tagger): def set_annotations(self, docs, dep_ids): pass - def begin_training(self, get_examples, pipeline=None, sgd=None): + def initialize(self, get_examples, pipeline=None, sgd=None): if not hasattr(get_examples, "__call__"): err = Errors.E930.format(name="MultitaskObjective", obj=type(get_examples)) raise ValueError(err) @@ -177,10 +177,10 @@ class ClozeMultitask(Pipe): def set_annotations(self, docs, dep_ids): pass - def begin_training(self, get_examples, pipeline=None, sgd=None): + def initialize(self, get_examples, pipeline=None, sgd=None): self.model.initialize() # TODO: fix initialization by defining X and Y X = self.model.ops.alloc((5, self.model.get_ref("tok2vec").get_dim("nO"))) - self.model.output_layer.begin_training(X) + self.model.output_layer.initialize(X) if sgd is None: sgd = self.create_optimizer() return sgd diff --git a/spacy/pipeline/ner.pyx b/spacy/pipeline/ner.pyx index fc0dda40d..effcef2e3 100644 --- a/spacy/pipeline/ner.pyx +++ b/spacy/pipeline/ner.pyx @@ -103,7 +103,7 @@ cdef class EntityRecognizer(Parser): labeller.model.set_dim("nO", len(self.labels)) if labeller.model.has_ref("output_layer"): labeller.model.get_ref("output_layer").set_dim("nO", len(self.labels)) - labeller.begin_training(get_examples, pipeline=pipeline) + labeller.initialize(get_examples, pipeline=pipeline) @property def labels(self): diff --git a/spacy/pipeline/pipe.pyx b/spacy/pipeline/pipe.pyx index 324c8e19c..bff2be1af 100644 --- a/spacy/pipeline/pipe.pyx +++ b/spacy/pipeline/pipe.pyx @@ -183,7 +183,7 @@ cdef class Pipe: """ return util.create_default_optimizer() - def begin_training(self, get_examples, *, pipeline=None, sgd=None): + def initialize(self, get_examples, *, pipeline=None, sgd=None): """Initialize the pipe for training, using data examples if available. This method needs to be implemented by each Pipe component, ensuring the internal model (if available) is initialized properly @@ -198,7 +198,7 @@ cdef class Pipe: create_optimizer if it doesn't exist. RETURNS (thinc.api.Optimizer): The optimizer. 
- DOCS: https://nightly.spacy.io/api/pipe#begin_training + DOCS: https://nightly.spacy.io/api/pipe#initialize """ raise NotImplementedError(Errors.E931.format(method="add_label", name=self.name)) diff --git a/spacy/pipeline/sentencizer.pyx b/spacy/pipeline/sentencizer.pyx index 2882f6f8b..0f49033ff 100644 --- a/spacy/pipeline/sentencizer.pyx +++ b/spacy/pipeline/sentencizer.pyx @@ -58,7 +58,7 @@ class Sentencizer(Pipe): else: self.punct_chars = set(self.default_punct_chars) - def begin_training(self, get_examples, pipeline=None, sgd=None): + def initialize(self, get_examples, pipeline=None, sgd=None): pass def __call__(self, doc): diff --git a/spacy/pipeline/senter.pyx b/spacy/pipeline/senter.pyx index da85a9cf2..68a9860a5 100644 --- a/spacy/pipeline/senter.pyx +++ b/spacy/pipeline/senter.pyx @@ -124,7 +124,7 @@ class SentenceRecognizer(Tagger): raise ValueError("nan value when computing loss") return float(loss), d_scores - def begin_training(self, get_examples, *, pipeline=None, sgd=None): + def initialize(self, get_examples, *, pipeline=None, sgd=None): """Initialize the pipe for training, using a representative set of data examples. @@ -137,7 +137,7 @@ class SentenceRecognizer(Tagger): create_optimizer if it doesn't exist. RETURNS (thinc.api.Optimizer): The optimizer. - DOCS: https://nightly.spacy.io/api/sentencerecognizer#begin_training + DOCS: https://nightly.spacy.io/api/sentencerecognizer#initialize """ self._ensure_examples(get_examples) doc_sample = [] diff --git a/spacy/pipeline/tagger.pyx b/spacy/pipeline/tagger.pyx index 3efe29916..66f8b38b6 100644 --- a/spacy/pipeline/tagger.pyx +++ b/spacy/pipeline/tagger.pyx @@ -256,7 +256,7 @@ class Tagger(Pipe): raise ValueError("nan value when computing loss") return float(loss), d_scores - def begin_training(self, get_examples, *, pipeline=None, sgd=None): + def initialize(self, get_examples, *, pipeline=None, sgd=None): """Initialize the pipe for training, using a representative set of data examples. @@ -269,7 +269,7 @@ class Tagger(Pipe): create_optimizer if it doesn't exist. RETURNS (thinc.api.Optimizer): The optimizer. - DOCS: https://nightly.spacy.io/api/tagger#begin_training + DOCS: https://nightly.spacy.io/api/tagger#initialize """ self._ensure_examples(get_examples) doc_sample = [] diff --git a/spacy/pipeline/textcat.py b/spacy/pipeline/textcat.py index 6b8c0ca65..37665adfc 100644 --- a/spacy/pipeline/textcat.py +++ b/spacy/pipeline/textcat.py @@ -334,7 +334,7 @@ class TextCategorizer(Pipe): self.labels = tuple(list(self.labels) + [label]) return 1 - def begin_training( + def initialize( self, get_examples: Callable[[], Iterable[Example]], *, @@ -353,7 +353,7 @@ class TextCategorizer(Pipe): create_optimizer if it doesn't exist. RETURNS (thinc.api.Optimizer): The optimizer. - DOCS: https://nightly.spacy.io/api/textcategorizer#begin_training + DOCS: https://nightly.spacy.io/api/textcategorizer#initialize """ self._ensure_examples(get_examples) subbatch = [] # Select a subbatch of examples to initialize the model diff --git a/spacy/pipeline/tok2vec.py b/spacy/pipeline/tok2vec.py index 9ab4e42b7..7c8bbf5e5 100644 --- a/spacy/pipeline/tok2vec.py +++ b/spacy/pipeline/tok2vec.py @@ -203,7 +203,7 @@ class Tok2Vec(Pipe): def get_loss(self, examples, scores) -> None: pass - def begin_training( + def initialize( self, get_examples: Callable[[], Iterable[Example]], *, @@ -222,7 +222,7 @@ class Tok2Vec(Pipe): create_optimizer if it doesn't exist. RETURNS (thinc.api.Optimizer): The optimizer. 
- DOCS: https://nightly.spacy.io/api/tok2vec#begin_training + DOCS: https://nightly.spacy.io/api/tok2vec#initialize """ self._ensure_examples(get_examples) doc_sample = [] diff --git a/spacy/pipeline/transition_parser.pyx b/spacy/pipeline/transition_parser.pyx index 1350e1f12..5a4503cf9 100644 --- a/spacy/pipeline/transition_parser.pyx +++ b/spacy/pipeline/transition_parser.pyx @@ -405,7 +405,7 @@ cdef class Parser(Pipe): def set_output(self, nO): self.model.attrs["resize_output"](self.model, nO) - def begin_training(self, get_examples, pipeline=None, sgd=None, **kwargs): + def initialize(self, get_examples, pipeline=None, sgd=None, **kwargs): self._ensure_examples(get_examples) self.cfg.update(kwargs) lexeme_norms = self.vocab.lookups.get_table("lexeme_norm", {}) diff --git a/spacy/tests/doc/test_add_entities.py b/spacy/tests/doc/test_add_entities.py index 86aa883bd..fa0206fdd 100644 --- a/spacy/tests/doc/test_add_entities.py +++ b/spacy/tests/doc/test_add_entities.py @@ -26,7 +26,7 @@ def test_doc_add_entities_set_ents_iob(en_vocab): cfg = {"model": DEFAULT_NER_MODEL} model = registry.resolve(cfg, validate=True)["model"] ner = EntityRecognizer(en_vocab, model, **config) - ner.begin_training(lambda: [_ner_example(ner)]) + ner.initialize(lambda: [_ner_example(ner)]) ner(doc) doc.ents = [("ANIMAL", 3, 4)] @@ -48,7 +48,7 @@ def test_ents_reset(en_vocab): cfg = {"model": DEFAULT_NER_MODEL} model = registry.resolve(cfg, validate=True)["model"] ner = EntityRecognizer(en_vocab, model, **config) - ner.begin_training(lambda: [_ner_example(ner)]) + ner.initialize(lambda: [_ner_example(ner)]) ner(doc) orig_iobs = [t.ent_iob_ for t in doc] doc.ents = list(doc.ents) diff --git a/spacy/tests/parser/test_add_label.py b/spacy/tests/parser/test_add_label.py index cd376e0fc..fb1eabf7d 100644 --- a/spacy/tests/parser/test_add_label.py +++ b/spacy/tests/parser/test_add_label.py @@ -35,7 +35,7 @@ def test_init_parser(parser): def _train_parser(parser): fix_random_seed(1) parser.add_label("left") - parser.begin_training(lambda: [_parser_example(parser)], **parser.cfg) + parser.initialize(lambda: [_parser_example(parser)], **parser.cfg) sgd = Adam(0.001) for i in range(5): @@ -87,7 +87,7 @@ def test_add_label_deserializes_correctly(): ner1.add_label("C") ner1.add_label("B") ner1.add_label("A") - ner1.begin_training(lambda: [_ner_example(ner1)]) + ner1.initialize(lambda: [_ner_example(ner1)]) ner2 = EntityRecognizer(Vocab(), model, **config) # the second model needs to be resized before we can call from_bytes diff --git a/spacy/tests/parser/test_ner.py b/spacy/tests/parser/test_ner.py index cd5581769..b657ae2e8 100644 --- a/spacy/tests/parser/test_ner.py +++ b/spacy/tests/parser/test_ner.py @@ -202,7 +202,7 @@ def test_train_empty(): train_examples.append(Example.from_dict(nlp.make_doc(t[0]), t[1])) ner = nlp.add_pipe("ner", last=True) ner.add_label("PERSON") - nlp.begin_training() + nlp.initialize() for itn in range(2): losses = {} batches = util.minibatch(train_examples, size=8) @@ -213,7 +213,7 @@ def test_train_empty(): def test_overwrite_token(): nlp = English() nlp.add_pipe("ner") - nlp.begin_training() + nlp.initialize() # The untrained NER will predict O for each token doc = nlp("I live in New York") assert [token.ent_iob_ for token in doc] == ["O", "O", "O", "O", "O"] @@ -235,7 +235,7 @@ def test_empty_ner(): nlp = English() ner = nlp.add_pipe("ner") ner.add_label("MY_LABEL") - nlp.begin_training() + nlp.initialize() doc = nlp("John is watching the news about Croatia's elections") # if this goes wrong, 
the initialization of the parser's upper layer is probably broken result = ["O", "O", "O", "O", "O", "O", "O", "O", "O"] @@ -254,7 +254,7 @@ def test_ruler_before_ner(): # 2: untrained NER - should set everything else to O untrained_ner = nlp.add_pipe("ner") untrained_ner.add_label("MY_LABEL") - nlp.begin_training() + nlp.initialize() doc = nlp("This is Antti Korhonen speaking in Finland") expected_iobs = ["B", "O", "O", "O", "O", "O", "O"] expected_types = ["THING", "", "", "", "", "", ""] @@ -269,7 +269,7 @@ def test_ner_before_ruler(): # 1: untrained NER - should set everything to O untrained_ner = nlp.add_pipe("ner", name="uner") untrained_ner.add_label("MY_LABEL") - nlp.begin_training() + nlp.initialize() # 2 : Entity Ruler - should set "this" to B and keep everything else O patterns = [{"label": "THING", "pattern": "This"}] @@ -290,7 +290,7 @@ def test_block_ner(): nlp.add_pipe("blocker", config={"start": 2, "end": 5}) untrained_ner = nlp.add_pipe("ner") untrained_ner.add_label("MY_LABEL") - nlp.begin_training() + nlp.initialize() doc = nlp("This is Antti L Korhonen speaking in Finland") expected_iobs = ["O", "O", "B", "B", "B", "O", "O", "O"] expected_types = ["", "", "", "", "", "", "", ""] @@ -307,7 +307,7 @@ def test_overfitting_IO(): train_examples.append(Example.from_dict(nlp.make_doc(text), annotations)) for ent in annotations.get("entities"): ner.add_label(ent[2]) - optimizer = nlp.begin_training() + optimizer = nlp.initialize() for i in range(50): losses = {} @@ -340,13 +340,13 @@ def test_ner_warns_no_lookups(caplog): assert not len(nlp.vocab.lookups) nlp.add_pipe("ner") with caplog.at_level(logging.DEBUG): - nlp.begin_training() + nlp.initialize() assert "W033" in caplog.text caplog.clear() nlp.vocab.lookups.add_table("lexeme_norm") nlp.vocab.lookups.get_table("lexeme_norm")["a"] = "A" with caplog.at_level(logging.DEBUG): - nlp.begin_training() + nlp.initialize() assert "W033" not in caplog.text @@ -358,5 +358,5 @@ class BlockerComponent1: self.name = name def __call__(self, doc): - doc.set_ents([], blocked=[doc[self.start:self.end]], default="unmodified") + doc.set_ents([], blocked=[doc[self.start : self.end]], default="unmodified") return doc diff --git a/spacy/tests/parser/test_parse.py b/spacy/tests/parser/test_parse.py index 8648f2018..ffb6f23f1 100644 --- a/spacy/tests/parser/test_parse.py +++ b/spacy/tests/parser/test_parse.py @@ -191,7 +191,7 @@ def test_overfitting_IO(): train_examples.append(Example.from_dict(nlp.make_doc(text), annotations)) for dep in annotations.get("deps", []): parser.add_label(dep) - optimizer = nlp.begin_training() + optimizer = nlp.initialize() for i in range(100): losses = {} nlp.update(train_examples, sgd=optimizer, losses=losses) diff --git a/spacy/tests/parser/test_preset_sbd.py b/spacy/tests/parser/test_preset_sbd.py index e8dfa68c7..d8f861b02 100644 --- a/spacy/tests/parser/test_preset_sbd.py +++ b/spacy/tests/parser/test_preset_sbd.py @@ -34,7 +34,7 @@ def parser(vocab): parser.cfg["hidden_width"] = 32 # parser.add_label('right') parser.add_label("left") - parser.begin_training(lambda: [_parser_example(parser)], **parser.cfg) + parser.initialize(lambda: [_parser_example(parser)], **parser.cfg) sgd = Adam(0.001) for i in range(10): diff --git a/spacy/tests/pipeline/test_entity_linker.py b/spacy/tests/pipeline/test_entity_linker.py index 878f41a28..d5c8de36b 100644 --- a/spacy/tests/pipeline/test_entity_linker.py +++ b/spacy/tests/pipeline/test_entity_linker.py @@ -134,7 +134,7 @@ def test_kb_undefined(nlp): """Test that the EL can't 
train without defining a KB""" entity_linker = nlp.add_pipe("entity_linker", config={}) with pytest.raises(ValueError): - entity_linker.begin_training(lambda: []) + entity_linker.initialize(lambda: []) def test_kb_empty(nlp): @@ -143,7 +143,7 @@ def test_kb_empty(nlp): entity_linker = nlp.add_pipe("entity_linker", config=config) assert len(entity_linker.kb) == 0 with pytest.raises(ValueError): - entity_linker.begin_training(lambda: []) + entity_linker.initialize(lambda: []) def test_kb_serialize(nlp): @@ -360,7 +360,7 @@ def test_preserving_links_asdoc(nlp): ruler.add_patterns(patterns) el_config = {"kb_loader": {"@misc": "myLocationsKB.v1"}, "incl_prior": False} entity_linker = nlp.add_pipe("entity_linker", config=el_config, last=True) - nlp.begin_training() + nlp.initialize() assert entity_linker.model.get_dim("nO") == vector_length # test whether the entity links are preserved by the `as_doc()` function @@ -463,7 +463,7 @@ def test_overfitting_IO(): ) # train the NEL pipe - optimizer = nlp.begin_training(get_examples=lambda: train_examples) + optimizer = nlp.initialize(get_examples=lambda: train_examples) assert entity_linker.model.get_dim("nO") == vector_length assert entity_linker.model.get_dim("nO") == entity_linker.kb.entity_vector_length diff --git a/spacy/tests/pipeline/test_morphologizer.py b/spacy/tests/pipeline/test_morphologizer.py index 864c7332e..c86ee3617 100644 --- a/spacy/tests/pipeline/test_morphologizer.py +++ b/spacy/tests/pipeline/test_morphologizer.py @@ -33,7 +33,7 @@ def test_no_label(): nlp = Language() nlp.add_pipe("morphologizer") with pytest.raises(ValueError): - nlp.begin_training() + nlp.initialize() def test_implicit_label(): @@ -42,7 +42,7 @@ def test_implicit_label(): train_examples = [] for t in TRAIN_DATA: train_examples.append(Example.from_dict(nlp.make_doc(t[0]), t[1])) - nlp.begin_training(get_examples=lambda: train_examples) + nlp.initialize(get_examples=lambda: train_examples) def test_no_resize(): @@ -50,13 +50,13 @@ def test_no_resize(): morphologizer = nlp.add_pipe("morphologizer") morphologizer.add_label("POS" + Morphology.FIELD_SEP + "NOUN") morphologizer.add_label("POS" + Morphology.FIELD_SEP + "VERB") - nlp.begin_training() + nlp.initialize() # this throws an error because the morphologizer can't be resized after initialization with pytest.raises(ValueError): morphologizer.add_label("POS" + Morphology.FIELD_SEP + "ADJ") -def test_begin_training_examples(): +def test_initialize_examples(): nlp = Language() morphologizer = nlp.add_pipe("morphologizer") morphologizer.add_label("POS" + Morphology.FIELD_SEP + "NOUN") @@ -64,12 +64,12 @@ def test_begin_training_examples(): for t in TRAIN_DATA: train_examples.append(Example.from_dict(nlp.make_doc(t[0]), t[1])) # you shouldn't really call this more than once, but for testing it should be fine - nlp.begin_training() - nlp.begin_training(get_examples=lambda: train_examples) + nlp.initialize() + nlp.initialize(get_examples=lambda: train_examples) with pytest.raises(TypeError): - nlp.begin_training(get_examples=lambda: None) + nlp.initialize(get_examples=lambda: None) with pytest.raises(ValueError): - nlp.begin_training(get_examples=train_examples) + nlp.initialize(get_examples=train_examples) def test_overfitting_IO(): @@ -79,7 +79,7 @@ def test_overfitting_IO(): train_examples = [] for inst in TRAIN_DATA: train_examples.append(Example.from_dict(nlp.make_doc(inst[0]), inst[1])) - optimizer = nlp.begin_training(get_examples=lambda: train_examples) + optimizer = nlp.initialize(get_examples=lambda: 
train_examples) for i in range(50): losses = {} diff --git a/spacy/tests/pipeline/test_senter.py b/spacy/tests/pipeline/test_senter.py index 5827f8ff1..5d8a8be41 100644 --- a/spacy/tests/pipeline/test_senter.py +++ b/spacy/tests/pipeline/test_senter.py @@ -31,19 +31,19 @@ TRAIN_DATA = [ ] -def test_begin_training_examples(): +def test_initialize_examples(): nlp = Language() nlp.add_pipe("senter") train_examples = [] for t in TRAIN_DATA: train_examples.append(Example.from_dict(nlp.make_doc(t[0]), t[1])) # you shouldn't really call this more than once, but for testing it should be fine - nlp.begin_training() - nlp.begin_training(get_examples=lambda: train_examples) + nlp.initialize() + nlp.initialize(get_examples=lambda: train_examples) with pytest.raises(TypeError): - nlp.begin_training(get_examples=lambda: None) + nlp.initialize(get_examples=lambda: None) with pytest.raises(ValueError): - nlp.begin_training(get_examples=train_examples) + nlp.initialize(get_examples=train_examples) def test_overfitting_IO(): @@ -58,7 +58,7 @@ def test_overfitting_IO(): train_examples[1].reference[11].is_sent_start = False nlp.add_pipe("senter") - optimizer = nlp.begin_training() + optimizer = nlp.initialize() for i in range(200): losses = {} diff --git a/spacy/tests/pipeline/test_tagger.py b/spacy/tests/pipeline/test_tagger.py index cd5927675..69a6dd414 100644 --- a/spacy/tests/pipeline/test_tagger.py +++ b/spacy/tests/pipeline/test_tagger.py @@ -15,14 +15,14 @@ def test_label_types(): tagger.add_label(9) -def test_tagger_begin_training_tag_map(): - """Test that Tagger.begin_training() without gold tuples does not clobber +def test_tagger_initialize_tag_map(): + """Test that Tagger.initialize() without gold tuples does not clobber the tag map.""" nlp = Language() tagger = nlp.add_pipe("tagger") orig_tag_count = len(tagger.labels) tagger.add_label("A") - nlp.begin_training() + nlp.initialize() assert orig_tag_count + 1 == len(nlp.get_pipe("tagger").labels) @@ -38,7 +38,7 @@ def test_no_label(): nlp = Language() nlp.add_pipe("tagger") with pytest.raises(ValueError): - nlp.begin_training() + nlp.initialize() def test_no_resize(): @@ -47,7 +47,7 @@ def test_no_resize(): tagger.add_label("N") tagger.add_label("V") assert tagger.labels == ("N", "V") - nlp.begin_training() + nlp.initialize() assert tagger.model.get_dim("nO") == 2 # this throws an error because the tagger can't be resized after initialization with pytest.raises(ValueError): @@ -60,10 +60,10 @@ def test_implicit_label(): train_examples = [] for t in TRAIN_DATA: train_examples.append(Example.from_dict(nlp.make_doc(t[0]), t[1])) - nlp.begin_training(get_examples=lambda: train_examples) + nlp.initialize(get_examples=lambda: train_examples) -def test_begin_training_examples(): +def test_initialize_examples(): nlp = Language() tagger = nlp.add_pipe("tagger") train_examples = [] @@ -72,16 +72,16 @@ def test_begin_training_examples(): for t in TRAIN_DATA: train_examples.append(Example.from_dict(nlp.make_doc(t[0]), t[1])) # you shouldn't really call this more than once, but for testing it should be fine - nlp.begin_training() - nlp.begin_training(get_examples=lambda: train_examples) + nlp.initialize() + nlp.initialize(get_examples=lambda: train_examples) with pytest.raises(TypeError): - nlp.begin_training(get_examples=lambda: None) + nlp.initialize(get_examples=lambda: None) with pytest.raises(TypeError): - nlp.begin_training(get_examples=lambda: train_examples[0]) + nlp.initialize(get_examples=lambda: train_examples[0]) with pytest.raises(ValueError): - 
nlp.begin_training(get_examples=lambda: []) + nlp.initialize(get_examples=lambda: []) with pytest.raises(ValueError): - nlp.begin_training(get_examples=train_examples) + nlp.initialize(get_examples=train_examples) def test_overfitting_IO(): @@ -91,7 +91,7 @@ def test_overfitting_IO(): train_examples = [] for t in TRAIN_DATA: train_examples.append(Example.from_dict(nlp.make_doc(t[0]), t[1])) - optimizer = nlp.begin_training(get_examples=lambda: train_examples) + optimizer = nlp.initialize(get_examples=lambda: train_examples) assert tagger.model.get_dim("nO") == len(TAGS) for i in range(50): @@ -122,4 +122,4 @@ def test_tagger_requires_labels(): nlp = English() nlp.add_pipe("tagger") with pytest.raises(ValueError): - nlp.begin_training() + nlp.initialize() diff --git a/spacy/tests/pipeline/test_textcat.py b/spacy/tests/pipeline/test_textcat.py index 02e189834..2870229c8 100644 --- a/spacy/tests/pipeline/test_textcat.py +++ b/spacy/tests/pipeline/test_textcat.py @@ -26,7 +26,7 @@ def test_simple_train(): nlp = Language() textcat = nlp.add_pipe("textcat") textcat.add_label("answer") - nlp.begin_training() + nlp.initialize() for i in range(5): for text, answer in [ ("aaaa", 1.0), @@ -56,7 +56,7 @@ def test_textcat_learns_multilabel(): textcat = TextCategorizer(nlp.vocab, width=8) for letter in letters: textcat.add_label(letter) - optimizer = textcat.begin_training(lambda: []) + optimizer = textcat.initialize(lambda: []) for i in range(30): losses = {} examples = [Example.from_dict(doc, {"cats": cats}) for doc, cat in docs] @@ -86,7 +86,7 @@ def test_no_label(): nlp = Language() nlp.add_pipe("textcat") with pytest.raises(ValueError): - nlp.begin_training() + nlp.initialize() def test_implicit_label(): @@ -95,7 +95,7 @@ def test_implicit_label(): train_examples = [] for t in TRAIN_DATA: train_examples.append(Example.from_dict(nlp.make_doc(t[0]), t[1])) - nlp.begin_training(get_examples=lambda: train_examples) + nlp.initialize(get_examples=lambda: train_examples) def test_no_resize(): @@ -103,14 +103,14 @@ def test_no_resize(): textcat = nlp.add_pipe("textcat") textcat.add_label("POSITIVE") textcat.add_label("NEGATIVE") - nlp.begin_training() + nlp.initialize() assert textcat.model.get_dim("nO") == 2 # this throws an error because the textcat can't be resized after initialization with pytest.raises(ValueError): textcat.add_label("NEUTRAL") -def test_begin_training_examples(): +def test_initialize_examples(): nlp = Language() textcat = nlp.add_pipe("textcat") train_examples = [] @@ -119,12 +119,12 @@ def test_begin_training_examples(): for label, value in annotations.get("cats").items(): textcat.add_label(label) # you shouldn't really call this more than once, but for testing it should be fine - nlp.begin_training() - nlp.begin_training(get_examples=lambda: train_examples) + nlp.initialize() + nlp.initialize(get_examples=lambda: train_examples) with pytest.raises(TypeError): - nlp.begin_training(get_examples=lambda: None) + nlp.initialize(get_examples=lambda: None) with pytest.raises(ValueError): - nlp.begin_training(get_examples=train_examples) + nlp.initialize(get_examples=train_examples) def test_overfitting_IO(): @@ -139,7 +139,7 @@ def test_overfitting_IO(): train_examples = [] for text, annotations in TRAIN_DATA: train_examples.append(Example.from_dict(nlp.make_doc(text), annotations)) - optimizer = nlp.begin_training(get_examples=lambda: train_examples) + optimizer = nlp.initialize(get_examples=lambda: train_examples) assert textcat.model.get_dim("nO") == 2 for i in range(50): @@ -195,7 
+195,7 @@ def test_textcat_configs(textcat_config): train_examples.append(Example.from_dict(nlp.make_doc(text), annotations)) for label, value in annotations.get("cats").items(): textcat.add_label(label) - optimizer = nlp.begin_training() + optimizer = nlp.initialize() for i in range(5): losses = {} nlp.update(train_examples, sgd=optimizer, losses=losses) diff --git a/spacy/tests/pipeline/test_tok2vec.py b/spacy/tests/pipeline/test_tok2vec.py index 558b9079c..f84b78247 100644 --- a/spacy/tests/pipeline/test_tok2vec.py +++ b/spacy/tests/pipeline/test_tok2vec.py @@ -88,7 +88,7 @@ def test_init_tok2vec(): nlp = English() tok2vec = nlp.add_pipe("tok2vec") assert tok2vec.listeners == [] - nlp.begin_training() + nlp.initialize() assert tok2vec.model.get_dim("nO") @@ -154,7 +154,7 @@ def test_tok2vec_listener(): # Check that the Tok2Vec component finds it listeners assert tok2vec.listeners == [] - optimizer = nlp.begin_training(lambda: train_examples) + optimizer = nlp.initialize(lambda: train_examples) assert tok2vec.listeners == [tagger_tok2vec] for i in range(5): diff --git a/spacy/tests/regression/test_issue1-1000.py b/spacy/tests/regression/test_issue1-1000.py index d841ee24b..6bb71f6f4 100644 --- a/spacy/tests/regression/test_issue1-1000.py +++ b/spacy/tests/regression/test_issue1-1000.py @@ -428,7 +428,7 @@ def test_issue999(): for _, offsets in TRAIN_DATA: for start, end, label in offsets: ner.add_label(label) - nlp.begin_training() + nlp.initialize() for itn in range(20): random.shuffle(TRAIN_DATA) for raw_text, entity_offsets in TRAIN_DATA: diff --git a/spacy/tests/regression/test_issue1501-2000.py b/spacy/tests/regression/test_issue1501-2000.py index dce3e8298..f85ec70e1 100644 --- a/spacy/tests/regression/test_issue1501-2000.py +++ b/spacy/tests/regression/test_issue1501-2000.py @@ -250,7 +250,7 @@ def test_issue1915(): ner = nlp.add_pipe("ner") ner.add_label("answer") with pytest.raises(ValueError): - nlp.begin_training(**cfg) + nlp.initialize(**cfg) def test_issue1945(): diff --git a/spacy/tests/regression/test_issue2001-2500.py b/spacy/tests/regression/test_issue2001-2500.py index c4c755153..09baab4d8 100644 --- a/spacy/tests/regression/test_issue2001-2500.py +++ b/spacy/tests/regression/test_issue2001-2500.py @@ -30,7 +30,7 @@ def test_issue2179(): nlp = Italian() ner = nlp.add_pipe("ner") ner.add_label("CITIZENSHIP") - nlp.begin_training() + nlp.initialize() nlp2 = Italian() nlp2.add_pipe("ner") assert len(nlp2.get_pipe("ner").labels) == 0 diff --git a/spacy/tests/regression/test_issue2501-3000.py b/spacy/tests/regression/test_issue2501-3000.py index 5895b616e..4952a545d 100644 --- a/spacy/tests/regression/test_issue2501-3000.py +++ b/spacy/tests/regression/test_issue2501-3000.py @@ -18,7 +18,7 @@ def test_issue2564(): nlp = Language() tagger = nlp.add_pipe("tagger") tagger.add_label("A") - nlp.begin_training() + nlp.initialize() doc = nlp("hello world") assert doc.has_annotation("TAG") docs = nlp.pipe(["hello", "world"]) @@ -149,7 +149,7 @@ def test_issue2800(): ner = nlp.add_pipe("ner") for entity_type in list(entity_types): ner.add_label(entity_type) - optimizer = nlp.begin_training() + optimizer = nlp.initialize() for i in range(20): losses = {} random.shuffle(train_data) diff --git a/spacy/tests/regression/test_issue3001-3500.py b/spacy/tests/regression/test_issue3001-3500.py index 56ef23dbf..6fc42e83f 100644 --- a/spacy/tests/regression/test_issue3001-3500.py +++ b/spacy/tests/regression/test_issue3001-3500.py @@ -92,7 +92,7 @@ def test_issue3209(): nlp = English() ner = 
nlp.add_pipe("ner") ner.add_label("ANIMAL") - nlp.begin_training() + nlp.initialize() move_names = ["O", "B-ANIMAL", "I-ANIMAL", "L-ANIMAL", "U-ANIMAL"] assert ner.move_names == move_names nlp2 = English() @@ -239,7 +239,7 @@ def test_issue3456(): nlp = English() tagger = nlp.add_pipe("tagger") tagger.add_label("A") - nlp.begin_training() + nlp.initialize() list(nlp.pipe(["hi", ""])) diff --git a/spacy/tests/regression/test_issue3501-4000.py b/spacy/tests/regression/test_issue3501-4000.py index 304e654c3..31e441d86 100644 --- a/spacy/tests/regression/test_issue3501-4000.py +++ b/spacy/tests/regression/test_issue3501-4000.py @@ -223,7 +223,7 @@ def test_issue3611(): textcat.add_label(label) # training the network with nlp.select_pipes(enable="textcat"): - optimizer = nlp.begin_training() + optimizer = nlp.initialize() for i in range(3): losses = {} batches = minibatch(train_data, size=compounding(4.0, 32.0, 1.001)) @@ -268,7 +268,7 @@ def test_issue3830_no_subtok(): parser = DependencyParser(Vocab(), model, **config) parser.add_label("nsubj") assert "subtok" not in parser.labels - parser.begin_training(lambda: [_parser_example(parser)]) + parser.initialize(lambda: [_parser_example(parser)]) assert "subtok" not in parser.labels @@ -283,7 +283,7 @@ def test_issue3830_with_subtok(): parser = DependencyParser(Vocab(), model, **config) parser.add_label("nsubj") assert "subtok" not in parser.labels - parser.begin_training(lambda: [_parser_example(parser)]) + parser.initialize(lambda: [_parser_example(parser)]) assert "subtok" in parser.labels @@ -342,7 +342,7 @@ def test_issue3880(): nlp.add_pipe("parser").add_label("dep") nlp.add_pipe("ner").add_label("PERSON") nlp.add_pipe("tagger").add_label("NN") - nlp.begin_training() + nlp.initialize() for doc in nlp.pipe(texts): pass diff --git a/spacy/tests/regression/test_issue4001-4500.py b/spacy/tests/regression/test_issue4001-4500.py index 7b7ddfe0d..753cff37f 100644 --- a/spacy/tests/regression/test_issue4001-4500.py +++ b/spacy/tests/regression/test_issue4001-4500.py @@ -66,7 +66,7 @@ def test_issue4030(): textcat.add_label(label) # training the network with nlp.select_pipes(enable="textcat"): - optimizer = nlp.begin_training() + optimizer = nlp.initialize() for i in range(3): losses = {} batches = minibatch(train_data, size=compounding(4.0, 32.0, 1.001)) @@ -87,7 +87,7 @@ def test_issue4042(): # add ner pipe ner = nlp.add_pipe("ner") ner.add_label("SOME_LABEL") - nlp.begin_training() + nlp.initialize() # Add entity ruler patterns = [ {"label": "MY_ORG", "pattern": "Apple"}, @@ -118,7 +118,7 @@ def test_issue4042_bug2(): # add ner pipe ner1 = nlp1.add_pipe("ner") ner1.add_label("SOME_LABEL") - nlp1.begin_training() + nlp1.initialize() # add a new label to the doc doc1 = nlp1("What do you think about Apple ?") assert len(ner1.labels) == 1 @@ -244,7 +244,7 @@ def test_issue4267(): nlp = English() ner = nlp.add_pipe("ner") ner.add_label("PEOPLE") - nlp.begin_training() + nlp.initialize() assert "ner" in nlp.pipe_names # assert that we have correct IOB annotations doc1 = nlp("hi") @@ -299,7 +299,7 @@ def test_issue4313(): config = {} ner = nlp.create_pipe("ner", config=config) ner.add_label("SOME_LABEL") - ner.begin_training(lambda: []) + ner.initialize(lambda: []) # add a new label to the doc doc = nlp("What do you think about Apple ?") assert len(ner.labels) == 1 @@ -327,7 +327,7 @@ def test_issue4348(): TRAIN_DATA = [example, example] tagger = nlp.add_pipe("tagger") tagger.add_label("A") - optimizer = nlp.begin_training() + optimizer = 
nlp.initialize() for i in range(5): losses = {} batches = minibatch(TRAIN_DATA, size=compounding(4.0, 32.0, 1.001)) diff --git a/spacy/tests/regression/test_issue4501-5000.py b/spacy/tests/regression/test_issue4501-5000.py index e351858f5..6dbbc233b 100644 --- a/spacy/tests/regression/test_issue4501-5000.py +++ b/spacy/tests/regression/test_issue4501-5000.py @@ -180,7 +180,7 @@ def test_issue4725_2(): vocab.set_vector("dog", data[1]) nlp = English(vocab=vocab) nlp.add_pipe("ner") - nlp.begin_training() + nlp.initialize() docs = ["Kurt is in London."] * 10 for _ in nlp.pipe(docs, batch_size=2, n_process=2): pass diff --git a/spacy/tests/regression/test_issue5230.py b/spacy/tests/regression/test_issue5230.py index 531e48ec3..5e320996a 100644 --- a/spacy/tests/regression/test_issue5230.py +++ b/spacy/tests/regression/test_issue5230.py @@ -64,7 +64,7 @@ def tagger(): # 1. no model leads to error in serialization, # 2. the affected line is the one for model serialization tagger.add_label("A") - nlp.begin_training() + nlp.initialize() return tagger @@ -85,7 +85,7 @@ def entity_linker(): # need to add model for two reasons: # 1. no model leads to error in serialization, # 2. the affected line is the one for model serialization - nlp.begin_training() + nlp.initialize() return entity_linker diff --git a/spacy/tests/regression/test_issue5551.py b/spacy/tests/regression/test_issue5551.py index b7139d463..655764362 100644 --- a/spacy/tests/regression/test_issue5551.py +++ b/spacy/tests/regression/test_issue5551.py @@ -25,7 +25,7 @@ def test_issue5551(): pipe = nlp.add_pipe(component, config=pipe_cfg, last=True) for label in set(example[1]["cats"]): pipe.add_label(label) - nlp.begin_training() + nlp.initialize() # Store the result of each iteration result = pipe.model.predict([nlp.make_doc(example[0])]) diff --git a/spacy/tests/serialize/test_serialize_config.py b/spacy/tests/serialize/test_serialize_config.py index eb5f15007..663e76550 100644 --- a/spacy/tests/serialize/test_serialize_config.py +++ b/spacy/tests/serialize/test_serialize_config.py @@ -152,7 +152,7 @@ def test_serialize_nlp(): nlp_config = Config().from_str(nlp_config_string) nlp = load_model_from_config(nlp_config, auto_fill=True) nlp.get_pipe("tagger").add_label("A") - nlp.begin_training() + nlp.initialize() assert "tok2vec" in nlp.pipe_names assert "tagger" in nlp.pipe_names assert "parser" not in nlp.pipe_names @@ -173,7 +173,7 @@ def test_serialize_custom_nlp(): parser_cfg = dict() parser_cfg["model"] = {"@architectures": "my_test_parser"} nlp.add_pipe("parser", config=parser_cfg) - nlp.begin_training() + nlp.initialize() with make_tempdir() as d: nlp.to_disk(d) @@ -191,7 +191,7 @@ def test_serialize_parser(): model_config = Config().from_str(parser_config_string) parser = nlp.add_pipe("parser", config=model_config) parser.add_label("nsubj") - nlp.begin_training() + nlp.initialize() with make_tempdir() as d: nlp.to_disk(d) diff --git a/spacy/tests/test_language.py b/spacy/tests/test_language.py index da46ad424..6a487303e 100644 --- a/spacy/tests/test_language.py +++ b/spacy/tests/test_language.py @@ -18,7 +18,7 @@ def nlp(): textcat = nlp.add_pipe("textcat") for label in ("POSITIVE", "NEGATIVE"): textcat.add_label(label) - nlp.begin_training() + nlp.initialize() return nlp diff --git a/spacy/tests/training/test_readers.py b/spacy/tests/training/test_readers.py index 5c02aca36..ea39e8b90 100644 --- a/spacy/tests/training/test_readers.py +++ b/spacy/tests/training/test_readers.py @@ -47,7 +47,7 @@ def test_readers(): ) optimizer = 
T["optimizer"] # simulate a training loop - nlp.begin_training(lambda: train_corpus(nlp), sgd=optimizer) + nlp.initialize(lambda: train_corpus(nlp), sgd=optimizer) for example in train_corpus(nlp): nlp.update([example], sgd=optimizer) scores = nlp.evaluate(list(dev_corpus(nlp))) @@ -99,7 +99,7 @@ def test_cat_readers(reader, additional_config): ) optimizer = T["optimizer"] # simulate a training loop - nlp.begin_training(lambda: train_corpus(nlp), sgd=optimizer) + nlp.initialize(lambda: train_corpus(nlp), sgd=optimizer) for example in train_corpus(nlp): assert example.y.cats # this shouldn't fail if each training example has at least one positive label diff --git a/spacy/tests/training/test_training.py b/spacy/tests/training/test_training.py index a04e6aadd..9655dd1b6 100644 --- a/spacy/tests/training/test_training.py +++ b/spacy/tests/training/test_training.py @@ -600,7 +600,7 @@ def _train_tuples(train_data): train_examples = [] for t in train_data: train_examples.append(Example.from_dict(nlp.make_doc(t[0]), t[1])) - optimizer = nlp.begin_training() + optimizer = nlp.initialize() for i in range(5): losses = {} batches = minibatch(train_examples, size=compounding(4.0, 32.0, 1.001)) diff --git a/spacy/training/initialize.py b/spacy/training/initialize.py index 24b00a764..23debfb28 100644 --- a/spacy/training/initialize.py +++ b/spacy/training/initialize.py @@ -49,9 +49,9 @@ def init_nlp(config: Config, *, use_gpu: int = -1, silent: bool = True) -> Langu msg.info(f"Resuming training for: {resume_components}") nlp.resume_training(sgd=optimizer) with nlp.select_pipes(disable=[*frozen_components, *resume_components]): - nlp.begin_training(lambda: train_corpus(nlp), sgd=optimizer) + nlp.initialize(lambda: train_corpus(nlp), sgd=optimizer) msg.good(f"Initialized pipeline components") - # Verify the config after calling 'begin_training' to ensure labels + # Verify the config after calling 'initialize' to ensure labels # are properly initialized verify_config(nlp) if "pretraining" in config and config["pretraining"]: diff --git a/website/docs/api/architectures.md b/website/docs/api/architectures.md index ef2666ec0..3f6258be9 100644 --- a/website/docs/api/architectures.md +++ b/website/docs/api/architectures.md @@ -517,18 +517,18 @@ specific data and challenge. Stacked ensemble of a bag-of-words model and a neural network model. The neural network has an internal CNN Tok2Vec layer and uses attention. -| Name | Description | -| -------------------- | -------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | -| `exclusive_classes` | Whether or not categories are mutually exclusive. ~~bool~~ | -| `pretrained_vectors` | Whether or not pretrained vectors will be used in addition to the feature vectors. ~~bool~~ | -| `width` | Output dimension of the feature encoding step. ~~int~~ | -| `embed_size` | Input dimension of the feature encoding step. ~~int~~ | -| `conv_depth` | Depth of the tok2vec layer. ~~int~~ | -| `window_size` | The number of contextual vectors to [concatenate](https://thinc.ai/docs/api-layers#expand_window) from the left and from the right. ~~int~~ | -| `ngram_size` | Determines the maximum length of the n-grams in the BOW model. For instance, `ngram_size=3`would give unigram, trigram and bigram features. ~~int~~ | -| `dropout` | The dropout rate. ~~float~~ | -| `nO` | Output dimension, determined by the number of different labels. 
If not set, the [`TextCategorizer`](/api/textcategorizer) component will set it when `begin_training` is called. ~~Optional[int]~~ | -| **CREATES** | The model using the architecture. ~~Model[List[Doc], Floats2d]~~ | +| Name | Description | +| -------------------- | ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | +| `exclusive_classes` | Whether or not categories are mutually exclusive. ~~bool~~ | +| `pretrained_vectors` | Whether or not pretrained vectors will be used in addition to the feature vectors. ~~bool~~ | +| `width` | Output dimension of the feature encoding step. ~~int~~ | +| `embed_size` | Input dimension of the feature encoding step. ~~int~~ | +| `conv_depth` | Depth of the tok2vec layer. ~~int~~ | +| `window_size` | The number of contextual vectors to [concatenate](https://thinc.ai/docs/api-layers#expand_window) from the left and from the right. ~~int~~ | +| `ngram_size` | Determines the maximum length of the n-grams in the BOW model. For instance, `ngram_size=3`would give unigram, trigram and bigram features. ~~int~~ | +| `dropout` | The dropout rate. ~~float~~ | +| `nO` | Output dimension, determined by the number of different labels. If not set, the [`TextCategorizer`](/api/textcategorizer) component will set it when `initialize` is called. ~~Optional[int]~~ | +| **CREATES** | The model using the architecture. ~~Model[List[Doc], Floats2d]~~ | ### spacy.TextCatCNN.v1 {#TextCatCNN} @@ -555,12 +555,12 @@ A neural network model where token vectors are calculated using a CNN. The vectors are mean pooled and used as features in a feed-forward network. This architecture is usually less accurate than the ensemble, but runs faster. -| Name | Description | -| ------------------- | -------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | -| `exclusive_classes` | Whether or not categories are mutually exclusive. ~~bool~~ | -| `tok2vec` | The [`tok2vec`](#tok2vec) layer of the model. ~~Model~~ | -| `nO` | Output dimension, determined by the number of different labels. If not set, the [`TextCategorizer`](/api/textcategorizer) component will set it when `begin_training` is called. ~~Optional[int]~~ | -| **CREATES** | The model using the architecture. ~~Model[List[Doc], Floats2d]~~ | +| Name | Description | +| ------------------- | ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | +| `exclusive_classes` | Whether or not categories are mutually exclusive. ~~bool~~ | +| `tok2vec` | The [`tok2vec`](#tok2vec) layer of the model. ~~Model~~ | +| `nO` | Output dimension, determined by the number of different labels. If not set, the [`TextCategorizer`](/api/textcategorizer) component will set it when `initialize` is called. ~~Optional[int]~~ | +| **CREATES** | The model using the architecture. ~~Model[List[Doc], Floats2d]~~ | ### spacy.TextCatBOW.v1 {#TextCatBOW} @@ -578,13 +578,13 @@ architecture is usually less accurate than the ensemble, but runs faster. An ngram "bag-of-words" model. This architecture should run much faster than the others, but may not be as accurate, especially if texts are short. 
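The `nO` rows in these text-classification tables all describe the same behaviour: the output dimension may be left unset and is filled in from the labels found in the data sample once `initialize` runs. A small sketch of that, mirroring the textcat tests earlier in this patch (the example texts and labels are made up):

```python
# Sketch: nO is inferred from the labels seen in the sample passed to initialize().
import spacy
from spacy.training import Example

nlp = spacy.blank("en")
textcat = nlp.add_pipe("textcat")
train_examples = [
    Example.from_dict(nlp.make_doc("I'm so happy."),
                      {"cats": {"POSITIVE": 1.0, "NEGATIVE": 0.0}}),
    Example.from_dict(nlp.make_doc("I'm so angry"),
                      {"cats": {"POSITIVE": 0.0, "NEGATIVE": 1.0}}),
]

optimizer = nlp.initialize(get_examples=lambda: train_examples)
# Two labels were found in the sample, so the output dimension is now 2
assert textcat.model.get_dim("nO") == 2
```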
-| Name | Description | -| ------------------- | -------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | -| `exclusive_classes` | Whether or not categories are mutually exclusive. ~~bool~~ | -| `ngram_size` | Determines the maximum length of the n-grams in the BOW model. For instance, `ngram_size=3`would give unigram, trigram and bigram features. ~~int~~ | -| `no_output_layer` | Whether or not to add an output layer to the model (`Softmax` activation if `exclusive_classes` is `True`, else `Logistic`. ~~bool~~ | -| `nO` | Output dimension, determined by the number of different labels. If not set, the [`TextCategorizer`](/api/textcategorizer) component will set it when `begin_training` is called. ~~Optional[int]~~ | -| **CREATES** | The model using the architecture. ~~Model[List[Doc], Floats2d]~~ | +| Name | Description | +| ------------------- | ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | +| `exclusive_classes` | Whether or not categories are mutually exclusive. ~~bool~~ | +| `ngram_size` | Determines the maximum length of the n-grams in the BOW model. For instance, `ngram_size=3`would give unigram, trigram and bigram features. ~~int~~ | +| `no_output_layer` | Whether or not to add an output layer to the model (`Softmax` activation if `exclusive_classes` is `True`, else `Logistic`. ~~bool~~ | +| `nO` | Output dimension, determined by the number of different labels. If not set, the [`TextCategorizer`](/api/textcategorizer) component will set it when `initialize` is called. ~~Optional[int]~~ | +| **CREATES** | The model using the architecture. ~~Model[List[Doc], Floats2d]~~ | ## Entity linking architectures {#entitylinker source="spacy/ml/models/entity_linker.py"} @@ -629,11 +629,11 @@ into the "real world". This requires 3 main components: The `EntityLinker` model architecture is a Thinc `Model` with a [`Linear`](https://thinc.ai/api-layers#linear) output layer. -| Name | Description | -| ----------- | ----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | -| `tok2vec` | The [`tok2vec`](#tok2vec) layer of the model. ~~Model~~ | -| `nO` | Output dimension, determined by the length of the vectors encoding each entity in the KB. If the `nO` dimension is not set, the entity linking component will set it when `begin_training` is called. ~~Optional[int]~~ | -| **CREATES** | The model using the architecture. ~~Model[List[Doc], Floats2d]~~ | +| Name | Description | +| ----------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | +| `tok2vec` | The [`tok2vec`](#tok2vec) layer of the model. ~~Model~~ | +| `nO` | Output dimension, determined by the length of the vectors encoding each entity in the KB. If the `nO` dimension is not set, the entity linking component will set it when `initialize` is called. ~~Optional[int]~~ | +| **CREATES** | The model using the architecture. 
~~Model[List[Doc], Floats2d]~~ | ### spacy.EmptyKB.v1 {#EmptyKB} diff --git a/website/docs/api/dependencyparser.md b/website/docs/api/dependencyparser.md index 8af4455d3..c7c41f2a1 100644 --- a/website/docs/api/dependencyparser.md +++ b/website/docs/api/dependencyparser.md @@ -140,7 +140,7 @@ applied to the `Doc` in order. Both [`__call__`](/api/dependencyparser#call) and | `batch_size` | The number of documents to buffer. Defaults to `128`. ~~int~~ | | **YIELDS** | The processed documents in order. ~~Doc~~ | -## DependencyParser.begin_training {#begin_training tag="method"} +## DependencyParser.initialize {#initialize tag="method"} Initialize the component for training and return an [`Optimizer`](https://thinc.ai/docs/api-optimizers). `get_examples` should be a @@ -151,11 +151,17 @@ validating the network, [inferring missing shapes](https://thinc.ai/docs/usage-models#validation) and setting up the label scheme based on the data. + + +This method was previously called `begin_training`. + + + > #### Example > > ```python > parser = nlp.add_pipe("parser") -> optimizer = parser.begin_training(lambda: [], pipeline=nlp.pipeline) +> optimizer = parser.initialize(lambda: [], pipeline=nlp.pipeline) > ``` | Name | Description | @@ -210,7 +216,7 @@ model. Delegates to [`predict`](/api/dependencyparser#predict) and > > ```python > parser = nlp.add_pipe("parser") -> optimizer = nlp.begin_training() +> optimizer = nlp.initialize() > losses = parser.update(examples, sgd=optimizer) > ``` @@ -294,11 +300,10 @@ context, the original parameters are restored. ## DependencyParser.add_label {#add_label tag="method"} Add a new label to the pipe. Note that you don't have to call this method if you -provide a **representative data sample** to the -[`begin_training`](#begin_training) method. In this case, all labels found in -the sample will be automatically added to the model, and the output dimension -will be [inferred](/usage/layers-architectures#thinc-shape-inference) -automatically. +provide a **representative data sample** to the [`initialize`](#initialize) +method. In this case, all labels found in the sample will be automatically added +to the model, and the output dimension will be +[inferred](/usage/layers-architectures#thinc-shape-inference) automatically. > #### Example > diff --git a/website/docs/api/entitylinker.md b/website/docs/api/entitylinker.md index 945a1568a..1dbe78703 100644 --- a/website/docs/api/entitylinker.md +++ b/website/docs/api/entitylinker.md @@ -139,7 +139,7 @@ applied to the `Doc` in order. Both [`__call__`](/api/entitylinker#call) and | `batch_size` | The number of documents to buffer. Defaults to `128`. ~~int~~ | | **YIELDS** | The processed documents in order. ~~Doc~~ | -## EntityLinker.begin_training {#begin_training tag="method"} +## EntityLinker.initialize {#initialize tag="method"} Initialize the component for training and return an [`Optimizer`](https://thinc.ai/docs/api-optimizers). `get_examples` should be a @@ -150,11 +150,17 @@ validating the network, [inferring missing shapes](https://thinc.ai/docs/usage-models#validation) and setting up the label scheme based on the data. + + +This method was previously called `begin_training`. + + + > #### Example > > ```python > entity_linker = nlp.add_pipe("entity_linker", last=True) -> optimizer = entity_linker.begin_training(lambda: [], pipeline=nlp.pipeline) +> optimizer = entity_linker.initialize(lambda: [], pipeline=nlp.pipeline) > ``` | Name | Description | @@ -211,7 +217,7 @@ pipe's entity linking model and context encoder. 
Delegates to > > ```python > entity_linker = nlp.add_pipe("entity_linker") -> optimizer = nlp.begin_training() +> optimizer = nlp.initialize() > losses = entity_linker.update(examples, sgd=optimizer) > ``` diff --git a/website/docs/api/entityrecognizer.md b/website/docs/api/entityrecognizer.md index 6d710f425..2c32ff753 100644 --- a/website/docs/api/entityrecognizer.md +++ b/website/docs/api/entityrecognizer.md @@ -129,7 +129,7 @@ applied to the `Doc` in order. Both [`__call__`](/api/entityrecognizer#call) and | `batch_size` | The number of documents to buffer. Defaults to `128`. ~~int~~ | | **YIELDS** | The processed documents in order. ~~Doc~~ | -## EntityRecognizer.begin_training {#begin_training tag="method"} +## EntityRecognizer.initialize {#initialize tag="method"} Initialize the component for training and return an [`Optimizer`](https://thinc.ai/docs/api-optimizers). `get_examples` should be a @@ -140,11 +140,17 @@ validating the network, [inferring missing shapes](https://thinc.ai/docs/usage-models#validation) and setting up the label scheme based on the data. + + +This method was previously called `begin_training`. + + + > #### Example > > ```python > ner = nlp.add_pipe("ner") -> optimizer = ner.begin_training(lambda: [], pipeline=nlp.pipeline) +> optimizer = ner.initialize(lambda: [], pipeline=nlp.pipeline) > ``` | Name | Description | @@ -199,7 +205,7 @@ model. Delegates to [`predict`](/api/entityrecognizer#predict) and > > ```python > ner = nlp.add_pipe("ner") -> optimizer = nlp.begin_training() +> optimizer = nlp.initialize() > losses = ner.update(examples, sgd=optimizer) > ``` @@ -282,11 +288,10 @@ context, the original parameters are restored. ## EntityRecognizer.add_label {#add_label tag="method"} Add a new label to the pipe. Note that you don't have to call this method if you -provide a **representative data sample** to the -[`begin_training`](#begin_training) method. In this case, all labels found in -the sample will be automatically added to the model, and the output dimension -will be [inferred](/usage/layers-architectures#thinc-shape-inference) -automatically. +provide a **representative data sample** to the [`initialize`](#initialize) +method. In this case, all labels found in the sample will be automatically added +to the model, and the output dimension will be +[inferred](/usage/layers-architectures#thinc-shape-inference) automatically. > #### Example > diff --git a/website/docs/api/language.md b/website/docs/api/language.md index dd3cc57dd..11631502c 100644 --- a/website/docs/api/language.md +++ b/website/docs/api/language.md @@ -201,30 +201,31 @@ more efficient than processing texts one-by-one. | `n_process` 2.2.2 | Number of processors to use. Defaults to `1`. ~~int~~ | | **YIELDS** | Documents in the order of the original text. ~~Doc~~ | -## Language.begin_training {#begin_training tag="method"} +## Language.initialize {#initialize tag="method"} Initialize the pipeline for training and return an [`Optimizer`](https://thinc.ai/docs/api-optimizers). `get_examples` should be a function that returns an iterable of [`Example`](/api/example) objects. The data examples can either be the full training data or a representative sample. They are used to **initialize the models** of trainable pipeline components and are -passed each component's [`begin_training`](/api/pipe#begin_training) method, if +passed each component's [`initialize`](/api/pipe#initialize) method, if available. 
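As an illustration of the `get_examples` contract described here — a no-argument callable returning `Example` objects — the following sketch mirrors the `test_initialize_examples` tests changed earlier in this patch; the sentence data is invented for the example.

```python
# Sketch of the get_examples contract for the renamed method (spaCy v3 nightly).
import spacy
from spacy.training import Example

nlp = spacy.blank("en")
nlp.add_pipe("senter")
examples = [
    Example.from_dict(
        nlp.make_doc("This is a sentence. This is another one."),
        {"sent_starts": [1, 0, 0, 0, 0, 1, 0, 0, 0, 0]},
    )
]

nlp.initialize(get_examples=lambda: examples)  # OK: callable returning Examples
# nlp.initialize(get_examples=examples)        # ValueError: not a callable
# nlp.initialize(get_examples=lambda: None)    # TypeError: doesn't yield Examples
```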
Initialization includes validating the network, [inferring missing shapes](/usage/layers-architectures#thinc-shape-inference) and setting up the label scheme based on the data. -If no `get_examples` function is provided when calling `nlp.begin_training`, the +If no `get_examples` function is provided when calling `nlp.initialize`, the pipeline components will be initialized with generic data. In this case, it is crucial that the output dimension of each component has already been defined either in the [config](/usage/training#config), or by calling [`pipe.add_label`](/api/pipe#add_label) for each possible output label (e.g. for the tagger or textcat). - + -The `Language.update` method now takes a **function** that is called with no -arguments and returns a sequence of [`Example`](/api/example) objects instead of -tuples of `Doc` and `GoldParse` objects. +This method was previously called `begin_training`. It now also takes a +**function** that is called with no arguments and returns a sequence of +[`Example`](/api/example) objects instead of tuples of `Doc` and `GoldParse` +objects. @@ -232,7 +233,7 @@ tuples of `Doc` and `GoldParse` objects. > > ```python > get_examples = lambda: examples -> optimizer = nlp.begin_training(get_examples) +> optimizer = nlp.initialize(get_examples) > ``` | Name | Description | @@ -636,13 +637,13 @@ list, will be disabled. Under the hood, this method calls into > > ```python > with nlp.select_pipes(disable=["tagger", "parser"]): -> nlp.begin_training() +> nlp.initialize() > > with nlp.select_pipes(enable="ner"): -> nlp.begin_training() +> nlp.initialize() > > disabled = nlp.select_pipes(disable=["tagger", "parser"]) -> nlp.begin_training() +> nlp.initialize() > disabled.restore() > ``` diff --git a/website/docs/api/morphologizer.md b/website/docs/api/morphologizer.md index e1a166474..4f00a09ef 100644 --- a/website/docs/api/morphologizer.md +++ b/website/docs/api/morphologizer.md @@ -117,7 +117,7 @@ applied to the `Doc` in order. Both [`__call__`](/api/morphologizer#call) and | `batch_size` | The number of documents to buffer. Defaults to `128`. ~~int~~ | | **YIELDS** | The processed documents in order. ~~Doc~~ | -## Morphologizer.begin_training {#begin_training tag="method"} +## Morphologizer.initialize {#initialize tag="method"} Initialize the component for training and return an [`Optimizer`](https://thinc.ai/docs/api-optimizers). `get_examples` should be a @@ -133,7 +133,7 @@ setting up the label scheme based on the data. > ```python > morphologizer = nlp.add_pipe("morphologizer") > nlp.pipeline.append(morphologizer) -> optimizer = morphologizer.begin_training(lambda: [], pipeline=nlp.pipeline) +> optimizer = morphologizer.initialize(lambda: [], pipeline=nlp.pipeline) > ``` | Name | Description | @@ -189,7 +189,7 @@ Delegates to [`predict`](/api/morphologizer#predict) and > > ```python > morphologizer = nlp.add_pipe("morphologizer") -> optimizer = nlp.begin_training() +> optimizer = nlp.initialize() > losses = morphologizer.update(examples, sgd=optimizer) > ``` @@ -259,12 +259,11 @@ context, the original parameters are restored. Add a new label to the pipe. If the `Morphologizer` should set annotations for both `pos` and `morph`, the label should include the UPOS as the feature `POS`. Raises an error if the output dimension is already set, or if the model has -already been fully [initialized](#begin_training). Note that you don't have to -call this method if you provide a **representative data sample** to the -[`begin_training`](#begin_training) method. 
In this case, all labels found in -the sample will be automatically added to the model, and the output dimension -will be [inferred](/usage/layers-architectures#thinc-shape-inference) -automatically. +already been fully [initialized](#initialize). Note that you don't have to call +this method if you provide a **representative data sample** to the +[`initialize`](#initialize) method. In this case, all labels found in the sample +will be automatically added to the model, and the output dimension will be +[inferred](/usage/layers-architectures#thinc-shape-inference) automatically. > #### Example > diff --git a/website/docs/api/pipe.md b/website/docs/api/pipe.md index e4e1e97f1..17752ed5e 100644 --- a/website/docs/api/pipe.md +++ b/website/docs/api/pipe.md @@ -98,7 +98,7 @@ applied to the `Doc` in order. Both [`__call__`](/api/pipe#call) and | `batch_size` | The number of documents to buffer. Defaults to `128`. ~~int~~ | | **YIELDS** | The processed documents in order. ~~Doc~~ | -## Pipe.begin_training {#begin_training tag="method"} +## Pipe.initialize {#initialize tag="method"} Initialize the component for training and return an [`Optimizer`](https://thinc.ai/docs/api-optimizers). `get_examples` should be a @@ -109,11 +109,17 @@ validating the network, [inferring missing shapes](https://thinc.ai/docs/usage-models#validation) and setting up the label scheme based on the data. + + +This method was previously called `begin_training`. + + + > #### Example > > ```python > pipe = nlp.add_pipe("your_custom_pipe") -> optimizer = pipe.begin_training(lambda: [], pipeline=nlp.pipeline) +> optimizer = pipe.initialize(lambda: [], pipeline=nlp.pipeline) > ``` | Name | Description | @@ -180,7 +186,7 @@ predictions and gold-standard annotations, and update the component's model. > > ```python > pipe = nlp.add_pipe("your_custom_pipe") -> optimizer = nlp.begin_training() +> optimizer = nlp.initialize() > losses = pipe.update(examples, sgd=optimizer) > ``` @@ -296,9 +302,9 @@ context, the original parameters are restored. Add a new label to the pipe, to be predicted by the model. The actual implementation depends on the specific component, but in general `add_label` shouldn't be called if the output dimension is already set, or if the model has -already been fully [initialized](#begin_training). If these conditions are -violated, the function will raise an Error. The exception to this rule is when -the component is [resizable](#is_resizable), in which case +already been fully [initialized](#initialize). If these conditions are violated, +the function will raise an Error. The exception to this rule is when the +component is [resizable](#is_resizable), in which case [`set_output`](#set_output) should be called to ensure that the model is properly resized. @@ -314,9 +320,9 @@ This method needs to be overwritten with your own custom `add_label` method. | **RETURNS** | 0 if the label is already present, otherwise 1. ~~int~~ | Note that in general, you don't have to call `pipe.add_label` if you provide a -representative data sample to the [`begin_training`](#begin_training) method. In -this case, all labels found in the sample will be automatically added to the -model, and the output dimension will be +representative data sample to the [`initialize`](#initialize) method. In this +case, all labels found in the sample will be automatically added to the model, +and the output dimension will be [inferred](/usage/layers-architectures#thinc-shape-inference) automatically. 
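A short sketch of the two initialization paths this `add_label` note describes, modeled on the tagger tests earlier in the patch (the tag set and example sentence are illustrative):

```python
# Sketch: explicit add_label() versus letting initialize() collect labels from a sample.
import spacy
from spacy.training import Example

# Option 1: no data sample available, so labels are declared up front
nlp = spacy.blank("en")
tagger = nlp.add_pipe("tagger")
tagger.add_label("N")
tagger.add_label("V")
nlp.initialize()
assert tagger.model.get_dim("nO") == 2

# Option 2: a representative sample is passed and the labels are inferred
nlp2 = spacy.blank("en")
nlp2.add_pipe("tagger")
examples = [Example.from_dict(nlp2.make_doc("I like cats"), {"tags": ["N", "V", "N"]})]
nlp2.initialize(get_examples=lambda: examples)
assert set(nlp2.get_pipe("tagger").labels) == {"N", "V"}
```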
## Pipe.is_resizable {#is_resizable tag="method"} diff --git a/website/docs/api/sentencerecognizer.md b/website/docs/api/sentencerecognizer.md index acf94fb8e..d81725343 100644 --- a/website/docs/api/sentencerecognizer.md +++ b/website/docs/api/sentencerecognizer.md @@ -114,7 +114,7 @@ and [`pipe`](/api/sentencerecognizer#pipe) delegate to the | `batch_size` | The number of documents to buffer. Defaults to `128`. ~~int~~ | | **YIELDS** | The processed documents in order. ~~Doc~~ | -## SentenceRecognizer.begin_training {#begin_training tag="method"} +## SentenceRecognizer.initialize {#initialize tag="method"} Initialize the component for training and return an [`Optimizer`](https://thinc.ai/docs/api-optimizers). `get_examples` should be a @@ -129,7 +129,7 @@ setting up the label scheme based on the data. > > ```python > senter = nlp.add_pipe("senter") -> optimizer = senter.begin_training(lambda: [], pipeline=nlp.pipeline) +> optimizer = senter.initialize(lambda: [], pipeline=nlp.pipeline) > ``` | Name | Description | @@ -185,7 +185,7 @@ Delegates to [`predict`](/api/sentencerecognizer#predict) and > > ```python > senter = nlp.add_pipe("senter") -> optimizer = nlp.begin_training() +> optimizer = nlp.initialize() > losses = senter.update(examples, sgd=optimizer) > ``` diff --git a/website/docs/api/tagger.md b/website/docs/api/tagger.md index d428d376e..6ca554f49 100644 --- a/website/docs/api/tagger.md +++ b/website/docs/api/tagger.md @@ -112,7 +112,7 @@ applied to the `Doc` in order. Both [`__call__`](/api/tagger#call) and | `batch_size` | The number of documents to buffer. Defaults to `128`. ~~int~~ | | **YIELDS** | The processed documents in order. ~~Doc~~ | -## Tagger.begin_training {#begin_training tag="method"} +## Tagger.initialize {#initialize tag="method"} Initialize the component for training and return an [`Optimizer`](https://thinc.ai/docs/api-optimizers). `get_examples` should be a @@ -123,11 +123,17 @@ validating the network, [inferring missing shapes](https://thinc.ai/docs/usage-models#validation) and setting up the label scheme based on the data. + + +This method was previously called `begin_training`. + + + > #### Example > > ```python > tagger = nlp.add_pipe("tagger") -> optimizer = tagger.begin_training(lambda: [], pipeline=nlp.pipeline) +> optimizer = tagger.initialize(lambda: [], pipeline=nlp.pipeline) > ``` | Name | Description | @@ -183,7 +189,7 @@ Delegates to [`predict`](/api/tagger#predict) and > > ```python > tagger = nlp.add_pipe("tagger") -> optimizer = nlp.begin_training() +> optimizer = nlp.initialize() > losses = tagger.update(examples, sgd=optimizer) > ``` @@ -289,12 +295,12 @@ context, the original parameters are restored. ## Tagger.add_label {#add_label tag="method"} Add a new label to the pipe. Raises an error if the output dimension is already -set, or if the model has already been fully [initialized](#begin_training). Note +set, or if the model has already been fully [initialized](#initialize). Note that you don't have to call this method if you provide a **representative data -sample** to the [`begin_training`](#begin_training) method. In this case, all -labels found in the sample will be automatically added to the model, and the -output dimension will be -[inferred](/usage/layers-architectures#thinc-shape-inference) automatically. +sample** to the [`initialize`](#initialize) method. 
In this case, all labels +found in the sample will be automatically added to the model, and the output +dimension will be [inferred](/usage/layers-architectures#thinc-shape-inference) +automatically. > #### Example > diff --git a/website/docs/api/textcategorizer.md b/website/docs/api/textcategorizer.md index b68039094..4c99d6984 100644 --- a/website/docs/api/textcategorizer.md +++ b/website/docs/api/textcategorizer.md @@ -125,7 +125,7 @@ applied to the `Doc` in order. Both [`__call__`](/api/textcategorizer#call) and | `batch_size` | The number of documents to buffer. Defaults to `128`. ~~int~~ | | **YIELDS** | The processed documents in order. ~~Doc~~ | -## TextCategorizer.begin_training {#begin_training tag="method"} +## TextCategorizer.initialize {#initialize tag="method"} Initialize the component for training and return an [`Optimizer`](https://thinc.ai/docs/api-optimizers). `get_examples` should be a @@ -136,11 +136,17 @@ validating the network, [inferring missing shapes](https://thinc.ai/docs/usage-models#validation) and setting up the label scheme based on the data. + + +This method was previously called `begin_training`. + + + > #### Example > > ```python > textcat = nlp.add_pipe("textcat") -> optimizer = textcat.begin_training(lambda: [], pipeline=nlp.pipeline) +> optimizer = textcat.initialize(lambda: [], pipeline=nlp.pipeline) > ``` | Name | Description | @@ -196,14 +202,14 @@ Delegates to [`predict`](/api/textcategorizer#predict) and > > ```python > textcat = nlp.add_pipe("textcat") -> optimizer = nlp.begin_training() +> optimizer = nlp.initialize() > losses = textcat.update(examples, sgd=optimizer) > ``` | Name | Description | | ----------------- | ---------------------------------------------------------------------------------------------------------------------------------- | | `examples` | A batch of [`Example`](/api/example) objects to learn from. ~~Iterable[Example]~~ | -| _keyword-only_ | | +| _keyword-only_ | | | `drop` | The dropout rate. ~~float~~ | | `set_annotations` | Whether or not to update the `Example` objects with the predictions, delegating to [`set_annotations`](#set_annotations). ~~bool~~ | | `sgd` | An optimizer. Will be created via [`create_optimizer`](#create_optimizer) if not set. ~~Optional[Optimizer]~~ | @@ -227,7 +233,7 @@ the "catastrophic forgetting" problem. This feature is experimental. | Name | Description | | -------------- | ------------------------------------------------------------------------------------------------------------------------ | | `examples` | A batch of [`Example`](/api/example) objects to learn from. ~~Iterable[Example]~~ | -| _keyword-only_ | | +| _keyword-only_ | | | `drop` | The dropout rate. ~~float~~ | | `sgd` | An optimizer. Will be created via [`create_optimizer`](#create_optimizer) if not set. ~~Optional[Optimizer]~~ | | `losses` | Optional record of the loss during training. Updated using the component name as the key. ~~Optional[Dict[str, float]]~~ | @@ -303,12 +309,12 @@ Modify the pipe's model, to use the given parameter values. ## TextCategorizer.add_label {#add_label tag="method"} Add a new label to the pipe. Raises an error if the output dimension is already -set, or if the model has already been fully [initialized](#begin_training). Note +set, or if the model has already been fully [initialized](#initialize). Note that you don't have to call this method if you provide a **representative data -sample** to the [`begin_training`](#begin_training) method. 
In this case, all -labels found in the sample will be automatically added to the model, and the -output dimension will be -[inferred](/usage/layers-architectures#thinc-shape-inference) automatically. +sample** to the [`initialize`](#initialize) method. In this case, all labels +found in the sample will be automatically added to the model, and the output +dimension will be [inferred](/usage/layers-architectures#thinc-shape-inference) +automatically. > #### Example > diff --git a/website/docs/api/tok2vec.md b/website/docs/api/tok2vec.md index 5c7214edc..8269ad7cf 100644 --- a/website/docs/api/tok2vec.md +++ b/website/docs/api/tok2vec.md @@ -123,7 +123,7 @@ and [`set_annotations`](/api/tok2vec#set_annotations) methods. | `batch_size` | The number of documents to buffer. Defaults to `128`. ~~int~~ | | **YIELDS** | The processed documents in order. ~~Doc~~ | -## Tok2Vec.begin_training {#begin_training tag="method"} +## Tok2Vec.initialize {#initialize tag="method"} Initialize the component for training and return an [`Optimizer`](https://thinc.ai/docs/api-optimizers). `get_examples` should be a @@ -138,7 +138,7 @@ setting up the label scheme based on the data. > > ```python > tok2vec = nlp.add_pipe("tok2vec") -> optimizer = tok2vec.begin_training(lambda: [], pipeline=nlp.pipeline) +> optimizer = tok2vec.initialize(lambda: [], pipeline=nlp.pipeline) > ``` | Name | Description | @@ -193,7 +193,7 @@ Delegates to [`predict`](/api/tok2vec#predict). > > ```python > tok2vec = nlp.add_pipe("tok2vec") -> optimizer = nlp.begin_training() +> optimizer = nlp.initialize() > losses = tok2vec.update(examples, sgd=optimizer) > ``` diff --git a/website/docs/api/transformer.md b/website/docs/api/transformer.md index d5bcef229..712214fec 100644 --- a/website/docs/api/transformer.md +++ b/website/docs/api/transformer.md @@ -158,7 +158,7 @@ applied to the `Doc` in order. Both [`__call__`](/api/transformer#call) and | `batch_size` | The number of documents to buffer. Defaults to `128`. ~~int~~ | | **YIELDS** | The processed documents in order. ~~Doc~~ | -## Transformer.begin_training {#begin_training tag="method"} +## Transformer.initialize {#initialize tag="method"} Initialize the component for training and return an [`Optimizer`](https://thinc.ai/docs/api-optimizers). `get_examples` should be a @@ -173,7 +173,7 @@ setting up the label scheme based on the data. > > ```python > trf = nlp.add_pipe("transformer") -> optimizer = trf.begin_training(lambda: [], pipeline=nlp.pipeline) +> optimizer = trf.initialize(lambda: [], pipeline=nlp.pipeline) > ``` | Name | Description | @@ -241,7 +241,7 @@ and call the optimizer, while the others simply increment the gradients. > > ```python > trf = nlp.add_pipe("transformer") -> optimizer = nlp.begin_training() +> optimizer = nlp.initialize() > losses = trf.update(examples, sgd=optimizer) > ``` diff --git a/website/docs/usage/layers-architectures.md b/website/docs/usage/layers-architectures.md index a58ba2ba9..b65c3d903 100644 --- a/website/docs/usage/layers-architectures.md +++ b/website/docs/usage/layers-architectures.md @@ -460,8 +460,8 @@ The built-in [pipeline components](/usage/processing-pipelines) in spaCy ensure that their internal models are **always initialized** with appropriate sample data. In this case, `X` is typically a ~~List[Doc]~~, while `Y` is typically a ~~List[Array1d]~~ or ~~List[Array2d]~~, depending on the specific task. This -functionality is triggered when -[`nlp.begin_training`](/api/language#begin_training) is called. 
+functionality is triggered when [`nlp.initialize`](/api/language#initialize) is +called. ### Dropout and normalization in Thinc {#thinc-dropout-norm} @@ -491,7 +491,7 @@ with Model.define_operators({">>": chain}): diff --git a/website/docs/usage/processing-pipelines.md b/website/docs/usage/processing-pipelines.md index dbf0881ac..b1cf2723b 100644 --- a/website/docs/usage/processing-pipelines.md +++ b/website/docs/usage/processing-pipelines.md @@ -1126,12 +1126,12 @@ For some use cases, it makes sense to also overwrite additional methods to customize how the model is updated from examples, how it's initialized, how the loss is calculated and to add evaluation scores to the training output. -| Name | Description | -| -------------------------------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ | -| [`update`](/api/pipe#update) | Learn from a batch of [`Example`](/api/example) objects containing the predictions and gold-standard annotations, and update the component's model. | -| [`begin_training`](/api/pipe#begin_training) | Initialize the model. Typically calls into [`Model.initialize`](https://thinc.ai/docs/api-model#initialize) and [`Pipe.create_optimizer`](/api/pipe#create_optimizer) if no optimizer is provided. | -| [`get_loss`](/api/pipe#get_loss) | Return a tuple of the loss and the gradient for a batch of [`Example`](/api/example) objects. | -| [`score`](/api/pipe#score) | Score a batch of [`Example`](/api/example) objects and return a dictionary of scores. The [`@Language.factory`](/api/language#factory) decorator can define the `default_socre_weights` of the component to decide which keys of the scores to display during training and how they count towards the final score. | +| Name | Description | +| ------------------------------------ | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ | +| [`update`](/api/pipe#update) | Learn from a batch of [`Example`](/api/example) objects containing the predictions and gold-standard annotations, and update the component's model. | +| [`initialize`](/api/pipe#initialize) | Initialize the model. Typically calls into [`Model.initialize`](https://thinc.ai/docs/api-model#initialize) and [`Pipe.create_optimizer`](/api/pipe#create_optimizer) if no optimizer is provided. | +| [`get_loss`](/api/pipe#get_loss) | Return a tuple of the loss and the gradient for a batch of [`Example`](/api/example) objects. | +| [`score`](/api/pipe#score) | Score a batch of [`Example`](/api/example) objects and return a dictionary of scores. The [`@Language.factory`](/api/language#factory) decorator can define the `default_socre_weights` of the component to decide which keys of the scores to display during training and how they count towards the final score. | diff --git a/website/docs/usage/training.md b/website/docs/usage/training.md index 54be6b367..1c1b92e03 100644 --- a/website/docs/usage/training.md +++ b/website/docs/usage/training.md @@ -1045,8 +1045,8 @@ of being dropped. 
> - [`nlp`](/api/language): The `nlp` object with the pipeline components and > their models. -> - [`nlp.begin_training`](/api/language#begin_training): Start the training and -> return an optimizer to update the component model weights. +> - [`nlp.initialize`](/api/language#initialize): Start the training and return +> an optimizer to update the component model weights. > - [`Optimizer`](https://thinc.ai/docs/api-optimizers): Function that holds > state between updates. > - [`nlp.update`](/api/language#update): Update component models with examples. @@ -1057,7 +1057,7 @@ of being dropped. ```python ### Example training loop -optimizer = nlp.begin_training() +optimizer = nlp.initialize() for itn in range(100): random.shuffle(train_data) for raw_text, entity_offsets in train_data: diff --git a/website/docs/usage/v3.md b/website/docs/usage/v3.md index 94c50e1ec..44f902cd5 100644 --- a/website/docs/usage/v3.md +++ b/website/docs/usage/v3.md @@ -526,10 +526,11 @@ Note that spaCy v3.0 now requires **Python 3.6+**. [`Pipe.update`](/api/pipe#update) methods now all take batches of [`Example`](/api/example) objects instead of `Doc` and `GoldParse` objects, or raw text and a dictionary of annotations. - [`Language.begin_training`](/api/language#begin_training) and - [`Pipe.begin_training`](/api/pipe#begin_training) now take a function that - returns a sequence of `Example` objects to initialize the model instead of a - list of tuples. + [`Language.initialize`](/api/language#initialize) and + [`Pipe.initialize`](/api/pipe#initialize) now take a function that returns a + sequence of `Example` objects to initialize the model instead of a list of + tuples. +- The `begin_training` methods have been renamed to `initialize`. - [`Matcher.add`](/api/matcher#add) and [`PhraseMatcher.add`](/api/phrasematcher#add) now only accept a list of patterns as the second argument (instead of a variable number of arguments). @@ -555,6 +556,7 @@ Note that spaCy v3.0 now requires **Python 3.6+**. | Removed | Replacement | | -------------------------------------------------------------------------------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ | | `Language.disable_pipes` | [`Language.select_pipes`](/api/language#select_pipes), [`Language.disable_pipe`](/api/language#disable_pipe) | +| `Language.begin_training`, `Pipe.begin_training`, ... | [`Language.initialize`](/api/language#initialize), [`Pipe.initialize`](/api/pipe#initialize), ... | | `Doc.is_tagged`, `Doc.is_parsed`, ... | [`Doc.has_annotation`](/api/doc#has_annotation) | | `GoldParse` | [`Example`](/api/example) | | `GoldCorpus` | [`Corpus`](/api/corpus) | @@ -936,7 +938,7 @@ TRAIN_DATA = [ ("Who is Shaka Khan?", {"entities": [(7, 17, "PERSON")]}), ("I like London.", {"entities": [(7, 13, "LOC")]}), ] -nlp.begin_training() +nlp.initialize() for i in range(20): random.shuffle(TRAIN_DATA) for batch in minibatch(TRAIN_DATA): @@ -946,17 +948,18 @@ for i in range(20): nlp.update(examples) ``` -[`Language.begin_training`](/api/language#begin_training) and -[`Pipe.begin_training`](/api/pipe#begin_training) now take a function that -returns a sequence of `Example` objects to initialize the model instead of a -list of tuples. 
The data examples are used to **initialize the models** of +`Language.begin_training` and `Pipe.begin_training` have been renamed to +[`Language.initialize`](/api/language#initialize) and +[`Pipe.initialize`](/api/pipe#initialize), and the methods now take a function +that returns a sequence of `Example` objects to initialize the model instead of +a list of tuples. The data examples are used to **initialize the models** of trainable pipeline components, which includes validating the network, [inferring missing shapes](https://thinc.ai/docs/usage-models#validation) and setting up the label scheme. ```diff -- nlp.begin_training(examples) -+ nlp.begin_training(lambda: examples) +- nlp.initialize(examples) ++ nlp.initialize(lambda: examples) ``` #### Packaging trained pipelines {#migrating-training-packaging} From 4925ad760a87d84b7cc4bb2fb48b45845a2e0c30 Mon Sep 17 00:00:00 2001 From: Ines Montani Date: Tue, 29 Sep 2020 10:58:50 +0200 Subject: [PATCH 28/66] Add init vectors --- spacy/cli/init_pipeline.py | 26 +++++++- spacy/training/initialize.py | 117 +++++++++++++++++++++++++++++++++-- 2 files changed, 138 insertions(+), 5 deletions(-) diff --git a/spacy/cli/init_pipeline.py b/spacy/cli/init_pipeline.py index a92705cb0..0e9de0eb4 100644 --- a/spacy/cli/init_pipeline.py +++ b/spacy/cli/init_pipeline.py @@ -5,11 +5,35 @@ from wasabi import msg import typer from .. import util -from ..training.initialize import init_nlp +from ..training.initialize import init_nlp, convert_vectors from ._util import init_cli, Arg, Opt, parse_config_overrides, show_validation_error from ._util import import_code, setup_gpu +@init_cli.command("vectors") +def init_vectors_cli( + # fmt: off + lang: str = Arg(..., help="The language of the nlp object to create"), + vectors_loc: Path = Arg(..., help="Vectors file in Word2Vec format", exists=True), + output_dir: Path = Arg(..., help="Pipeline output directory"), + prune: int = Opt(-1, "--prune", "-p", help="Optional number of vectors to prune to"), + truncate: int = Opt(0, "--truncate", "-t", help="Optional number of vectors to truncate to when reading in vectors file"), + name: Optional[str] = Opt(None, "--name", "-n", help="Optional name for the word vectors, e.g. en_core_web_lg.vectors"), + # fmt: on +): + msg.info(f"Creating blank nlp object for language '{lang}'") + nlp = util.get_lang_class(lang)() + convert_vectors( + nlp, vectors_loc, truncate=truncate, prune=prune, name=name, silent=False + ) + nlp.to_disk(output_dir) + msg.good( + "Saved nlp object with vectors to output directory. 
You can now use the " + "path to it in your config as the 'vectors' setting in [initialize.vocab].", + output_dir, + ) + + @init_cli.command( "nlp", context_settings={"allow_extra_args": True, "ignore_unknown_options": True}, diff --git a/spacy/training/initialize.py b/spacy/training/initialize.py index 23debfb28..9a47a7f69 100644 --- a/spacy/training/initialize.py +++ b/spacy/training/initialize.py @@ -1,13 +1,19 @@ -from typing import Union, Dict, Optional, Any, List +from typing import Union, Dict, Optional, Any, List, IO from thinc.api import Config, fix_random_seed, set_gpu_allocator from thinc.api import ConfigValidationError from pathlib import Path from wasabi import Printer import srsly +import numpy +import tarfile +import gzip +import zipfile +import tqdm from .loop import create_before_to_disk_callback from ..language import Language from ..lookups import Lookups +from ..vectors import Vectors from ..errors import Errors from ..schemas import ConfigSchemaTraining, ConfigSchemaInit, ConfigSchemaPretrain from ..util import registry, load_model_from_config, resolve_dot_names @@ -49,8 +55,10 @@ def init_nlp(config: Config, *, use_gpu: int = -1, silent: bool = True) -> Langu msg.info(f"Resuming training for: {resume_components}") nlp.resume_training(sgd=optimizer) with nlp.select_pipes(disable=[*frozen_components, *resume_components]): - nlp.initialize(lambda: train_corpus(nlp), sgd=optimizer) - msg.good(f"Initialized pipeline components") + nlp.initialize( + lambda: train_corpus(nlp), sgd=optimizer, settings=I["components"] + ) + msg.good("Initialized pipeline components") # Verify the config after calling 'initialize' to ensure labels # are properly initialized verify_config(nlp) @@ -103,7 +111,7 @@ def init_vocab( def load_vectors_into_model( - nlp: "Language", name: Union[str, Path], *, add_strings: bool = True + nlp: Language, name: Union[str, Path], *, add_strings: bool = True ) -> None: """Load word vectors from an installed model or path into a model instance.""" try: @@ -202,3 +210,104 @@ def get_sourced_components(config: Union[Dict[str, Any], Config]) -> List[str]: for name, cfg in config.get("components", {}).items() if "factory" not in cfg and "source" in cfg ] + + +def convert_vectors( + nlp: Language, + vectors_loc: Optional[Path], + *, + truncate: int, + prune: int, + name: Optional[str] = None, + silent: bool = True, +) -> None: + msg = Printer(no_print=silent) + vectors_loc = ensure_path(vectors_loc) + if vectors_loc and vectors_loc.parts[-1].endswith(".npz"): + nlp.vocab.vectors = Vectors(data=numpy.load(vectors_loc.open("rb"))) + for lex in nlp.vocab: + if lex.rank and lex.rank != OOV_RANK: + nlp.vocab.vectors.add(lex.orth, row=lex.rank) + else: + if vectors_loc: + with msg.loading(f"Reading vectors from {vectors_loc}"): + vectors_data, vector_keys = read_vectors(vectors_loc, truncate) + msg.good(f"Loaded vectors from {vectors_loc}") + else: + vectors_data, vector_keys = (None, None) + if vector_keys is not None: + for word in vector_keys: + if word not in nlp.vocab: + nlp.vocab[word] + if vectors_data is not None: + nlp.vocab.vectors = Vectors(data=vectors_data, keys=vector_keys) + if name is None: + # TODO: Is this correct? Does this matter? 
+ nlp.vocab.vectors.name = f"{nlp.meta['lang']}_{nlp.meta['name']}.vectors" + else: + nlp.vocab.vectors.name = name + nlp.meta["vectors"]["name"] = nlp.vocab.vectors.name + if prune >= 1: + nlp.vocab.prune_vectors(prune) + msg.good(f"Successfully converted {len(nlp.vocab.vectors)} vectors") + + +def read_vectors(vectors_loc: Path, truncate_vectors: int): + f = open_file(vectors_loc) + f = ensure_shape(f) + shape = tuple(int(size) for size in next(f).split()) + if truncate_vectors >= 1: + shape = (truncate_vectors, shape[1]) + vectors_data = numpy.zeros(shape=shape, dtype="f") + vectors_keys = [] + for i, line in enumerate(tqdm.tqdm(f)): + line = line.rstrip() + pieces = line.rsplit(" ", vectors_data.shape[1]) + word = pieces.pop(0) + if len(pieces) != vectors_data.shape[1]: + raise ValueError(Errors.E094.format(line_num=i, loc=vectors_loc)) + vectors_data[i] = numpy.asarray(pieces, dtype="f") + vectors_keys.append(word) + if i == truncate_vectors - 1: + break + return vectors_data, vectors_keys + + +def open_file(loc: Union[str, Path]) -> IO: + """Handle .gz, .tar.gz or unzipped files""" + loc = ensure_path(loc) + if tarfile.is_tarfile(str(loc)): + return tarfile.open(str(loc), "r:gz") + elif loc.parts[-1].endswith("gz"): + return (line.decode("utf8") for line in gzip.open(str(loc), "r")) + elif loc.parts[-1].endswith("zip"): + zip_file = zipfile.ZipFile(str(loc)) + names = zip_file.namelist() + file_ = zip_file.open(names[0]) + return (line.decode("utf8") for line in file_) + else: + return loc.open("r", encoding="utf8") + + +def ensure_shape(lines): + """Ensure that the first line of the data is the vectors shape. + If it's not, we read in the data and output the shape as the first result, + so that the reader doesn't have to deal with the problem. + """ + first_line = next(lines) + try: + shape = tuple(int(size) for size in first_line.split()) + except ValueError: + shape = None + if shape is not None: + # All good, give the data + yield first_line + yield from lines + else: + # Figure out the shape, make it the first value, and then give the + # rest of the data. 
+ width = len(first_line.split()) - 1 + captured = [first_line] + list(lines) + length = len(captured) + yield f"{length} {width}" + yield from captured From 5276db6f3f4f44eb98cf984e7e54f9790b00d08e Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Tue, 29 Sep 2020 11:42:19 +0200 Subject: [PATCH 29/66] Remove 'device' argument from Language, clean up 'sgd' arg --- spacy/language.py | 57 ++++++++++++++++++++++------------------------- 1 file changed, 27 insertions(+), 30 deletions(-) diff --git a/spacy/language.py b/spacy/language.py index a5b78b178..5b1f50ee2 100644 --- a/spacy/language.py +++ b/spacy/language.py @@ -19,7 +19,7 @@ from .vocab import Vocab, create_vocab from .pipe_analysis import validate_attrs, analyze_pipes, print_pipe_analysis from .training import Example, validate_examples from .scorer import Scorer -from .util import create_default_optimizer, registry, SimpleFrozenList +from .util import registry, SimpleFrozenList from .util import SimpleFrozenDict, combine_score_weights, CONFIG_SECTION_ORDER from .lang.tokenizer_exceptions import URL_MATCH, BASE_EXCEPTIONS from .lang.punctuation import TOKENIZER_PREFIXES, TOKENIZER_SUFFIXES @@ -1065,7 +1065,7 @@ class Language: validate_examples(examples, "Language.update") if sgd is None: if self._optimizer is None: - self._optimizer = create_default_optimizer() + self._optimizer = self.create_optimizer() sgd = self._optimizer if component_cfg is None: component_cfg = {} @@ -1123,7 +1123,7 @@ class Language: validate_examples(examples, "Language.rehearse") if sgd is None: if self._optimizer is None: - self._optimizer = create_default_optimizer() + self._optimizer = self.create_optimizer() sgd = self._optimizer pipes = list(self.pipeline) random.shuffle(pipes) @@ -1161,16 +1161,14 @@ class Language: def initialize( self, get_examples: Optional[Callable[[], Iterable[Example]]] = None, - *, - sgd: Optional[Optimizer] = None, - device: int = -1, - ) -> Optimizer: + sgd: Optional[Optimizer]=None + ) -> None: """Initialize the pipe for training, using data examples if available. get_examples (Callable[[], Iterable[Example]]): Optional function that returns gold-standard Example objects. - sgd (thinc.api.Optimizer): Optional optimizer. Will be created with - create_optimizer if it doesn't exist. + sgd (Optional[Optimizer]): An optimizer to use for updates. If not + provided, will be created using the .create_optimizer() method. RETURNS (thinc.api.Optimizer): The optimizer. DOCS: https://nightly.spacy.io/api/language#initialize @@ -1199,25 +1197,22 @@ class Language: if not valid_examples: err = Errors.E930.format(name="Language", obj="empty list") raise ValueError(err) - if device >= 0: # TODO: do we need this here? 
- require_gpu(device) - if self.vocab.vectors.data.shape[1] >= 1: - ops = get_current_ops() - self.vocab.vectors.data = ops.asarray(self.vocab.vectors.data) - if sgd is None: - sgd = create_default_optimizer() - self._optimizer = sgd + if self.vocab.vectors.data.shape[1] >= 1: + ops = get_current_ops() + self.vocab.vectors.data = ops.asarray(self.vocab.vectors.data) for name, proc in self.pipeline: if hasattr(proc, "initialize"): proc.initialize( - get_examples, pipeline=self.pipeline, sgd=self._optimizer + get_examples, pipeline=self.pipeline ) self._link_components() + if sgd is not None: + self._optimizer = sgd + elif self._optimizer is None: + self._optimizer = self.create_optimizer() return self._optimizer - def resume_training( - self, *, sgd: Optional[Optimizer] = None, device: int = -1 - ) -> Optimizer: + def resume_training(self, *, sgd: Optional[Optimizer] = None) -> Optimizer: """Continue training a pretrained model. Create and return an optimizer, and initialize "rehearsal" for any pipeline @@ -1226,22 +1221,20 @@ class Language: rehearsal, collect samples of text you want the models to retain performance on, and call nlp.rehearse() with a batch of Example objects. - sgd (Optional[Optimizer]): An optimizer. RETURNS (Optimizer): The optimizer. DOCS: https://nightly.spacy.io/api/language#resume_training """ - if device >= 0: # TODO: do we need this here? - require_gpu(device) - ops = get_current_ops() - if self.vocab.vectors.data.shape[1] >= 1: - self.vocab.vectors.data = ops.asarray(self.vocab.vectors.data) - if sgd is None: - sgd = create_default_optimizer() - self._optimizer = sgd + ops = get_current_ops() + if self.vocab.vectors.data.shape[1] >= 1: + self.vocab.vectors.data = ops.asarray(self.vocab.vectors.data) for name, proc in self.pipeline: if hasattr(proc, "_rehearsal_model"): proc._rehearsal_model = deepcopy(proc.model) + if sgd is not None: + self._optimizer = sgd + elif self._optimizer is None: + self._optimizer = self.create_optimizer() return self._optimizer def evaluate( @@ -1302,6 +1295,10 @@ class Language: n_words = sum(len(doc) for doc in docs) results["speed"] = n_words / (end_time - start_time) return results + + def create_optimizer(self): + """Create an optimizer, usually using the [training.optimizer] config.""" + return registry.resolve(self.config["training"]["optimizer"]) @contextmanager def use_params(self, params: Optional[dict]): From b3b6868639f3982f9cbe584784faa7371f7d7b07 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Tue, 29 Sep 2020 11:42:35 +0200 Subject: [PATCH 30/66] Remove 'sgd' arg from component initialize --- spacy/pipeline/morphologizer.pyx | 7 +------ spacy/pipeline/multitask.pyx | 4 ++-- spacy/pipeline/pipe.pyx | 4 +--- spacy/pipeline/senter.pyx | 6 ++---- spacy/pipeline/tagger.pyx | 7 +------ spacy/pipeline/textcat.py | 3 +-- spacy/pipeline/transition_parser.pyx | 9 +++------ 7 files changed, 11 insertions(+), 29 deletions(-) diff --git a/spacy/pipeline/morphologizer.pyx b/spacy/pipeline/morphologizer.pyx index d035172a8..580b6b831 100644 --- a/spacy/pipeline/morphologizer.pyx +++ b/spacy/pipeline/morphologizer.pyx @@ -129,7 +129,7 @@ class Morphologizer(Tagger): self.cfg["labels_pos"][norm_label] = POS_IDS[pos] return 1 - def initialize(self, get_examples, *, pipeline=None, sgd=None): + def initialize(self, get_examples, *, pipeline=None): """Initialize the pipe for training, using a representative set of data examples. 
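
Taken together, the changes above move optimizer creation out of the individual components and onto the `Language` object. The following is a minimal sketch of the resulting flow, not part of the patch itself: the pipeline, the example sentence and its tags are placeholders, and the exact default config is assumed to provide a `[training.optimizer]` block.

```python
import spacy
from spacy.training import Example

nlp = spacy.blank("en")
nlp.add_pipe("tagger")

# A small, representative sample of gold-standard examples (placeholder data).
train_examples = [
    Example.from_dict(nlp.make_doc("I like trees"), {"tags": ["PRON", "VERB", "NOUN"]})
]

# initialize() sets up the component models from the sample and, if no
# optimizer is passed in, creates one via nlp.create_optimizer(), which
# resolves [training.optimizer]. Components no longer create or return
# optimizers themselves.
optimizer = nlp.initialize(lambda: train_examples)
losses = nlp.update(train_examples, sgd=optimizer)
```
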
@@ -138,8 +138,6 @@ class Morphologizer(Tagger): pipeline (List[Tuple[str, Callable]]): Optional list of pipeline components that this component is part of. Corresponds to nlp.pipeline. - sgd (thinc.api.Optimizer): Optional optimizer. Will be created with - create_optimizer if it doesn't exist. RETURNS (thinc.api.Optimizer): The optimizer. DOCS: https://nightly.spacy.io/api/morphologizer#initialize @@ -178,9 +176,6 @@ class Morphologizer(Tagger): assert len(doc_sample) > 0, Errors.E923.format(name=self.name) assert len(label_sample) > 0, Errors.E923.format(name=self.name) self.model.initialize(X=doc_sample, Y=label_sample) - if sgd is None: - sgd = self.create_optimizer() - return sgd def set_annotations(self, docs, batch_tag_ids): """Modify a batch of documents, using pre-computed scores. diff --git a/spacy/pipeline/multitask.pyx b/spacy/pipeline/multitask.pyx index 3fd034b30..ba406dabe 100644 --- a/spacy/pipeline/multitask.pyx +++ b/spacy/pipeline/multitask.pyx @@ -81,7 +81,7 @@ class MultitaskObjective(Tagger): def set_annotations(self, docs, dep_ids): pass - def initialize(self, get_examples, pipeline=None, sgd=None): + def initialize(self, get_examples, pipeline=None): if not hasattr(get_examples, "__call__"): err = Errors.E930.format(name="MultitaskObjective", obj=type(get_examples)) raise ValueError(err) @@ -177,7 +177,7 @@ class ClozeMultitask(Pipe): def set_annotations(self, docs, dep_ids): pass - def initialize(self, get_examples, pipeline=None, sgd=None): + def initialize(self, get_examples, pipeline=None): self.model.initialize() # TODO: fix initialization by defining X and Y X = self.model.ops.alloc((5, self.model.get_ref("tok2vec").get_dim("nO"))) self.model.output_layer.initialize(X) diff --git a/spacy/pipeline/pipe.pyx b/spacy/pipeline/pipe.pyx index bff2be1af..08015e60e 100644 --- a/spacy/pipeline/pipe.pyx +++ b/spacy/pipeline/pipe.pyx @@ -183,7 +183,7 @@ cdef class Pipe: """ return util.create_default_optimizer() - def initialize(self, get_examples, *, pipeline=None, sgd=None): + def initialize(self, get_examples, *, pipeline=None): """Initialize the pipe for training, using data examples if available. This method needs to be implemented by each Pipe component, ensuring the internal model (if available) is initialized properly @@ -194,8 +194,6 @@ cdef class Pipe: pipeline (List[Tuple[str, Callable]]): Optional list of pipeline components that this component is part of. Corresponds to nlp.pipeline. - sgd (thinc.api.Optimizer): Optional optimizer. Will be created with - create_optimizer if it doesn't exist. RETURNS (thinc.api.Optimizer): The optimizer. DOCS: https://nightly.spacy.io/api/pipe#initialize diff --git a/spacy/pipeline/senter.pyx b/spacy/pipeline/senter.pyx index 68a9860a5..91ce9f1bb 100644 --- a/spacy/pipeline/senter.pyx +++ b/spacy/pipeline/senter.pyx @@ -124,7 +124,7 @@ class SentenceRecognizer(Tagger): raise ValueError("nan value when computing loss") return float(loss), d_scores - def initialize(self, get_examples, *, pipeline=None, sgd=None): + def initialize(self, get_examples, *, pipeline=None): """Initialize the pipe for training, using a representative set of data examples. @@ -133,9 +133,7 @@ class SentenceRecognizer(Tagger): pipeline (List[Tuple[str, Callable]]): Optional list of pipeline components that this component is part of. Corresponds to nlp.pipeline. - sgd (thinc.api.Optimizer): Optional optimizer. Will be created with - create_optimizer if it doesn't exist. - RETURNS (thinc.api.Optimizer): The optimizer. 
+ RETURNS: None DOCS: https://nightly.spacy.io/api/sentencerecognizer#initialize """ diff --git a/spacy/pipeline/tagger.pyx b/spacy/pipeline/tagger.pyx index 66f8b38b6..ecf93600e 100644 --- a/spacy/pipeline/tagger.pyx +++ b/spacy/pipeline/tagger.pyx @@ -256,7 +256,7 @@ class Tagger(Pipe): raise ValueError("nan value when computing loss") return float(loss), d_scores - def initialize(self, get_examples, *, pipeline=None, sgd=None): + def initialize(self, get_examples, *, pipeline=None): """Initialize the pipe for training, using a representative set of data examples. @@ -265,8 +265,6 @@ class Tagger(Pipe): pipeline (List[Tuple[str, Callable]]): Optional list of pipeline components that this component is part of. Corresponds to nlp.pipeline. - sgd (thinc.api.Optimizer): Optional optimizer. Will be created with - create_optimizer if it doesn't exist. RETURNS (thinc.api.Optimizer): The optimizer. DOCS: https://nightly.spacy.io/api/tagger#initialize @@ -289,9 +287,6 @@ class Tagger(Pipe): assert len(doc_sample) > 0, Errors.E923.format(name=self.name) assert len(label_sample) > 0, Errors.E923.format(name=self.name) self.model.initialize(X=doc_sample, Y=label_sample) - if sgd is None: - sgd = self.create_optimizer() - return sgd def add_label(self, label): """Add a new label to the pipe. diff --git a/spacy/pipeline/textcat.py b/spacy/pipeline/textcat.py index 37665adfc..67e8777c5 100644 --- a/spacy/pipeline/textcat.py +++ b/spacy/pipeline/textcat.py @@ -338,8 +338,7 @@ class TextCategorizer(Pipe): self, get_examples: Callable[[], Iterable[Example]], *, - pipeline: Optional[List[Tuple[str, Callable[[Doc], Doc]]]] = None, - sgd: Optional[Optimizer] = None, + pipeline: Optional[List[Tuple[str, Callable[[Doc], Doc]]]] = None ) -> Optimizer: """Initialize the pipe for training, using a representative set of data examples. diff --git a/spacy/pipeline/transition_parser.pyx b/spacy/pipeline/transition_parser.pyx index 5a4503cf9..9a2e5d8d0 100644 --- a/spacy/pipeline/transition_parser.pyx +++ b/spacy/pipeline/transition_parser.pyx @@ -354,7 +354,7 @@ cdef class Parser(Pipe): # If all weights for an output are 0 in the original model, don't # supervise that output. This allows us to add classes. 
loss += (d_scores**2).sum() - backprop(d_scores, sgd=sgd) + backprop(d_scores) # Follow the predicted action self.transition_states(states, guesses) states = [state for state in states if not state.is_final()] @@ -405,9 +405,8 @@ cdef class Parser(Pipe): def set_output(self, nO): self.model.attrs["resize_output"](self.model, nO) - def initialize(self, get_examples, pipeline=None, sgd=None, **kwargs): + def initialize(self, get_examples, pipeline=None, settings=None): self._ensure_examples(get_examples) - self.cfg.update(kwargs) lexeme_norms = self.vocab.lookups.get_table("lexeme_norm", {}) if len(lexeme_norms) == 0 and self.vocab.lang in util.LEXEME_NORM_LANGS: langs = ", ".join(util.LEXEME_NORM_LANGS) @@ -425,8 +424,6 @@ cdef class Parser(Pipe): self.moves.initialize_actions(actions) # make sure we resize so we have an appropriate upper layer self._resize() - if sgd is None: - sgd = self.create_optimizer() doc_sample = [] if pipeline is not None: for name, component in pipeline: @@ -442,7 +439,7 @@ cdef class Parser(Pipe): assert len(doc_sample) > 0, Errors.E923.format(name=self.name) self.model.initialize(doc_sample) if pipeline is not None: - self.init_multitask_objectives(get_examples, pipeline, sgd=sgd, **self.cfg) + self.init_multitask_objectives(get_examples, pipeline) return sgd def to_disk(self, path, exclude=tuple()): From dec984a9c1c067bc1538959da44e49df5b715965 Mon Sep 17 00:00:00 2001 From: Ines Montani Date: Tue, 29 Sep 2020 11:52:45 +0200 Subject: [PATCH 31/66] Update Language.initialize and support components/tokenizer settings --- spacy/language.py | 21 ++++++- spacy/pipeline/transition_parser.pyx | 2 +- spacy/schemas.py | 93 +++++++++++++++++++++++++++- 3 files changed, 111 insertions(+), 5 deletions(-) diff --git a/spacy/language.py b/spacy/language.py index a5b78b178..20b7a7256 100644 --- a/spacy/language.py +++ b/spacy/language.py @@ -27,7 +27,7 @@ from .lang.punctuation import TOKENIZER_INFIXES from .tokens import Doc from .tokenizer import Tokenizer from .errors import Errors, Warnings -from .schemas import ConfigSchema, ConfigSchemaNlp +from .schemas import ConfigSchema, ConfigSchemaNlp, validate_init_settings from .git_info import GIT_VERSION from . import util from . 
import about @@ -1162,6 +1162,7 @@ class Language: self, get_examples: Optional[Callable[[], Iterable[Example]]] = None, *, + settings: Dict[str, Dict[str, Any]] = SimpleFrozenDict(), sgd: Optional[Optimizer] = None, device: int = -1, ) -> Optimizer: @@ -1207,10 +1208,26 @@ class Language: if sgd is None: sgd = create_default_optimizer() self._optimizer = sgd + if hasattr(self.tokenizer, "initialize"): + tok_settings = settings.get("tokenizer", {}) + tok_settings = validate_init_settings( + self.tokenizer.initialize, + tok_settings, + section="tokenizer", + name="tokenizer", + ) + self.tokenizer.initialize(get_examples, nlp=self, **tok_settings) for name, proc in self.pipeline: if hasattr(proc, "initialize"): + p_settings = settings.get(name, {}) + p_settings = validate_init_settings( + proc.initialize, p_settings, section="components", name=name + ) proc.initialize( - get_examples, pipeline=self.pipeline, sgd=self._optimizer + get_examples, + pipeline=self.pipeline, + sgd=self._optimizer, + **p_settings, ) self._link_components() return self._optimizer diff --git a/spacy/pipeline/transition_parser.pyx b/spacy/pipeline/transition_parser.pyx index 5a4503cf9..78e3422f6 100644 --- a/spacy/pipeline/transition_parser.pyx +++ b/spacy/pipeline/transition_parser.pyx @@ -1,4 +1,4 @@ -# cython: infer_types=True, cdivision=True, boundscheck=False +# cython: infer_types=True, cdivision=True, boundscheck=False, binding=True from __future__ import print_function from cymem.cymem cimport Pool cimport numpy as np diff --git a/spacy/schemas.py b/spacy/schemas.py index b98498b8b..cdd8c11ed 100644 --- a/spacy/schemas.py +++ b/spacy/schemas.py @@ -1,11 +1,13 @@ from typing import Dict, List, Union, Optional, Any, Callable, Type, Tuple from typing import Iterable, TypeVar, TYPE_CHECKING from enum import Enum -from pydantic import BaseModel, Field, ValidationError, validator +from pydantic import BaseModel, Field, ValidationError, validator, create_model from pydantic import StrictStr, StrictInt, StrictFloat, StrictBool +from pydantic.main import ModelMetaclass +from thinc.api import Optimizer, ConfigValidationError from thinc.config import Promise from collections import defaultdict -from thinc.api import Optimizer +import inspect from .attrs import NAMES from .lookups import Lookups @@ -43,6 +45,93 @@ def validate(schema: Type[BaseModel], obj: Dict[str, Any]) -> List[str]: return [f"[{loc}] {', '.join(msg)}" for loc, msg in data.items()] +# Initialization + + +class ArgSchemaConfig: + extra = "forbid" + arbitrary_types_allowed = True + + +class ArgSchemaConfigExtra: + extra = "forbid" + arbitrary_types_allowed = True + + +def get_arg_model( + func: Callable, + *, + exclude: Iterable[str] = tuple(), + name: str = "ArgModel", + strict: bool = True, +) -> ModelMetaclass: + """Generate a pydantic model for function arguments. + + func (Callable): The function to generate the schema for. + exclude (Iterable[str]): Parameter names to ignore. + name (str): Name of created model class. + strict (bool): Don't allow extra arguments if no variable keyword arguments + are allowed on the function. + RETURNS (ModelMetaclass): A pydantic model. + """ + sig_args = {} + try: + sig = inspect.signature(func) + except ValueError: + # Typically happens if the method is part of a Cython module without + # binding=True. Here we just use an empty model that allows everything. 
+ return create_model(name, __config__=ArgSchemaConfigExtra) + has_variable = False + for param in sig.parameters.values(): + if param.name in exclude: + continue + if param.kind == param.VAR_KEYWORD: + # The function allows variable keyword arguments so we shouldn't + # include **kwargs etc. in the schema and switch to non-strict + # mode and pass through all other values + has_variable = True + continue + # If no annotation is specified assume it's anything + annotation = param.annotation if param.annotation != param.empty else Any + # If no default value is specified assume that it's required + default = param.default if param.default != param.empty else ... + sig_args[param.name] = (annotation, default) + is_strict = strict and not has_variable + sig_args["__config__"] = ArgSchemaConfig if is_strict else ArgSchemaConfigExtra + return create_model(name, **sig_args) + + +def validate_init_settings( + func: Callable, + settings: Dict[str, Any], + *, + section: Optional[str] = None, + name: str = "", + exclude: Iterable[str] = ("get_examples", "pipeline", "sgd"), +) -> Dict[str, Any]: + """Validate initialization settings against the expected arguments in + the method signature. Will parse values if possible (e.g. int to string) + and return the updated settings dict. Will raise a ConfigValidationError + if types don't match or required values are missing. + + func (Callable): The initialize method of a given component etc. + settings (Dict[str, Any]): The settings from the repsective [initialize] block. + section (str): Initialize section, for error message. + name (str): Name of the block in the section. + exclude (Iterable[str]): Parameter names to exclude from schema. + RETURNS (Dict[str, Any]): The validated settings. + """ + schema = get_arg_model(func, exclude=exclude, name="InitArgModel") + try: + return schema(**settings).dict() + except ValidationError as e: + block = "initialize" if not section else f"initialize.{section}" + title = f"Error validating initialization settings in [{block}]" + raise ConfigValidationError( + title=title, errors=e.errors(), config=settings, parent=name, + ) from None + + # Matcher token patterns From 78396d137fa2faced8a0a612ed5009fa52e3b721 Mon Sep 17 00:00:00 2001 From: Ines Montani Date: Tue, 29 Sep 2020 11:57:08 +0200 Subject: [PATCH 32/66] Integrate initialize settings --- spacy/language.py | 3 ++- spacy/training/initialize.py | 4 +--- 2 files changed, 3 insertions(+), 4 deletions(-) diff --git a/spacy/language.py b/spacy/language.py index 20b7a7256..5ba7e38f8 100644 --- a/spacy/language.py +++ b/spacy/language.py @@ -1217,9 +1217,10 @@ class Language: name="tokenizer", ) self.tokenizer.initialize(get_examples, nlp=self, **tok_settings) + proc_settings = settings.get("components", {}) for name, proc in self.pipeline: if hasattr(proc, "initialize"): - p_settings = settings.get(name, {}) + p_settings = proc_settings.get(name, {}) p_settings = validate_init_settings( proc.initialize, p_settings, section="components", name=name ) diff --git a/spacy/training/initialize.py b/spacy/training/initialize.py index 9a47a7f69..b42732d48 100644 --- a/spacy/training/initialize.py +++ b/spacy/training/initialize.py @@ -55,9 +55,7 @@ def init_nlp(config: Config, *, use_gpu: int = -1, silent: bool = True) -> Langu msg.info(f"Resuming training for: {resume_components}") nlp.resume_training(sgd=optimizer) with nlp.select_pipes(disable=[*frozen_components, *resume_components]): - nlp.initialize( - lambda: train_corpus(nlp), sgd=optimizer, settings=I["components"] - ) 
+ nlp.initialize(lambda: train_corpus(nlp), sgd=optimizer, settings=I) msg.good("Initialized pipeline components") # Verify the config after calling 'initialize' to ensure labels # are properly initialized From f2d1b7feb597194746dfd973434a0d683aecd18e Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Tue, 29 Sep 2020 12:00:08 +0200 Subject: [PATCH 33/66] Clean up sgd --- spacy/language.py | 3 ++- spacy/pipeline/multitask.pyx | 6 ------ spacy/pipeline/senter.pyx | 3 --- spacy/pipeline/textcat.py | 5 ----- spacy/pipeline/transition_parser.pyx | 1 - 5 files changed, 2 insertions(+), 16 deletions(-) diff --git a/spacy/language.py b/spacy/language.py index 5b1f50ee2..8d8f3175b 100644 --- a/spacy/language.py +++ b/spacy/language.py @@ -1298,7 +1298,8 @@ class Language: def create_optimizer(self): """Create an optimizer, usually using the [training.optimizer] config.""" - return registry.resolve(self.config["training"]["optimizer"]) + subconfig = {"optimizer": self.config["training"]["optimizer"]} + return registry.resolve(subconfig)["optimizer"] @contextmanager def use_params(self, params: Optional[dict]): diff --git a/spacy/pipeline/multitask.pyx b/spacy/pipeline/multitask.pyx index ba406dabe..d03fd3ae8 100644 --- a/spacy/pipeline/multitask.pyx +++ b/spacy/pipeline/multitask.pyx @@ -91,9 +91,6 @@ class MultitaskObjective(Tagger): if label is not None and label not in self.labels: self.labels[label] = len(self.labels) self.model.initialize() # TODO: fix initialization by defining X and Y - if sgd is None: - sgd = self.create_optimizer() - return sgd def predict(self, docs): tokvecs = self.model.get_ref("tok2vec")(docs) @@ -181,9 +178,6 @@ class ClozeMultitask(Pipe): self.model.initialize() # TODO: fix initialization by defining X and Y X = self.model.ops.alloc((5, self.model.get_ref("tok2vec").get_dim("nO"))) self.model.output_layer.initialize(X) - if sgd is None: - sgd = self.create_optimizer() - return sgd def predict(self, docs): tokvecs = self.model.get_ref("tok2vec")(docs) diff --git a/spacy/pipeline/senter.pyx b/spacy/pipeline/senter.pyx index 91ce9f1bb..76767712f 100644 --- a/spacy/pipeline/senter.pyx +++ b/spacy/pipeline/senter.pyx @@ -149,9 +149,6 @@ class SentenceRecognizer(Tagger): assert len(doc_sample) > 0, Errors.E923.format(name=self.name) assert len(label_sample) > 0, Errors.E923.format(name=self.name) self.model.initialize(X=doc_sample, Y=label_sample) - if sgd is None: - sgd = self.create_optimizer() - return sgd def add_label(self, label, values=None): raise NotImplementedError diff --git a/spacy/pipeline/textcat.py b/spacy/pipeline/textcat.py index 67e8777c5..67ee38217 100644 --- a/spacy/pipeline/textcat.py +++ b/spacy/pipeline/textcat.py @@ -348,8 +348,6 @@ class TextCategorizer(Pipe): pipeline (List[Tuple[str, Callable]]): Optional list of pipeline components that this component is part of. Corresponds to nlp.pipeline. - sgd (thinc.api.Optimizer): Optional optimizer. Will be created with - create_optimizer if it doesn't exist. RETURNS (thinc.api.Optimizer): The optimizer. DOCS: https://nightly.spacy.io/api/textcategorizer#initialize @@ -367,9 +365,6 @@ class TextCategorizer(Pipe): assert len(doc_sample) > 0, Errors.E923.format(name=self.name) assert len(label_sample) > 0, Errors.E923.format(name=self.name) self.model.initialize(X=doc_sample, Y=label_sample) - if sgd is None: - sgd = self.create_optimizer() - return sgd def score(self, examples: Iterable[Example], **kwargs) -> Dict[str, Any]: """Score a batch of examples. 
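
As a rough illustration of how the per-component initialization settings are meant to flow (a sketch only, in the spirit of the tests added later in this series): extra keyword arguments on a component's `initialize` method are filled in from the settings passed to `nlp.initialize` and validated against the method signature by `validate_init_settings`. The component name and the `custom1`/`custom2` arguments below are made up.

```python
from pydantic import StrictBool
from spacy.lang.en import English
from spacy.language import Language
from spacy.training import Example


class MyComponent:
    """Hypothetical stateful component with custom initialization arguments."""

    def __call__(self, doc):
        return doc

    def initialize(self, get_examples, nlp, custom1: str, custom2: StrictBool = False):
        # custom1 and custom2 are filled in from the per-component settings
        # and validated against this signature before the call.
        self.custom1 = custom1
        self.custom2 = custom2


Language.factory("my_component", func=lambda nlp, name: MyComponent())

nlp = English()
nlp.add_pipe("my_component")
get_examples = lambda: [Example.from_dict(nlp.make_doc("x"), {})]
# In the CLI workflow these values would typically come from the [initialize]
# section of the config rather than being passed in manually.
nlp.initialize(get_examples, settings={"components": {"my_component": {"custom1": "value"}}})
```
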
diff --git a/spacy/pipeline/transition_parser.pyx b/spacy/pipeline/transition_parser.pyx index 9a2e5d8d0..65f6fa928 100644 --- a/spacy/pipeline/transition_parser.pyx +++ b/spacy/pipeline/transition_parser.pyx @@ -440,7 +440,6 @@ cdef class Parser(Pipe): self.model.initialize(doc_sample) if pipeline is not None: self.init_multitask_objectives(get_examples, pipeline) - return sgd def to_disk(self, path, exclude=tuple()): serializers = { From 50410c17ac7572fb0eab317cdefe0f55342e5560 Mon Sep 17 00:00:00 2001 From: Ines Montani Date: Tue, 29 Sep 2020 12:05:38 +0200 Subject: [PATCH 34/66] Update schemas.py --- spacy/schemas.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/spacy/schemas.py b/spacy/schemas.py index cdd8c11ed..594fc92ad 100644 --- a/spacy/schemas.py +++ b/spacy/schemas.py @@ -107,7 +107,7 @@ def validate_init_settings( *, section: Optional[str] = None, name: str = "", - exclude: Iterable[str] = ("get_examples", "pipeline", "sgd"), + exclude: Iterable[str] = ("get_examples", "nlp", "pipeline", "sgd"), ) -> Dict[str, Any]: """Validate initialization settings against the expected arguments in the method signature. Will parse values if possible (e.g. int to string) From e1fdf2b7c5ef601c19c008f4dc0f4fa6198c077d Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Tue, 29 Sep 2020 12:05:38 +0200 Subject: [PATCH 35/66] Upd tests --- spacy/tests/parser/test_add_label.py | 2 +- spacy/tests/parser/test_preset_sbd.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/spacy/tests/parser/test_add_label.py b/spacy/tests/parser/test_add_label.py index fb1eabf7d..2f750b60c 100644 --- a/spacy/tests/parser/test_add_label.py +++ b/spacy/tests/parser/test_add_label.py @@ -35,7 +35,7 @@ def test_init_parser(parser): def _train_parser(parser): fix_random_seed(1) parser.add_label("left") - parser.initialize(lambda: [_parser_example(parser)], **parser.cfg) + parser.initialize(lambda: [_parser_example(parser)]) sgd = Adam(0.001) for i in range(5): diff --git a/spacy/tests/parser/test_preset_sbd.py b/spacy/tests/parser/test_preset_sbd.py index d8f861b02..ab58ac17b 100644 --- a/spacy/tests/parser/test_preset_sbd.py +++ b/spacy/tests/parser/test_preset_sbd.py @@ -34,7 +34,7 @@ def parser(vocab): parser.cfg["hidden_width"] = 32 # parser.add_label('right') parser.add_label("left") - parser.initialize(lambda: [_parser_example(parser)], **parser.cfg) + parser.initialize(lambda: [_parser_example(parser)]) sgd = Adam(0.001) for i in range(10): From 42f0e4c946bc5a5e68d3132ed518a15d994e9eb4 Mon Sep 17 00:00:00 2001 From: Ines Montani Date: Tue, 29 Sep 2020 12:14:08 +0200 Subject: [PATCH 36/66] Clean up --- spacy/language.py | 10 ++++------ spacy/pipeline/dep_parser.pyx | 2 +- spacy/pipeline/sentencizer.pyx | 2 +- spacy/schemas.py | 2 +- spacy/training/initialize.py | 2 +- 5 files changed, 8 insertions(+), 10 deletions(-) diff --git a/spacy/language.py b/spacy/language.py index 97a317101..6c0a8394d 100644 --- a/spacy/language.py +++ b/spacy/language.py @@ -8,7 +8,7 @@ from contextlib import contextmanager from copy import deepcopy from pathlib import Path import warnings -from thinc.api import Model, get_current_ops, Config, require_gpu, Optimizer +from thinc.api import Model, get_current_ops, Config, Optimizer import srsly import multiprocessing as mp from itertools import chain, cycle @@ -1153,10 +1153,9 @@ class Language: get_examples: Optional[Callable[[], Iterable[Example]]] = None, *, sgd: Optional[Optimizer] = None, - device: int = -1, ) -> Optimizer: 
warnings.warn(Warnings.W089, DeprecationWarning) - return self.initialize(get_examples, sgd=sgd, device=device) + return self.initialize(get_examples, sgd=sgd) def initialize( self, @@ -1169,7 +1168,7 @@ class Language: get_examples (Callable[[], Iterable[Example]]): Optional function that returns gold-standard Example objects. - sgd (Optional[Optimizer]): An optimizer to use for updates. If not + sgd (Optional[Optimizer]): An optimizer to use for updates. If not provided, will be created using the .create_optimizer() method. RETURNS (thinc.api.Optimizer): The optimizer. @@ -1220,7 +1219,6 @@ class Language: proc.initialize, p_settings, section="components", name=name ) proc.initialize( - get_examples, pipeline=self.pipeline get_examples, pipeline=self.pipeline, **p_settings, @@ -1315,7 +1313,7 @@ class Language: n_words = sum(len(doc) for doc in docs) results["speed"] = n_words / (end_time - start_time) return results - + def create_optimizer(self): """Create an optimizer, usually using the [training.optimizer] config.""" subconfig = {"optimizer": self.config["training"]["optimizer"]} diff --git a/spacy/pipeline/dep_parser.pyx b/spacy/pipeline/dep_parser.pyx index 95effac59..eedb4cba9 100644 --- a/spacy/pipeline/dep_parser.pyx +++ b/spacy/pipeline/dep_parser.pyx @@ -132,7 +132,7 @@ cdef class DependencyParser(Parser): labeller.model.set_dim("nO", len(self.labels)) if labeller.model.has_ref("output_layer"): labeller.model.get_ref("output_layer").set_dim("nO", len(self.labels)) - labeller.initialize(get_examples, pipeline=pipeline, sgd=sgd) + labeller.initialize(get_examples, pipeline=pipeline) @property def labels(self): diff --git a/spacy/pipeline/sentencizer.pyx b/spacy/pipeline/sentencizer.pyx index 0f49033ff..3cd480d20 100644 --- a/spacy/pipeline/sentencizer.pyx +++ b/spacy/pipeline/sentencizer.pyx @@ -58,7 +58,7 @@ class Sentencizer(Pipe): else: self.punct_chars = set(self.default_punct_chars) - def initialize(self, get_examples, pipeline=None, sgd=None): + def initialize(self, get_examples, pipeline=None): pass def __call__(self, doc): diff --git a/spacy/schemas.py b/spacy/schemas.py index 594fc92ad..e183e0a75 100644 --- a/spacy/schemas.py +++ b/spacy/schemas.py @@ -107,7 +107,7 @@ def validate_init_settings( *, section: Optional[str] = None, name: str = "", - exclude: Iterable[str] = ("get_examples", "nlp", "pipeline", "sgd"), + exclude: Iterable[str] = ("get_examples", "nlp", "pipeline"), ) -> Dict[str, Any]: """Validate initialization settings against the expected arguments in the method signature. Will parse values if possible (e.g. 
int to string) diff --git a/spacy/training/initialize.py b/spacy/training/initialize.py index b42732d48..498fd890c 100644 --- a/spacy/training/initialize.py +++ b/spacy/training/initialize.py @@ -55,7 +55,7 @@ def init_nlp(config: Config, *, use_gpu: int = -1, silent: bool = True) -> Langu msg.info(f"Resuming training for: {resume_components}") nlp.resume_training(sgd=optimizer) with nlp.select_pipes(disable=[*frozen_components, *resume_components]): - nlp.initialize(lambda: train_corpus(nlp), sgd=optimizer, settings=I) + nlp.initialize(lambda: train_corpus(nlp), settings=I) msg.good("Initialized pipeline components") # Verify the config after calling 'initialize' to ensure labels # are properly initialized From 612bbf85abb26eacf9b6d41399b1a761f8732f15 Mon Sep 17 00:00:00 2001 From: Ines Montani Date: Tue, 29 Sep 2020 12:14:47 +0200 Subject: [PATCH 37/66] Update initialize.py --- spacy/training/initialize.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/spacy/training/initialize.py b/spacy/training/initialize.py index 498fd890c..b42732d48 100644 --- a/spacy/training/initialize.py +++ b/spacy/training/initialize.py @@ -55,7 +55,7 @@ def init_nlp(config: Config, *, use_gpu: int = -1, silent: bool = True) -> Langu msg.info(f"Resuming training for: {resume_components}") nlp.resume_training(sgd=optimizer) with nlp.select_pipes(disable=[*frozen_components, *resume_components]): - nlp.initialize(lambda: train_corpus(nlp), settings=I) + nlp.initialize(lambda: train_corpus(nlp), sgd=optimizer, settings=I) msg.good("Initialized pipeline components") # Verify the config after calling 'initialize' to ensure labels # are properly initialized From f171903139732ccbe514819da8e0d28f819c5256 Mon Sep 17 00:00:00 2001 From: Ines Montani Date: Tue, 29 Sep 2020 12:20:26 +0200 Subject: [PATCH 38/66] Clean up sgd and pipeline -> nlp --- spacy/pipeline/entity_linker.py | 17 ++++------------- spacy/pipeline/morphologizer.pyx | 7 ++----- spacy/pipeline/multitask.pyx | 4 ++-- spacy/pipeline/pipe.pyx | 9 +++------ spacy/pipeline/sentencizer.pyx | 2 +- spacy/pipeline/senter.pyx | 7 ++----- spacy/pipeline/tagger.pyx | 7 ++----- spacy/pipeline/textcat.py | 9 +++------ spacy/pipeline/tok2vec.py | 12 +++--------- spacy/pipeline/transition_parser.pyx | 10 +++++----- spacy/schemas.py | 2 +- 11 files changed, 28 insertions(+), 58 deletions(-) diff --git a/spacy/pipeline/entity_linker.py b/spacy/pipeline/entity_linker.py index 0f33378b4..b67a15d32 100644 --- a/spacy/pipeline/entity_linker.py +++ b/spacy/pipeline/entity_linker.py @@ -1,5 +1,5 @@ from itertools import islice -from typing import Optional, Iterable, Callable, Dict, Iterator, Union, List, Tuple +from typing import Optional, Iterable, Callable, Dict, Iterator, Union, List from pathlib import Path import srsly import random @@ -144,20 +144,14 @@ class EntityLinker(Pipe): self, get_examples: Callable[[], Iterable[Example]], *, - pipeline: Optional[List[Tuple[str, Callable[[Doc], Doc]]]] = None, - sgd: Optional[Optimizer] = None, - ) -> Optimizer: + nlp: Optional[Language] = None, + ): """Initialize the pipe for training, using a representative set of data examples. get_examples (Callable[[], Iterable[Example]]): Function that returns a representative sample of gold-standard Example objects. - pipeline (List[Tuple[str, Callable]]): Optional list of pipeline - components that this component is part of. Corresponds to - nlp.pipeline. - sgd (thinc.api.Optimizer): Optional optimizer. Will be created with - create_optimizer if it doesn't exist. 
- RETURNS (thinc.api.Optimizer): The optimizer. + nlp (Language): The current nlp object the component is part of. DOCS: https://nightly.spacy.io/api/entitylinker#initialize """ @@ -174,9 +168,6 @@ class EntityLinker(Pipe): self.model.initialize( X=doc_sample, Y=self.model.ops.asarray(vector_sample, dtype="float32") ) - if sgd is None: - sgd = self.create_optimizer() - return sgd def update( self, diff --git a/spacy/pipeline/morphologizer.pyx b/spacy/pipeline/morphologizer.pyx index 580b6b831..9b28a7ca1 100644 --- a/spacy/pipeline/morphologizer.pyx +++ b/spacy/pipeline/morphologizer.pyx @@ -129,16 +129,13 @@ class Morphologizer(Tagger): self.cfg["labels_pos"][norm_label] = POS_IDS[pos] return 1 - def initialize(self, get_examples, *, pipeline=None): + def initialize(self, get_examples, *, nlp=None): """Initialize the pipe for training, using a representative set of data examples. get_examples (Callable[[], Iterable[Example]]): Function that returns a representative sample of gold-standard Example objects. - pipeline (List[Tuple[str, Callable]]): Optional list of pipeline - components that this component is part of. Corresponds to - nlp.pipeline. - RETURNS (thinc.api.Optimizer): The optimizer. + nlp (Language): The current nlp object the component is part of. DOCS: https://nightly.spacy.io/api/morphologizer#initialize """ diff --git a/spacy/pipeline/multitask.pyx b/spacy/pipeline/multitask.pyx index d03fd3ae8..ba351f16e 100644 --- a/spacy/pipeline/multitask.pyx +++ b/spacy/pipeline/multitask.pyx @@ -81,7 +81,7 @@ class MultitaskObjective(Tagger): def set_annotations(self, docs, dep_ids): pass - def initialize(self, get_examples, pipeline=None): + def initialize(self, get_examples, nlp=None): if not hasattr(get_examples, "__call__"): err = Errors.E930.format(name="MultitaskObjective", obj=type(get_examples)) raise ValueError(err) @@ -174,7 +174,7 @@ class ClozeMultitask(Pipe): def set_annotations(self, docs, dep_ids): pass - def initialize(self, get_examples, pipeline=None): + def initialize(self, get_examples, nlp=None): self.model.initialize() # TODO: fix initialization by defining X and Y X = self.model.ops.alloc((5, self.model.get_ref("tok2vec").get_dim("nO"))) self.model.output_layer.initialize(X) diff --git a/spacy/pipeline/pipe.pyx b/spacy/pipeline/pipe.pyx index 08015e60e..b8961f307 100644 --- a/spacy/pipeline/pipe.pyx +++ b/spacy/pipeline/pipe.pyx @@ -183,7 +183,7 @@ cdef class Pipe: """ return util.create_default_optimizer() - def initialize(self, get_examples, *, pipeline=None): + def initialize(self, get_examples, *, nlp=None): """Initialize the pipe for training, using data examples if available. This method needs to be implemented by each Pipe component, ensuring the internal model (if available) is initialized properly @@ -191,14 +191,11 @@ cdef class Pipe: get_examples (Callable[[], Iterable[Example]]): Function that returns a representative sample of gold-standard Example objects. - pipeline (List[Tuple[str, Callable]]): Optional list of pipeline - components that this component is part of. Corresponds to - nlp.pipeline. - RETURNS (thinc.api.Optimizer): The optimizer. + nlp (Language): The current nlp object the component is part of. 
DOCS: https://nightly.spacy.io/api/pipe#initialize """ - raise NotImplementedError(Errors.E931.format(method="add_label", name=self.name)) + raise NotImplementedError(Errors.E931.format(method="initialize", name=self.name)) def _ensure_examples(self, get_examples): if get_examples is None or not hasattr(get_examples, "__call__"): diff --git a/spacy/pipeline/sentencizer.pyx b/spacy/pipeline/sentencizer.pyx index 3cd480d20..13fcd15e2 100644 --- a/spacy/pipeline/sentencizer.pyx +++ b/spacy/pipeline/sentencizer.pyx @@ -58,7 +58,7 @@ class Sentencizer(Pipe): else: self.punct_chars = set(self.default_punct_chars) - def initialize(self, get_examples, pipeline=None): + def initialize(self, get_examples, nlp=None): pass def __call__(self, doc): diff --git a/spacy/pipeline/senter.pyx b/spacy/pipeline/senter.pyx index 76767712f..ec635de5c 100644 --- a/spacy/pipeline/senter.pyx +++ b/spacy/pipeline/senter.pyx @@ -124,16 +124,13 @@ class SentenceRecognizer(Tagger): raise ValueError("nan value when computing loss") return float(loss), d_scores - def initialize(self, get_examples, *, pipeline=None): + def initialize(self, get_examples, *, nlp=None): """Initialize the pipe for training, using a representative set of data examples. get_examples (Callable[[], Iterable[Example]]): Function that returns a representative sample of gold-standard Example objects. - pipeline (List[Tuple[str, Callable]]): Optional list of pipeline - components that this component is part of. Corresponds to - nlp.pipeline. - RETURNS: None + nlp (Language): The current nlp object the component is part of. DOCS: https://nightly.spacy.io/api/sentencerecognizer#initialize """ diff --git a/spacy/pipeline/tagger.pyx b/spacy/pipeline/tagger.pyx index ecf93600e..3d5aca14e 100644 --- a/spacy/pipeline/tagger.pyx +++ b/spacy/pipeline/tagger.pyx @@ -256,16 +256,13 @@ class Tagger(Pipe): raise ValueError("nan value when computing loss") return float(loss), d_scores - def initialize(self, get_examples, *, pipeline=None): + def initialize(self, get_examples, *, nlp=None): """Initialize the pipe for training, using a representative set of data examples. get_examples (Callable[[], Iterable[Example]]): Function that returns a representative sample of gold-standard Example objects.. - pipeline (List[Tuple[str, Callable]]): Optional list of pipeline - components that this component is part of. Corresponds to - nlp.pipeline. - RETURNS (thinc.api.Optimizer): The optimizer. + nlp (Language): The current nlp object the component is part of. DOCS: https://nightly.spacy.io/api/tagger#initialize """ diff --git a/spacy/pipeline/textcat.py b/spacy/pipeline/textcat.py index 67ee38217..ea058ad31 100644 --- a/spacy/pipeline/textcat.py +++ b/spacy/pipeline/textcat.py @@ -338,17 +338,14 @@ class TextCategorizer(Pipe): self, get_examples: Callable[[], Iterable[Example]], *, - pipeline: Optional[List[Tuple[str, Callable[[Doc], Doc]]]] = None - ) -> Optimizer: + nlp: Optional[Language] = None, + ): """Initialize the pipe for training, using a representative set of data examples. get_examples (Callable[[], Iterable[Example]]): Function that returns a representative sample of gold-standard Example objects. - pipeline (List[Tuple[str, Callable]]): Optional list of pipeline - components that this component is part of. Corresponds to - nlp.pipeline. - RETURNS (thinc.api.Optimizer): The optimizer. + nlp (Language): The current nlp object the component is part of. 
DOCS: https://nightly.spacy.io/api/textcategorizer#initialize """ diff --git a/spacy/pipeline/tok2vec.py b/spacy/pipeline/tok2vec.py index 7c8bbf5e5..89f9df757 100644 --- a/spacy/pipeline/tok2vec.py +++ b/spacy/pipeline/tok2vec.py @@ -1,4 +1,4 @@ -from typing import Iterator, Sequence, Iterable, Optional, Dict, Callable, List, Tuple +from typing import Iterator, Sequence, Iterable, Optional, Dict, Callable, List from thinc.api import Model, set_dropout_rate, Optimizer, Config from itertools import islice @@ -207,20 +207,14 @@ class Tok2Vec(Pipe): self, get_examples: Callable[[], Iterable[Example]], *, - pipeline: Optional[List[Tuple[str, Callable[[Doc], Doc]]]] = None, - sgd: Optional[Optimizer] = None, + nlp: Optional[Language] = None, ): """Initialize the pipe for training, using a representative set of data examples. get_examples (Callable[[], Iterable[Example]]): Function that returns a representative sample of gold-standard Example objects. - pipeline (List[Tuple[str, Callable]]): Optional list of pipeline - components that this component is part of. Corresponds to - nlp.pipeline. - sgd (thinc.api.Optimizer): Optional optimizer. Will be created with - create_optimizer if it doesn't exist. - RETURNS (thinc.api.Optimizer): The optimizer. + nlp (Language): The current nlp object the component is part of. DOCS: https://nightly.spacy.io/api/tok2vec#initialize """ diff --git a/spacy/pipeline/transition_parser.pyx b/spacy/pipeline/transition_parser.pyx index 413ea968c..c250d2522 100644 --- a/spacy/pipeline/transition_parser.pyx +++ b/spacy/pipeline/transition_parser.pyx @@ -405,7 +405,7 @@ cdef class Parser(Pipe): def set_output(self, nO): self.model.attrs["resize_output"](self.model, nO) - def initialize(self, get_examples, pipeline=None, settings=None): + def initialize(self, get_examples, nlp=None): self._ensure_examples(get_examples) lexeme_norms = self.vocab.lookups.get_table("lexeme_norm", {}) if len(lexeme_norms) == 0 and self.vocab.lang in util.LEXEME_NORM_LANGS: @@ -425,8 +425,8 @@ cdef class Parser(Pipe): # make sure we resize so we have an appropriate upper layer self._resize() doc_sample = [] - if pipeline is not None: - for name, component in pipeline: + if nlp is not None: + for name, component in nlp.pipeline: if component is self: break if hasattr(component, "pipe"): @@ -438,8 +438,8 @@ cdef class Parser(Pipe): doc_sample.append(example.predicted) assert len(doc_sample) > 0, Errors.E923.format(name=self.name) self.model.initialize(doc_sample) - if pipeline is not None: - self.init_multitask_objectives(get_examples, pipeline) + if nlp is not None: + self.init_multitask_objectives(get_examples, nlp.pipeline) def to_disk(self, path, exclude=tuple()): serializers = { diff --git a/spacy/schemas.py b/spacy/schemas.py index e183e0a75..0b2eeba68 100644 --- a/spacy/schemas.py +++ b/spacy/schemas.py @@ -107,7 +107,7 @@ def validate_init_settings( *, section: Optional[str] = None, name: str = "", - exclude: Iterable[str] = ("get_examples", "nlp", "pipeline"), + exclude: Iterable[str] = ("get_examples", "nlp"), ) -> Dict[str, Any]: """Validate initialization settings against the expected arguments in the method signature. Will parse values if possible (e.g. 
int to string) From adca08a12fcd011df2d94e4701dfd193a6cbb5ea Mon Sep 17 00:00:00 2001 From: Ines Montani Date: Tue, 29 Sep 2020 12:21:52 +0200 Subject: [PATCH 39/66] Pass nlp forward --- spacy/language.py | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) diff --git a/spacy/language.py b/spacy/language.py index 6c0a8394d..8ef2f1d61 100644 --- a/spacy/language.py +++ b/spacy/language.py @@ -1218,11 +1218,7 @@ class Language: p_settings = validate_init_settings( proc.initialize, p_settings, section="components", name=name ) - proc.initialize( - get_examples, - pipeline=self.pipeline, - **p_settings, - ) + proc.initialize(get_examples, nlp=self, **p_settings) self._link_components() if sgd is not None: self._optimizer = sgd From 591038b1a4eac783506bee845a308f3991e39548 Mon Sep 17 00:00:00 2001 From: Ines Montani Date: Tue, 29 Sep 2020 12:54:52 +0200 Subject: [PATCH 40/66] Add test --- spacy/tests/pipeline/test_initialize.py | 42 +++++++++++++++++++++++++ 1 file changed, 42 insertions(+) create mode 100644 spacy/tests/pipeline/test_initialize.py diff --git a/spacy/tests/pipeline/test_initialize.py b/spacy/tests/pipeline/test_initialize.py new file mode 100644 index 000000000..974556b1c --- /dev/null +++ b/spacy/tests/pipeline/test_initialize.py @@ -0,0 +1,42 @@ +import pytest +from spacy.language import Language +from spacy.lang.en import English +from spacy.training import Example +from thinc.api import ConfigValidationError +from pydantic import StrictBool + + +def test_initialize_arguments(): + name = "test_initialize_arguments" + + class Component: + def __init__(self): + ... + + def initialize( + self, get_examples, nlp, custom1: str, custom2: StrictBool = False + ): + ... + + Language.factory(name, func=lambda nlp, name: Component()) + + nlp = English() + example = Example.from_dict(nlp("x"), {}) + get_examples = lambda: [example] + nlp.add_pipe(name) + # The settings here will typically come from the [initialize] block + with pytest.raises(ConfigValidationError) as e: + # Empty settings, no required custom1 argument + nlp.initialize(get_examples, settings={"components": {name: {}}}) + errors = e.value.errors + assert len(errors) == 1 + assert errors[0]["loc"] == ("custom1",) + assert errors[0]["type"] == "value_error.missing" + with pytest.raises(ConfigValidationError) as e: + # Wrong type + settings = {"components": {name: {"custom1": "x", "custom2": 1}}} + nlp.initialize(get_examples, settings=settings) + errors = e.value.errors + assert len(errors) == 1 + assert errors[0]["loc"] == ("custom2",) + assert errors[0]["type"] == "value_error.strictbool" From 56f8bc73ef1880ded2abe9da5a5ff26ca6babc20 Mon Sep 17 00:00:00 2001 From: Ines Montani Date: Tue, 29 Sep 2020 15:23:34 +0200 Subject: [PATCH 41/66] Add more tests --- spacy/tests/pipeline/test_initialize.py | 32 +++++++++++++++++++++---- 1 file changed, 28 insertions(+), 4 deletions(-) diff --git a/spacy/tests/pipeline/test_initialize.py b/spacy/tests/pipeline/test_initialize.py index 974556b1c..1d2e7e5a3 100644 --- a/spacy/tests/pipeline/test_initialize.py +++ b/spacy/tests/pipeline/test_initialize.py @@ -9,34 +9,58 @@ from pydantic import StrictBool def test_initialize_arguments(): name = "test_initialize_arguments" + class CustomTokenizer: + def __init__(self, tokenizer): + self.tokenizer = tokenizer + self.from_initialize = None + + def __call__(self, text): + return self.tokenizer(text) + + def initialize(self, get_examples, nlp, custom: int): + self.from_initialize = custom + class Component: def __init__(self): - ... 
+ self.from_initialize = None def initialize( self, get_examples, nlp, custom1: str, custom2: StrictBool = False ): - ... + self.from_initialize = (custom1, custom2) Language.factory(name, func=lambda nlp, name: Component()) nlp = English() + nlp.tokenizer = CustomTokenizer(nlp.tokenizer) example = Example.from_dict(nlp("x"), {}) get_examples = lambda: [example] nlp.add_pipe(name) # The settings here will typically come from the [initialize] block with pytest.raises(ConfigValidationError) as e: # Empty settings, no required custom1 argument - nlp.initialize(get_examples, settings={"components": {name: {}}}) + settings = {"tokenizer": {"custom": 1}, "components": {name: {}}} + nlp.initialize(get_examples, settings=settings) errors = e.value.errors assert len(errors) == 1 assert errors[0]["loc"] == ("custom1",) assert errors[0]["type"] == "value_error.missing" with pytest.raises(ConfigValidationError) as e: # Wrong type - settings = {"components": {name: {"custom1": "x", "custom2": 1}}} + settings = { + "tokenizer": {"custom": 1}, + "components": {name: {"custom1": "x", "custom2": 1}}, + } nlp.initialize(get_examples, settings=settings) errors = e.value.errors assert len(errors) == 1 assert errors[0]["loc"] == ("custom2",) assert errors[0]["type"] == "value_error.strictbool" + settings = { + "tokenizer": {"custom": 1}, + "components": {name: {"custom1": "x", "custom2": True}}, + } + nlp.initialize(get_examples, settings=settings) + assert nlp.tokenizer.from_initialize == 1 + pipe = nlp.get_pipe(name) + assert pipe.from_initialize == ("x", True) From 63d15981377aa207591380ba6eaf816c7696830c Mon Sep 17 00:00:00 2001 From: Ines Montani Date: Tue, 29 Sep 2020 16:05:48 +0200 Subject: [PATCH 42/66] Simplify config use in Language.initialize --- spacy/language.py | 25 +++++++++++----- spacy/tests/pipeline/test_initialize.py | 25 +++++++++------- spacy/training/initialize.py | 38 ++++++++++--------------- spacy/training/loop.py | 20 +++++++------ 4 files changed, 59 insertions(+), 49 deletions(-) diff --git a/spacy/language.py b/spacy/language.py index 8ef2f1d61..8d546529d 100644 --- a/spacy/language.py +++ b/spacy/language.py @@ -18,6 +18,7 @@ from .tokens.underscore import Underscore from .vocab import Vocab, create_vocab from .pipe_analysis import validate_attrs, analyze_pipes, print_pipe_analysis from .training import Example, validate_examples +from .training.initialize import init_vocab, init_tok2vec from .scorer import Scorer from .util import registry, SimpleFrozenList from .util import SimpleFrozenDict, combine_score_weights, CONFIG_SECTION_ORDER @@ -27,7 +28,8 @@ from .lang.punctuation import TOKENIZER_INFIXES from .tokens import Doc from .tokenizer import Tokenizer from .errors import Errors, Warnings -from .schemas import ConfigSchema, ConfigSchemaNlp, validate_init_settings +from .schemas import ConfigSchema, ConfigSchemaNlp, ConfigSchemaInit +from .schemas import ConfigSchemaPretrain, validate_init_settings from .git_info import GIT_VERSION from . import util from . import about @@ -1161,7 +1163,6 @@ class Language: self, get_examples: Optional[Callable[[], Iterable[Example]]] = None, *, - settings: Dict[str, Dict[str, Any]] = SimpleFrozenDict(), sgd: Optional[Optimizer] = None, ) -> Optimizer: """Initialize the pipe for training, using data examples if available. 
@@ -1198,28 +1199,38 @@ class Language: if not valid_examples: err = Errors.E930.format(name="Language", obj="empty list") raise ValueError(err) + # Make sure the config is interpolated so we can resolve subsections + config = self.config.interpolate() + # These are the settings provided in the [initialize] block in the config + I = registry.resolve(config["initialize"], schema=ConfigSchemaInit) + V = I["vocab"] + init_vocab( + self, data=V["data"], lookups=V["lookups"], vectors=V["vectors"], + ) + pretrain_cfg = config.get("pretraining") + if pretrain_cfg: + P = registry.resolve(pretrain_cfg, schema=ConfigSchemaPretrain) + init_tok2vec(self, P, V) if self.vocab.vectors.data.shape[1] >= 1: ops = get_current_ops() self.vocab.vectors.data = ops.asarray(self.vocab.vectors.data) - self._optimizer = sgd if hasattr(self.tokenizer, "initialize"): - tok_settings = settings.get("tokenizer", {}) tok_settings = validate_init_settings( self.tokenizer.initialize, - tok_settings, + I["tokenizer"], section="tokenizer", name="tokenizer", ) self.tokenizer.initialize(get_examples, nlp=self, **tok_settings) - proc_settings = settings.get("components", {}) for name, proc in self.pipeline: if hasattr(proc, "initialize"): - p_settings = proc_settings.get(name, {}) + p_settings = I["components"].get(name, {}) p_settings = validate_init_settings( proc.initialize, p_settings, section="components", name=name ) proc.initialize(get_examples, nlp=self, **p_settings) self._link_components() + self._optimizer = sgd if sgd is not None: self._optimizer = sgd elif self._optimizer is None: diff --git a/spacy/tests/pipeline/test_initialize.py b/spacy/tests/pipeline/test_initialize.py index 1d2e7e5a3..b6c22ee09 100644 --- a/spacy/tests/pipeline/test_initialize.py +++ b/spacy/tests/pipeline/test_initialize.py @@ -37,30 +37,33 @@ def test_initialize_arguments(): get_examples = lambda: [example] nlp.add_pipe(name) # The settings here will typically come from the [initialize] block + init_cfg = {"tokenizer": {"custom": 1}, "components": {name: {}}} + nlp.config["initialize"].update(init_cfg) with pytest.raises(ConfigValidationError) as e: - # Empty settings, no required custom1 argument - settings = {"tokenizer": {"custom": 1}, "components": {name: {}}} - nlp.initialize(get_examples, settings=settings) + # Empty config for component, no required custom1 argument + nlp.initialize(get_examples) errors = e.value.errors assert len(errors) == 1 assert errors[0]["loc"] == ("custom1",) assert errors[0]["type"] == "value_error.missing" + init_cfg = { + "tokenizer": {"custom": 1}, + "components": {name: {"custom1": "x", "custom2": 1}}, + } + nlp.config["initialize"].update(init_cfg) with pytest.raises(ConfigValidationError) as e: - # Wrong type - settings = { - "tokenizer": {"custom": 1}, - "components": {name: {"custom1": "x", "custom2": 1}}, - } - nlp.initialize(get_examples, settings=settings) + # Wrong type of custom 2 + nlp.initialize(get_examples) errors = e.value.errors assert len(errors) == 1 assert errors[0]["loc"] == ("custom2",) assert errors[0]["type"] == "value_error.strictbool" - settings = { + init_cfg = { "tokenizer": {"custom": 1}, "components": {name: {"custom1": "x", "custom2": True}}, } - nlp.initialize(get_examples, settings=settings) + nlp.config["initialize"].update(init_cfg) + nlp.initialize(get_examples) assert nlp.tokenizer.from_initialize == 1 pipe = nlp.get_pipe(name) assert pipe.from_initialize == ("x", True) diff --git a/spacy/training/initialize.py b/spacy/training/initialize.py index b42732d48..9517c6c48 
100644 --- a/spacy/training/initialize.py +++ b/spacy/training/initialize.py @@ -1,4 +1,4 @@ -from typing import Union, Dict, Optional, Any, List, IO +from typing import Union, Dict, Optional, Any, List, IO, TYPE_CHECKING from thinc.api import Config, fix_random_seed, set_gpu_allocator from thinc.api import ConfigValidationError from pathlib import Path @@ -11,16 +11,18 @@ import zipfile import tqdm from .loop import create_before_to_disk_callback -from ..language import Language from ..lookups import Lookups from ..vectors import Vectors from ..errors import Errors -from ..schemas import ConfigSchemaTraining, ConfigSchemaInit, ConfigSchemaPretrain +from ..schemas import ConfigSchemaTraining, ConfigSchemaPretrain from ..util import registry, load_model_from_config, resolve_dot_names from ..util import load_model, ensure_path, OOV_RANK, DEFAULT_OOV_PROB +if TYPE_CHECKING: + from ..language import Language # noqa: F401 -def init_nlp(config: Config, *, use_gpu: int = -1, silent: bool = True) -> Language: + +def init_nlp(config: Config, *, use_gpu: int = -1, silent: bool = True) -> "Language": msg = Printer(no_print=silent) raw_config = config config = raw_config.interpolate() @@ -38,11 +40,6 @@ def init_nlp(config: Config, *, use_gpu: int = -1, silent: bool = True) -> Langu T = registry.resolve(config["training"], schema=ConfigSchemaTraining) dot_names = [T["train_corpus"], T["dev_corpus"]] train_corpus, dev_corpus = resolve_dot_names(config, dot_names) - I = registry.resolve(config["initialize"], schema=ConfigSchemaInit) - V = I["vocab"] - init_vocab( - nlp, data=V["data"], lookups=V["lookups"], vectors=V["vectors"], silent=silent - ) optimizer = T["optimizer"] before_to_disk = create_before_to_disk_callback(T["before_to_disk"]) # Components that shouldn't be updated during training @@ -55,16 +52,11 @@ def init_nlp(config: Config, *, use_gpu: int = -1, silent: bool = True) -> Langu msg.info(f"Resuming training for: {resume_components}") nlp.resume_training(sgd=optimizer) with nlp.select_pipes(disable=[*frozen_components, *resume_components]): - nlp.initialize(lambda: train_corpus(nlp), sgd=optimizer, settings=I) + nlp.initialize(lambda: train_corpus(nlp), sgd=optimizer) msg.good("Initialized pipeline components") # Verify the config after calling 'initialize' to ensure labels # are properly initialized verify_config(nlp) - if "pretraining" in config and config["pretraining"]: - P = registry.resolve(config["pretraining"], schema=ConfigSchemaPretrain) - loaded = add_tok2vec_weights(nlp, P, V) - if loaded and P["component"]: - msg.good(f"Loaded pretrained weights into component '{P['component']}'") nlp = before_to_disk(nlp) return nlp @@ -75,13 +67,13 @@ def must_reinitialize(train_config: Config, init_config: Config) -> bool: def init_vocab( - nlp: Language, + nlp: "Language", *, data: Optional[Path] = None, lookups: Optional[Lookups] = None, vectors: Optional[str] = None, silent: bool = True, -) -> Language: +) -> "Language": msg = Printer(no_print=silent) if lookups: nlp.vocab.lookups = lookups @@ -109,7 +101,7 @@ def init_vocab( def load_vectors_into_model( - nlp: Language, name: Union[str, Path], *, add_strings: bool = True + nlp: "Language", name: Union[str, Path], *, add_strings: bool = True ) -> None: """Load word vectors from an installed model or path into a model instance.""" try: @@ -132,8 +124,8 @@ def load_vectors_into_model( nlp.vocab.strings.add(vectors_nlp.vocab.strings[key]) -def add_tok2vec_weights( - nlp: Language, pretrain_config: Dict[str, Any], vocab_config: Dict[str, 
Any] +def init_tok2vec( + nlp: "Language", pretrain_config: Dict[str, Any], vocab_config: Dict[str, Any] ) -> bool: # Load pretrained tok2vec weights - cf. CLI command 'pretrain' P = pretrain_config @@ -171,7 +163,7 @@ def add_tok2vec_weights( return False -def verify_config(nlp: Language) -> None: +def verify_config(nlp: "Language") -> None: """Perform additional checks based on the config, loaded nlp object and training data.""" # TODO: maybe we should validate based on the actual components, the list # in config["nlp"]["pipeline"] instead? @@ -182,7 +174,7 @@ def verify_config(nlp: Language) -> None: verify_textcat_config(nlp, pipe_config) -def verify_textcat_config(nlp: Language, pipe_config: Dict[str, Any]) -> None: +def verify_textcat_config(nlp: "Language", pipe_config: Dict[str, Any]) -> None: # if 'positive_label' is provided: double check whether it's in the data and # the task is binary if pipe_config.get("positive_label"): @@ -211,7 +203,7 @@ def get_sourced_components(config: Union[Dict[str, Any], Config]) -> List[str]: def convert_vectors( - nlp: Language, + nlp: "Language", vectors_loc: Optional[Path], *, truncate: int, diff --git a/spacy/training/loop.py b/spacy/training/loop.py index 5153be66c..41e6464e0 100644 --- a/spacy/training/loop.py +++ b/spacy/training/loop.py @@ -1,5 +1,5 @@ from typing import List, Callable, Tuple, Dict, Iterable, Iterator, Union, Any -from typing import Optional +from typing import Optional, TYPE_CHECKING from pathlib import Path from timeit import default_timer as timer from thinc.api import Optimizer, Config, constant, fix_random_seed, set_gpu_allocator @@ -9,13 +9,15 @@ from wasabi import Printer from .example import Example from ..schemas import ConfigSchemaTraining -from ..language import Language from ..errors import Errors from ..util import resolve_dot_names, registry +if TYPE_CHECKING: + from ..language import Language # noqa: F401 + def train( - nlp: Language, + nlp: "Language", output_path: Optional[Path] = None, *, use_gpu: int = -1, @@ -110,7 +112,7 @@ def train( def train_while_improving( - nlp: Language, + nlp: "Language", optimizer: Optimizer, train_data, evaluate, @@ -233,7 +235,7 @@ def subdivide_batch(batch, accumulate_gradient): def create_evaluation_callback( - nlp: Language, dev_corpus: Callable, weights: Dict[str, float] + nlp: "Language", dev_corpus: Callable, weights: Dict[str, float] ) -> Callable[[], Tuple[float, Dict[str, float]]]: weights = {key: value for key, value in weights.items() if value is not None} @@ -277,7 +279,7 @@ def create_train_batches( def update_meta( - training: Union[Dict[str, Any], Config], nlp: Language, info: Dict[str, Any] + training: Union[Dict[str, Any], Config], nlp: "Language", info: Dict[str, Any] ) -> None: nlp.meta["performance"] = {} for metric in training["score_weights"]: @@ -288,8 +290,10 @@ def update_meta( def create_before_to_disk_callback( - callback: Optional[Callable[[Language], Language]] -) -> Callable[[Language], Language]: + callback: Optional[Callable[["Language"], "Language"]] +) -> Callable[["Language"], "Language"]: + from ..language import Language # noqa: F811 + def before_to_disk(nlp: Language) -> Language: if not callback: return nlp From aa2a6882d064924165ee697cac0e431a92e64eb2 Mon Sep 17 00:00:00 2001 From: Ines Montani Date: Tue, 29 Sep 2020 16:08:39 +0200 Subject: [PATCH 43/66] Fix logging --- spacy/cli/init_pipeline.py | 11 ++++++++--- spacy/training/initialize.py | 35 ++++++++++++++--------------------- 2 files changed, 22 insertions(+), 24 deletions(-) diff 
--git a/spacy/cli/init_pipeline.py b/spacy/cli/init_pipeline.py index 0e9de0eb4..ac1cdb7be 100644 --- a/spacy/cli/init_pipeline.py +++ b/spacy/cli/init_pipeline.py @@ -19,13 +19,18 @@ def init_vectors_cli( prune: int = Opt(-1, "--prune", "-p", help="Optional number of vectors to prune to"), truncate: int = Opt(0, "--truncate", "-t", help="Optional number of vectors to truncate to when reading in vectors file"), name: Optional[str] = Opt(None, "--name", "-n", help="Optional name for the word vectors, e.g. en_core_web_lg.vectors"), + verbose: bool = Opt(False, "--verbose", "-V", "-VV", help="Display more information for debugging purposes"), # fmt: on ): + """Convert word vectors for use with spaCy. Will export an nlp object that + you can use in the [initialize.vocab] block of your config to initialize + a model with vectors. + """ + util.logger.setLevel(logging.DEBUG if verbose else logging.ERROR) msg.info(f"Creating blank nlp object for language '{lang}'") nlp = util.get_lang_class(lang)() - convert_vectors( - nlp, vectors_loc, truncate=truncate, prune=prune, name=name, silent=False - ) + convert_vectors(nlp, vectors_loc, truncate=truncate, prune=prune, name=name) + msg.good(f"Successfully converted {len(nlp.vocab.vectors)} vectors") nlp.to_disk(output_dir) msg.good( "Saved nlp object with vectors to output directory. You can now use the " diff --git a/spacy/training/initialize.py b/spacy/training/initialize.py index 9517c6c48..ef0938321 100644 --- a/spacy/training/initialize.py +++ b/spacy/training/initialize.py @@ -2,7 +2,6 @@ from typing import Union, Dict, Optional, Any, List, IO, TYPE_CHECKING from thinc.api import Config, fix_random_seed, set_gpu_allocator from thinc.api import ConfigValidationError from pathlib import Path -from wasabi import Printer import srsly import numpy import tarfile @@ -14,16 +13,15 @@ from .loop import create_before_to_disk_callback from ..lookups import Lookups from ..vectors import Vectors from ..errors import Errors -from ..schemas import ConfigSchemaTraining, ConfigSchemaPretrain -from ..util import registry, load_model_from_config, resolve_dot_names +from ..schemas import ConfigSchemaTraining +from ..util import registry, load_model_from_config, resolve_dot_names, logger from ..util import load_model, ensure_path, OOV_RANK, DEFAULT_OOV_PROB if TYPE_CHECKING: from ..language import Language # noqa: F401 -def init_nlp(config: Config, *, use_gpu: int = -1, silent: bool = True) -> "Language": - msg = Printer(no_print=silent) +def init_nlp(config: Config, *, use_gpu: int = -1) -> "Language": raw_config = config config = raw_config.interpolate() if config["training"]["seed"] is not None: @@ -34,7 +32,7 @@ def init_nlp(config: Config, *, use_gpu: int = -1, silent: bool = True) -> "Lang # Use original config here before it's resolved to functions sourced_components = get_sourced_components(config) nlp = load_model_from_config(raw_config, auto_fill=True) - msg.good("Set up nlp object from config") + logger.info("Set up nlp object from config") config = nlp.config.interpolate() # Resolve all training-relevant sections using the filled nlp config T = registry.resolve(config["training"], schema=ConfigSchemaTraining) @@ -46,14 +44,14 @@ def init_nlp(config: Config, *, use_gpu: int = -1, silent: bool = True) -> "Lang frozen_components = T["frozen_components"] # Sourced components that require resume_training resume_components = [p for p in sourced_components if p not in frozen_components] - msg.info(f"Pipeline: {nlp.pipe_names}") + logger.info(f"Pipeline: 
{nlp.pipe_names}") if resume_components: with nlp.select_pipes(enable=resume_components): - msg.info(f"Resuming training for: {resume_components}") + logger.info(f"Resuming training for: {resume_components}") nlp.resume_training(sgd=optimizer) with nlp.select_pipes(disable=[*frozen_components, *resume_components]): nlp.initialize(lambda: train_corpus(nlp), sgd=optimizer) - msg.good("Initialized pipeline components") + logger.good("Initialized pipeline components") # Verify the config after calling 'initialize' to ensure labels # are properly initialized verify_config(nlp) @@ -72,12 +70,10 @@ def init_vocab( data: Optional[Path] = None, lookups: Optional[Lookups] = None, vectors: Optional[str] = None, - silent: bool = True, ) -> "Language": - msg = Printer(no_print=silent) if lookups: nlp.vocab.lookups = lookups - msg.good(f"Added vocab lookups: {', '.join(lookups.tables)}") + logger.info(f"Added vocab lookups: {', '.join(lookups.tables)}") data_path = ensure_path(data) if data_path is not None: lex_attrs = srsly.read_jsonl(data_path) @@ -93,11 +89,11 @@ def init_vocab( else: oov_prob = DEFAULT_OOV_PROB nlp.vocab.cfg.update({"oov_prob": oov_prob}) - msg.good(f"Added {len(nlp.vocab)} lexical entries to the vocab") - msg.good("Created vocabulary") + logger.good(f"Added {len(nlp.vocab)} lexical entries to the vocab") + logger.good("Created vocabulary") if vectors is not None: load_vectors_into_model(nlp, vectors) - msg.good(f"Added vectors: {vectors}") + logger.good(f"Added vectors: {vectors}") def load_vectors_into_model( @@ -209,9 +205,7 @@ def convert_vectors( truncate: int, prune: int, name: Optional[str] = None, - silent: bool = True, ) -> None: - msg = Printer(no_print=silent) vectors_loc = ensure_path(vectors_loc) if vectors_loc and vectors_loc.parts[-1].endswith(".npz"): nlp.vocab.vectors = Vectors(data=numpy.load(vectors_loc.open("rb"))) @@ -220,9 +214,9 @@ def convert_vectors( nlp.vocab.vectors.add(lex.orth, row=lex.rank) else: if vectors_loc: - with msg.loading(f"Reading vectors from {vectors_loc}"): - vectors_data, vector_keys = read_vectors(vectors_loc, truncate) - msg.good(f"Loaded vectors from {vectors_loc}") + logger.info(f"Reading vectors from {vectors_loc}") + vectors_data, vector_keys = read_vectors(vectors_loc, truncate) + logger.info(f"Loaded vectors from {vectors_loc}") else: vectors_data, vector_keys = (None, None) if vector_keys is not None: @@ -239,7 +233,6 @@ def convert_vectors( nlp.meta["vectors"]["name"] = nlp.vocab.vectors.name if prune >= 1: nlp.vocab.prune_vectors(prune) - msg.good(f"Successfully converted {len(nlp.vocab.vectors)} vectors") def read_vectors(vectors_loc: Path, truncate_vectors: int): From 58c8d4b414e61ecd612d44521216ff3e8fa9affa Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Tue, 29 Sep 2020 16:22:13 +0200 Subject: [PATCH 44/66] Add label_data property to pipeline --- spacy/pipeline/morphologizer.pyx | 7 ++++++- spacy/pipeline/pipe.pyx | 15 +++++++++++++++ spacy/pipeline/senter.pyx | 4 ++++ spacy/pipeline/tagger.pyx | 10 ++++++++++ spacy/pipeline/textcat.py | 15 +++++++++++++++ spacy/pipeline/transition_parser.pyx | 4 ++++ 6 files changed, 54 insertions(+), 1 deletion(-) diff --git a/spacy/pipeline/morphologizer.pyx b/spacy/pipeline/morphologizer.pyx index 9b28a7ca1..c9798a638 100644 --- a/spacy/pipeline/morphologizer.pyx +++ b/spacy/pipeline/morphologizer.pyx @@ -1,5 +1,5 @@ # cython: infer_types=True, profile=True, binding=True -from typing import Optional +from typing import Optional, Union, Dict import srsly from thinc.api import 
SequenceCategoricalCrossentropy, Model, Config from itertools import islice @@ -101,6 +101,11 @@ class Morphologizer(Tagger): """RETURNS (Tuple[str]): The labels currently added to the component.""" return tuple(self.cfg["labels_morph"].keys()) + @property + def label_data(self) -> Dict[str, Dict[str, Union[str, float, int, None]]]: + """RETURNS (Dict): A dictionary with all labels data.""" + return {"morph": self.cfg["labels_morph"], "pos": self.cfg["labels_pos"]} + def add_label(self, label): """Add a new label to the pipe. diff --git a/spacy/pipeline/pipe.pyx b/spacy/pipeline/pipe.pyx index b8961f307..481430a2c 100644 --- a/spacy/pipeline/pipe.pyx +++ b/spacy/pipeline/pipe.pyx @@ -1,4 +1,5 @@ # cython: infer_types=True, profile=True +from typing import Optional, Tuple import srsly from thinc.api import set_dropout_rate, Model @@ -32,6 +33,20 @@ cdef class Pipe: self.name = name self.cfg = dict(cfg) + @property + def labels(self) -> Optional[Tuple[str]]: + if "labels" in self.cfg: + return tuple(self.cfg["labels"]) + else: + return None + + @property + def label_data(self): + """Optional JSON-serializable data that would be sufficient to recreate + the label set if provided to the `pipe.initialize()` method. + """ + return None + def __call__(self, Doc doc): """Apply the pipe to one document. The document is modified in place, and returned. This usually happens under the hood when the nlp object diff --git a/spacy/pipeline/senter.pyx b/spacy/pipeline/senter.pyx index ec635de5c..65c17c771 100644 --- a/spacy/pipeline/senter.pyx +++ b/spacy/pipeline/senter.pyx @@ -71,6 +71,10 @@ class SentenceRecognizer(Tagger): # are 0 return tuple(["I", "S"]) + @property + def label_data(self): + return self.labels + def set_annotations(self, docs, batch_tag_ids): """Modify a batch of documents, using pre-computed scores. diff --git a/spacy/pipeline/tagger.pyx b/spacy/pipeline/tagger.pyx index 3d5aca14e..253b6f08c 100644 --- a/spacy/pipeline/tagger.pyx +++ b/spacy/pipeline/tagger.pyx @@ -90,6 +90,16 @@ class Tagger(Pipe): """ return tuple(self.cfg["labels"]) + @property + def label_data(self): + """Data about the labels currently added to the component. + + RETURNS (Dict): The labels data. + + DOCS: https://nightly.spacy.io/api/tagger#labels + """ + return tuple(self.cfg["labels"]) + def __call__(self, doc): """Apply the pipe to a Doc. diff --git a/spacy/pipeline/textcat.py b/spacy/pipeline/textcat.py index ea058ad31..63b040333 100644 --- a/spacy/pipeline/textcat.py +++ b/spacy/pipeline/textcat.py @@ -154,8 +154,23 @@ class TextCategorizer(Pipe): @labels.setter def labels(self, value: List[str]) -> None: + # TODO: This really shouldn't be here. I had a look and I added it when + # I added the labels property, but it's pretty nasty to have this, and + # will lead to problems. self.cfg["labels"] = tuple(value) + @property + def label_data(self) -> Dict: + """RETURNS (Dict): Information about the component's labels. + + DOCS: https://nightly.spacy.io/api/textcategorizer#labels + """ + return { + "labels": self.labels, + "positive": self.cfg["positive_label"], + "threshold": self.cfg["threshold"] + } + def pipe(self, stream: Iterable[Doc], *, batch_size: int = 128) -> Iterator[Doc]: """Apply the pipe to a stream of documents. 
This usually happens under the hood when the nlp object is called on a text and all components are diff --git a/spacy/pipeline/transition_parser.pyx b/spacy/pipeline/transition_parser.pyx index c250d2522..9f165cb15 100644 --- a/spacy/pipeline/transition_parser.pyx +++ b/spacy/pipeline/transition_parser.pyx @@ -95,6 +95,10 @@ cdef class Parser(Pipe): class_names = [self.moves.get_class_name(i) for i in range(self.moves.n_moves)] return class_names + @property + def label_data(self): + return self.moves.labels + @property def tok2vec(self): """Return the embedding and convolutional layer of the model.""" From 45daf5c9fe1c0b7a42f0af2dccf2f7ef2faeded9 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Tue, 29 Sep 2020 16:22:37 +0200 Subject: [PATCH 45/66] Add init labels command --- spacy/cli/__init__.py | 1 + spacy/cli/init_labels.py | 43 ++++++++++++++++++++++++++++++++++++++++ 2 files changed, 44 insertions(+) create mode 100644 spacy/cli/init_labels.py diff --git a/spacy/cli/__init__.py b/spacy/cli/__init__.py index 7368bcef3..c5f60adfc 100644 --- a/spacy/cli/__init__.py +++ b/spacy/cli/__init__.py @@ -16,6 +16,7 @@ from .debug_model import debug_model # noqa: F401 from .evaluate import evaluate # noqa: F401 from .convert import convert # noqa: F401 from .init_pipeline import init_pipeline_cli # noqa: F401 +from .init_labels import init_labels_cli # noqa: F401 from .init_config import init_config, fill_config # noqa: F401 from .validate import validate # noqa: F401 from .project.clone import project_clone # noqa: F401 diff --git a/spacy/cli/init_labels.py b/spacy/cli/init_labels.py new file mode 100644 index 000000000..29cb23072 --- /dev/null +++ b/spacy/cli/init_labels.py @@ -0,0 +1,43 @@ +from typing import Optional +import logging +from pathlib import Path +from wasabi import msg +import typer +import srsly + +from .. 
import util +from ..training.initialize import init_nlp, convert_vectors +from ._util import init_cli, Arg, Opt, parse_config_overrides, show_validation_error +from ._util import import_code, setup_gpu + + +@init_cli.command( + "labels", + context_settings={"allow_extra_args": True, "ignore_unknown_options": True}, +) +def init_labels_cli( + # fmt: off + ctx: typer.Context, # This is only used to read additional arguments + config_path: Path = Arg(..., help="Path to config file", exists=True), + output_path: Path = Arg(..., help="Output directory for the labels"), + code_path: Optional[Path] = Opt(None, "--code", "-c", help="Path to Python file with additional code (registered functions) to be imported"), + verbose: bool = Opt(False, "--verbose", "-V", "-VV", help="Display more information for debugging purposes"), + use_gpu: int = Opt(-1, "--gpu-id", "-g", help="GPU ID or -1 for CPU") + # fmt: on +): + if not output_path.exists(): + output_path.mkdir() + util.logger.setLevel(logging.DEBUG if verbose else logging.ERROR) + overrides = parse_config_overrides(ctx.args) + import_code(code_path) + setup_gpu(use_gpu) + with show_validation_error(config_path): + config = util.load_config(config_path, overrides=overrides) + with show_validation_error(hint_fill=False): + nlp = init_nlp(config, use_gpu=use_gpu, silent=False) + for name, component in nlp.pipeline: + if getattr(component, "label_data", None) is not None: + srsly.write_json(output_path / f"{name}.json", component.label_data) + msg.good(f"Saving {name} labels to {output_path}/{name}.json") + else: + msg.info(f"No labels found for {name}") From 978ab54a84262682f75b8bb0aa196cd4f93976aa Mon Sep 17 00:00:00 2001 From: Ines Montani Date: Tue, 29 Sep 2020 16:22:41 +0200 Subject: [PATCH 46/66] Fix logging --- spacy/training/initialize.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/spacy/training/initialize.py b/spacy/training/initialize.py index ef0938321..862c76448 100644 --- a/spacy/training/initialize.py +++ b/spacy/training/initialize.py @@ -51,7 +51,7 @@ def init_nlp(config: Config, *, use_gpu: int = -1) -> "Language": nlp.resume_training(sgd=optimizer) with nlp.select_pipes(disable=[*frozen_components, *resume_components]): nlp.initialize(lambda: train_corpus(nlp), sgd=optimizer) - logger.good("Initialized pipeline components") + logger.info("Initialized pipeline components") # Verify the config after calling 'initialize' to ensure labels # are properly initialized verify_config(nlp) @@ -89,11 +89,11 @@ def init_vocab( else: oov_prob = DEFAULT_OOV_PROB nlp.vocab.cfg.update({"oov_prob": oov_prob}) - logger.good(f"Added {len(nlp.vocab)} lexical entries to the vocab") - logger.good("Created vocabulary") + logger.info(f"Added {len(nlp.vocab)} lexical entries to the vocab") + logger.info("Created vocabulary") if vectors is not None: load_vectors_into_model(nlp, vectors) - logger.good(f"Added vectors: {vectors}") + logger.info(f"Added vectors: {vectors}") def load_vectors_into_model( From 3f0d61232dbc8b45845463b27d766cdbb813af5e Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Tue, 29 Sep 2020 16:47:44 +0200 Subject: [PATCH 47/66] Remove outdated arg from train --- spacy/cli/train.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/spacy/cli/train.py b/spacy/cli/train.py index b0bd48ddb..79c3d893c 100644 --- a/spacy/cli/train.py +++ b/spacy/cli/train.py @@ -56,7 +56,7 @@ def train_cli( def init_pipeline( config: Config, output_path: Optional[Path], *, use_gpu: int = -1 ) -> Language: - 
init_kwargs = {"use_gpu": use_gpu, "silent": False} + init_kwargs = {"use_gpu": use_gpu} if output_path is not None: init_path = output_path / "model-initial" if not init_path.exists(): From e70a00fa76f50e6c49ece17b20d7c5246609ed35 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Tue, 29 Sep 2020 16:47:54 +0200 Subject: [PATCH 48/66] Remove unnecessary warning from train --- spacy/cli/train.py | 6 ------ 1 file changed, 6 deletions(-) diff --git a/spacy/cli/train.py b/spacy/cli/train.py index 79c3d893c..7bbfe9315 100644 --- a/spacy/cli/train.py +++ b/spacy/cli/train.py @@ -74,12 +74,6 @@ def init_pipeline( else: msg.good(f"Loaded initialized pipeline from {init_path}") return nlp - msg.warn( - "Not saving initialized model: no output directory specified. " - "To speed up training, spaCy can save the initialized nlp object with " - "the vocabulary, vectors and label scheme. To take advantage of this, " - "provide an output directory." - ) return init_nlp(config, **init_kwargs) From fd594cfb9b80e47614d72020a50b4a12b925bc01 Mon Sep 17 00:00:00 2001 From: Ines Montani Date: Tue, 29 Sep 2020 16:47:55 +0200 Subject: [PATCH 49/66] Tighten up format --- spacy/default_config.cfg | 24 +++++++++++------------- spacy/language.py | 5 ++--- spacy/schemas.py | 16 ++++------------ spacy/training/initialize.py | 8 ++++---- 4 files changed, 21 insertions(+), 32 deletions(-) diff --git a/spacy/default_config.cfg b/spacy/default_config.cfg index 86293fd40..c0fd27c3c 100644 --- a/spacy/default_config.cfg +++ b/spacy/default_config.cfg @@ -1,8 +1,9 @@ [paths] train = "" dev = "" -init_tok2vec = null +vectors = null vocab_data = null +init_tok2vec = null [system] seed = 0 @@ -96,19 +97,16 @@ eps = 1e-8 learn_rate = 0.001 # The 'initialize' step is run before training or pretraining. Components and -# the tokenizer can each define their own prepare step, giving them a chance -# to gather resources like lookup-tables, build label sets, construct vocabularies, -# etc. After 'prepare' is finished, the result will be saved out to disk, which -# will then be read in at the start of training. You can call the prepare step -# separately with the `spacy prepare` command, or you can let the train script -# do it for you. +# the tokenizer can each define their own arguments via their .initialize +# methods that are populated by the config. This lets them gather resources like +# lookup tables and build label sets, construct vocabularies, etc. 
[initialize] -tokenizer = {} -components = {} - -[initialize.vocab] -data = ${paths.vocab_data} +vocab_data = ${paths.vocab_data} lookups = null -vectors = null +vectors = ${paths.vectors} # Extra resources for transfer-learning or pseudo-rehearsal init_tok2vec = ${paths.init_tok2vec} +# Arguments passed to the tokenizer's initialize method +tokenizer = {} +# Arguments passed to the initialize methods of the components (keyed by component name) +components = {} diff --git a/spacy/language.py b/spacy/language.py index 8d546529d..7450db720 100644 --- a/spacy/language.py +++ b/spacy/language.py @@ -1203,14 +1203,13 @@ class Language: config = self.config.interpolate() # These are the settings provided in the [initialize] block in the config I = registry.resolve(config["initialize"], schema=ConfigSchemaInit) - V = I["vocab"] init_vocab( - self, data=V["data"], lookups=V["lookups"], vectors=V["vectors"], + self, data=I["vocab_data"], lookups=I["lookups"], vectors=I["vectors"], ) pretrain_cfg = config.get("pretraining") if pretrain_cfg: P = registry.resolve(pretrain_cfg, schema=ConfigSchemaPretrain) - init_tok2vec(self, P, V) + init_tok2vec(self, P, I) if self.vocab.vectors.data.shape[1] >= 1: ops = get_current_ops() self.vocab.vectors.data = ops.asarray(self.vocab.vectors.data) diff --git a/spacy/schemas.py b/spacy/schemas.py index 0b2eeba68..658eeb574 100644 --- a/spacy/schemas.py +++ b/spacy/schemas.py @@ -357,12 +357,14 @@ class ConfigSchemaPretrain(BaseModel): arbitrary_types_allowed = True -class ConfigSchemaInitVocab(BaseModel): +class ConfigSchemaInit(BaseModel): # fmt: off - data: Optional[StrictStr] = Field(..., title="Path to JSON-formatted vocabulary file") + vocab_data: Optional[StrictStr] = Field(..., title="Path to JSON-formatted vocabulary file") lookups: Optional[Lookups] = Field(..., title="Vocabulary lookups, e.g. lexeme normalization") vectors: Optional[StrictStr] = Field(..., title="Path to vectors") init_tok2vec: Optional[StrictStr] = Field(..., title="Path to pretrained tok2vec weights") + tokenizer: Dict[StrictStr, Any] = Field(..., help="Arguments to be passed into Tokenizer.initialize") + components: Dict[StrictStr, Dict[StrictStr, Any]] = Field(..., help="Arguments for Pipe.initialize methods of pipeline components, keyed by component") # fmt: on class Config: @@ -370,16 +372,6 @@ class ConfigSchemaInitVocab(BaseModel): arbitrary_types_allowed = True -class ConfigSchemaInit(BaseModel): - vocab: ConfigSchemaInitVocab - tokenizer: Any - components: Dict[StrictStr, Any] - - class Config: - extra = "forbid" - arbitrary_types_allowed = True - - class ConfigSchema(BaseModel): training: ConfigSchemaTraining nlp: ConfigSchemaNlp diff --git a/spacy/training/initialize.py b/spacy/training/initialize.py index 862c76448..aa5edde5d 100644 --- a/spacy/training/initialize.py +++ b/spacy/training/initialize.py @@ -121,15 +121,15 @@ def load_vectors_into_model( def init_tok2vec( - nlp: "Language", pretrain_config: Dict[str, Any], vocab_config: Dict[str, Any] + nlp: "Language", pretrain_config: Dict[str, Any], init_config: Dict[str, Any] ) -> bool: # Load pretrained tok2vec weights - cf. 
CLI command 'pretrain' P = pretrain_config - V = vocab_config + I = init_config weights_data = None - init_tok2vec = ensure_path(V["init_tok2vec"]) + init_tok2vec = ensure_path(I["init_tok2vec"]) if init_tok2vec is not None: - if P["objective"].get("type") == "vectors" and not V["vectors"]: + if P["objective"].get("type") == "vectors" and not I["vectors"]: err = 'need initialize.vocab.vectors if pretraining.objective.type is "vectors"' errors = [{"loc": ["initialize", "vocab"], "msg": err}] raise ConfigValidationError(config=nlp.config, errors=errors) From 10847c7f4e61ac08533efb1dee1eacea9e939d71 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Tue, 29 Sep 2020 16:48:07 +0200 Subject: [PATCH 50/66] Fix arg --- spacy/cli/init_labels.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/spacy/cli/init_labels.py b/spacy/cli/init_labels.py index 29cb23072..e675901a3 100644 --- a/spacy/cli/init_labels.py +++ b/spacy/cli/init_labels.py @@ -34,7 +34,7 @@ def init_labels_cli( with show_validation_error(config_path): config = util.load_config(config_path, overrides=overrides) with show_validation_error(hint_fill=False): - nlp = init_nlp(config, use_gpu=use_gpu, silent=False) + nlp = init_nlp(config, use_gpu=use_gpu) for name, component in nlp.pipeline: if getattr(component, "label_data", None) is not None: srsly.write_json(output_path / f"{name}.json", component.label_data) From ca726080592abfae0f9045c61f4e7d15b1188f9d Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Tue, 29 Sep 2020 16:48:33 +0200 Subject: [PATCH 51/66] Fix language --- spacy/language.py | 15 --------------- 1 file changed, 15 deletions(-) diff --git a/spacy/language.py b/spacy/language.py index 8d546529d..ec2e42a35 100644 --- a/spacy/language.py +++ b/spacy/language.py @@ -1181,24 +1181,9 @@ class Language: ) doc = Doc(self.vocab, words=["x", "y", "z"]) get_examples = lambda: [Example.from_dict(doc, {})] - # Populate vocab if not hasattr(get_examples, "__call__"): err = Errors.E930.format(name="Language", obj=type(get_examples)) raise ValueError(err) - valid_examples = False - for example in get_examples(): - if not isinstance(example, Example): - err = Errors.E978.format( - name="Language.initialize", types=type(example) - ) - raise ValueError(err) - else: - valid_examples = True - for word in [t.text for t in example.reference]: - _ = self.vocab[word] # noqa: F841 - if not valid_examples: - err = Errors.E930.format(name="Language", obj="empty list") - raise ValueError(err) # Make sure the config is interpolated so we can resolve subsections config = self.config.interpolate() # These are the settings provided in the [initialize] block in the config From 99bff78617388d077b4113c602becfa09a23c344 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Tue, 29 Sep 2020 16:48:44 +0200 Subject: [PATCH 52/66] Use labels in tagger --- spacy/pipeline/tagger.pyx | 20 ++++++++++++-------- 1 file changed, 12 insertions(+), 8 deletions(-) diff --git a/spacy/pipeline/tagger.pyx b/spacy/pipeline/tagger.pyx index 253b6f08c..f4e8ecebd 100644 --- a/spacy/pipeline/tagger.pyx +++ b/spacy/pipeline/tagger.pyx @@ -266,7 +266,7 @@ class Tagger(Pipe): raise ValueError("nan value when computing loss") return float(loss), d_scores - def initialize(self, get_examples, *, nlp=None): + def initialize(self, get_examples, *, nlp=None, labels=None): """Initialize the pipe for training, using a representative set of data examples. 
@@ -277,15 +277,19 @@ class Tagger(Pipe): DOCS: https://nightly.spacy.io/api/tagger#initialize """ self._ensure_examples(get_examples) + if labels is not None: + for tag in labels: + self.add_label(tag) + else: + tags = set() + for example in get_examples(): + for token in example.y: + if token.tag_: + tags.add(token.tag_) + for tag in sorted(tags): + self.add_label(tag) doc_sample = [] label_sample = [] - tags = set() - for example in get_examples(): - for token in example.y: - if token.tag_: - tags.add(token.tag_) - for tag in sorted(tags): - self.add_label(tag) for example in islice(get_examples(), 10): doc_sample.append(example.x) gold_tags = example.get_aligned("TAG", as_string=True) From 1fd002180e98d830da26f4593ce6bc7a838e2131 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Tue, 29 Sep 2020 16:48:56 +0200 Subject: [PATCH 53/66] Allow more components to use labels --- spacy/pipeline/textcat.py | 25 ++++++++++++------------- spacy/pipeline/transition_parser.pyx | 15 +++++++++------ 2 files changed, 21 insertions(+), 19 deletions(-) diff --git a/spacy/pipeline/textcat.py b/spacy/pipeline/textcat.py index 63b040333..d6dafa3f5 100644 --- a/spacy/pipeline/textcat.py +++ b/spacy/pipeline/textcat.py @@ -160,16 +160,12 @@ class TextCategorizer(Pipe): self.cfg["labels"] = tuple(value) @property - def label_data(self) -> Dict: - """RETURNS (Dict): Information about the component's labels. + def label_data(self) -> List[str]: + """RETURNS (List[str]): Information about the component's labels. DOCS: https://nightly.spacy.io/api/textcategorizer#labels """ - return { - "labels": self.labels, - "positive": self.cfg["positive_label"], - "threshold": self.cfg["threshold"] - } + return self.labels def pipe(self, stream: Iterable[Doc], *, batch_size: int = 128) -> Iterator[Doc]: """Apply the pipe to a stream of documents. This usually happens under @@ -354,6 +350,7 @@ class TextCategorizer(Pipe): get_examples: Callable[[], Iterable[Example]], *, nlp: Optional[Language] = None, + labels: Optional[Dict] = None ): """Initialize the pipe for training, using a representative set of data examples. 
@@ -365,12 +362,14 @@ class TextCategorizer(Pipe): DOCS: https://nightly.spacy.io/api/textcategorizer#initialize """ self._ensure_examples(get_examples) - subbatch = [] # Select a subbatch of examples to initialize the model - for example in islice(get_examples(), 10): - if len(subbatch) < 2: - subbatch.append(example) - for cat in example.y.cats: - self.add_label(cat) + if labels is None: + for example in get_examples(): + for cat in example.y.cats: + self.add_label(cat) + else: + for label in labels: + self.add_label(label) + subbatch = list(islice(get_examples(), 10)) doc_sample = [eg.reference for eg in subbatch] label_sample, _ = self._examples_to_truth(subbatch) self._require_labels() diff --git a/spacy/pipeline/transition_parser.pyx b/spacy/pipeline/transition_parser.pyx index 9f165cb15..11e0e5af8 100644 --- a/spacy/pipeline/transition_parser.pyx +++ b/spacy/pipeline/transition_parser.pyx @@ -409,17 +409,20 @@ cdef class Parser(Pipe): def set_output(self, nO): self.model.attrs["resize_output"](self.model, nO) - def initialize(self, get_examples, nlp=None): + def initialize(self, get_examples, *, nlp=None, labels=None): self._ensure_examples(get_examples) lexeme_norms = self.vocab.lookups.get_table("lexeme_norm", {}) if len(lexeme_norms) == 0 and self.vocab.lang in util.LEXEME_NORM_LANGS: langs = ", ".join(util.LEXEME_NORM_LANGS) util.logger.debug(Warnings.W033.format(model="parser or NER", langs=langs)) - actions = self.moves.get_actions( - examples=get_examples(), - min_freq=self.cfg['min_action_freq'], - learn_tokens=self.cfg["learn_tokens"] - ) + if labels is not None: + actions = dict(labels) + else: + actions = self.moves.get_actions( + examples=get_examples(), + min_freq=self.cfg['min_action_freq'], + learn_tokens=self.cfg["learn_tokens"] + ) for action, labels in self.moves.labels.items(): actions.setdefault(action, {}) for label, freq in labels.items(): From 43fc7a316d415a0e5ef9fecc02112502928c9fd3 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Tue, 29 Sep 2020 16:49:09 +0200 Subject: [PATCH 54/66] Add registry function for reading jsonl --- spacy/training/corpus.py | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/spacy/training/corpus.py b/spacy/training/corpus.py index 12bda486e..bd431ab83 100644 --- a/spacy/training/corpus.py +++ b/spacy/training/corpus.py @@ -30,6 +30,11 @@ def create_jsonl_reader( return JsonlTexts(path, min_length=min_length, max_length=max_length, limit=limit) +@util.registry.readers("srsly.read_json.v1") +def _read_json(loc: Path): + return srsly.read_json(loc) + + def walk_corpus(path: Union[str, Path], file_type) -> List[Path]: path = util.ensure_path(path) if not path.is_dir() and path.parts[-1].endswith(file_type): From 4ad26f4a2f5ea73eff3b179c3a234d4713c1da6c Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Tue, 29 Sep 2020 16:54:53 +0200 Subject: [PATCH 55/66] Move reader --- spacy/training/corpus.py | 5 ----- spacy/util.py | 3 +++ 2 files changed, 3 insertions(+), 5 deletions(-) diff --git a/spacy/training/corpus.py b/spacy/training/corpus.py index bd431ab83..12bda486e 100644 --- a/spacy/training/corpus.py +++ b/spacy/training/corpus.py @@ -30,11 +30,6 @@ def create_jsonl_reader( return JsonlTexts(path, min_length=min_length, max_length=max_length, limit=limit) -@util.registry.readers("srsly.read_json.v1") -def _read_json(loc: Path): - return srsly.read_json(loc) - - def walk_corpus(path: Union[str, Path], file_type) -> List[Path]: path = util.ensure_path(path) if not path.is_dir() and path.parts[-1].endswith(file_type): diff 
--git a/spacy/util.py b/spacy/util.py index 67c577927..948c4ab11 100644 --- a/spacy/util.py +++ b/spacy/util.py @@ -97,6 +97,9 @@ class registry(thinc.registry): models = catalogue.create("spacy", "models", entry_points=True) cli = catalogue.create("spacy", "cli", entry_points=True) +# We want json loading in the registry, so manually register srsly.read_json. +registry.readers("srsly.read_json.v0", srsly.read_json) + class SimpleFrozenDict(dict): """Simplified implementation of a frozen dict, mainly used as default From e4f535a964da107a5fd558acf3c975388f1dce75 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Tue, 29 Sep 2020 16:55:07 +0200 Subject: [PATCH 56/66] Fix Pipe.labels --- spacy/pipeline/pipe.pyx | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/spacy/pipeline/pipe.pyx b/spacy/pipeline/pipe.pyx index 481430a2c..49d0bea35 100644 --- a/spacy/pipeline/pipe.pyx +++ b/spacy/pipeline/pipe.pyx @@ -35,10 +35,7 @@ cdef class Pipe: @property def labels(self) -> Optional[Tuple[str]]: - if "labels" in self.cfg: - return tuple(self.cfg["labels"]) - else: - return None + return [] @property def label_data(self): From d7469283c5bcce87c7cdc088a161e909d5e87f19 Mon Sep 17 00:00:00 2001 From: Ines Montani Date: Tue, 29 Sep 2020 16:59:21 +0200 Subject: [PATCH 57/66] Update docs [ci skip] --- website/docs/api/data-formats.md | 35 +++++++++++++++++++++++--- website/docs/api/dependencyparser.md | 20 +++++++-------- website/docs/api/entitylinker.md | 22 ++++++++-------- website/docs/api/entityrecognizer.md | 20 +++++++-------- website/docs/api/language.md | 19 +++++++++----- website/docs/api/morphologizer.md | 21 +++++++--------- website/docs/api/pipe.md | 20 +++++++-------- website/docs/api/sentencerecognizer.md | 20 +++++++-------- website/docs/api/tagger.md | 20 +++++++-------- website/docs/api/textcategorizer.md | 20 +++++++-------- website/docs/api/tok2vec.md | 9 +++---- website/docs/api/transformer.md | 9 +++---- 12 files changed, 126 insertions(+), 109 deletions(-) diff --git a/website/docs/api/data-formats.md b/website/docs/api/data-formats.md index 6ff3bfd0d..0d2c78598 100644 --- a/website/docs/api/data-formats.md +++ b/website/docs/api/data-formats.md @@ -190,8 +190,6 @@ process that are used when you run [`spacy train`](/api/cli#train). | `eval_frequency` | How often to evaluate during training (steps). Defaults to `200`. ~~int~~ | | `frozen_components` | Pipeline component names that are "frozen" and shouldn't be updated during training. See [here](/usage/training#config-components) for details. Defaults to `[]`. ~~List[str]~~ | | `gpu_allocator` | Library for cupy to route GPU memory allocation to. Can be `"pytorch"` or `"tensorflow"`. Defaults to variable `${system.gpu_allocator}`. ~~str~~ | -| `init_tok2vec` | Optional path to pretrained tok2vec weights created with [`spacy pretrain`](/api/cli#pretrain). Defaults to variable `${paths.init_tok2vec}`. ~~Optional[str]~~ | -| `lookups` | Additional lexeme and vocab data from [`spacy-lookups-data`](https://github.com/explosion/spacy-lookups-data). Defaults to `null`. ~~Optional[Lookups]~~ | | `max_epochs` | Maximum number of epochs to train for. Defaults to `0`. ~~int~~ | | `max_steps` | Maximum number of update steps to train for. Defaults to `20000`. ~~int~~ | | `optimizer` | The optimizer. The learning rate schedule and other settings can be configured as part of the optimizer. Defaults to [`Adam`](https://thinc.ai/docs/api-optimizers#adam). 
~~Optimizer~~ | @@ -200,7 +198,6 @@ process that are used when you run [`spacy train`](/api/cli#train). | `score_weights` | Score names shown in metrics mapped to their weight towards the final weighted score. See [here](/usage/training#metrics) for details. Defaults to `{}`. ~~Dict[str, float]~~ | | `seed` | The random seed. Defaults to variable `${system.seed}`. ~~int~~ | | `train_corpus` | Dot notation of the config location defining the train corpus. Defaults to `corpora.train`. ~~str~~ | -| `vectors` | Name or path of pipeline containing pretrained word vectors to use, e.g. created with [`init vocab`](/api/cli#init-vocab). Defaults to `null`. ~~Optional[str]~~ | ### pretraining {#config-pretraining tag="section,optional"} @@ -220,6 +217,38 @@ used when you run [`spacy pretrain`](/api/cli#pretrain). | `component` | Component to find the layer to pretrain. Defaults to `"tok2vec"`. ~~str~~ | | `layer` | The layer to pretrain. If empty, the whole component model will be used. ~~str~~ | +### initialize {#config-initialize tag="section"} + +This config block lets you define resources for **initializing the pipeline**. +It's used by [`Language.initialize`](/api/language#initialize) and typically +called right before training (but not at runtime). The section allows you to +specify local file paths or custom functions to load data resources from, +without requiring them at runtime when you load the trained pipeline back in. + +> #### Example +> +> ```ini +> [initialize] +> vectors = "/path/to/vectors_nlp" +> init_tok2vec = "/path/to/pretrain.bin" +> +> [initialize.components] +> +> [initialize.components.my_component] +> data_path = "/path/to/component_data" +> ``` + + + +| Name | Description | | -------------- | -------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | | `components` | Additional arguments passed to the `initialize` method of a pipeline component, keyed by component name. If type annotations are available on the method, the config will be validated against them. The `initialize` methods will always receive the `get_examples` callback and the current `nlp` object. ~~Dict[str, Dict[str, Any]]~~ | | `init_tok2vec` | Optional path to pretrained tok2vec weights created with [`spacy pretrain`](/api/cli#pretrain). Defaults to variable `${paths.init_tok2vec}`. ~~Optional[str]~~ | | `lookups` | Additional lexeme and vocab data from [`spacy-lookups-data`](https://github.com/explosion/spacy-lookups-data). Defaults to `null`. ~~Optional[Lookups]~~ | | `tokenizer` | Additional arguments passed to the `initialize` method of the specified tokenizer. Can be used for languages like Chinese that depend on dictionaries or trained models for tokenization. If type annotations are available on the method, the config will be validated against them. The `initialize` method will always receive the `get_examples` callback and the current `nlp` object. ~~Dict[str, Any]~~ | | `vectors` | Name or path of pipeline containing pretrained word vectors to use, e.g. created with [`init vocab`](/api/cli#init-vocab). Defaults to `null`.
~~Optional[str]~~ | +| `vocab_data` | Path to JSONL-formatted [vocabulary file](/api/data-formats#vocab-jsonl) to initialize vocabulary. ~~Optional[str]~~ | + ## Training data {#training} ### Binary training format {#binary-training new="3"} diff --git a/website/docs/api/dependencyparser.md b/website/docs/api/dependencyparser.md index c7c41f2a1..7c56ce84e 100644 --- a/website/docs/api/dependencyparser.md +++ b/website/docs/api/dependencyparser.md @@ -142,14 +142,14 @@ applied to the `Doc` in order. Both [`__call__`](/api/dependencyparser#call) and ## DependencyParser.initialize {#initialize tag="method"} -Initialize the component for training and return an -[`Optimizer`](https://thinc.ai/docs/api-optimizers). `get_examples` should be a -function that returns an iterable of [`Example`](/api/example) objects. The data -examples are used to **initialize the model** of the component and can either be -the full training data or a representative sample. Initialization includes -validating the network, +Initialize the component for training. `get_examples` should be a function that +returns an iterable of [`Example`](/api/example) objects. The data examples are +used to **initialize the model** of the component and can either be the full +training data or a representative sample. Initialization includes validating the +network, [inferring missing shapes](https://thinc.ai/docs/usage-models#validation) and -setting up the label scheme based on the data. +setting up the label scheme based on the data. This method is typically called +by [`Language.initialize`](/api/language#initialize). @@ -161,16 +161,14 @@ This method was previously called `begin_training`. > > ```python > parser = nlp.add_pipe("parser") -> optimizer = parser.initialize(lambda: [], pipeline=nlp.pipeline) +> parser.initialize(lambda: [], nlp=nlp) > ``` | Name | Description | | -------------- | ------------------------------------------------------------------------------------------------------------------------------------- | | `get_examples` | Function that returns gold-standard annotations in the form of [`Example`](/api/example) objects. ~~Callable[[], Iterable[Example]]~~ | | _keyword-only_ | | -| `pipeline` | Optional list of pipeline components that this component is part of. ~~Optional[List[Tuple[str, Callable[[Doc], Doc]]]]~~ | -| `sgd` | An optimizer. Will be created via [`create_optimizer`](#create_optimizer) if not set. ~~Optional[Optimizer]~~ | -| **RETURNS** | The optimizer. ~~Optimizer~~ | +| `nlp` | The current `nlp` object. Defaults to `None`. ~~Optional[Language]~~ | ## DependencyParser.predict {#predict tag="method"} diff --git a/website/docs/api/entitylinker.md b/website/docs/api/entitylinker.md index 1dbe78703..b104fb69a 100644 --- a/website/docs/api/entitylinker.md +++ b/website/docs/api/entitylinker.md @@ -141,14 +141,14 @@ applied to the `Doc` in order. Both [`__call__`](/api/entitylinker#call) and ## EntityLinker.initialize {#initialize tag="method"} -Initialize the component for training and return an -[`Optimizer`](https://thinc.ai/docs/api-optimizers). `get_examples` should be a -function that returns an iterable of [`Example`](/api/example) objects. The data -examples are used to **initialize the model** of the component and can either be -the full training data or a representative sample. Initialization includes -validating the network, +Initialize the component for training. `get_examples` should be a function that +returns an iterable of [`Example`](/api/example) objects. 
The data examples are +used to **initialize the model** of the component and can either be the full +training data or a representative sample. Initialization includes validating the +network, [inferring missing shapes](https://thinc.ai/docs/usage-models#validation) and -setting up the label scheme based on the data. +setting up the label scheme based on the data. This method is typically called +by [`Language.initialize`](/api/language#initialize). @@ -159,17 +159,15 @@ This method was previously called `begin_training`. > #### Example > > ```python -> entity_linker = nlp.add_pipe("entity_linker", last=True) -> optimizer = entity_linker.initialize(lambda: [], pipeline=nlp.pipeline) +> entity_linker = nlp.add_pipe("entity_linker") +> entity_linker.initialize(lambda: [], nlp=nlp) > ``` | Name | Description | | -------------- | ------------------------------------------------------------------------------------------------------------------------------------- | | `get_examples` | Function that returns gold-standard annotations in the form of [`Example`](/api/example) objects. ~~Callable[[], Iterable[Example]]~~ | | _keyword-only_ | | -| `pipeline` | Optional list of pipeline components that this component is part of. ~~Optional[List[Tuple[str, Callable[[Doc], Doc]]]]~~ | -| `sgd` | An optimizer. Will be created via [`create_optimizer`](#create_optimizer) if not set. ~~Optional[Optimizer]~~ | -| **RETURNS** | The optimizer. ~~Optimizer~~ | +| `nlp` | The current `nlp` object. Defaults to `None`. ~~Optional[Language]~~ | ## EntityLinker.predict {#predict tag="method"} diff --git a/website/docs/api/entityrecognizer.md b/website/docs/api/entityrecognizer.md index 2c32ff753..b930660d9 100644 --- a/website/docs/api/entityrecognizer.md +++ b/website/docs/api/entityrecognizer.md @@ -131,14 +131,14 @@ applied to the `Doc` in order. Both [`__call__`](/api/entityrecognizer#call) and ## EntityRecognizer.initialize {#initialize tag="method"} -Initialize the component for training and return an -[`Optimizer`](https://thinc.ai/docs/api-optimizers). `get_examples` should be a -function that returns an iterable of [`Example`](/api/example) objects. The data -examples are used to **initialize the model** of the component and can either be -the full training data or a representative sample. Initialization includes -validating the network, +Initialize the component for training. `get_examples` should be a function that +returns an iterable of [`Example`](/api/example) objects. The data examples are +used to **initialize the model** of the component and can either be the full +training data or a representative sample. Initialization includes validating the +network, [inferring missing shapes](https://thinc.ai/docs/usage-models#validation) and -setting up the label scheme based on the data. +setting up the label scheme based on the data. This method is typically called +by [`Language.initialize`](/api/language#initialize). @@ -150,16 +150,14 @@ This method was previously called `begin_training`. > > ```python > ner = nlp.add_pipe("ner") -> optimizer = ner.initialize(lambda: [], pipeline=nlp.pipeline) +> ner.initialize(lambda: [], nlp=nlp) > ``` | Name | Description | | -------------- | ------------------------------------------------------------------------------------------------------------------------------------- | | `get_examples` | Function that returns gold-standard annotations in the form of [`Example`](/api/example) objects. 
~~Callable[[], Iterable[Example]]~~ | | _keyword-only_ | | -| `pipeline` | Optional list of pipeline components that this component is part of. ~~Optional[List[Tuple[str, Callable[[Doc], Doc]]]]~~ | -| `sgd` | An optimizer. Will be created via [`create_optimizer`](#create_optimizer) if not set. ~~Optional[Optimizer]~~ | -| **RETURNS** | The optimizer. ~~Optimizer~~ | +| `nlp` | The current `nlp` object. Defaults to `None`. ~~Optional[Language]~~ | ## EntityRecognizer.predict {#predict tag="method"} diff --git a/website/docs/api/language.md b/website/docs/api/language.md index 11631502c..8dbb0d821 100644 --- a/website/docs/api/language.md +++ b/website/docs/api/language.md @@ -204,12 +204,19 @@ more efficient than processing texts one-by-one. ## Language.initialize {#initialize tag="method"} Initialize the pipeline for training and return an -[`Optimizer`](https://thinc.ai/docs/api-optimizers). `get_examples` should be a -function that returns an iterable of [`Example`](/api/example) objects. The data -examples can either be the full training data or a representative sample. They -are used to **initialize the models** of trainable pipeline components and are -passed each component's [`initialize`](/api/pipe#initialize) method, if -available. Initialization includes validating the network, +[`Optimizer`](https://thinc.ai/docs/api-optimizers). Under the hood, it uses the +settings defined in the [`[initialize]`](/api/data-formats#config-initialize) +config block to set up the vocabulary, load in vectors and tok2vec weights and +pass optional arguments to the `initialize` methods implemented by pipeline +components or the tokenizer. This method is typically called automatically when +you run [`spacy train`](/api/cli#train). + +`get_examples` should be a function that returns an iterable of +[`Example`](/api/example) objects. The data examples can either be the full +training data or a representative sample. They are used to **initialize the +models** of trainable pipeline components and are passed each component's +[`initialize`](/api/pipe#initialize) method, if available. Initialization +includes validating the network, [inferring missing shapes](/usage/layers-architectures#thinc-shape-inference) and setting up the label scheme based on the data. diff --git a/website/docs/api/morphologizer.md b/website/docs/api/morphologizer.md index 4f00a09ef..68e096ab7 100644 --- a/website/docs/api/morphologizer.md +++ b/website/docs/api/morphologizer.md @@ -119,30 +119,27 @@ applied to the `Doc` in order. Both [`__call__`](/api/morphologizer#call) and ## Morphologizer.initialize {#initialize tag="method"} -Initialize the component for training and return an -[`Optimizer`](https://thinc.ai/docs/api-optimizers). `get_examples` should be a -function that returns an iterable of [`Example`](/api/example) objects. The data -examples are used to **initialize the model** of the component and can either be -the full training data or a representative sample. Initialization includes -validating the network, +Initialize the component for training. `get_examples` should be a function that +returns an iterable of [`Example`](/api/example) objects. The data examples are +used to **initialize the model** of the component and can either be the full +training data or a representative sample. Initialization includes validating the +network, [inferring missing shapes](https://thinc.ai/docs/usage-models#validation) and -setting up the label scheme based on the data. +setting up the label scheme based on the data. 
This method is typically called +by [`Language.initialize`](/api/language#initialize). > #### Example > > ```python > morphologizer = nlp.add_pipe("morphologizer") -> nlp.pipeline.append(morphologizer) -> optimizer = morphologizer.initialize(lambda: [], pipeline=nlp.pipeline) +> morphologizer.initialize(lambda: [], nlp=nlp) > ``` | Name | Description | | -------------- | ------------------------------------------------------------------------------------------------------------------------------------- | | `get_examples` | Function that returns gold-standard annotations in the form of [`Example`](/api/example) objects. ~~Callable[[], Iterable[Example]]~~ | | _keyword-only_ | | -| `pipeline` | Optional list of pipeline components that this component is part of. ~~Optional[List[Tuple[str, Callable[[Doc], Doc]]]]~~ | -| `sgd` | An optimizer. Will be created via [`create_optimizer`](#create_optimizer) if not set. ~~Optional[Optimizer]~~ | -| **RETURNS** | The optimizer. ~~Optimizer~~ | +| `nlp` | The current `nlp` object. Defaults to `None`. ~~Optional[Language]~~ | ## Morphologizer.predict {#predict tag="method"} diff --git a/website/docs/api/pipe.md b/website/docs/api/pipe.md index 17752ed5e..385ad7ec9 100644 --- a/website/docs/api/pipe.md +++ b/website/docs/api/pipe.md @@ -100,14 +100,14 @@ applied to the `Doc` in order. Both [`__call__`](/api/pipe#call) and ## Pipe.initialize {#initialize tag="method"} -Initialize the component for training and return an -[`Optimizer`](https://thinc.ai/docs/api-optimizers). `get_examples` should be a -function that returns an iterable of [`Example`](/api/example) objects. The data -examples are used to **initialize the model** of the component and can either be -the full training data or a representative sample. Initialization includes -validating the network, +Initialize the component for training. `get_examples` should be a function that +returns an iterable of [`Example`](/api/example) objects. The data examples are +used to **initialize the model** of the component and can either be the full +training data or a representative sample. Initialization includes validating the +network, [inferring missing shapes](https://thinc.ai/docs/usage-models#validation) and -setting up the label scheme based on the data. +setting up the label scheme based on the data. This method is typically called +by [`Language.initialize`](/api/language#initialize). @@ -119,16 +119,14 @@ This method was previously called `begin_training`. > > ```python > pipe = nlp.add_pipe("your_custom_pipe") -> optimizer = pipe.initialize(lambda: [], pipeline=nlp.pipeline) +> pipe.initialize(lambda: [], pipeline=nlp.pipeline) > ``` | Name | Description | | -------------- | ------------------------------------------------------------------------------------------------------------------------------------- | | `get_examples` | Function that returns gold-standard annotations in the form of [`Example`](/api/example) objects. ~~Callable[[], Iterable[Example]]~~ | | _keyword-only_ | | -| `pipeline` | Optional list of pipeline components that this component is part of. ~~Optional[List[Tuple[str, Callable[[Doc], Doc]]]]~~ | -| `sgd` | An optimizer. Will be created via [`create_optimizer`](#create_optimizer) if not set. ~~Optional[Optimizer]~~ | -| **RETURNS** | The optimizer. ~~Optimizer~~ | +| `nlp` | The current `nlp` object. Defaults to `None`. 
~~Optional[Language]~~ | ## Pipe.predict {#predict tag="method"} diff --git a/website/docs/api/sentencerecognizer.md b/website/docs/api/sentencerecognizer.md index d81725343..ac7008465 100644 --- a/website/docs/api/sentencerecognizer.md +++ b/website/docs/api/sentencerecognizer.md @@ -116,29 +116,27 @@ and [`pipe`](/api/sentencerecognizer#pipe) delegate to the ## SentenceRecognizer.initialize {#initialize tag="method"} -Initialize the component for training and return an -[`Optimizer`](https://thinc.ai/docs/api-optimizers). `get_examples` should be a -function that returns an iterable of [`Example`](/api/example) objects. The data -examples are used to **initialize the model** of the component and can either be -the full training data or a representative sample. Initialization includes -validating the network, +Initialize the component for training. `get_examples` should be a function that +returns an iterable of [`Example`](/api/example) objects. The data examples are +used to **initialize the model** of the component and can either be the full +training data or a representative sample. Initialization includes validating the +network, [inferring missing shapes](https://thinc.ai/docs/usage-models#validation) and -setting up the label scheme based on the data. +setting up the label scheme based on the data. This method is typically called +by [`Language.initialize`](/api/language#initialize). > #### Example > > ```python > senter = nlp.add_pipe("senter") -> optimizer = senter.initialize(lambda: [], pipeline=nlp.pipeline) +> senter.initialize(lambda: [], nlp=nlp) > ``` | Name | Description | | -------------- | ------------------------------------------------------------------------------------------------------------------------------------- | | `get_examples` | Function that returns gold-standard annotations in the form of [`Example`](/api/example) objects. ~~Callable[[], Iterable[Example]]~~ | | _keyword-only_ | | -| `pipeline` | Optional list of pipeline components that this component is part of. ~~Optional[List[Tuple[str, Callable[[Doc], Doc]]]]~~ | -| `sgd` | An optimizer. Will be created via [`create_optimizer`](#create_optimizer) if not set. ~~Optional[Optimizer]~~ | -| **RETURNS** | The optimizer. ~~Optimizer~~ | +| `nlp` | The current `nlp` object. Defaults to `None`. ~~Optional[Language]~~ | ## SentenceRecognizer.predict {#predict tag="method"} diff --git a/website/docs/api/tagger.md b/website/docs/api/tagger.md index 6ca554f49..ff9763e61 100644 --- a/website/docs/api/tagger.md +++ b/website/docs/api/tagger.md @@ -114,14 +114,14 @@ applied to the `Doc` in order. Both [`__call__`](/api/tagger#call) and ## Tagger.initialize {#initialize tag="method"} -Initialize the component for training and return an -[`Optimizer`](https://thinc.ai/docs/api-optimizers). `get_examples` should be a -function that returns an iterable of [`Example`](/api/example) objects. The data -examples are used to **initialize the model** of the component and can either be -the full training data or a representative sample. Initialization includes -validating the network, +Initialize the component for training. `get_examples` should be a function that +returns an iterable of [`Example`](/api/example) objects. The data examples are +used to **initialize the model** of the component and can either be the full +training data or a representative sample. 
Initialization includes validating the +network, [inferring missing shapes](https://thinc.ai/docs/usage-models#validation) and -setting up the label scheme based on the data. +setting up the label scheme based on the data. This method is typically called +by [`Language.initialize`](/api/language#initialize). @@ -133,16 +133,14 @@ This method was previously called `begin_training`. > > ```python > tagger = nlp.add_pipe("tagger") -> optimizer = tagger.initialize(lambda: [], pipeline=nlp.pipeline) +> tagger.initialize(lambda: [], nlp=nlp) > ``` | Name | Description | | -------------- | ------------------------------------------------------------------------------------------------------------------------------------- | | `get_examples` | Function that returns gold-standard annotations in the form of [`Example`](/api/example) objects. ~~Callable[[], Iterable[Example]]~~ | | _keyword-only_ | | -| `pipeline` | Optional list of pipeline components that this component is part of. ~~Optional[List[Tuple[str, Callable[[Doc], Doc]]]]~~ | -| `sgd` | An optimizer. Will be created via [`create_optimizer`](#create_optimizer) if not set. ~~Optional[Optimizer]~~ | -| **RETURNS** | The optimizer. ~~Optimizer~~ | +| `nlp` | The current `nlp` object. Defaults to `None`. ~~Optional[Language]~~ | ## Tagger.predict {#predict tag="method"} diff --git a/website/docs/api/textcategorizer.md b/website/docs/api/textcategorizer.md index 4c99d6984..6db960ea0 100644 --- a/website/docs/api/textcategorizer.md +++ b/website/docs/api/textcategorizer.md @@ -127,14 +127,14 @@ applied to the `Doc` in order. Both [`__call__`](/api/textcategorizer#call) and ## TextCategorizer.initialize {#initialize tag="method"} -Initialize the component for training and return an -[`Optimizer`](https://thinc.ai/docs/api-optimizers). `get_examples` should be a -function that returns an iterable of [`Example`](/api/example) objects. The data -examples are used to **initialize the model** of the component and can either be -the full training data or a representative sample. Initialization includes -validating the network, +Initialize the component for training. `get_examples` should be a function that +returns an iterable of [`Example`](/api/example) objects. The data examples are +used to **initialize the model** of the component and can either be the full +training data or a representative sample. Initialization includes validating the +network, [inferring missing shapes](https://thinc.ai/docs/usage-models#validation) and -setting up the label scheme based on the data. +setting up the label scheme based on the data. This method is typically called +by [`Language.initialize`](/api/language#initialize). @@ -146,16 +146,14 @@ This method was previously called `begin_training`. > > ```python > textcat = nlp.add_pipe("textcat") -> optimizer = textcat.initialize(lambda: [], pipeline=nlp.pipeline) +> textcat.initialize(lambda: [], nlp=nlp) > ``` | Name | Description | | -------------- | ------------------------------------------------------------------------------------------------------------------------------------- | | `get_examples` | Function that returns gold-standard annotations in the form of [`Example`](/api/example) objects. ~~Callable[[], Iterable[Example]]~~ | | _keyword-only_ | | -| `pipeline` | Optional list of pipeline components that this component is part of. ~~Optional[List[Tuple[str, Callable[[Doc], Doc]]]]~~ | -| `sgd` | An optimizer. Will be created via [`create_optimizer`](#create_optimizer) if not set. 
~~Optional[Optimizer]~~ | -| **RETURNS** | The optimizer. ~~Optimizer~~ | +| `nlp` | The current `nlp` object. Defaults to `None`. ~~Optional[Language]~~ | ## TextCategorizer.predict {#predict tag="method"} diff --git a/website/docs/api/tok2vec.md b/website/docs/api/tok2vec.md index 8269ad7cf..fa6e6c689 100644 --- a/website/docs/api/tok2vec.md +++ b/website/docs/api/tok2vec.md @@ -132,22 +132,21 @@ examples are used to **initialize the model** of the component and can either be the full training data or a representative sample. Initialization includes validating the network, [inferring missing shapes](https://thinc.ai/docs/usage-models#validation) and -setting up the label scheme based on the data. +setting up the label scheme based on the data. This method is typically called +by [`Language.initialize`](/api/language#initialize). > #### Example > > ```python > tok2vec = nlp.add_pipe("tok2vec") -> optimizer = tok2vec.initialize(lambda: [], pipeline=nlp.pipeline) +> tok2vec.initialize(lambda: [], nlp=nlp) > ``` | Name | Description | | -------------- | ------------------------------------------------------------------------------------------------------------------------------------- | | `get_examples` | Function that returns gold-standard annotations in the form of [`Example`](/api/example) objects. ~~Callable[[], Iterable[Example]]~~ | | _keyword-only_ | | -| `pipeline` | Optional list of pipeline components that this component is part of. ~~Optional[List[Tuple[str, Callable[[Doc], Doc]]]]~~ | -| `sgd` | An optimizer. Will be created via [`create_optimizer`](#create_optimizer) if not set. ~~Optional[Optimizer]~~ | -| **RETURNS** | The optimizer. ~~Optimizer~~ | +| `nlp` | The current `nlp` object. Defaults to `None`. ~~Optional[Language]~~ | ## Tok2Vec.predict {#predict tag="method"} diff --git a/website/docs/api/transformer.md b/website/docs/api/transformer.md index 712214fec..938574f2e 100644 --- a/website/docs/api/transformer.md +++ b/website/docs/api/transformer.md @@ -167,22 +167,21 @@ examples are used to **initialize the model** of the component and can either be the full training data or a representative sample. Initialization includes validating the network, [inferring missing shapes](https://thinc.ai/docs/usage-models#validation) and -setting up the label scheme based on the data. +setting up the label scheme based on the data. This method is typically called +by [`Language.initialize`](/api/language#initialize). > #### Example > > ```python > trf = nlp.add_pipe("transformer") -> optimizer = trf.initialize(lambda: [], pipeline=nlp.pipeline) +> trf.initialize(lambda: [], nlp=nlp) > ``` | Name | Description | | -------------- | ------------------------------------------------------------------------------------------------------------------------------------- | | `get_examples` | Function that returns gold-standard annotations in the form of [`Example`](/api/example) objects. ~~Callable[[], Iterable[Example]]~~ | | _keyword-only_ | | -| `pipeline` | Optional list of pipeline components that this component is part of. ~~Optional[List[Tuple[str, Callable[[Doc], Doc]]]]~~ | -| `sgd` | An optimizer. Will be created via [`create_optimizer`](#create_optimizer) if not set. ~~Optional[Optimizer]~~ | -| **RETURNS** | The optimizer. ~~Optimizer~~ | +| `nlp` | The current `nlp` object. Defaults to `None`. 
~~Optional[Language]~~ | ## Transformer.predict {#predict tag="method"} From f2352eb701a3fc968b0a20ba2e143ccd995347c3 Mon Sep 17 00:00:00 2001 From: Ines Montani Date: Tue, 29 Sep 2020 17:00:40 +0200 Subject: [PATCH 58/66] Test with default value --- spacy/tests/pipeline/test_initialize.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/spacy/tests/pipeline/test_initialize.py b/spacy/tests/pipeline/test_initialize.py index b6c22ee09..c9b514770 100644 --- a/spacy/tests/pipeline/test_initialize.py +++ b/spacy/tests/pipeline/test_initialize.py @@ -60,10 +60,10 @@ def test_initialize_arguments(): assert errors[0]["type"] == "value_error.strictbool" init_cfg = { "tokenizer": {"custom": 1}, - "components": {name: {"custom1": "x", "custom2": True}}, + "components": {name: {"custom1": "x"}}, } nlp.config["initialize"].update(init_cfg) nlp.initialize(get_examples) assert nlp.tokenizer.from_initialize == 1 pipe = nlp.get_pipe(name) - assert pipe.from_initialize == ("x", True) + assert pipe.from_initialize == ("x", False) From 534e1ef49889a3da771e273e14552078490c92b2 Mon Sep 17 00:00:00 2001 From: Ines Montani Date: Tue, 29 Sep 2020 17:02:55 +0200 Subject: [PATCH 59/66] Fix template --- spacy/cli/templates/quickstart_training.jinja | 2 -- 1 file changed, 2 deletions(-) diff --git a/spacy/cli/templates/quickstart_training.jinja b/spacy/cli/templates/quickstart_training.jinja index 5e990611e..adad72995 100644 --- a/spacy/cli/templates/quickstart_training.jinja +++ b/spacy/cli/templates/quickstart_training.jinja @@ -314,8 +314,6 @@ compound = 1.001 {% endif %} [initialize] - -[initialize.vocab] {% if use_transformer or optimize == "efficiency" or not word_vectors -%} vectors = null {% else -%} From 9353a82076f4babd60b4d150b5ff9b0632eae5f8 Mon Sep 17 00:00:00 2001 From: Ines Montani Date: Tue, 29 Sep 2020 18:07:48 +0200 Subject: [PATCH 60/66] Auto-format --- spacy/util.py | 1 + 1 file changed, 1 insertion(+) diff --git a/spacy/util.py b/spacy/util.py index 948c4ab11..1e0a8e7d4 100644 --- a/spacy/util.py +++ b/spacy/util.py @@ -97,6 +97,7 @@ class registry(thinc.registry): models = catalogue.create("spacy", "models", entry_points=True) cli = catalogue.create("spacy", "cli", entry_points=True) + # We want json loading in the registry, so manually register srsly.read_json. registry.readers("srsly.read_json.v0", srsly.read_json) From dba26186efa4eb437e0f4f50d8dba395027164ec Mon Sep 17 00:00:00 2001 From: Ines Montani Date: Tue, 29 Sep 2020 18:08:02 +0200 Subject: [PATCH 61/66] Handle None default args in Cython methods --- spacy/schemas.py | 8 ++++++-- spacy/util.py | 18 ++++++++++++++++++ 2 files changed, 24 insertions(+), 2 deletions(-) diff --git a/spacy/schemas.py b/spacy/schemas.py index 658eeb574..555a505d7 100644 --- a/spacy/schemas.py +++ b/spacy/schemas.py @@ -11,6 +11,7 @@ import inspect from .attrs import NAMES from .lookups import Lookups +from .util import is_cython_func if TYPE_CHECKING: # This lets us add type hints for mypy etc. without causing circular imports @@ -93,8 +94,11 @@ def get_arg_model( continue # If no annotation is specified assume it's anything annotation = param.annotation if param.annotation != param.empty else Any - # If no default value is specified assume that it's required - default = param.default if param.default != param.empty else ... + # If no default value is specified assume that it's required. 
Cython + # functions/methods will have param.empty for default value None so we + # need to treat them differently + default_empty = None if is_cython_func(func) else ... + default = param.default if param.default != param.empty else default_empty sig_args[param.name] = (annotation, default) is_strict = strict and not has_variable sig_args["__config__"] = ArgSchemaConfig if is_strict else ArgSchemaConfigExtra diff --git a/spacy/util.py b/spacy/util.py index 1e0a8e7d4..98c2a4083 100644 --- a/spacy/util.py +++ b/spacy/util.py @@ -1310,3 +1310,21 @@ def minibatch(items, size): if len(batch) == 0: break yield list(batch) + + +def is_cython_func(func: Callable) -> bool: + """Slightly hacky check for whether a callable is implemented in Cython. + Can be used to implement slightly different behaviors, especially around + inspecting and parameter annotations. + + func (Callable): The callable to check. + RETURNS (bool): Whether the callable is Cython (probably). + """ + attr = "__reduce_cython__" + if hasattr(func, attr): # function or class instance + return True + # https://stackoverflow.com/a/55767059 + if hasattr(func, "__qualname__") and hasattr(func, "__module__"): # method + cls_func = vars(sys.modules[func.__module__])[func.__qualname__.split(".")[0]] + return hasattr(cls_func, attr) + return False From 71a0ee274a1f40c06f1c2ad077eb006aa42ac702 Mon Sep 17 00:00:00 2001 From: Ines Montani Date: Tue, 29 Sep 2020 18:09:33 +0200 Subject: [PATCH 62/66] Move init labels to init pipeline module --- spacy/cli/__init__.py | 1 - spacy/cli/init_labels.py | 43 -------------------------------------- spacy/cli/init_pipeline.py | 33 +++++++++++++++++++++++++++++ 3 files changed, 33 insertions(+), 44 deletions(-) delete mode 100644 spacy/cli/init_labels.py diff --git a/spacy/cli/__init__.py b/spacy/cli/__init__.py index c5f60adfc..7368bcef3 100644 --- a/spacy/cli/__init__.py +++ b/spacy/cli/__init__.py @@ -16,7 +16,6 @@ from .debug_model import debug_model # noqa: F401 from .evaluate import evaluate # noqa: F401 from .convert import convert # noqa: F401 from .init_pipeline import init_pipeline_cli # noqa: F401 -from .init_labels import init_labels_cli # noqa: F401 from .init_config import init_config, fill_config # noqa: F401 from .validate import validate # noqa: F401 from .project.clone import project_clone # noqa: F401 diff --git a/spacy/cli/init_labels.py b/spacy/cli/init_labels.py deleted file mode 100644 index e675901a3..000000000 --- a/spacy/cli/init_labels.py +++ /dev/null @@ -1,43 +0,0 @@ -from typing import Optional -import logging -from pathlib import Path -from wasabi import msg -import typer -import srsly - -from .. 
import util -from ..training.initialize import init_nlp, convert_vectors -from ._util import init_cli, Arg, Opt, parse_config_overrides, show_validation_error -from ._util import import_code, setup_gpu - - -@init_cli.command( - "labels", - context_settings={"allow_extra_args": True, "ignore_unknown_options": True}, -) -def init_labels_cli( - # fmt: off - ctx: typer.Context, # This is only used to read additional arguments - config_path: Path = Arg(..., help="Path to config file", exists=True), - output_path: Path = Arg(..., help="Output directory for the labels"), - code_path: Optional[Path] = Opt(None, "--code", "-c", help="Path to Python file with additional code (registered functions) to be imported"), - verbose: bool = Opt(False, "--verbose", "-V", "-VV", help="Display more information for debugging purposes"), - use_gpu: int = Opt(-1, "--gpu-id", "-g", help="GPU ID or -1 for CPU") - # fmt: on -): - if not output_path.exists(): - output_path.mkdir() - util.logger.setLevel(logging.DEBUG if verbose else logging.ERROR) - overrides = parse_config_overrides(ctx.args) - import_code(code_path) - setup_gpu(use_gpu) - with show_validation_error(config_path): - config = util.load_config(config_path, overrides=overrides) - with show_validation_error(hint_fill=False): - nlp = init_nlp(config, use_gpu=use_gpu) - for name, component in nlp.pipeline: - if getattr(component, "label_data", None) is not None: - srsly.write_json(output_path / f"{name}.json", component.label_data) - msg.good(f"Saving {name} labels to {output_path}/{name}.json") - else: - msg.info(f"No labels found for {name}") diff --git a/spacy/cli/init_pipeline.py b/spacy/cli/init_pipeline.py index ac1cdb7be..43b95cec1 100644 --- a/spacy/cli/init_pipeline.py +++ b/spacy/cli/init_pipeline.py @@ -3,6 +3,7 @@ import logging from pathlib import Path from wasabi import msg import typer +import srsly from .. 
import util from ..training.initialize import init_nlp, convert_vectors @@ -64,3 +65,35 @@ def init_pipeline_cli( nlp = init_nlp(config, use_gpu=use_gpu, silent=False) nlp.to_disk(output_path) msg.good(f"Saved initialized pipeline to {output_path}") + + +@init_cli.command( + "labels", + context_settings={"allow_extra_args": True, "ignore_unknown_options": True}, +) +def init_labels_cli( + # fmt: off + ctx: typer.Context, # This is only used to read additional arguments + config_path: Path = Arg(..., help="Path to config file", exists=True), + output_path: Path = Arg(..., help="Output directory for the labels"), + code_path: Optional[Path] = Opt(None, "--code", "-c", help="Path to Python file with additional code (registered functions) to be imported"), + verbose: bool = Opt(False, "--verbose", "-V", "-VV", help="Display more information for debugging purposes"), + use_gpu: int = Opt(-1, "--gpu-id", "-g", help="GPU ID or -1 for CPU") + # fmt: on +): + if not output_path.exists(): + output_path.mkdir() + util.logger.setLevel(logging.DEBUG if verbose else logging.ERROR) + overrides = parse_config_overrides(ctx.args) + import_code(code_path) + setup_gpu(use_gpu) + with show_validation_error(config_path): + config = util.load_config(config_path, overrides=overrides) + with show_validation_error(hint_fill=False): + nlp = init_nlp(config, use_gpu=use_gpu) + for name, component in nlp.pipeline: + if getattr(component, "label_data", None) is not None: + srsly.write_json(output_path / f"{name}.json", component.label_data) + msg.good(f"Saving {name} labels to {output_path}/{name}.json") + else: + msg.info(f"No labels found for {name}") From 78510206537eb6a7b0afe0eccfc62a84a5f96d6e Mon Sep 17 00:00:00 2001 From: Ines Montani Date: Tue, 29 Sep 2020 18:14:15 +0200 Subject: [PATCH 63/66] Update tests --- spacy/tests/pipeline/test_morphologizer.py | 2 +- spacy/tests/pipeline/test_senter.py | 2 +- spacy/tests/pipeline/test_tagger.py | 2 +- spacy/tests/pipeline/test_textcat.py | 2 +- 4 files changed, 4 insertions(+), 4 deletions(-) diff --git a/spacy/tests/pipeline/test_morphologizer.py b/spacy/tests/pipeline/test_morphologizer.py index c86ee3617..5d605f4e6 100644 --- a/spacy/tests/pipeline/test_morphologizer.py +++ b/spacy/tests/pipeline/test_morphologizer.py @@ -66,7 +66,7 @@ def test_initialize_examples(): # you shouldn't really call this more than once, but for testing it should be fine nlp.initialize() nlp.initialize(get_examples=lambda: train_examples) - with pytest.raises(TypeError): + with pytest.raises(ValueError): nlp.initialize(get_examples=lambda: None) with pytest.raises(ValueError): nlp.initialize(get_examples=train_examples) diff --git a/spacy/tests/pipeline/test_senter.py b/spacy/tests/pipeline/test_senter.py index 5d8a8be41..c64dfcbd6 100644 --- a/spacy/tests/pipeline/test_senter.py +++ b/spacy/tests/pipeline/test_senter.py @@ -40,7 +40,7 @@ def test_initialize_examples(): # you shouldn't really call this more than once, but for testing it should be fine nlp.initialize() nlp.initialize(get_examples=lambda: train_examples) - with pytest.raises(TypeError): + with pytest.raises(ValueError): nlp.initialize(get_examples=lambda: None) with pytest.raises(ValueError): nlp.initialize(get_examples=train_examples) diff --git a/spacy/tests/pipeline/test_tagger.py b/spacy/tests/pipeline/test_tagger.py index 69a6dd414..b32925d84 100644 --- a/spacy/tests/pipeline/test_tagger.py +++ b/spacy/tests/pipeline/test_tagger.py @@ -74,7 +74,7 @@ def test_initialize_examples(): # you shouldn't really call this more 
than once, but for testing it should be fine nlp.initialize() nlp.initialize(get_examples=lambda: train_examples) - with pytest.raises(TypeError): + with pytest.raises(ValueError): nlp.initialize(get_examples=lambda: None) with pytest.raises(TypeError): nlp.initialize(get_examples=lambda: train_examples[0]) diff --git a/spacy/tests/pipeline/test_textcat.py b/spacy/tests/pipeline/test_textcat.py index 2870229c8..ff36bbda9 100644 --- a/spacy/tests/pipeline/test_textcat.py +++ b/spacy/tests/pipeline/test_textcat.py @@ -121,7 +121,7 @@ def test_initialize_examples(): # you shouldn't really call this more than once, but for testing it should be fine nlp.initialize() nlp.initialize(get_examples=lambda: train_examples) - with pytest.raises(TypeError): + with pytest.raises(ValueError): nlp.initialize(get_examples=lambda: None) with pytest.raises(ValueError): nlp.initialize(get_examples=train_examples) From 0b5c72fce23bdb14f5fe2473d8bdea3d07d6609d Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Tue, 29 Sep 2020 18:30:38 +0200 Subject: [PATCH 64/66] Fix incorrect docstrings --- spacy/pipeline/morphologizer.pyx | 2 +- spacy/pipeline/tagger.pyx | 7 +------ spacy/pipeline/textcat.py | 5 +---- spacy/pipeline/transition_parser.pyx | 3 ++- 4 files changed, 5 insertions(+), 12 deletions(-) diff --git a/spacy/pipeline/morphologizer.pyx b/spacy/pipeline/morphologizer.pyx index c9798a638..60ad10a2b 100644 --- a/spacy/pipeline/morphologizer.pyx +++ b/spacy/pipeline/morphologizer.pyx @@ -103,7 +103,7 @@ class Morphologizer(Tagger): @property def label_data(self) -> Dict[str, Dict[str, Union[str, float, int, None]]]: - """RETURNS (Dict): A dictionary with all labels data.""" + """A dictionary with all labels data.""" return {"morph": self.cfg["labels_morph"], "pos": self.cfg["labels_pos"]} def add_label(self, label): diff --git a/spacy/pipeline/tagger.pyx b/spacy/pipeline/tagger.pyx index f4e8ecebd..a4f9d395f 100644 --- a/spacy/pipeline/tagger.pyx +++ b/spacy/pipeline/tagger.pyx @@ -92,12 +92,7 @@ class Tagger(Pipe): @property def label_data(self): - """Data about the labels currently added to the component. - - RETURNS (Dict): The labels data. - - DOCS: https://nightly.spacy.io/api/tagger#labels - """ + """Data about the labels currently added to the component.""" return tuple(self.cfg["labels"]) def __call__(self, doc): diff --git a/spacy/pipeline/textcat.py b/spacy/pipeline/textcat.py index d6dafa3f5..776b0a178 100644 --- a/spacy/pipeline/textcat.py +++ b/spacy/pipeline/textcat.py @@ -161,10 +161,7 @@ class TextCategorizer(Pipe): @property def label_data(self) -> List[str]: - """RETURNS (List[str]): Information about the component's labels. 
- - DOCS: https://nightly.spacy.io/api/textcategorizer#labels - """ + """RETURNS (List[str]): Information about the component's labels.""" return self.labels def pipe(self, stream: Iterable[Doc], *, batch_size: int = 128) -> Iterator[Doc]: diff --git a/spacy/pipeline/transition_parser.pyx b/spacy/pipeline/transition_parser.pyx index 11e0e5af8..bcaa8e8d4 100644 --- a/spacy/pipeline/transition_parser.pyx +++ b/spacy/pipeline/transition_parser.pyx @@ -7,6 +7,7 @@ from libcpp.vector cimport vector from libc.string cimport memset from libc.stdlib cimport calloc, free import random +from typing import Optional import srsly from thinc.api import set_dropout_rate @@ -409,7 +410,7 @@ cdef class Parser(Pipe): def set_output(self, nO): self.model.attrs["resize_output"](self.model, nO) - def initialize(self, get_examples, *, nlp=None, labels=None): + def initialize(self, get_examples, nlp=None, labels=None): self._ensure_examples(get_examples) lexeme_norms = self.vocab.lookups.get_table("lexeme_norm", {}) if len(lexeme_norms) == 0 and self.vocab.lang in util.LEXEME_NORM_LANGS: From a4da3120b4244c2599fa2b184189df52533f6fea Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Tue, 29 Sep 2020 18:33:16 +0200 Subject: [PATCH 65/66] Fix multitasks --- spacy/pipeline/dep_parser.pyx | 4 ++-- spacy/pipeline/ner.pyx | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/spacy/pipeline/dep_parser.pyx b/spacy/pipeline/dep_parser.pyx index eedb4cba9..bdef332cc 100644 --- a/spacy/pipeline/dep_parser.pyx +++ b/spacy/pipeline/dep_parser.pyx @@ -126,13 +126,13 @@ cdef class DependencyParser(Parser): def add_multitask_objective(self, mt_component): self._multitasks.append(mt_component) - def init_multitask_objectives(self, get_examples, pipeline, sgd=None, **cfg): + def init_multitask_objectives(self, get_examples, nlp=None, **cfg): # TODO: transfer self.model.get_ref("tok2vec") to the multitask's model ? for labeller in self._multitasks: labeller.model.set_dim("nO", len(self.labels)) if labeller.model.has_ref("output_layer"): labeller.model.get_ref("output_layer").set_dim("nO", len(self.labels)) - labeller.initialize(get_examples, pipeline=pipeline) + labeller.initialize(get_examples, nlp=nlp) @property def labels(self): diff --git a/spacy/pipeline/ner.pyx b/spacy/pipeline/ner.pyx index effcef2e3..6482d6125 100644 --- a/spacy/pipeline/ner.pyx +++ b/spacy/pipeline/ner.pyx @@ -96,14 +96,14 @@ cdef class EntityRecognizer(Parser): """Register another component as a multi-task objective. Experimental.""" self._multitasks.append(mt_component) - def init_multitask_objectives(self, get_examples, pipeline, sgd=None, **cfg): + def init_multitask_objectives(self, get_examples, nlp=None, **cfg): """Setup multi-task objective components. Experimental and internal.""" # TODO: transfer self.model.get_ref("tok2vec") to the multitask's model ? 
for labeller in self._multitasks: labeller.model.set_dim("nO", len(self.labels)) if labeller.model.has_ref("output_layer"): labeller.model.get_ref("output_layer").set_dim("nO", len(self.labels)) - labeller.initialize(get_examples, pipeline=pipeline) + labeller.initialize(get_examples, nlp=nlp) @property def labels(self): From 2be80379ec644c253af7d312c0a5bcad4fd3515d Mon Sep 17 00:00:00 2001 From: Ines Montani Date: Tue, 29 Sep 2020 20:38:35 +0200 Subject: [PATCH 66/66] Fix small issues, resolve_dot_names and debug model --- spacy/cli/debug_config.py | 8 +++++-- spacy/cli/debug_model.py | 32 ++++++++++++++++++---------- spacy/schemas.py | 14 ++++++------ spacy/tests/test_util.py | 4 ++-- spacy/tests/training/test_readers.py | 10 ++++----- spacy/training/initialize.py | 6 +++--- spacy/training/loop.py | 4 +--- spacy/util.py | 29 ++++++------------------- 8 files changed, 51 insertions(+), 56 deletions(-) diff --git a/spacy/cli/debug_config.py b/spacy/cli/debug_config.py index d1dcc45b9..a6c7345f0 100644 --- a/spacy/cli/debug_config.py +++ b/spacy/cli/debug_config.py @@ -7,6 +7,8 @@ import typer from ._util import Arg, Opt, show_validation_error, parse_config_overrides from ._util import import_code, debug_cli +from ..schemas import ConfigSchemaTraining +from ..util import registry from .. import util @@ -52,8 +54,10 @@ def debug_config( with show_validation_error(config_path): config = util.load_config(config_path, overrides=overrides) nlp = util.load_model_from_config(config) - dot_names = ["training.dev_corpus", "training.train_corpus"] - util.resolve_dot_names(nlp.config, dot_names) + config = nlp.config.interpolate() + T = registry.resolve(config["training"], schema=ConfigSchemaTraining) + dot_names = [T["train_corpus"], T["dev_corpus"]] + util.resolve_dot_names(config, dot_names) msg.good("Config is valid") if show_vars: variables = get_variables(config) diff --git a/spacy/cli/debug_model.py b/spacy/cli/debug_model.py index eca85dc04..3b8ba7dae 100644 --- a/spacy/cli/debug_model.py +++ b/spacy/cli/debug_model.py @@ -2,7 +2,7 @@ from typing import Dict, Any, Optional, Iterable from pathlib import Path from spacy.training import Example -from spacy.util import dot_to_object +from spacy.util import resolve_dot_names from wasabi import msg from thinc.api import fix_random_seed, set_dropout_rate, Adam from thinc.api import Model, data_validation, set_gpu_allocator @@ -15,7 +15,10 @@ from ..util import registry from .. 
import util -@debug_cli.command("model") +@debug_cli.command( + "model", + context_settings={"allow_extra_args": True, "ignore_unknown_options": True}, +) def debug_model_cli( # fmt: off ctx: typer.Context, # This is only used to read additional arguments @@ -57,15 +60,14 @@ def debug_model_cli( raw_config = util.load_config( config_path, overrides=config_overrides, interpolate=False ) - config = raw_config.iterpolate() + config = raw_config.interpolate() allocator = config["training"]["gpu_allocator"] if use_gpu >= 0 and allocator: set_gpu_allocator(allocator) with show_validation_error(config_path): nlp = util.load_model_from_config(raw_config) - T = registry.resolve( - nlp.config.interpolate()["training"], schema=ConfigSchemaTraining - ) + config = nlp.config.interpolate() + T = registry.resolve(config["training"], schema=ConfigSchemaTraining) seed = T["seed"] if seed is not None: msg.info(f"Fixing random seed: {seed}") @@ -77,11 +79,16 @@ def debug_model_cli( exits=1, ) model = pipe.model - debug_model(T, nlp, model, print_settings=print_settings) + debug_model(config, T, nlp, model, print_settings=print_settings) def debug_model( - config, nlp, model: Model, *, print_settings: Optional[Dict[str, Any]] = None + config, + resolved_train_config, + nlp, + model: Model, + *, + print_settings: Optional[Dict[str, Any]] = None, ): if not isinstance(model, Model): msg.fail( @@ -102,13 +109,16 @@ def debug_model( # The output vector might differ from the official type of the output layer with data_validation(False): try: - train_corpus = dot_to_object(config, config["training"]["train_corpus"]) - nlp.initialize(lambda: train_corpus(nlp)) + dot_names = [resolved_train_config["train_corpus"]] + with show_validation_error(): + (train_corpus,) = resolve_dot_names(config, dot_names) + nlp.initialize(lambda: train_corpus(nlp)) msg.info("Initialized the model with the training corpus.") except ValueError: try: _set_output_dim(nO=7, model=model) - nlp.initialize(lambda: [Example.from_dict(x, {}) for x in X]) + with show_validation_error(): + nlp.initialize(lambda: [Example.from_dict(x, {}) for x in X]) msg.info("Initialized the model with dummy data.") except Exception: msg.fail( diff --git a/spacy/schemas.py b/spacy/schemas.py index 555a505d7..d9a31c742 100644 --- a/spacy/schemas.py +++ b/spacy/schemas.py @@ -389,14 +389,12 @@ class ConfigSchema(BaseModel): arbitrary_types_allowed = True -class TrainingSchema(BaseModel): - training: ConfigSchemaTraining - pretraining: Union[ConfigSchemaPretrain, ConfigSchemaPretrainEmpty] = {} - corpora: Dict[str, Reader] - - class Config: - extra = "allow" - arbitrary_types_allowed = True +CONFIG_SCHEMAS = { + "nlp": ConfigSchemaNlp, + "training": ConfigSchemaTraining, + "pretraining": ConfigSchemaPretrain, + "initialize": ConfigSchemaInit, +} # Project config Schema diff --git a/spacy/tests/test_util.py b/spacy/tests/test_util.py index f48cfba00..f710a38eb 100644 --- a/spacy/tests/test_util.py +++ b/spacy/tests/test_util.py @@ -128,10 +128,10 @@ def test_resolve_dot_names(): "training": {"optimizer": {"@optimizers": "Adam.v1"}}, "foo": {"bar": "training.optimizer", "baz": "training.xyz"}, } - result = util.resolve_dot_names(config, ["foo.bar"]) + result = util.resolve_dot_names(config, ["training.optimizer"]) assert isinstance(result[0], Optimizer) with pytest.raises(ConfigValidationError) as e: - util.resolve_dot_names(config, ["foo.baz", "foo.bar"]) + util.resolve_dot_names(config, ["training.xyz", "training.optimizer"]) errors = e.value.errors assert len(errors) 
== 1 assert errors[0]["loc"] == ["training", "xyz"] diff --git a/spacy/tests/training/test_readers.py b/spacy/tests/training/test_readers.py index ea39e8b90..9d82ca50a 100644 --- a/spacy/tests/training/test_readers.py +++ b/spacy/tests/training/test_readers.py @@ -39,12 +39,12 @@ def test_readers(): config = Config().from_str(config_string) nlp = load_model_from_config(config, auto_fill=True) - dot_names = ["training.train_corpus", "training.dev_corpus"] - train_corpus, dev_corpus = resolve_dot_names(nlp.config, dot_names) - assert isinstance(train_corpus, Callable) T = registry.resolve( nlp.config.interpolate()["training"], schema=ConfigSchemaTraining ) + dot_names = [T["train_corpus"], T["dev_corpus"]] + train_corpus, dev_corpus = resolve_dot_names(nlp.config, dot_names) + assert isinstance(train_corpus, Callable) optimizer = T["optimizer"] # simulate a training loop nlp.initialize(lambda: train_corpus(nlp), sgd=optimizer) @@ -92,11 +92,11 @@ def test_cat_readers(reader, additional_config): config["corpora"]["@readers"] = reader config["corpora"].update(additional_config) nlp = load_model_from_config(config, auto_fill=True) - dot_names = ["training.train_corpus", "training.dev_corpus"] - train_corpus, dev_corpus = resolve_dot_names(nlp.config, dot_names) T = registry.resolve( nlp.config["training"].interpolate(), schema=ConfigSchemaTraining ) + dot_names = [T["train_corpus"], T["dev_corpus"]] + train_corpus, dev_corpus = resolve_dot_names(nlp.config, dot_names) optimizer = T["optimizer"] # simulate a training loop nlp.initialize(lambda: train_corpus(nlp), sgd=optimizer) diff --git a/spacy/training/initialize.py b/spacy/training/initialize.py index aa5edde5d..09ac2b0ac 100644 --- a/spacy/training/initialize.py +++ b/spacy/training/initialize.py @@ -130,12 +130,12 @@ def init_tok2vec( init_tok2vec = ensure_path(I["init_tok2vec"]) if init_tok2vec is not None: if P["objective"].get("type") == "vectors" and not I["vectors"]: - err = 'need initialize.vocab.vectors if pretraining.objective.type is "vectors"' - errors = [{"loc": ["initialize", "vocab"], "msg": err}] + err = 'need initialize.vectors if pretraining.objective.type is "vectors"' + errors = [{"loc": ["initialize"], "msg": err}] raise ConfigValidationError(config=nlp.config, errors=errors) if not init_tok2vec.exists(): err = f"can't find pretrained tok2vec: {init_tok2vec}" - errors = [{"loc": ["initialize", "vocab", "init_tok2vec"], "msg": err}] + errors = [{"loc": ["initialize", "init_tok2vec"], "msg": err}] raise ConfigValidationError(config=nlp.config, errors=errors) with init_tok2vec.open("rb") as file_: weights_data = file_.read() diff --git a/spacy/training/loop.py b/spacy/training/loop.py index 41e6464e0..e20cddd3e 100644 --- a/spacy/training/loop.py +++ b/spacy/training/loop.py @@ -29,9 +29,7 @@ def train( output_path (Path): Optional output path to save trained model to. use_gpu (int): Whether to train on GPU. Make sure to call require_gpu before calling this function. - logger (Callable[[Any], Any]): Optional logger exposing the methods info, - error, debug and warn. Defaults to regular spaCy logger but can be - swapped for CLI logger. + silent (bool): Whether to pretty-print outputs. RETURNS (Path / None): The path to the final exported model. 
""" msg = Printer(no_print=silent) diff --git a/spacy/util.py b/spacy/util.py index 98c2a4083..2dfd00e2f 100644 --- a/spacy/util.py +++ b/spacy/util.py @@ -392,7 +392,6 @@ def resolve_dot_names(config: Config, dot_names: List[Optional[str]]) -> Tuple[A we could find the lowest part of the tree. """ # TODO: include schema? - # TODO: clean this up and avoid duplication resolved = {} output = [] errors = [] @@ -403,34 +402,20 @@ def resolve_dot_names(config: Config, dot_names: List[Optional[str]]) -> Tuple[A section = name.split(".")[0] # We want to avoid resolving the same thing twice if section not in resolved: - resolved[section] = registry.resolve(config[section]) + if registry.is_promise(config[section]): + # Otherwise we can't resolve [corpus] if it's a promise + result = registry.resolve({"config": config[section]})["config"] + else: + result = registry.resolve(config[section]) + resolved[section] = result try: output.append(dot_to_object(resolved, name)) except KeyError: msg = f"not a valid section reference: {name}" errors.append({"loc": name.split("."), "msg": msg}) - objects = [] - for ref in output: - if not isinstance(ref, str): - objects.append(ref) - continue - section = ref.split(".")[0] - # We want to avoid resolving the same thing twice - if section not in resolved: - if registry.is_promise(config[section]): - # Otherwise we can't resolve [corpus] if it's a promise - result = registry.resolve({"config": config[section]})["config"] - else: - result = registry.resolve(config[section]) - resolved[section] = result - try: - objects.append(dot_to_object(resolved, ref)) - except KeyError: - msg = f"not a valid section reference: {name}" - errors.append({"loc": ref.split("."), "msg": msg}) if errors: raise ConfigValidationError(config=config, errors=errors) - return tuple(objects) + return tuple(output) def load_model_from_init_py(