From 06f0a8daa0b919edbafa966db42fc74dce5cab02 Mon Sep 17 00:00:00 2001 From: Sofie Van Landeghem Date: Thu, 27 Feb 2020 18:42:27 +0100 Subject: [PATCH] Default settings to configurations (#4995) * fix grad_clip naming * cleaning up pretrained_vectors out of cfg * further refactoring Model init's * move Model building out of pipes * further refactor to require a model config when creating a pipe * small fixes * making cfg in nn_parser more consistent * fixing nr_class for parser * fixing nn_parser's nO * fix printing of loss * architectures in own file per type, consistent naming * convenience methods default_tagger_config and default_tok2vec_config * let create_pipe access default config if available for that component * default_parser_config * move defaults to separate folder * allow reading nlp from package or dir with argument 'name' * architecture spacy.VocabVectors.v1 to read static vectors from file * cleanup * default configs for nel, textcat, morphologizer, tensorizer * fix imports * fixing unit tests * fixes and clean up * fixing defaults, nO, fix unit tests * restore parser IO * fix IO * 'fix' serialization test * add *.cfg to manifest * fix example configs with additional arguments * replace Morphologizer with Tagger * add IO bit when testing overfitting of tagger (currently failing) * fix IO - don't initialize when reading from disk * expand overfitting tests to also check IO goes OK * remove dropout from HashEmbed to fix Tagger performance * add defaults for sentrec * update thinc * always pass a Model instance to a Pipe * fix pipes_added statement * remove obsolete W029 * remove obsolete errors * restore byte checking tests (work again) * clean up test * further test cleanup * convert from config to Model in create_pipe * bring back error when component is not initialized * cleanup * remove calls for nlp2.begin_training * use thinc.api in imports * allow setting charembed's nM and nC * fix for hardcoded nM/nC + unit test * formatting fixes * trigger 
build --- MANIFEST.in | 2 +- bin/ud/ud_train.py | 4 +- bin/wiki_entity_linking/train_descriptions.py | 6 +- .../ptb-joint-pos-dep/bilstm_tok2vec.cfg | 12 +- .../ptb-joint-pos-dep/defaults.cfg | 11 +- examples/training/pretrain_textcat.py | 17 +- examples/training/train_textcat.py | 2 +- pyproject.toml | 2 +- requirements.txt | 2 +- setup.cfg | 4 +- spacy/cli/pretrain.py | 17 +- spacy/cli/train.py | 88 ++-- spacy/cli/train_from_config.py | 139 ++----- spacy/errors.py | 15 +- spacy/language.py | 75 +++- spacy/ml/component_models.py | 227 ---------- spacy/ml/models/__init__.py | 6 + spacy/ml/models/defaults/__init__.py | 93 +++++ .../defaults/entity_linker_defaults.cfg | 12 + .../defaults/morphologizer_defaults.cfg | 14 + spacy/ml/models/defaults/ner_defaults.cfg | 15 + spacy/ml/models/defaults/parser_defaults.cfg | 15 + spacy/ml/models/defaults/sentrec_defaults.cfg | 14 + spacy/ml/models/defaults/tagger_defaults.cfg | 12 + .../models/defaults/tensorizer_defaults.cfg | 4 + spacy/ml/models/defaults/textcat_defaults.cfg | 13 + spacy/ml/models/defaults/tok2vec_defaults.cfg | 9 + spacy/ml/models/entity_linker.py | 23 ++ spacy/ml/models/multi_task.py | 29 ++ spacy/ml/models/parser.py | 33 ++ spacy/ml/models/tagger.py | 16 + spacy/ml/models/tensorizer.py | 10 + spacy/ml/models/textcat.py | 42 ++ spacy/ml/models/tok2vec.py | 390 ++++++++++++++++++ spacy/ml/tok2vec.py | 178 -------- spacy/pipeline/entityruler.py | 2 +- spacy/pipeline/hooks.py | 2 - spacy/pipeline/morphologizer.pyx | 26 +- spacy/pipeline/pipes.pyx | 356 +++------------- spacy/pipeline/tok2vec.py | 21 +- spacy/syntax/_parser_model.pyx | 65 ++- spacy/syntax/nn_parser.pyx | 221 +++------- spacy/tests/doc/test_add_entities.py | 5 +- spacy/tests/parser/test_add_label.py | 18 +- spacy/tests/parser/test_arc_eager_oracle.py | 4 +- spacy/tests/parser/test_ner.py | 57 +-- spacy/tests/parser/test_neural_parser.py | 20 +- spacy/tests/parser/test_nn_beam.py | 3 +- spacy/tests/parser/test_parse.py | 15 +- 
spacy/tests/parser/test_preset_sbd.py | 4 +- spacy/tests/pipeline/test_analysis.py | 3 +- spacy/tests/pipeline/test_tagger.py | 19 +- spacy/tests/pipeline/test_textcat.py | 22 +- spacy/tests/regression/test_issue1501-2000.py | 8 +- spacy/tests/regression/test_issue2001-2500.py | 3 + spacy/tests/regression/test_issue3001-3500.py | 4 +- spacy/tests/regression/test_issue3830.py | 6 +- spacy/tests/regression/test_issue4042.py | 3 +- spacy/tests/regression/test_issue4313.py | 3 +- .../tests/serialize/test_serialize_config.py | 126 ++++++ .../serialize/test_serialize_language.py | 3 +- .../serialize/test_serialize_pipeline.py | 71 ++-- spacy/tests/test_tok2vec.py | 36 +- spacy/util.py | 47 ++- 64 files changed, 1511 insertions(+), 1213 deletions(-) delete mode 100644 spacy/ml/component_models.py create mode 100644 spacy/ml/models/__init__.py create mode 100644 spacy/ml/models/defaults/__init__.py create mode 100644 spacy/ml/models/defaults/entity_linker_defaults.cfg create mode 100644 spacy/ml/models/defaults/morphologizer_defaults.cfg create mode 100644 spacy/ml/models/defaults/ner_defaults.cfg create mode 100644 spacy/ml/models/defaults/parser_defaults.cfg create mode 100644 spacy/ml/models/defaults/sentrec_defaults.cfg create mode 100644 spacy/ml/models/defaults/tagger_defaults.cfg create mode 100644 spacy/ml/models/defaults/tensorizer_defaults.cfg create mode 100644 spacy/ml/models/defaults/textcat_defaults.cfg create mode 100644 spacy/ml/models/defaults/tok2vec_defaults.cfg create mode 100644 spacy/ml/models/entity_linker.py create mode 100644 spacy/ml/models/multi_task.py create mode 100644 spacy/ml/models/parser.py create mode 100644 spacy/ml/models/tagger.py create mode 100644 spacy/ml/models/tensorizer.py create mode 100644 spacy/ml/models/textcat.py create mode 100644 spacy/ml/models/tok2vec.py create mode 100644 spacy/tests/serialize/test_serialize_config.py diff --git a/MANIFEST.in b/MANIFEST.in index 1947b9140..e6d25284f 100644 --- a/MANIFEST.in +++ 
b/MANIFEST.in @@ -1,5 +1,5 @@ recursive-include include *.h -recursive-include spacy *.txt *.pyx *.pxd +recursive-include spacy *.pyx *.pxd *.txt *.cfg include LICENSE include README.md include bin/spacy diff --git a/bin/ud/ud_train.py b/bin/ud/ud_train.py index bda22088d..aa5050f3a 100644 --- a/bin/ud/ud_train.py +++ b/bin/ud/ud_train.py @@ -386,8 +386,8 @@ def _load_pretrained_tok2vec(nlp, loc): weights_data = file_.read() loaded = [] for name, component in nlp.pipeline: - if hasattr(component, "model") and hasattr(component.model, "tok2vec"): - component.tok2vec.from_bytes(weights_data) + if hasattr(component, "model") and component.model.has_ref("tok2vec"): + component.get_ref("tok2vec").from_bytes(weights_data) loaded.append(name) return loaded diff --git a/bin/wiki_entity_linking/train_descriptions.py b/bin/wiki_entity_linking/train_descriptions.py index d98bba565..b0cfbb4c6 100644 --- a/bin/wiki_entity_linking/train_descriptions.py +++ b/bin/wiki_entity_linking/train_descriptions.py @@ -1,13 +1,9 @@ -# coding: utf-8 from random import shuffle import logging import numpy as np -from thinc.model import Model -from thinc.api import chain -from thinc.loss import CosineDistance -from thinc.layers import Linear +from thinc.api import Model, chain, CosineDistance, Linear from spacy.util import create_default_optimizer diff --git a/examples/experiments/ptb-joint-pos-dep/bilstm_tok2vec.cfg b/examples/experiments/ptb-joint-pos-dep/bilstm_tok2vec.cfg index 8cd150868..4f1a915c5 100644 --- a/examples/experiments/ptb-joint-pos-dep/bilstm_tok2vec.cfg +++ b/examples/experiments/ptb-joint-pos-dep/bilstm_tok2vec.cfg @@ -39,25 +39,27 @@ factory = "tagger" factory = "parser" [nlp.pipeline.tagger.model] -@architectures = "tagger_model.v1" +@architectures = "spacy.Tagger.v1" [nlp.pipeline.tagger.model.tok2vec] -@architectures = "tok2vec_tensors.v1" +@architectures = "spacy.Tok2VecTensors.v1" width = ${nlp.pipeline.tok2vec.model:width} [nlp.pipeline.parser.model] -@architectures = 
"transition_based_parser.v1" +@architectures = "spacy.TransitionBasedParser.v1" nr_feature_tokens = 8 hidden_width = 64 maxout_pieces = 3 [nlp.pipeline.parser.model.tok2vec] -@architectures = "tok2vec_tensors.v1" +@architectures = "spacy.Tok2VecTensors.v1" width = ${nlp.pipeline.tok2vec.model:width} [nlp.pipeline.tok2vec.model] -@architectures = "hash_embed_bilstm.v1" +@architectures = "spacy.HashEmbedBiLSTM.v1" pretrained_vectors = ${nlp:vectors} width = 96 depth = 4 embed_size = 2000 +subword_features = true +char_embed = false diff --git a/examples/experiments/ptb-joint-pos-dep/defaults.cfg b/examples/experiments/ptb-joint-pos-dep/defaults.cfg index 6735284a7..2ceaab0be 100644 --- a/examples/experiments/ptb-joint-pos-dep/defaults.cfg +++ b/examples/experiments/ptb-joint-pos-dep/defaults.cfg @@ -39,27 +39,28 @@ factory = "tagger" factory = "parser" [nlp.pipeline.tagger.model] -@architectures = "tagger_model.v1" +@architectures = "spacy.Tagger.v1" [nlp.pipeline.tagger.model.tok2vec] -@architectures = "tok2vec_tensors.v1" +@architectures = "spacy.Tok2VecTensors.v1" width = ${nlp.pipeline.tok2vec.model:width} [nlp.pipeline.parser.model] -@architectures = "transition_based_parser.v1" +@architectures = "spacy.TransitionBasedParser.v1" nr_feature_tokens = 8 hidden_width = 64 maxout_pieces = 3 [nlp.pipeline.parser.model.tok2vec] -@architectures = "tok2vec_tensors.v1" +@architectures = "spacy.Tok2VecTensors.v1" width = ${nlp.pipeline.tok2vec.model:width} [nlp.pipeline.tok2vec.model] -@architectures = "hash_embed_cnn.v1" +@architectures = "spacy.HashEmbedCNN.v1" pretrained_vectors = ${nlp:vectors} width = 96 depth = 4 window_size = 1 embed_size = 2000 maxout_pieces = 3 +subword_features = true diff --git a/examples/training/pretrain_textcat.py b/examples/training/pretrain_textcat.py index 85d36fd66..0aefec9ef 100644 --- a/examples/training/pretrain_textcat.py +++ b/examples/training/pretrain_textcat.py @@ -20,9 +20,9 @@ import random import ml_datasets import spacy -from 
spacy.util import minibatch, use_gpu, compounding +from spacy.util import minibatch from spacy.pipeline import TextCategorizer -from spacy.ml.tok2vec import Tok2Vec +from spacy.ml.models.tok2vec import build_Tok2Vec_model import numpy @@ -65,9 +65,7 @@ def prefer_gpu(): def build_textcat_model(tok2vec, nr_class, width): - from thinc.model import Model - from thinc.layers import Softmax, chain, reduce_mean - from thinc.layers import list2ragged + from thinc.api import Model, Softmax, chain, reduce_mean, list2ragged with Model.define_operators({">>": chain}): model = ( @@ -76,7 +74,7 @@ def build_textcat_model(tok2vec, nr_class, width): >> reduce_mean() >> Softmax(nr_class, width) ) - model.tok2vec = tok2vec + model.set_ref("tok2vec", tok2vec) return model @@ -97,8 +95,9 @@ def create_pipeline(width, embed_size, vectors_model): textcat = TextCategorizer( nlp.vocab, labels=["POSITIVE", "NEGATIVE"], + # TODO: replace with config version model=build_textcat_model( - Tok2Vec(width=width, embed_size=embed_size), 2, width + build_Tok2Vec_model(width=width, embed_size=embed_size), 2, width ), ) @@ -121,7 +120,7 @@ def train_tensorizer(nlp, texts, dropout, n_iter): def train_textcat(nlp, n_texts, n_iter=10): textcat = nlp.get_pipe("textcat") - tok2vec_weights = textcat.model.tok2vec.to_bytes() + tok2vec_weights = textcat.model.get_ref("tok2vec").to_bytes() (train_texts, train_cats), (dev_texts, dev_cats) = load_textcat_data(limit=n_texts) print( "Using {} examples ({} training, {} evaluation)".format( @@ -135,7 +134,7 @@ def train_textcat(nlp, n_texts, n_iter=10): other_pipes = [pipe for pipe in nlp.pipe_names if pipe not in pipe_exceptions] with nlp.disable_pipes(*other_pipes): # only train textcat optimizer = nlp.begin_training() - textcat.model.tok2vec.from_bytes(tok2vec_weights) + textcat.model.get_ref("tok2vec").from_bytes(tok2vec_weights) print("Training the model...") print("{:^5}\t{:^5}\t{:^5}\t{:^5}".format("LOSS", "P", "R", "F")) for i in range(n_iter): diff --git 
a/examples/training/train_textcat.py b/examples/training/train_textcat.py index 4d402e04d..50c852ac1 100644 --- a/examples/training/train_textcat.py +++ b/examples/training/train_textcat.py @@ -74,7 +74,7 @@ def main(model=None, output_dir=None, n_iter=20, n_texts=2000, init_tok2vec=None optimizer = nlp.begin_training() if init_tok2vec is not None: with init_tok2vec.open("rb") as file_: - textcat.model.tok2vec.from_bytes(file_.read()) + textcat.model.get_ref("tok2vec").from_bytes(file_.read()) print("Training the model...") print("{:^5}\t{:^5}\t{:^5}\t{:^5}".format("LOSS", "P", "R", "F")) batch_sizes = compounding(4.0, 32.0, 1.001) diff --git a/pyproject.toml b/pyproject.toml index 71e523c7c..ee28d5d42 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -6,7 +6,7 @@ requires = [ "cymem>=2.0.2,<2.1.0", "preshed>=3.0.2,<3.1.0", "murmurhash>=0.28.0,<1.1.0", - "thinc==8.0.0a0", + "thinc==8.0.0a1", "blis>=0.4.0,<0.5.0" ] build-backend = "setuptools.build_meta" diff --git a/requirements.txt b/requirements.txt index f3a7cc162..09998cdc9 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,7 +1,7 @@ # Our libraries cymem>=2.0.2,<2.1.0 preshed>=3.0.2,<3.1.0 -thinc==8.0.0a0 +thinc==8.0.0a1 blis>=0.4.0,<0.5.0 ml_datasets>=0.1.1 murmurhash>=0.28.0,<1.1.0 diff --git a/setup.cfg b/setup.cfg index 980269c35..7b3a468b6 100644 --- a/setup.cfg +++ b/setup.cfg @@ -36,13 +36,13 @@ setup_requires = cymem>=2.0.2,<2.1.0 preshed>=3.0.2,<3.1.0 murmurhash>=0.28.0,<1.1.0 - thinc==8.0.0a0 + thinc==8.0.0a1 install_requires = # Our libraries murmurhash>=0.28.0,<1.1.0 cymem>=2.0.2,<2.1.0 preshed>=3.0.2,<3.1.0 - thinc==8.0.0a0 + thinc==8.0.0a1 blis>=0.4.0,<0.5.0 wasabi>=0.4.0,<1.1.0 srsly>=2.0.0,<3.0.0 diff --git a/spacy/cli/pretrain.py b/spacy/cli/pretrain.py index 690e3107d..95d549254 100644 --- a/spacy/cli/pretrain.py +++ b/spacy/cli/pretrain.py @@ -11,10 +11,10 @@ import srsly from ..gold import Example from ..errors import Errors +from ..ml.models.multi_task import 
build_masked_language_model from ..tokens import Doc from ..attrs import ID, HEAD -from ..ml.component_models import Tok2Vec -from ..ml.component_models import masked_language_model +from ..ml.models.tok2vec import build_Tok2Vec_model from .. import util from ..util import create_default_optimizer from .train import _load_pretrained_tok2vec @@ -108,14 +108,19 @@ def pretrain( pretrained_vectors = None if not use_vectors else nlp.vocab.vectors model = create_pretraining_model( nlp, - Tok2Vec( + # TODO: replace with config + build_Tok2Vec_model( width, embed_rows, conv_depth=conv_depth, pretrained_vectors=pretrained_vectors, bilstm_depth=bilstm_depth, # Requires PyTorch. Experimental. subword_features=not use_chars, # Set to False for Chinese etc - cnn_maxout_pieces=cnn_pieces, # If set to 1, use Mish activation. + maxout_pieces=cnn_pieces, # If set to 1, use Mish activation. + window_size=1, + char_embed=False, + nM=64, + nC=8 ), ) # Load in pretrained weights @@ -152,7 +157,7 @@ def pretrain( is_temp_str = ".temp" if is_temp else "" with model.use_params(optimizer.averages): with (output_dir / f"model{epoch}{is_temp_str}.bin").open("wb") as file_: - file_.write(model.tok2vec.to_bytes()) + file_.write(model.get_ref("tok2vec").to_bytes()) log = { "nr_word": tracker.nr_word, "loss": tracker.loss, @@ -284,7 +289,7 @@ def create_pretraining_model(nlp, tok2vec): # "tok2vec" has to be the same set of processes as what the components do. 
tok2vec = chain(tok2vec, list2array()) model = chain(tok2vec, output_layer) - model = masked_language_model(nlp.vocab, model) + model = build_masked_language_model(nlp.vocab, model) model.set_ref("tok2vec", tok2vec) model.set_ref("output_layer", output_layer) model.initialize(X=[nlp.make_doc("Give it a doc to infer shapes")]) diff --git a/spacy/cli/train.py b/spacy/cli/train.py index 92f94b53d..5667bb905 100644 --- a/spacy/cli/train.py +++ b/spacy/cli/train.py @@ -9,7 +9,7 @@ from wasabi import msg import contextlib import random -from ..util import create_default_optimizer +from ..util import create_default_optimizer, registry from ..util import use_gpu as set_gpu from ..attrs import PROB, IS_OOV, CLUSTER, LANG from ..gold import GoldCorpus @@ -111,6 +111,8 @@ def train( eval_beam_widths.sort() has_beam_widths = eval_beam_widths != [1] + default_dir = Path(__file__).parent.parent / "ml" / "models" / "defaults" + # Set up the base model and pipeline. If a base model is specified, load # the model and make sure the pipeline matches the pipeline setting. If # training starts from a blank model, intitalize the language class. @@ -118,7 +120,6 @@ def train( msg.text(f"Training pipeline: {pipeline}") disabled_pipes = None pipes_added = False - msg.text(f"Training pipeline: {pipeline}") if use_gpu >= 0: activated_gpu = None try: @@ -140,16 +141,36 @@ def train( f"specified as `lang` argument ('{lang}') ", exits=1, ) + if vectors: + msg.text(f"Loading vectors from model '{vectors}'") + + nlp.disable_pipes([p for p in nlp.pipe_names if p not in pipeline]) for pipe in pipeline: - pipe_cfg = {} + # first, create the model. 
+ # Bit of a hack after the refactor to get the vectors into a default config + # use train-from-config instead :-) if pipe == "parser": - pipe_cfg = {"learn_tokens": learn_tokens} + config_loc = default_dir / "parser_defaults.cfg" + elif pipe == "tagger": + config_loc = default_dir / "tagger_defaults.cfg" + elif pipe == "ner": + config_loc = default_dir / "ner_defaults.cfg" elif pipe == "textcat": - pipe_cfg = { - "exclusive_classes": not textcat_multilabel, - "architecture": textcat_arch, - "positive_label": textcat_positive_label, - } + config_loc = default_dir / "textcat_defaults.cfg" + else: + raise ValueError(f"Component {pipe} currently not supported.") + pipe_cfg = util.load_config(config_loc, create_objects=False) + if vectors: + pretrained_config = {'@architectures': 'spacy.VocabVectors.v1', 'name': vectors} + pipe_cfg["model"]["tok2vec"]["pretrained_vectors"] = pretrained_config + + if pipe == "parser": + pipe_cfg["learn_tokens"] = learn_tokens + elif pipe == "textcat": + pipe_cfg["exclusive_classes"] = not textcat_multilabel + pipe_cfg["architecture"] = textcat_arch + pipe_cfg["positive_label"] = textcat_positive_label + if pipe not in nlp.pipe_names: msg.text(f"Adding component to base model '{pipe}'") nlp.add_pipe(nlp.create_pipe(pipe, config=pipe_cfg)) @@ -181,26 +202,42 @@ def train( msg.text(f"Starting with blank model '{lang}'") lang_cls = util.get_lang_class(lang) nlp = lang_cls() + + if vectors: + msg.text(f"Loading vectors from model '{vectors}'") + for pipe in pipeline: + # first, create the model. 
+ # Bit of a hack after the refactor to get the vectors into a default config + # use train-from-config instead :-) if pipe == "parser": - pipe_cfg = {"learn_tokens": learn_tokens} + config_loc = default_dir / "parser_defaults.cfg" + elif pipe == "tagger": + config_loc = default_dir / "tagger_defaults.cfg" + elif pipe == "ner": + config_loc = default_dir / "ner_defaults.cfg" elif pipe == "textcat": - pipe_cfg = { - "exclusive_classes": not textcat_multilabel, - "architecture": textcat_arch, - "positive_label": textcat_positive_label, - } + config_loc = default_dir / "textcat_defaults.cfg" else: - pipe_cfg = {} - nlp.add_pipe(nlp.create_pipe(pipe, config=pipe_cfg)) + raise ValueError(f"Component {pipe} currently not supported.") + pipe_cfg = util.load_config(config_loc, create_objects=False) + if vectors: + pretrained_config = {'@architectures': 'spacy.VocabVectors.v1', 'name': vectors} + pipe_cfg["model"]["tok2vec"]["pretrained_vectors"] = pretrained_config + + if pipe == "parser": + pipe_cfg["learn_tokens"] = learn_tokens + elif pipe == "textcat": + pipe_cfg["exclusive_classes"] = not textcat_multilabel + pipe_cfg["architecture"] = textcat_arch + pipe_cfg["positive_label"] = textcat_positive_label + + pipe = nlp.create_pipe(pipe, config=pipe_cfg) + nlp.add_pipe(pipe) # Update tag map with provided mapping nlp.vocab.morphology.tag_map.update(tag_map) - if vectors: - msg.text(f"Loading vector from model '{vectors}'") - _load_vectors(nlp, vectors) - # Multitask objectives multitask_options = [("parser", parser_multitasks), ("ner", entity_multitasks)] for pipe_name, multitasks in multitask_options: @@ -228,7 +265,7 @@ def train( optimizer = nlp.begin_training(lambda: corpus.train_examples, **cfg) nlp._optimizer = None - # Load in pretrained weights + # Load in pretrained weights (TODO: this may be broken in the config rewrite) if init_tok2vec is not None: components = _load_pretrained_tok2vec(nlp, init_tok2vec) msg.text(f"Loaded pretrained tok2vec for: {components}") 
@@ -531,7 +568,7 @@ def _create_progress_bar(total): def _load_vectors(nlp, vectors): - util.load_model(vectors, vocab=nlp.vocab) + loaded_model = util.load_model(vectors, vocab=nlp.vocab) for lex in nlp.vocab: values = {} for attr, func in nlp.vocab.lex_attr_getters.items(): @@ -541,6 +578,7 @@ def _load_vectors(nlp, vectors): values[lex.vocab.strings[attr]] = func(lex.orth_) lex.set_attrs(**values) lex.is_oov = False + return loaded_model def _load_pretrained_tok2vec(nlp, loc): @@ -551,8 +589,8 @@ def _load_pretrained_tok2vec(nlp, loc): weights_data = file_.read() loaded = [] for name, component in nlp.pipeline: - if hasattr(component, "model") and hasattr(component.model, "tok2vec"): - component.tok2vec.from_bytes(weights_data) + if hasattr(component, "model") and component.model.has_ref("tok2vec"): + component.get_ref("tok2vec").from_bytes(weights_data) loaded.append(name) return loaded diff --git a/spacy/cli/train_from_config.py b/spacy/cli/train_from_config.py index 9150da356..0dba8a962 100644 --- a/spacy/cli/train_from_config.py +++ b/spacy/cli/train_from_config.py @@ -1,19 +1,17 @@ from typing import Optional, Dict, List, Union, Sequence +from pydantic import BaseModel, FilePath, StrictInt + import plac -from wasabi import msg +import tqdm from pathlib import Path + +from wasabi import msg import thinc import thinc.schedules from thinc.api import Model -from pydantic import BaseModel, FilePath, StrictInt -import tqdm -# TODO: relative imports? -import spacy -from spacy.gold import GoldCorpus -from spacy.pipeline.tok2vec import Tok2VecListener -from spacy.ml import component_models -from spacy import util +from ..gold import GoldCorpus +from .. 
import util registry = util.registry @@ -57,23 +55,24 @@ factory = "tok2vec" factory = "ner" [nlp.pipeline.ner.model] -@architectures = "transition_based_ner.v1" +@architectures = "spacy.TransitionBasedParser.v1" nr_feature_tokens = 3 hidden_width = 64 maxout_pieces = 3 [nlp.pipeline.ner.model.tok2vec] -@architectures = "tok2vec_tensors.v1" +@architectures = "spacy.Tok2VecTensors.v1" width = ${nlp.pipeline.tok2vec.model:width} [nlp.pipeline.tok2vec.model] -@architectures = "hash_embed_cnn.v1" +@architectures = "spacy.HashEmbedCNN.v1" pretrained_vectors = ${nlp:vectors} width = 128 depth = 4 window_size = 1 embed_size = 10000 maxout_pieces = 3 +subword_features = true """ @@ -113,65 +112,6 @@ class ConfigSchema(BaseModel): extra = "allow" -# Of course, these would normally decorate the functions where they're defined. -# But for now... -@registry.architectures.register("hash_embed_cnn.v1") -def hash_embed_cnn( - pretrained_vectors, width, depth, embed_size, maxout_pieces, window_size -): - return component_models.Tok2Vec( - width=width, - embed_size=embed_size, - pretrained_vectors=pretrained_vectors, - conv_depth=depth, - cnn_maxout_pieces=maxout_pieces, - bilstm_depth=0, - window_size=window_size, - ) - - -@registry.architectures.register("hash_embed_bilstm.v1") -def hash_embed_bilstm_v1(pretrained_vectors, width, depth, embed_size): - return component_models.Tok2Vec( - width=width, - embed_size=embed_size, - pretrained_vectors=pretrained_vectors, - bilstm_depth=depth, - conv_depth=0, - cnn_maxout_pieces=0, - ) - - -@registry.architectures.register("tagger_model.v1") -def build_tagger_model_v1(tok2vec): - return component_models.build_tagger_model(nr_class=None, tok2vec=tok2vec) - - -@registry.architectures.register("transition_based_parser.v1") -def create_tb_parser_model( - tok2vec: Model, - nr_feature_tokens: StrictInt = 3, - hidden_width: StrictInt = 64, - maxout_pieces: StrictInt = 3, -): - from thinc.api import Linear, chain, list2array, use_ops, zero_init - 
from spacy.ml._layers import PrecomputableAffine - from spacy.syntax._parser_model import ParserModel - - token_vector_width = tok2vec.get_dim("nO") - tok2vec = chain(tok2vec, list2array()) - tok2vec.set_dim("nO", token_vector_width) - - lower = PrecomputableAffine( - hidden_width, nF=nr_feature_tokens, nI=tok2vec.get_dim("nO"), nP=maxout_pieces - ) - lower.set_dim("nP", maxout_pieces) - with use_ops("numpy"): - # Initialize weights at zero, as it's a classification layer. - upper = Linear(init_W=zero_init) - return ParserModel(tok2vec, lower, upper) - - @plac.annotations( # fmt: off train_path=("Location of JSON-formatted training data", "positional", None, Path), @@ -224,23 +164,25 @@ def train_from_config( config_path, data_paths, raw_text=None, meta_path=None, output_path=None, ): msg.info(f"Loading config from: {config_path}") - config = util.load_from_config(config_path, create_objects=True) + config = util.load_config(config_path, create_objects=True) use_gpu = config["training"]["use_gpu"] if use_gpu >= 0: msg.info("Using GPU") else: msg.info("Using CPU") msg.info("Creating nlp from config") - nlp = create_nlp_from_config(**config["nlp"]) + nlp_config = util.load_config(config_path, create_objects=False)["nlp"] + nlp = util.load_model_from_config(nlp_config) optimizer = config["optimizer"] - limit = config["training"]["limit"] + training = config["training"] + limit = training["limit"] msg.info("Loading training corpus") corpus = GoldCorpus(data_paths["train"], data_paths["dev"], limit=limit) msg.info("Initializing the nlp pipeline") nlp.begin_training(lambda: corpus.train_examples, device=use_gpu) - train_batches = create_train_batches(nlp, corpus, config["training"]) - evaluate = create_evaluation_callback(nlp, optimizer, corpus, config["training"]) + train_batches = create_train_batches(nlp, corpus, training) + evaluate = create_evaluation_callback(nlp, optimizer, corpus, training) # Create iterator, which yields out info after each optimization step. 
msg.info("Start training") @@ -249,16 +191,16 @@ def train_from_config( optimizer, train_batches, evaluate, - config["training"]["dropout"], - config["training"]["patience"], - config["training"]["eval_frequency"], + training["dropout"], + training["patience"], + training["eval_frequency"], ) msg.info(f"Training. Initial learn rate: {optimizer.learn_rate}") - print_row = setup_printer(config) + print_row = setup_printer(training, nlp) try: - progress = tqdm.tqdm(total=config["training"]["eval_frequency"], leave=False) + progress = tqdm.tqdm(total=training["eval_frequency"], leave=False) for batch, info, is_best_checkpoint in training_step_iterator: progress.update(1) if is_best_checkpoint is not None: @@ -266,9 +208,7 @@ def train_from_config( print_row(info) if is_best_checkpoint and output_path is not None: nlp.to_disk(output_path) - progress = tqdm.tqdm( - total=config["training"]["eval_frequency"], leave=False - ) + progress = tqdm.tqdm(total=training["eval_frequency"], leave=False) finally: if output_path is not None: with nlp.use_params(optimizer.averages): @@ -280,18 +220,6 @@ def train_from_config( # msg.good("Created best model", best_model_path) -def create_nlp_from_config(lang, vectors, pipeline): - lang_class = spacy.util.get_lang_class(lang) - nlp = lang_class() - if vectors is not None: - spacy.cli.train._load_vectors(nlp, vectors) - for name, component_cfg in pipeline.items(): - factory = component_cfg.pop("factory") - component = nlp.create_pipe(factory, config=component_cfg) - nlp.add_pipe(component, name=name) - return nlp - - def create_train_batches(nlp, corpus, cfg): while True: train_examples = corpus.train_dataset( @@ -405,10 +333,10 @@ def subdivide_batch(batch): return [batch] -def setup_printer(config): - score_cols = config["training"]["scores"] +def setup_printer(training, nlp): + score_cols = training["scores"] score_widths = [max(len(col), 6) for col in score_cols] - loss_cols = [f"Loss {pipe}" for pipe in config["nlp"]["pipeline"]] + 
loss_cols = [f"Loss {pipe}" for pipe in nlp.pipe_names] loss_widths = [max(len(col), 8) for col in loss_cols] table_header = ["#"] + loss_cols + score_cols + ["Score"] table_header = [col.upper() for col in table_header] @@ -420,20 +348,13 @@ def setup_printer(config): def print_row(info): losses = [ - "{0:.2f}".format(info["losses"].get(col, 0.0)) - for col in config["nlp"]["pipeline"] + "{0:.2f}".format(info["losses"].get(pipe_name, 0.0)) + for pipe_name in nlp.pipe_names ] scores = [ - "{0:.2f}".format(info["other_scores"].get(col, 0.0)) - for col in config["training"]["scores"] + "{0:.2f}".format(info["other_scores"].get(col, 0.0)) for col in score_cols ] data = [info["step"]] + losses + scores + ["{0:.2f}".format(info["score"])] msg.row(data, widths=table_widths, aligns=table_aligns) return print_row - - -@registry.architectures.register("tok2vec_tensors.v1") -def tok2vec_tensors_v1(width): - tok2vec = Tok2VecListener("tok2vec", width=width) - return tok2vec diff --git a/spacy/errors.py b/spacy/errors.py index 7a4953cce..6afbfc3c6 100644 --- a/spacy/errors.py +++ b/spacy/errors.py @@ -106,6 +106,12 @@ class Warnings(object): "Provide features as a dict {{\"Field1\": \"Value1,Value2\"}} or " "string \"Field1=Value1,Value2|Field2=Value3\".") + # TODO: fix numbering after merging develop into master + W098 = ("No Model config was provided to create the '{name}' component, " + "so a default configuration was used.") + W099 = ("Expected 'dict' type for the 'model' argument of pipe '{pipe}', " + "but got '{type}' instead, so ignoring it.") + @add_codes class Errors(object): @@ -227,7 +233,7 @@ class Errors(object): E050 = ("Can't find model '{name}'. 
It doesn't seem to be a Python " "package or a valid path to a data directory.") E052 = ("Can't find model directory: {path}") - E053 = ("Could not read meta.json from {path}") + E053 = ("Could not read {name} from {path}") E054 = ("No valid '{setting}' setting found in model meta.json.") E055 = ("Invalid ORTH value in exception:\nKey: {key}\nOrths: {orths}") E056 = ("Invalid tokenizer exception: ORTH values combined don't match " @@ -345,8 +351,8 @@ class Errors(object): E108 = ("As of spaCy v2.1, the pipe name `sbd` has been deprecated " "in favor of the pipe name `sentencizer`, which does the same " "thing. For example, use `nlp.create_pipeline('sentencizer')`") - E109 = ("Model for component '{name}' not initialized. Did you forget to " - "load a model, or forget to call begin_training()?") + E109 = ("Component '{name}' could not be run. Did you forget to " + "call begin_training()?") E110 = ("Invalid displaCy render wrapper. Expected callable, got: {obj}") E111 = ("Pickling a token is not supported, because tokens are only views " "of the parent Doc and can't exist on their own. A pickled token " @@ -532,6 +538,9 @@ class Errors(object): "make sure the gold EL data refers to valid results of the " "named entity recognizer in the `nlp` pipeline.") # TODO: fix numbering after merging develop into master + E993 = ("The config for 'nlp' should include either a key 'name' to " + "refer to an existing model by name or path, or a key 'lang' " + "to create a new blank model.") E996 = ("Could not parse {file}: {msg}") E997 = ("Tokenizer special cases are not allowed to modify the text. 
" "This would map '{chunk}' to '{orth}' given token attributes " diff --git a/spacy/language.py b/spacy/language.py index 1c6014cec..83f8c9d21 100644 --- a/spacy/language.py +++ b/spacy/language.py @@ -4,7 +4,9 @@ import weakref import functools from contextlib import contextmanager from copy import copy, deepcopy -from thinc.api import get_current_ops +from pathlib import Path + +from thinc.api import get_current_ops, Config import srsly import multiprocessing as mp from itertools import chain, cycle @@ -16,7 +18,7 @@ from .lookups import Lookups from .analysis import analyze_pipes, analyze_all_pipes, validate_attrs from .gold import Example from .scorer import Scorer -from .util import link_vectors_to_models, create_default_optimizer +from .util import link_vectors_to_models, create_default_optimizer, registry from .attrs import IS_STOP, LANG from .lang.punctuation import TOKENIZER_PREFIXES, TOKENIZER_SUFFIXES from .lang.punctuation import TOKENIZER_INFIXES @@ -24,7 +26,7 @@ from .lang.tokenizer_exceptions import TOKEN_MATCH from .lang.tag_map import TAG_MAP from .tokens import Doc from .lang.lex_attrs import LEX_ATTRS, is_stop -from .errors import Errors, Warnings, deprecation_warning +from .errors import Errors, Warnings, deprecation_warning, user_warning from . import util from . import about @@ -128,7 +130,7 @@ class Language(object): factories = {"tokenizer": lambda nlp: nlp.Defaults.create_tokenizer(nlp)} def __init__( - self, vocab=True, make_doc=True, max_length=10 ** 6, meta={}, **kwargs + self, vocab=True, make_doc=True, max_length=10 ** 6, meta={}, config=None, **kwargs ): """Initialise a Language object. @@ -138,6 +140,7 @@ class Language(object): object. Usually a `Tokenizer`. meta (dict): Custom meta data for the Language class. Is written to by models to add model meta data. + config (Config): Configuration data for creating the pipeline components. max_length (int) : Maximum number of characters in a single text. 
The current v2 models may run out memory on extremely long texts, due to large internal @@ -152,6 +155,9 @@ class Language(object): user_factories = util.registry.factories.get_all() self.factories.update(user_factories) self._meta = dict(meta) + self._config = config + if not self._config: + self._config = Config() self._path = None if vocab is True: factory = self.Defaults.create_vocab @@ -170,6 +176,21 @@ class Language(object): self.max_length = max_length self._optimizer = None + from .ml.models.defaults import default_tagger_config, default_parser_config, default_ner_config, \ + default_textcat_config, default_nel_config, default_morphologizer_config, default_sentrec_config, \ + default_tensorizer_config, default_tok2vec_config + + self.defaults = {"tagger": default_tagger_config(), + "parser": default_parser_config(), + "ner": default_ner_config(), + "textcat": default_textcat_config(), + "entity_linker": default_nel_config(), + "morphologizer": default_morphologizer_config(), + "sentrec": default_sentrec_config(), + "tensorizer": default_tensorizer_config(), + "tok2vec": default_tok2vec_config(), + } + @property def path(self): return self._path @@ -203,6 +224,10 @@ class Language(object): def meta(self, value): self._meta = value + @property + def config(self): + return self._config + # Conveniences to access pipeline components # Shouldn't be used anymore! 
@property @@ -293,7 +318,24 @@ class Language(object): else: raise KeyError(Errors.E002.format(name=name)) factory = self.factories[name] - return factory(self, **config) + default_config = self.defaults.get(name, None) + + # transform the model's config to an actual Model + model_cfg = None + if "model" in config: + model_cfg = config["model"] + if not isinstance(model_cfg, dict): + user_warning(Warnings.W099.format(type=type(model_cfg), pipe=name)) + model_cfg = None + del config["model"] + if model_cfg is None and default_config is not None: + user_warning(Warnings.W098) + model_cfg = default_config["model"] + model = None + if model_cfg is not None: + self.config[name] = {"model": model_cfg} + model = registry.make_from_config({"model": model_cfg}, validate=True)["model"] + return factory(self, model, **config) def add_pipe( self, component, name=None, before=None, after=None, first=None, last=None @@ -430,7 +472,10 @@ class Language(object): continue if not hasattr(proc, "__call__"): raise ValueError(Errors.E003.format(component=type(proc), name=name)) - doc = proc(doc, **component_cfg.get(name, {})) + try: + doc = proc(doc, **component_cfg.get(name, {})) + except KeyError: + raise ValueError(Errors.E109.format(name=name)) if doc is None: raise ValueError(Errors.E005.format(name=name)) return doc @@ -578,9 +623,6 @@ class Language(object): ops = get_current_ops() self.vocab.vectors.data = ops.asarray(self.vocab.vectors.data) link_vectors_to_models(self.vocab) - if self.vocab.vectors.data.shape[1]: - cfg["pretrained_vectors"] = self.vocab.vectors.name - cfg["pretrained_dims"] = self.vocab.vectors.data.shape[1] if sgd is None: sgd = create_default_optimizer() self._optimizer = sgd @@ -611,8 +653,6 @@ class Language(object): if self.vocab.vectors.data.shape[1] >= 1: self.vocab.vectors.data = ops.asarray(self.vocab.vectors.data) link_vectors_to_models(self.vocab) - if self.vocab.vectors.data.shape[1]: - cfg["pretrained_vectors"] = self.vocab.vectors if sgd is 
None: sgd = create_default_optimizer() self._optimizer = sgd @@ -868,6 +908,7 @@ class Language(object): serializers["meta.json"] = lambda p: p.open("w").write( srsly.json_dumps(self.meta) ) + serializers["config.cfg"] = lambda p: self.config.to_disk(p) for name, proc in self.pipeline: if not hasattr(proc, "name"): continue @@ -895,6 +936,8 @@ class Language(object): exclude = disable path = util.ensure_path(path) deserializers = {} + if Path(path / "config.cfg").exists(): + deserializers["config.cfg"] = lambda p: self.config.from_disk(p) deserializers["meta.json"] = lambda p: self.meta.update(srsly.read_json(p)) deserializers["vocab"] = lambda p: self.vocab.from_disk( p @@ -933,6 +976,7 @@ class Language(object): serializers["vocab"] = lambda: self.vocab.to_bytes() serializers["tokenizer"] = lambda: self.tokenizer.to_bytes(exclude=["vocab"]) serializers["meta.json"] = lambda: srsly.json_dumps(self.meta) + serializers["config.cfg"] = lambda: self.config.to_bytes() for name, proc in self.pipeline: if name in exclude: continue @@ -955,6 +999,7 @@ class Language(object): deprecation_warning(Warnings.W014) exclude = disable deserializers = {} + deserializers["config.cfg"] = lambda b: self.config.from_bytes(b) deserializers["meta.json"] = lambda b: self.meta.update(srsly.json_loads(b)) deserializers["vocab"] = lambda b: self.vocab.from_bytes( b @@ -981,8 +1026,8 @@ class component(object): and class components and will automatically register components in the Language.factories. If the component is a class and needs access to the nlp object or config parameters, it can expose a from_nlp classmethod - that takes the nlp object and **cfg arguments and returns the initialized - component. + that takes the nlp & model objects and **cfg arguments, and returns the + initialized component. 
""" # NB: This decorator needs to live here, because it needs to write to @@ -1011,9 +1056,9 @@ class component(object): obj.requires = self.requires obj.retokenizes = self.retokenizes - def factory(nlp, **cfg): + def factory(nlp, model, **cfg): if hasattr(obj, "from_nlp"): - return obj.from_nlp(nlp, **cfg) + return obj.from_nlp(nlp, model, **cfg) elif isinstance(obj, type): return obj() return obj diff --git a/spacy/ml/component_models.py b/spacy/ml/component_models.py deleted file mode 100644 index 8c694f950..000000000 --- a/spacy/ml/component_models.py +++ /dev/null @@ -1,227 +0,0 @@ -from spacy import util -from spacy.ml.extract_ngrams import extract_ngrams - -from ..attrs import ID, ORTH, NORM, PREFIX, SUFFIX, SHAPE -from ..errors import Errors -from ._character_embed import CharacterEmbed - -from thinc.api import Model, Maxout, Linear, residual, reduce_mean, list2ragged -from thinc.api import PyTorchLSTM, add, MultiSoftmax, HashEmbed, StaticVectors -from thinc.api import expand_window, FeatureExtractor, SparseLinear, chain -from thinc.api import clone, concatenate, with_array, Softmax, Logistic, uniqued -from thinc.api import zero_init - - -def build_text_classifier(arch, config): - if arch == "cnn": - return build_simple_cnn_text_classifier(**config) - elif arch == "bow": - return build_bow_text_classifier(**config) - else: - raise ValueError("Unexpected textcat arch") - - -def build_simple_cnn_text_classifier(tok2vec, nr_class, exclusive_classes, **cfg): - """ - Build a simple CNN text classifier, given a token-to-vector model as inputs. - If exclusive_classes=True, a softmax non-linearity is applied, so that the - outputs sum to 1. If exclusive_classes=False, a logistic non-linearity - is applied instead, so that outputs are in the range [0, 1]. 
- """ - with Model.define_operators({">>": chain}): - if exclusive_classes: - output_layer = Softmax(nO=nr_class, nI=tok2vec.get_dim("nO")) - else: - # TODO: experiment with init_w=zero_init - output_layer = Linear(nO=nr_class, nI=tok2vec.get_dim("nO")) >> Logistic() - model = tok2vec >> list2ragged() >> reduce_mean() >> output_layer - model.set_ref("tok2vec", tok2vec) - model.set_dim("nO", nr_class) - return model - - -def build_bow_text_classifier( - nr_class, exclusive_classes, ngram_size=1, no_output_layer=False, **cfg -): - with Model.define_operators({">>": chain}): - model = extract_ngrams(ngram_size, attr=ORTH) >> SparseLinear(nr_class) - model.to_cpu() - if not no_output_layer: - output_layer = ( - Softmax(nO=nr_class) if exclusive_classes else Logistic(nO=nr_class) - ) - output_layer.to_cpu() - model = model >> output_layer - model.set_dim("nO", nr_class) - return model - - -def build_nel_encoder(embed_width, hidden_width, ner_types, **cfg): - if "entity_width" not in cfg: - raise ValueError(Errors.E144.format(param="entity_width")) - - conv_depth = cfg.get("conv_depth", 2) - cnn_maxout_pieces = cfg.get("cnn_maxout_pieces", 3) - pretrained_vectors = cfg.get("pretrained_vectors", None) - context_width = cfg.get("entity_width") - - with Model.define_operators({">>": chain, "**": clone}): - nel_tok2vec = Tok2Vec( - width=hidden_width, - embed_size=embed_width, - pretrained_vectors=pretrained_vectors, - cnn_maxout_pieces=cnn_maxout_pieces, - subword_features=True, - conv_depth=conv_depth, - bilstm_depth=0, - ) - - model = ( - nel_tok2vec - >> list2ragged() - >> reduce_mean() - >> residual(Maxout(nO=hidden_width, nI=hidden_width, nP=2, dropout=0.0)) - >> Linear(nO=context_width, nI=hidden_width) - ) - model.initialize() - - model.set_ref("tok2vec", nel_tok2vec) - model.set_dim("nO", context_width) - return model - - -def masked_language_model(*args, **kwargs): - raise NotImplementedError - - -def build_tagger_model(nr_class, tok2vec): - token_vector_width = 
tok2vec.get_dim("nO") - # TODO: glorot_uniform_init seems to work a bit better than zero_init here?! - softmax = with_array(Softmax(nO=nr_class, nI=token_vector_width, init_W=zero_init)) - model = chain(tok2vec, softmax) - model.set_ref("tok2vec", tok2vec) - model.set_ref("softmax", softmax) - return model - - -def build_morphologizer_model(class_nums, **cfg): - embed_size = util.env_opt("embed_size", 7000) - if "token_vector_width" in cfg: - token_vector_width = cfg["token_vector_width"] - else: - token_vector_width = util.env_opt("token_vector_width", 128) - pretrained_vectors = cfg.get("pretrained_vectors") - char_embed = cfg.get("char_embed", True) - with Model.define_operators({">>": chain, "+": add, "**": clone}): - if "tok2vec" in cfg: - tok2vec = cfg["tok2vec"] - else: - tok2vec = Tok2Vec( - token_vector_width, - embed_size, - char_embed=char_embed, - pretrained_vectors=pretrained_vectors, - ) - softmax = with_array(MultiSoftmax(nOs=class_nums, nI=token_vector_width)) - model = tok2vec >> softmax - model.set_ref("tok2vec", tok2vec) - model.set_ref("softmax", softmax) - return model - - -def Tok2Vec( - width, - embed_size, - pretrained_vectors=None, - window_size=1, - cnn_maxout_pieces=3, - subword_features=True, - char_embed=False, - conv_depth=4, - bilstm_depth=0, -): - if char_embed: - subword_features = False - cols = [ID, NORM, PREFIX, SUFFIX, SHAPE, ORTH] - with Model.define_operators({">>": chain, "|": concatenate, "**": clone}): - norm = HashEmbed(nO=width, nV=embed_size, column=cols.index(NORM), dropout=0.0) - if subword_features: - prefix = HashEmbed( - nO=width, nV=embed_size // 2, column=cols.index(PREFIX), dropout=0.0 - ) - suffix = HashEmbed( - nO=width, nV=embed_size // 2, column=cols.index(SUFFIX), dropout=0.0 - ) - shape = HashEmbed( - nO=width, nV=embed_size // 2, column=cols.index(SHAPE), dropout=0.0 - ) - else: - prefix, suffix, shape = (None, None, None) - if pretrained_vectors is not None: - glove = StaticVectors( - 
vectors=pretrained_vectors, nO=width, column=cols.index(ID), dropout=0.0 - ) - - if subword_features: - embed = uniqued( - (glove | norm | prefix | suffix | shape) - >> Maxout( - nO=width, nI=width * 5, nP=3, dropout=0.0, normalize=True - ), - column=cols.index(ORTH), - ) - else: - embed = uniqued( - (glove | norm) - >> Maxout( - nO=width, nI=width * 2, nP=3, dropout=0.0, normalize=True - ), - column=cols.index(ORTH), - ) - elif subword_features: - embed = uniqued( - concatenate(norm, prefix, suffix, shape) - >> Maxout(nO=width, nI=width * 4, nP=3, dropout=0.0, normalize=True), - column=cols.index(ORTH), - ) - elif char_embed: - embed = CharacterEmbed(nM=64, nC=8) | FeatureExtractor(cols) >> with_array( - norm - ) - reduce_dimensions = Maxout( - nO=width, - nI=64 * 8 + width, - nP=cnn_maxout_pieces, - dropout=0.0, - normalize=True, - ) - else: - embed = norm - - convolution = residual( - expand_window(window_size=window_size) - >> Maxout( - nO=width, - nI=width * 3, - nP=cnn_maxout_pieces, - dropout=0.0, - normalize=True, - ) - ) - if char_embed: - tok2vec = embed >> with_array( - reduce_dimensions >> convolution ** conv_depth, pad=conv_depth - ) - else: - tok2vec = FeatureExtractor(cols) >> with_array( - embed >> convolution ** conv_depth, pad=conv_depth - ) - - if bilstm_depth >= 1: - tok2vec = tok2vec >> PyTorchLSTM( - nO=width, nI=width, depth=bilstm_depth, bi=True - ) - # Work around thinc API limitations :(. 
TODO: Revise in Thinc 7 - tok2vec.set_dim("nO", width) - tok2vec.set_ref("embed", embed) - return tok2vec diff --git a/spacy/ml/models/__init__.py b/spacy/ml/models/__init__.py new file mode 100644 index 000000000..56696d581 --- /dev/null +++ b/spacy/ml/models/__init__.py @@ -0,0 +1,6 @@ +from .entity_linker import * +from .parser import * +from .tagger import * +from .tensorizer import * +from .textcat import * +from .tok2vec import * diff --git a/spacy/ml/models/defaults/__init__.py b/spacy/ml/models/defaults/__init__.py new file mode 100644 index 000000000..9af4da87d --- /dev/null +++ b/spacy/ml/models/defaults/__init__.py @@ -0,0 +1,93 @@ +from pathlib import Path + +from .... import util + + +def default_nel_config(): + loc = Path(__file__).parent / "entity_linker_defaults.cfg" + return util.load_config(loc, create_objects=False) + + +def default_nel(): + loc = Path(__file__).parent / "entity_linker_defaults.cfg" + return util.load_config(loc, create_objects=True)["model"] + + +def default_morphologizer_config(): + loc = Path(__file__).parent / "morphologizer_defaults.cfg" + return util.load_config(loc, create_objects=False) + + +def default_morphologizer(): + loc = Path(__file__).parent / "morphologizer_defaults.cfg" + return util.load_config(loc, create_objects=True)["model"] + + +def default_parser_config(): + loc = Path(__file__).parent / "parser_defaults.cfg" + return util.load_config(loc, create_objects=False) + + +def default_parser(): + loc = Path(__file__).parent / "parser_defaults.cfg" + return util.load_config(loc, create_objects=True)["model"] + + +def default_ner_config(): + loc = Path(__file__).parent / "ner_defaults.cfg" + return util.load_config(loc, create_objects=False) + + +def default_ner(): + loc = Path(__file__).parent / "ner_defaults.cfg" + return util.load_config(loc, create_objects=True)["model"] + + +def default_sentrec_config(): + loc = Path(__file__).parent / "sentrec_defaults.cfg" + return util.load_config(loc, 
create_objects=False) + + +def default_sentrec(): + loc = Path(__file__).parent / "sentrec_defaults.cfg" + return util.load_config(loc, create_objects=True)["model"] + + +def default_tagger_config(): + loc = Path(__file__).parent / "tagger_defaults.cfg" + return util.load_config(loc, create_objects=False) + + +def default_tagger(): + loc = Path(__file__).parent / "tagger_defaults.cfg" + return util.load_config(loc, create_objects=True)["model"] + + +def default_tensorizer_config(): + loc = Path(__file__).parent / "tensorizer_defaults.cfg" + return util.load_config(loc, create_objects=False) + + +def default_tensorizer(): + loc = Path(__file__).parent / "tensorizer_defaults.cfg" + return util.load_config(loc, create_objects=True)["model"] + + +def default_textcat_config(): + loc = Path(__file__).parent / "textcat_defaults.cfg" + return util.load_config(loc, create_objects=False) + + +def default_textcat(): + loc = Path(__file__).parent / "textcat_defaults.cfg" + return util.load_config(loc, create_objects=True)["model"] + + +def default_tok2vec_config(): + loc = Path(__file__).parent / "tok2vec_defaults.cfg" + return util.load_config(loc, create_objects=False) + + +def default_tok2vec(): + loc = Path(__file__).parent / "tok2vec_defaults.cfg" + return util.load_config(loc, create_objects=True)["model"] diff --git a/spacy/ml/models/defaults/entity_linker_defaults.cfg b/spacy/ml/models/defaults/entity_linker_defaults.cfg new file mode 100644 index 000000000..6a591ec3e --- /dev/null +++ b/spacy/ml/models/defaults/entity_linker_defaults.cfg @@ -0,0 +1,12 @@ +[model] +@architectures = "spacy.EntityLinker.v1" + +[model.tok2vec] +@architectures = "spacy.HashEmbedCNN.v1" +pretrained_vectors = null +width = 96 +depth = 2 +embed_size = 300 +window_size = 1 +maxout_pieces = 3 +subword_features = true diff --git a/spacy/ml/models/defaults/morphologizer_defaults.cfg b/spacy/ml/models/defaults/morphologizer_defaults.cfg new file mode 100644 index 000000000..80e776c4f --- /dev/null 
+++ b/spacy/ml/models/defaults/morphologizer_defaults.cfg @@ -0,0 +1,14 @@ +[model] +@architectures = "spacy.Tagger.v1" + +[model.tok2vec] +@architectures = "spacy.HashCharEmbedCNN.v1" +pretrained_vectors = null +width = 128 +depth = 4 +embed_size = 7000 +window_size = 1 +maxout_pieces = 3 +subword_features = true +nM = 64 +nC = 8 diff --git a/spacy/ml/models/defaults/ner_defaults.cfg b/spacy/ml/models/defaults/ner_defaults.cfg new file mode 100644 index 000000000..db2c131f5 --- /dev/null +++ b/spacy/ml/models/defaults/ner_defaults.cfg @@ -0,0 +1,15 @@ +[model] +@architectures = "spacy.TransitionBasedParser.v1" +nr_feature_tokens = 6 +hidden_width = 64 +maxout_pieces = 2 + +[model.tok2vec] +@architectures = "spacy.HashEmbedCNN.v1" +pretrained_vectors = null +width = 96 +depth = 4 +embed_size = 2000 +window_size = 1 +maxout_pieces = 3 +subword_features = true diff --git a/spacy/ml/models/defaults/parser_defaults.cfg b/spacy/ml/models/defaults/parser_defaults.cfg new file mode 100644 index 000000000..9cbb6eadb --- /dev/null +++ b/spacy/ml/models/defaults/parser_defaults.cfg @@ -0,0 +1,15 @@ +[model] +@architectures = "spacy.TransitionBasedParser.v1" +nr_feature_tokens = 8 +hidden_width = 64 +maxout_pieces = 2 + +[model.tok2vec] +@architectures = "spacy.HashEmbedCNN.v1" +pretrained_vectors = null +width = 96 +depth = 4 +embed_size = 2000 +window_size = 1 +maxout_pieces = 3 +subword_features = true diff --git a/spacy/ml/models/defaults/sentrec_defaults.cfg b/spacy/ml/models/defaults/sentrec_defaults.cfg new file mode 100644 index 000000000..a039a4533 --- /dev/null +++ b/spacy/ml/models/defaults/sentrec_defaults.cfg @@ -0,0 +1,14 @@ +[model] +@architectures = "spacy.Tagger.v1" + +[model.tok2vec] +@architectures = "spacy.HashCharEmbedCNN.v1" +pretrained_vectors = null +width = 12 +depth = 1 +embed_size = 2000 +window_size = 1 +maxout_pieces = 2 +subword_features = true +nM = 64 +nC = 8 diff --git a/spacy/ml/models/defaults/tagger_defaults.cfg 
b/spacy/ml/models/defaults/tagger_defaults.cfg new file mode 100644 index 000000000..5aea80a32 --- /dev/null +++ b/spacy/ml/models/defaults/tagger_defaults.cfg @@ -0,0 +1,12 @@ +[model] +@architectures = "spacy.Tagger.v1" + +[model.tok2vec] +@architectures = "spacy.HashEmbedCNN.v1" +pretrained_vectors = null +width = 96 +depth = 4 +embed_size = 2000 +window_size = 1 +maxout_pieces = 3 +subword_features = true diff --git a/spacy/ml/models/defaults/tensorizer_defaults.cfg b/spacy/ml/models/defaults/tensorizer_defaults.cfg new file mode 100644 index 000000000..81880a109 --- /dev/null +++ b/spacy/ml/models/defaults/tensorizer_defaults.cfg @@ -0,0 +1,4 @@ +[model] +@architectures = "spacy.Tensorizer.v1" +input_size=96 +output_size=300 diff --git a/spacy/ml/models/defaults/textcat_defaults.cfg b/spacy/ml/models/defaults/textcat_defaults.cfg new file mode 100644 index 000000000..cea1bfe54 --- /dev/null +++ b/spacy/ml/models/defaults/textcat_defaults.cfg @@ -0,0 +1,13 @@ +[model] +@architectures = "spacy.TextCatCNN.v1" +exclusive_classes = false + +[model.tok2vec] +@architectures = "spacy.HashEmbedCNN.v1" +pretrained_vectors = null +width = 96 +depth = 4 +embed_size = 2000 +window_size = 1 +maxout_pieces = 3 +subword_features = true diff --git a/spacy/ml/models/defaults/tok2vec_defaults.cfg b/spacy/ml/models/defaults/tok2vec_defaults.cfg new file mode 100644 index 000000000..9475d4aab --- /dev/null +++ b/spacy/ml/models/defaults/tok2vec_defaults.cfg @@ -0,0 +1,9 @@ +[model] +@architectures = "spacy.HashEmbedCNN.v1" +pretrained_vectors = null +width = 96 +depth = 4 +embed_size = 2000 +window_size = 1 +maxout_pieces = 3 +subword_features = true diff --git a/spacy/ml/models/entity_linker.py b/spacy/ml/models/entity_linker.py new file mode 100644 index 000000000..0c1762026 --- /dev/null +++ b/spacy/ml/models/entity_linker.py @@ -0,0 +1,23 @@ +from pathlib import Path + +from thinc.api import chain, clone, list2ragged, reduce_mean, residual +from thinc.api import Model, Maxout, 
Linear + +from spacy.util import registry + + +@registry.architectures.register("spacy.EntityLinker.v1") +def build_nel_encoder(tok2vec, nO=None): + with Model.define_operators({">>": chain, "**": clone}): + token_width = tok2vec.get_dim("nO") + output_layer = Linear(nO=nO, nI=token_width) + model = ( + tok2vec + >> list2ragged() + >> reduce_mean() + >> residual(Maxout(nO=token_width, nI=token_width, nP=2, dropout=0.0)) + >> output_layer + ) + model.set_ref("output_layer", output_layer) + model.set_ref("tok2vec", tok2vec) + return model diff --git a/spacy/ml/models/multi_task.py b/spacy/ml/models/multi_task.py new file mode 100644 index 000000000..1c193df82 --- /dev/null +++ b/spacy/ml/models/multi_task.py @@ -0,0 +1,29 @@ +from thinc.api import chain, Maxout, LayerNorm, Softmax, Linear, zero_init + + +def build_multi_task_model(n_tags, tok2vec=None, token_vector_width=96): + model = chain( + tok2vec, + Maxout(nO=token_vector_width * 2, nI=token_vector_width, nP=3, dropout=0.0), + LayerNorm(token_vector_width * 2), + Softmax(nO=n_tags, nI=token_vector_width * 2), + ) + return model + + +def build_cloze_multi_task_model(vocab, tok2vec): + output_size = vocab.vectors.data.shape[1] + output_layer = chain( + Maxout( + nO=output_size, nI=tok2vec.get_dim("nO"), nP=3, normalize=True, dropout=0.0 + ), + Linear(nO=output_size, nI=output_size, init_W=zero_init), + ) + model = chain(tok2vec, output_layer) + model = build_masked_language_model(vocab, model) + return model + + +def build_masked_language_model(*args, **kwargs): + # TODO cf https://github.com/explosion/spaCy/blob/2c107f02a4d60bda2440db0aad1a88cbbf4fb52d/spacy/_ml.py#L828 + raise NotImplementedError diff --git a/spacy/ml/models/parser.py b/spacy/ml/models/parser.py new file mode 100644 index 000000000..89f303e2a --- /dev/null +++ b/spacy/ml/models/parser.py @@ -0,0 +1,33 @@ +from pydantic import StrictInt + +from spacy.util import registry +from spacy.ml._layers import PrecomputableAffine +from 
spacy.syntax._parser_model import ParserModel + +from thinc.api import Model, chain, list2array, Linear, zero_init, use_ops + + +@registry.architectures.register("spacy.TransitionBasedParser.v1") +def build_tb_parser_model( + tok2vec: Model, + nr_feature_tokens: StrictInt, + hidden_width: StrictInt, + maxout_pieces: StrictInt, + nO=None, +): + token_vector_width = tok2vec.get_dim("nO") + tok2vec = chain(tok2vec, list2array()) + tok2vec.set_dim("nO", token_vector_width) + + lower = PrecomputableAffine( + nO=hidden_width, + nF=nr_feature_tokens, + nI=tok2vec.get_dim("nO"), + nP=maxout_pieces, + ) + lower.set_dim("nP", maxout_pieces) + with use_ops("numpy"): + # Initialize weights at zero, as it's a classification layer. + upper = Linear(nO=nO, init_W=zero_init) + model = ParserModel(tok2vec, lower, upper) + return model diff --git a/spacy/ml/models/tagger.py b/spacy/ml/models/tagger.py new file mode 100644 index 000000000..92e8be1b2 --- /dev/null +++ b/spacy/ml/models/tagger.py @@ -0,0 +1,16 @@ +from thinc.api import zero_init, with_array, Softmax, chain, Model + +from spacy.util import registry + + +@registry.architectures.register("spacy.Tagger.v1") +def build_tagger_model(tok2vec, nO=None) -> Model: + token_vector_width = tok2vec.get_dim("nO") + # TODO: glorot_uniform_init seems to work a bit better than zero_init here?! + output_layer = Softmax(nO, nI=token_vector_width, init_W=zero_init) + softmax = with_array(output_layer) + model = chain(tok2vec, softmax) + model.set_ref("tok2vec", tok2vec) + model.set_ref("softmax", softmax) + model.set_ref("output_layer", output_layer) + return model diff --git a/spacy/ml/models/tensorizer.py b/spacy/ml/models/tensorizer.py new file mode 100644 index 000000000..f66610b64 --- /dev/null +++ b/spacy/ml/models/tensorizer.py @@ -0,0 +1,10 @@ +from thinc.api import Linear, zero_init + +from ... 
import util +from ...util import registry + + +@registry.architectures.register("spacy.Tensorizer.v1") +def build_tensorizer(input_size, output_size): + input_size = util.env_opt("token_vector_width", input_size) + return Linear(output_size, input_size, init_W=zero_init) diff --git a/spacy/ml/models/textcat.py b/spacy/ml/models/textcat.py new file mode 100644 index 000000000..d9ac34b99 --- /dev/null +++ b/spacy/ml/models/textcat.py @@ -0,0 +1,42 @@ +from spacy.attrs import ORTH +from spacy.util import registry +from spacy.ml.extract_ngrams import extract_ngrams + +from thinc.api import Model, chain, reduce_mean, Linear, list2ragged, Logistic, SparseLinear, Softmax + + +@registry.architectures.register("spacy.TextCatCNN.v1") +def build_simple_cnn_text_classifier(tok2vec, exclusive_classes, nO=None): + """ + Build a simple CNN text classifier, given a token-to-vector model as inputs. + If exclusive_classes=True, a softmax non-linearity is applied, so that the + outputs sum to 1. If exclusive_classes=False, a logistic non-linearity + is applied instead, so that outputs are in the range [0, 1]. 
+ """ + with Model.define_operators({">>": chain}): + if exclusive_classes: + output_layer = Softmax(nO=nO, nI=tok2vec.get_dim("nO")) + model = tok2vec >> list2ragged() >> reduce_mean() >> output_layer + model.set_ref("output_layer", output_layer) + else: + # TODO: experiment with init_w=zero_init + linear_layer = Linear(nO=nO, nI=tok2vec.get_dim("nO")) + model = tok2vec >> list2ragged() >> reduce_mean() >> linear_layer >> Logistic() + model.set_ref("output_layer", linear_layer) + model.set_ref("tok2vec", tok2vec) + model.set_dim("nO", nO) + return model + + +@registry.architectures.register("spacy.TextCatBOW.v1") +def build_bow_text_classifier(exclusive_classes, ngram_size, no_output_layer, nO=None): + # Note: original defaults were ngram_size=1 and no_output_layer=False + with Model.define_operators({">>": chain}): + model = extract_ngrams(ngram_size, attr=ORTH) >> SparseLinear(nO) + model.to_cpu() + if not no_output_layer: + output_layer = Softmax(nO) if exclusive_classes else Logistic(nO) + output_layer.to_cpu() + model = model >> output_layer + model.set_ref("output_layer", output_layer) + return model diff --git a/spacy/ml/models/tok2vec.py b/spacy/ml/models/tok2vec.py new file mode 100644 index 000000000..2e0e4c2d4 --- /dev/null +++ b/spacy/ml/models/tok2vec.py @@ -0,0 +1,390 @@ +from thinc.api import chain, clone, concatenate, with_array, uniqued +from thinc.api import Model, noop, with_padded, Maxout, expand_window +from thinc.api import HashEmbed, StaticVectors, PyTorchLSTM +from thinc.api import residual, LayerNorm, FeatureExtractor, Mish + +from ... 
import util +from ...util import registry, make_layer +from ...ml import _character_embed +from ...pipeline.tok2vec import Tok2VecListener +from ...attrs import ID, ORTH, NORM, PREFIX, SUFFIX, SHAPE + + +@registry.architectures.register("spacy.Tok2VecTensors.v1") +def tok2vec_tensors_v1(width): + tok2vec = Tok2VecListener("tok2vec", width=width) + return tok2vec + + +@registry.architectures.register("spacy.VocabVectors.v1") +def get_vocab_vectors(name): + nlp = util.load_model(name) + return nlp.vocab.vectors + + +@registry.architectures.register("spacy.Tok2Vec.v1") +def Tok2Vec(config): + doc2feats = make_layer(config["@doc2feats"]) + embed = make_layer(config["@embed"]) + encode = make_layer(config["@encode"]) + field_size = 0 + if encode.has_attr("receptive_field"): + field_size = encode.attrs["receptive_field"] + tok2vec = chain(doc2feats, with_array(chain(embed, encode), pad=field_size)) + tok2vec.attrs["cfg"] = config + tok2vec.set_dim("nO", encode.get_dim("nO")) + tok2vec.set_ref("embed", embed) + tok2vec.set_ref("encode", encode) + return tok2vec + + +@registry.architectures.register("spacy.Doc2Feats.v1") +def Doc2Feats(config): + columns = config["columns"] + return FeatureExtractor(columns) + + +@registry.architectures.register("spacy.HashEmbedCNN.v1") +def hash_embed_cnn( + pretrained_vectors, + width, + depth, + embed_size, + maxout_pieces, + window_size, + subword_features, +): + # Does not use character embeddings: set to False by default + return build_Tok2Vec_model( + width=width, + embed_size=embed_size, + pretrained_vectors=pretrained_vectors, + conv_depth=depth, + bilstm_depth=0, + maxout_pieces=maxout_pieces, + window_size=window_size, + subword_features=subword_features, + char_embed=False, + nM=0, + nC=0, + ) + + +@registry.architectures.register("spacy.HashCharEmbedCNN.v1") +def hash_charembed_cnn( + pretrained_vectors, + width, + depth, + embed_size, + maxout_pieces, + window_size, + subword_features, + nM=0, + nC=0, +): + # Allows using 
character embeddings by setting nC, nM and char_embed=True + return build_Tok2Vec_model( + width=width, + embed_size=embed_size, + pretrained_vectors=pretrained_vectors, + conv_depth=depth, + bilstm_depth=0, + maxout_pieces=maxout_pieces, + window_size=window_size, + subword_features=subword_features, + char_embed=True, + nM=nM, + nC=nC, + ) + + +@registry.architectures.register("spacy.HashEmbedBiLSTM.v1") +def hash_embed_bilstm_v1( + pretrained_vectors, width, depth, embed_size, subword_features +): + # Does not use character embeddings: set to False by default + return build_Tok2Vec_model( + width=width, + embed_size=embed_size, + pretrained_vectors=pretrained_vectors, + bilstm_depth=depth, + conv_depth=0, + maxout_pieces=0, + window_size=1, + subword_features=subword_features, + char_embed=False, + nM=0, + nC=0, + ) + + +@registry.architectures.register("spacy.HashCharEmbedBiLSTM.v1") +def hash_embed_bilstm_v1( + pretrained_vectors, width, depth, embed_size, subword_features, nM=0, nC=0 +): + # Allows using character embeddings by setting nC, nM and char_embed=True + return build_Tok2Vec_model( + width=width, + embed_size=embed_size, + pretrained_vectors=pretrained_vectors, + bilstm_depth=depth, + conv_depth=0, + maxout_pieces=0, + window_size=1, + subword_features=subword_features, + char_embed=True, + nM=nM, + nC=nC, + ) + + +@registry.architectures.register("spacy.MultiHashEmbed.v1") +def MultiHashEmbed(config): + # For backwards compatibility with models before the architecture registry, + # we have to be careful to get exactly the same model structure. One subtle + # trick is that when we define concatenation with the operator, the operator + # is actually binary associative. So when we write (a | b | c), we're actually + # getting concatenate(concatenate(a, b), c). That's why the implementation + # is a bit ugly here. 
+ cols = config["columns"] + width = config["width"] + rows = config["rows"] + + norm = HashEmbed(width, rows, column=cols.index("NORM")) + if config["use_subwords"]: + prefix = HashEmbed(width, rows // 2, column=cols.index("PREFIX")) + suffix = HashEmbed(width, rows // 2, column=cols.index("SUFFIX")) + shape = HashEmbed(width, rows // 2, column=cols.index("SHAPE")) + if config.get("@pretrained_vectors"): + glove = make_layer(config["@pretrained_vectors"]) + mix = make_layer(config["@mix"]) + + with Model.define_operators({">>": chain, "|": concatenate}): + if config["use_subwords"] and config["@pretrained_vectors"]: + mix._layers[0].set_dim("nI", width * 5) + layer = uniqued( + (glove | norm | prefix | suffix | shape) >> mix, + column=cols.index("ORTH"), + ) + elif config["use_subwords"]: + mix._layers[0].set_dim("nI", width * 4) + layer = uniqued( + (norm | prefix | suffix | shape) >> mix, column=cols.index("ORTH") + ) + elif config["@pretrained_vectors"]: + mix._layers[0].set_dim("nI", width * 2) + layer = uniqued((glove | norm) >> mix, column=cols.index("ORTH")) + else: + layer = norm + layer.attrs["cfg"] = config + return layer + + +@registry.architectures.register("spacy.CharacterEmbed.v1") +def CharacterEmbed(config): + width = config["width"] + chars = config["chars"] + + chr_embed = _character_embed.CharacterEmbed(nM=width, nC=chars) + other_tables = make_layer(config["@embed_features"]) + mix = make_layer(config["@mix"]) + + model = chain(concatenate(chr_embed, other_tables), mix) + model.attrs["cfg"] = config + return model + + +@registry.architectures.register("spacy.MaxoutWindowEncoder.v1") +def MaxoutWindowEncoder(config): + nO = config["width"] + nW = config["window_size"] + nP = config["pieces"] + depth = config["depth"] + + cnn = ( + expand_window(window_size=nW), + Maxout(nO=nO, nI=nO * ((nW * 2) + 1), nP=nP, dropout=0.0, normalize=True), + ) + model = clone(residual(cnn), depth) + model.set_dim("nO", nO) + model.attrs["receptive_field"] = nW * 
depth + return model + + +@registry.architectures.register("spacy.MishWindowEncoder.v1") +def MishWindowEncoder(config): + nO = config["width"] + nW = config["window_size"] + depth = config["depth"] + + cnn = chain( + expand_window(window_size=nW), + Mish(nO=nO, nI=nO * ((nW * 2) + 1)), + LayerNorm(nO), + ) + model = clone(residual(cnn), depth) + model.set_dim("nO", nO) + return model + + +@registry.architectures.register("spacy.TorchBiLSTMEncoder.v1") +def TorchBiLSTMEncoder(config): + import torch.nn + + # TODO FIX + from thinc.api import PyTorchRNNWrapper + + width = config["width"] + depth = config["depth"] + if depth == 0: + return noop() + return with_padded( + PyTorchRNNWrapper(torch.nn.LSTM(width, width // 2, depth, bidirectional=True)) + ) + + +# TODO: update +_EXAMPLE_CONFIG = { + "@doc2feats": { + "arch": "Doc2Feats", + "config": {"columns": ["ID", "NORM", "PREFIX", "SUFFIX", "SHAPE", "ORTH"]}, + }, + "@embed": { + "arch": "spacy.MultiHashEmbed.v1", + "config": { + "width": 96, + "rows": 2000, + "columns": ["ID", "NORM", "PREFIX", "SUFFIX", "SHAPE", "ORTH"], + "use_subwords": True, + "@pretrained_vectors": { + "arch": "TransformedStaticVectors", + "config": { + "vectors_name": "en_vectors_web_lg.vectors", + "width": 96, + "column": 0, + }, + }, + "@mix": { + "arch": "LayerNormalizedMaxout", + "config": {"width": 96, "pieces": 3}, + }, + }, + }, + "@encode": { + "arch": "MaxoutWindowEncode", + "config": {"width": 96, "window_size": 1, "depth": 4, "pieces": 3}, + }, +} + + +def build_Tok2Vec_model( + width, + embed_size, + pretrained_vectors, + window_size, + maxout_pieces, + subword_features, + char_embed, + nM, + nC, + conv_depth, + bilstm_depth, +) -> Model: + if char_embed: + subword_features = False + cols = [ID, NORM, PREFIX, SUFFIX, SHAPE, ORTH] + with Model.define_operators({">>": chain, "|": concatenate, "**": clone}): + norm = HashEmbed(nO=width, nV=embed_size, column=cols.index(NORM)) + if subword_features: + prefix = HashEmbed(nO=width, 
nV=embed_size // 2, column=cols.index(PREFIX)) + suffix = HashEmbed(nO=width, nV=embed_size // 2, column=cols.index(SUFFIX)) + shape = HashEmbed(nO=width, nV=embed_size // 2, column=cols.index(SHAPE)) + else: + prefix, suffix, shape = (None, None, None) + if pretrained_vectors is not None: + glove = StaticVectors( + vectors=pretrained_vectors.data, + nO=width, + column=cols.index(ID), + dropout=0.0, + ) + + if subword_features: + columns = 5 + embed = uniqued( + (glove | norm | prefix | suffix | shape) + >> Maxout( + nO=width, + nI=width * columns, + nP=maxout_pieces, + dropout=0.0, + normalize=True, + ), + column=cols.index(ORTH), + ) + else: + columns = 2 + embed = uniqued( + (glove | norm) + >> Maxout( + nO=width, + nI=width * columns, + nP=maxout_pieces, + dropout=0.0, + normalize=True, + ), + column=cols.index(ORTH), + ) + elif subword_features: + columns = 4 + embed = uniqued( + concatenate(norm, prefix, suffix, shape) + >> Maxout( + nO=width, + nI=width * columns, + nP=maxout_pieces, + dropout=0.0, + normalize=True, + ), + column=cols.index(ORTH), + ) + elif char_embed: + embed = _character_embed.CharacterEmbed(nM=nM, nC=nC) | FeatureExtractor( + cols + ) >> with_array(norm) + reduce_dimensions = Maxout( + nO=width, + nI=nM * nC + width, + nP=maxout_pieces, + dropout=0.0, + normalize=True, + ) + else: + embed = norm + + convolution = residual( + expand_window(window_size=window_size) + >> Maxout( + nO=width, + nI=width * ((window_size * 2) + 1), + nP=maxout_pieces, + dropout=0.0, + normalize=True, + ) + ) + if char_embed: + tok2vec = embed >> with_array( + reduce_dimensions >> convolution ** conv_depth, pad=conv_depth + ) + else: + tok2vec = FeatureExtractor(cols) >> with_array( + embed >> convolution ** conv_depth, pad=conv_depth + ) + + if bilstm_depth >= 1: + tok2vec = tok2vec >> PyTorchLSTM( + nO=width, nI=width, depth=bilstm_depth, bi=True + ) + tok2vec.set_dim("nO", width) + tok2vec.set_ref("embed", embed) + return tok2vec diff --git 
a/spacy/ml/tok2vec.py b/spacy/ml/tok2vec.py index 5e51bc47a..e69de29bb 100644 --- a/spacy/ml/tok2vec.py +++ b/spacy/ml/tok2vec.py @@ -1,178 +0,0 @@ -from thinc.api import Model, chain, clone, concatenate, with_array, uniqued, noop -from thinc.api import with_padded, Maxout, expand_window, HashEmbed, StaticVectors -from thinc.api import residual, LayerNorm, FeatureExtractor - -from ..ml import _character_embed -from ..util import make_layer, registry - - -@registry.architectures.register("spacy.Tok2Vec.v1") -def Tok2Vec(config): - doc2feats = make_layer(config["@doc2feats"]) - embed = make_layer(config["@embed"]) - encode = make_layer(config["@encode"]) - field_size = 0 - if encode.has_attr("receptive_field"): - field_size = encode.attrs["receptive_field"] - tok2vec = chain(doc2feats, with_array(chain(embed, encode), pad=field_size)) - tok2vec.attrs["cfg"] = config - tok2vec.set_dim("nO", encode.get_dim("nO")) - tok2vec.set_ref("embed", embed) - tok2vec.set_ref("encode", encode) - return tok2vec - - -@registry.architectures.register("spacy.Doc2Feats.v1") -def Doc2Feats(config): - columns = config["columns"] - return FeatureExtractor(columns) - - -@registry.architectures.register("spacy.MultiHashEmbed.v1") -def MultiHashEmbed(config): - # For backwards compatibility with models before the architecture registry, - # we have to be careful to get exactly the same model structure. One subtle - # trick is that when we define concatenation with the operator, the operator - # is actually binary associative. So when we write (a | b | c), we're actually - # getting concatenate(concatenate(a, b), c). That's why the implementation - # is a bit ugly here. 
- cols = config["columns"] - width = config["width"] - rows = config["rows"] - - norm = HashEmbed(width, rows, column=cols.index("NORM"), dropout=0.0) - if config["use_subwords"]: - prefix = HashEmbed(width, rows // 2, column=cols.index("PREFIX"), dropout=0.0) - suffix = HashEmbed(width, rows // 2, column=cols.index("SUFFIX"), dropout=0.0) - shape = HashEmbed(width, rows // 2, column=cols.index("SHAPE"), dropout=0.0) - if config.get("@pretrained_vectors"): - glove = make_layer(config["@pretrained_vectors"]) - mix = make_layer(config["@mix"]) - - with Model.define_operators({">>": chain, "|": concatenate}): - if config["use_subwords"] and config["@pretrained_vectors"]: - mix._layers[0].set_dim("nI", width * 5) - layer = uniqued( - (glove | norm | prefix | suffix | shape) >> mix, - column=cols.index("ORTH"), - ) - elif config["use_subwords"]: - mix._layers[0].set_dim("nI", width * 4) - layer = uniqued( - (norm | prefix | suffix | shape) >> mix, column=cols.index("ORTH") - ) - elif config["@pretrained_vectors"]: - mix._layers[0].set_dim("nI", width * 2) - layer = uniqued((glove | norm) >> mix, column=cols.index("ORTH"),) - else: - layer = norm - layer.attrs["cfg"] = config - return layer - - -@registry.architectures.register("spacy.CharacterEmbed.v1") -def CharacterEmbed(config): - width = config["width"] - chars = config["chars"] - - chr_embed = _character_embed.CharacterEmbed(nM=width, nC=chars) - other_tables = make_layer(config["@embed_features"]) - mix = make_layer(config["@mix"]) - - model = chain(concatenate(chr_embed, other_tables), mix) - model.attrs["cfg"] = config - return model - - -@registry.architectures.register("spacy.MaxoutWindowEncoder.v1") -def MaxoutWindowEncoder(config): - nO = config["width"] - nW = config["window_size"] - nP = config["pieces"] - depth = config["depth"] - cnn = ( - expand_window(window_size=nW), - Maxout(nO=nO, nI=nO * ((nW * 2) + 1), nP=nP, dropout=0.0, normalize=True), - ) - model = clone(residual(cnn), depth) - 
model.set_dim("nO", nO) - model.attrs["receptive_field"] = nW * depth - return model - - -@registry.architectures.register("spacy.MishWindowEncoder.v1") -def MishWindowEncoder(config): - from thinc.api import Mish - - nO = config["width"] - nW = config["window_size"] - depth = config["depth"] - cnn = chain( - expand_window(window_size=nW), - Mish(nO=nO, nI=nO * ((nW * 2) + 1)), - LayerNorm(nO), - ) - model = clone(residual(cnn), depth) - model.set_dim("nO", nO) - return model - - -@registry.architectures.register("spacy.PretrainedVectors.v1") -def PretrainedVectors(config): - # TODO: actual vectors instead of name - return StaticVectors( - vectors=config["vectors_name"], - nO=config["width"], - column=config["column"], - dropout=0.0, - ) - - -@registry.architectures.register("spacy.TorchBiLSTMEncoder.v1") -def TorchBiLSTMEncoder(config): - import torch.nn - - # TODO: FIX - from thinc.api import PyTorchRNNWrapper - - width = config["width"] - depth = config["depth"] - if depth == 0: - return noop() - return with_padded( - PyTorchRNNWrapper(torch.nn.LSTM(width, width // 2, depth, bidirectional=True)) - ) - - -# TODO: update -_EXAMPLE_CONFIG = { - "@doc2feats": { - "arch": "Doc2Feats", - "config": {"columns": ["ID", "NORM", "PREFIX", "SUFFIX", "SHAPE", "ORTH"]}, - }, - "@embed": { - "arch": "spacy.MultiHashEmbed.v1", - "config": { - "width": 96, - "rows": 2000, - "columns": ["ID", "NORM", "PREFIX", "SUFFIX", "SHAPE", "ORTH"], - "use_subwords": True, - "@pretrained_vectors": { - "arch": "TransformedStaticVectors", - "config": { - "vectors_name": "en_vectors_web_lg.vectors", - "width": 96, - "column": 0, - }, - }, - "@mix": { - "arch": "LayerNormalizedMaxout", - "config": {"width": 96, "pieces": 3}, - }, - }, - }, - "@encode": { - "arch": "MaxoutWindowEncode", - "config": {"width": 96, "window_size": 1, "depth": 4, "pieces": 3}, - }, -} diff --git a/spacy/pipeline/entityruler.py b/spacy/pipeline/entityruler.py index e211acb44..06c568ac9 100644 --- 
a/spacy/pipeline/entityruler.py +++ b/spacy/pipeline/entityruler.py @@ -66,7 +66,7 @@ class EntityRuler(object): self.add_patterns(patterns) @classmethod - def from_nlp(cls, nlp, **cfg): + def from_nlp(cls, nlp, model=None, **cfg): return cls(nlp, **cfg) def __len__(self): diff --git a/spacy/pipeline/hooks.py b/spacy/pipeline/hooks.py index d48b04bd1..351323ae9 100644 --- a/spacy/pipeline/hooks.py +++ b/spacy/pipeline/hooks.py @@ -76,11 +76,9 @@ class SimilarityHook(Pipe): yield self(doc) def predict(self, doc1, doc2): - self.require_model() return self.model.predict([(doc1, doc2)]) def update(self, doc1_doc2, golds, sgd=None, drop=0.0): - self.require_model() sims, bp_sims = self.model.begin_update(doc1_doc2) def begin_training(self, _=tuple(), pipeline=None, sgd=None, **kwargs): diff --git a/spacy/pipeline/morphologizer.pyx b/spacy/pipeline/morphologizer.pyx index 999132b35..b6a6045d1 100644 --- a/spacy/pipeline/morphologizer.pyx +++ b/spacy/pipeline/morphologizer.pyx @@ -15,25 +15,15 @@ from ..tokens.doc cimport Doc from ..vocab cimport Vocab from ..morphology cimport Morphology -from ..ml.component_models import build_morphologizer_model - @component("morphologizer", assigns=["token.morph", "token.pos"]) class Morphologizer(Pipe): - @classmethod - def Model(cls, **cfg): - if cfg.get('pretrained_dims') and not cfg.get('pretrained_vectors'): - raise ValueError(TempErrors.T008) - class_map = Morphology.create_class_map() - return build_morphologizer_model(class_map.field_sizes, **cfg) - - def __init__(self, vocab, model=True, **cfg): + def __init__(self, vocab, model, **cfg): self.vocab = vocab self.model = model self.cfg = dict(sorted(cfg.items())) - self.cfg.setdefault('cnn_maxout_pieces', 2) - self._class_map = self.vocab.morphology.create_class_map() + self._class_map = self.vocab.morphology.create_class_map() # Morphology.create_class_map() ? 
@property def labels(self): @@ -58,6 +48,14 @@ class Morphologizer(Pipe): self.set_annotations(docs, features, tensors=tokvecs) yield from docs + def begin_training(self, get_examples=lambda: [], pipeline=None, sgd=None, + **kwargs): + self.set_output(len(self.labels)) + self.model.initialize() + if sgd is None: + sgd = self.create_optimizer() + return sgd + def predict(self, docs): if not any(len(doc) for doc in docs): # Handle case where there are no tokens in any docs. @@ -65,8 +63,8 @@ class Morphologizer(Pipe): guesses = [self.model.ops.alloc((0, n_labels)) for doc in docs] tokvecs = self.model.ops.alloc((0, self.model.get_ref("tok2vec").get_dim("nO"))) return guesses, tokvecs - tokvecs = self.model.tok2vec(docs) - scores = self.model.softmax(tokvecs) + tokvecs = self.model.get_ref("tok2vec")(docs) + scores = self.model.get_ref("softmax")(tokvecs) return scores, tokvecs def set_annotations(self, docs, batch_scores, tensors=None): diff --git a/spacy/pipeline/pipes.pyx b/spacy/pipeline/pipes.pyx index ad75d2e78..b9bf1ccd6 100644 --- a/spacy/pipeline/pipes.pyx +++ b/spacy/pipeline/pipes.pyx @@ -3,8 +3,7 @@ import numpy import srsly import random -from thinc.api import chain, Linear, Maxout, Softmax, LayerNorm, list2array -from thinc.api import zero_init, CosineDistance, to_categorical, get_array_module +from thinc.api import CosineDistance, to_categorical, get_array_module from thinc.api import set_dropout_rate from ..tokens.doc cimport Doc @@ -22,11 +21,6 @@ from ..attrs import POS, ID from ..util import link_vectors_to_models, create_default_optimizer from ..parts_of_speech import X from ..kb import KnowledgeBase -from ..ml.component_models import Tok2Vec, build_tagger_model -from ..ml.component_models import build_text_classifier -from ..ml.component_models import build_simple_cnn_text_classifier -from ..ml.component_models import build_bow_text_classifier, build_nel_encoder -from ..ml.component_models import masked_language_model from ..errors import Errors, 
TempErrors, user_warning, Warnings from .. import util @@ -47,13 +41,8 @@ class Pipe(object): name = None @classmethod - def Model(cls, *shape, **kwargs): - """Initialize a model for the pipe.""" - raise NotImplementedError - - @classmethod - def from_nlp(cls, nlp, **cfg): - return cls(nlp.vocab, **cfg) + def from_nlp(cls, nlp, model, **cfg): + return cls(nlp.vocab, model, **cfg) def _get_doc(self, example): """ Use this method if the `example` can be both a Doc or an Example """ @@ -61,7 +50,7 @@ class Pipe(object): return example return example.doc - def __init__(self, vocab, model=True, **cfg): + def __init__(self, vocab, model, **cfg): """Create a new pipe instance.""" raise NotImplementedError @@ -72,7 +61,6 @@ class Pipe(object): Both __call__ and pipe should delegate to the `predict()` and `set_annotations()` methods. """ - self.require_model() doc = self._get_doc(example) predictions = self.predict([doc]) if isinstance(predictions, tuple) and len(predictions) == 2: @@ -85,11 +73,6 @@ class Pipe(object): return example return doc - def require_model(self): - """Raise an error if the component's model is not initialized.""" - if getattr(self, "model", None) in (None, True, False): - raise ValueError(Errors.E109.format(name=self.name)) - def pipe(self, stream, batch_size=128, n_threads=-1, as_example=False): """Apply the pipe to a stream of documents. @@ -116,7 +99,6 @@ class Pipe(object): """Apply the pipeline's model to a batch of docs, without modifying them. """ - self.require_model() raise NotImplementedError def set_annotations(self, docs, scores, tensors=None): @@ -158,22 +140,23 @@ class Pipe(object): ): """Initialize the pipe for training, using data exampes if available. 
If no model has been initialized yet, the model is added.""" - if self.model is True: - self.model = self.Model(**self.cfg) + self.model.initialize() if hasattr(self, "vocab"): link_vectors_to_models(self.vocab) - self.model.initialize() if sgd is None: sgd = self.create_optimizer() return sgd + def set_output(self, nO): + self.model.set_dim("nO", nO) + if self.model.has_ref("output_layer"): + self.model.get_ref("output_layer").set_dim("nO", nO) + def get_gradients(self): """Get non-zero gradients of the model's parameters, as a dictionary keyed by the parameter ID. The values are (weights, gradients) tuples. """ gradients = {} - if self.model in (None, True, False): - return gradients queue = [self.model] seen = set() for node in queue: @@ -199,8 +182,7 @@ class Pipe(object): """ serialize = {} serialize["cfg"] = lambda: srsly.json_dumps(self.cfg) - if self.model not in (True, False, None): - serialize["model"] = self.model.to_bytes + serialize["model"] = self.model.to_bytes if hasattr(self, "vocab"): serialize["vocab"] = self.vocab.to_bytes exclude = util.get_serialization_exclude(serialize, exclude, kwargs) @@ -210,20 +192,15 @@ class Pipe(object): """Load the pipe from a bytestring.""" def load_model(b): - # TODO: Remove this once we don't have to handle previous models - if self.cfg.get("pretrained_dims") and "pretrained_vectors" not in self.cfg: - self.cfg["pretrained_vectors"] = self.vocab.vectors - if self.model is True: - self.model = self.Model(**self.cfg) try: self.model.from_bytes(b) except AttributeError: raise ValueError(Errors.E149) deserialize = {} - deserialize["cfg"] = lambda b: self.cfg.update(srsly.json_loads(b)) if hasattr(self, "vocab"): deserialize["vocab"] = lambda b: self.vocab.from_bytes(b) + deserialize["cfg"] = lambda b: self.cfg.update(srsly.json_loads(b)) deserialize["model"] = load_model exclude = util.get_serialization_exclude(deserialize, exclude, kwargs) util.from_bytes(bytes_data, deserialize, exclude) @@ -234,8 +211,7 @@ class 
Pipe(object): serialize = {} serialize["cfg"] = lambda p: srsly.write_json(p, self.cfg) serialize["vocab"] = lambda p: self.vocab.to_disk(p) - if self.model not in (None, True, False): - serialize["model"] = lambda p: p.open("wb").write(self.model.to_bytes()) + serialize["model"] = lambda p: p.open("wb").write(self.model.to_bytes()) exclude = util.get_serialization_exclude(serialize, exclude, kwargs) util.to_disk(path, serialize, exclude) @@ -243,19 +219,14 @@ class Pipe(object): """Load the pipe from disk.""" def load_model(p): - # TODO: Remove this once we don't have to handle previous models - if self.cfg.get("pretrained_dims") and "pretrained_vectors" not in self.cfg: - self.cfg["pretrained_vectors"] = self.vocab.vectors - if self.model is True: - self.model = self.Model(**self.cfg) try: self.model.from_bytes(p.open("rb").read()) except AttributeError: raise ValueError(Errors.E149) deserialize = {} - deserialize["cfg"] = lambda p: self.cfg.update(_load_cfg(p)) deserialize["vocab"] = lambda p: self.vocab.from_disk(p) + deserialize["cfg"] = lambda p: self.cfg.update(_load_cfg(p)) deserialize["model"] = load_model exclude = util.get_serialization_exclude(deserialize, exclude, kwargs) util.from_disk(path, deserialize, exclude) @@ -266,31 +237,13 @@ class Pipe(object): class Tensorizer(Pipe): """Pre-train position-sensitive vectors for tokens.""" - @classmethod - def Model(cls, output_size=300, **cfg): - """Create a new statistical model for the class. - - width (int): Output size of the model. - embed_size (int): Number of vectors in the embedding table. - **cfg: Config parameters. - RETURNS (Model): A `thinc.model.Model` or similar instance. - """ - input_size = util.env_opt("token_vector_width", cfg.get("input_size", 96)) - return Linear(output_size, input_size, init_W=zero_init) - - def __init__(self, vocab, model=True, **cfg): + def __init__(self, vocab, model, **cfg): """Construct a new statistical model. Weights are not allocated on initialisation. 
vocab (Vocab): A `Vocab` instance. The model must share the same `Vocab` instance with the `Doc` objects it will process. - model (Model): A `Model` instance or `True` to allocate one later. **cfg: Config parameters. - - EXAMPLE: - >>> from spacy.pipeline import TokenVectorEncoder - >>> tok2vec = TokenVectorEncoder(nlp.vocab) - >>> tok2vec.model = tok2vec.Model(128, 5000) """ self.vocab = vocab self.model = model @@ -337,7 +290,6 @@ class Tensorizer(Pipe): docs (iterable): A sequence of `Doc` objects. RETURNS (object): Vector representations for each token in the docs. """ - self.require_model() inputs = self.model.ops.flatten([doc.tensor for doc in docs]) outputs = self.model(inputs) return self.model.ops.unflatten(outputs, [len(d) for d in docs]) @@ -362,7 +314,6 @@ class Tensorizer(Pipe): sgd (callable): An optimizer. RETURNS (dict): Results from the update. """ - self.require_model() examples = Example.to_example_objects(examples) inputs = [] bp_inputs = [] @@ -405,10 +356,8 @@ class Tensorizer(Pipe): """ if pipeline is not None: for name, model in pipeline: - if getattr(model, "tok2vec", None): - self.input_models.append(model.tok2vec) - if self.model is True: - self.model = self.Model(**self.cfg) + if model.has_ref("tok2vec"): + self.input_models.append(model.get_ref("tok2vec")) self.model.initialize() link_vectors_to_models(self.vocab) if sgd is None: @@ -423,7 +372,7 @@ class Tagger(Pipe): DOCS: https://spacy.io/api/tagger """ - def __init__(self, vocab, model=True, **cfg): + def __init__(self, vocab, model, **cfg): self.vocab = vocab self.model = model self._rehearsal_model = None @@ -433,13 +382,6 @@ class Tagger(Pipe): def labels(self): return tuple(self.vocab.morphology.tag_names) - @property - def tok2vec(self): - if self.model in (None, True, False): - return None - else: - return chain(self.model.get_ref("tok2vec"), list2array()) - def __call__(self, example): doc = self._get_doc(example) tags = self.predict([doc]) @@ -465,7 +407,6 @@ class 
Tagger(Pipe): yield from docs def predict(self, docs): - self.require_model() if not any(len(doc) for doc in docs): # Handle cases where there are no tokens in any docs. n_labels = len(self.labels) @@ -513,7 +454,6 @@ class Tagger(Pipe): doc.is_tagged = True def update(self, examples, drop=0., sgd=None, losses=None, set_annotations=False): - self.require_model() examples = Example.to_example_objects(examples) if losses is not None and self.name not in losses: losses[self.name] = 0. @@ -600,52 +540,21 @@ class Tagger(Pipe): vocab.morphology = Morphology(vocab.strings, new_tag_map, vocab.morphology.lemmatizer, exc=vocab.morphology.exc) - self.cfg["pretrained_vectors"] = kwargs.get("pretrained_vectors") - if self.model is True: - for hp in ["token_vector_width", "conv_depth"]: - if hp in kwargs: - self.cfg[hp] = kwargs[hp] - self.model = self.Model(self.vocab.morphology.n_tags, **self.cfg) + self.set_output(len(self.labels)) + self.model.initialize() # Get batch of example docs, example outputs to call begin_training(). # This lets the model infer shapes. - n_tags = self.vocab.morphology.n_tags - for node in self.model.walk(): - # TODO: softmax hack ? 
- if node.name == "softmax" and node.has_dim("nO") is None: - node.set_dim("nO", n_tags) link_vectors_to_models(self.vocab) - self.model.initialize() if sgd is None: sgd = self.create_optimizer() return sgd - @classmethod - def Model(cls, n_tags=None, **cfg): - if cfg.get("pretrained_dims") and not cfg.get("pretrained_vectors"): - raise ValueError(TempErrors.T008) - if "tok2vec" in cfg: - tok2vec = cfg["tok2vec"] - else: - config = { - "width": cfg.get("token_vector_width", 96), - "embed_size": cfg.get("embed_size", 2000), - "pretrained_vectors": cfg.get("pretrained_vectors", None), - "window_size": cfg.get("window_size", 1), - "cnn_maxout_pieces": cfg.get("cnn_maxout_pieces", 3), - "subword_features": cfg.get("subword_features", True), - "char_embed": cfg.get("char_embed", False), - "conv_depth": cfg.get("conv_depth", 4), - "bilstm_depth": cfg.get("bilstm_depth", 0), - } - tok2vec = Tok2Vec(**config) - return build_tagger_model(n_tags, tok2vec) - def add_label(self, label, values=None): if not isinstance(label, str): raise ValueError(Errors.E187) if label in self.labels: return 0 - if self.model not in (True, False, None): + if self.model.has_dim("nO"): # Here's how the model resizing will work, once the # neuron-to-tag mapping is no longer controlled by # the Morphology class, which sorts the tag names. 
@@ -672,8 +581,7 @@ class Tagger(Pipe): def to_bytes(self, exclude=tuple(), **kwargs): serialize = {} - if self.model not in (None, True, False): - serialize["model"] = self.model.to_bytes + serialize["model"] = self.model.to_bytes serialize["vocab"] = self.vocab.to_bytes serialize["cfg"] = lambda: srsly.json_dumps(self.cfg) tag_map = dict(sorted(self.vocab.morphology.tag_map.items())) @@ -683,14 +591,6 @@ class Tagger(Pipe): def from_bytes(self, bytes_data, exclude=tuple(), **kwargs): def load_model(b): - # TODO: Remove this once we don't have to handle previous models - if self.cfg.get("pretrained_dims") and "pretrained_vectors" not in self.cfg: - self.cfg["pretrained_vectors"] = self.vocab.vectors - if self.model is True: - token_vector_width = util.env_opt( - "token_vector_width", - self.cfg.get("token_vector_width", 96)) - self.model = self.Model(**self.cfg) try: self.model.from_bytes(b) except AttributeError: @@ -719,18 +619,13 @@ class Tagger(Pipe): "vocab": lambda p: self.vocab.to_disk(p), "tag_map": lambda p: srsly.write_msgpack(p, tag_map), "model": lambda p: p.open("wb").write(self.model.to_bytes()), - "cfg": lambda p: srsly.write_json(p, self.cfg) + "cfg": lambda p: srsly.write_json(p, self.cfg), } exclude = util.get_serialization_exclude(serialize, exclude, kwargs) util.to_disk(path, serialize, exclude) def from_disk(self, path, exclude=tuple(), **kwargs): def load_model(p): - # TODO: Remove this once we don't have to handle previous models - if self.cfg.get("pretrained_dims") and "pretrained_vectors" not in self.cfg: - self.cfg["pretrained_vectors"] = self.vocab.vectors - if self.model is True: - self.model = self.Model(**self.cfg) with p.open("rb") as file_: try: self.model.from_bytes(file_.read()) @@ -745,8 +640,8 @@ class Tagger(Pipe): exc=self.vocab.morphology.exc) deserialize = { - "cfg": lambda p: self.cfg.update(_load_cfg(p)), "vocab": lambda p: self.vocab.from_disk(p), + "cfg": lambda p: self.cfg.update(_load_cfg(p)), "tag_map": load_tag_map, 
"model": load_model, } @@ -762,16 +657,11 @@ class SentenceRecognizer(Tagger): DOCS: https://spacy.io/api/sentencerecognizer """ - def __init__(self, vocab, model=True, **cfg): + def __init__(self, vocab, model, **cfg): self.vocab = vocab self.model = model self._rehearsal_model = None self.cfg = dict(sorted(cfg.items())) - self.cfg.setdefault("cnn_maxout_pieces", 2) - self.cfg.setdefault("subword_features", True) - self.cfg.setdefault("token_vector_width", 12) - self.cfg.setdefault("conv_depth", 1) - self.cfg.setdefault("pretrained_vectors", None) @property def labels(self): @@ -797,7 +687,6 @@ class SentenceRecognizer(Tagger): doc.c[j].sent_start = -1 def update(self, examples, drop=0., sgd=None, losses=None): - self.require_model() examples = Example.to_example_objects(examples) if losses is not None and self.name not in losses: losses[self.name] = 0. @@ -844,20 +733,12 @@ class SentenceRecognizer(Tagger): def begin_training(self, get_examples=lambda: [], pipeline=None, sgd=None, **kwargs): cdef Vocab vocab = self.vocab - if self.model is True: - for hp in ["token_vector_width", "conv_depth"]: - if hp in kwargs: - self.cfg[hp] = kwargs[hp] - self.model = self.Model(len(self.labels), **self.cfg) + self.set_output(len(self.labels)) + self.model.initialize() if sgd is None: sgd = self.create_optimizer() - self.model.initialize() return sgd - @classmethod - def Model(cls, n_tags, **cfg): - return build_tagger_model(n_tags, **cfg) - def add_label(self, label, values=None): raise NotImplementedError @@ -867,8 +748,7 @@ class SentenceRecognizer(Tagger): def to_bytes(self, exclude=tuple(), **kwargs): serialize = {} - if self.model not in (None, True, False): - serialize["model"] = self.model.to_bytes + serialize["model"] = self.model.to_bytes serialize["vocab"] = self.vocab.to_bytes serialize["cfg"] = lambda: srsly.json_dumps(self.cfg) exclude = util.get_serialization_exclude(serialize, exclude, kwargs) @@ -876,8 +756,6 @@ class SentenceRecognizer(Tagger): def 
from_bytes(self, bytes_data, exclude=tuple(), **kwargs): def load_model(b): - if self.model is True: - self.model = self.Model(len(self.labels), **self.cfg) try: self.model.from_bytes(b) except AttributeError: @@ -896,15 +774,13 @@ class SentenceRecognizer(Tagger): serialize = { "vocab": lambda p: self.vocab.to_disk(p), "model": lambda p: p.open("wb").write(self.model.to_bytes()), - "cfg": lambda p: srsly.write_json(p, self.cfg) + "cfg": lambda p: srsly.write_json(p, self.cfg), } exclude = util.get_serialization_exclude(serialize, exclude, kwargs) util.to_disk(path, serialize, exclude) def from_disk(self, path, exclude=tuple(), **kwargs): def load_model(p): - if self.model is True: - self.model = self.Model(len(self.labels), **self.cfg) with p.open("rb") as file_: try: self.model.from_bytes(file_.read()) @@ -912,8 +788,8 @@ class SentenceRecognizer(Tagger): raise ValueError(Errors.E149) deserialize = { - "cfg": lambda p: self.cfg.update(_load_cfg(p)), "vocab": lambda p: self.vocab.from_disk(p), + "cfg": lambda p: self.cfg.update(_load_cfg(p)), "model": load_model, } exclude = util.get_serialization_exclude(deserialize, exclude, kwargs) @@ -927,7 +803,7 @@ class MultitaskObjective(Tagger): side-objective. 
""" - def __init__(self, vocab, model=True, target='dep_tag_offset', **cfg): + def __init__(self, vocab, model, target='dep_tag_offset', **cfg): self.vocab = vocab self.model = model if target == "dep": @@ -947,7 +823,8 @@ class MultitaskObjective(Tagger): else: raise ValueError(Errors.E016) self.cfg = dict(cfg) - self.cfg.setdefault("cnn_maxout_pieces", 2) + # TODO: remove - put in config + self.cfg.setdefault("maxout_pieces", 2) @property def labels(self): @@ -969,30 +846,15 @@ class MultitaskObjective(Tagger): label = self.make_label(i, example.token_annotation) if label is not None and label not in self.labels: self.labels[label] = len(self.labels) - if self.model is True: - token_vector_width = util.env_opt("token_vector_width") - self.model = self.Model(len(self.labels), tok2vec=tok2vec) - link_vectors_to_models(self.vocab) self.model.initialize() + link_vectors_to_models(self.vocab) if sgd is None: sgd = self.create_optimizer() return sgd - @classmethod - def Model(cls, n_tags, tok2vec=None, **cfg): - token_vector_width = util.env_opt("token_vector_width", 96) - model = chain( - tok2vec, - Maxout(nO=token_vector_width*2, nI=token_vector_width, nP=3, dropout=0.0), - LayerNorm(token_vector_width*2), - Softmax(nO=n_tags, nI=token_vector_width*2) - ) - return model - def predict(self, docs): - self.require_model() - tokvecs = self.model.tok2vec(docs) - scores = self.model.softmax(tokvecs) + tokvecs = self.model.get_ref("tok2vec")(docs) + scores = self.model.get_ref("softmax")(tokvecs) return tokvecs, scores def get_loss(self, examples, scores): @@ -1097,18 +959,7 @@ class MultitaskObjective(Tagger): class ClozeMultitask(Pipe): - @classmethod - def Model(cls, vocab, tok2vec, **cfg): - output_size = vocab.vectors.data.shape[1] - output_layer = chain( - Maxout(nO=output_size, nI=tok2vec.get_dim("nO"), nP=3, normalize=True, dropout=0.0), - Linear(nO=output_size, nI=output_size, init_W=zero_init) - ) - model = chain(tok2vec, output_layer) - model = 
masked_language_model(vocab, model) - return model - - def __init__(self, vocab, model=True, **cfg): + def __init__(self, vocab, model, **cfg): self.vocab = vocab self.model = model self.cfg = cfg @@ -1120,19 +971,16 @@ class ClozeMultitask(Pipe): def begin_training(self, get_examples=lambda: [], pipeline=None, tok2vec=None, sgd=None, **kwargs): link_vectors_to_models(self.vocab) - if self.model is True: - self.model = self.Model(self.vocab, tok2vec) - X = self.model.ops.alloc((5, self.model.get_ref("tok2vec").get_dim("nO"))) self.model.initialize() + X = self.model.ops.alloc((5, self.model.get_ref("tok2vec").get_dim("nO"))) self.model.output_layer.begin_training(X) if sgd is None: sgd = self.create_optimizer() return sgd def predict(self, docs): - self.require_model() - tokvecs = self.model.tok2vec(docs) - vectors = self.model.output_layer(tokvecs) + tokvecs = self.model.get_ref("tok2vec")(docs) + vectors = self.model.get_ref("output_layer")(tokvecs) return tokvecs, vectors def get_loss(self, examples, vectors, prediction): @@ -1150,7 +998,6 @@ class ClozeMultitask(Pipe): pass def rehearse(self, examples, drop=0., sgd=None, losses=None): - self.require_model() examples = Example.to_example_objects(examples) if losses is not None and self.name not in losses: losses[self.name] = 0. @@ -1171,62 +1018,11 @@ class TextCategorizer(Pipe): DOCS: https://spacy.io/api/textcategorizer """ - - @classmethod - def Model(cls, nr_class=1, exclusive_classes=None, **cfg): - if nr_class == 1: - exclusive_classes = False - if exclusive_classes is None: - raise ValueError( - "TextCategorizer Model must specify 'exclusive_classes'. " - "This setting determines whether the model will output " - "scores that sum to 1 for each example. If only one class " - "is true for each example, you should set exclusive_classes=True. " - "For 'multi_label' classification, set exclusive_classes=False." 
- ) - if "embed_size" not in cfg: - cfg["embed_size"] = util.env_opt("embed_size", 2000) - if "token_vector_width" not in cfg: - cfg["token_vector_width"] = util.env_opt("token_vector_width", 96) - if cfg.get("architecture") == "bow": - return build_bow_text_classifier(nr_class, exclusive_classes, **cfg) - else: - if "tok2vec" in cfg: - tok2vec = cfg["tok2vec"] - else: - config = { - "width": cfg.get("token_vector_width", 96), - "embed_size": cfg.get("embed_size", 2000), - "pretrained_vectors": cfg.get("pretrained_vectors", None), - "window_size": cfg.get("window_size", 1), - "cnn_maxout_pieces": cfg.get("cnn_maxout_pieces", 3), - "subword_features": cfg.get("subword_features", True), - "char_embed": cfg.get("char_embed", False), - "conv_depth": cfg.get("conv_depth", 4), - "bilstm_depth": cfg.get("bilstm_depth", 0), - } - tok2vec = Tok2Vec(**config) - return build_simple_cnn_text_classifier( - tok2vec, - nr_class, - exclusive_classes, - **cfg - ) - - @property - def tok2vec(self): - if self.model in (None, True, False): - return None - else: - return self.model.tok2vec - - def __init__(self, vocab, model=True, **cfg): + def __init__(self, vocab, model, **cfg): self.vocab = vocab self.model = model self._rehearsal_model = None self.cfg = dict(cfg) - if "exclusive_classes" not in cfg: - self.cfg["exclusive_classes"] = True @property def labels(self): @@ -1255,7 +1051,6 @@ class TextCategorizer(Pipe): yield from docs def predict(self, docs): - self.require_model() tensors = [doc.tensor for doc in docs] if not any(len(doc) for doc in docs): @@ -1274,7 +1069,6 @@ class TextCategorizer(Pipe): doc.cats[label] = float(scores[i, j]) def update(self, examples, state=None, drop=0., set_annotations=False, sgd=None, losses=None): - self.require_model() examples = Example.to_example_objects(examples) if not any(len(ex.doc) if ex.doc else 0 for ex in examples): # Handle cases where there are no tokens in any docs. 
@@ -1311,7 +1105,7 @@ class TextCategorizer(Pipe): losses.setdefault(self.name, 0.0) losses[self.name] += (gradient**2).sum() - def get_loss(self, examples, scores): + def _examples_to_truth(self, examples): golds = [ex.gold for ex in examples] truths = numpy.zeros((len(golds), len(self.labels)), dtype="f") not_missing = numpy.ones((len(golds), len(self.labels)), dtype="f") @@ -1322,6 +1116,10 @@ class TextCategorizer(Pipe): else: not_missing[i, j] = 0. truths = self.model.ops.asarray(truths) + return truths, not_missing + + def get_loss(self, examples, scores): + truths, not_missing = self._examples_to_truth(examples) not_missing = self.model.ops.asarray(not_missing) d_scores = (scores-truths) / scores.shape[0] d_scores *= not_missing @@ -1333,7 +1131,7 @@ class TextCategorizer(Pipe): raise ValueError(Errors.E187) if label in self.labels: return 0 - if self.model not in (None, True, False): + if self.model.has_dim("nO"): # This functionality was available previously, but was broken. # The problem is that we resize the last layer, but the last layer # is actually just an ensemble. We're not resizing the child layers @@ -1348,19 +1146,18 @@ class TextCategorizer(Pipe): return 1 def begin_training(self, get_examples=lambda: [], pipeline=None, sgd=None, **kwargs): - for example in get_examples(): + # TODO: begin_training is not guaranteed to see all data / labels ? 
+ examples = list(get_examples()) + for example in examples: for cat in example.doc_annotation.cats: self.add_label(cat) - if self.model is True: - self.cfg.update(kwargs) - self.require_labels() - self.model = self.Model(len(self.labels), **self.cfg) - link_vectors_to_models(self.vocab) + self.require_labels() + docs = [Doc(Vocab(), words=["hello"])] + truths, _ = self._examples_to_truth(examples) + self.set_output(len(self.labels)) + self.model.initialize(X=docs, Y=truths) if sgd is None: sgd = self.create_optimizer() - # TODO: use get_examples instead - docs = [Doc(Vocab(), words=["hello"])] - self.model.initialize(X=docs) return sgd @@ -1393,7 +1190,7 @@ cdef class DependencyParser(Parser): def init_multitask_objectives(self, get_examples, pipeline, sgd=None, **cfg): for labeller in self._multitasks: - tok2vec = self.model.tok2vec + tok2vec = self.model.get_ref("tok2vec") labeller.begin_training(get_examples, pipeline=pipeline, tok2vec=tok2vec, sgd=sgd) @@ -1423,7 +1220,6 @@ cdef class EntityRecognizer(Parser): assigns = ["doc.ents", "token.ent_iob", "token.ent_type"] requires = [] TransitionSystem = BiluoPushDown - nr_feature = 6 def add_multitask_objective(self, target): if target == "cloze": @@ -1435,7 +1231,7 @@ cdef class EntityRecognizer(Parser): def init_multitask_objectives(self, get_examples, pipeline, sgd=None, **cfg): for labeller in self._multitasks: - tok2vec = self.model.tok2vec + tok2vec = self.model.get_ref("tok2vec") labeller.begin_training(get_examples, pipeline=pipeline, tok2vec=tok2vec) @@ -1464,18 +1260,9 @@ class EntityLinker(Pipe): """ NIL = "NIL" # string used to refer to a non-existing link - @classmethod - def Model(cls, **cfg): - embed_width = cfg.get("embed_width", 300) - hidden_width = cfg.get("hidden_width", 128) - type_to_int = cfg.get("type_to_int", dict()) - - model = build_nel_encoder(embed_width=embed_width, hidden_width=hidden_width, ner_types=len(type_to_int), **cfg) - return model - - def __init__(self, vocab, **cfg): + def 
__init__(self, vocab, model, **cfg): self.vocab = vocab - self.model = True + self.model = model self.kb = None self.cfg = dict(cfg) self.distance = CosineDistance(normalize=False) @@ -1483,11 +1270,6 @@ class EntityLinker(Pipe): def set_kb(self, kb): self.kb = kb - def require_model(self): - # Raise an error if the component's model is not initialized. - if getattr(self, "model", None) in (None, True, False): - raise ValueError(Errors.E109.format(name=self.name)) - def require_kb(self): # Raise an error if the knowledge base is not initialized. if getattr(self, "kb", None) in (None, True, False): @@ -1495,16 +1277,14 @@ class EntityLinker(Pipe): def begin_training(self, get_examples=lambda: [], pipeline=None, sgd=None, **kwargs): self.require_kb() - self.cfg["entity_width"] = self.kb.entity_vector_length - if self.model is True: - self.model = self.Model(**self.cfg) + nO = self.kb.entity_vector_length + self.set_output(nO) self.model.initialize() if sgd is None: sgd = self.create_optimizer() return sgd def update(self, examples, state=None, set_annotations=False, drop=0.0, sgd=None, losses=None): - self.require_model() self.require_kb() if losses is not None: losses.setdefault(self.name, 0.0) @@ -1614,7 +1394,6 @@ class EntityLinker(Pipe): def predict(self, docs): """ Return the KB IDs for each entity in each doc, including NIL if there is no prediction """ - self.require_model() self.require_kb() entity_count = 0 @@ -1714,15 +1493,12 @@ class EntityLinker(Pipe): serialize["cfg"] = lambda p: srsly.write_json(p, self.cfg) serialize["vocab"] = lambda p: self.vocab.to_disk(p) serialize["kb"] = lambda p: self.kb.dump(p) - if self.model not in (None, True, False): - serialize["model"] = lambda p: p.open("wb").write(self.model.to_bytes()) + serialize["model"] = lambda p: p.open("wb").write(self.model.to_bytes()) exclude = util.get_serialization_exclude(serialize, exclude, kwargs) util.to_disk(path, serialize, exclude) def from_disk(self, path, exclude=tuple(), 
**kwargs): def load_model(p): - if self.model is True: - self.model = self.Model(**self.cfg) try: self.model.from_bytes(p.open("rb").read()) except AttributeError: @@ -1734,8 +1510,8 @@ class EntityLinker(Pipe): self.set_kb(kb) deserialize = {} - deserialize["cfg"] = lambda p: self.cfg.update(_load_cfg(p)) deserialize["vocab"] = lambda p: self.vocab.from_disk(p) + deserialize["cfg"] = lambda p: self.cfg.update(_load_cfg(p)) deserialize["kb"] = load_kb deserialize["model"] = load_model exclude = util.get_serialization_exclude(deserialize, exclude, kwargs) @@ -1782,7 +1558,7 @@ class Sentencizer(Pipe): self.punct_chars = set(self.default_punct_chars) @classmethod - def from_nlp(cls, nlp, **cfg): + def from_nlp(cls, nlp, model=None, **cfg): return cls(**cfg) def __call__(self, example): @@ -1915,8 +1691,8 @@ class Sentencizer(Pipe): # Cython classes can't be decorated, so we need to add the factories here -Language.factories["parser"] = lambda nlp, **cfg: DependencyParser.from_nlp(nlp, **cfg) -Language.factories["ner"] = lambda nlp, **cfg: EntityRecognizer.from_nlp(nlp, **cfg) +Language.factories["parser"] = lambda nlp, model, **cfg: DependencyParser.from_nlp(nlp, model, **cfg) +Language.factories["ner"] = lambda nlp, model, **cfg: EntityRecognizer.from_nlp(nlp, model, **cfg) __all__ = ["Tagger", "DependencyParser", "EntityRecognizer", "Tensorizer", "TextCategorizer", "EntityLinker", "Sentencizer", "SentenceRecognizer"] diff --git a/spacy/pipeline/tok2vec.py b/spacy/pipeline/tok2vec.py index 8290468cf..a49f94ca3 100644 --- a/spacy/pipeline/tok2vec.py +++ b/spacy/pipeline/tok2vec.py @@ -5,32 +5,21 @@ from ..gold import Example from ..tokens import Doc from ..vocab import Vocab from ..language import component -from ..util import link_vectors_to_models, minibatch, registry, eg2doc +from ..util import link_vectors_to_models, minibatch, eg2doc @component("tok2vec", assigns=["doc.tensor"]) class Tok2Vec(Pipe): - @classmethod - def from_nlp(cls, nlp, **cfg): - return 
cls(nlp.vocab, **cfg) @classmethod - def Model(cls, architecture, **cfg): - """Create a new statistical model for the class. + def from_nlp(cls, nlp, model, **cfg): + return cls(nlp.vocab, model, **cfg) - architecture (str): The registered model architecture to use. - **cfg: Config parameters. - RETURNS (Model): A `thinc.model.Model` or similar instance. - """ - model = registry.architectures.get(architecture) - return model(**cfg) - - def __init__(self, vocab, model=True, **cfg): + def __init__(self, vocab, model, **cfg): """Construct a new statistical model. Weights are not allocated on initialisation. vocab (Vocab): A `Vocab` instance. The model must share the same `Vocab` instance with the `Doc` objects it will process. - model (Model): A `Model` instance or `True` to allocate one later. **cfg: Config parameters. """ self.vocab = vocab @@ -143,8 +132,6 @@ class Tok2Vec(Pipe): get_examples (function): Function returning example training data. pipeline (list): The pipeline the model is part of. """ - if self.model is True: - self.model = self.Model(**self.cfg) # TODO: use examples instead ? 
docs = [Doc(Vocab(), words=["hello"])] self.model.initialize(X=docs) diff --git a/spacy/syntax/_parser_model.pyx b/spacy/syntax/_parser_model.pyx index 442233f19..7ff9517a5 100644 --- a/spacy/syntax/_parser_model.pyx +++ b/spacy/syntax/_parser_model.pyx @@ -221,7 +221,10 @@ cdef int arg_max_if_valid(const weight_t* scores, const int* is_valid, int n) no class ParserModel(Model): def __init__(self, tok2vec, lower_model, upper_model, unseen_classes=None): - Model.__init__(self, name="parser_model", forward=forward) + # don't define nO for this object, because we can't dynamically change it + Model.__init__(self, name="parser_model", forward=forward, dims={"nI": None}) + if tok2vec.has_dim("nI"): + self.set_dim("nI", tok2vec.get_dim("nI")) self._layers = [tok2vec, lower_model] if upper_model is not None: self._layers.append(upper_model) @@ -229,6 +232,7 @@ class ParserModel(Model): if unseen_classes: for class_ in unseen_classes: self.unseen_classes.add(class_) + self.set_ref("tok2vec", tok2vec) def predict(self, docs): step_model = ParserStepModel(docs, self._layers, @@ -238,25 +242,32 @@ class ParserModel(Model): def resize_output(self, new_nO): if len(self._layers) == 2: return - if new_nO == self.upper.get_dim("nO"): + if self.upper.has_dim("nO") and (new_nO == self.upper.get_dim("nO")): return smaller = self.upper - nI = smaller.get_dim("nI") + nI = None + if smaller.has_dim("nI"): + nI = smaller.get_dim("nI") with use_ops('numpy'): - larger = Linear(new_nO, nI) - larger_W = larger.ops.alloc2f(new_nO, nI) - larger_b = larger.ops.alloc1f(new_nO) - smaller_W = smaller.get_param("W") - smaller_b = smaller.get_param("b") - # Weights are stored in (nr_out, nr_in) format, so we're basically - # just adding rows here. 
- larger_W[:smaller.get_dim("nO")] = smaller_W - larger_b[:smaller.get_dim("nO")] = smaller_b - larger.set_param("W", larger_W) - larger.set_param("b", larger_b) + larger = Linear(nO=new_nO, nI=nI) + larger._init = smaller._init + # it could be that the model is not initialized yet, then skip this bit + if nI: + larger_W = larger.ops.alloc2f(new_nO, nI) + larger_b = larger.ops.alloc1f(new_nO) + smaller_W = smaller.get_param("W") + smaller_b = smaller.get_param("b") + # Weights are stored in (nr_out, nr_in) format, so we're basically + # just adding rows here. + if smaller.has_dim("nO"): + larger_W[:smaller.get_dim("nO")] = smaller_W + larger_b[:smaller.get_dim("nO")] = smaller_b + for i in range(smaller.get_dim("nO"), new_nO): + self.unseen_classes.add(i) + + larger.set_param("W", larger_W) + larger.set_param("b", larger_b) self._layers[-1] = larger - for i in range(smaller.get_dim("nO"), new_nO): - self.unseen_classes.add(i) def initialize(self, X=None, Y=None): self.tok2vec.initialize() @@ -412,7 +423,7 @@ cdef class precompute_hiddens: we can do all our hard maths up front, packed into large multiplications, and do the hard-to-program parsing on the CPU. 
""" - cdef readonly int nF, nO, nP # TODO: make these more like the dimensions in thinc + cdef readonly int nF, nO, nP cdef bint _is_synchronized cdef public object ops cdef np.ndarray _features @@ -458,6 +469,16 @@ cdef class precompute_hiddens: self._is_synchronized = True return self._cached.data + def has_dim(self, name): + if name == "nF": + return self.nF if self.nF is not None else True + elif name == "nP": + return self.nP if self.nP is not None else True + elif name == "nO": + return self.nO if self.nO is not None else True + else: + return False + def get_dim(self, name): if name == "nF": return self.nF @@ -468,6 +489,16 @@ cdef class precompute_hiddens: else: raise ValueError(f"Dimension {name} invalid -- only nO, nF, nP") + def set_dim(self, name, value): + if name == "nF": + self.nF = value + elif name == "nP": + self.nP = value + elif name == "nO": + self.nO = value + else: + raise ValueError(f"Dimension {name} invalid -- only nO, nF, nP") + def __call__(self, X, bint is_train): if is_train: return self.begin_update(X) diff --git a/spacy/syntax/nn_parser.pyx b/spacy/syntax/nn_parser.pyx index cf57e1cf6..9381fab6b 100644 --- a/spacy/syntax/nn_parser.pyx +++ b/spacy/syntax/nn_parser.pyx @@ -27,11 +27,11 @@ from ._parser_model cimport predict_states, arg_max_if_valid from ._parser_model cimport WeightsC, ActivationsC, SizesC, cpu_log_loss from ._parser_model cimport get_c_weights, get_c_sizes from ._parser_model import ParserModel -from ..util import link_vectors_to_models, create_default_optimizer +from ..util import link_vectors_to_models, create_default_optimizer, registry from ..compat import copy_array from ..tokens.doc cimport Doc from ..gold cimport GoldParse -from ..errors import Errors, TempErrors +from ..errors import Errors, user_warning, Warnings from .. import util from .stateclass cimport StateClass from ._state cimport StateC @@ -41,114 +41,42 @@ from . import _beam_utils from . 
import nonproj -from ..ml._layers import PrecomputableAffine -from ..ml.component_models import Tok2Vec - - cdef class Parser: """ Base class of the DependencyParser and EntityRecognizer. """ - @classmethod - def Model(cls, nr_class, **cfg): - depth = util.env_opt('parser_hidden_depth', cfg.get('hidden_depth', 1)) - subword_features = util.env_opt('subword_features', - cfg.get('subword_features', True)) - conv_depth = util.env_opt('conv_depth', cfg.get('conv_depth', 4)) - conv_window = util.env_opt('conv_window', cfg.get('conv_window', 1)) - t2v_pieces = util.env_opt('cnn_maxout_pieces', cfg.get('cnn_maxout_pieces', 3)) - bilstm_depth = util.env_opt('bilstm_depth', cfg.get('bilstm_depth', 0)) - self_attn_depth = util.env_opt('self_attn_depth', cfg.get('self_attn_depth', 0)) - nr_feature_tokens = cfg.get("nr_feature_tokens", cls.nr_feature) - if depth not in (0, 1): - raise ValueError(TempErrors.T004.format(value=depth)) - parser_maxout_pieces = util.env_opt('parser_maxout_pieces', - cfg.get('maxout_pieces', 2)) - token_vector_width = util.env_opt('token_vector_width', - cfg.get('token_vector_width', 96)) - hidden_width = util.env_opt('hidden_width', cfg.get('hidden_width', 64)) - if depth == 0: - hidden_width = nr_class - parser_maxout_pieces = 1 - embed_size = util.env_opt('embed_size', cfg.get('embed_size', 2000)) - pretrained_vectors = cfg.get('pretrained_vectors', None) - tok2vec = Tok2Vec(width=token_vector_width, - embed_size=embed_size, - conv_depth=conv_depth, - window_size=conv_window, - cnn_maxout_pieces=t2v_pieces, - subword_features=subword_features, - pretrained_vectors=pretrained_vectors, - bilstm_depth=bilstm_depth) - tok2vec = chain(tok2vec, list2array()) - tok2vec.set_dim("nO", token_vector_width) - lower = PrecomputableAffine(hidden_width, - nF=nr_feature_tokens, nI=token_vector_width, - nP=parser_maxout_pieces) - lower.set_dim("nP", parser_maxout_pieces) - if depth == 1: - with use_ops('numpy'): - upper = Linear(nr_class, hidden_width, 
init_W=zero_init) - else: - upper = None - - cfg = { - 'nr_class': nr_class, - 'nr_feature_tokens': nr_feature_tokens, - 'hidden_depth': depth, - 'token_vector_width': token_vector_width, - 'hidden_width': hidden_width, - 'maxout_pieces': parser_maxout_pieces, - 'pretrained_vectors': pretrained_vectors, - 'bilstm_depth': bilstm_depth, - 'self_attn_depth': self_attn_depth, - 'conv_depth': conv_depth, - 'window_size': conv_window, - 'embed_size': embed_size, - 'cnn_maxout_pieces': t2v_pieces - } - model = ParserModel(tok2vec, lower, upper) - model.initialize() - return model, cfg - name = 'base_parser' - def __init__(self, Vocab vocab, moves=True, model=True, **cfg): + + def __init__(self, Vocab vocab, model, **cfg): """Create a Parser. vocab (Vocab): The vocabulary object. Must be shared with documents to be processed. The value is set to the `.vocab` attribute. - moves (TransitionSystem): Defines how the parse-state is created, - updated and evaluated. The value is set to the .moves attribute - unless True (default), in which case a new instance is created with - `Parser.Moves()`. - model (object): Defines how the parse-state is created, updated and - evaluated. The value is set to the .model attribute. If set to True - (default), a new instance will be created with `Parser.Model()` - in parser.begin_training(), parser.from_disk() or parser.from_bytes(). - **cfg: Arbitrary configuration parameters. Set to the `.cfg` attribute + **cfg: Configuration parameters. Set to the `.cfg` attribute. + If it doesn't include a value for 'moves', a new instance is + created with `self.TransitionSystem()`. This defines how the + parse-state is created, updated and evaluated. 
""" self.vocab = vocab - if moves is True: - self.moves = self.TransitionSystem(self.vocab.strings) - else: - self.moves = moves - if 'beam_width' not in cfg: - cfg['beam_width'] = util.env_opt('beam_width', 1) - if 'beam_density' not in cfg: - cfg['beam_density'] = util.env_opt('beam_density', 0.0) - if 'beam_update_prob' not in cfg: - cfg['beam_update_prob'] = util.env_opt('beam_update_prob', 1.0) - cfg.setdefault('cnn_maxout_pieces', 3) - cfg.setdefault("nr_feature_tokens", self.nr_feature) - self.cfg = cfg + moves = cfg.get("moves", None) + if moves is None: + # defined by EntityRecognizer as a BiluoPushDown + moves = self.TransitionSystem(self.vocab.strings) + self.moves = moves + cfg.setdefault('min_action_freq', 30) + cfg.setdefault('learn_tokens', False) + cfg.setdefault('beam_width', 1) + cfg.setdefault('beam_update_prob', 1.0) # or 0.5 (both defaults were previously used) self.model = model + self.set_output(self.moves.n_moves) + self.cfg = cfg self._multitasks = [] self._rehearsal_model = None @classmethod - def from_nlp(cls, nlp, **cfg): - return cls(nlp.vocab, **cfg) + def from_nlp(cls, nlp, model, **cfg): + return cls(nlp.vocab, model, **cfg) def __reduce__(self): return (Parser, (self.vocab, self.moves, self.model), None, None) @@ -163,8 +91,6 @@ cdef class Parser: names.append(name) return names - nr_feature = 8 - @property def labels(self): class_names = [self.moves.get_class_name(i) for i in range(self.moves.n_moves)] @@ -173,7 +99,7 @@ cdef class Parser: @property def tok2vec(self): '''Return the embedding and convolutional layer of the model.''' - return None if self.model in (None, True, False) else self.model.tok2vec + return self.model.tok2vec @property def postprocesses(self): @@ -190,10 +116,7 @@ cdef class Parser: self._resize() def _resize(self): - if "nr_class" in self.cfg: - self.cfg["nr_class"] = self.moves.n_moves - if self.model not in (True, False, None): - self.model.resize_output(self.moves.n_moves) + 
self.model.resize_output(self.moves.n_moves) if self._rehearsal_model not in (True, False, None): self._rehearsal_model.resize_output(self.moves.n_moves) @@ -227,7 +150,7 @@ cdef class Parser: doc (Doc): The document to be processed. """ if beam_width is None: - beam_width = self.cfg.get('beam_width', 1) + beam_width = self.cfg['beam_width'] beam_density = self.cfg.get('beam_density', 0.) states = self.predict([doc], beam_width=beam_width, beam_density=beam_density) @@ -243,7 +166,7 @@ cdef class Parser: YIELDS (Doc): Documents, in order. """ if beam_width is None: - beam_width = self.cfg.get('beam_width', 1) + beam_width = self.cfg['beam_width'] beam_density = self.cfg.get('beam_density', 0.) cdef Doc doc for batch in util.minibatch(docs, size=batch_size): @@ -264,13 +187,7 @@ cdef class Parser: else: yield from batch_in_order - def require_model(self): - """Raise an error if the component's model is not initialized.""" - if getattr(self, 'model', None) in (None, True, False): - raise ValueError(Errors.E109.format(name=self.name)) - def predict(self, docs, beam_width=1, beam_density=0.0, drop=0.): - self.require_model() if isinstance(docs, Doc): docs = [docs] if not any(len(doc) for doc in docs): @@ -313,11 +230,11 @@ cdef class Parser: # if labels are missing. We therefore have to check whether we need to # expand our model output. 
self._resize() + cdef int nr_feature = self.model.lower.get_dim("nF") model = self.model.predict(docs) - token_ids = numpy.zeros((len(docs) * beam_width, self.nr_feature), + token_ids = numpy.zeros((len(docs) * beam_width, nr_feature), dtype='i', order='C') cdef int* c_ids - cdef int nr_feature = self.cfg["nr_feature_tokens"] cdef int n_states model = self.model.predict(docs) todo = [beam for beam in beams if not beam.is_done] @@ -430,7 +347,6 @@ cdef class Parser: return [b for b in beams if not b.is_done] def update(self, examples, drop=0., set_annotations=False, sgd=None, losses=None): - self.require_model() examples = Example.to_example_objects(examples) if losses is None: @@ -440,9 +356,9 @@ cdef class Parser: multitask.update(examples, drop=drop, sgd=sgd) # The probability we use beam update, instead of falling back to # a greedy update - beam_update_prob = self.cfg.get('beam_update_prob', 0.5) - if self.cfg.get('beam_width', 1) >= 2 and numpy.random.random() < beam_update_prob: - return self.update_beam(examples, self.cfg.get('beam_width', 1), + beam_update_prob = self.cfg['beam_update_prob'] + if self.cfg['beam_width'] >= 2 and numpy.random.random() < beam_update_prob: + return self.update_beam(examples, self.cfg['beam_width'], drop=drop, sgd=sgd, losses=losses, set_annotations=set_annotations, beam_density=self.cfg.get('beam_density', 0.001)) @@ -533,7 +449,7 @@ cdef class Parser: set_dropout_rate(self.model, drop) model, backprop_tok2vec = self.model.begin_update(docs) states_d_scores, backprops, beams = _beam_utils.update_beam( - self.moves, self.cfg["nr_feature_tokens"], 10000, states, golds, + self.moves, self.model.lower.get_dim("nF"), 10000, states, golds, model.state2vec, model.vec2scores, width, losses=losses, beam_density=beam_density) for i, d_scores in enumerate(states_d_scores): @@ -562,8 +478,6 @@ cdef class Parser: keyed by the parameter ID. The values are (weights, gradients) tuples. 
""" gradients = {} - if self.model in (None, True, False): - return gradients queue = [self.model] seen = set() for node in queue: @@ -647,45 +561,40 @@ cdef class Parser: def create_optimizer(self): return create_default_optimizer() - def begin_training(self, get_examples, pipeline=None, sgd=None, **cfg): - if 'model' in cfg: - self.model = cfg['model'] + def set_output(self, nO): + if self.model.upper.has_dim("nO") is None: + self.model.upper.set_dim("nO", nO) + + def begin_training(self, get_examples, pipeline=None, sgd=None, **kwargs): + self.cfg.update(kwargs) if not hasattr(get_examples, '__call__'): gold_tuples = get_examples get_examples = lambda: gold_tuples - cfg.setdefault('min_action_freq', 30) actions = self.moves.get_actions(gold_parses=get_examples(), - min_freq=cfg.get('min_action_freq', 30), - learn_tokens=self.cfg.get("learn_tokens", False)) + min_freq=self.cfg['min_action_freq'], + learn_tokens=self.cfg["learn_tokens"]) for action, labels in self.moves.labels.items(): actions.setdefault(action, {}) for label, freq in labels.items(): if label not in actions[action]: actions[action][label] = freq self.moves.initialize_actions(actions) - cfg.setdefault('token_vector_width', 96) - if self.model is True: - self.model, cfg = self.Model(self.moves.n_moves, **cfg) - if sgd is None: - sgd = self.create_optimizer() - doc_sample = [] - gold_sample = [] - for example in islice(get_examples(), 1000): - parses = example.get_gold_parses(merge=False, vocab=self.vocab) - for doc, gold in parses: - doc_sample.append(doc) - gold_sample.append(gold) - self.model.initialize(doc_sample, gold_sample) - if pipeline is not None: - self.init_multitask_objectives(get_examples, pipeline, sgd=sgd, **cfg) - link_vectors_to_models(self.vocab) - else: - if sgd is None: - sgd = self.create_optimizer() - if self.model.upper.has_dim("nO") is None: - self.model.upper.set_dim("nO", self.moves.n_moves) - self.model.initialize() - self.cfg.update(cfg) + # make sure we resize so we 
have an appropriate upper layer + self._resize() + if sgd is None: + sgd = self.create_optimizer() + doc_sample = [] + gold_sample = [] + for example in islice(get_examples(), 1000): + parses = example.get_gold_parses(merge=False, vocab=self.vocab) + for doc, gold in parses: + doc_sample.append(doc) + gold_sample.append(gold) + + self.model.initialize(doc_sample, gold_sample) + if pipeline is not None: + self.init_multitask_objectives(get_examples, pipeline, sgd=sgd, **self.cfg) + link_vectors_to_models(self.vocab) return sgd def _get_doc(self, example): @@ -709,28 +618,24 @@ cdef class Parser: 'vocab': lambda p: self.vocab.from_disk(p), 'moves': lambda p: self.moves.from_disk(p, exclude=["strings"]), 'cfg': lambda p: self.cfg.update(srsly.read_json(p)), - 'model': lambda p: None + 'model': lambda p: None, } exclude = util.get_serialization_exclude(deserializers, exclude, kwargs) util.from_disk(path, deserializers, exclude) if 'model' not in exclude: path = util.ensure_path(path) - if self.model is True: - self.model, cfg = self.Model(**self.cfg) - else: - cfg = {} with (path / 'model').open('rb') as file_: bytes_data = file_.read() try: + self._resize() self.model.from_bytes(bytes_data) except AttributeError: raise ValueError(Errors.E149) - self.cfg.update(cfg) return self def to_bytes(self, exclude=tuple(), **kwargs): serializers = { - "model": lambda: (self.model.to_bytes() if self.model is not True else True), + "model": lambda: (self.model.to_bytes()), "vocab": lambda: self.vocab.to_bytes(), "moves": lambda: self.moves.to_bytes(exclude=["strings"]), "cfg": lambda: srsly.json_dumps(self.cfg, indent=2, sort_keys=True) @@ -743,22 +648,14 @@ cdef class Parser: "vocab": lambda b: self.vocab.from_bytes(b), "moves": lambda b: self.moves.from_bytes(b, exclude=["strings"]), "cfg": lambda b: self.cfg.update(srsly.json_loads(b)), - "model": lambda b: None + "model": lambda b: None, } exclude = util.get_serialization_exclude(deserializers, exclude, kwargs) msg = 
util.from_bytes(bytes_data, deserializers, exclude) if 'model' not in exclude: - # TODO: Remove this once we don't have to handle previous models - if self.cfg.get('pretrained_dims') and 'pretrained_vectors' not in self.cfg: - self.cfg['pretrained_vectors'] = self.vocab.vectors - if self.model is True: - self.model, cfg = self.Model(**self.cfg) - else: - cfg = {} if 'model' in msg: try: self.model.from_bytes(msg['model']) except AttributeError: raise ValueError(Errors.E149) - self.cfg.update(cfg) return self diff --git a/spacy/tests/doc/test_add_entities.py b/spacy/tests/doc/test_add_entities.py index 766dcb739..3a466b24c 100644 --- a/spacy/tests/doc/test_add_entities.py +++ b/spacy/tests/doc/test_add_entities.py @@ -3,12 +3,13 @@ from spacy.tokens import Span import pytest from ..util import get_doc +from ...ml.models.defaults import default_ner def test_doc_add_entities_set_ents_iob(en_vocab): text = ["This", "is", "a", "lion"] doc = get_doc(en_vocab, text) - ner = EntityRecognizer(en_vocab) + ner = EntityRecognizer(en_vocab, default_ner()) ner.begin_training([]) ner(doc) assert len(list(doc.ents)) == 0 @@ -24,7 +25,7 @@ def test_doc_add_entities_set_ents_iob(en_vocab): def test_ents_reset(en_vocab): text = ["This", "is", "a", "lion"] doc = get_doc(en_vocab, text) - ner = EntityRecognizer(en_vocab) + ner = EntityRecognizer(en_vocab, default_ner()) ner.begin_training([]) ner(doc) assert [t.ent_iob_ for t in doc] == (["O"] * len(doc)) diff --git a/spacy/tests/parser/test_add_label.py b/spacy/tests/parser/test_add_label.py index fe847a6ae..5af772ddc 100644 --- a/spacy/tests/parser/test_add_label.py +++ b/spacy/tests/parser/test_add_label.py @@ -3,6 +3,8 @@ from thinc.api import Adam, NumpyOps from spacy.attrs import NORM from spacy.gold import GoldParse from spacy.vocab import Vocab + +from spacy.ml.models.defaults import default_parser, default_ner from spacy.tokens import Doc from spacy.pipeline import DependencyParser, EntityRecognizer from spacy.util import 
fix_random_seed @@ -15,7 +17,7 @@ def vocab(): @pytest.fixture def parser(vocab): - parser = DependencyParser(vocab) + parser = DependencyParser(vocab, default_parser()) return parser @@ -55,27 +57,31 @@ def test_add_label(parser): def test_add_label_deserializes_correctly(): - ner1 = EntityRecognizer(Vocab()) + ner1 = EntityRecognizer(Vocab(), default_ner()) ner1.add_label("C") ner1.add_label("B") ner1.add_label("A") ner1.begin_training([]) - ner2 = EntityRecognizer(Vocab()).from_bytes(ner1.to_bytes()) + ner2 = EntityRecognizer(Vocab(), default_ner()) + + # the second model needs to be resized before we can call from_bytes + ner2.model.resize_output(ner1.moves.n_moves) + ner2.from_bytes(ner1.to_bytes()) assert ner1.moves.n_moves == ner2.moves.n_moves for i in range(ner1.moves.n_moves): assert ner1.moves.get_class_name(i) == ner2.moves.get_class_name(i) @pytest.mark.parametrize( - "pipe_cls,n_moves", [(DependencyParser, 5), (EntityRecognizer, 4)] + "pipe_cls,n_moves,model", [(DependencyParser, 5, default_parser()), (EntityRecognizer, 4, default_ner())] ) -def test_add_label_get_label(pipe_cls, n_moves): +def test_add_label_get_label(pipe_cls, n_moves, model): """Test that added labels are returned correctly. This test was added to test for a bug in DependencyParser.labels that'd cause it to fail when splitting the move names. 
""" labels = ["A", "B", "C"] - pipe = pipe_cls(Vocab()) + pipe = pipe_cls(Vocab(), model) for label in labels: pipe.add_label(label) assert len(pipe.move_names) == len(labels) * n_moves diff --git a/spacy/tests/parser/test_arc_eager_oracle.py b/spacy/tests/parser/test_arc_eager_oracle.py index dd593f7d3..2426805d2 100644 --- a/spacy/tests/parser/test_arc_eager_oracle.py +++ b/spacy/tests/parser/test_arc_eager_oracle.py @@ -1,5 +1,7 @@ import pytest from spacy.vocab import Vocab + +from spacy.ml.models.defaults import default_parser from spacy.pipeline import DependencyParser from spacy.tokens import Doc from spacy.gold import GoldParse @@ -136,7 +138,7 @@ def test_get_oracle_actions(): deps.append(dep) ents.append(ent) doc = Doc(Vocab(), words=[t[1] for t in annot_tuples]) - parser = DependencyParser(doc.vocab) + parser = DependencyParser(doc.vocab, default_parser()) parser.moves.add_action(0, "") parser.moves.add_action(1, "") parser.moves.add_action(1, "") diff --git a/spacy/tests/parser/test_ner.py b/spacy/tests/parser/test_ner.py index 9a4d21a8d..3fde75eb5 100644 --- a/spacy/tests/parser/test_ner.py +++ b/spacy/tests/parser/test_ner.py @@ -1,10 +1,15 @@ import pytest + +from spacy import util from spacy.lang.en import English +from spacy.ml.models.defaults import default_ner from spacy.pipeline import EntityRecognizer, EntityRuler from spacy.vocab import Vocab from spacy.syntax.ner import BiluoPushDown from spacy.gold import GoldParse + +from spacy.tests.util import make_tempdir from spacy.tokens import Doc TRAIN_DATA = [ @@ -134,7 +139,7 @@ def test_accept_blocked_token(): # 1. test normal behaviour nlp1 = English() doc1 = nlp1("I live in New York") - ner1 = EntityRecognizer(doc1.vocab) + ner1 = EntityRecognizer(doc1.vocab, default_ner()) assert [token.ent_iob_ for token in doc1] == ["", "", "", "", ""] assert [token.ent_type_ for token in doc1] == ["", "", "", "", ""] @@ -152,7 +157,7 @@ def test_accept_blocked_token(): # 2. 
test blocking behaviour nlp2 = English() doc2 = nlp2("I live in New York") - ner2 = EntityRecognizer(doc2.vocab) + ner2 = EntityRecognizer(doc2.vocab, default_ner()) # set "New York" to a blocked entity doc2.ents = [(0, 3, 5)] @@ -188,7 +193,7 @@ def test_overwrite_token(): assert [token.ent_type_ for token in doc] == ["", "", "", "", ""] # Check that a new ner can overwrite O - ner2 = EntityRecognizer(doc.vocab) + ner2 = EntityRecognizer(doc.vocab, default_ner()) ner2.moves.add_action(5, "") ner2.add_label("GPE") state = ner2.moves.init_batch([doc])[0] @@ -199,6 +204,17 @@ def test_overwrite_token(): assert ner2.moves.is_valid(state, "L-GPE") +def test_empty_ner(): + nlp = English() + ner = nlp.create_pipe("ner") + ner.add_label("MY_LABEL") + nlp.add_pipe(ner) + nlp.begin_training() + doc = nlp("John is watching the news about Croatia's elections") + # if this goes wrong, the initialization of the parser's upper layer is probably broken + assert [token.ent_iob_ for token in doc] == ['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O'] + + def test_ruler_before_ner(): """ Test that an NER works after an entity_ruler: the second can add annotations """ nlp = English() @@ -214,7 +230,6 @@ def test_ruler_before_ner(): untrained_ner.add_label("MY_LABEL") nlp.add_pipe(untrained_ner) nlp.begin_training() - doc = nlp("This is Antti Korhonen speaking in Finland") expected_iobs = ["B", "O", "O", "O", "O", "O", "O"] expected_types = ["THING", "", "", "", "", "", ""] @@ -261,28 +276,7 @@ def test_block_ner(): assert [token.ent_type_ for token in doc] == expected_types -def test_change_number_features(): - # Test the default number features - nlp = English() - ner = nlp.create_pipe("ner") - nlp.add_pipe(ner) - ner.add_label("PERSON") - nlp.begin_training() - assert ner.model.lower.get_dim("nF") == ner.nr_feature - # Test we can change it - nlp = English() - ner = nlp.create_pipe("ner") - nlp.add_pipe(ner) - ner.add_label("PERSON") - nlp.begin_training( - component_cfg={"ner": 
{"nr_feature_tokens": 3, "token_vector_width": 128}} - ) - assert ner.model.lower.get_dim("nF") == 3 - # Test the model runs - nlp("hello world") - - -def test_overfitting(): +def test_overfitting_IO(): # Simple test to try and quickly overfit the NER component - ensuring the ML models work correctly nlp = English() ner = nlp.create_pipe("ner") @@ -301,11 +295,20 @@ def test_overfitting(): test_text = "I like London." doc = nlp(test_text) ents = doc.ents - assert len(ents) == 1 assert ents[0].text == "London" assert ents[0].label_ == "LOC" + # Also test the results are still the same after IO + with make_tempdir() as tmp_dir: + nlp.to_disk(tmp_dir) + nlp2 = util.load_model_from_path(tmp_dir) + doc2 = nlp2(test_text) + ents2 = doc2.ents + assert len(ents2) == 1 + assert ents2[0].text == "London" + assert ents2[0].label_ == "LOC" + class BlockerComponent1(object): name = "my_blocker" diff --git a/spacy/tests/parser/test_neural_parser.py b/spacy/tests/parser/test_neural_parser.py index 2470982d3..984af4d6b 100644 --- a/spacy/tests/parser/test_neural_parser.py +++ b/spacy/tests/parser/test_neural_parser.py @@ -1,8 +1,9 @@ import pytest -from spacy.ml.component_models import Tok2Vec +from spacy.ml.models.defaults import default_parser, default_tok2vec from spacy.vocab import Vocab from spacy.syntax.arc_eager import ArcEager from spacy.syntax.nn_parser import Parser +from spacy.syntax._parser_model import ParserModel from spacy.tokens.doc import Doc from spacy.gold import GoldParse @@ -20,19 +21,22 @@ def arc_eager(vocab): @pytest.fixture def tok2vec(): - tok2vec = Tok2Vec(8, 100) + tok2vec = default_tok2vec() tok2vec.initialize() return tok2vec @pytest.fixture def parser(vocab, arc_eager): - return Parser(vocab, moves=arc_eager, model=None) + return Parser(vocab, model=default_parser(), moves=arc_eager) @pytest.fixture -def model(arc_eager, tok2vec): - return Parser.Model(arc_eager.n_moves, token_vector_width=tok2vec.get_dim("nO"))[0] +def model(arc_eager, tok2vec, 
vocab): + model = default_parser() + model.resize_output(arc_eager.n_moves) + model.initialize() + return model @pytest.fixture @@ -46,11 +50,11 @@ def gold(doc): def test_can_init_nn_parser(parser): - assert parser.model is None + assert isinstance(parser.model, ParserModel) -def test_build_model(parser): - parser.model = Parser.Model(parser.moves.n_moves, hist_size=0)[0] +def test_build_model(parser, vocab): + parser.model = Parser(vocab, model=default_parser(), moves=parser.moves).model assert parser.model is not None diff --git a/spacy/tests/parser/test_nn_beam.py b/spacy/tests/parser/test_nn_beam.py index 24997e47c..619e0cc0b 100644 --- a/spacy/tests/parser/test_nn_beam.py +++ b/spacy/tests/parser/test_nn_beam.py @@ -2,6 +2,7 @@ import pytest import numpy from spacy.vocab import Vocab from spacy.language import Language +from spacy.ml.models.defaults import default_parser from spacy.pipeline import DependencyParser from spacy.syntax.arc_eager import ArcEager from spacy.tokens import Doc @@ -93,7 +94,7 @@ def test_beam_advance_too_few_scores(beam, scores): def test_beam_parse(): nlp = Language() - nlp.add_pipe(DependencyParser(nlp.vocab), name="parser") + nlp.add_pipe(DependencyParser(nlp.vocab, default_parser()), name="parser") nlp.parser.add_label("nsubj") nlp.parser.begin_training([], token_vector_width=8, hidden_width=8) doc = nlp.make_doc("Australia is a country") diff --git a/spacy/tests/parser/test_parse.py b/spacy/tests/parser/test_parse.py index 1d3f522c9..6e13d3044 100644 --- a/spacy/tests/parser/test_parse.py +++ b/spacy/tests/parser/test_parse.py @@ -1,7 +1,8 @@ import pytest from spacy.lang.en import English -from ..util import get_doc, apply_transition_sequence +from ..util import get_doc, apply_transition_sequence, make_tempdir +from ... 
import util TRAIN_DATA = [ ( @@ -182,7 +183,7 @@ def test_parser_set_sent_starts(en_vocab): assert token.head in sent -def test_overfitting(): +def test_overfitting_IO(): # Simple test to try and quickly overfit the dependency parser - ensuring the ML models work correctly nlp = English() parser = nlp.create_pipe("parser") @@ -200,7 +201,15 @@ def test_overfitting(): # test the trained model test_text = "I like securities." doc = nlp(test_text) - assert doc[0].dep_ is "nsubj" assert doc[2].dep_ is "dobj" assert doc[3].dep_ is "punct" + + # Also test the results are still the same after IO + with make_tempdir() as tmp_dir: + nlp.to_disk(tmp_dir) + nlp2 = util.load_model_from_path(tmp_dir) + doc2 = nlp2(test_text) + assert doc2[0].dep_ is "nsubj" + assert doc2[2].dep_ is "dobj" + assert doc2[3].dep_ is "punct" diff --git a/spacy/tests/parser/test_preset_sbd.py b/spacy/tests/parser/test_preset_sbd.py index c6c1240a8..af777aa6b 100644 --- a/spacy/tests/parser/test_preset_sbd.py +++ b/spacy/tests/parser/test_preset_sbd.py @@ -3,6 +3,8 @@ from thinc.api import Adam from spacy.attrs import NORM from spacy.gold import GoldParse from spacy.vocab import Vocab + +from spacy.ml.models.defaults import default_parser from spacy.tokens import Doc from spacy.pipeline import DependencyParser @@ -14,7 +16,7 @@ def vocab(): @pytest.fixture def parser(vocab): - parser = DependencyParser(vocab) + parser = DependencyParser(vocab, default_parser()) parser.cfg["token_vector_width"] = 4 parser.cfg["hidden_width"] = 32 # parser.add_label('right') diff --git a/spacy/tests/pipeline/test_analysis.py b/spacy/tests/pipeline/test_analysis.py index 5c246538c..cda39f6ee 100644 --- a/spacy/tests/pipeline/test_analysis.py +++ b/spacy/tests/pipeline/test_analysis.py @@ -111,7 +111,8 @@ def test_component_factories_from_nlp(): nlp.add_pipe(pipe) assert nlp("hello world") # The first argument here is the class itself, so we're accepting any here - mock.assert_called_once_with(ANY, nlp, foo="bar") + # 
The model will be initialized to None by the factory + mock.assert_called_once_with(ANY, nlp, None, foo="bar") def test_analysis_validate_attrs_valid(): diff --git a/spacy/tests/pipeline/test_tagger.py b/spacy/tests/pipeline/test_tagger.py index 366cd4f1a..a90207a78 100644 --- a/spacy/tests/pipeline/test_tagger.py +++ b/spacy/tests/pipeline/test_tagger.py @@ -1,5 +1,9 @@ import pytest + +from spacy import util +from spacy.lang.en import English from spacy.language import Language +from spacy.tests.util import make_tempdir def test_label_types(): @@ -18,9 +22,9 @@ TRAIN_DATA = [ ] -def test_overfitting(): +def test_overfitting_IO(): # Simple test to try and quickly overfit the tagger - ensuring the ML models work correctly - nlp = Language() + nlp = English() tagger = nlp.create_pipe("tagger") for tag, values in TAG_MAP.items(): tagger.add_label(tag, values) @@ -35,8 +39,17 @@ def test_overfitting(): # test the trained model test_text = "I like blue eggs" doc = nlp(test_text) - assert doc[0].tag_ is "N" assert doc[1].tag_ is "V" assert doc[2].tag_ is "J" assert doc[3].tag_ is "N" + + # Also test the results are still the same after IO + with make_tempdir() as tmp_dir: + nlp.to_disk(tmp_dir) + nlp2 = util.load_model_from_path(tmp_dir) + doc2 = nlp2(test_text) + assert doc2[0].tag_ is "N" + assert doc2[1].tag_ is "V" + assert doc2[2].tag_ is "J" + assert doc2[3].tag_ is "N" diff --git a/spacy/tests/pipeline/test_textcat.py b/spacy/tests/pipeline/test_textcat.py index 558d09e40..1b5ca9a4c 100644 --- a/spacy/tests/pipeline/test_textcat.py +++ b/spacy/tests/pipeline/test_textcat.py @@ -1,8 +1,12 @@ import pytest import random import numpy.random + +from spacy import util +from spacy.lang.en import English from spacy.language import Language from spacy.pipeline import TextCategorizer +from spacy.tests.util import make_tempdir from spacy.tokens import Doc from spacy.gold import GoldParse @@ -74,9 +78,9 @@ def test_label_types(): nlp.get_pipe("textcat").add_label(9) -def 
test_overfitting(): +def test_overfitting_IO(): # Simple test to try and quickly overfit the textcat component - ensuring the ML models work correctly - nlp = Language() + nlp = English() textcat = nlp.create_pipe("textcat") for _, annotations in TRAIN_DATA: for label, value in annotations.get("cats").items(): @@ -87,11 +91,21 @@ def test_overfitting(): for i in range(50): losses = {} nlp.update(TRAIN_DATA, sgd=optimizer, losses=losses) - assert losses["textcat"] < 0.00001 + assert losses["textcat"] < 0.01 # test the trained model test_text = "I am happy." doc = nlp(test_text) cats = doc.cats + # note that by default, exclusive_classes = false so we need a bigger error margin assert cats["POSITIVE"] > 0.9 - assert cats["POSITIVE"] + cats["NEGATIVE"] == pytest.approx(1.0, 0.001) + assert cats["POSITIVE"] + cats["NEGATIVE"] == pytest.approx(1.0, 0.1) + + # Also test the results are still the same after IO + with make_tempdir() as tmp_dir: + nlp.to_disk(tmp_dir) + nlp2 = util.load_model_from_path(tmp_dir) + doc2 = nlp2(test_text) + cats2 = doc2.cats + assert cats2["POSITIVE"] > 0.9 + assert cats2["POSITIVE"] + cats2["NEGATIVE"] == pytest.approx(1.0, 0.1) diff --git a/spacy/tests/regression/test_issue1501-2000.py b/spacy/tests/regression/test_issue1501-2000.py index 2bfdbd7c3..ff8c7c2fe 100644 --- a/spacy/tests/regression/test_issue1501-2000.py +++ b/spacy/tests/regression/test_issue1501-2000.py @@ -10,6 +10,7 @@ from spacy.lang.lex_attrs import is_stop from spacy.vectors import Vectors from spacy.vocab import Vocab from spacy.language import Language +from spacy.ml.models.defaults import default_ner, default_tagger from spacy.tokens import Doc, Span, Token from spacy.pipeline import Tagger, EntityRecognizer from spacy.attrs import HEAD, DEP @@ -123,7 +124,7 @@ def test_issue1727(): correctly after vectors are added.""" data = numpy.ones((3, 300), dtype="f") vectors = Vectors(data=data, keys=["I", "am", "Matt"]) - tagger = Tagger(Vocab()) + tagger = Tagger(Vocab(), 
default_tagger()) tagger.add_label("PRP") with pytest.warns(UserWarning): tagger.begin_training() @@ -131,7 +132,7 @@ def test_issue1727(): tagger.vocab.vectors = vectors with make_tempdir() as path: tagger.to_disk(path) - tagger = Tagger(Vocab()).from_disk(path) + tagger = Tagger(Vocab(), default_tagger()).from_disk(path) assert tagger.cfg.get("pretrained_dims", 0) == 0 @@ -236,6 +237,7 @@ def test_issue1889(word): assert is_stop(word, STOP_WORDS) == is_stop(word.upper(), STOP_WORDS) +@pytest.mark.skip(reason="This test has become obsolete with the config refactor of v.3") def test_issue1915(): cfg = {"hidden_depth": 2} # should error out nlp = Language() @@ -268,7 +270,7 @@ def test_issue1963(en_tokenizer): @pytest.mark.parametrize("label", ["U-JOB-NAME"]) def test_issue1967(label): - ner = EntityRecognizer(Vocab()) + ner = EntityRecognizer(Vocab(), default_ner()) example = Example(doc=None) example.set_token_annotation( ids=[0], words=["word"], tags=["tag"], heads=[0], deps=["dep"], entities=[label] diff --git a/spacy/tests/regression/test_issue2001-2500.py b/spacy/tests/regression/test_issue2001-2500.py index 2c25b6d73..1786677e0 100644 --- a/spacy/tests/regression/test_issue2001-2500.py +++ b/spacy/tests/regression/test_issue2001-2500.py @@ -32,6 +32,9 @@ def test_issue2179(): nlp.begin_training() nlp2 = Italian() nlp2.add_pipe(nlp2.create_pipe("ner")) + + assert len(nlp2.get_pipe("ner").labels) == 0 + nlp2.get_pipe("ner").model.resize_output(nlp.get_pipe("ner").moves.n_moves) nlp2.from_bytes(nlp.to_bytes()) assert "extra_labels" not in nlp2.get_pipe("ner").cfg assert nlp2.get_pipe("ner").labels == ("CITIZENSHIP",) diff --git a/spacy/tests/regression/test_issue3001-3500.py b/spacy/tests/regression/test_issue3001-3500.py index cc893e472..df23efa4f 100644 --- a/spacy/tests/regression/test_issue3001-3500.py +++ b/spacy/tests/regression/test_issue3001-3500.py @@ -1,6 +1,7 @@ import pytest from spacy.lang.en import English from spacy.lang.de import German +from 
spacy.ml.models.defaults import default_ner from spacy.pipeline import EntityRuler, EntityRecognizer from spacy.matcher import Matcher, PhraseMatcher from spacy.tokens import Doc @@ -103,6 +104,7 @@ def test_issue3209(): assert ner.move_names == move_names nlp2 = English() nlp2.add_pipe(nlp2.create_pipe("ner")) + nlp2.get_pipe("ner").model.resize_output(ner.moves.n_moves) nlp2.from_bytes(nlp.to_bytes()) assert nlp2.get_pipe("ner").move_names == move_names @@ -193,7 +195,7 @@ def test_issue3345(): doc = Doc(nlp.vocab, words=["I", "live", "in", "New", "York"]) doc[4].is_sent_start = True ruler = EntityRuler(nlp, patterns=[{"label": "GPE", "pattern": "New York"}]) - ner = EntityRecognizer(doc.vocab) + ner = EntityRecognizer(doc.vocab, default_ner()) # Add the OUT action. I wouldn't have thought this would be necessary... ner.moves.add_action(5, "") ner.add_label("GPE") diff --git a/spacy/tests/regression/test_issue3830.py b/spacy/tests/regression/test_issue3830.py index 54ce10924..9752f70df 100644 --- a/spacy/tests/regression/test_issue3830.py +++ b/spacy/tests/regression/test_issue3830.py @@ -1,10 +1,12 @@ from spacy.pipeline.pipes import DependencyParser from spacy.vocab import Vocab +from spacy.ml.models.defaults import default_parser + def test_issue3830_no_subtok(): """Test that the parser doesn't have subtok label if not learn_tokens""" - parser = DependencyParser(Vocab()) + parser = DependencyParser(Vocab(), default_parser()) parser.add_label("nsubj") assert "subtok" not in parser.labels parser.begin_training(lambda: []) @@ -13,7 +15,7 @@ def test_issue3830_no_subtok(): def test_issue3830_with_subtok(): """Test that the parser does have subtok label if learn_tokens=True.""" - parser = DependencyParser(Vocab(), learn_tokens=True) + parser = DependencyParser(Vocab(), default_parser(), learn_tokens=True) parser.add_label("nsubj") assert "subtok" not in parser.labels parser.begin_training(lambda: []) diff --git a/spacy/tests/regression/test_issue4042.py 
b/spacy/tests/regression/test_issue4042.py index 6644a8eda..75a1c23b7 100644 --- a/spacy/tests/regression/test_issue4042.py +++ b/spacy/tests/regression/test_issue4042.py @@ -3,6 +3,7 @@ from spacy.pipeline import EntityRecognizer, EntityRuler from spacy.lang.en import English from spacy.tokens import Span from spacy.util import ensure_path +from spacy.ml.models.defaults import default_ner from ..util import make_tempdir @@ -73,6 +74,6 @@ def test_issue4042_bug2(): output_dir.mkdir() ner1.to_disk(output_dir) - ner2 = EntityRecognizer(vocab) + ner2 = EntityRecognizer(vocab, default_ner()) ner2.from_disk(output_dir) assert len(ner2.labels) == 2 diff --git a/spacy/tests/regression/test_issue4313.py b/spacy/tests/regression/test_issue4313.py index a3f6f69df..30688601f 100644 --- a/spacy/tests/regression/test_issue4313.py +++ b/spacy/tests/regression/test_issue4313.py @@ -1,5 +1,6 @@ from collections import defaultdict +from spacy.ml.models.defaults import default_ner from spacy.pipeline import EntityRecognizer from spacy.lang.en import English @@ -11,7 +12,7 @@ def test_issue4313(): beam_width = 16 beam_density = 0.0001 nlp = English() - ner = EntityRecognizer(nlp.vocab) + ner = EntityRecognizer(nlp.vocab, default_ner()) ner.add_label("SOME_LABEL") ner.begin_training([]) nlp.add_pipe(ner) diff --git a/spacy/tests/serialize/test_serialize_config.py b/spacy/tests/serialize/test_serialize_config.py new file mode 100644 index 000000000..c34d01547 --- /dev/null +++ b/spacy/tests/serialize/test_serialize_config.py @@ -0,0 +1,126 @@ +from thinc.api import Config + +import spacy +from spacy import util +from spacy.lang.en import English +from spacy.util import registry + +from ..util import make_tempdir +from ...ml.models import build_Tok2Vec_model, build_tb_parser_model + +nlp_config_string = """ +[nlp] +lang = "en" + +[nlp.pipeline.tok2vec] +factory = "tok2vec" + +[nlp.pipeline.tok2vec.model] +@architectures = "spacy.HashEmbedCNN.v1" +pretrained_vectors = null +width = 342 
+depth = 4 +window_size = 1 +embed_size = 2000 +maxout_pieces = 3 +subword_features = true + +[nlp.pipeline.tagger] +factory = "tagger" + +[nlp.pipeline.tagger.model] +@architectures = "spacy.Tagger.v1" + +[nlp.pipeline.tagger.model.tok2vec] +@architectures = "spacy.Tok2VecTensors.v1" +width = ${nlp.pipeline.tok2vec.model:width} +""" + + +parser_config_string = """ +[model] +@architectures = "spacy.TransitionBasedParser.v1" +nr_feature_tokens = 99 +hidden_width = 66 +maxout_pieces = 2 + +[model.tok2vec] +@architectures = "spacy.HashEmbedCNN.v1" +pretrained_vectors = null +width = 333 +depth = 4 +embed_size = 5555 +window_size = 1 +maxout_pieces = 7 +subword_features = false +""" + + +@registry.architectures.register("my_test_parser") +def my_parser(): + tok2vec = build_Tok2Vec_model(width=321, embed_size=5432, pretrained_vectors=None, window_size=3, + maxout_pieces=4, subword_features=True, char_embed=True, nM=64, nC=8, + conv_depth=2, bilstm_depth=0) + parser = build_tb_parser_model(tok2vec=tok2vec, nr_feature_tokens=7, hidden_width=65, maxout_pieces=5) + return parser + + +def test_serialize_nlp(): + """ Create a custom nlp pipeline from config and ensure it serializes it correctly """ + nlp_config = Config().from_str(nlp_config_string) + nlp = util.load_model_from_config(nlp_config["nlp"]) + nlp.begin_training() + assert "tok2vec" in nlp.pipe_names + assert "tagger" in nlp.pipe_names + assert "parser" not in nlp.pipe_names + assert nlp.get_pipe("tagger").model.get_ref("tok2vec").get_dim("nO") == 342 + + with make_tempdir() as d: + nlp.to_disk(d) + nlp2 = spacy.load(d) + assert "tok2vec" in nlp2.pipe_names + assert "tagger" in nlp2.pipe_names + assert "parser" not in nlp2.pipe_names + assert nlp2.get_pipe("tagger").model.get_ref("tok2vec").get_dim("nO") == 342 + + +def test_serialize_custom_nlp(): + """ Create a custom nlp pipeline and ensure it serializes it correctly""" + nlp = English() + parser_cfg = dict() + parser_cfg["model"] = {'@architectures': 
"my_test_parser"} + parser = nlp.create_pipe("parser", parser_cfg) + nlp.add_pipe(parser) + nlp.begin_training() + + with make_tempdir() as d: + nlp.to_disk(d) + nlp2 = spacy.load(d) + model = nlp2.get_pipe("parser").model + tok2vec = model.get_ref("tok2vec") + upper = model.upper + + # check that we have the correct settings, not the default ones + assert tok2vec.get_dim("nO") == 321 + assert upper.get_dim("nI") == 65 + + +def test_serialize_parser(): + """ Create a non-default parser config to check nlp serializes it correctly """ + nlp = English() + model_config = Config().from_str(parser_config_string) + parser = nlp.create_pipe("parser", config=model_config) + parser.add_label("nsubj") + nlp.add_pipe(parser) + nlp.begin_training() + + with make_tempdir() as d: + nlp.to_disk(d) + nlp2 = spacy.load(d) + model = nlp2.get_pipe("parser").model + tok2vec = model.get_ref("tok2vec") + upper = model.upper + + # check that we have the correct settings, not the default ones + assert upper.get_dim("nI") == 66 + assert tok2vec.get_dim("nO") == 333 diff --git a/spacy/tests/serialize/test_serialize_language.py b/spacy/tests/serialize/test_serialize_language.py index 4089a0d07..0e3b7c59f 100644 --- a/spacy/tests/serialize/test_serialize_language.py +++ b/spacy/tests/serialize/test_serialize_language.py @@ -1,5 +1,6 @@ import pytest import re + from spacy.language import Language from spacy.tokenizer import Tokenizer @@ -56,7 +57,7 @@ def test_serialize_language_exclude(meta_data): nlp = Language(meta=meta_data) assert nlp.meta["name"] == name new_nlp = Language().from_bytes(nlp.to_bytes()) - assert nlp.meta["name"] == name + assert new_nlp.meta["name"] == name new_nlp = Language().from_bytes(nlp.to_bytes(), exclude=["meta"]) assert not new_nlp.meta["name"] == name new_nlp = Language().from_bytes(nlp.to_bytes(exclude=["meta"])) diff --git a/spacy/tests/serialize/test_serialize_pipeline.py b/spacy/tests/serialize/test_serialize_pipeline.py index 0ad9bc4d4..fe14fba10 100644 --- 
a/spacy/tests/serialize/test_serialize_pipeline.py +++ b/spacy/tests/serialize/test_serialize_pipeline.py @@ -1,6 +1,7 @@ import pytest from spacy.pipeline import Tagger, DependencyParser, EntityRecognizer from spacy.pipeline import Tensorizer, TextCategorizer, SentenceRecognizer +from spacy.ml.models.defaults import default_parser, default_tensorizer, default_tagger, default_textcat, default_sentrec from ..util import make_tempdir @@ -10,58 +11,58 @@ test_parsers = [DependencyParser, EntityRecognizer] @pytest.fixture def parser(en_vocab): - parser = DependencyParser(en_vocab) + parser = DependencyParser(en_vocab, default_parser()) parser.add_label("nsubj") - parser.model, cfg = parser.Model(parser.moves.n_moves) - parser.cfg.update(cfg) return parser @pytest.fixture def blank_parser(en_vocab): - parser = DependencyParser(en_vocab) + parser = DependencyParser(en_vocab, default_parser()) return parser @pytest.fixture def taggers(en_vocab): - tagger1 = Tagger(en_vocab) - tagger2 = Tagger(en_vocab) - tagger1.model = tagger1.Model(8) - tagger2.model = tagger1.model - return (tagger1, tagger2) + model = default_tagger() + tagger1 = Tagger(en_vocab, model) + tagger2 = Tagger(en_vocab, model) + return tagger1, tagger2 @pytest.mark.parametrize("Parser", test_parsers) def test_serialize_parser_roundtrip_bytes(en_vocab, Parser): - parser = Parser(en_vocab) - parser.model, _ = parser.Model(10) - new_parser = Parser(en_vocab) - new_parser.model, _ = new_parser.Model(10) + parser = Parser(en_vocab, default_parser()) + new_parser = Parser(en_vocab, default_parser()) new_parser = new_parser.from_bytes(parser.to_bytes(exclude=["vocab"])) - assert new_parser.to_bytes(exclude=["vocab"]) == parser.to_bytes(exclude=["vocab"]) + bytes_2 = new_parser.to_bytes(exclude=["vocab"]) + bytes_3 = parser.to_bytes(exclude=["vocab"]) + assert len(bytes_2) == len(bytes_3) + assert bytes_2 == bytes_3 @pytest.mark.parametrize("Parser", test_parsers) def test_serialize_parser_roundtrip_disk(en_vocab, 
Parser): - parser = Parser(en_vocab) - parser.model, _ = parser.Model(0) + parser = Parser(en_vocab, default_parser()) with make_tempdir() as d: file_path = d / "parser" parser.to_disk(file_path) - parser_d = Parser(en_vocab) - parser_d.model, _ = parser_d.Model(0) + parser_d = Parser(en_vocab, default_parser()) parser_d = parser_d.from_disk(file_path) parser_bytes = parser.to_bytes(exclude=["model", "vocab"]) parser_d_bytes = parser_d.to_bytes(exclude=["model", "vocab"]) + assert len(parser_bytes) == len(parser_d_bytes) assert parser_bytes == parser_d_bytes def test_to_from_bytes(parser, blank_parser): assert parser.model is not True - assert blank_parser.model is True + assert blank_parser.model is not True assert blank_parser.moves.n_moves != parser.moves.n_moves bytes_data = parser.to_bytes(exclude=["vocab"]) + + # the blank parser needs to be resized before we can call from_bytes + blank_parser.model.resize_output(parser.moves.n_moves) blank_parser.from_bytes(bytes_data) assert blank_parser.model is not True assert blank_parser.moves.n_moves == parser.moves.n_moves @@ -75,8 +76,10 @@ def test_serialize_tagger_roundtrip_bytes(en_vocab, taggers): tagger1_b = tagger1.to_bytes() tagger1 = tagger1.from_bytes(tagger1_b) assert tagger1.to_bytes() == tagger1_b - new_tagger1 = Tagger(en_vocab).from_bytes(tagger1_b) - assert new_tagger1.to_bytes() == tagger1_b + new_tagger1 = Tagger(en_vocab, default_tagger()).from_bytes(tagger1_b) + new_tagger1_b = new_tagger1.to_bytes() + assert len(new_tagger1_b) == len(tagger1_b) + assert new_tagger1_b == tagger1_b def test_serialize_tagger_roundtrip_disk(en_vocab, taggers): @@ -86,26 +89,24 @@ def test_serialize_tagger_roundtrip_disk(en_vocab, taggers): file_path2 = d / "tagger2" tagger1.to_disk(file_path1) tagger2.to_disk(file_path2) - tagger1_d = Tagger(en_vocab).from_disk(file_path1) - tagger2_d = Tagger(en_vocab).from_disk(file_path2) + tagger1_d = Tagger(en_vocab, default_tagger()).from_disk(file_path1) + tagger2_d = 
Tagger(en_vocab, default_tagger()).from_disk(file_path2) assert tagger1_d.to_bytes() == tagger2_d.to_bytes() def test_serialize_tensorizer_roundtrip_bytes(en_vocab): - tensorizer = Tensorizer(en_vocab) - tensorizer.model = tensorizer.Model() + tensorizer = Tensorizer(en_vocab, default_tensorizer()) tensorizer_b = tensorizer.to_bytes(exclude=["vocab"]) - new_tensorizer = Tensorizer(en_vocab).from_bytes(tensorizer_b) + new_tensorizer = Tensorizer(en_vocab, default_tensorizer()).from_bytes(tensorizer_b) assert new_tensorizer.to_bytes(exclude=["vocab"]) == tensorizer_b def test_serialize_tensorizer_roundtrip_disk(en_vocab): - tensorizer = Tensorizer(en_vocab) - tensorizer.model = tensorizer.Model() + tensorizer = Tensorizer(en_vocab, default_tensorizer()) with make_tempdir() as d: file_path = d / "tensorizer" tensorizer.to_disk(file_path) - tensorizer_d = Tensorizer(en_vocab).from_disk(file_path) + tensorizer_d = Tensorizer(en_vocab, default_tensorizer()).from_disk(file_path) assert tensorizer.to_bytes(exclude=["vocab"]) == tensorizer_d.to_bytes( exclude=["vocab"] ) @@ -113,19 +114,17 @@ def test_serialize_tensorizer_roundtrip_disk(en_vocab): def test_serialize_textcat_empty(en_vocab): # See issue #1105 - textcat = TextCategorizer(en_vocab, labels=["ENTITY", "ACTION", "MODIFIER"]) + textcat = TextCategorizer(en_vocab, default_textcat(), labels=["ENTITY", "ACTION", "MODIFIER"]) textcat.to_bytes(exclude=["vocab"]) @pytest.mark.parametrize("Parser", test_parsers) def test_serialize_pipe_exclude(en_vocab, Parser): def get_new_parser(): - new_parser = Parser(en_vocab) - new_parser.model, _ = new_parser.Model(0) + new_parser = Parser(en_vocab, default_parser()) return new_parser - parser = Parser(en_vocab) - parser.model, _ = parser.Model(0) + parser = Parser(en_vocab, default_parser()) parser.cfg["foo"] = "bar" new_parser = get_new_parser().from_bytes(parser.to_bytes(exclude=["vocab"])) assert "foo" in new_parser.cfg @@ -144,7 +143,7 @@ def 
test_serialize_pipe_exclude(en_vocab, Parser): def test_serialize_sentencerecognizer(en_vocab): - sr = SentenceRecognizer(en_vocab) + sr = SentenceRecognizer(en_vocab, default_sentrec()) sr_b = sr.to_bytes() - sr_d = SentenceRecognizer(en_vocab).from_bytes(sr_b) + sr_d = SentenceRecognizer(en_vocab, default_sentrec()).from_bytes(sr_b) assert sr.to_bytes() == sr_d.to_bytes() diff --git a/spacy/tests/test_tok2vec.py b/spacy/tests/test_tok2vec.py index 2d10d79d4..310103d10 100644 --- a/spacy/tests/test_tok2vec.py +++ b/spacy/tests/test_tok2vec.py @@ -1,6 +1,6 @@ import pytest -from spacy.ml.component_models import Tok2Vec +from spacy.ml.models.tok2vec import build_Tok2Vec_model from spacy.vocab import Vocab from spacy.tokens import Doc @@ -25,7 +25,8 @@ def test_empty_doc(): embed_size = 2000 vocab = Vocab() doc = Doc(vocab, words=[]) - tok2vec = Tok2Vec(width, embed_size) + # TODO: fix tok2vec arguments + tok2vec = build_Tok2Vec_model(width, embed_size) vectors, backprop = tok2vec.begin_update([doc]) assert len(vectors) == 1 assert vectors[0].shape == (0, width) @@ -36,7 +37,19 @@ def test_empty_doc(): ) def test_tok2vec_batch_sizes(batch_size, width, embed_size): batch = get_batch(batch_size) - tok2vec = Tok2Vec(width, embed_size) + tok2vec = build_Tok2Vec_model( + width, + embed_size, + pretrained_vectors=None, + conv_depth=4, + bilstm_depth=0, + window_size=1, + maxout_pieces=3, + subword_features=True, + char_embed=False, + nM=64, + nC=8, + ) tok2vec.initialize() vectors, backprop = tok2vec.begin_update(batch) assert len(vectors) == len(batch) @@ -44,19 +57,24 @@ def test_tok2vec_batch_sizes(batch_size, width, embed_size): assert doc_vec.shape == (len(doc), width) +# fmt: off @pytest.mark.parametrize( "tok2vec_config", [ - {"width": 8, "embed_size": 100, "char_embed": False}, - {"width": 8, "embed_size": 100, "char_embed": True}, - {"width": 8, "embed_size": 100, "conv_depth": 6}, - {"width": 8, "embed_size": 100, "conv_depth": 6}, - {"width": 8, "embed_size": 
100, "subword_features": False}, + {"width": 8, "embed_size": 100, "char_embed": False, "nM": 64, "nC": 8, "pretrained_vectors": None, "window_size": 1, "conv_depth": 2, "bilstm_depth": 0, "maxout_pieces": 3, "subword_features": True}, + {"width": 8, "embed_size": 100, "char_embed": True, "nM": 64, "nC": 8, "pretrained_vectors": None, "window_size": 1, "conv_depth": 2, "bilstm_depth": 0, "maxout_pieces": 3, "subword_features": True}, + {"width": 8, "embed_size": 100, "char_embed": False, "nM": 64, "nC": 8, "pretrained_vectors": None, "window_size": 1, "conv_depth": 6, "bilstm_depth": 0, "maxout_pieces": 3, "subword_features": True}, + {"width": 8, "embed_size": 100, "char_embed": False, "nM": 64, "nC": 8, "pretrained_vectors": None, "window_size": 1, "conv_depth": 6, "bilstm_depth": 0, "maxout_pieces": 3, "subword_features": True}, + {"width": 8, "embed_size": 100, "char_embed": False, "nM": 64, "nC": 8, "pretrained_vectors": None, "window_size": 1, "conv_depth": 2, "bilstm_depth": 0, "maxout_pieces": 3, "subword_features": False}, + {"width": 8, "embed_size": 100, "char_embed": False, "nM": 64, "nC": 8, "pretrained_vectors": None, "window_size": 3, "conv_depth": 2, "bilstm_depth": 0, "maxout_pieces": 3, "subword_features": False}, + {"width": 8, "embed_size": 100, "char_embed": True, "nM": 81, "nC": 8, "pretrained_vectors": None, "window_size": 3, "conv_depth": 2, "bilstm_depth": 0, "maxout_pieces": 3, "subword_features": False}, + {"width": 8, "embed_size": 100, "char_embed": True, "nM": 81, "nC": 9, "pretrained_vectors": None, "window_size": 3, "conv_depth": 2, "bilstm_depth": 0, "maxout_pieces": 3, "subword_features": False}, ], ) +# fmt: on def test_tok2vec_configs(tok2vec_config): docs = get_batch(3) - tok2vec = Tok2Vec(**tok2vec_config) + tok2vec = build_Tok2Vec_model(**tok2vec_config) tok2vec.initialize() vectors, backprop = tok2vec.begin_update(docs) assert len(vectors) == len(docs) diff --git a/spacy/util.py b/spacy/util.py index 465b9645e..286a6574c 
100644 --- a/spacy/util.py +++ b/spacy/util.py @@ -6,8 +6,7 @@ from pathlib import Path import random from typing import List import thinc -import thinc.config -from thinc.api import NumpyOps, get_current_ops, Adam, require_gpu +from thinc.api import NumpyOps, get_current_ops, Adam, require_gpu, Config import functools import itertools import numpy.random @@ -146,6 +145,10 @@ def load_model_from_path(model_path, meta=False, **overrides): pipeline from meta.json and then calls from_disk() with path.""" if not meta: meta = get_model_meta(model_path) + nlp_config = get_model_config(model_path) + if nlp_config.get("nlp", None): + return load_model_from_config(nlp_config["nlp"]) + # Support language factories registered via entry points (e.g. custom # language subclass) while keeping top-level language identifier "lang" lang = meta.get("lang_factory", meta["lang"]) @@ -162,11 +165,30 @@ def load_model_from_path(model_path, meta=False, **overrides): if name not in disable: config = meta.get("pipeline_args", {}).get(name, {}) factory = factories.get(name, name) + if nlp_config.get(name, None): + model_config = nlp_config[name]["model"] + config["model"] = model_config component = nlp.create_pipe(factory, config=config) nlp.add_pipe(component, name=name) return nlp.from_disk(model_path, exclude=disable) +def load_model_from_config(nlp_config): + if "name" in nlp_config: + nlp = load_model(**nlp_config) + elif "lang" in nlp_config: + lang_class = get_lang_class(nlp_config["lang"]) + nlp = lang_class() + else: + raise ValueError(Errors.E993) + if "pipeline" in nlp_config: + for name, component_cfg in nlp_config["pipeline"].items(): + factory = component_cfg.pop("factory") + component = nlp.create_pipe(factory, config=component_cfg) + nlp.add_pipe(component, name=name) + return nlp + + def load_model_from_init_py(init_file, **overrides): """Helper function to use in the `load()` method of a model package's __init__.py. 
@@ -184,7 +206,7 @@ def load_model_from_init_py(init_file, **overrides): return load_model_from_path(data_path, meta, **overrides) -def load_from_config(path, create_objects=False): +def load_config(path, create_objects=False): """Load a Thinc-formatted config file, optionally filling in objects where the config references registry entries. See "Thinc config files" for details. @@ -212,7 +234,7 @@ def get_model_meta(path): raise IOError(Errors.E052.format(path=model_path)) meta_path = model_path / "meta.json" if not meta_path.is_file(): - raise IOError(Errors.E053.format(path=meta_path)) + raise IOError(Errors.E053.format(path=meta_path, name="meta.json")) meta = srsly.read_json(meta_path) for setting in ["lang", "name", "version"]: if setting not in meta or not meta[setting]: @@ -220,6 +242,23 @@ def get_model_meta(path): return meta +def get_model_config(path): + """Get the model's config from a directory path. + + path (unicode or Path): Path to model directory. + RETURNS (Config): The model's config data. + """ + model_path = ensure_path(path) + if not model_path.exists(): + raise IOError(Errors.E052.format(path=model_path)) + config_path = model_path / "config.cfg" + # model directories are allowed not to have config files ? + if not config_path.is_file(): + return Config({}) + # raise IOError(Errors.E053.format(path=config_path, name="config.cfg")) + return Config().from_disk(config_path) + + def is_package(name): """Check if string maps to a package installed via pip.