From e0f9f448f1305e382c5e7042d8bbac882fea9644 Mon Sep 17 00:00:00 2001
From: svlandeg
Date: Mon, 1 Jun 2020 23:38:48 +0200
Subject: [PATCH] remove Tensorizer

---
 examples/training/pretrain_textcat.py         | 212 ------------------
 spacy/language.py                             |   4 -
 spacy/ml/models/__init__.py                   |   1 -
 spacy/ml/models/tensorizer.py                 |  10 -
 spacy/pipeline/__init__.py                    |   3 +-
 spacy/pipeline/defaults/__init__.py           |  10 -
 .../pipeline/defaults/tensorizer_defaults.cfg |   4 -
 spacy/pipeline/hooks.py                       |   6 +-
 spacy/pipeline/pipes.pyx                      | 136 +----------
 .../serialize/test_serialize_pipeline.py      |  22 +-
 10 files changed, 8 insertions(+), 400 deletions(-)
 delete mode 100644 examples/training/pretrain_textcat.py
 delete mode 100644 spacy/ml/models/tensorizer.py
 delete mode 100644 spacy/pipeline/defaults/tensorizer_defaults.cfg

diff --git a/examples/training/pretrain_textcat.py b/examples/training/pretrain_textcat.py
deleted file mode 100644
index 5c41c0e92..000000000
--- a/examples/training/pretrain_textcat.py
+++ /dev/null
@@ -1,212 +0,0 @@
-"""This script is experimental.
-
-Try pre-training the CNN component of the text categorizer using a cheap
-language modelling-like objective. Specifically, we load pretrained vectors
-(from something like word2vec, GloVe, FastText etc), and use the CNN to
-predict the tokens' pretrained vectors. This isn't as easy as it sounds:
-we're not merely doing compression here, because heavy dropout is applied,
-including over the input words. This means the model must often (50% of the time)
-use the context in order to predict the word.
-
-To evaluate the technique, we're pre-training with the 50k texts from the IMDB
-corpus, and then training with only 100 labels. Note that it's a bit dirty to
-pre-train with the development data, but also not *so* terrible: we're not using
-the development labels, after all --- only the unlabelled text.
-""" -import plac -import tqdm -import random - -import ml_datasets - -import spacy -from spacy.util import minibatch -from spacy.pipeline import TextCategorizer -from spacy.ml.models.tok2vec import build_Tok2Vec_model -import numpy - - -def load_texts(limit=0): - train, dev = ml_datasets.imdb() - train_texts, train_labels = zip(*train) - dev_texts, dev_labels = zip(*train) - train_texts = list(train_texts) - dev_texts = list(dev_texts) - random.shuffle(train_texts) - random.shuffle(dev_texts) - if limit >= 1: - return train_texts[:limit] - else: - return list(train_texts) + list(dev_texts) - - -def load_textcat_data(limit=0): - """Load data from the IMDB dataset.""" - # Partition off part of the train data for evaluation - train_data, eval_data = ml_datasets.imdb() - random.shuffle(train_data) - train_data = train_data[-limit:] - texts, labels = zip(*train_data) - eval_texts, eval_labels = zip(*eval_data) - cats = [{"POSITIVE": bool(y), "NEGATIVE": not bool(y)} for y in labels] - eval_cats = [{"POSITIVE": bool(y), "NEGATIVE": not bool(y)} for y in eval_labels] - return (texts, cats), (eval_texts, eval_cats) - - -def prefer_gpu(): - used = spacy.util.use_gpu(0) - if used is None: - return False - else: - import cupy.random - - cupy.random.seed(0) - return True - - -def build_textcat_model(tok2vec, nr_class, width): - from thinc.api import Model, Softmax, chain, reduce_mean, list2ragged - - with Model.define_operators({">>": chain}): - model = ( - tok2vec - >> list2ragged() - >> reduce_mean() - >> Softmax(nr_class, width) - ) - model.set_ref("tok2vec", tok2vec) - return model - - -def block_gradients(model): - from thinc.api import wrap # TODO FIX - - def forward(X, drop=0.0): - Y, _ = model.begin_update(X, drop=drop) - return Y, None - - return wrap(forward, model) - - -def create_pipeline(width, embed_size, vectors_model): - print("Load vectors") - nlp = spacy.load(vectors_model) - print("Start training") - textcat = TextCategorizer( - nlp.vocab, - labels=["POSITIVE", "NEGATIVE"], - # TODO: replace with config version - model=build_textcat_model( - build_Tok2Vec_model(width=width, embed_size=embed_size), 2, width - ), - ) - - nlp.add_pipe(textcat) - return nlp - - -def train_tensorizer(nlp, texts, dropout, n_iter): - tensorizer = nlp.create_pipe("tensorizer") - nlp.add_pipe(tensorizer) - optimizer = nlp.begin_training() - for i in range(n_iter): - losses = {} - for i, batch in enumerate(minibatch(tqdm.tqdm(texts))): - docs = [nlp.make_doc(text) for text in batch] - tensorizer.update((docs, None), losses=losses, sgd=optimizer, drop=dropout) - print(losses) - return optimizer - - -def train_textcat(nlp, n_texts, n_iter=10): - textcat = nlp.get_pipe("textcat") - tok2vec_weights = textcat.model.get_ref("tok2vec").to_bytes() - (train_texts, train_cats), (dev_texts, dev_cats) = load_textcat_data(limit=n_texts) - print( - "Using {} examples ({} training, {} evaluation)".format( - n_texts, len(train_texts), len(dev_texts) - ) - ) - train_data = list(zip(train_texts, [{"cats": cats} for cats in train_cats])) - - with nlp.select_pipes(enable="textcat"): # only train textcat - optimizer = nlp.begin_training() - textcat.model.get_ref("tok2vec").from_bytes(tok2vec_weights) - print("Training the model...") - print("{:^5}\t{:^5}\t{:^5}\t{:^5}".format("LOSS", "P", "R", "F")) - for i in range(n_iter): - losses = {"textcat": 0.0} - # batch up the examples using spaCy's minibatch - batches = minibatch(tqdm.tqdm(train_data), size=2) - for batch in batches: - nlp.update(batch, sgd=optimizer, drop=0.2, 
losses=losses) - with textcat.model.use_params(optimizer.averages): - # evaluate on the dev data split off in load_data() - scores = evaluate_textcat(nlp.tokenizer, textcat, dev_texts, dev_cats) - print( - "{0:.3f}\t{1:.3f}\t{2:.3f}\t{3:.3f}".format( # print a simple table - losses["textcat"], - scores["textcat_p"], - scores["textcat_r"], - scores["textcat_f"], - ) - ) - - -def evaluate_textcat(tokenizer, textcat, texts, cats): - docs = (tokenizer(text) for text in texts) - tp = 1e-8 - fp = 1e-8 - tn = 1e-8 - fn = 1e-8 - for i, doc in enumerate(textcat.pipe(docs)): - gold = cats[i] - for label, score in doc.cats.items(): - if label not in gold: - continue - if score >= 0.5 and gold[label] >= 0.5: - tp += 1.0 - elif score >= 0.5 and gold[label] < 0.5: - fp += 1.0 - elif score < 0.5 and gold[label] < 0.5: - tn += 1 - elif score < 0.5 and gold[label] >= 0.5: - fn += 1 - precision = tp / (tp + fp) - recall = tp / (tp + fn) - f_score = 2 * (precision * recall) / (precision + recall) - return {"textcat_p": precision, "textcat_r": recall, "textcat_f": f_score} - - -@plac.annotations( - width=("Width of CNN layers", "positional", None, int), - embed_size=("Embedding rows", "positional", None, int), - pretrain_iters=("Number of iterations to pretrain", "option", "pn", int), - train_iters=("Number of iterations to pretrain", "option", "tn", int), - train_examples=("Number of labelled examples", "option", "eg", int), - vectors_model=("Name or path to vectors model to learn from"), -) -def main( - width, - embed_size, - vectors_model, - pretrain_iters=30, - train_iters=30, - train_examples=1000, -): - random.seed(0) - numpy.random.seed(0) - use_gpu = prefer_gpu() - print("Using GPU?", use_gpu) - - nlp = create_pipeline(width, embed_size, vectors_model) - print("Load data") - texts = load_texts(limit=0) - print("Train tensorizer") - optimizer = train_tensorizer(nlp, texts, dropout=0.2, n_iter=pretrain_iters) - print("Train textcat") - train_textcat(nlp, train_examples, n_iter=train_iters) - - -if __name__ == "__main__": - plac.call(main) diff --git a/spacy/language.py b/spacy/language.py index 61d69b63e..22360c65f 100644 --- a/spacy/language.py +++ b/spacy/language.py @@ -225,10 +225,6 @@ class Language(object): # Conveniences to access pipeline components # Shouldn't be used anymore! - @property - def tensorizer(self): - return self.get_pipe("tensorizer") - @property def tagger(self): return self.get_pipe("tagger") diff --git a/spacy/ml/models/__init__.py b/spacy/ml/models/__init__.py index ef1e8efca..40cde2437 100644 --- a/spacy/ml/models/__init__.py +++ b/spacy/ml/models/__init__.py @@ -2,6 +2,5 @@ from .entity_linker import * # noqa from .parser import * # noqa from .simple_ner import * from .tagger import * # noqa -from .tensorizer import * # noqa from .textcat import * # noqa from .tok2vec import * # noqa diff --git a/spacy/ml/models/tensorizer.py b/spacy/ml/models/tensorizer.py deleted file mode 100644 index f66610b64..000000000 --- a/spacy/ml/models/tensorizer.py +++ /dev/null @@ -1,10 +0,0 @@ -from thinc.api import Linear, zero_init - -from ... 
import util -from ...util import registry - - -@registry.architectures.register("spacy.Tensorizer.v1") -def build_tensorizer(input_size, output_size): - input_size = util.env_opt("token_vector_width", input_size) - return Linear(output_size, input_size, init_W=zero_init) diff --git a/spacy/pipeline/__init__.py b/spacy/pipeline/__init__.py index b2866bad2..116a08e92 100644 --- a/spacy/pipeline/__init__.py +++ b/spacy/pipeline/__init__.py @@ -1,5 +1,5 @@ from .pipes import Tagger, DependencyParser, EntityRecognizer, EntityLinker -from .pipes import TextCategorizer, Tensorizer, Pipe, Sentencizer +from .pipes import TextCategorizer, Pipe, Sentencizer from .pipes import SentenceRecognizer from .simple_ner import SimpleNER from .morphologizer import Morphologizer @@ -14,7 +14,6 @@ __all__ = [ "EntityRecognizer", "EntityLinker", "TextCategorizer", - "Tensorizer", "Tok2Vec", "Pipe", "Morphologizer", diff --git a/spacy/pipeline/defaults/__init__.py b/spacy/pipeline/defaults/__init__.py index e17e2d3b4..483c6bbd6 100644 --- a/spacy/pipeline/defaults/__init__.py +++ b/spacy/pipeline/defaults/__init__.py @@ -63,16 +63,6 @@ def default_tagger(): return util.load_config(loc, create_objects=True)["model"] -def default_tensorizer_config(): - loc = Path(__file__).parent / "tensorizer_defaults.cfg" - return util.load_config(loc, create_objects=False) - - -def default_tensorizer(): - loc = Path(__file__).parent / "tensorizer_defaults.cfg" - return util.load_config(loc, create_objects=True)["model"] - - def default_textcat_config(): loc = Path(__file__).parent / "textcat_defaults.cfg" return util.load_config(loc, create_objects=False) diff --git a/spacy/pipeline/defaults/tensorizer_defaults.cfg b/spacy/pipeline/defaults/tensorizer_defaults.cfg deleted file mode 100644 index 81880a109..000000000 --- a/spacy/pipeline/defaults/tensorizer_defaults.cfg +++ /dev/null @@ -1,4 +0,0 @@ -[model] -@architectures = "spacy.Tensorizer.v1" -input_size=96 -output_size=300 diff --git a/spacy/pipeline/hooks.py b/spacy/pipeline/hooks.py index 351323ae9..a97e7be68 100644 --- a/spacy/pipeline/hooks.py +++ b/spacy/pipeline/hooks.py @@ -44,8 +44,8 @@ class SentenceSegmenter(object): class SimilarityHook(Pipe): """ Experimental: A pipeline component to install a hook for supervised - similarity into `Doc` objects. Requires a `Tensorizer` to pre-process - documents. The similarity model can be any object obeying the Thinc `Model` + similarity into `Doc` objects. + The similarity model can be any object obeying the Thinc `Model` interface. By default, the model concatenates the elementwise mean and elementwise max of the two tensors, and compares them using the Cauchy-like similarity function from Chen (2013): @@ -82,7 +82,7 @@ class SimilarityHook(Pipe): sims, bp_sims = self.model.begin_update(doc1_doc2) def begin_training(self, _=tuple(), pipeline=None, sgd=None, **kwargs): - """Allocate model, using width from tensorizer in pipeline. + """Allocate model, using nO from the first model in the pipeline. gold_tuples (iterable): Gold-standard training data. pipeline (list): The pipeline the model is part of. 
diff --git a/spacy/pipeline/pipes.pyx b/spacy/pipeline/pipes.pyx
index f75ed1659..cfe01981e 100644
--- a/spacy/pipeline/pipes.pyx
+++ b/spacy/pipeline/pipes.pyx
@@ -16,7 +16,7 @@ from ..morphology cimport Morphology
 from ..vocab cimport Vocab
 
 from .defaults import default_tagger, default_parser, default_ner, default_textcat
-from .defaults import default_nel, default_senter, default_tensorizer
+from .defaults import default_nel, default_senter
 from .functions import merge_subtokens
 from ..language import Language, component
 from ..syntax import nonproj
@@ -238,138 +238,6 @@ class Pipe(object):
         return self
 
 
-@component("tensorizer", assigns=["doc.tensor"], default_model=default_tensorizer)
-class Tensorizer(Pipe):
-    """Pre-train position-sensitive vectors for tokens."""
-
-    def __init__(self, vocab, model, **cfg):
-        """Construct a new statistical model. Weights are not allocated on
-        initialisation.
-
-        vocab (Vocab): A `Vocab` instance. The model must share the same
-            `Vocab` instance with the `Doc` objects it will process.
-        **cfg: Config parameters.
-        """
-        self.vocab = vocab
-        self.model = model
-        self.input_models = []
-        self.cfg = dict(cfg)
-
-    def __call__(self, example):
-        """Add context-sensitive vectors to a `Doc`, e.g. from a CNN or LSTM
-        model. Vectors are set to the `Doc.tensor` attribute.
-
-        docs (Doc or iterable): One or more documents to add vectors to.
-        RETURNS (dict or None): Intermediate computations.
-        """
-        doc = self._get_doc(example)
-        tokvecses = self.predict([doc])
-        self.set_annotations([doc], tokvecses)
-        if isinstance(example, Example):
-            example.doc = doc
-            return example
-        return doc
-
-    def pipe(self, stream, batch_size=128, n_threads=-1, as_example=False):
-        """Process `Doc` objects as a stream.
-
-        stream (iterator): A sequence of `Doc` or `Example` objects to process.
-        batch_size (int): Number of `Doc` or `Example` objects to group.
-        YIELDS (iterator): A sequence of `Doc` or `Example` objects, in order of input.
-        """
-        for examples in util.minibatch(stream, size=batch_size):
-            docs = [self._get_doc(ex) for ex in examples]
-            tensors = self.predict(docs)
-            self.set_annotations(docs, tensors)
-
-            if as_example:
-                for ex, doc in zip(examples, docs):
-                    ex.doc = doc
-                    yield ex
-            else:
-                yield from docs
-
-    def predict(self, docs):
-        """Return a single tensor for a batch of documents.
-
-        docs (iterable): A sequence of `Doc` objects.
-        RETURNS (object): Vector representations for each token in the docs.
-        """
-        inputs = self.model.ops.flatten([doc.tensor for doc in docs])
-        outputs = self.model(inputs)
-        return self.model.ops.unflatten(outputs, [len(d) for d in docs])
-
-    def set_annotations(self, docs, tensors):
-        """Set the tensor attribute for a batch of documents.
-
-        docs (iterable): A sequence of `Doc` objects.
-        tensors (object): Vector representation for each token in the docs.
-        """
-        for doc, tensor in zip(docs, tensors):
-            if tensor.shape[0] != len(doc):
-                raise ValueError(Errors.E076.format(rows=tensor.shape[0], words=len(doc)))
-            doc.tensor = tensor
-
-    def update(self, examples, state=None, drop=0.0, set_annotations=False, sgd=None, losses=None):
-        """Update the model.
-
-        docs (iterable): A batch of `Doc` objects.
-        golds (iterable): A batch of `GoldParse` objects.
-        drop (float): The dropout rate.
-        sgd (callable): An optimizer.
-        RETURNS (dict): Results from the update.
- """ - examples = Example.to_example_objects(examples) - inputs = [] - bp_inputs = [] - set_dropout_rate(self.model, drop) - for tok2vec in self.input_models: - set_dropout_rate(tok2vec, drop) - tensor, bp_tensor = tok2vec.begin_update([ex.doc for ex in examples]) - inputs.append(tensor) - bp_inputs.append(bp_tensor) - inputs = self.model.ops.xp.hstack(inputs) - scores, bp_scores = self.model.begin_update(inputs) - loss, d_scores = self.get_loss(examples, scores) - d_inputs = bp_scores(d_scores, sgd=sgd) - d_inputs = self.model.ops.xp.split(d_inputs, len(self.input_models), axis=1) - for d_input, bp_input in zip(d_inputs, bp_inputs): - bp_input(d_input) - if sgd is not None: - for tok2vec in self.input_models: - tok2vec.finish_update(sgd) - self.model.finish_update(sgd) - if losses is not None: - losses.setdefault(self.name, 0.0) - losses[self.name] += loss - return loss - - def get_loss(self, examples, prediction): - examples = Example.to_example_objects(examples) - ids = self.model.ops.flatten([ex.doc.to_array(ID).ravel() for ex in examples]) - target = self.vocab.vectors.data[ids] - d_scores = (prediction - target) / prediction.shape[0] - loss = (d_scores ** 2).sum() - return loss, d_scores - - def begin_training(self, get_examples=lambda: [], pipeline=None, sgd=None, **kwargs): - """Allocate models, pre-process training data and acquire an - optimizer. - - get_examples (iterable): Gold-standard training data. - pipeline (list): The pipeline the model is part of. - """ - if pipeline is not None: - for name, model in pipeline: - if model.has_ref("tok2vec"): - self.input_models.append(model.get_ref("tok2vec")) - self.model.initialize() - link_vectors_to_models(self.vocab) - if sgd is None: - sgd = self.create_optimizer() - return sgd - - @component("tagger", assigns=["token.tag", "token.pos", "token.lemma"], default_model=default_tagger) class Tagger(Pipe): """Pipeline component for part-of-speech tagging. 
@@ -1707,4 +1575,4 @@ def ner_factory(nlp, model, **cfg):
     warnings.warn(Warnings.W098.format(name="ner"))
     return EntityRecognizer.from_nlp(nlp, model, **cfg)
 
-__all__ = ["Tagger", "DependencyParser", "EntityRecognizer", "Tensorizer", "TextCategorizer", "EntityLinker", "Sentencizer", "SentenceRecognizer"]
+__all__ = ["Tagger", "DependencyParser", "EntityRecognizer", "TextCategorizer", "EntityLinker", "Sentencizer", "SentenceRecognizer"]
diff --git a/spacy/tests/serialize/test_serialize_pipeline.py b/spacy/tests/serialize/test_serialize_pipeline.py
index 4fc277c4f..595a35a9f 100644
--- a/spacy/tests/serialize/test_serialize_pipeline.py
+++ b/spacy/tests/serialize/test_serialize_pipeline.py
@@ -1,7 +1,7 @@
 import pytest
 from spacy.pipeline import Tagger, DependencyParser, EntityRecognizer
-from spacy.pipeline import Tensorizer, TextCategorizer, SentenceRecognizer
-from spacy.pipeline.defaults import default_parser, default_tensorizer, default_tagger
+from spacy.pipeline import TextCategorizer, SentenceRecognizer
+from spacy.pipeline.defaults import default_parser, default_tagger
 from spacy.pipeline.defaults import default_textcat, default_senter
 
 from ..util import make_tempdir
@@ -95,24 +95,6 @@ def test_serialize_tagger_roundtrip_disk(en_vocab, taggers):
     assert tagger1_d.to_bytes() == tagger2_d.to_bytes()
 
 
-def test_serialize_tensorizer_roundtrip_bytes(en_vocab):
-    tensorizer = Tensorizer(en_vocab, default_tensorizer())
-    tensorizer_b = tensorizer.to_bytes(exclude=["vocab"])
-    new_tensorizer = Tensorizer(en_vocab, default_tensorizer()).from_bytes(tensorizer_b)
-    assert new_tensorizer.to_bytes(exclude=["vocab"]) == tensorizer_b
-
-
-def test_serialize_tensorizer_roundtrip_disk(en_vocab):
-    tensorizer = Tensorizer(en_vocab, default_tensorizer())
-    with make_tempdir() as d:
-        file_path = d / "tensorizer"
-        tensorizer.to_disk(file_path)
-        tensorizer_d = Tensorizer(en_vocab, default_tensorizer()).from_disk(file_path)
-        assert tensorizer.to_bytes(exclude=["vocab"]) == tensorizer_d.to_bytes(
-            exclude=["vocab"]
-        )
-
-
 def test_serialize_textcat_empty(en_vocab):
     # See issue #1105
     textcat = TextCategorizer(