remove Tensorizer

svlandeg 2020-06-01 23:38:48 +02:00
parent b5ae2edcba
commit e0f9f448f1
10 changed files with 8 additions and 400 deletions

View File

@@ -1,212 +0,0 @@
"""This script is experimental.
Try pre-training the CNN component of the text categorizer using a cheap
language modelling-like objective. Specifically, we load pretrained vectors
(from something like word2vec, GloVe, FastText etc), and use the CNN to
predict the tokens' pretrained vectors. This isn't as easy as it sounds:
we're not merely doing compression here, because heavy dropout is applied,
including over the input words. This means the model must often (50% of the time)
use the context in order to predict the word.
To evaluate the technique, we're pre-training with the 50k texts from the IMDB
corpus, and then training with only 100 labels. Note that it's a bit dirty to
pre-train with the development data, but also not *so* terrible: we're not using
the development labels, after all --- only the unlabelled text.
"""
import plac
import tqdm
import random
import ml_datasets
import spacy
from spacy.util import minibatch
from spacy.pipeline import TextCategorizer
from spacy.ml.models.tok2vec import build_Tok2Vec_model
import numpy
def load_texts(limit=0):
train, dev = ml_datasets.imdb()
train_texts, train_labels = zip(*train)
dev_texts, dev_labels = zip(*dev)
train_texts = list(train_texts)
dev_texts = list(dev_texts)
random.shuffle(train_texts)
random.shuffle(dev_texts)
if limit >= 1:
return train_texts[:limit]
else:
return list(train_texts) + list(dev_texts)
def load_textcat_data(limit=0):
"""Load data from the IMDB dataset."""
# Partition off part of the train data for evaluation
train_data, eval_data = ml_datasets.imdb()
random.shuffle(train_data)
train_data = train_data[-limit:]
texts, labels = zip(*train_data)
eval_texts, eval_labels = zip(*eval_data)
cats = [{"POSITIVE": bool(y), "NEGATIVE": not bool(y)} for y in labels]
eval_cats = [{"POSITIVE": bool(y), "NEGATIVE": not bool(y)} for y in eval_labels]
return (texts, cats), (eval_texts, eval_cats)
def prefer_gpu():
used = spacy.util.use_gpu(0)
if used is None:
return False
else:
import cupy.random
cupy.random.seed(0)
return True
def build_textcat_model(tok2vec, nr_class, width):
from thinc.api import Model, Softmax, chain, reduce_mean, list2ragged
with Model.define_operators({">>": chain}):
model = (
tok2vec
>> list2ragged()
>> reduce_mean()
>> Softmax(nr_class, width)
)
model.set_ref("tok2vec", tok2vec)
return model
def block_gradients(model):
from thinc.api import wrap # TODO FIX
def forward(X, drop=0.0):
Y, _ = model.begin_update(X, drop=drop)
return Y, None
return wrap(forward, model)
def create_pipeline(width, embed_size, vectors_model):
print("Load vectors")
nlp = spacy.load(vectors_model)
print("Start training")
textcat = TextCategorizer(
nlp.vocab,
labels=["POSITIVE", "NEGATIVE"],
# TODO: replace with config version
model=build_textcat_model(
build_Tok2Vec_model(width=width, embed_size=embed_size), 2, width
),
)
nlp.add_pipe(textcat)
return nlp
def train_tensorizer(nlp, texts, dropout, n_iter):
tensorizer = nlp.create_pipe("tensorizer")
nlp.add_pipe(tensorizer)
optimizer = nlp.begin_training()
for i in range(n_iter):
losses = {}
for i, batch in enumerate(minibatch(tqdm.tqdm(texts))):
docs = [nlp.make_doc(text) for text in batch]
tensorizer.update((docs, None), losses=losses, sgd=optimizer, drop=dropout)
print(losses)
return optimizer
def train_textcat(nlp, n_texts, n_iter=10):
textcat = nlp.get_pipe("textcat")
tok2vec_weights = textcat.model.get_ref("tok2vec").to_bytes()
(train_texts, train_cats), (dev_texts, dev_cats) = load_textcat_data(limit=n_texts)
print(
"Using {} examples ({} training, {} evaluation)".format(
n_texts, len(train_texts), len(dev_texts)
)
)
train_data = list(zip(train_texts, [{"cats": cats} for cats in train_cats]))
with nlp.select_pipes(enable="textcat"): # only train textcat
optimizer = nlp.begin_training()
textcat.model.get_ref("tok2vec").from_bytes(tok2vec_weights)
print("Training the model...")
print("{:^5}\t{:^5}\t{:^5}\t{:^5}".format("LOSS", "P", "R", "F"))
for i in range(n_iter):
losses = {"textcat": 0.0}
# batch up the examples using spaCy's minibatch
batches = minibatch(tqdm.tqdm(train_data), size=2)
for batch in batches:
nlp.update(batch, sgd=optimizer, drop=0.2, losses=losses)
with textcat.model.use_params(optimizer.averages):
# evaluate on the dev data split off in load_data()
scores = evaluate_textcat(nlp.tokenizer, textcat, dev_texts, dev_cats)
print(
"{0:.3f}\t{1:.3f}\t{2:.3f}\t{3:.3f}".format( # print a simple table
losses["textcat"],
scores["textcat_p"],
scores["textcat_r"],
scores["textcat_f"],
)
)
def evaluate_textcat(tokenizer, textcat, texts, cats):
docs = (tokenizer(text) for text in texts)
tp = 1e-8
fp = 1e-8
tn = 1e-8
fn = 1e-8
for i, doc in enumerate(textcat.pipe(docs)):
gold = cats[i]
for label, score in doc.cats.items():
if label not in gold:
continue
if score >= 0.5 and gold[label] >= 0.5:
tp += 1.0
elif score >= 0.5 and gold[label] < 0.5:
fp += 1.0
elif score < 0.5 and gold[label] < 0.5:
tn += 1.0
elif score < 0.5 and gold[label] >= 0.5:
fn += 1.0
precision = tp / (tp + fp)
recall = tp / (tp + fn)
f_score = 2 * (precision * recall) / (precision + recall)
return {"textcat_p": precision, "textcat_r": recall, "textcat_f": f_score}
@plac.annotations(
width=("Width of CNN layers", "positional", None, int),
embed_size=("Embedding rows", "positional", None, int),
pretrain_iters=("Number of iterations to pretrain", "option", "pn", int),
train_iters=("Number of iterations to pretrain", "option", "tn", int),
train_examples=("Number of labelled examples", "option", "eg", int),
vectors_model=("Name or path to vectors model to learn from"),
)
def main(
width,
embed_size,
vectors_model,
pretrain_iters=30,
train_iters=30,
train_examples=1000,
):
random.seed(0)
numpy.random.seed(0)
use_gpu = prefer_gpu()
print("Using GPU?", use_gpu)
nlp = create_pipeline(width, embed_size, vectors_model)
print("Load data")
texts = load_texts(limit=0)
print("Train tensorizer")
optimizer = train_tensorizer(nlp, texts, dropout=0.2, n_iter=pretrain_iters)
print("Train textcat")
train_textcat(nlp, train_examples, n_iter=train_iters)
if __name__ == "__main__":
plac.call(main)
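Aside (not part of the diff): the objective described in the script's docstring amounts to regressing the CNN's per-token output onto the tokens' static vectors with a squared error. A minimal numpy sketch of that loss and its gradient, under the assumption of plain float arrays, mirroring the get_loss of the Tensorizer component removed further down in this commit:

import numpy

def vector_prediction_loss(prediction, target):
    # squared-error loss against the pretrained vectors; the gradient is
    # scaled by the number of token rows
    d_scores = (prediction - target) / prediction.shape[0]
    loss = (d_scores ** 2).sum()
    return loss, d_scores

pred = numpy.random.uniform(-1, 1, (4, 300)).astype("float32")  # CNN output for 4 tokens
gold = numpy.random.uniform(-1, 1, (4, 300)).astype("float32")  # their pretrained vectors
loss, d_scores = vector_prediction_loss(pred, gold)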

View File

@@ -225,10 +225,6 @@ class Language(object):
# Conveniences to access pipeline components
# Shouldn't be used anymore!
@property
def tensorizer(self):
return self.get_pipe("tensorizer")
@property
def tagger(self):
return self.get_pipe("tagger")

View File

@@ -2,6 +2,5 @@ from .entity_linker import * # noqa
from .parser import * # noqa
from .simple_ner import *
from .tagger import * # noqa
from .tensorizer import * # noqa
from .textcat import * # noqa
from .tok2vec import * # noqa

View File

@@ -1,10 +0,0 @@
from thinc.api import Linear, zero_init
from ... import util
from ...util import registry
@registry.architectures.register("spacy.Tensorizer.v1")
def build_tensorizer(input_size, output_size):
input_size = util.env_opt("token_vector_width", input_size)
return Linear(output_size, input_size, init_W=zero_init)

View File

@@ -1,5 +1,5 @@
from .pipes import Tagger, DependencyParser, EntityRecognizer, EntityLinker
from .pipes import TextCategorizer, Tensorizer, Pipe, Sentencizer
from .pipes import TextCategorizer, Pipe, Sentencizer
from .pipes import SentenceRecognizer
from .simple_ner import SimpleNER
from .morphologizer import Morphologizer
@@ -14,7 +14,6 @@ __all__ = [
"EntityRecognizer",
"EntityLinker",
"TextCategorizer",
"Tensorizer",
"Tok2Vec",
"Pipe",
"Morphologizer",

View File

@@ -63,16 +63,6 @@ def default_tagger():
return util.load_config(loc, create_objects=True)["model"]
def default_tensorizer_config():
loc = Path(__file__).parent / "tensorizer_defaults.cfg"
return util.load_config(loc, create_objects=False)
def default_tensorizer():
loc = Path(__file__).parent / "tensorizer_defaults.cfg"
return util.load_config(loc, create_objects=True)["model"]
def default_textcat_config():
loc = Path(__file__).parent / "textcat_defaults.cfg"
return util.load_config(loc, create_objects=False)

View File

@@ -1,4 +0,0 @@
[model]
@architectures = "spacy.Tensorizer.v1"
input_size=96
output_size=300
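Aside (illustrative, not part of the diff): this [model] block is what the removed default_tensorizer() helpers above used to load. Resolving it with create_objects=True produced the zero-initialized Linear layer registered as "spacy.Tensorizer.v1"; the local path below is a hypothetical copy of the block:

from pathlib import Path
from spacy import util

loc = Path("tensorizer_defaults.cfg")                        # hypothetical local copy
cfg = util.load_config(loc, create_objects=False)            # plain settings dict
model = util.load_config(loc, create_objects=True)["model"]  # resolved thinc Linear(300, 96)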

View File

@@ -44,8 +44,8 @@ class SentenceSegmenter(object):
class SimilarityHook(Pipe):
"""
Experimental: A pipeline component to install a hook for supervised
similarity into `Doc` objects. Requires a `Tensorizer` to pre-process
documents. The similarity model can be any object obeying the Thinc `Model`
similarity into `Doc` objects.
The similarity model can be any object obeying the Thinc `Model`
interface. By default, the model concatenates the elementwise mean and
elementwise max of the two tensors, and compares them using the
Cauchy-like similarity function from Chen (2013):
@@ -82,7 +82,7 @@ class SimilarityHook(Pipe):
sims, bp_sims = self.model.begin_update(doc1_doc2)
def begin_training(self, _=tuple(), pipeline=None, sgd=None, **kwargs):
"""Allocate model, using width from tensorizer in pipeline.
"""Allocate model, using nO from the first model in the pipeline.
gold_tuples (iterable): Gold-standard training data.
pipeline (list): The pipeline the model is part of.
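Aside (not part of the diff): a rough numpy sketch of the comparison the SimilarityHook docstring above describes; the pooling helper and the learned weight vector here are illustrative assumptions, not the component's actual code.

import numpy

def doc_vector(token_tensors):
    # concatenate the elementwise mean and elementwise max over the token rows
    return numpy.concatenate([token_tensors.mean(axis=0), token_tensors.max(axis=0)])

def cauchy_similarity(v1, v2, weights):
    # Cauchy-like kernel on a weighted squared distance (Chen, 2013)
    return 1.0 / (1.0 + (weights * (v1 - v2) ** 2).sum())

tensors1 = numpy.random.uniform(-1, 1, (7, 64))  # 7 tokens, width 64
tensors2 = numpy.random.uniform(-1, 1, (5, 64))
weights = numpy.ones(128)                        # learned in the real component
score = cauchy_similarity(doc_vector(tensors1), doc_vector(tensors2), weights)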

View File

@@ -16,7 +16,7 @@ from ..morphology cimport Morphology
from ..vocab cimport Vocab
from .defaults import default_tagger, default_parser, default_ner, default_textcat
from .defaults import default_nel, default_senter, default_tensorizer
from .defaults import default_nel, default_senter
from .functions import merge_subtokens
from ..language import Language, component
from ..syntax import nonproj
@@ -238,138 +238,6 @@ class Pipe(object):
return self
@component("tensorizer", assigns=["doc.tensor"], default_model=default_tensorizer)
class Tensorizer(Pipe):
"""Pre-train position-sensitive vectors for tokens."""
def __init__(self, vocab, model, **cfg):
"""Construct a new statistical model. Weights are not allocated on
initialisation.
vocab (Vocab): A `Vocab` instance. The model must share the same
`Vocab` instance with the `Doc` objects it will process.
**cfg: Config parameters.
"""
self.vocab = vocab
self.model = model
self.input_models = []
self.cfg = dict(cfg)
def __call__(self, example):
"""Add context-sensitive vectors to a `Doc`, e.g. from a CNN or LSTM
model. Vectors are set to the `Doc.tensor` attribute.
example (Doc or Example): The document (or the Example wrapping it) to add vectors to.
RETURNS (Doc or Example): The processed document, or the Example with its doc updated.
"""
doc = self._get_doc(example)
tokvecses = self.predict([doc])
self.set_annotations([doc], tokvecses)
if isinstance(example, Example):
example.doc = doc
return example
return doc
def pipe(self, stream, batch_size=128, n_threads=-1, as_example=False):
"""Process `Doc` objects as a stream.
stream (iterator): A sequence of `Doc` or `Example` objects to process.
batch_size (int): Number of `Doc` or `Example` objects to group.
YIELDS (iterator): A sequence of `Doc` or `Example` objects, in order of input.
"""
for examples in util.minibatch(stream, size=batch_size):
docs = [self._get_doc(ex) for ex in examples]
tensors = self.predict(docs)
self.set_annotations(docs, tensors)
if as_example:
for ex, doc in zip(examples, docs):
ex.doc = doc
yield ex
else:
yield from docs
def predict(self, docs):
"""Return a single tensor for a batch of documents.
docs (iterable): A sequence of `Doc` objects.
RETURNS (object): Vector representations for each token in the docs.
"""
inputs = self.model.ops.flatten([doc.tensor for doc in docs])
outputs = self.model(inputs)
return self.model.ops.unflatten(outputs, [len(d) for d in docs])
def set_annotations(self, docs, tensors):
"""Set the tensor attribute for a batch of documents.
docs (iterable): A sequence of `Doc` objects.
tensors (object): Vector representation for each token in the docs.
"""
for doc, tensor in zip(docs, tensors):
if tensor.shape[0] != len(doc):
raise ValueError(Errors.E076.format(rows=tensor.shape[0], words=len(doc)))
doc.tensor = tensor
def update(self, examples, state=None, drop=0.0, set_annotations=False, sgd=None, losses=None):
"""Update the model.
examples (iterable): A batch of `Example` objects.
drop (float): The dropout rate.
sgd (callable): An optimizer.
losses (dict): Optional dict updated with the loss, keyed by component name.
RETURNS (float): The loss from this update.
"""
examples = Example.to_example_objects(examples)
inputs = []
bp_inputs = []
set_dropout_rate(self.model, drop)
for tok2vec in self.input_models:
set_dropout_rate(tok2vec, drop)
tensor, bp_tensor = tok2vec.begin_update([ex.doc for ex in examples])
inputs.append(tensor)
bp_inputs.append(bp_tensor)
inputs = self.model.ops.xp.hstack(inputs)
scores, bp_scores = self.model.begin_update(inputs)
loss, d_scores = self.get_loss(examples, scores)
d_inputs = bp_scores(d_scores, sgd=sgd)
d_inputs = self.model.ops.xp.split(d_inputs, len(self.input_models), axis=1)
for d_input, bp_input in zip(d_inputs, bp_inputs):
bp_input(d_input)
if sgd is not None:
for tok2vec in self.input_models:
tok2vec.finish_update(sgd)
self.model.finish_update(sgd)
if losses is not None:
losses.setdefault(self.name, 0.0)
losses[self.name] += loss
return loss
def get_loss(self, examples, prediction):
examples = Example.to_example_objects(examples)
ids = self.model.ops.flatten([ex.doc.to_array(ID).ravel() for ex in examples])
target = self.vocab.vectors.data[ids]
d_scores = (prediction - target) / prediction.shape[0]
loss = (d_scores ** 2).sum()
return loss, d_scores
def begin_training(self, get_examples=lambda: [], pipeline=None, sgd=None, **kwargs):
"""Allocate models, pre-process training data and acquire an
optimizer.
get_examples (iterable): Gold-standard training data.
pipeline (list): The pipeline the model is part of.
"""
if pipeline is not None:
for name, model in pipeline:
if model.has_ref("tok2vec"):
self.input_models.append(model.get_ref("tok2vec"))
self.model.initialize()
link_vectors_to_models(self.vocab)
if sgd is None:
sgd = self.create_optimizer()
return sgd
@component("tagger", assigns=["token.tag", "token.pos", "token.lemma"], default_model=default_tagger)
class Tagger(Pipe):
"""Pipeline component for part-of-speech tagging.
@@ -1707,4 +1575,4 @@ def ner_factory(nlp, model, **cfg):
warnings.warn(Warnings.W098.format(name="ner"))
return EntityRecognizer.from_nlp(nlp, model, **cfg)
__all__ = ["Tagger", "DependencyParser", "EntityRecognizer", "Tensorizer", "TextCategorizer", "EntityLinker", "Sentencizer", "SentenceRecognizer"]
__all__ = ["Tagger", "DependencyParser", "EntityRecognizer", "TextCategorizer", "EntityLinker", "Sentencizer", "SentenceRecognizer"]

View File

@@ -1,7 +1,7 @@
import pytest
from spacy.pipeline import Tagger, DependencyParser, EntityRecognizer
from spacy.pipeline import Tensorizer, TextCategorizer, SentenceRecognizer
from spacy.pipeline.defaults import default_parser, default_tensorizer, default_tagger
from spacy.pipeline import TextCategorizer, SentenceRecognizer
from spacy.pipeline.defaults import default_parser, default_tagger
from spacy.pipeline.defaults import default_textcat, default_senter
from ..util import make_tempdir
@@ -95,24 +95,6 @@ def test_serialize_tagger_roundtrip_disk(en_vocab, taggers):
assert tagger1_d.to_bytes() == tagger2_d.to_bytes()
def test_serialize_tensorizer_roundtrip_bytes(en_vocab):
tensorizer = Tensorizer(en_vocab, default_tensorizer())
tensorizer_b = tensorizer.to_bytes(exclude=["vocab"])
new_tensorizer = Tensorizer(en_vocab, default_tensorizer()).from_bytes(tensorizer_b)
assert new_tensorizer.to_bytes(exclude=["vocab"]) == tensorizer_b
def test_serialize_tensorizer_roundtrip_disk(en_vocab):
tensorizer = Tensorizer(en_vocab, default_tensorizer())
with make_tempdir() as d:
file_path = d / "tensorizer"
tensorizer.to_disk(file_path)
tensorizer_d = Tensorizer(en_vocab, default_tensorizer()).from_disk(file_path)
assert tensorizer.to_bytes(exclude=["vocab"]) == tensorizer_d.to_bytes(
exclude=["vocab"]
)
def test_serialize_textcat_empty(en_vocab):
# See issue #1105
textcat = TextCategorizer(