remove Tensorizer

svlandeg 2020-06-01 23:38:48 +02:00
parent b5ae2edcba
commit e0f9f448f1
10 changed files with 8 additions and 400 deletions

View File

@@ -1,212 +0,0 @@
"""This script is experimental.
Try pre-training the CNN component of the text categorizer using a cheap
language modelling-like objective. Specifically, we load pretrained vectors
(from something like word2vec, GloVe, FastText etc), and use the CNN to
predict the tokens' pretrained vectors. This isn't as easy as it sounds:
we're not merely doing compression here, because heavy dropout is applied,
including over the input words. This means the model must often (50% of the time)
use the context in order to predict the word.
To evaluate the technique, we're pre-training with the 50k texts from the IMDB
corpus, and then training with only 100 labels. Note that it's a bit dirty to
pre-train with the development data, but also not *so* terrible: we're not using
the development labels, after all --- only the unlabelled text.
"""
import plac
import tqdm
import random
import ml_datasets
import spacy
from spacy.util import minibatch
from spacy.pipeline import TextCategorizer
from spacy.ml.models.tok2vec import build_Tok2Vec_model
import numpy
def load_texts(limit=0):
train, dev = ml_datasets.imdb()
train_texts, train_labels = zip(*train)
dev_texts, dev_labels = zip(*dev)
train_texts = list(train_texts)
dev_texts = list(dev_texts)
random.shuffle(train_texts)
random.shuffle(dev_texts)
if limit >= 1:
return train_texts[:limit]
else:
return list(train_texts) + list(dev_texts)
def load_textcat_data(limit=0):
"""Load data from the IMDB dataset."""
# Partition off part of the train data for evaluation
train_data, eval_data = ml_datasets.imdb()
random.shuffle(train_data)
train_data = train_data[-limit:]
texts, labels = zip(*train_data)
eval_texts, eval_labels = zip(*eval_data)
cats = [{"POSITIVE": bool(y), "NEGATIVE": not bool(y)} for y in labels]
eval_cats = [{"POSITIVE": bool(y), "NEGATIVE": not bool(y)} for y in eval_labels]
return (texts, cats), (eval_texts, eval_cats)
def prefer_gpu():
used = spacy.util.use_gpu(0)
if used is None:
return False
else:
import cupy.random
cupy.random.seed(0)
return True
def build_textcat_model(tok2vec, nr_class, width):
from thinc.api import Model, Softmax, chain, reduce_mean, list2ragged
with Model.define_operators({">>": chain}):
model = (
tok2vec
>> list2ragged()
>> reduce_mean()
>> Softmax(nr_class, width)
)
model.set_ref("tok2vec", tok2vec)
return model
def block_gradients(model):
from thinc.api import wrap # TODO FIX
def forward(X, drop=0.0):
Y, _ = model.begin_update(X, drop=drop)
return Y, None
return wrap(forward, model)
def create_pipeline(width, embed_size, vectors_model):
print("Load vectors")
nlp = spacy.load(vectors_model)
print("Start training")
textcat = TextCategorizer(
nlp.vocab,
labels=["POSITIVE", "NEGATIVE"],
# TODO: replace with config version
model=build_textcat_model(
build_Tok2Vec_model(width=width, embed_size=embed_size), 2, width
),
)
nlp.add_pipe(textcat)
return nlp
def train_tensorizer(nlp, texts, dropout, n_iter):
tensorizer = nlp.create_pipe("tensorizer")
nlp.add_pipe(tensorizer)
optimizer = nlp.begin_training()
for i in range(n_iter):
losses = {}
for i, batch in enumerate(minibatch(tqdm.tqdm(texts))):
docs = [nlp.make_doc(text) for text in batch]
tensorizer.update((docs, None), losses=losses, sgd=optimizer, drop=dropout)
print(losses)
return optimizer
def train_textcat(nlp, n_texts, n_iter=10):
textcat = nlp.get_pipe("textcat")
tok2vec_weights = textcat.model.get_ref("tok2vec").to_bytes()
(train_texts, train_cats), (dev_texts, dev_cats) = load_textcat_data(limit=n_texts)
print(
"Using {} examples ({} training, {} evaluation)".format(
n_texts, len(train_texts), len(dev_texts)
)
)
train_data = list(zip(train_texts, [{"cats": cats} for cats in train_cats]))
with nlp.select_pipes(enable="textcat"): # only train textcat
optimizer = nlp.begin_training()
textcat.model.get_ref("tok2vec").from_bytes(tok2vec_weights)
print("Training the model...")
print("{:^5}\t{:^5}\t{:^5}\t{:^5}".format("LOSS", "P", "R", "F"))
for i in range(n_iter):
losses = {"textcat": 0.0}
# batch up the examples using spaCy's minibatch
batches = minibatch(tqdm.tqdm(train_data), size=2)
for batch in batches:
nlp.update(batch, sgd=optimizer, drop=0.2, losses=losses)
with textcat.model.use_params(optimizer.averages):
# evaluate on the dev data split off in load_data()
scores = evaluate_textcat(nlp.tokenizer, textcat, dev_texts, dev_cats)
print(
"{0:.3f}\t{1:.3f}\t{2:.3f}\t{3:.3f}".format( # print a simple table
losses["textcat"],
scores["textcat_p"],
scores["textcat_r"],
scores["textcat_f"],
)
)
def evaluate_textcat(tokenizer, textcat, texts, cats):
docs = (tokenizer(text) for text in texts)
tp = 1e-8
fp = 1e-8
tn = 1e-8
fn = 1e-8
for i, doc in enumerate(textcat.pipe(docs)):
gold = cats[i]
for label, score in doc.cats.items():
if label not in gold:
continue
if score >= 0.5 and gold[label] >= 0.5:
tp += 1.0
elif score >= 0.5 and gold[label] < 0.5:
fp += 1.0
elif score < 0.5 and gold[label] < 0.5:
tn += 1.0
elif score < 0.5 and gold[label] >= 0.5:
fn += 1.0
precision = tp / (tp + fp)
recall = tp / (tp + fn)
f_score = 2 * (precision * recall) / (precision + recall)
return {"textcat_p": precision, "textcat_r": recall, "textcat_f": f_score}
@plac.annotations(
width=("Width of CNN layers", "positional", None, int),
embed_size=("Embedding rows", "positional", None, int),
pretrain_iters=("Number of iterations to pretrain", "option", "pn", int),
train_iters=("Number of iterations to pretrain", "option", "tn", int),
train_examples=("Number of labelled examples", "option", "eg", int),
vectors_model=("Name or path to vectors model to learn from"),
)
def main(
width,
embed_size,
vectors_model,
pretrain_iters=30,
train_iters=30,
train_examples=1000,
):
random.seed(0)
numpy.random.seed(0)
use_gpu = prefer_gpu()
print("Using GPU?", use_gpu)
nlp = create_pipeline(width, embed_size, vectors_model)
print("Load data")
texts = load_texts(limit=0)
print("Train tensorizer")
optimizer = train_tensorizer(nlp, texts, dropout=0.2, n_iter=pretrain_iters)
print("Train textcat")
train_textcat(nlp, train_examples, n_iter=train_iters)
if __name__ == "__main__":
plac.call(main)
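Aside (not part of the diff): the objective described in the script's docstring amounts to regressing the CNN's per-token output onto the tokens' static vectors with a squared error. A minimal numpy sketch of that loss and its gradient, under the assumption of plain float arrays, mirroring the get_loss of the Tensorizer component removed further down in this commit:

import numpy

def vector_prediction_loss(prediction, target):
    # squared-error loss against the pretrained vectors; the gradient is
    # scaled by the number of token rows
    d_scores = (prediction - target) / prediction.shape[0]
    loss = (d_scores ** 2).sum()
    return loss, d_scores

pred = numpy.random.uniform(-1, 1, (4, 300)).astype("float32")  # CNN output for 4 tokens
gold = numpy.random.uniform(-1, 1, (4, 300)).astype("float32")  # their pretrained vectors
loss, d_scores = vector_prediction_loss(pred, gold)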

View File

@@ -225,10 +225,6 @@ class Language(object):
# Conveniences to access pipeline components
# Shouldn't be used anymore!
@property
def tensorizer(self):
return self.get_pipe("tensorizer")
@property
def tagger(self):
return self.get_pipe("tagger")

View File

@@ -2,6 +2,5 @@ from .entity_linker import * # noqa
from .parser import * # noqa
from .simple_ner import *
from .tagger import * # noqa
from .tensorizer import * # noqa
from .textcat import * # noqa
from .tok2vec import * # noqa

View File

@@ -1,10 +0,0 @@
from thinc.api import Linear, zero_init
from ... import util
from ...util import registry
@registry.architectures.register("spacy.Tensorizer.v1")
def build_tensorizer(input_size, output_size):
input_size = util.env_opt("token_vector_width", input_size)
return Linear(output_size, input_size, init_W=zero_init)

View File

@@ -1,5 +1,5 @@
from .pipes import Tagger, DependencyParser, EntityRecognizer, EntityLinker
from .pipes import TextCategorizer, Tensorizer, Pipe, Sentencizer
from .pipes import TextCategorizer, Pipe, Sentencizer
from .pipes import SentenceRecognizer
from .simple_ner import SimpleNER
from .morphologizer import Morphologizer
@@ -14,7 +14,6 @@ __all__ = [
"EntityRecognizer",
"EntityLinker",
"TextCategorizer",
"Tensorizer",
"Tok2Vec",
"Pipe",
"Morphologizer",

View File

@@ -63,16 +63,6 @@ def default_tagger():
return util.load_config(loc, create_objects=True)["model"]
def default_tensorizer_config():
loc = Path(__file__).parent / "tensorizer_defaults.cfg"
return util.load_config(loc, create_objects=False)
def default_tensorizer():
loc = Path(__file__).parent / "tensorizer_defaults.cfg"
return util.load_config(loc, create_objects=True)["model"]
def default_textcat_config():
loc = Path(__file__).parent / "textcat_defaults.cfg"
return util.load_config(loc, create_objects=False)

View File

@@ -1,4 +0,0 @@
[model]
@architectures = "spacy.Tensorizer.v1"
input_size=96
output_size=300
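Aside (illustrative, not part of the diff): this [model] block is what the removed default_tensorizer() helpers above used to load. Resolving it with create_objects=True produced the zero-initialized Linear layer registered as "spacy.Tensorizer.v1"; the local path below is a hypothetical copy of the block:

from pathlib import Path
from spacy import util

loc = Path("tensorizer_defaults.cfg")                        # hypothetical local copy
cfg = util.load_config(loc, create_objects=False)            # plain settings dict
model = util.load_config(loc, create_objects=True)["model"]  # resolved thinc Linear(300, 96)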

View File

@@ -44,8 +44,8 @@ class SentenceSegmenter(object):
class SimilarityHook(Pipe):
"""
Experimental: A pipeline component to install a hook for supervised
similarity into `Doc` objects. Requires a `Tensorizer` to pre-process
documents. The similarity model can be any object obeying the Thinc `Model`
similarity into `Doc` objects.
The similarity model can be any object obeying the Thinc `Model`
interface. By default, the model concatenates the elementwise mean and
elementwise max of the two tensors, and compares them using the
Cauchy-like similarity function from Chen (2013):
@@ -82,7 +82,7 @@ class SimilarityHook(Pipe):
sims, bp_sims = self.model.begin_update(doc1_doc2)
def begin_training(self, _=tuple(), pipeline=None, sgd=None, **kwargs):
"""Allocate model, using width from tensorizer in pipeline.
"""Allocate model, using nO from the first model in the pipeline.
gold_tuples (iterable): Gold-standard training data.
pipeline (list): The pipeline the model is part of.
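Aside (not part of the diff): a rough numpy sketch of the comparison the SimilarityHook docstring above describes; the pooling helper and the learned weight vector here are illustrative assumptions, not the component's actual code.

import numpy

def doc_vector(token_tensors):
    # concatenate the elementwise mean and elementwise max over the token rows
    return numpy.concatenate([token_tensors.mean(axis=0), token_tensors.max(axis=0)])

def cauchy_similarity(v1, v2, weights):
    # Cauchy-like kernel on a weighted squared distance (Chen, 2013)
    return 1.0 / (1.0 + (weights * (v1 - v2) ** 2).sum())

tensors1 = numpy.random.uniform(-1, 1, (7, 64))  # 7 tokens, width 64
tensors2 = numpy.random.uniform(-1, 1, (5, 64))
weights = numpy.ones(128)                        # learned in the real component
score = cauchy_similarity(doc_vector(tensors1), doc_vector(tensors2), weights)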

View File

@@ -16,7 +16,7 @@ from ..morphology cimport Morphology
from ..vocab cimport Vocab
from .defaults import default_tagger, default_parser, default_ner, default_textcat
from .defaults import default_nel, default_senter, default_tensorizer
from .defaults import default_nel, default_senter
from .functions import merge_subtokens
from ..language import Language, component
from ..syntax import nonproj
@@ -238,138 +238,6 @@ class Pipe(object):
return self
@component("tensorizer", assigns=["doc.tensor"], default_model=default_tensorizer)
class Tensorizer(Pipe):
"""Pre-train position-sensitive vectors for tokens."""
def __init__(self, vocab, model, **cfg):
"""Construct a new statistical model. Weights are not allocated on
initialisation.
vocab (Vocab): A `Vocab` instance. The model must share the same
`Vocab` instance with the `Doc` objects it will process.
**cfg: Config parameters.
"""
self.vocab = vocab
self.model = model
self.input_models = []
self.cfg = dict(cfg)
def __call__(self, example):
"""Add context-sensitive vectors to a `Doc`, e.g. from a CNN or LSTM
model. Vectors are set to the `Doc.tensor` attribute.
example (Doc or Example): The document (or the Example wrapping it) to add vectors to.
RETURNS (Doc or Example): The processed document, or the Example with its doc updated.
"""
doc = self._get_doc(example)
tokvecses = self.predict([doc])
self.set_annotations([doc], tokvecses)
if isinstance(example, Example):
example.doc = doc
return example
return doc
def pipe(self, stream, batch_size=128, n_threads=-1, as_example=False):
"""Process `Doc` objects as a stream.
stream (iterator): A sequence of `Doc` or `Example` objects to process.
batch_size (int): Number of `Doc` or `Example` objects to group.
YIELDS (iterator): A sequence of `Doc` or `Example` objects, in order of input.
"""
for examples in util.minibatch(stream, size=batch_size):
docs = [self._get_doc(ex) for ex in examples]
tensors = self.predict(docs)
self.set_annotations(docs, tensors)
if as_example:
for ex, doc in zip(examples, docs):
ex.doc = doc
yield ex
else:
yield from docs
def predict(self, docs):
"""Return a single tensor for a batch of documents.
docs (iterable): A sequence of `Doc` objects.
RETURNS (object): Vector representations for each token in the docs.
"""
inputs = self.model.ops.flatten([doc.tensor for doc in docs])
outputs = self.model(inputs)
return self.model.ops.unflatten(outputs, [len(d) for d in docs])
def set_annotations(self, docs, tensors):
"""Set the tensor attribute for a batch of documents.
docs (iterable): A sequence of `Doc` objects.
tensors (object): Vector representation for each token in the docs.
"""
for doc, tensor in zip(docs, tensors):
if tensor.shape[0] != len(doc):
raise ValueError(Errors.E076.format(rows=tensor.shape[0], words=len(doc)))
doc.tensor = tensor
def update(self, examples, state=None, drop=0.0, set_annotations=False, sgd=None, losses=None):
"""Update the model.
examples (iterable): A batch of `Example` objects.
drop (float): The dropout rate.
sgd (callable): An optimizer.
losses (dict): Optional dict updated with the loss, keyed by component name.
RETURNS (float): The loss from this update.
"""
examples = Example.to_example_objects(examples)
inputs = []
bp_inputs = []
set_dropout_rate(self.model, drop)
for tok2vec in self.input_models:
set_dropout_rate(tok2vec, drop)
tensor, bp_tensor = tok2vec.begin_update([ex.doc for ex in examples])
inputs.append(tensor)
bp_inputs.append(bp_tensor)
inputs = self.model.ops.xp.hstack(inputs)
scores, bp_scores = self.model.begin_update(inputs)
loss, d_scores = self.get_loss(examples, scores)
d_inputs = bp_scores(d_scores, sgd=sgd)
d_inputs = self.model.ops.xp.split(d_inputs, len(self.input_models), axis=1)
for d_input, bp_input in zip(d_inputs, bp_inputs):
bp_input(d_input)
if sgd is not None:
for tok2vec in self.input_models:
tok2vec.finish_update(sgd)
self.model.finish_update(sgd)
if losses is not None:
losses.setdefault(self.name, 0.0)
losses[self.name] += loss
return loss
def get_loss(self, examples, prediction):
examples = Example.to_example_objects(examples)
ids = self.model.ops.flatten([ex.doc.to_array(ID).ravel() for ex in examples])
target = self.vocab.vectors.data[ids]
d_scores = (prediction - target) / prediction.shape[0]
loss = (d_scores ** 2).sum()
return loss, d_scores
def begin_training(self, get_examples=lambda: [], pipeline=None, sgd=None, **kwargs):
"""Allocate models, pre-process training data and acquire an
optimizer.
get_examples (iterable): Gold-standard training data.
pipeline (list): The pipeline the model is part of.
"""
if pipeline is not None:
for name, model in pipeline:
if model.has_ref("tok2vec"):
self.input_models.append(model.get_ref("tok2vec"))
self.model.initialize()
link_vectors_to_models(self.vocab)
if sgd is None:
sgd = self.create_optimizer()
return sgd
@component("tagger", assigns=["token.tag", "token.pos", "token.lemma"], default_model=default_tagger)
class Tagger(Pipe):
"""Pipeline component for part-of-speech tagging.
@@ -1707,4 +1575,4 @@ def ner_factory(nlp, model, **cfg):
warnings.warn(Warnings.W098.format(name="ner"))
return EntityRecognizer.from_nlp(nlp, model, **cfg)
__all__ = ["Tagger", "DependencyParser", "EntityRecognizer", "Tensorizer", "TextCategorizer", "EntityLinker", "Sentencizer", "SentenceRecognizer"]
__all__ = ["Tagger", "DependencyParser", "EntityRecognizer", "TextCategorizer", "EntityLinker", "Sentencizer", "SentenceRecognizer"]

View File

@@ -1,7 +1,7 @@
import pytest
from spacy.pipeline import Tagger, DependencyParser, EntityRecognizer
from spacy.pipeline import Tensorizer, TextCategorizer, SentenceRecognizer
from spacy.pipeline.defaults import default_parser, default_tensorizer, default_tagger
from spacy.pipeline import TextCategorizer, SentenceRecognizer
from spacy.pipeline.defaults import default_parser, default_tagger
from spacy.pipeline.defaults import default_textcat, default_senter
from ..util import make_tempdir
@@ -95,24 +95,6 @@ def test_serialize_tagger_roundtrip_disk(en_vocab, taggers):
assert tagger1_d.to_bytes() == tagger2_d.to_bytes()
def test_serialize_tensorizer_roundtrip_bytes(en_vocab):
tensorizer = Tensorizer(en_vocab, default_tensorizer())
tensorizer_b = tensorizer.to_bytes(exclude=["vocab"])
new_tensorizer = Tensorizer(en_vocab, default_tensorizer()).from_bytes(tensorizer_b)
assert new_tensorizer.to_bytes(exclude=["vocab"]) == tensorizer_b
def test_serialize_tensorizer_roundtrip_disk(en_vocab):
tensorizer = Tensorizer(en_vocab, default_tensorizer())
with make_tempdir() as d:
file_path = d / "tensorizer"
tensorizer.to_disk(file_path)
tensorizer_d = Tensorizer(en_vocab, default_tensorizer()).from_disk(file_path)
assert tensorizer.to_bytes(exclude=["vocab"]) == tensorizer_d.to_bytes(
exclude=["vocab"]
)
def test_serialize_textcat_empty(en_vocab):
# See issue #1105
textcat = TextCategorizer(