mirror of
https://github.com/explosion/spaCy.git
synced 2025-01-11 17:56:30 +03:00
remove Tensorizer
This commit is contained in:
parent
b5ae2edcba
commit
e0f9f448f1
|
@ -1,212 +0,0 @@
|
|||
"""This script is experimental.
|
||||
|
||||
Try pre-training the CNN component of the text categorizer using a cheap
|
||||
language modelling-like objective. Specifically, we load pretrained vectors
|
||||
(from something like word2vec, GloVe, FastText etc), and use the CNN to
|
||||
predict the tokens' pretrained vectors. This isn't as easy as it sounds:
|
||||
we're not merely doing compression here, because heavy dropout is applied,
|
||||
including over the input words. This means the model must often (50% of the time)
|
||||
use the context in order to predict the word.
|
||||
|
||||
To evaluate the technique, we're pre-training with the 50k texts from the IMDB
|
||||
corpus, and then training with only 100 labels. Note that it's a bit dirty to
|
||||
pre-train with the development data, but also not *so* terrible: we're not using
|
||||
the development labels, after all --- only the unlabelled text.
|
||||
"""
|
||||
import plac
|
||||
import tqdm
|
||||
import random
|
||||
|
||||
import ml_datasets
|
||||
|
||||
import spacy
|
||||
from spacy.util import minibatch
|
||||
from spacy.pipeline import TextCategorizer
|
||||
from spacy.ml.models.tok2vec import build_Tok2Vec_model
|
||||
import numpy
|
||||
|
||||
|
||||
def load_texts(limit=0):
    """Load raw, unlabelled IMDB review texts for pre-training.

    limit (int): If >= 1, return only the first `limit` shuffled training
        texts. Otherwise return every training text followed by every
        development text.
    RETURNS (list): The review texts, with the labels discarded.
    """
    train, dev = ml_datasets.imdb()
    train_texts, _ = zip(*train)
    # Unzip the *development* split here. The previous code unpacked `train`
    # a second time, so the "dev" texts were just another copy of the
    # training texts and the real development texts were never used.
    dev_texts, _ = zip(*dev)
    train_texts = list(train_texts)
    dev_texts = list(dev_texts)
    # Both lists are shuffled in this order so that the seeded RNG state
    # consumed here stays the same as before.
    random.shuffle(train_texts)
    random.shuffle(dev_texts)
    if limit >= 1:
        return train_texts[:limit]
    return train_texts + dev_texts
|
||||
|
||||
|
||||
def load_textcat_data(limit=0):
    """Load labelled IMDB data for training and evaluating the textcat.

    limit (int): Number of shuffled training examples to keep, taken from the
        end of the list. With the default of 0 the slice `[-0:]` keeps every
        training example.
    RETURNS (tuple): `((texts, cats), (eval_texts, eval_cats))`, where each
        entry of `cats` / `eval_cats` is a dict with boolean "POSITIVE" and
        "NEGATIVE" values derived from the label.
    """

    def to_cats(label):
        # One gold dict per example; the two categories are mutually exclusive.
        return {"POSITIVE": bool(label), "NEGATIVE": not bool(label)}

    train_data, eval_data = ml_datasets.imdb()
    random.shuffle(train_data)
    kept = train_data[-limit:]
    texts, labels = zip(*kept)
    eval_texts, eval_labels = zip(*eval_data)
    cats = [to_cats(y) for y in labels]
    eval_cats = [to_cats(y) for y in eval_labels]
    return (texts, cats), (eval_texts, eval_cats)
|
||||
|
||||
|
||||
def prefer_gpu():
    """Try to activate GPU 0 and report whether that succeeded.

    RETURNS (bool): False if `spacy.util.use_gpu(0)` returned None, otherwise
        True. In the latter case the cupy random generator is seeded with 0.
    """
    if spacy.util.use_gpu(0) is None:
        return False
    # Imported lazily so that the script still runs where cupy is unavailable.
    import cupy.random

    cupy.random.seed(0)
    return True
|
||||
|
||||
|
||||
def build_textcat_model(tok2vec, nr_class, width):
    """Compose a text classification model on top of a tok2vec layer.

    tok2vec (Model): The token-to-vector layer; also registered on the
        result under the reference name "tok2vec".
    nr_class (int): First argument passed to `Softmax`.
    width (int): Second argument passed to `Softmax`
        (presumably the tok2vec output width -- confirm against thinc).
    RETURNS (Model): tok2vec >> list2ragged >> reduce_mean >> Softmax.
    """
    from thinc.api import Model, Softmax, chain, reduce_mean, list2ragged

    with Model.define_operators({">>": chain}):
        # Built in two steps, but still composed strictly left to right.
        pooled = tok2vec >> list2ragged() >> reduce_mean()
        model = pooled >> Softmax(nr_class, width)
    model.set_ref("tok2vec", tok2vec)
    return model
|
||||
|
||||
|
||||
def block_gradients(model):
    """Wrap `model` so its output is passed on without a backprop callback.

    model: Object exposing `begin_update(X, drop=...)` that returns a pair.
    RETURNS: The result of `wrap(forward, model)`.
    """
    from thinc.api import wrap  # TODO FIX

    def forward(X, drop=0.0):
        # Run the wrapped model but discard its backprop callback.
        Y, _backprop = model.begin_update(X, drop=drop)
        return Y, None

    return wrap(forward, model)
|
||||
|
||||
|
||||
def create_pipeline(width, embed_size, vectors_model):
    """Load a vectors model and add a freshly built text categorizer to it.

    width (int): Width passed to the tok2vec and textcat model builders.
    embed_size (int): Embedding size passed to `build_Tok2Vec_model`.
    vectors_model (str): Name or path handed to `spacy.load`.
    RETURNS: The loaded pipeline with the text categorizer added.
    """
    print("Load vectors")
    nlp = spacy.load(vectors_model)
    print("Start training")
    # TODO: replace with config version
    tok2vec = build_Tok2Vec_model(width=width, embed_size=embed_size)
    textcat_model = build_textcat_model(tok2vec, 2, width)
    textcat = TextCategorizer(
        nlp.vocab, labels=["POSITIVE", "NEGATIVE"], model=textcat_model
    )
    nlp.add_pipe(textcat)
    return nlp
|
||||
|
||||
|
||||
def train_tensorizer(nlp, texts, dropout, n_iter):
    """Pre-train a "tensorizer" component on raw texts.

    nlp: Pipeline the "tensorizer" component is created from and added to.
    texts (iterable): Raw texts; each is converted with `nlp.make_doc`.
    dropout (float): Dropout rate passed to `tensorizer.update`.
    n_iter (int): Number of passes over `texts`.
    RETURNS: The optimizer returned by `nlp.begin_training()`.
    """
    tensorizer = nlp.create_pipe("tensorizer")
    nlp.add_pipe(tensorizer)
    optimizer = nlp.begin_training()
    for _ in range(n_iter):
        losses = {}
        # Iterate the batches directly: the previous `enumerate` index was
        # never used and shadowed the epoch counter of the outer loop.
        for batch in minibatch(tqdm.tqdm(texts)):
            docs = [nlp.make_doc(text) for text in batch]
            tensorizer.update((docs, None), losses=losses, sgd=optimizer, drop=dropout)
        # Report the accumulated losses once per epoch.
        print(losses)
    return optimizer
|
||||
|
||||
|
||||
def train_textcat(nlp, n_texts, n_iter=10):
    """Train the "textcat" component and print a score table per epoch.

    nlp: Pipeline containing a "textcat" component.
    n_texts (int): Passed as `limit` to `load_textcat_data`.
    n_iter (int): Number of training epochs.
    """
    textcat = nlp.get_pipe("textcat")
    # Snapshot the tok2vec weights so they can be loaded back in after
    # `nlp.begin_training()` has run.
    tok2vec_weights = textcat.model.get_ref("tok2vec").to_bytes()
    train_split, dev_split = load_textcat_data(limit=n_texts)
    train_texts, train_cats = train_split
    dev_texts, dev_cats = dev_split
    summary = "Using {} examples ({} training, {} evaluation)".format(
        n_texts, len(train_texts), len(dev_texts)
    )
    print(summary)
    annotations = [{"cats": cats} for cats in train_cats]
    train_data = list(zip(train_texts, annotations))

    with nlp.select_pipes(enable="textcat"):  # only train textcat
        optimizer = nlp.begin_training()
        textcat.model.get_ref("tok2vec").from_bytes(tok2vec_weights)
        print("Training the model...")
        print("{:^5}\t{:^5}\t{:^5}\t{:^5}".format("LOSS", "P", "R", "F"))
        row_template = "{0:.3f}\t{1:.3f}\t{2:.3f}\t{3:.3f}"  # print a simple table
        for _ in range(n_iter):
            losses = {"textcat": 0.0}
            # batch up the examples using spaCy's minibatch
            for batch in minibatch(tqdm.tqdm(train_data), size=2):
                nlp.update(batch, sgd=optimizer, drop=0.2, losses=losses)
            with textcat.model.use_params(optimizer.averages):
                # evaluate on the dev data returned by load_textcat_data()
                scores = evaluate_textcat(nlp.tokenizer, textcat, dev_texts, dev_cats)
            row = row_template.format(
                losses["textcat"],
                scores["textcat_p"],
                scores["textcat_r"],
                scores["textcat_f"],
            )
            print(row)
|
||||
|
||||
|
||||
def evaluate_textcat(tokenizer, textcat, texts, cats):
    """Score text categorizer predictions against gold categories.

    tokenizer (callable): Turns each text into a document.
    textcat: Object whose `pipe(docs)` yields documents with a `cats` dict.
    texts (iterable): Raw texts, aligned with `cats`.
    cats (sequence): One gold dict per text, mapping label to a value that
        is compared against 0.5.
    RETURNS (dict): Precision, recall and F-score under the keys
        "textcat_p", "textcat_r" and "textcat_f".
    """
    # Every counter starts at a tiny epsilon so the divisions below can
    # never hit zero.
    counts = {"tp": 1e-8, "fp": 1e-8, "tn": 1e-8, "fn": 1e-8}
    docs = (tokenizer(text) for text in texts)
    for index, doc in enumerate(textcat.pipe(docs)):
        gold = cats[index]
        for label, score in doc.cats.items():
            if label not in gold:
                continue
            if score >= 0.5:
                if gold[label] >= 0.5:
                    counts["tp"] += 1.0
                elif gold[label] < 0.5:
                    counts["fp"] += 1.0
            elif score < 0.5:
                if gold[label] < 0.5:
                    counts["tn"] += 1
                elif gold[label] >= 0.5:
                    counts["fn"] += 1
    precision = counts["tp"] / (counts["tp"] + counts["fp"])
    recall = counts["tp"] / (counts["tp"] + counts["fn"])
    f_score = 2 * (precision * recall) / (precision + recall)
    return {"textcat_p": precision, "textcat_r": recall, "textcat_f": f_score}
|
||||
|
||||
|
||||
@plac.annotations(
    width=("Width of CNN layers", "positional", None, int),
    embed_size=("Embedding rows", "positional", None, int),
    pretrain_iters=("Number of iterations to pretrain", "option", "pn", int),
    # This help text previously repeated "pretrain", which mislabelled the
    # option in the generated command-line help.
    train_iters=("Number of iterations to train", "option", "tn", int),
    train_examples=("Number of labelled examples", "option", "eg", int),
    vectors_model=("Name or path to vectors model to learn from"),
)
def main(
    width,
    embed_size,
    vectors_model,
    pretrain_iters=30,
    train_iters=30,
    train_examples=1000,
):
    """Pre-train on raw IMDB text, then train and evaluate the textcat.

    width (int): Width of the CNN layers.
    embed_size (int): Number of embedding rows.
    vectors_model (str): Name or path of the vectors model to load.
    pretrain_iters (int): Epochs of tensorizer pre-training.
    train_iters (int): Epochs of text categorizer training.
    train_examples (int): Number of labelled examples to train on.
    """
    # Seed the RNGs up front so the shuffles in the data loaders are repeatable.
    random.seed(0)
    numpy.random.seed(0)
    use_gpu = prefer_gpu()
    print("Using GPU?", use_gpu)

    nlp = create_pipeline(width, embed_size, vectors_model)
    print("Load data")
    texts = load_texts(limit=0)
    print("Train tensorizer")
    # The returned optimizer is not needed: train_textcat creates its own.
    train_tensorizer(nlp, texts, dropout=0.2, n_iter=pretrain_iters)
    print("Train textcat")
    train_textcat(nlp, train_examples, n_iter=train_iters)


if __name__ == "__main__":
    plac.call(main)
|
|
@ -225,10 +225,6 @@ class Language(object):
|
|||
|
||||
# Conveniences to access pipeline components
|
||||
# Shouldn't be used anymore!
|
||||
@property
def tensorizer(self):
    """Convenience accessor for the pipeline component named "tensorizer"."""
    component = self.get_pipe("tensorizer")
    return component
|
||||
|
||||
@property
def tagger(self):
    """Convenience accessor for the pipeline component named "tagger"."""
    component = self.get_pipe("tagger")
    return component
|
||||
|
|
|
@ -2,6 +2,5 @@ from .entity_linker import * # noqa
|
|||
from .parser import * # noqa
|
||||
from .simple_ner import *
|
||||
from .tagger import * # noqa
|
||||
from .tensorizer import * # noqa
|
||||
from .textcat import * # noqa
|
||||
from .tok2vec import * # noqa
|
||||
|
|
|
@ -1,10 +0,0 @@
|
|||
from thinc.api import Linear, zero_init
|
||||
|
||||
from ... import util
|
||||
from ...util import registry
|
||||
|
||||
|
||||
@registry.architectures.register("spacy.Tensorizer.v1")
def build_tensorizer(input_size, output_size):
    """Build the linear model registered as "spacy.Tensorizer.v1".

    input_size (int): Fallback passed to `util.env_opt` for the
        "token_vector_width" option.
    output_size (int): First argument passed to `Linear`.
    RETURNS: A `Linear` layer created with `init_W=zero_init`.
    """
    resolved_input_size = util.env_opt("token_vector_width", input_size)
    return Linear(output_size, resolved_input_size, init_W=zero_init)
|
|
@ -1,5 +1,5 @@
|
|||
from .pipes import Tagger, DependencyParser, EntityRecognizer, EntityLinker
|
||||
from .pipes import TextCategorizer, Tensorizer, Pipe, Sentencizer
|
||||
from .pipes import TextCategorizer, Pipe, Sentencizer
|
||||
from .pipes import SentenceRecognizer
|
||||
from .simple_ner import SimpleNER
|
||||
from .morphologizer import Morphologizer
|
||||
|
@ -14,7 +14,6 @@ __all__ = [
|
|||
"EntityRecognizer",
|
||||
"EntityLinker",
|
||||
"TextCategorizer",
|
||||
"Tensorizer",
|
||||
"Tok2Vec",
|
||||
"Pipe",
|
||||
"Morphologizer",
|
||||
|
|
|
@ -63,16 +63,6 @@ def default_tagger():
|
|||
return util.load_config(loc, create_objects=True)["model"]
|
||||
|
||||
|
||||
def default_tensorizer_config():
    """Load "tensorizer_defaults.cfg" from this module's directory.

    RETURNS: The result of `util.load_config` with `create_objects=False`.
    """
    module_dir = Path(__file__).parent
    config_path = module_dir / "tensorizer_defaults.cfg"
    return util.load_config(config_path, create_objects=False)
|
||||
|
||||
|
||||
def default_tensorizer():
    """Load "tensorizer_defaults.cfg" and return its "model" entry.

    RETURNS: The "model" value of the config loaded with
        `create_objects=True`.
    """
    module_dir = Path(__file__).parent
    config_path = module_dir / "tensorizer_defaults.cfg"
    config = util.load_config(config_path, create_objects=True)
    return config["model"]
|
||||
|
||||
|
||||
def default_textcat_config():
    """Load "textcat_defaults.cfg" from this module's directory.

    RETURNS: The result of `util.load_config` with `create_objects=False`.
    """
    module_dir = Path(__file__).parent
    config_path = module_dir / "textcat_defaults.cfg"
    return util.load_config(config_path, create_objects=False)
|
||||
|
|
|
@ -1,4 +0,0 @@
|
|||
[model]
|
||||
@architectures = "spacy.Tensorizer.v1"
|
||||
input_size=96
|
||||
output_size=300
|
|
@ -44,8 +44,8 @@ class SentenceSegmenter(object):
|
|||
class SimilarityHook(Pipe):
|
||||
"""
|
||||
Experimental: A pipeline component to install a hook for supervised
|
||||
similarity into `Doc` objects. Requires a `Tensorizer` to pre-process
|
||||
documents. The similarity model can be any object obeying the Thinc `Model`
|
||||
similarity into `Doc` objects.
|
||||
The similarity model can be any object obeying the Thinc `Model`
|
||||
interface. By default, the model concatenates the elementwise mean and
|
||||
elementwise max of the two tensors, and compares them using the
|
||||
Cauchy-like similarity function from Chen (2013):
|
||||
|
@ -82,7 +82,7 @@ class SimilarityHook(Pipe):
|
|||
sims, bp_sims = self.model.begin_update(doc1_doc2)
|
||||
|
||||
def begin_training(self, _=tuple(), pipeline=None, sgd=None, **kwargs):
|
||||
"""Allocate model, using width from tensorizer in pipeline.
|
||||
"""Allocate model, using nO from the first model in the pipeline.
|
||||
|
||||
gold_tuples (iterable): Gold-standard training data.
|
||||
pipeline (list): The pipeline the model is part of.
|
||||
|
|
|
@ -16,7 +16,7 @@ from ..morphology cimport Morphology
|
|||
from ..vocab cimport Vocab
|
||||
|
||||
from .defaults import default_tagger, default_parser, default_ner, default_textcat
|
||||
from .defaults import default_nel, default_senter, default_tensorizer
|
||||
from .defaults import default_nel, default_senter
|
||||
from .functions import merge_subtokens
|
||||
from ..language import Language, component
|
||||
from ..syntax import nonproj
|
||||
|
@ -238,138 +238,6 @@ class Pipe(object):
|
|||
return self
|
||||
|
||||
|
||||
@component("tensorizer", assigns=["doc.tensor"], default_model=default_tensorizer)
class Tensorizer(Pipe):
    """Pre-train position-sensitive vectors for tokens.

    The component reads `doc.tensor`, passes it through `self.model` and
    writes the result back to `doc.tensor`. During `update`, the model output
    is regressed onto the vectors stored in `vocab.vectors`.
    """

    def __init__(self, vocab, model, **cfg):
        """Construct a new statistical model. Weights are not allocated on
        initialisation.

        vocab (Vocab): A `Vocab` instance. The model must share the same
            `Vocab` instance with the `Doc` objects it will process.
        model: The model applied to the flattened document tensors.
        **cfg: Config parameters, stored as a dict copy on `self.cfg`.
        """
        self.vocab = vocab
        self.model = model
        # Filled in by `begin_training` with the "tok2vec" references found
        # on the models of the pipeline.
        self.input_models = []
        self.cfg = dict(cfg)

    def __call__(self, example):
        """Add context-sensitive vectors to a `Doc`, e.g. from a CNN or LSTM
        model. Vectors are set to the `Doc.tensor` attribute.

        example (Doc or Example): The document to process, or an `Example`
            from which the document is obtained via `self._get_doc`.
        RETURNS (Doc or Example): The `Example` with its `doc` set if an
            `Example` was passed in, otherwise the processed `Doc`.
        """
        doc = self._get_doc(example)
        tokvecses = self.predict([doc])
        self.set_annotations([doc], tokvecses)
        if isinstance(example, Example):
            example.doc = doc
            return example
        return doc

    def pipe(self, stream, batch_size=128, n_threads=-1, as_example=False):
        """Process `Doc` objects as a stream.

        stream (iterator): A sequence of `Doc` or `Example` objects to process.
        batch_size (int): Number of `Doc` or `Example` objects to group.
        n_threads (int): Not used in this method body.
        as_example (bool): If True, yield the input `Example` objects with
            their `doc` set; otherwise yield the `Doc` objects.
        YIELDS (iterator): A sequence of `Doc` or `Example` objects, in order of input.
        """
        for examples in util.minibatch(stream, size=batch_size):
            docs = [self._get_doc(ex) for ex in examples]
            tensors = self.predict(docs)
            self.set_annotations(docs, tensors)

            if as_example:
                for ex, doc in zip(examples, docs):
                    ex.doc = doc
                    yield ex
            else:
                yield from docs

    def predict(self, docs):
        """Return one tensor per document for a batch of documents.

        The `doc.tensor` arrays are flattened into one input, passed through
        the model, and split back up using the document lengths.

        docs (iterable): A sequence of `Doc` objects.
        RETURNS (object): Vector representations for each token in the docs.
        """
        inputs = self.model.ops.flatten([doc.tensor for doc in docs])
        outputs = self.model(inputs)
        return self.model.ops.unflatten(outputs, [len(d) for d in docs])

    def set_annotations(self, docs, tensors):
        """Set the tensor attribute for a batch of documents.

        docs (iterable): A sequence of `Doc` objects.
        tensors (object): Vector representation for each token in the docs.
        RAISES (ValueError): If a tensor's first dimension differs from the
            number of tokens in its document (error E076).
        """
        for doc, tensor in zip(docs, tensors):
            if tensor.shape[0] != len(doc):
                raise ValueError(Errors.E076.format(rows=tensor.shape[0], words=len(doc)))
            doc.tensor = tensor

    def update(self, examples, state=None, drop=0.0, set_annotations=False, sgd=None, losses=None):
        """Update the model.

        examples (iterable): A batch of training examples, converted with
            `Example.to_example_objects`.
        state: Not used in this method body.
        drop (float): The dropout rate, applied to the model and to every
            input model.
        set_annotations (bool): Not used in this method body.
        sgd (callable): An optimizer. If not None, it is used to finish the
            update of the input models and of the model.
        losses (dict): Optional dict; the loss is added under `self.name`.
        RETURNS: The loss computed by `get_loss`.
        """
        examples = Example.to_example_objects(examples)
        inputs = []
        bp_inputs = []
        set_dropout_rate(self.model, drop)
        # Forward pass through every input model, keeping each backprop
        # callback so the gradient can be routed back afterwards.
        for tok2vec in self.input_models:
            set_dropout_rate(tok2vec, drop)
            tensor, bp_tensor = tok2vec.begin_update([ex.doc for ex in examples])
            inputs.append(tensor)
            bp_inputs.append(bp_tensor)
        # The outputs of the input models are stacked horizontally ...
        inputs = self.model.ops.xp.hstack(inputs)
        scores, bp_scores = self.model.begin_update(inputs)
        loss, d_scores = self.get_loss(examples, scores)
        d_inputs = bp_scores(d_scores, sgd=sgd)
        # ... so the gradient is split into one piece per input model along
        # the same axis before being passed to the matching callback.
        d_inputs = self.model.ops.xp.split(d_inputs, len(self.input_models), axis=1)
        for d_input, bp_input in zip(d_inputs, bp_inputs):
            bp_input(d_input)
        if sgd is not None:
            for tok2vec in self.input_models:
                tok2vec.finish_update(sgd)
            self.model.finish_update(sgd)
        if losses is not None:
            losses.setdefault(self.name, 0.0)
            losses[self.name] += loss
        return loss

    def get_loss(self, examples, prediction):
        """Compute the loss and gradient of the prediction against the
        vectors of the tokens in the examples.

        examples (iterable): A batch of training examples.
        prediction: The model output for the batch.
        RETURNS (tuple): `(loss, d_scores)`, where `d_scores` is the
            difference between prediction and target divided by the number
            of rows in the prediction, and `loss` is the sum of the squared
            entries of `d_scores`.
        """
        examples = Example.to_example_objects(examples)
        ids = self.model.ops.flatten([ex.doc.to_array(ID).ravel() for ex in examples])
        # The regression target is the row of the vectors table selected by
        # each token's ID attribute.
        target = self.vocab.vectors.data[ids]
        d_scores = (prediction - target) / prediction.shape[0]
        loss = (d_scores ** 2).sum()
        return loss, d_scores

    def begin_training(self, get_examples=lambda: [], pipeline=None, sgd=None, **kwargs):
        """Allocate models, pre-process training data and acquire an
        optimizer.

        get_examples (callable): Not used in this method body.
        pipeline (list): The pipeline the model is part of, iterated as
            `(name, model)` pairs. Every model that has a "tok2vec"
            reference contributes it as an input model.
        sgd (callable): An optimizer. If None, one is created with
            `self.create_optimizer()`.
        RETURNS (callable): The optimizer.
        """
        if pipeline is not None:
            for name, model in pipeline:
                if model.has_ref("tok2vec"):
                    self.input_models.append(model.get_ref("tok2vec"))
        self.model.initialize()
        link_vectors_to_models(self.vocab)
        if sgd is None:
            sgd = self.create_optimizer()
        return sgd
|
||||
|
||||
|
||||
@component("tagger", assigns=["token.tag", "token.pos", "token.lemma"], default_model=default_tagger)
|
||||
class Tagger(Pipe):
|
||||
"""Pipeline component for part-of-speech tagging.
|
||||
|
@ -1707,4 +1575,4 @@ def ner_factory(nlp, model, **cfg):
|
|||
warnings.warn(Warnings.W098.format(name="ner"))
|
||||
return EntityRecognizer.from_nlp(nlp, model, **cfg)
|
||||
|
||||
__all__ = ["Tagger", "DependencyParser", "EntityRecognizer", "Tensorizer", "TextCategorizer", "EntityLinker", "Sentencizer", "SentenceRecognizer"]
|
||||
__all__ = ["Tagger", "DependencyParser", "EntityRecognizer", "TextCategorizer", "EntityLinker", "Sentencizer", "SentenceRecognizer"]
|
||||
|
|
|
@ -1,7 +1,7 @@
|
|||
import pytest
|
||||
from spacy.pipeline import Tagger, DependencyParser, EntityRecognizer
|
||||
from spacy.pipeline import Tensorizer, TextCategorizer, SentenceRecognizer
|
||||
from spacy.pipeline.defaults import default_parser, default_tensorizer, default_tagger
|
||||
from spacy.pipeline import TextCategorizer, SentenceRecognizer
|
||||
from spacy.pipeline.defaults import default_parser, default_tagger
|
||||
from spacy.pipeline.defaults import default_textcat, default_senter
|
||||
|
||||
from ..util import make_tempdir
|
||||
|
@ -95,24 +95,6 @@ def test_serialize_tagger_roundtrip_disk(en_vocab, taggers):
|
|||
assert tagger1_d.to_bytes() == tagger2_d.to_bytes()
|
||||
|
||||
|
||||
def test_serialize_tensorizer_roundtrip_bytes(en_vocab):
    """A tensorizer restored from bytes serializes to the same bytes."""
    source = Tensorizer(en_vocab, default_tensorizer())
    expected_bytes = source.to_bytes(exclude=["vocab"])
    restored = Tensorizer(en_vocab, default_tensorizer())
    restored = restored.from_bytes(expected_bytes)
    actual_bytes = restored.to_bytes(exclude=["vocab"])
    assert actual_bytes == expected_bytes
|
||||
|
||||
|
||||
def test_serialize_tensorizer_roundtrip_disk(en_vocab):
    """A tensorizer restored from disk serializes to the same bytes."""
    source = Tensorizer(en_vocab, default_tensorizer())
    with make_tempdir() as tmp_dir:
        target_path = tmp_dir / "tensorizer"
        source.to_disk(target_path)
        restored = Tensorizer(en_vocab, default_tensorizer()).from_disk(target_path)
        expected_bytes = source.to_bytes(exclude=["vocab"])
        actual_bytes = restored.to_bytes(exclude=["vocab"])
        assert expected_bytes == actual_bytes
|
||||
|
||||
|
||||
def test_serialize_textcat_empty(en_vocab):
|
||||
# See issue #1105
|
||||
textcat = TextCategorizer(
|
||||
|
|
Loading…
Reference in New Issue
Block a user