mirror of
https://github.com/explosion/spaCy.git
synced 2025-01-26 01:04:34 +03:00
remove Tensorizer
This commit is contained in:
parent
b5ae2edcba
commit
e0f9f448f1
|
@ -1,212 +0,0 @@
|
||||||
"""This script is experimental.
|
|
||||||
|
|
||||||
Try pre-training the CNN component of the text categorizer using a cheap
|
|
||||||
language modelling-like objective. Specifically, we load pretrained vectors
|
|
||||||
(from something like word2vec, GloVe, FastText etc), and use the CNN to
|
|
||||||
predict the tokens' pretrained vectors. This isn't as easy as it sounds:
|
|
||||||
we're not merely doing compression here, because heavy dropout is applied,
|
|
||||||
including over the input words. This means the model must often (50% of the time)
|
|
||||||
use the context in order to predict the word.
|
|
||||||
|
|
||||||
To evaluate the technique, we're pre-training with the 50k texts from the IMDB
|
|
||||||
corpus, and then training with only 100 labels. Note that it's a bit dirty to
|
|
||||||
pre-train with the development data, but also not *so* terrible: we're not using
|
|
||||||
the development labels, after all --- only the unlabelled text.
|
|
||||||
"""
|
|
||||||
import plac
|
|
||||||
import tqdm
|
|
||||||
import random
|
|
||||||
|
|
||||||
import ml_datasets
|
|
||||||
|
|
||||||
import spacy
|
|
||||||
from spacy.util import minibatch
|
|
||||||
from spacy.pipeline import TextCategorizer
|
|
||||||
from spacy.ml.models.tok2vec import build_Tok2Vec_model
|
|
||||||
import numpy
|
|
||||||
|
|
||||||
|
|
||||||
def load_texts(limit=0):
    """Load raw IMDB review texts for unsupervised pre-training.

    limit (int): If >= 1, return only that many (shuffled) training texts.
        Otherwise return every training text followed by every development
        text.
    RETURNS (list): The selected texts. Labels are discarded, because
        pre-training only uses the unlabelled text.
    """
    train, dev = ml_datasets.imdb()
    train_texts, _ = zip(*train)
    # Unpack the development split here. Unpacking `train` a second time
    # would silently duplicate the training texts and drop the dev texts.
    dev_texts, _ = zip(*dev)
    train_texts = list(train_texts)
    dev_texts = list(dev_texts)
    random.shuffle(train_texts)
    random.shuffle(dev_texts)
    if limit >= 1:
        return train_texts[:limit]
    return train_texts + dev_texts
|
|
||||||
|
|
||||||
|
|
||||||
def load_textcat_data(limit=0):
    """Load labelled IMDB examples for training and evaluating the textcat.

    limit (int): Keep only the last `limit` training examples after
        shuffling. With the default of 0 the slice `[-0:]` keeps all of them.
    RETURNS (tuple): `((texts, cats), (eval_texts, eval_cats))`, where every
        cats entry is a dict with boolean "POSITIVE" and "NEGATIVE" values.
    """

    def as_cats(label):
        # NOTE(review): relies on the truthiness of the raw label -- confirm
        # that ml_datasets.imdb() yields 0/1-style labels and not strings.
        return {"POSITIVE": bool(label), "NEGATIVE": not bool(label)}

    # The evaluation examples come from the dataset's own second split; the
    # training split is only shuffled and truncated.
    train_data, eval_data = ml_datasets.imdb()
    random.shuffle(train_data)
    texts, labels = zip(*train_data[-limit:])
    eval_texts, eval_labels = zip(*eval_data)
    cats = [as_cats(y) for y in labels]
    eval_cats = [as_cats(y) for y in eval_labels]
    return (texts, cats), (eval_texts, eval_cats)
|
|
||||||
|
|
||||||
|
|
||||||
def prefer_gpu():
    """Try to activate GPU 0 and report whether it is in use.

    RETURNS (bool): False if `spacy.util.use_gpu(0)` returned None, True
        otherwise. In the True case the cupy random generator is seeded
        with 0.
    """
    if spacy.util.use_gpu(0) is None:
        return False
    # Imported lazily so the script still runs when cupy is not installed.
    import cupy.random

    cupy.random.seed(0)
    return True
|
|
||||||
|
|
||||||
|
|
||||||
def build_textcat_model(tok2vec, nr_class, width):
    """Build a text classifier on top of a token-to-vector model.

    tok2vec (Model): The token-to-vector layer placed first in the chain.
    nr_class (int): Passed to `Softmax` as its first argument.
    width (int): Passed to `Softmax` as its second argument.
    RETURNS (Model): The chained model, with `tok2vec` registered under the
        "tok2vec" reference so callers can retrieve it via `get_ref`.
    """
    from thinc.api import Model, Softmax, chain, reduce_mean, list2ragged

    # Bind ">>" to `chain` for the duration of the expression below.
    with Model.define_operators({">>": chain}):
        classifier = (
            tok2vec >> list2ragged() >> reduce_mean() >> Softmax(nr_class, width)
        )
    classifier.set_ref("tok2vec", tok2vec)
    return classifier
|
|
||||||
|
|
||||||
|
|
||||||
def block_gradients(model):
    """Wrap `model` so that its forward pass runs but no gradient flows back.

    NOTE(review): the `wrap` import is marked "TODO FIX" by the original
    author -- confirm that `thinc.api` still exports it before relying on
    this helper.

    model: The model to wrap.
    RETURNS: The result of `wrap(forward, model)`.
    """
    from thinc.api import wrap  # TODO FIX

    def forward(X, drop=0.0):
        # Keep the output, drop the backprop callback: returning None in its
        # place is what blocks the gradient.
        output, _backprop = model.begin_update(X, drop=drop)
        return output, None

    return wrap(forward, model)
|
|
||||||
|
|
||||||
|
|
||||||
def create_pipeline(width, embed_size, vectors_model):
    """Load the vectors model and attach a two-label text categorizer.

    width (int): Width passed to the tok2vec model and the output layer.
    embed_size (int): Embedding size passed to the tok2vec model.
    vectors_model (str): Name or path handed to `spacy.load`.
    RETURNS: The loaded pipeline with the text categorizer added.
    """
    print("Load vectors")
    nlp = spacy.load(vectors_model)
    print("Start training")
    vocab = nlp.vocab
    # TODO: replace with config version
    tok2vec = build_Tok2Vec_model(width=width, embed_size=embed_size)
    textcat_model = build_textcat_model(tok2vec, 2, width)
    textcat = TextCategorizer(
        vocab, labels=["POSITIVE", "NEGATIVE"], model=textcat_model
    )
    nlp.add_pipe(textcat)
    return nlp
|
|
||||||
|
|
||||||
|
|
||||||
def train_tensorizer(nlp, texts, dropout, n_iter):
    """Pre-train a tensorizer component on unlabelled texts.

    nlp: The pipeline; a "tensorizer" component is created and added to it.
    texts (list): Raw texts to pre-train on.
    dropout (float): Dropout rate passed to every update.
    n_iter (int): Number of passes over `texts`.
    RETURNS: The optimizer returned by `nlp.begin_training()`.
    """
    tensorizer = nlp.create_pipe("tensorizer")
    nlp.add_pipe(tensorizer)
    optimizer = nlp.begin_training()
    # Neither loop needs its index. The inner loop previously re-bound the
    # outer loop's `i` through an unnecessary `enumerate`.
    for _ in range(n_iter):
        losses = {}
        for batch in minibatch(tqdm.tqdm(texts)):
            docs = [nlp.make_doc(text) for text in batch]
            tensorizer.update((docs, None), losses=losses, sgd=optimizer, drop=dropout)
        # Report the accumulated losses once per pass.
        print(losses)
    return optimizer
|
|
||||||
|
|
||||||
|
|
||||||
def train_textcat(nlp, n_texts, n_iter=10):
    """Train the pipeline's "textcat" component and print a score table.

    nlp: Pipeline that already contains a "textcat" component.
    n_texts (int): Passed to `load_textcat_data` as the training limit.
    n_iter (int): Number of training passes.
    """
    textcat = nlp.get_pipe("textcat")
    # Snapshot the tok2vec weights now and load them back after
    # begin_training(). Presumably begin_training() re-initialises them;
    # confirm against the Language implementation.
    pretrained_tok2vec = textcat.model.get_ref("tok2vec").to_bytes()
    (train_texts, train_cats), (dev_texts, dev_cats) = load_textcat_data(limit=n_texts)
    summary = "Using {} examples ({} training, {} evaluation)"
    print(summary.format(n_texts, len(train_texts), len(dev_texts)))
    annotations = [{"cats": cats} for cats in train_cats]
    train_data = list(zip(train_texts, annotations))

    header = "{:^5}\t{:^5}\t{:^5}\t{:^5}"
    row = "{0:.3f}\t{1:.3f}\t{2:.3f}\t{3:.3f}"  # one line of the score table
    with nlp.select_pipes(enable="textcat"):  # only train textcat
        optimizer = nlp.begin_training()
        textcat.model.get_ref("tok2vec").from_bytes(pretrained_tok2vec)
        print("Training the model...")
        print(header.format("LOSS", "P", "R", "F"))
        for _ in range(n_iter):
            losses = {"textcat": 0.0}
            # Feed the examples through in minibatches of two.
            for batch in minibatch(tqdm.tqdm(train_data), size=2):
                nlp.update(batch, sgd=optimizer, drop=0.2, losses=losses)
            # Evaluate on the dev split with the optimizer's averaged
            # parameters swapped in.
            with textcat.model.use_params(optimizer.averages):
                scores = evaluate_textcat(nlp.tokenizer, textcat, dev_texts, dev_cats)
            print(
                row.format(
                    losses["textcat"],
                    scores["textcat_p"],
                    scores["textcat_r"],
                    scores["textcat_f"],
                )
            )
|
|
||||||
|
|
||||||
|
|
||||||
def evaluate_textcat(tokenizer, textcat, texts, cats):
    """Score a text categorizer against gold-standard categories.

    tokenizer (callable): Turns one text into a doc.
    textcat: Component whose `pipe` method yields docs carrying a `cats`
        dict of label -> score.
    texts (iterable): The raw texts to classify.
    cats (list): Gold category dicts, aligned with `texts`. Labels that are
        missing from a gold dict are skipped.
    RETURNS (dict): Precision, recall and F-score under the keys
        "textcat_p", "textcat_r" and "textcat_f".
    """
    docs = (tokenizer(text) for text in texts)
    # Start each count at a tiny epsilon so the ratios below can never
    # divide by zero, even when nothing was scored.
    tp = 1e-8
    fp = 1e-8
    fn = 1e-8
    for i, doc in enumerate(textcat.pipe(docs)):
        gold = cats[i]
        for label, score in doc.cats.items():
            if label not in gold:
                continue
            predicted = score >= 0.5
            expected = gold[label] >= 0.5
            if predicted and expected:
                tp += 1.0
            elif predicted:
                fp += 1.0
            elif expected:
                fn += 1.0
            # True negatives do not enter precision or recall, so they are
            # not counted.
    precision = tp / (tp + fp)
    recall = tp / (tp + fn)
    f_score = 2 * (precision * recall) / (precision + recall)
    return {"textcat_p": precision, "textcat_r": recall, "textcat_f": f_score}
|
|
||||||
|
|
||||||
|
|
||||||
@plac.annotations(
    width=("Width of CNN layers", "positional", None, int),
    embed_size=("Embedding rows", "positional", None, int),
    pretrain_iters=("Number of iterations to pretrain", "option", "pn", int),
    # Help text previously said "pretrain" here too (copy-paste slip).
    train_iters=("Number of iterations to train", "option", "tn", int),
    train_examples=("Number of labelled examples", "option", "eg", int),
    vectors_model=("Name or path to vectors model to learn from"),
)
def main(
    width,
    embed_size,
    vectors_model,
    pretrain_iters=30,
    train_iters=30,
    train_examples=1000,
):
    """Pre-train a tensorizer on IMDB text, then train a text categorizer."""
    # Seed both generators so runs are repeatable.
    random.seed(0)
    numpy.random.seed(0)
    use_gpu = prefer_gpu()
    print("Using GPU?", use_gpu)

    nlp = create_pipeline(width, embed_size, vectors_model)
    print("Load data")
    texts = load_texts(limit=0)
    print("Train tensorizer")
    # The returned optimizer is not needed: train_textcat creates its own.
    train_tensorizer(nlp, texts, dropout=0.2, n_iter=pretrain_iters)
    print("Train textcat")
    train_textcat(nlp, train_examples, n_iter=train_iters)


if __name__ == "__main__":
    plac.call(main)
|
|
|
@ -225,10 +225,6 @@ class Language(object):
|
||||||
|
|
||||||
# Conveniences to access pipeline components
|
# Conveniences to access pipeline components
|
||||||
# Shouldn't be used anymore!
|
# Shouldn't be used anymore!
|
||||||
@property
def tensorizer(self):
    """Return the pipeline component registered as "tensorizer".

    Convenience accessor; per the comment above this group of properties,
    it should not be used anymore.
    """
    return self.get_pipe("tensorizer")
|
|
||||||
|
|
||||||
@property
|
@property
|
||||||
def tagger(self):
|
def tagger(self):
|
||||||
return self.get_pipe("tagger")
|
return self.get_pipe("tagger")
|
||||||
|
|
|
@ -2,6 +2,5 @@ from .entity_linker import * # noqa
|
||||||
from .parser import * # noqa
|
from .parser import * # noqa
|
||||||
from .simple_ner import *
|
from .simple_ner import *
|
||||||
from .tagger import * # noqa
|
from .tagger import * # noqa
|
||||||
from .tensorizer import * # noqa
|
|
||||||
from .textcat import * # noqa
|
from .textcat import * # noqa
|
||||||
from .tok2vec import * # noqa
|
from .tok2vec import * # noqa
|
||||||
|
|
|
@ -1,10 +0,0 @@
|
||||||
from thinc.api import Linear, zero_init
|
|
||||||
|
|
||||||
from ... import util
|
|
||||||
from ...util import registry
|
|
||||||
|
|
||||||
|
|
||||||
@registry.architectures.register("spacy.Tensorizer.v1")
def build_tensorizer(input_size, output_size):
    """Build the linear layer registered as "spacy.Tensorizer.v1".

    input_size (int): Input width, passed to `util.env_opt` as the fallback
        for the "token_vector_width" option.
    output_size (int): Output width of the layer.
    RETURNS (Model): A `Linear` layer whose weights use `zero_init`.
    """
    resolved_input_size = util.env_opt("token_vector_width", input_size)
    return Linear(output_size, resolved_input_size, init_W=zero_init)
|
|
|
@ -1,5 +1,5 @@
|
||||||
from .pipes import Tagger, DependencyParser, EntityRecognizer, EntityLinker
|
from .pipes import Tagger, DependencyParser, EntityRecognizer, EntityLinker
|
||||||
from .pipes import TextCategorizer, Tensorizer, Pipe, Sentencizer
|
from .pipes import TextCategorizer, Pipe, Sentencizer
|
||||||
from .pipes import SentenceRecognizer
|
from .pipes import SentenceRecognizer
|
||||||
from .simple_ner import SimpleNER
|
from .simple_ner import SimpleNER
|
||||||
from .morphologizer import Morphologizer
|
from .morphologizer import Morphologizer
|
||||||
|
@ -14,7 +14,6 @@ __all__ = [
|
||||||
"EntityRecognizer",
|
"EntityRecognizer",
|
||||||
"EntityLinker",
|
"EntityLinker",
|
||||||
"TextCategorizer",
|
"TextCategorizer",
|
||||||
"Tensorizer",
|
|
||||||
"Tok2Vec",
|
"Tok2Vec",
|
||||||
"Pipe",
|
"Pipe",
|
||||||
"Morphologizer",
|
"Morphologizer",
|
||||||
|
|
|
@ -63,16 +63,6 @@ def default_tagger():
|
||||||
return util.load_config(loc, create_objects=True)["model"]
|
return util.load_config(loc, create_objects=True)["model"]
|
||||||
|
|
||||||
|
|
||||||
def default_tensorizer_config():
    """Load `tensorizer_defaults.cfg` (next to this module) without creating objects."""
    config_path = Path(__file__).with_name("tensorizer_defaults.cfg")
    return util.load_config(config_path, create_objects=False)
|
|
||||||
|
|
||||||
|
|
||||||
def default_tensorizer():
    """Load `tensorizer_defaults.cfg` with objects created and return its "model" entry."""
    config_path = Path(__file__).with_name("tensorizer_defaults.cfg")
    loaded = util.load_config(config_path, create_objects=True)
    return loaded["model"]
|
|
||||||
|
|
||||||
|
|
||||||
def default_textcat_config():
|
def default_textcat_config():
|
||||||
loc = Path(__file__).parent / "textcat_defaults.cfg"
|
loc = Path(__file__).parent / "textcat_defaults.cfg"
|
||||||
return util.load_config(loc, create_objects=False)
|
return util.load_config(loc, create_objects=False)
|
||||||
|
|
|
@ -1,4 +0,0 @@
|
||||||
[model]
|
|
||||||
@architectures = "spacy.Tensorizer.v1"
|
|
||||||
input_size=96
|
|
||||||
output_size=300
|
|
|
@ -44,8 +44,8 @@ class SentenceSegmenter(object):
|
||||||
class SimilarityHook(Pipe):
|
class SimilarityHook(Pipe):
|
||||||
"""
|
"""
|
||||||
Experimental: A pipeline component to install a hook for supervised
|
Experimental: A pipeline component to install a hook for supervised
|
||||||
similarity into `Doc` objects. Requires a `Tensorizer` to pre-process
|
similarity into `Doc` objects.
|
||||||
documents. The similarity model can be any object obeying the Thinc `Model`
|
The similarity model can be any object obeying the Thinc `Model`
|
||||||
interface. By default, the model concatenates the elementwise mean and
|
interface. By default, the model concatenates the elementwise mean and
|
||||||
elementwise max of the two tensors, and compares them using the
|
elementwise max of the two tensors, and compares them using the
|
||||||
Cauchy-like similarity function from Chen (2013):
|
Cauchy-like similarity function from Chen (2013):
|
||||||
|
@ -82,7 +82,7 @@ class SimilarityHook(Pipe):
|
||||||
sims, bp_sims = self.model.begin_update(doc1_doc2)
|
sims, bp_sims = self.model.begin_update(doc1_doc2)
|
||||||
|
|
||||||
def begin_training(self, _=tuple(), pipeline=None, sgd=None, **kwargs):
|
def begin_training(self, _=tuple(), pipeline=None, sgd=None, **kwargs):
|
||||||
"""Allocate model, using width from tensorizer in pipeline.
|
"""Allocate model, using nO from the first model in the pipeline.
|
||||||
|
|
||||||
gold_tuples (iterable): Gold-standard training data.
|
gold_tuples (iterable): Gold-standard training data.
|
||||||
pipeline (list): The pipeline the model is part of.
|
pipeline (list): The pipeline the model is part of.
|
||||||
|
|
|
@ -16,7 +16,7 @@ from ..morphology cimport Morphology
|
||||||
from ..vocab cimport Vocab
|
from ..vocab cimport Vocab
|
||||||
|
|
||||||
from .defaults import default_tagger, default_parser, default_ner, default_textcat
|
from .defaults import default_tagger, default_parser, default_ner, default_textcat
|
||||||
from .defaults import default_nel, default_senter, default_tensorizer
|
from .defaults import default_nel, default_senter
|
||||||
from .functions import merge_subtokens
|
from .functions import merge_subtokens
|
||||||
from ..language import Language, component
|
from ..language import Language, component
|
||||||
from ..syntax import nonproj
|
from ..syntax import nonproj
|
||||||
|
@ -238,138 +238,6 @@ class Pipe(object):
|
||||||
return self
|
return self
|
||||||
|
|
||||||
|
|
||||||
@component("tensorizer", assigns=["doc.tensor"], default_model=default_tensorizer)
class Tensorizer(Pipe):
    """Pre-train position-sensitive vectors for tokens."""

    def __init__(self, vocab, model, **cfg):
        """Construct a new statistical model. Weights are not allocated on
        initialisation.

        vocab (Vocab): A `Vocab` instance. The model must share the same
            `Vocab` instance with the `Doc` objects it will process.
        model: The model applied to the input tensors.
        **cfg: Config parameters.
        """
        self.vocab = vocab
        self.model = model
        # Upstream tok2vec models; populated by begin_training().
        self.input_models = []
        self.cfg = dict(cfg)

    def __call__(self, example):
        """Add context-sensitive vectors to a `Doc`, e.g. from a CNN or LSTM
        model. Vectors are set to the `Doc.tensor` attribute.

        example (Doc or Example): The document to add vectors to.
        RETURNS (Doc or Example): The `Example` if one was passed in,
            otherwise the processed `Doc`.
        """
        doc = self._get_doc(example)
        tokvecses = self.predict([doc])
        self.set_annotations([doc], tokvecses)
        if isinstance(example, Example):
            example.doc = doc
            return example
        return doc

    def pipe(self, stream, batch_size=128, n_threads=-1, as_example=False):
        """Process `Doc` objects as a stream.

        stream (iterator): A sequence of `Doc` or `Example` objects to process.
        batch_size (int): Number of `Doc` or `Example` objects to group.
        n_threads (int): Not used by this method.
        as_example (bool): If True, yield the input `Example` objects with
            their `doc` replaced; otherwise yield the `Doc` objects.
        YIELDS (iterator): A sequence of `Doc` or `Example` objects, in order of input.
        """
        for examples in util.minibatch(stream, size=batch_size):
            docs = [self._get_doc(ex) for ex in examples]
            tensors = self.predict(docs)
            self.set_annotations(docs, tensors)

            if as_example:
                for ex, doc in zip(examples, docs):
                    ex.doc = doc
                    yield ex
            else:
                yield from docs

    def predict(self, docs):
        """Apply the model to the existing tensors of a batch of documents.

        docs (iterable): A sequence of `Doc` objects.
        RETURNS (object): Vector representations for each token in the docs,
            split back into one entry per doc.
        """
        # Flatten all docs into one array, run the model once, then split
        # the output by document length.
        inputs = self.model.ops.flatten([doc.tensor for doc in docs])
        outputs = self.model(inputs)
        return self.model.ops.unflatten(outputs, [len(d) for d in docs])

    def set_annotations(self, docs, tensors):
        """Set the tensor attribute for a batch of documents.

        docs (iterable): A sequence of `Doc` objects.
        tensors (object): Vector representation for each token in the docs.
        RAISES (ValueError): If a tensor's first dimension does not match
            the number of tokens in its doc.
        """
        for doc, tensor in zip(docs, tensors):
            if tensor.shape[0] != len(doc):
                raise ValueError(Errors.E076.format(rows=tensor.shape[0], words=len(doc)))
            doc.tensor = tensor

    def update(self, examples, state=None, drop=0.0, set_annotations=False, sgd=None, losses=None):
        """Update the model.

        examples (iterable): A batch of examples, converted with
            `Example.to_example_objects`.
        state: Not used by this method.
        drop (float): The dropout rate.
        set_annotations (bool): Not used by this method.
        sgd (callable): An optimizer.
        losses (dict): Optional dict; the loss is added under `self.name`.
        RETURNS (float): The loss for this batch.
        """
        examples = Example.to_example_objects(examples)
        inputs = []
        bp_inputs = []
        set_dropout_rate(self.model, drop)
        # Run every upstream tok2vec model, keeping its backprop callback.
        for tok2vec in self.input_models:
            set_dropout_rate(tok2vec, drop)
            tensor, bp_tensor = tok2vec.begin_update([ex.doc for ex in examples])
            inputs.append(tensor)
            bp_inputs.append(bp_tensor)
        # The outputs of all input models are stacked side by side...
        inputs = self.model.ops.xp.hstack(inputs)
        scores, bp_scores = self.model.begin_update(inputs)
        loss, d_scores = self.get_loss(examples, scores)
        d_inputs = bp_scores(d_scores, sgd=sgd)
        # ...so the input gradient is split along axis 1 into one piece per
        # input model before being passed back.
        d_inputs = self.model.ops.xp.split(d_inputs, len(self.input_models), axis=1)
        for d_input, bp_input in zip(d_inputs, bp_inputs):
            bp_input(d_input)
        if sgd is not None:
            for tok2vec in self.input_models:
                tok2vec.finish_update(sgd)
            self.model.finish_update(sgd)
        if losses is not None:
            losses.setdefault(self.name, 0.0)
            losses[self.name] += loss
        return loss

    def get_loss(self, examples, prediction):
        """Compute the loss and gradient of a prediction.

        examples (iterable): A batch of examples.
        prediction (object): The model output for the batch.
        RETURNS (tuple): `(loss, d_scores)`.
        """
        examples = Example.to_example_objects(examples)
        ids = self.model.ops.flatten([ex.doc.to_array(ID).ravel() for ex in examples])
        # The target is the row of the vocab's vectors table selected by each id.
        target = self.vocab.vectors.data[ids]
        # Difference to the target, divided by the number of rows predicted.
        d_scores = (prediction - target) / prediction.shape[0]
        loss = (d_scores ** 2).sum()
        return loss, d_scores

    def begin_training(self, get_examples=lambda: [], pipeline=None, sgd=None, **kwargs):
        """Allocate models, pre-process training data and acquire an
        optimizer.

        get_examples (callable): Not used by this method.
        pipeline (list): The pipeline the model is part of, as
            `(name, model)` pairs.
        sgd (callable): An optimizer; one is created if None.
        RETURNS (callable): The optimizer.
        """
        if pipeline is not None:
            # Collect the tok2vec reference of every model that has one;
            # update() runs these to produce this component's inputs.
            for name, model in pipeline:
                if model.has_ref("tok2vec"):
                    self.input_models.append(model.get_ref("tok2vec"))
        self.model.initialize()
        link_vectors_to_models(self.vocab)
        if sgd is None:
            sgd = self.create_optimizer()
        return sgd
|
|
||||||
|
|
||||||
|
|
||||||
@component("tagger", assigns=["token.tag", "token.pos", "token.lemma"], default_model=default_tagger)
|
@component("tagger", assigns=["token.tag", "token.pos", "token.lemma"], default_model=default_tagger)
|
||||||
class Tagger(Pipe):
|
class Tagger(Pipe):
|
||||||
"""Pipeline component for part-of-speech tagging.
|
"""Pipeline component for part-of-speech tagging.
|
||||||
|
@ -1707,4 +1575,4 @@ def ner_factory(nlp, model, **cfg):
|
||||||
warnings.warn(Warnings.W098.format(name="ner"))
|
warnings.warn(Warnings.W098.format(name="ner"))
|
||||||
return EntityRecognizer.from_nlp(nlp, model, **cfg)
|
return EntityRecognizer.from_nlp(nlp, model, **cfg)
|
||||||
|
|
||||||
__all__ = ["Tagger", "DependencyParser", "EntityRecognizer", "Tensorizer", "TextCategorizer", "EntityLinker", "Sentencizer", "SentenceRecognizer"]
|
__all__ = ["Tagger", "DependencyParser", "EntityRecognizer", "TextCategorizer", "EntityLinker", "Sentencizer", "SentenceRecognizer"]
|
||||||
|
|
|
@ -1,7 +1,7 @@
|
||||||
import pytest
|
import pytest
|
||||||
from spacy.pipeline import Tagger, DependencyParser, EntityRecognizer
|
from spacy.pipeline import Tagger, DependencyParser, EntityRecognizer
|
||||||
from spacy.pipeline import Tensorizer, TextCategorizer, SentenceRecognizer
|
from spacy.pipeline import TextCategorizer, SentenceRecognizer
|
||||||
from spacy.pipeline.defaults import default_parser, default_tensorizer, default_tagger
|
from spacy.pipeline.defaults import default_parser, default_tagger
|
||||||
from spacy.pipeline.defaults import default_textcat, default_senter
|
from spacy.pipeline.defaults import default_textcat, default_senter
|
||||||
|
|
||||||
from ..util import make_tempdir
|
from ..util import make_tempdir
|
||||||
|
@ -95,24 +95,6 @@ def test_serialize_tagger_roundtrip_disk(en_vocab, taggers):
|
||||||
assert tagger1_d.to_bytes() == tagger2_d.to_bytes()
|
assert tagger1_d.to_bytes() == tagger2_d.to_bytes()
|
||||||
|
|
||||||
|
|
||||||
def test_serialize_tensorizer_roundtrip_bytes(en_vocab):
    """A tensorizer restored from bytes serializes to the same bytes."""
    payload = Tensorizer(en_vocab, default_tensorizer()).to_bytes(exclude=["vocab"])
    restored = Tensorizer(en_vocab, default_tensorizer()).from_bytes(payload)
    assert restored.to_bytes(exclude=["vocab"]) == payload
|
|
||||||
|
|
||||||
|
|
||||||
def test_serialize_tensorizer_roundtrip_disk(en_vocab):
    """A tensorizer restored from disk serializes to the same bytes."""
    saved = Tensorizer(en_vocab, default_tensorizer())
    with make_tempdir() as tmp_dir:
        target = tmp_dir / "tensorizer"
        saved.to_disk(target)
        loaded = Tensorizer(en_vocab, default_tensorizer()).from_disk(target)
        expected = saved.to_bytes(exclude=["vocab"])
        assert expected == loaded.to_bytes(exclude=["vocab"])
|
|
||||||
|
|
||||||
|
|
||||||
def test_serialize_textcat_empty(en_vocab):
|
def test_serialize_textcat_empty(en_vocab):
|
||||||
# See issue #1105
|
# See issue #1105
|
||||||
textcat = TextCategorizer(
|
textcat = TextCategorizer(
|
||||||
|
|
Loading…
Reference in New Issue
Block a user