diff --git a/spacy/_ml.py b/spacy/_ml.py index 5f8ce9470..e60e8a610 100644 --- a/spacy/_ml.py +++ b/spacy/_ml.py @@ -346,16 +346,16 @@ def get_token_vectors(tokens_attrs_vectors, drop=0.): def fine_tune(model1, combine=None): - def fine_tune_fwd(docs, drop=0.): + def fine_tune_fwd(docs_tokvecs, drop=0.): + docs, tokvecs = docs_tokvecs + lengths = model.ops.asarray([len(doc) for doc in docs], dtype='i') X1, bp_X1 = model1.begin_update(docs) - lengths = [len(doc) for doc in docs] - X2 = model1.ops.flatten(X1) def fine_tune_bwd(d_output, sgd=None): - bp_X1(d_output, sgd=sgd) + bp_X1(model1.ops.flatten(d_output), sgd=sgd) return d_output - return (X1+X2, lengths), fine_tune_bwd + return model1.ops.unflatten(X1+X2, lengths), fine_tune_bwd model = wrap(fine_tune_fwd) return model @@ -410,30 +410,21 @@ def preprocess_doc(docs, drop=0.): def build_tagger_model(nr_class, token_vector_width, **cfg): with Model.define_operators({'>>': chain, '+': add}): # Input: (doc, tensor) tuples - embed_docs = with_getitem(0, + embed_docs = ( FeatureExtracter([NORM]) + >> flatten >> HashEmbed(token_vector_width, 1000) - >> flatten_add_lengths ) model = ( fine_tune(embed_docs) - >> - with_getitem(0, - FeatureExtracter([NORM]) - >> HashEmbed(token_vector_width, 1000) - >> flatten_add_lengths - ) - >> with_getitem(1, - flatten_add_lengths) - >> add_tuples >> with_flatten( Maxout(token_vector_width, token_vector_width) >> Softmax(nr_class, token_vector_width) ) ) - return model - + model.nI = None + return model def build_text_classifier(nr_class, width=64, **cfg): diff --git a/spacy/pipeline.pyx b/spacy/pipeline.pyx index b96387351..848653c5c 100644 --- a/spacy/pipeline.pyx +++ b/spacy/pipeline.pyx @@ -253,23 +253,25 @@ class NeuralTagger(BaseThincComponent): self.cfg = dict(cfg) def __call__(self, doc): - tags = self.predict([doc.tensor]) + tags = self.predict(([doc], [doc.tensor])) self.set_annotations([doc], tags) return doc def pipe(self, stream, batch_size=128, n_threads=-1): for docs in cytoolz.partition_all(batch_size, stream): + docs = list(docs) tokvecs = [d.tensor for d in docs] - tag_ids = self.predict(tokvecs) + tag_ids = self.predict((docs, tokvecs)) self.set_annotations(docs, tag_ids) yield from docs - def predict(self, tokvecs): - scores = self.model(tokvecs) + def predict(self, docs_tokvecs): + scores = self.model(docs_tokvecs) scores = self.model.ops.flatten(scores) guesses = scores.argmax(axis=1) if not isinstance(guesses, numpy.ndarray): guesses = guesses.get() + tokvecs = docs_tokvecs[1] guesses = self.model.ops.unflatten(guesses, [tv.shape[0] for tv in tokvecs]) return guesses @@ -295,7 +297,7 @@ class NeuralTagger(BaseThincComponent): if self.model.nI is None: self.model.nI = tokvecs[0].shape[1] - tag_scores, bp_tag_scores = self.model.begin_update(tokvecs, drop=drop) + tag_scores, bp_tag_scores = self.model.begin_update(docs_tokvecs, drop=drop) loss, d_tag_scores = self.get_loss(docs, golds, tag_scores) d_tokvecs = bp_tag_scores(d_tag_scores, sgd=sgd)