Restore changes to pipeline.pyx from nn-beam-parser branch

Matthew Honnibal 2017-08-18 22:02:35 +02:00
parent 931509d96a
commit ec482580b5

spacy/pipeline.pyx

@@ -42,7 +42,7 @@ from .compat import json_dumps
 from .attrs import ID, LOWER, PREFIX, SUFFIX, SHAPE, TAG, DEP, POS
 from ._ml import rebatch, Tok2Vec, flatten, get_col, doc2feats
-from ._ml import build_text_classifier
+from ._ml import build_text_classifier, build_tagger_model
 from .parts_of_speech import X
@@ -138,7 +138,7 @@ class TokenVectorEncoder(BaseThincComponent):
     name = 'tensorizer'

     @classmethod
-    def Model(cls, width=128, embed_size=7500, **cfg):
+    def Model(cls, width=128, embed_size=4000, **cfg):
         """Create a new statistical model for the class.

         width (int): Output size of the model.
@@ -253,23 +253,25 @@ class NeuralTagger(BaseThincComponent):
         self.cfg = dict(cfg)

     def __call__(self, doc):
-        tags = self.predict([doc.tensor])
+        tags = self.predict(([doc], [doc.tensor]))
         self.set_annotations([doc], tags)
         return doc

     def pipe(self, stream, batch_size=128, n_threads=-1):
         for docs in cytoolz.partition_all(batch_size, stream):
             docs = list(docs)
             tokvecs = [d.tensor for d in docs]
-            tag_ids = self.predict(tokvecs)
+            tag_ids = self.predict((docs, tokvecs))
             self.set_annotations(docs, tag_ids)
             yield from docs

-    def predict(self, tokvecs):
-        scores = self.model(tokvecs)
+    def predict(self, docs_tokvecs):
+        scores = self.model(docs_tokvecs)
         scores = self.model.ops.flatten(scores)
         guesses = scores.argmax(axis=1)
         if not isinstance(guesses, numpy.ndarray):
             guesses = guesses.get()
+        tokvecs = docs_tokvecs[1]
+        guesses = self.model.ops.unflatten(guesses,
+                                           [tv.shape[0] for tv in tokvecs])
         return guesses
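The reshaped predict() takes a (docs, tokvecs) pair and now returns one tag-id array per Doc rather than a single concatenated array. A standalone sketch of the flatten/unflatten round trip it relies on; Thinc's ops.flatten concatenates per-doc arrays and ops.unflatten splits a flat array back into per-doc pieces by length:

    import numpy

    # Minimal stand-ins mirroring the behaviour of ops.flatten/ops.unflatten
    # for lists of per-document arrays.
    def flatten(arrays):
        # Concatenate per-doc rows into one (total_tokens, width) array.
        return numpy.concatenate(arrays)

    def unflatten(flat, lengths):
        # Split the flat array back into per-doc pieces of the given lengths.
        pieces, start = [], 0
        for length in lengths:
            pieces.append(flat[start:start + length])
            start += length
        return pieces

    tokvecs = [numpy.zeros((4, 128)), numpy.zeros((7, 128))]   # two docs
    assert flatten(tokvecs).shape == (11, 128)
    tag_ids = unflatten(numpy.arange(11), [tv.shape[0] for tv in tokvecs])
    assert [len(ids) for ids in tag_ids] == [4, 7]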
@@ -282,6 +284,8 @@ class NeuralTagger(BaseThincComponent):
         cdef Vocab vocab = self.vocab
         for i, doc in enumerate(docs):
             doc_tag_ids = batch_tag_ids[i]
+            if hasattr(doc_tag_ids, 'get'):
+                doc_tag_ids = doc_tag_ids.get()
             for j, tag_id in enumerate(doc_tag_ids):
                 # Don't clobber preset POS tags
                 if doc.c[j].tag == 0 and doc.c[j].pos == 0:
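The new hasattr check moves GPU arrays to host memory before the token-by-token loop: cupy.ndarray exposes .get() to copy device memory into a numpy.ndarray, while plain NumPy arrays do not, so hasattr doubles as a device check. The idiom in isolation:

    # Device-agnostic idiom used above: only CuPy arrays have .get(),
    # which returns a host-side numpy.ndarray copy.
    def to_cpu(array):
        if hasattr(array, 'get'):   # array lives on the GPU
            return array.get()
        return array                # already a numpy array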
@@ -294,8 +298,7 @@ class NeuralTagger(BaseThincComponent):
         if self.model.nI is None:
             self.model.nI = tokvecs[0].shape[1]
-        tag_scores, bp_tag_scores = self.model.begin_update(tokvecs, drop=drop)
+        tag_scores, bp_tag_scores = self.model.begin_update(docs_tokvecs, drop=drop)
         loss, d_tag_scores = self.get_loss(docs, golds, tag_scores)
         d_tokvecs = bp_tag_scores(d_tag_scores, sgd=sgd)
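update() now feeds the whole (docs, tokvecs) pair into begin_update, so the backprop callback can return gradients with respect to the token vectors. A toy stand-in for the Thinc forward/backward contract this follows (not Thinc itself; a linear "model" for illustration):

    import numpy

    # begin_update returns the output plus a callback mapping the gradient
    # of the output back to the gradient of the input.
    class ToyModel(object):
        def __init__(self, width):
            self.W = numpy.eye(width)

        def begin_update(self, X, drop=0.0):
            Y = X.dot(self.W)
            def backprop(dY, sgd=None):
                return dY.dot(self.W.T)
            return Y, backprop

    model = ToyModel(4)
    Y, bp = model.begin_update(numpy.ones((3, 4)))
    dX = bp(numpy.ones_like(Y))   # gradient flows back to the input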
@@ -346,9 +349,7 @@ class NeuralTagger(BaseThincComponent):
     @classmethod
     def Model(cls, n_tags, token_vector_width):
-        return with_flatten(
-            chain(Maxout(token_vector_width, token_vector_width),
-                  Softmax(n_tags, token_vector_width)))
+        return build_tagger_model(n_tags, token_vector_width)

     def use_params(self, params):
         with self.model.use_params(params):
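Both taggers' Model classmethods now delegate to a shared factory in _ml. The inline definition they replace shows the baseline architecture: a Maxout hidden layer into a Softmax over tags, applied per token via with_flatten. A hypothetical sketch of such a factory built from the same pieces; the real build_tagger_model is not shown in this diff and may differ (for instance by adding a private Tok2Vec that consumes the (docs, tokvecs) pairs introduced above):

    from thinc.api import chain, with_flatten
    from thinc.neural import Maxout, Softmax

    # Sketch only: mirrors the inline model this commit removes, not
    # necessarily the contents of _ml.build_tagger_model.
    def build_tagger_model_sketch(n_tags, token_vector_width):
        return with_flatten(
            chain(Maxout(token_vector_width, token_vector_width),
                  Softmax(n_tags, token_vector_width)))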
@@ -432,7 +433,7 @@ class NeuralLabeller(NeuralTagger):
     @property
     def labels(self):
-        return self.cfg.get('labels', {})
+        return self.cfg.setdefault('labels', {})

     @labels.setter
     def labels(self, value):
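The switch from .get() to .setdefault() is subtle but important: .get() returns a throwaway default dict when the key is missing, so any labels added through the property are silently lost, whereas .setdefault() stores the default in cfg first, so mutations persist. A minimal demonstration:

    cfg = {}
    cfg.get('labels', {})['acl'] = 1         # writes into a discarded dict
    assert 'labels' not in cfg
    cfg.setdefault('labels', {})['acl'] = 1  # stores the default, then mutates
    assert cfg['labels'] == {'acl': 1}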
@@ -455,9 +456,7 @@ class NeuralLabeller(NeuralTagger):
     @classmethod
     def Model(cls, n_tags, token_vector_width):
-        return with_flatten(
-            chain(Maxout(token_vector_width, token_vector_width),
-                  Softmax(n_tags, token_vector_width)))
+        return build_tagger_model(n_tags, token_vector_width)

     def get_loss(self, docs, golds, scores):
         scores = self.model.ops.flatten(scores)
@@ -654,6 +653,14 @@ cdef class NeuralEntityRecognizer(NeuralParser):
     nr_feature = 6

+    def predict_confidences(self, docs):
+        tensors = [d.tensor for d in docs]
+        samples = []
+        for i in range(10):
+            states = self.parse_batch(docs, tensors, drop=0.3)
+            for state in states:
+                samples.append(self._get_entities(state))
+
     def __reduce__(self):
         return (NeuralEntityRecognizer, (self.vocab, self.moves, self.model), None, None)
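The new predict_confidences runs the parser ten times with dropout kept on at 0.3 and collects the entities from each stochastic parse, i.e. Monte Carlo dropout sampling. As committed, the method accumulates samples but does not yet return or aggregate them, so a caller would still need an aggregation step. A hypothetical sketch of one (not part of the commit): the fraction of sampled parses in which an entity span reappears can be read as a confidence estimate.

    from collections import Counter

    # Hypothetical aggregation for the dropout samples gathered above;
    # entities are assumed to be hashable (start, end, label) tuples.
    def entity_confidences(samples):
        counts = Counter(ent for sample in samples for ent in sample)
        n = float(len(samples))
        return {ent: count / n for ent, count in counts.items()}

    # e.g. entities from 10 sampled parses of one doc:
    samples = [[(0, 2, 'PERSON')]] * 7 + [[(0, 2, 'ORG')]] * 3
    assert entity_confidences(samples)[(0, 2, 'PERSON')] == 0.7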