mirror of https://github.com/explosion/spaCy.git
Set Doc.tensor from Tagger
commit d6fc39c8a6
parent b3264aa5f0
@@ -91,8 +91,8 @@ class Pipe(object):
         Both __call__ and pipe should delegate to the `predict()`
         and `set_annotations()` methods.
         """
-        scores = self.predict([doc])
-        self.set_annotations([doc], scores)
+        scores, tensors = self.predict([doc])
+        self.set_annotations([doc], scores, tensors=tensors)
         return doc
 
     def pipe(self, stream, batch_size=128, n_threads=-1):
@@ -103,8 +103,8 @@ class Pipe(object):
         """
         for docs in cytoolz.partition_all(batch_size, stream):
             docs = list(docs)
-            scores = self.predict(docs)
-            self.set_annotations(docs, scores)
+            scores, tensors = self.predict(docs)
+            self.set_annotations(docs, scores, tensors=tensors)
             yield from docs
 
     def predict(self, docs):
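
The `pipe()` hunks rely on `cytoolz.partition_all(batch_size, stream)`, which lazily yields tuples of at most `batch_size` items from the stream; that is why each chunk is turned into a list before `predict()` sees it. A tiny stand-alone demonstration of that batching behaviour (dummy integers stand in for Doc objects):

import cytoolz

stream = iter(range(10))                 # stand-in for a stream of Doc objects
for batch in cytoolz.partition_all(4, stream):
    docs = list(batch)                   # partition_all yields tuples
    print(docs)                          # [0, 1, 2, 3] / [4, 5, 6, 7] / [8, 9]
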
@@ -113,7 +113,7 @@ class Pipe(object):
         """
         raise NotImplementedError
 
-    def set_annotations(self, docs, scores):
+    def set_annotations(self, docs, scores, tensors=None):
         """Modify a batch of documents, using pre-computed scores."""
         raise NotImplementedError
 
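
Taken together, the three `Pipe` hunks above change the base-class contract: `predict()` is now expected to return a `(scores, tensors)` pair, and `set_annotations()` takes the tensors as a keyword argument so a component can attach them to each Doc. A minimal, runnable sketch of a component written against that contract; `ToyDoc`, `ToyPipe`, the score shapes and the random model output are illustration-only assumptions, not part of the diff:

import numpy

class ToyDoc:
    """Hypothetical stand-in for spacy.tokens.Doc, just enough for the sketch."""
    def __init__(self, n_tokens):
        self.n_tokens = n_tokens
        self.tensor = None
    def __len__(self):
        return self.n_tokens

class ToyPipe:
    """Follows the updated contract: predict() -> (scores, tensors)."""
    width = 4

    def predict(self, docs):
        # Assume the model yields one score row per token and one
        # (len(doc), width) token-vector array per doc.
        scores = [numpy.random.rand(len(doc), 2) for doc in docs]
        tensors = [numpy.random.rand(len(doc), self.width) for doc in docs]
        return scores, tensors

    def set_annotations(self, docs, scores, tensors=None):
        for i, doc in enumerate(docs):
            if tensors is not None:
                doc.tensor = tensors[i]   # the real pipes call doc.extend_tensor
            # ...apply whatever annotations scores[i] encodes...

    def __call__(self, doc):
        scores, tensors = self.predict([doc])
        self.set_annotations([doc], scores, tensors=tensors)
        return doc

print(ToyPipe()(ToyDoc(3)).tensor.shape)   # (3, 4)
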
@@ -338,27 +338,27 @@ class Tagger(Pipe):
         return self.vocab.morphology.tag_names
 
     def __call__(self, doc):
-        tags = self.predict([doc])
-        self.set_annotations([doc], tags)
+        tags, tokvecs = self.predict([doc])
+        self.set_annotations([doc], tags, tensors=tokvecs)
         return doc
 
     def pipe(self, stream, batch_size=128, n_threads=-1):
         for docs in cytoolz.partition_all(batch_size, stream):
             docs = list(docs)
-            tag_ids = self.predict(docs)
-            self.set_annotations(docs, tag_ids)
+            tag_ids, tokvecs = self.predict(docs)
+            self.set_annotations(docs, tag_ids, tensors=tokvecs)
             yield from docs
 
     def predict(self, docs):
-        scores = self.model(docs)
-        scores = self.model.ops.flatten(scores)
+        tokvecs = self.model.tok2vec(docs)
+        scores = self.model.softmax(tokvecs)
         guesses = scores.argmax(axis=1)
         if not isinstance(guesses, numpy.ndarray):
             guesses = guesses.get()
         guesses = self.model.ops.unflatten(guesses, [len(d) for d in docs])
-        return guesses
+        return guesses, tokvecs
 
-    def set_annotations(self, docs, batch_tag_ids):
+    def set_annotations(self, docs, batch_tag_ids, tensors=None):
         if isinstance(docs, Doc):
             docs = [docs]
         cdef Doc doc
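
`Tagger.predict` still argmaxes one flat score array for the whole batch and then splits the guesses back into one array of tag ids per Doc via `model.ops.unflatten`. A plain-numpy sketch of that split, under the assumption that `unflatten` slices a flat array by per-doc lengths (the helper below is illustrative, not the thinc implementation):

import numpy

def unflatten(flat, lengths):
    # Assumed semantics of ops.unflatten: slice the flat array back into
    # per-document pieces of the given lengths.
    output = []
    start = 0
    for length in lengths:
        output.append(flat[start:start + length])
        start += length
    return output

lengths = [3, 2]                               # token counts of two docs
scores = numpy.random.rand(sum(lengths), 5)    # (n_tokens_in_batch, n_tags)
guesses = scores.argmax(axis=1)                # flat array of tag ids
per_doc = unflatten(guesses, lengths)
print([g.shape for g in per_doc])              # [(3,), (2,)]
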
@@ -373,6 +373,8 @@ class Tagger(Pipe):
                 if doc.c[j].tag == 0 and doc.c[j].pos == 0:
                     vocab.morphology.assign_tag_id(&doc.c[j], tag_id)
                 idx += 1
+            if tensors is not None:
+                doc.extend_tensor(tensors[i])
             doc.is_tagged = True
 
     def update(self, docs, golds, drop=0., sgd=None, losses=None):
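
The new `doc.extend_tensor(tensors[i])` call is where the commit title happens: the token vectors computed by the tagger are attached to `Doc.tensor`. A small numpy sketch of the behaviour this relies on, under the assumption that extending means adding this component's columns alongside any tensor already present (and simply setting the tensor when none exists yet):

import numpy

def extend_tensor(existing, new_columns):
    # Assumed behaviour: both arrays have one row per token; concatenate
    # along the feature axis so earlier components' vectors are preserved.
    if existing is None or existing.size == 0:
        return new_columns
    return numpy.hstack([existing, new_columns])

n_tokens = 3
doc_tensor = None
doc_tensor = extend_tensor(doc_tensor, numpy.ones((n_tokens, 64)))   # e.g. tagger
doc_tensor = extend_tensor(doc_tensor, numpy.zeros((n_tokens, 64)))  # e.g. parser
print(doc_tensor.shape)   # (3, 128)
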
@@ -573,7 +575,7 @@ class MultitaskObjective(Tagger):
     def labels(self, value):
         self.cfg['labels'] = value
 
-    def set_annotations(self, docs, dep_ids):
+    def set_annotations(self, docs, dep_ids, tensors=None):
         pass
 
     def begin_training(self, gold_tuples=tuple(), pipeline=None, tok2vec=None):
@@ -720,15 +722,15 @@ class TextCategorizer(Pipe):
         self.cfg['labels'] = value
 
     def __call__(self, doc):
-        scores = self.predict([doc])
-        self.set_annotations([doc], scores)
+        scores, tensors = self.predict([doc])
+        self.set_annotations([doc], scores, tensors=tensors)
         return doc
 
     def pipe(self, stream, batch_size=128, n_threads=-1):
         for docs in cytoolz.partition_all(batch_size, stream):
             docs = list(docs)
-            scores = self.predict(docs)
-            self.set_annotations(docs, scores)
+            scores, tensors = self.predict(docs)
+            self.set_annotations(docs, scores, tensors=tensors)
             yield from docs
 
     def predict(self, docs):
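
With the tagger and the text categorizer following the same pattern, a document run through the pipeline ends up carrying the token vectors on `doc.tensor` alongside the usual annotations. A hedged usage sketch; the model name is a placeholder, and it assumes a loaded pipeline that contains a tagger and a trained textcat:

import spacy

# "en_model_with_textcat" is a placeholder name, not a real package.
nlp = spacy.load("en_model_with_textcat")
doc = nlp(u"This is a sentence about spaCy.")

print(doc.tensor.shape)          # (n_tokens, width): filled via extend_tensor
print([t.tag_ for t in doc])     # tag annotations written by the tagger
print(doc.cats)                  # label -> score mapping from the textcat
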
@@ -736,8 +738,10 @@ class TextCategorizer(Pipe):
         scores = self.model.ops.asarray(scores)
         return scores
 
-    def set_annotations(self, docs, scores):
+    def set_annotations(self, docs, scores, tensors=None):
         for i, doc in enumerate(docs):
+            if tensors is not None:
+                doc.extend_tensor(tensors[i])
             for j, label in enumerate(self.labels):
                 doc.cats[label] = float(scores[i, j])
 
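
`TextCategorizer.set_annotations` maps a (n_docs, n_labels) score matrix onto each document's `cats` dict, and now also attaches the tensors when they are provided. The score-to-dict part in isolation, with plain numpy and dictionaries standing in for `Doc.cats` (the label set is hypothetical):

import numpy

labels = ["POSITIVE", "NEGATIVE"]          # hypothetical label set
scores = numpy.asarray([[0.9, 0.1],        # one row per doc, one column per label
                        [0.2, 0.8]])

docs_cats = []
for i in range(scores.shape[0]):
    cats = {}
    for j, label in enumerate(labels):
        cats[label] = float(scores[i, j])  # same indexing as the hunk above
    docs_cats.append(cats)

print(docs_cats)   # [{'POSITIVE': 0.9, 'NEGATIVE': 0.1}, {'POSITIVE': 0.2, 'NEGATIVE': 0.8}]
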