Refactor towards new Example class

This commit is contained in:
Matthew Honnibal 2020-06-09 23:39:46 +02:00
parent 82810b9846
commit ad547a4b8f

View File

@ -52,23 +52,19 @@ class Pipe(object):
"""Create a new pipe instance.""" """Create a new pipe instance."""
raise NotImplementedError raise NotImplementedError
def __call__(self, example): def __call__(self, Doc doc):
"""Apply the pipe to one document. The document is """Apply the pipe to one document. The document is
modified in-place, and returned. modified in-place, and returned.
Both __call__ and pipe should delegate to the `predict()` Both __call__ and pipe should delegate to the `predict()`
and `set_annotations()` methods. and `set_annotations()` methods.
""" """
doc = self._get_doc(example)
predictions = self.predict([doc]) predictions = self.predict([doc])
if isinstance(predictions, tuple) and len(predictions) == 2: if isinstance(predictions, tuple) and len(predictions) == 2:
scores, tensors = predictions scores, tensors = predictions
self.set_annotations([doc], scores, tensors=tensors) self.set_annotations([doc], scores, tensors=tensors)
else: else:
self.set_annotations([doc], predictions) self.set_annotations([doc], predictions)
if isinstance(example, Example):
example.predicted = doc
return example
return doc return doc
def pipe(self, stream, batch_size=128, n_threads=-1): def pipe(self, stream, batch_size=128, n_threads=-1):
@ -77,19 +73,13 @@ class Pipe(object):
Both __call__ and pipe should delegate to the `predict()` Both __call__ and pipe should delegate to the `predict()`
and `set_annotations()` methods. and `set_annotations()` methods.
""" """
for examples in util.minibatch(stream, size=batch_size): for docs in util.minibatch(stream, size=batch_size):
predictions = self.predict(docs) predictions = self.predict(docs)
if isinstance(predictions, tuple) and len(tuple) == 2: if isinstance(predictions, tuple) and len(tuple) == 2:
scores, tensors = predictions scores, tensors = predictions
self.set_annotations(docs, scores, tensors=tensors) self.set_annotations(docs, scores, tensors=tensors)
else: else:
self.set_annotations(docs, predictions) self.set_annotations(docs, predictions)
if as_example:
for ex, doc in zip(examples, docs):
ex.predicted = doc
yield ex
else:
yield from docs yield from docs
def predict(self, docs): def predict(self, docs):
@ -102,7 +92,7 @@ class Pipe(object):
"""Modify a batch of documents, using pre-computed scores.""" """Modify a batch of documents, using pre-computed scores."""
raise NotImplementedError raise NotImplementedError
def update(self, examples, set_annotations=False, drop=0.0, sgd=None, losses=None): def update(self, docs, set_annotations=False, drop=0.0, sgd=None, losses=None):
"""Learn from a batch of documents and gold-standard information, """Learn from a batch of documents and gold-standard information,
updating the pipe's model. updating the pipe's model.
@ -247,15 +237,12 @@ class Tagger(Pipe):
def labels(self): def labels(self):
return tuple(self.vocab.morphology.tag_names) return tuple(self.vocab.morphology.tag_names)
def __call__(self, example): def __call__(self, doc):
tags = self.predict([doc]) tags = self.predict([doc])
self.set_annotations([doc], tags) self.set_annotations([doc], tags)
if isinstance(example, Example):
example.predicted = doc
return example
return doc return doc
def pipe(self, stream, batch_size=128, n_threads=-1, as_example=False): def pipe(self, stream, batch_size=128, n_threads=-1):
for docs in util.minibatch(stream, size=batch_size): for docs in util.minibatch(stream, size=batch_size):
tag_ids = self.predict(docs) tag_ids = self.predict(docs)
self.set_annotations(docs, tag_ids) self.set_annotations(docs, tag_ids)
@ -833,7 +820,7 @@ class TextCategorizer(Pipe):
def labels(self, value): def labels(self, value):
self.cfg["labels"] = tuple(value) self.cfg["labels"] = tuple(value)
def pipe(self, stream, batch_size=128, n_threads=-1, as_example=False): def pipe(self, stream, batch_size=128, n_threads=-1):
for docs in util.minibatch(stream, size=batch_size): for docs in util.minibatch(stream, size=batch_size):
scores, tensors = self.predict(docs) scores, tensors = self.predict(docs)
self.set_annotations(docs, scores, tensors=tensors) self.set_annotations(docs, scores, tensors=tensors)
@ -1194,12 +1181,9 @@ class EntityLinker(Pipe):
def __call__(self, doc): def __call__(self, doc):
kb_ids, tensors = self.predict([doc]) kb_ids, tensors = self.predict([doc])
self.set_annotations([doc], kb_ids, tensors=tensors) self.set_annotations([doc], kb_ids, tensors=tensors)
if isinstance(example, Example):
example.x = doc
return example
return doc return doc
def pipe(self, stream, batch_size=128, n_threads=-1, as_example=False): def pipe(self, stream, batch_size=128, n_threads=-1):
for docs in util.minibatch(stream, size=batch_size): for docs in util.minibatch(stream, size=batch_size):
kb_ids, tensors = self.predict(docs) kb_ids, tensors = self.predict(docs)
self.set_annotations(docs, kb_ids, tensors=tensors) self.set_annotations(docs, kb_ids, tensors=tensors)
@ -1400,9 +1384,6 @@ class Sentencizer(Pipe):
seen_period = True seen_period = True
if start < len(doc): if start < len(doc):
doc[start].is_sent_start = True doc[start].is_sent_start = True
if isinstance(example, Example):
example.doc = doc
return example
return doc return doc
def pipe(self, stream, batch_size=128, n_threads=-1): def pipe(self, stream, batch_size=128, n_threads=-1):