Remove the 'pass example into __call__' thing

Matthew Honnibal 2020-06-09 23:30:06 +02:00
parent b3868cd1f8
commit 0714f1fa5c


@@ -20,7 +20,7 @@ from .defaults import default_nel, default_senter
 from .functions import merge_subtokens
 from ..language import Language, component
 from ..syntax import nonproj
-from ..gold import Example
+from ..gold.new_example import NewExample as Example
 from ..attrs import POS, ID
 from ..util import link_vectors_to_models, create_default_optimizer
 from ..parts_of_speech import X
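
The import swap above points `Example` at the new `NewExample` class. Judging from the usages later in this diff, it pairs a predicted `Doc` with a reference (gold) `Doc`, exposed as `predicted`/`x` and `reference`/`y`. A rough orientation-only sketch of that surface — the real class lives in `spacy.gold.new_example` on this branch and carries much more (alignment, `get_aligned()`, etc.):

    # Orientation-only sketch of the NewExample surface used in this diff;
    # not the actual implementation.
    from spacy.vocab import Vocab
    from spacy.tokens import Doc

    class ExampleSketch:
        def __init__(self, predicted, reference):
            self.predicted = predicted  # the Doc the pipeline annotates (alias: x)
            self.reference = reference  # the gold-standard Doc (alias: y)

        @property
        def x(self):
            return self.predicted

        @property
        def y(self):
            return self.reference

    words = ["I", "like", "cats"]
    vocab = Vocab()
    eg = ExampleSketch(Doc(vocab, words=words), Doc(vocab, words=words))
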
@@ -48,12 +48,6 @@ class Pipe(object):
     def from_nlp(cls, nlp, model, **cfg):
         return cls(nlp.vocab, model, **cfg)

-    def _get_doc(self, example):
-        """ Use this method if the `example` can be both a Doc or an Example """
-        if isinstance(example, Doc):
-            return example
-        return example.doc
-
     def __init__(self, vocab, model, **cfg):
        """Create a new pipe instance."""
        raise NotImplementedError
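
With `_get_doc()` removed, components no longer sniff whether they were handed a `Doc` or an `Example`; callers that hold an `Example` unwrap it themselves. A hedged sketch of the resulting call-site convention — `component` and `eg` are illustrative stand-ins, not objects in this file:

    # Sketch of the convention after this change: pass Docs to components,
    # and keep Example pairs intact in training code.
    def annotate_example(component, eg):
        eg.predicted = component(eg.predicted)
        return eg
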
@@ -73,18 +67,17 @@ class Pipe(object):
         else:
             self.set_annotations([doc], predictions)
         if isinstance(example, Example):
-            example.doc = doc
+            example.predicted = doc
             return example
         return doc

-    def pipe(self, stream, batch_size=128, n_threads=-1, as_example=False):
+    def pipe(self, stream, batch_size=128, n_threads=-1):
         """Apply the pipe to a stream of documents.

         Both __call__ and pipe should delegate to the `predict()`
         and `set_annotations()` methods.
         """
         for examples in util.minibatch(stream, size=batch_size):
-            docs = [self._get_doc(ex) for ex in examples]
             predictions = self.predict(docs)
             if isinstance(predictions, tuple) and len(tuple) == 2:
                 scores, tensors = predictions
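
The docstring above spells out the contract that both `__call__` and `pipe()` should delegate to `predict()` and `set_annotations()`. A minimal self-contained sketch of a component honoring that contract — `MyPipe`, `my_model`, and the score handling are invented for illustration:

    # Minimal sketch of the predict()/set_annotations() delegation contract.
    from spacy import util

    class MyPipe:
        name = "my_pipe"

        def __init__(self, model):
            self.model = model  # any callable mapping docs -> scores

        def __call__(self, doc):
            scores = self.predict([doc])
            self.set_annotations([doc], scores)
            return doc

        def pipe(self, stream, batch_size=128):
            for docs in util.minibatch(stream, size=batch_size):
                scores = self.predict(docs)
                self.set_annotations(docs, scores)
                yield from docs

        def predict(self, docs):
            # Run the statistical model without mutating the docs.
            return self.model(docs)

        def set_annotations(self, docs, scores):
            # Write the predictions back onto the docs.
            for doc, doc_scores in zip(docs, scores):
                doc.user_data[self.name] = doc_scores
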
@@ -94,7 +87,7 @@ class Pipe(object):
             if as_example:
                 for ex, doc in zip(examples, docs):
-                    ex.doc = doc
+                    ex.predicted = doc
                     yield ex
             else:
                 yield from docs
@@ -116,7 +109,6 @@ class Pipe(object):
         Delegates to predict() and get_loss().
         """
         if set_annotations:
-            docs = (self._get_doc(ex) for ex in examples)
             docs = list(self.pipe(docs))

     def rehearse(self, examples, sgd=None, losses=None, **config):
@@ -256,28 +248,18 @@ class Tagger(Pipe):
         return tuple(self.vocab.morphology.tag_names)

     def __call__(self, example):
-        doc = self._get_doc(example)
         tags = self.predict([doc])
         self.set_annotations([doc], tags)
         if isinstance(example, Example):
-            example.doc = doc
+            example.predicted = doc
             return example
         return doc

     def pipe(self, stream, batch_size=128, n_threads=-1, as_example=False):
-        for examples in util.minibatch(stream, size=batch_size):
-            docs = [self._get_doc(ex) for ex in examples]
+        for docs in util.minibatch(stream, size=batch_size):
             tag_ids = self.predict(docs)
-            assert len(docs) == len(examples)
-            assert len(tag_ids) == len(examples)
             self.set_annotations(docs, tag_ids)
-            if as_example:
-                for ex, doc in zip(examples, docs):
-                    ex.doc = doc
-                    yield ex
-            else:
-                yield from docs
+            yield from docs

     def predict(self, docs):
         if not any(len(doc) for doc in docs):
@@ -327,15 +309,17 @@ class Tagger(Pipe):
         doc.is_tagged = True

     def update(self, examples, drop=0., sgd=None, losses=None, set_annotations=False):
-        examples = Example.to_example_objects(examples)
+        for eg in examples:
+            assert isinstance(eg, Example)
         if losses is not None and self.name not in losses:
             losses[self.name] = 0.

-        if not any(len(ex.doc) if ex.doc else 0 for ex in examples):
+        if not any(len(eg.predicted) if eg.predicted else 0 for eg in examples):
             # Handle cases where there are no tokens in any docs.
             return
         set_dropout_rate(self.model, drop)
-        tag_scores, bp_tag_scores = self.model.begin_update([ex.doc for ex in examples])
+        tag_scores, bp_tag_scores = self.model.begin_update(
+            [eg.predicted for eg in examples])
         for sc in tag_scores:
             if self.model.ops.xp.isnan(sc.sum()):
                 raise ValueError("nan value in scores")
@@ -347,17 +331,16 @@ class Tagger(Pipe):
         if losses is not None:
             losses[self.name] += loss
         if set_annotations:
-            docs = [ex.doc for ex in examples]
+            docs = [eg.predicted for eg in examples]
             self.set_annotations(docs, self._scores2guesses(tag_scores))

     def rehearse(self, examples, drop=0., sgd=None, losses=None):
         """Perform a 'rehearsal' update, where we try to match the output of
         an initial model.
         """
+        docs = [eg.predicted for eg in examples]
         if self._rehearsal_model is None:
             return
-        examples = Example.to_example_objects(examples)
-        docs = [ex.doc for ex in examples]
         if not any(len(doc) for doc in docs):
             # Handle cases where there are no tokens in any docs.
             return
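
The docstring above describes rehearsal: the current model is trained to match the output of a frozen copy of the initial model, limiting catastrophic forgetting while updating on new data. A minimal sketch of that idea — the function name and the squared-error form are illustrative, not this file's exact loss:

    # Rehearsal in miniature: nudge current outputs back toward the initial
    # model's outputs. This is the gradient of 0.5 * ||current - initial||^2.
    import numpy

    def rehearsal_gradient(current_scores, initial_scores):
        return current_scores - initial_scores

    current = numpy.array([0.7, 0.1, 0.2], dtype="f")
    initial = numpy.array([0.6, 0.2, 0.2], dtype="f")
    d_scores = rehearsal_gradient(current, initial)
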
@@ -387,7 +370,8 @@ class Tagger(Pipe):
         orig_tag_map = dict(self.vocab.morphology.tag_map)
         new_tag_map = {}
         for example in get_examples():
-            for tag in example.token_annotation.tags:
+            for token in example.y:
+                tag = token.tag_
                 if tag in orig_tag_map:
                     new_tag_map[tag] = orig_tag_map[tag]
                 else:
@@ -575,7 +559,7 @@ class SentenceRecognizer(Tagger):
         d_scores = scores - to_categorical(correct, n_classes=scores.shape[1])
         d_scores *= self.model.ops.asarray(known_labels)
         loss = (d_scores**2).sum()
-        docs = [eg.doc for eg in examples]
+        docs = [eg.predicted for eg in examples]
         d_scores = self.model.ops.unflatten(d_scores, [len(d) for d in docs])
         return float(loss), d_scores
@@ -687,8 +671,8 @@ class MultitaskObjective(Tagger):
         gold_examples = nonproj.preprocess_training_data(get_examples())
         # for raw_text, doc_annot in gold_tuples:
         for example in gold_examples:
-            for i in range(len(example.token_annotation.ids)):
-                label = self.make_label(i, example.token_annotation)
+            for token in example.y:
+                label = self.make_label(token)
                 if label is not None and label not in self.labels:
                     self.labels[label] = len(self.labels)
         self.model.initialize()
@@ -706,11 +690,11 @@ class MultitaskObjective(Tagger):
         cdef int idx = 0
         correct = numpy.zeros((scores.shape[0],), dtype="i")
         guesses = scores.argmax(axis=1)
-        docs = [ex.doc for ex in examples]
+        docs = [eg.predicted for eg in examples]
         for i, eg in enumerate(examples):
             # Handles alignment for tokenization differences
-            for j in range(len(eg.doc)):
+            doc_annots = eg.get_aligned()
+            for j in range(len(eg.predicted)):
                 tok_annots = {key: values[j] for key, values in tok_annots.items()}
                 label = self.make_label(j, tok_annots)
                 if label is None or label not in self.labels:
@@ -724,83 +708,49 @@ class MultitaskObjective(Tagger):
         return float(loss), d_scores

     @staticmethod
-    def make_dep(i, token_annotation):
-        if token_annotation.deps[i] is None or token_annotation.heads[i] is None:
-            return None
-        return token_annotation.deps[i]
+    def make_dep(token):
+        return token.dep_

     @staticmethod
-    def make_tag(i, token_annotation):
-        return token_annotation.tags[i]
+    def make_tag(token):
+        return token.tag_

     @staticmethod
-    def make_ent(i, token_annotation):
-        if token_annotation.entities is None:
-            return None
-        return token_annotation.entities[i]
+    def make_ent(token):
+        if token.ent_iob_ == "O":
+            return "O"
+        else:
+            return token.ent_iob_ + "-" + token.ent_type_

     @staticmethod
-    def make_dep_tag_offset(i, token_annotation):
-        if token_annotation.deps[i] is None or token_annotation.heads[i] is None:
-            return None
-        offset = token_annotation.heads[i] - i
+    def make_dep_tag_offset(token):
+        dep = token.dep_
+        tag = token.tag_
+        offset = token.head.i - token.i
         offset = min(offset, 2)
         offset = max(offset, -2)
-        return f"{token_annotation.deps[i]}-{token_annotation.tags[i]}:{offset}"
+        return f"{dep}-{tag}:{offset}"

     @staticmethod
-    def make_ent_tag(i, token_annotation):
-        if token_annotation.entities is None or token_annotation.entities[i] is None:
-            return None
+    def make_ent_tag(token):
+        if token.ent_iob_ == "O":
+            ent = "O"
         else:
-            return f"{token_annotation.tags[i]}-{token_annotation.entities[i]}"
+            ent = token.ent_iob_ + "-" + token.ent_type_
+        tag = token.tag_
+        return f"{tag}-{ent}"

     @staticmethod
-    def make_sent_start(target, token_annotation, cache=True, _cache={}):
+    def make_sent_start(token):
         """A multi-task objective for representing sentence boundaries,
         using BILU scheme. (O is impossible)
-
-        The implementation of this method uses an internal cache that relies
-        on the identity of the heads array, to avoid requiring a new piece
-        of gold data. You can pass cache=False if you know the cache will
-        do the wrong thing.
         """
-        words = token_annotation.words
-        heads = token_annotation.heads
-        assert len(words) == len(heads)
-        assert target < len(words), (target, len(words))
-        if cache:
-            if id(heads) in _cache:
-                return _cache[id(heads)][target]
-            else:
-                for key in list(_cache.keys()):
-                    _cache.pop(key)
-                sent_tags = ["I-SENT"] * len(words)
-                _cache[id(heads)] = sent_tags
-        else:
-            sent_tags = ["I-SENT"] * len(words)
-
-        def _find_root(child):
-            seen = set([child])
-            while child is not None and heads[child] != child:
-                seen.add(child)
-                child = heads[child]
-            return child
-
-        sentences = {}
-        for i in range(len(words)):
-            root = _find_root(i)
-            if root is None:
-                sent_tags[i] = None
-            else:
-                sentences.setdefault(root, []).append(i)
-        for root, span in sorted(sentences.items()):
-            if len(span) == 1:
-                sent_tags[span[0]] = "U-SENT"
-            else:
-                sent_tags[span[0]] = "B-SENT"
-                sent_tags[span[-1]] = "L-SENT"
-        return sent_tags[target]
+        if token.is_sent_start and token.is_sent_end:
+            return "U-SENT"
+        elif token.is_sent_start:
+            return "B-SENT"
+        else:
+            return "I-SENT"


 class ClozeMultitask(Pipe):
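
The rewritten `make_sent_start` above derives the BILU sentence label directly from token attributes. A worked illustration, assuming a spaCy build (as on this development branch) whose `Doc` constructor accepts `sent_starts`:

    # Worked example of the token-based BILU scheme: a single-token sentence
    # would be "U-SENT"; otherwise the first token is "B-SENT" and the rest
    # "I-SENT" (note this version emits no trailing "L-SENT").
    from spacy.vocab import Vocab
    from spacy.tokens import Doc

    def make_sent_start(token):
        if token.is_sent_start and token.is_sent_end:
            return "U-SENT"
        elif token.is_sent_start:
            return "B-SENT"
        else:
            return "I-SENT"

    doc = Doc(Vocab(), words=["Hi", ".", "I", "like", "cats", "."],
              sent_starts=[True, False, True, False, False, False])
    print([make_sent_start(t) for t in doc])
    # ['B-SENT', 'I-SENT', 'B-SENT', 'I-SENT', 'I-SENT', 'I-SENT']
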
@@ -833,7 +783,7 @@ class ClozeMultitask(Pipe):
         # token.vector values, but that's a bit inefficient, especially on GPU.
         # Instead we fetch the index into the vectors table for each of our tokens,
         # and look them up all at once. This prevents data copying.
-        ids = self.model.ops.flatten([ex.doc.to_array(ID).ravel() for ex in examples])
+        ids = self.model.ops.flatten([eg.predicted.to_array(ID).ravel() for eg in examples])
         target = vectors[ids]
         gradient = self.distance.get_grad(prediction, target)
         loss = self.distance.get_loss(prediction, target)
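
The comment above describes fetching row IDs and gathering the vectors in a single indexing operation instead of copying each token's vector. The same trick in plain numpy, with invented toy values:

    # Gather rows from a vectors table by integer ID in one fancy-indexing
    # call; no per-token copies are made before the lookup.
    import numpy

    vectors = numpy.arange(12, dtype="f").reshape(4, 3)  # 4 vectors of width 3
    ids = numpy.array([2, 0, 2, 3])                      # one row ID per token
    target = vectors[ids]                                # shape (4, 3), one shot
    assert target.shape == (4, 3)
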
@@ -843,11 +793,12 @@ class ClozeMultitask(Pipe):
         pass

     def rehearse(self, examples, drop=0., sgd=None, losses=None):
-        examples = Example.to_example_objects(examples)
         if losses is not None and self.name not in losses:
             losses[self.name] = 0.
+        docs = [eg.predicted for eg in examples]
         set_dropout_rate(self.model, drop)
-        predictions, bp_predictions = self.model.begin_update([ex.doc for ex in examples])
+        predictions, bp_predictions = self.model.begin_update(
+            [eg.predicted for eg in examples])
         loss, d_predictions = self.get_loss(examples, self.vocab.vectors.data, predictions)
         bp_predictions(d_predictions)
         if sgd is not None:
@@ -883,17 +834,10 @@ class TextCategorizer(Pipe):
             self.cfg["labels"] = tuple(value)

     def pipe(self, stream, batch_size=128, n_threads=-1, as_example=False):
-        for examples in util.minibatch(stream, size=batch_size):
-            docs = [self._get_doc(ex) for ex in examples]
+        for docs in util.minibatch(stream, size=batch_size):
             scores, tensors = self.predict(docs)
             self.set_annotations(docs, scores, tensors=tensors)
-            if as_example:
-                for ex, doc in zip(examples, docs):
-                    ex.doc = doc
-                    yield ex
-            else:
-                yield from docs
+            yield from docs

     def predict(self, docs):
         tensors = [doc.tensor for doc in docs]
@@ -914,12 +858,15 @@ class TextCategorizer(Pipe):
                 doc.cats[label] = float(scores[i, j])

     def update(self, examples, state=None, drop=0., set_annotations=False, sgd=None, losses=None):
-        examples = Example.to_example_objects(examples)
-        if not any(len(ex.doc) if ex.doc else 0 for ex in examples):
+        for eg in examples:
+            assert isinstance(eg, Example)
+        if not any(len(eg.predicted) if eg.predicted else 0 for eg in examples):
             # Handle cases where there are no tokens in any docs.
             return
         set_dropout_rate(self.model, drop)
-        scores, bp_scores = self.model.begin_update([ex.doc for ex in examples])
+        scores, bp_scores = self.model.begin_update(
+            [eg.predicted for eg in examples]
+        )
         loss, d_scores = self.get_loss(examples, scores)
         bp_scores(d_scores)
         if sgd is not None:
@@ -928,14 +875,15 @@ class TextCategorizer(Pipe):
             losses.setdefault(self.name, 0.0)
             losses[self.name] += loss
         if set_annotations:
-            docs = [ex.doc for ex in examples]
+            docs = [eg.predicted for eg in examples]
             self.set_annotations(docs, scores=scores)

     def rehearse(self, examples, drop=0., sgd=None, losses=None):
         if self._rehearsal_model is None:
             return
-        examples = Example.to_example_objects(examples)
-        docs=[ex.doc for ex in examples]
+        for eg in examples:
+            assert isinstance(eg, Example)
+        docs = [eg.predicted for eg in examples]
         if not any(len(doc) for doc in docs):
             # Handle cases where there are no tokens in any docs.
             return
@@ -955,8 +903,8 @@ class TextCategorizer(Pipe):
         not_missing = numpy.ones((len(examples), len(self.labels)), dtype="f")
         for i, eg in enumerate(examples):
             for j, label in enumerate(self.labels):
-                if label in eg.doc_annotation.cats:
-                    truths[i, j] = eg.doc_annotation.cats[label]
+                if label in eg.predicted.cats:
+                    truths[i, j] = eg.reference.cats[label]
                 else:
                     not_missing[i, j] = 0.
         truths = self.model.ops.asarray(truths)
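
The `not_missing` array built above masks labels the gold annotation says nothing about, so they contribute no learning signal. A toy numpy sketch of that masking — all values are invented, and the exact gradient formula here is illustrative rather than this file's full loss:

    # Wherever the gold cats omit a label, not_missing is 0 and the gradient
    # is zeroed: the label is treated as missing, not as negative.
    import numpy

    truths = numpy.array([[1.0, 0.0],
                          [0.0, 0.0]], dtype="f")
    not_missing = numpy.array([[1.0, 1.0],
                               [1.0, 0.0]], dtype="f")  # 2nd label missing in eg 2
    scores = numpy.array([[0.9, 0.2],
                          [0.4, 0.8]], dtype="f")
    d_scores = (scores - truths) / scores.shape[0]
    d_scores *= not_missing  # no gradient from missing annotations
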
@@ -993,7 +941,7 @@ class TextCategorizer(Pipe):
         # TODO: begin_training is not guaranteed to see all data / labels ?
         examples = list(get_examples())
         for example in examples:
-            for cat in example.doc_annotation.cats:
+            for cat in example.y.cats:
                 self.add_label(cat)
         self.require_labels()
         docs = [Doc(Vocab(), words=["hello"])]
@@ -1152,21 +1100,22 @@ class EntityLinker(Pipe):
             losses.setdefault(self.name, 0.0)
         if not examples:
             return 0
-        examples = Example.to_example_objects(examples)
+        for eg in examples:
+            assert isinstance(eg, Example)
         sentence_docs = []
-        docs = [ex.doc for ex in examples]
+        docs = [eg.predicted for eg in examples]
         if set_annotations:
             # This seems simpler than other ways to get that exact output -- but
             # it does run the model twice :(
             predictions = self.model.predict(docs)
         for eg in examples:
-            doc = eg.doc
+            doc = eg.predicted
             ents_by_offset = dict()
             for ent in doc.ents:
                 ents_by_offset[(ent.start_char, ent.end_char)] = ent
-            for entity, kb_dict in eg.doc_annotation.links.items():
+            links = self._get_links_from_doc(eg.reference)
+            for entity, kb_dict in links.items():
                 if isinstance(entity, str):
                     entity = literal_eval(entity)
                 start, end = entity
@@ -1204,7 +1153,8 @@ class EntityLinker(Pipe):
     def get_similarity_loss(self, examples, scores):
         entity_encodings = []
         for eg in examples:
-            for entity, kb_dict in eg.doc_annotation.links.items():
+            links = self._get_links_from_doc(eg.reference)
+            for entity, kb_dict in links.items():
                 for kb_id, value in kb_dict.items():
                     # this loss function assumes we're only using positive examples
                     if value:
@@ -1223,8 +1173,9 @@ class EntityLinker(Pipe):
     def get_loss(self, examples, scores):
         cats = []
-        for ex in examples:
-            for entity, kb_dict in ex.doc_annotation.links.items():
+        for eg in examples:
+            links = self._get_links_from_doc(eg.reference)
+            for entity, kb_dict in links.items():
                 for kb_id, value in kb_dict.items():
                     cats.append([value])
@@ -1237,27 +1188,22 @@ class EntityLinker(Pipe):
         loss = loss / len(cats)
         return loss, d_scores

-    def __call__(self, example):
-        doc = self._get_doc(example)
+    def _get_links_from_doc(self, doc):
+        return {}
+
+    def __call__(self, doc):
         kb_ids, tensors = self.predict([doc])
         self.set_annotations([doc], kb_ids, tensors=tensors)
         if isinstance(example, Example):
-            example.doc = doc
+            example.x = doc
             return example
         return doc

     def pipe(self, stream, batch_size=128, n_threads=-1, as_example=False):
-        for examples in util.minibatch(stream, size=batch_size):
-            docs = [self._get_doc(ex) for ex in examples]
+        for docs in util.minibatch(stream, size=batch_size):
             kb_ids, tensors = self.predict(docs)
             self.set_annotations(docs, kb_ids, tensors=tensors)
-            if as_example:
-                for ex, doc in zip(examples, docs):
-                    ex.doc = doc
-                    yield ex
-            else:
-                yield from docs
+            yield from docs

     def predict(self, docs):
         """ Return the KB IDs for each entity in each doc, including NIL if there is no prediction """
@@ -1433,7 +1379,7 @@ class Sentencizer(Pipe):
     ):
         pass

-    def __call__(self, example):
+    def __call__(self, doc):
         """Apply the sentencizer to a Doc and set Token.is_sent_start.

         example (Doc or Example): The document to process.
@@ -1441,7 +1387,6 @@ class Sentencizer(Pipe):
         DOCS: https://spacy.io/api/sentencizer#call
         """
-        doc = self._get_doc(example)
         start = 0
         seen_period = False
         for i, token in enumerate(doc):
@@ -1460,21 +1405,15 @@ class Sentencizer(Pipe):
             return example
         return doc

-    def pipe(self, stream, batch_size=128, n_threads=-1, as_example=False):
-        for examples in util.minibatch(stream, size=batch_size):
-            docs = [self._get_doc(ex) for ex in examples]
+    def pipe(self, stream, batch_size=128, n_threads=-1):
+        for docs in util.minibatch(stream, size=batch_size):
             predictions = self.predict(docs)
             if isinstance(predictions, tuple) and len(tuple) == 2:
                 scores, tensors = predictions
                 self.set_annotations(docs, scores, tensors=tensors)
             else:
                 self.set_annotations(docs, predictions)
-            if as_example:
-                for ex, doc in zip(examples, docs):
-                    ex.doc = doc
-                    yield ex
-            else:
-                yield from docs
+            yield from docs

     def predict(self, docs):
         """Apply the pipeline's model to a batch of docs, without