mirror of
https://github.com/explosion/spaCy.git
synced 2025-10-24 12:41:23 +03:00
Remove the 'pass example into __call__' thing
This commit is contained in:
parent
b3868cd1f8
commit
0714f1fa5c
|
@ -20,7 +20,7 @@ from .defaults import default_nel, default_senter
|
|||
from .functions import merge_subtokens
|
||||
from ..language import Language, component
|
||||
from ..syntax import nonproj
|
||||
from ..gold import Example
|
||||
from ..gold.new_example import NewExample as Example
|
||||
from ..attrs import POS, ID
|
||||
from ..util import link_vectors_to_models, create_default_optimizer
|
||||
from ..parts_of_speech import X
|
||||
|
@ -48,12 +48,6 @@ class Pipe(object):
|
|||
def from_nlp(cls, nlp, model, **cfg):
|
||||
return cls(nlp.vocab, model, **cfg)
|
||||
|
||||
def _get_doc(self, example):
|
||||
""" Use this method if the `example` can be both a Doc or an Example """
|
||||
if isinstance(example, Doc):
|
||||
return example
|
||||
return example.doc
|
||||
|
||||
def __init__(self, vocab, model, **cfg):
|
||||
"""Create a new pipe instance."""
|
||||
raise NotImplementedError
|
||||
|
@ -73,18 +67,17 @@ class Pipe(object):
|
|||
else:
|
||||
self.set_annotations([doc], predictions)
|
||||
if isinstance(example, Example):
|
||||
example.doc = doc
|
||||
example.predicted = doc
|
||||
return example
|
||||
return doc
|
||||
|
||||
def pipe(self, stream, batch_size=128, n_threads=-1, as_example=False):
|
||||
def pipe(self, stream, batch_size=128, n_threads=-1):
|
||||
"""Apply the pipe to a stream of documents.
|
||||
|
||||
Both __call__ and pipe should delegate to the `predict()`
|
||||
and `set_annotations()` methods.
|
||||
"""
|
||||
for examples in util.minibatch(stream, size=batch_size):
|
||||
docs = [self._get_doc(ex) for ex in examples]
|
||||
predictions = self.predict(docs)
|
||||
if isinstance(predictions, tuple) and len(tuple) == 2:
|
||||
scores, tensors = predictions
|
||||
|
@ -94,7 +87,7 @@ class Pipe(object):
|
|||
|
||||
if as_example:
|
||||
for ex, doc in zip(examples, docs):
|
||||
ex.doc = doc
|
||||
ex.predicted = doc
|
||||
yield ex
|
||||
else:
|
||||
yield from docs
|
||||
|
@ -116,7 +109,6 @@ class Pipe(object):
|
|||
Delegates to predict() and get_loss().
|
||||
"""
|
||||
if set_annotations:
|
||||
docs = (self._get_doc(ex) for ex in examples)
|
||||
docs = list(self.pipe(docs))
|
||||
|
||||
def rehearse(self, examples, sgd=None, losses=None, **config):
|
||||
|
@ -256,27 +248,17 @@ class Tagger(Pipe):
|
|||
return tuple(self.vocab.morphology.tag_names)
|
||||
|
||||
def __call__(self, example):
|
||||
doc = self._get_doc(example)
|
||||
tags = self.predict([doc])
|
||||
self.set_annotations([doc], tags)
|
||||
if isinstance(example, Example):
|
||||
example.doc = doc
|
||||
example.predicted = doc
|
||||
return example
|
||||
return doc
|
||||
|
||||
def pipe(self, stream, batch_size=128, n_threads=-1, as_example=False):
|
||||
for examples in util.minibatch(stream, size=batch_size):
|
||||
docs = [self._get_doc(ex) for ex in examples]
|
||||
for docs in util.minibatch(stream, size=batch_size):
|
||||
tag_ids = self.predict(docs)
|
||||
assert len(docs) == len(examples)
|
||||
assert len(tag_ids) == len(examples)
|
||||
self.set_annotations(docs, tag_ids)
|
||||
|
||||
if as_example:
|
||||
for ex, doc in zip(examples, docs):
|
||||
ex.doc = doc
|
||||
yield ex
|
||||
else:
|
||||
yield from docs
|
||||
|
||||
def predict(self, docs):
|
||||
|
@ -327,15 +309,17 @@ class Tagger(Pipe):
|
|||
doc.is_tagged = True
|
||||
|
||||
def update(self, examples, drop=0., sgd=None, losses=None, set_annotations=False):
|
||||
examples = Example.to_example_objects(examples)
|
||||
for eg in examples:
|
||||
assert isinstance(eg, Example)
|
||||
if losses is not None and self.name not in losses:
|
||||
losses[self.name] = 0.
|
||||
|
||||
if not any(len(ex.doc) if ex.doc else 0 for ex in examples):
|
||||
if not any(len(eg.predicted) if eg.predicted else 0 for eg in examples):
|
||||
# Handle cases where there are no tokens in any docs.
|
||||
return
|
||||
set_dropout_rate(self.model, drop)
|
||||
tag_scores, bp_tag_scores = self.model.begin_update([ex.doc for ex in examples])
|
||||
tag_scores, bp_tag_scores = self.model.begin_update(
|
||||
[eg.predicted for eg in examples])
|
||||
for sc in tag_scores:
|
||||
if self.model.ops.xp.isnan(sc.sum()):
|
||||
raise ValueError("nan value in scores")
|
||||
|
@ -347,17 +331,16 @@ class Tagger(Pipe):
|
|||
if losses is not None:
|
||||
losses[self.name] += loss
|
||||
if set_annotations:
|
||||
docs = [ex.doc for ex in examples]
|
||||
docs = [eg.predicted for eg in examples]
|
||||
self.set_annotations(docs, self._scores2guesses(tag_scores))
|
||||
|
||||
def rehearse(self, examples, drop=0., sgd=None, losses=None):
|
||||
"""Perform a 'rehearsal' update, where we try to match the output of
|
||||
an initial model.
|
||||
"""
|
||||
docs = [eg.predicted for eg in examples]
|
||||
if self._rehearsal_model is None:
|
||||
return
|
||||
examples = Example.to_example_objects(examples)
|
||||
docs = [ex.doc for ex in examples]
|
||||
if not any(len(doc) for doc in docs):
|
||||
# Handle cases where there are no tokens in any docs.
|
||||
return
|
||||
|
@ -387,7 +370,8 @@ class Tagger(Pipe):
|
|||
orig_tag_map = dict(self.vocab.morphology.tag_map)
|
||||
new_tag_map = {}
|
||||
for example in get_examples():
|
||||
for tag in example.token_annotation.tags:
|
||||
for token in example.y:
|
||||
tag = token.tag_
|
||||
if tag in orig_tag_map:
|
||||
new_tag_map[tag] = orig_tag_map[tag]
|
||||
else:
|
||||
|
@ -575,7 +559,7 @@ class SentenceRecognizer(Tagger):
|
|||
d_scores = scores - to_categorical(correct, n_classes=scores.shape[1])
|
||||
d_scores *= self.model.ops.asarray(known_labels)
|
||||
loss = (d_scores**2).sum()
|
||||
docs = [eg.doc for eg in examples]
|
||||
docs = [eg.predicted for eg in examples]
|
||||
d_scores = self.model.ops.unflatten(d_scores, [len(d) for d in docs])
|
||||
return float(loss), d_scores
|
||||
|
||||
|
@ -687,8 +671,8 @@ class MultitaskObjective(Tagger):
|
|||
gold_examples = nonproj.preprocess_training_data(get_examples())
|
||||
# for raw_text, doc_annot in gold_tuples:
|
||||
for example in gold_examples:
|
||||
for i in range(len(example.token_annotation.ids)):
|
||||
label = self.make_label(i, example.token_annotation)
|
||||
for token in example.y:
|
||||
label = self.make_label(token)
|
||||
if label is not None and label not in self.labels:
|
||||
self.labels[label] = len(self.labels)
|
||||
self.model.initialize()
|
||||
|
@ -706,11 +690,11 @@ class MultitaskObjective(Tagger):
|
|||
cdef int idx = 0
|
||||
correct = numpy.zeros((scores.shape[0],), dtype="i")
|
||||
guesses = scores.argmax(axis=1)
|
||||
docs = [ex.doc for ex in examples]
|
||||
docs = [eg.predicted for eg in examples]
|
||||
for i, eg in enumerate(examples):
|
||||
# Handles alignment for tokenization differences
|
||||
doc_annots = eg.get_aligned()
|
||||
for j in range(len(eg.doc)):
|
||||
for j in range(len(eg.predicted)):
|
||||
tok_annots = {key: values[j] for key, values in tok_annots.items()}
|
||||
label = self.make_label(j, tok_annots)
|
||||
if label is None or label not in self.labels:
|
||||
|
@ -724,83 +708,49 @@ class MultitaskObjective(Tagger):
|
|||
return float(loss), d_scores
|
||||
|
||||
@staticmethod
|
||||
def make_dep(i, token_annotation):
|
||||
if token_annotation.deps[i] is None or token_annotation.heads[i] is None:
|
||||
return None
|
||||
return token_annotation.deps[i]
|
||||
def make_dep(token):
|
||||
return token.dep_
|
||||
|
||||
@staticmethod
|
||||
def make_tag(i, token_annotation):
|
||||
return token_annotation.tags[i]
|
||||
def make_tag(token):
|
||||
return token.tag_
|
||||
|
||||
@staticmethod
|
||||
def make_ent(i, token_annotation):
|
||||
if token_annotation.entities is None:
|
||||
return None
|
||||
return token_annotation.entities[i]
|
||||
def make_ent(token):
|
||||
if token.ent_iob_ == "O":
|
||||
return "O"
|
||||
else:
|
||||
return token.ent_iob_ + "-" + token.ent_type_
|
||||
|
||||
@staticmethod
|
||||
def make_dep_tag_offset(i, token_annotation):
|
||||
if token_annotation.deps[i] is None or token_annotation.heads[i] is None:
|
||||
return None
|
||||
offset = token_annotation.heads[i] - i
|
||||
def make_dep_tag_offset(token):
|
||||
dep = token.dep_
|
||||
tag = token.tag_
|
||||
offset = token.head.i - token.i
|
||||
offset = min(offset, 2)
|
||||
offset = max(offset, -2)
|
||||
return f"{token_annotation.deps[i]}-{token_annotation.tags[i]}:{offset}"
|
||||
return f"{dep}-{tag}:{offset}"
|
||||
|
||||
@staticmethod
|
||||
def make_ent_tag(i, token_annotation):
|
||||
if token_annotation.entities is None or token_annotation.entities[i] is None:
|
||||
return None
|
||||
def make_ent_tag(token):
|
||||
if token.ent_iob_ == "O":
|
||||
ent = "O"
|
||||
else:
|
||||
return f"{token_annotation.tags[i]}-{token_annotation.entities[i]}"
|
||||
ent = token.ent_iob_ + "-" + token.ent_type_
|
||||
tag = token.tag_
|
||||
return f"{tag}-{ent}"
|
||||
|
||||
@staticmethod
|
||||
def make_sent_start(target, token_annotation, cache=True, _cache={}):
|
||||
def make_sent_start(token):
|
||||
"""A multi-task objective for representing sentence boundaries,
|
||||
using BILU scheme. (O is impossible)
|
||||
|
||||
The implementation of this method uses an internal cache that relies
|
||||
on the identity of the heads array, to avoid requiring a new piece
|
||||
of gold data. You can pass cache=False if you know the cache will
|
||||
do the wrong thing.
|
||||
"""
|
||||
words = token_annotation.words
|
||||
heads = token_annotation.heads
|
||||
assert len(words) == len(heads)
|
||||
assert target < len(words), (target, len(words))
|
||||
if cache:
|
||||
if id(heads) in _cache:
|
||||
return _cache[id(heads)][target]
|
||||
if token.is_sent_start and token.is_sent_end:
|
||||
return "U-SENT"
|
||||
elif token.is_sent_start:
|
||||
return "B-SENT"
|
||||
else:
|
||||
for key in list(_cache.keys()):
|
||||
_cache.pop(key)
|
||||
sent_tags = ["I-SENT"] * len(words)
|
||||
_cache[id(heads)] = sent_tags
|
||||
else:
|
||||
sent_tags = ["I-SENT"] * len(words)
|
||||
|
||||
def _find_root(child):
|
||||
seen = set([child])
|
||||
while child is not None and heads[child] != child:
|
||||
seen.add(child)
|
||||
child = heads[child]
|
||||
return child
|
||||
|
||||
sentences = {}
|
||||
for i in range(len(words)):
|
||||
root = _find_root(i)
|
||||
if root is None:
|
||||
sent_tags[i] = None
|
||||
else:
|
||||
sentences.setdefault(root, []).append(i)
|
||||
for root, span in sorted(sentences.items()):
|
||||
if len(span) == 1:
|
||||
sent_tags[span[0]] = "U-SENT"
|
||||
else:
|
||||
sent_tags[span[0]] = "B-SENT"
|
||||
sent_tags[span[-1]] = "L-SENT"
|
||||
return sent_tags[target]
|
||||
return "I-SENT"
|
||||
|
||||
|
||||
class ClozeMultitask(Pipe):
|
||||
|
@ -833,7 +783,7 @@ class ClozeMultitask(Pipe):
|
|||
# token.vector values, but that's a bit inefficient, especially on GPU.
|
||||
# Instead we fetch the index into the vectors table for each of our tokens,
|
||||
# and look them up all at once. This prevents data copying.
|
||||
ids = self.model.ops.flatten([ex.doc.to_array(ID).ravel() for ex in examples])
|
||||
ids = self.model.ops.flatten([eg.predicted.to_array(ID).ravel() for eg in examples])
|
||||
target = vectors[ids]
|
||||
gradient = self.distance.get_grad(prediction, target)
|
||||
loss = self.distance.get_loss(prediction, target)
|
||||
|
@ -843,11 +793,12 @@ class ClozeMultitask(Pipe):
|
|||
pass
|
||||
|
||||
def rehearse(self, examples, drop=0., sgd=None, losses=None):
|
||||
examples = Example.to_example_objects(examples)
|
||||
if losses is not None and self.name not in losses:
|
||||
losses[self.name] = 0.
|
||||
docs = [eg.predicted for eg in examples]
|
||||
set_dropout_rate(self.model, drop)
|
||||
predictions, bp_predictions = self.model.begin_update([ex.doc for ex in examples])
|
||||
predictions, bp_predictions = self.model.begin_update(
|
||||
[eg.predicted for eg in examples])
|
||||
loss, d_predictions = self.get_loss(examples, self.vocab.vectors.data, predictions)
|
||||
bp_predictions(d_predictions)
|
||||
if sgd is not None:
|
||||
|
@ -883,16 +834,9 @@ class TextCategorizer(Pipe):
|
|||
self.cfg["labels"] = tuple(value)
|
||||
|
||||
def pipe(self, stream, batch_size=128, n_threads=-1, as_example=False):
|
||||
for examples in util.minibatch(stream, size=batch_size):
|
||||
docs = [self._get_doc(ex) for ex in examples]
|
||||
for docs in util.minibatch(stream, size=batch_size):
|
||||
scores, tensors = self.predict(docs)
|
||||
self.set_annotations(docs, scores, tensors=tensors)
|
||||
|
||||
if as_example:
|
||||
for ex, doc in zip(examples, docs):
|
||||
ex.doc = doc
|
||||
yield ex
|
||||
else:
|
||||
yield from docs
|
||||
|
||||
def predict(self, docs):
|
||||
|
@ -914,12 +858,15 @@ class TextCategorizer(Pipe):
|
|||
doc.cats[label] = float(scores[i, j])
|
||||
|
||||
def update(self, examples, state=None, drop=0., set_annotations=False, sgd=None, losses=None):
|
||||
examples = Example.to_example_objects(examples)
|
||||
if not any(len(ex.doc) if ex.doc else 0 for ex in examples):
|
||||
for eg in examples:
|
||||
assert isinstance(eg, Example)
|
||||
if not any(len(eg.predicted) if eg.predicted else 0 for eg in examples):
|
||||
# Handle cases where there are no tokens in any docs.
|
||||
return
|
||||
set_dropout_rate(self.model, drop)
|
||||
scores, bp_scores = self.model.begin_update([ex.doc for ex in examples])
|
||||
scores, bp_scores = self.model.begin_update(
|
||||
[eg.predicted for eg in examples]
|
||||
)
|
||||
loss, d_scores = self.get_loss(examples, scores)
|
||||
bp_scores(d_scores)
|
||||
if sgd is not None:
|
||||
|
@ -928,14 +875,15 @@ class TextCategorizer(Pipe):
|
|||
losses.setdefault(self.name, 0.0)
|
||||
losses[self.name] += loss
|
||||
if set_annotations:
|
||||
docs = [ex.doc for ex in examples]
|
||||
docs = [eg.predicted for eg in examples]
|
||||
self.set_annotations(docs, scores=scores)
|
||||
|
||||
def rehearse(self, examples, drop=0., sgd=None, losses=None):
|
||||
if self._rehearsal_model is None:
|
||||
return
|
||||
examples = Example.to_example_objects(examples)
|
||||
docs=[ex.doc for ex in examples]
|
||||
for eg in examples:
|
||||
assert isinstance(eg, Example)
|
||||
docs = [eg.predicted for eg in examples]
|
||||
if not any(len(doc) for doc in docs):
|
||||
# Handle cases where there are no tokens in any docs.
|
||||
return
|
||||
|
@ -955,8 +903,8 @@ class TextCategorizer(Pipe):
|
|||
not_missing = numpy.ones((len(examples), len(self.labels)), dtype="f")
|
||||
for i, eg in enumerate(examples):
|
||||
for j, label in enumerate(self.labels):
|
||||
if label in eg.doc_annotation.cats:
|
||||
truths[i, j] = eg.doc_annotation.cats[label]
|
||||
if label in eg.predicted.cats:
|
||||
truths[i, j] = eg.reference.cats[label]
|
||||
else:
|
||||
not_missing[i, j] = 0.
|
||||
truths = self.model.ops.asarray(truths)
|
||||
|
@ -993,7 +941,7 @@ class TextCategorizer(Pipe):
|
|||
# TODO: begin_training is not guaranteed to see all data / labels ?
|
||||
examples = list(get_examples())
|
||||
for example in examples:
|
||||
for cat in example.doc_annotation.cats:
|
||||
for cat in example.y.cats:
|
||||
self.add_label(cat)
|
||||
self.require_labels()
|
||||
docs = [Doc(Vocab(), words=["hello"])]
|
||||
|
@ -1152,21 +1100,22 @@ class EntityLinker(Pipe):
|
|||
losses.setdefault(self.name, 0.0)
|
||||
if not examples:
|
||||
return 0
|
||||
examples = Example.to_example_objects(examples)
|
||||
for eg in examples:
|
||||
assert isinstance(eg, Example)
|
||||
sentence_docs = []
|
||||
docs = [ex.doc for ex in examples]
|
||||
docs = [eg.predicted for eg in examples]
|
||||
if set_annotations:
|
||||
# This seems simpler than other ways to get that exact output -- but
|
||||
# it does run the model twice :(
|
||||
predictions = self.model.predict(docs)
|
||||
|
||||
for eg in examples:
|
||||
doc = eg.doc
|
||||
doc = eg.predicted
|
||||
ents_by_offset = dict()
|
||||
for ent in doc.ents:
|
||||
ents_by_offset[(ent.start_char, ent.end_char)] = ent
|
||||
|
||||
for entity, kb_dict in eg.doc_annotation.links.items():
|
||||
links = self._get_links_from_doc(eg.reference)
|
||||
for entity, kb_dict in links.items():
|
||||
if isinstance(entity, str):
|
||||
entity = literal_eval(entity)
|
||||
start, end = entity
|
||||
|
@ -1204,7 +1153,8 @@ class EntityLinker(Pipe):
|
|||
def get_similarity_loss(self, examples, scores):
|
||||
entity_encodings = []
|
||||
for eg in examples:
|
||||
for entity, kb_dict in eg.doc_annotation.links.items():
|
||||
links = self._get_links_from_doc(eg.reference)
|
||||
for entity, kb_dict in links.items():
|
||||
for kb_id, value in kb_dict.items():
|
||||
# this loss function assumes we're only using positive examples
|
||||
if value:
|
||||
|
@ -1223,8 +1173,9 @@ class EntityLinker(Pipe):
|
|||
|
||||
def get_loss(self, examples, scores):
|
||||
cats = []
|
||||
for ex in examples:
|
||||
for entity, kb_dict in ex.doc_annotation.links.items():
|
||||
for eg in examples:
|
||||
links = self._get_links_from_doc(eg.reference)
|
||||
for entity, kb_dict in links.items():
|
||||
for kb_id, value in kb_dict.items():
|
||||
cats.append([value])
|
||||
|
||||
|
@ -1237,26 +1188,21 @@ class EntityLinker(Pipe):
|
|||
loss = loss / len(cats)
|
||||
return loss, d_scores
|
||||
|
||||
def __call__(self, example):
|
||||
doc = self._get_doc(example)
|
||||
def _get_links_from_doc(self, doc):
|
||||
return {}
|
||||
|
||||
def __call__(self, doc):
|
||||
kb_ids, tensors = self.predict([doc])
|
||||
self.set_annotations([doc], kb_ids, tensors=tensors)
|
||||
if isinstance(example, Example):
|
||||
example.doc = doc
|
||||
example.x = doc
|
||||
return example
|
||||
return doc
|
||||
|
||||
def pipe(self, stream, batch_size=128, n_threads=-1, as_example=False):
|
||||
for examples in util.minibatch(stream, size=batch_size):
|
||||
docs = [self._get_doc(ex) for ex in examples]
|
||||
for docs in util.minibatch(stream, size=batch_size):
|
||||
kb_ids, tensors = self.predict(docs)
|
||||
self.set_annotations(docs, kb_ids, tensors=tensors)
|
||||
|
||||
if as_example:
|
||||
for ex, doc in zip(examples, docs):
|
||||
ex.doc = doc
|
||||
yield ex
|
||||
else:
|
||||
yield from docs
|
||||
|
||||
def predict(self, docs):
|
||||
|
@ -1433,7 +1379,7 @@ class Sentencizer(Pipe):
|
|||
):
|
||||
pass
|
||||
|
||||
def __call__(self, example):
|
||||
def __call__(self, doc):
|
||||
"""Apply the sentencizer to a Doc and set Token.is_sent_start.
|
||||
|
||||
example (Doc or Example): The document to process.
|
||||
|
@ -1441,7 +1387,6 @@ class Sentencizer(Pipe):
|
|||
|
||||
DOCS: https://spacy.io/api/sentencizer#call
|
||||
"""
|
||||
doc = self._get_doc(example)
|
||||
start = 0
|
||||
seen_period = False
|
||||
for i, token in enumerate(doc):
|
||||
|
@ -1460,20 +1405,14 @@ class Sentencizer(Pipe):
|
|||
return example
|
||||
return doc
|
||||
|
||||
def pipe(self, stream, batch_size=128, n_threads=-1, as_example=False):
|
||||
for examples in util.minibatch(stream, size=batch_size):
|
||||
docs = [self._get_doc(ex) for ex in examples]
|
||||
def pipe(self, stream, batch_size=128, n_threads=-1):
|
||||
for docs in util.minibatch(stream, size=batch_size):
|
||||
predictions = self.predict(docs)
|
||||
if isinstance(predictions, tuple) and len(tuple) == 2:
|
||||
scores, tensors = predictions
|
||||
self.set_annotations(docs, scores, tensors=tensors)
|
||||
else:
|
||||
self.set_annotations(docs, predictions)
|
||||
if as_example:
|
||||
for ex, doc in zip(examples, docs):
|
||||
ex.doc = doc
|
||||
yield ex
|
||||
else:
|
||||
yield from docs
|
||||
|
||||
def predict(self, docs):
|
||||
|
|
Loading…
Reference in New Issue
Block a user