Remove the 'pass example into __call__' thing

Matthew Honnibal 2020-06-09 23:30:06 +02:00
parent b3868cd1f8
commit 0714f1fa5c


@@ -20,7 +20,7 @@ from .defaults import default_nel, default_senter
 from .functions import merge_subtokens
 from ..language import Language, component
 from ..syntax import nonproj
-from ..gold import Example
+from ..gold.new_example import NewExample as Example
 from ..attrs import POS, ID
 from ..util import link_vectors_to_models, create_default_optimizer
 from ..parts_of_speech import X
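
The import swap above points `Example` at the new `NewExample` class. Judging from the usages later in this diff, it pairs a predicted `Doc` with a reference (gold) `Doc`, exposed as `predicted`/`x` and `reference`/`y`. A rough orientation-only sketch of that surface — the real class lives in `spacy.gold.new_example` on this branch and carries much more (alignment, `get_aligned()`, etc.):

    # Orientation-only sketch of the NewExample surface used in this diff;
    # not the actual implementation.
    from spacy.vocab import Vocab
    from spacy.tokens import Doc

    class ExampleSketch:
        def __init__(self, predicted, reference):
            self.predicted = predicted  # the Doc the pipeline annotates (alias: x)
            self.reference = reference  # the gold-standard Doc (alias: y)

        @property
        def x(self):
            return self.predicted

        @property
        def y(self):
            return self.reference

    words = ["I", "like", "cats"]
    vocab = Vocab()
    eg = ExampleSketch(Doc(vocab, words=words), Doc(vocab, words=words))
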
@@ -48,12 +48,6 @@ class Pipe(object):
     def from_nlp(cls, nlp, model, **cfg):
         return cls(nlp.vocab, model, **cfg)

-    def _get_doc(self, example):
-        """ Use this method if the `example` can be both a Doc or an Example """
-        if isinstance(example, Doc):
-            return example
-        return example.doc
-
     def __init__(self, vocab, model, **cfg):
        """Create a new pipe instance."""
        raise NotImplementedError
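
With `_get_doc()` removed, components no longer sniff whether they were handed a `Doc` or an `Example`; callers that hold an `Example` unwrap it themselves. A hedged sketch of the resulting call-site convention — `component` and `eg` are illustrative stand-ins, not objects in this file:

    # Sketch of the convention after this change: pass Docs to components,
    # and keep Example pairs intact in training code.
    def annotate_example(component, eg):
        eg.predicted = component(eg.predicted)
        return eg
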
@@ -73,18 +67,17 @@ class Pipe(object):
         else:
             self.set_annotations([doc], predictions)
         if isinstance(example, Example):
-            example.doc = doc
+            example.predicted = doc
             return example
         return doc

-    def pipe(self, stream, batch_size=128, n_threads=-1, as_example=False):
+    def pipe(self, stream, batch_size=128, n_threads=-1):
         """Apply the pipe to a stream of documents.

         Both __call__ and pipe should delegate to the `predict()`
         and `set_annotations()` methods.
         """
         for examples in util.minibatch(stream, size=batch_size):
-            docs = [self._get_doc(ex) for ex in examples]
             predictions = self.predict(docs)
             if isinstance(predictions, tuple) and len(tuple) == 2:
                 scores, tensors = predictions
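
The docstring above spells out the contract that both `__call__` and `pipe()` should delegate to `predict()` and `set_annotations()`. A minimal self-contained sketch of a component honoring that contract — `MyPipe`, `my_model`, and the score handling are invented for illustration:

    # Minimal sketch of the predict()/set_annotations() delegation contract.
    from spacy import util

    class MyPipe:
        name = "my_pipe"

        def __init__(self, model):
            self.model = model  # any callable mapping docs -> scores

        def __call__(self, doc):
            scores = self.predict([doc])
            self.set_annotations([doc], scores)
            return doc

        def pipe(self, stream, batch_size=128):
            for docs in util.minibatch(stream, size=batch_size):
                scores = self.predict(docs)
                self.set_annotations(docs, scores)
                yield from docs

        def predict(self, docs):
            # Run the statistical model without mutating the docs.
            return self.model(docs)

        def set_annotations(self, docs, scores):
            # Write the predictions back onto the docs.
            for doc, doc_scores in zip(docs, scores):
                doc.user_data[self.name] = doc_scores
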
@@ -94,7 +87,7 @@ class Pipe(object):
             if as_example:
                 for ex, doc in zip(examples, docs):
-                    ex.doc = doc
+                    ex.predicted = doc
                     yield ex
             else:
                 yield from docs
@@ -116,7 +109,6 @@ class Pipe(object):
         Delegates to predict() and get_loss().
         """
         if set_annotations:
-            docs = (self._get_doc(ex) for ex in examples)
             docs = list(self.pipe(docs))

     def rehearse(self, examples, sgd=None, losses=None, **config):
@@ -256,28 +248,18 @@ class Tagger(Pipe):
         return tuple(self.vocab.morphology.tag_names)

     def __call__(self, example):
-        doc = self._get_doc(example)
         tags = self.predict([doc])
         self.set_annotations([doc], tags)
         if isinstance(example, Example):
-            example.doc = doc
+            example.predicted = doc
             return example
         return doc

     def pipe(self, stream, batch_size=128, n_threads=-1, as_example=False):
-        for examples in util.minibatch(stream, size=batch_size):
-            docs = [self._get_doc(ex) for ex in examples]
+        for docs in util.minibatch(stream, size=batch_size):
             tag_ids = self.predict(docs)
-            assert len(docs) == len(examples)
-            assert len(tag_ids) == len(examples)
             self.set_annotations(docs, tag_ids)
-            if as_example:
-                for ex, doc in zip(examples, docs):
-                    ex.doc = doc
-                    yield ex
-            else:
-                yield from docs
+            yield from docs

     def predict(self, docs):
         if not any(len(doc) for doc in docs):
@@ -327,15 +309,17 @@ class Tagger(Pipe):
         doc.is_tagged = True

     def update(self, examples, drop=0., sgd=None, losses=None, set_annotations=False):
-        examples = Example.to_example_objects(examples)
+        for eg in examples:
+            assert isinstance(eg, Example)
         if losses is not None and self.name not in losses:
             losses[self.name] = 0.

-        if not any(len(ex.doc) if ex.doc else 0 for ex in examples):
+        if not any(len(eg.predicted) if eg.predicted else 0 for eg in examples):
             # Handle cases where there are no tokens in any docs.
             return
         set_dropout_rate(self.model, drop)
-        tag_scores, bp_tag_scores = self.model.begin_update([ex.doc for ex in examples])
+        tag_scores, bp_tag_scores = self.model.begin_update(
+            [eg.predicted for eg in examples])
         for sc in tag_scores:
             if self.model.ops.xp.isnan(sc.sum()):
                 raise ValueError("nan value in scores")
@@ -347,17 +331,16 @@ class Tagger(Pipe):
         if losses is not None:
             losses[self.name] += loss
         if set_annotations:
-            docs = [ex.doc for ex in examples]
+            docs = [eg.predicted for eg in examples]
             self.set_annotations(docs, self._scores2guesses(tag_scores))

     def rehearse(self, examples, drop=0., sgd=None, losses=None):
         """Perform a 'rehearsal' update, where we try to match the output of
         an initial model.
         """
+        docs = [eg.predicted for eg in examples]
         if self._rehearsal_model is None:
             return
-        examples = Example.to_example_objects(examples)
-        docs = [ex.doc for ex in examples]
         if not any(len(doc) for doc in docs):
             # Handle cases where there are no tokens in any docs.
             return
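
The docstring above describes rehearsal: the current model is trained to match the output of a frozen copy of the initial model, limiting catastrophic forgetting while updating on new data. A minimal sketch of that idea — the function name and the squared-error form are illustrative, not this file's exact loss:

    # Rehearsal in miniature: nudge current outputs back toward the initial
    # model's outputs. This is the gradient of 0.5 * ||current - initial||^2.
    import numpy

    def rehearsal_gradient(current_scores, initial_scores):
        return current_scores - initial_scores

    current = numpy.array([0.7, 0.1, 0.2], dtype="f")
    initial = numpy.array([0.6, 0.2, 0.2], dtype="f")
    d_scores = rehearsal_gradient(current, initial)
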
@@ -387,7 +370,8 @@ class Tagger(Pipe):
         orig_tag_map = dict(self.vocab.morphology.tag_map)
         new_tag_map = {}
         for example in get_examples():
-            for tag in example.token_annotation.tags:
+            for token in example.y:
+                tag = token.tag_
                 if tag in orig_tag_map:
                     new_tag_map[tag] = orig_tag_map[tag]
                 else:
@@ -575,7 +559,7 @@ class SentenceRecognizer(Tagger):
         d_scores = scores - to_categorical(correct, n_classes=scores.shape[1])
         d_scores *= self.model.ops.asarray(known_labels)
         loss = (d_scores**2).sum()
-        docs = [eg.doc for eg in examples]
+        docs = [eg.predicted for eg in examples]
         d_scores = self.model.ops.unflatten(d_scores, [len(d) for d in docs])
         return float(loss), d_scores
@@ -687,8 +671,8 @@ class MultitaskObjective(Tagger):
         gold_examples = nonproj.preprocess_training_data(get_examples())
         # for raw_text, doc_annot in gold_tuples:
         for example in gold_examples:
-            for i in range(len(example.token_annotation.ids)):
-                label = self.make_label(i, example.token_annotation)
+            for token in example.y:
+                label = self.make_label(token)
                 if label is not None and label not in self.labels:
                     self.labels[label] = len(self.labels)
         self.model.initialize()
@@ -706,11 +690,11 @@ class MultitaskObjective(Tagger):
         cdef int idx = 0
         correct = numpy.zeros((scores.shape[0],), dtype="i")
         guesses = scores.argmax(axis=1)
-        docs = [ex.doc for ex in examples]
+        docs = [eg.predicted for eg in examples]
         for i, eg in enumerate(examples):
             # Handles alignment for tokenization differences
-            for j in range(len(eg.doc)):
+            doc_annots = eg.get_aligned()
+            for j in range(len(eg.predicted)):
                 tok_annots = {key: values[j] for key, values in tok_annots.items()}
                 label = self.make_label(j, tok_annots)
                 if label is None or label not in self.labels:
@@ -724,83 +708,49 @@ class MultitaskObjective(Tagger):
         return float(loss), d_scores

     @staticmethod
-    def make_dep(i, token_annotation):
-        if token_annotation.deps[i] is None or token_annotation.heads[i] is None:
-            return None
-        return token_annotation.deps[i]
+    def make_dep(token):
+        return token.dep_

     @staticmethod
-    def make_tag(i, token_annotation):
-        return token_annotation.tags[i]
+    def make_tag(token):
+        return token.tag_

     @staticmethod
-    def make_ent(i, token_annotation):
-        if token_annotation.entities is None:
-            return None
-        return token_annotation.entities[i]
+    def make_ent(token):
+        if token.ent_iob_ == "O":
+            return "O"
+        else:
+            return token.ent_iob_ + "-" + token.ent_type_

     @staticmethod
-    def make_dep_tag_offset(i, token_annotation):
-        if token_annotation.deps[i] is None or token_annotation.heads[i] is None:
-            return None
-        offset = token_annotation.heads[i] - i
+    def make_dep_tag_offset(token):
+        dep = token.dep_
+        tag = token.tag_
+        offset = token.head.i - token.i
         offset = min(offset, 2)
         offset = max(offset, -2)
-        return f"{token_annotation.deps[i]}-{token_annotation.tags[i]}:{offset}"
+        return f"{dep}-{tag}:{offset}"

     @staticmethod
-    def make_ent_tag(i, token_annotation):
-        if token_annotation.entities is None or token_annotation.entities[i] is None:
-            return None
+    def make_ent_tag(token):
+        if token.ent_iob_ == "O":
+            ent = "O"
         else:
-            return f"{token_annotation.tags[i]}-{token_annotation.entities[i]}"
+            ent = token.ent_iob_ + "-" + token.ent_type_
+        tag = token.tag_
+        return f"{tag}-{ent}"

     @staticmethod
-    def make_sent_start(target, token_annotation, cache=True, _cache={}):
+    def make_sent_start(token):
         """A multi-task objective for representing sentence boundaries,
         using BILU scheme. (O is impossible)
-
-        The implementation of this method uses an internal cache that relies
-        on the identity of the heads array, to avoid requiring a new piece
-        of gold data. You can pass cache=False if you know the cache will
-        do the wrong thing.
         """
-        words = token_annotation.words
-        heads = token_annotation.heads
-        assert len(words) == len(heads)
-        assert target < len(words), (target, len(words))
-        if cache:
-            if id(heads) in _cache:
-                return _cache[id(heads)][target]
-            else:
-                for key in list(_cache.keys()):
-                    _cache.pop(key)
-                sent_tags = ["I-SENT"] * len(words)
-                _cache[id(heads)] = sent_tags
-        else:
-            sent_tags = ["I-SENT"] * len(words)
-
-        def _find_root(child):
-            seen = set([child])
-            while child is not None and heads[child] != child:
-                seen.add(child)
-                child = heads[child]
-            return child
-
-        sentences = {}
-        for i in range(len(words)):
-            root = _find_root(i)
-            if root is None:
-                sent_tags[i] = None
-            else:
-                sentences.setdefault(root, []).append(i)
-        for root, span in sorted(sentences.items()):
-            if len(span) == 1:
-                sent_tags[span[0]] = "U-SENT"
-            else:
-                sent_tags[span[0]] = "B-SENT"
-                sent_tags[span[-1]] = "L-SENT"
-        return sent_tags[target]
+        if token.is_sent_start and token.is_sent_end:
+            return "U-SENT"
+        elif token.is_sent_start:
+            return "B-SENT"
+        else:
+            return "I-SENT"


 class ClozeMultitask(Pipe):
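
The rewritten `make_sent_start` above derives the BILU sentence label directly from token attributes. A worked illustration, assuming a spaCy build (as on this development branch) whose `Doc` constructor accepts `sent_starts`:

    # Worked example of the token-based BILU scheme: a single-token sentence
    # would be "U-SENT"; otherwise the first token is "B-SENT" and the rest
    # "I-SENT" (note this version emits no trailing "L-SENT").
    from spacy.vocab import Vocab
    from spacy.tokens import Doc

    def make_sent_start(token):
        if token.is_sent_start and token.is_sent_end:
            return "U-SENT"
        elif token.is_sent_start:
            return "B-SENT"
        else:
            return "I-SENT"

    doc = Doc(Vocab(), words=["Hi", ".", "I", "like", "cats", "."],
              sent_starts=[True, False, True, False, False, False])
    print([make_sent_start(t) for t in doc])
    # ['B-SENT', 'I-SENT', 'B-SENT', 'I-SENT', 'I-SENT', 'I-SENT']
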
@@ -833,7 +783,7 @@ class ClozeMultitask(Pipe):
         # token.vector values, but that's a bit inefficient, especially on GPU.
         # Instead we fetch the index into the vectors table for each of our tokens,
         # and look them up all at once. This prevents data copying.
-        ids = self.model.ops.flatten([ex.doc.to_array(ID).ravel() for ex in examples])
+        ids = self.model.ops.flatten([eg.predicted.to_array(ID).ravel() for eg in examples])
         target = vectors[ids]
         gradient = self.distance.get_grad(prediction, target)
         loss = self.distance.get_loss(prediction, target)
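
The comment above describes fetching row IDs and gathering the vectors in a single indexing operation instead of copying each token's vector. The same trick in plain numpy, with invented toy values:

    # Gather rows from a vectors table by integer ID in one fancy-indexing
    # call; no per-token copies are made before the lookup.
    import numpy

    vectors = numpy.arange(12, dtype="f").reshape(4, 3)  # 4 vectors of width 3
    ids = numpy.array([2, 0, 2, 3])                      # one row ID per token
    target = vectors[ids]                                # shape (4, 3), one shot
    assert target.shape == (4, 3)
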
@@ -843,11 +793,12 @@ class ClozeMultitask(Pipe):
         pass

     def rehearse(self, examples, drop=0., sgd=None, losses=None):
-        examples = Example.to_example_objects(examples)
         if losses is not None and self.name not in losses:
             losses[self.name] = 0.
+        docs = [eg.predicted for eg in examples]
         set_dropout_rate(self.model, drop)
-        predictions, bp_predictions = self.model.begin_update([ex.doc for ex in examples])
+        predictions, bp_predictions = self.model.begin_update(
+            [eg.predicted for eg in examples])
         loss, d_predictions = self.get_loss(examples, self.vocab.vectors.data, predictions)
         bp_predictions(d_predictions)
         if sgd is not None:
@@ -883,17 +834,10 @@ class TextCategorizer(Pipe):
             self.cfg["labels"] = tuple(value)

     def pipe(self, stream, batch_size=128, n_threads=-1, as_example=False):
-        for examples in util.minibatch(stream, size=batch_size):
-            docs = [self._get_doc(ex) for ex in examples]
+        for docs in util.minibatch(stream, size=batch_size):
             scores, tensors = self.predict(docs)
             self.set_annotations(docs, scores, tensors=tensors)
-            if as_example:
-                for ex, doc in zip(examples, docs):
-                    ex.doc = doc
-                    yield ex
-            else:
-                yield from docs
+            yield from docs

     def predict(self, docs):
         tensors = [doc.tensor for doc in docs]
@@ -914,12 +858,15 @@ class TextCategorizer(Pipe):
                 doc.cats[label] = float(scores[i, j])

     def update(self, examples, state=None, drop=0., set_annotations=False, sgd=None, losses=None):
-        examples = Example.to_example_objects(examples)
-        if not any(len(ex.doc) if ex.doc else 0 for ex in examples):
+        for eg in examples:
+            assert isinstance(eg, Example)
+        if not any(len(eg.predicted) if eg.predicted else 0 for eg in examples):
             # Handle cases where there are no tokens in any docs.
             return
         set_dropout_rate(self.model, drop)
-        scores, bp_scores = self.model.begin_update([ex.doc for ex in examples])
+        scores, bp_scores = self.model.begin_update(
+            [eg.predicted for eg in examples]
+        )
         loss, d_scores = self.get_loss(examples, scores)
         bp_scores(d_scores)
         if sgd is not None:
@@ -928,14 +875,15 @@ class TextCategorizer(Pipe):
             losses.setdefault(self.name, 0.0)
             losses[self.name] += loss
         if set_annotations:
-            docs = [ex.doc for ex in examples]
+            docs = [eg.predicted for eg in examples]
             self.set_annotations(docs, scores=scores)

     def rehearse(self, examples, drop=0., sgd=None, losses=None):
         if self._rehearsal_model is None:
             return
-        examples = Example.to_example_objects(examples)
-        docs=[ex.doc for ex in examples]
+        for eg in examples:
+            assert isinstance(eg, Example)
+        docs = [eg.predicted for eg in examples]
         if not any(len(doc) for doc in docs):
             # Handle cases where there are no tokens in any docs.
             return
@@ -955,8 +903,8 @@ class TextCategorizer(Pipe):
         not_missing = numpy.ones((len(examples), len(self.labels)), dtype="f")
         for i, eg in enumerate(examples):
             for j, label in enumerate(self.labels):
-                if label in eg.doc_annotation.cats:
-                    truths[i, j] = eg.doc_annotation.cats[label]
+                if label in eg.predicted.cats:
+                    truths[i, j] = eg.reference.cats[label]
                 else:
                     not_missing[i, j] = 0.
         truths = self.model.ops.asarray(truths)
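
The `not_missing` array built above masks labels the gold annotation says nothing about, so they contribute no learning signal. A toy numpy sketch of that masking — all values are invented, and the exact gradient formula here is illustrative rather than this file's full loss:

    # Wherever the gold cats omit a label, not_missing is 0 and the gradient
    # is zeroed: the label is treated as missing, not as negative.
    import numpy

    truths = numpy.array([[1.0, 0.0],
                          [0.0, 0.0]], dtype="f")
    not_missing = numpy.array([[1.0, 1.0],
                               [1.0, 0.0]], dtype="f")  # 2nd label missing in eg 2
    scores = numpy.array([[0.9, 0.2],
                          [0.4, 0.8]], dtype="f")
    d_scores = (scores - truths) / scores.shape[0]
    d_scores *= not_missing  # no gradient from missing annotations
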
@@ -993,7 +941,7 @@ class TextCategorizer(Pipe):
         # TODO: begin_training is not guaranteed to see all data / labels ?
         examples = list(get_examples())
         for example in examples:
-            for cat in example.doc_annotation.cats:
+            for cat in example.y.cats:
                 self.add_label(cat)
         self.require_labels()
         docs = [Doc(Vocab(), words=["hello"])]
@@ -1152,21 +1100,22 @@ class EntityLinker(Pipe):
             losses.setdefault(self.name, 0.0)
         if not examples:
             return 0
-        examples = Example.to_example_objects(examples)
+        for eg in examples:
+            assert isinstance(eg, Example)
         sentence_docs = []
-        docs = [ex.doc for ex in examples]
+        docs = [eg.predicted for eg in examples]
         if set_annotations:
             # This seems simpler than other ways to get that exact output -- but
             # it does run the model twice :(
             predictions = self.model.predict(docs)
         for eg in examples:
-            doc = eg.doc
+            doc = eg.predicted
             ents_by_offset = dict()
             for ent in doc.ents:
                 ents_by_offset[(ent.start_char, ent.end_char)] = ent
-            for entity, kb_dict in eg.doc_annotation.links.items():
+            links = self._get_links_from_doc(eg.reference)
+            for entity, kb_dict in links.items():
                 if isinstance(entity, str):
                     entity = literal_eval(entity)
                 start, end = entity
@@ -1204,7 +1153,8 @@ class EntityLinker(Pipe):
     def get_similarity_loss(self, examples, scores):
         entity_encodings = []
         for eg in examples:
-            for entity, kb_dict in eg.doc_annotation.links.items():
+            links = self._get_links_from_doc(eg.reference)
+            for entity, kb_dict in links.items():
                 for kb_id, value in kb_dict.items():
                     # this loss function assumes we're only using positive examples
                     if value:
@@ -1223,8 +1173,9 @@ class EntityLinker(Pipe):
     def get_loss(self, examples, scores):
         cats = []
-        for ex in examples:
-            for entity, kb_dict in ex.doc_annotation.links.items():
+        for eg in examples:
+            links = self._get_links_from_doc(eg.reference)
+            for entity, kb_dict in links.items():
                 for kb_id, value in kb_dict.items():
                     cats.append([value])
@@ -1237,27 +1188,22 @@ class EntityLinker(Pipe):
         loss = loss / len(cats)
         return loss, d_scores

-    def __call__(self, example):
-        doc = self._get_doc(example)
+    def _get_links_from_doc(self, doc):
+        return {}
+
+    def __call__(self, doc):
         kb_ids, tensors = self.predict([doc])
         self.set_annotations([doc], kb_ids, tensors=tensors)
         if isinstance(example, Example):
-            example.doc = doc
+            example.x = doc
             return example
         return doc

     def pipe(self, stream, batch_size=128, n_threads=-1, as_example=False):
-        for examples in util.minibatch(stream, size=batch_size):
-            docs = [self._get_doc(ex) for ex in examples]
+        for docs in util.minibatch(stream, size=batch_size):
             kb_ids, tensors = self.predict(docs)
             self.set_annotations(docs, kb_ids, tensors=tensors)
-            if as_example:
-                for ex, doc in zip(examples, docs):
-                    ex.doc = doc
-                    yield ex
-            else:
-                yield from docs
+            yield from docs

     def predict(self, docs):
         """ Return the KB IDs for each entity in each doc, including NIL if there is no prediction """
@@ -1433,7 +1379,7 @@ class Sentencizer(Pipe):
     ):
         pass

-    def __call__(self, example):
+    def __call__(self, doc):
         """Apply the sentencizer to a Doc and set Token.is_sent_start.

         example (Doc or Example): The document to process.
@@ -1441,7 +1387,6 @@ class Sentencizer(Pipe):
         DOCS: https://spacy.io/api/sentencizer#call
         """
-        doc = self._get_doc(example)
         start = 0
         seen_period = False
         for i, token in enumerate(doc):
@@ -1460,21 +1405,15 @@ class Sentencizer(Pipe):
             return example
         return doc

-    def pipe(self, stream, batch_size=128, n_threads=-1, as_example=False):
-        for examples in util.minibatch(stream, size=batch_size):
-            docs = [self._get_doc(ex) for ex in examples]
+    def pipe(self, stream, batch_size=128, n_threads=-1):
+        for docs in util.minibatch(stream, size=batch_size):
             predictions = self.predict(docs)
             if isinstance(predictions, tuple) and len(tuple) == 2:
                 scores, tensors = predictions
                 self.set_annotations(docs, scores, tensors=tensors)
             else:
                 self.set_annotations(docs, predictions)
-            if as_example:
-                for ex, doc in zip(examples, docs):
-                    ex.doc = doc
-                    yield ex
-            else:
-                yield from docs
+            yield from docs

     def predict(self, docs):
         """Apply the pipeline's model to a batch of docs, without