Mirror of https://github.com/explosion/spaCy.git (synced 2025-02-06 06:30:35 +03:00)
Remove the 'pass example into __call__' thing

Commit: 0714f1fa5c
Parent: b3868cd1f8
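
This commit, per the message above and the hunks below, moves the components in this file from the old gold-standard Example class to the new one (imported via `from ..gold.new_example import NewExample as Example`). The annotated and gold documents are exposed as `.predicted` and `.reference` (the diff also uses the shorter `.x` and `.y` for the same objects), and several `__call__`/`pipe` methods move toward taking and yielding plain `Doc` objects. A minimal, self-contained sketch of that convention, using toy stand-in classes rather than spaCy's own (only the attribute names mirror the diff):

# Toy stand-ins; the real spaCy objects are much richer. Only .predicted /
# .reference and the Doc-in/Doc-out __call__ mirror the diff below.
class Doc:
    def __init__(self, words):
        self.words = words

class Example:
    def __init__(self, predicted, reference):
        self.predicted = predicted   # the Doc the pipeline annotates (also .x)
        self.reference = reference   # the gold-standard Doc (also .y)

class DummyPipe:
    def __call__(self, doc):
        # annotate in place and hand the same Doc back; no Example wrapping
        return doc

eg = Example(Doc(["Hello", "world"]), Doc(["Hello", "world"]))
doc = DummyPipe()(eg.predicted)   # training code unwraps the Example itself
print(doc.words)
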
@@ -20,7 +20,7 @@ from .defaults import default_nel, default_senter
from .functions import merge_subtokens
from ..language import Language, component
from ..syntax import nonproj
from ..gold import Example
from ..gold.new_example import NewExample as Example
from ..attrs import POS, ID
from ..util import link_vectors_to_models, create_default_optimizer
from ..parts_of_speech import X

@@ -48,12 +48,6 @@ class Pipe(object):
    def from_nlp(cls, nlp, model, **cfg):
        return cls(nlp.vocab, model, **cfg)

    def _get_doc(self, example):
        """ Use this method if the `example` can be both a Doc or an Example """
        if isinstance(example, Doc):
            return example
        return example.doc

    def __init__(self, vocab, model, **cfg):
        """Create a new pipe instance."""
        raise NotImplementedError

@@ -73,18 +67,17 @@ class Pipe(object):
        else:
            self.set_annotations([doc], predictions)
        if isinstance(example, Example):
            example.doc = doc
            example.predicted = doc
            return example
        return doc

    def pipe(self, stream, batch_size=128, n_threads=-1, as_example=False):
    def pipe(self, stream, batch_size=128, n_threads=-1):
        """Apply the pipe to a stream of documents.

        Both __call__ and pipe should delegate to the `predict()`
        and `set_annotations()` methods.
        """
        for examples in util.minibatch(stream, size=batch_size):
            docs = [self._get_doc(ex) for ex in examples]
            predictions = self.predict(docs)
            if isinstance(predictions, tuple) and len(tuple) == 2:
                scores, tensors = predictions

@@ -94,7 +87,7 @@ class Pipe(object):

            if as_example:
                for ex, doc in zip(examples, docs):
                    ex.doc = doc
                    ex.predicted = doc
                    yield ex
            else:
                yield from docs

@@ -116,7 +109,6 @@ class Pipe(object):
        Delegates to predict() and get_loss().
        """
        if set_annotations:
            docs = (self._get_doc(ex) for ex in examples)
            docs = list(self.pipe(docs))

    def rehearse(self, examples, sgd=None, losses=None, **config):
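
As the `Pipe.pipe` docstring above says, `__call__` and `pipe` should delegate to `predict()` and `set_annotations()`. A rough, self-contained sketch of that split with a hypothetical component (not spaCy's actual base class):

class LengthTagger:
    # Hypothetical component illustrating the predict()/set_annotations()
    # split described in the docstring above.
    def predict(self, docs):
        # pure inference: compute predictions without touching the docs
        return [len(doc["words"]) for doc in docs]

    def set_annotations(self, docs, predictions):
        # all mutation of the docs happens in one place
        for doc, n_tokens in zip(docs, predictions):
            doc["n_tokens"] = n_tokens

    def __call__(self, doc):
        self.set_annotations([doc], self.predict([doc]))
        return doc

doc = LengthTagger()({"words": ["This", "is", "a", "doc"]})
print(doc["n_tokens"])   # 4
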
@@ -256,28 +248,18 @@ class Tagger(Pipe):
        return tuple(self.vocab.morphology.tag_names)

    def __call__(self, example):
        doc = self._get_doc(example)
        tags = self.predict([doc])
        self.set_annotations([doc], tags)
        if isinstance(example, Example):
            example.doc = doc
            example.predicted = doc
            return example
        return doc

    def pipe(self, stream, batch_size=128, n_threads=-1, as_example=False):
        for examples in util.minibatch(stream, size=batch_size):
            docs = [self._get_doc(ex) for ex in examples]
        for docs in util.minibatch(stream, size=batch_size):
            tag_ids = self.predict(docs)
            assert len(docs) == len(examples)
            assert len(tag_ids) == len(examples)
            self.set_annotations(docs, tag_ids)

            if as_example:
                for ex, doc in zip(examples, docs):
                    ex.doc = doc
                    yield ex
            else:
                yield from docs
            yield from docs

    def predict(self, docs):
        if not any(len(doc) for doc in docs):
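
The `pipe()` methods throughout this diff batch the incoming stream with `util.minibatch` before predicting. A rough stand-in for that helper, assuming it simply groups an iterable into lists of at most `size` items (the real spaCy utility also supports variable batch sizes):

from itertools import islice

def minibatch(stream, size=128):
    # assumed behaviour: yield consecutive lists of up to `size` items
    it = iter(stream)
    while True:
        batch = list(islice(it, size))
        if not batch:
            return
        yield batch

for docs in minibatch(range(10), size=4):
    print(docs)   # [0, 1, 2, 3], then [4, 5, 6, 7], then [8, 9]
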
@@ -327,15 +309,17 @@ class Tagger(Pipe):
            doc.is_tagged = True

    def update(self, examples, drop=0., sgd=None, losses=None, set_annotations=False):
        examples = Example.to_example_objects(examples)
        for eg in examples:
            assert isinstance(eg, Example)
        if losses is not None and self.name not in losses:
            losses[self.name] = 0.

        if not any(len(ex.doc) if ex.doc else 0 for ex in examples):
        if not any(len(eg.predicted) if eg.predicted else 0 for eg in examples):
            # Handle cases where there are no tokens in any docs.
            return
        set_dropout_rate(self.model, drop)
        tag_scores, bp_tag_scores = self.model.begin_update([ex.doc for ex in examples])
        tag_scores, bp_tag_scores = self.model.begin_update(
            [eg.predicted for eg in examples])
        for sc in tag_scores:
            if self.model.ops.xp.isnan(sc.sum()):
                raise ValueError("nan value in scores")

@@ -347,17 +331,16 @@ class Tagger(Pipe):
        if losses is not None:
            losses[self.name] += loss
        if set_annotations:
            docs = [ex.doc for ex in examples]
            docs = [eg.predicted for eg in examples]
            self.set_annotations(docs, self._scores2guesses(tag_scores))

    def rehearse(self, examples, drop=0., sgd=None, losses=None):
        """Perform a 'rehearsal' update, where we try to match the output of
        an initial model.
        """
        docs = [eg.predicted for eg in examples]
        if self._rehearsal_model is None:
            return
        examples = Example.to_example_objects(examples)
        docs = [ex.doc for ex in examples]
        if not any(len(doc) for doc in docs):
            # Handle cases where there are no tokens in any docs.
            return
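
The `update()` pattern repeated in this diff is: unwrap the predicted Docs from the examples, call the model's `begin_update`, turn the gold annotations into a gradient, backpropagate, and accumulate the loss under the component's name. A compressed, self-contained sketch of that flow; the dummy model and the toy loss are assumptions, and only the `eg.predicted`/`eg.reference` unwrapping and the overall shape mirror the diff:

from collections import namedtuple

Example = namedtuple("Example", ["predicted", "reference"])

class DummyModel:
    # stand-in for the Thinc model: scores each doc by its length
    def begin_update(self, docs):
        scores = [float(len(doc)) for doc in docs]
        def backprop(d_scores):
            return d_scores              # pretend to push gradients through
        return scores, backprop

def update(examples, model, losses):
    docs = [eg.predicted for eg in examples]              # unwrap the Docs
    scores, backprop = model.begin_update(docs)
    targets = [float(len(eg.reference)) for eg in examples]
    d_scores = [s - t for s, t in zip(scores, targets)]   # toy gradient
    backprop(d_scores)
    losses["tagger"] = losses.get("tagger", 0.0) + sum(d * d for d in d_scores)
    return losses

examples = [Example(["a", "b"], ["a", "b", "c"])]
print(update(examples, DummyModel(), {}))   # {'tagger': 1.0}
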
@@ -387,7 +370,8 @@ class Tagger(Pipe):
        orig_tag_map = dict(self.vocab.morphology.tag_map)
        new_tag_map = {}
        for example in get_examples():
            for tag in example.token_annotation.tags:
            for token in example.y:
                tag = token.tag_
                if tag in orig_tag_map:
                    new_tag_map[tag] = orig_tag_map[tag]
                else:
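
Gold tags for `begin_training` now come straight off the tokens of the reference Doc (`example.y`) instead of a `token_annotation.tags` list. A small stand-in sketch of the new loop; the classes are toys, and the `{"pos": "X"}` fallback for unseen tags is an assumption (the hunk above cuts off at the `else:` branch):

class Token:
    def __init__(self, tag_):
        self.tag_ = tag_

class Example:
    def __init__(self, tags):
        self.y = [Token(t) for t in tags]     # reference Doc as a token list

orig_tag_map = {"NN": {"pos": "NOUN"}, "DT": {"pos": "DET"}}
new_tag_map = {}
for example in [Example(["DT", "NN"]), Example(["VBZ", "NN"])]:
    for token in example.y:
        tag = token.tag_
        if tag in orig_tag_map:
            new_tag_map[tag] = orig_tag_map[tag]
        else:
            new_tag_map[tag] = {"pos": "X"}   # assumed placeholder for unseen tags
print(sorted(new_tag_map))                    # ['DT', 'NN', 'VBZ']
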
@@ -575,7 +559,7 @@ class SentenceRecognizer(Tagger):
        d_scores = scores - to_categorical(correct, n_classes=scores.shape[1])
        d_scores *= self.model.ops.asarray(known_labels)
        loss = (d_scores**2).sum()
        docs = [eg.doc for eg in examples]
        docs = [eg.predicted for eg in examples]
        d_scores = self.model.ops.unflatten(d_scores, [len(d) for d in docs])
        return float(loss), d_scores


@@ -687,8 +671,8 @@ class MultitaskObjective(Tagger):
        gold_examples = nonproj.preprocess_training_data(get_examples())
        # for raw_text, doc_annot in gold_tuples:
        for example in gold_examples:
            for i in range(len(example.token_annotation.ids)):
                label = self.make_label(i, example.token_annotation)
            for token in example.y:
                label = self.make_label(token)
                if label is not None and label not in self.labels:
                    self.labels[label] = len(self.labels)
        self.model.initialize()
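
`MultitaskObjective.begin_training` builds its label inventory by giving each previously unseen label the next integer id and skipping `None`. A tiny sketch of that pattern, with `make_label` reduced to an identity function over plain strings (the real helpers work on tokens, as shown above):

def make_label(value):
    # stand-in for the token-based make_label helpers shown above
    return value

labels = {}
for value in ["dep", "nsubj", "dep", "ROOT", None]:
    label = make_label(value)
    if label is not None and label not in labels:
        labels[label] = len(labels)
print(labels)   # {'dep': 0, 'nsubj': 1, 'ROOT': 2}
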
@@ -706,11 +690,11 @@ class MultitaskObjective(Tagger):
        cdef int idx = 0
        correct = numpy.zeros((scores.shape[0],), dtype="i")
        guesses = scores.argmax(axis=1)
        docs = [ex.doc for ex in examples]
        docs = [eg.predicted for eg in examples]
        for i, eg in enumerate(examples):
            # Handles alignment for tokenization differences
            doc_annots = eg.get_aligned()
            for j in range(len(eg.doc)):
            for j in range(len(eg.predicted)):
                tok_annots = {key: values[j] for key, values in tok_annots.items()}
                label = self.make_label(j, tok_annots)
                if label is None or label not in self.labels:

@@ -724,83 +708,49 @@ class MultitaskObjective(Tagger):
        return float(loss), d_scores

    @staticmethod
    def make_dep(i, token_annotation):
        if token_annotation.deps[i] is None or token_annotation.heads[i] is None:
            return None
        return token_annotation.deps[i]
    def make_dep(token):
        return token.dep_

    @staticmethod
    def make_tag(i, token_annotation):
        return token_annotation.tags[i]
    def make_tag(token):
        return token.tag_

    @staticmethod
    def make_ent(i, token_annotation):
        if token_annotation.entities is None:
            return None
        return token_annotation.entities[i]
    def make_ent(token):
        if token.ent_iob_ == "O":
            return "O"
        else:
            return token.ent_iob_ + "-" + token.ent_type_

    @staticmethod
    def make_dep_tag_offset(i, token_annotation):
        if token_annotation.deps[i] is None or token_annotation.heads[i] is None:
            return None
        offset = token_annotation.heads[i] - i
    def make_dep_tag_offset(token):
        dep = token.dep_
        tag = token.tag_
        offset = token.head.i - token.i
        offset = min(offset, 2)
        offset = max(offset, -2)
        return f"{token_annotation.deps[i]}-{token_annotation.tags[i]}:{offset}"
        return f"{dep}-{tag}:{offset}"

    @staticmethod
    def make_ent_tag(i, token_annotation):
        if token_annotation.entities is None or token_annotation.entities[i] is None:
            return None
    def make_ent_tag(token):
        if token.ent_iob_ == "O":
            ent = "O"
        else:
            return f"{token_annotation.tags[i]}-{token_annotation.entities[i]}"
            ent = token.ent_iob_ + "-" + token.ent_type_
        tag = token.tag_
        return f"{tag}-{ent}"

    @staticmethod
    def make_sent_start(target, token_annotation, cache=True, _cache={}):
    def make_sent_start(token):
        """A multi-task objective for representing sentence boundaries,
        using BILU scheme. (O is impossible)

        The implementation of this method uses an internal cache that relies
        on the identity of the heads array, to avoid requiring a new piece
        of gold data. You can pass cache=False if you know the cache will
        do the wrong thing.
        """
        words = token_annotation.words
        heads = token_annotation.heads
        assert len(words) == len(heads)
        assert target < len(words), (target, len(words))
        if cache:
            if id(heads) in _cache:
                return _cache[id(heads)][target]
            else:
                for key in list(_cache.keys()):
                    _cache.pop(key)
                sent_tags = ["I-SENT"] * len(words)
                _cache[id(heads)] = sent_tags
        if token.is_sent_start and token.is_sent_end:
            return "U-SENT"
        elif token.is_sent_start:
            return "B-SENT"
        else:
            sent_tags = ["I-SENT"] * len(words)

        def _find_root(child):
            seen = set([child])
            while child is not None and heads[child] != child:
                seen.add(child)
                child = heads[child]
            return child

        sentences = {}
        for i in range(len(words)):
            root = _find_root(i)
            if root is None:
                sent_tags[i] = None
            else:
                sentences.setdefault(root, []).append(i)
        for root, span in sorted(sentences.items()):
            if len(span) == 1:
                sent_tags[span[0]] = "U-SENT"
            else:
                sent_tags[span[0]] = "B-SENT"
                sent_tags[span[-1]] = "L-SENT"
        return sent_tags[target]
        return "I-SENT"


class ClozeMultitask(Pipe):
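
The rewritten `make_sent_start` drops the head-array bookkeeping and the cache and reads a BILU-style sentence label straight off the token's boundary flags. A stand-in sketch of the new logic over plain booleans (the real method takes a Token and uses `is_sent_start`/`is_sent_end`, as in the hunk above; note the new version never emits L-SENT, unlike the old one):

def make_sent_start(is_sent_start, is_sent_end):
    # mirrors the token-flag logic in the hunk above
    if is_sent_start and is_sent_end:
        return "U-SENT"
    elif is_sent_start:
        return "B-SENT"
    else:
        return "I-SENT"

# a three-token sentence followed by a single-token sentence
flags = [(True, False), (False, False), (False, True), (True, True)]
print([make_sent_start(s, e) for s, e in flags])
# ['B-SENT', 'I-SENT', 'I-SENT', 'U-SENT']
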
@@ -833,7 +783,7 @@ class ClozeMultitask(Pipe):
        # token.vector values, but that's a bit inefficient, especially on GPU.
        # Instead we fetch the index into the vectors table for each of our tokens,
        # and look them up all at once. This prevents data copying.
        ids = self.model.ops.flatten([ex.doc.to_array(ID).ravel() for ex in examples])
        ids = self.model.ops.flatten([eg.predicted.to_array(ID).ravel() for eg in examples])
        target = vectors[ids]
        gradient = self.distance.get_grad(prediction, target)
        loss = self.distance.get_loss(prediction, target)
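
The comment above describes building the target by gathering one row per token from the vectors table in a single indexing step rather than fetching `token.vector` one token at a time. A small numpy sketch of that gather; the shapes and ids are made up:

import numpy

vectors = numpy.arange(12, dtype="f").reshape(4, 3)   # toy table: 4 vectors of width 3
ids = numpy.array([0, 2, 2, 3])                       # one row index per token
target = vectors[ids]                                 # fetch all rows in one call
print(target.shape)                                   # (4, 3)
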
@@ -843,11 +793,12 @@ class ClozeMultitask(Pipe):
        pass

    def rehearse(self, examples, drop=0., sgd=None, losses=None):
        examples = Example.to_example_objects(examples)
        if losses is not None and self.name not in losses:
            losses[self.name] = 0.
        docs = [eg.predicted for eg in examples]
        set_dropout_rate(self.model, drop)
        predictions, bp_predictions = self.model.begin_update([ex.doc for ex in examples])
        predictions, bp_predictions = self.model.begin_update(
            [eg.predicted for eg in examples])
        loss, d_predictions = self.get_loss(examples, self.vocab.vectors.data, predictions)
        bp_predictions(d_predictions)
        if sgd is not None:

@@ -883,17 +834,10 @@ class TextCategorizer(Pipe):
        self.cfg["labels"] = tuple(value)

    def pipe(self, stream, batch_size=128, n_threads=-1, as_example=False):
        for examples in util.minibatch(stream, size=batch_size):
            docs = [self._get_doc(ex) for ex in examples]
        for docs in util.minibatch(stream, size=batch_size):
            scores, tensors = self.predict(docs)
            self.set_annotations(docs, scores, tensors=tensors)

            if as_example:
                for ex, doc in zip(examples, docs):
                    ex.doc = doc
                    yield ex
            else:
                yield from docs
            yield from docs

    def predict(self, docs):
        tensors = [doc.tensor for doc in docs]

@@ -914,12 +858,15 @@ class TextCategorizer(Pipe):
                doc.cats[label] = float(scores[i, j])

    def update(self, examples, state=None, drop=0., set_annotations=False, sgd=None, losses=None):
        examples = Example.to_example_objects(examples)
        if not any(len(ex.doc) if ex.doc else 0 for ex in examples):
        for eg in examples:
            assert isinstance(eg, Example)
        if not any(len(eg.predicted) if eg.predicted else 0 for eg in examples):
            # Handle cases where there are no tokens in any docs.
            return
        set_dropout_rate(self.model, drop)
        scores, bp_scores = self.model.begin_update([ex.doc for ex in examples])
        scores, bp_scores = self.model.begin_update(
            [eg.predicted for eg in examples]
        )
        loss, d_scores = self.get_loss(examples, scores)
        bp_scores(d_scores)
        if sgd is not None:

@@ -928,14 +875,15 @@ class TextCategorizer(Pipe):
            losses.setdefault(self.name, 0.0)
            losses[self.name] += loss
        if set_annotations:
            docs = [ex.doc for ex in examples]
            docs = [eg.predicted for eg in examples]
            self.set_annotations(docs, scores=scores)

    def rehearse(self, examples, drop=0., sgd=None, losses=None):
        if self._rehearsal_model is None:
            return
        examples = Example.to_example_objects(examples)
        docs = [ex.doc for ex in examples]
        for eg in examples:
            assert isinstance(eg, Example)
        docs = [eg.predicted for eg in examples]
        if not any(len(doc) for doc in docs):
            # Handle cases where there are no tokens in any docs.
            return

@@ -955,8 +903,8 @@ class TextCategorizer(Pipe):
        not_missing = numpy.ones((len(examples), len(self.labels)), dtype="f")
        for i, eg in enumerate(examples):
            for j, label in enumerate(self.labels):
                if label in eg.doc_annotation.cats:
                    truths[i, j] = eg.doc_annotation.cats[label]
                if label in eg.predicted.cats:
                    truths[i, j] = eg.reference.cats[label]
                else:
                    not_missing[i, j] = 0.
        truths = self.model.ops.asarray(truths)
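
`TextCategorizer.get_loss` builds a `truths` matrix (one row per example, one column per label) plus a `not_missing` mask that zeroes out labels the gold data says nothing about, now reading the gold values from `eg.reference.cats`. A self-contained numpy sketch of that construction with made-up labels and gold data:

import numpy

labels = ["POSITIVE", "NEGATIVE"]
gold_cats = [{"POSITIVE": 1.0}, {"POSITIVE": 0.0, "NEGATIVE": 1.0}, {}]

truths = numpy.zeros((len(gold_cats), len(labels)), dtype="f")
not_missing = numpy.ones((len(gold_cats), len(labels)), dtype="f")
for i, cats in enumerate(gold_cats):
    for j, label in enumerate(labels):
        if label in cats:
            truths[i, j] = cats[label]
        else:
            not_missing[i, j] = 0.0   # mask out labels with no gold value
print(truths)
print(not_missing)
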
@@ -993,7 +941,7 @@ class TextCategorizer(Pipe):
        # TODO: begin_training is not guaranteed to see all data / labels ?
        examples = list(get_examples())
        for example in examples:
            for cat in example.doc_annotation.cats:
            for cat in example.y.cats:
                self.add_label(cat)
        self.require_labels()
        docs = [Doc(Vocab(), words=["hello"])]

@@ -1152,21 +1100,22 @@ class EntityLinker(Pipe):
            losses.setdefault(self.name, 0.0)
        if not examples:
            return 0
        examples = Example.to_example_objects(examples)
        for eg in examples:
            assert isinstance(eg, Example)
        sentence_docs = []
        docs = [ex.doc for ex in examples]
        docs = [eg.predicted for eg in examples]
        if set_annotations:
            # This seems simpler than other ways to get that exact output -- but
            # it does run the model twice :(
            predictions = self.model.predict(docs)

        for eg in examples:
            doc = eg.doc
            doc = eg.predicted
            ents_by_offset = dict()
            for ent in doc.ents:
                ents_by_offset[(ent.start_char, ent.end_char)] = ent

            for entity, kb_dict in eg.doc_annotation.links.items():
            links = self._get_links_from_doc(eg.reference)
            for entity, kb_dict in links.items():
                if isinstance(entity, str):
                    entity = literal_eval(entity)
                start, end = entity
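
The gold entity links iterated over here map character-offset spans to dictionaries of candidate KB ids and their gold values, and the offset key sometimes arrives as a serialized string that has to go through `literal_eval` first. A small sketch of that shape with made-up data:

from ast import literal_eval

# toy gold links: (start_char, end_char) -> {kb_id: value}; one key serialized
links = {"(0, 6)": {"Q312": 1.0}, (10, 17): {"Q95": 0.0, "Q312": 1.0}}

for entity, kb_dict in links.items():
    if isinstance(entity, str):
        entity = literal_eval(entity)   # "(0, 6)" -> (0, 6)
    start, end = entity
    for kb_id, value in kb_dict.items():
        print(start, end, kb_id, value)
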
@@ -1204,7 +1153,8 @@ class EntityLinker(Pipe):
    def get_similarity_loss(self, examples, scores):
        entity_encodings = []
        for eg in examples:
            for entity, kb_dict in eg.doc_annotation.links.items():
            links = self._get_links_from_doc(eg.reference)
            for entity, kb_dict in links.items():
                for kb_id, value in kb_dict.items():
                    # this loss function assumes we're only using positive examples
                    if value:

@@ -1223,8 +1173,9 @@ class EntityLinker(Pipe):

    def get_loss(self, examples, scores):
        cats = []
        for ex in examples:
            for entity, kb_dict in ex.doc_annotation.links.items():
        for eg in examples:
            links = self._get_links_from_doc(eg.reference)
            for entity, kb_dict in links.items():
                for kb_id, value in kb_dict.items():
                    cats.append([value])

@@ -1237,27 +1188,22 @@ class EntityLinker(Pipe):
        loss = loss / len(cats)
        return loss, d_scores

    def __call__(self, example):
        doc = self._get_doc(example)
    def _get_links_from_doc(self, doc):
        return {}

    def __call__(self, doc):
        kb_ids, tensors = self.predict([doc])
        self.set_annotations([doc], kb_ids, tensors=tensors)
        if isinstance(example, Example):
            example.doc = doc
            example.x = doc
            return example
        return doc

    def pipe(self, stream, batch_size=128, n_threads=-1, as_example=False):
        for examples in util.minibatch(stream, size=batch_size):
            docs = [self._get_doc(ex) for ex in examples]
        for docs in util.minibatch(stream, size=batch_size):
            kb_ids, tensors = self.predict(docs)
            self.set_annotations(docs, kb_ids, tensors=tensors)

            if as_example:
                for ex, doc in zip(examples, docs):
                    ex.doc = doc
                    yield ex
            else:
                yield from docs
            yield from docs

    def predict(self, docs):
        """ Return the KB IDs for each entity in each doc, including NIL if there is no prediction """

@@ -1433,7 +1379,7 @@ class Sentencizer(Pipe):
    ):
        pass

    def __call__(self, example):
    def __call__(self, doc):
        """Apply the sentencizer to a Doc and set Token.is_sent_start.

        example (Doc or Example): The document to process.

@@ -1441,7 +1387,6 @@ class Sentencizer(Pipe):

        DOCS: https://spacy.io/api/sentencizer#call
        """
        doc = self._get_doc(example)
        start = 0
        seen_period = False
        for i, token in enumerate(doc):
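
`Sentencizer.__call__` walks the tokens once and flags a new sentence start on the first token that follows sentence-final punctuation. A rough, self-contained sketch of that rule over plain strings; the punctuation set and sample words are illustrative, and the real component writes `Token.is_sent_start` on the Doc:

punct_chars = {".", "!", "?"}                # illustrative subset
words = ["Hello", "world", ".", "This", "works", "."]

is_sent_start = [False] * len(words)
is_sent_start[0] = True
seen_period = False
for i, word in enumerate(words):
    if seen_period and word not in punct_chars:
        is_sent_start[i] = True              # first token after the punctuation
        seen_period = False
    elif word in punct_chars:
        seen_period = True
print(is_sent_start)   # [True, False, False, True, False, False]
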
@@ -1460,21 +1405,15 @@ class Sentencizer(Pipe):
            return example
        return doc

    def pipe(self, stream, batch_size=128, n_threads=-1, as_example=False):
        for examples in util.minibatch(stream, size=batch_size):
            docs = [self._get_doc(ex) for ex in examples]
    def pipe(self, stream, batch_size=128, n_threads=-1):
        for docs in util.minibatch(stream, size=batch_size):
            predictions = self.predict(docs)
            if isinstance(predictions, tuple) and len(tuple) == 2:
                scores, tensors = predictions
                self.set_annotations(docs, scores, tensors=tensors)
            else:
                self.set_annotations(docs, predictions)
            if as_example:
                for ex, doc in zip(examples, docs):
                    ex.doc = doc
                    yield ex
            else:
                yield from docs
            yield from docs

    def predict(self, docs):
        """Apply the pipeline's model to a batch of docs, without