Draft missing NewExample stuff

2025-12-24 02:23:19 +03:00 · 2020-06-13 23:10:21 +02:00 · 2020-06-13 23:10:21 +02:00 · caa7508725
commit caa7508725
parent 3eb8f3867e
1 changed files with 61 additions and 9 deletions
--- a/spacy/gold/new_example.pyx
+++ b/spacy/gold/new_example.pyx
@ -5,7 +5,7 @@ from ..tokens.doc cimport Doc
 from ..attrs import IDS
 from .align cimport Alignment
 from .annotation import TokenAnnotation, DocAnnotation
-from .iob_utils import biluo_to_iob, biluo_tags_from_offsets
+from .iob_utils import biluo_to_iob, biluo_tags_from_offsets, biluo_tags_from_doc
 from .align import Alignment
 from ..errors import Errors, AlignmentError
@ -73,18 +73,70 @@ cdef class NewExample:
        return self._alignment
    def get_aligned(self, field):
-        raise NotImplementedError
+        """Return an aligned array for a token attribute."""
        # TODO: This is probably wrong. I just bashed this out and there's probably
        # all sorts of edge-cases.
        alignment = self.alignment
        i2j_multi = alignment.i2j_multi
        gold_to_cand = alignment.gold_to_cand
        cand_to_gold = alignment.cand_to_gold
        gold_values = self.reference.to_array([field])
        output = []
        for i, gold_i in enumerate(cand_to_gold):
            if self.predicted[i].text.isspace():
                output.append(None)
            elif gold_i is None:
                if i in i2j_multi:
                    output.append(gold_values[i2j_multi[i]])
                else:
                    output.append(None)
            else:
                output.append(gold_values[gold_i])
        return output
    def to_dict(self):
-        # We should probably implement this? We could return the 
+        return {
-        # doc_annotation and token_annotation, and this would allow us to
+            "doc_annotation": {
-        # easily implement the `get_parses_from_example` in
+                "cats": dict(self.reference.cats),
-        # spacy.syntax.gold_parse
+                "links": [], # TODO
-        raise NotImplementedError
+            },
            "token_annotation": {
                "ids": [t.i+1 for t in self.reference],
                "words": [t.text for t in self.reference],
                "tags": [t.tag_ for t in self.reference],
                "lemmas": [t.lemma_ for t in self.reference],
                "pos": [t.pos_ for t in self.reference],
                "morphs": [t.morph_ for t in self.reference],
                "heads": [t.head.i for t in self.reference],
                "deps": [t.dep_ for t in self.reference],
                "sent_starts": [int(bool(t.is_sent_start)) for t in self.reference],
                "entities": biluo_tags_from_doc(self.reference)
            }
        }
    def split_sents(self):
-        # Unclear whether we should really implement this. I guess?
+        """ Split the token annotations into multiple Examples based on
-        raise NotImplementedError
+        sent_starts and return a list of the new Examples"""
        if not self.reference.is_sentenced:
            return [self]
        # TODO: Do this for misaligned somehow?
        predicted_words = [t.text for t in self.predicted]
        reference_words = [t.text for t in self.reference]
        if predicted_words != reference_words:
            raise NotImplementedError("TODO: Implement this")
        # Implement the easy case.
        output = []
        cls = self.__class__
        for sent in self.reference.sents:
            # I guess for misaligned we just need to use the gold_to_cand?
            output.append(
                cls(
                    self.predicted[sent.start : sent.end + 1].as_doc(),
                    sent.as_doc()
                )
            )
        return output
    def text(self):
        return self.x.text