From caa75087252649e527923f31ee88fe01e4694f7d Mon Sep 17 00:00:00 2001
From: Matthew Honnibal <honnibal+gh@gmail.com>
Date: Sat, 13 Jun 2020 23:10:21 +0200
Subject: [PATCH] Draft missing NewExample stuff

---
 spacy/gold/new_example.pyx | 70 +++++++++++++++++++++++++++++++++-----
 1 file changed, 61 insertions(+), 9 deletions(-)

diff --git a/spacy/gold/new_example.pyx b/spacy/gold/new_example.pyx
index d9a712e38..5b66d0cae 100644
--- a/spacy/gold/new_example.pyx
+++ b/spacy/gold/new_example.pyx
@@ -5,7 +5,7 @@ from ..tokens.doc cimport Doc
 from ..attrs import IDS
 from .align cimport Alignment
 from .annotation import TokenAnnotation, DocAnnotation
-from .iob_utils import biluo_to_iob, biluo_tags_from_offsets
+from .iob_utils import biluo_to_iob, biluo_tags_from_offsets, biluo_tags_from_doc
 from .align import Alignment
 from ..errors import Errors, AlignmentError
 
@@ -73,18 +73,70 @@ cdef class NewExample:
         return self._alignment
 
     def get_aligned(self, field):
-        raise NotImplementedError
+        """Return an aligned array for a token attribute."""
+        # TODO: This is probably wrong. I just bashed this out and there's probably
+        # all sorts of edge-cases.
+        alignment = self.alignment
+        i2j_multi = alignment.i2j_multi
+        gold_to_cand = alignment.gold_to_cand
+        cand_to_gold = alignment.cand_to_gold
+
+        gold_values = self.reference.to_array([field])
+        output = []
+        for i, gold_i in enumerate(cand_to_gold):
+            if self.predicted[i].text.isspace():
+                output.append(None)
+            elif gold_i is None:
+                if i in i2j_multi:
+                    output.append(gold_values[i2j_multi[i]])
+                else:
+                    output.append(None)
+            else:
+                output.append(gold_values[gold_i])
+        return output
 
     def to_dict(self):
-        # We should probably implement this? We could return the 
-        # doc_annotation and token_annotation, and this would allow us to
-        # easily implement the `get_parses_from_example` in
-        # spacy.syntax.gold_parse
-        raise NotImplementedError
+        return {
+            "doc_annotation": {
+                "cats": dict(self.reference.cats),
+                "links": [], # TODO
+            },
+            "token_annotation": {
+                "ids": [t.i+1 for t in self.reference],
+                "words": [t.text for t in self.reference],
+                "tags": [t.tag_ for t in self.reference],
+                "lemmas": [t.lemma_ for t in self.reference],
+                "pos": [t.pos_ for t in self.reference],
+                "morphs": [t.morph_ for t in self.reference],
+                "heads": [t.head.i for t in self.reference],
+                "deps": [t.dep_ for t in self.reference],
+                "sent_starts": [int(bool(t.is_sent_start)) for t in self.reference],
+                "entities": biluo_tags_from_doc(self.reference)
+            }
+        }
 
     def split_sents(self):
-        # Unclear whether we should really implement this. I guess?
-        raise NotImplementedError
+        """ Split the token annotations into multiple Examples based on
+        sent_starts and return a list of the new Examples"""
+        if not self.reference.is_sentenced:
+            return [self]
+        # TODO: Do this for misaligned somehow?
+        predicted_words = [t.text for t in self.predicted]
+        reference_words = [t.text for t in self.reference]
+        if predicted_words != reference_words:
+            raise NotImplementedError("TODO: Implement this")
+        # Implement the easy case.
+        output = []
+        cls = self.__class__
+        for sent in self.reference.sents:
+            # I guess for misaligned we just need to use the gold_to_cand?
+            output.append(
+                cls(
+                    self.predicted[sent.start : sent.end + 1].as_doc(),
+                    sent.as_doc()
+                )
+            )
+        return output
 
     def text(self):
         return self.x.text