Suggest approach for GoldParse

2025-07-18 20:22:25 +03:00 · 2020-06-13 15:43:35 +02:00 · 2020-06-13 15:43:35 +02:00 · 5564314d32
commit 5564314d32
parent b078b05ecd
2 changed files with 24 additions and 39 deletions
--- a/spacy/gold/new_example.pyx
+++ b/spacy/gold/new_example.pyx
@@ -76,10 +76,15 @@ cdef class NewExample:
        raise NotImplementedError

    def to_dict(self):
-        """ Note that this method does NOT export the doc, only the annotations ! """
-        token_dict = self._token_annotation
-        doc_dict = self._doc_annotation
-        return {"token_annotation": token_dict, "doc_annotation": doc_dict}
+        # TODO: We should probably implement this. Returning the
+        # doc_annotation and token_annotation here would make it easy
+        # to implement `get_parses_from_example` in
+        # spacy.syntax.gold_parse.
+        raise NotImplementedError
+
+    def split_sents(self):
+        # Unclear whether we should really implement this. I guess?
+        raise NotImplementedError

    def text(self):
        return self.x.text
--- a/spacy/syntax/gold_parse.pyx
+++ b/spacy/syntax/gold_parse.pyx
@@ -25,54 +25,34 @@ def is_punct_label(label):


 def get_parses_from_example(
-    eg, merge=True, vocab=None, make_projective=True, ignore_misaligned=False
+    example, merge=True, vocab=None, make_projective=True, ignore_misaligned=False
 ):
    """Return a list of (doc, GoldParse) objects.
    If merge is set to True, keep all Token annotations as one big list."""
-    d = eg.doc_annotation
    # merge == do not modify Example
    if merge:
-        t = eg.token_annotation
-        doc = eg.doc
-        if doc is None or not isinstance(doc, Doc):
-            if not vocab:
-                raise ValueError(Errors.E998)
-            doc = Doc(vocab, words=t.words)
-        try:
-            gp = GoldParse.from_annotation(
-                doc, d, t, make_projective=make_projective
-            )
-        except AlignmentError:
-            if ignore_misaligned:
-                gp = None
+        examples = [example]
    else:
-                raise
-        return [(doc, gp)]
        # not merging: one GoldParse per sentence, defining docs with the words
        # from each sentence
-    else:
-        parses = []
-        split_examples = eg.split_sents()
-        for split_example in split_examples:
-            if not vocab:
-                raise ValueError(Errors.E998)
-            split_doc = Doc(vocab, words=split_example.token_annotation.words)
+        examples = example.split_sents()
+    outputs = []
+    for eg in examples:
+        eg_dict = eg.to_dict()
        try:
            gp = GoldParse.from_annotation(
-                    split_doc,
-                    d,
-                    split_example.token_annotation,
-                    make_projective=make_projective,
+                eg.predicted,
+                eg_dict["doc_annotation"],
+                eg_dict["token_annotation"],
+                make_projective=make_projective
            )
        except AlignmentError:
            if ignore_misaligned:
                gp = None
            else:
                raise
-            if gp is not None:
-                parses.append((split_doc, gp))
-        return parses
-
+        outputs.append((eg.predicted, gp))
+    return outputs


 cdef class GoldParse: