Suggest approach for GoldParse

2025-07-15 02:32:37 +03:00 · 2020-06-13 15:43:35 +02:00 · 2020-06-13 15:43:35 +02:00 · 5564314d32
commit 5564314d32
parent b078b05ecd
2 changed files with 24 additions and 39 deletions
--- a/spacy/gold/new_example.pyx
+++ b/spacy/gold/new_example.pyx
@@ -76,10 +76,15 @@ cdef class NewExample:
        raise NotImplementedError
    def to_dict(self):
-        """ Note that this method does NOT export the doc, only the annotations ! """
+        # We should probably implement this? We could return the 
-        token_dict = self._token_annotation
+        # doc_annotation and token_annotation, and this would allow us to
-        doc_dict = self._doc_annotation
+        # easily implement the `get_parses_from_example` in
-        return {"token_annotation": token_dict, "doc_annotation": doc_dict}
+        # spacy.syntax.gold_parse
        raise NotImplementedError
    def split_sents(self):
        """Placeholder: not implemented yet.

        Raises:
            NotImplementedError: always, on every call.
        """
        # Open question: it is unclear whether this method should really be
        # implemented at all. Probably yes.
        raise NotImplementedError
    def text(self):
        """Return the ``text`` attribute of the object stored in ``self.x``."""
        source = self.x
        return source.text
--- a/spacy/syntax/gold_parse.pyx
+++ b/spacy/syntax/gold_parse.pyx
@@ -25,54 +25,34 @@ def is_punct_label(label):
 def get_parses_from_example(
-    eg, merge=True, vocab=None, make_projective=True, ignore_misaligned=False
+    example, merge=True, vocab=None, make_projective=True, ignore_misaligned=False
 ):
    """Return a list of (doc, GoldParse) objects.
    If merge is set to True, keep all Token annotations as one big list."""
    d = eg.doc_annotation
    # merge == do not modify Example
    if merge:
-        t = eg.token_annotation
+        examples = [example]
-        doc = eg.doc
+    else:
-        if doc is None or not isinstance(doc, Doc):
+        # not merging: one GoldParse per sentence, defining docs with the words
-            if not vocab:
+        # from each sentence
-                raise ValueError(Errors.E998)
+        examples = eg.split_sents()
-            doc = Doc(vocab, words=t.words)
+    outputs = []
    for eg in examples:
        eg_dict = eg.to_dict()
        try:
            gp = GoldParse.from_annotation(
-                doc, d, t, make_projective=make_projective
+                eg.predicted,
                eg_dict["doc_annotation"],
                eg_dict["token_annotation"],
                make_projective=make_projective
            )
        except AlignmentError:
            if ignore_misaligned:
                gp = None
            else:
                raise
-        return [(doc, gp)]
+        outputs.append((eg.predicted, gp))
-    # not merging: one GoldParse per sentence, defining docs with the words
+    return outputs
    # from each sentence
    else:
        parses = []
        split_examples = eg.split_sents()
        for split_example in split_examples:
            if not vocab:
                raise ValueError(Errors.E998)
            split_doc = Doc(vocab, words=split_example.token_annotation.words)
            try:
                gp = GoldParse.from_annotation(
                    split_doc,
                    d,
                    split_example.token_annotation,
                    make_projective=make_projective,
                )
            except AlignmentError:
                if ignore_misaligned:
                    gp = None
                else:
                    raise
            if gp is not None:
                parses.append((split_doc, gp))
        return parses
 cdef class GoldParse: