From 5564314d323f746a180a81888e76166a3687ff11 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Sat, 13 Jun 2020 15:43:35 +0200 Subject: [PATCH] Suggest approach for GoldParse --- spacy/gold/new_example.pyx | 13 +++++++--- spacy/syntax/gold_parse.pyx | 50 +++++++++++-------------------------- 2 files changed, 24 insertions(+), 39 deletions(-) diff --git a/spacy/gold/new_example.pyx b/spacy/gold/new_example.pyx index eb796eb83..d9a712e38 100644 --- a/spacy/gold/new_example.pyx +++ b/spacy/gold/new_example.pyx @@ -76,10 +76,15 @@ cdef class NewExample: raise NotImplementedError def to_dict(self): - """ Note that this method does NOT export the doc, only the annotations ! """ - token_dict = self._token_annotation - doc_dict = self._doc_annotation - return {"token_annotation": token_dict, "doc_annotation": doc_dict} + # We should probably implement this? We could return the + # doc_annotation and token_annotation, and this would allow us to + # easily implement the `get_parses_from_example` in + # spacy.syntax.gold_parse + raise NotImplementedError + + def split_sents(self): + # Unclear whether we should really implement this. I guess? + raise NotImplementedError def text(self): return self.x.text diff --git a/spacy/syntax/gold_parse.pyx b/spacy/syntax/gold_parse.pyx index 05361fd82..9712f6e94 100644 --- a/spacy/syntax/gold_parse.pyx +++ b/spacy/syntax/gold_parse.pyx @@ -25,54 +25,34 @@ def is_punct_label(label): def get_parses_from_example( - eg, merge=True, vocab=None, make_projective=True, ignore_misaligned=False + example, merge=True, vocab=None, make_projective=True, ignore_misaligned=False ): """Return a list of (doc, GoldParse) objects. If merge is set to True, keep all Token annotations as one big list.""" - d = eg.doc_annotation # merge == do not modify Example if merge: - t = eg.token_annotation - doc = eg.doc - if doc is None or not isinstance(doc, Doc): - if not vocab: - raise ValueError(Errors.E998) - doc = Doc(vocab, words=t.words) + examples = [example] + else: + # not merging: one GoldParse per sentence, defining docs with the words + # from each sentence + examples = eg.split_sents() + outputs = [] + for eg in examples: + eg_dict = eg.to_dict() try: gp = GoldParse.from_annotation( - doc, d, t, make_projective=make_projective + eg.predicted, + eg_dict["doc_annotation"], + eg_dict["token_annotation"], + make_projective=make_projective ) except AlignmentError: if ignore_misaligned: gp = None else: raise - return [(doc, gp)] - # not merging: one GoldParse per sentence, defining docs with the words - # from each sentence - else: - parses = [] - split_examples = eg.split_sents() - for split_example in split_examples: - if not vocab: - raise ValueError(Errors.E998) - split_doc = Doc(vocab, words=split_example.token_annotation.words) - try: - gp = GoldParse.from_annotation( - split_doc, - d, - split_example.token_annotation, - make_projective=make_projective, - ) - except AlignmentError: - if ignore_misaligned: - gp = None - else: - raise - if gp is not None: - parses.append((split_doc, gp)) - return parses - + outputs.append((eg.predicted, gp)) + return outputs cdef class GoldParse: