Suggest approach for GoldParse

This commit is contained in:
Matthew Honnibal 2020-06-13 15:43:35 +02:00
parent b078b05ecd
commit 5564314d32
2 changed files with 24 additions and 39 deletions

View File

@ -76,10 +76,15 @@ cdef class NewExample:
raise NotImplementedError
def to_dict(self):
""" Note that this method does NOT export the doc, only the annotations ! """
token_dict = self._token_annotation
doc_dict = self._doc_annotation
return {"token_annotation": token_dict, "doc_annotation": doc_dict}
# We should probably implement this? We could return the
# doc_annotation and token_annotation, and this would allow us to
# easily implement the `get_parses_from_example` in
# spacy.syntax.gold_parse
raise NotImplementedError
def split_sents(self):
# Unclear whether we should really implement this. I guess?
raise NotImplementedError
def text(self):
return self.x.text

View File

@ -25,54 +25,34 @@ def is_punct_label(label):
def get_parses_from_example(
eg, merge=True, vocab=None, make_projective=True, ignore_misaligned=False
example, merge=True, vocab=None, make_projective=True, ignore_misaligned=False
):
"""Return a list of (doc, GoldParse) objects.
If merge is set to True, keep all Token annotations as one big list."""
d = eg.doc_annotation
# merge == do not modify Example
if merge:
t = eg.token_annotation
doc = eg.doc
if doc is None or not isinstance(doc, Doc):
if not vocab:
raise ValueError(Errors.E998)
doc = Doc(vocab, words=t.words)
examples = [example]
else:
# not merging: one GoldParse per sentence, defining docs with the words
# from each sentence
examples = eg.split_sents()
outputs = []
for eg in examples:
eg_dict = eg.to_dict()
try:
gp = GoldParse.from_annotation(
doc, d, t, make_projective=make_projective
eg.predicted,
eg_dict["doc_annotation"],
eg_dict["token_annotation"],
make_projective=make_projective
)
except AlignmentError:
if ignore_misaligned:
gp = None
else:
raise
return [(doc, gp)]
# not merging: one GoldParse per sentence, defining docs with the words
# from each sentence
else:
parses = []
split_examples = eg.split_sents()
for split_example in split_examples:
if not vocab:
raise ValueError(Errors.E998)
split_doc = Doc(vocab, words=split_example.token_annotation.words)
try:
gp = GoldParse.from_annotation(
split_doc,
d,
split_example.token_annotation,
make_projective=make_projective,
)
except AlignmentError:
if ignore_misaligned:
gp = None
else:
raise
if gp is not None:
parses.append((split_doc, gp))
return parses
outputs.append((eg.predicted, gp))
return outputs
cdef class GoldParse: