Suggest approach for GoldParse

This commit is contained in:
Matthew Honnibal 2020-06-13 15:43:35 +02:00
parent b078b05ecd
commit 5564314d32
2 changed files with 24 additions and 39 deletions

View File

@@ -76,10 +76,15 @@ cdef class NewExample:
raise NotImplementedError raise NotImplementedError
def to_dict(self): def to_dict(self):
""" Note that this method does NOT export the doc, only the annotations ! """ # We should probably implement this? We could return the
token_dict = self._token_annotation # doc_annotation and token_annotation, and this would allow us to
doc_dict = self._doc_annotation # easily implement the `get_parses_from_example` in
return {"token_annotation": token_dict, "doc_annotation": doc_dict} # spacy.syntax.gold_parse
raise NotImplementedError
def split_sents(self):
# Unclear whether we should really implement this. I guess?
raise NotImplementedError
def text(self): def text(self):
return self.x.text return self.x.text

View File

@@ -25,54 +25,34 @@ def is_punct_label(label):
def get_parses_from_example( def get_parses_from_example(
eg, merge=True, vocab=None, make_projective=True, ignore_misaligned=False example, merge=True, vocab=None, make_projective=True, ignore_misaligned=False
): ):
"""Return a list of (doc, GoldParse) objects. """Return a list of (doc, GoldParse) objects.
If merge is set to True, keep all Token annotations as one big list.""" If merge is set to True, keep all Token annotations as one big list."""
d = eg.doc_annotation
# merge == do not modify Example # merge == do not modify Example
if merge: if merge:
t = eg.token_annotation examples = [example]
doc = eg.doc else:
if doc is None or not isinstance(doc, Doc): # not merging: one GoldParse per sentence, defining docs with the words
if not vocab: # from each sentence
raise ValueError(Errors.E998) examples = eg.split_sents()
doc = Doc(vocab, words=t.words) outputs = []
for eg in examples:
eg_dict = eg.to_dict()
try: try:
gp = GoldParse.from_annotation( gp = GoldParse.from_annotation(
doc, d, t, make_projective=make_projective eg.predicted,
eg_dict["doc_annotation"],
eg_dict["token_annotation"],
make_projective=make_projective
) )
except AlignmentError: except AlignmentError:
if ignore_misaligned: if ignore_misaligned:
gp = None gp = None
else: else:
raise raise
return [(doc, gp)] outputs.append((eg.predicted, gp))
# not merging: one GoldParse per sentence, defining docs with the words return outputs
# from each sentence
else:
parses = []
split_examples = eg.split_sents()
for split_example in split_examples:
if not vocab:
raise ValueError(Errors.E998)
split_doc = Doc(vocab, words=split_example.token_annotation.words)
try:
gp = GoldParse.from_annotation(
split_doc,
d,
split_example.token_annotation,
make_projective=make_projective,
)
except AlignmentError:
if ignore_misaligned:
gp = None
else:
raise
if gp is not None:
parses.append((split_doc, gp))
return parses
cdef class GoldParse: cdef class GoldParse: