Draft missing NewExample stuff

This commit is contained in:
Matthew Honnibal 2020-06-13 23:10:21 +02:00
parent 3eb8f3867e
commit caa7508725

View File

@ -5,7 +5,7 @@ from ..tokens.doc cimport Doc
from ..attrs import IDS from ..attrs import IDS
from .align cimport Alignment from .align cimport Alignment
from .annotation import TokenAnnotation, DocAnnotation from .annotation import TokenAnnotation, DocAnnotation
from .iob_utils import biluo_to_iob, biluo_tags_from_offsets from .iob_utils import biluo_to_iob, biluo_tags_from_offsets, biluo_tags_from_doc
from .align import Alignment from .align import Alignment
from ..errors import Errors, AlignmentError from ..errors import Errors, AlignmentError
@ -73,18 +73,70 @@ cdef class NewExample:
return self._alignment return self._alignment
def get_aligned(self, field):
    """Return reference values of a token attribute, aligned to the
    predicted tokens.

    field: The token attribute to read. It is passed on as
        `self.reference.to_array([field])`.
    RETURNS (list): One entry per item of `self.alignment.cand_to_gold`.
        An entry is None when the predicted token's text is whitespace,
        or when the token has neither a direct match in `cand_to_gold`
        nor an entry in `i2j_multi`. Otherwise it is the reference value
        at the matched index.
    """
    # TODO: This is probably wrong. I just bashed this out and there's probably
    # all sorts of edge-cases.
    alignment = self.alignment
    i2j_multi = alignment.i2j_multi
    cand_to_gold = alignment.cand_to_gold
    gold_values = self.reference.to_array([field])
    output = []
    for i, gold_i in enumerate(cand_to_gold):
        if self.predicted[i].text.isspace():
            # Whitespace tokens never receive a value, whatever the alignment says.
            value = None
        elif gold_i is not None:
            # Direct match between predicted token i and a reference token.
            value = gold_values[gold_i]
        elif i in i2j_multi:
            # No direct match: fall back to the index recorded in i2j_multi.
            value = gold_values[i2j_multi[i]]
        else:
            value = None
        output.append(value)
    return output
def to_dict(self):
    """Return the reference annotations as a nested dict.

    RETURNS (dict): A dict with two keys. "doc_annotation" holds a copy of
        the reference's `cats` and an empty "links" list. "token_annotation"
        holds one list per token attribute, read from the reference tokens,
        plus the result of `biluo_tags_from_doc` under "entities".
    """
    doc_annotation = {
        "cats": dict(self.reference.cats),
        "links": [],  # TODO
    }
    tokens = list(self.reference)
    token_annotation = {
        # Token ids are 1-based: the token index plus one.
        "ids": [token.i + 1 for token in tokens],
        "words": [token.text for token in tokens],
        "tags": [token.tag_ for token in tokens],
        "lemmas": [token.lemma_ for token in tokens],
        "pos": [token.pos_ for token in tokens],
        "morphs": [token.morph_ for token in tokens],
        "heads": [token.head.i for token in tokens],
        "deps": [token.dep_ for token in tokens],
        # Any falsy is_sent_start value is written out as 0.
        "sent_starts": [int(bool(token.is_sent_start)) for token in tokens],
        "entities": biluo_tags_from_doc(self.reference),
    }
    return {
        "doc_annotation": doc_annotation,
        "token_annotation": token_annotation,
    }
def split_sents(self):
    """Split the example into one example per reference sentence.

    RETURNS (list): `[self]` when `self.reference.is_sentenced` is false;
        otherwise one new instance of this class per sentence in
        `self.reference.sents`, each built from the matching slice of
        `self.predicted` and the sentence itself, both converted with
        `as_doc()`.
    RAISES (NotImplementedError): If the predicted and reference token
        texts differ. Only the identically-tokenized case is handled.
    """
    if not self.reference.is_sentenced:
        return [self]
    # TODO: Do this for misaligned somehow?
    predicted_words = [t.text for t in self.predicted]
    reference_words = [t.text for t in self.reference]
    if predicted_words != reference_words:
        raise NotImplementedError("TODO: Implement this")
    # Implement the easy case: both docs have the same tokens, so the
    # sentence's token offsets apply to the predicted doc unchanged.
    cls = self.__class__
    # I guess for misaligned we just need to use the gold_to_cand?
    # The end offset of a spaCy span is already exclusive, so the slice must
    # stop at `sent.end`; `sent.end + 1` would pull in the first token of the
    # following sentence.
    return [
        cls(self.predicted[sent.start : sent.end].as_doc(), sent.as_doc())
        for sent in self.reference.sents
    ]
def text(self):
    """Return the `text` attribute of `self.x`."""
    x_doc = self.x
    return x_doc.text