Set spaces on gold doc after conversion

This commit is contained in:
Matthew Honnibal 2020-06-25 15:19:36 +02:00
parent c2fd1e4eb9
commit c39401105b

View File

@ -13,6 +13,7 @@ from .iob_utils import spans_from_biluo_tags
from .align import Alignment from .align import Alignment
from ..errors import Errors, AlignmentError from ..errors import Errors, AlignmentError
from ..syntax import nonproj from ..syntax import nonproj
from ..util import get_words_and_spaces
cpdef Doc annotations2doc(vocab, tok_annot, doc_annot): cpdef Doc annotations2doc(vocab, tok_annot, doc_annot):
@ -65,8 +66,8 @@ cdef class Example:
if "ORTH" not in tok_dict: if "ORTH" not in tok_dict:
tok_dict["ORTH"] = [tok.text for tok in predicted] tok_dict["ORTH"] = [tok.text for tok in predicted]
tok_dict["SPACY"] = [tok.whitespace_ for tok in predicted] tok_dict["SPACY"] = [tok.whitespace_ for tok in predicted]
if "SPACY" not in tok_dict: if not _has_field(tok_dict, "SPACY"):
tok_dict["SPACY"] = None spaces = _guess_spaces(predicted.text, tok_dict["ORTH"])
return Example( return Example(
predicted, predicted,
annotations2doc(predicted.vocab, tok_dict, doc_dict) annotations2doc(predicted.vocab, tok_dict, doc_dict)
@ -127,19 +128,20 @@ cdef class Example:
def get_aligned_ner(self): def get_aligned_ner(self):
x_ents = [] x_ents = []
gold_to_cand = self.alignment.gold_to_cand gold_to_cand = self.alignment.gold_to_cand
x_text = self.x.text
for y_ent in self.y.ents: for y_ent in self.y.ents:
x_start = gold_to_cand[y_ent.start] x_start = gold_to_cand[y_ent.start]
x_end = gold_to_cand[y_ent.end-1] x_end = gold_to_cand[y_ent.end-1]
if x_start is not None and x_end is not None: if x_start is not None and x_end is not None:
x_ents.append(Span(self.x, x_start, x_end+1, label=y_ent.label)) x_ents.append(Span(self.x, x_start, x_end+1, label=y_ent.label))
else: elif x_text.count(y_ent.text) == 1:
x_span = self.x.char_span( start_char = x_text.index(y_ent.text)
y_ent.start_char, end_char = start_char + len(y_ent.text)
y_ent.end_char, x_span = self.x.char_span(start_char, end_char, label=y_ent.label)
label=y_ent.label
)
if x_span is not None: if x_span is not None:
x_ents.append(x_span) x_ents.append(x_span)
else:
print(y_ent, y_ent.label_)
x_tags = biluo_tags_from_offsets( x_tags = biluo_tags_from_offsets(
self.x, self.x,
[(e.start_char, e.end_char, e.label_) for e in x_ents], [(e.start_char, e.end_char, e.label_) for e in x_ents],
@ -318,6 +320,9 @@ def _fix_legacy_dict_data(example_dict):
token_dict[remapping[key]] = value token_dict[remapping[key]] = value
else: else:
raise KeyError(Errors.E983.format(key=key, dict="token_annotation", keys=remapping.keys())) raise KeyError(Errors.E983.format(key=key, dict="token_annotation", keys=remapping.keys()))
text = example_dict.get("text", example_dict.get("raw"))
if text and not _has_field(token_dict, "SPACY"):
token_dict["SPACY"] = _guess_spaces(text, token_dict["ORTH"])
if "HEAD" in token_dict and "SENT_START" in token_dict: if "HEAD" in token_dict and "SENT_START" in token_dict:
# If heads are set, we don't also redundantly specify SENT_START. # If heads are set, we don't also redundantly specify SENT_START.
token_dict.pop("SENT_START") token_dict.pop("SENT_START")
@ -327,6 +332,18 @@ def _fix_legacy_dict_data(example_dict):
"doc_annotation": doc_dict "doc_annotation": doc_dict
} }
def _has_field(annot, field):
if field not in annot:
return False
elif annot[field] is None:
return False
elif len(annot[field]) == 0:
return False
elif all([value is None for value in annot[field]]):
return False
else:
return True
def _parse_ner_tags(biluo_or_offsets, vocab, words, spaces): def _parse_ner_tags(biluo_or_offsets, vocab, words, spaces):
if isinstance(biluo_or_offsets[0], (list, tuple)): if isinstance(biluo_or_offsets[0], (list, tuple)):
@ -380,3 +397,21 @@ def _parse_links(vocab, words, links, entities):
ent_kb_ids[i] = true_kb_ids[0] ent_kb_ids[i] = true_kb_ids[0]
return ent_kb_ids return ent_kb_ids
def _guess_spaces(text, words):
spaces = []
text_pos = 0
# align words with text
for word in words:
try:
word_start = text[text_pos:].index(word)
except ValueError:
spaces.append(True)
continue
text_pos += word_start + len(word)
if text_pos < len(text) and text[text_pos] == " ":
spaces.append(True)
else:
spaces.append(False)
return spaces