attempt to fix cases with weird spaces

This commit is contained in:
svlandeg 2020-06-16 11:52:29 +02:00
parent 0702a1d3fb
commit 6fea5fa4bd

View File

@ -10,10 +10,9 @@ from ..errors import Errors, AlignmentError
cpdef Doc annotations2doc(vocab, tok_annot, doc_annot): cpdef Doc annotations2doc(vocab, tok_annot, doc_annot):
""" Create a Doc from dictionaries with token and doc annotations. Assumes ORTH is set. """ """ Create a Doc from dictionaries with token and doc annotations. Assumes ORTH & SPACY are set. """
words = tok_annot["ORTH"]
attrs, array = _annot2array(vocab, tok_annot, doc_annot) attrs, array = _annot2array(vocab, tok_annot, doc_annot)
output = Doc(vocab, words=words) output = Doc(vocab, words=tok_annot["ORTH"], spaces=tok_annot["SPACY"])
if array.size: if array.size:
output = output.from_array(attrs, array) output = output.from_array(attrs, array)
output.cats.update(doc_annot.get("cats", {})) output.cats.update(doc_annot.get("cats", {}))
@ -56,6 +55,8 @@ cdef class Example:
tok_dict, doc_dict = _parse_example_dict_data(example_dict) tok_dict, doc_dict = _parse_example_dict_data(example_dict)
if "ORTH" not in tok_dict: if "ORTH" not in tok_dict:
tok_dict["ORTH"] = [tok.text for tok in predicted] tok_dict["ORTH"] = [tok.text for tok in predicted]
if "SPACY" not in tok_dict:
tok_dict["SPACY"] = [tok.whitespace_ for tok in predicted]
return Example( return Example(
predicted, predicted,
annotations2doc(predicted.vocab, tok_dict, doc_dict) annotations2doc(predicted.vocab, tok_dict, doc_dict)
@ -110,7 +111,7 @@ cdef class Example:
prev_value = value prev_value = value
if field in ["ENT_IOB", "ENT_TYPE"]: if field in ["ENT_IOB", "ENT_TYPE"]:
# Assign O/- for one-to-many O/- NER tags # Assign one-to-many NER tags
for j, cand_j in enumerate(gold_to_cand): for j, cand_j in enumerate(gold_to_cand):
if cand_j is None: if cand_j is None:
if j in j2i_multi: if j in j2i_multi:
@ -175,7 +176,8 @@ def _annot2array(vocab, tok_annot, doc_annot):
for key, value in doc_annot.items(): for key, value in doc_annot.items():
if key == "entities": if key == "entities":
words = tok_annot["ORTH"] words = tok_annot["ORTH"]
ent_iobs, ent_types = _parse_ner_tags(vocab, words, value) spaces = tok_annot["SPACY"]
ent_iobs, ent_types = _parse_ner_tags(value, vocab, words, spaces)
tok_annot["ENT_IOB"] = ent_iobs tok_annot["ENT_IOB"] = ent_iobs
tok_annot["ENT_TYPE"] = ent_types tok_annot["ENT_TYPE"] = ent_types
elif key == "links": elif key == "links":
@ -192,7 +194,7 @@ def _annot2array(vocab, tok_annot, doc_annot):
for key, value in tok_annot.items(): for key, value in tok_annot.items():
if key not in IDS: if key not in IDS:
raise ValueError(f"Unknown token attribute: {key}") raise ValueError(f"Unknown token attribute: {key}")
elif key == "ORTH": elif key in ["ORTH", "SPACY"]:
pass pass
elif key == "HEAD": elif key == "HEAD":
attrs.append(key) attrs.append(key)
@ -249,6 +251,7 @@ def _fix_legacy_dict_data(example_dict):
"heads": "HEAD", "heads": "HEAD",
"sent_starts": "SENT_START", "sent_starts": "SENT_START",
"morphs": "MORPH", "morphs": "MORPH",
"spaces": "SPACY",
} }
old_token_dict = token_dict old_token_dict = token_dict
token_dict = {} token_dict = {}
@ -268,12 +271,12 @@ def _fix_legacy_dict_data(example_dict):
} }
def _parse_ner_tags(vocab, words, biluo_or_offsets): def _parse_ner_tags(biluo_or_offsets, vocab, words, spaces=None):
if isinstance(biluo_or_offsets[0], (list, tuple)): if isinstance(biluo_or_offsets[0], (list, tuple)):
# Convert to biluo if necessary # Convert to biluo if necessary
# This is annoying but to convert the offsets we need a Doc # This is annoying but to convert the offsets we need a Doc
# that has the target tokenization. # that has the target tokenization.
reference = Doc(vocab, words=words) reference = Doc(vocab, words=words, spaces=spaces)
biluo = biluo_tags_from_offsets(reference, biluo_or_offsets) biluo = biluo_tags_from_offsets(reference, biluo_or_offsets)
else: else:
biluo = biluo_or_offsets biluo = biluo_or_offsets