Improve handling of missing values in NER

This commit is contained in:
Matthew Honnibal 2020-06-25 16:26:44 +02:00
parent b8c85e593b
commit 6bda23ad26

View File

@@ -91,6 +91,10 @@ def biluo_tags_from_offsets(doc, entities, missing="O"):
biluo = ["-" for _ in doc] biluo = ["-" for _ in doc]
# Handle entity cases # Handle entity cases
for start_char, end_char, label in entities: for start_char, end_char, label in entities:
if not label:
if start_char in starts:
biluo[starts[start_char]] = "O"
else:
for token_index in range(start_char, end_char): for token_index in range(start_char, end_char):
if token_index in tokens_in_ents.keys(): if token_index in tokens_in_ents.keys():
raise ValueError( raise ValueError(
@@ -127,7 +131,7 @@ def biluo_tags_from_offsets(doc, entities, missing="O"):
break break
else: else:
biluo[token.i] = missing biluo[token.i] = missing
if "-" in biluo: if "-" in biluo and missing != "-":
ent_str = str(entities) ent_str = str(entities)
warnings.warn( warnings.warn(
Warnings.W030.format( Warnings.W030.format(