mirror of
https://github.com/explosion/spaCy.git
synced 2025-02-11 09:00:36 +03:00
Improve handling of missing values in NER
This commit is contained in:
parent
b8c85e593b
commit
6bda23ad26
|
@@ -91,31 +91,35 @@ def biluo_tags_from_offsets(doc, entities, missing="O"):
|
||||||
biluo = ["-" for _ in doc]
|
biluo = ["-" for _ in doc]
|
||||||
# Handle entity cases
|
# Handle entity cases
|
||||||
for start_char, end_char, label in entities:
|
for start_char, end_char, label in entities:
|
||||||
for token_index in range(start_char, end_char):
|
if not label:
|
||||||
if token_index in tokens_in_ents.keys():
|
if start_char in starts:
|
||||||
raise ValueError(
|
biluo[starts[start_char]] = "O"
|
||||||
Errors.E103.format(
|
else:
|
||||||
span1=(
|
for token_index in range(start_char, end_char):
|
||||||
tokens_in_ents[token_index][0],
|
if token_index in tokens_in_ents.keys():
|
||||||
tokens_in_ents[token_index][1],
|
raise ValueError(
|
||||||
tokens_in_ents[token_index][2],
|
Errors.E103.format(
|
||||||
),
|
span1=(
|
||||||
span2=(start_char, end_char, label),
|
tokens_in_ents[token_index][0],
|
||||||
|
tokens_in_ents[token_index][1],
|
||||||
|
tokens_in_ents[token_index][2],
|
||||||
|
),
|
||||||
|
span2=(start_char, end_char, label),
|
||||||
|
)
|
||||||
)
|
)
|
||||||
)
|
tokens_in_ents[token_index] = (start_char, end_char, label)
|
||||||
tokens_in_ents[token_index] = (start_char, end_char, label)
|
|
||||||
|
|
||||||
start_token = starts.get(start_char)
|
start_token = starts.get(start_char)
|
||||||
end_token = ends.get(end_char)
|
end_token = ends.get(end_char)
|
||||||
# Only interested if the tokenization is correct
|
# Only interested if the tokenization is correct
|
||||||
if start_token is not None and end_token is not None:
|
if start_token is not None and end_token is not None:
|
||||||
if start_token == end_token:
|
if start_token == end_token:
|
||||||
biluo[start_token] = f"U-{label}"
|
biluo[start_token] = f"U-{label}"
|
||||||
else:
|
else:
|
||||||
biluo[start_token] = f"B-{label}"
|
biluo[start_token] = f"B-{label}"
|
||||||
for i in range(start_token + 1, end_token):
|
for i in range(start_token + 1, end_token):
|
||||||
biluo[i] = f"I-{label}"
|
biluo[i] = f"I-{label}"
|
||||||
biluo[end_token] = f"L-{label}"
|
biluo[end_token] = f"L-{label}"
|
||||||
# Now distinguish the O cases from ones where we miss the tokenization
|
# Now distinguish the O cases from ones where we miss the tokenization
|
||||||
entity_chars = set()
|
entity_chars = set()
|
||||||
for start_char, end_char, label in entities:
|
for start_char, end_char, label in entities:
|
||||||
|
@@ -127,7 +131,7 @@ def biluo_tags_from_offsets(doc, entities, missing="O"):
|
||||||
break
|
break
|
||||||
else:
|
else:
|
||||||
biluo[token.i] = missing
|
biluo[token.i] = missing
|
||||||
if "-" in biluo:
|
if "-" in biluo and missing != "-":
|
||||||
ent_str = str(entities)
|
ent_str = str(entities)
|
||||||
warnings.warn(
|
warnings.warn(
|
||||||
Warnings.W030.format(
|
Warnings.W030.format(
|
||||||
|
|
Loading…
Reference in New Issue
Block a user