mirror of
https://github.com/explosion/spaCy.git
synced 2024-12-25 17:36:30 +03:00
Feat/debug data warn spread ents (#9960)
* Added check for entities crossing sentence boundaries
* Formatted with black
* Rephrasing slightly

Co-authored-by: Sofie Van Landeghem <svlandeg@users.noreply.github.com>
This commit is contained in:
parent
56dcb39fb7
commit
55cf492218
|
@ -203,6 +203,7 @@ def debug_data(
|
||||||
has_low_data_warning = False
|
has_low_data_warning = False
|
||||||
has_no_neg_warning = False
|
has_no_neg_warning = False
|
||||||
has_ws_ents_error = False
|
has_ws_ents_error = False
|
||||||
|
has_boundary_cross_ents_warning = False
|
||||||
|
|
||||||
msg.divider("Named Entity Recognition")
|
msg.divider("Named Entity Recognition")
|
||||||
msg.info(f"{len(model_labels)} label(s)")
|
msg.info(f"{len(model_labels)} label(s)")
|
||||||
|
@ -242,12 +243,20 @@ def debug_data(
|
||||||
msg.warn(f"No examples for texts WITHOUT new label '{label}'")
|
msg.warn(f"No examples for texts WITHOUT new label '{label}'")
|
||||||
has_no_neg_warning = True
|
has_no_neg_warning = True
|
||||||
|
|
||||||
|
if gold_train_data["boundary_cross_ents"]:
|
||||||
|
msg.warn(
|
||||||
|
f"{gold_train_data['boundary_cross_ents']} entity span(s) crossing sentence boundaries"
|
||||||
|
)
|
||||||
|
has_boundary_cross_ents_warning = True
|
||||||
|
|
||||||
if not has_low_data_warning:
|
if not has_low_data_warning:
|
||||||
msg.good("Good amount of examples for all labels")
|
msg.good("Good amount of examples for all labels")
|
||||||
if not has_no_neg_warning:
|
if not has_no_neg_warning:
|
||||||
msg.good("Examples without occurrences available for all labels")
|
msg.good("Examples without occurrences available for all labels")
|
||||||
if not has_ws_ents_error:
|
if not has_ws_ents_error:
|
||||||
msg.good("No entities consisting of or starting/ending with whitespace")
|
msg.good("No entities consisting of or starting/ending with whitespace")
|
||||||
|
if not has_boundary_cross_ents_warning:
|
||||||
|
msg.good("No entities crossing sentence boundaries")
|
||||||
|
|
||||||
if has_low_data_warning:
|
if has_low_data_warning:
|
||||||
msg.text(
|
msg.text(
|
||||||
|
@ -565,6 +574,7 @@ def _compile_gold(
|
||||||
"words": Counter(),
|
"words": Counter(),
|
||||||
"roots": Counter(),
|
"roots": Counter(),
|
||||||
"ws_ents": 0,
|
"ws_ents": 0,
|
||||||
|
"boundary_cross_ents": 0,
|
||||||
"n_words": 0,
|
"n_words": 0,
|
||||||
"n_misaligned_words": 0,
|
"n_misaligned_words": 0,
|
||||||
"words_missing_vectors": Counter(),
|
"words_missing_vectors": Counter(),
|
||||||
|
@ -602,6 +612,8 @@ def _compile_gold(
|
||||||
if label.startswith(("B-", "U-")):
|
if label.startswith(("B-", "U-")):
|
||||||
combined_label = label.split("-")[1]
|
combined_label = label.split("-")[1]
|
||||||
data["ner"][combined_label] += 1
|
data["ner"][combined_label] += 1
|
||||||
|
if gold[i].is_sent_start and label.startswith(("I-", "L-")):
|
||||||
|
data["boundary_cross_ents"] += 1
|
||||||
elif label == "-":
|
elif label == "-":
|
||||||
data["ner"]["-"] += 1
|
data["ner"]["-"] += 1
|
||||||
if "textcat" in factory_names or "textcat_multilabel" in factory_names:
|
if "textcat" in factory_names or "textcat_multilabel" in factory_names:
|
||||||
|
|
Loading…
Reference in New Issue
Block a user