add warning in debug_data for punctuation in entities (#4853)

This commit is contained in:
Sofie Van Landeghem 2020-01-06 14:59:28 +01:00 committed by Matthew Honnibal
parent d652ff215d
commit 6e9b61b49d
2 changed files with 23 additions and 2 deletions

View File

@ -192,6 +192,7 @@ def debug_data(
has_low_data_warning = False has_low_data_warning = False
has_no_neg_warning = False has_no_neg_warning = False
has_ws_ents_error = False has_ws_ents_error = False
has_punct_ents_warning = False
msg.divider("Named Entity Recognition") msg.divider("Named Entity Recognition")
msg.info( msg.info(
@ -226,10 +227,16 @@ def debug_data(
if gold_train_data["ws_ents"]: if gold_train_data["ws_ents"]:
msg.fail( msg.fail(
"{} invalid whitespace entity spans".format(gold_train_data["ws_ents"]) "{} invalid whitespace entity span(s)".format(gold_train_data["ws_ents"])
) )
has_ws_ents_error = True has_ws_ents_error = True
if gold_train_data["punct_ents"]:
msg.warn(
"{} entity span(s) with punctuation".format(gold_train_data["punct_ents"])
)
has_punct_ents_warning = True
for label in new_labels: for label in new_labels:
if label_counts[label] <= NEW_LABEL_THRESHOLD: if label_counts[label] <= NEW_LABEL_THRESHOLD:
msg.warn( msg.warn(
@ -253,6 +260,8 @@ def debug_data(
msg.good("Examples without occurrences available for all labels") msg.good("Examples without occurrences available for all labels")
if not has_ws_ents_error: if not has_ws_ents_error:
msg.good("No entities consisting of or starting/ending with whitespace") msg.good("No entities consisting of or starting/ending with whitespace")
if not has_punct_ents_warning:
msg.good("No entities consisting of or starting/ending with punctuation")
if has_low_data_warning: if has_low_data_warning:
msg.text( msg.text(
@ -273,6 +282,12 @@ def debug_data(
"with whitespace characters are considered invalid." "with whitespace characters are considered invalid."
) )
if has_punct_ents_warning:
msg.text(
"Entity spans consisting of or starting/ending "
"with punctuation can not be trained with a noise level > 0."
)
if "textcat" in pipeline: if "textcat" in pipeline:
msg.divider("Text Classification") msg.divider("Text Classification")
labels = [label for label in gold_train_data["cats"]] labels = [label for label in gold_train_data["cats"]]
@ -547,6 +562,7 @@ def _compile_gold(train_docs, pipeline):
"words": Counter(), "words": Counter(),
"roots": Counter(), "roots": Counter(),
"ws_ents": 0, "ws_ents": 0,
"punct_ents": 0,
"n_words": 0, "n_words": 0,
"n_misaligned_words": 0, "n_misaligned_words": 0,
"n_sents": 0, "n_sents": 0,
@ -568,6 +584,10 @@ def _compile_gold(train_docs, pipeline):
if label.startswith(("B-", "U-", "L-")) and doc[i].is_space: if label.startswith(("B-", "U-", "L-")) and doc[i].is_space:
# "Illegal" whitespace entity # "Illegal" whitespace entity
data["ws_ents"] += 1 data["ws_ents"] += 1
if label.startswith(("B-", "U-", "L-")) and doc[i].text in [".", "'", "!", "?", ","]:
# punctuation entity: could be replaced by whitespace when training with noise,
# so add a warning to alert the user to this unexpected side effect.
data["punct_ents"] += 1
if label.startswith(("B-", "U-")): if label.startswith(("B-", "U-")):
combined_label = label.split("-")[1] combined_label = label.split("-")[1]
data["ner"][combined_label] += 1 data["ner"][combined_label] += 1

View File

@ -172,7 +172,8 @@ class Errors(object):
"and satisfies the correct annotations specified in the GoldParse. " "and satisfies the correct annotations specified in the GoldParse. "
"For example, are all labels added to the model? If you're " "For example, are all labels added to the model? If you're "
"training a named entity recognizer, also make sure that none of " "training a named entity recognizer, also make sure that none of "
"your annotated entity spans have leading or trailing whitespace. " "your annotated entity spans have leading or trailing whitespace "
"or punctuation. "
"You can also use the experimental `debug-data` command to " "You can also use the experimental `debug-data` command to "
"validate your JSON-formatted training data. For details, run:\n" "validate your JSON-formatted training data. For details, run:\n"
"python -m spacy debug-data --help") "python -m spacy debug-data --help")