mirror of
https://github.com/explosion/spaCy.git
synced 2024-12-25 17:36:30 +03:00
add warning in debug_data for punctuation in entities (#4853)
This commit is contained in:
parent
d652ff215d
commit
6e9b61b49d
|
@ -192,6 +192,7 @@ def debug_data(
|
||||||
has_low_data_warning = False
|
has_low_data_warning = False
|
||||||
has_no_neg_warning = False
|
has_no_neg_warning = False
|
||||||
has_ws_ents_error = False
|
has_ws_ents_error = False
|
||||||
|
has_punct_ents_warning = False
|
||||||
|
|
||||||
msg.divider("Named Entity Recognition")
|
msg.divider("Named Entity Recognition")
|
||||||
msg.info(
|
msg.info(
|
||||||
|
@ -226,10 +227,16 @@ def debug_data(
|
||||||
|
|
||||||
if gold_train_data["ws_ents"]:
|
if gold_train_data["ws_ents"]:
|
||||||
msg.fail(
|
msg.fail(
|
||||||
"{} invalid whitespace entity spans".format(gold_train_data["ws_ents"])
|
"{} invalid whitespace entity span(s)".format(gold_train_data["ws_ents"])
|
||||||
)
|
)
|
||||||
has_ws_ents_error = True
|
has_ws_ents_error = True
|
||||||
|
|
||||||
|
if gold_train_data["punct_ents"]:
|
||||||
|
msg.warn(
|
||||||
|
"{} entity span(s) with punctuation".format(gold_train_data["punct_ents"])
|
||||||
|
)
|
||||||
|
has_punct_ents_warning = True
|
||||||
|
|
||||||
for label in new_labels:
|
for label in new_labels:
|
||||||
if label_counts[label] <= NEW_LABEL_THRESHOLD:
|
if label_counts[label] <= NEW_LABEL_THRESHOLD:
|
||||||
msg.warn(
|
msg.warn(
|
||||||
|
@ -253,6 +260,8 @@ def debug_data(
|
||||||
msg.good("Examples without occurrences available for all labels")
|
msg.good("Examples without occurrences available for all labels")
|
||||||
if not has_ws_ents_error:
|
if not has_ws_ents_error:
|
||||||
msg.good("No entities consisting of or starting/ending with whitespace")
|
msg.good("No entities consisting of or starting/ending with whitespace")
|
||||||
|
if not has_punct_ents_warning:
|
||||||
|
msg.good("No entities consisting of or starting/ending with punctuation")
|
||||||
|
|
||||||
if has_low_data_warning:
|
if has_low_data_warning:
|
||||||
msg.text(
|
msg.text(
|
||||||
|
@ -273,6 +282,12 @@ def debug_data(
|
||||||
"with whitespace characters are considered invalid."
|
"with whitespace characters are considered invalid."
|
||||||
)
|
)
|
||||||
|
|
||||||
|
if has_punct_ents_warning:
|
||||||
|
msg.text(
|
||||||
|
"Entity spans consisting of or starting/ending "
|
||||||
|
"with punctuation can not be trained with a noise level > 0."
|
||||||
|
)
|
||||||
|
|
||||||
if "textcat" in pipeline:
|
if "textcat" in pipeline:
|
||||||
msg.divider("Text Classification")
|
msg.divider("Text Classification")
|
||||||
labels = [label for label in gold_train_data["cats"]]
|
labels = [label for label in gold_train_data["cats"]]
|
||||||
|
@ -547,6 +562,7 @@ def _compile_gold(train_docs, pipeline):
|
||||||
"words": Counter(),
|
"words": Counter(),
|
||||||
"roots": Counter(),
|
"roots": Counter(),
|
||||||
"ws_ents": 0,
|
"ws_ents": 0,
|
||||||
|
"punct_ents": 0,
|
||||||
"n_words": 0,
|
"n_words": 0,
|
||||||
"n_misaligned_words": 0,
|
"n_misaligned_words": 0,
|
||||||
"n_sents": 0,
|
"n_sents": 0,
|
||||||
|
@ -568,6 +584,10 @@ def _compile_gold(train_docs, pipeline):
|
||||||
if label.startswith(("B-", "U-", "L-")) and doc[i].is_space:
|
if label.startswith(("B-", "U-", "L-")) and doc[i].is_space:
|
||||||
# "Illegal" whitespace entity
|
# "Illegal" whitespace entity
|
||||||
data["ws_ents"] += 1
|
data["ws_ents"] += 1
|
||||||
|
if label.startswith(("B-", "U-", "L-")) and doc[i].text in [".", "'", "!", "?", ","]:
|
||||||
|
# punctuation entity: could be replaced by whitespace when training with noise,
|
||||||
|
# so add a warning to alert the user to this unexpected side effect.
|
||||||
|
data["punct_ents"] += 1
|
||||||
if label.startswith(("B-", "U-")):
|
if label.startswith(("B-", "U-")):
|
||||||
combined_label = label.split("-")[1]
|
combined_label = label.split("-")[1]
|
||||||
data["ner"][combined_label] += 1
|
data["ner"][combined_label] += 1
|
||||||
|
|
|
@ -172,7 +172,8 @@ class Errors(object):
|
||||||
"and satisfies the correct annotations specified in the GoldParse. "
|
"and satisfies the correct annotations specified in the GoldParse. "
|
||||||
"For example, are all labels added to the model? If you're "
|
"For example, are all labels added to the model? If you're "
|
||||||
"training a named entity recognizer, also make sure that none of "
|
"training a named entity recognizer, also make sure that none of "
|
||||||
"your annotated entity spans have leading or trailing whitespace. "
|
"your annotated entity spans have leading or trailing whitespace "
|
||||||
|
"or punctuation. "
|
||||||
"You can also use the experimental `debug-data` command to "
|
"You can also use the experimental `debug-data` command to "
|
||||||
"validate your JSON-formatted training data. For details, run:\n"
|
"validate your JSON-formatted training data. For details, run:\n"
|
||||||
"python -m spacy debug-data --help")
|
"python -m spacy debug-data --help")
|
||||||
|
|
Loading…
Reference in New Issue
Block a user