mirror of
https://github.com/explosion/spaCy.git
synced 2025-01-12 18:26:30 +03:00
add warning in debug_data for punctuation in entities (#4853)
This commit is contained in:
parent
d652ff215d
commit
6e9b61b49d
|
@ -192,6 +192,7 @@ def debug_data(
|
|||
has_low_data_warning = False
|
||||
has_no_neg_warning = False
|
||||
has_ws_ents_error = False
|
||||
has_punct_ents_warning = False
|
||||
|
||||
msg.divider("Named Entity Recognition")
|
||||
msg.info(
|
||||
|
@ -226,10 +227,16 @@ def debug_data(
|
|||
|
||||
if gold_train_data["ws_ents"]:
|
||||
msg.fail(
|
||||
"{} invalid whitespace entity spans".format(gold_train_data["ws_ents"])
|
||||
"{} invalid whitespace entity span(s)".format(gold_train_data["ws_ents"])
|
||||
)
|
||||
has_ws_ents_error = True
|
||||
|
||||
if gold_train_data["punct_ents"]:
|
||||
msg.warn(
|
||||
"{} entity span(s) with punctuation".format(gold_train_data["punct_ents"])
|
||||
)
|
||||
has_punct_ents_warning = True
|
||||
|
||||
for label in new_labels:
|
||||
if label_counts[label] <= NEW_LABEL_THRESHOLD:
|
||||
msg.warn(
|
||||
|
@ -253,6 +260,8 @@ def debug_data(
|
|||
msg.good("Examples without occurrences available for all labels")
|
||||
if not has_ws_ents_error:
|
||||
msg.good("No entities consisting of or starting/ending with whitespace")
|
||||
if not has_punct_ents_warning:
|
||||
msg.good("No entities consisting of or starting/ending with punctuation")
|
||||
|
||||
if has_low_data_warning:
|
||||
msg.text(
|
||||
|
@ -273,6 +282,12 @@ def debug_data(
|
|||
"with whitespace characters are considered invalid."
|
||||
)
|
||||
|
||||
if has_punct_ents_warning:
|
||||
msg.text(
|
||||
"Entity spans consisting of or starting/ending "
|
||||
"with punctuation can not be trained with a noise level > 0."
|
||||
)
|
||||
|
||||
if "textcat" in pipeline:
|
||||
msg.divider("Text Classification")
|
||||
labels = [label for label in gold_train_data["cats"]]
|
||||
|
@ -547,6 +562,7 @@ def _compile_gold(train_docs, pipeline):
|
|||
"words": Counter(),
|
||||
"roots": Counter(),
|
||||
"ws_ents": 0,
|
||||
"punct_ents": 0,
|
||||
"n_words": 0,
|
||||
"n_misaligned_words": 0,
|
||||
"n_sents": 0,
|
||||
|
@ -568,6 +584,10 @@ def _compile_gold(train_docs, pipeline):
|
|||
if label.startswith(("B-", "U-", "L-")) and doc[i].is_space:
|
||||
# "Illegal" whitespace entity
|
||||
data["ws_ents"] += 1
|
||||
if label.startswith(("B-", "U-", "L-")) and doc[i].text in [".", "'", "!", "?", ","]:
|
||||
# punctuation entity: could be replaced by whitespace when training with noise,
|
||||
# so add a warning to alert the user to this unexpected side effect.
|
||||
data["punct_ents"] += 1
|
||||
if label.startswith(("B-", "U-")):
|
||||
combined_label = label.split("-")[1]
|
||||
data["ner"][combined_label] += 1
|
||||
|
|
|
@ -172,7 +172,8 @@ class Errors(object):
|
|||
"and satisfies the correct annotations specified in the GoldParse. "
|
||||
"For example, are all labels added to the model? If you're "
|
||||
"training a named entity recognizer, also make sure that none of "
|
||||
"your annotated entity spans have leading or trailing whitespace. "
|
||||
"your annotated entity spans have leading or trailing whitespace "
|
||||
"or punctuation. "
|
||||
"You can also use the experimental `debug-data` command to "
|
||||
"validate your JSON-formatted training data. For details, run:\n"
|
||||
"python -m spacy debug-data --help")
|
||||
|
|
Loading…
Reference in New Issue
Block a user