diff --git a/spacy/cli/debug_data.py b/spacy/cli/debug_data.py index 5d044e617..4b12052c3 100644 --- a/spacy/cli/debug_data.py +++ b/spacy/cli/debug_data.py @@ -192,6 +192,7 @@ def debug_data( has_low_data_warning = False has_no_neg_warning = False has_ws_ents_error = False + has_punct_ents_warning = False msg.divider("Named Entity Recognition") msg.info( @@ -226,10 +227,16 @@ def debug_data( if gold_train_data["ws_ents"]: msg.fail( - "{} invalid whitespace entity spans".format(gold_train_data["ws_ents"]) + "{} invalid whitespace entity span(s)".format(gold_train_data["ws_ents"]) ) has_ws_ents_error = True + if gold_train_data["punct_ents"]: + msg.warn( + "{} entity span(s) with punctuation".format(gold_train_data["punct_ents"]) + ) + has_punct_ents_warning = True + for label in new_labels: if label_counts[label] <= NEW_LABEL_THRESHOLD: msg.warn( @@ -253,6 +260,8 @@ def debug_data( msg.good("Examples without occurrences available for all labels") if not has_ws_ents_error: msg.good("No entities consisting of or starting/ending with whitespace") + if not has_punct_ents_warning: + msg.good("No entities consisting of or starting/ending with punctuation") if has_low_data_warning: msg.text( @@ -273,6 +282,12 @@ def debug_data( "with whitespace characters are considered invalid." ) + if has_punct_ents_warning: + msg.text( + "Entity spans consisting of or starting/ending " + "with punctuation can not be trained with a noise level > 0." + ) + if "textcat" in pipeline: msg.divider("Text Classification") labels = [label for label in gold_train_data["cats"]] @@ -547,6 +562,7 @@ def _compile_gold(train_docs, pipeline): "words": Counter(), "roots": Counter(), "ws_ents": 0, + "punct_ents": 0, "n_words": 0, "n_misaligned_words": 0, "n_sents": 0, @@ -568,6 +584,10 @@ def _compile_gold(train_docs, pipeline): if label.startswith(("B-", "U-", "L-")) and doc[i].is_space: # "Illegal" whitespace entity data["ws_ents"] += 1 + if label.startswith(("B-", "U-", "L-")) and doc[i].text in [".", "'", "!", "?", ","]: + # punctuation entity: could be replaced by whitespace when training with noise, + # so add a warning to alert the user to this unexpected side effect. + data["punct_ents"] += 1 if label.startswith(("B-", "U-")): combined_label = label.split("-")[1] data["ner"][combined_label] += 1 diff --git a/spacy/errors.py b/spacy/errors.py index fd0f66cd9..2f0a8a2ad 100644 --- a/spacy/errors.py +++ b/spacy/errors.py @@ -172,7 +172,8 @@ class Errors(object): "and satisfies the correct annotations specified in the GoldParse. " "For example, are all labels added to the model? If you're " "training a named entity recognizer, also make sure that none of " - "your annotated entity spans have leading or trailing whitespace. " + "your annotated entity spans have leading or trailing whitespace " + "or punctuation. " "You can also use the experimental `debug-data` command to " "validate your JSON-formatted training data. For details, run:\n" "python -m spacy debug-data --help")