add warning in debug_data for punctuation in entities (#4853)

2025-11-24 03:46:02 +03:00 · 2020-01-06 14:59:28 +01:00 · 2020-01-06 14:59:28 +01:00 · 6e9b61b49d
commit 6e9b61b49d
parent d652ff215d
2 changed files with 23 additions and 2 deletions
--- a/spacy/cli/debug_data.py
+++ b/spacy/cli/debug_data.py
@@ -192,6 +192,7 @@ def debug_data(
        has_low_data_warning = False
        has_no_neg_warning = False
        has_ws_ents_error = False
+        has_punct_ents_warning = False

        msg.divider("Named Entity Recognition")
        msg.info(
@@ -226,10 +227,16 @@ def debug_data(

        if gold_train_data["ws_ents"]:
            msg.fail(
-                "{} invalid whitespace entity spans".format(gold_train_data["ws_ents"])
+                "{} invalid whitespace entity span(s)".format(gold_train_data["ws_ents"])
            )
            has_ws_ents_error = True

+        if gold_train_data["punct_ents"]:
+            msg.warn(
+                "{} entity span(s) with punctuation".format(gold_train_data["punct_ents"])
+            )
+            has_punct_ents_warning = True
+
        for label in new_labels:
            if label_counts[label] <= NEW_LABEL_THRESHOLD:
                msg.warn(
@@ -253,6 +260,8 @@ def debug_data(
            msg.good("Examples without occurrences available for all labels")
        if not has_ws_ents_error:
            msg.good("No entities consisting of or starting/ending with whitespace")
+        if not has_punct_ents_warning:
+            msg.good("No entities consisting of or starting/ending with punctuation")

        if has_low_data_warning:
            msg.text(
@@ -273,6 +282,12 @@ def debug_data(
                "with whitespace characters are considered invalid."
            )

+        if has_punct_ents_warning:
+            msg.text(
+                "Entity spans consisting of or starting/ending "
+                "with punctuation can not be trained with a noise level > 0."
+            )
+
    if "textcat" in pipeline:
        msg.divider("Text Classification")
        labels = [label for label in gold_train_data["cats"]]
@@ -547,6 +562,7 @@ def _compile_gold(train_docs, pipeline):
        "words": Counter(),
        "roots": Counter(),
        "ws_ents": 0,
+        "punct_ents": 0,
        "n_words": 0,
        "n_misaligned_words": 0,
        "n_sents": 0,
@@ -568,6 +584,10 @@ def _compile_gold(train_docs, pipeline):
                if label.startswith(("B-", "U-", "L-")) and doc[i].is_space:
                    # "Illegal" whitespace entity
                    data["ws_ents"] += 1
+                if label.startswith(("B-", "U-", "L-")) and doc[i].text in [".", "'", "!", "?", ","]:
+                    # punctuation entity: could be replaced by whitespace when training with noise,
+                    # so add a warning to alert the user to this unexpected side effect.
+                    data["punct_ents"] += 1
                if label.startswith(("B-", "U-")):
                    combined_label = label.split("-")[1]
                    data["ner"][combined_label] += 1
--- a/spacy/errors.py
+++ b/spacy/errors.py
@@ -172,7 +172,8 @@ class Errors(object):
            "and satisfies the correct annotations specified in the GoldParse. "
            "For example, are all labels added to the model? If you're "
            "training a named entity recognizer, also make sure that none of "
-            "your annotated entity spans have leading or trailing whitespace. "
+            "your annotated entity spans have leading or trailing whitespace "
+            "or punctuation. "
            "You can also use the experimental `debug-data` command to "
            "validate your JSON-formatted training data. For details, run:\n"
            "python -m spacy debug-data --help")