cleanup

2026-03-07 05:11:27 +03:00 · 2022-09-07 10:30:58 -04:00 · 2022-09-07 10:30:58 -04:00 · c09d99a069
commit c09d99a069
parent 64b57204e2
1 changed file with 17 additions and 13 deletions
--- a/spacy/cli/debug_data.py
+++ b/spacy/cli/debug_data.py
@@ -676,6 +676,19 @@ def debug_data(
        trees_train: Set[str] = gold_train_data["lemmatizer_trees"]
        trees_dev: Set[str] = gold_dev_data["lemmatizer_trees"]

+        # Report the total tree counts for both sets first: they are the context a
+        # reader needs to judge whether the number of dev-only trees is meaningful.
+        msg.info(f"{len(trees_train)} lemmatizer trees generated from training data.")
+        msg.info(f"{len(trees_dev)} lemmatizer trees generated from dev data.")
+        dev_not_train = trees_dev - trees_train
+        if len(dev_not_train) != 0:
+            msg.warn(
+                f"{len(dev_not_train)} lemmatizer trees were found exclusively in the dev data."
+            )
+        else:
+            # Would we ever expect this case? It seems like it would be pretty rare.
+            msg.good("All trees in dev data present in training data.")
+
        if gold_train_data["n_low_cardinality_lemmas"] > 0:
            msg.warn(
                f"{gold_train_data['n_low_cardinality_lemmas']} docs with 1 or 0 unique lemmas."
@@ -683,12 +696,6 @@ def debug_data(
        else:
            msg.good("Training docs have sufficient unique lemmas")

-        train_not_dev = trees_train - trees_dev
-        if len(train_not_dev) != 0:
-            msg.warn(f"{len(train_not_dev)} labels were found only in the train data.")
-        else:
-            msg.good("Training data contains all lemmatizer trees in dev set.")
-
        if gold_train_data["n_low_cardinality_lemmas"] > 0:
            msg.warn(
                f"{gold_dev_data['n_low_cardinality_lemmas']} docs with 1 or 0 unique lemmas."
@@ -696,12 +703,6 @@ def debug_data(
        else:
            msg.good("Dev docs have sufficient unique lemmas")

-        dev_not_train = trees_dev - trees_train
-        if len(dev_not_train) != 0:
-            msg.warn(f"{len(dev_not_train)} labels were found only in the dev data.")
-        else:
-            msg.good("Trees in dev data present in training data.")
-
    msg.divider("Summary")
    good_counts = msg.counts[MESSAGES.GOOD]
    warn_counts = msg.counts[MESSAGES.WARN]
@@ -906,7 +907,10 @@ def _compile_gold(
                    tree_id = trees.add(token.text, token.lemma_)
                    tree_str = trees.tree_to_str(tree_id)
                    data["lemmatizer_trees"].add(tree_str)
-            if len(lemma_set) < 2:
+            # We want to identify cases where lemmas aren't assigned or are
+            # all assigned the same value (fewer than two unique lemmas), as
+            # this indicates an issue: we expect a large set of lemmas.
+            if len(lemma_set) < 2 and len(gold) > 1:
                data["n_low_cardinality_lemmas"] += 1
    return data