partial annotation support

2025-11-15 15:25:53 +03:00 · 2023-01-12 09:10:39 -05:00 · 2023-01-12 09:10:39 -05:00 · a944e55291
commit a944e55291
parent 28c31048c9
1 changed files with 29 additions and 14 deletions
--- a/spacy/cli/debug_data.py
+++ b/spacy/cli/debug_data.py
@@ -708,11 +708,23 @@ def debug_data(
            n = gold_train_data["no_lemma_annotations"]
            msg.warn(f"{n} docs with no lemma annotations.")
        else:
-            msg.good("All training docs have complete lemma annotations.")
+            msg.good("All training docs have lemma annotations.")
        if gold_dev_data["no_lemma_annotations"] > 0:
            n = gold_dev_data["no_lemma_annotations"]
            msg.warn(f"{n} docs with no lemma annotations.")
        else:
            msg.good("All dev docs have lemma annotations.")
+        if gold_train_data["partial_lemma_annotations"] > 0:
+            n = gold_train_data["partial_lemma_annotations"]
+            msg.info(f"{n} docs with partial lemma annotations.")
+        else:
+            msg.good("All training docs have complete lemma annotations.")
+        if gold_dev_data["partial_lemma_annotations"] > 0:
+            n = gold_dev_data["partial_lemma_annotations"]
+            msg.info(f"{n} docs with partial lemma annotations.")
+        else:
+            msg.good("All dev docs have complete lemma annotations.")
@@ -779,6 +791,7 @@ def _compile_gold(
        "texts": set(),
        "lemmatizer_trees": set(),
        "no_lemma_annotations": 0,
+        "partial_lemma_annotations": 0,
        "n_low_cardinality_lemmas": 0,
    }
    if "trainable_lemmatizer" in factory_names:
@@ -916,19 +929,21 @@ def _compile_gold(
            # from EditTreeLemmatizer._labels_from_data
            if all(token.lemma == 0 for token in gold):
                data["no_lemma_annotations"] += 1
-            else:
+                continue
-                lemma_set = set()
+            if any(token.lemma == 0 for token in gold):
-                for token in gold:
+                data["partial_lemma_annotations"] += 1
-                    if token.lemma != 0:
+            lemma_set = set()
-                        lemma_set.add(token.lemma)
+            for token in gold:
-                        tree_id = trees.add(token.text, token.lemma_)
+                if token.lemma != 0:
-                        tree_str = trees.tree_to_str(tree_id)
+                    lemma_set.add(token.lemma)
-                        data["lemmatizer_trees"].add(tree_str)
+                    tree_id = trees.add(token.text, token.lemma_)
-                # We want to identify cases where lemmas aren't assigned
+                    tree_str = trees.tree_to_str(tree_id)
-                # or are all assigned the same value, as this would indicate
+                    data["lemmatizer_trees"].add(tree_str)
-                # an issue since we're expecting a large set of lemmas
+            # We want to identify cases where lemmas aren't assigned
-                if len(lemma_set) < 2 and len(gold) > 1:
+            # or are all assigned the same value, as this would indicate
-                    data["n_low_cardinality_lemmas"] += 1
+            # an issue since we're expecting a large set of lemmas
+            if len(lemma_set) < 2 and len(gold) > 1:
+                data["n_low_cardinality_lemmas"] += 1
    return data