diff --git a/spacy/cli/debug_data.py b/spacy/cli/debug_data.py index f2d743e10..9a9ad9ae1 100644 --- a/spacy/cli/debug_data.py +++ b/spacy/cli/debug_data.py @@ -693,6 +693,18 @@ def debug_data( # and we might actually want a warning? msg.info("All trees in dev data present in training data.") + if gold_train_data["n_low_cardinality_lemmas"] > 0: + n = gold_train_data["n_low_cardinality_lemmas"] + msg.warn(f"{n} training docs with 1 or 0 unique lemmas.") + else: + msg.good("All training docs meet lemma uniqueness requirements.") + + if gold_dev_data["n_low_cardinality_lemmas"] > 0: + n = gold_dev_data["n_low_cardinality_lemmas"] + msg.warn(f"{n} dev docs with 1 or 0 unique lemmas.") + else: + msg.good("All dev docs meet lemma uniqueness requirements.") + if gold_train_data["no_lemma_annotations"] > 0: n = gold_train_data["no_lemma_annotations"] msg.warn(f"{n} docs with no lemma annotations.") @@ -781,6 +793,7 @@ def _compile_gold( "lemmatizer_trees": set(), "no_lemma_annotations": 0, "partial_lemma_annotations": 0, + "n_low_cardinality_lemmas": 0, } if "trainable_lemmatizer" in factory_names: trees = EditTrees(nlp.vocab.strings) @@ -920,11 +933,18 @@ def _compile_gold( continue if any(token.lemma == 0 for token in gold): data["partial_lemma_annotations"] += 1 + lemma_set = set() for token in gold: if token.lemma != 0: + lemma_set.add(token.lemma) tree_id = trees.add(token.text, token.lemma_) tree_str = trees.tree_to_str(tree_id) data["lemmatizer_trees"].add(tree_str) + # We want to identify cases where lemmas aren't assigned + # or are all assigned the same value, as this would indicate + # an issue since we're expecting a large set of lemmas + if len(lemma_set) < 2 and len(gold) > 1: + data["n_low_cardinality_lemmas"] += 1 return data diff --git a/spacy/tests/test_cli.py b/spacy/tests/test_cli.py index bc5d6b826..ba7efc704 100644 --- a/spacy/tests/test_cli.py +++ b/spacy/tests/test_cli.py @@ -1244,6 +1244,19 @@ def test_debug_data_trainable_lemmatizer_partial(): data =
_compile_gold(train_examples, ["trainable_lemmatizer"], nlp, True) assert data["partial_lemma_annotations"] == 2 +def test_debug_data_trainable_lemmatizer_low_cardinality(): + low_cardinality_examples = [ + ("She likes green eggs", {"lemmas": ["no", "no", "no", "no"]}), + ("Eat blue ham", {"lemmas": ["no", "no", "no"]}), + ] + nlp = Language() + train_examples = [] + for t in low_cardinality_examples: + train_examples.append(Example.from_dict(nlp.make_doc(t[0]), t[1])) + + data = _compile_gold(train_examples, ["trainable_lemmatizer"], nlp, True) + assert data["n_low_cardinality_lemmas"] == 2 + def test_debug_data_trainable_lemmatizer_not_annotated(): unannotated_examples = [ ("She likes green eggs", {}),