diff --git a/spacy/cli/debug_data.py b/spacy/cli/debug_data.py index 9a9ad9ae1..f2d743e10 100644 --- a/spacy/cli/debug_data.py +++ b/spacy/cli/debug_data.py @@ -693,18 +693,6 @@ def debug_data( # and we might actually want a warning? msg.info("All trees in dev data present in training data.") - if gold_train_data["n_low_cardinality_lemmas"] > 0: - n = gold_train_data["n_low_cardinality_lemmas"] - msg.warn(f"{n} docs with 1 or 0 unique lemmas.") - else: - msg.good("All training docs meet lemma uniqueness requirements.") - - if gold_dev_data["n_low_cardinality_lemmas"] > 0: - n = gold_dev_data["n_low_cardinality_lemmas"] - msg.warn(f"{n} docs with 1 or 0 unique lemmas.") - else: - msg.good("All dev docs meet lemma uniqueness requirements.") - if gold_train_data["no_lemma_annotations"] > 0: n = gold_train_data["no_lemma_annotations"] msg.warn(f"{n} docs with no lemma annotations.") @@ -793,7 +781,6 @@ def _compile_gold( "lemmatizer_trees": set(), "no_lemma_annotations": 0, "partial_lemma_annotations": 0, - "n_low_cardinality_lemmas": 0, } if "trainable_lemmatizer" in factory_names: trees = EditTrees(nlp.vocab.strings) @@ -933,18 +920,11 @@ def _compile_gold( continue if any(token.lemma == 0 for token in gold): data["partial_lemma_annotations"] += 1 - lemma_set = set() for token in gold: if token.lemma != 0: - lemma_set.add(token.lemma) tree_id = trees.add(token.text, token.lemma_) tree_str = trees.tree_to_str(tree_id) data["lemmatizer_trees"].add(tree_str) - # We want to identify cases where lemmas aren't assigned - # or are all assigned the same value, as this would indicate - # an issue since we're expecting a large set of lemmas - if len(lemma_set) < 2 and len(gold) > 1: - data["n_low_cardinality_lemmas"] += 1 return data diff --git a/spacy/tests/test_cli.py b/spacy/tests/test_cli.py index ba7efc704..bc5d6b826 100644 --- a/spacy/tests/test_cli.py +++ b/spacy/tests/test_cli.py @@ -1244,19 +1244,6 @@ def test_debug_data_trainable_lemmatizer_partial(): data = _compile_gold(train_examples, ["trainable_lemmatizer"], nlp, True) assert data["partial_lemma_annotations"] == 2 -def test_debug_data_trainable_lemmatizer_low_cardinality(): - low_cardinality_examples = [ - ("She likes green eggs", {"lemmas": ["no", "no", "no", "no"]}), - ("Eat blue ham", {"lemmas": ["no", "no", "no"]}), - ] - nlp = Language() - train_examples = [] - for t in low_cardinality_examples: - train_examples.append(Example.from_dict(nlp.make_doc(t[0]), t[1])) - - data = _compile_gold(train_examples, ["trainable_lemmatizer"], nlp, True) - assert data["n_low_cardinality_lemmas"] == 2 - def test_debug_data_trainable_lemmatizer_not_annotated(): unannotated_examples = [ ("She likes green eggs", {}),