trainable_lemmatizer in debug data (#11419)

* WIP * rm ipython embeds * rm total * WIP * cleanup * cleanup + reword * rm component function * remove migration support form * fix reference dataset for dev data * additional fixes - set approach to identifying unique trees - adjust line length on messages - add logic for detecting docs without annotations * use 0 instead of none for no annotation * partial annotation support * initial tests for _compile_gold lemma attributes Using the example data from the edit tree lemmatizer tests for: - lemmatizer_trees - partial_lemma_annotations - n_low_cardinality_lemmas - no_lemma_annotations * adds output test for cli app * switch msg level * rm unclear uniqueness check * Revert "rm unclear uniqueness check" This reverts commit 6ea2b3524b. * remove good message on uniqueness * formatting * use en_vocab fixture * clarify data set source in messages * remove unnecessary import Co-authored-by: svlandeg <svlandeg@github.com>
2025-07-18 20:22:25 +03:00 · 2023-01-26 11:36:50 -05:00 · 2023-01-26 11:36:50 -05:00 · c68e6b8a96
commit c68e6b8a96
parent 8d69874afb
3 changed files with 194 additions and 0 deletions
--- a/spacy/cli/debug_data.py
+++ b/spacy/cli/debug_data.py
@ -17,6 +17,7 @@ from ..pipeline import TrainablePipe
 from ..pipeline._parser_internals import nonproj
 from ..pipeline._parser_internals.nonproj import DELIMITER
 from ..pipeline import Morphologizer, SpanCategorizer
 from ..pipeline._edit_tree_internals.edit_trees import EditTrees
 from ..morphology import Morphology
 from ..language import Language
 from ..util import registry, resolve_dot_names
@ -671,6 +672,59 @@ def debug_data(
                f"Found {gold_train_data['n_cycles']} projectivized train sentence(s) with cycles"
            )
    if "trainable_lemmatizer" in factory_names:
        msg.divider("Trainable Lemmatizer")
        trees_train: Set[str] = gold_train_data["lemmatizer_trees"]
        trees_dev: Set[str] = gold_dev_data["lemmatizer_trees"]
        # This is necessary context when someone is attempting to interpret whether the
        # number of trees exclusively in the dev set is meaningful.
        msg.info(f"{len(trees_train)} lemmatizer trees generated from training data")
        msg.info(f"{len(trees_dev)} lemmatizer trees generated from dev data")
        dev_not_train = trees_dev - trees_train
        if len(dev_not_train) != 0:
            pct = len(dev_not_train) / len(trees_dev)
            msg.info(
                f"{len(dev_not_train)} lemmatizer trees ({pct*100:.1f}% of dev trees)"
                " were found exclusively in the dev data."
            )
        else:
            # Would we ever expect this case? It seems like it would be pretty rare,
            # and we might actually want a warning?
            msg.info("All trees in dev data present in training data.")
        if gold_train_data["n_low_cardinality_lemmas"] > 0:
            n = gold_train_data["n_low_cardinality_lemmas"]
            msg.warn(f"{n} training docs with 0 or 1 unique lemmas.")
        if gold_dev_data["n_low_cardinality_lemmas"] > 0:
            n = gold_dev_data["n_low_cardinality_lemmas"]
            msg.warn(f"{n} dev docs with 0 or 1 unique lemmas.")
        if gold_train_data["no_lemma_annotations"] > 0:
            n = gold_train_data["no_lemma_annotations"]
            msg.warn(f"{n} training docs with no lemma annotations.")
        else:
            msg.good("All training docs have lemma annotations.")
        if gold_dev_data["no_lemma_annotations"] > 0:
            n = gold_dev_data["no_lemma_annotations"]
            msg.warn(f"{n} dev docs with no lemma annotations.")
        else:
            msg.good("All dev docs have lemma annotations.")
        if gold_train_data["partial_lemma_annotations"] > 0:
            n = gold_train_data["partial_lemma_annotations"]
            msg.info(f"{n} training docs with partial lemma annotations.")
        else:
            msg.good("All training docs have complete lemma annotations.")
        if gold_dev_data["partial_lemma_annotations"] > 0:
            n = gold_dev_data["partial_lemma_annotations"]
            msg.info(f"{n} dev docs with partial lemma annotations.")
        else:
            msg.good("All dev docs have complete lemma annotations.")
    msg.divider("Summary")
    good_counts = msg.counts[MESSAGES.GOOD]
    warn_counts = msg.counts[MESSAGES.WARN]
@ -732,7 +786,13 @@ def _compile_gold(
        "n_cats_multilabel": 0,
        "n_cats_bad_values": 0,
        "texts": set(),
        "lemmatizer_trees": set(),
        "no_lemma_annotations": 0,
        "partial_lemma_annotations": 0,
        "n_low_cardinality_lemmas": 0,
    }
    if "trainable_lemmatizer" in factory_names:
        trees = EditTrees(nlp.vocab.strings)
    for eg in examples:
        gold = eg.reference
        doc = eg.predicted
@ -862,6 +922,25 @@ def _compile_gold(
                data["n_nonproj"] += 1
            if nonproj.contains_cycle(aligned_heads):
                data["n_cycles"] += 1
        if "trainable_lemmatizer" in factory_names:
            # from EditTreeLemmatizer._labels_from_data
            if all(token.lemma == 0 for token in gold):
                data["no_lemma_annotations"] += 1
                continue
            if any(token.lemma == 0 for token in gold):
                data["partial_lemma_annotations"] += 1
            lemma_set = set()
            for token in gold:
                if token.lemma != 0:
                    lemma_set.add(token.lemma)
                    tree_id = trees.add(token.text, token.lemma_)
                    tree_str = trees.tree_to_str(tree_id)
                    data["lemmatizer_trees"].add(tree_str)
            # We want to identify cases where lemmas aren't assigned
            # or are all assigned the same value, as this would indicate
            # an issue since we're expecting a large set of lemmas
            if len(lemma_set) < 2 and len(gold) > 1:
                data["n_low_cardinality_lemmas"] += 1
    return data
--- a/spacy/tests/test_cli.py
+++ b/spacy/tests/test_cli.py
@ -1207,3 +1207,69 @@ def test_walk_directory():
        assert (len(walk_directory(d, suffix="iob"))) == 2
        assert (len(walk_directory(d, suffix="conll"))) == 3
        assert (len(walk_directory(d, suffix="pdf"))) == 0
 def test_debug_data_trainable_lemmatizer_basic():
    """Fully annotated docs yield the expected number of lemmatizer trees."""
    nlp = Language()
    annotated_texts = [
        ("She likes green eggs", {"lemmas": ["she", "like", "green", "egg"]}),
        ("Eat blue ham", {"lemmas": ["eat", "blue", "ham"]}),
    ]
    train_examples = [
        Example.from_dict(nlp.make_doc(text), annotations)
        for text, annotations in annotated_texts
    ]

    data = _compile_gold(train_examples, ["trainable_lemmatizer"], nlp, True)

    # Same data as test_edit_tree_lemmatizer::test_initialize_from_labels,
    # which results in 4 distinct trees.
    n_trees = len(data["lemmatizer_trees"])
    assert n_trees == 4
 def test_debug_data_trainable_lemmatizer_partial():
    """Docs with lemmas on only some tokens count as partially annotated."""
    nlp = Language()
    # Partial annotation: first and last tokens have an empty lemma.
    partially_annotated = (
        "She likes green eggs",
        {"lemmas": ["", "like", "green", ""]},
    )
    # Misaligned partial annotation: the reference tokenization ("words")
    # differs from what `nlp.make_doc` produces for the raw text.
    misaligned_partially_annotated = (
        "He hates green eggs",
        {
            "words": ["He", "hat", "es", "green", "eggs"],
            "lemmas": ["", "hat", "e", "green", ""],
        },
    )
    train_examples = [
        Example.from_dict(nlp.make_doc(text), annotations)
        for text, annotations in (
            partially_annotated,
            misaligned_partially_annotated,
        )
    ]

    data = _compile_gold(train_examples, ["trainable_lemmatizer"], nlp, True)

    # Both docs should be reported as partially annotated.
    assert data["partial_lemma_annotations"] == 2
 def test_debug_data_trainable_lemmatizer_low_cardinality():
    """Docs whose tokens all share a single lemma are counted as low cardinality."""
    nlp = Language()
    # Every token in each doc is given the same lemma ("no").
    annotated_texts = [
        ("She likes green eggs", {"lemmas": ["no", "no", "no", "no"]}),
        ("Eat blue ham", {"lemmas": ["no", "no", "no"]}),
    ]
    train_examples = [
        Example.from_dict(nlp.make_doc(text), annotations)
        for text, annotations in annotated_texts
    ]

    data = _compile_gold(train_examples, ["trainable_lemmatizer"], nlp, True)

    n_flagged = data["n_low_cardinality_lemmas"]
    assert n_flagged == 2
 def test_debug_data_trainable_lemmatizer_not_annotated():
    """Docs created without any lemma annotations are counted as unannotated."""
    nlp = Language()
    # Empty annotation dicts: no "lemmas" key is provided for either text.
    texts = ["She likes green eggs", "Eat blue ham"]
    train_examples = [Example.from_dict(nlp.make_doc(text), {}) for text in texts]

    data = _compile_gold(train_examples, ["trainable_lemmatizer"], nlp, True)

    n_unannotated = data["no_lemma_annotations"]
    assert n_unannotated == 2
--- a/spacy/tests/test_cli_app.py
+++ b/spacy/tests/test_cli_app.py
@ -1,6 +1,7 @@
 import os
 from pathlib import Path
 from typer.testing import CliRunner
 from spacy.tokens import DocBin, Doc
 from spacy.cli._util import app
 from .util import make_tempdir
@ -40,3 +41,51 @@ def test_benchmark_accuracy_alias():
    assert result_benchmark.stdout == result_evaluate.stdout.replace(
        "spacy evaluate", "spacy benchmark accuracy"
    )
 def test_debug_data_trainable_lemmatizer_cli(en_vocab):
    """Run `debug data` via the CLI and check the lemmatizer section is printed.

    Small train/dev corpora with lemma annotations are written to a temporary
    directory, a config with a `trainable_lemmatizer` pipeline is generated
    with `init config`, and `debug data` is then invoked on that config.

    en_vocab: Vocab fixture used to construct the annotated docs.
    """
    train_docs = [
        Doc(en_vocab, words=["I", "like", "cats"], lemmas=["I", "like", "cat"]),
        Doc(
            en_vocab,
            words=["Dogs", "are", "great", "too"],
            lemmas=["dog", "be", "great", "too"],
        ),
    ]
    dev_docs = [
        Doc(en_vocab, words=["Cats", "are", "cute"], lemmas=["cat", "be", "cute"]),
        Doc(en_vocab, words=["Pets", "are", "great"], lemmas=["pet", "be", "great"]),
    ]
    with make_tempdir() as d_in:
        train_bin = DocBin(docs=train_docs)
        train_bin.to_disk(d_in / "train.spacy")
        dev_bin = DocBin(docs=dev_docs)
        dev_bin.to_disk(d_in / "dev.spacy")
        # `debug data` requires an input pipeline config
        result_init_config = CliRunner().invoke(
            app,
            [
                "init",
                "config",
                f"{d_in}/config.cfg",
                "--lang",
                "en",
                "--pipeline",
                "trainable_lemmatizer",
            ],
        )
        # Fail here, with the CLI output, if the config could not be generated.
        # Otherwise the problem would only surface below as a missing section
        # header, which points at the wrong command.
        assert result_init_config.exit_code == 0, result_init_config.stdout
        result_debug_data = CliRunner().invoke(
            app,
            [
                "debug",
                "data",
                f"{d_in}/config.cfg",
                "--paths.train",
                f"{d_in}/train.spacy",
                "--paths.dev",
                f"{d_in}/dev.spacy",
            ],
        )
        # Instead of checking specific wording of the output, which may change,
        # we'll check that this section of the debug output is present.
        # NOTE(review): the exit code of `debug data` is intentionally not
        # asserted here; only the presence of the section header is.
        assert "= Trainable Lemmatizer =" in result_debug_data.stdout