From 1ffb1a12ea1e1b25702b7519b53750bf2cef197f Mon Sep 17 00:00:00 2001
From: Peter Baumgartner <5107405+pmbaumgartner@users.noreply.github.com>
Date: Thu, 19 Jan 2023 14:53:23 -0500
Subject: [PATCH] initial tests for _compile_gold lemma attributes

Using the example data from the edit tree lemmatizer tests for:
- lemmatizer_trees
- partial_lemma_annotations
- n_low_cardinality_lemmas
- no_lemma_annotations
---
 spacy/tests/test_cli.py | 62 +++++++++++++++++++++++++++++++++++++++++
 1 file changed, 62 insertions(+)

diff --git a/spacy/tests/test_cli.py b/spacy/tests/test_cli.py
index d00f66c60..ba7efc704 100644
--- a/spacy/tests/test_cli.py
+++ b/spacy/tests/test_cli.py
@@ -1207,3 +1207,65 @@ def test_walk_directory():
         assert (len(walk_directory(d, suffix="iob"))) == 2
         assert (len(walk_directory(d, suffix="conll"))) == 3
         assert (len(walk_directory(d, suffix="pdf"))) == 0
+
+def test_debug_data_trainable_lemmatizer_basic():
+    examples = [
+        ("She likes green eggs", {"lemmas": ["she", "like", "green", "egg"]}),
+        ("Eat blue ham", {"lemmas": ["eat", "blue", "ham"]}),
+        ]
+    nlp = Language()
+    train_examples = []
+    for t in examples:
+        train_examples.append(Example.from_dict(nlp.make_doc(t[0]), t[1]))
+
+    data = _compile_gold(train_examples, ["trainable_lemmatizer"], nlp, True)
+    # ref test_edit_tree_lemmatizer::test_initialize_from_labels
+    # this results in 4 trees
+    assert len(data["lemmatizer_trees"]) == 4
+
+def test_debug_data_trainable_lemmatizer_partial():
+    partial_examples = [
+        # partial annotation
+        ("She likes green eggs", {"lemmas": ["", "like", "green", ""]}),
+        # misaligned partial annotation
+        (
+            "He hates green eggs",
+            {
+                "words": ["He", "hat", "es", "green", "eggs"],
+                "lemmas": ["", "hat", "e", "green", ""],
+            },
+        ),
+    ]
+    nlp = Language()
+    train_examples = []
+    for t in partial_examples:
+        train_examples.append(Example.from_dict(nlp.make_doc(t[0]), t[1]))
+
+    data = _compile_gold(train_examples, ["trainable_lemmatizer"], nlp, True)
+    assert data["partial_lemma_annotations"] == 2
+
+def test_debug_data_trainable_lemmatizer_low_cardinality():
+    low_cardinality_examples = [
+        ("She likes green eggs", {"lemmas": ["no", "no", "no", "no"]}),
+        ("Eat blue ham", {"lemmas": ["no", "no", "no"]}),
+    ]
+    nlp = Language()
+    train_examples = []
+    for t in low_cardinality_examples:
+        train_examples.append(Example.from_dict(nlp.make_doc(t[0]), t[1]))
+
+    data = _compile_gold(train_examples, ["trainable_lemmatizer"], nlp, True)
+    assert data["n_low_cardinality_lemmas"] == 2
+
+def test_debug_data_trainable_lemmatizer_not_annotated():
+    unannotated_examples = [
+        ("She likes green eggs", {}),
+        ("Eat blue ham", {}),
+    ]
+    nlp = Language()
+    train_examples = []
+    for t in unannotated_examples:
+        train_examples.append(Example.from_dict(nlp.make_doc(t[0]), t[1]))
+
+    data = _compile_gold(train_examples, ["trainable_lemmatizer"], nlp, True)
+    assert data["no_lemma_annotations"] == 2