add basic tests for debugging

2025-08-09 14:44:52 +03:00 · 2021-05-28 14:19:55 +02:00 · 2021-05-28 14:19:55 +02:00 · 0f5c586e2f
commit 0f5c586e2f
parent 391b512afd
2 changed files with 126 additions and 4 deletions
--- a/spacy/ml/models/coref.py
+++ b/spacy/ml/models/coref.py
@@ -59,6 +59,7 @@ def tuplify(layer1: Model, layer2: Model, *layers) -> Model:
    )


+# TODO replace this with thinc version once PR is in
 def tuplify_forward(model, X, is_train):
    Ys = []
    backprops = []
@@ -77,16 +78,27 @@ def tuplify_forward(model, X, is_train):
    return tuple(Ys), backprop_tuplify


-# TODO make more robust, see chain
+# TODO replace this with thinc version once PR is in
 def tuplify_init(model, X, Y) -> Model:
    if X is None and Y is None:
        for layer in model.layers:
            layer.initialize()
-
+        if model.layers[0].has_dim("nI"):
+            model.set_dim("nI", model.layers[0].get_dim("nI"))
        return model

-    for layer in model.layers:
-        layer.initialize(X=X)
+    # Try to set nO on each layer, where available.
+    # All layers have the same input, and the output should map directly from the
+    # given Y, if provided.
+    for ii, layer in enumerate(model.layers):
+        if Y is not None and layer.has_dim("nO") is None:
+            layer.initialize(X=X, Y=Y[ii])
+        else:
+            layer.initialize(X=X)
+
+    if model.layers[0].has_dim("nI"):
+        model.set_dim("nI", model.layers[0].get_dim("nI"))
+    # The output is a tuple of the layers' outputs, so this model can have an input dimension (nI) but can't have an output dimension (nO).
    return model


--- a/spacy/tests/pipeline/test_coref.py
+++ b/spacy/tests/pipeline/test_coref.py
@@ -0,0 +1,110 @@
+import pytest
+from spacy import util
+from spacy.training import Example
+from spacy.lang.en import English
+from spacy.tests.util import make_tempdir
+from spacy.pipeline.coref import DEFAULT_CLUSTERS_PREFIX
+
+# fmt: off
+TRAIN_DATA = [
+    (
+        "Yes, I noticed that many friends around me received it. It seems that almost everyone received this SMS.",
+        {
+            "spans": {
+                f"{DEFAULT_CLUSTERS_PREFIX}_1": [
+                    (5, 6, "MENTION"),      # I
+                    (40, 42, "MENTION"),    # me
+
+                ],
+                f"{DEFAULT_CLUSTERS_PREFIX}_2": [
+                    (52, 54, "MENTION"),     # it
+                    (95, 103, "MENTION"),    # this SMS
+                ]
+            }
+        },
+    ),
+]
+# fmt: on
+
+
+@pytest.fixture
+def nlp():
+    return English()
+
+
+def test_add_pipe(nlp):
+    nlp.add_pipe("coref")
+    assert nlp.pipe_names == ["coref"]
+
+
+def test_not_initialized(nlp):
+    nlp.add_pipe("coref")
+    text = "She gave me her pen."
+    with pytest.raises(ValueError):
+        nlp(text)
+
+
+def test_initialized(nlp):
+    nlp.add_pipe("coref")
+    nlp.initialize()
+    assert nlp.pipe_names == ["coref"]
+    text = "She gave me her pen."
+    doc = nlp(text)
+    # The resulting spans are weird & non-deterministic, so they are only printed here, not asserted
+    print(doc.spans)
+
+
+def test_initialized_2(nlp):
+    nlp.add_pipe("coref")
+    nlp.initialize()
+    assert nlp.pipe_names == ["coref"]
+    text = "She gave me her pen."
+    doc = nlp(text)
+    # TODO: this second nlp(text) call on the same text crashes
+    print(nlp(text).spans)
+
+
+def test_overfitting_IO(nlp):
+    # Simple test to try and quickly overfit the coref component - ensuring the ML models work correctly
+    train_examples = []
+    for text, annot in TRAIN_DATA:
+        train_examples.append(Example.from_dict(nlp.make_doc(text), annot))
+
+    nlp.add_pipe("coref")
+    optimizer = nlp.initialize()
+    test_text = TRAIN_DATA[0][0]
+    doc = nlp(test_text)
+    print("BEFORE", doc.spans)
+
+    for i in range(5):
+        losses = {}
+        nlp.update(train_examples, sgd=optimizer, losses=losses)
+        doc = nlp(test_text)
+        print(i, doc.spans)
+    print(losses["coref"])  # TODO: assert this is < 0.001
+
+    # test the trained model
+    doc = nlp(test_text)
+    print("AFTER", doc.spans)
+
+    # Also test the results are still the same after IO
+    with make_tempdir() as tmp_dir:
+        nlp.to_disk(tmp_dir)
+        nlp2 = util.load_model_from_path(tmp_dir)
+        doc2 = nlp2(test_text)
+        print("doc2", doc2.spans)
+
+    # Make sure that running pipe twice, or comparing to call, always amounts to the same predictions
+    texts = [
+        test_text,
+        "I noticed many friends around me",
+        "They received it. They received the SMS.",
+    ]
+    batch_deps_1 = [doc.spans for doc in nlp.pipe(texts)]
+    print(batch_deps_1)
+    batch_deps_2 = [doc.spans for doc in nlp.pipe(texts)]
+    print(batch_deps_2)
+    no_batch_deps = [doc.spans for doc in [nlp(text) for text in texts]]
+    print(no_batch_deps)
+    # assert_equal(batch_deps_1, batch_deps_2)
+    # assert_equal(batch_deps_1, no_batch_deps)