From cfc72c29959b1ab715d4b647c66e0c2e0b5f4979 Mon Sep 17 00:00:00 2001
From: Sofie Van Landeghem
Date: Tue, 8 Dec 2020 23:29:15 +0100
Subject: [PATCH] Bugfix multi-label textcat reproducibility (#6481)

* add test for multi-label textcat reproducibility

* remove positive_label

* fix lengths dtype

* fix comments

* remove comment that we should not have forgotten :-)
---
 spacy/ml/extract_ngrams.py           |  5 +--
 spacy/tests/pipeline/test_textcat.py | 59 +++++++++++++++++++++++++---
 2 files changed, 54 insertions(+), 10 deletions(-)

diff --git a/spacy/ml/extract_ngrams.py b/spacy/ml/extract_ngrams.py
index bdc297232..93878c81b 100644
--- a/spacy/ml/extract_ngrams.py
+++ b/spacy/ml/extract_ngrams.py
@@ -23,10 +23,7 @@ def forward(model: Model, docs, is_train: bool):
         keys, vals = model.ops.xp.unique(keys, return_counts=True)
         batch_keys.append(keys)
         batch_vals.append(vals)
-    # The dtype here matches what thinc is expecting -- which differs per
-    # platform (by int definition). This should be fixed once the problem
-    # is fixed on Thinc's side.
-    lengths = model.ops.asarray([arr.shape[0] for arr in batch_keys], dtype=numpy.int_)
+    lengths = model.ops.asarray([arr.shape[0] for arr in batch_keys], dtype="int32")
     batch_keys = model.ops.xp.concatenate(batch_keys)
     batch_vals = model.ops.asarray(model.ops.xp.concatenate(batch_vals), dtype="f")
diff --git a/spacy/tests/pipeline/test_textcat.py b/spacy/tests/pipeline/test_textcat.py
index 06d512a32..733535b32 100644
--- a/spacy/tests/pipeline/test_textcat.py
+++ b/spacy/tests/pipeline/test_textcat.py
@@ -135,7 +135,7 @@ def test_initialize_examples():
 
 
 def test_overfitting_IO():
-    # Simple test to try and quickly overfit the textcat component - ensuring the ML models work correctly
+    # Simple test to try and quickly overfit the single-label textcat component - ensuring the ML models work correctly
     fix_random_seed(0)
     nlp = English()
     nlp.config["initialize"]["components"]["textcat"] = {"positive_label": "POSITIVE"}
@@ -177,11 +177,58 @@
 
     # Make sure that running pipe twice, or comparing to call, always amounts to the same predictions
     texts = ["Just a sentence.", "I like green eggs.", "I am happy.", "I eat ham."]
-    batch_deps_1 = [doc.cats for doc in nlp.pipe(texts)]
-    batch_deps_2 = [doc.cats for doc in nlp.pipe(texts)]
-    no_batch_deps = [doc.cats for doc in [nlp(text) for text in texts]]
-    assert_equal(batch_deps_1, batch_deps_2)
-    assert_equal(batch_deps_1, no_batch_deps)
+    batch_cats_1 = [doc.cats for doc in nlp.pipe(texts)]
+    batch_cats_2 = [doc.cats for doc in nlp.pipe(texts)]
+    no_batch_cats = [doc.cats for doc in [nlp(text) for text in texts]]
+    assert_equal(batch_cats_1, batch_cats_2)
+    assert_equal(batch_cats_1, no_batch_cats)
+
+
+def test_overfitting_IO_multi():
+    # Simple test to try and quickly overfit the multi-label textcat component - ensuring the ML models work correctly
+    fix_random_seed(0)
+    nlp = English()
+    # Set exclusive labels to False
+    config = {"model": {"linear_model": {"exclusive_classes": False}}}
+    textcat = nlp.add_pipe("textcat", config=config)
+    train_examples = []
+    for text, annotations in TRAIN_DATA:
+        train_examples.append(Example.from_dict(nlp.make_doc(text), annotations))
+    optimizer = nlp.initialize(get_examples=lambda: train_examples)
+    assert textcat.model.get_dim("nO") == 2
+
+    for i in range(50):
+        losses = {}
+        nlp.update(train_examples, sgd=optimizer, losses=losses)
+    assert losses["textcat"] < 0.01
+
+    # test the trained model
+    test_text = "I am happy."
+    doc = nlp(test_text)
+    cats = doc.cats
+    assert cats["POSITIVE"] > 0.9
+
+    # Also test the results are still the same after IO
+    with make_tempdir() as tmp_dir:
+        nlp.to_disk(tmp_dir)
+        nlp2 = util.load_model_from_path(tmp_dir)
+        doc2 = nlp2(test_text)
+        cats2 = doc2.cats
+        assert cats2["POSITIVE"] > 0.9
+
+    # Test scoring
+    scores = nlp.evaluate(train_examples)
+    assert scores["cats_micro_f"] == 1.0
+    assert scores["cats_score"] == 1.0
+    assert "cats_score_desc" in scores
+
+    # Make sure that running pipe twice, or comparing to call, always amounts to the same predictions
+    texts = ["Just a sentence.", "I like green eggs.", "I am happy.", "I eat ham."]
+    batch_cats_1 = [doc.cats for doc in nlp.pipe(texts)]
+    batch_cats_2 = [doc.cats for doc in nlp.pipe(texts)]
+    no_batch_cats = [doc.cats for doc in [nlp(text) for text in texts]]
+    assert_equal(batch_cats_1, batch_cats_2)
+    assert_equal(batch_cats_1, no_batch_cats)
 
 
 # fmt: off