From c23041ae6006fdcf9d942df928a5b08b0ae1c781 Mon Sep 17 00:00:00 2001 From: svlandeg Date: Tue, 13 Oct 2020 16:26:53 +0200 Subject: [PATCH 1/6] component tests single or multiple prediction --- spacy/tests/pipeline/test_models.py | 46 +++++++++++++++++++++++++++++ 1 file changed, 46 insertions(+) create mode 100644 spacy/tests/pipeline/test_models.py diff --git a/spacy/tests/pipeline/test_models.py b/spacy/tests/pipeline/test_models.py new file mode 100644 index 000000000..d1c877953 --- /dev/null +++ b/spacy/tests/pipeline/test_models.py @@ -0,0 +1,46 @@ +from typing import List +import pytest +from numpy.testing import assert_equal +from thinc.api import get_current_ops, Model, data_validation +from thinc.types import Array2d + +from spacy.lang.en import English +from spacy.tokens import Doc + +OPS = get_current_ops() + +texts = ["These are 4 words", "These just three"] +l0 = [[1, 2], [3, 4], [5, 6], [7, 8]] +l1 = [[9, 8], [7, 6], [5, 4]] +out_list = [OPS.xp.asarray(l0, dtype="f"), OPS.xp.asarray(l1, dtype="f")] +a1 = OPS.xp.asarray(l1, dtype="f") + +# Test components with a model of type Model[List[Doc], List[Floats2d]] +@pytest.mark.parametrize("name", ["tagger", "tok2vec", "morphologizer", "senter"]) +def test_layers_batching_all_list(name): + nlp = English() + in_data = [nlp(text) for text in texts] + proc = nlp.create_pipe(name) + util_batch_unbatch_List(proc.model, in_data, out_list) + +def util_batch_unbatch_List(model: Model[List[Doc], List[Array2d]], in_data: List[Doc], out_data: List[Array2d]): + with data_validation(True): + model.initialize(in_data, out_data) + Y_batched = model.predict(in_data) + Y_not_batched = [model.predict([u])[0] for u in in_data] + assert_equal(Y_batched, Y_not_batched) + +# Test components with a model of type Model[List[Doc], Floats2d] +@pytest.mark.parametrize("name", ["textcat"]) +def test_layers_batching_all_array(name): + nlp = English() + in_data = [nlp(text) for text in texts] + proc = nlp.create_pipe(name) + util_batch_unbatch_Array(proc.model, in_data, a1) + +def util_batch_unbatch_Array(model: Model[List[Doc], Array2d], in_data: List[Doc], out_data: Array2d): + with data_validation(True): + model.initialize(in_data, out_data) + Y_batched = model.predict(in_data) + Y_not_batched = [model.predict([u])[0] for u in in_data] + assert_equal(Y_batched, Y_not_batched) \ No newline at end of file From 6ccacff54e4c279c0b37652119bd507ee466a5df Mon Sep 17 00:00:00 2001 From: svlandeg Date: Tue, 13 Oct 2020 18:50:07 +0200 Subject: [PATCH 2/6] add tests for individual spacy layers --- spacy/tests/pipeline/test_models.py | 98 +++++++++++++++++++++++------ 1 file changed, 80 insertions(+), 18 deletions(-) diff --git a/spacy/tests/pipeline/test_models.py b/spacy/tests/pipeline/test_models.py index d1c877953..12de9d23e 100644 --- a/spacy/tests/pipeline/test_models.py +++ b/spacy/tests/pipeline/test_models.py @@ -1,46 +1,108 @@ from typing import List + +import numpy import pytest -from numpy.testing import assert_equal +from numpy.testing import assert_almost_equal +from spacy.vocab import Vocab from thinc.api import get_current_ops, Model, data_validation -from thinc.types import Array2d +from thinc.types import Array2d, Ragged from spacy.lang.en import English +from spacy.ml import FeatureExtractor, StaticVectors +from spacy.ml._character_embed import CharacterEmbed from spacy.tokens import Doc OPS = get_current_ops() -texts = ["These are 4 words", "These just three"] +texts = ["These are 4 words", "Here just three"] l0 = [[1, 2], [3, 4], [5, 6], [7, 8]] l1 = [[9, 8], [7, 6], [5, 4]] -out_list = [OPS.xp.asarray(l0, dtype="f"), OPS.xp.asarray(l1, dtype="f")] -a1 = OPS.xp.asarray(l1, dtype="f") +list_floats = [OPS.xp.asarray(l0, dtype="f"), OPS.xp.asarray(l1, dtype="f")] +list_ints = [OPS.xp.asarray(l0, dtype="i"), OPS.xp.asarray(l1, dtype="i")] +array = OPS.xp.asarray(l1, dtype="f") +ragged = Ragged(array, OPS.xp.asarray([2, 1], dtype="i")) + + +def get_docs(): + vocab = Vocab() + for t in texts: + for word in t.split(): + hash_id = vocab.strings.add(word) + vector = numpy.random.uniform(-1, 1, (7,)) + vocab.set_vector(hash_id, vector) + docs = [English(vocab)(t) for t in texts] + return docs + # Test components with a model of type Model[List[Doc], List[Floats2d]] @pytest.mark.parametrize("name", ["tagger", "tok2vec", "morphologizer", "senter"]) -def test_layers_batching_all_list(name): +def test_components_batching_list(name): nlp = English() - in_data = [nlp(text) for text in texts] proc = nlp.create_pipe(name) - util_batch_unbatch_List(proc.model, in_data, out_list) + util_batch_unbatch_List(proc.model, get_docs(), list_floats) -def util_batch_unbatch_List(model: Model[List[Doc], List[Array2d]], in_data: List[Doc], out_data: List[Array2d]): - with data_validation(True): - model.initialize(in_data, out_data) - Y_batched = model.predict(in_data) - Y_not_batched = [model.predict([u])[0] for u in in_data] - assert_equal(Y_batched, Y_not_batched) # Test components with a model of type Model[List[Doc], Floats2d] @pytest.mark.parametrize("name", ["textcat"]) -def test_layers_batching_all_array(name): +def test_components_batching_array(name): nlp = English() in_data = [nlp(text) for text in texts] proc = nlp.create_pipe(name) - util_batch_unbatch_Array(proc.model, in_data, a1) + util_batch_unbatch_Array(proc.model, get_docs(), array) -def util_batch_unbatch_Array(model: Model[List[Doc], Array2d], in_data: List[Doc], out_data: Array2d): + +LAYERS = [ + (CharacterEmbed(nM=5, nC=3), get_docs(), list_floats), + (FeatureExtractor([100, 200]), get_docs(), list_ints), + (StaticVectors(), get_docs(), ragged), +] + + +@pytest.mark.parametrize("model,in_data,out_data", LAYERS) +def test_layers_batching_all(model, in_data, out_data): + # In = List[Doc] + if isinstance(in_data, list) and isinstance(in_data[0], Doc): + if isinstance(out_data, OPS.xp.ndarray) and out_data.ndim == 2: + util_batch_unbatch_Array(model, in_data, out_data) + elif ( + isinstance(out_data, list) + and isinstance(out_data[0], OPS.xp.ndarray) + and out_data[0].ndim == 2 + ): + util_batch_unbatch_List(model, in_data, out_data) + elif isinstance(out_data, Ragged): + util_batch_unbatch_Ragged(model, in_data, out_data) + + + +def util_batch_unbatch_List( + model: Model[List[Doc], List[Array2d]], in_data: List[Doc], out_data: List[Array2d] +): with data_validation(True): model.initialize(in_data, out_data) Y_batched = model.predict(in_data) Y_not_batched = [model.predict([u])[0] for u in in_data] - assert_equal(Y_batched, Y_not_batched) \ No newline at end of file + for i in range(len(Y_batched)): + assert_almost_equal(Y_batched[i], Y_not_batched[i], decimal=4) + + +def util_batch_unbatch_Array( + model: Model[List[Doc], Array2d], in_data: List[Doc], out_data: Array2d +): + with data_validation(True): + model.initialize(in_data, out_data) + Y_batched = model.predict(in_data).tolist() + Y_not_batched = [model.predict([u])[0] for u in in_data] + assert_almost_equal(Y_batched, Y_not_batched, decimal=4) + + +def util_batch_unbatch_Ragged( + model: Model[List[Doc], Ragged], in_data: List[Doc], out_data: Ragged +): + with data_validation(True): + model.initialize(in_data, out_data) + Y_batched = model.predict(in_data) + Y_not_batched = [] + for u in in_data: + Y_not_batched.extend(model.predict([u]).data.tolist()) + assert_almost_equal(Y_batched.data, Y_not_batched, decimal=4) From ff83bfae3f8bcf7c401af61bd04f4d8d0e6936a8 Mon Sep 17 00:00:00 2001 From: svlandeg Date: Tue, 13 Oct 2020 18:52:37 +0200 Subject: [PATCH 3/6] naming --- spacy/tests/pipeline/test_models.py | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/spacy/tests/pipeline/test_models.py b/spacy/tests/pipeline/test_models.py index 12de9d23e..b3982e714 100644 --- a/spacy/tests/pipeline/test_models.py +++ b/spacy/tests/pipeline/test_models.py @@ -39,7 +39,7 @@ def get_docs(): def test_components_batching_list(name): nlp = English() proc = nlp.create_pipe(name) - util_batch_unbatch_List(proc.model, get_docs(), list_floats) + util_batch_unbatch_docs_list(proc.model, get_docs(), list_floats) # Test components with a model of type Model[List[Doc], Floats2d] @@ -48,7 +48,7 @@ def test_components_batching_array(name): nlp = English() in_data = [nlp(text) for text in texts] proc = nlp.create_pipe(name) - util_batch_unbatch_Array(proc.model, get_docs(), array) + util_batch_unbatch_docs_array(proc.model, get_docs(), array) LAYERS = [ @@ -63,19 +63,19 @@ def test_layers_batching_all(model, in_data, out_data): # In = List[Doc] if isinstance(in_data, list) and isinstance(in_data[0], Doc): if isinstance(out_data, OPS.xp.ndarray) and out_data.ndim == 2: - util_batch_unbatch_Array(model, in_data, out_data) + util_batch_unbatch_docs_array(model, in_data, out_data) elif ( isinstance(out_data, list) and isinstance(out_data[0], OPS.xp.ndarray) and out_data[0].ndim == 2 ): - util_batch_unbatch_List(model, in_data, out_data) + util_batch_unbatch_docs_list(model, in_data, out_data) elif isinstance(out_data, Ragged): - util_batch_unbatch_Ragged(model, in_data, out_data) + util_batch_unbatch_docs_ragged(model, in_data, out_data) -def util_batch_unbatch_List( +def util_batch_unbatch_docs_list( model: Model[List[Doc], List[Array2d]], in_data: List[Doc], out_data: List[Array2d] ): with data_validation(True): @@ -86,7 +86,7 @@ def util_batch_unbatch_List( assert_almost_equal(Y_batched[i], Y_not_batched[i], decimal=4) -def util_batch_unbatch_Array( +def util_batch_unbatch_docs_array( model: Model[List[Doc], Array2d], in_data: List[Doc], out_data: Array2d ): with data_validation(True): @@ -96,7 +96,7 @@ def util_batch_unbatch_Array( assert_almost_equal(Y_batched, Y_not_batched, decimal=4) -def util_batch_unbatch_Ragged( +def util_batch_unbatch_docs_ragged( model: Model[List[Doc], Ragged], in_data: List[Doc], out_data: Ragged ): with data_validation(True): From ede979d42fa23e79f37af33dd725b03756af5447 Mon Sep 17 00:00:00 2001 From: svlandeg Date: Tue, 13 Oct 2020 18:53:17 +0200 Subject: [PATCH 4/6] formattting --- spacy/tests/pipeline/test_models.py | 1 - 1 file changed, 1 deletion(-) diff --git a/spacy/tests/pipeline/test_models.py b/spacy/tests/pipeline/test_models.py index b3982e714..0d1309cd8 100644 --- a/spacy/tests/pipeline/test_models.py +++ b/spacy/tests/pipeline/test_models.py @@ -74,7 +74,6 @@ def test_layers_batching_all(model, in_data, out_data): util_batch_unbatch_docs_ragged(model, in_data, out_data) - def util_batch_unbatch_docs_list( model: Model[List[Doc], List[Array2d]], in_data: List[Doc], out_data: List[Array2d] ): From e94a21638e27aba51ad38660c6136becb9b4466f Mon Sep 17 00:00:00 2001 From: svlandeg Date: Tue, 13 Oct 2020 21:07:13 +0200 Subject: [PATCH 5/6] adding tests for trained models to ensure predict reproducibility --- spacy/tests/parser/test_ner.py | 16 ++++++++++++++++ spacy/tests/parser/test_parse.py | 16 ++++++++++++++++ spacy/tests/pipeline/test_entity_linker.py | 15 +++++++++++++++ spacy/tests/pipeline/test_models.py | 1 - spacy/tests/pipeline/test_morphologizer.py | 15 +++++++++++++++ spacy/tests/pipeline/test_senter.py | 17 +++++++++++++++++ spacy/tests/pipeline/test_tagger.py | 16 ++++++++++++++++ spacy/tests/pipeline/test_textcat.py | 9 +++++++++ 8 files changed, 104 insertions(+), 1 deletion(-) diff --git a/spacy/tests/parser/test_ner.py b/spacy/tests/parser/test_ner.py index b657ae2e8..b4c22b48d 100644 --- a/spacy/tests/parser/test_ner.py +++ b/spacy/tests/parser/test_ner.py @@ -1,4 +1,7 @@ import pytest +from numpy.testing import assert_equal +from spacy.attrs import ENT_IOB + from spacy import util from spacy.lang.en import English from spacy.language import Language @@ -332,6 +335,19 @@ def test_overfitting_IO(): assert ents2[0].text == "London" assert ents2[0].label_ == "LOC" + # Make sure that running pipe twice, or comparing to call, always amounts to the same predictions + texts = [ + "Just a sentence.", + "Then one more sentence about London.", + "Here is another one.", + "I like London.", + ] + batch_deps_1 = [doc.to_array([ENT_IOB]) for doc in nlp.pipe(texts)] + batch_deps_2 = [doc.to_array([ENT_IOB]) for doc in nlp.pipe(texts)] + no_batch_deps = [doc.to_array([ENT_IOB]) for doc in [nlp(text) for text in texts]] + assert_equal(batch_deps_1, batch_deps_2) + assert_equal(batch_deps_1, no_batch_deps) + def test_ner_warns_no_lookups(caplog): nlp = English() diff --git a/spacy/tests/parser/test_parse.py b/spacy/tests/parser/test_parse.py index ffb6f23f1..a914eb17a 100644 --- a/spacy/tests/parser/test_parse.py +++ b/spacy/tests/parser/test_parse.py @@ -1,4 +1,7 @@ import pytest +from numpy.testing import assert_equal +from spacy.attrs import DEP + from spacy.lang.en import English from spacy.training import Example from spacy.tokens import Doc @@ -210,3 +213,16 @@ def test_overfitting_IO(): assert doc2[0].dep_ == "nsubj" assert doc2[2].dep_ == "dobj" assert doc2[3].dep_ == "punct" + + # Make sure that running pipe twice, or comparing to call, always amounts to the same predictions + texts = [ + "Just a sentence.", + "Then one more sentence about London.", + "Here is another one.", + "I like London.", + ] + batch_deps_1 = [doc.to_array([DEP]) for doc in nlp.pipe(texts)] + batch_deps_2 = [doc.to_array([DEP]) for doc in nlp.pipe(texts)] + no_batch_deps = [doc.to_array([DEP]) for doc in [nlp(text) for text in texts]] + assert_equal(batch_deps_1, batch_deps_2) + assert_equal(batch_deps_1, no_batch_deps) diff --git a/spacy/tests/pipeline/test_entity_linker.py b/spacy/tests/pipeline/test_entity_linker.py index f2e6defcb..8ba2d0d3e 100644 --- a/spacy/tests/pipeline/test_entity_linker.py +++ b/spacy/tests/pipeline/test_entity_linker.py @@ -1,5 +1,7 @@ from typing import Callable, Iterable import pytest +from numpy.testing import assert_equal +from spacy.attrs import ENT_KB_ID from spacy.kb import KnowledgeBase, get_candidates, Candidate from spacy.vocab import Vocab @@ -496,6 +498,19 @@ def test_overfitting_IO(): predictions.append(ent.kb_id_) assert predictions == GOLD_entities + # Make sure that running pipe twice, or comparing to call, always amounts to the same predictions + texts = [ + "Russ Cochran captured his first major title with his son as caddie.", + "Russ Cochran his reprints include EC Comics.", + "Russ Cochran has been publishing comic art.", + "Russ Cochran was a member of University of Kentucky's golf team.", + ] + batch_deps_1 = [doc.to_array([ENT_KB_ID]) for doc in nlp.pipe(texts)] + batch_deps_2 = [doc.to_array([ENT_KB_ID]) for doc in nlp.pipe(texts)] + no_batch_deps = [doc.to_array([ENT_KB_ID]) for doc in [nlp(text) for text in texts]] + assert_equal(batch_deps_1, batch_deps_2) + assert_equal(batch_deps_1, no_batch_deps) + def test_kb_serialization(): # Test that the KB can be used in a pipeline with a different vocab diff --git a/spacy/tests/pipeline/test_models.py b/spacy/tests/pipeline/test_models.py index 0d1309cd8..1ab5f7ea5 100644 --- a/spacy/tests/pipeline/test_models.py +++ b/spacy/tests/pipeline/test_models.py @@ -46,7 +46,6 @@ def test_components_batching_list(name): @pytest.mark.parametrize("name", ["textcat"]) def test_components_batching_array(name): nlp = English() - in_data = [nlp(text) for text in texts] proc = nlp.create_pipe(name) util_batch_unbatch_docs_array(proc.model, get_docs(), array) diff --git a/spacy/tests/pipeline/test_morphologizer.py b/spacy/tests/pipeline/test_morphologizer.py index fd7aa05be..85d1d6c8b 100644 --- a/spacy/tests/pipeline/test_morphologizer.py +++ b/spacy/tests/pipeline/test_morphologizer.py @@ -1,4 +1,5 @@ import pytest +from numpy.testing import assert_equal from spacy import util from spacy.training import Example @@ -6,6 +7,7 @@ from spacy.lang.en import English from spacy.language import Language from spacy.tests.util import make_tempdir from spacy.morphology import Morphology +from spacy.attrs import MORPH def test_label_types(): @@ -101,3 +103,16 @@ def test_overfitting_IO(): doc2 = nlp2(test_text) assert [str(t.morph) for t in doc2] == gold_morphs assert [t.pos_ for t in doc2] == gold_pos_tags + + # Make sure that running pipe twice, or comparing to call, always amounts to the same predictions + texts = [ + "Just a sentence.", + "Then one more sentence about London.", + "Here is another one.", + "I like London.", + ] + batch_deps_1 = [doc.to_array([MORPH]) for doc in nlp.pipe(texts)] + batch_deps_2 = [doc.to_array([MORPH]) for doc in nlp.pipe(texts)] + no_batch_deps = [doc.to_array([MORPH]) for doc in [nlp(text) for text in texts]] + assert_equal(batch_deps_1, batch_deps_2) + assert_equal(batch_deps_1, no_batch_deps) diff --git a/spacy/tests/pipeline/test_senter.py b/spacy/tests/pipeline/test_senter.py index c9722e5de..7a256f79b 100644 --- a/spacy/tests/pipeline/test_senter.py +++ b/spacy/tests/pipeline/test_senter.py @@ -1,4 +1,6 @@ import pytest +from numpy.testing import assert_equal +from spacy.attrs import SENT_START from spacy import util from spacy.training import Example @@ -80,3 +82,18 @@ def test_overfitting_IO(): nlp2 = util.load_model_from_path(tmp_dir) doc2 = nlp2(test_text) assert [int(t.is_sent_start) for t in doc2] == gold_sent_starts + + # Make sure that running pipe twice, or comparing to call, always amounts to the same predictions + texts = [ + "Just a sentence.", + "Then one more sentence about London.", + "Here is another one.", + "I like London.", + ] + batch_deps_1 = [doc.to_array([SENT_START]) for doc in nlp.pipe(texts)] + batch_deps_2 = [doc.to_array([SENT_START]) for doc in nlp.pipe(texts)] + no_batch_deps = [ + doc.to_array([SENT_START]) for doc in [nlp(text) for text in texts] + ] + assert_equal(batch_deps_1, batch_deps_2) + assert_equal(batch_deps_1, no_batch_deps) diff --git a/spacy/tests/pipeline/test_tagger.py b/spacy/tests/pipeline/test_tagger.py index b9db76cdf..885bdbce1 100644 --- a/spacy/tests/pipeline/test_tagger.py +++ b/spacy/tests/pipeline/test_tagger.py @@ -1,4 +1,7 @@ import pytest +from numpy.testing import assert_equal +from spacy.attrs import TAG + from spacy import util from spacy.training import Example from spacy.lang.en import English @@ -117,6 +120,19 @@ def test_overfitting_IO(): assert doc2[2].tag_ is "J" assert doc2[3].tag_ is "N" + # Make sure that running pipe twice, or comparing to call, always amounts to the same predictions + texts = [ + "Just a sentence.", + "I like green eggs.", + "Here is another one.", + "I eat ham.", + ] + batch_deps_1 = [doc.to_array([TAG]) for doc in nlp.pipe(texts)] + batch_deps_2 = [doc.to_array([TAG]) for doc in nlp.pipe(texts)] + no_batch_deps = [doc.to_array([TAG]) for doc in [nlp(text) for text in texts]] + assert_equal(batch_deps_1, batch_deps_2) + assert_equal(batch_deps_1, no_batch_deps) + def test_tagger_requires_labels(): nlp = English() diff --git a/spacy/tests/pipeline/test_textcat.py b/spacy/tests/pipeline/test_textcat.py index dd2f1070b..91348b1b3 100644 --- a/spacy/tests/pipeline/test_textcat.py +++ b/spacy/tests/pipeline/test_textcat.py @@ -1,6 +1,7 @@ import pytest import random import numpy.random +from numpy.testing import assert_equal from thinc.api import fix_random_seed from spacy import util from spacy.lang.en import English @@ -174,6 +175,14 @@ def test_overfitting_IO(): assert scores["cats_score"] == 1.0 assert "cats_score_desc" in scores + # Make sure that running pipe twice, or comparing to call, always amounts to the same predictions + texts = ["Just a sentence.", "I like green eggs.", "I am happy.", "I eat ham."] + batch_deps_1 = [doc.cats for doc in nlp.pipe(texts)] + batch_deps_2 = [doc.cats for doc in nlp.pipe(texts)] + no_batch_deps = [doc.cats for doc in [nlp(text) for text in texts]] + assert_equal(batch_deps_1, batch_deps_2) + assert_equal(batch_deps_1, no_batch_deps) + # fmt: off @pytest.mark.parametrize( From 0796401c1955fc3508b2d1f50b402b492fa690b2 Mon Sep 17 00:00:00 2001 From: svlandeg Date: Wed, 14 Oct 2020 16:55:00 +0200 Subject: [PATCH 6/6] call NumpyOps instead of get_current_ops() --- spacy/tests/pipeline/test_models.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/spacy/tests/pipeline/test_models.py b/spacy/tests/pipeline/test_models.py index 1ab5f7ea5..d04ac9cd4 100644 --- a/spacy/tests/pipeline/test_models.py +++ b/spacy/tests/pipeline/test_models.py @@ -4,7 +4,7 @@ import numpy import pytest from numpy.testing import assert_almost_equal from spacy.vocab import Vocab -from thinc.api import get_current_ops, Model, data_validation +from thinc.api import NumpyOps, Model, data_validation from thinc.types import Array2d, Ragged from spacy.lang.en import English @@ -12,7 +12,8 @@ from spacy.ml import FeatureExtractor, StaticVectors from spacy.ml._character_embed import CharacterEmbed from spacy.tokens import Doc -OPS = get_current_ops() + +OPS = NumpyOps() texts = ["These are 4 words", "Here just three"] l0 = [[1, 2], [3, 4], [5, 6], [7, 8]]