mirror of
				https://github.com/explosion/spaCy.git
				synced 2025-10-25 21:21:10 +03:00 
			
		
		
		
	Merge pull request #6249 from svlandeg/feature/batch-tests
This commit is contained in:
		
						commit
						bc85b12e6d
					
				|  | @ -1,4 +1,7 @@ | ||||||
| import pytest | import pytest | ||||||
|  | from numpy.testing import assert_equal | ||||||
|  | from spacy.attrs import ENT_IOB | ||||||
|  | 
 | ||||||
| from spacy import util | from spacy import util | ||||||
| from spacy.lang.en import English | from spacy.lang.en import English | ||||||
| from spacy.language import Language | from spacy.language import Language | ||||||
|  | @ -332,6 +335,19 @@ def test_overfitting_IO(): | ||||||
|         assert ents2[0].text == "London" |         assert ents2[0].text == "London" | ||||||
|         assert ents2[0].label_ == "LOC" |         assert ents2[0].label_ == "LOC" | ||||||
| 
 | 
 | ||||||
|  |     # Make sure that running pipe twice, or comparing to call, always amounts to the same predictions | ||||||
|  |     texts = [ | ||||||
|  |         "Just a sentence.", | ||||||
|  |         "Then one more sentence about London.", | ||||||
|  |         "Here is another one.", | ||||||
|  |         "I like London.", | ||||||
|  |     ] | ||||||
|  |     batch_deps_1 = [doc.to_array([ENT_IOB]) for doc in nlp.pipe(texts)] | ||||||
|  |     batch_deps_2 = [doc.to_array([ENT_IOB]) for doc in nlp.pipe(texts)] | ||||||
|  |     no_batch_deps = [doc.to_array([ENT_IOB]) for doc in [nlp(text) for text in texts]] | ||||||
|  |     assert_equal(batch_deps_1, batch_deps_2) | ||||||
|  |     assert_equal(batch_deps_1, no_batch_deps) | ||||||
|  | 
 | ||||||
| 
 | 
 | ||||||
| def test_ner_warns_no_lookups(caplog): | def test_ner_warns_no_lookups(caplog): | ||||||
|     nlp = English() |     nlp = English() | ||||||
|  |  | ||||||
|  | @ -1,4 +1,7 @@ | ||||||
| import pytest | import pytest | ||||||
|  | from numpy.testing import assert_equal | ||||||
|  | from spacy.attrs import DEP | ||||||
|  | 
 | ||||||
| from spacy.lang.en import English | from spacy.lang.en import English | ||||||
| from spacy.training import Example | from spacy.training import Example | ||||||
| from spacy.tokens import Doc | from spacy.tokens import Doc | ||||||
|  | @ -210,3 +213,16 @@ def test_overfitting_IO(): | ||||||
|         assert doc2[0].dep_ == "nsubj" |         assert doc2[0].dep_ == "nsubj" | ||||||
|         assert doc2[2].dep_ == "dobj" |         assert doc2[2].dep_ == "dobj" | ||||||
|         assert doc2[3].dep_ == "punct" |         assert doc2[3].dep_ == "punct" | ||||||
|  | 
 | ||||||
|  |     # Make sure that running pipe twice, or comparing to call, always amounts to the same predictions | ||||||
|  |     texts = [ | ||||||
|  |         "Just a sentence.", | ||||||
|  |         "Then one more sentence about London.", | ||||||
|  |         "Here is another one.", | ||||||
|  |         "I like London.", | ||||||
|  |     ] | ||||||
|  |     batch_deps_1 = [doc.to_array([DEP]) for doc in nlp.pipe(texts)] | ||||||
|  |     batch_deps_2 = [doc.to_array([DEP]) for doc in nlp.pipe(texts)] | ||||||
|  |     no_batch_deps = [doc.to_array([DEP]) for doc in [nlp(text) for text in texts]] | ||||||
|  |     assert_equal(batch_deps_1, batch_deps_2) | ||||||
|  |     assert_equal(batch_deps_1, no_batch_deps) | ||||||
|  |  | ||||||
|  | @ -1,5 +1,7 @@ | ||||||
| from typing import Callable, Iterable | from typing import Callable, Iterable | ||||||
| import pytest | import pytest | ||||||
|  | from numpy.testing import assert_equal | ||||||
|  | from spacy.attrs import ENT_KB_ID | ||||||
| 
 | 
 | ||||||
| from spacy.kb import KnowledgeBase, get_candidates, Candidate | from spacy.kb import KnowledgeBase, get_candidates, Candidate | ||||||
| from spacy.vocab import Vocab | from spacy.vocab import Vocab | ||||||
|  | @ -496,6 +498,19 @@ def test_overfitting_IO(): | ||||||
|                 predictions.append(ent.kb_id_) |                 predictions.append(ent.kb_id_) | ||||||
|         assert predictions == GOLD_entities |         assert predictions == GOLD_entities | ||||||
| 
 | 
 | ||||||
|  |     # Make sure that running pipe twice, or comparing to call, always amounts to the same predictions | ||||||
|  |     texts = [ | ||||||
|  |         "Russ Cochran captured his first major title with his son as caddie.", | ||||||
|  |         "Russ Cochran his reprints include EC Comics.", | ||||||
|  |         "Russ Cochran has been publishing comic art.", | ||||||
|  |         "Russ Cochran was a member of University of Kentucky's golf team.", | ||||||
|  |     ] | ||||||
|  |     batch_deps_1 = [doc.to_array([ENT_KB_ID]) for doc in nlp.pipe(texts)] | ||||||
|  |     batch_deps_2 = [doc.to_array([ENT_KB_ID]) for doc in nlp.pipe(texts)] | ||||||
|  |     no_batch_deps = [doc.to_array([ENT_KB_ID]) for doc in [nlp(text) for text in texts]] | ||||||
|  |     assert_equal(batch_deps_1, batch_deps_2) | ||||||
|  |     assert_equal(batch_deps_1, no_batch_deps) | ||||||
|  | 
 | ||||||
| 
 | 
 | ||||||
| def test_kb_serialization(): | def test_kb_serialization(): | ||||||
|     # Test that the KB can be used in a pipeline with a different vocab |     # Test that the KB can be used in a pipeline with a different vocab | ||||||
|  |  | ||||||
							
								
								
									
										107
									
								
								spacy/tests/pipeline/test_models.py
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										107
									
								
								spacy/tests/pipeline/test_models.py
									
									
									
									
									
										Normal file
									
								
							|  | @ -0,0 +1,107 @@ | ||||||
|  | from typing import List | ||||||
|  | 
 | ||||||
|  | import numpy | ||||||
|  | import pytest | ||||||
|  | from numpy.testing import assert_almost_equal | ||||||
|  | from spacy.vocab import Vocab | ||||||
|  | from thinc.api import NumpyOps, Model, data_validation | ||||||
|  | from thinc.types import Array2d, Ragged | ||||||
|  | 
 | ||||||
|  | from spacy.lang.en import English | ||||||
|  | from spacy.ml import FeatureExtractor, StaticVectors | ||||||
|  | from spacy.ml._character_embed import CharacterEmbed | ||||||
|  | from spacy.tokens import Doc | ||||||
|  | 
 | ||||||
|  | 
 | ||||||
# Backend used to allocate every fixture array in this module.
OPS = NumpyOps()

# Raw texts turned into Docs by get_docs(): 4 and 3 whitespace-separated words.
texts = ["These are 4 words", "Here just three"]
# One row of two values per word: 4 rows for the first text, 3 for the second.
l0 = [[1, 2], [3, 4], [5, 6], [7, 8]]
l1 = [[9, 8], [7, 6], [5, 4]]
# The same rows as one array per text, once with dtype "f" and once with "i".
list_floats = [OPS.xp.asarray(rows, dtype="f") for rows in (l0, l1)]
list_ints = [OPS.xp.asarray(rows, dtype="i") for rows in (l0, l1)]
# A single 2d float array built from the rows of the second text.
array = OPS.xp.asarray(l1, dtype="f")
# The same data wrapped as a Ragged with lengths [2, 1] (3 rows in total).
ragged = Ragged(array, OPS.xp.asarray([2, 1], dtype="i"))
|  | 
 | ||||||
|  | 
 | ||||||
def get_docs():
    """Build one Doc per entry of the module-level ``texts``.

    Every whitespace-separated word is added to a fresh Vocab together with a
    random vector of length 7 drawn uniformly from [-1, 1). The vectors are
    not seeded, so they differ between calls.

    RETURNS (List[Doc]): The docs, in the same order as ``texts``.
    """
    vocab = Vocab()
    all_words = (word for text in texts for word in text.split())
    for word in all_words:
        key = vocab.strings.add(word)
        vocab.set_vector(key, numpy.random.uniform(-1, 1, (7,)))
    return [English(vocab)(text) for text in texts]
|  | 
 | ||||||
|  | 
 | ||||||
# Test components with a model of type Model[List[Doc], List[Floats2d]]
@pytest.mark.parametrize("name", ["tagger", "tok2vec", "morphologizer", "senter"])
def test_components_batching_list(name):
    """Check that the component's model predicts the same per-doc arrays for a
    batch of docs as for each doc passed on its own."""
    component = English().create_pipe(name)
    model = component.model
    docs = get_docs()
    util_batch_unbatch_docs_list(model, docs, list_floats)
|  | 
 | ||||||
|  | 
 | ||||||
# Test components with a model of type Model[List[Doc], Floats2d]
@pytest.mark.parametrize("name", ["textcat"])
def test_components_batching_array(name):
    """Check that the component's model predicts the same rows for a batch of
    docs as for each doc passed on its own."""
    component = English().create_pipe(name)
    model = component.model
    docs = get_docs()
    util_batch_unbatch_docs_array(model, docs, array)
|  | 
 | ||||||
|  | 
 | ||||||
# Standalone layers, each paired with freshly built input docs and a sample
# output. The type of the sample output decides which batching check is run.
LAYERS = [
    (CharacterEmbed(nM=5, nC=3), get_docs(), list_floats),
    (FeatureExtractor([100, 200]), get_docs(), list_ints),
    (StaticVectors(), get_docs(), ragged),
]


@pytest.mark.parametrize("model,in_data,out_data", LAYERS)
def test_layers_batching_all(model, in_data, out_data):
    """Run the batched-vs-unbatched check that matches the layer's data types.

    model: The layer under test.
    in_data: Input for the layer; currently only a list of Docs is supported.
    out_data: Sample output: a 2d array, a list of 2d arrays, or a Ragged.

    An entry in LAYERS that matches none of the supported combinations fails
    the test instead of passing without having checked anything.
    """
    # In = List[Doc]
    if not (isinstance(in_data, list) and in_data and isinstance(in_data[0], Doc)):
        pytest.fail(
            "No batching check for input of type {}".format(type(in_data).__name__)
        )
    ndarray = OPS.xp.ndarray
    if isinstance(out_data, ndarray) and out_data.ndim == 2:
        util_batch_unbatch_docs_array(model, in_data, out_data)
    elif (
        isinstance(out_data, list)
        and out_data
        and isinstance(out_data[0], ndarray)
        and out_data[0].ndim == 2
    ):
        util_batch_unbatch_docs_list(model, in_data, out_data)
    elif isinstance(out_data, Ragged):
        util_batch_unbatch_docs_ragged(model, in_data, out_data)
    else:
        # Without this branch an unrecognised output type would be a silent pass.
        pytest.fail(
            "No batching check for output of type {}".format(type(out_data).__name__)
        )
|  | 
 | ||||||
|  | 
 | ||||||
def util_batch_unbatch_docs_list(
    model: Model[List[Doc], List[Array2d]], in_data: List[Doc], out_data: List[Array2d]
):
    """Assert that batched and one-at-a-time predictions agree, array by array.

    model: Model mapping a list of Docs to a list of 2d arrays.
    in_data: Docs used both to initialize the model and to predict on.
    out_data: Sample output passed to ``model.initialize``.

    Everything runs inside thinc's ``data_validation(True)`` context
    (presumably so type mismatches surface as errors; confirm in thinc docs).
    """
    with data_validation(True):
        model.initialize(in_data, out_data)
        Y_batched = model.predict(in_data)
        Y_not_batched = [model.predict([u])[0] for u in in_data]
        # Compare lengths first: otherwise a batched prediction that returns
        # fewer arrays than there are docs would go unnoticed.
        assert len(Y_batched) == len(Y_not_batched)
        for batched, single in zip(Y_batched, Y_not_batched):
            assert_almost_equal(batched, single, decimal=4)
|  | 
 | ||||||
|  | 
 | ||||||
def util_batch_unbatch_docs_array(
    model: Model[List[Doc], Array2d], in_data: List[Doc], out_data: Array2d
):
    """Assert that batched and one-at-a-time predictions agree, row by row.

    model: Model mapping a list of Docs to a single 2d array.
    in_data: Docs used both to initialize the model and to predict on.
    out_data: Sample output passed to ``model.initialize``.
    """
    with data_validation(True):
        model.initialize(in_data, out_data)
        batched_rows = model.predict(in_data).tolist()
        # Predict each doc on its own and keep the first row of each result.
        single_rows = []
        for doc in in_data:
            single_rows.append(model.predict([doc])[0])
        assert_almost_equal(batched_rows, single_rows, decimal=4)
|  | 
 | ||||||
|  | 
 | ||||||
def util_batch_unbatch_docs_ragged(
    model: Model[List[Doc], Ragged], in_data: List[Doc], out_data: Ragged
):
    """Assert that batched and one-at-a-time Ragged predictions hold equal data.

    model: Model mapping a list of Docs to a Ragged.
    in_data: Docs used both to initialize the model and to predict on.
    out_data: Sample output passed to ``model.initialize``.
    """
    with data_validation(True):
        model.initialize(in_data, out_data)
        batched = model.predict(in_data)
        # Concatenate the data rows of the single-doc predictions in doc order.
        per_doc_rows = [
            row for doc in in_data for row in model.predict([doc]).data.tolist()
        ]
        assert_almost_equal(batched.data, per_doc_rows, decimal=4)
|  | @ -1,4 +1,5 @@ | ||||||
| import pytest | import pytest | ||||||
|  | from numpy.testing import assert_equal | ||||||
| 
 | 
 | ||||||
| from spacy import util | from spacy import util | ||||||
| from spacy.training import Example | from spacy.training import Example | ||||||
|  | @ -6,6 +7,7 @@ from spacy.lang.en import English | ||||||
| from spacy.language import Language | from spacy.language import Language | ||||||
| from spacy.tests.util import make_tempdir | from spacy.tests.util import make_tempdir | ||||||
| from spacy.morphology import Morphology | from spacy.morphology import Morphology | ||||||
|  | from spacy.attrs import MORPH | ||||||
| 
 | 
 | ||||||
| 
 | 
 | ||||||
| def test_label_types(): | def test_label_types(): | ||||||
|  | @ -101,3 +103,16 @@ def test_overfitting_IO(): | ||||||
|         doc2 = nlp2(test_text) |         doc2 = nlp2(test_text) | ||||||
|         assert [str(t.morph) for t in doc2] == gold_morphs |         assert [str(t.morph) for t in doc2] == gold_morphs | ||||||
|         assert [t.pos_ for t in doc2] == gold_pos_tags |         assert [t.pos_ for t in doc2] == gold_pos_tags | ||||||
|  | 
 | ||||||
|  |     # Make sure that running pipe twice, or comparing to call, always amounts to the same predictions | ||||||
|  |     texts = [ | ||||||
|  |         "Just a sentence.", | ||||||
|  |         "Then one more sentence about London.", | ||||||
|  |         "Here is another one.", | ||||||
|  |         "I like London.", | ||||||
|  |     ] | ||||||
|  |     batch_deps_1 = [doc.to_array([MORPH]) for doc in nlp.pipe(texts)] | ||||||
|  |     batch_deps_2 = [doc.to_array([MORPH]) for doc in nlp.pipe(texts)] | ||||||
|  |     no_batch_deps = [doc.to_array([MORPH]) for doc in [nlp(text) for text in texts]] | ||||||
|  |     assert_equal(batch_deps_1, batch_deps_2) | ||||||
|  |     assert_equal(batch_deps_1, no_batch_deps) | ||||||
|  |  | ||||||
|  | @ -1,4 +1,6 @@ | ||||||
| import pytest | import pytest | ||||||
|  | from numpy.testing import assert_equal | ||||||
|  | from spacy.attrs import SENT_START | ||||||
| 
 | 
 | ||||||
| from spacy import util | from spacy import util | ||||||
| from spacy.training import Example | from spacy.training import Example | ||||||
|  | @ -80,3 +82,18 @@ def test_overfitting_IO(): | ||||||
|         nlp2 = util.load_model_from_path(tmp_dir) |         nlp2 = util.load_model_from_path(tmp_dir) | ||||||
|         doc2 = nlp2(test_text) |         doc2 = nlp2(test_text) | ||||||
|         assert [int(t.is_sent_start) for t in doc2] == gold_sent_starts |         assert [int(t.is_sent_start) for t in doc2] == gold_sent_starts | ||||||
|  | 
 | ||||||
|  |     # Make sure that running pipe twice, or comparing to call, always amounts to the same predictions | ||||||
|  |     texts = [ | ||||||
|  |         "Just a sentence.", | ||||||
|  |         "Then one more sentence about London.", | ||||||
|  |         "Here is another one.", | ||||||
|  |         "I like London.", | ||||||
|  |     ] | ||||||
|  |     batch_deps_1 = [doc.to_array([SENT_START]) for doc in nlp.pipe(texts)] | ||||||
|  |     batch_deps_2 = [doc.to_array([SENT_START]) for doc in nlp.pipe(texts)] | ||||||
|  |     no_batch_deps = [ | ||||||
|  |         doc.to_array([SENT_START]) for doc in [nlp(text) for text in texts] | ||||||
|  |     ] | ||||||
|  |     assert_equal(batch_deps_1, batch_deps_2) | ||||||
|  |     assert_equal(batch_deps_1, no_batch_deps) | ||||||
|  |  | ||||||
|  | @ -1,4 +1,7 @@ | ||||||
| import pytest | import pytest | ||||||
|  | from numpy.testing import assert_equal | ||||||
|  | from spacy.attrs import TAG | ||||||
|  | 
 | ||||||
| from spacy import util | from spacy import util | ||||||
| from spacy.training import Example | from spacy.training import Example | ||||||
| from spacy.lang.en import English | from spacy.lang.en import English | ||||||
|  | @ -117,6 +120,19 @@ def test_overfitting_IO(): | ||||||
|         assert doc2[2].tag_ is "J" |         assert doc2[2].tag_ is "J" | ||||||
|         assert doc2[3].tag_ is "N" |         assert doc2[3].tag_ is "N" | ||||||
| 
 | 
 | ||||||
|  |     # Make sure that running pipe twice, or comparing to call, always amounts to the same predictions | ||||||
|  |     texts = [ | ||||||
|  |         "Just a sentence.", | ||||||
|  |         "I like green eggs.", | ||||||
|  |         "Here is another one.", | ||||||
|  |         "I eat ham.", | ||||||
|  |     ] | ||||||
|  |     batch_deps_1 = [doc.to_array([TAG]) for doc in nlp.pipe(texts)] | ||||||
|  |     batch_deps_2 = [doc.to_array([TAG]) for doc in nlp.pipe(texts)] | ||||||
|  |     no_batch_deps = [doc.to_array([TAG]) for doc in [nlp(text) for text in texts]] | ||||||
|  |     assert_equal(batch_deps_1, batch_deps_2) | ||||||
|  |     assert_equal(batch_deps_1, no_batch_deps) | ||||||
|  | 
 | ||||||
| 
 | 
 | ||||||
| def test_tagger_requires_labels(): | def test_tagger_requires_labels(): | ||||||
|     nlp = English() |     nlp = English() | ||||||
|  |  | ||||||
|  | @ -1,6 +1,7 @@ | ||||||
| import pytest | import pytest | ||||||
| import random | import random | ||||||
| import numpy.random | import numpy.random | ||||||
|  | from numpy.testing import assert_equal | ||||||
| from thinc.api import fix_random_seed | from thinc.api import fix_random_seed | ||||||
| from spacy import util | from spacy import util | ||||||
| from spacy.lang.en import English | from spacy.lang.en import English | ||||||
|  | @ -174,6 +175,14 @@ def test_overfitting_IO(): | ||||||
|     assert scores["cats_score"] == 1.0 |     assert scores["cats_score"] == 1.0 | ||||||
|     assert "cats_score_desc" in scores |     assert "cats_score_desc" in scores | ||||||
| 
 | 
 | ||||||
|  |     # Make sure that running pipe twice, or comparing to call, always amounts to the same predictions | ||||||
|  |     texts = ["Just a sentence.", "I like green eggs.", "I am happy.", "I eat ham."] | ||||||
|  |     batch_deps_1 = [doc.cats for doc in nlp.pipe(texts)] | ||||||
|  |     batch_deps_2 = [doc.cats for doc in nlp.pipe(texts)] | ||||||
|  |     no_batch_deps = [doc.cats for doc in [nlp(text) for text in texts]] | ||||||
|  |     assert_equal(batch_deps_1, batch_deps_2) | ||||||
|  |     assert_equal(batch_deps_1, no_batch_deps) | ||||||
|  | 
 | ||||||
| 
 | 
 | ||||||
| # fmt: off | # fmt: off | ||||||
| @pytest.mark.parametrize( | @pytest.mark.parametrize( | ||||||
|  |  | ||||||
		Loading…
	
		Reference in New Issue
	
	Block a user