mirror of
https://github.com/explosion/spaCy.git
synced 2025-01-25 00:34:20 +03:00
Merge pull request #6249 from svlandeg/feature/batch-tests
This commit is contained in:
commit
bc85b12e6d
|
@ -1,4 +1,7 @@
|
|||
import pytest
|
||||
from numpy.testing import assert_equal
|
||||
from spacy.attrs import ENT_IOB
|
||||
|
||||
from spacy import util
|
||||
from spacy.lang.en import English
|
||||
from spacy.language import Language
|
||||
|
@ -332,6 +335,19 @@ def test_overfitting_IO():
|
|||
assert ents2[0].text == "London"
|
||||
assert ents2[0].label_ == "LOC"
|
||||
|
||||
# Make sure that running pipe twice, or comparing to call, always amounts to the same predictions
|
||||
texts = [
|
||||
"Just a sentence.",
|
||||
"Then one more sentence about London.",
|
||||
"Here is another one.",
|
||||
"I like London.",
|
||||
]
|
||||
batch_deps_1 = [doc.to_array([ENT_IOB]) for doc in nlp.pipe(texts)]
|
||||
batch_deps_2 = [doc.to_array([ENT_IOB]) for doc in nlp.pipe(texts)]
|
||||
no_batch_deps = [doc.to_array([ENT_IOB]) for doc in [nlp(text) for text in texts]]
|
||||
assert_equal(batch_deps_1, batch_deps_2)
|
||||
assert_equal(batch_deps_1, no_batch_deps)
|
||||
|
||||
|
||||
def test_ner_warns_no_lookups(caplog):
|
||||
nlp = English()
|
||||
|
|
|
@ -1,4 +1,7 @@
|
|||
import pytest
|
||||
from numpy.testing import assert_equal
|
||||
from spacy.attrs import DEP
|
||||
|
||||
from spacy.lang.en import English
|
||||
from spacy.training import Example
|
||||
from spacy.tokens import Doc
|
||||
|
@ -210,3 +213,16 @@ def test_overfitting_IO():
|
|||
assert doc2[0].dep_ == "nsubj"
|
||||
assert doc2[2].dep_ == "dobj"
|
||||
assert doc2[3].dep_ == "punct"
|
||||
|
||||
# Make sure that running pipe twice, or comparing to call, always amounts to the same predictions
|
||||
texts = [
|
||||
"Just a sentence.",
|
||||
"Then one more sentence about London.",
|
||||
"Here is another one.",
|
||||
"I like London.",
|
||||
]
|
||||
batch_deps_1 = [doc.to_array([DEP]) for doc in nlp.pipe(texts)]
|
||||
batch_deps_2 = [doc.to_array([DEP]) for doc in nlp.pipe(texts)]
|
||||
no_batch_deps = [doc.to_array([DEP]) for doc in [nlp(text) for text in texts]]
|
||||
assert_equal(batch_deps_1, batch_deps_2)
|
||||
assert_equal(batch_deps_1, no_batch_deps)
|
||||
|
|
|
@ -1,5 +1,7 @@
|
|||
from typing import Callable, Iterable
|
||||
import pytest
|
||||
from numpy.testing import assert_equal
|
||||
from spacy.attrs import ENT_KB_ID
|
||||
|
||||
from spacy.kb import KnowledgeBase, get_candidates, Candidate
|
||||
from spacy.vocab import Vocab
|
||||
|
@ -496,6 +498,19 @@ def test_overfitting_IO():
|
|||
predictions.append(ent.kb_id_)
|
||||
assert predictions == GOLD_entities
|
||||
|
||||
# Make sure that running pipe twice, or comparing to call, always amounts to the same predictions
|
||||
texts = [
|
||||
"Russ Cochran captured his first major title with his son as caddie.",
|
||||
"Russ Cochran his reprints include EC Comics.",
|
||||
"Russ Cochran has been publishing comic art.",
|
||||
"Russ Cochran was a member of University of Kentucky's golf team.",
|
||||
]
|
||||
batch_deps_1 = [doc.to_array([ENT_KB_ID]) for doc in nlp.pipe(texts)]
|
||||
batch_deps_2 = [doc.to_array([ENT_KB_ID]) for doc in nlp.pipe(texts)]
|
||||
no_batch_deps = [doc.to_array([ENT_KB_ID]) for doc in [nlp(text) for text in texts]]
|
||||
assert_equal(batch_deps_1, batch_deps_2)
|
||||
assert_equal(batch_deps_1, no_batch_deps)
|
||||
|
||||
|
||||
def test_kb_serialization():
|
||||
# Test that the KB can be used in a pipeline with a different vocab
|
||||
|
|
107
spacy/tests/pipeline/test_models.py
Normal file
107
spacy/tests/pipeline/test_models.py
Normal file
|
@ -0,0 +1,107 @@
|
|||
from typing import List
|
||||
|
||||
import numpy
|
||||
import pytest
|
||||
from numpy.testing import assert_almost_equal
|
||||
from spacy.vocab import Vocab
|
||||
from thinc.api import NumpyOps, Model, data_validation
|
||||
from thinc.types import Array2d, Ragged
|
||||
|
||||
from spacy.lang.en import English
|
||||
from spacy.ml import FeatureExtractor, StaticVectors
|
||||
from spacy.ml._character_embed import CharacterEmbed
|
||||
from spacy.tokens import Doc
|
||||
|
||||
|
||||
# Backend whose array module (``OPS.xp``) is used to build every fixture below.
OPS = NumpyOps()

# Two sample texts: four and three whitespace-separated words respectively.
texts = ["These are 4 words", "Here just three"]
# Row counts (4 and 3) line up with the word counts of the two texts.
l0 = [[1, 2], [3, 4], [5, 6], [7, 8]]
l1 = [[9, 8], [7, 6], [5, 4]]
# The same nested lists as float ("f") and as integer ("i") arrays.
list_floats = [OPS.xp.asarray(rows, dtype="f") for rows in (l0, l1)]
list_ints = [OPS.xp.asarray(rows, dtype="i") for rows in (l0, l1)]
# A single float array built from l1, and a Ragged wrapping it with lengths [2, 1].
array = OPS.xp.asarray(l1, dtype="f")
ragged = Ragged(array, OPS.xp.asarray([2, 1], dtype="i"))
|
||||
|
||||
|
||||
def get_docs():
    """Return one Doc per entry of ``texts``, sharing a vocab with random vectors.

    Each whitespace-separated word of each text is added to a fresh ``Vocab``
    and assigned a 7-element vector drawn with ``numpy.random.uniform(-1, 1)``.
    Every text is then processed by an ``English`` object constructed on that
    vocab.
    """
    vocab = Vocab()
    all_words = (word for text in texts for word in text.split())
    for word in all_words:
        key = vocab.strings.add(word)
        vocab.set_vector(key, numpy.random.uniform(-1, 1, (7,)))
    return [English(vocab)(text) for text in texts]
|
||||
|
||||
|
||||
# Test components with a model of type Model[List[Doc], List[Floats2d]]
|
||||
@pytest.mark.parametrize("name", ["tagger", "tok2vec", "morphologizer", "senter"])
def test_components_batching_list(name):
    """Check batched vs. one-by-one predictions for list-output components.

    The component called ``name`` is created from a new ``English`` object and
    its model is handed to ``util_batch_unbatch_docs_list`` together with
    freshly built docs and the ``list_floats`` fixture.
    """
    component = English().create_pipe(name)
    util_batch_unbatch_docs_list(component.model, get_docs(), list_floats)
|
||||
|
||||
|
||||
# Test components with a model of type Model[List[Doc], Floats2d]
|
||||
@pytest.mark.parametrize("name", ["textcat"])
def test_components_batching_array(name):
    """Check batched vs. one-by-one predictions for array-output components.

    The component called ``name`` is created from a new ``English`` object and
    its model is handed to ``util_batch_unbatch_docs_array`` together with
    freshly built docs and the ``array`` fixture.
    """
    component = English().create_pipe(name)
    util_batch_unbatch_docs_array(component.model, get_docs(), array)
|
||||
|
||||
|
||||
# Parametrization cases for test_layers_batching_all, each a tuple of
# (model, in_data, out_data). get_docs() is called once per case, so every
# case receives its own docs.
LAYERS = [
    # out_data: list of float arrays
    (CharacterEmbed(nM=5, nC=3), get_docs(), list_floats),
    # out_data: list of integer arrays
    (FeatureExtractor([100, 200]), get_docs(), list_ints),
    # out_data: Ragged
    (StaticVectors(), get_docs(), ragged),
]
|
||||
|
||||
|
||||
@pytest.mark.parametrize("model,in_data,out_data", LAYERS)
def test_layers_batching_all(model, in_data, out_data):
    """Route each LAYERS case to the batching helper matching its data types.

    For a list-of-Doc input the helper is chosen from ``out_data``:
    a 2-dimensional array, a list whose first item is a 2-dimensional array,
    or a ``Ragged``. Any other combination fails the test explicitly, so a
    case can never pass without having been checked.
    """
    # In = List[Doc]
    if not (isinstance(in_data, list) and isinstance(in_data[0], Doc)):
        pytest.fail(
            f"No batching check available for input type {type(in_data).__name__}"
        )
    if isinstance(out_data, OPS.xp.ndarray) and out_data.ndim == 2:
        util_batch_unbatch_docs_array(model, in_data, out_data)
    elif (
        isinstance(out_data, list)
        and isinstance(out_data[0], OPS.xp.ndarray)
        and out_data[0].ndim == 2
    ):
        util_batch_unbatch_docs_list(model, in_data, out_data)
    elif isinstance(out_data, Ragged):
        util_batch_unbatch_docs_ragged(model, in_data, out_data)
    else:
        pytest.fail(
            f"No batching check available for output type {type(out_data).__name__}"
        )
|
||||
|
||||
|
||||
def util_batch_unbatch_docs_list(
    model: Model[List[Doc], List[Array2d]], in_data: List[Doc], out_data: List[Array2d]
):
    """Assert that batched and one-by-one predictions agree for a list output.

    The model is initialized with ``in_data`` and ``out_data`` under thinc
    data validation, then asked to predict all docs at once and each doc on
    its own. Corresponding outputs must match to 4 decimal places.

    model: The model under test.
    in_data: Docs used for initialization and prediction.
    out_data: Sample output passed to ``model.initialize``.
    """
    with data_validation(True):
        model.initialize(in_data, out_data)
        Y_batched = model.predict(in_data)
        Y_not_batched = [model.predict([u])[0] for u in in_data]
        # Without this, a batched result shorter than the input would go
        # unnoticed, because only the first len(Y_batched) pairs get compared.
        assert len(Y_batched) == len(Y_not_batched)
        for batched, not_batched in zip(Y_batched, Y_not_batched):
            assert_almost_equal(batched, not_batched, decimal=4)
|
||||
|
||||
|
||||
def util_batch_unbatch_docs_array(
    model: Model[List[Doc], Array2d], in_data: List[Doc], out_data: Array2d
):
    """Assert that batched and one-by-one predictions agree for an array output.

    The model is initialized with ``in_data`` and ``out_data`` under thinc
    data validation. The batched prediction (converted with ``tolist()``) is
    compared, to 4 decimal places, against the first row of each single-doc
    prediction.
    """
    with data_validation(True):
        model.initialize(in_data, out_data)
        batched_rows = model.predict(in_data).tolist()
        single_rows = []
        for doc in in_data:
            single_rows.append(model.predict([doc])[0])
        assert_almost_equal(batched_rows, single_rows, decimal=4)
|
||||
|
||||
|
||||
def util_batch_unbatch_docs_ragged(
    model: Model[List[Doc], Ragged], in_data: List[Doc], out_data: Ragged
):
    """Assert that batched and one-by-one predictions agree for a Ragged output.

    The model is initialized with ``in_data`` and ``out_data`` under thinc
    data validation. The ``data`` of the batched prediction is compared, to
    4 decimal places, against the concatenated ``data`` rows of the
    single-doc predictions.
    """
    with data_validation(True):
        model.initialize(in_data, out_data)
        batched = model.predict(in_data)
        one_by_one = [
            row for doc in in_data for row in model.predict([doc]).data.tolist()
        ]
        assert_almost_equal(batched.data, one_by_one, decimal=4)
|
|
@ -1,4 +1,5 @@
|
|||
import pytest
|
||||
from numpy.testing import assert_equal
|
||||
|
||||
from spacy import util
|
||||
from spacy.training import Example
|
||||
|
@ -6,6 +7,7 @@ from spacy.lang.en import English
|
|||
from spacy.language import Language
|
||||
from spacy.tests.util import make_tempdir
|
||||
from spacy.morphology import Morphology
|
||||
from spacy.attrs import MORPH
|
||||
|
||||
|
||||
def test_label_types():
|
||||
|
@ -101,3 +103,16 @@ def test_overfitting_IO():
|
|||
doc2 = nlp2(test_text)
|
||||
assert [str(t.morph) for t in doc2] == gold_morphs
|
||||
assert [t.pos_ for t in doc2] == gold_pos_tags
|
||||
|
||||
# Make sure that running pipe twice, or comparing to call, always amounts to the same predictions
|
||||
texts = [
|
||||
"Just a sentence.",
|
||||
"Then one more sentence about London.",
|
||||
"Here is another one.",
|
||||
"I like London.",
|
||||
]
|
||||
batch_deps_1 = [doc.to_array([MORPH]) for doc in nlp.pipe(texts)]
|
||||
batch_deps_2 = [doc.to_array([MORPH]) for doc in nlp.pipe(texts)]
|
||||
no_batch_deps = [doc.to_array([MORPH]) for doc in [nlp(text) for text in texts]]
|
||||
assert_equal(batch_deps_1, batch_deps_2)
|
||||
assert_equal(batch_deps_1, no_batch_deps)
|
||||
|
|
|
@ -1,4 +1,6 @@
|
|||
import pytest
|
||||
from numpy.testing import assert_equal
|
||||
from spacy.attrs import SENT_START
|
||||
|
||||
from spacy import util
|
||||
from spacy.training import Example
|
||||
|
@ -80,3 +82,18 @@ def test_overfitting_IO():
|
|||
nlp2 = util.load_model_from_path(tmp_dir)
|
||||
doc2 = nlp2(test_text)
|
||||
assert [int(t.is_sent_start) for t in doc2] == gold_sent_starts
|
||||
|
||||
# Make sure that running pipe twice, or comparing to call, always amounts to the same predictions
|
||||
texts = [
|
||||
"Just a sentence.",
|
||||
"Then one more sentence about London.",
|
||||
"Here is another one.",
|
||||
"I like London.",
|
||||
]
|
||||
batch_deps_1 = [doc.to_array([SENT_START]) for doc in nlp.pipe(texts)]
|
||||
batch_deps_2 = [doc.to_array([SENT_START]) for doc in nlp.pipe(texts)]
|
||||
no_batch_deps = [
|
||||
doc.to_array([SENT_START]) for doc in [nlp(text) for text in texts]
|
||||
]
|
||||
assert_equal(batch_deps_1, batch_deps_2)
|
||||
assert_equal(batch_deps_1, no_batch_deps)
|
||||
|
|
|
@ -1,4 +1,7 @@
|
|||
import pytest
|
||||
from numpy.testing import assert_equal
|
||||
from spacy.attrs import TAG
|
||||
|
||||
from spacy import util
|
||||
from spacy.training import Example
|
||||
from spacy.lang.en import English
|
||||
|
@ -117,6 +120,19 @@ def test_overfitting_IO():
|
|||
assert doc2[2].tag_ is "J"
|
||||
assert doc2[3].tag_ is "N"
|
||||
|
||||
# Make sure that running pipe twice, or comparing to call, always amounts to the same predictions
|
||||
texts = [
|
||||
"Just a sentence.",
|
||||
"I like green eggs.",
|
||||
"Here is another one.",
|
||||
"I eat ham.",
|
||||
]
|
||||
batch_deps_1 = [doc.to_array([TAG]) for doc in nlp.pipe(texts)]
|
||||
batch_deps_2 = [doc.to_array([TAG]) for doc in nlp.pipe(texts)]
|
||||
no_batch_deps = [doc.to_array([TAG]) for doc in [nlp(text) for text in texts]]
|
||||
assert_equal(batch_deps_1, batch_deps_2)
|
||||
assert_equal(batch_deps_1, no_batch_deps)
|
||||
|
||||
|
||||
def test_tagger_requires_labels():
|
||||
nlp = English()
|
||||
|
|
|
@ -1,6 +1,7 @@
|
|||
import pytest
|
||||
import random
|
||||
import numpy.random
|
||||
from numpy.testing import assert_equal
|
||||
from thinc.api import fix_random_seed
|
||||
from spacy import util
|
||||
from spacy.lang.en import English
|
||||
|
@ -174,6 +175,14 @@ def test_overfitting_IO():
|
|||
assert scores["cats_score"] == 1.0
|
||||
assert "cats_score_desc" in scores
|
||||
|
||||
# Make sure that running pipe twice, or comparing to call, always amounts to the same predictions
|
||||
texts = ["Just a sentence.", "I like green eggs.", "I am happy.", "I eat ham."]
|
||||
batch_deps_1 = [doc.cats for doc in nlp.pipe(texts)]
|
||||
batch_deps_2 = [doc.cats for doc in nlp.pipe(texts)]
|
||||
no_batch_deps = [doc.cats for doc in [nlp(text) for text in texts]]
|
||||
assert_equal(batch_deps_1, batch_deps_2)
|
||||
assert_equal(batch_deps_1, no_batch_deps)
|
||||
|
||||
|
||||
# fmt: off
|
||||
@pytest.mark.parametrize(
|
||||
|
|
Loading…
Reference in New Issue
Block a user