# spaCy/spacy/tests/pipeline/test_models.py

from typing import List

import numpy
import pytest
from numpy.testing import assert_almost_equal
from spacy.vocab import Vocab
from thinc.api import Model, data_validation, get_current_ops
from thinc.types import Array2d, Ragged

from spacy.lang.en import English
from spacy.ml import FeatureExtractor, StaticVectors
from spacy.ml.character_embed import CharacterEmbed
from spacy.tokens import Doc


# thinc's currently active ops; every array fixture below is created through
# its array module (`OPS.xp`).
OPS = get_current_ops()

# Two input texts with 4 and 3 whitespace-separated words respectively.
texts = ["These are 4 words", "Here just three"]
# Raw sample values: l0 has 4 rows and l1 has 3 rows, one row per word.
l0 = [[1, 2], [3, 4], [5, 6], [7, 8]]
l1 = [[9, 8], [7, 6], [5, 4]]
# Sample outputs: one 2d array per text, as floats and as ints.
list_floats = [OPS.xp.asarray(rows, dtype="f") for rows in (l0, l1)]
list_ints = [OPS.xp.asarray(rows, dtype="i") for rows in (l0, l1)]
# Sample output: a single 2d float array.
array = OPS.xp.asarray(l1, dtype="f")
# Sample output: the same 3 rows split into segments of length 2 and 1.
ragged = Ragged(array, OPS.xp.asarray([2, 1], dtype="i"))


def get_docs():
    """Return one Doc per entry of `texts`, all built on a single shared vocab.

    Before tokenizing, every whitespace-separated word of every text is added
    to the vocab's string store and given a random 7-dimensional vector drawn
    uniformly from [-1, 1).
    """
    shared_vocab = Vocab()
    every_word = (token for sentence in texts for token in sentence.split())
    for token in every_word:
        key = shared_vocab.strings.add(token)
        shared_vocab.set_vector(key, numpy.random.uniform(-1, 1, (7,)))
    return [English(shared_vocab)(sentence) for sentence in texts]


# Components whose model has type Model[List[Doc], List[Floats2d]]
@pytest.mark.parametrize("name", ["tagger", "tok2vec", "morphologizer", "senter"])
def test_components_batching_list(name):
    """Check that the component's model predicts the same for a batch of docs
    as for each doc on its own (list-of-arrays output)."""
    component = English().create_pipe(name)
    util_batch_unbatch_docs_list(component.model, get_docs(), list_floats)


# Components whose model has type Model[List[Doc], Floats2d]
@pytest.mark.parametrize("name", ["textcat"])
def test_components_batching_array(name):
    """Check that the component's model predicts the same for a batch of docs
    as for each doc on its own (single 2d array output)."""
    component = English().create_pipe(name)
    util_batch_unbatch_docs_array(component.model, get_docs(), array)


# Parametrization cases for test_layers_batching_all. Each entry is
# (layer model, input docs, sample output). The sample output is handed to
# model.initialize() by the batching helpers, and its type decides which of
# those helpers the test dispatches to.
LAYERS = [
    (CharacterEmbed(nM=5, nC=3), get_docs(), list_floats),
    (FeatureExtractor([100, 200]), get_docs(), list_ints),
    (StaticVectors(), get_docs(), ragged),
]


@pytest.mark.parametrize("model,in_data,out_data", LAYERS)
def test_layers_batching_all(model, in_data, out_data):
    """Dispatch each LAYERS case to the batching check matching its output type.

    model: the layer under test.
    in_data: the inputs; only a non-empty list of Doc objects is supported.
    out_data: a sample output. A 2d array, a non-empty list of 2d arrays, or a
        Ragged selects the array, list, or ragged helper respectively.

    Any case that matches none of the supported input/output shapes fails
    explicitly, so a parametrized case can never pass without having checked
    anything.
    """
    ndarray = OPS.xp.ndarray
    # In = List[Doc]
    if not (isinstance(in_data, list) and in_data and isinstance(in_data[0], Doc)):
        pytest.fail(
            f"Unsupported input for batching test: {type(in_data).__name__}"
        )
    if isinstance(out_data, ndarray) and out_data.ndim == 2:
        util_batch_unbatch_docs_array(model, in_data, out_data)
    elif (
        isinstance(out_data, list)
        and out_data
        and isinstance(out_data[0], ndarray)
        and out_data[0].ndim == 2
    ):
        util_batch_unbatch_docs_list(model, in_data, out_data)
    elif isinstance(out_data, Ragged):
        util_batch_unbatch_docs_ragged(model, in_data, out_data)
    else:
        # Without this branch an unrecognised output type would pass silently.
        pytest.fail(
            f"Unsupported output for batching test: {type(out_data).__name__}"
        )


def util_batch_unbatch_docs_list(
    model: Model[List[Doc], List[Array2d]], in_data: List[Doc], out_data: List[Array2d]
):
    """Assert that batched and per-doc predictions agree for list outputs.

    model: model to initialize and run.
    in_data: the docs, predicted once as a whole batch and once one at a time.
    out_data: sample output passed to `model.initialize`.

    Raises AssertionError if the number of outputs differs between the two
    modes, or if any pair of outputs differs beyond 4 decimal places.
    """
    with data_validation(True):
        model.initialize(in_data, out_data)
        Y_batched = model.predict(in_data)
        # Each single-doc call returns a one-element list; keep that element.
        Y_not_batched = [model.predict([u])[0] for u in in_data]
        # Without this check a batched result with fewer entries would be
        # compared only partially and still pass.
        assert len(Y_batched) == len(Y_not_batched)
        for batched, single in zip(Y_batched, Y_not_batched):
            assert_almost_equal(
                OPS.to_numpy(batched), OPS.to_numpy(single), decimal=4
            )


def util_batch_unbatch_docs_array(
    model: Model[List[Doc], Array2d], in_data: List[Doc], out_data: Array2d
):
    """Assert that batched and per-doc predictions agree for a 2d array output.

    The model is initialized with `in_data` and `out_data`. The batched
    prediction, converted to nested lists, is then compared (to 4 decimals)
    with the list made of the first row of each single-doc prediction.
    """
    with data_validation(True):
        model.initialize(in_data, out_data)
        batch_rows = model.predict(in_data).tolist()
        single_rows = []
        for doc in in_data:
            first_row = model.predict([doc])[0]
            single_rows.append(first_row.tolist())
        assert_almost_equal(batch_rows, single_rows, decimal=4)


def util_batch_unbatch_docs_ragged(
    model: Model[List[Doc], Ragged], in_data: List[Doc], out_data: Ragged
):
    """Assert that batched and per-doc predictions agree for a Ragged output.

    The model is initialized with `in_data` and `out_data`. The `.data` of the
    batched prediction is then compared (to 4 decimals) with the `.data` of
    the single-doc predictions concatenated in input order.
    """
    with data_validation(True):
        model.initialize(in_data, out_data)
        batch_rows = model.predict(in_data).data.tolist()
        single_rows = [
            row for doc in in_data for row in model.predict([doc]).data.tolist()
        ]
        assert_almost_equal(batch_rows, single_rows, decimal=4)
component tests single or multiple prediction 2020-10-13 17:26:53 +03:00			`from typing import List`
add tests for individual spacy layers 2020-10-13 19:50:07 +03:00
			`import numpy`
component tests single or multiple prediction 2020-10-13 17:26:53 +03:00			`import pytest`
add tests for individual spacy layers 2020-10-13 19:50:07 +03:00			`from numpy.testing import assert_almost_equal`
			`from spacy.vocab import Vocab`
Set up GPU CI testing (#7293) * Set up CI for tests with GPU agent * Update tests for enabled GPU * Fix steps filename * Add parallel build jobs as a setting * Fix test requirements * Fix install test requirements condition * Fix pipeline models test * Reset current ops in prefer/require testing * Fix more tests * Remove separate test_models test * Fix regression 5551 * fix StaticVectors for GPU use * fix vocab tests * Fix regression test 5082 * Move azure steps to .github and reenable default pool jobs * Consolidate/rename azure steps Co-authored-by: svlandeg <sofie.vanlandeghem@gmail.com> 2021-04-22 15:58:29 +03:00			`from thinc.api import Model, data_validation, get_current_ops`
add tests for individual spacy layers 2020-10-13 19:50:07 +03:00			`from thinc.types import Array2d, Ragged`
component tests single or multiple prediction 2020-10-13 17:26:53 +03:00
			`from spacy.lang.en import English`
add tests for individual spacy layers 2020-10-13 19:50:07 +03:00			`from spacy.ml import FeatureExtractor, StaticVectors`
Make stable private modules public and adjust names (#11353) * Make stable private modules public and adjust names * `spacy.ml._character_embed` -> `spacy.ml.character_embed` * `spacy.ml._precomputable_affine` -> `spacy.ml.precomputable_affine` * `spacy.tokens._serialize` -> `spacy.tokens.doc_bin` * `spacy.tokens._retokenize` -> `spacy.tokens.retokenize` * `spacy.tokens._dict_proxies` -> `spacy.tokens.span_groups` * Skip _precomputable_affine * retokenize -> retokenizer * Fix imports 2022-08-30 14:56:35 +03:00			`from spacy.ml.character_embed import CharacterEmbed`
component tests single or multiple prediction 2020-10-13 17:26:53 +03:00			`from spacy.tokens import Doc`

call NumpyOps instead of get_current_ops() 2020-10-14 17:55:00 +03:00
Set up GPU CI testing (#7293) * Set up CI for tests with GPU agent * Update tests for enabled GPU * Fix steps filename * Add parallel build jobs as a setting * Fix test requirements * Fix install test requirements condition * Fix pipeline models test * Reset current ops in prefer/require testing * Fix more tests * Remove separate test_models test * Fix regression 5551 * fix StaticVectors for GPU use * fix vocab tests * Fix regression test 5082 * Move azure steps to .github and reenable default pool jobs * Consolidate/rename azure steps Co-authored-by: svlandeg <sofie.vanlandeghem@gmail.com> 2021-04-22 15:58:29 +03:00			`OPS = get_current_ops()`
component tests single or multiple prediction 2020-10-13 17:26:53 +03:00
add tests for individual spacy layers 2020-10-13 19:50:07 +03:00			`texts = ["These are 4 words", "Here just three"]`
component tests single or multiple prediction 2020-10-13 17:26:53 +03:00			`l0 = [[1, 2], [3, 4], [5, 6], [7, 8]]`
			`l1 = [[9, 8], [7, 6], [5, 4]]`
add tests for individual spacy layers 2020-10-13 19:50:07 +03:00			`list_floats = [OPS.xp.asarray(l0, dtype="f"), OPS.xp.asarray(l1, dtype="f")]`
			`list_ints = [OPS.xp.asarray(l0, dtype="i"), OPS.xp.asarray(l1, dtype="i")]`
			`array = OPS.xp.asarray(l1, dtype="f")`
			`ragged = Ragged(array, OPS.xp.asarray([2, 1], dtype="i"))`


			`def get_docs():`
			`vocab = Vocab()`
			`for t in texts:`
			`for word in t.split():`
			`hash_id = vocab.strings.add(word)`
			`vector = numpy.random.uniform(-1, 1, (7,))`
			`vocab.set_vector(hash_id, vector)`
			`docs = [English(vocab)(t) for t in texts]`
			`return docs`

component tests single or multiple prediction 2020-10-13 17:26:53 +03:00
			`# Test components with a model of type Model[List[Doc], List[Floats2d]]`
			`@pytest.mark.parametrize("name", ["tagger", "tok2vec", "morphologizer", "senter"])`
add tests for individual spacy layers 2020-10-13 19:50:07 +03:00			`def test_components_batching_list(name):`
component tests single or multiple prediction 2020-10-13 17:26:53 +03:00			`nlp = English()`
			`proc = nlp.create_pipe(name)`
naming 2020-10-13 19:52:37 +03:00			`util_batch_unbatch_docs_list(proc.model, get_docs(), list_floats)`
component tests single or multiple prediction 2020-10-13 17:26:53 +03:00

			`# Test components with a model of type Model[List[Doc], Floats2d]`
			`@pytest.mark.parametrize("name", ["textcat"])`
add tests for individual spacy layers 2020-10-13 19:50:07 +03:00			`def test_components_batching_array(name):`
component tests single or multiple prediction 2020-10-13 17:26:53 +03:00			`nlp = English()`
			`proc = nlp.create_pipe(name)`
naming 2020-10-13 19:52:37 +03:00			`util_batch_unbatch_docs_array(proc.model, get_docs(), array)`
add tests for individual spacy layers 2020-10-13 19:50:07 +03:00

			`LAYERS = [`
			`(CharacterEmbed(nM=5, nC=3), get_docs(), list_floats),`
			`(FeatureExtractor([100, 200]), get_docs(), list_ints),`
			`(StaticVectors(), get_docs(), ragged),`
			`]`


			`@pytest.mark.parametrize("model,in_data,out_data", LAYERS)`
			`def test_layers_batching_all(model, in_data, out_data):`
			`# In = List[Doc]`
			`if isinstance(in_data, list) and isinstance(in_data[0], Doc):`
			`if isinstance(out_data, OPS.xp.ndarray) and out_data.ndim == 2:`
naming 2020-10-13 19:52:37 +03:00			`util_batch_unbatch_docs_array(model, in_data, out_data)`
add tests for individual spacy layers 2020-10-13 19:50:07 +03:00			`elif (`
			`isinstance(out_data, list)`
			`and isinstance(out_data[0], OPS.xp.ndarray)`
			`and out_data[0].ndim == 2`
			`):`
naming 2020-10-13 19:52:37 +03:00			`util_batch_unbatch_docs_list(model, in_data, out_data)`
add tests for individual spacy layers 2020-10-13 19:50:07 +03:00			`elif isinstance(out_data, Ragged):`
naming 2020-10-13 19:52:37 +03:00			`util_batch_unbatch_docs_ragged(model, in_data, out_data)`
component tests single or multiple prediction 2020-10-13 17:26:53 +03:00
add tests for individual spacy layers 2020-10-13 19:50:07 +03:00
naming 2020-10-13 19:52:37 +03:00			`def util_batch_unbatch_docs_list(`
add tests for individual spacy layers 2020-10-13 19:50:07 +03:00			`model: Model[List[Doc], List[Array2d]], in_data: List[Doc], out_data: List[Array2d]`
			`):`
component tests single or multiple prediction 2020-10-13 17:26:53 +03:00			`with data_validation(True):`
			`model.initialize(in_data, out_data)`
			`Y_batched = model.predict(in_data)`
			`Y_not_batched = [model.predict([u])[0] for u in in_data]`
add tests for individual spacy layers 2020-10-13 19:50:07 +03:00			`for i in range(len(Y_batched)):`
Tidy up code 2021-06-28 12:48:00 +03:00			`assert_almost_equal(`
			`OPS.to_numpy(Y_batched[i]), OPS.to_numpy(Y_not_batched[i]), decimal=4`
			`)`
add tests for individual spacy layers 2020-10-13 19:50:07 +03:00

naming 2020-10-13 19:52:37 +03:00			`def util_batch_unbatch_docs_array(`
add tests for individual spacy layers 2020-10-13 19:50:07 +03:00			`model: Model[List[Doc], Array2d], in_data: List[Doc], out_data: Array2d`
			`):`
			`with data_validation(True):`
			`model.initialize(in_data, out_data)`
			`Y_batched = model.predict(in_data).tolist()`
Set up GPU CI testing (#7293) * Set up CI for tests with GPU agent * Update tests for enabled GPU * Fix steps filename * Add parallel build jobs as a setting * Fix test requirements * Fix install test requirements condition * Fix pipeline models test * Reset current ops in prefer/require testing * Fix more tests * Remove separate test_models test * Fix regression 5551 * fix StaticVectors for GPU use * fix vocab tests * Fix regression test 5082 * Move azure steps to .github and reenable default pool jobs * Consolidate/rename azure steps Co-authored-by: svlandeg <sofie.vanlandeghem@gmail.com> 2021-04-22 15:58:29 +03:00			`Y_not_batched = [model.predict([u])[0].tolist() for u in in_data]`
add tests for individual spacy layers 2020-10-13 19:50:07 +03:00			`assert_almost_equal(Y_batched, Y_not_batched, decimal=4)`


naming 2020-10-13 19:52:37 +03:00			`def util_batch_unbatch_docs_ragged(`
add tests for individual spacy layers 2020-10-13 19:50:07 +03:00			`model: Model[List[Doc], Ragged], in_data: List[Doc], out_data: Ragged`
			`):`
			`with data_validation(True):`
			`model.initialize(in_data, out_data)`
Set up GPU CI testing (#7293) * Set up CI for tests with GPU agent * Update tests for enabled GPU * Fix steps filename * Add parallel build jobs as a setting * Fix test requirements * Fix install test requirements condition * Fix pipeline models test * Reset current ops in prefer/require testing * Fix more tests * Remove separate test_models test * Fix regression 5551 * fix StaticVectors for GPU use * fix vocab tests * Fix regression test 5082 * Move azure steps to .github and reenable default pool jobs * Consolidate/rename azure steps Co-authored-by: svlandeg <sofie.vanlandeghem@gmail.com> 2021-04-22 15:58:29 +03:00			`Y_batched = model.predict(in_data).data.tolist()`
add tests for individual spacy layers 2020-10-13 19:50:07 +03:00			`Y_not_batched = []`
			`for u in in_data:`
			`Y_not_batched.extend(model.predict([u]).data.tolist())`
Set up GPU CI testing (#7293) * Set up CI for tests with GPU agent * Update tests for enabled GPU * Fix steps filename * Add parallel build jobs as a setting * Fix test requirements * Fix install test requirements condition * Fix pipeline models test * Reset current ops in prefer/require testing * Fix more tests * Remove separate test_models test * Fix regression 5551 * fix StaticVectors for GPU use * fix vocab tests * Fix regression test 5082 * Move azure steps to .github and reenable default pool jobs * Consolidate/rename azure steps Co-authored-by: svlandeg <sofie.vanlandeghem@gmail.com> 2021-04-22 15:58:29 +03:00			`assert_almost_equal(Y_batched, Y_not_batched, decimal=4)`