Add test for old architectures (#10751)

* add v1 and v2 tests for tok2vec architectures

* textcat architectures are not "layers"

* test older textcat architectures

* test older parser architecture
This commit is contained in:
Sofie Van Landeghem 2022-05-10 08:24:42 +02:00 committed by GitHub
parent 733114bdd9
commit 1543558d08
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
4 changed files with 119 additions and 53 deletions

View File

@ -12,6 +12,7 @@ from spacy.vocab import Vocab
from ...pipeline import DependencyParser from ...pipeline import DependencyParser
from ...pipeline.dep_parser import DEFAULT_PARSER_MODEL from ...pipeline.dep_parser import DEFAULT_PARSER_MODEL
from ..util import apply_transition_sequence, make_tempdir from ..util import apply_transition_sequence, make_tempdir
from ...pipeline.tok2vec import DEFAULT_TOK2VEC_MODEL
TRAIN_DATA = [ TRAIN_DATA = [
( (
@ -395,6 +396,34 @@ def test_overfitting_IO(pipe_name):
assert_equal(batch_deps_1, no_batch_deps) assert_equal(batch_deps_1, no_batch_deps)
# fmt: off
@pytest.mark.slow
@pytest.mark.parametrize("pipe_name", ["parser", "beam_parser"])
@pytest.mark.parametrize(
"parser_config",
[
# TransitionBasedParser V1
({"@architectures": "spacy.TransitionBasedParser.v1", "tok2vec": DEFAULT_TOK2VEC_MODEL, "state_type": "parser", "extra_state_tokens": False, "hidden_width": 64, "maxout_pieces": 2, "use_upper": True}),
# TransitionBasedParser V2
({"@architectures": "spacy.TransitionBasedParser.v2", "tok2vec": DEFAULT_TOK2VEC_MODEL, "state_type": "parser", "extra_state_tokens": False, "hidden_width": 64, "maxout_pieces": 2, "use_upper": True}),
],
)
# fmt: on
def test_parser_configs(pipe_name, parser_config):
pipe_config = {"model": parser_config}
nlp = English()
parser = nlp.add_pipe(pipe_name, config=pipe_config)
train_examples = []
for text, annotations in TRAIN_DATA:
train_examples.append(Example.from_dict(nlp.make_doc(text), annotations))
for dep in annotations.get("deps", []):
parser.add_label(dep)
optimizer = nlp.initialize()
for i in range(5):
losses = {}
nlp.update(train_examples, sgd=optimizer, losses=losses)
def test_beam_parser_scores(): def test_beam_parser_scores():
# Test that we can get confidence values out of the beam_parser pipe # Test that we can get confidence values out of the beam_parser pipe
beam_width = 16 beam_width = 16

View File

@ -382,6 +382,7 @@ def test_implicit_label(name, get_examples):
# fmt: off # fmt: off
@pytest.mark.slow
@pytest.mark.parametrize( @pytest.mark.parametrize(
"name,textcat_config", "name,textcat_config",
[ [
@ -390,7 +391,10 @@ def test_implicit_label(name, get_examples):
("textcat", {"@architectures": "spacy.TextCatBOW.v1", "exclusive_classes": True, "no_output_layer": True, "ngram_size": 3}), ("textcat", {"@architectures": "spacy.TextCatBOW.v1", "exclusive_classes": True, "no_output_layer": True, "ngram_size": 3}),
("textcat_multilabel", {"@architectures": "spacy.TextCatBOW.v1", "exclusive_classes": False, "no_output_layer": False, "ngram_size": 3}), ("textcat_multilabel", {"@architectures": "spacy.TextCatBOW.v1", "exclusive_classes": False, "no_output_layer": False, "ngram_size": 3}),
("textcat_multilabel", {"@architectures": "spacy.TextCatBOW.v1", "exclusive_classes": False, "no_output_layer": True, "ngram_size": 3}), ("textcat_multilabel", {"@architectures": "spacy.TextCatBOW.v1", "exclusive_classes": False, "no_output_layer": True, "ngram_size": 3}),
# ENSEMBLE # ENSEMBLE V1
("textcat", {"@architectures": "spacy.TextCatEnsemble.v1", "exclusive_classes": False, "pretrained_vectors": None, "width": 64, "embed_size": 2000, "conv_depth": 2, "window_size": 1, "ngram_size": 1, "dropout": None}),
("textcat_multilabel", {"@architectures": "spacy.TextCatEnsemble.v1", "exclusive_classes": False, "pretrained_vectors": None, "width": 64, "embed_size": 2000, "conv_depth": 2, "window_size": 1, "ngram_size": 1, "dropout": None}),
# ENSEMBLE V2
("textcat", {"@architectures": "spacy.TextCatEnsemble.v2", "tok2vec": DEFAULT_TOK2VEC_MODEL, "linear_model": {"@architectures": "spacy.TextCatBOW.v1", "exclusive_classes": True, "no_output_layer": False, "ngram_size": 3}}), ("textcat", {"@architectures": "spacy.TextCatEnsemble.v2", "tok2vec": DEFAULT_TOK2VEC_MODEL, "linear_model": {"@architectures": "spacy.TextCatBOW.v1", "exclusive_classes": True, "no_output_layer": False, "ngram_size": 3}}),
("textcat", {"@architectures": "spacy.TextCatEnsemble.v2", "tok2vec": DEFAULT_TOK2VEC_MODEL, "linear_model": {"@architectures": "spacy.TextCatBOW.v1", "exclusive_classes": True, "no_output_layer": True, "ngram_size": 3}}), ("textcat", {"@architectures": "spacy.TextCatEnsemble.v2", "tok2vec": DEFAULT_TOK2VEC_MODEL, "linear_model": {"@architectures": "spacy.TextCatBOW.v1", "exclusive_classes": True, "no_output_layer": True, "ngram_size": 3}}),
("textcat_multilabel", {"@architectures": "spacy.TextCatEnsemble.v2", "tok2vec": DEFAULT_TOK2VEC_MODEL, "linear_model": {"@architectures": "spacy.TextCatBOW.v1", "exclusive_classes": False, "no_output_layer": False, "ngram_size": 3}}), ("textcat_multilabel", {"@architectures": "spacy.TextCatEnsemble.v2", "tok2vec": DEFAULT_TOK2VEC_MODEL, "linear_model": {"@architectures": "spacy.TextCatBOW.v1", "exclusive_classes": False, "no_output_layer": False, "ngram_size": 3}}),
@ -643,15 +647,28 @@ def test_overfitting_IO_multi():
# fmt: off # fmt: off
@pytest.mark.slow
@pytest.mark.parametrize( @pytest.mark.parametrize(
"name,train_data,textcat_config", "name,train_data,textcat_config",
[ [
# BOW V1
("textcat_multilabel", TRAIN_DATA_MULTI_LABEL, {"@architectures": "spacy.TextCatBOW.v1", "exclusive_classes": False, "ngram_size": 1, "no_output_layer": False}),
("textcat", TRAIN_DATA_SINGLE_LABEL, {"@architectures": "spacy.TextCatBOW.v1", "exclusive_classes": True, "ngram_size": 4, "no_output_layer": False}),
# ENSEMBLE V1
("textcat_multilabel", TRAIN_DATA_MULTI_LABEL, {"@architectures": "spacy.TextCatEnsemble.v1", "exclusive_classes": False, "pretrained_vectors": None, "width": 64, "embed_size": 2000, "conv_depth": 2, "window_size": 1, "ngram_size": 1, "dropout": None}),
("textcat", TRAIN_DATA_SINGLE_LABEL, {"@architectures": "spacy.TextCatEnsemble.v1", "exclusive_classes": False, "pretrained_vectors": None, "width": 64, "embed_size": 2000, "conv_depth": 2, "window_size": 1, "ngram_size": 1, "dropout": None}),
# CNN V1
("textcat", TRAIN_DATA_SINGLE_LABEL, {"@architectures": "spacy.TextCatCNN.v1", "tok2vec": DEFAULT_TOK2VEC_MODEL, "exclusive_classes": True}),
("textcat_multilabel", TRAIN_DATA_MULTI_LABEL, {"@architectures": "spacy.TextCatCNN.v1", "tok2vec": DEFAULT_TOK2VEC_MODEL, "exclusive_classes": False}),
# BOW V2
("textcat_multilabel", TRAIN_DATA_MULTI_LABEL, {"@architectures": "spacy.TextCatBOW.v2", "exclusive_classes": False, "ngram_size": 1, "no_output_layer": False}), ("textcat_multilabel", TRAIN_DATA_MULTI_LABEL, {"@architectures": "spacy.TextCatBOW.v2", "exclusive_classes": False, "ngram_size": 1, "no_output_layer": False}),
("textcat", TRAIN_DATA_SINGLE_LABEL, {"@architectures": "spacy.TextCatBOW.v2", "exclusive_classes": True, "ngram_size": 4, "no_output_layer": False}), ("textcat", TRAIN_DATA_SINGLE_LABEL, {"@architectures": "spacy.TextCatBOW.v2", "exclusive_classes": True, "ngram_size": 4, "no_output_layer": False}),
("textcat_multilabel", TRAIN_DATA_MULTI_LABEL, {"@architectures": "spacy.TextCatBOW.v2", "exclusive_classes": False, "ngram_size": 3, "no_output_layer": True}), ("textcat_multilabel", TRAIN_DATA_MULTI_LABEL, {"@architectures": "spacy.TextCatBOW.v2", "exclusive_classes": False, "ngram_size": 3, "no_output_layer": True}),
("textcat", TRAIN_DATA_SINGLE_LABEL, {"@architectures": "spacy.TextCatBOW.v2", "exclusive_classes": True, "ngram_size": 2, "no_output_layer": True}), ("textcat", TRAIN_DATA_SINGLE_LABEL, {"@architectures": "spacy.TextCatBOW.v2", "exclusive_classes": True, "ngram_size": 2, "no_output_layer": True}),
# ENSEMBLE V2
("textcat_multilabel", TRAIN_DATA_MULTI_LABEL, {"@architectures": "spacy.TextCatEnsemble.v2", "tok2vec": DEFAULT_TOK2VEC_MODEL, "linear_model": {"@architectures": "spacy.TextCatBOW.v2", "exclusive_classes": False, "ngram_size": 1, "no_output_layer": False}}), ("textcat_multilabel", TRAIN_DATA_MULTI_LABEL, {"@architectures": "spacy.TextCatEnsemble.v2", "tok2vec": DEFAULT_TOK2VEC_MODEL, "linear_model": {"@architectures": "spacy.TextCatBOW.v2", "exclusive_classes": False, "ngram_size": 1, "no_output_layer": False}}),
("textcat", TRAIN_DATA_SINGLE_LABEL, {"@architectures": "spacy.TextCatEnsemble.v2", "tok2vec": DEFAULT_TOK2VEC_MODEL, "linear_model": {"@architectures": "spacy.TextCatBOW.v2", "exclusive_classes": True, "ngram_size": 5, "no_output_layer": False}}), ("textcat", TRAIN_DATA_SINGLE_LABEL, {"@architectures": "spacy.TextCatEnsemble.v2", "tok2vec": DEFAULT_TOK2VEC_MODEL, "linear_model": {"@architectures": "spacy.TextCatBOW.v2", "exclusive_classes": True, "ngram_size": 5, "no_output_layer": False}}),
# CNN V2
("textcat", TRAIN_DATA_SINGLE_LABEL, {"@architectures": "spacy.TextCatCNN.v2", "tok2vec": DEFAULT_TOK2VEC_MODEL, "exclusive_classes": True}), ("textcat", TRAIN_DATA_SINGLE_LABEL, {"@architectures": "spacy.TextCatCNN.v2", "tok2vec": DEFAULT_TOK2VEC_MODEL, "exclusive_classes": True}),
("textcat_multilabel", TRAIN_DATA_MULTI_LABEL, {"@architectures": "spacy.TextCatCNN.v2", "tok2vec": DEFAULT_TOK2VEC_MODEL, "exclusive_classes": False}), ("textcat_multilabel", TRAIN_DATA_MULTI_LABEL, {"@architectures": "spacy.TextCatCNN.v2", "tok2vec": DEFAULT_TOK2VEC_MODEL, "exclusive_classes": False}),
], ],

View File

@ -1,13 +1,13 @@
import pytest import pytest
from spacy.ml.models.tok2vec import build_Tok2Vec_model from spacy.ml.models.tok2vec import build_Tok2Vec_model
from spacy.ml.models.tok2vec import MultiHashEmbed, CharacterEmbed from spacy.ml.models.tok2vec import MultiHashEmbed, MaxoutWindowEncoder
from spacy.ml.models.tok2vec import MishWindowEncoder, MaxoutWindowEncoder
from spacy.pipeline.tok2vec import Tok2Vec, Tok2VecListener from spacy.pipeline.tok2vec import Tok2Vec, Tok2VecListener
from spacy.vocab import Vocab from spacy.vocab import Vocab
from spacy.tokens import Doc from spacy.tokens import Doc
from spacy.training import Example from spacy.training import Example
from spacy import util from spacy import util
from spacy.lang.en import English from spacy.lang.en import English
from spacy.util import registry
from thinc.api import Config, get_current_ops from thinc.api import Config, get_current_ops
from numpy.testing import assert_array_equal from numpy.testing import assert_array_equal
@ -55,24 +55,41 @@ def test_tok2vec_batch_sizes(batch_size, width, embed_size):
assert doc_vec.shape == (len(doc), width) assert doc_vec.shape == (len(doc), width)
@pytest.mark.slow
@pytest.mark.parametrize("width", [8])
@pytest.mark.parametrize( @pytest.mark.parametrize(
"width,embed_arch,embed_config,encode_arch,encode_config", "embed_arch,embed_config",
# fmt: off # fmt: off
[ [
(8, MultiHashEmbed, {"rows": [100, 100], "attrs": ["SHAPE", "LOWER"], "include_static_vectors": False}, MaxoutWindowEncoder, {"window_size": 1, "maxout_pieces": 3, "depth": 2}), ("spacy.MultiHashEmbed.v1", {"rows": [100, 100], "attrs": ["SHAPE", "LOWER"], "include_static_vectors": False}),
(8, MultiHashEmbed, {"rows": [100, 20], "attrs": ["ORTH", "PREFIX"], "include_static_vectors": False}, MishWindowEncoder, {"window_size": 1, "depth": 6}), ("spacy.MultiHashEmbed.v1", {"rows": [100, 20], "attrs": ["ORTH", "PREFIX"], "include_static_vectors": False}),
(8, CharacterEmbed, {"rows": 100, "nM": 64, "nC": 8, "include_static_vectors": False}, MaxoutWindowEncoder, {"window_size": 1, "maxout_pieces": 3, "depth": 3}), ("spacy.CharacterEmbed.v1", {"rows": 100, "nM": 64, "nC": 8, "include_static_vectors": False}),
(8, CharacterEmbed, {"rows": 100, "nM": 16, "nC": 2, "include_static_vectors": False}, MishWindowEncoder, {"window_size": 1, "depth": 3}), ("spacy.CharacterEmbed.v1", {"rows": 100, "nM": 16, "nC": 2, "include_static_vectors": False}),
], ],
# fmt: on # fmt: on
) )
def test_tok2vec_configs(width, embed_arch, embed_config, encode_arch, encode_config): @pytest.mark.parametrize(
"tok2vec_arch,encode_arch,encode_config",
# fmt: off
[
("spacy.Tok2Vec.v1", "spacy.MaxoutWindowEncoder.v1", {"window_size": 1, "maxout_pieces": 3, "depth": 2}),
("spacy.Tok2Vec.v2", "spacy.MaxoutWindowEncoder.v2", {"window_size": 1, "maxout_pieces": 3, "depth": 2}),
("spacy.Tok2Vec.v1", "spacy.MishWindowEncoder.v1", {"window_size": 1, "depth": 6}),
("spacy.Tok2Vec.v2", "spacy.MishWindowEncoder.v2", {"window_size": 1, "depth": 6}),
],
# fmt: on
)
def test_tok2vec_configs(
width, tok2vec_arch, embed_arch, embed_config, encode_arch, encode_config
):
embed = registry.get("architectures", embed_arch)
encode = registry.get("architectures", encode_arch)
tok2vec_model = registry.get("architectures", tok2vec_arch)
embed_config["width"] = width embed_config["width"] = width
encode_config["width"] = width encode_config["width"] = width
docs = get_batch(3) docs = get_batch(3)
tok2vec = build_Tok2Vec_model( tok2vec = tok2vec_model(embed(**embed_config), encode(**encode_config))
embed_arch(**embed_config), encode_arch(**encode_config)
)
tok2vec.initialize(docs) tok2vec.initialize(docs)
vectors, backprop = tok2vec.begin_update(docs) vectors, backprop = tok2vec.begin_update(docs)
assert len(vectors) == len(docs) assert len(vectors) == len(docs)

View File

@ -103,11 +103,22 @@ and residual connections.
| `depth` | The number of convolutional layers. Recommended value is `4`. ~~int~~ | | `depth` | The number of convolutional layers. Recommended value is `4`. ~~int~~ |
| **CREATES** | The model using the architecture. ~~Model[Floats2d, Floats2d]~~ | | **CREATES** | The model using the architecture. ~~Model[Floats2d, Floats2d]~~ |
### spacy.TransitionBasedParser.v1 {#TransitionBasedParser_v1} ### spacy.HashEmbedCNN.v1 {#HashEmbedCNN_v1}
Identical to Identical to [`spacy.HashEmbedCNN.v2`](/api/architectures#HashEmbedCNN) except
[`spacy.TransitionBasedParser.v2`](/api/architectures#TransitionBasedParser) using [`spacy.StaticVectors.v1`](#StaticVectors_v1) if vectors are included.
except the `use_upper` was set to `True` by default.
### spacy.MultiHashEmbed.v1 {#MultiHashEmbed_v1}
Identical to [`spacy.MultiHashEmbed.v2`](/api/architectures#MultiHashEmbed)
except with [`spacy.StaticVectors.v1`](#StaticVectors_v1) if vectors are
included.
### spacy.CharacterEmbed.v1 {#CharacterEmbed_v1}
Identical to [`spacy.CharacterEmbed.v2`](/api/architectures#CharacterEmbed)
except using [`spacy.StaticVectors.v1`](#StaticVectors_v1) if vectors are
included.
### spacy.TextCatEnsemble.v1 {#TextCatEnsemble_v1} ### spacy.TextCatEnsemble.v1 {#TextCatEnsemble_v1}
@ -147,41 +158,6 @@ network has an internal CNN Tok2Vec layer and uses attention.
| `nO` | Output dimension, determined by the number of different labels. If not set, the [`TextCategorizer`](/api/textcategorizer) component will set it when `initialize` is called. ~~Optional[int]~~ | | `nO` | Output dimension, determined by the number of different labels. If not set, the [`TextCategorizer`](/api/textcategorizer) component will set it when `initialize` is called. ~~Optional[int]~~ |
| **CREATES** | The model using the architecture. ~~Model[List[Doc], Floats2d]~~ | | **CREATES** | The model using the architecture. ~~Model[List[Doc], Floats2d]~~ |
### spacy.HashEmbedCNN.v1 {#HashEmbedCNN_v1}
Identical to [`spacy.HashEmbedCNN.v2`](/api/architectures#HashEmbedCNN) except
using [`spacy.StaticVectors.v1`](#StaticVectors_v1) if vectors are included.
### spacy.MultiHashEmbed.v1 {#MultiHashEmbed_v1}
Identical to [`spacy.MultiHashEmbed.v2`](/api/architectures#MultiHashEmbed)
except with [`spacy.StaticVectors.v1`](#StaticVectors_v1) if vectors are
included.
### spacy.CharacterEmbed.v1 {#CharacterEmbed_v1}
Identical to [`spacy.CharacterEmbed.v2`](/api/architectures#CharacterEmbed)
except using [`spacy.StaticVectors.v1`](#StaticVectors_v1) if vectors are
included.
## Layers {#layers}
These functions are available from `@spacy.registry.layers`.
### spacy.StaticVectors.v1 {#StaticVectors_v1}
Identical to [`spacy.StaticVectors.v2`](/api/architectures#StaticVectors) except
for the handling of tokens without vectors.
<Infobox title="Bugs for tokens without vectors" variant="warning">
`spacy.StaticVectors.v1` maps tokens without vectors to the final row in the
vectors table, which causes the model predictions to change if new vectors are
added to an existing vectors table. See more details in
[issue #7662](https://github.com/explosion/spaCy/issues/7662#issuecomment-813925655).
</Infobox>
### spacy.TextCatCNN.v1 {#TextCatCNN_v1} ### spacy.TextCatCNN.v1 {#TextCatCNN_v1}
Since `spacy.TextCatCNN.v2`, this architecture has become resizable, which means Since `spacy.TextCatCNN.v2`, this architecture has become resizable, which means
@ -246,8 +222,35 @@ the others, but may not be as accurate, especially if texts are short.
| `nO` | Output dimension, determined by the number of different labels. If not set, the [`TextCategorizer`](/api/textcategorizer) component will set it when `initialize` is called. ~~Optional[int]~~ | | `nO` | Output dimension, determined by the number of different labels. If not set, the [`TextCategorizer`](/api/textcategorizer) component will set it when `initialize` is called. ~~Optional[int]~~ |
| **CREATES** | The model using the architecture. ~~Model[List[Doc], Floats2d]~~ | | **CREATES** | The model using the architecture. ~~Model[List[Doc], Floats2d]~~ |
### spacy.TransitionBasedParser.v1 {#TransitionBasedParser_v1}
Identical to
[`spacy.TransitionBasedParser.v2`](/api/architectures#TransitionBasedParser)
except the `use_upper` was set to `True` by default.
## Layers {#layers}
These functions are available from `@spacy.registry.layers`.
### spacy.StaticVectors.v1 {#StaticVectors_v1}
Identical to [`spacy.StaticVectors.v2`](/api/architectures#StaticVectors) except
for the handling of tokens without vectors.
<Infobox title="Bugs for tokens without vectors" variant="warning">
`spacy.StaticVectors.v1` maps tokens without vectors to the final row in the
vectors table, which causes the model predictions to change if new vectors are
added to an existing vectors table. See more details in
[issue #7662](https://github.com/explosion/spaCy/issues/7662#issuecomment-813925655).
</Infobox>
## Loggers {#loggers} ## Loggers {#loggers}
Logging utilities for spaCy are implemented in the [`spacy-loggers`](https://github.com/explosion/spacy-loggers) repo, and the functions are typically available from `@spacy.registry.loggers`. Logging utilities for spaCy are implemented in the
[`spacy-loggers`](https://github.com/explosion/spacy-loggers) repo, and the
functions are typically available from `@spacy.registry.loggers`.
More documentation can be found in that repo's [readme](https://github.com/explosion/spacy-loggers/blob/main/README.md) file. More documentation can be found in that repo's
[readme](https://github.com/explosion/spacy-loggers/blob/main/README.md) file.