Add test for old architectures (#10751)

* add v1 and v2 tests for tok2vec architectures
* textcat architectures are not "layers"
* test older textcat architectures
* test older parser architecture
2025-11-18 16:56:07 +03:00 · 2022-05-10 08:24:42 +02:00 · 2022-05-10 08:24:42 +02:00 · 1543558d08
commit 1543558d08
parent 733114bdd9
4 changed files with 119 additions and 53 deletions
--- a/spacy/tests/parser/test_parse.py
+++ b/spacy/tests/parser/test_parse.py
@ -12,6 +12,7 @@ from spacy.vocab import Vocab
 from ...pipeline import DependencyParser
 from ...pipeline.dep_parser import DEFAULT_PARSER_MODEL
 from ..util import apply_transition_sequence, make_tempdir
 from ...pipeline.tok2vec import DEFAULT_TOK2VEC_MODEL
 TRAIN_DATA = [
    (
@ -395,6 +396,34 @@ def test_overfitting_IO(pipe_name):
    assert_equal(batch_deps_1, no_batch_deps)
 # fmt: off
@pytest.mark.slow
@pytest.mark.parametrize("pipe_name", ["parser", "beam_parser"])
@pytest.mark.parametrize(
    "parser_config",
    [
        # TransitionBasedParser V1
        {"@architectures": "spacy.TransitionBasedParser.v1", "tok2vec": DEFAULT_TOK2VEC_MODEL, "state_type": "parser", "extra_state_tokens": False, "hidden_width": 64, "maxout_pieces": 2, "use_upper": True},
        # TransitionBasedParser V2
        {"@architectures": "spacy.TransitionBasedParser.v2", "tok2vec": DEFAULT_TOK2VEC_MODEL, "state_type": "parser", "extra_state_tokens": False, "hidden_width": 64, "maxout_pieces": 2, "use_upper": True},
    ],
)
# fmt: on
def test_parser_configs(pipe_name, parser_config):
    """Smoke-test the v1 and v2 TransitionBasedParser architectures.

    For each combination of pipe ("parser", "beam_parser") and model config,
    the pipe is added to a blank English pipeline, every dependency label
    found in TRAIN_DATA is registered, and the pipeline is initialized and
    updated five times. Nothing is asserted about the losses: the test
    passes as long as none of these steps raises.
    """
    nlp = English()
    parser = nlp.add_pipe(pipe_name, config={"model": parser_config})

    train_examples = []
    for text, annotations in TRAIN_DATA:
        doc = nlp.make_doc(text)
        train_examples.append(Example.from_dict(doc, annotations))
        for label in annotations.get("deps", []):
            parser.add_label(label)

    optimizer = nlp.initialize()
    # Each update gets its own fresh losses dict; the values are not inspected.
    for _ in range(5):
        nlp.update(train_examples, sgd=optimizer, losses={})
 def test_beam_parser_scores():
    # Test that we can get confidence values out of the beam_parser pipe
    beam_width = 16
--- a/spacy/tests/pipeline/test_textcat.py
+++ b/spacy/tests/pipeline/test_textcat.py
@ -382,6 +382,7 @@ def test_implicit_label(name, get_examples):
 # fmt: off
@pytest.mark.slow
@pytest.mark.parametrize(
    "name,textcat_config",
    [
@ -390,7 +391,10 @@ def test_implicit_label(name, get_examples):
        ("textcat", {"@architectures": "spacy.TextCatBOW.v1", "exclusive_classes": True, "no_output_layer": True, "ngram_size": 3}),
        ("textcat_multilabel", {"@architectures": "spacy.TextCatBOW.v1", "exclusive_classes": False, "no_output_layer": False, "ngram_size": 3}),
        ("textcat_multilabel", {"@architectures": "spacy.TextCatBOW.v1", "exclusive_classes": False, "no_output_layer": True, "ngram_size": 3}),
-        # ENSEMBLE
+        # ENSEMBLE V1
        ("textcat", {"@architectures": "spacy.TextCatEnsemble.v1", "exclusive_classes": False, "pretrained_vectors": None, "width": 64, "embed_size": 2000, "conv_depth": 2, "window_size": 1, "ngram_size": 1, "dropout": None}),
        ("textcat_multilabel", {"@architectures": "spacy.TextCatEnsemble.v1", "exclusive_classes": False, "pretrained_vectors": None, "width": 64, "embed_size": 2000, "conv_depth": 2, "window_size": 1, "ngram_size": 1, "dropout": None}),
        # ENSEMBLE V2
        ("textcat", {"@architectures": "spacy.TextCatEnsemble.v2", "tok2vec": DEFAULT_TOK2VEC_MODEL, "linear_model": {"@architectures": "spacy.TextCatBOW.v1", "exclusive_classes": True, "no_output_layer": False, "ngram_size": 3}}),
        ("textcat", {"@architectures": "spacy.TextCatEnsemble.v2", "tok2vec": DEFAULT_TOK2VEC_MODEL, "linear_model": {"@architectures": "spacy.TextCatBOW.v1", "exclusive_classes": True, "no_output_layer": True, "ngram_size": 3}}),
        ("textcat_multilabel", {"@architectures": "spacy.TextCatEnsemble.v2", "tok2vec": DEFAULT_TOK2VEC_MODEL, "linear_model": {"@architectures": "spacy.TextCatBOW.v1", "exclusive_classes": False, "no_output_layer": False, "ngram_size": 3}}),
@ -643,15 +647,28 @@ def test_overfitting_IO_multi():
 # fmt: off
@pytest.mark.slow
@pytest.mark.parametrize(
    "name,train_data,textcat_config",
    [
        # BOW V1
        ("textcat_multilabel", TRAIN_DATA_MULTI_LABEL, {"@architectures": "spacy.TextCatBOW.v1", "exclusive_classes": False, "ngram_size": 1, "no_output_layer": False}),
        ("textcat", TRAIN_DATA_SINGLE_LABEL, {"@architectures": "spacy.TextCatBOW.v1", "exclusive_classes": True, "ngram_size": 4, "no_output_layer": False}),
        # ENSEMBLE V1
        ("textcat_multilabel", TRAIN_DATA_MULTI_LABEL, {"@architectures": "spacy.TextCatEnsemble.v1", "exclusive_classes": False, "pretrained_vectors": None, "width": 64, "embed_size": 2000, "conv_depth": 2, "window_size": 1, "ngram_size": 1, "dropout": None}),
        ("textcat", TRAIN_DATA_SINGLE_LABEL, {"@architectures": "spacy.TextCatEnsemble.v1", "exclusive_classes": False, "pretrained_vectors": None, "width": 64, "embed_size": 2000, "conv_depth": 2, "window_size": 1, "ngram_size": 1, "dropout": None}),
        # CNN V1
        ("textcat", TRAIN_DATA_SINGLE_LABEL, {"@architectures": "spacy.TextCatCNN.v1", "tok2vec": DEFAULT_TOK2VEC_MODEL, "exclusive_classes": True}),
        ("textcat_multilabel", TRAIN_DATA_MULTI_LABEL, {"@architectures": "spacy.TextCatCNN.v1", "tok2vec": DEFAULT_TOK2VEC_MODEL, "exclusive_classes": False}),
        # BOW V2
        ("textcat_multilabel", TRAIN_DATA_MULTI_LABEL, {"@architectures": "spacy.TextCatBOW.v2", "exclusive_classes": False, "ngram_size": 1, "no_output_layer": False}),
        ("textcat", TRAIN_DATA_SINGLE_LABEL, {"@architectures": "spacy.TextCatBOW.v2", "exclusive_classes": True, "ngram_size": 4, "no_output_layer": False}),
        ("textcat_multilabel", TRAIN_DATA_MULTI_LABEL, {"@architectures": "spacy.TextCatBOW.v2", "exclusive_classes": False, "ngram_size": 3, "no_output_layer": True}),
        ("textcat", TRAIN_DATA_SINGLE_LABEL, {"@architectures": "spacy.TextCatBOW.v2", "exclusive_classes": True, "ngram_size": 2, "no_output_layer": True}),
        # ENSEMBLE V2
        ("textcat_multilabel", TRAIN_DATA_MULTI_LABEL, {"@architectures": "spacy.TextCatEnsemble.v2", "tok2vec": DEFAULT_TOK2VEC_MODEL, "linear_model": {"@architectures": "spacy.TextCatBOW.v2", "exclusive_classes": False, "ngram_size": 1, "no_output_layer": False}}),
        ("textcat", TRAIN_DATA_SINGLE_LABEL, {"@architectures": "spacy.TextCatEnsemble.v2", "tok2vec": DEFAULT_TOK2VEC_MODEL, "linear_model": {"@architectures": "spacy.TextCatBOW.v2", "exclusive_classes": True, "ngram_size": 5, "no_output_layer": False}}),
        # CNN V2
        ("textcat", TRAIN_DATA_SINGLE_LABEL, {"@architectures": "spacy.TextCatCNN.v2", "tok2vec": DEFAULT_TOK2VEC_MODEL, "exclusive_classes": True}),
        ("textcat_multilabel", TRAIN_DATA_MULTI_LABEL, {"@architectures": "spacy.TextCatCNN.v2", "tok2vec": DEFAULT_TOK2VEC_MODEL, "exclusive_classes": False}),
    ],
--- a/spacy/tests/pipeline/test_tok2vec.py
+++ b/spacy/tests/pipeline/test_tok2vec.py
@ -1,13 +1,13 @@
 import pytest
 from spacy.ml.models.tok2vec import build_Tok2Vec_model
-from spacy.ml.models.tok2vec import MultiHashEmbed, CharacterEmbed
+from spacy.ml.models.tok2vec import MultiHashEmbed, MaxoutWindowEncoder
 from spacy.ml.models.tok2vec import MishWindowEncoder, MaxoutWindowEncoder
 from spacy.pipeline.tok2vec import Tok2Vec, Tok2VecListener
 from spacy.vocab import Vocab
 from spacy.tokens import Doc
 from spacy.training import Example
 from spacy import util
 from spacy.lang.en import English
 from spacy.util import registry
 from thinc.api import Config, get_current_ops
 from numpy.testing import assert_array_equal
@ -55,24 +55,41 @@ def test_tok2vec_batch_sizes(batch_size, width, embed_size):
        assert doc_vec.shape == (len(doc), width)
@pytest.mark.slow
@pytest.mark.parametrize("width", [8])
@pytest.mark.parametrize(
-    "width,embed_arch,embed_config,encode_arch,encode_config",
+    "embed_arch,embed_config",
    # fmt: off
    [
-        (8, MultiHashEmbed, {"rows": [100, 100], "attrs": ["SHAPE", "LOWER"], "include_static_vectors": False}, MaxoutWindowEncoder, {"window_size": 1, "maxout_pieces": 3, "depth": 2}),
+        ("spacy.MultiHashEmbed.v1", {"rows": [100, 100], "attrs": ["SHAPE", "LOWER"], "include_static_vectors": False}),
-        (8, MultiHashEmbed, {"rows": [100, 20], "attrs": ["ORTH", "PREFIX"], "include_static_vectors": False}, MishWindowEncoder, {"window_size": 1, "depth": 6}),
+        ("spacy.MultiHashEmbed.v1", {"rows": [100, 20], "attrs": ["ORTH", "PREFIX"], "include_static_vectors": False}),
-        (8, CharacterEmbed, {"rows": 100, "nM": 64, "nC": 8, "include_static_vectors": False}, MaxoutWindowEncoder, {"window_size": 1, "maxout_pieces": 3, "depth": 3}),
+        ("spacy.CharacterEmbed.v1", {"rows": 100, "nM": 64, "nC": 8, "include_static_vectors": False}),
-        (8, CharacterEmbed, {"rows": 100, "nM": 16, "nC": 2, "include_static_vectors": False}, MishWindowEncoder, {"window_size": 1, "depth": 3}),
+        ("spacy.CharacterEmbed.v1", {"rows": 100, "nM": 16, "nC": 2, "include_static_vectors": False}),
    ],
    # fmt: on
 )
-def test_tok2vec_configs(width, embed_arch, embed_config, encode_arch, encode_config):
+@pytest.mark.parametrize(
    "tok2vec_arch,encode_arch,encode_config",
    # fmt: off
    [
        ("spacy.Tok2Vec.v1", "spacy.MaxoutWindowEncoder.v1", {"window_size": 1, "maxout_pieces": 3, "depth": 2}),
        ("spacy.Tok2Vec.v2", "spacy.MaxoutWindowEncoder.v2", {"window_size": 1, "maxout_pieces": 3, "depth": 2}),
        ("spacy.Tok2Vec.v1", "spacy.MishWindowEncoder.v1", {"window_size": 1, "depth": 6}),
        ("spacy.Tok2Vec.v2", "spacy.MishWindowEncoder.v2", {"window_size": 1, "depth": 6}),
    ],
    # fmt: on
 )
 def test_tok2vec_configs(
    width, tok2vec_arch, embed_arch, embed_config, encode_arch, encode_config
 ):
    embed = registry.get("architectures", embed_arch)
    encode = registry.get("architectures", encode_arch)
    tok2vec_model = registry.get("architectures", tok2vec_arch)
    embed_config["width"] = width
    encode_config["width"] = width
    docs = get_batch(3)
-    tok2vec = build_Tok2Vec_model(
+    tok2vec = tok2vec_model(embed(**embed_config), encode(**encode_config))
        embed_arch(**embed_config), encode_arch(**encode_config)
    )
    tok2vec.initialize(docs)
    vectors, backprop = tok2vec.begin_update(docs)
    assert len(vectors) == len(docs)
--- a/website/docs/api/legacy.md
+++ b/website/docs/api/legacy.md
@ -103,11 +103,22 @@ and residual connections.
 | `depth`       | The number of convolutional layers. Recommended value is `4`. ~~int~~                                                                                                                                          |
 | **CREATES**   | The model using the architecture. ~~Model[Floats2d, Floats2d]~~                                                                                                                                                |
-### spacy.TransitionBasedParser.v1 {#TransitionBasedParser_v1}
+### spacy.HashEmbedCNN.v1 {#HashEmbedCNN_v1}
-Identical to
+Identical to [`spacy.HashEmbedCNN.v2`](/api/architectures#HashEmbedCNN) except
-[`spacy.TransitionBasedParser.v2`](/api/architectures#TransitionBasedParser)
+using [`spacy.StaticVectors.v1`](#StaticVectors_v1) if vectors are included.
-except the `use_upper` was set to `True` by default.
+
 ### spacy.MultiHashEmbed.v1 {#MultiHashEmbed_v1}
 Identical to [`spacy.MultiHashEmbed.v2`](/api/architectures#MultiHashEmbed)
 except with [`spacy.StaticVectors.v1`](#StaticVectors_v1) if vectors are
 included.
 ### spacy.CharacterEmbed.v1 {#CharacterEmbed_v1}
 Identical to [`spacy.CharacterEmbed.v2`](/api/architectures#CharacterEmbed)
 except using [`spacy.StaticVectors.v1`](#StaticVectors_v1) if vectors are
 included.
 ### spacy.TextCatEnsemble.v1 {#TextCatEnsemble_v1}
@ -147,41 +158,6 @@ network has an internal CNN Tok2Vec layer and uses attention.
 | `nO`                 | Output dimension, determined by the number of different labels. If not set, the [`TextCategorizer`](/api/textcategorizer) component will set it when `initialize` is called. ~~Optional[int]~~ |
 | **CREATES**          | The model using the architecture. ~~Model[List[Doc], Floats2d]~~                                                                                                                               |
 ### spacy.HashEmbedCNN.v1 {#HashEmbedCNN_v1}
 Identical to [`spacy.HashEmbedCNN.v2`](/api/architectures#HashEmbedCNN) except
 using [`spacy.StaticVectors.v1`](#StaticVectors_v1) if vectors are included.
 ### spacy.MultiHashEmbed.v1 {#MultiHashEmbed_v1}
 Identical to [`spacy.MultiHashEmbed.v2`](/api/architectures#MultiHashEmbed)
 except with [`spacy.StaticVectors.v1`](#StaticVectors_v1) if vectors are
 included.
 ### spacy.CharacterEmbed.v1 {#CharacterEmbed_v1}
 Identical to [`spacy.CharacterEmbed.v2`](/api/architectures#CharacterEmbed)
 except using [`spacy.StaticVectors.v1`](#StaticVectors_v1) if vectors are
 included.
 ## Layers {#layers}
 These functions are available from `@spacy.registry.layers`.
 ### spacy.StaticVectors.v1 {#StaticVectors_v1}
 Identical to [`spacy.StaticVectors.v2`](/api/architectures#StaticVectors) except
 for the handling of tokens without vectors.
 <Infobox title="Bugs for tokens without vectors" variant="warning">
 `spacy.StaticVectors.v1` maps tokens without vectors to the final row in the
 vectors table, which causes the model predictions to change if new vectors are
 added to an existing vectors table. See more details in
 [issue #7662](https://github.com/explosion/spaCy/issues/7662#issuecomment-813925655).
 </Infobox>
 ### spacy.TextCatCNN.v1 {#TextCatCNN_v1}
 Since `spacy.TextCatCNN.v2`, this architecture has become resizable, which means
@ -246,8 +222,35 @@ the others, but may not be as accurate, especially if texts are short.
 | `nO`                | Output dimension, determined by the number of different labels. If not set, the [`TextCategorizer`](/api/textcategorizer) component will set it when `initialize` is called. ~~Optional[int]~~ |
 | **CREATES**         | The model using the architecture. ~~Model[List[Doc], Floats2d]~~                                                                                                                               |
 ### spacy.TransitionBasedParser.v1 {#TransitionBasedParser_v1}
 Identical to
 [`spacy.TransitionBasedParser.v2`](/api/architectures#TransitionBasedParser)
 except that `use_upper` was set to `True` by default.
 ## Layers {#layers}
 These functions are available from `@spacy.registry.layers`.
 ### spacy.StaticVectors.v1 {#StaticVectors_v1}
 Identical to [`spacy.StaticVectors.v2`](/api/architectures#StaticVectors) except
 for the handling of tokens without vectors.
 <Infobox title="Bugs for tokens without vectors" variant="warning">
 `spacy.StaticVectors.v1` maps tokens without vectors to the final row in the
 vectors table, which causes the model predictions to change if new vectors are
 added to an existing vectors table. See more details in
 [issue #7662](https://github.com/explosion/spaCy/issues/7662#issuecomment-813925655).
 </Infobox>
 ## Loggers {#loggers}
-Logging utilities for spaCy are implemented in the [`spacy-loggers`](https://github.com/explosion/spacy-loggers) repo, and the functions are typically available from `@spacy.registry.loggers`.
+Logging utilities for spaCy are implemented in the
 [`spacy-loggers`](https://github.com/explosion/spacy-loggers) repo, and the
 functions are typically available from `@spacy.registry.loggers`.
-More documentation can be found in that repo's [readme](https://github.com/explosion/spacy-loggers/blob/main/README.md) file.
+More documentation can be found in that repo's
 [readme](https://github.com/explosion/spacy-loggers/blob/main/README.md) file.