From 260cb9c6febc5c7c2c17686d4f4e62c13b6833b9 Mon Sep 17 00:00:00 2001
From: Adriane Boyd
Date: Mon, 6 Mar 2023 18:06:31 +0100
Subject: [PATCH] Raise error for non-default vectors with PretrainVectors
 (#12366)

---
 spacy/errors.py                          |  2 +
 spacy/ml/models/multi_task.py            |  3 ++
 spacy/tests/training/test_pretraining.py | 47 +++++++++++++++++++-----
 3 files changed, 43 insertions(+), 9 deletions(-)

diff --git a/spacy/errors.py b/spacy/errors.py
index 2c8b98aad..1047ed21a 100644
--- a/spacy/errors.py
+++ b/spacy/errors.py
@@ -549,6 +549,8 @@ class Errors(metaclass=ErrorsWithCodes):
             "during training, make sure to include it in 'annotating components'")
 
     # New errors added in v3.x
+    E850 = ("The PretrainVectors objective currently only supports default "
+            "vectors, not {mode} vectors.")
     E851 = ("The 'textcat' component labels should only have values of 0 or 1, "
             "but found value of '{val}'.")
     E852 = ("The tar file pulled from the remote attempted an unsafe path "
diff --git a/spacy/ml/models/multi_task.py b/spacy/ml/models/multi_task.py
index a7d67c6dd..826fddd4f 100644
--- a/spacy/ml/models/multi_task.py
+++ b/spacy/ml/models/multi_task.py
@@ -8,6 +8,7 @@ from thinc.loss import Loss
 from ...util import registry, OOV_RANK
 from ...errors import Errors
 from ...attrs import ID
+from ...vectors import Mode as VectorsMode
 
 import numpy
 from functools import partial
@@ -23,6 +24,8 @@ def create_pretrain_vectors(
     maxout_pieces: int, hidden_size: int, loss: str
 ) -> Callable[["Vocab", Model], Model]:
     def create_vectors_objective(vocab: "Vocab", tok2vec: Model) -> Model:
+        if vocab.vectors.mode != VectorsMode.default:
+            raise ValueError(Errors.E850.format(mode=vocab.vectors.mode))
         if vocab.vectors.shape[1] == 0:
             raise ValueError(Errors.E875)
         model = build_cloze_multi_task_model(
diff --git a/spacy/tests/training/test_pretraining.py b/spacy/tests/training/test_pretraining.py
index 9359c8485..c0d64f1e7 100644
--- a/spacy/tests/training/test_pretraining.py
+++ b/spacy/tests/training/test_pretraining.py
@@ -2,17 +2,19 @@ from pathlib import Path
 import numpy as np
 import pytest
 import srsly
-from spacy.vocab import Vocab
-from thinc.api import Config
+from thinc.api import Config, get_current_ops
+from spacy import util
+from spacy.lang.en import English
+from spacy.training.initialize import init_nlp
+from spacy.training.loop import train
+from spacy.training.pretrain import pretrain
+from spacy.tokens import Doc, DocBin
+from spacy.language import DEFAULT_CONFIG_PRETRAIN_PATH, DEFAULT_CONFIG_PATH
+from spacy.ml.models.multi_task import create_pretrain_vectors
+from spacy.vectors import Vectors
+from spacy.vocab import Vocab
 
 from ..util import make_tempdir
-from ... import util
-from ...lang.en import English
-from ...training.initialize import init_nlp
-from ...training.loop import train
-from ...training.pretrain import pretrain
-from ...tokens import Doc, DocBin
-from ...language import DEFAULT_CONFIG_PRETRAIN_PATH, DEFAULT_CONFIG_PATH
 
 pretrain_string_listener = """
 [nlp]
@@ -346,3 +348,30 @@ def write_vectors_model(tmp_dir):
     nlp = English(vocab)
     nlp.to_disk(nlp_path)
     return str(nlp_path)
+
+
+def test_pretrain_default_vectors():
+    nlp = English()
+    nlp.add_pipe("tok2vec")
+    nlp.initialize()
+
+    # default vectors are supported
+    nlp.vocab.vectors = Vectors(shape=(10, 10))
+    create_pretrain_vectors(1, 1, "cosine")(nlp.vocab, nlp.get_pipe("tok2vec").model)
+
+    # error for no vectors
+    with pytest.raises(ValueError, match="E875"):
+        nlp.vocab.vectors = Vectors()
+        create_pretrain_vectors(1, 1, "cosine")(
+            nlp.vocab, nlp.get_pipe("tok2vec").model
+        )
+
+    # error for floret vectors
+    with pytest.raises(ValueError, match="E850"):
+        ops = get_current_ops()
+        nlp.vocab.vectors = Vectors(
+            data=ops.xp.zeros((10, 10)), mode="floret", hash_count=1
+        )
+        create_pretrain_vectors(1, 1, "cosine")(
+            nlp.vocab, nlp.get_pipe("tok2vec").model
+        )