From 7c43f8a52d537379202afc1a0131fa0966a83b8f Mon Sep 17 00:00:00 2001 From: Adriane Boyd Date: Mon, 21 Feb 2022 10:22:36 +0100 Subject: [PATCH 1/4] Fix Tok2Vec for empty batches (#10324) * Add test for tok2vec with vectors and empty docs * Add shortcut for empty batch in Tok2Vec.predict * Avoid types --- spacy/pipeline/tok2vec.py | 4 ++++ spacy/tests/pipeline/test_tok2vec.py | 23 +++++++++++++++++++++-- 2 files changed, 25 insertions(+), 2 deletions(-) diff --git a/spacy/pipeline/tok2vec.py b/spacy/pipeline/tok2vec.py index cb601e5dc..2e3dde3cb 100644 --- a/spacy/pipeline/tok2vec.py +++ b/spacy/pipeline/tok2vec.py @@ -118,6 +118,10 @@ class Tok2Vec(TrainablePipe): DOCS: https://spacy.io/api/tok2vec#predict """ + if not any(len(doc) for doc in docs): + # Handle cases where there are no tokens in any docs. + width = self.model.get_dim("nO") + return [self.model.ops.alloc((0, width)) for doc in docs] tokvecs = self.model.predict(docs) batch_id = Tok2VecListener.get_batch_id(docs) for listener in self.listeners: diff --git a/spacy/tests/pipeline/test_tok2vec.py b/spacy/tests/pipeline/test_tok2vec.py index eeea906bb..a5ac85e1e 100644 --- a/spacy/tests/pipeline/test_tok2vec.py +++ b/spacy/tests/pipeline/test_tok2vec.py @@ -11,7 +11,7 @@ from spacy.lang.en import English from thinc.api import Config, get_current_ops from numpy.testing import assert_array_equal -from ..util import get_batch, make_tempdir +from ..util import get_batch, make_tempdir, add_vecs_to_vocab def test_empty_doc(): @@ -140,9 +140,25 @@ TRAIN_DATA = [ ] -def test_tok2vec_listener(): +@pytest.mark.parametrize("with_vectors", (False, True)) +def test_tok2vec_listener(with_vectors): orig_config = Config().from_str(cfg_string) + orig_config["components"]["tok2vec"]["model"]["embed"][ + "include_static_vectors" + ] = with_vectors nlp = util.load_model_from_config(orig_config, auto_fill=True, validate=True) + + if with_vectors: + ops = get_current_ops() + vectors = [ + ("apple", ops.asarray([1, 2, 3])), + ("orange", ops.asarray([-1, -2, -3])), + ("and", ops.asarray([-1, -1, -1])), + ("juice", ops.asarray([5, 5, 10])), + ("pie", ops.asarray([7, 6.3, 8.9])), + ] + add_vecs_to_vocab(nlp.vocab, vectors) + assert nlp.pipe_names == ["tok2vec", "tagger"] tagger = nlp.get_pipe("tagger") tok2vec = nlp.get_pipe("tok2vec") @@ -169,6 +185,9 @@ def test_tok2vec_listener(): ops = get_current_ops() assert_array_equal(ops.to_numpy(doc.tensor), ops.to_numpy(doc_tensor)) + # test with empty doc + doc = nlp("") + # TODO: should this warn or error? nlp.select_pipes(disable="tok2vec") assert nlp.pipe_names == ["tagger"] From fa8f03047d81255f95e324e0a5bd9059c6c6d214 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Danie=CC=88l=20de=20Kok?= Date: Thu, 16 Dec 2021 09:31:45 +0100 Subject: [PATCH 2/4] Pin mypy to 0.910 until there is a compatible pydantic version --- requirements.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/requirements.txt b/requirements.txt index 36cf5c58e..03ecbc9cc 100644 --- a/requirements.txt +++ b/requirements.txt @@ -29,7 +29,7 @@ pytest-timeout>=1.3.0,<2.0.0 mock>=2.0.0,<3.0.0 flake8>=3.8.0,<3.10.0 hypothesis>=3.27.0,<7.0.0 -mypy>=0.910 +mypy==0.910 types-dataclasses>=0.1.3; python_version < "3.7" types-mock>=0.1.1 types-requests From 900741401eca8566decd823493fb3bbf97c80981 Mon Sep 17 00:00:00 2001 From: Adriane Boyd Date: Tue, 30 Nov 2021 10:08:51 +0100 Subject: [PATCH 3/4] Switch to latest CI images (#9773) --- azure-pipelines.yml | 32 ++++++++++++++++---------------- 1 file changed, 16 insertions(+), 16 deletions(-) diff --git a/azure-pipelines.yml b/azure-pipelines.yml index 4291b6e0a..71a793911 100644 --- a/azure-pipelines.yml +++ b/azure-pipelines.yml @@ -23,7 +23,7 @@ jobs: # defined in .flake8 and overwrites the selected codes. - job: "Validate" pool: - vmImage: "ubuntu-18.04" + vmImage: "ubuntu-latest" steps: - task: UsePythonVersion@0 inputs: @@ -39,49 +39,49 @@ jobs: matrix: # We're only running one platform per Python version to speed up builds Python36Linux: - imageName: "ubuntu-18.04" + imageName: "ubuntu-latest" python.version: "3.6" # Python36Windows: - # imageName: "windows-2019" + # imageName: "windows-latest" # python.version: "3.6" # Python36Mac: - # imageName: "macos-10.14" + # imageName: "macos-latest" # python.version: "3.6" # Python37Linux: - # imageName: "ubuntu-18.04" + # imageName: "ubuntu-latest" # python.version: "3.7" Python37Windows: - imageName: "windows-2019" + imageName: "windows-latest" python.version: "3.7" # Python37Mac: - # imageName: "macos-10.14" + # imageName: "macos-latest" # python.version: "3.7" # Python38Linux: - # imageName: "ubuntu-18.04" + # imageName: "ubuntu-latest" # python.version: "3.8" # Python38Windows: - # imageName: "windows-2019" + # imageName: "windows-latest" # python.version: "3.8" Python38Mac: - imageName: "macos-10.14" + imageName: "macos-latest" python.version: "3.8" Python39Linux: - imageName: "ubuntu-18.04" + imageName: "ubuntu-latest" python.version: "3.9" # Python39Windows: - # imageName: "windows-2019" + # imageName: "windows-latest" # python.version: "3.9" # Python39Mac: - # imageName: "macos-10.14" + # imageName: "macos-latest" # python.version: "3.9" Python310Linux: - imageName: "ubuntu-20.04" + imageName: "ubuntu-latest" python.version: "3.10" Python310Windows: - imageName: "windows-2019" + imageName: "windows-latest" python.version: "3.10" Python310Mac: - imageName: "macos-10.15" + imageName: "macos-latest" python.version: "3.10" maxParallel: 4 pool: From 5d0cc7994022b7dbc42f405aa6695916f6c3ae03 Mon Sep 17 00:00:00 2001 From: Sofie Van Landeghem Date: Tue, 4 Jan 2022 13:15:25 +0100 Subject: [PATCH 4/4] fix type of lexeme.rank (#9979) --- spacy/lexeme.pyi | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/spacy/lexeme.pyi b/spacy/lexeme.pyi index 4eae6be43..4fcaa82cf 100644 --- a/spacy/lexeme.pyi +++ b/spacy/lexeme.pyi @@ -19,7 +19,7 @@ class Lexeme: @property def vector_norm(self) -> float: ... vector: Floats1d - rank: str + rank: int sentiment: float @property def orth_(self) -> str: ...