Set version to v3.1.5 (#10388)

Merge pull request #10356 from adrianeboyd/chore/backports-v3.1.5
Backports for v3.1.5
2025-08-05 04:40:20 +03:00 · 2022-02-28 12:54:14 +01:00 · 2022-02-28 08:59:13 +01:00 · 2022-02-22 18:11:43 +01:00 · 2022-02-21 16:42:09 +01:00 · 2022-02-21 15:21:46 +01:00
9 changed files with 83 additions and 25 deletions
--- a/azure-pipelines.yml
+++ b/azure-pipelines.yml
@@ -23,7 +23,7 @@ jobs:
  # defined in .flake8 and overwrites the selected codes.
  - job: "Validate"
    pool:
-      vmImage: "ubuntu-18.04"
+      vmImage: "ubuntu-latest"
    steps:
      - task: UsePythonVersion@0
        inputs:
@@ -39,49 +39,49 @@ jobs:
      matrix:
        # We're only running one platform per Python version to speed up builds
        Python36Linux:
-          imageName: "ubuntu-18.04"
+          imageName: "ubuntu-latest"
          python.version: "3.6"
        #        Python36Windows:
-        #          imageName: "windows-2019"
+        #          imageName: "windows-latest"
        #          python.version: "3.6"
        #        Python36Mac:
-        #          imageName: "macos-10.14"
+        #          imageName: "macos-latest"
        #          python.version: "3.6"
        #        Python37Linux:
-        #          imageName: "ubuntu-18.04"
+        #          imageName: "ubuntu-latest"
        #          python.version: "3.7"
        Python37Windows:
-          imageName: "windows-2019"
+          imageName: "windows-latest"
          python.version: "3.7"
        #        Python37Mac:
-        #          imageName: "macos-10.14"
+        #          imageName: "macos-latest"
        #          python.version: "3.7"
        #        Python38Linux:
-        #          imageName: "ubuntu-18.04"
+        #          imageName: "ubuntu-latest"
        #          python.version: "3.8"
        #        Python38Windows:
-        #          imageName: "windows-2019"
+        #          imageName: "windows-latest"
        #          python.version: "3.8"
        Python38Mac:
-          imageName: "macos-10.14"
+          imageName: "macos-latest"
          python.version: "3.8"
        Python39Linux:
-          imageName: "ubuntu-18.04"
+          imageName: "ubuntu-latest"
          python.version: "3.9"
        #        Python39Windows:
-        #          imageName: "windows-2019"
+        #          imageName: "windows-latest"
        #          python.version: "3.9"
        #        Python39Mac:
-        #          imageName: "macos-10.14"
+        #          imageName: "macos-latest"
        #          python.version: "3.9"
        Python310Linux:
-          imageName: "ubuntu-20.04"
+          imageName: "ubuntu-latest"
          python.version: "3.10"
        Python310Windows:
-          imageName: "windows-2019"
+          imageName: "windows-latest"
          python.version: "3.10"
        Python310Mac:
-          imageName: "macos-10.15"
+          imageName: "macos-latest"
          python.version: "3.10"
      maxParallel: 4
    pool:
--- a/requirements.txt
+++ b/requirements.txt
@@ -29,7 +29,7 @@ pytest-timeout>=1.3.0,<2.0.0
 mock>=2.0.0,<3.0.0
 flake8>=3.8.0,<3.10.0
 hypothesis>=3.27.0,<7.0.0
-mypy>=0.910
+mypy==0.910
 types-dataclasses>=0.1.3; python_version < "3.7"
 types-mock>=0.1.1
 types-requests
--- a/spacy/about.py
+++ b/spacy/about.py
@@ -1,6 +1,6 @@
 # fmt: off
 __title__ = "spacy"
-__version__ = "3.1.4"
+__version__ = "3.1.5"
 __download_url__ = "https://github.com/explosion/spacy-models/releases/download"
 __compatibility__ = "https://raw.githubusercontent.com/explosion/spacy-models/master/compatibility.json"
 __projects__ = "https://github.com/explosion/projects"
--- a/spacy/lexeme.pyi
+++ b/spacy/lexeme.pyi
@@ -19,7 +19,7 @@ class Lexeme:
    @property
    def vector_norm(self) -> float: ...
    vector: Floats1d
-    rank: str
+    rank: int
    sentiment: float
    @property
    def orth_(self) -> str: ...
--- a/spacy/ml/extract_spans.py
+++ b/spacy/ml/extract_spans.py
@@ -28,7 +28,13 @@ def forward(
    X, spans = source_spans
    assert spans.dataXd.ndim == 2
    indices = _get_span_indices(ops, spans, X.lengths)
-    Y = Ragged(X.dataXd[indices], spans.dataXd[:, 1] - spans.dataXd[:, 0])  # type: ignore[arg-type, index]
+    if len(indices) > 0:
+        Y = Ragged(X.dataXd[indices], spans.dataXd[:, 1] - spans.dataXd[:, 0])  # type: ignore[arg-type, index]
+    else:
+        Y = Ragged(
+            ops.xp.zeros(X.dataXd.shape, dtype=X.dataXd.dtype),
+            ops.xp.zeros((len(X.lengths),), dtype="i"),
+        )
    x_shape = X.dataXd.shape
    x_lengths = X.lengths

@@ -53,7 +59,7 @@ def _get_span_indices(ops, spans: Ragged, lengths: Ints1d) -> Ints1d:
        for j in range(spans_i.shape[0]):
            indices.append(ops.xp.arange(spans_i[j, 0], spans_i[j, 1]))  # type: ignore[call-overload, index]
        offset += length
-    return ops.flatten(indices)
+    return ops.flatten(indices, dtype="i", ndim_if_empty=1)


 def _ensure_cpu(spans: Ragged, lengths: Ints1d) -> Tuple[Ragged, Ints1d]:
--- a/spacy/pipeline/spancat.py
+++ b/spacy/pipeline/spancat.py
@@ -78,7 +78,7 @@ def build_ngram_suggester(sizes: List[int]) -> Suggester:
        if len(spans) > 0:
            output = Ragged(ops.xp.vstack(spans), lengths_array)
        else:
-            output = Ragged(ops.xp.zeros((0, 0)), lengths_array)
+            output = Ragged(ops.xp.zeros((0, 0), dtype="i"), lengths_array)

        assert output.dataXd.ndim == 2
        return output
--- a/spacy/pipeline/tok2vec.py
+++ b/spacy/pipeline/tok2vec.py
@@ -118,6 +118,10 @@ class Tok2Vec(TrainablePipe):

        DOCS: https://spacy.io/api/tok2vec#predict
        """
+        if not any(len(doc) for doc in docs):
+            # Handle cases where there are no tokens in any docs.
+            width = self.model.get_dim("nO")
+            return [self.model.ops.alloc((0, width)) for doc in docs]
        tokvecs = self.model.predict(docs)
        batch_id = Tok2VecListener.get_batch_id(docs)
        for listener in self.listeners:
--- a/spacy/tests/pipeline/test_spancat.py
+++ b/spacy/tests/pipeline/test_spancat.py
@@ -1,7 +1,7 @@
 import pytest
 import numpy
 from numpy.testing import assert_array_equal, assert_almost_equal
-from thinc.api import get_current_ops
+from thinc.api import get_current_ops, Ragged

 from spacy import util
 from spacy.lang.en import English
@@ -29,6 +29,7 @@ TRAIN_DATA_OVERLAPPING = [
        "I like London and Berlin",
        {"spans": {SPAN_KEY: [(7, 13, "LOC"), (18, 24, "LOC"), (7, 24, "DOUBLE_LOC")]}},
    ),
+    ("", {"spans": {SPAN_KEY: []}}),
 ]


@@ -365,3 +366,31 @@ def test_overfitting_IO_overlapping():
            "London and Berlin",
        }
        assert set([span.label_ for span in spans2]) == {"LOC", "DOUBLE_LOC"}
+
+
+def test_zero_suggestions():
+    # Test with a suggester that returns 0 suggestions
+
+    @registry.misc("test_zero_suggester")
+    def make_zero_suggester():
+        def zero_suggester(docs, *, ops=None):
+            if ops is None:
+                ops = get_current_ops()
+            return Ragged(
+                ops.xp.zeros((0, 0), dtype="i"), ops.xp.zeros((len(docs),), dtype="i")
+            )
+
+        return zero_suggester
+
+    fix_random_seed(0)
+    nlp = English()
+    spancat = nlp.add_pipe(
+        "spancat",
+        config={"suggester": {"@misc": "test_zero_suggester"}, "spans_key": SPAN_KEY},
+    )
+    train_examples = make_examples(nlp)
+    optimizer = nlp.initialize(get_examples=lambda: train_examples)
+    assert spancat.model.get_dim("nO") == 2
+    assert set(spancat.labels) == {"LOC", "PERSON"}
+
+    nlp.update(train_examples, sgd=optimizer)
--- a/spacy/tests/pipeline/test_tok2vec.py
+++ b/spacy/tests/pipeline/test_tok2vec.py
@@ -11,7 +11,7 @@ from spacy.lang.en import English
 from thinc.api import Config, get_current_ops
 from numpy.testing import assert_array_equal

-from ..util import get_batch, make_tempdir
+from ..util import get_batch, make_tempdir, add_vecs_to_vocab


 def test_empty_doc():
@@ -140,9 +140,25 @@ TRAIN_DATA = [
 ]


-def test_tok2vec_listener():
+@pytest.mark.parametrize("with_vectors", (False, True))
+def test_tok2vec_listener(with_vectors):
    orig_config = Config().from_str(cfg_string)
+    orig_config["components"]["tok2vec"]["model"]["embed"][
+        "include_static_vectors"
+    ] = with_vectors
    nlp = util.load_model_from_config(orig_config, auto_fill=True, validate=True)
+
+    if with_vectors:
+        ops = get_current_ops()
+        vectors = [
+            ("apple", ops.asarray([1, 2, 3])),
+            ("orange", ops.asarray([-1, -2, -3])),
+            ("and", ops.asarray([-1, -1, -1])),
+            ("juice", ops.asarray([5, 5, 10])),
+            ("pie", ops.asarray([7, 6.3, 8.9])),
+        ]
+        add_vecs_to_vocab(nlp.vocab, vectors)
+
    assert nlp.pipe_names == ["tok2vec", "tagger"]
    tagger = nlp.get_pipe("tagger")
    tok2vec = nlp.get_pipe("tok2vec")
@@ -169,6 +185,9 @@ def test_tok2vec_listener():
    ops = get_current_ops()
    assert_array_equal(ops.to_numpy(doc.tensor), ops.to_numpy(doc_tensor))

+    # test with empty doc
+    doc = nlp("")
+
    # TODO: should this warn or error?
    nlp.select_pipes(disable="tok2vec")
    assert nlp.pipe_names == ["tagger"]
Author	SHA1	Message	Date
Adriane Boyd	1355396051	Set version to v3.1.5 (#10388)	2022-02-28 12:54:14 +01:00
Adriane Boyd	c51c4534d8	Merge pull request #10356 from adrianeboyd/chore/backports-v3.1.5 Backports for v3.1.5	2022-02-28 08:59:13 +01:00
Adriane Boyd	2dc383ae1c	Fix spancat for empty docs and zero suggestions (#9654) * Fix spancat for empty docs and zero suggestions * Use ops.xp.zeros in test	2022-02-22 18:11:43 +01:00
Adriane Boyd	c69a8756b6	Merge pull request #10345 from adrianeboyd/chore/v3.1-backport-10324 Fix Tok2Vec for empty batches (#10324)	2022-02-21 16:42:09 +01:00
Sofie Van Landeghem	5d0cc79940	fix type of lexeme.rank (#9979)	2022-02-21 15:21:46 +01:00
Adriane Boyd	900741401e	Switch to latest CI images (#9773)	2022-02-21 15:00:37 +01:00
Daniël de Kok	fa8f03047d	Pin mypy to 0.910 until there is a compatible pydantic version	2022-02-21 14:59:35 +01:00
Adriane Boyd	7c43f8a52d	Fix Tok2Vec for empty batches (#10324) * Add test for tok2vec with vectors and empty docs * Add shortcut for empty batch in Tok2Vec.predict * Avoid types	2022-02-21 14:30:35 +01:00