Compare commits

...

8 Commits

Author SHA1 Message Date
Adriane Boyd
1355396051
Set version to v3.1.5 (#10388) 2022-02-28 12:54:14 +01:00
Adriane Boyd
c51c4534d8
Merge pull request #10356 from adrianeboyd/chore/backports-v3.1.5
Backports for v3.1.5
2022-02-28 08:59:13 +01:00
Adriane Boyd
2dc383ae1c Fix spancat for empty docs and zero suggestions (#9654)
* Fix spancat for empty docs and zero suggestions

* Use ops.xp.zeros in test
2022-02-22 18:11:43 +01:00
Adriane Boyd
c69a8756b6
Merge pull request #10345 from adrianeboyd/chore/v3.1-backport-10324
Fix Tok2Vec for empty batches (#10324)
2022-02-21 16:42:09 +01:00
Sofie Van Landeghem
5d0cc79940 fix type of lexeme.rank (#9979) 2022-02-21 15:21:46 +01:00
Adriane Boyd
900741401e Switch to latest CI images (#9773) 2022-02-21 15:00:37 +01:00
Daniël de Kok
fa8f03047d Pin mypy to 0.910 until there is a compatible pydantic version 2022-02-21 14:59:35 +01:00
Adriane Boyd
7c43f8a52d Fix Tok2Vec for empty batches (#10324)
* Add test for tok2vec with vectors and empty docs

* Add shortcut for empty batch in Tok2Vec.predict

* Avoid types
2022-02-21 14:30:35 +01:00
9 changed files with 83 additions and 25 deletions

View File

@@ -23,7 +23,7 @@ jobs:
# defined in .flake8 and overwrites the selected codes. # defined in .flake8 and overwrites the selected codes.
- job: "Validate" - job: "Validate"
pool: pool:
vmImage: "ubuntu-18.04" vmImage: "ubuntu-latest"
steps: steps:
- task: UsePythonVersion@0 - task: UsePythonVersion@0
inputs: inputs:
@@ -39,49 +39,49 @@ jobs:
matrix: matrix:
# We're only running one platform per Python version to speed up builds # We're only running one platform per Python version to speed up builds
Python36Linux: Python36Linux:
imageName: "ubuntu-18.04" imageName: "ubuntu-latest"
python.version: "3.6" python.version: "3.6"
# Python36Windows: # Python36Windows:
# imageName: "windows-2019" # imageName: "windows-latest"
# python.version: "3.6" # python.version: "3.6"
# Python36Mac: # Python36Mac:
# imageName: "macos-10.14" # imageName: "macos-latest"
# python.version: "3.6" # python.version: "3.6"
# Python37Linux: # Python37Linux:
# imageName: "ubuntu-18.04" # imageName: "ubuntu-latest"
# python.version: "3.7" # python.version: "3.7"
Python37Windows: Python37Windows:
imageName: "windows-2019" imageName: "windows-latest"
python.version: "3.7" python.version: "3.7"
# Python37Mac: # Python37Mac:
# imageName: "macos-10.14" # imageName: "macos-latest"
# python.version: "3.7" # python.version: "3.7"
# Python38Linux: # Python38Linux:
# imageName: "ubuntu-18.04" # imageName: "ubuntu-latest"
# python.version: "3.8" # python.version: "3.8"
# Python38Windows: # Python38Windows:
# imageName: "windows-2019" # imageName: "windows-latest"
# python.version: "3.8" # python.version: "3.8"
Python38Mac: Python38Mac:
imageName: "macos-10.14" imageName: "macos-latest"
python.version: "3.8" python.version: "3.8"
Python39Linux: Python39Linux:
imageName: "ubuntu-18.04" imageName: "ubuntu-latest"
python.version: "3.9" python.version: "3.9"
# Python39Windows: # Python39Windows:
# imageName: "windows-2019" # imageName: "windows-latest"
# python.version: "3.9" # python.version: "3.9"
# Python39Mac: # Python39Mac:
# imageName: "macos-10.14" # imageName: "macos-latest"
# python.version: "3.9" # python.version: "3.9"
Python310Linux: Python310Linux:
imageName: "ubuntu-20.04" imageName: "ubuntu-latest"
python.version: "3.10" python.version: "3.10"
Python310Windows: Python310Windows:
imageName: "windows-2019" imageName: "windows-latest"
python.version: "3.10" python.version: "3.10"
Python310Mac: Python310Mac:
imageName: "macos-10.15" imageName: "macos-latest"
python.version: "3.10" python.version: "3.10"
maxParallel: 4 maxParallel: 4
pool: pool:

View File

@@ -29,7 +29,7 @@ pytest-timeout>=1.3.0,<2.0.0
mock>=2.0.0,<3.0.0 mock>=2.0.0,<3.0.0
flake8>=3.8.0,<3.10.0 flake8>=3.8.0,<3.10.0
hypothesis>=3.27.0,<7.0.0 hypothesis>=3.27.0,<7.0.0
mypy>=0.910 mypy==0.910
types-dataclasses>=0.1.3; python_version < "3.7" types-dataclasses>=0.1.3; python_version < "3.7"
types-mock>=0.1.1 types-mock>=0.1.1
types-requests types-requests

View File

@@ -1,6 +1,6 @@
# fmt: off # fmt: off
__title__ = "spacy" __title__ = "spacy"
__version__ = "3.1.4" __version__ = "3.1.5"
__download_url__ = "https://github.com/explosion/spacy-models/releases/download" __download_url__ = "https://github.com/explosion/spacy-models/releases/download"
__compatibility__ = "https://raw.githubusercontent.com/explosion/spacy-models/master/compatibility.json" __compatibility__ = "https://raw.githubusercontent.com/explosion/spacy-models/master/compatibility.json"
__projects__ = "https://github.com/explosion/projects" __projects__ = "https://github.com/explosion/projects"

View File

@@ -19,7 +19,7 @@ class Lexeme:
@property @property
def vector_norm(self) -> float: ... def vector_norm(self) -> float: ...
vector: Floats1d vector: Floats1d
rank: str rank: int
sentiment: float sentiment: float
@property @property
def orth_(self) -> str: ... def orth_(self) -> str: ...

View File

@@ -28,7 +28,13 @@ def forward(
X, spans = source_spans X, spans = source_spans
assert spans.dataXd.ndim == 2 assert spans.dataXd.ndim == 2
indices = _get_span_indices(ops, spans, X.lengths) indices = _get_span_indices(ops, spans, X.lengths)
if len(indices) > 0:
Y = Ragged(X.dataXd[indices], spans.dataXd[:, 1] - spans.dataXd[:, 0]) # type: ignore[arg-type, index] Y = Ragged(X.dataXd[indices], spans.dataXd[:, 1] - spans.dataXd[:, 0]) # type: ignore[arg-type, index]
else:
Y = Ragged(
ops.xp.zeros(X.dataXd.shape, dtype=X.dataXd.dtype),
ops.xp.zeros((len(X.lengths),), dtype="i"),
)
x_shape = X.dataXd.shape x_shape = X.dataXd.shape
x_lengths = X.lengths x_lengths = X.lengths
@@ -53,7 +59,7 @@ def _get_span_indices(ops, spans: Ragged, lengths: Ints1d) -> Ints1d:
for j in range(spans_i.shape[0]): for j in range(spans_i.shape[0]):
indices.append(ops.xp.arange(spans_i[j, 0], spans_i[j, 1])) # type: ignore[call-overload, index] indices.append(ops.xp.arange(spans_i[j, 0], spans_i[j, 1])) # type: ignore[call-overload, index]
offset += length offset += length
return ops.flatten(indices) return ops.flatten(indices, dtype="i", ndim_if_empty=1)
def _ensure_cpu(spans: Ragged, lengths: Ints1d) -> Tuple[Ragged, Ints1d]: def _ensure_cpu(spans: Ragged, lengths: Ints1d) -> Tuple[Ragged, Ints1d]:

View File

@@ -78,7 +78,7 @@ def build_ngram_suggester(sizes: List[int]) -> Suggester:
if len(spans) > 0: if len(spans) > 0:
output = Ragged(ops.xp.vstack(spans), lengths_array) output = Ragged(ops.xp.vstack(spans), lengths_array)
else: else:
output = Ragged(ops.xp.zeros((0, 0)), lengths_array) output = Ragged(ops.xp.zeros((0, 0), dtype="i"), lengths_array)
assert output.dataXd.ndim == 2 assert output.dataXd.ndim == 2
return output return output

View File

@@ -118,6 +118,10 @@ class Tok2Vec(TrainablePipe):
DOCS: https://spacy.io/api/tok2vec#predict DOCS: https://spacy.io/api/tok2vec#predict
""" """
if not any(len(doc) for doc in docs):
# Handle cases where there are no tokens in any docs.
width = self.model.get_dim("nO")
return [self.model.ops.alloc((0, width)) for doc in docs]
tokvecs = self.model.predict(docs) tokvecs = self.model.predict(docs)
batch_id = Tok2VecListener.get_batch_id(docs) batch_id = Tok2VecListener.get_batch_id(docs)
for listener in self.listeners: for listener in self.listeners:

View File

@@ -1,7 +1,7 @@
import pytest import pytest
import numpy import numpy
from numpy.testing import assert_array_equal, assert_almost_equal from numpy.testing import assert_array_equal, assert_almost_equal
from thinc.api import get_current_ops from thinc.api import get_current_ops, Ragged
from spacy import util from spacy import util
from spacy.lang.en import English from spacy.lang.en import English
@@ -29,6 +29,7 @@ TRAIN_DATA_OVERLAPPING = [
"I like London and Berlin", "I like London and Berlin",
{"spans": {SPAN_KEY: [(7, 13, "LOC"), (18, 24, "LOC"), (7, 24, "DOUBLE_LOC")]}}, {"spans": {SPAN_KEY: [(7, 13, "LOC"), (18, 24, "LOC"), (7, 24, "DOUBLE_LOC")]}},
), ),
("", {"spans": {SPAN_KEY: []}}),
] ]
@@ -365,3 +366,31 @@ def test_overfitting_IO_overlapping():
"London and Berlin", "London and Berlin",
} }
assert set([span.label_ for span in spans2]) == {"LOC", "DOUBLE_LOC"} assert set([span.label_ for span in spans2]) == {"LOC", "DOUBLE_LOC"}
def test_zero_suggestions():
# Test with a suggester that returns 0 suggestions
@registry.misc("test_zero_suggester")
def make_zero_suggester():
def zero_suggester(docs, *, ops=None):
if ops is None:
ops = get_current_ops()
return Ragged(
ops.xp.zeros((0, 0), dtype="i"), ops.xp.zeros((len(docs),), dtype="i")
)
return zero_suggester
fix_random_seed(0)
nlp = English()
spancat = nlp.add_pipe(
"spancat",
config={"suggester": {"@misc": "test_zero_suggester"}, "spans_key": SPAN_KEY},
)
train_examples = make_examples(nlp)
optimizer = nlp.initialize(get_examples=lambda: train_examples)
assert spancat.model.get_dim("nO") == 2
assert set(spancat.labels) == {"LOC", "PERSON"}
nlp.update(train_examples, sgd=optimizer)

View File

@@ -11,7 +11,7 @@ from spacy.lang.en import English
from thinc.api import Config, get_current_ops from thinc.api import Config, get_current_ops
from numpy.testing import assert_array_equal from numpy.testing import assert_array_equal
from ..util import get_batch, make_tempdir from ..util import get_batch, make_tempdir, add_vecs_to_vocab
def test_empty_doc(): def test_empty_doc():
@@ -140,9 +140,25 @@ TRAIN_DATA = [
] ]
def test_tok2vec_listener(): @pytest.mark.parametrize("with_vectors", (False, True))
def test_tok2vec_listener(with_vectors):
orig_config = Config().from_str(cfg_string) orig_config = Config().from_str(cfg_string)
orig_config["components"]["tok2vec"]["model"]["embed"][
"include_static_vectors"
] = with_vectors
nlp = util.load_model_from_config(orig_config, auto_fill=True, validate=True) nlp = util.load_model_from_config(orig_config, auto_fill=True, validate=True)
if with_vectors:
ops = get_current_ops()
vectors = [
("apple", ops.asarray([1, 2, 3])),
("orange", ops.asarray([-1, -2, -3])),
("and", ops.asarray([-1, -1, -1])),
("juice", ops.asarray([5, 5, 10])),
("pie", ops.asarray([7, 6.3, 8.9])),
]
add_vecs_to_vocab(nlp.vocab, vectors)
assert nlp.pipe_names == ["tok2vec", "tagger"] assert nlp.pipe_names == ["tok2vec", "tagger"]
tagger = nlp.get_pipe("tagger") tagger = nlp.get_pipe("tagger")
tok2vec = nlp.get_pipe("tok2vec") tok2vec = nlp.get_pipe("tok2vec")
@@ -169,6 +185,9 @@ def test_tok2vec_listener():
ops = get_current_ops() ops = get_current_ops()
assert_array_equal(ops.to_numpy(doc.tensor), ops.to_numpy(doc_tensor)) assert_array_equal(ops.to_numpy(doc.tensor), ops.to_numpy(doc_tensor))
# test with empty doc
doc = nlp("")
# TODO: should this warn or error? # TODO: should this warn or error?
nlp.select_pipes(disable="tok2vec") nlp.select_pipes(disable="tok2vec")
assert nlp.pipe_names == ["tagger"] assert nlp.pipe_names == ["tagger"]