mirror of https://github.com/explosion/spaCy.git (synced 2025-08-05 04:40:20 +03:00)
Compare commits
8 Commits
Author | SHA1 | Date
---|---|---
 | 1355396051 |
 | c51c4534d8 |
 | 2dc383ae1c |
 | c69a8756b6 |
 | 5d0cc79940 |
 | 900741401e |
 | fa8f03047d |
 | 7c43f8a52d |
```diff
@@ -23,7 +23,7 @@ jobs:
   # defined in .flake8 and overwrites the selected codes.
   - job: "Validate"
     pool:
-      vmImage: "ubuntu-18.04"
+      vmImage: "ubuntu-latest"
     steps:
       - task: UsePythonVersion@0
         inputs:
@@ -39,49 +39,49 @@ jobs:
       matrix:
         # We're only running one platform per Python version to speed up builds
         Python36Linux:
-          imageName: "ubuntu-18.04"
+          imageName: "ubuntu-latest"
           python.version: "3.6"
         # Python36Windows:
-        #   imageName: "windows-2019"
+        #   imageName: "windows-latest"
        #   python.version: "3.6"
         # Python36Mac:
-        #   imageName: "macos-10.14"
+        #   imageName: "macos-latest"
        #   python.version: "3.6"
         # Python37Linux:
-        #   imageName: "ubuntu-18.04"
+        #   imageName: "ubuntu-latest"
        #   python.version: "3.7"
         Python37Windows:
-          imageName: "windows-2019"
+          imageName: "windows-latest"
           python.version: "3.7"
         # Python37Mac:
-        #   imageName: "macos-10.14"
+        #   imageName: "macos-latest"
        #   python.version: "3.7"
         # Python38Linux:
-        #   imageName: "ubuntu-18.04"
+        #   imageName: "ubuntu-latest"
        #   python.version: "3.8"
         # Python38Windows:
-        #   imageName: "windows-2019"
+        #   imageName: "windows-latest"
        #   python.version: "3.8"
         Python38Mac:
-          imageName: "macos-10.14"
+          imageName: "macos-latest"
           python.version: "3.8"
         Python39Linux:
-          imageName: "ubuntu-18.04"
+          imageName: "ubuntu-latest"
           python.version: "3.9"
         # Python39Windows:
-        #   imageName: "windows-2019"
+        #   imageName: "windows-latest"
        #   python.version: "3.9"
         # Python39Mac:
-        #   imageName: "macos-10.14"
+        #   imageName: "macos-latest"
        #   python.version: "3.9"
         Python310Linux:
-          imageName: "ubuntu-20.04"
+          imageName: "ubuntu-latest"
           python.version: "3.10"
         Python310Windows:
-          imageName: "windows-2019"
+          imageName: "windows-latest"
           python.version: "3.10"
         Python310Mac:
-          imageName: "macos-10.15"
+          imageName: "macos-latest"
           python.version: "3.10"
       maxParallel: 4
     pool:
```

```diff
@@ -29,7 +29,7 @@ pytest-timeout>=1.3.0,<2.0.0
 mock>=2.0.0,<3.0.0
 flake8>=3.8.0,<3.10.0
 hypothesis>=3.27.0,<7.0.0
-mypy>=0.910
+mypy==0.910
 types-dataclasses>=0.1.3; python_version < "3.7"
 types-mock>=0.1.1
 types-requests
```

```diff
@@ -1,6 +1,6 @@
 # fmt: off
 __title__ = "spacy"
-__version__ = "3.1.4"
+__version__ = "3.1.5"
 __download_url__ = "https://github.com/explosion/spacy-models/releases/download"
 __compatibility__ = "https://raw.githubusercontent.com/explosion/spacy-models/master/compatibility.json"
 __projects__ = "https://github.com/explosion/projects"
```

```diff
@@ -19,7 +19,7 @@ class Lexeme:
     @property
     def vector_norm(self) -> float: ...
     vector: Floats1d
-    rank: str
+    rank: int
     sentiment: float
     @property
     def orth_(self) -> str: ...
```

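The hunk above is from the `Lexeme` type stub (presumably `spacy/lexeme.pyi`). At runtime `Lexeme.rank` is a sequential integer ID used to index into tables such as the vector table, so annotating it as `str` was incorrect. A minimal sketch to confirm the runtime type; the word "apple" is just an arbitrary example:

```python
# Minimal sketch (assumes spaCy is installed); "apple" is an arbitrary example word.
import spacy

nlp = spacy.blank("en")
lex = nlp.vocab["apple"]   # looks up (and creates) the lexeme
print(type(lex.rank))      # <class 'int'>, matching the corrected stub
print(lex.orth_, lex.rank)
```
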
```diff
@@ -28,7 +28,13 @@ def forward(
     X, spans = source_spans
     assert spans.dataXd.ndim == 2
     indices = _get_span_indices(ops, spans, X.lengths)
-    Y = Ragged(X.dataXd[indices], spans.dataXd[:, 1] - spans.dataXd[:, 0])  # type: ignore[arg-type, index]
+    if len(indices) > 0:
+        Y = Ragged(X.dataXd[indices], spans.dataXd[:, 1] - spans.dataXd[:, 0])  # type: ignore[arg-type, index]
+    else:
+        Y = Ragged(
+            ops.xp.zeros(X.dataXd.shape, dtype=X.dataXd.dtype),
+            ops.xp.zeros((len(X.lengths),), dtype="i"),
+        )
     x_shape = X.dataXd.shape
     x_lengths = X.lengths
 
```

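The guard added above covers the case where no spans are suggested at all: `indices` is then empty, and the fancy indexing `X.dataXd[indices]` is only well defined if that empty array has an integer dtype. A standalone numpy sketch (plain arrays standing in for the `Ragged` data) of why the dtype of an empty index array matters:

```python
# Standalone numpy sketch; plain arrays stand in for Ragged.dataXd.
import numpy

X = numpy.zeros((4, 8), dtype="f")        # 4 tokens, width 8
empty_int = numpy.zeros((0,), dtype="i")  # empty *integer* index array
empty_float = numpy.zeros((0,))           # empty float64 array

print(X[empty_int].shape)                 # (0, 8): a well-defined empty result
try:
    X[empty_float]                        # non-integer indices are rejected
except IndexError as err:
    print("indexing fails:", err)
```
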
```diff
@@ -53,7 +59,7 @@ def _get_span_indices(ops, spans: Ragged, lengths: Ints1d) -> Ints1d:
         for j in range(spans_i.shape[0]):
             indices.append(ops.xp.arange(spans_i[j, 0], spans_i[j, 1]))  # type: ignore[call-overload, index]
         offset += length
-    return ops.flatten(indices)
+    return ops.flatten(indices, dtype="i", ndim_if_empty=1)
 
 
 def _ensure_cpu(spans: Ragged, lengths: Ints1d) -> Tuple[Ragged, Ints1d]:
```

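Passing `dtype="i"` and `ndim_if_empty=1` to `ops.flatten` ensures that, when there are no span indices to concatenate, the result is an empty one-dimensional integer array rather than an untyped one, which keeps the indexing in `forward` above well defined. A hedged sketch of just that call, relying only on the keyword arguments used in the change itself:

```python
# Hedged sketch of flattening an empty list of index arrays.
from thinc.api import get_current_ops

ops = get_current_ops()
flat = ops.flatten([], dtype="i", ndim_if_empty=1)
print(flat.shape, flat.dtype)  # expected: (0,) with an integer dtype
```
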
```diff
@@ -78,7 +78,7 @@ def build_ngram_suggester(sizes: List[int]) -> Suggester:
         if len(spans) > 0:
             output = Ragged(ops.xp.vstack(spans), lengths_array)
         else:
-            output = Ragged(ops.xp.zeros((0, 0)), lengths_array)
+            output = Ragged(ops.xp.zeros((0, 0), dtype="i"), lengths_array)
 
         assert output.dataXd.ndim == 2
         return output
```

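With the `dtype="i"` added above, the ngram suggester returns an integer-typed, zero-row `Ragged` even when none of the docs contain any tokens, so downstream indexing still works. A hedged sketch of that edge case; the import path for `build_ngram_suggester` is an assumption based on the hunk header:

```python
# Hedged sketch of the zero-token edge case handled above.
from spacy.lang.en import English
from spacy.pipeline.spancat import build_ngram_suggester  # assumed import path

nlp = English()
suggester = build_ngram_suggester(sizes=[1, 2])
ragged = suggester([nlp.make_doc("")])           # a doc with no tokens
print(ragged.dataXd.shape, ragged.dataXd.dtype)  # expected: (0, 0) with an integer dtype
print(ragged.lengths)                            # expected: [0], i.e. zero suggestions
```
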
```diff
@@ -118,6 +118,10 @@ class Tok2Vec(TrainablePipe):
 
         DOCS: https://spacy.io/api/tok2vec#predict
         """
+        if not any(len(doc) for doc in docs):
+            # Handle cases where there are no tokens in any docs.
+            width = self.model.get_dim("nO")
+            return [self.model.ops.alloc((0, width)) for doc in docs]
         tokvecs = self.model.predict(docs)
         batch_id = Tok2VecListener.get_batch_id(docs)
         for listener in self.listeners:
```

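The early return added to `Tok2Vec.predict` handles batches in which no doc has any tokens: instead of calling the model on an empty batch, it allocates zero-row arrays of the model's output width. A hedged end-to-end sketch with a blank pipeline and a default-initialized `tok2vec` component:

```python
# Hedged sketch: an empty string run through a tok2vec pipeline should now
# produce a (0, width) tensor instead of failing inside the model.
import spacy

nlp = spacy.blank("en")
tok2vec = nlp.add_pipe("tok2vec")
nlp.initialize()                     # default initialization, no training data

doc = nlp("")                        # a doc with no tokens
width = tok2vec.model.get_dim("nO")
print(doc.tensor.shape)              # expected: (0, width)
```
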
```diff
@@ -1,7 +1,7 @@
 import pytest
 import numpy
 from numpy.testing import assert_array_equal, assert_almost_equal
-from thinc.api import get_current_ops
+from thinc.api import get_current_ops, Ragged
 
 from spacy import util
 from spacy.lang.en import English
```

```diff
@@ -29,6 +29,7 @@ TRAIN_DATA_OVERLAPPING = [
         "I like London and Berlin",
         {"spans": {SPAN_KEY: [(7, 13, "LOC"), (18, 24, "LOC"), (7, 24, "DOUBLE_LOC")]}},
     ),
+    ("", {"spans": {SPAN_KEY: []}}),
 ]
 
 
```

```diff
@@ -365,3 +366,31 @@ def test_overfitting_IO_overlapping():
         "London and Berlin",
     }
     assert set([span.label_ for span in spans2]) == {"LOC", "DOUBLE_LOC"}
+
+
+def test_zero_suggestions():
+    # Test with a suggester that returns 0 suggestions
+
+    @registry.misc("test_zero_suggester")
+    def make_zero_suggester():
+        def zero_suggester(docs, *, ops=None):
+            if ops is None:
+                ops = get_current_ops()
+            return Ragged(
+                ops.xp.zeros((0, 0), dtype="i"), ops.xp.zeros((len(docs),), dtype="i")
+            )
+
+        return zero_suggester
+
+    fix_random_seed(0)
+    nlp = English()
+    spancat = nlp.add_pipe(
+        "spancat",
+        config={"suggester": {"@misc": "test_zero_suggester"}, "spans_key": SPAN_KEY},
+    )
+    train_examples = make_examples(nlp)
+    optimizer = nlp.initialize(get_examples=lambda: train_examples)
+    assert spancat.model.get_dim("nO") == 2
+    assert set(spancat.labels) == {"LOC", "PERSON"}
+
+    nlp.update(train_examples, sgd=optimizer)
```

```diff
@@ -11,7 +11,7 @@ from spacy.lang.en import English
 from thinc.api import Config, get_current_ops
 from numpy.testing import assert_array_equal
 
-from ..util import get_batch, make_tempdir
+from ..util import get_batch, make_tempdir, add_vecs_to_vocab
 
 
 def test_empty_doc():
```

```diff
@@ -140,9 +140,25 @@ TRAIN_DATA = [
 ]
 
 
-def test_tok2vec_listener():
+@pytest.mark.parametrize("with_vectors", (False, True))
+def test_tok2vec_listener(with_vectors):
     orig_config = Config().from_str(cfg_string)
+    orig_config["components"]["tok2vec"]["model"]["embed"][
+        "include_static_vectors"
+    ] = with_vectors
     nlp = util.load_model_from_config(orig_config, auto_fill=True, validate=True)
+
+    if with_vectors:
+        ops = get_current_ops()
+        vectors = [
+            ("apple", ops.asarray([1, 2, 3])),
+            ("orange", ops.asarray([-1, -2, -3])),
+            ("and", ops.asarray([-1, -1, -1])),
+            ("juice", ops.asarray([5, 5, 10])),
+            ("pie", ops.asarray([7, 6.3, 8.9])),
+        ]
+        add_vecs_to_vocab(nlp.vocab, vectors)
+
     assert nlp.pipe_names == ["tok2vec", "tagger"]
     tagger = nlp.get_pipe("tagger")
     tok2vec = nlp.get_pipe("tok2vec")
```

```diff
@@ -169,6 +185,9 @@ def test_tok2vec_listener():
     ops = get_current_ops()
     assert_array_equal(ops.to_numpy(doc.tensor), ops.to_numpy(doc_tensor))
 
+    # test with empty doc
+    doc = nlp("")
+
     # TODO: should this warn or error?
     nlp.select_pipes(disable="tok2vec")
     assert nlp.pipe_names == ["tagger"]
```