Merge remote-tracking branch 'upstream/develop' into fix/patterns-init

svlandeg 2020-10-05 17:49:44 +02:00
commit 9eb813a35d
17 changed files with 160 additions and 165 deletions

View File

@@ -7,7 +7,7 @@ requires = [
     "preshed>=3.0.2,<3.1.0",
     "murmurhash>=0.28.0,<1.1.0",
     "thinc>=8.0.0a43,<8.0.0a50",
-    "blis>=0.4.0,<0.5.0",
+    "blis>=0.4.0,<0.8.0",
     "pytokenizations",
     "pathy"
 ]

View File

@@ -2,7 +2,7 @@
 cymem>=2.0.2,<2.1.0
 preshed>=3.0.2,<3.1.0
 thinc>=8.0.0a43,<8.0.0a50
-blis>=0.4.0,<0.5.0
+blis>=0.4.0,<0.8.0
 ml_datasets==0.2.0a0
 murmurhash>=0.28.0,<1.1.0
 wasabi>=0.8.0,<1.1.0

View File

@@ -41,7 +41,7 @@ install_requires =
     cymem>=2.0.2,<2.1.0
     preshed>=3.0.2,<3.1.0
     thinc>=8.0.0a43,<8.0.0a50
-    blis>=0.4.0,<0.5.0
+    blis>=0.4.0,<0.8.0
     wasabi>=0.8.0,<1.1.0
    srsly>=2.3.0,<3.0.0
     catalogue>=2.0.1,<2.1.0

View File

@@ -1,6 +1,6 @@
 # fmt: off
 __title__ = "spacy-nightly"
-__version__ = "3.0.0a33"
+__version__ = "3.0.0a34"
 __release__ = True
 __download_url__ = "https://github.com/explosion/spacy-models/releases/download"
 __compatibility__ = "https://raw.githubusercontent.com/explosion/spacy-models/master/compatibility.json"

View File

@@ -458,10 +458,10 @@ class Errors:
     # TODO: fix numbering after merging develop into master
     E900 = ("Patterns for component '{name}' not initialized. This can be fixed "
             "by calling 'add_patterns' or 'initialize'.")
-    E092 = ("The sentence-per-line IOB/IOB2 file is not formatted correctly. "
+    E902 = ("The sentence-per-line IOB/IOB2 file is not formatted correctly. "
             "Try checking whitespace and delimiters. See "
             "https://nightly.spacy.io/api/cli#convert")
-    E093 = ("The token-per-line NER file is not formatted correctly. Try checking "
+    E903 = ("The token-per-line NER file is not formatted correctly. Try checking "
             "whitespace and delimiters. See https://nightly.spacy.io/api/cli#convert")
     E904 = ("Cannot initialize StaticVectors layer: nO dimension unset. This "
             "dimension refers to the output width, after the linear projection "

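For context on the new E900 message: a minimal sketch of the workflow it
recommends, assuming the v3 nightly "entity_ruler" component (the pattern
shown is illustrative):

import spacy

nlp = spacy.blank("en")
ruler = nlp.add_pipe("entity_ruler")
# Without add_patterns (or an initialize call that supplies patterns),
# running the component would raise E900.
ruler.add_patterns([{"label": "ORG", "pattern": "spaCy"}])
doc = nlp("spaCy is written in Python.")
assert [ent.label_ for ent in doc.ents] == ["ORG"]
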
View File

@@ -289,13 +289,12 @@ class Lookups:
 
         DOCS: https://nightly.spacy.io/api/lookups#to_disk
         """
-        if len(self._tables):
-            path = ensure_path(path)
-            if not path.exists():
-                path.mkdir()
-            filepath = path / filename
-            with filepath.open("wb") as file_:
-                file_.write(self.to_bytes())
+        path = ensure_path(path)
+        if not path.exists():
+            path.mkdir()
+        filepath = path / filename
+        with filepath.open("wb") as file_:
+            file_.write(self.to_bytes())
 
     def from_disk(
         self, path: Union[str, Path], filename: str = "lookups.bin", **kwargs

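The hunk above drops the 'if len(self._tables)' guard, so to_disk now writes
lookups.bin even when no tables were added, and an empty Lookups round-trips
cleanly. A minimal sketch of that round trip (path and table name are
illustrative):

from spacy.lookups import Lookups

lookups = Lookups()
lookups.add_table("my_table", {"foo": "bar"})  # hypothetical table
lookups.to_disk("/tmp/lookups_demo")  # writes /tmp/lookups_demo/lookups.bin
restored = Lookups()
restored.from_disk("/tmp/lookups_demo")
assert "my_table" in restored
assert restored.get_table("my_table")["foo"] == "bar"
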
View File

@@ -210,7 +210,7 @@ class Morphologizer(Tagger):
 
         examples (Iterable[Examples]): The batch of examples.
         scores: Scores representing the model's predictions.
-        RETUTNRS (Tuple[float, float]): The loss and the gradient.
+        RETURNS (Tuple[float, float]): The loss and the gradient.
 
         DOCS: https://nightly.spacy.io/api/morphologizer#get_loss
         """

View File

@@ -162,7 +162,7 @@ cdef class Pipe:
 
         examples (Iterable[Examples]): The batch of examples.
         scores: Scores representing the model's predictions.
-        RETUTNRS (Tuple[float, float]): The loss and the gradient.
+        RETURNS (Tuple[float, float]): The loss and the gradient.
 
         DOCS: https://nightly.spacy.io/api/pipe#get_loss
         """

View File

@@ -104,7 +104,7 @@ class SentenceRecognizer(Tagger):
 
         examples (Iterable[Examples]): The batch of examples.
         scores: Scores representing the model's predictions.
-        RETUTNRS (Tuple[float, float]): The loss and the gradient.
+        RETURNS (Tuple[float, float]): The loss and the gradient.
 
         DOCS: https://nightly.spacy.io/api/sentencerecognizer#get_loss
         """

View File

@@ -249,7 +249,7 @@ class Tagger(Pipe):
 
         examples (Iterable[Examples]): The batch of examples.
         scores: Scores representing the model's predictions.
-        RETUTNRS (Tuple[float, float]): The loss and the gradient.
+        RETURNS (Tuple[float, float]): The loss and the gradient.
 
         DOCS: https://nightly.spacy.io/api/tagger#get_loss
         """

View File

@@ -281,7 +281,7 @@ class TextCategorizer(Pipe):
 
         examples (Iterable[Examples]): The batch of examples.
         scores: Scores representing the model's predictions.
-        RETUTNRS (Tuple[float, float]): The loss and the gradient.
+        RETURNS (Tuple[float, float]): The loss and the gradient.
 
         DOCS: https://nightly.spacy.io/api/textcategorizer#get_loss
         """

View File

@@ -7,6 +7,15 @@ from spacy import util
 from spacy import prefer_gpu, require_gpu
 from spacy.ml._precomputable_affine import PrecomputableAffine
 from spacy.ml._precomputable_affine import _backprop_precomputable_affine_padding
+from spacy.util import dot_to_object, SimpleFrozenList
+from thinc.api import Config, Optimizer, ConfigValidationError
+from spacy.training.batchers import minibatch_by_words
+from spacy.lang.en import English
+from spacy.lang.nl import Dutch
+from spacy.language import DEFAULT_CONFIG_PATH
+from spacy.schemas import ConfigSchemaTraining
+
+from .util import get_random_doc
 
 
 @pytest.fixture
@@ -157,3 +166,128 @@ def test_dot_to_dict(dot_notation, expected):
     result = util.dot_to_dict(dot_notation)
     assert result == expected
     assert util.dict_to_dot(result) == dot_notation
+
+
+@pytest.mark.parametrize(
+    "doc_sizes, expected_batches",
+    [
+        ([400, 400, 199], [3]),
+        ([400, 400, 199, 3], [4]),
+        ([400, 400, 199, 3, 200], [3, 2]),
+        ([400, 400, 199, 3, 1], [5]),
+        ([400, 400, 199, 3, 1, 1500], [5]),  # 1500 will be discarded
+        ([400, 400, 199, 3, 1, 200], [3, 3]),
+        ([400, 400, 199, 3, 1, 999], [3, 3]),
+        ([400, 400, 199, 3, 1, 999, 999], [3, 2, 1, 1]),
+        ([1, 2, 999], [3]),
+        ([1, 2, 999, 1], [4]),
+        ([1, 200, 999, 1], [2, 2]),
+        ([1, 999, 200, 1], [2, 2]),
+    ],
+)
+def test_util_minibatch(doc_sizes, expected_batches):
+    docs = [get_random_doc(doc_size) for doc_size in doc_sizes]
+    tol = 0.2
+    batch_size = 1000
+    batches = list(
+        minibatch_by_words(docs, size=batch_size, tolerance=tol, discard_oversize=True)
+    )
+    assert [len(batch) for batch in batches] == expected_batches
+
+    max_size = batch_size + batch_size * tol
+    for batch in batches:
+        assert sum([len(doc) for doc in batch]) < max_size
+
+
+@pytest.mark.parametrize(
+    "doc_sizes, expected_batches",
+    [
+        ([400, 4000, 199], [1, 2]),
+        ([400, 400, 199, 3000, 200], [1, 4]),
+        ([400, 400, 199, 3, 1, 1500], [1, 5]),
+        ([400, 400, 199, 3000, 2000, 200, 200], [1, 1, 3, 2]),
+        ([1, 2, 9999], [1, 2]),
+        ([2000, 1, 2000, 1, 1, 1, 2000], [1, 1, 1, 4]),
+    ],
+)
+def test_util_minibatch_oversize(doc_sizes, expected_batches):
+    """ Test that oversized documents are returned in their own batch"""
+    docs = [get_random_doc(doc_size) for doc_size in doc_sizes]
+    tol = 0.2
+    batch_size = 1000
+    batches = list(
+        minibatch_by_words(docs, size=batch_size, tolerance=tol, discard_oversize=False)
+    )
+    assert [len(batch) for batch in batches] == expected_batches
+
+
+def test_util_dot_section():
+    cfg_string = """
+    [nlp]
+    lang = "en"
+    pipeline = ["textcat"]
+
+    [components]
+
+    [components.textcat]
+    factory = "textcat"
+
+    [components.textcat.model]
+    @architectures = "spacy.TextCatBOW.v1"
+    exclusive_classes = true
+    ngram_size = 1
+    no_output_layer = false
+    """
+    nlp_config = Config().from_str(cfg_string)
+    en_nlp = util.load_model_from_config(nlp_config, auto_fill=True)
+    default_config = Config().from_disk(DEFAULT_CONFIG_PATH)
+    default_config["nlp"]["lang"] = "nl"
+    nl_nlp = util.load_model_from_config(default_config, auto_fill=True)
+    # Test that creation went OK
+    assert isinstance(en_nlp, English)
+    assert isinstance(nl_nlp, Dutch)
+    assert nl_nlp.pipe_names == []
+    assert en_nlp.pipe_names == ["textcat"]
+    # not exclusive_classes
+    assert en_nlp.get_pipe("textcat").model.attrs["multi_label"] is False
+    # Test that default values got overwritten
+    assert en_nlp.config["nlp"]["pipeline"] == ["textcat"]
+    assert nl_nlp.config["nlp"]["pipeline"] == []  # default value []
+    # Test proper functioning of 'dot_to_object'
+    with pytest.raises(KeyError):
+        dot_to_object(en_nlp.config, "nlp.pipeline.tagger")
+    with pytest.raises(KeyError):
+        dot_to_object(en_nlp.config, "nlp.unknownattribute")
+    T = util.registry.resolve(nl_nlp.config["training"], schema=ConfigSchemaTraining)
+    assert isinstance(dot_to_object({"training": T}, "training.optimizer"), Optimizer)
+
+
+def test_simple_frozen_list():
+    t = SimpleFrozenList(["foo", "bar"])
+    assert t == ["foo", "bar"]
+    assert t.index("bar") == 1  # okay method
+    with pytest.raises(NotImplementedError):
+        t.append("baz")
+    with pytest.raises(NotImplementedError):
+        t.sort()
+    with pytest.raises(NotImplementedError):
+        t.extend(["baz"])
+    with pytest.raises(NotImplementedError):
+        t.pop()
+    t = SimpleFrozenList(["foo", "bar"], error="Error!")
+    with pytest.raises(NotImplementedError):
+        t.append("baz")
+
+
+def test_resolve_dot_names():
+    config = {
+        "training": {"optimizer": {"@optimizers": "Adam.v1"}},
+        "foo": {"bar": "training.optimizer", "baz": "training.xyz"},
+    }
+    result = util.resolve_dot_names(config, ["training.optimizer"])
+    assert isinstance(result[0], Optimizer)
+    with pytest.raises(ConfigValidationError) as e:
+        util.resolve_dot_names(config, ["training.xyz", "training.optimizer"])
+    errors = e.value.errors
+    assert len(errors) == 1
+    assert errors[0]["loc"] == ["training", "xyz"]

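One way to read the expected batch sizes in the tests above: a batch fills up
to 'size' words, and 'tolerance' only stretches the hard ceiling. A standalone
check of the arithmetic for one parametrized case (the actual buffering logic
lives in minibatch_by_words):

size, tol = 1000, 0.2
cap = size + size * tol  # 1200: hard per-batch ceiling
# Case ([400, 400, 199, 3, 200], [3, 2]):
assert 400 + 400 + 199 <= size  # 999 words fill the batch proper
assert 999 + 3 <= cap  # 1002: the 3-word doc fits only via the tolerance margin
assert 999 + 3 + 200 > cap  # 1202: forces a split, giving batches of [3, 2] docs
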
View File

@@ -1,137 +0,0 @@
-import pytest
-
-from spacy import util
-from spacy.util import dot_to_object, SimpleFrozenList
-from thinc.api import Config, Optimizer, ConfigValidationError
-from spacy.training.batchers import minibatch_by_words
-from spacy.lang.en import English
-from spacy.lang.nl import Dutch
-from spacy.language import DEFAULT_CONFIG_PATH
-from spacy.schemas import ConfigSchemaTraining
-
-from .util import get_random_doc
-
-
-@pytest.mark.parametrize(
-    "doc_sizes, expected_batches",
-    [
-        ([400, 400, 199], [3]),
-        ([400, 400, 199, 3], [4]),
-        ([400, 400, 199, 3, 200], [3, 2]),
-        ([400, 400, 199, 3, 1], [5]),
-        ([400, 400, 199, 3, 1, 1500], [5]),  # 1500 will be discarded
-        ([400, 400, 199, 3, 1, 200], [3, 3]),
-        ([400, 400, 199, 3, 1, 999], [3, 3]),
-        ([400, 400, 199, 3, 1, 999, 999], [3, 2, 1, 1]),
-        ([1, 2, 999], [3]),
-        ([1, 2, 999, 1], [4]),
-        ([1, 200, 999, 1], [2, 2]),
-        ([1, 999, 200, 1], [2, 2]),
-    ],
-)
-def test_util_minibatch(doc_sizes, expected_batches):
-    docs = [get_random_doc(doc_size) for doc_size in doc_sizes]
-    tol = 0.2
-    batch_size = 1000
-    batches = list(
-        minibatch_by_words(docs, size=batch_size, tolerance=tol, discard_oversize=True)
-    )
-    assert [len(batch) for batch in batches] == expected_batches
-
-    max_size = batch_size + batch_size * tol
-    for batch in batches:
-        assert sum([len(doc) for doc in batch]) < max_size
-
-
-@pytest.mark.parametrize(
-    "doc_sizes, expected_batches",
-    [
-        ([400, 4000, 199], [1, 2]),
-        ([400, 400, 199, 3000, 200], [1, 4]),
-        ([400, 400, 199, 3, 1, 1500], [1, 5]),
-        ([400, 400, 199, 3000, 2000, 200, 200], [1, 1, 3, 2]),
-        ([1, 2, 9999], [1, 2]),
-        ([2000, 1, 2000, 1, 1, 1, 2000], [1, 1, 1, 4]),
-    ],
-)
-def test_util_minibatch_oversize(doc_sizes, expected_batches):
-    """ Test that oversized documents are returned in their own batch"""
-    docs = [get_random_doc(doc_size) for doc_size in doc_sizes]
-    tol = 0.2
-    batch_size = 1000
-    batches = list(
-        minibatch_by_words(docs, size=batch_size, tolerance=tol, discard_oversize=False)
-    )
-    assert [len(batch) for batch in batches] == expected_batches
-
-
-def test_util_dot_section():
-    cfg_string = """
-    [nlp]
-    lang = "en"
-    pipeline = ["textcat"]
-
-    [components]
-
-    [components.textcat]
-    factory = "textcat"
-
-    [components.textcat.model]
-    @architectures = "spacy.TextCatBOW.v1"
-    exclusive_classes = true
-    ngram_size = 1
-    no_output_layer = false
-    """
-    nlp_config = Config().from_str(cfg_string)
-    en_nlp = util.load_model_from_config(nlp_config, auto_fill=True)
-    default_config = Config().from_disk(DEFAULT_CONFIG_PATH)
-    default_config["nlp"]["lang"] = "nl"
-    nl_nlp = util.load_model_from_config(default_config, auto_fill=True)
-    # Test that creation went OK
-    assert isinstance(en_nlp, English)
-    assert isinstance(nl_nlp, Dutch)
-    assert nl_nlp.pipe_names == []
-    assert en_nlp.pipe_names == ["textcat"]
-    # not exclusive_classes
-    assert en_nlp.get_pipe("textcat").model.attrs["multi_label"] is False
-    # Test that default values got overwritten
-    assert en_nlp.config["nlp"]["pipeline"] == ["textcat"]
-    assert nl_nlp.config["nlp"]["pipeline"] == []  # default value []
-    # Test proper functioning of 'dot_to_object'
-    with pytest.raises(KeyError):
-        dot_to_object(en_nlp.config, "nlp.pipeline.tagger")
-    with pytest.raises(KeyError):
-        dot_to_object(en_nlp.config, "nlp.unknownattribute")
-    T = util.registry.resolve(nl_nlp.config["training"], schema=ConfigSchemaTraining)
-    assert isinstance(dot_to_object({"training": T}, "training.optimizer"), Optimizer)
-
-
-def test_simple_frozen_list():
-    t = SimpleFrozenList(["foo", "bar"])
-    assert t == ["foo", "bar"]
-    assert t.index("bar") == 1  # okay method
-    with pytest.raises(NotImplementedError):
-        t.append("baz")
-    with pytest.raises(NotImplementedError):
-        t.sort()
-    with pytest.raises(NotImplementedError):
-        t.extend(["baz"])
-    with pytest.raises(NotImplementedError):
-        t.pop()
-    t = SimpleFrozenList(["foo", "bar"], error="Error!")
-    with pytest.raises(NotImplementedError):
-        t.append("baz")
-
-
-def test_resolve_dot_names():
-    config = {
-        "training": {"optimizer": {"@optimizers": "Adam.v1"}},
-        "foo": {"bar": "training.optimizer", "baz": "training.xyz"},
-    }
-    result = util.resolve_dot_names(config, ["training.optimizer"])
-    assert isinstance(result[0], Optimizer)
-    with pytest.raises(ConfigValidationError) as e:
-        util.resolve_dot_names(config, ["training.xyz", "training.optimizer"])
-    errors = e.value.errors
-    assert len(errors) == 1
-    assert errors[0]["loc"] == ["training", "xyz"]

View File

@@ -5,7 +5,7 @@ import copy
 from functools import partial
 from pydantic import BaseModel, StrictStr
 
-from ..util import registry, logger
+from ..util import registry
 from ..tokens import Doc
 from .example import Example
@@ -119,9 +119,8 @@ def make_orth_variants(
     orig_token_dict = copy.deepcopy(token_dict)
     ndsv = orth_variants.get("single", [])
     ndpv = orth_variants.get("paired", [])
-    logger.debug(f"Data augmentation: {len(ndsv)} single / {len(ndpv)} paired variants")
-    words = token_dict.get("words", [])
-    tags = token_dict.get("tags", [])
+    words = token_dict.get("ORTH", [])
+    tags = token_dict.get("TAG", [])
     # keep unmodified if words or tags are not defined
     if words and tags:
         if lower:
@@ -154,8 +153,8 @@ def make_orth_variants(
                     if words[word_idx] in pair:
                         pair_idx = pair.index(words[word_idx])
                         words[word_idx] = punct_choices[punct_idx][pair_idx]
-    token_dict["words"] = words
-    token_dict["tags"] = tags
+    token_dict["ORTH"] = words
+    token_dict["TAG"] = tags
     # modify raw
     if raw is not None:
         variants = []

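The key renames in these hunks move make_orth_variants onto the
token-annotation dict keyed by attribute names. A sketch of the shape it now
reads and writes (values illustrative):

token_dict = {
    "ORTH": ["I", "like", "motorcycles"],  # previously token_dict["words"]
    "TAG": ["PRP", "VBP", "NNS"],          # previously token_dict["tags"]
}
words = token_dict.get("ORTH", [])
tags = token_dict.get("TAG", [])
assert len(words) == len(tags)
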
View File

@@ -103,7 +103,7 @@ def conll_ner_to_docs(
         lines = [line.strip() for line in conll_sent.split("\n") if line.strip()]
         cols = list(zip(*[line.split() for line in lines]))
         if len(cols) < 2:
-            raise ValueError(Errors.E093)
+            raise ValueError(Errors.E903)
         length = len(cols[0])
         words.extend(cols[0])
         sent_starts.extend([True] + [False] * (length - 1))

View File

@@ -46,7 +46,7 @@ def read_iob(raw_sents, vocab, n_sents):
             sent_words, sent_iob = zip(*sent_tokens)
             sent_tags = ["-"] * len(sent_words)
         else:
-            raise ValueError(Errors.E092)
+            raise ValueError(Errors.E902)
         words.extend(sent_words)
         tags.extend(sent_tags)
         iob.extend(sent_iob)

View File

@@ -445,9 +445,9 @@ cdef class Vocab:
         setters = ["strings", "vectors"]
         if "strings" not in exclude:
             self.strings.to_disk(path / "strings.json")
-        if "vectors" not in "exclude" and self.vectors is not None:
+        if "vectors" not in "exclude":
             self.vectors.to_disk(path)
-        if "lookups" not in "exclude" and self.lookups is not None:
+        if "lookups" not in "exclude":
             self.lookups.to_disk(path)
 
     def from_disk(self, path, *, exclude=tuple()):