mirror of
https://github.com/explosion/spaCy.git
synced 2025-01-14 19:46:26 +03:00
eec5ccd72f
* `Language.update`: ensure that tok2vec gets updated The components in a pipeline can be updated independently. However, tok2vec implementations are an exception to this, since they depend on listeners for their gradients. The update method of a tok2vec implementation computes the tok2vec forward and passes this along with a backprop function to the listeners. This backprop function accumulates gradients for all the listeners. There are two ways in which the accumulated gradients can be used to update the tok2vec weights: 1. Call the `finish_update` method of tok2vec *after* the `update` method is called on all of the pipes that use a tok2vec listener. 2. Pass an optimizer to the `update` method of tok2vec. In this case, tok2vec will give the last listener a special backprop function that calls `finish_update` on the tok2vec. Unfortunately, `Language.update` did neither of these. Instead, it immediately called `finish_update` on every pipe after `update`. As a result, the tok2vec weights are updated when no gradients have been accumulated from listeners yet. And the gradients of the listeners are only used in the next call to `Language.update` (when `finish_update` is called on tok2vec again). This change fixes this issue by passing the optimizer to the `update` method of trainable pipes, leading to use of the second strategy outlined above. The main updating loop in `Language.update` is also simplified by using the `TrainableComponent` protocol consistently. * Train loop: `sgd` is `Optional[Optimizer]`, do not pass false * Language.update: call pipe finish_update after all pipe updates This does correct and fast updates if multiple components update the same parameters. * Add comment why we moved `finish_update` to a separate loop
938 lines
30 KiB
Python
938 lines
30 KiB
Python
import itertools
|
|
import logging
|
|
from unittest import mock
|
|
import pytest
|
|
from spacy.language import Language
|
|
from spacy.scorer import Scorer
|
|
from spacy.tokens import Doc, Span
|
|
from spacy.vocab import Vocab
|
|
from spacy.training import Example
|
|
from spacy.lang.en import English
|
|
from spacy.lang.de import German
|
|
from spacy.util import registry, ignore_error, raise_error, find_matching_language
|
|
from spacy.util import load_model_from_config
|
|
import spacy
|
|
from thinc.api import Config, CupyOps, NumpyOps, get_array_module, get_current_ops
|
|
|
|
from .util import add_vecs_to_vocab, assert_docs_equal
|
|
|
|
|
|
try:
    import torch
except ImportError:
    # torch is optional for this module; nothing to configure when it is missing.
    pass
else:
    # Ensure that we don't deadlock in multiprocessing tests.
    torch.set_num_threads(1)
    torch.set_num_interop_threads(1)
|
|
|
|
# Config for a two-component pipeline: a "tok2vec" component and a "tagger"
# whose model reads its tok2vec layer through a Tok2VecListener.
TAGGER_CFG_STRING = """
[nlp]
lang = "en"
pipeline = ["tok2vec","tagger"]

[components]

[components.tagger]
factory = "tagger"

[components.tagger.model]
@architectures = "spacy.Tagger.v2"
nO = null

[components.tagger.model.tok2vec]
@architectures = "spacy.Tok2VecListener.v1"
width = ${components.tok2vec.model.encode.width}

[components.tok2vec]
factory = "tok2vec"

[components.tok2vec.model]
@architectures = "spacy.Tok2Vec.v2"

[components.tok2vec.model.embed]
@architectures = "spacy.MultiHashEmbed.v1"
width = ${components.tok2vec.model.encode.width}
rows = [2000, 1000, 1000, 1000]
attrs = ["NORM", "PREFIX", "SUFFIX", "SHAPE"]
include_static_vectors = false

[components.tok2vec.model.encode]
@architectures = "spacy.MaxoutWindowEncoder.v2"
width = 96
depth = 4
window_size = 1
maxout_pieces = 3
"""
|
|
|
|
|
|
# (text, annotations) pairs used to train the tagger in the tests below.
# Defined once; the second, identical definition was redundant.
TAGGER_TRAIN_DATA = [
    ("I like green eggs", {"tags": ["N", "V", "J", "N"]}),
    ("Eat blue ham", {"tags": ["V", "J", "N"]}),
]
|
|
|
|
|
|
def evil_component(doc):
    """Return ``doc`` unchanged, or raise ValueError if its text contains "2"."""
    if "2" not in doc.text:
        return doc
    raise ValueError("no dice")
|
|
|
|
|
|
def perhaps_set_sentences(doc):
    """Mark the last token as a sentence start unless the text starts with "4"."""
    if doc.text.startswith("4"):
        return doc
    doc[-1].is_sent_start = True
    return doc
|
|
|
|
|
|
def assert_sents_error(doc):
    """Return ``doc`` if it has "SENT_START" annotation, else raise ValueError."""
    if doc.has_annotation("SENT_START"):
        return doc
    raise ValueError("no sents")
|
|
|
|
|
|
def warn_error(proc_name, proc, docs, e):
    """Error handler that logs a warning naming the component instead of raising.

    Only ``proc_name`` is used; ``proc``, ``docs`` and ``e`` are accepted but
    ignored.
    """
    logging.getLogger("spacy").warning(f"Trouble with component {proc_name}.")
|
|
|
|
|
|
@pytest.fixture
def nlp():
    """Initialized pipeline with a textcat labelled POSITIVE and NEGATIVE."""
    pipeline = Language(Vocab())
    categorizer = pipeline.add_pipe("textcat")
    categorizer.add_label("POSITIVE")
    categorizer.add_label("NEGATIVE")
    pipeline.initialize()
    return pipeline
|
|
|
|
|
|
def test_language_update(nlp):
    """Check which inputs ``nlp.update`` and ``Example.from_dict`` accept."""
    text = "hello world"
    annots = {"cats": {"POSITIVE": 1.0, "NEGATIVE": 0.0}}
    wrongkeyannots = {"LABEL": True}
    doc = Doc(nlp.vocab, words=text.split(" "))
    example = Example.from_dict(doc, annots)

    # A list of Example objects is the accepted call form.
    nlp.update([example])

    rejected_inputs = (
        example,  # Not allowed to call with just one Example
        (text, annots),  # text and dict: not supported anymore since v.3
        (doc, annots),  # doc object and dict
    )
    for rejected in rejected_inputs:
        with pytest.raises(TypeError):
            nlp.update(rejected)

    # Create examples badly
    for bad_annots, expected_error in (
        (None, ValueError),
        (wrongkeyannots, KeyError),
    ):
        with pytest.raises(expected_error):
            example = Example.from_dict(doc, bad_annots)
|
|
|
|
|
|
def test_language_update_updates():
    """Check that one ``nlp.update`` call with an optimizer changes doc tensors.

    The pipeline is built from TAGGER_CFG_STRING; the tensor of the first
    processed doc is compared before and after the update.
    """
    nlp = load_model_from_config(
        Config().from_str(TAGGER_CFG_STRING), auto_fill=True, validate=True
    )
    train_examples = [
        Example.from_dict(nlp.make_doc(text), annots)
        for text, annots in TAGGER_TRAIN_DATA
    ]
    optimizer = nlp.initialize(get_examples=lambda: train_examples)

    def process_copies():
        # Run the pipeline on fresh copies so the examples stay untouched.
        return list(nlp.pipe([eg.predicted.copy() for eg in train_examples]))

    docs_before_update = process_copies()
    nlp.update(train_examples, sgd=optimizer)
    docs_after_update = process_copies()

    xp = get_array_module(docs_after_update[0].tensor)
    differs = xp.not_equal(docs_before_update[0].tensor, docs_after_update[0].tensor)
    assert xp.any(differs)
|
|
|
|
|
|
def test_language_evaluate(nlp):
    """Check which inputs ``nlp.evaluate`` accepts and that it reports a speed."""
    text = "hello world"
    annots = {"doc_annotation": {"cats": {"POSITIVE": 1.0, "NEGATIVE": 0.0}}}
    doc = Doc(nlp.vocab, words=text.split(" "))
    example = Example.from_dict(doc, annots)

    # A list of examples works ...
    list_scores = nlp.evaluate([example])
    assert list_scores["speed"] > 0

    # ... and so does a generator of examples.
    generator_scores = nlp.evaluate(eg for eg in [example])
    assert generator_scores["speed"] > 0

    rejected_inputs = (
        example,  # Not allowed to call with just one Example
        [(text, annots)],  # text and dict: not supported anymore since v.3
        [(doc, annots)],  # doc object and dict
        [text, annots],
    )
    for rejected in rejected_inputs:
        with pytest.raises(TypeError):
            nlp.evaluate(rejected)
|
|
|
|
|
|
def test_evaluate_no_pipe(nlp):
    """Test that docs are processed correctly within Language.pipe if the
    component doesn't expose a .pipe method."""

    @Language.component("test_evaluate_no_pipe")
    def passthrough(doc):
        return doc

    # A fresh pipeline is used here; the ``nlp`` fixture argument is rebound.
    nlp = Language(Vocab())
    doc = nlp("hello world")
    nlp.add_pipe("test_evaluate_no_pipe")
    gold = {"cats": {"POSITIVE": 1.0, "NEGATIVE": 0.0}}
    nlp.evaluate([Example.from_dict(doc, gold)])
|
|
|
|
|
|
def test_evaluate_textcat_multilabel(en_vocab):
    """Test that evaluate works with a multilabel textcat pipe."""
    nlp = Language(en_vocab)
    categorizer = nlp.add_pipe("textcat_multilabel")
    for name in ("FEATURE", "REQUEST", "BUG", "QUESTION"):
        categorizer.add_label(name)
    nlp.initialize()

    example = Example.from_dict(
        nlp.make_doc("hello world"),
        {"cats": {"FEATURE": 1.0, "QUESTION": 1.0}},
    )
    scores = nlp.evaluate([example])
    labels = nlp.get_pipe("textcat_multilabel").labels

    # Every label of the pipe gets a per-type score ...
    for label in labels:
        assert scores["cats_f_per_type"].get(label) is not None
    # ... and reference categories the pipe does not know get none.
    unknown_keys = [key for key in example.reference.cats.keys() if key not in labels]
    for key in unknown_keys:
        assert scores["cats_f_per_type"].get(key) is None
|
|
|
|
|
|
def test_evaluate_multiple_textcat_final(en_vocab):
    """Test that evaluate evaluates the final textcat component in a pipeline
    with more than one textcat or textcat_multilabel."""
    nlp = Language(en_vocab)
    textcat = nlp.add_pipe("textcat")
    for label in ("POSITIVE", "NEGATIVE"):
        textcat.add_label(label)
    textcat_multilabel = nlp.add_pipe("textcat_multilabel")
    for label in ("FEATURE", "REQUEST", "BUG", "QUESTION"):
        textcat_multilabel.add_label(label)
    nlp.initialize()

    # The original literal repeated the POSITIVE/NEGATIVE keys; a dict keeps
    # only one entry per key, so each key is listed once here.
    annots = {
        "cats": {
            "POSITIVE": 1.0,
            "NEGATIVE": 0.0,
            "FEATURE": 1.0,
            "QUESTION": 1.0,
        }
    }
    doc = nlp.make_doc("hello world")
    example = Example.from_dict(doc, annots)
    scores = nlp.evaluate([example])
    # get the labels from the final pipe
    labels = nlp.get_pipe(nlp.pipe_names[-1]).labels
    for label in labels:
        assert scores["cats_f_per_type"].get(label) is not None
    # Reference categories that are not labels of the final pipe must be absent.
    for key in example.reference.cats.keys():
        if key not in labels:
            assert scores["cats_f_per_type"].get(key) is None
|
|
|
|
|
|
def test_evaluate_multiple_textcat_separate(en_vocab):
    """Test that evaluate can evaluate multiple textcat components separately
    with custom scorers."""

    def custom_textcat_score(examples, **kwargs):
        # Score as single-label cats and prefix every key with "custom_" so
        # the results do not collide with the default textcat_multilabel scores.
        scores = Scorer.score_cats(
            examples,
            "cats",
            multi_label=False,
            **kwargs,
        )
        return {f"custom_{k}": v for k, v in scores.items()}

    @spacy.registry.scorers("test_custom_textcat_scorer")
    def make_custom_textcat_scorer():
        return custom_textcat_score

    nlp = Language(en_vocab)
    textcat = nlp.add_pipe(
        "textcat",
        config={"scorer": {"@scorers": "test_custom_textcat_scorer"}},
    )
    for label in ("POSITIVE", "NEGATIVE"):
        textcat.add_label(label)
    textcat_multilabel = nlp.add_pipe("textcat_multilabel")
    for label in ("FEATURE", "REQUEST", "BUG", "QUESTION"):
        textcat_multilabel.add_label(label)
    nlp.initialize()

    # The original literal repeated the POSITIVE/NEGATIVE keys; a dict keeps
    # only one entry per key, so each key is listed once here.
    annots = {
        "cats": {
            "POSITIVE": 1.0,
            "NEGATIVE": 0.0,
            "FEATURE": 1.0,
            "QUESTION": 1.0,
        }
    }
    doc = nlp.make_doc("hello world")
    example = Example.from_dict(doc, annots)
    scores = nlp.evaluate([example])
    # check custom scores for the textcat pipe
    assert "custom_cats_f_per_type" in scores
    labels = nlp.get_pipe("textcat").labels
    assert set(scores["custom_cats_f_per_type"].keys()) == set(labels)
    # check default scores for the textcat_multilabel pipe
    assert "cats_f_per_type" in scores
    labels = nlp.get_pipe("textcat_multilabel").labels
    assert set(scores["cats_f_per_type"].keys()) == set(labels)
|
|
|
|
|
|
def vector_modification_pipe(doc):
    """Component that adds 1 to ``doc.vector`` and returns the doc."""
    # Kept as an augmented assignment so the attribute is read, incremented
    # and assigned back exactly as before.
    doc.vector += 1
    return doc
|
|
|
|
|
|
def userdata_pipe(doc):
    """Component that stores ``"foo" -> "bar"`` in ``doc.user_data``."""
    key, value = "foo", "bar"
    doc.user_data[key] = value
    return doc
|
|
|
|
|
|
def ner_pipe(doc):
    """Component that appends a "FIRST" span over the first token to ``doc.ents``."""
    first_token_span = Span(doc, 0, 1, label="FIRST")
    doc.ents += (first_token_span,)
    return doc
|
|
|
|
|
|
@pytest.fixture
def sample_vectors():
    """Three (word, 3-dimensional vector) pairs for populating a vocab."""
    vectors = []
    vectors.append(("spacy", [-0.1, -0.2, -0.3]))
    vectors.append(("world", [-0.2, -0.3, -0.4]))
    vectors.append(("pipe", [0.7, 0.8, 0.9]))
    return vectors
|
|
|
|
|
|
@pytest.fixture
def nlp2(nlp, sample_vectors):
    """Extend the ``nlp`` fixture with sample vectors and three custom components.

    The components are added in the order: vector modification, ner, userdata.
    """
    registrations = (
        ("test_language_vector_modification_pipe", vector_modification_pipe),
        ("test_language_userdata_pipe", userdata_pipe),
        ("test_language_ner_pipe", ner_pipe),
    )
    for component_name, component_func in registrations:
        Language.component(component_name, func=component_func)

    add_vecs_to_vocab(nlp.vocab, sample_vectors)

    for component_name in (
        "test_language_vector_modification_pipe",
        "test_language_ner_pipe",
        "test_language_userdata_pipe",
    ):
        nlp.add_pipe(component_name)
    return nlp
|
|
|
|
|
|
@pytest.fixture
def texts():
    """Four short sample texts used by the ``nlp.pipe`` tests."""
    return [
        "Hello world.",
        "This is spacy.",
        "You can use multiprocessing with pipe method.",
        "Please try!",
    ]
|
|
|
|
|
|
@pytest.mark.parametrize("n_process", [1, 2])
def test_language_pipe(nlp2, n_process, texts):
    """Compare ``nlp2.pipe`` output with docs produced by calling ``nlp2`` per text.

    The body only runs when the current ops are NumpyOps or ``n_process`` is 1.
    """
    if not (isinstance(get_current_ops(), NumpyOps) or n_process < 2):
        return
    repeated_texts = texts * 10
    expecteds = [nlp2(text) for text in repeated_texts]
    piped_docs = nlp2.pipe(repeated_texts, n_process=n_process, batch_size=2)

    for piped_doc, expected_doc in zip(piped_docs, expecteds):
        assert_docs_equal(piped_doc, expected_doc)
|
|
|
|
|
|
@pytest.mark.parametrize("n_process", [1, 2])
def test_language_pipe_stream(nlp2, n_process, texts):
    """Check the first 20 docs that ``nlp2.pipe`` yields for an endless text stream.

    The body only runs when the current ops are NumpyOps or ``n_process`` is 1.
    """
    if not (isinstance(get_current_ops(), NumpyOps) or n_process < 2):
        return
    n_fetch = 20
    # check if nlp.pipe can handle infinite length iterator properly.
    endless_texts = itertools.cycle(texts)
    reference_stream, piped_stream = itertools.tee(endless_texts)
    expecteds = (nlp2(text) for text in reference_stream)
    docs = nlp2.pipe(piped_stream, n_process=n_process, batch_size=2)

    pairs = itertools.islice(zip(docs, expecteds), n_fetch)
    for doc, expected_doc in pairs:
        assert_docs_equal(doc, expected_doc)
|
|
|
|
|
|
@pytest.mark.parametrize("n_process", [1, 2])
def test_language_pipe_error_handler(n_process):
    """Test that the error handling of nlp.pipe works well"""
    if not (isinstance(get_current_ops(), NumpyOps) or n_process < 2):
        return
    nlp = English()
    nlp.add_pipe("merge_subtokens")
    nlp.initialize()
    texts = ["Curious to see what will happen to this text.", "And this one."]
    first_text = texts[0]

    def run_pipe():
        return list(nlp.pipe(texts, n_process=n_process))

    # the pipeline fails because there's no parser
    with pytest.raises(ValueError):
        nlp(first_text)
    with pytest.raises(ValueError):
        run_pipe()

    # an explicit raising handler behaves the same way for nlp.pipe
    nlp.set_error_handler(raise_error)
    with pytest.raises(ValueError):
        run_pipe()

    # set explicitly to ignoring
    nlp.set_error_handler(ignore_error)
    docs = run_pipe()
    assert len(docs) == 0
    nlp(first_text)
|
|
|
|
|
|
@pytest.mark.parametrize("n_process", [1, 2])
def test_language_pipe_error_handler_custom(en_vocab, n_process):
    """Test the error handling of a custom component that has no pipe method"""
    Language.component("my_evil_component", func=evil_component)
    if not (isinstance(get_current_ops(), NumpyOps) or n_process < 2):
        return
    nlp = English()
    nlp.add_pipe("my_evil_component")
    texts = ["TEXT 111", "TEXT 222", "TEXT 333", "TEXT 342", "TEXT 666"]
    surviving_texts = ["TEXT 111", "TEXT 333", "TEXT 666"]

    with pytest.raises(ValueError):
        # the evil custom component throws an error
        list(nlp.pipe(texts))

    nlp.set_error_handler(warn_error)
    spacy_logger = logging.getLogger("spacy")
    with mock.patch.object(spacy_logger, "warning") as mock_warning:
        # the errors by the evil custom component raise a warning for each
        # bad doc
        docs = list(nlp.pipe(texts, n_process=n_process))
        # HACK/TODO? the warnings in child processes don't seem to be
        # detected by the mock logger
        if n_process == 1:
            mock_warning.assert_called()
            assert mock_warning.call_count == 2
            assert len(docs) + mock_warning.call_count == len(texts)
        assert [doc.text for doc in docs] == surviving_texts
|
|
|
|
|
|
@pytest.mark.parametrize("n_process", [1, 2])
def test_language_pipe_error_handler_input_as_tuples(en_vocab, n_process):
    """Test the error handling of nlp.pipe with input as tuples"""
    Language.component("my_evil_component", func=evil_component)
    if not (isinstance(get_current_ops(), NumpyOps) or n_process < 2):
        return
    nlp = English()
    nlp.add_pipe("my_evil_component")
    texts = [
        ("TEXT 111", 111),
        ("TEXT 222", 222),
        ("TEXT 333", 333),
        ("TEXT 342", 342),
        ("TEXT 666", 666),
    ]
    with pytest.raises(ValueError):
        list(nlp.pipe(texts, as_tuples=True))

    nlp.set_error_handler(warn_error)
    spacy_logger = logging.getLogger("spacy")
    with mock.patch.object(spacy_logger, "warning") as mock_warning:
        tuples = list(nlp.pipe(texts, as_tuples=True, n_process=n_process))
        # HACK/TODO? the warnings in child processes don't seem to be
        # detected by the mock logger
        if n_process == 1:
            mock_warning.assert_called()
            assert mock_warning.call_count == 2
            assert len(tuples) + mock_warning.call_count == len(texts)
        # The first three results are the inputs whose text has no "2".
        expected_head = [("TEXT 111", 111), ("TEXT 333", 333), ("TEXT 666", 666)]
        for position, expected in enumerate(expected_head):
            doc, context = tuples[position][0], tuples[position][1]
            assert (doc.text, context) == expected
|
|
|
|
|
|
@pytest.mark.parametrize("n_process", [1, 2])
def test_language_pipe_error_handler_pipe(en_vocab, n_process):
    """Test the error handling of a component's pipe method"""
    Language.component("my_perhaps_sentences", func=perhaps_set_sentences)
    Language.component("assert_sents_error", func=assert_sents_error)
    if not (isinstance(get_current_ops(), NumpyOps) or n_process < 2):
        return
    texts = [f"{str(i)} is enough. Done" for i in range(100)]
    nlp = English()
    for component_name in ("my_perhaps_sentences", "assert_sents_error"):
        nlp.add_pipe(component_name)
    nlp.initialize()

    def run_pipe():
        return list(nlp.pipe(texts, n_process=n_process, batch_size=10))

    with pytest.raises(ValueError):
        # assert_sents_error requires sentence boundaries, will throw an error otherwise
        docs = run_pipe()
    nlp.set_error_handler(ignore_error)
    docs = run_pipe()
    # we lose/ignore the failing 4,40-49 docs
    assert len(docs) == 89
|
|
|
|
|
|
@pytest.mark.parametrize("n_process", [1, 2])
def test_language_pipe_error_handler_make_doc_actual(n_process):
    """Test the error handling for make_doc"""
    # TODO: fix so that the following test is the actual behavior

    if not (isinstance(get_current_ops(), NumpyOps) or n_process < 2):
        return
    nlp = English()
    nlp.max_length = 10
    # Every other text is longer than max_length.
    texts = ["12345678901234567890", "12345"] * 10

    def run_pipe():
        return list(nlp.pipe(texts, n_process=n_process))

    with pytest.raises(ValueError):
        run_pipe()
    nlp.default_error_handler = ignore_error
    if n_process != 1:
        docs = run_pipe()
        assert len(docs) == 0
    else:
        # With a single process the error is still raised.
        with pytest.raises(ValueError):
            run_pipe()
|
|
|
|
|
|
@pytest.mark.xfail
@pytest.mark.parametrize("n_process", [1, 2])
def test_language_pipe_error_handler_make_doc_preferred(n_process):
    """Test the error handling for make_doc"""

    if not (isinstance(get_current_ops(), NumpyOps) or n_process < 2):
        return
    nlp = English()
    nlp.max_length = 10
    # Every other text is longer than max_length.
    texts = ["12345678901234567890", "12345"] * 10

    def run_pipe():
        return list(nlp.pipe(texts, n_process=n_process))

    with pytest.raises(ValueError):
        run_pipe()
    nlp.default_error_handler = ignore_error
    docs = run_pipe()
    assert len(docs) == 0
|
|
|
|
|
|
def test_language_from_config_before_after_init():
    """Check that creation and initialization callbacks from the config run.

    Each registered callback flips its flag in ``ran`` and leaves a marker
    (on ``Defaults`` or in ``nlp.meta``) that is asserted on afterwards.
    """
    name = "test_language_from_config_before_after_init"
    ran = {
        "before": False,
        "after": False,
        "after_pipeline": False,
        "before_init": False,
        "after_init": False,
    }

    @registry.callbacks(f"{name}_before")
    def make_before_creation():
        def before_creation(lang_cls):
            ran["before"] = True
            assert lang_cls is English
            lang_cls.Defaults.foo = "bar"
            return lang_cls

        return before_creation

    @registry.callbacks(f"{name}_after")
    def make_after_creation():
        def after_creation(nlp):
            ran["after"] = True
            assert isinstance(nlp, English)
            assert nlp.pipe_names == []
            assert nlp.Defaults.foo == "bar"
            nlp.meta["foo"] = "bar"
            return nlp

        return after_creation

    @registry.callbacks(f"{name}_after_pipeline")
    def make_after_pipeline_creation():
        def after_pipeline_creation(nlp):
            ran["after_pipeline"] = True
            assert isinstance(nlp, English)
            assert nlp.pipe_names == ["sentencizer"]
            assert nlp.Defaults.foo == "bar"
            assert nlp.meta["foo"] == "bar"
            nlp.meta["bar"] = "baz"
            return nlp

        return after_pipeline_creation

    @registry.callbacks(f"{name}_before_init")
    def make_before_init():
        def before_init(nlp):
            ran["before_init"] = True
            nlp.meta["before_init"] = "before"
            return nlp

        return before_init

    @registry.callbacks(f"{name}_after_init")
    def make_after_init():
        def after_init(nlp):
            ran["after_init"] = True
            nlp.meta["after_init"] = "after"
            return nlp

        return after_init

    nlp_section = {
        "pipeline": ["sentencizer"],
        "before_creation": {"@callbacks": f"{name}_before"},
        "after_creation": {"@callbacks": f"{name}_after"},
        "after_pipeline_creation": {"@callbacks": f"{name}_after_pipeline"},
    }
    initialize_section = {
        "before_init": {"@callbacks": f"{name}_before_init"},
        "after_init": {"@callbacks": f"{name}_after_init"},
    }
    config = {
        "nlp": nlp_section,
        "components": {"sentencizer": {"factory": "sentencizer"}},
        "initialize": initialize_section,
    }

    nlp = English.from_config(config)
    # Markers left by the three creation callbacks.
    assert nlp.Defaults.foo == "bar"
    assert nlp.meta["foo"] == "bar"
    assert nlp.meta["bar"] == "baz"
    # The init callbacks have not left their markers yet.
    assert "before_init" not in nlp.meta
    assert "after_init" not in nlp.meta
    assert nlp.pipe_names == ["sentencizer"]
    assert nlp("text")

    nlp.initialize()
    assert nlp.meta["before_init"] == "before"
    assert nlp.meta["after_init"] == "after"
    assert all(
        [
            ran["before"],
            ran["after"],
            ran["after_pipeline"],
            ran["before_init"],
            ran["after_init"],
        ]
    )
|
|
|
|
|
|
def test_language_from_config_before_after_init_invalid():
    """Check that an error is raised if function doesn't return nlp."""
    name = "test_language_from_config_before_after_init_invalid"
    # before_creation callbacks receive the language class: one returns None,
    # the other calls the class and so returns an instance.
    registry.callbacks(f"{name}_before1", func=lambda: lambda nlp: None)
    registry.callbacks(f"{name}_before2", func=lambda: lambda nlp: nlp())
    # after_* callbacks receive the nlp object: one returns None, the other
    # returns the English class. The second was previously registered under
    # "_after1" again, which left "_after2" unregistered, so the "_after2"
    # cases below failed on the registry lookup instead of the return value.
    registry.callbacks(f"{name}_after1", func=lambda: lambda nlp: None)
    registry.callbacks(f"{name}_after2", func=lambda: lambda nlp: English)

    for callback_name in [f"{name}_before1", f"{name}_before2"]:
        config = {"nlp": {"before_creation": {"@callbacks": callback_name}}}
        with pytest.raises(ValueError):
            English.from_config(config)
    for callback_name in [f"{name}_after1", f"{name}_after2"]:
        config = {"nlp": {"after_creation": {"@callbacks": callback_name}}}
        with pytest.raises(ValueError):
            English.from_config(config)
    for callback_name in [f"{name}_after1", f"{name}_after2"]:
        config = {"nlp": {"after_pipeline_creation": {"@callbacks": callback_name}}}
        with pytest.raises(ValueError):
            English.from_config(config)
|
|
|
|
|
|
def test_language_whitespace_tokenizer():
    """Test the custom whitespace tokenizer from the docs."""

    class WhitespaceTokenizer:
        """Tokenizer that splits on single spaces and keeps the text reconstructible."""

        def __init__(self, vocab):
            self.vocab = vocab

        def __call__(self, text):
            pieces = text.split(" ")
            # Avoid zero-length tokens: an empty piece becomes a " " token
            # that is not followed by a space.
            words = [" " if piece == "" else piece for piece in pieces]
            spaces = [piece != "" for piece in pieces]
            if words[-1] == " ":
                # Remove the final trailing space
                words = words[:-1]
                spaces = spaces[:-1]
            else:
                spaces[-1] = False

            return Doc(self.vocab, words=words, spaces=spaces)

    nlp = spacy.blank("en")
    nlp.tokenizer = WhitespaceTokenizer(nlp.vocab)
    text = " What's happened to me? he thought. It wasn't a dream. "
    # The doc text must match the input exactly, surrounding spaces included.
    assert nlp(text).text == text
|
|
|
|
|
|
def test_language_custom_tokenizer():
    """Test that a fully custom tokenizer can be plugged in via the registry."""
    name = "test_language_custom_tokenizer"

    class CustomTokenizer:
        """Dummy "tokenizer" that splits on spaces and adds prefix to each word."""

        def __init__(self, nlp, prefix):
            self.vocab = nlp.vocab
            self.prefix = prefix

        def __call__(self, text):
            prefixed_words = []
            for word in text.split(" "):
                prefixed_words.append(f"{self.prefix}{word}")
            return Doc(self.vocab, words=prefixed_words)

    @registry.tokenizers(name)
    def custom_create_tokenizer(prefix: str = "_"):
        def create_tokenizer(nlp):
            return CustomTokenizer(nlp, prefix=prefix)

        return create_tokenizer

    nlp = English.from_config({"nlp": {"tokenizer": {"@tokenizers": name}}})
    expected_tokens = ["_hello", "_world"]

    # Calling the pipeline directly ...
    called_doc = nlp("hello world")
    assert [t.text for t in called_doc] == expected_tokens
    # ... and going through nlp.pipe both use the custom tokenizer.
    piped_doc = list(nlp.pipe(["hello world"]))[0]
    assert [t.text for t in piped_doc] == expected_tokens
|
|
|
|
|
|
def test_language_from_config_invalid_lang():
    """Test that calling Language.from_config raises an error and lang defined
    in config needs to match language-specific subclasses."""
    config = {"nlp": {"lang": "en"}}
    # Neither the base class nor the German subclass accepts lang "en".
    for language_cls in (Language, German):
        with pytest.raises(ValueError):
            language_cls.from_config(config)
|
|
|
|
|
|
def test_spacy_blank():
    """Check that ``spacy.blank`` applies config and meta overrides."""
    default_nlp = spacy.blank("en")
    assert default_nlp.config["training"]["dropout"] == 0.1

    custom_nlp = spacy.blank(
        "en",
        config={"training": {"dropout": 0.2}},
        meta={"name": "my_custom_model"},
    )
    assert custom_nlp.config["training"]["dropout"] == 0.2
    assert custom_nlp.meta["name"] == "my_custom_model"
|
|
|
|
|
|
@pytest.mark.parametrize(
    "lang,target",
    [
        ("en", "en"),
        ("fra", "fr"),
        ("fre", "fr"),
        ("iw", "he"),
        ("is", "isl"),
        ("mo", "ro"),
        ("mul", "mul"),
        ("no", "nb"),
        ("pt-BR", "pt"),
        ("xx", "mul"),
        ("zh-Hans", "zh"),
        ("zh-Hant", None),
        ("zxx", None),
    ],
)
def test_language_matching(lang, target):
    """
    Test that we can look up languages by equivalent or nearly-equivalent
    language codes.
    """
    matched = find_matching_language(lang)
    assert matched == target
|
|
|
|
|
|
@pytest.mark.parametrize(
    "lang,target",
    [
        ("en", "en"),
        ("fra", "fr"),
        ("fre", "fr"),
        ("iw", "he"),
        ("is", "isl"),
        ("mo", "ro"),
        ("xx", "mul"),
        ("no", "nb"),
        ("pt-BR", "pt"),
        ("zh-Hans", "zh"),
    ],
)
def test_blank_languages(lang, target):
    """
    Test that we can get spacy.blank in various languages, including codes
    that are defined to be equivalent or that match by CLDR language matching.
    """
    blank_nlp = spacy.blank(lang)
    assert blank_nlp.lang == target
|
|
|
|
|
|
@pytest.mark.parametrize("value", [False, None, ["x", "y"], Language, Vocab])
def test_language_init_invalid_vocab(value):
    """Check that ``Language(value)`` raises a ValueError mentioning "invalid value"."""
    with pytest.raises(ValueError) as excinfo:
        Language(value)
    message = str(excinfo.value)
    assert "invalid value" in message
|
|
|
|
|
|
def test_language_source_and_vectors(nlp2):
    """Source a textcat into ``nlp2`` from a pipeline with different vectors.

    Sourcing is expected to warn, to copy the source's strings into ``nlp2``
    and to leave the source pipeline's vectors unchanged.
    """
    source_nlp = Language(Vocab())
    textcat = source_nlp.add_pipe("textcat")
    for label in ("POSITIVE", "NEGATIVE"):
        textcat.add_label(label)
    source_nlp.initialize()

    long_string = "thisisalongstring"
    for pipeline in (source_nlp, nlp2):
        assert long_string not in pipeline.vocab.strings
    source_nlp.vocab.strings.add(long_string)

    assert source_nlp.vocab.vectors.to_bytes() != nlp2.vocab.vectors.to_bytes()
    vectors_bytes = source_nlp.vocab.vectors.to_bytes()
    with pytest.warns(UserWarning):
        nlp2.add_pipe("textcat", name="textcat2", source=source_nlp)
    # strings should be added
    assert long_string in nlp2.vocab.strings
    # vectors should remain unmodified
    assert source_nlp.vocab.vectors.to_bytes() == vectors_bytes
|
|
|
|
|
|
@pytest.mark.parametrize("n_process", [1, 2])
def test_pass_doc_to_pipeline(nlp, n_process):
    """Check that Doc objects can be passed to ``nlp`` and ``nlp.pipe``.

    The ``nlp.pipe`` part only runs when the current ops are NumpyOps or
    ``n_process`` is 1.
    """
    texts = ["cats", "dogs", "guinea pigs"]
    docs = [nlp.make_doc(text) for text in texts]
    assert not any(len(doc.cats) for doc in docs)
    doc = nlp(docs[0])
    assert doc.text == texts[0]
    assert len(doc.cats) > 0
    if isinstance(get_current_ops(), NumpyOps) or n_process < 2:
        # nlp.pipe returns a generator. Materialize it so both assertions see
        # every doc; iterating the generator twice left the second check
        # running over an already exhausted iterator (vacuously true).
        docs = list(nlp.pipe(docs, n_process=n_process))
        assert [doc.text for doc in docs] == texts
        assert all(len(doc.cats) for doc in docs)
|
|
|
|
|
|
def test_invalid_arg_to_pipeline(nlp):
    """Check which argument types ``nlp`` and ``nlp.pipe`` reject with ValueError."""
    str_list = ["This is a text.", "This is another."]
    int_list = [1, 2, 3]

    # A list of strings is rejected by nlp() but accepted by nlp.pipe().
    with pytest.raises(ValueError):
        nlp(str_list)  # type: ignore
    piped = list(nlp.pipe(str_list))
    assert len(piped) == 2

    # A list of ints is rejected by both.
    with pytest.raises(ValueError):
        list(nlp.pipe(int_list))  # type: ignore
    with pytest.raises(ValueError):
        nlp(int_list)  # type: ignore
|
|
|
|
|
|
@pytest.mark.skipif(
    not isinstance(get_current_ops(), CupyOps), reason="test requires GPU"
)
def test_multiprocessing_gpu_warning(nlp2, texts):
    """Expect a UserWarning and a ValueError when piping with two processes.

    The test is skipped unless the current ops are CupyOps.
    """
    repeated_texts = texts * 10
    docs = nlp2.pipe(repeated_texts, n_process=2, batch_size=2)

    with pytest.warns(UserWarning, match="multiprocessing with GPU models"):
        with pytest.raises(ValueError):
            # Trigger multi-processing.
            for _ in docs:
                pass
|
|
|
|
|
|
def test_dot_in_factory_names(nlp):
    """Check that component and factory names containing dots are rejected."""
    # A name without dots registers and can be added to the pipeline.
    Language.component("my_evil_component", func=evil_component)
    nlp.add_pipe("my_evil_component")

    dotted_name = "my.evil.component.v1"
    for register in (Language.component, Language.factory):
        with pytest.raises(ValueError, match="not permitted"):
            register(dotted_name, func=evil_component)
|
|
|
|
|
|
def test_component_return():
    """Test that an error is raised if components return a type other than a
    doc."""
    nlp = English()

    @Language.component("test_component_good_pipe")
    def good_pipe(doc):
        return doc

    @Language.component("test_component_bad_pipe")
    def bad_pipe(doc):
        # Returns a string instead of the Doc.
        return doc.text

    good_name = "test_component_good_pipe"
    nlp.add_pipe(good_name)
    nlp("text")
    nlp.remove_pipe(good_name)

    nlp.add_pipe("test_component_bad_pipe")
    with pytest.raises(ValueError, match="instead of a Doc"):
        nlp("text")
|
|
|
|
|
|
@pytest.mark.slow
@pytest.mark.parametrize("teacher_tagger_name", ["tagger", "teacher_tagger"])
def test_distill(teacher_tagger_name):
    """Train a teacher tagger, distill it into a student, and check the student's tags.

    The teacher's tagger is named by ``teacher_tagger_name``; when that differs
    from the student's tagger name, a student-to-teacher mapping is passed to
    ``student.distill``.
    """
    n_iterations = 50
    max_loss = 0.00001

    teacher = English()
    teacher_tagger = teacher.add_pipe("tagger", name=teacher_tagger_name)
    train_examples = [
        Example.from_dict(teacher.make_doc(text), annots)
        for text, annots in TAGGER_TRAIN_DATA
    ]

    optimizer = teacher.initialize(get_examples=lambda: train_examples)

    for _ in range(n_iterations):
        losses = {}
        teacher.update(train_examples, sgd=optimizer, losses=losses)
    # Only the losses of the last iteration are checked.
    assert losses[teacher_tagger_name] < max_loss

    student = English()
    student_tagger = student.add_pipe("tagger")
    student_tagger.min_tree_freq = 1
    student_tagger.initialize(
        get_examples=lambda: train_examples, labels=teacher_tagger.label_data
    )

    # The distillation examples carry no gold annotations.
    distill_examples = []
    for text, _annots in TAGGER_TRAIN_DATA:
        distill_examples.append(Example.from_dict(teacher.make_doc(text), {}))

    if teacher_tagger.name == student_tagger.name:
        student_to_teacher = None
    else:
        student_to_teacher = {student_tagger.name: teacher_tagger.name}

    for _ in range(n_iterations):
        losses = {}
        student.distill(
            teacher,
            distill_examples,
            sgd=optimizer,
            losses=losses,
            student_to_teacher=student_to_teacher,
        )
    assert losses["tagger"] < max_loss

    doc = student("I like blue eggs")
    assert doc[0].tag_ == "N"
    assert doc[1].tag_ == "V"
    assert doc[2].tag_ == "J"
    assert doc[3].tag_ == "N"

    # Do an extra update to check if annotates works, though we can't really
    # validate the results, since the annotations are ephemeral.
    student.distill(
        teacher,
        distill_examples,
        sgd=optimizer,
        losses=losses,
        student_to_teacher=student_to_teacher,
        annotates=["tagger"],
    )
|