mirror of
https://github.com/explosion/spaCy.git
synced 2025-06-05 21:53:05 +03:00
Handle errors while multiprocessing (#8004)
* Handle errors while multiprocessing

  Handle errors while multiprocessing without hanging.

  * Return the traceback for errors raised while processing a batch, which can be handled by the top-level error handler
  * Allow for shortened batches due to custom error handlers that ignore errors and skip documents

* Define custom components at a higher level
* Also move up custom error handler
* Use simpler component for test
* Switch error type
* Adjust test
* Only call top-level error handler for exceptions
* Register custom test components within tests

  Use global functions (so they can be pickled) but register the components only within the individual tests.
This commit is contained in:
parent
8a2602051c
commit
b120fb3511
|
@ -490,6 +490,7 @@ class Errors:
|
||||||
E202 = ("Unsupported alignment mode '{mode}'. Supported modes: {modes}.")
|
E202 = ("Unsupported alignment mode '{mode}'. Supported modes: {modes}.")
|
||||||
|
|
||||||
# New errors added in v3.x
|
# New errors added in v3.x
|
||||||
|
E871 = ("Error encountered in nlp.pipe with multiprocessing:\n\n{error}")
|
||||||
E872 = ("Unable to copy tokenizer from base model due to different "
|
E872 = ("Unable to copy tokenizer from base model due to different "
|
||||||
'tokenizer settings: current tokenizer config "{curr_config}" '
|
'tokenizer settings: current tokenizer config "{curr_config}" '
|
||||||
'vs. base model "{base_config}"')
|
'vs. base model "{base_config}"')
|
||||||
|
|
|
@ -13,6 +13,7 @@ import srsly
|
||||||
import multiprocessing as mp
|
import multiprocessing as mp
|
||||||
from itertools import chain, cycle
|
from itertools import chain, cycle
|
||||||
from timeit import default_timer as timer
|
from timeit import default_timer as timer
|
||||||
|
import traceback
|
||||||
|
|
||||||
from .tokens.underscore import Underscore
|
from .tokens.underscore import Underscore
|
||||||
from .vocab import Vocab, create_vocab
|
from .vocab import Vocab, create_vocab
|
||||||
|
@ -1521,11 +1522,15 @@ class Language:
|
||||||
|
|
||||||
# Cycle channels not to break the order of docs.
|
# Cycle channels not to break the order of docs.
|
||||||
# The received object is a batch of byte-encoded docs, so flatten them with chain.from_iterable.
|
# The received object is a batch of byte-encoded docs, so flatten them with chain.from_iterable.
|
||||||
byte_docs = chain.from_iterable(recv.recv() for recv in cycle(bytedocs_recv_ch))
|
byte_tuples = chain.from_iterable(recv.recv() for recv in cycle(bytedocs_recv_ch))
|
||||||
docs = (Doc(self.vocab).from_bytes(byte_doc) for byte_doc in byte_docs)
|
|
||||||
try:
|
try:
|
||||||
for i, (_, doc) in enumerate(zip(raw_texts, docs), 1):
|
for i, (_, (byte_doc, byte_error)) in enumerate(zip(raw_texts, byte_tuples), 1):
|
||||||
|
if byte_doc is not None:
|
||||||
|
doc = Doc(self.vocab).from_bytes(byte_doc)
|
||||||
yield doc
|
yield doc
|
||||||
|
elif byte_error is not None:
|
||||||
|
error = srsly.msgpack_loads(byte_error)
|
||||||
|
self.default_error_handler(None, None, None, ValueError(Errors.E871.format(error=error)))
|
||||||
if i % batch_size == 0:
|
if i % batch_size == 0:
|
||||||
# tell `sender` that one batch was consumed.
|
# tell `sender` that one batch was consumed.
|
||||||
sender.step()
|
sender.step()
|
||||||
|
@ -2019,12 +2024,19 @@ def _apply_pipes(
|
||||||
"""
|
"""
|
||||||
Underscore.load_state(underscore_state)
|
Underscore.load_state(underscore_state)
|
||||||
while True:
|
while True:
|
||||||
|
try:
|
||||||
texts = receiver.get()
|
texts = receiver.get()
|
||||||
docs = (make_doc(text) for text in texts)
|
docs = (make_doc(text) for text in texts)
|
||||||
for pipe in pipes:
|
for pipe in pipes:
|
||||||
docs = pipe(docs)
|
docs = pipe(docs)
|
||||||
# Connection does not accept unpickable objects, so send list.
|
# Connection does not accept unpickable objects, so send list.
|
||||||
sender.send([doc.to_bytes() for doc in docs])
|
byte_docs = [(doc.to_bytes(), None) for doc in docs]
|
||||||
|
padding = [(None, None)] * (len(texts) - len(byte_docs))
|
||||||
|
sender.send(byte_docs + padding)
|
||||||
|
except Exception:
|
||||||
|
error_msg = [(None, srsly.msgpack_dumps(traceback.format_exc()))]
|
||||||
|
padding = [(None, None)] * (len(texts) - 1)
|
||||||
|
sender.send(error_msg + padding)
|
||||||
|
|
||||||
|
|
||||||
class _Sender:
|
class _Sender:
|
||||||
|
|
|
@ -8,13 +8,36 @@ from spacy.vocab import Vocab
|
||||||
from spacy.training import Example
|
from spacy.training import Example
|
||||||
from spacy.lang.en import English
|
from spacy.lang.en import English
|
||||||
from spacy.lang.de import German
|
from spacy.lang.de import German
|
||||||
from spacy.util import registry, ignore_error, raise_error
|
from spacy.util import registry, ignore_error, raise_error, logger
|
||||||
import spacy
|
import spacy
|
||||||
from thinc.api import NumpyOps, get_current_ops
|
from thinc.api import NumpyOps, get_current_ops
|
||||||
|
|
||||||
from .util import add_vecs_to_vocab, assert_docs_equal
|
from .util import add_vecs_to_vocab, assert_docs_equal
|
||||||
|
|
||||||
|
|
||||||
|
def evil_component(doc):
|
||||||
|
if "2" in doc.text:
|
||||||
|
raise ValueError("no dice")
|
||||||
|
return doc
|
||||||
|
|
||||||
|
|
||||||
|
def perhaps_set_sentences(doc):
|
||||||
|
if not doc.text.startswith("4"):
|
||||||
|
doc[-1].is_sent_start = True
|
||||||
|
return doc
|
||||||
|
|
||||||
|
|
||||||
|
def assert_sents_error(doc):
|
||||||
|
if not doc.has_annotation("SENT_START"):
|
||||||
|
raise ValueError("no sents")
|
||||||
|
return doc
|
||||||
|
|
||||||
|
|
||||||
|
def warn_error(proc_name, proc, docs, e):
|
||||||
|
logger = logging.getLogger("spacy")
|
||||||
|
logger.warning(f"Trouble with component {proc_name}.")
|
||||||
|
|
||||||
|
|
||||||
@pytest.fixture
|
@pytest.fixture
|
||||||
def nlp():
|
def nlp():
|
||||||
nlp = Language(Vocab())
|
nlp = Language(Vocab())
|
||||||
|
@ -93,19 +116,16 @@ def test_evaluate_no_pipe(nlp):
|
||||||
nlp.evaluate([Example.from_dict(doc, annots)])
|
nlp.evaluate([Example.from_dict(doc, annots)])
|
||||||
|
|
||||||
|
|
||||||
@Language.component("test_language_vector_modification_pipe")
|
|
||||||
def vector_modification_pipe(doc):
|
def vector_modification_pipe(doc):
|
||||||
doc.vector += 1
|
doc.vector += 1
|
||||||
return doc
|
return doc
|
||||||
|
|
||||||
|
|
||||||
@Language.component("test_language_userdata_pipe")
|
|
||||||
def userdata_pipe(doc):
|
def userdata_pipe(doc):
|
||||||
doc.user_data["foo"] = "bar"
|
doc.user_data["foo"] = "bar"
|
||||||
return doc
|
return doc
|
||||||
|
|
||||||
|
|
||||||
@Language.component("test_language_ner_pipe")
|
|
||||||
def ner_pipe(doc):
|
def ner_pipe(doc):
|
||||||
span = Span(doc, 0, 1, label="FIRST")
|
span = Span(doc, 0, 1, label="FIRST")
|
||||||
doc.ents += (span,)
|
doc.ents += (span,)
|
||||||
|
@ -123,6 +143,9 @@ def sample_vectors():
|
||||||
|
|
||||||
@pytest.fixture
|
@pytest.fixture
|
||||||
def nlp2(nlp, sample_vectors):
|
def nlp2(nlp, sample_vectors):
|
||||||
|
Language.component("test_language_vector_modification_pipe", func=vector_modification_pipe)
|
||||||
|
Language.component("test_language_userdata_pipe", func=userdata_pipe)
|
||||||
|
Language.component("test_language_ner_pipe", func=ner_pipe)
|
||||||
add_vecs_to_vocab(nlp.vocab, sample_vectors)
|
add_vecs_to_vocab(nlp.vocab, sample_vectors)
|
||||||
nlp.add_pipe("test_language_vector_modification_pipe")
|
nlp.add_pipe("test_language_vector_modification_pipe")
|
||||||
nlp.add_pipe("test_language_ner_pipe")
|
nlp.add_pipe("test_language_ner_pipe")
|
||||||
|
@ -168,8 +191,11 @@ def test_language_pipe_stream(nlp2, n_process, texts):
|
||||||
assert_docs_equal(doc, expected_doc)
|
assert_docs_equal(doc, expected_doc)
|
||||||
|
|
||||||
|
|
||||||
def test_language_pipe_error_handler():
|
@pytest.mark.parametrize("n_process", [1, 2])
|
||||||
|
def test_language_pipe_error_handler(n_process):
|
||||||
"""Test that the error handling of nlp.pipe works well"""
|
"""Test that the error handling of nlp.pipe works well"""
|
||||||
|
ops = get_current_ops()
|
||||||
|
if isinstance(ops, NumpyOps) or n_process < 2:
|
||||||
nlp = English()
|
nlp = English()
|
||||||
nlp.add_pipe("merge_subtokens")
|
nlp.add_pipe("merge_subtokens")
|
||||||
nlp.initialize()
|
nlp.initialize()
|
||||||
|
@ -178,34 +204,25 @@ def test_language_pipe_error_handler():
|
||||||
with pytest.raises(ValueError):
|
with pytest.raises(ValueError):
|
||||||
nlp(texts[0])
|
nlp(texts[0])
|
||||||
with pytest.raises(ValueError):
|
with pytest.raises(ValueError):
|
||||||
list(nlp.pipe(texts))
|
list(nlp.pipe(texts, n_process=n_process))
|
||||||
nlp.set_error_handler(raise_error)
|
nlp.set_error_handler(raise_error)
|
||||||
with pytest.raises(ValueError):
|
with pytest.raises(ValueError):
|
||||||
list(nlp.pipe(texts))
|
list(nlp.pipe(texts, n_process=n_process))
|
||||||
# set explicitely to ignoring
|
# set explicitely to ignoring
|
||||||
nlp.set_error_handler(ignore_error)
|
nlp.set_error_handler(ignore_error)
|
||||||
docs = list(nlp.pipe(texts))
|
docs = list(nlp.pipe(texts, n_process=n_process))
|
||||||
assert len(docs) == 0
|
assert len(docs) == 0
|
||||||
nlp(texts[0])
|
nlp(texts[0])
|
||||||
|
|
||||||
|
|
||||||
def test_language_pipe_error_handler_custom(en_vocab):
|
@pytest.mark.parametrize("n_process", [1, 2])
|
||||||
|
def test_language_pipe_error_handler_custom(en_vocab, n_process):
|
||||||
"""Test the error handling of a custom component that has no pipe method"""
|
"""Test the error handling of a custom component that has no pipe method"""
|
||||||
|
Language.component("my_evil_component", func=evil_component)
|
||||||
@Language.component("my_evil_component")
|
ops = get_current_ops()
|
||||||
def evil_component(doc):
|
if isinstance(ops, NumpyOps) or n_process < 2:
|
||||||
if "2" in doc.text:
|
|
||||||
raise ValueError("no dice")
|
|
||||||
return doc
|
|
||||||
|
|
||||||
def warn_error(proc_name, proc, docs, e):
|
|
||||||
from spacy.util import logger
|
|
||||||
|
|
||||||
logger.warning(f"Trouble with component {proc_name}.")
|
|
||||||
|
|
||||||
nlp = English()
|
nlp = English()
|
||||||
nlp.add_pipe("my_evil_component")
|
nlp.add_pipe("my_evil_component")
|
||||||
nlp.initialize()
|
|
||||||
texts = ["TEXT 111", "TEXT 222", "TEXT 333", "TEXT 342", "TEXT 666"]
|
texts = ["TEXT 111", "TEXT 222", "TEXT 333", "TEXT 342", "TEXT 666"]
|
||||||
with pytest.raises(ValueError):
|
with pytest.raises(ValueError):
|
||||||
# the evil custom component throws an error
|
# the evil custom component throws an error
|
||||||
|
@ -214,36 +231,75 @@ def test_language_pipe_error_handler_custom(en_vocab):
|
||||||
nlp.set_error_handler(warn_error)
|
nlp.set_error_handler(warn_error)
|
||||||
logger = logging.getLogger("spacy")
|
logger = logging.getLogger("spacy")
|
||||||
with mock.patch.object(logger, "warning") as mock_warning:
|
with mock.patch.object(logger, "warning") as mock_warning:
|
||||||
# the errors by the evil custom component raise a warning for each bad batch
|
# the errors by the evil custom component raise a warning for each
|
||||||
docs = list(nlp.pipe(texts))
|
# bad doc
|
||||||
|
docs = list(nlp.pipe(texts, n_process=n_process))
|
||||||
|
# HACK/TODO? the warnings in child processes don't seem to be
|
||||||
|
# detected by the mock logger
|
||||||
|
if n_process == 1:
|
||||||
mock_warning.assert_called()
|
mock_warning.assert_called()
|
||||||
assert mock_warning.call_count == 2
|
assert mock_warning.call_count == 2
|
||||||
assert len(docs) + mock_warning.call_count == len(texts)
|
assert len(docs) + mock_warning.call_count == len(texts)
|
||||||
assert [doc.text for doc in docs] == ["TEXT 111", "TEXT 333", "TEXT 666"]
|
assert [doc.text for doc in docs] == ["TEXT 111", "TEXT 333", "TEXT 666"]
|
||||||
|
|
||||||
|
|
||||||
def test_language_pipe_error_handler_pipe(en_vocab):
|
@pytest.mark.parametrize("n_process", [1, 2])
|
||||||
|
def test_language_pipe_error_handler_pipe(en_vocab, n_process):
|
||||||
"""Test the error handling of a component's pipe method"""
|
"""Test the error handling of a component's pipe method"""
|
||||||
|
Language.component("my_perhaps_sentences", func=perhaps_set_sentences)
|
||||||
@Language.component("my_sentences")
|
Language.component("assert_sents_error", func=assert_sents_error)
|
||||||
def perhaps_set_sentences(doc):
|
ops = get_current_ops()
|
||||||
if not doc.text.startswith("4"):
|
if isinstance(ops, NumpyOps) or n_process < 2:
|
||||||
doc[-1].is_sent_start = True
|
|
||||||
return doc
|
|
||||||
|
|
||||||
texts = [f"{str(i)} is enough. Done" for i in range(100)]
|
texts = [f"{str(i)} is enough. Done" for i in range(100)]
|
||||||
nlp = English()
|
nlp = English()
|
||||||
nlp.add_pipe("my_sentences")
|
nlp.add_pipe("my_perhaps_sentences")
|
||||||
entity_linker = nlp.add_pipe("entity_linker", config={"entity_vector_length": 3})
|
nlp.add_pipe("assert_sents_error")
|
||||||
entity_linker.kb.add_entity(entity="Q1", freq=12, entity_vector=[1, 2, 3])
|
|
||||||
nlp.initialize()
|
nlp.initialize()
|
||||||
with pytest.raises(ValueError):
|
with pytest.raises(ValueError):
|
||||||
# the entity linker requires sentence boundaries, will throw an error otherwise
|
# assert_sents_error requires sentence boundaries, will throw an error otherwise
|
||||||
docs = list(nlp.pipe(texts, batch_size=10))
|
docs = list(nlp.pipe(texts, n_process=n_process, batch_size=10))
|
||||||
nlp.set_error_handler(ignore_error)
|
nlp.set_error_handler(ignore_error)
|
||||||
docs = list(nlp.pipe(texts, batch_size=10))
|
docs = list(nlp.pipe(texts, n_process=n_process, batch_size=10))
|
||||||
# we lose/ignore the failing 0-9 and 40-49 batches
|
# we lose/ignore the failing 4,40-49 docs
|
||||||
assert len(docs) == 80
|
assert len(docs) == 89
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.mark.parametrize("n_process", [1, 2])
|
||||||
|
def test_language_pipe_error_handler_make_doc_actual(n_process):
|
||||||
|
"""Test the error handling for make_doc"""
|
||||||
|
# TODO: fix so that the following test is the actual behavior
|
||||||
|
|
||||||
|
ops = get_current_ops()
|
||||||
|
if isinstance(ops, NumpyOps) or n_process < 2:
|
||||||
|
nlp = English()
|
||||||
|
nlp.max_length = 10
|
||||||
|
texts = ["12345678901234567890", "12345"] * 10
|
||||||
|
with pytest.raises(ValueError):
|
||||||
|
list(nlp.pipe(texts, n_process=n_process))
|
||||||
|
nlp.default_error_handler = ignore_error
|
||||||
|
if n_process == 1:
|
||||||
|
with pytest.raises(ValueError):
|
||||||
|
list(nlp.pipe(texts, n_process=n_process))
|
||||||
|
else:
|
||||||
|
docs = list(nlp.pipe(texts, n_process=n_process))
|
||||||
|
assert len(docs) == 0
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.mark.xfail
|
||||||
|
@pytest.mark.parametrize("n_process", [1, 2])
|
||||||
|
def test_language_pipe_error_handler_make_doc_preferred(n_process):
|
||||||
|
"""Test the error handling for make_doc"""
|
||||||
|
|
||||||
|
ops = get_current_ops()
|
||||||
|
if isinstance(ops, NumpyOps) or n_process < 2:
|
||||||
|
nlp = English()
|
||||||
|
nlp.max_length = 10
|
||||||
|
texts = ["12345678901234567890", "12345"] * 10
|
||||||
|
with pytest.raises(ValueError):
|
||||||
|
list(nlp.pipe(texts, n_process=n_process))
|
||||||
|
nlp.default_error_handler = ignore_error
|
||||||
|
docs = list(nlp.pipe(texts, n_process=n_process))
|
||||||
|
assert len(docs) == 0
|
||||||
|
|
||||||
|
|
||||||
def test_language_from_config_before_after_init():
|
def test_language_from_config_before_after_init():
|
||||||
|
|
Loading…
Reference in New Issue
Block a user