multiprocessing pipe (#1303) (#4371)

* refactor: separate formatting docs and golds in Language.update

* fix return typo

* add pipe test

* unpickleable object cannot be assigned to p.map

* passed test pipe

* passed test!

* pipe terminate

* try pipe

* passed test

* fix ch

* add comments

* fix len(texts)

* add comment

* add comment

* fix: multiprocessing of pipe is not supported in Python 2

* test: use assert_docs_equal

* fix: is_python3 -> is_python2

* fix: change _pipe arg to use functools.partial

* test: add vector modification test

* test: add sample ner_pipe and user_data pipe

* add warnings test

* test: fix user warnings

* test: fix warnings capture

* fix: remove islice import

* test: remove warnings test

* test: add stream test

* test: rename

* fix: multiproc stream

* fix: stream pipe

* add comment

* mp.Pipe seems to be usable with relatively small data

* test: skip stream test in python2

* sort imports

* test: add reason to skiptest

* fix: use pipe for docs communication

* add comments

* add comment
tamuhey 2019-10-08 19:20:55 +09:00 committed by Matthew Honnibal
parent 14841d0aa6
commit 650cbfe82d
3 changed files with 198 additions and 10 deletions
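
In practice, everything below is driven by the new `n_process` argument on `Language.pipe`. A minimal usage sketch (the blank model and the texts are illustrative, not part of this commit):

import spacy

nlp = spacy.blank("en")  # illustrative; any pipeline works
texts = ["Hello world.", "This is spacy."] * 100
# n_process=2 forks two worker processes (Python 3 only); -1 uses all CPUs.
for doc in nlp.pipe(texts, n_process=2, batch_size=8):
    print(doc[0].text)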

View File

@@ -95,6 +95,7 @@ class Warnings(object):
"you can ignore this warning by setting SPACY_WARNING_IGNORE=W022. "
"If this is surprising, make sure you have the spacy-lookups-data "
"package installed.")
W023 = ("Multiprocessing of Language.pipe is not supported in Python2. "
"'n_process' will be set to 1.")
@add_codes
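
For context, the `@add_codes` decorator seen above is what prefixes each message with its code on attribute lookup, so `Warnings.W023` resolves to a string starting with "[W023]". A simplified sketch of the idea, not spaCy's exact implementation:

def add_codes(err_cls):
    # Return a proxy whose attribute lookups prepend the code to the message.
    class ErrorsWithCodes(object):
        def __getattribute__(self, code):
            msg = getattr(err_cls, code)
            return "[{}] {}".format(code, msg)
    return ErrorsWithCodes()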

View File

@@ -1,8 +1,11 @@
# coding: utf8
from __future__ import absolute_import, unicode_literals
import atexit
import random
import itertools
from warnings import warn
from spacy.util import minibatch
import weakref
import functools
from collections import OrderedDict
@@ -10,6 +13,8 @@ from contextlib import contextmanager
from copy import copy, deepcopy
from thinc.neural import Model
import srsly
import multiprocessing as mp
from itertools import chain, cycle
from .tokenizer import Tokenizer
from .vocab import Vocab
@@ -21,7 +26,7 @@ from .pipeline import SimilarityHook, TextCategorizer, Sentencizer
from .pipeline import merge_noun_chunks, merge_entities, merge_subtokens
from .pipeline import EntityRuler
from .pipeline import Morphologizer
from .compat import izip, basestring_
from .compat import izip, basestring_, is_python2
from .gold import GoldParse
from .scorer import Scorer
from ._ml import link_vectors_to_models, create_default_optimizer
@@ -30,8 +35,9 @@ from .lang.punctuation import TOKENIZER_PREFIXES, TOKENIZER_SUFFIXES
from .lang.punctuation import TOKENIZER_INFIXES
from .lang.tokenizer_exceptions import TOKEN_MATCH
from .lang.tag_map import TAG_MAP
from .tokens import Doc
from .lang.lex_attrs import LEX_ATTRS, is_stop
from .errors import Errors, Warnings, deprecation_warning
from .errors import Errors, Warnings, deprecation_warning, user_warning
from . import util
from . import about
@@ -733,6 +739,7 @@ class Language(object):
disable=[],
cleanup=False,
component_cfg=None,
n_process=1,
):
"""Process texts as a stream, and yield `Doc` objects in order.
@@ -746,12 +753,20 @@
use. Experimental.
component_cfg (dict): An optional dictionary with extra keyword
arguments for specific components.
n_process (int): Number of processes to use when processing texts; only
supported in Python 3. If set to -1, `multiprocessing.cpu_count()` is used.
YIELDS (Doc): Documents in the order of the original text.
DOCS: https://spacy.io/api/language#pipe
"""
# raw_texts is used later to stop iteration.
texts, raw_texts = itertools.tee(texts)
if is_python2 and n_process != 1:
user_warning(Warnings.W023)
n_process = 1
if n_threads != -1:
deprecation_warning(Warnings.W016)
if n_process == -1:
n_process = mp.cpu_count()
if as_tuples:
text_context1, text_context2 = itertools.tee(texts)
texts = (tc[0] for tc in text_context1)
@@ -765,9 +780,12 @@
for doc, context in izip(docs, contexts):
yield (doc, context)
return
docs = (self.make_doc(text) for text in texts)
if component_cfg is None:
component_cfg = {}
pipes = []  # Contains functools.partial objects so that multiprocess workers can be created easily.
for name, proc in self.pipeline:
if name in disable:
continue
@@ -775,10 +793,20 @@
# Allow component_cfg to overwrite the top-level kwargs.
kwargs.setdefault("batch_size", batch_size)
if hasattr(proc, "pipe"):
docs = proc.pipe(docs, **kwargs)
f = functools.partial(proc.pipe, **kwargs)
else:
# Apply the function, but yield the doc
docs = _pipe(proc, docs, kwargs)
f = functools.partial(_pipe, proc=proc, kwargs=kwargs)
pipes.append(f)
if n_process != 1:
docs = self._multiprocessing_pipe(texts, pipes, n_process, batch_size)
else:
# If n_process == 1, no extra processes are forked.
docs = (self.make_doc(text) for text in texts)
for pipe in pipes:
docs = pipe(docs)
# Track weakrefs of "recent" documents, so that we can see when they
# expire from memory. When they do, we know we don't need old strings.
# This way, we avoid maintaining an unbounded growth in string entries
@@ -809,6 +837,46 @@
self.tokenizer._reset_cache(keys)
nr_seen = 0
def _multiprocessing_pipe(self, texts, pipes, n_process, batch_size):
# raw_texts is used later to stop iteration.
texts, raw_texts = itertools.tee(texts)
# Queues for sending texts to the workers.
texts_q = [mp.Queue() for _ in range(n_process)]
# Channels for receiving byte-encoded docs from the workers.
bytedocs_recv_ch, bytedocs_send_ch = zip(
*[mp.Pipe(False) for _ in range(n_process)]
)
batch_texts = minibatch(texts, batch_size)
# The sender dispatches texts to the workers in chunks.
# This is necessary to properly handle texts of infinite length,
# in which case all the data cannot be sent to the workers at once.
sender = _Sender(batch_texts, texts_q, chunk_size=n_process)
# Send twice up front so that the worker processes are kept busy.
sender.send()
sender.send()
procs = [
mp.Process(target=_apply_pipes, args=(self.make_doc, pipes, rch, sch))
for rch, sch in zip(texts_q, bytedocs_send_ch)
]
for proc in procs:
proc.start()
# Cycle over the channels so that the order of the docs is preserved.
# Each received object is a batch of byte-encoded docs, so flatten them with chain.from_iterable.
byte_docs = chain.from_iterable(recv.recv() for recv in cycle(bytedocs_recv_ch))
docs = (Doc(self.vocab).from_bytes(byte_doc) for byte_doc in byte_docs)
try:
for i, (_, doc) in enumerate(zip(raw_texts, docs), 1):
yield doc
if i % batch_size == 0:
# Tell the sender that one batch was consumed.
sender.step()
finally:
for proc in procs:
proc.terminate()
def to_disk(self, path, exclude=tuple(), disable=None):
"""Save the current state to a directory. If a model is loaded, this
will include the model.
@@ -987,12 +1055,55 @@ class DisabledPipes(list):
self[:] = []
def _pipe(func, docs, kwargs):
def _pipe(docs, proc, kwargs):
# We added some args for pipe that __call__ doesn't expect.
kwargs = dict(kwargs)
for arg in ["n_threads", "batch_size"]:
if arg in kwargs:
kwargs.pop(arg)
for doc in docs:
doc = func(doc, **kwargs)
doc = proc(doc, **kwargs)
yield doc
def _apply_pipes(make_doc, pipes, receiver, sender):
"""Worker for Language.pipe
Args:
receiver (multiprocessing.Queue): Queue over which batches of texts are received. Created by `multiprocessing.Queue()`.
sender (multiprocessing.Connection): Pipe through which byte-encoded docs are sent back. Created by `multiprocessing.Pipe()`.
"""
while True:
texts = receiver.get()
docs = (make_doc(text) for text in texts)
for pipe in pipes:
docs = pipe(docs)
# Connection does not accept unpicklable objects, so send a list of byte-encoded docs.
sender.send([doc.to_bytes() for doc in docs])
class _Sender(object):
"""Util for sending data to multiprocessing workers in Language.pipe"""
def __init__(self, data, queues, chunk_size):
self.data = iter(data)
self.queues = iter(cycle(queues))
self.chunk_size = chunk_size
self.count = 0
def send(self):
"""Send chunk_size items from self.data to the queues."""
for item, q in itertools.islice(zip(self.data, self.queues), self.chunk_size):
# self.queues is an endless cycle, so the texts are distributed over the workers evenly.
q.put(item)
def step(self):
"""Tell the sender that one item was consumed.
Data is sent to the workers after every chunk_size calls."""
self.count += 1
if self.count >= self.chunk_size:
self.count = 0
self.send()
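
Taken together, `_multiprocessing_pipe`, `_apply_pipes`, and `_Sender` form a fan-out/fan-in pattern: batches of texts fan out over per-worker queues, and byte-encoded docs fan back in over pipes that are read in round-robin order, which preserves the input order. A standalone sketch of the same pattern (all names here are illustrative, none of this is spaCy API):

import multiprocessing as mp
from itertools import chain, cycle, islice

def worker(queue, conn):
    # Stand-in for the real pipeline: upper-case every text in a batch.
    while True:
        batch = queue.get()
        conn.send([text.upper() for text in batch])

if __name__ == "__main__":
    n_process = 2
    queues = [mp.Queue() for _ in range(n_process)]
    recv_ch, send_ch = zip(*[mp.Pipe(False) for _ in range(n_process)])
    procs = [mp.Process(target=worker, args=(q, s)) for q, s in zip(queues, send_ch)]
    for proc in procs:
        proc.start()
    batches = [["a", "b"], ["c", "d"], ["e", "f"], ["g", "h"]]
    # Fan out: distribute batches over the queues in round-robin order.
    for batch, q in zip(batches, cycle(queues)):
        q.put(batch)
    # Fan in: read the receiving ends in the same round-robin order, so
    # results come back in the order the batches were sent.
    results = chain.from_iterable(r.recv() for r in islice(cycle(recv_ch), len(batches)))
    print(list(results))  # ['A', 'B', 'C', 'D', 'E', 'F', 'G', 'H']
    for proc in procs:
        proc.terminate()

As one of the commit messages above notes, mp.Pipe copes with relatively small payloads, which is why the real implementation streams batches through `_Sender` instead of enqueueing the whole input up front.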

View File

@@ -1,11 +1,16 @@
# coding: utf-8
from __future__ import unicode_literals
import itertools
import pytest
from spacy.vocab import Vocab
from spacy.language import Language
from spacy.tokens import Doc
from spacy.compat import is_python2
from spacy.gold import GoldParse
from spacy.language import Language
from spacy.tokens import Doc, Span
from spacy.vocab import Vocab
from .util import add_vecs_to_vocab, assert_docs_equal
@pytest.fixture
@@ -58,3 +63,74 @@ def test_language_evaluate(nlp):
# Evaluate badly
with pytest.raises(Exception):
nlp.evaluate([text, gold])
def vector_modification_pipe(doc):
doc.vector += 1
return doc
def userdata_pipe(doc):
doc.user_data["foo"] = "bar"
return doc
def ner_pipe(doc):
span = Span(doc, 0, 1, label="FIRST")
doc.ents += (span,)
return doc
@pytest.fixture
def sample_vectors():
return [
("spacy", [-0.1, -0.2, -0.3]),
("world", [-0.2, -0.3, -0.4]),
("pipe", [0.7, 0.8, 0.9]),
]
@pytest.fixture
def nlp2(nlp, sample_vectors):
add_vecs_to_vocab(nlp.vocab, sample_vectors)
nlp.add_pipe(vector_modification_pipe)
nlp.add_pipe(ner_pipe)
nlp.add_pipe(userdata_pipe)
return nlp
@pytest.fixture
def texts():
data = [
"Hello world.",
"This is spacy.",
"You can use multiprocessing with pipe method.",
"Please try!",
]
return data
@pytest.mark.parametrize("n_process", [1, 2])
def test_language_pipe(nlp2, n_process, texts):
texts = texts * 10
expecteds = [nlp2(text) for text in texts]
docs = nlp2.pipe(texts, n_process=n_process, batch_size=2)
for doc, expected_doc in zip(docs, expecteds):
assert_docs_equal(doc, expected_doc)
@pytest.mark.skipif(
is_python2, reason="Python 2 seems to be unable to handle iterators properly"
)
@pytest.mark.parametrize("n_process", [1, 2])
def test_language_pipe_stream(nlp2, n_process, texts):
# Check that nlp.pipe can handle an iterator of infinite length properly.
stream_texts = itertools.cycle(texts)
texts0, texts1 = itertools.tee(stream_texts)
expecteds = (nlp2(text) for text in texts0)
docs = nlp2.pipe(texts1, n_process=n_process, batch_size=2)
n_fetch = 20
for doc, expected_doc in itertools.islice(zip(docs, expecteds), n_fetch):
assert_docs_equal(doc, expected_doc)
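
The stream test leans on two itertools patterns: `tee` duplicates the infinite iterator so that expected and actual docs are built from identical inputs, and `islice` bounds the otherwise endless comparison loop. A minimal illustration of the combination (illustrative only, no spaCy involved):

import itertools

stream = itertools.cycle(["a", "b", "c"])
left, right = itertools.tee(stream)
# zip consumes both copies in lockstep, so tee's internal buffer stays small
# even though the underlying iterator never terminates.
pairs = itertools.islice(zip(left, right), 7)
assert all(x == y for x, y in pairs)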