Mirror of https://github.com/explosion/spaCy.git, synced 2025-03-03 10:55:52 +03:00.
Take care of global vectors in multiprocessing (#5081)

* restore load_nlp.VECTORS in the child process
* add unit test
* fix test
* remove unnecessary import
* add utf8 encoding
* import unicode_literals
This commit is contained in:
parent
d078b47c81
commit
d307e9ca58
|
@ -296,8 +296,7 @@ def link_vectors_to_models(vocab):
|
|||
key = (ops.device, vectors.name)
|
||||
if key in thinc.extra.load_nlp.VECTORS:
|
||||
if thinc.extra.load_nlp.VECTORS[key].shape != data.shape:
|
||||
# This is a hack to avoid the problem in #3853. Maybe we should
|
||||
# print a warning as well?
|
||||
# This is a hack to avoid the problem in #3853.
|
||||
old_name = vectors.name
|
||||
new_name = vectors.name + "_%d" % data.shape[0]
|
||||
user_warning(Warnings.W019.format(old=old_name, new=new_name))
|
||||
|
|
|
@ -3,6 +3,9 @@ from __future__ import absolute_import, unicode_literals
|
|||
|
||||
import random
|
||||
import itertools
|
||||
|
||||
from thinc.extra import load_nlp
|
||||
|
||||
from spacy.util import minibatch
|
||||
import weakref
|
||||
import functools
|
||||
|
@ -856,7 +859,7 @@ class Language(object):
|
|||
procs = [
|
||||
mp.Process(
|
||||
target=_apply_pipes,
|
||||
args=(self.make_doc, pipes, rch, sch, Underscore.get_state()),
|
||||
args=(self.make_doc, pipes, rch, sch, Underscore.get_state(), load_nlp.VECTORS),
|
||||
)
|
||||
for rch, sch in zip(texts_q, bytedocs_send_ch)
|
||||
]
|
||||
|
@ -1112,7 +1115,7 @@ def _pipe(docs, proc, kwargs):
|
|||
yield doc
|
||||
|
||||
|
||||
def _apply_pipes(make_doc, pipes, receiver, sender, underscore_state):
|
||||
def _apply_pipes(make_doc, pipes, receiver, sender, underscore_state, vectors):
|
||||
"""Worker for Language.pipe
|
||||
|
||||
receiver (multiprocessing.Connection): Pipe to receive text. Usually
|
||||
|
@ -1120,8 +1123,10 @@ def _apply_pipes(make_doc, pipes, receiver, sender, underscore_state):
|
|||
sender (multiprocessing.Connection): Pipe to send doc. Usually created by
|
||||
`multiprocessing.Pipe()`
|
||||
underscore_state (tuple): The data in the Underscore class of the parent
|
||||
vectors (dict): The global vectors data, copied from the parent
|
||||
"""
|
||||
Underscore.load_state(underscore_state)
|
||||
load_nlp.VECTORS = vectors
|
||||
while True:
|
||||
texts = receiver.get()
|
||||
docs = (make_doc(text) for text in texts)
|
||||
|
|
26
spacy/tests/regression/test_issue4725.py
Normal file
26
spacy/tests/regression/test_issue4725.py
Normal file
|
@ -0,0 +1,26 @@
|
|||
# coding: utf8
|
||||
from __future__ import unicode_literals
|
||||
|
||||
import numpy
|
||||
|
||||
from spacy.lang.en import English
|
||||
from spacy.vocab import Vocab
|
||||
|
||||
|
||||
def test_issue4725():
    # Regression test for #4725: pipe() with n_process > 1 must complete
    # without hanging or crashing when global vectors are loaded, because
    # each worker process needs its own copy of load_nlp.VECTORS.
    test_vocab = Vocab(vectors_name="test_vocab_add_vector")
    vec_table = numpy.ndarray((5, 3), dtype="f")
    vec_table[0] = 1.0
    vec_table[1] = 2.0
    test_vocab.set_vector("cat", vec_table[0])
    test_vocab.set_vector("dog", vec_table[1])

    nlp = English(vocab=test_vocab)
    nlp.add_pipe(nlp.create_pipe("ner"))
    nlp.begin_training()
    texts = ["Kurt is in London."] * 10
    # Drain the multiprocessing pipeline; success == no hang, no crash.
    for _ in nlp.pipe(texts, batch_size=2, n_process=2):
        pass
|
||||
|
|
@ -3,7 +3,6 @@ from __future__ import unicode_literals
|
|||
|
||||
from spacy.lang.en import English
|
||||
from spacy.pipeline import EntityRuler
|
||||
from spacy.tokens.underscore import Underscore
|
||||
|
||||
|
||||
def test_issue4849():
|
||||
|
|
|
@ -1,10 +1,8 @@
|
|||
# coding: utf8
|
||||
from __future__ import unicode_literals
|
||||
|
||||
import spacy
|
||||
from spacy.lang.en import English
|
||||
from spacy.tokens import Span, Doc
|
||||
from spacy.tokens.underscore import Underscore
|
||||
|
||||
|
||||
class CustomPipe:
|
||||
|
|
Loading…
Reference in New Issue
Block a user