take care of global vectors in multiprocessing (#5081)

* restore load_nlp.VECTORS in the child process

* add unit test

* fix test

* remove unnecessary import

* add utf8 encoding

* import unicode_literals
Sofie Van Landeghem 2020-03-03 13:58:22 +01:00 committed by GitHub
parent d078b47c81
commit d307e9ca58
5 changed files with 34 additions and 7 deletions
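
Why the child processes need this: with `multiprocessing`, a worker does not reliably see module-level state that the parent set after import; under the "spawn" start method the module is re-imported in the child, so `thinc.extra.load_nlp.VECTORS` starts out empty there unless the parent's table is shipped to the worker and reinstalled. The following is an illustrative sketch only, using plain multiprocessing and made-up names (`SHARED_TABLE`, the worker functions), not spaCy code:

# Illustrative sketch (not spaCy code): SHARED_TABLE stands in for a
# module-level cache like thinc.extra.load_nlp.VECTORS.
import multiprocessing as mp

SHARED_TABLE = {}

def worker_without_state(out_q):
    # Under "spawn" this module is re-imported, so the table is empty here.
    out_q.put(len(SHARED_TABLE))

def worker_with_state(table, out_q):
    global SHARED_TABLE
    SHARED_TABLE = table  # reinstall the parent's table, as _apply_pipes now does
    out_q.put(len(SHARED_TABLE))

if __name__ == "__main__":
    ctx = mp.get_context("spawn")
    SHARED_TABLE["en_vectors"] = "pretend vector data"
    queue = ctx.Queue()
    p1 = ctx.Process(target=worker_without_state, args=(queue,))
    p2 = ctx.Process(target=worker_with_state, args=(SHARED_TABLE, queue))
    p1.start()
    p2.start()
    results = sorted(queue.get() for _ in range(2))
    p1.join()
    p2.join()
    print(results)  # [0, 1]: only the worker given the table sees the entry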


@@ -296,8 +296,7 @@ def link_vectors_to_models(vocab):
     key = (ops.device, vectors.name)
     if key in thinc.extra.load_nlp.VECTORS:
         if thinc.extra.load_nlp.VECTORS[key].shape != data.shape:
-            # This is a hack to avoid the problem in #3853. Maybe we should
-            # print a warning as well?
+            # This is a hack to avoid the problem in #3853.
             old_name = vectors.name
             new_name = vectors.name + "_%d" % data.shape[0]
             user_warning(Warnings.W019.format(old=old_name, new=new_name))
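
For context, the rename this comment refers to simply suffixes the vectors name with the row count before re-registering it. A hypothetical example (the names and row count are made up; only the `"_%d" % data.shape[0]` scheme comes from the code above):

# Hypothetical example of the W019 rename above; the name and row count are made up.
old_name = "en_example.vectors"
num_rows = 20000                     # stands in for data.shape[0]
new_name = old_name + "_%d" % num_rows
print(new_name)                      # en_example.vectors_20000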


@@ -3,6 +3,9 @@ from __future__ import absolute_import, unicode_literals
 
 import random
 import itertools
+
+from thinc.extra import load_nlp
+
 from spacy.util import minibatch
 import weakref
 import functools
@@ -856,7 +859,7 @@ class Language(object):
         procs = [
             mp.Process(
                 target=_apply_pipes,
-                args=(self.make_doc, pipes, rch, sch, Underscore.get_state()),
+                args=(self.make_doc, pipes, rch, sch, Underscore.get_state(), load_nlp.VECTORS),
             )
             for rch, sch in zip(texts_q, bytedocs_send_ch)
         ]
@@ -1112,7 +1115,7 @@ def _pipe(docs, proc, kwargs):
         yield doc
 
 
-def _apply_pipes(make_doc, pipes, receiver, sender, underscore_state):
+def _apply_pipes(make_doc, pipes, receiver, sender, underscore_state, vectors):
     """Worker for Language.pipe
 
     receiver (multiprocessing.Connection): Pipe to receive text. Usually
@@ -1120,8 +1123,10 @@ def _apply_pipes(make_doc, pipes, receiver, sender, underscore_state):
     sender (multiprocessing.Connection): Pipe to send doc. Usually created by
         `multiprocessing.Pipe()`
     underscore_state (tuple): The data in the Underscore class of the parent
+    vectors (dict): The global vectors data, copied from the parent
     """
     Underscore.load_state(underscore_state)
+    load_nlp.VECTORS = vectors
     while True:
         texts = receiver.get()
         docs = (make_doc(text) for text in texts)
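
The patched call sites above are only reached when `Language.pipe` runs with `n_process > 1`; each worker process now receives the parent's `load_nlp.VECTORS` together with the `Underscore` state before it starts consuming texts. A minimal sketch of the user-facing call that goes through this code path (mirroring the regression test below, with a blank English pipeline):

from spacy.lang.en import English

if __name__ == "__main__":  # required for the spawn start method
    nlp = English()                      # blank pipeline: tokenizer only
    texts = ["Kurt is in London."] * 10
    # n_process > 1 makes pipe() start workers that run _apply_pipes
    for doc in nlp.pipe(texts, batch_size=2, n_process=2):
        print(len(doc))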


@@ -0,0 +1,26 @@
+# coding: utf8
+from __future__ import unicode_literals
+
+import numpy
+
+from spacy.lang.en import English
+from spacy.vocab import Vocab
+
+
+def test_issue4725():
+    # ensures that this runs correctly and doesn't hang or crash because of the global vectors
+    vocab = Vocab(vectors_name="test_vocab_add_vector")
+    data = numpy.ndarray((5, 3), dtype="f")
+    data[0] = 1.0
+    data[1] = 2.0
+    vocab.set_vector("cat", data[0])
+    vocab.set_vector("dog", data[1])
+
+    nlp = English(vocab=vocab)
+    ner = nlp.create_pipe("ner")
+    nlp.add_pipe(ner)
+    nlp.begin_training()
+
+    docs = ["Kurt is in London."] * 10
+    for _ in nlp.pipe(docs, batch_size=2, n_process=2):
+        pass


@@ -3,7 +3,6 @@ from __future__ import unicode_literals
 
 from spacy.lang.en import English
 from spacy.pipeline import EntityRuler
-from spacy.tokens.underscore import Underscore
 
 
 def test_issue4849():


@@ -1,10 +1,8 @@
 # coding: utf8
 from __future__ import unicode_literals
 
-import spacy
 from spacy.lang.en import English
 from spacy.tokens import Span, Doc
-from spacy.tokens.underscore import Underscore
 
 
 class CustomPipe: