Mirror of https://github.com/explosion/spaCy.git, synced 2025-03-03 10:55:52 +03:00.
Take care of global vectors in multiprocessing (#5081)

* restore load_nlp.VECTORS in the child process
* add unit test
* fix test
* remove unnecessary import
* add utf8 encoding
* import unicode_literals
This commit is contained in:
parent
d078b47c81
commit
d307e9ca58
|
@ -296,8 +296,7 @@ def link_vectors_to_models(vocab):
|
|||
key = (ops.device, vectors.name)
|
||||
if key in thinc.extra.load_nlp.VECTORS:
|
||||
if thinc.extra.load_nlp.VECTORS[key].shape != data.shape:
|
||||
# This is a hack to avoid the problem in #3853. Maybe we should
|
||||
# print a warning as well?
|
||||
# This is a hack to avoid the problem in #3853.
|
||||
old_name = vectors.name
|
||||
new_name = vectors.name + "_%d" % data.shape[0]
|
||||
user_warning(Warnings.W019.format(old=old_name, new=new_name))
|
||||
|
|
|
@ -3,6 +3,9 @@ from __future__ import absolute_import, unicode_literals
|
|||
|
||||
import random
|
||||
import itertools
|
||||
|
||||
from thinc.extra import load_nlp
|
||||
|
||||
from spacy.util import minibatch
|
||||
import weakref
|
||||
import functools
|
||||
|
@ -856,7 +859,7 @@ class Language(object):
|
|||
procs = [
|
||||
mp.Process(
|
||||
target=_apply_pipes,
|
||||
args=(self.make_doc, pipes, rch, sch, Underscore.get_state()),
|
||||
args=(self.make_doc, pipes, rch, sch, Underscore.get_state(), load_nlp.VECTORS),
|
||||
)
|
||||
for rch, sch in zip(texts_q, bytedocs_send_ch)
|
||||
]
|
||||
|
@ -1112,7 +1115,7 @@ def _pipe(docs, proc, kwargs):
|
|||
yield doc
|
||||
|
||||
|
||||
def _apply_pipes(make_doc, pipes, receiver, sender, underscore_state):
|
||||
def _apply_pipes(make_doc, pipes, receiver, sender, underscore_state, vectors):
|
||||
"""Worker for Language.pipe
|
||||
|
||||
receiver (multiprocessing.Connection): Pipe to receive text. Usually
|
||||
|
@ -1120,8 +1123,10 @@ def _apply_pipes(make_doc, pipes, receiver, sender, underscore_state):
|
|||
sender (multiprocessing.Connection): Pipe to send doc. Usually created by
|
||||
`multiprocessing.Pipe()`
|
||||
underscore_state (tuple): The data in the Underscore class of the parent
|
||||
vectors (dict): The global vectors data, copied from the parent
|
||||
"""
|
||||
Underscore.load_state(underscore_state)
|
||||
load_nlp.VECTORS = vectors
|
||||
while True:
|
||||
texts = receiver.get()
|
||||
docs = (make_doc(text) for text in texts)
|
||||
|
|
26
spacy/tests/regression/test_issue4725.py
Normal file
26
spacy/tests/regression/test_issue4725.py
Normal file
|
@ -0,0 +1,26 @@
|
|||
# coding: utf8
|
||||
from __future__ import unicode_literals
|
||||
|
||||
import numpy
|
||||
|
||||
from spacy.lang.en import English
|
||||
from spacy.vocab import Vocab
|
||||
|
||||
|
||||
def test_issue4725():
    # Regression test for #4725: pipe() with n_process > 1 must complete
    # without hanging or crashing when global vectors are loaded, because
    # each worker process needs its own copy of load_nlp.VECTORS.
    test_vocab = Vocab(vectors_name="test_vocab_add_vector")
    vec_table = numpy.ndarray((5, 3), dtype="f")
    vec_table[0] = 1.0
    vec_table[1] = 2.0
    test_vocab.set_vector("cat", vec_table[0])
    test_vocab.set_vector("dog", vec_table[1])

    nlp = English(vocab=test_vocab)
    nlp.add_pipe(nlp.create_pipe("ner"))
    nlp.begin_training()
    texts = ["Kurt is in London."] * 10
    # Drain the multiprocessing pipeline; success == no hang, no crash.
    for _ in nlp.pipe(texts, batch_size=2, n_process=2):
        pass
|
||||
|
|
@ -3,7 +3,6 @@ from __future__ import unicode_literals
|
|||
|
||||
from spacy.lang.en import English
|
||||
from spacy.pipeline import EntityRuler
|
||||
from spacy.tokens.underscore import Underscore
|
||||
|
||||
|
||||
def test_issue4849():
|
||||
|
|
|
@ -1,10 +1,8 @@
|
|||
# coding: utf8
|
||||
from __future__ import unicode_literals
|
||||
|
||||
import spacy
|
||||
from spacy.lang.en import English
|
||||
from spacy.tokens import Span, Doc
|
||||
from spacy.tokens.underscore import Underscore
|
||||
|
||||
|
||||
class CustomPipe:
|
||||
|
|
Loading…
Reference in New Issue
Block a user