mirror of
				https://github.com/explosion/spaCy.git
				synced 2025-10-30 07:27:28 +03:00 
			
		
		
		
	Merge pull request #5006 from svlandeg/bugfix/multiproc-underscore
load Underscore state when multiprocessing
This commit is contained in:
		
						commit
						4440a072d2
					
				|  | @ -15,6 +15,7 @@ import multiprocessing as mp | ||||||
| from itertools import chain, cycle | from itertools import chain, cycle | ||||||
| 
 | 
 | ||||||
| from .tokenizer import Tokenizer | from .tokenizer import Tokenizer | ||||||
|  | from .tokens.underscore import Underscore | ||||||
| from .vocab import Vocab | from .vocab import Vocab | ||||||
| from .lemmatizer import Lemmatizer | from .lemmatizer import Lemmatizer | ||||||
| from .lookups import Lookups | from .lookups import Lookups | ||||||
|  | @ -853,7 +854,10 @@ class Language(object): | ||||||
|         sender.send() |         sender.send() | ||||||
| 
 | 
 | ||||||
|         procs = [ |         procs = [ | ||||||
|             mp.Process(target=_apply_pipes, args=(self.make_doc, pipes, rch, sch)) |             mp.Process( | ||||||
|  |                 target=_apply_pipes, | ||||||
|  |                 args=(self.make_doc, pipes, rch, sch, Underscore.get_state()), | ||||||
|  |             ) | ||||||
|             for rch, sch in zip(texts_q, bytedocs_send_ch) |             for rch, sch in zip(texts_q, bytedocs_send_ch) | ||||||
|         ] |         ] | ||||||
|         for proc in procs: |         for proc in procs: | ||||||
|  | @ -1108,16 +1112,18 @@ def _pipe(docs, proc, kwargs): | ||||||
|         yield doc |         yield doc | ||||||
| 
 | 
 | ||||||
| 
 | 
 | ||||||
| def _apply_pipes(make_doc, pipes, reciever, sender): | def _apply_pipes(make_doc, pipes, receiver, sender, underscore_state): | ||||||
|     """Worker for Language.pipe |     """Worker for Language.pipe | ||||||
| 
 | 
 | ||||||
|     receiver (multiprocessing.Connection): Pipe to receive text. Usually |     receiver (multiprocessing.Connection): Pipe to receive text. Usually | ||||||
|         created by `multiprocessing.Pipe()` |         created by `multiprocessing.Pipe()` | ||||||
|     sender (multiprocessing.Connection): Pipe to send doc. Usually created by |     sender (multiprocessing.Connection): Pipe to send doc. Usually created by | ||||||
|         `multiprocessing.Pipe()` |         `multiprocessing.Pipe()` | ||||||
|  |     underscore_state (tuple): The data in the Underscore class of the parent | ||||||
|     """ |     """ | ||||||
|  |     Underscore.load_state(underscore_state) | ||||||
|     while True: |     while True: | ||||||
|         texts = reciever.get() |         texts = receiver.get() | ||||||
|         docs = (make_doc(text) for text in texts) |         docs = (make_doc(text) for text in texts) | ||||||
|         for pipe in pipes: |         for pipe in pipes: | ||||||
|             docs = pipe(docs) |             docs = pipe(docs) | ||||||
|  |  | ||||||
|  | @ -7,6 +7,15 @@ from spacy.tokens import Doc, Span, Token | ||||||
| from spacy.tokens.underscore import Underscore | from spacy.tokens.underscore import Underscore | ||||||
| 
 | 
 | ||||||
| 
 | 
 | ||||||
|  | @pytest.fixture(scope="function", autouse=True) | ||||||
|  | def clean_underscore(): | ||||||
|  |     # reset the Underscore object after the test, to avoid having state copied across tests | ||||||
|  |     yield | ||||||
|  |     Underscore.doc_extensions = {} | ||||||
|  |     Underscore.span_extensions = {} | ||||||
|  |     Underscore.token_extensions = {} | ||||||
|  | 
 | ||||||
|  | 
 | ||||||
| def test_create_doc_underscore(): | def test_create_doc_underscore(): | ||||||
|     doc = Mock() |     doc = Mock() | ||||||
|     doc.doc = doc |     doc.doc = doc | ||||||
|  |  | ||||||
|  | @ -6,6 +6,7 @@ import re | ||||||
| from mock import Mock | from mock import Mock | ||||||
| from spacy.matcher import Matcher, DependencyMatcher | from spacy.matcher import Matcher, DependencyMatcher | ||||||
| from spacy.tokens import Doc, Token | from spacy.tokens import Doc, Token | ||||||
|  | from ..doc.test_underscore import clean_underscore | ||||||
| 
 | 
 | ||||||
| 
 | 
 | ||||||
| @pytest.fixture | @pytest.fixture | ||||||
|  | @ -200,6 +201,7 @@ def test_matcher_any_token_operator(en_vocab): | ||||||
|     assert matches[2] == "test hello world" |     assert matches[2] == "test hello world" | ||||||
| 
 | 
 | ||||||
| 
 | 
 | ||||||
|  | @pytest.mark.usefixtures("clean_underscore") | ||||||
| def test_matcher_extension_attribute(en_vocab): | def test_matcher_extension_attribute(en_vocab): | ||||||
|     matcher = Matcher(en_vocab) |     matcher = Matcher(en_vocab) | ||||||
|     get_is_fruit = lambda token: token.text in ("apple", "banana") |     get_is_fruit = lambda token: token.text in ("apple", "banana") | ||||||
|  |  | ||||||
|  | @ -3,6 +3,7 @@ from __future__ import unicode_literals | ||||||
| 
 | 
 | ||||||
| from spacy.lang.en import English | from spacy.lang.en import English | ||||||
| from spacy.pipeline import EntityRuler | from spacy.pipeline import EntityRuler | ||||||
|  | from spacy.tokens.underscore import Underscore | ||||||
| 
 | 
 | ||||||
| 
 | 
 | ||||||
| def test_issue4849(): | def test_issue4849(): | ||||||
|  |  | ||||||
							
								
								
									
										45
									
								
								spacy/tests/regression/test_issue4903.py
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										45
									
								
								spacy/tests/regression/test_issue4903.py
									
									
									
									
									
										Normal file
									
								
							|  | @ -0,0 +1,45 @@ | ||||||
|  | # coding: utf8 | ||||||
|  | from __future__ import unicode_literals | ||||||
|  | 
 | ||||||
|  | import spacy | ||||||
|  | from spacy.lang.en import English | ||||||
|  | from spacy.tokens import Span, Doc | ||||||
|  | from spacy.tokens.underscore import Underscore | ||||||
|  | 
 | ||||||
|  | 
 | ||||||
|  | class CustomPipe: | ||||||
|  |     name = "my_pipe" | ||||||
|  | 
 | ||||||
|  |     def __init__(self): | ||||||
|  |         Span.set_extension("my_ext", getter=self._get_my_ext) | ||||||
|  |         Doc.set_extension("my_ext", default=None) | ||||||
|  | 
 | ||||||
|  |     def __call__(self, doc): | ||||||
|  |         gathered_ext = [] | ||||||
|  |         for sent in doc.sents: | ||||||
|  |             sent_ext = self._get_my_ext(sent) | ||||||
|  |             sent._.set("my_ext", sent_ext) | ||||||
|  |             gathered_ext.append(sent_ext) | ||||||
|  | 
 | ||||||
|  |         doc._.set("my_ext", "\n".join(gathered_ext)) | ||||||
|  | 
 | ||||||
|  |         return doc | ||||||
|  | 
 | ||||||
|  |     @staticmethod | ||||||
|  |     def _get_my_ext(span): | ||||||
|  |         return str(span.end) | ||||||
|  | 
 | ||||||
|  | 
 | ||||||
|  | def test_issue4903(): | ||||||
|  |     # ensures that this runs correctly and doesn't hang or crash on Windows / macOS | ||||||
|  | 
 | ||||||
|  |     nlp = English() | ||||||
|  |     custom_component = CustomPipe() | ||||||
|  |     nlp.add_pipe(nlp.create_pipe("sentencizer")) | ||||||
|  |     nlp.add_pipe(custom_component, after="sentencizer") | ||||||
|  | 
 | ||||||
|  |     text = ["I like bananas.", "Do you like them?", "No, I prefer wasabi."] | ||||||
|  |     docs = list(nlp.pipe(text, n_process=2)) | ||||||
|  |     assert docs[0].text == "I like bananas." | ||||||
|  |     assert docs[1].text == "Do you like them?" | ||||||
|  |     assert docs[2].text == "No, I prefer wasabi." | ||||||
|  | @ -11,6 +11,6 @@ def nlp(): | ||||||
|     return spacy.blank("en") |     return spacy.blank("en") | ||||||
| 
 | 
 | ||||||
| 
 | 
 | ||||||
| def test_evaluate(nlp): | def test_issue4924(nlp): | ||||||
|     docs_golds = [("", {})] |     docs_golds = [("", {})] | ||||||
|     nlp.evaluate(docs_golds) |     nlp.evaluate(docs_golds) | ||||||
|  |  | ||||||
|  | @ -79,6 +79,14 @@ class Underscore(object): | ||||||
|     def _get_key(self, name): |     def _get_key(self, name): | ||||||
|         return ("._.", name, self._start, self._end) |         return ("._.", name, self._start, self._end) | ||||||
| 
 | 
 | ||||||
|  |     @classmethod | ||||||
|  |     def get_state(cls): | ||||||
|  |         return cls.token_extensions, cls.span_extensions, cls.doc_extensions | ||||||
|  | 
 | ||||||
|  |     @classmethod | ||||||
|  |     def load_state(cls, state): | ||||||
|  |         cls.token_extensions, cls.span_extensions, cls.doc_extensions = state | ||||||
|  | 
 | ||||||
| 
 | 
 | ||||||
| def get_ext_args(**kwargs): | def get_ext_args(**kwargs): | ||||||
|     """Validate and convert arguments. Reused in Doc, Token and Span.""" |     """Validate and convert arguments. Reused in Doc, Token and Span.""" | ||||||
|  |  | ||||||
		Loading…
	
		Reference in New Issue
	
	Block a user