mirror of
				https://github.com/explosion/spaCy.git
				synced 2025-10-31 07:57:35 +03:00 
			
		
		
		
	* Work on pickling Vocab instances. The current implementation is not correct, but it may serve to see whether this approach is workable. Pickling is necessary to address Issue #125
This commit is contained in:
		
							parent
							
								
									85e7944572
								
							
						
					
					
						commit
						f8de403483
					
				|  | @ -99,7 +99,7 @@ cdef class Vocab: | ||||||
|         return self.length |         return self.length | ||||||
| 
 | 
 | ||||||
|     def __reduce__(self): |     def __reduce__(self): | ||||||
|         tmp_dir = tempfile.mkdtmp() |         tmp_dir = tempfile.mkdtemp() | ||||||
|         lex_loc = path.join(tmp_dir, 'lexemes.bin') |         lex_loc = path.join(tmp_dir, 'lexemes.bin') | ||||||
|         str_loc = path.join(tmp_dir, 'strings.txt') |         str_loc = path.join(tmp_dir, 'strings.txt') | ||||||
|         map_loc = path.join(tmp_dir, 'tag_map.json') |         map_loc = path.join(tmp_dir, 'tag_map.json') | ||||||
|  | @ -108,7 +108,7 @@ cdef class Vocab: | ||||||
|         self.strings.dump(str_loc) |         self.strings.dump(str_loc) | ||||||
|         json.dump(self.morphology.tag_map, open(map_loc, 'w')) |         json.dump(self.morphology.tag_map, open(map_loc, 'w')) | ||||||
| 
 | 
 | ||||||
|         return (Vocab.from_dir, (tmp_dir, self.get_lex_attr), None, None) |         return (unpickle_vocab, (tmp_dir,), None, None) | ||||||
| 
 | 
 | ||||||
|     cdef const LexemeC* get(self, Pool mem, unicode string) except NULL: |     cdef const LexemeC* get(self, Pool mem, unicode string) except NULL: | ||||||
|         '''Get a pointer to a LexemeC from the lexicon, creating a new Lexeme |         '''Get a pointer to a LexemeC from the lexicon, creating a new Lexeme | ||||||
|  | @ -353,7 +353,13 @@ cdef class Vocab: | ||||||
|         return vec_len |         return vec_len | ||||||
| 
 | 
 | ||||||
| 
 | 
 | ||||||
| copy_reg.constructor(Vocab.from_dir) | def unpickle_vocab(data_dir): | ||||||
|  |     # TODO: This needs fixing --- the trouble is, we can't pickle staticmethods, | ||||||
|  |     # so we need to fiddle with the design of Language a little bit. | ||||||
|  |     from .language import Language | ||||||
|  |     return Vocab.from_dir(data_dir, Language.default_lex_attrs()) | ||||||
|  | 
 | ||||||
|  | copy_reg.constructor(unpickle_vocab) | ||||||
| 
 | 
 | ||||||
| 
 | 
 | ||||||
| def write_binary_vectors(in_loc, out_loc): | def write_binary_vectors(in_loc, out_loc): | ||||||
|  |  | ||||||
|  | @ -1,10 +1,13 @@ | ||||||
| from __future__ import unicode_literals | from __future__ import unicode_literals | ||||||
| import pytest | import pytest | ||||||
|  | import StringIO | ||||||
|  | import pickle | ||||||
| 
 | 
 | ||||||
| from spacy.attrs import LEMMA, ORTH, PROB, IS_ALPHA | from spacy.attrs import LEMMA, ORTH, PROB, IS_ALPHA | ||||||
| from spacy.parts_of_speech import NOUN, VERB | from spacy.parts_of_speech import NOUN, VERB | ||||||
| 
 | 
 | ||||||
| 
 | 
 | ||||||
|  | 
 | ||||||
| def test_neq(en_vocab): | def test_neq(en_vocab): | ||||||
|     addr = en_vocab['Hello'] |     addr = en_vocab['Hello'] | ||||||
|     assert en_vocab['bye'].orth != addr.orth |     assert en_vocab['bye'].orth != addr.orth | ||||||
|  | @ -38,3 +41,11 @@ def test_symbols(en_vocab): | ||||||
|     assert en_vocab.strings['ORTH'] == ORTH |     assert en_vocab.strings['ORTH'] == ORTH | ||||||
|     assert en_vocab.strings['PROB'] == PROB |     assert en_vocab.strings['PROB'] == PROB | ||||||
|      |      | ||||||
|  | 
 | ||||||
|  | def test_pickle_vocab(en_vocab): | ||||||
|  |     file_ = StringIO.StringIO() | ||||||
|  |     pickle.dump(en_vocab, file_) | ||||||
|  | 
 | ||||||
|  |     file_.seek(0) | ||||||
|  | 
 | ||||||
|  |     loaded = pickle.load(file_) | ||||||
|  |  | ||||||
		Loading…
	
		Reference in New Issue
	
	Block a user