mirror of
				https://github.com/explosion/spaCy.git
				synced 2025-10-31 07:57:35 +03:00 
			
		
		
		
	* Add serializer property to Vocab, and lazy-load it. Add get_by_orth method.
This commit is contained in:
		
							parent
							
								
									6ab1696b15
								
							
						
					
					
						commit
						a7c4d72e83
					
				|  | @ -5,7 +5,7 @@ from cymem.cymem cimport Pool | ||||||
| from murmurhash.mrmr cimport hash64 | from murmurhash.mrmr cimport hash64 | ||||||
| 
 | 
 | ||||||
| from .structs cimport LexemeC, TokenC | from .structs cimport LexemeC, TokenC | ||||||
| from .typedefs cimport utf8_t, hash_t | from .typedefs cimport utf8_t, attr_t, hash_t | ||||||
| from .strings cimport StringStore | from .strings cimport StringStore | ||||||
| 
 | 
 | ||||||
| 
 | 
 | ||||||
|  | @ -29,9 +29,12 @@ cdef class Vocab: | ||||||
|     cpdef readonly StringStore strings |     cpdef readonly StringStore strings | ||||||
|     cdef readonly object pos_tags |     cdef readonly object pos_tags | ||||||
|     cdef readonly int length |     cdef readonly int length | ||||||
|     cdef public object packer |     cdef public object _serializer | ||||||
|  |     cdef public object data_dir | ||||||
| 
 | 
 | ||||||
|     cdef const LexemeC* get(self, Pool mem, unicode string) except NULL |     cdef const LexemeC* get(self, Pool mem, unicode string) except NULL | ||||||
|  |     cdef const LexemeC* get_by_orth(self, Pool mem, attr_t orth) except NULL | ||||||
|  |      | ||||||
|     cdef int _add_lex_to_vocab(self, hash_t key, const LexemeC* lex) except -1 |     cdef int _add_lex_to_vocab(self, hash_t key, const LexemeC* lex) except -1 | ||||||
| 
 | 
 | ||||||
|     cdef PreshMap _by_hash |     cdef PreshMap _by_hash | ||||||
|  |  | ||||||
							
								
								
									
										105
									
								
								spacy/vocab.pyx
									
									
									
									
									
								
							
							
						
						
									
										105
									
								
								spacy/vocab.pyx
									
									
									
									
									
								
							|  | @ -6,6 +6,7 @@ import bz2 | ||||||
| from os import path | from os import path | ||||||
| import codecs | import codecs | ||||||
| import math | import math | ||||||
|  | import json | ||||||
| 
 | 
 | ||||||
| from .lexeme cimport EMPTY_LEXEME | from .lexeme cimport EMPTY_LEXEME | ||||||
| from .lexeme cimport set_lex_struct_props | from .lexeme cimport set_lex_struct_props | ||||||
|  | @ -13,6 +14,7 @@ from .lexeme cimport Lexeme | ||||||
| from .strings cimport hash_string | from .strings cimport hash_string | ||||||
| from .orth cimport word_shape | from .orth cimport word_shape | ||||||
| from .typedefs cimport attr_t | from .typedefs cimport attr_t | ||||||
|  | from .cfile cimport CFile | ||||||
| 
 | 
 | ||||||
| from cymem.cymem cimport Address | from cymem.cymem cimport Address | ||||||
| from . import util | from . import util | ||||||
|  | @ -54,8 +56,19 @@ cdef class Vocab: | ||||||
|             if load_vectors and path.exists(path.join(data_dir, 'vec.bin')): |             if load_vectors and path.exists(path.join(data_dir, 'vec.bin')): | ||||||
|                 self.repvec_length = self.load_rep_vectors(path.join(data_dir, 'vec.bin')) |                 self.repvec_length = self.load_rep_vectors(path.join(data_dir, 'vec.bin')) | ||||||
| 
 | 
 | ||||||
|         #self.packer = Packer(self, util.read_encoding_freqs(data_dir)) |         self._serializer = None | ||||||
|         self.packer = None |         self.data_dir = data_dir | ||||||
|  | 
 | ||||||
|  |     property serializer: | ||||||
|  |         def __get__(self): | ||||||
|  |             if self._serializer is None: | ||||||
|  |                 freqs = [] | ||||||
|  |                 if self.data_dir is not None: | ||||||
|  |                     freqs_loc = path.join(self.data_dir, 'serializer.json') | ||||||
|  |                     if path.exists(freqs_loc): | ||||||
|  |                         freqs = json.load(open(freqs_loc)) | ||||||
|  |                 self._serializer = Packer(self, freqs) | ||||||
|  |             return self._serializer | ||||||
| 
 | 
 | ||||||
|     def __len__(self): |     def __len__(self): | ||||||
|         """The current number of lexemes stored.""" |         """The current number of lexemes stored.""" | ||||||
|  | @ -82,6 +95,27 @@ cdef class Vocab: | ||||||
|             self._add_lex_to_vocab(key, lex) |             self._add_lex_to_vocab(key, lex) | ||||||
|         return lex |         return lex | ||||||
| 
 | 
 | ||||||
|  |     cdef const LexemeC* get_by_orth(self, Pool mem, attr_t orth) except NULL: | ||||||
|  |         '''Get a pointer to a LexemeC from the lexicon, creating a new Lexeme | ||||||
|  |         if necessary, using memory acquired from the given pool.  If the pool | ||||||
|  |         is the lexicon's own memory, the lexeme is saved in the lexicon.''' | ||||||
|  |         cdef LexemeC* lex | ||||||
|  |         lex = <LexemeC*>self._by_orth.get(orth) | ||||||
|  |         if lex != NULL: | ||||||
|  |             return lex | ||||||
|  |         cdef unicode string = self.strings[orth] | ||||||
|  |         cdef bint is_oov = mem is not self.mem | ||||||
|  |         if len(string) < 3: | ||||||
|  |             mem = self.mem | ||||||
|  |         lex = <LexemeC*>mem.alloc(sizeof(LexemeC), 1) | ||||||
|  |         props = self.lexeme_props_getter(string) | ||||||
|  |         set_lex_struct_props(lex, props, self.strings, EMPTY_VEC) | ||||||
|  |         if is_oov: | ||||||
|  |             lex.id = 0 | ||||||
|  |         else: | ||||||
|  |             self._add_lex_to_vocab(hash_string(string), lex) | ||||||
|  |         return lex | ||||||
|  | 
 | ||||||
|     cdef int _add_lex_to_vocab(self, hash_t key, const LexemeC* lex) except -1: |     cdef int _add_lex_to_vocab(self, hash_t key, const LexemeC* lex) except -1: | ||||||
|         self._by_hash.set(key, <void*>lex) |         self._by_hash.set(key, <void*>lex) | ||||||
|         self._by_orth.set(lex.orth, <void*>lex) |         self._by_orth.set(lex.orth, <void*>lex) | ||||||
|  | @ -138,19 +172,16 @@ cdef class Vocab: | ||||||
|         if path.exists(loc): |         if path.exists(loc): | ||||||
|             assert not path.isdir(loc) |             assert not path.isdir(loc) | ||||||
|         cdef bytes bytes_loc = loc.encode('utf8') if type(loc) == unicode else loc |         cdef bytes bytes_loc = loc.encode('utf8') if type(loc) == unicode else loc | ||||||
|         cdef FILE* fp = fopen(<char*>bytes_loc, 'wb') | 
 | ||||||
|         assert fp != NULL |         cdef CFile fp = CFile(bytes_loc, 'wb') | ||||||
|         cdef size_t st |         cdef size_t st | ||||||
|         cdef size_t addr |         cdef size_t addr | ||||||
|         cdef hash_t key |         cdef hash_t key | ||||||
|         for key, addr in self._by_hash.items(): |         for key, addr in self._by_hash.items(): | ||||||
|             lexeme = <LexemeC*>addr |             lexeme = <LexemeC*>addr | ||||||
|             st = fwrite(&lexeme.orth, sizeof(lexeme.orth), 1, fp) |             fp.write_from(&lexeme.orth, sizeof(lexeme.orth), 1) | ||||||
|             assert st == 1 |             fp.write_from(lexeme, sizeof(LexemeC), 1) | ||||||
|             st = fwrite(lexeme, sizeof(LexemeC), 1, fp) |         fp.close() | ||||||
|             assert st == 1 |  | ||||||
|         st = fclose(fp) |  | ||||||
|         assert st == 0 |  | ||||||
| 
 | 
 | ||||||
|     def load_lexemes(self, strings_loc, loc): |     def load_lexemes(self, strings_loc, loc): | ||||||
|         self.strings.load(strings_loc) |         self.strings.load(strings_loc) | ||||||
|  | @ -188,7 +219,7 @@ cdef class Vocab: | ||||||
|         fclose(fp) |         fclose(fp) | ||||||
| 
 | 
 | ||||||
|     def load_rep_vectors(self, loc): |     def load_rep_vectors(self, loc): | ||||||
|         file_ = _CFile(loc, b'rb') |         cdef CFile file_ = CFile(loc, b'rb') | ||||||
|         cdef int32_t word_len |         cdef int32_t word_len | ||||||
|         cdef int32_t vec_len |         cdef int32_t vec_len | ||||||
|         cdef int32_t prev_vec_len = 0 |         cdef int32_t prev_vec_len = 0 | ||||||
|  | @ -198,22 +229,20 @@ cdef class Vocab: | ||||||
|         cdef bytes py_word |         cdef bytes py_word | ||||||
|         cdef vector[float*] vectors |         cdef vector[float*] vectors | ||||||
|         cdef int i |         cdef int i | ||||||
|  |         cdef Pool tmp_mem = Pool() | ||||||
|         while True: |         while True: | ||||||
|             try: |             try: | ||||||
|                 file_.read(&word_len, sizeof(word_len), 1) |                 file_.read_into(&word_len, sizeof(word_len), 1) | ||||||
|             except IOError: |             except IOError: | ||||||
|                 break |                 break | ||||||
|             file_.read(&vec_len, sizeof(vec_len), 1) |             file_.read_into(&vec_len, sizeof(vec_len), 1) | ||||||
|             if prev_vec_len != 0 and vec_len != prev_vec_len: |             if prev_vec_len != 0 and vec_len != prev_vec_len: | ||||||
|                 raise VectorReadError.mismatched_sizes(loc, vec_len, prev_vec_len) |                 raise VectorReadError.mismatched_sizes(loc, vec_len, prev_vec_len) | ||||||
|             if 0 >= vec_len >= MAX_VEC_SIZE: |             if 0 >= vec_len >= MAX_VEC_SIZE: | ||||||
|                 raise VectorReadError.bad_size(loc, vec_len) |                 raise VectorReadError.bad_size(loc, vec_len) | ||||||
|             mem = Address(word_len, sizeof(char)) |  | ||||||
|             chars = <char*>mem.ptr |  | ||||||
|             vec = <float*>self.mem.alloc(vec_len, sizeof(float)) |  | ||||||
| 
 | 
 | ||||||
|             file_.read(chars, sizeof(char), word_len) |             chars = <char*>file_.alloc_read(tmp_mem, word_len, sizeof(char)) | ||||||
|             file_.read(vec, sizeof(float), vec_len) |             vec = <float*>file_.alloc_read(self.mem, vec_len, sizeof(float)) | ||||||
| 
 | 
 | ||||||
|             string_id = self.strings[chars[:word_len]] |             string_id = self.strings[chars[:word_len]] | ||||||
|             while string_id >= vectors.size(): |             while string_id >= vectors.size(): | ||||||
|  | @ -235,7 +264,7 @@ cdef class Vocab: | ||||||
| 
 | 
 | ||||||
| 
 | 
 | ||||||
| def write_binary_vectors(in_loc, out_loc): | def write_binary_vectors(in_loc, out_loc): | ||||||
|     cdef _CFile out_file = _CFile(out_loc, 'wb') |     cdef CFile out_file = CFile(out_loc, 'wb') | ||||||
|     cdef Address mem |     cdef Address mem | ||||||
|     cdef int32_t word_len |     cdef int32_t word_len | ||||||
|     cdef int32_t vec_len |     cdef int32_t vec_len | ||||||
|  | @ -252,42 +281,12 @@ def write_binary_vectors(in_loc, out_loc): | ||||||
|             word_len = len(word) |             word_len = len(word) | ||||||
|             vec_len = len(pieces) |             vec_len = len(pieces) | ||||||
| 
 | 
 | ||||||
|             out_file.write(sizeof(word_len), 1, &word_len) |             out_file.write_from(&word_len, 1, sizeof(word_len)) | ||||||
|             out_file.write(sizeof(vec_len), 1, &vec_len) |             out_file.write_from(&vec_len, 1, sizeof(vec_len)) | ||||||
| 
 | 
 | ||||||
|             chars = <char*>word |             chars = <char*>word | ||||||
|             out_file.write(sizeof(char), len(word), chars) |             out_file.write_from(chars, len(word), sizeof(char)) | ||||||
|             out_file.write(sizeof(float), vec_len, vec) |             out_file.write_from(vec, vec_len, sizeof(float)) | ||||||
| 
 |  | ||||||
| 
 |  | ||||||
| cdef class _CFile: |  | ||||||
|     cdef FILE* fp |  | ||||||
|     def __init__(self, loc, bytes mode): |  | ||||||
|         cdef bytes bytes_loc = loc.encode('utf8') if type(loc) == unicode else loc |  | ||||||
|         self.fp = fopen(<char*>bytes_loc, mode) |  | ||||||
|         if self.fp == NULL: |  | ||||||
|             raise IOError |  | ||||||
| 
 |  | ||||||
|     def __dealloc__(self): |  | ||||||
|         fclose(self.fp) |  | ||||||
| 
 |  | ||||||
|     def close(self): |  | ||||||
|         fclose(self.fp) |  | ||||||
| 
 |  | ||||||
|     cdef int read(self, void* dest, size_t elem_size, size_t n) except -1: |  | ||||||
|         st = fread(dest, elem_size, n, self.fp) |  | ||||||
|         if st != n: |  | ||||||
|             raise IOError |  | ||||||
| 
 |  | ||||||
|     cdef int write(self, size_t elem_size, size_t n, void* data) except -1: |  | ||||||
|         st = fwrite(data, elem_size, n, self.fp) |  | ||||||
|         if st != n: |  | ||||||
|             raise IOError |  | ||||||
| 
 |  | ||||||
|     cdef int write_unicode(self, unicode value): |  | ||||||
|         cdef bytes py_bytes = value.encode('utf8') |  | ||||||
|         cdef char* chars = <char*>py_bytes |  | ||||||
|         self.write(sizeof(char), len(py_bytes), chars) |  | ||||||
| 
 | 
 | ||||||
| 
 | 
 | ||||||
| class VectorReadError(Exception): | class VectorReadError(Exception): | ||||||
|  |  | ||||||
		Loading…
	
		Reference in New Issue
	
	Block a user