mirror of
				https://github.com/explosion/spaCy.git
				synced 2025-10-30 23:47:31 +03:00 
			
		
		
		
	* Index lexemes by orth, instead of a lexemes vector. Breaks the mechanism for deciding not to own LexemeC structs during parsing. Need to reinstate this.
This commit is contained in:
		
							parent
							
								
									4dddc8a69b
								
							
						
					
					
						commit
						82d84b0f2b
					
				|  | @ -27,11 +27,12 @@ cdef class Vocab: | |||
|     cpdef public lexeme_props_getter | ||||
|     cdef Pool mem | ||||
|     cpdef readonly StringStore strings | ||||
|     cdef vector[const LexemeC*] lexemes | ||||
|     cdef readonly object pos_tags | ||||
|     cdef readonly int length | ||||
| 
 | ||||
|     cdef const LexemeC* get(self, Pool mem, UniStr* s) except NULL | ||||
|     cdef int _add_lex_to_vocab(self, hash_t key, const LexemeC* lex) except -1 | ||||
| 
 | ||||
|     cdef PreshMap _map | ||||
|     cdef PreshMap _by_hash | ||||
|     cdef PreshMap _by_orth | ||||
|     cdef readonly int repvec_length | ||||
|  |  | |||
|  | @ -33,12 +33,15 @@ cdef class Vocab: | |||
|     def __init__(self, data_dir=None, get_lex_props=None, load_vectors=True, | ||||
|                  pos_tags=None): | ||||
|         self.mem = Pool() | ||||
|         self._map = PreshMap(2 ** 20) | ||||
|         self._by_hash = PreshMap() | ||||
|         self._by_orth = PreshMap() | ||||
|         self.strings = StringStore() | ||||
|         self.pos_tags = pos_tags if pos_tags is not None else {} | ||||
|         self.lexemes.push_back(&EMPTY_LEXEME) | ||||
| 
 | ||||
|         self.lexeme_props_getter = get_lex_props | ||||
|         self.repvec_length = 0 | ||||
|         self.length = 0 | ||||
|         self._add_lex_to_vocab(0, &EMPTY_LEXEME) | ||||
|         if data_dir is not None: | ||||
|             if not path.exists(data_dir): | ||||
|                 raise IOError("Directory %s not found -- cannot load Vocab." % data_dir) | ||||
|  | @ -52,34 +55,40 @@ cdef class Vocab: | |||
| 
 | ||||
|     def __len__(self): | ||||
|         """The current number of lexemes stored.""" | ||||
|         return self.lexemes.size() | ||||
|         return self.length | ||||
| 
 | ||||
|     cdef const LexemeC* get(self, Pool mem, UniStr* c_str) except NULL: | ||||
|         '''Get a pointer to a LexemeC from the lexicon, creating a new Lexeme | ||||
|         if necessary, using memory acquired from the given pool.  If the pool | ||||
|         is the lexicon's own memory, the lexeme is saved in the lexicon.''' | ||||
|         cdef LexemeC* lex | ||||
|         lex = <LexemeC*>self._map.get(c_str.key) | ||||
|         lex = <LexemeC*>self._by_hash.get(c_str.key) | ||||
|         if lex != NULL: | ||||
|             return lex | ||||
|         if c_str.n < 3: | ||||
|         #if c_str.n < 3: | ||||
|         oov = mem is not self.mem | ||||
|         mem = self.mem | ||||
|         cdef unicode py_str = c_str.chars[:c_str.n] | ||||
|         lex = <LexemeC*>mem.alloc(sizeof(LexemeC), 1) | ||||
|         props = self.lexeme_props_getter(py_str) | ||||
|         set_lex_struct_props(lex, props, self.strings, EMPTY_VEC) | ||||
|         if mem is self.mem: | ||||
|             lex.id = self.lexemes.size() | ||||
|         #if mem is self.mem: | ||||
|         #else: | ||||
|         if oov: | ||||
|             lex.id = 0 | ||||
|         self._add_lex_to_vocab(c_str.key, lex) | ||||
|         else: | ||||
|             lex.id = 1 | ||||
|         return lex | ||||
| 
 | ||||
|     cdef int _add_lex_to_vocab(self, hash_t key, const LexemeC* lex) except -1: | ||||
|         self._map.set(key, <void*>lex) | ||||
|         while self.lexemes.size() < (lex.id + 1): | ||||
|             self.lexemes.push_back(&EMPTY_LEXEME) | ||||
|         self.lexemes[lex.id] = lex | ||||
|         self._by_hash.set(key, <void*>lex) | ||||
|         self._by_orth.set(lex.orth, <void*>lex) | ||||
|         self.length += 1 | ||||
| 
 | ||||
|     def __iter__(self): | ||||
|         cdef attr_t orth | ||||
|         cdef size_t addr | ||||
|         for orth, addr in self._by_orth.items(): | ||||
|             yield Lexeme.from_ptr(<LexemeC*>addr, self.strings, self.repvec_length) | ||||
| 
 | ||||
|     def __getitem__(self,  id_or_string): | ||||
|         '''Retrieve a lexeme, given an int ID or a unicode string.  If a previously | ||||
|  | @ -98,13 +107,17 @@ cdef class Vocab: | |||
|         ''' | ||||
|         cdef UniStr c_str | ||||
|         cdef const LexemeC* lexeme | ||||
|         cdef attr_t orth | ||||
|         if type(id_or_string) == int: | ||||
|             if id_or_string >= self.lexemes.size(): | ||||
|                 raise IndexError | ||||
|             lexeme = self.lexemes.at(id_or_string) | ||||
|             orth = id_or_string | ||||
|             lexeme = <LexemeC*>self._by_orth.get(orth) | ||||
|             if lexeme == NULL: | ||||
|                 raise KeyError(id_or_string) | ||||
|             assert lexeme.orth == orth, ('%d vs %d' % (lexeme.orth, orth)) | ||||
|         elif type(id_or_string) == unicode: | ||||
|             slice_unicode(&c_str, id_or_string, 0, len(id_or_string)) | ||||
|             lexeme = self.get(self.mem, &c_str) | ||||
|             assert lexeme.orth == self.strings[id_or_string] | ||||
|         else: | ||||
|             raise ValueError("Vocab unable to map type: " | ||||
|                 "%s. Maps unicode --> Lexeme or " | ||||
|  | @ -115,12 +128,11 @@ cdef class Vocab: | |||
|         cdef UniStr c_str | ||||
|         slice_unicode(&c_str, py_str, 0, len(py_str)) | ||||
|         cdef LexemeC* lex | ||||
|         lex = <LexemeC*>self._map.get(c_str.key) | ||||
|         lex = <LexemeC*>self._by_hash.get(c_str.key) | ||||
|         if lex == NULL: | ||||
|             lex = <LexemeC*>self.mem.alloc(sizeof(LexemeC), 1) | ||||
|             lex.id = self.lexemes.size() | ||||
|             self._add_lex_to_vocab(c_str.key, lex) | ||||
|         set_lex_struct_props(lex, props, self.strings, EMPTY_VEC) | ||||
|         self._add_lex_to_vocab(c_str.key, lex) | ||||
| 
 | ||||
|     def dump(self, loc): | ||||
|         if path.exists(loc): | ||||
|  | @ -129,12 +141,10 @@ cdef class Vocab: | |||
|         cdef FILE* fp = fopen(<char*>bytes_loc, 'wb') | ||||
|         assert fp != NULL | ||||
|         cdef size_t st | ||||
|         cdef size_t addr | ||||
|         cdef hash_t key | ||||
|         for i in range(self._map.length): | ||||
|             key = self._map.c_map.cells[i].key | ||||
|             if key == 0: | ||||
|                 continue | ||||
|             lexeme = <LexemeC*>self._map.c_map.cells[i].value | ||||
|         for key, addr in self._by_hash.items(): | ||||
|             lexeme = <LexemeC*>addr | ||||
|             st = fwrite(&lexeme.orth, sizeof(lexeme.orth), 1, fp) | ||||
|             assert st == 1 | ||||
|             st = fwrite(lexeme, sizeof(LexemeC), 1, fp) | ||||
|  | @ -171,10 +181,9 @@ cdef class Vocab: | |||
|                 raise IOError('Error reading from lexemes.bin. Integrity check fails.') | ||||
|             py_str = self.strings[orth] | ||||
|             key = hash_string(py_str) | ||||
|             self._map.set(key, lexeme) | ||||
|             while self.lexemes.size() < (lexeme.id + 1): | ||||
|                 self.lexemes.push_back(&EMPTY_LEXEME) | ||||
|             self.lexemes[lexeme.id] = lexeme | ||||
|             self._by_hash.set(key, lexeme) | ||||
|             self._by_orth.set(lexeme.orth, lexeme) | ||||
|             self.length += 1 | ||||
|             i += 1 | ||||
|         fclose(fp) | ||||
| 
 | ||||
|  | @ -185,7 +194,7 @@ cdef class Vocab: | |||
|         cdef int32_t prev_vec_len = 0 | ||||
|         cdef float* vec | ||||
|         cdef Address mem | ||||
|         cdef id_t string_id | ||||
|         cdef attr_t string_id | ||||
|         cdef bytes py_word | ||||
|         cdef vector[float*] vectors | ||||
|         cdef int i | ||||
|  | @ -212,9 +221,9 @@ cdef class Vocab: | |||
|             assert vec != NULL | ||||
|             vectors[string_id] = vec | ||||
|         cdef LexemeC* lex | ||||
|         for i in range(self.lexemes.size()): | ||||
|             # Cast away the const, cos we can modify our lexemes | ||||
|             lex = <LexemeC*>self.lexemes[i] | ||||
|         cdef size_t lex_addr | ||||
|         for orth, lex_addr in self._by_orth.items(): | ||||
|             lex = <LexemeC*>lex_addr | ||||
|             if lex.lower < vectors.size(): | ||||
|                 lex.repvec = vectors[lex.lower] | ||||
|                 for i in range(vec_len): | ||||
|  |  | |||
		Loading…
	
		Reference in New Issue
	
	Block a user