mirror of
				https://github.com/explosion/spaCy.git
				synced 2025-10-25 21:21:10 +03:00 
			
		
		
		
	* Refactor tokenizer, to set the 'spacy' field on TokenC instead of passing a string
This commit is contained in:
		
							parent
							
								
									6eef0bf9ab
								
							
						
					
					
						commit
						67641f3b58
					
				|  | @ -29,7 +29,7 @@ cdef class Tokenizer: | |||
| 
 | ||||
|     cpdef Doc tokens_from_list(self, list strings) | ||||
| 
 | ||||
|     cdef int _try_cache(self, int idx, hash_t key, Doc tokens) except -1 | ||||
|     cdef int _try_cache(self, hash_t key, Doc tokens) except -1 | ||||
|     cdef int _tokenize(self, Doc tokens, UniStr* span, int start, int end) except -1 | ||||
|     cdef UniStr* _split_affixes(self, UniStr* string, vector[LexemeC*] *prefixes, | ||||
|                              vector[LexemeC*] *suffixes) except NULL | ||||
|  |  | |||
|  | @ -39,16 +39,17 @@ cdef class Tokenizer: | |||
|         return cls(vocab, rules, prefix_re, suffix_re, infix_re) | ||||
| 
 | ||||
|     cpdef Doc tokens_from_list(self, list strings): | ||||
|         cdef int length = sum([len(s) for s in strings]) | ||||
|         cdef Doc tokens = Doc(self.vocab, ' '.join(strings)) | ||||
|         if length == 0: | ||||
|         cdef Doc tokens = Doc(self.vocab) | ||||
|         if sum([len(s) for s in strings]) == 0: | ||||
|             return tokens | ||||
|         cdef UniStr string_struct | ||||
|         cdef unicode py_string | ||||
|         cdef int idx = 0 | ||||
|         for i, py_string in enumerate(strings): | ||||
|             slice_unicode(&string_struct, py_string, 0, len(py_string)) | ||||
|             tokens.push_back(idx, <const LexemeC*>self.vocab.get(tokens.mem, &string_struct)) | ||||
|             # Note that we pass tokens.mem here --- the Doc object has ownership | ||||
|             tokens.push_back( | ||||
|                 <const LexemeC*>self.vocab.get(tokens.mem, &string_struct), True) | ||||
|             idx += len(py_string) + 1 | ||||
|         return tokens | ||||
| 
 | ||||
|  | @ -73,7 +74,7 @@ cdef class Tokenizer: | |||
|             tokens (Doc): A Doc object, giving access to a sequence of LexemeCs. | ||||
|         """ | ||||
|         cdef int length = len(string) | ||||
|         cdef Doc tokens = Doc(self.vocab, string) | ||||
|         cdef Doc tokens = Doc(self.vocab) | ||||
|         if length == 0: | ||||
|             return tokens | ||||
|         cdef int i = 0 | ||||
|  | @ -86,32 +87,39 @@ cdef class Tokenizer: | |||
|             if Py_UNICODE_ISSPACE(chars[i]) != in_ws: | ||||
|                 if start < i: | ||||
|                     slice_unicode(&span, chars, start, i) | ||||
|                     cache_hit = self._try_cache(start, span.key, tokens) | ||||
|                     cache_hit = self._try_cache(span.key, tokens) | ||||
|                     if not cache_hit: | ||||
|                         self._tokenize(tokens, &span, start, i) | ||||
|                 in_ws = not in_ws | ||||
|                 start = i | ||||
|                 if chars[i] == ' ': | ||||
|                     tokens.data[tokens.length - 1].spacy = True | ||||
|                     start += 1 | ||||
|         i += 1 | ||||
|         if start < i: | ||||
|             slice_unicode(&span, chars, start, i) | ||||
|             cache_hit = self._try_cache(start, span.key, tokens) | ||||
|             cache_hit = self._try_cache(span.key, tokens) | ||||
|             if not cache_hit: | ||||
|                 self._tokenize(tokens, &span, start, i) | ||||
| 
 | ||||
|             tokens.data[tokens.length - 1].spacy = string[-1] == ' ' | ||||
|         return tokens | ||||
| 
 | ||||
|     cdef int _try_cache(self, int idx, hash_t key, Doc tokens) except -1: | ||||
|     cdef int _try_cache(self, hash_t key, Doc tokens) except -1: | ||||
|         cached = <_Cached*>self._cache.get(key) | ||||
|         if cached == NULL: | ||||
|             return False | ||||
|         cdef int i | ||||
|         cdef int less_one = cached.length-1 | ||||
|         if cached.is_lex: | ||||
|             for i in range(cached.length): | ||||
|                 idx = tokens.push_back(idx, cached.data.lexemes[i]) | ||||
|             for i in range(less_one): | ||||
|                 # There's a space at the end of the chunk. | ||||
|                 tokens.push_back(cached.data.lexemes[i], False) | ||||
|             tokens.push_back(cached.data.lexemes[less_one], False) | ||||
|         else: | ||||
|             for i in range(cached.length): | ||||
|                 idx = tokens.push_back(idx, &cached.data.tokens[i]) | ||||
|             for i in range(less_one): | ||||
|                 tokens.push_back(&cached.data.tokens[i], False) | ||||
|             tokens.push_back(&cached.data.tokens[less_one], False) | ||||
|         return True | ||||
| 
 | ||||
|     cdef int _tokenize(self, Doc tokens, UniStr* span, int start, int end) except -1: | ||||
|  | @ -171,36 +179,39 @@ cdef class Tokenizer: | |||
|                             vector[const LexemeC*] *prefixes, | ||||
|                             vector[const LexemeC*] *suffixes) except -1: | ||||
|         cdef bint cache_hit | ||||
|         cdef bint is_spacy | ||||
|         cdef int split | ||||
|         cdef const LexemeC* const* lexemes | ||||
|         cdef LexemeC* lexeme | ||||
|         cdef const LexemeC* lexeme | ||||
|         cdef UniStr span | ||||
|         cdef int i | ||||
|         # Have to calculate is_spacy here, i.e. does the token have a trailing | ||||
|         # space. There are no spaces *between* the tokens we attach | ||||
|         # here, and there *is* a space after the last token. | ||||
|         if prefixes.size(): | ||||
|             for i in range(prefixes.size()): | ||||
|                 idx = tokens.push_back(idx, prefixes[0][i]) | ||||
|                 tokens.push_back(prefixes[0][i], False) | ||||
|         if string.n != 0: | ||||
|             cache_hit = self._try_cache(idx, string.key, tokens) | ||||
|             cache_hit = self._try_cache(string.key, tokens) | ||||
|             if cache_hit: | ||||
|                 # Get last idx | ||||
|                 idx = tokens.data[tokens.length - 1].idx | ||||
|                 # Increment by last length | ||||
|                 idx += tokens.data[tokens.length - 1].lex.length | ||||
|                 pass | ||||
|             else: | ||||
|                 split = self._find_infix(string.chars, string.n) | ||||
|                 if split == 0 or split == -1: | ||||
|                     idx = tokens.push_back(idx, self.vocab.get(tokens.mem, string)) | ||||
|                     tokens.push_back(self.vocab.get(tokens.mem, string), False) | ||||
|                 else: | ||||
|                     # Append the beginning, afix, end of the infix token | ||||
|                     slice_unicode(&span, string.chars, 0, split) | ||||
|                     idx = tokens.push_back(idx, self.vocab.get(tokens.mem, &span)) | ||||
|                     tokens.push_back(self.vocab.get(tokens.mem, &span), False) | ||||
|                     slice_unicode(&span, string.chars, split, split+1) | ||||
|                     idx = tokens.push_back(idx, self.vocab.get(tokens.mem, &span)) | ||||
|                     tokens.push_back(self.vocab.get(tokens.mem, &span), False) | ||||
|                     slice_unicode(&span, string.chars, split + 1, string.n) | ||||
|                     idx = tokens.push_back(idx, self.vocab.get(tokens.mem, &span)) | ||||
|                     tokens.push_back(self.vocab.get(tokens.mem, &span), False) | ||||
|         cdef vector[const LexemeC*].reverse_iterator it = suffixes.rbegin() | ||||
|         while it != suffixes.rend(): | ||||
|             idx = tokens.push_back(idx, deref(it)) | ||||
|             lexeme = deref(it) | ||||
|             preinc(it) | ||||
|             tokens.push_back(lexeme, False) | ||||
| 
 | ||||
|     cdef int _save_cached(self, const TokenC* tokens, hash_t key, int n) except -1: | ||||
|         cdef int i | ||||
|  |  | |||
|  | @ -26,7 +26,7 @@ cdef class Doc: | |||
|     cdef int length | ||||
|     cdef int max_length | ||||
| 
 | ||||
|     cdef int push_back(self, int i, LexemeOrToken lex_or_tok) except -1 | ||||
|     cdef int push_back(self, LexemeOrToken lex_or_tok, bint trailing_space) except -1 | ||||
| 
 | ||||
|     cpdef np.ndarray to_array(self, object features) | ||||
| 
 | ||||
|  |  | |||
|  | @ -173,7 +173,7 @@ cdef class Doc: | |||
|                 start = i | ||||
|         yield Span(self, start, self.length) | ||||
| 
 | ||||
|     cdef int push_back(self, int idx, LexemeOrToken lex_or_tok) except -1: | ||||
|     cdef int push_back(self, LexemeOrToken lex_or_tok, bint has_space) except -1: | ||||
|         if self.length == self.max_length: | ||||
|             self._realloc(self.length * 2) | ||||
|         cdef TokenC* t = &self.data[self.length] | ||||
|  | @ -181,9 +181,13 @@ cdef class Doc: | |||
|             t[0] = lex_or_tok[0] | ||||
|         else: | ||||
|             t.lex = lex_or_tok | ||||
|         t.idx = idx | ||||
|         if self.length == 0: | ||||
|             t.idx = 0 | ||||
|         else: | ||||
|             t.idx = (t-1).idx + (t-1).lex.length + (t-1).spacy | ||||
|         t.spacy = has_space | ||||
|         self.length += 1 | ||||
|         return idx + t.lex.length | ||||
|         return t.idx + t.lex.length + t.spacy | ||||
| 
 | ||||
|     @cython.boundscheck(False) | ||||
|     cpdef np.ndarray to_array(self, object py_attr_ids): | ||||
|  | @ -375,11 +379,11 @@ cdef class Doc: | |||
|             string += vocab.strings[lex.orth] | ||||
|             if space: | ||||
|                 string += u' ' | ||||
|         cdef Doc doc = Doc(vocab, string) | ||||
|         cdef Doc doc = Doc(vocab) | ||||
|         cdef bint has_space = False | ||||
|         cdef int idx = 0 | ||||
|         for i, id_ in enumerate(ids): | ||||
|             doc.push_back(idx, vocab.lexemes[id_]) | ||||
|             idx += vocab.lexemes[id_].length | ||||
|             if spaces[i]: | ||||
|                 idx += 1 | ||||
|             lex = vocab.lexemes[id_] | ||||
|             has_space = spaces[i] | ||||
|             doc.push_back(lex, has_space) | ||||
|         return doc | ||||
|  |  | |||
		Loading…
	
		Reference in New Issue
	
	Block a user