Mirror of https://github.com/explosion/spaCy.git
Refactor tokenization, splitting it into a clearer life-cycle.
parent c396581a0b
commit 143e51ec73
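The refactor splits Language._tokenize into a pipeline of smaller stages: _check_cache (emit a previously tokenized chunk directly), _find_prefix/_find_suffix (peel affixes off either end of the remaining string), _attach_tokens (emit prefixes, then the stem, then the suffixes in reverse), and _save_cached (still a stub in this commit). A schematic pure-Python sketch of that life-cycle — illustrative only; `cache` as a dict and `lexicon` as a callable are stand-ins for the C structures in the diff:

    def tokenize_chunk(chunk, cache, lexicon, find_prefix, find_suffix):
        # _check_cache: a chunk seen before is emitted directly.
        if chunk in cache:
            return list(cache[chunk])
        prefixes, suffixes = [], []
        while chunk:
            n = find_prefix(chunk)
            if n < 1:
                break
            prefixes.append(lexicon(chunk[:n]))    # peel leading affixes
            chunk = chunk[n:]
        while chunk:
            n = find_suffix(chunk)
            if n < 1:
                break
            suffixes.append(lexicon(chunk[-n:]))   # peel trailing affixes, outside-in
            chunk = chunk[:-n]
        # _attach_tokens: prefixes first, then the stem, then suffixes in reverse.
        tokens = list(prefixes)
        if chunk:
            tokens.append(lexicon(chunk))
        tokens.extend(reversed(suffixes))
        # _save_cached would store the result under the original key;
        # in this commit it is still a `pass` stub.
        return tokens

In the committed Cython, _attach_tokens additionally re-runs _check_cache on the stem once the affixes are stripped, so an already-cached stem can still be emitted wholesale.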
@@ -94,3 +94,8 @@ you'll  you will
 you're  you are
 you've  you have
 10km    10 km
+U.S.    U.S.
+U.N.    U.N.
+Ms. Ms.
+Mr. Mr.
+P.  P.
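These rows extend the special-case table: the left column is a raw whitespace-delimited chunk, the right column the tokens it should expand to, so mapping U.S. to itself keeps the trailing period attached rather than letting the suffix pass split it off. A minimal sketch of a loader for this two-column format (hypothetical helper, not code from the commit):

    def load_special_rules(path):
        # Each non-empty line: "<chunk>  <token1 token2 ...>".
        rules = {}
        with open(path, encoding='utf8') as f:
            for line in f:
                if not line.strip():
                    continue
                chunk, tokens = line.split(None, 1)
                rules[chunk] = tokens.split()   # "you'll" -> ["you", "will"]
        return rules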
@@ -4,4 +4,4 @@ from spacy.tokens cimport Tokens


 cdef class English(Language):
-    cdef int _split_one(self, Py_UNICODE* characters, size_t length)
+    pass
spacy/en.pyx (57 lines changed)
@@ -56,26 +56,47 @@ cdef class English(Language):
         name (unicode): The two letter code used by Wikipedia for the language.
         lexicon (Lexicon): The lexicon. Exposes the lookup method.
     """
-    cdef int _split_one(self, Py_UNICODE* characters, size_t length):
-        if length == 1:
-            return 1
-        if characters[0] == "'" and (characters[1] == "s" or characters[1] == "S"):
-            return 2
-        cdef int i = 0
-        # Leading punctuation
-        if _check_punct(characters, 0, length):
-            return 1
-        # Contractions
-        elif length >= 3 and characters[length - 2] == "'":
-            c2 = characters[length-1]
-            if c2 == "s" or c2 == "S":
-                return length - 2
-        if length >= 1:
-            # Split off all trailing punctuation characters
-            i = 0
-            while i < length and not _check_punct(characters, i, length):
-                i += 1
-        return i
+    cdef int _find_prefix(self, Py_UNICODE* chars, size_t length) except -1:
+        cdef Py_UNICODE c0 = chars[0]
+        cdef Py_UNICODE c1 = chars[1]
+        if c0 == ",":
+            return 1
+        elif c0 == '"':
+            return 1
+        elif c0 == "(":
+            return 1
+        elif c0 == "[":
+            return 1
+        elif c0 == "{":
+            return 1
+        elif c0 == "*":
+            return 1
+        elif c0 == "<":
+            return 1
+        elif c0 == "$":
+            return 1
+        elif c0 == "£":
+            return 1
+        elif c0 == "€":
+            return 1
+        elif c0 == "\u201c":
+            return 1
+        elif c0 == "'":
+            if c1 == "s":
+                return 2
+            elif c1 == "S":
+                return 2
+            elif c1 == "'":
+                return 2
+            else:
+                return 1
+        elif c0 == "`":
+            if c1 == "`":
+                return 2
+            else:
+                return 1
+        else:
+            return 0
 
 abbreviations = set(['U.S', 'u.s', 'U.N', 'Ms', 'Mr', 'P'])
 cdef bint _check_punct(Py_UNICODE* characters, size_t i, size_t length):
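English._find_prefix encodes the leading-affix rules as an explicit decision table: it returns how many characters to split off the front, with 0 meaning no prefix. An illustrative pure-Python rendering of the same table (note that the committed Cython reads chars[1] unconditionally, so it implicitly relies on chunks of length >= 2 or on a terminated buffer):

    OPEN_CHARS = set(',"([{*<$£€\u201c')   # opening punctuation and currency marks

    def find_prefix(chars):
        if not chars:
            return 0
        c0 = chars[0]
        c1 = chars[1] if len(chars) > 1 else ''
        if c0 in OPEN_CHARS:
            return 1
        if c0 == "'":
            return 2 if c1 in ("s", "S", "'") else 1   # 's / 'S / '' contractions
        if c0 == "`":
            return 2 if c1 == "`" else 1               # `` opening quote
        return 0

For example, find_prefix('"Hello') returns 1, and find_prefix("'s") returns 2.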
@@ -42,5 +42,14 @@ cdef class Language:
     cpdef Tokens tokenize(self, unicode text)
     cpdef Lexeme lookup(self, unicode text)
 
+    cdef int _check_cache(self, vector[LexemeC*] *tokens, String* string) except -1
     cdef int _tokenize(self, vector[LexemeC*] *tokens_v, String* string) except -1
     cdef int _split_one(self, Py_UNICODE* characters, size_t length)
+    cdef int _find_prefix(self, Py_UNICODE* characters, size_t length)
+    cdef int _find_suffix(self, Py_UNICODE* characters, size_t length)
+
+    cdef int _attach_tokens(self, vector[LexemeC*] *tokens, String* string,
+                            vector[LexemeC*] *prefixes,
+                            vector[LexemeC*] *suffixes) except -1
+
+    cdef int _save_cached(self, vector[LexemeC*] *tokens, uint64_t key, size_t n) except -1
spacy/lang.pyx (114 lines changed)
@@ -20,6 +20,9 @@ from spacy.lexeme cimport LexemeC, lexeme_init
 from murmurhash.mrmr cimport hash64
 from cpython.ref cimport Py_INCREF
 
+from cython.operator cimport preincrement as preinc
+from cython.operator cimport dereference as deref
+
 
 from spacy._hashing cimport PointerHash
 from spacy import orth
@@ -191,42 +194,77 @@ cdef class Language:
         return tokens
 
     cdef int _tokenize(self, vector[LexemeC*] *tokens_v, String* string) except -1:
-        cdef LexemeC** lexemes = <LexemeC**>self.cache.get(string.key)
-        cdef size_t i
-        if lexemes != NULL:
-            i = 0
-            while lexemes[i] != NULL:
-                tokens_v.push_back(lexemes[i])
-                i += 1
-        cdef uint64_t key = string.key
-        cdef size_t first_token = tokens_v.size()
-        cdef int split
-        cdef int remaining = string.n
-        cdef String prefix
-        cdef LexemeC* lexeme
-        while remaining >= 1:
-            split = self._split_one(string.chars, string.n)
-            remaining -= split
-            string_slice_prefix(string, &prefix, split)
-            lexemes = <LexemeC**>self.specials.get(prefix.key)
-            if lexemes != NULL:
-                i = 0
-                while lexemes[i] != NULL:
-                    tokens_v.push_back(lexemes[i])
-                    i += 1
-            else:
-                lexeme = <LexemeC*>self.lexicon.get(&prefix)
-                tokens_v.push_back(lexeme)
-        lexemes = <LexemeC**>calloc((tokens_v.size() - first_token) + 1, sizeof(LexemeC*))
-        cdef size_t j
-        for i, j in enumerate(range(first_token, tokens_v.size())):
-            lexemes[i] = tokens_v[0][j]
-        lexemes[i+1] = NULL
-        self.cache.set(key, lexemes)
-
-    cdef int _split_one(self, Py_UNICODE* characters, size_t length):
-        return length
+        self._check_cache(tokens_v, string)
+        if not string.n:
+            return 0
+        cdef uint64_t orig_key = string.key
+        cdef size_t orig_size = tokens_v.size()
+
+        cdef vector[LexemeC*] prefixes
+        cdef vector[LexemeC*] suffixes
+
+        cdef String affix
+        cdef int split = self._find_prefix(string.chars, string.n)
+        while string.n and split >= 1:
+            string_slice_prefix(string, &affix, split)
+            prefixes.push_back(self.lexicon.get(&affix))
+            split = self._find_prefix(string.chars, string.n)
+
+        split = self._find_suffix(string.chars, string.n)
+        while string.n and split >= 1:
+            string_slice_suffix(string, &affix, split)
+            suffixes.push_back(self.lexicon.get(&affix))
+            split = self._find_suffix(string.chars, string.n)
+
+        self._attach_tokens(tokens_v, string, &prefixes, &suffixes)
+        self._save_cached(tokens_v, orig_key, orig_size)
+
+    cdef int _check_cache(self, vector[LexemeC*] *tokens, String* string) except -1:
+        lexemes = <LexemeC**>self.cache.get(string.key)
+        cdef size_t i = 0
+        if lexemes != NULL:
+            while lexemes[i] != NULL:
+                tokens.push_back(lexemes[i])
+                i += 1
+            string.n = 0
+            string.key = 0
+            string.chars = NULL
+
+    cdef int _attach_tokens(self, vector[LexemeC*] *tokens, String* string,
+                            vector[LexemeC*] *prefixes,
+                            vector[LexemeC*] *suffixes) except -1:
+        cdef LexemeC* lexeme
+        for lexeme in prefixes[0]:
+            tokens.push_back(lexeme)
+        self._check_cache(tokens, string)
+        if string.n != 0:
+            tokens.push_back(self.lexicon.get(string))
+        cdef vector[LexemeC*].reverse_iterator it = suffixes.rbegin()
+        while it != suffixes.rend():
+            tokens.push_back(deref(it))
+            preinc(it)
+
+    cdef int _save_cached(self, vector[LexemeC*] *tokens,
+                          uint64_t key, size_t n) except -1:
+        pass
+
+    cdef int _find_prefix(self, Py_UNICODE* characters, size_t length):
+        return 0
+
+    cdef int _find_suffix(self, Py_UNICODE* characters, size_t length):
+        if length < 2:
+            return 0
+        cdef unicode string = characters[:length]
+        print repr(string)
+        if string.endswith("'s") or string.endswith("'S"):
+            return 2
+        elif string.endswith("..."):
+            return 3
+        elif not string[-1].isalnum():
+            return 1
+        else:
+            return 0
 
     def _load_special_tokenization(self, token_rules):
         '''Load special-case tokenization rules.
@@ -328,3 +366,11 @@ cdef inline void string_slice_prefix(String* s, String* prefix, size_t n) nogil:
     s.chars += n
     s.n -= n
     s.key = hash64(s.chars, s.n * sizeof(Py_UNICODE), 0)
+
+
+cdef inline void string_slice_suffix(String* s, String* suffix, size_t n) nogil:
+    string_from_slice(suffix, s.chars, s.n - n, s.n)
+    s.n -= n
+    s.key = hash64(s.chars, s.n * sizeof(Py_UNICODE), 0)
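_find_suffix peels trailing affixes from the outside in, which is why _attach_tokens walks the collected suffixes with a C++ reverse_iterator — the reason for the new preinc/deref cimports. A small check of that ordering, assuming one trailing punctuation character is split per pass:

    suffixes = []
    chunk = 'goodbye).'
    while chunk and not chunk[-1].isalnum():    # mimic the one-char suffix rule
        suffixes.append(chunk[-1])              # collected outside-in: '.', then ')'
        chunk = chunk[:-1]
    assert chunk == 'goodbye'
    assert suffixes == ['.', ')']
    assert list(reversed(suffixes)) == [')', '.']   # reading order restored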