Mirror of https://github.com/explosion/spaCy.git, synced 2025-10-30 23:47:31 +03:00

	Improve efficiency of special cases handling
* Use PhraseMatcher instead of Matcher
* Improve efficiency of merging/splitting special cases in document
* Process merge/splits in one pass without repeated token shifting
* Merge in place if no splits
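The one-pass strategy behind the last two bullets can be sketched outside of Cython. The snippet below is a simplified, pure-Python analogue, not code from this commit: the function name apply_special_cases and the sample data are hypothetical, plain string lists stand in for the doc's token array, and output always goes to a fresh buffer, leaving out the commit's further optimization of modifying the array in place when no replacement grows the document.

def apply_special_cases(tokens, matches, rules):
    """Replace non-overlapping (start, end) spans in a single pass.

    tokens  -- list of token strings (stand-in for the doc's token array)
    matches -- list of non-overlapping (start, end) token spans
    rules   -- dict mapping a span's text to its replacement token list
    """
    # Index spans by their start token for O(1) lookup during the pass.
    span_by_start = {start: (start, end) for start, end in matches}
    output = []
    i = 0
    while i < len(tokens):
        if i not in span_by_start:
            output.append(tokens[i])  # untouched token: copy through
            i += 1
        else:
            start, end = span_by_start[i]
            text = " ".join(tokens[start:end])
            # Fall back to the original tokens if no rule is registered.
            output.extend(rules.get(text, tokens[start:end]))
            i = end  # jump past the whole matched span, no tail shifting
    return output


print(apply_special_cases(["I", "ca", "n't", "do", "that"],
                          [(1, 3)],
                          {"ca n't": ["can't"]}))
# ['I', "can't", 'do', 'that']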
parent e74963acd4
commit d3990d080c
spacy/tokenizer.pxd
@@ -23,6 +23,7 @@ cdef class Tokenizer:
     cdef object _rules
     cdef object _special_matcher
     cdef int _property_init_count
+    cdef int _property_init_max
 
     cpdef Doc tokens_from_list(self, list strings)
 
spacy/tokenizer.pyx
@@ -5,6 +5,7 @@ from __future__ import unicode_literals
 
 from cython.operator cimport dereference as deref
 from cython.operator cimport preincrement as preinc
+from libc.string cimport memcpy, memset
 from cymem.cymem cimport Pool
 from preshed.maps cimport PreshMap
 cimport cython
@@ -20,7 +21,8 @@ from .errors import Errors, Warnings, deprecation_warning
 from . import util
 
 from .attrs import intify_attrs
-from .matcher import Matcher
+from .lexeme cimport EMPTY_LEXEME
+from .matcher import PhraseMatcher
 from .symbols import ORTH
 
 cdef class Tokenizer:
@@ -60,9 +62,10 @@ cdef class Tokenizer:
         self.infix_finditer = infix_finditer
         self.vocab = vocab
         self._rules = {}
-        self._special_matcher = Matcher(self.vocab)
+        self._special_matcher = PhraseMatcher(self.vocab)
         self._load_special_cases(rules)
         self._property_init_count = 0
+        self._property_init_max = 4
 
     property token_match:
         def __get__(self):
@@ -71,6 +74,7 @@ cdef class Tokenizer:
         def __set__(self, token_match):
             self._token_match = token_match
             self._reload_special_cases()
-            self._property_init_count += 1
+            if self._property_init_count <= self._property_init_max:
+                self._property_init_count += 1
 
     property prefix_search:
@@ -80,6 +84,7 @@ cdef class Tokenizer:
         def __set__(self, prefix_search):
             self._prefix_search = prefix_search
             self._reload_special_cases()
-            self._property_init_count += 1
+            if self._property_init_count <= self._property_init_max:
+                self._property_init_count += 1
 
     property suffix_search:
@@ -89,8 +94,10 @@ cdef class Tokenizer:
         def __set__(self, suffix_search):
             self._suffix_search = suffix_search
             self._reload_special_cases()
-            self._property_init_count += 1
+            if self._property_init_count <= self._property_init_max:
+                self._property_init_count += 1
 
+
     property infix_finditer:
         def __get__(self):
             return self._infix_finditer
@@ -98,6 +105,7 @@ cdef class Tokenizer:
         def __set__(self, infix_finditer):
             self._infix_finditer = infix_finditer
             self._reload_special_cases()
-            self._property_init_count += 1
+            if self._property_init_count <= self._property_init_max:
+                self._property_init_count += 1
 
     def __reduce__(self):
@@ -225,48 +233,79 @@ cdef class Tokenizer:
         doc (Doc): Document.
         """
         cdef int i
-        # Find all special cases and filter overlapping matches
-        spans = [doc[match[1]:match[2]] for match in self._special_matcher(doc)]
-        spans = util.filter_spans(spans)
-        spans = [(span.text, span.start, span.end) for span in spans]
-        # Modify tokenization according to filtered special cases
+        cdef int j
+        cdef int curr_length = doc.length
+        cdef int max_length = 0
         cdef int offset = 0
         cdef int span_length_diff = 0
+        cdef bint modify_in_place = True
         cdef int idx_offset = 0
+        cdef int orig_final_spacy
+        cdef int orig_idx
+        cdef Pool mem = Pool()
+        spans = [doc[match[1]:match[2]] for match in self._special_matcher(doc)]
+        # Skip processing if no matches
+        if len(spans) == 0:
+            return True
+        spans = util.filter_spans(spans)
+        # Put span info in span.start-indexed dict and calculate maximum
+        # intermediate document size
+        span_data = {}
         for span in spans:
-            if not span[0] in self._rules:
-                continue
-            # Allocate more memory for doc if needed
-            span_length_diff = len(self._rules[span[0]]) - (span[2] - span[1])
-            while doc.length + offset + span_length_diff >= doc.max_length:
-                doc._realloc(doc.length * 2)
-            # Find special case entries in cache
-            cached = <_Cached*>self._specials.get(hash_string(span[0]))
-            if cached == NULL:
-                continue
-            # Shift original tokens...
-            # ...from span position to end if new span is shorter
-            if span_length_diff < 0:
-                for i in range(span[2] + offset, doc.length):
-                    doc.c[span_length_diff + i] = doc.c[i]
-            # ...from end to span position if new span is longer
-            elif span_length_diff > 0:
-                for i in range(doc.length - 1, span[2] + offset - 1, -1):
-                    doc.c[span_length_diff + i] = doc.c[i]
-            # Copy special case tokens into doc and adjust token and character
-            # offsets
-            idx_offset = 0
-            orig_final_spacy = doc.c[span[2] + offset - 1].spacy
-            for i in range(cached.length):
-                orig_idx = doc.c[span[1] + offset + i].idx
-                doc.c[span[1] + offset + i] = cached.data.tokens[i]
-                doc.c[span[1] + offset + i].idx = orig_idx + idx_offset
-                idx_offset += cached.data.tokens[i].lex.length + \
-                        1 if cached.data.tokens[i].spacy else 0
-            doc.c[span[2] + offset + - 1].spacy = orig_final_spacy
-            # Token offset for special case spans
-            offset += span_length_diff
-            doc.length += span_length_diff
+            rule = self._rules.get(span.text, None)
+            span_length_diff = 0
+            if rule:
+                span_length_diff = len(rule) - (span.end - span.start)
+            if span_length_diff > 0:
+                modify_in_place = False
+            curr_length += span_length_diff
+            if curr_length > max_length:
+                max_length = curr_length
+            span_data[span.start] = (span.text, span.start, span.end, span_length_diff)
+        # Modify tokenization according to filtered special cases
+        # If modifications never increase doc length, can modify in place
+        if modify_in_place:
+            tokens = doc.c
+        # Otherwise create a separate array to store modified tokens
+        else:
+            tokens = <TokenC*>mem.alloc(max_length, sizeof(TokenC))
+        i = 0
+        while i < doc.length:
+            if not i in span_data:
+                tokens[i + offset] = doc.c[i]
+                i += 1
+            else:
+                span = span_data[i]
+                cached = <_Cached*>self._specials.get(hash_string(span[0]))
+                if cached == NULL:
+                    # Copy original tokens if no rule found
+                    for j in range(span[2] - span[1]):
+                        tokens[i + offset + j] = doc.c[i + j]
+                    i += span[2] - span[1]
+                else:
+                    # Copy special case tokens into doc and adjust token and
+                    # character offsets
+                    idx_offset = 0
+                    orig_final_spacy = doc.c[span[2] + offset - 1].spacy
+                    orig_idx = doc.c[i].idx
+                    for j in range(cached.length):
+                        tokens[i + offset + j] = cached.data.tokens[j]
+                        tokens[i + offset + j].idx = orig_idx + idx_offset
+                        idx_offset += cached.data.tokens[j].lex.length + \
+                                1 if cached.data.tokens[j].spacy else 0
+                    tokens[i + offset + cached.length - 1].spacy = orig_final_spacy
+                    i += span[2] - span[1]
+                    offset += span[3]
+        # Allocate more memory for doc if needed
+        while doc.length < doc.length + offset:
+            doc._realloc(doc.length * 2)
+        # If not modified in place, copy tokens back to doc
+        if not modify_in_place:
+            memcpy(doc.c, tokens, max_length * sizeof(TokenC))
+        for i in range(doc.length + offset, doc.length):
+            memset(&doc.c[i], 0, sizeof(TokenC))
+            doc.c[i].lex = &EMPTY_LEXEME
+        doc.length = doc.length + offset
         return True
 
     cdef int _try_cache(self, hash_t key, Doc tokens) except -1:
@@ -287,10 +326,6 @@ cdef class Tokenizer:
         if cached == NULL:
             return False
         cdef int i
-        if cached.is_lex:
-            for i in range(cached.length):
-                tokens.push_back(cached.data.lexemes[i], False)
-        else:
-            for i in range(cached.length):
-                tokens.push_back(&cached.data.tokens[i], False)
+        for i in range(cached.length):
+            tokens.push_back(&cached.data.tokens[i], False)
         has_special[0] = 1
@@ -521,7 +556,7 @@ cdef class Tokenizer:
         self._rules[string] = substrings
         self._flush_cache()
         if self.find_prefix(string) or self.find_infix(string) or self.find_suffix(string):
-            self._special_matcher.add(string, None, [{ORTH: token.text} for token in self._tokenize_affixes(string, False)])
+            self._special_matcher.add(string, None, self._tokenize_affixes(string, False))
 
     def _reload_special_cases(self):
         try:
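For the first bullet of the commit message and the _special_matcher.add change in the last hunk above, the practical difference between the two matchers under the spaCy v2 API that this commit targets is roughly as follows. This is an illustrative, hypothetical example (the key "DONT" and the sample text are made up), not code from the commit; in the commit itself the PhraseMatcher is fed the Doc returned by _tokenize_affixes(string, False) rather than a list of token-attribute patterns. Note that spaCy v3 later changed the .add signature.

from spacy.lang.en import English
from spacy.matcher import Matcher, PhraseMatcher

nlp = English()  # tokenizer-only pipeline

# Matcher: each pattern is a list of per-token attribute dicts.
matcher = Matcher(nlp.vocab)
matcher.add("DONT", None, [{"ORTH": "do"}, {"ORTH": "n't"}])

# PhraseMatcher: patterns are pre-tokenized Doc objects, which is cheaper
# when the patterns are many literal strings, as with special cases.
phrase_matcher = PhraseMatcher(nlp.vocab)
phrase_matcher.add("DONT", None, nlp("don't"))

doc = nlp("I don't know")
print(matcher(doc))         # [(match_id, 1, 3)]
print(phrase_matcher(doc))  # [(match_id, 1, 3)]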