Improve efficiency of special cases handling

* Use PhraseMatcher instead of Matcher
* Improve efficiency of merging/splitting special cases in document
  * Process merge/splits in one pass without repeated token shifting
  * Merge in place if no splits
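
Below is a minimal pure-Python sketch of the single-pass rewrite described above; plain lists stand in for the doc's token array, and the names (`apply_special_cases`, `span_data`) are illustrative rather than the tokenizer's actual internals:

```python
# Illustrative sketch only: plain Python lists stand in for the Cython
# token array; names are hypothetical, not spaCy internals.

def apply_special_cases(tokens, span_data):
    """Rewrite `tokens` in a single left-to-right pass.

    tokens: list of token strings
    span_data: dict mapping span start -> (span end, replacement tokens)
    """
    # First pass over the matched spans: decide whether any replacement
    # makes the sequence longer. If not, the rewrite could reuse the
    # existing buffer ("merge in place if no splits").
    modify_in_place = all(len(new) <= end - start
                          for start, (end, new) in span_data.items())

    out = []  # in the Cython version this is either doc.c or a scratch array
    i = 0
    while i < len(tokens):
        if i in span_data:
            end, new = span_data[i]
            out.extend(new)        # copy the special-case tokens
            i = end                # skip the original span; no shifting needed
        else:
            out.append(tokens[i])  # copy an unchanged token
            i += 1
    return out, modify_in_place


tokens, in_place = apply_special_cases(
    ["ca", "n't", "!"], {0: (2, ["can", "not"])})
print(tokens, in_place)  # ['can', 'not', '!'] True
```

In the Cython implementation the same walk writes `TokenC` structs either directly into `doc.c` (when no replacement grows the document) or into a scratch buffer that is copied back with `memcpy`.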
Adriane Boyd 2019-09-20 16:39:30 +02:00
parent e74963acd4
commit d3990d080c
2 changed files with 87 additions and 51 deletions

--- a/spacy/tokenizer.pxd
+++ b/spacy/tokenizer.pxd
@@ -23,6 +23,7 @@ cdef class Tokenizer:
     cdef object _rules
     cdef object _special_matcher
     cdef int _property_init_count
+    cdef int _property_init_max
     cpdef Doc tokens_from_list(self, list strings)

--- a/spacy/tokenizer.pyx
+++ b/spacy/tokenizer.pyx

@@ -5,6 +5,7 @@ from __future__ import unicode_literals
 from cython.operator cimport dereference as deref
 from cython.operator cimport preincrement as preinc
+from libc.string cimport memcpy, memset
 from cymem.cymem cimport Pool
 from preshed.maps cimport PreshMap
 cimport cython
@@ -20,7 +21,8 @@ from .errors import Errors, Warnings, deprecation_warning
 from . import util
 from .attrs import intify_attrs
-from .matcher import Matcher
+from .lexeme cimport EMPTY_LEXEME
+from .matcher import PhraseMatcher
 from .symbols import ORTH
 
 cdef class Tokenizer:
@@ -60,9 +62,10 @@ cdef class Tokenizer:
         self.infix_finditer = infix_finditer
         self.vocab = vocab
         self._rules = {}
-        self._special_matcher = Matcher(self.vocab)
+        self._special_matcher = PhraseMatcher(self.vocab)
         self._load_special_cases(rules)
         self._property_init_count = 0
+        self._property_init_max = 4
 
     property token_match:
         def __get__(self):
@@ -71,7 +74,8 @@ cdef class Tokenizer:
         def __set__(self, token_match):
             self._token_match = token_match
             self._reload_special_cases()
-            self._property_init_count += 1
+            if self._property_init_count <= self._property_init_max:
+                self._property_init_count += 1
 
     property prefix_search:
         def __get__(self):
@@ -80,7 +84,8 @@ cdef class Tokenizer:
         def __set__(self, prefix_search):
             self._prefix_search = prefix_search
             self._reload_special_cases()
-            self._property_init_count += 1
+            if self._property_init_count <= self._property_init_max:
+                self._property_init_count += 1
 
     property suffix_search:
         def __get__(self):
@@ -89,7 +94,9 @@ cdef class Tokenizer:
         def __set__(self, suffix_search):
             self._suffix_search = suffix_search
             self._reload_special_cases()
-            self._property_init_count += 1
+            if self._property_init_count <= self._property_init_max:
+                self._property_init_count += 1
 
     property infix_finditer:
         def __get__(self):
@@ -98,7 +105,8 @@ cdef class Tokenizer:
         def __set__(self, infix_finditer):
             self._infix_finditer = infix_finditer
             self._reload_special_cases()
-            self._property_init_count += 1
+            if self._property_init_count <= self._property_init_max:
+                self._property_init_count += 1
 
     def __reduce__(self):
         args = (self.vocab,
@@ -225,48 +233,79 @@ cdef class Tokenizer:
         doc (Doc): Document.
         """
         cdef int i
-        # Find all special cases and filter overlapping matches
-        spans = [doc[match[1]:match[2]] for match in self._special_matcher(doc)]
-        spans = util.filter_spans(spans)
-        spans = [(span.text, span.start, span.end) for span in spans]
-        # Modify tokenization according to filtered special cases
-        cdef int offset = 0
-        cdef int span_length_diff = 0
-        cdef int idx_offset = 0
-        for span in spans:
-            if not span[0] in self._rules:
-                continue
-            # Allocate more memory for doc if needed
-            span_length_diff = len(self._rules[span[0]]) - (span[2] - span[1])
-            while doc.length + offset + span_length_diff >= doc.max_length:
-                doc._realloc(doc.length * 2)
-            # Find special case entries in cache
-            cached = <_Cached*>self._specials.get(hash_string(span[0]))
-            if cached == NULL:
-                continue
-            # Shift original tokens...
-            # ...from span position to end if new span is shorter
-            if span_length_diff < 0:
-                for i in range(span[2] + offset, doc.length):
-                    doc.c[span_length_diff + i] = doc.c[i]
-            # ...from end to span position if new span is longer
-            elif span_length_diff > 0:
-                for i in range(doc.length - 1, span[2] + offset - 1, -1):
-                    doc.c[span_length_diff + i] = doc.c[i]
-            # Copy special case tokens into doc and adjust token and character
-            # offsets
-            idx_offset = 0
-            orig_final_spacy = doc.c[span[2] + offset - 1].spacy
-            for i in range(cached.length):
-                orig_idx = doc.c[span[1] + offset + i].idx
-                doc.c[span[1] + offset + i] = cached.data.tokens[i]
-                doc.c[span[1] + offset + i].idx = orig_idx + idx_offset
-                idx_offset += cached.data.tokens[i].lex.length + \
-                    1 if cached.data.tokens[i].spacy else 0
-            doc.c[span[2] + offset - 1].spacy = orig_final_spacy
-            # Token offset for special case spans
-            offset += span_length_diff
-            doc.length += span_length_diff
+        cdef int j
+        cdef int curr_length = doc.length
+        cdef int max_length = 0
+        cdef int offset = 0
+        cdef int span_length_diff = 0
+        cdef bint modify_in_place = True
+        cdef int idx_offset = 0
+        cdef int orig_final_spacy
+        cdef int orig_idx
+        cdef Pool mem = Pool()
+        spans = [doc[match[1]:match[2]] for match in self._special_matcher(doc)]
+        # Skip processing if no matches
+        if len(spans) == 0:
+            return True
+        spans = util.filter_spans(spans)
+        # Put span info in span.start-indexed dict and calculate maximum
+        # intermediate document size
+        span_data = {}
+        for span in spans:
+            rule = self._rules.get(span.text, None)
+            span_length_diff = 0
+            if rule:
+                span_length_diff = len(rule) - (span.end - span.start)
+            if span_length_diff > 0:
+                modify_in_place = False
+            curr_length += span_length_diff
+            if curr_length > max_length:
+                max_length = curr_length
+            span_data[span.start] = (span.text, span.start, span.end, span_length_diff)
+        # Modify tokenization according to filtered special cases
+        # If modifications never increase doc length, can modify in place
+        if modify_in_place:
+            tokens = doc.c
+        # Otherwise create a separate array to store modified tokens
+        else:
+            tokens = <TokenC*>mem.alloc(max_length, sizeof(TokenC))
+        i = 0
+        while i < doc.length:
+            if not i in span_data:
+                tokens[i + offset] = doc.c[i]
+                i += 1
+            else:
+                span = span_data[i]
+                cached = <_Cached*>self._specials.get(hash_string(span[0]))
+                if cached == NULL:
+                    # Copy original tokens if no rule found
+                    for j in range(span[2] - span[1]):
+                        tokens[i + offset + j] = doc.c[i + j]
+                    i += span[2] - span[1]
+                else:
+                    # Copy special case tokens into doc and adjust token and
+                    # character offsets
+                    idx_offset = 0
+                    orig_final_spacy = doc.c[span[2] + offset - 1].spacy
+                    orig_idx = doc.c[i].idx
+                    for j in range(cached.length):
+                        tokens[i + offset + j] = cached.data.tokens[j]
+                        tokens[i + offset + j].idx = orig_idx + idx_offset
+                        idx_offset += cached.data.tokens[j].lex.length + \
+                            1 if cached.data.tokens[j].spacy else 0
+                    tokens[i + offset + cached.length - 1].spacy = orig_final_spacy
+                    i += span[2] - span[1]
+                    offset += span[3]
+        # Allocate more memory for doc if needed
+        while doc.length < doc.length + offset:
+            doc._realloc(doc.length * 2)
+        # If not modified in place, copy tokens back to doc
+        if not modify_in_place:
+            memcpy(doc.c, tokens, max_length * sizeof(TokenC))
+        for i in range(doc.length + offset, doc.length):
+            memset(&doc.c[i], 0, sizeof(TokenC))
+            doc.c[i].lex = &EMPTY_LEXEME
+        doc.length = doc.length + offset
         return True
 
     cdef int _try_cache(self, hash_t key, Doc tokens) except -1:
@@ -287,12 +326,8 @@ cdef class Tokenizer:
         if cached == NULL:
             return False
         cdef int i
-        if cached.is_lex:
-            for i in range(cached.length):
-                tokens.push_back(cached.data.lexemes[i], False)
-        else:
-            for i in range(cached.length):
-                tokens.push_back(&cached.data.tokens[i], False)
+        for i in range(cached.length):
+            tokens.push_back(&cached.data.tokens[i], False)
         has_special[0] = 1
         return True
@@ -521,7 +556,7 @@ cdef class Tokenizer:
         self._rules[string] = substrings
         self._flush_cache()
         if self.find_prefix(string) or self.find_infix(string) or self.find_suffix(string):
-            self._special_matcher.add(string, None, [{ORTH: token.text} for token in self._tokenize_affixes(string, False)])
+            self._special_matcher.add(string, None, self._tokenize_affixes(string, False))
 
     def _reload_special_cases(self):
         try:
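
The final hunk above switches the special-case patterns from `Matcher` token-attribute dicts to pre-tokenized `Doc` objects. A small standalone sketch of the spaCy v2-style `PhraseMatcher.add(key, on_match, *docs)` call used there; the `"New York City"` pattern is purely illustrative, not one of the tokenizer's special cases:

```python
# Standalone illustration of the v2-era PhraseMatcher API used above.
import spacy
from spacy.matcher import PhraseMatcher

nlp = spacy.blank("en")
matcher = PhraseMatcher(nlp.vocab)
# Patterns are pre-tokenized Docs, so matching is an exact token-sequence
# lookup rather than per-token attribute pattern evaluation.
matcher.add("NYC", None, nlp("New York City"))

doc = nlp("She moved to New York City last year .")
for match_id, start, end in matcher(doc):
    print(doc[start:end].text, start, end)  # New York City 3 6
```

Because the patterns are exact token sequences, each special case becomes a single phrase lookup, which is what makes the `PhraseMatcher` a better fit here than the general-purpose `Matcher`.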