Mirror of https://github.com/explosion/spaCy.git
Improve efficiency of special cases handling
* Use PhraseMatcher instead of Matcher
* Improve efficiency of merging/splitting special cases in document
* Process merge/splits in one pass without repeated token shifting
* Merge in place if no splits
This commit is contained in:
parent e74963acd4
commit d3990d080c
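The heart of the change is batching special-case lookup through a PhraseMatcher and pruning overlapping hits with util.filter_spans before retokenizing. Below is a minimal sketch of that matching step using the public spaCy v2 API rather than the internal Cython code (the "SPECIAL" key and the example phrases are illustrative, not taken from the commit):

    from spacy.lang.en import English
    from spacy.matcher import PhraseMatcher
    from spacy.util import filter_spans

    nlp = English()
    # PhraseMatcher patterns are plain Doc objects, so a whole batch of
    # special-case strings can be matched against a tokenized Doc in one pass.
    matcher = PhraseMatcher(nlp.vocab)
    matcher.add("SPECIAL", None, nlp("New York"), nlp("New York City"))

    doc = nlp("She moved to New York City last year.")
    spans = [doc[start:end] for _, start, end in matcher(doc)]
    # Overlapping candidates are reduced to a non-overlapping set, longest
    # first, mirroring the util.filter_spans call in the diff below.
    spans = filter_spans(spans)
    print([(span.text, span.start, span.end) for span in spans])
    # [('New York City', 3, 6)]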
spacy/tokenizer.pxd

@@ -23,6 +23,7 @@ cdef class Tokenizer:
     cdef object _rules
     cdef object _special_matcher
     cdef int _property_init_count
+    cdef int _property_init_max

     cpdef Doc tokens_from_list(self, list strings)

spacy/tokenizer.pyx

@@ -5,6 +5,7 @@ from __future__ import unicode_literals

 from cython.operator cimport dereference as deref
 from cython.operator cimport preincrement as preinc
+from libc.string cimport memcpy, memset
 from cymem.cymem cimport Pool
 from preshed.maps cimport PreshMap
 cimport cython
@@ -20,7 +21,8 @@ from .errors import Errors, Warnings, deprecation_warning
 from . import util

 from .attrs import intify_attrs
-from .matcher import Matcher
+from .lexeme cimport EMPTY_LEXEME
+from .matcher import PhraseMatcher
 from .symbols import ORTH

 cdef class Tokenizer:
@@ -60,9 +62,10 @@ cdef class Tokenizer:
         self.infix_finditer = infix_finditer
         self.vocab = vocab
         self._rules = {}
-        self._special_matcher = Matcher(self.vocab)
+        self._special_matcher = PhraseMatcher(self.vocab)
         self._load_special_cases(rules)
         self._property_init_count = 0
+        self._property_init_max = 4

     property token_match:
         def __get__(self):
@@ -71,7 +74,8 @@ cdef class Tokenizer:
         def __set__(self, token_match):
             self._token_match = token_match
             self._reload_special_cases()
-            self._property_init_count += 1
+            if self._property_init_count <= self._property_init_max:
+                self._property_init_count += 1

     property prefix_search:
         def __get__(self):
@@ -80,7 +84,8 @@ cdef class Tokenizer:
         def __set__(self, prefix_search):
             self._prefix_search = prefix_search
             self._reload_special_cases()
-            self._property_init_count += 1
+            if self._property_init_count <= self._property_init_max:
+                self._property_init_count += 1

     property suffix_search:
         def __get__(self):
@@ -89,7 +94,9 @@ cdef class Tokenizer:
         def __set__(self, suffix_search):
             self._suffix_search = suffix_search
             self._reload_special_cases()
-            self._property_init_count += 1
+            if self._property_init_count <= self._property_init_max:
+                self._property_init_count += 1
+

     property infix_finditer:
         def __get__(self):
@@ -98,7 +105,8 @@ cdef class Tokenizer:
         def __set__(self, infix_finditer):
             self._infix_finditer = infix_finditer
             self._reload_special_cases()
-            self._property_init_count += 1
+            if self._property_init_count <= self._property_init_max:
+                self._property_init_count += 1

     def __reduce__(self):
         args = (self.vocab,
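For context on the four guarded setters above: they are normally assigned when a user installs custom affix rules after construction, and each assignment re-runs _reload_special_cases(). A usage sketch against the public API (the extra suffix pattern is made up for illustration and is not code from the commit):

    from spacy.lang.en import English
    from spacy.util import compile_suffix_regex

    nlp = English()
    # Assigning the property goes through the __set__ shown above: special
    # cases are re-applied, and with this change _property_init_count is only
    # incremented while it is still within _property_init_max.
    suffixes = nlp.Defaults.suffixes + (r"-+$",)
    nlp.tokenizer.suffix_search = compile_suffix_regex(suffixes).search
    print([t.text for t in nlp("hello---")])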
@@ -225,48 +233,79 @@ cdef class Tokenizer:
         doc (Doc): Document.
         """
         cdef int i
-        # Find all special cases and filter overlapping matches
-        spans = [doc[match[1]:match[2]] for match in self._special_matcher(doc)]
-        spans = util.filter_spans(spans)
-        spans = [(span.text, span.start, span.end) for span in spans]
-        # Modify tokenization according to filtered special cases
+        cdef int j
+        cdef int curr_length = doc.length
+        cdef int max_length = 0
         cdef int offset = 0
         cdef int span_length_diff = 0
+        cdef bint modify_in_place = True
         cdef int idx_offset = 0
+        cdef int orig_final_spacy
+        cdef int orig_idx
+        cdef Pool mem = Pool()
+        spans = [doc[match[1]:match[2]] for match in self._special_matcher(doc)]
+        # Skip processing if no matches
+        if len(spans) == 0:
+            return True
+        spans = util.filter_spans(spans)
+        # Put span info in span.start-indexed dict and calculate maximum
+        # intermediate document size
+        span_data = {}
         for span in spans:
-            if not span[0] in self._rules:
-                continue
-            # Allocate more memory for doc if needed
-            span_length_diff = len(self._rules[span[0]]) - (span[2] - span[1])
-            while doc.length + offset + span_length_diff >= doc.max_length:
-                doc._realloc(doc.length * 2)
-            # Find special case entries in cache
-            cached = <_Cached*>self._specials.get(hash_string(span[0]))
-            if cached == NULL:
-                continue
-            # Shift original tokens...
-            # ...from span position to end if new span is shorter
-            if span_length_diff < 0:
-                for i in range(span[2] + offset, doc.length):
-                    doc.c[span_length_diff + i] = doc.c[i]
-            # ...from end to span position if new span is longer
-            elif span_length_diff > 0:
-                for i in range(doc.length - 1, span[2] + offset - 1, -1):
-                    doc.c[span_length_diff + i] = doc.c[i]
-            # Copy special case tokens into doc and adjust token and character
-            # offsets
-            idx_offset = 0
-            orig_final_spacy = doc.c[span[2] + offset - 1].spacy
-            for i in range(cached.length):
-                orig_idx = doc.c[span[1] + offset + i].idx
-                doc.c[span[1] + offset + i] = cached.data.tokens[i]
-                doc.c[span[1] + offset + i].idx = orig_idx + idx_offset
-                idx_offset += cached.data.tokens[i].lex.length + \
-                        1 if cached.data.tokens[i].spacy else 0
-            doc.c[span[2] + offset + - 1].spacy = orig_final_spacy
-            # Token offset for special case spans
-            offset += span_length_diff
-            doc.length += span_length_diff
+            rule = self._rules.get(span.text, None)
+            span_length_diff = 0
+            if rule:
+                span_length_diff = len(rule) - (span.end - span.start)
+            if span_length_diff > 0:
+                modify_in_place = False
+            curr_length += span_length_diff
+            if curr_length > max_length:
+                max_length = curr_length
+            span_data[span.start] = (span.text, span.start, span.end, span_length_diff)
+        # Modify tokenization according to filtered special cases
+        # If modifications never increase doc length, can modify in place
+        if modify_in_place:
+            tokens = doc.c
+        # Otherwise create a separate array to store modified tokens
+        else:
+            tokens = <TokenC*>mem.alloc(max_length, sizeof(TokenC))
+        i = 0
+        while i < doc.length:
+            if not i in span_data:
+                tokens[i + offset] = doc.c[i]
+                i += 1
+            else:
+                span = span_data[i]
+                cached = <_Cached*>self._specials.get(hash_string(span[0]))
+                if cached == NULL:
+                    # Copy original tokens if no rule found
+                    for j in range(span[2] - span[1]):
+                        tokens[i + offset + j] = doc.c[i + j]
+                    i += span[2] - span[1]
+                else:
+                    # Copy special case tokens into doc and adjust token and
+                    # character offsets
+                    idx_offset = 0
+                    orig_final_spacy = doc.c[span[2] + offset - 1].spacy
+                    orig_idx = doc.c[i].idx
+                    for j in range(cached.length):
+                        tokens[i + offset + j] = cached.data.tokens[j]
+                        tokens[i + offset + j].idx = orig_idx + idx_offset
+                        idx_offset += cached.data.tokens[j].lex.length + \
+                                1 if cached.data.tokens[j].spacy else 0
+                    tokens[i + offset + cached.length - 1].spacy = orig_final_spacy
+                    i += span[2] - span[1]
+                    offset += span[3]
+        # Allocate more memory for doc if needed
+        while doc.length < doc.length + offset:
+            doc._realloc(doc.length * 2)
+        # If not modified in place, copy tokens back to doc
+        if not modify_in_place:
+            memcpy(doc.c, tokens, max_length * sizeof(TokenC))
+        for i in range(doc.length + offset, doc.length):
+            memset(&doc.c[i], 0, sizeof(TokenC))
+            doc.c[i].lex = &EMPTY_LEXEME
+        doc.length = doc.length + offset
         return True

     cdef int _try_cache(self, hash_t key, Doc tokens) except -1:
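To make the control flow of the rewritten pass easier to follow, here is a plain-Python analogue (a hypothetical helper over lists of strings, not spaCy code): matched spans are indexed by their start position, and the document is copied left to right exactly once, so tokens are never shifted repeatedly.

    def apply_special_cases(tokens, spans, rules):
        # spans: non-overlapping (start, end) pairs, as produced by filter_spans
        # rules: surface string of a span -> replacement token strings
        span_data = {}
        for start, end in spans:
            text = " ".join(tokens[start:end])  # stand-in for span.text
            if text in rules:
                span_data[start] = (end, rules[text])
        out = []
        i = 0
        while i < len(tokens):
            if i in span_data:
                end, replacement = span_data[i]
                out.extend(replacement)   # copy the special-case tokens
                i = end                   # jump over the original span
            else:
                out.append(tokens[i])     # copy unchanged tokens once
                i += 1
        return out

    print(apply_special_cases(["gim", "me", "!"], [(0, 2)], {"gim me": ["gimme"]}))
    # ['gimme', '!']

The Cython version adds one refinement on top of this: when no rule makes a span longer, the rewrite targets doc.c directly instead of a scratch buffer, which is the "merge in place if no splits" point from the commit message.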
@@ -287,12 +326,8 @@ cdef class Tokenizer:
         if cached == NULL:
             return False
         cdef int i
-        if cached.is_lex:
-            for i in range(cached.length):
-                tokens.push_back(cached.data.lexemes[i], False)
-        else:
-            for i in range(cached.length):
-                tokens.push_back(&cached.data.tokens[i], False)
+        for i in range(cached.length):
+            tokens.push_back(&cached.data.tokens[i], False)
         has_special[0] = 1
         return True

@@ -521,7 +556,7 @@ cdef class Tokenizer:
         self._rules[string] = substrings
         self._flush_cache()
         if self.find_prefix(string) or self.find_infix(string) or self.find_suffix(string):
-            self._special_matcher.add(string, None, [{ORTH: token.text} for token in self._tokenize_affixes(string, False)])
+            self._special_matcher.add(string, None, self._tokenize_affixes(string, False))

     def _reload_special_cases(self):
         try:
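The hunk above feeds the PhraseMatcher from add_special_case, which is part of the public tokenizer API. A small usage sketch (the "gimme" rule is the standard documentation example, not something added by this commit):

    from spacy.lang.en import English
    from spacy.symbols import ORTH

    nlp = English()
    # The rule is stored in _rules, the cache is flushed, and, if the string
    # would otherwise be split by affix rules, it is also registered with the
    # PhraseMatcher so the retokenization pass can find it.
    nlp.tokenizer.add_special_case("gimme", [{ORTH: "gim"}, {ORTH: "me"}])
    print([t.text for t in nlp("gimme that")])  # ['gim', 'me', 'that']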