spaCy/spacy/tokenizer.pyx
Matthew Honnibal 4b2e5e59ed Add flush_cache method to tokenizer, to fix #1061
The tokenizer caches output for common chunks, for efficiency. This
cache must be invalidated when the tokenizer rules change, e.g. when a new
special-case rule is introduced. That's what was causing #1061.

When the cache is flushed, we free the intermediate token chunks.
I *think* this is safe --- but if we start getting segfaults, this patch
is to blame. The resolution would be to simply not free those bits of
memory. They'll be freed when the tokenizer exits anyway.
2017-07-22 15:06:50 +02:00

430 lines
17 KiB
Cython

# cython: embedsignature=True
# coding: utf8
from __future__ import unicode_literals
import ujson
from cython.operator cimport dereference as deref
from cython.operator cimport preincrement as preinc
from cymem.cymem cimport Pool
from preshed.maps cimport PreshMap
from .strings cimport hash_string
cimport cython
from . import util
from .tokens.doc cimport Doc
cdef class Tokenizer:
"""
Segment text, and create Doc objects with the discovered segment boundaries.
"""
@classmethod
def load(cls, path, Vocab vocab, rules=None, prefix_search=None, suffix_search=None,
         infix_finditer=None, token_match=None):
    """
    Load a Tokenizer, reading unsupplied components from the path.

    Any of `rules`, `prefix_search`, `suffix_search` and `infix_finditer`
    that is not supplied is read from the `tokenizer` sub-directory of
    `path`. For the three regex components, passing `True` has the same
    effect as passing `None`: the component is loaded from disk.

    Arguments:
        path (Path):
            The path to load from.
        vocab (Vocab):
            A storage container for lexical types.
        rules (dict):
            Exceptions and special-cases for the tokenizer. Read from
            `tokenizer/specials.json` when None.
        token_match:
            A boolean function matching strings that becomes tokens.
        prefix_search:
            Signature of re.compile(string).search. Built from
            `tokenizer/prefix.txt` when None or True.
        suffix_search:
            Signature of re.compile(string).search. Built from
            `tokenizer/suffix.txt` when None or True.
        infix_finditer:
            Signature of re.compile(string).finditer. Built from
            `tokenizer/infix.txt` when None or True.
    Returns Tokenizer
    """
    path = util.ensure_path(path)
    if rules is None:
        with (path / 'tokenizer' / 'specials.json').open('r', encoding='utf8') as file_:
            rules = ujson.load(file_)
    # The affix files are read as utf8 explicitly, like specials.json above,
    # so that the result does not depend on the platform's default encoding.
    if prefix_search in (None, True):
        with (path / 'tokenizer' / 'prefix.txt').open('r', encoding='utf8') as file_:
            entries = file_.read().split('\n')
        prefix_search = util.compile_prefix_regex(entries).search
    if suffix_search in (None, True):
        with (path / 'tokenizer' / 'suffix.txt').open('r', encoding='utf8') as file_:
            entries = file_.read().split('\n')
        suffix_search = util.compile_suffix_regex(entries).search
    if infix_finditer in (None, True):
        with (path / 'tokenizer' / 'infix.txt').open('r', encoding='utf8') as file_:
            entries = file_.read().split('\n')
        infix_finditer = util.compile_infix_regex(entries).finditer
    return cls(vocab, rules, prefix_search, suffix_search, infix_finditer, token_match)
def __init__(self, Vocab vocab, rules, prefix_search, suffix_search, infix_finditer, token_match=None):
    """
    Set up a Tokenizer that produces Doc objects from unicode text.

    Arguments:
        vocab (Vocab):
            A storage container for lexical types.
        rules (dict):
            Exceptions and special-cases for the tokenizer, keyed by the
            string they apply to.
        prefix_search:
            A function matching the signature of re.compile(string).search
            to match prefixes.
        suffix_search:
            A function matching the signature of re.compile(string).search
            to match suffixes.
        infix_finditer:
            A function matching the signature of re.compile(string).finditer
            to find infixes.
        token_match:
            A boolean function matching strings that becomes tokens.
    """
    # Shared resources and the two lookup tables (general cache, special cases).
    self.vocab = vocab
    self.mem = Pool()
    self._cache = PreshMap()
    self._specials = PreshMap()
    self._rules = {}
    # User-supplied matching callbacks.
    self.prefix_search = prefix_search
    self.suffix_search = suffix_search
    self.infix_finditer = infix_finditer
    self.token_match = token_match
    # Register every special-case rule, in sorted key order.
    self._load_special_tokenization(rules)
def __reduce__(self):
    """
    Support pickling by describing how to rebuild the tokenizer.

    The argument tuple mirrors the positional signature of `__init__`:
    (vocab, rules, prefix_search, suffix_search, infix_finditer, token_match).

    Returns (tuple): The class, its constructor arguments, and no extra state.
    """
    # Use the attributes that __init__ actually stores. The previous
    # `_prefix_re` / `_suffix_re` / `_infix_re` names are never assigned
    # anywhere in this class.
    args = (self.vocab,
            self._rules,
            self.prefix_search,
            self.suffix_search,
            self.infix_finditer,
            self.token_match)
    return (self.__class__, args, None, None)
cpdef Doc tokens_from_list(self, list strings):
    """
    Build a Doc directly from a list of pre-split words.

    No tokenization rules are applied; this is the same as calling
    Doc(tokenizer.vocab, words=strings).

    Arguments:
        strings (list): The words of the document, in order.
    Returns:
        Doc A document whose tokens are the given words.
    """
    cdef Doc doc = Doc(self.vocab, words=strings)
    return doc
@cython.boundscheck(False)
def __call__(self, unicode string):
"""
Tokenize a string.
The string is scanned once and cut into alternating runs of whitespace
and non-whitespace characters. Each run is looked up in the cache and,
on a miss, passed to the full tokenization routine.
Arguments:
string (unicode): The string to tokenize.
Returns:
Doc A container for linguistic annotations.
Raises:
ValueError: If the string has 2**30 or more characters.
"""
if len(string) >= (2 ** 30):
raise ValueError(
"String is too long: %d characters. Max is 2**30." % len(string)
)
cdef int length = len(string)
cdef Doc tokens = Doc(self.vocab)
# An empty string yields an empty Doc; this also protects string[0] below.
if length == 0:
return tokens
# i is the index of the character being scanned, start is the index at
# which the current run began.
cdef int i = 0
cdef int start = 0
cdef bint cache_hit
# True while the scanner is inside a run of whitespace characters.
cdef bint in_ws = string[0].isspace()
cdef unicode span
# The task here is much like string.split, but not quite
# We find spans of whitespace and non-space characters, and ignore
# spans that are exactly ' '. So, our sequences will all be separated
# by either ' ' or nothing.
for uc in string:
# A change in "is whitespace" marks the boundary between two runs.
if uc.isspace() != in_ws:
if start < i:
# When we want to make this fast, get the data buffer once
# with PyUnicode_AS_DATA, and then maintain a start_byte
# and end_byte, so we can call hash64 directly. That way
# we don't have to create the slice when we hit the cache.
span = string[start:i]
key = hash_string(span)
cache_hit = self._try_cache(key, tokens)
if not cache_hit:
self._tokenize(tokens, span, key)
# A plain space that opens a whitespace run is recorded as the
# trailing-space flag of the previous token and then skipped, so
# a single ' ' never becomes a token of its own.
if uc == ' ':
tokens.c[tokens.length - 1].spacy = True
start = i + 1
else:
start = i
in_ws = not in_ws
i += 1
# Flush the final run, which the loop leaves unprocessed.
if start < i:
span = string[start:]
key = hash_string(span)
cache_hit = self._try_cache(key, tokens)
if not cache_hit:
self._tokenize(tokens, span, key)
# The last token is only flagged as followed by a space when the string
# ends in ' ' and the final run was not itself whitespace.
tokens.c[tokens.length - 1].spacy = string[-1] == ' ' and not in_ws
return tokens
def pipe(self, texts, batch_size=1000, n_threads=2):
    """
    Lazily tokenize each text from an iterable.

    Arguments:
        texts: A sequence of unicode texts.
        batch_size (int):
            The number of texts to accumulate in an internal buffer.
            Not used by this implementation.
        n_threads (int):
            The number of threads to use, if the implementation supports
            multi-threading. The default tokenizer is single-threaded, so
            this value is not used here.
    Yields:
        Doc A sequence of Doc objects, in order.
    """
    for raw_text in texts:
        doc = self(raw_text)
        yield doc
cdef int _try_cache(self, hash_t key, Doc tokens) except -1:
    # Append the stored tokenization for `key` to `tokens`, if one exists.
    # The general cache is consulted first, then the special-case table.
    # Returns True when tokens were appended and False on a miss.
    cdef int idx
    cdef _Cached* entry = <_Cached*>self._cache.get(key)
    if entry == NULL:
        # See 'flush_cache' below for hand-wringing about
        # how to handle this.
        entry = <_Cached*>self._specials.get(key)
        if entry == NULL:
            return False
        # Copy the special-case pointer into the cache so that the next
        # lookup for this key hits on the first try.
        self._cache.set(key, entry)
    if entry.is_lex:
        # The entry stores an array of lexeme pointers.
        for idx in range(entry.length):
            tokens.push_back(entry.data.lexemes[idx], False)
        return True
    # Otherwise the entry stores an array of token structs.
    for idx in range(entry.length):
        tokens.push_back(&entry.data.tokens[idx], False)
    return True
cdef int _tokenize(self, Doc tokens, unicode span, hash_t orig_key) except -1:
# Tokenize one chunk (`span`, whose hash is `orig_key`) and append the
# result to `tokens`. The caller has already had a cache miss for the key.
cdef vector[LexemeC*] prefixes
cdef vector[LexemeC*] suffixes
cdef int orig_size
# Remember where this chunk's tokens start so that they can be cached below.
orig_size = tokens.length
special_case = <const _Cached*>self._specials.get(orig_key)
if special_case is not NULL:
# The whole chunk is a special case: emit its predefined tokens.
for i in range(special_case.length):
tokens.push_back(&special_case.data.tokens[i], False)
# Store the same pointer in the cache; it stays shared with _specials.
self._cache.set(orig_key, <void*>special_case)
else:
# Strip prefixes and suffixes, emit everything in order, then cache the
# tokens that were appended for this chunk.
span = self._split_affixes(tokens.mem, span, &prefixes, &suffixes)
self._attach_tokens(tokens, span, &prefixes, &suffixes)
self._save_cached(&tokens.c[orig_size], orig_key, tokens.length - orig_size)
cdef unicode _split_affixes(self, Pool mem, unicode string,
vector[const LexemeC*] *prefixes,
vector[const LexemeC*] *suffixes):
# Repeatedly strip one prefix and/or one suffix from `string`, appending
# the stripped pieces (as lexemes) to `prefixes` and `suffixes`.
# Returns what remains of the string once stripping stops.
cdef size_t i
cdef unicode prefix
cdef unicode suffix
cdef unicode minus_pre
cdef unicode minus_suf
cdef size_t last_size = 0
# Keep going until the string is empty or an iteration removed nothing.
while string and len(string) != last_size:
# A string accepted by token_match is kept whole.
if self.token_match and self.token_match(string):
break
last_size = len(string)
pre_len = self.find_prefix(string)
if pre_len != 0:
prefix = string[:pre_len]
minus_pre = string[pre_len:]
# Check whether we've hit a special-case
if minus_pre and self._specials.get(hash_string(minus_pre)) != NULL:
string = minus_pre
prefixes.push_back(self.vocab.get(mem, prefix))
break
if self.token_match and self.token_match(string):
break
suf_len = self.find_suffix(string)
if suf_len != 0:
suffix = string[-suf_len:]
minus_suf = string[:-suf_len]
# Check whether we've hit a special-case
if minus_suf and (self._specials.get(hash_string(minus_suf)) != NULL):
string = minus_suf
suffixes.push_back(self.vocab.get(mem, suffix))
break
# Strip both ends at once only when the two matches do not overlap;
# otherwise prefer the prefix, then the suffix.
if pre_len and suf_len and (pre_len + suf_len) <= len(string):
string = string[pre_len:-suf_len]
prefixes.push_back(self.vocab.get(mem, prefix))
suffixes.push_back(self.vocab.get(mem, suffix))
elif pre_len:
string = minus_pre
prefixes.push_back(self.vocab.get(mem, prefix))
elif suf_len:
string = minus_suf
suffixes.push_back(self.vocab.get(mem, suffix))
# Stop as soon as the remainder is itself a special case.
if string and (self._specials.get(hash_string(string)) != NULL):
break
return string
cdef int _attach_tokens(self, Doc tokens, unicode string,
vector[const LexemeC*] *prefixes,
vector[const LexemeC*] *suffixes) except -1:
# Append one chunk's tokens to `tokens` in surface order: the prefixes,
# then the tokens of the remaining `string`, then the suffixes.
cdef bint cache_hit
cdef int split, end
cdef const LexemeC* const* lexemes
cdef const LexemeC* lexeme
cdef unicode span
cdef int i
# Prefixes are pushed in the order in which they were collected.
if prefixes.size():
for i in range(prefixes.size()):
tokens.push_back(prefixes[0][i], False)
if string:
# On a hit, the cache lookup has already appended the tokens.
cache_hit = self._try_cache(hash_string(string), tokens)
if cache_hit:
pass
elif self.token_match and self.token_match(string):
# We're always saying 'no' to spaces here -- the caller will
# fix up the outermost one, with reference to the original.
# See Issue #859
tokens.push_back(self.vocab.get(tokens.mem, string), False)
else:
matches = self.find_infix(string)
if not matches:
# No infixes: the remainder is a single token.
tokens.push_back(self.vocab.get(tokens.mem, string), False)
else:
# let's say we have dyn-o-mite-dave
# the regex finds the start and end positions of the hyphens
start = 0
for match in matches:
infix_start = match.start()
infix_end = match.end()
# A match that begins where the current piece begins is skipped,
# so no empty piece is emitted before it.
if infix_start == start:
continue
span = string[start:infix_start]
tokens.push_back(self.vocab.get(tokens.mem, span), False)
if infix_start != infix_end:
# If infix_start != infix_end, it means the infix
# token is non-empty. Empty infix tokens are useful
# for tokenization in some languages (see
# https://github.com/explosion/spaCy/issues/768)
infix_span = string[infix_start:infix_end]
tokens.push_back(self.vocab.get(tokens.mem, infix_span), False)
start = infix_end
# NOTE(review): if the last infix match ends at the end of the string,
# `span` is empty here and appears to be pushed as a token anyway --
# confirm whether that is intended.
span = string[start:]
tokens.push_back(self.vocab.get(tokens.mem, span), False)
# Suffixes are pushed in reverse of the order in which they were collected.
cdef vector[const LexemeC*].reverse_iterator it = suffixes.rbegin()
while it != suffixes.rend():
lexeme = deref(it)
preinc(it)
tokens.push_back(lexeme, False)
cdef int _save_cached(self, const TokenC* tokens, hash_t key, int n) except -1:
    # Store the lexemes of `n` consecutive tokens in the cache under `key`.
    # Nothing is stored if any token's lexeme has id 0.
    # NOTE(review): id 0 presumably marks lexemes that are not owned by the
    # vocab and so cannot safely outlive the Doc -- confirm against Vocab.get.
    # Returns 0 on success or when the entry is skipped.
    cdef int i
    for i in range(n):
        if tokens[i].lex.id == 0:
            return 0
    cached = <_Cached*>self.mem.alloc(1, sizeof(_Cached))
    cached.length = n
    cached.is_lex = True
    # The array holds `n` elements of type LexemeC*, so that is the element
    # size to request (the previous sizeof(LexemeC**) named the wrong level).
    lexemes = <const LexemeC**>self.mem.alloc(n, sizeof(LexemeC*))
    for i in range(n):
        lexemes[i] = tokens[i].lex
    cached.data.lexemes = <const LexemeC* const*>lexemes
    self._cache.set(key, cached)
    return 0
def find_infix(self, unicode string):
    """
    Find internal split points of the string, such as hyphens.

    Arguments:
        string (unicode): The string to segment.
    Returns List[re.MatchObject]
        A list of objects that have .start() and .end() methods, denoting the
        placement of internal segment separators, e.g. hyphens. The list is
        empty when no infix function is configured or nothing matches.
    """
    if self.infix_finditer is None:
        # Return an empty list rather than 0, so that the return type is
        # always the documented one. Both values are falsy.
        return []
    return list(self.infix_finditer(string))
def find_prefix(self, unicode string):
    """
    Measure the prefix that should be segmented from the string.

    Arguments:
        string (unicode): The string to segment.
    Returns (int): The length of the matched prefix, or 0 if no prefix
        function is configured or no prefix rule matches.
    """
    search = self.prefix_search
    if search is None:
        return 0
    found = search(string)
    if found is None:
        return 0
    return found.end() - found.start()
def find_suffix(self, unicode string):
    """
    Measure the suffix that should be segmented from the string.

    Arguments:
        string (unicode): The string to segment.
    Returns (int): The length of the matched suffix, or 0 if no suffix
        function is configured or no suffix rule matches.
    """
    search = self.suffix_search
    if search is None:
        return 0
    found = search(string)
    if found is None:
        return 0
    return found.end() - found.start()
def _load_special_tokenization(self, special_cases):
    """
    Register a whole dict of special-case tokenization rules.

    Rules are added in sorted key order, one `add_special_case` call each.

    Arguments:
        special_cases (dict): Maps each string to its token descriptions.
    """
    ordered_rules = sorted(special_cases.items())
    for text, token_attrs in ordered_rules:
        self.add_special_case(text, token_attrs)
def add_special_case(self, unicode string, substrings):
    """
    Register a special-case tokenization rule for one string.

    Arguments:
        string (unicode): The string to specially tokenize.
        substrings:
            A sequence of dicts, where each dict describes a token and its
            attributes. The ORTH fields of the attributes must exactly match
            the string when they are concatenated.
    Returns None
    """
    substrings = list(substrings)
    # Build the entry: a token array rather than a lexeme array.
    entry = <_Cached*>self.mem.alloc(1, sizeof(_Cached))
    entry.is_lex = False
    entry.length = len(substrings)
    entry.data.tokens = self.vocab.make_fused_token(substrings)
    # Record the rule in the lookup table and in the plain-Python rule dict.
    self._specials.set(hash_string(string), entry)
    self._rules[string] = substrings
    # Cached tokenizations computed under the old rules may now be stale.
    self.flush_cache()
def flush_cache(self):
    '''Flush the tokenizer's cache. May not free memory immediately.

    This is called automatically after `add_special_case`, but if you
    write to the prefix or suffix functions, you'll have to call this
    yourself. You may also need to flush the tokenizer cache after
    changing the lex_attr_getter functions.

    Entries whose key belongs to a special-case rule are left allocated,
    because their data is shared with the special-case table. Every other
    entry is released together with its array of lexeme pointers.
    '''
    cdef hash_t key
    for key in self._cache.keys():
        # Don't free data shared with special-case rules
        if self._specials.get(key) is not NULL:
            continue
        cached = <_Cached*>self._cache.get(key)
        if cached is NULL:
            continue
        # The lexeme-pointer array is a separate allocation made in
        # _save_cached and referenced only by this entry, so release it
        # too. The lexemes it points at are not touched.
        if cached.is_lex and cached.data.lexemes is not NULL:
            self.mem.free(<void*>cached.data.lexemes)
        self.mem.free(cached)
    self._cache = PreshMap(1000)
    # We could here readd the data from specials --- but if we loop over
    # a bunch of special-cases, we'll get a quadratic behaviour. The extra
    # lookup isn't so bad? Tough to tell.