diff --git a/spacy/tokenizer.pxd b/spacy/tokenizer.pxd
index 1a3e86b49..919b0928b 100644
--- a/spacy/tokenizer.pxd
+++ b/spacy/tokenizer.pxd
@@ -27,8 +27,9 @@ cdef class Tokenizer:
     cdef int _try_cache(self, hash_t key, Doc tokens) except -1
     cdef int _tokenize(self, Doc tokens, unicode span, hash_t key) except -1
     cdef unicode _split_affixes(self, Pool mem, unicode string, vector[LexemeC*] *prefixes,
-                                vector[LexemeC*] *suffixes)
+                                vector[LexemeC*] *suffixes, int* has_special)
     cdef int _attach_tokens(self, Doc tokens, unicode string,
                             vector[LexemeC*] *prefixes, vector[LexemeC*] *suffixes) except -1
-    cdef int _save_cached(self, const TokenC* tokens, hash_t key, int n) except -1
+    cdef int _save_cached(self, const TokenC* tokens, hash_t key, int has_special,
+                          int n) except -1
 
diff --git a/spacy/tokenizer.pyx b/spacy/tokenizer.pyx
index 692357c8a..bc09129de 100644
--- a/spacy/tokenizer.pyx
+++ b/spacy/tokenizer.pyx
@@ -20,7 +20,8 @@ cdef class Tokenizer:
     """Segment text, and create Doc objects with the discovered segment
     boundaries.
     """
-    def __init__(self, Vocab vocab, rules, prefix_search, suffix_search, infix_finditer, token_match=None):
+    def __init__(self, Vocab vocab, rules=None, prefix_search=None,
+                 suffix_search=None, infix_finditer=None, token_match=None):
         """Create a `Tokenizer`, to create `Doc` objects given unicode text.
 
         vocab (Vocab): A storage container for lexical types.
@@ -48,8 +49,9 @@ cdef class Tokenizer:
         self.infix_finditer = infix_finditer
         self.vocab = vocab
         self._rules = {}
-        for chunk, substrings in sorted(rules.items()):
-            self.add_special_case(chunk, substrings)
+        if rules is not None:
+            for chunk, substrings in sorted(rules.items()):
+                self.add_special_case(chunk, substrings)
 
     def __reduce__(self):
         args = (self.vocab,
@@ -148,14 +150,18 @@ cdef class Tokenizer:
         cdef vector[LexemeC*] prefixes
         cdef vector[LexemeC*] suffixes
         cdef int orig_size
+        cdef int has_special
         orig_size = tokens.length
-        span = self._split_affixes(tokens.mem, span, &prefixes, &suffixes)
+        span = self._split_affixes(tokens.mem, span, &prefixes, &suffixes,
+                                   &has_special)
         self._attach_tokens(tokens, span, &prefixes, &suffixes)
-        self._save_cached(&tokens.c[orig_size], orig_key, tokens.length - orig_size)
+        self._save_cached(&tokens.c[orig_size], orig_key, has_special,
+                          tokens.length - orig_size)
 
     cdef unicode _split_affixes(self, Pool mem, unicode string,
                                 vector[const LexemeC*] *prefixes,
-                                vector[const LexemeC*] *suffixes):
+                                vector[const LexemeC*] *suffixes,
+                                int* has_special):
         cdef size_t i
         cdef unicode prefix
         cdef unicode suffix
@@ -174,6 +180,7 @@ cdef class Tokenizer:
                 if minus_pre and self._specials.get(hash_string(minus_pre)) != NULL:
                     string = minus_pre
                     prefixes.push_back(self.vocab.get(mem, prefix))
+                    has_special[0] = 1
                     break
             if self.token_match and self.token_match(string):
                 break
@@ -185,6 +192,7 @@ cdef class Tokenizer:
                 if minus_suf and (self._specials.get(hash_string(minus_suf)) != NULL):
                     string = minus_suf
                     suffixes.push_back(self.vocab.get(mem, suffix))
+                    has_special[0] = 1
                     break
             if pre_len and suf_len and (pre_len + suf_len) <= len(string):
                 string = string[pre_len:-suf_len]
@@ -197,6 +205,7 @@ cdef class Tokenizer:
                 string = minus_suf
                 suffixes.push_back(self.vocab.get(mem, suffix))
             if string and (self._specials.get(hash_string(string)) != NULL):
+                has_special[0] = 1
                 break
         return string
 
@@ -256,11 +265,15 @@ cdef class Tokenizer:
                 preinc(it)
                 tokens.push_back(lexeme, False)
 
-    cdef int _save_cached(self, const TokenC* tokens, hash_t key, int n) except -1:
+    cdef int _save_cached(self, const TokenC* tokens, hash_t key,
+                          int has_special, int n) except -1:
         cdef int i
         for i in range(n):
             if tokens[i].lex.id == 0:
                 return 0
+        # See https://github.com/explosion/spaCy/issues/1250
+        if has_special:
+            return 0
         cached = <_Cached*>self.mem.alloc(1, sizeof(_Cached))
         cached.length = n
         cached.is_lex = True
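
Taken together, the patch makes two visible changes: the Tokenizer constructor's rules, prefix_search, suffix_search and infix_finditer arguments become optional, and a has_special flag is threaded from _split_affixes into _save_cached so that any chunk that resolved through a special case is never written to the tokenizer cache (see the issue #1250 link in the diff). The snippet below is not part of the patch; it is a minimal sketch, with hypothetical affix patterns and an illustrative special case, assuming a spaCy build that includes this change.

    import re

    from spacy.attrs import ORTH
    from spacy.vocab import Vocab
    from spacy.tokenizer import Tokenizer

    # Hypothetical, minimal affix patterns; real pipelines use the language's own.
    prefix_re = re.compile(r'''^[\["'(]''')
    suffix_re = re.compile(r'''[\]"').,!?]$''')
    infix_re = re.compile(r'''[-~]''')

    vocab = Vocab()
    # After this patch, rules (and any of the other arguments) may simply be omitted.
    tokenizer = Tokenizer(vocab,
                          prefix_search=prefix_re.search,
                          suffix_search=suffix_re.search,
                          infix_finditer=infix_re.finditer)

    # A special case added after construction. When "don't!" is tokenized, the
    # suffix "!" is split off, the remainder hits the special case, and the new
    # has_special flag stops _save_cached from caching the chunk (issue #1250).
    tokenizer.add_special_case("don't", [{ORTH: "do"}, {ORTH: "n't"}])

    print([t.text for t in tokenizer("don't!")])   # ['do', "n't", '!']
    print([t.text for t in tokenizer("don't!")])   # same tokens on the second pass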