diff --git a/spacy/tests/tokenizer/test_infix.py b/spacy/tests/tokenizer/test_infix.py
index 7a107733b..4edc031d7 100644
--- a/spacy/tests/tokenizer/test_infix.py
+++ b/spacy/tests/tokenizer/test_infix.py
@@ -43,7 +43,7 @@ def test_double_hyphen(en_tokenizer):
     assert tokens[6].text == u'-'
     # TODO: This points to a deeper issue with the tokenizer: it doesn't re-enter
     # on infixes.
-    #assert tokens[7].text == u'bred'
-    #assert tokens[8].text == u'--'
-    #assert tokens[9].text == u'people'
+    assert tokens[7].text == u'bred'
+    assert tokens[8].text == u'--'
+    assert tokens[9].text == u'people'
 
diff --git a/spacy/tokenizer.pyx b/spacy/tokenizer.pyx
index 44d627505..a1a5c289c 100644
--- a/spacy/tokenizer.pyx
+++ b/spacy/tokenizer.pyx
@@ -28,8 +28,9 @@ cdef class Tokenizer:
         self._suffix_re = suffix_re
         self._infix_re = infix_re
         self.vocab = vocab
-        self._load_special_tokenization(rules)
-        self._rules = rules
+        self._rules = {}
+        for chunk, substrings in sorted(rules.items()):
+            self.add_special_case(chunk, substrings)
 
     def __reduce__(self):
         args = (self.vocab,
@@ -158,7 +159,8 @@ cdef class Tokenizer:
         self._attach_tokens(tokens, span, &prefixes, &suffixes)
         self._save_cached(&tokens.c[orig_size], orig_key, tokens.length - orig_size)
 
-    cdef unicode _split_affixes(self, Pool mem, unicode string, vector[const LexemeC*] *prefixes,
+    cdef unicode _split_affixes(self, Pool mem, unicode string,
+                                vector[const LexemeC*] *prefixes,
                                 vector[const LexemeC*] *suffixes):
         cdef size_t i
         cdef unicode prefix
@@ -215,20 +217,23 @@ cdef class Tokenizer:
         if string:
             cache_hit = self._try_cache(hash_string(string), tokens)
             if not cache_hit:
-                match = self.find_infix(string)
-                if match is None:
+                matches = self.find_infix(string)
+                if not matches:
                     tokens.push_back(self.vocab.get(tokens.mem, string), False)
                 else:
-                    split = match.start()
-                    end = match.end()
-                    # Append the beginning, affix, end of the infix span
-                    span = string[:split]
-                    tokens.push_back(self.vocab.get(tokens.mem, span), False)
+                    # let's say we have dyn-o-mite-dave
+                    # the regex finds the start and end positions of the hyphens
+                    start = 0
+                    for match in matches:
+                        infix_start = match.start()
+                        infix_end = match.end()
+                        span = string[start:infix_start]
+                        tokens.push_back(self.vocab.get(tokens.mem, span), False)
 
-                    span = string[split:end]
-                    tokens.push_back(self.vocab.get(tokens.mem, span), False)
-
-                    span = string[end:]
+                        infix_span = string[infix_start:infix_end]
+                        tokens.push_back(self.vocab.get(tokens.mem, infix_span), False)
+                        start = infix_end
+                    span = string[start:]
                     tokens.push_back(self.vocab.get(tokens.mem, span), False)
         cdef vector[const LexemeC*].reverse_iterator it = suffixes.rbegin()
         while it != suffixes.rend():
@@ -251,7 +256,7 @@ cdef class Tokenizer:
         self._cache.set(key, cached)
 
     def find_infix(self, unicode string):
-        return self._infix_re.search(string)
+        return list(self._infix_re.finditer(string))
 
     def find_prefix(self, unicode string):
         match = self._prefix_re.search(string)
@@ -262,21 +267,24 @@ cdef class Tokenizer:
         return (match.end() - match.start()) if match is not None else 0
 
     def _load_special_tokenization(self, special_cases):
-        '''Add a special-case tokenization rule.
+        '''Add special-case tokenization rules.
         '''
-        cdef int i
-        cdef list substrings
-        cdef unicode chunk
-        cdef unicode form
-        cdef unicode lemma
-        cdef dict props
-        cdef LexemeC** lexemes
-        cdef hash_t hashed
         for chunk, substrings in sorted(special_cases.items()):
-            cached = <_Cached*>self.mem.alloc(1, sizeof(_Cached))
-            cached.length = len(substrings)
-            cached.is_lex = False
-            cached.data.tokens = self.vocab.make_fused_token(substrings)
-            key = hash_string(chunk)
-            self._specials.set(key, cached)
-            self._cache.set(key, cached)
+            self.add_special_case(chunk, substrings)
+
+    def add_special_case(self, unicode chunk, substrings):
+        '''Add a special-case tokenization rule.
+
+        For instance, "don't" is special-cased to tokenize into
+        ["do", "n't"]. The split tokens can have lemmas and part-of-speech
+        tags.
+        '''
+        substrings = list(substrings)
+        cached = <_Cached*>self.mem.alloc(1, sizeof(_Cached))
+        cached.length = len(substrings)
+        cached.is_lex = False
+        cached.data.tokens = self.vocab.make_fused_token(substrings)
+        key = hash_string(chunk)
+        self._specials.set(key, cached)
+        self._cache.set(key, cached)
+        self._rules[chunk] = substrings
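Not part of the patch: a minimal usage sketch of the new public Tokenizer.add_special_case method introduced above. The import path (spacy.en.English) and the ORTH/LEMMA attribute keys follow spaCy's documented tokenizer-exception format and are assumptions here; the exact substring format accepted is whatever Vocab.make_fused_token expects in this version.

    # Usage sketch only; attribute keys and import path are assumptions,
    # the substring dicts must match what Vocab.make_fused_token accepts.
    from spacy.attrs import ORTH, LEMMA
    from spacy.en import English

    nlp = English()

    # Tokenize "gimme" as ["gim", "me"], carrying a lemma on the first piece.
    nlp.tokenizer.add_special_case(u'gimme',
        [{ORTH: u'gim', LEMMA: u'give'}, {ORTH: u'me'}])

    doc = nlp(u'gimme that')
    assert [t.text for t in doc] == [u'gim', u'me', u'that']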