Recognise multiple infixes in a token.

Matthew Honnibal 2016-04-13 18:38:26 +10:00
parent 6df3858dbc
commit 04d0209be9
2 changed files with 42 additions and 34 deletions


@@ -43,7 +43,7 @@ def test_double_hyphen(en_tokenizer):
     assert tokens[6].text == u'-'
     # TODO: This points to a deeper issue with the tokenizer: it doesn't re-enter
     # on infixes.
-    #assert tokens[7].text == u'bred'
-    #assert tokens[8].text == u'--'
-    #assert tokens[9].text == u'people'
+    assert tokens[7].text == u'bred'
+    assert tokens[8].text == u'--'
+    assert tokens[9].text == u'people'
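The newly enabled assertions only pass once the tokenizer splits on every infix it finds, not just the first one. A minimal pure-Python sketch of the difference, using a toy hyphen-only pattern (spaCy's real infix rules are broader; everything here is illustrative):

import re

# Toy infix pattern covering single and double hyphens.
infix_re = re.compile(r'--|-')
chunk = u'well-bred--people'

# Old behaviour: .search() finds only the first infix, so everything after it
# ('bred--people') stays glued together and the later asserts could not pass.
first = infix_re.search(chunk)
print([chunk[:first.start()], chunk[first.start():first.end()], chunk[first.end():]])
# ['well', '-', 'bred--people']

# New behaviour: .finditer() yields every infix, so 'bred', '--' and 'people'
# can come out as separate tokens.
print([m.group() for m in infix_re.finditer(chunk)])
# ['-', '--']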


@@ -28,8 +28,9 @@ cdef class Tokenizer:
         self._suffix_re = suffix_re
         self._infix_re = infix_re
         self.vocab = vocab
-        self._load_special_tokenization(rules)
-        self._rules = rules
+        self._rules = {}
+        for chunk, substrings in sorted(rules.items()):
+            self.add_special_case(chunk, substrings)

     def __reduce__(self):
         args = (self.vocab,
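Because the constructor now routes every rule through the public add_special_case method, further exceptions can presumably also be registered on an already-built tokenizer. A hedged usage sketch, with a made-up substring format (the attribute keys spaCy actually expects for forms, lemmas and tags are defined elsewhere and not shown here):

# Assumes an already-constructed Tokenizer instance named `tokenizer`.
# 'F' is a placeholder key for the surface form, not a documented API.
tokenizer.add_special_case(u"can't", [{'F': u'ca'}, {'F': u"n't"}])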
@@ -158,7 +159,8 @@ cdef class Tokenizer:
        self._attach_tokens(tokens, span, &prefixes, &suffixes)
        self._save_cached(&tokens.c[orig_size], orig_key, tokens.length - orig_size)

-    cdef unicode _split_affixes(self, Pool mem, unicode string, vector[const LexemeC*] *prefixes,
+    cdef unicode _split_affixes(self, Pool mem, unicode string,
+                                vector[const LexemeC*] *prefixes,
                                 vector[const LexemeC*] *suffixes):
         cdef size_t i
         cdef unicode prefix
@@ -215,20 +217,23 @@ cdef class Tokenizer:
         if string:
             cache_hit = self._try_cache(hash_string(string), tokens)
             if not cache_hit:
-                match = self.find_infix(string)
-                if match is None:
+                matches = self.find_infix(string)
+                if not matches:
                     tokens.push_back(self.vocab.get(tokens.mem, string), False)
                 else:
-                    split = match.start()
-                    end = match.end()
-                    # Append the beginning, affix, end of the infix span
-                    span = string[:split]
-                    tokens.push_back(self.vocab.get(tokens.mem, span), False)
-                    span = string[split:end]
-                    tokens.push_back(self.vocab.get(tokens.mem, span), False)
-                    span = string[end:]
+                    # let's say we have dyn-o-mite-dave
+                    # the regex finds the start and end positions of the hyphens
+                    start = 0
+                    for match in matches:
+                        infix_start = match.start()
+                        infix_end = match.end()
+                        span = string[start:infix_start]
+                        tokens.push_back(self.vocab.get(tokens.mem, span), False)
+                        infix_span = string[infix_start:infix_end]
+                        tokens.push_back(self.vocab.get(tokens.mem, infix_span), False)
+                        start = infix_end
+                    span = string[start:]
                     tokens.push_back(self.vocab.get(tokens.mem, span), False)
         cdef vector[const LexemeC*].reverse_iterator it = suffixes.rbegin()
         while it != suffixes.rend():
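The loop above is the heart of the change: rather than splitting at one match, it walks every infix match, emitting the text before each infix, the infix itself, and finally whatever follows the last one. A standalone Python sketch of the same walk over the 'dyn-o-mite-dave' example from the comments, with a stand-in hyphen pattern instead of the tokenizer's real infix regex:

import re

infix_re = re.compile(r'-')  # stand-in for the tokenizer's infix pattern

def split_chunk(string):
    matches = list(infix_re.finditer(string))
    if not matches:
        return [string]
    tokens = []
    start = 0
    for match in matches:
        infix_start, infix_end = match.start(), match.end()
        tokens.append(string[start:infix_start])      # text before the infix
        tokens.append(string[infix_start:infix_end])  # the infix itself
        start = infix_end
    tokens.append(string[start:])                     # remainder after the last infix
    return tokens

print(split_chunk(u'dyn-o-mite-dave'))
# ['dyn', '-', 'o', '-', 'mite', '-', 'dave']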
@@ -251,7 +256,7 @@ cdef class Tokenizer:
         self._cache.set(key, cached)

     def find_infix(self, unicode string):
-        return self._infix_re.search(string)
+        return list(self._infix_re.finditer(string))

     def find_prefix(self, unicode string):
         match = self._prefix_re.search(string)
@@ -262,17 +267,19 @@ cdef class Tokenizer:
         return (match.end() - match.start()) if match is not None else 0

     def _load_special_tokenization(self, special_cases):
-        '''Add a special-case tokenization rule.
+        '''Add special-case tokenization rules.
         '''
-        cdef int i
-        cdef list substrings
-        cdef unicode chunk
-        cdef unicode form
-        cdef unicode lemma
-        cdef dict props
-        cdef LexemeC** lexemes
-        cdef hash_t hashed
         for chunk, substrings in sorted(special_cases.items()):
-            cached = <_Cached*>self.mem.alloc(1, sizeof(_Cached))
-            cached.length = len(substrings)
-            cached.is_lex = False
+            self.add_special_case(chunk, substrings)
+
+    def add_special_case(self, unicode chunk, substrings):
+        '''Add a special-case tokenization rule.
+
+        For instance, "don't" is special-cased to tokenize into
+        ["do", "n't"]. The split tokens can have lemmas and part-of-speech
+        tags.
+        '''
+        substrings = list(substrings)
+        cached = <_Cached*>self.mem.alloc(1, sizeof(_Cached))
+        cached.length = len(substrings)
+        cached.is_lex = False
@@ -280,3 +287,4 @@ cdef class Tokenizer:
         key = hash_string(chunk)
         self._specials.set(key, cached)
         self._cache.set(key, cached)
+        self._rules[chunk] = substrings
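add_special_case stores the pre-tokenized form under the hash of the raw chunk, so the main loop can serve "don't" straight from the cache before any prefix, suffix or infix logic runs, and the raw rule is kept in _rules so the tokenizer can be reconstructed later. A much-simplified pure-Python picture of that lookup-first pattern (a dict in place of the C hash tables, plain strings in place of lexemes; not the real data layout):

class ToySpecialCases(object):
    def __init__(self):
        self._specials = {}   # stands in for the C-level _specials/_cache tables
        self._rules = {}      # raw rules kept around, e.g. for pickling

    def add_special_case(self, chunk, substrings):
        substrings = list(substrings)
        self._specials[hash(chunk)] = substrings  # hash_string() in the real code
        self._rules[chunk] = substrings

    def tokenize_chunk(self, chunk):
        # Special cases short-circuit the normal affix/infix machinery.
        cached = self._specials.get(hash(chunk))
        if cached is not None:
            return list(cached)
        return [chunk]  # fall through to the real splitting logic in spaCy

cases = ToySpecialCases()
cases.add_special_case(u"don't", [u'do', u"n't"])
print(cases.tokenize_chunk(u"don't"))   # ['do', "n't"]
print(cases.tokenize_chunk(u'cats'))    # ['cats']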