* Recognise multiple infixes in a token.

This commit is contained in:
Matthew Honnibal 2016-04-13 18:38:26 +10:00
parent 6df3858dbc
commit 04d0209be9
2 changed files with 42 additions and 34 deletions

View File

@ -43,7 +43,7 @@ def test_double_hyphen(en_tokenizer):
assert tokens[6].text == u'-'
# TODO: This points to a deeper issue with the tokenizer: it doesn't re-enter
# on infixes.
#assert tokens[7].text == u'bred'
#assert tokens[8].text == u'--'
#assert tokens[9].text == u'people'
assert tokens[7].text == u'bred'
assert tokens[8].text == u'--'
assert tokens[9].text == u'people'

View File

@ -28,8 +28,9 @@ cdef class Tokenizer:
self._suffix_re = suffix_re
self._infix_re = infix_re
self.vocab = vocab
self._load_special_tokenization(rules)
self._rules = rules
self._rules = {}
for chunk, substrings in sorted(rules.items()):
self.add_special_case(chunk, substrings)
def __reduce__(self):
args = (self.vocab,
@ -158,7 +159,8 @@ cdef class Tokenizer:
self._attach_tokens(tokens, span, &prefixes, &suffixes)
self._save_cached(&tokens.c[orig_size], orig_key, tokens.length - orig_size)
cdef unicode _split_affixes(self, Pool mem, unicode string, vector[const LexemeC*] *prefixes,
cdef unicode _split_affixes(self, Pool mem, unicode string,
vector[const LexemeC*] *prefixes,
vector[const LexemeC*] *suffixes):
cdef size_t i
cdef unicode prefix
@ -215,20 +217,23 @@ cdef class Tokenizer:
if string:
cache_hit = self._try_cache(hash_string(string), tokens)
if not cache_hit:
match = self.find_infix(string)
if match is None:
matches = self.find_infix(string)
if not matches:
tokens.push_back(self.vocab.get(tokens.mem, string), False)
else:
split = match.start()
end = match.end()
# Append the beginning, affix, end of the infix span
span = string[:split]
# Example: for "dyn-o-mite-dave", each regex match gives the start and end
# positions of one hyphen; emit the text before it, then the hyphen itself.
start = 0
for match in matches:
infix_start = match.start()
infix_end = match.end()
span = string[start:infix_start]
tokens.push_back(self.vocab.get(tokens.mem, span), False)
span = string[split:end]
tokens.push_back(self.vocab.get(tokens.mem, span), False)
span = string[end:]
infix_span = string[infix_start:infix_end]
tokens.push_back(self.vocab.get(tokens.mem, infix_span), False)
start = infix_end
span = string[start:]
tokens.push_back(self.vocab.get(tokens.mem, span), False)
cdef vector[const LexemeC*].reverse_iterator it = suffixes.rbegin()
while it != suffixes.rend():
@ -251,7 +256,7 @@ cdef class Tokenizer:
self._cache.set(key, cached)
def find_infix(self, unicode string):
return self._infix_re.search(string)
return list(self._infix_re.finditer(string))
def find_prefix(self, unicode string):
match = self._prefix_re.search(string)
@ -262,17 +267,19 @@ cdef class Tokenizer:
return (match.end() - match.start()) if match is not None else 0
def _load_special_tokenization(self, special_cases):
'''Add a special-case tokenization rule.
'''Add special-case tokenization rules.
'''
cdef int i
cdef list substrings
cdef unicode chunk
cdef unicode form
cdef unicode lemma
cdef dict props
cdef LexemeC** lexemes
cdef hash_t hashed
for chunk, substrings in sorted(special_cases.items()):
self.add_special_case(chunk, substrings)
def add_special_case(self, unicode chunk, substrings):
'''Add a special-case tokenization rule.
For instance, "don't" is special-cased to tokenize into
["do", "n't"]. The split tokens can have lemmas and part-of-speech
tags.
'''
substrings = list(substrings)
cached = <_Cached*>self.mem.alloc(1, sizeof(_Cached))
cached.length = len(substrings)
cached.is_lex = False
@ -280,3 +287,4 @@ cdef class Tokenizer:
key = hash_string(chunk)
self._specials.set(key, cached)
self._cache.set(key, cached)
self._rules[chunk] = substrings