* Allow infix tokens to be variable length

Matthew Honnibal 2015-07-18 22:45:00 +02:00
parent 5b4c78bbb2
commit cfd842769e
2 changed files with 13 additions and 11 deletions
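
Before this change the tokenizer assumed every infix token was exactly one character wide: it split at the match offset and sliced out a single character. It now takes the full span reported by the regular-expression match, so an infix such as an ellipsis becomes one token. A plain-Python sketch of the new splitting step (the pattern, sample text, and function name are illustrative, not part of the commit):

import re

infix_re = re.compile(r"\.\.+|-")   # illustrative pattern, not spaCy's real infix rules

def split_on_infix(substring):
    # Mirror of the new _attach_tokens logic: slice at match.start() and
    # match.end() rather than assuming a one-character infix.
    match = infix_re.search(substring)
    if match is None:
        return [substring]
    split, end = match.start(), match.end()
    return [substring[:split], substring[split:end], substring[end:]]

print(split_on_infix("hello...world"))   # ['hello', '...', 'world']

With the old one-character slicing, the same match would have produced ['hello', '.', '..world'].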

View File

@@ -37,5 +37,5 @@ cdef class Tokenizer:
                           vector[LexemeC*] *prefixes, vector[LexemeC*] *suffixes) except -1
     cdef int _find_prefix(self, Py_UNICODE* characters, size_t length) except -1
     cdef int _find_suffix(self, Py_UNICODE* characters, size_t length) except -1
-    cdef int _find_infix(self, Py_UNICODE* characters, size_t length) except -1
+    cdef object _find_infix(self, Py_UNICODE* characters, size_t length)
     cdef int _save_cached(self, const TokenC* tokens, hash_t key, int n) except -1

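The declaration change mirrors the new return contract: _find_infix now hands back a Python object (the regex match, or None when nothing matches) rather than a C integer offset, so the except -1 error sentinel, which only applies to C-typed return values, is dropped. Roughly, in plain-Python terms (the helper name and type hints are illustrative):

import re
from typing import Optional

def find_infix(infix_re: re.Pattern, string: str) -> Optional[re.Match]:
    # None signals "no infix here"; a match carries both start() and end().
    return infix_re.search(string)
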
View File

@@ -175,8 +175,7 @@ cdef class Tokenizer:
                           vector[const LexemeC*] *prefixes,
                           vector[const LexemeC*] *suffixes) except -1:
         cdef bint cache_hit
-        cdef bint is_spacy
-        cdef int split
+        cdef int split, end
         cdef const LexemeC* const* lexemes
         cdef const LexemeC* lexeme
         cdef UniStr span
@@ -189,16 +188,20 @@ cdef class Tokenizer:
             if cache_hit:
                 pass
             else:
-                split = self._find_infix(string.chars, string.n)
-                if split == 0 or split == -1:
+                match = self._find_infix(string.chars, string.n)
+                if match is None:
                     tokens.push_back(self.vocab.get(tokens.mem, string), False)
                 else:
-                    # Append the beginning, afix, end of the infix token
+                    split = match.start()
+                    end = match.end()
+                    # Append the beginning, afix, end of the infix span
                     slice_unicode(&span, string.chars, 0, split)
                     tokens.push_back(self.vocab.get(tokens.mem, &span), False)
-                    slice_unicode(&span, string.chars, split, split+1)
+
+                    slice_unicode(&span, string.chars, split, end)
                     tokens.push_back(self.vocab.get(tokens.mem, &span), False)
-                    slice_unicode(&span, string.chars, split + 1, string.n)
+
+                    slice_unicode(&span, string.chars, end, string.n)
                     tokens.push_back(self.vocab.get(tokens.mem, &span), False)
         cdef vector[const LexemeC*].reverse_iterator it = suffixes.rbegin()
         while it != suffixes.rend():
@@ -220,10 +223,9 @@ cdef class Tokenizer:
             cached.data.lexemes = <const LexemeC* const*>lexemes
             self._cache.set(key, cached)
 
-    cdef int _find_infix(self, Py_UNICODE* chars, size_t length) except -1:
+    cdef object _find_infix(self, Py_UNICODE* chars, size_t length):
         cdef unicode string = chars[:length]
-        match = self._infix_re.search(string)
-        return match.start() if match is not None else 0
+        return self._infix_re.search(string)
 
     cdef int _find_prefix(self, Py_UNICODE* chars, size_t length) except -1:
         cdef unicode string = chars[:length]
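
The old _find_infix body collapsed "no match" into the sentinel 0, which the caller had to special-case (if split == 0 or split == -1) and which could not be told apart from a genuine match at offset 0; it also discarded the match length. A plain-Python restatement of the contract that was removed (illustrative, not the Cython source):

def find_infix_old(infix_re, string):
    # Replaced contract: an int offset, with 0 doubling as "no infix found",
    # so the infix was always treated as one character long.
    match = infix_re.search(string)
    return match.start() if match is not None else 0

Returning the match object directly, as the new body does, lets _attach_tokens test for None and slice the string at both match.start() and match.end().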