From cfd842769ec43438e162b194b95f4b007b97d557 Mon Sep 17 00:00:00 2001
From: Matthew Honnibal
Date: Sat, 18 Jul 2015 22:45:00 +0200
Subject: [PATCH] * Allow infix tokens to be variable length

---
 spacy/tokenizer.pxd |  2 +-
 spacy/tokenizer.pyx | 22 ++++++++++++----------
 2 files changed, 13 insertions(+), 11 deletions(-)

diff --git a/spacy/tokenizer.pxd b/spacy/tokenizer.pxd
index 6f4656962..4a9cafb05 100644
--- a/spacy/tokenizer.pxd
+++ b/spacy/tokenizer.pxd
@@ -37,5 +37,5 @@ cdef class Tokenizer:
                             vector[LexemeC*] *prefixes, vector[LexemeC*] *suffixes) except -1
     cdef int _find_prefix(self, Py_UNICODE* characters, size_t length) except -1
     cdef int _find_suffix(self, Py_UNICODE* characters, size_t length) except -1
-    cdef int _find_infix(self, Py_UNICODE* characters, size_t length) except -1
+    cdef object _find_infix(self, Py_UNICODE* characters, size_t length)
     cdef int _save_cached(self, const TokenC* tokens, hash_t key, int n) except -1
diff --git a/spacy/tokenizer.pyx b/spacy/tokenizer.pyx
index d174ca71a..f83a7b677 100644
--- a/spacy/tokenizer.pyx
+++ b/spacy/tokenizer.pyx
@@ -175,8 +175,7 @@ cdef class Tokenizer:
                             vector[const LexemeC*] *prefixes,
                             vector[const LexemeC*] *suffixes) except -1:
         cdef bint cache_hit
-        cdef bint is_spacy
-        cdef int split
+        cdef int split, end
         cdef const LexemeC* const* lexemes
         cdef const LexemeC* lexeme
         cdef UniStr span
@@ -189,16 +188,20 @@ cdef class Tokenizer:
             if cache_hit:
                 pass
             else:
-                split = self._find_infix(string.chars, string.n)
-                if split == 0 or split == -1:
+                match = self._find_infix(string.chars, string.n)
+                if match is None:
                     tokens.push_back(self.vocab.get(tokens.mem, string), False)
                 else:
-                    # Append the beginning, afix, end of the infix token
+                    split = match.start()
+                    end = match.end()
+                    # Append the beginning, affix, end of the infix span
                     slice_unicode(&span, string.chars, 0, split)
                     tokens.push_back(self.vocab.get(tokens.mem, &span), False)
-                    slice_unicode(&span, string.chars, split, split+1)
+
+                    slice_unicode(&span, string.chars, split, end)
                     tokens.push_back(self.vocab.get(tokens.mem, &span), False)
-                    slice_unicode(&span, string.chars, split + 1, string.n)
+
+                    slice_unicode(&span, string.chars, end, string.n)
                     tokens.push_back(self.vocab.get(tokens.mem, &span), False)
         cdef vector[const LexemeC*].reverse_iterator it = suffixes.rbegin()
         while it != suffixes.rend():
@@ -220,10 +223,9 @@ cdef class Tokenizer:
         cached.data.lexemes = <const LexemeC* const*>lexemes
         self._cache.set(key, cached)

-    cdef int _find_infix(self, Py_UNICODE* chars, size_t length) except -1:
+    cdef object _find_infix(self, Py_UNICODE* chars, size_t length):
         cdef unicode string = chars[:length]
-        match = self._infix_re.search(string)
-        return match.start() if match is not None else 0
+        return self._infix_re.search(string)

     cdef int _find_prefix(self, Py_UNICODE* chars, size_t length) except -1:
         cdef unicode string = chars[:length]
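Note for reviewers (not part of the patch): before this change, _find_infix returned only a start offset, and _attach_tokens hard-coded the infix token as exactly one character (split, split + 1). Returning the whole match object exposes match.end(), so a multi-character infix such as "..." now comes out as a single token. It also removes an ambiguity in the old code, which conflated "no match" with "match at offset 0", since both returned 0.

Below is a minimal pure-Python sketch of the new control flow, for reading the diff without the surrounding Cython. The regex here is a hypothetical stand-in, not spaCy's actual infix pattern (the real expression is compiled onto the tokenizer as self._infix_re), and split_infix is an illustrative helper, not a spaCy function:

    import re

    # Hypothetical infix pattern: an ellipsis or a single hyphen/tilde.
    # Chosen only to illustrate a match longer than one character.
    infix_re = re.compile(r'\.\.\.|[-~]')

    def split_infix(substring):
        # Mirrors the patched branch of _attach_tokens: keep the full match
        # object so both start() and end() are available.
        match = infix_re.search(substring)
        if match is None:
            return [substring]                  # no infix: keep the span whole
        split, end = match.start(), match.end()
        # beginning, affix, end of the infix span
        return [substring[:split], substring[split:end], substring[end:]]

    print(split_infix("well...best"))  # ['well', '...', 'best']
    print(split_infix("hello"))        # ['hello']

With the old one-character slice, "well...best" would have produced 'well', '.', '..best' instead.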