Mirror of https://github.com/explosion/spaCy.git
* Allow infix tokens to be variable length

commit cfd842769e
parent 5b4c78bbb2
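The old splitting code assumed the matched infix was exactly one character wide (it sliced the infix span as [split, split + 1)), so a multi-character infix such as an ellipsis was cut apart. The following is a pure-Python illustration of that old behaviour; the example string is hypothetical, and spaCy's real code operates on UniStr buffers and vocab lexemes rather than plain str:

    # Old behaviour: the infix token is always taken to be one character.
    string = "well...good"
    split = string.index("...")
    pieces = [string[:split], string[split:split + 1], string[split + 1:]]
    print(pieces)   # ['well', '.', '..good'] -- the ellipsis is torn apart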
@@ -37,5 +37,5 @@ cdef class Tokenizer:
                         vector[LexemeC*] *prefixes, vector[LexemeC*] *suffixes) except -1
     cdef int _find_prefix(self, Py_UNICODE* characters, size_t length) except -1
     cdef int _find_suffix(self, Py_UNICODE* characters, size_t length) except -1
-    cdef int _find_infix(self, Py_UNICODE* characters, size_t length) except -1
+    cdef object _find_infix(self, Py_UNICODE* characters, size_t length)
     cdef int _save_cached(self, const TokenC* tokens, hash_t key, int n) except -1
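The declaration change above turns _find_infix from a C-int return with an except -1 error sentinel into a method that returns a Python object: the regex match, or None when the string contains no infix. Because the return value is now a Python object, exceptions can propagate through it and no sentinel clause is needed. A minimal plain-Python sketch of the new contract, with illustrative names:

    import re
    from typing import Optional

    def find_infix(infix_re: re.Pattern, string: str) -> Optional[re.Match]:
        # Return the first infix match, or None when no infix is present.
        return infix_re.search(string)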
@@ -175,8 +175,7 @@ cdef class Tokenizer:
                             vector[const LexemeC*] *prefixes,
                             vector[const LexemeC*] *suffixes) except -1:
         cdef bint cache_hit
-        cdef bint is_spacy
-        cdef int split
+        cdef int split, end
         cdef const LexemeC* const* lexemes
         cdef const LexemeC* lexeme
         cdef UniStr span
@@ -189,16 +188,20 @@ cdef class Tokenizer:
             if cache_hit:
                 pass
             else:
-                split = self._find_infix(string.chars, string.n)
-                if split == 0 or split == -1:
+                match = self._find_infix(string.chars, string.n)
+                if match is None:
                     tokens.push_back(self.vocab.get(tokens.mem, string), False)
                 else:
-                    # Append the beginning, afix, end of the infix token
+                    split = match.start()
+                    end = match.end()
+                    # Append the beginning, afix, end of the infix span
                     slice_unicode(&span, string.chars, 0, split)
                     tokens.push_back(self.vocab.get(tokens.mem, &span), False)
-                    slice_unicode(&span, string.chars, split, split+1)
+                    slice_unicode(&span, string.chars, split, end)
                     tokens.push_back(self.vocab.get(tokens.mem, &span), False)
-                    slice_unicode(&span, string.chars, split + 1, string.n)
+                    slice_unicode(&span, string.chars, end, string.n)
                     tokens.push_back(self.vocab.get(tokens.mem, &span), False)
         cdef vector[const LexemeC*].reverse_iterator it = suffixes.rbegin()
         while it != suffixes.rend():
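The new logic reads both ends of the regex match, so the token is split into the text before the infix, the full infix span, and the remainder. A pure-Python sketch of the same three-way split; the infix pattern is hypothetical, and spaCy slices UniStr spans and pushes vocab lexemes instead of returning strings:

    import re

    infix_re = re.compile(r"\.\.\.|-")   # hypothetical infix pattern

    def split_on_infix(string):
        match = infix_re.search(string)
        if match is None:
            return [string]
        split, end = match.start(), match.end()
        # beginning, infix span, remainder -- the infix may now be any length
        return [string[:split], string[split:end], string[end:]]

    print(split_on_infix("well...good"))   # ['well', '...', 'good']
    print(split_on_infix("nothing"))       # ['nothing']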
@@ -220,10 +223,9 @@ cdef class Tokenizer:
             cached.data.lexemes = <const LexemeC* const*>lexemes
         self._cache.set(key, cached)
 
-    cdef int _find_infix(self, Py_UNICODE* chars, size_t length) except -1:
+    cdef object _find_infix(self, Py_UNICODE* chars, size_t length):
         cdef unicode string = chars[:length]
-        match = self._infix_re.search(string)
-        return match.start() if match is not None else 0
+        return self._infix_re.search(string)
 
     cdef int _find_prefix(self, Py_UNICODE* chars, size_t length) except -1:
         cdef unicode string = chars[:length]
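As the old implementation reads, a match starting at offset 0 and no match at all both returned 0, and the caller treated the two cases the same; returning the match object (or None) removes that ambiguity as well. A small self-contained illustration with a hypothetical pattern:

    import re

    infix_re = re.compile(r"-")

    def find_infix_old(string):
        match = infix_re.search(string)
        return match.start() if match is not None else 0

    print(find_infix_old("x-y"))   # 1    -- infix at offset 1
    print(find_infix_old("-xy"))   # 0    -- infix at offset 0 ...
    print(find_infix_old("xy"))    # 0    -- ... looks the same as no match
    print(infix_re.search("xy"))   # None -- the new return value is unambiguous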