* More slight cleaning for lang.pyx

Matthew Honnibal 2014-10-10 19:50:07 +11:00
parent 02e948e7d5
commit 3d82ed1e5e

@@ -86,24 +86,18 @@ cdef class Language:
             if Py_UNICODE_ISSPACE(chars[i]) == 1:
                 if start < i:
                     string_from_slice(&span, chars, start, i)
-                    self._tokenize(tokens.v, &span)
+                    if not _extend_from_map(tokens.v, &span, self.cache):
+                        self._tokenize(tokens.v, &span)
                 start = i + 1
             i += 1
         if start < i:
             string_from_slice(&span, chars, start, i)
-            self._tokenize(tokens.v, &span)
+            if not _extend_from_map(tokens.v, &span, self.cache):
+                self._tokenize(tokens.v, &span)
         return tokens

     cdef int _tokenize(self, vector[LexemeC*] *tokens_v, String* string) except -1:
         cdef size_t i
-        lexemes = <LexemeC**>self.cache.get(string.key)
-        if lexemes != NULL:
-            i = 0
-            while lexemes[i] != NULL:
-                tokens_v.push_back(lexemes[i])
-                i += 1
-            return 0
         cdef uint64_t orig_key = string.key
         cdef size_t orig_size = tokens_v.size()
@@ -162,13 +156,7 @@ cdef class Language:
         for lexeme in deref(prefixes):
             tokens.push_back(lexeme)
         if string.n != 0:
-            lexemes = <LexemeC**>self.specials.get(string.key)
-            if lexemes != NULL:
-                i = 0
-                while lexemes[i] != NULL:
-                    tokens.push_back(lexemes[i])
-                    i += 1
-            else:
+            if not _extend_from_map(tokens, string, self.specials):
                 tokens.push_back(self.lexicon.get(string))
         cdef vector[LexemeC*].reverse_iterator it = suffixes.rbegin()
         while it != suffixes.rend():
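
Both call sites above now follow the same shape: try the cached analysis first, and only fall back to the full _tokenize pass on a miss. A minimal pure-Python sketch of that control flow (not the Cython code itself; slow_tokenize and the dict-like cache are illustrative stand-ins):

    def slow_tokenize(tokens, span):
        # Hypothetical stand-in for Language._tokenize: naive fallback
        # that treats the whole span as a single token.
        tokens.append(span)

    def handle_span(tokens, span, cache):
        # Cache-first: reuse a stored analysis when this span was seen before.
        cached = cache.get(span)
        if cached is not None:
            tokens.extend(cached)
        else:
            slow_tokenize(tokens, span)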
@@ -272,6 +260,17 @@ cdef class Lexicon:
         return Lexeme(<size_t>lexeme)


+cdef int _extend_from_map(vector[LexemeC*] *tokens, String* string, PreshMap map_) except -1:
+    lexemes = <LexemeC**>map_.get(string.key)
+    if lexemes == NULL:
+        return 0
+    cdef size_t i = 0
+    while lexemes[i] != NULL:
+        tokens.push_back(lexemes[i])
+        i += 1
+    return 1
+
+
 cdef void string_from_unicode(String* s, unicode uni):
     cdef Py_UNICODE* c_uni = <Py_UNICODE*>uni
     string_from_slice(s, c_uni, 0, len(uni))
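
The new module-level helper walks a NULL-terminated LexemeC** array fetched from a PreshMap and reports a hit with 1 and a miss with 0, leaving -1 (via except -1) free to signal a Python exception. A pure-Python sketch under those assumptions, with None standing in for both the NULL miss and the NULL terminator:

    def extend_from_map(tokens, key, map_):
        # None stands in for the NULL pointer returned on a map miss.
        lexemes = map_.get(key)
        if lexemes is None:
            return 0                   # miss: caller takes the slow path
        i = 0
        while lexemes[i] is not None:  # None mirrors the NULL terminator
            tokens.append(lexemes[i])
            i += 1
        return 1                       # hit: tokens extended in place

The boolean-style return is what lets both call sites collapse to a single "if not _extend_from_map(...)" guard.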