mirror of https://github.com/explosion/spaCy.git
synced 2024-12-25 17:36:30 +03:00
* More slight cleaning for lang.pyx

commit 3d82ed1e5e (parent 02e948e7d5)
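
In short (summarizing the hunks below): the two hand-rolled probe-the-map-and-copy-lexemes loops, one at the top of Language._tokenize against self.cache and one in the special-cases lookup against self.specials, are hoisted into a single module-level helper, _extend_from_map, which returns 1 on a hit and 0 on a miss so each caller can fall through to its slow path.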
@@ -86,24 +86,18 @@ cdef class Language:
             if Py_UNICODE_ISSPACE(chars[i]) == 1:
                 if start < i:
                     string_from_slice(&span, chars, start, i)
-                    self._tokenize(tokens.v, &span)
+                    if not _extend_from_map(tokens.v, &span, self.cache):
+                        self._tokenize(tokens.v, &span)
                 start = i + 1
             i += 1
         if start < i:
             string_from_slice(&span, chars, start, i)
-            self._tokenize(tokens.v, &span)
+            if not _extend_from_map(tokens.v, &span, self.cache):
+                self._tokenize(tokens.v, &span)
         return tokens

     cdef int _tokenize(self, vector[LexemeC*] *tokens_v, String* string) except -1:
         cdef size_t i
-        lexemes = <LexemeC**>self.cache.get(string.key)
-        if lexemes != NULL:
-            i = 0
-            while lexemes[i] != NULL:
-                tokens_v.push_back(lexemes[i])
-                i += 1
-            return 0
-
         cdef uint64_t orig_key = string.key
         cdef size_t orig_size = tokens_v.size()

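To make the control-flow change concrete, here is a minimal Python sketch of this hunk; the names are hypothetical, a plain dict stands in for the PreshMap cache, and a trivial append stands in for the real segmentation work:

def _tokenize(tokens, span):
    # Stand-in for Language._tokenize's real segmentation; the cache
    # probe that used to sit at the top of it has been removed.
    tokens.append(span)

def tokenize(text, cache):
    tokens = []
    for span in text.split():
        cached = cache.get(span)       # probe now happens at the call site
        if cached is not None:
            tokens.extend(cached)      # hit: splice in the cached analysis
        else:
            _tokenize(tokens, span)    # miss: fall through to the slow path
    return tokens

print(tokenize("isn't it", {"isn't": ["is", "n't"]}))
# -> ['is', "n't", 'it']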
@@ -162,13 +156,7 @@ cdef class Language:
         for lexeme in deref(prefixes):
             tokens.push_back(lexeme)
         if string.n != 0:
-            lexemes = <LexemeC**>self.specials.get(string.key)
-            if lexemes != NULL:
-                i = 0
-                while lexemes[i] != NULL:
-                    tokens.push_back(lexemes[i])
-                    i += 1
-            else:
+            if not _extend_from_map(tokens, string, self.specials):
                 tokens.push_back(self.lexicon.get(string))
         cdef vector[LexemeC*].reverse_iterator it = suffixes.rbegin()
         while it != suffixes.rend():
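The same helper replaces the second copy of the lookup loop here: a miss against self.specials now falls through to tokens.push_back(self.lexicon.get(string)) instead of going through the hand-written walk under if lexemes != NULL:.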
@@ -272,6 +260,17 @@ cdef class Lexicon:
         return Lexeme(<size_t>lexeme)


+cdef int _extend_from_map(vector[LexemeC*] *tokens, String* string, PreshMap map_) except -1:
+    lexemes = <LexemeC**>map_.get(string.key)
+    if lexemes == NULL:
+        return 0
+    cdef size_t i = 0
+    while lexemes[i] != NULL:
+        tokens.push_back(lexemes[i])
+        i += 1
+    return 1
+
+
 cdef void string_from_unicode(String* s, unicode uni):
     cdef Py_UNICODE* c_uni = <Py_UNICODE*>uni
     string_from_slice(s, c_uni, 0, len(uni))
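The helper's return value doubles as a hit/miss flag (1: the map supplied the lexemes; 0: the caller should run its fallback), while except -1 remains Cython's channel for propagating Python exceptions out of a cdef function. A rough Python rendering of the NULL-terminated walk, illustrative only, with None playing the role of the NULL sentinel:

def _extend_from_map(tokens, key, table):
    # table maps a hash key to a None-terminated list of lexemes,
    # mimicking the LexemeC** arrays stored in the PreshMap.
    lexemes = table.get(key)
    if lexemes is None:
        return 0                # miss: caller falls back to _tokenize
    i = 0
    while lexemes[i] is not None:
        tokens.append(lexemes[i])
        i += 1
    return 1                    # hit: tokens already extended

tokens = []
table = {0x2A: ["do", "n't", None]}
assert _extend_from_map(tokens, 0x2A, table) == 1
assert _extend_from_map(tokens, 0x2B, table) == 0
print(tokens)  # -> ['do', "n't"]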