mirror of
https://github.com/explosion/spaCy.git
synced 2025-02-04 21:50:35 +03:00
* Preparations in place to handle hyphenation etc
This commit is contained in:
parent
ff79dbac2e
commit
868e558037
|
@ -52,6 +52,8 @@ cdef class Language:
|
||||||
cpdef Lexeme lookup(self, unicode text)
|
cpdef Lexeme lookup(self, unicode text)
|
||||||
|
|
||||||
cdef int _tokenize(self, vector[LexemeC*] *tokens_v, String* string) except -1
|
cdef int _tokenize(self, vector[LexemeC*] *tokens_v, String* string) except -1
|
||||||
|
|
||||||
|
cdef int _split_body_token(self, vector[LexemeC*] *tokens, String* string) except -1
|
||||||
cdef int _find_prefix(self, Py_UNICODE* characters, size_t length)
|
cdef int _find_prefix(self, Py_UNICODE* characters, size_t length)
|
||||||
cdef int _find_suffix(self, Py_UNICODE* characters, size_t length)
|
cdef int _find_suffix(self, Py_UNICODE* characters, size_t length)
|
||||||
|
|
||||||
|
|
|
@ -155,16 +155,14 @@ cdef class Language:
|
||||||
cdef LexemeC* lexeme
|
cdef LexemeC* lexeme
|
||||||
for lexeme in deref(prefixes):
|
for lexeme in deref(prefixes):
|
||||||
tokens.push_back(lexeme)
|
tokens.push_back(lexeme)
|
||||||
if string.n != 0:
|
if not _extend_from_map(tokens, string, self.specials):
|
||||||
if not _extend_from_map(tokens, string, self.specials):
|
self._split_body_token(tokens, string)
|
||||||
tokens.push_back(self.lexicon.get(string))
|
|
||||||
cdef vector[LexemeC*].reverse_iterator it = suffixes.rbegin()
|
cdef vector[LexemeC*].reverse_iterator it = suffixes.rbegin()
|
||||||
while it != suffixes.rend():
|
while it != suffixes.rend():
|
||||||
tokens.push_back(deref(it))
|
tokens.push_back(deref(it))
|
||||||
preinc(it)
|
preinc(it)
|
||||||
|
|
||||||
cdef int _save_cached(self, vector[LexemeC*] *tokens,
|
cdef int _save_cached(self, vector[LexemeC*] *tokens, uint64_t key, size_t n) except -1:
|
||||||
uint64_t key, size_t n) except -1:
|
|
||||||
assert tokens.size() > n
|
assert tokens.size() > n
|
||||||
lexemes = <LexemeC**>self._mem.alloc((tokens.size() - n) + 1, sizeof(LexemeC**))
|
lexemes = <LexemeC**>self._mem.alloc((tokens.size() - n) + 1, sizeof(LexemeC**))
|
||||||
cdef size_t i, j
|
cdef size_t i, j
|
||||||
|
@ -172,6 +170,9 @@ cdef class Language:
|
||||||
lexemes[i] = tokens.at(j)
|
lexemes[i] = tokens.at(j)
|
||||||
lexemes[i + 1] = NULL
|
lexemes[i + 1] = NULL
|
||||||
self.cache.set(key, lexemes)
|
self.cache.set(key, lexemes)
|
||||||
|
|
||||||
|
cdef int _split_body_token(self, vector[LexemeC*] *tokens, String* string) except -1:
|
||||||
|
tokens.push_back(self.lexicon.get(string))
|
||||||
|
|
||||||
cdef int _find_prefix(self, Py_UNICODE* chars, size_t length) except -1:
|
cdef int _find_prefix(self, Py_UNICODE* chars, size_t length) except -1:
|
||||||
cdef unicode string = chars[:length]
|
cdef unicode string = chars[:length]
|
||||||
|
@ -255,6 +256,8 @@ cdef class Lexicon:
|
||||||
|
|
||||||
|
|
||||||
cdef int _extend_from_map(vector[LexemeC*] *tokens, String* string, PreshMap map_) except -1:
|
cdef int _extend_from_map(vector[LexemeC*] *tokens, String* string, PreshMap map_) except -1:
|
||||||
|
if string.n == 0:
|
||||||
|
return 1
|
||||||
lexemes = <LexemeC**>map_.get(string.key)
|
lexemes = <LexemeC**>map_.get(string.key)
|
||||||
if lexemes == NULL:
|
if lexemes == NULL:
|
||||||
return 0
|
return 0
|
||||||
|
|
Loading…
Reference in New Issue
Block a user