mirror of
https://github.com/explosion/spaCy.git
synced 2025-01-27 01:34:30 +03:00
* Moving to storing LexemeC structs internally
This commit is contained in:
parent
563047e90f
commit
bf9c60c31c
|
@ -95,20 +95,21 @@ cdef class Language:
|
||||||
return tokens
|
return tokens
|
||||||
|
|
||||||
cdef _tokenize(self, Tokens tokens, unicode string):
|
cdef _tokenize(self, Tokens tokens, unicode string):
|
||||||
cdef list lexemes
|
cdef LexemeC** lexemes
|
||||||
if len(string) == 1:
|
if string in self.cache:
|
||||||
lexemes = [self.lookup(string)]
|
lexemes = <LexemeC**><size_t>self.cache[string]
|
||||||
elif string in self.cache:
|
|
||||||
lexemes = self.cache[string]
|
|
||||||
else:
|
else:
|
||||||
lexemes = []
|
|
||||||
substrings = self._split(string)
|
substrings = self._split(string)
|
||||||
|
lexemes = <LexemeC**>calloc(len(substrings) + 1, sizeof(LexemeC*))
|
||||||
for i, substring in enumerate(substrings):
|
for i, substring in enumerate(substrings):
|
||||||
lexemes.append(self.lexicon.lookup(substring))
|
lexemes[i] = self.lexicon.lookup(substring)._c
|
||||||
self.cache[string] = lexemes
|
lexemes[i + 1] = NULL
|
||||||
cdef Lexeme lexeme
|
self.cache[string] = <size_t>lexemes
|
||||||
for lexeme in lexemes:
|
cdef LexemeC* lexeme
|
||||||
tokens.append(lexeme)
|
i = 0
|
||||||
|
while lexemes[i] != NULL:
|
||||||
|
tokens.push_back(lexemes[i])
|
||||||
|
i += 1
|
||||||
|
|
||||||
cdef list _split(self, unicode string):
|
cdef list _split(self, unicode string):
|
||||||
"""Find how to split a contiguous span of non-space characters into substrings.
|
"""Find how to split a contiguous span of non-space characters into substrings.
|
||||||
|
@ -147,11 +148,13 @@ cdef class Language:
|
||||||
token_rules (list): A list of (chunk, tokens) pairs, where chunk is
|
token_rules (list): A list of (chunk, tokens) pairs, where chunk is
|
||||||
a string and tokens is a list of strings.
|
a string and tokens is a list of strings.
|
||||||
'''
|
'''
|
||||||
|
cdef LexemeC** lexemes
|
||||||
for string, substrings in token_rules:
|
for string, substrings in token_rules:
|
||||||
lexemes = []
|
lexemes = <LexemeC**>calloc(len(substrings) + 1, sizeof(LexemeC*))
|
||||||
for i, substring in enumerate(substrings):
|
for i, substring in enumerate(substrings):
|
||||||
lexemes.append(self.lexicon.lookup(substring))
|
lexemes[i] = self.lexicon.lookup(substring)._c
|
||||||
self.cache[string] = lexemes
|
lexemes[i + 1] = NULL
|
||||||
|
self.cache[string] = <size_t>lexemes
|
||||||
|
|
||||||
|
|
||||||
cdef class Lexicon:
|
cdef class Lexicon:
|
||||||
|
|
Loading…
Reference in New Issue
Block a user