mirror of
https://github.com/explosion/spaCy.git
synced 2025-01-13 02:36:32 +03:00
* Switch to returning a Tokens object
This commit is contained in:
parent
1a3222af4b
commit
563047e90f
|
@ -30,7 +30,7 @@ cdef class Language:
|
||||||
cpdef readonly Lexicon lexicon
|
cpdef readonly Lexicon lexicon
|
||||||
cpdef readonly object tokens_class
|
cpdef readonly object tokens_class
|
||||||
|
|
||||||
cpdef list tokenize(self, unicode text)
|
cpdef Tokens tokenize(self, unicode text)
|
||||||
cpdef Lexeme lookup(self, unicode text)
|
cpdef Lexeme lookup(self, unicode text)
|
||||||
|
|
||||||
cdef _tokenize(self, Tokens tokens, unicode string)
|
cdef _tokenize(self, Tokens tokens, unicode string)
|
||||||
|
|
|
@ -62,7 +62,7 @@ cdef class Language:
|
||||||
"""
|
"""
|
||||||
return self.lexicon.lookup(string)
|
return self.lexicon.lookup(string)
|
||||||
|
|
||||||
cpdef list tokenize(self, unicode string):
|
cpdef Tokens tokenize(self, unicode string):
|
||||||
"""Tokenize a string.
|
"""Tokenize a string.
|
||||||
|
|
||||||
The tokenization rules are defined in two places:
|
The tokenization rules are defined in two places:
|
||||||
|
@ -78,12 +78,12 @@ cdef class Language:
|
||||||
tokens (Tokens): A Tokens object, giving access to a sequence of LexIDs.
|
tokens (Tokens): A Tokens object, giving access to a sequence of LexIDs.
|
||||||
"""
|
"""
|
||||||
cdef size_t length = len(string)
|
cdef size_t length = len(string)
|
||||||
|
cdef Tokens tokens = self.tokens_class(length)
|
||||||
if length == 0:
|
if length == 0:
|
||||||
return []
|
return tokens
|
||||||
|
|
||||||
cdef size_t start = 0
|
cdef size_t start = 0
|
||||||
cdef size_t i = 0
|
cdef size_t i = 0
|
||||||
cdef Tokens tokens = self.tokens_class()
|
|
||||||
for c in string:
|
for c in string:
|
||||||
if c == ' ':
|
if c == ' ':
|
||||||
if start < i:
|
if start < i:
|
||||||
|
@ -92,11 +92,7 @@ cdef class Language:
|
||||||
i += 1
|
i += 1
|
||||||
if start < i:
|
if start < i:
|
||||||
self._tokenize(tokens, string[start:i])
|
self._tokenize(tokens, string[start:i])
|
||||||
assert tokens
|
return tokens
|
||||||
output = []
|
|
||||||
for i in range(tokens.length):
|
|
||||||
output.append(Lexeme(<size_t>tokens.lexemes[i]))
|
|
||||||
return output
|
|
||||||
|
|
||||||
cdef _tokenize(self, Tokens tokens, unicode string):
|
cdef _tokenize(self, Tokens tokens, unicode string):
|
||||||
cdef list lexemes
|
cdef list lexemes
|
||||||
|
|
|
@ -30,6 +30,14 @@ cdef class Tokens:
|
||||||
self.size = size
|
self.size = size
|
||||||
self.length = 0
|
self.length = 0
|
||||||
|
|
||||||
|
def __getitem__(self, i):
|
||||||
|
if i >= self.length:
|
||||||
|
raise IndexError
|
||||||
|
return Lexeme(<size_t>self.lexemes[i])
|
||||||
|
|
||||||
|
def __len__(self):
|
||||||
|
return self.length
|
||||||
|
|
||||||
def append(self, Lexeme lexeme):
|
def append(self, Lexeme lexeme):
|
||||||
self.push_back(lexeme._c)
|
self.push_back(lexeme._c)
|
||||||
|
|
||||||
|
|
Loading…
Reference in New Issue
Block a user