Mirror of https://github.com/explosion/spaCy.git, synced 2024-12-26 09:56:28 +03:00
Switch to returning a Tokens object

parent 1a3222af4b
commit 563047e90f
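In practice, the change means tokenize() no longer hands back a Python list of Lexeme objects; it returns the Tokens instance itself, which now supports len() and indexing (see the hunks below). A minimal usage sketch, where `lang` is a hypothetical name for a constructed Language instance:

    tokens = lang.tokenize(u'an example sentence')  # lang: hypothetical Language instance
    n = len(tokens)     # served by the new Tokens.__len__
    first = tokens[0]   # the new Tokens.__getitem__ wraps the raw lexeme in a Lexeme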
@@ -30,7 +30,7 @@ cdef class Language:
     cpdef readonly Lexicon lexicon
     cpdef readonly object tokens_class
 
-    cpdef list tokenize(self, unicode text)
+    cpdef Tokens tokenize(self, unicode text)
     cpdef Lexeme lookup(self, unicode text)
 
     cdef _tokenize(self, Tokens tokens, unicode string)
@@ -62,7 +62,7 @@ cdef class Language:
         """
         return self.lexicon.lookup(string)
 
-    cpdef list tokenize(self, unicode string):
+    cpdef Tokens tokenize(self, unicode string):
         """Tokenize a string.
 
         The tokenization rules are defined in two places:
@@ -78,12 +78,12 @@ cdef class Language:
         tokens (Tokens): A Tokens object, giving access to a sequence of LexIDs.
         """
         cdef size_t length = len(string)
+        cdef Tokens tokens = self.tokens_class(length)
         if length == 0:
-            return []
+            return tokens
 
         cdef size_t start = 0
         cdef size_t i = 0
-        cdef Tokens tokens = self.tokens_class()
         for c in string:
             if c == ' ':
                 if start < i:
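Two things happen in this hunk: the Tokens buffer is now allocated up front, sized by the character count of the input (a safe upper bound on the token count, since every token consumes at least one character), and the empty-string early exit returns that empty buffer instead of [], so callers always receive the same type. A toy Python sketch of the same capacity reasoning, with TokenBuffer as a hypothetical stand-in for the Cython Tokens class:

    class TokenBuffer(object):
        # Toy stand-in for the Cython Tokens class: `size` is capacity,
        # `length` is how many slots are filled, mirroring the fields
        # visible in the Tokens hunk further down.
        def __init__(self, size):
            self.size = size
            self.length = 0
            self.lexemes = [None] * size

        def push_back(self, lexeme):
            assert self.length < self.size
            self.lexemes[self.length] = lexeme
            self.length += 1

    def tokenize(string):
        length = len(string)
        tokens = TokenBuffer(length)   # one slot per character is always enough
        if length == 0:
            return tokens              # empty input -> empty buffer, same type as always
        for word in string.split(' '):
            if word:
                tokens.push_back(word)  # the real code pushes lexeme structs, not strings
        return tokens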
@@ -92,11 +92,7 @@ cdef class Language:
             i += 1
         if start < i:
             self._tokenize(tokens, string[start:i])
-        assert tokens
-        output = []
-        for i in range(tokens.length):
-            output.append(Lexeme(<size_t>tokens.lexemes[i]))
-        return output
+        return tokens
 
     cdef _tokenize(self, Tokens tokens, unicode string):
         cdef list lexemes
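The deleted lines were the old eager copy: one Lexeme wrapper was constructed per token and appended to a fresh list before returning. Returning the Tokens buffer directly skips that extra pass; wrapping is deferred to the new __getitem__ in the next hunk, so it only happens for tokens the caller actually touches. Schematically, with wrap() as a stand-in for the Lexeme(<size_t>...) construction and `tokens` any buffer shaped like the TokenBuffer sketch above:

    def wrap(raw):
        # Stand-in for the Lexeme(<size_t>...) construction in the deleted lines.
        return ('Lexeme', raw)

    def as_list(tokens):
        # The removed code path: eagerly wrap every entry into a fresh Python
        # list, paying one allocation per token whether or not it is ever used.
        return [wrap(tokens.lexemes[i]) for i in range(tokens.length)]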
@@ -30,6 +30,14 @@ cdef class Tokens:
         self.size = size
         self.length = 0
 
+    def __getitem__(self, i):
+        if i >= self.length:
+            raise IndexError
+        return Lexeme(<size_t>self.lexemes[i])
+
+    def __len__(self):
+        return self.length
+
     def append(self, Lexeme lexeme):
         self.push_back(lexeme._c)
 
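A side effect worth noting: because __getitem__ takes ascending integer indices and raises IndexError past the end, Tokens also becomes iterable through Python's legacy sequence-iteration protocol, even though this hunk defines no __iter__. A quick demonstration with a plain Python stand-in:

    class Seq(object):
        # Minimal object with only __getitem__/__len__, like Tokens after this commit.
        def __init__(self, items):
            self._items = items
        def __getitem__(self, i):
            if i >= len(self._items):
                raise IndexError
            return self._items[i]
        def __len__(self):
            return len(self._items)

    for item in Seq(['a', 'b', 'c']):  # iteration falls back to __getitem__(0), (1), ...
        print(item)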