mirror of
https://github.com/explosion/spaCy.git
* Add tokens_from_list method to Language

This commit is contained in:
parent da70b6bd60
commit c788633429
@@ -49,6 +49,7 @@ cdef class Language:
     cdef object _suffix_re
     cdef object _infix_re
 
+    cpdef Tokens tokens_from_list(self, list strings)
     cpdef Tokens tokenize(self, unicode text)
 
     cdef int _tokenize(self, Tokens tokens, String* span, int start, int end) except -1
@@ -49,6 +49,20 @@ cdef class Language:
         if path.exists(path.join(util.DATA_DIR, name, 'ner')):
            self.ner_tagger = NERParser(path.join(util.DATA_DIR, name, 'ner'))
 
+    cpdef Tokens tokens_from_list(self, list strings):
+        cdef int length = sum([len(s) for s in strings])
+        cdef Tokens tokens = Tokens(self.lexicon.strings, length)
+        if length == 0:
+            return tokens
+        cdef String string_struct
+        cdef unicode py_string
+        cdef int idx = 0
+        for i, py_string in enumerate(strings):
+            string_from_unicode(&string_struct, py_string)
+            tokens.push_back(idx, self.lexicon.get(&string_struct))
+            idx += len(py_string) + 1
+        return tokens
+
     cpdef Tokens tokenize(self, unicode string):
         """Tokenize a string.
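
Below the diff, a minimal usage sketch of the new method from Python. It assumes the English subclass of Language is importable as spacy.en.EN, as in the spaCy codebase of this period; that import path and the example input are assumptions for illustration, not part of this commit.

    # Hypothetical usage sketch, not part of this commit.
    # Assumes the English Language subclass is exposed as spacy.en.EN.
    from spacy.en import EN

    # Pre-split input: tokens_from_list makes each string exactly one token,
    # bypassing the tokenizer's own splitting rules.
    words = [u'Give', u'it', u'back', u'!']
    tokens = EN.tokens_from_list(words)

    # Character offsets are assigned as if the strings were joined by single
    # spaces: idx advances by len(word) + 1 for each token.

The idx bookkeeping mirrors the implementation above: because the original whitespace is not recoverable from a list of strings, the method records each token's character offset as if the inputs were separated by single spaces.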