diff --git a/spacy/lang.pxd b/spacy/lang.pxd
index 371beb51c..68f1ee58a 100644
--- a/spacy/lang.pxd
+++ b/spacy/lang.pxd
@@ -49,6 +49,7 @@ cdef class Language:
     cdef object _suffix_re
     cdef object _infix_re
 
+    cpdef Tokens tokens_from_list(self, list strings)
     cpdef Tokens tokenize(self, unicode text)
 
     cdef int _tokenize(self, Tokens tokens, String* span, int start, int end) except -1
diff --git a/spacy/lang.pyx b/spacy/lang.pyx
index 7b9f12dd1..79a84e936 100644
--- a/spacy/lang.pyx
+++ b/spacy/lang.pyx
@@ -49,6 +49,35 @@ cdef class Language:
         if path.exists(path.join(util.DATA_DIR, name, 'ner')):
             self.ner_tagger = NERParser(path.join(util.DATA_DIR, name, 'ner'))
 
+    cpdef Tokens tokens_from_list(self, list strings):
+        """Build a Tokens object from a list of already-split strings.
+
+        Each string is looked up in the lexicon and pushed as one token.
+        Token offsets assume one separator character between consecutive
+        strings (idx advances by len(string) + 1).
+
+        Args:
+            strings (list): Unicode strings, one per token.
+
+        Returns:
+            Tokens: The populated tokens; empty when the combined length
+                of all strings is zero.
+        """
+        cdef String string_struct
+        cdef unicode py_string
+        cdef int idx = 0
+        # Total character count, passed to Tokens as its second argument
+        # (presumably a size hint -- confirm against Tokens.__init__).
+        cdef int length = sum([len(s) for s in strings])
+        cdef Tokens tokens = Tokens(self.lexicon.strings, length)
+        if length == 0:
+            return tokens
+        for py_string in strings:
+            string_from_unicode(&string_struct, py_string)
+            tokens.push_back(idx, self.lexicon.get(&string_struct))
+            idx += len(py_string) + 1
+        return tokens
+
     cpdef Tokens tokenize(self, unicode string):
         """Tokenize a string.