* Add tokens_from_list method to Language

This commit is contained in:
Matthew Honnibal 2014-11-11 23:43:14 +11:00
parent da70b6bd60
commit c788633429
2 changed files with 15 additions and 0 deletions

View File

@ -49,6 +49,7 @@ cdef class Language:
cdef object _suffix_re
cdef object _infix_re
# Build a Tokens object from a list of already-split strings.
cpdef Tokens tokens_from_list(self, list strings)
# Tokenize a unicode string into a Tokens object.
cpdef Tokens tokenize(self, unicode text)
# NOTE(review): presumably tokenizes `span` between `start` and `end` into
# `tokens` -- confirm against the implementation. `except -1` reserves -1 as
# the error return value so Python exceptions can propagate to the caller.
cdef int _tokenize(self, Tokens tokens, String* span, int start, int end) except -1

View File

@ -49,6 +49,20 @@ cdef class Language:
if path.exists(path.join(util.DATA_DIR, name, 'ner')):
self.ner_tagger = NERParser(path.join(util.DATA_DIR, name, 'ner'))
cpdef Tokens tokens_from_list(self, list strings):
    """Build a Tokens object from a list of already-split strings.

    Each string is looked up in ``self.lexicon`` and appended to the result
    together with a running character offset.

    Args:
        strings (list): Unicode strings, one per token.

    Returns:
        Tokens: One entry per input string, in order. If the combined
        length of all strings is zero, an empty Tokens object is returned.
    """
    cdef String string_struct
    cdef unicode py_string
    cdef int idx = 0
    # The summed character count is handed to the Tokens constructor and
    # doubles as the emptiness check below.
    cdef int length = sum([len(s) for s in strings])
    cdef Tokens tokens = Tokens(self.lexicon.strings, length)
    if length == 0:
        return tokens
    # Iterate directly: the position in the list is never needed, so there is
    # no reason to pay for enumerate() and an untyped index variable.
    for py_string in strings:
        string_from_unicode(&string_struct, py_string)
        tokens.push_back(idx, self.lexicon.get(&string_struct))
        # NOTE(review): the "+ 1" presumably accounts for a single separator
        # character between consecutive tokens -- confirm with callers.
        idx += len(py_string) + 1
    return tokens
cpdef Tokens tokenize(self, unicode string):
"""Tokenize a string.