mirror of
https://github.com/explosion/spaCy.git
synced 2025-01-11 17:56:30 +03:00
* Add tokens_from_list method to Language
This commit is contained in:
parent
da70b6bd60
commit
c788633429
|
@ -49,6 +49,7 @@ cdef class Language:
|
|||
cdef object _suffix_re
|
||||
cdef object _infix_re
|
||||
|
||||
cpdef Tokens tokens_from_list(self, list strings)
|
||||
cpdef Tokens tokenize(self, unicode text)
|
||||
|
||||
cdef int _tokenize(self, Tokens tokens, String* span, int start, int end) except -1
|
||||
|
|
|
@ -49,6 +49,20 @@ cdef class Language:
|
|||
if path.exists(path.join(util.DATA_DIR, name, 'ner')):
|
||||
self.ner_tagger = NERParser(path.join(util.DATA_DIR, name, 'ner'))
|
||||
|
||||
cpdef Tokens tokens_from_list(self, list strings):
    """Build a Tokens object from a list of already-split unicode strings.

    Every string is converted to a String struct, passed to
    ``self.lexicon.get`` and pushed onto the result as one entry; the
    strings are not split any further here.

    Args:
        strings: The unicode strings to add, in order.

    Returns:
        A Tokens instance with one pushed entry per input string, or an
        empty Tokens instance when the strings hold no characters in total.
    """
    cdef String key
    cdef unicode word
    cdef int offset = 0
    # Total character count; passed to the Tokens constructor and also used
    # to detect the "nothing to add" case.
    cdef int n_chars = sum([len(piece) for piece in strings])
    cdef Tokens output = Tokens(self.lexicon.strings, n_chars)
    if n_chars != 0:
        for word in strings:
            string_from_unicode(&key, word)
            output.push_back(offset, self.lexicon.get(&key))
            # Advance past this string plus one extra position
            # (presumably a single separating space -- TODO confirm).
            offset += len(word) + 1
    return output
|
||||
|
||||
cpdef Tokens tokenize(self, unicode string):
|
||||
"""Tokenize a string.
|
||||
|
||||
|
|
Loading…
Reference in New Issue
Block a user