mirror of
https://github.com/explosion/spaCy.git
* Add tokens_from_list method to Language

This commit is contained in:
parent da70b6bd60
commit c788633429
@@ -49,6 +49,7 @@ cdef class Language:
     cdef object _suffix_re
     cdef object _infix_re
 
+    cpdef Tokens tokens_from_list(self, list strings)
     cpdef Tokens tokenize(self, unicode text)
 
     cdef int _tokenize(self, Tokens tokens, String* span, int start, int end) except -1
@@ -49,6 +49,20 @@ cdef class Language:
         if path.exists(path.join(util.DATA_DIR, name, 'ner')):
            self.ner_tagger = NERParser(path.join(util.DATA_DIR, name, 'ner'))
 
+    cpdef Tokens tokens_from_list(self, list strings):
+        cdef int length = sum([len(s) for s in strings])
+        cdef Tokens tokens = Tokens(self.lexicon.strings, length)
+        if length == 0:
+            return tokens
+        cdef String string_struct
+        cdef unicode py_string
+        cdef int idx = 0
+        for i, py_string in enumerate(strings):
+            string_from_unicode(&string_struct, py_string)
+            tokens.push_back(idx, self.lexicon.get(&string_struct))
+            idx += len(py_string) + 1
+        return tokens
+
     cpdef Tokens tokenize(self, unicode string):
         """Tokenize a string.
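
Below the diff, a minimal usage sketch of the new method from Python. It assumes the English subclass of Language is importable as spacy.en.EN, as in the spaCy codebase of this period; that import path and the example input are assumptions for illustration, not part of this commit.

    # Hypothetical usage sketch, not part of this commit.
    # Assumes the English Language subclass is exposed as spacy.en.EN.
    from spacy.en import EN

    # Pre-split input: tokens_from_list makes each string exactly one token,
    # bypassing the tokenizer's own splitting rules.
    words = [u'Give', u'it', u'back', u'!']
    tokens = EN.tokens_from_list(words)

    # Character offsets are assigned as if the strings were joined by single
    # spaces: idx advances by len(word) + 1 for each token.

The idx bookkeeping mirrors the implementation above: because the original whitespace is not recoverable from a list of strings, the method records each token's character offset as if the inputs were separated by single spaces.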