Rename 'tokens' variable to 'doc' in tokenizer

Matthew Honnibal 2017-10-17 18:21:41 +02:00
parent 839de87ca9
commit f45973848c


@@ -79,9 +79,9 @@ cdef class Tokenizer:
                 "String is too long: %d characters. Max is 2**30." % len(string)
             )
         cdef int length = len(string)
-        cdef Doc tokens = Doc(self.vocab)
+        cdef Doc doc = Doc(self.vocab)
         if length == 0:
-            return tokens
+            return doc
         cdef int i = 0
         cdef int start = 0
         cdef bint cache_hit
@@ -100,11 +100,11 @@ cdef class Tokenizer:
                 # we don't have to create the slice when we hit the cache.
                 span = string[start:i]
                 key = hash_string(span)
-                cache_hit = self._try_cache(key, tokens)
+                cache_hit = self._try_cache(key, doc)
                 if not cache_hit:
-                    self._tokenize(tokens, span, key)
+                    self._tokenize(doc, span, key)
                 if uc == ' ':
-                    tokens.c[tokens.length - 1].spacy = True
+                    doc.c[doc.length - 1].spacy = True
                     start = i + 1
                 else:
                     start = i
@@ -113,11 +113,11 @@ cdef class Tokenizer:
         if start < i:
             span = string[start:]
             key = hash_string(span)
-            cache_hit = self._try_cache(key, tokens)
+            cache_hit = self._try_cache(key, doc)
             if not cache_hit:
-                self._tokenize(tokens, span, key)
-            tokens.c[tokens.length - 1].spacy = string[-1] == ' ' and not in_ws
-        return tokens
+                self._tokenize(doc, span, key)
+            doc.c[doc.length - 1].spacy = string[-1] == ' ' and not in_ws
+        return doc
 
     def pipe(self, texts, batch_size=1000, n_threads=2):
         """Tokenize a stream of texts.