Mirror of https://github.com/explosion/spaCy.git
Rename 'tokens' variable to 'doc' in tokenizer
This commit is contained in:
parent 839de87ca9
commit f45973848c
@@ -79,9 +79,9 @@ cdef class Tokenizer:
                 "String is too long: %d characters. Max is 2**30." % len(string)
             )
         cdef int length = len(string)
-        cdef Doc tokens = Doc(self.vocab)
+        cdef Doc doc = Doc(self.vocab)
         if length == 0:
-            return tokens
+            return doc
         cdef int i = 0
         cdef int start = 0
         cdef bint cache_hit
@@ -100,11 +100,11 @@ cdef class Tokenizer:
                     # we don't have to create the slice when we hit the cache.
                     span = string[start:i]
                     key = hash_string(span)
-                    cache_hit = self._try_cache(key, tokens)
+                    cache_hit = self._try_cache(key, doc)
                     if not cache_hit:
-                        self._tokenize(tokens, span, key)
+                        self._tokenize(doc, span, key)
                 if uc == ' ':
-                    tokens.c[tokens.length - 1].spacy = True
+                    doc.c[doc.length - 1].spacy = True
                     start = i + 1
                 else:
                     start = i
@@ -113,11 +113,11 @@ cdef class Tokenizer:
         if start < i:
             span = string[start:]
             key = hash_string(span)
-            cache_hit = self._try_cache(key, tokens)
+            cache_hit = self._try_cache(key, doc)
             if not cache_hit:
-                self._tokenize(tokens, span, key)
-            tokens.c[tokens.length - 1].spacy = string[-1] == ' ' and not in_ws
-        return tokens
+                self._tokenize(doc, span, key)
+            doc.c[doc.length - 1].spacy = string[-1] == ' ' and not in_ws
+        return doc
 
     def pipe(self, texts, batch_size=1000, n_threads=2):
         """Tokenize a stream of texts.
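For context, the renamed `doc` variable is the `Doc` object that callers get back from the tokenizer. A minimal usage sketch follows; it assumes a current spaCy install, and `spacy.blank` plus the attribute names shown come from spaCy's public API rather than from this commit itself.

import spacy

# Build a blank English pipeline; nlp.tokenizer is an instance of the
# Tokenizer class touched by this commit.
nlp = spacy.blank("en")

# Calling the tokenizer on a string returns a Doc -- the object that this
# commit now names `doc` inside __call__.
doc = nlp.tokenizer("Hello  world !")

for token in doc:
    # token.whitespace_ corresponds to the `spacy` flag set in the diff above:
    # whether the token was followed by a single trailing space.
    print(repr(token.text), repr(token.whitespace_))

# pipe() tokenizes a stream of texts, yielding one Doc per input string.
for d in nlp.tokenizer.pipe(["First text.", "Second text."]):
    print(len(d), d.text)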