mirror of
https://github.com/explosion/spaCy.git
synced 2024-11-11 04:08:09 +03:00
* Add string length cap in Tokenizer.__call__
This commit is contained in:
parent
17fffb4c57
commit
3ba66f2dc7
|
@@ -72,6 +72,10 @@ cdef class Tokenizer:
|
||||||
Returns:
|
Returns:
|
||||||
tokens (Doc): A Doc object, giving access to a sequence of LexemeCs.
|
tokens (Doc): A Doc object, giving access to a sequence of LexemeCs.
|
||||||
"""
|
"""
|
||||||
|
if len(string) >= (2 ** 30):
|
||||||
|
raise ValueError(
|
||||||
|
"String is too long: %d characters. Max is 2**30." % len(string)
|
||||||
|
)
|
||||||
cdef int length = len(string)
|
cdef int length = len(string)
|
||||||
cdef Doc tokens = Doc(self.vocab)
|
cdef Doc tokens = Doc(self.vocab)
|
||||||
if length == 0:
|
if length == 0:
|
||||||
|
|
Loading…
Reference in New Issue
Block a user