From 3ba66f2dc76b0bf483c12e4f15918c5cebe5994c Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Fri, 16 Oct 2015 04:54:16 +1100 Subject: [PATCH] * Add string length cap in Tokenizer.__call__ --- spacy/tokenizer.pyx | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/spacy/tokenizer.pyx b/spacy/tokenizer.pyx index d54770d2b..ef9c26c01 100644 --- a/spacy/tokenizer.pyx +++ b/spacy/tokenizer.pyx @@ -72,6 +72,10 @@ cdef class Tokenizer: Returns: tokens (Doc): A Doc object, giving access to a sequence of LexemeCs. """ + if len(string) >= (2 ** 30): + raise ValueError( + "String is too long: %d characters. Max is 2**30 - 1." % len(string) + ) cdef int length = len(string) cdef Doc tokens = Doc(self.vocab) if length == 0: