Mirror of https://github.com/explosion/spaCy.git
Add input length error, to address #1826
commit 23afa6429f
parent cca7e7ad11
@@ -111,7 +111,7 @@ class Language(object):
         'merge_entities': lambda nlp, **cfg: merge_entities
     }

-    def __init__(self, vocab=True, make_doc=True, meta={}, **kwargs):
+    def __init__(self, vocab=True, make_doc=True, max_length=10**6, meta={}, **kwargs):
         """Initialise a Language object.

         vocab (Vocab): A `Vocab` object. If `True`, a vocab is created via
@@ -126,6 +126,15 @@ class Language(object):
             string occurs in both, the component is not loaded.
         meta (dict): Custom meta data for the Language class. Is written to by
             models to add model meta data.
+        max_length (int):
+            Maximum number of characters in a single text. The current v2
+            models may run out of memory on extremely long texts, due to
+            large internal allocations. You should segment these texts into
+            meaningful units, e.g. paragraphs, subsections etc., before
+            passing them to spaCy. The default maximum length is 1,000,000
+            characters (1MB). As a rule of thumb, if all pipeline components
+            are enabled, spaCy's default models currently require roughly
+            1GB of temporary memory per 100,000 characters in one text.
         RETURNS (Language): The newly constructed object.
         """
         self._meta = dict(meta)
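The new docstring advises segmenting oversized texts into meaningful units before they reach the pipeline. A minimal sketch of that advice (the paragraph-splitting helper and the en_core_web_sm model are assumptions for illustration, not part of this commit):

import spacy

nlp = spacy.load('en_core_web_sm')  # assumed model; any v2 pipeline works

def iter_chunks(text, max_length):
    # Hypothetical helper: naive paragraph split. Real segmentation
    # logic depends on the structure of your corpus.
    for paragraph in text.split('\n\n'):
        if len(paragraph) < max_length:
            yield paragraph

long_text = '\n\n'.join(['Some paragraph of text.'] * 100000)  # stand-in input
docs = list(nlp.pipe(iter_chunks(long_text, nlp.max_length)))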
@@ -141,6 +150,7 @@ class Language(object):
             make_doc = factory(self, **meta.get('tokenizer', {}))
         self.tokenizer = make_doc
         self.pipeline = []
+        self.max_length = max_length
         self._optimizer = None

     @property
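Because max_length is stored as a plain instance attribute, it can also be raised per pipeline at runtime. A hedged example, assuming the parser and NER are disabled so the large temporary allocations the docstring warns about don't apply:

import spacy

# Load without the memory-hungry components, then raise the limit.
nlp = spacy.load('en_core_web_sm', disable=['parser', 'ner'])
nlp.max_length = 2 * 10**6  # accept up to 2,000,000 characters per text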
@@ -340,6 +350,17 @@ class Language(object):
             >>> tokens[0].text, tokens[0].head.tag_
             ('An', 'NN')
         """
+        if len(text) >= self.max_length:
+            msg = (
+                "Text of length {length} exceeds maximum of {max_length}. "
+                "The v2 parser and NER models require roughly 1GB of temporary "
+                "memory per 100,000 characters in the input. This means long "
+                "texts may cause memory allocation errors. If you're not using "
+                "the parser or NER, it's probably safe to increase the "
+                "nlp.max_length limit. The limit is in number of characters, "
+                "so you can check whether your inputs are too long by checking "
+                "len(text).")
+            raise ValueError(msg.format(length=len(text), max_length=self.max_length))
         doc = self.make_doc(text)
         for name, proc in self.pipeline:
            if name in disable:
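The check runs before tokenization, so an oversized input fails fast with a ValueError instead of exhausting memory mid-pipeline. A small sketch of what a caller sees (the model name is an assumption):

import spacy

nlp = spacy.load('en_core_web_sm')
text = 'a' * (nlp.max_length + 1)  # one character over the limit

try:
    doc = nlp(text)
except ValueError as err:
    # The message reports both len(text) and nlp.max_length, so the
    # caller can decide whether to segment the input or raise the limit.
    print(err)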