Custom warning if the doc_bin is too large (#8069)

* custom warning if the doc_bin is too large

* cleanup

* Update spacy/errors.py

Co-authored-by: Adriane Boyd <adrianeboyd@gmail.com>

* fix numbering

* fixing numbering once more

* fixing this seems to be pretty hard

Co-authored-by: Adriane Boyd <adrianeboyd@gmail.com>
This commit is contained in:
Sofie Van Landeghem 2021-05-17 15:48:40 +02:00 committed by GitHub
parent b120fb3511
commit 0dffc5d9e2
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
2 changed files with 9 additions and 1 deletions

View File

@@ -490,6 +490,11 @@ class Errors:
     E202 = ("Unsupported alignment mode '{mode}'. Supported modes: {modes}.")
 
     # New errors added in v3.x
+    E870 = ("Could not serialize the DocBin because it is too large. Consider "
+            "splitting up your documents into several doc bins and serializing "
+            "each separately. spacy.Corpus.v1 will search recursively for all "
+            "*.spacy files if you provide a directory instead of a filename as "
+            "the 'path'.")
     E871 = ("Error encountered in nlp.pipe with multiprocessing:\n\n{error}")
     E872 = ("Unable to copy tokenizer from base model due to different "
             'tokenizer settings: current tokenizer config "{curr_config}" '

View File

@@ -246,7 +246,10 @@ class DocBin:
         """
         path = ensure_path(path)
         with path.open("wb") as file_:
-            file_.write(self.to_bytes())
+            try:
+                file_.write(self.to_bytes())
+            except ValueError:
+                raise ValueError(Errors.E870)
 
     def from_disk(self, path: Union[str, Path]) -> "DocBin":
         """Load the DocBin from a file (typically called .spacy).