💫 Make serialization methods consistent (#3385)

* Make serialization methods consistent

exclude keyword argument instead of random named keyword arguments and deprecation handling

* Update docs and add section on serialization fields
This commit is contained in:
Ines Montani 2019-03-10 19:16:45 +01:00 committed by Matthew Honnibal
parent 9a8f169e5c
commit 7ba3a5d95c
25 changed files with 598 additions and 314 deletions

View File

@ -70,6 +70,12 @@ class Warnings(object):
W013 = ("As of v2.1.0, {obj}.merge is deprecated. Please use the more "
"efficient and less error-prone Doc.retokenize context manager "
"instead.")
# Emitted through deprecation_warning when a serialization method receives
# the legacy `disable` argument instead of `exclude`.
W014 = ("As of v2.1.0, the `disable` keyword argument on the serialization "
"methods is deprecated and should be replaced with `exclude`. This "
"makes it consistent with the other serializable objects.")
W015 = ("As of v2.1.0, the use of keyword arguments to exclude fields from "
"being serialized or deserialized is deprecated. Please use the "
"`exclude` argument instead. For example: exclude=['{arg}'].")
@add_codes
@ -348,7 +354,10 @@ class Errors(object):
"This is likely a bug in spaCy, so feel free to open an issue.")
E127 = ("Cannot create phrase pattern representation for length 0. This "
"is likely a bug in spaCy.")
E128 = ("Unsupported serialization argument: '{arg}'. The use of keyword "
"arguments to exclude fields from being serialized or deserialized "
"is now deprecated. Please use the `exclude` argument instead. "
"For example: exclude=['{arg}'].")
@add_codes

View File

@ -28,7 +28,7 @@ from .lang.punctuation import TOKENIZER_INFIXES
from .lang.tokenizer_exceptions import TOKEN_MATCH
from .lang.tag_map import TAG_MAP
from .lang.lex_attrs import LEX_ATTRS, is_stop
from .errors import Errors
from .errors import Errors, Warnings, deprecation_warning
from . import util
from . import about
@ -699,124 +699,114 @@ class Language(object):
self.tokenizer._reset_cache(keys)
nr_seen = 0
def to_disk(self, path, disable=tuple()):
def to_disk(self, path, exclude=tuple(), disable=None):
"""Save the current state to a directory. If a model is loaded, this
will include the model.
path (unicode or Path): A path to a directory, which will be created if
it doesn't exist. Paths may be strings or `Path`-like objects.
disable (list): Names of pipeline components to disable and prevent
from being saved.
path (unicode or Path): Path to a directory, which will be created if
it doesn't exist.
exclude (list): Names of components or serialization fields to exclude.
EXAMPLE:
>>> nlp.to_disk('/path/to/models')
DOCS: https://spacy.io/api/language#to_disk
"""
if disable is not None:
deprecation_warning(Warnings.W014)
exclude = disable
path = util.ensure_path(path)
serializers = OrderedDict(
(
("tokenizer", lambda p: self.tokenizer.to_disk(p, vocab=False)),
("meta.json", lambda p: p.open("w").write(srsly.json_dumps(self.meta))),
)
)
serializers = OrderedDict()
serializers["tokenizer"] = lambda p: self.tokenizer.to_disk(p, exclude=["vocab"])
serializers["meta.json"] = lambda p: p.open("w").write(srsly.json_dumps(self.meta))
for name, proc in self.pipeline:
if not hasattr(proc, "name"):
continue
if name in disable:
if name in exclude:
continue
if not hasattr(proc, "to_disk"):
continue
serializers[name] = lambda p, proc=proc: proc.to_disk(p, vocab=False)
serializers[name] = lambda p, proc=proc: proc.to_disk(p, exclude=["vocab"])
serializers["vocab"] = lambda p: self.vocab.to_disk(p)
util.to_disk(path, serializers, {p: False for p in disable})
util.to_disk(path, serializers, exclude)
def from_disk(self, path, disable=tuple()):
def from_disk(self, path, exclude=tuple(), disable=None):
"""Loads state from a directory. Modifies the object in place and
returns it. If the saved `Language` object contains a model, the
model will be loaded.
path (unicode or Path): A path to a directory. Paths may be either
strings or `Path`-like objects.
disable (list): Names of the pipeline components to disable.
path (unicode or Path): A path to a directory.
exclude (list): Names of components or serialization fields to exclude.
RETURNS (Language): The modified `Language` object.
EXAMPLE:
>>> from spacy.language import Language
>>> nlp = Language().from_disk('/path/to/models')
DOCS: https://spacy.io/api/language#from_disk
"""
if disable is not None:
deprecation_warning(Warnings.W014)
exclude = disable
path = util.ensure_path(path)
deserializers = OrderedDict(
(
("meta.json", lambda p: self.meta.update(srsly.read_json(p))),
(
"vocab",
lambda p: (
self.vocab.from_disk(p) and _fix_pretrained_vectors_name(self)
),
),
("tokenizer", lambda p: self.tokenizer.from_disk(p, vocab=False)),
)
)
deserializers = OrderedDict()
deserializers["meta.json"] = lambda p: self.meta.update(srsly.read_json(p))
deserializers["vocab"] = lambda p: self.vocab.from_disk(p) and _fix_pretrained_vectors_name(self)
deserializers["tokenizer"] = lambda p: self.tokenizer.from_disk(p, exclude=["vocab"])
for name, proc in self.pipeline:
if name in disable:
if name in exclude:
continue
if not hasattr(proc, "from_disk"):
continue
deserializers[name] = lambda p, proc=proc: proc.from_disk(p, vocab=False)
exclude = {p: False for p in disable}
if not (path / "vocab").exists():
exclude["vocab"] = True
deserializers[name] = lambda p, proc=proc: proc.from_disk(p, exclude=["vocab"])
if not (path / "vocab").exists() and "vocab" not in exclude:
# Convert to list here in case exclude is (default) tuple
exclude = list(exclude) + ["vocab"]
util.from_disk(path, deserializers, exclude)
self._path = path
return self
def to_bytes(self, disable=[], **exclude):
def to_bytes(self, exclude=tuple(), disable=None, **kwargs):
"""Serialize the current state to a binary string.
disable (list): Nameds of pipeline components to disable and prevent
from being serialized.
exclude (list): Names of components or serialization fields to exclude.
RETURNS (bytes): The serialized form of the `Language` object.
DOCS: https://spacy.io/api/language#to_bytes
"""
serializers = OrderedDict(
(
("vocab", lambda: self.vocab.to_bytes()),
("tokenizer", lambda: self.tokenizer.to_bytes(vocab=False)),
("meta", lambda: srsly.json_dumps(self.meta)),
)
)
for i, (name, proc) in enumerate(self.pipeline):
if name in disable:
if disable is not None:
deprecation_warning(Warnings.W014)
exclude = disable
serializers = OrderedDict()
serializers["vocab"] = lambda: self.vocab.to_bytes()
serializers["tokenizer"] = lambda: self.tokenizer.to_bytes(exclude=["vocab"])
serializers["meta.json"] = lambda: srsly.json_dumps(self.meta)
for name, proc in self.pipeline:
if name in exclude:
continue
if not hasattr(proc, "to_bytes"):
continue
serializers[i] = lambda proc=proc: proc.to_bytes(vocab=False)
serializers[name] = lambda proc=proc: proc.to_bytes(exclude=["vocab"])
exclude = util.get_serialization_exclude(serializers, exclude, kwargs)
return util.to_bytes(serializers, exclude)
def from_bytes(self, bytes_data, disable=[]):
def from_bytes(self, bytes_data, exclude=tuple(), disable=None, **kwargs):
"""Load state from a binary string.
bytes_data (bytes): The data to load from.
disable (list): Names of the pipeline components to disable.
exclude (list): Names of components or serialization fields to exclude.
RETURNS (Language): The `Language` object.
DOCS: https://spacy.io/api/language#from_bytes
"""
deserializers = OrderedDict(
(
("meta", lambda b: self.meta.update(srsly.json_loads(b))),
(
"vocab",
lambda b: (
self.vocab.from_bytes(b) and _fix_pretrained_vectors_name(self)
),
),
("tokenizer", lambda b: self.tokenizer.from_bytes(b, vocab=False)),
)
)
for i, (name, proc) in enumerate(self.pipeline):
if name in disable:
if disable is not None:
deprecation_warning(Warnings.W014)
exclude = disable
deserializers = OrderedDict()
deserializers["meta.json"] = lambda b: self.meta.update(srsly.json_loads(b))
deserializers["vocab"] = lambda b: self.vocab.from_bytes(b) and _fix_pretrained_vectors_name(self)
deserializers["tokenizer"] = lambda b: self.tokenizer.from_bytes(b, exclude=["vocab"])
for name, proc in self.pipeline:
if name in exclude:
continue
if not hasattr(proc, "from_bytes"):
continue
deserializers[i] = lambda b, proc=proc: proc.from_bytes(b, vocab=False)
util.from_bytes(bytes_data, deserializers, {})
deserializers[name] = lambda b, proc=proc: proc.from_bytes(b, exclude=["vocab"])
exclude = util.get_serialization_exclude(deserializers, exclude, kwargs)
util.from_bytes(bytes_data, deserializers, exclude)
return self

View File

@ -141,16 +141,21 @@ class Pipe(object):
with self.model.use_params(params):
yield
def to_bytes(self, **exclude):
"""Serialize the pipe to a bytestring."""
def to_bytes(self, exclude=tuple(), **kwargs):
"""Serialize the pipe to a bytestring.
exclude (list): String names of serialization fields to exclude.
RETURNS (bytes): The serialized object.
"""
serialize = OrderedDict()
serialize["cfg"] = lambda: srsly.json_dumps(self.cfg)
if self.model not in (True, False, None):
serialize["model"] = self.model.to_bytes
serialize["vocab"] = self.vocab.to_bytes
exclude = util.get_serialization_exclude(serialize, exclude, kwargs)
return util.to_bytes(serialize, exclude)
def from_bytes(self, bytes_data, **exclude):
def from_bytes(self, bytes_data, exclude=tuple(), **kwargs):
"""Load the pipe from a bytestring."""
def load_model(b):
@ -161,26 +166,25 @@ class Pipe(object):
self.model = self.Model(**self.cfg)
self.model.from_bytes(b)
deserialize = OrderedDict(
(
("cfg", lambda b: self.cfg.update(srsly.json_loads(b))),
("vocab", lambda b: self.vocab.from_bytes(b)),
("model", load_model),
)
)
deserialize = OrderedDict()
deserialize["cfg"] = lambda b: self.cfg.update(srsly.json_loads(b))
deserialize["vocab"] = lambda b: self.vocab.from_bytes(b)
deserialize["model"] = load_model
exclude = util.get_serialization_exclude(deserialize, exclude, kwargs)
util.from_bytes(bytes_data, deserialize, exclude)
return self
def to_disk(self, path, **exclude):
def to_disk(self, path, exclude=tuple(), **kwargs):
"""Serialize the pipe to disk."""
serialize = OrderedDict()
serialize["cfg"] = lambda p: srsly.write_json(p, self.cfg)
serialize["vocab"] = lambda p: self.vocab.to_disk(p)
if self.model not in (None, True, False):
serialize["model"] = lambda p: p.open("wb").write(self.model.to_bytes())
exclude = util.get_serialization_exclude(serialize, exclude, kwargs)
util.to_disk(path, serialize, exclude)
def from_disk(self, path, **exclude):
def from_disk(self, path, exclude=tuple(), **kwargs):
"""Load the pipe from disk."""
def load_model(p):
@ -191,13 +195,11 @@ class Pipe(object):
self.model = self.Model(**self.cfg)
self.model.from_bytes(p.open("rb").read())
deserialize = OrderedDict(
(
("cfg", lambda p: self.cfg.update(_load_cfg(p))),
("vocab", lambda p: self.vocab.from_disk(p)),
("model", load_model),
)
)
deserialize = OrderedDict()
deserialize["cfg"] = lambda p: self.cfg.update(_load_cfg(p))
deserialize["vocab"] = lambda p: self.vocab.from_disk(p)
deserialize["model"] = load_model
exclude = util.get_serialization_exclude(deserialize, exclude, kwargs)
util.from_disk(path, deserialize, exclude)
return self
@ -537,7 +539,7 @@ class Tagger(Pipe):
with self.model.use_params(params):
yield
def to_bytes(self, **exclude):
def to_bytes(self, exclude=tuple(), **kwargs):
serialize = OrderedDict()
if self.model not in (None, True, False):
serialize["model"] = self.model.to_bytes
@ -545,9 +547,10 @@ class Tagger(Pipe):
serialize["cfg"] = lambda: srsly.json_dumps(self.cfg)
tag_map = OrderedDict(sorted(self.vocab.morphology.tag_map.items()))
serialize["tag_map"] = lambda: srsly.msgpack_dumps(tag_map)
exclude = util.get_serialization_exclude(serialize, exclude, kwargs)
return util.to_bytes(serialize, exclude)
def from_bytes(self, bytes_data, **exclude):
def from_bytes(self, bytes_data, exclude=tuple(), **kwargs):
def load_model(b):
# TODO: Remove this once we don't have to handle previous models
if self.cfg.get("pretrained_dims") and "pretrained_vectors" not in self.cfg:
@ -572,20 +575,22 @@ class Tagger(Pipe):
("cfg", lambda b: self.cfg.update(srsly.json_loads(b))),
("model", lambda b: load_model(b)),
))
exclude = util.get_serialization_exclude(deserialize, exclude, kwargs)
util.from_bytes(bytes_data, deserialize, exclude)
return self
def to_disk(self, path, **exclude):
def to_disk(self, path, exclude=tuple(), **kwargs):
tag_map = OrderedDict(sorted(self.vocab.morphology.tag_map.items()))
serialize = OrderedDict((
('vocab', lambda p: self.vocab.to_disk(p)),
('tag_map', lambda p: srsly.write_msgpack(p, tag_map)),
('model', lambda p: p.open("wb").write(self.model.to_bytes())),
('cfg', lambda p: srsly.write_json(p, self.cfg))
("vocab", lambda p: self.vocab.to_disk(p)),
("tag_map", lambda p: srsly.write_msgpack(p, tag_map)),
("model", lambda p: p.open("wb").write(self.model.to_bytes())),
("cfg", lambda p: srsly.write_json(p, self.cfg))
))
exclude = util.get_serialization_exclude(serialize, exclude, kwargs)
util.to_disk(path, serialize, exclude)
def from_disk(self, path, **exclude):
def from_disk(self, path, exclude=tuple(), **kwargs):
def load_model(p):
# TODO: Remove this once we don't have to handle previous models
if self.cfg.get("pretrained_dims") and "pretrained_vectors" not in self.cfg:
@ -608,6 +613,7 @@ class Tagger(Pipe):
("tag_map", load_tag_map),
("model", load_model),
))
exclude = util.get_serialization_exclude(deserialize, exclude, kwargs)
util.from_disk(path, deserialize, exclude)
return self

View File

@ -236,19 +236,17 @@ cdef class StringStore:
self.add(word)
return self
def to_bytes(self, **exclude):
def to_bytes(self, **kwargs):
"""Serialize the current state to a binary string.
**exclude: Named attributes to prevent from being serialized.
RETURNS (bytes): The serialized form of the `StringStore` object.
"""
return srsly.json_dumps(list(self))
def from_bytes(self, bytes_data, **exclude):
def from_bytes(self, bytes_data, **kwargs):
"""Load state from a binary string.
bytes_data (bytes): The data to load from.
**exclude: Named attributes to prevent from being loaded.
RETURNS (StringStore): The `StringStore` object.
"""
strings = srsly.json_loads(bytes_data)

View File

@ -603,22 +603,24 @@ cdef class Parser:
self.cfg.update(cfg)
return sgd
def to_disk(self, path, **exclude):
def to_disk(self, path, exclude=tuple(), **kwargs):
serializers = {
'model': lambda p: (self.model.to_disk(p) if self.model is not True else True),
'vocab': lambda p: self.vocab.to_disk(p),
'moves': lambda p: self.moves.to_disk(p, strings=False),
'moves': lambda p: self.moves.to_disk(p, exclude=["strings"]),
'cfg': lambda p: srsly.write_json(p, self.cfg)
}
exclude = util.get_serialization_exclude(serializers, exclude, kwargs)
util.to_disk(path, serializers, exclude)
def from_disk(self, path, **exclude):
def from_disk(self, path, exclude=tuple(), **kwargs):
deserializers = {
'vocab': lambda p: self.vocab.from_disk(p),
'moves': lambda p: self.moves.from_disk(p, strings=False),
'moves': lambda p: self.moves.from_disk(p, exclude=["strings"]),
'cfg': lambda p: self.cfg.update(srsly.read_json(p)),
'model': lambda p: None
}
exclude = util.get_serialization_exclude(deserializers, exclude, kwargs)
util.from_disk(path, deserializers, exclude)
if 'model' not in exclude:
path = util.ensure_path(path)
@ -632,22 +634,24 @@ cdef class Parser:
self.cfg.update(cfg)
return self
def to_bytes(self, **exclude):
def to_bytes(self, exclude=tuple(), **kwargs):
serializers = OrderedDict((
('model', lambda: (self.model.to_bytes() if self.model is not True else True)),
('vocab', lambda: self.vocab.to_bytes()),
('moves', lambda: self.moves.to_bytes(strings=False)),
('moves', lambda: self.moves.to_bytes(exclude=["strings"])),
('cfg', lambda: srsly.json_dumps(self.cfg, indent=2, sort_keys=True))
))
exclude = util.get_serialization_exclude(serializers, exclude, kwargs)
return util.to_bytes(serializers, exclude)
def from_bytes(self, bytes_data, **exclude):
def from_bytes(self, bytes_data, exclude=tuple(), **kwargs):
deserializers = OrderedDict((
('vocab', lambda b: self.vocab.from_bytes(b)),
('moves', lambda b: self.moves.from_bytes(b, strings=False)),
('moves', lambda b: self.moves.from_bytes(b, exclude=["strings"])),
('cfg', lambda b: self.cfg.update(srsly.json_loads(b))),
('model', lambda b: None)
))
exclude = util.get_serialization_exclude(deserializers, exclude, kwargs)
msg = util.from_bytes(bytes_data, deserializers, exclude)
if 'model' not in exclude:
# TODO: Remove this once we don't have to handle previous models

View File

@ -208,30 +208,32 @@ cdef class TransitionSystem:
self.labels[action][label_name] = new_freq-1
return 1
def to_disk(self, path, **exclude):
def to_disk(self, path, **kwargs):
with path.open('wb') as file_:
file_.write(self.to_bytes(**exclude))
file_.write(self.to_bytes(**kwargs))
def from_disk(self, path, **exclude):
def from_disk(self, path, **kwargs):
with path.open('rb') as file_:
byte_data = file_.read()
self.from_bytes(byte_data, **exclude)
self.from_bytes(byte_data, **kwargs)
return self
def to_bytes(self, **exclude):
def to_bytes(self, exclude=tuple(), **kwargs):
transitions = []
serializers = {
'moves': lambda: srsly.json_dumps(self.labels),
'strings': lambda: self.strings.to_bytes()
}
exclude = util.get_serialization_exclude(serializers, exclude, kwargs)
return util.to_bytes(serializers, exclude)
def from_bytes(self, bytes_data, **exclude):
def from_bytes(self, bytes_data, exclude=tuple(), **kwargs):
labels = {}
deserializers = {
'moves': lambda b: labels.update(srsly.json_loads(b)),
'strings': lambda b: self.strings.from_bytes(b)
}
exclude = util.get_serialization_exclude(deserializers, exclude, kwargs)
msg = util.from_bytes(bytes_data, deserializers, exclude)
self.initialize_actions(labels)
return self

View File

@ -113,14 +113,14 @@ def test_doc_api_serialize(en_tokenizer, text):
assert [t.orth for t in tokens] == [t.orth for t in new_tokens]
new_tokens = Doc(tokens.vocab).from_bytes(
tokens.to_bytes(tensor=False), tensor=False
tokens.to_bytes(exclude=["tensor"]), exclude=["tensor"]
)
assert tokens.text == new_tokens.text
assert [t.text for t in tokens] == [t.text for t in new_tokens]
assert [t.orth for t in tokens] == [t.orth for t in new_tokens]
new_tokens = Doc(tokens.vocab).from_bytes(
tokens.to_bytes(sentiment=False), sentiment=False
tokens.to_bytes(exclude=["sentiment"]), exclude=["sentiment"]
)
assert tokens.text == new_tokens.text
assert [t.text for t in tokens] == [t.text for t in new_tokens]

View File

@ -1,6 +1,7 @@
# coding: utf-8
from __future__ import unicode_literals
import pytest
from spacy.tokens import Doc
from spacy.compat import path2str
@ -41,3 +42,18 @@ def test_serialize_doc_roundtrip_disk_str_path(en_vocab):
doc.to_disk(file_path)
doc_d = Doc(en_vocab).from_disk(file_path)
assert doc.to_bytes() == doc_d.to_bytes()
def test_serialize_doc_exclude(en_vocab):
    """Check the `exclude` argument of Doc.to_bytes and Doc.from_bytes.

    The user data must survive a plain roundtrip, must be dropped when
    "user_data" is excluded on either side, and the pre-v2.1 keyword
    argument style must raise a ValueError.
    """
    doc = Doc(en_vocab, words=["hello", "world"])
    doc.user_data["foo"] = "bar"

    # Plain roundtrip: user data is preserved
    new_doc = Doc(en_vocab).from_bytes(doc.to_bytes())
    assert new_doc.user_data["foo"] == "bar"

    # Excluded while loading
    new_doc = Doc(en_vocab).from_bytes(doc.to_bytes(), exclude=["user_data"])
    assert not new_doc.user_data

    # Excluded while saving
    new_doc = Doc(en_vocab).from_bytes(doc.to_bytes(exclude=["user_data"]))
    assert not new_doc.user_data

    # Legacy keyword arguments are rejected on both methods
    with pytest.raises(ValueError):
        doc.to_bytes(user_data=False)
    with pytest.raises(ValueError):
        Doc(en_vocab).from_bytes(doc.to_bytes(), tensor=False)

View File

@ -52,3 +52,19 @@ def test_serialize_with_custom_tokenizer():
nlp.tokenizer = custom_tokenizer(nlp)
with make_tempdir() as d:
nlp.to_disk(d)
def test_serialize_language_exclude(meta_data):
    """Check the `exclude` argument of Language.to_bytes and from_bytes.

    The meta must survive a plain roundtrip, must not be carried over when
    "meta" is excluded on either side, and the pre-v2.1 keyword argument
    style must raise a ValueError.

    meta_data: Fixture providing the meta dict; assumed to set "name" to
        "name-in-fixture" (TODO confirm against the fixture definition).
    """
    name = "name-in-fixture"
    nlp = Language(meta=meta_data)
    assert nlp.meta["name"] == name

    # Plain roundtrip: inspect the *deserialized* object, not the source
    # pipeline, otherwise the roundtrip itself is never verified.
    new_nlp = Language().from_bytes(nlp.to_bytes())
    assert new_nlp.meta["name"] == name

    # Excluded while loading
    new_nlp = Language().from_bytes(nlp.to_bytes(), exclude=["meta"])
    assert new_nlp.meta["name"] != name

    # Excluded while saving
    new_nlp = Language().from_bytes(nlp.to_bytes(exclude=["meta"]))
    assert new_nlp.meta["name"] != name

    # Legacy keyword arguments are rejected on both methods
    with pytest.raises(ValueError):
        nlp.to_bytes(meta=False)
    with pytest.raises(ValueError):
        Language().from_bytes(nlp.to_bytes(), meta=False)

View File

@ -55,7 +55,9 @@ def test_serialize_parser_roundtrip_disk(en_vocab, Parser):
parser_d = Parser(en_vocab)
parser_d.model, _ = parser_d.Model(0)
parser_d = parser_d.from_disk(file_path)
assert parser.to_bytes(model=False) == parser_d.to_bytes(model=False)
parser_bytes = parser.to_bytes(exclude=["model"])
parser_d_bytes = parser_d.to_bytes(exclude=["model"])
assert parser_bytes == parser_d_bytes
def test_to_from_bytes(parser, blank_parser):
@ -114,3 +116,25 @@ def test_serialize_textcat_empty(en_vocab):
# See issue #1105
textcat = TextCategorizer(en_vocab, labels=["ENTITY", "ACTION", "MODIFIER"])
textcat.to_bytes()
@pytest.mark.parametrize("Parser", test_parsers)
def test_serialize_pipe_exclude(en_vocab, Parser):
    """Check the `exclude` argument of a parser's to_bytes and from_bytes.

    A custom cfg entry must survive a plain roundtrip, must be dropped when
    "cfg" is excluded on either side, and the pre-v2.1 keyword argument
    style must raise a ValueError.
    """

    def get_new_parser():
        # Fresh parser with an initialised model, ready to load bytes into
        fresh = Parser(en_vocab)
        fresh.model, _ = fresh.Model(0)
        return fresh

    parser = get_new_parser()
    parser.cfg["foo"] = "bar"

    # Plain roundtrip: the cfg entry is preserved
    new_parser = get_new_parser().from_bytes(parser.to_bytes())
    assert "foo" in new_parser.cfg

    # Excluded while loading
    new_parser = get_new_parser().from_bytes(parser.to_bytes(), exclude=["cfg"])
    assert "foo" not in new_parser.cfg

    # Excluded while saving
    new_parser = get_new_parser().from_bytes(parser.to_bytes(exclude=["cfg"]))
    assert "foo" not in new_parser.cfg

    # Legacy keyword arguments are rejected on both methods
    with pytest.raises(ValueError):
        parser.to_bytes(cfg=False)
    with pytest.raises(ValueError):
        get_new_parser().from_bytes(parser.to_bytes(), cfg=False)

View File

@ -360,36 +360,37 @@ cdef class Tokenizer:
self._cache.set(key, cached)
self._rules[string] = substrings
def to_disk(self, path, **exclude):
def to_disk(self, path, **kwargs):
"""Save the current state to a directory.
path (unicode or Path): A path to a directory, which will be created if
it doesn't exist. Paths may be either strings or Path-like objects.
it doesn't exist.
exclude (list): String names of serialization fields to exclude.
DOCS: https://spacy.io/api/tokenizer#to_disk
"""
with path.open("wb") as file_:
file_.write(self.to_bytes(**exclude))
file_.write(self.to_bytes(**kwargs))
def from_disk(self, path, **exclude):
def from_disk(self, path, **kwargs):
"""Loads state from a directory. Modifies the object in place and
returns it.
path (unicode or Path): A path to a directory. Paths may be either
strings or `Path`-like objects.
path (unicode or Path): A path to a directory.
exclude (list): String names of serialization fields to exclude.
RETURNS (Tokenizer): The modified `Tokenizer` object.
DOCS: https://spacy.io/api/tokenizer#from_disk
"""
with path.open("rb") as file_:
bytes_data = file_.read()
self.from_bytes(bytes_data, **exclude)
self.from_bytes(bytes_data, **kwargs)
return self
def to_bytes(self, **exclude):
def to_bytes(self, exclude=tuple(), **kwargs):
"""Serialize the current state to a binary string.
**exclude: Named attributes to prevent from being serialized.
exclude (list): String names of serialization fields to exclude.
RETURNS (bytes): The serialized form of the `Tokenizer` object.
DOCS: https://spacy.io/api/tokenizer#to_bytes
@ -402,13 +403,14 @@ cdef class Tokenizer:
("token_match", lambda: _get_regex_pattern(self.token_match)),
("exceptions", lambda: OrderedDict(sorted(self._rules.items())))
))
exclude = util.get_serialization_exclude(serializers, exclude, kwargs)
return util.to_bytes(serializers, exclude)
def from_bytes(self, bytes_data, **exclude):
def from_bytes(self, bytes_data, exclude=tuple(), **kwargs):
"""Load state from a binary string.
bytes_data (bytes): The data to load from.
**exclude: Named attributes to prevent from being loaded.
exclude (list): String names of serialization fields to exclude.
RETURNS (Tokenizer): The `Tokenizer` object.
DOCS: https://spacy.io/api/tokenizer#from_bytes
@ -422,6 +424,7 @@ cdef class Tokenizer:
("token_match", lambda b: data.setdefault("token_match", b)),
("exceptions", lambda b: data.setdefault("rules", b))
))
exclude = util.get_serialization_exclude(deserializers, exclude, kwargs)
msg = util.from_bytes(bytes_data, deserializers, exclude)
if data.get("prefix_search"):
self.prefix_search = re.compile(data["prefix_search"]).search

View File

@ -794,24 +794,26 @@ cdef class Doc:
"""
return numpy.asarray(_get_lca_matrix(self, 0, len(self)))
def to_disk(self, path, **exclude):
def to_disk(self, path, **kwargs):
"""Save the current state to a directory.
path (unicode or Path): A path to a directory, which will be created if
it doesn't exist. Paths may be either strings or Path-like objects.
exclude (list): String names of serialization fields to exclude.
DOCS: https://spacy.io/api/doc#to_disk
"""
path = util.ensure_path(path)
with path.open("wb") as file_:
file_.write(self.to_bytes(**exclude))
file_.write(self.to_bytes(**kwargs))
def from_disk(self, path, **exclude):
def from_disk(self, path, **kwargs):
"""Loads state from a directory. Modifies the object in place and
returns it.
path (unicode or Path): A path to a directory. Paths may be either
strings or `Path`-like objects.
exclude (list): String names of serialization fields to exclude.
RETURNS (Doc): The modified `Doc` object.
DOCS: https://spacy.io/api/doc#from_disk
@ -819,11 +821,12 @@ cdef class Doc:
path = util.ensure_path(path)
with path.open("rb") as file_:
bytes_data = file_.read()
return self.from_bytes(bytes_data, **exclude)
return self.from_bytes(bytes_data, **kwargs)
def to_bytes(self, **exclude):
def to_bytes(self, exclude=tuple(), **kwargs):
"""Serialize, i.e. export the document contents to a binary string.
exclude (list): String names of serialization fields to exclude.
RETURNS (bytes): A losslessly serialized copy of the `Doc`, including
all annotations.
@ -849,16 +852,22 @@ cdef class Doc:
"sentiment": lambda: self.sentiment,
"tensor": lambda: self.tensor,
}
for key in kwargs:
if key in serializers or key in ("user_data", "user_data_keys", "user_data_values"):
raise ValueError(Errors.E128.format(arg=key))
if "user_data" not in exclude and self.user_data:
user_data_keys, user_data_values = list(zip(*self.user_data.items()))
if "user_data_keys" not in exclude:
serializers["user_data_keys"] = lambda: srsly.msgpack_dumps(user_data_keys)
if "user_data_values" not in exclude:
serializers["user_data_values"] = lambda: srsly.msgpack_dumps(user_data_values)
return util.to_bytes(serializers, exclude)
def from_bytes(self, bytes_data, **exclude):
def from_bytes(self, bytes_data, exclude=tuple(), **kwargs):
"""Deserialize, i.e. import the document contents from a binary string.
bytes_data (bytes): The data to load from.
exclude (list): String names of serialization fields to exclude.
RETURNS (Doc): Itself.
DOCS: https://spacy.io/api/doc#from_bytes
@ -874,6 +883,9 @@ cdef class Doc:
"user_data_keys": lambda b: None,
"user_data_values": lambda b: None,
}
for key in kwargs:
if key in deserializers or key in ("user_data",):
raise ValueError(Errors.E128.format(arg=key))
msg = util.from_bytes(bytes_data, deserializers, exclude)
# Msgpack doesn't distinguish between lists and tuples, which is
# vexing for user data. As a best guess, we *know* that within
@ -1170,7 +1182,7 @@ cdef int [:,:] _get_lca_matrix(Doc doc, int start, int end):
def pickle_doc(doc):
bytes_data = doc.to_bytes(vocab=False, user_data=False)
bytes_data = doc.to_bytes(exclude=["vocab", "user_data"])
hooks_and_data = (doc.user_data, doc.user_hooks, doc.user_span_hooks,
doc.user_token_hooks)
return (unpickle_doc, (doc.vocab, srsly.pickle_dumps(hooks_and_data), bytes_data))
@ -1179,7 +1191,7 @@ def pickle_doc(doc):
def unpickle_doc(vocab, hooks_and_data, bytes_data):
user_data, doc_hooks, span_hooks, token_hooks = srsly.pickle_loads(hooks_and_data)
doc = Doc(vocab, user_data=user_data).from_bytes(bytes_data, exclude="user_data")
doc = Doc(vocab, user_data=user_data).from_bytes(bytes_data, exclude=["user_data"])
doc.user_hooks.update(doc_hooks)
doc.user_span_hooks.update(span_hooks)
doc.user_token_hooks.update(token_hooks)

View File

@ -25,7 +25,7 @@ except ImportError:
from .symbols import ORTH
from .compat import cupy, CudaStream, path2str, basestring_, unicode_
from .compat import import_file
from .errors import Errors
from .errors import Errors, Warnings, deprecation_warning
LANGUAGES = {}
@ -565,7 +565,8 @@ def itershuffle(iterable, bufsize=1000):
def to_bytes(getters, exclude):
serialized = OrderedDict()
for key, getter in getters.items():
if key not in exclude:
# Split to support file names like meta.json
if key.split(".")[0] not in exclude:
serialized[key] = getter()
return srsly.msgpack_dumps(serialized)
@ -573,7 +574,8 @@ def to_bytes(getters, exclude):
def from_bytes(bytes_data, setters, exclude):
msg = srsly.msgpack_loads(bytes_data)
for key, setter in setters.items():
if key not in exclude and key in msg:
# Split to support file names like meta.json
if key.split(".")[0] not in exclude and key in msg:
setter(msg[key])
return msg
@ -583,7 +585,8 @@ def to_disk(path, writers, exclude):
if not path.exists():
path.mkdir()
for key, writer in writers.items():
if key not in exclude:
# Split to support file names like meta.json
if key.split(".")[0] not in exclude:
writer(path / key)
return path
@ -591,7 +594,8 @@ def to_disk(path, writers, exclude):
def from_disk(path, readers, exclude):
path = ensure_path(path)
for key, reader in readers.items():
if key not in exclude:
# Split to support file names like meta.json
if key.split(".")[0] not in exclude:
reader(path / key)
return path
@ -677,6 +681,23 @@ def validate_json(data, validator):
return errors
def get_serialization_exclude(serializers, exclude, kwargs):
    """Build the list of field names to skip during (de)serialization.

    Bridges the pre-v2.1 style, where fields were switched off through
    keyword arguments, and the explicit `exclude` argument.

    serializers (iterable): Names of the available serialization fields.
    exclude (iterable): Field names excluded explicitly by the caller.
    kwargs (dict): Extra keyword arguments received by the calling method.
    RETURNS (list): A new list containing the field names to exclude.
    RAISES (ValueError): If a keyword argument names a known field, other
        than the still-tolerated `vocab=False`.
    """
    excluded = list(exclude)
    # Only the part before the first "." counts, so "meta.json" matches "meta"
    known_fields = [field.split(".")[0] for field in serializers]
    for arg_name, arg_value in kwargs.items():
        is_legacy_vocab = arg_name in ("vocab",) and arg_value is False
        if is_legacy_vocab:
            # vocab=False is still accepted, but triggers a deprecation warning
            deprecation_warning(Warnings.W015.format(arg=arg_name))
            excluded.append(arg_name)
            continue
        if arg_name.split(".")[0] in known_fields:
            # Any other keyword naming a known field is rejected outright
            raise ValueError(Errors.E128.format(arg=arg_name))
        # TODO: user warning?
    return excluded
class SimpleFrozenDict(dict):
"""Simplified implementation of a frozen dict, mainly used as default
function or method argument (for arguments that should default to empty
@ -696,14 +717,14 @@ class SimpleFrozenDict(dict):
class DummyTokenizer(object):
# add dummy methods for to_bytes, from_bytes, to_disk and from_disk to
# allow serialization (see #1557)
def to_bytes(self, **exclude):
def to_bytes(self, **kwargs):
return b""
def from_bytes(self, _bytes_data, **exclude):
def from_bytes(self, _bytes_data, **kwargs):
return self
def to_disk(self, _path, **exclude):
def to_disk(self, _path, **kwargs):
return None
def from_disk(self, _path, **exclude):
def from_disk(self, _path, **kwargs):
return self

View File

@ -377,11 +377,11 @@ cdef class Vectors:
self.add(key, row=i)
return strings
def to_disk(self, path, **exclude):
def to_disk(self, path, **kwargs):
"""Save the current state to a directory.
path (unicode / Path): A path to a directory, which will be created if
it doesn't exists. Either a string or a Path-like object.
it doesn't exist.
DOCS: https://spacy.io/api/vectors#to_disk
"""
@ -394,9 +394,9 @@ cdef class Vectors:
("vectors", lambda p: save_array(self.data, p.open("wb"))),
("key2row", lambda p: srsly.write_msgpack(p, self.key2row))
))
return util.to_disk(path, serializers, exclude)
return util.to_disk(path, serializers, [])
def from_disk(self, path, **exclude):
def from_disk(self, path, **kwargs):
"""Loads state from a directory. Modifies the object in place and
returns it.
@ -428,13 +428,13 @@ cdef class Vectors:
("keys", load_keys),
("vectors", load_vectors),
))
util.from_disk(path, serializers, exclude)
util.from_disk(path, serializers, [])
return self
def to_bytes(self, **exclude):
def to_bytes(self, **kwargs):
"""Serialize the current state to a binary string.
**exclude: Named attributes to prevent from being serialized.
exclude (list): String names of serialization fields to exclude.
RETURNS (bytes): The serialized form of the `Vectors` object.
DOCS: https://spacy.io/api/vectors#to_bytes
@ -444,17 +444,18 @@ cdef class Vectors:
return self.data.to_bytes()
else:
return srsly.msgpack_dumps(self.data)
serializers = OrderedDict((
("key2row", lambda: srsly.msgpack_dumps(self.key2row)),
("vectors", serialize_weights)
))
return util.to_bytes(serializers, exclude)
return util.to_bytes(serializers, [])
def from_bytes(self, data, **exclude):
def from_bytes(self, data, **kwargs):
"""Load state from a binary string.
data (bytes): The data to load from.
**exclude: Named attributes to prevent from being loaded.
exclude (list): String names of serialization fields to exclude.
RETURNS (Vectors): The `Vectors` object.
DOCS: https://spacy.io/api/vectors#from_bytes
@ -469,5 +470,5 @@ cdef class Vectors:
("key2row", lambda b: self.key2row.update(srsly.msgpack_loads(b))),
("vectors", deserialize_weights)
))
util.from_bytes(data, deserializers, exclude)
util.from_bytes(data, deserializers, [])
return self

View File

@ -397,47 +397,57 @@ cdef class Vocab:
orth = self.strings.add(orth)
return orth in self.vectors
def to_disk(self, path, **exclude):
def to_disk(self, path, exclude=tuple(), **kwargs):
"""Save the current state to a directory.
path (unicode or Path): A path to a directory, which will be created if
it doesn't exist. Paths may be either strings or Path-like objects.
it doesn't exist.
exclude (list): String names of serialization fields to exclude.
DOCS: https://spacy.io/api/vocab#to_disk
"""
path = util.ensure_path(path)
if not path.exists():
path.mkdir()
setters = ["strings", "lexemes", "vectors"]
exclude = util.get_serialization_exclude(setters, exclude, kwargs)
if "strings" not in exclude:
self.strings.to_disk(path / "strings.json")
with (path / "lexemes.bin").open('wb') as file_:
if "lexemes" not in exclude:
with (path / "lexemes.bin").open("wb") as file_:
file_.write(self.lexemes_to_bytes())
if self.vectors is not None:
if "vectors" not in exclude and self.vectors is not None:
self.vectors.to_disk(path)
def from_disk(self, path, **exclude):
def from_disk(self, path, exclude=tuple(), **kwargs):
"""Loads state from a directory. Modifies the object in place and
returns it.
path (unicode or Path): A path to a directory. Paths may be either
strings or `Path`-like objects.
path (unicode or Path): A path to a directory.
exclude (list): String names of serialization fields to exclude.
RETURNS (Vocab): The modified `Vocab` object.
DOCS: https://spacy.io/api/vocab#from_disk
"""
path = util.ensure_path(path)
self.strings.from_disk(path / "strings.json")
getters = ["strings", "lexemes", "vectors"]
exclude = util.get_serialization_exclude(getters, exclude, kwargs)
if "strings" not in exclude:
self.strings.from_disk(path / "strings.json") # TODO: add exclude?
if "lexemes" not in exclude:
with (path / "lexemes.bin").open("rb") as file_:
self.lexemes_from_bytes(file_.read())
if "vectors" not in exclude:
if self.vectors is not None:
self.vectors.from_disk(path, exclude="strings.json")
self.vectors.from_disk(path, exclude=["strings"])
if self.vectors.name is not None:
link_vectors_to_models(self)
return self
def to_bytes(self, **exclude):
def to_bytes(self, exclude=tuple(), **kwargs):
"""Serialize the current state to a binary string.
**exclude: Named attributes to prevent from being serialized.
exclude (list): String names of serialization fields to exclude.
RETURNS (bytes): The serialized form of the `Vocab` object.
DOCS: https://spacy.io/api/vocab#to_bytes
@ -453,13 +463,14 @@ cdef class Vocab:
("lexemes", lambda: self.lexemes_to_bytes()),
("vectors", deserialize_vectors)
))
exclude = util.get_serialization_exclude(getters, exclude, kwargs)
return util.to_bytes(getters, exclude)
def from_bytes(self, bytes_data, **exclude):
def from_bytes(self, bytes_data, exclude=tuple(), **kwargs):
"""Load state from a binary string.
bytes_data (bytes): The data to load from.
**exclude: Named attributes to prevent from being loaded.
exclude (list): String names of serialization fields to exclude.
RETURNS (Vocab): The `Vocab` object.
DOCS: https://spacy.io/api/vocab#from_bytes
@ -469,11 +480,13 @@ cdef class Vocab:
return None
else:
return self.vectors.from_bytes(b)
setters = OrderedDict((
("strings", lambda b: self.strings.from_bytes(b)),
("lexemes", lambda b: self.lexemes_from_bytes(b)),
("vectors", lambda b: serialize_vectors(b))
))
exclude = util.get_serialization_exclude(setters, exclude, kwargs)
util.from_bytes(bytes_data, setters, exclude)
if self.vectors.name is not None:
link_vectors_to_models(self)

View File

@ -245,8 +245,9 @@ Serialize the pipe to disk.
> ```
| Name | Type | Description |
| ------ | ---------------- | --------------------------------------------------------------------------------------------------------------------- |
| --------- | ---------------- | --------------------------------------------------------------------------------------------------------------------- |
| `path` | unicode / `Path` | A path to a directory, which will be created if it doesn't exist. Paths may be either strings or `Path`-like objects. |
| `exclude` | list | String names of [serialization fields](#serialization-fields) to exclude. |
## DependencyParser.from_disk {#from_disk tag="method"}
@ -262,6 +263,7 @@ Load the pipe from disk. Modifies the object in place and returns it.
| Name | Type | Description |
| ----------- | ------------------ | -------------------------------------------------------------------------- |
| `path` | unicode / `Path` | A path to a directory. Paths may be either strings or `Path`-like objects. |
| `exclude` | list | String names of [serialization fields](#serialization-fields) to exclude. |
| **RETURNS** | `DependencyParser` | The modified `DependencyParser` object. |
## DependencyParser.to_bytes {#to_bytes tag="method"}
@ -276,8 +278,8 @@ Load the pipe from disk. Modifies the object in place and returns it.
Serialize the pipe to a bytestring.
| Name | Type | Description |
| ----------- | ----- | ----------------------------------------------------- |
| `**exclude` | - | Named attributes to prevent from being serialized. |
| ----------- | ----- | ------------------------------------------------------------------------- |
| `exclude` | list | String names of [serialization fields](#serialization-fields) to exclude. |
| **RETURNS** | bytes | The serialized form of the `DependencyParser` object. |
## DependencyParser.from_bytes {#from_bytes tag="method"}
@ -293,9 +295,9 @@ Load the pipe from a bytestring. Modifies the object in place and returns it.
> ```
| Name | Type | Description |
| ------------ | ------------------ | ---------------------------------------------- |
| ------------ | ------------------ | ------------------------------------------------------------------------- |
| `bytes_data` | bytes | The data to load from. |
| `**exclude` | - | Named attributes to prevent from being loaded. |
| `exclude` | list | String names of [serialization fields](#serialization-fields) to exclude. |
| **RETURNS** | `DependencyParser` | The `DependencyParser` object. |
## DependencyParser.labels {#labels tag="property"}
@ -312,3 +314,21 @@ The labels currently added to the component.
| Name | Type | Description |
| ----------- | ----- | ---------------------------------- |
| **RETURNS** | tuple | The labels added to the component. |
## Serialization fields {#serialization-fields}
During serialization, spaCy will export several data fields used to restore
different aspects of the object. If needed, you can exclude them from
serialization by passing in the string names via the `exclude` argument.
> #### Example
>
> ```python
> parser.to_disk("/path", exclude=["vocab"])
> ```
| Name | Description |
| ------- | -------------------------------------------------------------- |
| `vocab` | The shared [`Vocab`](/api/vocab). |
| `cfg` | The config file. You usually don't want to exclude this. |
| `model` | The binary model data. You usually don't want to exclude this. |

View File

@ -350,9 +350,10 @@ array of attributes.
> ```
| Name | Type | Description |
| ----------- | -------------------------------------- | ----------------------------- |
| ----------- | -------------------------------------- | ------------------------------------------------------------------------- |
| `attrs` | list | A list of attribute ID ints. |
| `array` | `numpy.ndarray[ndim=2, dtype='int32']` | The attribute values to load. |
| `exclude` | list | String names of [serialization fields](#serialization-fields) to exclude. |
| **RETURNS** | `Doc` | Itself. |
## Doc.to_disk {#to_disk tag="method" new="2"}
@ -366,8 +367,9 @@ Save the current state to a directory.
> ```
| Name | Type | Description |
| ------ | ---------------- | --------------------------------------------------------------------------------------------------------------------- |
| --------- | ---------------- | --------------------------------------------------------------------------------------------------------------------- |
| `path` | unicode / `Path` | A path to a directory, which will be created if it doesn't exist. Paths may be either strings or `Path`-like objects. |
| `exclude` | list | String names of [serialization fields](#serialization-fields) to exclude. |
## Doc.from_disk {#from_disk tag="method" new="2"}
@ -384,6 +386,7 @@ Loads state from a directory. Modifies the object in place and returns it.
| Name | Type | Description |
| ----------- | ---------------- | -------------------------------------------------------------------------- |
| `path` | unicode / `Path` | A path to a directory. Paths may be either strings or `Path`-like objects. |
| `exclude` | list | String names of [serialization fields](#serialization-fields) to exclude. |
| **RETURNS** | `Doc` | The modified `Doc` object. |
## Doc.to_bytes {#to_bytes tag="method"}
@ -398,7 +401,8 @@ Serialize, i.e. export the document contents to a binary string.
> ```
| Name | Type | Description |
| ----------- | ----- | --------------------------------------------------------------------- |
| ----------- | ----- | ------------------------------------------------------------------------- |
| `exclude` | list | String names of [serialization fields](#serialization-fields) to exclude. |
| **RETURNS** | bytes | A losslessly serialized copy of the `Doc`, including all annotations. |
## Doc.from_bytes {#from_bytes tag="method"}
@ -417,8 +421,9 @@ Deserialize, i.e. import the document contents from a binary string.
> ```
| Name | Type | Description |
| ----------- | ----- | ------------------------ |
| ----------- | ----- | ------------------------------------------------------------------------- |
| `data` | bytes | The string to load from. |
| `exclude` | list | String names of [serialization fields](#serialization-fields) to exclude. |
| **RETURNS** | `Doc` | The `Doc` object. |
## Doc.retokenize {#retokenize tag="contextmanager" new="2.1"}
@ -658,3 +663,25 @@ The L2 norm of the document's vector representation.
| `user_token_hooks` | dict | A dictionary that allows customization of properties of `Token` children. |
| `user_span_hooks` | dict | A dictionary that allows customization of properties of `Span` children. |
| `_` | `Underscore` | User space for adding custom [attribute extensions](/usage/processing-pipelines#custom-components-attributes). |
## Serialization fields {#serialization-fields}
During serialization, spaCy will export several data fields used to restore
different aspects of the object. If needed, you can exclude them from
serialization by passing in the string names via the `exclude` argument.
> #### Example
>
> ```python
> data = doc.to_bytes(exclude=["text", "tensor"])
> doc.from_disk("./doc.bin", exclude=["user_data"])
> ```
| Name | Description |
| ------------------ | --------------------------------------------- |
| `text` | The value of the `Doc.text` attribute. |
| `sentiment` | The value of the `Doc.sentiment` attribute. |
| `tensor` | The value of the `Doc.tensor` attribute. |
| `user_data` | The value of the `Doc.user_data` dictionary. |
| `user_data_keys` | The keys of the `Doc.user_data` dictionary. |
| `user_data_values` | The values of the `Doc.user_data` dictionary. |

View File

@ -245,8 +245,9 @@ Serialize the pipe to disk.
> ```
| Name | Type | Description |
| ------ | ---------------- | --------------------------------------------------------------------------------------------------------------------- |
| --------- | ---------------- | --------------------------------------------------------------------------------------------------------------------- |
| `path` | unicode / `Path` | A path to a directory, which will be created if it doesn't exist. Paths may be either strings or `Path`-like objects. |
| `exclude` | list | String names of [serialization fields](#serialization-fields) to exclude. |
## EntityRecognizer.from_disk {#from_disk tag="method"}
@ -262,6 +263,7 @@ Load the pipe from disk. Modifies the object in place and returns it.
| Name | Type | Description |
| ----------- | ------------------ | -------------------------------------------------------------------------- |
| `path` | unicode / `Path` | A path to a directory. Paths may be either strings or `Path`-like objects. |
| `exclude` | list | String names of [serialization fields](#serialization-fields) to exclude. |
| **RETURNS** | `EntityRecognizer` | The modified `EntityRecognizer` object. |
## EntityRecognizer.to_bytes {#to_bytes tag="method"}
@ -276,8 +278,8 @@ Load the pipe from disk. Modifies the object in place and returns it.
Serialize the pipe to a bytestring.
| Name | Type | Description |
| ----------- | ----- | ----------------------------------------------------- |
| `**exclude` | - | Named attributes to prevent from being serialized. |
| ----------- | ----- | ------------------------------------------------------------------------- |
| `exclude` | list | String names of [serialization fields](#serialization-fields) to exclude. |
| **RETURNS** | bytes | The serialized form of the `EntityRecognizer` object. |
## EntityRecognizer.from_bytes {#from_bytes tag="method"}
@ -293,9 +295,9 @@ Load the pipe from a bytestring. Modifies the object in place and returns it.
> ```
| Name | Type | Description |
| ------------ | ------------------ | ---------------------------------------------- |
| ------------ | ------------------ | ------------------------------------------------------------------------- |
| `bytes_data` | bytes | The data to load from. |
| `**exclude` | - | Named attributes to prevent from being loaded. |
| `exclude` | list | String names of [serialization fields](#serialization-fields) to exclude. |
| **RETURNS** | `EntityRecognizer` | The `EntityRecognizer` object. |
## EntityRecognizer.labels {#labels tag="property"}
@ -312,3 +314,21 @@ The labels currently added to the component.
| Name | Type | Description |
| ----------- | ----- | ---------------------------------- |
| **RETURNS** | tuple | The labels added to the component. |
## Serialization fields {#serialization-fields}
During serialization, spaCy will export several data fields used to restore
different aspects of the object. If needed, you can exclude them from
serialization by passing in the string names via the `exclude` argument.
> #### Example
>
> ```python
> ner.to_disk("/path", exclude=["vocab"])
> ```
| Name | Description |
| ------- | -------------------------------------------------------------- |
| `vocab` | The shared [`Vocab`](/api/vocab). |
| `cfg` | The config file. You usually don't want to exclude this. |
| `model` | The binary model data. You usually don't want to exclude this. |

View File

@ -327,7 +327,7 @@ the model**.
| Name | Type | Description |
| --------- | ---------------- | --------------------------------------------------------------------------------------------------------------------- |
| `path` | unicode / `Path` | A path to a directory, which will be created if it doesn't exist. Paths may be either strings or `Path`-like objects. |
| `disable` | list | Names of pipeline components to [disable](/usage/processing-pipelines#disabling) and prevent from being saved. |
| `exclude` | list | Names of pipeline components or [serialization fields](#serialization-fields) to exclude. |
## Language.from_disk {#from_disk tag="method" new="2"}
@ -350,21 +350,21 @@ loaded object.
> ```
| Name | Type | Description |
| ----------- | ---------------- | --------------------------------------------------------------------------------- |
| ----------- | ---------------- | ----------------------------------------------------------------------------------------- |
| `path` | unicode / `Path` | A path to a directory. Paths may be either strings or `Path`-like objects. |
| `disable` | list | Names of pipeline components to [disable](/usage/processing-pipelines#disabling). |
| `exclude` | list | Names of pipeline components or [serialization fields](#serialization-fields) to exclude. |
| **RETURNS** | `Language` | The modified `Language` object. |
<Infobox title="Changed in v2.0" variant="warning">
As of spaCy v2.0, the `save_to_directory` method has been renamed to `to_disk`,
to improve consistency across classes. Pipeline components to prevent from being
loaded can now be added as a list to `disable`, instead of specifying one
keyword argument per component.
loaded can now be added as a list to `disable` (v2.0) or `exclude` (v2.1),
instead of specifying one keyword argument per component.
```diff
- nlp = spacy.load("en", tagger=False, entity=False)
+ nlp = English().from_disk("/model", disable=["tagger', 'ner"])
+ nlp = English().from_disk("/model", exclude=["tagger", "ner"])
```
</Infobox>
@ -380,8 +380,8 @@ Serialize the current state to a binary string.
> ```
| Name | Type | Description |
| ----------- | ----- | ------------------------------------------------------------------------------------------------------------------- |
| `disable` | list | Names of pipeline components to [disable](/usage/processing-pipelines#disabling) and prevent from being serialized. |
| ----------- | ----- | ----------------------------------------------------------------------------------------- |
| `exclude` | list | Names of pipeline components or [serialization fields](#serialization-fields) to exclude. |
| **RETURNS** | bytes | The serialized form of the `Language` object. |
## Language.from_bytes {#from_bytes tag="method"}
@ -401,19 +401,20 @@ available to the loaded object.
> ```
| Name | Type | Description |
| ------------ | ---------- | --------------------------------------------------------------------------------- |
| ------------ | ---------- | ----------------------------------------------------------------------------------------- |
| `bytes_data` | bytes | The data to load from. |
| `disable` | list | Names of pipeline components to [disable](/usage/processing-pipelines#disabling). |
| `exclude` | list | Names of pipeline components or [serialization fields](#serialization-fields) to exclude. |
| **RETURNS** | `Language` | The `Language` object. |
<Infobox title="Changed in v2.0" variant="warning">
Pipeline components to prevent from being loaded can now be added as a list to
`disable`, instead of specifying one keyword argument per component.
`disable` (v2.0) or `exclude` (v2.1), instead of specifying one keyword argument
per component.
```diff
- nlp = English().from_bytes(bytes, tagger=False, entity=False)
+ nlp = English().from_bytes(bytes, disable=["tagger", "ner"])
+ nlp = English().from_bytes(bytes, exclude=["tagger", "ner"])
```
</Infobox>
@ -437,3 +438,23 @@ Pipeline components to prevent from being loaded can now be added as a list to
| `Defaults` | class | Settings, data and factory methods for creating the `nlp` object and processing pipeline. |
| `lang` | unicode | Two-letter language ID, i.e. [ISO code](https://en.wikipedia.org/wiki/List_of_ISO_639-1_codes). |
| `factories` <Tag variant="new">2</Tag> | dict | Factories that create pre-defined pipeline components, e.g. the tagger, parser or entity recognizer, keyed by their component name. |
## Serialization fields {#serialization-fields}
During serialization, spaCy will export several data fields used to restore
different aspects of the object. If needed, you can exclude them from
serialization by passing in the string names via the `exclude` argument.
> #### Example
>
> ```python
> data = nlp.to_bytes(exclude=["tokenizer", "vocab"])
> nlp.from_disk("./model-data", exclude=["ner"])
> ```
| Name | Description |
| ----------- | -------------------------------------------------- |
| `vocab` | The shared [`Vocab`](/api/vocab). |
| `tokenizer` | Tokenization rules and exceptions. |
| `meta` | The meta data, available as `Language.meta`. |
| ... | String names of pipeline components, e.g. `"ner"`. |

View File

@ -152,8 +152,7 @@ Serialize the current state to a binary string.
> ```
| Name | Type | Description |
| ----------- | ----- | -------------------------------------------------- |
| `**exclude` | - | Named attributes to prevent from being serialized. |
| ----------- | ----- | ------------------------------------------------ |
| **RETURNS** | bytes | The serialized form of the `StringStore` object. |
## StringStore.from_bytes {#from_bytes tag="method"}
@ -169,9 +168,8 @@ Load state from a binary string.
> ```
| Name | Type | Description |
| ------------ | ------------- | ---------------------------------------------- |
| ------------ | ------------- | ------------------------- |
| `bytes_data` | bytes | The data to load from. |
| `**exclude` | - | Named attributes to prevent from being loaded. |
| **RETURNS** | `StringStore` | The `StringStore` object. |
## Utilities {#util}

View File

@ -245,8 +245,9 @@ Serialize the pipe to disk.
> ```
| Name | Type | Description |
| ------ | ---------------- | --------------------------------------------------------------------------------------------------------------------- |
| --------- | ---------------- | --------------------------------------------------------------------------------------------------------------------- |
| `path` | unicode / `Path` | A path to a directory, which will be created if it doesn't exist. Paths may be either strings or `Path`-like objects. |
| `exclude` | list | String names of [serialization fields](#serialization-fields) to exclude. |
## Tagger.from_disk {#from_disk tag="method"}
@ -262,6 +263,7 @@ Load the pipe from disk. Modifies the object in place and returns it.
| Name | Type | Description |
| ----------- | ---------------- | -------------------------------------------------------------------------- |
| `path` | unicode / `Path` | A path to a directory. Paths may be either strings or `Path`-like objects. |
| `exclude` | list | String names of [serialization fields](#serialization-fields) to exclude. |
| **RETURNS** | `Tagger` | The modified `Tagger` object. |
## Tagger.to_bytes {#to_bytes tag="method"}
@ -276,8 +278,8 @@ Load the pipe from disk. Modifies the object in place and returns it.
Serialize the pipe to a bytestring.
| Name | Type | Description |
| ----------- | ----- | -------------------------------------------------- |
| `**exclude` | - | Named attributes to prevent from being serialized. |
| ----------- | ----- | ------------------------------------------------------------------------- |
| `exclude` | list | String names of [serialization fields](#serialization-fields) to exclude. |
| **RETURNS** | bytes | The serialized form of the `Tagger` object. |
## Tagger.from_bytes {#from_bytes tag="method"}
@ -293,9 +295,9 @@ Load the pipe from a bytestring. Modifies the object in place and returns it.
> ```
| Name | Type | Description |
| ------------ | -------- | ---------------------------------------------- |
| ------------ | -------- | ------------------------------------------------------------------------- |
| `bytes_data` | bytes | The data to load from. |
| `**exclude` | - | Named attributes to prevent from being loaded. |
| `exclude` | list | String names of [serialization fields](#serialization-fields) to exclude. |
| **RETURNS** | `Tagger` | The `Tagger` object. |
## Tagger.labels {#labels tag="property"}
@ -314,3 +316,22 @@ tags by default, e.g. `VERB`, `NOUN` and so on.
| Name | Type | Description |
| ----------- | ----- | ---------------------------------- |
| **RETURNS** | tuple | The labels added to the component. |
## Serialization fields {#serialization-fields}
During serialization, spaCy will export several data fields used to restore
different aspects of the object. If needed, you can exclude them from
serialization by passing in the string names via the `exclude` argument.
> #### Example
>
> ```python
> tagger.to_disk("/path", exclude=["vocab"])
> ```
| Name | Description |
| --------- | ------------------------------------------------------------------------------------------ |
| `vocab` | The shared [`Vocab`](/api/vocab). |
| `cfg` | The config file. You usually don't want to exclude this. |
| `model` | The binary model data. You usually don't want to exclude this. |
| `tag_map` | The [tag map](/usage/adding-languages#tag-map) mapping fine-grained to coarse-grained tags. |

View File

@ -261,8 +261,9 @@ Serialize the pipe to disk.
> ```
| Name | Type | Description |
| ------ | ---------------- | --------------------------------------------------------------------------------------------------------------------- |
| --------- | ---------------- | --------------------------------------------------------------------------------------------------------------------- |
| `path` | unicode / `Path` | A path to a directory, which will be created if it doesn't exist. Paths may be either strings or `Path`-like objects. |
| `exclude` | list | String names of [serialization fields](#serialization-fields) to exclude. |
## TextCategorizer.from_disk {#from_disk tag="method"}
@ -278,6 +279,7 @@ Load the pipe from disk. Modifies the object in place and returns it.
| Name | Type | Description |
| ----------- | ----------------- | -------------------------------------------------------------------------- |
| `path` | unicode / `Path` | A path to a directory. Paths may be either strings or `Path`-like objects. |
| `exclude` | list | String names of [serialization fields](#serialization-fields) to exclude. |
| **RETURNS** | `TextCategorizer` | The modified `TextCategorizer` object. |
## TextCategorizer.to_bytes {#to_bytes tag="method"}
@ -292,8 +294,8 @@ Load the pipe from disk. Modifies the object in place and returns it.
Serialize the pipe to a bytestring.
| Name | Type | Description |
| ----------- | ----- | ---------------------------------------------------- |
| `**exclude` | - | Named attributes to prevent from being serialized. |
| ----------- | ----- | ------------------------------------------------------------------------- |
| `exclude` | list | String names of [serialization fields](#serialization-fields) to exclude. |
| **RETURNS** | bytes | The serialized form of the `TextCategorizer` object. |
## TextCategorizer.from_bytes {#from_bytes tag="method"}
@ -309,9 +311,9 @@ Load the pipe from a bytestring. Modifies the object in place and returns it.
> ```
| Name | Type | Description |
| ------------ | ----------------- | ---------------------------------------------- |
| ------------ | ----------------- | ------------------------------------------------------------------------- |
| `bytes_data` | bytes | The data to load from. |
| `**exclude` | - | Named attributes to prevent from being loaded. |
| `exclude` | list | String names of [serialization fields](#serialization-fields) to exclude. |
| **RETURNS** | `TextCategorizer` | The `TextCategorizer` object. |
## TextCategorizer.labels {#labels tag="property"}
@ -328,3 +330,21 @@ The labels currently added to the component.
| Name | Type | Description |
| ----------- | ----- | ---------------------------------- |
| **RETURNS** | tuple | The labels added to the component. |
## Serialization fields {#serialization-fields}
During serialization, spaCy will export several data fields used to restore
different aspects of the object. If needed, you can exclude them from
serialization by passing in the string names via the `exclude` argument.
> #### Example
>
> ```python
> textcat.to_disk("/path", exclude=["vocab"])
> ```
| Name | Description |
| ------- | -------------------------------------------------------------- |
| `vocab` | The shared [`Vocab`](/api/vocab). |
| `cfg` | The config file. You usually don't want to exclude this. |
| `model` | The binary model data. You usually don't want to exclude this. |

View File

@ -128,8 +128,9 @@ Serialize the tokenizer to disk.
> ```
| Name | Type | Description |
| ------ | ---------------- | --------------------------------------------------------------------------------------------------------------------- |
| --------- | ---------------- | --------------------------------------------------------------------------------------------------------------------- |
| `path` | unicode / `Path` | A path to a directory, which will be created if it doesn't exist. Paths may be either strings or `Path`-like objects. |
| `exclude` | list | String names of [serialization fields](#serialization-fields) to exclude. |
## Tokenizer.from_disk {#from_disk tag="method"}
@ -145,6 +146,7 @@ Load the tokenizer from disk. Modifies the object in place and returns it.
| Name | Type | Description |
| ----------- | ---------------- | -------------------------------------------------------------------------- |
| `path` | unicode / `Path` | A path to a directory. Paths may be either strings or `Path`-like objects. |
| `exclude` | list | String names of [serialization fields](#serialization-fields) to exclude. |
| **RETURNS** | `Tokenizer` | The modified `Tokenizer` object. |
## Tokenizer.to_bytes {#to_bytes tag="method"}
@ -159,8 +161,8 @@ Load the tokenizer from disk. Modifies the object in place and returns it.
Serialize the tokenizer to a bytestring.
| Name | Type | Description |
| ----------- | ----- | -------------------------------------------------- |
| `**exclude` | - | Named attributes to prevent from being serialized. |
| ----------- | ----- | ------------------------------------------------------------------------- |
| `exclude` | list | String names of [serialization fields](#serialization-fields) to exclude. |
| **RETURNS** | bytes | The serialized form of the `Tokenizer` object. |
## Tokenizer.from_bytes {#from_bytes tag="method"}
@ -177,9 +179,9 @@ it.
> ```
| Name | Type | Description |
| ------------ | ----------- | ---------------------------------------------- |
| ------------ | ----------- | ------------------------------------------------------------------------- |
| `bytes_data` | bytes | The data to load from. |
| `**exclude` | - | Named attributes to prevent from being loaded. |
| `exclude` | list | String names of [serialization fields](#serialization-fields) to exclude. |
| **RETURNS** | `Tokenizer` | The `Tokenizer` object. |
## Attributes {#attributes}
@ -190,3 +192,25 @@ it.
| `prefix_search` | - | A function to find segment boundaries from the start of a string. Returns the length of the segment, or `None`. |
| `suffix_search` | - | A function to find segment boundaries from the end of a string. Returns the length of the segment, or `None`. |
| `infix_finditer` | - | A function to find internal segment separators, e.g. hyphens. Returns a (possibly empty) list of `re.MatchObject` objects. |
## Serialization fields {#serialization-fields}
During serialization, spaCy will export several data fields used to restore
different aspects of the object. If needed, you can exclude them from
serialization by passing in the string names via the `exclude` argument.
> #### Example
>
> ```python
> data = tokenizer.to_bytes(exclude=["vocab", "exceptions"])
> tokenizer.from_disk("./data", exclude=["token_match"])
> ```
| Name | Description |
| ---------------- | --------------------------------- |
| `vocab` | The shared [`Vocab`](/api/vocab). |
| `prefix_search` | The prefix rules. |
| `suffix_search` | The suffix rules. |
| `infix_finditer` | The infix rules. |
| `token_match` | The token match expression. |
| `exceptions` | The tokenizer exception rules. |

View File

@ -312,9 +312,8 @@ Save the current state to a directory.
> ```
| Name | Type | Description |
| ----------- | ---------------- | --------------------------------------------------------------------------------------------------------------------- |
| ------ | ---------------- | --------------------------------------------------------------------------------------------------------------------- |
| `path` | unicode / `Path` | A path to a directory, which will be created if it doesn't exist. Paths may be either strings or `Path`-like objects. |
| `**exclude` | - | Named attributes to prevent from being saved. |
## Vectors.from_disk {#from_disk tag="method"}
@ -343,8 +342,7 @@ Serialize the current state to a binary string.
> ```
| Name | Type | Description |
| ----------- | ----- | -------------------------------------------------- |
| `**exclude` | - | Named attributes to prevent from being serialized. |
| ----------- | ----- | -------------------------------------------- |
| **RETURNS** | bytes | The serialized form of the `Vectors` object. |
## Vectors.from_bytes {#from_bytes tag="method"}
@ -361,9 +359,8 @@ Load state from a binary string.
> ```
| Name | Type | Description |
| ----------- | --------- | ---------------------------------------------- |
| ----------- | --------- | ---------------------- |
| `data` | bytes | The data to load from. |
| `**exclude` | - | Named attributes to prevent from being loaded. |
| **RETURNS** | `Vectors` | The `Vectors` object. |
## Attributes {#attributes}

View File

@ -222,8 +222,9 @@ Save the current state to a directory.
> ```
| Name | Type | Description |
| ------ | ---------------- | --------------------------------------------------------------------------------------------------------------------- |
| --------- | ---------------- | --------------------------------------------------------------------------------------------------------------------- |
| `path` | unicode / `Path` | A path to a directory, which will be created if it doesn't exist. Paths may be either strings or `Path`-like objects. |
| `exclude` | list | String names of [serialization fields](#serialization-fields) to exclude. |
## Vocab.from_disk {#from_disk tag="method" new="2"}
@ -239,6 +240,7 @@ Loads state from a directory. Modifies the object in place and returns it.
| Name | Type | Description |
| ----------- | ---------------- | -------------------------------------------------------------------------- |
| `path` | unicode / `Path` | A path to a directory. Paths may be either strings or `Path`-like objects. |
| `exclude` | list | String names of [serialization fields](#serialization-fields) to exclude. |
| **RETURNS** | `Vocab` | The modified `Vocab` object. |
## Vocab.to_bytes {#to_bytes tag="method"}
@ -252,8 +254,8 @@ Serialize the current state to a binary string.
> ```
| Name | Type | Description |
| ----------- | ----- | -------------------------------------------------- |
| `**exclude` | - | Named attributes to prevent from being serialized. |
| ----------- | ----- | ------------------------------------------------------------------------- |
| `exclude` | list | String names of [serialization fields](#serialization-fields) to exclude. |
| **RETURNS** | bytes | The serialized form of the `Vocab` object. |
## Vocab.from_bytes {#from_bytes tag="method"}
@ -270,9 +272,9 @@ Load state from a binary string.
> ```
| Name | Type | Description |
| ------------ | ------- | ---------------------------------------------- |
| ------------ | ------- | ------------------------------------------------------------------------- |
| `bytes_data` | bytes | The data to load from. |
| `**exclude` | - | Named attributes to prevent from being loaded. |
| `exclude` | list | String names of [serialization fields](#serialization-fields) to exclude. |
| **RETURNS** | `Vocab` | The `Vocab` object. |
## Attributes {#attributes}
@ -291,3 +293,22 @@ Load state from a binary string.
| `strings` | `StringStore` | A table managing the string-to-int mapping. |
| `vectors` <Tag variant="new">2</Tag> | `Vectors` | A table associating word IDs to word vectors. |
| `vectors_length` | int | Number of dimensions for each word vector. |
## Serialization fields {#serialization-fields}
During serialization, spaCy will export several data fields used to restore
different aspects of the object. If needed, you can exclude them when saving or
loading by passing in their string names via the `exclude` argument.
> #### Example
>
> ```python
> data = vocab.to_bytes(exclude=["strings", "vectors"])
> vocab.from_disk("./vocab", exclude=["strings"])
> ```
| Name | Description |
| --------- | ----------------------------------------------------- |
| `strings` | The strings in the [`StringStore`](/api/stringstore). |
| `lexemes` | The lexeme data. |
| `vectors` | The word vectors, if available. |