💫 Make serialization methods consistent (#3385)

* Make serialization methods consistent

exclude keyword argument instead of random named keyword arguments and deprecation handling

* Update docs and add section on serialization fields
This commit is contained in:
Ines Montani 2019-03-10 19:16:45 +01:00 committed by Matthew Honnibal
parent 9a8f169e5c
commit 7ba3a5d95c
25 changed files with 598 additions and 314 deletions

View File

@ -70,6 +70,12 @@ class Warnings(object):
W013 = ("As of v2.1.0, {obj}.merge is deprecated. Please use the more " W013 = ("As of v2.1.0, {obj}.merge is deprecated. Please use the more "
"efficient and less error-prone Doc.retokenize context manager " "efficient and less error-prone Doc.retokenize context manager "
"instead.") "instead.")
W014 = ("As of v2.1.0, the `disable` keyword argument on the serialization "
"methods is and should be replaced with `exclude`. This makes it "
"consistent with the other objects serializable.")
W015 = ("As of v2.1.0, the use of keyword arguments to exclude fields from "
"being serialized or deserialized is deprecated. Please use the "
"`exclude` argument instead. For example: exclude=['{arg}'].")
@add_codes @add_codes
@ -348,7 +354,10 @@ class Errors(object):
"This is likely a bug in spaCy, so feel free to open an issue.") "This is likely a bug in spaCy, so feel free to open an issue.")
E127 = ("Cannot create phrase pattern representation for length 0. This " E127 = ("Cannot create phrase pattern representation for length 0. This "
"is likely a bug in spaCy.") "is likely a bug in spaCy.")
E128 = ("Unsupported serialization argument: '{arg}'. The use of keyword "
"arguments to exclude fields from being serialized or deserialized "
"is now deprecated. Please use the `exclude` argument instead. "
"For example: exclude=['{arg}'].")
@add_codes @add_codes

View File

@ -28,7 +28,7 @@ from .lang.punctuation import TOKENIZER_INFIXES
from .lang.tokenizer_exceptions import TOKEN_MATCH from .lang.tokenizer_exceptions import TOKEN_MATCH
from .lang.tag_map import TAG_MAP from .lang.tag_map import TAG_MAP
from .lang.lex_attrs import LEX_ATTRS, is_stop from .lang.lex_attrs import LEX_ATTRS, is_stop
from .errors import Errors from .errors import Errors, Warnings, deprecation_warning
from . import util from . import util
from . import about from . import about
@ -699,124 +699,114 @@ class Language(object):
self.tokenizer._reset_cache(keys) self.tokenizer._reset_cache(keys)
nr_seen = 0 nr_seen = 0
def to_disk(self, path, disable=tuple()): def to_disk(self, path, exclude=tuple(), disable=None):
"""Save the current state to a directory. If a model is loaded, this """Save the current state to a directory. If a model is loaded, this
will include the model. will include the model.
path (unicode or Path): A path to a directory, which will be created if path (unicode or Path): Path to a directory, which will be created if
it doesn't exist. Paths may be strings or `Path`-like objects. it doesn't exist.
disable (list): Names of pipeline components to disable and prevent exclude (list): Names of components or serialization fields to exclude.
from being saved.
EXAMPLE: DOCS: https://spacy.io/api/language#to_disk
>>> nlp.to_disk('/path/to/models')
""" """
if disable is not None:
deprecation_warning(Warnings.W014)
exclude = disable
path = util.ensure_path(path) path = util.ensure_path(path)
serializers = OrderedDict( serializers = OrderedDict()
( serializers["tokenizer"] = lambda p: self.tokenizer.to_disk(p, exclude=["vocab"])
("tokenizer", lambda p: self.tokenizer.to_disk(p, vocab=False)), serializers["meta.json"] = lambda p: p.open("w").write(srsly.json_dumps(self.meta))
("meta.json", lambda p: p.open("w").write(srsly.json_dumps(self.meta))),
)
)
for name, proc in self.pipeline: for name, proc in self.pipeline:
if not hasattr(proc, "name"): if not hasattr(proc, "name"):
continue continue
if name in disable: if name in exclude:
continue continue
if not hasattr(proc, "to_disk"): if not hasattr(proc, "to_disk"):
continue continue
serializers[name] = lambda p, proc=proc: proc.to_disk(p, vocab=False) serializers[name] = lambda p, proc=proc: proc.to_disk(p, exclude=["vocab"])
serializers["vocab"] = lambda p: self.vocab.to_disk(p) serializers["vocab"] = lambda p: self.vocab.to_disk(p)
util.to_disk(path, serializers, {p: False for p in disable}) util.to_disk(path, serializers, exclude)
def from_disk(self, path, disable=tuple()): def from_disk(self, path, exclude=tuple(), disable=None):
"""Loads state from a directory. Modifies the object in place and """Loads state from a directory. Modifies the object in place and
returns it. If the saved `Language` object contains a model, the returns it. If the saved `Language` object contains a model, the
model will be loaded. model will be loaded.
path (unicode or Path): A path to a directory. Paths may be either path (unicode or Path): A path to a directory.
strings or `Path`-like objects. exclude (list): Names of components or serialization fields to exclude.
disable (list): Names of the pipeline components to disable.
RETURNS (Language): The modified `Language` object. RETURNS (Language): The modified `Language` object.
EXAMPLE: DOCS: https://spacy.io/api/language#from_disk
>>> from spacy.language import Language
>>> nlp = Language().from_disk('/path/to/models')
""" """
if disable is not None:
deprecation_warning(Warnings.W014)
exclude = disable
path = util.ensure_path(path) path = util.ensure_path(path)
deserializers = OrderedDict( deserializers = OrderedDict()
( deserializers["meta.json"] = lambda p: self.meta.update(srsly.read_json(p))
("meta.json", lambda p: self.meta.update(srsly.read_json(p))), deserializers["vocab"] = lambda p: self.vocab.from_disk(p) and _fix_pretrained_vectors_name(self)
( deserializers["tokenizer"] = lambda p: self.tokenizer.from_disk(p, exclude=["vocab"])
"vocab",
lambda p: (
self.vocab.from_disk(p) and _fix_pretrained_vectors_name(self)
),
),
("tokenizer", lambda p: self.tokenizer.from_disk(p, vocab=False)),
)
)
for name, proc in self.pipeline: for name, proc in self.pipeline:
if name in disable: if name in exclude:
continue continue
if not hasattr(proc, "from_disk"): if not hasattr(proc, "from_disk"):
continue continue
deserializers[name] = lambda p, proc=proc: proc.from_disk(p, vocab=False) deserializers[name] = lambda p, proc=proc: proc.from_disk(p, exclude=["vocab"])
exclude = {p: False for p in disable} if not (path / "vocab").exists() and "vocab" not in exclude:
if not (path / "vocab").exists(): # Convert to list here in case exclude is (default) tuple
exclude["vocab"] = True exclude = list(exclude) + ["vocab"]
util.from_disk(path, deserializers, exclude) util.from_disk(path, deserializers, exclude)
self._path = path self._path = path
return self return self
def to_bytes(self, disable=[], **exclude): def to_bytes(self, exclude=tuple(), disable=None, **kwargs):
"""Serialize the current state to a binary string. """Serialize the current state to a binary string.
disable (list): Nameds of pipeline components to disable and prevent exclude (list): Names of components or serialization fields to exclude.
from being serialized.
RETURNS (bytes): The serialized form of the `Language` object. RETURNS (bytes): The serialized form of the `Language` object.
DOCS: https://spacy.io/api/language#to_bytes
""" """
serializers = OrderedDict( if disable is not None:
( deprecation_warning(Warnings.W014)
("vocab", lambda: self.vocab.to_bytes()), exclude = disable
("tokenizer", lambda: self.tokenizer.to_bytes(vocab=False)), serializers = OrderedDict()
("meta", lambda: srsly.json_dumps(self.meta)), serializers["vocab"] = lambda: self.vocab.to_bytes()
) serializers["tokenizer"] = lambda: self.tokenizer.to_bytes(exclude=["vocab"])
) serializers["meta.json"] = lambda: srsly.json_dumps(self.meta)
for i, (name, proc) in enumerate(self.pipeline): for name, proc in self.pipeline:
if name in disable: if name in exclude:
continue continue
if not hasattr(proc, "to_bytes"): if not hasattr(proc, "to_bytes"):
continue continue
serializers[i] = lambda proc=proc: proc.to_bytes(vocab=False) serializers[name] = lambda proc=proc: proc.to_bytes(exclude=["vocab"])
exclude = util.get_serialization_exclude(serializers, exclude, kwargs)
return util.to_bytes(serializers, exclude) return util.to_bytes(serializers, exclude)
def from_bytes(self, bytes_data, disable=[]): def from_bytes(self, bytes_data, exclude=tuple(), disable=None, **kwargs):
"""Load state from a binary string. """Load state from a binary string.
bytes_data (bytes): The data to load from. bytes_data (bytes): The data to load from.
disable (list): Names of the pipeline components to disable. exclude (list): Names of components or serialization fields to exclude.
RETURNS (Language): The `Language` object. RETURNS (Language): The `Language` object.
DOCS: https://spacy.io/api/language#from_bytes
""" """
deserializers = OrderedDict( if disable is not None:
( deprecation_warning(Warnings.W014)
("meta", lambda b: self.meta.update(srsly.json_loads(b))), exclude = disable
( deserializers = OrderedDict()
"vocab", deserializers["meta.json"] = lambda b: self.meta.update(srsly.json_loads(b))
lambda b: ( deserializers["vocab"] = lambda b: self.vocab.from_bytes(b) and _fix_pretrained_vectors_name(self)
self.vocab.from_bytes(b) and _fix_pretrained_vectors_name(self) deserializers["tokenizer"] = lambda b: self.tokenizer.from_bytes(b, exclude=["vocab"])
), for name, proc in self.pipeline:
), if name in exclude:
("tokenizer", lambda b: self.tokenizer.from_bytes(b, vocab=False)),
)
)
for i, (name, proc) in enumerate(self.pipeline):
if name in disable:
continue continue
if not hasattr(proc, "from_bytes"): if not hasattr(proc, "from_bytes"):
continue continue
deserializers[i] = lambda b, proc=proc: proc.from_bytes(b, vocab=False) deserializers[name] = lambda b, proc=proc: proc.from_bytes(b, exclude=["vocab"])
util.from_bytes(bytes_data, deserializers, {}) exclude = util.get_serialization_exclude(deserializers, exclude, kwargs)
util.from_bytes(bytes_data, deserializers, exclude)
return self return self

View File

@ -141,16 +141,21 @@ class Pipe(object):
with self.model.use_params(params): with self.model.use_params(params):
yield yield
def to_bytes(self, **exclude): def to_bytes(self, exclude=tuple(), **kwargs):
"""Serialize the pipe to a bytestring.""" """Serialize the pipe to a bytestring.
exclude (list): String names of serialization fields to exclude.
RETURNS (bytes): The serialized object.
"""
serialize = OrderedDict() serialize = OrderedDict()
serialize["cfg"] = lambda: srsly.json_dumps(self.cfg) serialize["cfg"] = lambda: srsly.json_dumps(self.cfg)
if self.model not in (True, False, None): if self.model not in (True, False, None):
serialize["model"] = self.model.to_bytes serialize["model"] = self.model.to_bytes
serialize["vocab"] = self.vocab.to_bytes serialize["vocab"] = self.vocab.to_bytes
exclude = util.get_serialization_exclude(serialize, exclude, kwargs)
return util.to_bytes(serialize, exclude) return util.to_bytes(serialize, exclude)
def from_bytes(self, bytes_data, **exclude): def from_bytes(self, bytes_data, exclude=tuple(), **kwargs):
"""Load the pipe from a bytestring.""" """Load the pipe from a bytestring."""
def load_model(b): def load_model(b):
@ -161,26 +166,25 @@ class Pipe(object):
self.model = self.Model(**self.cfg) self.model = self.Model(**self.cfg)
self.model.from_bytes(b) self.model.from_bytes(b)
deserialize = OrderedDict( deserialize = OrderedDict()
( deserialize["cfg"] = lambda b: self.cfg.update(srsly.json_loads(b))
("cfg", lambda b: self.cfg.update(srsly.json_loads(b))), deserialize["vocab"] = lambda b: self.vocab.from_bytes(b)
("vocab", lambda b: self.vocab.from_bytes(b)), deserialize["model"] = load_model
("model", load_model), exclude = util.get_serialization_exclude(deserialize, exclude, kwargs)
)
)
util.from_bytes(bytes_data, deserialize, exclude) util.from_bytes(bytes_data, deserialize, exclude)
return self return self
def to_disk(self, path, **exclude): def to_disk(self, path, exclude=tuple(), **kwargs):
"""Serialize the pipe to disk.""" """Serialize the pipe to disk."""
serialize = OrderedDict() serialize = OrderedDict()
serialize["cfg"] = lambda p: srsly.write_json(p, self.cfg) serialize["cfg"] = lambda p: srsly.write_json(p, self.cfg)
serialize["vocab"] = lambda p: self.vocab.to_disk(p) serialize["vocab"] = lambda p: self.vocab.to_disk(p)
if self.model not in (None, True, False): if self.model not in (None, True, False):
serialize["model"] = lambda p: p.open("wb").write(self.model.to_bytes()) serialize["model"] = lambda p: p.open("wb").write(self.model.to_bytes())
exclude = util.get_serialization_exclude(serialize, exclude, kwargs)
util.to_disk(path, serialize, exclude) util.to_disk(path, serialize, exclude)
def from_disk(self, path, **exclude): def from_disk(self, path, exclude=tuple(), **kwargs):
"""Load the pipe from disk.""" """Load the pipe from disk."""
def load_model(p): def load_model(p):
@ -191,13 +195,11 @@ class Pipe(object):
self.model = self.Model(**self.cfg) self.model = self.Model(**self.cfg)
self.model.from_bytes(p.open("rb").read()) self.model.from_bytes(p.open("rb").read())
deserialize = OrderedDict( deserialize = OrderedDict()
( deserialize["cfg"] = lambda p: self.cfg.update(_load_cfg(p))
("cfg", lambda p: self.cfg.update(_load_cfg(p))), deserialize["vocab"] = lambda p: self.vocab.from_disk(p)
("vocab", lambda p: self.vocab.from_disk(p)), deserialize["model"] = load_model
("model", load_model), exclude = util.get_serialization_exclude(deserialize, exclude, kwargs)
)
)
util.from_disk(path, deserialize, exclude) util.from_disk(path, deserialize, exclude)
return self return self
@ -537,7 +539,7 @@ class Tagger(Pipe):
with self.model.use_params(params): with self.model.use_params(params):
yield yield
def to_bytes(self, **exclude): def to_bytes(self, exclude=tuple(), **kwargs):
serialize = OrderedDict() serialize = OrderedDict()
if self.model not in (None, True, False): if self.model not in (None, True, False):
serialize["model"] = self.model.to_bytes serialize["model"] = self.model.to_bytes
@ -545,9 +547,10 @@ class Tagger(Pipe):
serialize["cfg"] = lambda: srsly.json_dumps(self.cfg) serialize["cfg"] = lambda: srsly.json_dumps(self.cfg)
tag_map = OrderedDict(sorted(self.vocab.morphology.tag_map.items())) tag_map = OrderedDict(sorted(self.vocab.morphology.tag_map.items()))
serialize["tag_map"] = lambda: srsly.msgpack_dumps(tag_map) serialize["tag_map"] = lambda: srsly.msgpack_dumps(tag_map)
exclude = util.get_serialization_exclude(serialize, exclude, kwargs)
return util.to_bytes(serialize, exclude) return util.to_bytes(serialize, exclude)
def from_bytes(self, bytes_data, **exclude): def from_bytes(self, bytes_data, exclude=tuple(), **kwargs):
def load_model(b): def load_model(b):
# TODO: Remove this once we don't have to handle previous models # TODO: Remove this once we don't have to handle previous models
if self.cfg.get("pretrained_dims") and "pretrained_vectors" not in self.cfg: if self.cfg.get("pretrained_dims") and "pretrained_vectors" not in self.cfg:
@ -572,20 +575,22 @@ class Tagger(Pipe):
("cfg", lambda b: self.cfg.update(srsly.json_loads(b))), ("cfg", lambda b: self.cfg.update(srsly.json_loads(b))),
("model", lambda b: load_model(b)), ("model", lambda b: load_model(b)),
)) ))
exclude = util.get_serialization_exclude(deserialize, exclude, kwargs)
util.from_bytes(bytes_data, deserialize, exclude) util.from_bytes(bytes_data, deserialize, exclude)
return self return self
def to_disk(self, path, **exclude): def to_disk(self, path, exclude=tuple(), **kwargs):
tag_map = OrderedDict(sorted(self.vocab.morphology.tag_map.items())) tag_map = OrderedDict(sorted(self.vocab.morphology.tag_map.items()))
serialize = OrderedDict(( serialize = OrderedDict((
('vocab', lambda p: self.vocab.to_disk(p)), ("vocab", lambda p: self.vocab.to_disk(p)),
('tag_map', lambda p: srsly.write_msgpack(p, tag_map)), ("tag_map", lambda p: srsly.write_msgpack(p, tag_map)),
('model', lambda p: p.open("wb").write(self.model.to_bytes())), ("model", lambda p: p.open("wb").write(self.model.to_bytes())),
('cfg', lambda p: srsly.write_json(p, self.cfg)) ("cfg", lambda p: srsly.write_json(p, self.cfg))
)) ))
exclude = util.get_serialization_exclude(serialize, exclude, kwargs)
util.to_disk(path, serialize, exclude) util.to_disk(path, serialize, exclude)
def from_disk(self, path, **exclude): def from_disk(self, path, exclude=tuple(), **kwargs):
def load_model(p): def load_model(p):
# TODO: Remove this once we don't have to handle previous models # TODO: Remove this once we don't have to handle previous models
if self.cfg.get("pretrained_dims") and "pretrained_vectors" not in self.cfg: if self.cfg.get("pretrained_dims") and "pretrained_vectors" not in self.cfg:
@ -608,6 +613,7 @@ class Tagger(Pipe):
("tag_map", load_tag_map), ("tag_map", load_tag_map),
("model", load_model), ("model", load_model),
)) ))
exclude = util.get_serialization_exclude(deserialize, exclude, kwargs)
util.from_disk(path, deserialize, exclude) util.from_disk(path, deserialize, exclude)
return self return self

View File

@ -236,19 +236,17 @@ cdef class StringStore:
self.add(word) self.add(word)
return self return self
def to_bytes(self, **exclude): def to_bytes(self, **kwargs):
"""Serialize the current state to a binary string. """Serialize the current state to a binary string.
**exclude: Named attributes to prevent from being serialized.
RETURNS (bytes): The serialized form of the `StringStore` object. RETURNS (bytes): The serialized form of the `StringStore` object.
""" """
return srsly.json_dumps(list(self)) return srsly.json_dumps(list(self))
def from_bytes(self, bytes_data, **exclude): def from_bytes(self, bytes_data, **kwargs):
"""Load state from a binary string. """Load state from a binary string.
bytes_data (bytes): The data to load from. bytes_data (bytes): The data to load from.
**exclude: Named attributes to prevent from being loaded.
RETURNS (StringStore): The `StringStore` object. RETURNS (StringStore): The `StringStore` object.
""" """
strings = srsly.json_loads(bytes_data) strings = srsly.json_loads(bytes_data)

View File

@ -228,7 +228,7 @@ cdef class Parser:
self.set_annotations(subbatch, parse_states, tensors=None) self.set_annotations(subbatch, parse_states, tensors=None)
for doc in batch_in_order: for doc in batch_in_order:
yield doc yield doc
def require_model(self): def require_model(self):
"""Raise an error if the component's model is not initialized.""" """Raise an error if the component's model is not initialized."""
if getattr(self, 'model', None) in (None, True, False): if getattr(self, 'model', None) in (None, True, False):
@ -272,7 +272,7 @@ cdef class Parser:
beams = self.moves.init_beams(docs, beam_width, beam_density=beam_density) beams = self.moves.init_beams(docs, beam_width, beam_density=beam_density)
# This is pretty dirty, but the NER can resize itself in init_batch, # This is pretty dirty, but the NER can resize itself in init_batch,
# if labels are missing. We therefore have to check whether we need to # if labels are missing. We therefore have to check whether we need to
# expand our model output. # expand our model output.
self.model.resize_output(self.moves.n_moves) self.model.resize_output(self.moves.n_moves)
model = self.model(docs) model = self.model(docs)
token_ids = numpy.zeros((len(docs) * beam_width, self.nr_feature), token_ids = numpy.zeros((len(docs) * beam_width, self.nr_feature),
@ -442,7 +442,7 @@ cdef class Parser:
if self._rehearsal_model is None: if self._rehearsal_model is None:
return None return None
losses.setdefault(self.name, 0.) losses.setdefault(self.name, 0.)
states = self.moves.init_batch(docs) states = self.moves.init_batch(docs)
# This is pretty dirty, but the NER can resize itself in init_batch, # This is pretty dirty, but the NER can resize itself in init_batch,
# if labels are missing. We therefore have to check whether we need to # if labels are missing. We therefore have to check whether we need to
@ -603,22 +603,24 @@ cdef class Parser:
self.cfg.update(cfg) self.cfg.update(cfg)
return sgd return sgd
def to_disk(self, path, **exclude): def to_disk(self, path, exclude=tuple(), **kwargs):
serializers = { serializers = {
'model': lambda p: (self.model.to_disk(p) if self.model is not True else True), 'model': lambda p: (self.model.to_disk(p) if self.model is not True else True),
'vocab': lambda p: self.vocab.to_disk(p), 'vocab': lambda p: self.vocab.to_disk(p),
'moves': lambda p: self.moves.to_disk(p, strings=False), 'moves': lambda p: self.moves.to_disk(p, exclude=["strings"]),
'cfg': lambda p: srsly.write_json(p, self.cfg) 'cfg': lambda p: srsly.write_json(p, self.cfg)
} }
exclude = util.get_serialization_exclude(serializers, exclude, kwargs)
util.to_disk(path, serializers, exclude) util.to_disk(path, serializers, exclude)
def from_disk(self, path, **exclude): def from_disk(self, path, exclude=tuple(), **kwargs):
deserializers = { deserializers = {
'vocab': lambda p: self.vocab.from_disk(p), 'vocab': lambda p: self.vocab.from_disk(p),
'moves': lambda p: self.moves.from_disk(p, strings=False), 'moves': lambda p: self.moves.from_disk(p, exclude=["strings"]),
'cfg': lambda p: self.cfg.update(srsly.read_json(p)), 'cfg': lambda p: self.cfg.update(srsly.read_json(p)),
'model': lambda p: None 'model': lambda p: None
} }
exclude = util.get_serialization_exclude(deserializers, exclude, kwargs)
util.from_disk(path, deserializers, exclude) util.from_disk(path, deserializers, exclude)
if 'model' not in exclude: if 'model' not in exclude:
path = util.ensure_path(path) path = util.ensure_path(path)
@ -632,22 +634,24 @@ cdef class Parser:
self.cfg.update(cfg) self.cfg.update(cfg)
return self return self
def to_bytes(self, **exclude): def to_bytes(self, exclude=tuple(), **kwargs):
serializers = OrderedDict(( serializers = OrderedDict((
('model', lambda: (self.model.to_bytes() if self.model is not True else True)), ('model', lambda: (self.model.to_bytes() if self.model is not True else True)),
('vocab', lambda: self.vocab.to_bytes()), ('vocab', lambda: self.vocab.to_bytes()),
('moves', lambda: self.moves.to_bytes(strings=False)), ('moves', lambda: self.moves.to_bytes(exclude=["strings"])),
('cfg', lambda: srsly.json_dumps(self.cfg, indent=2, sort_keys=True)) ('cfg', lambda: srsly.json_dumps(self.cfg, indent=2, sort_keys=True))
)) ))
exclude = util.get_serialization_exclude(serializers, exclude, kwargs)
return util.to_bytes(serializers, exclude) return util.to_bytes(serializers, exclude)
def from_bytes(self, bytes_data, **exclude): def from_bytes(self, bytes_data, exclude=tuple(), **kwargs):
deserializers = OrderedDict(( deserializers = OrderedDict((
('vocab', lambda b: self.vocab.from_bytes(b)), ('vocab', lambda b: self.vocab.from_bytes(b)),
('moves', lambda b: self.moves.from_bytes(b, strings=False)), ('moves', lambda b: self.moves.from_bytes(b, exclude=["strings"])),
('cfg', lambda b: self.cfg.update(srsly.json_loads(b))), ('cfg', lambda b: self.cfg.update(srsly.json_loads(b))),
('model', lambda b: None) ('model', lambda b: None)
)) ))
exclude = util.get_serialization_exclude(deserializers, exclude, kwargs)
msg = util.from_bytes(bytes_data, deserializers, exclude) msg = util.from_bytes(bytes_data, deserializers, exclude)
if 'model' not in exclude: if 'model' not in exclude:
# TODO: Remove this once we don't have to handle previous models # TODO: Remove this once we don't have to handle previous models

View File

@ -208,30 +208,32 @@ cdef class TransitionSystem:
self.labels[action][label_name] = new_freq-1 self.labels[action][label_name] = new_freq-1
return 1 return 1
def to_disk(self, path, **exclude): def to_disk(self, path, **kwargs):
with path.open('wb') as file_: with path.open('wb') as file_:
file_.write(self.to_bytes(**exclude)) file_.write(self.to_bytes(**kwargs))
def from_disk(self, path, **exclude): def from_disk(self, path, **kwargs):
with path.open('rb') as file_: with path.open('rb') as file_:
byte_data = file_.read() byte_data = file_.read()
self.from_bytes(byte_data, **exclude) self.from_bytes(byte_data, **kwargs)
return self return self
def to_bytes(self, **exclude): def to_bytes(self, exclude=tuple(), **kwargs):
transitions = [] transitions = []
serializers = { serializers = {
'moves': lambda: srsly.json_dumps(self.labels), 'moves': lambda: srsly.json_dumps(self.labels),
'strings': lambda: self.strings.to_bytes() 'strings': lambda: self.strings.to_bytes()
} }
exclude = util.get_serialization_exclude(serializers, exclude, kwargs)
return util.to_bytes(serializers, exclude) return util.to_bytes(serializers, exclude)
def from_bytes(self, bytes_data, **exclude): def from_bytes(self, bytes_data, exclude=tuple(), **kwargs):
labels = {} labels = {}
deserializers = { deserializers = {
'moves': lambda b: labels.update(srsly.json_loads(b)), 'moves': lambda b: labels.update(srsly.json_loads(b)),
'strings': lambda b: self.strings.from_bytes(b) 'strings': lambda b: self.strings.from_bytes(b)
} }
exclude = util.get_serialization_exclude(deserializers, exclude, kwargs)
msg = util.from_bytes(bytes_data, deserializers, exclude) msg = util.from_bytes(bytes_data, deserializers, exclude)
self.initialize_actions(labels) self.initialize_actions(labels)
return self return self

View File

@ -113,14 +113,14 @@ def test_doc_api_serialize(en_tokenizer, text):
assert [t.orth for t in tokens] == [t.orth for t in new_tokens] assert [t.orth for t in tokens] == [t.orth for t in new_tokens]
new_tokens = Doc(tokens.vocab).from_bytes( new_tokens = Doc(tokens.vocab).from_bytes(
tokens.to_bytes(tensor=False), tensor=False tokens.to_bytes(exclude=["tensor"]), exclude=["tensor"]
) )
assert tokens.text == new_tokens.text assert tokens.text == new_tokens.text
assert [t.text for t in tokens] == [t.text for t in new_tokens] assert [t.text for t in tokens] == [t.text for t in new_tokens]
assert [t.orth for t in tokens] == [t.orth for t in new_tokens] assert [t.orth for t in tokens] == [t.orth for t in new_tokens]
new_tokens = Doc(tokens.vocab).from_bytes( new_tokens = Doc(tokens.vocab).from_bytes(
tokens.to_bytes(sentiment=False), sentiment=False tokens.to_bytes(exclude=["sentiment"]), exclude=["sentiment"]
) )
assert tokens.text == new_tokens.text assert tokens.text == new_tokens.text
assert [t.text for t in tokens] == [t.text for t in new_tokens] assert [t.text for t in tokens] == [t.text for t in new_tokens]

View File

@ -1,6 +1,7 @@
# coding: utf-8 # coding: utf-8
from __future__ import unicode_literals from __future__ import unicode_literals
import pytest
from spacy.tokens import Doc from spacy.tokens import Doc
from spacy.compat import path2str from spacy.compat import path2str
@ -41,3 +42,18 @@ def test_serialize_doc_roundtrip_disk_str_path(en_vocab):
doc.to_disk(file_path) doc.to_disk(file_path)
doc_d = Doc(en_vocab).from_disk(file_path) doc_d = Doc(en_vocab).from_disk(file_path)
assert doc.to_bytes() == doc_d.to_bytes() assert doc.to_bytes() == doc_d.to_bytes()
def test_serialize_doc_exclude(en_vocab):
doc = Doc(en_vocab, words=["hello", "world"])
doc.user_data["foo"] = "bar"
new_doc = Doc(en_vocab).from_bytes(doc.to_bytes())
assert new_doc.user_data["foo"] == "bar"
new_doc = Doc(en_vocab).from_bytes(doc.to_bytes(), exclude=["user_data"])
assert not new_doc.user_data
new_doc = Doc(en_vocab).from_bytes(doc.to_bytes(exclude=["user_data"]))
assert not new_doc.user_data
with pytest.raises(ValueError):
doc.to_bytes(user_data=False)
with pytest.raises(ValueError):
Doc(en_vocab).from_bytes(doc.to_bytes(), tensor=False)

View File

@ -52,3 +52,19 @@ def test_serialize_with_custom_tokenizer():
nlp.tokenizer = custom_tokenizer(nlp) nlp.tokenizer = custom_tokenizer(nlp)
with make_tempdir() as d: with make_tempdir() as d:
nlp.to_disk(d) nlp.to_disk(d)
def test_serialize_language_exclude(meta_data):
name = "name-in-fixture"
nlp = Language(meta=meta_data)
assert nlp.meta["name"] == name
new_nlp = Language().from_bytes(nlp.to_bytes())
assert nlp.meta["name"] == name
new_nlp = Language().from_bytes(nlp.to_bytes(), exclude=["meta"])
assert not new_nlp.meta["name"] == name
new_nlp = Language().from_bytes(nlp.to_bytes(exclude=["meta"]))
assert not new_nlp.meta["name"] == name
with pytest.raises(ValueError):
nlp.to_bytes(meta=False)
with pytest.raises(ValueError):
Language().from_bytes(nlp.to_bytes(), meta=False)

View File

@ -55,7 +55,9 @@ def test_serialize_parser_roundtrip_disk(en_vocab, Parser):
parser_d = Parser(en_vocab) parser_d = Parser(en_vocab)
parser_d.model, _ = parser_d.Model(0) parser_d.model, _ = parser_d.Model(0)
parser_d = parser_d.from_disk(file_path) parser_d = parser_d.from_disk(file_path)
assert parser.to_bytes(model=False) == parser_d.to_bytes(model=False) parser_bytes = parser.to_bytes(exclude=["model"])
parser_d_bytes = parser_d.to_bytes(exclude=["model"])
assert parser_bytes == parser_d_bytes
def test_to_from_bytes(parser, blank_parser): def test_to_from_bytes(parser, blank_parser):
@ -114,3 +116,25 @@ def test_serialize_textcat_empty(en_vocab):
# See issue #1105 # See issue #1105
textcat = TextCategorizer(en_vocab, labels=["ENTITY", "ACTION", "MODIFIER"]) textcat = TextCategorizer(en_vocab, labels=["ENTITY", "ACTION", "MODIFIER"])
textcat.to_bytes() textcat.to_bytes()
@pytest.mark.parametrize("Parser", test_parsers)
def test_serialize_pipe_exclude(en_vocab, Parser):
def get_new_parser():
new_parser = Parser(en_vocab)
new_parser.model, _ = new_parser.Model(0)
return new_parser
parser = Parser(en_vocab)
parser.model, _ = parser.Model(0)
parser.cfg["foo"] = "bar"
new_parser = get_new_parser().from_bytes(parser.to_bytes())
assert "foo" in new_parser.cfg
new_parser = get_new_parser().from_bytes(parser.to_bytes(), exclude=["cfg"])
assert "foo" not in new_parser.cfg
new_parser = get_new_parser().from_bytes(parser.to_bytes(exclude=["cfg"]))
assert "foo" not in new_parser.cfg
with pytest.raises(ValueError):
parser.to_bytes(cfg=False)
with pytest.raises(ValueError):
get_new_parser().from_bytes(parser.to_bytes(), cfg=False)

View File

@ -360,36 +360,37 @@ cdef class Tokenizer:
self._cache.set(key, cached) self._cache.set(key, cached)
self._rules[string] = substrings self._rules[string] = substrings
def to_disk(self, path, **exclude): def to_disk(self, path, **kwargs):
"""Save the current state to a directory. """Save the current state to a directory.
path (unicode or Path): A path to a directory, which will be created if path (unicode or Path): A path to a directory, which will be created if
it doesn't exist. Paths may be either strings or Path-like objects. it doesn't exist.
exclude (list): String names of serialization fields to exclude.
DOCS: https://spacy.io/api/tokenizer#to_disk DOCS: https://spacy.io/api/tokenizer#to_disk
""" """
with path.open("wb") as file_: with path.open("wb") as file_:
file_.write(self.to_bytes(**exclude)) file_.write(self.to_bytes(**kwargs))
def from_disk(self, path, **exclude): def from_disk(self, path, **kwargs):
"""Loads state from a directory. Modifies the object in place and """Loads state from a directory. Modifies the object in place and
returns it. returns it.
path (unicode or Path): A path to a directory. Paths may be either path (unicode or Path): A path to a directory.
strings or `Path`-like objects. exclude (list): String names of serialization fields to exclude.
RETURNS (Tokenizer): The modified `Tokenizer` object. RETURNS (Tokenizer): The modified `Tokenizer` object.
DOCS: https://spacy.io/api/tokenizer#from_disk DOCS: https://spacy.io/api/tokenizer#from_disk
""" """
with path.open("rb") as file_: with path.open("rb") as file_:
bytes_data = file_.read() bytes_data = file_.read()
self.from_bytes(bytes_data, **exclude) self.from_bytes(bytes_data, **kwargs)
return self return self
def to_bytes(self, **exclude): def to_bytes(self, exclude=tuple(), **kwargs):
"""Serialize the current state to a binary string. """Serialize the current state to a binary string.
**exclude: Named attributes to prevent from being serialized. exclude (list): String names of serialization fields to exclude.
RETURNS (bytes): The serialized form of the `Tokenizer` object. RETURNS (bytes): The serialized form of the `Tokenizer` object.
DOCS: https://spacy.io/api/tokenizer#to_bytes DOCS: https://spacy.io/api/tokenizer#to_bytes
@ -402,13 +403,14 @@ cdef class Tokenizer:
("token_match", lambda: _get_regex_pattern(self.token_match)), ("token_match", lambda: _get_regex_pattern(self.token_match)),
("exceptions", lambda: OrderedDict(sorted(self._rules.items()))) ("exceptions", lambda: OrderedDict(sorted(self._rules.items())))
)) ))
exclude = util.get_serialization_exclude(serializers, exclude, kwargs)
return util.to_bytes(serializers, exclude) return util.to_bytes(serializers, exclude)
def from_bytes(self, bytes_data, **exclude): def from_bytes(self, bytes_data, exclude=tuple(), **kwargs):
"""Load state from a binary string. """Load state from a binary string.
bytes_data (bytes): The data to load from. bytes_data (bytes): The data to load from.
**exclude: Named attributes to prevent from being loaded. exclude (list): String names of serialization fields to exclude.
RETURNS (Tokenizer): The `Tokenizer` object. RETURNS (Tokenizer): The `Tokenizer` object.
DOCS: https://spacy.io/api/tokenizer#from_bytes DOCS: https://spacy.io/api/tokenizer#from_bytes
@ -422,6 +424,7 @@ cdef class Tokenizer:
("token_match", lambda b: data.setdefault("token_match", b)), ("token_match", lambda b: data.setdefault("token_match", b)),
("exceptions", lambda b: data.setdefault("rules", b)) ("exceptions", lambda b: data.setdefault("rules", b))
)) ))
exclude = util.get_serialization_exclude(deserializers, exclude, kwargs)
msg = util.from_bytes(bytes_data, deserializers, exclude) msg = util.from_bytes(bytes_data, deserializers, exclude)
if data.get("prefix_search"): if data.get("prefix_search"):
self.prefix_search = re.compile(data["prefix_search"]).search self.prefix_search = re.compile(data["prefix_search"]).search

View File

@ -794,24 +794,26 @@ cdef class Doc:
""" """
return numpy.asarray(_get_lca_matrix(self, 0, len(self))) return numpy.asarray(_get_lca_matrix(self, 0, len(self)))
def to_disk(self, path, **exclude): def to_disk(self, path, **kwargs):
"""Save the current state to a directory. """Save the current state to a directory.
path (unicode or Path): A path to a directory, which will be created if path (unicode or Path): A path to a directory, which will be created if
it doesn't exist. Paths may be either strings or Path-like objects. it doesn't exist. Paths may be either strings or Path-like objects.
exclude (list): String names of serialization fields to exclude.
DOCS: https://spacy.io/api/doc#to_disk DOCS: https://spacy.io/api/doc#to_disk
""" """
path = util.ensure_path(path) path = util.ensure_path(path)
with path.open("wb") as file_: with path.open("wb") as file_:
file_.write(self.to_bytes(**exclude)) file_.write(self.to_bytes(**kwargs))
def from_disk(self, path, **exclude): def from_disk(self, path, **kwargs):
"""Loads state from a directory. Modifies the object in place and """Loads state from a directory. Modifies the object in place and
returns it. returns it.
path (unicode or Path): A path to a directory. Paths may be either path (unicode or Path): A path to a directory. Paths may be either
strings or `Path`-like objects. strings or `Path`-like objects.
exclude (list): String names of serialization fields to exclude.
RETURNS (Doc): The modified `Doc` object. RETURNS (Doc): The modified `Doc` object.
DOCS: https://spacy.io/api/doc#from_disk DOCS: https://spacy.io/api/doc#from_disk
@ -819,11 +821,12 @@ cdef class Doc:
path = util.ensure_path(path) path = util.ensure_path(path)
with path.open("rb") as file_: with path.open("rb") as file_:
bytes_data = file_.read() bytes_data = file_.read()
return self.from_bytes(bytes_data, **exclude) return self.from_bytes(bytes_data, **kwargs)
def to_bytes(self, **exclude): def to_bytes(self, exclude=tuple(), **kwargs):
"""Serialize, i.e. export the document contents to a binary string. """Serialize, i.e. export the document contents to a binary string.
exclude (list): String names of serialization fields to exclude.
RETURNS (bytes): A losslessly serialized copy of the `Doc`, including RETURNS (bytes): A losslessly serialized copy of the `Doc`, including
all annotations. all annotations.
@ -849,16 +852,22 @@ cdef class Doc:
"sentiment": lambda: self.sentiment, "sentiment": lambda: self.sentiment,
"tensor": lambda: self.tensor, "tensor": lambda: self.tensor,
} }
for key in kwargs:
if key in serializers or key in ("user_data", "user_data_keys", "user_data_values"):
raise ValueError(Errors.E128.format(arg=key))
if "user_data" not in exclude and self.user_data: if "user_data" not in exclude and self.user_data:
user_data_keys, user_data_values = list(zip(*self.user_data.items())) user_data_keys, user_data_values = list(zip(*self.user_data.items()))
serializers["user_data_keys"] = lambda: srsly.msgpack_dumps(user_data_keys) if "user_data_keys" not in exclude:
serializers["user_data_values"] = lambda: srsly.msgpack_dumps(user_data_values) serializers["user_data_keys"] = lambda: srsly.msgpack_dumps(user_data_keys)
if "user_data_values" not in exclude:
serializers["user_data_values"] = lambda: srsly.msgpack_dumps(user_data_values)
return util.to_bytes(serializers, exclude) return util.to_bytes(serializers, exclude)
def from_bytes(self, bytes_data, **exclude): def from_bytes(self, bytes_data, exclude=tuple(), **kwargs):
"""Deserialize, i.e. import the document contents from a binary string. """Deserialize, i.e. import the document contents from a binary string.
data (bytes): The string to load from. data (bytes): The string to load from.
exclude (list): String names of serialization fields to exclude.
RETURNS (Doc): Itself. RETURNS (Doc): Itself.
DOCS: https://spacy.io/api/doc#from_bytes DOCS: https://spacy.io/api/doc#from_bytes
@ -874,6 +883,9 @@ cdef class Doc:
"user_data_keys": lambda b: None, "user_data_keys": lambda b: None,
"user_data_values": lambda b: None, "user_data_values": lambda b: None,
} }
for key in kwargs:
if key in deserializers or key in ("user_data",):
raise ValueError(Errors.E128.format(arg=key))
msg = util.from_bytes(bytes_data, deserializers, exclude) msg = util.from_bytes(bytes_data, deserializers, exclude)
# Msgpack doesn't distinguish between lists and tuples, which is # Msgpack doesn't distinguish between lists and tuples, which is
# vexing for user data. As a best guess, we *know* that within # vexing for user data. As a best guess, we *know* that within
@ -1170,7 +1182,7 @@ cdef int [:,:] _get_lca_matrix(Doc doc, int start, int end):
def pickle_doc(doc): def pickle_doc(doc):
bytes_data = doc.to_bytes(vocab=False, user_data=False) bytes_data = doc.to_bytes(exclude=["vocab", "user_data"])
hooks_and_data = (doc.user_data, doc.user_hooks, doc.user_span_hooks, hooks_and_data = (doc.user_data, doc.user_hooks, doc.user_span_hooks,
doc.user_token_hooks) doc.user_token_hooks)
return (unpickle_doc, (doc.vocab, srsly.pickle_dumps(hooks_and_data), bytes_data)) return (unpickle_doc, (doc.vocab, srsly.pickle_dumps(hooks_and_data), bytes_data))
@ -1179,7 +1191,7 @@ def pickle_doc(doc):
def unpickle_doc(vocab, hooks_and_data, bytes_data): def unpickle_doc(vocab, hooks_and_data, bytes_data):
user_data, doc_hooks, span_hooks, token_hooks = srsly.pickle_loads(hooks_and_data) user_data, doc_hooks, span_hooks, token_hooks = srsly.pickle_loads(hooks_and_data)
doc = Doc(vocab, user_data=user_data).from_bytes(bytes_data, exclude="user_data") doc = Doc(vocab, user_data=user_data).from_bytes(bytes_data, exclude=["user_data"])
doc.user_hooks.update(doc_hooks) doc.user_hooks.update(doc_hooks)
doc.user_span_hooks.update(span_hooks) doc.user_span_hooks.update(span_hooks)
doc.user_token_hooks.update(token_hooks) doc.user_token_hooks.update(token_hooks)

View File

@ -25,7 +25,7 @@ except ImportError:
from .symbols import ORTH from .symbols import ORTH
from .compat import cupy, CudaStream, path2str, basestring_, unicode_ from .compat import cupy, CudaStream, path2str, basestring_, unicode_
from .compat import import_file from .compat import import_file
from .errors import Errors from .errors import Errors, Warnings, deprecation_warning
LANGUAGES = {} LANGUAGES = {}
@ -565,7 +565,8 @@ def itershuffle(iterable, bufsize=1000):
def to_bytes(getters, exclude): def to_bytes(getters, exclude):
serialized = OrderedDict() serialized = OrderedDict()
for key, getter in getters.items(): for key, getter in getters.items():
if key not in exclude: # Split to support file names like meta.json
if key.split(".")[0] not in exclude:
serialized[key] = getter() serialized[key] = getter()
return srsly.msgpack_dumps(serialized) return srsly.msgpack_dumps(serialized)
@ -573,7 +574,8 @@ def to_bytes(getters, exclude):
def from_bytes(bytes_data, setters, exclude): def from_bytes(bytes_data, setters, exclude):
msg = srsly.msgpack_loads(bytes_data) msg = srsly.msgpack_loads(bytes_data)
for key, setter in setters.items(): for key, setter in setters.items():
if key not in exclude and key in msg: # Split to support file names like meta.json
if key.split(".")[0] not in exclude and key in msg:
setter(msg[key]) setter(msg[key])
return msg return msg
@ -583,7 +585,8 @@ def to_disk(path, writers, exclude):
if not path.exists(): if not path.exists():
path.mkdir() path.mkdir()
for key, writer in writers.items(): for key, writer in writers.items():
if key not in exclude: # Split to support file names like meta.json
if key.split(".")[0] not in exclude:
writer(path / key) writer(path / key)
return path return path
@ -591,7 +594,8 @@ def to_disk(path, writers, exclude):
def from_disk(path, readers, exclude): def from_disk(path, readers, exclude):
path = ensure_path(path) path = ensure_path(path)
for key, reader in readers.items(): for key, reader in readers.items():
if key not in exclude: # Split to support file names like meta.json
if key.split(".")[0] not in exclude:
reader(path / key) reader(path / key)
return path return path
@ -677,6 +681,23 @@ def validate_json(data, validator):
return errors return errors
def get_serialization_exclude(serializers, exclude, kwargs):
"""Helper function to validate serialization args and manage transition from
keyword arguments (pre v2.1) to exclude argument.
"""
exclude = list(exclude)
# Split to support file names like meta.json
options = [name.split(".")[0] for name in serializers]
for key, value in kwargs.items():
if key in ("vocab",) and value is False:
deprecation_warning(Warnings.W015.format(arg=key))
exclude.append(key)
elif key.split(".")[0] in options:
raise ValueError(Errors.E128.format(arg=key))
# TODO: user warning?
return exclude
class SimpleFrozenDict(dict): class SimpleFrozenDict(dict):
"""Simplified implementation of a frozen dict, mainly used as default """Simplified implementation of a frozen dict, mainly used as default
function or method argument (for arguments that should default to empty function or method argument (for arguments that should default to empty
@ -696,14 +717,14 @@ class SimpleFrozenDict(dict):
class DummyTokenizer(object): class DummyTokenizer(object):
# add dummy methods for to_bytes, from_bytes, to_disk and from_disk to # add dummy methods for to_bytes, from_bytes, to_disk and from_disk to
# allow serialization (see #1557) # allow serialization (see #1557)
def to_bytes(self, **exclude): def to_bytes(self, **kwargs):
return b"" return b""
def from_bytes(self, _bytes_data, **exclude): def from_bytes(self, _bytes_data, **kwargs):
return self return self
def to_disk(self, _path, **exclude): def to_disk(self, _path, **kwargs):
return None return None
def from_disk(self, _path, **exclude): def from_disk(self, _path, **kwargs):
return self return self

View File

@ -377,11 +377,11 @@ cdef class Vectors:
self.add(key, row=i) self.add(key, row=i)
return strings return strings
def to_disk(self, path, **exclude): def to_disk(self, path, **kwargs):
"""Save the current state to a directory. """Save the current state to a directory.
path (unicode / Path): A path to a directory, which will be created if path (unicode / Path): A path to a directory, which will be created if
it doesn't exists. Either a string or a Path-like object. it doesn't exists.
DOCS: https://spacy.io/api/vectors#to_disk DOCS: https://spacy.io/api/vectors#to_disk
""" """
@ -394,9 +394,9 @@ cdef class Vectors:
("vectors", lambda p: save_array(self.data, p.open("wb"))), ("vectors", lambda p: save_array(self.data, p.open("wb"))),
("key2row", lambda p: srsly.write_msgpack(p, self.key2row)) ("key2row", lambda p: srsly.write_msgpack(p, self.key2row))
)) ))
return util.to_disk(path, serializers, exclude) return util.to_disk(path, serializers, [])
def from_disk(self, path, **exclude): def from_disk(self, path, **kwargs):
"""Loads state from a directory. Modifies the object in place and """Loads state from a directory. Modifies the object in place and
returns it. returns it.
@ -428,13 +428,13 @@ cdef class Vectors:
("keys", load_keys), ("keys", load_keys),
("vectors", load_vectors), ("vectors", load_vectors),
)) ))
util.from_disk(path, serializers, exclude) util.from_disk(path, serializers, [])
return self return self
def to_bytes(self, **exclude): def to_bytes(self, **kwargs):
"""Serialize the current state to a binary string. """Serialize the current state to a binary string.
**exclude: Named attributes to prevent from being serialized. exclude (list): String names of serialization fields to exclude.
RETURNS (bytes): The serialized form of the `Vectors` object. RETURNS (bytes): The serialized form of the `Vectors` object.
DOCS: https://spacy.io/api/vectors#to_bytes DOCS: https://spacy.io/api/vectors#to_bytes
@ -444,17 +444,18 @@ cdef class Vectors:
return self.data.to_bytes() return self.data.to_bytes()
else: else:
return srsly.msgpack_dumps(self.data) return srsly.msgpack_dumps(self.data)
serializers = OrderedDict(( serializers = OrderedDict((
("key2row", lambda: srsly.msgpack_dumps(self.key2row)), ("key2row", lambda: srsly.msgpack_dumps(self.key2row)),
("vectors", serialize_weights) ("vectors", serialize_weights)
)) ))
return util.to_bytes(serializers, exclude) return util.to_bytes(serializers, [])
def from_bytes(self, data, **exclude): def from_bytes(self, data, **kwargs):
"""Load state from a binary string. """Load state from a binary string.
data (bytes): The data to load from. data (bytes): The data to load from.
**exclude: Named attributes to prevent from being loaded. exclude (list): String names of serialization fields to exclude.
RETURNS (Vectors): The `Vectors` object. RETURNS (Vectors): The `Vectors` object.
DOCS: https://spacy.io/api/vectors#from_bytes DOCS: https://spacy.io/api/vectors#from_bytes
@ -469,5 +470,5 @@ cdef class Vectors:
("key2row", lambda b: self.key2row.update(srsly.msgpack_loads(b))), ("key2row", lambda b: self.key2row.update(srsly.msgpack_loads(b))),
("vectors", deserialize_weights) ("vectors", deserialize_weights)
)) ))
util.from_bytes(data, deserializers, exclude) util.from_bytes(data, deserializers, [])
return self return self

View File

@ -397,47 +397,57 @@ cdef class Vocab:
orth = self.strings.add(orth) orth = self.strings.add(orth)
return orth in self.vectors return orth in self.vectors
def to_disk(self, path, **exclude): def to_disk(self, path, exclude=tuple(), **kwargs):
"""Save the current state to a directory. """Save the current state to a directory.
path (unicode or Path): A path to a directory, which will be created if path (unicode or Path): A path to a directory, which will be created if
it doesn't exist. Paths may be either strings or Path-like objects. it doesn't exist.
exclude (list): String names of serialization fields to exclude.
DOCS: https://spacy.io/api/vocab#to_disk DOCS: https://spacy.io/api/vocab#to_disk
""" """
path = util.ensure_path(path) path = util.ensure_path(path)
if not path.exists(): if not path.exists():
path.mkdir() path.mkdir()
self.strings.to_disk(path / "strings.json") setters = ["strings", "lexemes", "vectors"]
with (path / "lexemes.bin").open('wb') as file_: exclude = util.get_serialization_exclude(setters, exclude, kwargs)
file_.write(self.lexemes_to_bytes()) if "strings" not in exclude:
if self.vectors is not None: self.strings.to_disk(path / "strings.json")
if "lexemes" not in exclude:
with (path / "lexemes.bin").open("wb") as file_:
file_.write(self.lexemes_to_bytes())
if "vectors" not in "exclude" and self.vectors is not None:
self.vectors.to_disk(path) self.vectors.to_disk(path)
def from_disk(self, path, **exclude): def from_disk(self, path, exclude=tuple(), **kwargs):
"""Loads state from a directory. Modifies the object in place and """Loads state from a directory. Modifies the object in place and
returns it. returns it.
path (unicode or Path): A path to a directory. Paths may be either path (unicode or Path): A path to a directory.
strings or `Path`-like objects. exclude (list): String names of serialization fields to exclude.
RETURNS (Vocab): The modified `Vocab` object. RETURNS (Vocab): The modified `Vocab` object.
DOCS: https://spacy.io/api/vocab#to_disk DOCS: https://spacy.io/api/vocab#to_disk
""" """
path = util.ensure_path(path) path = util.ensure_path(path)
self.strings.from_disk(path / "strings.json") getters = ["strings", "lexemes", "vectors"]
with (path / "lexemes.bin").open("rb") as file_: exclude = util.get_serialization_exclude(getters, exclude, kwargs)
self.lexemes_from_bytes(file_.read()) if "strings" not in exclude:
if self.vectors is not None: self.strings.from_disk(path / "strings.json") # TODO: add exclude?
self.vectors.from_disk(path, exclude="strings.json") if "lexemes" not in exclude:
if self.vectors.name is not None: with (path / "lexemes.bin").open("rb") as file_:
link_vectors_to_models(self) self.lexemes_from_bytes(file_.read())
if "vectors" not in exclude:
if self.vectors is not None:
self.vectors.from_disk(path, exclude=["strings"])
if self.vectors.name is not None:
link_vectors_to_models(self)
return self return self
def to_bytes(self, **exclude): def to_bytes(self, exclude=tuple(), **kwargs):
"""Serialize the current state to a binary string. """Serialize the current state to a binary string.
**exclude: Named attributes to prevent from being serialized. exclude (list): String names of serialization fields to exclude.
RETURNS (bytes): The serialized form of the `Vocab` object. RETURNS (bytes): The serialized form of the `Vocab` object.
DOCS: https://spacy.io/api/vocab#to_bytes DOCS: https://spacy.io/api/vocab#to_bytes
@ -453,13 +463,14 @@ cdef class Vocab:
("lexemes", lambda: self.lexemes_to_bytes()), ("lexemes", lambda: self.lexemes_to_bytes()),
("vectors", deserialize_vectors) ("vectors", deserialize_vectors)
)) ))
exclude = util.get_serialization_exclude(getters, exclude, kwargs)
return util.to_bytes(getters, exclude) return util.to_bytes(getters, exclude)
def from_bytes(self, bytes_data, **exclude): def from_bytes(self, bytes_data, exclude=tuple(), **kwargs):
"""Load state from a binary string. """Load state from a binary string.
bytes_data (bytes): The data to load from. bytes_data (bytes): The data to load from.
**exclude: Named attributes to prevent from being loaded. exclude (list): String names of serialization fields to exclude.
RETURNS (Vocab): The `Vocab` object. RETURNS (Vocab): The `Vocab` object.
DOCS: https://spacy.io/api/vocab#from_bytes DOCS: https://spacy.io/api/vocab#from_bytes
@ -469,11 +480,13 @@ cdef class Vocab:
return None return None
else: else:
return self.vectors.from_bytes(b) return self.vectors.from_bytes(b)
setters = OrderedDict(( setters = OrderedDict((
("strings", lambda b: self.strings.from_bytes(b)), ("strings", lambda b: self.strings.from_bytes(b)),
("lexemes", lambda b: self.lexemes_from_bytes(b)), ("lexemes", lambda b: self.lexemes_from_bytes(b)),
("vectors", lambda b: serialize_vectors(b)) ("vectors", lambda b: serialize_vectors(b))
)) ))
exclude = util.get_serialization_exclude(setters, exclude, kwargs)
util.from_bytes(bytes_data, setters, exclude) util.from_bytes(bytes_data, setters, exclude)
if self.vectors.name is not None: if self.vectors.name is not None:
link_vectors_to_models(self) link_vectors_to_models(self)

View File

@ -244,9 +244,10 @@ Serialize the pipe to disk.
> parser.to_disk("/path/to/parser") > parser.to_disk("/path/to/parser")
> ``` > ```
| Name | Type | Description | | Name | Type | Description |
| ------ | ---------------- | --------------------------------------------------------------------------------------------------------------------- | | --------- | ---------------- | --------------------------------------------------------------------------------------------------------------------- |
| `path` | unicode / `Path` | A path to a directory, which will be created if it doesn't exist. Paths may be either strings or `Path`-like objects. | | `path` | unicode / `Path` | A path to a directory, which will be created if it doesn't exist. Paths may be either strings or `Path`-like objects. |
| `exclude` | list | String names of [serialization fields](#serialization-fields) to exclude. |
## DependencyParser.from_disk {#from_disk tag="method"} ## DependencyParser.from_disk {#from_disk tag="method"}
@ -262,6 +263,7 @@ Load the pipe from disk. Modifies the object in place and returns it.
| Name | Type | Description | | Name | Type | Description |
| ----------- | ------------------ | -------------------------------------------------------------------------- | | ----------- | ------------------ | -------------------------------------------------------------------------- |
| `path` | unicode / `Path` | A path to a directory. Paths may be either strings or `Path`-like objects. | | `path` | unicode / `Path` | A path to a directory. Paths may be either strings or `Path`-like objects. |
| `exclude` | list | String names of [serialization fields](#serialization-fields) to exclude. |
| **RETURNS** | `DependencyParser` | The modified `DependencyParser` object. | | **RETURNS** | `DependencyParser` | The modified `DependencyParser` object. |
## DependencyParser.to_bytes {#to_bytes tag="method"} ## DependencyParser.to_bytes {#to_bytes tag="method"}
@ -275,10 +277,10 @@ Load the pipe from disk. Modifies the object in place and returns it.
Serialize the pipe to a bytestring. Serialize the pipe to a bytestring.
| Name | Type | Description | | Name | Type | Description |
| ----------- | ----- | ----------------------------------------------------- | | ----------- | ----- | ------------------------------------------------------------------------- |
| `**exclude` | - | Named attributes to prevent from being serialized. | | `exclude` | list | String names of [serialization fields](#serialization-fields) to exclude. |
| **RETURNS** | bytes | The serialized form of the `DependencyParser` object. | | **RETURNS** | bytes | The serialized form of the `DependencyParser` object. |
## DependencyParser.from_bytes {#from_bytes tag="method"} ## DependencyParser.from_bytes {#from_bytes tag="method"}
@ -292,11 +294,11 @@ Load the pipe from a bytestring. Modifies the object in place and returns it.
> parser.from_bytes(parser_bytes) > parser.from_bytes(parser_bytes)
> ``` > ```
| Name | Type | Description | | Name | Type | Description |
| ------------ | ------------------ | ---------------------------------------------- | | ------------ | ------------------ | ------------------------------------------------------------------------- |
| `bytes_data` | bytes | The data to load from. | | `bytes_data` | bytes | The data to load from. |
| `**exclude` | - | Named attributes to prevent from being loaded. | | `exclude` | list | String names of [serialization fields](#serialization-fields) to exclude. |
| **RETURNS** | `DependencyParser` | The `DependencyParser` object. | | **RETURNS** | `DependencyParser` | The `DependencyParser` object. |
## DependencyParser.labels {#labels tag="property"} ## DependencyParser.labels {#labels tag="property"}
@ -312,3 +314,21 @@ The labels currently added to the component.
| Name | Type | Description | | Name | Type | Description |
| ----------- | ----- | ---------------------------------- | | ----------- | ----- | ---------------------------------- |
| **RETURNS** | tuple | The labels added to the component. | | **RETURNS** | tuple | The labels added to the component. |
## Serialization fields {#serialization-fields}
During serialization, spaCy will export several data fields used to restore
different aspects of the object. If needed, you can exclude them from
serialization by passing in the string names via the `exclude` argument.
> #### Example
>
> ```python
> data = parser.to_disk("/path", exclude=["vocab"])
> ```
| Name | Description |
| ------- | -------------------------------------------------------------- |
| `vocab` | The shared [`Vocab`](/api/vocab). |
| `cfg` | The config file. You usually don't want to exclude this. |
| `model` | The binary model data. You usually don't want to exclude this. |

View File

@ -349,11 +349,12 @@ array of attributes.
> assert doc[0].pos_ == doc2[0].pos_ > assert doc[0].pos_ == doc2[0].pos_
> ``` > ```
| Name | Type | Description | | Name | Type | Description |
| ----------- | -------------------------------------- | ----------------------------- | | ----------- | -------------------------------------- | ------------------------------------------------------------------------- |
| `attrs` | list | A list of attribute ID ints. | | `attrs` | list | A list of attribute ID ints. |
| `array` | `numpy.ndarray[ndim=2, dtype='int32']` | The attribute values to load. | | `array` | `numpy.ndarray[ndim=2, dtype='int32']` | The attribute values to load. |
| **RETURNS** | `Doc` | Itself. | | `exclude` | list | String names of [serialization fields](#serialization-fields) to exclude. |
| **RETURNS** | `Doc` | Itself. |
## Doc.to_disk {#to_disk tag="method" new="2"} ## Doc.to_disk {#to_disk tag="method" new="2"}
@ -365,9 +366,10 @@ Save the current state to a directory.
> doc.to_disk("/path/to/doc") > doc.to_disk("/path/to/doc")
> ``` > ```
| Name | Type | Description | | Name | Type | Description |
| ------ | ---------------- | --------------------------------------------------------------------------------------------------------------------- | | --------- | ---------------- | --------------------------------------------------------------------------------------------------------------------- |
| `path` | unicode / `Path` | A path to a directory, which will be created if it doesn't exist. Paths may be either strings or `Path`-like objects. | | `path` | unicode / `Path` | A path to a directory, which will be created if it doesn't exist. Paths may be either strings or `Path`-like objects. |
| `exclude` | list | String names of [serialization fields](#serialization-fields) to exclude. |
## Doc.from_disk {#from_disk tag="method" new="2"} ## Doc.from_disk {#from_disk tag="method" new="2"}
@ -384,6 +386,7 @@ Loads state from a directory. Modifies the object in place and returns it.
| Name | Type | Description | | Name | Type | Description |
| ----------- | ---------------- | -------------------------------------------------------------------------- | | ----------- | ---------------- | -------------------------------------------------------------------------- |
| `path` | unicode / `Path` | A path to a directory. Paths may be either strings or `Path`-like objects. | | `path` | unicode / `Path` | A path to a directory. Paths may be either strings or `Path`-like objects. |
| `exclude` | list | String names of [serialization fields](#serialization-fields) to exclude. |
| **RETURNS** | `Doc` | The modified `Doc` object. | | **RETURNS** | `Doc` | The modified `Doc` object. |
## Doc.to_bytes {#to_bytes tag="method"} ## Doc.to_bytes {#to_bytes tag="method"}
@ -397,9 +400,10 @@ Serialize, i.e. export the document contents to a binary string.
> doc_bytes = doc.to_bytes() > doc_bytes = doc.to_bytes()
> ``` > ```
| Name | Type | Description | | Name | Type | Description |
| ----------- | ----- | --------------------------------------------------------------------- | | ----------- | ----- | ------------------------------------------------------------------------- |
| **RETURNS** | bytes | A losslessly serialized copy of the `Doc`, including all annotations. | | `exclude` | list | String names of [serialization fields](#serialization-fields) to exclude. |
| **RETURNS** | bytes | A losslessly serialized copy of the `Doc`, including all annotations. |
## Doc.from_bytes {#from_bytes tag="method"} ## Doc.from_bytes {#from_bytes tag="method"}
@ -416,10 +420,11 @@ Deserialize, i.e. import the document contents from a binary string.
> assert doc.text == doc2.text > assert doc.text == doc2.text
> ``` > ```
| Name | Type | Description | | Name | Type | Description |
| ----------- | ----- | ------------------------ | | ----------- | ----- | ------------------------------------------------------------------------- |
| `data` | bytes | The string to load from. | | `data` | bytes | The string to load from. |
| **RETURNS** | `Doc` | The `Doc` object. | | `exclude` | list | String names of [serialization fields](#serialization-fields) to exclude. |
| **RETURNS** | `Doc` | The `Doc` object. |
## Doc.retokenize {#retokenize tag="contextmanager" new="2.1"} ## Doc.retokenize {#retokenize tag="contextmanager" new="2.1"}
@ -658,3 +663,25 @@ The L2 norm of the document's vector representation.
| `user_token_hooks` | dict | A dictionary that allows customization of properties of `Token` children. | | `user_token_hooks` | dict | A dictionary that allows customization of properties of `Token` children. |
| `user_span_hooks` | dict | A dictionary that allows customization of properties of `Span` children. | | `user_span_hooks` | dict | A dictionary that allows customization of properties of `Span` children. |
| `_` | `Underscore` | User space for adding custom [attribute extensions](/usage/processing-pipelines#custom-components-attributes). | | `_` | `Underscore` | User space for adding custom [attribute extensions](/usage/processing-pipelines#custom-components-attributes). |
## Serialization fields {#serialization-fields}
During serialization, spaCy will export several data fields used to restore
different aspects of the object. If needed, you can exclude them from
serialization by passing in the string names via the `exclude` argument.
> #### Example
>
> ```python
> data = doc.to_bytes(exclude=["text", "tensor"])
> doc.from_disk("./doc.bin", exclude=["user_data"])
> ```
| Name | Description |
| ------------------ | --------------------------------------------- |
| `text` | The value of the `Doc.text` attribute. |
| `sentiment` | The value of the `Doc.sentiment` attribute. |
| `tensor` | The value of the `Doc.tensor` attribute. |
| `user_data` | The value of the `Doc.user_data` dictionary. |
| `user_data_keys` | The keys of the `Doc.user_data` dictionary. |
| `user_data_values` | The values of the `Doc.user_data` dictionary. |

View File

@ -244,9 +244,10 @@ Serialize the pipe to disk.
> ner.to_disk("/path/to/ner") > ner.to_disk("/path/to/ner")
> ``` > ```
| Name | Type | Description | | Name | Type | Description |
| ------ | ---------------- | --------------------------------------------------------------------------------------------------------------------- | | --------- | ---------------- | --------------------------------------------------------------------------------------------------------------------- |
| `path` | unicode / `Path` | A path to a directory, which will be created if it doesn't exist. Paths may be either strings or `Path`-like objects. | | `path` | unicode / `Path` | A path to a directory, which will be created if it doesn't exist. Paths may be either strings or `Path`-like objects. |
| `exclude` | list | String names of [serialization fields](#serialization-fields) to exclude. |
## EntityRecognizer.from_disk {#from_disk tag="method"} ## EntityRecognizer.from_disk {#from_disk tag="method"}
@ -262,6 +263,7 @@ Load the pipe from disk. Modifies the object in place and returns it.
| Name | Type | Description | | Name | Type | Description |
| ----------- | ------------------ | -------------------------------------------------------------------------- | | ----------- | ------------------ | -------------------------------------------------------------------------- |
| `path` | unicode / `Path` | A path to a directory. Paths may be either strings or `Path`-like objects. | | `path` | unicode / `Path` | A path to a directory. Paths may be either strings or `Path`-like objects. |
| `exclude` | list | String names of [serialization fields](#serialization-fields) to exclude. |
| **RETURNS** | `EntityRecognizer` | The modified `EntityRecognizer` object. | | **RETURNS** | `EntityRecognizer` | The modified `EntityRecognizer` object. |
## EntityRecognizer.to_bytes {#to_bytes tag="method"} ## EntityRecognizer.to_bytes {#to_bytes tag="method"}
@ -275,10 +277,10 @@ Load the pipe from disk. Modifies the object in place and returns it.
Serialize the pipe to a bytestring. Serialize the pipe to a bytestring.
| Name | Type | Description | | Name | Type | Description |
| ----------- | ----- | ----------------------------------------------------- | | ----------- | ----- | ------------------------------------------------------------------------- |
| `**exclude` | - | Named attributes to prevent from being serialized. | | `exclude` | list | String names of [serialization fields](#serialization-fields) to exclude. |
| **RETURNS** | bytes | The serialized form of the `EntityRecognizer` object. | | **RETURNS** | bytes | The serialized form of the `EntityRecognizer` object. |
## EntityRecognizer.from_bytes {#from_bytes tag="method"} ## EntityRecognizer.from_bytes {#from_bytes tag="method"}
@ -292,11 +294,11 @@ Load the pipe from a bytestring. Modifies the object in place and returns it.
> ner.from_bytes(ner_bytes) > ner.from_bytes(ner_bytes)
> ``` > ```
| Name | Type | Description | | Name | Type | Description |
| ------------ | ------------------ | ---------------------------------------------- | | ------------ | ------------------ | ------------------------------------------------------------------------- |
| `bytes_data` | bytes | The data to load from. | | `bytes_data` | bytes | The data to load from. |
| `**exclude` | - | Named attributes to prevent from being loaded. | | `exclude` | list | String names of [serialization fields](#serialization-fields) to exclude. |
| **RETURNS** | `EntityRecognizer` | The `EntityRecognizer` object. | | **RETURNS** | `EntityRecognizer` | The `EntityRecognizer` object. |
## EntityRecognizer.labels {#labels tag="property"} ## EntityRecognizer.labels {#labels tag="property"}
@ -312,3 +314,21 @@ The labels currently added to the component.
| Name | Type | Description | | Name | Type | Description |
| ----------- | ----- | ---------------------------------- | | ----------- | ----- | ---------------------------------- |
| **RETURNS** | tuple | The labels added to the component. | | **RETURNS** | tuple | The labels added to the component. |
## Serialization fields {#serialization-fields}
During serialization, spaCy will export several data fields used to restore
different aspects of the object. If needed, you can exclude them from
serialization by passing in the string names via the `exclude` argument.
> #### Example
>
> ```python
> data = ner.to_disk("/path", exclude=["vocab"])
> ```
| Name | Description |
| ------- | -------------------------------------------------------------- |
| `vocab` | The shared [`Vocab`](/api/vocab). |
| `cfg` | The config file. You usually don't want to exclude this. |
| `model` | The binary model data. You usually don't want to exclude this. |

View File

@ -327,7 +327,7 @@ the model**.
| Name | Type | Description | | Name | Type | Description |
| --------- | ---------------- | --------------------------------------------------------------------------------------------------------------------- | | --------- | ---------------- | --------------------------------------------------------------------------------------------------------------------- |
| `path` | unicode / `Path` | A path to a directory, which will be created if it doesn't exist. Paths may be either strings or `Path`-like objects. | | `path` | unicode / `Path` | A path to a directory, which will be created if it doesn't exist. Paths may be either strings or `Path`-like objects. |
| `disable` | list | Names of pipeline components to [disable](/usage/processing-pipelines#disabling) and prevent from being saved. | | `exclude` | list | Names of pipeline components or [serialization fields](#serialization-fields) to exclude. |
## Language.from_disk {#from_disk tag="method" new="2"} ## Language.from_disk {#from_disk tag="method" new="2"}
@ -349,22 +349,22 @@ loaded object.
> nlp = English().from_disk("/path/to/en_model") > nlp = English().from_disk("/path/to/en_model")
> ``` > ```
| Name | Type | Description | | Name | Type | Description |
| ----------- | ---------------- | --------------------------------------------------------------------------------- | | ----------- | ---------------- | ----------------------------------------------------------------------------------------- |
| `path` | unicode / `Path` | A path to a directory. Paths may be either strings or `Path`-like objects. | | `path` | unicode / `Path` | A path to a directory. Paths may be either strings or `Path`-like objects. |
| `disable` | list | Names of pipeline components to [disable](/usage/processing-pipelines#disabling). | | `exclude` | list | Names of pipeline components or [serialization fields](#serialization-fields) to exclude. |
| **RETURNS** | `Language` | The modified `Language` object. | | **RETURNS** | `Language` | The modified `Language` object. |
<Infobox title="Changed in v2.0" variant="warning"> <Infobox title="Changed in v2.0" variant="warning">
As of spaCy v2.0, the `save_to_directory` method has been renamed to `to_disk`, As of spaCy v2.0, the `save_to_directory` method has been renamed to `to_disk`,
to improve consistency across classes. Pipeline components to prevent from being to improve consistency across classes. Pipeline components to prevent from being
loaded can now be added as a list to `disable`, instead of specifying one loaded can now be added as a list to `disable` (v2.0) or `exclude` (v2.1),
keyword argument per component. instead of specifying one keyword argument per component.
```diff ```diff
- nlp = spacy.load("en", tagger=False, entity=False) - nlp = spacy.load("en", tagger=False, entity=False)
+ nlp = English().from_disk("/model", disable=["tagger', 'ner"]) + nlp = English().from_disk("/model", exclude=["tagger", "ner"])
``` ```
</Infobox> </Infobox>
@ -379,10 +379,10 @@ Serialize the current state to a binary string.
> nlp_bytes = nlp.to_bytes() > nlp_bytes = nlp.to_bytes()
> ``` > ```
| Name | Type | Description | | Name | Type | Description |
| ----------- | ----- | ------------------------------------------------------------------------------------------------------------------- | | ----------- | ----- | ----------------------------------------------------------------------------------------- |
| `disable` | list | Names of pipeline components to [disable](/usage/processing-pipelines#disabling) and prevent from being serialized. | | `exclude` | list | Names of pipeline components or [serialization fields](#serialization-fields) to exclude. |
| **RETURNS** | bytes | The serialized form of the `Language` object. | | **RETURNS** | bytes | The serialized form of the `Language` object. |
## Language.from_bytes {#from_bytes tag="method"} ## Language.from_bytes {#from_bytes tag="method"}
@ -400,20 +400,21 @@ available to the loaded object.
> nlp2.from_bytes(nlp_bytes) > nlp2.from_bytes(nlp_bytes)
> ``` > ```
| Name | Type | Description | | Name | Type | Description |
| ------------ | ---------- | --------------------------------------------------------------------------------- | | ------------ | ---------- | ----------------------------------------------------------------------------------------- |
| `bytes_data` | bytes | The data to load from. | | `bytes_data` | bytes | The data to load from. |
| `disable` | list | Names of pipeline components to [disable](/usage/processing-pipelines#disabling). | | `exclude` | list | Names of pipeline components or [serialization fields](#serialization-fields) to exclude. |
| **RETURNS** | `Language` | The `Language` object. | | **RETURNS** | `Language` | The `Language` object. |
<Infobox title="Changed in v2.0" variant="warning"> <Infobox title="Changed in v2.0" variant="warning">
Pipeline components to prevent from being loaded can now be added as a list to Pipeline components to prevent from being loaded can now be added as a list to
`disable`, instead of specifying one keyword argument per component. `disable` (v2.0) or `exclude` (v2.1), instead of specifying one keyword argument
per component.
```diff ```diff
- nlp = English().from_bytes(bytes, tagger=False, entity=False) - nlp = English().from_bytes(bytes, tagger=False, entity=False)
+ nlp = English().from_bytes(bytes, disable=["tagger", "ner"]) + nlp = English().from_bytes(bytes, exclude=["tagger", "ner"])
``` ```
</Infobox> </Infobox>
@ -437,3 +438,23 @@ Pipeline components to prevent from being loaded can now be added as a list to
| `Defaults` | class | Settings, data and factory methods for creating the `nlp` object and processing pipeline. | | `Defaults` | class | Settings, data and factory methods for creating the `nlp` object and processing pipeline. |
| `lang` | unicode | Two-letter language ID, i.e. [ISO code](https://en.wikipedia.org/wiki/List_of_ISO_639-1_codes). | | `lang` | unicode | Two-letter language ID, i.e. [ISO code](https://en.wikipedia.org/wiki/List_of_ISO_639-1_codes). |
| `factories` <Tag variant="new">2</Tag> | dict | Factories that create pre-defined pipeline components, e.g. the tagger, parser or entity recognizer, keyed by their component name. | | `factories` <Tag variant="new">2</Tag> | dict | Factories that create pre-defined pipeline components, e.g. the tagger, parser or entity recognizer, keyed by their component name. |
## Serialization fields {#serialization-fields}
During serialization, spaCy will export several data fields used to restore
different aspects of the object. If needed, you can exclude them from
serialization by passing in the string names via the `exclude` argument.
> #### Example
>
> ```python
> data = nlp.to_bytes(exclude=["tokenizer", "vocab"])
> nlp.from_disk("./model-data", exclude=["ner"])
> ```
| Name | Description |
| ----------- | -------------------------------------------------- |
| `vocab` | The shared [`Vocab`](/api/vocab). |
| `tokenizer` | Tokenization rules and exceptions. |
| `meta` | The meta data, available as `Language.meta`. |
| ... | String names of pipeline components, e.g. `"ner"`. |

View File

@ -151,10 +151,9 @@ Serialize the current state to a binary string.
> store_bytes = stringstore.to_bytes() > store_bytes = stringstore.to_bytes()
> ``` > ```
| Name | Type | Description | | Name | Type | Description |
| ----------- | ----- | -------------------------------------------------- | | ----------- | ----- | ------------------------------------------------ |
| `**exclude` | - | Named attributes to prevent from being serialized. | | **RETURNS** | bytes | The serialized form of the `StringStore` object. |
| **RETURNS** | bytes | The serialized form of the `StringStore` object. |
## StringStore.from_bytes {#from_bytes tag="method"} ## StringStore.from_bytes {#from_bytes tag="method"}
@ -168,11 +167,10 @@ Load state from a binary string.
> new_store = StringStore().from_bytes(store_bytes) > new_store = StringStore().from_bytes(store_bytes)
> ``` > ```
| Name | Type | Description | | Name | Type | Description |
| ------------ | ------------- | ---------------------------------------------- | | ------------ | ------------- | ------------------------- |
| `bytes_data` | bytes | The data to load from. | | `bytes_data` | bytes | The data to load from. |
| `**exclude` | - | Named attributes to prevent from being loaded. | | **RETURNS** | `StringStore` | The `StringStore` object. |
| **RETURNS** | `StringStore` | The `StringStore` object. |
## Utilities {#util} ## Utilities {#util}

View File

@ -244,9 +244,10 @@ Serialize the pipe to disk.
> tagger.to_disk("/path/to/tagger") > tagger.to_disk("/path/to/tagger")
> ``` > ```
| Name | Type | Description | | Name | Type | Description |
| ------ | ---------------- | --------------------------------------------------------------------------------------------------------------------- | | --------- | ---------------- | --------------------------------------------------------------------------------------------------------------------- |
| `path` | unicode / `Path` | A path to a directory, which will be created if it doesn't exist. Paths may be either strings or `Path`-like objects. | | `path` | unicode / `Path` | A path to a directory, which will be created if it doesn't exist. Paths may be either strings or `Path`-like objects. |
| `exclude` | list | String names of [serialization fields](#serialization-fields) to exclude. |
## Tagger.from_disk {#from_disk tag="method"} ## Tagger.from_disk {#from_disk tag="method"}
@ -262,6 +263,7 @@ Load the pipe from disk. Modifies the object in place and returns it.
| Name | Type | Description | | Name | Type | Description |
| ----------- | ---------------- | -------------------------------------------------------------------------- | | ----------- | ---------------- | -------------------------------------------------------------------------- |
| `path` | unicode / `Path` | A path to a directory. Paths may be either strings or `Path`-like objects. | | `path` | unicode / `Path` | A path to a directory. Paths may be either strings or `Path`-like objects. |
| `exclude` | list | String names of [serialization fields](#serialization-fields) to exclude. |
| **RETURNS** | `Tagger` | The modified `Tagger` object. | | **RETURNS** | `Tagger` | The modified `Tagger` object. |
## Tagger.to_bytes {#to_bytes tag="method"} ## Tagger.to_bytes {#to_bytes tag="method"}
@ -275,10 +277,10 @@ Load the pipe from disk. Modifies the object in place and returns it.
Serialize the pipe to a bytestring. Serialize the pipe to a bytestring.
| Name | Type | Description | | Name | Type | Description |
| ----------- | ----- | -------------------------------------------------- | | ----------- | ----- | ------------------------------------------------------------------------- |
| `**exclude` | - | Named attributes to prevent from being serialized. | | `exclude` | list | String names of [serialization fields](#serialization-fields) to exclude. |
| **RETURNS** | bytes | The serialized form of the `Tagger` object. | | **RETURNS** | bytes | The serialized form of the `Tagger` object. |
## Tagger.from_bytes {#from_bytes tag="method"} ## Tagger.from_bytes {#from_bytes tag="method"}
@ -292,11 +294,11 @@ Load the pipe from a bytestring. Modifies the object in place and returns it.
> tagger.from_bytes(tagger_bytes) > tagger.from_bytes(tagger_bytes)
> ``` > ```
| Name | Type | Description | | Name | Type | Description |
| ------------ | -------- | ---------------------------------------------- | | ------------ | -------- | ------------------------------------------------------------------------- |
| `bytes_data` | bytes | The data to load from. | | `bytes_data` | bytes | The data to load from. |
| `**exclude` | - | Named attributes to prevent from being loaded. | | `exclude` | list | String names of [serialization fields](#serialization-fields) to exclude. |
| **RETURNS** | `Tagger` | The `Tagger` object. | | **RETURNS** | `Tagger` | The `Tagger` object. |
## Tagger.labels {#labels tag="property"} ## Tagger.labels {#labels tag="property"}
@ -314,3 +316,22 @@ tags by default, e.g. `VERB`, `NOUN` and so on.
| Name | Type | Description | | Name | Type | Description |
| ----------- | ----- | ---------------------------------- | | ----------- | ----- | ---------------------------------- |
| **RETURNS** | tuple | The labels added to the component. | | **RETURNS** | tuple | The labels added to the component. |
## Serialization fields {#serialization-fields}
During serialization, spaCy will export several data fields used to restore
different aspects of the object. If needed, you can exclude them from
serialization by passing in the string names via the `exclude` argument.
> #### Example
>
> ```python
> data = tagger.to_disk("/path", exclude=["vocab"])
> ```
| Name | Description |
| --------- | ------------------------------------------------------------------------------------------ |
| `vocab` | The shared [`Vocab`](/api/vocab). |
| `cfg` | The config file. You usually don't want to exclude this. |
| `model` | The binary model data. You usually don't want to exclude this. |
| `tag_map` | The [tag map](/usage/adding-languages#tag-map) mapping fine-grained to coarse-grained tag. |

View File

@ -260,9 +260,10 @@ Serialize the pipe to disk.
> textcat.to_disk("/path/to/textcat") > textcat.to_disk("/path/to/textcat")
> ``` > ```
| Name | Type | Description | | Name | Type | Description |
| ------ | ---------------- | --------------------------------------------------------------------------------------------------------------------- | | --------- | ---------------- | --------------------------------------------------------------------------------------------------------------------- |
| `path` | unicode / `Path` | A path to a directory, which will be created if it doesn't exist. Paths may be either strings or `Path`-like objects. | | `path` | unicode / `Path` | A path to a directory, which will be created if it doesn't exist. Paths may be either strings or `Path`-like objects. |
| `exclude` | list | String names of [serialization fields](#serialization-fields) to exclude. |
## TextCategorizer.from_disk {#from_disk tag="method"} ## TextCategorizer.from_disk {#from_disk tag="method"}
@ -278,6 +279,7 @@ Load the pipe from disk. Modifies the object in place and returns it.
| Name | Type | Description | | Name | Type | Description |
| ----------- | ----------------- | -------------------------------------------------------------------------- | | ----------- | ----------------- | -------------------------------------------------------------------------- |
| `path` | unicode / `Path` | A path to a directory. Paths may be either strings or `Path`-like objects. | | `path` | unicode / `Path` | A path to a directory. Paths may be either strings or `Path`-like objects. |
| `exclude` | list | String names of [serialization fields](#serialization-fields) to exclude. |
| **RETURNS** | `TextCategorizer` | The modified `TextCategorizer` object. | | **RETURNS** | `TextCategorizer` | The modified `TextCategorizer` object. |
## TextCategorizer.to_bytes {#to_bytes tag="method"} ## TextCategorizer.to_bytes {#to_bytes tag="method"}
@ -291,10 +293,10 @@ Load the pipe from disk. Modifies the object in place and returns it.
Serialize the pipe to a bytestring. Serialize the pipe to a bytestring.
| Name | Type | Description | | Name | Type | Description |
| ----------- | ----- | ---------------------------------------------------- | | ----------- | ----- | ------------------------------------------------------------------------- |
| `**exclude` | - | Named attributes to prevent from being serialized. | | `exclude` | list | String names of [serialization fields](#serialization-fields) to exclude. |
| **RETURNS** | bytes | The serialized form of the `TextCategorizer` object. | | **RETURNS** | bytes | The serialized form of the `TextCategorizer` object. |
## TextCategorizer.from_bytes {#from_bytes tag="method"} ## TextCategorizer.from_bytes {#from_bytes tag="method"}
@ -308,11 +310,11 @@ Load the pipe from a bytestring. Modifies the object in place and returns it.
> textcat.from_bytes(textcat_bytes) > textcat.from_bytes(textcat_bytes)
> ``` > ```
| Name | Type | Description | | Name | Type | Description |
| ------------ | ----------------- | ---------------------------------------------- | | ------------ | ----------------- | ------------------------------------------------------------------------- |
| `bytes_data` | bytes | The data to load from. | | `bytes_data` | bytes | The data to load from. |
| `**exclude` | - | Named attributes to prevent from being loaded. | | `exclude` | list | String names of [serialization fields](#serialization-fields) to exclude. |
| **RETURNS** | `TextCategorizer` | The `TextCategorizer` object. | | **RETURNS** | `TextCategorizer` | The `TextCategorizer` object. |
## TextCategorizer.labels {#labels tag="property"} ## TextCategorizer.labels {#labels tag="property"}
@ -328,3 +330,21 @@ The labels currently added to the component.
| Name | Type | Description | | Name | Type | Description |
| ----------- | ----- | ---------------------------------- | | ----------- | ----- | ---------------------------------- |
| **RETURNS** | tuple | The labels added to the component. | | **RETURNS** | tuple | The labels added to the component. |
## Serialization fields {#serialization-fields}
During serialization, spaCy will export several data fields used to restore
different aspects of the object. If needed, you can exclude them from
serialization by passing in the string names via the `exclude` argument.
> #### Example
>
> ```python
> data = textcat.to_disk("/path", exclude=["vocab"])
> ```
| Name | Description |
| ------- | -------------------------------------------------------------- |
| `vocab` | The shared [`Vocab`](/api/vocab). |
| `cfg` | The config file. You usually don't want to exclude this. |
| `model` | The binary model data. You usually don't want to exclude this. |

View File

@ -127,9 +127,10 @@ Serialize the tokenizer to disk.
> tokenizer.to_disk("/path/to/tokenizer") > tokenizer.to_disk("/path/to/tokenizer")
> ``` > ```
| Name | Type | Description | | Name | Type | Description |
| ------ | ---------------- | --------------------------------------------------------------------------------------------------------------------- | | --------- | ---------------- | --------------------------------------------------------------------------------------------------------------------- |
| `path` | unicode / `Path` | A path to a directory, which will be created if it doesn't exist. Paths may be either strings or `Path`-like objects. | | `path` | unicode / `Path` | A path to a directory, which will be created if it doesn't exist. Paths may be either strings or `Path`-like objects. |
| `exclude` | list | String names of [serialization fields](#serialization-fields) to exclude. |
## Tokenizer.from_disk {#from_disk tag="method"} ## Tokenizer.from_disk {#from_disk tag="method"}
@ -145,6 +146,7 @@ Load the tokenizer from disk. Modifies the object in place and returns it.
| Name | Type | Description | | Name | Type | Description |
| ----------- | ---------------- | -------------------------------------------------------------------------- | | ----------- | ---------------- | -------------------------------------------------------------------------- |
| `path` | unicode / `Path` | A path to a directory. Paths may be either strings or `Path`-like objects. | | `path` | unicode / `Path` | A path to a directory. Paths may be either strings or `Path`-like objects. |
| `exclude` | list | String names of [serialization fields](#serialization-fields) to exclude. |
| **RETURNS** | `Tokenizer` | The modified `Tokenizer` object. | | **RETURNS** | `Tokenizer` | The modified `Tokenizer` object. |
## Tokenizer.to_bytes {#to_bytes tag="method"} ## Tokenizer.to_bytes {#to_bytes tag="method"}
@ -158,10 +160,10 @@ Load the tokenizer from disk. Modifies the object in place and returns it.
Serialize the tokenizer to a bytestring. Serialize the tokenizer to a bytestring.
| Name | Type | Description | | Name | Type | Description |
| ----------- | ----- | -------------------------------------------------- | | ----------- | ----- | ------------------------------------------------------------------------- |
| `**exclude` | - | Named attributes to prevent from being serialized. | | `exclude` | list | String names of [serialization fields](#serialization-fields) to exclude. |
| **RETURNS** | bytes | The serialized form of the `Tokenizer` object. | | **RETURNS** | bytes | The serialized form of the `Tokenizer` object. |
## Tokenizer.from_bytes {#from_bytes tag="method"} ## Tokenizer.from_bytes {#from_bytes tag="method"}
@ -176,11 +178,11 @@ it.
> tokenizer.from_bytes(tokenizer_bytes) > tokenizer.from_bytes(tokenizer_bytes)
> ``` > ```
| Name | Type | Description | | Name | Type | Description |
| ------------ | ----------- | ---------------------------------------------- | | ------------ | ----------- | ------------------------------------------------------------------------- |
| `bytes_data` | bytes | The data to load from. | | `bytes_data` | bytes | The data to load from. |
| `**exclude` | - | Named attributes to prevent from being loaded. | | `exclude` | list | String names of [serialization fields](#serialization-fields) to exclude. |
| **RETURNS** | `Tokenizer` | The `Tokenizer` object. | | **RETURNS** | `Tokenizer` | The `Tokenizer` object. |
## Attributes {#attributes} ## Attributes {#attributes}
@ -190,3 +192,25 @@ it.
| `prefix_search` | - | A function to find segment boundaries from the start of a string. Returns the length of the segment, or `None`. | | `prefix_search` | - | A function to find segment boundaries from the start of a string. Returns the length of the segment, or `None`. |
| `suffix_search` | - | A function to find segment boundaries from the end of a string. Returns the length of the segment, or `None`. | | `suffix_search` | - | A function to find segment boundaries from the end of a string. Returns the length of the segment, or `None`. |
| `infix_finditer` | - | A function to find internal segment separators, e.g. hyphens. Returns a (possibly empty) list of `re.MatchObject` objects. | | `infix_finditer` | - | A function to find internal segment separators, e.g. hyphens. Returns a (possibly empty) list of `re.MatchObject` objects. |
## Serialization fields {#serialization-fields}
During serialization, spaCy will export several data fields used to restore
different aspects of the object. If needed, you can exclude them from
serialization by passing in the string names via the `exclude` argument.
> #### Example
>
> ```python
> data = tokenizer.to_bytes(exclude=["vocab", "exceptions"])
> tokenizer.from_disk("./data", exclude=["token_match"])
> ```
| Name | Description |
| ---------------- | --------------------------------- |
| `vocab` | The shared [`Vocab`](/api/vocab). |
| `prefix_search` | The prefix rules. |
| `suffix_search` | The suffix rules. |
| `infix_finditer` | The infix rules. |
| `token_match` | The token match expression. |
| `exceptions` | The tokenizer exception rules. |

View File

@ -311,10 +311,9 @@ Save the current state to a directory.
> >
> ``` > ```
| Name | Type | Description | | Name | Type | Description |
| ----------- | ---------------- | --------------------------------------------------------------------------------------------------------------------- | | ------ | ---------------- | --------------------------------------------------------------------------------------------------------------------- |
| `path` | unicode / `Path` | A path to a directory, which will be created if it doesn't exist. Paths may be either strings or `Path`-like objects. | | `path` | unicode / `Path` | A path to a directory, which will be created if it doesn't exist. Paths may be either strings or `Path`-like objects. |
| `**exclude` | - | Named attributes to prevent from being saved. |
## Vectors.from_disk {#from_disk tag="method"} ## Vectors.from_disk {#from_disk tag="method"}
@ -342,10 +341,9 @@ Serialize the current state to a binary string.
> vectors_bytes = vectors.to_bytes() > vectors_bytes = vectors.to_bytes()
> ``` > ```
| Name | Type | Description | | Name | Type | Description |
| ----------- | ----- | -------------------------------------------------- | | ----------- | ----- | -------------------------------------------- |
| `**exclude` | - | Named attributes to prevent from being serialized. | | **RETURNS** | bytes | The serialized form of the `Vectors` object. |
| **RETURNS** | bytes | The serialized form of the `Vectors` object. |
## Vectors.from_bytes {#from_bytes tag="method"} ## Vectors.from_bytes {#from_bytes tag="method"}
@ -360,11 +358,10 @@ Load state from a binary string.
> new_vectors.from_bytes(vectors_bytes) > new_vectors.from_bytes(vectors_bytes)
> ``` > ```
| Name | Type | Description | | Name | Type | Description |
| ----------- | --------- | ---------------------------------------------- | | ----------- | --------- | ---------------------- |
| `data` | bytes | The data to load from. | | `data` | bytes | The data to load from. |
| `**exclude` | - | Named attributes to prevent from being loaded. | | **RETURNS** | `Vectors` | The `Vectors` object. |
| **RETURNS** | `Vectors` | The `Vectors` object. |
## Attributes {#attributes} ## Attributes {#attributes}

View File

@ -221,9 +221,10 @@ Save the current state to a directory.
> nlp.vocab.to_disk("/path/to/vocab") > nlp.vocab.to_disk("/path/to/vocab")
> ``` > ```
| Name | Type | Description | | Name | Type | Description |
| ------ | ---------------- | --------------------------------------------------------------------------------------------------------------------- | | --------- | ---------------- | --------------------------------------------------------------------------------------------------------------------- |
| `path` | unicode / `Path` | A path to a directory, which will be created if it doesn't exist. Paths may be either strings or `Path`-like objects. | | `path` | unicode / `Path` | A path to a directory, which will be created if it doesn't exist. Paths may be either strings or `Path`-like objects. |
| `exclude` | list | String names of [serialization fields](#serialization-fields) to exclude. |
## Vocab.from_disk {#from_disk tag="method" new="2"} ## Vocab.from_disk {#from_disk tag="method" new="2"}
@ -239,6 +240,7 @@ Loads state from a directory. Modifies the object in place and returns it.
| Name | Type | Description | | Name | Type | Description |
| ----------- | ---------------- | -------------------------------------------------------------------------- | | ----------- | ---------------- | -------------------------------------------------------------------------- |
| `path` | unicode / `Path` | A path to a directory. Paths may be either strings or `Path`-like objects. | | `path` | unicode / `Path` | A path to a directory. Paths may be either strings or `Path`-like objects. |
| `exclude` | list | String names of [serialization fields](#serialization-fields) to exclude. |
| **RETURNS** | `Vocab` | The modified `Vocab` object. | | **RETURNS** | `Vocab` | The modified `Vocab` object. |
## Vocab.to_bytes {#to_bytes tag="method"} ## Vocab.to_bytes {#to_bytes tag="method"}
@ -251,10 +253,10 @@ Serialize the current state to a binary string.
> vocab_bytes = nlp.vocab.to_bytes() > vocab_bytes = nlp.vocab.to_bytes()
> ``` > ```
| Name | Type | Description | | Name | Type | Description |
| ----------- | ----- | -------------------------------------------------- | | ----------- | ----- | ------------------------------------------------------------------------- |
| `**exclude` | - | Named attributes to prevent from being serialized. | | `exclude` | list | String names of [serialization fields](#serialization-fields) to exclude. |
| **RETURNS** | bytes | The serialized form of the `Vocab` object. | | **RETURNS** | bytes | The serialized form of the `Vocab` object. |
## Vocab.from_bytes {#from_bytes tag="method"} ## Vocab.from_bytes {#from_bytes tag="method"}
@ -269,11 +271,11 @@ Load state from a binary string.
> vocab.from_bytes(vocab_bytes) > vocab.from_bytes(vocab_bytes)
> ``` > ```
| Name | Type | Description | | Name | Type | Description |
| ------------ | ------- | ---------------------------------------------- | | ------------ | ------- | ------------------------------------------------------------------------- |
| `bytes_data` | bytes | The data to load from. | | `bytes_data` | bytes | The data to load from. |
| `**exclude` | - | Named attributes to prevent from being loaded. | | `exclude` | list | String names of [serialization fields](#serialization-fields) to exclude. |
| **RETURNS** | `Vocab` | The `Vocab` object. | | **RETURNS** | `Vocab` | The `Vocab` object. |
## Attributes {#attributes} ## Attributes {#attributes}
@ -291,3 +293,22 @@ Load state from a binary string.
| `strings` | `StringStore` | A table managing the string-to-int mapping. | | `strings` | `StringStore` | A table managing the string-to-int mapping. |
| `vectors` <Tag variant="new">2</Tag> | `Vectors` | A table associating word IDs to word vectors. | | `vectors` <Tag variant="new">2</Tag> | `Vectors` | A table associating word IDs to word vectors. |
| `vectors_length` | int | Number of dimensions for each word vector. | | `vectors_length` | int | Number of dimensions for each word vector. |
## Serialization fields {#serialization-fields}
During serialization, spaCy will export several data fields used to restore
different aspects of the object. If needed, you can exclude them from
serialization by passing in the string names via the `exclude` argument.
> #### Example
>
> ```python
> data = vocab.to_bytes(exclude=["strings", "vectors"])
> vocab.from_disk("./vocab", exclude=["strings"])
> ```
| Name | Description |
| --------- | ----------------------------------------------------- |
| `strings` | The strings in the [`StringStore`](/api/stringstore). |
| `lexemes` | The lexeme data. |
| `vectors` | The word vectors, if available. |