mirror of
https://github.com/explosion/spaCy.git
synced 2025-01-26 17:24:41 +03:00
💫 Make serialization methods consistent (#3385)
* Make serialization methods consistent: use an `exclude` keyword argument instead of arbitrarily named keyword arguments, and add deprecation handling
* Update docs and add section on serialization fields
This commit is contained in:
parent
9a8f169e5c
commit
7ba3a5d95c
|
@ -70,6 +70,12 @@ class Warnings(object):
|
|||
W013 = ("As of v2.1.0, {obj}.merge is deprecated. Please use the more "
|
||||
"efficient and less error-prone Doc.retokenize context manager "
|
||||
"instead.")
|
||||
W014 = ("As of v2.1.0, the `disable` keyword argument on the serialization "
|
||||
"methods is deprecated and should be replaced with `exclude`. This makes it "
|
||||
"consistent with the other serializable objects.")
|
||||
W015 = ("As of v2.1.0, the use of keyword arguments to exclude fields from "
|
||||
"being serialized or deserialized is deprecated. Please use the "
|
||||
"`exclude` argument instead. For example: exclude=['{arg}'].")
|
||||
|
||||
|
||||
@add_codes
|
||||
|
@ -348,7 +354,10 @@ class Errors(object):
|
|||
"This is likely a bug in spaCy, so feel free to open an issue.")
|
||||
E127 = ("Cannot create phrase pattern representation for length 0. This "
|
||||
"is likely a bug in spaCy.")
|
||||
|
||||
E128 = ("Unsupported serialization argument: '{arg}'. The use of keyword "
|
||||
"arguments to exclude fields from being serialized or deserialized "
|
||||
"is now deprecated. Please use the `exclude` argument instead. "
|
||||
"For example: exclude=['{arg}'].")
|
||||
|
||||
|
||||
@add_codes
|
||||
|
|
|
@ -28,7 +28,7 @@ from .lang.punctuation import TOKENIZER_INFIXES
|
|||
from .lang.tokenizer_exceptions import TOKEN_MATCH
|
||||
from .lang.tag_map import TAG_MAP
|
||||
from .lang.lex_attrs import LEX_ATTRS, is_stop
|
||||
from .errors import Errors
|
||||
from .errors import Errors, Warnings, deprecation_warning
|
||||
from . import util
|
||||
from . import about
|
||||
|
||||
|
@ -699,124 +699,114 @@ class Language(object):
|
|||
self.tokenizer._reset_cache(keys)
|
||||
nr_seen = 0
|
||||
|
||||
def to_disk(self, path, disable=tuple()):
|
||||
def to_disk(self, path, exclude=tuple(), disable=None):
|
||||
"""Save the current state to a directory. If a model is loaded, this
|
||||
will include the model.
|
||||
|
||||
path (unicode or Path): A path to a directory, which will be created if
|
||||
it doesn't exist. Paths may be strings or `Path`-like objects.
|
||||
disable (list): Names of pipeline components to disable and prevent
|
||||
from being saved.
|
||||
path (unicode or Path): Path to a directory, which will be created if
|
||||
it doesn't exist.
|
||||
exclude (list): Names of components or serialization fields to exclude.
|
||||
|
||||
EXAMPLE:
|
||||
>>> nlp.to_disk('/path/to/models')
|
||||
DOCS: https://spacy.io/api/language#to_disk
|
||||
"""
|
||||
if disable is not None:
|
||||
deprecation_warning(Warnings.W014)
|
||||
exclude = disable
|
||||
path = util.ensure_path(path)
|
||||
serializers = OrderedDict(
|
||||
(
|
||||
("tokenizer", lambda p: self.tokenizer.to_disk(p, vocab=False)),
|
||||
("meta.json", lambda p: p.open("w").write(srsly.json_dumps(self.meta))),
|
||||
)
|
||||
)
|
||||
serializers = OrderedDict()
|
||||
serializers["tokenizer"] = lambda p: self.tokenizer.to_disk(p, exclude=["vocab"])
|
||||
serializers["meta.json"] = lambda p: p.open("w").write(srsly.json_dumps(self.meta))
|
||||
for name, proc in self.pipeline:
|
||||
if not hasattr(proc, "name"):
|
||||
continue
|
||||
if name in disable:
|
||||
if name in exclude:
|
||||
continue
|
||||
if not hasattr(proc, "to_disk"):
|
||||
continue
|
||||
serializers[name] = lambda p, proc=proc: proc.to_disk(p, vocab=False)
|
||||
serializers[name] = lambda p, proc=proc: proc.to_disk(p, exclude=["vocab"])
|
||||
serializers["vocab"] = lambda p: self.vocab.to_disk(p)
|
||||
util.to_disk(path, serializers, {p: False for p in disable})
|
||||
util.to_disk(path, serializers, exclude)
|
||||
|
||||
def from_disk(self, path, disable=tuple()):
|
||||
def from_disk(self, path, exclude=tuple(), disable=None):
|
||||
"""Loads state from a directory. Modifies the object in place and
|
||||
returns it. If the saved `Language` object contains a model, the
|
||||
model will be loaded.
|
||||
|
||||
path (unicode or Path): A path to a directory. Paths may be either
|
||||
strings or `Path`-like objects.
|
||||
disable (list): Names of the pipeline components to disable.
|
||||
path (unicode or Path): A path to a directory.
|
||||
exclude (list): Names of components or serialization fields to exclude.
|
||||
RETURNS (Language): The modified `Language` object.
|
||||
|
||||
EXAMPLE:
|
||||
>>> from spacy.language import Language
|
||||
>>> nlp = Language().from_disk('/path/to/models')
|
||||
DOCS: https://spacy.io/api/language#from_disk
|
||||
"""
|
||||
if disable is not None:
|
||||
deprecation_warning(Warnings.W014)
|
||||
exclude = disable
|
||||
path = util.ensure_path(path)
|
||||
deserializers = OrderedDict(
|
||||
(
|
||||
("meta.json", lambda p: self.meta.update(srsly.read_json(p))),
|
||||
(
|
||||
"vocab",
|
||||
lambda p: (
|
||||
self.vocab.from_disk(p) and _fix_pretrained_vectors_name(self)
|
||||
),
|
||||
),
|
||||
("tokenizer", lambda p: self.tokenizer.from_disk(p, vocab=False)),
|
||||
)
|
||||
)
|
||||
deserializers = OrderedDict()
|
||||
deserializers["meta.json"] = lambda p: self.meta.update(srsly.read_json(p))
|
||||
deserializers["vocab"] = lambda p: self.vocab.from_disk(p) and _fix_pretrained_vectors_name(self)
|
||||
deserializers["tokenizer"] = lambda p: self.tokenizer.from_disk(p, exclude=["vocab"])
|
||||
for name, proc in self.pipeline:
|
||||
if name in disable:
|
||||
if name in exclude:
|
||||
continue
|
||||
if not hasattr(proc, "from_disk"):
|
||||
continue
|
||||
deserializers[name] = lambda p, proc=proc: proc.from_disk(p, vocab=False)
|
||||
exclude = {p: False for p in disable}
|
||||
if not (path / "vocab").exists():
|
||||
exclude["vocab"] = True
|
||||
deserializers[name] = lambda p, proc=proc: proc.from_disk(p, exclude=["vocab"])
|
||||
if not (path / "vocab").exists() and "vocab" not in exclude:
|
||||
# Convert to list here in case exclude is (default) tuple
|
||||
exclude = list(exclude) + ["vocab"]
|
||||
util.from_disk(path, deserializers, exclude)
|
||||
self._path = path
|
||||
return self
|
||||
|
||||
def to_bytes(self, disable=[], **exclude):
|
||||
def to_bytes(self, exclude=tuple(), disable=None, **kwargs):
|
||||
"""Serialize the current state to a binary string.
|
||||
|
||||
disable (list): Names of pipeline components to disable and prevent
|
||||
from being serialized.
|
||||
exclude (list): Names of components or serialization fields to exclude.
|
||||
RETURNS (bytes): The serialized form of the `Language` object.
|
||||
|
||||
DOCS: https://spacy.io/api/language#to_bytes
|
||||
"""
|
||||
serializers = OrderedDict(
|
||||
(
|
||||
("vocab", lambda: self.vocab.to_bytes()),
|
||||
("tokenizer", lambda: self.tokenizer.to_bytes(vocab=False)),
|
||||
("meta", lambda: srsly.json_dumps(self.meta)),
|
||||
)
|
||||
)
|
||||
for i, (name, proc) in enumerate(self.pipeline):
|
||||
if name in disable:
|
||||
if disable is not None:
|
||||
deprecation_warning(Warnings.W014)
|
||||
exclude = disable
|
||||
serializers = OrderedDict()
|
||||
serializers["vocab"] = lambda: self.vocab.to_bytes()
|
||||
serializers["tokenizer"] = lambda: self.tokenizer.to_bytes(exclude=["vocab"])
|
||||
serializers["meta.json"] = lambda: srsly.json_dumps(self.meta)
|
||||
for name, proc in self.pipeline:
|
||||
if name in exclude:
|
||||
continue
|
||||
if not hasattr(proc, "to_bytes"):
|
||||
continue
|
||||
serializers[i] = lambda proc=proc: proc.to_bytes(vocab=False)
|
||||
serializers[name] = lambda proc=proc: proc.to_bytes(exclude=["vocab"])
|
||||
exclude = util.get_serialization_exclude(serializers, exclude, kwargs)
|
||||
return util.to_bytes(serializers, exclude)
|
||||
|
||||
def from_bytes(self, bytes_data, disable=[]):
|
||||
def from_bytes(self, bytes_data, exclude=tuple(), disable=None, **kwargs):
|
||||
"""Load state from a binary string.
|
||||
|
||||
bytes_data (bytes): The data to load from.
|
||||
disable (list): Names of the pipeline components to disable.
|
||||
exclude (list): Names of components or serialization fields to exclude.
|
||||
RETURNS (Language): The `Language` object.
|
||||
|
||||
DOCS: https://spacy.io/api/language#from_bytes
|
||||
"""
|
||||
deserializers = OrderedDict(
|
||||
(
|
||||
("meta", lambda b: self.meta.update(srsly.json_loads(b))),
|
||||
(
|
||||
"vocab",
|
||||
lambda b: (
|
||||
self.vocab.from_bytes(b) and _fix_pretrained_vectors_name(self)
|
||||
),
|
||||
),
|
||||
("tokenizer", lambda b: self.tokenizer.from_bytes(b, vocab=False)),
|
||||
)
|
||||
)
|
||||
for i, (name, proc) in enumerate(self.pipeline):
|
||||
if name in disable:
|
||||
if disable is not None:
|
||||
deprecation_warning(Warnings.W014)
|
||||
exclude = disable
|
||||
deserializers = OrderedDict()
|
||||
deserializers["meta.json"] = lambda b: self.meta.update(srsly.json_loads(b))
|
||||
deserializers["vocab"] = lambda b: self.vocab.from_bytes(b) and _fix_pretrained_vectors_name(self)
|
||||
deserializers["tokenizer"] = lambda b: self.tokenizer.from_bytes(b, exclude=["vocab"])
|
||||
for name, proc in self.pipeline:
|
||||
if name in exclude:
|
||||
continue
|
||||
if not hasattr(proc, "from_bytes"):
|
||||
continue
|
||||
deserializers[i] = lambda b, proc=proc: proc.from_bytes(b, vocab=False)
|
||||
util.from_bytes(bytes_data, deserializers, {})
|
||||
deserializers[name] = lambda b, proc=proc: proc.from_bytes(b, exclude=["vocab"])
|
||||
exclude = util.get_serialization_exclude(deserializers, exclude, kwargs)
|
||||
util.from_bytes(bytes_data, deserializers, exclude)
|
||||
return self
|
||||
|
||||
|
||||
|
|
|
@ -141,16 +141,21 @@ class Pipe(object):
|
|||
with self.model.use_params(params):
|
||||
yield
|
||||
|
||||
def to_bytes(self, **exclude):
|
||||
"""Serialize the pipe to a bytestring."""
|
||||
def to_bytes(self, exclude=tuple(), **kwargs):
|
||||
"""Serialize the pipe to a bytestring.
|
||||
|
||||
exclude (list): String names of serialization fields to exclude.
|
||||
RETURNS (bytes): The serialized object.
|
||||
"""
|
||||
serialize = OrderedDict()
|
||||
serialize["cfg"] = lambda: srsly.json_dumps(self.cfg)
|
||||
if self.model not in (True, False, None):
|
||||
serialize["model"] = self.model.to_bytes
|
||||
serialize["vocab"] = self.vocab.to_bytes
|
||||
exclude = util.get_serialization_exclude(serialize, exclude, kwargs)
|
||||
return util.to_bytes(serialize, exclude)
|
||||
|
||||
def from_bytes(self, bytes_data, **exclude):
|
||||
def from_bytes(self, bytes_data, exclude=tuple(), **kwargs):
|
||||
"""Load the pipe from a bytestring."""
|
||||
|
||||
def load_model(b):
|
||||
|
@ -161,26 +166,25 @@ class Pipe(object):
|
|||
self.model = self.Model(**self.cfg)
|
||||
self.model.from_bytes(b)
|
||||
|
||||
deserialize = OrderedDict(
|
||||
(
|
||||
("cfg", lambda b: self.cfg.update(srsly.json_loads(b))),
|
||||
("vocab", lambda b: self.vocab.from_bytes(b)),
|
||||
("model", load_model),
|
||||
)
|
||||
)
|
||||
deserialize = OrderedDict()
|
||||
deserialize["cfg"] = lambda b: self.cfg.update(srsly.json_loads(b))
|
||||
deserialize["vocab"] = lambda b: self.vocab.from_bytes(b)
|
||||
deserialize["model"] = load_model
|
||||
exclude = util.get_serialization_exclude(deserialize, exclude, kwargs)
|
||||
util.from_bytes(bytes_data, deserialize, exclude)
|
||||
return self
|
||||
|
||||
def to_disk(self, path, **exclude):
|
||||
def to_disk(self, path, exclude=tuple(), **kwargs):
|
||||
"""Serialize the pipe to disk."""
|
||||
serialize = OrderedDict()
|
||||
serialize["cfg"] = lambda p: srsly.write_json(p, self.cfg)
|
||||
serialize["vocab"] = lambda p: self.vocab.to_disk(p)
|
||||
if self.model not in (None, True, False):
|
||||
serialize["model"] = lambda p: p.open("wb").write(self.model.to_bytes())
|
||||
exclude = util.get_serialization_exclude(serialize, exclude, kwargs)
|
||||
util.to_disk(path, serialize, exclude)
|
||||
|
||||
def from_disk(self, path, **exclude):
|
||||
def from_disk(self, path, exclude=tuple(), **kwargs):
|
||||
"""Load the pipe from disk."""
|
||||
|
||||
def load_model(p):
|
||||
|
@ -191,13 +195,11 @@ class Pipe(object):
|
|||
self.model = self.Model(**self.cfg)
|
||||
self.model.from_bytes(p.open("rb").read())
|
||||
|
||||
deserialize = OrderedDict(
|
||||
(
|
||||
("cfg", lambda p: self.cfg.update(_load_cfg(p))),
|
||||
("vocab", lambda p: self.vocab.from_disk(p)),
|
||||
("model", load_model),
|
||||
)
|
||||
)
|
||||
deserialize = OrderedDict()
|
||||
deserialize["cfg"] = lambda p: self.cfg.update(_load_cfg(p))
|
||||
deserialize["vocab"] = lambda p: self.vocab.from_disk(p)
|
||||
deserialize["model"] = load_model
|
||||
exclude = util.get_serialization_exclude(deserialize, exclude, kwargs)
|
||||
util.from_disk(path, deserialize, exclude)
|
||||
return self
|
||||
|
||||
|
@ -537,7 +539,7 @@ class Tagger(Pipe):
|
|||
with self.model.use_params(params):
|
||||
yield
|
||||
|
||||
def to_bytes(self, **exclude):
|
||||
def to_bytes(self, exclude=tuple(), **kwargs):
|
||||
serialize = OrderedDict()
|
||||
if self.model not in (None, True, False):
|
||||
serialize["model"] = self.model.to_bytes
|
||||
|
@ -545,9 +547,10 @@ class Tagger(Pipe):
|
|||
serialize["cfg"] = lambda: srsly.json_dumps(self.cfg)
|
||||
tag_map = OrderedDict(sorted(self.vocab.morphology.tag_map.items()))
|
||||
serialize["tag_map"] = lambda: srsly.msgpack_dumps(tag_map)
|
||||
exclude = util.get_serialization_exclude(serialize, exclude, kwargs)
|
||||
return util.to_bytes(serialize, exclude)
|
||||
|
||||
def from_bytes(self, bytes_data, **exclude):
|
||||
def from_bytes(self, bytes_data, exclude=tuple(), **kwargs):
|
||||
def load_model(b):
|
||||
# TODO: Remove this once we don't have to handle previous models
|
||||
if self.cfg.get("pretrained_dims") and "pretrained_vectors" not in self.cfg:
|
||||
|
@ -572,20 +575,22 @@ class Tagger(Pipe):
|
|||
("cfg", lambda b: self.cfg.update(srsly.json_loads(b))),
|
||||
("model", lambda b: load_model(b)),
|
||||
))
|
||||
exclude = util.get_serialization_exclude(deserialize, exclude, kwargs)
|
||||
util.from_bytes(bytes_data, deserialize, exclude)
|
||||
return self
|
||||
|
||||
def to_disk(self, path, **exclude):
|
||||
def to_disk(self, path, exclude=tuple(), **kwargs):
|
||||
tag_map = OrderedDict(sorted(self.vocab.morphology.tag_map.items()))
|
||||
serialize = OrderedDict((
|
||||
('vocab', lambda p: self.vocab.to_disk(p)),
|
||||
('tag_map', lambda p: srsly.write_msgpack(p, tag_map)),
|
||||
('model', lambda p: p.open("wb").write(self.model.to_bytes())),
|
||||
('cfg', lambda p: srsly.write_json(p, self.cfg))
|
||||
("vocab", lambda p: self.vocab.to_disk(p)),
|
||||
("tag_map", lambda p: srsly.write_msgpack(p, tag_map)),
|
||||
("model", lambda p: p.open("wb").write(self.model.to_bytes())),
|
||||
("cfg", lambda p: srsly.write_json(p, self.cfg))
|
||||
))
|
||||
exclude = util.get_serialization_exclude(serialize, exclude, kwargs)
|
||||
util.to_disk(path, serialize, exclude)
|
||||
|
||||
def from_disk(self, path, **exclude):
|
||||
def from_disk(self, path, exclude=tuple(), **kwargs):
|
||||
def load_model(p):
|
||||
# TODO: Remove this once we don't have to handle previous models
|
||||
if self.cfg.get("pretrained_dims") and "pretrained_vectors" not in self.cfg:
|
||||
|
@ -608,6 +613,7 @@ class Tagger(Pipe):
|
|||
("tag_map", load_tag_map),
|
||||
("model", load_model),
|
||||
))
|
||||
exclude = util.get_serialization_exclude(deserialize, exclude, kwargs)
|
||||
util.from_disk(path, deserialize, exclude)
|
||||
return self
|
||||
|
||||
|
|
|
@ -236,19 +236,17 @@ cdef class StringStore:
|
|||
self.add(word)
|
||||
return self
|
||||
|
||||
def to_bytes(self, **exclude):
|
||||
def to_bytes(self, **kwargs):
|
||||
"""Serialize the current state to a binary string.
|
||||
|
||||
**exclude: Named attributes to prevent from being serialized.
|
||||
RETURNS (bytes): The serialized form of the `StringStore` object.
|
||||
"""
|
||||
return srsly.json_dumps(list(self))
|
||||
|
||||
def from_bytes(self, bytes_data, **exclude):
|
||||
def from_bytes(self, bytes_data, **kwargs):
|
||||
"""Load state from a binary string.
|
||||
|
||||
bytes_data (bytes): The data to load from.
|
||||
**exclude: Named attributes to prevent from being loaded.
|
||||
RETURNS (StringStore): The `StringStore` object.
|
||||
"""
|
||||
strings = srsly.json_loads(bytes_data)
|
||||
|
|
|
@ -228,7 +228,7 @@ cdef class Parser:
|
|||
self.set_annotations(subbatch, parse_states, tensors=None)
|
||||
for doc in batch_in_order:
|
||||
yield doc
|
||||
|
||||
|
||||
def require_model(self):
|
||||
"""Raise an error if the component's model is not initialized."""
|
||||
if getattr(self, 'model', None) in (None, True, False):
|
||||
|
@ -272,7 +272,7 @@ cdef class Parser:
|
|||
beams = self.moves.init_beams(docs, beam_width, beam_density=beam_density)
|
||||
# This is pretty dirty, but the NER can resize itself in init_batch,
|
||||
# if labels are missing. We therefore have to check whether we need to
|
||||
# expand our model output.
|
||||
# expand our model output.
|
||||
self.model.resize_output(self.moves.n_moves)
|
||||
model = self.model(docs)
|
||||
token_ids = numpy.zeros((len(docs) * beam_width, self.nr_feature),
|
||||
|
@ -442,7 +442,7 @@ cdef class Parser:
|
|||
if self._rehearsal_model is None:
|
||||
return None
|
||||
losses.setdefault(self.name, 0.)
|
||||
|
||||
|
||||
states = self.moves.init_batch(docs)
|
||||
# This is pretty dirty, but the NER can resize itself in init_batch,
|
||||
# if labels are missing. We therefore have to check whether we need to
|
||||
|
@ -603,22 +603,24 @@ cdef class Parser:
|
|||
self.cfg.update(cfg)
|
||||
return sgd
|
||||
|
||||
def to_disk(self, path, **exclude):
|
||||
def to_disk(self, path, exclude=tuple(), **kwargs):
|
||||
serializers = {
|
||||
'model': lambda p: (self.model.to_disk(p) if self.model is not True else True),
|
||||
'vocab': lambda p: self.vocab.to_disk(p),
|
||||
'moves': lambda p: self.moves.to_disk(p, strings=False),
|
||||
'moves': lambda p: self.moves.to_disk(p, exclude=["strings"]),
|
||||
'cfg': lambda p: srsly.write_json(p, self.cfg)
|
||||
}
|
||||
exclude = util.get_serialization_exclude(serializers, exclude, kwargs)
|
||||
util.to_disk(path, serializers, exclude)
|
||||
|
||||
def from_disk(self, path, **exclude):
|
||||
def from_disk(self, path, exclude=tuple(), **kwargs):
|
||||
deserializers = {
|
||||
'vocab': lambda p: self.vocab.from_disk(p),
|
||||
'moves': lambda p: self.moves.from_disk(p, strings=False),
|
||||
'moves': lambda p: self.moves.from_disk(p, exclude=["strings"]),
|
||||
'cfg': lambda p: self.cfg.update(srsly.read_json(p)),
|
||||
'model': lambda p: None
|
||||
}
|
||||
exclude = util.get_serialization_exclude(deserializers, exclude, kwargs)
|
||||
util.from_disk(path, deserializers, exclude)
|
||||
if 'model' not in exclude:
|
||||
path = util.ensure_path(path)
|
||||
|
@ -632,22 +634,24 @@ cdef class Parser:
|
|||
self.cfg.update(cfg)
|
||||
return self
|
||||
|
||||
def to_bytes(self, **exclude):
|
||||
def to_bytes(self, exclude=tuple(), **kwargs):
|
||||
serializers = OrderedDict((
|
||||
('model', lambda: (self.model.to_bytes() if self.model is not True else True)),
|
||||
('vocab', lambda: self.vocab.to_bytes()),
|
||||
('moves', lambda: self.moves.to_bytes(strings=False)),
|
||||
('moves', lambda: self.moves.to_bytes(exclude=["strings"])),
|
||||
('cfg', lambda: srsly.json_dumps(self.cfg, indent=2, sort_keys=True))
|
||||
))
|
||||
exclude = util.get_serialization_exclude(serializers, exclude, kwargs)
|
||||
return util.to_bytes(serializers, exclude)
|
||||
|
||||
def from_bytes(self, bytes_data, **exclude):
|
||||
def from_bytes(self, bytes_data, exclude=tuple(), **kwargs):
|
||||
deserializers = OrderedDict((
|
||||
('vocab', lambda b: self.vocab.from_bytes(b)),
|
||||
('moves', lambda b: self.moves.from_bytes(b, strings=False)),
|
||||
('moves', lambda b: self.moves.from_bytes(b, exclude=["strings"])),
|
||||
('cfg', lambda b: self.cfg.update(srsly.json_loads(b))),
|
||||
('model', lambda b: None)
|
||||
))
|
||||
exclude = util.get_serialization_exclude(deserializers, exclude, kwargs)
|
||||
msg = util.from_bytes(bytes_data, deserializers, exclude)
|
||||
if 'model' not in exclude:
|
||||
# TODO: Remove this once we don't have to handle previous models
|
||||
|
|
|
@ -208,30 +208,32 @@ cdef class TransitionSystem:
|
|||
self.labels[action][label_name] = new_freq-1
|
||||
return 1
|
||||
|
||||
def to_disk(self, path, **exclude):
|
||||
def to_disk(self, path, **kwargs):
|
||||
with path.open('wb') as file_:
|
||||
file_.write(self.to_bytes(**exclude))
|
||||
file_.write(self.to_bytes(**kwargs))
|
||||
|
||||
def from_disk(self, path, **exclude):
|
||||
def from_disk(self, path, **kwargs):
|
||||
with path.open('rb') as file_:
|
||||
byte_data = file_.read()
|
||||
self.from_bytes(byte_data, **exclude)
|
||||
self.from_bytes(byte_data, **kwargs)
|
||||
return self
|
||||
|
||||
def to_bytes(self, **exclude):
|
||||
def to_bytes(self, exclude=tuple(), **kwargs):
|
||||
transitions = []
|
||||
serializers = {
|
||||
'moves': lambda: srsly.json_dumps(self.labels),
|
||||
'strings': lambda: self.strings.to_bytes()
|
||||
}
|
||||
exclude = util.get_serialization_exclude(serializers, exclude, kwargs)
|
||||
return util.to_bytes(serializers, exclude)
|
||||
|
||||
def from_bytes(self, bytes_data, **exclude):
|
||||
def from_bytes(self, bytes_data, exclude=tuple(), **kwargs):
|
||||
labels = {}
|
||||
deserializers = {
|
||||
'moves': lambda b: labels.update(srsly.json_loads(b)),
|
||||
'strings': lambda b: self.strings.from_bytes(b)
|
||||
}
|
||||
exclude = util.get_serialization_exclude(deserializers, exclude, kwargs)
|
||||
msg = util.from_bytes(bytes_data, deserializers, exclude)
|
||||
self.initialize_actions(labels)
|
||||
return self
|
||||
|
|
|
@ -113,14 +113,14 @@ def test_doc_api_serialize(en_tokenizer, text):
|
|||
assert [t.orth for t in tokens] == [t.orth for t in new_tokens]
|
||||
|
||||
new_tokens = Doc(tokens.vocab).from_bytes(
|
||||
tokens.to_bytes(tensor=False), tensor=False
|
||||
tokens.to_bytes(exclude=["tensor"]), exclude=["tensor"]
|
||||
)
|
||||
assert tokens.text == new_tokens.text
|
||||
assert [t.text for t in tokens] == [t.text for t in new_tokens]
|
||||
assert [t.orth for t in tokens] == [t.orth for t in new_tokens]
|
||||
|
||||
new_tokens = Doc(tokens.vocab).from_bytes(
|
||||
tokens.to_bytes(sentiment=False), sentiment=False
|
||||
tokens.to_bytes(exclude=["sentiment"]), exclude=["sentiment"]
|
||||
)
|
||||
assert tokens.text == new_tokens.text
|
||||
assert [t.text for t in tokens] == [t.text for t in new_tokens]
|
||||
|
|
|
@ -1,6 +1,7 @@
|
|||
# coding: utf-8
|
||||
from __future__ import unicode_literals
|
||||
|
||||
import pytest
|
||||
from spacy.tokens import Doc
|
||||
from spacy.compat import path2str
|
||||
|
||||
|
@ -41,3 +42,18 @@ def test_serialize_doc_roundtrip_disk_str_path(en_vocab):
|
|||
doc.to_disk(file_path)
|
||||
doc_d = Doc(en_vocab).from_disk(file_path)
|
||||
assert doc.to_bytes() == doc_d.to_bytes()
|
||||
|
||||
|
||||
def test_serialize_doc_exclude(en_vocab):
|
||||
doc = Doc(en_vocab, words=["hello", "world"])
|
||||
doc.user_data["foo"] = "bar"
|
||||
new_doc = Doc(en_vocab).from_bytes(doc.to_bytes())
|
||||
assert new_doc.user_data["foo"] == "bar"
|
||||
new_doc = Doc(en_vocab).from_bytes(doc.to_bytes(), exclude=["user_data"])
|
||||
assert not new_doc.user_data
|
||||
new_doc = Doc(en_vocab).from_bytes(doc.to_bytes(exclude=["user_data"]))
|
||||
assert not new_doc.user_data
|
||||
with pytest.raises(ValueError):
|
||||
doc.to_bytes(user_data=False)
|
||||
with pytest.raises(ValueError):
|
||||
Doc(en_vocab).from_bytes(doc.to_bytes(), tensor=False)
|
||||
|
|
|
@ -52,3 +52,19 @@ def test_serialize_with_custom_tokenizer():
|
|||
nlp.tokenizer = custom_tokenizer(nlp)
|
||||
with make_tempdir() as d:
|
||||
nlp.to_disk(d)
|
||||
|
||||
|
||||
def test_serialize_language_exclude(meta_data):
|
||||
name = "name-in-fixture"
|
||||
nlp = Language(meta=meta_data)
|
||||
assert nlp.meta["name"] == name
|
||||
new_nlp = Language().from_bytes(nlp.to_bytes())
|
||||
assert nlp.meta["name"] == name
|
||||
new_nlp = Language().from_bytes(nlp.to_bytes(), exclude=["meta"])
|
||||
assert not new_nlp.meta["name"] == name
|
||||
new_nlp = Language().from_bytes(nlp.to_bytes(exclude=["meta"]))
|
||||
assert not new_nlp.meta["name"] == name
|
||||
with pytest.raises(ValueError):
|
||||
nlp.to_bytes(meta=False)
|
||||
with pytest.raises(ValueError):
|
||||
Language().from_bytes(nlp.to_bytes(), meta=False)
|
||||
|
|
|
@ -55,7 +55,9 @@ def test_serialize_parser_roundtrip_disk(en_vocab, Parser):
|
|||
parser_d = Parser(en_vocab)
|
||||
parser_d.model, _ = parser_d.Model(0)
|
||||
parser_d = parser_d.from_disk(file_path)
|
||||
assert parser.to_bytes(model=False) == parser_d.to_bytes(model=False)
|
||||
parser_bytes = parser.to_bytes(exclude=["model"])
|
||||
parser_d_bytes = parser_d.to_bytes(exclude=["model"])
|
||||
assert parser_bytes == parser_d_bytes
|
||||
|
||||
|
||||
def test_to_from_bytes(parser, blank_parser):
|
||||
|
@ -114,3 +116,25 @@ def test_serialize_textcat_empty(en_vocab):
|
|||
# See issue #1105
|
||||
textcat = TextCategorizer(en_vocab, labels=["ENTITY", "ACTION", "MODIFIER"])
|
||||
textcat.to_bytes()
|
||||
|
||||
|
||||
@pytest.mark.parametrize("Parser", test_parsers)
|
||||
def test_serialize_pipe_exclude(en_vocab, Parser):
|
||||
def get_new_parser():
|
||||
new_parser = Parser(en_vocab)
|
||||
new_parser.model, _ = new_parser.Model(0)
|
||||
return new_parser
|
||||
|
||||
parser = Parser(en_vocab)
|
||||
parser.model, _ = parser.Model(0)
|
||||
parser.cfg["foo"] = "bar"
|
||||
new_parser = get_new_parser().from_bytes(parser.to_bytes())
|
||||
assert "foo" in new_parser.cfg
|
||||
new_parser = get_new_parser().from_bytes(parser.to_bytes(), exclude=["cfg"])
|
||||
assert "foo" not in new_parser.cfg
|
||||
new_parser = get_new_parser().from_bytes(parser.to_bytes(exclude=["cfg"]))
|
||||
assert "foo" not in new_parser.cfg
|
||||
with pytest.raises(ValueError):
|
||||
parser.to_bytes(cfg=False)
|
||||
with pytest.raises(ValueError):
|
||||
get_new_parser().from_bytes(parser.to_bytes(), cfg=False)
|
||||
|
|
|
@ -360,36 +360,37 @@ cdef class Tokenizer:
|
|||
self._cache.set(key, cached)
|
||||
self._rules[string] = substrings
|
||||
|
||||
def to_disk(self, path, **exclude):
|
||||
def to_disk(self, path, **kwargs):
|
||||
"""Save the current state to a directory.
|
||||
|
||||
path (unicode or Path): A path to a directory, which will be created if
|
||||
it doesn't exist. Paths may be either strings or Path-like objects.
|
||||
it doesn't exist.
|
||||
exclude (list): String names of serialization fields to exclude.
|
||||
|
||||
DOCS: https://spacy.io/api/tokenizer#to_disk
|
||||
"""
|
||||
with path.open("wb") as file_:
|
||||
file_.write(self.to_bytes(**exclude))
|
||||
file_.write(self.to_bytes(**kwargs))
|
||||
|
||||
def from_disk(self, path, **exclude):
|
||||
def from_disk(self, path, **kwargs):
|
||||
"""Loads state from a directory. Modifies the object in place and
|
||||
returns it.
|
||||
|
||||
path (unicode or Path): A path to a directory. Paths may be either
|
||||
strings or `Path`-like objects.
|
||||
path (unicode or Path): A path to a directory.
|
||||
exclude (list): String names of serialization fields to exclude.
|
||||
RETURNS (Tokenizer): The modified `Tokenizer` object.
|
||||
|
||||
DOCS: https://spacy.io/api/tokenizer#from_disk
|
||||
"""
|
||||
with path.open("rb") as file_:
|
||||
bytes_data = file_.read()
|
||||
self.from_bytes(bytes_data, **exclude)
|
||||
self.from_bytes(bytes_data, **kwargs)
|
||||
return self
|
||||
|
||||
def to_bytes(self, **exclude):
|
||||
def to_bytes(self, exclude=tuple(), **kwargs):
|
||||
"""Serialize the current state to a binary string.
|
||||
|
||||
**exclude: Named attributes to prevent from being serialized.
|
||||
exclude (list): String names of serialization fields to exclude.
|
||||
RETURNS (bytes): The serialized form of the `Tokenizer` object.
|
||||
|
||||
DOCS: https://spacy.io/api/tokenizer#to_bytes
|
||||
|
@ -402,13 +403,14 @@ cdef class Tokenizer:
|
|||
("token_match", lambda: _get_regex_pattern(self.token_match)),
|
||||
("exceptions", lambda: OrderedDict(sorted(self._rules.items())))
|
||||
))
|
||||
exclude = util.get_serialization_exclude(serializers, exclude, kwargs)
|
||||
return util.to_bytes(serializers, exclude)
|
||||
|
||||
def from_bytes(self, bytes_data, **exclude):
|
||||
def from_bytes(self, bytes_data, exclude=tuple(), **kwargs):
|
||||
"""Load state from a binary string.
|
||||
|
||||
bytes_data (bytes): The data to load from.
|
||||
**exclude: Named attributes to prevent from being loaded.
|
||||
exclude (list): String names of serialization fields to exclude.
|
||||
RETURNS (Tokenizer): The `Tokenizer` object.
|
||||
|
||||
DOCS: https://spacy.io/api/tokenizer#from_bytes
|
||||
|
@ -422,6 +424,7 @@ cdef class Tokenizer:
|
|||
("token_match", lambda b: data.setdefault("token_match", b)),
|
||||
("exceptions", lambda b: data.setdefault("rules", b))
|
||||
))
|
||||
exclude = util.get_serialization_exclude(deserializers, exclude, kwargs)
|
||||
msg = util.from_bytes(bytes_data, deserializers, exclude)
|
||||
if data.get("prefix_search"):
|
||||
self.prefix_search = re.compile(data["prefix_search"]).search
|
||||
|
|
|
@ -794,24 +794,26 @@ cdef class Doc:
|
|||
"""
|
||||
return numpy.asarray(_get_lca_matrix(self, 0, len(self)))
|
||||
|
||||
def to_disk(self, path, **exclude):
|
||||
def to_disk(self, path, **kwargs):
|
||||
"""Save the current state to a directory.
|
||||
|
||||
path (unicode or Path): A path to a directory, which will be created if
|
||||
it doesn't exist. Paths may be either strings or Path-like objects.
|
||||
exclude (list): String names of serialization fields to exclude.
|
||||
|
||||
DOCS: https://spacy.io/api/doc#to_disk
|
||||
"""
|
||||
path = util.ensure_path(path)
|
||||
with path.open("wb") as file_:
|
||||
file_.write(self.to_bytes(**exclude))
|
||||
file_.write(self.to_bytes(**kwargs))
|
||||
|
||||
def from_disk(self, path, **exclude):
|
||||
def from_disk(self, path, **kwargs):
|
||||
"""Loads state from a directory. Modifies the object in place and
|
||||
returns it.
|
||||
|
||||
path (unicode or Path): A path to a directory. Paths may be either
|
||||
strings or `Path`-like objects.
|
||||
exclude (list): String names of serialization fields to exclude.
|
||||
RETURNS (Doc): The modified `Doc` object.
|
||||
|
||||
DOCS: https://spacy.io/api/doc#from_disk
|
||||
|
@ -819,11 +821,12 @@ cdef class Doc:
|
|||
path = util.ensure_path(path)
|
||||
with path.open("rb") as file_:
|
||||
bytes_data = file_.read()
|
||||
return self.from_bytes(bytes_data, **exclude)
|
||||
return self.from_bytes(bytes_data, **kwargs)
|
||||
|
||||
def to_bytes(self, **exclude):
|
||||
def to_bytes(self, exclude=tuple(), **kwargs):
|
||||
"""Serialize, i.e. export the document contents to a binary string.
|
||||
|
||||
exclude (list): String names of serialization fields to exclude.
|
||||
RETURNS (bytes): A losslessly serialized copy of the `Doc`, including
|
||||
all annotations.
|
||||
|
||||
|
@ -849,16 +852,22 @@ cdef class Doc:
|
|||
"sentiment": lambda: self.sentiment,
|
||||
"tensor": lambda: self.tensor,
|
||||
}
|
||||
for key in kwargs:
|
||||
if key in serializers or key in ("user_data", "user_data_keys", "user_data_values"):
|
||||
raise ValueError(Errors.E128.format(arg=key))
|
||||
if "user_data" not in exclude and self.user_data:
|
||||
user_data_keys, user_data_values = list(zip(*self.user_data.items()))
|
||||
serializers["user_data_keys"] = lambda: srsly.msgpack_dumps(user_data_keys)
|
||||
serializers["user_data_values"] = lambda: srsly.msgpack_dumps(user_data_values)
|
||||
if "user_data_keys" not in exclude:
|
||||
serializers["user_data_keys"] = lambda: srsly.msgpack_dumps(user_data_keys)
|
||||
if "user_data_values" not in exclude:
|
||||
serializers["user_data_values"] = lambda: srsly.msgpack_dumps(user_data_values)
|
||||
return util.to_bytes(serializers, exclude)
|
||||
|
||||
def from_bytes(self, bytes_data, **exclude):
|
||||
def from_bytes(self, bytes_data, exclude=tuple(), **kwargs):
|
||||
"""Deserialize, i.e. import the document contents from a binary string.
|
||||
|
||||
bytes_data (bytes): The data to load from.
|
||||
exclude (list): String names of serialization fields to exclude.
|
||||
RETURNS (Doc): Itself.
|
||||
|
||||
DOCS: https://spacy.io/api/doc#from_bytes
|
||||
|
@ -874,6 +883,9 @@ cdef class Doc:
|
|||
"user_data_keys": lambda b: None,
|
||||
"user_data_values": lambda b: None,
|
||||
}
|
||||
for key in kwargs:
|
||||
if key in deserializers or key in ("user_data",):
|
||||
raise ValueError(Errors.E128.format(arg=key))
|
||||
msg = util.from_bytes(bytes_data, deserializers, exclude)
|
||||
# Msgpack doesn't distinguish between lists and tuples, which is
|
||||
# vexing for user data. As a best guess, we *know* that within
|
||||
|
@ -1170,7 +1182,7 @@ cdef int [:,:] _get_lca_matrix(Doc doc, int start, int end):
|
|||
|
||||
|
||||
def pickle_doc(doc):
    """Build the reduce value used to pickle a `Doc`.

    The document is serialized with `Doc.to_bytes`, leaving out the "vocab"
    and "user_data" fields. The user data and the three hook attributes are
    pickled separately with srsly, and the vocab is handed over unchanged.

    doc (Doc): The document to reduce.
    RETURNS (tuple): `(unpickle_doc, (vocab, pickled_extras, doc_bytes))`,
        i.e. a callable plus the arguments it should be called with.
    """
    doc_bytes = doc.to_bytes(exclude=["vocab", "user_data"])
    # Everything that is deliberately kept out of the byte string above.
    extras = (
        doc.user_data,
        doc.user_hooks,
        doc.user_span_hooks,
        doc.user_token_hooks,
    )
    pickled_extras = srsly.pickle_dumps(extras)
    return (unpickle_doc, (doc.vocab, pickled_extras, doc_bytes))
|
||||
|
@ -1179,7 +1191,7 @@ def pickle_doc(doc):
|
|||
def unpickle_doc(vocab, hooks_and_data, bytes_data):
|
||||
user_data, doc_hooks, span_hooks, token_hooks = srsly.pickle_loads(hooks_and_data)
|
||||
|
||||
doc = Doc(vocab, user_data=user_data).from_bytes(bytes_data, exclude="user_data")
|
||||
doc = Doc(vocab, user_data=user_data).from_bytes(bytes_data, exclude=["user_data"])
|
||||
doc.user_hooks.update(doc_hooks)
|
||||
doc.user_span_hooks.update(span_hooks)
|
||||
doc.user_token_hooks.update(token_hooks)
|
||||
|
|
|
@ -25,7 +25,7 @@ except ImportError:
|
|||
from .symbols import ORTH
|
||||
from .compat import cupy, CudaStream, path2str, basestring_, unicode_
|
||||
from .compat import import_file
|
||||
from .errors import Errors
|
||||
from .errors import Errors, Warnings, deprecation_warning
|
||||
|
||||
|
||||
LANGUAGES = {}
|
||||
|
@ -565,7 +565,8 @@ def itershuffle(iterable, bufsize=1000):
|
|||
def to_bytes(getters, exclude):
    """Serialize the selected fields into a single msgpack byte string.

    getters (dict): Maps field names to zero-argument callables that return
        the value to store for that field.
    exclude (iterable): Field names to leave out.
    RETURNS (bytes): The msgpack dump of an ordered name -> value mapping.
    """
    # A field is matched on the part before the first dot, so that file-like
    # names such as meta.json can be excluded as "meta".
    payload = OrderedDict(
        (name, fetch())
        for name, fetch in getters.items()
        if name.split(".")[0] not in exclude
    )
    return srsly.msgpack_dumps(payload)
|
||||
|
||||
|
@ -573,7 +574,8 @@ def to_bytes(getters, exclude):
|
|||
def from_bytes(bytes_data, setters, exclude):
    """Decode a msgpack byte string and feed its fields to the setters.

    bytes_data (bytes): The msgpack-encoded data.
    setters (dict): Maps field names to one-argument callables that receive
        the stored value for that field.
    exclude (iterable): Field names to skip.
    RETURNS: The decoded msgpack message, including any skipped fields.
    """
    loaded = srsly.msgpack_loads(bytes_data)
    for name, apply_value in setters.items():
        # Matched on the part before the first dot, to support file names
        # like meta.json.
        if name.split(".")[0] in exclude:
            continue
        if name not in loaded:
            continue
        apply_value(loaded[name])
    return loaded
|
||||
|
||||
|
@ -583,7 +585,8 @@ def to_disk(path, writers, exclude):
|
|||
if not path.exists():
|
||||
path.mkdir()
|
||||
for key, writer in writers.items():
|
||||
if key not in exclude:
|
||||
# Split to support file names like meta.json
|
||||
if key.split(".")[0] not in exclude:
|
||||
writer(path / key)
|
||||
return path
|
||||
|
||||
|
@ -591,7 +594,8 @@ def to_disk(path, writers, exclude):
|
|||
def from_disk(path, readers, exclude):
    """Call each non-excluded reader with its location below `path`.

    path: The base directory; passed through `ensure_path` first.
    readers (dict): Maps field names to one-argument callables that are
        called with `path / name`.
    exclude (iterable): Field names to skip.
    RETURNS: The result of `ensure_path(path)`.
    """
    base = ensure_path(path)
    for name, load in readers.items():
        # Matched on the part before the first dot, to support file names
        # like meta.json.
        if name.split(".")[0] in exclude:
            continue
        load(base / name)
    return base
|
||||
|
||||
|
@ -677,6 +681,23 @@ def validate_json(data, validator):
|
|||
return errors
|
||||
|
||||
|
||||
def get_serialization_exclude(serializers, exclude, kwargs):
    """Work out the final list of field names to exclude.

    Bridges the pre-v2.1 keyword-argument style and the `exclude` argument:
    `vocab=False` is still accepted (with deprecation warning W015) and is
    added to the result, while any other keyword that names a known
    serialization field is rejected with error E128.

    serializers (iterable): Names of the available serialization fields.
    exclude (iterable): Field names passed in via the `exclude` argument.
    kwargs (dict): The remaining keyword arguments of the calling method.
    RETURNS (list): A new list of field names to exclude.
    """
    excluded = list(exclude)
    # Compare on the part before the first dot, to support file names like
    # meta.json.
    known = [field.split(".")[0] for field in serializers]
    for name, setting in kwargs.items():
        if name == "vocab" and setting is False:
            deprecation_warning(Warnings.W015.format(arg=name))
            excluded.append(name)
            continue
        if name.split(".")[0] in known:
            raise ValueError(Errors.E128.format(arg=name))
        # TODO: user warning? Other keyword arguments are ignored silently.
    return excluded
|
||||
|
||||
|
||||
class SimpleFrozenDict(dict):
|
||||
"""Simplified implementation of a frozen dict, mainly used as default
|
||||
function or method argument (for arguments that should default to empty
|
||||
|
@ -696,14 +717,14 @@ class SimpleFrozenDict(dict):
|
|||
class DummyTokenizer(object):
    """Stand-in whose serialization methods do nothing.

    Provides to_bytes, from_bytes, to_disk and from_disk so that
    serialization is possible (see #1557).
    """

    def to_bytes(self, **kwargs):
        """Return an empty byte string; keyword arguments are ignored."""
        return b""

    def from_bytes(self, _bytes_data, **kwargs):
        """Ignore the given data and return the instance itself."""
        return self

    def to_disk(self, _path, **kwargs):
        """Write nothing and return None."""
        return None

    def from_disk(self, _path, **kwargs):
        """Read nothing and return the instance itself."""
        return self
|
||||
|
|
|
@ -377,11 +377,11 @@ cdef class Vectors:
|
|||
self.add(key, row=i)
|
||||
return strings
|
||||
|
||||
def to_disk(self, path, **exclude):
|
||||
def to_disk(self, path, **kwargs):
|
||||
"""Save the current state to a directory.
|
||||
|
||||
path (unicode / Path): A path to a directory, which will be created if
|
||||
it doesn't exists. Either a string or a Path-like object.
|
||||
it doesn't exist.
|
||||
|
||||
DOCS: https://spacy.io/api/vectors#to_disk
|
||||
"""
|
||||
|
@ -394,9 +394,9 @@ cdef class Vectors:
|
|||
("vectors", lambda p: save_array(self.data, p.open("wb"))),
|
||||
("key2row", lambda p: srsly.write_msgpack(p, self.key2row))
|
||||
))
|
||||
return util.to_disk(path, serializers, exclude)
|
||||
return util.to_disk(path, serializers, [])
|
||||
|
||||
def from_disk(self, path, **exclude):
|
||||
def from_disk(self, path, **kwargs):
|
||||
"""Loads state from a directory. Modifies the object in place and
|
||||
returns it.
|
||||
|
||||
|
@ -428,13 +428,13 @@ cdef class Vectors:
|
|||
("keys", load_keys),
|
||||
("vectors", load_vectors),
|
||||
))
|
||||
util.from_disk(path, serializers, exclude)
|
||||
util.from_disk(path, serializers, [])
|
||||
return self
|
||||
|
||||
def to_bytes(self, **exclude):
|
||||
def to_bytes(self, **kwargs):
|
||||
"""Serialize the current state to a binary string.
|
||||
|
||||
**exclude: Named attributes to prevent from being serialized.
|
||||
exclude (list): String names of serialization fields to exclude.
|
||||
RETURNS (bytes): The serialized form of the `Vectors` object.
|
||||
|
||||
DOCS: https://spacy.io/api/vectors#to_bytes
|
||||
|
@ -444,17 +444,18 @@ cdef class Vectors:
|
|||
return self.data.to_bytes()
|
||||
else:
|
||||
return srsly.msgpack_dumps(self.data)
|
||||
|
||||
serializers = OrderedDict((
|
||||
("key2row", lambda: srsly.msgpack_dumps(self.key2row)),
|
||||
("vectors", serialize_weights)
|
||||
))
|
||||
return util.to_bytes(serializers, exclude)
|
||||
return util.to_bytes(serializers, [])
|
||||
|
||||
def from_bytes(self, data, **exclude):
|
||||
def from_bytes(self, data, **kwargs):
|
||||
"""Load state from a binary string.
|
||||
|
||||
data (bytes): The data to load from.
|
||||
**exclude: Named attributes to prevent from being loaded.
|
||||
exclude (list): String names of serialization fields to exclude.
|
||||
RETURNS (Vectors): The `Vectors` object.
|
||||
|
||||
DOCS: https://spacy.io/api/vectors#from_bytes
|
||||
|
@ -469,5 +470,5 @@ cdef class Vectors:
|
|||
("key2row", lambda b: self.key2row.update(srsly.msgpack_loads(b))),
|
||||
("vectors", deserialize_weights)
|
||||
))
|
||||
util.from_bytes(data, deserializers, exclude)
|
||||
util.from_bytes(data, deserializers, [])
|
||||
return self
|
||||
|
|
|
@ -397,47 +397,57 @@ cdef class Vocab:
|
|||
orth = self.strings.add(orth)
|
||||
return orth in self.vectors
|
||||
|
||||
def to_disk(self, path, exclude=tuple(), **kwargs):
    """Save the current state to a directory.

    Writes the string store to "strings.json" and the lexemes to
    "lexemes.bin" below `path`, and lets the vectors (if there are any)
    save themselves into the same directory.

    path (unicode or Path): A path to a directory, which will be created if
        it doesn't exist.
    exclude (list): String names of serialization fields to exclude. The
        available fields are "strings", "lexemes" and "vectors".

    DOCS: https://spacy.io/api/vocab#to_disk
    """
    path = util.ensure_path(path)
    if not path.exists():
        path.mkdir()
    fields = ["strings", "lexemes", "vectors"]
    # Also handles the deprecated pre-v2.1 keyword arguments.
    exclude = util.get_serialization_exclude(fields, exclude, kwargs)
    if "strings" not in exclude:
        self.strings.to_disk(path / "strings.json")
    if "lexemes" not in exclude:
        with (path / "lexemes.bin").open("wb") as file_:
            file_.write(self.lexemes_to_bytes())
    # Check the exclude list itself: the quoted form `not in "exclude"` is a
    # substring test against a literal and is always true, which made it
    # impossible to exclude the vectors.
    if "vectors" not in exclude and self.vectors is not None:
        self.vectors.to_disk(path)
|
||||
|
||||
def from_disk(self, path, exclude=tuple(), **kwargs):
    """Loads state from a directory. Modifies the object in place and
    returns it.

    Reads the string store from "strings.json" and the lexemes from
    "lexemes.bin" below `path`, and lets the vectors (if there are any)
    load themselves from the same directory.

    path (unicode or Path): A path to a directory.
    exclude (list): String names of serialization fields to exclude. The
        available fields are "strings", "lexemes" and "vectors".
    RETURNS (Vocab): The modified `Vocab` object.

    DOCS: https://spacy.io/api/vocab#from_disk
    """
    path = util.ensure_path(path)
    fields = ["strings", "lexemes", "vectors"]
    # Also handles the deprecated pre-v2.1 keyword arguments.
    exclude = util.get_serialization_exclude(fields, exclude, kwargs)
    if "strings" not in exclude:
        self.strings.from_disk(path / "strings.json")  # TODO: add exclude?
    if "lexemes" not in exclude:
        with (path / "lexemes.bin").open("rb") as file_:
            self.lexemes_from_bytes(file_.read())
    if "vectors" not in exclude and self.vectors is not None:
        self.vectors.from_disk(path, exclude=["strings"])
        # Kept inside the None guard: looking up `.name` on missing vectors
        # would raise an AttributeError.
        if self.vectors.name is not None:
            link_vectors_to_models(self)
    return self
|
||||
|
||||
def to_bytes(self, **exclude):
|
||||
def to_bytes(self, exclude=tuple(), **kwargs):
|
||||
"""Serialize the current state to a binary string.
|
||||
|
||||
**exclude: Named attributes to prevent from being serialized.
|
||||
exclude (list): String names of serialization fields to exclude.
|
||||
RETURNS (bytes): The serialized form of the `Vocab` object.
|
||||
|
||||
DOCS: https://spacy.io/api/vocab#to_bytes
|
||||
|
@ -453,13 +463,14 @@ cdef class Vocab:
|
|||
("lexemes", lambda: self.lexemes_to_bytes()),
|
||||
("vectors", deserialize_vectors)
|
||||
))
|
||||
exclude = util.get_serialization_exclude(getters, exclude, kwargs)
|
||||
return util.to_bytes(getters, exclude)
|
||||
|
||||
def from_bytes(self, bytes_data, **exclude):
|
||||
def from_bytes(self, bytes_data, exclude=tuple(), **kwargs):
|
||||
"""Load state from a binary string.
|
||||
|
||||
bytes_data (bytes): The data to load from.
|
||||
**exclude: Named attributes to prevent from being loaded.
|
||||
exclude (list): String names of serialization fields to exclude.
|
||||
RETURNS (Vocab): The `Vocab` object.
|
||||
|
||||
DOCS: https://spacy.io/api/vocab#from_bytes
|
||||
|
@ -469,11 +480,13 @@ cdef class Vocab:
|
|||
return None
|
||||
else:
|
||||
return self.vectors.from_bytes(b)
|
||||
|
||||
setters = OrderedDict((
|
||||
("strings", lambda b: self.strings.from_bytes(b)),
|
||||
("lexemes", lambda b: self.lexemes_from_bytes(b)),
|
||||
("vectors", lambda b: serialize_vectors(b))
|
||||
))
|
||||
exclude = util.get_serialization_exclude(setters, exclude, kwargs)
|
||||
util.from_bytes(bytes_data, setters, exclude)
|
||||
if self.vectors.name is not None:
|
||||
link_vectors_to_models(self)
|
||||
|
|
|
@ -244,9 +244,10 @@ Serialize the pipe to disk.
|
|||
> parser.to_disk("/path/to/parser")
|
||||
> ```
|
||||
|
||||
| Name | Type | Description |
|
||||
| ------ | ---------------- | --------------------------------------------------------------------------------------------------------------------- |
|
||||
| `path` | unicode / `Path` | A path to a directory, which will be created if it doesn't exist. Paths may be either strings or `Path`-like objects. |
|
||||
| Name | Type | Description |
|
||||
| --------- | ---------------- | --------------------------------------------------------------------------------------------------------------------- |
|
||||
| `path` | unicode / `Path` | A path to a directory, which will be created if it doesn't exist. Paths may be either strings or `Path`-like objects. |
|
||||
| `exclude` | list | String names of [serialization fields](#serialization-fields) to exclude. |
|
||||
|
||||
## DependencyParser.from_disk {#from_disk tag="method"}
|
||||
|
||||
|
@ -262,6 +263,7 @@ Load the pipe from disk. Modifies the object in place and returns it.
|
|||
| Name | Type | Description |
|
||||
| ----------- | ------------------ | -------------------------------------------------------------------------- |
|
||||
| `path` | unicode / `Path` | A path to a directory. Paths may be either strings or `Path`-like objects. |
|
||||
| `exclude` | list | String names of [serialization fields](#serialization-fields) to exclude. |
|
||||
| **RETURNS** | `DependencyParser` | The modified `DependencyParser` object. |
|
||||
|
||||
## DependencyParser.to_bytes {#to_bytes tag="method"}
|
||||
|
@ -275,10 +277,10 @@ Load the pipe from disk. Modifies the object in place and returns it.
|
|||
|
||||
Serialize the pipe to a bytestring.
|
||||
|
||||
| Name | Type | Description |
|
||||
| ----------- | ----- | ----------------------------------------------------- |
|
||||
| `**exclude` | - | Named attributes to prevent from being serialized. |
|
||||
| **RETURNS** | bytes | The serialized form of the `DependencyParser` object. |
|
||||
| Name | Type | Description |
|
||||
| ----------- | ----- | ------------------------------------------------------------------------- |
|
||||
| `exclude` | list | String names of [serialization fields](#serialization-fields) to exclude. |
|
||||
| **RETURNS** | bytes | The serialized form of the `DependencyParser` object. |
|
||||
|
||||
## DependencyParser.from_bytes {#from_bytes tag="method"}
|
||||
|
||||
|
@ -292,11 +294,11 @@ Load the pipe from a bytestring. Modifies the object in place and returns it.
|
|||
> parser.from_bytes(parser_bytes)
|
||||
> ```
|
||||
|
||||
| Name | Type | Description |
|
||||
| ------------ | ------------------ | ---------------------------------------------- |
|
||||
| `bytes_data` | bytes | The data to load from. |
|
||||
| `**exclude` | - | Named attributes to prevent from being loaded. |
|
||||
| **RETURNS** | `DependencyParser` | The `DependencyParser` object. |
|
||||
| Name | Type | Description |
|
||||
| ------------ | ------------------ | ------------------------------------------------------------------------- |
|
||||
| `bytes_data` | bytes | The data to load from. |
|
||||
| `exclude` | list | String names of [serialization fields](#serialization-fields) to exclude. |
|
||||
| **RETURNS** | `DependencyParser` | The `DependencyParser` object. |
|
||||
|
||||
## DependencyParser.labels {#labels tag="property"}
|
||||
|
||||
|
@ -312,3 +314,21 @@ The labels currently added to the component.
|
|||
| Name | Type | Description |
|
||||
| ----------- | ----- | ---------------------------------- |
|
||||
| **RETURNS** | tuple | The labels added to the component. |
|
||||
|
||||
## Serialization fields {#serialization-fields}
|
||||
|
||||
During serialization, spaCy will export several data fields used to restore
|
||||
different aspects of the object. If needed, you can exclude them from
|
||||
serialization by passing in the string names via the `exclude` argument.
|
||||
|
||||
> #### Example
|
||||
>
|
||||
> ```python
|
||||
> parser.to_disk("/path", exclude=["vocab"])
|
||||
> ```
|
||||
|
||||
| Name | Description |
|
||||
| ------- | -------------------------------------------------------------- |
|
||||
| `vocab` | The shared [`Vocab`](/api/vocab). |
|
||||
| `cfg` | The config file. You usually don't want to exclude this. |
|
||||
| `model` | The binary model data. You usually don't want to exclude this. |
|
||||
|
|
|
@ -349,11 +349,12 @@ array of attributes.
|
|||
> assert doc[0].pos_ == doc2[0].pos_
|
||||
> ```
|
||||
|
||||
| Name | Type | Description |
|
||||
| ----------- | -------------------------------------- | ----------------------------- |
|
||||
| `attrs` | list | A list of attribute ID ints. |
|
||||
| `array` | `numpy.ndarray[ndim=2, dtype='int32']` | The attribute values to load. |
|
||||
| **RETURNS** | `Doc` | Itself. |
|
||||
| Name | Type | Description |
|
||||
| ----------- | -------------------------------------- | ------------------------------------------------------------------------- |
|
||||
| `attrs` | list | A list of attribute ID ints. |
|
||||
| `array` | `numpy.ndarray[ndim=2, dtype='int32']` | The attribute values to load. |
|
||||
| `exclude` | list | String names of [serialization fields](#serialization-fields) to exclude. |
|
||||
| **RETURNS** | `Doc` | Itself. |
|
||||
|
||||
## Doc.to_disk {#to_disk tag="method" new="2"}
|
||||
|
||||
|
@ -365,9 +366,10 @@ Save the current state to a directory.
|
|||
> doc.to_disk("/path/to/doc")
|
||||
> ```
|
||||
|
||||
| Name | Type | Description |
|
||||
| ------ | ---------------- | --------------------------------------------------------------------------------------------------------------------- |
|
||||
| `path` | unicode / `Path` | A path to a directory, which will be created if it doesn't exist. Paths may be either strings or `Path`-like objects. |
|
||||
| Name | Type | Description |
|
||||
| --------- | ---------------- | --------------------------------------------------------------------------------------------------------------------- |
|
||||
| `path` | unicode / `Path` | A path to a directory, which will be created if it doesn't exist. Paths may be either strings or `Path`-like objects. |
|
||||
| `exclude` | list | String names of [serialization fields](#serialization-fields) to exclude. |
|
||||
|
||||
## Doc.from_disk {#from_disk tag="method" new="2"}
|
||||
|
||||
|
@ -384,6 +386,7 @@ Loads state from a directory. Modifies the object in place and returns it.
|
|||
| Name | Type | Description |
|
||||
| ----------- | ---------------- | -------------------------------------------------------------------------- |
|
||||
| `path` | unicode / `Path` | A path to a directory. Paths may be either strings or `Path`-like objects. |
|
||||
| `exclude` | list | String names of [serialization fields](#serialization-fields) to exclude. |
|
||||
| **RETURNS** | `Doc` | The modified `Doc` object. |
|
||||
|
||||
## Doc.to_bytes {#to_bytes tag="method"}
|
||||
|
@ -397,9 +400,10 @@ Serialize, i.e. export the document contents to a binary string.
|
|||
> doc_bytes = doc.to_bytes()
|
||||
> ```
|
||||
|
||||
| Name | Type | Description |
|
||||
| ----------- | ----- | --------------------------------------------------------------------- |
|
||||
| **RETURNS** | bytes | A losslessly serialized copy of the `Doc`, including all annotations. |
|
||||
| Name | Type | Description |
|
||||
| ----------- | ----- | ------------------------------------------------------------------------- |
|
||||
| `exclude` | list | String names of [serialization fields](#serialization-fields) to exclude. |
|
||||
| **RETURNS** | bytes | A losslessly serialized copy of the `Doc`, including all annotations. |
|
||||
|
||||
## Doc.from_bytes {#from_bytes tag="method"}
|
||||
|
||||
|
@ -416,10 +420,11 @@ Deserialize, i.e. import the document contents from a binary string.
|
|||
> assert doc.text == doc2.text
|
||||
> ```
|
||||
|
||||
| Name | Type | Description |
|
||||
| ----------- | ----- | ------------------------ |
|
||||
| `data` | bytes | The string to load from. |
|
||||
| **RETURNS** | `Doc` | The `Doc` object. |
|
||||
| Name | Type | Description |
|
||||
| ----------- | ----- | ------------------------------------------------------------------------- |
|
||||
| `data` | bytes | The string to load from. |
|
||||
| `exclude` | list | String names of [serialization fields](#serialization-fields) to exclude. |
|
||||
| **RETURNS** | `Doc` | The `Doc` object. |
|
||||
|
||||
## Doc.retokenize {#retokenize tag="contextmanager" new="2.1"}
|
||||
|
||||
|
@ -658,3 +663,25 @@ The L2 norm of the document's vector representation.
|
|||
| `user_token_hooks` | dict | A dictionary that allows customization of properties of `Token` children. |
|
||||
| `user_span_hooks` | dict | A dictionary that allows customization of properties of `Span` children. |
|
||||
| `_` | `Underscore` | User space for adding custom [attribute extensions](/usage/processing-pipelines#custom-components-attributes). |
|
||||
|
||||
## Serialization fields {#serialization-fields}
|
||||
|
||||
During serialization, spaCy will export several data fields used to restore
|
||||
different aspects of the object. If needed, you can exclude them from
|
||||
serialization by passing in the string names via the `exclude` argument.
|
||||
|
||||
> #### Example
|
||||
>
|
||||
> ```python
|
||||
> data = doc.to_bytes(exclude=["text", "tensor"])
|
||||
> doc.from_disk("./doc.bin", exclude=["user_data"])
|
||||
> ```
|
||||
|
||||
| Name | Description |
|
||||
| ------------------ | --------------------------------------------- |
|
||||
| `text` | The value of the `Doc.text` attribute. |
|
||||
| `sentiment` | The value of the `Doc.sentiment` attribute. |
|
||||
| `tensor` | The value of the `Doc.tensor` attribute. |
|
||||
| `user_data` | The value of the `Doc.user_data` dictionary. |
|
||||
| `user_data_keys` | The keys of the `Doc.user_data` dictionary. |
|
||||
| `user_data_values` | The values of the `Doc.user_data` dictionary. |
|
||||
|
|
|
@ -244,9 +244,10 @@ Serialize the pipe to disk.
|
|||
> ner.to_disk("/path/to/ner")
|
||||
> ```
|
||||
|
||||
| Name | Type | Description |
|
||||
| ------ | ---------------- | --------------------------------------------------------------------------------------------------------------------- |
|
||||
| `path` | unicode / `Path` | A path to a directory, which will be created if it doesn't exist. Paths may be either strings or `Path`-like objects. |
|
||||
| Name | Type | Description |
|
||||
| --------- | ---------------- | --------------------------------------------------------------------------------------------------------------------- |
|
||||
| `path` | unicode / `Path` | A path to a directory, which will be created if it doesn't exist. Paths may be either strings or `Path`-like objects. |
|
||||
| `exclude` | list | String names of [serialization fields](#serialization-fields) to exclude. |
|
||||
|
||||
## EntityRecognizer.from_disk {#from_disk tag="method"}
|
||||
|
||||
|
@ -262,6 +263,7 @@ Load the pipe from disk. Modifies the object in place and returns it.
|
|||
| Name | Type | Description |
|
||||
| ----------- | ------------------ | -------------------------------------------------------------------------- |
|
||||
| `path` | unicode / `Path` | A path to a directory. Paths may be either strings or `Path`-like objects. |
|
||||
| `exclude` | list | String names of [serialization fields](#serialization-fields) to exclude. |
|
||||
| **RETURNS** | `EntityRecognizer` | The modified `EntityRecognizer` object. |
|
||||
|
||||
## EntityRecognizer.to_bytes {#to_bytes tag="method"}
|
||||
|
@ -275,10 +277,10 @@ Load the pipe from disk. Modifies the object in place and returns it.
|
|||
|
||||
Serialize the pipe to a bytestring.
|
||||
|
||||
| Name | Type | Description |
|
||||
| ----------- | ----- | ----------------------------------------------------- |
|
||||
| `**exclude` | - | Named attributes to prevent from being serialized. |
|
||||
| **RETURNS** | bytes | The serialized form of the `EntityRecognizer` object. |
|
||||
| Name | Type | Description |
|
||||
| ----------- | ----- | ------------------------------------------------------------------------- |
|
||||
| `exclude` | list | String names of [serialization fields](#serialization-fields) to exclude. |
|
||||
| **RETURNS** | bytes | The serialized form of the `EntityRecognizer` object. |
|
||||
|
||||
## EntityRecognizer.from_bytes {#from_bytes tag="method"}
|
||||
|
||||
|
@ -292,11 +294,11 @@ Load the pipe from a bytestring. Modifies the object in place and returns it.
|
|||
> ner.from_bytes(ner_bytes)
|
||||
> ```
|
||||
|
||||
| Name | Type | Description |
|
||||
| ------------ | ------------------ | ---------------------------------------------- |
|
||||
| `bytes_data` | bytes | The data to load from. |
|
||||
| `**exclude` | - | Named attributes to prevent from being loaded. |
|
||||
| **RETURNS** | `EntityRecognizer` | The `EntityRecognizer` object. |
|
||||
| Name | Type | Description |
|
||||
| ------------ | ------------------ | ------------------------------------------------------------------------- |
|
||||
| `bytes_data` | bytes | The data to load from. |
|
||||
| `exclude` | list | String names of [serialization fields](#serialization-fields) to exclude. |
|
||||
| **RETURNS** | `EntityRecognizer` | The `EntityRecognizer` object. |
|
||||
|
||||
## EntityRecognizer.labels {#labels tag="property"}
|
||||
|
||||
|
@ -312,3 +314,21 @@ The labels currently added to the component.
|
|||
| Name | Type | Description |
|
||||
| ----------- | ----- | ---------------------------------- |
|
||||
| **RETURNS** | tuple | The labels added to the component. |
|
||||
|
||||
## Serialization fields {#serialization-fields}
|
||||
|
||||
During serialization, spaCy will export several data fields used to restore
|
||||
different aspects of the object. If needed, you can exclude them from
|
||||
serialization by passing in the string names via the `exclude` argument.
|
||||
|
||||
> #### Example
|
||||
>
|
||||
> ```python
|
||||
> ner.to_disk("/path", exclude=["vocab"])
|
||||
> ```
|
||||
|
||||
| Name | Description |
|
||||
| ------- | -------------------------------------------------------------- |
|
||||
| `vocab` | The shared [`Vocab`](/api/vocab). |
|
||||
| `cfg` | The config file. You usually don't want to exclude this. |
|
||||
| `model` | The binary model data. You usually don't want to exclude this. |
|
||||
|
|
|
@ -327,7 +327,7 @@ the model**.
|
|||
| Name | Type | Description |
|
||||
| --------- | ---------------- | --------------------------------------------------------------------------------------------------------------------- |
|
||||
| `path` | unicode / `Path` | A path to a directory, which will be created if it doesn't exist. Paths may be either strings or `Path`-like objects. |
|
||||
| `disable` | list | Names of pipeline components to [disable](/usage/processing-pipelines#disabling) and prevent from being saved. |
|
||||
| `exclude` | list | Names of pipeline components or [serialization fields](#serialization-fields) to exclude. |
|
||||
|
||||
## Language.from_disk {#from_disk tag="method" new="2"}
|
||||
|
||||
|
@ -349,22 +349,22 @@ loaded object.
|
|||
> nlp = English().from_disk("/path/to/en_model")
|
||||
> ```
|
||||
|
||||
| Name | Type | Description |
|
||||
| ----------- | ---------------- | --------------------------------------------------------------------------------- |
|
||||
| `path` | unicode / `Path` | A path to a directory. Paths may be either strings or `Path`-like objects. |
|
||||
| `disable` | list | Names of pipeline components to [disable](/usage/processing-pipelines#disabling). |
|
||||
| **RETURNS** | `Language` | The modified `Language` object. |
|
||||
| Name | Type | Description |
|
||||
| ----------- | ---------------- | ----------------------------------------------------------------------------------------- |
|
||||
| `path` | unicode / `Path` | A path to a directory. Paths may be either strings or `Path`-like objects. |
|
||||
| `exclude` | list | Names of pipeline components or [serialization fields](#serialization-fields) to exclude. |
|
||||
| **RETURNS** | `Language` | The modified `Language` object. |
|
||||
|
||||
<Infobox title="Changed in v2.0" variant="warning">
|
||||
|
||||
As of spaCy v2.0, the `save_to_directory` method has been renamed to `to_disk`,
|
||||
to improve consistency across classes. Pipeline components to prevent from being
|
||||
loaded can now be added as a list to `disable`, instead of specifying one
|
||||
keyword argument per component.
|
||||
loaded can now be added as a list to `disable` (v2.0) or `exclude` (v2.1),
|
||||
instead of specifying one keyword argument per component.
|
||||
|
||||
```diff
|
||||
- nlp = spacy.load("en", tagger=False, entity=False)
|
||||
+ nlp = English().from_disk("/model", disable=["tagger', 'ner"])
|
||||
+ nlp = English().from_disk("/model", exclude=["tagger", "ner"])
|
||||
```
|
||||
|
||||
</Infobox>
|
||||
|
@ -379,10 +379,10 @@ Serialize the current state to a binary string.
|
|||
> nlp_bytes = nlp.to_bytes()
|
||||
> ```
|
||||
|
||||
| Name | Type | Description |
|
||||
| ----------- | ----- | ------------------------------------------------------------------------------------------------------------------- |
|
||||
| `disable` | list | Names of pipeline components to [disable](/usage/processing-pipelines#disabling) and prevent from being serialized. |
|
||||
| **RETURNS** | bytes | The serialized form of the `Language` object. |
|
||||
| Name | Type | Description |
|
||||
| ----------- | ----- | ----------------------------------------------------------------------------------------- |
|
||||
| `exclude` | list | Names of pipeline components or [serialization fields](#serialization-fields) to exclude. |
|
||||
| **RETURNS** | bytes | The serialized form of the `Language` object. |
|
||||
|
||||
## Language.from_bytes {#from_bytes tag="method"}
|
||||
|
||||
|
@ -400,20 +400,21 @@ available to the loaded object.
|
|||
> nlp2.from_bytes(nlp_bytes)
|
||||
> ```
|
||||
|
||||
| Name | Type | Description |
|
||||
| ------------ | ---------- | --------------------------------------------------------------------------------- |
|
||||
| `bytes_data` | bytes | The data to load from. |
|
||||
| `disable` | list | Names of pipeline components to [disable](/usage/processing-pipelines#disabling). |
|
||||
| **RETURNS** | `Language` | The `Language` object. |
|
||||
| Name | Type | Description |
|
||||
| ------------ | ---------- | ----------------------------------------------------------------------------------------- |
|
||||
| `bytes_data` | bytes | The data to load from. |
|
||||
| `exclude` | list | Names of pipeline components or [serialization fields](#serialization-fields) to exclude. |
|
||||
| **RETURNS** | `Language` | The `Language` object. |
|
||||
|
||||
<Infobox title="Changed in v2.0" variant="warning">
|
||||
|
||||
Pipeline components to prevent from being loaded can now be added as a list to
|
||||
`disable`, instead of specifying one keyword argument per component.
|
||||
`disable` (v2.0) or `exclude` (v2.1), instead of specifying one keyword argument
|
||||
per component.
|
||||
|
||||
```diff
|
||||
- nlp = English().from_bytes(bytes, tagger=False, entity=False)
|
||||
+ nlp = English().from_bytes(bytes, disable=["tagger", "ner"])
|
||||
+ nlp = English().from_bytes(bytes, exclude=["tagger", "ner"])
|
||||
```
|
||||
|
||||
</Infobox>
|
||||
|
@ -437,3 +438,23 @@ Pipeline components to prevent from being loaded can now be added as a list to
|
|||
| `Defaults` | class | Settings, data and factory methods for creating the `nlp` object and processing pipeline. |
|
||||
| `lang` | unicode | Two-letter language ID, i.e. [ISO code](https://en.wikipedia.org/wiki/List_of_ISO_639-1_codes). |
|
||||
| `factories` <Tag variant="new">2</Tag> | dict | Factories that create pre-defined pipeline components, e.g. the tagger, parser or entity recognizer, keyed by their component name. |
|
||||
|
||||
## Serialization fields {#serialization-fields}
|
||||
|
||||
During serialization, spaCy will export several data fields used to restore
|
||||
different aspects of the object. If needed, you can exclude them from
|
||||
serialization by passing in the string names via the `exclude` argument.
|
||||
|
||||
> #### Example
|
||||
>
|
||||
> ```python
|
||||
> data = nlp.to_bytes(exclude=["tokenizer", "vocab"])
|
||||
> nlp.from_disk("./model-data", exclude=["ner"])
|
||||
> ```
|
||||
|
||||
| Name | Description |
|
||||
| ----------- | -------------------------------------------------- |
|
||||
| `vocab` | The shared [`Vocab`](/api/vocab). |
|
||||
| `tokenizer` | Tokenization rules and exceptions. |
|
||||
| `meta` | The meta data, available as `Language.meta`. |
|
||||
| ... | String names of pipeline components, e.g. `"ner"`. |
|
||||
|
|
|
@ -151,10 +151,9 @@ Serialize the current state to a binary string.
|
|||
> store_bytes = stringstore.to_bytes()
|
||||
> ```
|
||||
|
||||
| Name | Type | Description |
|
||||
| ----------- | ----- | -------------------------------------------------- |
|
||||
| `**exclude` | - | Named attributes to prevent from being serialized. |
|
||||
| **RETURNS** | bytes | The serialized form of the `StringStore` object. |
|
||||
| Name | Type | Description |
|
||||
| ----------- | ----- | ------------------------------------------------ |
|
||||
| **RETURNS** | bytes | The serialized form of the `StringStore` object. |
|
||||
|
||||
## StringStore.from_bytes {#from_bytes tag="method"}
|
||||
|
||||
|
@ -168,11 +167,10 @@ Load state from a binary string.
|
|||
> new_store = StringStore().from_bytes(store_bytes)
|
||||
> ```
|
||||
|
||||
| Name | Type | Description |
|
||||
| ------------ | ------------- | ---------------------------------------------- |
|
||||
| `bytes_data` | bytes | The data to load from. |
|
||||
| `**exclude` | - | Named attributes to prevent from being loaded. |
|
||||
| **RETURNS** | `StringStore` | The `StringStore` object. |
|
||||
| Name | Type | Description |
|
||||
| ------------ | ------------- | ------------------------- |
|
||||
| `bytes_data` | bytes | The data to load from. |
|
||||
| **RETURNS** | `StringStore` | The `StringStore` object. |
|
||||
|
||||
## Utilities {#util}
|
||||
|
||||
|
|
|
@ -244,9 +244,10 @@ Serialize the pipe to disk.
|
|||
> tagger.to_disk("/path/to/tagger")
|
||||
> ```
|
||||
|
||||
| Name | Type | Description |
|
||||
| ------ | ---------------- | --------------------------------------------------------------------------------------------------------------------- |
|
||||
| `path` | unicode / `Path` | A path to a directory, which will be created if it doesn't exist. Paths may be either strings or `Path`-like objects. |
|
||||
| Name | Type | Description |
|
||||
| --------- | ---------------- | --------------------------------------------------------------------------------------------------------------------- |
|
||||
| `path` | unicode / `Path` | A path to a directory, which will be created if it doesn't exist. Paths may be either strings or `Path`-like objects. |
|
||||
| `exclude` | list | String names of [serialization fields](#serialization-fields) to exclude. |
|
||||
|
||||
## Tagger.from_disk {#from_disk tag="method"}
|
||||
|
||||
|
@ -262,6 +263,7 @@ Load the pipe from disk. Modifies the object in place and returns it.
|
|||
| Name | Type | Description |
|
||||
| ----------- | ---------------- | -------------------------------------------------------------------------- |
|
||||
| `path` | unicode / `Path` | A path to a directory. Paths may be either strings or `Path`-like objects. |
|
||||
| `exclude` | list | String names of [serialization fields](#serialization-fields) to exclude. |
|
||||
| **RETURNS** | `Tagger` | The modified `Tagger` object. |
|
||||
|
||||
## Tagger.to_bytes {#to_bytes tag="method"}
|
||||
|
@ -275,10 +277,10 @@ Load the pipe from disk. Modifies the object in place and returns it.
|
|||
|
||||
Serialize the pipe to a bytestring.
|
||||
|
||||
| Name | Type | Description |
|
||||
| ----------- | ----- | -------------------------------------------------- |
|
||||
| `**exclude` | - | Named attributes to prevent from being serialized. |
|
||||
| **RETURNS** | bytes | The serialized form of the `Tagger` object. |
|
||||
| Name | Type | Description |
|
||||
| ----------- | ----- | ------------------------------------------------------------------------- |
|
||||
| `exclude` | list | String names of [serialization fields](#serialization-fields) to exclude. |
|
||||
| **RETURNS** | bytes | The serialized form of the `Tagger` object. |
|
||||
|
||||
## Tagger.from_bytes {#from_bytes tag="method"}
|
||||
|
||||
|
@ -292,11 +294,11 @@ Load the pipe from a bytestring. Modifies the object in place and returns it.
|
|||
> tagger.from_bytes(tagger_bytes)
|
||||
> ```
|
||||
|
||||
| Name | Type | Description |
|
||||
| ------------ | -------- | ---------------------------------------------- |
|
||||
| `bytes_data` | bytes | The data to load from. |
|
||||
| `**exclude` | - | Named attributes to prevent from being loaded. |
|
||||
| **RETURNS** | `Tagger` | The `Tagger` object. |
|
||||
| Name | Type | Description |
|
||||
| ------------ | -------- | ------------------------------------------------------------------------- |
|
||||
| `bytes_data` | bytes | The data to load from. |
|
||||
| `exclude` | list | String names of [serialization fields](#serialization-fields) to exclude. |
|
||||
| **RETURNS** | `Tagger` | The `Tagger` object. |
|
||||
|
||||
## Tagger.labels {#labels tag="property"}
|
||||
|
||||
|
@ -314,3 +316,22 @@ tags by default, e.g. `VERB`, `NOUN` and so on.
|
|||
| Name | Type | Description |
|
||||
| ----------- | ----- | ---------------------------------- |
|
||||
| **RETURNS** | tuple | The labels added to the component. |
|
||||
|
||||
## Serialization fields {#serialization-fields}
|
||||
|
||||
During serialization, spaCy will export several data fields used to restore
|
||||
different aspects of the object. If needed, you can exclude them from
|
||||
serialization by passing in the string names via the `exclude` argument.
|
||||
|
||||
> #### Example
|
||||
>
|
||||
> ```python
|
||||
> tagger.to_disk("/path", exclude=["vocab"])
|
||||
> ```
|
||||
|
||||
| Name | Description |
|
||||
| --------- | ------------------------------------------------------------------------------------------ |
|
||||
| `vocab` | The shared [`Vocab`](/api/vocab). |
|
||||
| `cfg` | The config file. You usually don't want to exclude this. |
|
||||
| `model` | The binary model data. You usually don't want to exclude this. |
|
||||
| `tag_map` | The [tag map](/usage/adding-languages#tag-map) mapping fine-grained to coarse-grained tags. |
|
||||
|
|
|
@ -260,9 +260,10 @@ Serialize the pipe to disk.
|
|||
> textcat.to_disk("/path/to/textcat")
|
||||
> ```
|
||||
|
||||
| Name | Type | Description |
|
||||
| ------ | ---------------- | --------------------------------------------------------------------------------------------------------------------- |
|
||||
| `path` | unicode / `Path` | A path to a directory, which will be created if it doesn't exist. Paths may be either strings or `Path`-like objects. |
|
||||
| Name | Type | Description |
|
||||
| --------- | ---------------- | --------------------------------------------------------------------------------------------------------------------- |
|
||||
| `path` | unicode / `Path` | A path to a directory, which will be created if it doesn't exist. Paths may be either strings or `Path`-like objects. |
|
||||
| `exclude` | list | String names of [serialization fields](#serialization-fields) to exclude. |
|
||||
|
||||
## TextCategorizer.from_disk {#from_disk tag="method"}
|
||||
|
||||
|
@ -278,6 +279,7 @@ Load the pipe from disk. Modifies the object in place and returns it.
|
|||
| Name | Type | Description |
|
||||
| ----------- | ----------------- | -------------------------------------------------------------------------- |
|
||||
| `path` | unicode / `Path` | A path to a directory. Paths may be either strings or `Path`-like objects. |
|
||||
| `exclude` | list | String names of [serialization fields](#serialization-fields) to exclude. |
|
||||
| **RETURNS** | `TextCategorizer` | The modified `TextCategorizer` object. |
|
||||
|
||||
## TextCategorizer.to_bytes {#to_bytes tag="method"}
|
||||
|
@ -291,10 +293,10 @@ Load the pipe from disk. Modifies the object in place and returns it.
|
|||
|
||||
Serialize the pipe to a bytestring.
|
||||
|
||||
| Name | Type | Description |
|
||||
| ----------- | ----- | ---------------------------------------------------- |
|
||||
| `**exclude` | - | Named attributes to prevent from being serialized. |
|
||||
| **RETURNS** | bytes | The serialized form of the `TextCategorizer` object. |
|
||||
| Name | Type | Description |
|
||||
| ----------- | ----- | ------------------------------------------------------------------------- |
|
||||
| `exclude` | list | String names of [serialization fields](#serialization-fields) to exclude. |
|
||||
| **RETURNS** | bytes | The serialized form of the `TextCategorizer` object. |
|
||||
|
||||
## TextCategorizer.from_bytes {#from_bytes tag="method"}
|
||||
|
||||
|
@ -308,11 +310,11 @@ Load the pipe from a bytestring. Modifies the object in place and returns it.
|
|||
> textcat.from_bytes(textcat_bytes)
|
||||
> ```
|
||||
|
||||
| Name | Type | Description |
|
||||
| ------------ | ----------------- | ---------------------------------------------- |
|
||||
| `bytes_data` | bytes | The data to load from. |
|
||||
| `**exclude` | - | Named attributes to prevent from being loaded. |
|
||||
| **RETURNS** | `TextCategorizer` | The `TextCategorizer` object. |
|
||||
| Name | Type | Description |
|
||||
| ------------ | ----------------- | ------------------------------------------------------------------------- |
|
||||
| `bytes_data` | bytes | The data to load from. |
|
||||
| `exclude` | list | String names of [serialization fields](#serialization-fields) to exclude. |
|
||||
| **RETURNS** | `TextCategorizer` | The `TextCategorizer` object. |
|
||||
|
||||
## TextCategorizer.labels {#labels tag="property"}
|
||||
|
||||
|
@ -328,3 +330,21 @@ The labels currently added to the component.
|
|||
| Name | Type | Description |
|
||||
| ----------- | ----- | ---------------------------------- |
|
||||
| **RETURNS** | tuple | The labels added to the component. |
|
||||
|
||||
## Serialization fields {#serialization-fields}
|
||||
|
||||
During serialization, spaCy will export several data fields used to restore
|
||||
different aspects of the object. If needed, you can exclude them from
|
||||
serialization by passing in the string names via the `exclude` argument.
|
||||
|
||||
> #### Example
|
||||
>
|
||||
> ```python
|
||||
> textcat.to_disk("/path", exclude=["vocab"])
|
||||
> ```
|
||||
|
||||
| Name | Description |
|
||||
| ------- | -------------------------------------------------------------- |
|
||||
| `vocab` | The shared [`Vocab`](/api/vocab). |
|
||||
| `cfg` | The config file. You usually don't want to exclude this. |
|
||||
| `model` | The binary model data. You usually don't want to exclude this. |
|
||||
|
|
|
@ -127,9 +127,10 @@ Serialize the tokenizer to disk.
|
|||
> tokenizer.to_disk("/path/to/tokenizer")
|
||||
> ```
|
||||
|
||||
| Name | Type | Description |
|
||||
| ------ | ---------------- | --------------------------------------------------------------------------------------------------------------------- |
|
||||
| `path` | unicode / `Path` | A path to a directory, which will be created if it doesn't exist. Paths may be either strings or `Path`-like objects. |
|
||||
| Name | Type | Description |
|
||||
| --------- | ---------------- | --------------------------------------------------------------------------------------------------------------------- |
|
||||
| `path` | unicode / `Path` | A path to a directory, which will be created if it doesn't exist. Paths may be either strings or `Path`-like objects. |
|
||||
| `exclude` | list | String names of [serialization fields](#serialization-fields) to exclude. |
|
||||
|
||||
## Tokenizer.from_disk {#from_disk tag="method"}
|
||||
|
||||
|
@ -145,6 +146,7 @@ Load the tokenizer from disk. Modifies the object in place and returns it.
|
|||
| Name | Type | Description |
|
||||
| ----------- | ---------------- | -------------------------------------------------------------------------- |
|
||||
| `path` | unicode / `Path` | A path to a directory. Paths may be either strings or `Path`-like objects. |
|
||||
| `exclude` | list | String names of [serialization fields](#serialization-fields) to exclude. |
|
||||
| **RETURNS** | `Tokenizer` | The modified `Tokenizer` object. |
|
||||
|
||||
## Tokenizer.to_bytes {#to_bytes tag="method"}
|
||||
|
@ -158,10 +160,10 @@ Load the tokenizer from disk. Modifies the object in place and returns it.
|
|||
|
||||
Serialize the tokenizer to a bytestring.
|
||||
|
||||
| Name | Type | Description |
|
||||
| ----------- | ----- | -------------------------------------------------- |
|
||||
| `**exclude` | - | Named attributes to prevent from being serialized. |
|
||||
| **RETURNS** | bytes | The serialized form of the `Tokenizer` object. |
|
||||
| Name | Type | Description |
|
||||
| ----------- | ----- | ------------------------------------------------------------------------- |
|
||||
| `exclude` | list | String names of [serialization fields](#serialization-fields) to exclude. |
|
||||
| **RETURNS** | bytes | The serialized form of the `Tokenizer` object. |
|
||||
|
||||
## Tokenizer.from_bytes {#from_bytes tag="method"}
|
||||
|
||||
|
@ -176,11 +178,11 @@ it.
|
|||
> tokenizer.from_bytes(tokenizer_bytes)
|
||||
> ```
|
||||
|
||||
| Name | Type | Description |
|
||||
| ------------ | ----------- | ---------------------------------------------- |
|
||||
| `bytes_data` | bytes | The data to load from. |
|
||||
| `**exclude` | - | Named attributes to prevent from being loaded. |
|
||||
| **RETURNS** | `Tokenizer` | The `Tokenizer` object. |
|
||||
| Name | Type | Description |
|
||||
| ------------ | ----------- | ------------------------------------------------------------------------- |
|
||||
| `bytes_data` | bytes | The data to load from. |
|
||||
| `exclude` | list | String names of [serialization fields](#serialization-fields) to exclude. |
|
||||
| **RETURNS** | `Tokenizer` | The `Tokenizer` object. |
|
||||
|
||||
## Attributes {#attributes}
|
||||
|
||||
|
@ -190,3 +192,25 @@ it.
|
|||
| `prefix_search` | - | A function to find segment boundaries from the start of a string. Returns the length of the segment, or `None`. |
|
||||
| `suffix_search` | - | A function to find segment boundaries from the end of a string. Returns the length of the segment, or `None`. |
|
||||
| `infix_finditer` | - | A function to find internal segment separators, e.g. hyphens. Returns a (possibly empty) list of `re.MatchObject` objects. |
|
||||
|
||||
## Serialization fields {#serialization-fields}
|
||||
|
||||
During serialization, spaCy will export several data fields used to restore
|
||||
different aspects of the object. If needed, you can exclude them from
|
||||
serialization by passing in the string names via the `exclude` argument.
|
||||
|
||||
> #### Example
|
||||
>
|
||||
> ```python
|
||||
> data = tokenizer.to_bytes(exclude=["vocab", "exceptions"])
|
||||
> tokenizer.from_disk("./data", exclude=["token_match"])
|
||||
> ```
|
||||
|
||||
| Name | Description |
|
||||
| ---------------- | --------------------------------- |
|
||||
| `vocab` | The shared [`Vocab`](/api/vocab). |
|
||||
| `prefix_search` | The prefix rules. |
|
||||
| `suffix_search` | The suffix rules. |
|
||||
| `infix_finditer` | The infix rules. |
|
||||
| `token_match` | The token match expression. |
|
||||
| `exceptions` | The tokenizer exception rules. |
|
||||
|
|
|
@ -311,10 +311,9 @@ Save the current state to a directory.
|
|||
>
|
||||
> ```
|
||||
|
||||
| Name | Type | Description |
|
||||
| ----------- | ---------------- | --------------------------------------------------------------------------------------------------------------------- |
|
||||
| `path` | unicode / `Path` | A path to a directory, which will be created if it doesn't exist. Paths may be either strings or `Path`-like objects. |
|
||||
| `**exclude` | - | Named attributes to prevent from being saved. |
|
||||
| Name | Type | Description |
|
||||
| ------ | ---------------- | --------------------------------------------------------------------------------------------------------------------- |
|
||||
| `path` | unicode / `Path` | A path to a directory, which will be created if it doesn't exist. Paths may be either strings or `Path`-like objects. |
|
||||
|
||||
## Vectors.from_disk {#from_disk tag="method"}
|
||||
|
||||
|
@ -342,10 +341,9 @@ Serialize the current state to a binary string.
|
|||
> vectors_bytes = vectors.to_bytes()
|
||||
> ```
|
||||
|
||||
| Name | Type | Description |
|
||||
| ----------- | ----- | -------------------------------------------------- |
|
||||
| `**exclude` | - | Named attributes to prevent from being serialized. |
|
||||
| **RETURNS** | bytes | The serialized form of the `Vectors` object. |
|
||||
| Name | Type | Description |
|
||||
| ----------- | ----- | -------------------------------------------- |
|
||||
| **RETURNS** | bytes | The serialized form of the `Vectors` object. |
|
||||
|
||||
## Vectors.from_bytes {#from_bytes tag="method"}
|
||||
|
||||
|
@ -360,11 +358,10 @@ Load state from a binary string.
|
|||
> new_vectors.from_bytes(vectors_bytes)
|
||||
> ```
|
||||
|
||||
| Name | Type | Description |
|
||||
| ----------- | --------- | ---------------------------------------------- |
|
||||
| `data` | bytes | The data to load from. |
|
||||
| `**exclude` | - | Named attributes to prevent from being loaded. |
|
||||
| **RETURNS** | `Vectors` | The `Vectors` object. |
|
||||
| Name | Type | Description |
|
||||
| ----------- | --------- | ---------------------- |
|
||||
| `data` | bytes | The data to load from. |
|
||||
| **RETURNS** | `Vectors` | The `Vectors` object. |
|
||||
|
||||
## Attributes {#attributes}
|
||||
|
||||
|
|
|
@ -221,9 +221,10 @@ Save the current state to a directory.
|
|||
> nlp.vocab.to_disk("/path/to/vocab")
|
||||
> ```
|
||||
|
||||
| Name | Type | Description |
|
||||
| ------ | ---------------- | --------------------------------------------------------------------------------------------------------------------- |
|
||||
| `path` | unicode / `Path` | A path to a directory, which will be created if it doesn't exist. Paths may be either strings or `Path`-like objects. |
|
||||
| Name | Type | Description |
|
||||
| --------- | ---------------- | --------------------------------------------------------------------------------------------------------------------- |
|
||||
| `path` | unicode / `Path` | A path to a directory, which will be created if it doesn't exist. Paths may be either strings or `Path`-like objects. |
|
||||
| `exclude` | list | String names of [serialization fields](#serialization-fields) to exclude. |
|
||||
|
||||
## Vocab.from_disk {#from_disk tag="method" new="2"}
|
||||
|
||||
|
@ -239,6 +240,7 @@ Loads state from a directory. Modifies the object in place and returns it.
|
|||
| Name | Type | Description |
|
||||
| ----------- | ---------------- | -------------------------------------------------------------------------- |
|
||||
| `path` | unicode / `Path` | A path to a directory. Paths may be either strings or `Path`-like objects. |
|
||||
| `exclude` | list | String names of [serialization fields](#serialization-fields) to exclude. |
|
||||
| **RETURNS** | `Vocab` | The modified `Vocab` object. |
|
||||
|
||||
## Vocab.to_bytes {#to_bytes tag="method"}
|
||||
|
@ -251,10 +253,10 @@ Serialize the current state to a binary string.
|
|||
> vocab_bytes = nlp.vocab.to_bytes()
|
||||
> ```
|
||||
|
||||
| Name | Type | Description |
|
||||
| ----------- | ----- | -------------------------------------------------- |
|
||||
| `**exclude` | - | Named attributes to prevent from being serialized. |
|
||||
| **RETURNS** | bytes | The serialized form of the `Vocab` object. |
|
||||
| Name | Type | Description |
|
||||
| ----------- | ----- | ------------------------------------------------------------------------- |
|
||||
| `exclude` | list | String names of [serialization fields](#serialization-fields) to exclude. |
|
||||
| **RETURNS** | bytes | The serialized form of the `Vocab` object. |
|
||||
|
||||
## Vocab.from_bytes {#from_bytes tag="method"}
|
||||
|
||||
|
@ -269,11 +271,11 @@ Load state from a binary string.
|
|||
> vocab.from_bytes(vocab_bytes)
|
||||
> ```
|
||||
|
||||
| Name | Type | Description |
|
||||
| ------------ | ------- | ---------------------------------------------- |
|
||||
| `bytes_data` | bytes | The data to load from. |
|
||||
| `**exclude` | - | Named attributes to prevent from being loaded. |
|
||||
| **RETURNS** | `Vocab` | The `Vocab` object. |
|
||||
| Name | Type | Description |
|
||||
| ------------ | ------- | ------------------------------------------------------------------------- |
|
||||
| `bytes_data` | bytes | The data to load from. |
|
||||
| `exclude` | list | String names of [serialization fields](#serialization-fields) to exclude. |
|
||||
| **RETURNS** | `Vocab` | The `Vocab` object. |
|
||||
|
||||
## Attributes {#attributes}
|
||||
|
||||
|
@ -291,3 +293,22 @@ Load state from a binary string.
|
|||
| `strings` | `StringStore` | A table managing the string-to-int mapping. |
|
||||
| `vectors` <Tag variant="new">2</Tag> | `Vectors` | A table associating word IDs to word vectors. |
|
||||
| `vectors_length` | int | Number of dimensions for each word vector. |
|
||||
|
||||
## Serialization fields {#serialization-fields}
|
||||
|
||||
During serialization, spaCy will export several data fields used to restore
|
||||
different aspects of the object. If needed, you can exclude them from
|
||||
serialization by passing in the string names via the `exclude` argument.
|
||||
|
||||
> #### Example
|
||||
>
|
||||
> ```python
|
||||
> data = vocab.to_bytes(exclude=["strings", "vectors"])
|
||||
> vocab.from_disk("./vocab", exclude=["strings"])
|
||||
> ```
|
||||
|
||||
| Name | Description |
|
||||
| --------- | ----------------------------------------------------- |
|
||||
| `strings` | The strings in the [`StringStore`](/api/stringstore). |
|
||||
| `lexemes` | The lexeme data. |
|
||||
| `vectors` | The word vectors, if available. |
|
||||
|
|
Loading…
Reference in New Issue
Block a user