mirror of
https://github.com/explosion/spaCy.git
synced 2025-01-11 17:56:30 +03:00
💫 Make serialization methods consistent (#3385)
* Make serialization methods consistent: use an `exclude` keyword argument instead of randomly named keyword arguments, and add deprecation handling.
* Update docs and add section on serialization fields.
This commit is contained in:
parent
9a8f169e5c
commit
7ba3a5d95c
|
@ -70,6 +70,12 @@ class Warnings(object):
|
||||||
W013 = ("As of v2.1.0, {obj}.merge is deprecated. Please use the more "
|
W013 = ("As of v2.1.0, {obj}.merge is deprecated. Please use the more "
|
||||||
"efficient and less error-prone Doc.retokenize context manager "
|
"efficient and less error-prone Doc.retokenize context manager "
|
||||||
"instead.")
|
"instead.")
|
||||||
|
W014 = ("As of v2.1.0, the `disable` keyword argument on the serialization "
|
||||||
|
"methods is and should be replaced with `exclude`. This makes it "
|
||||||
|
"consistent with the other objects serializable.")
|
||||||
|
W015 = ("As of v2.1.0, the use of keyword arguments to exclude fields from "
|
||||||
|
"being serialized or deserialized is deprecated. Please use the "
|
||||||
|
"`exclude` argument instead. For example: exclude=['{arg}'].")
|
||||||
|
|
||||||
|
|
||||||
@add_codes
|
@add_codes
|
||||||
|
@ -348,7 +354,10 @@ class Errors(object):
|
||||||
"This is likely a bug in spaCy, so feel free to open an issue.")
|
"This is likely a bug in spaCy, so feel free to open an issue.")
|
||||||
E127 = ("Cannot create phrase pattern representation for length 0. This "
|
E127 = ("Cannot create phrase pattern representation for length 0. This "
|
||||||
"is likely a bug in spaCy.")
|
"is likely a bug in spaCy.")
|
||||||
|
E128 = ("Unsupported serialization argument: '{arg}'. The use of keyword "
|
||||||
|
"arguments to exclude fields from being serialized or deserialized "
|
||||||
|
"is now deprecated. Please use the `exclude` argument instead. "
|
||||||
|
"For example: exclude=['{arg}'].")
|
||||||
|
|
||||||
|
|
||||||
@add_codes
|
@add_codes
|
||||||
|
|
|
@ -28,7 +28,7 @@ from .lang.punctuation import TOKENIZER_INFIXES
|
||||||
from .lang.tokenizer_exceptions import TOKEN_MATCH
|
from .lang.tokenizer_exceptions import TOKEN_MATCH
|
||||||
from .lang.tag_map import TAG_MAP
|
from .lang.tag_map import TAG_MAP
|
||||||
from .lang.lex_attrs import LEX_ATTRS, is_stop
|
from .lang.lex_attrs import LEX_ATTRS, is_stop
|
||||||
from .errors import Errors
|
from .errors import Errors, Warnings, deprecation_warning
|
||||||
from . import util
|
from . import util
|
||||||
from . import about
|
from . import about
|
||||||
|
|
||||||
|
@ -699,124 +699,114 @@ class Language(object):
|
||||||
self.tokenizer._reset_cache(keys)
|
self.tokenizer._reset_cache(keys)
|
||||||
nr_seen = 0
|
nr_seen = 0
|
||||||
|
|
||||||
def to_disk(self, path, disable=tuple()):
|
def to_disk(self, path, exclude=tuple(), disable=None):
|
||||||
"""Save the current state to a directory. If a model is loaded, this
|
"""Save the current state to a directory. If a model is loaded, this
|
||||||
will include the model.
|
will include the model.
|
||||||
|
|
||||||
path (unicode or Path): A path to a directory, which will be created if
|
path (unicode or Path): Path to a directory, which will be created if
|
||||||
it doesn't exist. Paths may be strings or `Path`-like objects.
|
it doesn't exist.
|
||||||
disable (list): Names of pipeline components to disable and prevent
|
exclude (list): Names of components or serialization fields to exclude.
|
||||||
from being saved.
|
|
||||||
|
|
||||||
EXAMPLE:
|
DOCS: https://spacy.io/api/language#to_disk
|
||||||
>>> nlp.to_disk('/path/to/models')
|
|
||||||
"""
|
"""
|
||||||
|
if disable is not None:
|
||||||
|
deprecation_warning(Warnings.W014)
|
||||||
|
exclude = disable
|
||||||
path = util.ensure_path(path)
|
path = util.ensure_path(path)
|
||||||
serializers = OrderedDict(
|
serializers = OrderedDict()
|
||||||
(
|
serializers["tokenizer"] = lambda p: self.tokenizer.to_disk(p, exclude=["vocab"])
|
||||||
("tokenizer", lambda p: self.tokenizer.to_disk(p, vocab=False)),
|
serializers["meta.json"] = lambda p: p.open("w").write(srsly.json_dumps(self.meta))
|
||||||
("meta.json", lambda p: p.open("w").write(srsly.json_dumps(self.meta))),
|
|
||||||
)
|
|
||||||
)
|
|
||||||
for name, proc in self.pipeline:
|
for name, proc in self.pipeline:
|
||||||
if not hasattr(proc, "name"):
|
if not hasattr(proc, "name"):
|
||||||
continue
|
continue
|
||||||
if name in disable:
|
if name in exclude:
|
||||||
continue
|
continue
|
||||||
if not hasattr(proc, "to_disk"):
|
if not hasattr(proc, "to_disk"):
|
||||||
continue
|
continue
|
||||||
serializers[name] = lambda p, proc=proc: proc.to_disk(p, vocab=False)
|
serializers[name] = lambda p, proc=proc: proc.to_disk(p, exclude=["vocab"])
|
||||||
serializers["vocab"] = lambda p: self.vocab.to_disk(p)
|
serializers["vocab"] = lambda p: self.vocab.to_disk(p)
|
||||||
util.to_disk(path, serializers, {p: False for p in disable})
|
util.to_disk(path, serializers, exclude)
|
||||||
|
|
||||||
def from_disk(self, path, disable=tuple()):
|
def from_disk(self, path, exclude=tuple(), disable=None):
|
||||||
"""Loads state from a directory. Modifies the object in place and
|
"""Loads state from a directory. Modifies the object in place and
|
||||||
returns it. If the saved `Language` object contains a model, the
|
returns it. If the saved `Language` object contains a model, the
|
||||||
model will be loaded.
|
model will be loaded.
|
||||||
|
|
||||||
path (unicode or Path): A path to a directory. Paths may be either
|
path (unicode or Path): A path to a directory.
|
||||||
strings or `Path`-like objects.
|
exclude (list): Names of components or serialization fields to exclude.
|
||||||
disable (list): Names of the pipeline components to disable.
|
|
||||||
RETURNS (Language): The modified `Language` object.
|
RETURNS (Language): The modified `Language` object.
|
||||||
|
|
||||||
EXAMPLE:
|
DOCS: https://spacy.io/api/language#from_disk
|
||||||
>>> from spacy.language import Language
|
|
||||||
>>> nlp = Language().from_disk('/path/to/models')
|
|
||||||
"""
|
"""
|
||||||
|
if disable is not None:
|
||||||
|
deprecation_warning(Warnings.W014)
|
||||||
|
exclude = disable
|
||||||
path = util.ensure_path(path)
|
path = util.ensure_path(path)
|
||||||
deserializers = OrderedDict(
|
deserializers = OrderedDict()
|
||||||
(
|
deserializers["meta.json"] = lambda p: self.meta.update(srsly.read_json(p))
|
||||||
("meta.json", lambda p: self.meta.update(srsly.read_json(p))),
|
deserializers["vocab"] = lambda p: self.vocab.from_disk(p) and _fix_pretrained_vectors_name(self)
|
||||||
(
|
deserializers["tokenizer"] = lambda p: self.tokenizer.from_disk(p, exclude=["vocab"])
|
||||||
"vocab",
|
|
||||||
lambda p: (
|
|
||||||
self.vocab.from_disk(p) and _fix_pretrained_vectors_name(self)
|
|
||||||
),
|
|
||||||
),
|
|
||||||
("tokenizer", lambda p: self.tokenizer.from_disk(p, vocab=False)),
|
|
||||||
)
|
|
||||||
)
|
|
||||||
for name, proc in self.pipeline:
|
for name, proc in self.pipeline:
|
||||||
if name in disable:
|
if name in exclude:
|
||||||
continue
|
continue
|
||||||
if not hasattr(proc, "from_disk"):
|
if not hasattr(proc, "from_disk"):
|
||||||
continue
|
continue
|
||||||
deserializers[name] = lambda p, proc=proc: proc.from_disk(p, vocab=False)
|
deserializers[name] = lambda p, proc=proc: proc.from_disk(p, exclude=["vocab"])
|
||||||
exclude = {p: False for p in disable}
|
if not (path / "vocab").exists() and "vocab" not in exclude:
|
||||||
if not (path / "vocab").exists():
|
# Convert to list here in case exclude is (default) tuple
|
||||||
exclude["vocab"] = True
|
exclude = list(exclude) + ["vocab"]
|
||||||
util.from_disk(path, deserializers, exclude)
|
util.from_disk(path, deserializers, exclude)
|
||||||
self._path = path
|
self._path = path
|
||||||
return self
|
return self
|
||||||
|
|
||||||
def to_bytes(self, disable=[], **exclude):
|
def to_bytes(self, exclude=tuple(), disable=None, **kwargs):
|
||||||
"""Serialize the current state to a binary string.
|
"""Serialize the current state to a binary string.
|
||||||
|
|
||||||
disable (list): Nameds of pipeline components to disable and prevent
|
exclude (list): Names of components or serialization fields to exclude.
|
||||||
from being serialized.
|
|
||||||
RETURNS (bytes): The serialized form of the `Language` object.
|
RETURNS (bytes): The serialized form of the `Language` object.
|
||||||
|
|
||||||
|
DOCS: https://spacy.io/api/language#to_bytes
|
||||||
"""
|
"""
|
||||||
serializers = OrderedDict(
|
if disable is not None:
|
||||||
(
|
deprecation_warning(Warnings.W014)
|
||||||
("vocab", lambda: self.vocab.to_bytes()),
|
exclude = disable
|
||||||
("tokenizer", lambda: self.tokenizer.to_bytes(vocab=False)),
|
serializers = OrderedDict()
|
||||||
("meta", lambda: srsly.json_dumps(self.meta)),
|
serializers["vocab"] = lambda: self.vocab.to_bytes()
|
||||||
)
|
serializers["tokenizer"] = lambda: self.tokenizer.to_bytes(exclude=["vocab"])
|
||||||
)
|
serializers["meta.json"] = lambda: srsly.json_dumps(self.meta)
|
||||||
for i, (name, proc) in enumerate(self.pipeline):
|
for name, proc in self.pipeline:
|
||||||
if name in disable:
|
if name in exclude:
|
||||||
continue
|
continue
|
||||||
if not hasattr(proc, "to_bytes"):
|
if not hasattr(proc, "to_bytes"):
|
||||||
continue
|
continue
|
||||||
serializers[i] = lambda proc=proc: proc.to_bytes(vocab=False)
|
serializers[name] = lambda proc=proc: proc.to_bytes(exclude=["vocab"])
|
||||||
|
exclude = util.get_serialization_exclude(serializers, exclude, kwargs)
|
||||||
return util.to_bytes(serializers, exclude)
|
return util.to_bytes(serializers, exclude)
|
||||||
|
|
||||||
def from_bytes(self, bytes_data, disable=[]):
|
def from_bytes(self, bytes_data, exclude=tuple(), disable=None, **kwargs):
|
||||||
"""Load state from a binary string.
|
"""Load state from a binary string.
|
||||||
|
|
||||||
bytes_data (bytes): The data to load from.
|
bytes_data (bytes): The data to load from.
|
||||||
disable (list): Names of the pipeline components to disable.
|
exclude (list): Names of components or serialization fields to exclude.
|
||||||
RETURNS (Language): The `Language` object.
|
RETURNS (Language): The `Language` object.
|
||||||
|
|
||||||
|
DOCS: https://spacy.io/api/language#from_bytes
|
||||||
"""
|
"""
|
||||||
deserializers = OrderedDict(
|
if disable is not None:
|
||||||
(
|
deprecation_warning(Warnings.W014)
|
||||||
("meta", lambda b: self.meta.update(srsly.json_loads(b))),
|
exclude = disable
|
||||||
(
|
deserializers = OrderedDict()
|
||||||
"vocab",
|
deserializers["meta.json"] = lambda b: self.meta.update(srsly.json_loads(b))
|
||||||
lambda b: (
|
deserializers["vocab"] = lambda b: self.vocab.from_bytes(b) and _fix_pretrained_vectors_name(self)
|
||||||
self.vocab.from_bytes(b) and _fix_pretrained_vectors_name(self)
|
deserializers["tokenizer"] = lambda b: self.tokenizer.from_bytes(b, exclude=["vocab"])
|
||||||
),
|
for name, proc in self.pipeline:
|
||||||
),
|
if name in exclude:
|
||||||
("tokenizer", lambda b: self.tokenizer.from_bytes(b, vocab=False)),
|
|
||||||
)
|
|
||||||
)
|
|
||||||
for i, (name, proc) in enumerate(self.pipeline):
|
|
||||||
if name in disable:
|
|
||||||
continue
|
continue
|
||||||
if not hasattr(proc, "from_bytes"):
|
if not hasattr(proc, "from_bytes"):
|
||||||
continue
|
continue
|
||||||
deserializers[i] = lambda b, proc=proc: proc.from_bytes(b, vocab=False)
|
deserializers[name] = lambda b, proc=proc: proc.from_bytes(b, exclude=["vocab"])
|
||||||
util.from_bytes(bytes_data, deserializers, {})
|
exclude = util.get_serialization_exclude(deserializers, exclude, kwargs)
|
||||||
|
util.from_bytes(bytes_data, deserializers, exclude)
|
||||||
return self
|
return self
|
||||||
|
|
||||||
|
|
||||||
|
|
|
@ -141,16 +141,21 @@ class Pipe(object):
|
||||||
with self.model.use_params(params):
|
with self.model.use_params(params):
|
||||||
yield
|
yield
|
||||||
|
|
||||||
def to_bytes(self, **exclude):
|
def to_bytes(self, exclude=tuple(), **kwargs):
|
||||||
"""Serialize the pipe to a bytestring."""
|
"""Serialize the pipe to a bytestring.
|
||||||
|
|
||||||
|
exclude (list): String names of serialization fields to exclude.
|
||||||
|
RETURNS (bytes): The serialized object.
|
||||||
|
"""
|
||||||
serialize = OrderedDict()
|
serialize = OrderedDict()
|
||||||
serialize["cfg"] = lambda: srsly.json_dumps(self.cfg)
|
serialize["cfg"] = lambda: srsly.json_dumps(self.cfg)
|
||||||
if self.model not in (True, False, None):
|
if self.model not in (True, False, None):
|
||||||
serialize["model"] = self.model.to_bytes
|
serialize["model"] = self.model.to_bytes
|
||||||
serialize["vocab"] = self.vocab.to_bytes
|
serialize["vocab"] = self.vocab.to_bytes
|
||||||
|
exclude = util.get_serialization_exclude(serialize, exclude, kwargs)
|
||||||
return util.to_bytes(serialize, exclude)
|
return util.to_bytes(serialize, exclude)
|
||||||
|
|
||||||
def from_bytes(self, bytes_data, **exclude):
|
def from_bytes(self, bytes_data, exclude=tuple(), **kwargs):
|
||||||
"""Load the pipe from a bytestring."""
|
"""Load the pipe from a bytestring."""
|
||||||
|
|
||||||
def load_model(b):
|
def load_model(b):
|
||||||
|
@ -161,26 +166,25 @@ class Pipe(object):
|
||||||
self.model = self.Model(**self.cfg)
|
self.model = self.Model(**self.cfg)
|
||||||
self.model.from_bytes(b)
|
self.model.from_bytes(b)
|
||||||
|
|
||||||
deserialize = OrderedDict(
|
deserialize = OrderedDict()
|
||||||
(
|
deserialize["cfg"] = lambda b: self.cfg.update(srsly.json_loads(b))
|
||||||
("cfg", lambda b: self.cfg.update(srsly.json_loads(b))),
|
deserialize["vocab"] = lambda b: self.vocab.from_bytes(b)
|
||||||
("vocab", lambda b: self.vocab.from_bytes(b)),
|
deserialize["model"] = load_model
|
||||||
("model", load_model),
|
exclude = util.get_serialization_exclude(deserialize, exclude, kwargs)
|
||||||
)
|
|
||||||
)
|
|
||||||
util.from_bytes(bytes_data, deserialize, exclude)
|
util.from_bytes(bytes_data, deserialize, exclude)
|
||||||
return self
|
return self
|
||||||
|
|
||||||
def to_disk(self, path, **exclude):
|
def to_disk(self, path, exclude=tuple(), **kwargs):
|
||||||
"""Serialize the pipe to disk."""
|
"""Serialize the pipe to disk."""
|
||||||
serialize = OrderedDict()
|
serialize = OrderedDict()
|
||||||
serialize["cfg"] = lambda p: srsly.write_json(p, self.cfg)
|
serialize["cfg"] = lambda p: srsly.write_json(p, self.cfg)
|
||||||
serialize["vocab"] = lambda p: self.vocab.to_disk(p)
|
serialize["vocab"] = lambda p: self.vocab.to_disk(p)
|
||||||
if self.model not in (None, True, False):
|
if self.model not in (None, True, False):
|
||||||
serialize["model"] = lambda p: p.open("wb").write(self.model.to_bytes())
|
serialize["model"] = lambda p: p.open("wb").write(self.model.to_bytes())
|
||||||
|
exclude = util.get_serialization_exclude(serialize, exclude, kwargs)
|
||||||
util.to_disk(path, serialize, exclude)
|
util.to_disk(path, serialize, exclude)
|
||||||
|
|
||||||
def from_disk(self, path, **exclude):
|
def from_disk(self, path, exclude=tuple(), **kwargs):
|
||||||
"""Load the pipe from disk."""
|
"""Load the pipe from disk."""
|
||||||
|
|
||||||
def load_model(p):
|
def load_model(p):
|
||||||
|
@ -191,13 +195,11 @@ class Pipe(object):
|
||||||
self.model = self.Model(**self.cfg)
|
self.model = self.Model(**self.cfg)
|
||||||
self.model.from_bytes(p.open("rb").read())
|
self.model.from_bytes(p.open("rb").read())
|
||||||
|
|
||||||
deserialize = OrderedDict(
|
deserialize = OrderedDict()
|
||||||
(
|
deserialize["cfg"] = lambda p: self.cfg.update(_load_cfg(p))
|
||||||
("cfg", lambda p: self.cfg.update(_load_cfg(p))),
|
deserialize["vocab"] = lambda p: self.vocab.from_disk(p)
|
||||||
("vocab", lambda p: self.vocab.from_disk(p)),
|
deserialize["model"] = load_model
|
||||||
("model", load_model),
|
exclude = util.get_serialization_exclude(deserialize, exclude, kwargs)
|
||||||
)
|
|
||||||
)
|
|
||||||
util.from_disk(path, deserialize, exclude)
|
util.from_disk(path, deserialize, exclude)
|
||||||
return self
|
return self
|
||||||
|
|
||||||
|
@ -537,7 +539,7 @@ class Tagger(Pipe):
|
||||||
with self.model.use_params(params):
|
with self.model.use_params(params):
|
||||||
yield
|
yield
|
||||||
|
|
||||||
def to_bytes(self, **exclude):
|
def to_bytes(self, exclude=tuple(), **kwargs):
|
||||||
serialize = OrderedDict()
|
serialize = OrderedDict()
|
||||||
if self.model not in (None, True, False):
|
if self.model not in (None, True, False):
|
||||||
serialize["model"] = self.model.to_bytes
|
serialize["model"] = self.model.to_bytes
|
||||||
|
@ -545,9 +547,10 @@ class Tagger(Pipe):
|
||||||
serialize["cfg"] = lambda: srsly.json_dumps(self.cfg)
|
serialize["cfg"] = lambda: srsly.json_dumps(self.cfg)
|
||||||
tag_map = OrderedDict(sorted(self.vocab.morphology.tag_map.items()))
|
tag_map = OrderedDict(sorted(self.vocab.morphology.tag_map.items()))
|
||||||
serialize["tag_map"] = lambda: srsly.msgpack_dumps(tag_map)
|
serialize["tag_map"] = lambda: srsly.msgpack_dumps(tag_map)
|
||||||
|
exclude = util.get_serialization_exclude(serialize, exclude, kwargs)
|
||||||
return util.to_bytes(serialize, exclude)
|
return util.to_bytes(serialize, exclude)
|
||||||
|
|
||||||
def from_bytes(self, bytes_data, **exclude):
|
def from_bytes(self, bytes_data, exclude=tuple(), **kwargs):
|
||||||
def load_model(b):
|
def load_model(b):
|
||||||
# TODO: Remove this once we don't have to handle previous models
|
# TODO: Remove this once we don't have to handle previous models
|
||||||
if self.cfg.get("pretrained_dims") and "pretrained_vectors" not in self.cfg:
|
if self.cfg.get("pretrained_dims") and "pretrained_vectors" not in self.cfg:
|
||||||
|
@ -572,20 +575,22 @@ class Tagger(Pipe):
|
||||||
("cfg", lambda b: self.cfg.update(srsly.json_loads(b))),
|
("cfg", lambda b: self.cfg.update(srsly.json_loads(b))),
|
||||||
("model", lambda b: load_model(b)),
|
("model", lambda b: load_model(b)),
|
||||||
))
|
))
|
||||||
|
exclude = util.get_serialization_exclude(deserialize, exclude, kwargs)
|
||||||
util.from_bytes(bytes_data, deserialize, exclude)
|
util.from_bytes(bytes_data, deserialize, exclude)
|
||||||
return self
|
return self
|
||||||
|
|
||||||
def to_disk(self, path, **exclude):
|
def to_disk(self, path, exclude=tuple(), **kwargs):
|
||||||
tag_map = OrderedDict(sorted(self.vocab.morphology.tag_map.items()))
|
tag_map = OrderedDict(sorted(self.vocab.morphology.tag_map.items()))
|
||||||
serialize = OrderedDict((
|
serialize = OrderedDict((
|
||||||
('vocab', lambda p: self.vocab.to_disk(p)),
|
("vocab", lambda p: self.vocab.to_disk(p)),
|
||||||
('tag_map', lambda p: srsly.write_msgpack(p, tag_map)),
|
("tag_map", lambda p: srsly.write_msgpack(p, tag_map)),
|
||||||
('model', lambda p: p.open("wb").write(self.model.to_bytes())),
|
("model", lambda p: p.open("wb").write(self.model.to_bytes())),
|
||||||
('cfg', lambda p: srsly.write_json(p, self.cfg))
|
("cfg", lambda p: srsly.write_json(p, self.cfg))
|
||||||
))
|
))
|
||||||
|
exclude = util.get_serialization_exclude(serialize, exclude, kwargs)
|
||||||
util.to_disk(path, serialize, exclude)
|
util.to_disk(path, serialize, exclude)
|
||||||
|
|
||||||
def from_disk(self, path, **exclude):
|
def from_disk(self, path, exclude=tuple(), **kwargs):
|
||||||
def load_model(p):
|
def load_model(p):
|
||||||
# TODO: Remove this once we don't have to handle previous models
|
# TODO: Remove this once we don't have to handle previous models
|
||||||
if self.cfg.get("pretrained_dims") and "pretrained_vectors" not in self.cfg:
|
if self.cfg.get("pretrained_dims") and "pretrained_vectors" not in self.cfg:
|
||||||
|
@ -608,6 +613,7 @@ class Tagger(Pipe):
|
||||||
("tag_map", load_tag_map),
|
("tag_map", load_tag_map),
|
||||||
("model", load_model),
|
("model", load_model),
|
||||||
))
|
))
|
||||||
|
exclude = util.get_serialization_exclude(deserialize, exclude, kwargs)
|
||||||
util.from_disk(path, deserialize, exclude)
|
util.from_disk(path, deserialize, exclude)
|
||||||
return self
|
return self
|
||||||
|
|
||||||
|
|
|
@ -236,19 +236,17 @@ cdef class StringStore:
|
||||||
self.add(word)
|
self.add(word)
|
||||||
return self
|
return self
|
||||||
|
|
||||||
def to_bytes(self, **exclude):
|
def to_bytes(self, **kwargs):
|
||||||
"""Serialize the current state to a binary string.
|
"""Serialize the current state to a binary string.
|
||||||
|
|
||||||
**exclude: Named attributes to prevent from being serialized.
|
|
||||||
RETURNS (bytes): The serialized form of the `StringStore` object.
|
RETURNS (bytes): The serialized form of the `StringStore` object.
|
||||||
"""
|
"""
|
||||||
return srsly.json_dumps(list(self))
|
return srsly.json_dumps(list(self))
|
||||||
|
|
||||||
def from_bytes(self, bytes_data, **exclude):
|
def from_bytes(self, bytes_data, **kwargs):
|
||||||
"""Load state from a binary string.
|
"""Load state from a binary string.
|
||||||
|
|
||||||
bytes_data (bytes): The data to load from.
|
bytes_data (bytes): The data to load from.
|
||||||
**exclude: Named attributes to prevent from being loaded.
|
|
||||||
RETURNS (StringStore): The `StringStore` object.
|
RETURNS (StringStore): The `StringStore` object.
|
||||||
"""
|
"""
|
||||||
strings = srsly.json_loads(bytes_data)
|
strings = srsly.json_loads(bytes_data)
|
||||||
|
|
|
@ -228,7 +228,7 @@ cdef class Parser:
|
||||||
self.set_annotations(subbatch, parse_states, tensors=None)
|
self.set_annotations(subbatch, parse_states, tensors=None)
|
||||||
for doc in batch_in_order:
|
for doc in batch_in_order:
|
||||||
yield doc
|
yield doc
|
||||||
|
|
||||||
def require_model(self):
|
def require_model(self):
|
||||||
"""Raise an error if the component's model is not initialized."""
|
"""Raise an error if the component's model is not initialized."""
|
||||||
if getattr(self, 'model', None) in (None, True, False):
|
if getattr(self, 'model', None) in (None, True, False):
|
||||||
|
@ -272,7 +272,7 @@ cdef class Parser:
|
||||||
beams = self.moves.init_beams(docs, beam_width, beam_density=beam_density)
|
beams = self.moves.init_beams(docs, beam_width, beam_density=beam_density)
|
||||||
# This is pretty dirty, but the NER can resize itself in init_batch,
|
# This is pretty dirty, but the NER can resize itself in init_batch,
|
||||||
# if labels are missing. We therefore have to check whether we need to
|
# if labels are missing. We therefore have to check whether we need to
|
||||||
# expand our model output.
|
# expand our model output.
|
||||||
self.model.resize_output(self.moves.n_moves)
|
self.model.resize_output(self.moves.n_moves)
|
||||||
model = self.model(docs)
|
model = self.model(docs)
|
||||||
token_ids = numpy.zeros((len(docs) * beam_width, self.nr_feature),
|
token_ids = numpy.zeros((len(docs) * beam_width, self.nr_feature),
|
||||||
|
@ -442,7 +442,7 @@ cdef class Parser:
|
||||||
if self._rehearsal_model is None:
|
if self._rehearsal_model is None:
|
||||||
return None
|
return None
|
||||||
losses.setdefault(self.name, 0.)
|
losses.setdefault(self.name, 0.)
|
||||||
|
|
||||||
states = self.moves.init_batch(docs)
|
states = self.moves.init_batch(docs)
|
||||||
# This is pretty dirty, but the NER can resize itself in init_batch,
|
# This is pretty dirty, but the NER can resize itself in init_batch,
|
||||||
# if labels are missing. We therefore have to check whether we need to
|
# if labels are missing. We therefore have to check whether we need to
|
||||||
|
@ -603,22 +603,24 @@ cdef class Parser:
|
||||||
self.cfg.update(cfg)
|
self.cfg.update(cfg)
|
||||||
return sgd
|
return sgd
|
||||||
|
|
||||||
def to_disk(self, path, **exclude):
|
def to_disk(self, path, exclude=tuple(), **kwargs):
|
||||||
serializers = {
|
serializers = {
|
||||||
'model': lambda p: (self.model.to_disk(p) if self.model is not True else True),
|
'model': lambda p: (self.model.to_disk(p) if self.model is not True else True),
|
||||||
'vocab': lambda p: self.vocab.to_disk(p),
|
'vocab': lambda p: self.vocab.to_disk(p),
|
||||||
'moves': lambda p: self.moves.to_disk(p, strings=False),
|
'moves': lambda p: self.moves.to_disk(p, exclude=["strings"]),
|
||||||
'cfg': lambda p: srsly.write_json(p, self.cfg)
|
'cfg': lambda p: srsly.write_json(p, self.cfg)
|
||||||
}
|
}
|
||||||
|
exclude = util.get_serialization_exclude(serializers, exclude, kwargs)
|
||||||
util.to_disk(path, serializers, exclude)
|
util.to_disk(path, serializers, exclude)
|
||||||
|
|
||||||
def from_disk(self, path, **exclude):
|
def from_disk(self, path, exclude=tuple(), **kwargs):
|
||||||
deserializers = {
|
deserializers = {
|
||||||
'vocab': lambda p: self.vocab.from_disk(p),
|
'vocab': lambda p: self.vocab.from_disk(p),
|
||||||
'moves': lambda p: self.moves.from_disk(p, strings=False),
|
'moves': lambda p: self.moves.from_disk(p, exclude=["strings"]),
|
||||||
'cfg': lambda p: self.cfg.update(srsly.read_json(p)),
|
'cfg': lambda p: self.cfg.update(srsly.read_json(p)),
|
||||||
'model': lambda p: None
|
'model': lambda p: None
|
||||||
}
|
}
|
||||||
|
exclude = util.get_serialization_exclude(deserializers, exclude, kwargs)
|
||||||
util.from_disk(path, deserializers, exclude)
|
util.from_disk(path, deserializers, exclude)
|
||||||
if 'model' not in exclude:
|
if 'model' not in exclude:
|
||||||
path = util.ensure_path(path)
|
path = util.ensure_path(path)
|
||||||
|
@ -632,22 +634,24 @@ cdef class Parser:
|
||||||
self.cfg.update(cfg)
|
self.cfg.update(cfg)
|
||||||
return self
|
return self
|
||||||
|
|
||||||
def to_bytes(self, **exclude):
|
def to_bytes(self, exclude=tuple(), **kwargs):
|
||||||
serializers = OrderedDict((
|
serializers = OrderedDict((
|
||||||
('model', lambda: (self.model.to_bytes() if self.model is not True else True)),
|
('model', lambda: (self.model.to_bytes() if self.model is not True else True)),
|
||||||
('vocab', lambda: self.vocab.to_bytes()),
|
('vocab', lambda: self.vocab.to_bytes()),
|
||||||
('moves', lambda: self.moves.to_bytes(strings=False)),
|
('moves', lambda: self.moves.to_bytes(exclude=["strings"])),
|
||||||
('cfg', lambda: srsly.json_dumps(self.cfg, indent=2, sort_keys=True))
|
('cfg', lambda: srsly.json_dumps(self.cfg, indent=2, sort_keys=True))
|
||||||
))
|
))
|
||||||
|
exclude = util.get_serialization_exclude(serializers, exclude, kwargs)
|
||||||
return util.to_bytes(serializers, exclude)
|
return util.to_bytes(serializers, exclude)
|
||||||
|
|
||||||
def from_bytes(self, bytes_data, **exclude):
|
def from_bytes(self, bytes_data, exclude=tuple(), **kwargs):
|
||||||
deserializers = OrderedDict((
|
deserializers = OrderedDict((
|
||||||
('vocab', lambda b: self.vocab.from_bytes(b)),
|
('vocab', lambda b: self.vocab.from_bytes(b)),
|
||||||
('moves', lambda b: self.moves.from_bytes(b, strings=False)),
|
('moves', lambda b: self.moves.from_bytes(b, exclude=["strings"])),
|
||||||
('cfg', lambda b: self.cfg.update(srsly.json_loads(b))),
|
('cfg', lambda b: self.cfg.update(srsly.json_loads(b))),
|
||||||
('model', lambda b: None)
|
('model', lambda b: None)
|
||||||
))
|
))
|
||||||
|
exclude = util.get_serialization_exclude(deserializers, exclude, kwargs)
|
||||||
msg = util.from_bytes(bytes_data, deserializers, exclude)
|
msg = util.from_bytes(bytes_data, deserializers, exclude)
|
||||||
if 'model' not in exclude:
|
if 'model' not in exclude:
|
||||||
# TODO: Remove this once we don't have to handle previous models
|
# TODO: Remove this once we don't have to handle previous models
|
||||||
|
|
|
@ -208,30 +208,32 @@ cdef class TransitionSystem:
|
||||||
self.labels[action][label_name] = new_freq-1
|
self.labels[action][label_name] = new_freq-1
|
||||||
return 1
|
return 1
|
||||||
|
|
||||||
def to_disk(self, path, **exclude):
|
def to_disk(self, path, **kwargs):
|
||||||
with path.open('wb') as file_:
|
with path.open('wb') as file_:
|
||||||
file_.write(self.to_bytes(**exclude))
|
file_.write(self.to_bytes(**kwargs))
|
||||||
|
|
||||||
def from_disk(self, path, **exclude):
|
def from_disk(self, path, **kwargs):
|
||||||
with path.open('rb') as file_:
|
with path.open('rb') as file_:
|
||||||
byte_data = file_.read()
|
byte_data = file_.read()
|
||||||
self.from_bytes(byte_data, **exclude)
|
self.from_bytes(byte_data, **kwargs)
|
||||||
return self
|
return self
|
||||||
|
|
||||||
def to_bytes(self, **exclude):
|
def to_bytes(self, exclude=tuple(), **kwargs):
|
||||||
transitions = []
|
transitions = []
|
||||||
serializers = {
|
serializers = {
|
||||||
'moves': lambda: srsly.json_dumps(self.labels),
|
'moves': lambda: srsly.json_dumps(self.labels),
|
||||||
'strings': lambda: self.strings.to_bytes()
|
'strings': lambda: self.strings.to_bytes()
|
||||||
}
|
}
|
||||||
|
exclude = util.get_serialization_exclude(serializers, exclude, kwargs)
|
||||||
return util.to_bytes(serializers, exclude)
|
return util.to_bytes(serializers, exclude)
|
||||||
|
|
||||||
def from_bytes(self, bytes_data, **exclude):
|
def from_bytes(self, bytes_data, exclude=tuple(), **kwargs):
|
||||||
labels = {}
|
labels = {}
|
||||||
deserializers = {
|
deserializers = {
|
||||||
'moves': lambda b: labels.update(srsly.json_loads(b)),
|
'moves': lambda b: labels.update(srsly.json_loads(b)),
|
||||||
'strings': lambda b: self.strings.from_bytes(b)
|
'strings': lambda b: self.strings.from_bytes(b)
|
||||||
}
|
}
|
||||||
|
exclude = util.get_serialization_exclude(deserializers, exclude, kwargs)
|
||||||
msg = util.from_bytes(bytes_data, deserializers, exclude)
|
msg = util.from_bytes(bytes_data, deserializers, exclude)
|
||||||
self.initialize_actions(labels)
|
self.initialize_actions(labels)
|
||||||
return self
|
return self
|
||||||
|
|
|
@ -113,14 +113,14 @@ def test_doc_api_serialize(en_tokenizer, text):
|
||||||
assert [t.orth for t in tokens] == [t.orth for t in new_tokens]
|
assert [t.orth for t in tokens] == [t.orth for t in new_tokens]
|
||||||
|
|
||||||
new_tokens = Doc(tokens.vocab).from_bytes(
|
new_tokens = Doc(tokens.vocab).from_bytes(
|
||||||
tokens.to_bytes(tensor=False), tensor=False
|
tokens.to_bytes(exclude=["tensor"]), exclude=["tensor"]
|
||||||
)
|
)
|
||||||
assert tokens.text == new_tokens.text
|
assert tokens.text == new_tokens.text
|
||||||
assert [t.text for t in tokens] == [t.text for t in new_tokens]
|
assert [t.text for t in tokens] == [t.text for t in new_tokens]
|
||||||
assert [t.orth for t in tokens] == [t.orth for t in new_tokens]
|
assert [t.orth for t in tokens] == [t.orth for t in new_tokens]
|
||||||
|
|
||||||
new_tokens = Doc(tokens.vocab).from_bytes(
|
new_tokens = Doc(tokens.vocab).from_bytes(
|
||||||
tokens.to_bytes(sentiment=False), sentiment=False
|
tokens.to_bytes(exclude=["sentiment"]), exclude=["sentiment"]
|
||||||
)
|
)
|
||||||
assert tokens.text == new_tokens.text
|
assert tokens.text == new_tokens.text
|
||||||
assert [t.text for t in tokens] == [t.text for t in new_tokens]
|
assert [t.text for t in tokens] == [t.text for t in new_tokens]
|
||||||
|
|
|
@ -1,6 +1,7 @@
|
||||||
# coding: utf-8
|
# coding: utf-8
|
||||||
from __future__ import unicode_literals
|
from __future__ import unicode_literals
|
||||||
|
|
||||||
|
import pytest
|
||||||
from spacy.tokens import Doc
|
from spacy.tokens import Doc
|
||||||
from spacy.compat import path2str
|
from spacy.compat import path2str
|
||||||
|
|
||||||
|
@ -41,3 +42,18 @@ def test_serialize_doc_roundtrip_disk_str_path(en_vocab):
|
||||||
doc.to_disk(file_path)
|
doc.to_disk(file_path)
|
||||||
doc_d = Doc(en_vocab).from_disk(file_path)
|
doc_d = Doc(en_vocab).from_disk(file_path)
|
||||||
assert doc.to_bytes() == doc_d.to_bytes()
|
assert doc.to_bytes() == doc_d.to_bytes()
|
||||||
|
|
||||||
|
|
||||||
|
def test_serialize_doc_exclude(en_vocab):
    """Check that `exclude` drops `user_data` on both sides of a Doc bytes
    round trip, and that the old keyword-argument style raises ValueError.
    """
    source = Doc(en_vocab, words=["hello", "world"])
    source.user_data["foo"] = "bar"

    # Plain round trip keeps the user data.
    restored = Doc(en_vocab).from_bytes(source.to_bytes())
    assert restored.user_data["foo"] == "bar"

    # Excluding while loading.
    restored = Doc(en_vocab).from_bytes(source.to_bytes(), exclude=["user_data"])
    assert not restored.user_data

    # Excluding while saving.
    restored = Doc(en_vocab).from_bytes(source.to_bytes(exclude=["user_data"]))
    assert not restored.user_data

    # Pre-v2.1 keyword arguments are rejected.
    with pytest.raises(ValueError):
        source.to_bytes(user_data=False)
    with pytest.raises(ValueError):
        Doc(en_vocab).from_bytes(source.to_bytes(), tensor=False)
|
||||||
|
|
|
@ -52,3 +52,19 @@ def test_serialize_with_custom_tokenizer():
|
||||||
nlp.tokenizer = custom_tokenizer(nlp)
|
nlp.tokenizer = custom_tokenizer(nlp)
|
||||||
with make_tempdir() as d:
|
with make_tempdir() as d:
|
||||||
nlp.to_disk(d)
|
nlp.to_disk(d)
|
||||||
|
|
||||||
|
|
||||||
|
def test_serialize_language_exclude(meta_data):
    """Check that `exclude=["meta"]` keeps the meta out of a Language bytes
    round trip, and that the old keyword-argument style raises ValueError.

    meta_data: Fixture passed to `Language(meta=...)`; the assertions below
        rely on its "name" entry being "name-in-fixture".
    """
    name = "name-in-fixture"
    nlp = Language(meta=meta_data)
    assert nlp.meta["name"] == name
    # Plain round trip: the name must arrive in the *restored* object.
    # (Previously this re-checked `nlp`, which proved nothing about loading.)
    new_nlp = Language().from_bytes(nlp.to_bytes())
    assert new_nlp.meta["name"] == name
    # Excluding while loading.
    new_nlp = Language().from_bytes(nlp.to_bytes(), exclude=["meta"])
    assert not new_nlp.meta["name"] == name
    # Excluding while saving.
    new_nlp = Language().from_bytes(nlp.to_bytes(exclude=["meta"]))
    assert not new_nlp.meta["name"] == name
    # Pre-v2.1 keyword arguments are rejected.
    with pytest.raises(ValueError):
        nlp.to_bytes(meta=False)
    with pytest.raises(ValueError):
        Language().from_bytes(nlp.to_bytes(), meta=False)
|
||||||
|
|
|
@ -55,7 +55,9 @@ def test_serialize_parser_roundtrip_disk(en_vocab, Parser):
|
||||||
parser_d = Parser(en_vocab)
|
parser_d = Parser(en_vocab)
|
||||||
parser_d.model, _ = parser_d.Model(0)
|
parser_d.model, _ = parser_d.Model(0)
|
||||||
parser_d = parser_d.from_disk(file_path)
|
parser_d = parser_d.from_disk(file_path)
|
||||||
assert parser.to_bytes(model=False) == parser_d.to_bytes(model=False)
|
parser_bytes = parser.to_bytes(exclude=["model"])
|
||||||
|
parser_d_bytes = parser_d.to_bytes(exclude=["model"])
|
||||||
|
assert parser_bytes == parser_d_bytes
|
||||||
|
|
||||||
|
|
||||||
def test_to_from_bytes(parser, blank_parser):
|
def test_to_from_bytes(parser, blank_parser):
|
||||||
|
@ -114,3 +116,25 @@ def test_serialize_textcat_empty(en_vocab):
|
||||||
# See issue #1105
|
# See issue #1105
|
||||||
textcat = TextCategorizer(en_vocab, labels=["ENTITY", "ACTION", "MODIFIER"])
|
textcat = TextCategorizer(en_vocab, labels=["ENTITY", "ACTION", "MODIFIER"])
|
||||||
textcat.to_bytes()
|
textcat.to_bytes()
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.mark.parametrize("Parser", test_parsers)
def test_serialize_pipe_exclude(en_vocab, Parser):
    """Check that `exclude=["cfg"]` keeps the config out of a pipe's bytes
    round trip, and that the old keyword-argument style raises ValueError.
    """

    def make_blank_parser():
        # Fresh component with an initialised model, used as a load target.
        blank = Parser(en_vocab)
        blank.model, _ = blank.Model(0)
        return blank

    parser = make_blank_parser()
    parser.cfg["foo"] = "bar"

    # Plain round trip keeps the custom cfg entry.
    restored = make_blank_parser().from_bytes(parser.to_bytes())
    assert "foo" in restored.cfg

    # Excluding while loading.
    restored = make_blank_parser().from_bytes(parser.to_bytes(), exclude=["cfg"])
    assert "foo" not in restored.cfg

    # Excluding while saving.
    restored = make_blank_parser().from_bytes(parser.to_bytes(exclude=["cfg"]))
    assert "foo" not in restored.cfg

    # Pre-v2.1 keyword arguments are rejected.
    with pytest.raises(ValueError):
        parser.to_bytes(cfg=False)
    with pytest.raises(ValueError):
        make_blank_parser().from_bytes(parser.to_bytes(), cfg=False)
|
||||||
|
|
|
@ -360,36 +360,37 @@ cdef class Tokenizer:
|
||||||
self._cache.set(key, cached)
|
self._cache.set(key, cached)
|
||||||
self._rules[string] = substrings
|
self._rules[string] = substrings
|
||||||
|
|
||||||
def to_disk(self, path, **kwargs):
    """Save the current state to disk.

    path (unicode or Path): Location to write to. It is opened directly in
        binary write mode, so it names a single file. Strings are converted
        with `util.ensure_path` first.
    exclude (list): String names of serialization fields to exclude.
        Forwarded to `to_bytes` together with any other keyword arguments.

    DOCS: https://spacy.io/api/tokenizer#to_disk
    """
    # A plain string has no `.open`, so normalise before touching the path.
    path = util.ensure_path(path)
    # Serialize first: to_bytes() raises ValueError for deprecated keyword
    # arguments, and opening with "wb" beforehand would truncate an
    # existing file even though nothing ends up being written.
    bytes_data = self.to_bytes(**kwargs)
    with path.open("wb") as file_:
        file_.write(bytes_data)
|
||||||
|
|
||||||
def from_disk(self, path, **kwargs):
    """Loads state from disk. Modifies the object in place and returns it.

    path (unicode or Path): Location to read from. It is opened directly in
        binary read mode, so it names a single file. Strings are converted
        with `util.ensure_path` first.
    exclude (list): String names of serialization fields to exclude.
        Forwarded to `from_bytes` together with any other keyword arguments.
    RETURNS (Tokenizer): The modified `Tokenizer` object.

    DOCS: https://spacy.io/api/tokenizer#from_disk
    """
    # A plain string has no `.open`, so normalise before touching the path.
    path = util.ensure_path(path)
    with path.open("rb") as file_:
        bytes_data = file_.read()
    self.from_bytes(bytes_data, **kwargs)
    return self
|
||||||
|
|
||||||
def to_bytes(self, **exclude):
|
def to_bytes(self, exclude=tuple(), **kwargs):
|
||||||
"""Serialize the current state to a binary string.
|
"""Serialize the current state to a binary string.
|
||||||
|
|
||||||
**exclude: Named attributes to prevent from being serialized.
|
exclude (list): String names of serialization fields to exclude.
|
||||||
RETURNS (bytes): The serialized form of the `Tokenizer` object.
|
RETURNS (bytes): The serialized form of the `Tokenizer` object.
|
||||||
|
|
||||||
DOCS: https://spacy.io/api/tokenizer#to_bytes
|
DOCS: https://spacy.io/api/tokenizer#to_bytes
|
||||||
|
@ -402,13 +403,14 @@ cdef class Tokenizer:
|
||||||
("token_match", lambda: _get_regex_pattern(self.token_match)),
|
("token_match", lambda: _get_regex_pattern(self.token_match)),
|
||||||
("exceptions", lambda: OrderedDict(sorted(self._rules.items())))
|
("exceptions", lambda: OrderedDict(sorted(self._rules.items())))
|
||||||
))
|
))
|
||||||
|
exclude = util.get_serialization_exclude(serializers, exclude, kwargs)
|
||||||
return util.to_bytes(serializers, exclude)
|
return util.to_bytes(serializers, exclude)
|
||||||
|
|
||||||
def from_bytes(self, bytes_data, **exclude):
|
def from_bytes(self, bytes_data, exclude=tuple(), **kwargs):
|
||||||
"""Load state from a binary string.
|
"""Load state from a binary string.
|
||||||
|
|
||||||
bytes_data (bytes): The data to load from.
|
bytes_data (bytes): The data to load from.
|
||||||
**exclude: Named attributes to prevent from being loaded.
|
exclude (list): String names of serialization fields to exclude.
|
||||||
RETURNS (Tokenizer): The `Tokenizer` object.
|
RETURNS (Tokenizer): The `Tokenizer` object.
|
||||||
|
|
||||||
DOCS: https://spacy.io/api/tokenizer#from_bytes
|
DOCS: https://spacy.io/api/tokenizer#from_bytes
|
||||||
|
@ -422,6 +424,7 @@ cdef class Tokenizer:
|
||||||
("token_match", lambda b: data.setdefault("token_match", b)),
|
("token_match", lambda b: data.setdefault("token_match", b)),
|
||||||
("exceptions", lambda b: data.setdefault("rules", b))
|
("exceptions", lambda b: data.setdefault("rules", b))
|
||||||
))
|
))
|
||||||
|
exclude = util.get_serialization_exclude(deserializers, exclude, kwargs)
|
||||||
msg = util.from_bytes(bytes_data, deserializers, exclude)
|
msg = util.from_bytes(bytes_data, deserializers, exclude)
|
||||||
if data.get("prefix_search"):
|
if data.get("prefix_search"):
|
||||||
self.prefix_search = re.compile(data["prefix_search"]).search
|
self.prefix_search = re.compile(data["prefix_search"]).search
|
||||||
|
|
|
@ -794,24 +794,26 @@ cdef class Doc:
|
||||||
"""
|
"""
|
||||||
return numpy.asarray(_get_lca_matrix(self, 0, len(self)))
|
return numpy.asarray(_get_lca_matrix(self, 0, len(self)))
|
||||||
|
|
||||||
def to_disk(self, path, **kwargs):
    """Save the current state to disk.

    path (unicode or Path): Location to write to. It is opened directly in
        binary write mode, so it names a single file. Paths may be either
        strings or Path-like objects.
    exclude (list): String names of serialization fields to exclude.
        Forwarded to `to_bytes` together with any other keyword arguments.

    DOCS: https://spacy.io/api/doc#to_disk
    """
    path = util.ensure_path(path)
    # Serialize first: to_bytes() raises ValueError for deprecated keyword
    # arguments, and opening with "wb" beforehand would truncate an
    # existing file even though nothing ends up being written.
    bytes_data = self.to_bytes(**kwargs)
    with path.open("wb") as file_:
        file_.write(bytes_data)
|
||||||
|
|
||||||
def from_disk(self, path, **exclude):
|
def from_disk(self, path, **kwargs):
|
||||||
"""Loads state from a directory. Modifies the object in place and
|
"""Loads state from a directory. Modifies the object in place and
|
||||||
returns it.
|
returns it.
|
||||||
|
|
||||||
path (unicode or Path): A path to a directory. Paths may be either
|
path (unicode or Path): A path to a directory. Paths may be either
|
||||||
strings or `Path`-like objects.
|
strings or `Path`-like objects.
|
||||||
|
exclude (list): String names of serialization fields to exclude.
|
||||||
RETURNS (Doc): The modified `Doc` object.
|
RETURNS (Doc): The modified `Doc` object.
|
||||||
|
|
||||||
DOCS: https://spacy.io/api/doc#from_disk
|
DOCS: https://spacy.io/api/doc#from_disk
|
||||||
|
@ -819,11 +821,12 @@ cdef class Doc:
|
||||||
path = util.ensure_path(path)
|
path = util.ensure_path(path)
|
||||||
with path.open("rb") as file_:
|
with path.open("rb") as file_:
|
||||||
bytes_data = file_.read()
|
bytes_data = file_.read()
|
||||||
return self.from_bytes(bytes_data, **exclude)
|
return self.from_bytes(bytes_data, **kwargs)
|
||||||
|
|
||||||
def to_bytes(self, **exclude):
|
def to_bytes(self, exclude=tuple(), **kwargs):
|
||||||
"""Serialize, i.e. export the document contents to a binary string.
|
"""Serialize, i.e. export the document contents to a binary string.
|
||||||
|
|
||||||
|
exclude (list): String names of serialization fields to exclude.
|
||||||
RETURNS (bytes): A losslessly serialized copy of the `Doc`, including
|
RETURNS (bytes): A losslessly serialized copy of the `Doc`, including
|
||||||
all annotations.
|
all annotations.
|
||||||
|
|
||||||
|
@ -849,16 +852,22 @@ cdef class Doc:
|
||||||
"sentiment": lambda: self.sentiment,
|
"sentiment": lambda: self.sentiment,
|
||||||
"tensor": lambda: self.tensor,
|
"tensor": lambda: self.tensor,
|
||||||
}
|
}
|
||||||
|
for key in kwargs:
|
||||||
|
if key in serializers or key in ("user_data", "user_data_keys", "user_data_values"):
|
||||||
|
raise ValueError(Errors.E128.format(arg=key))
|
||||||
if "user_data" not in exclude and self.user_data:
|
if "user_data" not in exclude and self.user_data:
|
||||||
user_data_keys, user_data_values = list(zip(*self.user_data.items()))
|
user_data_keys, user_data_values = list(zip(*self.user_data.items()))
|
||||||
serializers["user_data_keys"] = lambda: srsly.msgpack_dumps(user_data_keys)
|
if "user_data_keys" not in exclude:
|
||||||
serializers["user_data_values"] = lambda: srsly.msgpack_dumps(user_data_values)
|
serializers["user_data_keys"] = lambda: srsly.msgpack_dumps(user_data_keys)
|
||||||
|
if "user_data_values" not in exclude:
|
||||||
|
serializers["user_data_values"] = lambda: srsly.msgpack_dumps(user_data_values)
|
||||||
return util.to_bytes(serializers, exclude)
|
return util.to_bytes(serializers, exclude)
|
||||||
|
|
||||||
def from_bytes(self, bytes_data, **exclude):
|
def from_bytes(self, bytes_data, exclude=tuple(), **kwargs):
|
||||||
"""Deserialize, i.e. import the document contents from a binary string.
|
"""Deserialize, i.e. import the document contents from a binary string.
|
||||||
|
|
||||||
data (bytes): The string to load from.
|
data (bytes): The string to load from.
|
||||||
|
exclude (list): String names of serialization fields to exclude.
|
||||||
RETURNS (Doc): Itself.
|
RETURNS (Doc): Itself.
|
||||||
|
|
||||||
DOCS: https://spacy.io/api/doc#from_bytes
|
DOCS: https://spacy.io/api/doc#from_bytes
|
||||||
|
@ -874,6 +883,9 @@ cdef class Doc:
|
||||||
"user_data_keys": lambda b: None,
|
"user_data_keys": lambda b: None,
|
||||||
"user_data_values": lambda b: None,
|
"user_data_values": lambda b: None,
|
||||||
}
|
}
|
||||||
|
for key in kwargs:
|
||||||
|
if key in deserializers or key in ("user_data",):
|
||||||
|
raise ValueError(Errors.E128.format(arg=key))
|
||||||
msg = util.from_bytes(bytes_data, deserializers, exclude)
|
msg = util.from_bytes(bytes_data, deserializers, exclude)
|
||||||
# Msgpack doesn't distinguish between lists and tuples, which is
|
# Msgpack doesn't distinguish between lists and tuples, which is
|
||||||
# vexing for user data. As a best guess, we *know* that within
|
# vexing for user data. As a best guess, we *know* that within
|
||||||
|
@ -1170,7 +1182,7 @@ cdef int [:,:] _get_lca_matrix(Doc doc, int start, int end):
|
||||||
|
|
||||||
|
|
||||||
def pickle_doc(doc):
    """Reduce a `Doc` for pickling.

    The doc is serialized without its vocab and user data; the user data and
    the three hook dicts are pickled separately and handed to `unpickle_doc`
    next to the vocab and the serialized bytes.

    doc (Doc): The document to reduce.
    RETURNS (tuple): `(unpickle_doc, (vocab, pickled_hooks_and_data, bytes))`.
    """
    serialized = doc.to_bytes(exclude=["vocab", "user_data"])
    extras = (
        doc.user_data,
        doc.user_hooks,
        doc.user_span_hooks,
        doc.user_token_hooks,
    )
    pickled_extras = srsly.pickle_dumps(extras)
    return (unpickle_doc, (doc.vocab, pickled_extras, serialized))
|
||||||
|
@ -1179,7 +1191,7 @@ def pickle_doc(doc):
|
||||||
def unpickle_doc(vocab, hooks_and_data, bytes_data):
|
def unpickle_doc(vocab, hooks_and_data, bytes_data):
|
||||||
user_data, doc_hooks, span_hooks, token_hooks = srsly.pickle_loads(hooks_and_data)
|
user_data, doc_hooks, span_hooks, token_hooks = srsly.pickle_loads(hooks_and_data)
|
||||||
|
|
||||||
doc = Doc(vocab, user_data=user_data).from_bytes(bytes_data, exclude="user_data")
|
doc = Doc(vocab, user_data=user_data).from_bytes(bytes_data, exclude=["user_data"])
|
||||||
doc.user_hooks.update(doc_hooks)
|
doc.user_hooks.update(doc_hooks)
|
||||||
doc.user_span_hooks.update(span_hooks)
|
doc.user_span_hooks.update(span_hooks)
|
||||||
doc.user_token_hooks.update(token_hooks)
|
doc.user_token_hooks.update(token_hooks)
|
||||||
|
|
|
@ -25,7 +25,7 @@ except ImportError:
|
||||||
from .symbols import ORTH
|
from .symbols import ORTH
|
||||||
from .compat import cupy, CudaStream, path2str, basestring_, unicode_
|
from .compat import cupy, CudaStream, path2str, basestring_, unicode_
|
||||||
from .compat import import_file
|
from .compat import import_file
|
||||||
from .errors import Errors
|
from .errors import Errors, Warnings, deprecation_warning
|
||||||
|
|
||||||
|
|
||||||
LANGUAGES = {}
|
LANGUAGES = {}
|
||||||
|
@ -565,7 +565,8 @@ def itershuffle(iterable, bufsize=1000):
|
||||||
def to_bytes(getters, exclude):
    """Call every non-excluded getter and pack the results with msgpack.

    getters (dict): Maps a field name to a zero-argument callable.
    exclude (list): Field names to skip. Only the part of a key before the
        first "." is compared, to support file names like meta.json.
    RETURNS (bytes): The msgpack-serialized mapping, in getter order.
    """
    serialized = OrderedDict(
        (key, getter())
        for key, getter in getters.items()
        if key.split(".")[0] not in exclude
    )
    return srsly.msgpack_dumps(serialized)
|
||||||
|
|
||||||
|
@ -573,7 +574,8 @@ def to_bytes(getters, exclude):
|
||||||
def from_bytes(bytes_data, setters, exclude):
    """Unpack msgpack data and feed each present, non-excluded field to its
    setter.

    bytes_data (bytes): The msgpack-serialized mapping.
    setters (dict): Maps a field name to a one-argument callable.
    exclude (list): Field names to skip. Only the part of a key before the
        first "." is compared, to support file names like meta.json.
    RETURNS: The unpacked message.
    """
    msg = srsly.msgpack_loads(bytes_data)
    for key, setter in setters.items():
        if key.split(".")[0] in exclude or key not in msg:
            continue
        setter(msg[key])
    return msg
|
||||||
|
|
||||||
|
@ -583,7 +585,8 @@ def to_disk(path, writers, exclude):
|
||||||
if not path.exists():
|
if not path.exists():
|
||||||
path.mkdir()
|
path.mkdir()
|
||||||
for key, writer in writers.items():
|
for key, writer in writers.items():
|
||||||
if key not in exclude:
|
# Split to support file names like meta.json
|
||||||
|
if key.split(".")[0] not in exclude:
|
||||||
writer(path / key)
|
writer(path / key)
|
||||||
return path
|
return path
|
||||||
|
|
||||||
|
@ -591,7 +594,8 @@ def to_disk(path, writers, exclude):
|
||||||
def from_disk(path, readers, exclude):
    """Run every non-excluded reader on its entry below `path`.

    path (unicode or Path): Base location; converted with `ensure_path`.
    readers (dict): Maps an entry name to a callable taking `path / name`.
    exclude (list): Entry names to skip. Only the part of a key before the
        first "." is compared, to support file names like meta.json.
    RETURNS (Path): The converted base path.
    """
    path = ensure_path(path)
    for name, read in readers.items():
        field = name.split(".")[0]
        if field in exclude:
            continue
        read(path / name)
    return path
|
||||||
|
|
||||||
|
@ -677,6 +681,23 @@ def validate_json(data, validator):
|
||||||
return errors
|
return errors
|
||||||
|
|
||||||
|
|
||||||
|
def get_serialization_exclude(serializers, exclude, kwargs):
    """Validate serialization arguments and bridge the pre-v2.1 keyword
    argument style to the `exclude` argument.

    serializers (iterable): Names of the available serialization fields.
        Only the part before the first "." is used, to support file names
        like meta.json.
    exclude (iterable): Field names the caller asked to exclude.
    kwargs (dict): Extra keyword arguments the caller passed.
    RETURNS (list): A new list with the names to exclude. `vocab=False` is
        still honoured (with a deprecation warning) and adds "vocab".
    RAISES (ValueError): If any other keyword argument names a known field.
    """
    excluded = list(exclude)
    known_fields = [field.split(".")[0] for field in serializers]
    for arg, flag in kwargs.items():
        if arg == "vocab" and flag is False:
            deprecation_warning(Warnings.W015.format(arg=arg))
            excluded.append(arg)
            continue
        if arg.split(".")[0] in known_fields:
            raise ValueError(Errors.E128.format(arg=arg))
        # TODO: user warning?
    return excluded
|
||||||
|
|
||||||
|
|
||||||
class SimpleFrozenDict(dict):
|
class SimpleFrozenDict(dict):
|
||||||
"""Simplified implementation of a frozen dict, mainly used as default
|
"""Simplified implementation of a frozen dict, mainly used as default
|
||||||
function or method argument (for arguments that should default to empty
|
function or method argument (for arguments that should default to empty
|
||||||
|
@ -696,14 +717,14 @@ class SimpleFrozenDict(dict):
|
||||||
class DummyTokenizer(object):
    """Mixin-style stub providing no-op serialization methods.

    Adds dummy to_bytes, from_bytes, to_disk and from_disk methods so the
    object can take part in serialization (see #1557). All extra keyword
    arguments are accepted and ignored.
    """

    def to_bytes(self, **kwargs):
        """Return an empty byte string."""
        return b""

    def from_bytes(self, _bytes_data, **kwargs):
        """Ignore the data and return the object itself."""
        return self

    def to_disk(self, _path, **kwargs):
        """Write nothing and return None."""
        return

    def from_disk(self, _path, **kwargs):
        """Read nothing and return the object itself."""
        return self
|
||||||
|
|
|
@ -377,11 +377,11 @@ cdef class Vectors:
|
||||||
self.add(key, row=i)
|
self.add(key, row=i)
|
||||||
return strings
|
return strings
|
||||||
|
|
||||||
def to_disk(self, path, **exclude):
|
def to_disk(self, path, **kwargs):
|
||||||
"""Save the current state to a directory.
|
"""Save the current state to a directory.
|
||||||
|
|
||||||
path (unicode / Path): A path to a directory, which will be created if
|
path (unicode / Path): A path to a directory, which will be created if
|
||||||
it doesn't exists. Either a string or a Path-like object.
|
it doesn't exists.
|
||||||
|
|
||||||
DOCS: https://spacy.io/api/vectors#to_disk
|
DOCS: https://spacy.io/api/vectors#to_disk
|
||||||
"""
|
"""
|
||||||
|
@ -394,9 +394,9 @@ cdef class Vectors:
|
||||||
("vectors", lambda p: save_array(self.data, p.open("wb"))),
|
("vectors", lambda p: save_array(self.data, p.open("wb"))),
|
||||||
("key2row", lambda p: srsly.write_msgpack(p, self.key2row))
|
("key2row", lambda p: srsly.write_msgpack(p, self.key2row))
|
||||||
))
|
))
|
||||||
return util.to_disk(path, serializers, exclude)
|
return util.to_disk(path, serializers, [])
|
||||||
|
|
||||||
def from_disk(self, path, **exclude):
|
def from_disk(self, path, **kwargs):
|
||||||
"""Loads state from a directory. Modifies the object in place and
|
"""Loads state from a directory. Modifies the object in place and
|
||||||
returns it.
|
returns it.
|
||||||
|
|
||||||
|
@ -428,13 +428,13 @@ cdef class Vectors:
|
||||||
("keys", load_keys),
|
("keys", load_keys),
|
||||||
("vectors", load_vectors),
|
("vectors", load_vectors),
|
||||||
))
|
))
|
||||||
util.from_disk(path, serializers, exclude)
|
util.from_disk(path, serializers, [])
|
||||||
return self
|
return self
|
||||||
|
|
||||||
def to_bytes(self, **exclude):
|
def to_bytes(self, **kwargs):
|
||||||
"""Serialize the current state to a binary string.
|
"""Serialize the current state to a binary string.
|
||||||
|
|
||||||
**exclude: Named attributes to prevent from being serialized.
|
exclude (list): String names of serialization fields to exclude.
|
||||||
RETURNS (bytes): The serialized form of the `Vectors` object.
|
RETURNS (bytes): The serialized form of the `Vectors` object.
|
||||||
|
|
||||||
DOCS: https://spacy.io/api/vectors#to_bytes
|
DOCS: https://spacy.io/api/vectors#to_bytes
|
||||||
|
@ -444,17 +444,18 @@ cdef class Vectors:
|
||||||
return self.data.to_bytes()
|
return self.data.to_bytes()
|
||||||
else:
|
else:
|
||||||
return srsly.msgpack_dumps(self.data)
|
return srsly.msgpack_dumps(self.data)
|
||||||
|
|
||||||
serializers = OrderedDict((
|
serializers = OrderedDict((
|
||||||
("key2row", lambda: srsly.msgpack_dumps(self.key2row)),
|
("key2row", lambda: srsly.msgpack_dumps(self.key2row)),
|
||||||
("vectors", serialize_weights)
|
("vectors", serialize_weights)
|
||||||
))
|
))
|
||||||
return util.to_bytes(serializers, exclude)
|
return util.to_bytes(serializers, [])
|
||||||
|
|
||||||
def from_bytes(self, data, **exclude):
|
def from_bytes(self, data, **kwargs):
|
||||||
"""Load state from a binary string.
|
"""Load state from a binary string.
|
||||||
|
|
||||||
data (bytes): The data to load from.
|
data (bytes): The data to load from.
|
||||||
**exclude: Named attributes to prevent from being loaded.
|
exclude (list): String names of serialization fields to exclude.
|
||||||
RETURNS (Vectors): The `Vectors` object.
|
RETURNS (Vectors): The `Vectors` object.
|
||||||
|
|
||||||
DOCS: https://spacy.io/api/vectors#from_bytes
|
DOCS: https://spacy.io/api/vectors#from_bytes
|
||||||
|
@ -469,5 +470,5 @@ cdef class Vectors:
|
||||||
("key2row", lambda b: self.key2row.update(srsly.msgpack_loads(b))),
|
("key2row", lambda b: self.key2row.update(srsly.msgpack_loads(b))),
|
||||||
("vectors", deserialize_weights)
|
("vectors", deserialize_weights)
|
||||||
))
|
))
|
||||||
util.from_bytes(data, deserializers, exclude)
|
util.from_bytes(data, deserializers, [])
|
||||||
return self
|
return self
|
||||||
|
|
|
@ -397,47 +397,57 @@ cdef class Vocab:
|
||||||
orth = self.strings.add(orth)
|
orth = self.strings.add(orth)
|
||||||
return orth in self.vectors
|
return orth in self.vectors
|
||||||
|
|
||||||
def to_disk(self, path, exclude=tuple(), **kwargs):
    """Save the current state to a directory.

    path (unicode or Path): A path to a directory, which will be created if
        it doesn't exist.
    exclude (list): String names of serialization fields to exclude. The
        fields are "strings" (written to strings.json), "lexemes" (written
        to lexemes.bin) and "vectors" (written by `Vectors.to_disk`).

    DOCS: https://spacy.io/api/vocab#to_disk
    """
    path = util.ensure_path(path)
    if not path.exists():
        path.mkdir()
    setters = ["strings", "lexemes", "vectors"]
    # Validates **kwargs and folds the deprecated vocab=False into the list.
    exclude = util.get_serialization_exclude(setters, exclude, kwargs)
    if "strings" not in exclude:
        self.strings.to_disk(path / "strings.json")
    if "lexemes" not in exclude:
        with (path / "lexemes.bin").open("wb") as file_:
            file_.write(self.lexemes_to_bytes())
    # Must test the `exclude` list, not the string literal "exclude" (a
    # substring test against the literal is always true for "vectors").
    if "vectors" not in exclude and self.vectors is not None:
        self.vectors.to_disk(path)
|
||||||
|
|
||||||
def from_disk(self, path, exclude=tuple(), **kwargs):
    """Loads state from a directory. Modifies the object in place and
    returns it.

    path (unicode or Path): A path to a directory.
    exclude (list): String names of serialization fields to exclude. The
        fields are "strings" (read from strings.json), "lexemes" (read from
        lexemes.bin) and "vectors" (read by `Vectors.from_disk`).
    RETURNS (Vocab): The modified `Vocab` object.

    DOCS: https://spacy.io/api/vocab#from_disk
    """
    path = util.ensure_path(path)
    getters = ["strings", "lexemes", "vectors"]
    # Validates **kwargs and folds the deprecated vocab=False into the list.
    exclude = util.get_serialization_exclude(getters, exclude, kwargs)
    if "strings" not in exclude:
        self.strings.from_disk(path / "strings.json")  # TODO: add exclude?
    if "lexemes" not in exclude:
        with (path / "lexemes.bin").open("rb") as file_:
            self.lexemes_from_bytes(file_.read())
    if "vectors" not in exclude:
        if self.vectors is not None:
            self.vectors.from_disk(path, exclude=["strings"])
            # Kept under the None guard so `.name` is never read from a
            # missing vectors table.
            if self.vectors.name is not None:
                link_vectors_to_models(self)
    return self
|
||||||
|
|
||||||
def to_bytes(self, **exclude):
|
def to_bytes(self, exclude=tuple(), **kwargs):
|
||||||
"""Serialize the current state to a binary string.
|
"""Serialize the current state to a binary string.
|
||||||
|
|
||||||
**exclude: Named attributes to prevent from being serialized.
|
exclude (list): String names of serialization fields to exclude.
|
||||||
RETURNS (bytes): The serialized form of the `Vocab` object.
|
RETURNS (bytes): The serialized form of the `Vocab` object.
|
||||||
|
|
||||||
DOCS: https://spacy.io/api/vocab#to_bytes
|
DOCS: https://spacy.io/api/vocab#to_bytes
|
||||||
|
@ -453,13 +463,14 @@ cdef class Vocab:
|
||||||
("lexemes", lambda: self.lexemes_to_bytes()),
|
("lexemes", lambda: self.lexemes_to_bytes()),
|
||||||
("vectors", deserialize_vectors)
|
("vectors", deserialize_vectors)
|
||||||
))
|
))
|
||||||
|
exclude = util.get_serialization_exclude(getters, exclude, kwargs)
|
||||||
return util.to_bytes(getters, exclude)
|
return util.to_bytes(getters, exclude)
|
||||||
|
|
||||||
def from_bytes(self, bytes_data, **exclude):
|
def from_bytes(self, bytes_data, exclude=tuple(), **kwargs):
|
||||||
"""Load state from a binary string.
|
"""Load state from a binary string.
|
||||||
|
|
||||||
bytes_data (bytes): The data to load from.
|
bytes_data (bytes): The data to load from.
|
||||||
**exclude: Named attributes to prevent from being loaded.
|
exclude (list): String names of serialization fields to exclude.
|
||||||
RETURNS (Vocab): The `Vocab` object.
|
RETURNS (Vocab): The `Vocab` object.
|
||||||
|
|
||||||
DOCS: https://spacy.io/api/vocab#from_bytes
|
DOCS: https://spacy.io/api/vocab#from_bytes
|
||||||
|
@ -469,11 +480,13 @@ cdef class Vocab:
|
||||||
return None
|
return None
|
||||||
else:
|
else:
|
||||||
return self.vectors.from_bytes(b)
|
return self.vectors.from_bytes(b)
|
||||||
|
|
||||||
setters = OrderedDict((
|
setters = OrderedDict((
|
||||||
("strings", lambda b: self.strings.from_bytes(b)),
|
("strings", lambda b: self.strings.from_bytes(b)),
|
||||||
("lexemes", lambda b: self.lexemes_from_bytes(b)),
|
("lexemes", lambda b: self.lexemes_from_bytes(b)),
|
||||||
("vectors", lambda b: serialize_vectors(b))
|
("vectors", lambda b: serialize_vectors(b))
|
||||||
))
|
))
|
||||||
|
exclude = util.get_serialization_exclude(setters, exclude, kwargs)
|
||||||
util.from_bytes(bytes_data, setters, exclude)
|
util.from_bytes(bytes_data, setters, exclude)
|
||||||
if self.vectors.name is not None:
|
if self.vectors.name is not None:
|
||||||
link_vectors_to_models(self)
|
link_vectors_to_models(self)
|
||||||
|
|
|
@ -244,9 +244,10 @@ Serialize the pipe to disk.
|
||||||
> parser.to_disk("/path/to/parser")
|
> parser.to_disk("/path/to/parser")
|
||||||
> ```
|
> ```
|
||||||
|
|
||||||
| Name | Type | Description |
|
| Name | Type | Description |
|
||||||
| ------ | ---------------- | --------------------------------------------------------------------------------------------------------------------- |
|
| --------- | ---------------- | --------------------------------------------------------------------------------------------------------------------- |
|
||||||
| `path` | unicode / `Path` | A path to a directory, which will be created if it doesn't exist. Paths may be either strings or `Path`-like objects. |
|
| `path` | unicode / `Path` | A path to a directory, which will be created if it doesn't exist. Paths may be either strings or `Path`-like objects. |
|
||||||
|
| `exclude` | list | String names of [serialization fields](#serialization-fields) to exclude. |
|
||||||
|
|
||||||
## DependencyParser.from_disk {#from_disk tag="method"}
|
## DependencyParser.from_disk {#from_disk tag="method"}
|
||||||
|
|
||||||
|
@ -262,6 +263,7 @@ Load the pipe from disk. Modifies the object in place and returns it.
|
||||||
| Name | Type | Description |
|
| Name | Type | Description |
|
||||||
| ----------- | ------------------ | -------------------------------------------------------------------------- |
|
| ----------- | ------------------ | -------------------------------------------------------------------------- |
|
||||||
| `path` | unicode / `Path` | A path to a directory. Paths may be either strings or `Path`-like objects. |
|
| `path` | unicode / `Path` | A path to a directory. Paths may be either strings or `Path`-like objects. |
|
||||||
|
| `exclude` | list | String names of [serialization fields](#serialization-fields) to exclude. |
|
||||||
| **RETURNS** | `DependencyParser` | The modified `DependencyParser` object. |
|
| **RETURNS** | `DependencyParser` | The modified `DependencyParser` object. |
|
||||||
|
|
||||||
## DependencyParser.to_bytes {#to_bytes tag="method"}
|
## DependencyParser.to_bytes {#to_bytes tag="method"}
|
||||||
|
@ -275,10 +277,10 @@ Load the pipe from disk. Modifies the object in place and returns it.
|
||||||
|
|
||||||
Serialize the pipe to a bytestring.
|
Serialize the pipe to a bytestring.
|
||||||
|
|
||||||
| Name | Type | Description |
|
| Name | Type | Description |
|
||||||
| ----------- | ----- | ----------------------------------------------------- |
|
| ----------- | ----- | ------------------------------------------------------------------------- |
|
||||||
| `**exclude` | - | Named attributes to prevent from being serialized. |
|
| `exclude` | list | String names of [serialization fields](#serialization-fields) to exclude. |
|
||||||
| **RETURNS** | bytes | The serialized form of the `DependencyParser` object. |
|
| **RETURNS** | bytes | The serialized form of the `DependencyParser` object. |
|
||||||
|
|
||||||
## DependencyParser.from_bytes {#from_bytes tag="method"}
|
## DependencyParser.from_bytes {#from_bytes tag="method"}
|
||||||
|
|
||||||
|
@ -292,11 +294,11 @@ Load the pipe from a bytestring. Modifies the object in place and returns it.
|
||||||
> parser.from_bytes(parser_bytes)
|
> parser.from_bytes(parser_bytes)
|
||||||
> ```
|
> ```
|
||||||
|
|
||||||
| Name | Type | Description |
|
| Name | Type | Description |
|
||||||
| ------------ | ------------------ | ---------------------------------------------- |
|
| ------------ | ------------------ | ------------------------------------------------------------------------- |
|
||||||
| `bytes_data` | bytes | The data to load from. |
|
| `bytes_data` | bytes | The data to load from. |
|
||||||
| `**exclude` | - | Named attributes to prevent from being loaded. |
|
| `exclude` | list | String names of [serialization fields](#serialization-fields) to exclude. |
|
||||||
| **RETURNS** | `DependencyParser` | The `DependencyParser` object. |
|
| **RETURNS** | `DependencyParser` | The `DependencyParser` object. |
|
||||||
|
|
||||||
## DependencyParser.labels {#labels tag="property"}
|
## DependencyParser.labels {#labels tag="property"}
|
||||||
|
|
||||||
|
@ -312,3 +314,21 @@ The labels currently added to the component.
|
||||||
| Name | Type | Description |
|
| Name | Type | Description |
|
||||||
| ----------- | ----- | ---------------------------------- |
|
| ----------- | ----- | ---------------------------------- |
|
||||||
| **RETURNS** | tuple | The labels added to the component. |
|
| **RETURNS** | tuple | The labels added to the component. |
|
||||||
|
|
||||||
|
## Serialization fields {#serialization-fields}
|
||||||
|
|
||||||
|
During serialization, spaCy will export several data fields used to restore
|
||||||
|
different aspects of the object. If needed, you can exclude them from
|
||||||
|
serialization by passing in the string names via the `exclude` argument.
|
||||||
|
|
||||||
|
> #### Example
|
||||||
|
>
|
||||||
|
> ```python
|
||||||
|
> parser.to_disk("/path", exclude=["vocab"])
|
||||||
|
> ```
|
||||||
|
|
||||||
|
| Name | Description |
|
||||||
|
| ------- | -------------------------------------------------------------- |
|
||||||
|
| `vocab` | The shared [`Vocab`](/api/vocab). |
|
||||||
|
| `cfg` | The config file. You usually don't want to exclude this. |
|
||||||
|
| `model` | The binary model data. You usually don't want to exclude this. |
|
||||||
|
|
|
@ -349,11 +349,12 @@ array of attributes.
|
||||||
> assert doc[0].pos_ == doc2[0].pos_
|
> assert doc[0].pos_ == doc2[0].pos_
|
||||||
> ```
|
> ```
|
||||||
|
|
||||||
| Name | Type | Description |
|
| Name | Type | Description |
|
||||||
| ----------- | -------------------------------------- | ----------------------------- |
|
| ----------- | -------------------------------------- | ------------------------------------------------------------------------- |
|
||||||
| `attrs` | list | A list of attribute ID ints. |
|
| `attrs` | list | A list of attribute ID ints. |
|
||||||
| `array` | `numpy.ndarray[ndim=2, dtype='int32']` | The attribute values to load. |
|
| `array` | `numpy.ndarray[ndim=2, dtype='int32']` | The attribute values to load. |
|
||||||
| **RETURNS** | `Doc` | Itself. |
|
| `exclude` | list | String names of [serialization fields](#serialization-fields) to exclude. |
|
||||||
|
| **RETURNS** | `Doc` | Itself. |
|
||||||
|
|
||||||
## Doc.to_disk {#to_disk tag="method" new="2"}
|
## Doc.to_disk {#to_disk tag="method" new="2"}
|
||||||
|
|
||||||
|
@ -365,9 +366,10 @@ Save the current state to a directory.
|
||||||
> doc.to_disk("/path/to/doc")
|
> doc.to_disk("/path/to/doc")
|
||||||
> ```
|
> ```
|
||||||
|
|
||||||
| Name | Type | Description |
|
| Name | Type | Description |
|
||||||
| ------ | ---------------- | --------------------------------------------------------------------------------------------------------------------- |
|
| --------- | ---------------- | --------------------------------------------------------------------------------------------------------------------- |
|
||||||
| `path` | unicode / `Path` | A path to a directory, which will be created if it doesn't exist. Paths may be either strings or `Path`-like objects. |
|
| `path` | unicode / `Path` | A path to a directory, which will be created if it doesn't exist. Paths may be either strings or `Path`-like objects. |
|
||||||
|
| `exclude` | list | String names of [serialization fields](#serialization-fields) to exclude. |
|
||||||
|
|
||||||
## Doc.from_disk {#from_disk tag="method" new="2"}
|
## Doc.from_disk {#from_disk tag="method" new="2"}
|
||||||
|
|
||||||
|
@ -384,6 +386,7 @@ Loads state from a directory. Modifies the object in place and returns it.
|
||||||
| Name | Type | Description |
|
| Name | Type | Description |
|
||||||
| ----------- | ---------------- | -------------------------------------------------------------------------- |
|
| ----------- | ---------------- | -------------------------------------------------------------------------- |
|
||||||
| `path` | unicode / `Path` | A path to a directory. Paths may be either strings or `Path`-like objects. |
|
| `path` | unicode / `Path` | A path to a directory. Paths may be either strings or `Path`-like objects. |
|
||||||
|
| `exclude` | list | String names of [serialization fields](#serialization-fields) to exclude. |
|
||||||
| **RETURNS** | `Doc` | The modified `Doc` object. |
|
| **RETURNS** | `Doc` | The modified `Doc` object. |
|
||||||
|
|
||||||
## Doc.to_bytes {#to_bytes tag="method"}
|
## Doc.to_bytes {#to_bytes tag="method"}
|
||||||
|
@ -397,9 +400,10 @@ Serialize, i.e. export the document contents to a binary string.
|
||||||
> doc_bytes = doc.to_bytes()
|
> doc_bytes = doc.to_bytes()
|
||||||
> ```
|
> ```
|
||||||
|
|
||||||
| Name | Type | Description |
|
| Name | Type | Description |
|
||||||
| ----------- | ----- | --------------------------------------------------------------------- |
|
| ----------- | ----- | ------------------------------------------------------------------------- |
|
||||||
| **RETURNS** | bytes | A losslessly serialized copy of the `Doc`, including all annotations. |
|
| `exclude` | list | String names of [serialization fields](#serialization-fields) to exclude. |
|
||||||
|
| **RETURNS** | bytes | A losslessly serialized copy of the `Doc`, including all annotations. |
|
||||||
|
|
||||||
## Doc.from_bytes {#from_bytes tag="method"}
|
## Doc.from_bytes {#from_bytes tag="method"}
|
||||||
|
|
||||||
|
@ -416,10 +420,11 @@ Deserialize, i.e. import the document contents from a binary string.
|
||||||
> assert doc.text == doc2.text
|
> assert doc.text == doc2.text
|
||||||
> ```
|
> ```
|
||||||
|
|
||||||
| Name | Type | Description |
|
| Name | Type | Description |
|
||||||
| ----------- | ----- | ------------------------ |
|
| ----------- | ----- | ------------------------------------------------------------------------- |
|
||||||
| `data` | bytes | The string to load from. |
|
| `data` | bytes | The string to load from. |
|
||||||
| **RETURNS** | `Doc` | The `Doc` object. |
|
| `exclude` | list | String names of [serialization fields](#serialization-fields) to exclude. |
|
||||||
|
| **RETURNS** | `Doc` | The `Doc` object. |
|
||||||
|
|
||||||
## Doc.retokenize {#retokenize tag="contextmanager" new="2.1"}
|
## Doc.retokenize {#retokenize tag="contextmanager" new="2.1"}
|
||||||
|
|
||||||
|
@ -658,3 +663,25 @@ The L2 norm of the document's vector representation.
|
||||||
| `user_token_hooks` | dict | A dictionary that allows customization of properties of `Token` children. |
|
| `user_token_hooks` | dict | A dictionary that allows customization of properties of `Token` children. |
|
||||||
| `user_span_hooks` | dict | A dictionary that allows customization of properties of `Span` children. |
|
| `user_span_hooks` | dict | A dictionary that allows customization of properties of `Span` children. |
|
||||||
| `_` | `Underscore` | User space for adding custom [attribute extensions](/usage/processing-pipelines#custom-components-attributes). |
|
| `_` | `Underscore` | User space for adding custom [attribute extensions](/usage/processing-pipelines#custom-components-attributes). |
|
||||||
|
|
||||||
|
## Serialization fields {#serialization-fields}
|
||||||
|
|
||||||
|
During serialization, spaCy will export several data fields used to restore
|
||||||
|
different aspects of the object. If needed, you can exclude them from
|
||||||
|
serialization by passing in the string names via the `exclude` argument.
|
||||||
|
|
||||||
|
> #### Example
|
||||||
|
>
|
||||||
|
> ```python
|
||||||
|
> data = doc.to_bytes(exclude=["text", "tensor"])
|
||||||
|
> doc.from_disk("./doc.bin", exclude=["user_data"])
|
||||||
|
> ```
|
||||||
|
|
||||||
|
| Name | Description |
|
||||||
|
| ------------------ | --------------------------------------------- |
|
||||||
|
| `text` | The value of the `Doc.text` attribute. |
|
||||||
|
| `sentiment` | The value of the `Doc.sentiment` attribute. |
|
||||||
|
| `tensor` | The value of the `Doc.tensor` attribute. |
|
||||||
|
| `user_data` | The value of the `Doc.user_data` dictionary. |
|
||||||
|
| `user_data_keys` | The keys of the `Doc.user_data` dictionary. |
|
||||||
|
| `user_data_values` | The values of the `Doc.user_data` dictionary. |
|
||||||
|
|
|
@ -244,9 +244,10 @@ Serialize the pipe to disk.
|
||||||
> ner.to_disk("/path/to/ner")
|
> ner.to_disk("/path/to/ner")
|
||||||
> ```
|
> ```
|
||||||
|
|
||||||
| Name | Type | Description |
|
| Name | Type | Description |
|
||||||
| ------ | ---------------- | --------------------------------------------------------------------------------------------------------------------- |
|
| --------- | ---------------- | --------------------------------------------------------------------------------------------------------------------- |
|
||||||
| `path` | unicode / `Path` | A path to a directory, which will be created if it doesn't exist. Paths may be either strings or `Path`-like objects. |
|
| `path` | unicode / `Path` | A path to a directory, which will be created if it doesn't exist. Paths may be either strings or `Path`-like objects. |
|
||||||
|
| `exclude` | list | String names of [serialization fields](#serialization-fields) to exclude. |
|
||||||
|
|
||||||
## EntityRecognizer.from_disk {#from_disk tag="method"}
|
## EntityRecognizer.from_disk {#from_disk tag="method"}
|
||||||
|
|
||||||
|
@ -262,6 +263,7 @@ Load the pipe from disk. Modifies the object in place and returns it.
|
||||||
| Name | Type | Description |
|
| Name | Type | Description |
|
||||||
| ----------- | ------------------ | -------------------------------------------------------------------------- |
|
| ----------- | ------------------ | -------------------------------------------------------------------------- |
|
||||||
| `path` | unicode / `Path` | A path to a directory. Paths may be either strings or `Path`-like objects. |
|
| `path` | unicode / `Path` | A path to a directory. Paths may be either strings or `Path`-like objects. |
|
||||||
|
| `exclude` | list | String names of [serialization fields](#serialization-fields) to exclude. |
|
||||||
| **RETURNS** | `EntityRecognizer` | The modified `EntityRecognizer` object. |
|
| **RETURNS** | `EntityRecognizer` | The modified `EntityRecognizer` object. |
|
||||||
|
|
||||||
## EntityRecognizer.to_bytes {#to_bytes tag="method"}
|
## EntityRecognizer.to_bytes {#to_bytes tag="method"}
|
||||||
|
@ -275,10 +277,10 @@ Load the pipe from disk. Modifies the object in place and returns it.
|
||||||
|
|
||||||
Serialize the pipe to a bytestring.
|
Serialize the pipe to a bytestring.
|
||||||
|
|
||||||
| Name | Type | Description |
|
| Name | Type | Description |
|
||||||
| ----------- | ----- | ----------------------------------------------------- |
|
| ----------- | ----- | ------------------------------------------------------------------------- |
|
||||||
| `**exclude` | - | Named attributes to prevent from being serialized. |
|
| `exclude` | list | String names of [serialization fields](#serialization-fields) to exclude. |
|
||||||
| **RETURNS** | bytes | The serialized form of the `EntityRecognizer` object. |
|
| **RETURNS** | bytes | The serialized form of the `EntityRecognizer` object. |
|
||||||
|
|
||||||
## EntityRecognizer.from_bytes {#from_bytes tag="method"}
|
## EntityRecognizer.from_bytes {#from_bytes tag="method"}
|
||||||
|
|
||||||
|
@ -292,11 +294,11 @@ Load the pipe from a bytestring. Modifies the object in place and returns it.
|
||||||
> ner.from_bytes(ner_bytes)
|
> ner.from_bytes(ner_bytes)
|
||||||
> ```
|
> ```
|
||||||
|
|
||||||
| Name | Type | Description |
|
| Name | Type | Description |
|
||||||
| ------------ | ------------------ | ---------------------------------------------- |
|
| ------------ | ------------------ | ------------------------------------------------------------------------- |
|
||||||
| `bytes_data` | bytes | The data to load from. |
|
| `bytes_data` | bytes | The data to load from. |
|
||||||
| `**exclude` | - | Named attributes to prevent from being loaded. |
|
| `exclude` | list | String names of [serialization fields](#serialization-fields) to exclude. |
|
||||||
| **RETURNS** | `EntityRecognizer` | The `EntityRecognizer` object. |
|
| **RETURNS** | `EntityRecognizer` | The `EntityRecognizer` object. |
|
||||||
|
|
||||||
## EntityRecognizer.labels {#labels tag="property"}
|
## EntityRecognizer.labels {#labels tag="property"}
|
||||||
|
|
||||||
|
@ -312,3 +314,21 @@ The labels currently added to the component.
|
||||||
| Name | Type | Description |
|
| Name | Type | Description |
|
||||||
| ----------- | ----- | ---------------------------------- |
|
| ----------- | ----- | ---------------------------------- |
|
||||||
| **RETURNS** | tuple | The labels added to the component. |
|
| **RETURNS** | tuple | The labels added to the component. |
|
||||||
|
|
||||||
|
## Serialization fields {#serialization-fields}
|
||||||
|
|
||||||
|
During serialization, spaCy will export several data fields used to restore
|
||||||
|
different aspects of the object. If needed, you can exclude them from
|
||||||
|
serialization by passing in the string names via the `exclude` argument.
|
||||||
|
|
||||||
|
> #### Example
|
||||||
|
>
|
||||||
|
> ```python
|
||||||
|
> ner.to_disk("/path", exclude=["vocab"])
|
||||||
|
> ```
|
||||||
|
|
||||||
|
| Name | Description |
|
||||||
|
| ------- | -------------------------------------------------------------- |
|
||||||
|
| `vocab` | The shared [`Vocab`](/api/vocab). |
|
||||||
|
| `cfg` | The config file. You usually don't want to exclude this. |
|
||||||
|
| `model` | The binary model data. You usually don't want to exclude this. |
|
||||||
|
|
|
@ -327,7 +327,7 @@ the model**.
|
||||||
| Name | Type | Description |
|
| Name | Type | Description |
|
||||||
| --------- | ---------------- | --------------------------------------------------------------------------------------------------------------------- |
|
| --------- | ---------------- | --------------------------------------------------------------------------------------------------------------------- |
|
||||||
| `path` | unicode / `Path` | A path to a directory, which will be created if it doesn't exist. Paths may be either strings or `Path`-like objects. |
|
| `path` | unicode / `Path` | A path to a directory, which will be created if it doesn't exist. Paths may be either strings or `Path`-like objects. |
|
||||||
| `disable` | list | Names of pipeline components to [disable](/usage/processing-pipelines#disabling) and prevent from being saved. |
|
| `exclude` | list | Names of pipeline components or [serialization fields](#serialization-fields) to exclude. |
|
||||||
|
|
||||||
## Language.from_disk {#from_disk tag="method" new="2"}
|
## Language.from_disk {#from_disk tag="method" new="2"}
|
||||||
|
|
||||||
|
@ -349,22 +349,22 @@ loaded object.
|
||||||
> nlp = English().from_disk("/path/to/en_model")
|
> nlp = English().from_disk("/path/to/en_model")
|
||||||
> ```
|
> ```
|
||||||
|
|
||||||
| Name | Type | Description |
|
| Name | Type | Description |
|
||||||
| ----------- | ---------------- | --------------------------------------------------------------------------------- |
|
| ----------- | ---------------- | ----------------------------------------------------------------------------------------- |
|
||||||
| `path` | unicode / `Path` | A path to a directory. Paths may be either strings or `Path`-like objects. |
|
| `path` | unicode / `Path` | A path to a directory. Paths may be either strings or `Path`-like objects. |
|
||||||
| `disable` | list | Names of pipeline components to [disable](/usage/processing-pipelines#disabling). |
|
| `exclude` | list | Names of pipeline components or [serialization fields](#serialization-fields) to exclude. |
|
||||||
| **RETURNS** | `Language` | The modified `Language` object. |
|
| **RETURNS** | `Language` | The modified `Language` object. |
|
||||||
|
|
||||||
<Infobox title="Changed in v2.0" variant="warning">
|
<Infobox title="Changed in v2.0" variant="warning">
|
||||||
|
|
||||||
As of spaCy v2.0, the `save_to_directory` method has been renamed to `to_disk`,
|
As of spaCy v2.0, the `save_to_directory` method has been renamed to `to_disk`,
|
||||||
to improve consistency across classes. Pipeline components to prevent from being
|
to improve consistency across classes. Pipeline components to prevent from being
|
||||||
loaded can now be added as a list to `disable`, instead of specifying one
|
loaded can now be added as a list to `disable` (v2.0) or `exclude` (v2.1),
|
||||||
keyword argument per component.
|
instead of specifying one keyword argument per component.
|
||||||
|
|
||||||
```diff
|
```diff
|
||||||
- nlp = spacy.load("en", tagger=False, entity=False)
|
- nlp = spacy.load("en", tagger=False, entity=False)
|
||||||
+ nlp = English().from_disk("/model", disable=["tagger', 'ner"])
|
+ nlp = English().from_disk("/model", exclude=["tagger", "ner"])
|
||||||
```
|
```
|
||||||
|
|
||||||
</Infobox>
|
</Infobox>
|
||||||
|
@ -379,10 +379,10 @@ Serialize the current state to a binary string.
|
||||||
> nlp_bytes = nlp.to_bytes()
|
> nlp_bytes = nlp.to_bytes()
|
||||||
> ```
|
> ```
|
||||||
|
|
||||||
| Name | Type | Description |
|
| Name | Type | Description |
|
||||||
| ----------- | ----- | ------------------------------------------------------------------------------------------------------------------- |
|
| ----------- | ----- | ----------------------------------------------------------------------------------------- |
|
||||||
| `disable` | list | Names of pipeline components to [disable](/usage/processing-pipelines#disabling) and prevent from being serialized. |
|
| `exclude` | list | Names of pipeline components or [serialization fields](#serialization-fields) to exclude. |
|
||||||
| **RETURNS** | bytes | The serialized form of the `Language` object. |
|
| **RETURNS** | bytes | The serialized form of the `Language` object. |
|
||||||
|
|
||||||
## Language.from_bytes {#from_bytes tag="method"}
|
## Language.from_bytes {#from_bytes tag="method"}
|
||||||
|
|
||||||
|
@ -400,20 +400,21 @@ available to the loaded object.
|
||||||
> nlp2.from_bytes(nlp_bytes)
|
> nlp2.from_bytes(nlp_bytes)
|
||||||
> ```
|
> ```
|
||||||
|
|
||||||
| Name | Type | Description |
|
| Name | Type | Description |
|
||||||
| ------------ | ---------- | --------------------------------------------------------------------------------- |
|
| ------------ | ---------- | ----------------------------------------------------------------------------------------- |
|
||||||
| `bytes_data` | bytes | The data to load from. |
|
| `bytes_data` | bytes | The data to load from. |
|
||||||
| `disable` | list | Names of pipeline components to [disable](/usage/processing-pipelines#disabling). |
|
| `exclude` | list | Names of pipeline components or [serialization fields](#serialization-fields) to exclude. |
|
||||||
| **RETURNS** | `Language` | The `Language` object. |
|
| **RETURNS** | `Language` | The `Language` object. |
|
||||||
|
|
||||||
<Infobox title="Changed in v2.0" variant="warning">
|
<Infobox title="Changed in v2.0" variant="warning">
|
||||||
|
|
||||||
Pipeline components to prevent from being loaded can now be added as a list to
|
Pipeline components to prevent from being loaded can now be added as a list to
|
||||||
`disable`, instead of specifying one keyword argument per component.
|
`disable` (v2.0) or `exclude` (v2.1), instead of specifying one keyword argument
|
||||||
|
per component.
|
||||||
|
|
||||||
```diff
|
```diff
|
||||||
- nlp = English().from_bytes(bytes, tagger=False, entity=False)
|
- nlp = English().from_bytes(bytes, tagger=False, entity=False)
|
||||||
+ nlp = English().from_bytes(bytes, disable=["tagger", "ner"])
|
+ nlp = English().from_bytes(bytes, exclude=["tagger", "ner"])
|
||||||
```
|
```
|
||||||
|
|
||||||
</Infobox>
|
</Infobox>
|
||||||
|
@ -437,3 +438,23 @@ Pipeline components to prevent from being loaded can now be added as a list to
|
||||||
| `Defaults` | class | Settings, data and factory methods for creating the `nlp` object and processing pipeline. |
|
| `Defaults` | class | Settings, data and factory methods for creating the `nlp` object and processing pipeline. |
|
||||||
| `lang` | unicode | Two-letter language ID, i.e. [ISO code](https://en.wikipedia.org/wiki/List_of_ISO_639-1_codes). |
|
| `lang` | unicode | Two-letter language ID, i.e. [ISO code](https://en.wikipedia.org/wiki/List_of_ISO_639-1_codes). |
|
||||||
| `factories` <Tag variant="new">2</Tag> | dict | Factories that create pre-defined pipeline components, e.g. the tagger, parser or entity recognizer, keyed by their component name. |
|
| `factories` <Tag variant="new">2</Tag> | dict | Factories that create pre-defined pipeline components, e.g. the tagger, parser or entity recognizer, keyed by their component name. |
|
||||||
|
|
||||||
|
## Serialization fields {#serialization-fields}
|
||||||
|
|
||||||
|
During serialization, spaCy will export several data fields used to restore
|
||||||
|
different aspects of the object. If needed, you can exclude them from
|
||||||
|
serialization by passing in the string names via the `exclude` argument.
|
||||||
|
|
||||||
|
> #### Example
|
||||||
|
>
|
||||||
|
> ```python
|
||||||
|
> data = nlp.to_bytes(exclude=["tokenizer", "vocab"])
|
||||||
|
> nlp.from_disk("./model-data", exclude=["ner"])
|
||||||
|
> ```
|
||||||
|
|
||||||
|
| Name | Description |
|
||||||
|
| ----------- | -------------------------------------------------- |
|
||||||
|
| `vocab` | The shared [`Vocab`](/api/vocab). |
|
||||||
|
| `tokenizer` | Tokenization rules and exceptions. |
|
||||||
|
| `meta` | The meta data, available as `Language.meta`. |
|
||||||
|
| ... | String names of pipeline components, e.g. `"ner"`. |
|
||||||
|
|
|
@ -151,10 +151,9 @@ Serialize the current state to a binary string.
|
||||||
> store_bytes = stringstore.to_bytes()
|
> store_bytes = stringstore.to_bytes()
|
||||||
> ```
|
> ```
|
||||||
|
|
||||||
| Name | Type | Description |
|
| Name | Type | Description |
|
||||||
| ----------- | ----- | -------------------------------------------------- |
|
| ----------- | ----- | ------------------------------------------------ |
|
||||||
| `**exclude` | - | Named attributes to prevent from being serialized. |
|
| **RETURNS** | bytes | The serialized form of the `StringStore` object. |
|
||||||
| **RETURNS** | bytes | The serialized form of the `StringStore` object. |
|
|
||||||
|
|
||||||
## StringStore.from_bytes {#from_bytes tag="method"}
|
## StringStore.from_bytes {#from_bytes tag="method"}
|
||||||
|
|
||||||
|
@ -168,11 +167,10 @@ Load state from a binary string.
|
||||||
> new_store = StringStore().from_bytes(store_bytes)
|
> new_store = StringStore().from_bytes(store_bytes)
|
||||||
> ```
|
> ```
|
||||||
|
|
||||||
| Name | Type | Description |
|
| Name | Type | Description |
|
||||||
| ------------ | ------------- | ---------------------------------------------- |
|
| ------------ | ------------- | ------------------------- |
|
||||||
| `bytes_data` | bytes | The data to load from. |
|
| `bytes_data` | bytes | The data to load from. |
|
||||||
| `**exclude` | - | Named attributes to prevent from being loaded. |
|
| **RETURNS** | `StringStore` | The `StringStore` object. |
|
||||||
| **RETURNS** | `StringStore` | The `StringStore` object. |
|
|
||||||
|
|
||||||
## Utilities {#util}
|
## Utilities {#util}
|
||||||
|
|
||||||
|
|
|
@ -244,9 +244,10 @@ Serialize the pipe to disk.
|
||||||
> tagger.to_disk("/path/to/tagger")
|
> tagger.to_disk("/path/to/tagger")
|
||||||
> ```
|
> ```
|
||||||
|
|
||||||
| Name | Type | Description |
|
| Name | Type | Description |
|
||||||
| ------ | ---------------- | --------------------------------------------------------------------------------------------------------------------- |
|
| --------- | ---------------- | --------------------------------------------------------------------------------------------------------------------- |
|
||||||
| `path` | unicode / `Path` | A path to a directory, which will be created if it doesn't exist. Paths may be either strings or `Path`-like objects. |
|
| `path` | unicode / `Path` | A path to a directory, which will be created if it doesn't exist. Paths may be either strings or `Path`-like objects. |
|
||||||
|
| `exclude` | list | String names of [serialization fields](#serialization-fields) to exclude. |
|
||||||
|
|
||||||
## Tagger.from_disk {#from_disk tag="method"}
|
## Tagger.from_disk {#from_disk tag="method"}
|
||||||
|
|
||||||
|
@ -262,6 +263,7 @@ Load the pipe from disk. Modifies the object in place and returns it.
|
||||||
| Name | Type | Description |
|
| Name | Type | Description |
|
||||||
| ----------- | ---------------- | -------------------------------------------------------------------------- |
|
| ----------- | ---------------- | -------------------------------------------------------------------------- |
|
||||||
| `path` | unicode / `Path` | A path to a directory. Paths may be either strings or `Path`-like objects. |
|
| `path` | unicode / `Path` | A path to a directory. Paths may be either strings or `Path`-like objects. |
|
||||||
|
| `exclude` | list | String names of [serialization fields](#serialization-fields) to exclude. |
|
||||||
| **RETURNS** | `Tagger` | The modified `Tagger` object. |
|
| **RETURNS** | `Tagger` | The modified `Tagger` object. |
|
||||||
|
|
||||||
## Tagger.to_bytes {#to_bytes tag="method"}
|
## Tagger.to_bytes {#to_bytes tag="method"}
|
||||||
|
@ -275,10 +277,10 @@ Load the pipe from disk. Modifies the object in place and returns it.
|
||||||
|
|
||||||
Serialize the pipe to a bytestring.
|
Serialize the pipe to a bytestring.
|
||||||
|
|
||||||
| Name | Type | Description |
|
| Name | Type | Description |
|
||||||
| ----------- | ----- | -------------------------------------------------- |
|
| ----------- | ----- | ------------------------------------------------------------------------- |
|
||||||
| `**exclude` | - | Named attributes to prevent from being serialized. |
|
| `exclude` | list | String names of [serialization fields](#serialization-fields) to exclude. |
|
||||||
| **RETURNS** | bytes | The serialized form of the `Tagger` object. |
|
| **RETURNS** | bytes | The serialized form of the `Tagger` object. |
|
||||||
|
|
||||||
## Tagger.from_bytes {#from_bytes tag="method"}
|
## Tagger.from_bytes {#from_bytes tag="method"}
|
||||||
|
|
||||||
|
@ -292,11 +294,11 @@ Load the pipe from a bytestring. Modifies the object in place and returns it.
|
||||||
> tagger.from_bytes(tagger_bytes)
|
> tagger.from_bytes(tagger_bytes)
|
||||||
> ```
|
> ```
|
||||||
|
|
||||||
| Name | Type | Description |
|
| Name | Type | Description |
|
||||||
| ------------ | -------- | ---------------------------------------------- |
|
| ------------ | -------- | ------------------------------------------------------------------------- |
|
||||||
| `bytes_data` | bytes | The data to load from. |
|
| `bytes_data` | bytes | The data to load from. |
|
||||||
| `**exclude` | - | Named attributes to prevent from being loaded. |
|
| `exclude` | list | String names of [serialization fields](#serialization-fields) to exclude. |
|
||||||
| **RETURNS** | `Tagger` | The `Tagger` object. |
|
| **RETURNS** | `Tagger` | The `Tagger` object. |
|
||||||
|
|
||||||
## Tagger.labels {#labels tag="property"}
|
## Tagger.labels {#labels tag="property"}
|
||||||
|
|
||||||
|
@ -314,3 +316,22 @@ tags by default, e.g. `VERB`, `NOUN` and so on.
|
||||||
| Name | Type | Description |
|
| Name | Type | Description |
|
||||||
| ----------- | ----- | ---------------------------------- |
|
| ----------- | ----- | ---------------------------------- |
|
||||||
| **RETURNS** | tuple | The labels added to the component. |
|
| **RETURNS** | tuple | The labels added to the component. |
|
||||||
|
|
||||||
|
## Serialization fields {#serialization-fields}
|
||||||
|
|
||||||
|
During serialization, spaCy will export several data fields used to restore
|
||||||
|
different aspects of the object. If needed, you can exclude them from
|
||||||
|
serialization by passing in the string names via the `exclude` argument.
|
||||||
|
|
||||||
|
> #### Example
|
||||||
|
>
|
||||||
|
> ```python
|
||||||
|
> tagger.to_disk("/path", exclude=["vocab"])
|
||||||
|
> ```
|
||||||
|
|
||||||
|
| Name | Description |
|
||||||
|
| --------- | ------------------------------------------------------------------------------------------ |
|
||||||
|
| `vocab` | The shared [`Vocab`](/api/vocab). |
|
||||||
|
| `cfg` | The config file. You usually don't want to exclude this. |
|
||||||
|
| `model` | The binary model data. You usually don't want to exclude this. |
|
||||||
|
| `tag_map` | The [tag map](/usage/adding-languages#tag-map) mapping fine-grained to coarse-grained tags. |
|
||||||
|
|
|
@ -260,9 +260,10 @@ Serialize the pipe to disk.
|
||||||
> textcat.to_disk("/path/to/textcat")
|
> textcat.to_disk("/path/to/textcat")
|
||||||
> ```
|
> ```
|
||||||
|
|
||||||
| Name | Type | Description |
|
| Name | Type | Description |
|
||||||
| ------ | ---------------- | --------------------------------------------------------------------------------------------------------------------- |
|
| --------- | ---------------- | --------------------------------------------------------------------------------------------------------------------- |
|
||||||
| `path` | unicode / `Path` | A path to a directory, which will be created if it doesn't exist. Paths may be either strings or `Path`-like objects. |
|
| `path` | unicode / `Path` | A path to a directory, which will be created if it doesn't exist. Paths may be either strings or `Path`-like objects. |
|
||||||
|
| `exclude` | list | String names of [serialization fields](#serialization-fields) to exclude. |
|
||||||
|
|
||||||
## TextCategorizer.from_disk {#from_disk tag="method"}
|
## TextCategorizer.from_disk {#from_disk tag="method"}
|
||||||
|
|
||||||
|
@ -278,6 +279,7 @@ Load the pipe from disk. Modifies the object in place and returns it.
|
||||||
| Name | Type | Description |
|
| Name | Type | Description |
|
||||||
| ----------- | ----------------- | -------------------------------------------------------------------------- |
|
| ----------- | ----------------- | -------------------------------------------------------------------------- |
|
||||||
| `path` | unicode / `Path` | A path to a directory. Paths may be either strings or `Path`-like objects. |
|
| `path` | unicode / `Path` | A path to a directory. Paths may be either strings or `Path`-like objects. |
|
||||||
|
| `exclude` | list | String names of [serialization fields](#serialization-fields) to exclude. |
|
||||||
| **RETURNS** | `TextCategorizer` | The modified `TextCategorizer` object. |
|
| **RETURNS** | `TextCategorizer` | The modified `TextCategorizer` object. |
|
||||||
|
|
||||||
## TextCategorizer.to_bytes {#to_bytes tag="method"}
|
## TextCategorizer.to_bytes {#to_bytes tag="method"}
|
||||||
|
@ -291,10 +293,10 @@ Load the pipe from disk. Modifies the object in place and returns it.
|
||||||
|
|
||||||
Serialize the pipe to a bytestring.
|
Serialize the pipe to a bytestring.
|
||||||
|
|
||||||
| Name | Type | Description |
|
| Name | Type | Description |
|
||||||
| ----------- | ----- | ---------------------------------------------------- |
|
| ----------- | ----- | ------------------------------------------------------------------------- |
|
||||||
| `**exclude` | - | Named attributes to prevent from being serialized. |
|
| `exclude` | list | String names of [serialization fields](#serialization-fields) to exclude. |
|
||||||
| **RETURNS** | bytes | The serialized form of the `TextCategorizer` object. |
|
| **RETURNS** | bytes | The serialized form of the `TextCategorizer` object. |
|
||||||
|
|
||||||
## TextCategorizer.from_bytes {#from_bytes tag="method"}
|
## TextCategorizer.from_bytes {#from_bytes tag="method"}
|
||||||
|
|
||||||
|
@ -308,11 +310,11 @@ Load the pipe from a bytestring. Modifies the object in place and returns it.
|
||||||
> textcat.from_bytes(textcat_bytes)
|
> textcat.from_bytes(textcat_bytes)
|
||||||
> ```
|
> ```
|
||||||
|
|
||||||
| Name | Type | Description |
|
| Name | Type | Description |
|
||||||
| ------------ | ----------------- | ---------------------------------------------- |
|
| ------------ | ----------------- | ------------------------------------------------------------------------- |
|
||||||
| `bytes_data` | bytes | The data to load from. |
|
| `bytes_data` | bytes | The data to load from. |
|
||||||
| `**exclude` | - | Named attributes to prevent from being loaded. |
|
| `exclude` | list | String names of [serialization fields](#serialization-fields) to exclude. |
|
||||||
| **RETURNS** | `TextCategorizer` | The `TextCategorizer` object. |
|
| **RETURNS** | `TextCategorizer` | The `TextCategorizer` object. |
|
||||||
|
|
||||||
## TextCategorizer.labels {#labels tag="property"}
|
## TextCategorizer.labels {#labels tag="property"}
|
||||||
|
|
||||||
|
@ -328,3 +330,21 @@ The labels currently added to the component.
|
||||||
| Name | Type | Description |
|
| Name | Type | Description |
|
||||||
| ----------- | ----- | ---------------------------------- |
|
| ----------- | ----- | ---------------------------------- |
|
||||||
| **RETURNS** | tuple | The labels added to the component. |
|
| **RETURNS** | tuple | The labels added to the component. |
|
||||||
|
|
||||||
|
## Serialization fields {#serialization-fields}
|
||||||
|
|
||||||
|
During serialization, spaCy will export several data fields used to restore
|
||||||
|
different aspects of the object. If needed, you can exclude them from
|
||||||
|
serialization by passing in the string names via the `exclude` argument.
|
||||||
|
|
||||||
|
> #### Example
|
||||||
|
>
|
||||||
|
> ```python
|
||||||
|
> textcat.to_disk("/path", exclude=["vocab"])
|
||||||
|
> ```
|
||||||
|
|
||||||
|
| Name | Description |
|
||||||
|
| ------- | -------------------------------------------------------------- |
|
||||||
|
| `vocab` | The shared [`Vocab`](/api/vocab). |
|
||||||
|
| `cfg` | The config file. You usually don't want to exclude this. |
|
||||||
|
| `model` | The binary model data. You usually don't want to exclude this. |
|
||||||
|
|
|
@ -127,9 +127,10 @@ Serialize the tokenizer to disk.
|
||||||
> tokenizer.to_disk("/path/to/tokenizer")
|
> tokenizer.to_disk("/path/to/tokenizer")
|
||||||
> ```
|
> ```
|
||||||
|
|
||||||
| Name | Type | Description |
|
| Name | Type | Description |
|
||||||
| ------ | ---------------- | --------------------------------------------------------------------------------------------------------------------- |
|
| --------- | ---------------- | --------------------------------------------------------------------------------------------------------------------- |
|
||||||
| `path` | unicode / `Path` | A path to a directory, which will be created if it doesn't exist. Paths may be either strings or `Path`-like objects. |
|
| `path` | unicode / `Path` | A path to a directory, which will be created if it doesn't exist. Paths may be either strings or `Path`-like objects. |
|
||||||
|
| `exclude` | list | String names of [serialization fields](#serialization-fields) to exclude. |
|
||||||
|
|
||||||
## Tokenizer.from_disk {#from_disk tag="method"}
|
## Tokenizer.from_disk {#from_disk tag="method"}
|
||||||
|
|
||||||
|
@ -145,6 +146,7 @@ Load the tokenizer from disk. Modifies the object in place and returns it.
|
||||||
| Name | Type | Description |
|
| Name | Type | Description |
|
||||||
| ----------- | ---------------- | -------------------------------------------------------------------------- |
|
| ----------- | ---------------- | -------------------------------------------------------------------------- |
|
||||||
| `path` | unicode / `Path` | A path to a directory. Paths may be either strings or `Path`-like objects. |
|
| `path` | unicode / `Path` | A path to a directory. Paths may be either strings or `Path`-like objects. |
|
||||||
|
| `exclude` | list | String names of [serialization fields](#serialization-fields) to exclude. |
|
||||||
| **RETURNS** | `Tokenizer` | The modified `Tokenizer` object. |
|
| **RETURNS** | `Tokenizer` | The modified `Tokenizer` object. |
|
||||||
|
|
||||||
## Tokenizer.to_bytes {#to_bytes tag="method"}
|
## Tokenizer.to_bytes {#to_bytes tag="method"}
|
||||||
|
@ -158,10 +160,10 @@ Load the tokenizer from disk. Modifies the object in place and returns it.
|
||||||
|
|
||||||
Serialize the tokenizer to a bytestring.
|
Serialize the tokenizer to a bytestring.
|
||||||
|
|
||||||
| Name | Type | Description |
|
| Name | Type | Description |
|
||||||
| ----------- | ----- | -------------------------------------------------- |
|
| ----------- | ----- | ------------------------------------------------------------------------- |
|
||||||
| `**exclude` | - | Named attributes to prevent from being serialized. |
|
| `exclude` | list | String names of [serialization fields](#serialization-fields) to exclude. |
|
||||||
| **RETURNS** | bytes | The serialized form of the `Tokenizer` object. |
|
| **RETURNS** | bytes | The serialized form of the `Tokenizer` object. |
|
||||||
|
|
||||||
## Tokenizer.from_bytes {#from_bytes tag="method"}
|
## Tokenizer.from_bytes {#from_bytes tag="method"}
|
||||||
|
|
||||||
|
@ -176,11 +178,11 @@ it.
|
||||||
> tokenizer.from_bytes(tokenizer_bytes)
|
> tokenizer.from_bytes(tokenizer_bytes)
|
||||||
> ```
|
> ```
|
||||||
|
|
||||||
| Name | Type | Description |
|
| Name | Type | Description |
|
||||||
| ------------ | ----------- | ---------------------------------------------- |
|
| ------------ | ----------- | ------------------------------------------------------------------------- |
|
||||||
| `bytes_data` | bytes | The data to load from. |
|
| `bytes_data` | bytes | The data to load from. |
|
||||||
| `**exclude` | - | Named attributes to prevent from being loaded. |
|
| `exclude` | list | String names of [serialization fields](#serialization-fields) to exclude. |
|
||||||
| **RETURNS** | `Tokenizer` | The `Tokenizer` object. |
|
| **RETURNS** | `Tokenizer` | The `Tokenizer` object. |
|
||||||
|
|
||||||
## Attributes {#attributes}
|
## Attributes {#attributes}
|
||||||
|
|
||||||
|
@ -190,3 +192,25 @@ it.
|
||||||
| `prefix_search` | - | A function to find segment boundaries from the start of a string. Returns the length of the segment, or `None`. |
|
| `prefix_search` | - | A function to find segment boundaries from the start of a string. Returns the length of the segment, or `None`. |
|
||||||
| `suffix_search` | - | A function to find segment boundaries from the end of a string. Returns the length of the segment, or `None`. |
|
| `suffix_search` | - | A function to find segment boundaries from the end of a string. Returns the length of the segment, or `None`. |
|
||||||
| `infix_finditer` | - | A function to find internal segment separators, e.g. hyphens. Returns a (possibly empty) list of `re.MatchObject` objects. |
|
| `infix_finditer` | - | A function to find internal segment separators, e.g. hyphens. Returns a (possibly empty) list of `re.MatchObject` objects. |
|
||||||
|
|
||||||
|
## Serialization fields {#serialization-fields}
|
||||||
|
|
||||||
|
During serialization, spaCy will export several data fields used to restore
|
||||||
|
different aspects of the object. If needed, you can exclude them from
|
||||||
|
serialization by passing in the string names via the `exclude` argument.
|
||||||
|
|
||||||
|
> #### Example
|
||||||
|
>
|
||||||
|
> ```python
|
||||||
|
> data = tokenizer.to_bytes(exclude=["vocab", "exceptions"])
|
||||||
|
> tokenizer.from_disk("./data", exclude=["token_match"])
|
||||||
|
> ```
|
||||||
|
|
||||||
|
| Name | Description |
|
||||||
|
| ---------------- | --------------------------------- |
|
||||||
|
| `vocab` | The shared [`Vocab`](/api/vocab). |
|
||||||
|
| `prefix_search` | The prefix rules. |
|
||||||
|
| `suffix_search` | The suffix rules. |
|
||||||
|
| `infix_finditer` | The infix rules. |
|
||||||
|
| `token_match` | The token match expression. |
|
||||||
|
| `exceptions` | The tokenizer exception rules. |
|
||||||
|
|
|
@ -311,10 +311,9 @@ Save the current state to a directory.
|
||||||
>
|
>
|
||||||
> ```
|
> ```
|
||||||
|
|
||||||
| Name | Type | Description |
|
| Name | Type | Description |
|
||||||
| ----------- | ---------------- | --------------------------------------------------------------------------------------------------------------------- |
|
| ------ | ---------------- | --------------------------------------------------------------------------------------------------------------------- |
|
||||||
| `path` | unicode / `Path` | A path to a directory, which will be created if it doesn't exist. Paths may be either strings or `Path`-like objects. |
|
| `path` | unicode / `Path` | A path to a directory, which will be created if it doesn't exist. Paths may be either strings or `Path`-like objects. |
|
||||||
| `**exclude` | - | Named attributes to prevent from being saved. |
|
|
||||||
|
|
||||||
## Vectors.from_disk {#from_disk tag="method"}
|
## Vectors.from_disk {#from_disk tag="method"}
|
||||||
|
|
||||||
|
@ -342,10 +341,9 @@ Serialize the current state to a binary string.
|
||||||
> vectors_bytes = vectors.to_bytes()
|
> vectors_bytes = vectors.to_bytes()
|
||||||
> ```
|
> ```
|
||||||
|
|
||||||
| Name | Type | Description |
|
| Name | Type | Description |
|
||||||
| ----------- | ----- | -------------------------------------------------- |
|
| ----------- | ----- | -------------------------------------------- |
|
||||||
| `**exclude` | - | Named attributes to prevent from being serialized. |
|
| **RETURNS** | bytes | The serialized form of the `Vectors` object. |
|
||||||
| **RETURNS** | bytes | The serialized form of the `Vectors` object. |
|
|
||||||
|
|
||||||
## Vectors.from_bytes {#from_bytes tag="method"}
|
## Vectors.from_bytes {#from_bytes tag="method"}
|
||||||
|
|
||||||
|
@ -360,11 +358,10 @@ Load state from a binary string.
|
||||||
> new_vectors.from_bytes(vectors_bytes)
|
> new_vectors.from_bytes(vectors_bytes)
|
||||||
> ```
|
> ```
|
||||||
|
|
||||||
| Name | Type | Description |
|
| Name | Type | Description |
|
||||||
| ----------- | --------- | ---------------------------------------------- |
|
| ----------- | --------- | ---------------------- |
|
||||||
| `data` | bytes | The data to load from. |
|
| `data` | bytes | The data to load from. |
|
||||||
| `**exclude` | - | Named attributes to prevent from being loaded. |
|
| **RETURNS** | `Vectors` | The `Vectors` object. |
|
||||||
| **RETURNS** | `Vectors` | The `Vectors` object. |
|
|
||||||
|
|
||||||
## Attributes {#attributes}
|
## Attributes {#attributes}
|
||||||
|
|
||||||
|
|
|
@ -221,9 +221,10 @@ Save the current state to a directory.
|
||||||
> nlp.vocab.to_disk("/path/to/vocab")
|
> nlp.vocab.to_disk("/path/to/vocab")
|
||||||
> ```
|
> ```
|
||||||
|
|
||||||
| Name | Type | Description |
|
| Name | Type | Description |
|
||||||
| ------ | ---------------- | --------------------------------------------------------------------------------------------------------------------- |
|
| --------- | ---------------- | --------------------------------------------------------------------------------------------------------------------- |
|
||||||
| `path` | unicode / `Path` | A path to a directory, which will be created if it doesn't exist. Paths may be either strings or `Path`-like objects. |
|
| `path` | unicode / `Path` | A path to a directory, which will be created if it doesn't exist. Paths may be either strings or `Path`-like objects. |
|
||||||
|
| `exclude` | list | String names of [serialization fields](#serialization-fields) to exclude. |
|
||||||
|
|
||||||
## Vocab.from_disk {#from_disk tag="method" new="2"}
|
## Vocab.from_disk {#from_disk tag="method" new="2"}
|
||||||
|
|
||||||
|
@ -239,6 +240,7 @@ Loads state from a directory. Modifies the object in place and returns it.
|
||||||
| Name | Type | Description |
|
| Name | Type | Description |
|
||||||
| ----------- | ---------------- | -------------------------------------------------------------------------- |
|
| ----------- | ---------------- | -------------------------------------------------------------------------- |
|
||||||
| `path` | unicode / `Path` | A path to a directory. Paths may be either strings or `Path`-like objects. |
|
| `path` | unicode / `Path` | A path to a directory. Paths may be either strings or `Path`-like objects. |
|
||||||
|
| `exclude` | list | String names of [serialization fields](#serialization-fields) to exclude. |
|
||||||
| **RETURNS** | `Vocab` | The modified `Vocab` object. |
|
| **RETURNS** | `Vocab` | The modified `Vocab` object. |
|
||||||
|
|
||||||
## Vocab.to_bytes {#to_bytes tag="method"}
|
## Vocab.to_bytes {#to_bytes tag="method"}
|
||||||
|
@ -251,10 +253,10 @@ Serialize the current state to a binary string.
|
||||||
> vocab_bytes = nlp.vocab.to_bytes()
|
> vocab_bytes = nlp.vocab.to_bytes()
|
||||||
> ```
|
> ```
|
||||||
|
|
||||||
| Name | Type | Description |
|
| Name | Type | Description |
|
||||||
| ----------- | ----- | -------------------------------------------------- |
|
| ----------- | ----- | ------------------------------------------------------------------------- |
|
||||||
| `**exclude` | - | Named attributes to prevent from being serialized. |
|
| `exclude` | list | String names of [serialization fields](#serialization-fields) to exclude. |
|
||||||
| **RETURNS** | bytes | The serialized form of the `Vocab` object. |
|
| **RETURNS** | bytes | The serialized form of the `Vocab` object. |
|
||||||
|
|
||||||
## Vocab.from_bytes {#from_bytes tag="method"}
|
## Vocab.from_bytes {#from_bytes tag="method"}
|
||||||
|
|
||||||
|
@ -269,11 +271,11 @@ Load state from a binary string.
|
||||||
> vocab.from_bytes(vocab_bytes)
|
> vocab.from_bytes(vocab_bytes)
|
||||||
> ```
|
> ```
|
||||||
|
|
||||||
| Name | Type | Description |
|
| Name | Type | Description |
|
||||||
| ------------ | ------- | ---------------------------------------------- |
|
| ------------ | ------- | ------------------------------------------------------------------------- |
|
||||||
| `bytes_data` | bytes | The data to load from. |
|
| `bytes_data` | bytes | The data to load from. |
|
||||||
| `**exclude` | - | Named attributes to prevent from being loaded. |
|
| `exclude` | list | String names of [serialization fields](#serialization-fields) to exclude. |
|
||||||
| **RETURNS** | `Vocab` | The `Vocab` object. |
|
| **RETURNS** | `Vocab` | The `Vocab` object. |
|
||||||
|
|
||||||
## Attributes {#attributes}
|
## Attributes {#attributes}
|
||||||
|
|
||||||
|
@ -291,3 +293,22 @@ Load state from a binary string.
|
||||||
| `strings` | `StringStore` | A table managing the string-to-int mapping. |
|
| `strings` | `StringStore` | A table managing the string-to-int mapping. |
|
||||||
| `vectors` <Tag variant="new">2</Tag> | `Vectors` | A table associating word IDs to word vectors. |
|
| `vectors` <Tag variant="new">2</Tag> | `Vectors` | A table associating word IDs to word vectors. |
|
||||||
| `vectors_length` | int | Number of dimensions for each word vector. |
|
| `vectors_length` | int | Number of dimensions for each word vector. |
|
||||||
|
|
||||||
|
## Serialization fields {#serialization-fields}
|
||||||
|
|
||||||
|
During serialization, spaCy will export several data fields used to restore
|
||||||
|
different aspects of the object. If needed, you can exclude them from
|
||||||
|
serialization by passing in the string names via the `exclude` argument.
|
||||||
|
|
||||||
|
> #### Example
|
||||||
|
>
|
||||||
|
> ```python
|
||||||
|
> data = vocab.to_bytes(exclude=["strings", "vectors"])
|
||||||
|
> vocab.from_disk("./vocab", exclude=["strings"])
|
||||||
|
> ```
|
||||||
|
|
||||||
|
| Name | Description |
|
||||||
|
| --------- | ----------------------------------------------------- |
|
||||||
|
| `strings` | The strings in the [`StringStore`](/api/stringstore). |
|
||||||
|
| `lexemes` | The lexeme data. |
|
||||||
|
| `vectors` | The word vectors, if available. |
|
||||||
|
|
Loading…
Reference in New Issue
Block a user