From 54885b5e8812b0e400934d06ace8cede8657fea6 Mon Sep 17 00:00:00 2001 From: ines Date: Wed, 24 May 2017 19:24:40 +0200 Subject: [PATCH] Add serialization 101 --- .../docs/usage/_spacy-101/_serialization.jade | 35 +++++++++++++++++++ website/docs/usage/saving-loading.jade | 10 ++++++ website/docs/usage/spacy-101.jade | 4 +++ 3 files changed, 49 insertions(+) create mode 100644 website/docs/usage/_spacy-101/_serialization.jade diff --git a/website/docs/usage/_spacy-101/_serialization.jade b/website/docs/usage/_spacy-101/_serialization.jade new file mode 100644 index 000000000..b6a889014 --- /dev/null +++ b/website/docs/usage/_spacy-101/_serialization.jade @@ -0,0 +1,35 @@ +//- 💫 DOCS > USAGE > SPACY 101 > SERIALIZATION + +p + | If you've been modifying the pipeline, vocabulary, vectors and entities, or made + | updates to the model, you'll eventually want + | to #[strong save your progress] – for example, everything that's in your #[code nlp] + | object. This means you'll have to translate its contents and structure + | into a format that can be saved, like a file or a byte string. This + | process is called serialization. spaCy comes with + | #[strong built-in serialization methods] and supports the + | #[+a("http://www.diveintopython3.net/serializing.html#dump") Pickle protocol]. + ++aside("What's pickle?") + | Pickle is Python's built-in object persistence system. It lets you + | transfer arbitrary Python objects between processes. This is usually used + | to save an object to and load it from disk, but it's also used for distributed + | computing, e.g. with + | #[+a("https://spark.apache.org/docs/0.9.0/python-programming-guide.html") PySpark] + | or #[+a("http://dask.pydata.org/en/latest/") Dask]. When you unpickle an + | object, you're agreeing to execute whatever code it contains. It's like + | calling #[code eval()] on a string – so don't unpickle objects from + | untrusted sources. + +p + | All container classes and pipeline components, i.e.
+ for cls in ["Doc", "Language", "Tokenizer", "Tagger", "DependencyParser", "EntityRecognizer", "Vocab", "StringStore"] + | #[+api(cls.toLowerCase()) #[code=cls]], + | have the following methods available: + ++table(["Method", "Returns", "Example"]) + - style = [1, 0, 1] + +annotation-row(["to_bytes", "bytes", "nlp.to_bytes()"], style) + +annotation-row(["from_bytes", "object", "nlp.from_bytes(bytes)"], style) + +annotation-row(["to_disk", "-", "nlp.to_disk('/path')"], style) + +annotation-row(["from_disk", "object", "nlp.from_disk('/path')"], style) diff --git a/website/docs/usage/saving-loading.jade b/website/docs/usage/saving-loading.jade index 63c951d40..e580bca25 100644 --- a/website/docs/usage/saving-loading.jade +++ b/website/docs/usage/saving-loading.jade @@ -1,5 +1,15 @@ include ../../_includes/_mixins ++h(2, "101") Serialization 101 + +include _spacy-101/_serialization + ++infobox("Important note") + | In spaCy v2.0, the API for saving and loading has changed to only use the + | four methods listed above consistently across objects and classes. For an + | overview of the changes, see #[+a("/docs/usage/v2#incompat") this table] + | and the notes on #[+a("/docs/usage/v2#migrating-saving-loading") migrating]. + +h(2, "models") Saving models diff --git a/website/docs/usage/spacy-101.jade b/website/docs/usage/spacy-101.jade index 4fb758bb4..958200637 100644 --- a/website/docs/usage/spacy-101.jade +++ b/website/docs/usage/spacy-101.jade @@ -105,6 +105,10 @@ include _spacy-101/_word-vectors +h(2, "pipelines") Pipelines ++h(2, "serialization") Serialization + +include _spacy-101/_serialization + +h(2, "architecture") Architecture +image