Merge branch 'develop' of https://github.com/explosion/spaCy into develop

This commit is contained in:
Matthew Honnibal 2017-11-07 02:00:06 +01:00
commit 9a88e66103
32 changed files with 491 additions and 189 deletions

View File

@ -1,18 +1,24 @@
import plac
import collections
import random
"""
This example shows how to use an LSTM sentiment classification model trained
using Keras in spaCy. spaCy splits the document into sentences, and each
sentence is classified using the LSTM. The scores for the sentences are then
aggregated to give the document score. This kind of hierarchical model is
quite difficult in "pure" Keras or Tensorflow, but it's very effective. The
Keras example on this dataset performs quite poorly, because it cuts off the
documents so that they're a fixed size. This hurts review accuracy a lot,
because people often summarise their rating in the final sentence.
Prerequisites:
spacy download en_vectors_web_lg
pip install keras==2.0.9
Compatible with: spaCy v2.0.0+
"""
import plac
import random
import pathlib
import cytoolz
import numpy
from keras.models import Sequential, model_from_json
from keras.layers import LSTM, Dense, Embedding, Dropout, Bidirectional
from keras.layers import LSTM, Dense, Embedding, Bidirectional
from keras.layers import TimeDistributed
from keras.optimizers import Adam
from spacy.compat import pickle
import thinc.extra.datasets
from spacy.compat import pickle
import spacy
@ -84,8 +90,8 @@ def get_features(docs, max_length):
def train(train_texts, train_labels, dev_texts, dev_labels,
lstm_shape, lstm_settings, lstm_optimizer, batch_size=100, nb_epoch=5,
by_sentence=True):
lstm_shape, lstm_settings, lstm_optimizer, batch_size=100,
nb_epoch=5, by_sentence=True):
print("Loading spaCy")
nlp = spacy.load('en_vectors_web_lg')
nlp.add_pipe(nlp.create_pipe('sentencizer'))

View File

@ -6,7 +6,7 @@ money and currency values (entities labelled as MONEY) and then check the
dependency tree to find the noun phrase they are referring to for example:
$9.4 million --> Net income.
Last updated for: spaCy 2.0.0a18
Compatible with: spaCy v2.0.0+
"""
from __future__ import unicode_literals, print_function

View File

@ -16,7 +16,7 @@ show you how computers understand [language]
I'm assuming that we can use the token.head to build these groups."
Last updated for: spaCy 2.0.0a18
Compatible with: spaCy v2.0.0+
"""
from __future__ import unicode_literals, print_function

View File

@ -33,6 +33,8 @@ formatted in jsonl as a sequence of entries like this:
{"text":"Annapolis"}
{"text":"Appalachia"}
{"text":"Argentina"}
Compatible with: spaCy v2.0.0+
"""
from __future__ import print_function, unicode_literals, division

View File

@ -7,8 +7,7 @@ they're called on is passed in as the first argument.
* Custom pipeline components: https://alpha.spacy.io//usage/processing-pipelines#custom-components
Developed for: spaCy 2.0.0a17
Last updated for: spaCy 2.0.0a18
Compatible with: spaCy v2.0.0+
"""
from __future__ import unicode_literals, print_function

View File

@ -8,8 +8,7 @@ coordinates. Can be extended with more details from the API.
* REST Countries API: https://restcountries.eu (Mozilla Public License MPL 2.0)
* Custom pipeline components: https://alpha.spacy.io//usage/processing-pipelines#custom-components
Developed for: spaCy 2.0.0a17
Last updated for: spaCy 2.0.0a18
Compatible with: spaCy v2.0.0+
"""
from __future__ import unicode_literals, print_function

View File

@ -8,8 +8,7 @@ respectively.
* Custom pipeline components: https://alpha.spacy.io//usage/processing-pipelines#custom-components
Developed for: spaCy 2.0.0a17
Last updated for: spaCy 2.0.0a18
Compatible with: spaCy v2.0.0+
"""
from __future__ import unicode_literals, print_function

View File

@ -6,7 +6,7 @@ each "sentence" on a newline, and spaces between tokens. Data is loaded from
the IMDB movie reviews dataset and will be loaded automatically via Thinc's
built-in dataset loader.
Last updated for: spaCy 2.0.0a18
Compatible with: spaCy v2.0.0+
"""
from __future__ import print_function, unicode_literals
from toolz import partition_all

View File

@ -15,8 +15,7 @@ following types of relations: ROOT, PLACE, QUALITY, ATTRIBUTE, TIME, LOCATION.
('hotel', 'PLACE', 'show') --> show PLACE hotel
('berlin', 'LOCATION', 'hotel') --> hotel with LOCATION berlin
Developed for: spaCy 2.0.0a18
Last updated for: spaCy 2.0.0a19
Compatible with: spaCy v2.0.0+
"""
from __future__ import unicode_literals, print_function

View File

@ -7,8 +7,7 @@ For more details, see the documentation:
* Training: https://alpha.spacy.io/usage/training
* NER: https://alpha.spacy.io/usage/linguistic-features#named-entities
Developed for: spaCy 2.0.0a18
Last updated for: spaCy 2.0.0a19
Compatible with: spaCy v2.0.0+
"""
from __future__ import unicode_literals, print_function

View File

@ -23,8 +23,7 @@ For more details, see the documentation:
* Training: https://alpha.spacy.io/usage/training
* NER: https://alpha.spacy.io/usage/linguistic-features#named-entities
Developed for: spaCy 2.0.0a18
Last updated for: spaCy 2.0.0a19
Compatible with: spaCy v2.0.0+
"""
from __future__ import unicode_literals, print_function

View File

@ -5,8 +5,7 @@ model or a blank model. For more details, see the documentation:
* Training: https://alpha.spacy.io/usage/training
* Dependency Parse: https://alpha.spacy.io/usage/linguistic-features#dependency-parse
Developed for: spaCy 2.0.0a18
Last updated for: spaCy 2.0.0a18
Compatible with: spaCy v2.0.0+
"""
from __future__ import unicode_literals, print_function

View File

@ -8,8 +8,7 @@ the documentation:
* Training: https://alpha.spacy.io/usage/training
* POS Tagging: https://alpha.spacy.io/usage/linguistic-features#pos-tagging
Developed for: spaCy 2.0.0a18
Last updated for: spaCy 2.0.0a19
Compatible with: spaCy v2.0.0+
"""
from __future__ import unicode_literals, print_function

View File

@ -8,8 +8,7 @@ see the documentation:
* Training: https://alpha.spacy.io/usage/training
* Text classification: https://alpha.spacy.io/usage/text-classification
Developed for: spaCy 2.0.0a18
Last updated for: spaCy 2.0.0a19
Compatible with: spaCy v2.0.0+
"""
from __future__ import unicode_literals, print_function
import plac
@ -18,8 +17,7 @@ from pathlib import Path
import thinc.extra.datasets
import spacy
from spacy.gold import minibatch
from spacy.util import compounding
from spacy.util import minibatch, compounding
@plac.annotations(

View File

@ -2,6 +2,7 @@
# coding: utf8
"""Load vectors for a language trained using fastText
https://github.com/facebookresearch/fastText/blob/master/pretrained-vectors.md
Compatible with: spaCy v2.0.0+
"""
from __future__ import unicode_literals
import plac

View File

@ -1,19 +1,18 @@
# coding: utf8
from __future__ import unicode_literals
from ...language import Language
def test_simple_train():
nlp = Language()
nlp.add_pipe(nlp.create_pipe('textcat'))
nlp.get_pipe('textcat').add_label('is_good')
nlp.get_pipe('textcat').add_label('answer')
nlp.begin_training()
for i in range(5):
for text, answer in [('aaaa', 1.), ('bbbb', 0), ('aa', 1.),
('bbbbbbbbb', 0.), ('aaaaaa', 1)]:
nlp.update([text], [{'cats': {'answer': answer}}])
doc = nlp(u'aaa')
assert 'is_good' in doc.cats
assert doc.cats['is_good'] >= 0.5
assert 'answer' in doc.cats
assert doc.cats['answer'] >= 0.5

View File

@ -392,7 +392,7 @@ def minibatch(items, size=8):
so that batch-size can vary on each step.
"""
if isinstance(size, int):
size_ = itertools.repeat(8)
size_ = itertools.repeat(size)
else:
size_ = size
items = iter(items)

View File

@ -127,20 +127,22 @@ mixin help(tooltip, icon_size)
//- Aside wrapper
label - [string] aside label
mixin aside-wrapper(label)
mixin aside-wrapper(label, emoji)
aside.c-aside
.c-aside__content(role="complementary")&attributes(attributes)
if label
h4.u-text-label.u-text-label--dark=label
h4.u-text-label.u-text-label--dark
if emoji
span.o-emoji=emoji
| #{label}
block
//- Aside for text
label - [string] aside title (optional)
mixin aside(label)
+aside-wrapper(label)
mixin aside(label, emoji)
+aside-wrapper(label, emoji)
.c-aside__text.u-text-small
block
@ -703,6 +705,6 @@ mixin landing-logos(title, logos)
mixin under-construction()
+infobox("Under construction", "🚧")
| This section is still being written and will be updated for the v2.0
| release. Is there anything that you think should definitely be mentioned or
| explained here? Any examples you'd like to see? #[strong Let us know]
| on the #[+a(gh("spacy") + "/issues/1105") v2.0 alpha thread] on GitHub!
| release. Is there anything that you think should definitely be mentioned
| or explained here? Any examples you'd like to see?
| #[strong Let us know] on the #[+a(gh("spacy") + "/issues") issue tracker]!

View File

@ -320,3 +320,137 @@ p
| #[code title] is rendered as coloured headline. #[code exits]
| performs system exit after printing, using the value of the
| argument as the exit code, e.g. #[code exits=1].
+h(3, "util.minibatch") util.minibatch
+tag function
+tag-new(2)
p
| Iterate over batches of items. #[code size] may be an iterator, so that
| batch-size can vary on each step.
+aside-code("Example").
batches = minibatch(train_data)
for batch in batches:
texts, annotations = zip(*batch)
nlp.update(texts, annotations)
+table(["Name", "Type", "Description"])
+row
+cell #[code items]
+cell iterable
+cell The items to batch up.
+row
+cell #[code size]
+cell int / iterable
+cell
| The batch size(s). Use
| #[+api("top-level#util.compounding") #[code util.compounding]] or
| #[+api("top-level#util.decaying") #[code util.decaying]] or
| for an infinite series of compounding or decaying values.
+row("foot")
+cell yields
+cell list
+cell The batches.
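p
| A minimal sketch of passing a compounding batch size to #[code minibatch]
| (assuming #[code train_data] is a list of #[code (text, annotations)]
| pairs and #[code nlp] is an already loaded pipeline):
+aside-code("Example: compounding batch size").
from spacy.util import minibatch, compounding
batch_sizes = compounding(1., 32., 1.001)
for batch in minibatch(train_data, size=batch_sizes):
    texts, annotations = zip(*batch)
    nlp.update(texts, annotations)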
+h(3, "util.compounding") util.compounding
+tag function
+tag-new(2)
p
| Yield an infinite series of compounding values. Each time the generator
| is called, a value is produced by multiplying the previous value by the
| compound rate.
+aside-code("Example").
sizes = compounding(1., 10., 1.5)
assert next(sizes) == 1.
assert next(sizes) == 1. * 1.5
assert next(sizes) == 1.5 * 1.5
+table(["Name", "Type", "Description"])
+row
+cell #[code start]
+cell int / float
+cell The first value.
+row
+cell #[code stop]
+cell int / float
+cell The maximum value.
+row
+cell #[code compound]
+cell int / float
+cell The compounding factor.
+row("foot")
+cell yields
+cell int
+cell Compounding values.
+h(3, "util.decaying") util.decaying
+tag function
+tag-new(2)
p
| Yield an infinite series of linearly decaying values.
+aside-code("Example").
sizes = decaying(1., 10., 0.001)
assert next(sizes) == 1.
assert next(sizes) == 1. - 0.001
assert next(sizes) == 0.999 - 0.001
+table(["Name", "Type", "Description"])
+row
+cell #[code start]
+cell int / float
+cell The first value.
+row
+cell #[code end]
+cell int / float
+cell The maximum value.
+row
+cell #[code decay]
+cell int / float
+cell The decaying factor.
+row("foot")
+cell yields
+cell int
+cell The decaying values.
+h(3, "util.itershuffle") util.itershuffle
+tag function
+tag-new(2)
p
| Shuffle an iterator. This works by holding #[code bufsize] items back and
| yielding them sometime later. Obviously, this is not unbiased but
| should be good enough for batching. Larger bufsize means less bias.
+aside-code("Example").
values = range(1000)
shuffled = itershuffle(values)
+table(["Name", "Type", "Description"])
+row
+cell #[code iterable]
+cell iterable
+cell Iterator to shuffle.
+row
+cell #[code bufsize]
+cell int
+cell Items to hold back.
+row("foot")
+cell yields
+cell iterable
+cell The shuffled iterator.

View File

@ -157,12 +157,19 @@ p Update the models in the pipeline.
+row
+cell #[code docs]
+cell iterable
+cell A batch of #[code Doc] objects.
+cell
| A batch of #[code Doc] objects or unicode. If unicode, a
| #[code Doc] object will be created from the text.
+row
+cell #[code golds]
+cell iterable
+cell A batch of #[code GoldParse] objects.
+cell
| A batch of #[code GoldParse] objects or dictionaries.
| Dictionaries will be used to create
| #[+api("goldparse") #[code GoldParse]] objects. For the available
| keys and their usage, see
| #[+api("goldparse#init") #[code GoldParse.__init__]].
+row
+cell #[code drop]

View File

@ -148,8 +148,8 @@
"Basics": "basics",
"NER": "ner",
"Tagger & Parser": "tagger-parser",
"Similarity": "similarity",
"Text Classification": "textcat",
"Tips and Advice": "tips",
"Saving & Loading": "saving-loading"
}
},

View File

@ -88,8 +88,8 @@ p
| and add it to the #[code Language] instance returned by the
| model's #[code load()] method. For examples of this, check out the
| implementations of spaCy's
| #[+api("util#load_model_from_init_py") #[code load_model_from_init_py()]]
| and #[+api("util#load_model_from_path") #[code load_model_from_path()]]
| #[+api("util#load_model_from_init_py") #[code load_model_from_init_py]]
| and #[+api("util#load_model_from_path") #[code load_model_from_path]]
| utility functions.
+code-wrapper

View File

@ -172,15 +172,23 @@ p
+row
+cell #[code get_data]
+cell A function converting the training data to spaCy's JSON format.
+cell
| An optional function converting the training data to spaCy's
| JSON format.
+row
+cell #[code doc]
+cell #[+api("doc") #[code Doc]] objects.
+cell
| #[+api("doc") #[code Doc]] objects. The #[code update] method
| takes a sequence of them, so you can batch up your training
| examples.
+row
+cell #[code gold]
+cell #[+api("goldparse") #[code GoldParse]] objects.
+cell
| #[+api("goldparse") #[code GoldParse]] objects. The #[code update]
| method takes a sequence of them, so you can batch up your
| training examples.
+row
+cell #[code drop]
@ -197,3 +205,53 @@ p
| a model will be saved out to the directory. After training, you can
| use the #[+api("cli#package") #[code package]] command to generate an
| installable Python package from your model.
+code(false, "bash").
spacy convert /tmp/train.conllu /tmp/data
spacy train en /tmp/model /tmp/data/train.json -n 5
+h(3, "training-simple-style") Simple training style
+tag-new(2)
p
| Instead of sequences of #[code Doc] and #[code GoldParse] objects,
| you can also use the "simple training style" and pass
| #[strong raw texts] and #[strong dictionaries of annotations]
| to #[+api("language#update") #[code nlp.update]].
| The dictionaries can have the keys #[code entities], #[code heads],
| #[code deps], #[code tags] and #[code cats]. This is generally
| recommended, as it removes one layer of abstraction, and avoids
| unnecessary imports. It also makes it easier to structure and load
| your training data.
+aside-code("Example Annotations").
{
'entities': [(0, 4, 'ORG')],
'heads': [1, 1, 1, 5, 5, 2, 7, 5],
'deps': ['nsubj', 'ROOT', 'prt', 'quantmod', 'compound', 'pobj', 'det', 'npadvmod'],
'tags': ['PROPN', 'VERB', 'ADP', 'SYM', 'NUM', 'NUM', 'DET', 'NOUN'],
'cats': {'BUSINESS': 1.0}
}
+code("Simple training loop").
TRAIN_DATA = [
("Uber blew through $1 million a week", {'entities': [(0, 4, 'ORG')]}),
("Google rebrands its business apps", {'entities': [(0, 6, "ORG")]})]
nlp = spacy.blank('en')
optimizer = nlp.begin_training()
for i in range(20):
random.shuffle(TRAIN_DATA)
for text, annotations in TRAIN_DATA:
nlp.update([text], [annotations], sgd=optimizer)
nlp.to_disk('/model')
p
| The above training loop leaves out a few details that can really
| improve accuracy, but the principle really is #[em that] simple. Once
| you've got your pipeline together and you want to tune the accuracy,
| you usually want to process your training examples in batches, and
| experiment with #[+api("top-level#util.minibatch") #[code minibatch]]
| sizes and dropout rates, set via the #[code drop] keyword argument. See
| the #[+api("language") #[code Language]] and #[+api("pipe") #[code Pipe]]
| API docs for available options.
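p
| As a rough sketch (reusing #[code TRAIN_DATA] from the simple loop above,
| with purely illustrative batch size and dropout values), batching and
| dropout can be wired in like this:
+code("Batched training loop (sketch)").
import random
import spacy
from spacy.util import minibatch, compounding
nlp = spacy.blank('en')
optimizer = nlp.begin_training()
for i in range(20):
    random.shuffle(TRAIN_DATA)
    # batch size grows from 1 towards 16 as training progresses
    batches = minibatch(TRAIN_DATA, size=compounding(1., 16., 1.001))
    for batch in batches:
        texts, annotations = zip(*batch)
        nlp.update(texts, annotations, sgd=optimizer, drop=0.2)
nlp.to_disk('/model')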

View File

@ -2,12 +2,9 @@
p
| All #[+a("/models") spaCy models] support online learning, so
| you can update a pre-trained model with new examples. To update the
| model, you first need to create an instance of
| #[+api("goldparse") #[code GoldParse]], with the entity labels
| you want to learn. You'll usually need to provide many examples to
| meaningfully improve the system — a few hundred is a good start, although
| more is better.
| you can update a pre-trained model with new examples. You'll usually
| need to provide many #[strong examples] to meaningfully improve the
| system — a few hundred is a good start, although more is better.
p
| You should avoid iterating over the same few examples multiple times, or
@ -21,7 +18,7 @@ p
| the model of other examples by augmenting your annotations with sentences
| annotated with entities automatically recognised by the original model.
| Ultimately, this is an empirical process: you'll need to
| #[strong experiment on your own data] to find a solution that works best
| #[strong experiment on your data] to find a solution that works best
| for you.
+h(3, "example-train-ner") Updating the Named Entity Recognizer
@ -39,12 +36,6 @@ p
+h(4) Step by step guide
+list("numbers")
+item
| #[strong Reformat the training data] to match spaCy's
| #[+a("/api/annotation#json-input") JSON format]. The built-in
| #[+api("goldparse#biluo_tags_from_offsets") #[code biluo_tags_from_offsets]]
| function can help you with this.
+item
| #[strong Load the model] you want to start with, or create an
| #[strong empty model] using
@ -56,17 +47,13 @@ p
| This way, you'll only be training the entity recognizer.
+item
| #[strong Shuffle and loop over] the examples and create a
| #[code Doc] and #[code GoldParse] object for each example.
+item
| For each example, #[strong update the model]
| by calling #[+api("language#update") #[code nlp.update]], which steps
| #[strong Shuffle and loop over] the examples. For each example,
| #[strong update the model] by calling
| #[+api("language#update") #[code nlp.update]], which steps
| through the words of the input. At each word, it makes a
| #[strong prediction]. It then consults the annotations provided on the
| #[code GoldParse] instance, to see whether it was
| right. If it was wrong, it adjusts its weights so that the correct
| action will score higher next time.
| #[strong prediction]. It then consults the annotations to see whether
| it was right. If it was wrong, it adjusts its weights so that the
| correct action will score higher next time.
+item
| #[strong Save] the trained model using
@ -90,13 +77,16 @@ p
+github("spacy", "examples/training/train_new_entity_type.py", 500)
+aside("Important note", "⚠️")
| If you're using an existing model, make sure to mix in examples of
| #[strong other entity types] that spaCy correctly recognized before.
| Otherwise, your model might learn the new type, but "forget" what it
| previously knew. This is also referred to as the
| #[+a("https://explosion.ai/blog/pseudo-rehearsal-catastrophic-forgetting", true) "catastrophic forgetting" problem].
+h(4) Step by step guide
+list("numbers")
+item
| Create #[code Doc] and #[code GoldParse] objects for
| #[strong each example in your training data].
+item
| #[strong Load the model] you want to start with, or create an
| #[strong empty model] using
@ -117,10 +107,9 @@ p
| #[strong Loop over] the examples and call
| #[+api("language#update") #[code nlp.update]], which steps through
| the words of the input. At each word, it makes a
| #[strong prediction]. It then consults the annotations provided on the
| #[code GoldParse] instance, to see whether it was right. If it was
| wrong, it adjusts its weights so that the correct action will score
| higher next time.
| #[strong prediction]. It then consults the annotations, to see
| whether it was right. If it was wrong, it adjusts its weights so that
| the correct action will score higher next time.
+item
| #[strong Save] the trained model using

View File

@ -41,7 +41,7 @@ p
"author": "You",
"email": "you@example.com",
"license": "CC BY-SA 3.0",
"pipeline": ["token_vectors", "tagger"]
"pipeline": ["tagger", "parser", "ner"]
}
+code(false, "bash").
@ -94,26 +94,13 @@ p
| The #[code load()] method that comes with our model package
| templates will take care of putting all this together and returning a
| #[code Language] object with the loaded pipeline and data. If your model
| requires custom pipeline components, you should
| #[strong ship them with your model] and register their
| #[+a("/usage/processing-pipelines#creating-factory") factories]
| via #[+api("spacy#set_factory") #[code set_factory()]].
+aside-code("Factory example").
def my_factory(vocab):
# load some state
def my_component(doc):
# process the doc
return doc
return my_component
+code.
spacy.set_factory('custom_component', custom_component_factory)
+infobox("Custom models with pipeline components")
| For more details and an example of how to package a sentiment model
| with a custom pipeline component, see the usage guide on
| #[+a("/usage/processing-pipelines#example2") language processing pipelines].
| requires custom #[+a("/usage/processing-pipelines") pipeline components]
| or a custom language class, you can also
| #[strong ship the code with your model]. For examples of this, check out
| the implementations of spaCy's
| #[+api("util#load_model_from_init_py") #[code load_model_from_init_py]]
| and #[+api("util#load_model_from_path") #[code load_model_from_path]]
| utility functions.
+h(3, "models-building") Building the model package
@ -155,8 +142,7 @@ p
| #[+api("language#from_disk") #[code from_disk]] instead.
+code.
from spacy.lang.en import English
nlp = English().from_disk('/path/to/data')
nlp = spacy.blank('en').from_disk('/path/to/data')
+infobox("Important note: Loading data in v2.x")
.o-block
@ -168,7 +154,7 @@ p
| spaCy v2.0 solves this with a clear distinction between setting up
| the instance and loading the data.
+code-new nlp = English().from_disk('/path/to/data')
+code-new nlp = spacy.blank('en').from_disk('/path/to/data')
+code-old nlp = spacy.load('en', path='/path/to/data')
+h(3, "example-training-spacy") Example: How we're training and packaging models for spaCy

View File

@ -1,3 +0,0 @@
//- 💫 DOCS > USAGE > TRAINING > SIMILARITY
+under-construction

View File

@ -30,19 +30,13 @@ p
| not necessary but it doesn't hurt either, just to be safe.
+item
| #[strong Shuffle and loop over] the examples and create a
| #[code Doc] and #[code GoldParse] object for each example. Make sure
| to pass in the #[code heads] and #[code deps] when you create the
| #[code GoldParse].
+item
| For each example, #[strong update the model]
| by calling #[+api("language#update") #[code nlp.update]], which steps
| through the words of the input. At each word, it makes a
| #[strong prediction]. It then consults the annotations provided on the
| #[code GoldParse] instance, to see whether it was
| right. If it was wrong, it adjusts its weights so that the correct
| action will score higher next time.
| #[strong Shuffle and loop over] the examples. For each example,
| #[strong update the model] by calling
| #[+api("language#update") #[code nlp.update]], which steps through
| the words of the input. At each word, it makes a
| #[strong prediction]. It then consults the annotations to see
| whether it was right. If it was wrong, it adjusts its weights so
| that the correct action will score higher next time.
+item
| #[strong Save] the trained model using
@ -67,26 +61,29 @@ p
+list("numbers")
+item
| #[strong Create] a new #[code Language] class and before initialising
| it, update the #[code tag_map] in its #[code Defaults] with your
| custom tags.
| #[strong Load the model] you want to start with, or create an
| #[strong empty model] using
| #[+api("spacy#blank") #[code spacy.blank]] with the ID of your
| language. If you're using a blank model, don't forget to add the
| tagger to the pipeline. If you're using an existing model,
| make sure to disable all other pipeline components during training
| using #[+api("language#disable_pipes") #[code nlp.disable_pipes]].
| This way, you'll only be training the tagger.
+item
| #[strong Create a new tagger] component and add it to the pipeline.
| #[strong Add the tag map] to the tagger using the
| #[+api("tagger#add_label") #[code add_label]] method. The first
| argument is the new tag name, the second the mapping to spaCy's
| coarse-grained tags, e.g. #[code {'pos': 'NOUN'}].
+item
| #[strong Shuffle and loop over] the examples and create a
| #[code Doc] and #[code GoldParse] object for each example. Make sure
| to pass in the #[code tags] when you create the #[code GoldParse].
+item
| For each example, #[strong update the model]
| by calling #[+api("language#update") #[code nlp.update]], which steps
| through the words of the input. At each word, it makes a
| #[strong prediction]. It then consults the annotations provided on the
| #[code GoldParse] instance, to see whether it was
| right. If it was wrong, it adjusts its weights so that the correct
| action will score higher next time.
| #[strong Shuffle and loop over] the examples. For each example,
| #[strong update the model] by calling
| #[+api("language#update") #[code nlp.update]], which steps through
| the words of the input. At each word, it makes a
| #[strong prediction]. It then consults the annotations to see whether
| it was right. If it was wrong, it adjusts its weights so that the
| correct action will score higher next time.
+item
| #[strong Save] the trained model using
@ -124,7 +121,7 @@ p
| respective action e.g. search the database for hotels with high ratings
| for their wifi offerings.
+aside("Tip: merge phrases and entities")
+aside("Tip: merge phrases and entities", "💡")
| To achieve even better accuracy, try merging multi-word tokens and
| entities specific to your domain into one token before parsing your text.
| You can do this by running the entity recognizer or
@ -160,9 +157,10 @@ p
| #[strong empty model] using
| #[+api("spacy#blank") #[code spacy.blank]] with the ID of your
| language. If you're using a blank model, don't forget to add the
| parser to the pipeline. If you're using an existing model,
| make sure to disable all other pipeline components during training
| using #[+api("language#disable_pipes") #[code nlp.disable_pipes]].
| custom parser to the pipeline. If you're using an existing model,
| make sure to #[strong remove the old parser] from the pipeline, and
| disable all other pipeline components during training using
| #[+api("language#disable_pipes") #[code nlp.disable_pipes]].
| This way, you'll only be training the parser.
+item
@ -170,19 +168,13 @@ p
| #[+api("dependencyparser#add_label") #[code add_label]] method.
+item
| #[strong Shuffle and loop over] the examples and create a
| #[code Doc] and #[code GoldParse] object for each example. Make sure
| to pass in the #[code heads] and #[code deps] when you create the
| #[code GoldParse].
+item
| For each example, #[strong update the model]
| by calling #[+api("language#update") #[code nlp.update]], which steps
| #[strong Shuffle and loop over] the examples. For each example,
| #[strong update the model] by calling
| #[+api("language#update") #[code nlp.update]], which steps
| through the words of the input. At each word, it makes a
| #[strong prediction]. It then consults the annotations provided on the
| #[code GoldParse] instance, to see whether it was
| right. If it was wrong, it adjusts its weights so that the correct
| action will score higher next time.
| #[strong prediction]. It then consults the annotations to see whether
| it was right. If it was wrong, it adjusts its weights so that the
| correct action will score higher next time.
+item
| #[strong Save] the trained model using

View File

@ -35,17 +35,18 @@ p
| be able to see results on each training iteration.
+item
| #[strong Loop over] the training examples, partition them into
| batches and create #[code Doc] and #[code GoldParse] objects for each
| example in the batch.
| #[strong Loop over] the training examples and partition them into
| batches using spaCy's
| #[+api("top-level#util.minibatch") #[code minibatch]] and
| #[+api("top-level#util.compounding") #[code compounding]] helpers.
+item
| #[strong Update the model] by calling
| #[+api("language#update") #[code nlp.update]], which steps
| through the examples and makes a #[strong prediction]. It then
| consults the annotations provided on the #[code GoldParse] instance,
| to see whether it was right. If it was wrong, it adjusts its weights
| so that the correct prediction will score higher next time.
| consults the annotations to see whether it was right. If it was
| wrong, it adjusts its weights so that the correct prediction will
| score higher next time.
+item
| Optionally, you can also #[strong evaluate the text classifier] on

View File

@ -0,0 +1,135 @@
//- 💫 DOCS > USAGE > TRAINING > OPTIMIZATION TIPS AND ADVICE
p
| There are lots of conflicting "recipes" for training deep neural
| networks at the moment. The cutting-edge models take a very long time to
| train, so most researchers can't run enough experiments to figure out
| what's #[em really] going on. For what it's worth, here's a recipe that
| seems to work well on a lot of problems:
+code("Batch heuristic").
def get_batches(train_data, model_type):
max_batch_sizes = {'tagger': 32, 'parser': 16, 'ner': 16, 'textcat': 64}
max_batch_size = max_batch_sizes[model_type]
if len(train_data) < 1000:
max_batch_size /= 2
if len(train_data) < 500:
max_batch_size /= 2
batch_size = compounding(1, max_batch_size, 1.001)
batches = minibatch(train_data, size=batch_size)
return batches
p
| This will set the batch size to start at #[code 1], and increase each
| batch until it reaches a maximum size. The tagger, parser and entity
| recognizer all take whole sentences as input, so they're learning a lot
| of labels in a single example. You therefore need smaller batches for
| them. The batch size for the text categorizer should be somewhat larger,
| especially if your documents are long.
p
| The trick of increasing the batch size is starting to become quite
| popular (see #[+a("https://arxiv.org/abs/1711.00489") Smith et al., 2017]).
| Their recipe is quite different from how spaCy's models are being
| trained, but there are some similarities. In training the various spaCy
| models, we haven't found much advantage from decaying the learning
| rate, but starting with a low batch size has definitely helped. You
| should try it out on your data, and see how you go.
+h(3, "tips-hyperparams") Learning rate, regularization and gradient clipping
p
| By default spaCy uses the Adam solver, with default settings
| (learning rate #[code 0.001], #[code beta1=0.9], #[code beta2=0.999]).
| Some researchers have said they found these settings terrible on their
| problems, but they've always performed very well in training spaCy's
| models, in combination with the rest of our recipe. You can change these
| settings directly, by modifying the corresponding attributes on the
| #[code optimizer] object. You can also set environment variables, to
| adjust the defaults.
p
| There are two other key hyper-parameters of the solver: #[code L2]
| #[strong regularization], and #[strong gradient clipping]
| (#[code max_grad_norm]). Gradient clipping is a hack that's not discussed
| often, but everybody seems to be using it. It's quite important in helping
| to ensure the network doesn't diverge, which is a fancy way of saying
| "fall over during training". The effect is sort of similar to setting the
| learning rate low. It can also compensate for a large batch size (this is
| a good example of how the choices of all these hyper-parameters
| intersect).
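p
| As a rough sketch of adjusting these settings directly (the attribute
| names below are an assumption and depend on the Thinc version backing
| your install, so check the optimizer object you actually get back before
| relying on them):
+code.
optimizer = nlp.begin_training()
# assumed attribute names on Thinc's Adam optimizer; verify for your version
optimizer.alpha = 0.001         # learning rate
optimizer.L2 = 1e-6             # L2 regularization strength
optimizer.max_grad_norm = 1.0   # gradient clipping threshold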
+h(3, "tips-dropout") Dropout rate
p
| For small datasets, it's useful to set a
| #[strong high dropout rate at first], and #[strong decay] it down towards
| a more reasonable value. This helps avoid the network immediately
| overfitting, while still encouraging it to learn some of the more
| interesting things in your data. spaCy comes with a
| #[+api("top-level#util.decaying") #[code decaying]] utility function to
| facilitate this. You might try setting:
+code.
from spacy.util import decaying
dropout = decaying(0.6, 0.2, 1e-4)
p
| You can then draw values from the iterator with #[code next(dropout)],
| which you would pass to the #[code drop] keyword argument of
| #[+api("language#update") #[code nlp.update]]. It's pretty much always a
| good idea to use at least #[strong some dropout]. All of the models
| currently use Bernoulli dropout, for no particularly principled reason:
| we just haven't experimented with another scheme like Gaussian dropout
| yet.
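p
| For example, a sketch of plugging the decaying dropout into the update
| loop (assuming #[code train_data] yields batches of texts and annotation
| dicts, and #[code optimizer] comes from #[code nlp.begin_training()]):
+code.
from spacy.util import decaying
dropout = decaying(0.6, 0.2, 1e-4)
for texts, annotations in train_data:
    # draw the next dropout value and pass it via the drop keyword argument
    nlp.update(texts, annotations, drop=next(dropout), sgd=optimizer)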
+h(3, "tips-param-avg") Parameter averaging
p
| The last part of our optimisation recipe is #[strong parameter averaging],
| an old trick introduced by
| #[+a("https://cseweb.ucsd.edu/~yfreund/papers/LargeMarginsUsingPerceptron.pdf") Freund and Schapire (1999)],
| popularised in the NLP community by
| #[+a("http://www.aclweb.org/anthology/P04-1015") Collins (2002)],
| explained in more detail by
| #[+a("http://leon.bottou.org/projects/sgd") Leon Botto]. Just about the
| only other people who seem to be using this for neural network training
| are the SyntaxNet team (one of whom is Michael Collins), but it really
| seems to work great on every problem.
p
| The trick is to store the moving average of the weights during training.
| We don't optimise this average; we just track it. Then when we want to
| actually use the model, we use the averages, not the most recent value.
| In spaCy (and #[+a(gh("thinc")) Thinc]) this is done by using a
| context manager, #[+api("language#use_params") #[code use_params]], to
| temporarily replace the weights:
+code.
with nlp.use_params(optimizer.averages):
nlp.to_disk('/model')
p
| The context manager is handy because you naturally want to evaluate and
| save the model at various points during training (e.g. after each epoch).
| After evaluating and saving, the context manager will exit and the
| weights will be restored, so you resume training from the most recent
| value, rather than the average. By evaluating the model after each epoch,
| you can remove one hyper-parameter from consideration (the number of
| epochs). Having one less magic number to guess is extremely nice, so
| having the averaging under a context manager is very convenient.
+h(3, "tips-transfer-learning") Transfer learning
p
| Finally, if you're training from a small data set, it's very useful to
| start off with some knowledge already in the model. #[strong Word vectors]
| are an easy and reliable way to do that, but depending on the
| application, you may also be able to start with useful knowledge from one
| of spaCy's #[+a("/models") pre-trained models], such as the parser,
| entity recogniser and tagger. If you're adapting a pre-trained model and
| you want it to retain accuracy on the tasks it was originally trained
| for, you should consider the "catastrophic forgetting" problem.
| #[+a("https://explosion.ai/blog/pseudo-rehearsal-catastrophic-forgetting", true) See this blog post]
| to read more about the problem and our suggested solution,
| pseudo-rehearsal.

View File

@ -110,17 +110,23 @@ p
| spaCy when to #[em stop], you can now explicitly call
| #[+api("language#begin_training") #[code begin_taining]], which
| returns an optimizer you can pass into the
| #[+api("language#update") #[code update]] function.
| #[+api("language#update") #[code update]] function. While #[code update]
| still accepts sequences of #[code Doc] and #[code GoldParse] objects,
| you can now also pass in a list of strings and dictionaries describing
| the annotations. This is the recommended usage, as it removes one layer
| of abstraction from the training.
+code-new.
optimizer = nlp.begin_training()
for itn in range(1000):
for doc, gold in train_data:
nlp.update([doc], [gold], sgd=optimizer)
for texts, annotations in train_data:
nlp.update(texts, annotations, sgd=optimizer)
nlp.to_disk('/model')
+code-old.
for itn in range(1000):
for doc, gold in train_data:
for text, entities in train_data:
doc = Doc(text)
gold = GoldParse(doc, entities=entities)
nlp.update(doc, gold)
nlp.end_training()
nlp.save_to_directory('/model')

View File

@ -165,18 +165,15 @@ include ../_includes/_mixins
+h(3, "keras") Text classification with Keras
p
| In this example, we're using spaCy to pre-process text for use with
| a #[+a("https://keras.io") Keras] text classification model.
| This example shows how to use a #[+a("https://keras.io") Keras]
| LSTM sentiment classification model in spaCy. spaCy splits
| the document into sentences, and each sentence is classified using
| the LSTM. The scores for the sentences are then aggregated to give
| the document score. This kind of hierarchical model is quite
| difficult in "pure" Keras or Tensorflow, but it's very effective.
| The Keras example on this dataset performs quite poorly, because it
| cuts off the documents so that they're a fixed size. This hurts
| review accuracy a lot, because people often summarise their rating
| in the final sentence.
+github("spacy", "examples/deep_learning_keras.py")
+h(3, "keras-parikh-entailment") A decomposable attention model for Natural Language Inference
p
| This example contains an implementation of the entailment prediction
| model described by #[+a("https://arxiv.org/pdf/1606.01933.pdf") Parikh et al. (2016)].
| The model is notable for its competitive performance with very few
| parameters, and was implemented using #[+a("https://keras.io") Keras]
| and spaCy.
+github("spacy", "examples/keras_parikh_entailment/__main__.py", false, "examples/keras_parikh_entailment")

View File

@ -20,14 +20,14 @@ p
+h(2, "tagger-parser") Training the tagger and parser
include _training/_tagger-parser
+section("similarity")
+h(2, "similarity") Training a similarity model
include _training/_similarity
+section("textcat")
+h(2, "textcat") Training a text classification model
include _training/_textcat
+section("tips")
+h(2, "tips") Optimization tips and advice
include _training/_tips
+section("saving-loading")
+h(2, "saving-loading") Saving and loading models
include _training/_saving-loading