Merge branch 'develop' of https://github.com/explosion/spaCy into develop

This commit is contained in:
Matthew Honnibal 2017-11-07 02:00:06 +01:00
commit 9a88e66103
32 changed files with 491 additions and 189 deletions

View File

@ -1,18 +1,24 @@
import plac """
import collections This example shows how to use an LSTM sentiment classification model trained using Keras in spaCy. spaCy splits the document into sentences, and each sentence is classified using the LSTM. The scores for the sentences are then aggregated to give the document score. This kind of hierarchical model is quite difficult in "pure" Keras or Tensorflow, but it's very effective. The Keras example on this dataset performs quite poorly, because it cuts off the documents so that they're a fixed size. This hurts review accuracy a lot, because people often summarise their rating in the final sentence.
import random
Prerequisites:
spacy download en_vectors_web_lg
pip install keras==2.0.9
Compatible with: spaCy v2.0.0+
"""
import plac
import random
import pathlib import pathlib
import cytoolz import cytoolz
import numpy import numpy
from keras.models import Sequential, model_from_json from keras.models import Sequential, model_from_json
from keras.layers import LSTM, Dense, Embedding, Dropout, Bidirectional from keras.layers import LSTM, Dense, Embedding, Bidirectional
from keras.layers import TimeDistributed from keras.layers import TimeDistributed
from keras.optimizers import Adam from keras.optimizers import Adam
from spacy.compat import pickle
import thinc.extra.datasets import thinc.extra.datasets
from spacy.compat import pickle
import spacy import spacy
@ -84,8 +90,8 @@ def get_features(docs, max_length):
def train(train_texts, train_labels, dev_texts, dev_labels, def train(train_texts, train_labels, dev_texts, dev_labels,
lstm_shape, lstm_settings, lstm_optimizer, batch_size=100, nb_epoch=5, lstm_shape, lstm_settings, lstm_optimizer, batch_size=100,
by_sentence=True): nb_epoch=5, by_sentence=True):
print("Loading spaCy") print("Loading spaCy")
nlp = spacy.load('en_vectors_web_lg') nlp = spacy.load('en_vectors_web_lg')
nlp.add_pipe(nlp.create_pipe('sentencizer')) nlp.add_pipe(nlp.create_pipe('sentencizer'))

View File

@ -6,7 +6,7 @@ money and currency values (entities labelled as MONEY) and then check the
dependency tree to find the noun phrase they are referring to, for example: dependency tree to find the noun phrase they are referring to, for example:
$9.4 million --> Net income. $9.4 million --> Net income.
Last updated for: spaCy 2.0.0a18 Compatible with: spaCy v2.0.0+
""" """
from __future__ import unicode_literals, print_function from __future__ import unicode_literals, print_function

View File

@ -16,7 +16,7 @@ show you how computers understand [language]
I'm assuming that we can use the token.head to build these groups." I'm assuming that we can use the token.head to build these groups."
Last updated for: spaCy 2.0.0a18 Compatible with: spaCy v2.0.0+
""" """
from __future__ import unicode_literals, print_function from __future__ import unicode_literals, print_function

View File

@ -33,6 +33,8 @@ formatted in jsonl as a sequence of entries like this:
{"text":"Annapolis"} {"text":"Annapolis"}
{"text":"Appalachia"} {"text":"Appalachia"}
{"text":"Argentina"} {"text":"Argentina"}
Compatible with: spaCy v2.0.0+
""" """
from __future__ import print_function, unicode_literals, division from __future__ import print_function, unicode_literals, division

View File

@ -7,8 +7,7 @@ they're called on is passed in as the first argument.
* Custom pipeline components: https://alpha.spacy.io/usage/processing-pipelines#custom-components * Custom pipeline components: https://alpha.spacy.io/usage/processing-pipelines#custom-components
Developed for: spaCy 2.0.0a17 Compatible with: spaCy v2.0.0+
Last updated for: spaCy 2.0.0a18
""" """
from __future__ import unicode_literals, print_function from __future__ import unicode_literals, print_function

View File

@ -8,8 +8,7 @@ coordinates. Can be extended with more details from the API.
* REST Countries API: https://restcountries.eu (Mozilla Public License MPL 2.0) * REST Countries API: https://restcountries.eu (Mozilla Public License MPL 2.0)
* Custom pipeline components: https://alpha.spacy.io/usage/processing-pipelines#custom-components * Custom pipeline components: https://alpha.spacy.io/usage/processing-pipelines#custom-components
Developed for: spaCy 2.0.0a17 Compatible with: spaCy v2.0.0+
Last updated for: spaCy 2.0.0a18
""" """
from __future__ import unicode_literals, print_function from __future__ import unicode_literals, print_function

View File

@ -8,8 +8,7 @@ respectively.
* Custom pipeline components: https://alpha.spacy.io/usage/processing-pipelines#custom-components * Custom pipeline components: https://alpha.spacy.io/usage/processing-pipelines#custom-components
Developed for: spaCy 2.0.0a17 Compatible with: spaCy v2.0.0+
Last updated for: spaCy 2.0.0a18
""" """
from __future__ import unicode_literals, print_function from __future__ import unicode_literals, print_function

View File

@ -6,7 +6,7 @@ each "sentence" on a newline, and spaces between tokens. Data is loaded from
the IMDB movie reviews dataset and will be loaded automatically via Thinc's the IMDB movie reviews dataset and will be loaded automatically via Thinc's
built-in dataset loader. built-in dataset loader.
Last updated for: spaCy 2.0.0a18 Compatible with: spaCy v2.0.0+
""" """
from __future__ import print_function, unicode_literals from __future__ import print_function, unicode_literals
from toolz import partition_all from toolz import partition_all

View File

@ -15,8 +15,7 @@ following types of relations: ROOT, PLACE, QUALITY, ATTRIBUTE, TIME, LOCATION.
('hotel', 'PLACE', 'show') --> show PLACE hotel ('hotel', 'PLACE', 'show') --> show PLACE hotel
('berlin', 'LOCATION', 'hotel') --> hotel with LOCATION berlin ('berlin', 'LOCATION', 'hotel') --> hotel with LOCATION berlin
Developed for: spaCy 2.0.0a18 Compatible with: spaCy v2.0.0+
Last updated for: spaCy 2.0.0a19
""" """
from __future__ import unicode_literals, print_function from __future__ import unicode_literals, print_function

View File

@ -7,8 +7,7 @@ For more details, see the documentation:
* Training: https://alpha.spacy.io/usage/training * Training: https://alpha.spacy.io/usage/training
* NER: https://alpha.spacy.io/usage/linguistic-features#named-entities * NER: https://alpha.spacy.io/usage/linguistic-features#named-entities
Developed for: spaCy 2.0.0a18 Compatible with: spaCy v2.0.0+
Last updated for: spaCy 2.0.0a19
""" """
from __future__ import unicode_literals, print_function from __future__ import unicode_literals, print_function

View File

@ -23,8 +23,7 @@ For more details, see the documentation:
* Training: https://alpha.spacy.io/usage/training * Training: https://alpha.spacy.io/usage/training
* NER: https://alpha.spacy.io/usage/linguistic-features#named-entities * NER: https://alpha.spacy.io/usage/linguistic-features#named-entities
Developed for: spaCy 2.0.0a18 Compatible with: spaCy v2.0.0+
Last updated for: spaCy 2.0.0a19
""" """
from __future__ import unicode_literals, print_function from __future__ import unicode_literals, print_function

View File

@ -5,8 +5,7 @@ model or a blank model. For more details, see the documentation:
* Training: https://alpha.spacy.io/usage/training * Training: https://alpha.spacy.io/usage/training
* Dependency Parse: https://alpha.spacy.io/usage/linguistic-features#dependency-parse * Dependency Parse: https://alpha.spacy.io/usage/linguistic-features#dependency-parse
Developed for: spaCy 2.0.0a18 Compatible with: spaCy v2.0.0+
Last updated for: spaCy 2.0.0a18
""" """
from __future__ import unicode_literals, print_function from __future__ import unicode_literals, print_function

View File

@ -8,8 +8,7 @@ the documentation:
* Training: https://alpha.spacy.io/usage/training * Training: https://alpha.spacy.io/usage/training
* POS Tagging: https://alpha.spacy.io/usage/linguistic-features#pos-tagging * POS Tagging: https://alpha.spacy.io/usage/linguistic-features#pos-tagging
Developed for: spaCy 2.0.0a18 Compatible with: spaCy v2.0.0+
Last updated for: spaCy 2.0.0a19
""" """
from __future__ import unicode_literals, print_function from __future__ import unicode_literals, print_function

View File

@ -8,8 +8,7 @@ see the documentation:
* Training: https://alpha.spacy.io/usage/training * Training: https://alpha.spacy.io/usage/training
* Text classification: https://alpha.spacy.io/usage/text-classification * Text classification: https://alpha.spacy.io/usage/text-classification
Developed for: spaCy 2.0.0a18 Compatible with: spaCy v2.0.0+
Last updated for: spaCy 2.0.0a19
""" """
from __future__ import unicode_literals, print_function from __future__ import unicode_literals, print_function
import plac import plac
@ -18,8 +17,7 @@ from pathlib import Path
import thinc.extra.datasets import thinc.extra.datasets
import spacy import spacy
from spacy.gold import minibatch from spacy.util import minibatch, compounding
from spacy.util import compounding
@plac.annotations( @plac.annotations(

View File

@ -2,6 +2,7 @@
# coding: utf8 # coding: utf8
"""Load vectors for a language trained using fastText """Load vectors for a language trained using fastText
https://github.com/facebookresearch/fastText/blob/master/pretrained-vectors.md https://github.com/facebookresearch/fastText/blob/master/pretrained-vectors.md
Compatible with: spaCy v2.0.0+
""" """
from __future__ import unicode_literals from __future__ import unicode_literals
import plac import plac

View File

@ -1,19 +1,18 @@
# coding: utf8
from __future__ import unicode_literals from __future__ import unicode_literals
from ...language import Language from ...language import Language
def test_simple_train(): def test_simple_train():
nlp = Language() nlp = Language()
nlp.add_pipe(nlp.create_pipe('textcat')) nlp.add_pipe(nlp.create_pipe('textcat'))
nlp.get_pipe('textcat').add_label('is_good') nlp.get_pipe('textcat').add_label('answer')
nlp.begin_training() nlp.begin_training()
for i in range(5): for i in range(5):
for text, answer in [('aaaa', 1.), ('bbbb', 0), ('aa', 1.), for text, answer in [('aaaa', 1.), ('bbbb', 0), ('aa', 1.),
('bbbbbbbbb', 0.), ('aaaaaa', 1)]: ('bbbbbbbbb', 0.), ('aaaaaa', 1)]:
nlp.update([text], [{'cats': {'answer': answer}}]) nlp.update([text], [{'cats': {'answer': answer}}])
doc = nlp(u'aaa') doc = nlp(u'aaa')
assert 'is_good' in doc.cats assert 'answer' in doc.cats
assert doc.cats['is_good'] >= 0.5 assert doc.cats['answer'] >= 0.5

View File

@ -392,7 +392,7 @@ def minibatch(items, size=8):
so that batch-size can vary on each step. so that batch-size can vary on each step.
""" """
if isinstance(size, int): if isinstance(size, int):
size_ = itertools.repeat(8) size_ = itertools.repeat(size)
else: else:
size_ = size size_ = size
items = iter(items) items = iter(items)

View File

@ -127,20 +127,22 @@ mixin help(tooltip, icon_size)
//- Aside wrapper //- Aside wrapper
label - [string] aside label label - [string] aside label
mixin aside-wrapper(label) mixin aside-wrapper(label, emoji)
aside.c-aside aside.c-aside
.c-aside__content(role="complementary")&attributes(attributes) .c-aside__content(role="complementary")&attributes(attributes)
if label if label
h4.u-text-label.u-text-label--dark=label h4.u-text-label.u-text-label--dark
if emoji
span.o-emoji=emoji
| #{label}
block block
//- Aside for text //- Aside for text
label - [string] aside title (optional) label - [string] aside title (optional)
mixin aside(label) mixin aside(label, emoji)
+aside-wrapper(label) +aside-wrapper(label, emoji)
.c-aside__text.u-text-small .c-aside__text.u-text-small
block block
@ -703,6 +705,6 @@ mixin landing-logos(title, logos)
mixin under-construction() mixin under-construction()
+infobox("Under construction", "🚧") +infobox("Under construction", "🚧")
| This section is still being written and will be updated for the v2.0 | This section is still being written and will be updated for the v2.0
| release. Is there anything that you think should definitely be mentioned or | release. Is there anything that you think should definitely be mentioned
| explained here? Any examples you'd like to see? #[strong Let us know] | or explained here? Any examples you'd like to see?
| on the #[+a(gh("spacy") + "/issues/1105") v2.0 alpha thread] on GitHub! | #[strong Let us know] on the #[+a(gh("spacy") + "/issues") issue tracker]!

View File

@ -320,3 +320,137 @@ p
| #[code title] is rendered as coloured headline. #[code exits] | #[code title] is rendered as coloured headline. #[code exits]
| performs system exit after printing, using the value of the | performs system exit after printing, using the value of the
| argument as the exit code, e.g. #[code exits=1]. | argument as the exit code, e.g. #[code exits=1].
+h(3, "util.minibatch") util.minibatch
+tag function
+tag-new(2)
p
| Iterate over batches of items. #[code size] may be an iterator, so that
| batch-size can vary on each step.
+aside-code("Example").
batches = minibatch(train_data)
for batch in batches:
texts, annotations = zip(*batch)
nlp.update(texts, annotations)
+table(["Name", "Type", "Description"])
+row
+cell #[code items]
+cell iterable
+cell The items to batch up.
+row
+cell #[code size]
+cell int / iterable
+cell
| The batch size(s). Use
| #[+api("top-level#util.compounding") #[code util.compounding]] or
| #[+api("top-level#util.decaying") #[code util.decaying]]
| for an infinite series of compounding or decaying values.
+row("foot")
+cell yields
+cell list
+cell The batches.
+h(3, "util.compounding") util.compounding
+tag function
+tag-new(2)
p
| Yield an infinite series of compounding values. Each time the generator
| is called, a value is produced by multiplying the previous value by the
| compound rate.
+aside-code("Example").
sizes = compounding(1., 10., 1.5)
assert next(sizes) == 1.
assert next(sizes) == 1. * 1.5
assert next(sizes) == 1.5 * 1.5
+table(["Name", "Type", "Description"])
+row
+cell #[code start]
+cell int / float
+cell The first value.
+row
+cell #[code stop]
+cell int / float
+cell The maximum value.
+row
+cell #[code compound]
+cell int / float
+cell The compounding factor.
+row("foot")
+cell yields
+cell int / float
+cell Compounding values.
+h(3, "util.decaying") util.decaying
+tag function
+tag-new(2)
p
| Yield an infinite series of linearly decaying values.
+aside-code("Example").
sizes = decaying(1., 10., 0.001)
assert next(sizes) == 1.
assert next(sizes) == 1. - 0.001
assert next(sizes) == 0.999 - 0.001
+table(["Name", "Type", "Description"])
+row
+cell #[code start]
+cell int / float
+cell The first value.
+row
+cell #[code end]
+cell int / float
+cell The maximum value.
+row
+cell #[code decay]
+cell int / float
+cell The decaying factor.
+row("foot")
+cell yields
+cell int / float
+cell The decaying values.
+h(3, "util.itershuffle") util.itershuffle
+tag function
+tag-new(2)
p
| Shuffle an iterator. This works by holding #[code bufsize] items back and
| yielding them sometime later. Obviously, this is not unbiased but
| should be good enough for batching. Larger bufsize means less bias.
+aside-code("Example").
values = range(1000)
shuffled = itershuffle(values)
+table(["Name", "Type", "Description"])
+row
+cell #[code iterable]
+cell iterable
+cell Iterator to shuffle.
+row
+cell #[code bufsize]
+cell int
+cell Items to hold back.
+row("foot")
+cell yields
+cell iterable
+cell The shuffled iterator.

View File

@ -157,12 +157,19 @@ p Update the models in the pipeline.
+row +row
+cell #[code docs] +cell #[code docs]
+cell iterable +cell iterable
+cell A batch of #[code Doc] objects. +cell
| A batch of #[code Doc] objects or unicode. If unicode, a
| #[code Doc] object will be created from the text.
+row +row
+cell #[code golds] +cell #[code golds]
+cell iterable +cell iterable
+cell A batch of #[code GoldParse] objects. +cell
| A batch of #[code GoldParse] objects or dictionaries.
| Dictionaries will be used to create
| #[+api("goldparse") #[code GoldParse]] objects. For the available
| keys and their usage, see
| #[+api("goldparse#init") #[code GoldParse.__init__]].
+row +row
+cell #[code drop] +cell #[code drop]

View File

@ -148,8 +148,8 @@
"Basics": "basics", "Basics": "basics",
"NER": "ner", "NER": "ner",
"Tagger & Parser": "tagger-parser", "Tagger & Parser": "tagger-parser",
"Similarity": "similarity",
"Text Classification": "textcat", "Text Classification": "textcat",
"Tips and Advice": "tips",
"Saving & Loading": "saving-loading" "Saving & Loading": "saving-loading"
} }
}, },

View File

@ -88,8 +88,8 @@ p
| and add it to the #[code Language] instance returned by the | and add it to the #[code Language] instance returned by the
| model's #[code load()] method. For examples of this, check out the | model's #[code load()] method. For examples of this, check out the
| implementations of spaCy's | implementations of spaCy's
| #[+api("util#load_model_from_init_py") #[code load_model_from_init_py()]] | #[+api("util#load_model_from_init_py") #[code load_model_from_init_py]]
| and #[+api("util#load_model_from_path") #[code load_model_from_path()]] | and #[+api("util#load_model_from_path") #[code load_model_from_path]]
| utility functions. | utility functions.
+code-wrapper +code-wrapper

View File

@ -172,15 +172,23 @@ p
+row +row
+cell #[code get_data] +cell #[code get_data]
+cell A function converting the training data to spaCy's JSON format. +cell
| An optional function converting the training data to spaCy's
| JSON format.
+row +row
+cell #[code doc] +cell #[code doc]
+cell #[+api("doc") #[code Doc]] objects. +cell
| #[+api("doc") #[code Doc]] objects. The #[code update] method
| takes a sequence of them, so you can batch up your training
| examples.
+row +row
+cell #[code gold] +cell #[code gold]
+cell #[+api("goldparse") #[code GoldParse]] objects. +cell
| #[+api("goldparse") #[code GoldParse]] objects. The #[code update]
| method takes a sequence of them, so you can batch up your
| training examples.
+row +row
+cell #[code drop] +cell #[code drop]
@ -197,3 +205,53 @@ p
| a model will be saved out to the directory. After training, you can | a model will be saved out to the directory. After training, you can
| use the #[+api("cli#package") #[code package]] command to generate an | use the #[+api("cli#package") #[code package]] command to generate an
| installable Python package from your model. | installable Python package from your model.
+code(false, "bash").
spacy convert /tmp/train.conllu /tmp/data
spacy train en /tmp/model /tmp/data/train.json -n 5
+h(3, "training-simple-style") Simple training style
+tag-new(2)
p
| Instead of sequences of #[code Doc] and #[code GoldParse] objects,
| you can also use the "simple training style" and pass
| #[strong raw texts] and #[strong dictionaries of annotations]
| to #[+api("language#update") #[code nlp.update]].
| The dictionaries can have the keys #[code entities], #[code heads],
| #[code deps], #[code tags] and #[code cats]. This is generally
| recommended, as it removes one layer of abstraction, and avoids
| unnecessary imports. It also makes it easier to structure and load
| your training data.
+aside-code("Example Annotations").
{
'entities': [(0, 4, 'ORG')],
'heads': [1, 1, 1, 5, 5, 2, 7, 5],
'deps': ['nsubj', 'ROOT', 'prt', 'quantmod', 'compound', 'pobj', 'det', 'npadvmod'],
'tags': ['PROPN', 'VERB', 'ADP', 'SYM', 'NUM', 'NUM', 'DET', 'NOUN'],
'cats': {'BUSINESS': 1.0}
}
+code("Simple training loop").
TRAIN_DATA = [
("Uber blew through $1 million a week", {'entities': [(0, 4, 'ORG')]}),
("Google rebrands its business apps", {'entities': [(0, 6, "ORG")]})]
nlp = spacy.blank('en')
optimizer = nlp.begin_training()
for i in range(20):
random.shuffle(TRAIN_DATA)
for text, annotations in TRAIN_DATA:
nlp.update([text], [annotations], sgd=optimizer)
nlp.to_disk('/model')
p
| The above training loop leaves out a few details that can really
| improve accuracy but the principle really is #[em that] simple. Once
| you've got your pipeline together and you want to tune the accuracy,
| you usually want to process your training examples in batches, and
| experiment with #[+api("top-level#util.minibatch") #[code minibatch]]
| sizes and dropout rates, set via the #[code drop] keyword argument. See
| the #[+api("language") #[code Language]] and #[+api("pipe") #[code Pipe]]
| API docs for available options.

View File

@ -2,12 +2,9 @@
p p
| All #[+a("/models") spaCy models] support online learning, so | All #[+a("/models") spaCy models] support online learning, so
| you can update a pre-trained model with new examples. To update the | you can update a pre-trained model with new examples. You'll usually
| model, you first need to create an instance of | need to provide many #[strong examples] to meaningfully improve the
| #[+api("goldparse") #[code GoldParse]], with the entity labels | system — a few hundred is a good start, although more is better.
| you want to learn. You'll usually need to provide many examples to
| meaningfully improve the system — a few hundred is a good start, although
| more is better.
p p
| You should avoid iterating over the same few examples multiple times, or | You should avoid iterating over the same few examples multiple times, or
@ -21,7 +18,7 @@ p
| the model of other examples by augmenting your annotations with sentences | the model of other examples by augmenting your annotations with sentences
| annotated with entities automatically recognised by the original model. | annotated with entities automatically recognised by the original model.
| Ultimately, this is an empirical process: you'll need to | Ultimately, this is an empirical process: you'll need to
| #[strong experiment on your own data] to find a solution that works best | #[strong experiment on your data] to find a solution that works best
| for you. | for you.
+h(3, "example-train-ner") Updating the Named Entity Recognizer +h(3, "example-train-ner") Updating the Named Entity Recognizer
@ -39,12 +36,6 @@ p
+h(4) Step by step guide +h(4) Step by step guide
+list("numbers") +list("numbers")
+item
| #[strong Reformat the training data] to match spaCy's
| #[+a("/api/annotation#json-input") JSON format]. The built-in
| #[+api("goldparse#biluo_tags_from_offsets") #[code biluo_tags_from_offsets]]
| function can help you with this.
+item +item
| #[strong Load the model] you want to start with, or create an | #[strong Load the model] you want to start with, or create an
| #[strong empty model] using | #[strong empty model] using
@ -56,17 +47,13 @@ p
| This way, you'll only be training the entity recognizer. | This way, you'll only be training the entity recognizer.
+item +item
| #[strong Shuffle and loop over] the examples and create a | #[strong Shuffle and loop over] the examples. For each example,
| #[code Doc] and #[code GoldParse] object for each example. | #[strong update the model] by calling
| #[+api("language#update") #[code nlp.update]], which steps
+item
| For each example, #[strong update the model]
| by calling #[+api("language#update") #[code nlp.update]], which steps
| through the words of the input. At each word, it makes a | through the words of the input. At each word, it makes a
| #[strong prediction]. It then consults the annotations provided on the | #[strong prediction]. It then consults the annotations to see whether
| #[code GoldParse] instance, to see whether it was | it was right. If it was wrong, it adjusts its weights so that the
| right. If it was wrong, it adjusts its weights so that the correct | correct action will score higher next time.
| action will score higher next time.
+item +item
| #[strong Save] the trained model using | #[strong Save] the trained model using
@ -90,13 +77,16 @@ p
+github("spacy", "examples/training/train_new_entity_type.py", 500) +github("spacy", "examples/training/train_new_entity_type.py", 500)
+aside("Important note", "⚠️")
| If you're using an existing model, make sure to mix in examples of
| #[strong other entity types] that spaCy correctly recognized before.
| Otherwise, your model might learn the new type, but "forget" what it
| previously knew. This is also referred to as the
| #[+a("https://explosion.ai/blog/pseudo-rehearsal-catastrophic-forgetting", true) "catastrophic forgetting" problem].
+h(4) Step by step guide +h(4) Step by step guide
+list("numbers") +list("numbers")
+item
| Create #[code Doc] and #[code GoldParse] objects for
| #[strong each example in your training data].
+item +item
| #[strong Load the model] you want to start with, or create an | #[strong Load the model] you want to start with, or create an
| #[strong empty model] using | #[strong empty model] using
@ -117,10 +107,9 @@ p
| #[strong Loop over] the examples and call | #[strong Loop over] the examples and call
| #[+api("language#update") #[code nlp.update]], which steps through | #[+api("language#update") #[code nlp.update]], which steps through
| the words of the input. At each word, it makes a | the words of the input. At each word, it makes a
| #[strong prediction]. It then consults the annotations provided on the | #[strong prediction]. It then consults the annotations to see
| #[code GoldParse] instance, to see whether it was right. If it was | whether it was right. If it was wrong, it adjusts its weights so that
| wrong, it adjusts its weights so that the correct action will score | the correct action will score higher next time.
| higher next time.
+item +item
| #[strong Save] the trained model using | #[strong Save] the trained model using

View File

@ -41,7 +41,7 @@ p
"author": "You", "author": "You",
"email": "you@example.com", "email": "you@example.com",
"license": "CC BY-SA 3.0", "license": "CC BY-SA 3.0",
"pipeline": ["token_vectors", "tagger"] "pipeline": ["tagger", "parser", "ner"]
} }
+code(false, "bash"). +code(false, "bash").
@ -94,26 +94,13 @@ p
| The #[code load()] method that comes with our model package | The #[code load()] method that comes with our model package
| templates will take care of putting all this together and returning a | templates will take care of putting all this together and returning a
| #[code Language] object with the loaded pipeline and data. If your model | #[code Language] object with the loaded pipeline and data. If your model
| requires custom pipeline components, you should | requires custom #[+a("/usage/processing-pipelines") pipeline components]
| #[strong ship then with your model] and register their | or a custom language class, you can also
| #[+a("/usage/processing-pipelines#creating-factory") factories] | #[strong ship the code with your model]. For examples of this, check out
| via #[+api("spacy#set_factory") #[code set_factory()]]. | the implementations of spaCy's
| #[+api("util#load_model_from_init_py") #[code load_model_from_init_py]]
+aside-code("Factory example"). | and #[+api("util#load_model_from_path") #[code load_model_from_path]]
def my_factory(vocab): | utility functions.
# load some state
def my_component(doc):
# process the doc
return doc
return my_component
+code.
spacy.set_factory('custom_component', custom_component_factory)
+infobox("Custom models with pipeline components")
| For more details and an example of how to package a sentiment model
| with a custom pipeline component, see the usage guide on
| #[+a("/usage/processing-pipelines#example2") language processing pipelines].
+h(3, "models-building") Building the model package +h(3, "models-building") Building the model package
@ -155,8 +142,7 @@ p
| #[+api("language#from_disk") #[code from_disk]] instead. | #[+api("language#from_disk") #[code from_disk]] instead.
+code. +code.
from spacy.lang.en import English nlp = spacy.blank('en').from_disk('/path/to/data')
nlp = English().from_disk('/path/to/data')
+infobox("Important note: Loading data in v2.x") +infobox("Important note: Loading data in v2.x")
.o-block .o-block
@ -168,7 +154,7 @@ p
| spaCy v2.0 solves this with a clear distinction between setting up | spaCy v2.0 solves this with a clear distinction between setting up
| the instance and loading the data. | the instance and loading the data.
+code-new nlp = English().from_disk('/path/to/data') +code-new nlp = spacy.blank('en').from_disk('/path/to/data')
+code-old nlp = spacy.load('en', path='/path/to/data') +code-old nlp = spacy.load('en', path='/path/to/data')
+h(3, "example-training-spacy") Example: How we're training and packaging models for spaCy +h(3, "example-training-spacy") Example: How we're training and packaging models for spaCy

View File

@ -1,3 +0,0 @@
//- 💫 DOCS > USAGE > TRAINING > SIMILARITY
+under-construction

View File

@ -30,19 +30,13 @@ p
| not necessary but it doesn't hurt either, just to be safe. | not necessary but it doesn't hurt either, just to be safe.
+item +item
| #[strong Shuffle and loop over] the examples and create a | #[strong Shuffle and loop over] the examples. For each example,
| #[code Doc] and #[code GoldParse] object for each example. Make sure | #[strong update the model] by calling
| to pass in the #[code heads] and #[code deps] when you create the | #[+api("language#update") #[code nlp.update]], which steps through
| #[code GoldParse]. | the words of the input. At each word, it makes a
| #[strong prediction]. It then consults the annotations to see
+item | whether it was right. If it was wrong, it adjusts its weights so
| For each example, #[strong update the model] | that the correct action will score higher next time.
| by calling #[+api("language#update") #[code nlp.update]], which steps
| through the words of the input. At each word, it makes a
| #[strong prediction]. It then consults the annotations provided on the
| #[code GoldParse] instance, to see whether it was
| right. If it was wrong, it adjusts its weights so that the correct
| action will score higher next time.
+item +item
| #[strong Save] the trained model using | #[strong Save] the trained model using
@ -67,26 +61,29 @@ p
+list("numbers") +list("numbers")
+item +item
| #[strong Create] a new #[code Language] class and before initialising | #[strong Load the model] you want to start with, or create an
| it, update the #[code tag_map] in its #[code Defaults] with your | #[strong empty model] using
| custom tags. | #[+api("spacy#blank") #[code spacy.blank]] with the ID of your
| language. If you're using a blank model, don't forget to add the
| tagger to the pipeline. If you're using an existing model,
| make sure to disable all other pipeline components during training
| using #[+api("language#disable_pipes") #[code nlp.disable_pipes]].
| This way, you'll only be training the tagger.
+item +item
| #[strong Create a new tagger] component and add it to the pipeline. | #[strong Add the tag map] to the tagger using the
| #[+api("tagger#add_label") #[code add_label]] method. The first
| argument is the new tag name, the second the mapping to spaCy's
| coarse-grained tags, e.g. #[code {'pos': 'NOUN'}].
+item +item
| #[strong Shuffle and loop over] the examples and create a | #[strong Shuffle and loop over] the examples. For each example,
| #[code Doc] and #[code GoldParse] object for each example. Make sure | #[strong update the model] by calling
| to pass in the #[code tags] when you create the #[code GoldParse]. | #[+api("language#update") #[code nlp.update]], which steps through
| the words of the input. At each word, it makes a
+item | #[strong prediction]. It then consults the annotations to see whether
| For each example, #[strong update the model] | it was right. If it was wrong, it adjusts its weights so that the
| by calling #[+api("language#update") #[code nlp.update]], which steps | correct action will score higher next time.
| through the words of the input. At each word, it makes a
| #[strong prediction]. It then consults the annotations provided on the
| #[code GoldParse] instance, to see whether it was
| right. If it was wrong, it adjusts its weights so that the correct
| action will score higher next time.
+item +item
| #[strong Save] the trained model using | #[strong Save] the trained model using
@ -124,7 +121,7 @@ p
| respective action e.g. search the database for hotels with high ratings | respective action e.g. search the database for hotels with high ratings
| for their wifi offerings. | for their wifi offerings.
+aside("Tip: merge phrases and entities") +aside("Tip: merge phrases and entities", "💡")
| To achieve even better accuracy, try merging multi-word tokens and | To achieve even better accuracy, try merging multi-word tokens and
| entities specific to your domain into one token before parsing your text. | entities specific to your domain into one token before parsing your text.
| You can do this by running the entity recognizer or | You can do this by running the entity recognizer or
@ -160,9 +157,10 @@ p
| #[strong empty model] using | #[strong empty model] using
| #[+api("spacy#blank") #[code spacy.blank]] with the ID of your | #[+api("spacy#blank") #[code spacy.blank]] with the ID of your
| language. If you're using a blank model, don't forget to add the | language. If you're using a blank model, don't forget to add the
| parser to the pipeline. If you're using an existing model, | custom parser to the pipeline. If you're using an existing model,
| make sure to disable all other pipeline components during training | make sure to #[strong remove the old parser] from the pipeline, and
| using #[+api("language#disable_pipes") #[code nlp.disable_pipes]]. | disable all other pipeline components during training using
| #[+api("language#disable_pipes") #[code nlp.disable_pipes]].
| This way, you'll only be training the parser. | This way, you'll only be training the parser.
+item +item
@ -170,19 +168,13 @@ p
| #[+api("dependencyparser#add_label") #[code add_label]] method. | #[+api("dependencyparser#add_label") #[code add_label]] method.
+item +item
| #[strong Shuffle and loop over] the examples and create a | #[strong Shuffle and loop over] the examples. For each example,
| #[code Doc] and #[code GoldParse] object for each example. Make sure | #[strong update the model] by calling
| to pass in the #[code heads] and #[code deps] when you create the | #[+api("language#update") #[code nlp.update]], which steps
| #[code GoldParse].
+item
| For each example, #[strong update the model]
| by calling #[+api("language#update") #[code nlp.update]], which steps
| through the words of the input. At each word, it makes a | through the words of the input. At each word, it makes a
| #[strong prediction]. It then consults the annotations provided on the | #[strong prediction]. It then consults the annotations to see whether
| #[code GoldParse] instance, to see whether it was | it was right. If it was wrong, it adjusts its weights so that the
| right. If it was wrong, it adjusts its weights so that the correct | correct action will score higher next time.
| action will score higher next time.
+item +item
| #[strong Save] the trained model using | #[strong Save] the trained model using

View File

@ -35,17 +35,18 @@ p
| be able to see results on each training iteration. | be able to see results on each training iteration.
+item +item
| #[strong Loop over] the training examples, partition them into | #[strong Loop over] the training examples and partition them into
| batches and create #[code Doc] and #[code GoldParse] objects for each | batches using spaCy's
| example in the batch. | #[+api("top-level#util.minibatch") #[code minibatch]] and
| #[+api("top-level#util.compounding") #[code compounding]] helpers.
+item +item
| #[strong Update the model] by calling | #[strong Update the model] by calling
| #[+api("language#update") #[code nlp.update]], which steps | #[+api("language#update") #[code nlp.update]], which steps
| through the examples and makes a #[strong prediction]. It then | through the examples and makes a #[strong prediction]. It then
| consults the annotations provided on the #[code GoldParse] instance, | consults the annotations to see whether it was right. If it was
| to see whether it was right. If it was wrong, it adjusts its weights | wrong, it adjusts its weights so that the correct prediction will
| so that the correct prediction will score higher next time. | score higher next time.
+item +item
| Optionally, you can also #[strong evaluate the text classifier] on | Optionally, you can also #[strong evaluate the text classifier] on

View File

@ -0,0 +1,135 @@
//- 💫 DOCS > USAGE > TRAINING > OPTIMIZATION TIPS AND ADVICE
p
| There are lots of conflicting "recipes" for training deep neural
| networks at the moment. The cutting-edge models take a very long time to
| train, so most researchers can't run enough experiments to figure out
| what's #[em really] going on. For what it's worth, here's a recipe that seems
| to work well on a lot of problems:
+code("Batch heuristic").
def get_batches(train_data, model_type):
max_batch_sizes = {'tagger': 32, 'parser': 16, 'ner': 16, 'textcat': 64}
max_batch_size = max_batch_sizes[model_type]
if len(train_data) < 1000:
max_batch_size /= 2
if len(train_data) < 500:
max_batch_size /= 2
batch_size = compounding(1, max_batch_size, 1.001)
batches = minibatch(train_data, size=batch_size)
return batches
p
| This will set the batch size to start at #[code 1], and increase each
| batch until it reaches a maximum size. The tagger, parser and entity
| recognizer all take whole sentences as input, so they're learning a lot
| of labels in a single example. You therefore need smaller batches for
| them. The batch size for the text categorizer should be somewhat larger,
| especially if your documents are long.
p
| The trick of increasing the batch size is starting to become quite
| popular (see #[+a("https://arxiv.org/abs/1711.00489") Smith et al., 2017]).
| Their recipe is quite different from how spaCy's models are being
| trained, but there are some similarities. In training the various spaCy
| models, we haven't found much advantage from decaying the learning
| rate but starting with a low batch size has definitely helped. You
| should try it out on your data, and see how you go.
+h(3, "tips-hyperparams") Learning rate, regularization and gradient clipping
p
| By default spaCy uses the Adam solver, with default settings
| (learning rate #[code 0.001], #[code beta1=0.9], #[code beta2=0.999]).
| Some researchers have said they found these settings terrible on their
| problems but they've always performed very well in training spaCy's
| models, in combination with the rest of our recipe. You can change these
| settings directly, by modifying the corresponding attributes on the
| #[code optimizer] object. You can also set environment variables, to
| adjust the defaults.
p
| There are two other key hyper-parameters of the solver: #[code L2]
| #[strong regularization], and #[strong gradient clipping]
| (#[code max_grad_norm]). Gradient clipping is a hack that's not discussed
| often, but everybody seems to be using. It's quite important in helping
| to ensure the network doesn't diverge, which is a fancy way of saying
| "fall over during training". The effect is sort of similar to setting the
| learning rate low. It can also compensate for a large batch size (this is
| a good example of how the choices of all these hyper-parameters
| intersect).
+h(3, "tips-dropout") Dropout rate
p
| For small datasets, it's useful to set a
| #[strong high dropout rate at first], and #[strong decay] it down towards
| a more reasonable value. This helps avoid the network immediately
| overfitting, while still encouraging it to learn some of the more
| interesting things in your data. spaCy comes with a
| #[+api("top-level#util.decaying") #[code decaying]] utility function to
| facilitate this. You might try setting:
+code.
from spacy.util import decaying
dropout = decaying(0.6, 0.2, 1e-4)
p
| You can then draw values from the iterator with #[code next(dropout)],
| which you would pass to the #[code drop] keyword argument of
| #[+api("language#update") #[code nlp.update]]. It's pretty much always a
| good idea to use at least #[strong some dropout]. All of the models
| currently use Bernoulli dropout, for no particularly principled reason –
| we just haven't experimented with another scheme like Gaussian dropout
| yet.
+h(3, "tips-param-avg") Parameter averaging
p
| The last part of our optimisation recipe is #[strong parameter averaging],
| an old trick introduced by
| #[+a("https://cseweb.ucsd.edu/~yfreund/papers/LargeMarginsUsingPerceptron.pdf") Freund and Schapire (1999)],
| popularised in the NLP community by
| #[+a("http://www.aclweb.org/anthology/P04-1015") Collins (2002)],
| explained in more detail by
| #[+a("http://leon.bottou.org/projects/sgd") Leon Bottou]. Just about the
| only other people who seem to be using this for neural network training
| are the SyntaxNet team (one of whom is Michael Collins) but it really
| seems to work great on every problem.
p
| The trick is to store the moving average of the weights during training.
| We don't optimise this average – we just track it. Then when we want to
| actually use the model, we use the averages, not the most recent value.
| In spaCy (and #[+a(gh("thinc")) Thinc]) this is done by using a
| context manager, #[+api("language#use_params") #[code use_params]], to
| temporarily replace the weights:
+code.
with nlp.use_params(optimizer.averages):
nlp.to_disk('/model')
p
| The context manager is handy because you naturally want to evaluate and
| save the model at various points during training (e.g. after each epoch).
| After evaluating and saving, the context manager will exit and the
| weights will be restored, so you resume training from the most recent
| value, rather than the average. By evaluating the model after each epoch,
| you can remove one hyper-parameter from consideration (the number of
| epochs). Having one less magic number to guess is extremely nice so
| having the averaging under a context manager is very convenient.
+h(3, "tips-transfer-learning") Transfer learning
p
| Finally, if you're training from a small data set, it's very useful to
| start off with some knowledge already in the model. #[strong Word vectors]
| are an easy and reliable way to do that, but depending on the
| application, you may also be able to start with useful knowledge from one
| of spaCy's #[+a("/models") pre-trained models], such as the parser,
| entity recogniser and tagger. If you're adapting a pre-trained model and
| you want it to retain accuracy on the tasks it was originally trained
| for, you should consider the "catastrophic forgetting" problem.
| #[+a("https://explosion.ai/blog/pseudo-rehearsal-catastrophic-forgetting", true) See this blog post]
| to read more about the problem and our suggested solution,
| pseudo-rehearsal.

View File

@ -110,17 +110,23 @@ p
| spaCy when to #[em stop], you can now explicitly call | spaCy when to #[em stop], you can now explicitly call
| #[+api("language#begin_training") #[code begin_training]], which | #[+api("language#begin_training") #[code begin_training]], which
| returns an optimizer you can pass into the | returns an optimizer you can pass into the
| #[+api("language#update") #[code update]] function. | #[+api("language#update") #[code update]] function. While #[code update]
| still accepts sequences of #[code Doc] and #[code GoldParse] objects,
| you can now also pass in a list of strings and dictionaries describing
| the annotations. This is the recommended usage, as it removes one layer
| of abstraction from the training.
+code-new. +code-new.
optimizer = nlp.begin_training() optimizer = nlp.begin_training()
for itn in range(1000): for itn in range(1000):
for doc, gold in train_data: for texts, annotations in train_data:
nlp.update([doc], [gold], sgd=optimizer) nlp.update(texts, annotations, sgd=optimizer)
nlp.to_disk('/model') nlp.to_disk('/model')
+code-old. +code-old.
for itn in range(1000): for itn in range(1000):
for doc, gold in train_data: for text, entities in train_data:
doc = Doc(text)
gold = GoldParse(doc, entities=entities)
nlp.update(doc, gold) nlp.update(doc, gold)
nlp.end_training() nlp.end_training()
nlp.save_to_directory('/model') nlp.save_to_directory('/model')

View File

@ -165,18 +165,15 @@ include ../_includes/_mixins
+h(3, "keras") Text classification with Keras +h(3, "keras") Text classification with Keras
p p
| In this example, we're using spaCy to pre-process text for use with | This example shows how to use a #[+a("https://keras.io") Keras]
| a #[+a("https://keras.io") Keras] text classification model. | LSTM sentiment classification model in spaCy. spaCy splits
| the document into sentences, and each sentence is classified using
| the LSTM. The scores for the sentences are then aggregated to give
| the document score. This kind of hierarchical model is quite
| difficult in "pure" Keras or Tensorflow, but it's very effective.
| The Keras example on this dataset performs quite poorly, because it
| cuts off the documents so that they're a fixed size. This hurts
| review accuracy a lot, because people often summarise their rating
| in the final sentence.
+github("spacy", "examples/deep_learning_keras.py") +github("spacy", "examples/deep_learning_keras.py")
+h(3, "keras-parikh-entailment") A decomposable attention model for Natural Language Inference
p
| This example contains an implementation of the entailment prediction
| model described by #[+a("https://arxiv.org/pdf/1606.01933.pdf") Parikh et al. (2016)].
| The model is notable for its competitive performance with very few
| parameters, and was implemented using #[+a("https://keras.io") Keras]
| and spaCy.
+github("spacy", "examples/keras_parikh_entailment/__main__.py", false, "examples/keras_parikh_entailment")

View File

@ -20,14 +20,14 @@ p
+h(2, "tagger-parser") Training the tagger and parser +h(2, "tagger-parser") Training the tagger and parser
include _training/_tagger-parser include _training/_tagger-parser
+section("similarity")
+h(2, "similarity") Training a similarity model
include _training/_similarity
+section("textcat") +section("textcat")
+h(2, "textcat") Training a text classification model +h(2, "textcat") Training a text classification model
include _training/_textcat include _training/_textcat
+section("tips")
+h(2, "tips") Optimization tips and advice
include _training/_tips
+section("saving-loading") +section("saving-loading")
+h(2, "saving-loading") Saving and loading models +h(2, "saving-loading") Saving and loading models
include _training/_saving-loading include _training/_saving-loading