diff --git a/website/usage/_data.json b/website/usage/_data.json
index fa7a1fcd2..193787d91 100644
--- a/website/usage/_data.json
+++ b/website/usage/_data.json
@@ -150,6 +150,7 @@
             "Tagger & Parser": "tagger-parser",
             "Similarity": "similarity",
             "Text Classification": "textcat",
+            "Tips and Advice": "tips",
             "Saving & Loading": "saving-loading"
         }
     },
diff --git a/website/usage/_training/_basics.jade b/website/usage/_training/_basics.jade
index 6ac1477e8..abacfc839 100644
--- a/website/usage/_training/_basics.jade
+++ b/website/usage/_training/_basics.jade
@@ -206,6 +206,10 @@ p
     | use the #[+api("cli#package") #[code package]] command to generate an
     | installable Python package from your model.

++code(false, "bash").
+    spacy convert /tmp/train.conllu /tmp/data
+    spacy train en /tmp/model /tmp/data/train.json -n 5
+
 +h(3, "training-simple-style") Simple training style
     +tag-new(2)
diff --git a/website/usage/_training/_ner.jade b/website/usage/_training/_ner.jade
index b8ef43916..383db0f29 100644
--- a/website/usage/_training/_ner.jade
+++ b/website/usage/_training/_ner.jade
@@ -2,12 +2,9 @@

 p
     | All #[+a("/models") spaCy models] support online learning, so
-    | you can update a pre-trained model with new examples. To update the
-    | model, you first need to create an instance of
-    | #[+api("goldparse") #[code GoldParse]], with the entity labels
-    | you want to learn. You'll usually need to provide many examples to
-    | meaningfully improve the system — a few hundred is a good start, although
-    | more is better.
+    | you can update a pre-trained model with new examples. You'll usually
+    | need to provide many #[strong examples] to meaningfully improve the
+    | system — a few hundred is a good start, although more is better.

 p
     | You should avoid iterating over the same few examples multiple times, or
@@ -21,7 +18,7 @@ p
     | the model of other examples by augmenting your annotations with sentences
     | annotated with entities automatically recognised by the original model.
     | Ultimately, this is an empirical process: you'll need to
-    | #[strong experiment on your own data] to find a solution that works best
+    | #[strong experiment on your data] to find a solution that works best
     | for you.

 +h(3, "example-train-ner") Updating the Named Entity Recognizer
diff --git a/website/usage/_training/_saving-loading.jade b/website/usage/_training/_saving-loading.jade
index e6e54385c..885b3ad24 100644
--- a/website/usage/_training/_saving-loading.jade
+++ b/website/usage/_training/_saving-loading.jade
@@ -41,7 +41,7 @@ p
         "author": "You",
         "email": "you@example.com",
         "license": "CC BY-SA 3.0",
-        "pipeline": ["token_vectors", "tagger"]
+        "pipeline": ["tagger", "parser", "ner"]
     }

 +code(false, "bash").
@@ -94,26 +94,13 @@ p
     | The #[code load()] method that comes with our model package
     | templates will take care of putting all this together and returning a
     | #[code Language] object with the loaded pipeline and data. If your model
-    | requires custom pipeline components, you should
-    | #[strong ship then with your model] and register their
-    | #[+a("/usage/processing-pipelines#creating-factory") factories]
-    | via #[+api("spacy#set_factory") #[code set_factory()]].
-
-+aside-code("Factory example").
-    def my_factory(vocab):
-        # load some state
-        def my_component(doc):
-            # process the doc
-            return doc
-        return my_component
-
-+code.
-    spacy.set_factory('custom_component', custom_component_factory)
-
-+infobox("Custom models with pipeline components")
-    | For more details and an example of how to package a sentiment model
-    | with a custom pipeline component, see the usage guide on
-    | #[+a("/usage/processing-pipelines#example2") language processing pipelines].
+    | requires custom #[+a("/usage/processing-pipelines") pipeline components]
+    | or a custom language class, you can also
+    | #[strong ship the code with your model]. For examples of this, check out
+    | the implementations of spaCy's
+    | #[+api("util#load_model_from_init_py") #[code load_model_from_init_py]]
+    | and #[+api("util#load_model_from_path") #[code load_model_from_path]]
+    | utility functions.

 +h(3, "models-building") Building the model package

@@ -155,8 +142,7 @@ p
     | #[+api("language#from_disk") #[code from_disk]] instead.

 +code.
-    from spacy.lang.en import English
-    nlp = English().from_disk('/path/to/data')
+    nlp = spacy.blank('en').from_disk('/path/to/data')

 +infobox("Important note: Loading data in v2.x")
     .o-block
@@ -168,7 +154,7 @@ p
         | spaCy v2.0 solves this with a clear distinction between setting up
         | the instance and loading the data.

-    +code-new nlp = English().from_disk('/path/to/data')
+    +code-new nlp = spacy.blank('en').from_disk('/path/to/data')
     +code-old nlp = spacy.load('en', path='/path/to/data')

 +h(3, "example-training-spacy") Example: How we're training and packaging models for spaCy
diff --git a/website/usage/_training/_tips.jade b/website/usage/_training/_tips.jade
new file mode 100644
index 000000000..942c8de8d
--- /dev/null
+++ b/website/usage/_training/_tips.jade
@@ -0,0 +1,135 @@
+//- 💫 DOCS > USAGE > TRAINING > OPTIMIZATION TIPS AND ADVICE
+
+p
+    | There are lots of conflicting "recipes" for training deep neural
+    | networks at the moment. The cutting-edge models take a very long time to
+    | train, so most researchers can't run enough experiments to figure out
+    | what's #[em really] going on. For what it's worth, here's a recipe that
+    | seems to work well on a lot of problems:
+
++code("Batch heuristic").
+    from spacy.util import minibatch, compounding
+
+    def get_batches(train_data, model_type):
+        max_batch_sizes = {'tagger': 32, 'parser': 16, 'ner': 16, 'textcat': 64}
+        max_batch_size = max_batch_sizes[model_type]
+        if len(train_data) < 1000:
+            max_batch_size /= 2
+        if len(train_data) < 500:
+            max_batch_size /= 2
+        batch_size = compounding(1, max_batch_size, 1.001)
+        batches = minibatch(train_data, size=batch_size)
+        return batches
+
+p
+    | This will set the batch size to start at #[code 1], and increase each
+    | batch until it reaches a maximum size. The tagger, parser and entity
+    | recognizer all take whole sentences as input, so they're learning a lot
+    | of labels in a single example. You therefore need smaller batches for
+    | them. The batch size for the text categorizer should be somewhat larger,
+    | especially if your documents are long.
+
+p
+    | The trick of increasing the batch size is starting to become quite
+    | popular (see #[+a("https://arxiv.org/abs/1711.00489") Smith et al., 2017]).
+    | Their recipe is quite different from how spaCy's models are being
+    | trained, but there are some similarities. In training the various spaCy
+    | models, we haven't found much advantage from decaying the learning
+    | rate – but starting with a low batch size has definitely helped. You
+    | should try it out on your data, and see how you go.
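+
+p
+    | To see how this fits together, here's a rough sketch of a training loop
+    | that uses the batch heuristic above. It assumes #[code train_data] is a
+    | list of #[code (text, annotations)] tuples in the simple training style,
+    | and that the pipeline already contains the component you're updating;
+    | the model name is only a placeholder.
+
++code("Using the batch heuristic (sketch)").
+    import random
+    import spacy
+
+    nlp = spacy.load('en_core_web_sm')  # placeholder: any pipeline with an NER
+    optimizer = nlp.begin_training()
+    for epoch in range(10):
+        random.shuffle(train_data)
+        losses = {}
+        for batch in get_batches(train_data, 'ner'):
+            texts, annotations = zip(*batch)
+            nlp.update(texts, annotations, sgd=optimizer, drop=0.35,
+                       losses=losses)
+        print(losses)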
+
++h(3, "tips-hyperparams") Learning rate, regularization and gradient clipping
+
+p
+    | By default spaCy uses the Adam solver, with default settings
+    | (learning rate #[code 0.001], #[code beta1=0.9], #[code beta2=0.999]).
+    | Some researchers have said they found these settings terrible on their
+    | problems – but they've always performed very well in training spaCy's
+    | models, in combination with the rest of our recipe. You can change these
+    | settings directly, by modifying the corresponding attributes on the
+    | #[code optimizer] object. You can also set environment variables, to
+    | adjust the defaults.
+
+p
+    | There are two other key hyper-parameters of the solver: #[code L2]
+    | #[strong regularization], and #[strong gradient clipping]
+    | (#[code max_grad_norm]). Gradient clipping is a hack that's not discussed
+    | often, but that everybody seems to be using. It's quite important in
+    | helping to ensure the network doesn't diverge, which is a fancy way of
+    | saying "fall over during training". The effect is sort of similar to
+    | setting the learning rate low. It can also compensate for a large batch
+    | size (this is a good example of how the choices of all these
+    | hyper-parameters intersect).
+
++h(3, "tips-dropout") Dropout rate
+
+p
+    | For small datasets, it's useful to set a
+    | #[strong high dropout rate at first], and #[strong decay] it down towards
+    | a more reasonable value. This helps avoid the network immediately
+    | overfitting, while still encouraging it to learn some of the more
+    | interesting things in your data. spaCy comes with a
+    | #[+api("top-level#util.decaying") #[code decaying]] utility function to
+    | facilitate this. You might try setting:
+
++code.
+    from spacy.util import decaying
+    dropout = decaying(0.6, 0.2, 1e-4)
+
+p
+    | You can then draw values from the iterator with #[code next(dropout)],
+    | which you would pass to the #[code drop] keyword argument of
+    | #[+api("language#update") #[code nlp.update]]. It's pretty much always a
+    | good idea to use at least #[strong some dropout]. All of the models
+    | currently use Bernoulli dropout, for no particularly principled reason –
+    | we just haven't experimented with another scheme like Gaussian dropout
+    | yet.
+
++h(3, "tips-param-avg") Parameter averaging
+
+p
+    | The last part of our optimisation recipe is #[strong parameter averaging],
+    | an old trick introduced by
+    | #[+a("https://cseweb.ucsd.edu/~yfreund/papers/LargeMarginsUsingPerceptron.pdf") Freund and Schapire (1999)],
+    | popularised in the NLP community by
+    | #[+a("http://www.aclweb.org/anthology/P04-1015") Collins (2002)],
+    | and explained in more detail by
+    | #[+a("http://leon.bottou.org/projects/sgd") Leon Bottou]. Just about the
+    | only other people who seem to be using this for neural network training
+    | are the SyntaxNet team (one of whom is Michael Collins) – but it really
+    | seems to work great on every problem.
+
+p
+    | The trick is to store the moving average of the weights during training.
+    | We don't optimise this average – we just track it. Then when we want to
+    | actually use the model, we use the averages, not the most recent value.
+    | In spaCy (and #[+a(gh("thinc")) Thinc]) this is done by using a
+    | context manager, #[+api("language#use_params") #[code use_params]], to
+    | temporarily replace the weights:
+
++code.
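+    # Temporarily swap in the averaged weights, so the saved model uses the
+    # running averages rather than the most recent parameter values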
+    with nlp.use_params(optimizer.averages):
+        nlp.to_disk('/model')
+
+p
+    | The context manager is handy because you naturally want to evaluate and
+    | save the model at various points during training (e.g. after each epoch).
+    | After evaluating and saving, the context manager will exit and the
+    | weights will be restored, so you resume training from the most recent
+    | value, rather than the average. By evaluating the model after each epoch,
+    | you can remove one hyper-parameter from consideration (the number of
+    | epochs). Having one less magic number to guess is extremely nice – so
+    | having the averaging under a context manager is very convenient.
+
++h(3, "tips-transfer-learning") Transfer learning
+
+p
+    | Finally, if you're training from a small data set, it's very useful to
+    | start off with some knowledge already in the model. #[strong Word vectors]
+    | are an easy and reliable way to do that, but depending on the
+    | application, you may also be able to start with useful knowledge from one
+    | of spaCy's #[+a("/models") pre-trained models], such as the parser,
+    | entity recogniser and tagger. If you're adapting a pre-trained model and
+    | you want it to retain accuracy on the tasks it was originally trained
+    | for, you should consider the "catastrophic forgetting" problem.
+    | #[+a("https://explosion.ai/blog/pseudo-rehearsal-catastrophic-forgetting", true) See this blog post]
+    | to read more about the problem and our suggested solution,
+    | pseudo-rehearsal.
diff --git a/website/usage/training.jade b/website/usage/training.jade
index 8f15668c4..ec8401fb1 100644
--- a/website/usage/training.jade
+++ b/website/usage/training.jade
@@ -28,6 +28,10 @@ p
     +h(2, "textcat") Training a text classification model
     include _training/_textcat

++section("tips")
+    +h(2, "tips") Optimization tips and advice
+    include _training/_tips
+
 +section("saving-loading")
     +h(2, "saving-loading") Saving and loading models
     include _training/_saving-loading