diff --git a/spacy/about.py b/spacy/about.py
index 38e934374..aa42ae05d 100644
--- a/spacy/about.py
+++ b/spacy/about.py
@@ -3,11 +3,11 @@
# https://github.com/pypa/warehouse/blob/master/warehouse/__about__.py
__title__ = 'spacy'
-__version__ = '1.8.2'
+__version__ = '2.0.0'
__summary__ = 'Industrial-strength Natural Language Processing (NLP) with Python and Cython'
__uri__ = 'https://spacy.io'
-__author__ = 'Matthew Honnibal'
-__email__ = 'matt@explosion.ai'
+__author__ = 'Explosion AI'
+__email__ = 'contact@explosion.ai'
__license__ = 'MIT'
__docs_models__ = 'https://spacy.io/docs/usage/models'
diff --git a/website/_includes/_mixins.jade b/website/_includes/_mixins.jade
index 05e64b0fa..ce8bfad4e 100644
--- a/website/_includes/_mixins.jade
+++ b/website/_includes/_mixins.jade
@@ -382,3 +382,4 @@ mixin annotation-row(annots, style)
+cell #[code=cell]
else
+cell=cell
+ block
diff --git a/website/assets/img/docs/training-loop.svg b/website/assets/img/docs/training-loop.svg
new file mode 100644
index 000000000..c0acd10cf
--- /dev/null
+++ b/website/assets/img/docs/training-loop.svg
@@ -0,0 +1,40 @@
+
diff --git a/website/assets/img/docs/training.svg b/website/assets/img/docs/training.svg
new file mode 100644
index 000000000..cd6b74f04
--- /dev/null
+++ b/website/assets/img/docs/training.svg
@@ -0,0 +1,47 @@
+
diff --git a/website/docs/api/language.jade b/website/docs/api/language.jade
index 9e45a89d9..9c26f506c 100644
--- a/website/docs/api/language.jade
+++ b/website/docs/api/language.jade
@@ -141,10 +141,10 @@ p
p Update the models in the pipeline.
+aside-code("Example").
- with nlp.begin_training(gold, use_gpu=True) as (trainer, optimizer):
- for epoch in trainer.epochs(gold):
- for docs, golds in epoch:
- state = nlp.update(docs, golds, sgd=optimizer)
+ for raw_text, entity_offsets in train_data:
+     doc = nlp.make_doc(raw_text)
+     gold = GoldParse(doc, entities=entity_offsets)
+     nlp.update([doc], [gold], drop=0.5, sgd=optimizer)
+table(["Name", "Type", "Description"])
+row
@@ -173,17 +173,13 @@ p Update the models in the pipeline.
+cell Results from the update.
+h(2, "begin_training") Language.begin_training
- +tag contextmanager
+ +tag method
p
- | Allocate models, pre-process training data and acquire a trainer and
- | optimizer. Used as a contextmanager.
+ | Allocate models, pre-process training data and acquire an optimizer.
+aside-code("Example").
- with nlp.begin_training(gold, use_gpu=True) as (trainer, optimizer):
- for epoch in trainer.epochs(gold):
- for docs, golds in epoch:
- state = nlp.update(docs, golds, sgd=optimizer)
+ optimizer = nlp.begin_training(gold_tuples)
+table(["Name", "Type", "Description"])
+row
@@ -199,7 +195,7 @@ p
+footrow
- +cell yields
- +cell tuple
+ +cell returns
+ +cell callable
- +cell A trainer and an optimizer.
+ +cell An optimizer.
+h(2, "use_params") Language.use_params
+tag contextmanager
diff --git a/website/docs/usage/_spacy-101/_serialization.jade b/website/docs/usage/_spacy-101/_serialization.jade
index 5620a6151..27804344e 100644
--- a/website/docs/usage/_spacy-101/_serialization.jade
+++ b/website/docs/usage/_spacy-101/_serialization.jade
@@ -1,12 +1,12 @@
//- 💫 DOCS > USAGE > SPACY 101 > SERIALIZATION
p
- | If you've been modifying the pipeline, vocabulary vectors and entities, or made
- | updates to the model, you'll eventually want
- | to #[strong save your progress] – for example, everything that's in your #[code nlp]
- | object. This means you'll have to translate its contents and structure
- | into a format that can be saved, like a file or a byte string. This
- | process is called serialization. spaCy comes with
+ | If you've been modifying the pipeline, vocabulary, vectors and entities,
+ | or made updates to the model, you'll eventually want to
+ | #[strong save your progress] – for example, everything that's in your
+ | #[code nlp] object. This means you'll have to translate its contents and
+ | structure into a format that can be saved, like a file or a byte string.
+ | This process is called serialization. spaCy comes with
| #[strong built-in serialization methods] and supports the
| #[+a("http://www.diveintopython3.net/serializing.html#dump") Pickle protocol].
@@ -45,11 +45,7 @@ p
| #[code Vocab] holds the context-independent information about the words,
| tags and labels, and their #[strong hash values]. If the #[code Vocab]
| wasn't saved with the #[code Doc], spaCy wouldn't know how to resolve
- | those IDs – for example, the word text or the dependency labels. You
- | might be saving #[code 446] for "whale", but in a different vocabulary,
- | this ID could map to "VERB". Similarly, if your document was processed by
- | a German model, its vocab will include the specific
- | #[+a("/docs/api/annotation#dependency-parsing-german") German dependency labels].
+ | those IDs back to strings.
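+
++p
+ | For example – a minimal sketch, assuming an English model is loaded as
+ | #[code nlp] – the vocabulary's string store can translate between strings
+ | and hash values in both directions:
+
++code.
+ doc = nlp(u'I love coffee')
+ coffee_hash = nlp.vocab.strings[u'coffee']    # look up the hash for a string
+ coffee_text = nlp.vocab.strings[coffee_hash]  # resolve the hash back to a string
+ assert doc[2].orth == coffee_hash             # the token only stores the hash
+ assert doc[2].text == coffee_text             # the text is resolved via the vocab
+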
+code.
moby_dick = open('moby_dick.txt', 'r') # open a large document
diff --git a/website/docs/usage/_spacy-101/_training.jade b/website/docs/usage/_spacy-101/_training.jade
index f4a0c7194..9b283c0eb 100644
--- a/website/docs/usage/_spacy-101/_training.jade
+++ b/website/docs/usage/_spacy-101/_training.jade
@@ -1,3 +1,52 @@
//- 💫 DOCS > USAGE > SPACY 101 > TRAINING
-+under-construction
+p
+ | spaCy's models are #[strong statistical] and every "decision" they make –
+ | for example, which part-of-speech tag to assign, or whether a word is a
+ | named entity – is a #[strong prediction]. This prediction is based
+ | on the examples the model has seen during #[strong training]. To train
+ | a model, you first need training data – examples of text, and the
+ | labels you want the model to predict. This could be a part-of-speech tag,
+ | a named entity or any other information.
+
+p
+ | The model is then shown the unlabelled text and will make a prediction.
+ | Because we know the correct answer, we can give the model feedback on its
+ | prediction in the form of an #[strong error gradient] of the
+ | #[strong loss function] that calculates the difference between the model's
+ | prediction and the expected output. The greater the difference, the more
+ | significant the gradient and the updates to our model.
+
++aside
+ | #[strong Training data:] Examples and their annotations.#[br]
+ | #[strong Text:] The input text the model should predict a label for.#[br]
+ | #[strong Label:] The label the model should predict.#[br]
+ | #[strong Gradient:] Gradient of the loss function calculating the
+ | difference between the model's prediction and the expected output.
+
++image
+ include ../../../assets/img/docs/training.svg
+ .u-text-right
+ +button("/assets/img/docs/training.svg", false, "secondary").u-text-tag View large graphic
+
+p
+ | When training a model, we don't just want it to memorise our examples –
+ | we want it to come up with a theory that can be
+ | #[strong generalised across other examples]. After all, we don't just want
+ | the model to learn that this one instance of "Amazon" right here is a
+ | company – we want it to learn that "Amazon", in contexts #[em like this],
+ | is most likely a company. That's why the training data should always be
+ | representative of the data we want to process. A model trained on
+ | Wikipedia, where sentences in the first person are extremely rare, will
+ | likely perform badly on Twitter. Similarly, a model trained on romantic
+ | novels will likely perform badly on legal text.
+
+p
+ | This also means that in order to know how the model is performing,
+ | and whether it's learning the right things, you don't only need
+ | #[strong training data] – you'll also need #[strong evaluation data]. If
+ | you only test the model with the data it was trained on, you'll have no
+ | idea how well it's generalising. If you want to train a model from scratch,
+ | you usually need at least a few hundred examples for both training and
+ | evaluation. To update an existing model, you can already achieve decent
+ | results with very few examples – as long as they're representative.
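+
+p
+ | A simple way to get there – just a sketch, assuming #[code examples] is a
+ | list of annotated #[code (text, annotations)] pairs – is to shuffle the
+ | examples once and hold out a portion for evaluation:
+
++code.
+ import random
+
+ random.shuffle(examples)          # shuffle once so the split isn't biased
+ split = int(len(examples) * 0.8)  # e.g. 80% for training, 20% for evaluation
+ train_data = examples[:split]
+ eval_data = examples[split:]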
diff --git a/website/docs/usage/entity-recognition.jade b/website/docs/usage/entity-recognition.jade
index f33ef70df..7fd0a6d37 100644
--- a/website/docs/usage/entity-recognition.jade
+++ b/website/docs/usage/entity-recognition.jade
@@ -154,40 +154,29 @@ p
| To provide training examples to the entity recogniser, you'll first need
| to create an instance of the #[+api("goldparse") #[code GoldParse]] class.
| You can specify your annotations in a stand-off format or as token tags.
-
-+code.
- import random
- import spacy
- from spacy.gold import GoldParse
- from spacy.pipeline import EntityRecognizer
-
- train_data = [('Who is Chaka Khan?', [(7, 17, 'PERSON')]),
- ('I like London and Berlin.', [(7, 13, 'LOC'), (18, 24, 'LOC')])]
-
- nlp = spacy.load('en', entity=False, parser=False)
- ner = EntityRecognizer(nlp.vocab, entity_types=['PERSON', 'LOC'])
-
- for itn in range(5):
- random.shuffle(train_data)
- for raw_text, entity_offsets in train_data:
- doc = nlp.make_doc(raw_text)
- gold = GoldParse(doc, entities=entity_offsets)
-
- nlp.tagger(doc)
- ner.update(doc, gold)
-
-p
- | If a character offset in your entity annotations don't fall on a token
+ | If a character offset in your entity annotations doesn't fall on a token
| boundary, the #[code GoldParse] class will treat that annotation as a
| missing value. This allows for more realistic training, because the
| entity recogniser is allowed to learn from examples that may feature
| tokenizer errors.
-+aside-code("Example").
++code.
+ train_data = [('Who is Chaka Khan?', [(7, 17, 'PERSON')]),
+ ('I like London and Berlin.', [(7, 13, 'LOC'), (18, 24, 'LOC')])]
+
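+p
+ | To illustrate the point about token boundaries – a purely hypothetical
+ | sketch, reusing the #[code nlp] object and #[code GoldParse] from the
+ | examples above – an offset that ends inside a token is simply treated as
+ | a missing value:
+
++code.
+ # (10, 16) ends inside the token "Francisco", so it doesn't map onto a
+ # token boundary and the annotation is treated as a missing value
+ doc = nlp.make_doc(u'I flew to San Francisco')
+ gold = GoldParse(doc, entities=[(10, 16, 'GPE')])
+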
++code.
doc = Doc(nlp.vocab, [u'rats', u'make', u'good', u'pets'])
gold = GoldParse(doc, [u'U-ANIMAL', u'O', u'O', u'O'])
- ner = EntityRecognizer(nlp.vocab, entity_types=['ANIMAL'])
- ner.update(doc, gold)
+
++infobox
+ | For more details on #[strong training and updating] the named entity
+ | recognizer, see the usage guides on #[+a("/docs/usage/training") training]
+ | and #[+a("/docs/usage/training-ner") training the named entity recognizer],
+ | or check out the runnable
+ | #[+src(gh("spaCy", "examples/training/train_ner.py")) training script]
+ | on GitHub.
+
++h(3, "updating-biluo") The BILUO Scheme
p
| You can also provide token-level entity annotation, using the
diff --git a/website/docs/usage/spacy-101.jade b/website/docs/usage/spacy-101.jade
index e1300b5b0..55e7a030a 100644
--- a/website/docs/usage/spacy-101.jade
+++ b/website/docs/usage/spacy-101.jade
@@ -252,6 +252,12 @@ include _spacy-101/_serialization
include _spacy-101/_training
++infobox
+ | To learn more about #[strong training and updating] models, how to create
+ | training data and how to improve spaCy's named entity recognition models,
+ | see the usage guides on #[+a("/docs/usage/training") training] and
+ | #[+a("/docs/usage/training-ner") training the named entity recognizer].
+
+h(2, "architecture") Architecture
+under-construction
diff --git a/website/docs/usage/training-ner.jade b/website/docs/usage/training-ner.jade
index 5a0c06462..500bb24ff 100644
--- a/website/docs/usage/training-ner.jade
+++ b/website/docs/usage/training-ner.jade
@@ -8,6 +8,8 @@ p
| particularly useful as a "quick and dirty solution", if you have only a
| few corrections or annotations.
++under-construction
+
+h(2, "improving-accuracy") Improving accuracy on existing entity types
p
@@ -15,16 +17,7 @@ p
| #[+api("goldparse") #[code spacy.gold.GoldParse]], with the entity labels
| you want to learn. You will then pass this instance to the
| #[+api("entityrecognizer#update") #[code EntityRecognizer.update()]]
- | method. For example:
-
-+code.
- import spacy
- from spacy.gold import GoldParse
-
- nlp = spacy.load('en')
- doc = nlp.make_doc(u'Facebook released React in 2014')
- gold = GoldParse(doc, entities=['U-ORG', 'O', 'U-TECHNOLOGY', 'O', 'U-DATE'])
- nlp.entity.update(doc, gold)
+ | method.
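+
+p
+ | As a rough sketch – assuming #[code train_data] holds
+ | #[code (text, entity_offsets)] pairs, and reusing the
+ | #[code nlp.update] loop from the #[+a("/docs/usage/training") training guide],
+ | which updates the entity recognizer along with the rest of the pipeline –
+ | this could look something like:
+
++code.
+ import random
+ from spacy.gold import GoldParse
+
+ optimizer = nlp.begin_training(get_data)  # get_data as described in the training guide
+ for itn in range(20):
+     random.shuffle(train_data)
+     for raw_text, entity_offsets in train_data:
+         doc = nlp.make_doc(raw_text)
+         gold = GoldParse(doc, entities=entity_offsets)
+         nlp.update([doc], [gold], drop=0.5, sgd=optimizer)
+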
p
| You'll usually need to provide many examples to meaningfully improve the
@@ -44,100 +37,6 @@ p
| #[strong experiment on your own data] to find a solution that works best
| for you.
-+h(2, "adding") Adding a new entity type
-
-p
- | You can add new entity types to an existing model. Let's say we want to
- | recognise the category #[code TECHNOLOGY]. The new category will include
- | programming languages, frameworks and platforms. First, we need to
- | register the new entity type:
-
-+code.
- nlp.entity.add_label('TECHNOLOGY')
-
-p
- | Next, iterate over your examples, calling #[code entity.update()]. As
- | above, we want to avoid iterating over only a small number of sentences.
- | A useful compromise is to run the model over a number of plain-text
- | sentences, and pass the entities to #[code GoldParse], as "true"
- | annotations. This encourages the optimizer to find a solution that
- | predicts the new category with minimal difference from the previous
- | output.
-
-+h(2, "example") Example: Adding and training an #[code ANIMAL] entity
-
-+under-construction
-
-p
- | This script shows how to add a new entity type to an existing pre-trained
- | NER model. To keep the example short and simple, only four sentences are
- | provided as examples. In practice, you'll need many more —
- | #[strong a few hundred] would be a good start. You will also likely need
- | to mix in #[strong examples of other entity types], which might be
- | obtained by running the entity recognizer over unlabelled sentences, and
- | adding their annotations to the training set.
-
-p
- | For the full, runnable script of this example, see
- | #[+src(gh("spacy", "examples/training/train_new_entity_type.py")) train_new_entity_type.py].
-
-+code("Training the entity recognizer").
- import spacy
- from spacy.pipeline import EntityRecognizer
- from spacy.gold import GoldParse
- from spacy.tagger import Tagger
- import random
-
- model_name = 'en'
- entity_label = 'ANIMAL'
- output_directory = '/path/to/model'
- train_data = [
- ("Horses are too tall and they pretend to care about your feelings",
- [(0, 6, 'ANIMAL')]),
- ("horses are too tall and they pretend to care about your feelings",
- [(0, 6, 'ANIMAL')]),
- ("horses pretend to care about your feelings",
- [(0, 6, 'ANIMAL')]),
- ("they pretend to care about your feelings, those horses",
- [(48, 54, 'ANIMAL')])
- ]
-
- nlp = spacy.load(model_name)
- nlp.entity.add_label(entity_label)
- ner = train_ner(nlp, train_data, output_directory)
-
- def train_ner(nlp, train_data, output_dir):
- # Add new words to vocab
- for raw_text, _ in train_data:
- doc = nlp.make_doc(raw_text)
- for word in doc:
- _ = nlp.vocab[word.orth]
-
- for itn in range(20):
- random.shuffle(train_data)
- for raw_text, entity_offsets in train_data:
- gold = GoldParse(doc, entities=entity_offsets)
- doc = nlp.make_doc(raw_text)
- nlp.tagger(doc)
- loss = nlp.entity.update(doc, gold)
- nlp.save_to_directory(output_dir)
-
-p
- +button(gh("spaCy", "examples/training/train_new_entity_type.py"), false, "secondary") Full example
-
-p
- | The actual training is performed by looping over the examples, and
- | calling #[code nlp.entity.update()]. The #[code update()] method steps
- | through the words of the input. At each word, it makes a prediction. It
- | then consults the annotations provided on the #[code GoldParse] instance,
- | to see whether it was right. If it was wrong, it adjusts its weights so
- | that the correct action will score higher next time.
-
-p
- | After training your model, you can
- | #[+a("/docs/usage/saving-loading") save it to a directory]. We recommend
- | wrapping models as Python packages, for ease of deployment.
-
+h(2, "saving-loading") Saving and loading
p
diff --git a/website/docs/usage/training.jade b/website/docs/usage/training.jade
index cff51d250..c1a7c1835 100644
--- a/website/docs/usage/training.jade
+++ b/website/docs/usage/training.jade
@@ -10,68 +10,193 @@ p
include _spacy-101/_training
-+h(2, "train-pos-tagger") Training the part-of-speech tagger
++h(3, "training-data") How do I get training data?
+
+p
+ | Collecting training data may sound incredibly painful – and it can be,
+ | if you're planning a large-scale annotation project. However, if your main
+ | goal is to update an existing model's predictions – for example, spaCy's
+ | named entity recognition – the hard part is usually not creating the
+ | actual annotations. It's finding representative examples and
+ | #[strong extracting potential candidates]. The good news is, if you've
+ | been noticing bad performance on your data, you likely
+ | already have some relevant text, and you can use spaCy to
+ | #[strong bootstrap a first set of training examples]. For example,
+ | after processing a few sentences, you may end up with the following
+ | entities, some correct, some incorrect.
+
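+p
+ | One quick way to produce such a list – a rough sketch, assuming a loaded
+ | #[code nlp] model and a handful of example #[code texts] – is to collect
+ | the model's entity predictions together with their character offsets:
+
++code.
+ for doc in nlp.pipe(texts):
+     for ent in doc.ents:
+         print(doc.text, ent.text, ent.start_char, ent.end_char, ent.label_)
+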
++aside("How many examples do I need?")
+ | As a rule of thumb, you should allocate at least 10% of your project
+ | resources to creating training and evaluation data. If you're looking to
+ | improve an existing model, you might be able to start off with only a
+ | handful of examples. Keep in mind that you'll always want a lot more than
+ | that for #[strong evaluation] – especially previous errors the model has
+ | made. Otherwise, you won't be able to sufficiently verify that the model
+ | has actually made the #[strong correct generalisations] required for your
+ | use case.
+
++table(["Text", "Entity", "Start", "End", "Label", ""])
+ - var style = [0, 0, 1, 1, 1]
+ +annotation-row(["Uber blew through $1 million a week", "Uber", 0, 4, "ORG"], style)
+ +cell #[+procon("pro")]
+ +annotation-row(["Android Pay expands to Canada", "Android", 0, 7, "PERSON"], style)
+ +cell #[+procon("con")]
+ +annotation-row(["Android Pay expands to Canada", "Canada", 23, 30, "GPE"], style)
+ +cell #[+procon("pro")]
+ +annotation-row(["Spotify steps up Asia expansion", "Spotify", 0, 8, "ORG"], style)
+ +cell #[+procon("pro")]
+ +annotation-row(["Spotify steps up Asia expansion", "Asia", 17, 21, "NORP"], style)
+ +cell #[+procon("con")]
+
+p
+ | Alternatively, the
+ | #[+a("/docs/usage/rule-based-matching#example3") rule-based matcher]
+ | can be a useful tool to extract tokens or combinations of tokens, as
+ | well as their start and end index in a document. In this case, we'll
+ | extract mentions of Google and assume they're an #[code ORG].
+
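+p
+ | A rough sketch of how that bootstrapping could look – purely illustrative,
+ | assuming a loaded #[code nlp] object and a list of #[code texts] to
+ | process:
+
++code.
+ from spacy.matcher import Matcher
+
+ matcher = Matcher(nlp.vocab)
+ matcher.add('GOOGLE', None, [{'LOWER': 'google'}])  # match "google" in any casing
+
+ for doc in nlp.pipe(texts):
+     for match_id, start, end in matcher(doc):
+         span = doc[start:end]
+         # record the candidate with its character offsets, assuming ORG
+         print(span.text, span.start_char, span.end_char, 'ORG')
+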
++table(["Text", "Entity", "Start", "End", "Label", ""])
+ - var style = [0, 0, 1, 1, 1]
+ +annotation-row(["let me google this for you", "google", 7, 13, "ORG"], style)
+ +cell #[+procon("con")]
+ +annotation-row(["Google Maps launches location sharing", "Google", 0, 6, "ORG"], style)
+ +cell #[+procon("con")]
+ +annotation-row(["Google rebrands its business apps", "Google", 0, 6, "ORG"], style)
+ +cell #[+procon("pro")]
+ +annotation-row(["look what i found on google! 😂", "google", 21, 27, "ORG"], style)
+ +cell #[+procon("con")]
+
+p
+ | Based on the few examples above, you can already create six training
+ | sentences with eight entities in total. Of course, what you consider a
+ | "correct annotation" will always depend on
+ | #[strong what you want the model to learn]. While there are some entity
+ | annotations that are more or less universally correct – like Canada being
+ | a geopolitical entity – your application may have its very own definition
+ | of the #[+a("/docs/api/annotation#named-entities") NER annotation scheme].
+code.
- from spacy.vocab import Vocab
- from spacy.tagger import Tagger
- from spacy.tokens import Doc
- from spacy.gold import GoldParse
+ train_data = [
+     ("Uber blew through $1 million a week", [(0, 4, 'ORG')]),
+     ("Android Pay expands to Canada", [(0, 11, 'PRODUCT'), (23, 29, 'GPE')]),
+     ("Spotify steps up Asia expansion", [(0, 7, 'ORG'), (17, 21, 'LOC')]),
+     ("Google Maps launches location sharing", [(0, 11, 'PRODUCT')]),
+     ("Google rebrands its business apps", [(0, 6, 'ORG')]),
+     ("look what i found on google! 😂", [(21, 27, 'PRODUCT')])]
++h(2) Training with annotations
+p
+ | The #[+api("goldparse") #[code GoldParse]] object collects the annotated
+ | training examples, also called the #[strong gold standard]. It's
+ | initialised with the #[+api("doc") #[code Doc]] object it refers to,
+ | and keyword arguments specifying the annotations, like #[code tags]
+ | or #[code entities]. Its job is to encode the annotations, keep them
+ | aligned and create the C-level data structures required for efficient access.
+ | Here's an example of a simple #[code GoldParse] for part-of-speech tags:
+
++code.
vocab = Vocab(tag_map={'N': {'pos': 'NOUN'}, 'V': {'pos': 'VERB'}})
- tagger = Tagger(vocab)
-
doc = Doc(vocab, words=['I', 'like', 'stuff'])
gold = GoldParse(doc, tags=['N', 'V', 'N'])
- tagger.update(doc, gold)
p
- +button(gh("spaCy", "examples/training/train_tagger.py"), false, "secondary") Full example
-
-+h(2, "train-entity") Training the named entity recognizer
+ | Using the #[code Doc] and its gold-standard annotations, the model can be
+ | updated to learn a sentence of three words with their assigned
+ | part-of-speech tags. The #[+a("/docs/usage/adding-languages#tag-map") tag map]
+ | is part of the vocabulary and defines the annotation scheme. If you're
+ | training a new language model, this will let you map the tags present in
+ | the treebank you train on to spaCy's tag scheme.
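+
+p
+ | For example, a hypothetical tag map fragment for a Penn-Treebank-style
+ | tag set might look like this – the exact tags and morphological features
+ | will depend on your treebank:
+
++code.
+ tag_map = {
+     'NN':  {'pos': 'NOUN', 'Number': 'sing'},
+     'NNS': {'pos': 'NOUN', 'Number': 'plur'},
+     'VBZ': {'pos': 'VERB', 'VerbForm': 'fin', 'Tense': 'pres'}
+ }
+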
+code.
- from spacy.vocab import Vocab
- from spacy.pipeline import EntityRecognizer
- from spacy.tokens import Doc
-
- vocab = Vocab()
- entity = EntityRecognizer(vocab, entity_types=['PERSON', 'LOC'])
-
- doc = Doc(vocab, words=['Who', 'is', 'Shaka', 'Khan', '?'])
- entity.update(doc, ['O', 'O', 'B-PERSON', 'L-PERSON', 'O'])
+ doc = Doc(Vocab(), words=['Facebook', 'released', 'React', 'in', '2014'])
+ gold = GoldParse(doc, entities=['U-ORG', 'O', 'U-TECHNOLOGY', 'O', 'U-DATE'])
p
- +button(gh("spaCy", "examples/training/train_ner.py"), false, "secondary") Full example
+ | The same goes for named entities. The letters added before the labels
+ | refer to the tags of the
+ | #[+a("/docs/usage/entity-recognition#updating-biluo") BILUO scheme] –
+ | #[code O] is a token outside an entity, #[code U] a single entity unit,
+ | #[code B] the beginning of an entity, #[code I] a token inside an entity
+ | and #[code L] the last token of an entity.
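+
+p
+ | To make the scheme concrete, here's a small hypothetical example – the
+ | three tokens of "New York City" are marked as the beginning, inside and
+ | last token of a single #[code GPE] entity:
+
++code.
+ doc = Doc(Vocab(), words=['I', 'like', 'New', 'York', 'City'])
+ gold = GoldParse(doc, entities=['O', 'O', 'B-GPE', 'I-GPE', 'L-GPE'])
+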
-+h(2, "extend-entity") Extending the named entity recognizer
++aside
+ | #[strong Training data]: The training examples.#[br]
+ | #[strong Text and label]: The current example.#[br]
+ | #[strong Doc]: A #[code Doc] object created from the example text.#[br]
+ | #[strong GoldParse]: A #[code GoldParse] object of the #[code Doc] and label.#[br]
+ | #[strong nlp]: The #[code nlp] object with the model.#[br]
+ | #[strong Optimizer]: A function that holds state between updates.#[br]
+ | #[strong Update]: Update the model's weights.#[br]
+
++image
+ include ../../assets/img/docs/training-loop.svg
+ .u-text-right
+ +button("/assets/img/docs/training-loop.svg", false, "secondary").u-text-tag View large graphic
p
- | All #[+a("/docs/usage/models") spaCy models] support online learning, so
- | you can update a pre-trained model with new examples. You can even add
- | new classes to an existing model, to recognise a new entity type,
- | part-of-speech, or syntactic relation. Updating an existing model is
- | particularly useful as a "quick and dirty solution", if you have only a
- | few corrections or annotations.
+ | Of course, it's not enough to only show a model a single example once.
+ | Especially if you only have a few examples, you'll want to train for a
+ | #[strong number of iterations]. At each iteration, the training data is
+ | #[strong shuffled] to ensure the model doesn't make any generalisations
+ | based on the order of examples. Another technique to improve the learning
+ | results is to set a #[strong dropout rate], a rate at which to randomly
+ | "drop" individual features and representations. This makes it harder for
+ | the model to memorise the training data. For example, a #[code 0.25]
+ | dropout means that each feature or internal representation has a 1/4
+ | likelihood of being dropped.
-p.o-inline-list
- +button(gh("spaCy", "examples/training/train_new_entity_type.py"), true, "secondary") Full example
- +button("/docs/usage/training-ner", false, "secondary") Usage guide
++aside
+ | #[+api("language#begin_training") #[code begin_training()]]: Start the
+ | training and return an optimizer function to update the model's weights.#[br]
+ | #[+api("language#update") #[code update()]]: Update the model with the
+ | training example and gold data.#[br]
+ | #[+api("language#to_disk") #[code to_disk()]]: Save the updated model to
+ | a directory.
-+h(2, "train-dependency") Training the dependency parser
++code("Example training loop").
+ optimizer = nlp.begin_training(get_data)
+ for itn in range(100):
+     random.shuffle(train_data)
+     for raw_text, entity_offsets in train_data:
+         doc = nlp.make_doc(raw_text)
+         gold = GoldParse(doc, entities=entity_offsets)
+         nlp.update([doc], [gold], drop=0.5, sgd=optimizer)
+ nlp.to_disk('/model')
-+code.
- from spacy.vocab import Vocab
- from spacy.pipeline import DependencyParser
- from spacy.tokens import Doc
++table(["Name", "Description"])
+ +row
+ +cell #[code train_data]
+ +cell The training data.
- vocab = Vocab()
- parser = DependencyParser(vocab, labels=['nsubj', 'compound', 'dobj', 'punct'])
+ +row
+ +cell #[code get_data]
+ +cell A function converting the training data to spaCy's JSON format.
- doc = Doc(vocab, words=['Who', 'is', 'Shaka', 'Khan', '?'])
- parser.update(doc, [(1, 'nsubj'), (1, 'ROOT'), (3, 'compound'), (1, 'dobj'),
- (1, 'punct')])
+ +row
+ +cell #[code doc]
+ +cell #[+api("doc") #[code Doc]] objects.
-p
- +button(gh("spaCy", "examples/training/train_parser.py"), false, "secondary") Full example
+ +row
+ +cell #[code gold]
+ +cell #[+api("goldparse") #[code GoldParse]] objects.
+
+ +row
+ +cell #[code drop]
+ +cell Dropout rate. Makes it harder for the model to just memorise the data.
+
+ +row
+ +cell #[code optimizer]
+ +cell Callable to update the model's weights.
+
++infobox
+ | For the #[strong full example and more details], see the usage guide on
+ | #[+a("/docs/usage/training-ner") training the named entity recognizer],
+ | or the runnable
+ | #[+src(gh("spaCy", "examples/training/train_ner.py")) training script]
+ | on GitHub.
+
++h(2) Examples
+
++under-construction
diff --git a/website/docs/usage/v2.jade b/website/docs/usage/v2.jade
index 75c8c2d3c..0d57a17b4 100644
--- a/website/docs/usage/v2.jade
+++ b/website/docs/usage/v2.jade
@@ -170,7 +170,7 @@ p
python -m spacy download de # default German model
python -m spacy download fr # default French model
python -m spacy download es # default Spanish model
- python -m spacy download xx_ent_web_md # multi-language NER
+ python -m spacy download xx_ent_wiki_sm # multi-language NER
p
| spaCy v2.0 comes with new and improved neural network models for English,
@@ -294,9 +294,6 @@ p
+h(2, "migrating") Migrating from spaCy 1.x
p
- | If you've mostly been using spaCy for basic text processing, chances are
- | you won't even have to change your code at all. For all other cases,
- | we've tried to focus...
+infobox("Some tips")
| Before migrating, we strongly recommend writing a few
@@ -339,6 +336,11 @@ p
nlp.save_to_directory('/model')
nlp.vocab.dump('/vocab')
+p
+ | If you've trained models with input from v1.x, you'll need to
+ | #[strong retrain them] with spaCy v2.0. None of the previous models
+ | will be compatible with the new version.
+
+h(3, "migrating-strings") Strings and hash values
p