Mirror of https://github.com/explosion/spaCy.git, synced 2024-12-25 17:36:30 +03:00

Commit d310b0aab3: Merge branch 'develop' of https://github.com/explosion/spaCy into develop
@@ -3,11 +3,11 @@
 # https://github.com/pypa/warehouse/blob/master/warehouse/__about__.py

 __title__ = 'spacy'
-__version__ = '1.8.2'
+__version__ = '2.0.0'
 __summary__ = 'Industrial-strength Natural Language Processing (NLP) with Python and Cython'
 __uri__ = 'https://spacy.io'
-__author__ = 'Matthew Honnibal'
-__email__ = 'matt@explosion.ai'
+__author__ = 'Explosion AI'
+__email__ = 'contact@explosion.ai'
 __license__ = 'MIT'

 __docs_models__ = 'https://spacy.io/docs/usage/models'
@@ -382,3 +382,4 @@ mixin annotation-row(annots, style)
         +cell #[code=cell]
     else
         +cell=cell
+    block
40  website/assets/img/docs/training-loop.svg  (new file)
@@ -0,0 +1,40 @@
+<svg xmlns="http://www.w3.org/2000/svg" width="612" height="330" viewBox="-10 -10 622 360">
+<style>
+.svg__trainloop__text { fill: #1a1e23; font: 18px "Source Sans Pro", Tahoma, Helvetica, Arial, sans-serif }
+.svg__trainloop__text-small { fill: #1a1e23; font: 16px "Source Sans Pro", Tahoma, Helvetica, Arial, sans-serif }
+</style>
+<path fill="none" stroke="#999" stroke-width="2" stroke-miterlimit="10" d="M121 264h31.8"/>
+<path fill="#999" stroke="#999" stroke-width="2" stroke-miterlimit="10" d="M158.8 264l-8 4 2-4-2-4z"/>
+<path fill="none" stroke="#999" stroke-width="2" stroke-miterlimit="10" d="M121 229h31.8"/>
+<path fill="#999" stroke="#999" stroke-width="2" stroke-miterlimit="10" d="M158.8 229l-8 4 2-4-2-4z"/>
+<rect width="120" height="71" x="1" y="211" fill="#f5f5f5" stroke="#666" stroke-width="2" rx="10.7" ry="10.7"/>
+<text class="svg__trainloop__text" dy="1em" transform="translate(11.5 236.5)" width="93" height="18">Training data</text>
+<path fill="none" stroke="#d6b656" stroke-width="2" stroke-miterlimit="10" d="M221 279v22h330v-30.8"/>
+<path fill="#d6b656" stroke="#d6b656" stroke-width="2" stroke-miterlimit="10" d="M551 264.2l4 8-4-2-4 2z"/>
+<path fill="#fff2cc" stroke="#d6b656" stroke-width="2" d="M161 249h120v30H161z"/>
+<text class="svg__trainloop__text-small" dy="1em" transform="translate(202.5 254.5)" width="35" height="18">label</text>
+<path fill="none" stroke="#9673a6" stroke-width="2" stroke-miterlimit="10" d="M281 229h36.8"/>
+<path fill="#9673a6" stroke="#9673a6" stroke-width="2" stroke-miterlimit="10" d="M323.8 229l-8 4 2-4-2-4z"/>
+<path fill="#e1d5e7" stroke="#9673a6" stroke-width="2" d="M161 214h120v30H161z"/>
+<text class="svg__trainloop__text-small" dy="1em" transform="translate(206.5 219.5)" width="27" height="18">text</text>
+<path fill="none" stroke="#9673a6" stroke-width="2" stroke-miterlimit="10" d="M446 229h36.8"/>
+<path fill="#9673a6" stroke="#9673a6" stroke-width="2" stroke-miterlimit="10" d="M488.8 229l-8 4 2-4-2-4z"/>
+<path fill="none" stroke="#f33" stroke-width="2" stroke-miterlimit="10" d="M416 194l30.6-48"/>
+<path fill="#f33" stroke="#f33" stroke-width="2" stroke-miterlimit="10" d="M449.8 141l-1 8.8-2.2-4-4.5-.3z"/>
+<path fill="#e1d5e7" stroke="#9673a6" stroke-width="2" d="M326 194h120v69H326z"/>
+<text class="svg__trainloop__text" dy="1em" transform="translate(371.5 218.5)" width="27" height="18">Doc</text>
+<path fill="none" stroke="#f33" stroke-width="2" stroke-miterlimit="10" d="M521 195l-35.2-49.3"/>
+<path fill="#f33" stroke="#f33" stroke-width="2" stroke-miterlimit="10" d="M482.3 140.8l8 4.2-4.5.7-2 4z"/>
+<path fill="#fff2cc" stroke="#d6b656" stroke-width="2" d="M491 195h120v67H491z"/>
+<text class="svg__trainloop__text" dy="1em" transform="translate(513.5 218.5)" width="73" height="18">GoldParse</text>
+<path fill="none" stroke="#f33" stroke-width="2" stroke-miterlimit="10" d="M466 59V21h-40.8"/>
+<path fill="#f33" stroke="#f33" stroke-width="2" stroke-miterlimit="10" d="M419.2 21l8-4-2 4 2 4z"/>
+<path fill="#f99" stroke="#f33" stroke-width="2" stroke-miterlimit="10" d="M436 59h60l30 40-30 40h-60l-30-40z"/>
+<text class="svg__trainloop__text" dy="0.85em" transform="translate(442.5 90.5)" width="45" height="16">update</text>
+<path fill="#f5f5f5" stroke="#666" stroke-width="2" stroke-miterlimit="10" d="M342 1h60l30 40-30 40h-60l-30-40z"/>
+<text class="svg__trainloop__text" dy="0.8em" transform="translate(360.5 32.5)" width="21" height="16">nlp</text>
+<path fill="none" stroke="#82b366" stroke-width="2" stroke-miterlimit="10" d="M341 99h56.8"/>
+<path fill="#82b366" stroke="#82b366" stroke-width="2" stroke-miterlimit="10" d="M403.8 99l-8 4 2-4-2-4z"/>
+<path fill="#d5e8d4" stroke="#82b366" stroke-width="2" stroke-miterlimit="10" d="M251 59h60l30 40-30 40h-60l-30-40z"/>
+<text class="svg__trainloop__text" dy="0.85em" transform="translate(245.5 90.5)" width="61" height="16">optimizer</text>
+</svg>
After: Size 4.0 KiB
47  website/assets/img/docs/training.svg  (new file)
@@ -0,0 +1,47 @@
+<svg class="o-svg" xmlns="http://www.w3.org/2000/svg" width="827" height="168" viewBox="-10 -10 837 178">
+<style>
+.svg__training__text { fill: #1a1e23; font: 18px "Source Sans Pro", Tahoma, Helvetica, Arial, sans-serif }
+.svg__training__text-code { fill: #1a1e23; font: bold 16px "Source Code Pro", Monaco, "Courier New", monospace }
+</style>
+<defs>
+<linearGradient id="a" x1="0%" x2="0%" y1="100%" y2="0%">
+<stop offset="0%" stop-color="#F99"/>
+<stop offset="100%" stop-color="#B3FF66"/>
+</linearGradient>
+</defs>
+
+<path fill="none" stroke="#999" stroke-width="2" stroke-miterlimit="10" d="M561 103h-6v46H251v-35.8"/>
+<path fill="#999" stroke="#999" stroke-width="2" stroke-miterlimit="10" d="M251 107.2l4 8-4-2-4 2z"/>
+<rect fill="#f6f6f6" transform="translate(372 138.5)" width="80" height="20"/>
+<text class="svg__training__text-code" dy="1em" transform="translate(378.5 138.5)" width="65" height="16">PREDICT</text>
+<path fill="none" stroke="#999" stroke-width="2" stroke-miterlimit="10" d="M621 73v6h76.8"/>
+<path fill="#999" stroke="#999" stroke-width="2" stroke-miterlimit="10" d="M703.8 79l-8 4 2-4-2-4z"/>
+<rect fill="#f6f6f6" transform="translate(630.5 68.5)" width="50" height="20"/>
+<text class="svg__training__text-code" dy="1em" transform="translate(634.5 68.5)" width="43" height="18">SAVE</text>
+<rect width="120" height="60" x="501" y="43" fill="#f5f5f5" stroke="#666" stroke-width="2" rx="9" ry="9"/>
+<text class="svg__training__text" dy="0.9em" transform="translate(538.5 63.5)" width="43" height="18">Model</text>
+<path fill="none" stroke="#09a3d5" stroke-width="2" stroke-miterlimit="10" d="M121 54h61.8"/>
+<path fill="#09a3d5" stroke="#09a3d5" stroke-width="2" stroke-miterlimit="10" d="M188.8 54l-8 4 2-4-2-4z"/>
+<path fill="none" stroke="#09a3d5" stroke-width="2" stroke-miterlimit="10" d="M121 19h61.8"/>
+<path fill="#09a3d5" stroke="#09a3d5" stroke-width="2" stroke-miterlimit="10" d="M188.8 19l-8 4 2-4-2-4z"/>
+<rect width="120" height="71" x="1" y="1" fill="#dae8fc" stroke="#09a3d5" stroke-width="2" rx="10.7" ry="10.7"/>
+<text class="svg__training__text" dy="0.9em" transform="translate(13.5 26.5)" width="93" height="18">Training data</text>
+<path fill="none" stroke="#87e02d" stroke-width="2" stroke-miterlimit="10" d="M311 54h51.8"/>
+<path fill="#87e02d" stroke="#87e02d" stroke-width="2" stroke-miterlimit="10" d="M368.8 54l-8 4 2-4-2-4z"/>
+<path fill="#dae8fc" stroke="#09a3d5" stroke-width="2" d="M191 39h120v30H191z"/>
+<text class="svg__training__text" dy="0.9em" transform="translate(232.5 44.5)" width="35" height="18">label</text>
+<path fill="none" stroke="#f33" stroke-width="2" stroke-miterlimit="10" d="M311 90h51.8"/>
+<path fill="#f33" stroke="#f33" stroke-width="2" stroke-miterlimit="10" d="M368.8 90l-8 4 2-4-2-4z"/>
+<path fill="#f5f5f5" stroke="#09a3d5" stroke-width="2" d="M191 75h120v30H191z" stroke-dasharray="2 2"/>
+<text class="svg__training__text" dy="0.9em" transform="translate(232.5 80.5)" width="35" height="18">label</text>
+<rect width="120" height="60" x="706" y="49" fill="#f5f5f5" stroke="#666" stroke-width="2" rx="9" ry="9"/>
+<text class="svg__training__text" dy="0.9em" transform="translate(734.5 59.5)" width="61" height="38">Updated
+<tspan dy="1.25em" dx="-3.25em">Model</tspan>
+</text>
+<path fill="#dae8fc" stroke="#09a3d5" stroke-width="2" d="M191 4h120v30H191z"/>
+<text class="svg__training__text" dy="0.9em" transform="translate(236.5 9.5)" width="27" height="18">text</text>
+<path fill="none" stroke="#999" stroke-width="2" stroke-miterlimit="10" d="M461 73h31.8"/>
+<path fill="#999" stroke="#999" stroke-width="2" stroke-miterlimit="10" d="M498.8 73l-8 4 2-4-2-4z"/>
+<path fill="url(#a)" d="M409.5 21L461 72.5 409.5 124 358 72.5z"/>
+<text class="svg__training__text-code" dy="0.9em" transform="translate(371.5 64.5)" width="67" height="16">GRADIENT</text>
+</svg>
After: Size 3.9 KiB
@@ -141,10 +141,10 @@ p
 p Update the models in the pipeline.

 +aside-code("Example").
-    with nlp.begin_training(gold, use_gpu=True) as (trainer, optimizer):
-        for epoch in trainer.epochs(gold):
-            for docs, golds in epoch:
-                state = nlp.update(docs, golds, sgd=optimizer)
+    for raw_text, entity_offsets in train_data:
+        doc = nlp.make_doc(raw_text)
+        gold = GoldParse(doc, entities=entity_offsets)
+        nlp.update([doc], [gold], drop=0.5, sgd=optimizer)

 +table(["Name", "Type", "Description"])
     +row
@@ -173,17 +173,13 @@ p Update the models in the pipeline.
         +cell Results from the update.

 +h(2, "begin_training") Language.begin_training
-    +tag contextmanager
+    +tag method

 p
-    | Allocate models, pre-process training data and acquire a trainer and
-    | optimizer. Used as a contextmanager.
+    | Allocate models, pre-process training data and acquire an optimizer.

 +aside-code("Example").
-    with nlp.begin_training(gold, use_gpu=True) as (trainer, optimizer):
-        for epoch in trainer.epochs(gold):
-            for docs, golds in epoch:
-                state = nlp.update(docs, golds, sgd=optimizer)
+    optimizer = nlp.begin_training(gold_tuples)

 +table(["Name", "Type", "Description"])
     +row
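Taken together, the two hunks above replace the old trainer contextmanager with a plain method that returns an optimizer. A minimal sketch of the new pattern follows; the 'en' model name, the example sentence and the get_data argument are illustrative assumptions, not part of this commit:

    import spacy
    from spacy.gold import GoldParse

    nlp = spacy.load('en')
    # 'get_data' is a placeholder for a function producing your training data
    optimizer = nlp.begin_training(get_data)

    doc = nlp.make_doc(u'Who is Chaka Khan?')
    gold = GoldParse(doc, entities=[(7, 17, 'PERSON')])
    # note: update() now takes lists of docs and golds, plus the optimizer
    nlp.update([doc], [gold], drop=0.5, sgd=optimizer)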
@@ -199,7 +195,7 @@ p
     +footrow
         +cell yields
         +cell tuple
-        +cell A trainer and an optimizer.
+        +cell An optimizer.

 +h(2, "use_params") Language.use_params
     +tag contextmanager
@@ -1,12 +1,12 @@
 //- 💫 DOCS > USAGE > SPACY 101 > SERIALIZATION

 p
-    | If you've been modifying the pipeline, vocabulary vectors and entities, or made
-    | updates to the model, you'll eventually want
-    | to #[strong save your progress] – for example, everything that's in your #[code nlp]
-    | object. This means you'll have to translate its contents and structure
-    | into a format that can be saved, like a file or a byte string. This
-    | process is called serialization. spaCy comes with
+    | If you've been modifying the pipeline, vocabulary, vectors and entities,
+    | or made updates to the model, you'll eventually want to
+    | #[strong save your progress] – for example, everything that's in your
+    | #[code nlp] object. This means you'll have to translate its contents and
+    | structure into a format that can be saved, like a file or a byte string.
+    | This process is called serialization. spaCy comes with
     | #[strong built-in serialization methods] and supports the
     | #[+a("http://www.diveintopython3.net/serializing.html#dump") Pickle protocol].
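To make those "built-in serialization methods" concrete, here is a minimal sketch, assuming the to_disk()/to_bytes() API that the v2.0 docs describe elsewhere; the model name and path are placeholders:

    import spacy
    from spacy.tokens import Doc

    nlp = spacy.load('en')
    doc = nlp(u'This is a sentence.')

    nlp.to_disk('/path/to/model')       # whole pipeline -> directory
    doc_bytes = doc.to_bytes()          # single Doc -> byte string
    new_doc = Doc(nlp.vocab).from_bytes(doc_bytes)  # restore, reusing the same Vocab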
@@ -45,11 +45,7 @@ p
     | #[code Vocab] holds the context-independent information about the words,
     | tags and labels, and their #[strong hash values]. If the #[code Vocab]
     | wasn't saved with the #[code Doc], spaCy wouldn't know how to resolve
-    | those IDs – for example, the word text or the dependency labels. You
-    | might be saving #[code 446] for "whale", but in a different vocabulary,
-    | this ID could map to "VERB". Similarly, if your document was processed by
-    | a German model, its vocab will include the specific
-    | #[+a("/docs/api/annotation#dependency-parsing-german") German dependency labels].
+    | those IDs back to strings.

 +code.
     moby_dick = open('moby_dick.txt', 'r') # open a large document
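The point about resolving IDs back to strings is easy to demonstrate. A small sketch, assuming v2.0's hash-based StringStore; the word "whale" is just an illustration:

    import spacy

    nlp = spacy.load('en')
    whale_hash = nlp.vocab.strings[u'whale']          # text -> hash value
    assert nlp.vocab.strings[whale_hash] == u'whale'  # hash -> text, but only via the right Vocab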
@@ -1,3 +1,52 @@
 //- 💫 DOCS > USAGE > SPACY 101 > TRAINING

-+under-construction
+p
+    | spaCy's models are #[strong statistical] and every "decision" they make –
+    | for example, which part-of-speech tag to assign, or whether a word is a
+    | named entity – is a #[strong prediction]. This prediction is based
+    | on the examples the model has seen during #[strong training]. To train
+    | a model, you first need training data – examples of text, and the
+    | labels you want the model to predict. This could be a part-of-speech tag,
+    | a named entity or any other information.
+
+p
+    | The model is then shown the unlabelled text and will make a prediction.
+    | Because we know the correct answer, we can give the model feedback on its
+    | prediction in the form of an #[strong error gradient] of the
+    | #[strong loss function] that calculates the difference between the training
+    | example and the expected output. The greater the difference, the more
+    | significant the gradient and the updates to our model.
+
++aside
+    | #[strong Training data:] Examples and their annotations.#[br]
+    | #[strong Text:] The input text the model should predict a label for.#[br]
+    | #[strong Label:] The label the model should predict.#[br]
+    | #[strong Gradient:] Gradient of the loss function calculating the
+    | difference between input and expected output.
+
++image
+    include ../../../assets/img/docs/training.svg
+    .u-text-right
+        +button("/assets/img/docs/training.svg", false, "secondary").u-text-tag View large graphic
+
+p
+    | When training a model, we don't just want it to memorise our examples –
+    | we want it to come up with a theory that can be
+    | #[strong generalised across other examples]. After all, we don't just want
+    | the model to learn that this one instance of "Amazon" right here is a
+    | company – we want it to learn that "Amazon", in contexts #[em like this],
+    | is most likely a company. That's why the training data should always be
+    | representative of the data we want to process. A model trained on
+    | Wikipedia, where sentences in the first person are extremely rare, will
+    | likely perform badly on Twitter. Similarly, a model trained on romantic
+    | novels will likely perform badly on legal text.
+
+p
+    | This also means that in order to know how the model is performing,
+    | and whether it's learning the right things, you don't only need
+    | #[strong training data] – you'll also need #[strong evaluation data]. If
+    | you only test the model with the data it was trained on, you'll have no
+    | idea how well it's generalising. If you want to train a model from scratch,
+    | you usually need at least a few hundred examples for both training and
+    | evaluation. To update an existing model, you can already achieve decent
+    | results with very few examples – as long as they're representative.
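On the evaluation point above: the simplest way to get evaluation data is to hold out part of your examples before training. A minimal sketch in plain Python; 'examples' stands in for any list of (text, annotations) pairs:

    import random

    random.shuffle(examples)           # 'examples' is a placeholder list of (text, annotations) pairs
    cutoff = int(len(examples) * 0.8)
    train_data = examples[:cutoff]     # what the model learns from
    eval_data = examples[cutoff:]      # held out, never used for updates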
@@ -154,40 +154,29 @@ p
     | To provide training examples to the entity recogniser, you'll first need
     | to create an instance of the #[+api("goldparse") #[code GoldParse]] class.
+    | You can specify your annotations in a stand-off format or as token tags.
-
-+code.
-    import random
-    import spacy
-    from spacy.gold import GoldParse
-    from spacy.pipeline import EntityRecognizer
-
-    train_data = [('Who is Chaka Khan?', [(7, 17, 'PERSON')]),
-                  ('I like London and Berlin.', [(7, 13, 'LOC'), (18, 24, 'LOC')])]
-
-    nlp = spacy.load('en', entity=False, parser=False)
-    ner = EntityRecognizer(nlp.vocab, entity_types=['PERSON', 'LOC'])
-
-    for itn in range(5):
-        random.shuffle(train_data)
-        for raw_text, entity_offsets in train_data:
-            doc = nlp.make_doc(raw_text)
-            gold = GoldParse(doc, entities=entity_offsets)
-            nlp.tagger(doc)
-            ner.update(doc, gold)

 p
     | If a character offset in your entity annotations doesn't fall on a token
     | boundary, the #[code GoldParse] class will treat that annotation as a
     | missing value. This allows for more realistic training, because the
     | entity recogniser is allowed to learn from examples that may feature
     | tokenizer errors.

-+aside-code("Example").
++code.
     train_data = [('Who is Chaka Khan?', [(7, 17, 'PERSON')]),
                   ('I like London and Berlin.', [(7, 13, 'LOC'), (18, 24, 'LOC')])]

++code.
+    doc = Doc(nlp.vocab, [u'rats', u'make', u'good', u'pets'])
+    gold = GoldParse(doc, [u'U-ANIMAL', u'O', u'O', u'O'])
+    ner = EntityRecognizer(nlp.vocab, entity_types=['ANIMAL'])
+    ner.update(doc, gold)

 +infobox
     | For more details on #[strong training and updating] the named entity
     | recognizer, see the usage guides on #[+a("/docs/usage/training") training]
     | and #[+a("/docs/usage/training-ner") training the named entity recognizer],
     | or check out the runnable
     | #[+src(gh("spaCy", "examples/training/train_ner.py")) training script]
     | on GitHub.

 +h(3, "updating-biluo") The BILUO Scheme

 p
     | You can also provide token-level entity annotation, using the
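Bridging the stand-off format and the token-level tags mentioned above: spaCy's gold module has a helper for the conversion. A short sketch, assuming the biluo_tags_from_offsets() helper in spacy.gold (present in spaCy 2.x); the sentence is an illustration:

    import spacy
    from spacy.gold import biluo_tags_from_offsets

    nlp = spacy.load('en')
    doc = nlp(u'I like London.')
    tags = biluo_tags_from_offsets(doc, [(7, 13, 'LOC')])
    # expected shape: one BILUO tag per token, e.g. ['O', 'O', 'U-LOC', 'O'];
    # offsets that don't align with token boundaries come back as '-' (missing)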
@@ -252,6 +252,12 @@ include _spacy-101/_serialization

 include _spacy-101/_training

++infobox
+    | To learn more about #[strong training and updating] models, how to create
+    | training data and how to improve spaCy's named entity recognition models,
+    | see the usage guides on #[+a("/docs/usage/training") training] and
+    | #[+a("/docs/usage/training-ner") training the named entity recognizer].

 +h(2, "architecture") Architecture

 +under-construction
@@ -8,6 +8,8 @@ p
     | particularly useful as a "quick and dirty solution", if you have only a
     | few corrections or annotations.

++under-construction

 +h(2, "improving-accuracy") Improving accuracy on existing entity types

 p
@@ -15,16 +17,7 @@ p
     | #[+api("goldparse") #[code spacy.gold.GoldParse]], with the entity labels
     | you want to learn. You will then pass this instance to the
     | #[+api("entityrecognizer#update") #[code EntityRecognizer.update()]]
-    | method. For example:
-
-+code.
-    import spacy
-    from spacy.gold import GoldParse
-
-    nlp = spacy.load('en')
-    doc = nlp.make_doc(u'Facebook released React in 2014')
-    gold = GoldParse(doc, entities=['U-ORG', 'O', 'U-TECHNOLOGY', 'O', 'U-DATE'])
-    nlp.entity.update(doc, gold)
+    | method.

 p
     | You'll usually need to provide many examples to meaningfully improve the
@@ -44,100 +37,6 @@ p
     | #[strong experiment on your own data] to find a solution that works best
     | for you.

-+h(2, "adding") Adding a new entity type
-
-p
-    | You can add new entity types to an existing model. Let's say we want to
-    | recognise the category #[code TECHNOLOGY]. The new category will include
-    | programming languages, frameworks and platforms. First, we need to
-    | register the new entity type:
-
-+code.
-    nlp.entity.add_label('TECHNOLOGY')
-
-p
-    | Next, iterate over your examples, calling #[code entity.update()]. As
-    | above, we want to avoid iterating over only a small number of sentences.
-    | A useful compromise is to run the model over a number of plain-text
-    | sentences, and pass the entities to #[code GoldParse], as "true"
-    | annotations. This encourages the optimizer to find a solution that
-    | predicts the new category with minimal difference from the previous
-    | output.
-
-+h(2, "example") Example: Adding and training an #[code ANIMAL] entity
-
-+under-construction
-
-p
-    | This script shows how to add a new entity type to an existing pre-trained
-    | NER model. To keep the example short and simple, only four sentences are
-    | provided as examples. In practice, you'll need many more —
-    | #[strong a few hundred] would be a good start. You will also likely need
-    | to mix in #[strong examples of other entity types], which might be
-    | obtained by running the entity recognizer over unlabelled sentences, and
-    | adding their annotations to the training set.
-
-p
-    | For the full, runnable script of this example, see
-    | #[+src(gh("spacy", "examples/training/train_new_entity_type.py")) train_new_entity_type.py].
-
-+code("Training the entity recognizer").
-    import spacy
-    from spacy.pipeline import EntityRecognizer
-    from spacy.gold import GoldParse
-    from spacy.tagger import Tagger
-    import random
-
-    model_name = 'en'
-    entity_label = 'ANIMAL'
-    output_directory = '/path/to/model'
-    train_data = [
-        ("Horses are too tall and they pretend to care about your feelings",
-         [(0, 6, 'ANIMAL')]),
-        ("horses are too tall and they pretend to care about your feelings",
-         [(0, 6, 'ANIMAL')]),
-        ("horses pretend to care about your feelings",
-         [(0, 6, 'ANIMAL')]),
-        ("they pretend to care about your feelings, those horses",
-         [(48, 54, 'ANIMAL')])
-    ]
-
-    nlp = spacy.load(model_name)
-    nlp.entity.add_label(entity_label)
-    ner = train_ner(nlp, train_data, output_directory)
-
-    def train_ner(nlp, train_data, output_dir):
-        # Add new words to vocab
-        for raw_text, _ in train_data:
-            doc = nlp.make_doc(raw_text)
-            for word in doc:
-                _ = nlp.vocab[word.orth]
-
-        for itn in range(20):
-            random.shuffle(train_data)
-            for raw_text, entity_offsets in train_data:
-                gold = GoldParse(doc, entities=entity_offsets)
-                doc = nlp.make_doc(raw_text)
-                nlp.tagger(doc)
-                loss = nlp.entity.update(doc, gold)
-        nlp.save_to_directory(output_dir)
-
-p
-    +button(gh("spaCy", "examples/training/train_new_entity_type.py"), false, "secondary") Full example
-
-p
-    | The actual training is performed by looping over the examples, and
-    | calling #[code nlp.entity.update()]. The #[code update()] method steps
-    | through the words of the input. At each word, it makes a prediction. It
-    | then consults the annotations provided on the #[code GoldParse] instance,
-    | to see whether it was right. If it was wrong, it adjusts its weights so
-    | that the correct action will score higher next time.
-
-p
-    | After training your model, you can
-    | #[+a("/docs/usage/saving-loading") save it to a directory]. We recommend
-    | wrapping models as Python packages, for ease of deployment.

 +h(2, "saving-loading") Saving and loading

 p
@@ -10,68 +10,193 @@ p

 include _spacy-101/_training

-+h(2, "train-pos-tagger") Training the part-of-speech tagger
++h(3, "training-data") How do I get training data?
+
+p
+    | Collecting training data may sound incredibly painful – and it can be,
+    | if you're planning a large-scale annotation project. However, if your main
+    | goal is to update an existing model's predictions – for example, spaCy's
+    | named entity recognition – the hard part is usually not creating the
+    | actual annotations. It's finding representative examples and
+    | #[strong extracting potential candidates]. The good news is, if you've
+    | been noticing bad performance on your data, you likely
+    | already have some relevant text, and you can use spaCy to
+    | #[strong bootstrap a first set of training examples]. For example,
+    | after processing a few sentences, you may end up with the following
+    | entities, some correct, some incorrect.
+
++aside("How many examples do I need?")
+    | As a rule of thumb, you should allocate at least 10% of your project
+    | resources to creating training and evaluation data. If you're looking to
+    | improve an existing model, you might be able to start off with only a
+    | handful of examples. Keep in mind that you'll always want a lot more than
+    | that for #[strong evaluation] – especially previous errors the model has
+    | made. Otherwise, you won't be able to sufficiently verify that the model
+    | has actually made the #[strong correct generalisations] required for your
+    | use case.
+
++table(["Text", "Entity", "Start", "End", "Label", ""])
+    - var style = [0, 0, 1, 1, 1]
+    +annotation-row(["Uber blew through $1 million a week", "Uber", 0, 4, "ORG"], style)
+        +cell #[+procon("pro")]
+    +annotation-row(["Android Pay expands to Canada", "Android", 0, 7, "PERSON"], style)
+        +cell #[+procon("con")]
+    +annotation-row(["Android Pay expands to Canada", "Canada", 23, 30, "GPE"], style)
+        +cell #[+procon("pro")]
+    +annotation-row(["Spotify steps up Asia expansion", "Spotify", 0, 8, "ORG"], style)
+        +cell #[+procon("pro")]
+    +annotation-row(["Spotify steps up Asia expansion", "Asia", 17, 21, "NORP"], style)
+        +cell #[+procon("con")]
+
+p
+    | Alternatively, the
+    | #[+a("/docs/usage/rule-based-matching#example3") rule-based matcher]
+    | can be a useful tool to extract tokens or combinations of tokens, as
+    | well as their start and end index in a document. In this case, we'll
+    | extract mentions of Google and assume they're an #[code ORG].
+
++table(["Text", "Entity", "Start", "End", "Label", ""])
+    - var style = [0, 0, 1, 1, 1]
+    +annotation-row(["let me google this for you", "google", 7, 13, "ORG"], style)
+        +cell #[+procon("con")]
+    +annotation-row(["Google Maps launches location sharing", "Google", 0, 6, "ORG"], style)
+        +cell #[+procon("con")]
+    +annotation-row(["Google rebrands its business apps", "Google", 0, 6, "ORG"], style)
+        +cell #[+procon("pro")]
+    +annotation-row(["look what i found on google! 😂", "google", 21, 27, "ORG"], style)
+        +cell #[+procon("con")]
+
+p
+    | Based on the few examples above, you can already create six training
+    | sentences with eight entities in total. Of course, what you consider a
+    | "correct annotation" will always depend on
+    | #[strong what you want the model to learn]. While there are some entity
+    | annotations that are more or less universally correct – like Canada being
+    | a geopolitical entity – your application may have its very own definition
+    | of the #[+a("/docs/api/annotation#named-entities") NER annotation scheme].

 +code.
-    from spacy.vocab import Vocab
-    from spacy.tagger import Tagger
-    from spacy.tokens import Doc
-    from spacy.gold import GoldParse
+    train_data = [
+        ("Uber blew through $1 million a week", [(0, 4, 'ORG')]),
+        ("Android Pay expands to Canada", [(0, 11, 'PRODUCT'), (23, 30, 'GPE')]),
+        ("Spotify steps up Asia expansion", [(0, 8, "ORG"), (17, 21, "LOC")]),
+        ("Google Maps launches location sharing", [(0, 11, "PRODUCT")]),
+        ("Google rebrands its business apps", [(0, 6, "ORG")]),
+        ("look what i found on google! 😂", [(21, 27, "PRODUCT")])]
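A quick way to guard against off-by-one mistakes in stand-off annotations like the ones above: slice each span out of its sentence and check that it reads as the entity you meant. A sketch, assuming the train_data list from the hunk above:

    for text, offsets in train_data:
        for start, end, label in offsets:
            print(repr(text[start:end]), label)  # each slice should be exactly the entity text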
++h(2) Training with annotations
+
+p
+    | The #[+api("goldparse") #[code GoldParse]] object collects the annotated
+    | training examples, also called the #[strong gold standard]. It's
+    | initialised with the #[+api("doc") #[code Doc]] object it refers to,
+    | and keyword arguments specifying the annotations, like #[code tags]
+    | or #[code entities]. Its job is to encode the annotations, keep them
+    | aligned and create the C-level data structures required for efficient access.
+    | Here's an example of a simple #[code GoldParse] for part-of-speech tags:

 +code.
     vocab = Vocab(tag_map={'N': {'pos': 'NOUN'}, 'V': {'pos': 'VERB'}})
     tagger = Tagger(vocab)

     doc = Doc(vocab, words=['I', 'like', 'stuff'])
     gold = GoldParse(doc, tags=['N', 'V', 'N'])
     tagger.update(doc, gold)

 p
-    +button(gh("spaCy", "examples/training/train_tagger.py"), false, "secondary") Full example
-
-+h(2, "train-entity") Training the named entity recognizer
+    | Using the #[code Doc] and its gold-standard annotations, the model can be
+    | updated to learn a sentence of three words with their assigned
+    | part-of-speech tags. The #[+a("/docs/usage/adding-languages#tag-map") tag map]
+    | is part of the vocabulary and defines the annotation scheme. If you're
+    | training a new language model, this will let you map the tags present in
+    | the treebank you train on to spaCy's tag scheme.

 +code.
-    from spacy.vocab import Vocab
-    from spacy.pipeline import EntityRecognizer
-    from spacy.tokens import Doc
-
-    vocab = Vocab()
-    entity = EntityRecognizer(vocab, entity_types=['PERSON', 'LOC'])
-
-    doc = Doc(vocab, words=['Who', 'is', 'Shaka', 'Khan', '?'])
-    entity.update(doc, ['O', 'O', 'B-PERSON', 'L-PERSON', 'O'])
+    doc = Doc(Vocab(), words=['Facebook', 'released', 'React', 'in', '2014'])
+    gold = GoldParse(doc, entities=['U-ORG', 'O', 'U-TECHNOLOGY', 'O', 'U-DATE'])

 p
-    +button(gh("spaCy", "examples/training/train_ner.py"), false, "secondary") Full example
+    | The same goes for named entities. The letters added before the labels
+    | refer to the tags of the
+    | #[+a("/docs/usage/entity-recognition#updating-biluo") BILUO scheme] –
+    | #[code O] is a token outside an entity, #[code U] a single entity unit,
+    | #[code B] the beginning of an entity, #[code I] a token inside an entity
+    | and #[code L] the last token of an entity.

-+h(2, "extend-entity") Extending the named entity recognizer
++aside
+    | #[strong Training data]: The training examples.#[br]
+    | #[strong Text and label]: The current example.#[br]
+    | #[strong Doc]: A #[code Doc] object created from the example text.#[br]
+    | #[strong GoldParse]: A #[code GoldParse] object of the #[code Doc] and label.#[br]
+    | #[strong nlp]: The #[code nlp] object with the model.#[br]
+    | #[strong Optimizer]: A function that holds state between updates.#[br]
+    | #[strong Update]: Update the model's weights.
+
++image
+    include ../../assets/img/docs/training-loop.svg
+    .u-text-right
+        +button("/assets/img/docs/training-loop.svg", false, "secondary").u-text-tag View large graphic

 p
-    | All #[+a("/docs/usage/models") spaCy models] support online learning, so
-    | you can update a pre-trained model with new examples. You can even add
-    | new classes to an existing model, to recognise a new entity type,
-    | part-of-speech, or syntactic relation. Updating an existing model is
-    | particularly useful as a "quick and dirty solution", if you have only a
-    | few corrections or annotations.
+    | Of course, it's not enough to only show a model a single example once.
+    | Especially if you only have few examples, you'll want to train for a
+    | #[strong number of iterations]. At each iteration, the training data is
+    | #[strong shuffled] to ensure the model doesn't make any generalisations
+    | based on the order of examples. Another technique to improve the learning
+    | results is to set a #[strong dropout rate], a rate at which to randomly
+    | "drop" individual features and representations. This makes it harder for
+    | the model to memorise the training data. For example, a #[code 0.25]
+    | dropout means that each feature or internal representation has a 1/4
+    | likelihood of being dropped.

-p.o-inline-list
-    +button(gh("spaCy", "examples/training/train_new_entity_type.py"), true, "secondary") Full example
-    +button("/docs/usage/training-ner", false, "secondary") Usage guide
++aside
+    | #[+api("language#begin_training") #[code begin_training()]]: Start the
+    | training and return an optimizer function to update the model's weights.#[br]
+    | #[+api("language#update") #[code update()]]: Update the model with the
+    | training example and gold data.#[br]
+    | #[+api("language#to_disk") #[code to_disk()]]: Save the updated model to
+    | a directory.

-+h(2, "train-dependency") Training the dependency parser
++code("Example training loop").
+    optimizer = nlp.begin_training(get_data)
+    for itn in range(100):
+        random.shuffle(train_data)
+        for raw_text, entity_offsets in train_data:
+            doc = nlp.make_doc(raw_text)
+            gold = GoldParse(doc, entities=entity_offsets)
+            nlp.update([doc], [gold], drop=0.5, sgd=optimizer)
+    nlp.to_disk('/model')

-+code.
-    from spacy.vocab import Vocab
-    from spacy.pipeline import DependencyParser
-    from spacy.tokens import Doc
-
-    vocab = Vocab()
-    parser = DependencyParser(vocab, labels=['nsubj', 'compound', 'dobj', 'punct'])
-
-    doc = Doc(vocab, words=['Who', 'is', 'Shaka', 'Khan', '?'])
-    parser.update(doc, [(1, 'nsubj'), (1, 'ROOT'), (3, 'compound'), (1, 'dobj'),
-                        (1, 'punct')])
-
-p
-    +button(gh("spaCy", "examples/training/train_parser.py"), false, "secondary") Full example
++table(["Name", "Description"])
+    +row
+        +cell #[code train_data]
+        +cell The training data.
+
+    +row
+        +cell #[code get_data]
+        +cell A function converting the training data to spaCy's JSON format.
+
+    +row
+        +cell #[code doc]
+        +cell #[+api("doc") #[code Doc]] objects.
+
+    +row
+        +cell #[code gold]
+        +cell #[+api("goldparse") #[code GoldParse]] objects.
+
+    +row
+        +cell #[code drop]
+        +cell Dropout rate. Makes it harder for the model to just memorise the data.
+
+    +row
+        +cell #[code optimizer]
+        +cell Callable to update the model's weights.

-+infobox
-    | For the #[strong full example and more details], see the usage guide on
-    | #[+a("/docs/usage/training-ner") training the named entity recognizer],
-    | or the runnable
-    | #[+src(gh("spaCy", "examples/training/train_ner.py")) training script]
-    | on GitHub.
-
++h(2) Examples
+
++under-construction
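After a loop like the one above, it's worth checking that the updated weights survive the round trip to disk. A hedged sketch; the '/model' path is a placeholder and spacy.load() is assumed to accept a model directory in v2.0:

    nlp.to_disk('/model')         # as at the end of the example loop
    nlp = spacy.load('/model')    # reload and spot-check a prediction
    doc = nlp(u'I like London and Berlin.')
    print([(ent.text, ent.label_) for ent in doc.ents])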
@@ -170,7 +170,7 @@ p
     python -m spacy download de               # default German model
     python -m spacy download fr               # default French model
     python -m spacy download es               # default Spanish model
-    python -m spacy download xx_ent_web_md    # multi-language NER
+    python -m spacy download xx_ent_wiki_sm   # multi-language NER

 p
     | spaCy v2.0 comes with new and improved neural network models for English,
@@ -294,9 +294,6 @@ p
 +h(2, "migrating") Migrating from spaCy 1.x

 p
-    | If you've mostly been using spaCy for basic text processing, chances are
-    | you won't even have to change your code at all. For all other cases,
-    | we've tried to focus...

 +infobox("Some tips")
     | Before migrating, we strongly recommend writing a few
@@ -339,6 +336,11 @@ p
     nlp.save_to_directory('/model')
     nlp.vocab.dump('/vocab')

+p
+    | If you've trained models with input from v1.x, you'll need to
+    | #[strong retrain them] with spaCy v2.0. Models trained with previous
+    | versions will not be compatible with the new version.

 +h(3, "migrating-strings") Strings and hash values

 p