Update textcat training example and docs

This commit is contained in:
ines 2017-10-27 00:48:45 +02:00
parent b61866a2e4
commit a7b9074b4c
4 changed files with 65 additions and 16 deletions

View File

@ -2,7 +2,7 @@
# coding: utf8 # coding: utf8
"""Train a multi-label convolutional neural network text classifier on the """Train a multi-label convolutional neural network text classifier on the
IMDB dataset, using the TextCategorizer component. The dataset will be loaded IMDB dataset, using the TextCategorizer component. The dataset will be loaded
automatically via Thinc's built-in dataset loader. The model is then added to automatically via Thinc's built-in dataset loader. The model is added to
spacy.pipeline, and predictions are available via `doc.cats`. spacy.pipeline, and predictions are available via `doc.cats`.
For more details, see the documentation: For more details, see the documentation:
@ -41,7 +41,7 @@ def main(model=None, output_dir=None, n_iter=20):
if 'textcat' not in nlp.pipe_names: if 'textcat' not in nlp.pipe_names:
# textcat = nlp.create_pipe('textcat') # textcat = nlp.create_pipe('textcat')
textcat = TextCategorizer(nlp.vocab, labels=['POSITIVE']) textcat = TextCategorizer(nlp.vocab, labels=['POSITIVE'])
nlp.add_pipe(textcat, first=True) nlp.add_pipe(textcat, last=True)
# otherwise, get it, so we can add labels to it # otherwise, get it, so we can add labels to it
else: else:
textcat = nlp.get_pipe('textcat') textcat = nlp.get_pipe('textcat')

View File

@ -1,13 +1,63 @@
//- 💫 DOCS > USAGE > TRAINING > TEXT CLASSIFICATION //- 💫 DOCS > USAGE > TRAINING > TEXT CLASSIFICATION
+under-construction +h(3, "example-textcat") Adding a text classifier to a spaCy model
+h(3, "example-textcat") Example: Training spaCy's text classifier
+tag-new(2) +tag-new(2)
p p
| This example shows how to use and train spaCy's new | This example shows how to train a multi-label convolutional neural
| #[+api("textcategorizer") #[code TextCategorizer]] pipeline component | network text classifier on IMDB movie reviews, using spaCy's new
| on IMDB movie reviews. | #[+api("textcategorizer") #[code TextCategorizer]] component. The
| dataset will be loaded automatically via Thinc's built-in dataset
| loader. Predictions are available via
| #[+api("doc#attributes") #[code Doc.cats]].
+github("spacy", "examples/training/train_textcat.py") +github("spacy", "examples/training/train_textcat.py")
+h(4) Step by step guide
+list("numbers")
+item
| #[strong Load the model] you want to start with, or create an
| #[strong empty model] using
| #[+api("spacy#blank") #[code spacy.blank]] with the ID of your
| language. If you're using a blank model, don't forget to add the
| text classifier to the pipeline. If you're using an existing model,
| make sure to disable all other pipeline components during training
| using #[+api("language#disable_pipes") #[code nlp.disable_pipes]].
| This way, you'll only be training the text classifier.
+item
| #[strong Add the text classifier] to the pipeline, and add the labels
| you want to train, for example, #[code POSITIVE].
+item
| #[strong Load and pre-process the dataset], shuffle the data and
| split off a part of it to hold back for evaluation. This way, you'll
| be able to see results on each training iteration.
+item
| #[strong Loop over] the training examples, partition them into
| batches and create #[code Doc] and #[code GoldParse] objects for each
| example in the batch.
+item
| #[strong Update the model] by calling
| #[+api("language#update") #[code nlp.update]], which steps
| through the examples and makes a #[strong prediction]. It then
| consults the annotations provided on the #[code GoldParse] instance,
| to see whether it was right. If it was wrong, it adjusts its weights
| so that the correct prediction will score higher next time.
+item
| Optionally, you can also #[strong evaluate the text classifier] on
| each iteration, by checking how it performs on the development data
| held back from the dataset. This lets you print the
| #[strong precision], #[strong recall] and #[strong F-score].
+item
| #[strong Save] the trained model using
| #[+api("language#to_disk") #[code nlp.to_disk]].
+item
| #[strong Test] the model to make sure the text classifier works as
| expected.

View File

@ -113,9 +113,12 @@ include ../_includes/_mixins
+tag-new(2) +tag-new(2)
p p
| This example shows how to use and train spaCy's new | This example shows how to train a multi-label convolutional neural
| #[+api("textcategorizer") #[code TextCategorizer]] pipeline component | network text classifier on IMDB movie reviews, using spaCy's new
| on IMDB movie reviews. | #[+api("textcategorizer") #[code TextCategorizer]] component. The
| dataset will be loaded automatically via Thinc's built-in dataset
| loader. Predictions are available via
| #[+api("doc#attributes") #[code Doc.cats]].
+github("spacy", "examples/training/train_textcat.py") +github("spacy", "examples/training/train_textcat.py")

View File

@ -2,8 +2,4 @@
include ../_includes/_mixins include ../_includes/_mixins
+under-construction include _training/_textcat
+h(2, "example") Example
+github("spacy", "examples/training/train_textcat.py")