spaCy/examples/training/train_textcat.py

#!/usr/bin/env python
# coding: utf8
"""Train a multi-label convolutional neural network text classifier on the
IMDB dataset, using the TextCategorizer component. The dataset will be loaded
automatically via Thinc's built-in dataset loader. The model is then added to
spacy.pipeline, and predictions are available via `doc.cats`.

For more details, see the documentation:
* Training: https://alpha.spacy.io/usage/training
* Text classification: https://alpha.spacy.io/usage/text-classification

Developed for: spaCy 2.0.0a18
Last updated for: spaCy 2.0.0a18
"""
from __future__ import unicode_literals, print_function
import plac
import random
from pathlib import Path
import thinc.extra.datasets

import spacy
from spacy.gold import GoldParse, minibatch
from spacy.util import compounding
from spacy.pipeline import TextCategorizer


@plac.annotations(
    model=("Model name. Defaults to blank 'en' model.", "option", "m", str),
    output_dir=("Optional output directory", "option", "o", Path),
    n_iter=("Number of training iterations", "option", "n", int))
def main(model=None, output_dir=None, n_iter=20):
    """Train a text classifier on IMDB data and optionally save the model.

    Loads (or creates) a pipeline, makes sure it has a 'textcat' component,
    trains that component only, prints loss / precision / recall / F-score
    after every iteration, and finally runs the model on a sample sentence.

    model (unicode): Name of an existing model to load. If None, a blank
        'en' pipeline is created instead.
    output_dir (Path): Directory to save the trained pipeline to. If None,
        nothing is written to disk.
    n_iter (int): Number of passes over the training data.
    """
    if model is not None:
        nlp = spacy.load(model)  # load existing spaCy model
        print("Loaded model '%s'" % model)
    else:
        nlp = spacy.blank('en')  # create blank Language class
        print("Created blank 'en' model")

    # add the text classifier to the pipeline if it doesn't exist
    # nlp.create_pipe works for built-ins that are registered with spaCy
    if 'textcat' not in nlp.pipe_names:
        # textcat = nlp.create_pipe('textcat')
        textcat = TextCategorizer(nlp.vocab, labels=['POSITIVE'])
        nlp.add_pipe(textcat, first=True)
    # otherwise, reuse the existing component
    # NOTE(review): no label is added on this path -- an existing textcat
    # is presumably expected to already know the 'POSITIVE' label; confirm.
    else:
        textcat = nlp.get_pipe('textcat')

    # add label to text classifier
    # textcat.add_label('POSITIVE')

    # load the IMDB dataset
    print("Loading IMDB data...")
    (train_texts, train_cats), (dev_texts, dev_cats) = load_data(limit=2000)
    train_docs = [nlp.tokenizer(text) for text in train_texts]
    train_gold = [GoldParse(doc, cats=cats) for doc, cats in
                  zip(train_docs, train_cats)]
    train_data = list(zip(train_docs, train_gold))

    # get names of other pipes to disable them during training
    other_pipes = [pipe for pipe in nlp.pipe_names if pipe != 'textcat']
    with nlp.disable_pipes(*other_pipes):  # only train textcat
        optimizer = nlp.begin_training(lambda: [])
        print("Training the model...")
        print('{:^5}\t{:^5}\t{:^5}\t{:^5}'.format('LOSS', 'P', 'R', 'F'))
        for _ in range(n_iter):
            losses = {}
            # batch up the examples using spaCy's minibatch
            batches = minibatch(train_data, size=compounding(4., 128., 1.001))
            for batch in batches:
                docs, golds = zip(*batch)
                nlp.update(docs, golds, sgd=optimizer, drop=0.2, losses=losses)
            # score with the optimizer's averaged parameters swapped in
            with textcat.model.use_params(optimizer.averages):
                # evaluate on the dev data split off in load_data()
                scores = evaluate(nlp.tokenizer, textcat, dev_texts, dev_cats)
            # Each column needs its own positional index; reusing {0} for
            # every field would print the loss four times.
            print('{0:.3f}\t{1:.3f}\t{2:.3f}\t{3:.3f}'  # print a simple table
                  .format(losses['textcat'], scores['textcat_p'],
                          scores['textcat_r'], scores['textcat_f']))

    # test the trained model
    test_text = "This movie sucked"
    doc = nlp(test_text)
    print(test_text, doc.cats)

    if output_dir is not None:
        output_dir = Path(output_dir)
        if not output_dir.exists():
            # parents=True so a nested output path does not fail
            output_dir.mkdir(parents=True)
        nlp.to_disk(output_dir)
        print("Saved model to", output_dir)

        # test the saved model
        print("Loading from", output_dir)
        nlp2 = spacy.load(output_dir)
        doc2 = nlp2(test_text)
        print(test_text, doc2.cats)


def load_data(limit=0, split=0.8):
    """Load a shuffled sample of the IMDB training data and split it in two.

    limit (int): Keep only the last `limit` examples after shuffling. With
        the default of 0 the slice `[-0:]` keeps every example.
    split (float): Fraction of the kept examples that goes into the first
        (training) portion; the remainder forms the evaluation portion.
    RETURNS (tuple): `((train_texts, train_cats), (dev_texts, dev_cats))`,
        where each cats entry is a dict `{'POSITIVE': bool}`.
    """
    # Only the first part returned by the loader is used; the evaluation
    # data is carved out of it below.
    examples, _ = thinc.extra.datasets.imdb()
    random.shuffle(examples)
    examples = examples[-limit:]
    texts, labels = zip(*examples)
    cats = [{'POSITIVE': bool(label)} for label in labels]
    cutoff = int(len(examples) * split)
    train_part = (texts[:cutoff], cats[:cutoff])
    dev_part = (texts[cutoff:], cats[cutoff:])
    return train_part, dev_part


def evaluate(tokenizer, textcat, texts, cats):
    """Compute precision, recall and F-score of `textcat` on labelled texts.

    tokenizer (callable): Turns a text into the object `textcat.pipe` expects.
    textcat: Object whose `pipe()` yields docs exposing a `cats` dict that
        maps label -> score.
    texts (iterable): Raw texts to score.
    cats (sequence): Gold annotations, indexable in the same order as
        `texts`; each entry maps label -> value, where a value >= 0.5 is
        treated as positive.
    RETURNS (dict): Scores under the keys 'textcat_p', 'textcat_r' and
        'textcat_f'.
    """
    # Counters start at a tiny epsilon so the ratios below can never divide
    # by zero, even when nothing was counted.
    true_pos = false_pos = false_neg = true_neg = 1e-8
    doc_stream = textcat.pipe(tokenizer(text) for text in texts)
    for index, doc in enumerate(doc_stream):
        expected = cats[index]
        for label, score in doc.cats.items():
            if label not in expected:
                continue  # no gold value for this label: nothing to compare
            truth = expected[label]
            if score >= 0.5:
                if truth >= 0.5:
                    true_pos += 1.
                elif truth < 0.5:
                    false_pos += 1.
            elif score < 0.5:
                if truth < 0.5:
                    true_neg += 1
                elif truth >= 0.5:
                    false_neg += 1
    precision = true_pos / (true_pos + false_pos)
    recall = true_pos / (true_pos + false_neg)
    f_score = 2 * (precision * recall) / (precision + recall)
    return {'textcat_p': precision, 'textcat_r': recall, 'textcat_f': f_score}


# Script entry point: when run directly, hand main() to plac, which calls it
# with the command-line arguments declared in its @plac.annotations.
if __name__ == '__main__':
    plac.call(main)
Update textcat example 2017-10-27 01:32:19 +03:00			`#!/usr/bin/env python`
			`# coding: utf8`
			`"""Train a multi-label convolutional neural network text classifier on the`
			`IMDB dataset, using the TextCategorizer component. The dataset will be loaded`
			`automatically via Thinc's built-in dataset loader. The model is then added to`
			spacy.pipeline, and predictions are available via `doc.cats`.

			`For more details, see the documentation:`
			`* Training: https://alpha.spacy.io/usage/training`
			`* Text classification: https://alpha.spacy.io/usage/text-classification`

			`Developed for: spaCy 2.0.0a18`
			`Last updated for: spaCy 2.0.0a18`
			`"""`
			`from __future__ import unicode_literals, print_function`
Add example for training text classifier 2017-07-22 21:15:32 +03:00			`import plac`
			`import random`
Update textcat example 2017-10-27 01:32:19 +03:00			`from pathlib import Path`
Add example for training text classifier 2017-07-22 21:15:32 +03:00			`import thinc.extra.datasets`

Update textcat example 2017-10-27 01:32:19 +03:00			`import spacy`
Add example for training text classifier 2017-07-22 21:15:32 +03:00			`from spacy.gold import GoldParse, minibatch`
			`from spacy.util import compounding`
			`from spacy.pipeline import TextCategorizer`

Update textcat example 2017-10-04 16:12:28 +03:00
Update textcat example 2017-10-27 01:32:19 +03:00			`@plac.annotations(`
			`model=("Model name. Defaults to blank 'en' model.", "option", "m", str),`
			`output_dir=("Optional output directory", "option", "o", Path),`
			`n_iter=("Number of training iterations", "option", "n", int))`
			`def main(model=None, output_dir=None, n_iter=20):`
			`if model is not None:`
			`nlp = spacy.load(model) # load existing spaCy model`
			`print("Loaded model '%s'" % model)`
			`else:`
			`nlp = spacy.blank('en') # create blank Language class`
			`print("Created blank 'en' model")`

			`# add the text classifier to the pipeline if it doesn't exist`
			`# nlp.create_pipe works for built-ins that are registered with spaCy`
			`if 'textcat' not in nlp.pipe_names:`
			`# textcat = nlp.create_pipe('textcat')`
			`textcat = TextCategorizer(nlp.vocab, labels=['POSITIVE'])`
			`nlp.add_pipe(textcat, first=True)`
			`# otherwise, get it, so we can add labels to it`
			`else:`
			`textcat = nlp.get_pipe('textcat')`

			`# add label to text classifier`
			`# textcat.add_label('POSITIVE')`

			`# load the IMDB dataset`
			`print("Loading IMDB data...")`
			`(train_texts, train_cats), (dev_texts, dev_cats) = load_data(limit=2000)`
			`train_docs = [nlp.tokenizer(text) for text in train_texts]`
Add example for training text classifier 2017-07-22 21:15:32 +03:00			`train_gold = [GoldParse(doc, cats=cats) for doc, cats in`
			`zip(train_docs, train_cats)]`
Update textcat example 2017-10-04 16:12:28 +03:00			`train_data = list(zip(train_docs, train_gold))`
Update textcat example 2017-10-27 01:32:19 +03:00
			`# get names of other pipes to disable them during training`
			`other_pipes = [pipe for pipe in nlp.pipe_names if pipe != 'textcat']`
			`with nlp.disable_pipes(*other_pipes): # only train textcat`
			`optimizer = nlp.begin_training(lambda: [])`
			`print("Training the model...")`
			`print('{:^5}\t{:^5}\t{:^5}\t{:^5}'.format('LOSS', 'P', 'R', 'F'))`
			`for i in range(n_iter):`
			`losses = {}`
			`# batch up the examples using spaCy's minibatch`
			`batches = minibatch(train_data, size=compounding(4., 128., 1.001))`
			`for batch in batches:`
			`docs, golds = zip(*batch)`
			`nlp.update(docs, golds, sgd=optimizer, drop=0.2, losses=losses)`
			`with textcat.model.use_params(optimizer.averages):`
			`# evaluate on the dev data split off in load_data()`
			`scores = evaluate(nlp.tokenizer, textcat, dev_texts, dev_cats)`
			`print('{0:.3f}\t{0:.3f}\t{0:.3f}\t{0:.3f}' # print a simple table`
			`.format(losses['textcat'], scores['textcat_p'],`
			`scores['textcat_r'], scores['textcat_f']))`

			`# test the trained model`
			`test_text = "This movie sucked"`
			`doc = nlp(test_text)`
			`print(test_text, doc.cats)`

			`if output_dir is not None:`
			`output_dir = Path(output_dir)`
			`if not output_dir.exists():`
			`output_dir.mkdir()`
			`nlp.to_disk(output_dir)`
			`print("Saved model to", output_dir)`

			`# test the saved model`
			`print("Loading from", output_dir)`
			`nlp2 = spacy.load(output_dir)`
			`doc2 = nlp2(test_text)`
			`print(test_text, doc2.cats)`


			`def load_data(limit=0, split=0.8):`
			`"""Load data from the IMDB dataset."""`
			`# Partition off part of the train data for evaluation`
			`train_data, _ = thinc.extra.datasets.imdb()`
			`random.shuffle(train_data)`
			`train_data = train_data[-limit:]`
			`texts, labels = zip(*train_data)`
			`cats = [{'POSITIVE': bool(y)} for y in labels]`
			`split = int(len(train_data) * split)`
			`return (texts[:split], cats[:split]), (texts[split:], cats[split:])`
Add example for training text classifier 2017-07-22 21:15:32 +03:00

			`def evaluate(tokenizer, textcat, texts, cats):`
			`docs = (tokenizer(text) for text in texts)`
Update textcat example 2017-10-27 01:32:19 +03:00			`tp = 1e-8 # True positives`
			`fp = 1e-8 # False positives`
			`fn = 1e-8 # False negatives`
			`tn = 1e-8 # True negatives`
Add example for training text classifier 2017-07-22 21:15:32 +03:00			`for i, doc in enumerate(textcat.pipe(docs)):`
			`gold = cats[i]`
			`for label, score in doc.cats.items():`
Fix multi-label support for text classification The TextCategorizer class is supposed to support multi-label text classification, and allow training data to contain missing values. For this to work, the gradient of the loss should be 0 when labels are missing. Instead, there was no way to actually denote "missing" in the GoldParse class, and so the TextCategorizer class treated the label set within gold.cats as complete. To fix this, we change GoldParse.cats to be a dict instead of a list. The GoldParse.cats dict should map to floats, with 1. denoting 'present' and 0. denoting 'absent'. Gradients are zeroed for categories absent from the gold.cats dict. A nice bonus is that you can also set values between 0 and 1 for partial membership. You can also set numeric values, if you're using a text classification model that uses an appropriate loss function. Unfortunately this is a breaking change; although the functionality was only recently introduced and hasn't been properly documented yet. I've updated the example script accordingly. 2017-10-06 02:43:02 +03:00			`if label not in gold:`
			`continue`
			`if score >= 0.5 and gold[label] >= 0.5:`
Add example for training text classifier 2017-07-22 21:15:32 +03:00			`tp += 1.`
Fix multi-label support for text classification The TextCategorizer class is supposed to support multi-label text classification, and allow training data to contain missing values. For this to work, the gradient of the loss should be 0 when labels are missing. Instead, there was no way to actually denote "missing" in the GoldParse class, and so the TextCategorizer class treated the label set within gold.cats as complete. To fix this, we change GoldParse.cats to be a dict instead of a list. The GoldParse.cats dict should map to floats, with 1. denoting 'present' and 0. denoting 'absent'. Gradients are zeroed for categories absent from the gold.cats dict. A nice bonus is that you can also set values between 0 and 1 for partial membership. You can also set numeric values, if you're using a text classification model that uses an appropriate loss function. Unfortunately this is a breaking change; although the functionality was only recently introduced and hasn't been properly documented yet. I've updated the example script accordingly. 2017-10-06 02:43:02 +03:00			`elif score >= 0.5 and gold[label] < 0.5:`
Add example for training text classifier 2017-07-22 21:15:32 +03:00			`fp += 1.`
Fix multi-label support for text classification The TextCategorizer class is supposed to support multi-label text classification, and allow training data to contain missing values. For this to work, the gradient of the loss should be 0 when labels are missing. Instead, there was no way to actually denote "missing" in the GoldParse class, and so the TextCategorizer class treated the label set within gold.cats as complete. To fix this, we change GoldParse.cats to be a dict instead of a list. The GoldParse.cats dict should map to floats, with 1. denoting 'present' and 0. denoting 'absent'. Gradients are zeroed for categories absent from the gold.cats dict. A nice bonus is that you can also set values between 0 and 1 for partial membership. You can also set numeric values, if you're using a text classification model that uses an appropriate loss function. Unfortunately this is a breaking change; although the functionality was only recently introduced and hasn't been properly documented yet. I've updated the example script accordingly. 2017-10-06 02:43:02 +03:00			`elif score < 0.5 and gold[label] < 0.5:`
Add example for training text classifier 2017-07-22 21:15:32 +03:00			`tn += 1`
Fix multi-label support for text classification The TextCategorizer class is supposed to support multi-label text classification, and allow training data to contain missing values. For this to work, the gradient of the loss should be 0 when labels are missing. Instead, there was no way to actually denote "missing" in the GoldParse class, and so the TextCategorizer class treated the label set within gold.cats as complete. To fix this, we change GoldParse.cats to be a dict instead of a list. The GoldParse.cats dict should map to floats, with 1. denoting 'present' and 0. denoting 'absent'. Gradients are zeroed for categories absent from the gold.cats dict. A nice bonus is that you can also set values between 0 and 1 for partial membership. You can also set numeric values, if you're using a text classification model that uses an appropriate loss function. Unfortunately this is a breaking change; although the functionality was only recently introduced and hasn't been properly documented yet. I've updated the example script accordingly. 2017-10-06 02:43:02 +03:00			`elif score < 0.5 and gold[label] >= 0.5:`
Add example for training text classifier 2017-07-22 21:15:32 +03:00			`fn += 1`
Update textcat example 2017-10-27 01:32:19 +03:00			`precision = tp / (tp + fp)`
Add example for training text classifier 2017-07-22 21:15:32 +03:00			`recall = tp / (tp + fn)`
Update textcat example 2017-10-27 01:32:19 +03:00			`f_score = 2 * (precision * recall) / (precision + recall)`
			`return {'textcat_p': precision, 'textcat_r': recall, 'textcat_f': f_score}`
Finish text classifier example 2017-07-23 01:34:12 +03:00
Add example for training text classifier 2017-07-22 21:15:32 +03:00
			`if __name__ == '__main__':`
			`plac.call(main)`