spaCy/examples/training/train_pytorch_textcat.py
2017-11-06 02:40:44 +01:00

136 lines
5.1 KiB
Python

#!/usr/bin/env python
# coding: utf8
"""Define a text classification model using PyTorch, and wrap it with Thinc's
PytorchWrapper class, so it can be used in spaCy's TextCategorizer component.
The model is added to spacy.pipeline, and predictions are available via
`doc.cats`. For more details, see the documentation:
* Deep learning: https://alpha.spacy.io/usage/deep-learning
* Text classification: https://alpha.spacy.io/usage/text-classification
Developed for: spaCy 2.0.0a19
Last updated for: spaCy 2.0.0a19
"""
from __future__ import unicode_literals, print_function
import plac
import random
from pathlib import Path
import thinc.extra.datasets
import thinc.extra.wrappers
import spacy
from spacy.gold import GoldParse, minibatch
from spacy.util import compounding
@plac.annotations(
model=("Model name. Defaults to blank 'en' model.", "option", "m", str),
output_dir=("Optional output directory", "option", "o", Path),
n_texts=("Number of texts to train from", "option", "t", int),
n_iter=("Number of training iterations", "option", "n", int))
def main(model=None, output_dir=None, n_iter=20, n_texts=2000):
if model is not None:
nlp = spacy.load(model) # load existing spaCy model
print("Loaded model '%s'" % model)
else:
nlp = spacy.blank('en') # create blank Language class
print("Created blank 'en' model")
# Create the PyTorch neural network model, and wrap it with Thinc. This
# gives it the API that spaCy expects.
pt_model = create_model()
textcat = thinc.extra.wrappers.PyTorchWrapper(pt_model)
nlp.add_pipe(textcat, last=True)
# add label to text classifier
textcat.add_label('POSITIVE')
# load the IMBD dataset
print("Loading IMDB data...")
(train_texts, train_cats), (dev_texts, dev_cats) = load_data(limit=n_texts)
print("Using %d training examples" % n_texts)
train_docs = [nlp.tokenizer(text) for text in train_texts]
train_gold = [GoldParse(doc, cats=cats) for doc, cats in
zip(train_docs, train_cats)]
train_data = list(zip(train_docs, train_gold))
# get names of other pipes to disable them during training
other_pipes = [pipe for pipe in nlp.pipe_names if pipe != 'textcat']
with nlp.disable_pipes(*other_pipes): # only train textcat
optimizer = nlp.begin_training()
print("Training the model...")
print('{:^5}\t{:^5}\t{:^5}\t{:^5}'.format('LOSS', 'P', 'R', 'F'))
for i in range(n_iter):
losses = {}
# batch up the examples using spaCy's minibatch
batches = minibatch(train_data, size=compounding(4., 32., 1.001))
for batch in batches:
docs, golds = zip(*batch)
nlp.update(docs, golds, sgd=optimizer, drop=0.2, losses=losses)
with textcat.model.use_params(optimizer.averages):
# evaluate on the dev data split off in load_data()
scores = evaluate(nlp.tokenizer, textcat, dev_texts, dev_cats)
print('{0:.3f}\t{1:.3f}\t{2:.3f}\t{3:.3f}' # print a simple table
.format(losses['textcat'], scores['textcat_p'],
scores['textcat_r'], scores['textcat_f']))
# test the trained model
test_text = "This movie sucked"
doc = nlp(test_text)
print(test_text, doc.cats)
if output_dir is not None:
output_dir = Path(output_dir)
if not output_dir.exists():
output_dir.mkdir()
nlp.to_disk(output_dir)
print("Saved model to", output_dir)
# test the saved model
print("Loading from", output_dir)
nlp2 = spacy.load(output_dir)
doc2 = nlp2(test_text)
print(test_text, doc2.cats)
def load_data(limit=0, split=0.8):
"""Load data from the IMDB dataset."""
# Partition off part of the train data for evaluation
train_data, _ = thinc.extra.datasets.imdb()
random.shuffle(train_data)
train_data = train_data[-limit:]
texts, labels = zip(*train_data)
cats = [{'POSITIVE': bool(y)} for y in labels]
split = int(len(train_data) * split)
return (texts[:split], cats[:split]), (texts[split:], cats[split:])
def evaluate(tokenizer, textcat, texts, cats):
docs = (tokenizer(text) for text in texts)
tp = 1e-8 # True positives
fp = 1e-8 # False positives
fn = 1e-8 # False negatives
tn = 1e-8 # True negatives
for i, doc in enumerate(textcat.pipe(docs)):
gold = cats[i]
for label, score in doc.cats.items():
if label not in gold:
continue
if score >= 0.5 and gold[label] >= 0.5:
tp += 1.
elif score >= 0.5 and gold[label] < 0.5:
fp += 1.
elif score < 0.5 and gold[label] < 0.5:
tn += 1
elif score < 0.5 and gold[label] >= 0.5:
fn += 1
precision = tp / (tp + fp)
recall = tp / (tp + fn)
f_score = 2 * (precision * recall) / (precision + recall)
return {'textcat_p': precision, 'textcat_r': recall, 'textcat_f': f_score}
if __name__ == '__main__':
plac.call(main)