#!/usr/bin/env python
# coding: utf8
"""Define a text classification model using PyTorch, and wrap it with Thinc's
PyTorchWrapper class so that it can drive spaCy's TextCategorizer component.
The component is added to the pipeline, and predictions are available via
`doc.cats`. For more details, see the documentation:

* Deep learning: https://alpha.spacy.io/usage/deep-learning
* Text classification: https://alpha.spacy.io/usage/text-classification

Developed for: spaCy 2.0.0a19
Last updated for: spaCy 2.0.0a19
"""
from __future__ import unicode_literals, print_function

import plac
import random
from pathlib import Path
import thinc.extra.datasets
from thinc.extra.wrappers import PyTorchWrapper

import spacy
from spacy.pipeline import TextCategorizer
from spacy.gold import GoldParse, minibatch
from spacy.util import compounding


@plac.annotations(
    model=("Model name. Defaults to blank 'en' model.", "option", "m", str),
    output_dir=("Optional output directory", "option", "o", Path),
    n_texts=("Number of texts to train from", "option", "t", int),
    n_iter=("Number of training iterations", "option", "n", int))
def main(model=None, output_dir=None, n_iter=20, n_texts=2000):
    if model is not None:
        nlp = spacy.load(model)  # load existing spaCy model
        print("Loaded model '%s'" % model)
    else:
        nlp = spacy.blank('en')  # create blank Language class
        print("Created blank 'en' model")

    # Create the PyTorch neural network model and wrap it with Thinc. This
    # gives it the API that spaCy expects, so the wrapped model can be used
    # inside the built-in TextCategorizer component.
    pt_model = create_model()
    textcat = TextCategorizer(nlp.vocab, PyTorchWrapper(pt_model))
    nlp.add_pipe(textcat, last=True)

    # add label to text classifier
    textcat.add_label('POSITIVE')

    # load the IMDB dataset
    print("Loading IMDB data...")
    (train_texts, train_cats), (dev_texts, dev_cats) = load_data(limit=n_texts)
    print("Using %d training examples" % n_texts)
    train_docs = [nlp.tokenizer(text) for text in train_texts]
    train_gold = [GoldParse(doc, cats=cats) for doc, cats in
                  zip(train_docs, train_cats)]
    train_data = list(zip(train_docs, train_gold))

    # get names of other pipes to disable them during training
    other_pipes = [pipe for pipe in nlp.pipe_names if pipe != 'textcat']
    with nlp.disable_pipes(*other_pipes):  # only train textcat
        optimizer = nlp.begin_training()
        print("Training the model...")
        print('{:^5}\t{:^5}\t{:^5}\t{:^5}'.format('LOSS', 'P', 'R', 'F'))
        for i in range(n_iter):
            losses = {}
            # batch up the examples using spaCy's minibatch
            batches = minibatch(train_data, size=compounding(4., 32., 1.001))
            for batch in batches:
                docs, golds = zip(*batch)
                nlp.update(docs, golds, sgd=optimizer, drop=0.2,
                           losses=losses)
            with textcat.model.use_params(optimizer.averages):
                # evaluate on the dev data split off in load_data()
                scores = evaluate(nlp.tokenizer, textcat, dev_texts, dev_cats)
            print('{0:.3f}\t{1:.3f}\t{2:.3f}\t{3:.3f}'  # print a simple table
                  .format(losses['textcat'], scores['textcat_p'],
                          scores['textcat_r'], scores['textcat_f']))

    # test the trained model
    test_text = "This movie sucked"
    doc = nlp(test_text)
    print(test_text, doc.cats)

    if output_dir is not None:
        output_dir = Path(output_dir)
        if not output_dir.exists():
            output_dir.mkdir()
        nlp.to_disk(output_dir)
        print("Saved model to", output_dir)

        # test the saved model
        print("Loading from", output_dir)
        nlp2 = spacy.load(output_dir)
        doc2 = nlp2(test_text)
        print(test_text, doc2.cats)
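

# `create_model` is called above but was not defined in the original script.
# The stand-in below is a minimal illustrative sketch, not spaCy's or Thinc's
# API: it assumes the wrapped module receives a dense float batch of
# pre-computed document vectors (batch x input_width) and returns one
# probability per label. In a real pipeline, the module would need to be
# composed with a layer that turns Doc objects into such tensors; Thinc's
# PyTorchWrapper only handles converting arrays to and from torch tensors.
import torch.nn as nn


def create_model(input_width=300, hidden_width=64, nr_class=1):
    """Build a small feed-forward PyTorch classifier (illustrative only).

    The widths here are arbitrary placeholder values, not tuned settings.
    """
    return nn.Sequential(
        nn.Linear(input_width, hidden_width),  # project doc vectors down
        nn.ReLU(),                             # non-linearity
        nn.Linear(hidden_width, nr_class),     # one raw score per label
        nn.Sigmoid(),                          # squash scores into [0, 1]
    )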


def load_data(limit=0, split=0.8):
    """Load data from the IMDB dataset."""
    # Partition off part of the train data for evaluation
    train_data, _ = thinc.extra.datasets.imdb()
    random.shuffle(train_data)
    train_data = train_data[-limit:]
    texts, labels = zip(*train_data)
    cats = [{'POSITIVE': bool(y)} for y in labels]
    split = int(len(train_data) * split)
    return (texts[:split], cats[:split]), (texts[split:], cats[split:])


def evaluate(tokenizer, textcat, texts, cats):
    docs = (tokenizer(text) for text in texts)
    tp = 1e-8  # True positives
    fp = 1e-8  # False positives
    fn = 1e-8  # False negatives
    tn = 1e-8  # True negatives
    for i, doc in enumerate(textcat.pipe(docs)):
        gold = cats[i]
        for label, score in doc.cats.items():
            if label not in gold:
                continue
            if score >= 0.5 and gold[label] >= 0.5:
                tp += 1.
            elif score >= 0.5 and gold[label] < 0.5:
                fp += 1.
            elif score < 0.5 and gold[label] < 0.5:
                tn += 1.
            elif score < 0.5 and gold[label] >= 0.5:
                fn += 1.
    precision = tp / (tp + fp)
    recall = tp / (tp + fn)
    f_score = 2 * (precision * recall) / (precision + recall)
    return {'textcat_p': precision, 'textcat_r': recall,
            'textcat_f': f_score}


if __name__ == '__main__':
    plac.call(main)
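
# Usage sketch (the install pins and filename below are assumptions, not part
# of the original script):
#   pip install "spacy==2.0.0a19" thinc torch plac
#   python pytorch_textcat.py -t 2000 -n 20 -o /tmp/textcat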