Add example for training text classifier

2025-12-22 17:43:13 +03:00 · 2017-07-22 20:15:32 +02:00 · 2017-07-22 20:15:32 +02:00 · 2bc7d87c70
commit 2bc7d87c70
parent 9bae0ddc50
1 changed files with 101 additions and 0 deletions
--- a/examples/training/train_textcat.py
+++ b/examples/training/train_textcat.py
@ -0,0 +1,101 @@
 from __future__ import unicode_literals
 import plac
 import random
 import tqdm
 from thinc.neural.optimizers import Adam
 from thinc.neural.ops import NumpyOps
 import thinc.extra.datasets
 import spacy.lang.en
 from spacy.gold import GoldParse, minibatch
 from spacy.util import compounding
 from spacy.pipeline import TextCategorizer
 def train_textcat(tokenizer, textcat,
                  train_texts, train_cats, dev_texts, dev_cats,
                  n_iter=20):
    '''
    Train the TextCategorizer without associated pipeline.
    '''
    textcat.begin_training()
    optimizer = Adam(NumpyOps(), 0.001)
    train_docs = [tokenizer(text) for text in train_texts]
    train_gold = [GoldParse(doc, cats=cats) for doc, cats in
                  zip(train_docs, train_cats)]
    train_data = zip(train_docs, train_gold)
    batch_sizes = compounding(4., 128., 1.001)
    for i in range(n_iter):
        losses = {}
        for batch in minibatch(tqdm.tqdm(train_data, leave=False),
                               size=batch_sizes):
            docs, golds = zip(*batch)
            textcat.update((docs, None), golds, sgd=optimizer, drop=0.2,
                losses=losses)
        with textcat.model.use_params(optimizer.averages):
            scores = evaluate(tokenizer, textcat, dev_texts, dev_cats)
        yield losses['textcat'], scores
 def evaluate(tokenizer, textcat, texts, cats):
    docs = (tokenizer(text) for text in texts)
    tp = 1e-8 # True positives
    fp = 1e-8 # False positives
    fn = 1e-8 # False negatives
    tn = 1e-8 # True negatives
    for i, doc in enumerate(textcat.pipe(docs)):
        gold = cats[i]
        for label, score in doc.cats.items():
            if score >= 0.5 and label in gold:
                tp += 1.
            elif score >= 0.5 and label not in gold:
                fp += 1.
            elif score < 0.5 and label not in gold:
                tn += 1
            if score < 0.5 and label in gold:
                fn += 1
    precis = tp / (tp + fp)
    recall = tp / (tp + fn)
    fscore = 2 * (precis * recall) / (precis + recall)
    return {'textcat_p': precis, 'textcat_r': recall, 'textcat_f': fscore}  
 def load_data():
    # Partition off part of the train data --- avoid running experiments
    # against test.
    train_data, _ = thinc.extra.datasets.imdb()
    random.shuffle(train_data)
    texts, labels = zip(*train_data)
    cats = [(['POSITIVE'] if y else []) for y in labels]
    split = int(len(train_data) * 0.8)
    train_texts = texts[:split]
    train_cats = cats[:split]
    dev_texts = texts[split:]
    dev_cats = cats[split:]
    return (train_texts, train_cats), (dev_texts, dev_cats)
 def main():
    nlp = spacy.lang.en.English()
    tokenizer = nlp.tokenizer
    textcat = TextCategorizer(tokenizer.vocab, labels=['POSITIVE'])
    print("Load IMDB data")
    (train_texts, train_cats), (dev_texts, dev_cats) = load_data()
    print("Itn.\tLoss\tP\tR\tF")
    progress = '{i:d} {loss:.3f} {textcat_p:.3f} {textcat_r:.3f} {textcat_f:.3f}'
    for i, (loss, scores) in enumerate(train_textcat(tokenizer, textcat,
                                       train_texts, train_cats,
                                       dev_texts, dev_cats, n_iter=20)):
        print(progress.format(i=i, loss=loss, **scores))
 if __name__ == '__main__':
    plac.call(main)