Update textcat example

ines 2017-10-27 00:32:19 +02:00
parent 4eb5bd02e7
commit b61866a2e4


@@ -1,58 +1,119 @@
-'''Train a multi-label convolutional neural network text classifier,
-using the spacy.pipeline.TextCategorizer component. The model is then added
-to spacy.pipeline, and predictions are available at `doc.cats`.
-'''
-from __future__ import unicode_literals
+#!/usr/bin/env python
+# coding: utf8
+"""Train a multi-label convolutional neural network text classifier on the
+IMDB dataset, using the TextCategorizer component. The dataset will be loaded
+automatically via Thinc's built-in dataset loader. The model is then added to
+spacy.pipeline, and predictions are available via `doc.cats`.
+
+For more details, see the documentation:
+* Training: https://alpha.spacy.io/usage/training
+* Text classification: https://alpha.spacy.io/usage/text-classification
+
+Developed for: spaCy 2.0.0a18
+Last updated for: spaCy 2.0.0a18
+"""
+from __future__ import unicode_literals, print_function
 import plac
 import random
-import tqdm
-
-from thinc.neural.optimizers import Adam
-from thinc.neural.ops import NumpyOps
+from pathlib import Path
 import thinc.extra.datasets
-import spacy.lang.en
+
+import spacy
 from spacy.gold import GoldParse, minibatch
 from spacy.util import compounding
 from spacy.pipeline import TextCategorizer
+
+# TODO: Remove this once we're not supporting models trained with thinc <6.9.0
+import thinc.neural._classes.layernorm
+thinc.neural._classes.layernorm.set_compat_six_eight(False)
+
+
+@plac.annotations(
+    model=("Model name. Defaults to blank 'en' model.", "option", "m", str),
+    output_dir=("Optional output directory", "option", "o", Path),
+    n_iter=("Number of training iterations", "option", "n", int))
+def main(model=None, output_dir=None, n_iter=20):
+    if model is not None:
+        nlp = spacy.load(model)  # load existing spaCy model
+        print("Loaded model '%s'" % model)
+    else:
+        nlp = spacy.blank('en')  # create blank Language class
+        print("Created blank 'en' model")
 
-
-def train_textcat(tokenizer, textcat,
-                  train_texts, train_cats, dev_texts, dev_cats,
-                  n_iter=20):
-    '''
-    Train the TextCategorizer without associated pipeline.
-    '''
-    textcat.begin_training()
-    optimizer = Adam(NumpyOps(), 0.001)
-    train_docs = [tokenizer(text) for text in train_texts]
+    # add the text classifier to the pipeline if it doesn't exist
+    # nlp.create_pipe works for built-ins that are registered with spaCy
+    if 'textcat' not in nlp.pipe_names:
+        # textcat = nlp.create_pipe('textcat')
+        textcat = TextCategorizer(nlp.vocab, labels=['POSITIVE'])
+        nlp.add_pipe(textcat, first=True)
+    # otherwise, get it, so we can add labels to it
+    else:
+        textcat = nlp.get_pipe('textcat')
+
+    # add label to text classifier
+    # textcat.add_label('POSITIVE')
+
+    # load the IMDB dataset
+    print("Loading IMDB data...")
+    (train_texts, train_cats), (dev_texts, dev_cats) = load_data(limit=2000)
+    train_docs = [nlp.tokenizer(text) for text in train_texts]
     train_gold = [GoldParse(doc, cats=cats) for doc, cats in
                   zip(train_docs, train_cats)]
     train_data = list(zip(train_docs, train_gold))
-    batch_sizes = compounding(4., 128., 1.001)
-    for i in range(n_iter):
-        losses = {}
-        # Progress bar and minibatching
-        batches = minibatch(tqdm.tqdm(train_data, leave=False), size=batch_sizes)
-        for batch in batches:
-            docs, golds = zip(*batch)
-            textcat.update(docs, golds, sgd=optimizer, drop=0.2,
-                           losses=losses)
-        with textcat.model.use_params(optimizer.averages):
-            scores = evaluate(tokenizer, textcat, dev_texts, dev_cats)
-        yield losses['textcat'], scores
+
+    # get names of other pipes to disable them during training
+    other_pipes = [pipe for pipe in nlp.pipe_names if pipe != 'textcat']
+    with nlp.disable_pipes(*other_pipes):  # only train textcat
+        optimizer = nlp.begin_training(lambda: [])
+        print("Training the model...")
+        print('{:^5}\t{:^5}\t{:^5}\t{:^5}'.format('LOSS', 'P', 'R', 'F'))
+        for i in range(n_iter):
+            losses = {}
+            # batch up the examples using spaCy's minibatch
+            batches = minibatch(train_data, size=compounding(4., 128., 1.001))
+            for batch in batches:
+                docs, golds = zip(*batch)
+                nlp.update(docs, golds, sgd=optimizer, drop=0.2, losses=losses)
+            with textcat.model.use_params(optimizer.averages):
+                # evaluate on the dev data split off in load_data()
+                scores = evaluate(nlp.tokenizer, textcat, dev_texts, dev_cats)
+            # print a simple table row: loss, precision, recall, F-score
+            print('{0:.3f}\t{1:.3f}\t{2:.3f}\t{3:.3f}'
+                  .format(losses['textcat'], scores['textcat_p'],
+                          scores['textcat_r'], scores['textcat_f']))
+
+    # test the trained model
+    test_text = "This movie sucked"
+    doc = nlp(test_text)
+    print(test_text, doc.cats)
+
+    if output_dir is not None:
+        output_dir = Path(output_dir)
+        if not output_dir.exists():
+            output_dir.mkdir()
+        nlp.to_disk(output_dir)
+        print("Saved model to", output_dir)
+
+        # test the saved model
+        print("Loading from", output_dir)
+        nlp2 = spacy.load(output_dir)
+        doc2 = nlp2(test_text)
+        print(test_text, doc2.cats)
+
+
+def load_data(limit=0, split=0.8):
+    """Load data from the IMDB dataset."""
+    # Partition off part of the train data for evaluation
+    train_data, _ = thinc.extra.datasets.imdb()
+    random.shuffle(train_data)
+    train_data = train_data[-limit:]
+    texts, labels = zip(*train_data)
+    cats = [{'POSITIVE': bool(y)} for y in labels]
+    split = int(len(train_data) * split)
+    return (texts[:split], cats[:split]), (texts[split:], cats[split:])
 
 
 def evaluate(tokenizer, textcat, texts, cats):
     docs = (tokenizer(text) for text in texts)
     tp = 1e-8  # True positives
     fp = 1e-8  # False positives
     fn = 1e-8  # False negatives
     tn = 1e-8  # True negatives
     for i, doc in enumerate(textcat.pipe(docs)):
         gold = cats[i]
         for label, score in doc.cats.items():
@@ -66,55 +127,10 @@ def evaluate(tokenizer, textcat, texts, cats):
             tn += 1
         elif score < 0.5 and gold[label] >= 0.5:
             fn += 1
-    precis = tp / (tp + fp)
+    precision = tp / (tp + fp)
     recall = tp / (tp + fn)
-    fscore = 2 * (precis * recall) / (precis + recall)
-    return {'textcat_p': precis, 'textcat_r': recall, 'textcat_f': fscore}
-
-
-def load_data(limit=0):
-    # Partition off part of the train data --- avoid running experiments
-    # against test.
-    train_data, _ = thinc.extra.datasets.imdb()
-    random.shuffle(train_data)
-    train_data = train_data[-limit:]
-    texts, labels = zip(*train_data)
-    cats = [{'POSITIVE': bool(y)} for y in labels]
-    split = int(len(train_data) * 0.8)
-    train_texts = texts[:split]
-    train_cats = cats[:split]
-    dev_texts = texts[split:]
-    dev_cats = cats[split:]
-    return (train_texts, train_cats), (dev_texts, dev_cats)
-
-
-def main(model_loc=None):
-    nlp = spacy.lang.en.English()
-    tokenizer = nlp.tokenizer
-    textcat = TextCategorizer(tokenizer.vocab, labels=['POSITIVE'])
-    print("Load IMDB data")
-    (train_texts, train_cats), (dev_texts, dev_cats) = load_data(limit=2000)
-    print("Itn.\tLoss\tP\tR\tF")
-    progress = '{i:d} {loss:.3f} {textcat_p:.3f} {textcat_r:.3f} {textcat_f:.3f}'
-    for i, (loss, scores) in enumerate(train_textcat(tokenizer, textcat,
-                                                     train_texts, train_cats,
-                                                     dev_texts, dev_cats, n_iter=20)):
-        print(progress.format(i=i, loss=loss, **scores))
-    # How to save, load and use
-    nlp.pipeline.append(textcat)
-    if model_loc is not None:
-        nlp.to_disk(model_loc)
-        nlp = spacy.load(model_loc)
-        doc = nlp(u'This movie sucked!')
-        print(doc.cats)
+precision = tp / (tp + fp)
 
 
 if __name__ == '__main__':
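
A minimal usage sketch: assuming the script was run with an output directory
(the file name, path and flag values below are illustrative; -o and -n are the
short options declared in the plac annotations), the saved pipeline can be
loaded like any other spaCy model and the classifier's scores read from
`doc.cats`:

    # $ python train_textcat.py -o /tmp/textcat_model -n 20
    import spacy

    nlp = spacy.load('/tmp/textcat_model')  # illustrative output path
    doc = nlp(u"This movie sucked")
    # doc.cats maps each label to a score between 0 and 1,
    # e.g. {'POSITIVE': 0.07}
    print(doc.cats)

Since the component is created with the single label 'POSITIVE', a low score
stands in for the negative class; a second label could be registered the same
way if explicit NEGATIVE predictions are needed.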