diff --git a/examples/training/train_textcat.py b/examples/training/train_textcat.py index 852635075..6fa79e75b 100644 --- a/examples/training/train_textcat.py +++ b/examples/training/train_textcat.py @@ -26,8 +26,9 @@ from spacy.pipeline import TextCategorizer @plac.annotations( model=("Model name. Defaults to blank 'en' model.", "option", "m", str), output_dir=("Optional output directory", "option", "o", Path), + n_texts=("Number of texts to train from", "option", "N", int), n_iter=("Number of training iterations", "option", "n", int)) -def main(model=None, output_dir=None, n_iter=20): +def main(model=None, output_dir=None, n_iter=20, n_texts=2000): if model is not None: nlp = spacy.load(model) # load existing spaCy model print("Loaded model '%s'" % model) @@ -50,7 +51,8 @@ def main(model=None, output_dir=None, n_iter=20): # load the IMBD dataset print("Loading IMDB data...") - (train_texts, train_cats), (dev_texts, dev_cats) = load_data(limit=2000) + print("Using %d training examples" % n_texts) + (train_texts, train_cats), (dev_texts, dev_cats) = load_data(limit=n_texts) train_docs = [nlp.tokenizer(text) for text in train_texts] train_gold = [GoldParse(doc, cats=cats) for doc, cats in zip(train_docs, train_cats)] @@ -65,14 +67,14 @@ def main(model=None, output_dir=None, n_iter=20): for i in range(n_iter): losses = {} # batch up the examples using spaCy's minibatch - batches = minibatch(train_data, size=compounding(4., 128., 1.001)) + batches = minibatch(train_data, size=compounding(4., 32., 1.001)) for batch in batches: docs, golds = zip(*batch) nlp.update(docs, golds, sgd=optimizer, drop=0.2, losses=losses) with textcat.model.use_params(optimizer.averages): # evaluate on the dev data split off in load_data() scores = evaluate(nlp.tokenizer, textcat, dev_texts, dev_cats) - print('{0:.3f}\t{0:.3f}\t{0:.3f}\t{0:.3f}' # print a simple table + print('{0:.3f}\t{1:.3f}\t{2:.3f}\t{3:.3f}' # print a simple table .format(losses['textcat'], scores['textcat_p'], 
scores['textcat_r'], scores['textcat_f'])) diff --git a/spacy/_ml.py b/spacy/_ml.py index 6bfacb20a..89e3d8ac6 100644 --- a/spacy/_ml.py +++ b/spacy/_ml.py @@ -434,7 +434,7 @@ def build_text_classifier(nr_class, width=64, **cfg): pretrained_dims = cfg.get('pretrained_dims', 0) with Model.define_operators({'>>': chain, '+': add, '|': concatenate, '**': clone}): - if cfg.get('low_data'): + if cfg.get('low_data') and pretrained_dims: model = ( SpacyVectors >> flatten_add_lengths diff --git a/spacy/pipeline.pyx b/spacy/pipeline.pyx index 842e27069..a2321d1ad 100644 --- a/spacy/pipeline.pyx +++ b/spacy/pipeline.pyx @@ -11,9 +11,9 @@ import ujson import msgpack from thinc.api import chain -from thinc.v2v import Softmax +from thinc.v2v import Affine, Softmax from thinc.t2v import Pooling, max_pool, mean_pool -from thinc.neural.util import to_categorical +from thinc.neural.util import to_categorical, copy_array from thinc.neural._classes.difference import Siamese, CauchySimilarity from .tokens.doc cimport Doc @@ -130,6 +130,15 @@ class Pipe(object): documents and their predicted scores.""" raise NotImplementedError + def add_label(self, label): + """Add an output label, to be predicted by the model. + + It's possible to extend pre-trained models with new labels, + but care should be taken to avoid the "catastrophic forgetting" + problem. + """ + raise NotImplementedError + def begin_training(self, gold_tuples=tuple(), pipeline=None): """Initialize the pipe for training, using data exampes if available. 
If no model has been initialized yet, the model is added.""" @@ -325,6 +334,14 @@ class Tagger(Pipe): self.cfg.setdefault('pretrained_dims', self.vocab.vectors.data.shape[1]) + @property + def labels(self): + return self.cfg.setdefault('tag_names', []) + + @labels.setter + def labels(self, value): + self.cfg['tag_names'] = value + def __call__(self, doc): tags = self.predict([doc]) self.set_annotations([doc], tags) @@ -352,6 +369,7 @@ class Tagger(Pipe): cdef Doc doc cdef int idx = 0 cdef Vocab vocab = self.vocab + tags = list(self.labels) for i, doc in enumerate(docs): doc_tag_ids = batch_tag_ids[i] if hasattr(doc_tag_ids, 'get'): @@ -359,7 +377,7 @@ class Tagger(Pipe): for j, tag_id in enumerate(doc_tag_ids): # Don't clobber preset POS tags if doc.c[j].tag == 0 and doc.c[j].pos == 0: - vocab.morphology.assign_tag_id(&doc.c[j], tag_id) + vocab.morphology.assign_tag(&doc.c[j], tags[tag_id]) idx += 1 doc.is_tagged = True @@ -420,6 +438,17 @@ class Tagger(Pipe): def Model(cls, n_tags, **cfg): return build_tagger_model(n_tags, **cfg) + def add_label(self, label): + if label in self.labels: + return 0 + smaller = self.model[-1]._layers[-1] + larger = Softmax(len(self.labels)+1, smaller.nI) + copy_array(larger.W[:smaller.nO], smaller.W) + copy_array(larger.b[:smaller.nO], smaller.b) + self.model[-1]._layers[-1] = larger + self.labels.append(label) + return 1 + def use_params(self, params): with self.model.use_params(params): yield @@ -675,7 +704,7 @@ class TextCategorizer(Pipe): @property def labels(self): - return self.cfg.get('labels', ['LABEL']) + return self.cfg.setdefault('labels', ['LABEL']) @labels.setter def labels(self, value): @@ -727,6 +756,17 @@ class TextCategorizer(Pipe): mean_square_error = ((scores-truths)**2).sum(axis=1).mean() return mean_square_error, d_scores + def add_label(self, label): + if label in self.labels: + return 0 + smaller = self.model[-1]._layers[-1] + larger = Affine(len(self.labels)+1, smaller.nI) + copy_array(larger.W[:smaller.nO], 
smaller.W) + copy_array(larger.b[:smaller.nO], smaller.b) + self.model[-1]._layers[-1] = larger + self.labels.append(label) + return 1 + def begin_training(self, gold_tuples=tuple(), pipeline=None): if pipeline and getattr(pipeline[0], 'name', None) == 'tensorizer': token_vector_width = pipeline[0].model.nO