diff --git a/examples/training/train_new_entity_type.py b/examples/training/train_new_entity_type.py
index 1e9505d44..4eae11c75 100644
--- a/examples/training/train_new_entity_type.py
+++ b/examples/training/train_new_entity_type.py
@@ -1,4 +1,5 @@
 #!/usr/bin/env python
+# coding: utf8
 """
 Example of training an additional entity type
 
@@ -26,11 +27,11 @@ For more details, see the documentation:
 Developed for: spaCy 1.7.6
 Last tested for: spaCy 1.7.6
 """
-# coding: utf8
 from __future__ import unicode_literals, print_function
 
 import random
 from pathlib import Path
+import random
 
 import spacy
 from spacy.gold import GoldParse
@@ -43,14 +44,35 @@ def train_ner(nlp, train_data, output_dir):
         doc = nlp.make_doc(raw_text)
         for word in doc:
             _ = nlp.vocab[word.orth]
-
-    for itn in range(20):
+    random.seed(0)
+    # You may need to change the learning rate. It's generally difficult to
+    # guess what rate you should set, especially when you have limited data.
+    nlp.entity.model.learn_rate = 0.001
+    for itn in range(1000):
         random.shuffle(train_data)
+        loss = 0.
         for raw_text, entity_offsets in train_data:
             gold = GoldParse(doc, entities=entity_offsets)
+            # By default, the GoldParse class assumes that the entities
+            # described by offset are complete, and all other words should
+            # have the tag 'O'. You can tell it to make no assumptions
+            # about the tag of a word by giving it the tag '-'.
+            # However, this allows a trivial solution to the current
+            # learning problem: if words are either 'any tag' or 'ANIMAL',
+            # the model can learn that all words can be tagged 'ANIMAL'.
+            #for i in range(len(gold.ner)):
+            #if not gold.ner[i].endswith('ANIMAL'):
+            #    gold.ner[i] = '-'
             doc = nlp.make_doc(raw_text)
             nlp.tagger(doc)
-            loss = nlp.entity.update(doc, gold)
+            # As of 1.9, spaCy's parser now lets you supply a dropout probability
+            # This might help the model generalize better from only a few
+            # examples.
+            loss += nlp.entity.update(doc, gold, drop=0.9)
+        if loss == 0:
+            break
+    # This step averages the model's weights. This may or may not be good for
+    # your situation --- it's empirical.
     nlp.end_training()
     if output_dir:
         if not output_dir.exists():
@@ -80,13 +102,19 @@ def main(model_name, output_directory=None):
         (
             "they pretend to care about your feelings, those horses",
             [(48, 54, 'ANIMAL')]
+        ),
+        (
+            "horses?",
+            [(0, 6, 'ANIMAL')]
         )
+
     ]
     nlp.entity.add_label('ANIMAL')
     train_ner(nlp, train_data, output_directory)
 
     # Test that the entity is recognized
     doc = nlp('Do you like horses?')
+    print("Ents in 'Do you like horses?':")
     for ent in doc.ents:
         print(ent.label_, ent.text)
     if output_directory:
diff --git a/spacy/syntax/parser.pyx b/spacy/syntax/parser.pyx
index 57606dc76..b9de1e114 100644
--- a/spacy/syntax/parser.pyx
+++ b/spacy/syntax/parser.pyx
@@ -11,6 +11,8 @@ import ujson
 cimport cython
 cimport cython.parallel
 
+import numpy.random
+
 from cpython.ref cimport PyObject, Py_INCREF, Py_XDECREF
 from cpython.exc cimport PyErr_CheckSignals
 from libc.stdint cimport uint32_t, uint64_t
@@ -303,7 +305,7 @@ cdef class Parser:
             free(eg.is_valid)
         return 0
 
-    def update(self, Doc tokens, GoldParse gold, itn=0):
+    def update(self, Doc tokens, GoldParse gold, itn=0, double drop=0.0):
         """
         Update the statistical model.
@@ -325,9 +327,11 @@ cdef class Parser:
                             nr_feat=self.model.nr_feat)
         cdef weight_t loss = 0
         cdef Transition action
+        cdef double dropout_rate = self.cfg.get('dropout', drop)
         while not stcls.is_final():
             eg.c.nr_feat = self.model.set_featuresC(eg.c.atoms, eg.c.features,
                                                     stcls.c)
+            dropout(eg.c.features, eg.c.nr_feat, dropout_rate)
             self.moves.set_costs(eg.c.is_valid, eg.c.costs, stcls, gold)
             self.model.set_scoresC(eg.c.scores,
                 eg.c.features, eg.c.nr_feat)
             guess = VecVec.arg_max_if_true(eg.c.scores, eg.c.is_valid, eg.c.nr_class)
@@ -378,6 +382,18 @@ cdef class Parser:
                 self.cfg.setdefault('extra_labels', []).append(label)
 
 
+cdef int dropout(FeatureC* feats, int nr_feat, float prob) except -1:
+    if prob <= 0 or prob >= 1.:
+        return 0
+    cdef double[::1] py_probs = numpy.random.uniform(0., 1., nr_feat)
+    cdef double* probs = &py_probs[0]
+    for i in range(nr_feat):
+        if probs[i] >= prob:
+            feats[i].value /= prob
+        else:
+            feats[i].value = 0.
+
+
 cdef class StepwiseState:
     cdef readonly StateClass stcls
     cdef readonly Example eg
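
Note on the new dropout helper in spacy/syntax/parser.pyx: it draws one uniform sample per extracted feature, zeroes the feature's value when the sample falls below the drop probability, and divides the surviving values by that probability (the per-call `drop` argument is only a fallback; a 'dropout' entry in the parser's cfg takes precedence via `self.cfg.get('dropout', drop)`). A rough pure-Python sketch of that masking step, using a hypothetical `dropout_values` function over a plain list of floats rather than the C `FeatureC` array, might look like:

    import numpy

    def dropout_values(values, prob):
        # Sketch of the masking step added in parser.pyx: zero each feature
        # value with probability `prob` and rescale the survivors by dividing
        # by `prob`, as the patch's Cython helper does.
        if prob <= 0.0 or prob >= 1.0:
            return list(values)
        samples = numpy.random.uniform(0.0, 1.0, len(values))
        return [value / prob if sample >= prob else 0.0
                for value, sample in zip(values, samples)]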
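
The entity offsets in the example's training data are character offsets into the raw string, start-inclusive and end-exclusive, so they can be sanity-checked by slicing the text directly:

    text = "they pretend to care about your feelings, those horses"
    assert text[48:54] == 'horses'    # matches the (48, 54, 'ANIMAL') annotation
    assert "horses?"[0:6] == 'horses'  # matches the (0, 6, 'ANIMAL') annotation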