Add dropout option for parser and NER

Dropout can now be specified in the `Parser.update()` method via
the `drop` keyword argument, e.g.

    nlp.entity.update(doc, gold, drop=0.4)

This will randomly drop 40% of features, and multiply the value of the
others by 1. / 0.4. This may be useful for generalising from small data
sets.

This commit also patches the examples/training/train_new_entity_type.py
example, to use dropout and fix the output (previously it did not output
the learned entity).
This commit is contained in:
Matthew Honnibal 2017-04-27 13:18:39 +02:00
parent f0e1606d27
commit 2da16adcc2
2 changed files with 49 additions and 5 deletions

View File

@ -1,4 +1,5 @@
#!/usr/bin/env python #!/usr/bin/env python
# coding: utf8
""" """
Example of training an additional entity type Example of training an additional entity type
@ -26,11 +27,11 @@ For more details, see the documentation:
Developed for: spaCy 1.7.6 Developed for: spaCy 1.7.6
Last tested for: spaCy 1.7.6 Last tested for: spaCy 1.7.6
""" """
# coding: utf8
from __future__ import unicode_literals, print_function from __future__ import unicode_literals, print_function
import random import random
from pathlib import Path from pathlib import Path
import random
import spacy import spacy
from spacy.gold import GoldParse from spacy.gold import GoldParse
@ -43,14 +44,35 @@ def train_ner(nlp, train_data, output_dir):
doc = nlp.make_doc(raw_text) doc = nlp.make_doc(raw_text)
for word in doc: for word in doc:
_ = nlp.vocab[word.orth] _ = nlp.vocab[word.orth]
random.seed(0)
for itn in range(20): # You may need to change the learning rate. It's generally difficult to
# guess what rate you should set, especially when you have limited data.
nlp.entity.model.learn_rate = 0.001
for itn in range(1000):
random.shuffle(train_data) random.shuffle(train_data)
loss = 0.
for raw_text, entity_offsets in train_data: for raw_text, entity_offsets in train_data:
gold = GoldParse(doc, entities=entity_offsets) gold = GoldParse(doc, entities=entity_offsets)
# By default, the GoldParse class assumes that the entities
# described by offset are complete, and all other words should
# have the tag 'O'. You can tell it to make no assumptions
# about the tag of a word by giving it the tag '-'.
# However, this allows a trivial solution to the current
# learning problem: if words are either 'any tag' or 'ANIMAL',
# the model can learn that all words can be tagged 'ANIMAL'.
#for i in range(len(gold.ner)):
#if not gold.ner[i].endswith('ANIMAL'):
# gold.ner[i] = '-'
doc = nlp.make_doc(raw_text) doc = nlp.make_doc(raw_text)
nlp.tagger(doc) nlp.tagger(doc)
loss = nlp.entity.update(doc, gold) # As of 1.9, spaCy's parser now lets you supply a dropout probability
# This might help the model generalize better from only a few
# examples.
loss += nlp.entity.update(doc, gold, drop=0.9)
if loss == 0:
break
# This step averages the model's weights. This may or may not be good for
# your situation --- it's empirical.
nlp.end_training() nlp.end_training()
if output_dir: if output_dir:
if not output_dir.exists(): if not output_dir.exists():
@ -80,13 +102,19 @@ def main(model_name, output_directory=None):
( (
"they pretend to care about your feelings, those horses", "they pretend to care about your feelings, those horses",
[(48, 54, 'ANIMAL')] [(48, 54, 'ANIMAL')]
),
(
"horses?",
[(0, 6, 'ANIMAL')]
) )
] ]
nlp.entity.add_label('ANIMAL') nlp.entity.add_label('ANIMAL')
train_ner(nlp, train_data, output_directory) train_ner(nlp, train_data, output_directory)
# Test that the entity is recognized # Test that the entity is recognized
doc = nlp('Do you like horses?') doc = nlp('Do you like horses?')
print("Ents in 'Do you like horses?':")
for ent in doc.ents: for ent in doc.ents:
print(ent.label_, ent.text) print(ent.label_, ent.text)
if output_directory: if output_directory:

View File

@ -11,6 +11,8 @@ import ujson
cimport cython cimport cython
cimport cython.parallel cimport cython.parallel
import numpy.random
from cpython.ref cimport PyObject, Py_INCREF, Py_XDECREF from cpython.ref cimport PyObject, Py_INCREF, Py_XDECREF
from cpython.exc cimport PyErr_CheckSignals from cpython.exc cimport PyErr_CheckSignals
from libc.stdint cimport uint32_t, uint64_t from libc.stdint cimport uint32_t, uint64_t
@ -303,7 +305,7 @@ cdef class Parser:
free(eg.is_valid) free(eg.is_valid)
return 0 return 0
def update(self, Doc tokens, GoldParse gold, itn=0): def update(self, Doc tokens, GoldParse gold, itn=0, double drop=0.0):
""" """
Update the statistical model. Update the statistical model.
@ -325,9 +327,11 @@ cdef class Parser:
nr_feat=self.model.nr_feat) nr_feat=self.model.nr_feat)
cdef weight_t loss = 0 cdef weight_t loss = 0
cdef Transition action cdef Transition action
cdef double dropout_rate = self.cfg.get('dropout', drop)
while not stcls.is_final(): while not stcls.is_final():
eg.c.nr_feat = self.model.set_featuresC(eg.c.atoms, eg.c.features, eg.c.nr_feat = self.model.set_featuresC(eg.c.atoms, eg.c.features,
stcls.c) stcls.c)
dropout(eg.c.features, eg.c.nr_feat, dropout_rate)
self.moves.set_costs(eg.c.is_valid, eg.c.costs, stcls, gold) self.moves.set_costs(eg.c.is_valid, eg.c.costs, stcls, gold)
self.model.set_scoresC(eg.c.scores, eg.c.features, eg.c.nr_feat) self.model.set_scoresC(eg.c.scores, eg.c.features, eg.c.nr_feat)
guess = VecVec.arg_max_if_true(eg.c.scores, eg.c.is_valid, eg.c.nr_class) guess = VecVec.arg_max_if_true(eg.c.scores, eg.c.is_valid, eg.c.nr_class)
@ -378,6 +382,18 @@ cdef class Parser:
self.cfg.setdefault('extra_labels', []).append(label) self.cfg.setdefault('extra_labels', []).append(label)
cdef int dropout(FeatureC* feats, int nr_feat, float prob) except -1:
    """Apply dropout in place to the first `nr_feat` entries of `feats`.

    One sample is drawn per feature from numpy.random.uniform(0., 1.).
    A feature whose sample is below `prob` has its value set to 0; every
    other feature has its value divided by `prob`.

    feats: Feature array, modified in place.
    nr_feat: Number of entries of `feats` to process.
    prob: Dropout probability. Values outside the open interval (0, 1)
        disable dropout and leave `feats` untouched.
    Returns 0 on success; -1 is reserved for propagating a Python exception.
    """
    # Nothing to do when dropout is disabled. An empty feature array is also
    # skipped, so that element 0 of an empty sample buffer is never addressed.
    if nr_feat <= 0 or prob <= 0 or prob >= 1.:
        return 0
    cdef int i
    cdef double[::1] py_probs = numpy.random.uniform(0., 1., nr_feat)
    # Raw pointer into the sample buffer; py_probs stays alive for the loop.
    cdef double* probs = &py_probs[0]
    for i in range(nr_feat):
        if probs[i] >= prob:
            # NOTE(review): kept features are scaled by 1 / prob. Conventional
            # inverted dropout scales by 1 / (1 - prob) to preserve the
            # expected value -- confirm which scaling is intended.
            feats[i].value /= prob
        else:
            feats[i].value = 0.
    return 0
cdef class StepwiseState: cdef class StepwiseState:
cdef readonly StateClass stcls cdef readonly StateClass stcls
cdef readonly Example eg cdef readonly Example eg