Mirror of https://github.com/explosion/spaCy.git, synced 2024-12-25
Add dropout option for parser and NER

Dropout can now be specified in the `Parser.update()` method via the `drop` keyword argument, e.g. `nlp.entity.update(doc, gold, drop=0.4)`. This will randomly drop 40% of features and multiply the value of the others by 1. / 0.4. This may be useful for generalising from small data sets. This commit also patches the examples/training/train_new_entity_type.py example to use dropout and to fix the output (previously it did not print the learned entity).
parent f0e1606d27
commit 2da16adcc2
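Before the diff, for illustration, a minimal usage sketch of the new keyword (assuming a spaCy 1.x English model with the tagger and entity recognizer loaded; `train_data` here is a hypothetical toy data set):

    import random
    import spacy
    from spacy.gold import GoldParse

    nlp = spacy.load('en')
    nlp.entity.add_label('ANIMAL')
    train_data = [('Do you like horses?', [(12, 18, 'ANIMAL')])]  # hypothetical example

    for itn in range(5):
        random.shuffle(train_data)
        for raw_text, entity_offsets in train_data:
            doc = nlp.make_doc(raw_text)
            gold = GoldParse(doc, entities=entity_offsets)
            nlp.tagger(doc)
            # drop=0.4 randomly zeroes ~40% of the features for this update
            # and rescales the surviving ones, as described above
            loss = nlp.entity.update(doc, gold, drop=0.4)
    nlp.end_training()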
@@ -1,4 +1,5 @@
 #!/usr/bin/env python
+# coding: utf8
 """
 Example of training an additional entity type
 
@@ -26,11 +27,11 @@ For more details, see the documentation:
 Developed for: spaCy 1.7.6
 Last tested for: spaCy 1.7.6
 """
-# coding: utf8
 from __future__ import unicode_literals, print_function
 
+import random
 from pathlib import Path
 import random
 
 import spacy
 from spacy.gold import GoldParse
@@ -43,14 +44,35 @@ def train_ner(nlp, train_data, output_dir):
         doc = nlp.make_doc(raw_text)
         for word in doc:
             _ = nlp.vocab[word.orth]
-
-    for itn in range(20):
+    random.seed(0)
+    # You may need to change the learning rate. It's generally difficult to
+    # guess what rate you should set, especially when you have limited data.
+    nlp.entity.model.learn_rate = 0.001
+    for itn in range(1000):
         random.shuffle(train_data)
         loss = 0.
         for raw_text, entity_offsets in train_data:
             gold = GoldParse(doc, entities=entity_offsets)
+            # By default, the GoldParse class assumes that the entities
+            # described by offset are complete, and all other words should
+            # have the tag 'O'. You can tell it to make no assumptions
+            # about the tag of a word by giving it the tag '-'.
+            # However, this allows a trivial solution to the current
+            # learning problem: if words are either 'any tag' or 'ANIMAL',
+            # the model can learn that all words can be tagged 'ANIMAL'.
+            #for i in range(len(gold.ner)):
+                #if not gold.ner[i].endswith('ANIMAL'):
+                #    gold.ner[i] = '-'
             doc = nlp.make_doc(raw_text)
             nlp.tagger(doc)
-            loss = nlp.entity.update(doc, gold)
+            # As of 1.9, spaCy's parser now lets you supply a dropout probability
+            # This might help the model generalize better from only a few
+            # examples.
+            loss += nlp.entity.update(doc, gold, drop=0.9)
         if loss == 0:
             break
+    # This step averages the model's weights. This may or may not be good for
+    # your situation --- it's empirical.
+    nlp.end_training()
+    if output_dir:
+        if not output_dir.exists():
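For clarity, the masking that the commented-out lines above describe would look like this if enabled (it is left disabled in the example for the reason given in the comment; this assumes `gold.ner` holds per-token BILUO-style tag strings such as 'U-ANIMAL' or 'O'):

    gold = GoldParse(doc, entities=entity_offsets)
    for i in range(len(gold.ner)):
        if not gold.ner[i].endswith('ANIMAL'):
            # '-' tells the model to make no assumption about this token's tag
            gold.ner[i] = '-'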
@@ -80,13 +102,19 @@ def main(model_name, output_directory=None):
         (
             "they pretend to care about your feelings, those horses",
             [(48, 54, 'ANIMAL')]
         ),
+        (
+            "horses?",
+            [(0, 6, 'ANIMAL')]
+        )
+
     ]
     nlp.entity.add_label('ANIMAL')
     train_ner(nlp, train_data, output_directory)
 
     # Test that the entity is recognized
     doc = nlp('Do you like horses?')
+    print("Ents in 'Do you like horses?':")
     for ent in doc.ents:
         print(ent.label_, ent.text)
     if output_directory:
@@ -11,6 +11,8 @@ import ujson
 cimport cython
 cimport cython.parallel
 
+import numpy.random
+
 from cpython.ref cimport PyObject, Py_INCREF, Py_XDECREF
 from cpython.exc cimport PyErr_CheckSignals
 from libc.stdint cimport uint32_t, uint64_t
@@ -303,7 +305,7 @@ cdef class Parser:
         free(eg.is_valid)
         return 0
 
-    def update(self, Doc tokens, GoldParse gold, itn=0):
+    def update(self, Doc tokens, GoldParse gold, itn=0, double drop=0.0):
         """
         Update the statistical model.
 
@@ -325,9 +327,11 @@ cdef class Parser:
                             nr_feat=self.model.nr_feat)
         cdef weight_t loss = 0
         cdef Transition action
+        cdef double dropout_rate = self.cfg.get('dropout', drop)
         while not stcls.is_final():
             eg.c.nr_feat = self.model.set_featuresC(eg.c.atoms, eg.c.features,
                                                     stcls.c)
+            dropout(eg.c.features, eg.c.nr_feat, dropout_rate)
             self.moves.set_costs(eg.c.is_valid, eg.c.costs, stcls, gold)
             self.model.set_scoresC(eg.c.scores, eg.c.features, eg.c.nr_feat)
             guess = VecVec.arg_max_if_true(eg.c.scores, eg.c.is_valid, eg.c.nr_class)
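One detail from the hunk above: the rate is read as `self.cfg.get('dropout', drop)`, so a 'dropout' entry in the parser's cfg takes precedence over the per-call `drop` argument. A hypothetical sketch (assuming cfg behaves like a dict, as the `get`/`setdefault` calls in this file suggest):

    nlp.entity.cfg['dropout'] = 0.2            # hypothetical: set once, applies to every update
    nlp.entity.update(doc, gold)               # uses 0.2 from cfg
    nlp.entity.update(doc, gold, drop=0.5)     # cfg value still wins over the argument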
@@ -378,6 +382,18 @@ cdef class Parser:
         self.cfg.setdefault('extra_labels', []).append(label)
 
 
+cdef int dropout(FeatureC* feats, int nr_feat, float prob) except -1:
+    if prob <= 0 or prob >= 1.:
+        return 0
+    cdef double[::1] py_probs = numpy.random.uniform(0., 1., nr_feat)
+    cdef double* probs = &py_probs[0]
+    for i in range(nr_feat):
+        if probs[i] >= prob:
+            feats[i].value /= prob
+        else:
+            feats[i].value = 0.
+
+
 cdef class StepwiseState:
     cdef readonly StateClass stcls
     cdef readonly Example eg
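For reference, a rough pure-NumPy sketch of what the new `dropout` helper does to the feature values (a hypothetical standalone function, not part of the commit):

    import numpy

    def dropout_sketch(values, prob):
        # Each feature survives with probability (1 - prob); survivors are
        # rescaled by 1 / prob, matching the commit message's description.
        values = numpy.asarray(values, dtype='float64').copy()
        keep = numpy.random.uniform(0., 1., values.shape[0]) >= prob
        values[keep] /= prob
        values[~keep] = 0.
        return values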