Mirror of https://github.com/explosion/spaCy.git
Add dropout option for parser and NER
Dropout can now be specified in the `Parser.update()` method via the `drop` keyword argument, e.g. `nlp.entity.update(doc, gold, drop=0.4)`. This randomly drops 40% of features and multiplies the value of the survivors by 1. / 0.4. This may be useful for generalising from small data sets. This commit also patches the examples/training/train_new_entity_type.py example to use dropout and to fix the output (previously it did not print the learned entity).
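For orientation, the call pattern described above looks like this in the spaCy 1.x API. This is a minimal sketch assembled from the patched example below, not code shipped in the commit:

    import spacy
    from spacy.gold import GoldParse

    nlp = spacy.load('en')
    nlp.entity.add_label('ANIMAL')
    doc = nlp.make_doc('Do you like horses?')
    nlp.tagger(doc)
    # 'horses' spans characters 12-18 of the text above (end-exclusive).
    gold = GoldParse(doc, entities=[(12, 18, 'ANIMAL')])
    # drop=0.4 zeroes a random 40% of features for this update and
    # rescales the survivors, as the message describes.
    loss = nlp.entity.update(doc, gold, drop=0.4)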
Parent: f0e1606d27
Commit: 2da16adcc2
examples/training/train_new_entity_type.py
@@ -1,4 +1,5 @@
 #!/usr/bin/env python
+# coding: utf8
 """
 Example of training an additional entity type
 
@@ -26,11 +27,11 @@ For more details, see the documentation:
 Developed for: spaCy 1.7.6
 Last tested for: spaCy 1.7.6
 """
-# coding: utf8
 from __future__ import unicode_literals, print_function
 
 import random
 from pathlib import Path
+import random
 
 import spacy
 from spacy.gold import GoldParse
@@ -43,14 +44,35 @@ def train_ner(nlp, train_data, output_dir):
         doc = nlp.make_doc(raw_text)
         for word in doc:
             _ = nlp.vocab[word.orth]
-    for itn in range(20):
+    random.seed(0)
+    # You may need to change the learning rate. It's generally difficult to
+    # guess what rate you should set, especially when you have limited data.
+    nlp.entity.model.learn_rate = 0.001
+    for itn in range(1000):
         random.shuffle(train_data)
+        loss = 0.
         for raw_text, entity_offsets in train_data:
             gold = GoldParse(doc, entities=entity_offsets)
+            # By default, the GoldParse class assumes that the entities
+            # described by offset are complete, and all other words should
+            # have the tag 'O'. You can tell it to make no assumptions
+            # about the tag of a word by giving it the tag '-'.
+            # However, this allows a trivial solution to the current
+            # learning problem: if words are either 'any tag' or 'ANIMAL',
+            # the model can learn that all words can be tagged 'ANIMAL'.
+            #for i in range(len(gold.ner)):
+            #    if not gold.ner[i].endswith('ANIMAL'):
+            #        gold.ner[i] = '-'
             doc = nlp.make_doc(raw_text)
             nlp.tagger(doc)
-            loss = nlp.entity.update(doc, gold)
+            # As of 1.9, spaCy's parser now lets you supply a dropout probability
+            # This might help the model generalize better from only a few
+            # examples.
+            loss += nlp.entity.update(doc, gold, drop=0.9)
+        if loss == 0:
+            break
+    # This step averages the model's weights. This may or may not be good for
+    # your situation --- it's empirical.
     nlp.end_training()
     if output_dir:
         if not output_dir.exists():
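The masking that the new comments describe, and deliberately leave disabled, would read as follows if switched on. A sketch only; as the comments warn, it admits the degenerate solution of tagging every word ANIMAL:

    # Hypothetical: mark every non-ANIMAL token '-' (no constraint)
    # instead of the default 'O' (definitely outside an entity).
    for i in range(len(gold.ner)):
        if not gold.ner[i].endswith('ANIMAL'):
            gold.ner[i] = '-'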
@@ -80,13 +102,19 @@ def main(model_name, output_directory=None):
         (
             "they pretend to care about your feelings, those horses",
             [(48, 54, 'ANIMAL')]
+        ),
+        (
+            "horses?",
+            [(0, 6, 'ANIMAL')]
         )
+
     ]
     nlp.entity.add_label('ANIMAL')
     train_ner(nlp, train_data, output_directory)
 
     # Test that the entity is recognized
     doc = nlp('Do you like horses?')
+    print("Ents in 'Do you like horses?':")
     for ent in doc.ents:
         print(ent.label_, ent.text)
     if output_directory:
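A side note on the training pairs: the character offsets are end-exclusive and easy to get wrong. A quick standalone sanity check (plain Python, not part of the commit):

    text = "they pretend to care about your feelings, those horses"
    start, end, label = (48, 54, 'ANIMAL')
    assert text[start:end] == 'horses'
    assert 'horses?'[0:6] == 'horses'   # the new second training pair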
spacy/syntax/parser.pyx
@@ -11,6 +11,8 @@ import ujson
 cimport cython
 cimport cython.parallel
 
+import numpy.random
+
 from cpython.ref cimport PyObject, Py_INCREF, Py_XDECREF
 from cpython.exc cimport PyErr_CheckSignals
 from libc.stdint cimport uint32_t, uint64_t
@@ -303,7 +305,7 @@ cdef class Parser:
         free(eg.is_valid)
         return 0
 
-    def update(self, Doc tokens, GoldParse gold, itn=0):
+    def update(self, Doc tokens, GoldParse gold, itn=0, double drop=0.0):
         """
         Update the statistical model.
 
@@ -325,9 +327,11 @@
                     nr_feat=self.model.nr_feat)
         cdef weight_t loss = 0
         cdef Transition action
+        cdef double dropout_rate = self.cfg.get('dropout', drop)
         while not stcls.is_final():
             eg.c.nr_feat = self.model.set_featuresC(eg.c.atoms, eg.c.features,
                                                     stcls.c)
+            dropout(eg.c.features, eg.c.nr_feat, dropout_rate)
             self.moves.set_costs(eg.c.is_valid, eg.c.costs, stcls, gold)
             self.model.set_scoresC(eg.c.scores, eg.c.features, eg.c.nr_feat)
             guess = VecVec.arg_max_if_true(eg.c.scores, eg.c.is_valid, eg.c.nr_class)
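Two details are easy to miss here. First, `drop` defaults to 0.0, so existing callers see no behaviour change. Second, the effective rate is `self.cfg.get('dropout', drop)`: a `dropout` key stored in the parser's config wins, and the keyword argument is only a fallback. Assuming the cfg object behaves like the dict the surrounding code treats it as, the two ways to enable dropout would be:

    nlp.entity.cfg['dropout'] = 0.4                  # persistent; overrides the keyword
    loss = nlp.entity.update(doc, gold, drop=0.4)    # per call; used only if no cfg key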
@@ -378,6 +382,18 @@
         self.cfg.setdefault('extra_labels', []).append(label)
 
 
+cdef int dropout(FeatureC* feats, int nr_feat, float prob) except -1:
+    if prob <= 0 or prob >= 1.:
+        return 0
+    cdef double[::1] py_probs = numpy.random.uniform(0., 1., nr_feat)
+    cdef double* probs = &py_probs[0]
+    for i in range(nr_feat):
+        if probs[i] >= prob:
+            feats[i].value /= prob
+        else:
+            feats[i].value = 0.
+
+
 cdef class StepwiseState:
     cdef readonly StateClass stcls
     cdef readonly Example eg
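For readers who don't speak Cython, here is a NumPy rendition of the new `dropout` helper. It is a sketch of the semantics, not the shipped code. Note the rescaling convention: survivors are divided by `prob` itself, which matches the commit message's "multiply by 1. / 0.4", whereas textbook inverted dropout divides by the keep probability 1 - prob:

    import numpy

    def dropout_values(values, prob):
        # Zero each value with probability `prob` and rescale the rest,
        # mirroring the cdef helper above.
        values = numpy.asarray(values, dtype='float64')
        if prob <= 0. or prob >= 1.:
            return values            # no-op outside (0, 1), like the C code
        draws = numpy.random.uniform(0., 1., len(values))
        return numpy.where(draws >= prob, values / prob, 0.)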