Mirror of https://github.com/explosion/spaCy.git, synced 2024-12-25
Add dropout option for parser and NER

Dropout can now be specified in the `Parser.update()` method via the `drop` keyword argument, e.g. `nlp.entity.update(doc, gold, drop=0.4)`. This will randomly drop 40% of features and multiply the value of the others by 1. / 0.4. This may be useful for generalising from small data sets. This commit also patches the examples/training/train_new_entity_type.py example to use dropout and to fix the output (previously it did not print the learned entity).
parent f0e1606d27
commit 2da16adcc2
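Before the diff, for illustration, a minimal usage sketch of the new keyword (assuming a spaCy 1.x English model with the tagger and entity recognizer loaded; `train_data` here is a hypothetical toy data set):

    import random
    import spacy
    from spacy.gold import GoldParse

    nlp = spacy.load('en')
    nlp.entity.add_label('ANIMAL')
    train_data = [('Do you like horses?', [(12, 18, 'ANIMAL')])]  # hypothetical example

    for itn in range(5):
        random.shuffle(train_data)
        for raw_text, entity_offsets in train_data:
            doc = nlp.make_doc(raw_text)
            gold = GoldParse(doc, entities=entity_offsets)
            nlp.tagger(doc)
            # drop=0.4 randomly zeroes ~40% of the features for this update
            # and rescales the surviving ones, as described above
            loss = nlp.entity.update(doc, gold, drop=0.4)
    nlp.end_training()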
@@ -1,4 +1,5 @@
 #!/usr/bin/env python
+# coding: utf8
 """
 Example of training an additional entity type
 
@@ -26,11 +27,11 @@ For more details, see the documentation:
 Developed for: spaCy 1.7.6
 Last tested for: spaCy 1.7.6
 """
-# coding: utf8
 from __future__ import unicode_literals, print_function
 
+import random
 from pathlib import Path
 import random
 
 import spacy
 from spacy.gold import GoldParse
@@ -43,14 +44,35 @@ def train_ner(nlp, train_data, output_dir):
         doc = nlp.make_doc(raw_text)
         for word in doc:
             _ = nlp.vocab[word.orth]
-
-    for itn in range(20):
+    random.seed(0)
+    # You may need to change the learning rate. It's generally difficult to
+    # guess what rate you should set, especially when you have limited data.
+    nlp.entity.model.learn_rate = 0.001
+    for itn in range(1000):
         random.shuffle(train_data)
         loss = 0.
         for raw_text, entity_offsets in train_data:
             gold = GoldParse(doc, entities=entity_offsets)
+            # By default, the GoldParse class assumes that the entities
+            # described by offset are complete, and all other words should
+            # have the tag 'O'. You can tell it to make no assumptions
+            # about the tag of a word by giving it the tag '-'.
+            # However, this allows a trivial solution to the current
+            # learning problem: if words are either 'any tag' or 'ANIMAL',
+            # the model can learn that all words can be tagged 'ANIMAL'.
+            #for i in range(len(gold.ner)):
+                #if not gold.ner[i].endswith('ANIMAL'):
+                #    gold.ner[i] = '-'
             doc = nlp.make_doc(raw_text)
             nlp.tagger(doc)
-            loss = nlp.entity.update(doc, gold)
+            # As of 1.9, spaCy's parser now lets you supply a dropout probability
+            # This might help the model generalize better from only a few
+            # examples.
+            loss += nlp.entity.update(doc, gold, drop=0.9)
         if loss == 0:
             break
+    # This step averages the model's weights. This may or may not be good for
+    # your situation --- it's empirical.
+    nlp.end_training()
+    if output_dir:
+        if not output_dir.exists():
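For clarity, the masking that the commented-out lines above describe would look like this if enabled (it is left disabled in the example for the reason given in the comment; this assumes `gold.ner` holds per-token BILUO-style tag strings such as 'U-ANIMAL' or 'O'):

    gold = GoldParse(doc, entities=entity_offsets)
    for i in range(len(gold.ner)):
        if not gold.ner[i].endswith('ANIMAL'):
            # '-' tells the model to make no assumption about this token's tag
            gold.ner[i] = '-'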
@@ -80,13 +102,19 @@ def main(model_name, output_directory=None):
         (
             "they pretend to care about your feelings, those horses",
             [(48, 54, 'ANIMAL')]
         ),
+        (
+            "horses?",
+            [(0, 6, 'ANIMAL')]
+        )
+
     ]
     nlp.entity.add_label('ANIMAL')
     train_ner(nlp, train_data, output_directory)
 
     # Test that the entity is recognized
     doc = nlp('Do you like horses?')
+    print("Ents in 'Do you like horses?':")
     for ent in doc.ents:
         print(ent.label_, ent.text)
     if output_directory:
@@ -11,6 +11,8 @@ import ujson
 cimport cython
 cimport cython.parallel
 
+import numpy.random
+
 from cpython.ref cimport PyObject, Py_INCREF, Py_XDECREF
 from cpython.exc cimport PyErr_CheckSignals
 from libc.stdint cimport uint32_t, uint64_t
@@ -303,7 +305,7 @@ cdef class Parser:
         free(eg.is_valid)
         return 0
 
-    def update(self, Doc tokens, GoldParse gold, itn=0):
+    def update(self, Doc tokens, GoldParse gold, itn=0, double drop=0.0):
         """
         Update the statistical model.
 
@@ -325,9 +327,11 @@ cdef class Parser:
                             nr_feat=self.model.nr_feat)
         cdef weight_t loss = 0
         cdef Transition action
+        cdef double dropout_rate = self.cfg.get('dropout', drop)
         while not stcls.is_final():
             eg.c.nr_feat = self.model.set_featuresC(eg.c.atoms, eg.c.features,
                                                     stcls.c)
+            dropout(eg.c.features, eg.c.nr_feat, dropout_rate)
             self.moves.set_costs(eg.c.is_valid, eg.c.costs, stcls, gold)
             self.model.set_scoresC(eg.c.scores, eg.c.features, eg.c.nr_feat)
             guess = VecVec.arg_max_if_true(eg.c.scores, eg.c.is_valid, eg.c.nr_class)
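One detail from the hunk above: the rate is read as `self.cfg.get('dropout', drop)`, so a 'dropout' entry in the parser's cfg takes precedence over the per-call `drop` argument. A hypothetical sketch (assuming cfg behaves like a dict, as the `get`/`setdefault` calls in this file suggest):

    nlp.entity.cfg['dropout'] = 0.2            # hypothetical: set once, applies to every update
    nlp.entity.update(doc, gold)               # uses 0.2 from cfg
    nlp.entity.update(doc, gold, drop=0.5)     # cfg value still wins over the argument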
@@ -378,6 +382,18 @@ cdef class Parser:
         self.cfg.setdefault('extra_labels', []).append(label)
 
 
+cdef int dropout(FeatureC* feats, int nr_feat, float prob) except -1:
+    if prob <= 0 or prob >= 1.:
+        return 0
+    cdef double[::1] py_probs = numpy.random.uniform(0., 1., nr_feat)
+    cdef double* probs = &py_probs[0]
+    for i in range(nr_feat):
+        if probs[i] >= prob:
+            feats[i].value /= prob
+        else:
+            feats[i].value = 0.
+
+
 cdef class StepwiseState:
     cdef readonly StateClass stcls
     cdef readonly Example eg
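For reference, a rough pure-NumPy sketch of what the new `dropout` helper does to the feature values (a hypothetical standalone function, not part of the commit):

    import numpy

    def dropout_sketch(values, prob):
        # Each feature survives with probability (1 - prob); survivors are
        # rescaled by 1 / prob, matching the commit message's description.
        values = numpy.asarray(values, dtype='float64').copy()
        keep = numpy.random.uniform(0., 1., values.shape[0]) >= prob
        values[keep] /= prob
        values[~keep] = 0.
        return values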