From 3da1063b36c118c71d3332ca178afdf09f544e16 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Thu, 20 Jul 2017 15:02:55 +0200 Subject: [PATCH] Add beam decoding to parser, to allow NER uncertainties --- spacy/syntax/arc_eager.pyx | 22 ++++++ spacy/syntax/ner.pyx | 19 ++++++ spacy/syntax/nn_parser.pyx | 105 ++++++++++++++++++++++++++--- spacy/syntax/transition_system.pyx | 4 ++ 4 files changed, 140 insertions(+), 10 deletions(-) diff --git a/spacy/syntax/arc_eager.pyx b/spacy/syntax/arc_eager.pyx index 7df5fe081..d4367c6df 100644 --- a/spacy/syntax/arc_eager.pyx +++ b/spacy/syntax/arc_eager.pyx @@ -10,6 +10,8 @@ from libc.stdint cimport uint32_t from libc.string cimport memcpy from cymem.cymem cimport Pool from collections import OrderedDict +from thinc.extra.search cimport Beam +import numpy from .stateclass cimport StateClass from ._state cimport StateC, is_space_token @@ -510,3 +512,23 @@ cdef class ArcEager(TransitionSystem): "State at failure:\n" "%s" % (self.n_moves, stcls.print_state(gold.words))) assert n_gold >= 1 + + def get_beam_annot(self, Beam beam): + length = (beam.at(0)).c.length + heads = [{} for _ in range(length)] + deps = [{} for _ in range(length)] + probs = beam.probs + for i in range(beam.size): + stcls = beam.at(i) + self.finalize_state(stcls.c) + if stcls.is_final(): + prob = probs[i] + for j in range(stcls.c.length): + head = j + stcls.c._sent[j].head + dep = stcls.c._sent[j].dep + heads[j].setdefault(head, 0.0) + heads[j][head] += prob + deps[j].setdefault(dep, 0.0) + deps[j][dep] += prob + return heads, deps + diff --git a/spacy/syntax/ner.pyx b/spacy/syntax/ner.pyx index 656c84e4c..023707aaa 100644 --- a/spacy/syntax/ner.pyx +++ b/spacy/syntax/ner.pyx @@ -2,7 +2,10 @@ from __future__ import unicode_literals from thinc.typedefs cimport weight_t +from thinc.extra.search cimport Beam from collections import OrderedDict +import numpy +from thinc.neural.ops import NumpyOps from .stateclass cimport StateClass from ._state cimport StateC @@ -122,6 +125,22 @@ cdef class BiluoPushDown(TransitionSystem): gold.c.ner[i] = self.lookup_transition(gold.ner[i]) return gold + def get_beam_annot(self, Beam beam): + entities = {} + probs = beam.probs + for i in range(beam.size): + stcls = beam.at(i) + if stcls.is_final(): + self.finalize_state(stcls.c) + prob = probs[i] + for j in range(stcls.c._e_i): + start = stcls.c._ents[j].start + end = stcls.c._ents[j].end + label = stcls.c._ents[j].label + entities.setdefault((start, end, label), 0.0) + entities[(start, end, label)] += prob + return entities + cdef Transition lookup_transition(self, object name) except *: cdef attr_t label if name == '-' or name == None: diff --git a/spacy/syntax/nn_parser.pyx b/spacy/syntax/nn_parser.pyx index 91a651200..0b39e2216 100644 --- a/spacy/syntax/nn_parser.pyx +++ b/spacy/syntax/nn_parser.pyx @@ -29,6 +29,7 @@ from thinc.linear.avgtron cimport AveragedPerceptron from thinc.linalg cimport VecVec from thinc.structs cimport SparseArrayC, FeatureC, ExampleC from thinc.extra.eg cimport Example +from thinc.extra.search cimport Beam from cymem.cymem cimport Pool, Address from murmurhash.mrmr cimport hash64 @@ -110,7 +111,6 @@ cdef class precompute_hiddens: self.nO = cached.shape[2] self.nP = getattr(lower_model, 'nP', 1) self.ops = lower_model.ops - self._features = numpy.zeros((batch_size, self.nO*self.nP), dtype='f') self._is_synchronized = False self._cuda_stream = cuda_stream self._cached = cached @@ -127,13 +127,12 @@ cdef class precompute_hiddens: return self.begin_update(X)[0] def begin_update(self, token_ids, drop=0.): - self._features.fill(0) + cdef np.ndarray state_vector = numpy.zeros((token_ids.shape[0], self.nO*self.nP), dtype='f') # This is tricky, but (assuming GPU available); # - Input to forward on CPU # - Output from forward on CPU # - Input to backward on GPU! # - Output from backward on GPU - cdef np.ndarray state_vector = self._features[:len(token_ids)] bp_hiddens = self._bp_hiddens feat_weights = self.get_feat_weights() @@ -305,7 +304,7 @@ cdef class Parser: def __reduce__(self): return (Parser, (self.vocab, self.moves, self.model), None, None) - def __call__(self, Doc doc): + def __call__(self, Doc doc, beam_width=None, beam_density=None): """ Apply the parser or entity recognizer, setting the annotations onto the Doc object. @@ -314,11 +313,26 @@ cdef class Parser: Returns: None """ - states = self.parse_batch([doc], [doc.tensor]) - self.set_annotations([doc], states) - return doc + if beam_width is None: + beam_width = self.cfg.get('beam_width', 1) + if beam_density is None: + beam_density = self.cfg.get('beam_density', 0.001) + cdef Beam beam + if beam_width == 1: + states = self.parse_batch([doc], [doc.tensor]) + self.set_annotations([doc], states) + return doc + else: + beam = self.beam_parse([doc], [doc.tensor], + beam_width=beam_width, beam_density=beam_density)[0] + output = self.moves.get_beam_annot(beam) + state = beam.at(0) + self.set_annotations([doc], [state]) + _cleanup(beam) + return output - def pipe(self, docs, int batch_size=1000, int n_threads=2): + def pipe(self, docs, int batch_size=1000, int n_threads=2, + beam_width=1, beam_density=0.001): """ Process a stream of documents. @@ -336,7 +350,11 @@ cdef class Parser: for docs in cytoolz.partition_all(batch_size, docs): docs = list(docs) tokvecs = [d.tensor for d in docs] - parse_states = self.parse_batch(docs, tokvecs) + if beam_width == 1: + parse_states = self.parse_batch(docs, tokvecs) + else: + parse_states = self.beam_parse(docs, tokvecs, + beam_width=beam_width, beam_density=beam_density) self.set_annotations(docs, parse_states) yield from docs @@ -404,6 +422,45 @@ cdef class Parser: next_step.push_back(st) return states + def beam_parse(self, docs, tokvecses, int beam_width=8, float beam_density=0.001): + cdef Beam beam + cdef np.ndarray scores + cdef Doc doc + cdef int nr_class = self.moves.n_moves + cdef StateClass stcls, output + tokvecs = self.model[0].ops.flatten(tokvecses) + cuda_stream = get_cuda_stream() + state2vec, vec2scores = self.get_batch_model(len(docs), tokvecs, + cuda_stream, 0.0) + beams = [] + cdef int offset = 0 + for doc in docs: + beam = Beam(nr_class, beam_width, min_density=beam_density) + beam.initialize(self.moves.init_beam_state, doc.length, doc.c) + for i in range(beam.width): + stcls = beam.at(i) + stcls.c.offset = offset + offset += len(doc) + beam.check_done(_check_final_state, NULL) + while not beam.is_done: + states = [] + for i in range(beam.size): + stcls = beam.at(i) + states.append(stcls) + token_ids = self.get_token_ids(states) + vectors = state2vec(token_ids) + scores = vec2scores(vectors) + for i in range(beam.size): + stcls = beam.at(i) + if not stcls.is_final(): + self.moves.set_valid(beam.is_valid[i], stcls.c) + for j in range(nr_class): + beam.scores[i][j] = scores[i, j] + beam.advance(_transition_state, _hash_state, self.moves.c) + beam.check_done(_check_final_state, NULL) + beams.append(beam) + return beams + cdef void _parse_step(self, StateC* state, const float* feat_weights, int nr_class, int nr_feat, int nr_piece) nogil: @@ -560,7 +617,8 @@ cdef class Parser: dtype='i', order='C') c_ids = ids.data for i, state in enumerate(states): - state.c.set_context_tokens(c_ids, n_tokens) + if not state.is_final(): + state.c.set_context_tokens(c_ids, n_tokens) c_ids += ids.shape[1] return ids @@ -762,3 +820,30 @@ cdef int _arg_max_clas(const weight_t* scores, int move, const Transition* actio mode = i score = scores[i] return mode + + +# These are passed as callbacks to thinc.search.Beam +cdef int _transition_state(void* _dest, void* _src, class_t clas, void* _moves) except -1: + dest = _dest + src = _src + moves = _moves + dest.clone(src) + moves[clas].do(dest.c, moves[clas].label) + + +cdef int _check_final_state(void* _state, void* extra_args) except -1: + return (_state).is_final() + + +def _cleanup(Beam beam): + for i in range(beam.width): + Py_XDECREF(beam._states[i].content) + Py_XDECREF(beam._parents[i].content) + + +cdef hash_t _hash_state(void* _state, void* _) except 0: + state = _state + if state.c.is_final(): + return 1 + else: + return state.c.hash() diff --git a/spacy/syntax/transition_system.pyx b/spacy/syntax/transition_system.pyx index e33a29ac2..27b375bba 100644 --- a/spacy/syntax/transition_system.pyx +++ b/spacy/syntax/transition_system.pyx @@ -137,6 +137,10 @@ cdef class TransitionSystem: "the entity recognizer\n" "The transition system has %d actions." % (self.n_moves)) + def get_class_name(self, int clas): + act = self.c[clas] + return self.move_name(act.move, act.label) + def add_action(self, int action, label_name): cdef attr_t label_id if not isinstance(label_name, int):