From b9991fbd20e94b42cf2593530b6589c1b096809d Mon Sep 17 00:00:00 2001
From: Matthew Honnibal
Date: Fri, 6 Nov 2015 00:25:59 +1100
Subject: [PATCH] * Update to use thinc 3.0

---
 spacy/_ml.pyx           | 24 ++++---------------
 spacy/syntax/parser.pyx |  2 +-
 spacy/tagger.pxd        |  3 ---
 spacy/tagger.pyx        | 51 +++++++++++++++++------------------
 4 files changed, 27 insertions(+), 53 deletions(-)

diff --git a/spacy/_ml.pyx b/spacy/_ml.pyx
index 2fc7b646c..ed332a16a 100644
--- a/spacy/_ml.pyx
+++ b/spacy/_ml.pyx
@@ -10,6 +10,8 @@ import json
 import cython
 import numpy.random
 
+from libc.string cimport memcpy
+
 from thinc.features cimport Feature, count_feats
 from thinc.api cimport Example
 
@@ -52,28 +54,12 @@ cdef class Model:
     cdef const weight_t* score(self, atom_t* context) except NULL:
         memcpy(self._eg.c.atoms, context, self._eg.c.nr_atom * sizeof(context[0]))
         self._model(self._eg)
-        return self._eg.scores
+        return self._eg.c.scores
 
     cdef int set_scores(self, weight_t* scores, atom_t* context) nogil:
-        cdef int nr_feat = self._model.extractor.set_feats(self._eg.features, context)
+        cdef int nr_feat = self._extractor.set_feats(self._eg.c.features, context)
 
-        self._model.set_scores(
-            scores,
-            self._model.weights.c_map,
-            self._eg.c.features,
-            nr_feat
-        )
-
-    cdef int update(self, atom_t* context, class_t guess, class_t gold, int cost) except -1:
-        cdef int n_feats
-        if cost == 0:
-            self._model.update({})
-        else:
-            feats = self._extractor.get_feats(context, &n_feats)
-            counts = {gold: {}, guess: {}}
-            count_feats(counts[gold], feats, n_feats, cost)
-            count_feats(counts[guess], feats, n_feats, -cost)
-            self._model.update(counts)
+        self._model.set_scores(scores, self._eg.c.features, nr_feat)
 
     def end_training(self, model_loc=None):
         if model_loc is None:
diff --git a/spacy/syntax/parser.pyx b/spacy/syntax/parser.pyx
index 40569b1aa..4b25613ad 100644
--- a/spacy/syntax/parser.pyx
+++ b/spacy/syntax/parser.pyx
@@ -40,7 +40,7 @@ from ._parse_features cimport CONTEXT_SIZE
 from ._parse_features cimport fill_context
 from .stateclass cimport StateClass
 
-from .._ml cimport arg_max_if_true
+from thinc.learner cimport arg_max_if_true
 
 
 DEBUG = False
diff --git a/spacy/tagger.pxd b/spacy/tagger.pxd
index 28d7fc711..ad2a90970 100644
--- a/spacy/tagger.pxd
+++ b/spacy/tagger.pxd
@@ -7,6 +7,3 @@ cdef class Tagger:
     cdef readonly Vocab vocab
     cdef readonly Model model
     cdef public dict freqs
-
-    cdef int predict(self, int i, const TokenC* tokens) except -1
-    cdef int update(self, int i, const TokenC* tokens, int gold) except -1
diff --git a/spacy/tagger.pyx b/spacy/tagger.pyx
index e98b28067..9e5f0784e 100644
--- a/spacy/tagger.pyx
+++ b/spacy/tagger.pyx
@@ -3,6 +3,8 @@ from os import path
 from collections import defaultdict
 
 from thinc.typedefs cimport atom_t, weight_t
+from thinc.learner cimport arg_max, arg_max_if_true, arg_max_if_zero
+from thinc.api cimport Example
 
 from .typedefs cimport attr_t
 from .tokens.doc cimport Doc
@@ -11,7 +13,6 @@ from .parts_of_speech cimport NO_TAG, ADJ, ADV, ADP, CONJ, DET, NOUN, NUM, PRON
 from .parts_of_speech cimport VERB, X, PUNCT, EOL, SPACE
 from .attrs cimport *
 
-from ._ml cimport arg_max
 
 
 cpdef enum:
@@ -138,12 +139,15 @@
         """
        if tokens.length == 0:
            return 0
+
+        cdef Example eg = self.model._eg
         cdef int i
-        cdef const weight_t* scores
         for i in range(tokens.length):
             if tokens.c[i].pos == 0:
-                guess = self.predict(i, tokens.c)
-                self.vocab.morphology.assign_tag(&tokens.c[i], guess)
+                eg.wipe()
+                fill_atoms(eg.c.atoms, tokens.c, i)
+                self.model(eg)
+                self.vocab.morphology.assign_tag(&tokens.c[i], eg.c.guess)
         tokens.is_tagged = True
         tokens._py_tokens = [None] * tokens.length
@@ -169,39 +173,26 @@
             raise ValueError(
                 [g for g in gold_tag_strs if g is not None and g not in self.tag_names])
         correct = 0
+        cdef Example eg = self.model._eg
         for i in range(tokens.length):
-            guess = self.update(i, tokens.c, golds[i])
-            loss = golds[i] != -1 and guess != golds[i]
+            eg.wipe()
+            fill_atoms(eg.c.atoms, tokens.c, i)
+            self.train(eg)
 
-            self.vocab.morphology.assign_tag(&tokens.c[i], guess)
+            self.vocab.morphology.assign_tag(&tokens.c[i], eg.c.guess)
 
-            correct += loss == 0
+            correct += eg.c.cost == 0
             self.freqs[TAG][tokens.c[i].tag] += 1
         return correct
 
-    cdef int predict(self, int i, const TokenC* tokens) except -1:
-        cdef atom_t[N_CONTEXT_FIELDS] context
-        _fill_from_token(&context[P2_orth], &tokens[i-2])
-        _fill_from_token(&context[P1_orth], &tokens[i-1])
-        _fill_from_token(&context[W_orth], &tokens[i])
-        _fill_from_token(&context[N1_orth], &tokens[i+1])
-        _fill_from_token(&context[N2_orth], &tokens[i+2])
-        scores = self.model.score(context)
-        return arg_max(scores, self.model.n_classes)
-
-    cdef int update(self, int i, const TokenC* tokens, int gold) except -1:
-        cdef atom_t[N_CONTEXT_FIELDS] context
-        _fill_from_token(&context[P2_orth], &tokens[i-2])
-        _fill_from_token(&context[P1_orth], &tokens[i-1])
-        _fill_from_token(&context[W_orth], &tokens[i])
-        _fill_from_token(&context[N1_orth], &tokens[i+1])
-        _fill_from_token(&context[N2_orth], &tokens[i+2])
-        scores = self.model.score(context)
-        guess = arg_max(scores, self.model.n_classes)
-        loss = guess != gold if gold != -1 else 0
-        self.model.update(context, guess, gold, loss)
-        return guess
+
+cdef inline void fill_atoms(atom_t* atoms, const TokenC* tokens, int i) nogil:
+    _fill_from_token(&atoms[P2_orth], &tokens[i-2])
+    _fill_from_token(&atoms[P1_orth], &tokens[i-1])
+    _fill_from_token(&atoms[W_orth], &tokens[i])
+    _fill_from_token(&atoms[N1_orth], &tokens[i+1])
+    _fill_from_token(&atoms[N2_orth], &tokens[i+2])
+
 
 cdef inline void _fill_from_token(atom_t* context, const TokenC* t) nogil:
     context[0] = t.lex.lower
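
For orientation, the patch reroutes both Model and Tagger through thinc's reusable Example object: the caller wipes the example, fills its atoms with the token window, runs the model on it, and reads the prediction back from eg.c.guess (or the cost from eg.c.cost during training), instead of calling the old predict/update helpers. Below is a minimal plain-Python sketch of that per-token flow only. The Example, Model, fill_atoms and tag names are toy stand-ins mirroring the diff, the weight scheme is invented, and N_CONTEXT_FIELDS here is just the 5-token window of this sketch; none of it reproduces the real thinc 3.0 or spaCy API.

    # Illustrative sketch only: plain-Python stand-ins for the Cython/thinc
    # types in the patch. Mirrors the wipe -> fill atoms -> score -> read-guess
    # loop that the new Tagger.__call__ uses; it is not the thinc 3.0 API.

    N_CONTEXT_FIELDS = 5  # toy window: P2, P1, W, N1, N2


    class Example:
        """Reusable container: atoms in, scores/guess out (like eg.c.* access)."""
        def __init__(self, nr_atom, nr_class):
            self.atoms = [0] * nr_atom
            self.scores = [0.0] * nr_class
            self.guess = 0

        def wipe(self):
            # Reset state so the same Example can be reused for the next token.
            self.atoms = [0] * len(self.atoms)
            self.scores = [0.0] * len(self.scores)
            self.guess = 0


    class Model:
        """Toy scorer: fills eg.scores from eg.atoms and sets eg.guess to the argmax."""
        def __init__(self, nr_class, weights):
            self.nr_class = nr_class
            self.weights = weights  # dict: (atom slot, atom value, class) -> weight

        def __call__(self, eg):
            for clas in range(self.nr_class):
                eg.scores[clas] = sum(
                    self.weights.get((slot, atom, clas), 0.0)
                    for slot, atom in enumerate(eg.atoms))
            eg.guess = max(range(self.nr_class), key=lambda c: eg.scores[c])


    def fill_atoms(atoms, tokens, i):
        # Window of two tokens either side of i, zero-padded at the edges.
        for offset in range(-2, 3):
            j = i + offset
            atoms[offset + 2] = tokens[j] if 0 <= j < len(tokens) else 0


    def tag(tokens, model, eg):
        # Same shape as the new Tagger.__call__ loop: wipe, fill, score, read guess.
        tags = []
        for i in range(len(tokens)):
            eg.wipe()
            fill_atoms(eg.atoms, tokens, i)
            model(eg)
            tags.append(eg.guess)
        return tags


    if __name__ == '__main__':
        eg = Example(nr_atom=N_CONTEXT_FIELDS, nr_class=3)
        model = Model(nr_class=3, weights={(2, 7, 1): 1.0})  # word id 7 in the W slot -> class 1
        print(tag([5, 7, 9], model, eg))  # prints [0, 1, 0]

The single pre-allocated Example (self.model._eg in the diff) is wiped and refilled for every token rather than rebuilt, which appears to be the point of the eg.wipe() calls: one scratch buffer for atoms and scores is shared across the whole document.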