* Update to use thinc 3.0

Matthew Honnibal 2015-11-06 00:25:59 +11:00
parent 802ad3d71a
commit b9991fbd20
4 changed files with 27 additions and 53 deletions


@@ -10,6 +10,8 @@ import json
 import cython
 import numpy.random
+from libc.string cimport memcpy
 from thinc.features cimport Feature, count_feats
 from thinc.api cimport Example
@@ -52,28 +54,12 @@ cdef class Model:
     cdef const weight_t* score(self, atom_t* context) except NULL:
         memcpy(self._eg.c.atoms, context, self._eg.c.nr_atom * sizeof(context[0]))
         self._model(self._eg)
-        return self._eg.scores
+        return self._eg.c.scores

     cdef int set_scores(self, weight_t* scores, atom_t* context) nogil:
-        cdef int nr_feat = self._model.extractor.set_feats(self._eg.features, context)
-        self._model.set_scores(
-            scores,
-            self._model.weights.c_map,
-            self._eg.c.features,
-            nr_feat
-        )
+        cdef int nr_feat = self._extractor.set_feats(self._eg.c.features, context)
+        self._model.set_scores(scores, self._eg.c.features, nr_feat)

-    cdef int update(self, atom_t* context, class_t guess, class_t gold, int cost) except -1:
-        cdef int n_feats
-        if cost == 0:
-            self._model.update({})
-        else:
-            feats = self._extractor.get_feats(context, &n_feats)
-            counts = {gold: {}, guess: {}}
-            count_feats(counts[gold], feats, n_feats, cost)
-            count_feats(counts[guess], feats, n_feats, -cost)
-            self._model.update(counts)

     def end_training(self, model_loc=None):
         if model_loc is None:
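For readability, the new-side (+) lines of the Model hunks assemble into roughly the following. The enclosing cdef class Model declaration and the _eg, _extractor and _model attributes are taken from the diff's own context lines; the surrounding class body is assumed, not shown here.

cdef class Model:
    # Scores now come from the C-level example struct rather than the
    # Python-level wrapper (self._eg.c.scores instead of self._eg.scores).
    cdef const weight_t* score(self, atom_t* context) except NULL:
        memcpy(self._eg.c.atoms, context, self._eg.c.nr_atom * sizeof(context[0]))
        self._model(self._eg)
        return self._eg.c.scores

    # Feature extraction is delegated to self._extractor, and set_scores
    # now takes just the scores buffer, the feature array and the count.
    cdef int set_scores(self, weight_t* scores, atom_t* context) nogil:
        cdef int nr_feat = self._extractor.set_feats(self._eg.c.features, context)
        self._model.set_scores(scores, self._eg.c.features, nr_feat)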


@@ -40,7 +40,7 @@ from ._parse_features cimport CONTEXT_SIZE
 from ._parse_features cimport fill_context
 from .stateclass cimport StateClass
-from .._ml cimport arg_max_if_true
+from thinc.learner cimport arg_max_if_true

 DEBUG = False


@@ -7,6 +7,3 @@ cdef class Tagger:
     cdef readonly Vocab vocab
     cdef readonly Model model
     cdef public dict freqs
-    cdef int predict(self, int i, const TokenC* tokens) except -1
-    cdef int update(self, int i, const TokenC* tokens, int gold) except -1


@@ -3,6 +3,8 @@ from os import path
 from collections import defaultdict
 from thinc.typedefs cimport atom_t, weight_t
+from thinc.learner cimport arg_max, arg_max_if_true, arg_max_if_zero
+from thinc.api cimport Example
 from .typedefs cimport attr_t
 from .tokens.doc cimport Doc
@@ -11,7 +13,6 @@ from .parts_of_speech cimport NO_TAG, ADJ, ADV, ADP, CONJ, DET, NOUN, NUM, PRON
 from .parts_of_speech cimport VERB, X, PUNCT, EOL, SPACE
 from .attrs cimport *
-from ._ml cimport arg_max

 cpdef enum:
@@ -138,12 +139,15 @@ cdef class Tagger:
         """
         if tokens.length == 0:
             return 0
+        cdef Example eg = self.model._eg
         cdef int i
-        cdef const weight_t* scores
         for i in range(tokens.length):
             if tokens.c[i].pos == 0:
-                guess = self.predict(i, tokens.c)
-                self.vocab.morphology.assign_tag(&tokens.c[i], guess)
+                eg.wipe()
+                fill_atoms(eg.c.atoms, tokens.c, i)
+                self.model(eg)
+                self.vocab.morphology.assign_tag(&tokens.c[i], eg.c.guess)
         tokens.is_tagged = True
         tokens._py_tokens = [None] * tokens.length
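Reading only the new-side lines, the tagging loop after this hunk looks approximately like the sketch below. The Example methods and fields used (wipe(), c.atoms, c.guess) are as they appear in the diff and are not independently checked against the thinc 3.0 API.

        # Reuse the model's pre-allocated Example instead of building a raw
        # atom_t context array and calling self.predict().
        cdef Example eg = self.model._eg
        cdef int i
        for i in range(tokens.length):
            if tokens.c[i].pos == 0:
                eg.wipe()                              # clear atoms, features, scores
                fill_atoms(eg.c.atoms, tokens.c, i)    # write the 5-token context window
                self.model(eg)                         # score; the model fills eg.c.guess
                self.vocab.morphology.assign_tag(&tokens.c[i], eg.c.guess)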
@@ -169,38 +173,25 @@
             raise ValueError(
                 [g for g in gold_tag_strs if g is not None and g not in self.tag_names])
         correct = 0
+        cdef Example eg = self.model._eg
         for i in range(tokens.length):
-            guess = self.update(i, tokens.c, golds[i])
-            loss = golds[i] != -1 and guess != golds[i]
-            self.vocab.morphology.assign_tag(&tokens.c[i], guess)
-            correct += loss == 0
+            eg.wipe()
+            fill_atoms(eg.c.atoms, tokens.c, i)
+            self.train(eg)
+            self.vocab.morphology.assign_tag(&tokens.c[i], eg.c.guess)
+            correct += eg.c.cost == 0
             self.freqs[TAG][tokens.c[i].tag] += 1
         return correct

-    cdef int predict(self, int i, const TokenC* tokens) except -1:
-        cdef atom_t[N_CONTEXT_FIELDS] context
-        _fill_from_token(&context[P2_orth], &tokens[i-2])
-        _fill_from_token(&context[P1_orth], &tokens[i-1])
-        _fill_from_token(&context[W_orth], &tokens[i])
-        _fill_from_token(&context[N1_orth], &tokens[i+1])
-        _fill_from_token(&context[N2_orth], &tokens[i+2])
-        scores = self.model.score(context)
-        return arg_max(scores, self.model.n_classes)

-    cdef int update(self, int i, const TokenC* tokens, int gold) except -1:
-        cdef atom_t[N_CONTEXT_FIELDS] context
-        _fill_from_token(&context[P2_orth], &tokens[i-2])
-        _fill_from_token(&context[P1_orth], &tokens[i-1])
-        _fill_from_token(&context[W_orth], &tokens[i])
-        _fill_from_token(&context[N1_orth], &tokens[i+1])
-        _fill_from_token(&context[N2_orth], &tokens[i+2])
-        scores = self.model.score(context)
-        guess = arg_max(scores, self.model.n_classes)
-        loss = guess != gold if gold != -1 else 0
-        self.model.update(context, guess, gold, loss)
-        return guess
+cdef inline void fill_atoms(atom_t* atoms, const TokenC* tokens, int i) nogil:
+    _fill_from_token(&atoms[P2_orth], &tokens[i-2])
+    _fill_from_token(&atoms[P1_orth], &tokens[i-1])
+    _fill_from_token(&atoms[W_orth], &tokens[i])
+    _fill_from_token(&atoms[N1_orth], &tokens[i+1])
+    _fill_from_token(&atoms[N2_orth], &tokens[i+2])

 cdef inline void _fill_from_token(atom_t* context, const TokenC* t) nogil:
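Likewise, the new-side lines of the final hunk assemble into roughly the training loop and module-level helper below. The behaviour attributed to self.train(eg) here (scoring, updating, and filling eg.c.guess and eg.c.cost) is inferred from how the loop reads those fields afterwards; the diff itself only shows the call.

        correct = 0
        cdef Example eg = self.model._eg
        for i in range(tokens.length):
            eg.wipe()
            fill_atoms(eg.c.atoms, tokens.c, i)
            self.train(eg)                             # as written in the diff
            self.vocab.morphology.assign_tag(&tokens.c[i], eg.c.guess)
            correct += eg.c.cost == 0                  # cost read back from the example
            self.freqs[TAG][tokens.c[i].tag] += 1
        return correct

# Module-level replacement for the context-filling code that was duplicated
# inside the deleted predict() and update() methods: fill a five-token
# window (i-2 .. i+2) of atoms around position i.
cdef inline void fill_atoms(atom_t* atoms, const TokenC* tokens, int i) nogil:
    _fill_from_token(&atoms[P2_orth], &tokens[i-2])
    _fill_from_token(&atoms[P1_orth], &tokens[i-1])
    _fill_from_token(&atoms[W_orth], &tokens[i])
    _fill_from_token(&atoms[N1_orth], &tokens[i+1])
    _fill_from_token(&atoms[N2_orth], &tokens[i+2])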