Try using LinearModel in tagger.

This commit is contained in:
Matthew Honnibal 2017-03-13 11:24:02 +01:00
parent eec3f21c50
commit d44b1b337a
2 changed files with 109 additions and 48 deletions

View File

@ -1,17 +1,20 @@
from thinc.linear.avgtron cimport AveragedPerceptron from thinc.linear.avgtron cimport AveragedPerceptron
from thinc.extra.eg cimport Example from thinc.extra.eg cimport Example
from thinc.structs cimport ExampleC from thinc.structs cimport ExampleC
from thinc.linear.features cimport ConjunctionExtracter
from .structs cimport TokenC from .structs cimport TokenC
from .vocab cimport Vocab from .vocab cimport Vocab
cdef class TaggerModel(AveragedPerceptron): cdef class TaggerModel:
cdef void set_featuresC(self, ExampleC* eg, const TokenC* tokens, int i) except * cdef ConjunctionExtracter extracter
cdef object model
cdef class Tagger: cdef class Tagger:
cdef readonly Vocab vocab cdef readonly Vocab vocab
cdef readonly TaggerModel model cdef readonly TaggerModel model
cdef public dict freqs cdef public dict freqs
cdef public object cfg cdef public object cfg
cdef public object optimizer

View File

@ -1,14 +1,25 @@
# cython: infer_types=True
# cython: profile=True
import json import json
import pathlib import pathlib
from collections import defaultdict from collections import defaultdict
from libc.string cimport memset from libc.string cimport memset, memcpy
from libcpp.vector cimport vector
from libc.stdint cimport uint64_t, int32_t, int64_t
cimport numpy as np
import numpy as np
np.import_array()
from cymem.cymem cimport Pool from cymem.cymem cimport Pool
from thinc.typedefs cimport atom_t, weight_t from thinc.typedefs cimport atom_t, weight_t
from thinc.extra.eg cimport Example from thinc.extra.eg cimport Example
from thinc.structs cimport ExampleC from thinc.structs cimport ExampleC
from thinc.linear.avgtron cimport AveragedPerceptron from thinc.linear.avgtron cimport AveragedPerceptron
from thinc.linalg cimport VecVec from thinc.linalg cimport Vec, VecVec
from thinc.linear.linear import LinearModel
from thinc.structs cimport FeatureC
from thinc.neural.optimizers import Adam
from thinc.neural.ops import NumpyOps
from .typedefs cimport attr_t from .typedefs cimport attr_t
from .tokens.doc cimport Doc from .tokens.doc cimport Doc
@ -69,24 +80,69 @@ cpdef enum:
N_CONTEXT_FIELDS N_CONTEXT_FIELDS
cdef class TaggerModel(AveragedPerceptron): cdef class TaggerModel:
def update(self, Example eg): def __init__(self, int nr_tag, templates):
self.time += 1 self.extracter = ConjunctionExtracter(templates)
guess = eg.guess self.model = LinearModel(nr_tag)
best = VecVec.arg_max_if_zero(eg.c.scores, eg.c.costs, eg.c.nr_class)
if guess != best:
for feat in eg.c.features[:eg.c.nr_feat]:
self.update_weight_ftrl(feat.key, best, -feat.value)
self.update_weight_ftrl(feat.key, guess, feat.value)
cdef void set_featuresC(self, ExampleC* eg, const TokenC* tokens, int i) except *: def begin_update(self, atom_t[:, ::1] contexts, drop=0.):
_fill_from_token(&eg.atoms[P2_orth], &tokens[i-2]) cdef vector[uint64_t]* keys = new vector[uint64_t]()
_fill_from_token(&eg.atoms[P1_orth], &tokens[i-1]) cdef vector[float]* values = new vector[float]()
_fill_from_token(&eg.atoms[W_orth], &tokens[i]) cdef vector[int64_t]* lengths = new vector[int64_t]()
_fill_from_token(&eg.atoms[N1_orth], &tokens[i+1]) features = new vector[FeatureC](self.extracter.nr_templ)
_fill_from_token(&eg.atoms[N2_orth], &tokens[i+2]) features.resize(self.extracter.nr_templ)
cdef FeatureC feat
cdef int i, j
for i in range(contexts.shape[0]):
nr_feat = self.extracter.set_features(features.data(), &contexts[i, 0])
for j in range(nr_feat):
keys.push_back(features.at(j).key)
values.push_back(features.at(j).value)
lengths.push_back(nr_feat)
cdef np.ndarray[uint64_t, ndim=1] py_keys
cdef np.ndarray[float, ndim=1] py_values
cdef np.ndarray[long, ndim=1] py_lengths
py_keys = vector_uint64_2numpy(keys)
py_values = vector_float_2numpy(values)
py_lengths = vector_long_2numpy(lengths)
instance = (py_keys, py_values, py_lengths)
del keys
del values
del lengths
del features
return self.model.begin_update(instance, drop=drop)
eg.nr_feat = self.extracter.set_features(eg.features, eg.atoms) def end_training(self, *args, **kwargs):
pass
def dump(self, *args, **kwargs):
pass
cdef np.ndarray[uint64_t, ndim=1] vector_uint64_2numpy(vector[uint64_t]* vec):
    """Copy a C++ ``vector[uint64_t]`` into a freshly allocated 1-d uint64 array.

    The elements are copied with ``memcpy`` into a new NumPy array, so the
    result does not alias the vector's storage. An empty vector yields an
    empty array.
    """
    cdef size_t n = vec.size()
    cdef np.ndarray[uint64_t, ndim=1, mode="c"] arr = np.zeros(n, dtype='uint64')
    # vector::data() may be NULL for an empty vector, and memcpy with a NULL
    # source is undefined even for a zero-byte copy, so skip the call.
    if n != 0:
        memcpy(arr.data, vec.data(), sizeof(uint64_t) * n)
    return arr
cdef np.ndarray[long, ndim=1] vector_long_2numpy(vector[int64_t]* vec):
    """Copy a C++ ``vector[int64_t]`` into a freshly allocated 1-d int64 array.

    The elements are copied with ``memcpy`` into a new NumPy array, so the
    result does not alias the vector's storage. An empty vector yields an
    empty array.
    """
    cdef size_t n = vec.size()
    # The local buffer is typed int64_t to match both the array dtype and the
    # element size used by memcpy below; C `long` is not 64 bits everywhere.
    cdef np.ndarray[int64_t, ndim=1, mode="c"] arr = np.zeros(n, dtype='int64')
    # vector::data() may be NULL for an empty vector, and memcpy with a NULL
    # source is undefined even for a zero-byte copy, so skip the call.
    if n != 0:
        memcpy(arr.data, vec.data(), sizeof(int64_t) * n)
    return arr
cdef np.ndarray[float, ndim=1] vector_float_2numpy(vector[float]* vec):
    """Copy a C++ ``vector[float]`` into a freshly allocated 1-d float32 array.

    The elements are copied with ``memcpy`` into a new NumPy array, so the
    result does not alias the vector's storage. An empty vector yields an
    empty array.
    """
    cdef size_t n = vec.size()
    cdef np.ndarray[float, ndim=1, mode="c"] arr = np.zeros(n, dtype='float32')
    # vector::data() may be NULL for an empty vector, and memcpy with a NULL
    # source is undefined even for a zero-byte copy, so skip the call.
    if n != 0:
        memcpy(arr.data, vec.data(), sizeof(float) * n)
    return arr
cdef void fill_context(atom_t* context, const TokenC* tokens, int i) nogil:
    # Write the context slots for the five-token window centred on tokens[i]:
    # two back, one back, the token itself, one ahead and two ahead. Each
    # neighbour is written at its own *_orth offset into `context`.
    # NOTE(review): reads tokens[i-2] .. tokens[i+2] with no bounds check;
    # presumably the caller's token array is padded on both sides -- confirm.
    cdef const TokenC* centre = &tokens[i]
    _fill_from_token(context + P2_orth, centre - 2)
    _fill_from_token(context + P1_orth, centre - 1)
    _fill_from_token(context + W_orth, centre)
    _fill_from_token(context + N1_orth, centre + 1)
    _fill_from_token(context + N2_orth, centre + 2)
cdef inline void _fill_from_token(atom_t* context, const TokenC* t) nogil: cdef inline void _fill_from_token(atom_t* context, const TokenC* t) nogil:
@ -157,17 +213,17 @@ cdef class Tagger:
The newly constructed object. The newly constructed object.
""" """
if model is None: if model is None:
model = TaggerModel(cfg.get('features', self.feature_templates), model = TaggerModel(vocab.morphology.n_tags,
L1=0.0) cfg.get('features', self.feature_templates))
self.vocab = vocab self.vocab = vocab
self.model = model self.model = model
self.model.l1_penalty = 0.0
# TODO: Move this to tag map # TODO: Move this to tag map
self.freqs = {TAG: defaultdict(int)} self.freqs = {TAG: defaultdict(int)}
for tag in self.tag_names: for tag in self.tag_names:
self.freqs[TAG][self.vocab.strings[tag]] = 1 self.freqs[TAG][self.vocab.strings[tag]] = 1
self.freqs[TAG][0] = 1 self.freqs[TAG][0] = 1
self.cfg = cfg self.cfg = cfg
self.optimizer = Adam(NumpyOps(), 0.001)
@property @property
def tag_names(self): def tag_names(self):
@ -194,20 +250,20 @@ cdef class Tagger:
if tokens.length == 0: if tokens.length == 0:
return 0 return 0
cdef Pool mem = Pool() cdef atom_t[1][N_CONTEXT_FIELDS] c_context
memset(c_context, 0, sizeof(c_context))
cdef atom_t[:, ::1] context = c_context
cdef float[:, ::1] scores
cdef int i, tag cdef int nr_class = self.vocab.morphology.n_tags
cdef Example eg = Example(nr_atom=N_CONTEXT_FIELDS,
nr_class=self.vocab.morphology.n_tags,
nr_feat=self.model.nr_feat)
for i in range(tokens.length): for i in range(tokens.length):
if tokens.c[i].pos == 0: if tokens.c[i].pos == 0:
self.model.set_featuresC(&eg.c, tokens.c, i) fill_context(&context[0, 0], tokens.c, i)
self.model.set_scoresC(eg.c.scores, scores, _ = self.model.begin_update(context)
eg.c.features, eg.c.nr_feat)
guess = VecVec.arg_max_if_true(eg.c.scores, eg.c.is_valid, eg.c.nr_class) guess = Vec.arg_max(&scores[0, 0], nr_class)
self.vocab.morphology.assign_tag_id(&tokens.c[i], guess) self.vocab.morphology.assign_tag_id(&tokens.c[i], guess)
eg.fill_scores(0, eg.c.nr_class) memset(&scores[0, 0], 0, sizeof(float) * scores.size)
tokens.is_tagged = True tokens.is_tagged = True
tokens._py_tokens = [None] * tokens.length tokens._py_tokens = [None] * tokens.length
@ -239,6 +295,7 @@ cdef class Tagger:
Returns (int): Returns (int):
Number of tags correct. Number of tags correct.
""" """
cdef int nr_class = self.vocab.morphology.n_tags
gold_tag_strs = gold.tags gold_tag_strs = gold.tags
assert len(tokens) == len(gold_tag_strs) assert len(tokens) == len(gold_tag_strs)
for tag in gold_tag_strs: for tag in gold_tag_strs:
@ -248,24 +305,25 @@ cdef class Tagger:
raise ValueError(msg % tag) raise ValueError(msg % tag)
golds = [self.tag_names.index(g) if g is not None else -1 for g in gold_tag_strs] golds = [self.tag_names.index(g) if g is not None else -1 for g in gold_tag_strs]
cdef int correct = 0 cdef int correct = 0
cdef Pool mem = Pool()
cdef Example eg = Example( cdef atom_t[:, ::1] context = np.zeros((1, N_CONTEXT_FIELDS), dtype='uint64')
nr_atom=N_CONTEXT_FIELDS, cdef float[:, ::1] scores
nr_class=self.vocab.morphology.n_tags,
nr_feat=self.model.nr_feat)
for i in range(tokens.length): for i in range(tokens.length):
self.model.set_featuresC(&eg.c, tokens.c, i) fill_context(&context[0, 0], tokens.c, i)
eg.costs = [ 1 if golds[i] not in (c, -1) else 0 for c in xrange(eg.nr_class) ] scores, finish_update = self.model.begin_update(context)
self.model.set_scoresC(eg.c.scores, guess = Vec.arg_max(&scores[0, 0], nr_class)
eg.c.features, eg.c.nr_feat) self.vocab.morphology.assign_tag_id(&tokens.c[i], guess)
self.model.update(eg)
self.vocab.morphology.assign_tag_id(&tokens.c[i], eg.guess) if golds[i] != -1:
scores[0, golds[i]] -= 1
finish_update(scores, lambda *args, **kwargs: None)
correct += eg.cost == 0 if (golds[i] in (guess, -1)):
correct += 1
self.freqs[TAG][tokens.c[i].tag] += 1 self.freqs[TAG][tokens.c[i].tag] += 1
eg.fill_scores(0, eg.c.nr_class) self.optimizer(self.model.model.weights, self.model.model.d_weights,
eg.fill_costs(0, eg.c.nr_class) key=self.model.model.id)
tokens.is_tagged = True tokens.is_tagged = True
tokens._py_tokens = [None] * tokens.length tokens._py_tokens = [None] * tokens.length
return correct return correct