2015-07-04 13:26:16 +03:00
|
|
|
from libc.string cimport memcpy
|
|
|
|
from cymem.cymem cimport Pool
|
|
|
|
|
|
|
|
from thinc.learner cimport LinearModel
|
|
|
|
from thinc.features cimport Extractor, Feature
|
|
|
|
|
|
|
|
from thinc.typedefs cimport atom_t, weight_t, feat_t
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
2015-07-02 01:54:26 +03:00
|
|
|
from .typedefs cimport flags_t
|
|
|
|
from .structs cimport TokenC
|
|
|
|
from .strings cimport StringStore
|
|
|
|
from .tokens cimport Tokens
|
2015-07-03 16:25:41 +03:00
|
|
|
from .senses cimport N_SENSES, encode_sense_strs
|
2015-07-04 13:26:16 +03:00
|
|
|
from .senses cimport NO_SENSE, N_Tops, J_ppl, V_body
|
2015-07-02 01:54:26 +03:00
|
|
|
from .gold cimport GoldParse
|
2015-07-03 16:25:41 +03:00
|
|
|
from .parts_of_speech cimport NOUN, VERB, N_UNIV_TAGS
|
|
|
|
|
|
|
|
from . cimport parts_of_speech
|
2015-07-03 14:31:11 +03:00
|
|
|
|
|
|
|
from os import path
|
2015-07-02 01:54:26 +03:00
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
cdef enum:
|
|
|
|
P2W
|
|
|
|
P2p
|
|
|
|
P2c
|
|
|
|
P2c6
|
|
|
|
P2c4
|
|
|
|
|
|
|
|
P1W
|
|
|
|
P1p
|
|
|
|
P1c
|
|
|
|
P1c6
|
|
|
|
P1c4
|
|
|
|
|
|
|
|
N0W
|
|
|
|
N0p
|
|
|
|
N0c
|
|
|
|
N0c6
|
|
|
|
N0c4
|
|
|
|
|
|
|
|
N1W
|
|
|
|
N1p
|
|
|
|
N1c
|
|
|
|
N1c6
|
|
|
|
N1c4
|
|
|
|
|
|
|
|
N2W
|
|
|
|
N2p
|
|
|
|
N2c
|
|
|
|
N2c6
|
|
|
|
N2c4
|
2015-07-04 13:26:16 +03:00
|
|
|
|
|
|
|
P1s
|
|
|
|
P2s
|
2015-07-02 01:54:26 +03:00
|
|
|
|
|
|
|
CONTEXT_SIZE
|
|
|
|
|
|
|
|
|
|
|
|
unigrams = (
|
|
|
|
(P2W,),
|
|
|
|
(P2p,),
|
|
|
|
(P2W, P2p),
|
|
|
|
(P2c, P2p),
|
|
|
|
(P2c6, P2p),
|
|
|
|
(P2c4, P2p),
|
|
|
|
(P2c,),
|
|
|
|
|
|
|
|
(P1W,),
|
|
|
|
(P1p,),
|
|
|
|
(P1W, P1p),
|
|
|
|
(P1c, P1p),
|
|
|
|
(P1c6, P1p),
|
|
|
|
(P1c4, P1p),
|
|
|
|
(P1c,),
|
|
|
|
(P1W,),
|
|
|
|
(P1p,),
|
|
|
|
(P1W, P1p),
|
|
|
|
(P1c, P1p),
|
|
|
|
(P1c6, P1p),
|
|
|
|
(P1c4, P1p),
|
|
|
|
(P1c,),
|
|
|
|
|
|
|
|
(N0p,),
|
|
|
|
(N0c, N0p),
|
|
|
|
(N0c6, N0p),
|
|
|
|
(N0c4, N0p),
|
|
|
|
(N0c,),
|
|
|
|
(N0p,),
|
|
|
|
(N0c, N0p),
|
|
|
|
(N0c6, N0p),
|
|
|
|
(N0c4, N0p),
|
|
|
|
(N0c,),
|
|
|
|
|
|
|
|
(N1p,),
|
|
|
|
(N1W, N1p),
|
|
|
|
(N1c, N1p),
|
|
|
|
(N1c6, N1p),
|
|
|
|
(N1c4, N1p),
|
|
|
|
(N1c,),
|
|
|
|
(N1W,),
|
|
|
|
(N1p,),
|
|
|
|
(N1W, N1p),
|
|
|
|
(N1c, N1p),
|
|
|
|
(N1c6, N1p),
|
|
|
|
(N1c4, N1p),
|
|
|
|
(N1c,),
|
|
|
|
|
|
|
|
(N2p,),
|
|
|
|
(N2W, N2p),
|
|
|
|
(N2c, N2p),
|
|
|
|
(N2c6, N2p),
|
|
|
|
(N2c4, N2p),
|
|
|
|
(N2c,),
|
|
|
|
(N2W,),
|
|
|
|
(N2p,),
|
|
|
|
(N2W, N2p),
|
|
|
|
(N2c, N2p),
|
|
|
|
(N2c6, N2p),
|
|
|
|
(N2c4, N2p),
|
|
|
|
(N2c,),
|
2015-07-04 13:26:16 +03:00
|
|
|
|
|
|
|
(P1s,),
|
|
|
|
(P2s,),
|
|
|
|
(P1s, P2s,),
|
|
|
|
(P1s, N0p),
|
|
|
|
(P1s, P2s, N0c),
|
2015-07-02 01:54:26 +03:00
|
|
|
)
|
|
|
|
|
|
|
|
|
|
|
|
bigrams = (
|
|
|
|
(P2p, P1p),
|
|
|
|
(P2W, N0p),
|
|
|
|
(P2c, P1p),
|
|
|
|
(P1c, N0p),
|
|
|
|
(P1c6, N0p),
|
|
|
|
|
|
|
|
(N0p, N1p,),
|
|
|
|
)
|
|
|
|
|
|
|
|
|
|
|
|
trigrams = (
|
|
|
|
(P1p, N0p, N1p),
|
2015-07-03 16:25:41 +03:00
|
|
|
(P2p, P1p,),
|
2015-07-02 01:54:26 +03:00
|
|
|
(P2c4, P1c4, N0c4),
|
|
|
|
|
|
|
|
(P1p, N0p, N1p),
|
2015-07-03 16:25:41 +03:00
|
|
|
(P1p, N0p,),
|
2015-07-02 01:54:26 +03:00
|
|
|
(P1c4, N0c4, N1c4),
|
|
|
|
|
|
|
|
(N0p, N1p, N2p),
|
2015-07-03 16:25:41 +03:00
|
|
|
(N0p, N1p,),
|
2015-07-02 01:54:26 +03:00
|
|
|
(N0c4, N1c4, N2c4),
|
|
|
|
)
|
|
|
|
|
|
|
|
|
|
|
|
cdef int fill_token(atom_t* ctxt, const TokenC* token) except -1:
|
|
|
|
ctxt[0] = token.lemma
|
|
|
|
ctxt[1] = token.tag
|
|
|
|
ctxt[2] = token.lex.cluster
|
|
|
|
ctxt[3] = token.lex.cluster & 15
|
|
|
|
ctxt[4] = token.lex.cluster & 63
|
|
|
|
|
|
|
|
|
|
|
|
cdef int fill_context(atom_t* ctxt, const TokenC* token) except -1:
|
2015-07-03 14:31:11 +03:00
|
|
|
# NB: we have padding to keep us safe here
|
|
|
|
# See tokens.pyx
|
2015-07-02 01:54:26 +03:00
|
|
|
fill_token(&ctxt[P2W], token - 2)
|
|
|
|
fill_token(&ctxt[P1W], token - 1)
|
|
|
|
|
|
|
|
fill_token(&ctxt[N0W], token)
|
|
|
|
ctxt[N0W] = 0 # Important! Don't condition on this
|
|
|
|
|
|
|
|
fill_token(&ctxt[N1W], token + 1)
|
|
|
|
fill_token(&ctxt[N2W], token + 2)
|
2015-07-04 13:26:16 +03:00
|
|
|
ctxt[P1s] = (token - 1).sense
|
|
|
|
ctxt[P2s] = (token - 2).sense
|
|
|
|
|
|
|
|
|
|
|
|
cdef class FeatureVector:
|
|
|
|
cdef Pool mem
|
|
|
|
cdef Feature* c
|
|
|
|
cdef list extractors
|
|
|
|
cdef int length
|
|
|
|
cdef int _max_length
|
|
|
|
|
|
|
|
def __init__(self, length=100):
|
|
|
|
self.mem = Pool()
|
|
|
|
self.c = <Feature*>self.mem.alloc(length, sizeof(Feature))
|
|
|
|
self.length = 0
|
|
|
|
self._max_length = length
|
|
|
|
|
|
|
|
def __len__(self):
|
|
|
|
return self.length
|
|
|
|
|
|
|
|
cpdef int add(self, feat_t key, weight_t value) except -1:
|
|
|
|
if self.length == self._max_length:
|
|
|
|
self._max_length *= 2
|
|
|
|
self.c = <Feature*>self.mem.realloc(self.c, self._max_length * sizeof(Feature))
|
|
|
|
|
|
|
|
self.c[self.length] = Feature(i=0, key=key, value=value)
|
|
|
|
self.length += 1
|
|
|
|
|
|
|
|
cdef int extend(self, const Feature* new_feats, int n_feats) except -1:
|
|
|
|
new_length = self.length + n_feats
|
|
|
|
if new_length >= self._max_length:
|
|
|
|
self._max_length = 2 * new_length
|
|
|
|
self.c = <Feature*>self.mem.realloc(self.c, new_length * sizeof(Feature))
|
|
|
|
memcpy(&self.c[self.length], new_feats, n_feats * sizeof(Feature))
|
|
|
|
self.length += n_feats
|
|
|
|
|
|
|
|
def clear(self):
|
|
|
|
self.length = 0
|
2015-07-02 01:54:26 +03:00
|
|
|
|
|
|
|
|
|
|
|
cdef class SenseTagger:
|
|
|
|
cdef readonly StringStore strings
|
2015-07-03 14:31:11 +03:00
|
|
|
cdef readonly LinearModel model
|
|
|
|
cdef readonly Extractor extractor
|
|
|
|
cdef readonly model_dir
|
2015-07-03 16:25:41 +03:00
|
|
|
cdef readonly flags_t[<int>N_UNIV_TAGS] pos_senses
|
2015-07-02 01:54:26 +03:00
|
|
|
|
|
|
|
def __init__(self, StringStore strings, model_dir):
|
2015-07-03 14:31:11 +03:00
|
|
|
if model_dir is not None and path.isdir(model_dir):
|
|
|
|
model_dir = path.join(model_dir, 'model')
|
|
|
|
|
2015-07-02 01:54:26 +03:00
|
|
|
templates = unigrams + bigrams + trigrams
|
2015-07-03 14:31:11 +03:00
|
|
|
self.extractor = Extractor(templates)
|
|
|
|
self.model = LinearModel(N_SENSES, self.extractor.n_templ)
|
|
|
|
self.model_dir = model_dir
|
|
|
|
if self.model_dir and path.exists(self.model_dir):
|
|
|
|
self.model.load(self.model_dir, freq_thresh=0)
|
|
|
|
self.strings = strings
|
2015-07-02 01:54:26 +03:00
|
|
|
|
2015-07-03 16:25:41 +03:00
|
|
|
self.pos_senses[<int>parts_of_speech.NO_TAG] = 0
|
|
|
|
self.pos_senses[<int>parts_of_speech.ADJ] = 0
|
|
|
|
self.pos_senses[<int>parts_of_speech.ADV] = 0
|
|
|
|
self.pos_senses[<int>parts_of_speech.ADP] = 0
|
|
|
|
self.pos_senses[<int>parts_of_speech.CONJ] = 0
|
|
|
|
self.pos_senses[<int>parts_of_speech.DET] = 0
|
|
|
|
self.pos_senses[<int>parts_of_speech.NOUN] = 0
|
|
|
|
self.pos_senses[<int>parts_of_speech.NUM] = 0
|
|
|
|
self.pos_senses[<int>parts_of_speech.PRON] = 0
|
|
|
|
self.pos_senses[<int>parts_of_speech.PRT] = 0
|
|
|
|
self.pos_senses[<int>parts_of_speech.VERB] = 0
|
|
|
|
self.pos_senses[<int>parts_of_speech.X] = 0
|
|
|
|
self.pos_senses[<int>parts_of_speech.PUNCT] = 0
|
|
|
|
self.pos_senses[<int>parts_of_speech.EOL] = 0
|
|
|
|
|
|
|
|
cdef flags_t sense = 0
|
2015-07-04 13:26:16 +03:00
|
|
|
for sense in range(N_Tops, V_body):
|
2015-07-03 16:25:41 +03:00
|
|
|
self.pos_senses[<int>parts_of_speech.NOUN] |= 1 << sense
|
|
|
|
|
2015-07-04 13:26:16 +03:00
|
|
|
for sense in range(V_body, J_ppl):
|
2015-07-03 16:25:41 +03:00
|
|
|
self.pos_senses[<int>parts_of_speech.VERB] |= 1 << sense
|
|
|
|
|
2015-07-02 01:54:26 +03:00
|
|
|
def __call__(self, Tokens tokens):
|
2015-07-04 13:26:16 +03:00
|
|
|
cdef atom_t[CONTEXT_SIZE] local_context
|
2015-07-03 14:31:11 +03:00
|
|
|
cdef int i, guess, n_feats
|
2015-07-04 13:26:16 +03:00
|
|
|
cdef flags_t valid_senses = 0
|
2015-07-03 16:25:41 +03:00
|
|
|
cdef TokenC* token
|
2015-07-04 13:26:16 +03:00
|
|
|
cdef FeatureVector features = FeatureVector(100)
|
2015-07-02 01:54:26 +03:00
|
|
|
for i in range(tokens.length):
|
2015-07-03 14:31:11 +03:00
|
|
|
token = &tokens.data[i]
|
2015-07-04 13:26:16 +03:00
|
|
|
if token.lex.senses == 1:
|
|
|
|
continue
|
|
|
|
assert not (token.lex.senses & (1 << NO_SENSE)), (tokens[i].orth_, token.lex.senses)
|
|
|
|
assert not (self.pos_senses[<int>token.pos] & (1 << NO_SENSE))
|
|
|
|
valid_senses = token.lex.senses & self.pos_senses[<int>token.pos]
|
|
|
|
assert not (valid_senses & (1 << NO_SENSE))
|
|
|
|
if valid_senses:
|
|
|
|
fill_context(local_context, token)
|
|
|
|
local_feats = self.extractor.get_feats(local_context, &n_feats)
|
|
|
|
features.extend(local_feats, n_feats)
|
|
|
|
scores = self.model.get_scores(features.c, features.length)
|
|
|
|
tokens.data[i].sense = self.best_in_set(scores, valid_senses)
|
|
|
|
features.clear()
|
2015-07-02 01:54:26 +03:00
|
|
|
|
|
|
|
def train(self, Tokens tokens, GoldParse gold):
|
2015-07-03 14:31:11 +03:00
|
|
|
cdef int i, j
|
2015-07-03 16:25:41 +03:00
|
|
|
cdef TokenC* token
|
2015-07-03 06:45:42 +03:00
|
|
|
for i, ssenses in enumerate(gold.ssenses):
|
2015-07-03 16:25:41 +03:00
|
|
|
token = &tokens.data[i]
|
2015-07-03 06:45:42 +03:00
|
|
|
if ssenses:
|
|
|
|
gold.c.ssenses[i] = encode_sense_strs(ssenses)
|
2015-07-04 13:26:16 +03:00
|
|
|
elif token.lex.senses >= 2 and token.pos in (NOUN, VERB):
|
2015-07-03 16:25:41 +03:00
|
|
|
gold.c.ssenses[i] = token.lex.senses & self.pos_senses[<int>token.pos]
|
2015-07-04 13:26:16 +03:00
|
|
|
else:
|
|
|
|
gold.c.ssenses[i] = 0
|
2015-07-03 14:31:11 +03:00
|
|
|
|
|
|
|
cdef atom_t[CONTEXT_SIZE] context
|
|
|
|
cdef int n_feats
|
|
|
|
cdef feat_t f_key
|
|
|
|
cdef int f_i
|
2015-07-02 01:54:26 +03:00
|
|
|
cdef int cost = 0
|
|
|
|
for i in range(tokens.length):
|
2015-07-03 14:31:11 +03:00
|
|
|
token = &tokens.data[i]
|
|
|
|
if token.pos in (NOUN, VERB) \
|
|
|
|
and token.lex.senses >= 2 \
|
|
|
|
and gold.c.ssenses[i] >= 2:
|
|
|
|
fill_context(context, token)
|
|
|
|
feats = self.extractor.get_feats(context, &n_feats)
|
|
|
|
scores = self.model.get_scores(feats, n_feats)
|
2015-07-04 13:26:16 +03:00
|
|
|
token.sense = self.best_in_set(scores, self.pos_senses[<int>token.pos])
|
2015-07-03 14:31:11 +03:00
|
|
|
best = self.best_in_set(scores, gold.c.ssenses[i])
|
|
|
|
guess_counts = {}
|
|
|
|
gold_counts = {}
|
|
|
|
if token.sense != best:
|
|
|
|
for j in range(n_feats):
|
|
|
|
f_key = feats[j].key
|
|
|
|
f_i = feats[j].i
|
|
|
|
feat = (f_i, f_key)
|
|
|
|
gold_counts[feat] = gold_counts.get(feat, 0) + 1.0
|
|
|
|
guess_counts[feat] = guess_counts.get(feat, 0) - 1.0
|
2015-07-03 16:25:41 +03:00
|
|
|
self.model.update({token.sense: guess_counts, best: gold_counts})
|
2015-07-02 01:54:26 +03:00
|
|
|
return cost
|
|
|
|
|
2015-07-03 14:31:11 +03:00
|
|
|
cdef int best_in_set(self, const weight_t* scores, flags_t senses) except -1:
|
|
|
|
cdef weight_t max_ = 0
|
|
|
|
cdef int argmax = -1
|
|
|
|
cdef flags_t i
|
|
|
|
for i in range(N_SENSES):
|
|
|
|
if (senses & (1 << i)) and (argmax == -1 or scores[i] > max_):
|
|
|
|
max_ = scores[i]
|
|
|
|
argmax = i
|
|
|
|
assert argmax >= 0
|
|
|
|
return argmax
|
2015-07-03 06:18:15 +03:00
|
|
|
|
2015-07-02 01:54:26 +03:00
|
|
|
|
2015-07-03 06:18:15 +03:00
|
|
|
cdef list _set_bits(flags_t flags):
|
|
|
|
bits = []
|
|
|
|
cdef flags_t bit
|
|
|
|
for bit in range(N_SENSES):
|
|
|
|
if flags & (1 << bit):
|
|
|
|
bits.append(bit)
|
|
|
|
return bits
|