from libc.string cimport memcpy
from libc.math cimport exp
from cymem.cymem cimport Pool
from thinc.learner cimport LinearModel
from thinc.features cimport Extractor, Feature
from thinc.typedefs cimport atom_t, weight_t, feat_t
cimport cython
from .typedefs cimport flags_t
from .structs cimport TokenC
from .strings cimport StringStore
from .tokens cimport Tokens
from .senses cimport N_SENSES, encode_sense_strs
from .senses cimport NO_SENSE, N_Tops, J_all, J_pert, A_all, J_ppl, V_body
from .gold cimport GoldParse
from .parts_of_speech cimport NOUN, VERB, ADV, ADJ, N_UNIV_TAGS
from . cimport parts_of_speech
from os import path
import json
# NOTE(review): no enumerators are visible under this enum in this view.
# The slot names used below (P2W, N0p, P1s, ...) and CONTEXT_SIZE are
# presumably declared here -- confirm against the full file.
cdef enum:
# Feature templates. Each inner tuple lists context slots; names such as P2W
# and P1s are used as indices into the context array in fill_context. The
# three groups are concatenated and passed to thinc's Extractor in
# SenseTagger.__init__.
# NOTE(review): the closing parenthesis of each group is not visible in this
# view.
# NOTE(review): several templates appear more than once (e.g. the P1*, N0*,
# N1* and N2* rows of `unigrams`, and (P1p, N0p, N1p) in `trigrams`) --
# confirm whether the repetition is intended.
unigrams = (
(P2W, P2p),
(P2c, P2p),
(P2c6, P2p),
(P2c4, P2p),
(P1W, P1p),
(P1c, P1p),
(P1c6, P1p),
(P1c4, P1p),
(P1W, P1p),
(P1c, P1p),
(P1c6, P1p),
(P1c4, P1p),
(N0c, N0p),
(N0c6, N0p),
(N0c4, N0p),
(N0c, N0p),
(N0c6, N0p),
(N0c4, N0p),
(N1W, N1p),
(N1c, N1p),
(N1c6, N1p),
(N1c4, N1p),
(N1W, N1p),
(N1c, N1p),
(N1c6, N1p),
(N1c4, N1p),
(N2W, N2p),
(N2c, N2p),
(N2c6, N2p),
(N2c4, N2p),
(N2W, N2p),
(N2c, N2p),
(N2c6, N2p),
(N2c4, N2p),
(P1s, P2s,),
(P1s, N0p),
(P1s, P2s, N0c),
# Second template group (concatenated after `unigrams`).
bigrams = (
(P2p, P1p),
(P2W, N0p),
(P2c, P1p),
(P1c, N0p),
(P1c6, N0p),
(N0p, N1p,),
(P2W, P1W),
(P1W, N1W),
(N1W, N2W),
# Third template group (concatenated after `bigrams`).
trigrams = (
(P1p, N0p, N1p),
(P2p, P1p,),
(P2c4, P1c4, N0c4),
(P1p, N0p, N1p),
(P1p, N0p,),
(P1c4, N0c4, N1c4),
(N0p, N1p, N2p),
(N0p, N1p,),
(N0c4, N1c4, N2c4),
(P1W, N0p, N0W),
cdef int fill_token(atom_t* ctxt, const TokenC* token) except -1:
    """Write the five per-token atoms for `token` into ctxt[0] .. ctxt[4].

    Slots, in order: the token's lemma, its tag, the lexeme's cluster value,
    the low 4 bits of the cluster, and the low 6 bits of the cluster.
    The caller supplies a pointer to the first of the five slots.
    """
    ctxt[0] = token.lemma
    ctxt[1] = token.tag
    ctxt[2] = token.lex.cluster
    # 0xF keeps the low four bits of the cluster, 0x3F the low six.
    ctxt[3] = token.lex.cluster & 0xF
    ctxt[4] = token.lex.cluster & 0x3F
    return 0
cdef int fill_context(atom_t* ctxt, const TokenC* token) except -1:
    """Fill the context array for the token at `token`.

    Per-token atoms are written for the two preceding tokens, the token
    itself and the two following tokens; the previous two tokens' senses and
    the lemmas three positions away on either side are added afterwards.
    """
    # NB: we have padding to keep us safe here
    # See tokens.pyx
    cdef const TokenC* prev2 = token - 2
    cdef const TokenC* prev1 = token - 1
    cdef const TokenC* next1 = token + 1
    cdef const TokenC* next2 = token + 2

    fill_token(&ctxt[P2W], prev2)
    fill_token(&ctxt[P1W], prev1)
    fill_token(&ctxt[N0W], token)
    # Important! Don't condition on this: blank the current token's first
    # slot right after fill_token has written it.
    ctxt[N0W] = 0
    fill_token(&ctxt[N1W], next1)
    fill_token(&ctxt[N2W], next2)

    # Senses already assigned to the left context.
    ctxt[P2s] = prev2.sense
    ctxt[P1s] = prev1.sense
    # Lemmas at distance three on each side.
    ctxt[P3W] = (token - 3).lemma
    ctxt[N3W] = (token + 3).lemma
    return 0
cdef class FeatureVector:
    """Growable array of thinc `Feature` structs.

    Storage is owned by a cymem `Pool`, so it is released together with the
    instance. `length` is the number of features currently stored and
    `_max_length` is the capacity, in elements, of the buffer `c`.
    """
    cdef Pool mem
    cdef Feature* c
    cdef list extractors
    cdef int length
    cdef int _max_length

    def __init__(self, length=100):
        """Allocate room for `length` features; the vector starts empty."""
        self.mem = Pool()
        self.c = <Feature*>self.mem.alloc(length, sizeof(Feature))
        self.length = 0
        self._max_length = length

    def __len__(self):
        """Return the number of features currently stored."""
        return self.length

    cpdef int add(self, feat_t key, weight_t value) except -1:
        """Append one feature with the given key and value (its `i` is 0).

        The buffer capacity is doubled when it is full.
        """
        if self.length == self._max_length:
            self._max_length *= 2
            self.c = <Feature*>self.mem.realloc(self.c, self._max_length * sizeof(Feature))
        self.c[self.length] = Feature(i=0, key=key, value=value)
        self.length += 1

    cdef int extend(self, const Feature* new_feats, int n_feats) except -1:
        """Append `n_feats` features copied from `new_feats`."""
        cdef int new_length = self.length + n_feats
        if new_length >= self._max_length:
            self._max_length = 2 * new_length
            # Allocate the capacity we just recorded. Allocating only
            # `new_length` elements while recording twice that would let a
            # later add()/extend() write past the end of the buffer.
            self.c = <Feature*>self.mem.realloc(self.c, self._max_length * sizeof(Feature))
        memcpy(&self.c[self.length], new_feats, n_feats * sizeof(Feature))
        self.length += n_feats

    def clear(self):
        """Forget all stored features; the allocated buffer is kept."""
        self.length = 0
# Supersense tagger: scores senses with a thinc LinearModel over features
# extracted from a token-context array, restricted per part-of-speech tag.
cdef class SenseTagger:
# String table; used to turn a token's lemma id into a string for tagdict
# lookups in weight_scores_by_tagdict.
cdef readonly StringStore strings
# Linear model created in __init__ with N_SENSES classes.
cdef readonly LinearModel model
# Feature extractor built in __init__ from the unigram/bigram/trigram
# templates.
cdef readonly Extractor extractor
# Directory passed to __init__; end_training dumps the model to
# <model_dir>/model.
cdef readonly model_dir
# Per-tag bitmask of permitted senses, indexed by the tag cast to int.
cdef readonly flags_t[<int>N_UNIV_TAGS] pos_senses
# Loaded from <model_dir>/wordnet/supersenses.json when that file exists.
# weight_scores_by_tagdict reads it as lemma string -> dict keyed by the
# stringified (sense index - 1).
cdef dict tagdict
def __init__(self, StringStore strings, model_dir):
    """Set up the tag dictionary, feature extractor, model and POS sense masks.

    strings: string table used to resolve lemma ids.
    model_dir: directory holding `wordnet/supersenses.json`; may be None, in
        which case the tag dictionary is left empty.
    """
    cdef flags_t all_senses = 0
    cdef flags_t sense = 0
    cdef flags_t one = 1

    self.model_dir = model_dir

    # Default to an empty tag dictionary and only replace it when the JSON
    # file is actually present. (Previously the loaded dictionary was
    # overwritten by the unconditional `self.tagdict = {}`, the file handle
    # was never closed, and a None model_dir reached path.join before the
    # None check below.)
    self.tagdict = {}
    if model_dir is not None:
        tagdict_loc = path.join(model_dir, 'wordnet', 'supersenses.json')
        if path.exists(tagdict_loc):
            with open(tagdict_loc) as file_:
                self.tagdict = json.load(file_)

    # NOTE(review): this rebinds only the local name; nothing visible in this
    # method reads it afterwards. Kept as in the original -- confirm whether
    # a model-loading step is supposed to follow.
    if model_dir is not None and path.isdir(model_dir):
        model_dir = path.join(model_dir, 'wsd')

    templates = unigrams + bigrams + trigrams
    self.extractor = Extractor(templates)
    self.model = LinearModel(N_SENSES, self.extractor.n_templ)
    self.strings = strings

    # Mask with every sense bit from 1 up to N_SENSES - 1 set; bit 0 is
    # deliberately left out.
    for sense in range(1, N_SENSES):
        all_senses |= (one << sense)

    # A mask of 0 leaves no valid sense for that tag, because __call__ and
    # train AND this mask with the lexeme's own sense flags.
    self.pos_senses[<int>parts_of_speech.NO_TAG] = all_senses
    self.pos_senses[<int>parts_of_speech.ADJ] = all_senses
    self.pos_senses[<int>parts_of_speech.ADV] = all_senses
    self.pos_senses[<int>parts_of_speech.ADP] = all_senses
    self.pos_senses[<int>parts_of_speech.CONJ] = 0
    self.pos_senses[<int>parts_of_speech.DET] = 0
    self.pos_senses[<int>parts_of_speech.NUM] = 0
    self.pos_senses[<int>parts_of_speech.PRON] = 0
    self.pos_senses[<int>parts_of_speech.PRT] = all_senses
    self.pos_senses[<int>parts_of_speech.X] = all_senses
    self.pos_senses[<int>parts_of_speech.PUNCT] = 0
    self.pos_senses[<int>parts_of_speech.EOL] = 0

    # Nouns may take the senses in [N_Tops, V_body). Start from an explicit
    # zero, as is done for VERB, rather than relying on implicit
    # initialisation of the array.
    self.pos_senses[<int>parts_of_speech.NOUN] = 0
    for sense in range(N_Tops, V_body):
        self.pos_senses[<int>parts_of_speech.NOUN] |= one << sense

    # Verbs may take the senses in [V_body, J_ppl).
    self.pos_senses[<int>parts_of_speech.VERB] = 0
    for sense in range(V_body, J_ppl):
        self.pos_senses[<int>parts_of_speech.VERB] |= one << sense
# Assign a sense to each token of `tokens`, writing to the token's `sense`
# field. Document-level features come from get_doc_feats; per-token features
# are extracted from a context array filled by fill_context.
def __call__(self, Tokens tokens):
cdef atom_t[CONTEXT_SIZE] local_context
# NOTE(review): `guess` and `n_feats` are declared but not used in the
# visible body; `n_local_feats` is used below without a cdef declaration.
cdef int i, guess, n_feats
cdef flags_t valid_senses = 0
cdef TokenC* token
cdef flags_t one = 1
cdef int n_doc_feats
# Pool that owns the feature buffer returned by get_doc_feats.
cdef Pool mem = Pool()
feats = self.get_doc_feats(mem, tokens, &n_doc_feats)
for i in range(tokens.length):
# NOTE(review): the operand of `&` is missing in this view; presumably the
# i-th TokenC of `tokens` -- confirm against the full file.
token = &[i]
# Senses listed for the lexeme, restricted to those allowed for its POS tag.
valid_senses = token.lex.senses & self.pos_senses[<int>token.pos]
# A value >= 2 means at least one bit other than bit 0 is set.
if valid_senses >= 2:
fill_context(local_context, token)
# NOTE(review): this call is cut off after its first argument in this view;
# train() passes its context array as the second argument.
n_local_feats = self.extractor.set_feats(&feats[n_doc_feats],
# NOTE(review): train() passes n_doc_feats + n_local_feats as the feature
# count here; only n_local_feats is passed in this method -- confirm.
scores = self.model.get_scores(feats, n_local_feats)
# NOTE(review): two statements appear fused on the next line (a call to
# weight_scores_by_tagdict, then an assignment to `[i].sense`) -- confirm.
self.weight_scores_by_tagdict(<weight_t*><void*>scores, token, 0.0)[i].sense = self.best_in_set(scores, valid_senses)
# NOTE(review): presumably the `else` branch of the check above; the `else:`
# line is not visible in this view.
token.sense = NO_SENSE
# Score each token of `tokens` and return how many times the best sense among
# the POS-permitted senses (`guess`) differed from the best sense among the
# lexeme's own permitted senses (`best`). The token's `sense` field is
# written along the way.
def train(self, Tokens tokens):
# NOTE(review): `j`, `f_key`, `best_senses` and `f_i` are declared but not
# used in the visible body.
cdef int i, j
cdef TokenC* token
cdef atom_t[CONTEXT_SIZE] context
cdef int n_doc_feats, n_local_feats
cdef feat_t f_key
cdef flags_t best_senses = 0
cdef int f_i
cdef int cost = 0
# Pool that owns the feature buffer returned by get_doc_feats.
cdef Pool mem = Pool()
feats = self.get_doc_feats(mem, tokens, &n_doc_feats)
for i in range(tokens.length):
# NOTE(review): the operand of `&` is missing in this view; presumably the
# i-th TokenC of `tokens` -- confirm against the full file.
token = &[i]
pos_senses = self.pos_senses[<int>token.pos]
# Senses listed for the lexeme, restricted to those allowed for its POS tag.
lex_senses = token.lex.senses & pos_senses
# A value >= 2 means at least one bit other than bit 0 is set.
if lex_senses >= 2:
fill_context(context, token)
# Local features are written after the document-level features.
n_local_feats = self.extractor.set_feats(&feats[n_doc_feats], context)
scores = self.model.get_scores(feats, n_doc_feats + n_local_feats)
guess = self.best_in_set(scores, pos_senses)
best = self.best_in_set(scores, lex_senses)
# NOTE(review): `_make_update` is not defined in the visible part of this
# class (there is a `_perceptron_update` with the same parameters), and
# `update` is not used afterwards in the visible body -- confirm.
update = self._make_update(feats, n_doc_feats + n_local_feats,
guess, best)
token.sense = best
# Adds 1 when the unconstrained guess is wrong.
cost += guess != best
# NOTE(review): presumably the `else` branch of the check above; the `else:`
# line is not visible in this view.
token.sense = 1
return cost
cdef dict _perceptron_update(self, const Feature* feats, int n_feats, int guess, int best):
    """Build per-class feature deltas for a perceptron-style update.

    Returns a dict mapping class id to {(feature.i, feature.key): delta}.
    When `guess` differs from `best`, every one of the `n_feats` features
    contributes +1.0 under `best` and -1.0 under `guess`. When they are
    equal, the result holds that single class mapped to an empty dict.
    """
    if guess == best:
        # Nothing to correct.
        return {best: {}}
    penalty = {}
    reward = {}
    for idx in range(n_feats):
        feat_id = (feats[idx].i, feats[idx].key)
        reward[feat_id] = reward.get(feat_id, 0) + 1.0
        penalty[feat_id] = penalty.get(feat_id, 0) - 1.0
    return {guess: penalty, best: reward}
# Allocate (from `mem`) and return the feature buffer for a document. The
# first N_SENSES entries are document-level features, one per sense, and
# n_feats[0] is set to N_SENSES. The allocation request also counts
# self.extractor.n_templ + 1 further entries; callers write per-token
# features starting at index n_feats[0].
cdef Feature* get_doc_feats(self, Pool mem, Tokens tokens, int* n_feats) except NULL:
# Get features for the document
# Start with activation strengths for each supersense
n_feats[0] = N_SENSES
# NOTE(review): this call is cut off after its first argument in this view;
# presumably the element size follows, as in FeatureVector.__init__ --
# confirm.
feats = <Feature*>mem.alloc(n_feats[0] + self.extractor.n_templ + 1,
cdef int i, ssense
# One zero-valued feature per sense, keyed by the sense index.
for ssense in range(N_SENSES):
feats[ssense] = Feature(i=0, key=ssense, value=0)
cdef flags_t pos_senses
cdef flags_t one = 1
for i in range(tokens.length):
# NOTE(review): the object before `[i]` is missing on the next two lines;
# presumably the i-th token of `tokens`. Also, weight_scores_by_tagdict
# looks tagdict up by lemma *string* (via self.strings), whereas this
# passes `.lemma` directly -- confirm the key type.
sense_probs = self.tagdict.get([i].lemma, {})
pos_senses = self.pos_senses[<int>[i].pos]
for ssense_str, prob in sense_probs.items():
# NOTE(review): weight_scores_by_tagdict reads keys as unicode(i - 1), which
# suggests the intended conversion is int(ssense_str) + 1; as written the
# `+ 1` is applied to ssense_str before int() -- confirm.
ssense = int(ssense_str + 1)
# Only accumulate senses permitted for this token's POS tag.
if pos_senses & (one << <flags_t>ssense):
feats[ssense].value += prob
return feats
cdef int best_in_set(self, const weight_t* scores, flags_t senses) except -1:
    """Return the index of the highest-scoring sense whose bit is set in `senses`.

    Indices 0 .. N_SENSES - 1 are considered. On ties the lowest index wins,
    because a later candidate must score strictly higher to replace it.
    Fails an assertion when no bit in that range is set.
    """
    cdef flags_t one = 1
    cdef flags_t sense_id
    cdef int winner = -1
    cdef weight_t top_score = 0
    for sense_id in range(N_SENSES):
        if not (senses & (one << sense_id)):
            # Sense not permitted; skip it.
            continue
        if winner == -1 or scores[sense_id] > top_score:
            top_score = scores[sense_id]
            winner = sense_id
    assert winner >= 0
    return winner
cdef int weight_scores_by_tagdict(self, weight_t* scores, const TokenC* token,
                                  weight_t a) except -1:
    """Softmax `scores` in place, then blend them with the tag dictionary.

    For each sense index i in 1 .. N_SENSES - 1, the new score is
    a * p + (1 - a) * softmaxed score, where p is the value stored in the tag
    dictionary for the token's lemma under the key unicode(i - 1), or 0 when
    absent. scores[0] keeps its softmaxed value.
    """
    lemma_str = self.strings[token.lemma]
    # First softmax the scores
    softmax(scores, N_SENSES)
    lemma_probs = self.tagdict.get(lemma_str, {})
    for sense_id in range(1, N_SENSES):
        # Dictionary keys are offset by one from the score index.
        dict_prob = lemma_probs.get(unicode(sense_id - 1), 0)
        scores[sense_id] = (a * dict_prob) + ((1 - a) * scores[sense_id])
    return 0
def end_training(self):
    """Dump the model to `<model_dir>/model`, passing freq_thresh=0."""
    model_loc = path.join(self.model_dir, 'model')
    self.model.dump(model_loc, freq_thresh=0)
cdef void softmax(weight_t* scores, int n_classes) nogil:
    # Replace the first `n_classes` entries of `scores`, in place, with their
    # softmax: exp(score) / sum(exp(scores)).
    #
    # The length used is the `n_classes` argument (the previous body ignored
    # it and always read N_SENSES entries). The maximum score is subtracted
    # before exponentiating; this leaves the result mathematically unchanged
    # but keeps exp() from overflowing on large scores, which would otherwise
    # produce inf / inf = NaN.
    cdef int i
    cdef double total = 0
    cdef double max_score
    if n_classes <= 0:
        # Nothing to normalise.
        return
    max_score = scores[0]
    for i in range(1, n_classes):
        if scores[i] > max_score:
            max_score = scores[i]
    for i in range(n_classes):
        total += exp(scores[i] - max_score)
    # total >= 1 here because the maximum element contributes exp(0) == 1.
    for i in range(n_classes):
        scores[i] = <weight_t>(exp(scores[i] - max_score) / total)
cdef list _set_bits(flags_t flags):
    """Return the indices of the bits set in `flags`, in ascending order.

    Only bit positions 0 .. N_SENSES - 1 are examined.
    """
    cdef flags_t bit
    cdef flags_t one = 1
    bits = []
    for bit in range(N_SENSES):
        if flags & (one << bit):
            # Record the set bit; the list is returned only after every
            # position has been checked.
            bits.append(bit)
    return bits