# spaCy/spacy/sense_tagger.pyx

from libc.string cimport memcpy
from libc.math cimport exp
from cymem.cymem cimport Pool
from thinc.learner cimport LinearModel
from thinc.features cimport Extractor, Feature
from thinc.typedefs cimport atom_t, weight_t, feat_t
cimport cython
from .typedefs cimport flags_t
from .structs cimport TokenC
from .strings cimport StringStore
from .tokens cimport Tokens
from .senses cimport N_SENSES, encode_sense_strs
from .senses cimport NO_SENSE, N_Tops, J_all, J_pert, A_all, J_ppl, V_body
from .gold cimport GoldParse
from .parts_of_speech cimport NOUN, VERB, ADV, ADJ, N_UNIV_TAGS
from . cimport parts_of_speech
from os import path
import json


# Slot names for the atomic context array. Naming scheme: P/N = previous/next
# token by offset, H = syntactic head; suffixes W = lemma, p = POS tag,
# c = Brown cluster, c6/c4 = 6-bit/4-bit cluster prefixes, s = predicted sense.
cdef enum:
    P2W
    P2p
    P2c
    P2c6
    P2c4

    P1W
    P1p
    P1c
    P1c6
    P1c4

    N0W
    N0p
    N0c
    N0c6
    N0c4

    N1W
    N1p
    N1c
    N1c6
    N1c4

    N2W
    N2p
    N2c
    N2c6
    N2c4

    Hw
    Hp
    Hc
    Hc6
    Hc4

    N3W
    P3W

    P1s
    P2s

    CONTEXT_SIZE  # == number of slots, since enum values start at 0
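
# A concrete reading of the slot names (sketch): when scoring "dog" in
# "the big dog barked", P1W holds big's lemma, P1p its POS tag, and Hw the
# lemma of dog's syntactic head ("barked"); N0W is deliberately zeroed in
# fill_context below, so the model never conditions on the target lemma
# directly.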


unigrams = (
    (Hw,),
    (Hp,),
    (Hw, Hp),
    (Hc, Hp),
    (Hc6, Hp),
    (Hc4, Hp),
    (Hc,),

    (P2W,),
    (P2p,),
    (P2W, P2p),
    (P2c, P2p),
    (P2c6, P2p),
    (P2c4, P2p),
    (P2c,),

    (P1W,),
    (P1p,),
    (P1W, P1p),
    (P1c, P1p),
    (P1c6, P1p),
    (P1c4, P1p),
    (P1c,),

    (P1W,),
    (P1p,),
    (P1W, P1p),
    (P1c, P1p),
    (P1c6, P1p),
    (P1c4, P1p),
    (P1c,),

    (N0p,),
    (N0c, N0p),
    (N0c6, N0p),
    (N0c4, N0p),
    (N0c,),

    (N0p,),
    (N0c, N0p),
    (N0c6, N0p),
    (N0c4, N0p),
    (N0c,),

    (N1p,),
    (N1W, N1p),
    (N1c, N1p),
    (N1c6, N1p),
    (N1c4, N1p),
    (N1c,),

    (N1W,),
    (N1p,),
    (N1W, N1p),
    (N1c, N1p),
    (N1c6, N1p),
    (N1c4, N1p),
    (N1c,),

    (N2p,),
    (N2W, N2p),
    (N2c, N2p),
    (N2c6, N2p),
    (N2c4, N2p),
    (N2c,),

    (N2W,),
    (N2p,),
    (N2W, N2p),
    (N2c, N2p),
    (N2c6, N2p),
    (N2c4, N2p),
    (N2c,),

    (P1s,),
    (P2s,),
    (P1s, P2s,),
    (P1s, N0p),
    (P1s, P2s, N0c),

    (N3W,),
    (P3W,),
)


bigrams = (
    (P2p, P1p),
    (P2W, N0p),
    (P2c, P1p),

    (P1c, N0p),
    (P1c6, N0p),

    (N0p, N1p,),

    (P2W, P1W),
    (P1W, N1W),
    (N1W, N2W),
)


trigrams = (
    (P1p, N0p, N1p),
    (P2p, P1p,),
    (P2c4, P1c4, N0c4),

    (P1p, N0p, N1p),
    (P1p, N0p,),
    (P1c4, N0c4, N1c4),

    (N0p, N1p, N2p),
    (N0p, N1p,),
    (N0c4, N1c4, N2c4),

    (P1W, N0p, N0W),
)
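
# How the templates are consumed (a sketch, assuming the 2015 thinc Extractor
# API used below): each tuple names context slots, and Extractor.set_feats
# conjoins the atoms in those slots into one hashed Feature per template, so
# (P1p, N0p) yields a single feature keyed on the (previous tag, current tag)
# pair, with one weight per sense class in the LinearModel.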


cdef int fill_token(atom_t* ctxt, const TokenC* token) except -1:
    ctxt[0] = token.lemma
    ctxt[1] = token.tag
    ctxt[2] = token.lex.cluster
    # Coarser cluster features: keep 6 and 4 bits of the Brown cluster,
    # matching the c6/c4 slot names
    ctxt[3] = token.lex.cluster & 63
    ctxt[4] = token.lex.cluster & 15


cdef int fill_context(atom_t* ctxt, const TokenC* token) except -1:
    # NB: we have padding to keep us safe here
    # See tokens.pyx
    fill_token(&ctxt[P2W], token - 2)
    fill_token(&ctxt[P1W], token - 1)
    fill_token(&ctxt[N0W], token)
    ctxt[N0W] = 0  # Important! Don't condition on this
    fill_token(&ctxt[N1W], token + 1)
    fill_token(&ctxt[N2W], token + 2)
    fill_token(&ctxt[Hw], token + token.head)
    ctxt[P1s] = (token - 1).sense
    ctxt[P2s] = (token - 2).sense
    ctxt[N3W] = (token + 3).lemma
    ctxt[P3W] = (token - 3).lemma
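
# Boundary sketch: the reads above go as far as token - 3 and token + 3, so
# correctness depends on the Tokens buffer being padded by at least three
# entries on each side (the padding noted in tokens.pyx).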


cdef class FeatureVector:
    cdef Pool mem
    cdef Feature* c
    cdef list extractors
    cdef int length
    cdef int _max_length

    def __init__(self, length=100):
        self.mem = Pool()
        self.c = <Feature*>self.mem.alloc(length, sizeof(Feature))
        self.length = 0
        self._max_length = length

    def __len__(self):
        return self.length

    cpdef int add(self, feat_t key, weight_t value) except -1:
        if self.length == self._max_length:
            self._max_length *= 2
            self.c = <Feature*>self.mem.realloc(self.c,
                        self._max_length * sizeof(Feature))
        self.c[self.length] = Feature(i=0, key=key, value=value)
        self.length += 1

    cdef int extend(self, const Feature* new_feats, int n_feats) except -1:
        new_length = self.length + n_feats
        if new_length >= self._max_length:
            self._max_length = 2 * new_length
            # Grow to the recorded capacity, not just to new_length
            self.c = <Feature*>self.mem.realloc(self.c,
                        self._max_length * sizeof(Feature))
        memcpy(&self.c[self.length], new_feats, n_feats * sizeof(Feature))
        self.length += n_feats

    def clear(self):
        self.length = 0
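
# Usage sketch (hypothetical values): the buffer doubles on demand, so add()
# is amortised O(1), and clear() resets the length so the same allocation can
# be reused across tokens:
#     fv = FeatureVector(length=8)
#     fv.add(key=1234, value=1.0)
#     fv.clear()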


cdef class SenseTagger:
    cdef readonly StringStore strings
    cdef readonly LinearModel model
    cdef readonly Extractor extractor
    cdef readonly model_dir
    cdef flags_t[<int>N_UNIV_TAGS] pos_senses
    cdef dict tagdict

    def __init__(self, StringStore strings, model_dir):
        self.model_dir = model_dir
        if path.exists(path.join(model_dir, 'wordnet', 'supersenses.json')):
            self.tagdict = json.load(
                open(path.join(model_dir, 'wordnet', 'supersenses.json')))
        else:
            self.tagdict = {}
        if model_dir is not None and path.isdir(model_dir):
            model_dir = path.join(model_dir, 'wsd')

        templates = unigrams + bigrams + trigrams
        self.extractor = Extractor(templates)
        self.model = LinearModel(N_SENSES, self.extractor.n_templ)
        self.strings = strings

        cdef flags_t all_senses = 0
        cdef flags_t sense = 0
        cdef flags_t one = 1
        for sense in range(1, N_SENSES):
            all_senses |= (one << sense)
        self.pos_senses[<int>parts_of_speech.NO_TAG] = all_senses
        self.pos_senses[<int>parts_of_speech.ADJ] = all_senses
        self.pos_senses[<int>parts_of_speech.ADV] = all_senses
        self.pos_senses[<int>parts_of_speech.ADP] = all_senses
        self.pos_senses[<int>parts_of_speech.CONJ] = 0
        self.pos_senses[<int>parts_of_speech.DET] = 0
        self.pos_senses[<int>parts_of_speech.NUM] = 0
        self.pos_senses[<int>parts_of_speech.PRON] = 0
        self.pos_senses[<int>parts_of_speech.PRT] = all_senses
        self.pos_senses[<int>parts_of_speech.X] = all_senses
        self.pos_senses[<int>parts_of_speech.PUNCT] = 0
        self.pos_senses[<int>parts_of_speech.EOL] = 0

        # Nouns may only take the noun supersenses, verbs the verb supersenses
        for sense in range(N_Tops, V_body):
            self.pos_senses[<int>parts_of_speech.NOUN] |= one << sense
        self.pos_senses[<int>parts_of_speech.VERB] = 0
        for sense in range(V_body, J_ppl):
            self.pos_senses[<int>parts_of_speech.VERB] |= one << sense
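
    # Sense-mask sketch: pos_senses[tag] is a bitmask with bit i set iff
    # sense i is permissible for that coarse tag, so e.g. a NOUN token can
    # only receive bits in [N_Tops, V_body), and closed-class tags get 0,
    # which makes the tagger skip those tokens entirely.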

    def __call__(self, Tokens tokens):
        cdef atom_t[CONTEXT_SIZE] local_context
        cdef int i, n_local_feats, n_doc_feats
        cdef flags_t valid_senses = 0
        cdef TokenC* token
        cdef Feature* feats
        cdef const weight_t* scores
        cdef Pool mem = Pool()
        feats = self.get_doc_feats(mem, tokens, &n_doc_feats)
        for i in range(tokens.length):
            token = &tokens.data[i]
            valid_senses = token.lex.senses & self.pos_senses[<int>token.pos]
            # >= 2 means some bit above NO_SENSE is set
            if valid_senses >= 2:
                fill_context(local_context, token)
                n_local_feats = self.extractor.set_feats(&feats[n_doc_feats],
                                                         local_context)
                # Score the document features and the local features together
                scores = self.model.get_scores(feats, n_doc_feats + n_local_feats)
                self.weight_scores_by_tagdict(<weight_t*><void*>scores, token, 0.0)
                tokens.data[i].sense = self.best_in_set(scores, valid_senses)
            else:
                token.sense = NO_SENSE
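
    # Decoding sketch: prediction is greedy left-to-right, so by the time
    # token i is scored, the P1s/P2s context slots already hold the senses
    # assigned to tokens i-1 and i-2.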

    def train(self, Tokens tokens):
        cdef int i
        cdef TokenC* token
        cdef atom_t[CONTEXT_SIZE] context
        cdef int n_doc_feats, n_local_feats
        cdef int guess, best
        cdef int cost = 0
        cdef flags_t pos_senses, lex_senses
        cdef Feature* feats
        cdef const weight_t* scores
        cdef Pool mem = Pool()
        feats = self.get_doc_feats(mem, tokens, &n_doc_feats)
        for i in range(tokens.length):
            token = &tokens.data[i]
            pos_senses = self.pos_senses[<int>token.pos]
            lex_senses = token.lex.senses & pos_senses
            if lex_senses >= 2:
                fill_context(context, token)
                n_local_feats = self.extractor.set_feats(&feats[n_doc_feats], context)
                scores = self.model.get_scores(feats, n_doc_feats + n_local_feats)
                guess = self.best_in_set(scores, pos_senses)
                best = self.best_in_set(scores, lex_senses)
                update = self._make_update(feats, n_doc_feats + n_local_feats,
                                           guess, best)
                self.model.update(update)
                token.sense = best
                cost += guess != best
            else:
                token.sense = NO_SENSE
        return cost
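
    # Update sketch: this is a perceptron-style step. `guess` is the argmax
    # over everything the POS tag allows; `best` is the argmax over senses the
    # lexicon also licenses, standing in for the gold label. When they differ,
    # each feature's weight is raised for `best` and lowered for `guess`.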

    cdef dict _make_update(self, const Feature* feats, int n_feats, int guess, int best):
        cdef int j
        cdef feat_t f_key
        cdef int f_i
        guess_counts = {}
        gold_counts = {}
        if guess != best:
            for j in range(n_feats):
                f_key = feats[j].key
                f_i = feats[j].i
                feat = (f_i, f_key)
                gold_counts[feat] = gold_counts.get(feat, 0) + 1.0
                guess_counts[feat] = guess_counts.get(feat, 0) - 1.0
        return {guess: guess_counts, best: gold_counts}
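
    # Shape of the returned update (hypothetical numbers): a dict mapping each
    # class to {(template_index, feature_key): delta}, e.g.
    #     {guess: {(0, 0x9f3a): -1.0}, best: {(0, 0x9f3a): +1.0}}
    # which is what self.model.update() consumes above.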

    cdef Feature* get_doc_feats(self, Pool mem, Tokens tokens, int* n_feats) except NULL:
        # Get features for the document.
        # Start with activation strengths for each supersense.
        n_feats[0] = N_SENSES
        feats = <Feature*>mem.alloc(n_feats[0] + self.extractor.n_templ + 1,
                                    sizeof(Feature))
        cdef int i, ssense
        for ssense in range(N_SENSES):
            feats[ssense] = Feature(i=0, key=ssense, value=0)
        cdef flags_t pos_senses
        cdef flags_t one = 1
        for i in range(tokens.length):
            # The tag dict is keyed by lemma strings, so resolve the ID first
            lemma = self.strings[tokens.data[i].lemma]
            sense_probs = self.tagdict.get(lemma, {})
            pos_senses = self.pos_senses[<int>tokens.data[i].pos]
            for ssense_str, prob in sense_probs.items():
                # Tag-dict keys are 0-based strings; senses are 1-based here
                ssense = int(ssense_str) + 1
                if pos_senses & (one << <flags_t>ssense):
                    feats[ssense].value += prob
        return feats
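
    # Tag-dict sketch (hypothetical entry): supersenses.json is assumed to map
    # a lemma to a distribution over 0-based supersense indices, e.g.
    #     {"player": {"17": 0.9, "4": 0.1}}
    # so the document features accumulate, per sense, the summed prior
    # probability over all tokens in the document.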

    cdef int best_in_set(self, const weight_t* scores, flags_t senses) except -1:
        cdef weight_t max_ = 0
        cdef int argmax = -1
        cdef flags_t i
        cdef flags_t one = 1
        for i in range(N_SENSES):
            if (senses & (one << i)) and (argmax == -1 or scores[i] > max_):
                max_ = scores[i]
                argmax = i
        assert argmax >= 0
        return argmax
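
    # E.g. (hypothetical values) with scores [0.1, 0.7, 0.2, 0.9] and mask
    # 0b0110, only senses 1 and 2 compete, so best_in_set returns 1.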

    cdef int weight_scores_by_tagdict(self, weight_t* scores, const TokenC* token,
                                      weight_t a) except -1:
        lemma = self.strings[token.lemma]
        # First softmax the scores
        softmax(scores, N_SENSES)
        probs = self.tagdict.get(lemma, {})
        for i in range(1, N_SENSES):
            prob = probs.get(unicode(i-1), 0)
            scores[i] = (a * prob) + ((1 - a) * scores[i])
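
    # Interpolation: scores[i] = a * P_dict(i) + (1 - a) * P_model(i). The
    # caller currently passes a = 0.0, which keeps the softmaxed model
    # distribution unchanged; a > 0 would shift mass toward the tag dictionary.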

    def end_training(self):
        self.model.end_training()
        self.model.dump(path.join(self.model_dir, 'model'), freq_thresh=0)


@cython.cdivision(True)
cdef void softmax(weight_t* scores, int n_classes) nogil:
    cdef int i
    cdef double total = 0
    for i in range(n_classes):
        total += exp(scores[i])
    for i in range(n_classes):
        scores[i] = <weight_t>(exp(scores[i]) / total)
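
# Note: a numerically safer variant would subtract max(scores) before exp(),
# but for the small score ranges of a linear model this plain form typically
# suffices.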


cdef list _set_bits(flags_t flags):
    bits = []
    cdef flags_t bit
    cdef flags_t one = 1
    for bit in range(N_SENSES):
        if flags & (one << bit):
            bits.append(bit)
    return bits