# spaCy/spacy/sense_tagger.pyx

from libc.string cimport memcpy
from libc.math cimport exp
from cymem.cymem cimport Pool
from thinc.learner cimport LinearModel
from thinc.features cimport Extractor, Feature
from thinc.typedefs cimport atom_t, weight_t, feat_t
cimport cython
from .typedefs cimport flags_t
from .structs cimport TokenC
from .strings cimport StringStore
from .tokens cimport Tokens
from .senses cimport N_SENSES, encode_sense_strs
from .senses cimport NO_SENSE, N_Tops, J_all, J_pert, A_all, J_ppl, V_body
from .gold cimport GoldParse
from .parts_of_speech cimport NOUN, VERB, ADV, ADJ, N_UNIV_TAGS
from . cimport parts_of_speech
from os import path
import json
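
# Context-window slot layout: five atoms per token — lemma (W), fine POS tag
# (p), Brown cluster (c) and its 6-bit (c6) and 4-bit (c4) prefixes — for the
# two tokens either side of the focus word (P2, P1, N1, N2) and the focus
# word itself (N0), plus the predicted senses of the two previous tokens
# (P1s, P2s).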
cdef enum:
    P2W
    P2p
    P2c
    P2c6
    P2c4

    P1W
    P1p
    P1c
    P1c6
    P1c4

    N0W
    N0p
    N0c
    N0c6
    N0c4

    N1W
    N1p
    N1c
    N1c6
    N1c4

    N2W
    N2p
    N2c
    N2c6
    N2c4

    P1s
    P2s

    CONTEXT_SIZE
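
# Feature templates over the context atoms. Each tuple is one template: the
# thinc Extractor conjoins the named atoms' values into a single hashed
# feature.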
unigrams = (
    (P2W,),
    (P2p,),
    (P2W, P2p),
    (P2c, P2p),
    (P2c6, P2p),
    (P2c4, P2p),
    (P2c,),

    (P1W,),
    (P1p,),
    (P1W, P1p),
    (P1c, P1p),
    (P1c6, P1p),
    (P1c4, P1p),
    (P1c,),

    (N0p,),
    (N0c, N0p),
    (N0c6, N0p),
    (N0c4, N0p),
    (N0c,),

    (N1W,),
    (N1p,),
    (N1W, N1p),
    (N1c, N1p),
    (N1c6, N1p),
    (N1c4, N1p),
    (N1c,),

    (N2W,),
    (N2p,),
    (N2W, N2p),
    (N2c, N2p),
    (N2c6, N2p),
    (N2c4, N2p),
    (N2c,),

    (P1s,),
    (P2s,),
    (P1s, P2s,),
    (P1s, N0p),
    (P1s, P2s, N0c),
)

bigrams = (
    (P2p, P1p),
    (P2W, N0p),
    (P2c, P1p),
    (P1c, N0p),
    (P1c6, N0p),
    (N0p, N1p,),
)

trigrams = (
    (P1p, N0p, N1p),
    (P2p, P1p,),
    (P2c4, P1c4, N0c4),
    (P1p, N0p,),
    (P1c4, N0c4, N1c4),
    (N0p, N1p, N2p),
    (N0p, N1p,),
    (N0c4, N1c4, N2c4),
)
cdef int fill_token(atom_t* ctxt, const TokenC* token) except -1:
    ctxt[0] = token.lemma
    ctxt[1] = token.tag
    ctxt[2] = token.lex.cluster
    ctxt[3] = token.lex.cluster & 63  # 6-bit cluster prefix (the c6 slot)
    ctxt[4] = token.lex.cluster & 15  # 4-bit cluster prefix (the c4 slot)

cdef int fill_context(atom_t* ctxt, const TokenC* token) except -1:
    # NB: we have padding to keep us safe here
    # See tokens.pyx
    fill_token(&ctxt[P2W], token - 2)
    fill_token(&ctxt[P1W], token - 1)
    fill_token(&ctxt[N0W], token)
    ctxt[N0W] = 0  # Important! Don't condition on this
    fill_token(&ctxt[N1W], token + 1)
    fill_token(&ctxt[N2W], token + 2)
    ctxt[P1s] = (token - 1).sense
    ctxt[P2s] = (token - 2).sense
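
# A small growable array of thinc Feature structs, reused across tokens so we
# don't reallocate for every prediction.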
cdef class FeatureVector:
    cdef Pool mem
    cdef Feature* c
    cdef list extractors
    cdef int length
    cdef int _max_length

    def __init__(self, length=100):
        self.mem = Pool()
        self.c = <Feature*>self.mem.alloc(length, sizeof(Feature))
        self.length = 0
        self._max_length = length

    def __len__(self):
        return self.length

    cpdef int add(self, feat_t key, weight_t value) except -1:
        if self.length == self._max_length:
            self._max_length *= 2
            self.c = <Feature*>self.mem.realloc(self.c, self._max_length * sizeof(Feature))
        self.c[self.length] = Feature(i=0, key=key, value=value)
        self.length += 1

    cdef int extend(self, const Feature* new_feats, int n_feats) except -1:
        new_length = self.length + n_feats
        if new_length >= self._max_length:
            # Grow to the recorded capacity, not just to new_length, so later
            # add() calls can't write past the end of the buffer.
            self._max_length = 2 * new_length
            self.c = <Feature*>self.mem.realloc(self.c, self._max_length * sizeof(Feature))
        memcpy(&self.c[self.length], new_feats, n_feats * sizeof(Feature))
        self.length += n_feats

    def clear(self):
        self.length = 0

cdef class SenseTagger:
    cdef readonly StringStore strings
    cdef readonly LinearModel model
    cdef readonly Extractor extractor
    cdef readonly model_dir
    cdef readonly flags_t[<int>N_UNIV_TAGS] pos_senses
    cdef dict tagdict

    def __init__(self, StringStore strings, model_dir):
        if model_dir is not None and path.isdir(model_dir):
            model_dir = path.join(model_dir, 'wsd')
        self.model_dir = model_dir
        if path.exists(path.join(model_dir, 'supersenses.json')):
            self.tagdict = json.load(open(path.join(model_dir, 'supersenses.json')))
        else:
            self.tagdict = {}

        templates = unigrams + bigrams + trigrams
        self.extractor = Extractor(templates)
        self.model = LinearModel(N_SENSES, self.extractor.n_templ)
        model_loc = path.join(self.model_dir, 'model')
        if model_loc and path.exists(model_loc):
            self.model.load(model_loc, freq_thresh=0)
        self.strings = strings

        # Build a mask of admissible sense bits for each universal POS tag.
        cdef flags_t all_senses = 0
        cdef flags_t sense = 0
        cdef flags_t one = 1
        for sense in range(1, N_SENSES):
            all_senses |= (one << sense)
        self.pos_senses[<int>parts_of_speech.NO_TAG] = all_senses
        self.pos_senses[<int>parts_of_speech.ADJ] = all_senses
        self.pos_senses[<int>parts_of_speech.ADV] = all_senses
        self.pos_senses[<int>parts_of_speech.ADP] = all_senses
        self.pos_senses[<int>parts_of_speech.CONJ] = 0
        self.pos_senses[<int>parts_of_speech.DET] = 0
        self.pos_senses[<int>parts_of_speech.NUM] = 0
        self.pos_senses[<int>parts_of_speech.PRON] = 0
        self.pos_senses[<int>parts_of_speech.PRT] = all_senses
        self.pos_senses[<int>parts_of_speech.X] = all_senses
        self.pos_senses[<int>parts_of_speech.PUNCT] = 0
        self.pos_senses[<int>parts_of_speech.EOL] = 0

        # Noun supersenses occupy bits [N_Tops, V_body); verb supersenses
        # occupy bits [V_body, J_ppl).
        for sense in range(N_Tops, V_body):
            self.pos_senses[<int>parts_of_speech.NOUN] |= one << sense
        self.pos_senses[<int>parts_of_speech.VERB] = 0
        for sense in range(V_body, J_ppl):
            self.pos_senses[<int>parts_of_speech.VERB] |= one << sense

    def __call__(self, Tokens tokens):
        cdef atom_t[CONTEXT_SIZE] local_context
        cdef int i, guess, n_feats
        cdef flags_t valid_senses = 0
        cdef TokenC* token
        cdef flags_t one = 1
        cdef FeatureVector features = FeatureVector(100)
        for i in range(tokens.length):
            token = &tokens.data[i]
            valid_senses = token.lex.senses & self.pos_senses[<int>token.pos]
            # Sense bits start at bit 1, so any valid sense gives a value >= 2.
            if valid_senses >= 2:
                fill_context(local_context, token)
                local_feats = self.extractor.get_feats(local_context, &n_feats)
                features.extend(local_feats, n_feats)
                scores = self.model.get_scores(features.c, features.length)
                self.weight_scores_by_tagdict(<weight_t*><void*>scores, token, 0.9)
                tokens.data[i].sense = self.best_in_set(scores, valid_senses)
                features.clear()
            else:
                token.sense = NO_SENSE

    def train(self, Tokens tokens):
        cdef int i
        cdef TokenC* token
        cdef atom_t[CONTEXT_SIZE] context
        cdef int n_feats
        cdef int cost = 0
        for i in range(tokens.length):
            token = &tokens.data[i]
            pos_senses = self.pos_senses[<int>token.pos]
            lex_senses = token.lex.senses & pos_senses
            if lex_senses >= 2:
                fill_context(context, token)
                feats = self.extractor.get_feats(context, &n_feats)
                scores = self.model.get_scores(feats, n_feats)
                # Predict over all POS-compatible senses, and take the
                # highest-scoring lexically-possible sense as the oracle.
                guess = self.best_in_set(scores, pos_senses)
                best = self.best_in_set(scores, lex_senses)
                update = self._make_update(feats, n_feats, guess, best)
                self.model.update(update)
                token.sense = best
                cost += guess != best
            else:
                token.sense = NO_SENSE
        return cost

    cdef dict _make_update(self, const Feature* feats, int n_feats, int guess, int best):
        cdef int j
        cdef feat_t f_key
        cdef int f_i
        # Perceptron update: reward the oracle class's features, penalise the
        # guessed class's.
        guess_counts = {}
        gold_counts = {}
        if guess != best:
            for j in range(n_feats):
                f_key = feats[j].key
                f_i = feats[j].i
                feat = (f_i, f_key)
                gold_counts[feat] = gold_counts.get(feat, 0) + 1.0
                guess_counts[feat] = guess_counts.get(feat, 0) - 1.0
        return {guess: guess_counts, best: gold_counts}

    cdef int best_in_set(self, const weight_t* scores, flags_t senses) except -1:
        cdef weight_t max_ = 0
        cdef int argmax = -1
        cdef flags_t i
        cdef flags_t one = 1
        for i in range(N_SENSES):
            if (senses & (one << i)) and (argmax == -1 or scores[i] > max_):
                max_ = scores[i]
                argmax = i
        assert argmax >= 0
        return argmax
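
    # Interpolate the model's softmaxed scores with the lemma's sense
    # distribution from the tag dictionary:
    #     score[i] = a * P(sense|lemma) + (1 - a) * P(sense|context)
    # __call__ passes a = 0.9.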
    @cython.cdivision(True)
    cdef int weight_scores_by_tagdict(self, weight_t* scores, const TokenC* token,
                                      weight_t a) except -1:
        lemma = self.strings[token.lemma]
        # First softmax the scores
        cdef int i
        cdef double total = 0
        for i in range(N_SENSES):
            total += exp(scores[i])
        for i in range(N_SENSES):
            scores[i] = <weight_t>(exp(scores[i]) / total)

        probs = self.tagdict.get(lemma, {})
        for i in range(1, N_SENSES):
            prob = probs.get(unicode(i-1), 0)
            scores[i] = (a * prob) + ((1 - a) * scores[i])

    def end_training(self):
        self.model.end_training()
        self.model.dump(path.join(self.model_dir, 'model'), freq_thresh=0)

cdef list _set_bits(flags_t flags):
    bits = []
    cdef flags_t bit
    cdef flags_t one = 1
    for bit in range(N_SENSES):
        if flags & (one << bit):
            bits.append(bit)
    return bits
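
# Typical usage (a hypothetical sketch; assumes a populated StringStore and a
# model directory produced by train()/end_training() — names are illustrative):
#
#     tagger = SenseTagger(strings, model_dir)
#     cost = tagger.train(tokens)    # one perceptron pass; returns error count
#     tagger.end_training()          # average weights and dump the model
#     tagger(tokens)                 # sets token.sense in-place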