# spaCy/spacy/sense_tagger.pyx  (Cython, ~280 lines, 6.1 KiB)
from .typedefs cimport flags_t
from .structs cimport TokenC
from .strings cimport StringStore
from .tokens cimport Tokens
from .senses cimport POS_SENSES, N_SENSES, encode_sense_strs
from .gold cimport GoldParse
from .parts_of_speech cimport NOUN, VERB
from thinc.learner cimport LinearModel
from thinc.features cimport Extractor
from thinc.typedefs cimport atom_t, weight_t, feat_t
from os import path
cdef enum:
    # Indices into the atom_t feature-context array.
    # The window holds five token slots — P2, P1 (two tokens of left
    # context), N0 (the focus token), N1, N2 (right context) — and each
    # slot occupies seven consecutive cells, filled by fill_token():
    #   W  = lemma
    #   p  = fine-grained tag
    #   c  = full Brown cluster id
    #   c6 = cluster masked to 6 bits, c4 = cluster masked to 4 bits
    #        (NOTE(review): fill_token writes `& 15` into the c6 cell and
    #        `& 63` into the c4 cell — the masks look swapped; verify)
    #   ss = sense flags filtered by POS
    #   s  = currently assigned sense
    # The ordering here is load-bearing: fill_token() writes cells 0..6
    # relative to each slot's W index, so do not reorder members.
    P2W
    P2p
    P2c
    P2c6
    P2c4
    P2ss
    P2s

    P1W
    P1p
    P1c
    P1c6
    P1c4
    P1ss
    P1s

    N0W
    N0p
    N0c
    N0c6
    N0c4
    N0ss
    N0s

    N1W
    N1p
    N1c
    N1c6
    N1c4
    N1ss
    N1s

    N2W
    N2p
    N2c
    N2c6
    N2c4
    N2ss
    N2s

    # Total number of atoms in the context array (value = 35, since enum
    # members auto-increment from 0).
    CONTEXT_SIZE
# Single-slot feature templates: each tuple names the context atoms that
# are hashed together into one feature.
# NOTE(review): several runs below are duplicated verbatim (the P1 group
# appears twice, as do parts of the N0/N1/N2 groups), and the P2 group
# ends with (P1s,) where (P2s,) seems intended — these look like
# copy-paste slips. Duplicates only add redundant (identical) features,
# but pruning them would change the template count and invalidate any
# previously trained model, so they are kept as-is; verify before fixing.
unigrams = (
    (P2W,),
    (P2p,),
    (P2W, P2p),
    (P2c, P2p),
    (P2c6, P2p),
    (P2c4, P2p),
    (P2c,),
    (P2ss,),
    (P1s,),
    (P1W,),
    (P1p,),
    (P1W, P1p),
    (P1c, P1p),
    (P1c6, P1p),
    (P1c4, P1p),
    (P1c,),
    (P1W,),
    (P1p,),
    (P1W, P1p),
    (P1c, P1p),
    (P1c6, P1p),
    (P1c4, P1p),
    (P1c,),
    (P1ss,),
    (P1s,),
    (N0p,),
    (N0W, N0p),
    (N0c, N0p),
    (N0c6, N0p),
    (N0c4, N0p),
    (N0c,),
    (N0W,),
    (N0p,),
    (N0W, N0p),
    (N0c, N0p),
    (N0c6, N0p),
    (N0c4, N0p),
    (N0c,),
    (N0ss,),
    (N1p,),
    (N1W, N1p),
    (N1c, N1p),
    (N1c6, N1p),
    (N1c4, N1p),
    (N1c,),
    (N1W,),
    (N1p,),
    (N1W, N1p),
    (N1c, N1p),
    (N1c6, N1p),
    (N1c4, N1p),
    (N1c,),
    (N1ss,),
    (N2p,),
    (N2W, N2p),
    (N2c, N2p),
    (N2c6, N2p),
    (N2c4, N2p),
    (N2c,),
    (N2W,),
    (N2p,),
    (N2W, N2p),
    (N2c, N2p),
    (N2c6, N2p),
    (N2c4, N2p),
    (N2c,),
    (N2ss,),
)
# Two-slot conjunction templates, pairing atoms from adjacent (or
# near-adjacent) window positions.
bigrams = (
    (P2p, P1p),
    (P2W, N0p),
    (P2c, P1p),
    (P1c, N0p),
    (P1c6, N0p),
    (P2s, P1s),
    (P2ss, P1s,),
    (P2ss, P1ss,),
    (P1ss, N0ss),
    (N0p, N1p,),
)
# Three-slot conjunction templates spanning the window.
# NOTE(review): (P1p, N0p, N1p) appears twice (first and fourth entry) —
# likely a copy-paste slip; the duplicate is redundant but harmless.
trigrams = (
    (P1p, N0p, N1p),
    (P2p, P1p, N0ss),
    (P2c4, P1c4, N0c4),
    (P1p, N0p, N1p),
    (P1p, N0p, N1ss),
    (P1c4, N0c4, N1c4),
    (N0p, N1p, N2p),
    (N0p, N1p, N2ss),
    (N0c4, N1c4, N2c4),
)
cdef int fill_token(atom_t* ctxt, const TokenC* token) except -1:
    """Write one token's seven feature atoms into consecutive cells.

    The cells follow the enum slot layout (W, p, c, c6, c4, ss, s), so
    `ctxt` must point at the slot's W cell.
    """
    ctxt[0] = token.lemma
    ctxt[1] = token.tag
    ctxt[2] = token.lex.cluster
    # BUG FIX: the cluster masks were swapped relative to the slot names —
    # cell 3 (the *c6* atom) was written with `& 15` (4 bits) and cell 4
    # (the *c4* atom) with `& 63` (6 bits). The masks now match the names:
    # 6-bit value into c6, 4-bit value into c4.
    ctxt[3] = token.lex.cluster & 63
    ctxt[4] = token.lex.cluster & 15
    # Sense flags restricted to those licensed by the token's POS.
    ctxt[5] = token.lex.senses & POS_SENSES[<int>token.pos]
    ctxt[6] = token.sense
cdef int fill_context(atom_t* ctxt, const TokenC* token) except -1:
    """Fill the full CONTEXT_SIZE atom array from the five-token window
    centred on `token` (P2, P1, N0, N1, N2)."""
    # NB: we have padding to keep us safe here, so token-2 / token+2 are
    # valid pointers even at the sequence boundaries.
    # See tokens.pyx
    fill_token(&ctxt[P2W], token - 2)
    fill_token(&ctxt[P1W], token - 1)
    fill_token(&ctxt[N0W], token)
    # Zero out the focus token's lemma atom — per the original note, the
    # model must not condition on the word being disambiguated itself.
    ctxt[N0W] = 0 # Important! Don't condition on this
    fill_token(&ctxt[N1W], token + 1)
    fill_token(&ctxt[N2W], token + 2)
cdef class SenseTagger:
    """Linear-model word-sense tagger for NOUN and VERB tokens.

    Scores candidate senses with a thinc LinearModel over local context
    features, and assigns the best-scoring sense that is licensed by the
    token's part of speech.
    """
    cdef readonly StringStore strings
    cdef readonly LinearModel model
    cdef readonly Extractor extractor
    cdef readonly model_dir

    def __init__(self, StringStore strings, model_dir):
        # A directory path is normalised to the 'model' file inside it;
        # a direct file path (or None) is used as given.
        if model_dir is not None and path.isdir(model_dir):
            model_dir = path.join(model_dir, 'model')
        templates = unigrams + bigrams + trigrams
        self.extractor = Extractor(templates)
        self.model = LinearModel(N_SENSES, self.extractor.n_templ)
        self.model_dir = model_dir
        # Load pre-trained weights if a model file exists.
        if self.model_dir and path.exists(self.model_dir):
            self.model.load(self.model_dir, freq_thresh=0)
        self.strings = strings

    def __call__(self, Tokens tokens):
        """Assign `sense` in place for every NOUN/VERB token in `tokens`."""
        cdef atom_t[CONTEXT_SIZE] context
        cdef int i, guess, n_feats
        cdef const TokenC* token
        for i in range(tokens.length):
            token = &tokens.data[i]
            if token.pos in (NOUN, VERB):
                fill_context(context, token)
                feats = self.extractor.get_feats(context, &n_feats)
                scores = self.model.get_scores(feats, n_feats)
                # Restrict the argmax to senses licensed by the POS.
                tokens.data[i].sense = self.best_in_set(scores, POS_SENSES[<int>token.pos])

    def train(self, Tokens tokens, GoldParse gold):
        """One training pass over `tokens` against `gold` supersenses.

        NOTE(review): the perceptron update below is commented out, so
        this method currently only predicts and fills gold.c.ssenses;
        `cost` is never incremented and is always returned as 0 — confirm
        whether this is work-in-progress or intentional.
        """
        cdef int i, j
        # Encode gold supersense strings into bit flags; where no gold
        # annotation exists, fall back to all POS-compatible senses.
        for i, ssenses in enumerate(gold.ssenses):
            if ssenses:
                gold.c.ssenses[i] = encode_sense_strs(ssenses)
            else:
                gold.c.ssenses[i] = pos_senses(&tokens.data[i])

        cdef atom_t[CONTEXT_SIZE] context
        cdef int n_feats
        cdef feat_t f_key
        cdef int f_i
        cdef int cost = 0
        for i in range(tokens.length):
            # NOTE(review): `token` has no cdef declaration here (unlike
            # __call__) — relies on Cython local type inference for the
            # TokenC* assignment below; verify it compiles as intended.
            token = &tokens.data[i]
            # Only train on tokens that are ambiguous (>= 2 sense bits)
            # in both the lexicon and the gold annotation.
            if token.pos in (NOUN, VERB) \
            and token.lex.senses >= 2 \
            and gold.c.ssenses[i] >= 2:
                fill_context(context, token)
                feats = self.extractor.get_feats(context, &n_feats)
                scores = self.model.get_scores(feats, n_feats)
                token.sense = self.best_in_set(scores, POS_SENSES[<int>token.pos])
                # Best sense among those the gold annotation allows.
                best = self.best_in_set(scores, gold.c.ssenses[i])
                guess_counts = {}
                gold_counts = {}
                if token.sense != best:
                    # Classic perceptron counts: +1 toward gold, -1 away
                    # from the guess, for every active feature.
                    for j in range(n_feats):
                        f_key = feats[j].key
                        f_i = feats[j].i
                        feat = (f_i, f_key)
                        gold_counts[feat] = gold_counts.get(feat, 0) + 1.0
                        guess_counts[feat] = guess_counts.get(feat, 0) - 1.0
                # NOTE(review): the actual weight update is disabled:
                #self.model.update({token.sense: guess_counts, best: gold_counts})
        return cost

    cdef int best_in_set(self, const weight_t* scores, flags_t senses) except -1:
        """Return the index of the highest-scoring sense whose bit is set
        in `senses`. Asserts if `senses` has no bits set below N_SENSES."""
        cdef weight_t max_ = 0
        cdef int argmax = -1
        cdef flags_t i
        for i in range(N_SENSES):
            # `argmax == -1` guard makes the first eligible sense win even
            # when its score is <= 0.
            if (senses & (1 << i)) and (argmax == -1 or scores[i] > max_):
                max_ = scores[i]
                argmax = i
        assert argmax >= 0
        return argmax
cdef flags_t pos_senses(const TokenC* token) nogil:
    # Restrict the token's lexical sense bits to those that are valid
    # for its part of speech.
    return POS_SENSES[<int>token.pos] & token.lex.senses
cdef list _set_bits(flags_t flags):
    """Return the indices of all bits set in *flags*, lowest first,
    considering only the first N_SENSES bit positions."""
    cdef flags_t b
    return [b for b in range(N_SENSES) if flags & (1 << b)]