mirror of
https://github.com/explosion/spaCy.git
synced 2025-01-13 18:56:36 +03:00
* Add document features to sense_tagger.
This commit is contained in:
parent
8f0fe1a4ea
commit
cb628ba352
|
@ -57,6 +57,9 @@ cdef enum:
|
||||||
N2c6
|
N2c6
|
||||||
N2c4
|
N2c4
|
||||||
|
|
||||||
|
N3W
|
||||||
|
P3W
|
||||||
|
|
||||||
P1s
|
P1s
|
||||||
P2s
|
P2s
|
||||||
|
|
||||||
|
@ -131,6 +134,9 @@ unigrams = (
|
||||||
(P1s, P2s,),
|
(P1s, P2s,),
|
||||||
(P1s, N0p),
|
(P1s, N0p),
|
||||||
(P1s, P2s, N0c),
|
(P1s, P2s, N0c),
|
||||||
|
|
||||||
|
(N3W,),
|
||||||
|
(P3W,),
|
||||||
)
|
)
|
||||||
|
|
||||||
|
|
||||||
|
@ -142,6 +148,9 @@ bigrams = (
|
||||||
(P1c6, N0p),
|
(P1c6, N0p),
|
||||||
|
|
||||||
(N0p, N1p,),
|
(N0p, N1p,),
|
||||||
|
(P2W, P1W),
|
||||||
|
(P1W, N1W),
|
||||||
|
(N1W, N2W),
|
||||||
)
|
)
|
||||||
|
|
||||||
|
|
||||||
|
@ -157,6 +166,7 @@ trigrams = (
|
||||||
(N0p, N1p, N2p),
|
(N0p, N1p, N2p),
|
||||||
(N0p, N1p,),
|
(N0p, N1p,),
|
||||||
(N0c4, N1c4, N2c4),
|
(N0c4, N1c4, N2c4),
|
||||||
|
(P1W, N0p, N0W),
|
||||||
)
|
)
|
||||||
|
|
||||||
|
|
||||||
|
@ -181,6 +191,8 @@ cdef int fill_context(atom_t* ctxt, const TokenC* token) except -1:
|
||||||
fill_token(&ctxt[N2W], token + 2)
|
fill_token(&ctxt[N2W], token + 2)
|
||||||
ctxt[P1s] = (token - 1).sense
|
ctxt[P1s] = (token - 1).sense
|
||||||
ctxt[P2s] = (token - 2).sense
|
ctxt[P2s] = (token - 2).sense
|
||||||
|
ctxt[N3W] = (token + 3).lemma
|
||||||
|
ctxt[P3W] = (token - 3).lemma
|
||||||
|
|
||||||
|
|
||||||
cdef class FeatureVector:
|
cdef class FeatureVector:
|
||||||
|
@ -228,22 +240,19 @@ cdef class SenseTagger:
|
||||||
cdef dict tagdict
|
cdef dict tagdict
|
||||||
|
|
||||||
def __init__(self, StringStore strings, model_dir):
|
def __init__(self, StringStore strings, model_dir):
|
||||||
if model_dir is not None and path.isdir(model_dir):
|
|
||||||
model_dir = path.join(model_dir, 'wsd')
|
|
||||||
|
|
||||||
self.model_dir = model_dir
|
self.model_dir = model_dir
|
||||||
if path.exists(path.join(model_dir, 'supersenses.json')):
|
if path.exists(path.join(model_dir, 'wordnet', 'supersenses.json')):
|
||||||
self.tagdict = json.load(open(path.join(model_dir, 'supersenses.json')))
|
self.tagdict = json.load(open(path.join(model_dir, 'wordnet', 'supersenses.json')))
|
||||||
else:
|
else:
|
||||||
self.tagdict = {}
|
self.tagdict = {}
|
||||||
|
|
||||||
|
if model_dir is not None and path.isdir(model_dir):
|
||||||
|
model_dir = path.join(model_dir, 'wsd')
|
||||||
|
|
||||||
templates = unigrams + bigrams + trigrams
|
templates = unigrams + bigrams + trigrams
|
||||||
self.extractor = Extractor(templates)
|
self.extractor = Extractor(templates)
|
||||||
self.model = LinearModel(N_SENSES, self.extractor.n_templ)
|
self.model = LinearModel(N_SENSES, self.extractor.n_templ)
|
||||||
|
|
||||||
model_loc = path.join(self.model_dir, 'model')
|
|
||||||
if model_loc and path.exists(model_loc):
|
|
||||||
self.model.load(model_loc, freq_thresh=0)
|
|
||||||
self.strings = strings
|
self.strings = strings
|
||||||
cdef flags_t all_senses = 0
|
cdef flags_t all_senses = 0
|
||||||
cdef flags_t sense = 0
|
cdef flags_t sense = 0
|
||||||
|
@ -277,18 +286,19 @@ cdef class SenseTagger:
|
||||||
cdef flags_t valid_senses = 0
|
cdef flags_t valid_senses = 0
|
||||||
cdef TokenC* token
|
cdef TokenC* token
|
||||||
cdef flags_t one = 1
|
cdef flags_t one = 1
|
||||||
cdef FeatureVector features = FeatureVector(100)
|
cdef int n_doc_feats
|
||||||
|
cdef Pool mem = Pool()
|
||||||
|
feats = self.get_doc_feats(mem, tokens, &n_doc_feats)
|
||||||
for i in range(tokens.length):
|
for i in range(tokens.length):
|
||||||
token = &tokens.data[i]
|
token = &tokens.data[i]
|
||||||
valid_senses = token.lex.senses & self.pos_senses[<int>token.pos]
|
valid_senses = token.lex.senses & self.pos_senses[<int>token.pos]
|
||||||
if valid_senses >= 2:
|
if valid_senses >= 2:
|
||||||
fill_context(local_context, token)
|
fill_context(local_context, token)
|
||||||
local_feats = self.extractor.get_feats(local_context, &n_feats)
|
n_local_feats = self.extractor.set_feats(&feats[n_doc_feats],
|
||||||
features.extend(local_feats, n_feats)
|
local_context)
|
||||||
scores = self.model.get_scores(features.c, features.length)
|
scores = self.model.get_scores(feats, n_local_feats)
|
||||||
self.weight_scores_by_tagdict(<weight_t*><void*>scores, token, 0.9)
|
self.weight_scores_by_tagdict(<weight_t*><void*>scores, token, 0.0)
|
||||||
tokens.data[i].sense = self.best_in_set(scores, valid_senses)
|
tokens.data[i].sense = self.best_in_set(scores, valid_senses)
|
||||||
features.clear()
|
|
||||||
else:
|
else:
|
||||||
token.sense = NO_SENSE
|
token.sense = NO_SENSE
|
||||||
|
|
||||||
|
@ -296,22 +306,27 @@ cdef class SenseTagger:
|
||||||
cdef int i, j
|
cdef int i, j
|
||||||
cdef TokenC* token
|
cdef TokenC* token
|
||||||
cdef atom_t[CONTEXT_SIZE] context
|
cdef atom_t[CONTEXT_SIZE] context
|
||||||
cdef int n_feats
|
cdef int n_doc_feats, n_local_feats
|
||||||
cdef feat_t f_key
|
cdef feat_t f_key
|
||||||
cdef flags_t best_senses = 0
|
cdef flags_t best_senses = 0
|
||||||
cdef int f_i
|
cdef int f_i
|
||||||
cdef int cost = 0
|
cdef int cost = 0
|
||||||
|
|
||||||
|
cdef Pool mem = Pool()
|
||||||
|
feats = self.get_doc_feats(mem, tokens, &n_doc_feats)
|
||||||
for i in range(tokens.length):
|
for i in range(tokens.length):
|
||||||
token = &tokens.data[i]
|
token = &tokens.data[i]
|
||||||
pos_senses = self.pos_senses[<int>token.pos]
|
pos_senses = self.pos_senses[<int>token.pos]
|
||||||
lex_senses = token.lex.senses & pos_senses
|
lex_senses = token.lex.senses & pos_senses
|
||||||
if lex_senses >= 2:
|
if lex_senses >= 2:
|
||||||
fill_context(context, token)
|
fill_context(context, token)
|
||||||
feats = self.extractor.get_feats(context, &n_feats)
|
|
||||||
scores = self.model.get_scores(feats, n_feats)
|
n_local_feats = self.extractor.set_feats(&feats[n_doc_feats], context)
|
||||||
guess = self.best_in_set(scores, pos_senses)
|
scores = self.model.get_scores(feats, n_doc_feats + n_local_feats)
|
||||||
best = self.best_in_set(scores, lex_senses)
|
guess = self.best_in_set(scores, pos_senses)
|
||||||
update = self._make_update(feats, n_feats, guess, best)
|
best = self.best_in_set(scores, lex_senses)
|
||||||
|
update = self._make_update(feats, n_doc_feats + n_local_feats,
|
||||||
|
guess, best)
|
||||||
self.model.update(update)
|
self.model.update(update)
|
||||||
token.sense = best
|
token.sense = best
|
||||||
cost += guess != best
|
cost += guess != best
|
||||||
|
@ -319,7 +334,7 @@ cdef class SenseTagger:
|
||||||
token.sense = 1
|
token.sense = 1
|
||||||
return cost
|
return cost
|
||||||
|
|
||||||
cdef dict _make_update(self, const Feature* feats, int n_feats, int guess, int best):
|
cdef dict _perceptron_update(self, const Feature* feats, int n_feats, int guess, int best):
|
||||||
guess_counts = {}
|
guess_counts = {}
|
||||||
gold_counts = {}
|
gold_counts = {}
|
||||||
if guess != best:
|
if guess != best:
|
||||||
|
@ -331,6 +346,26 @@ cdef class SenseTagger:
|
||||||
guess_counts[feat] = guess_counts.get(feat, 0) - 1.0
|
guess_counts[feat] = guess_counts.get(feat, 0) - 1.0
|
||||||
return {guess: guess_counts, best: gold_counts}
|
return {guess: guess_counts, best: gold_counts}
|
||||||
|
|
||||||
|
cdef Feature* get_doc_feats(self, Pool mem, Tokens tokens, int* n_feats) except NULL:
    # Build the document-level feature block: one Feature per supersense
    # slot, whose value accumulates the tag-dictionary probabilities of that
    # supersense over every token in the document.
    #
    # mem:     pool that owns the returned buffer.
    # tokens:  the document to summarise.
    # n_feats: out-parameter, set to the number of document features written
    #          (N_SENSES).
    #
    # Returns a buffer of n_feats[0] + self.extractor.n_templ + 1 Features.
    # Only the first n_feats[0] entries are filled here; the remainder is
    # spare room that callers fill with per-token template features starting
    # at &feats[n_feats[0]].
    cdef Feature* feats
    cdef int i, ssense
    cdef flags_t pos_senses
    cdef flags_t one = 1

    # Start with activation strengths for each supersense
    n_feats[0] = N_SENSES
    feats = <Feature*>mem.alloc(n_feats[0] + self.extractor.n_templ + 1,
                                sizeof(Feature))
    for ssense in range(N_SENSES):
        feats[ssense] = Feature(i=0, key=ssense, value=0)

    for i in range(tokens.length):
        # The tag dictionary is loaded from JSON, so it is keyed by the lemma
        # text; the integer lemma ID has to be decoded through the string
        # store first (as weight_scores_by_tagdict does).
        lemma = self.strings[tokens.data[i].lemma]
        sense_probs = self.tagdict.get(lemma, {})
        pos_senses = self.pos_senses[<int>tokens.data[i].pos]
        for ssense_str, prob in sense_probs.items():
            # Convert the key to an int *before* applying the offset; adding
            # 1 to the string key raises TypeError.
            # NOTE(review): the +1 offset is kept from the original
            # expression -- presumably slot 0 is reserved for "no sense";
            # confirm against the tagdict file format.
            ssense = int(ssense_str) + 1
            # Skip indices outside the allocated block: they would overflow
            # both the bit shift below and the feats[] write.
            if ssense < 1 or ssense >= N_SENSES:
                continue
            if pos_senses & (one << <flags_t>ssense):
                feats[ssense].value += prob
    return feats
|
||||||
|
|
||||||
cdef int best_in_set(self, const weight_t* scores, flags_t senses) except -1:
|
cdef int best_in_set(self, const weight_t* scores, flags_t senses) except -1:
|
||||||
cdef weight_t max_ = 0
|
cdef weight_t max_ = 0
|
||||||
cdef int argmax = -1
|
cdef int argmax = -1
|
||||||
|
@ -343,18 +378,12 @@ cdef class SenseTagger:
|
||||||
assert argmax >= 0
|
assert argmax >= 0
|
||||||
return argmax
|
return argmax
|
||||||
|
|
||||||
@cython.cdivision(True)
|
|
||||||
cdef int weight_scores_by_tagdict(self, weight_t* scores, const TokenC* token,
|
cdef int weight_scores_by_tagdict(self, weight_t* scores, const TokenC* token,
|
||||||
weight_t a) except -1:
|
weight_t a) except -1:
|
||||||
lemma = self.strings[token.lemma]
|
lemma = self.strings[token.lemma]
|
||||||
|
|
||||||
# First softmax the scores
|
# First softmax the scores
|
||||||
cdef int i
|
softmax(scores, N_SENSES)
|
||||||
cdef double total = 0
|
|
||||||
for i in range(N_SENSES):
|
|
||||||
total += exp(scores[i])
|
|
||||||
for i in range(N_SENSES):
|
|
||||||
scores[i] = <weight_t>(exp(scores[i]) / total)
|
|
||||||
|
|
||||||
probs = self.tagdict.get(lemma, {})
|
probs = self.tagdict.get(lemma, {})
|
||||||
for i in range(1, N_SENSES):
|
for i in range(1, N_SENSES):
|
||||||
|
@ -365,6 +394,18 @@ cdef class SenseTagger:
|
||||||
self.model.end_training()
|
self.model.end_training()
|
||||||
self.model.dump(path.join(self.model_dir, 'model'), freq_thresh=0)
|
self.model.dump(path.join(self.model_dir, 'model'), freq_thresh=0)
|
||||||
|
|
||||||
|
|
||||||
|
@cython.cdivision(True)
cdef void softmax(weight_t* scores, int n_classes) nogil:
    # Normalise scores[0:n_classes] in place so the entries are positive and
    # sum to 1 (softmax).
    #
    # scores:    buffer of at least n_classes raw scores; overwritten.
    # n_classes: number of leading entries to normalise. The loops are bounded
    #            by this argument rather than the module-level N_SENSES, so the
    #            function is correct for any buffer length.
    cdef int i
    cdef double total = 0
    cdef double peak

    if n_classes <= 0:
        return

    # Subtract the largest score before exponentiating. This leaves the
    # result mathematically unchanged but keeps exp() from overflowing on
    # large scores. It also guarantees total >= 1 (the peak entry contributes
    # exp(0)), so the division below cannot be by zero.
    peak = scores[0]
    for i in range(1, n_classes):
        if scores[i] > peak:
            peak = scores[i]

    for i in range(n_classes):
        total += exp(scores[i] - peak)
    for i in range(n_classes):
        scores[i] = <weight_t>(exp(scores[i] - peak) / total)
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
cdef list _set_bits(flags_t flags):
|
cdef list _set_bits(flags_t flags):
|
||||||
bits = []
|
bits = []
|
||||||
cdef flags_t bit
|
cdef flags_t bit
|
||||||
|
|
Loading…
Reference in New Issue
Block a user