From cb628ba3521f2a9d197e65a6fbc0034b73ba13f0 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Sun, 5 Jul 2015 21:05:38 +0200 Subject: [PATCH] * Add document features to sense_tagger. --- spacy/sense_tagger.pyx | 97 ++++++++++++++++++++++++++++++------------ 1 file changed, 69 insertions(+), 28 deletions(-) diff --git a/spacy/sense_tagger.pyx b/spacy/sense_tagger.pyx index be44f0e0e..d7591bd73 100644 --- a/spacy/sense_tagger.pyx +++ b/spacy/sense_tagger.pyx @@ -57,6 +57,9 @@ cdef enum: N2c6 N2c4 + N3W + P3W + P1s P2s @@ -131,6 +134,9 @@ unigrams = ( (P1s, P2s,), (P1s, N0p), (P1s, P2s, N0c), + + (N3W,), + (P3W,), ) @@ -142,6 +148,9 @@ bigrams = ( (P1c6, N0p), (N0p, N1p,), + (P2W, P1W), + (P1W, N1W), + (N1W, N2W), ) @@ -157,6 +166,7 @@ trigrams = ( (N0p, N1p, N2p), (N0p, N1p,), (N0c4, N1c4, N2c4), + (P1W, N0p, N0W), ) @@ -181,6 +191,8 @@ cdef int fill_context(atom_t* ctxt, const TokenC* token) except -1: fill_token(&ctxt[N2W], token + 2) ctxt[P1s] = (token - 1).sense ctxt[P2s] = (token - 2).sense + ctxt[N3W] = (token + 3).lemma + ctxt[P3W] = (token - 3).lemma cdef class FeatureVector: @@ -228,22 +240,19 @@ cdef class SenseTagger: cdef dict tagdict def __init__(self, StringStore strings, model_dir): - if model_dir is not None and path.isdir(model_dir): - model_dir = path.join(model_dir, 'wsd') - self.model_dir = model_dir - if path.exists(path.join(model_dir, 'supersenses.json')): - self.tagdict = json.load(open(path.join(model_dir, 'supersenses.json'))) + if path.exists(path.join(model_dir, 'wordnet', 'supersenses.json')): + self.tagdict = json.load(open(path.join(model_dir, 'wordnet', 'supersenses.json'))) else: self.tagdict = {} + if model_dir is not None and path.isdir(model_dir): + model_dir = path.join(model_dir, 'wsd') + templates = unigrams + bigrams + trigrams self.extractor = Extractor(templates) self.model = LinearModel(N_SENSES, self.extractor.n_templ) - model_loc = path.join(self.model_dir, 'model') - if model_loc and path.exists(model_loc): - self.model.load(model_loc, freq_thresh=0) self.strings = strings cdef flags_t all_senses = 0 cdef flags_t sense = 0 @@ -277,18 +286,19 @@ cdef class SenseTagger: cdef flags_t valid_senses = 0 cdef TokenC* token cdef flags_t one = 1 - cdef FeatureVector features = FeatureVector(100) + cdef int n_doc_feats + cdef Pool mem = Pool() + feats = self.get_doc_feats(mem, tokens, &n_doc_feats) for i in range(tokens.length): token = &tokens.data[i] valid_senses = token.lex.senses & self.pos_senses[token.pos] if valid_senses >= 2: fill_context(local_context, token) - local_feats = self.extractor.get_feats(local_context, &n_feats) - features.extend(local_feats, n_feats) - scores = self.model.get_scores(features.c, features.length) - self.weight_scores_by_tagdict(scores, token, 0.9) + n_local_feats = self.extractor.set_feats(&feats[n_doc_feats], + local_context) + scores = self.model.get_scores(feats, n_local_feats) + self.weight_scores_by_tagdict(scores, token, 0.0) tokens.data[i].sense = self.best_in_set(scores, valid_senses) - features.clear() else: token.sense = NO_SENSE @@ -296,22 +306,27 @@ cdef class SenseTagger: cdef int i, j cdef TokenC* token cdef atom_t[CONTEXT_SIZE] context - cdef int n_feats + cdef int n_doc_feats, n_local_feats cdef feat_t f_key cdef flags_t best_senses = 0 cdef int f_i cdef int cost = 0 + + cdef Pool mem = Pool() + feats = self.get_doc_feats(mem, tokens, &n_doc_feats) for i in range(tokens.length): token = &tokens.data[i] pos_senses = self.pos_senses[token.pos] lex_senses = token.lex.senses & pos_senses if lex_senses >= 2: fill_context(context, token) - feats = self.extractor.get_feats(context, &n_feats) - scores = self.model.get_scores(feats, n_feats) - guess = self.best_in_set(scores, pos_senses) - best = self.best_in_set(scores, lex_senses) - update = self._make_update(feats, n_feats, guess, best) + + n_local_feats = self.extractor.set_feats(&feats[n_doc_feats], context) + scores = self.model.get_scores(feats, n_doc_feats + n_local_feats) + guess = self.best_in_set(scores, pos_senses) + best = self.best_in_set(scores, lex_senses) + update = self._make_update(feats, n_doc_feats + n_local_feats, + guess, best) self.model.update(update) token.sense = best cost += guess != best @@ -319,7 +334,7 @@ cdef class SenseTagger: token.sense = 1 return cost - cdef dict _make_update(self, const Feature* feats, int n_feats, int guess, int best): + cdef dict _perceptron_update(self, const Feature* feats, int n_feats, int guess, int best): guess_counts = {} gold_counts = {} if guess != best: @@ -331,6 +346,26 @@ cdef class SenseTagger: guess_counts[feat] = guess_counts.get(feat, 0) - 1.0 return {guess: guess_counts, best: gold_counts} + cdef Feature* get_doc_feats(self, Pool mem, Tokens tokens, int* n_feats) except NULL: + # Get features for the document + # Start with activation strengths for each supersense + n_feats[0] = N_SENSES + feats = mem.alloc(n_feats[0] + self.extractor.n_templ + 1, + sizeof(Feature)) + cdef int i, ssense + for ssense in range(N_SENSES): + feats[ssense] = Feature(i=0, key=ssense, value=0) + cdef flags_t pos_senses + cdef flags_t one = 1 + for i in range(tokens.length): + sense_probs = self.tagdict.get(tokens.data[i].lemma, {}) + pos_senses = self.pos_senses[tokens.data[i].pos] + for ssense_str, prob in sense_probs.items(): + ssense = int(ssense_str + 1) + if pos_senses & (one << ssense): + feats[ssense].value += prob + return feats + cdef int best_in_set(self, const weight_t* scores, flags_t senses) except -1: cdef weight_t max_ = 0 cdef int argmax = -1 @@ -343,18 +378,12 @@ cdef class SenseTagger: assert argmax >= 0 return argmax - @cython.cdivision(True) cdef int weight_scores_by_tagdict(self, weight_t* scores, const TokenC* token, weight_t a) except -1: lemma = self.strings[token.lemma] # First softmax the scores - cdef int i - cdef double total = 0 - for i in range(N_SENSES): - total += exp(scores[i]) - for i in range(N_SENSES): - scores[i] = (exp(scores[i]) / total) + softmax(scores, N_SENSES) probs = self.tagdict.get(lemma, {}) for i in range(1, N_SENSES): @@ -365,6 +394,18 @@ cdef class SenseTagger: self.model.end_training() self.model.dump(path.join(self.model_dir, 'model'), freq_thresh=0) + +@cython.cdivision(True) +cdef void softmax(weight_t* scores, int n_classes) nogil: + cdef int i + cdef double total = 0 + for i in range(N_SENSES): + total += exp(scores[i]) + for i in range(N_SENSES): + scores[i] = (exp(scores[i]) / total) + + + cdef list _set_bits(flags_t flags): bits = [] cdef flags_t bit