diff --git a/spacy/sense_tagger.pyx b/spacy/sense_tagger.pyx
index d86f26a38..c6f26b8f8 100644
--- a/spacy/sense_tagger.pyx
+++ b/spacy/sense_tagger.pyx
@@ -1,12 +1,13 @@
 from libc.string cimport memcpy
+from libc.math cimport exp
+
 from cymem.cymem cimport Pool

 from thinc.learner cimport LinearModel
 from thinc.features cimport Extractor, Feature
 from thinc.typedefs cimport atom_t, weight_t, feat_t
-
-
+cimport cython

 from .typedefs cimport flags_t


@@ -14,13 +15,14 @@ from .structs cimport TokenC
 from .strings cimport StringStore
 from .tokens cimport Tokens

 from .senses cimport N_SENSES, encode_sense_strs
-from .senses cimport NO_SENSE, N_Tops, J_ppl, V_body
+from .senses cimport NO_SENSE, N_Tops, J_all, J_pert, A_all, J_ppl, V_body
 from .gold cimport GoldParse
-from .parts_of_speech cimport NOUN, VERB, N_UNIV_TAGS
+from .parts_of_speech cimport NOUN, VERB, ADV, ADJ, N_UNIV_TAGS
 from . cimport parts_of_speech


 from os import path
+import json


@@ -223,17 +225,25 @@ cdef class SenseTagger:
     cdef readonly Extractor extractor
     cdef readonly model_dir
     cdef readonly flags_t[N_UNIV_TAGS] pos_senses
+    cdef dict tagdict

     def __init__(self, StringStore strings, model_dir):
         if model_dir is not None and path.isdir(model_dir):
-            model_dir = path.join(model_dir, 'model')
+            model_dir = path.join(model_dir, 'wsd')
+
+        self.model_dir = model_dir
+        if path.exists(path.join(model_dir, 'supersenses.json')):
+            self.tagdict = json.load(open(path.join(model_dir, 'supersenses.json')))
+        else:
+            self.tagdict = {}

         templates = unigrams + bigrams + trigrams
         self.extractor = Extractor(templates)
         self.model = LinearModel(N_SENSES, self.extractor.n_templ)
-        self.model_dir = model_dir
-        if self.model_dir and path.exists(self.model_dir):
-            self.model.load(self.model_dir, freq_thresh=0)
+
+        model_loc = path.join(self.model_dir, 'model')
+        if model_loc and path.exists(model_loc):
+            self.model.load(model_loc, freq_thresh=0)

         self.strings = strings
         self.pos_senses[parts_of_speech.NO_TAG] = 0
@@ -252,89 +262,119 @@ cdef class SenseTagger:
         self.pos_senses[parts_of_speech.EOL] = 0

         cdef flags_t sense = 0
+        cdef flags_t one = 1
         for sense in range(N_Tops, V_body):
-            self.pos_senses[parts_of_speech.NOUN] |= 1 << sense
+            self.pos_senses[parts_of_speech.NOUN] |= one << sense

         for sense in range(V_body, J_ppl):
-            self.pos_senses[parts_of_speech.VERB] |= 1 << sense
+            self.pos_senses[parts_of_speech.VERB] |= one << sense
+
+        self.pos_senses[parts_of_speech.ADV] |= one << A_all
+        self.pos_senses[parts_of_speech.ADJ] |= one << J_all
+        self.pos_senses[parts_of_speech.ADJ] |= one << J_pert
+        self.pos_senses[parts_of_speech.ADJ] |= one << J_ppl

     def __call__(self, Tokens tokens):
         cdef atom_t[CONTEXT_SIZE] local_context
         cdef int i, guess, n_feats
         cdef flags_t valid_senses = 0
         cdef TokenC* token
+        cdef flags_t one = 1
         cdef FeatureVector features = FeatureVector(100)
         for i in range(tokens.length):
             token = &tokens.data[i]
-            if token.lex.senses == 1:
-                continue
-            assert not (token.lex.senses & (1 << NO_SENSE)), (tokens[i].orth_, token.lex.senses)
-            assert not (self.pos_senses[token.pos] & (1 << NO_SENSE))
             valid_senses = token.lex.senses & self.pos_senses[token.pos]
-            assert not (valid_senses & (1 << NO_SENSE))
-            if valid_senses:
+            if valid_senses >= 2:
                 fill_context(local_context, token)
                 local_feats = self.extractor.get_feats(local_context, &n_feats)
                 features.extend(local_feats, n_feats)
                 scores = self.model.get_scores(features.c, features.length)
+                self.weight_scores_by_tagdict(scores, token, 1.0)
                 tokens.data[i].sense = self.best_in_set(scores, valid_senses)
                 features.clear()

-    def train(self, Tokens tokens, GoldParse gold):
+    def train(self, Tokens tokens):
         cdef int i, j
         cdef TokenC* token
-        for i, ssenses in enumerate(gold.ssenses):
-            token = &tokens.data[i]
-            if ssenses:
-                gold.c.ssenses[i] = encode_sense_strs(ssenses)
-            elif token.lex.senses >= 2 and token.pos in (NOUN, VERB):
-                gold.c.ssenses[i] = token.lex.senses & self.pos_senses[token.pos]
-            else:
-                gold.c.ssenses[i] = 0
-
         cdef atom_t[CONTEXT_SIZE] context
         cdef int n_feats
         cdef feat_t f_key
+        cdef flags_t best_senses = 0
         cdef int f_i
         cdef int cost = 0
         for i in range(tokens.length):
             token = &tokens.data[i]
-            if token.pos in (NOUN, VERB) \
-                    and token.lex.senses >= 2 \
-                    and gold.c.ssenses[i] >= 2:
+            pos_senses = self.pos_senses[token.pos]
+            lex_senses = token.lex.senses & pos_senses
+            if pos_senses >= 2 and lex_senses >= 2:
                 fill_context(context, token)
                 feats = self.extractor.get_feats(context, &n_feats)
                 scores = self.model.get_scores(feats, n_feats)
-                token.sense = self.best_in_set(scores, self.pos_senses[token.pos])
-                best = self.best_in_set(scores, gold.c.ssenses[i])
+                #self.weight_scores_by_tagdict(scores, token, 0.1)
+                guess = self.best_in_set(scores, pos_senses)
+                best = self.best_in_set(scores, lex_senses)
                 guess_counts = {}
                 gold_counts = {}
-                if token.sense != best:
+                if guess != best:
+                    cost += 1
                     for j in range(n_feats):
                         f_key = feats[j].key
                         f_i = feats[j].i
                         feat = (f_i, f_key)
                         gold_counts[feat] = gold_counts.get(feat, 0) + 1.0
                         guess_counts[feat] = guess_counts.get(feat, 0) - 1.0
-                self.model.update({token.sense: guess_counts, best: gold_counts})
+                self.model.update({guess: guess_counts, best: gold_counts})
         return cost

     cdef int best_in_set(self, const weight_t* scores, flags_t senses) except -1:
         cdef weight_t max_ = 0
         cdef int argmax = -1
         cdef flags_t i
+        cdef flags_t one = 1
         for i in range(N_SENSES):
-            if (senses & (1 << i)) and (argmax == -1 or scores[i] > max_):
+            if (senses & (one << i)) and (argmax == -1 or scores[i] > max_):
                 max_ = scores[i]
                 argmax = i
         assert argmax >= 0
         return argmax

+    @cython.cdivision(True)
+    cdef int weight_scores_by_tagdict(self, weight_t* scores, const TokenC* token,
+                                      weight_t a) except -1:
+        lemma = self.strings[token.lemma]
+        if token.pos == NOUN:
+            key = lemma + '/n'
+        elif token.pos == VERB:
+            key = lemma + '/v'
+        elif token.pos == ADJ:
+            key = lemma + '/j'
+        elif token.pos == ADV:
+            key = lemma + '/a'
+        else:
+            return 0
+
+        # First softmax the scores
+        cdef int i
+        cdef double total = 0
+        for i in range(N_SENSES):
+            total += exp(scores[i])
+        for i in range(N_SENSES):
+            scores[i] = (exp(scores[i]) / total)
+
+        probs = self.tagdict.get(key, {})
+        for i in range(1, N_SENSES):
+            prob = probs.get(str(i-1), 0)
+            scores[i] = (a * prob) + ((1 - a) * scores[i])
+
+    def end_training(self):
+        self.model.end_training()
+        self.model.dump(path.join(self.model_dir, 'model'), freq_thresh=0)

 cdef list _set_bits(flags_t flags):
     bits = []
     cdef flags_t bit
+    cdef flags_t one = 1
     for bit in range(N_SENSES):
-        if flags & (1 << bit):
+        if flags & (one << bit):
             bits.append(bit)
     return bits
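
Note on the recurring `1 << sense` → `one << sense` change (with `cdef flags_t one = 1`): the literal `1` is a C `int`, so the shift is evaluated in 32-bit arithmetic and is undefined once the shift count reaches 31, silently dropping high sense bits from the 64-bit `flags_t` masks. Shifting a `flags_t`-typed `one` keeps the whole computation at 64 bits. A loose Python model of the difference, using ctypes truncation to stand in for the narrow C type (the index 40 is illustrative, and the clean wrap-to-zero is an assumption; real C shift overflow is undefined behaviour, not a defined wrap):

    import ctypes

    sense = 40  # hypothetical sense bit above position 31

    # Shift result forced into a 32-bit int: bit 40 cannot survive.
    assert ctypes.c_int32(1 << sense).value == 0

    # Shift result kept in a 64-bit type, as with `flags_t one = 1`.
    assert ctypes.c_uint64(1 << sense).value == 1 << 40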
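The new `weight_scores_by_tagdict` step softmaxes the model's raw scores into probabilities, then linearly interpolates each real sense with a tag-dictionary prior: `score[i] = a * prior + (1 - a) * softmax(score)[i]`. A minimal pure-Python sketch of the same arithmetic (the standalone function and its names are hypothetical; the real method mutates a C `weight_t*` array in place, after looking the priors up under keys such as `lemma + '/n'`):

    from math import exp

    def interpolate_with_tagdict(scores, priors, a):
        # Softmax the raw scores so they sit on the same [0, 1] scale
        # as the dictionary probabilities.
        total = sum(exp(s) for s in scores)
        scores = [exp(s) / total for s in scores]
        # Slot 0 is NO_SENSE, so real senses are offset by one; the
        # string keys come from the JSON tag dictionary. Senses the
        # dictionary does not list get a prior of 0.
        for i in range(1, len(scores)):
            prior = priors.get(str(i - 1), 0.0)
            scores[i] = a * prior + (1 - a) * scores[i]
        return scores

With `a=1.0`, as called from `__call__`, the prior fully replaces the model probability for every real sense; the commented-out call in `train` uses `a=0.1`, i.e. mostly model score with a small dictionary nudge. Either way, `best_in_set` still restricts the final argmax to the senses licensed by the lexicon and part of speech.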