From 149a901ea75085d3dda557e0eb01d79d5bff8a78 Mon Sep 17 00:00:00 2001
From: Matthew Honnibal
Date: Sun, 5 Jul 2015 10:50:02 +0200
Subject: [PATCH] * Don't use POS tags in supersense dict

---
 bin/init_model.py           | 10 +++-----
 spacy/munge/read_wordnet.py |  4 +--
 spacy/sense_tagger.pyx      | 51 ++++++++++++++-----------------
 3 files changed, 25 insertions(+), 40 deletions(-)

diff --git a/bin/init_model.py b/bin/init_model.py
index b81d455ab..449dd5c02 100644
--- a/bin/init_model.py
+++ b/bin/init_model.py
@@ -82,14 +82,13 @@ def _read_probs(loc):
 
 
 def _read_senses(loc):
     lexicon = defaultdict(lambda: defaultdict(list))
-    pos_tags = [None, NOUN, VERB, ADJ, ADV, ADJ]
+    pos_tags = [None, NOUN, VERB, ADJ, None, None]
     for line in codecs.open(str(loc), 'r', 'utf8'):
         sense_key, synset_offset, sense_number, tag_cnt = line.split()
         lemma, lex_sense = sense_key.split('%')
         ss_type, lex_filenum, lex_id, head_word, head_id = lex_sense.split(':')
         pos = pos_tags[int(ss_type)]
-        if pos is not None:
-            lexicon[lemma][pos].append(int(lex_filenum) + 1)
+        lexicon[lemma][pos].append(int(lex_filenum) + 1)
     return lexicon
 
@@ -118,18 +117,17 @@ def setup_vocab(src_dir, dst_dir):
             # the first 4 bits. See _parse_features.pyx
             entry['cluster'] = int(cluster[::-1], 2)
         orth_senses = set()
-        lemmas = []
+        orth_senses.update(senses[word.lower()][None])
         for pos in [NOUN, VERB, ADJ]:
             for lemma in lemmatizer(word.lower(), pos):
-                lemmas.append(lemma)
                 orth_senses.update(senses[lemma][pos])
-        orth_senses.update(senses[word.lower()][ADV])
         entry['senses'] = list(sorted(orth_senses))
         vocab[word] = entry
 
     vocab.dump(str(dst_dir / 'lexemes.bin'))
     vocab.strings.dump(str(dst_dir / 'strings.txt'))
+
 
 def main(lang_data_dir, corpora_dir, model_dir):
     model_dir = Path(model_dir)
     lang_data_dir = Path(lang_data_dir)
diff --git a/spacy/munge/read_wordnet.py b/spacy/munge/read_wordnet.py
index eff502953..6d461b21b 100644
--- a/spacy/munge/read_wordnet.py
+++ b/spacy/munge/read_wordnet.py
@@ -87,7 +87,7 @@ def make_supersense_dict(wordnet_dir):
     gather = {}
     for (word, pos, sense), supersense in sense_to_ssense.items():
         key = (word, pos)
-        gather.setdefault((word, pos), []).append((sense, supersense))
+        gather.setdefault((word, pos), []).append((int(sense), supersense))
     mapping = {}
     for (word, pos), senses in gather.items():
         n_senses = len(senses)
@@ -98,7 +98,7 @@
             probs[supersense] = probs.get(supersense, 0.0) + remaining
         for sense, supersense in sorted(senses):
             probs[supersense] += remaining / len(senses)
-        mapping[(word, pos)] = probs
+        mapping.setdefault(word, {}).update(probs)
     return mapping
 
 
diff --git a/spacy/sense_tagger.pyx b/spacy/sense_tagger.pyx
index c6f26b8f8..8147b5ad3 100644
--- a/spacy/sense_tagger.pyx
+++ b/spacy/sense_tagger.pyx
@@ -245,35 +245,32 @@ cdef class SenseTagger:
         if model_loc and path.exists(model_loc):
             self.model.load(model_loc, freq_thresh=0)
         self.strings = strings
+        cdef flags_t all_senses = 0
+        cdef flags_t sense = 0
+        cdef flags_t one = 1
+        for sense in range(1, N_SENSES):
+            all_senses |= (one << sense)
 
-        self.pos_senses[parts_of_speech.NO_TAG] = 0
-        self.pos_senses[parts_of_speech.ADJ] = 0
-        self.pos_senses[parts_of_speech.ADV] = 0
-        self.pos_senses[parts_of_speech.ADP] = 0
-        self.pos_senses[parts_of_speech.CONJ] = 0
-        self.pos_senses[parts_of_speech.DET] = 0
-        self.pos_senses[parts_of_speech.NOUN] = 0
-        self.pos_senses[parts_of_speech.NUM] = 0
-        self.pos_senses[parts_of_speech.PRON] = 0
-        self.pos_senses[parts_of_speech.PRT] = 0
-        self.pos_senses[parts_of_speech.VERB] = 0
-        self.pos_senses[parts_of_speech.X] = 0
+        self.pos_senses[parts_of_speech.NO_TAG] = all_senses
+        self.pos_senses[parts_of_speech.ADJ] = all_senses
+        self.pos_senses[parts_of_speech.ADV] = all_senses
+        self.pos_senses[parts_of_speech.ADP] = all_senses
+        self.pos_senses[parts_of_speech.CONJ] = all_senses
+        self.pos_senses[parts_of_speech.DET] = all_senses
+        self.pos_senses[parts_of_speech.NUM] = all_senses
+        self.pos_senses[parts_of_speech.PRON] = all_senses
+        self.pos_senses[parts_of_speech.PRT] = all_senses
+        self.pos_senses[parts_of_speech.X] = all_senses
         self.pos_senses[parts_of_speech.PUNCT] = 0
         self.pos_senses[parts_of_speech.EOL] = 0
 
-        cdef flags_t sense = 0
-        cdef flags_t one = 1
         for sense in range(N_Tops, V_body):
             self.pos_senses[parts_of_speech.NOUN] |= one << sense
 
+        self.pos_senses[parts_of_speech.VERB] = 0
         for sense in range(V_body, J_ppl):
             self.pos_senses[parts_of_speech.VERB] |= one << sense
 
-        self.pos_senses[parts_of_speech.ADV] |= one << A_all
-        self.pos_senses[parts_of_speech.ADJ] |= one << J_all
-        self.pos_senses[parts_of_speech.ADJ] |= one << J_pert
-        self.pos_senses[parts_of_speech.ADJ] |= one << J_ppl
-
     def __call__(self, Tokens tokens):
         cdef atom_t[CONTEXT_SIZE] local_context
         cdef int i, guess, n_feats
@@ -289,7 +286,7 @@ cdef class SenseTagger:
                 local_feats = self.extractor.get_feats(local_context, &n_feats)
                 features.extend(local_feats, n_feats)
             scores = self.model.get_scores(features.c, features.length)
-            self.weight_scores_by_tagdict(scores, token, 1.0)
+            self.weight_scores_by_tagdict(scores, token, 0.95)
             tokens.data[i].sense = self.best_in_set(scores, valid_senses)
             features.clear()
 
@@ -342,16 +339,6 @@ cdef class SenseTagger:
     cdef int weight_scores_by_tagdict(self, weight_t* scores, const TokenC* token,
                                       weight_t a) except -1:
         lemma = self.strings[token.lemma]
-        if token.pos == NOUN:
-            key = lemma + '/n'
-        elif token.pos == VERB:
-            key = lemma + '/v'
-        elif token.pos == ADJ:
-            key = lemma + '/j'
-        elif token.pos == ADV:
-            key = lemma + '/a'
-        else:
-            return 0
 
         # First softmax the scores
         cdef int i
@@ -361,9 +348,9 @@
         for i in range(N_SENSES):
            scores[i] = (exp(scores[i]) / total)
 
-        probs = self.tagdict.get(key, {})
+        probs = self.tagdict.get(lemma, {})
         for i in range(1, N_SENSES):
-            prob = probs.get(str(i-1), 0)
+            prob = probs.get(unicode(i-1), 0)
            scores[i] = (a * prob) + ((1 - a) * scores[i])
 
     def end_training(self):
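
Reviewer sketch (not part of the patch): the behavioural core of this change is that the supersense dictionary is now keyed by lemma alone, with the '/n', '/v', '/j', '/a' POS suffixes gone, and that weight_scores_by_tagdict blends the classifier's softmaxed scores with the dictionary prior at a = 0.95 instead of overwriting them at a = 1.0. Below is a minimal plain-Python equivalent of that blending step; the tagdict contents and sense numbers are illustrative only, and str stands in for the unicode keys used in the Python 2 codebase:

    from math import exp

    # Hypothetical lemma-keyed priors, shaped like the mapping that
    # make_supersense_dict now returns: {lemma: {sense_index: prob}}.
    tagdict = {'run': {'35': 0.7, '4': 0.2}}

    def weight_scores_by_tagdict(scores, lemma, a=0.95):
        # Softmax the raw classifier scores into a distribution.
        total = sum(exp(s) for s in scores)
        scores = [exp(s) / total for s in scores]
        # Interpolate each real sense with the dictionary prior; index 0
        # is the "no sense" slot, and the dictionary numbers senses from
        # 0, so the score index is offset by one.
        probs = tagdict.get(lemma, {})
        for i in range(1, len(scores)):
            prior = probs.get(str(i - 1), 0.0)
            scores[i] = a * prior + (1 - a) * scores[i]
        return scores

    # With a = 0.95 a dictionary entry dominates, but the model can still
    # break ties among senses the dictionary leaves at zero.
    print(weight_scores_by_tagdict([0.0] * 40, 'run'))

When a lemma has no dictionary entry, probs is empty and every real sense is scaled by the same (1 - a) factor, so their relative ranking remains the classifier's; this is presumably why the hard POS filter and return-0 fallback could be dropped.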