* Don't use POS tags in supersense dict

Matthew Honnibal 2015-07-05 10:50:02 +02:00
parent 4e0cd8def8
commit 149a901ea7
3 changed files with 25 additions and 40 deletions

View File

@@ -82,14 +82,13 @@ def _read_probs(loc):
 def _read_senses(loc):
     lexicon = defaultdict(lambda: defaultdict(list))
-    pos_tags = [None, NOUN, VERB, ADJ, ADV, ADJ]
+    pos_tags = [None, NOUN, VERB, ADJ, None, None]
     for line in codecs.open(str(loc), 'r', 'utf8'):
         sense_key, synset_offset, sense_number, tag_cnt = line.split()
         lemma, lex_sense = sense_key.split('%')
         ss_type, lex_filenum, lex_id, head_word, head_id = lex_sense.split(':')
         pos = pos_tags[int(ss_type)]
-        if pos is not None:
-            lexicon[lemma][pos].append(int(lex_filenum) + 1)
+        lexicon[lemma][pos].append(int(lex_filenum) + 1)
     return lexicon
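
For reference, a minimal sketch (not part of the commit) of how one WordNet index.sense line flows through _read_senses; the example line is hypothetical. ss_type codes 1-5 mean noun, verb, adjective, adverb, and satellite adjective, so after this change adverb and satellite-adjective senses are filed under the POS-less None bucket instead of being dropped or mapped to a POS:

    # Sketch: parsing one (hypothetical) WordNet index.sense entry the way
    # _read_senses does. ss_type 2 selects VERB; ss_types 4 and 5 now map
    # to None, so adverb and satellite-adjective senses share one bucket.
    line = 'abandon%2:40:01:: 02232813 1 10'
    sense_key, synset_offset, sense_number, tag_cnt = line.split()
    lemma, lex_sense = sense_key.split('%')          # 'abandon', '2:40:01::'
    ss_type, lex_filenum, lex_id, head_word, head_id = lex_sense.split(':')
    # supersense id = lexicographer file number + 1 (0 is reserved for "no sense")
    print(lemma, int(ss_type), int(lex_filenum) + 1)  # abandon 2 41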
@@ -118,18 +117,17 @@ def setup_vocab(src_dir, dst_dir):
         # the first 4 bits. See _parse_features.pyx
         entry['cluster'] = int(cluster[::-1], 2)
         orth_senses = set()
-        lemmas = []
+        orth_senses.update(senses[word.lower()][None])
         for pos in [NOUN, VERB, ADJ]:
             for lemma in lemmatizer(word.lower(), pos):
-                lemmas.append(lemma)
                 orth_senses.update(senses[lemma][pos])
-        orth_senses.update(senses[word.lower()][ADV])
         entry['senses'] = list(sorted(orth_senses))
         vocab[word] = entry
     vocab.dump(str(dst_dir / 'lexemes.bin'))
     vocab.strings.dump(str(dst_dir / 'strings.txt'))


 def main(lang_data_dir, corpora_dir, model_dir):
     model_dir = Path(model_dir)
     lang_data_dir = Path(lang_data_dir)
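
The net effect on setup_vocab, as a standalone sketch under stated assumptions: `senses` is the nested defaultdict returned by _read_senses and `lemmatizer(string, pos)` yields candidate lemmas, as in the file above; the POS constants here are stand-ins, not the real spacy.parts_of_speech ids:

    NOUN, VERB, ADJ = 'n', 'v', 'j'   # stand-ins for spacy.parts_of_speech ids

    def collect_senses(word, senses, lemmatizer):
        orth_senses = set()
        orth_senses.update(senses[word.lower()][None])   # the POS-less bucket
        for pos in [NOUN, VERB, ADJ]:
            for lemma in lemmatizer(word.lower(), pos):
                orth_senses.update(senses[lemma][pos])
        return list(sorted(orth_senses))

The None bucket now carries what the removed explicit ADV lookup used to contribute, plus the satellite-adjective senses that previously went to the ADJ key.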

View File

@@ -87,7 +87,7 @@ def make_supersense_dict(wordnet_dir):
     gather = {}
     for (word, pos, sense), supersense in sense_to_ssense.items():
         key = (word, pos)
-        gather.setdefault((word, pos), []).append((sense, supersense))
+        gather.setdefault((word, pos), []).append((int(sense), supersense))
     mapping = {}
     for (word, pos), senses in gather.items():
         n_senses = len(senses)
@@ -98,7 +98,7 @@ def make_supersense_dict(wordnet_dir):
             probs[supersense] = probs.get(supersense, 0.0) + remaining
         for sense, supersense in sorted(senses):
             probs[supersense] += remaining / len(senses)
-        mapping[(word, pos)] = probs
+        mapping.setdefault(word, {}).update(probs)
     return mapping
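
A worked example (hypothetical supersense labels and probabilities) of what the last change does to the dictionary's shape: entries for the same word under different POS now merge into a single per-word dict, and dict.update() lets a later POS entry overwrite any supersense key the two have in common rather than combining the probabilities:

    mapping = {}
    entries = [(('bank', 'n'), {'noun.group': 0.7, 'noun.possession': 0.3}),
               (('bank', 'v'), {'verb.possession': 1.0})]
    for (word, pos), probs in entries:
        mapping.setdefault(word, {}).update(probs)
    assert mapping == {'bank': {'noun.group': 0.7, 'noun.possession': 0.3,
                                'verb.possession': 1.0}}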

View File

@@ -245,35 +245,32 @@ cdef class SenseTagger:
         if model_loc and path.exists(model_loc):
             self.model.load(model_loc, freq_thresh=0)
         self.strings = strings
+        cdef flags_t all_senses = 0
+        cdef flags_t sense = 0
+        cdef flags_t one = 1
+        for sense in range(1, N_SENSES):
+            all_senses |= (one << sense)
-        self.pos_senses[<int>parts_of_speech.NO_TAG] = 0
-        self.pos_senses[<int>parts_of_speech.ADJ] = 0
-        self.pos_senses[<int>parts_of_speech.ADV] = 0
-        self.pos_senses[<int>parts_of_speech.ADP] = 0
-        self.pos_senses[<int>parts_of_speech.CONJ] = 0
-        self.pos_senses[<int>parts_of_speech.DET] = 0
-        self.pos_senses[<int>parts_of_speech.NOUN] = 0
-        self.pos_senses[<int>parts_of_speech.NUM] = 0
-        self.pos_senses[<int>parts_of_speech.PRON] = 0
-        self.pos_senses[<int>parts_of_speech.PRT] = 0
-        self.pos_senses[<int>parts_of_speech.VERB] = 0
-        self.pos_senses[<int>parts_of_speech.X] = 0
+        self.pos_senses[<int>parts_of_speech.NO_TAG] = all_senses
+        self.pos_senses[<int>parts_of_speech.ADJ] = all_senses
+        self.pos_senses[<int>parts_of_speech.ADV] = all_senses
+        self.pos_senses[<int>parts_of_speech.ADP] = all_senses
+        self.pos_senses[<int>parts_of_speech.CONJ] = all_senses
+        self.pos_senses[<int>parts_of_speech.DET] = all_senses
+        self.pos_senses[<int>parts_of_speech.NUM] = all_senses
+        self.pos_senses[<int>parts_of_speech.PRON] = all_senses
+        self.pos_senses[<int>parts_of_speech.PRT] = all_senses
+        self.pos_senses[<int>parts_of_speech.X] = all_senses
         self.pos_senses[<int>parts_of_speech.PUNCT] = 0
         self.pos_senses[<int>parts_of_speech.EOL] = 0
-        cdef flags_t sense = 0
-        cdef flags_t one = 1
         for sense in range(N_Tops, V_body):
             self.pos_senses[<int>parts_of_speech.NOUN] |= one << sense
+        self.pos_senses[<int>parts_of_speech.VERB] = 0
         for sense in range(V_body, J_ppl):
             self.pos_senses[<int>parts_of_speech.VERB] |= one << sense
-        self.pos_senses[<int>parts_of_speech.ADV] |= one << A_all
-        self.pos_senses[<int>parts_of_speech.ADJ] |= one << J_all
-        self.pos_senses[<int>parts_of_speech.ADJ] |= one << J_pert
-        self.pos_senses[<int>parts_of_speech.ADJ] |= one << J_ppl

     def __call__(self, Tokens tokens):
         cdef atom_t[CONTEXT_SIZE] local_context
         cdef int i, guess, n_feats
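
pos_senses is a per-POS bitmask of permissible sense ids: after this hunk, only NOUN and VERB keep restricted masks, while every other tag (except PUNCT and EOL) admits all senses. A plain-Python sketch of the scheme; the constant values assume WordNet's lexicographer-file numbering shifted by one (J_all=1, J_pert=2, A_all=3, N_Tops=4, V_body=30, J_ppl=45), which is an inference, not something stated in the diff:

    N_Tops, V_body, J_ppl, N_SENSES = 4, 30, 45, 46

    all_senses = 0
    for sense in range(1, N_SENSES):
        all_senses |= 1 << sense

    noun_mask = 0
    for sense in range(N_Tops, V_body):      # noun supersenses only
        noun_mask |= 1 << sense
    verb_mask = 0
    for sense in range(V_body, J_ppl):       # verb supersenses only
        verb_mask |= 1 << sense

    def is_valid(mask, sense):               # a sense is permitted iff its bit is set
        return bool(mask & (1 << sense))

    assert is_valid(all_senses, J_ppl)
    assert is_valid(noun_mask, N_Tops) and not is_valid(noun_mask, V_body)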
@@ -289,7 +286,7 @@ cdef class SenseTagger:
             local_feats = self.extractor.get_feats(local_context, &n_feats)
             features.extend(local_feats, n_feats)
             scores = self.model.get_scores(features.c, features.length)
-            self.weight_scores_by_tagdict(<weight_t*><void*>scores, token, 1.0)
+            self.weight_scores_by_tagdict(<weight_t*><void*>scores, token, 0.95)
             tokens.data[i].sense = self.best_in_set(scores, valid_senses)
             features.clear()
@@ -342,16 +339,6 @@ cdef class SenseTagger:
     cdef int weight_scores_by_tagdict(self, weight_t* scores, const TokenC* token,
                                       weight_t a) except -1:
         lemma = self.strings[token.lemma]
-        if token.pos == NOUN:
-            key = lemma + '/n'
-        elif token.pos == VERB:
-            key = lemma + '/v'
-        elif token.pos == ADJ:
-            key = lemma + '/j'
-        elif token.pos == ADV:
-            key = lemma + '/a'
-        else:
-            return 0
         # First softmax the scores
         cdef int i
@@ -361,9 +348,9 @@ cdef class SenseTagger:
         for i in range(N_SENSES):
             scores[i] = <weight_t>(exp(scores[i]) / total)
-        probs = self.tagdict.get(key, {})
+        probs = self.tagdict.get(lemma, {})
         for i in range(1, N_SENSES):
-            prob = probs.get(str(i-1), 0)
+            prob = probs.get(unicode(i-1), 0)
             scores[i] = (a * prob) + ((1 - a) * scores[i])

     def end_training(self):
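
Finally, a sketch of the reweighting in weight_scores_by_tagdict after these changes: the model's scores are softmaxed, then interpolated with the dictionary prior looked up under the bare lemma (no POS suffix, and no early return for non-content POS), with the call site now passing a = 0.95 so the dictionary dominates wherever it has an entry. This is a pure-Python rendering of the Cython code; the real version mutates a C weight_t array in place, and str(i - 1) here stands in for the unicode(i - 1) keys of the Python 2 source:

    import math

    def weight_scores_by_tagdict(scores, tagdict, lemma, a=0.95):
        # softmax the raw model scores
        total = sum(math.exp(s) for s in scores)
        scores = [math.exp(s) / total for s in scores]
        # score slot i corresponds to dictionary key str(i - 1); slot 0 is "no sense"
        probs = tagdict.get(lemma, {})
        return [scores[0]] + [a * probs.get(str(i - 1), 0) + (1 - a) * scores[i]
                              for i in range(1, len(scores))]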