mirror of https://github.com/explosion/spaCy.git

Don't use POS tags in supersense dict

commit 149a901ea7, parent 4e0cd8def8
@@ -82,13 +82,12 @@ def _read_probs(loc):
 
 def _read_senses(loc):
     lexicon = defaultdict(lambda: defaultdict(list))
-    pos_tags = [None, NOUN, VERB, ADJ, ADV, ADJ]
+    pos_tags = [None, NOUN, VERB, ADJ, None, None]
     for line in codecs.open(str(loc), 'r', 'utf8'):
         sense_key, synset_offset, sense_number, tag_cnt = line.split()
         lemma, lex_sense = sense_key.split('%')
         ss_type, lex_filenum, lex_id, head_word, head_id = lex_sense.split(':')
         pos = pos_tags[int(ss_type)]
-        if pos is not None:
-            lexicon[lemma][pos].append(int(lex_filenum) + 1)
+        lexicon[lemma][pos].append(int(lex_filenum) + 1)
     return lexicon
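For illustration, a minimal runnable sketch of what the new pos_tags table does with WordNet's index.sense entries (format: sense_key synset_offset sense_number tag_cnt, where sense_key is lemma%ss_type:lex_filenum:lex_id:head_word:head_id). The sample lines and string POS constants below are illustrative stand-ins, not spaCy's real objects:

    from collections import defaultdict

    NOUN, VERB, ADJ = 'NOUN', 'VERB', 'ADJ'   # stand-ins for spaCy's constants
    # ss_type codes: 1=noun, 2=verb, 3=adjective, 4=adverb, 5=adjective satellite.
    # The old table mapped 4 to ADV and 5 to ADJ; adverb and satellite senses
    # now land in the POS-less None bucket instead.
    pos_tags = [None, NOUN, VERB, ADJ, None, None]

    lexicon = defaultdict(lambda: defaultdict(list))
    for line in ['cat%1:05:00:: 02121620 1 18',
                 'quickly%4:02:00:: 00085811 1 21']:
        sense_key, synset_offset, sense_number, tag_cnt = line.split()
        lemma, lex_sense = sense_key.split('%')
        ss_type, lex_filenum, lex_id, head_word, head_id = lex_sense.split(':')
        lexicon[lemma][pos_tags[int(ss_type)]].append(int(lex_filenum) + 1)

    print(dict(lexicon['cat']))      # {'NOUN': [6]}
    print(dict(lexicon['quickly']))  # {None: [3]} -- keyed by None, not ADV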
@@ -118,18 +117,17 @@ def setup_vocab(src_dir, dst_dir):
             # the first 4 bits. See _parse_features.pyx
             entry['cluster'] = int(cluster[::-1], 2)
         orth_senses = set()
-        lemmas = []
+        orth_senses.update(senses[word.lower()][None])
         for pos in [NOUN, VERB, ADJ]:
             for lemma in lemmatizer(word.lower(), pos):
-                lemmas.append(lemma)
                 orth_senses.update(senses[lemma][pos])
-        orth_senses.update(senses[word.lower()][ADV])
         entry['senses'] = list(sorted(orth_senses))
         vocab[word] = entry
     vocab.dump(str(dst_dir / 'lexemes.bin'))
     vocab.strings.dump(str(dst_dir / 'strings.txt'))
 
 
 def main(lang_data_dir, corpora_dir, model_dir):
     model_dir = Path(model_dir)
     lang_data_dir = Path(lang_data_dir)
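A sketch of the resulting lookup order in setup_vocab: POS-less senses come from the None bucket first, then per-POS lemma lookups; the dedicated ADV lookup is gone. Here `senses` mimics the dict built by _read_senses, and `lemmatizer` is a toy stand-in for spaCy's WordNet lemmatizer, not the real API:

    from collections import defaultdict

    NOUN, VERB, ADJ = 'NOUN', 'VERB', 'ADJ'
    senses = defaultdict(lambda: defaultdict(list))
    senses['cat'][NOUN] = [6]
    senses['quickly'][None] = [3]

    def collect_senses(word, lemmatizer):
        orth_senses = set()
        # adverb/satellite senses now come from the POS-less None bucket,
        # replacing the old senses[word.lower()][ADV] lookup
        orth_senses.update(senses[word.lower()][None])
        for pos in (NOUN, VERB, ADJ):
            for lemma in lemmatizer(word.lower(), pos):
                orth_senses.update(senses[lemma][pos])
        return sorted(orth_senses)

    print(collect_senses('Quickly', lambda w, pos: [w]))       # [3]
    print(collect_senses('cats', lambda w, pos: [w, w[:-1]]))  # [6]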
@@ -87,7 +87,7 @@ def make_supersense_dict(wordnet_dir):
     gather = {}
     for (word, pos, sense), supersense in sense_to_ssense.items():
         key = (word, pos)
-        gather.setdefault((word, pos), []).append((sense, supersense))
+        gather.setdefault((word, pos), []).append((int(sense), supersense))
     mapping = {}
     for (word, pos), senses in gather.items():
         n_senses = len(senses)
@@ -98,7 +98,7 @@ def make_supersense_dict(wordnet_dir):
             probs[supersense] = probs.get(supersense, 0.0) + remaining
         for sense, supersense in sorted(senses):
             probs[supersense] += remaining / len(senses)
-        mapping[(word, pos)] = probs
+        mapping.setdefault(word, {}).update(probs)
     return mapping
 
 
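The net effect of these two hunks is that the supersense dict is keyed by the bare word, with the per-POS probability dicts merged into one. A toy sketch with made-up supersenses, collapsing the first-sense weighting into a uniform split for brevity; note that dict.update() overwrites, rather than sums, any supersense key that two POS entries happen to share:

    gather = {
        ('strike', 'n'): [(1, 'noun.act'), (2, 'noun.event')],
        ('strike', 'v'): [(1, 'verb.contact')],
    }
    mapping = {}
    for (word, pos), senses in gather.items():
        probs = {supersense: 1.0 / len(senses) for sense, supersense in senses}
        # keyed by word alone now, merging across POS
        mapping.setdefault(word, {}).update(probs)

    print(mapping['strike'])
    # {'noun.act': 0.5, 'noun.event': 0.5, 'verb.contact': 1.0}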
@@ -245,35 +245,32 @@ cdef class SenseTagger:
         if model_loc and path.exists(model_loc):
             self.model.load(model_loc, freq_thresh=0)
         self.strings = strings
+        cdef flags_t all_senses = 0
+        cdef flags_t sense = 0
+        cdef flags_t one = 1
+        for sense in range(1, N_SENSES):
+            all_senses |= (one << sense)
 
-        self.pos_senses[<int>parts_of_speech.NO_TAG] = 0
-        self.pos_senses[<int>parts_of_speech.ADJ] = 0
-        self.pos_senses[<int>parts_of_speech.ADV] = 0
-        self.pos_senses[<int>parts_of_speech.ADP] = 0
-        self.pos_senses[<int>parts_of_speech.CONJ] = 0
-        self.pos_senses[<int>parts_of_speech.DET] = 0
-        self.pos_senses[<int>parts_of_speech.NOUN] = 0
-        self.pos_senses[<int>parts_of_speech.NUM] = 0
-        self.pos_senses[<int>parts_of_speech.PRON] = 0
-        self.pos_senses[<int>parts_of_speech.PRT] = 0
-        self.pos_senses[<int>parts_of_speech.VERB] = 0
-        self.pos_senses[<int>parts_of_speech.X] = 0
+        self.pos_senses[<int>parts_of_speech.NO_TAG] = all_senses
+        self.pos_senses[<int>parts_of_speech.ADJ] = all_senses
+        self.pos_senses[<int>parts_of_speech.ADV] = all_senses
+        self.pos_senses[<int>parts_of_speech.ADP] = all_senses
+        self.pos_senses[<int>parts_of_speech.CONJ] = all_senses
+        self.pos_senses[<int>parts_of_speech.DET] = all_senses
+        self.pos_senses[<int>parts_of_speech.NUM] = all_senses
+        self.pos_senses[<int>parts_of_speech.PRON] = all_senses
+        self.pos_senses[<int>parts_of_speech.PRT] = all_senses
+        self.pos_senses[<int>parts_of_speech.X] = all_senses
         self.pos_senses[<int>parts_of_speech.PUNCT] = 0
         self.pos_senses[<int>parts_of_speech.EOL] = 0
 
-        cdef flags_t sense = 0
-        cdef flags_t one = 1
         for sense in range(N_Tops, V_body):
             self.pos_senses[<int>parts_of_speech.NOUN] |= one << sense
 
+        self.pos_senses[<int>parts_of_speech.VERB] = 0
         for sense in range(V_body, J_ppl):
             self.pos_senses[<int>parts_of_speech.VERB] |= one << sense
 
-        self.pos_senses[<int>parts_of_speech.ADV] |= one << A_all
-        self.pos_senses[<int>parts_of_speech.ADJ] |= one << J_all
-        self.pos_senses[<int>parts_of_speech.ADJ] |= one << J_pert
-        self.pos_senses[<int>parts_of_speech.ADJ] |= one << J_ppl
-
     def __call__(self, Tokens tokens):
         cdef atom_t[CONTEXT_SIZE] local_context
         cdef int i, guess, n_feats
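A plain-Python sketch of the pos_senses bitmask scheme this hunk rewires (flags_t is a wide integer type in the Cython source): tags other than NOUN/VERB now permit every sense rather than none, while the noun and verb masks stay restricted to their own supersense blocks. The sense-ID boundaries below are illustrative stand-ins for the N_Tops/V_body/J_ppl enum values:

    N_SENSES = 50
    N_Tops, V_body, J_ppl = 1, 27, 42   # illustrative, not the real enum values

    one = 1
    all_senses = 0
    for sense in range(1, N_SENSES):
        all_senses |= one << sense      # every sense bit set except the 0 sentinel

    pos_senses = {}
    for tag in ('NO_TAG', 'ADJ', 'ADV', 'ADP', 'CONJ', 'DET',
                'NUM', 'PRON', 'PRT', 'X'):
        pos_senses[tag] = all_senses    # these tags no longer veto any sense
    pos_senses['NOUN'] = 0
    for sense in range(N_Tops, V_body):
        pos_senses['NOUN'] |= one << sense   # noun supersense block only
    pos_senses['VERB'] = 0
    for sense in range(V_body, J_ppl):
        pos_senses['VERB'] |= one << sense   # verb supersense block only

    def is_valid(tag, sense):
        return bool(pos_senses[tag] & (one << sense))

    print(is_valid('NOUN', 10), is_valid('VERB', 10))   # True False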
@@ -289,7 +286,7 @@ cdef class SenseTagger:
             local_feats = self.extractor.get_feats(local_context, &n_feats)
             features.extend(local_feats, n_feats)
             scores = self.model.get_scores(features.c, features.length)
-            self.weight_scores_by_tagdict(<weight_t*><void*>scores, token, 1.0)
+            self.weight_scores_by_tagdict(<weight_t*><void*>scores, token, 0.95)
             tokens.data[i].sense = self.best_in_set(scores, valid_senses)
             features.clear()
 
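The changed constant is the interpolation weight `a` in weight_scores_by_tagdict (next hunk): with a = 1.0 the softmaxed classifier score was discarded wherever the tagdict had an entry; 0.95 keeps a 5% contribution from the model. A worked example with made-up numbers:

    a = 0.95
    prob = 0.30    # tagdict prior for this sense
    score = 0.80   # softmaxed classifier score for the same sense
    blended = (a * prob) + ((1 - a) * score)
    print(round(blended, 3))   # 0.325
    # with the old a = 1.0 the result would have been exactly prob (0.30)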
@@ -342,16 +339,6 @@ cdef class SenseTagger:
     cdef int weight_scores_by_tagdict(self, weight_t* scores, const TokenC* token,
                                       weight_t a) except -1:
         lemma = self.strings[token.lemma]
-        if token.pos == NOUN:
-            key = lemma + '/n'
-        elif token.pos == VERB:
-            key = lemma + '/v'
-        elif token.pos == ADJ:
-            key = lemma + '/j'
-        elif token.pos == ADV:
-            key = lemma + '/a'
-        else:
-            return 0
 
         # First softmax the scores
         cdef int i
@@ -361,9 +348,9 @@ cdef class SenseTagger:
         for i in range(N_SENSES):
            scores[i] = <weight_t>(exp(scores[i]) / total)
 
-        probs = self.tagdict.get(key, {})
+        probs = self.tagdict.get(lemma, {})
         for i in range(1, N_SENSES):
-            prob = probs.get(str(i-1), 0)
+            prob = probs.get(unicode(i-1), 0)
             scores[i] = (a * prob) + ((1 - a) * scores[i])
 
     def end_training(self):
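Putting the last two hunks together: the tag dictionary is now consulted under the bare lemma (no '/n', '/v' style POS suffix), with stringified sense numbers as inner keys. A self-contained sketch of the softmax-then-interpolate step, with an invented tagdict; str stands in for the Python 2 unicode of the source:

    from math import exp

    def weight_scores(scores, lemma, tagdict, a=0.95):
        # softmax the raw classifier scores first
        total = sum(exp(s) for s in scores)
        scores = [exp(s) / total for s in scores]
        probs = tagdict.get(lemma, {})          # keyed by lemma alone now
        for i in range(1, len(scores)):
            prob = probs.get(str(i - 1), 0)     # unicode sense-number keys
            scores[i] = (a * prob) + ((1 - a) * scores[i])
        return scores

    tagdict = {'run': {'0': 0.7, '1': 0.3}}     # illustrative priors
    print(weight_scores([0.0, 2.0, 1.0], 'run', tagdict))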