Mirror of https://github.com/explosion/spaCy.git (synced 2024-11-14 13:47:13 +03:00)

Commit 149a901ea7 (parent 4e0cd8def8):

    Don't use POS tags in supersense dict
@@ -82,13 +82,12 @@ def _read_probs(loc):


 def _read_senses(loc):
     lexicon = defaultdict(lambda: defaultdict(list))
-    pos_tags = [None, NOUN, VERB, ADJ, ADV, ADJ]
+    pos_tags = [None, NOUN, VERB, ADJ, None, None]
     for line in codecs.open(str(loc), 'r', 'utf8'):
         sense_key, synset_offset, sense_number, tag_cnt = line.split()
         lemma, lex_sense = sense_key.split('%')
         ss_type, lex_filenum, lex_id, head_word, head_id = lex_sense.split(':')
         pos = pos_tags[int(ss_type)]
         if pos is not None:
             lexicon[lemma][pos].append(int(lex_filenum) + 1)
     return lexicon

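Note: each line of WordNet's index.sense begins with a sense key of the form lemma%ss_type:lex_filenum:lex_id:head_word:head_id, and ss_type runs 1-5 for noun, verb, adjective, adverb, and satellite adjective. That ss_type is the index into pos_tags above, so the new table simply discards adverb and satellite-adjective senses. A standalone sketch of the parse (the helper name and sample line are illustrative):

    # Sketch only: parse one index.sense line the way _read_senses does.
    def parse_sense_line(line):
        sense_key, synset_offset, sense_number, tag_cnt = line.split()
        lemma, lex_sense = sense_key.split('%')
        ss_type, lex_filenum, lex_id, head_word, head_id = lex_sense.split(':')
        # ss_type: 1=noun, 2=verb, 3=adj, 4=adv, 5=satellite adj
        # lex_filenum names the lexicographer file, i.e. the supersense
        return lemma, int(ss_type), int(lex_filenum)

    print(parse_sense_line('dog%1:05:00:: 02084071 1 42'))
    # -> ('dog', 1, 5): a noun from lexicographer file 05, noun.animal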
@@ -118,18 +117,17 @@ def setup_vocab(src_dir, dst_dir):
             # the first 4 bits. See _parse_features.pyx
             entry['cluster'] = int(cluster[::-1], 2)
             orth_senses = set()
             lemmas = []
-            orth_senses.update(senses[word.lower()][None])
             for pos in [NOUN, VERB, ADJ]:
                 for lemma in lemmatizer(word.lower(), pos):
                     lemmas.append(lemma)
                     orth_senses.update(senses[lemma][pos])
-            orth_senses.update(senses[word.lower()][ADV])
             entry['senses'] = list(sorted(orth_senses))
             vocab[word] = entry
     vocab.dump(str(dst_dir / 'lexemes.bin'))
     vocab.strings.dump(str(dst_dir / 'strings.txt'))


 def main(lang_data_dir, corpora_dir, model_dir):
     model_dir = Path(model_dir)
     lang_data_dir = Path(lang_data_dir)

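Note: the loop unions the sense ids of every lemma the word can have as a noun, verb, or adjective; the word-level [None] and [ADV] lookups can go because, with the new pos_tags table and the `if pos is not None` guard, the lexicon never holds entries under those keys. A rough standalone sketch of that accumulation (the lemmatizer stub and sense ids are invented):

    from collections import defaultdict

    # senses[lemma][pos] -> sense-type ids, shaped like _read_senses' output.
    senses = defaultdict(lambda: defaultdict(list))
    senses['dog']['NOUN'] = [6]     # invented id for noun.animal
    senses['dog']['VERB'] = [36]    # invented id for a verb supersense

    def lemmatize(word, pos):
        # Stand-in for the real lemmatizer: strip a plural 's'.
        return [word[:-1]] if word.endswith('s') else [word]

    orth_senses = set()
    for pos in ['NOUN', 'VERB', 'ADJ']:
        for lemma in lemmatize('dogs', pos):
            orth_senses.update(senses[lemma][pos])

    print(sorted(orth_senses))      # [6, 36]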
@@ -87,7 +87,7 @@ def make_supersense_dict(wordnet_dir):
     gather = {}
     for (word, pos, sense), supersense in sense_to_ssense.items():
         key = (word, pos)
-        gather.setdefault((word, pos), []).append((sense, supersense))
+        gather.setdefault((word, pos), []).append((int(sense), supersense))
     mapping = {}
     for (word, pos), senses in gather.items():
         n_senses = len(senses)

@@ -98,7 +98,7 @@ def make_supersense_dict(wordnet_dir):
             probs[supersense] = probs.get(supersense, 0.0) + remaining
         for sense, supersense in sorted(senses):
             probs[supersense] += remaining / len(senses)
-        mapping[(word, pos)] = probs
+        mapping.setdefault(word, {}).update(probs)
     return mapping

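Note: mapping.setdefault(word, {}).update(probs) folds the per-(word, pos) distributions into a single entry per word, which is the point of the commit. Because dict.update overwrites, a later POS replaces any supersense probability already recorded for the same word rather than summing with it, and the merged values need not sum to 1. A toy illustration with invented words and numbers:

    # Toy illustration of the word-keyed merge (values invented).
    mapping = {}
    per_pos = {
        ('book', 'n'): {'noun.artifact': 0.8, 'noun.communication': 0.2},
        ('book', 'v'): {'verb.social': 1.0},
    }
    for (word, pos), probs in per_pos.items():
        mapping.setdefault(word, {}).update(probs)

    print(mapping['book'])
    # {'noun.artifact': 0.8, 'noun.communication': 0.2, 'verb.social': 1.0}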
@@ -245,35 +245,32 @@ cdef class SenseTagger:
         if model_loc and path.exists(model_loc):
             self.model.load(model_loc, freq_thresh=0)
         self.strings = strings
-        self.pos_senses[<int>parts_of_speech.NO_TAG] = 0
-        self.pos_senses[<int>parts_of_speech.ADJ] = 0
-        self.pos_senses[<int>parts_of_speech.ADV] = 0
-        self.pos_senses[<int>parts_of_speech.ADP] = 0
-        self.pos_senses[<int>parts_of_speech.CONJ] = 0
-        self.pos_senses[<int>parts_of_speech.DET] = 0
-        self.pos_senses[<int>parts_of_speech.NOUN] = 0
-        self.pos_senses[<int>parts_of_speech.NUM] = 0
-        self.pos_senses[<int>parts_of_speech.PRON] = 0
-        self.pos_senses[<int>parts_of_speech.PRT] = 0
-        self.pos_senses[<int>parts_of_speech.VERB] = 0
-        self.pos_senses[<int>parts_of_speech.X] = 0
+        cdef flags_t all_senses = 0
+        cdef flags_t sense = 0
+        cdef flags_t one = 1
+        for sense in range(1, N_SENSES):
+            all_senses |= (one << sense)
+
+        self.pos_senses[<int>parts_of_speech.NO_TAG] = all_senses
+        self.pos_senses[<int>parts_of_speech.ADJ] = all_senses
+        self.pos_senses[<int>parts_of_speech.ADV] = all_senses
+        self.pos_senses[<int>parts_of_speech.ADP] = all_senses
+        self.pos_senses[<int>parts_of_speech.CONJ] = all_senses
+        self.pos_senses[<int>parts_of_speech.DET] = all_senses
+        self.pos_senses[<int>parts_of_speech.NUM] = all_senses
+        self.pos_senses[<int>parts_of_speech.PRON] = all_senses
+        self.pos_senses[<int>parts_of_speech.PRT] = all_senses
+        self.pos_senses[<int>parts_of_speech.X] = all_senses
         self.pos_senses[<int>parts_of_speech.PUNCT] = 0
         self.pos_senses[<int>parts_of_speech.EOL] = 0

-        cdef flags_t sense = 0
-        cdef flags_t one = 1
         for sense in range(N_Tops, V_body):
             self.pos_senses[<int>parts_of_speech.NOUN] |= one << sense

+        self.pos_senses[<int>parts_of_speech.VERB] = 0
         for sense in range(V_body, J_ppl):
             self.pos_senses[<int>parts_of_speech.VERB] |= one << sense

         self.pos_senses[<int>parts_of_speech.ADV] |= one << A_all
         self.pos_senses[<int>parts_of_speech.ADJ] |= one << J_all
         self.pos_senses[<int>parts_of_speech.ADJ] |= one << J_pert
         self.pos_senses[<int>parts_of_speech.ADJ] |= one << J_ppl

     def __call__(self, Tokens tokens):
         cdef atom_t[CONTEXT_SIZE] local_context
         cdef int i, guess, n_feats
@@ -289,7 +286,7 @@ cdef class SenseTagger:
                 local_feats = self.extractor.get_feats(local_context, &n_feats)
                 features.extend(local_feats, n_feats)
                 scores = self.model.get_scores(features.c, features.length)
-                self.weight_scores_by_tagdict(<weight_t*><void*>scores, token, 1.0)
+                self.weight_scores_by_tagdict(<weight_t*><void*>scores, token, 0.95)
                 tokens.data[i].sense = self.best_in_set(scores, valid_senses)
                 features.clear()

@@ -342,16 +339,6 @@ cdef class SenseTagger:
     cdef int weight_scores_by_tagdict(self, weight_t* scores, const TokenC* token,
                                       weight_t a) except -1:
         lemma = self.strings[token.lemma]
-        if token.pos == NOUN:
-            key = lemma + '/n'
-        elif token.pos == VERB:
-            key = lemma + '/v'
-        elif token.pos == ADJ:
-            key = lemma + '/j'
-        elif token.pos == ADV:
-            key = lemma + '/a'
-        else:
-            return 0

         # First softmax the scores
         cdef int i
@@ -361,9 +348,9 @@ cdef class SenseTagger:
         for i in range(N_SENSES):
             scores[i] = <weight_t>(exp(scores[i]) / total)

-        probs = self.tagdict.get(key, {})
+        probs = self.tagdict.get(lemma, {})
         for i in range(1, N_SENSES):
-            prob = probs.get(str(i-1), 0)
+            prob = probs.get(unicode(i-1), 0)
             scores[i] = (a * prob) + ((1 - a) * scores[i])

     def end_training(self):
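Note: taken together, the last three hunks make the tag-dictionary backoff POS-free and soften it. The model scores are softmaxed, the dictionary is looked up by lemma alone instead of a lemma-plus-POS key, and the blend scores[i] = (a * prob) + ((1 - a) * scores[i]) now uses a = 0.95 rather than 1.0, so the classifier keeps a 5% vote. A standalone sketch of the blend with illustrative names and numbers; the real code offsets sense ids by one and stores dictionary keys as strings (hence probs.get(unicode(i-1), 0)), which the sketch abstracts away:

    import math

    def weight_by_tagdict(scores, probs, a=0.95):
        # Softmax the raw scores, then take a * P_dict + (1 - a) * P_model.
        total = sum(math.exp(s) for s in scores)
        soft = [math.exp(s) / total for s in scores]
        return [a * probs.get(i, 0.0) + (1 - a) * soft[i]
                for i in range(len(scores))]

    # Toy usage: the dictionary strongly prefers sense 2.
    print(weight_by_tagdict([0.1, 1.5, 0.3], {2: 0.9}))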