mirror of
https://github.com/explosion/spaCy.git
synced 2025-01-28 02:04:07 +03:00
* Use tagdict in sense_tagger
This commit is contained in:
parent
5e0545be5c
commit
427ea16b27
|
@ -1,12 +1,13 @@
|
||||||
from libc.string cimport memcpy
|
from libc.string cimport memcpy
|
||||||
|
from libc.math cimport exp
|
||||||
|
|
||||||
from cymem.cymem cimport Pool
|
from cymem.cymem cimport Pool
|
||||||
|
|
||||||
from thinc.learner cimport LinearModel
|
from thinc.learner cimport LinearModel
|
||||||
from thinc.features cimport Extractor, Feature
|
from thinc.features cimport Extractor, Feature
|
||||||
|
|
||||||
from thinc.typedefs cimport atom_t, weight_t, feat_t
|
from thinc.typedefs cimport atom_t, weight_t, feat_t
|
||||||
|
cimport cython
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
from .typedefs cimport flags_t
|
from .typedefs cimport flags_t
|
||||||
|
@ -14,13 +15,14 @@ from .structs cimport TokenC
|
||||||
from .strings cimport StringStore
|
from .strings cimport StringStore
|
||||||
from .tokens cimport Tokens
|
from .tokens cimport Tokens
|
||||||
from .senses cimport N_SENSES, encode_sense_strs
|
from .senses cimport N_SENSES, encode_sense_strs
|
||||||
from .senses cimport NO_SENSE, N_Tops, J_ppl, V_body
|
from .senses cimport NO_SENSE, N_Tops, J_all, J_pert, A_all, J_ppl, V_body
|
||||||
from .gold cimport GoldParse
|
from .gold cimport GoldParse
|
||||||
from .parts_of_speech cimport NOUN, VERB, N_UNIV_TAGS
|
from .parts_of_speech cimport NOUN, VERB, ADV, ADJ, N_UNIV_TAGS
|
||||||
|
|
||||||
from . cimport parts_of_speech
|
from . cimport parts_of_speech
|
||||||
|
|
||||||
from os import path
|
from os import path
|
||||||
|
import json
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
@ -223,17 +225,25 @@ cdef class SenseTagger:
|
||||||
cdef readonly Extractor extractor
|
cdef readonly Extractor extractor
|
||||||
cdef readonly model_dir
|
cdef readonly model_dir
|
||||||
cdef readonly flags_t[<int>N_UNIV_TAGS] pos_senses
|
cdef readonly flags_t[<int>N_UNIV_TAGS] pos_senses
|
||||||
|
cdef dict tagdict
|
||||||
|
|
||||||
def __init__(self, StringStore strings, model_dir):
|
def __init__(self, StringStore strings, model_dir):
|
||||||
if model_dir is not None and path.isdir(model_dir):
|
if model_dir is not None and path.isdir(model_dir):
|
||||||
model_dir = path.join(model_dir, 'model')
|
model_dir = path.join(model_dir, 'wsd')
|
||||||
|
|
||||||
|
self.model_dir = model_dir
|
||||||
|
if path.exists(path.join(model_dir, 'supersenses.json')):
|
||||||
|
self.tagdict = json.load(open(path.join(model_dir, 'supersenses.json')))
|
||||||
|
else:
|
||||||
|
self.tagdict = {}
|
||||||
|
|
||||||
templates = unigrams + bigrams + trigrams
|
templates = unigrams + bigrams + trigrams
|
||||||
self.extractor = Extractor(templates)
|
self.extractor = Extractor(templates)
|
||||||
self.model = LinearModel(N_SENSES, self.extractor.n_templ)
|
self.model = LinearModel(N_SENSES, self.extractor.n_templ)
|
||||||
self.model_dir = model_dir
|
|
||||||
if self.model_dir and path.exists(self.model_dir):
|
model_loc = path.join(self.model_dir, 'model')
|
||||||
self.model.load(self.model_dir, freq_thresh=0)
|
if model_loc and path.exists(model_loc):
|
||||||
|
self.model.load(model_loc, freq_thresh=0)
|
||||||
self.strings = strings
|
self.strings = strings
|
||||||
|
|
||||||
self.pos_senses[<int>parts_of_speech.NO_TAG] = 0
|
self.pos_senses[<int>parts_of_speech.NO_TAG] = 0
|
||||||
|
@ -252,89 +262,119 @@ cdef class SenseTagger:
|
||||||
self.pos_senses[<int>parts_of_speech.EOL] = 0
|
self.pos_senses[<int>parts_of_speech.EOL] = 0
|
||||||
|
|
||||||
cdef flags_t sense = 0
|
cdef flags_t sense = 0
|
||||||
|
cdef flags_t one = 1
|
||||||
for sense in range(N_Tops, V_body):
|
for sense in range(N_Tops, V_body):
|
||||||
self.pos_senses[<int>parts_of_speech.NOUN] |= 1 << sense
|
self.pos_senses[<int>parts_of_speech.NOUN] |= one << sense
|
||||||
|
|
||||||
for sense in range(V_body, J_ppl):
|
for sense in range(V_body, J_ppl):
|
||||||
self.pos_senses[<int>parts_of_speech.VERB] |= 1 << sense
|
self.pos_senses[<int>parts_of_speech.VERB] |= one << sense
|
||||||
|
|
||||||
|
self.pos_senses[<int>parts_of_speech.ADV] |= one << A_all
|
||||||
|
self.pos_senses[<int>parts_of_speech.ADJ] |= one << J_all
|
||||||
|
self.pos_senses[<int>parts_of_speech.ADJ] |= one << J_pert
|
||||||
|
self.pos_senses[<int>parts_of_speech.ADJ] |= one << J_ppl
|
||||||
|
|
||||||
def __call__(self, Tokens tokens):
|
def __call__(self, Tokens tokens):
|
||||||
cdef atom_t[CONTEXT_SIZE] local_context
|
cdef atom_t[CONTEXT_SIZE] local_context
|
||||||
cdef int i, guess, n_feats
|
cdef int i, guess, n_feats
|
||||||
cdef flags_t valid_senses = 0
|
cdef flags_t valid_senses = 0
|
||||||
cdef TokenC* token
|
cdef TokenC* token
|
||||||
|
cdef flags_t one = 1
|
||||||
cdef FeatureVector features = FeatureVector(100)
|
cdef FeatureVector features = FeatureVector(100)
|
||||||
for i in range(tokens.length):
|
for i in range(tokens.length):
|
||||||
token = &tokens.data[i]
|
token = &tokens.data[i]
|
||||||
if token.lex.senses == 1:
|
|
||||||
continue
|
|
||||||
assert not (token.lex.senses & (1 << NO_SENSE)), (tokens[i].orth_, token.lex.senses)
|
|
||||||
assert not (self.pos_senses[<int>token.pos] & (1 << NO_SENSE))
|
|
||||||
valid_senses = token.lex.senses & self.pos_senses[<int>token.pos]
|
valid_senses = token.lex.senses & self.pos_senses[<int>token.pos]
|
||||||
assert not (valid_senses & (1 << NO_SENSE))
|
if valid_senses >= 2:
|
||||||
if valid_senses:
|
|
||||||
fill_context(local_context, token)
|
fill_context(local_context, token)
|
||||||
local_feats = self.extractor.get_feats(local_context, &n_feats)
|
local_feats = self.extractor.get_feats(local_context, &n_feats)
|
||||||
features.extend(local_feats, n_feats)
|
features.extend(local_feats, n_feats)
|
||||||
scores = self.model.get_scores(features.c, features.length)
|
scores = self.model.get_scores(features.c, features.length)
|
||||||
|
self.weight_scores_by_tagdict(<weight_t*><void*>scores, token, 1.0)
|
||||||
tokens.data[i].sense = self.best_in_set(scores, valid_senses)
|
tokens.data[i].sense = self.best_in_set(scores, valid_senses)
|
||||||
features.clear()
|
features.clear()
|
||||||
|
|
||||||
def train(self, Tokens tokens, GoldParse gold):
|
def train(self, Tokens tokens):
|
||||||
cdef int i, j
|
cdef int i, j
|
||||||
cdef TokenC* token
|
cdef TokenC* token
|
||||||
for i, ssenses in enumerate(gold.ssenses):
|
|
||||||
token = &tokens.data[i]
|
|
||||||
if ssenses:
|
|
||||||
gold.c.ssenses[i] = encode_sense_strs(ssenses)
|
|
||||||
elif token.lex.senses >= 2 and token.pos in (NOUN, VERB):
|
|
||||||
gold.c.ssenses[i] = token.lex.senses & self.pos_senses[<int>token.pos]
|
|
||||||
else:
|
|
||||||
gold.c.ssenses[i] = 0
|
|
||||||
|
|
||||||
cdef atom_t[CONTEXT_SIZE] context
|
cdef atom_t[CONTEXT_SIZE] context
|
||||||
cdef int n_feats
|
cdef int n_feats
|
||||||
cdef feat_t f_key
|
cdef feat_t f_key
|
||||||
|
cdef flags_t best_senses = 0
|
||||||
cdef int f_i
|
cdef int f_i
|
||||||
cdef int cost = 0
|
cdef int cost = 0
|
||||||
for i in range(tokens.length):
|
for i in range(tokens.length):
|
||||||
token = &tokens.data[i]
|
token = &tokens.data[i]
|
||||||
if token.pos in (NOUN, VERB) \
|
pos_senses = self.pos_senses[<int>token.pos]
|
||||||
and token.lex.senses >= 2 \
|
lex_senses = token.lex.senses & pos_senses
|
||||||
and gold.c.ssenses[i] >= 2:
|
if pos_senses >= 2 and lex_senses >= 2:
|
||||||
fill_context(context, token)
|
fill_context(context, token)
|
||||||
feats = self.extractor.get_feats(context, &n_feats)
|
feats = self.extractor.get_feats(context, &n_feats)
|
||||||
scores = self.model.get_scores(feats, n_feats)
|
scores = self.model.get_scores(feats, n_feats)
|
||||||
token.sense = self.best_in_set(scores, self.pos_senses[<int>token.pos])
|
#self.weight_scores_by_tagdict(<weight_t*><void*>scores, token, 0.1)
|
||||||
best = self.best_in_set(scores, gold.c.ssenses[i])
|
guess = self.best_in_set(scores, pos_senses)
|
||||||
|
best = self.best_in_set(scores, lex_senses)
|
||||||
guess_counts = {}
|
guess_counts = {}
|
||||||
gold_counts = {}
|
gold_counts = {}
|
||||||
if token.sense != best:
|
if guess != best:
|
||||||
|
cost += 1
|
||||||
for j in range(n_feats):
|
for j in range(n_feats):
|
||||||
f_key = feats[j].key
|
f_key = feats[j].key
|
||||||
f_i = feats[j].i
|
f_i = feats[j].i
|
||||||
feat = (f_i, f_key)
|
feat = (f_i, f_key)
|
||||||
gold_counts[feat] = gold_counts.get(feat, 0) + 1.0
|
gold_counts[feat] = gold_counts.get(feat, 0) + 1.0
|
||||||
guess_counts[feat] = guess_counts.get(feat, 0) - 1.0
|
guess_counts[feat] = guess_counts.get(feat, 0) - 1.0
|
||||||
self.model.update({token.sense: guess_counts, best: gold_counts})
|
self.model.update({guess: guess_counts, best: gold_counts})
|
||||||
return cost
|
return cost
|
||||||
|
|
||||||
cdef int best_in_set(self, const weight_t* scores, flags_t senses) except -1:
|
cdef int best_in_set(self, const weight_t* scores, flags_t senses) except -1:
|
||||||
cdef weight_t max_ = 0
|
cdef weight_t max_ = 0
|
||||||
cdef int argmax = -1
|
cdef int argmax = -1
|
||||||
cdef flags_t i
|
cdef flags_t i
|
||||||
|
cdef flags_t one = 1
|
||||||
for i in range(N_SENSES):
|
for i in range(N_SENSES):
|
||||||
if (senses & (1 << i)) and (argmax == -1 or scores[i] > max_):
|
if (senses & (one << i)) and (argmax == -1 or scores[i] > max_):
|
||||||
max_ = scores[i]
|
max_ = scores[i]
|
||||||
argmax = i
|
argmax = i
|
||||||
assert argmax >= 0
|
assert argmax >= 0
|
||||||
return argmax
|
return argmax
|
||||||
|
|
||||||
|
@cython.cdivision(True)
|
||||||
|
cdef int weight_scores_by_tagdict(self, weight_t* scores, const TokenC* token,
|
||||||
|
weight_t a) except -1:
|
||||||
|
lemma = self.strings[token.lemma]
|
||||||
|
if token.pos == NOUN:
|
||||||
|
key = lemma + '/n'
|
||||||
|
elif token.pos == VERB:
|
||||||
|
key = lemma + '/v'
|
||||||
|
elif token.pos == ADJ:
|
||||||
|
key = lemma + '/j'
|
||||||
|
elif token.pos == ADV:
|
||||||
|
key = lemma + '/a'
|
||||||
|
else:
|
||||||
|
return 0
|
||||||
|
|
||||||
|
# First softmax the scores
|
||||||
|
cdef int i
|
||||||
|
cdef double total = 0
|
||||||
|
for i in range(N_SENSES):
|
||||||
|
total += exp(scores[i])
|
||||||
|
for i in range(N_SENSES):
|
||||||
|
scores[i] = <weight_t>(exp(scores[i]) / total)
|
||||||
|
|
||||||
|
probs = self.tagdict.get(key, {})
|
||||||
|
for i in range(1, N_SENSES):
|
||||||
|
prob = probs.get(str(i-1), 0)
|
||||||
|
scores[i] = (a * prob) + ((1 - a) * scores[i])
|
||||||
|
|
||||||
|
def end_training(self):
|
||||||
|
self.model.end_training()
|
||||||
|
self.model.dump(path.join(self.model_dir, 'model'), freq_thresh=0)
|
||||||
|
|
||||||
cdef list _set_bits(flags_t flags):
|
cdef list _set_bits(flags_t flags):
|
||||||
bits = []
|
bits = []
|
||||||
cdef flags_t bit
|
cdef flags_t bit
|
||||||
|
cdef flags_t one = 1
|
||||||
for bit in range(N_SENSES):
|
for bit in range(N_SENSES):
|
||||||
if flags & (1 << bit):
|
if flags & (one << bit):
|
||||||
bits.append(bit)
|
bits.append(bit)
|
||||||
return bits
|
return bits
|
||||||
|
|
Loading…
Reference in New Issue
Block a user