* Work on sense tagger

This commit is contained in:
Matthew Honnibal 2015-07-03 15:25:41 +02:00
parent 2fbcdd0ea8
commit fb68df91b8
8 changed files with 101 additions and 116 deletions

View File

@@ -69,6 +69,7 @@ cdef class Model:
         assert self.n_classes == eg.c.nr_class
         memset(eg.c.scores, 0, sizeof(weight_t) * eg.c.nr_class)
         self.set_scores(eg.c.scores, eg.c.atoms)
         eg.c.guess = arg_max_if_true(eg.c.scores, eg.c.is_valid, self.n_classes)
         if eg.c.guess == -1:
             raise ValidationError("No valid classes during prediction")

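For reference, arg_max_if_true picks the highest-scoring class among those flagged valid and returns -1 when no class is valid, which the guard above surfaces as a ValidationError rather than a silent bad prediction. A minimal Python sketch of that contract (the real routine works on raw C arrays of weight_t):

    def arg_max_if_true(scores, is_valid, n_classes):
        # Highest-scoring class among the valid ones; -1 if none is valid.
        best = -1
        for i in range(n_classes):
            if is_valid[i] and (best == -1 or scores[i] > scores[best]):
                best = i
        return best

    assert arg_max_if_true([0.1, 0.9, 0.4], [True, False, True], 3) == 2
    assert arg_max_if_true([0.1, 0.9], [False, False], 2) == -1
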
View File

@@ -155,7 +155,8 @@ def read_json_file(loc, docs_filter=None):
                     if labels[-1].lower() == 'root':
                         labels[-1] = 'ROOT'
                     ner.append(token.get('ner', '-'))
-                    t_wsd = [s.replace('.', '_') for s in token.get('ssenses', [])]
+                    t_wsd = [s.replace('noun.', 'N_').replace('verb.', 'V_')
+                             for s in token.get('ssenses', [])]
                     wsd.append(t_wsd)
             sents.append((
                 (ids, words, tags, heads, labels, ner, wsd),

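The rewritten list comprehension narrows the old blanket '.'-to-'_' substitution: only the WordNet part-of-speech prefixes are rewritten, so gold supersense labels arrive in the same N_*/V_* form the tagger uses internally. A quick sketch of the difference:

    ssenses = ['noun.food', 'verb.motion']

    old = [s.replace('.', '_') for s in ssenses]
    new = [s.replace('noun.', 'N_').replace('verb.', 'V_') for s in ssenses]

    assert old == ['noun_food', 'verb_motion']  # doesn't match the N_*/V_* names
    assert new == ['N_food', 'V_motion']        # matches the internal sense strings
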
View File

@@ -133,6 +133,5 @@ class Scorer(object):
             if gold_senses and gold.gold_to_cand[i] is not None:
                 cand_i = gold.gold_to_cand[i]
                 sense_str = tokens[cand_i].sense_
-                sense_str = sense_str.replace('N_', 'noun.').replace('V_', 'verb.')
                 self.wsd.tp += sense_str in gold_senses
                 self.wsd.fn += sense_str not in gold_senses

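With gold senses now stored in the internal form, the dropped line's back-mapping to 'noun.'/'verb.' is redundant: tokens[cand_i].sense_ can be compared against gold_senses directly. The tp/fn tallies then feed precision/recall/F-measure in the usual way; a sketch, assuming a PRFScore-style accumulator like the ones Scorer keeps for its other metrics:

    class PRFScore(object):
        def __init__(self):
            self.tp = self.fp = self.fn = 0.0

        @property
        def precision(self):
            return self.tp / max(self.tp + self.fp, 1e-10)

        @property
        def recall(self):
            return self.tp / max(self.tp + self.fn, 1e-10)

        @property
        def fscore(self):
            p, r = self.precision, self.recall
            return 2 * p * r / max(p + r, 1e-10)
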
View File

@@ -2,9 +2,12 @@ from .typedefs cimport flags_t
 from .structs cimport TokenC
 from .strings cimport StringStore
 from .tokens cimport Tokens
-from .senses cimport POS_SENSES, N_SENSES, encode_sense_strs
+from .senses cimport N_SENSES, encode_sense_strs
+from .senses cimport N_Tops, J_ppl, V_body
 from .gold cimport GoldParse
-from .parts_of_speech cimport NOUN, VERB
+from .parts_of_speech cimport NOUN, VERB, N_UNIV_TAGS
+from . cimport parts_of_speech
 
 from thinc.learner cimport LinearModel
 from thinc.features cimport Extractor
@@ -21,40 +24,30 @@ cdef enum:
     P2c
     P2c6
     P2c4
-    P2ss
-    P2s
 
     P1W
     P1p
     P1c
     P1c6
     P1c4
-    P1ss
-    P1s
 
     N0W
     N0p
     N0c
     N0c6
     N0c4
-    N0ss
-    N0s
 
     N1W
     N1p
     N1c
     N1c6
     N1c4
-    N1ss
-    N1s
 
     N2W
     N2p
     N2c
     N2c6
     N2c4
-    N2ss
-    N2s
 
     CONTEXT_SIZE
@@ -67,8 +60,6 @@ unigrams = (
     (P2c6, P2p),
     (P2c4, P2p),
     (P2c,),
-    (P2ss,),
-    (P1s,),
 
     (P1W,),
     (P1p,),
@@ -84,8 +75,6 @@ unigrams = (
     (P1c6, P1p),
     (P1c4, P1p),
     (P1c,),
-    (P1ss,),
-    (P1s,),
 
     (N0p,),
     (N0W, N0p),
@@ -100,7 +89,6 @@ unigrams = (
     (N0c6, N0p),
     (N0c4, N0p),
     (N0c,),
-    (N0ss,),
 
     (N1p,),
     (N1W, N1p),
@@ -115,7 +103,6 @@ unigrams = (
     (N1c6, N1p),
     (N1c4, N1p),
     (N1c,),
-    (N1ss,),
 
     (N2p,),
     (N2W, N2p),
@@ -130,7 +117,6 @@ unigrams = (
     (N2c6, N2p),
     (N2c4, N2p),
     (N2c,),
-    (N2ss,),
 )
@@ -141,27 +127,21 @@ bigrams = (
     (P1c, N0p),
     (P1c6, N0p),
-    (P2s, P1s),
-    (P2ss, P1s,),
-    (P2ss, P1ss,),
-    (P1ss, N0ss),
     (N0p, N1p,),
 )
 
 trigrams = (
     (P1p, N0p, N1p),
-    (P2p, P1p, N0ss),
+    (P2p, P1p,),
     (P2c4, P1c4, N0c4),
 
     (P1p, N0p, N1p),
-    (P1p, N0p, N1ss),
+    (P1p, N0p,),
     (P1c4, N0c4, N1c4),
 
     (N0p, N1p, N2p),
-    (N0p, N1p, N2ss),
+    (N0p, N1p,),
     (N0c4, N1c4, N2c4),
 )
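The deleted ss/s slots (supersense flags and predicted sense) disappear from the unigram, bigram, and trigram tables above, leaving word, tag, and Brown-cluster features. Each template is a tuple of context-slot indices, and the extractor turns the values in those slots into one hashed feature per template. A rough sketch of the scheme, with hypothetical slot names and values:

    # Each template names context slots; a feature is the template id plus
    # the tuple of values found in those slots (hashed together in thinc).
    P1p, N0p, N1p = 0, 1, 2          # context-slot indices (illustrative)
    templates = ((P1p,), (N0p,), (P1p, N0p, N1p))

    def get_feats(context, templates):
        return [(t_id, tuple(context[slot] for slot in tmpl))
                for t_id, tmpl in enumerate(templates)]

    print(get_feats([14, 92, 14], templates))
    # [(0, (14,)), (1, (92,)), (2, (14, 92, 14))]
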
@@ -172,8 +152,6 @@ cdef int fill_token(atom_t* ctxt, const TokenC* token) except -1:
     ctxt[2] = token.lex.cluster
     ctxt[3] = token.lex.cluster & 15
     ctxt[4] = token.lex.cluster & 63
-    ctxt[5] = token.lex.senses & POS_SENSES[<int>token.pos]
-    ctxt[6] = token.sense
 
 
 cdef int fill_context(atom_t* ctxt, const TokenC* token) except -1:
@@ -194,6 +172,7 @@ cdef class SenseTagger:
     cdef readonly LinearModel model
     cdef readonly Extractor extractor
     cdef readonly model_dir
+    cdef readonly flags_t[<int>N_UNIV_TAGS] pos_senses
 
     def __init__(self, StringStore strings, model_dir):
         if model_dir is not None and path.isdir(model_dir):
@@ -207,25 +186,50 @@ cdef class SenseTagger:
             self.model.load(self.model_dir, freq_thresh=0)
         self.strings = strings
 
+        self.pos_senses[<int>parts_of_speech.NO_TAG] = 0
+        self.pos_senses[<int>parts_of_speech.ADJ] = 0
+        self.pos_senses[<int>parts_of_speech.ADV] = 0
+        self.pos_senses[<int>parts_of_speech.ADP] = 0
+        self.pos_senses[<int>parts_of_speech.CONJ] = 0
+        self.pos_senses[<int>parts_of_speech.DET] = 0
+        self.pos_senses[<int>parts_of_speech.NOUN] = 0
+        self.pos_senses[<int>parts_of_speech.NUM] = 0
+        self.pos_senses[<int>parts_of_speech.PRON] = 0
+        self.pos_senses[<int>parts_of_speech.PRT] = 0
+        self.pos_senses[<int>parts_of_speech.VERB] = 0
+        self.pos_senses[<int>parts_of_speech.X] = 0
+        self.pos_senses[<int>parts_of_speech.PUNCT] = 0
+        self.pos_senses[<int>parts_of_speech.EOL] = 0
+
+        cdef flags_t sense = 0
+        for sense in range(N_Tops, V_body):
+            self.pos_senses[<int>parts_of_speech.NOUN] |= 1 << sense
+        for sense in range(V_body, J_ppl):
+            self.pos_senses[<int>parts_of_speech.VERB] |= 1 << sense
+
     def __call__(self, Tokens tokens):
         cdef atom_t[CONTEXT_SIZE] context
         cdef int i, guess, n_feats
-        cdef const TokenC* token
+        cdef TokenC* token
         for i in range(tokens.length):
             token = &tokens.data[i]
             if token.pos in (NOUN, VERB):
                 fill_context(context, token)
                 feats = self.extractor.get_feats(context, &n_feats)
                 scores = self.model.get_scores(feats, n_feats)
-                tokens.data[i].sense = self.best_in_set(scores, POS_SENSES[<int>token.pos])
+                tokens.data[i].sense = self.best_in_set(scores, self.pos_senses[<int>token.pos])
 
     def train(self, Tokens tokens, GoldParse gold):
         cdef int i, j
+        cdef TokenC* token
         for i, ssenses in enumerate(gold.ssenses):
+            token = &tokens.data[i]
             if ssenses:
                 gold.c.ssenses[i] = encode_sense_strs(ssenses)
             else:
-                gold.c.ssenses[i] = pos_senses(&tokens.data[i])
+                gold.c.ssenses[i] = token.lex.senses & self.pos_senses[<int>token.pos]
 
         cdef atom_t[CONTEXT_SIZE] context
         cdef int n_feats
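The per-tag masks built in __init__ replace the module-level POS_SENSES table: every universal tag starts with an empty mask, then the noun block of the sense enum is OR-ed into NOUN and the verb block into VERB. Since each sense is one bit of a flags_t, a mask is just the OR of 1 << sense over a contiguous enum range; a small sketch, with enum positions assumed from the lexnames table:

    def block_mask(start, end):
        # OR together one bit per sense id in [start, end).
        mask = 0
        for sense in range(start, end):
            mask |= 1 << sense
        return mask

    N_Tops, V_body, J_ppl = 4, 30, 45   # assumed enum positions
    noun_mask = block_mask(N_Tops, V_body)
    verb_mask = block_mask(V_body, J_ppl)
    assert noun_mask & verb_mask == 0   # noun and verb blocks don't overlap
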
@@ -240,7 +244,7 @@ cdef class SenseTagger:
             fill_context(context, token)
             feats = self.extractor.get_feats(context, &n_feats)
             scores = self.model.get_scores(feats, n_feats)
-            token.sense = self.best_in_set(scores, POS_SENSES[<int>token.pos])
+            token.sense = self.best_in_set(scores, token.lex.senses)
             best = self.best_in_set(scores, gold.c.ssenses[i])
             guess_counts = {}
             gold_counts = {}
@@ -251,7 +255,7 @@ cdef class SenseTagger:
                     feat = (f_i, f_key)
                     gold_counts[feat] = gold_counts.get(feat, 0) + 1.0
                     guess_counts[feat] = guess_counts.get(feat, 0) - 1.0
-            #self.model.update({token.sense: guess_counts, best: gold_counts})
+            self.model.update({token.sense: guess_counts, best: gold_counts})
         return cost
 
     cdef int best_in_set(self, const weight_t* scores, flags_t senses) except -1:
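Uncommenting self.model.update switches training back on. The update is a perceptron-style step over sparse feature counts: features extracted for the token earn +1 under the gold-best class and -1 under the wrong guess, passed to the model as a dict keyed by class id. A minimal sketch of that rule, assuming a plain dict-of-dicts weight table rather than thinc's LinearModel:

    weights = {}  # class id -> {feature: weight}

    def update(counts_by_class):
        # Add each feature's count (+1 for gold, -1 for the guess) to the
        # corresponding class's weights.
        for clas, feat_counts in counts_by_class.items():
            w = weights.setdefault(clas, {})
            for feat, upd in feat_counts.items():
                w[feat] = w.get(feat, 0.0) + upd

    guess, best = 7, 12
    feats = [(0, 1234), (1, 5678)]
    update({guess: dict((f, -1.0) for f in feats),
            best: dict((f, +1.0) for f in feats)})
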
@@ -266,10 +270,6 @@ cdef class SenseTagger:
         return argmax
 
 
-cdef flags_t pos_senses(const TokenC* token) nogil:
-    return token.lex.senses & POS_SENSES[<int>token.pos]
-
-
 cdef list _set_bits(flags_t flags):
     bits = []
     cdef flags_t bit
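The free-standing pos_senses helper is gone; callers now AND token.lex.senses with the tagger's own per-tag mask inline. The _set_bits helper kept below goes the other way, unpacking a flags_t word back into individual sense ids, roughly:

    def set_bits(flags):
        # List the positions of the set bits in a 64-bit flag word.
        return [bit for bit in range(64) if flags & (1 << bit)]

    assert set_bits(0b101001) == [0, 3, 5]
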

View File

@@ -4,6 +4,10 @@ from .typedefs cimport flags_t
 
 cpdef enum:
     NO_SENSE
+    J_all
+    J_pert
+    A_all
+    N_Tops
     N_act
     N_animal
     N_artifact
@@ -44,10 +48,8 @@ cpdef enum:
     V_social
     V_stative
     V_weather
+    J_ppl
     N_SENSES
 
-cdef flags_t[<int>parts_of_speech.N_UNIV_TAGS] POS_SENSES
-
 cdef flags_t encode_sense_strs(sense_names) except 0

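The new enum entries slot the adjective/adverb lexnames around the existing noun and verb blocks, mirroring the table added in senses.pyx, and the POS_SENSES declaration moves into SenseTagger. Because every sense is one bit of a flags_t, the whole enum has to fit in a single machine word; a quick sanity check, assuming flags_t is 64 bits and counting NO_SENSE plus the 45 lexname entries:

    N_SENSES = 46          # assumed: NO_SENSE + 45 lexname rows
    assert N_SENSES <= 64  # each sense needs its own bit in a 64-bit flags_t
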
View File

@@ -2,74 +2,56 @@ from __future__ import unicode_literals
 cimport parts_of_speech
 
-POS_SENSES[<int>parts_of_speech.NO_TAG] = 0
-POS_SENSES[<int>parts_of_speech.ADJ] = 0
-POS_SENSES[<int>parts_of_speech.ADV] = 0
-POS_SENSES[<int>parts_of_speech.ADP] = 0
-POS_SENSES[<int>parts_of_speech.CONJ] = 0
-POS_SENSES[<int>parts_of_speech.DET] = 0
-POS_SENSES[<int>parts_of_speech.NOUN] = 0
-POS_SENSES[<int>parts_of_speech.NUM] = 0
-POS_SENSES[<int>parts_of_speech.PRON] = 0
-POS_SENSES[<int>parts_of_speech.PRT] = 0
-POS_SENSES[<int>parts_of_speech.VERB] = 0
-POS_SENSES[<int>parts_of_speech.X] = 0
-POS_SENSES[<int>parts_of_speech.PUNCT] = 0
-POS_SENSES[<int>parts_of_speech.EOL] = 0
-
-cdef int _sense = 0
-
-for _sense in range(N_act, V_body):
-    POS_SENSES[<int>parts_of_speech.NOUN] |= 1 << _sense
-
-for _sense in range(V_body, V_weather+1):
-    POS_SENSES[<int>parts_of_speech.VERB] |= 1 << _sense
-
-STRINGS = (
-    '-NO_SENSE-',
-    'N_act',
-    'N_animal',
-    'N_artifact',
-    'N_attribute',
-    'N_body',
-    'N_cognition',
-    'N_communication',
-    'N_event',
-    'N_feeling',
-    'N_food',
-    'N_group',
-    'N_location',
-    'N_motive',
-    'N_object',
-    'N_person',
-    'N_phenomenon',
-    'N_plant',
-    'N_possession',
-    'N_process',
-    'N_quantity',
-    'N_relation',
-    'N_shape',
-    'N_state',
-    'N_substance',
-    'N_time',
-    'V_body',
-    'V_change',
-    'V_cognition',
-    'V_communication',
-    'V_competition',
-    'V_consumption',
-    'V_contact',
-    'V_creation',
-    'V_emotion',
-    'V_motion',
-    'V_perception',
-    'V_possession',
-    'V_social',
-    'V_stative',
-    'V_weather'
-)
+lexnames_str = """
+-1 NO_SENSE -1
+00 J_all 3
+01 A_pert 3
+02 A_all 4
+03 N_Tops 1
+04 N_act 1
+05 N_animal 1
+06 N_artifact 1
+07 N_attribute 1
+08 N_body 1
+09 N_cognition 1
+10 N_communication 1
+11 N_event 1
+12 N_feeling 1
+13 N_food 1
+14 N_group 1
+15 N_location 1
+16 N_motive 1
+17 N_object 1
+18 N_person 1
+19 N_phenomenon 1
+20 N_plant 1
+21 N_possession 1
+22 N_process 1
+23 N_quantity 1
+24 N_relation 1
+25 N_shape 1
+26 N_state 1
+27 N_substance 1
+28 N_time 1
+29 V_body 2
+30 V_change 2
+31 V_cognition 2
+32 V_communication 2
+33 V_competition 2
+34 V_consumption 2
+35 V_contact 2
+36 V_creation 2
+37 V_emotion 2
+38 V_motion 2
+39 V_perception 2
+40 V_possession 2
+41 V_social 2
+42 V_stative 2
+43 V_weather 2
+44 A_ppl 3
+""".strip()
+
+STRINGS = tuple(line.split()[1] for line in lexnames_str.split('\n'))
 
 IDS = dict((sense_str, i) for i, sense_str in enumerate(STRINGS))
@@ -80,8 +62,8 @@ cdef flags_t encode_sense_strs(sense_names) except 0:
         return sense_bits | (1 << NO_SENSE)
     cdef flags_t sense_id = 0
     for sense_str in sense_names:
-        if '.' in sense_str:
-            sense_str = sense_str[0].upper() + '_' + sense_str.split('.')[1]
+        sense_str = sense_str.replace('noun', 'N').replace('verb', 'V')
+        sense_str = sense_str.replace('adj', 'J').replace('adv', 'A')
         sense_id = IDS[sense_str]
         sense_bits |= (1 << sense_id)
     return sense_bits

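The hand-maintained STRINGS tuple is replaced by the table pasted from WordNet's lexnames file: column two is the lexicographer file name, with the noun./verb./adj./adv. prefixes shortened to N_/V_/J_/A_. Because STRINGS preserves the table's row order, it stays aligned with the enum in senses.pxd, so IDS maps each name to its bit position. encode_sense_strs then folds a list of gold labels into one flags_t, falling back to the NO_SENSE bit for an empty list (0 is reserved as the error sentinel by the 'except 0' signature). A sketch of the round trip, with assumed bit positions:

    def encode_sense_strs(sense_names, ids):
        # OR one bit per named sense; empty input degrades to NO_SENSE.
        if not sense_names:
            return 1 << ids['NO_SENSE']
        bits = 0
        for name in sense_names:
            name = name.replace('noun', 'N').replace('verb', 'V')
            name = name.replace('adj', 'J').replace('adv', 'A')
            bits |= 1 << ids[name]
        return bits

    ids = {'NO_SENSE': 0, 'N_food': 14, 'V_motion': 39}  # assumed positions
    assert encode_sense_strs(['noun_food'], ids) == 1 << 14
    assert encode_sense_strs([], ids) == 1 << 0
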
View File

@@ -61,7 +61,7 @@ cdef inline void fill_token(atom_t* context, const TokenC* token) nogil:
     context[9] = token.lex.shape
     context[10] = token.ent_iob
     context[11] = token.ent_type
-    context[12] = token.lex.senses & senses.POS_SENSES[<int>token.pos]
+    context[12] = 0  # token.lex.senses & senses.POS_SENSES[<int>token.pos]
 
 cdef int fill_context(atom_t* ctxt, StateClass st) nogil:
     # Take care to fill every element of context!

View File

@@ -93,7 +93,7 @@ cdef class Tokens:
         else:
             size = 5
         self.mem = Pool()
-        # Guarantee self.lex[i-x], for any i >= 0 and x < padding is in bounds
+        # Guarantee self.data[i-x], for any i >= 0 and x < padding is in bounds
         # However, we need to remember the true starting places, so that we can
         # realloc.
         data_start = <TokenC*>self.mem.alloc(size + (PADDING*2), sizeof(TokenC))
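The comment fix names the right field: it is self.data, not self.lex, that must stay addressable a few positions before index 0. The allocation below achieves that by reserving PADDING extra TokenC slots on each side and exposing a pointer offset past the left pad, while data_start remembers the true block start for realloc and free. The same idea in Python terms, under the assumption that PADDING tokens of slack suffice on either side:

    PADDING = 5

    def alloc_padded(size, fill=None):
        # Allocate size + 2*PADDING slots and expose an offset view, so that
        # view[i - x] stays in bounds for any i >= 0 and x < PADDING.
        data_start = [fill] * (size + PADDING * 2)
        offset = PADDING            # remember the true start for realloc/free
        return data_start, offset

    buf, off = alloc_padded(10)
    assert buf[off + 0 - 3] is None  # data[i - 3] is in bounds, not an error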