* Work on sense tagger

This commit is contained in:
Matthew Honnibal 2015-07-03 15:25:41 +02:00
parent 2fbcdd0ea8
commit fb68df91b8
8 changed files with 101 additions and 116 deletions

View File

@ -69,6 +69,7 @@ cdef class Model:
assert self.n_classes == eg.c.nr_class
memset(eg.c.scores, 0, sizeof(weight_t) * eg.c.nr_class)
self.set_scores(eg.c.scores, eg.c.atoms)
eg.c.guess = arg_max_if_true(eg.c.scores, eg.c.is_valid, self.n_classes)
if eg.c.guess == -1:
raise ValidationError("No valid classes during prediction")

View File

@ -155,7 +155,8 @@ def read_json_file(loc, docs_filter=None):
if labels[-1].lower() == 'root':
labels[-1] = 'ROOT'
ner.append(token.get('ner', '-'))
t_wsd = [s.replace('.', '_') for s in token.get('ssenses', [])]
t_wsd = [s.replace('noun.', 'N_').replace('verb.', 'V_')
for s in token.get('ssenses', [])]
wsd.append(t_wsd)
sents.append((
(ids, words, tags, heads, labels, ner, wsd),

View File

@ -133,6 +133,5 @@ class Scorer(object):
if gold_senses and gold.gold_to_cand[i] is not None:
cand_i = gold.gold_to_cand[i]
sense_str = tokens[cand_i].sense_
sense_str = sense_str.replace('N_', 'noun.').replace('V_', 'verb.')
self.wsd.tp += sense_str in gold_senses
self.wsd.fn += sense_str not in gold_senses

View File

@ -2,9 +2,12 @@ from .typedefs cimport flags_t
from .structs cimport TokenC
from .strings cimport StringStore
from .tokens cimport Tokens
from .senses cimport POS_SENSES, N_SENSES, encode_sense_strs
from .senses cimport N_SENSES, encode_sense_strs
from .senses cimport N_Tops, J_ppl, V_body
from .gold cimport GoldParse
from .parts_of_speech cimport NOUN, VERB
from .parts_of_speech cimport NOUN, VERB, N_UNIV_TAGS
from . cimport parts_of_speech
from thinc.learner cimport LinearModel
from thinc.features cimport Extractor
@ -21,40 +24,30 @@ cdef enum:
P2c
P2c6
P2c4
P2ss
P2s
P1W
P1p
P1c
P1c6
P1c4
P1ss
P1s
N0W
N0p
N0c
N0c6
N0c4
N0ss
N0s
N1W
N1p
N1c
N1c6
N1c4
N1ss
N1s
N2W
N2p
N2c
N2c6
N2c4
N2ss
N2s
CONTEXT_SIZE
@ -67,8 +60,6 @@ unigrams = (
(P2c6, P2p),
(P2c4, P2p),
(P2c,),
(P2ss,),
(P1s,),
(P1W,),
(P1p,),
@ -84,8 +75,6 @@ unigrams = (
(P1c6, P1p),
(P1c4, P1p),
(P1c,),
(P1ss,),
(P1s,),
(N0p,),
(N0W, N0p),
@ -100,7 +89,6 @@ unigrams = (
(N0c6, N0p),
(N0c4, N0p),
(N0c,),
(N0ss,),
(N1p,),
(N1W, N1p),
@ -115,7 +103,6 @@ unigrams = (
(N1c6, N1p),
(N1c4, N1p),
(N1c,),
(N1ss,),
(N2p,),
(N2W, N2p),
@ -130,7 +117,6 @@ unigrams = (
(N2c6, N2p),
(N2c4, N2p),
(N2c,),
(N2ss,),
)
@ -141,27 +127,21 @@ bigrams = (
(P1c, N0p),
(P1c6, N0p),
(P2s, P1s),
(P2ss, P1s,),
(P2ss, P1ss,),
(P1ss, N0ss),
(N0p, N1p,),
)
# Feature templates for the tagger; each entry is a tuple of context-slot ids.
# NOTE(review): this span is shown without diff markers, so neighbouring
# entries that differ only in their last slot (the N0ss/N1ss/N2ss forms next
# to the shorter tuples) are presumably the before/after versions of a single
# line -- confirm against the real file. (P1p, N0p, N1p) also appears twice.
trigrams = (
(P1p, N0p, N1p),
(P2p, P1p, N0ss),
(P2p, P1p,),
(P2c4, P1c4, N0c4),
(P1p, N0p, N1p),
(P1p, N0p, N1ss),
(P1p, N0p,),
(P1c4, N0c4, N1c4),
(N0p, N1p, N2p),
(N0p, N1p, N2ss),
(N0p, N1p,),
(N0c4, N1c4, N2c4),
)
@ -172,8 +152,6 @@ cdef int fill_token(atom_t* ctxt, const TokenC* token) except -1:
ctxt[2] = token.lex.cluster
ctxt[3] = token.lex.cluster & 15
ctxt[4] = token.lex.cluster & 63
ctxt[5] = token.lex.senses & POS_SENSES[<int>token.pos]
ctxt[6] = token.sense
cdef int fill_context(atom_t* ctxt, const TokenC* token) except -1:
@ -194,6 +172,7 @@ cdef class SenseTagger:
cdef readonly LinearModel model
cdef readonly Extractor extractor
cdef readonly model_dir
cdef readonly flags_t[<int>N_UNIV_TAGS] pos_senses
def __init__(self, StringStore strings, model_dir):
if model_dir is not None and path.isdir(model_dir):
@ -207,25 +186,50 @@ cdef class SenseTagger:
self.model.load(self.model_dir, freq_thresh=0)
self.strings = strings
self.pos_senses[<int>parts_of_speech.NO_TAG] = 0
self.pos_senses[<int>parts_of_speech.ADJ] = 0
self.pos_senses[<int>parts_of_speech.ADV] = 0
self.pos_senses[<int>parts_of_speech.ADP] = 0
self.pos_senses[<int>parts_of_speech.CONJ] = 0
self.pos_senses[<int>parts_of_speech.DET] = 0
self.pos_senses[<int>parts_of_speech.NOUN] = 0
self.pos_senses[<int>parts_of_speech.NUM] = 0
self.pos_senses[<int>parts_of_speech.PRON] = 0
self.pos_senses[<int>parts_of_speech.PRT] = 0
self.pos_senses[<int>parts_of_speech.VERB] = 0
self.pos_senses[<int>parts_of_speech.X] = 0
self.pos_senses[<int>parts_of_speech.PUNCT] = 0
self.pos_senses[<int>parts_of_speech.EOL] = 0
# Build the per-POS masks of admissible senses: one bit per sense id, for the
# ids from N_Tops up to (not including) V_body in the NOUN mask, and from
# V_body up to (not including) J_ppl in the VERB mask.
# The loop variable has to be the value that is shifted. Previously the loops
# iterated `_sense` but shifted by `sense`, which stayed 0, so both masks
# ended up containing only bit 0.
cdef flags_t sense = 0
for sense in range(N_Tops, V_body):
    # Cast the literal so the shift is evaluated at flags_t width.
    self.pos_senses[<int>parts_of_speech.NOUN] |= (<flags_t>1) << sense
for sense in range(V_body, J_ppl):
    self.pos_senses[<int>parts_of_speech.VERB] |= (<flags_t>1) << sense
def __call__(self, Tokens tokens):
# Assign a sense to every NOUN or VERB token of `tokens`, in place.
# For each such token: fill the context atoms, extract features, score them
# with the model, then store the sense chosen by best_in_set from the mask of
# senses allowed for the token's part of speech in tokens.data[i].sense.
# Tokens with any other part of speech are left untouched.
# NOTE(review): this span is shown without diff markers. The two `token`
# declarations and the two `.sense` assignments below are presumably the
# before/after versions of one statement each -- confirm against the real file.
cdef atom_t[CONTEXT_SIZE] context
cdef int i, guess, n_feats
cdef const TokenC* token
cdef TokenC* token
for i in range(tokens.length):
token = &tokens.data[i]
if token.pos in (NOUN, VERB):
fill_context(context, token)
feats = self.extractor.get_feats(context, &n_feats)
scores = self.model.get_scores(feats, n_feats)
# Variant using the module-level POS_SENSES table.
tokens.data[i].sense = self.best_in_set(scores, POS_SENSES[<int>token.pos])
# Variant using the per-instance pos_senses table.
tokens.data[i].sense = self.best_in_set(scores, self.pos_senses[<int>token.pos])
def train(self, Tokens tokens, GoldParse gold):
cdef int i, j
cdef TokenC* token
for i, ssenses in enumerate(gold.ssenses):
token = &tokens.data[i]
if ssenses:
gold.c.ssenses[i] = encode_sense_strs(ssenses)
else:
gold.c.ssenses[i] = pos_senses(&tokens.data[i])
gold.c.ssenses[i] = token.lex.senses & self.pos_senses[<int>token.pos]
cdef atom_t[CONTEXT_SIZE] context
cdef int n_feats
@ -240,7 +244,7 @@ cdef class SenseTagger:
fill_context(context, token)
feats = self.extractor.get_feats(context, &n_feats)
scores = self.model.get_scores(feats, n_feats)
token.sense = self.best_in_set(scores, POS_SENSES[<int>token.pos])
token.sense = self.best_in_set(scores, token.lex.senses)
best = self.best_in_set(scores, gold.c.ssenses[i])
guess_counts = {}
gold_counts = {}
@ -251,7 +255,7 @@ cdef class SenseTagger:
feat = (f_i, f_key)
gold_counts[feat] = gold_counts.get(feat, 0) + 1.0
guess_counts[feat] = guess_counts.get(feat, 0) - 1.0
#self.model.update({token.sense: guess_counts, best: gold_counts})
self.model.update({token.sense: guess_counts, best: gold_counts})
return cost
cdef int best_in_set(self, const weight_t* scores, flags_t senses) except -1:
@ -266,10 +270,6 @@ cdef class SenseTagger:
return argmax
cdef flags_t pos_senses(const TokenC* token) nogil:
    # Intersect the lexeme's sense bits with the POS_SENSES mask that is
    # indexed by the token's part of speech.
    cdef flags_t allowed = POS_SENSES[<int>token.pos]
    return token.lex.senses & allowed
cdef list _set_bits(flags_t flags):
bits = []
cdef flags_t bit

View File

@ -4,13 +4,17 @@ from .typedefs cimport flags_t
cpdef enum:
NO_SENSE
J_all
J_pert
A_all
N_Tops
N_act
N_animal
N_artifact
N_attribute
N_body
N_cognition
N_communication
N_communication
N_event
N_feeling
N_food
@ -44,10 +48,8 @@ cpdef enum:
V_social
V_stative
V_weather
J_ppl
N_SENSES
cdef flags_t[<int>parts_of_speech.N_UNIV_TAGS] POS_SENSES
cdef flags_t encode_sense_strs(sense_names) except 0

View File

@ -2,74 +2,56 @@ from __future__ import unicode_literals
cimport parts_of_speech
# Give each listed part-of-speech entry of POS_SENSES an empty sense mask
# (no bits set) before any sense bits are OR-ed in.
POS_SENSES[<int>parts_of_speech.NO_TAG] = 0
POS_SENSES[<int>parts_of_speech.ADJ] = 0
POS_SENSES[<int>parts_of_speech.ADV] = 0
POS_SENSES[<int>parts_of_speech.ADP] = 0
POS_SENSES[<int>parts_of_speech.CONJ] = 0
POS_SENSES[<int>parts_of_speech.DET] = 0
POS_SENSES[<int>parts_of_speech.NOUN] = 0
POS_SENSES[<int>parts_of_speech.NUM] = 0
POS_SENSES[<int>parts_of_speech.PRON] = 0
POS_SENSES[<int>parts_of_speech.PRT] = 0
POS_SENSES[<int>parts_of_speech.VERB] = 0
POS_SENSES[<int>parts_of_speech.X] = 0
POS_SENSES[<int>parts_of_speech.PUNCT] = 0
POS_SENSES[<int>parts_of_speech.EOL] = 0
# Table of sense categories, one per line: a numeric code, the sense name and
# a trailing category number. The code shown here only reads the name column
# (field 1 of each line), so the order of the lines defines each sense's id
# and has to stay in step with the sense enum (NO_SENSE, J_all, J_pert, ...).
# Rows 01 and 44 carry category 3 like J_all and are named J_pert / J_ppl in
# the enum, so they use the J_ prefix here too (they were A_pert / A_ppl).
lexnames_str = """
-1 NO_SENSE -1
00 J_all 3
01 J_pert 3
02 A_all 4
03 N_Tops 1
04 N_act 1
05 N_animal 1
06 N_artifact 1
07 N_attribute 1
08 N_body 1
09 N_cognition 1
10 N_communication 1
11 N_event 1
12 N_feeling 1
13 N_food 1
14 N_group 1
15 N_location 1
16 N_motive 1
17 N_object 1
18 N_person 1
19 N_phenomenon 1
20 N_plant 1
21 N_possession 1
22 N_process 1
23 N_quantity 1
24 N_relation 1
25 N_shape 1
26 N_state 1
27 N_substance 1
28 N_time 1
29 V_body 2
30 V_change 2
31 V_cognition 2
32 V_communication 2
33 V_competition 2
34 V_consumption 2
35 V_contact 2
36 V_creation 2
37 V_emotion 2
38 V_motion 2
39 V_perception 2
40 V_possession 2
41 V_social 2
42 V_stative 2
43 V_weather 2
44 J_ppl 3
""".strip()
# OR one bit per sense id into the POS_SENSES masks: ids from N_act up to (not
# including) V_body go into the NOUN mask, ids from V_body through V_weather
# (hence the +1) go into the VERB mask.
cdef int _sense = 0
for _sense in range(N_act, V_body):
POS_SENSES[<int>parts_of_speech.NOUN] |= 1 << _sense
for _sense in range(V_body, V_weather+1):
POS_SENSES[<int>parts_of_speech.VERB] |= 1 << _sense
# Literal tuple of sense names: a no-sense placeholder first, then the N_*
# names, then the V_* names.
# NOTE(review): in this view a second assignment to STRINGS (derived from
# lexnames_str) follows directly after this tuple and would replace it;
# presumably this literal is the pre-change version -- confirm against the
# real file.
STRINGS = (
'-NO_SENSE-',
'N_act',
'N_animal',
'N_artifact',
'N_attribute',
'N_body',
'N_cognition',
'N_communication',
'N_event',
'N_feeling',
'N_food',
'N_group',
'N_location',
'N_motive',
'N_object',
'N_person',
'N_phenomenon',
'N_plant',
'N_possession',
'N_process',
'N_quantity',
'N_relation',
'N_shape',
'N_state',
'N_substance',
'N_time',
'V_body',
'V_change',
'V_cognition',
'V_communication',
'V_competition',
'V_consumption',
'V_contact',
'V_creation',
'V_emotion',
'V_motion',
'V_perception',
'V_possession',
'V_social',
'V_stative',
'V_weather'
)
# Sense names in table order: the second whitespace-separated field of every
# line of lexnames_str.
STRINGS = tuple(
    row.split()[1]
    for row in lexnames_str.split('\n')
)
# Reverse lookup from a sense name to its position in STRINGS.
IDS = {name: index for index, name in enumerate(STRINGS)}
@ -80,8 +62,8 @@ cdef flags_t encode_sense_strs(sense_names) except 0:
return sense_bits | (1 << NO_SENSE)
cdef flags_t sense_id = 0
for sense_str in sense_names:
if '.' in sense_str:
sense_str = sense_str[0].upper() + '_' + sense_str.split('.')[1]
sense_str = sense_str.replace('noun', 'N').replace('verb', 'V')
sense_str = sense_str.replace('adj', 'J').replace('adv', 'A')
sense_id = IDS[sense_str]
sense_bits |= (1 << sense_id)
return sense_bits

View File

@ -61,7 +61,7 @@ cdef inline void fill_token(atom_t* context, const TokenC* token) nogil:
context[9] = token.lex.shape
context[10] = token.ent_iob
context[11] = token.ent_type
context[12] = token.lex.senses & senses.POS_SENSES[<int>token.pos]
context[12] = 0 # token.lex.senses & senses.POS_SENSES[<int>token.pos]
cdef int fill_context(atom_t* ctxt, StateClass st) nogil:
# Take care to fill every element of context!

View File

@ -93,7 +93,7 @@ cdef class Tokens:
else:
size = 5
self.mem = Pool()
# Guarantee self.lex[i-x], for any i >= 0 and x < padding is in bounds
# Guarantee self.data[i-x], for any i >= 0 and x < padding is in bounds
# However, we need to remember the true starting places, so that we can
# realloc.
data_start = <TokenC*>self.mem.alloc(size + (PADDING*2), sizeof(TokenC))