mirror of
https://github.com/explosion/spaCy.git
synced 2025-02-10 16:40:34 +03:00
* Work on sense tagger
This commit is contained in:
parent
2fbcdd0ea8
commit
fb68df91b8
|
@ -69,6 +69,7 @@ cdef class Model:
|
|||
assert self.n_classes == eg.c.nr_class
|
||||
memset(eg.c.scores, 0, sizeof(weight_t) * eg.c.nr_class)
|
||||
self.set_scores(eg.c.scores, eg.c.atoms)
|
||||
|
||||
eg.c.guess = arg_max_if_true(eg.c.scores, eg.c.is_valid, self.n_classes)
|
||||
if eg.c.guess == -1:
|
||||
raise ValidationError("No valid classes during prediction")
|
||||
|
|
|
@ -155,7 +155,8 @@ def read_json_file(loc, docs_filter=None):
|
|||
if labels[-1].lower() == 'root':
|
||||
labels[-1] = 'ROOT'
|
||||
ner.append(token.get('ner', '-'))
|
||||
t_wsd = [s.replace('.', '_') for s in token.get('ssenses', [])]
|
||||
t_wsd = [s.replace('noun.', 'N_').replace('verb.', 'V_')
|
||||
for s in token.get('ssenses', [])]
|
||||
wsd.append(t_wsd)
|
||||
sents.append((
|
||||
(ids, words, tags, heads, labels, ner, wsd),
|
||||
|
|
|
@ -133,6 +133,5 @@ class Scorer(object):
|
|||
if gold_senses and gold.gold_to_cand[i] is not None:
|
||||
cand_i = gold.gold_to_cand[i]
|
||||
sense_str = tokens[cand_i].sense_
|
||||
sense_str = sense_str.replace('N_', 'noun.').replace('V_', 'verb.')
|
||||
self.wsd.tp += sense_str in gold_senses
|
||||
self.wsd.fn += sense_str not in gold_senses
|
||||
|
|
|
@ -2,9 +2,12 @@ from .typedefs cimport flags_t
|
|||
from .structs cimport TokenC
|
||||
from .strings cimport StringStore
|
||||
from .tokens cimport Tokens
|
||||
from .senses cimport POS_SENSES, N_SENSES, encode_sense_strs
|
||||
from .senses cimport N_SENSES, encode_sense_strs
|
||||
from .senses cimport N_Tops, J_ppl, V_body
|
||||
from .gold cimport GoldParse
|
||||
from .parts_of_speech cimport NOUN, VERB
|
||||
from .parts_of_speech cimport NOUN, VERB, N_UNIV_TAGS
|
||||
|
||||
from . cimport parts_of_speech
|
||||
|
||||
from thinc.learner cimport LinearModel
|
||||
from thinc.features cimport Extractor
|
||||
|
@ -21,40 +24,30 @@ cdef enum:
|
|||
P2c
|
||||
P2c6
|
||||
P2c4
|
||||
P2ss
|
||||
P2s
|
||||
|
||||
P1W
|
||||
P1p
|
||||
P1c
|
||||
P1c6
|
||||
P1c4
|
||||
P1ss
|
||||
P1s
|
||||
|
||||
N0W
|
||||
N0p
|
||||
N0c
|
||||
N0c6
|
||||
N0c4
|
||||
N0ss
|
||||
N0s
|
||||
|
||||
N1W
|
||||
N1p
|
||||
N1c
|
||||
N1c6
|
||||
N1c4
|
||||
N1ss
|
||||
N1s
|
||||
|
||||
N2W
|
||||
N2p
|
||||
N2c
|
||||
N2c6
|
||||
N2c4
|
||||
N2ss
|
||||
N2s
|
||||
|
||||
CONTEXT_SIZE
|
||||
|
||||
|
@ -67,8 +60,6 @@ unigrams = (
|
|||
(P2c6, P2p),
|
||||
(P2c4, P2p),
|
||||
(P2c,),
|
||||
(P2ss,),
|
||||
(P1s,),
|
||||
|
||||
(P1W,),
|
||||
(P1p,),
|
||||
|
@ -84,8 +75,6 @@ unigrams = (
|
|||
(P1c6, P1p),
|
||||
(P1c4, P1p),
|
||||
(P1c,),
|
||||
(P1ss,),
|
||||
(P1s,),
|
||||
|
||||
(N0p,),
|
||||
(N0W, N0p),
|
||||
|
@ -100,7 +89,6 @@ unigrams = (
|
|||
(N0c6, N0p),
|
||||
(N0c4, N0p),
|
||||
(N0c,),
|
||||
(N0ss,),
|
||||
|
||||
(N1p,),
|
||||
(N1W, N1p),
|
||||
|
@ -115,7 +103,6 @@ unigrams = (
|
|||
(N1c6, N1p),
|
||||
(N1c4, N1p),
|
||||
(N1c,),
|
||||
(N1ss,),
|
||||
|
||||
(N2p,),
|
||||
(N2W, N2p),
|
||||
|
@ -130,7 +117,6 @@ unigrams = (
|
|||
(N2c6, N2p),
|
||||
(N2c4, N2p),
|
||||
(N2c,),
|
||||
(N2ss,),
|
||||
)
|
||||
|
||||
|
||||
|
@ -141,27 +127,21 @@ bigrams = (
|
|||
(P1c, N0p),
|
||||
(P1c6, N0p),
|
||||
|
||||
(P2s, P1s),
|
||||
(P2ss, P1s,),
|
||||
(P2ss, P1ss,),
|
||||
|
||||
(P1ss, N0ss),
|
||||
|
||||
(N0p, N1p,),
|
||||
)
|
||||
|
||||
|
||||
trigrams = (
|
||||
(P1p, N0p, N1p),
|
||||
(P2p, P1p, N0ss),
|
||||
(P2p, P1p,),
|
||||
(P2c4, P1c4, N0c4),
|
||||
|
||||
(P1p, N0p, N1p),
|
||||
(P1p, N0p, N1ss),
|
||||
(P1p, N0p,),
|
||||
(P1c4, N0c4, N1c4),
|
||||
|
||||
(N0p, N1p, N2p),
|
||||
(N0p, N1p, N2ss),
|
||||
(N0p, N1p,),
|
||||
(N0c4, N1c4, N2c4),
|
||||
)
|
||||
|
||||
|
@ -172,8 +152,6 @@ cdef int fill_token(atom_t* ctxt, const TokenC* token) except -1:
|
|||
ctxt[2] = token.lex.cluster
|
||||
ctxt[3] = token.lex.cluster & 15
|
||||
ctxt[4] = token.lex.cluster & 63
|
||||
ctxt[5] = token.lex.senses & POS_SENSES[<int>token.pos]
|
||||
ctxt[6] = token.sense
|
||||
|
||||
|
||||
cdef int fill_context(atom_t* ctxt, const TokenC* token) except -1:
|
||||
|
@ -194,6 +172,7 @@ cdef class SenseTagger:
|
|||
cdef readonly LinearModel model
|
||||
cdef readonly Extractor extractor
|
||||
cdef readonly model_dir
|
||||
cdef readonly flags_t[<int>N_UNIV_TAGS] pos_senses
|
||||
|
||||
def __init__(self, StringStore strings, model_dir):
|
||||
if model_dir is not None and path.isdir(model_dir):
|
||||
|
@ -207,25 +186,50 @@ cdef class SenseTagger:
|
|||
self.model.load(self.model_dir, freq_thresh=0)
|
||||
self.strings = strings
|
||||
|
||||
self.pos_senses[<int>parts_of_speech.NO_TAG] = 0
|
||||
self.pos_senses[<int>parts_of_speech.ADJ] = 0
|
||||
self.pos_senses[<int>parts_of_speech.ADV] = 0
|
||||
self.pos_senses[<int>parts_of_speech.ADP] = 0
|
||||
self.pos_senses[<int>parts_of_speech.CONJ] = 0
|
||||
self.pos_senses[<int>parts_of_speech.DET] = 0
|
||||
self.pos_senses[<int>parts_of_speech.NOUN] = 0
|
||||
self.pos_senses[<int>parts_of_speech.NUM] = 0
|
||||
self.pos_senses[<int>parts_of_speech.PRON] = 0
|
||||
self.pos_senses[<int>parts_of_speech.PRT] = 0
|
||||
self.pos_senses[<int>parts_of_speech.VERB] = 0
|
||||
self.pos_senses[<int>parts_of_speech.X] = 0
|
||||
self.pos_senses[<int>parts_of_speech.PUNCT] = 0
|
||||
self.pos_senses[<int>parts_of_speech.EOL] = 0
|
||||
|
||||
|
||||
cdef flags_t sense = 0
|
||||
for _sense in range(N_Tops, V_body):
|
||||
self.pos_senses[<int>parts_of_speech.NOUN] |= 1 << sense
|
||||
|
||||
for _sense in range(V_body, J_ppl):
|
||||
self.pos_senses[<int>parts_of_speech.VERB] |= 1 << sense
|
||||
|
||||
def __call__(self, Tokens tokens):
|
||||
cdef atom_t[CONTEXT_SIZE] context
|
||||
cdef int i, guess, n_feats
|
||||
cdef const TokenC* token
|
||||
cdef TokenC* token
|
||||
for i in range(tokens.length):
|
||||
token = &tokens.data[i]
|
||||
if token.pos in (NOUN, VERB):
|
||||
fill_context(context, token)
|
||||
feats = self.extractor.get_feats(context, &n_feats)
|
||||
scores = self.model.get_scores(feats, n_feats)
|
||||
tokens.data[i].sense = self.best_in_set(scores, POS_SENSES[<int>token.pos])
|
||||
tokens.data[i].sense = self.best_in_set(scores, self.pos_senses[<int>token.pos])
|
||||
|
||||
def train(self, Tokens tokens, GoldParse gold):
|
||||
cdef int i, j
|
||||
cdef TokenC* token
|
||||
for i, ssenses in enumerate(gold.ssenses):
|
||||
token = &tokens.data[i]
|
||||
if ssenses:
|
||||
gold.c.ssenses[i] = encode_sense_strs(ssenses)
|
||||
else:
|
||||
gold.c.ssenses[i] = pos_senses(&tokens.data[i])
|
||||
gold.c.ssenses[i] = token.lex.senses & self.pos_senses[<int>token.pos]
|
||||
|
||||
cdef atom_t[CONTEXT_SIZE] context
|
||||
cdef int n_feats
|
||||
|
@ -240,7 +244,7 @@ cdef class SenseTagger:
|
|||
fill_context(context, token)
|
||||
feats = self.extractor.get_feats(context, &n_feats)
|
||||
scores = self.model.get_scores(feats, n_feats)
|
||||
token.sense = self.best_in_set(scores, POS_SENSES[<int>token.pos])
|
||||
token.sense = self.best_in_set(scores, token.lex.senses)
|
||||
best = self.best_in_set(scores, gold.c.ssenses[i])
|
||||
guess_counts = {}
|
||||
gold_counts = {}
|
||||
|
@ -251,7 +255,7 @@ cdef class SenseTagger:
|
|||
feat = (f_i, f_key)
|
||||
gold_counts[feat] = gold_counts.get(feat, 0) + 1.0
|
||||
guess_counts[feat] = guess_counts.get(feat, 0) - 1.0
|
||||
#self.model.update({token.sense: guess_counts, best: gold_counts})
|
||||
self.model.update({token.sense: guess_counts, best: gold_counts})
|
||||
return cost
|
||||
|
||||
cdef int best_in_set(self, const weight_t* scores, flags_t senses) except -1:
|
||||
|
@ -266,10 +270,6 @@ cdef class SenseTagger:
|
|||
return argmax
|
||||
|
||||
|
||||
cdef flags_t pos_senses(const TokenC* token) nogil:
|
||||
return token.lex.senses & POS_SENSES[<int>token.pos]
|
||||
|
||||
|
||||
cdef list _set_bits(flags_t flags):
|
||||
bits = []
|
||||
cdef flags_t bit
|
||||
|
|
|
@ -4,13 +4,17 @@ from .typedefs cimport flags_t
|
|||
|
||||
cpdef enum:
|
||||
NO_SENSE
|
||||
J_all
|
||||
J_pert
|
||||
A_all
|
||||
N_Tops
|
||||
N_act
|
||||
N_animal
|
||||
N_artifact
|
||||
N_attribute
|
||||
N_body
|
||||
N_cognition
|
||||
N_communication
|
||||
N_communication
|
||||
N_event
|
||||
N_feeling
|
||||
N_food
|
||||
|
@ -44,10 +48,8 @@ cpdef enum:
|
|||
V_social
|
||||
V_stative
|
||||
V_weather
|
||||
J_ppl
|
||||
N_SENSES
|
||||
|
||||
|
||||
cdef flags_t[<int>parts_of_speech.N_UNIV_TAGS] POS_SENSES
|
||||
|
||||
|
||||
cdef flags_t encode_sense_strs(sense_names) except 0
|
||||
|
|
120
spacy/senses.pyx
120
spacy/senses.pyx
|
@ -2,74 +2,56 @@ from __future__ import unicode_literals
|
|||
cimport parts_of_speech
|
||||
|
||||
|
||||
POS_SENSES[<int>parts_of_speech.NO_TAG] = 0
|
||||
POS_SENSES[<int>parts_of_speech.ADJ] = 0
|
||||
POS_SENSES[<int>parts_of_speech.ADV] = 0
|
||||
POS_SENSES[<int>parts_of_speech.ADP] = 0
|
||||
POS_SENSES[<int>parts_of_speech.CONJ] = 0
|
||||
POS_SENSES[<int>parts_of_speech.DET] = 0
|
||||
POS_SENSES[<int>parts_of_speech.NOUN] = 0
|
||||
POS_SENSES[<int>parts_of_speech.NUM] = 0
|
||||
POS_SENSES[<int>parts_of_speech.PRON] = 0
|
||||
POS_SENSES[<int>parts_of_speech.PRT] = 0
|
||||
POS_SENSES[<int>parts_of_speech.VERB] = 0
|
||||
POS_SENSES[<int>parts_of_speech.X] = 0
|
||||
POS_SENSES[<int>parts_of_speech.PUNCT] = 0
|
||||
POS_SENSES[<int>parts_of_speech.EOL] = 0
|
||||
lexnames_str = """
|
||||
-1 NO_SENSE -1
|
||||
00 J_all 3
|
||||
01 A_pert 3
|
||||
02 A_all 4
|
||||
03 N_Tops 1
|
||||
04 N_act 1
|
||||
05 N_animal 1
|
||||
06 N_artifact 1
|
||||
07 N_attribute 1
|
||||
08 N_body 1
|
||||
09 N_cognition 1
|
||||
10 N_communication 1
|
||||
11 N_event 1
|
||||
12 N_feeling 1
|
||||
13 N_food 1
|
||||
14 N_group 1
|
||||
15 N_location 1
|
||||
16 N_motive 1
|
||||
17 N_object 1
|
||||
18 N_person 1
|
||||
19 N_phenomenon 1
|
||||
20 N_plant 1
|
||||
21 N_possession 1
|
||||
22 N_process 1
|
||||
23 N_quantity 1
|
||||
24 N_relation 1
|
||||
25 N_shape 1
|
||||
26 N_state 1
|
||||
27 N_substance 1
|
||||
28 N_time 1
|
||||
29 V_body 2
|
||||
30 V_change 2
|
||||
31 V_cognition 2
|
||||
32 V_communication 2
|
||||
33 V_competition 2
|
||||
34 V_consumption 2
|
||||
35 V_contact 2
|
||||
36 V_creation 2
|
||||
37 V_emotion 2
|
||||
38 V_motion 2
|
||||
39 V_perception 2
|
||||
40 V_possession 2
|
||||
41 V_social 2
|
||||
42 V_stative 2
|
||||
43 V_weather 2
|
||||
44 A_ppl 3
|
||||
""".strip()
|
||||
|
||||
|
||||
cdef int _sense = 0
|
||||
|
||||
for _sense in range(N_act, V_body):
|
||||
POS_SENSES[<int>parts_of_speech.NOUN] |= 1 << _sense
|
||||
|
||||
for _sense in range(V_body, V_weather+1):
|
||||
POS_SENSES[<int>parts_of_speech.VERB] |= 1 << _sense
|
||||
|
||||
|
||||
STRINGS = (
|
||||
'-NO_SENSE-',
|
||||
'N_act',
|
||||
'N_animal',
|
||||
'N_artifact',
|
||||
'N_attribute',
|
||||
'N_body',
|
||||
'N_cognition',
|
||||
'N_communication',
|
||||
'N_event',
|
||||
'N_feeling',
|
||||
'N_food',
|
||||
'N_group',
|
||||
'N_location',
|
||||
'N_motive',
|
||||
'N_object',
|
||||
'N_person',
|
||||
'N_phenomenon',
|
||||
'N_plant',
|
||||
'N_possession',
|
||||
'N_process',
|
||||
'N_quantity',
|
||||
'N_relation',
|
||||
'N_shape',
|
||||
'N_state',
|
||||
'N_substance',
|
||||
'N_time',
|
||||
'V_body',
|
||||
'V_change',
|
||||
'V_cognition',
|
||||
'V_communication',
|
||||
'V_competition',
|
||||
'V_consumption',
|
||||
'V_contact',
|
||||
'V_creation',
|
||||
'V_emotion',
|
||||
'V_motion',
|
||||
'V_perception',
|
||||
'V_possession',
|
||||
'V_social',
|
||||
'V_stative',
|
||||
'V_weather'
|
||||
)
|
||||
STRINGS = tuple(line.split()[1] for line in lexnames_str.split('\n'))
|
||||
|
||||
IDS = dict((sense_str, i) for i, sense_str in enumerate(STRINGS))
|
||||
|
||||
|
@ -80,8 +62,8 @@ cdef flags_t encode_sense_strs(sense_names) except 0:
|
|||
return sense_bits | (1 << NO_SENSE)
|
||||
cdef flags_t sense_id = 0
|
||||
for sense_str in sense_names:
|
||||
if '.' in sense_str:
|
||||
sense_str = sense_str[0].upper() + '_' + sense_str.split('.')[1]
|
||||
sense_str = sense_str.replace('noun', 'N').replace('verb', 'V')
|
||||
sense_str = sense_str.replace('adj', 'J').replace('adv', 'A')
|
||||
sense_id = IDS[sense_str]
|
||||
sense_bits |= (1 << sense_id)
|
||||
return sense_bits
|
||||
|
|
|
@ -61,7 +61,7 @@ cdef inline void fill_token(atom_t* context, const TokenC* token) nogil:
|
|||
context[9] = token.lex.shape
|
||||
context[10] = token.ent_iob
|
||||
context[11] = token.ent_type
|
||||
context[12] = token.lex.senses & senses.POS_SENSES[<int>token.pos]
|
||||
context[12] = 0 # token.lex.senses & senses.POS_SENSES[<int>token.pos]
|
||||
|
||||
cdef int fill_context(atom_t* ctxt, StateClass st) nogil:
|
||||
# Take care to fill every element of context!
|
||||
|
|
|
@ -93,7 +93,7 @@ cdef class Tokens:
|
|||
else:
|
||||
size = 5
|
||||
self.mem = Pool()
|
||||
# Guarantee self.lex[i-x], for any i >= 0 and x < padding is in bounds
|
||||
# Guarantee self.data[i-x], for any i >= 0 and x < padding is in bounds
|
||||
# However, we need to remember the true starting places, so that we can
|
||||
# realloc.
|
||||
data_start = <TokenC*>self.mem.alloc(size + (PADDING*2), sizeof(TokenC))
|
||||
|
|
Loading…
Reference in New Issue
Block a user