diff --git a/spacy/_ml.pyx b/spacy/_ml.pyx index 706396fd1..4ae8630a1 100644 --- a/spacy/_ml.pyx +++ b/spacy/_ml.pyx @@ -69,6 +69,7 @@ cdef class Model: assert self.n_classes == eg.c.nr_class memset(eg.c.scores, 0, sizeof(weight_t) * eg.c.nr_class) self.set_scores(eg.c.scores, eg.c.atoms) + eg.c.guess = arg_max_if_true(eg.c.scores, eg.c.is_valid, self.n_classes) if eg.c.guess == -1: raise ValidationError("No valid classes during prediction") diff --git a/spacy/gold.pyx b/spacy/gold.pyx index 2215984a8..eb2a58952 100644 --- a/spacy/gold.pyx +++ b/spacy/gold.pyx @@ -155,7 +155,8 @@ def read_json_file(loc, docs_filter=None): if labels[-1].lower() == 'root': labels[-1] = 'ROOT' ner.append(token.get('ner', '-')) - t_wsd = [s.replace('.', '_') for s in token.get('ssenses', [])] + t_wsd = [s.replace('noun.', 'N_').replace('verb.', 'V_') + for s in token.get('ssenses', [])] wsd.append(t_wsd) sents.append(( (ids, words, tags, heads, labels, ner, wsd), diff --git a/spacy/scorer.py b/spacy/scorer.py index c68c6c892..d06afb593 100644 --- a/spacy/scorer.py +++ b/spacy/scorer.py @@ -133,6 +133,5 @@ class Scorer(object): if gold_senses and gold.gold_to_cand[i] is not None: cand_i = gold.gold_to_cand[i] sense_str = tokens[cand_i].sense_ - sense_str = sense_str.replace('N_', 'noun.').replace('V_', 'verb.') self.wsd.tp += sense_str in gold_senses self.wsd.fn += sense_str not in gold_senses diff --git a/spacy/sense_tagger.pyx b/spacy/sense_tagger.pyx index 82e07710a..883e35f76 100644 --- a/spacy/sense_tagger.pyx +++ b/spacy/sense_tagger.pyx @@ -2,9 +2,12 @@ from .typedefs cimport flags_t from .structs cimport TokenC from .strings cimport StringStore from .tokens cimport Tokens -from .senses cimport POS_SENSES, N_SENSES, encode_sense_strs +from .senses cimport N_SENSES, encode_sense_strs +from .senses cimport N_Tops, J_ppl, V_body from .gold cimport GoldParse -from .parts_of_speech cimport NOUN, VERB +from .parts_of_speech cimport NOUN, VERB, N_UNIV_TAGS + +from . 
cimport parts_of_speech from thinc.learner cimport LinearModel from thinc.features cimport Extractor @@ -21,40 +24,30 @@ cdef enum: P2c P2c6 P2c4 - P2ss - P2s P1W P1p P1c P1c6 P1c4 - P1ss - P1s N0W N0p N0c N0c6 N0c4 - N0ss - N0s N1W N1p N1c N1c6 N1c4 - N1ss - N1s N2W N2p N2c N2c6 N2c4 - N2ss - N2s CONTEXT_SIZE @@ -67,8 +60,6 @@ unigrams = ( (P2c6, P2p), (P2c4, P2p), (P2c,), - (P2ss,), - (P1s,), (P1W,), (P1p,), @@ -84,8 +75,6 @@ unigrams = ( (P1c6, P1p), (P1c4, P1p), (P1c,), - (P1ss,), - (P1s,), (N0p,), (N0W, N0p), @@ -100,7 +89,6 @@ unigrams = ( (N0c6, N0p), (N0c4, N0p), (N0c,), - (N0ss,), (N1p,), (N1W, N1p), @@ -115,7 +103,6 @@ unigrams = ( (N1c6, N1p), (N1c4, N1p), (N1c,), - (N1ss,), (N2p,), (N2W, N2p), @@ -130,7 +117,6 @@ unigrams = ( (N2c6, N2p), (N2c4, N2p), (N2c,), - (N2ss,), ) @@ -141,27 +127,21 @@ bigrams = ( (P1c, N0p), (P1c6, N0p), - (P2s, P1s), - (P2ss, P1s,), - (P2ss, P1ss,), - - (P1ss, N0ss), - (N0p, N1p,), ) trigrams = ( (P1p, N0p, N1p), - (P2p, P1p, N0ss), + (P2p, P1p,), (P2c4, P1c4, N0c4), (P1p, N0p, N1p), - (P1p, N0p, N1ss), + (P1p, N0p,), (P1c4, N0c4, N1c4), (N0p, N1p, N2p), - (N0p, N1p, N2ss), + (N0p, N1p,), (N0c4, N1c4, N2c4), ) @@ -172,8 +152,6 @@ cdef int fill_token(atom_t* ctxt, const TokenC* token) except -1: ctxt[2] = token.lex.cluster ctxt[3] = token.lex.cluster & 15 ctxt[4] = token.lex.cluster & 63 - ctxt[5] = token.lex.senses & POS_SENSES[token.pos] - ctxt[6] = token.sense cdef int fill_context(atom_t* ctxt, const TokenC* token) except -1: @@ -194,6 +172,7 @@ cdef class SenseTagger: cdef readonly LinearModel model cdef readonly Extractor extractor cdef readonly model_dir + cdef readonly flags_t[N_UNIV_TAGS] pos_senses def __init__(self, StringStore strings, model_dir): if model_dir is not None and path.isdir(model_dir): @@ -207,25 +186,50 @@ cdef class SenseTagger: self.model.load(self.model_dir, freq_thresh=0) self.strings = strings + self.pos_senses[parts_of_speech.NO_TAG] = 0 + self.pos_senses[parts_of_speech.ADJ] = 0 + 
self.pos_senses[parts_of_speech.ADV] = 0 + self.pos_senses[parts_of_speech.ADP] = 0 + self.pos_senses[parts_of_speech.CONJ] = 0 + self.pos_senses[parts_of_speech.DET] = 0 + self.pos_senses[parts_of_speech.NOUN] = 0 + self.pos_senses[parts_of_speech.NUM] = 0 + self.pos_senses[parts_of_speech.PRON] = 0 + self.pos_senses[parts_of_speech.PRT] = 0 + self.pos_senses[parts_of_speech.VERB] = 0 + self.pos_senses[parts_of_speech.X] = 0 + self.pos_senses[parts_of_speech.PUNCT] = 0 + self.pos_senses[parts_of_speech.EOL] = 0 + + + cdef flags_t _sense = 0 + for _sense in range(N_Tops, V_body): + self.pos_senses[parts_of_speech.NOUN] |= 1 << _sense + + for _sense in range(V_body, J_ppl): + self.pos_senses[parts_of_speech.VERB] |= 1 << _sense + def __call__(self, Tokens tokens): cdef atom_t[CONTEXT_SIZE] context cdef int i, guess, n_feats - cdef const TokenC* token + cdef TokenC* token for i in range(tokens.length): token = &tokens.data[i] if token.pos in (NOUN, VERB): fill_context(context, token) feats = self.extractor.get_feats(context, &n_feats) scores = self.model.get_scores(feats, n_feats) - tokens.data[i].sense = self.best_in_set(scores, POS_SENSES[token.pos]) + tokens.data[i].sense = self.best_in_set(scores, self.pos_senses[token.pos]) def train(self, Tokens tokens, GoldParse gold): cdef int i, j + cdef TokenC* token for i, ssenses in enumerate(gold.ssenses): + token = &tokens.data[i] if ssenses: gold.c.ssenses[i] = encode_sense_strs(ssenses) else: - gold.c.ssenses[i] = pos_senses(&tokens.data[i]) + gold.c.ssenses[i] = token.lex.senses & self.pos_senses[token.pos] cdef atom_t[CONTEXT_SIZE] context cdef int n_feats @@ -240,7 +244,7 @@ cdef class SenseTagger: fill_context(context, token) feats = self.extractor.get_feats(context, &n_feats) scores = self.model.get_scores(feats, n_feats) - token.sense = self.best_in_set(scores, POS_SENSES[token.pos]) + token.sense = self.best_in_set(scores, token.lex.senses) best = self.best_in_set(scores, gold.c.ssenses[i]) guess_counts = {} 
gold_counts = {} @@ -251,7 +255,7 @@ cdef class SenseTagger: feat = (f_i, f_key) gold_counts[feat] = gold_counts.get(feat, 0) + 1.0 guess_counts[feat] = guess_counts.get(feat, 0) - 1.0 - #self.model.update({token.sense: guess_counts, best: gold_counts}) + self.model.update({token.sense: guess_counts, best: gold_counts}) return cost cdef int best_in_set(self, const weight_t* scores, flags_t senses) except -1: @@ -266,10 +270,6 @@ cdef class SenseTagger: return argmax -cdef flags_t pos_senses(const TokenC* token) nogil: - return token.lex.senses & POS_SENSES[token.pos] - - cdef list _set_bits(flags_t flags): bits = [] cdef flags_t bit diff --git a/spacy/senses.pxd b/spacy/senses.pxd index dfb1e0263..d6e73dbc3 100644 --- a/spacy/senses.pxd +++ b/spacy/senses.pxd @@ -4,13 +4,17 @@ from .typedefs cimport flags_t cpdef enum: NO_SENSE + J_all + J_pert + A_all + N_Tops N_act N_animal N_artifact N_attribute N_body N_cognition - N_communication + N_communication N_event N_feeling N_food @@ -44,10 +48,8 @@ cpdef enum: V_social V_stative V_weather + J_ppl N_SENSES -cdef flags_t[parts_of_speech.N_UNIV_TAGS] POS_SENSES - - cdef flags_t encode_sense_strs(sense_names) except 0 diff --git a/spacy/senses.pyx b/spacy/senses.pyx index f3dea35d9..c029dd49d 100644 --- a/spacy/senses.pyx +++ b/spacy/senses.pyx @@ -2,74 +2,56 @@ from __future__ import unicode_literals cimport parts_of_speech -POS_SENSES[parts_of_speech.NO_TAG] = 0 -POS_SENSES[parts_of_speech.ADJ] = 0 -POS_SENSES[parts_of_speech.ADV] = 0 -POS_SENSES[parts_of_speech.ADP] = 0 -POS_SENSES[parts_of_speech.CONJ] = 0 -POS_SENSES[parts_of_speech.DET] = 0 -POS_SENSES[parts_of_speech.NOUN] = 0 -POS_SENSES[parts_of_speech.NUM] = 0 -POS_SENSES[parts_of_speech.PRON] = 0 -POS_SENSES[parts_of_speech.PRT] = 0 -POS_SENSES[parts_of_speech.VERB] = 0 -POS_SENSES[parts_of_speech.X] = 0 -POS_SENSES[parts_of_speech.PUNCT] = 0 -POS_SENSES[parts_of_speech.EOL] = 0 +lexnames_str = """ +-1 NO_SENSE -1 +00 J_all 3 +01 J_pert 3 +02 A_all 4 +03 N_Tops 
1 +04 N_act 1 +05 N_animal 1 +06 N_artifact 1 +07 N_attribute 1 +08 N_body 1 +09 N_cognition 1 +10 N_communication 1 +11 N_event 1 +12 N_feeling 1 +13 N_food 1 +14 N_group 1 +15 N_location 1 +16 N_motive 1 +17 N_object 1 +18 N_person 1 +19 N_phenomenon 1 +20 N_plant 1 +21 N_possession 1 +22 N_process 1 +23 N_quantity 1 +24 N_relation 1 +25 N_shape 1 +26 N_state 1 +27 N_substance 1 +28 N_time 1 +29 V_body 2 +30 V_change 2 +31 V_cognition 2 +32 V_communication 2 +33 V_competition 2 +34 V_consumption 2 +35 V_contact 2 +36 V_creation 2 +37 V_emotion 2 +38 V_motion 2 +39 V_perception 2 +40 V_possession 2 +41 V_social 2 +42 V_stative 2 +43 V_weather 2 +44 J_ppl 3 +""".strip() - -cdef int _sense = 0 - -for _sense in range(N_act, V_body): - POS_SENSES[parts_of_speech.NOUN] |= 1 << _sense - -for _sense in range(V_body, V_weather+1): - POS_SENSES[parts_of_speech.VERB] |= 1 << _sense - - -STRINGS = ( - '-NO_SENSE-', - 'N_act', - 'N_animal', - 'N_artifact', - 'N_attribute', - 'N_body', - 'N_cognition', - 'N_communication', - 'N_event', - 'N_feeling', - 'N_food', - 'N_group', - 'N_location', - 'N_motive', - 'N_object', - 'N_person', - 'N_phenomenon', - 'N_plant', - 'N_possession', - 'N_process', - 'N_quantity', - 'N_relation', - 'N_shape', - 'N_state', - 'N_substance', - 'N_time', - 'V_body', - 'V_change', - 'V_cognition', - 'V_communication', - 'V_competition', - 'V_consumption', - 'V_contact', - 'V_creation', - 'V_emotion', - 'V_motion', - 'V_perception', - 'V_possession', - 'V_social', - 'V_stative', - 'V_weather' -) +STRINGS = tuple(line.split()[1] for line in lexnames_str.split('\n')) IDS = dict((sense_str, i) for i, sense_str in enumerate(STRINGS)) @@ -80,8 +62,8 @@ cdef flags_t encode_sense_strs(sense_names) except 0: return sense_bits | (1 << NO_SENSE) cdef flags_t sense_id = 0 for sense_str in sense_names: - if '.' 
in sense_str: - sense_str = sense_str[0].upper() + '_' + sense_str.split('.')[1] + sense_str = sense_str.replace('noun.', 'N_').replace('verb.', 'V_') + sense_str = sense_str.replace('adj.', 'J_').replace('adv.', 'A_') sense_id = IDS[sense_str] sense_bits |= (1 << sense_id) return sense_bits diff --git a/spacy/syntax/_parse_features.pyx b/spacy/syntax/_parse_features.pyx index bbf0a9c4c..3f503c3d2 100644 --- a/spacy/syntax/_parse_features.pyx +++ b/spacy/syntax/_parse_features.pyx @@ -61,7 +61,7 @@ cdef inline void fill_token(atom_t* context, const TokenC* token) nogil: context[9] = token.lex.shape context[10] = token.ent_iob context[11] = token.ent_type - context[12] = token.lex.senses & senses.POS_SENSES[token.pos] + context[12] = 0 # token.lex.senses & senses.POS_SENSES[token.pos] cdef int fill_context(atom_t* ctxt, StateClass st) nogil: # Take care to fill every element of context! diff --git a/spacy/tokens.pyx b/spacy/tokens.pyx index 1e1d28fe6..7cd04730d 100644 --- a/spacy/tokens.pyx +++ b/spacy/tokens.pyx @@ -93,7 +93,7 @@ cdef class Tokens: else: size = 5 self.mem = Pool() - # Guarantee self.data[i-x], for any i >= 0 and x < padding is in bounds + # However, we need to remember the true starting places, so that we can # realloc. data_start = self.mem.alloc(size + (PADDING*2), sizeof(TokenC))