* Work on sense tagger

2025-10-18 17:54:17 +03:00 · 2015-07-03 15:25:41 +02:00 · 2015-07-03 15:25:41 +02:00 · fb68df91b8
commit fb68df91b8
parent 2fbcdd0ea8
8 changed files with 101 additions and 116 deletions
--- a/spacy/_ml.pyx
+++ b/spacy/_ml.pyx
@ -69,6 +69,7 @@ cdef class Model:
        assert self.n_classes == eg.c.nr_class
        memset(eg.c.scores, 0, sizeof(weight_t) * eg.c.nr_class)
        self.set_scores(eg.c.scores, eg.c.atoms)
        eg.c.guess = arg_max_if_true(eg.c.scores, eg.c.is_valid, self.n_classes)
        if eg.c.guess == -1:
            raise ValidationError("No valid classes during prediction")
--- a/spacy/gold.pyx
+++ b/spacy/gold.pyx
@ -155,7 +155,8 @@ def read_json_file(loc, docs_filter=None):
                        if labels[-1].lower() == 'root':
                            labels[-1] = 'ROOT'
                        ner.append(token.get('ner', '-'))
-                        t_wsd = [s.replace('.', '_') for s in token.get('ssenses', [])]
+                        t_wsd = [s.replace('noun.', 'N_').replace('verb.', 'V_')
                                 for s in token.get('ssenses', [])]
                        wsd.append(t_wsd)
                    sents.append((
                        (ids, words, tags, heads, labels, ner, wsd),
--- a/spacy/scorer.py
+++ b/spacy/scorer.py
@ -133,6 +133,5 @@ class Scorer(object):
            if gold_senses and gold.gold_to_cand[i] is not None:
                cand_i = gold.gold_to_cand[i]
                sense_str = tokens[cand_i].sense_
                sense_str = sense_str.replace('N_', 'noun.').replace('V_', 'verb.')
                self.wsd.tp += sense_str in gold_senses
                self.wsd.fn += sense_str not in gold_senses
--- a/spacy/sense_tagger.pyx
+++ b/spacy/sense_tagger.pyx
@ -2,9 +2,12 @@ from .typedefs cimport flags_t
 from .structs cimport TokenC
 from .strings cimport StringStore
 from .tokens cimport Tokens
-from .senses cimport POS_SENSES, N_SENSES, encode_sense_strs
+from .senses cimport N_SENSES, encode_sense_strs
 from .senses cimport N_Tops, J_ppl, V_body
 from .gold cimport GoldParse
-from .parts_of_speech cimport NOUN, VERB
+from .parts_of_speech cimport NOUN, VERB, N_UNIV_TAGS
 from . cimport parts_of_speech
 from thinc.learner cimport LinearModel
 from thinc.features cimport Extractor
@ -21,40 +24,30 @@ cdef enum:
    P2c
    P2c6
    P2c4
    P2ss
    P2s
    P1W
    P1p
    P1c
    P1c6
    P1c4
    P1ss
    P1s
    N0W
    N0p
    N0c
    N0c6
    N0c4
    N0ss
    N0s
    N1W
    N1p
    N1c
    N1c6
    N1c4
    N1ss
    N1s
    N2W
    N2p
    N2c
    N2c6
    N2c4
    N2ss
    N2s
    CONTEXT_SIZE
@ -67,8 +60,6 @@ unigrams = (
    (P2c6, P2p),
    (P2c4, P2p),
    (P2c,),
    (P2ss,),
    (P1s,),
    (P1W,),
    (P1p,),
@ -84,8 +75,6 @@ unigrams = (
    (P1c6, P1p),
    (P1c4, P1p),
    (P1c,),
    (P1ss,),
    (P1s,),
    (N0p,),
    (N0W, N0p),
@ -100,7 +89,6 @@ unigrams = (
    (N0c6, N0p),
    (N0c4, N0p),
    (N0c,),
    (N0ss,),
    (N1p,),
    (N1W, N1p),
@ -115,7 +103,6 @@ unigrams = (
    (N1c6, N1p),
    (N1c4, N1p),
    (N1c,),
    (N1ss,),
    (N2p,),
    (N2W, N2p),
@ -130,7 +117,6 @@ unigrams = (
    (N2c6, N2p),
    (N2c4, N2p),
    (N2c,),
    (N2ss,),
 )
@ -141,27 +127,21 @@ bigrams = (
    (P1c, N0p),
    (P1c6, N0p),
    (P2s, P1s),
    (P2ss, P1s,),
    (P2ss, P1ss,),
    (P1ss, N0ss),
    (N0p, N1p,),
 )
 # Feature templates, each a tuple of context-atom ids from the enum above
 # (presumably consumed by the feature Extractor -- confirm against its setup).
 # NOTE(review): (P1p, N0p, N1p) appears twice in this tuple, and the entries
 # that lost their *ss atom in this change are now two-atom templates even
 # though they live under `trigrams`; confirm both are intended.
 trigrams = (
    (P1p, N0p, N1p),
-    (P2p, P1p, N0ss),
+    (P2p, P1p,),
    (P2c4, P1c4, N0c4),
    (P1p, N0p, N1p),
-    (P1p, N0p, N1ss),
+    (P1p, N0p,),
    (P1c4, N0c4, N1c4),
    (N0p, N1p, N2p),
-    (N0p, N1p, N2ss),
+    (N0p, N1p,),
    (N0c4, N1c4, N2c4),
 )
@ -172,8 +152,6 @@ cdef int fill_token(atom_t* ctxt, const TokenC* token) except -1:
    ctxt[2] = token.lex.cluster
    ctxt[3] = token.lex.cluster & 15
    ctxt[4] = token.lex.cluster & 63
    ctxt[5] = token.lex.senses & POS_SENSES[<int>token.pos]
    ctxt[6] = token.sense
 cdef int fill_context(atom_t* ctxt, const TokenC* token) except -1:
@ -194,6 +172,7 @@ cdef class SenseTagger:
    cdef readonly LinearModel model
    cdef readonly Extractor extractor
    cdef readonly model_dir
    cdef readonly flags_t[<int>N_UNIV_TAGS] pos_senses
    def __init__(self, StringStore strings, model_dir):
        if model_dir is not None and path.isdir(model_dir):
@ -207,25 +186,50 @@ cdef class SenseTagger:
            self.model.load(self.model_dir, freq_thresh=0)
        self.strings = strings
        self.pos_senses[<int>parts_of_speech.NO_TAG] = 0
        self.pos_senses[<int>parts_of_speech.ADJ] = 0
        self.pos_senses[<int>parts_of_speech.ADV] = 0
        self.pos_senses[<int>parts_of_speech.ADP] = 0
        self.pos_senses[<int>parts_of_speech.CONJ] = 0
        self.pos_senses[<int>parts_of_speech.DET] = 0
        self.pos_senses[<int>parts_of_speech.NOUN] = 0
        self.pos_senses[<int>parts_of_speech.NUM] = 0
        self.pos_senses[<int>parts_of_speech.PRON] = 0
        self.pos_senses[<int>parts_of_speech.PRT] = 0
        self.pos_senses[<int>parts_of_speech.VERB] = 0
        self.pos_senses[<int>parts_of_speech.X] = 0
        self.pos_senses[<int>parts_of_speech.PUNCT] = 0
        self.pos_senses[<int>parts_of_speech.EOL] = 0
        # Build the per-POS sense masks: one bit per sense id, nouns covering
        # [N_Tops, V_body) and verbs covering [V_body, J_ppl).
        # The shift must use the loop variable itself; shifting by a separate
        # variable that stays 0 would only ever set bit 0 in both masks.
        # The shifted 1 is held in a flags_t because sense ids run past bit 31,
        # where a plain C int literal would overflow.
        cdef flags_t one = 1
        cdef int _sense
        for _sense in range(N_Tops, V_body):
            self.pos_senses[<int>parts_of_speech.NOUN] |= one << _sense
        for _sense in range(V_body, J_ppl):
            self.pos_senses[<int>parts_of_speech.VERB] |= one << _sense
    def __call__(self, Tokens tokens):
        # Assign a sense to every NOUN or VERB token in `tokens`, in place.
        # Tokens with any other part of speech are left untouched.
        cdef atom_t[CONTEXT_SIZE] context
        # NOTE(review): `guess` is declared but not used in the visible body.
        cdef int i, guess, n_feats
-        cdef const TokenC* token
+        cdef TokenC* token
        for i in range(tokens.length):
            token = &tokens.data[i]
            if token.pos in (NOUN, VERB):
                # Fill the context atoms for this token, extract features
                # from them, and score the features with the linear model.
                fill_context(context, token)
                feats = self.extractor.get_feats(context, &n_feats)
                scores = self.model.get_scores(feats, n_feats)
                # The candidate senses are restricted to the mask for the
                # token's POS; best_in_set presumably returns the top-scoring
                # sense whose bit is set in that mask -- confirm in its body.
-                tokens.data[i].sense = self.best_in_set(scores, POS_SENSES[<int>token.pos])
+                tokens.data[i].sense = self.best_in_set(scores, self.pos_senses[<int>token.pos])
    def train(self, Tokens tokens, GoldParse gold):
        cdef int i, j
        cdef TokenC* token
        for i, ssenses in enumerate(gold.ssenses):
            token = &tokens.data[i]
            if ssenses:
                gold.c.ssenses[i] = encode_sense_strs(ssenses)
            else:
-                gold.c.ssenses[i] = pos_senses(&tokens.data[i])
+                gold.c.ssenses[i] = token.lex.senses & self.pos_senses[<int>token.pos]
        cdef atom_t[CONTEXT_SIZE] context
        cdef int n_feats
@ -240,7 +244,7 @@ cdef class SenseTagger:
                fill_context(context, token)
                feats = self.extractor.get_feats(context, &n_feats)
                scores = self.model.get_scores(feats, n_feats)
-                token.sense = self.best_in_set(scores, POS_SENSES[<int>token.pos])
+                token.sense = self.best_in_set(scores, token.lex.senses)
                best = self.best_in_set(scores, gold.c.ssenses[i])
                guess_counts = {}
                gold_counts = {}
@ -251,7 +255,7 @@ cdef class SenseTagger:
                        feat = (f_i, f_key)
                        gold_counts[feat]  = gold_counts.get(feat, 0) + 1.0
                        guess_counts[feat] = guess_counts.get(feat, 0) - 1.0
-                #self.model.update({token.sense: guess_counts, best: gold_counts})
+                self.model.update({token.sense: guess_counts, best: gold_counts})
        return cost
    cdef int best_in_set(self, const weight_t* scores, flags_t senses) except -1:
@ -266,10 +270,6 @@ cdef class SenseTagger:
        return argmax
 cdef flags_t pos_senses(const TokenC* token) nogil:
    return token.lex.senses & POS_SENSES[<int>token.pos]
 cdef list _set_bits(flags_t flags):
    bits = []
    cdef flags_t bit
--- a/spacy/senses.pxd
+++ b/spacy/senses.pxd
@ -4,13 +4,17 @@ from .typedefs cimport flags_t
 cpdef enum:
    NO_SENSE
    J_all
    J_pert
    A_all
    N_Tops
    N_act
    N_animal
    N_artifact
    N_attribute
    N_body
    N_cognition
-    N_communication
+    N_communication  
    N_event
    N_feeling
    N_food
@ -44,10 +48,8 @@ cpdef enum:
    V_social
    V_stative
    V_weather
    J_ppl
    N_SENSES
 cdef flags_t[<int>parts_of_speech.N_UNIV_TAGS] POS_SENSES
 cdef flags_t encode_sense_strs(sense_names) except 0
--- a/spacy/senses.pyx
+++ b/spacy/senses.pyx
@ -2,74 +2,56 @@ from __future__ import unicode_literals
 cimport parts_of_speech
-POS_SENSES[<int>parts_of_speech.NO_TAG] = 0
+# Sense table, one row per sense: <file number> <sense name> <POS code>.
+# The second column becomes STRINGS, so each name must match the enum member
+# declared in senses.pxd: rows 01 and 44 (POS code 3, like J_all) are
+# J_pert and J_ppl there, not A_pert / A_ppl.
+lexnames_str = """
-POS_SENSES[<int>parts_of_speech.ADJ] = 0
+-1      NO_SENSE       -1
-POS_SENSES[<int>parts_of_speech.ADV] = 0
+00      J_all 3
-POS_SENSES[<int>parts_of_speech.ADP] = 0
+01      J_pert        3
-POS_SENSES[<int>parts_of_speech.CONJ] = 0
+02      A_all 4
-POS_SENSES[<int>parts_of_speech.DET] = 0
+03      N_Tops       1  
-POS_SENSES[<int>parts_of_speech.NOUN] = 0
+04      N_act        1
-POS_SENSES[<int>parts_of_speech.NUM] = 0
+05      N_animal     1
-POS_SENSES[<int>parts_of_speech.PRON] = 0
+06      N_artifact   1
-POS_SENSES[<int>parts_of_speech.PRT] = 0
+07      N_attribute  1
-POS_SENSES[<int>parts_of_speech.VERB] = 0
+08      N_body       1
-POS_SENSES[<int>parts_of_speech.X] = 0
+09      N_cognition  1
-POS_SENSES[<int>parts_of_speech.PUNCT] = 0
+10      N_communication      1
-POS_SENSES[<int>parts_of_speech.EOL] = 0
+11      N_event      1
 12      N_feeling    1
 13      N_food       1
 14      N_group      1
 15      N_location   1
 16      N_motive     1
 17      N_object     1
 18      N_person     1
 19      N_phenomenon 1
 20      N_plant      1
 21      N_possession 1
 22      N_process    1
 23      N_quantity   1
 24      N_relation   1
 25      N_shape      1
 26      N_state      1
 27      N_substance  1
 28      N_time       1
 29      V_body       2
 30      V_change     2
 31      V_cognition  2
 32      V_communication      2
 33      V_competition        2
 34      V_consumption        2
 35      V_contact    2
 36      V_creation   2
 37      V_emotion    2
 38      V_motion     2
 39      V_perception 2
 40      V_possession 2
 41      V_social     2
 42      V_stative    2
 43      V_weather    2
 44      J_ppl 3
 """.strip()
-
+STRINGS = tuple(line.split()[1] for line in lexnames_str.split('\n'))
 cdef int _sense = 0
 for _sense in range(N_act, V_body):
    POS_SENSES[<int>parts_of_speech.NOUN] |= 1 << _sense
 for _sense in range(V_body, V_weather+1):
    POS_SENSES[<int>parts_of_speech.VERB] |= 1 << _sense
 STRINGS = (
    '-NO_SENSE-',
    'N_act',
    'N_animal',
    'N_artifact',
    'N_attribute',
    'N_body',
    'N_cognition',
    'N_communication',
    'N_event',
    'N_feeling',
    'N_food',
    'N_group',
    'N_location',
    'N_motive',
    'N_object',
    'N_person',
    'N_phenomenon',
    'N_plant',
    'N_possession',
    'N_process',
    'N_quantity',
    'N_relation',
    'N_shape',
    'N_state',
    'N_substance',
    'N_time',
    'V_body',
    'V_change',
    'V_cognition',
    'V_communication',
    'V_competition',
    'V_consumption',
    'V_contact',
    'V_creation',
    'V_emotion',
    'V_motion',
    'V_perception',
    'V_possession',
    'V_social',
    'V_stative',
    'V_weather'
 )
 IDS = dict((sense_str, i) for i, sense_str in enumerate(STRINGS))
@ -80,8 +62,8 @@ cdef flags_t encode_sense_strs(sense_names) except 0:
        return sense_bits | (1 << NO_SENSE)
    cdef flags_t sense_id = 0
    for sense_str in sense_names:
-        if '.' in sense_str:
+        sense_str = sense_str.replace('noun', 'N').replace('verb', 'V')
-            sense_str = sense_str[0].upper() + '_' + sense_str.split('.')[1]
+        sense_str = sense_str.replace('adj', 'J').replace('adv', 'A')
        sense_id = IDS[sense_str]
        sense_bits |= (1 << sense_id)
    return sense_bits
--- a/spacy/syntax/_parse_features.pyx
+++ b/spacy/syntax/_parse_features.pyx
@ -61,7 +61,7 @@ cdef inline void fill_token(atom_t* context, const TokenC* token) nogil:
        context[9] = token.lex.shape
        context[10] = token.ent_iob
        context[11] = token.ent_type
-        context[12] = token.lex.senses & senses.POS_SENSES[<int>token.pos]
+        context[12] = 0 # token.lex.senses & senses.POS_SENSES[<int>token.pos]
 cdef int fill_context(atom_t* ctxt, StateClass st) nogil:
    # Take care to fill every element of context!
--- a/spacy/tokens.pyx
+++ b/spacy/tokens.pyx
@ -93,7 +93,7 @@ cdef class Tokens:
        else:
            size = 5
        self.mem = Pool()
-        # Guarantee self.lex[i-x], for any i >= 0 and x < padding is in bounds
+        # Guarantee self.data[i-x], for any i >= 0 and x < padding is in bounds
        # However, we need to remember the true starting places, so that we can
        # realloc.
        data_start = <TokenC*>self.mem.alloc(size + (PADDING*2), sizeof(TokenC))