* Work on sense tagger

2025-11-30 14:55:44 +03:00 · 2015-07-03 15:25:41 +02:00 · 2015-07-03 15:25:41 +02:00 · fb68df91b8
commit fb68df91b8
parent 2fbcdd0ea8
8 changed files with 101 additions and 116 deletions
--- a/spacy/_ml.pyx
+++ b/spacy/_ml.pyx
@ -69,6 +69,7 @@ cdef class Model:
        assert self.n_classes == eg.c.nr_class
        memset(eg.c.scores, 0, sizeof(weight_t) * eg.c.nr_class)
        self.set_scores(eg.c.scores, eg.c.atoms)
+
        eg.c.guess = arg_max_if_true(eg.c.scores, eg.c.is_valid, self.n_classes)
        if eg.c.guess == -1:
            raise ValidationError("No valid classes during prediction")
--- a/spacy/gold.pyx
+++ b/spacy/gold.pyx
@ -155,7 +155,8 @@ def read_json_file(loc, docs_filter=None):
                        if labels[-1].lower() == 'root':
                            labels[-1] = 'ROOT'
                        ner.append(token.get('ner', '-'))
-                        t_wsd = [s.replace('.', '_') for s in token.get('ssenses', [])]
+                        t_wsd = [s.replace('noun.', 'N_').replace('verb.', 'V_')
+                                 for s in token.get('ssenses', [])]
                        wsd.append(t_wsd)
                    sents.append((
                        (ids, words, tags, heads, labels, ner, wsd),
--- a/spacy/scorer.py
+++ b/spacy/scorer.py
@ -133,6 +133,5 @@ class Scorer(object):
            if gold_senses and gold.gold_to_cand[i] is not None:
                cand_i = gold.gold_to_cand[i]
                sense_str = tokens[cand_i].sense_
-                sense_str = sense_str.replace('N_', 'noun.').replace('V_', 'verb.')
                self.wsd.tp += sense_str in gold_senses
                self.wsd.fn += sense_str not in gold_senses
--- a/spacy/sense_tagger.pyx
+++ b/spacy/sense_tagger.pyx
@ -2,9 +2,12 @@ from .typedefs cimport flags_t
 from .structs cimport TokenC
 from .strings cimport StringStore
 from .tokens cimport Tokens
-from .senses cimport POS_SENSES, N_SENSES, encode_sense_strs
+from .senses cimport N_SENSES, encode_sense_strs
+from .senses cimport N_Tops, J_ppl, V_body
 from .gold cimport GoldParse
-from .parts_of_speech cimport NOUN, VERB
+from .parts_of_speech cimport NOUN, VERB, N_UNIV_TAGS
+
+from . cimport parts_of_speech

 from thinc.learner cimport LinearModel
 from thinc.features cimport Extractor
@ -21,40 +24,30 @@ cdef enum:
    P2c
    P2c6
    P2c4
-    P2ss
-    P2s

    P1W
    P1p
    P1c
    P1c6
    P1c4
-    P1ss
-    P1s

    N0W
    N0p
    N0c
    N0c6
    N0c4
-    N0ss
-    N0s
    
    N1W
    N1p
    N1c
    N1c6
    N1c4
-    N1ss
-    N1s
    
    N2W
    N2p
    N2c
    N2c6
    N2c4
-    N2ss
-    N2s
    
    CONTEXT_SIZE

@ -67,8 +60,6 @@ unigrams = (
    (P2c6, P2p),
    (P2c4, P2p),
    (P2c,),
-    (P2ss,),
-    (P1s,),

    (P1W,),
    (P1p,),
@ -84,8 +75,6 @@ unigrams = (
    (P1c6, P1p),
    (P1c4, P1p),
    (P1c,),
-    (P1ss,),
-    (P1s,),
    
    (N0p,),
    (N0W, N0p),
@ -100,7 +89,6 @@ unigrams = (
    (N0c6, N0p),
    (N0c4, N0p),
    (N0c,),
-    (N0ss,),

    (N1p,),
    (N1W, N1p),
@ -115,7 +103,6 @@ unigrams = (
    (N1c6, N1p),
    (N1c4, N1p),
    (N1c,),
-    (N1ss,),

    (N2p,),
    (N2W, N2p),
@ -130,7 +117,6 @@ unigrams = (
    (N2c6, N2p),
    (N2c4, N2p),
    (N2c,),
-    (N2ss,),
 )


@ -141,27 +127,21 @@ bigrams = (
    (P1c, N0p),
    (P1c6, N0p),

-    (P2s, P1s),
-    (P2ss, P1s,),
-    (P2ss, P1ss,),
-
-    (P1ss, N0ss),
-
    (N0p, N1p,),
 )


 trigrams = (
    (P1p, N0p, N1p),
-    (P2p, P1p, N0ss),
+    (P2p, P1p,),
    (P2c4, P1c4, N0c4),
    
    (P1p, N0p, N1p),
-    (P1p, N0p, N1ss),
+    (P1p, N0p,),
    (P1c4, N0c4, N1c4),

    (N0p, N1p, N2p),
-    (N0p, N1p, N2ss),
+    (N0p, N1p,),
    (N0c4, N1c4, N2c4),
 )

@ -172,8 +152,6 @@ cdef int fill_token(atom_t* ctxt, const TokenC* token) except -1:
    ctxt[2] = token.lex.cluster
    ctxt[3] = token.lex.cluster & 15
    ctxt[4] = token.lex.cluster & 63
-    ctxt[5] = token.lex.senses & POS_SENSES[<int>token.pos]
-    ctxt[6] = token.sense


 cdef int fill_context(atom_t* ctxt, const TokenC* token) except -1:
@ -194,6 +172,7 @@ cdef class SenseTagger:
    cdef readonly LinearModel model
    cdef readonly Extractor extractor
    cdef readonly model_dir
+    cdef readonly flags_t[<int>N_UNIV_TAGS] pos_senses

    def __init__(self, StringStore strings, model_dir):
        if model_dir is not None and path.isdir(model_dir):
@ -207,25 +186,50 @@ cdef class SenseTagger:
            self.model.load(self.model_dir, freq_thresh=0)
        self.strings = strings

+        self.pos_senses[<int>parts_of_speech.NO_TAG] = 0
+        self.pos_senses[<int>parts_of_speech.ADJ] = 0
+        self.pos_senses[<int>parts_of_speech.ADV] = 0
+        self.pos_senses[<int>parts_of_speech.ADP] = 0
+        self.pos_senses[<int>parts_of_speech.CONJ] = 0
+        self.pos_senses[<int>parts_of_speech.DET] = 0
+        self.pos_senses[<int>parts_of_speech.NOUN] = 0
+        self.pos_senses[<int>parts_of_speech.NUM] = 0
+        self.pos_senses[<int>parts_of_speech.PRON] = 0
+        self.pos_senses[<int>parts_of_speech.PRT] = 0
+        self.pos_senses[<int>parts_of_speech.VERB] = 0
+        self.pos_senses[<int>parts_of_speech.X] = 0
+        self.pos_senses[<int>parts_of_speech.PUNCT] = 0
+        self.pos_senses[<int>parts_of_speech.EOL] = 0
+
+
+        cdef flags_t sense = 0  # loop variable for the mask-building loops below
+        for sense in range(N_Tops, V_body):  # noun senses: every id before V_body
+            self.pos_senses[<int>parts_of_speech.NOUN] |= (<flags_t>1) << sense
+
+        for sense in range(V_body, J_ppl):  # verb senses: V_body up to, not including, J_ppl
+            self.pos_senses[<int>parts_of_speech.VERB] |= (<flags_t>1) << sense
+
    def __call__(self, Tokens tokens):
        cdef atom_t[CONTEXT_SIZE] context
        cdef int i, guess, n_feats
-        cdef const TokenC* token
+        cdef TokenC* token
        for i in range(tokens.length):
            token = &tokens.data[i]
            if token.pos in (NOUN, VERB):
                fill_context(context, token)
                feats = self.extractor.get_feats(context, &n_feats)
                scores = self.model.get_scores(feats, n_feats)
-                tokens.data[i].sense = self.best_in_set(scores, POS_SENSES[<int>token.pos])
+                tokens.data[i].sense = self.best_in_set(scores, self.pos_senses[<int>token.pos])

    def train(self, Tokens tokens, GoldParse gold):
        cdef int i, j
+        cdef TokenC* token
        for i, ssenses in enumerate(gold.ssenses):
+            token = &tokens.data[i]
            if ssenses:
                gold.c.ssenses[i] = encode_sense_strs(ssenses)
            else:
-                gold.c.ssenses[i] = pos_senses(&tokens.data[i])
+                gold.c.ssenses[i] = token.lex.senses & self.pos_senses[<int>token.pos]
        
        cdef atom_t[CONTEXT_SIZE] context
        cdef int n_feats
@ -240,7 +244,7 @@ cdef class SenseTagger:
                fill_context(context, token)
                feats = self.extractor.get_feats(context, &n_feats)
                scores = self.model.get_scores(feats, n_feats)
-                token.sense = self.best_in_set(scores, POS_SENSES[<int>token.pos])
+                token.sense = self.best_in_set(scores, token.lex.senses)
                best = self.best_in_set(scores, gold.c.ssenses[i])
                guess_counts = {}
                gold_counts = {}
@ -251,7 +255,7 @@ cdef class SenseTagger:
                        feat = (f_i, f_key)
                        gold_counts[feat]  = gold_counts.get(feat, 0) + 1.0
                        guess_counts[feat] = guess_counts.get(feat, 0) - 1.0
-                #self.model.update({token.sense: guess_counts, best: gold_counts})
+                self.model.update({token.sense: guess_counts, best: gold_counts})
        return cost

    cdef int best_in_set(self, const weight_t* scores, flags_t senses) except -1:
@ -266,10 +270,6 @@ cdef class SenseTagger:
        return argmax


-cdef flags_t pos_senses(const TokenC* token) nogil:
-    return token.lex.senses & POS_SENSES[<int>token.pos]
-
-
 cdef list _set_bits(flags_t flags):
    bits = []
    cdef flags_t bit
--- a/spacy/senses.pxd
+++ b/spacy/senses.pxd
@ -4,13 +4,17 @@ from .typedefs cimport flags_t

 cpdef enum:
    NO_SENSE
+    J_all
+    J_pert
+    A_all
+    N_Tops
    N_act
    N_animal
    N_artifact
    N_attribute
    N_body
    N_cognition
-    N_communication
+    N_communication  
    N_event
    N_feeling
    N_food
@ -44,10 +48,8 @@ cpdef enum:
    V_social
    V_stative
    V_weather
+    J_ppl
    N_SENSES


-cdef flags_t[<int>parts_of_speech.N_UNIV_TAGS] POS_SENSES
-
-
 cdef flags_t encode_sense_strs(sense_names) except 0
--- a/spacy/senses.pyx
+++ b/spacy/senses.pyx
@ -2,74 +2,56 @@ from __future__ import unicode_literals
 cimport parts_of_speech


-POS_SENSES[<int>parts_of_speech.NO_TAG] = 0
-POS_SENSES[<int>parts_of_speech.ADJ] = 0
-POS_SENSES[<int>parts_of_speech.ADV] = 0
-POS_SENSES[<int>parts_of_speech.ADP] = 0
-POS_SENSES[<int>parts_of_speech.CONJ] = 0
-POS_SENSES[<int>parts_of_speech.DET] = 0
-POS_SENSES[<int>parts_of_speech.NOUN] = 0
-POS_SENSES[<int>parts_of_speech.NUM] = 0
-POS_SENSES[<int>parts_of_speech.PRON] = 0
-POS_SENSES[<int>parts_of_speech.PRT] = 0
-POS_SENSES[<int>parts_of_speech.VERB] = 0
-POS_SENSES[<int>parts_of_speech.X] = 0
-POS_SENSES[<int>parts_of_speech.PUNCT] = 0
-POS_SENSES[<int>parts_of_speech.EOL] = 0
+lexnames_str = """
+-1      NO_SENSE       -1
+00      J_all 3
+01      J_pert        3 
+02      A_all 4
+03      N_Tops       1  
+04      N_act        1
+05      N_animal     1
+06      N_artifact   1
+07      N_attribute  1
+08      N_body       1
+09      N_cognition  1
+10      N_communication      1
+11      N_event      1
+12      N_feeling    1
+13      N_food       1
+14      N_group      1
+15      N_location   1
+16      N_motive     1
+17      N_object     1
+18      N_person     1
+19      N_phenomenon 1
+20      N_plant      1
+21      N_possession 1
+22      N_process    1
+23      N_quantity   1
+24      N_relation   1
+25      N_shape      1
+26      N_state      1
+27      N_substance  1
+28      N_time       1
+29      V_body       2
+30      V_change     2
+31      V_cognition  2
+32      V_communication      2
+33      V_competition        2
+34      V_consumption        2
+35      V_contact    2
+36      V_creation   2
+37      V_emotion    2
+38      V_motion     2
+39      V_perception 2
+40      V_possession 2
+41      V_social     2
+42      V_stative    2
+43      V_weather    2
+44      J_ppl 3
+""".strip()  # rows: <id> <sense name> <category code>; names mirror the enum in senses.pxd (J_ = adj, A_ = adv)

-
-cdef int _sense = 0
-
-for _sense in range(N_act, V_body):
-    POS_SENSES[<int>parts_of_speech.NOUN] |= 1 << _sense
-
-for _sense in range(V_body, V_weather+1):
-    POS_SENSES[<int>parts_of_speech.VERB] |= 1 << _sense
-
-
-STRINGS = (
-    '-NO_SENSE-',
-    'N_act',
-    'N_animal',
-    'N_artifact',
-    'N_attribute',
-    'N_body',
-    'N_cognition',
-    'N_communication',
-    'N_event',
-    'N_feeling',
-    'N_food',
-    'N_group',
-    'N_location',
-    'N_motive',
-    'N_object',
-    'N_person',
-    'N_phenomenon',
-    'N_plant',
-    'N_possession',
-    'N_process',
-    'N_quantity',
-    'N_relation',
-    'N_shape',
-    'N_state',
-    'N_substance',
-    'N_time',
-    'V_body',
-    'V_change',
-    'V_cognition',
-    'V_communication',
-    'V_competition',
-    'V_consumption',
-    'V_contact',
-    'V_creation',
-    'V_emotion',
-    'V_motion',
-    'V_perception',
-    'V_possession',
-    'V_social',
-    'V_stative',
-    'V_weather'
-)
+STRINGS = tuple(line.split()[1] for line in lexnames_str.split('\n'))

 IDS = dict((sense_str, i) for i, sense_str in enumerate(STRINGS))

@ -80,8 +62,8 @@ cdef flags_t encode_sense_strs(sense_names) except 0:
        return sense_bits | (1 << NO_SENSE)
    cdef flags_t sense_id = 0
    for sense_str in sense_names:
-        if '.' in sense_str:
-            sense_str = sense_str[0].upper() + '_' + sense_str.split('.')[1]
+        sense_str = sense_str.replace('noun.', 'N_').replace('verb.', 'V_')  # 'noun.act' -> 'N_act', the IDS key form
+        sense_str = sense_str.replace('adj.', 'J_').replace('adv.', 'A_')  # already-converted names pass through unchanged
        sense_id = IDS[sense_str]
        sense_bits |= (1 << sense_id)
    return sense_bits
--- a/spacy/syntax/_parse_features.pyx
+++ b/spacy/syntax/_parse_features.pyx
@ -61,7 +61,7 @@ cdef inline void fill_token(atom_t* context, const TokenC* token) nogil:
        context[9] = token.lex.shape
        context[10] = token.ent_iob
        context[11] = token.ent_type
-        context[12] = token.lex.senses & senses.POS_SENSES[<int>token.pos]
+        context[12] = 0 # token.lex.senses & senses.POS_SENSES[<int>token.pos]

 cdef int fill_context(atom_t* ctxt, StateClass st) nogil:
    # Take care to fill every element of context!
--- a/spacy/tokens.pyx
+++ b/spacy/tokens.pyx
@ -93,7 +93,7 @@ cdef class Tokens:
        else:
            size = 5
        self.mem = Pool()
-        # Guarantee self.lex[i-x], for any i >= 0 and x < padding is in bounds
+        # Guarantee self.data[i-x], for any i >= 0 and x < padding is in bounds
        # However, we need to remember the true starting places, so that we can
        # realloc.
        data_start = <TokenC*>self.mem.alloc(size + (PADDING*2), sizeof(TokenC))