mirror of
				https://github.com/explosion/spaCy.git
				synced 2025-11-04 01:48:04 +03:00 
			
		
		
		
	* Hack on sense tagger
This commit is contained in:
		
							parent
							
								
									389dcd3fb2
								
							
						
					
					
						commit
						893b5fd42c
					
				| 
						 | 
					@ -1,19 +1,25 @@
 | 
				
			||||||
 | 
					from libc.string cimport memcpy
 | 
				
			||||||
 | 
					from cymem.cymem cimport Pool
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					from thinc.learner cimport LinearModel
 | 
				
			||||||
 | 
					from thinc.features cimport Extractor, Feature
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					from thinc.typedefs cimport atom_t, weight_t, feat_t
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
from .typedefs cimport flags_t
 | 
					from .typedefs cimport flags_t
 | 
				
			||||||
from .structs cimport TokenC
 | 
					from .structs cimport TokenC
 | 
				
			||||||
from .strings cimport StringStore
 | 
					from .strings cimport StringStore
 | 
				
			||||||
from .tokens cimport Tokens
 | 
					from .tokens cimport Tokens
 | 
				
			||||||
from .senses cimport N_SENSES, encode_sense_strs
 | 
					from .senses cimport N_SENSES, encode_sense_strs
 | 
				
			||||||
from .senses cimport N_Tops, J_ppl, V_body
 | 
					from .senses cimport NO_SENSE, N_Tops, J_ppl, V_body
 | 
				
			||||||
from .gold cimport GoldParse
 | 
					from .gold cimport GoldParse
 | 
				
			||||||
from .parts_of_speech cimport NOUN, VERB, N_UNIV_TAGS
 | 
					from .parts_of_speech cimport NOUN, VERB, N_UNIV_TAGS
 | 
				
			||||||
 | 
					
 | 
				
			||||||
from . cimport parts_of_speech
 | 
					from . cimport parts_of_speech
 | 
				
			||||||
 | 
					
 | 
				
			||||||
from thinc.learner cimport LinearModel
 | 
					 | 
				
			||||||
from thinc.features cimport Extractor
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
from thinc.typedefs cimport atom_t, weight_t, feat_t
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
from os import path
 | 
					from os import path
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
| 
						 | 
					@ -49,6 +55,9 @@ cdef enum:
 | 
				
			||||||
    N2c6
 | 
					    N2c6
 | 
				
			||||||
    N2c4
 | 
					    N2c4
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					    P1s
 | 
				
			||||||
 | 
					    P2s
 | 
				
			||||||
 | 
					    
 | 
				
			||||||
    CONTEXT_SIZE
 | 
					    CONTEXT_SIZE
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
| 
						 | 
					@ -77,14 +86,11 @@ unigrams = (
 | 
				
			||||||
    (P1c,),
 | 
					    (P1c,),
 | 
				
			||||||
    
 | 
					    
 | 
				
			||||||
    (N0p,),
 | 
					    (N0p,),
 | 
				
			||||||
    (N0W, N0p),
 | 
					 | 
				
			||||||
    (N0c, N0p),
 | 
					    (N0c, N0p),
 | 
				
			||||||
    (N0c6, N0p),
 | 
					    (N0c6, N0p),
 | 
				
			||||||
    (N0c4, N0p),
 | 
					    (N0c4, N0p),
 | 
				
			||||||
    (N0c,),
 | 
					    (N0c,),
 | 
				
			||||||
    (N0W,),
 | 
					 | 
				
			||||||
    (N0p,),
 | 
					    (N0p,),
 | 
				
			||||||
    (N0W, N0p),
 | 
					 | 
				
			||||||
    (N0c, N0p),
 | 
					    (N0c, N0p),
 | 
				
			||||||
    (N0c6, N0p),
 | 
					    (N0c6, N0p),
 | 
				
			||||||
    (N0c4, N0p),
 | 
					    (N0c4, N0p),
 | 
				
			||||||
| 
						 | 
					@ -117,6 +123,12 @@ unigrams = (
 | 
				
			||||||
    (N2c6, N2p),
 | 
					    (N2c6, N2p),
 | 
				
			||||||
    (N2c4, N2p),
 | 
					    (N2c4, N2p),
 | 
				
			||||||
    (N2c,),
 | 
					    (N2c,),
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					    (P1s,),
 | 
				
			||||||
 | 
					    (P2s,),
 | 
				
			||||||
 | 
					    (P1s, P2s,),
 | 
				
			||||||
 | 
					    (P1s, N0p),
 | 
				
			||||||
 | 
					    (P1s, P2s, N0c),
 | 
				
			||||||
)
 | 
					)
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
| 
						 | 
					@ -165,6 +177,44 @@ cdef int fill_context(atom_t* ctxt, const TokenC* token) except -1:
 | 
				
			||||||
 | 
					
 | 
				
			||||||
    fill_token(&ctxt[N1W], token + 1)
 | 
					    fill_token(&ctxt[N1W], token + 1)
 | 
				
			||||||
    fill_token(&ctxt[N2W], token + 2)
 | 
					    fill_token(&ctxt[N2W], token + 2)
 | 
				
			||||||
 | 
					    ctxt[P1s] = (token - 1).sense
 | 
				
			||||||
 | 
					    ctxt[P2s] = (token - 2).sense
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					cdef class FeatureVector:
 | 
				
			||||||
 | 
					    cdef Pool mem
 | 
				
			||||||
 | 
					    cdef Feature* c
 | 
				
			||||||
 | 
					    cdef list extractors
 | 
				
			||||||
 | 
					    cdef int length
 | 
				
			||||||
 | 
					    cdef int _max_length
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					    def __init__(self, length=100):
 | 
				
			||||||
 | 
					        self.mem = Pool()
 | 
				
			||||||
 | 
					        self.c = <Feature*>self.mem.alloc(length, sizeof(Feature))
 | 
				
			||||||
 | 
					        self.length = 0
 | 
				
			||||||
 | 
					        self._max_length = length
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					    def __len__(self):
 | 
				
			||||||
 | 
					        return self.length
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					    cpdef int add(self, feat_t key, weight_t value) except -1:
 | 
				
			||||||
 | 
					        if self.length == self._max_length:
 | 
				
			||||||
 | 
					            self._max_length *= 2
 | 
				
			||||||
 | 
					            self.c = <Feature*>self.mem.realloc(self.c, self._max_length * sizeof(Feature))
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					        self.c[self.length] = Feature(i=0, key=key, value=value)
 | 
				
			||||||
 | 
					        self.length += 1
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					    cdef int extend(self, const Feature* new_feats, int n_feats) except -1:
 | 
				
			||||||
 | 
					        new_length = self.length + n_feats
 | 
				
			||||||
 | 
					        if new_length >= self._max_length:
 | 
				
			||||||
 | 
					            self._max_length = 2 * new_length
 | 
				
			||||||
 | 
					            self.c = <Feature*>self.mem.realloc(self.c, new_length * sizeof(Feature))
 | 
				
			||||||
 | 
					        memcpy(&self.c[self.length], new_feats, n_feats * sizeof(Feature))
 | 
				
			||||||
 | 
					        self.length += n_feats
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					    def clear(self):
 | 
				
			||||||
 | 
					        self.length = 0
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
cdef class SenseTagger:
 | 
					cdef class SenseTagger:
 | 
				
			||||||
| 
						 | 
					@ -201,25 +251,34 @@ cdef class SenseTagger:
 | 
				
			||||||
        self.pos_senses[<int>parts_of_speech.PUNCT] = 0
 | 
					        self.pos_senses[<int>parts_of_speech.PUNCT] = 0
 | 
				
			||||||
        self.pos_senses[<int>parts_of_speech.EOL] = 0
 | 
					        self.pos_senses[<int>parts_of_speech.EOL] = 0
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					 | 
				
			||||||
        cdef flags_t sense = 0
 | 
					        cdef flags_t sense = 0
 | 
				
			||||||
        for _sense in range(N_Tops, V_body):
 | 
					        for sense in range(N_Tops, V_body):
 | 
				
			||||||
            self.pos_senses[<int>parts_of_speech.NOUN] |= 1 << sense
 | 
					            self.pos_senses[<int>parts_of_speech.NOUN] |= 1 << sense
 | 
				
			||||||
 | 
					
 | 
				
			||||||
        for _sense in range(V_body, J_ppl):
 | 
					        for sense in range(V_body, J_ppl):
 | 
				
			||||||
            self.pos_senses[<int>parts_of_speech.VERB] |= 1 << sense
 | 
					            self.pos_senses[<int>parts_of_speech.VERB] |= 1 << sense
 | 
				
			||||||
 | 
					
 | 
				
			||||||
    def __call__(self, Tokens tokens):
 | 
					    def __call__(self, Tokens tokens):
 | 
				
			||||||
        cdef atom_t[CONTEXT_SIZE] context
 | 
					        cdef atom_t[CONTEXT_SIZE] local_context
 | 
				
			||||||
        cdef int i, guess, n_feats
 | 
					        cdef int i, guess, n_feats
 | 
				
			||||||
 | 
					        cdef flags_t valid_senses = 0
 | 
				
			||||||
        cdef TokenC* token
 | 
					        cdef TokenC* token
 | 
				
			||||||
 | 
					        cdef FeatureVector features = FeatureVector(100)
 | 
				
			||||||
        for i in range(tokens.length):
 | 
					        for i in range(tokens.length):
 | 
				
			||||||
            token = &tokens.data[i]
 | 
					            token = &tokens.data[i]
 | 
				
			||||||
            if token.pos in (NOUN, VERB):
 | 
					            if token.lex.senses == 1:
 | 
				
			||||||
                fill_context(context, token)
 | 
					                continue
 | 
				
			||||||
                feats = self.extractor.get_feats(context, &n_feats)
 | 
					            assert not (token.lex.senses & (1 << NO_SENSE)), (tokens[i].orth_, token.lex.senses)
 | 
				
			||||||
                scores = self.model.get_scores(feats, n_feats)
 | 
					            assert not (self.pos_senses[<int>token.pos] & (1 << NO_SENSE))
 | 
				
			||||||
                tokens.data[i].sense = self.best_in_set(scores, self.pos_senses[<int>token.pos])
 | 
					            valid_senses = token.lex.senses & self.pos_senses[<int>token.pos]
 | 
				
			||||||
 | 
					            assert not (valid_senses & (1 << NO_SENSE))
 | 
				
			||||||
 | 
					            if valid_senses:
 | 
				
			||||||
 | 
					                fill_context(local_context, token)
 | 
				
			||||||
 | 
					                local_feats = self.extractor.get_feats(local_context, &n_feats)
 | 
				
			||||||
 | 
					                features.extend(local_feats, n_feats)
 | 
				
			||||||
 | 
					                scores = self.model.get_scores(features.c, features.length)
 | 
				
			||||||
 | 
					                tokens.data[i].sense = self.best_in_set(scores, valid_senses)
 | 
				
			||||||
 | 
					                features.clear()
 | 
				
			||||||
 | 
					
 | 
				
			||||||
    def train(self, Tokens tokens, GoldParse gold):
 | 
					    def train(self, Tokens tokens, GoldParse gold):
 | 
				
			||||||
        cdef int i, j
 | 
					        cdef int i, j
 | 
				
			||||||
| 
						 | 
					@ -228,8 +287,10 @@ cdef class SenseTagger:
 | 
				
			||||||
            token = &tokens.data[i]
 | 
					            token = &tokens.data[i]
 | 
				
			||||||
            if ssenses:
 | 
					            if ssenses:
 | 
				
			||||||
                gold.c.ssenses[i] = encode_sense_strs(ssenses)
 | 
					                gold.c.ssenses[i] = encode_sense_strs(ssenses)
 | 
				
			||||||
            else:
 | 
					            elif token.lex.senses >= 2 and token.pos in (NOUN, VERB):
 | 
				
			||||||
                gold.c.ssenses[i] = token.lex.senses & self.pos_senses[<int>token.pos]
 | 
					                gold.c.ssenses[i] = token.lex.senses & self.pos_senses[<int>token.pos]
 | 
				
			||||||
 | 
					            else:
 | 
				
			||||||
 | 
					                gold.c.ssenses[i] = 0
 | 
				
			||||||
        
 | 
					        
 | 
				
			||||||
        cdef atom_t[CONTEXT_SIZE] context
 | 
					        cdef atom_t[CONTEXT_SIZE] context
 | 
				
			||||||
        cdef int n_feats
 | 
					        cdef int n_feats
 | 
				
			||||||
| 
						 | 
					@ -244,7 +305,7 @@ cdef class SenseTagger:
 | 
				
			||||||
                fill_context(context, token)
 | 
					                fill_context(context, token)
 | 
				
			||||||
                feats = self.extractor.get_feats(context, &n_feats)
 | 
					                feats = self.extractor.get_feats(context, &n_feats)
 | 
				
			||||||
                scores = self.model.get_scores(feats, n_feats)
 | 
					                scores = self.model.get_scores(feats, n_feats)
 | 
				
			||||||
                token.sense = self.best_in_set(scores, token.lex.senses)
 | 
					                token.sense = self.best_in_set(scores, self.pos_senses[<int>token.pos])
 | 
				
			||||||
                best = self.best_in_set(scores, gold.c.ssenses[i])
 | 
					                best = self.best_in_set(scores, gold.c.ssenses[i])
 | 
				
			||||||
                guess_counts = {}
 | 
					                guess_counts = {}
 | 
				
			||||||
                gold_counts = {}
 | 
					                gold_counts = {}
 | 
				
			||||||
| 
						 | 
					
 | 
				
			||||||
		Loading…
	
		Reference in New Issue
	
	Block a user