mirror of
				https://github.com/explosion/spaCy.git
				synced 2025-11-04 09:57:26 +03:00 
			
		
		
		
	* Refactor symbols, so that frequency rank can be derived from the orth id of a word.
This commit is contained in:
		
							parent
							
								
									08e29519a6
								
							
						
					
					
						commit
						064bd69ad0
					
				| 
						 | 
				
			
			@ -168,6 +168,11 @@ def setup_vocab(get_lex_attr, tag_map, src_dir, dst_dir):
 | 
			
		|||
            probs[word] = oov_prob
 | 
			
		||||
 | 
			
		||||
    lexicon = []
 | 
			
		||||
    for word, prob in reversed(sorted(list(probs.items()), key=lambda item: item[1])):
 | 
			
		||||
        # First encode the strings into the StringStore. This way, we can map
 | 
			
		||||
        # the orth IDs to frequency ranks
 | 
			
		||||
        orth = vocab.strings[word]
 | 
			
		||||
    # Now actually load the vocab
 | 
			
		||||
    for word, prob in reversed(sorted(list(probs.items()), key=lambda item: item[1])):
 | 
			
		||||
        lexeme = vocab[word]
 | 
			
		||||
        lexeme.prob = prob
 | 
			
		||||
| 
						 | 
				
			
			
 | 
			
		|||
							
								
								
									
										3
									
								
								setup.py
									
									
									
									
									
								
							
							
						
						
									
										3
									
								
								setup.py
									
									
									
									
									
								
							| 
						 | 
				
			
			@ -165,7 +165,8 @@ MOD_NAMES = ['spacy.parts_of_speech', 'spacy.strings',
 | 
			
		|||
             'spacy.tokens.doc', 'spacy.tokens.spans', 'spacy.tokens.token',
 | 
			
		||||
             'spacy.serialize.packer', 'spacy.serialize.huffman', 'spacy.serialize.bits',
 | 
			
		||||
             'spacy.cfile', 'spacy.matcher',
 | 
			
		||||
             'spacy.syntax.ner']
 | 
			
		||||
             'spacy.syntax.ner',
 | 
			
		||||
             'spacy.symbols']
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
if __name__ == '__main__':
 | 
			
		||||
| 
						 | 
				
			
			
 | 
			
		|||
| 
						 | 
				
			
			@ -1,5 +1,6 @@
 | 
			
		|||
# Reserve 64 values for flag features
 | 
			
		||||
cpdef enum attr_id_t:
 | 
			
		||||
    NULL_ATTR
 | 
			
		||||
    IS_ALPHA
 | 
			
		||||
    IS_ASCII
 | 
			
		||||
    IS_DIGIT
 | 
			
		||||
| 
						 | 
				
			
			@ -14,8 +15,7 @@ cpdef enum attr_id_t:
 | 
			
		|||
    IS_STOP
 | 
			
		||||
    IS_OOV
 | 
			
		||||
    
 | 
			
		||||
    FLAG13 = 13
 | 
			
		||||
    FLAG14
 | 
			
		||||
    FLAG14 = 14
 | 
			
		||||
    FLAG15
 | 
			
		||||
    FLAG16
 | 
			
		||||
    FLAG17
 | 
			
		||||
| 
						 | 
				
			
			
 | 
			
		|||
| 
						 | 
				
			
			@ -0,0 +1,90 @@
 | 
			
		|||
ATTR_IDS = {
 | 
			
		||||
    "NULL_ATTR": NULL_ATTR,
 | 
			
		||||
    "IS_ALPHA": IS_ALPHA,
 | 
			
		||||
    "IS_ASCII": IS_ASCII,
 | 
			
		||||
    "IS_DIGIT": IS_DIGIT,
 | 
			
		||||
    "IS_LOWER": IS_LOWER,
 | 
			
		||||
    "IS_PUNCT": IS_PUNCT,
 | 
			
		||||
    "IS_SPACE": IS_SPACE,
 | 
			
		||||
    "IS_TITLE": IS_TITLE,
 | 
			
		||||
    "IS_UPPER": IS_UPPER,
 | 
			
		||||
    "LIKE_URL": LIKE_URL,
 | 
			
		||||
    "LIKE_NUM": LIKE_NUM,
 | 
			
		||||
    "LIKE_EMAIL": LIKE_EMAIL,
 | 
			
		||||
    "IS_STOP": IS_STOP,
 | 
			
		||||
    "IS_OOV": IS_OOV,
 | 
			
		||||
 | 
			
		||||
    "FLAG14": FLAG14,
 | 
			
		||||
    "FLAG15": FLAG15,
 | 
			
		||||
    "FLAG16": FLAG16,
 | 
			
		||||
    "FLAG17": FLAG17,
 | 
			
		||||
    "FLAG18": FLAG18,
 | 
			
		||||
    "FLAG19": FLAG19,
 | 
			
		||||
    "FLAG20": FLAG20,
 | 
			
		||||
    "FLAG21": FLAG21,
 | 
			
		||||
    "FLAG22": FLAG22,
 | 
			
		||||
    "FLAG23": FLAG23,
 | 
			
		||||
    "FLAG24": FLAG24,
 | 
			
		||||
    "FLAG25": FLAG25,
 | 
			
		||||
    "FLAG26": FLAG26,
 | 
			
		||||
    "FLAG27": FLAG27,
 | 
			
		||||
    "FLAG28": FLAG28,
 | 
			
		||||
    "FLAG29": FLAG29,
 | 
			
		||||
    "FLAG30": FLAG30,
 | 
			
		||||
    "FLAG31": FLAG31,
 | 
			
		||||
    "FLAG32": FLAG32,
 | 
			
		||||
    "FLAG33": FLAG33,
 | 
			
		||||
    "FLAG34": FLAG34,
 | 
			
		||||
    "FLAG35": FLAG35,
 | 
			
		||||
    "FLAG36": FLAG36,
 | 
			
		||||
    "FLAG37": FLAG37,
 | 
			
		||||
    "FLAG38": FLAG38,
 | 
			
		||||
    "FLAG39": FLAG39,
 | 
			
		||||
    "FLAG40": FLAG40,
 | 
			
		||||
    "FLAG41": FLAG41,
 | 
			
		||||
    "FLAG42": FLAG42,
 | 
			
		||||
    "FLAG43": FLAG43,
 | 
			
		||||
    "FLAG44": FLAG44,
 | 
			
		||||
    "FLAG45": FLAG45,
 | 
			
		||||
    "FLAG46": FLAG46,
 | 
			
		||||
    "FLAG47": FLAG47,
 | 
			
		||||
    "FLAG48": FLAG48,
 | 
			
		||||
    "FLAG49": FLAG49,
 | 
			
		||||
    "FLAG50": FLAG50,
 | 
			
		||||
    "FLAG51": FLAG51,
 | 
			
		||||
    "FLAG52": FLAG52,
 | 
			
		||||
    "FLAG53": FLAG53,
 | 
			
		||||
    "FLAG54": FLAG54,
 | 
			
		||||
    "FLAG55": FLAG55,
 | 
			
		||||
    "FLAG56": FLAG56,
 | 
			
		||||
    "FLAG57": FLAG57,
 | 
			
		||||
    "FLAG58": FLAG58,
 | 
			
		||||
    "FLAG59": FLAG59,
 | 
			
		||||
    "FLAG60": FLAG60,
 | 
			
		||||
    "FLAG61": FLAG61,
 | 
			
		||||
    "FLAG62": FLAG62,
 | 
			
		||||
    "FLAG63": FLAG63,
 | 
			
		||||
 | 
			
		||||
    "ID": ID,
 | 
			
		||||
    "ORTH": ORTH,
 | 
			
		||||
    "LOWER": LOWER,
 | 
			
		||||
    "NORM": NORM,
 | 
			
		||||
    "SHAPE": SHAPE,
 | 
			
		||||
    "PREFIX": PREFIX,
 | 
			
		||||
    "SUFFIX": SUFFIX,
 | 
			
		||||
 | 
			
		||||
    "LENGTH": LENGTH,
 | 
			
		||||
    "CLUSTER": CLUSTER,
 | 
			
		||||
    "LEMMA": LEMMA,
 | 
			
		||||
    "POS": POS,
 | 
			
		||||
    "TAG": TAG,
 | 
			
		||||
    "DEP": DEP,
 | 
			
		||||
    "ENT_IOB": ENT_IOB,
 | 
			
		||||
    "ENT_TYPE": ENT_TYPE,
 | 
			
		||||
    "HEAD": HEAD,
 | 
			
		||||
    "SPACY": SPACY,
 | 
			
		||||
    "PROB": PROB,
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
# Attribute names, ordered by the numeric value of their attr ID
 | 
			
		||||
# Iterating a dict yields its keys; order them by the attr ID each one maps to.
ATTR_NAMES = sorted(ATTR_IDS, key=ATTR_IDS.get)
 | 
			
		||||
| 
						 | 
				
			
			@ -15,7 +15,7 @@ from libcpp.vector cimport vector
 | 
			
		|||
from murmurhash.mrmr cimport hash64
 | 
			
		||||
 | 
			
		||||
from .attrs cimport LENGTH, ENT_TYPE, ORTH, NORM, LEMMA, LOWER, SHAPE
 | 
			
		||||
from .attrs cimport FLAG13, FLAG14, FLAG15, FLAG16, FLAG17, FLAG18, FLAG19, FLAG20, FLAG21, FLAG22, FLAG23, FLAG24, FLAG25
 | 
			
		||||
from .attrs cimport FLAG14, FLAG15, FLAG16, FLAG17, FLAG18, FLAG19, FLAG20, FLAG21, FLAG22, FLAG23, FLAG24, FLAG25
 | 
			
		||||
from .tokens.doc cimport get_token_attr
 | 
			
		||||
from .tokens.doc cimport Doc
 | 
			
		||||
from .vocab cimport Vocab
 | 
			
		||||
| 
						 | 
				
			
			
 | 
			
		|||
| 
						 | 
				
			
			@ -1,23 +1,24 @@
 | 
			
		|||
# Google universal tag set
 | 
			
		||||
from .symbols cimport *
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
cpdef enum univ_pos_t:
 | 
			
		||||
    NO_TAG
 | 
			
		||||
    ADJ
 | 
			
		||||
    ADP
 | 
			
		||||
    ADV
 | 
			
		||||
    AUX
 | 
			
		||||
    CONJ
 | 
			
		||||
    DET
 | 
			
		||||
    INTJ
 | 
			
		||||
    NOUN
 | 
			
		||||
    NUM
 | 
			
		||||
    PART
 | 
			
		||||
    PRON
 | 
			
		||||
    PROPN
 | 
			
		||||
    PUNCT
 | 
			
		||||
    SCONJ
 | 
			
		||||
    SYM
 | 
			
		||||
    VERB
 | 
			
		||||
    X
 | 
			
		||||
    EOL
 | 
			
		||||
    SPACE
 | 
			
		||||
    N_UNIV_TAGS
 | 
			
		||||
    NO_TAG = EMPTY_VALUE
 | 
			
		||||
    ADJ = POS_adj
 | 
			
		||||
    ADP = POS_adp
 | 
			
		||||
    ADV = POS_adv
 | 
			
		||||
    AUX = POS_aux
 | 
			
		||||
    CONJ = POS_conj
 | 
			
		||||
    DET = POS_det
 | 
			
		||||
    INTJ = POS_intj
 | 
			
		||||
    NOUN = POS_noun
 | 
			
		||||
    NUM = POS_num
 | 
			
		||||
    PART = POS_part
 | 
			
		||||
    PRON = POS_pron
 | 
			
		||||
    PROPN = POS_propn
 | 
			
		||||
    PUNCT = POS_punct
 | 
			
		||||
    SCONJ = POS_sconj
 | 
			
		||||
    SYM = POS_sym
 | 
			
		||||
    VERB = POS_verb
 | 
			
		||||
    X = POS_x
 | 
			
		||||
    EOL = POS_eol
 | 
			
		||||
    SPACE = POS_space
 | 
			
		||||
| 
						 | 
				
			
			
 | 
			
		|||
| 
						 | 
				
			
			@ -67,6 +67,21 @@ cdef class Vocab:
 | 
			
		|||
        self._by_hash = PreshMap()
 | 
			
		||||
        self._by_orth = PreshMap()
 | 
			
		||||
        self.strings = StringStore()
 | 
			
		||||
        # Load strings in a special order, so that we have an onset number for
 | 
			
		||||
        # the vocabulary. This way, when words are added in order, the orth ID
 | 
			
		||||
        # is the frequency rank of the word, plus a certain offset. The structural
 | 
			
		||||
        # strings are loaded first, because the vocab is open-class, and these
 | 
			
		||||
        # symbols are closed class.
 | 
			
		||||
        #for attr_name in sorted(ATTR_NAMES):
 | 
			
		||||
        #    _ = self.strings[attr_name]
 | 
			
		||||
        #for univ_pos_name in sorted(UNIV_POS_NAMES.keys()):
 | 
			
		||||
        #    _ = self.strings[univ_pos_name]
 | 
			
		||||
        #for morph_name in sorted(UNIV_MORPH_NAMES.keys()):
 | 
			
		||||
        #    _ = self.strings[morph_name]
 | 
			
		||||
        #for entity_type_name in sorted(ENTITY_TYPES.keys()):
 | 
			
		||||
        #    _ = self.strings[entity_type_name]
 | 
			
		||||
        #for tag_name in sorted(TAG_MAP.keys()):
 | 
			
		||||
        #    _ = self.strings[tag_name]
 | 
			
		||||
        self.get_lex_attr = get_lex_attr
 | 
			
		||||
        self.morphology = Morphology(self.strings, tag_map, lemmatizer)
 | 
			
		||||
        self.serializer_freqs = serializer_freqs
 | 
			
		||||
| 
						 | 
				
			
			
 | 
			
		|||
		Loading…
	
		Reference in New Issue
	
	Block a user