mirror of
				https://github.com/explosion/spaCy.git
				synced 2025-11-04 01:48:04 +03:00 
			
		
		
		
	* Refactor symbols, so that frequency rank can be derived from the orth id of a word.
This commit is contained in:
		
							parent
							
								
									08e29519a6
								
							
						
					
					
						commit
						064bd69ad0
					
				| 
						 | 
					@ -168,6 +168,11 @@ def setup_vocab(get_lex_attr, tag_map, src_dir, dst_dir):
 | 
				
			||||||
            probs[word] = oov_prob
 | 
					            probs[word] = oov_prob
 | 
				
			||||||
 | 
					
 | 
				
			||||||
    lexicon = []
 | 
					    lexicon = []
 | 
				
			||||||
 | 
					    for word, prob in reversed(sorted(list(probs.items()), key=lambda item: item[1])):
 | 
				
			||||||
 | 
					        # First encode the strings into the StringStore. This way, we can map
 | 
				
			||||||
 | 
					        # the orth IDs to frequency ranks
 | 
				
			||||||
 | 
					        orth = vocab.strings[word]
 | 
				
			||||||
 | 
					    # Now actually load the vocab
 | 
				
			||||||
    for word, prob in reversed(sorted(list(probs.items()), key=lambda item: item[1])):
 | 
					    for word, prob in reversed(sorted(list(probs.items()), key=lambda item: item[1])):
 | 
				
			||||||
        lexeme = vocab[word]
 | 
					        lexeme = vocab[word]
 | 
				
			||||||
        lexeme.prob = prob
 | 
					        lexeme.prob = prob
 | 
				
			||||||
| 
						 | 
					
 | 
				
			||||||
							
								
								
									
										3
									
								
								setup.py
									
									
									
									
									
								
							
							
						
						
									
										3
									
								
								setup.py
									
									
									
									
									
								
							| 
						 | 
					@ -165,7 +165,8 @@ MOD_NAMES = ['spacy.parts_of_speech', 'spacy.strings',
 | 
				
			||||||
             'spacy.tokens.doc', 'spacy.tokens.spans', 'spacy.tokens.token',
 | 
					             'spacy.tokens.doc', 'spacy.tokens.spans', 'spacy.tokens.token',
 | 
				
			||||||
             'spacy.serialize.packer', 'spacy.serialize.huffman', 'spacy.serialize.bits',
 | 
					             'spacy.serialize.packer', 'spacy.serialize.huffman', 'spacy.serialize.bits',
 | 
				
			||||||
             'spacy.cfile', 'spacy.matcher',
 | 
					             'spacy.cfile', 'spacy.matcher',
 | 
				
			||||||
             'spacy.syntax.ner']
 | 
					             'spacy.syntax.ner',
 | 
				
			||||||
 | 
					             'spacy.symbols']
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
if __name__ == '__main__':
 | 
					if __name__ == '__main__':
 | 
				
			||||||
| 
						 | 
					
 | 
				
			||||||
| 
						 | 
					@ -1,5 +1,6 @@
 | 
				
			||||||
# Reserve 64 values for flag features
 | 
					# Reserve 64 values for flag features
 | 
				
			||||||
cpdef enum attr_id_t:
 | 
					cpdef enum attr_id_t:
 | 
				
			||||||
 | 
					    NULL_ATTR
 | 
				
			||||||
    IS_ALPHA
 | 
					    IS_ALPHA
 | 
				
			||||||
    IS_ASCII
 | 
					    IS_ASCII
 | 
				
			||||||
    IS_DIGIT
 | 
					    IS_DIGIT
 | 
				
			||||||
| 
						 | 
					@ -14,8 +15,7 @@ cpdef enum attr_id_t:
 | 
				
			||||||
    IS_STOP
 | 
					    IS_STOP
 | 
				
			||||||
    IS_OOV
 | 
					    IS_OOV
 | 
				
			||||||
    
 | 
					    
 | 
				
			||||||
    FLAG13 = 13
 | 
					    FLAG14 = 14
 | 
				
			||||||
    FLAG14
 | 
					 | 
				
			||||||
    FLAG15
 | 
					    FLAG15
 | 
				
			||||||
    FLAG16
 | 
					    FLAG16
 | 
				
			||||||
    FLAG17
 | 
					    FLAG17
 | 
				
			||||||
| 
						 | 
					
 | 
				
			||||||
| 
						 | 
					@ -0,0 +1,90 @@
 | 
				
			||||||
 | 
					ATTR_IDS = {
 | 
				
			||||||
 | 
					    "NULL_ATTR": NULL_ATTR,
 | 
				
			||||||
 | 
					    "IS_ALPHA": IS_ALPHA,
 | 
				
			||||||
 | 
					    "IS_ASCII": IS_ASCII,
 | 
				
			||||||
 | 
					    "IS_DIGIT": IS_DIGIT,
 | 
				
			||||||
 | 
					    "IS_LOWER": IS_LOWER,
 | 
				
			||||||
 | 
					    "IS_PUNCT": IS_PUNCT,
 | 
				
			||||||
 | 
					    "IS_SPACE": IS_SPACE,
 | 
				
			||||||
 | 
					    "IS_TITLE": IS_TITLE,
 | 
				
			||||||
 | 
					    "IS_UPPER": IS_UPPER,
 | 
				
			||||||
 | 
					    "LIKE_URL": LIKE_URL,
 | 
				
			||||||
 | 
					    "LIKE_NUM": LIKE_NUM,
 | 
				
			||||||
 | 
					    "LIKE_EMAIL": LIKE_EMAIL,
 | 
				
			||||||
 | 
					    "IS_STOP": IS_STOP,
 | 
				
			||||||
 | 
					    "IS_OOV": IS_OOV,
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					    "FLAG14": FLAG14,
 | 
				
			||||||
 | 
					    "FLAG15": FLAG15,
 | 
				
			||||||
 | 
					    "FLAG16": FLAG16,
 | 
				
			||||||
 | 
					    "FLAG17": FLAG17,
 | 
				
			||||||
 | 
					    "FLAG18": FLAG18,
 | 
				
			||||||
 | 
					    "FLAG19": FLAG19,
 | 
				
			||||||
 | 
					    "FLAG20": FLAG20,
 | 
				
			||||||
 | 
					    "FLAG21": FLAG21,
 | 
				
			||||||
 | 
					    "FLAG22": FLAG22,
 | 
				
			||||||
 | 
					    "FLAG23": FLAG23,
 | 
				
			||||||
 | 
					    "FLAG24": FLAG24,
 | 
				
			||||||
 | 
					    "FLAG25": FLAG25,
 | 
				
			||||||
 | 
					    "FLAG26": FLAG26,
 | 
				
			||||||
 | 
					    "FLAG27": FLAG27,
 | 
				
			||||||
 | 
					    "FLAG28": FLAG28,
 | 
				
			||||||
 | 
					    "FLAG29": FLAG29,
 | 
				
			||||||
 | 
					    "FLAG30": FLAG30,
 | 
				
			||||||
 | 
					    "FLAG31": FLAG31,
 | 
				
			||||||
 | 
					    "FLAG32": FLAG32,
 | 
				
			||||||
 | 
					    "FLAG33": FLAG33,
 | 
				
			||||||
 | 
					    "FLAG34": FLAG34,
 | 
				
			||||||
 | 
					    "FLAG35": FLAG35,
 | 
				
			||||||
 | 
					    "FLAG36": FLAG36,
 | 
				
			||||||
 | 
					    "FLAG37": FLAG37,
 | 
				
			||||||
 | 
					    "FLAG38": FLAG38,
 | 
				
			||||||
 | 
					    "FLAG39": FLAG39,
 | 
				
			||||||
 | 
					    "FLAG40": FLAG40,
 | 
				
			||||||
 | 
					    "FLAG41": FLAG41,
 | 
				
			||||||
 | 
					    "FLAG42": FLAG42,
 | 
				
			||||||
 | 
					    "FLAG43": FLAG43,
 | 
				
			||||||
 | 
					    "FLAG44": FLAG44,
 | 
				
			||||||
 | 
					    "FLAG45": FLAG45,
 | 
				
			||||||
 | 
					    "FLAG46": FLAG46,
 | 
				
			||||||
 | 
					    "FLAG47": FLAG47,
 | 
				
			||||||
 | 
					    "FLAG48": FLAG48,
 | 
				
			||||||
 | 
					    "FLAG49": FLAG49,
 | 
				
			||||||
 | 
					    "FLAG50": FLAG50,
 | 
				
			||||||
 | 
					    "FLAG51": FLAG51,
 | 
				
			||||||
 | 
					    "FLAG52": FLAG52,
 | 
				
			||||||
 | 
					    "FLAG53": FLAG53,
 | 
				
			||||||
 | 
					    "FLAG54": FLAG54,
 | 
				
			||||||
 | 
					    "FLAG55": FLAG55,
 | 
				
			||||||
 | 
					    "FLAG56": FLAG56,
 | 
				
			||||||
 | 
					    "FLAG57": FLAG57,
 | 
				
			||||||
 | 
					    "FLAG58": FLAG58,
 | 
				
			||||||
 | 
					    "FLAG59": FLAG59,
 | 
				
			||||||
 | 
					    "FLAG60": FLAG60,
 | 
				
			||||||
 | 
					    "FLAG61": FLAG61,
 | 
				
			||||||
 | 
					    "FLAG62": FLAG62,
 | 
				
			||||||
 | 
					    "FLAG63": FLAG63,
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					    "ID": ID,
 | 
				
			||||||
 | 
					    "ORTH": ORTH,
 | 
				
			||||||
 | 
					    "LOWER": LOWER,
 | 
				
			||||||
 | 
					    "NORM": NORM,
 | 
				
			||||||
 | 
					    "SHAPE": SHAPE,
 | 
				
			||||||
 | 
					    "PREFIX": PREFIX,
 | 
				
			||||||
 | 
					    "SUFFIX": SUFFIX,
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					    "LENGTH": LENGTH,
 | 
				
			||||||
 | 
					    "CLUSTER": CLUSTER,
 | 
				
			||||||
 | 
					    "LEMMA": LEMMA,
 | 
				
			||||||
 | 
					    "POS": POS,
 | 
				
			||||||
 | 
					    "TAG": TAG,
 | 
				
			||||||
 | 
					    "DEP": DEP,
 | 
				
			||||||
 | 
					    "ENT_IOB": ENT_IOB,
 | 
				
			||||||
 | 
					    "ENT_TYPE": ENT_TYPE,
 | 
				
			||||||
 | 
					    "HEAD": HEAD,
 | 
				
			||||||
 | 
					    "SPACY": SPACY,
 | 
				
			||||||
 | 
					    "PROB": PROB,
 | 
				
			||||||
 | 
					}
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					# ATTR IDs, in order of the symbol
 | 
				
			||||||
 | 
					ATTR_NAMES = [key for key, value in sorted(ATTR_IDS.items(), key=lambda item: item[1])]
 | 
				
			||||||
| 
						 | 
					@ -15,7 +15,7 @@ from libcpp.vector cimport vector
 | 
				
			||||||
from murmurhash.mrmr cimport hash64
 | 
					from murmurhash.mrmr cimport hash64
 | 
				
			||||||
 | 
					
 | 
				
			||||||
from .attrs cimport LENGTH, ENT_TYPE, ORTH, NORM, LEMMA, LOWER, SHAPE
 | 
					from .attrs cimport LENGTH, ENT_TYPE, ORTH, NORM, LEMMA, LOWER, SHAPE
 | 
				
			||||||
from .attrs cimport FLAG13, FLAG14, FLAG15, FLAG16, FLAG17, FLAG18, FLAG19, FLAG20, FLAG21, FLAG22, FLAG23, FLAG24, FLAG25
 | 
					from .attrs cimport FLAG14, FLAG15, FLAG16, FLAG17, FLAG18, FLAG19, FLAG20, FLAG21, FLAG22, FLAG23, FLAG24, FLAG25
 | 
				
			||||||
from .tokens.doc cimport get_token_attr
 | 
					from .tokens.doc cimport get_token_attr
 | 
				
			||||||
from .tokens.doc cimport Doc
 | 
					from .tokens.doc cimport Doc
 | 
				
			||||||
from .vocab cimport Vocab
 | 
					from .vocab cimport Vocab
 | 
				
			||||||
| 
						 | 
					
 | 
				
			||||||
| 
						 | 
					@ -1,23 +1,24 @@
 | 
				
			||||||
# Google universal tag set
 | 
					from .symbols cimport *
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
cpdef enum univ_pos_t:
 | 
					cpdef enum univ_pos_t:
 | 
				
			||||||
    NO_TAG
 | 
					    NO_TAG = EMPTY_VALUE
 | 
				
			||||||
    ADJ
 | 
					    ADJ = POS_adj
 | 
				
			||||||
    ADP
 | 
					    ADP = POS_adp
 | 
				
			||||||
    ADV
 | 
					    ADV = POS_adv
 | 
				
			||||||
    AUX
 | 
					    AUX = POS_aux
 | 
				
			||||||
    CONJ
 | 
					    CONJ = POS_conj
 | 
				
			||||||
    DET
 | 
					    DET = POS_det
 | 
				
			||||||
    INTJ
 | 
					    INTJ = POS_intj
 | 
				
			||||||
    NOUN
 | 
					    NOUN = POS_noun
 | 
				
			||||||
    NUM
 | 
					    NUM = POS_num
 | 
				
			||||||
    PART
 | 
					    PART = POS_part
 | 
				
			||||||
    PRON
 | 
					    PRON = POS_pron
 | 
				
			||||||
    PROPN
 | 
					    PROPN = POS_propn
 | 
				
			||||||
    PUNCT
 | 
					    PUNCT = POS_punct
 | 
				
			||||||
    SCONJ
 | 
					    SCONJ = POS_sconj
 | 
				
			||||||
    SYM
 | 
					    SYM = POS_sym
 | 
				
			||||||
    VERB
 | 
					    VERB = POS_verb
 | 
				
			||||||
    X
 | 
					    X = POS_x
 | 
				
			||||||
    EOL
 | 
					    EOL = POS_eol
 | 
				
			||||||
    SPACE
 | 
					    SPACE = POS_space
 | 
				
			||||||
    N_UNIV_TAGS
 | 
					 | 
				
			||||||
| 
						 | 
					
 | 
				
			||||||
| 
						 | 
					@ -67,6 +67,21 @@ cdef class Vocab:
 | 
				
			||||||
        self._by_hash = PreshMap()
 | 
					        self._by_hash = PreshMap()
 | 
				
			||||||
        self._by_orth = PreshMap()
 | 
					        self._by_orth = PreshMap()
 | 
				
			||||||
        self.strings = StringStore()
 | 
					        self.strings = StringStore()
 | 
				
			||||||
 | 
					        # Load strings in a special order, so that we have an onset number for
 | 
				
			||||||
 | 
					        # the vocabulary. This way, when words are added in order, the orth ID
 | 
				
			||||||
 | 
					        # is the frequency rank of the word, plus a certain offset. The structural
 | 
				
			||||||
 | 
					        # strings are loaded first, because the vocab is open-class, and these
 | 
				
			||||||
 | 
					        # symbols are closed class.
 | 
				
			||||||
 | 
					        #for attr_name in sorted(ATTR_NAMES.keys()):
 | 
				
			||||||
 | 
					        #    _ = self.strings[attr_name]
 | 
				
			||||||
 | 
					        #for univ_pos_name in sorted(UNIV_POS_NAMES.keys()):
 | 
				
			||||||
 | 
					        #    _ = self.strings[pos_name]
 | 
				
			||||||
 | 
					        #for morph_name in sorted(UNIV_MORPH_NAMES.keys()):
 | 
				
			||||||
 | 
					        #    _ = self.strings[morph_name]
 | 
				
			||||||
 | 
					        #for entity_type_name in sorted(ENTITY_TYPES.keys()):
 | 
				
			||||||
 | 
					        #    _ = self.strings[entity_type_name]
 | 
				
			||||||
 | 
					        #for tag_name in sorted(TAG_MAP.keys()):
 | 
				
			||||||
 | 
					        #    _ = self.strings[tag_name]
 | 
				
			||||||
        self.get_lex_attr = get_lex_attr
 | 
					        self.get_lex_attr = get_lex_attr
 | 
				
			||||||
        self.morphology = Morphology(self.strings, tag_map, lemmatizer)
 | 
					        self.morphology = Morphology(self.strings, tag_map, lemmatizer)
 | 
				
			||||||
        self.serializer_freqs = serializer_freqs
 | 
					        self.serializer_freqs = serializer_freqs
 | 
				
			||||||
| 
						 | 
					
 | 
				
			||||||
		Loading…
	
		Reference in New Issue
	
	Block a user