mirror of
https://github.com/explosion/spaCy.git
synced 2025-01-12 10:16:27 +03:00
* Refactor symbols, so that frequency rank can be derived from the orth id of a word.
This commit is contained in:
parent
08e29519a6
commit
064bd69ad0
|
@ -168,6 +168,11 @@ def setup_vocab(get_lex_attr, tag_map, src_dir, dst_dir):
|
|||
probs[word] = oov_prob
|
||||
|
||||
lexicon = []
|
||||
for word, prob in reversed(sorted(list(probs.items()), key=lambda item: item[1])):
|
||||
# First encode the strings into the StringStore. This way, we can map
|
||||
# the orth IDs to frequency ranks
|
||||
orth = vocab.strings[word]
|
||||
# Now actually load the vocab
|
||||
for word, prob in reversed(sorted(list(probs.items()), key=lambda item: item[1])):
|
||||
lexeme = vocab[word]
|
||||
lexeme.prob = prob
|
||||
|
|
3
setup.py
3
setup.py
|
@ -165,7 +165,8 @@ MOD_NAMES = ['spacy.parts_of_speech', 'spacy.strings',
|
|||
'spacy.tokens.doc', 'spacy.tokens.spans', 'spacy.tokens.token',
|
||||
'spacy.serialize.packer', 'spacy.serialize.huffman', 'spacy.serialize.bits',
|
||||
'spacy.cfile', 'spacy.matcher',
|
||||
'spacy.syntax.ner']
|
||||
'spacy.syntax.ner',
|
||||
'spacy.symbols']
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
|
|
|
@ -1,5 +1,6 @@
|
|||
# Reserve 64 values for flag features
|
||||
cpdef enum attr_id_t:
|
||||
NULL_ATTR
|
||||
IS_ALPHA
|
||||
IS_ASCII
|
||||
IS_DIGIT
|
||||
|
@ -14,8 +15,7 @@ cpdef enum attr_id_t:
|
|||
IS_STOP
|
||||
IS_OOV
|
||||
|
||||
FLAG13 = 13
|
||||
FLAG14
|
||||
FLAG14 = 14
|
||||
FLAG15
|
||||
FLAG16
|
||||
FLAG17
|
||||
|
|
|
@ -0,0 +1,90 @@
|
|||
ATTR_IDS = {
|
||||
"NULL_ATTR": NULL_ATTR,
|
||||
"IS_ALPHA": IS_ALPHA,
|
||||
"IS_ASCII": IS_ASCII,
|
||||
"IS_DIGIT": IS_DIGIT,
|
||||
"IS_LOWER": IS_LOWER,
|
||||
"IS_PUNCT": IS_PUNCT,
|
||||
"IS_SPACE": IS_SPACE,
|
||||
"IS_TITLE": IS_TITLE,
|
||||
"IS_UPPER": IS_UPPER,
|
||||
"LIKE_URL": LIKE_URL,
|
||||
"LIKE_NUM": LIKE_NUM,
|
||||
"LIKE_EMAIL": LIKE_EMAIL,
|
||||
"IS_STOP": IS_STOP,
|
||||
"IS_OOV": IS_OOV,
|
||||
|
||||
"FLAG14": FLAG14,
|
||||
"FLAG15": FLAG15,
|
||||
"FLAG16": FLAG16,
|
||||
"FLAG17": FLAG17,
|
||||
"FLAG18": FLAG18,
|
||||
"FLAG19": FLAG19,
|
||||
"FLAG20": FLAG20,
|
||||
"FLAG21": FLAG21,
|
||||
"FLAG22": FLAG22,
|
||||
"FLAG23": FLAG23,
|
||||
"FLAG24": FLAG24,
|
||||
"FLAG25": FLAG25,
|
||||
"FLAG26": FLAG26,
|
||||
"FLAG27": FLAG27,
|
||||
"FLAG28": FLAG28,
|
||||
"FLAG29": FLAG29,
|
||||
"FLAG30": FLAG30,
|
||||
"FLAG31": FLAG31,
|
||||
"FLAG32": FLAG32,
|
||||
"FLAG33": FLAG33,
|
||||
"FLAG34": FLAG34,
|
||||
"FLAG35": FLAG35,
|
||||
"FLAG36": FLAG36,
|
||||
"FLAG37": FLAG37,
|
||||
"FLAG38": FLAG38,
|
||||
"FLAG39": FLAG39,
|
||||
"FLAG40": FLAG40,
|
||||
"FLAG41": FLAG41,
|
||||
"FLAG42": FLAG42,
|
||||
"FLAG43": FLAG43,
|
||||
"FLAG44": FLAG44,
|
||||
"FLAG45": FLAG45,
|
||||
"FLAG46": FLAG46,
|
||||
"FLAG47": FLAG47,
|
||||
"FLAG48": FLAG48,
|
||||
"FLAG49": FLAG49,
|
||||
"FLAG50": FLAG50,
|
||||
"FLAG51": FLAG51,
|
||||
"FLAG52": FLAG52,
|
||||
"FLAG53": FLAG53,
|
||||
"FLAG54": FLAG54,
|
||||
"FLAG55": FLAG55,
|
||||
"FLAG56": FLAG56,
|
||||
"FLAG57": FLAG57,
|
||||
"FLAG58": FLAG58,
|
||||
"FLAG59": FLAG59,
|
||||
"FLAG60": FLAG60,
|
||||
"FLAG61": FLAG61,
|
||||
"FLAG62": FLAG62,
|
||||
"FLAG63": FLAG63,
|
||||
|
||||
"ID": ID,
|
||||
"ORTH": ORTH,
|
||||
"LOWER": LOWER,
|
||||
"NORM": NORM,
|
||||
"SHAPE": SHAPE,
|
||||
"PREFIX": PREFIX,
|
||||
"SUFFIX": SUFFIX,
|
||||
|
||||
"LENGTH": LENGTH,
|
||||
"CLUSTER": CLUSTER,
|
||||
"LEMMA": LEMMA,
|
||||
"POS": POS,
|
||||
"TAG": TAG,
|
||||
"DEP": DEP,
|
||||
"ENT_IOB": ENT_IOB,
|
||||
"ENT_TYPE": ENT_TYPE,
|
||||
"HEAD": HEAD,
|
||||
"SPACY": SPACY,
|
||||
"PROB": PROB,
|
||||
}
|
||||
|
||||
# ATTR names, ordered by the value of their symbol ID
|
||||
ATTR_NAMES = [key for key, value in sorted(ATTR_IDS.items(), key=lambda item: item[1])]
|
|
@ -15,7 +15,7 @@ from libcpp.vector cimport vector
|
|||
from murmurhash.mrmr cimport hash64
|
||||
|
||||
from .attrs cimport LENGTH, ENT_TYPE, ORTH, NORM, LEMMA, LOWER, SHAPE
|
||||
from .attrs cimport FLAG13, FLAG14, FLAG15, FLAG16, FLAG17, FLAG18, FLAG19, FLAG20, FLAG21, FLAG22, FLAG23, FLAG24, FLAG25
|
||||
from .attrs cimport FLAG14, FLAG15, FLAG16, FLAG17, FLAG18, FLAG19, FLAG20, FLAG21, FLAG22, FLAG23, FLAG24, FLAG25
|
||||
from .tokens.doc cimport get_token_attr
|
||||
from .tokens.doc cimport Doc
|
||||
from .vocab cimport Vocab
|
||||
|
|
|
@ -1,23 +1,24 @@
|
|||
# Google universal tag set
|
||||
from .symbols cimport *
|
||||
|
||||
|
||||
cpdef enum univ_pos_t:
|
||||
NO_TAG
|
||||
ADJ
|
||||
ADP
|
||||
ADV
|
||||
AUX
|
||||
CONJ
|
||||
DET
|
||||
INTJ
|
||||
NOUN
|
||||
NUM
|
||||
PART
|
||||
PRON
|
||||
PROPN
|
||||
PUNCT
|
||||
SCONJ
|
||||
SYM
|
||||
VERB
|
||||
X
|
||||
EOL
|
||||
SPACE
|
||||
N_UNIV_TAGS
|
||||
NO_TAG = EMPTY_VALUE
|
||||
ADJ = POS_adj
|
||||
ADP = POS_adp
|
||||
ADV = POS_adv
|
||||
AUX = POS_aux
|
||||
CONJ = POS_conj
|
||||
DET = POS_det
|
||||
INTJ = POS_intj
|
||||
NOUN = POS_noun
|
||||
NUM = POS_num
|
||||
PART = POS_part
|
||||
PRON = POS_pron
|
||||
PROPN = POS_propn
|
||||
PUNCT = POS_punct
|
||||
SCONJ = POS_sconj
|
||||
SYM = POS_sym
|
||||
VERB = POS_verb
|
||||
X = POS_x
|
||||
EOL = POS_eol
|
||||
SPACE = POS_space
|
||||
|
|
|
@ -67,6 +67,21 @@ cdef class Vocab:
|
|||
self._by_hash = PreshMap()
|
||||
self._by_orth = PreshMap()
|
||||
self.strings = StringStore()
|
||||
# Load strings in a special order, so that we have an offset number for
|
||||
# the vocabulary. This way, when words are added in order, the orth ID
|
||||
# is the frequency rank of the word, plus a certain offset. The structural
|
||||
# strings are loaded first, because the vocab is open-class, and these
|
||||
# symbols are closed class.
|
||||
#for attr_name in sorted(ATTR_NAMES.keys()):
|
||||
# _ = self.strings[attr_name]
|
||||
#for univ_pos_name in sorted(UNIV_POS_NAMES.keys()):
|
||||
#    _ = self.strings[univ_pos_name]
|
||||
#for morph_name in sorted(UNIV_MORPH_NAMES.keys()):
|
||||
# _ = self.strings[morph_name]
|
||||
#for entity_type_name in sorted(ENTITY_TYPES.keys()):
|
||||
# _ = self.strings[entity_type_name]
|
||||
#for tag_name in sorted(TAG_MAP.keys()):
|
||||
# _ = self.strings[tag_name]
|
||||
self.get_lex_attr = get_lex_attr
|
||||
self.morphology = Morphology(self.strings, tag_map, lemmatizer)
|
||||
self.serializer_freqs = serializer_freqs
|
||||
|
|
Loading…
Reference in New Issue
Block a user