* Refactor symbols so that the frequency rank can be derived from the orth ID of a word.

This commit is contained in:
Matthew Honnibal 2015-10-07 00:39:50 +11:00
parent 3b79d67462
commit 85ce36ab11
7 changed files with 138 additions and 26 deletions

View File

@ -168,6 +168,11 @@ def setup_vocab(get_lex_attr, tag_map, src_dir, dst_dir):
probs[word] = oov_prob
lexicon = []
for word, prob in reversed(sorted(list(probs.items()), key=lambda item: item[1])):
# First encode the strings into the StringStore. This way, we can map
# the orth IDs to frequency ranks
orth = vocab.strings[word]
# Now actually load the vocab
for word, prob in reversed(sorted(list(probs.items()), key=lambda item: item[1])):
lexeme = vocab[word]
lexeme.prob = prob

View File

@ -165,7 +165,8 @@ MOD_NAMES = ['spacy.parts_of_speech', 'spacy.strings',
'spacy.tokens.doc', 'spacy.tokens.spans', 'spacy.tokens.token',
'spacy.serialize.packer', 'spacy.serialize.huffman', 'spacy.serialize.bits',
'spacy.cfile', 'spacy.matcher',
'spacy.syntax.ner']
'spacy.syntax.ner',
'spacy.symbols']
if __name__ == '__main__':

View File

@ -1,5 +1,6 @@
# Reserve 64 values for flag features
cpdef enum attr_id_t:
NULL_ATTR
IS_ALPHA
IS_ASCII
IS_DIGIT
@ -14,8 +15,7 @@ cpdef enum attr_id_t:
IS_STOP
IS_OOV
FLAG13 = 13
FLAG14
FLAG14 = 14
FLAG15
FLAG16
FLAG17

View File

@ -0,0 +1,90 @@
# Map each attribute's string name to its integer ID from the attr_id_t enum
# (the enum values are presumably cimported from .attrs in the full file —
# this page shows only the diff, so confirm against the real module).
# Lets callers resolve user-supplied attribute names like "ORTH" or "LEMMA"
# to the integer IDs used internally.
ATTR_IDS = {
"NULL_ATTR": NULL_ATTR,
"IS_ALPHA": IS_ALPHA,
"IS_ASCII": IS_ASCII,
"IS_DIGIT": IS_DIGIT,
"IS_LOWER": IS_LOWER,
"IS_PUNCT": IS_PUNCT,
"IS_SPACE": IS_SPACE,
"IS_TITLE": IS_TITLE,
"IS_UPPER": IS_UPPER,
"LIKE_URL": LIKE_URL,
"LIKE_NUM": LIKE_NUM,
"LIKE_EMAIL": LIKE_EMAIL,
"IS_STOP": IS_STOP,
"IS_OOV": IS_OOV,
# NOTE(review): there is no "FLAG13" entry. In the enum elsewhere in this
# commit, FLAG13 = 13 appears to share its value with IS_OOV (after
# NULL_ATTR was prepended), so "IS_OOV" presumably covers it — confirm.
"FLAG14": FLAG14,
"FLAG15": FLAG15,
"FLAG16": FLAG16,
"FLAG17": FLAG17,
"FLAG18": FLAG18,
"FLAG19": FLAG19,
"FLAG20": FLAG20,
"FLAG21": FLAG21,
"FLAG22": FLAG22,
"FLAG23": FLAG23,
"FLAG24": FLAG24,
"FLAG25": FLAG25,
"FLAG26": FLAG26,
"FLAG27": FLAG27,
"FLAG28": FLAG28,
"FLAG29": FLAG29,
"FLAG30": FLAG30,
"FLAG31": FLAG31,
"FLAG32": FLAG32,
"FLAG33": FLAG33,
"FLAG34": FLAG34,
"FLAG35": FLAG35,
"FLAG36": FLAG36,
"FLAG37": FLAG37,
"FLAG38": FLAG38,
"FLAG39": FLAG39,
"FLAG40": FLAG40,
"FLAG41": FLAG41,
"FLAG42": FLAG42,
"FLAG43": FLAG43,
"FLAG44": FLAG44,
"FLAG45": FLAG45,
"FLAG46": FLAG46,
"FLAG47": FLAG47,
"FLAG48": FLAG48,
"FLAG49": FLAG49,
"FLAG50": FLAG50,
"FLAG51": FLAG51,
"FLAG52": FLAG52,
"FLAG53": FLAG53,
"FLAG54": FLAG54,
"FLAG55": FLAG55,
"FLAG56": FLAG56,
"FLAG57": FLAG57,
"FLAG58": FLAG58,
"FLAG59": FLAG59,
"FLAG60": FLAG60,
"FLAG61": FLAG61,
"FLAG62": FLAG62,
"FLAG63": FLAG63,
"ID": ID,
"ORTH": ORTH,
"LOWER": LOWER,
"NORM": NORM,
"SHAPE": SHAPE,
"PREFIX": PREFIX,
"SUFFIX": SUFFIX,
"LENGTH": LENGTH,
"CLUSTER": CLUSTER,
"LEMMA": LEMMA,
"POS": POS,
"TAG": TAG,
"DEP": DEP,
"ENT_IOB": ENT_IOB,
"ENT_TYPE": ENT_TYPE,
"HEAD": HEAD,
"SPACY": SPACY,
"PROB": PROB,
}
# Attribute names sorted by their integer ID, so that ATTR_NAMES[attr_id]
# recovers the string name for a given ID.
ATTR_NAMES = [key for key, value in sorted(ATTR_IDS.items(), key=lambda item: item[1])]

View File

@ -15,7 +15,7 @@ from libcpp.vector cimport vector
from murmurhash.mrmr cimport hash64
from .attrs cimport LENGTH, ENT_TYPE, ORTH, NORM, LEMMA, LOWER, SHAPE
from .attrs cimport FLAG13, FLAG14, FLAG15, FLAG16, FLAG17, FLAG18, FLAG19, FLAG20, FLAG21, FLAG22, FLAG23, FLAG24, FLAG25
from .attrs cimport FLAG14, FLAG15, FLAG16, FLAG17, FLAG18, FLAG19, FLAG20, FLAG21, FLAG22, FLAG23, FLAG24, FLAG25
from .tokens.doc cimport get_token_attr
from .tokens.doc cimport Doc
from .vocab cimport Vocab

View File

@ -1,23 +1,24 @@
# Google universal tag set
from .symbols cimport *
cpdef enum univ_pos_t:
NO_TAG
ADJ
ADP
ADV
AUX
CONJ
DET
INTJ
NOUN
NUM
PART
PRON
PROPN
PUNCT
SCONJ
SYM
VERB
X
EOL
SPACE
N_UNIV_TAGS
NO_TAG = EMPTY_VALUE
ADJ = POS_adj
ADP = POS_adp
ADV = POS_adv
AUX = POS_aux
CONJ = POS_conj
DET = POS_det
INTJ = POS_intj
NOUN = POS_noun
NUM = POS_num
PART = POS_part
PRON = POS_pron
PROPN = POS_propn
PUNCT = POS_punct
SCONJ = POS_sconj
SYM = POS_sym
VERB = POS_verb
X = POS_x
EOL = POS_eol
SPACE = POS_space

View File

@ -67,6 +67,21 @@ cdef class Vocab:
self._by_hash = PreshMap()
self._by_orth = PreshMap()
self.strings = StringStore()
# Load strings in a special order, so that we have an onset number for
# the vocabulary. This way, when words are added in order, the orth ID
# is the frequency rank of the word, plus a certain offset. The structural
# strings are loaded first, because the vocab is open-class, and these
# symbols are closed class.
#for attr_name in sorted(ATTR_NAMES.keys()):
# _ = self.strings[attr_name]
#for univ_pos_name in sorted(UNIV_POS_NAMES.keys()):
#    _ = self.strings[univ_pos_name]
#for morph_name in sorted(UNIV_MORPH_NAMES.keys()):
# _ = self.strings[morph_name]
#for entity_type_name in sorted(ENTITY_TYPES.keys()):
# _ = self.strings[entity_type_name]
#for tag_name in sorted(TAG_MAP.keys()):
# _ = self.strings[tag_name]
self.get_lex_attr = get_lex_attr
self.morphology = Morphology(self.strings, tag_map, lemmatizer)
self.serializer_freqs = serializer_freqs