* Refactor symbols, so that frequency rank can be derived from the orth id of a word.

This commit is contained in:
Matthew Honnibal 2015-10-07 00:39:50 +11:00
parent 3b79d67462
commit 85ce36ab11
7 changed files with 138 additions and 26 deletions

View File

@ -168,6 +168,11 @@ def setup_vocab(get_lex_attr, tag_map, src_dir, dst_dir):
probs[word] = oov_prob probs[word] = oov_prob
lexicon = [] lexicon = []
for word, prob in reversed(sorted(list(probs.items()), key=lambda item: item[1])):
# First encode the strings into the StringStore. This way, we can map
# the orth IDs to frequency ranks
orth = vocab.strings[word]
# Now actually load the vocab
for word, prob in reversed(sorted(list(probs.items()), key=lambda item: item[1])): for word, prob in reversed(sorted(list(probs.items()), key=lambda item: item[1])):
lexeme = vocab[word] lexeme = vocab[word]
lexeme.prob = prob lexeme.prob = prob

View File

@ -165,7 +165,8 @@ MOD_NAMES = ['spacy.parts_of_speech', 'spacy.strings',
'spacy.tokens.doc', 'spacy.tokens.spans', 'spacy.tokens.token', 'spacy.tokens.doc', 'spacy.tokens.spans', 'spacy.tokens.token',
'spacy.serialize.packer', 'spacy.serialize.huffman', 'spacy.serialize.bits', 'spacy.serialize.packer', 'spacy.serialize.huffman', 'spacy.serialize.bits',
'spacy.cfile', 'spacy.matcher', 'spacy.cfile', 'spacy.matcher',
'spacy.syntax.ner'] 'spacy.syntax.ner',
'spacy.symbols']
if __name__ == '__main__': if __name__ == '__main__':

View File

@ -1,5 +1,6 @@
# Reserve 64 values for flag features # Reserve 64 values for flag features
cpdef enum attr_id_t: cpdef enum attr_id_t:
NULL_ATTR
IS_ALPHA IS_ALPHA
IS_ASCII IS_ASCII
IS_DIGIT IS_DIGIT
@ -14,8 +15,7 @@ cpdef enum attr_id_t:
IS_STOP IS_STOP
IS_OOV IS_OOV
FLAG13 = 13 FLAG14 = 14
FLAG14
FLAG15 FLAG15
FLAG16 FLAG16
FLAG17 FLAG17

View File

@ -0,0 +1,90 @@
# Lookup table from an attribute's string name to the attribute symbol of the
# same name. Every key is spelled exactly like the symbol it maps to.
ATTR_IDS = {
    "NULL_ATTR": NULL_ATTR,
    "IS_ALPHA": IS_ALPHA,
    "IS_ASCII": IS_ASCII,
    "IS_DIGIT": IS_DIGIT,
    "IS_LOWER": IS_LOWER,
    "IS_PUNCT": IS_PUNCT,
    "IS_SPACE": IS_SPACE,
    "IS_TITLE": IS_TITLE,
    "IS_UPPER": IS_UPPER,
    "LIKE_URL": LIKE_URL,
    "LIKE_NUM": LIKE_NUM,
    "LIKE_EMAIL": LIKE_EMAIL,
    "IS_STOP": IS_STOP,
    "IS_OOV": IS_OOV,
    "FLAG14": FLAG14,
    "FLAG15": FLAG15,
    "FLAG16": FLAG16,
    "FLAG17": FLAG17,
    "FLAG18": FLAG18,
    "FLAG19": FLAG19,
    "FLAG20": FLAG20,
    "FLAG21": FLAG21,
    "FLAG22": FLAG22,
    "FLAG23": FLAG23,
    "FLAG24": FLAG24,
    "FLAG25": FLAG25,
    "FLAG26": FLAG26,
    "FLAG27": FLAG27,
    "FLAG28": FLAG28,
    "FLAG29": FLAG29,
    "FLAG30": FLAG30,
    "FLAG31": FLAG31,
    "FLAG32": FLAG32,
    "FLAG33": FLAG33,
    "FLAG34": FLAG34,
    "FLAG35": FLAG35,
    "FLAG36": FLAG36,
    "FLAG37": FLAG37,
    "FLAG38": FLAG38,
    "FLAG39": FLAG39,
    "FLAG40": FLAG40,
    "FLAG41": FLAG41,
    "FLAG42": FLAG42,
    "FLAG43": FLAG43,
    "FLAG44": FLAG44,
    "FLAG45": FLAG45,
    "FLAG46": FLAG46,
    "FLAG47": FLAG47,
    "FLAG48": FLAG48,
    "FLAG49": FLAG49,
    "FLAG50": FLAG50,
    "FLAG51": FLAG51,
    "FLAG52": FLAG52,
    "FLAG53": FLAG53,
    "FLAG54": FLAG54,
    "FLAG55": FLAG55,
    "FLAG56": FLAG56,
    "FLAG57": FLAG57,
    "FLAG58": FLAG58,
    "FLAG59": FLAG59,
    "FLAG60": FLAG60,
    "FLAG61": FLAG61,
    "FLAG62": FLAG62,
    "FLAG63": FLAG63,
    "ID": ID,
    "ORTH": ORTH,
    "LOWER": LOWER,
    "NORM": NORM,
    "SHAPE": SHAPE,
    "PREFIX": PREFIX,
    "SUFFIX": SUFFIX,
    "LENGTH": LENGTH,
    "CLUSTER": CLUSTER,
    "LEMMA": LEMMA,
    "POS": POS,
    "TAG": TAG,
    "DEP": DEP,
    "ENT_IOB": ENT_IOB,
    "ENT_TYPE": ENT_TYPE,
    "HEAD": HEAD,
    "SPACY": SPACY,
    "PROB": PROB,
}
# Attribute names sorted by the ascending value of the symbol each one maps to.
ATTR_NAMES = sorted(ATTR_IDS, key=ATTR_IDS.get)

View File

@ -15,7 +15,7 @@ from libcpp.vector cimport vector
from murmurhash.mrmr cimport hash64 from murmurhash.mrmr cimport hash64
from .attrs cimport LENGTH, ENT_TYPE, ORTH, NORM, LEMMA, LOWER, SHAPE from .attrs cimport LENGTH, ENT_TYPE, ORTH, NORM, LEMMA, LOWER, SHAPE
from .attrs cimport FLAG13, FLAG14, FLAG15, FLAG16, FLAG17, FLAG18, FLAG19, FLAG20, FLAG21, FLAG22, FLAG23, FLAG24, FLAG25 from .attrs cimport FLAG14, FLAG15, FLAG16, FLAG17, FLAG18, FLAG19, FLAG20, FLAG21, FLAG22, FLAG23, FLAG24, FLAG25
from .tokens.doc cimport get_token_attr from .tokens.doc cimport get_token_attr
from .tokens.doc cimport Doc from .tokens.doc cimport Doc
from .vocab cimport Vocab from .vocab cimport Vocab

View File

@ -1,23 +1,24 @@
# Google universal tag set from .symbols cimport *
cpdef enum univ_pos_t: cpdef enum univ_pos_t:
NO_TAG NO_TAG = EMPTY_VALUE
ADJ ADJ = POS_adj
ADP ADP = POS_adp
ADV ADV = POS_adv
AUX AUX = POS_aux
CONJ CONJ = POS_conj
DET DET = POS_det
INTJ INTJ = POS_intj
NOUN NOUN = POS_noun
NUM NUM = POS_num
PART PART = POS_part
PRON PRON = POS_pron
PROPN PROPN = POS_propn
PUNCT PUNCT = POS_punct
SCONJ SCONJ = POS_sconj
SYM SYM = POS_sym
VERB VERB = POS_verb
X X = POS_x
EOL EOL = POS_eol
SPACE SPACE = POS_space
N_UNIV_TAGS

View File

@ -67,6 +67,21 @@ cdef class Vocab:
self._by_hash = PreshMap() self._by_hash = PreshMap()
self._by_orth = PreshMap() self._by_orth = PreshMap()
self.strings = StringStore() self.strings = StringStore()
# Load strings in a special order, so that we have an onset number for
# the vocabulary. This way, when words are added in order, the orth ID
# is the frequency rank of the word, plus a certain offset. The structural
# strings are loaded first, because the vocab is open-class, and these
# symbols are closed class.
#for attr_name in sorted(ATTR_NAMES.keys()):
# _ = self.strings[attr_name]
#for univ_pos_name in sorted(UNIV_POS_NAMES.keys()):
# _ = self.strings[univ_pos_name]
#for morph_name in sorted(UNIV_MORPH_NAMES.keys()):
# _ = self.strings[morph_name]
#for entity_type_name in sorted(ENTITY_TYPES.keys()):
# _ = self.strings[entity_type_name]
#for tag_name in sorted(TAG_MAP.keys()):
# _ = self.strings[tag_name]
self.get_lex_attr = get_lex_attr self.get_lex_attr = get_lex_attr
self.morphology = Morphology(self.strings, tag_map, lemmatizer) self.morphology = Morphology(self.strings, tag_map, lemmatizer)
self.serializer_freqs = serializer_freqs self.serializer_freqs = serializer_freqs