mirror of
https://github.com/explosion/spaCy.git
synced 2024-12-27 02:16:32 +03:00
* Refactor symbols, so that frequency rank can be derived from the orth id of a word.
This commit is contained in:
parent
3b79d67462
commit
85ce36ab11
|
@ -168,6 +168,11 @@ def setup_vocab(get_lex_attr, tag_map, src_dir, dst_dir):
|
||||||
probs[word] = oov_prob
|
probs[word] = oov_prob
|
||||||
|
|
||||||
lexicon = []
|
lexicon = []
|
||||||
|
for word, prob in reversed(sorted(list(probs.items()), key=lambda item: item[1])):
|
||||||
|
# First encode the strings into the StringStore. This way, we can map
|
||||||
|
# the orth IDs to frequency ranks
|
||||||
|
orth = vocab.strings[word]
|
||||||
|
# Now actually load the vocab
|
||||||
for word, prob in reversed(sorted(list(probs.items()), key=lambda item: item[1])):
|
for word, prob in reversed(sorted(list(probs.items()), key=lambda item: item[1])):
|
||||||
lexeme = vocab[word]
|
lexeme = vocab[word]
|
||||||
lexeme.prob = prob
|
lexeme.prob = prob
|
||||||
|
|
3
setup.py
3
setup.py
|
@ -165,7 +165,8 @@ MOD_NAMES = ['spacy.parts_of_speech', 'spacy.strings',
|
||||||
'spacy.tokens.doc', 'spacy.tokens.spans', 'spacy.tokens.token',
|
'spacy.tokens.doc', 'spacy.tokens.spans', 'spacy.tokens.token',
|
||||||
'spacy.serialize.packer', 'spacy.serialize.huffman', 'spacy.serialize.bits',
|
'spacy.serialize.packer', 'spacy.serialize.huffman', 'spacy.serialize.bits',
|
||||||
'spacy.cfile', 'spacy.matcher',
|
'spacy.cfile', 'spacy.matcher',
|
||||||
'spacy.syntax.ner']
|
'spacy.syntax.ner',
|
||||||
|
'spacy.symbols']
|
||||||
|
|
||||||
|
|
||||||
if __name__ == '__main__':
|
if __name__ == '__main__':
|
||||||
|
|
|
@ -1,5 +1,6 @@
|
||||||
# Reserve 64 values for flag features
|
# Reserve 64 values for flag features
|
||||||
cpdef enum attr_id_t:
|
cpdef enum attr_id_t:
|
||||||
|
NULL_ATTR
|
||||||
IS_ALPHA
|
IS_ALPHA
|
||||||
IS_ASCII
|
IS_ASCII
|
||||||
IS_DIGIT
|
IS_DIGIT
|
||||||
|
@ -14,8 +15,7 @@ cpdef enum attr_id_t:
|
||||||
IS_STOP
|
IS_STOP
|
||||||
IS_OOV
|
IS_OOV
|
||||||
|
|
||||||
FLAG13 = 13
|
FLAG14 = 14
|
||||||
FLAG14
|
|
||||||
FLAG15
|
FLAG15
|
||||||
FLAG16
|
FLAG16
|
||||||
FLAG17
|
FLAG17
|
||||||
|
|
|
@ -0,0 +1,90 @@
|
||||||
|
ATTR_IDS = {
|
||||||
|
"NULL_ATTR": NULL_ATTR,
|
||||||
|
"IS_ALPHA": IS_ALPHA,
|
||||||
|
"IS_ASCII": IS_ASCII,
|
||||||
|
"IS_DIGIT": IS_DIGIT,
|
||||||
|
"IS_LOWER": IS_LOWER,
|
||||||
|
"IS_PUNCT": IS_PUNCT,
|
||||||
|
"IS_SPACE": IS_SPACE,
|
||||||
|
"IS_TITLE": IS_TITLE,
|
||||||
|
"IS_UPPER": IS_UPPER,
|
||||||
|
"LIKE_URL": LIKE_URL,
|
||||||
|
"LIKE_NUM": LIKE_NUM,
|
||||||
|
"LIKE_EMAIL": LIKE_EMAIL,
|
||||||
|
"IS_STOP": IS_STOP,
|
||||||
|
"IS_OOV": IS_OOV,
|
||||||
|
|
||||||
|
"FLAG14": FLAG14,
|
||||||
|
"FLAG15": FLAG15,
|
||||||
|
"FLAG16": FLAG16,
|
||||||
|
"FLAG17": FLAG17,
|
||||||
|
"FLAG18": FLAG18,
|
||||||
|
"FLAG19": FLAG19,
|
||||||
|
"FLAG20": FLAG20,
|
||||||
|
"FLAG21": FLAG21,
|
||||||
|
"FLAG22": FLAG22,
|
||||||
|
"FLAG23": FLAG23,
|
||||||
|
"FLAG24": FLAG24,
|
||||||
|
"FLAG25": FLAG25,
|
||||||
|
"FLAG26": FLAG26,
|
||||||
|
"FLAG27": FLAG27,
|
||||||
|
"FLAG28": FLAG28,
|
||||||
|
"FLAG29": FLAG29,
|
||||||
|
"FLAG30": FLAG30,
|
||||||
|
"FLAG31": FLAG31,
|
||||||
|
"FLAG32": FLAG32,
|
||||||
|
"FLAG33": FLAG33,
|
||||||
|
"FLAG34": FLAG34,
|
||||||
|
"FLAG35": FLAG35,
|
||||||
|
"FLAG36": FLAG36,
|
||||||
|
"FLAG37": FLAG37,
|
||||||
|
"FLAG38": FLAG38,
|
||||||
|
"FLAG39": FLAG39,
|
||||||
|
"FLAG40": FLAG40,
|
||||||
|
"FLAG41": FLAG41,
|
||||||
|
"FLAG42": FLAG42,
|
||||||
|
"FLAG43": FLAG43,
|
||||||
|
"FLAG44": FLAG44,
|
||||||
|
"FLAG45": FLAG45,
|
||||||
|
"FLAG46": FLAG46,
|
||||||
|
"FLAG47": FLAG47,
|
||||||
|
"FLAG48": FLAG48,
|
||||||
|
"FLAG49": FLAG49,
|
||||||
|
"FLAG50": FLAG50,
|
||||||
|
"FLAG51": FLAG51,
|
||||||
|
"FLAG52": FLAG52,
|
||||||
|
"FLAG53": FLAG53,
|
||||||
|
"FLAG54": FLAG54,
|
||||||
|
"FLAG55": FLAG55,
|
||||||
|
"FLAG56": FLAG56,
|
||||||
|
"FLAG57": FLAG57,
|
||||||
|
"FLAG58": FLAG58,
|
||||||
|
"FLAG59": FLAG59,
|
||||||
|
"FLAG60": FLAG60,
|
||||||
|
"FLAG61": FLAG61,
|
||||||
|
"FLAG62": FLAG62,
|
||||||
|
"FLAG63": FLAG63,
|
||||||
|
|
||||||
|
"ID": ID,
|
||||||
|
"ORTH": ORTH,
|
||||||
|
"LOWER": LOWER,
|
||||||
|
"NORM": NORM,
|
||||||
|
"SHAPE": SHAPE,
|
||||||
|
"PREFIX": PREFIX,
|
||||||
|
"SUFFIX": SUFFIX,
|
||||||
|
|
||||||
|
"LENGTH": LENGTH,
|
||||||
|
"CLUSTER": CLUSTER,
|
||||||
|
"LEMMA": LEMMA,
|
||||||
|
"POS": POS,
|
||||||
|
"TAG": TAG,
|
||||||
|
"DEP": DEP,
|
||||||
|
"ENT_IOB": ENT_IOB,
|
||||||
|
"ENT_TYPE": ENT_TYPE,
|
||||||
|
"HEAD": HEAD,
|
||||||
|
"SPACY": SPACY,
|
||||||
|
"PROB": PROB,
|
||||||
|
}
|
||||||
|
|
||||||
|
# Attribute names, ordered by the numeric value of their symbol ID
|
||||||
|
ATTR_NAMES = [key for key, value in sorted(ATTR_IDS.items(), key=lambda item: item[1])]
|
|
@ -15,7 +15,7 @@ from libcpp.vector cimport vector
|
||||||
from murmurhash.mrmr cimport hash64
|
from murmurhash.mrmr cimport hash64
|
||||||
|
|
||||||
from .attrs cimport LENGTH, ENT_TYPE, ORTH, NORM, LEMMA, LOWER, SHAPE
|
from .attrs cimport LENGTH, ENT_TYPE, ORTH, NORM, LEMMA, LOWER, SHAPE
|
||||||
from .attrs cimport FLAG13, FLAG14, FLAG15, FLAG16, FLAG17, FLAG18, FLAG19, FLAG20, FLAG21, FLAG22, FLAG23, FLAG24, FLAG25
|
from .attrs cimport FLAG14, FLAG15, FLAG16, FLAG17, FLAG18, FLAG19, FLAG20, FLAG21, FLAG22, FLAG23, FLAG24, FLAG25
|
||||||
from .tokens.doc cimport get_token_attr
|
from .tokens.doc cimport get_token_attr
|
||||||
from .tokens.doc cimport Doc
|
from .tokens.doc cimport Doc
|
||||||
from .vocab cimport Vocab
|
from .vocab cimport Vocab
|
||||||
|
|
|
@ -1,23 +1,24 @@
|
||||||
# Google universal tag set
|
from .symbols cimport *
|
||||||
|
|
||||||
|
|
||||||
cpdef enum univ_pos_t:
|
cpdef enum univ_pos_t:
|
||||||
NO_TAG
|
NO_TAG = EMPTY_VALUE
|
||||||
ADJ
|
ADJ = POS_adj
|
||||||
ADP
|
ADP = POS_adp
|
||||||
ADV
|
ADV = POS_adv
|
||||||
AUX
|
AUX = POS_aux
|
||||||
CONJ
|
CONJ = POS_conj
|
||||||
DET
|
DET = POS_det
|
||||||
INTJ
|
INTJ = POS_intj
|
||||||
NOUN
|
NOUN = POS_noun
|
||||||
NUM
|
NUM = POS_num
|
||||||
PART
|
PART = POS_part
|
||||||
PRON
|
PRON = POS_pron
|
||||||
PROPN
|
PROPN = POS_propn
|
||||||
PUNCT
|
PUNCT = POS_punct
|
||||||
SCONJ
|
SCONJ = POS_sconj
|
||||||
SYM
|
SYM = POS_sym
|
||||||
VERB
|
VERB = POS_verb
|
||||||
X
|
X = POS_x
|
||||||
EOL
|
EOL = POS_eol
|
||||||
SPACE
|
SPACE = POS_space
|
||||||
N_UNIV_TAGS
|
|
||||||
|
|
|
@ -67,6 +67,21 @@ cdef class Vocab:
|
||||||
self._by_hash = PreshMap()
|
self._by_hash = PreshMap()
|
||||||
self._by_orth = PreshMap()
|
self._by_orth = PreshMap()
|
||||||
self.strings = StringStore()
|
self.strings = StringStore()
|
||||||
|
# Load strings in a special order, so that we have a known starting ID for
|
||||||
|
# the vocabulary. This way, when words are added in order, the orth ID
|
||||||
|
# is the frequency rank of the word, plus a certain offset. The structural
|
||||||
|
# strings are loaded first, because the vocab is open-class, and these
|
||||||
|
# symbols are closed class.
|
||||||
|
#for attr_name in sorted(ATTR_NAMES.keys()):
|
||||||
|
# _ = self.strings[attr_name]
|
||||||
|
#for univ_pos_name in sorted(UNIV_POS_NAMES.keys()):
|
||||||
|
#    _ = self.strings[univ_pos_name]
|
||||||
|
#for morph_name in sorted(UNIV_MORPH_NAMES.keys()):
|
||||||
|
# _ = self.strings[morph_name]
|
||||||
|
#for entity_type_name in sorted(ENTITY_TYPES.keys()):
|
||||||
|
# _ = self.strings[entity_type_name]
|
||||||
|
#for tag_name in sorted(TAG_MAP.keys()):
|
||||||
|
# _ = self.strings[tag_name]
|
||||||
self.get_lex_attr = get_lex_attr
|
self.get_lex_attr = get_lex_attr
|
||||||
self.morphology = Morphology(self.strings, tag_map, lemmatizer)
|
self.morphology = Morphology(self.strings, tag_map, lemmatizer)
|
||||||
self.serializer_freqs = serializer_freqs
|
self.serializer_freqs = serializer_freqs
|
||||||
|
|
Loading…
Reference in New Issue
Block a user