* Remove vocab10k field, and add flags for gazetteers

This commit is contained in:
Matthew Honnibal 2014-11-03 00:13:51 +11:00
parent f1c3e17c80
commit 70ea862703
2 changed files with 16 additions and 3 deletions

View File

@ -23,6 +23,14 @@ cpdef enum:
OFT_TITLE
OFT_UPPER
IN_MALES
IN_FEMALES
IN_SURNAMES
IN_PLACES
IN_GAMES
IN_CELEBS
IN_NAMES
cdef struct Lexeme:
flag_t flags
@ -31,7 +39,6 @@ cdef struct Lexeme:
id_t sic
id_t norm
id_t shape
id_t vocab10k
id_t asciied
id_t prefix
id_t suffix

View File

@ -54,9 +54,15 @@ cpdef Lexeme init(id_t i, unicode string, hash_t hashed,
lex.norm = lex.sic
lex.shape = get_string_id(orth.word_shape(string), store)
lex.asciied = get_string_id(orth.asciied(string), store)
non_sparse = orth.non_sparse(string, lex.prob, lex.cluster, upper_pc, title_pc, lower_pc)
lex.vocab10k = get_string_id(non_sparse, store)
lex.flags = get_flags(string, upper_pc, title_pc, lower_pc)
lex.flags |= props.get('in_males', 0) << IN_MALES
lex.flags |= props.get('in_females', 0) << IN_FEMALES
lex.flags |= props.get('in_surnames', 0) << IN_SURNAMES
lex.flags |= props.get('in_places', 0) << IN_PLACES
lex.flags |= props.get('in_celebs', 0) << IN_CELEBS
lex.flags |= props.get('in_games', 0) << IN_GAMES
lex.flags |= props.get('in_names', 0) << IN_NAMES
return lex