* Add STEM attribute to lexeme

This commit is contained in:
Matthew Honnibal 2014-12-04 20:46:20 +11:00
parent d7952634ca
commit e1b1f45cc9
2 changed files with 35 additions and 33 deletions

View File

@ -72,7 +72,8 @@ cpdef enum attr_id_t:
ID ID
SIC SIC
NORM STEM
DENSE
SHAPE SHAPE
ASCIIED ASCIIED
PREFIX PREFIX
@ -89,7 +90,8 @@ cdef struct Lexeme:
attr_t id attr_t id
attr_t sic attr_t sic
attr_t norm attr_t stem
attr_t dense
attr_t shape attr_t shape
attr_t asciied attr_t asciied
attr_t prefix attr_t prefix
@ -116,4 +118,32 @@ cdef inline bint check_flag(const Lexeme* lexeme, attr_id_t flag_id) nogil:
return lexeme.flags & (1 << flag_id) return lexeme.flags & (1 << flag_id)
# Look up one attribute of a Lexeme by its attr_id_t identifier.
#
# feat_name values below sizeof(flags_t) * 8 (the bit width of flags_t) are
# treated as flag ids and answered by check_flag(); any other value is compared
# against the named enum members and the matching struct field is returned.
# An id that matches none of the branches yields 0; nothing is raised.
#
# NOTE(review): as rendered in this diff view, the next line carries both the
# removed declaration (parameter `attr_id`) and the added inline nogil
# definition (parameter `feat_name`) - confirm against the actual file.
cdef attr_t get_attr(const Lexeme* lex, attr_id_t attr_id) cdef inline attr_t get_attr(const Lexeme* lex, attr_id_t feat_name) nogil:
if feat_name < (sizeof(flags_t) * 8):
# Ids that fit inside the flags bitfield are flag ids: delegate to check_flag.
return check_flag(lex, feat_name)
elif feat_name == ID:
return lex.id
elif feat_name == SIC:
return lex.sic
elif feat_name == DENSE:
return lex.dense
elif feat_name == STEM:
return lex.stem
elif feat_name == SHAPE:
return lex.shape
elif feat_name == ASCIIED:
return lex.asciied
elif feat_name == PREFIX:
return lex.prefix
elif feat_name == SUFFIX:
return lex.suffix
elif feat_name == LENGTH:
return lex.length
elif feat_name == CLUSTER:
return lex.cluster
elif feat_name == POS_TYPE:
return lex.pos_type
elif feat_name == SENSE_TYPE:
return lex.sense_type
else:
# Unrecognised id: fall back to 0. A caller cannot tell this apart from an
# attribute whose stored value is 0.
return 0

View File

@ -27,38 +27,10 @@ cpdef Lexeme init(id_t i, unicode string, hash_t hashed,
lex.prefix = string_store[string[:1]] lex.prefix = string_store[string[:1]]
lex.suffix = string_store[string[-3:]] lex.suffix = string_store[string[-3:]]
lex.norm = lex.sic # TODO
lex.shape = string_store[orth.word_shape(string)] lex.shape = string_store[orth.word_shape(string)]
lex.dense = lex.sic if lex.prob >= -10 else lex.shape
lex.stem = string_store[props.get('stem', string)]
lex.asciied = string_store[orth.asciied(string)] lex.asciied = string_store[orth.asciied(string)]
lex.flags = props.get('flags', 0) lex.flags = props.get('flags', 0)
return lex return lex
cdef attr_t get_attr(const Lexeme* lex, attr_id_t feat_name):
    # Resolve `feat_name` to the corresponding value stored on `lex`.
    #
    # Ids smaller than the bit width of flags_t are flag ids and are answered
    # by check_flag(); every other id is matched against the named attribute
    # members. An id that matches nothing raises StandardError.
    #
    # NOTE(review): the signature declares no `except` value, so whether this
    # exception reaches the caller depends on the Cython version - confirm.
    if feat_name < (sizeof(flags_t) * 8):
        return check_flag(lex, feat_name)

    # Named attributes, checked in the same order as the enum lists them.
    if feat_name == ID:
        return lex.id
    if feat_name == SIC:
        return lex.sic
    if feat_name == NORM:
        return lex.norm
    if feat_name == SHAPE:
        return lex.shape
    if feat_name == ASCIIED:
        return lex.asciied
    if feat_name == PREFIX:
        return lex.prefix
    if feat_name == SUFFIX:
        return lex.suffix
    if feat_name == LENGTH:
        return lex.length
    if feat_name == CLUSTER:
        return lex.cluster
    if feat_name == POS_TYPE:
        return lex.pos_type
    if feat_name == SENSE_TYPE:
        return lex.sense_type

    # No branch matched.
    raise StandardError('Feature ID: %d not found' % feat_name)