* Restore id attribute to lexeme, and rename pos field to postype, to store clustered tag dictionaries

This commit is contained in:
Matthew Honnibal 2014-10-31 17:43:00 +11:00
parent aaf6953fe0
commit 6c807aa45f
2 changed files with 7 additions and 4 deletions

View File

@@ -24,6 +24,7 @@ cpdef enum:
cdef struct Lexeme: cdef struct Lexeme:
flag_t flags flag_t flags
id_t id
id_t sic id_t sic
id_t norm id_t norm
id_t shape id_t shape
@@ -36,7 +37,7 @@ cdef struct Lexeme:
len_t length len_t length
tag_t cluster tag_t cluster
tag_t pos tag_t postype
tag_t supersense tag_t supersense
@@ -44,7 +45,7 @@ cdef struct Lexeme:
cdef Lexeme EMPTY_LEXEME cdef Lexeme EMPTY_LEXEME
cpdef Lexeme init(unicode string, hash_t hashed, cpdef Lexeme init(id_t i, unicode string, hash_t hashed,
StringStore store, dict props) except * StringStore store, dict props) except *

View File

@@ -26,14 +26,15 @@ def get_flags(unicode string, float upper_pc, float title_pc, float lower_pc):
return flags return flags
cpdef Lexeme init(unicode string, hash_t hashed, cpdef Lexeme init(id_t i, unicode string, hash_t hashed,
StringStore store, dict props) except *: StringStore store, dict props) except *:
cdef Lexeme lex cdef Lexeme lex
lex.id = i
lex.length = len(string) lex.length = len(string)
lex.sic = get_string_id(string, store) lex.sic = get_string_id(string, store)
lex.cluster = props.get('cluster', 0) lex.cluster = props.get('cluster', 0)
lex.pos = props.get('pos', 0) lex.postype = props.get('postype', 0)
lex.supersense = props.get('supersense', 0) lex.supersense = props.get('supersense', 0)
lex.prob = props.get('prob', 0) lex.prob = props.get('prob', 0)
@@ -55,6 +56,7 @@ cpdef Lexeme init(unicode string, hash_t hashed,
lex.flags = get_flags(string, upper_pc, title_pc, lower_pc) lex.flags = get_flags(string, upper_pc, title_pc, lower_pc)
return lex return lex
cdef id_t get_string_id(unicode string, StringStore store) except 0: cdef id_t get_string_id(unicode string, StringStore store) except 0:
cdef bytes byte_string = string.encode('utf8') cdef bytes byte_string = string.encode('utf8')
cdef Utf8Str* orig_str = store.intern(<char*>byte_string, len(byte_string)) cdef Utf8Str* orig_str = store.intern(<char*>byte_string, len(byte_string))