2014-09-15 05:22:40 +04:00
|
|
|
from cpython.ref cimport Py_INCREF
|
2014-09-18 01:09:24 +04:00
|
|
|
from cymem.cymem cimport Pool
|
2014-10-29 15:19:38 +03:00
|
|
|
from murmurhash.mrmr cimport hash64
|
2014-09-15 05:22:40 +04:00
|
|
|
|
2014-10-22 18:57:59 +04:00
|
|
|
from libc.string cimport memset
|
|
|
|
|
2014-10-09 12:53:30 +04:00
|
|
|
import orth
|
2014-09-10 22:41:37 +04:00
|
|
|
|
2014-10-23 17:59:17 +04:00
|
|
|
from .utf8string cimport Utf8Str
|
2014-09-10 22:41:37 +04:00
|
|
|
|
2014-10-23 17:59:17 +04:00
|
|
|
# Module-level constant, initialised to 0.
# NOTE(review): the name suggests these are the distributional flag bits given
# to out-of-vocabulary words -- confirm against the callers that read it.
OOV_DIST_FLAGS = 0
|
2014-10-09 12:53:30 +04:00
|
|
|
|
2014-10-23 17:59:17 +04:00
|
|
|
# Zero every byte of the module-level EMPTY_LEXEME struct when the module is
# loaded, so all of its fields start out as 0.
memset(&EMPTY_LEXEME, 0, sizeof(Lexeme))
|
2014-10-09 12:53:30 +04:00
|
|
|
|
|
|
|
|
2014-10-29 15:19:38 +03:00
|
|
|
def get_flags(unicode string, float upper_pc, float title_pc, float lower_pc):
    """Pack the orthographic predicates of ``string`` into a single bit field.

    Each ``orth`` predicate is called on ``string`` and its result is shifted
    into the bit position named by the matching ``IS_*`` / ``LIKE_*`` constant.

    Args:
        string: The word form to inspect.
        upper_pc, title_pc, lower_pc: Accepted for signature compatibility;
            this function does not read them.

    Returns:
        The combined flags value.
    """
    cdef flag_t bits = 0
    # (predicate, bit position) pairs, kept in the order they are evaluated.
    checks = (
        (orth.is_alpha, IS_ALPHA),
        (orth.is_ascii, IS_ASCII),
        (orth.is_digit, IS_DIGIT),
        (orth.is_lower, IS_LOWER),
        (orth.is_punct, IS_PUNCT),
        (orth.is_space, IS_SPACE),
        (orth.is_title, IS_TITLE),
        (orth.is_upper, IS_UPPER),
        (orth.like_url, LIKE_URL),
        (orth.like_number, LIKE_NUMBER),
    )
    for predicate, bit_index in checks:
        bits |= predicate(string) << bit_index
    return bits
|
2014-10-09 12:53:30 +04:00
|
|
|
|
2014-10-10 12:17:22 +04:00
|
|
|
|
2014-10-31 09:43:00 +03:00
|
|
|
cpdef Lexeme init(id_t i, unicode string, hash_t hashed,
                  StringStore store, dict props) except *:
    """Build and return the ``Lexeme`` struct describing ``string``.

    Args:
        i: Value stored in ``lex.id``.
        string: The word form. Its derived strings (prefix, suffix, norm,
            shape, ASCII form, non-sparse form) are interned in ``store``.
        hashed: Accepted for signature compatibility; not read here.
        store: String store used to turn each derived string into an id.
        props: Optional per-word values. Keys read, each defaulting to 0:
            ``cluster``, ``postype``, ``supersense``, ``prob``,
            ``upper_pc``, ``lower_pc``, ``title_pc``.

    Returns:
        The populated ``Lexeme``, by value.

    Raises:
        Any exception raised while interning strings or by the ``orth``
        helpers is propagated (``except *``).
    """
    cdef Lexeme lex
    # Start from an all-zero struct so that any field not assigned below (and
    # any padding) has a defined value instead of leftover stack contents.
    memset(&lex, 0, sizeof(Lexeme))

    lex.id = i
    lex.length = len(string)
    lex.sic = get_string_id(string, store)

    lex.cluster = props.get('cluster', 0)
    lex.postype = props.get('postype', 0)
    lex.supersense = props.get('supersense', 0)
    lex.prob = props.get('prob', 0)

    cdef float upper_pc = props.get('upper_pc', 0.0)
    cdef float lower_pc = props.get('lower_pc', 0.0)
    cdef float title_pc = props.get('title_pc', 0.0)

    # Slice rather than index: identical for non-empty input, but an empty
    # string yields an empty prefix (as the suffix slice already does)
    # instead of raising IndexError.
    lex.prefix = get_string_id(string[:1], store)
    lex.suffix = get_string_id(string[-3:], store)

    # Only compute a canonical casing when case statistics were supplied;
    # otherwise the normalised form is the original string itself.
    if upper_pc or lower_pc or title_pc:
        canon_cased = orth.canon_case(string, upper_pc, title_pc, lower_pc)
        lex.norm = get_string_id(canon_cased, store)
    else:
        lex.norm = lex.sic

    lex.shape = get_string_id(orth.word_shape(string), store)
    lex.asciied = get_string_id(orth.asciied(string), store)
    non_sparse = orth.non_sparse(string, lex.prob, lex.cluster,
                                 upper_pc, title_pc, lower_pc)
    lex.vocab10k = get_string_id(non_sparse, store)
    lex.flags = get_flags(string, upper_pc, title_pc, lower_pc)
    return lex
|
|
|
|
|
2014-10-31 09:43:00 +03:00
|
|
|
|
2014-10-30 07:42:15 +03:00
|
|
|
cdef id_t get_string_id(unicode string, StringStore store) except 0:
    """Intern ``string`` in ``store`` and return the id of the stored entry.

    The string is encoded as UTF-8 and handed to ``store.intern`` together
    with its byte length; the ``i`` field of the returned ``Utf8Str`` is the
    result. A return value of 0 signals an error to callers (``except 0``).
    """
    # Keep the encoded bytes in a named local: the char* passed to the store
    # points into this object's buffer.
    cdef bytes utf8_bytes = string.encode('utf8')
    cdef Utf8Str* interned = store.intern(<char*>utf8_bytes, len(utf8_bytes))
    return interned.i
|