2015-02-02 08:37:25 +03:00
|
|
|
from libc.string cimport memset
|
|
|
|
|
|
|
|
from murmurhash.mrmr cimport hash64
|
|
|
|
from ._state cimport entity_is_open
|
|
|
|
from ..lexeme cimport *
|
|
|
|
|
|
|
|
|
|
|
|
cdef int _fill_token(atom_t* c, Lexeme* lex, atom_t pos):
|
|
|
|
c[T_sic] = lex.sic
|
|
|
|
c[T_cluster] = lex.cluster
|
|
|
|
c[T_norm] = lex.norm if (lex.prob != 0 and lex.prob >= -10) else lex.shape
|
|
|
|
c[T_shape] = lex.shape
|
|
|
|
c[T_asciied] = lex.asciied
|
|
|
|
c[T_prefix] = lex.prefix
|
|
|
|
c[T_suffix] = lex.suffix
|
|
|
|
c[T_length] = lex.length
|
|
|
|
|
|
|
|
c[T_postype] = lex.postype
|
|
|
|
c[T_nertype] = 0
|
|
|
|
c[T_sensetype] = 0
|
2015-04-19 11:31:31 +03:00
|
|
|
|
2015-02-02 08:37:25 +03:00
|
|
|
c[T_is_alpha] = lex.flags & (1 << IS_ALPHA)
|
|
|
|
c[T_is_digit] = lex.flags & (1 << IS_DIGIT)
|
|
|
|
c[T_is_lower] = lex.flags & (1 << IS_LOWER)
|
|
|
|
c[T_is_punct] = lex.flags & (1 << IS_PUNCT)
|
|
|
|
c[T_is_space] = lex.flags & (1 << IS_SPACE)
|
|
|
|
c[T_is_title] = lex.flags & (1 << IS_TITLE)
|
|
|
|
c[T_is_upper] = lex.flags & (1 << IS_UPPER)
|
|
|
|
c[T_like_url] = lex.flags & (1 << LIKE_URL)
|
|
|
|
c[T_like_number] = lex.flags & (1 << LIKE_NUMBER)
|
|
|
|
c[T_oft_lower] = lex.flags & (1 << OFT_LOWER)
|
|
|
|
c[T_oft_title] = lex.flags & (1 << OFT_TITLE)
|
|
|
|
c[T_oft_upper] = lex.flags & (1 << OFT_UPPER)
|
|
|
|
|
|
|
|
c[T_in_males] = lex.flags & (1 << IN_MALES)
|
|
|
|
c[T_in_females] = lex.flags & (1 << IN_FEMALES)
|
|
|
|
c[T_in_surnames] = lex.flags & (1 << IN_SURNAMES)
|
|
|
|
c[T_in_places] = lex.flags & (1 << IN_PLACES)
|
|
|
|
c[T_in_celebs] = lex.flags & (1 << IN_CELEBS)
|
|
|
|
c[T_in_names] = lex.flags & (1 << IN_NAMES)
|
|
|
|
|
|
|
|
c[T_pos] = pos
|
|
|
|
c[T_sense] = 0
|
|
|
|
|
|
|
|
|
|
|
|
cdef int _fill_outer_token(atom_t* c, Lexeme* lex, atom_t pos):
|
|
|
|
c[0] = lex.sic
|
|
|
|
c[1] = lex.cluster
|
|
|
|
c[2] = lex.shape
|
|
|
|
c[3] = pos
|
|
|
|
|
|
|
|
|
|
|
|
cdef int fill_context(atom_t* context, State* s, Tokens tokens) except -1:
|
|
|
|
cdef int i
|
|
|
|
for i in range(N_FIELDS):
|
|
|
|
context[i] = 0
|
|
|
|
i = s.i
|
|
|
|
_fill_token(&context[P2_sic], tokens.lex[i-2], tokens.pos[i-2])
|
|
|
|
_fill_token(&context[P1_sic], tokens.lex[i-1], tokens.pos[i-1])
|
|
|
|
_fill_token(&context[W_sic], tokens.lex[i], tokens.pos[i])
|
|
|
|
_fill_token(&context[N1_sic], tokens.lex[i+1], tokens.pos[i+1])
|
|
|
|
_fill_token(&context[N2_sic], tokens.lex[i+2], tokens.pos[i+2])
|
|
|
|
|
|
|
|
cdef atom_t[5] ent_vals
|
|
|
|
if entity_is_open(s):
|
|
|
|
context[E0_sic] = tokens.lex[s.curr.start].sic
|
|
|
|
context[E0_cluster] = tokens.lex[s.curr.start].cluster
|
|
|
|
context[E0_pos] = tokens.pos[s.curr.start]
|
|
|
|
context[E_last_sic] = tokens.lex[s.i-1].sic
|
|
|
|
context[E_last_cluster] = tokens.lex[s.i-1].cluster
|
|
|
|
context[E_last_pos] = tokens.pos[s.i-1]
|
|
|
|
if (s.curr.start + 1) < s.i:
|
|
|
|
context[E1_sic] = tokens.lex[s.curr.start+1].sic
|
|
|
|
context[E1_cluster] = tokens.lex[s.curr.start+1].cluster
|
|
|
|
context[E1_pos] = tokens.pos[s.curr.start+1]
|
|
|
|
return 1
|