spaCy/spacy/context.pyx

104 lines
3.0 KiB
Cython
Raw Normal View History

from murmurhash.mrmr cimport hash64
from .lexeme cimport *
cdef class Slots:
def __init__(self):
self.P2 = Token()
self.P1 = Token()
self.N0 = Token()
self.N1 = Token()
self.N2 = Token()
cdef void _number_token(Token t, int* n_fields):
cdef int i = n_fields[0]
t.i = i; i += 1
t.c = i; i += 1
t.w = i; i += 1
t.shape = i; i += 1
t.pref = i; i += 1
t.suff = i; i += 1
t.oft_title = i; i += 1
t.oft_upper = i; i += 1
t.is_alpha = i; i += 1
t.is_digit = i; i += 1
t.is_title = i; i += 1
t.is_upper = i; i += 1
t.url = i; i += 1
t.num = i; i += 1
t.postype = i; i += 1
t.pos = i; i += 1
t.ner = i; i += 1
n_fields[0] = i
cdef int fill_token(Token t, Lexeme* lex, atom_t pos, atom_t ner):
t.i = lex.sic
t.c = lex.cluster
t.w = lex.norm if (lex.prob != 0 and lex.prob >= -10) else lex.shape
t.shape = lex.shape
t.pref = lex.prefix
t.suff = lex.suffix
t.oft_title = lex.flags & (1 << OFT_TITLE)
t.oft_upper = lex.flags & (1 << OFT_UPPER)
t.is_alpha = lex.flags & (1 << IS_ALPHA)
t.is_digit = lex.flags & (1 << IS_DIGIT)
t.is_title = lex.flags & (1 << IS_TITLE)
t.is_upper = lex.flags & (1 << IS_UPPER)
t.url = lex.flags & (1 << LIKE_URL)
t.num = lex.flags & (1 << LIKE_NUMBER)
t.postype = lex.postype
t.pos = pos
t.ner = ner
cdef int _flatten_token(atom_t* context, Token ids, Token vals) except -1:
context[ids.i] = vals.i
context[ids.c] = vals.c
context[ids.w] = vals.w
context[ids.shape] = vals.shape
context[ids.pref] = vals.pref
context[ids.suff] = vals.suff
context[ids.oft_title] = vals.oft_title
context[ids.oft_upper] = vals.oft_upper
context[ids.is_alpha] = vals.is_alpha
context[ids.is_digit] = vals.is_digit
context[ids.is_title] = vals.is_title
context[ids.is_upper] = vals.is_upper
context[ids.url] = vals.url
context[ids.num] = vals.num
context[ids.postype] = vals.postype
context[ids.pos] = vals.pos
context[ids.ner] = vals.ner
cdef hash_t fill_slots(Slots s, int i, Tokens tokens) except 0:
fill_token(s.P2, tokens.lex[i-2], tokens.pos[i-2], tokens.ner[i-2])
fill_token(s.P1, tokens.lex[i-1], tokens.pos[i-1], tokens.ner[i-1])
fill_token(s.N0, tokens.lex[i], tokens.pos[i], tokens.ner[i])
fill_token(s.N1, tokens.lex[i+1], tokens.pos[i+1], tokens.ner[i+1])
fill_token(s.N2, tokens.lex[i+2], tokens.pos[i+2], tokens.ner[i+2])
return 1
cdef int fill_flat(atom_t* context, Slots s) except -1:
_flatten_token(context, FIELD_IDS.P2, s.P2)
_flatten_token(context, FIELD_IDS.P1, s.P1)
_flatten_token(context, FIELD_IDS.N0, s.N0)
_flatten_token(context, FIELD_IDS.N1, s.N1)
_flatten_token(context, FIELD_IDS.N2, s.N2)
N_FIELDS = 0
FIELD_IDS = Slots()
_number_token(FIELD_IDS.P2, &N_FIELDS)
_number_token(FIELD_IDS.P1, &N_FIELDS)
_number_token(FIELD_IDS.N0, &N_FIELDS)
_number_token(FIELD_IDS.N1, &N_FIELDS)
_number_token(FIELD_IDS.N2, &N_FIELDS)