mirror of
https://github.com/explosion/spaCy.git
synced 2024-12-25 17:36:30 +03:00
* Moving feature context stuff to a generalized place
This commit is contained in:
parent
3733444101
commit
0a8c84625d
43
spacy/context.pxd
Normal file
43
spacy/context.pxd
Normal file
|
@ -0,0 +1,43 @@
|
|||
from thinc.typedefs cimport atom_t
|
||||
from .typedefs cimport hash_t
|
||||
from .tokens cimport Tokens
|
||||
from .lexeme cimport Lexeme
|
||||
|
||||
|
||||
cdef struct Token:
    # Feature record for a single token; every member is an atom_t.
    # In context.pyx the same layout is used two ways: fill_token() stores
    # feature *values* here, while _number_token() stores the flat-array
    # *index* of each feature (see FIELD_IDS).
    # NOTE: member order matters. _number_token() numbers the members in
    # this order and fill_slots() hashes the raw bytes of the enclosing
    # Slots struct, so reordering changes both indices and hash values.

    # Lexical attributes copied from the Lexeme.
    atom_t i            # lex.sic
    atom_t c            # lex.cluster
    atom_t w            # lex.norm, or lex.shape when prob is 0 or below -10
    atom_t shape        # lex.shape
    atom_t pref         # lex.prefix
    atom_t suff         # lex.suffix

    # Single bits masked out of lex.flags (stored unshifted).
    atom_t oft_title    # OFT_TITLE
    atom_t oft_upper    # OFT_UPPER
    atom_t is_alpha     # IS_ALPHA
    atom_t is_digit     # IS_DIGIT
    atom_t is_title     # IS_TITLE
    atom_t is_upper     # IS_UPPER

    atom_t url          # LIKE_URL bit of lex.flags
    atom_t num          # LIKE_NUMBER bit of lex.flags

    atom_t postype      # lex.postype
    atom_t pos          # supplied by the caller of fill_token()
    atom_t ner          # supplied by the caller of fill_token()
|
||||
|
||||
|
||||
cdef struct Slots:
    # Five-token window around a position i. fill_slots() in context.pyx
    # populates the members from the token offsets shown on the right.
    # Member order is significant: it fixes the field numbering built into
    # FIELD_IDS and the byte layout that fill_slots() hashes.
    Token P2    # i - 2
    Token P1    # i - 1
    Token N0    # i
    Token N1    # i + 1
    Token N2    # i + 2
|
||||
|
||||
|
||||
# Slots-shaped table whose members hold flat-array indices instead of feature
# values. context.pyx fills it at module level via _number_token().
cdef Slots FIELD_IDS

# Running count of numbered fields; after context.pyx has initialised
# FIELD_IDS it holds the total number of indices handed out.
cdef int N_FIELDS


# Fill `s` with the features of tokens i-2 .. i+2 and return a 64-bit hash of
# the filled struct. Declared `except 0`: a return value of 0 signals an error.
cdef hash_t fill_slots(Slots* s, int i, Tokens tokens) except 0


# Copy every value in `s` into `context` at the index recorded for that field
# in FIELD_IDS. Declared `except -1`: a return value of -1 signals an error.
cdef int fill_flat(atom_t* context, Slots* s) except -1
|
93
spacy/context.pyx
Normal file
93
spacy/context.pyx
Normal file
|
@ -0,0 +1,93 @@
|
|||
from murmurhash.mrmr cimport hash64
|
||||
from .lexeme cimport *
|
||||
|
||||
|
||||
cdef void _number_token(Token* t, int* n_fields):
    """Give every field of one Token a consecutive flat-array index.

    Numbering starts at the value currently stored in n_fields[0] and
    follows the order in which the members are declared in the Token
    struct. Before returning, n_fields[0] is advanced past the last index
    used, so back-to-back calls number successive tokens without overlap.
    """
    cdef int base = n_fields[0]

    # Lexical attributes.
    t.i = base
    t.c = base + 1
    t.w = base + 2
    t.shape = base + 3
    t.pref = base + 4
    t.suff = base + 5

    # Flag bits.
    t.oft_title = base + 6
    t.oft_upper = base + 7
    t.is_alpha = base + 8
    t.is_digit = base + 9
    t.is_title = base + 10
    t.is_upper = base + 11

    t.url = base + 12
    t.num = base + 13

    # Tag-like attributes.
    t.postype = base + 14
    t.pos = base + 15
    t.ner = base + 16

    # 17 fields were numbered; hand the next free index back to the caller.
    n_fields[0] = base + 17
|
||||
|
||||
|
||||
cdef int fill_token(Token* t, Lexeme* lex, atom_t pos, atom_t ner):
    """Populate the feature record `t` for one token.

    Lexical attributes and flag bits are read from `lex`; `pos` and `ner`
    are stored exactly as passed in. Every member of `t` is overwritten.
    There is no explicit return statement.
    """
    # Values handed in by the caller.
    t.pos = pos
    t.ner = ner

    # Attributes copied straight from the lexeme.
    t.i = lex.sic
    t.c = lex.cluster
    t.shape = lex.shape
    t.pref = lex.prefix
    t.suff = lex.suffix
    t.postype = lex.postype

    # Use the norm only when prob is non-zero and at least -10; otherwise
    # fall back to the shape.
    t.w = lex.norm if (lex.prob != 0 and lex.prob >= -10) else lex.shape

    # Each flag feature is the masked bit itself (not shifted down to 0/1).
    t.oft_title = lex.flags & (1 << OFT_TITLE)
    t.oft_upper = lex.flags & (1 << OFT_UPPER)
    t.is_alpha = lex.flags & (1 << IS_ALPHA)
    t.is_digit = lex.flags & (1 << IS_DIGIT)
    t.is_title = lex.flags & (1 << IS_TITLE)
    t.is_upper = lex.flags & (1 << IS_UPPER)
    t.url = lex.flags & (1 << LIKE_URL)
    t.num = lex.flags & (1 << LIKE_NUMBER)
|
||||
|
||||
|
||||
cdef int _flatten_token(atom_t* context, Token* ids, Token* vals) except -1:
    """Scatter one token's feature values into a flat array.

    For every Token member, the value held in `vals` is written to
    `context` at the position held in the same member of `ids`. No bounds
    checking is done on the indices taken from `ids`.

    Returns 0; -1 is reserved as the error value by the signature.
    """
    # Lexical attributes.
    context[ids.i] = vals.i
    context[ids.c] = vals.c
    context[ids.w] = vals.w
    context[ids.shape] = vals.shape
    context[ids.pref] = vals.pref
    context[ids.suff] = vals.suff

    # Flag bits.
    context[ids.oft_title] = vals.oft_title
    context[ids.oft_upper] = vals.oft_upper
    context[ids.is_alpha] = vals.is_alpha
    context[ids.is_digit] = vals.is_digit
    context[ids.is_title] = vals.is_title
    context[ids.is_upper] = vals.is_upper
    context[ids.url] = vals.url
    context[ids.num] = vals.num

    # Tag-like attributes.
    context[ids.postype] = vals.postype
    context[ids.pos] = vals.pos
    context[ids.ner] = vals.ner
    return 0
|
||||
|
||||
|
||||
cdef hash_t fill_slots(Slots* s, int i, Tokens tokens) except 0:
    """Fill the five-token window around position `i` and hash it.

    `s.P2`, `s.P1`, `s.N0`, `s.N1` and `s.N2` are populated from the tokens
    at i-2, i-1, i, i+1 and i+2 respectively, using the parallel `lex`,
    `pos` and `ner` arrays of `tokens`.

    Returns a 64-bit hash (seed 0) of the raw bytes of the filled struct.
    The result is never 0, because the signature reserves 0 as the error
    value.

    NOTE(review): indices i-2 .. i+2 are read without any bounds check in
    this function -- presumably Tokens pads its arrays at both ends; confirm
    against the Tokens implementation.
    """
    cdef hash_t key

    fill_token(&s.P2, tokens.lex[i-2], tokens.pos[i-2], tokens.ner[i-2])
    fill_token(&s.P1, tokens.lex[i-1], tokens.pos[i-1], tokens.ner[i-1])
    fill_token(&s.N0, tokens.lex[i], tokens.pos[i], tokens.ner[i])
    fill_token(&s.N1, tokens.lex[i+1], tokens.pos[i+1], tokens.ner[i+1])
    fill_token(&s.N2, tokens.lex[i+2], tokens.pos[i+2], tokens.ner[i+2])

    key = hash64(s, sizeof(Slots), 0)
    # With a plain `except 0` clause, callers treat a return value of 0 as
    # "an exception was raised". A genuine hash of 0 would therefore be
    # reported as an error with no exception set, so remap it to 1.
    if key == 0:
        key = 1
    return key
|
||||
|
||||
|
||||
cdef int fill_flat(atom_t* context, Slots* s) except -1:
    """Flatten a filled Slots window into the atom array `context`.

    Each of the five tokens in `s` is written through _flatten_token(),
    using the matching member of the module-level FIELD_IDS table to decide
    where every value lands. After module initialisation those indices run
    from 0 to N_FIELDS - 1, so `context` needs room for N_FIELDS entries.

    Returns 0; -1 is reserved as the error value by the signature.
    """
    # Previous two tokens.
    _flatten_token(context, &FIELD_IDS.P2, &s.P2)
    _flatten_token(context, &FIELD_IDS.P1, &s.P1)
    # Current token.
    _flatten_token(context, &FIELD_IDS.N0, &s.N0)
    # Next two tokens.
    _flatten_token(context, &FIELD_IDS.N1, &s.N1)
    _flatten_token(context, &FIELD_IDS.N2, &s.N2)
    return 0
|
||||
|
||||
|
||||
# Build the FIELD_IDS table once, when the module is first imported.
# Each call numbers the 17 fields of one window slot and advances N_FIELDS,
# so P2 receives indices 0-16, P1 17-33, N0 34-50, N1 51-67 and N2 68-84,
# leaving N_FIELDS at 85. The call order fixes the numbering; do not reorder.
N_FIELDS = 0
_number_token(&FIELD_IDS.P2, &N_FIELDS)
_number_token(&FIELD_IDS.P1, &N_FIELDS)
_number_token(&FIELD_IDS.N0, &N_FIELDS)
_number_token(&FIELD_IDS.N1, &N_FIELDS)
_number_token(&FIELD_IDS.N2, &N_FIELDS)
|
Loading…
Reference in New Issue
Block a user