* Moving feature context stuff to a generalized place

2025-11-12 05:45:53 +03:00 · 2014-11-05 19:55:10 +11:00 · 2014-11-05 19:55:10 +11:00 · 0a8c84625d
commit 0a8c84625d
parent 3733444101
2 changed files with 136 additions and 0 deletions
--- a/spacy/context.pxd
+++ b/spacy/context.pxd
@ -0,0 +1,43 @@
+from thinc.typedefs cimport atom_t
+from .typedefs cimport hash_t
+from .tokens cimport Tokens
+from .lexeme cimport Lexeme
+
+
+cdef struct Token:
+    atom_t i
+    atom_t c
+    atom_t w
+    atom_t shape
+    atom_t pref
+    atom_t suff
+    atom_t oft_title
+    atom_t oft_upper
+    atom_t is_alpha
+    atom_t is_digit
+    atom_t is_title
+    atom_t is_upper
+
+    atom_t url
+    atom_t num
+
+    atom_t postype
+    atom_t pos
+    atom_t ner
+
+
+cdef struct Slots:
+    Token P2
+    Token P1
+    Token N0
+    Token N1
+    Token N2
+
+
+cdef Slots FIELD_IDS
+cdef int N_FIELDS
+
+
+cdef hash_t fill_slots(Slots* s, int i, Tokens tokens) except 0
+
+cdef int fill_flat(atom_t* context, Slots* s) except -1
--- a/spacy/context.pyx
+++ b/spacy/context.pyx
@ -0,0 +1,93 @@
+from murmurhash.mrmr cimport hash64
+from .lexeme cimport *
+
+
+cdef void _number_token(Token* t, int* n_fields):
+    cdef int i = n_fields[0]
+    t.i = i; i += 1
+    t.c = i; i += 1
+    t.w = i; i += 1
+    t.shape = i; i += 1
+    t.pref = i; i += 1
+    t.suff = i; i += 1
+    t.oft_title = i; i += 1
+    t.oft_upper = i; i += 1
+    t.is_alpha = i; i += 1
+    t.is_digit = i; i += 1
+    t.is_title = i; i += 1
+    t.is_upper = i; i += 1
+
+    t.url = i; i += 1
+    t.num = i; i += 1
+
+    t.postype = i; i += 1
+    t.pos = i; i += 1
+    t.ner = i; i += 1
+
+    n_fields[0] = i
+
+
+cdef int fill_token(Token* t, Lexeme* lex, atom_t pos, atom_t ner):
+    t.i = lex.sic
+    t.c = lex.cluster
+    t.w = lex.norm if (lex.prob != 0 and lex.prob >= -10) else lex.shape
+    t.shape = lex.shape
+    t.pref = lex.prefix
+    t.suff = lex.suffix
+
+    t.oft_title = lex.flags & (1 << OFT_TITLE)
+    t.oft_upper = lex.flags & (1 << OFT_UPPER)
+    t.is_alpha = lex.flags & (1 << IS_ALPHA)
+    t.is_digit = lex.flags & (1 << IS_DIGIT)
+    t.is_title = lex.flags & (1 << IS_TITLE)
+    t.is_upper = lex.flags & (1 << IS_UPPER)
+    t.url = lex.flags & (1 << LIKE_URL)
+    t.num = lex.flags & (1 << LIKE_NUMBER)
+    t.postype = lex.postype
+    t.pos = pos
+    t.ner = ner
+
+
+cdef int _flatten_token(atom_t* context, Token* ids, Token* vals) except -1:
+    context[ids.i] = vals.i
+    context[ids.c] = vals.c
+    context[ids.w] = vals.w
+    context[ids.shape] = vals.shape
+    context[ids.pref] = vals.pref
+    context[ids.suff] = vals.suff
+    context[ids.oft_title] = vals.oft_title
+    context[ids.oft_upper] = vals.oft_upper
+    context[ids.is_alpha] = vals.is_alpha
+    context[ids.is_digit] = vals.is_digit
+    context[ids.is_title] = vals.is_title
+    context[ids.is_upper] = vals.is_upper
+    context[ids.url] = vals.url
+    context[ids.num] = vals.num
+    context[ids.postype] = vals.postype
+    context[ids.pos] = vals.pos
+    context[ids.ner] = vals.ner
+
+
+cdef hash_t fill_slots(Slots* s, int i, Tokens tokens) except 0:
+    fill_token(&s.P2, tokens.lex[i-2], tokens.pos[i-2], tokens.ner[i-2])
+    fill_token(&s.P1, tokens.lex[i-1], tokens.pos[i-1], tokens.ner[i-1])
+    fill_token(&s.N0, tokens.lex[i], tokens.pos[i], tokens.ner[i])
+    fill_token(&s.N1, tokens.lex[i+1], tokens.pos[i+1], tokens.ner[i+1])
+    fill_token(&s.N2, tokens.lex[i+2], tokens.pos[i+2], tokens.ner[i+2])
+    return hash64(s, sizeof(Slots), 0)
+
+
+cdef int fill_flat(atom_t* context, Slots* s) except -1:
+    _flatten_token(context, &FIELD_IDS.P2, &s.P2)
+    _flatten_token(context, &FIELD_IDS.P1, &s.P1)
+    _flatten_token(context, &FIELD_IDS.N0, &s.N0)
+    _flatten_token(context, &FIELD_IDS.N1, &s.N1)
+    _flatten_token(context, &FIELD_IDS.N2, &s.N2)
+
+
+N_FIELDS = 0
+_number_token(&FIELD_IDS.P2, &N_FIELDS)
+_number_token(&FIELD_IDS.P1, &N_FIELDS)
+_number_token(&FIELD_IDS.N0, &N_FIELDS)
+_number_token(&FIELD_IDS.N1, &N_FIELDS)
+_number_token(&FIELD_IDS.N2, &N_FIELDS)