* Revise context, focussing on POS tagging for now

2025-11-06 19:07:30 +03:00 · 2014-12-07 15:28:22 +11:00 · 2014-12-07 15:28:22 +11:00 · f5c4f2eb52
commit f5c4f2eb52
parent e27b912ef9
2 changed files with 54 additions and 181 deletions
--- a/spacy/context.pxd
+++ b/spacy/context.pxd
@ -1,66 +1,49 @@
 from thinc.typedefs cimport atom_t
-from .typedefs cimport hash_t
+from .tokens cimport TokenC
 from .tokens cimport Tokens
 from .lexeme cimport Lexeme
-cdef class Token:
+cpdef enum:
-    cdef readonly atom_t sic
+    P2_sic
-    cdef readonly atom_t cluster
+    P2_cluster
-    cdef readonly atom_t norm
+    P2_shape
-    cdef readonly atom_t shape
+    P2_prefix
-    cdef readonly atom_t asciied
+    P2_suffix
-    cdef readonly atom_t prefix
+    P2_pos
-    cdef readonly atom_t suffix
+    P2_sense
    cdef readonly atom_t length
-    cdef readonly atom_t postype
+    P1_sic
-    cdef readonly atom_t nertype
+    P1_cluster
-    cdef readonly atom_t sensetype
+    P1_shape
    P1_prefix
    P1_suffix
    P1_pos
    P1_sense
-    cdef readonly atom_t is_alpha
+    W_sic
-    cdef readonly atom_t is_ascii
+    W_cluster
-    cdef readonly atom_t is_digit
+    W_shape
-    cdef readonly atom_t is_lower
+    W_prefix
-    cdef readonly atom_t is_punct
+    W_suffix
-    cdef readonly atom_t is_space
+    W_pos
-    cdef readonly atom_t is_title
+    W_sense
    cdef readonly atom_t is_upper
    cdef readonly atom_t like_url
    cdef readonly atom_t like_number
    cdef readonly atom_t oft_lower
    cdef readonly atom_t oft_title
    cdef readonly atom_t oft_upper
-    cdef readonly atom_t in_males
+    N1_sic
-    cdef readonly atom_t in_females
+    N1_cluster
-    cdef readonly atom_t in_surnames
+    N1_shape
-    cdef readonly atom_t in_places
+    N1_prefix
-    cdef readonly atom_t in_games
+    N1_suffix
-    cdef readonly atom_t in_celebs
+    N1_pos
-    cdef readonly atom_t in_names
+    N1_sense
-    cdef readonly atom_t pos
+    N2_sic
-    cdef readonly atom_t sense
+    N2_cluster
-    cdef readonly atom_t ner
+    N2_shape
    N2_prefix
    N2_suffix
    N2_pos
    N2_sense
    N_FIELDS
-cdef class Slots:
+cdef int fill_context(atom_t[N_FIELDS] context, const int i, TokenC* tokens) except -1
    cdef readonly Token P4
    cdef readonly Token P3
    cdef readonly Token P2
    cdef readonly Token P1
    cdef readonly Token N0
    cdef readonly Token N1
    cdef readonly Token N2
    cdef readonly Token N3
    cdef readonly Token N4
 cdef int N_FIELDS
 cdef int fill_context(atom_t* context, int i, Tokens tokens) except -1
 cpdef Slots FIELD_IDS
--- a/spacy/context.pyx
+++ b/spacy/context.pyx
@ -1,126 +1,16 @@
-from murmurhash.mrmr cimport hash64
+cdef int fill_context(atom_t[N_FIELDS] context, const int i, TokenC* tokens) except -1:
-from .lexeme cimport *
+    _fill_from_token(&context[P2_sic], &tokens[i-2])
    _fill_from_token(&context[P1_sic], &tokens[i-1])
    _fill_from_token(&context[W_sic], &tokens[i])
    _fill_from_token(&context[N1_sic], &tokens[i+1])
    _fill_from_token(&context[N2_sic], &tokens[i+2])
-cdef class Slots:
+cdef inline void _fill_from_token(atom_t[N_FIELDS] context, const TokenC* t) nogil:
-    def __init__(self):
+    context[0] = t.lex.sic
-        self.P4 = Token()
+    context[1] = t.lex.cluster
-        self.P3 = Token()
+    context[2] = t.lex.shape
-        self.P2 = Token()
+    context[3] = t.lex.prefix
-        self.P1 = Token()
+    context[4] = t.lex.suffix
-        self.N0 = Token()
+    context[5] = t.pos
-        self.N1 = Token()
+    context[6] = t.sense
        self.N2 = Token()
        self.N3 = Token()
        self.N4 = Token()
 # Assign consecutive integer indices to the field attributes of `t`.
 #
 # The starting index is read from n_fields[0]; each attribute listed below
 # receives the current index, which is then incremented. The next unused
 # index is written back to n_fields[0], so repeated calls sharing the same
 # counter hand out non-overlapping index ranges.
 #
 # NOTE(review): `asciied` is not assigned an index here, although
 # _fill_token indexes the context array with t.asciied -- confirm which
 # side is intended.
 cdef void _number_token(Token t, int* n_fields):
    cdef int i = n_fields[0]
    t.sic = i; i += 1
    t.cluster = i; i += 1
    t.norm = i; i += 1
    t.shape = i; i += 1
    t.prefix = i; i += 1
    t.suffix = i; i += 1
    t.length = i; i += 1
    t.postype = i; i += 1
    t.nertype = i; i += 1
    t.sensetype = i; i += 1
    t.is_alpha = i; i += 1
    t.is_ascii = i; i += 1
    t.is_digit = i; i += 1
    t.is_lower = i; i += 1
    t.is_punct = i; i += 1
    t.is_space = i; i += 1
    t.is_title = i; i += 1
    t.is_upper = i; i += 1
    t.like_number = i; i += 1
    t.like_url = i; i += 1
    t.oft_lower = i; i += 1
    t.oft_title = i; i += 1
    t.oft_upper = i; i += 1
    t.in_males = i; i += 1
    t.in_females = i; i += 1
    t.in_surnames = i; i += 1
    t.in_places = i; i += 1
    t.in_games = i; i += 1
    t.in_celebs = i; i += 1
    t.in_names = i; i += 1
    t.pos = i; i += 1
    t.sense = i; i += 1
    t.ner = i; i += 1
    # Publish the next free index back to the caller's counter.
    n_fields[0] = i
 # Write the features of one token into the context array `c`.
 #
 # `t` supplies the index at which each feature is stored; `lex` supplies the
 # lexeme attributes and flag bits; `pos` and `ner` are stored as given.
 #
 # NOTE(review): the function is declared to return int but contains no
 # explicit return statement.
 # NOTE(review): t.asciied is used as an index below but is not assigned by
 # _number_token, and t.is_ascii is numbered there but never written here --
 # confirm whether these two were meant to match.
 cdef int _fill_token(atom_t* c, Token t, Lexeme* lex, atom_t pos, atom_t ner):
    c[t.sic] = lex.sic
    c[t.cluster] = lex.cluster
    # The norm is used only when lex.prob is non-zero and at least -10;
    # otherwise the shape is stored in the norm slot instead.
    c[t.norm] = lex.norm if (lex.prob != 0 and lex.prob >= -10) else lex.shape
    c[t.shape] = lex.shape
    c[t.asciied] = lex.asciied
    c[t.prefix] = lex.prefix
    c[t.suffix] = lex.suffix
    c[t.length] = lex.length
    c[t.postype] = lex.postype
    # These two slots are always zeroed here.
    c[t.nertype] = 0
    c[t.sensetype] = 0
    # Each flag feature is lex.flags masked with a single bit, so the stored
    # value is either 0 or (1 << FLAG); it is not normalised to 0/1.
    c[t.is_alpha] = lex.flags & (1 << IS_ALPHA)
    c[t.is_digit] = lex.flags & (1 << IS_DIGIT)
    c[t.is_lower] = lex.flags & (1 << IS_LOWER)
    c[t.is_punct] = lex.flags & (1 << IS_PUNCT)
    c[t.is_space] = lex.flags & (1 << IS_SPACE)
    c[t.is_title] = lex.flags & (1 << IS_TITLE)
    c[t.is_upper] = lex.flags & (1 << IS_UPPER)
    c[t.like_url] = lex.flags & (1 << LIKE_URL)
    c[t.like_number] = lex.flags & (1 << LIKE_NUMBER)
    c[t.oft_lower] = lex.flags & (1 << OFT_LOWER)
    c[t.oft_title] = lex.flags & (1 << OFT_TITLE)
    c[t.oft_upper] = lex.flags & (1 << OFT_UPPER)
    c[t.in_males] = lex.flags & (1 << IN_MALES)
    c[t.in_females] = lex.flags & (1 << IN_FEMALES)
    c[t.in_surnames] = lex.flags & (1 << IN_SURNAMES)
    c[t.in_places] = lex.flags & (1 << IN_PLACES)
    c[t.in_games] = lex.flags & (1 << IN_GAMES)
    c[t.in_celebs] = lex.flags & (1 << IN_CELEBS)
    c[t.in_names] = lex.flags & (1 << IN_NAMES)
    c[t.pos] = pos
    # The sense slot is always zeroed here.
    c[t.sense] = 0
    c[t.ner] = ner
 # Fill `context` with the features of the nine tokens from i-4 to i+4.
 #
 # Each position is written through _fill_token using the matching slot of
 # FIELD_IDS (P4..P1 for the preceding tokens, N0 for token i, N1..N4 for the
 # following tokens). Returns 1; the signature declares -1 as the error value.
 #
 # NOTE(review): no bounds checks are performed here on i-4 .. i+4 --
 # presumably the caller guarantees the token arrays are padded; confirm.
 cdef int fill_context(atom_t* context, int i, Tokens tokens) except -1:
    _fill_token(context, FIELD_IDS.P4, tokens.lex[i-4], tokens.pos[i-4], tokens.ner[i-4])
    _fill_token(context, FIELD_IDS.P3, tokens.lex[i-3], tokens.pos[i-3], tokens.ner[i-3])
    _fill_token(context, FIELD_IDS.P2, tokens.lex[i-2], tokens.pos[i-2], tokens.ner[i-2])
    _fill_token(context, FIELD_IDS.P1, tokens.lex[i-1], tokens.pos[i-1], tokens.ner[i-1])
    _fill_token(context, FIELD_IDS.N0, tokens.lex[i], tokens.pos[i], tokens.ner[i])
    _fill_token(context, FIELD_IDS.N1, tokens.lex[i+1], tokens.pos[i+1], tokens.ner[i+1])
    _fill_token(context, FIELD_IDS.N2, tokens.lex[i+2], tokens.pos[i+2], tokens.ner[i+2])
    _fill_token(context, FIELD_IDS.N3, tokens.lex[i+3], tokens.pos[i+3], tokens.ner[i+3])
    _fill_token(context, FIELD_IDS.N4, tokens.lex[i+4], tokens.pos[i+4], tokens.ner[i+4])
    return 1
 # Module-level setup: number every field of every slot in FIELD_IDS.
 # N_FIELDS starts at 0 and is advanced by each _number_token call, so after
 # the last call it holds the total number of indices handed out.
 N_FIELDS = 0
 FIELD_IDS = Slots()
 # Slots are numbered in window order, from P4 through N4.
 _number_token(FIELD_IDS.P4, &N_FIELDS)
 _number_token(FIELD_IDS.P3, &N_FIELDS)
 _number_token(FIELD_IDS.P2, &N_FIELDS)
 _number_token(FIELD_IDS.P1, &N_FIELDS)
 _number_token(FIELD_IDS.N0, &N_FIELDS)
 _number_token(FIELD_IDS.N1, &N_FIELDS)
 _number_token(FIELD_IDS.N2, &N_FIELDS)
 _number_token(FIELD_IDS.N3, &N_FIELDS)
 _number_token(FIELD_IDS.N4, &N_FIELDS)