* Revise context, focussing on POS tagging for now

This commit is contained in:
Matthew Honnibal 2014-12-07 15:28:22 +11:00
parent e27b912ef9
commit f5c4f2eb52
2 changed files with 54 additions and 181 deletions

View File

@ -1,66 +1,49 @@
from thinc.typedefs cimport atom_t from thinc.typedefs cimport atom_t
from .typedefs cimport hash_t from .tokens cimport TokenC
from .tokens cimport Tokens
from .lexeme cimport Lexeme
cdef class Token: cpdef enum:
cdef readonly atom_t sic P2_sic
cdef readonly atom_t cluster P2_cluster
cdef readonly atom_t norm P2_shape
cdef readonly atom_t shape P2_prefix
cdef readonly atom_t asciied P2_suffix
cdef readonly atom_t prefix P2_pos
cdef readonly atom_t suffix P2_sense
cdef readonly atom_t length
cdef readonly atom_t postype P1_sic
cdef readonly atom_t nertype P1_cluster
cdef readonly atom_t sensetype P1_shape
P1_prefix
P1_suffix
P1_pos
P1_sense
cdef readonly atom_t is_alpha W_sic
cdef readonly atom_t is_ascii W_cluster
cdef readonly atom_t is_digit W_shape
cdef readonly atom_t is_lower W_prefix
cdef readonly atom_t is_punct W_suffix
cdef readonly atom_t is_space W_pos
cdef readonly atom_t is_title W_sense
cdef readonly atom_t is_upper
cdef readonly atom_t like_url
cdef readonly atom_t like_number
cdef readonly atom_t oft_lower
cdef readonly atom_t oft_title
cdef readonly atom_t oft_upper
cdef readonly atom_t in_males N1_sic
cdef readonly atom_t in_females N1_cluster
cdef readonly atom_t in_surnames N1_shape
cdef readonly atom_t in_places N1_prefix
cdef readonly atom_t in_games N1_suffix
cdef readonly atom_t in_celebs N1_pos
cdef readonly atom_t in_names N1_sense
cdef readonly atom_t pos N2_sic
cdef readonly atom_t sense N2_cluster
cdef readonly atom_t ner N2_shape
N2_prefix
N2_suffix
N2_pos
N2_sense
N_FIELDS
cdef class Slots: cdef int fill_context(atom_t[N_FIELDS] context, const int i, TokenC* tokens) except -1
cdef readonly Token P4
cdef readonly Token P3
cdef readonly Token P2
cdef readonly Token P1
cdef readonly Token N0
cdef readonly Token N1
cdef readonly Token N2
cdef readonly Token N3
cdef readonly Token N4
cdef int N_FIELDS
cdef int fill_context(atom_t* context, int i, Tokens tokens) except -1
cpdef Slots FIELD_IDS

View File

@ -1,126 +1,16 @@
from murmurhash.mrmr cimport hash64 cdef int fill_context(atom_t[N_FIELDS] context, const int i, TokenC* tokens) except -1:
from .lexeme cimport * _fill_from_token(&context[P2_sic], &tokens[i-2])
_fill_from_token(&context[P1_sic], &tokens[i-1])
_fill_from_token(&context[W_sic], &tokens[i])
_fill_from_token(&context[N1_sic], &tokens[i+1])
_fill_from_token(&context[N2_sic], &tokens[i+2])
cdef class Slots: cdef inline void _fill_from_token(atom_t[N_FIELDS] context, const TokenC* t) nogil:
def __init__(self): context[0] = t.lex.sic
self.P4 = Token() context[1] = t.lex.cluster
self.P3 = Token() context[2] = t.lex.shape
self.P2 = Token() context[3] = t.lex.prefix
self.P1 = Token() context[4] = t.lex.suffix
self.N0 = Token() context[5] = t.pos
self.N1 = Token() context[6] = t.sense
self.N2 = Token()
self.N3 = Token()
self.N4 = Token()
cdef void _number_token(Token t, int* n_fields):
    """Assign consecutive context-array indices to the fields of one Token slot.

    Numbering starts at the value currently stored in n_fields[0]. On return,
    n_fields[0] holds the next unused index, so successive calls lay their
    slots out back to back without overlapping.
    """
    cdef int i = n_fields[0]
    # Lexical attributes.
    t.sic = i; i += 1
    t.cluster = i; i += 1
    t.norm = i; i += 1
    t.shape = i; i += 1
    # asciied is declared on Token and needs an index of its own. If it is
    # skipped, the attribute keeps its zero default and aliases index 0.
    t.asciied = i; i += 1
    t.prefix = i; i += 1
    t.suffix = i; i += 1
    t.length = i; i += 1
    # Coarse type attributes.
    t.postype = i; i += 1
    t.nertype = i; i += 1
    t.sensetype = i; i += 1
    # Orthographic flags.
    t.is_alpha = i; i += 1
    t.is_ascii = i; i += 1
    t.is_digit = i; i += 1
    t.is_lower = i; i += 1
    t.is_punct = i; i += 1
    t.is_space = i; i += 1
    t.is_title = i; i += 1
    t.is_upper = i; i += 1
    t.like_number = i; i += 1
    t.like_url = i; i += 1
    # Casing-frequency flags.
    t.oft_lower = i; i += 1
    t.oft_title = i; i += 1
    t.oft_upper = i; i += 1
    # List-membership flags.
    t.in_males = i; i += 1
    t.in_females = i; i += 1
    t.in_surnames = i; i += 1
    t.in_places = i; i += 1
    t.in_games = i; i += 1
    t.in_celebs = i; i += 1
    t.in_names = i; i += 1
    # Per-token annotations.
    t.pos = i; i += 1
    t.sense = i; i += 1
    t.ner = i; i += 1
    # Hand the running counter back to the caller.
    n_fields[0] = i
cdef int _fill_token(atom_t* c, Token t, Lexeme* lex, atom_t pos, atom_t ner):
    """Write the features of one token into the context array.

    c   -- context array being filled; written at the indices stored on t.
    t   -- slot descriptor whose attributes hold the target indices.
    lex -- lexeme supplying the lexical attributes and the flag bits.
    pos -- value stored at t.pos.
    ner -- value stored at t.ner.

    The nertype, sensetype and sense fields are always set to 0.
    No explicit value is returned.
    """
    # Lexical attributes. The statement order is kept as in the original.
    c[t.sic] = lex.sic
    c[t.cluster] = lex.cluster
    # Use norm only when prob is non-zero and at least -10; otherwise use shape.
    c[t.norm] = lex.norm if (lex.prob != 0 and lex.prob >= -10) else lex.shape
    c[t.shape] = lex.shape
    c[t.asciied] = lex.asciied
    c[t.prefix] = lex.prefix
    c[t.suffix] = lex.suffix
    c[t.length] = lex.length

    # Coarse type attributes; only postype comes from the lexeme.
    c[t.postype] = lex.postype
    c[t.nertype] = 0
    c[t.sensetype] = 0

    # Flag features: each stores the masked bit itself, not a normalised 0/1.
    # NOTE(review): nothing is written for t.is_ascii in this function, so
    # that field keeps whatever the caller's buffer already held. Confirm
    # whether that is intended.
    c[t.is_alpha] = lex.flags & (1 << IS_ALPHA)
    c[t.is_digit] = lex.flags & (1 << IS_DIGIT)
    c[t.is_lower] = lex.flags & (1 << IS_LOWER)
    c[t.is_punct] = lex.flags & (1 << IS_PUNCT)
    c[t.is_space] = lex.flags & (1 << IS_SPACE)
    c[t.is_title] = lex.flags & (1 << IS_TITLE)
    c[t.is_upper] = lex.flags & (1 << IS_UPPER)
    c[t.like_url] = lex.flags & (1 << LIKE_URL)
    c[t.like_number] = lex.flags & (1 << LIKE_NUMBER)
    c[t.oft_lower] = lex.flags & (1 << OFT_LOWER)
    c[t.oft_title] = lex.flags & (1 << OFT_TITLE)
    c[t.oft_upper] = lex.flags & (1 << OFT_UPPER)
    c[t.in_males] = lex.flags & (1 << IN_MALES)
    c[t.in_females] = lex.flags & (1 << IN_FEMALES)
    c[t.in_surnames] = lex.flags & (1 << IN_SURNAMES)
    c[t.in_places] = lex.flags & (1 << IN_PLACES)
    c[t.in_games] = lex.flags & (1 << IN_GAMES)
    c[t.in_celebs] = lex.flags & (1 << IN_CELEBS)
    c[t.in_names] = lex.flags & (1 << IN_NAMES)

    # Per-token annotations supplied by the caller.
    c[t.pos] = pos
    c[t.sense] = 0
    c[t.ner] = ner
cdef int fill_context(atom_t* context, int i, Tokens tokens) except -1:
    """Fill context with the features of the nine tokens from i-4 to i+4.

    Each window position is written through its own slot of FIELD_IDS, in
    order from P4 (i-4) through N0 (i) to N4 (i+4). Always returns 1.

    NOTE(review): the offsets are applied without any bounds check, so this
    presumably relies on the token arrays being padded at both ends. Confirm
    against the Tokens implementation.
    """
    # Preceding tokens, farthest first.
    _fill_token(
        context, FIELD_IDS.P4,
        tokens.lex[i-4], tokens.pos[i-4], tokens.ner[i-4])
    _fill_token(
        context, FIELD_IDS.P3,
        tokens.lex[i-3], tokens.pos[i-3], tokens.ner[i-3])
    _fill_token(
        context, FIELD_IDS.P2,
        tokens.lex[i-2], tokens.pos[i-2], tokens.ner[i-2])
    _fill_token(
        context, FIELD_IDS.P1,
        tokens.lex[i-1], tokens.pos[i-1], tokens.ner[i-1])
    # The token at position i.
    _fill_token(
        context, FIELD_IDS.N0,
        tokens.lex[i], tokens.pos[i], tokens.ner[i])
    # Following tokens, nearest first.
    _fill_token(
        context, FIELD_IDS.N1,
        tokens.lex[i+1], tokens.pos[i+1], tokens.ner[i+1])
    _fill_token(
        context, FIELD_IDS.N2,
        tokens.lex[i+2], tokens.pos[i+2], tokens.ner[i+2])
    _fill_token(
        context, FIELD_IDS.N3,
        tokens.lex[i+3], tokens.pos[i+3], tokens.ner[i+3])
    _fill_token(
        context, FIELD_IDS.N4,
        tokens.lex[i+4], tokens.pos[i+4], tokens.ner[i+4])
    return 1
# Module initialisation: build the slot table and number every field.
# N_FIELDS starts at zero and is advanced by each _number_token call, so after
# the last call it holds the total number of context fields across all slots.
N_FIELDS = 0
FIELD_IDS = Slots()
# The call order fixes the layout: P4's fields get the lowest indices and
# N4's the highest. Reordering these calls changes every index.
_number_token(FIELD_IDS.P4, &N_FIELDS)
_number_token(FIELD_IDS.P3, &N_FIELDS)
_number_token(FIELD_IDS.P2, &N_FIELDS)
_number_token(FIELD_IDS.P1, &N_FIELDS)
_number_token(FIELD_IDS.N0, &N_FIELDS)
_number_token(FIELD_IDS.N1, &N_FIELDS)
_number_token(FIELD_IDS.N2, &N_FIELDS)
_number_token(FIELD_IDS.N3, &N_FIELDS)
_number_token(FIELD_IDS.N4, &N_FIELDS)