* Revise context, focussing on POS tagging for now

This commit is contained in:
Matthew Honnibal 2014-12-07 15:28:22 +11:00
parent e27b912ef9
commit f5c4f2eb52
2 changed files with 54 additions and 181 deletions

View File

@ -1,66 +1,49 @@
from thinc.typedefs cimport atom_t
from .typedefs cimport hash_t
from .tokens cimport Tokens
from .lexeme cimport Lexeme
from .tokens cimport TokenC
cdef class Token:
cdef readonly atom_t sic
cdef readonly atom_t cluster
cdef readonly atom_t norm
cdef readonly atom_t shape
cdef readonly atom_t asciied
cdef readonly atom_t prefix
cdef readonly atom_t suffix
cdef readonly atom_t length
cpdef enum:
P2_sic
P2_cluster
P2_shape
P2_prefix
P2_suffix
P2_pos
P2_sense
cdef readonly atom_t postype
cdef readonly atom_t nertype
cdef readonly atom_t sensetype
P1_sic
P1_cluster
P1_shape
P1_prefix
P1_suffix
P1_pos
P1_sense
cdef readonly atom_t is_alpha
cdef readonly atom_t is_ascii
cdef readonly atom_t is_digit
cdef readonly atom_t is_lower
cdef readonly atom_t is_punct
cdef readonly atom_t is_space
cdef readonly atom_t is_title
cdef readonly atom_t is_upper
cdef readonly atom_t like_url
cdef readonly atom_t like_number
cdef readonly atom_t oft_lower
cdef readonly atom_t oft_title
cdef readonly atom_t oft_upper
W_sic
W_cluster
W_shape
W_prefix
W_suffix
W_pos
W_sense
cdef readonly atom_t in_males
cdef readonly atom_t in_females
cdef readonly atom_t in_surnames
cdef readonly atom_t in_places
cdef readonly atom_t in_games
cdef readonly atom_t in_celebs
cdef readonly atom_t in_names
N1_sic
N1_cluster
N1_shape
N1_prefix
N1_suffix
N1_pos
N1_sense
cdef readonly atom_t pos
cdef readonly atom_t sense
cdef readonly atom_t ner
N2_sic
N2_cluster
N2_shape
N2_prefix
N2_suffix
N2_pos
N2_sense
N_FIELDS
cdef class Slots:
cdef readonly Token P4
cdef readonly Token P3
cdef readonly Token P2
cdef readonly Token P1
cdef readonly Token N0
cdef readonly Token N1
cdef readonly Token N2
cdef readonly Token N3
cdef readonly Token N4
cdef int N_FIELDS
cdef int fill_context(atom_t* context, int i, Tokens tokens) except -1
cpdef Slots FIELD_IDS
cdef int fill_context(atom_t[N_FIELDS] context, const int i, TokenC* tokens) except -1

View File

@ -1,126 +1,16 @@
from murmurhash.mrmr cimport hash64
from .lexeme cimport *
cdef int fill_context(atom_t[N_FIELDS] context, const int i, TokenC* tokens) except -1:
_fill_from_token(&context[P2_sic], &tokens[i-2])
_fill_from_token(&context[P1_sic], &tokens[i-1])
_fill_from_token(&context[W_sic], &tokens[i])
_fill_from_token(&context[N1_sic], &tokens[i+1])
_fill_from_token(&context[N2_sic], &tokens[i+2])
cdef class Slots:
def __init__(self):
self.P4 = Token()
self.P3 = Token()
self.P2 = Token()
self.P1 = Token()
self.N0 = Token()
self.N1 = Token()
self.N2 = Token()
self.N3 = Token()
self.N4 = Token()
cdef void _number_token(Token t, int* n_fields):
cdef int i = n_fields[0]
t.sic = i; i += 1
t.cluster = i; i += 1
t.norm = i; i += 1
t.shape = i; i += 1
t.prefix = i; i += 1
t.suffix = i; i += 1
t.length = i; i += 1
t.postype = i; i += 1
t.nertype = i; i += 1
t.sensetype = i; i += 1
t.is_alpha = i; i += 1
t.is_ascii = i; i += 1
t.is_digit = i; i += 1
t.is_lower = i; i += 1
t.is_punct = i; i += 1
t.is_space = i; i += 1
t.is_title = i; i += 1
t.is_upper = i; i += 1
t.like_number = i; i += 1
t.like_url = i; i += 1
t.oft_lower = i; i += 1
t.oft_title = i; i += 1
t.oft_upper = i; i += 1
t.in_males = i; i += 1
t.in_females = i; i += 1
t.in_surnames = i; i += 1
t.in_places = i; i += 1
t.in_games = i; i += 1
t.in_celebs = i; i += 1
t.in_names = i; i += 1
t.pos = i; i += 1
t.sense = i; i += 1
t.ner = i; i += 1
n_fields[0] = i
cdef int _fill_token(atom_t* c, Token t, Lexeme* lex, atom_t pos, atom_t ner):
c[t.sic] = lex.sic
c[t.cluster] = lex.cluster
c[t.norm] = lex.norm if (lex.prob != 0 and lex.prob >= -10) else lex.shape
c[t.shape] = lex.shape
c[t.asciied] = lex.asciied
c[t.prefix] = lex.prefix
c[t.suffix] = lex.suffix
c[t.length] = lex.length
c[t.postype] = lex.postype
c[t.nertype] = 0
c[t.sensetype] = 0
c[t.is_alpha] = lex.flags & (1 << IS_ALPHA)
c[t.is_digit] = lex.flags & (1 << IS_DIGIT)
c[t.is_lower] = lex.flags & (1 << IS_LOWER)
c[t.is_punct] = lex.flags & (1 << IS_PUNCT)
c[t.is_space] = lex.flags & (1 << IS_SPACE)
c[t.is_title] = lex.flags & (1 << IS_TITLE)
c[t.is_upper] = lex.flags & (1 << IS_UPPER)
c[t.like_url] = lex.flags & (1 << LIKE_URL)
c[t.like_number] = lex.flags & (1 << LIKE_NUMBER)
c[t.oft_lower] = lex.flags & (1 << OFT_LOWER)
c[t.oft_title] = lex.flags & (1 << OFT_TITLE)
c[t.oft_upper] = lex.flags & (1 << OFT_UPPER)
c[t.in_males] = lex.flags & (1 << IN_MALES)
c[t.in_females] = lex.flags & (1 << IN_FEMALES)
c[t.in_surnames] = lex.flags & (1 << IN_SURNAMES)
c[t.in_places] = lex.flags & (1 << IN_PLACES)
c[t.in_games] = lex.flags & (1 << IN_GAMES)
c[t.in_celebs] = lex.flags & (1 << IN_CELEBS)
c[t.in_names] = lex.flags & (1 << IN_NAMES)
c[t.pos] = pos
c[t.sense] = 0
c[t.ner] = ner
cdef int fill_context(atom_t* context, int i, Tokens tokens) except -1:
_fill_token(context, FIELD_IDS.P4, tokens.lex[i-4], tokens.pos[i-4], tokens.ner[i-4])
_fill_token(context, FIELD_IDS.P3, tokens.lex[i-3], tokens.pos[i-3], tokens.ner[i-3])
_fill_token(context, FIELD_IDS.P2, tokens.lex[i-2], tokens.pos[i-2], tokens.ner[i-2])
_fill_token(context, FIELD_IDS.P1, tokens.lex[i-1], tokens.pos[i-1], tokens.ner[i-1])
_fill_token(context, FIELD_IDS.N0, tokens.lex[i], tokens.pos[i], tokens.ner[i])
_fill_token(context, FIELD_IDS.N1, tokens.lex[i+1], tokens.pos[i+1], tokens.ner[i+1])
_fill_token(context, FIELD_IDS.N2, tokens.lex[i+2], tokens.pos[i+2], tokens.ner[i+2])
_fill_token(context, FIELD_IDS.N3, tokens.lex[i+3], tokens.pos[i+3], tokens.ner[i+3])
_fill_token(context, FIELD_IDS.N4, tokens.lex[i+4], tokens.pos[i+4], tokens.ner[i+4])
return 1
N_FIELDS = 0
FIELD_IDS = Slots()
_number_token(FIELD_IDS.P4, &N_FIELDS)
_number_token(FIELD_IDS.P3, &N_FIELDS)
_number_token(FIELD_IDS.P2, &N_FIELDS)
_number_token(FIELD_IDS.P1, &N_FIELDS)
_number_token(FIELD_IDS.N0, &N_FIELDS)
_number_token(FIELD_IDS.N1, &N_FIELDS)
_number_token(FIELD_IDS.N2, &N_FIELDS)
_number_token(FIELD_IDS.N3, &N_FIELDS)
_number_token(FIELD_IDS.N4, &N_FIELDS)
cdef inline void _fill_from_token(atom_t[N_FIELDS] context, const TokenC* t) nogil:
context[0] = t.lex.sic
context[1] = t.lex.cluster
context[2] = t.lex.shape
context[3] = t.lex.prefix
context[4] = t.lex.suffix
context[5] = t.pos
context[6] = t.sense