Review notes (free text before the first "diff --git" header is skipped by
patch tools):

* context.pyx, _number_token: also assign a slot index to t.asciied. The field
  is declared on Token and _flatten_token writes context[ids.asciied], but no
  index was ever assigned to it.
* context.pyx, fill_token: also populate t.is_ascii, which is numbered and
  flattened but was never filled. NOTE(review): assumes an IS_ASCII flag is
  defined alongside IS_ALPHA and the other flags -- confirm.
* context.pyx, _flatten_token: also copy is_space, which is numbered and
  filled but was never written into the context array.
* The context.pyx hunk header changes from +13,126 to +13,129 for the three
  added lines; the "index" blob id for context.pyx predates them.
* NOTE(review): blank context lines were re-inferred from the hunk line
  counts and should be checked against the target tree before applying.

diff --git a/spacy/context.pxd b/spacy/context.pxd
index 47aedb3a4..5e36aaa8b 100644
--- a/spacy/context.pxd
+++ b/spacy/context.pxd
@@ -5,24 +5,43 @@ from .lexeme cimport Lexeme
 
 
 cdef class Token:
-    cdef readonly atom_t i
-    cdef readonly atom_t c
-    cdef readonly atom_t w
+    cdef readonly atom_t sic
+    cdef readonly atom_t cluster
+    cdef readonly atom_t norm
     cdef readonly atom_t shape
-    cdef readonly atom_t pref
-    cdef readonly atom_t suff
-    cdef readonly atom_t oft_title
-    cdef readonly atom_t oft_upper
-    cdef readonly atom_t is_alpha
-    cdef readonly atom_t is_digit
-    cdef readonly atom_t is_title
-    cdef readonly atom_t is_upper
-
-    cdef readonly atom_t url
-    cdef readonly atom_t num
+    cdef readonly atom_t asciied
+    cdef readonly atom_t prefix
+    cdef readonly atom_t suffix
+    cdef readonly atom_t length
 
     cdef readonly atom_t postype
+    cdef readonly atom_t nertype
+    cdef readonly atom_t sensetype
+
+    cdef readonly atom_t is_alpha
+    cdef readonly atom_t is_ascii
+    cdef readonly atom_t is_digit
+    cdef readonly atom_t is_lower
+    cdef readonly atom_t is_punct
+    cdef readonly atom_t is_space
+    cdef readonly atom_t is_title
+    cdef readonly atom_t is_upper
+    cdef readonly atom_t like_url
+    cdef readonly atom_t like_number
+    cdef readonly atom_t oft_lower
+    cdef readonly atom_t oft_title
+    cdef readonly atom_t oft_upper
+
+    cdef readonly atom_t in_males
+    cdef readonly atom_t in_females
+    cdef readonly atom_t in_surnames
+    cdef readonly atom_t in_places
+    cdef readonly atom_t in_games
+    cdef readonly atom_t in_celebs
+    cdef readonly atom_t in_names
+
     cdef readonly atom_t pos
+    cdef readonly atom_t sense
     cdef readonly atom_t ner
 
 
diff --git a/spacy/context.pyx b/spacy/context.pyx
index d715f2e5e..5413039cc 100644
--- a/spacy/context.pyx
+++ b/spacy/context.pyx
@@ -13,67 +13,129 @@ cdef class Slots:
 
 cdef void _number_token(Token t, int* n_fields):
     cdef int i = n_fields[0]
-    t.i = i; i += 1
-    t.c = i; i += 1
-    t.w = i; i += 1
+    t.sic = i; i += 1
+    t.cluster = i; i += 1
+    t.norm = i; i += 1
     t.shape = i; i += 1
-    t.pref = i; i += 1
-    t.suff = i; i += 1
-    t.oft_title = i; i += 1
-    t.oft_upper = i; i += 1
+    t.asciied = i; i += 1
+    t.prefix = i; i += 1
+    t.suffix = i; i += 1
+    t.length = i; i += 1
+
+    t.postype = i; i += 1
+    t.nertype = i; i += 1
+    t.sensetype = i; i += 1
+
     t.is_alpha = i; i += 1
+    t.is_ascii = i; i += 1
     t.is_digit = i; i += 1
+    t.is_lower = i; i += 1
+    t.is_punct = i; i += 1
+    t.is_space = i; i += 1
     t.is_title = i; i += 1
     t.is_upper = i; i += 1
 
-    t.url = i; i += 1
-    t.num = i; i += 1
+    t.like_number = i; i += 1
+    t.like_url = i; i += 1
+
+    t.oft_lower = i; i += 1
+    t.oft_title = i; i += 1
+    t.oft_upper = i; i += 1
+
+    t.in_males = i; i += 1
+    t.in_females = i; i += 1
+    t.in_surnames = i; i += 1
+    t.in_places = i; i += 1
+    t.in_games = i; i += 1
+    t.in_celebs = i; i += 1
+    t.in_names = i; i += 1
 
-    t.postype = i; i += 1
     t.pos = i; i += 1
+    t.sense = i; i += 1
     t.ner = i; i += 1
 
     n_fields[0] = i
 
 
 cdef int fill_token(Token t, Lexeme* lex, atom_t pos, atom_t ner):
-    t.i = lex.sic
-    t.c = lex.cluster
-    t.w = lex.norm if (lex.prob != 0 and lex.prob >= -10) else lex.shape
+    t.sic = lex.sic
+    t.cluster = lex.cluster
+    t.norm = lex.norm if (lex.prob != 0 and lex.prob >= -10) else lex.shape
     t.shape = lex.shape
-    t.pref = lex.prefix
-    t.suff = lex.suffix
+    t.asciied = lex.asciied
+    t.prefix = lex.prefix
+    t.suffix = lex.suffix
+    t.length = lex.length
 
-    t.oft_title = lex.flags & (1 << OFT_TITLE)
-    t.oft_upper = lex.flags & (1 << OFT_UPPER)
+    t.postype = lex.postype
+    t.nertype = 0
+    t.sensetype = 0
+
     t.is_alpha = lex.flags & (1 << IS_ALPHA)
+    t.is_ascii = lex.flags & (1 << IS_ASCII)
     t.is_digit = lex.flags & (1 << IS_DIGIT)
+    t.is_lower = lex.flags & (1 << IS_LOWER)
+    t.is_punct = lex.flags & (1 << IS_PUNCT)
+    t.is_space = lex.flags & (1 << IS_SPACE)
     t.is_title = lex.flags & (1 << IS_TITLE)
     t.is_upper = lex.flags & (1 << IS_UPPER)
-    t.url = lex.flags & (1 << LIKE_URL)
-    t.num = lex.flags & (1 << LIKE_NUMBER)
-    t.postype = lex.postype
+    t.like_url = lex.flags & (1 << LIKE_URL)
+    t.like_number = lex.flags & (1 << LIKE_NUMBER)
+    t.oft_lower = lex.flags & (1 << OFT_LOWER)
+    t.oft_title = lex.flags & (1 << OFT_TITLE)
+    t.oft_upper = lex.flags & (1 << OFT_UPPER)
+
+    t.in_males = lex.flags & (1 << IN_MALES)
+    t.in_females = lex.flags & (1 << IN_FEMALES)
+    t.in_surnames = lex.flags & (1 << IN_SURNAMES)
+    t.in_places = lex.flags & (1 << IN_PLACES)
+    t.in_games = lex.flags & (1 << IN_GAMES)
+    t.in_celebs = lex.flags & (1 << IN_CELEBS)
+    t.in_names = lex.flags & (1 << IN_NAMES)
+
     t.pos = pos
+    t.sense = 0
     t.ner = ner
 
 
 cdef int _flatten_token(atom_t* context, Token ids, Token vals) except -1:
-    context[ids.i] = vals.i
-    context[ids.c] = vals.c
-    context[ids.w] = vals.w
+    context[ids.sic] = vals.sic
+    context[ids.cluster] = vals.cluster
+    context[ids.norm] = vals.norm
     context[ids.shape] = vals.shape
-    context[ids.pref] = vals.pref
-    context[ids.suff] = vals.suff
-    context[ids.oft_title] = vals.oft_title
-    context[ids.oft_upper] = vals.oft_upper
+    context[ids.asciied] = vals.asciied
+    context[ids.prefix] = vals.prefix
+    context[ids.suffix] = vals.suffix
+    context[ids.length] = vals.length
+
+    context[ids.postype] = vals.postype
+    context[ids.nertype] = vals.nertype
+    context[ids.sensetype] = vals.sensetype
+
     context[ids.is_alpha] = vals.is_alpha
+    context[ids.is_ascii] = vals.is_ascii
     context[ids.is_digit] = vals.is_digit
+    context[ids.is_lower] = vals.is_lower
+    context[ids.is_punct] = vals.is_punct
+    context[ids.is_space] = vals.is_space
     context[ids.is_title] = vals.is_title
     context[ids.is_upper] = vals.is_upper
-    context[ids.url] = vals.url
-    context[ids.num] = vals.num
-    context[ids.postype] = vals.postype
+    context[ids.like_url] = vals.like_url
+    context[ids.like_number] = vals.like_number
+    context[ids.oft_lower] = vals.oft_lower
+    context[ids.oft_title] = vals.oft_title
+    context[ids.oft_upper] = vals.oft_upper
+
+    context[ids.in_males] = vals.in_males
+    context[ids.in_females] = vals.in_females
+    context[ids.in_surnames] = vals.in_surnames
+    context[ids.in_places] = vals.in_places
+    context[ids.in_games] = vals.in_games
+    context[ids.in_celebs] = vals.in_celebs
+    context[ids.in_names] = vals.in_names
+
     context[ids.pos] = vals.pos
+    context[ids.sense] = vals.sense
     context[ids.ner] = vals.ner
 
 
diff --git a/spacy/lexeme.pxd b/spacy/lexeme.pxd
index 1d369eef6..0d7d206e5 100644
--- a/spacy/lexeme.pxd
+++ b/spacy/lexeme.pxd
@@ -51,8 +51,6 @@ cdef struct Lexeme:
     tag_t supersense
 
 
-
-
 cdef Lexeme EMPTY_LEXEME
 
 cpdef Lexeme init(id_t i, unicode string, hash_t hashed,
diff --git a/spacy/pos_feats.pyx b/spacy/pos_feats.pyx
index 600d75d3d..e8a2699d4 100644
--- a/spacy/pos_feats.pyx
+++ b/spacy/pos_feats.pyx
@@ -9,33 +9,33 @@ cpdef Token N2 = FIELD_IDS.N2
 
 
 TEMPLATES = (
-    (N0.i,),
-    (N0.w,),
-    (N0.suff,),
-    (N0.pref,),
+    (N0.sic,),
+    (N0.norm,),
+    (N0.suffix,),
+    (N0.prefix,),
     (P1.pos,),
     (P2.pos,),
     (P1.pos, P2.pos),
-    (P1.pos, N0.w),
-    (P1.w,),
-    (P1.suff,),
-    (P2.w,),
-    (N1.w,),
-    (N1.suff,),
-    (N2.w,),
+    (P1.pos, N0.norm),
+    (P1.norm,),
+    (P1.suffix,),
+    (P2.norm,),
+    (N1.norm,),
+    (N1.suffix,),
+    (N2.norm,),
 
     (N0.shape,),
-    (N0.c,),
-    (N1.c,),
-    (N2.c,),
-    (P1.c,),
-    (P2.c,),
+    (N0.cluster,),
+    (N1.cluster,),
+    (N2.cluster,),
+    (P1.cluster,),
+    (P2.cluster,),
     (N0.oft_upper,),
     (N0.oft_title,),
     (N0.postype,),
-    (P1.url,),
-    (N1.num,),
-    (N1.url,),
+    (P1.like_url,),
+    (N1.like_number,),
+    (N1.like_url,),
 )
 
 