From 7e0a9077ddfd8751ea23dbec1f44eb89d3f395c8 Mon Sep 17 00:00:00 2001
From: Matthew Honnibal
Date: Wed, 12 Nov 2014 23:22:36 +1100
Subject: [PATCH] * Add context files

---
 spacy/ner/context.pxd | 240 ++++++++++++++++++++++++++++++++++++++++++
 spacy/ner/context.pyx |  87 +++++++++++++++
 2 files changed, 327 insertions(+)
 create mode 100644 spacy/ner/context.pxd
 create mode 100644 spacy/ner/context.pyx

diff --git a/spacy/ner/context.pxd b/spacy/ner/context.pxd
new file mode 100644
index 000000000..f9280c516
--- /dev/null
+++ b/spacy/ner/context.pxd
@@ -0,0 +1,240 @@
+from thinc.typedefs cimport atom_t
+from ..typedefs cimport hash_t
+from ..tokens cimport Tokens
+from ..lexeme cimport Lexeme
+from .structs cimport State
+
+
+# Offsets of the per-token fields, relative to the first field of a token
+# slot. _fill_token() in context.pyx writes a slot through these offsets, so
+# every token slot in the context enum below has to list the same fields in
+# the same order.
+cpdef enum:
+    T_sic
+    T_cluster
+    T_norm
+    T_shape
+    T_asciied
+    T_prefix
+    T_suffix
+    T_length
+    T_postype
+    T_nertype
+    T_sensetype
+    T_is_alpha
+    T_is_ascii
+    T_is_digit
+    T_is_lower
+    T_is_punct
+    T_is_space
+    T_is_title
+    T_is_upper
+    T_like_url
+    T_like_number
+    T_oft_lower
+    T_oft_title
+    T_oft_upper
+    T_in_males
+    T_in_females
+    T_in_surnames
+    T_in_places
+    T_in_celebs
+    T_in_names
+    T_pos
+    T_sense
+    T_ner
+
+
+# Absolute offsets into the context array. P2/P1 are the two tokens before
+# the current one, W is the current token and N1/N2 are the two after it;
+# each of these slots mirrors the T_* layout field for field. E0, E1 and
+# E_last hold sic, cluster and pos for tokens of the open entity.
+cpdef enum:
+    P2_sic
+    P2_cluster
+    P2_norm
+    P2_shape
+    P2_asciied
+    P2_prefix
+    P2_suffix
+    P2_length
+    P2_postype
+    P2_nertype
+    P2_sensetype
+    P2_is_alpha
+    P2_is_ascii
+    P2_is_digit
+    P2_is_lower
+    P2_is_punct
+    P2_is_space
+    P2_is_title
+    P2_is_upper
+    P2_like_url
+    P2_like_number
+    P2_oft_lower
+    P2_oft_title
+    P2_oft_upper
+    P2_in_males
+    P2_in_females
+    P2_in_surnames
+    P2_in_places
+    P2_in_celebs
+    P2_in_names
+    P2_pos
+    P2_sense
+    P2_ner
+
+    P1_sic
+    P1_cluster
+    P1_norm
+    P1_shape
+    P1_asciied
+    P1_prefix
+    P1_suffix
+    P1_length
+    P1_postype
+    P1_nertype
+    P1_sensetype
+    P1_is_alpha
+    P1_is_ascii
+    P1_is_digit
+    P1_is_lower
+    P1_is_punct
+    P1_is_space
+    P1_is_title
+    P1_is_upper
+    P1_like_url
+    P1_like_number
+    P1_oft_lower
+    P1_oft_title
+    P1_oft_upper
+    P1_in_males
+    P1_in_females
+    P1_in_surnames
+    P1_in_places
+    P1_in_celebs
+    P1_in_names
+    P1_pos
+    P1_sense
+    P1_ner
+
+    W_sic
+    W_cluster
+    W_norm
+    W_shape
+    W_asciied
+    W_prefix
+    W_suffix
+    W_length
+    W_postype
+    W_nertype
+    W_sensetype
+    W_is_alpha
+    W_is_ascii
+    W_is_digit
+    W_is_lower
+    W_is_punct
+    W_is_space
+    W_is_title
+    W_is_upper
+    W_like_url
+    W_like_number
+    W_oft_lower
+    W_oft_title
+    W_oft_upper
+    W_in_males
+    W_in_females
+    W_in_surnames
+    W_in_places
+    W_in_celebs
+    W_in_names
+    W_pos
+    W_sense
+    W_ner
+
+    N1_sic
+    N1_cluster
+    N1_norm
+    N1_shape
+    N1_asciied
+    N1_prefix
+    N1_suffix
+    N1_length
+    N1_postype
+    N1_nertype
+    N1_sensetype
+    N1_is_alpha
+    N1_is_ascii
+    N1_is_digit
+    N1_is_lower
+    N1_is_punct
+    N1_is_space
+    N1_is_title
+    N1_is_upper
+    N1_like_url
+    N1_like_number
+    N1_oft_lower
+    N1_oft_title
+    N1_oft_upper
+    N1_in_males
+    N1_in_females
+    N1_in_surnames
+    N1_in_places
+    N1_in_celebs
+    N1_in_names
+    N1_pos
+    N1_sense
+    N1_ner
+
+    N2_sic
+    N2_cluster
+    N2_norm
+    N2_shape
+    N2_asciied
+    N2_prefix
+    N2_suffix
+    N2_length
+    N2_postype
+    N2_nertype
+    N2_sensetype
+    N2_is_alpha
+    N2_is_ascii
+    N2_is_digit
+    N2_is_lower
+    N2_is_punct
+    N2_is_space
+    N2_is_title
+    N2_is_upper
+    N2_like_url
+    N2_like_number
+    N2_oft_lower
+    N2_oft_title
+    N2_oft_upper
+    N2_in_males
+    N2_in_females
+    N2_in_surnames
+    N2_in_places
+    N2_in_celebs
+    N2_in_names
+    N2_pos
+    N2_sense
+    N2_ner
+
+    E0_sic
+    E0_cluster
+    E0_pos
+
+    E1_sic
+    E1_cluster
+    E1_pos
+
+    E_last_sic
+    E_last_cluster
+    E_last_pos
+
+    N_FIELDS
+
+
+cdef int fill_context(atom_t* context, State* s, Tokens tokens) except -1
+
+
diff --git a/spacy/ner/context.pyx b/spacy/ner/context.pyx
new file mode 100644
index 000000000..c062bb098
--- /dev/null
+++ b/spacy/ner/context.pyx
@@ -0,0 +1,87 @@
+from libc.string cimport memset
+
+from murmurhash.mrmr cimport hash64
+from ._state cimport entity_is_open
+from ..lexeme cimport *
+
+
+# Write the features of one token into the slot that starts at c. Fields are
+# addressed with the T_* offsets, so c must point at the first field of a slot
+# that follows the T_* layout. T_is_ascii and T_ner are not written here.
+cdef int _fill_token(atom_t* c, Lexeme* lex, atom_t pos):
+    c[T_sic] = lex.sic
+    c[T_cluster] = lex.cluster
+    # Fall back to the shape unless prob is non-zero and at least -10.
+    c[T_norm] = lex.norm if (lex.prob != 0 and lex.prob >= -10) else lex.shape
+    c[T_shape] = lex.shape
+    c[T_asciied] = lex.asciied
+    c[T_prefix] = lex.prefix
+    c[T_suffix] = lex.suffix
+    c[T_length] = lex.length
+
+    c[T_postype] = lex.postype
+    c[T_nertype] = 0
+    c[T_sensetype] = 0
+
+    # Each flag feature is the masked bit itself: 0 when unset, otherwise
+    # (1 << FLAG).
+    c[T_is_alpha] = lex.flags & (1 << IS_ALPHA)
+    c[T_is_digit] = lex.flags & (1 << IS_DIGIT)
+    c[T_is_lower] = lex.flags & (1 << IS_LOWER)
+    c[T_is_punct] = lex.flags & (1 << IS_PUNCT)
+    c[T_is_space] = lex.flags & (1 << IS_SPACE)
+    c[T_is_title] = lex.flags & (1 << IS_TITLE)
+    c[T_is_upper] = lex.flags & (1 << IS_UPPER)
+    c[T_like_url] = lex.flags & (1 << LIKE_URL)
+    c[T_like_number] = lex.flags & (1 << LIKE_NUMBER)
+    c[T_oft_lower] = lex.flags & (1 << OFT_LOWER)
+    c[T_oft_title] = lex.flags & (1 << OFT_TITLE)
+    c[T_oft_upper] = lex.flags & (1 << OFT_UPPER)
+
+    c[T_in_males] = lex.flags & (1 << IN_MALES)
+    c[T_in_females] = lex.flags & (1 << IN_FEMALES)
+    c[T_in_surnames] = lex.flags & (1 << IN_SURNAMES)
+    c[T_in_places] = lex.flags & (1 << IN_PLACES)
+    c[T_in_celebs] = lex.flags & (1 << IN_CELEBS)
+    c[T_in_names] = lex.flags & (1 << IN_NAMES)
+
+    c[T_pos] = pos
+    c[T_sense] = 0
+
+
+# Write sic, cluster, shape and pos into the four entries starting at c.
+# Not called anywhere in this file. The E0/E1/E_last slots hold three fields
+# (sic, cluster, pos), so this helper does not match their layout.
+cdef int _fill_outer_token(atom_t* c, Lexeme* lex, atom_t pos):
+    c[0] = lex.sic
+    c[1] = lex.cluster
+    c[2] = lex.shape
+    c[3] = pos
+
+
+# Fill context for parser state s. Every entry is zeroed first; the five token
+# slots are then filled from the tokens at s.i-2 .. s.i+2, and the entity
+# slots are filled only while an entity is open.
+# NOTE(review): tokens.lex and tokens.pos are indexed at s.i-2 and s.i+2
+# without a bounds check -- presumably Tokens is padded at both ends; confirm.
+cdef int fill_context(atom_t* context, State* s, Tokens tokens) except -1:
+    cdef int i = s.i
+    memset(context, 0, N_FIELDS * sizeof(atom_t))
+    _fill_token(&context[P2_sic], tokens.lex[i-2], tokens.pos[i-2])
+    _fill_token(&context[P1_sic], tokens.lex[i-1], tokens.pos[i-1])
+    _fill_token(&context[W_sic], tokens.lex[i], tokens.pos[i])
+    _fill_token(&context[N1_sic], tokens.lex[i+1], tokens.pos[i+1])
+    _fill_token(&context[N2_sic], tokens.lex[i+2], tokens.pos[i+2])
+
+    if entity_is_open(s):
+        context[E0_sic] = tokens.lex[s.curr.start].sic
+        context[E0_cluster] = tokens.lex[s.curr.start].cluster
+        context[E0_pos] = tokens.pos[s.curr.start]
+        context[E_last_sic] = tokens.lex[s.i-1].sic
+        context[E_last_cluster] = tokens.lex[s.i-1].cluster
+        context[E_last_pos] = tokens.pos[s.i-1]
+        if (s.curr.start + 1) < s.i:
+            context[E1_sic] = tokens.lex[s.curr.start+1].sic
+            context[E1_cluster] = tokens.lex[s.curr.start+1].cluster
+            context[E1_pos] = tokens.pos[s.curr.start+1]
+    return 1