* Add context files

This commit is contained in:
Matthew Honnibal 2014-11-12 23:22:36 +11:00
parent 9b13392ac7
commit 7e0a9077dd
2 changed files with 229 additions and 0 deletions

153
spacy/ner/context.pxd Normal file
View File

@@ -0,0 +1,153 @@
from thinc.typedefs cimport atom_t
from ..typedefs cimport hash_t
from ..tokens cimport Tokens
from ..lexeme cimport Lexeme
from .structs cimport State
cpdef enum:
T_sic
T_cluster
T_norm
T_shape
T_asciied
T_prefix
T_suffix
T_length
T_postype
T_nertype
T_sensetype
T_is_alpha
T_is_ascii
T_is_digit
T_is_lower
T_is_punct
T_is_space
T_is_title
T_is_upper
T_like_url
T_like_number
T_oft_lower
T_oft_title
T_oft_upper
T_in_males
T_in_females
T_in_surnames
T_in_places
T_in_celebs
T_in_names
T_pos
T_sense
T_ner
cpdef enum:
P2_sic
P2_cluster
P2_norm
P2_shape
P2_prefix
P2_suffix
P2_length
P2_postype
P2_is_alpha
P2_is_digit
P2_is_lower
P2_is_punct
P2_is_title
P2_is_upper
P2_like_number
P2_pos
P1_sic
P1_cluster
P1_norm
P1_shape
P1_prefix
P1_suffix
P1_length
P1_postype
P1_is_alpha
P1_is_digit
P1_is_lower
P1_is_punct
P1_is_title
P1_is_upper
P1_like_number
P1_pos
W_sic
W_cluster
W_norm
W_shape
W_prefix
W_suffix
W_length
W_postype
W_is_alpha
W_is_digit
W_is_lower
W_is_punct
W_is_space
W_is_title
W_is_upper
W_like_number
W_pos
N1_sic
N1_cluster
N1_norm
N1_shape
N1_prefix
N1_suffix
N1_length
N1_postype
N1_is_alpha
N1_is_ascii
N1_is_digit
N1_is_lower
N1_is_punct
N1_is_space
N1_is_title
N1_is_upper
N1_like_number
N1_pos
N2_sic
N2_cluster
N2_norm
N2_shape
N2_asciied
N2_prefix
N2_suffix
N2_length
N2_postype
N2_is_alpha
N2_is_digit
N2_is_lower
N2_is_punct
N2_is_space
N2_is_title
N2_is_upper
N2_like_number
N2_pos
N2_sense
E0_sic
E0_cluster
E0_pos
E1_sic
E1_cluster
E1_pos
E_last_sic
E_last_cluster
E_last_pos
N_FIELDS
cdef int fill_context(atom_t* context, State* s, Tokens tokens) except -1

76
spacy/ner/context.pyx Normal file
View File

@@ -0,0 +1,76 @@
from libc.string cimport memset
from murmurhash.mrmr cimport hash64
from ._state cimport entity_is_open
from ..lexeme cimport *
cdef int _fill_token(atom_t* c, Lexeme* lex, atom_t pos):
c[T_sic] = lex.sic
c[T_cluster] = lex.cluster
c[T_norm] = lex.norm if (lex.prob != 0 and lex.prob >= -10) else lex.shape
c[T_shape] = lex.shape
c[T_asciied] = lex.asciied
c[T_prefix] = lex.prefix
c[T_suffix] = lex.suffix
c[T_length] = lex.length
c[T_postype] = lex.postype
c[T_nertype] = 0
c[T_sensetype] = 0
c[T_is_alpha] = lex.flags & (1 << IS_ALPHA)
c[T_is_digit] = lex.flags & (1 << IS_DIGIT)
c[T_is_lower] = lex.flags & (1 << IS_LOWER)
c[T_is_punct] = lex.flags & (1 << IS_PUNCT)
c[T_is_space] = lex.flags & (1 << IS_SPACE)
c[T_is_title] = lex.flags & (1 << IS_TITLE)
c[T_is_upper] = lex.flags & (1 << IS_UPPER)
c[T_like_url] = lex.flags & (1 << LIKE_URL)
c[T_like_number] = lex.flags & (1 << LIKE_NUMBER)
c[T_oft_lower] = lex.flags & (1 << OFT_LOWER)
c[T_oft_title] = lex.flags & (1 << OFT_TITLE)
c[T_oft_upper] = lex.flags & (1 << OFT_UPPER)
c[T_in_males] = lex.flags & (1 << IN_MALES)
c[T_in_females] = lex.flags & (1 << IN_FEMALES)
c[T_in_surnames] = lex.flags & (1 << IN_SURNAMES)
c[T_in_places] = lex.flags & (1 << IN_PLACES)
c[T_in_celebs] = lex.flags & (1 << IN_CELEBS)
c[T_in_names] = lex.flags & (1 << IN_NAMES)
c[T_pos] = pos
c[T_sense] = 0
cdef int _fill_outer_token(atom_t* c, Lexeme* lex, atom_t pos):
c[0] = lex.sic
c[1] = lex.cluster
c[2] = lex.shape
c[3] = pos
cdef int fill_context(atom_t* context, State* s, Tokens tokens) except -1:
cdef int i
for i in range(N_FIELDS):
context[i] = 0
i = s.i
_fill_token(&context[P2_sic], tokens.lex[i-2], tokens.pos[i-2])
_fill_token(&context[P1_sic], tokens.lex[i-1], tokens.pos[i-1])
_fill_token(&context[W_sic], tokens.lex[i], tokens.pos[i])
_fill_token(&context[N1_sic], tokens.lex[i+1], tokens.pos[i+1])
_fill_token(&context[N2_sic], tokens.lex[i+2], tokens.pos[i+2])
cdef atom_t[5] ent_vals
if entity_is_open(s):
context[E0_sic] = tokens.lex[s.curr.start].sic
context[E0_cluster] = tokens.lex[s.curr.start].cluster
context[E0_pos] = tokens.pos[s.curr.start]
context[E_last_sic] = tokens.lex[s.i-1].sic
context[E_last_cluster] = tokens.lex[s.i-1].cluster
context[E_last_pos] = tokens.pos[s.i-1]
if (s.curr.start + 1) < s.i:
context[E1_sic] = tokens.lex[s.curr.start+1].sic
context[E1_cluster] = tokens.lex[s.curr.start+1].cluster
context[E1_pos] = tokens.pos[s.curr.start+1]
return 1