spaCy/spacy/ner/context.pyx

from libc.string cimport memset

from murmurhash.mrmr cimport hash64
from ._state cimport entity_is_open
from ..lexeme cimport *


cdef int _fill_token(atom_t* c, Lexeme* lex, atom_t pos):
    # Populate one token's feature slots in `c` from the lexeme `lex`.
    #
    # `c` points at the first slot of the token's region inside the context
    # array; every store below is relative to that pointer via a T_* offset.
    # `pos` is copied verbatim into the T_pos slot.

    # String-derived attributes.
    c[T_sic] = lex.sic
    c[T_cluster] = lex.cluster
    # The norm is only used when prob is non-zero and at least -10;
    # otherwise the shape stands in for it.
    if lex.prob != 0 and lex.prob >= -10:
        c[T_norm] = lex.norm
    else:
        c[T_norm] = lex.shape
    c[T_shape] = lex.shape
    c[T_asciied] = lex.asciied
    c[T_prefix] = lex.prefix
    c[T_suffix] = lex.suffix
    c[T_length] = lex.length

    # Type slots: only postype comes from the lexeme, the other two are
    # always written as 0 here.
    c[T_postype] = lex.postype
    c[T_nertype] = 0
    c[T_sensetype] = 0

    # Flag features. Each slot receives the masked bit itself
    # (0 or 1 << FLAG), not a value normalised to 0/1.
    c[T_is_alpha] = lex.flags & (1 << IS_ALPHA)
    c[T_is_digit] = lex.flags & (1 << IS_DIGIT)
    c[T_is_lower] = lex.flags & (1 << IS_LOWER)
    c[T_is_punct] = lex.flags & (1 << IS_PUNCT)
    c[T_is_space] = lex.flags & (1 << IS_SPACE)
    c[T_is_title] = lex.flags & (1 << IS_TITLE)
    c[T_is_upper] = lex.flags & (1 << IS_UPPER)
    c[T_like_url] = lex.flags & (1 << LIKE_URL)
    c[T_like_number] = lex.flags & (1 << LIKE_NUMBER)
    c[T_oft_lower] = lex.flags & (1 << OFT_LOWER)
    c[T_oft_title] = lex.flags & (1 << OFT_TITLE)
    c[T_oft_upper] = lex.flags & (1 << OFT_UPPER)

    # IN_* flags, masked the same way as above.
    c[T_in_males] = lex.flags & (1 << IN_MALES)
    c[T_in_females] = lex.flags & (1 << IN_FEMALES)
    c[T_in_surnames] = lex.flags & (1 << IN_SURNAMES)
    c[T_in_places] = lex.flags & (1 << IN_PLACES)
    c[T_in_celebs] = lex.flags & (1 << IN_CELEBS)
    c[T_in_names] = lex.flags & (1 << IN_NAMES)

    # Caller-supplied tag; the sense slot is always written as 0 here.
    c[T_pos] = pos
    c[T_sense] = 0


cdef int _fill_outer_token(atom_t* c, Lexeme* lex, atom_t pos):
    # Write a reduced, four-slot description of a token into `c`:
    # the lexeme's sic, cluster and shape, followed by the supplied `pos`.
    # Unlike _fill_token, the slots are addressed by literal offsets 0..3
    # rather than by T_* constants.
    # NOTE(review): no caller of this helper is visible in this file section.
    c[0] = lex.sic
    c[1] = lex.cluster
    c[2] = lex.shape
    c[3] = pos


cdef int fill_context(atom_t* context, State* s, Tokens tokens) except -1:
    # Fill `context` with the features for the current state `s`.
    #
    # The array is first cleared (N_FIELDS slots), then populated with:
    #   * a five-token window around position s.i (two tokens before, the
    #     token itself, two tokens after), each written by _fill_token;
    #   * if an entity is open, the sic/cluster/pos of the entity's first
    #     token (E0_*), of the token at s.i - 1 (E_last_*), and, when
    #     s.curr.start + 1 is still before s.i, of the entity's second
    #     token (E1_*).
    # Slots that are not written stay 0.
    #
    # Returns 1; the signature reserves -1 for error propagation.
    cdef int i

    # Clear every slot in one call instead of a hand-written loop.
    # Assumes atom_t is a plain numeric typedef for which all-zero bytes
    # represent 0 -- TODO confirm against its declaration.
    memset(context, 0, N_FIELDS * sizeof(atom_t))

    i = s.i
    # NOTE(review): the window reads indices i-2 .. i+2 without a bounds
    # check; presumably tokens.lex / tokens.pos are padded on both ends --
    # confirm against the Tokens implementation.
    _fill_token(&context[P2_sic], tokens.lex[i-2], tokens.pos[i-2])
    _fill_token(&context[P1_sic], tokens.lex[i-1], tokens.pos[i-1])
    _fill_token(&context[W_sic], tokens.lex[i], tokens.pos[i])
    _fill_token(&context[N1_sic], tokens.lex[i+1], tokens.pos[i+1])
    _fill_token(&context[N2_sic], tokens.lex[i+2], tokens.pos[i+2])

    if entity_is_open(s):
        # First token of the open entity.
        context[E0_sic] = tokens.lex[s.curr.start].sic
        context[E0_cluster] = tokens.lex[s.curr.start].cluster
        context[E0_pos] = tokens.pos[s.curr.start]
        # Token immediately before the current position.
        context[E_last_sic] = tokens.lex[s.i-1].sic
        context[E_last_cluster] = tokens.lex[s.i-1].cluster
        context[E_last_pos] = tokens.pos[s.i-1]
        # Second token of the entity, only when it lies before s.i.
        if (s.curr.start + 1) < s.i:
            context[E1_sic] = tokens.lex[s.curr.start+1].sic
            context[E1_cluster] = tokens.lex[s.curr.start+1].cluster
            context[E1_pos] = tokens.pos[s.curr.start+1]
    return 1
* Resurrect old NER code. This version won't be the one that runs; we want to re-use the parser code. But for now this is a useful reference. 2015-02-02 08:37:25 +03:00			`from libc.string cimport memset`

			`from murmurhash.mrmr cimport hash64`
			`from ._state cimport entity_is_open`
			`from ..lexeme cimport *`


			`cdef int _fill_token(atom_t* c, Lexeme* lex, atom_t pos):`
			`c[T_sic] = lex.sic`
			`c[T_cluster] = lex.cluster`
			`c[T_norm] = lex.norm if (lex.prob != 0 and lex.prob >= -10) else lex.shape`
			`c[T_shape] = lex.shape`
			`c[T_asciied] = lex.asciied`
			`c[T_prefix] = lex.prefix`
			`c[T_suffix] = lex.suffix`
			`c[T_length] = lex.length`

			`c[T_postype] = lex.postype`
			`c[T_nertype] = 0`
			`c[T_sensetype] = 0`
Remove trailing whitespace 2015-04-19 11:31:31 +03:00
* Resurrect old NER code. This version won't be the one that runs; we want to re-use the parser code. But for now this is a useful reference. 2015-02-02 08:37:25 +03:00			`c[T_is_alpha] = lex.flags & (1 << IS_ALPHA)`
			`c[T_is_digit] = lex.flags & (1 << IS_DIGIT)`
			`c[T_is_lower] = lex.flags & (1 << IS_LOWER)`
			`c[T_is_punct] = lex.flags & (1 << IS_PUNCT)`
			`c[T_is_space] = lex.flags & (1 << IS_SPACE)`
			`c[T_is_title] = lex.flags & (1 << IS_TITLE)`
			`c[T_is_upper] = lex.flags & (1 << IS_UPPER)`
			`c[T_like_url] = lex.flags & (1 << LIKE_URL)`
			`c[T_like_number] = lex.flags & (1 << LIKE_NUMBER)`
			`c[T_oft_lower] = lex.flags & (1 << OFT_LOWER)`
			`c[T_oft_title] = lex.flags & (1 << OFT_TITLE)`
			`c[T_oft_upper] = lex.flags & (1 << OFT_UPPER)`

			`c[T_in_males] = lex.flags & (1 << IN_MALES)`
			`c[T_in_females] = lex.flags & (1 << IN_FEMALES)`
			`c[T_in_surnames] = lex.flags & (1 << IN_SURNAMES)`
			`c[T_in_places] = lex.flags & (1 << IN_PLACES)`
			`c[T_in_celebs] = lex.flags & (1 << IN_CELEBS)`
			`c[T_in_names] = lex.flags & (1 << IN_NAMES)`

			`c[T_pos] = pos`
			`c[T_sense] = 0`


			`cdef int _fill_outer_token(atom_t* c, Lexeme* lex, atom_t pos):`
			`c[0] = lex.sic`
			`c[1] = lex.cluster`
			`c[2] = lex.shape`
			`c[3] = pos`


			`cdef int fill_context(atom_t* context, State* s, Tokens tokens) except -1:`
			`cdef int i`
			`for i in range(N_FIELDS):`
			`context[i] = 0`
			`i = s.i`
			`_fill_token(&context[P2_sic], tokens.lex[i-2], tokens.pos[i-2])`
			`_fill_token(&context[P1_sic], tokens.lex[i-1], tokens.pos[i-1])`
			`_fill_token(&context[W_sic], tokens.lex[i], tokens.pos[i])`
			`_fill_token(&context[N1_sic], tokens.lex[i+1], tokens.pos[i+1])`
			`_fill_token(&context[N2_sic], tokens.lex[i+2], tokens.pos[i+2])`

			`cdef atom_t[5] ent_vals`
			`if entity_is_open(s):`
			`context[E0_sic] = tokens.lex[s.curr.start].sic`
			`context[E0_cluster] = tokens.lex[s.curr.start].cluster`
			`context[E0_pos] = tokens.pos[s.curr.start]`
			`context[E_last_sic] = tokens.lex[s.i-1].sic`
			`context[E_last_cluster] = tokens.lex[s.i-1].cluster`
			`context[E_last_pos] = tokens.pos[s.i-1]`
			`if (s.curr.start + 1) < s.i:`
			`context[E1_sic] = tokens.lex[s.curr.start+1].sic`
			`context[E1_cluster] = tokens.lex[s.curr.start+1].cluster`
			`context[E1_pos] = tokens.pos[s.curr.start+1]`
			`return 1`