spaCy/spacy/structs.pxd

from libc.stdint cimport int8_t, uint8_t, uint16_t, uint32_t

from .typedefs cimport flags_t, attr_t, id_t, hash_t
from .parts_of_speech cimport univ_pos_t


cdef struct LexemeC:
    const float* repvec

    flags_t flags

    attr_t id
    attr_t length

    attr_t orth
    attr_t lower
    attr_t norm
    attr_t shape
    attr_t prefix
    attr_t suffix

    attr_t cluster

    float prob
    float sentiment
    float l2_norm


cdef struct Morphology:
    uint8_t number
    uint8_t tenspect # Tense/aspect/voice
    uint8_t mood
    uint8_t gender
    uint8_t person
    uint8_t case
    uint8_t misc


cdef struct PosTag:
    Morphology morph
    int id
    univ_pos_t pos


# Start and end will be offsets: i + ent.start will always take you to the
# "next" entity start. If inside an entity, ent.start will be negative ---
# the next entity is the start of the one the token is inside.  If i _is_
# the start of an entity, then ent.start will be the beginning of the next one.
#
# The same/inverse is true for end. If ent.end has a negative value, we are either
# at the end of an entity, or outside one.  If we're inside an entity, ent.end
# will have a positive value.
#
# This allows us to easily find the span of an entity we might be inside, while
# naturally sharing an API with iterating through all entities in the sentence
cdef struct Entity:
    int32_t tag
    uint16_t flags
    int8_t start
    int8_t end


cdef struct TokenC:
    const LexemeC* lex
    Morphology morph
    Entity ent
    univ_pos_t pos
    int tag
    int idx
    int lemma
    int sense
    int head
    int dep
    bint sent_end
    uint32_t l_kids
    uint32_t r_kids


cdef struct Utf8Str:
    id_t i
    hash_t key
    unsigned char* chars
    int length


cdef struct UniStr:
    Py_UNICODE* chars
    size_t n
    hash_t key