spaCy/spacy/structs.pxd

from libc.stdint cimport int8_t, int32_t, uint8_t, uint16_t, uint32_t

from .typedefs cimport flags_t, attr_t, id_t, hash_t
from .parts_of_speech cimport univ_pos_t


# C-level record holding the attributes of a lexeme. TokenC.lex refers to one
# of these through a const pointer.
#
# NOTE(review): the per-field notes below are inferred from the field names and
# types only; confirm them against the code that fills the struct in.
cdef struct LexemeC:
    # Pointer to read-only floats; presumably the word's vector representation.
    const float* repvec

    # Presumably a bit set of boolean lexical features -- confirm in typedefs.
    flags_t flags
   
    attr_t id
    attr_t length

    # Attribute values for the different views of the word. Presumably each is
    # an ID standing for a string (verbatim form, lower-cased form, normalised
    # form, word shape, prefix, suffix) -- TODO confirm.
    attr_t orth
    attr_t lower
    attr_t norm
    attr_t shape
    attr_t prefix
    attr_t suffix
 
    attr_t cluster

    float prob
    float sentiment
    # Presumably the L2 norm of repvec, cached here -- TODO confirm.
    float l2_norm


# Morphological features, one unsigned byte per feature. Embedded by value in
# both PosTag and TokenC.
#
# NOTE(review): the meaning of the individual byte values is not defined in
# this file; look for the matching enums/constants before relying on them.
cdef struct Morphology:
    uint8_t number
    uint8_t tenspect # Tense/aspect/voice
    uint8_t mood
    uint8_t gender
    uint8_t person
    uint8_t case
    uint8_t misc


# Bundles an integer tag ID with a universal part-of-speech value and a set of
# morphological features.
#
# NOTE(review): presumably one entry per fine-grained tag in a tag table --
# confirm against the tagger code that builds these.
cdef struct PosTag:
    Morphology morph  # stored by value, not by pointer
    int id
    univ_pos_t pos    # type is declared in parts_of_speech.pxd


# Entity annotation carried by each token (see TokenC.ent).
#
# `start` and `end` are offsets relative to the token's own index i, not
# absolute positions.
#
# start: i + ent.start always takes you to the "next" entity start.
#   * If the token is inside an entity, ent.start is negative: the "next"
#     start is the start of the entity the token is inside.
#   * If i _is_ the start of an entity, ent.start points to the beginning of
#     the following entity.
#
# end: the same/inverse holds.
#   * If ent.end is negative, the token is either at the end of an entity or
#     outside one.
#   * If the token is inside an entity, ent.end is positive.
#
# This allows us to easily find the span of an entity we might be inside, while
# naturally sharing an API with iterating through all entities in the sentence.
#
# NOTE(review): start/end are int8_t, so an offset can only span -128..127
# tokens -- confirm that this range is sufficient for the intended inputs.
cdef struct Entity:
    int32_t tag
    uint16_t flags
    int8_t start
    int8_t end


# C-level record for a single token. It combines a pointer to lexeme data with
# the token's own annotations (morphology, entity, part of speech and a set of
# integer fields).
#
# NOTE(review): the field notes marked "presumably" are inferred from names
# only; confirm them against the tokenizer/parser code that writes this struct.
cdef struct TokenC:
    # Points at the lexeme record; declared const, so it cannot be modified
    # through this pointer.
    const LexemeC* lex
    Morphology morph  # stored by value
    Entity ent        # stored by value; see the notes on Entity for offsets
    univ_pos_t pos
    int tag
    int idx           # presumably the token's offset in the text -- TODO confirm
    int lemma
    int sense
    int head          # presumably locates the syntactic head -- TODO confirm
    int dep
    bint sent_end     # boolean flag; presumably marks a sentence-final token
    # NOTE(review): 32-bit unsigned values; presumably bit masks or counts
    # describing left and right children -- confirm against the parser.
    uint32_t l_kids
    uint32_t r_kids


# A byte string paired with an ID and a hash key.
#
# NOTE(review): the name suggests `chars` holds UTF-8 encoded bytes and that
# `length` is the byte count; neither that nor NUL-termination or ownership of
# the buffer is established in this file -- confirm at the allocation site.
cdef struct Utf8Str:
    id_t i
    hash_t key
    unsigned char* chars
    int length


# A Py_UNICODE buffer paired with a size and a hash key.
#
# NOTE(review): `n` is presumably the number of Py_UNICODE units in `chars`,
# and `key` presumably the hash of that content -- confirm where the struct is
# initialised. Ownership of `chars` is not established in this file.
cdef struct UniStr:
    Py_UNICODE* chars
    size_t n
    hash_t key
* Start setting out how NER will be implemented in the data model 2015-02-02 08:35:58 +03:00			`from libc.stdint cimport int8_t, uint8_t, uint16_t, uint32_t`
* Move all struct definitions to structs.pxd, to avoid circular dependencies 2014-12-19 22:51:33 +03:00
* Move POS tag definitions to parts_of_speech.pxd 2015-01-25 08:31:07 +03:00			`from .typedefs cimport flags_t, attr_t, id_t, hash_t`
			`from .parts_of_speech cimport univ_pos_t`
* Move all struct definitions to structs.pxd, to avoid circular dependencies 2014-12-19 22:51:33 +03:00

* Tmp commit. Refactoring to create a Python Lexeme class. 2015-01-12 02:26:22 +03:00			`cdef struct LexemeC:`
* Rename vec to repvec 2015-01-21 18:04:24 +03:00			`const float* repvec`
* Tmp commit. Refactoring to create a Python Lexeme class. 2015-01-12 02:26:22 +03:00
* Move all struct definitions to structs.pxd, to avoid circular dependencies 2014-12-19 22:51:33 +03:00			`flags_t flags`

			`attr_t id`
* Tmp. Refactoring, introducing a Lexeme PyObject. 2015-01-12 03:23:44 +03:00			`attr_t length`

* Rename sic to orth 2015-01-22 18:08:25 +03:00			`attr_t orth`
* Rename NORM1 and NORM2 attrs to lower and norm 2015-01-23 22:17:03 +03:00			`attr_t lower`
			`attr_t norm`
* Move all struct definitions to structs.pxd, to avoid circular dependencies 2014-12-19 22:51:33 +03:00			`attr_t shape`
			`attr_t prefix`
			`attr_t suffix`

			`attr_t cluster`

			`float prob`
			`float sentiment`
* Add L2 norm field to LexemeC struct 2015-02-07 16:43:17 +03:00			`float l2_norm`
* Move all struct definitions to structs.pxd, to avoid circular dependencies 2014-12-19 22:51:33 +03:00

			`cdef struct Morphology:`
			`uint8_t number`
			`uint8_t tenspect # Tense/aspect/voice`
			`uint8_t mood`
			`uint8_t gender`
			`uint8_t person`
			`uint8_t case`
			`uint8_t misc`


			`cdef struct PosTag:`
			`Morphology morph`
			`int id`
* Move POS tag definitions to parts_of_speech.pxd 2015-01-25 08:31:07 +03:00			`univ_pos_t pos`
* Move all struct definitions to structs.pxd, to avoid circular dependencies 2014-12-19 22:51:33 +03:00

* Start setting out how NER will be implemented in the data model 2015-02-02 08:35:58 +03:00			`# Start and end will be offsets: i + ent.start will always take you to the`
			`# "next" entity start. If inside an entity, ent.start will be negative ---`
			`# the next entity is the start of the one the token is inside. If i _is_`
			`# the start of an entity, then ent.start will be the beginning of the next one.`
			`#`
			`# The same/inverse is true for end. If ent.end has a negative value, we are either`
			`# at the end of an entity, or outside one. If we're inside an entity, ent.end`
			`# will have a positive value.`
			`#`
			`# This allows us to easily find the span of an entity we might be inside, while`
			`# naturally sharing an API with iterating through all entities in the sentence`
			`cdef struct Entity:`
			`int32_t tag`
			`uint16_t flags`
			`int8_t start`
			`int8_t end`


* Move all struct definitions to structs.pxd, to avoid circular dependencies 2014-12-19 22:51:33 +03:00			`cdef struct TokenC:`
* Tmp commit. Refactoring to create a Python Lexeme class. 2015-01-12 02:26:22 +03:00			`const LexemeC* lex`
* Move all struct definitions to structs.pxd, to avoid circular dependencies 2014-12-19 22:51:33 +03:00			`Morphology morph`
* Start setting out how NER will be implemented in the data model 2015-02-02 08:35:58 +03:00			`Entity ent`
* Move POS tag definitions to parts_of_speech.pxd 2015-01-25 08:31:07 +03:00			`univ_pos_t pos`
* Work on word vectors, and other stuff 2015-01-17 08:21:17 +03:00			`int tag`
* Move all struct definitions to structs.pxd, to avoid circular dependencies 2014-12-19 22:51:33 +03:00			`int idx`
			`int lemma`
			`int sense`
			`int head`
* Work on word vectors, and other stuff 2015-01-17 08:21:17 +03:00			`int dep`
* Add sent_end flag to TokenC struct 2015-01-31 05:44:16 +03:00			`bint sent_end`
* Move all struct definitions to structs.pxd, to avoid circular dependencies 2014-12-19 22:51:33 +03:00			`uint32_t l_kids`
			`uint32_t r_kids`


			`cdef struct Utf8Str:`
			`id_t i`
			`hash_t key`
			`unsigned char* chars`
			`int length`


			`cdef struct UniStr:`
			`Py_UNICODE* chars`
			`size_t n`
			`hash_t key`