2014-12-16 14:44:43 +03:00
|
|
|
from libc.stdint cimport uint32_t
|
|
|
|
|
2014-12-02 15:48:05 +03:00
|
|
|
import numpy as np
|
|
|
|
cimport numpy as np
|
|
|
|
|
2014-10-22 18:57:59 +04:00
|
|
|
from cymem.cymem cimport Pool
|
2014-12-04 12:46:55 +03:00
|
|
|
from thinc.typedefs cimport atom_t
|
2014-10-22 18:57:59 +04:00
|
|
|
|
2014-10-23 17:59:17 +04:00
|
|
|
from .lexeme cimport Lexeme
|
2014-12-08 13:12:15 +03:00
|
|
|
|
2014-12-03 03:05:15 +03:00
|
|
|
from .typedefs cimport flags_t
|
2014-12-10 00:09:32 +03:00
|
|
|
from .typedefs cimport Morphology
|
|
|
|
from .lang cimport Language
|
2014-12-08 13:12:15 +03:00
|
|
|
|
|
|
|
|
2014-10-23 17:59:17 +04:00
|
|
|
|
2014-12-05 07:56:14 +03:00
|
|
|
# C-level struct grouping the per-token fields declared below.
# NOTE(review): this declaration does not show what the integer fields
# hold; the hedged notes on each field should be confirmed against the
# implementation file.
cdef struct TokenC:
|
|
|
|
# Read-only (const) pointer to a Lexeme, the type cimported from .lexeme.
const Lexeme* lex
|
2014-12-08 13:12:15 +03:00
|
|
|
# Morphology value stored inline; the type is cimported from .typedefs.
Morphology morph
|
2014-12-05 07:56:14 +03:00
|
|
|
# presumably the token's offset into the source text - TODO confirm
int idx
|
|
|
|
# presumably a part-of-speech tag ID - TODO confirm
int pos
|
2014-12-08 13:12:15 +03:00
|
|
|
# presumably an ID for the lemma string - TODO confirm
int lemma
|
2014-12-05 07:56:14 +03:00
|
|
|
# presumably a word-sense ID - TODO confirm
int sense
|
2014-12-16 14:44:43 +03:00
|
|
|
# presumably locates the syntactic head; whether absolute index or
# relative offset is not visible here - TODO confirm
int head
|
|
|
|
# presumably a dependency-label ID - TODO confirm
int dep_tag
|
|
|
|
# 32-bit unsigned values; presumably describe left/right children of the
# token (count vs. bitmask is not visible here) - TODO confirm
uint32_t l_kids
|
|
|
|
uint32_t r_kids
|
2014-12-05 07:56:14 +03:00
|
|
|
|
|
|
|
|
2014-12-09 08:50:01 +03:00
|
|
|
# Single-name alias for the pointer type `const Lexeme*`; it is listed as a
# member of the fused type LexemeOrToken below.
ctypedef const Lexeme* const_Lexeme_ptr
|
|
|
|
# Single-name alias for the pointer type `TokenC*`; also a member of
# LexemeOrToken below.
ctypedef TokenC* TokenC_ptr
|
|
|
|
|
|
|
|
|
# Fused (compile-time generic) type whose members are the two pointer
# aliases above. It is the type of the `lex_or_tok` parameter of
# Tokens.push_back, which can therefore take either pointer kind.
ctypedef fused LexemeOrToken:
|
|
|
|
const_Lexeme_ptr
|
|
|
|
TokenC_ptr
|
|
|
|
|
|
|
|
|
2014-09-15 05:22:40 +04:00
|
|
|
# Declaration of the Tokens extension type: its C-level attributes and the
# signatures of its C-visible methods. No attribute is declared `public`,
# so none of them is exposed as a Python attribute by this declaration.
cdef class Tokens:
|
2014-10-22 18:57:59 +04:00
|
|
|
# cymem Pool; presumably owns the memory behind `data` - TODO confirm
cdef Pool mem
|
2014-12-10 00:09:32 +03:00
|
|
|
# Language object (type cimported from .lang) associated with this instance.
cdef Language lang
|
|
|
|
# Python list; presumably tag-name strings - TODO confirm
cdef list tag_names
|
2014-10-22 18:57:59 +04:00
|
|
|
|
|
2014-12-05 07:56:14 +03:00
|
|
|
# Raw pointer to TokenC structs; presumably a buffer with room for
# `max_length` entries of which `length` are in use - TODO confirm
cdef TokenC* data
|
2014-10-22 18:57:59 +04:00
|
|
|
|
|
|
|
|
|
cdef int length
|
|
|
|
cdef int max_length
|
2014-10-14 08:21:03 +04:00
|
|
|
|
|
2014-12-09 08:50:01 +03:00
|
|
|
# C-only method taking an int `i` and either member of the fused type
# LexemeOrToken (a const Lexeme* or a TokenC*). The `except -1` clause
# reserves a return value of -1 to signal that a Python exception was raised.
cdef int push_back(self, int i, LexemeOrToken lex_or_tok) except -1
|
2014-10-23 17:59:17 +04:00
|
|
|
|
|
2014-12-04 12:46:55 +03:00
|
|
|
# cpdef: callable from both Cython and Python. Takes a Python list
# `features` and is declared to return a 2-dimensional NumPy array of C long.
cpdef np.ndarray[long, ndim=2] get_array(self, list features)
|
2014-12-02 15:48:05 +03:00
|
|
|
|
2014-10-23 17:59:17 +04:00
|
|
|
|
|
|
|
# Declaration of the Token extension type. Attributes marked `public` are
# readable and writable from Python; `pos` and `lemma` are declared without
# `public`, so this declaration gives them C-level access only.
# NOTE(review): what each attribute holds is not shown by this declaration;
# the names suggest per-token lexical and syntactic features - TODO confirm
# against the implementation file.
cdef class Token:
|
2014-12-10 00:09:32 +03:00
|
|
|
# Language object (type cimported from .lang) associated with this token.
cdef public Language lang
|
2014-10-23 17:59:17 +04:00
|
|
|
# presumably the token's position within its parent sequence - TODO confirm
cdef public int i
|
|
|
|
# presumably the token's offset into the source text - TODO confirm
cdef public int idx
|
2014-12-10 00:09:32 +03:00
|
|
|
# Not `public`: no Python attribute is generated for this field.
cdef int pos
|
2014-12-08 13:12:15 +03:00
|
|
|
# Not `public`: no Python attribute is generated for this field.
cdef int lemma
|
2014-12-17 13:10:12 +03:00
|
|
|
cdef public int head
|
|
|
|
cdef public int dep_tag
|
2014-10-23 17:59:17 +04:00
|
|
|
|
|
2014-10-31 09:44:39 +03:00
|
|
|
# The atom_t attributes below use the type cimported from thinc.typedefs.
cdef public atom_t id
|
2014-10-23 17:59:17 +04:00
|
|
|
cdef public atom_t cluster
|
|
|
|
cdef public atom_t length
|
2014-10-31 09:44:39 +03:00
|
|
|
cdef public atom_t postype
|
|
|
|
cdef public atom_t sensetype
|
2014-10-23 17:59:17 +04:00
|
|
|
|
|
2014-10-29 15:19:38 +03:00
|
|
|
cdef public atom_t sic
|
2014-10-23 17:59:17 +04:00
|
|
|
cdef public atom_t norm
|
|
|
|
cdef public atom_t shape
|
|
|
|
cdef public atom_t asciied
|
|
|
|
cdef public atom_t prefix
|
|
|
|
cdef public atom_t suffix
|
|
|
|
|
|
|
|
|
# presumably a (log-)probability for the word - TODO confirm
cdef public float prob
|
|
|
|
|
2014-12-03 03:05:15 +03:00
|
|
|
# flags_t value (type cimported from .typedefs); presumably a bit field of
# boolean lexical flags - TODO confirm
cdef public flags_t flags
|