2014-12-16 14:44:43 +03:00
|
|
|
from libc.stdint cimport uint32_t
|
|
|
|
|
2015-01-19 11:59:55 +03:00
|
|
|
from numpy cimport ndarray
|
2015-01-24 09:29:04 +03:00
|
|
|
cimport numpy
|
2014-12-02 15:48:05 +03:00
|
|
|
|
2014-10-22 18:57:59 +04:00
|
|
|
from cymem.cymem cimport Pool
|
2014-12-21 23:25:43 +03:00
|
|
|
from thinc.typedefs cimport atom_t
|
2014-10-22 18:57:59 +04:00
|
|
|
|
2015-01-25 08:31:07 +03:00
|
|
|
from .typedefs cimport flags_t, attr_id_t, attr_t
|
|
|
|
from .parts_of_speech cimport univ_pos_t
|
2015-01-12 02:26:22 +03:00
|
|
|
from .structs cimport Morphology, TokenC, LexemeC
|
2014-12-21 23:25:43 +03:00
|
|
|
from .vocab cimport Vocab
|
2014-12-19 23:03:26 +03:00
|
|
|
from .strings cimport StringStore
|
2014-12-05 07:56:14 +03:00
|
|
|
|
|
|
|
|
2015-01-12 02:26:22 +03:00
|
|
|
# Named aliases for the two pointer types, so that each can be listed as a
# member of the fused type declared below.
ctypedef const LexemeC* const_Lexeme_ptr
ctypedef TokenC* TokenC_ptr


# Fused (compile-time generic) argument type: a function declared with a
# LexemeOrToken parameter, such as Tokens.push_back, is specialised once for
# a const LexemeC* argument and once for a TokenC* argument.
ctypedef fused LexemeOrToken:
    const_Lexeme_ptr
    TokenC_ptr
|
|
|
|
|
|
|
|
|
2015-01-12 02:26:22 +03:00
|
|
|
# Forward declaration (the body is not in this file): look up the attribute
# identified by `feat_name` on a lexeme struct. Callable without the GIL.
cdef attr_t get_lex_attr(const LexemeC* lex, attr_id_t feat_name) nogil
|
2014-12-24 09:42:00 +03:00
|
|
|
# Forward declaration (the body is not in this file): look up the attribute
# identified by `feat_name` on a token struct. Callable without the GIL.
# NOTE(review): the parameter is named `lex` although it points at a TokenC.
cdef attr_t get_token_attr(const TokenC* lex, attr_id_t feat_name) nogil
|
|
|
|
|
2015-01-12 02:26:22 +03:00
|
|
|
cdef inline bint check_flag(const LexemeC* lexeme, attr_id_t flag_id) nogil:
    # Report whether bit number `flag_id` is set in `lexeme.flags`.
    #
    # lexeme:  struct whose `flags` bitfield is inspected (read only).
    # flag_id: zero-based index of the bit to test.
    # Returns a true value if the bit is set, a false value otherwise.
    #
    # The mask is built from a flags_t-typed one rather than the bare literal
    # `1`: a bare literal is a C int, so shifting it by a flag id at or above
    # the width of int is undefined behaviour and could never address the
    # upper bits of a wider flag field.
    # NOTE(review): assumes LexemeC.flags is declared as flags_t -- confirm
    # in structs.pxd.
    cdef flags_t one = 1
    return (lexeme.flags & (one << flag_id)) != 0
|
|
|
|
|
|
|
|
|
2014-09-15 05:22:40 +04:00
|
|
|
cdef class Tokens:
    # C-level declaration of the Tokens extension type. Attributes are kept
    # in their original order because declaration order determines the
    # layout of the underlying object struct.
    cdef:
        Pool mem                # cymem memory pool held by this object
        Vocab vocab
        TokenC* data            # pointer to an array of TokenC structs
        list _py_tokens         # indexed by token offset; Token.cinit stores
                                # the Token wrapper it creates in this list
        unicode _string
        tuple _tag_strings, _dep_strings

    # Readable and writable from Python code.
    cdef public bint is_tagged, is_parsed

    # NOTE(review): presumably the number of tokens in use and the allocated
    # capacity of `data` -- confirm against the implementation file.
    cdef int length, max_length

    # Fused-type method: accepts either a const LexemeC* or a TokenC*.
    # A return value of -1 signals that an exception was raised.
    cdef int push_back(self, int i, LexemeOrToken lex_or_tok) except -1

    # Callable from both C and Python; returns a 2-D memoryview of C longs.
    cpdef long[:,:] to_array(self, object features)
|
2014-12-02 15:48:05 +03:00
|
|
|
|
2014-10-23 17:59:17 +04:00
|
|
|
|
|
|
|
cdef class Token:
    # C-level declaration of the Token extension type. Attributes are kept
    # in their original order because declaration order determines the
    # layout of the underlying object struct.
    cdef:
        Vocab vocab
        unicode _string
        const TokenC* c         # the TokenC struct this object wraps

    # Offset of this token; read-only from Python code.
    cdef readonly int i

    cdef:
        int array_len
        bint _owns_c_data       # not assigned by cinit
        Tokens _seq             # the Tokens object this token was created for
        tuple _tag_strings, _dep_strings

    @staticmethod
    cdef inline Token cinit(Vocab vocab, unicode string,
                            const TokenC* token, int offset, int array_len,
                            Tokens parent_seq, tuple tag_strings,
                            tuple dep_strings):
        # Return the Token wrapper for position `offset` of `parent_seq`.
        #
        # If parent_seq._py_tokens already holds a wrapper at `offset`, that
        # object is returned. Otherwise a new Token is created, populated
        # from the arguments, stored in parent_seq._py_tokens[offset], and
        # returned.
        #
        # Raises IndexError when `offset` lies outside [0, array_len).
        cdef Token tok

        if not (0 <= offset < array_len):
            template = "Attempt to access token at %d, max length %d"
            raise IndexError(template % (offset, array_len))

        # Reuse an existing wrapper so each position maps to one Token.
        cached = parent_seq._py_tokens[offset]
        if cached is not None:
            return cached

        tok = Token.__new__(Token, vocab, string)
        tok.c = token
        tok.i = offset
        tok.array_len = array_len
        tok._tag_strings = tag_strings
        tok._dep_strings = dep_strings
        tok._seq = parent_seq

        # Register the new wrapper so later calls for this offset return it.
        parent_seq._py_tokens[offset] = tok
        return tok

    # Declared here, implemented elsewhere. A return value of -1 signals
    # that an exception was raised.
    cdef int take_ownership_of_c_data(self) except -1
|