mirror of
https://github.com/explosion/spaCy.git
synced 2025-01-26 09:14:32 +03:00
* Improve array features in tokens
This commit is contained in:
parent
43d5964e13
commit
7018b53d3a
|
@ -1,5 +1,6 @@
|
|||
from spacy.lexeme cimport LexemeC
|
||||
from libcpp.vector cimport vector
|
||||
from thinc.typedefs cimport atom_t
|
||||
|
||||
|
||||
cdef class Tokens:
|
||||
|
@ -9,6 +10,12 @@ cdef class Tokens:
|
|||
|
||||
cdef int extend(self, int i, LexemeC** lexemes, int n) except -1
|
||||
cdef int push_back(self, int i, LexemeC* lexeme) except -1
|
||||
# Feature-array fillers. Each one writes one atom per (feature, index) pair
# into `atoms`, starting at offset `i`, and returns the offset just past the
# last value written. Token indices outside the document produce a 0 atom.
# All three are declared `except -1`, so a -1 return is reserved for
# signalling a raised exception to the caller.
cdef int int_array(self, atom_t* atoms, int i, int* indices, int n_idx,
|
||||
int* features, int n_feat) except -1
|
||||
cdef int string_array(self, atom_t* atoms, int i, int* indices, int n_idx,
|
||||
int* features, int n_feat) except -1
|
||||
cdef int bool_array(self, atom_t* atoms, int i, int* indices, int n_idx,
|
||||
int* features, int n_feat) except -1
|
||||
|
||||
cpdef int id(self, size_t i) except -1
|
||||
cpdef float prob(self, size_t i) except 1
|
||||
|
|
|
@ -2,6 +2,9 @@
|
|||
from .word cimport Lexeme
|
||||
|
||||
from .lexeme cimport *
|
||||
cimport numpy
|
||||
cimport cython
|
||||
import numpy
|
||||
|
||||
|
||||
cdef class Tokens:
|
||||
|
@ -38,6 +41,8 @@ cdef class Tokens:
|
|||
del self.pos
|
||||
|
||||
def __getitem__(self, i):
    """Return the token at position `i` wrapped in a `Lexeme`.

    Raises:
        IndexError: if `i` is negative or not smaller than the number of
            tokens currently stored.
    """
    # `i` is an untyped Python object here, so a negative value would pass
    # the upper-bound comparison and only fail later, inside the implicit
    # conversion to size_t, with an OverflowError. Reject it explicitly so
    # every out-of-range index is reported as IndexError.
    if i < 0 or i >= self.lex.size():
        raise IndexError
    # The Lexeme wrapper receives the address of the stored struct.
    return Lexeme(<size_t>self.lex.at(i))
|
||||
|
||||
def __len__(self):
|
||||
|
@ -48,6 +53,45 @@ cdef class Tokens:
|
|||
self.idx.push_back(idx)
|
||||
return idx + lexeme.ints[<int>LexInt_length]
|
||||
|
||||
cdef int int_array(self, atom_t* output, int i, int* indices, int n_idx,
                   int* features, int n_feat) except -1:
    """Write integer lexeme features into `output`, starting at offset `i`.

    Values are laid out feature-major: for each of the `n_feat` entries of
    `features`, one value is written for each of the `n_idx` entries of
    `indices`. A token index that is negative or past the last token
    yields 0; otherwise the value is `ints[feat_id]` of that token's lexeme.

    No bounds check is made on `output`: the caller must provide room for
    `n_feat * n_idx` values beginning at `i`.

    Returns:
        The offset just past the last value written.
    """
    # The `except -1` clause mirrors the declaration in the .pxd file, so
    # the declaration and the implementation agree on the error convention.
    cdef int feat_id, idx
    cdef int length = self.lex.size()
    for feat_id in features[:n_feat]:
        for idx in indices[:n_idx]:
            if idx < 0 or idx >= length:
                # Out-of-document context position: pad with zero.
                output[i] = 0
            else:
                output[i] = self.lex[0][idx].ints[<int>feat_id]
            i += 1
    return i
|
||||
|
||||
cdef int string_array(self, atom_t* output, int i, int* indices, int n_idx,
                      int* features, int n_feat) except -1:
    """Write string lexeme features into `output`, starting at offset `i`.

    Values are laid out feature-major: for each of the `n_feat` entries of
    `features`, one value is written for each of the `n_idx` entries of
    `indices`. A token index that is negative or past the last token
    yields 0; otherwise the value is `strings[feat_id]` of that token's
    lexeme, cast to `atom_t`.

    No bounds check is made on `output`: the caller must provide room for
    `n_feat * n_idx` values beginning at `i`.

    Returns:
        The offset just past the last value written.
    """
    # The `except -1` clause mirrors the declaration in the .pxd file, so
    # the declaration and the implementation agree on the error convention.
    cdef int feat_id, idx
    cdef int length = self.lex.size()
    for feat_id in features[:n_feat]:
        for idx in indices[:n_idx]:
            if idx < 0 or idx >= length:
                # Out-of-document context position: pad with zero.
                output[i] = 0
            else:
                output[i] = <atom_t>self.lex[0][idx].strings[<int>feat_id]
            i += 1
    return i
|
||||
|
||||
cdef int bool_array(self, atom_t* output, int i, int* indices, int n_idx,
                    int* features, int n_feat) except -1:
    """Write boolean lexeme flags into `output`, starting at offset `i`.

    Values are laid out feature-major: for each of the `n_feat` entries of
    `features`, one value is written for each of the `n_idx` entries of
    `indices`. A token index that is negative or past the last token
    yields 0; otherwise the value is whatever `lexeme_check_dist_flag`
    returns for that token's lexeme and the flag id.

    No bounds check is made on `output`: the caller must provide room for
    `n_feat * n_idx` values beginning at `i`.

    Returns:
        The offset just past the last value written.
    """
    # The `except -1` clause mirrors the declaration in the .pxd file, so
    # the declaration and the implementation agree on the error convention.
    cdef int feat_id, idx
    cdef int length = self.lex.size()
    for feat_id in features[:n_feat]:
        for idx in indices[:n_idx]:
            if idx < 0 or idx >= length:
                # Out-of-document context position: pad with zero.
                output[i] = 0
            else:
                output[i] = lexeme_check_dist_flag(self.lex[0][idx], feat_id)
            i += 1
    return i
|
||||
|
||||
cdef int extend(self, int idx, LexemeC** lexemes, int n) except -1:
|
||||
cdef int i
|
||||
if lexemes == NULL:
|
||||
|
@ -89,6 +133,8 @@ cdef class Tokens:
|
|||
# methods, which requires them to know the IDs.
|
||||
|
||||
cpdef unicode string(self, size_t i):
    """Return the result of `self.orig(i)` for the token at position `i`.

    Raises:
        IndexError: if `i` is not smaller than the number of stored tokens.
    """
    if i < self.lex.size():
        return self.orig(i)
    raise IndexError
|
||||
|
||||
cpdef unicode orig(self, size_t i):
|
||||
|
|
Loading…
Reference in New Issue
Block a user