Add Span.to_array method

This commit is contained in:
Matthew Honnibal 2017-08-19 12:20:45 +02:00
parent 19c495f451
commit 482bba1722
2 changed files with 24 additions and 2 deletions

View File

@ -15,5 +15,5 @@ cdef class Span:
cdef public _vector
cdef public _vector_norm
cpdef int _recalculate_indices(self) except -1
cpdef np.ndarray to_array(self, object features)

View File

@ -7,7 +7,7 @@ import numpy
import numpy.linalg
from libc.math cimport sqrt
from .doc cimport token_by_start, token_by_end
from .doc cimport token_by_start, token_by_end, get_token_attr
from ..structs cimport TokenC, LexemeC
from ..typedefs cimport flags_t, attr_t, hash_t
from ..attrs cimport attr_id_t
@ -135,6 +135,28 @@ cdef class Span:
return 0.0
return numpy.dot(self.vector, other.vector) / (self.vector_norm * other.vector_norm)
cpdef np.ndarray to_array(self, object py_attr_ids):
    """Given a list of M attribute IDs, export the span's tokens to a numpy
    `ndarray` of shape `(N, M)`, where `N` is the number of tokens in the
    span (`self.end - self.start`). The array is allocated with dtype
    `numpy.uint64`.

    py_attr_ids (list[int]): A list of attribute ID ints.
    RETURNS (numpy.ndarray[uint64, ndim=2]): A feature matrix, with one row
        per token of the span, and one column per attribute indicated in
        the input `py_attr_ids`.
    """
    cdef int i, j
    cdef attr_id_t feature
    cdef np.ndarray[attr_t, ndim=2] output
    # Make an array from the attributes --- otherwise our inner loop is Python
    # dict iteration.
    cdef np.ndarray[attr_t, ndim=1] attr_ids = numpy.asarray(py_attr_ids, dtype=numpy.uint64)
    # Size the output from the same bounds the loop below iterates over, so
    # the row count and the loop can never disagree.
    cdef int length = self.end - self.start
    output = numpy.ndarray(shape=(length, len(attr_ids)), dtype=numpy.uint64)
    for i in range(self.start, self.end):
        for j, feature in enumerate(attr_ids):
            # `i` is an index into the parent doc's token array; the output
            # rows are span-relative, so shift by the span's start offset.
            output[i - self.start, j] = get_token_attr(&self.doc.c[i], feature)
    return output
cpdef int _recalculate_indices(self) except -1:
if self.end > self.doc.length \
or self.doc.c[self.start].idx != self.start_char \