Add Span.to_array method

2025-11-07 19:37:38 +03:00 · 2017-08-19 12:20:45 +02:00 · 2017-08-19 12:20:45 +02:00 · 482bba1722
commit 482bba1722
parent 19c495f451
2 changed files with 24 additions and 2 deletions
--- a/spacy/tokens/span.pxd
+++ b/spacy/tokens/span.pxd
@ -15,5 +15,5 @@ cdef class Span:
    cdef public _vector
    cdef public _vector_norm
    cpdef int _recalculate_indices(self) except -1
    cpdef np.ndarray to_array(self, object features)
--- a/spacy/tokens/span.pyx
+++ b/spacy/tokens/span.pyx
@ -7,7 +7,7 @@ import numpy
 import numpy.linalg
 from libc.math cimport sqrt
-from .doc cimport token_by_start, token_by_end
+from .doc cimport token_by_start, token_by_end, get_token_attr
 from ..structs cimport TokenC, LexemeC
 from ..typedefs cimport flags_t, attr_t, hash_t
 from ..attrs cimport attr_id_t
@ -135,6 +135,28 @@ cdef class Span:
            return 0.0
        return numpy.dot(self.vector, other.vector) / (self.vector_norm * other.vector_norm)
    cpdef np.ndarray to_array(self, object py_attr_ids):
        """Given a list of M attribute IDs, export the span's tokens to a numpy
        `ndarray` of shape `(N, M)`, where `N` is the number of tokens in the
        span (`end - start`). The values are unsigned 64-bit integers.

        py_attr_ids (list[int]): A list of attribute ID ints.
        RETURNS (numpy.ndarray[uint64, ndim=2]): A feature matrix, with one row
            per token in the span, and one column per attribute indicated in
            the input `py_attr_ids`.
        """
        cdef int i, j
        cdef attr_id_t feature
        cdef np.ndarray[attr_t, ndim=2] output
        # Make an array from the attributes --- otherwise our inner loop is Python
        # dict iteration.
        cdef np.ndarray[attr_t, ndim=1] attr_ids = numpy.asarray(py_attr_ids, dtype=numpy.uint64)
        # Derive the row count from the same bounds the loop below walks, so
        # the buffer size and the iteration range cannot disagree.
        cdef int length = self.end - self.start
        output = numpy.ndarray(shape=(length, len(attr_ids)), dtype=numpy.uint64)
        for i in range(self.start, self.end):
            for j, feature in enumerate(attr_ids):
                # `i` indexes the parent doc's token array, while rows of
                # `output` are relative to the span start. Writing to
                # `output[i, j]` would go out of bounds whenever start > 0.
                output[i - self.start, j] = get_token_attr(&self.doc.c[i], feature)
        return output
    cpdef int _recalculate_indices(self) except -1:
        if self.end > self.doc.length \
        or self.doc.c[self.start].idx != self.start_char \