From 482bba1722b848a92d6f19ec2bb3152ed1b84ae4 Mon Sep 17 00:00:00 2001
From: Matthew Honnibal
Date: Sat, 19 Aug 2017 12:20:45 +0200
Subject: [PATCH] Add Span.to_array method

---
 spacy/tokens/span.pxd |  2 +-
 spacy/tokens/span.pyx | 24 +++++++++++++++++++++++-
 2 files changed, 24 insertions(+), 2 deletions(-)

diff --git a/spacy/tokens/span.pxd b/spacy/tokens/span.pxd
index 8d675c04f..9645189a5 100644
--- a/spacy/tokens/span.pxd
+++ b/spacy/tokens/span.pxd
@@ -15,5 +15,5 @@ cdef class Span:
     cdef public _vector
     cdef public _vector_norm
 
-
     cpdef int _recalculate_indices(self) except -1
+    cpdef np.ndarray to_array(self, object features)
diff --git a/spacy/tokens/span.pyx b/spacy/tokens/span.pyx
index 9f2115fe1..9625b5547 100644
--- a/spacy/tokens/span.pyx
+++ b/spacy/tokens/span.pyx
@@ -7,7 +7,7 @@ import numpy
 import numpy.linalg
 from libc.math cimport sqrt
 
-from .doc cimport token_by_start, token_by_end
+from .doc cimport token_by_start, token_by_end, get_token_attr
 from ..structs cimport TokenC, LexemeC
 from ..typedefs cimport flags_t, attr_t, hash_t
 from ..attrs cimport attr_id_t
@@ -135,6 +135,28 @@ cdef class Span:
             return 0.0
         return numpy.dot(self.vector, other.vector) / (self.vector_norm * other.vector_norm)
 
+    cpdef np.ndarray to_array(self, object py_attr_ids):
+        """Given a list of M attribute IDs, export the tokens to a numpy
+        `ndarray` of shape `(N, M)`, where `N` is the length of the span.
+        The values will be 64-bit unsigned integers.
+
+        py_attr_ids (list[int]): A list of attribute ID ints.
+        RETURNS (numpy.ndarray[uint64, ndim=2]): A feature matrix, with one row
+            per token in the span (row 0 is the span's first token), and one
+            column per attribute indicated in the input `py_attr_ids`.
+        """
+        cdef int i, j
+        cdef attr_id_t feature
+        cdef np.ndarray[attr_t, ndim=2] output
+        # Make an array from the attributes --- otherwise our inner loop is Python
+        # dict iteration.
+        cdef np.ndarray[attr_t, ndim=1] attr_ids = numpy.asarray(py_attr_ids, dtype=numpy.uint64)
+        output = numpy.ndarray(shape=(self.end - self.start, len(attr_ids)), dtype=numpy.uint64)
+        for i in range(self.start, self.end):
+            for j, feature in enumerate(attr_ids):
+                output[i - self.start, j] = get_token_attr(&self.doc.c[i], feature)
+        return output
+
     cpdef int _recalculate_indices(self) except -1:
         if self.end > self.doc.length \
             or self.doc.c[self.start].idx != self.start_char \