From e10e9ad2c52cacb73e43d99d57c2da610a0cc323 Mon Sep 17 00:00:00 2001
From: Matthew Honnibal
Date: Fri, 17 Nov 2017 18:55:56 +0100
Subject: [PATCH] Improve efficiency of Doc.to_array

---
 spacy/tokens/doc.pyx | 13 +++++++++----
 1 file changed, 9 insertions(+), 4 deletions(-)

diff --git a/spacy/tokens/doc.pyx b/spacy/tokens/doc.pyx
index 62e65f366..4900a363d 100644
--- a/spacy/tokens/doc.pyx
+++ b/spacy/tokens/doc.pyx
@@ -1,6 +1,7 @@
 # coding: utf8
 # cython: infer_types=True
 # cython: bounds_check=False
+# cython: profile=True
 from __future__ import unicode_literals
 
 cimport cython
@@ -567,7 +568,6 @@ cdef class Doc:
         """
         cdef int i, j
         cdef attr_id_t feature
-        cdef np.ndarray[attr_t, ndim=1] attr_ids
         cdef np.ndarray[attr_t, ndim=2] output
         # Handle scalar/list inputs of strings/ints for py_attr_ids
         if not hasattr(py_attr_ids, '__iter__') \
@@ -579,12 +579,17 @@ cdef class Doc:
                        for id_ in py_attr_ids]
         # Make an array from the attributes --- otherwise our inner loop is
         # Python dict iteration.
-        attr_ids = numpy.asarray(py_attr_ids, dtype=numpy.uint64)
+        cdef np.ndarray attr_ids = numpy.asarray(py_attr_ids, dtype='i')
         output = numpy.ndarray(shape=(self.length, len(attr_ids)),
                                dtype=numpy.uint64)
+        c_output = <attr_t*>output.data
+        c_attr_ids = <attr_id_t*>attr_ids.data
+        cdef TokenC* token
+        cdef int nr_attr = attr_ids.shape[0]
         for i in range(self.length):
-            for j, feature in enumerate(attr_ids):
-                output[i, j] = get_token_attr(&self.c[i], feature)
+            token = &self.c[i]
+            for j in range(nr_attr):
+                c_output[i*nr_attr + j] = get_token_attr(token, c_attr_ids[j])
         # Handle 1d case
         return output if len(attr_ids) >= 2 else output.reshape((self.length,))
 