Improve efficiency of Doc.to_array

2026-03-05 20:31:30 +03:00 · 2017-11-17 18:55:56 +01:00 · 2017-11-17 18:55:56 +01:00 · e10e9ad2c5
commit e10e9ad2c5
parent 2acc907d55
1 changed files with 9 additions and 4 deletions
--- a/spacy/tokens/doc.pyx
+++ b/spacy/tokens/doc.pyx
@ -1,6 +1,7 @@
 # coding: utf8
 # cython: infer_types=True
 # cython: bounds_check=False
+# cython: profile=True
 from __future__ import unicode_literals

 cimport cython
@ -567,7 +568,6 @@ cdef class Doc:
        """
        cdef int i, j
        cdef attr_id_t feature
-        cdef np.ndarray[attr_t, ndim=1] attr_ids
        cdef np.ndarray[attr_t, ndim=2] output
        # Handle scalar/list inputs of strings/ints for py_attr_ids
        if not hasattr(py_attr_ids, '__iter__') \
@ -579,12 +579,17 @@ cdef class Doc:
                       for id_ in py_attr_ids]
        # Make an array from the attributes --- otherwise our inner loop is
        # Python dict iteration.
-        attr_ids = numpy.asarray(py_attr_ids, dtype=numpy.uint64)
+        cdef np.ndarray attr_ids = numpy.asarray(py_attr_ids, dtype='i')
        output = numpy.ndarray(shape=(self.length, len(attr_ids)),
                               dtype=numpy.uint64)
+        c_output = <attr_t*>output.data
+        c_attr_ids = <attr_id_t*>attr_ids.data
+        cdef TokenC* token
+        cdef int nr_attr = attr_ids.shape[0]
        for i in range(self.length):
-            for j, feature in enumerate(attr_ids):
-                output[i, j] = get_token_attr(&self.c[i], feature)
+            token = &self.c[i]
+            for j in range(nr_attr):
+                c_output[i*nr_attr + j] = get_token_attr(token, c_attr_ids[j])
        # Handle 1d case
        return output if len(attr_ids) >= 2 else output.reshape((self.length,))