Make small changes to Doc.to_array

* Change type-check logic to 'hasattr' (Python type-checking is brittle)
* Small 'house style' edits, mostly making code more terse.
2025-07-14 18:22:27 +03:00 · 2017-10-20 11:17:00 +02:00 · 2017-10-20 11:17:00 +02:00 · c0799430a7
commit c0799430a7
parent fbccc8c87d
1 changed files with 17 additions and 20 deletions
--- a/spacy/tokens/doc.pyx
+++ b/spacy/tokens/doc.pyx
@@ -477,11 +477,11 @@ cdef class Doc:
    cpdef np.ndarray to_array(self, object py_attr_ids):
        """Export given token attributes to a numpy `ndarray`.
-	If `attr_ids` is a sequence of M attributes, the output array will
+        If `attr_ids` is a sequence of M attributes, the output array will
-	be of shape `(N, M)`, where N is the length of the `Doc`
+        be of shape `(N, M)`, where N is the length of the `Doc`
-	(in tokens). If `attr_ids` is a single attribute, the output shape will
+        (in tokens). If `attr_ids` is a single attribute, the output shape will
-	be (N,). You can specify attributes by integer ID (e.g. spacy.attrs.LEMMA)
+        be (N,). You can specify attributes by integer ID (e.g. spacy.attrs.LEMMA)
-	or string name (e.g. 'LEMMA' or 'lemma').
+        or string name (e.g. 'LEMMA' or 'lemma').
        Example:
            from spacy import attrs
@@ -499,28 +499,25 @@ cdef class Doc:
        """
        cdef int i, j
        cdef attr_id_t feature
        cdef np.ndarray[attr_t, ndim=1] attr_ids, output_1D
        cdef np.ndarray[attr_t, ndim=2] output
        cdef np.ndarray[attr_t, ndim=1] output_1D
        # Handle scalar/list inputs of strings/ints for py_attr_ids
-        if( type(py_attr_ids) is not list and type(py_attr_ids) is not tuple ):
+	    if not hasattr(py_attr_ids, '__iter__'):
-            py_attr_ids = [ py_attr_ids ]
+            py_attr_ids = [py_attr_ids]
-        py_attr_ids_input = []
+	
-        for py_attr_id in py_attr_ids:
+        # Allow strings, e.g. 'lemma' or 'LEMMA'
-            if( type(py_attr_id) is int ):
+        convert_id = lambda id_:  IDS[id_.upper()] if hasattr(id_, 'upper') else id_
-                py_attr_ids_input.append(py_attr_id)
+        # Make an array from the attributes --- otherwise inner loop would be Python
            else:
                py_attr_ids_input.append(IDS[py_attr_id.upper()])
        # Make an array from the attributes --- otherwise our inner loop is Python
        # dict iteration.
-        cdef np.ndarray[attr_t, ndim=1] attr_ids = numpy.asarray(py_attr_ids_input, dtype=numpy.int32)
+        attr_ids = numpy.asarray((convert_id(id_) for id_ in py_attr_ids),
                                 dtype=numpy.int32)
        output = numpy.ndarray(shape=(self.length, len(attr_ids)), dtype=numpy.int32)
        for i in range(self.length):
            for j, feature in enumerate(attr_ids):
                output[i, j] = get_token_attr(&self.c[i], feature)
-        if( len(attr_ids) == 1 ):
+        # Handle 1d case
-            output_1D = output.reshape((self.length))
+        return output if len(attr_ids) >= 2 else output.reshape((self.length,))
            return output_1D
        return output
    def count_by(self, attr_id_t attr_id, exclude=None, PreshCounter counts=None):
        """