Make small changes to Doc.to_array

* Change type-check logic from exact `type(...) is` comparisons to a 'hasattr' duck-typing check (exact type-checking in Python is brittle)
* Small 'house style' edits, mostly making the code more terse.
This commit is contained in:
Matthew Honnibal 2017-10-20 11:17:00 +02:00 committed by GitHub
parent fbccc8c87d
commit c0799430a7

View File

@ -477,11 +477,11 @@ cdef class Doc:
cpdef np.ndarray to_array(self, object py_attr_ids): cpdef np.ndarray to_array(self, object py_attr_ids):
"""Export given token attributes to a numpy `ndarray`. """Export given token attributes to a numpy `ndarray`.
If `attr_ids` is a sequence of M attributes, the output array will If `attr_ids` is a sequence of M attributes, the output array will
be of shape `(N, M)`, where N is the length of the `Doc` be of shape `(N, M)`, where N is the length of the `Doc`
(in tokens). If `attr_ids` is a single attribute, the output shape will (in tokens). If `attr_ids` is a single attribute, the output shape will
be (N,). You can specify attributes by integer ID (e.g. spacy.attrs.LEMMA) be (N,). You can specify attributes by integer ID (e.g. spacy.attrs.LEMMA)
or string name (e.g. 'LEMMA' or 'lemma'). or string name (e.g. 'LEMMA' or 'lemma').
Example: Example:
from spacy import attrs from spacy import attrs
@ -499,28 +499,25 @@ cdef class Doc:
""" """
cdef int i, j cdef int i, j
cdef attr_id_t feature cdef attr_id_t feature
cdef np.ndarray[attr_t, ndim=1] attr_ids, output_1D
cdef np.ndarray[attr_t, ndim=2] output cdef np.ndarray[attr_t, ndim=2] output
cdef np.ndarray[attr_t, ndim=1] output_1D
# Handle scalar/list inputs of strings/ints for py_attr_ids # Handle scalar/list inputs of strings/ints for py_attr_ids
if( type(py_attr_ids) is not list and type(py_attr_ids) is not tuple ): if not hasattr(py_attr_ids, '__iter__'):
py_attr_ids = [ py_attr_ids ] py_attr_ids = [py_attr_ids]
py_attr_ids_input = []
for py_attr_id in py_attr_ids: # Allow strings, e.g. 'lemma' or 'LEMMA'
if( type(py_attr_id) is int ): convert_id = lambda id_: IDS[id_.upper()] if hasattr(id_, 'upper') else id_
py_attr_ids_input.append(py_attr_id) # Make an array from the attributes --- otherwise inner loop would be Python
else:
py_attr_ids_input.append(IDS[py_attr_id.upper()])
# Make an array from the attributes --- otherwise our inner loop is Python
# dict iteration. # dict iteration.
cdef np.ndarray[attr_t, ndim=1] attr_ids = numpy.asarray(py_attr_ids_input, dtype=numpy.int32) attr_ids = numpy.asarray((convert_id(id_) for id_ in py_attr_ids),
dtype=numpy.int32)
output = numpy.ndarray(shape=(self.length, len(attr_ids)), dtype=numpy.int32) output = numpy.ndarray(shape=(self.length, len(attr_ids)), dtype=numpy.int32)
for i in range(self.length): for i in range(self.length):
for j, feature in enumerate(attr_ids): for j, feature in enumerate(attr_ids):
output[i, j] = get_token_attr(&self.c[i], feature) output[i, j] = get_token_attr(&self.c[i], feature)
if( len(attr_ids) == 1 ): # Handle 1d case
output_1D = output.reshape((self.length)) return output if len(attr_ids) >= 2 else output.reshape((self.length,))
return output_1D
return output
def count_by(self, attr_id_t attr_id, exclude=None, PreshCounter counts=None): def count_by(self, attr_id_t attr_id, exclude=None, PreshCounter counts=None):
""" """