diff --git a/spacy/tests/doc/test_array.py b/spacy/tests/doc/test_array.py
index dd87aa763..ff10394d1 100644
--- a/spacy/tests/doc/test_array.py
+++ b/spacy/tests/doc/test_array.py
@@ -17,6 +17,30 @@ def test_doc_array_attr_of_token(en_tokenizer, en_vocab):
     assert feats_array[0][0] != feats_array[0][1]
 
 
+def test_doc_stringy_array_attr_of_token(en_tokenizer, en_vocab):
+    text = "An example sentence"
+    tokens = en_tokenizer(text)
+    example = tokens.vocab["example"]
+    assert example.orth != example.shape
+    feats_array = tokens.to_array((ORTH, SHAPE))
+    feats_array_stringy = tokens.to_array(("ORTH", "SHAPE"))
+    assert feats_array_stringy[0][0] == feats_array[0][0]
+    assert feats_array_stringy[0][1] == feats_array[0][1]
+    # Attribute names are documented as case-insensitive ('LEMMA' or 'lemma').
+    feats_array_lower = tokens.to_array(("orth", "shape"))
+    assert feats_array_lower[0][0] == feats_array[0][0]
+    assert feats_array_lower[0][1] == feats_array[0][1]
+
+
+def test_doc_scalar_attr_of_token(en_tokenizer, en_vocab):
+    text = "An example sentence"
+    tokens = en_tokenizer(text)
+    example = tokens.vocab["example"]
+    assert example.orth != example.shape
+    feats_array = tokens.to_array(ORTH)
+    assert feats_array.shape == (3,)
+
+
 def test_doc_array_tag(en_tokenizer):
     text = "A nice sentence."
     pos = ['DET', 'ADJ', 'NOUN', 'PUNCT']
diff --git a/spacy/tokens/doc.pyx b/spacy/tokens/doc.pyx
index ad5358d9a..6e7230428 100644
--- a/spacy/tokens/doc.pyx
+++ b/spacy/tokens/doc.pyx
@@ -21,7 +21,7 @@ from .token cimport Token
 from .printers import parse_tree
 from ..lexeme cimport Lexeme, EMPTY_LEXEME
 from ..typedefs cimport attr_t, flags_t
-from ..attrs import intify_attrs
+from ..attrs import intify_attrs, IDS
 from ..attrs cimport attr_id_t
 from ..attrs cimport ID, ORTH, NORM, LOWER, SHAPE, PREFIX, SUFFIX, LENGTH, CLUSTER
 from ..attrs cimport LENGTH, POS, LEMMA, TAG, DEP, HEAD, SPACY, ENT_IOB, ENT_TYPE
@@ -536,11 +536,15 @@ cdef class Doc:
 
     @cython.boundscheck(False)
     cpdef np.ndarray to_array(self, object py_attr_ids):
-        """Given a list of M attribute IDs, export the tokens to a numpy
-        `ndarray` of shape `(N, M)`, where `N` is the length of the document.
-        The values will be 32-bit integers.
+        """Export given token attributes to a numpy `ndarray`.
 
-        attr_ids (list[int]): A list of attribute ID ints.
+        If `attr_ids` is a sequence of M attributes, the output array will
+        be of shape `(N, M)`, where N is the length of the `Doc`
+        (in tokens). If `attr_ids` is a single attribute, the output shape will
+        be (N,). You can specify attributes by integer ID (e.g. spacy.attrs.LEMMA)
+        or string name (e.g. 'LEMMA' or 'lemma').
+
+        attr_ids (list / int / unicode): One attribute or a list of them (int IDs or string names).
         RETURNS (numpy.ndarray[long, ndim=2]): A feature matrix, with one row
             per word, and one column per attribute indicated in the input
             `attr_ids`.
@@ -555,11 +559,17 @@ cdef class Doc:
         cdef attr_id_t feature
         cdef np.ndarray[attr_t, ndim=2] output
         cdef np.ndarray[attr_t, ndim=1] output_1D
-        # Make an array from the attributes --- otherwise our inner loop is Python
-        # dict iteration.
+        # Handle scalar/list inputs of strings/ints for py_attr_ids
         if( type(py_attr_ids) is not list and type(py_attr_ids) is not tuple ):
             py_attr_ids = [ py_attr_ids ]
-        cdef np.ndarray[attr_t, ndim=1] attr_ids = numpy.asarray(py_attr_ids, dtype=numpy.uint64)
+        # Resolve string names (e.g. 'lemma' or 'LEMMA') to int IDs. Duck-type on
+        # `.upper()` instead of `type(x) is int`, so int subclasses and numpy
+        # integer scalars pass through as IDs rather than raising AttributeError.
+        py_attr_ids_input = [(IDS[py_attr_id.upper()] if hasattr(py_attr_id, "upper") else py_attr_id)
+                             for py_attr_id in py_attr_ids]
+        # Make an array from the attributes --- otherwise our inner loop is Python
+        # dict iteration.
+        cdef np.ndarray[attr_t, ndim=1] attr_ids = numpy.asarray(py_attr_ids_input, dtype=numpy.uint64)
         output = numpy.ndarray(shape=(self.length, len(attr_ids)), dtype=numpy.uint64)
         for i in range(self.length):
             for j, feature in enumerate(attr_ids):