Support strings for attribute list in doc.to_array

This commit is contained in:
Ramanan Balakrishnan 2017-10-19 19:37:14 +05:30
parent 7b9b1be44c
commit b3ab124fc5
No known key found for this signature in database
GPG Key ID: 57283041B6B6D1D1
2 changed files with 39 additions and 8 deletions

View File

@ -17,6 +17,26 @@ def test_doc_array_attr_of_token(en_tokenizer, en_vocab):
assert feats_array[0][0] != feats_array[0][1] assert feats_array[0][0] != feats_array[0][1]
def test_doc_stringy_array_attr_of_token(en_tokenizer, en_vocab):
    """Check that string attribute names export the same array as integer IDs.

    `Doc.to_array` accepts attribute names as strings and upper-cases them
    before looking them up, so both "ORTH" and "orth" must produce exactly
    the array obtained from the integer constants, for every token rather
    than just the first one.
    """
    text = "An example sentence"
    tokens = en_tokenizer(text)
    example = tokens.vocab["example"]
    # Sanity check: the two requested attributes hold different values, so a
    # swapped or duplicated column cannot slip through the comparisons below.
    assert example.orth != example.shape
    feats_array = tokens.to_array((ORTH, SHAPE))
    feats_array_stringy = tokens.to_array(("ORTH", "SHAPE"))
    feats_array_lower = tokens.to_array(("orth", "shape"))
    assert feats_array_stringy.shape == feats_array.shape
    # Compare the complete arrays (all rows, all columns).
    assert feats_array_stringy.tolist() == feats_array.tolist()
    # Lower-case names go through the same upper-casing lookup.
    assert feats_array_lower.tolist() == feats_array.tolist()
def test_doc_scalar_attr_of_token(en_tokenizer, en_vocab):
    """Check that a single, unwrapped attribute ID exports a 1-D array."""
    doc = en_tokenizer("An example sentence")
    lexeme = doc.vocab["example"]
    assert lexeme.orth != lexeme.shape
    orth_column = doc.to_array(ORTH)
    # A scalar attribute argument yields shape (3,) here, not (3, 1).
    assert orth_column.shape == (3,)
def test_doc_array_tag(en_tokenizer): def test_doc_array_tag(en_tokenizer):
text = "A nice sentence." text = "A nice sentence."
pos = ['DET', 'ADJ', 'NOUN', 'PUNCT'] pos = ['DET', 'ADJ', 'NOUN', 'PUNCT']

View File

@ -21,7 +21,7 @@ from .token cimport Token
from .printers import parse_tree from .printers import parse_tree
from ..lexeme cimport Lexeme, EMPTY_LEXEME from ..lexeme cimport Lexeme, EMPTY_LEXEME
from ..typedefs cimport attr_t, flags_t from ..typedefs cimport attr_t, flags_t
from ..attrs import intify_attrs from ..attrs import intify_attrs, IDS
from ..attrs cimport attr_id_t from ..attrs cimport attr_id_t
from ..attrs cimport ID, ORTH, NORM, LOWER, SHAPE, PREFIX, SUFFIX, LENGTH, CLUSTER from ..attrs cimport ID, ORTH, NORM, LOWER, SHAPE, PREFIX, SUFFIX, LENGTH, CLUSTER
from ..attrs cimport LENGTH, POS, LEMMA, TAG, DEP, HEAD, SPACY, ENT_IOB, ENT_TYPE from ..attrs cimport LENGTH, POS, LEMMA, TAG, DEP, HEAD, SPACY, ENT_IOB, ENT_TYPE
@ -536,11 +536,15 @@ cdef class Doc:
@cython.boundscheck(False) @cython.boundscheck(False)
cpdef np.ndarray to_array(self, object py_attr_ids): cpdef np.ndarray to_array(self, object py_attr_ids):
"""Given a list of M attribute IDs, export the tokens to a numpy """Export given token attributes to a numpy `ndarray`.
`ndarray` of shape `(N, M)`, where `N` is the length of the document.
The values will be 32-bit integers.
attr_ids (list[int]): A list of attribute ID ints. If `attr_ids` is a sequence of M attributes, the output array will
be of shape `(N, M)`, where N is the length of the `Doc`
(in tokens). If `attr_ids` is a single attribute, the output shape will
be (N,). You can specify attributes by integer ID (e.g. spacy.attrs.LEMMA)
or string name (e.g. 'LEMMA' or 'lemma').
attr_ids (list[]): A list of attributes (int IDs or string names).
RETURNS (numpy.ndarray[long, ndim=2]): A feature matrix, with one row RETURNS (numpy.ndarray[long, ndim=2]): A feature matrix, with one row
per word, and one column per attribute indicated in the input per word, and one column per attribute indicated in the input
`attr_ids`. `attr_ids`.
@ -555,11 +559,18 @@ cdef class Doc:
cdef attr_id_t feature cdef attr_id_t feature
cdef np.ndarray[attr_t, ndim=2] output cdef np.ndarray[attr_t, ndim=2] output
cdef np.ndarray[attr_t, ndim=1] output_1D cdef np.ndarray[attr_t, ndim=1] output_1D
# Make an array from the attributes --- otherwise our inner loop is Python # Handle scalar/list inputs of strings/ints for py_attr_ids
# dict iteration.
if( type(py_attr_ids) is not list and type(py_attr_ids) is not tuple ): if( type(py_attr_ids) is not list and type(py_attr_ids) is not tuple ):
py_attr_ids = [ py_attr_ids ] py_attr_ids = [ py_attr_ids ]
cdef np.ndarray[attr_t, ndim=1] attr_ids = numpy.asarray(py_attr_ids, dtype=numpy.uint64) py_attr_ids_input = []
for py_attr_id in py_attr_ids:
if( type(py_attr_id) is int ):
py_attr_ids_input.append(py_attr_id)
else:
py_attr_ids_input.append(IDS[py_attr_id.upper()])
# Make an array from the attributes --- otherwise our inner loop is Python
# dict iteration.
cdef np.ndarray[attr_t, ndim=1] attr_ids = numpy.asarray(py_attr_ids_input, dtype=numpy.uint64)
output = numpy.ndarray(shape=(self.length, len(attr_ids)), dtype=numpy.uint64) output = numpy.ndarray(shape=(self.length, len(attr_ids)), dtype=numpy.uint64)
for i in range(self.length): for i in range(self.length):
for j, feature in enumerate(attr_ids): for j, feature in enumerate(attr_ids):