Mirror of https://github.com/explosion/spaCy.git (synced 2025-01-13 10:46:29 +03:00)
Support strings for attribute list in doc.to_array

parent 7b9b1be44c
commit b3ab124fc5
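
For orientation, a minimal usage sketch of what this change enables (not part of the diff; it assumes a spaCy install where spacy.blank() is available, and it mirrors the new tests below):

    import spacy
    from spacy.attrs import ORTH, SHAPE

    nlp = spacy.blank("en")                        # a tokenizer-only pipeline is enough for ORTH/SHAPE
    doc = nlp("An example sentence")               # N = 3 tokens

    ints_array = doc.to_array([ORTH, SHAPE])       # integer attribute IDs (previous behaviour)
    strs_array = doc.to_array(["ORTH", "SHAPE"])   # string names, enabled by this commit
    orth_only = doc.to_array("ORTH")               # a single attribute gives a 1D array of shape (3,)

    assert (ints_array == strs_array).all()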
@@ -17,6 +17,26 @@ def test_doc_array_attr_of_token(en_tokenizer, en_vocab):
     assert feats_array[0][0] != feats_array[0][1]
 
 
+def test_doc_stringy_array_attr_of_token(en_tokenizer, en_vocab):
+    text = "An example sentence"
+    tokens = en_tokenizer(text)
+    example = tokens.vocab["example"]
+    assert example.orth != example.shape
+    feats_array = tokens.to_array((ORTH, SHAPE))
+    feats_array_stringy = tokens.to_array(("ORTH", "SHAPE"))
+    assert feats_array_stringy[0][0] == feats_array[0][0]
+    assert feats_array_stringy[0][1] == feats_array[0][1]
+
+
+def test_doc_scalar_attr_of_token(en_tokenizer, en_vocab):
+    text = "An example sentence"
+    tokens = en_tokenizer(text)
+    example = tokens.vocab["example"]
+    assert example.orth != example.shape
+    feats_array = tokens.to_array(ORTH)
+    assert feats_array.shape == (3,)
+
+
 def test_doc_array_tag(en_tokenizer):
     text = "A nice sentence."
     pos = ['DET', 'ADJ', 'NOUN', 'PUNCT']

@@ -21,7 +21,7 @@ from .token cimport Token
 from .printers import parse_tree
 from ..lexeme cimport Lexeme, EMPTY_LEXEME
 from ..typedefs cimport attr_t, flags_t
-from ..attrs import intify_attrs
+from ..attrs import intify_attrs, IDS
 from ..attrs cimport attr_id_t
 from ..attrs cimport ID, ORTH, NORM, LOWER, SHAPE, PREFIX, SUFFIX, LENGTH, CLUSTER
 from ..attrs cimport LENGTH, POS, LEMMA, TAG, DEP, HEAD, SPACY, ENT_IOB, ENT_TYPE
@@ -536,11 +536,15 @@ cdef class Doc:
 
     @cython.boundscheck(False)
     cpdef np.ndarray to_array(self, object py_attr_ids):
-        """Given a list of M attribute IDs, export the tokens to a numpy
-        `ndarray` of shape `(N, M)`, where `N` is the length of the document.
-        The values will be 32-bit integers.
+        """Export given token attributes to a numpy `ndarray`.
+
+        If `attr_ids` is a sequence of M attributes, the output array will
+        be of shape `(N, M)`, where N is the length of the `Doc`
+        (in tokens). If `attr_ids` is a single attribute, the output shape will
+        be (N,). You can specify attributes by integer ID (e.g. spacy.attrs.LEMMA)
+        or string name (e.g. 'LEMMA' or 'lemma').
 
-        attr_ids (list[int]): A list of attribute ID ints.
+        attr_ids (list): A list of attributes (int IDs or string names).
         RETURNS (numpy.ndarray[long, ndim=2]): A feature matrix, with one row
             per word, and one column per attribute indicated in the input
             `attr_ids`.
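
The exported values are integer attribute IDs, not strings, so a common follow-up is to map them back through the vocabulary. A brief sketch, continuing the hypothetical nlp/doc from the example above (the StringStore lookup is standard spaCy behaviour, not part of this diff):

    orth_ids = doc.to_array("ORTH")
    words = [nlp.vocab.strings[int(i)] for i in orth_ids]   # ['An', 'example', 'sentence']
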
@@ -555,11 +559,18 @@ cdef class Doc:
         cdef attr_id_t feature
         cdef np.ndarray[attr_t, ndim=2] output
         cdef np.ndarray[attr_t, ndim=1] output_1D
-        # Make an array from the attributes --- otherwise our inner loop is Python
-        # dict iteration.
+        # Handle scalar/list inputs of strings/ints for py_attr_ids
         if( type(py_attr_ids) is not list and type(py_attr_ids) is not tuple ):
             py_attr_ids = [ py_attr_ids ]
-        cdef np.ndarray[attr_t, ndim=1] attr_ids = numpy.asarray(py_attr_ids, dtype=numpy.uint64)
+        py_attr_ids_input = []
+        for py_attr_id in py_attr_ids:
+            if( type(py_attr_id) is int ):
+                py_attr_ids_input.append(py_attr_id)
+            else:
+                py_attr_ids_input.append(IDS[py_attr_id.upper()])
+        # Make an array from the attributes --- otherwise our inner loop is Python
+        # dict iteration.
+        cdef np.ndarray[attr_t, ndim=1] attr_ids = numpy.asarray(py_attr_ids_input, dtype=numpy.uint64)
         output = numpy.ndarray(shape=(self.length, len(attr_ids)), dtype=numpy.uint64)
         for i in range(self.length):
             for j, feature in enumerate(attr_ids):
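
In plain Python terms, the added loop is roughly the following normalization step (a sketch, not the committed Cython; IDS is the name-to-ID dict imported from ..attrs in the hunk above, and normalize_attr_ids is a hypothetical helper name):

    from spacy.attrs import ORTH, SHAPE, IDS

    def normalize_attr_ids(py_attr_ids):
        # Accept a single attribute or a sequence, as to_array does.
        if not isinstance(py_attr_ids, (list, tuple)):
            py_attr_ids = [py_attr_ids]
        # Pass integer IDs through; look up string names case-insensitively.
        return [attr if isinstance(attr, int) else IDS[attr.upper()] for attr in py_attr_ids]

    assert normalize_attr_ids(("ORTH", "shape")) == [ORTH, SHAPE]
    assert normalize_attr_ids(ORTH) == [ORTH]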