mirror of
https://github.com/explosion/spaCy.git
synced 2025-01-12 10:16:27 +03:00
Add Span.to_array method
This commit is contained in:
parent
19c495f451
commit
482bba1722
|
@ -15,5 +15,5 @@ cdef class Span:
|
||||||
cdef public _vector
|
cdef public _vector
|
||||||
cdef public _vector_norm
|
cdef public _vector_norm
|
||||||
|
|
||||||
|
|
||||||
cpdef int _recalculate_indices(self) except -1
|
cpdef int _recalculate_indices(self) except -1
|
||||||
|
cpdef np.ndarray to_array(self, object features)
|
||||||
|
|
|
@ -7,7 +7,7 @@ import numpy
|
||||||
import numpy.linalg
|
import numpy.linalg
|
||||||
from libc.math cimport sqrt
|
from libc.math cimport sqrt
|
||||||
|
|
||||||
from .doc cimport token_by_start, token_by_end
|
from .doc cimport token_by_start, token_by_end, get_token_attr
|
||||||
from ..structs cimport TokenC, LexemeC
|
from ..structs cimport TokenC, LexemeC
|
||||||
from ..typedefs cimport flags_t, attr_t, hash_t
|
from ..typedefs cimport flags_t, attr_t, hash_t
|
||||||
from ..attrs cimport attr_id_t
|
from ..attrs cimport attr_id_t
|
||||||
|
@ -135,6 +135,28 @@ cdef class Span:
|
||||||
return 0.0
|
return 0.0
|
||||||
return numpy.dot(self.vector, other.vector) / (self.vector_norm * other.vector_norm)
|
return numpy.dot(self.vector, other.vector) / (self.vector_norm * other.vector_norm)
|
||||||
|
|
||||||
|
cpdef np.ndarray to_array(self, object py_attr_ids):
    """Given a list of M attribute IDs, export the span's tokens to a numpy
    `ndarray` of shape `(N, M)`, where `N` is the number of tokens in the
    span (`self.end - self.start`). The values are unsigned 64-bit
    integers.

    py_attr_ids (list[int]): A list of attribute ID ints.
    RETURNS (numpy.ndarray[uint64, ndim=2]): A feature matrix, with one row
        per token in the span, and one column per attribute indicated in
        the input `py_attr_ids`.
    """
    cdef int i, j
    cdef attr_id_t feature
    cdef np.ndarray[attr_t, ndim=2] output
    # Convert the attribute IDs into a typed array up front, so the inner
    # loop reads from a typed buffer instead of a generic Python object.
    cdef np.ndarray[attr_t, ndim=1] attr_ids = numpy.asarray(py_attr_ids, dtype=numpy.uint64)
    # Derive the row count from the span bounds rather than relying on a
    # separate `length` attribute.
    cdef int length = self.end - self.start
    output = numpy.ndarray(shape=(length, len(attr_ids)), dtype=numpy.uint64)
    for i in range(self.start, self.end):
        for j, feature in enumerate(attr_ids):
            # `i` is an absolute index into the parent doc's token array;
            # rows of `output` are relative to the start of the span.
            output[i - self.start, j] = get_token_attr(&self.doc.c[i], feature)
    return output
|
||||||
|
|
||||||
cpdef int _recalculate_indices(self) except -1:
|
cpdef int _recalculate_indices(self) except -1:
|
||||||
if self.end > self.doc.length \
|
if self.end > self.doc.length \
|
||||||
or self.doc.c[self.start].idx != self.start_char \
|
or self.doc.c[self.start].idx != self.start_char \
|
||||||
|
|
Loading…
Reference in New Issue
Block a user