mirror of
https://github.com/explosion/spaCy.git
synced 2024-11-15 06:09:01 +03:00
Cast to uint64 for all array-based doc representations
This commit is contained in:
parent
ca0cae2074
commit
b9c524917a
|
@ -295,7 +295,7 @@ def make_docs(nlp, batch, min_length, max_length):
|
|||
raise ValueError(Errors.E138.format(text=record))
|
||||
if "heads" in record:
|
||||
heads = record["heads"]
|
||||
heads = numpy.asarray(heads, dtype="uint64")
|
||||
heads = numpy.asarray([numpy.array(h).astype(numpy.uint64) for h in heads], dtype="uint64")
|
||||
heads = heads.reshape((len(doc), 1))
|
||||
doc = doc.from_array([HEAD], heads)
|
||||
if len(doc) >= min_length and len(doc) < max_length:
|
||||
|
|
|
@ -1,6 +1,7 @@
|
|||
# coding: utf-8
|
||||
from __future__ import unicode_literals
|
||||
|
||||
import numpy
|
||||
import pytest
|
||||
from spacy.tokens import Doc
|
||||
from spacy.attrs import ORTH, SHAPE, POS, DEP
|
||||
|
@ -91,14 +92,14 @@ def test_doc_from_array_heads_in_bounds(en_vocab):
|
|||
|
||||
# head before start
|
||||
arr = doc.to_array(["HEAD"])
|
||||
arr[0] = -1
|
||||
arr[0] = numpy.array(-1).astype(numpy.uint64)
|
||||
doc_from_array = Doc(en_vocab, words=words)
|
||||
with pytest.raises(ValueError):
|
||||
doc_from_array.from_array(["HEAD"], arr)
|
||||
|
||||
# head after end
|
||||
arr = doc.to_array(["HEAD"])
|
||||
arr[0] = 5
|
||||
arr[0] = numpy.array(5).astype(numpy.uint64)
|
||||
doc_from_array = Doc(en_vocab, words=words)
|
||||
with pytest.raises(ValueError):
|
||||
doc_from_array.from_array(["HEAD"], arr)
|
||||
|
|
|
@ -37,9 +37,9 @@ def test_en_noun_chunks_not_nested(en_vocab):
|
|||
[0, root],
|
||||
[4, amod],
|
||||
[3, nmod],
|
||||
[-1, cc],
|
||||
[-2, conj],
|
||||
[-5, dobj],
|
||||
[numpy.array(-1).astype(numpy.uint64), cc],
|
||||
[numpy.array(-2).astype(numpy.uint64), conj],
|
||||
[numpy.array(-5).astype(numpy.uint64), dobj],
|
||||
],
|
||||
dtype="uint64",
|
||||
),
|
||||
|
|
|
@ -58,11 +58,12 @@ def get_doc(
|
|||
for annot in annotations:
|
||||
if annot:
|
||||
if annot is heads:
|
||||
annot = numpy.array(heads).astype(numpy.uint64)
|
||||
for i in range(len(words)):
|
||||
if attrs.ndim == 1:
|
||||
attrs[i] = heads[i]
|
||||
attrs[i] = annot[i]
|
||||
else:
|
||||
attrs[i, j] = heads[i]
|
||||
attrs[i, j] = annot[i]
|
||||
else:
|
||||
for i in range(len(words)):
|
||||
if attrs.ndim == 1:
|
||||
|
|
|
@ -805,7 +805,7 @@ cdef class Doc:
|
|||
`(M, N)` array of attributes.
|
||||
|
||||
attrs (list) A list of attribute ID ints.
|
||||
array (numpy.ndarray[ndim=2, dtype='int32']): The attribute values.
|
||||
array (numpy.ndarray[ndim=2, dtype='uint64']): The attribute values.
|
||||
RETURNS (Doc): Itself.
|
||||
|
||||
DOCS: https://spacy.io/api/doc#from_array
|
||||
|
@ -845,9 +845,9 @@ cdef class Doc:
|
|||
col = attrs.index(HEAD)
|
||||
for i in range(length):
|
||||
# cast index to signed int
|
||||
abs_head_index = numpy.int32(array[i, col]) + i
|
||||
abs_head_index = array[i, col].astype(numpy.int32) + i
|
||||
if abs_head_index < 0 or abs_head_index >= length:
|
||||
raise ValueError(Errors.E190.format(index=i, value=array[i, col], rel_head_index=numpy.int32(array[i, col])))
|
||||
raise ValueError(Errors.E190.format(index=i, value=array[i, col], rel_head_index=abs_head_index-i))
|
||||
# Do TAG first. This lets subsequent loop override stuff like POS, LEMMA
|
||||
if TAG in attrs:
|
||||
col = attrs.index(TAG)
|
||||
|
|
|
@ -272,7 +272,7 @@ cdef class Span:
|
|||
for ancestor in ancestors:
|
||||
ancestor_i = ancestor.i - self.start
|
||||
if ancestor_i in range(length):
|
||||
array[i, head_col] = ancestor_i - i
|
||||
array[i, head_col] = numpy.array(ancestor_i - i).astype(numpy.uint64)
|
||||
|
||||
# if there is no appropriate ancestor, define a new artificial root
|
||||
value = array[i, head_col]
|
||||
|
@ -280,7 +280,7 @@ cdef class Span:
|
|||
new_root = old_to_new_root.get(ancestor_i, None)
|
||||
if new_root is not None:
|
||||
# take the same artificial root as a previous token from the same sentence
|
||||
array[i, head_col] = new_root - i
|
||||
array[i, head_col] = numpy.array(new_root - i).astype(numpy.uint64)
|
||||
else:
|
||||
# set this token as the new artificial root
|
||||
array[i, head_col] = 0
|
||||
|
|
Loading…
Reference in New Issue
Block a user