mirror of
https://github.com/explosion/spaCy.git
synced 2025-01-24 08:14:15 +03:00
Convert all individual values explicitly to uint64 for array-based doc representations
This commit is contained in:
parent
4b2097a271
commit
a7215e345a
|
@@ -123,14 +123,14 @@ def test_doc_from_array_heads_in_bounds(en_vocab):
|
|||
|
||||
# head before start
|
||||
arr = doc.to_array(["HEAD"])
|
||||
arr[0] = -1
|
||||
arr[0] = numpy.array(-1).astype(numpy.uint64)
|
||||
doc_from_array = Doc(en_vocab, words=words)
|
||||
with pytest.raises(ValueError):
|
||||
doc_from_array.from_array(["HEAD"], arr)
|
||||
|
||||
# head after end
|
||||
arr = doc.to_array(["HEAD"])
|
||||
arr[0] = 5
|
||||
arr[0] = numpy.array(5).astype(numpy.uint64)
|
||||
doc_from_array = Doc(en_vocab, words=words)
|
||||
with pytest.raises(ValueError):
|
||||
doc_from_array.from_array(["HEAD"], arr)
|
||||
|
|
|
@@ -361,9 +361,9 @@ cdef class Doc:
|
|||
if annot is heads or annot is sent_starts or annot is ent_iobs:
|
||||
for i in range(len(words)):
|
||||
if attrs.ndim == 1:
|
||||
attrs[i] = annot[i]
|
||||
attrs[i] = numpy.array(annot[i]).astype(numpy.uint64)
|
||||
else:
|
||||
attrs[i, j] = annot[i]
|
||||
attrs[i, j] = numpy.array(annot[i]).astype(numpy.uint64)
|
||||
elif annot is morphs:
|
||||
for i in range(len(words)):
|
||||
morph_key = vocab.morphology.add(morphs[i])
|
||||
|
@@ -981,7 +981,7 @@ cdef class Doc:
|
|||
for i in range(self.length):
|
||||
token = &self.c[i]
|
||||
for j in range(nr_attr):
|
||||
c_output[i*nr_attr + j] = get_token_attr(token, c_attr_ids[j])
|
||||
c_output[i*nr_attr + j] = numpy.array(get_token_attr(token, c_attr_ids[j])).astype(numpy.uint64)
|
||||
# Handle 1d case
|
||||
return output if len(attr_ids) >= 2 else output.reshape((self.length,))
|
||||
|
||||
|
@@ -1559,7 +1559,7 @@ cdef class Doc:
|
|||
for j, (attr, annot) in enumerate(token_annotations.items()):
|
||||
if attr is HEAD:
|
||||
for i in range(len(words)):
|
||||
array[i, j] = annot[i]
|
||||
array[i, j] = numpy.array(annot[i]).astype(numpy.uint64)
|
||||
elif attr is MORPH:
|
||||
for i in range(len(words)):
|
||||
array[i, j] = self.vocab.morphology.add(annot[i])
|
||||
|
|
|
@@ -299,7 +299,7 @@ cdef class Span:
|
|||
for ancestor in ancestors:
|
||||
ancestor_i = ancestor.i - self.c.start
|
||||
if ancestor_i in range(length):
|
||||
array[i, head_col] = ancestor_i - i
|
||||
array[i, head_col] = numpy.array(ancestor_i - i).astype(numpy.uint64)
|
||||
|
||||
# if there is no appropriate ancestor, define a new artificial root
|
||||
value = array[i, head_col]
|
||||
|
@@ -307,7 +307,7 @@ cdef class Span:
|
|||
new_root = old_to_new_root.get(ancestor_i, None)
|
||||
if new_root is not None:
|
||||
# take the same artificial root as a previous token from the same sentence
|
||||
array[i, head_col] = new_root - i
|
||||
array[i, head_col] = numpy.array(new_root - i).astype(numpy.uint64)
|
||||
else:
|
||||
# set this token as the new artificial root
|
||||
array[i, head_col] = 0
|
||||
|
|
|
@@ -443,26 +443,27 @@ def _annot2array(vocab, tok_annot, doc_annot):
|
|||
if key not in IDS:
|
||||
raise ValueError(Errors.E974.format(obj="token", key=key))
|
||||
elif key in ["ORTH", "SPACY"]:
|
||||
pass
|
||||
continue
|
||||
elif key == "HEAD":
|
||||
attrs.append(key)
|
||||
values.append([h-i if h is not None else 0 for i, h in enumerate(value)])
|
||||
row = [h-i if h is not None else 0 for i, h in enumerate(value)]
|
||||
elif key == "DEP":
|
||||
attrs.append(key)
|
||||
values.append([vocab.strings.add(h) if h is not None else MISSING_DEP for h in value])
|
||||
row = [vocab.strings.add(h) if h is not None else MISSING_DEP for h in value]
|
||||
elif key == "SENT_START":
|
||||
attrs.append(key)
|
||||
values.append([to_ternary_int(v) for v in value])
|
||||
row = [to_ternary_int(v) for v in value]
|
||||
elif key == "MORPH":
|
||||
attrs.append(key)
|
||||
values.append([vocab.morphology.add(v) for v in value])
|
||||
row = [vocab.morphology.add(v) for v in value]
|
||||
else:
|
||||
attrs.append(key)
|
||||
if not all(isinstance(v, str) for v in value):
|
||||
types = set([type(v) for v in value])
|
||||
raise TypeError(Errors.E969.format(field=key, types=types)) from None
|
||||
values.append([vocab.strings.add(v) for v in value])
|
||||
array = numpy.asarray(values, dtype="uint64")
|
||||
row = [vocab.strings.add(v) for v in value]
|
||||
values.append([numpy.array(v).astype(numpy.uint64) for v in row])
|
||||
array = numpy.array(values, dtype=numpy.uint64)
|
||||
return attrs, array.T
|
||||
|
||||
|
||||
|
|
Loading…
Reference in New Issue
Block a user