Convert all individual values explicitly to uint64 for array-based doc representations

Adriane Boyd 2022-12-05 16:08:42 +01:00
parent 4b2097a271
commit a7215e345a
4 changed files with 16 additions and 15 deletions
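
The changes below mostly follow one pattern: instead of assigning plain Python ints (which can be negative, e.g. relative HEAD offsets) straight into uint64 arrays, each value is first converted explicitly with numpy.array(...).astype(numpy.uint64). A minimal sketch of the difference, assuming this targets NumPy's deprecation of out-of-bound Python integer conversion (a DeprecationWarning in NumPy 1.24, an error in later releases):

import numpy

arr = numpy.zeros(3, dtype=numpy.uint64)

# Assigning a negative Python int directly warns on NumPy 1.24 and errors on
# later releases, because -1 is out of bounds for uint64:
#   arr[0] = -1

# The explicit conversion wraps the value modulo 2**64 without complaint:
arr[0] = numpy.array(-1).astype(numpy.uint64)
print(arr[0])  # 18446744073709551615 == 2**64 - 1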

View File

@@ -123,14 +123,14 @@ def test_doc_from_array_heads_in_bounds(en_vocab):
     # head before start
     arr = doc.to_array(["HEAD"])
-    arr[0] = -1
+    arr[0] = numpy.array(-1).astype(numpy.uint64)
     doc_from_array = Doc(en_vocab, words=words)
     with pytest.raises(ValueError):
         doc_from_array.from_array(["HEAD"], arr)
     # head after end
     arr = doc.to_array(["HEAD"])
-    arr[0] = 5
+    arr[0] = numpy.array(5).astype(numpy.uint64)
     doc_from_array = Doc(en_vocab, words=words)
     with pytest.raises(ValueError):
         doc_from_array.from_array(["HEAD"], arr)

View File

@@ -361,9 +361,9 @@ cdef class Doc:
                     if annot is heads or annot is sent_starts or annot is ent_iobs:
                         for i in range(len(words)):
                             if attrs.ndim == 1:
-                                attrs[i] = annot[i]
+                                attrs[i] = numpy.array(annot[i]).astype(numpy.uint64)
                             else:
-                                attrs[i, j] = annot[i]
+                                attrs[i, j] = numpy.array(annot[i]).astype(numpy.uint64)
                     elif annot is morphs:
                         for i in range(len(words)):
                             morph_key = vocab.morphology.add(morphs[i])
@@ -981,7 +981,7 @@ cdef class Doc:
         for i in range(self.length):
             token = &self.c[i]
             for j in range(nr_attr):
-                c_output[i*nr_attr + j] = get_token_attr(token, c_attr_ids[j])
+                c_output[i*nr_attr + j] = numpy.array(get_token_attr(token, c_attr_ids[j])).astype(numpy.uint64)
         # Handle 1d case
         return output if len(attr_ids) >= 2 else output.reshape((self.length,))
@@ -1559,7 +1559,7 @@ cdef class Doc:
         for j, (attr, annot) in enumerate(token_annotations.items()):
             if attr is HEAD:
                 for i in range(len(words)):
-                    array[i, j] = annot[i]
+                    array[i, j] = numpy.array(annot[i]).astype(numpy.uint64)
             elif attr is MORPH:
                 for i in range(len(words)):
                     array[i, j] = self.vocab.morphology.add(annot[i])
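
A rough usage sketch of the round trip these Doc changes keep working (example words, heads and deps are made up, not from the commit): HEAD values in to_array() output are relative offsets stored in a uint64 array, so negative offsets appear wrapped around, and from_array() restores the original parse:

import spacy
from spacy.tokens import Doc

nlp = spacy.blank("en")
words = ["I", "like", "cats"]
doc = Doc(nlp.vocab, words=words, heads=[1, 1, 1], deps=["nsubj", "ROOT", "dobj"])

arr = doc.to_array(["HEAD"])          # dtype uint64; the -1 offset of "cats" wraps to 2**64 - 1
new_doc = Doc(nlp.vocab, words=words)
new_doc.from_array(["HEAD"], arr)
assert [t.head.i for t in new_doc] == [1, 1, 1]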

View File

@@ -299,7 +299,7 @@ cdef class Span:
                     for ancestor in ancestors:
                         ancestor_i = ancestor.i - self.c.start
                         if ancestor_i in range(length):
-                            array[i, head_col] = ancestor_i - i
+                            array[i, head_col] = numpy.array(ancestor_i - i).astype(numpy.uint64)
                     # if there is no appropriate ancestor, define a new artificial root
                     value = array[i, head_col]
@@ -307,7 +307,7 @@ cdef class Span:
                         new_root = old_to_new_root.get(ancestor_i, None)
                         if new_root is not None:
                             # take the same artificial root as a previous token from the same sentence
-                            array[i, head_col] = new_root - i
+                            array[i, head_col] = numpy.array(new_root - i).astype(numpy.uint64)
                         else:
                             # set this token as the new artificial root
                             array[i, head_col] = 0
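
The Span change sits in the head-fixing step used when a span is copied out as its own Doc: heads that point outside the span are remapped to an in-span ancestor or to a new artificial root, and those relative offsets can be negative. A hedged sketch of the effect (sentence and parse made up):

import spacy
from spacy.tokens import Doc

nlp = spacy.blank("en")
words = ["She", "saw", "the", "big", "dog"]
doc = Doc(nlp.vocab, words=words, heads=[1, 1, 4, 4, 1],
          deps=["nsubj", "ROOT", "det", "amod", "dobj"])

# Copy "the big dog" into a standalone Doc: "dog" originally points at "saw",
# which is outside the span, so it becomes the artificial root of the new Doc.
span_doc = doc[2:5].as_doc()
print([t.head.i for t in span_doc])  # [2, 2, 2]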

View File

@@ -443,26 +443,27 @@ def _annot2array(vocab, tok_annot, doc_annot):
         if key not in IDS:
             raise ValueError(Errors.E974.format(obj="token", key=key))
         elif key in ["ORTH", "SPACY"]:
-            pass
+            continue
         elif key == "HEAD":
             attrs.append(key)
-            values.append([h-i if h is not None else 0 for i, h in enumerate(value)])
+            row = [h-i if h is not None else 0 for i, h in enumerate(value)]
         elif key == "DEP":
             attrs.append(key)
-            values.append([vocab.strings.add(h) if h is not None else MISSING_DEP for h in value])
+            row = [vocab.strings.add(h) if h is not None else MISSING_DEP for h in value]
         elif key == "SENT_START":
             attrs.append(key)
-            values.append([to_ternary_int(v) for v in value])
+            row = [to_ternary_int(v) for v in value]
         elif key == "MORPH":
             attrs.append(key)
-            values.append([vocab.morphology.add(v) for v in value])
+            row = [vocab.morphology.add(v) for v in value]
         else:
             attrs.append(key)
             if not all(isinstance(v, str) for v in value):
                 types = set([type(v) for v in value])
                 raise TypeError(Errors.E969.format(field=key, types=types)) from None
-            values.append([vocab.strings.add(v) for v in value])
-    array = numpy.asarray(values, dtype="uint64")
+            row = [vocab.strings.add(v) for v in value]
+        values.append([numpy.array(v).astype(numpy.uint64) for v in row])
+    array = numpy.array(values, dtype=numpy.uint64)
     return attrs, array.T
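
The restructuring in _annot2array builds each per-attribute row first and only then converts it cell by cell, so negative HEAD offsets never reach the uint64 array as raw Python ints. A minimal sketch of that assembly step with made-up values (not spaCy internals):

import numpy

heads = [1, 0, -1]        # relative offsets; the last one is negative
sent_start = [1, 0, 0]

values = []
for row in (heads, sent_start):
    # convert each cell explicitly; .astype() wraps negatives instead of warning or raising
    values.append([numpy.array(v).astype(numpy.uint64) for v in row])

array = numpy.array(values, dtype=numpy.uint64)
print(array.T.shape)  # (3, 2): one row per token, one column per attribute
print(array.T[2, 0])  # 18446744073709551615, i.e. -1 wrapped into uint64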