Convert all individual values explicitly to uint64 for array-based doc representations

2025-12-21 09:04:25 +03:00 · 2022-12-05 16:08:42 +01:00 · 2022-12-05 16:08:42 +01:00 · a7215e345a
commit a7215e345a
parent 4b2097a271
4 changed files with 16 additions and 15 deletions
--- a/spacy/tests/doc/test_array.py
+++ b/spacy/tests/doc/test_array.py
@ -123,14 +123,14 @@ def test_doc_from_array_heads_in_bounds(en_vocab):
    # head before start
    arr = doc.to_array(["HEAD"])
-    arr[0] = -1
+    arr[0] = numpy.array(-1).astype(numpy.uint64)
    doc_from_array = Doc(en_vocab, words=words)
    with pytest.raises(ValueError):
        doc_from_array.from_array(["HEAD"], arr)
    # head after end
    arr = doc.to_array(["HEAD"])
-    arr[0] = 5
+    arr[0] = numpy.array(5).astype(numpy.uint64)
    doc_from_array = Doc(en_vocab, words=words)
    with pytest.raises(ValueError):
        doc_from_array.from_array(["HEAD"], arr)
--- a/spacy/tokens/doc.pyx
+++ b/spacy/tokens/doc.pyx
@ -361,9 +361,9 @@ cdef class Doc:
                    if annot is heads or annot is sent_starts or annot is ent_iobs:
                        for i in range(len(words)):
                            if attrs.ndim == 1:
-                                attrs[i] = annot[i]
+                                attrs[i] = numpy.array(annot[i]).astype(numpy.uint64)
                            else:
-                                attrs[i, j] = annot[i]
+                                attrs[i, j] = numpy.array(annot[i]).astype(numpy.uint64)
                    elif annot is morphs:
                        for i in range(len(words)):
                            morph_key = vocab.morphology.add(morphs[i])
@ -981,7 +981,7 @@ cdef class Doc:
        for i in range(self.length):
            token = &self.c[i]
            for j in range(nr_attr):
-                c_output[i*nr_attr + j] = get_token_attr(token, c_attr_ids[j])
+                c_output[i*nr_attr + j] = numpy.array(get_token_attr(token, c_attr_ids[j])).astype(numpy.uint64)
        # Handle 1d case
        return output if len(attr_ids) >= 2 else output.reshape((self.length,))
@ -1559,7 +1559,7 @@ cdef class Doc:
            for j, (attr, annot) in enumerate(token_annotations.items()):
                if attr is HEAD:
                    for i in range(len(words)):
-                        array[i, j] = annot[i]
+                        array[i, j] = numpy.array(annot[i]).astype(numpy.uint64)
                elif attr is MORPH:
                    for i in range(len(words)):
                        array[i, j] = self.vocab.morphology.add(annot[i])
--- a/spacy/tokens/span.pyx
+++ b/spacy/tokens/span.pyx
@ -299,7 +299,7 @@ cdef class Span:
                    for ancestor in ancestors:
                        ancestor_i = ancestor.i - self.c.start
                        if ancestor_i in range(length):
-                            array[i, head_col] = ancestor_i - i
+                            array[i, head_col] = numpy.array(ancestor_i - i).astype(numpy.uint64)
                # if there is no appropriate ancestor, define a new artificial root
                value = array[i, head_col]
@ -307,7 +307,7 @@ cdef class Span:
                    new_root = old_to_new_root.get(ancestor_i, None)
                    if new_root is not None:
                        # take the same artificial root as a previous token from the same sentence
-                        array[i, head_col] = new_root - i
+                        array[i, head_col] = numpy.array(new_root - i).astype(numpy.uint64)
                    else:
                        # set this token as the new artificial root
                        array[i, head_col] = 0
--- a/spacy/training/example.pyx
+++ b/spacy/training/example.pyx
@ -443,26 +443,27 @@ def _annot2array(vocab, tok_annot, doc_annot):
        if key not in IDS:
            raise ValueError(Errors.E974.format(obj="token", key=key))
        elif key in ["ORTH", "SPACY"]:
-            pass
+            continue
        elif key == "HEAD":
            attrs.append(key)
-            values.append([h-i if h is not None else 0 for i, h in enumerate(value)])
+            row = [h-i if h is not None else 0 for i, h in enumerate(value)]
        elif key == "DEP":
            attrs.append(key)
-            values.append([vocab.strings.add(h) if h is not None else MISSING_DEP for h in value])
+            row = [vocab.strings.add(h) if h is not None else MISSING_DEP for h in value]
        elif key == "SENT_START":
            attrs.append(key)
-            values.append([to_ternary_int(v) for v in value])
+            row = [to_ternary_int(v) for v in value]
        elif key == "MORPH":
            attrs.append(key)
-            values.append([vocab.morphology.add(v) for v in value])
+            row = [vocab.morphology.add(v) for v in value]
        else:
            attrs.append(key)
            if not all(isinstance(v, str) for v in value):
                types = set([type(v) for v in value])
                raise TypeError(Errors.E969.format(field=key, types=types)) from None
-            values.append([vocab.strings.add(v) for v in value])
+            row = [vocab.strings.add(v) for v in value]
-    array = numpy.asarray(values, dtype="uint64")
+        values.append([numpy.array(v).astype(numpy.uint64) for v in row])
+    array = numpy.array(values, dtype=numpy.uint64)
    return attrs, array.T