From c8009c2734bff61a735077bcac790b07727543a3 Mon Sep 17 00:00:00 2001
From: Adriane Boyd <adrianeboyd@gmail.com>
Date: Mon, 12 Dec 2022 08:45:35 +0100
Subject: [PATCH] Cast to uint64 for all array-based doc representations
 (#11933)

* Convert all individual values explicitly to uint64 for array-based doc representations

* Temporarily test with latest numpy v1.24.0rc

* Remove unnecessary conversion from attr_t

* Reduce number of individual casts

* Convert specifically from int32 to uint64

* Revert "Temporarily test with latest numpy v1.24.0rc"

This reverts commit eb0e3c5006515b9a7ff52bae59484c909b8a3f65.

* Also use int32 in tests
---
 spacy/tests/doc/test_array.py |  5 +++--
 spacy/tokens/doc.pyx          |  1 +
 spacy/tokens/span.pyx         |  4 ++--
 spacy/training/example.pyx    | 15 ++++++++-------
 4 files changed, 14 insertions(+), 11 deletions(-)

diff --git a/spacy/tests/doc/test_array.py b/spacy/tests/doc/test_array.py
index ef54c581c..af04eaeea 100644
--- a/spacy/tests/doc/test_array.py
+++ b/spacy/tests/doc/test_array.py
@@ -1,4 +1,5 @@
 import pytest
+import numpy
 from spacy.tokens import Doc
 from spacy.attrs import ORTH, SHAPE, POS, DEP, MORPH
 
@@ -100,14 +101,14 @@ def test_doc_from_array_heads_in_bounds(en_vocab):
 
     # head before start
     arr = doc.to_array(["HEAD"])
-    arr[0] = -1
+    arr[0] = numpy.int32(-1).astype(numpy.uint64)
     doc_from_array = Doc(en_vocab, words=words)
     with pytest.raises(ValueError):
         doc_from_array.from_array(["HEAD"], arr)
 
     # head after end
     arr = doc.to_array(["HEAD"])
-    arr[0] = 5
+    arr[0] = numpy.int32(5).astype(numpy.uint64)
     doc_from_array = Doc(en_vocab, words=words)
     with pytest.raises(ValueError):
         doc_from_array.from_array(["HEAD"], arr)
diff --git a/spacy/tokens/doc.pyx b/spacy/tokens/doc.pyx
index a6732c7c9..53c44c3fa 100644
--- a/spacy/tokens/doc.pyx
+++ b/spacy/tokens/doc.pyx
@@ -347,6 +347,7 @@ cdef class Doc:
             for annot in annotations:
                 if annot:
                     if annot is heads or annot is sent_starts or annot is ent_iobs:
+                        annot = numpy.array(annot, dtype=numpy.int32).astype(numpy.uint64)
                         for i in range(len(words)):
                             if attrs.ndim == 1:
                                 attrs[i] = annot[i]
diff --git a/spacy/tokens/span.pyx b/spacy/tokens/span.pyx
index 093b2a4da..f39c02a11 100644
--- a/spacy/tokens/span.pyx
+++ b/spacy/tokens/span.pyx
@@ -297,7 +297,7 @@ cdef class Span:
                     for ancestor in ancestors:
                         ancestor_i = ancestor.i - self.c.start
                         if ancestor_i in range(length):
-                            array[i, head_col] = ancestor_i - i
+                            array[i, head_col] = numpy.int32(ancestor_i - i).astype(numpy.uint64)
 
                 # if there is no appropriate ancestor, define a new artificial root
                 value = array[i, head_col]
@@ -305,7 +305,7 @@ cdef class Span:
                     new_root = old_to_new_root.get(ancestor_i, None)
                     if new_root is not None:
                         # take the same artificial root as a previous token from the same sentence
-                        array[i, head_col] = new_root - i
+                        array[i, head_col] = numpy.int32(new_root - i).astype(numpy.uint64)
                     else:
                         # set this token as the new artificial root
                         array[i, head_col] = 0
diff --git a/spacy/training/example.pyx b/spacy/training/example.pyx
index 1c80dc268..ddefa866a 100644
--- a/spacy/training/example.pyx
+++ b/spacy/training/example.pyx
@@ -329,26 +329,27 @@ def _annot2array(vocab, tok_annot, doc_annot):
         if key not in IDS:
             raise ValueError(Errors.E974.format(obj="token", key=key))
         elif key in ["ORTH", "SPACY"]:
-            pass
+            continue
         elif key == "HEAD":
             attrs.append(key)
-            values.append([h-i if h is not None else 0 for i, h in enumerate(value)])
+            row = [h-i if h is not None else 0 for i, h in enumerate(value)]
         elif key == "DEP":
             attrs.append(key)
-            values.append([vocab.strings.add(h) if h is not None else MISSING_DEP for h in value])
+            row = [vocab.strings.add(h) if h is not None else MISSING_DEP for h in value]
         elif key == "SENT_START":
             attrs.append(key)
-            values.append([to_ternary_int(v) for v in value])
+            row = [to_ternary_int(v) for v in value]
         elif key == "MORPH":
             attrs.append(key)
-            values.append([vocab.morphology.add(v) for v in value])
+            row = [vocab.morphology.add(v) for v in value]
         else:
             attrs.append(key)
             if not all(isinstance(v, str) for v in value):
                 types = set([type(v) for v in value])
                 raise TypeError(Errors.E969.format(field=key, types=types)) from None
-            values.append([vocab.strings.add(v) for v in value])
-    array = numpy.asarray(values, dtype="uint64")
+            row = [vocab.strings.add(v) for v in value]
+        values.append([numpy.array(v, dtype=numpy.int32).astype(numpy.uint64) if v < 0 else v for v in row])
+    array = numpy.array(values, dtype=numpy.uint64)
     return attrs, array.T