From b9c524917a2329bdcfcc5567424488039a574ccb Mon Sep 17 00:00:00 2001
From: Adriane Boyd <adrianeboyd@gmail.com>
Date: Tue, 6 Dec 2022 18:00:05 +0100
Subject: [PATCH] Cast to uint64 for all array-based doc representations

---
 spacy/cli/pretrain.py                   | 2 +-
 spacy/tests/doc/test_array.py           | 5 +++--
 spacy/tests/lang/en/test_noun_chunks.py | 6 +++---
 spacy/tests/util.py                     | 5 +++--
 spacy/tokens/doc.pyx                    | 6 +++---
 spacy/tokens/span.pyx                   | 4 ++--
 6 files changed, 15 insertions(+), 13 deletions(-)

diff --git a/spacy/cli/pretrain.py b/spacy/cli/pretrain.py
index e949f76cf..629993d7b 100644
--- a/spacy/cli/pretrain.py
+++ b/spacy/cli/pretrain.py
@@ -295,7 +295,7 @@ def make_docs(nlp, batch, min_length, max_length):
             raise ValueError(Errors.E138.format(text=record))
         if "heads" in record:
             heads = record["heads"]
-            heads = numpy.asarray(heads, dtype="uint64")
+            heads = numpy.asarray([numpy.array(h).astype(numpy.uint64) for h in heads], dtype="uint64")
             heads = heads.reshape((len(doc), 1))
             doc = doc.from_array([HEAD], heads)
         if len(doc) >= min_length and len(doc) < max_length:
diff --git a/spacy/tests/doc/test_array.py b/spacy/tests/doc/test_array.py
index 09a6f9c4b..e2001824c 100644
--- a/spacy/tests/doc/test_array.py
+++ b/spacy/tests/doc/test_array.py
@@ -1,6 +1,7 @@
 # coding: utf-8
 from __future__ import unicode_literals
 
+import numpy
 import pytest
 from spacy.tokens import Doc
 from spacy.attrs import ORTH, SHAPE, POS, DEP
@@ -91,14 +92,14 @@ def test_doc_from_array_heads_in_bounds(en_vocab):
 
     # head before start
     arr = doc.to_array(["HEAD"])
-    arr[0] = -1
+    arr[0] = numpy.array(-1).astype(numpy.uint64)
     doc_from_array = Doc(en_vocab, words=words)
     with pytest.raises(ValueError):
         doc_from_array.from_array(["HEAD"], arr)
 
     # head after end
     arr = doc.to_array(["HEAD"])
-    arr[0] = 5
+    arr[0] = numpy.array(5).astype(numpy.uint64)
     doc_from_array = Doc(en_vocab, words=words)
     with pytest.raises(ValueError):
         doc_from_array.from_array(["HEAD"], arr)
diff --git a/spacy/tests/lang/en/test_noun_chunks.py b/spacy/tests/lang/en/test_noun_chunks.py
index ff67986a5..4426d50c2 100644
--- a/spacy/tests/lang/en/test_noun_chunks.py
+++ b/spacy/tests/lang/en/test_noun_chunks.py
@@ -37,9 +37,9 @@ def test_en_noun_chunks_not_nested(en_vocab):
                 [0, root],
                 [4, amod],
                 [3, nmod],
-                [-1, cc],
-                [-2, conj],
-                [-5, dobj],
+                [numpy.array(-1).astype(numpy.uint64), cc],
+                [numpy.array(-2).astype(numpy.uint64), conj],
+                [numpy.array(-5).astype(numpy.uint64), dobj],
             ],
             dtype="uint64",
         ),
diff --git a/spacy/tests/util.py b/spacy/tests/util.py
index 4e1c50398..359767f1b 100644
--- a/spacy/tests/util.py
+++ b/spacy/tests/util.py
@@ -58,11 +58,12 @@ def get_doc(
         for annot in annotations:
             if annot:
                 if annot is heads:
+                    annot = numpy.array(heads).astype(numpy.uint64)
                     for i in range(len(words)):
                         if attrs.ndim == 1:
-                            attrs[i] = heads[i]
+                            attrs[i] = annot[i]
                         else:
-                            attrs[i, j] = heads[i]
+                            attrs[i, j] = annot[i]
                 else:
                     for i in range(len(words)):
                         if attrs.ndim == 1:
diff --git a/spacy/tokens/doc.pyx b/spacy/tokens/doc.pyx
index 07d44d01c..3b09ddd28 100644
--- a/spacy/tokens/doc.pyx
+++ b/spacy/tokens/doc.pyx
@@ -805,7 +805,7 @@ cdef class Doc:
         `(M, N)` array of attributes.
 
         attrs (list) A list of attribute ID ints.
-        array (numpy.ndarray[ndim=2, dtype='int32']): The attribute values.
+        array (numpy.ndarray[ndim=2, dtype='uint64']): The attribute values.
         RETURNS (Doc): Itself.
 
         DOCS: https://spacy.io/api/doc#from_array
@@ -845,9 +845,9 @@ cdef class Doc:
             col = attrs.index(HEAD)
             for i in range(length):
                 # cast index to signed int
-                abs_head_index = numpy.int32(array[i, col]) + i
+                abs_head_index = array[i, col].astype(numpy.int32) + i
                 if abs_head_index < 0 or abs_head_index >= length:
-                    raise ValueError(Errors.E190.format(index=i, value=array[i, col], rel_head_index=numpy.int32(array[i, col])))
+                    raise ValueError(Errors.E190.format(index=i, value=array[i, col], rel_head_index=abs_head_index-i))
         # Do TAG first. This lets subsequent loop override stuff like POS, LEMMA
         if TAG in attrs:
             col = attrs.index(TAG)
diff --git a/spacy/tokens/span.pyx b/spacy/tokens/span.pyx
index 2ac8af9e4..a40347d23 100644
--- a/spacy/tokens/span.pyx
+++ b/spacy/tokens/span.pyx
@@ -272,7 +272,7 @@ cdef class Span:
                     for ancestor in ancestors:
                         ancestor_i = ancestor.i - self.start
                         if ancestor_i in range(length):
-                            array[i, head_col] = ancestor_i - i
+                            array[i, head_col] = numpy.array(ancestor_i - i).astype(numpy.uint64)
 
                 # if there is no appropriate ancestor, define a new artificial root
                 value = array[i, head_col]
@@ -280,7 +280,7 @@ cdef class Span:
                     new_root = old_to_new_root.get(ancestor_i, None)
                     if new_root is not None:
                         # take the same artificial root as a previous token from the same sentence
-                        array[i, head_col] = new_root - i
+                        array[i, head_col] = numpy.array(new_root - i).astype(numpy.uint64)
                     else:
                         # set this token as the new artificial root
                         array[i, head_col] = 0