From fa5724e9270c3d1f57a63a22d6ad83be12f38bc5 Mon Sep 17 00:00:00 2001
From: richardpaulhudson <richard@explosion.ai>
Date: Fri, 14 Oct 2022 20:24:32 +0200
Subject: [PATCH] Remove endianness correction in tests; encode as explicit UTF-16LE/UTF-32LE

---
 spacy/tests/test_util.py | 86 +++++++++++++++++++---------------------
 spacy/util.py            |  4 +-
 2 files changed, 43 insertions(+), 47 deletions(-)

diff --git a/spacy/tests/test_util.py b/spacy/tests/test_util.py
index 58e56c143..89e0ab1b7 100644
--- a/spacy/tests/test_util.py
+++ b/spacy/tests/test_util.py
@@ -2,16 +2,6 @@ import sys
 import spacy
 
 
-def _correct_endianness(littleendian: bytes) -> bytes:
-    if sys.byteorder == "little":
-        return littleendian
-    output = bytearray()
-    for idx in range(0, len(littleendian), 2):
-        output.append(littleendian[idx + 1])
-        output.append(littleendian[idx])
-    return bytes(output)
-
-
 def test_get_byte_arrays_for_search_chars_width_1_not_case_sensitive():
     (
         w1_search,
@@ -22,9 +12,10 @@ def test_get_byte_arrays_for_search_chars_width_1_not_case_sensitive():
         w4_finding,
     ) = spacy.util.get_byte_arrays_for_search_chars("bfEWfwe", False)
     assert w1_search == b"BEFWbefw"
-    assert w2_search == _correct_endianness(b"B\x00E\x00F\x00W\x00b\x00e\x00f\x00w\x00")
-    assert w4_search == _correct_endianness(
-        b"B\x00\x00\x00E\x00\x00\x00F\x00\x00\x00W\x00\x00\x00b\x00\x00\x00e\x00\x00\x00f\x00\x00\x00w\x00\x00\x00"
+    assert w2_search == b"B\x00E\x00F\x00W\x00b\x00e\x00f\x00w\x00"
+    assert (
+        w4_search
+        == b"B\x00\x00\x00E\x00\x00\x00F\x00\x00\x00W\x00\x00\x00b\x00\x00\x00e\x00\x00\x00f\x00\x00\x00w\x00\x00\x00"
     )
     assert w1_finding == w2_finding == w4_finding == w4_search.lower()
 
@@ -54,21 +45,19 @@ def test_get_byte_arrays_for_search_chars_width_2_not_case_sensitive():
         w4_finding,
     ) = spacy.util.get_byte_arrays_for_search_chars("bféwfw", False)
     assert w1_search == b"BFWbfw"
-    assert w1_finding == _correct_endianness(
-        b"b\x00\x00\x00f\x00\x00\x00w\x00\x00\x00b\x00\x00\x00f\x00\x00\x00w\x00\x00\x00"
-    )
-    assert w2_search == _correct_endianness(
-        b"B\x00F\x00W\x00b\x00f\x00w\x00\xc9\x00\xe9\x00"
+    assert (
+        w1_finding
+        == b"b\x00\x00\x00f\x00\x00\x00w\x00\x00\x00b\x00\x00\x00f\x00\x00\x00w\x00\x00\x00"
     )
+    assert w2_search == b"B\x00F\x00W\x00b\x00f\x00w\x00\xc9\x00\xe9\x00"
     assert (
         w2_finding
         == w4_finding
-        == _correct_endianness(
-            b"b\x00\x00\x00f\x00\x00\x00w\x00\x00\x00b\x00\x00\x00f\x00\x00\x00w\x00\x00\x00\xe9\x00\x00\x00\xe9\x00\x00\x00"
-        )
+        == b"b\x00\x00\x00f\x00\x00\x00w\x00\x00\x00b\x00\x00\x00f\x00\x00\x00w\x00\x00\x00\xe9\x00\x00\x00\xe9\x00\x00\x00"
     )
-    assert w4_search == _correct_endianness(
-        b"B\x00\x00\x00F\x00\x00\x00W\x00\x00\x00b\x00\x00\x00f\x00\x00\x00w\x00\x00\x00\xc9\x00\x00\x00\xe9\x00\x00\x00"
+    assert (
+        w4_search
+        == b"B\x00\x00\x00F\x00\x00\x00W\x00\x00\x00b\x00\x00\x00f\x00\x00\x00w\x00\x00\x00\xc9\x00\x00\x00\xe9\x00\x00\x00"
     )
 
 
@@ -82,15 +71,13 @@ def test_get_byte_arrays_for_search_chars_width_2_case_sensitive():
         w4_finding,
     ) = spacy.util.get_byte_arrays_for_search_chars("bféwfw", True)
     assert w1_search == b"bfw"
-    assert w1_finding == _correct_endianness(b"b\x00\x00\x00f\x00\x00\x00w\x00\x00\x00")
-    assert w2_search == _correct_endianness(b"b\x00f\x00w\x00\xe9\x00")
+    assert w1_finding == b"b\x00\x00\x00f\x00\x00\x00w\x00\x00\x00"
+    assert w2_search == b"b\x00f\x00w\x00\xe9\x00"
     assert (
         w2_finding
         == w4_finding
         == w4_search
-        == _correct_endianness(
-            b"b\x00\x00\x00f\x00\x00\x00w\x00\x00\x00\xe9\x00\x00\x00"
-        )
+        == b"b\x00\x00\x00f\x00\x00\x00w\x00\x00\x00\xe9\x00\x00\x00"
     )
 
 
@@ -104,20 +91,26 @@ def test_get_byte_arrays_for_search_chars_width_4_not_case_sensitive():
         w4_finding,
     ) = spacy.util.get_byte_arrays_for_search_chars("bfé𐌞wf𐌞wÉ", False)
     assert w1_search == b"BFWbfw"
-    assert w1_finding == _correct_endianness(
-        b"b\x00\x00\x00f\x00\x00\x00w\x00\x00\x00b\x00\x00\x00f\x00\x00\x00w\x00\x00\x00"
+    assert (
+        w1_finding
+        == b"b\x00\x00\x00f\x00\x00\x00w\x00\x00\x00b\x00\x00\x00f\x00\x00\x00w\x00\x00\x00"
     )
-    assert w2_search == _correct_endianness(
-        b"B\x00F\x00W\x00b\x00f\x00w\x00\xc9\x00\xe9\x00"
+
+    assert w2_search == b"B\x00F\x00W\x00b\x00f\x00w\x00\xc9\x00\xe9\x00"
+
+    assert (
+        w2_finding
+        == b"b\x00\x00\x00f\x00\x00\x00w\x00\x00\x00b\x00\x00\x00f\x00\x00\x00w\x00\x00\x00\xe9\x00\x00\x00\xe9\x00\x00\x00"
     )
-    assert w2_finding == _correct_endianness(
-        b"b\x00\x00\x00f\x00\x00\x00w\x00\x00\x00b\x00\x00\x00f\x00\x00\x00w\x00\x00\x00\xe9\x00\x00\x00\xe9\x00\x00\x00"
+
+    assert (
+        w4_search
+        == b"\x1e\x03\x01\x00B\x00\x00\x00F\x00\x00\x00W\x00\x00\x00b\x00\x00\x00f\x00\x00\x00w\x00\x00\x00\xc9\x00\x00\x00\xe9\x00\x00\x00"
     )
-    assert w4_search == _correct_endianness(
-        b"\x1e\x03\x01\x00B\x00\x00\x00F\x00\x00\x00W\x00\x00\x00b\x00\x00\x00f\x00\x00\x00w\x00\x00\x00\xc9\x00\x00\x00\xe9\x00\x00\x00"
-    )
-    assert w4_finding == _correct_endianness(
-        b"\x1e\x03\x01\x00b\x00\x00\x00f\x00\x00\x00w\x00\x00\x00b\x00\x00\x00f\x00\x00\x00w\x00\x00\x00\xe9\x00\x00\x00\xe9\x00\x00\x00"
+
+    assert (
+        w4_finding
+        == b"\x1e\x03\x01\x00b\x00\x00\x00f\x00\x00\x00w\x00\x00\x00b\x00\x00\x00f\x00\x00\x00w\x00\x00\x00\xe9\x00\x00\x00\xe9\x00\x00\x00"
     )
 
 
@@ -131,12 +124,15 @@ def test_get_byte_arrays_for_search_chars_width_4_case_sensitive():
         w4_finding,
     ) = spacy.util.get_byte_arrays_for_search_chars("bfé𐌞wf𐌞wÉ", True)
     assert w1_search == b"bfw"
-    assert w1_finding == _correct_endianness(b"b\x00\x00\x00f\x00\x00\x00w\x00\x00\x00")
-    assert w2_search == _correct_endianness(b"b\x00f\x00w\x00\xc9\x00\xe9\x00")
-    assert w2_finding == _correct_endianness(
-        b"b\x00\x00\x00f\x00\x00\x00w\x00\x00\x00\xc9\x00\x00\x00\xe9\x00\x00\x00"
+    assert w1_finding == b"b\x00\x00\x00f\x00\x00\x00w\x00\x00\x00"
+    assert w2_search == b"b\x00f\x00w\x00\xc9\x00\xe9\x00"
+    assert (
+        w2_finding
+        == b"b\x00\x00\x00f\x00\x00\x00w\x00\x00\x00\xc9\x00\x00\x00\xe9\x00\x00\x00"
     )
+
     assert w4_search == w4_finding
-    assert w4_finding == _correct_endianness(
-        b"\x1e\x03\x01\x00b\x00\x00\x00f\x00\x00\x00w\x00\x00\x00\xc9\x00\x00\x00\xe9\x00\x00\x00"
+    assert (
+        w4_finding
+        == b"\x1e\x03\x01\x00b\x00\x00\x00f\x00\x00\x00w\x00\x00\x00\xc9\x00\x00\x00\xe9\x00\x00\x00"
     )
diff --git a/spacy/util.py b/spacy/util.py
index 47509aac8..16e38917d 100644
--- a/spacy/util.py
+++ b/spacy/util.py
@@ -1772,9 +1772,9 @@ def get_byte_arrays_for_search_chars(
         int: the width of the character encoding to use
         """
         if width == 4:
-            return ch.encode("UTF-32")[width:]  # remove byte order mark
+            return ch.encode("UTF-32LE")
         elif width == 2:
-            return ch.encode("UTF-16")[width:]  # remove byte order mark
+            return ch.encode("UTF-16LE")
         else:
             return ch.encode("UTF-8")