Intermediate state

richardpaulhudson 2022-10-19 23:20:11 +02:00
parent 356a341096
commit 2707d30ce0
7 changed files with 249 additions and 405 deletions

View File

@@ -1,4 +1,5 @@
 from typing import List, Optional, Callable, Tuple
+from ..util import get_byte_arrays_for_search_chars
 from thinc.types import Ints2d
 from thinc.api import Model, registry
@@ -16,22 +17,30 @@ def RichFeatureExtractor(
     suff_search_chars: Optional[str] = None,
     suff_search_lengths: Optional[List[int]] = None,
 ) -> Model[List[Doc], List[Ints2d]]:
+    if pref_search_chars is not None:
+        pref_search, pref_ref = get_byte_arrays_for_search_chars(pref_search_chars, case_sensitive)
+    else:
+        pref_search, pref_ref = bytes(), bytes()
+    if suff_search_chars is not None:
+        suff_search, suff_ref = get_byte_arrays_for_search_chars(suff_search_chars, case_sensitive)
+    else:
+        suff_search, suff_ref = bytes(), bytes()
     return Model(
         "extract_character_combination_hashes",
         forward,
         attrs={
             "case_sensitive": case_sensitive,
             "pref_lengths": pref_lengths if pref_lengths is not None else [],
-            "pref_search_chars": pref_search_chars
-            if pref_search_chars is not None
-            else "",
+            "suff_lengths": suff_lengths if suff_lengths is not None else [],
+            "pref_search": pref_search,
+            "pref_ref": pref_ref,
+            "pref_s_char_l": len(pref_search) // 4 if pref_search_chars is not None else 0,
             "pref_search_lengths": pref_search_lengths
             if pref_search_lengths is not None
             else [],
-            "suff_lengths": suff_lengths if suff_lengths is not None else [],
-            "suff_search_chars": suff_search_chars
-            if suff_search_chars is not None
-            else "",
+            "suff_search": suff_search,
+            "suff_ref": suff_ref,
+            "suff_s_char_l": len(suff_search) // 4 if suff_search_chars is not None else 0,
             "suff_search_lengths": suff_search_lengths
             if suff_search_lengths is not None
             else [],
@@ -45,10 +54,14 @@ def forward(
     ops = model.ops
     case_sensitive: bool = model.attrs["case_sensitive"]
     pref_lengths: List[int] = model.attrs["pref_lengths"]
-    pref_search_chars: str = model.attrs["pref_search_chars"]
-    pref_search_lengths: List[int] = model.attrs["pref_search_lengths"]
     suff_lengths: List[int] = model.attrs["suff_lengths"]
-    suff_search_chars: str = model.attrs["suff_search_chars"]
+    pref_search: bytes = model.attrs["pref_search"]
+    pref_ref: bytes = model.attrs["pref_ref"]
+    pref_s_char_l: int = model.attrs["pref_s_char_l"]
+    pref_search_lengths: List[int] = model.attrs["pref_search_lengths"]
+    suff_search: bytes = model.attrs["suff_search"]
+    suff_ref: bytes = model.attrs["suff_ref"]
+    suff_s_char_l: int = model.attrs["suff_s_char_l"]
     suff_search_lengths: List[int] = model.attrs["suff_search_lengths"]
     features: List[Ints2d] = []
     for doc in docs:
@@ -56,9 +69,13 @@ def forward(
             case_sensitive=case_sensitive,
             pref_lengths=pref_lengths,
             suff_lengths=suff_lengths,
-            pref_search_chars=pref_search_chars,
+            pref_search=pref_search,
+            pref_ref=pref_ref,
+            pref_s_char_l=pref_s_char_l,
             pref_search_lengths=pref_search_lengths,
-            suff_search_chars=suff_search_chars,
+            suff_search=suff_search,
+            suff_ref=suff_ref,
+            suff_s_char_l=suff_s_char_l,
             suff_search_lengths=suff_search_lengths,
         )
         features.append(ops.asarray2i(hashes))
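
The search data is now precomputed once at model-construction time instead of being re-derived on every call. A minimal sketch of the attrs contract, assuming only what the util function and the tests elsewhere in this commit establish: the search and ref arrays are parallel sequences of 4-byte UTF-32LE code units, so the character count handed to the Cython layer is the byte length divided by 4.

    from spacy.util import get_byte_arrays_for_search_chars

    # "aC" with case folding expands to four search characters: A, C, a, c.
    pref_search, pref_ref = get_byte_arrays_for_search_chars("aC", False)
    assert len(pref_search) == len(pref_ref)  # parallel arrays
    assert len(pref_search) % 4 == 0          # whole UTF-32LE code units
    pref_s_char_l = len(pref_search) // 4     # == 4 for this input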

View File

@@ -14,6 +14,7 @@ from spacy.lang.xx import MultiLanguage
 from spacy.language import Language
 from spacy.lexeme import Lexeme
 from spacy.tokens import Doc, Span, SpanGroup, Token
+from spacy.util import get_byte_arrays_for_search_chars
 from spacy.vocab import Vocab
 from .test_underscore import clean_underscore  # noqa: F401
@@ -994,7 +995,8 @@ def test_doc_spans_setdefault(en_tokenizer):
 def _get_unsigned_32_bit_hash(input: str) -> int:
-    working_hash = hash(input.encode("UTF-16")[2:])
+    input = input.replace(" ", "\x00")
+    working_hash = hash(input.encode("UTF-32LE"))
     if working_hash < 0:
         working_hash = working_hash + (2 << 31)
     return working_hash
@@ -1004,15 +1006,21 @@ def _get_unsigned_32_bit_hash(input: str) -> int:
 def test_get_character_combination_hashes_good_case(en_tokenizer, case_sensitive):
     doc = en_tokenizer("spaCy✨ and Prodigy")
+    suff_search, suff_ref = get_byte_arrays_for_search_chars("xx✨rp", case_sensitive)
     hashes = doc.get_character_combination_hashes(
-        case_sensitive=case_sensitive,
+        cs=case_sensitive,
         pref_lengths=[1, 4, 3],
         suff_lengths=[2, 3, 4, 5],
-        pref_search_chars="",
+        pref_search=bytes(),
+        pref_ref=bytes(),
+        pref_s_char_l=0,
         pref_search_lengths=[2],
-        suff_search_chars="xx✨rp",
-        suff_search_lengths=[2, 1],
+        suff_search=suff_search,
+        suff_ref=suff_ref,
+        suff_s_char_l=5 if case_sensitive else 9,
+        suff_search_lengths=[2, 1],
     )
     assert hashes[0][0] == _get_unsigned_32_bit_hash("s")
     assert hashes[0][1] == _get_unsigned_32_bit_hash(
         "spaC" if case_sensitive else "spac"
@@ -1031,22 +1039,22 @@ def test_get_character_combination_hashes_good_case(en_tokenizer, case_sensitive
     assert hashes[0][8] == _get_unsigned_32_bit_hash("p ")
     assert hashes[0][9] == _get_unsigned_32_bit_hash("p")
     assert hashes[1][0] == _get_unsigned_32_bit_hash("")
-    assert hashes[1][1] == _get_unsigned_32_bit_hash("")
-    assert hashes[1][2] == _get_unsigned_32_bit_hash("")
-    assert hashes[1][3] == _get_unsigned_32_bit_hash("")
-    assert hashes[1][4] == _get_unsigned_32_bit_hash("")
-    assert hashes[1][5] == _get_unsigned_32_bit_hash("")
-    assert hashes[1][6] == _get_unsigned_32_bit_hash("")
+    assert hashes[1][1] == _get_unsigned_32_bit_hash(" ")
+    assert hashes[1][2] == _get_unsigned_32_bit_hash(" ")
+    assert hashes[1][3] == _get_unsigned_32_bit_hash(" ")
+    assert hashes[1][4] == _get_unsigned_32_bit_hash(" ")
+    assert hashes[1][5] == _get_unsigned_32_bit_hash(" ")
+    assert hashes[1][6] == _get_unsigned_32_bit_hash(" ")
     assert hashes[1][7] == _get_unsigned_32_bit_hash("  ")
     assert hashes[1][8] == _get_unsigned_32_bit_hash("")
     assert hashes[1][9] == _get_unsigned_32_bit_hash("")
     assert hashes[2][0] == _get_unsigned_32_bit_hash("a")
-    assert hashes[2][1] == _get_unsigned_32_bit_hash("and")
+    assert hashes[2][1] == _get_unsigned_32_bit_hash("and ")
     assert hashes[2][2] == _get_unsigned_32_bit_hash("and")
     assert hashes[2][3] == _get_unsigned_32_bit_hash("nd")
     assert hashes[2][4] == _get_unsigned_32_bit_hash("and")
-    assert hashes[2][5] == _get_unsigned_32_bit_hash("and")
-    assert hashes[2][6] == _get_unsigned_32_bit_hash("and")
+    assert hashes[2][5] == _get_unsigned_32_bit_hash(" and")
+    assert hashes[2][6] == _get_unsigned_32_bit_hash("  and")
     assert hashes[2][7] == _get_unsigned_32_bit_hash("  ")
     assert hashes[2][8] == _get_unsigned_32_bit_hash("  ")
     assert hashes[2][9] == _get_unsigned_32_bit_hash(" ")
@@ -1076,17 +1084,23 @@ def test_get_character_combination_hashes_good_case(en_tokenizer, case_sensitive
 def test_get_character_combination_hashes_4_byte_char_at_end(en_tokenizer):
     doc = en_tokenizer("and𐌞")
+    suff_search, suff_ref = get_byte_arrays_for_search_chars("a", True)
     hashes = doc.get_character_combination_hashes(
-        case_sensitive=True,
+        cs=True,
         pref_lengths=[],
         suff_lengths=[1, 2, 3],
-        pref_search_chars="",
+        pref_search=bytes(),
+        pref_ref=bytes(),
+        pref_s_char_l=0,
         pref_search_lengths=[],
-        suff_search_chars="a",
+        suff_search=suff_search,
+        suff_ref=suff_ref,
+        suff_s_char_l=1,
         suff_search_lengths=[1],
     )
-    assert hashes[0][1] == _get_unsigned_32_bit_hash("𐌞")
-    assert hashes[0][2] == _get_unsigned_32_bit_hash("d𐌞")
+    assert hashes[0][0] == _get_unsigned_32_bit_hash("𐌞")
+    assert hashes[0][1] == _get_unsigned_32_bit_hash("d𐌞")
+    assert hashes[0][2] == _get_unsigned_32_bit_hash("nd𐌞")
     assert hashes[0][3] == _get_unsigned_32_bit_hash("a")

View File

@@ -1,138 +1,55 @@
-import sys
 import spacy


-def test_get_byte_arrays_for_search_chars_width_1_not_case_sensitive():
-    (
-        w1_search,
-        w1_finding,
-        w2_search,
-        w2_finding,
-        w4_search,
-        w4_finding,
-    ) = spacy.util.get_byte_arrays_for_search_chars("bfEWfwe", False)
-    assert w1_search == b"BEFWbefw"
-    assert w2_search == b"B\x00E\x00F\x00W\x00b\x00e\x00f\x00w\x00"
-    assert (
-        w4_search
-        == b"B\x00\x00\x00E\x00\x00\x00F\x00\x00\x00W\x00\x00\x00b\x00\x00\x00e\x00\x00\x00f\x00\x00\x00w\x00\x00\x00"
-    )
-    assert w1_finding == w2_finding == w4_finding == w4_search.lower()
-
-
-def test_get_byte_arrays_for_search_chars_width_1_case_sensitive():
-    (
-        w1_search,
-        w1_finding,
-        w2_search,
-        w2_finding,
-        w4_search,
-        w4_finding,
-    ) = spacy.util.get_byte_arrays_for_search_chars("bfewT", True)
-    assert w1_search == b"Tbefw"
-    assert w2_search == b"T\x00b\x00e\x00f\x00w\x00"
-    assert w4_search == b"T\x00\x00\x00b\x00\x00\x00e\x00\x00\x00f\x00\x00\x00w\x00\x00\x00"
-    assert w1_finding == w2_finding == w4_finding == w4_search
-
-
 def test_get_byte_arrays_for_search_chars_width_2_not_case_sensitive():
     (
-        w1_search,
-        w1_finding,
-        w2_search,
-        w2_finding,
-        w4_search,
-        w4_finding,
+        search,
+        ref,
     ) = spacy.util.get_byte_arrays_for_search_chars("bféwfw", False)
-    assert w1_search == b"BFWbfw"
     assert (
-        w1_finding
-        == b"b\x00\x00\x00f\x00\x00\x00w\x00\x00\x00b\x00\x00\x00f\x00\x00\x00w\x00\x00\x00"
-    )
-    assert w2_search == b"B\x00F\x00W\x00b\x00f\x00w\x00\xc9\x00\xe9\x00"
-    assert (
-        w2_finding
-        == w4_finding
+        ref
         == b"b\x00\x00\x00f\x00\x00\x00w\x00\x00\x00b\x00\x00\x00f\x00\x00\x00w\x00\x00\x00\xe9\x00\x00\x00\xe9\x00\x00\x00"
     )
     assert (
-        w4_search
+        search
         == b"B\x00\x00\x00F\x00\x00\x00W\x00\x00\x00b\x00\x00\x00f\x00\x00\x00w\x00\x00\x00\xc9\x00\x00\x00\xe9\x00\x00\x00"
     )


 def test_get_byte_arrays_for_search_chars_width_2_case_sensitive():
     (
-        w1_search,
-        w1_finding,
-        w2_search,
-        w2_finding,
-        w4_search,
-        w4_finding,
+        search,
+        ref,
     ) = spacy.util.get_byte_arrays_for_search_chars("bféwfw", True)
-    assert w1_search == b"bfw"
-    assert w1_finding == b"b\x00\x00\x00f\x00\x00\x00w\x00\x00\x00"
-    assert w2_search == b"b\x00f\x00w\x00\xe9\x00"
     assert (
-        w2_finding
-        == w4_finding
-        == w4_search
-        == b"b\x00\x00\x00f\x00\x00\x00w\x00\x00\x00\xe9\x00\x00\x00"
+        ref == search == b"b\x00\x00\x00f\x00\x00\x00w\x00\x00\x00\xe9\x00\x00\x00"
     )


 def test_get_byte_arrays_for_search_chars_width_4_not_case_sensitive():
     (
-        w1_search,
-        w1_finding,
-        w2_search,
-        w2_finding,
-        w4_search,
-        w4_finding,
+        search,
+        ref,
     ) = spacy.util.get_byte_arrays_for_search_chars("bfé𐌞wf𐌞wÉ", False)
-    assert w1_search == b"BFWbfw"
     assert (
-        w1_finding
-        == b"b\x00\x00\x00f\x00\x00\x00w\x00\x00\x00b\x00\x00\x00f\x00\x00\x00w\x00\x00\x00"
-    )
-    assert w2_search == b"B\x00F\x00W\x00b\x00f\x00w\x00\xc9\x00\xe9\x00"
-    assert (
-        w2_finding
-        == b"b\x00\x00\x00f\x00\x00\x00w\x00\x00\x00b\x00\x00\x00f\x00\x00\x00w\x00\x00\x00\xe9\x00\x00\x00\xe9\x00\x00\x00"
-    )
-    assert (
-        w4_search
+        search
         == b"\x1e\x03\x01\x00B\x00\x00\x00F\x00\x00\x00W\x00\x00\x00b\x00\x00\x00f\x00\x00\x00w\x00\x00\x00\xc9\x00\x00\x00\xe9\x00\x00\x00"
     )
     assert (
-        w4_finding
+        ref
         == b"\x1e\x03\x01\x00b\x00\x00\x00f\x00\x00\x00w\x00\x00\x00b\x00\x00\x00f\x00\x00\x00w\x00\x00\x00\xe9\x00\x00\x00\xe9\x00\x00\x00"
     )


 def test_get_byte_arrays_for_search_chars_width_4_case_sensitive():
     (
-        w1_search,
-        w1_finding,
-        w2_search,
-        w2_finding,
-        w4_search,
-        w4_finding,
+        search,
+        ref,
     ) = spacy.util.get_byte_arrays_for_search_chars("bfé𐌞wf𐌞wÉ", True)
-    assert w1_search == b"bfw"
-    assert w1_finding == b"b\x00\x00\x00f\x00\x00\x00w\x00\x00\x00"
-    assert w2_search == b"b\x00f\x00w\x00\xc9\x00\xe9\x00"
+    assert search == ref
     assert (
-        w2_finding
-        == b"b\x00\x00\x00f\x00\x00\x00w\x00\x00\x00\xc9\x00\x00\x00\xe9\x00\x00\x00"
-    )
-    assert w4_search == w4_finding
-    assert (
-        w4_finding
+        ref
         == b"\x1e\x03\x01\x00b\x00\x00\x00f\x00\x00\x00w\x00\x00\x00\xc9\x00\x00\x00\xe9\x00\x00\x00"
     )
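
The expected byte strings above pin down the function's contract exactly. A rough pure-Python model of that contract (an illustration, not the spaCy implementation; see get_byte_arrays_for_search_chars in the last file of this commit for the real code):

    def search_and_ref(chars: str, case_sensitive: bool):
        # Map each search character to the character whose bytes go into the
        # ref array when it is matched: itself when case-sensitive, otherwise
        # its lower-case form.
        mapping = {}
        for ch in chars:
            if not case_sensitive and ch.islower():
                mapping.setdefault(ch, ch)
                mapping.setdefault(ch.upper(), ch)
            elif not case_sensitive and ch.isupper():
                mapping.setdefault(ch, ch.lower())
                mapping.setdefault(ch.lower(), ch.lower())
            else:
                mapping.setdefault(ch, ch)
        # Both arrays are ordered by the UTF-32LE byte sequence of the search
        # character, which is why 𐌞 (1e 03 01 00) sorts before B (42 00 00 00).
        items = sorted(mapping.items(), key=lambda kv: kv[0].encode("UTF-32LE"))
        search = b"".join(k.encode("UTF-32LE") for k, _ in items)
        ref = b"".join(v.encode("UTF-32LE") for _, v in items)
        return search, ref

    assert search_and_ref("bféwfw", True) == (
        b"b\x00\x00\x00f\x00\x00\x00w\x00\x00\x00\xe9\x00\x00\x00",
        b"b\x00\x00\x00f\x00\x00\x00w\x00\x00\x00\xe9\x00\x00\x00",
    )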

View File

@@ -33,35 +33,26 @@ cdef int token_by_end(const TokenC* tokens, int length, int end_char) except -2

 cdef int [:,:] _get_lca_matrix(Doc, int start, int end)

-cdef void _populate_affix_buf(
-    const void* str_data_ptr,
-    const unsigned int unicode_byte_width,
-    const int word_idx,
-    const int word_len,
-    Py_UCS4* affix_buf,
+cdef void _populate_aff_buf(
+    const Py_UCS4* text_buf,
+    const int tok_idx,
+    const int tok_len,
+    Py_UCS4* aff_buf,
     const int pref_length,
     const int suff_length,
     const bint to_lower
 )

-cdef const unsigned char[:] _get_utf16_memoryview(str unicode_string, const bint check_2_bytes)
-
-cdef bint _is_searched_char_in_search_chars_v(
-    const unsigned short searched_char,
-    const unsigned char[:] search_chars_v,
-    const unsigned int search_chars_v_len,
-)
-
-cdef void _set_found_char_buf(
-    const bint suffs_not_prefs,
-    const unsigned char[:] searched_string_v,
-    const unsigned int searched_string_len,
-    const unsigned char[:] search_chars_v,
-    const unsigned int search_chars_v_len,
-    char* found_char_buf,
-    const unsigned int found_char_buf_len,
-)
+cdef void _populate_search_buf(
+    const Py_UCS4* text_buf,
+    const int tok_idx,
+    const int tok_len,
+    Py_UCS4* search_buf,
+    Py_UCS4* ref_buf,
+    const int search_buf_len,
+    Py_UCS4* finding_buf,
+    const int finding_buf_len,
+    bint suffs_not_prefs
+)

View File

@@ -181,9 +181,13 @@ class Doc:
         pref_lengths: List[int],
         suff_lengths: List[int],
         pref_search_chars: str,
+        pref_ref_chars: str,
+        pref_search_char_length: int,
         pref_search_lengths: List[int],
         suff_search_chars: str,
-        suff_search_lengths: List[int]
+        suff_ref_chars: str,
+        suff_search_char_length: int,
+        suff_search_lengths: List[int],
     ) -> Ints2d: ...
     @staticmethod
    def _get_array_attrs() -> Tuple[Any]: ...
@staticmethod @staticmethod
def _get_array_attrs() -> Tuple[Any]: ... def _get_array_attrs() -> Tuple[Any]: ...

View File

@@ -3,7 +3,7 @@ from typing import Set, List
 cimport cython
 cimport numpy as np
-from libc.string cimport memcpy
+from libc.string cimport memcpy, memcmp, memset
 from libc.math cimport sqrt
 from libc.stdint cimport int32_t, uint64_t
@@ -42,12 +42,6 @@ from ..util import get_words_and_spaces

 DEF PADDING = 5

-cdef extern from *:
-    Py_UCS4 PyUnicode_READ(int kind, void *data, int index)
-    void* PyUnicode_DATA(void* o)
-    int PyUnicode_KIND(void *data)
-    Py_UCS4 Py_UNICODE_TOLOWER(Py_UCS4 ch)
-
 cdef int bounds_check(int i, int length, int padding) except -1:
     if (i + padding) < 0:
         raise IndexError(Errors.E026.format(i=i, length=length))
@@ -111,6 +105,16 @@ class SetEntsDefault(str, Enum):
         return list(cls.__members__.keys())


+cdef extern from "unicodeobject.h":
+    Py_UCS4 PyUnicode_READ(int kind, void *data, int index)
+    void* PyUnicode_DATA(void* o)
+    void PyUnicode_READY(void * o)
+    int PyUnicode_KIND(void *data)
+    int PyUnicode_IS_COMPACT(void *data)
+    Py_UCS4 Py_UNICODE_TOLOWER(Py_UCS4 ch)
+
+
 cdef class Doc:
     """A sequence of Token objects. Access sentences and named entities, export
     annotations to numpy arrays, losslessly serialize to compressed binary
@@ -1742,33 +1746,37 @@ cdef class Doc:
     def get_character_combination_hashes(
         self,
         *,
-        bint case_sensitive,
+        bint cs,
         pref_lengths: List[int],
         suff_lengths: List[int],
-        str pref_search_chars,
+        char* pref_search,
+        char* pref_ref,
+        int pref_s_char_l,
         pref_search_lengths: List[int],
-        str suff_search_chars,
-        suff_search_lengths: List[int]
+        char* suff_search,
+        char* suff_ref,
+        int suff_s_char_l,
+        suff_search_lengths: List[int],
     ):
         """
         Returns a 2D NumPy array where the rows represent tokens and the columns represent hashes of various character combinations
         derived from the string (text/orth) of each token.

-        case_sensitive: if *False*, the lower-case version of each token string is used as the basis for generating hashes. Note that
-            if *case_sensitive==False*, upper-case characters in *search_chars* will not be found in token strings.
+        cs: if *False*, the lower-case version of each token string is used as the basis for generating hashes. Note that
+            if *cs==False*, upper-case characters in *search_chars* will not be found in token strings.
         pref_lengths: an integer list specifying the lengths of prefixes to be hashed. For example, if *pref_lengths==[2, 3]*,
             the prefixes hashed for "spaCy" would be "sp" and "spa".
         suff_lengths: an integer list specifying the lengths of suffixes to be hashed. For example, if *suff_lengths==[2, 3]* and
             *case_sensitive == True*, the suffixes hashed for "spaCy" would be "Cy" and "aCy".
         pref_search_chars: a string containing characters to search for within each token, starting at the beginning.
         pref_search_lengths: an integer list specifying the lengths of search results to be hashed. For example if
-            *pref_search_lengths==[1, 2]*, *pref_search_chars=="aC" and *case_sensitive==False*, the searched strings hashed for
+            *pref_search_lengths==[1, 2]*, *pref_search_chars=="aC" and *cs==False*, the searched strings hashed for
             "spaCy" would be "a" and "ac".
         suff_search_chars: a string containing characters to search for within each token, starting at the end.
         suff_search_lengths: an integer list specifying the lengths of search results to be hashed. For example if
-            *suff_search_lengths==[1, 2]*, *suff_search_chars=="aC" and *case_sensitive==False*, the searched strings hashed for
+            *suff_search_lengths==[1, 2]*, *suff_search_chars=="aC" and *cs==False*, the searched strings hashed for
             "spaCy" would be "c" and "ca".

         For a document with tokens ["spaCy", "and", "Prodigy"], the NumPy array returned by
@@ -1779,99 +1787,61 @@ cdef class Doc:
         [hash("Pr") ,hash("gy"), hash("digy"), hash("rodigy"), hash("y"), hash("y ")]]
         """

-        cdef int longest_pref = max(pref_lengths) if len(pref_lengths) > 0 else 0
-        cdef int longest_suff = max(suff_lengths) if len(suff_lengths) > 0 else 0
-        cdef Py_UCS4* affix_buf = <Py_UCS4*>self.mem.alloc(4, longest_pref + longest_suff)
-
-        cdef void* text_ptr = <void*> self.text
-        cdef void* text_data_ptr = <void*> PyUnicode_DATA(text_ptr) # todo change to const void
-        cdef unsigned int unicode_byte_width = PyUnicode_KIND(text_ptr), num_toks = len(self), tok_idx, token_idx, token_len
-        cdef TokenC token_c
-        cdef str working_str
-
-        for tok_idx in range(num_toks):
-            token_c = self.c[tok_idx]
-            token_idx = token_c.idx
-            token_len = token_c.lex.length
-            _populate_affix_buf(
-                text_data_ptr,
-                unicode_byte_width,
-                token_idx,
-                token_len,
-                affix_buf,
-                longest_pref,
-                longest_suff,
-                not case_sensitive
-            )
-
-        cdef const unsigned char[:] pref_search_chars_v = _get_utf16_memoryview(pref_search_chars, True)
-        cdef const unsigned char[:] suff_search_chars_v = _get_utf16_memoryview(suff_search_chars, True)
-        cdef unsigned int longest_search_length = max(pref_search_lengths + suff_search_lengths) if len(pref_search_lengths + suff_search_lengths) > 0 else 0
-        cdef bytes found_char_buf_bytes = (bytes(" " * longest_search_length, "UTF-16"))[2:] # first two bytes express endianness
-        cdef char* found_char_buf = found_char_buf_bytes
-        cdef unsigned int pref_search_chars_v_len = len(pref_search_chars_v), suff_search_chars_v_len = len(suff_search_chars_v)
-        cdef unsigned int found_char_buf_len = len(found_char_buf_bytes)
-        cdef unsigned int num_pref_norm_hashes = len(pref_lengths), num_suff_norm_hashes = len(suff_lengths)
-        cdef unsigned int num_pref_search_hashes = len(pref_search_lengths)
-        cdef unsigned int num_suff_search_hashes = len(suff_search_lengths)
-        cdef np.ndarray[np.int64_t, ndim=2] hashes = numpy.empty((num_toks, num_pref_norm_hashes + num_suff_norm_hashes + num_pref_search_hashes + num_suff_norm_hashes), dtype="int64")
-        cdef const unsigned char[:] tok_str_v
-        cdef unsigned int tok_str_v_len, hash_idx, affix_start, char_comb_len
-        cdef attr_t num_tok_attr
-        cdef str str_tok_attr
-
-        for tok_idx in range(num_toks):
-            num_tok_attr = self.c[tok_idx].lex.orth if case_sensitive else self.c[tok_idx].lex.lower
-            str_tok_attr = self.vocab.strings[num_tok_attr]
-            tok_str_v = _get_utf16_memoryview(str_tok_attr, False)
-            tok_str_v_len = len(tok_str_v)
-
-            for hash_idx in range(num_pref_norm_hashes):
-                char_comb_len = pref_lengths[hash_idx] * 2
-                if char_comb_len > tok_str_v_len:
-                    char_comb_len = tok_str_v_len
-                hashes[tok_idx, hash_idx] = hash32(<void*> &tok_str_v[0], char_comb_len, 0)
-
-            for hash_idx in range(num_pref_norm_hashes, num_pref_norm_hashes + num_suff_norm_hashes):
-                char_comb_len = suff_lengths[hash_idx - num_pref_norm_hashes] * 2
-                if char_comb_len > tok_str_v_len:
-                    char_comb_len = tok_str_v_len
-                affix_start = tok_str_v_len - char_comb_len
-                hashes[tok_idx, hash_idx] = hash32(<void*> &tok_str_v[affix_start], char_comb_len, 0)
-
-            _set_found_char_buf(
-                False,
-                tok_str_v,
-                tok_str_v_len,
-                pref_search_chars_v,
-                pref_search_chars_v_len,
-                found_char_buf,
-                found_char_buf_len,
-            )
-
-            for hash_idx in range(num_pref_norm_hashes + num_suff_norm_hashes, num_pref_norm_hashes + num_suff_norm_hashes + num_pref_search_hashes):
-                char_comb_len = pref_search_lengths[hash_idx - (num_pref_norm_hashes + num_suff_norm_hashes)] * 2
-                hashes[tok_idx, hash_idx] = hash32(found_char_buf, char_comb_len, 0)
-
-            _set_found_char_buf(
-                True,
-                tok_str_v,
-                tok_str_v_len,
-                suff_search_chars_v,
-                suff_search_chars_v_len,
-                found_char_buf,
-                found_char_buf_len,
-            )
-
-            for hash_idx in range(num_pref_norm_hashes + num_suff_norm_hashes + num_pref_search_hashes, num_pref_norm_hashes + num_suff_norm_hashes + num_pref_search_hashes + num_suff_search_hashes):
-                char_comb_len = suff_search_lengths[hash_idx - (num_pref_norm_hashes + num_suff_norm_hashes + num_pref_search_hashes)] * 2
-                hashes[tok_idx, hash_idx] = hash32(found_char_buf, char_comb_len, 0)
-
+        cdef int max_pref_l = max(pref_lengths) if len(pref_lengths) > 0 else 0
+        cdef int max_suff_l = max(suff_lengths) if len(suff_lengths) > 0 else 0
+        cdef int aff_buf_l = max_pref_l + max_suff_l
+        cdef int max_s_pref_l = max(pref_search_lengths) if len(pref_search_lengths) > 0 else 0
+        cdef int max_s_suff_l = max(suff_search_lengths) if len(suff_search_lengths) > 0 else 0
+
+        cdef Py_UCS4* aff_buf = <Py_UCS4*>self.mem.alloc(4, aff_buf_l)
+        cdef Py_UCS4* pref_s_buf = <Py_UCS4*>pref_search
+        cdef Py_UCS4* pref_r_buf = <Py_UCS4*>pref_ref
+        cdef Py_UCS4* pref_f_buf = <Py_UCS4*>self.mem.alloc(4, max_s_pref_l)
+        cdef Py_UCS4* suff_s_buf = <Py_UCS4*>suff_search
+        cdef Py_UCS4* suff_r_buf = <Py_UCS4*>suff_ref
+        cdef Py_UCS4* suff_f_buf = <Py_UCS4*>self.mem.alloc(4, max_s_suff_l)
+
+        cdef bytes encoded_text = self.text.encode("utf-32le")
+        cdef char* intermediate_text = encoded_text
+        cdef Py_UCS4* text_buf = <Py_UCS4*> intermediate_text
+
+        cdef unsigned int num_toks = len(self), aff_len
+        cdef unsigned int h_pref_n = len(pref_lengths)
+        cdef unsigned int h_suff_n = len(suff_lengths), h_suff_end_idx = len(pref_lengths) + len(suff_lengths)
+        cdef unsigned int h_pref_s_n = len(pref_search_lengths), h_pref_s_end_idx = h_suff_end_idx + h_pref_s_n
+        cdef unsigned int h_suff_s_n = len(suff_search_lengths), h_suff_s_end_idx = h_pref_s_end_idx + h_suff_s_n
+        cdef np.ndarray[np.int64_t, ndim=2] hashes = numpy.empty((num_toks, h_suff_s_end_idx), dtype="int64")
+        cdef TokenC tok_c
+
+        for tok_i in range(num_toks):
+            tok_c = self.c[tok_i]
+            tok_idx = tok_c.idx
+            tok_len = tok_c.lex.length
+
+            _populate_aff_buf(text_buf, tok_idx, tok_len, aff_buf, max_pref_l, max_suff_l, not cs)
+            _populate_search_buf(text_buf, tok_idx, tok_len, pref_s_buf, pref_r_buf, pref_s_char_l, pref_f_buf, max_s_pref_l, False)
+            _populate_search_buf(text_buf, tok_idx, tok_len, suff_s_buf, suff_r_buf, suff_s_char_l, suff_f_buf, max_s_suff_l, True)
+
+            for hash_idx in range(h_pref_n):
+                aff_len = pref_lengths[hash_idx]
+                hashes[tok_i, hash_idx] = hash32(aff_buf, aff_len * 4, 0)
+
+            for hash_idx in range(h_pref_n, h_suff_end_idx):
+                aff_len = suff_lengths[hash_idx - h_pref_n]
+                hashes[tok_i, hash_idx] = hash32(aff_buf + aff_buf_l - aff_len, aff_len * 4, 0)
+
+            for hash_idx in range(h_suff_end_idx, h_pref_s_end_idx):
+                aff_len = pref_search_lengths[hash_idx - h_suff_end_idx]
+                hashes[tok_i, hash_idx] = hash32(pref_f_buf, aff_len * 4, 0)
+
+            for hash_idx in range(h_pref_s_end_idx, h_suff_s_end_idx):
+                aff_len = suff_search_lengths[hash_idx - h_pref_s_end_idx]
+                hashes[tok_i, hash_idx] = hash32(suff_f_buf, aff_len * 4, 0)
+
+        self.mem.free(aff_buf)
+        self.mem.free(pref_f_buf)
+        self.mem.free(suff_f_buf)
         return hashes

     @staticmethod
@@ -2055,12 +2025,11 @@ cdef int [:,:] _get_lca_matrix(Doc doc, int start, int end):
     return lca_matrix

-cdef void _populate_affix_buf(
-    const void* str_data_ptr,
-    const unsigned int unicode_byte_width,
-    const int word_idx,
-    const int word_len,
-    Py_UCS4* affix_buf,
+cdef void _populate_aff_buf(
+    const Py_UCS4* text_buf,
+    const int tok_idx,
+    const int tok_len,
+    Py_UCS4* aff_buf,
     const int pref_length,
     const int suff_length,
     const bint to_lower
@@ -2070,107 +2039,75 @@ cdef void _populate_affix_buf(
     """
     str_data_ptr: a pointer to the raw data in the containing string, which must be in canonical
         Unicode form (see PEP 393).
-    unicode_byte_width: the number of bytes occupied by each character in the containing string.
+    kind: the number of bytes occupied by each character in the containing string.
     word_idx: the index of the first character of the word within the containing string.
     word_len: the length of the word.
-    affix_buf: the buffer to populate.
+    aff_buf: the buffer to populate.
     pref_length: the length of the prefix.
     suff_length: the length of the suffix.
     to_lower: if *True*, any upper case characters in either affix are converted to lower case.
     """
-    cdef int affix_buf_idx = 0, buf_size = pref_length + suff_length, in_word_idx
-    cdef Py_UCS4 working_wchar
-
-    while affix_buf_idx < pref_length and affix_buf_idx < word_len:
-        working_wchar = PyUnicode_READ(unicode_byte_width, str_data_ptr, in_word_idx)
-        if to_lower:
-            working_wchar = Py_UNICODE_TOLOWER(working_wchar)
-        memcpy(affix_buf + affix_buf_idx, &working_wchar, 4)
-        affix_buf_idx += 1
-
-    while (affix_buf_idx < buf_size - suff_length) or (affix_buf_idx < buf_size - word_len):
-        # fill out the empty middle part of the buffer with zeros
-        affix_buf[affix_buf_idx] = 0
-        affix_buf_idx += 1
-
-    while affix_buf_idx < buf_size:
-        in_word_idx = affix_buf_idx + word_len - buf_size
+    cdef int aff_buf_idx = 0, buf_size = pref_length + suff_length, in_word_idx
+
+    while aff_buf_idx < pref_length and aff_buf_idx < tok_len:
+        memcpy(aff_buf + aff_buf_idx, text_buf + tok_idx + aff_buf_idx, 4)
+        if to_lower:
+            aff_buf[aff_buf_idx] = Py_UNICODE_TOLOWER(aff_buf[aff_buf_idx])
+        aff_buf_idx += 1
+
+    if aff_buf_idx < buf_size - suff_length:
+        # fill out the empty middle part of the buffer with zeros
+        memset(aff_buf + aff_buf_idx, 0, (buf_size - suff_length - aff_buf_idx) * sizeof(Py_UCS4))
+        aff_buf_idx = buf_size - suff_length
+
+    while aff_buf_idx < buf_size:
+        in_word_idx = aff_buf_idx + tok_len - buf_size
         # for suffixes we have to track the in-word index separately from the in-buffer index
-        if in_word_idx < pref_length:
+        if in_word_idx < 0:
+            # the token is shorter than the suffix: keep zero-padding
+            aff_buf[aff_buf_idx] = 0
+        elif in_word_idx < pref_length:
             # we've already retrieved this character as part of the prefix, so copy it from there
             # as that's quicker than retrieving it from the input string a second time
-            memcpy(affix_buf + affix_buf_idx, affix_buf + in_word_idx, 4)
+            memcpy(aff_buf + aff_buf_idx, aff_buf + in_word_idx, 4)
         else:
-            working_wchar = PyUnicode_READ(unicode_byte_width, str_data_ptr, word_idx + in_word_idx)
-            if to_lower:
-                working_wchar = Py_UNICODE_TOLOWER(working_wchar)
-            memcpy(affix_buf + affix_buf_idx, &working_wchar, 4)
-        affix_buf_idx += 1
-
-
-cdef const unsigned char[:] _get_utf16_memoryview(str unicode_string, const bint check_2_bytes):
-    """
-    Return a memory view of the UTF-16 representation of a string with the default endianness of the platform.
-    Throw a ValueError if *check_2_bytes == True* and one or more characters in the UTF-16 representation
-    occupies four bytes rather than two.
-    """
-    cdef const unsigned char[:] view = unicode_string.encode("UTF-16")
-    view = view[2:]  # first two bytes express endianness
-    cdef unsigned int unicode_len, view_len
-    if check_2_bytes:
-        unicode_len = len(unicode_string)
-        view_len = len(view)
-        if unicode_len * 2 != view_len:
-            raise ValueError(Errors.E1046)
-    return view
-
-
-cdef bint _is_searched_char_in_search_chars_v(
-    const unsigned short searched_char,
-    const unsigned char[:] search_chars_v,
-    const unsigned int search_chars_v_len,
-):
-    cdef unsigned int search_chars_v_idx = 0
-    while search_chars_v_idx < search_chars_v_len:
-        if searched_char == (<unsigned short*> &search_chars_v[search_chars_v_idx])[0]:
-            return True
-        search_chars_v_idx += 2
-    return False
-
-
-cdef void _set_found_char_buf(
-    const bint suffs_not_prefs,
-    const unsigned char[:] searched_string_v,
-    const unsigned int searched_string_v_len,
-    const unsigned char[:] search_chars_v,
-    const unsigned int search_chars_v_len,
-    char* found_char_buf,
-    const unsigned int found_char_buf_len,
-):
-    """ Pick the UTF-16 characters from *searched_string_v* that are also in *search_chars_v* and write them in order to *found_char_buf*.
-    If *suffs_not_prefs*, the search starts from the end of *searched_string_v* rather than from the beginning.
-    """
-    cdef unsigned int found_char_buf_idx = 0, searched_string_idx = searched_string_v_len - 2 if suffs_not_prefs else 0
-    cdef unsigned short searched_char, SPACE = 32
-
-    while found_char_buf_idx < found_char_buf_len:
-        searched_char = (<unsigned short*> &searched_string_v[searched_string_idx])[0]
-        if _is_searched_char_in_search_chars_v(searched_char, search_chars_v, search_chars_v_len):
-            memcpy(found_char_buf + found_char_buf_idx, &searched_char, 2)
-            found_char_buf_idx += 2
-        if suffs_not_prefs:
-            if searched_string_idx <= 0:
-                break
-            searched_string_idx -= 2
-        else:
-            searched_string_idx += 2
-            if searched_string_idx >= searched_string_v_len:
-                break
-
-    while found_char_buf_idx < found_char_buf_len:
-        memcpy(found_char_buf + found_char_buf_idx, &SPACE, 2)
-        found_char_buf_idx += 2
+            memcpy(aff_buf + aff_buf_idx, text_buf + tok_idx + in_word_idx, 4)
+            if to_lower:
+                aff_buf[aff_buf_idx] = Py_UNICODE_TOLOWER(aff_buf[aff_buf_idx])
+        aff_buf_idx += 1
+
+
+cdef void _populate_search_buf(
+    const Py_UCS4* text_buf,
+    const int tok_idx,
+    const int tok_len,
+    Py_UCS4* search_buf,
+    Py_UCS4* ref_buf,
+    const int search_buf_len,
+    Py_UCS4* finding_buf,
+    const int finding_buf_len,
+    bint suffs_not_prefs
+):
+    cdef unsigned int finding_buf_idx = 0, text_string_idx = tok_idx + (tok_len - 1) if suffs_not_prefs else tok_idx
+    cdef unsigned int search_buf_idx
+    cdef int cmp_res
+
+    while finding_buf_idx < finding_buf_len:
+        for search_buf_idx in range(search_buf_len):
+            cmp_res = memcmp(search_buf + search_buf_idx, text_buf + text_string_idx, 4)
+            if cmp_res == 0:
+                memcpy(finding_buf + finding_buf_idx, ref_buf + search_buf_idx, 4)
+                finding_buf_idx += 1
+            if cmp_res >= 0:
+                break
+        if suffs_not_prefs:
+            if text_string_idx <= tok_idx:
+                break
+            text_string_idx -= 1
+        else:
+            text_string_idx += 1
+            if text_string_idx >= tok_idx + tok_len:
+                break
+
+    if finding_buf_idx < finding_buf_len:
+        memset(finding_buf + finding_buf_idx, 0, (finding_buf_len - finding_buf_idx) * sizeof(Py_UCS4))


 def pickle_doc(doc):
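
For readability, here is a rough pure-Python equivalent of the scan that _populate_search_buf performs for a single token (illustrative only: the real code compares raw 4-byte units with memcmp and exploits the sorted search buffer to stop each inner scan early):

    def find_search_chars(token: str, ref: dict, n: int, suffs_not_prefs: bool) -> str:
        # ref models the paired search/ref buffers as a dict from search
        # character to ref character (see get_byte_arrays_for_search_chars).
        found = []
        order = reversed(range(len(token))) if suffs_not_prefs else range(len(token))
        for i in order:
            if token[i] in ref:
                # copy the ref counterpart, not the raw character
                found.append(ref[token[i]])
                if len(found) == n:
                    break
        # zero-fill the remainder, as the closing memset does
        return "".join(found) + "\x00" * (n - len(found))

    # "spaCy" searched from the end for {x, ✨, r, p}, case-insensitive:
    ref = {"x": "x", "X": "x", "✨": "✨", "r": "r", "R": "r", "p": "p", "P": "p"}
    assert find_search_chars("spaCy", ref, 2, True) == "p\x00"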

View File

@@ -1739,104 +1739,68 @@ def all_equal(iterable):
 def get_byte_arrays_for_search_chars(
     search_chars: str, case_sensitive: bool
-) -> Tuple[bytes, bytes, bytes, bytes, bytes, bytes]:
+) -> Tuple[bytes, bytes]:
     """
-    The text of a spaCy document is stored as a Python-internal Unicode representation
-    as defined by PEP 393. Each character in such a representation has the width of the
-    longest character in the string, which is either 1, 2 or 4 bytes.
-
     This function supports the rich feature extractor. It returns search byte arrays with
-    1-, 2- and 4-byte character widths that are used for comparison with each of the three
-    representation types when searching document texts for search characters. Each byte array
-    contains characters that are as wide or narrower than its own width; a byte array can
-    ignore characters that are wider than its own width because a spaCy document with the
-    corresponding representation width could never contain characters wider than that width.
-
-    When characters corresponding to search characters are found within a spaCy token
-    string, they are concatenated together and the resulting "finding byte arrays" are hashed.
-    It is crucial that the characters in all finding byte arrays representing a given sequence of
-    characters share the same width so that they all yield the same hash values. While it
-    would be possible to use the narrowest possible width for the sequence like PEP 393 does,
-    determining this would entail unnecessary processing. Instead, finding byte arrays always use
-    a 4-byte width. Each of the three search byte arrays therefore has a corresponding finding
-    byte array that is used to build up the finding byte arrays for specific document token strings.
-
-    If *case_sensitive==False*, the lower- or uppercase counterparts of any characters that
+    4-byte character width that are used for comparison when searching document texts
+    for search characters. The encoding is little-endian regardless of architecture, as
+    this is what is expected by the murmurhash library used downstream.
+
+    Alongside the "search byte array" against which words from document texts are compared
+    is the "ref byte array". When a character from the search byte array is matched,
+    the character at the corresponding position in the ref byte array is added to the
+    byte sequence of the configured length that is then hashed. This enables case-sensitivity
+    to be handled without converting the case of the words being searched: if
+    *case_sensitive==False*, the lower- or uppercase counterparts of any characters that
     have case are added to the search byte arrays, and both the original character and its
-    other-cased counterpart map to the lower-case version in the finding byte array.
-
-    All encodings are little-endian regardless of architecture, as this is what is expected by the
-    murmurhash library used downstream.
+    other-cased counterpart map to the lower-case version in the ref byte array.
     """

-    def encode(ch: str, width: int) -> bytes:
+    def encode(ch: str) -> bytes:
         """
         ch: a single character
-        int: the width of the character encoding to use
         """
-        if width == 4:
-            return ch.encode("UTF-32LE")
-        elif width == 2:
-            return ch.encode("UTF-16LE")
-        else:
-            return ch.encode("UTF-8")
+        return ch.encode("UTF-32LE")

     def add_to_byte_arrays(
-        search: List[bytes], finding: List[bytes], ch: str, width: int
+        search: List[bytes], ref: List[bytes], ch: str
     ) -> None:
-        """Add the byte representations of *ch* with representation of width
-        *width* to the two byte array lists.
-        """
-        this_char_bytes = encode(ch, width)
-        this_char_bytes_f = encode(ch, 4)
+        """Add the byte representations of *ch* to the two byte array lists."""
+        this_char_bytes = encode(ch)
         if not case_sensitive and ch.islower():
             if this_char_bytes not in search:
                 search.append(this_char_bytes)
-                finding.append(this_char_bytes_f)
-            upper_char_bytes = encode(ch.upper(), width)
+                ref.append(this_char_bytes)
+            upper_char_bytes = encode(ch.upper())
             if upper_char_bytes not in search:
                 search.append(upper_char_bytes)
-                finding.append(this_char_bytes_f)
+                ref.append(this_char_bytes)
         elif not case_sensitive and ch.isupper():
-            lower_char_bytes = encode(ch.lower(), width)
-            lower_char_bytes_f = encode(ch.lower(), 4)
+            lower_char_bytes = encode(ch.lower())
             if this_char_bytes not in search:
                 search.append(this_char_bytes)
-                finding.append(lower_char_bytes_f)
+                ref.append(lower_char_bytes)
             if lower_char_bytes not in search:
                 search.append(lower_char_bytes)
-                finding.append(lower_char_bytes_f)
+                ref.append(lower_char_bytes)
         elif this_char_bytes not in search:
             search.append(this_char_bytes)
-            finding.append(this_char_bytes_f)
+            ref.append(this_char_bytes)

     def get_ordered_raw_bytes(
-        search: List[bytes], finding: List[bytes]
+        search: List[bytes], ref: List[bytes]
     ) -> Tuple[bytes, bytes]:
         """Flatten the two lists, ordering both by the entries in *search*
         using the native endianness of the platform.
         """
         num_search = [list(entry) for entry in search]
         search = [entry for _, entry in sorted(zip(num_search, search))]
-        finding = [entry for _, entry in sorted(zip(num_search, finding))]
-        return b"".join(search), b"".join(finding)
+        ref = [entry for _, entry in sorted(zip(num_search, ref))]
+        return b"".join(search), b"".join(ref)

-    w1_search: List[bytes] = []
-    w1_finding: List[bytes] = []
-    w2_search: List[bytes] = []
-    w2_finding: List[bytes] = []
-    w4_search: List[bytes] = []
-    w4_finding: List[bytes] = []
+    search: List[bytes] = []
+    ref: List[bytes] = []
     for ch in search_chars:
-        add_to_byte_arrays(w4_search, w4_finding, ch, 4)
-        if ord(ch) >= 65536:
-            continue
-        add_to_byte_arrays(w2_search, w2_finding, ch, 2)
-        if ord(ch) >= 128:
-            continue
-        add_to_byte_arrays(w1_search, w1_finding, ch, 1)
-    return (
-        get_ordered_raw_bytes(w1_search, w1_finding)
-        + get_ordered_raw_bytes(w2_search, w2_finding)
-        + get_ordered_raw_bytes(w4_search, w4_finding)
-    )
+        add_to_byte_arrays(search, ref, ch)
+    return get_ordered_raw_bytes(search, ref)
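
Taken together with the Doc changes, the intended call pattern matches the tests earlier in this commit: compute the byte arrays once, then hand them and their character count to Doc.get_character_combination_hashes. A hedged end-to-end sketch (the keyword names come from this intermediate state of the API; the concrete inputs here are made up):

    import spacy

    nlp = spacy.blank("en")
    doc = nlp("spaCy and Prodigy")
    suff_search, suff_ref = spacy.util.get_byte_arrays_for_search_chars("rp", False)
    hashes = doc.get_character_combination_hashes(
        cs=False,
        pref_lengths=[1, 3],
        suff_lengths=[3],
        pref_search=bytes(),
        pref_ref=bytes(),
        pref_s_char_l=0,
        pref_search_lengths=[],
        suff_search=suff_search,
        suff_ref=suff_ref,
        suff_s_char_l=len(suff_search) // 4,
        suff_search_lengths=[1],
    )
    # One row per token; columns follow the order pref_lengths, suff_lengths,
    # pref_search_lengths, suff_search_lengths: here 2 + 1 + 0 + 1 = 4.
    assert hashes.shape == (3, 4)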