Add search char byte array feature

richardpaulhudson 2022-10-14 17:03:52 +02:00
parent 1e9176f9c5
commit c116e11942
2 changed files with 247 additions and 0 deletions

spacy/tests/test_util.py (new file, 145 lines)
@@ -0,0 +1,145 @@
import sys

import spacy


def _correct_endianness(littleendian: bytes, width: int) -> bytes:
    """Reverse the bytes of each *width*-byte character so that expected values
    written in little-endian byte order also compare equal on big-endian
    platforms. On little-endian platforms the input is returned unchanged.
    """
    if sys.byteorder == "little":
        return littleendian
    output = bytearray()
    for idx in range(0, len(littleendian), width):
        output.extend(reversed(littleendian[idx : idx + width]))
    return bytes(output)


def test_get_byte_arrays_for_search_chars_width_1_not_case_sensitive():
    (
        w1_search,
        w1_finding,
        w2_search,
        w2_finding,
        w4_search,
        w4_finding,
    ) = spacy.util.get_byte_arrays_for_search_chars("bfEWfwe", False)
    assert w1_search == b"BEFWbefw"
    assert w2_search == _correct_endianness(
        b"B\x00E\x00F\x00W\x00b\x00e\x00f\x00w\x00", 2
    )
    assert w4_search == _correct_endianness(
        b"B\x00\x00\x00E\x00\x00\x00F\x00\x00\x00W\x00\x00\x00b\x00\x00\x00e\x00\x00\x00f\x00\x00\x00w\x00\x00\x00",
        4,
    )
    assert w1_finding == w2_finding == w4_finding == w4_search.lower()


def test_get_byte_arrays_for_search_chars_width_1_case_sensitive():
    (
        w1_search,
        w1_finding,
        w2_search,
        w2_finding,
        w4_search,
        w4_finding,
    ) = spacy.util.get_byte_arrays_for_search_chars("bfewT", True)
    assert w1_search == b"Tbefw"
    assert w2_search == _correct_endianness(b"T\x00b\x00e\x00f\x00w\x00", 2)
    assert w4_search == _correct_endianness(
        b"T\x00\x00\x00b\x00\x00\x00e\x00\x00\x00f\x00\x00\x00w\x00\x00\x00", 4
    )
    assert w1_finding == w2_finding == w4_finding == w4_search


def test_get_byte_arrays_for_search_chars_width_2_not_case_sensitive():
    (
        w1_search,
        w1_finding,
        w2_search,
        w2_finding,
        w4_search,
        w4_finding,
    ) = spacy.util.get_byte_arrays_for_search_chars("bféwfw", False)
    assert w1_search == b"BFWbfw"
    assert w1_finding == _correct_endianness(
        b"b\x00\x00\x00f\x00\x00\x00w\x00\x00\x00b\x00\x00\x00f\x00\x00\x00w\x00\x00\x00",
        4,
    )
    assert w2_search == _correct_endianness(
        b"B\x00F\x00W\x00b\x00f\x00w\x00\xc9\x00\xe9\x00", 2
    )
    assert (
        w2_finding
        == w4_finding
        == _correct_endianness(
            b"b\x00\x00\x00f\x00\x00\x00w\x00\x00\x00b\x00\x00\x00f\x00\x00\x00w\x00\x00\x00\xe9\x00\x00\x00\xe9\x00\x00\x00",
            4,
        )
    )
    assert w4_search == _correct_endianness(
        b"B\x00\x00\x00F\x00\x00\x00W\x00\x00\x00b\x00\x00\x00f\x00\x00\x00w\x00\x00\x00\xc9\x00\x00\x00\xe9\x00\x00\x00",
        4,
    )


def test_get_byte_arrays_for_search_chars_width_2_case_sensitive():
    (
        w1_search,
        w1_finding,
        w2_search,
        w2_finding,
        w4_search,
        w4_finding,
    ) = spacy.util.get_byte_arrays_for_search_chars("bféwfw", True)
    assert w1_search == b"bfw"
    assert w1_finding == _correct_endianness(
        b"b\x00\x00\x00f\x00\x00\x00w\x00\x00\x00", 4
    )
    assert w2_search == _correct_endianness(b"b\x00f\x00w\x00\xe9\x00", 2)
    assert (
        w2_finding
        == w4_finding
        == w4_search
        == _correct_endianness(
            b"b\x00\x00\x00f\x00\x00\x00w\x00\x00\x00\xe9\x00\x00\x00", 4
        )
    )


def test_get_byte_arrays_for_search_chars_width_4_not_case_sensitive():
    (
        w1_search,
        w1_finding,
        w2_search,
        w2_finding,
        w4_search,
        w4_finding,
    ) = spacy.util.get_byte_arrays_for_search_chars("bfé𐌞wf𐌞wÉ", False)
    assert w1_search == b"BFWbfw"
    assert w1_finding == _correct_endianness(
        b"b\x00\x00\x00f\x00\x00\x00w\x00\x00\x00b\x00\x00\x00f\x00\x00\x00w\x00\x00\x00",
        4,
    )
    assert w2_search == _correct_endianness(
        b"B\x00F\x00W\x00b\x00f\x00w\x00\xc9\x00\xe9\x00", 2
    )
    assert w2_finding == _correct_endianness(
        b"b\x00\x00\x00f\x00\x00\x00w\x00\x00\x00b\x00\x00\x00f\x00\x00\x00w\x00\x00\x00\xe9\x00\x00\x00\xe9\x00\x00\x00",
        4,
    )
    assert w4_search == _correct_endianness(
        b"\x1e\x03\x01\x00B\x00\x00\x00F\x00\x00\x00W\x00\x00\x00b\x00\x00\x00f\x00\x00\x00w\x00\x00\x00\xc9\x00\x00\x00\xe9\x00\x00\x00",
        4,
    )
    assert w4_finding == _correct_endianness(
        b"\x1e\x03\x01\x00b\x00\x00\x00f\x00\x00\x00w\x00\x00\x00b\x00\x00\x00f\x00\x00\x00w\x00\x00\x00\xe9\x00\x00\x00\xe9\x00\x00\x00",
        4,
    )


def test_get_byte_arrays_for_search_chars_width_4_case_sensitive():
    (
        w1_search,
        w1_finding,
        w2_search,
        w2_finding,
        w4_search,
        w4_finding,
    ) = spacy.util.get_byte_arrays_for_search_chars("bfé𐌞wf𐌞wÉ", True)
    assert w1_search == b"bfw"
    assert w1_finding == _correct_endianness(
        b"b\x00\x00\x00f\x00\x00\x00w\x00\x00\x00", 4
    )
    assert w2_search == _correct_endianness(b"b\x00f\x00w\x00\xc9\x00\xe9\x00", 2)
    assert w2_finding == _correct_endianness(
        b"b\x00\x00\x00f\x00\x00\x00w\x00\x00\x00\xc9\x00\x00\x00\xe9\x00\x00\x00",
        4,
    )
    assert (
        w4_search
        == w4_finding
        == _correct_endianness(
            b"\x1e\x03\x01\x00b\x00\x00\x00f\x00\x00\x00w\x00\x00\x00\xc9\x00\x00\x00\xe9\x00\x00\x00",
            4,
        )
    )

spacy/util.py

@@ -1735,3 +1735,105 @@ def all_equal(iterable):
    (or if the input is an empty sequence), False otherwise."""
    g = itertools.groupby(iterable)
    return next(g, True) and not next(g, False)


def get_byte_arrays_for_search_chars(
    search_chars: str, case_sensitive: bool
) -> Tuple[bytes, bytes, bytes, bytes, bytes, bytes]:
    """
    The text of a spaCy document is stored in the Python-internal Unicode
    representation defined by PEP 393. Each character in such a representation
    has the width of the longest character in the string, which is either 1, 2
    or 4 bytes.

    This function supports the rich feature extractor. It returns search byte
    arrays with 1-, 2- and 4-byte character widths that are used for comparison
    with each of the three representation types when searching document texts
    for search characters. Each byte array contains characters that are as wide
    as or narrower than its own width; a byte array can ignore characters that
    are wider than its own width because a spaCy document with the
    corresponding representation width could never contain them.

    When characters corresponding to search characters are found within a spaCy
    token string, they are concatenated, and the resulting "finding byte
    arrays" are hashed. It is crucial that the characters in all finding byte
    arrays representing a given sequence of characters share the same width, so
    that they all yield the same hash values. While it would be possible to use
    the narrowest possible width for the sequence as PEP 393 does, determining
    this would entail unnecessary processing. Instead, finding byte arrays
    always use a 4-byte width. Each of the three search byte arrays therefore
    has a corresponding finding byte array that is used to build up the finding
    byte arrays for specific document token strings.

    If *case_sensitive==False*, the lower- or uppercase counterparts of any
    characters that have case are added to the search byte arrays, and both the
    original character and its other-cased counterpart map to the lowercase
    version in the finding byte array.
    """

    def encode(ch: str, width: int) -> bytes:
        """
        ch: a single character
        width: the width in bytes of the character encoding to use
        """
        if width == 4:
            return ch.encode("UTF-32")[width:]  # remove byte order mark
        elif width == 2:
            return ch.encode("UTF-16")[width:]  # remove byte order mark
        else:
            return ch.encode("UTF-8")

    def add_to_byte_arrays(
        search: List[bytes], finding: List[bytes], ch: str, width: int
    ) -> None:
        """Add the byte representations of *ch* with representation width
        *width* to the two byte array lists.
        """
        this_char_bytes = encode(ch, width)
        this_char_bytes_f = encode(ch, 4)
        if not case_sensitive and ch.islower():
            if this_char_bytes not in search:
                search.append(this_char_bytes)
                finding.append(this_char_bytes_f)
            upper_char_bytes = encode(ch.upper(), width)
            if upper_char_bytes not in search:
                search.append(upper_char_bytes)
                finding.append(this_char_bytes_f)
        elif not case_sensitive and ch.isupper():
            lower_char_bytes = encode(ch.lower(), width)
            lower_char_bytes_f = encode(ch.lower(), 4)
            if this_char_bytes not in search:
                search.append(this_char_bytes)
                finding.append(lower_char_bytes_f)
            if lower_char_bytes not in search:
                search.append(lower_char_bytes)
                finding.append(lower_char_bytes_f)
        elif this_char_bytes not in search:
            search.append(this_char_bytes)
            finding.append(this_char_bytes_f)

    def get_ordered_raw_bytes(
        search: List[bytes], finding: List[bytes]
    ) -> Tuple[bytes, bytes]:
        """Flatten the two lists, ordering both by the entries in *search*
        using the native endianness of the platform.
        """
        num_search = [list(entry) for entry in search]
        search = [entry for _, entry in sorted(zip(num_search, search))]
        finding = [entry for _, entry in sorted(zip(num_search, finding))]
        return b"".join(search), b"".join(finding)

    w1_search: List[bytes] = []
    w1_finding: List[bytes] = []
    w2_search: List[bytes] = []
    w2_finding: List[bytes] = []
    w4_search: List[bytes] = []
    w4_finding: List[bytes] = []
    for ch in search_chars:
        add_to_byte_arrays(w4_search, w4_finding, ch, 4)
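        # Characters outside the Basic Multilingual Plane (>= 0x10000) cannot
        # occur in documents with 1- or 2-byte PEP 393 representations.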
        if ord(ch) >= 65536:
            continue
        add_to_byte_arrays(w2_search, w2_finding, ch, 2)
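        # Characters above U+007F do not fit in a single UTF-8 byte, so they
        # are excluded from the 1-byte arrays.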
        if ord(ch) >= 128:
            continue
        add_to_byte_arrays(w1_search, w1_finding, ch, 1)
    return (
        get_ordered_raw_bytes(w1_search, w1_finding)
        + get_ordered_raw_bytes(w2_search, w2_finding)
        + get_ordered_raw_bytes(w4_search, w4_finding)
    )