diff --git a/spacy/tests/test_util.py b/spacy/tests/test_util.py
new file mode 100644
index 000000000..e84514fde
--- /dev/null
+++ b/spacy/tests/test_util.py
@@ -0,0 +1,145 @@
+import sys
+import spacy
+
+
+def _correct_endianness(littleendian: bytes, width: int = 2) -> bytes:
+    """Convert little-endian reference bytes to the platform's byte order,
+    treating the input as a sequence of *width*-byte code units.
+    """
+    if sys.byteorder == "little":
+        return littleendian
+    output = bytearray()
+    for idx in range(0, len(littleendian), width):
+        output.extend(reversed(littleendian[idx : idx + width]))
+    return bytes(output)
+
+
+def test_get_byte_arrays_for_search_chars_width_1_not_case_sensitive():
+    (
+        w1_search,
+        w1_finding,
+        w2_search,
+        w2_finding,
+        w4_search,
+        w4_finding,
+    ) = spacy.util.get_byte_arrays_for_search_chars("bfEWfwe", False)
+    assert w1_search == b"BEFWbefw"
+    assert w2_search == _correct_endianness(b"B\x00E\x00F\x00W\x00b\x00e\x00f\x00w\x00")
+    assert w4_search == _correct_endianness(
+        b"B\x00\x00\x00E\x00\x00\x00F\x00\x00\x00W\x00\x00\x00b\x00\x00\x00e\x00\x00\x00f\x00\x00\x00w\x00\x00\x00",
+        width=4,
+    )
+    assert w1_finding == w2_finding == w4_finding == w4_search.lower()
+
+
+def test_get_byte_arrays_for_search_chars_width_1_case_sensitive():
+    (
+        w1_search,
+        w1_finding,
+        w2_search,
+        w2_finding,
+        w4_search,
+        w4_finding,
+    ) = spacy.util.get_byte_arrays_for_search_chars("bfewT", True)
+    assert w1_search == b"Tbefw"
+    assert w2_search == _correct_endianness(b"T\x00b\x00e\x00f\x00w\x00")
+    assert w4_search == _correct_endianness(
+        b"T\x00\x00\x00b\x00\x00\x00e\x00\x00\x00f\x00\x00\x00w\x00\x00\x00", width=4
+    )
+    assert w1_finding == w2_finding == w4_finding == w4_search
+
+
+def test_get_byte_arrays_for_search_chars_width_2_not_case_sensitive():
+    (
+        w1_search,
+        w1_finding,
+        w2_search,
+        w2_finding,
+        w4_search,
+        w4_finding,
+    ) = spacy.util.get_byte_arrays_for_search_chars("bféwfw", False)
+    assert w1_search == b"BFWbfw"
+    assert w1_finding == _correct_endianness(
+        b"b\x00\x00\x00f\x00\x00\x00w\x00\x00\x00b\x00\x00\x00f\x00\x00\x00w\x00\x00\x00",
+        width=4,
+    )
+    assert w2_search == _correct_endianness(
+        b"B\x00F\x00W\x00b\x00f\x00w\x00\xc9\x00\xe9\x00"
+    )
+    assert (
+        w2_finding
+        == w4_finding
+        == _correct_endianness(
+            b"b\x00\x00\x00f\x00\x00\x00w\x00\x00\x00b\x00\x00\x00f\x00\x00\x00w\x00\x00\x00\xe9\x00\x00\x00\xe9\x00\x00\x00",
+            width=4,
+        )
+    )
+    assert w4_search == _correct_endianness(
+        b"B\x00\x00\x00F\x00\x00\x00W\x00\x00\x00b\x00\x00\x00f\x00\x00\x00w\x00\x00\x00\xc9\x00\x00\x00\xe9\x00\x00\x00",
+        width=4,
+    )
+
+
+def test_get_byte_arrays_for_search_chars_width_2_case_sensitive():
+    (
+        w1_search,
+        w1_finding,
+        w2_search,
+        w2_finding,
+        w4_search,
+        w4_finding,
+    ) = spacy.util.get_byte_arrays_for_search_chars("bféwfw", True)
+    assert w1_search == b"bfw"
+    assert w1_finding == _correct_endianness(
+        b"b\x00\x00\x00f\x00\x00\x00w\x00\x00\x00", width=4
+    )
+    assert w2_search == _correct_endianness(b"b\x00f\x00w\x00\xe9\x00")
+    assert (
+        w2_finding
+        == w4_finding
+        == w4_search
+        == _correct_endianness(
+            b"b\x00\x00\x00f\x00\x00\x00w\x00\x00\x00\xe9\x00\x00\x00", width=4
+        )
+    )
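+
+
+# In the tests below, "𐌞" is U+1031E, which lies outside the Basic
+# Multilingual Plane: it has no 1- or 2-byte representation and appears in
+# the 4-byte arrays as little-endian b"\x1e\x03\x01\x00". "É" is U+00C9
+# (b"\xc9") and "é" is U+00E9 (b"\xe9").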
b"b\x00\x00\x00f\x00\x00\x00w\x00\x00\x00b\x00\x00\x00f\x00\x00\x00w\x00\x00\x00\xe9\x00\x00\x00\xe9\x00\x00\x00" + ) + assert w4_search == _correct_endianness( + b"\x1e\x03\x01\x00B\x00\x00\x00F\x00\x00\x00W\x00\x00\x00b\x00\x00\x00f\x00\x00\x00w\x00\x00\x00\xc9\x00\x00\x00\xe9\x00\x00\x00" + ) + assert w4_finding == _correct_endianness( + b"\x1e\x03\x01\x00b\x00\x00\x00f\x00\x00\x00w\x00\x00\x00b\x00\x00\x00f\x00\x00\x00w\x00\x00\x00\xe9\x00\x00\x00\xe9\x00\x00\x00" + ) + + +def test_get_byte_arrays_for_search_chars_width_4_case_sensitive(): + ( + w1_search, + w1_finding, + w2_search, + w2_finding, + w4_search, + w4_finding, + ) = spacy.util.get_byte_arrays_for_search_chars("bfé𐌞wf𐌞wÉ", True) + assert w1_search == b"bfw" + assert w1_finding == _correct_endianness(b"b\x00\x00\x00f\x00\x00\x00w\x00\x00\x00") + assert w2_search == _correct_endianness(b"b\x00f\x00w\x00\xc9\x00\xe9\x00") + assert w2_finding == _correct_endianness( + b"b\x00\x00\x00f\x00\x00\x00w\x00\x00\x00\xc9\x00\x00\x00\xe9\x00\x00\x00" + ) + assert ( + w4_search + == w4_finding + == _correct_endianness( + b"\x1e\x03\x01\x00b\x00\x00\x00f\x00\x00\x00w\x00\x00\x00\xc9\x00\x00\x00\xe9\x00\x00\x00" + ) + ) diff --git a/spacy/util.py b/spacy/util.py index 3034808ba..47509aac8 100644 --- a/spacy/util.py +++ b/spacy/util.py @@ -1735,3 +1735,105 @@ def all_equal(iterable): (or if the input is an empty sequence), False otherwise.""" g = itertools.groupby(iterable) return next(g, True) and not next(g, False) + + +def get_byte_arrays_for_search_chars( + search_chars: str, case_sensitive: bool +) -> Tuple[bytes, bytes, bytes, bytes, bytes, bytes]: + """ + The text of a spaCy document is stored as a Python-internal Unicode representation + as defined by PEP 393. Each character in such a representation has the width of the + longest character in the string, which is either 1, 2 or 4 bytes. + + This function supports the rich feature extractor. It returns search byte arrays with + 1-, 2- and 4-byte character widths that are used for comparison with each of the three + representation types when searching document texts for search characters. Each byte array + contains characters that are as wide or narrower than its own width; a byte array can + ignore characters that are wider than its own width because a spaCy document with the + corresponding representation width could never contain characters wider than that width. + + When characters corresponding to search characters are found within a spaCy token + string, they are concatenated together and the resulting "finding byte arrays" are hashed. + It is crucial that the characters in all finding byte arrays representing a given sequence of + characters share the same width so that they all yield the same hash values. While it + would be possible to use the narrowest possible width for the sequence like PEP 393 does, + determining this would entain unnecessary processing. Instead, finding byte arrays always use + a 4-byte width. Each of the three search byte array therefore has a corresponding finding + byte array that is used to build up the finding byte arrays for specific document token strings. + + If *case_sensitive==False*, the lower- or uppercase counterparts of any characters that + have case are added to the search byte arrays, and both the original character and its + other-cased counterpart map to the lower-case version in the finding byte array. 
+ """ + + def encode(ch: str, width: int) -> bytes: + """ + ch: a single character + int: the width of the character encoding to use + """ + if width == 4: + return ch.encode("UTF-32")[width:] # remove byte order mark + elif width == 2: + return ch.encode("UTF-16")[width:] # remove byte order mark + else: + return ch.encode("UTF-8") + + def add_to_byte_arrays( + search: List[bytes], finding: List[bytes], ch: str, width: int + ) -> None: + """Add the byte representations of *ch* with representation of width + *width* to the two byte array lists. + """ + this_char_bytes = encode(ch, width) + this_char_bytes_f = encode(ch, 4) + if not case_sensitive and ch.islower(): + if this_char_bytes not in search: + search.append(this_char_bytes) + finding.append(this_char_bytes_f) + upper_char_bytes = encode(ch.upper(), width) + if upper_char_bytes not in search: + search.append(upper_char_bytes) + finding.append(this_char_bytes_f) + elif not case_sensitive and ch.isupper(): + lower_char_bytes = encode(ch.lower(), width) + lower_char_bytes_f = encode(ch.lower(), 4) + if this_char_bytes not in search: + search.append(this_char_bytes) + finding.append(lower_char_bytes_f) + if lower_char_bytes not in search: + search.append(lower_char_bytes) + finding.append(lower_char_bytes_f) + elif this_char_bytes not in search: + search.append(this_char_bytes) + finding.append(this_char_bytes_f) + + def get_ordered_raw_bytes( + search: List[bytes], finding: List[bytes] + ) -> Tuple[bytes, bytes]: + """Flatten the two lists, ordering both by the entries in *search* + using the native endianness of the platform. + """ + num_search = [list(entry) for entry in search] + search = [entry for _, entry in sorted(zip(num_search, search))] + finding = [entry for _, entry in sorted(zip(num_search, finding))] + return b"".join(search), b"".join(finding) + + w1_search: List[bytes] = [] + w1_finding: List[bytes] = [] + w2_search: List[bytes] = [] + w2_finding: List[bytes] = [] + w4_search: List[bytes] = [] + w4_finding: List[bytes] = [] + for ch in search_chars: + add_to_byte_arrays(w4_search, w4_finding, ch, 4) + if ord(ch) >= 65536: + continue + add_to_byte_arrays(w2_search, w2_finding, ch, 2) + if ord(ch) >= 128: + continue + add_to_byte_arrays(w1_search, w1_finding, ch, 1) + return ( + get_ordered_raw_bytes(w1_search, w1_finding) + + get_ordered_raw_bytes(w2_search, w2_finding) + + get_ordered_raw_bytes(w4_search, w4_finding) + )