mirror of
https://github.com/explosion/spaCy.git
synced 2025-08-02 11:20:19 +03:00
Add search char byte array feature
This commit is contained in:
parent
1e9176f9c5
commit
c116e11942
145
spacy/tests/test_util.py
Normal file
145
spacy/tests/test_util.py
Normal file
|
@ -0,0 +1,145 @@
|
|||
import sys
|
||||
import spacy
|
||||
|
||||
|
||||
def _correct_endianness(littleendian: bytes) -> bytes:
|
||||
if sys.byteorder == "little":
|
||||
return littleendian
|
||||
output = bytearray()
|
||||
for idx in range(0, len(littleendian), 2):
|
||||
output.append(littleendian[idx + 1])
|
||||
output.append(littleendian[idx])
|
||||
return bytes(output)
|
||||
|
||||
|
||||
def test_get_byte_arrays_for_search_chars_width_1_not_case_sensitive():
    """All search chars are 1 byte wide (ASCII); case-insensitive mode."""
    result = spacy.util.get_byte_arrays_for_search_chars("bfEWfwe", False)
    w1_search, w1_finding, w2_search, w2_finding, w4_search, w4_finding = result
    assert w1_search == b"BEFWbefw"
    expected_w2_search = _correct_endianness(b"B\x00E\x00F\x00W\x00b\x00e\x00f\x00w\x00")
    assert w2_search == expected_w2_search
    expected_w4_search = _correct_endianness(
        b"B\x00\x00\x00E\x00\x00\x00F\x00\x00\x00W\x00\x00\x00b\x00\x00\x00e\x00\x00\x00f\x00\x00\x00w\x00\x00\x00"
    )
    assert w4_search == expected_w4_search
    # Finding arrays always use the 4-byte width and map to lower case.
    assert w1_finding == w2_finding == w4_finding == w4_search.lower()
|
||||
|
||||
|
||||
def test_get_byte_arrays_for_search_chars_width_1_case_sensitive():
    """All search chars are 1 byte wide (ASCII); case-sensitive mode."""
    (
        w1_search,
        w1_finding,
        w2_search,
        w2_finding,
        w4_search,
        w4_finding,
    ) = spacy.util.get_byte_arrays_for_search_chars("bfewT", True)
    assert w1_search == b"Tbefw"
    # Fixed: the wide expected values must go through _correct_endianness as in
    # the other tests — the raw literals are little-endian and the comparison
    # would fail spuriously on a big-endian platform. Also normalised the stray
    # octal escapes (\00 and \x00 both denote NUL, but mixing them is confusing).
    assert w2_search == _correct_endianness(b"T\x00b\x00e\x00f\x00w\x00")
    assert w4_search == _correct_endianness(
        b"T\x00\x00\x00b\x00\x00\x00e\x00\x00\x00f\x00\x00\x00w\x00\x00\x00"
    )
    # Case-sensitive: finding arrays preserve case, so they equal w4_search.
    assert w1_finding == w2_finding == w4_finding == w4_search
|
||||
|
||||
|
||||
def test_get_byte_arrays_for_search_chars_width_2_not_case_sensitive():
    """Widest search char needs 2 bytes (é); case-insensitive mode."""
    result = spacy.util.get_byte_arrays_for_search_chars("bféwfw", False)
    w1_search, w1_finding, w2_search, w2_finding, w4_search, w4_finding = result
    # é does not fit the 1-byte array, which therefore only holds the cased ASCII chars.
    assert w1_search == b"BFWbfw"
    expected_w1_finding = _correct_endianness(
        b"b\x00\x00\x00f\x00\x00\x00w\x00\x00\x00b\x00\x00\x00f\x00\x00\x00w\x00\x00\x00"
    )
    assert w1_finding == expected_w1_finding
    expected_w2_search = _correct_endianness(
        b"B\x00F\x00W\x00b\x00f\x00w\x00\xc9\x00\xe9\x00"
    )
    assert w2_search == expected_w2_search
    expected_wide_finding = _correct_endianness(
        b"b\x00\x00\x00f\x00\x00\x00w\x00\x00\x00b\x00\x00\x00f\x00\x00\x00w\x00\x00\x00\xe9\x00\x00\x00\xe9\x00\x00\x00"
    )
    assert w2_finding == expected_wide_finding
    assert w4_finding == expected_wide_finding
    expected_w4_search = _correct_endianness(
        b"B\x00\x00\x00F\x00\x00\x00W\x00\x00\x00b\x00\x00\x00f\x00\x00\x00w\x00\x00\x00\xc9\x00\x00\x00\xe9\x00\x00\x00"
    )
    assert w4_search == expected_w4_search
|
||||
|
||||
|
||||
def test_get_byte_arrays_for_search_chars_width_2_case_sensitive():
    """Widest search char needs 2 bytes (é); case-sensitive mode."""
    result = spacy.util.get_byte_arrays_for_search_chars("bféwfw", True)
    w1_search, w1_finding, w2_search, w2_finding, w4_search, w4_finding = result
    assert w1_search == b"bfw"
    assert w1_finding == _correct_endianness(b"b\x00\x00\x00f\x00\x00\x00w\x00\x00\x00")
    assert w2_search == _correct_endianness(b"b\x00f\x00w\x00\xe9\x00")
    # Case-sensitive, so the 4-byte search and finding arrays coincide.
    expected_wide = _correct_endianness(
        b"b\x00\x00\x00f\x00\x00\x00w\x00\x00\x00\xe9\x00\x00\x00"
    )
    assert w2_finding == expected_wide
    assert w4_finding == expected_wide
    assert w4_search == expected_wide
|
||||
|
||||
|
||||
def test_get_byte_arrays_for_search_chars_width_4_not_case_sensitive():
    """Widest search char needs 4 bytes (𐌞); case-insensitive mode."""
    result = spacy.util.get_byte_arrays_for_search_chars("bfé𐌞wf𐌞wÉ", False)
    w1_search, w1_finding, w2_search, w2_finding, w4_search, w4_finding = result
    assert w1_search == b"BFWbfw"
    assert w1_finding == _correct_endianness(
        b"b\x00\x00\x00f\x00\x00\x00w\x00\x00\x00b\x00\x00\x00f\x00\x00\x00w\x00\x00\x00"
    )
    # 𐌞 fits neither the 1- nor the 2-byte array.
    assert w2_search == _correct_endianness(
        b"B\x00F\x00W\x00b\x00f\x00w\x00\xc9\x00\xe9\x00"
    )
    assert w2_finding == _correct_endianness(
        b"b\x00\x00\x00f\x00\x00\x00w\x00\x00\x00b\x00\x00\x00f\x00\x00\x00w\x00\x00\x00\xe9\x00\x00\x00\xe9\x00\x00\x00"
    )
    # NOTE(review): the expected position of 𐌞 (\x1e\x03\x01\x00) within the sorted
    # arrays reflects little-endian byte ordering — presumably only exercised on
    # little-endian CI; verify on big-endian before relying on these literals.
    assert w4_search == _correct_endianness(
        b"\x1e\x03\x01\x00B\x00\x00\x00F\x00\x00\x00W\x00\x00\x00b\x00\x00\x00f\x00\x00\x00w\x00\x00\x00\xc9\x00\x00\x00\xe9\x00\x00\x00"
    )
    assert w4_finding == _correct_endianness(
        b"\x1e\x03\x01\x00b\x00\x00\x00f\x00\x00\x00w\x00\x00\x00b\x00\x00\x00f\x00\x00\x00w\x00\x00\x00\xe9\x00\x00\x00\xe9\x00\x00\x00"
    )
|
||||
|
||||
|
||||
def test_get_byte_arrays_for_search_chars_width_4_case_sensitive():
    """Widest search char needs 4 bytes (𐌞); case-sensitive mode."""
    result = spacy.util.get_byte_arrays_for_search_chars("bfé𐌞wf𐌞wÉ", True)
    w1_search, w1_finding, w2_search, w2_finding, w4_search, w4_finding = result
    assert w1_search == b"bfw"
    assert w1_finding == _correct_endianness(b"b\x00\x00\x00f\x00\x00\x00w\x00\x00\x00")
    assert w2_search == _correct_endianness(b"b\x00f\x00w\x00\xc9\x00\xe9\x00")
    assert w2_finding == _correct_endianness(
        b"b\x00\x00\x00f\x00\x00\x00w\x00\x00\x00\xc9\x00\x00\x00\xe9\x00\x00\x00"
    )
    # Case-sensitive, so the 4-byte search and finding arrays coincide.
    expected_wide = _correct_endianness(
        b"\x1e\x03\x01\x00b\x00\x00\x00f\x00\x00\x00w\x00\x00\x00\xc9\x00\x00\x00\xe9\x00\x00\x00"
    )
    assert w4_search == expected_wide
    assert w4_finding == expected_wide
|
102
spacy/util.py
102
spacy/util.py
|
@ -1735,3 +1735,105 @@ def all_equal(iterable):
|
|||
(or if the input is an empty sequence), False otherwise."""
|
||||
g = itertools.groupby(iterable)
|
||||
return next(g, True) and not next(g, False)
|
||||
|
||||
|
||||
def get_byte_arrays_for_search_chars(
    search_chars: str, case_sensitive: bool
) -> Tuple[bytes, bytes, bytes, bytes, bytes, bytes]:
    """
    The text of a spaCy document is stored as a Python-internal Unicode representation
    as defined by PEP 393. Each character in such a representation has the width of the
    longest character in the string, which is either 1, 2 or 4 bytes.

    This function supports the rich feature extractor. It returns search byte arrays with
    1-, 2- and 4-byte character widths that are used for comparison with each of the three
    representation types when searching document texts for search characters. Each byte array
    contains characters that are as wide or narrower than its own width; a byte array can
    ignore characters that are wider than its own width because a spaCy document with the
    corresponding representation width could never contain characters wider than that width.

    When characters corresponding to search characters are found within a spaCy token
    string, they are concatenated together and the resulting "finding byte arrays" are hashed.
    It is crucial that the characters in all finding byte arrays representing a given sequence of
    characters share the same width so that they all yield the same hash values. While it
    would be possible to use the narrowest possible width for the sequence like PEP 393 does,
    determining this would entail unnecessary processing. Instead, finding byte arrays always use
    a 4-byte width. Each of the three search byte arrays therefore has a corresponding finding
    byte array that is used to build up the finding byte arrays for specific document token strings.

    If *case_sensitive==False*, the lower- or uppercase counterparts of any characters that
    have case are added to the search byte arrays, and both the original character and its
    other-cased counterpart map to the lower-case version in the finding byte array.

    Returns a 6-tuple:
        (w1_search, w1_finding, w2_search, w2_finding, w4_search, w4_finding)
    """

    def encode(ch: str, width: int) -> bytes:
        """Encode the single character *ch* at *width* bytes (1, 2 or 4).

        The 2- and 4-byte encodings use the platform's native byte order;
        the BOM Python prepends happens to be exactly *width* bytes long,
        which is why slicing by *width* strips it.
        """
        if width == 4:
            return ch.encode("UTF-32")[width:]  # remove byte order mark
        elif width == 2:
            return ch.encode("UTF-16")[width:]  # remove byte order mark
        else:
            return ch.encode("UTF-8")

    def add_to_byte_arrays(
        search: List[bytes], finding: List[bytes], ch: str, width: int
    ) -> None:
        """Add the byte representations of *ch* with representation of width
        *width* to the two byte array lists, keeping the lists aligned so
        that search[i] corresponds to finding[i].
        """
        this_char_bytes = encode(ch, width)
        # Finding entries always use the 4-byte width (see function docstring).
        this_char_bytes_f = encode(ch, 4)
        if not case_sensitive and ch.islower():
            if this_char_bytes not in search:
                search.append(this_char_bytes)
                finding.append(this_char_bytes_f)
            upper_char_bytes = encode(ch.upper(), width)
            if upper_char_bytes not in search:
                search.append(upper_char_bytes)
                # The uppercase variant maps to the lowercase finding entry.
                finding.append(this_char_bytes_f)
        elif not case_sensitive and ch.isupper():
            lower_char_bytes = encode(ch.lower(), width)
            lower_char_bytes_f = encode(ch.lower(), 4)
            if this_char_bytes not in search:
                search.append(this_char_bytes)
                # The uppercase character also maps to its lowercase counterpart.
                finding.append(lower_char_bytes_f)
            if lower_char_bytes not in search:
                search.append(lower_char_bytes)
                finding.append(lower_char_bytes_f)
        elif this_char_bytes not in search:
            # Case-sensitive mode, or a caseless character: map to itself.
            search.append(this_char_bytes)
            finding.append(this_char_bytes_f)

    def get_ordered_raw_bytes(
        search: List[bytes], finding: List[bytes]
    ) -> Tuple[bytes, bytes]:
        """Flatten the two lists, ordering both by the entries in *search*
        using the native endianness of the platform.
        """
        # Sort both lists by the numeric byte values of the search entries so
        # that each finding entry stays aligned with its search entry.
        num_search = [list(entry) for entry in search]
        search = [entry for _, entry in sorted(zip(num_search, search))]
        finding = [entry for _, entry in sorted(zip(num_search, finding))]
        return b"".join(search), b"".join(finding)

    w1_search: List[bytes] = []
    w1_finding: List[bytes] = []
    w2_search: List[bytes] = []
    w2_finding: List[bytes] = []
    w4_search: List[bytes] = []
    w4_finding: List[bytes] = []
    for ch in search_chars:
        add_to_byte_arrays(w4_search, w4_finding, ch, 4)
        if ord(ch) >= 65536:
            continue  # outside the BMP: too wide for the 2-byte representation
        add_to_byte_arrays(w2_search, w2_finding, ch, 2)
        if ord(ch) >= 128:
            continue  # non-ASCII: too wide for the 1-byte representation
        add_to_byte_arrays(w1_search, w1_finding, ch, 1)
    return (
        get_ordered_raw_bytes(w1_search, w1_finding)
        + get_ordered_raw_bytes(w2_search, w2_finding)
        + get_ordered_raw_bytes(w4_search, w4_finding)
    )
|
||||
|
|
Loading…
Reference in New Issue
Block a user