Add search char byte array feature

richardpaulhudson 2022-10-14 17:03:52 +02:00
parent 1e9176f9c5
commit c116e11942
2 changed files with 247 additions and 0 deletions

spacy/tests/test_util.py (new file, 145 lines)
@@ -0,0 +1,145 @@
import sys

import spacy


def _correct_endianness(littleendian: bytes, width: int) -> bytes:
    """Reverse the bytes of each *width*-byte character so that expected values
    written in little-endian byte order also compare equal on big-endian
    platforms. On little-endian platforms the input is returned unchanged.
    """
    if sys.byteorder == "little":
        return littleendian
    output = bytearray()
    for idx in range(0, len(littleendian), width):
        output.extend(reversed(littleendian[idx : idx + width]))
    return bytes(output)


def test_get_byte_arrays_for_search_chars_width_1_not_case_sensitive():
    (
        w1_search,
        w1_finding,
        w2_search,
        w2_finding,
        w4_search,
        w4_finding,
    ) = spacy.util.get_byte_arrays_for_search_chars("bfEWfwe", False)
    assert w1_search == b"BEFWbefw"
    assert w2_search == _correct_endianness(
        b"B\x00E\x00F\x00W\x00b\x00e\x00f\x00w\x00", 2
    )
    assert w4_search == _correct_endianness(
        b"B\x00\x00\x00E\x00\x00\x00F\x00\x00\x00W\x00\x00\x00b\x00\x00\x00e\x00\x00\x00f\x00\x00\x00w\x00\x00\x00",
        4,
    )
    assert w1_finding == w2_finding == w4_finding == w4_search.lower()


def test_get_byte_arrays_for_search_chars_width_1_case_sensitive():
    (
        w1_search,
        w1_finding,
        w2_search,
        w2_finding,
        w4_search,
        w4_finding,
    ) = spacy.util.get_byte_arrays_for_search_chars("bfewT", True)
    assert w1_search == b"Tbefw"
    assert w2_search == _correct_endianness(b"T\x00b\x00e\x00f\x00w\x00", 2)
    assert w4_search == _correct_endianness(
        b"T\x00\x00\x00b\x00\x00\x00e\x00\x00\x00f\x00\x00\x00w\x00\x00\x00", 4
    )
    assert w1_finding == w2_finding == w4_finding == w4_search


def test_get_byte_arrays_for_search_chars_width_2_not_case_sensitive():
    (
        w1_search,
        w1_finding,
        w2_search,
        w2_finding,
        w4_search,
        w4_finding,
    ) = spacy.util.get_byte_arrays_for_search_chars("bféwfw", False)
    assert w1_search == b"BFWbfw"
    assert w1_finding == _correct_endianness(
        b"b\x00\x00\x00f\x00\x00\x00w\x00\x00\x00b\x00\x00\x00f\x00\x00\x00w\x00\x00\x00",
        4,
    )
    assert w2_search == _correct_endianness(
        b"B\x00F\x00W\x00b\x00f\x00w\x00\xc9\x00\xe9\x00", 2
    )
    assert (
        w2_finding
        == w4_finding
        == _correct_endianness(
            b"b\x00\x00\x00f\x00\x00\x00w\x00\x00\x00b\x00\x00\x00f\x00\x00\x00w\x00\x00\x00\xe9\x00\x00\x00\xe9\x00\x00\x00",
            4,
        )
    )
    assert w4_search == _correct_endianness(
        b"B\x00\x00\x00F\x00\x00\x00W\x00\x00\x00b\x00\x00\x00f\x00\x00\x00w\x00\x00\x00\xc9\x00\x00\x00\xe9\x00\x00\x00",
        4,
    )


def test_get_byte_arrays_for_search_chars_width_2_case_sensitive():
    (
        w1_search,
        w1_finding,
        w2_search,
        w2_finding,
        w4_search,
        w4_finding,
    ) = spacy.util.get_byte_arrays_for_search_chars("bféwfw", True)
    assert w1_search == b"bfw"
    assert w1_finding == _correct_endianness(
        b"b\x00\x00\x00f\x00\x00\x00w\x00\x00\x00", 4
    )
    assert w2_search == _correct_endianness(b"b\x00f\x00w\x00\xe9\x00", 2)
    assert (
        w2_finding
        == w4_finding
        == w4_search
        == _correct_endianness(
            b"b\x00\x00\x00f\x00\x00\x00w\x00\x00\x00\xe9\x00\x00\x00", 4
        )
    )


def test_get_byte_arrays_for_search_chars_width_4_not_case_sensitive():
    (
        w1_search,
        w1_finding,
        w2_search,
        w2_finding,
        w4_search,
        w4_finding,
    ) = spacy.util.get_byte_arrays_for_search_chars("bfé𐌞wf𐌞wÉ", False)
    assert w1_search == b"BFWbfw"
    assert w1_finding == _correct_endianness(
        b"b\x00\x00\x00f\x00\x00\x00w\x00\x00\x00b\x00\x00\x00f\x00\x00\x00w\x00\x00\x00",
        4,
    )
    assert w2_search == _correct_endianness(
        b"B\x00F\x00W\x00b\x00f\x00w\x00\xc9\x00\xe9\x00", 2
    )
    assert w2_finding == _correct_endianness(
        b"b\x00\x00\x00f\x00\x00\x00w\x00\x00\x00b\x00\x00\x00f\x00\x00\x00w\x00\x00\x00\xe9\x00\x00\x00\xe9\x00\x00\x00",
        4,
    )
    assert w4_search == _correct_endianness(
        b"\x1e\x03\x01\x00B\x00\x00\x00F\x00\x00\x00W\x00\x00\x00b\x00\x00\x00f\x00\x00\x00w\x00\x00\x00\xc9\x00\x00\x00\xe9\x00\x00\x00",
        4,
    )
    assert w4_finding == _correct_endianness(
        b"\x1e\x03\x01\x00b\x00\x00\x00f\x00\x00\x00w\x00\x00\x00b\x00\x00\x00f\x00\x00\x00w\x00\x00\x00\xe9\x00\x00\x00\xe9\x00\x00\x00",
        4,
    )


def test_get_byte_arrays_for_search_chars_width_4_case_sensitive():
    (
        w1_search,
        w1_finding,
        w2_search,
        w2_finding,
        w4_search,
        w4_finding,
    ) = spacy.util.get_byte_arrays_for_search_chars("bfé𐌞wf𐌞wÉ", True)
    assert w1_search == b"bfw"
    assert w1_finding == _correct_endianness(
        b"b\x00\x00\x00f\x00\x00\x00w\x00\x00\x00", 4
    )
    assert w2_search == _correct_endianness(b"b\x00f\x00w\x00\xc9\x00\xe9\x00", 2)
    assert w2_finding == _correct_endianness(
        b"b\x00\x00\x00f\x00\x00\x00w\x00\x00\x00\xc9\x00\x00\x00\xe9\x00\x00\x00",
        4,
    )
    assert (
        w4_search
        == w4_finding
        == _correct_endianness(
            b"\x1e\x03\x01\x00b\x00\x00\x00f\x00\x00\x00w\x00\x00\x00\xc9\x00\x00\x00\xe9\x00\x00\x00",
            4,
        )
    )

spacy/util.py

@@ -1735,3 +1735,105 @@ def all_equal(iterable):
    (or if the input is an empty sequence), False otherwise."""
    g = itertools.groupby(iterable)
    return next(g, True) and not next(g, False)


def get_byte_arrays_for_search_chars(
    search_chars: str, case_sensitive: bool
) -> Tuple[bytes, bytes, bytes, bytes, bytes, bytes]:
    """
    The text of a spaCy document is stored in the Python-internal Unicode
    representation defined by PEP 393. Each character in such a representation
    has the width of the longest character in the string, which is either 1, 2
    or 4 bytes.

    This function supports the rich feature extractor. It returns search byte
    arrays with 1-, 2- and 4-byte character widths that are used for comparison
    with each of the three representation types when searching document texts
    for search characters. Each byte array contains characters that are as wide
    as or narrower than its own width; a byte array can ignore characters that
    are wider than its own width because a spaCy document with the
    corresponding representation width could never contain them.

    When characters corresponding to search characters are found within a spaCy
    token string, they are concatenated, and the resulting "finding byte
    arrays" are hashed. It is crucial that the characters in all finding byte
    arrays representing a given sequence of characters share the same width, so
    that they all yield the same hash values. While it would be possible to use
    the narrowest possible width for the sequence as PEP 393 does, determining
    this would entail unnecessary processing. Instead, finding byte arrays
    always use a 4-byte width. Each of the three search byte arrays therefore
    has a corresponding finding byte array that is used to build up the finding
    byte arrays for specific document token strings.

    If *case_sensitive==False*, the lower- or uppercase counterparts of any
    characters that have case are added to the search byte arrays, and both the
    original character and its other-cased counterpart map to the lowercase
    version in the finding byte array.
    """

    def encode(ch: str, width: int) -> bytes:
        """
        ch: a single character
        width: the width in bytes of the character encoding to use
        """
        if width == 4:
            return ch.encode("UTF-32")[width:]  # remove byte order mark
        elif width == 2:
            return ch.encode("UTF-16")[width:]  # remove byte order mark
        else:
            return ch.encode("UTF-8")

    def add_to_byte_arrays(
        search: List[bytes], finding: List[bytes], ch: str, width: int
    ) -> None:
        """Add the byte representations of *ch* with representation width
        *width* to the two byte array lists.
        """
        this_char_bytes = encode(ch, width)
        this_char_bytes_f = encode(ch, 4)
        if not case_sensitive and ch.islower():
            if this_char_bytes not in search:
                search.append(this_char_bytes)
                finding.append(this_char_bytes_f)
            upper_char_bytes = encode(ch.upper(), width)
            if upper_char_bytes not in search:
                search.append(upper_char_bytes)
                finding.append(this_char_bytes_f)
        elif not case_sensitive and ch.isupper():
            lower_char_bytes = encode(ch.lower(), width)
            lower_char_bytes_f = encode(ch.lower(), 4)
            if this_char_bytes not in search:
                search.append(this_char_bytes)
                finding.append(lower_char_bytes_f)
            if lower_char_bytes not in search:
                search.append(lower_char_bytes)
                finding.append(lower_char_bytes_f)
        elif this_char_bytes not in search:
            search.append(this_char_bytes)
            finding.append(this_char_bytes_f)

    def get_ordered_raw_bytes(
        search: List[bytes], finding: List[bytes]
    ) -> Tuple[bytes, bytes]:
        """Flatten the two lists, ordering both by the entries in *search*
        using the native endianness of the platform.
        """
        num_search = [list(entry) for entry in search]
        search = [entry for _, entry in sorted(zip(num_search, search))]
        finding = [entry for _, entry in sorted(zip(num_search, finding))]
        return b"".join(search), b"".join(finding)

    w1_search: List[bytes] = []
    w1_finding: List[bytes] = []
    w2_search: List[bytes] = []
    w2_finding: List[bytes] = []
    w4_search: List[bytes] = []
    w4_finding: List[bytes] = []
    for ch in search_chars:
        add_to_byte_arrays(w4_search, w4_finding, ch, 4)
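        # Characters outside the Basic Multilingual Plane (>= 0x10000) cannot
        # occur in documents with 1- or 2-byte PEP 393 representations.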
        if ord(ch) >= 65536:
            continue
        add_to_byte_arrays(w2_search, w2_finding, ch, 2)
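        # Characters above U+007F do not fit in a single UTF-8 byte, so they
        # are excluded from the 1-byte arrays.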
        if ord(ch) >= 128:
            continue
        add_to_byte_arrays(w1_search, w1_finding, ch, 1)
    return (
        get_ordered_raw_bytes(w1_search, w1_finding)
        + get_ordered_raw_bytes(w2_search, w2_finding)
        + get_ordered_raw_bytes(w4_search, w4_finding)
    )