Add search char byte array feature
This commit is contained in:
parent 1e9176f9c5
commit c116e11942

145  spacy/tests/test_util.py  Normal file
@@ -0,0 +1,145 @@
import sys

import spacy


def _correct_endianness(littleendian: bytes, width: int = 2) -> bytes:
    """Return *littleendian*, a little-endian sequence of *width*-byte code
    units, in the native byte order of the platform."""
    if sys.byteorder == "little":
        return littleendian
    output = bytearray()
    for idx in range(0, len(littleendian), width):
        output.extend(reversed(littleendian[idx : idx + width]))
    return bytes(output)
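
A quick illustration of the helper (an annotation on this page, not part of the committed file; the example bytes are arbitrary):

    raw = b"T\x00b\x00"  # "Tb" as little-endian 2-byte code units
    # Unchanged on little-endian platforms; each code unit is reversed on
    # big-endian ones.
    assert _correct_endianness(raw) == (
        raw if sys.byteorder == "little" else b"\x00T\x00b"
    )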


def test_get_byte_arrays_for_search_chars_width_1_not_case_sensitive():
    (
        w1_search,
        w1_finding,
        w2_search,
        w2_finding,
        w4_search,
        w4_finding,
    ) = spacy.util.get_byte_arrays_for_search_chars("bfEWfwe", False)
    assert w1_search == b"BEFWbefw"
    assert w2_search == _correct_endianness(b"B\x00E\x00F\x00W\x00b\x00e\x00f\x00w\x00")
    assert w4_search == _correct_endianness(
        b"B\x00\x00\x00E\x00\x00\x00F\x00\x00\x00W\x00\x00\x00b\x00\x00\x00e\x00\x00\x00f\x00\x00\x00w\x00\x00\x00",
        width=4,
    )
    assert w1_finding == w2_finding == w4_finding == w4_search.lower()
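
Where the width-1 expectation comes from (again an illustrative annotation, not committed code): with case_sensitive=False every cased ASCII character contributes both of its case variants, de-duplicated and sorted by code point.

    chars = "bfEWfwe"
    expected = sorted({c for ch in chars for c in (ch.lower(), ch.upper())})
    assert "".join(expected) == "BEFWbefw"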


def test_get_byte_arrays_for_search_chars_width_1_case_sensitive():
    (
        w1_search,
        w1_finding,
        w2_search,
        w2_finding,
        w4_search,
        w4_finding,
    ) = spacy.util.get_byte_arrays_for_search_chars("bfewT", True)
    assert w1_search == b"Tbefw"
    assert w2_search == _correct_endianness(b"T\x00b\x00e\x00f\x00w\x00")
    assert w4_search == _correct_endianness(
        b"T\x00\x00\x00b\x00\x00\x00e\x00\x00\x00f\x00\x00\x00w\x00\x00\x00",
        width=4,
    )
    assert w1_finding == w2_finding == w4_finding == w4_search


def test_get_byte_arrays_for_search_chars_width_2_not_case_sensitive():
    (
        w1_search,
        w1_finding,
        w2_search,
        w2_finding,
        w4_search,
        w4_finding,
    ) = spacy.util.get_byte_arrays_for_search_chars("bféwfw", False)
    assert w1_search == b"BFWbfw"
    assert w1_finding == _correct_endianness(
        b"b\x00\x00\x00f\x00\x00\x00w\x00\x00\x00b\x00\x00\x00f\x00\x00\x00w\x00\x00\x00",
        width=4,
    )
    assert w2_search == _correct_endianness(
        b"B\x00F\x00W\x00b\x00f\x00w\x00\xc9\x00\xe9\x00"
    )
    assert (
        w2_finding
        == w4_finding
        == _correct_endianness(
            b"b\x00\x00\x00f\x00\x00\x00w\x00\x00\x00b\x00\x00\x00f\x00\x00\x00w\x00\x00\x00\xe9\x00\x00\x00\xe9\x00\x00\x00",
            width=4,
        )
    )
    assert w4_search == _correct_endianness(
        b"B\x00\x00\x00F\x00\x00\x00W\x00\x00\x00b\x00\x00\x00f\x00\x00\x00w\x00\x00\x00\xc9\x00\x00\x00\xe9\x00\x00\x00",
        width=4,
    )


def test_get_byte_arrays_for_search_chars_width_2_case_sensitive():
    (
        w1_search,
        w1_finding,
        w2_search,
        w2_finding,
        w4_search,
        w4_finding,
    ) = spacy.util.get_byte_arrays_for_search_chars("bféwfw", True)
    assert w1_search == b"bfw"
    assert w1_finding == _correct_endianness(
        b"b\x00\x00\x00f\x00\x00\x00w\x00\x00\x00", width=4
    )
    assert w2_search == _correct_endianness(b"b\x00f\x00w\x00\xe9\x00")
    assert (
        w2_finding
        == w4_finding
        == w4_search
        == _correct_endianness(
            b"b\x00\x00\x00f\x00\x00\x00w\x00\x00\x00\xe9\x00\x00\x00", width=4
        )
    )


def test_get_byte_arrays_for_search_chars_width_4_not_case_sensitive():
    (
        w1_search,
        w1_finding,
        w2_search,
        w2_finding,
        w4_search,
        w4_finding,
    ) = spacy.util.get_byte_arrays_for_search_chars("bfé𐌞wf𐌞wÉ", False)
    assert w1_search == b"BFWbfw"
    assert w1_finding == _correct_endianness(
        b"b\x00\x00\x00f\x00\x00\x00w\x00\x00\x00b\x00\x00\x00f\x00\x00\x00w\x00\x00\x00",
        width=4,
    )
    assert w2_search == _correct_endianness(
        b"B\x00F\x00W\x00b\x00f\x00w\x00\xc9\x00\xe9\x00"
    )
    assert w2_finding == _correct_endianness(
        b"b\x00\x00\x00f\x00\x00\x00w\x00\x00\x00b\x00\x00\x00f\x00\x00\x00w\x00\x00\x00\xe9\x00\x00\x00\xe9\x00\x00\x00",
        width=4,
    )
    assert w4_search == _correct_endianness(
        b"\x1e\x03\x01\x00B\x00\x00\x00F\x00\x00\x00W\x00\x00\x00b\x00\x00\x00f\x00\x00\x00w\x00\x00\x00\xc9\x00\x00\x00\xe9\x00\x00\x00",
        width=4,
    )
    assert w4_finding == _correct_endianness(
        b"\x1e\x03\x01\x00b\x00\x00\x00f\x00\x00\x00w\x00\x00\x00b\x00\x00\x00f\x00\x00\x00w\x00\x00\x00\xe9\x00\x00\x00\xe9\x00\x00\x00",
        width=4,
    )


def test_get_byte_arrays_for_search_chars_width_4_case_sensitive():
    (
        w1_search,
        w1_finding,
        w2_search,
        w2_finding,
        w4_search,
        w4_finding,
    ) = spacy.util.get_byte_arrays_for_search_chars("bfé𐌞wf𐌞wÉ", True)
    assert w1_search == b"bfw"
    assert w1_finding == _correct_endianness(
        b"b\x00\x00\x00f\x00\x00\x00w\x00\x00\x00", width=4
    )
    assert w2_search == _correct_endianness(b"b\x00f\x00w\x00\xc9\x00\xe9\x00")
    assert w2_finding == _correct_endianness(
        b"b\x00\x00\x00f\x00\x00\x00w\x00\x00\x00\xc9\x00\x00\x00\xe9\x00\x00\x00",
        width=4,
    )
    assert (
        w4_search
        == w4_finding
        == _correct_endianness(
            b"\x1e\x03\x01\x00b\x00\x00\x00f\x00\x00\x00w\x00\x00\x00\xc9\x00\x00\x00\xe9\x00\x00\x00",
            width=4,
        )
    )
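
The new tests can be run on their own with a standard pytest invocation, e.g. python -m pytest spacy/tests/test_util.py -k get_byte_arrays_for_search_chars (assuming a development install of this branch).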
102  spacy/util.py

@@ -1735,3 +1735,105 @@ def all_equal(iterable):
    (or if the input is an empty sequence), False otherwise."""
    g = itertools.groupby(iterable)
    return next(g, True) and not next(g, False)


def get_byte_arrays_for_search_chars(
    search_chars: str, case_sensitive: bool
) -> Tuple[bytes, bytes, bytes, bytes, bytes, bytes]:
    """
    The text of a spaCy document is stored in a Python-internal Unicode
    representation as defined by PEP 393. Each character in such a
    representation has the width of the widest character in the string: either
    1, 2 or 4 bytes.

    This function supports the rich feature extractor. It returns search byte
    arrays with 1-, 2- and 4-byte character widths that are compared with each
    of the three representation types when searching document texts for search
    characters. Each byte array contains only characters that are as narrow as
    or narrower than its own width; a byte array can ignore characters that
    are wider than its own width, because a spaCy document with the
    corresponding representation width could never contain them.

    When characters corresponding to search characters are found within a
    spaCy token string, they are concatenated, and the resulting "finding byte
    arrays" are hashed. It is crucial that the characters in all finding byte
    arrays representing a given sequence of characters share the same width,
    so that they all yield the same hash values. While it would be possible to
    use the narrowest possible width for each sequence, as PEP 393 does,
    determining it would entail unnecessary processing. Instead, finding byte
    arrays always use a 4-byte width. Each of the three search byte arrays
    therefore has a corresponding finding byte array that is used to build up
    the finding byte arrays for specific document token strings.

    If *case_sensitive==False*, the lower- or uppercase counterparts of any
    characters that have case are added to the search byte arrays, and both
    the original character and its other-cased counterpart map to the
    lowercase version in the finding byte array.
    """
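
    # Illustrative note (annotation, not in the original commit): PEP 393
    # picks the storage width from the widest code point in the string, e.g.
    #   "spa"   -> 1 byte per character (all code points < 256)
    #   "spaß"  -> 1 byte per character (U+00DF still fits in one byte)
    #   "spaΣ"  -> 2 bytes per character (U+03A3 needs two bytes)
    #   "spa𐌞" -> 4 bytes per character (U+1031E needs four bytes)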

    def encode(ch: str, width: int) -> bytes:
        """
        ch: a single character
        width: the width in bytes of the character encoding to use
        """
        if width == 4:
            return ch.encode("UTF-32")[width:]  # remove byte order mark
        elif width == 2:
            return ch.encode("UTF-16")[width:]  # remove byte order mark
        else:
            return ch.encode("UTF-8")
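
    # Worked example (annotation, not in the original commit): on a
    # little-endian platform "é".encode("UTF-16") == b"\xff\xfe\xe9\x00";
    # slicing off the first two bytes removes the byte order mark and leaves
    # the bare native-order code unit b"\xe9\x00".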

    def add_to_byte_arrays(
        search: List[bytes], finding: List[bytes], ch: str, width: int
    ) -> None:
        """Add the byte representations of *ch* with representation width
        *width* to the two byte array lists."""
        this_char_bytes = encode(ch, width)
        this_char_bytes_f = encode(ch, 4)
        if not case_sensitive and ch.islower():
            if this_char_bytes not in search:
                search.append(this_char_bytes)
                finding.append(this_char_bytes_f)
            upper_char_bytes = encode(ch.upper(), width)
            if upper_char_bytes not in search:
                search.append(upper_char_bytes)
                finding.append(this_char_bytes_f)
        elif not case_sensitive and ch.isupper():
            lower_char_bytes = encode(ch.lower(), width)
            lower_char_bytes_f = encode(ch.lower(), 4)
            if this_char_bytes not in search:
                search.append(this_char_bytes)
                finding.append(lower_char_bytes_f)
            if lower_char_bytes not in search:
                search.append(lower_char_bytes)
                finding.append(lower_char_bytes_f)
        elif this_char_bytes not in search:
            search.append(this_char_bytes)
            finding.append(this_char_bytes_f)
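
    # Example of the case-insensitive mapping (annotation, not in the original
    # commit): for ch == "E" with case_sensitive == False, both b"E" and b"e"
    # are appended to *search*, and both corresponding *finding* entries are
    # the 4-byte encoding of "e", so "E" and "e" found in a document later
    # hash to the same value.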

    def get_ordered_raw_bytes(
        search: List[bytes], finding: List[bytes]
    ) -> Tuple[bytes, bytes]:
        """Flatten the two lists, ordering both by the entries in *search*
        using the native endianness of the platform."""
        num_search = [list(entry) for entry in search]
        search = [entry for _, entry in sorted(zip(num_search, search))]
        finding = [entry for _, entry in sorted(zip(num_search, finding))]
        return b"".join(search), b"".join(finding)
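
    # Sorting note (annotation, not in the original commit): converting each
    # bytes entry to a list of ints and sorting the zipped pairs reorders
    # *finding* in lockstep with *search*; search == [b"b", b"B"], for
    # example, flattens to b"Bb" while the finding entries move with their
    # partners.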

    w1_search: List[bytes] = []
    w1_finding: List[bytes] = []
    w2_search: List[bytes] = []
    w2_finding: List[bytes] = []
    w4_search: List[bytes] = []
    w4_finding: List[bytes] = []
    for ch in search_chars:
        add_to_byte_arrays(w4_search, w4_finding, ch, 4)
        if ord(ch) >= 65536:
            continue
        add_to_byte_arrays(w2_search, w2_finding, ch, 2)
        if ord(ch) >= 128:
            continue
        add_to_byte_arrays(w1_search, w1_finding, ch, 1)
    return (
        get_ordered_raw_bytes(w1_search, w1_finding)
        + get_ordered_raw_bytes(w2_search, w2_finding)
        + get_ordered_raw_bytes(w4_search, w4_finding)
    )
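
A minimal usage sketch (illustrative; assumes this branch is installed so that the function is importable from spacy.util):

    from spacy.util import get_byte_arrays_for_search_chars

    (
        w1_search, w1_finding,
        w2_search, w2_finding,
        w4_search, w4_finding,
    ) = get_byte_arrays_for_search_chars("ab", case_sensitive=True)
    # Both characters are ASCII, so all three widths include them; the 1-byte
    # search array is simply the sorted raw bytes.
    assert w1_search == b"ab"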