mirror of https://github.com/explosion/spaCy.git

Commit 2707d30ce0 ("Intermediate state"), parent 356a341096.
@@ -1,4 +1,5 @@
 from typing import List, Optional, Callable, Tuple
+from ..util import get_byte_arrays_for_search_chars
 from thinc.types import Ints2d
 from thinc.api import Model, registry
@@ -16,22 +17,30 @@ def RichFeatureExtractor(
     suff_search_chars: Optional[str] = None,
     suff_search_lengths: Optional[List[int]] = None,
 ) -> Model[List[Doc], List[Ints2d]]:
+    if pref_search_chars is not None:
+        pref_search, pref_ref = get_byte_arrays_for_search_chars(pref_search_chars, case_sensitive)
+    else:
+        pref_search, pref_ref = bytes(), bytes()
+    if suff_search_chars is not None:
+        suff_search, suff_ref = get_byte_arrays_for_search_chars(suff_search_chars, case_sensitive)
+    else:
+        suff_search, suff_ref = bytes(), bytes()
     return Model(
         "extract_character_combination_hashes",
         forward,
         attrs={
             "case_sensitive": case_sensitive,
             "pref_lengths": pref_lengths if pref_lengths is not None else [],
-            "pref_search_chars": pref_search_chars
-            if pref_search_chars is not None
-            else "",
+            "suff_lengths": suff_lengths if suff_lengths is not None else [],
+            "pref_search": pref_search,
+            "pref_ref": pref_ref,
+            "pref_s_char_l": len(pref_search) // 4 if pref_search_chars is not None else 0,
             "pref_search_lengths": pref_search_lengths
             if pref_search_lengths is not None
             else [],
-            "suff_lengths": suff_lengths if suff_lengths is not None else [],
-            "suff_search_chars": suff_search_chars
-            if suff_search_chars is not None
-            else "",
+            "suff_search": suff_search,
+            "suff_ref": suff_ref,
+            "suff_s_char_l": len(suff_search) // 4 if suff_search_chars is not None else 0,
             "suff_search_lengths": suff_search_lengths
             if suff_search_lengths is not None
             else [],
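A note on the attrs block above: get_byte_arrays_for_search_chars returns the UTF-32LE encoded (search, ref) pair, and the *_s_char_l attrs record the number of search characters rather than the byte length. Since every UTF-32LE code point occupies exactly four bytes, integer-dividing the byte length by four recovers the character count. A minimal sketch of that relationship in plain Python (the variable names here are illustrative only):

    search_chars = "xy"  # hypothetical configuration value
    search = search_chars.encode("UTF-32LE")  # four bytes per character
    n_search_chars = len(search) // 4  # what "pref_s_char_l" stores
    assert n_search_chars == 2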
@@ -45,10 +54,14 @@ def forward(
     ops = model.ops
     case_sensitive: bool = model.attrs["case_sensitive"]
     pref_lengths: List[int] = model.attrs["pref_lengths"]
-    pref_search_chars: str = model.attrs["pref_search_chars"]
-    pref_search_lengths: List[int] = model.attrs["pref_search_lengths"]
     suff_lengths: List[int] = model.attrs["suff_lengths"]
-    suff_search_chars: str = model.attrs["suff_search_chars"]
+    pref_search: bytes = model.attrs["pref_search"]
+    pref_ref: bytes = model.attrs["pref_ref"]
+    pref_s_char_l: int = model.attrs["pref_s_char_l"]
+    pref_search_lengths: List[int] = model.attrs["pref_search_lengths"]
+    suff_search: bytes = model.attrs["suff_search"]
+    suff_ref: bytes = model.attrs["suff_ref"]
+    suff_s_char_l: int = model.attrs["suff_s_char_l"]
     suff_search_lengths: List[int] = model.attrs["suff_search_lengths"]
     features: List[Ints2d] = []
     for doc in docs:
@@ -56,9 +69,13 @@ def forward(
             case_sensitive=case_sensitive,
             pref_lengths=pref_lengths,
             suff_lengths=suff_lengths,
-            pref_search_chars=pref_search_chars,
+            pref_search=pref_search,
+            pref_ref=pref_ref,
+            pref_s_char_l=pref_s_char_l,
             pref_search_lengths=pref_search_lengths,
-            suff_search_chars=suff_search_chars,
+            suff_search=suff_search,
+            suff_ref=suff_ref,
+            suff_s_char_l=suff_s_char_l,
             suff_search_lengths=suff_search_lengths,
         )
         features.append(ops.asarray2i(hashes))
@@ -14,6 +14,7 @@ from spacy.lang.xx import MultiLanguage
 from spacy.language import Language
 from spacy.lexeme import Lexeme
 from spacy.tokens import Doc, Span, SpanGroup, Token
+from spacy.util import get_byte_arrays_for_search_chars
 from spacy.vocab import Vocab

 from .test_underscore import clean_underscore  # noqa: F401
@@ -994,7 +995,8 @@ def test_doc_spans_setdefault(en_tokenizer):


 def _get_unsigned_32_bit_hash(input: str) -> int:
-    working_hash = hash(input.encode("UTF-16")[2:])
+    input = input.replace(" ", "\x00")
+    working_hash = hash(input.encode("UTF-32LE"))
     if working_hash < 0:
         working_hash = working_hash + (2 << 31)
     return working_hash
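The rewritten helper folds Python's signed hash() into an unsigned 32-bit value: 2 << 31 equals 2**32, so adding it to a negative value yields the two's-complement unsigned equivalent. A quick self-contained illustration:

    assert (2 << 31) == 2 ** 32
    h = -1  # example negative hash value
    if h < 0:
        h = h + (2 << 31)
    assert h == 0xFFFFFFFF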
@@ -1004,15 +1006,21 @@ def _get_unsigned_32_bit_hash(input: str) -> int:
 def test_get_character_combination_hashes_good_case(en_tokenizer, case_sensitive):

     doc = en_tokenizer("spaCy✨ and Prodigy")
+    suff_search, suff_ref = get_byte_arrays_for_search_chars("xx✨rp", case_sensitive)
     hashes = doc.get_character_combination_hashes(
-        case_sensitive=case_sensitive,
+        cs=case_sensitive,
         pref_lengths=[1, 4, 3],
         suff_lengths=[2, 3, 4, 5],
-        pref_search_chars="",
+        pref_search=bytes(),
+        pref_ref=bytes(),
+        pref_s_char_l=0,
         pref_search_lengths=[2],
-        suff_search_chars="xx✨rp",
-        suff_search_lengths=[2, 1],
+        suff_search=suff_search,
+        suff_ref=suff_ref,
+        suff_s_char_l=5 if case_sensitive else 9,
+        suff_search_lengths=[2, 1],
     )

     assert hashes[0][0] == _get_unsigned_32_bit_hash("s")
     assert hashes[0][1] == _get_unsigned_32_bit_hash(
         "spaC" if case_sensitive else "spac"
@@ -1031,22 +1039,22 @@ def test_get_character_combination_hashes_good_case(en_tokenizer, case_sensitive
     assert hashes[0][8] == _get_unsigned_32_bit_hash("p ")
     assert hashes[0][9] == _get_unsigned_32_bit_hash("p")
     assert hashes[1][0] == _get_unsigned_32_bit_hash("✨")
-    assert hashes[1][1] == _get_unsigned_32_bit_hash("✨")
-    assert hashes[1][2] == _get_unsigned_32_bit_hash("✨")
-    assert hashes[1][3] == _get_unsigned_32_bit_hash("✨")
-    assert hashes[1][4] == _get_unsigned_32_bit_hash("✨")
-    assert hashes[1][5] == _get_unsigned_32_bit_hash("✨")
-    assert hashes[1][6] == _get_unsigned_32_bit_hash("✨")
+    assert hashes[1][1] == _get_unsigned_32_bit_hash("✨ ")
+    assert hashes[1][2] == _get_unsigned_32_bit_hash("✨ ")
+    assert hashes[1][3] == _get_unsigned_32_bit_hash(" ✨")
+    assert hashes[1][4] == _get_unsigned_32_bit_hash(" ✨")
+    assert hashes[1][5] == _get_unsigned_32_bit_hash(" ✨")
+    assert hashes[1][6] == _get_unsigned_32_bit_hash(" ✨")
     assert hashes[1][7] == _get_unsigned_32_bit_hash(" ")
     assert hashes[1][8] == _get_unsigned_32_bit_hash("✨ ")
     assert hashes[1][9] == _get_unsigned_32_bit_hash("✨")
     assert hashes[2][0] == _get_unsigned_32_bit_hash("a")
-    assert hashes[2][1] == _get_unsigned_32_bit_hash("and")
+    assert hashes[2][1] == _get_unsigned_32_bit_hash("and ")
     assert hashes[2][2] == _get_unsigned_32_bit_hash("and")
     assert hashes[2][3] == _get_unsigned_32_bit_hash("nd")
     assert hashes[2][4] == _get_unsigned_32_bit_hash("and")
-    assert hashes[2][5] == _get_unsigned_32_bit_hash("and")
-    assert hashes[2][6] == _get_unsigned_32_bit_hash("and")
+    assert hashes[2][5] == _get_unsigned_32_bit_hash(" and")
+    assert hashes[2][6] == _get_unsigned_32_bit_hash(" and")
     assert hashes[2][7] == _get_unsigned_32_bit_hash(" ")
     assert hashes[2][8] == _get_unsigned_32_bit_hash(" ")
     assert hashes[2][9] == _get_unsigned_32_bit_hash(" ")
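The expected strings in the updated asserts use spaces where the Cython buffers are zero-padded: _get_unsigned_32_bit_hash (above) replaces each space with "\x00" before hashing, so "✨ " stands for the sparkles character followed by padding zeros. A sketch of the convention, assuming the same helper logic:

    def expected_bytes(s: str) -> bytes:
        # mirrors the replace() step in _get_unsigned_32_bit_hash
        return s.replace(" ", "\x00").encode("UTF-32LE")

    assert expected_bytes("✨ ") == "✨".encode("UTF-32LE") + b"\x00\x00\x00\x00"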
@@ -1076,17 +1084,23 @@ def test_get_character_combination_hashes_good_case(en_tokenizer, case_sensitive

 def test_get_character_combination_hashes_4_byte_char_at_end(en_tokenizer):
     doc = en_tokenizer("and𐌞")
+    suff_search, suff_ref = get_byte_arrays_for_search_chars("a", True)
     hashes = doc.get_character_combination_hashes(
-        case_sensitive=True,
+        cs=True,
         pref_lengths=[],
         suff_lengths=[1, 2, 3],
-        pref_search_chars="",
+        pref_search=bytes(),
+        pref_ref=bytes(),
+        pref_s_char_l=0,
         pref_search_lengths=[],
-        suff_search_chars="a",
+        suff_search=suff_search,
+        suff_ref=suff_ref,
+        suff_s_char_l=1,
         suff_search_lengths=[1],
     )
-    assert hashes[0][1] == _get_unsigned_32_bit_hash("𐌞")
-    assert hashes[0][2] == _get_unsigned_32_bit_hash("d𐌞")
+    assert hashes[0][0] == _get_unsigned_32_bit_hash("𐌞")
+    assert hashes[0][1] == _get_unsigned_32_bit_hash("d𐌞")
+    assert hashes[0][2] == _get_unsigned_32_bit_hash("nd𐌞")
     assert hashes[0][3] == _get_unsigned_32_bit_hash("a")
@@ -1,138 +1,55 @@
-import sys
 import spacy


-def test_get_byte_arrays_for_search_chars_width_1_not_case_sensitive():
-    (
-        w1_search,
-        w1_finding,
-        w2_search,
-        w2_finding,
-        w4_search,
-        w4_finding,
-    ) = spacy.util.get_byte_arrays_for_search_chars("bfEWfwe", False)
-    assert w1_search == b"BEFWbefw"
-    assert w2_search == b"B\x00E\x00F\x00W\x00b\x00e\x00f\x00w\x00"
-    assert (
-        w4_search
-        == b"B\x00\x00\x00E\x00\x00\x00F\x00\x00\x00W\x00\x00\x00b\x00\x00\x00e\x00\x00\x00f\x00\x00\x00w\x00\x00\x00"
-    )
-    assert w1_finding == w2_finding == w4_finding == w4_search.lower()
-
-
-def test_get_byte_arrays_for_search_chars_width_1_case_sensitive():
-    (
-        w1_search,
-        w1_finding,
-        w2_search,
-        w2_finding,
-        w4_search,
-        w4_finding,
-    ) = spacy.util.get_byte_arrays_for_search_chars("bfewT", True)
-    assert w1_search == b"Tbefw"
-    assert w2_search == b"T\x00b\x00e\x00f\x00w\00"
-    assert w4_search == b"T\x00\00\00b\x00\00\00e\x00\00\00f\x00\00\00w\00\00\00"
-    assert w1_finding == w2_finding == w4_finding == w4_search
-
-
 def test_get_byte_arrays_for_search_chars_width_2_not_case_sensitive():
     (
-        w1_search,
-        w1_finding,
-        w2_search,
-        w2_finding,
-        w4_search,
-        w4_finding,
+        search,
+        ref,
     ) = spacy.util.get_byte_arrays_for_search_chars("bféwfw", False)
-    assert w1_search == b"BFWbfw"
     assert (
-        w1_finding
-        == b"b\x00\x00\x00f\x00\x00\x00w\x00\x00\x00b\x00\x00\x00f\x00\x00\x00w\x00\x00\x00"
-    )
-    assert w2_search == b"B\x00F\x00W\x00b\x00f\x00w\x00\xc9\x00\xe9\x00"
-    assert (
-        w2_finding
-        == w4_finding
+        ref
         == b"b\x00\x00\x00f\x00\x00\x00w\x00\x00\x00b\x00\x00\x00f\x00\x00\x00w\x00\x00\x00\xe9\x00\x00\x00\xe9\x00\x00\x00"
     )

     assert (
-        w4_search
+        search
         == b"B\x00\x00\x00F\x00\x00\x00W\x00\x00\x00b\x00\x00\x00f\x00\x00\x00w\x00\x00\x00\xc9\x00\x00\x00\xe9\x00\x00\x00"
     )


 def test_get_byte_arrays_for_search_chars_width_2_case_sensitive():
     (
-        w1_search,
-        w1_finding,
-        w2_search,
-        w2_finding,
-        w4_search,
-        w4_finding,
+        search,
+        ref,
     ) = spacy.util.get_byte_arrays_for_search_chars("bféwfw", True)
-    assert w1_search == b"bfw"
-    assert w1_finding == b"b\x00\x00\x00f\x00\x00\x00w\x00\x00\x00"
-    assert w2_search == b"b\x00f\x00w\x00\xe9\x00"
     assert (
-        w2_finding
-        == w4_finding
-        == w4_search
-        == b"b\x00\x00\x00f\x00\x00\x00w\x00\x00\x00\xe9\x00\x00\x00"
+        ref == search == b"b\x00\x00\x00f\x00\x00\x00w\x00\x00\x00\xe9\x00\x00\x00"
     )


 def test_get_byte_arrays_for_search_chars_width_4_not_case_sensitive():
     (
-        w1_search,
-        w1_finding,
-        w2_search,
-        w2_finding,
-        w4_search,
-        w4_finding,
+        search,
+        ref,
     ) = spacy.util.get_byte_arrays_for_search_chars("bfé𐌞wf𐌞wÉ", False)
-    assert w1_search == b"BFWbfw"
-    assert (
-        w1_finding
-        == b"b\x00\x00\x00f\x00\x00\x00w\x00\x00\x00b\x00\x00\x00f\x00\x00\x00w\x00\x00\x00"
-    )
-
-    assert w2_search == b"B\x00F\x00W\x00b\x00f\x00w\x00\xc9\x00\xe9\x00"
-
     assert (
-        w2_finding
-        == b"b\x00\x00\x00f\x00\x00\x00w\x00\x00\x00b\x00\x00\x00f\x00\x00\x00w\x00\x00\x00\xe9\x00\x00\x00\xe9\x00\x00\x00"
-    )
-
-    assert (
-        w4_search
+        search
         == b"\x1e\x03\x01\x00B\x00\x00\x00F\x00\x00\x00W\x00\x00\x00b\x00\x00\x00f\x00\x00\x00w\x00\x00\x00\xc9\x00\x00\x00\xe9\x00\x00\x00"
     )

     assert (
-        w4_finding
+        ref
         == b"\x1e\x03\x01\x00b\x00\x00\x00f\x00\x00\x00w\x00\x00\x00b\x00\x00\x00f\x00\x00\x00w\x00\x00\x00\xe9\x00\x00\x00\xe9\x00\x00\x00"
     )


 def test_get_byte_arrays_for_search_chars_width_4_case_sensitive():
     (
-        w1_search,
-        w1_finding,
-        w2_search,
-        w2_finding,
-        w4_search,
-        w4_finding,
+        search,
+        ref,
     ) = spacy.util.get_byte_arrays_for_search_chars("bfé𐌞wf𐌞wÉ", True)
-    assert w1_search == b"bfw"
-    assert w1_finding == b"b\x00\x00\x00f\x00\x00\x00w\x00\x00\x00"
-    assert w2_search == b"b\x00f\x00w\x00\xc9\x00\xe9\x00"
+    assert search == ref
     assert (
-        w2_finding
-        == b"b\x00\x00\x00f\x00\x00\x00w\x00\x00\x00\xc9\x00\x00\x00\xe9\x00\x00\x00"
-    )
-
-    assert w4_search == w4_finding
-    assert (
-        w4_finding
+        ref
         == b"\x1e\x03\x01\x00b\x00\x00\x00f\x00\x00\x00w\x00\x00\x00\xc9\x00\x00\x00\xe9\x00\x00\x00"
     )
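After this change the utility returns a plain (search, ref) pair. Reproducing the case-sensitive expectation from the test above in an interpreter (assuming a build containing this intermediate state):

    import spacy

    search, ref = spacy.util.get_byte_arrays_for_search_chars("bféwfw", True)
    # duplicates are dropped and entries sorted; one four-byte UTF-32LE unit each
    assert search == ref == b"b\x00\x00\x00f\x00\x00\x00w\x00\x00\x00\xe9\x00\x00\x00"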
@@ -33,35 +33,26 @@ cdef int token_by_end(const TokenC* tokens, int length, int end_char) except -2
 cdef int [:,:] _get_lca_matrix(Doc, int start, int end)


-cdef void _populate_affix_buf(
-    const void* str_data_ptr,
-    const unsigned int unicode_byte_width,
-    const int word_idx,
-    const int word_len,
-    Py_UCS4* affix_buf,
+cdef void _populate_aff_buf(
+    const Py_UCS4* text_buf,
+    const int tok_idx,
+    const int tok_len,
+    Py_UCS4* aff_buf,
     const int pref_length,
     const int suff_length,
     const bint to_lower
 )

-cdef const unsigned char[:] _get_utf16_memoryview(str unicode_string, const bint check_2_bytes)
-
-cdef bint _is_searched_char_in_search_chars_v(
-    const unsigned short searched_char,
-    const unsigned char[:] search_chars_v,
-    const unsigned int search_chars_v_len,
-)
-
-cdef void _set_found_char_buf(
-    const bint suffs_not_prefs,
-    const unsigned char[:] searched_string_v,
-    const unsigned int searched_string_len,
-    const unsigned char[:] search_chars_v,
-    const unsigned int search_chars_v_len,
-    char* found_char_buf,
-    const unsigned int found_char_buf_len,
+cdef void _populate_search_buf(
+    const Py_UCS4* text_buf,
+    const int tok_idx,
+    const int tok_len,
+    Py_UCS4* search_buf,
+    Py_UCS4* ref_buf,
+    const int search_buf_len,
+    Py_UCS4* finding_buf,
+    const int finding_buf_len,
+    bint suffs_not_prefs
 )
@@ -181,9 +181,13 @@ class Doc:
         pref_lengths: List[int],
         suff_lengths: List[int],
         pref_search_chars: str,
+        pref_ref_chars: str,
+        pref_search_char_length: int,
         pref_search_lengths: List[int],
         suff_search_chars: str,
-        suff_search_lengths: List[int]
+        suff_ref_chars: str,
+        suff_search_char_length: int,
+        suff_search_lengths: List[int],
     ) -> Ints2d: ...
     @staticmethod
     def _get_array_attrs() -> Tuple[Any]: ...
@@ -3,7 +3,7 @@ from typing import Set, List

 cimport cython
 cimport numpy as np
-from libc.string cimport memcpy
+from libc.string cimport memcpy, memcmp, memset
 from libc.math cimport sqrt
 from libc.stdint cimport int32_t, uint64_t
@@ -42,12 +42,6 @@ from ..util import get_words_and_spaces

 DEF PADDING = 5

-cdef extern from *:
-    Py_UCS4 PyUnicode_READ(int kind, void *data, int index)
-    void* PyUnicode_DATA(void* o)
-    int PyUnicode_KIND(void *data)
-    Py_UCS4 Py_UNICODE_TOLOWER(Py_UCS4 ch)
-
 cdef int bounds_check(int i, int length, int padding) except -1:
     if (i + padding) < 0:
         raise IndexError(Errors.E026.format(i=i, length=length))
@@ -111,6 +105,16 @@ class SetEntsDefault(str, Enum):
         return list(cls.__members__.keys())


+cdef extern from "unicodeobject.h":
+    Py_UCS4 PyUnicode_READ(int kind, void *data, int index)
+    void* PyUnicode_DATA(void* o)
+    void PyUnicode_READY(void * o)
+    int PyUnicode_KIND(void *data)
+    int PyUnicode_IS_COMPACT(void *data)
+
+    Py_UCS4 Py_UNICODE_TOLOWER(Py_UCS4 ch)
+
+
 cdef class Doc:
     """A sequence of Token objects. Access sentences and named entities, export
     annotations to numpy arrays, losslessly serialize to compressed binary
@@ -1742,33 +1746,37 @@ cdef class Doc:


     def get_character_combination_hashes(
         self,
         *,
-        bint case_sensitive,
+        bint cs,
         pref_lengths: List[int],
         suff_lengths: List[int],
-        str pref_search_chars,
+        char* pref_search,
+        char* pref_ref,
+        int pref_s_char_l,
         pref_search_lengths: List[int],
-        str suff_search_chars,
-        suff_search_lengths: List[int]
+        char* suff_search,
+        char* suff_ref,
+        int suff_s_char_l,
+        suff_search_lengths: List[int],
     ):
         """
         Returns a 2D NumPy array where the rows represent tokens and the columns represent hashes of various character combinations
         derived from the string (text/orth) of each token.

-        case_sensitive: if *False*, the lower-case version of each token string is used as the basis for generating hashes. Note that
-            if *case_sensitive==False*, upper-case characters in *search_chars* will not be found in token strings.
+        cs: if *False*, the lower-case version of each token string is used as the basis for generating hashes. Note that
+            if *cs==False*, upper-case characters in *search_chars* will not be found in token strings.
         pref_lengths: an integer list specifying the lengths of prefixes to be hashed. For example, if *pref_lengths==[2, 3]*,
             the prefixes hashed for "spaCy" would be "sp" and "spa".
         suff_lengths: an integer list specifying the lengths of suffixes to be hashed. For example, if *suff_lengths==[2, 3]* and
             *case_sensitive == True*, the suffixes hashed for "spaCy" would be "Cy" and "aCy".
         pref_search_chars: a string containing characters to search for within each token, starting at the beginning.
         pref_search_lengths: an integer list specifying the lengths of search results to be hashed. For example if
-            *pref_search_lengths==[1, 2]*, *pref_search_chars=="aC" and *case_sensitive==False*, the searched strings hashed for
+            *pref_search_lengths==[1, 2]*, *pref_search_chars=="aC"* and *cs==False*, the searched strings hashed for
             "spaCy" would be "a" and "ac".
         suff_search_chars: a string containing characters to search for within each token, starting at the end.
         suff_search_lengths: an integer list specifying the lengths of search results to be hashed. For example if
-            *suff_search_lengths==[1, 2]*, *suff_search_chars=="aC" and *case_sensitive==False*, the searched strings hashed for
+            *suff_search_lengths==[1, 2]*, *suff_search_chars=="aC"* and *cs==False*, the searched strings hashed for
             "spaCy" would be "c" and "ca".

         For a document with tokens ["spaCy", "and", "Prodigy"], the NumPy array returned by
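Under the new signature the search configuration arrives pre-encoded rather than as strings. A hypothetical call, modelled on the updated tests (not itself part of this diff; it assumes a Doc object named doc and the spacy.util import):

    suff_search, suff_ref = get_byte_arrays_for_search_chars("xy", False)
    hashes = doc.get_character_combination_hashes(
        cs=False,
        pref_lengths=[1, 3],
        suff_lengths=[2, 3],
        pref_search=bytes(),
        pref_ref=bytes(),
        pref_s_char_l=0,
        pref_search_lengths=[],
        suff_search=suff_search,
        suff_ref=suff_ref,
        suff_s_char_l=len(suff_search) // 4,
        suff_search_lengths=[1],
    )
    # one row per token; columns: prefix hashes, suffix hashes, then search hashes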
@@ -1779,99 +1787,61 @@ cdef class Doc:
             [hash("Pr") ,hash("gy"), hash("digy"), hash("rodigy"), hash("y"), hash("y ")]]
         """

-        cdef int longest_pref = max(pref_lengths) if len(pref_lengths) > 0 else 0
-        cdef int longest_suff = max(suff_lengths) if len(suff_lengths) > 0 else 0
-        cdef Py_UCS4* affix_buf = <Py_UCS4*>self.mem.alloc(4, longest_pref + longest_suff)
-        cdef void* text_ptr = <void*> self.text
-        cdef void* text_data_ptr = <void*> PyUnicode_DATA(text_ptr) # todo change to const void
-        cdef unsigned int unicode_byte_width = PyUnicode_KIND(text_ptr), num_toks = len(self), tok_idx, token_idx, token_len
-        cdef TokenC token_c
-        cdef str working_str
-
-        for tok_idx in range(num_toks):
-            token_c = self.c[tok_idx]
-            token_idx = token_c.idx
-            token_len = token_c.lex.length
-            _populate_affix_buf(
-                text_data_ptr,
-                unicode_byte_width,
-                token_idx,
-                token_len,
-                affix_buf,
-                longest_pref,
-                longest_suff,
-                not case_sensitive
-            )
-
-        cdef const unsigned char[:] pref_search_chars_v = _get_utf16_memoryview(pref_search_chars, True)
-        cdef const unsigned char[:] suff_search_chars_v = _get_utf16_memoryview(suff_search_chars, True)
-        cdef unsigned int longest_search_length = max(pref_search_lengths + suff_search_lengths) if len(pref_search_lengths + suff_search_lengths) > 0 else 0
-        cdef bytes found_char_buf_bytes = (bytes(" " * longest_search_length, "UTF-16"))[2:] # first two bytes express endianness
-        cdef char* found_char_buf = found_char_buf_bytes
-        cdef unsigned int pref_search_chars_v_len = len(pref_search_chars_v), suff_search_chars_v_len = len(suff_search_chars_v),
-        cdef unsigned int found_char_buf_len = len(found_char_buf_bytes)
-
-        cdef unsigned int num_pref_norm_hashes = len(pref_lengths), num_suff_norm_hashes = len(suff_lengths)
-        cdef unsigned int num_pref_search_hashes = len(pref_search_lengths)
-        cdef unsigned int num_suff_search_hashes = len(suff_search_lengths)
-        cdef np.ndarray[np.int64_t, ndim=2] hashes = numpy.empty((num_toks, num_pref_norm_hashes + num_suff_norm_hashes + num_pref_search_hashes + num_suff_norm_hashes), dtype="int64")
-
-        cdef const unsigned char[:] tok_str_v
-        cdef unsigned int tok_str_v_len, hash_idx, affix_start, char_comb_len
-        cdef attr_t num_tok_attr
-        cdef str str_tok_attr
-
-        for tok_idx in range(num_toks):
-            num_tok_attr = self.c[tok_idx].lex.orth if case_sensitive else self.c[tok_idx].lex.lower
-            str_tok_attr = self.vocab.strings[num_tok_attr]
-            tok_str_v = _get_utf16_memoryview(str_tok_attr, False)
-            tok_str_v_len = len(tok_str_v)
-
-            for hash_idx in range(num_pref_norm_hashes):
-                char_comb_len = pref_lengths[hash_idx] * 2
-                if char_comb_len > tok_str_v_len:
-                    char_comb_len = tok_str_v_len
-                hashes[tok_idx, hash_idx] = hash32(<void*> &tok_str_v[0], char_comb_len, 0)
-
-            for hash_idx in range(num_pref_norm_hashes, num_pref_norm_hashes + num_suff_norm_hashes):
-                char_comb_len = suff_lengths[hash_idx - num_pref_norm_hashes] * 2
-                if char_comb_len > tok_str_v_len:
-                    char_comb_len = tok_str_v_len
-                affix_start = tok_str_v_len - char_comb_len
-                hashes[tok_idx, hash_idx] = hash32(<void*> &tok_str_v[affix_start], char_comb_len, 0)
-
-            _set_found_char_buf(
-                False,
-                tok_str_v,
-                tok_str_v_len,
-                pref_search_chars_v,
-                pref_search_chars_v_len,
-                found_char_buf,
-                found_char_buf_len,
-            )
-
-            for hash_idx in range(num_pref_norm_hashes + num_suff_norm_hashes, num_pref_norm_hashes + num_suff_norm_hashes + num_pref_search_hashes):
-                char_comb_len = pref_search_lengths[hash_idx - (num_pref_norm_hashes + num_suff_norm_hashes)] * 2
-                hashes[tok_idx, hash_idx] = hash32(found_char_buf, char_comb_len, 0)
+        cdef int max_pref_l = max(pref_lengths) if len(pref_lengths) > 0 else 0
+        cdef int max_suff_l = max(suff_lengths) if len(suff_lengths) > 0 else 0
+        cdef int aff_buf_l = max_pref_l + max_suff_l
+        cdef int max_s_pref_l = max(pref_search_lengths) if len(pref_search_lengths) > 0 else 0
+        cdef int max_s_suff_l = max(suff_search_lengths) if len(suff_search_lengths) > 0 else 0
+
+        cdef Py_UCS4* aff_buf = <Py_UCS4*>self.mem.alloc(4, aff_buf_l)
+        cdef Py_UCS4* pref_s_buf = <Py_UCS4*>pref_search
+        cdef Py_UCS4* pref_r_buf = <Py_UCS4*>pref_ref
+        cdef Py_UCS4* pref_f_buf = <Py_UCS4*>self.mem.alloc(4, max_s_pref_l)
+        cdef Py_UCS4* suff_s_buf = <Py_UCS4*>suff_search
+        cdef Py_UCS4* suff_r_buf = <Py_UCS4*>suff_ref
+        cdef Py_UCS4* suff_f_buf = <Py_UCS4*>self.mem.alloc(4, max_s_suff_l)
+
+        cdef bytes encoded_text = self.text.encode("utf-32le")
+        cdef char* intermediate_text = encoded_text
+        cdef Py_UCS4* text_buf = <Py_UCS4*> intermediate_text
+
+        cdef unsigned int num_toks = len(self), aff_len
+        cdef unsigned int h_pref_n = len(pref_lengths)
+        cdef unsigned int h_suff_n = len(suff_lengths), h_suff_end_idx = len(pref_lengths) + len(suff_lengths)
+        cdef unsigned int h_pref_s_n = len(pref_search_lengths), h_pref_s_end_idx = h_suff_end_idx + h_pref_s_n
+        cdef unsigned int h_suff_s_n = len(suff_search_lengths), h_suff_s_end_idx = h_pref_s_end_idx + h_suff_s_n
+        cdef np.ndarray[np.int64_t, ndim=2] hashes = numpy.empty((num_toks, h_suff_s_end_idx), dtype="int64")
+
+        cdef TokenC tok_c
+
+        for tok_i in range(num_toks):
+            tok_c = self.c[tok_i]
+            tok_idx = tok_c.idx
+            tok_len = tok_c.lex.length
+
+            _populate_aff_buf(text_buf, tok_idx, tok_len, aff_buf, max_pref_l, max_suff_l, not cs)
+            _populate_search_buf(text_buf, tok_idx, tok_len, pref_s_buf, pref_r_buf, pref_s_char_l, pref_f_buf, max_s_pref_l, False)
+            _populate_search_buf(text_buf, tok_idx, tok_len, suff_s_buf, suff_r_buf, suff_s_char_l, suff_f_buf, max_s_suff_l, True)

-            _set_found_char_buf(
-                True,
-                tok_str_v,
-                tok_str_v_len,
-                suff_search_chars_v,
-                suff_search_chars_v_len,
-                found_char_buf,
-                found_char_buf_len,
-            )
-
-            for hash_idx in range(num_pref_norm_hashes + num_suff_norm_hashes + num_pref_search_hashes, num_pref_norm_hashes + num_suff_norm_hashes + num_pref_search_hashes + num_suff_search_hashes):
-                char_comb_len = suff_search_lengths[hash_idx - (num_pref_norm_hashes + num_suff_norm_hashes + num_pref_search_hashes)] * 2
-                hashes[tok_idx, hash_idx] = hash32(found_char_buf, char_comb_len, 0)
+            for hash_idx in range(h_pref_n):
+                aff_len = pref_lengths[hash_idx]
+                hashes[tok_i, hash_idx] = hash32(aff_buf, aff_len * 4, 0)
+
+            for hash_idx in range(h_pref_n, h_suff_end_idx):
+                aff_len = suff_lengths[hash_idx - h_pref_n]
+                hashes[tok_i, hash_idx] = hash32(aff_buf + aff_buf_l - aff_len, aff_len * 4, 0)
+
+            for hash_idx in range(h_suff_end_idx, h_pref_s_end_idx):
+                aff_len = pref_search_lengths[hash_idx - h_suff_end_idx]
+                hashes[tok_i, hash_idx] = hash32(pref_f_buf, aff_len * 4, 0)
+
+            for hash_idx in range(h_pref_s_end_idx, h_suff_s_end_idx):
+                aff_len = suff_search_lengths[hash_idx - h_pref_s_end_idx]
+                hashes[tok_i, hash_idx] = hash32(suff_f_buf, aff_len * 4, 0)

+        self.mem.free(aff_buf)
+        self.mem.free(pref_f_buf)
+        self.mem.free(suff_f_buf)
         return hashes

     @staticmethod
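The column layout of the returned array follows the end indices computed at the top of the method. Using the configuration from the good-case test, the bookkeeping reduces to this plain-Python sketch:

    pref_lengths = [1, 4, 3]
    suff_lengths = [2, 3, 4, 5]
    pref_search_lengths = [2]
    suff_search_lengths = [2, 1]

    h_suff_end_idx = len(pref_lengths) + len(suff_lengths)          # 7
    h_pref_s_end_idx = h_suff_end_idx + len(pref_search_lengths)    # 8
    h_suff_s_end_idx = h_pref_s_end_idx + len(suff_search_lengths)  # 10

    assert h_suff_s_end_idx == 10  # total hash columns per token, as in the test asserts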
@@ -2055,12 +2025,11 @@ cdef int [:,:] _get_lca_matrix(Doc doc, int start, int end):
     return lca_matrix


-cdef void _populate_affix_buf(
-    const void* str_data_ptr,
-    const unsigned int unicode_byte_width,
-    const int word_idx,
-    const int word_len,
-    Py_UCS4* affix_buf,
+cdef void _populate_aff_buf(
+    const Py_UCS4* text_buf,
+    const int tok_idx,
+    const int tok_len,
+    Py_UCS4* aff_buf,
     const int pref_length,
     const int suff_length,
     const bint to_lower
@@ -2070,107 +2039,75 @@ cdef void _populate_affix_buf(

     str_data_ptr: a pointer to the raw data in the containing string, which must be in canonical
     Unicode form (see PEP 393).
-    unicode_byte_width: the number of bytes occupied by each character in the containing string.
+    kind: the number of bytes occupied by each character in the containing string.
     word_idx: the index of the first character of the word within the containing string.
     word_len: the length of the word.
-    affix_buf: the buffer to populate.
+    aff_buf: the buffer to populate.
     pref_length: the length of the prefix.
     suff_length: the length of the suffix.
     to_lower: if *True*, any upper case characters in either affix are converted to lower case.
     """
-    cdef int affix_buf_idx = 0, buf_size = pref_length + suff_length, in_word_idx
-    cdef Py_UCS4 working_wchar
+    cdef int aff_buf_idx = 0, buf_size = pref_length + suff_length, in_word_idx

-    while affix_buf_idx < pref_length and affix_buf_idx < word_len:
-        working_wchar = PyUnicode_READ(unicode_byte_width, str_data_ptr, in_word_idx)
+    while aff_buf_idx < pref_length and aff_buf_idx < tok_len:
+        memcpy(aff_buf + aff_buf_idx, text_buf + tok_idx + aff_buf_idx, 4)
         if to_lower:
-            working_wchar = Py_UNICODE_TOLOWER(working_wchar)
-        memcpy(affix_buf + affix_buf_idx, &working_wchar, 4)
-        affix_buf_idx += 1
+            aff_buf[aff_buf_idx] = Py_UNICODE_TOLOWER(aff_buf[aff_buf_idx])
+        aff_buf_idx += 1

-    while (affix_buf_idx < buf_size - suff_length) or (affix_buf_idx < buf_size - word_len):
+    if aff_buf_idx < buf_size - tok_len:
         # fill out the empty middle part of the buffer with zeros
-        affix_buf[affix_buf_idx] = 0
-        affix_buf_idx += 1
+        memset(aff_buf + aff_buf_idx, 0, (buf_size - tok_len - aff_buf_idx) * 4)
+        aff_buf_idx = buf_size - tok_len

-    while affix_buf_idx < buf_size:
-        in_word_idx = affix_buf_idx + word_len - buf_size
+    while aff_buf_idx < buf_size:
+        in_word_idx = aff_buf_idx + tok_len - buf_size
         # for suffixes we have to track the in-word index separately from the in-buffer index
         if in_word_idx < pref_length:
             # we've already retrieved this character as part of the prefix, so copy it from there
             # as that's quicker than retrieving it from the input string a second time
-            memcpy(affix_buf + affix_buf_idx, affix_buf + in_word_idx, 4)
+            memcpy(aff_buf + aff_buf_idx, aff_buf + in_word_idx, 4)
         else:
-            working_wchar = PyUnicode_READ(unicode_byte_width, str_data_ptr, word_idx + in_word_idx)
+            memcpy(aff_buf + aff_buf_idx, text_buf + tok_idx + in_word_idx, 4)
             if to_lower:
-                working_wchar = Py_UNICODE_TOLOWER(working_wchar)
-            memcpy(affix_buf + affix_buf_idx, &working_wchar, 4)
-        affix_buf_idx += 1
+                aff_buf[aff_buf_idx] = Py_UNICODE_TOLOWER(aff_buf[aff_buf_idx])
+        aff_buf_idx += 1


-cdef const unsigned char[:] _get_utf16_memoryview(str unicode_string, const bint check_2_bytes):
-    """
-    Return a memory view of the UTF-16 representation of a string with the default endianness of the platform.
-    Throw a ValueError if *check_2_bytes == True* and one or more characters in the UTF-16 representation
-    occupies four bytes rather than two.
-    """
-    cdef const unsigned char[:] view = unicode_string.encode("UTF-16")
-    view = view[2:] # first two bytes express endianness
-    cdef unsigned int unicode_len, view_len
-    if check_2_bytes:
-        unicode_len = len(unicode_string)
-        view_len = len(view)
-        if unicode_len * 2 != view_len:
-            raise ValueError(Errors.E1046)
-    return view
-
-
-cdef bint _is_searched_char_in_search_chars_v(
-    const unsigned short searched_char,
-    const unsigned char[:] search_chars_v,
-    const unsigned int search_chars_v_len
+cdef void _populate_search_buf(
+    const Py_UCS4* text_buf,
+    const int tok_idx,
+    const int tok_len,
+    Py_UCS4* search_buf,
+    Py_UCS4* ref_buf,
+    const int search_buf_len,
+    Py_UCS4* finding_buf,
+    const int finding_buf_len,
+    bint suffs_not_prefs
 ):
-    cdef unsigned int search_chars_v_idx = 0
-    while search_chars_v_idx < search_chars_v_len:
-        if searched_char == (<unsigned short*> &search_chars_v[search_chars_v_idx])[0]:
-            return True
-        search_chars_v_idx += 2
-    return False
-
-
-cdef void _set_found_char_buf(
-    const bint suffs_not_prefs,
-    const unsigned char[:] searched_string_v,
-    const unsigned int searched_string_v_len,
-    const unsigned char[:] search_chars_v,
-    const unsigned int search_chars_v_len,
-    char* found_char_buf,
-    const unsigned int found_char_buf_len,
-):
-    """ Pick the UTF-16 characters from *searched_string_v* that are also in *search_chars_v* and writes them in order to *found_char_buf*.
-    If *suffs_not_prefs*, the search starts from the end of *searched_string_v* rather than from the beginning.
-    """
-    cdef unsigned int found_char_buf_idx = 0, searched_string_idx = searched_string_v_len - 2 if suffs_not_prefs else 0
-    cdef unsigned short searched_char, SPACE = 32
-
-    while found_char_buf_idx < found_char_buf_len:
-        searched_char = (<unsigned short*> &searched_string_v[searched_string_idx])[0]
-        if _is_searched_char_in_search_chars_v(searched_char, search_chars_v, search_chars_v_len):
-            memcpy(found_char_buf + found_char_buf_idx, &searched_char, 2)
-            found_char_buf_idx += 2
-        if suffs_not_prefs:
-            if searched_string_idx <= 0:
-                break
-            searched_string_idx -= 2
-        else:
-            searched_string_idx += 2
-            if searched_string_idx >= searched_string_v_len:
-                break
-
-    while found_char_buf_idx < found_char_buf_len:
-        memcpy(found_char_buf + found_char_buf_idx, &SPACE, 2)
-        found_char_buf_idx += 2
+    cdef unsigned int finding_buf_idx = 0, text_string_idx = tok_idx + (tok_len - 1) if suffs_not_prefs else tok_idx
+    cdef unsigned int search_buf_idx
+    cdef int cmp_res
+
+    while finding_buf_idx < finding_buf_len:
+        for search_buf_idx in range(search_buf_len):
+            cmp_res = memcmp(search_buf + search_buf_idx, text_buf + text_string_idx, 4)
+            if cmp_res == 0:
+                memcpy(finding_buf + finding_buf_idx, ref_buf + search_buf_idx, 4)
+                finding_buf_idx += 1
+            if cmp_res >= 0:
+                break
+        if suffs_not_prefs:
+            if text_string_idx <= tok_idx:
+                break
+            text_string_idx -= 1
+        else:
+            text_string_idx += 1
+            if text_string_idx >= tok_idx + tok_len:
+                break
+
+    if finding_buf_idx < finding_buf_len:
+        memset(finding_buf + finding_buf_idx, 0, (finding_buf_len - finding_buf_idx) * 4)


 def pickle_doc(doc):
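As a reading aid, the scan in _populate_search_buf corresponds to roughly the following pure-Python logic, working on characters instead of UTF-32 buffers (the early break is valid because the search array is kept sorted):

    def populate_search(token, search, ref, max_len, suffs_not_prefs):
        # search is sorted; ref holds the character to emit for each search entry
        chars = reversed(token) if suffs_not_prefs else iter(token)
        found = []
        for ch in chars:
            for s, r in zip(search, ref):
                if s == ch:
                    found.append(r)
                if s >= ch:
                    break  # no later entry in the sorted search array can match
            if len(found) == max_len:
                break
        return "".join(found) + "\x00" * (max_len - len(found))  # zero-pad, like the memset

    assert populate_search("spacy", "prx", "prx", 2, suffs_not_prefs=True) == "p\x00"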
@@ -1739,104 +1739,68 @@ def all_equal(iterable):


 def get_byte_arrays_for_search_chars(
     search_chars: str, case_sensitive: bool
-) -> Tuple[bytes, bytes, bytes, bytes, bytes, bytes]:
+) -> Tuple[bytes, bytes]:
     """
-    The text of a spaCy document is stored as a Python-internal Unicode representation
-    as defined by PEP 393. Each character in such a representation has the width of the
-    longest character in the string, which is either 1, 2 or 4 bytes.
-
     This function supports the rich feature extractor. It returns search byte arrays with
-    1-, 2- and 4-byte character widths that are used for comparison with each of the three
-    representation types when searching document texts for search characters. Each byte array
-    contains characters that are as wide or narrower than its own width; a byte array can
-    ignore characters that are wider than its own width because a spaCy document with the
-    corresponding representation width could never contain characters wider than that width.
+    4-byte character width that are used for comparison when searching document texts
+    for search characters. The encoding is little-endian regardless of architecture, as
+    this is what is expected by the murmurhash library used downstream.

-    When characters corresponding to search characters are found within a spaCy token
-    string, they are concatenated together and the resulting "finding byte arrays" are hashed.
-    It is crucial that the characters in all finding byte arrays representing a given sequence of
-    characters share the same width so that they all yield the same hash values. While it
-    would be possible to use the narrowest possible width for the sequence like PEP 393 does,
-    determining this would entain unnecessary processing. Instead, finding byte arrays always use
-    a 4-byte width. Each of the three search byte array therefore has a corresponding finding
-    byte array that is used to build up the finding byte arrays for specific document token strings.
-
-    If *case_sensitive==False*, the lower- or uppercase counterparts of any characters that
+    Alongside the "search byte array" against which words from document texts are compared
+    is the "ref byte array". When a character from the search byte array is matched,
+    the character at the corresponding position in the ref byte array is added to the
+    byte sequence of the configured length that is then hashed. This enables case-sensitivity
+    to be handled without converting the case of the words being searched: if
+    *case_sensitive==False*, the lower- or uppercase counterparts of any characters that
     have case are added to the search byte arrays, and both the original character and its
-    other-cased counterpart map to the lower-case version in the finding byte array.
-
-    All encodings are little-endian regardless of architecture, as this is what is expected by the
-    murmurhash library used downstream.
+    other-cased counterpart map to the lower-case version in the ref byte array.
     """

-    def encode(ch: str, width: int) -> bytes:
+    def encode(ch: str) -> bytes:
         """
         ch: a single character
-        int: the width of the character encoding to use
         """
-        if width == 4:
-            return ch.encode("UTF-32LE")
-        elif width == 2:
-            return ch.encode("UTF-16LE")
-        else:
-            return ch.encode("UTF-8")
+        return ch.encode("UTF-32LE")

     def add_to_byte_arrays(
-        search: List[bytes], finding: List[bytes], ch: str, width: int
+        search: List[bytes], ref: List[bytes], ch: str
     ) -> None:
-        """Add the byte representations of *ch* with representation of width
-        *width* to the two byte array lists.
+        """Add the byte representations of *ch* to the two byte array lists.
         """
-        this_char_bytes = encode(ch, width)
-        this_char_bytes_f = encode(ch, 4)
+        this_char_bytes = encode(ch)
         if not case_sensitive and ch.islower():
             if this_char_bytes not in search:
                 search.append(this_char_bytes)
-                finding.append(this_char_bytes_f)
-            upper_char_bytes = encode(ch.upper(), width)
+                ref.append(this_char_bytes)
+            upper_char_bytes = encode(ch.upper())
             if upper_char_bytes not in search:
                 search.append(upper_char_bytes)
-                finding.append(this_char_bytes_f)
+                ref.append(this_char_bytes)
         elif not case_sensitive and ch.isupper():
-            lower_char_bytes = encode(ch.lower(), width)
-            lower_char_bytes_f = encode(ch.lower(), 4)
+            lower_char_bytes = encode(ch.lower())
             if this_char_bytes not in search:
                 search.append(this_char_bytes)
-                finding.append(lower_char_bytes_f)
+                ref.append(lower_char_bytes)
             if lower_char_bytes not in search:
                 search.append(lower_char_bytes)
-                finding.append(lower_char_bytes_f)
+                ref.append(lower_char_bytes)
         elif this_char_bytes not in search:
             search.append(this_char_bytes)
-            finding.append(this_char_bytes_f)
+            ref.append(this_char_bytes)

     def get_ordered_raw_bytes(
-        search: List[bytes], finding: List[bytes]
+        search: List[bytes], ref: List[bytes]
     ) -> Tuple[bytes, bytes]:
         """Flatten the two lists, ordering both by the entries in *search*
         using the native endianness of the platform.
         """
         num_search = [list(entry) for entry in search]
         search = [entry for _, entry in sorted(zip(num_search, search))]
-        finding = [entry for _, entry in sorted(zip(num_search, finding))]
-        return b"".join(search), b"".join(finding)
+        ref = [entry for _, entry in sorted(zip(num_search, ref))]
+        return b"".join(search), b"".join(ref)

-    w1_search: List[bytes] = []
-    w1_finding: List[bytes] = []
-    w2_search: List[bytes] = []
-    w2_finding: List[bytes] = []
-    w4_search: List[bytes] = []
-    w4_finding: List[bytes] = []
+    search: List[bytes] = []
+    ref: List[bytes] = []
     for ch in search_chars:
-        add_to_byte_arrays(w4_search, w4_finding, ch, 4)
-        if ord(ch) >= 65536:
-            continue
-        add_to_byte_arrays(w2_search, w2_finding, ch, 2)
-        if ord(ch) >= 128:
-            continue
-        add_to_byte_arrays(w1_search, w1_finding, ch, 1)
-    return (
-        get_ordered_raw_bytes(w1_search, w1_finding)
-        + get_ordered_raw_bytes(w2_search, w2_finding)
-        + get_ordered_raw_bytes(w4_search, w4_finding)
-    )
+        add_to_byte_arrays(search, ref, ch)
+    return get_ordered_raw_bytes(search, ref)
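A worked example of the search/ref pairing, matching the width-2 non-case-sensitive test earlier in this commit: with case_sensitive=False each cased character contributes both of its case variants to search, and both variants map to the lower-case form in ref:

    import spacy

    search, ref = spacy.util.get_byte_arrays_for_search_chars("bféwfw", False)

    def chars(b: bytes) -> list:
        return [b[i:i + 4].decode("UTF-32LE") for i in range(0, len(b), 4)]

    assert chars(search) == ["B", "F", "W", "b", "f", "w", "É", "é"]
    assert chars(ref) == ["b", "f", "w", "b", "f", "w", "é", "é"]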