Intermediate state

richardpaulhudson 2022-10-19 23:20:11 +02:00
parent 356a341096
commit 2707d30ce0
7 changed files with 249 additions and 405 deletions

View File

@@ -1,4 +1,5 @@
from typing import List, Optional, Callable, Tuple
from ..util import get_byte_arrays_for_search_chars
from thinc.types import Ints2d
from thinc.api import Model, registry
@@ -16,22 +17,30 @@ def RichFeatureExtractor(
suff_search_chars: Optional[str] = None,
suff_search_lengths: Optional[List[int]] = None,
) -> Model[List[Doc], List[Ints2d]]:
if pref_search_chars is not None:
pref_search, pref_ref = get_byte_arrays_for_search_chars(pref_search_chars, case_sensitive)
else:
pref_search, pref_ref = bytes(), bytes()
if suff_search_chars is not None:
suff_search, suff_ref = get_byte_arrays_for_search_chars(suff_search_chars, case_sensitive)
else:
suff_search, suff_ref = bytes(), bytes()
return Model(
"extract_character_combination_hashes",
forward,
attrs={
"case_sensitive": case_sensitive,
"pref_lengths": pref_lengths if pref_lengths is not None else [],
"pref_search_chars": pref_search_chars
if pref_search_chars is not None
else "",
"suff_lengths": suff_lengths if suff_lengths is not None else [],
"pref_search": pref_search,
"pref_ref": pref_ref,
"pref_s_char_l": len(pref_search) / 4 if pref_search_chars is not None else 0,
"pref_search_lengths": pref_search_lengths
if pref_search_lengths is not None
else [],
"suff_lengths": suff_lengths if suff_lengths is not None else [],
"suff_search_chars": suff_search_chars
if suff_search_chars is not None
else "",
"suff_search": suff_search,
"suff_ref": suff_ref,
"suff_s_char_l": len(suff_search) / 4 if suff_search_chars is not None else 0,
"suff_search_lengths": suff_search_lengths
if suff_search_lengths is not None
else [],
@@ -45,10 +54,14 @@ def forward(
ops = model.ops
case_sensitive: bool = model.attrs["case_sensitive"]
pref_lengths: List[int] = model.attrs["pref_lengths"]
pref_search_chars: str = model.attrs["pref_search_chars"]
pref_search_lengths: List[int] = model.attrs["pref_search_lengths"]
suff_lengths: List[int] = model.attrs["suff_lengths"]
suff_search_chars: str = model.attrs["suff_search_chars"]
pref_search: bytes = model.attrs["pref_search"]
pref_ref: bytes = model.attrs["pref_ref"]
pref_s_char_l: int = model.attrs["pref_s_char_l"]
pref_search_lengths: List[int] = model.attrs["pref_search_lengths"]
suff_search: bytes = model.attrs["suff_search"]
suff_ref: bytes = model.attrs["suff_ref"]
suff_s_char_l: int = model.attrs["suff_s_char_l"]
suff_search_lengths: List[int] = model.attrs["suff_search_lengths"]
features: List[Ints2d] = []
for doc in docs:
@@ -56,9 +69,13 @@ def forward(
case_sensitive=case_sensitive,
pref_lengths=pref_lengths,
suff_lengths=suff_lengths,
pref_search_chars=pref_search_chars,
pref_search=pref_search,
pref_ref=pref_ref,
pref_s_char_l=pref_s_char_l,
pref_search_lengths=pref_search_lengths,
suff_search_chars=suff_search_chars,
suff_search=suff_search,
suff_ref=suff_ref,
suff_s_char_l=suff_s_char_l,
suff_search_lengths=suff_search_lengths,
)
features.append(ops.asarray2i(hashes))
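
For orientation, a minimal usage sketch of the factory above (illustrative only: the example text is invented and the keyword set is reconstructed from the attrs dictionary in this diff):

from spacy.lang.en import English

nlp = English()
docs = [nlp("spaCy and Prodigy")]
extractor = RichFeatureExtractor(
    case_sensitive=False,
    pref_lengths=[1, 3],
    suff_lengths=[2, 3],
    suff_search_chars="xp",
    suff_search_lengths=[1, 2],
)
hash_arrays = extractor.predict(docs)  # one Ints2d array per Doc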

View File

@@ -14,6 +14,7 @@ from spacy.lang.xx import MultiLanguage
from spacy.language import Language
from spacy.lexeme import Lexeme
from spacy.tokens import Doc, Span, SpanGroup, Token
from spacy.util import get_byte_arrays_for_search_chars
from spacy.vocab import Vocab
from .test_underscore import clean_underscore # noqa: F401
@@ -994,7 +995,8 @@ def test_doc_spans_setdefault(en_tokenizer):
def _get_unsigned_32_bit_hash(input: str) -> int:
working_hash = hash(input.encode("UTF-16")[2:])
input = input.replace(" ", "\x00")
working_hash = hash(input.encode("UTF-32LE"))
if working_hash < 0:
working_hash = working_hash + (2 << 31)
return working_hash
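
The conversion above works because (2 << 31) == 2**32; adding it maps a negative signed 32-bit hash onto its unsigned counterpart. A quick illustrative check (not part of the commit):

assert (2 << 31) == 2 ** 32
assert -1 + (2 << 31) == 0xFFFFFFFF        # -1 becomes the largest unsigned value
assert -(2 ** 31) + (2 << 31) == 2 ** 31   # the most negative hash lands mid-range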
@@ -1004,15 +1006,21 @@ def _get_unsigned_32_bit_hash(input: str) -> int:
def test_get_character_combination_hashes_good_case(en_tokenizer, case_sensitive):
doc = en_tokenizer("spaCy✨ and Prodigy")
suff_search, suff_ref = get_byte_arrays_for_search_chars("xx✨rp", case_sensitive)
hashes = doc.get_character_combination_hashes(
case_sensitive=case_sensitive,
cs=case_sensitive,
pref_lengths=[1, 4, 3],
suff_lengths=[2, 3, 4, 5],
pref_search_chars="",
pref_search=bytes(),
pref_ref=bytes(),
pref_s_char_l=0,
pref_search_lengths=[2],
suff_search_chars="xx✨rp",
suff_search_lengths=[2, 1],
suff_search=suff_search,
suff_ref=suff_ref,
suff_s_char_l=5 if case_sensitive else 9,
suff_search_lengths=[2, 1],
)
assert hashes[0][0] == _get_unsigned_32_bit_hash("s")
assert hashes[0][1] == _get_unsigned_32_bit_hash(
"spaC" if case_sensitive else "spac"
@@ -1031,22 +1039,22 @@ def test_get_character_combination_hashes_good_case(en_tokenizer, case_sensitive
assert hashes[0][8] == _get_unsigned_32_bit_hash("p ")
assert hashes[0][9] == _get_unsigned_32_bit_hash("p")
assert hashes[1][0] == _get_unsigned_32_bit_hash("")
assert hashes[1][1] == _get_unsigned_32_bit_hash("")
assert hashes[1][2] == _get_unsigned_32_bit_hash("")
assert hashes[1][3] == _get_unsigned_32_bit_hash("")
assert hashes[1][4] == _get_unsigned_32_bit_hash("")
assert hashes[1][5] == _get_unsigned_32_bit_hash("")
assert hashes[1][6] == _get_unsigned_32_bit_hash("")
assert hashes[1][1] == _get_unsigned_32_bit_hash(" ")
assert hashes[1][2] == _get_unsigned_32_bit_hash(" ")
assert hashes[1][3] == _get_unsigned_32_bit_hash(" ")
assert hashes[1][4] == _get_unsigned_32_bit_hash(" ")
assert hashes[1][5] == _get_unsigned_32_bit_hash(" ")
assert hashes[1][6] == _get_unsigned_32_bit_hash(" ")
assert hashes[1][7] == _get_unsigned_32_bit_hash(" ")
assert hashes[1][8] == _get_unsigned_32_bit_hash("")
assert hashes[1][9] == _get_unsigned_32_bit_hash("")
assert hashes[2][0] == _get_unsigned_32_bit_hash("a")
assert hashes[2][1] == _get_unsigned_32_bit_hash("and")
assert hashes[2][1] == _get_unsigned_32_bit_hash("and ")
assert hashes[2][2] == _get_unsigned_32_bit_hash("and")
assert hashes[2][3] == _get_unsigned_32_bit_hash("nd")
assert hashes[2][4] == _get_unsigned_32_bit_hash("and")
assert hashes[2][5] == _get_unsigned_32_bit_hash("and")
assert hashes[2][6] == _get_unsigned_32_bit_hash("and")
assert hashes[2][5] == _get_unsigned_32_bit_hash(" and")
assert hashes[2][6] == _get_unsigned_32_bit_hash(" and")
assert hashes[2][7] == _get_unsigned_32_bit_hash(" ")
assert hashes[2][8] == _get_unsigned_32_bit_hash(" ")
assert hashes[2][9] == _get_unsigned_32_bit_hash(" ")
@@ -1076,17 +1084,23 @@ def test_get_character_combination_hashes_good_case(en_tokenizer, case_sensitive
def test_get_character_combination_hashes_4_byte_char_at_end(en_tokenizer):
doc = en_tokenizer("and𐌞")
suff_search, suff_ref = get_byte_arrays_for_search_chars("a", True)
hashes = doc.get_character_combination_hashes(
case_sensitive=True,
cs=True,
pref_lengths=[],
suff_lengths=[1, 2, 3],
pref_search_chars="",
pref_search=bytes(),
pref_ref=bytes(),
pref_s_char_l=0,
pref_search_lengths=[],
suff_search_chars="a",
suff_search=suff_search,
suff_ref=suff_ref,
suff_s_char_l=1,
suff_search_lengths=[1],
)
assert hashes[0][1] == _get_unsigned_32_bit_hash("𐌞")
assert hashes[0][2] == _get_unsigned_32_bit_hash("d𐌞")
assert hashes[0][0] == _get_unsigned_32_bit_hash("𐌞")
assert hashes[0][1] == _get_unsigned_32_bit_hash("d𐌞")
assert hashes[0][2] == _get_unsigned_32_bit_hash("nd𐌞")
assert hashes[0][3] == _get_unsigned_32_bit_hash("a")

View File

@@ -1,138 +1,55 @@
import sys
import spacy
def test_get_byte_arrays_for_search_chars_width_1_not_case_sensitive():
(
w1_search,
w1_finding,
w2_search,
w2_finding,
w4_search,
w4_finding,
) = spacy.util.get_byte_arrays_for_search_chars("bfEWfwe", False)
assert w1_search == b"BEFWbefw"
assert w2_search == b"B\x00E\x00F\x00W\x00b\x00e\x00f\x00w\x00"
assert (
w4_search
== b"B\x00\x00\x00E\x00\x00\x00F\x00\x00\x00W\x00\x00\x00b\x00\x00\x00e\x00\x00\x00f\x00\x00\x00w\x00\x00\x00"
)
assert w1_finding == w2_finding == w4_finding == w4_search.lower()
def test_get_byte_arrays_for_search_chars_width_1_case_sensitive():
(
w1_search,
w1_finding,
w2_search,
w2_finding,
w4_search,
w4_finding,
) = spacy.util.get_byte_arrays_for_search_chars("bfewT", True)
assert w1_search == b"Tbefw"
assert w2_search == b"T\x00b\x00e\x00f\x00w\00"
assert w4_search == b"T\x00\00\00b\x00\00\00e\x00\00\00f\x00\00\00w\00\00\00"
assert w1_finding == w2_finding == w4_finding == w4_search
def test_get_byte_arrays_for_search_chars_width_2_not_case_sensitive():
(
w1_search,
w1_finding,
w2_search,
w2_finding,
w4_search,
w4_finding,
search,
ref,
) = spacy.util.get_byte_arrays_for_search_chars("bféwfw", False)
assert w1_search == b"BFWbfw"
assert (
w1_finding
== b"b\x00\x00\x00f\x00\x00\x00w\x00\x00\x00b\x00\x00\x00f\x00\x00\x00w\x00\x00\x00"
)
assert w2_search == b"B\x00F\x00W\x00b\x00f\x00w\x00\xc9\x00\xe9\x00"
assert (
w2_finding
== w4_finding
ref
== b"b\x00\x00\x00f\x00\x00\x00w\x00\x00\x00b\x00\x00\x00f\x00\x00\x00w\x00\x00\x00\xe9\x00\x00\x00\xe9\x00\x00\x00"
)
assert (
w4_search
search
== b"B\x00\x00\x00F\x00\x00\x00W\x00\x00\x00b\x00\x00\x00f\x00\x00\x00w\x00\x00\x00\xc9\x00\x00\x00\xe9\x00\x00\x00"
)
def test_get_byte_arrays_for_search_chars_width_2_case_sensitive():
(
w1_search,
w1_finding,
w2_search,
w2_finding,
w4_search,
w4_finding,
search,
ref,
) = spacy.util.get_byte_arrays_for_search_chars("bféwfw", True)
assert w1_search == b"bfw"
assert w1_finding == b"b\x00\x00\x00f\x00\x00\x00w\x00\x00\x00"
assert w2_search == b"b\x00f\x00w\x00\xe9\x00"
assert (
w2_finding
== w4_finding
== w4_search
== b"b\x00\x00\x00f\x00\x00\x00w\x00\x00\x00\xe9\x00\x00\x00"
ref == search == b"b\x00\x00\x00f\x00\x00\x00w\x00\x00\x00\xe9\x00\x00\x00"
)
def test_get_byte_arrays_for_search_chars_width_4_not_case_sensitive():
(
w1_search,
w1_finding,
w2_search,
w2_finding,
w4_search,
w4_finding,
search,
ref,
) = spacy.util.get_byte_arrays_for_search_chars("bfé𐌞wf𐌞wÉ", False)
assert w1_search == b"BFWbfw"
assert (
w1_finding
== b"b\x00\x00\x00f\x00\x00\x00w\x00\x00\x00b\x00\x00\x00f\x00\x00\x00w\x00\x00\x00"
)
assert w2_search == b"B\x00F\x00W\x00b\x00f\x00w\x00\xc9\x00\xe9\x00"
assert (
w2_finding
== b"b\x00\x00\x00f\x00\x00\x00w\x00\x00\x00b\x00\x00\x00f\x00\x00\x00w\x00\x00\x00\xe9\x00\x00\x00\xe9\x00\x00\x00"
)
assert (
w4_search
search
== b"\x1e\x03\x01\x00B\x00\x00\x00F\x00\x00\x00W\x00\x00\x00b\x00\x00\x00f\x00\x00\x00w\x00\x00\x00\xc9\x00\x00\x00\xe9\x00\x00\x00"
)
assert (
w4_finding
ref
== b"\x1e\x03\x01\x00b\x00\x00\x00f\x00\x00\x00w\x00\x00\x00b\x00\x00\x00f\x00\x00\x00w\x00\x00\x00\xe9\x00\x00\x00\xe9\x00\x00\x00"
)
def test_get_byte_arrays_for_search_chars_width_4_case_sensitive():
(
w1_search,
w1_finding,
w2_search,
w2_finding,
w4_search,
w4_finding,
search,
ref,
) = spacy.util.get_byte_arrays_for_search_chars("bfé𐌞wf𐌞wÉ", True)
assert w1_search == b"bfw"
assert w1_finding == b"b\x00\x00\x00f\x00\x00\x00w\x00\x00\x00"
assert w2_search == b"b\x00f\x00w\x00\xc9\x00\xe9\x00"
assert search == ref
assert (
w2_finding
== b"b\x00\x00\x00f\x00\x00\x00w\x00\x00\x00\xc9\x00\x00\x00\xe9\x00\x00\x00"
)
assert w4_search == w4_finding
assert (
w4_finding
ref
== b"\x1e\x03\x01\x00b\x00\x00\x00f\x00\x00\x00w\x00\x00\x00\xc9\x00\x00\x00\xe9\x00\x00\x00"
)

View File

@@ -33,35 +33,26 @@ cdef int token_by_end(const TokenC* tokens, int length, int end_char) except -2
cdef int [:,:] _get_lca_matrix(Doc, int start, int end)
cdef void _populate_affix_buf(
const void* str_data_ptr,
const unsigned int unicode_byte_width,
const int word_idx,
const int word_len,
Py_UCS4* affix_buf,
cdef void _populate_aff_buf(
const Py_UCS4* text_buf,
const int tok_idx,
const int tok_len,
Py_UCS4* aff_buf,
const int pref_length,
const int suff_length,
const bint to_lower
)
cdef const unsigned char[:] _get_utf16_memoryview(str unicode_string, const bint check_2_bytes)
cdef bint _is_searched_char_in_search_chars_v(
const unsigned short searched_char,
const unsigned char[:] search_chars_v,
const unsigned int search_chars_v_len,
)
cdef void _set_found_char_buf(
const bint suffs_not_prefs,
const unsigned char[:] searched_string_v,
const unsigned int searched_string_len,
const unsigned char[:] search_chars_v,
const unsigned int search_chars_v_len,
char* found_char_buf,
const unsigned int found_char_buf_len,
cdef void _populate_search_buf(
const Py_UCS4* text_buf,
const int tok_idx,
const int tok_len,
Py_UCS4* search_buf,
Py_UCS4* ref_buf,
const int search_buf_len,
Py_UCS4* finding_buf,
const int finding_buf_len,
bint suffs_not_prefs
)

View File

@@ -181,9 +181,13 @@ class Doc:
pref_lengths: List[int],
suff_lengths: List[int],
pref_search_chars: str,
pref_ref_chars: str,
pref_search_char_length: int,
pref_search_lengths: List[int],
suff_search_chars: str,
suff_search_lengths: List[int]
suff_ref_chars: str,
suff_search_char_length: int,
suff_search_lengths: List[int],
) -> Ints2d: ...
@staticmethod
def _get_array_attrs() -> Tuple[Any]: ...

View File

@@ -3,7 +3,7 @@ from typing import Set, List
cimport cython
cimport numpy as np
from libc.string cimport memcpy
from libc.string cimport memcpy, memcmp, memset
from libc.math cimport sqrt
from libc.stdint cimport int32_t, uint64_t
@@ -42,12 +42,6 @@ from ..util import get_words_and_spaces
DEF PADDING = 5
cdef extern from *:
Py_UCS4 PyUnicode_READ(int kind, void *data, int index)
void* PyUnicode_DATA(void* o)
int PyUnicode_KIND(void *data)
Py_UCS4 Py_UNICODE_TOLOWER(Py_UCS4 ch)
cdef int bounds_check(int i, int length, int padding) except -1:
if (i + padding) < 0:
raise IndexError(Errors.E026.format(i=i, length=length))
@@ -111,6 +105,16 @@ class SetEntsDefault(str, Enum):
return list(cls.__members__.keys())
cdef extern from "unicodeobject.h":
Py_UCS4 PyUnicode_READ(int kind, void *data, int index)
void* PyUnicode_DATA(void* o)
void PyUnicode_READY(void * o)
int PyUnicode_KIND(void *data)
int PyUnicode_IS_COMPACT(void *data)
Py_UCS4 Py_UNICODE_TOLOWER(Py_UCS4 ch)
cdef class Doc:
"""A sequence of Token objects. Access sentences and named entities, export
annotations to numpy arrays, losslessly serialize to compressed binary
@@ -1742,33 +1746,37 @@ cdef class Doc:
def get_character_combination_hashes(
self,
*,
bint case_sensitive,
bint cs,
pref_lengths: List[int],
suff_lengths: List[int],
str pref_search_chars,
char* pref_search,
char* pref_ref,
int pref_s_char_l,
pref_search_lengths: List[int],
str suff_search_chars,
suff_search_lengths: List[int]
char* suff_search,
char* suff_ref,
int suff_s_char_l,
suff_search_lengths: List[int],
):
"""
Returns a 2D NumPy array where the rows represent tokens and the columns represent hashes of various character combinations
derived from the string (text/orth) of each token.
case_sensitive: if *False*, the lower-case version of each token string is used as the basis for generating hashes. Note that
if *case_sensitive==False*, upper-case characters in *search_chars* will not be found in token strings.
cs: if *False*, the lower-case version of each token string is used as the basis for generating hashes. Note that
if *cs==False*, upper-case characters in *search_chars* will not be found in token strings.
pref_lengths: an integer list specifying the lengths of prefixes to be hashed. For example, if *pref_lengths==[2, 3]*,
the prefixes hashed for "spaCy" would be "sp" and "spa".
suff_lengths: an integer list specifying the lengths of suffixes to be hashed. For example, if *suff_lengths==[2, 3]* and
*case_sensitive == True*, the suffixes hashed for "spaCy" would be "Cy" and "aCy".
pref_search_chars: a string containing characters to search for within each token, starting at the beginning.
pref_search_lengths: an integer list specifying the lengths of search results to be hashed. For example if
*pref_search_lengths==[1, 2]*, *pref_search_chars=="aC"* and *case_sensitive==False*, the searched strings hashed for
*pref_search_lengths==[1, 2]*, *pref_search_chars=="aC"* and *cs==False*, the searched strings hashed for
"spaCy" would be "a" and "ac".
suff_search_chars: a string containing characters to search for within each token, starting at the end.
suff_search_lengths: an integer list specifying the lengths of search results to be hashed. For example if
*suff_search_lengths==[1, 2]*, *suff_search_chars=="aC"* and *case_sensitive==False*, the searched strings hashed for
*suff_search_lengths==[1, 2]*, *suff_search_chars=="aC"* and *cs==False*, the searched strings hashed for
"spaCy" would be "c" and "ca".
For a document with tokens ["spaCy", "and", "Prodigy"], the NumPy array returned by
@@ -1779,99 +1787,61 @@ cdef class Doc:
[hash("Pr") ,hash("gy"), hash("digy"), hash("rodigy"), hash("y"), hash("y ")]]
"""
cdef int max_pref_l = max(pref_lengths) if len(pref_lengths) > 0 else 0
cdef int max_suff_l = max(suff_lengths) if len(suff_lengths) > 0 else 0
cdef int aff_buf_l = max_pref_l + max_suff_l
cdef int max_s_pref_l = max(pref_search_lengths) if len(pref_search_lengths) > 0 else 0
cdef int max_s_suff_l = max(suff_search_lengths) if len(suff_search_lengths) > 0 else 0
cdef int longest_pref = max(pref_lengths) if len(pref_lengths) > 0 else 0
cdef int longest_suff = max(suff_lengths) if len(suff_lengths) > 0 else 0
cdef Py_UCS4* affix_buf = <Py_UCS4*>self.mem.alloc(4, longest_pref + longest_suff)
cdef void* text_ptr = <void*> self.text
cdef void* text_data_ptr = <void*> PyUnicode_DATA(text_ptr) # todo change to const void
cdef unsigned int unicode_byte_width = PyUnicode_KIND(text_ptr), num_toks = len(self), tok_idx, token_idx, token_len
cdef TokenC token_c
cdef str working_str
for tok_idx in range(num_toks):
token_c = self.c[tok_idx]
token_idx = token_c.idx
token_len = token_c.lex.length
_populate_affix_buf(
text_data_ptr,
unicode_byte_width,
token_idx,
token_len,
affix_buf,
longest_pref,
longest_suff,
not case_sensitive
)
cdef const unsigned char[:] pref_search_chars_v = _get_utf16_memoryview(pref_search_chars, True)
cdef const unsigned char[:] suff_search_chars_v = _get_utf16_memoryview(suff_search_chars, True)
cdef unsigned int longest_search_length = max(pref_search_lengths + suff_search_lengths) if len(pref_search_lengths + suff_search_lengths) > 0 else 0
cdef bytes found_char_buf_bytes = (bytes(" " * longest_search_length, "UTF-16"))[2:] # first two bytes express endianness
cdef char* found_char_buf = found_char_buf_bytes
cdef unsigned int pref_search_chars_v_len = len(pref_search_chars_v), suff_search_chars_v_len = len(suff_search_chars_v)
cdef unsigned int found_char_buf_len = len(found_char_buf_bytes)
cdef Py_UCS4* aff_buf = <Py_UCS4*>self.mem.alloc(4, aff_buf_l)
cdef Py_UCS4* pref_s_buf = <Py_UCS4*>pref_search
cdef Py_UCS4* pref_r_buf = <Py_UCS4*>pref_ref
cdef Py_UCS4* pref_f_buf = <Py_UCS4*>self.mem.alloc(4, max_s_pref_l)
cdef Py_UCS4* suff_s_buf = <Py_UCS4*>suff_search
cdef Py_UCS4* suff_r_buf = <Py_UCS4*>suff_ref
cdef Py_UCS4* suff_f_buf = <Py_UCS4*>self.mem.alloc(4, max_s_suff_l)
cdef unsigned int num_pref_norm_hashes = len(pref_lengths), num_suff_norm_hashes = len(suff_lengths)
cdef unsigned int num_pref_search_hashes = len(pref_search_lengths)
cdef unsigned int num_suff_search_hashes = len(suff_search_lengths)
cdef np.ndarray[np.int64_t, ndim=2] hashes = numpy.empty((num_toks, num_pref_norm_hashes + num_suff_norm_hashes + num_pref_search_hashes + num_suff_search_hashes), dtype="int64")
cdef bytes encoded_text = self.text.encode("utf-32le")
cdef char* intermediate_text = encoded_text
cdef Py_UCS4* text_buf = <Py_UCS4*> intermediate_text
cdef const unsigned char[:] tok_str_v
cdef unsigned int tok_str_v_len, hash_idx, affix_start, char_comb_len
cdef attr_t num_tok_attr
cdef str str_tok_attr
cdef unsigned int num_toks = len(self), aff_len
cdef unsigned int h_pref_n = len(pref_lengths)
cdef unsigned int h_suff_n = len(suff_lengths), h_suff_end_idx = len(pref_lengths) + len(suff_lengths)
cdef unsigned int h_pref_s_n = len(pref_search_lengths), h_pref_s_end_idx = h_suff_end_idx + h_pref_s_n
cdef unsigned int h_suff_s_n = len(suff_search_lengths), h_suff_s_end_idx = h_pref_s_end_idx + h_suff_s_n
cdef np.ndarray[np.int64_t, ndim=2] hashes = numpy.empty((num_toks, h_suff_s_end_idx), dtype="int64")
for tok_idx in range(num_toks):
num_tok_attr = self.c[tok_idx].lex.orth if case_sensitive else self.c[tok_idx].lex.lower
str_tok_attr = self.vocab.strings[num_tok_attr]
tok_str_v = _get_utf16_memoryview(str_tok_attr, False)
tok_str_v_len = len(tok_str_v)
cdef TokenC tok_c
for hash_idx in range(num_pref_norm_hashes):
char_comb_len = pref_lengths[hash_idx] * 2
if char_comb_len > tok_str_v_len:
char_comb_len = tok_str_v_len
hashes[tok_idx, hash_idx] = hash32(<void*> &tok_str_v[0], char_comb_len, 0)
for hash_idx in range(num_pref_norm_hashes, num_pref_norm_hashes + num_suff_norm_hashes):
char_comb_len = suff_lengths[hash_idx - num_pref_norm_hashes] * 2
if char_comb_len > tok_str_v_len:
char_comb_len = tok_str_v_len
affix_start = tok_str_v_len - char_comb_len
hashes[tok_idx, hash_idx] = hash32(<void*> &tok_str_v[affix_start], char_comb_len, 0)
_set_found_char_buf(
False,
tok_str_v,
tok_str_v_len,
pref_search_chars_v,
pref_search_chars_v_len,
found_char_buf,
found_char_buf_len,
)
for tok_i in range(num_toks):
tok_c = self.c[tok_i]
tok_idx = tok_c.idx
tok_len = tok_c.lex.length
for hash_idx in range(num_pref_norm_hashes + num_suff_norm_hashes, num_pref_norm_hashes + num_suff_norm_hashes + num_pref_search_hashes):
char_comb_len = pref_search_lengths[hash_idx - (num_pref_norm_hashes + num_suff_norm_hashes)] * 2
hashes[tok_idx, hash_idx] = hash32(found_char_buf, char_comb_len, 0)
_populate_aff_buf(text_buf, tok_idx, tok_len, aff_buf, max_pref_l, max_suff_l, not cs)
_populate_search_buf(text_buf, tok_idx, tok_len, pref_s_buf, pref_r_buf, pref_s_char_l, pref_f_buf, max_s_pref_l, False)
_populate_search_buf(text_buf, tok_idx, tok_len, suff_s_buf, suff_r_buf, suff_s_char_l, suff_f_buf, max_s_suff_l, True)
for hash_idx in range(h_pref_n):
aff_len = pref_lengths[hash_idx]
hashes[tok_i, hash_idx] = hash32(aff_buf, aff_len * 4, 0)
_set_found_char_buf(
True,
tok_str_v,
tok_str_v_len,
suff_search_chars_v,
suff_search_chars_v_len,
found_char_buf,
found_char_buf_len,
)
for hash_idx in range(h_pref_n, h_suff_end_idx):
aff_len = suff_lengths[hash_idx - h_pref_n]
hashes[tok_i, hash_idx] = hash32(aff_buf + aff_buf_l - aff_len, aff_len * 4, 0)
for hash_idx in range(h_suff_end_idx, h_pref_s_end_idx):
aff_len = pref_search_lengths[hash_idx - h_suff_end_idx]
hashes[tok_i, hash_idx] = hash32(pref_f_buf, aff_len * 4, 0)
for hash_idx in range(num_pref_norm_hashes + num_suff_norm_hashes + num_pref_search_hashes, num_pref_norm_hashes + num_suff_norm_hashes + num_pref_search_hashes + num_suff_search_hashes):
char_comb_len = suff_search_lengths[hash_idx - (num_pref_norm_hashes + num_suff_norm_hashes + num_pref_search_hashes)] * 2
hashes[tok_idx, hash_idx] = hash32(found_char_buf, char_comb_len, 0)
for hash_idx in range(h_pref_s_end_idx, h_suff_s_end_idx):
aff_len = suff_search_lengths[hash_idx - h_pref_s_end_idx]
hashes[tok_i, hash_idx] = hash32(suff_f_buf, aff_len * 4, 0)
self.mem.free(aff_buf)
self.mem.free(pref_f_buf)
self.mem.free(suff_f_buf)
return hashes
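
As a rough pure-Python model of the row layout computed above (illustrative only: the real code hashes fixed-width UTF-32LE byte sequences with hash32() and pads short entries with zero characters, which this sketch glosses over):

def hash_columns(tok, pref_lengths, suff_lengths, pref_found, suff_found,
                 pref_search_lengths, suff_search_lengths):
    # One entry per output column, in the same order as the loops above;
    # each entry is hashed to fill one cell of the token's row.
    cols = [tok[:n] for n in pref_lengths]
    cols += [tok[-n:] for n in suff_lengths]
    cols += [pref_found[:n] for n in pref_search_lengths]
    cols += [suff_found[:n] for n in suff_search_lengths]
    return cols

# e.g. hash_columns("spacy", [1, 4, 3], [2, 3, 4, 5], "", "p ", [2], [2, 1])
# -> ["s", "spac", "spa", "cy", "acy", "pacy", "spacy", "", "p ", "p"]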
@staticmethod
@@ -2055,12 +2025,11 @@ cdef int [:,:] _get_lca_matrix(Doc doc, int start, int end):
return lca_matrix
cdef void _populate_affix_buf(
const void* str_data_ptr,
const unsigned int unicode_byte_width,
const int word_idx,
const int word_len,
Py_UCS4* affix_buf,
cdef void _populate_aff_buf(
const Py_UCS4* text_buf,
const int tok_idx,
const int tok_len,
Py_UCS4* aff_buf,
const int pref_length,
const int suff_length,
const bint to_lower
@@ -2070,107 +2039,75 @@ cdef void _populate_affix_buf(
str_data_ptr: a pointer to the raw data in the containing string, which must be in canonical
Unicode form (see PEP 393).
unicode_byte_width: the number of bytes occupied by each character in the containing string.
kind: the number of bytes occupied by each character in the containing string.
word_idx: the index of the first character of the word within the containing string.
word_len: the length of the word.
affix_buf: the buffer to populate.
aff_buf: the buffer to populate.
pref_length: the length of the prefix.
suff_length: the length of the suffix.
to_lower: if *True*, any upper case characters in either affix are converted to lower case.
"""
cdef int affix_buf_idx = 0, buf_size = pref_length + suff_length, in_word_idx
cdef Py_UCS4 working_wchar
cdef int aff_buf_idx = 0, buf_size = pref_length + suff_length, in_word_idx
while aff_buf_idx < pref_length and aff_buf_idx < tok_len:
while affix_buf_idx < pref_length and affix_buf_idx < word_len:
working_wchar = PyUnicode_READ(unicode_byte_width, str_data_ptr, in_word_idx)
memcpy(aff_buf + aff_buf_idx, text_buf + tok_idx + aff_buf_idx, 4)
if to_lower:
working_wchar = Py_UNICODE_TOLOWER(working_wchar)
memcpy(affix_buf + affix_buf_idx, &working_wchar, 4)
affix_buf_idx += 1
aff_buf[aff_buf_idx] = Py_UNICODE_TOLOWER(aff_buf[aff_buf_idx])
aff_buf_idx += 1
while (affix_buf_idx < buf_size - suff_length) or (affix_buf_idx < buf_size - word_len):
if aff_buf_idx < buf_size - tok_len:
# fill out the empty middle part of the buffer with zeros
affix_buf[affix_buf_idx] = 0
affix_buf_idx += 1
memset(aff_buf + aff_buf_idx, 0, 4 * (buf_size - min(suff_length, tok_len) - aff_buf_idx))
aff_buf_idx = buf_size - min(suff_length, tok_len)
while affix_buf_idx < buf_size:
in_word_idx = affix_buf_idx + word_len - buf_size
while aff_buf_idx < buf_size:
in_word_idx = aff_buf_idx + tok_len - buf_size
# for suffixes we have to track the in-word index separately from the in-buffer index
if in_word_idx < pref_length:
# we've already retrieved this character as part of the prefix, so copy it from there
# as that's quicker than retrieving it from the input string a second time
memcpy(affix_buf + affix_buf_idx, affix_buf + in_word_idx, 4)
memcpy(aff_buf + aff_buf_idx, aff_buf + in_word_idx, 4)
else:
working_wchar = PyUnicode_READ(unicode_byte_width, str_data_ptr, word_idx + in_word_idx)
memcpy(aff_buf + aff_buf_idx, text_buf + tok_idx + in_word_idx, 4)
if to_lower:
working_wchar = Py_UNICODE_TOLOWER(working_wchar)
memcpy(affix_buf + affix_buf_idx, &working_wchar, 4)
affix_buf_idx += 1
aff_buf[aff_buf_idx] = Py_UNICODE_TOLOWER(aff_buf[aff_buf_idx])
aff_buf_idx += 1
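
A pure-Python sketch of what this function leaves in the buffer (illustrative; "\x00" stands for the zero padding): the prefix sits left-aligned, the suffix right-aligned, and zeros fill any gap when the token is shorter than the buffer:

def affix_buf(tok, pref_len, suff_len, to_lower=True):
    if to_lower:
        tok = tok.lower()
    size = pref_len + suff_len
    buf = ["\x00"] * size
    for i in range(min(pref_len, len(tok))):               # left-aligned prefix
        buf[i] = tok[i]
    for i in range(max(size - suff_len, size - len(tok)), size):
        buf[i] = tok[i + len(tok) - size]                  # right-aligned suffix
    return "".join(buf)

# affix_buf("and", 4, 5) -> "and\x00\x00\x00and": hashing length-4 prefixes and
# length-4 suffixes of this buffer yields "and\x00" and "\x00and" respectively.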
cdef const unsigned char[:] _get_utf16_memoryview(str unicode_string, const bint check_2_bytes):
"""
Return a memory view of the UTF-16 representation of a string with the default endianness of the platform.
Throw a ValueError if *check_2_bytes == True* and one or more characters in the UTF-16 representation
occupy four bytes rather than two.
"""
cdef const unsigned char[:] view = unicode_string.encode("UTF-16")
view = view[2:] # first two bytes express endianness
cdef unsigned int unicode_len, view_len
if check_2_bytes:
unicode_len = len(unicode_string)
view_len = len(view)
if unicode_len * 2 != view_len:
raise ValueError(Errors.E1046)
return view
cdef bint _is_searched_char_in_search_chars_v(
const unsigned short searched_char,
const unsigned char[:] search_chars_v,
const unsigned int search_chars_v_len
cdef void _populate_search_buf(
const Py_UCS4* text_buf,
const int tok_idx,
const int tok_len,
Py_UCS4* search_buf,
Py_UCS4* ref_buf,
const int search_buf_len,
Py_UCS4* finding_buf,
const int finding_buf_len,
bint suffs_not_prefs
):
cdef unsigned int search_chars_v_idx = 0
while search_chars_v_idx < search_chars_v_len:
if searched_char == (<unsigned short*> &search_chars_v[search_chars_v_idx])[0]:
return True
search_chars_v_idx += 2
return False
cdef unsigned int finding_buf_idx = 0, text_string_idx = tok_idx + (tok_len - 1) if suffs_not_prefs else tok_idx
cdef unsigned int search_buf_idx
cdef int cmp_res
cdef void _set_found_char_buf(
const bint suffs_not_prefs,
const unsigned char[:] searched_string_v,
const unsigned int searched_string_v_len,
const unsigned char[:] search_chars_v,
const unsigned int search_chars_v_len,
char* found_char_buf,
const unsigned int found_char_buf_len,
):
""" Pick the UTF-16 characters from *searched_string_v* that are also in *search_chars_v* and writes them in order to *found_char_buf*.
If *suffs_not_prefs*, the search starts from the end of *searched_string_v* rather than from the beginning.
"""
cdef unsigned int found_char_buf_idx = 0, searched_string_idx = searched_string_v_len - 2 if suffs_not_prefs else 0
cdef unsigned short searched_char, SPACE = 32
while found_char_buf_idx < found_char_buf_len:
searched_char = (<unsigned short*> &searched_string_v[searched_string_idx])[0]
if _is_searched_char_in_search_chars_v(searched_char, search_chars_v, search_chars_v_len):
memcpy(found_char_buf + found_char_buf_idx, &searched_char, 2)
found_char_buf_idx += 2
if suffs_not_prefs:
if searched_string_idx <= 0:
while finding_buf_idx < finding_buf_len:
for search_buf_idx in range(search_buf_len):
cmp_res = memcmp(search_buf + search_buf_idx, text_buf + text_string_idx, 4)
if cmp_res == 0:
memcpy(finding_buf + finding_buf_idx, ref_buf + search_buf_idx, 4)
finding_buf_idx += 1
if cmp_res >= 0:
break
searched_string_idx -= 2
if suffs_not_prefs:
if text_string_idx <= tok_idx:
break
text_string_idx -= 1
else:
searched_string_idx += 2
if searched_string_idx >= searched_string_v_len:
text_string_idx += 1
if text_string_idx >= tok_idx + tok_len:
break
while found_char_buf_idx < found_char_buf_len:
memcpy(found_char_buf + found_char_buf_idx, &SPACE, 2)
found_char_buf_idx += 2
if finding_buf_idx < finding_buf_len:
memset(finding_buf + finding_buf_idx, 0, finding_buf_len - finding_buf_idx)
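
A pure-Python sketch of this search loop (illustrative; it compares code points directly, whereas the Cython code compares 4-byte little-endian units with memcmp, and the search array is sorted in that same byte order, so the early exit remains valid):

def find_search_chars(token, search_chars, ref_chars, max_found, suffs_not_prefs):
    found = []
    for ch in (reversed(token) if suffs_not_prefs else token):
        for s, r in zip(search_chars, ref_chars):  # search_chars is sorted
            if s == ch:
                found.append(r)  # emit the ref counterpart, not the match itself
                break
            if s > ch:           # sorted, so no later entry can match
                break
        if len(found) == max_found:
            break
    return "".join(found)        # the real buffer is then zero-padded to length

# find_search_chars("spacy", "prx", "prx", 2, suffs_not_prefs=True) -> "p"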
def pickle_doc(doc):

View File

@@ -1739,104 +1739,68 @@ def all_equal(iterable):
def get_byte_arrays_for_search_chars(
search_chars: str, case_sensitive: bool
) -> Tuple[bytes, bytes, bytes, bytes, bytes, bytes]:
) -> Tuple[bytes, bytes]:
"""
The text of a spaCy document is stored as a Python-internal Unicode representation
as defined by PEP 393. Each character in such a representation has the width of the
longest character in the string, which is either 1, 2 or 4 bytes.
This function supports the rich feature extractor. It returns search byte arrays with
1-, 2- and 4-byte character widths that are used for comparison with each of the three
representation types when searching document texts for search characters. Each byte array
contains characters that are as wide or narrower than its own width; a byte array can
ignore characters that are wider than its own width because a spaCy document with the
corresponding representation width could never contain characters wider than that width.
4-byte character width that are used for comparison when searching document texts
for search characters. The encoding is little-endian regardless of architecture, as
this is what is expected by the murmurhash library used downstream.
When characters corresponding to search characters are found within a spaCy token
string, they are concatenated together and the resulting "finding byte arrays" are hashed.
It is crucial that the characters in all finding byte arrays representing a given sequence of
characters share the same width so that they all yield the same hash values. While it
would be possible to use the narrowest possible width for the sequence like PEP 393 does,
determining this would entail unnecessary processing. Instead, finding byte arrays always use
a 4-byte width. Each of the three search byte array therefore has a corresponding finding
byte array that is used to build up the finding byte arrays for specific document token strings.
If *case_sensitive==False*, the lower- or uppercase counterparts of any characters that
Alongside the "search byte array" against which words from document texts are compared
is the "ref byte array". When a character from the search byte array is matched,
the character at the corresponding position in the ref byte array is added to the
byte sequence of the configured length that is then hashed. This enables case-sensitivity
to be handled without converting the case of the words being searched: if
*case_sensitive==False*, the lower- or uppercase counterparts of any characters that
have case are added to the search byte arrays, and both the original character and its
other-cased counterpart map to the lower-case version in the finding byte array.
All encodings are little-endian regardless of architecture, as this is what is expected by the
murmurhash library used downstream.
other-cased counterpart map to the lower-case version in the ref byte array.
"""
def encode(ch: str, width: int) -> bytes:
def encode(ch: str) -> bytes:
"""
ch: a single character
int: the width of the character encoding to use
"""
if width == 4:
return ch.encode("UTF-32LE")
elif width == 2:
return ch.encode("UTF-16LE")
else:
return ch.encode("UTF-8")
return ch.encode("UTF-32LE")
def add_to_byte_arrays(
search: List[bytes], finding: List[bytes], ch: str, width: int
search: List[bytes], ref: List[bytes], ch: str
) -> None:
"""Add the byte representations of *ch* with representation of width
*width* to the two byte array lists.
"""Add the byte representations of *ch* to the two byte array lists.
"""
this_char_bytes = encode(ch, width)
this_char_bytes_f = encode(ch, 4)
this_char_bytes = encode(ch)
if not case_sensitive and ch.islower():
if this_char_bytes not in search:
search.append(this_char_bytes)
finding.append(this_char_bytes_f)
upper_char_bytes = encode(ch.upper(), width)
ref.append(this_char_bytes)
upper_char_bytes = encode(ch.upper())
if upper_char_bytes not in search:
search.append(upper_char_bytes)
finding.append(this_char_bytes_f)
ref.append(this_char_bytes)
elif not case_sensitive and ch.isupper():
lower_char_bytes = encode(ch.lower(), width)
lower_char_bytes_f = encode(ch.lower(), 4)
lower_char_bytes = encode(ch.lower())
if this_char_bytes not in search:
search.append(this_char_bytes)
finding.append(lower_char_bytes_f)
ref.append(lower_char_bytes)
if lower_char_bytes not in search:
search.append(lower_char_bytes)
finding.append(lower_char_bytes_f)
ref.append(lower_char_bytes)
elif this_char_bytes not in search:
search.append(this_char_bytes)
finding.append(this_char_bytes_f)
ref.append(this_char_bytes)
def get_ordered_raw_bytes(
search: List[bytes], finding: List[bytes]
search: List[bytes], ref: List[bytes]
) -> Tuple[bytes, bytes]:
"""Flatten the two lists, ordering both by the entries in *search*
using the native endianness of the platform.
"""
num_search = [list(entry) for entry in search]
search = [entry for _, entry in sorted(zip(num_search, search))]
finding = [entry for _, entry in sorted(zip(num_search, finding))]
return b"".join(search), b"".join(finding)
ref = [entry for _, entry in sorted(zip(num_search, ref))]
return b"".join(search), b"".join(ref)
w1_search: List[bytes] = []
w1_finding: List[bytes] = []
w2_search: List[bytes] = []
w2_finding: List[bytes] = []
w4_search: List[bytes] = []
w4_finding: List[bytes] = []
search: List[bytes] = []
ref: List[bytes] = []
for ch in search_chars:
add_to_byte_arrays(w4_search, w4_finding, ch, 4)
if ord(ch) >= 65536:
continue
add_to_byte_arrays(w2_search, w2_finding, ch, 2)
if ord(ch) >= 128:
continue
add_to_byte_arrays(w1_search, w1_finding, ch, 1)
return (
get_ordered_raw_bytes(w1_search, w1_finding)
+ get_ordered_raw_bytes(w2_search, w2_finding)
+ get_ordered_raw_bytes(w4_search, w4_finding)
)
add_to_byte_arrays(search, ref, ch)
return get_ordered_raw_bytes(search, ref)
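
A worked example of the returned pair, grounded in the width-2 test earlier in this commit: with *case_sensitive==False* the upper-case counterparts join the search byte array, and both cases map to the lower-case character at the same position in the ref byte array:

from spacy.util import get_byte_arrays_for_search_chars

search, ref = get_byte_arrays_for_search_chars("bféwfw", False)
assert search == "BFWbfwÉé".encode("UTF-32LE")
assert ref == "bfwbfwéé".encode("UTF-32LE")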