Changes after review discussion — intermediate state

richardpaulhudson 2022-10-27 18:03:25 +02:00
parent 7d8258bec8
commit a1b8697aab
10 changed files with 294 additions and 308 deletions

View File

@@ -214,6 +214,8 @@ class Warnings(metaclass=ErrorsWithCodes):
"is a Cython extension type.")
W123 = ("Argument {arg} with value {arg_value} is used instead of {config_value} as specified in the config. Be "
"aware that this might affect other components in your pipeline.")
W124 = ("Using the features PREFIX and/or SUFFIX in a RichFeatureExtractor configuration may lead to the same "
"information being fed forward twice if prefixes and suffixes of corresponding lengths are specified.")
class Errors(metaclass=ErrorsWithCodes):
@@ -953,6 +955,8 @@ class Errors(metaclass=ErrorsWithCodes):
E1046 = ("{cls_name} is an abstract class and cannot be instantiated. If you are looking for spaCy's default "
"knowledge base, use `InMemoryLookupKB`.")
E1047 = ("Invalid rich group config '{label}'.")
E1048 = ("Length > 63 in rich group config '{label}'.")
E1049 = ("Error splitting UTF-8 byte string into separate characters.")
# Deprecated model shortcuts, only used in errors and warnings

View File

@@ -1,5 +1,6 @@
from typing import Optional, List, Union, cast
import warnings
from spacy.ml.richfeatureextractor import RichFeatureExtractor
from thinc.types import Floats2d, Ints2d, Ragged
from thinc.api import chain, clone, concatenate, with_array, with_padded
@@ -8,7 +9,7 @@ from thinc.api import expand_window, residual, Maxout, Mish, PyTorchLSTM
from ...tokens import Doc
from ...util import registry
from ...errors import Errors
from ...errors import Errors, Warnings
from ...ml import _character_embed
from ..staticvectors import StaticVectors
from ..featureextractor import FeatureExtractor
@@ -207,6 +208,8 @@ def _verify_rich_config_group(
raise ValueError(Errors.E1047.format(label=label))
elif search_chars is not None:
raise ValueError(Errors.E1047.format(label=label))
if lengths is not None and max(lengths) > 63:
raise ValueError(Errors.E1048.format(label=label))
@registry.architectures("spacy.RichMultiHashEmbed.v1")
@@ -246,13 +249,13 @@ def RichMultiHashEmbed(
depending on the presence of some other letter before or after it, e.g. German
plural nouns where the final two vowels are `ä-e` regularly correspond to
singular lemmas where the `e` is no longer present and the `ä` has become `a`.
For most languages used with spaCy, searching is likely to be useful starting
at the end (`suff_*`), but the ability to search from the beginning (`pref_*`)
is also offered for completeness. Search characters should consist of all
characters that regularly alternate with other characters in the language in
question or whose presence before or after characters that would otherwise
alternate prevents the alternation from occurring, e.g. an `ä` in a German
plural noun does not become `a` if it is the third or fourth vowel from the
end of the word.
width (int): The output width. Also used as the width of the embedding tables.
@@ -263,27 +266,27 @@ def RichMultiHashEmbed(
same length as attrs.
include_static_vectors (bool): Whether to also use static word vectors.
Requires a vectors table to be loaded in the Doc objects' vocab.
case_sensitive (bool): Whether lower-case and upper-case letters should be
distinguished when generating the character combinations to use as features.
pref_lengths (Optional[List[int]]): The lengths of prefixes to use as features
for each word, e.g. for the word `spaCy`:
`[1, 3]` would lead to `s` and `spa` being used as features.
pref_rows (Optional[List[int]]): The number of rows for each of `pref_lengths`.
suff_lengths (Optional[List[int]]): The lengths of suffixes to use as features
for each word, e.g. for the word `spaCy`:
`[1, 3]` would lead to `y` and `aCy` being used as features.
suff_rows (Optional[List[int]]): The number of rows for each of `suff_lengths`.
pref_search_chars (Optional[str]): A string containing characters to search for
starting from the beginning of each word.
pref_search_lengths (Optional[List[int]]): The lengths of search result strings
to use as features, where the searches start from the beginning of each word.
pref_search_rows (Optional[List[int]]): The number of rows for each of
`pref_search_lengths`.
suff_search_chars (Optional[str]): A string containing characters to search for
starting from the end of each word.
suff_search_lengths (Optional[List[int]]): The lengths of search result strings
to use as features, where the searches start from the end of each word.
suff_search_rows (Optional[List[int]]): The number of rows for each of
`suff_search_lengths`.
"""
@@ -313,6 +316,9 @@ def RichMultiHashEmbed(
case_sensitive,
)
if "PREFIX" in attrs or "SUFFIX" in attrs:
warnings.warn(Warnings.W124)
if pref_rows is not None:
rows.extend(pref_rows)
if suff_rows is not None:

View File

@@ -1,5 +1,7 @@
from typing import List, Optional, Callable, Tuple
from ..util import get_arrays_for_search_chars
from spacy.util import get_search_char_byte_arrays
# from ..util import get_arrays_for_search_chars
from thinc.types import Ints1d, Ints2d
from thinc.api import Model, registry, get_current_ops
@@ -19,17 +21,23 @@ def RichFeatureExtractor(
) -> Model[List[Doc], List[Ints2d]]:
ops = get_current_ops()
if pref_search_chars is not None:
pref_search, pref_lookup = get_arrays_for_search_chars(
pref_search_chars, case_sensitive
)
(
ps_1byte_ch,
ps_2byte_ch,
ps_3byte_ch,
ps_4byte_ch,
) = get_search_char_byte_arrays(pref_search_chars, case_sensitive)
else:
pref_search, pref_lookup = bytes(), bytes()
ps_1byte_ch = ps_2byte_ch = ps_3byte_ch = ps_4byte_ch = bytes()
if suff_search_chars is not None:
suff_search, suff_lookup = get_arrays_for_search_chars(
suff_search_chars, case_sensitive
)
(
ss_1byte_ch,
ss_2byte_ch,
ss_3byte_ch,
ss_4byte_ch,
) = get_search_char_byte_arrays(suff_search_chars, case_sensitive)
else:
suff_search, suff_lookup = bytes(), bytes()
ss_1byte_ch = ss_2byte_ch = ss_3byte_ch = ss_4byte_ch = bytes()
return Model(
"extract_character_combination_hashes",
forward,
@@ -41,19 +49,17 @@ def RichFeatureExtractor(
"suff_lengths": ops.asarray1i(suff_lengths)
if suff_lengths is not None
else ops.asarray1i([]),
"pref_search": pref_search,
"pref_lookup": pref_lookup,
"pref_search_char_len": len(pref_search) / 4
if pref_search_chars is not None
else 0,
"pref_search_1_byte": ps_1byte_ch,
"pref_search_2_bytes": ps_2byte_ch,
"pref_search_3_bytes": ps_3byte_ch,
"pref_search_4_bytes": ps_4byte_ch,
"pref_search_lengths": ops.asarray1i(pref_search_lengths)
if pref_search_lengths is not None
else ops.asarray1i([]),
"suff_search": suff_search,
"suff_lookup": suff_lookup,
"suff_search_char_len": len(suff_search) / 4
if suff_search_chars is not None
else 0,
"suff_search_1_byte": ss_1byte_ch,
"suff_search_2_bytes": ss_2byte_ch,
"suff_search_3_bytes": ss_3byte_ch,
"suff_search_4_bytes": ss_4byte_ch,
"suff_search_lengths": ops.asarray1i(suff_search_lengths)
if suff_search_lengths is not None
else ops.asarray1i([]),
@@ -68,13 +74,15 @@ def forward(
case_sensitive: bool = model.attrs["case_sensitive"]
pref_lengths: Ints1d = model.attrs["pref_lengths"]
suff_lengths: Ints1d = model.attrs["suff_lengths"]
pref_search: bytes = model.attrs["pref_search"]
pref_lookup: bytes = model.attrs["pref_lookup"]
pref_search_char_len: int = model.attrs["pref_search_char_len"]
ps_1byte_ch: bytes = model.attrs["pref_search_1_byte"]
ps_2byte_ch: bytes = model.attrs["pref_search_2_bytes"]
ps_3byte_ch: bytes = model.attrs["pref_search_3_bytes"]
ps_4byte_ch: bytes = model.attrs["pref_search_4_bytes"]
pref_search_lengths: Ints1d = model.attrs["pref_search_lengths"]
suff_search: bytes = model.attrs["suff_search"]
suff_lookup: bytes = model.attrs["suff_lookup"]
suff_search_char_len: int = model.attrs["suff_search_char_len"]
ss_1byte_ch: bytes = model.attrs["suff_search_1_byte"]
ss_2byte_ch: bytes = model.attrs["suff_search_2_bytes"]
ss_3byte_ch: bytes = model.attrs["suff_search_3_bytes"]
ss_4byte_ch: bytes = model.attrs["suff_search_4_bytes"]
suff_search_lengths: Ints1d = model.attrs["suff_search_lengths"]
features: List[Ints2d] = []
for doc in docs:
@@ -82,13 +90,15 @@ def forward(
cs=case_sensitive,
p_lengths=pref_lengths,
s_lengths=suff_lengths,
ps_search=pref_search,
ps_lookup=pref_lookup,
ps_l=pref_search_char_len,
ps_1byte_ch=ps_1byte_ch,
ps_2byte_ch=ps_2byte_ch,
ps_3byte_ch=ps_3byte_ch,
ps_4byte_ch=ps_4byte_ch,
ps_lengths=pref_search_lengths,
ss_search=suff_search,
ss_lookup=suff_lookup,
ss_l=suff_search_char_len,
ss_1byte_ch=ss_1byte_ch,
ss_2byte_ch=ss_2byte_ch,
ss_3byte_ch=ss_3byte_ch,
ss_4byte_ch=ss_4byte_ch,
ss_lengths=suff_search_lengths,
)
features.append(ops.asarray2i(hashes))
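
For each doc, the returned array has one row per token and one column per configured hash, with the columns ordered as prefix hashes, suffix hashes, prefix-search hashes, then suffix-search hashes. A quick shape check as a sketch, with hypothetical configuration values:

import numpy

pref_lengths = [1, 3]         # 2 prefix columns
suff_lengths = [2, 3, 4]      # 3 suffix columns
pref_search_lengths = []      # 0 prefix-search columns
suff_search_lengths = [1, 2]  # 2 suffix-search columns

n_columns = (len(pref_lengths) + len(suff_lengths)
             + len(pref_search_lengths) + len(suff_search_lengths))
hashes = numpy.empty((5, n_columns), dtype="int64")  # 5 tokens in the doc
assert hashes.shape == (5, 7)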

View File

@@ -27,3 +27,4 @@ cdef class StringStore:
cdef const Utf8Str* intern_unicode(self, str py_string)
cdef const Utf8Str* _intern_utf8(self, char* utf8_string, int length, hash_t* precalculated_hash)
cdef const unsigned char[:] utf8_view(self, attr_t hash_val)

View File

@@ -315,3 +315,25 @@ cdef class StringStore:
self._map.set(key, value)
self.keys.push_back(key)
return value
cdef const unsigned char[:] utf8_view(self, attr_t hash_val):
if hash_val == 0:
return b""
elif hash_val < len(SYMBOLS_BY_INT):
return SYMBOLS_BY_INT[hash_val].encode("utf-8")
cdef Utf8Str* string = <Utf8Str*>self._map.get(hash_val)
cdef int i, length
if string.s[0] < sizeof(string.s) and string.s[0] != 0:
return string.s[1:string.s[0]+1]
elif string.p[0] < 255:
return string.p[1:string.p[0]+1]
else:
i = 0
length = 0
while string.p[i] == 255:
i += 1
length += 255
length += string.p[i]
i += 1
return string.p[i:length + i]
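
The three branches mirror the Utf8Str length encoding: short strings keep their length in the first byte of the inline *s* buffer, medium strings in the first byte behind the *p* pointer, and lengths of 255 or more are spelled as a run of 0xFF bytes followed by a remainder byte. A pure-Python sketch of that last decoding step (illustrative only, not spaCy API):

from typing import Tuple

def decode_utf8str_length(buf: bytes) -> Tuple[int, int]:
    """Return (payload_offset, payload_length) for a length-prefixed buffer
    in which lengths of 255 or more are encoded as a run of 0xFF bytes
    followed by a remainder byte."""
    i = 0
    length = 0
    while buf[i] == 255:  # each 0xFF byte contributes 255 to the total length
        i += 1
        length += 255
    length += buf[i]      # the final prefix byte holds the remainder
    i += 1
    return i, length

offset, length = decode_utf8str_length(bytes([255, 255, 10]) + b"x" * 520)
assert (offset, length) == (3, 520)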

View File

@@ -1,55 +1,57 @@
import spacy
import pytest
def test_get_arrays_for_search_chars_width_2_not_case_sensitive():
(
search,
lookup,
) = spacy.util.get_arrays_for_search_chars("bféwfw", False)
assert (
lookup
== b"b\x00\x00\x00f\x00\x00\x00w\x00\x00\x00b\x00\x00\x00f\x00\x00\x00w\x00\x00\x00\xe9\x00\x00\x00\xe9\x00\x00\x00"
)
@pytest.mark.parametrize("case_sensitive", [True, False])
def test_get_search_char_byte_arrays_1_width_only(case_sensitive):
sc1, sc2, sc3, sc4 = spacy.util.get_search_char_byte_arrays("zzaaEP", case_sensitive)
if case_sensitive:
assert sc1 == b"EPaz"
else:
assert sc1 == b"aepz"
assert sc2 == b""
assert sc3 == b""
assert sc4 == b""
assert (
search
== b"B\x00\x00\x00F\x00\x00\x00W\x00\x00\x00b\x00\x00\x00f\x00\x00\x00w\x00\x00\x00\xc9\x00\x00\x00\xe9\x00\x00\x00"
)
@pytest.mark.parametrize("case_sensitive", [True, False])
def test_get_search_char_byte_arrays_4_width_only(case_sensitive):
sc1, sc2, sc3, sc4 = spacy.util.get_search_char_byte_arrays("𐌞", case_sensitive)
assert sc1 == b""
assert sc2 == b""
assert sc3 == b""
assert sc4 == "𐌞".encode("utf-8")
@pytest.mark.parametrize("case_sensitive", [True, False])
def test_get_search_char_byte_arrays_all_widths(case_sensitive):
sc1, sc2, sc3, sc4 = spacy.util.get_search_char_byte_arrays("𐌞Éabé—B𐌞", case_sensitive)
if case_sensitive:
assert sc1 == b"Bab"
assert sc2 == "Éé".encode("utf-8")
else:
assert sc1 == b"ab"
assert sc2 == "é".encode("utf-8")
assert sc3 == "".encode("utf-8")
assert sc4 == "𐌞".encode("utf-8")
def test_get_arrays_for_search_chars_width_2_case_sensitive():
(
search,
lookup,
) = spacy.util.get_arrays_for_search_chars("bféwfw", True)
assert (
lookup == search == b"b\x00\x00\x00f\x00\x00\x00w\x00\x00\x00\xe9\x00\x00\x00"
)
@pytest.mark.parametrize("case_sensitive", [True, False])
def test_turkish_i_with_dot(case_sensitive):
sc1, sc2, sc3, sc4 = spacy.util.get_search_char_byte_arrays("İ", case_sensitive)
if case_sensitive:
assert sc2 == "İ".encode("utf-8")
assert sc1 == sc3 == sc4 == b""
else:
assert sc1 == b"i"
assert sc2 == b"\xcc\x87"
assert sc3 == sc4 == b""
def test_get_arrays_for_search_chars_width_4_not_case_sensitive():
(
search,
lookup,
) = spacy.util.get_arrays_for_search_chars("bfé𐌞wf𐌞wÉ", False)
assert (
search
== b"\x1e\x03\x01\x00B\x00\x00\x00F\x00\x00\x00W\x00\x00\x00b\x00\x00\x00f\x00\x00\x00w\x00\x00\x00\xc9\x00\x00\x00\xe9\x00\x00\x00"
)
assert (
lookup
== b"\x1e\x03\x01\x00b\x00\x00\x00f\x00\x00\x00w\x00\x00\x00b\x00\x00\x00f\x00\x00\x00w\x00\x00\x00\xe9\x00\x00\x00\xe9\x00\x00\x00"
)
def test_get_arrays_for_search_chars_width_4_case_sensitive():
(
search,
lookup,
) = spacy.util.get_arrays_for_search_chars("bfé𐌞wf𐌞wÉ", True)
assert search == lookup
assert (
lookup
== b"\x1e\x03\x01\x00b\x00\x00\x00f\x00\x00\x00w\x00\x00\x00\xc9\x00\x00\x00\xe9\x00\x00\x00"
)
@pytest.mark.parametrize("case_sensitive", [True, False])
def test_turkish_i_with_dot_and_normal_i(case_sensitive):
sc1, sc2, sc3, sc4 = spacy.util.get_search_char_byte_arrays("İI", case_sensitive)
if case_sensitive:
assert sc1 == b"I"
assert sc2 == "İ".encode("utf-8")
assert sc3 == sc4 == b""
else:
assert sc1 == b"i"
assert sc2 == b"\xcc\x87"
assert sc3 == sc4 == b""

View File

@@ -38,34 +38,22 @@ cdef int token_by_end(const TokenC* tokens, int length, int end_char) except -2
cdef int [:,:] _get_lca_matrix(Doc, int start, int end)
cdef void _copy_chars(
Py_UCS4* target,
const Py_UCS4* source,
const int length,
const bint to_lower
)
cdef void _set_affixes(
const Py_UCS4* text_buf,
const int tok_idx,
const int tok_len,
Py_UCS4* aff_buf,
cdef void _set_affix_lengths(
const unsigned char[:] text_buf,
char* aff_len_buf,
const int pref_len,
const int suff_len,
const bint to_lower
)
) nogil
cdef void _search_for_chars(
const Py_UCS4* text_buf,
const int tok_idx,
const int tok_len,
Py_UCS4* search_buf,
Py_UCS4* lookup_buf,
const int search_buf_len,
Py_UCS4* result_buf,
const int result_buf_len,
cdef bint _search_for_chars(
const unsigned char[:] tok_str,
const unsigned char[:] s_1byte_ch,
const unsigned char[:] s_2byte_ch,
const unsigned char[:] s_3byte_ch,
const unsigned char[:] s_4byte_ch,
unsigned char* res_buf,
unsigned char* len_buf,
bint suffs_not_prefs
) nogil

View File

@@ -126,7 +126,7 @@ class Doc:
blocked: Optional[List[Span]] = ...,
missing: Optional[List[Span]] = ...,
outside: Optional[List[Span]] = ...,
default: str = ...
default: str = ...,
) -> None: ...
@property
def noun_chunks(self) -> Iterator[Span]: ...
@@ -178,16 +178,18 @@ class Doc:
self,
*,
cs: bool,
pref_lengths: Ints1d,
suff_lengths: Ints1d,
pref_search_chars: str,
pref_lookup_chars: str,
pref_search_char_length: int,
pref_search_lengths: Ints1d,
suff_search_chars: str,
suff_lookup_chars: str,
suff_search_char_length: int,
suff_search_lengths: Ints1d,
p_lengths: Ints1d,
s_lengths: Ints1d,
ps_1byte_ch: bytes,
ps_2byte_ch: bytes,
ps_3byte_ch: bytes,
ps_4byte_ch: bytes,
ps_lengths: Ints1d,
ss_1byte_ch: bytes,
ss_2byte_ch: bytes,
ss_3byte_ch: bytes,
ss_4byte_ch: bytes,
ss_lengths: Ints1d,
) -> Ints2d: ...
@staticmethod
def _get_array_attrs() -> Tuple[Any]: ...

View File

@@ -1736,18 +1736,20 @@ cdef class Doc:
return output
def get_character_combination_hashes(self,
def np.ndarray get_character_combination_hashes(self,
*,
const bint cs,
np.ndarray p_lengths,
np.ndarray s_lengths,
const char* ps_search,
const char* ps_lookup,
const int ps_l,
const unsigned char[:] ps_1byte_ch,
const unsigned char[:] ps_2byte_ch,
const unsigned char[:] ps_3byte_ch,
const unsigned char[:] ps_4byte_ch,
np.ndarray ps_lengths,
const char* ss_search,
const char* ss_lookup,
const int ss_l,
const unsigned char[:] ss_1byte_ch,
const unsigned char[:] ss_2byte_ch,
const unsigned char[:] ss_3byte_ch,
const unsigned char[:] ss_4byte_ch,
np.ndarray ss_lengths,
):
"""
@@ -1766,44 +1768,26 @@ cdef class Doc:
the prefixes hashed for "spaCy" would be "sp" and "spa".
s_lengths: an Ints1d specifying the lengths of suffixes to be hashed. For example, if *s_lengths==[2, 3]* and
*cs == True*, the suffixes hashed for "spaCy" would be "Cy" and "aCy".
ps_search: a byte array containing characters to search for within each token, starting at the beginning.
ps_lookup: a byte array containing characters that are added to the result string when a character at
the corresponding position in *ps_search* is matched. Having separate search and lookup arrays enables
case-insensitivity to be handled efficiently.
ps_search_l: the number of characters in *ps_search* and hence also in *ps_lookup*
ps_<n>byte_ch: a byte array containing, in order, the n-byte-wide characters to search for within each token,
starting at the beginning.
ps_lengths: an Ints1d specifying the lengths of search results (from the beginning) to be hashed. For example, if
*ps_lengths==[1, 2]*, *ps_1byte_ch==b"ac"* and *cs==False*, the searched strings hashed for
"spaCy" would be "a" and "ac".
ss_search: a byte array containing characters to search for within each token, starting at the end.
ss_lookup: a byte array containing characters that are added to the result string when a character at
the corresponding position in *ss_search* is matched. Having separate search and lookup arrays enables
case-insensitivity to be handled efficiently.
ss_l: the number of characters in *ss_search* and hence also in *ss_lookup*
ss_<n>byte_ch: a byte array containing, in order, the n-byte-wide characters to search for within each token,
starting at the end.
ss_lengths: an Ints1d specifying the lengths of search results (from the end) to be hashed. For example, if
*ss_lengths==[1, 2]*, *ss_1byte_ch==b"ac"* and *cs==False*, the searched strings hashed for
"spaCy" would be "c" and "ca".
For a document with tokens ["spaCy", "and", "Prodigy"], the NumPy array returned by
*get_character_combination_hashes(cs=True, p_lengths=[2], s_lengths=[2, 4, 6], ps_1byte_ch=b"", ps_2byte_ch=b"",
ps_3byte_ch=b"", ps_4byte_ch=b"", ps_lengths=[], ss_1byte_ch=b"Cy", ss_2byte_ch=b"", ss_3byte_ch=b"",
ss_4byte_ch=b"", ss_lengths=[1, 2])*
would correspond to
[[hash("sp"), hash("Cy"), hash("paCy"), hash(" spaCy"), hash("y"), hash("yC")],
[hash("an"), hash("nd"), hash(" and"), hash("   and"), hash(" "), hash("  ")],
[hash("Pr"), hash("gy"), hash("digy"), hash("rodigy"), hash("y"), hash("y ")]]
"""
# Encode the document text
cdef bytes encoded_text = self.text.encode("utf-32le")
cdef char* intermediate_text = encoded_text
cdef Py_UCS4* text_buf = <Py_UCS4*> intermediate_text
# Define the result array and work out what is used for what in axis 1
cdef int num_toks = len(self)
cdef int p_h_num = p_lengths.shape[0]
cdef int s_h_num = s_lengths.shape[0], s_h_end = p_h_num + s_h_num
cdef int ps_h_num = ps_lengths.shape[0], ps_h_end = s_h_end + ps_h_num
cdef int ss_h_num = ss_lengths.shape[0], ss_h_end = ps_h_end + ss_h_num
cdef np.ndarray[np.int64_t, ndim=2] hashes = numpy.empty((num_toks, ss_h_end), dtype="int64")
cdef np.ndarray[np.int64_t, ndim=2] hashes
hashes = numpy.empty((num_toks, ss_h_end), dtype="int64")
# Determine the maximum possible lengths of the affixes to work out how big the buffers need to be
cdef int p_max_l = max(p_lengths) if p_h_num > 0 else 0
@@ -1811,15 +1795,13 @@ cdef class Doc:
cdef int ps_max_l = max(ps_lengths) if ps_h_num > 0 else 0
cdef int ss_max_l = max(ss_lengths) if ss_h_num > 0 else 0
# Define / allocate buffer (pr/sr: result buffers)
cdef int aff_buf_l = p_max_l + s_max_l
cdef Py_UCS4* aff_buf = <Py_UCS4*> self.mem.alloc(aff_buf_l, sizeof(Py_UCS4))
cdef Py_UCS4* ps_s_buf = <Py_UCS4*> ps_search
cdef Py_UCS4* ps_l_buf = <Py_UCS4*> ps_lookup
cdef Py_UCS4* ps_r_buf = <Py_UCS4*> self.mem.alloc(ps_max_l, sizeof(Py_UCS4))
cdef Py_UCS4* ss_s_buf = <Py_UCS4*> ss_search
cdef Py_UCS4* ss_l_buf = <Py_UCS4*> ss_lookup
cdef Py_UCS4* ss_r_buf = <Py_UCS4*> self.mem.alloc(ss_max_l, sizeof(Py_UCS4))
# Define / allocate buffers
cdef int aff_l = p_max_l + s_max_l
cdef char* aff_len_buf = self.mem.alloc(aff_l, 1)
cdef char* ps_res_buf = self.mem.alloc(ps_max_l, 4)
cdef char* ps_len_buf = self.mem.alloc(ps_max_l, 1)
cdef char* ss_res_buf = self.mem.alloc(ss_max_l, 4)
cdef char* ss_len_buf = self.mem.alloc(ss_max_l, 1)
# Define memory views on length arrays
cdef int[:] p_lengths_v = p_lengths
@@ -1829,38 +1811,51 @@ cdef class Doc:
# Define working variables
cdef TokenC tok_c
cdef int tok_i, tok_idx, tok_len, aff_len
cdef int tok_i, offset
cdef uint64_t hash_val
cdef attr_t num_tok_attr
cdef const unsigned char[:] tok_str
for tok_i in range(num_toks):
tok_c = self.c[tok_i]
tok_idx = tok_c.idx
tok_len = tok_c.lex.length
num_tok_attr = tok_c.lex.orth if cs else tok_c.lex.lower
tok_str = self.vocab.strings.utf8_view(num_tok_attr)
if aff_buf_l > 0:
_set_affixes(text_buf, tok_idx, tok_len, aff_buf, p_max_l, s_max_l, not cs)
if aff_l > 0:
_set_affix_lengths(tok_str, aff_len_buf, p_max_l, s_max_l)
for hash_idx in range(p_h_num):
hashes[tok_i, hash_idx] = hash32(aff_buf, p_lengths_v[hash_idx] * sizeof(Py_UCS4), 0)
offset = aff_len_buf[p_lengths_v[hash_idx]]
if offset > 0:
hash_val = hash32(<void*> &tok_str[0], offset, 0)
hashes[tok_i, hash_idx] = hash_val
for hash_idx in range(p_h_num, s_h_end):
aff_len = s_lengths_v[hash_idx - p_h_num]
hashes[tok_i, hash_idx] = hash32(aff_buf + aff_buf_l - aff_len, aff_len * sizeof(Py_UCS4), 0)
offset = s_lengths_v[hash_idx - p_h_num]
if offset > 0:
hash_val = hash32(<void*> &tok_str[len(tok_str) - offset], offset, 0)
hashes[tok_i, hash_idx] = hash_val
if ps_h_num > 0:
_search_for_chars(text_buf, tok_idx, tok_len, ps_s_buf, ps_l_buf, ps_l, ps_r_buf, ps_max_l, False)
if (
ps_h_num > 0 and
_search_for_chars(tok_str, ps_1byte_ch, ps_2byte_ch, ps_3byte_ch, ps_4byte_ch, ps_res_buf, ps_len_buf, False)
):
for hash_idx in range(s_h_end, ps_h_end):
aff_len = ps_lengths_v[hash_idx - s_h_end]
hashes[tok_i, hash_idx] = hash32(ps_r_buf, aff_len * sizeof(Py_UCS4), 0)
if ss_h_num > 0:
_search_for_chars(text_buf, tok_idx, tok_len, ss_s_buf, ss_l_buf, ss_l, ss_r_buf, ss_max_l, True)
if (
ss_h_num > 0 and
_search_for_chars(tok_str, ss_1byte_ch, ss_2byte_ch, ss_3byte_ch, ss_4byte_ch, ss_res_buf, ss_len_buf, True)
):
for hash_idx in range(ps_h_end, ss_h_end):
aff_len = ss_lengths_v[hash_idx - ps_h_end]
hashes[tok_i, hash_idx] = hash32(ss_r_buf, aff_len * sizeof(Py_UCS4), 0)
self.mem.free(aff_buf)
self.mem.free(ps_r_buf)
self.mem.free(ss_r_buf)
self.mem.free(aff_len_buf)
self.mem.free(ps_res_buf)
self.mem.free(ps_len_buf)
self.mem.free(ss_res_buf)
self.mem.free(ss_len_buf)
return hashes
@staticmethod
@@ -2044,34 +2039,13 @@ cdef int [:,:] _get_lca_matrix(Doc doc, int start, int end):
return lca_matrix
cdef void _copy_chars(
Py_UCS4* target,
const Py_UCS4* source,
const int length,
const bint to_lower
):
"""Copies *length* Py_UCS4 characters from *source* to *target*. If *to_lower==True*, converts
any upper-case characters to lower case within the target buffer.
"""
cdef int idx
memcpy(target, source, length * sizeof(Py_UCS4))
if to_lower:
for idx in range(length):
if Py_UNICODE_ISUPPER(target[idx]):
target[idx] = Py_UNICODE_TOLOWER(target[idx])
cdef void _set_affixes(
const Py_UCS4* text_buf,
const int tok_idx,
const int tok_len,
Py_UCS4* aff_buf,
cdef void _set_affix_lengths(
const unsigned char[:] text_buf,
char* aff_len_buf,
const int pref_len,
const int suff_len,
const bint to_lower
):
""" Populate a buffer of length pref+suff with the first pref and the last suff characters of a word within a string.
) nogil:
""" TODO : Populate a buffer of length pref+suff with the first pref and the last suff characters of a word within a string.
If the word is shorter than pref and/or suff, the empty character positions in the middle are filled with zeros.
text_buf: a pointer to a UTF-32LE representation of the containing string.
@@ -2082,41 +2056,41 @@ cdef void _set_affixes(
suff_len: the length of the suffix.
"""
cdef int aff_buf_idx = 0, aff_buf_len = pref_len + suff_len, in_word_idx, filled_pref_len
if pref_len > 0:
filled_pref_len = pref_len if pref_len < tok_len else tok_len
_copy_chars(aff_buf, text_buf + tok_idx, filled_pref_len, to_lower)
aff_buf_idx = filled_pref_len
cdef int text_buf_idx = 0, aff_len_buf_idx = 0, text_buf_len = len(text_buf)
if tok_len < pref_len:
memset(aff_buf + aff_buf_idx, 0, sizeof(Py_UCS4) * (pref_len - tok_len))
aff_buf_idx = aff_buf_len - suff_len
while aff_len_buf_idx < pref_len:
if (text_buf[text_buf_idx] >> 6) ^ 2 != 0: # not a continuation character
aff_len_buf[aff_len_buf_idx] = text_buf_idx + 1
aff_len_buf_idx += 1
text_buf_idx += 1
if text_buf_idx == len(text_buf):
break
if tok_len < suff_len:
memset(aff_buf + aff_buf_idx, 0, sizeof(Py_UCS4) * (suff_len - tok_len))
aff_buf_idx = aff_buf_len - tok_len
if aff_len_buf_idx < pref_len:
memset(aff_len_buf + aff_len_buf_idx, 0, pref_len - aff_len_buf_idx)
aff_len_buf_idx = pref_len
if suff_len > 0:
# in_word_idx: the index within the token where the suffix starts
in_word_idx = aff_buf_idx + tok_len - aff_buf_len
if in_word_idx < pref_len:
memcpy(aff_buf + aff_buf_idx, aff_buf + in_word_idx, sizeof(Py_UCS4) * (filled_pref_len - in_word_idx))
aff_buf_idx += filled_pref_len - in_word_idx
in_word_idx = aff_buf_idx + tok_len - aff_buf_len
if aff_buf_idx < aff_buf_len:
_copy_chars(aff_buf + aff_buf_idx, text_buf + tok_idx + in_word_idx, aff_buf_len - aff_buf_idx, to_lower)
text_buf_idx = 1
while aff_len_buf_idx < pref_len + suff_len:
if (text_buf[text_buf_len - text_buf_idx] >> 6) ^ 2 != 0: # not a continuation character
aff_len_buf[aff_len_buf_idx] = text_buf_len - text_buf_idx
aff_len_buf_idx += 1
text_buf_idx += 1
if text_buf_idx > text_buf_len:
break
if aff_len_buf_idx < pref_len + suff_len:
memset(aff_len_buf + aff_len_buf_idx, 0, pref_len + suff_len - aff_len_buf_idx)
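
The scans above rely on one UTF-8 invariant: every byte of a multi-byte character after the first matches the bit pattern 10xxxxxx, i.e. `byte & 0xC0 == 0x80`. A pure-Python sketch of the prefix pass (illustrative only; the zero-padding convention follows the docstring above):

def utf8_prefix_byte_lengths(token, pref_len):
    """Byte length of each 1..pref_len character prefix of *token*,
    zero-padded when the token has fewer than pref_len characters."""
    data = token.encode("utf-8")
    lengths = []
    end = 0
    while len(lengths) < pref_len and end < len(data):
        end += 1
        # A character is complete when the buffer ends or the next
        # byte is not a continuation byte (10xxxxxx).
        if end == len(data) or data[end] & 0xC0 != 0x80:
            lengths.append(end)
    return lengths + [0] * (pref_len - len(lengths))

assert utf8_prefix_byte_lengths("ämy", 4) == [2, 3, 4, 0]  # "ä" takes 2 bytes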
cdef void _search_for_chars(
const Py_UCS4* text_buf,
const int tok_idx,
const int tok_len,
Py_UCS4* search_buf,
Py_UCS4* lookup_buf,
const int search_buf_len,
Py_UCS4* result_buf,
const int result_buf_len,
cdef bint _search_for_chars(
const unsigned char[:] tok_str,
const unsigned char[:] s_1byte_ch,
const unsigned char[:] s_2byte_ch,
const unsigned char[:] s_3byte_ch,
const unsigned char[:] s_4byte_ch,
unsigned char* res_buf,
unsigned char* len_buf,
bint suffs_not_prefs
) nogil:
""" Search a word within a string for characters within *search_buf*, starting at the beginning or
@@ -2133,6 +2107,8 @@ cdef void _search_for_chars(
res_buf: the buffer in which to place the byte sequences of the matched characters.
len_buf: the buffer in which to record the byte lengths of the matched character sequences.
suffs_not_prefs: if *True*, searching starts from the end of the word; if *False*, from the beginning.
Returns *True* if at least one character from *search_buf* was found in the word, otherwise *False*.
"""
cdef int result_buf_idx = 0, text_string_idx = tok_idx + (tok_len - 1) if suffs_not_prefs else tok_idx
cdef int search_buf_idx
@@ -2158,6 +2134,8 @@ cdef void _search_for_chars(
# fill in any unused characters in the result buffer with zeros
if result_buf_idx < result_buf_len:
memset(result_buf + result_buf_idx, 0, (result_buf_len - result_buf_idx) * sizeof(Py_UCS4))
return result_buf_idx > 0
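
In pure Python the search boils down to walking the token's characters from one end, keeping those that belong to the configured search set, and reporting whether anything matched; a sketch (not the Cython implementation, which works on the split byte arrays):

def search_for_chars(token, search_chars, max_results, suffs_not_prefs):
    """Hypothetical sketch: collect up to max_results search characters,
    scanning from the end when suffs_not_prefs is True."""
    order = reversed(token) if suffs_not_prefs else iter(token)
    found = [ch for ch in order if ch in search_chars][:max_results]
    return "".join(found), bool(found)  # results, "anything matched?"

assert search_for_chars("spaCy", {"y", "C"}, 2, True) == ("yC", True)
assert search_for_chars("and", {"y", "C"}, 2, True) == ("", False)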
def pickle_doc(doc):

View File

@@ -1737,69 +1737,42 @@ def all_equal(iterable):
return next(g, True) and not next(g, False)
def get_arrays_for_search_chars(
def get_search_char_byte_arrays(
search_chars: str, case_sensitive: bool
) -> Tuple[bytes, bytes]:
) -> Tuple[bytes, bytes, bytes, bytes]:
"""
This function supports the rich feature extractor. It returns search byte arrays with
4-byte character width that are used for comparison when searching document texts
for search characters. The encoding is little-endian regardless of architecture, as
this is what is expected by the murmurhash library used downstream.
Alongside the "search array" against which words from document texts are compared
is the "lookup array". When a character from the search array is matched,
the character at the corresponding position in the lookup array is added to the
sequence that then goes on to be hashed. This enables case-sensitivity
to be handled without converting the case of the words being searched: if
*case_sensitive==False*, the lower- or uppercase counterparts of any characters that
have case are added to the search array, and both the original character and its
other-cased counterpart map to the lower-case version in the lookup array.
This function supports the rich feature extractor. It splits the UTF-8 representation
of *search_chars* into separate byte arrays containing 1-, 2-, 3-, and 4-byte
characters respectively. Any duplicates in *search_chars* are removed, and *search_chars*
is converted to lower case if *case_sensitive==False*.
"""
def encode(ch: str) -> bytes:
"""
ch: a single character
"""
return ch.encode("UTF-32LE")
def add_to_arrays(
search: List[bytes], lookup: List[bytes], ch: str
) -> None:
"""Add the byte representations of *ch* to the two byte array lists.
"""
this_char_bytes = encode(ch)
if not case_sensitive and ch.islower():
if this_char_bytes not in search:
search.append(this_char_bytes)
lookup.append(this_char_bytes)
upper_char_bytes = encode(ch.upper())
if upper_char_bytes not in search:
search.append(upper_char_bytes)
lookup.append(this_char_bytes)
elif not case_sensitive and ch.isupper():
lower_char_bytes = encode(ch.lower())
if this_char_bytes not in search:
search.append(this_char_bytes)
lookup.append(lower_char_bytes)
if lower_char_bytes not in search:
search.append(lower_char_bytes)
lookup.append(lower_char_bytes)
elif this_char_bytes not in search:
search.append(this_char_bytes)
lookup.append(this_char_bytes)
def get_ordered_raw_bytes(
search: List[bytes], lookup: List[bytes]
) -> Tuple[bytes, bytes]:
"""Flatten the two lists, ordering both by the entries in *search*.
"""
num_search = [list(entry) for entry in search]
search = [entry for _, entry in sorted(zip(num_search, search))]
lookup = [entry for _, entry in sorted(zip(num_search, lookup))]
return b"".join(search), b"".join(lookup)
search: List[bytes] = []
lookup: List[bytes] = []
for ch in search_chars:
add_to_arrays(search, lookup, ch)
return get_ordered_raw_bytes(search, lookup)
sc1 = bytearray()
sc2 = bytearray()
sc3 = bytearray()
sc4 = bytearray()
if not case_sensitive:
search_chars = search_chars.lower()
ordered_search_chars = "".join(sorted(set(search_chars)))
encoded_search_char_bytes = ordered_search_chars.encode("UTF-8")
working_start = 0
for idx in range(1, len(encoded_search_char_bytes) + 1):
if (
idx == len(encoded_search_char_bytes)
or encoded_search_char_bytes[idx] & 0xC0 != 0x80 # not continuation byte
):
char_length = idx - working_start
if char_length == 1:
sc1.extend(encoded_search_char_bytes[working_start:idx])
elif char_length == 2:
sc2.extend(encoded_search_char_bytes[working_start:idx])
elif char_length == 3:
sc3.extend(encoded_search_char_bytes[working_start:idx])
elif char_length == 4:
sc4.extend(encoded_search_char_bytes[working_start:idx])
else:
raise RuntimeError(Errors.E1049)
working_start = idx
return bytes(sc1), bytes(sc2), bytes(sc3), bytes(sc4)
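
A usage sketch, with the expected values taken from the new tests above:

import spacy

# 1-byte characters only: deduplicated, lower-cased, sorted
sc1, sc2, sc3, sc4 = spacy.util.get_search_char_byte_arrays("zzaaEP", False)
assert sc1 == b"aepz"
assert sc2 == sc3 == sc4 == b""

# a 4-byte character lands in the fourth array
sc1, sc2, sc3, sc4 = spacy.util.get_search_char_byte_arrays("𐌞", True)
assert sc4 == "𐌞".encode("utf-8")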