diff --git a/spacy/errors.py b/spacy/errors.py
index 9a30f5e69..16188ca8a 100644
--- a/spacy/errors.py
+++ b/spacy/errors.py
@@ -214,6 +214,8 @@ class Warnings(metaclass=ErrorsWithCodes):
             "is a Cython extension type.")
     W123 = ("Argument {arg} with value {arg_value} is used instead of {config_value} as specified in the config. Be "
             "aware that this might affect other components in your pipeline.")
+    W124 = ("Using the features PREFIX and/or SUFFIX in a RichFeatureExtractor configuration may lead to the same "
+            "information being fed forward twice if prefixes and suffixes of corresponding lengths are specified.")
 
 
 class Errors(metaclass=ErrorsWithCodes):
@@ -953,6 +955,8 @@ class Errors(metaclass=ErrorsWithCodes):
     E1046 = ("{cls_name} is an abstract class and cannot be instantiated. If you are looking for spaCy's default "
              "knowledge base, use `InMemoryLookupKB`.")
     E1047 = ("Invalid rich group config '{label}'.")
+    E1048 = ("Lengths in rich group config '{label}' must be 63 or less.")
+    E1049 = ("Error splitting UTF-8 byte string into separate characters.")
 
 
     # Deprecated model shortcuts, only used in errors and warnings
diff --git a/spacy/ml/models/tok2vec.py b/spacy/ml/models/tok2vec.py
index e552d04f2..3906f6b42 100644
--- a/spacy/ml/models/tok2vec.py
+++ b/spacy/ml/models/tok2vec.py
@@ -1,5 +1,6 @@
 from encodings import search_function
 from typing import Optional, List, Union, cast
+import warnings
 from spacy.ml.richfeatureextractor import RichFeatureExtractor
 from thinc.types import Floats2d, Ints2d, Ragged
 from thinc.api import chain, clone, concatenate, with_array, with_padded
@@ -8,7 +9,7 @@ from thinc.api import expand_window, residual, Maxout, Mish, PyTorchLSTM
 
 from ...tokens import Doc
 from ...util import registry
-from ...errors import Errors
+from ...errors import Errors, Warnings
 from ...ml import _character_embed
 from ..staticvectors import StaticVectors
 from ..featureextractor import FeatureExtractor
@@ -207,6 +208,8 @@ def _verify_rich_config_group(
         raise ValueError(Errors.E1047.format(label=label))
     elif search_chars is not None:
         raise ValueError(Errors.E1047.format(label=label))
+    if lengths is not None and max(lengths) > 63:
+        raise ValueError(Errors.E1048.format(label=label))
 
 
 @registry.architectures("spacy.RichMultiHashEmbed.v1")
@@ -246,13 +249,13 @@ def RichMultiHashEmbed(
     depending on the presence of some other letter before or after it, e.g. German
     plural nouns where the final two vowels are `ä-e` regularly correspond to
     singular lemmas where the `e` is no longer present and the `ä` has become `a`.
-    For most languages used with spaCy, searching is likely to be useful starting
-    at the end (`suff_*`), but the ability to search from the beginning (`pref_*`)
-    is also offered for completeness. Search characters should consist of all
-    characters that regularly alternate with other characters in the language in
-    question or whose presence before or after characters that would otherwise
-    alternate prevents the alternation from occurring, e.g. an `ä` in a German
-    plural noun does not become `a` if it is the third or fourth vowel from the
+    For most languages used with spaCy, searching is likely to be useful starting
+    at the end (`suff_*`), but the ability to search from the beginning (`pref_*`)
+    is also offered for completeness. Search characters should consist of all
+    characters that regularly alternate with other characters in the language in
+    question or whose presence before or after characters that would otherwise
+    alternate prevents the alternation from occurring, e.g. an `ä` in a German
+    plural noun does not become `a` if it is the third or fourth vowel from the
     end of the word.
 
     width (int): The output width. Also used as the width of the embedding tables.
@@ -263,27 +266,27 @@ def RichMultiHashEmbed(
         same length as attrs.
     include_static_vectors (bool): Whether to also use static word vectors.
         Requires a vectors table to be loaded in the Doc objects' vocab.
-    case_sensitive (bool): Whether lower-case and upper-case letters should be
+    case_sensitive (bool): Whether lower-case and upper-case letters should be
         distinguished when generating the character combinations to use as features.
-    pref_lengths (Optional[List[int]]): The lengths of prefixes to use as features
-        for each word, e.g. for the word `spaCy`:
+    pref_lengths (Optional[List[int]]): The lengths of prefixes to use as features
+        for each word, e.g. for the word `spaCy`:
         `[1, 3]` would lead to `s` and `spa` being used as features.
     pref_rows (Optional[List[int]]): The number of rows for each of `pref_lengths`.
-    suff_lengths (Optional[List[int]]): The lengths of suffixes to use as features
-        for each word, e.g. for the word `spaCy`:
+    suff_lengths (Optional[List[int]]): The lengths of suffixes to use as features
+        for each word, e.g. for the word `spaCy`:
         `[1, 3]` would lead to `y` and `aCy` being used as features.
     suff_rows (Optional[List[int]]): The number of rows for each of `suff_lengths`.
-    pref_search_chars (Optional[str]): A string containing characters to search for
+    pref_search_chars (Optional[str]): A string containing characters to search for
        starting from the beginning of each word.
-    pref_search_lengths (Optional[List[int]]): The lengths of search result strings
+    pref_search_lengths (Optional[List[int]]): The lengths of search result strings
        to use as features, where the searches start from the beginning of each word.
-    pref_search_rows (Optional[List[int]]): The number of rows for each of
+    pref_search_rows (Optional[List[int]]): The number of rows for each of
        `pref_search_lengths`.
-    suff_search_chars (Optional[str]): A string containing characters to search for
+    suff_search_chars (Optional[str]): A string containing characters to search for
        starting from the end of each word.
-    suff_search_lengths (Optional[List[int]]): The lengths of search result strings
+    suff_search_lengths (Optional[List[int]]): The lengths of search result strings
        to use as features, where the searches start from the end of each word.
-    suff_search_rows (Optional[List[int]]): The number of rows for each of
+    suff_search_rows (Optional[List[int]]): The number of rows for each of
        `suff_search_lengths`.
     """
@@ -313,6 +316,9 @@ def RichMultiHashEmbed(
         case_sensitive,
     )
 
+    if "PREFIX" in attrs or "SUFFIX" in attrs:
+        warnings.warn(Warnings.W124)
+
     if pref_rows is not None:
        rows.extend(pref_rows)
     if suff_rows is not None:
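
As a concrete illustration of the affix scheme documented above, here is a minimal pure-Python sketch; `affix_features` is an illustrative helper, not part of this PR, and the real extraction is done by hashing in Cython:

    # Pure-Python sketch of the prefix/suffix feature scheme described in the
    # RichMultiHashEmbed docstring above (illustrative helper, not PR code).
    def affix_features(word, pref_lengths, suff_lengths, case_sensitive=True):
        if not case_sensitive:
            word = word.lower()
        prefixes = [word[:n] for n in pref_lengths]
        suffixes = [word[-n:] for n in suff_lengths]
        return prefixes, suffixes

    # For "spaCy", pref_lengths=[1, 3] yields ["s", "spa"] and
    # suff_lengths=[1, 3] yields ["y", "aCy"], matching the docstring examples.
    print(affix_features("spaCy", [1, 3], [1, 3]))
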
""" @@ -313,6 +316,9 @@ def RichMultiHashEmbed( case_sensitive, ) + if "PREFIX" in attrs or "SUFFIX" in attrs: + warnings.warn(Warnings.W124) + if pref_rows is not None: rows.extend(pref_rows) if suff_rows is not None: diff --git a/spacy/ml/richfeatureextractor.py b/spacy/ml/richfeatureextractor.py index 7293c69a9..4186f3750 100644 --- a/spacy/ml/richfeatureextractor.py +++ b/spacy/ml/richfeatureextractor.py @@ -1,5 +1,7 @@ from typing import List, Optional, Callable, Tuple -from ..util import get_arrays_for_search_chars +from spacy.util import get_search_char_byte_arrays + +# from ..util import get_arrays_for_search_chars from thinc.types import Ints1d, Ints2d from thinc.api import Model, registry, get_current_ops @@ -19,17 +21,23 @@ def RichFeatureExtractor( ) -> Model[List[Doc], List[Ints2d]]: ops = get_current_ops() if pref_search_chars is not None: - pref_search, pref_lookup = get_arrays_for_search_chars( - pref_search_chars, case_sensitive - ) + ( + ps_1byte_ch, + ps_2byte_ch, + ps_3byte_ch, + ps_4byte_ch, + ) = get_search_char_byte_arrays(pref_search_chars, case_sensitive) else: - pref_search, pref_lookup = bytes(), bytes() + ps_1byte_ch = ps_2byte_ch = ps_3byte_ch = ps_4byte_ch = bytes() if suff_search_chars is not None: - suff_search, suff_lookup = get_arrays_for_search_chars( - suff_search_chars, case_sensitive - ) + ( + ss_1byte_ch, + ss_2byte_ch, + ss_3byte_ch, + ss_4byte_ch, + ) = get_search_char_byte_arrays(suff_search_chars, case_sensitive) else: - suff_search, suff_lookup = bytes(), bytes() + ss_1byte_ch = ss_2byte_ch = ss_3byte_ch = ss_4byte_ch = bytes() return Model( "extract_character_combination_hashes", forward, @@ -41,19 +49,17 @@ def RichFeatureExtractor( "suff_lengths": ops.asarray1i(suff_lengths) if suff_lengths is not None else ops.asarray1i([]), - "pref_search": pref_search, - "pref_lookup": pref_lookup, - "pref_search_char_len": len(pref_search) / 4 - if pref_search_chars is not None - else 0, + "pref_search_1_byte": ps_1byte_ch, + "pref_search_2_bytes": ps_2byte_ch, + "pref_search_3_bytes": ps_3byte_ch, + "pref_search_4_bytes": ps_4byte_ch, "pref_search_lengths": ops.asarray1i(pref_search_lengths) if pref_search_lengths is not None else ops.asarray1i([]), - "suff_search": suff_search, - "suff_lookup": suff_lookup, - "suff_search_char_len": len(suff_search) / 4 - if suff_search_chars is not None - else 0, + "suff_search_1_byte": ss_1byte_ch, + "suff_search_2_bytes": ss_2byte_ch, + "suff_search_3_bytes": ss_3byte_ch, + "suff_search_4_bytes": ss_4byte_ch, "suff_search_lengths": ops.asarray1i(suff_search_lengths) if suff_search_lengths is not None else ops.asarray1i([]), @@ -68,13 +74,15 @@ def forward( case_sensitive: bool = model.attrs["case_sensitive"] pref_lengths: Ints1d = model.attrs["pref_lengths"] suff_lengths: Ints1d = model.attrs["suff_lengths"] - pref_search: bytes = model.attrs["pref_search"] - pref_lookup: bytes = model.attrs["pref_lookup"] - pref_search_char_len: int = model.attrs["pref_search_char_len"] + ps_1byte_ch: bytes = model.attrs["pref_search_1_byte"] + ps_2byte_ch: bytes = model.attrs["pref_search_2_bytes"] + ps_3byte_ch: bytes = model.attrs["pref_search_3_bytes"] + ps_4byte_ch: bytes = model.attrs["pref_search_4_bytes"] pref_search_lengths: Ints1d = model.attrs["pref_search_lengths"] - suff_search: bytes = model.attrs["suff_search"] - suff_lookup: bytes = model.attrs["suff_lookup"] - suff_search_char_len: int = model.attrs["suff_search_char_len"] + ss_1byte_ch: bytes = model.attrs["pref_search_1_byte"] + ss_2byte_ch: bytes = 
model.attrs["pref_search_2_bytes"] + ss_3byte_ch: bytes = model.attrs["pref_search_3_bytes"] + ss_4byte_ch: bytes = model.attrs["pref_search_4_bytes"] suff_search_lengths: Ints1d = model.attrs["suff_search_lengths"] features: List[Ints2d] = [] for doc in docs: @@ -82,13 +90,15 @@ def forward( cs=case_sensitive, p_lengths=pref_lengths, s_lengths=suff_lengths, - ps_search=pref_search, - ps_lookup=pref_lookup, - ps_l=pref_search_char_len, + ps_1byte_ch=ps_1byte_ch, + ps_2byte_ch=ps_2byte_ch, + ps_3byte_ch=ps_3byte_ch, + ps_4byte_ch=ps_4byte_ch, ps_lengths=pref_search_lengths, - ss_search=suff_search, - ss_lookup=suff_lookup, - ss_l=suff_search_char_len, + ss_1byte_ch=ss_1byte_ch, + ss_2byte_ch=ss_2byte_ch, + ss_3byte_ch=ss_3byte_ch, + ss_4byte_ch=ss_4byte_ch, ss_lengths=suff_search_lengths, ) features.append(ops.asarray2i(hashes)) diff --git a/spacy/strings.pxd b/spacy/strings.pxd index 5f03a9a28..7b33a498e 100644 --- a/spacy/strings.pxd +++ b/spacy/strings.pxd @@ -27,3 +27,4 @@ cdef class StringStore: cdef const Utf8Str* intern_unicode(self, str py_string) cdef const Utf8Str* _intern_utf8(self, char* utf8_string, int length, hash_t* precalculated_hash) + cdef const unsigned char[:] utf8_view(self, attr_t hash_val) diff --git a/spacy/strings.pyx b/spacy/strings.pyx index c5f218342..b64cbbed2 100644 --- a/spacy/strings.pyx +++ b/spacy/strings.pyx @@ -315,3 +315,25 @@ cdef class StringStore: self._map.set(key, value) self.keys.push_back(key) return value + + cdef const unsigned char[:] utf8_view(self, attr_t hash_val): + if hash_val == 0: + return "" + elif hash_val < len(SYMBOLS_BY_INT): + return SYMBOLS_BY_INT[hash_val] + cdef Utf8Str* string = self._map.get(hash_val) + cdef int i, length + if string.s[0] < sizeof(string.s) and string.s[0] != 0: + return string.s[1:string.s[0]+1] + elif string.p[0] < 255: + return string.p[1:string.p[0]+1] + else: + i = 0 + length = 0 + while string.p[i] == 255: + i += 1 + length += 255 + length += string.p[i] + i += 1 + return string.p[i:length + i] + diff --git a/spacy/tests/test_util.py b/spacy/tests/test_util.py index b4b5744ad..2da57657e 100644 --- a/spacy/tests/test_util.py +++ b/spacy/tests/test_util.py @@ -1,55 +1,57 @@ import spacy +import pytest -def test_get_arrays_for_search_chars_width_2_not_case_sensitive(): - ( - search, - lookup, - ) = spacy.util.get_arrays_for_search_chars("bféwfw", False) - assert ( - lookup - == b"b\x00\x00\x00f\x00\x00\x00w\x00\x00\x00b\x00\x00\x00f\x00\x00\x00w\x00\x00\x00\xe9\x00\x00\x00\xe9\x00\x00\x00" - ) +@pytest.mark.parametrize("case_sensitive", [True, False]) +def test_get_search_char_byte_arrays_1_width_only(case_sensitive): + sc1, sc2, sc3, sc4 = spacy.util.get_search_char_byte_arrays("zzaaEP", case_sensitive) + if case_sensitive: + assert sc1 == b"EPaz" + else: + assert sc1 == b"aepz" + assert sc2 == b"" + assert sc3 == b"" + assert sc4 == b"" - assert ( - search - == b"B\x00\x00\x00F\x00\x00\x00W\x00\x00\x00b\x00\x00\x00f\x00\x00\x00w\x00\x00\x00\xc9\x00\x00\x00\xe9\x00\x00\x00" - ) +@pytest.mark.parametrize("case_sensitive", [True, False]) +def test_get_search_char_byte_arrays_4_width_only(case_sensitive): + sc1, sc2, sc3, sc4 = spacy.util.get_search_char_byte_arrays("𐌞", case_sensitive) + assert sc1 == b"" + assert sc2 == b"" + assert sc3 == b"" + assert sc4 == "𐌞".encode("utf-8") +@pytest.mark.parametrize("case_sensitive", [True, False]) +def test_get_search_char_byte_arrays_all_widths(case_sensitive): + sc1, sc2, sc3, sc4 = spacy.util.get_search_char_byte_arrays("𐌞Éabé—B𐌞", case_sensitive) + if 
diff --git a/spacy/tests/test_util.py b/spacy/tests/test_util.py
index b4b5744ad..2da57657e 100644
--- a/spacy/tests/test_util.py
+++ b/spacy/tests/test_util.py
@@ -1,55 +1,57 @@
 import spacy
+import pytest
 
 
-def test_get_arrays_for_search_chars_width_2_not_case_sensitive():
-    (
-        search,
-        lookup,
-    ) = spacy.util.get_arrays_for_search_chars("bféwfw", False)
-    assert (
-        lookup
-        == b"b\x00\x00\x00f\x00\x00\x00w\x00\x00\x00b\x00\x00\x00f\x00\x00\x00w\x00\x00\x00\xe9\x00\x00\x00\xe9\x00\x00\x00"
-    )
+@pytest.mark.parametrize("case_sensitive", [True, False])
+def test_get_search_char_byte_arrays_1_width_only(case_sensitive):
+    sc1, sc2, sc3, sc4 = spacy.util.get_search_char_byte_arrays("zzaaEP", case_sensitive)
+    if case_sensitive:
+        assert sc1 == b"EPaz"
+    else:
+        assert sc1 == b"aepz"
+    assert sc2 == b""
+    assert sc3 == b""
+    assert sc4 == b""
 
-    assert (
-        search
-        == b"B\x00\x00\x00F\x00\x00\x00W\x00\x00\x00b\x00\x00\x00f\x00\x00\x00w\x00\x00\x00\xc9\x00\x00\x00\xe9\x00\x00\x00"
-    )
 
+@pytest.mark.parametrize("case_sensitive", [True, False])
+def test_get_search_char_byte_arrays_4_width_only(case_sensitive):
+    sc1, sc2, sc3, sc4 = spacy.util.get_search_char_byte_arrays("𐌞", case_sensitive)
+    assert sc1 == b""
+    assert sc2 == b""
+    assert sc3 == b""
+    assert sc4 == "𐌞".encode("utf-8")
 
+
+@pytest.mark.parametrize("case_sensitive", [True, False])
+def test_get_search_char_byte_arrays_all_widths(case_sensitive):
+    sc1, sc2, sc3, sc4 = spacy.util.get_search_char_byte_arrays("𐌞Éabé—B𐌞", case_sensitive)
+    if case_sensitive:
+        assert sc1 == b"Bab"
+        assert sc2 == "Éé".encode("utf-8")
+    else:
+        assert sc1 == b"ab"
+        assert sc2 == "é".encode("utf-8")
+    assert sc3 == "—".encode("utf-8")
+    assert sc4 == "𐌞".encode("utf-8")
 
-def test_get_arrays_for_search_chars_width_2_case_sensitive():
-    (
-        search,
-        lookup,
-    ) = spacy.util.get_arrays_for_search_chars("bféwfw", True)
-    assert (
-        lookup == search == b"b\x00\x00\x00f\x00\x00\x00w\x00\x00\x00\xe9\x00\x00\x00"
-    )
 
+@pytest.mark.parametrize("case_sensitive", [True, False])
+def test_turkish_i_with_dot(case_sensitive):
+    sc1, sc2, sc3, sc4 = spacy.util.get_search_char_byte_arrays("İ", case_sensitive)
+    if case_sensitive:
+        assert sc2 == "İ".encode("utf-8")
+        assert sc1 == sc3 == sc4 == b""
+    else:
+        assert sc1 == b"i"
+        assert sc2 == b"\xcc\x87"
+        assert sc3 == sc4 == b""
 
-
-def test_get_arrays_for_search_chars_width_4_not_case_sensitive():
-    (
-        search,
-        lookup,
-    ) = spacy.util.get_arrays_for_search_chars("bfé𐌞wf𐌞wÉ", False)
-    assert (
-        search
-        == b"\x1e\x03\x01\x00B\x00\x00\x00F\x00\x00\x00W\x00\x00\x00b\x00\x00\x00f\x00\x00\x00w\x00\x00\x00\xc9\x00\x00\x00\xe9\x00\x00\x00"
-    )
-
-    assert (
-        lookup
-        == b"\x1e\x03\x01\x00b\x00\x00\x00f\x00\x00\x00w\x00\x00\x00b\x00\x00\x00f\x00\x00\x00w\x00\x00\x00\xe9\x00\x00\x00\xe9\x00\x00\x00"
-    )
-
-
-def test_get_arrays_for_search_chars_width_4_case_sensitive():
-    (
-        search,
-        lookup,
-    ) = spacy.util.get_arrays_for_search_chars("bfé𐌞wf𐌞wÉ", True)
-    assert search == lookup
-    assert (
-        lookup
-        == b"\x1e\x03\x01\x00b\x00\x00\x00f\x00\x00\x00w\x00\x00\x00\xc9\x00\x00\x00\xe9\x00\x00\x00"
-    )
+
+@pytest.mark.parametrize("case_sensitive", [True, False])
+def test_turkish_i_with_dot_and_normal_i(case_sensitive):
+    sc1, sc2, sc3, sc4 = spacy.util.get_search_char_byte_arrays("İI", case_sensitive)
+    if case_sensitive:
+        assert sc1 == b"I"
+        assert sc2 == "İ".encode("utf-8")
+        assert sc3 == sc4 == b""
+    else:
+        assert sc1 == b"i"
+        assert sc2 == b"\xcc\x87"
+        assert sc3 == sc4 == b""
\ No newline at end of file
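
The Turkish tests rest on a Unicode fact rather than anything spaCy-specific: lower-casing "İ" (U+0130) in Python yields two code points, so the 1-byte array receives "i" and the 2-byte array the combining dot above. A quick check:

    # Why the Turkish tests expect two arrays to be populated: lower-casing
    # U+0130 produces "i" plus U+0307 COMBINING DOT ABOVE.
    assert [hex(ord(c)) for c in "İ".lower()] == ["0x69", "0x307"]
    assert "\u0307".encode("utf-8") == b"\xcc\x87"  # the expected 2-byte entry
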
diff --git a/spacy/tokens/doc.pxd b/spacy/tokens/doc.pxd
index 05e75dc17..880d65759 100644
--- a/spacy/tokens/doc.pxd
+++ b/spacy/tokens/doc.pxd
@@ -38,34 +38,22 @@ cdef int token_by_end(const TokenC* tokens, int length, int end_char) except -2
 
 cdef int [:,:] _get_lca_matrix(Doc, int start, int end)
 
 
-cdef void _copy_chars(
-    Py_UCS4* target,
-    const Py_UCS4* source,
-    const int length,
-    const bint to_lower
-)
 
 
-cdef void _set_affixes(
-    const Py_UCS4* text_buf,
-    const int tok_idx,
-    const int tok_len,
-    Py_UCS4* aff_buf,
+cdef void _set_affix_lengths(
+    const unsigned char[:] text_buf,
+    char* aff_len_buf,
     const int pref_len,
     const int suff_len,
-    const bint to_lower
-)
+) nogil
 
 
-cdef void _search_for_chars(
-    const Py_UCS4* text_buf,
-    const int tok_idx,
-    const int tok_len,
-    Py_UCS4* search_buf,
-    Py_UCS4* lookup_buf,
-    const int search_buf_len,
-    Py_UCS4* result_buf,
-    const int result_buf_len,
+cdef bint _search_for_chars(
+    const unsigned char[:] tok_str,
+    const unsigned char[:] s_1byte_ch,
+    const unsigned char[:] s_2byte_ch,
+    const unsigned char[:] s_3byte_ch,
+    const unsigned char[:] s_4byte_ch,
+    unsigned char* res_buf,
+    unsigned char* len_buf,
     bint suffs_not_prefs
 ) nogil
diff --git a/spacy/tokens/doc.pyi b/spacy/tokens/doc.pyi
index 7e4962953..dc26b6010 100644
--- a/spacy/tokens/doc.pyi
+++ b/spacy/tokens/doc.pyi
@@ -126,7 +126,7 @@ class Doc:
         blocked: Optional[List[Span]] = ...,
         missing: Optional[List[Span]] = ...,
         outside: Optional[List[Span]] = ...,
-        default: str = ...
+        default: str = ...,
     ) -> None: ...
     @property
     def noun_chunks(self) -> Iterator[Span]: ...
@@ -178,16 +178,18 @@ class Doc:
         self,
         *,
         cs: bool,
-        pref_lengths: Ints1d,
-        suff_lengths: Ints1d,
-        pref_search_chars: str,
-        pref_lookup_chars: str,
-        pref_search_char_length: int,
-        pref_search_lengths: Ints1d,
-        suff_search_chars: str,
-        suff_lookup_chars: str,
-        suff_search_char_length: int,
-        suff_search_lengths: Ints1d,
+        p_lengths: Ints1d,
+        s_lengths: Ints1d,
+        ps_1byte_ch: bytes,
+        ps_2byte_ch: bytes,
+        ps_3byte_ch: bytes,
+        ps_4byte_ch: bytes,
+        ps_lengths: Ints1d,
+        ss_1byte_ch: bytes,
+        ss_2byte_ch: bytes,
+        ss_3byte_ch: bytes,
+        ss_4byte_ch: bytes,
+        ss_lengths: Ints1d,
     ) -> Ints2d: ...
     @staticmethod
     def _get_array_attrs() -> Tuple[Any]: ...
diff --git a/spacy/tokens/doc.pyx b/spacy/tokens/doc.pyx
index f6cd0a0bc..ca2f040d9 100644
--- a/spacy/tokens/doc.pyx
+++ b/spacy/tokens/doc.pyx
@@ -1736,18 +1736,20 @@ cdef class Doc:
 
         return output
 
     def get_character_combination_hashes(self,
         *,
         const bint cs,
         np.ndarray p_lengths,
-        np.ndarray s_lengths,
-        const char* ps_search,
-        const char* ps_lookup,
-        const int ps_l,
+        np.ndarray s_lengths,
+        const unsigned char[:] ps_1byte_ch,
+        const unsigned char[:] ps_2byte_ch,
+        const unsigned char[:] ps_3byte_ch,
+        const unsigned char[:] ps_4byte_ch,
         np.ndarray ps_lengths,
-        const char* ss_search,
-        const char* ss_lookup,
-        const int ss_l,
+        const unsigned char[:] ss_1byte_ch,
+        const unsigned char[:] ss_2byte_ch,
+        const unsigned char[:] ss_3byte_ch,
+        const unsigned char[:] ss_4byte_ch,
         np.ndarray ss_lengths,
     ):
         """
@@ -1766,44 +1768,26 @@
             the prefixes hashed for "spaCy" would be "sp" and "spa".
         s_lengths: an Ints1d specifying the lengths of suffixes to be hashed. For example, if *s_lengths==[2, 3]*
             and *cs == True*, the suffixes hashed for "spaCy" would be "Cy" and "aCy".
-        ps_search: a byte array containing characters to search for within each token, starting at the beginning.
-        ps_lookup: a byte array containing characters that are added to the result string when a character at
-            the corresponding position in *ps_search* is matched. Having separate search and lookup arrays enables
-            case-insensitivity to be handled efficiently.
-        ps_search_l: the number of characters in *ps_search* and hence also in *ps_lookup*
+        ps_1byte_ch ... ps_4byte_ch: byte arrays containing, in order, the 1-, 2-, 3- and 4-byte-wide UTF-8
+            characters to search for within each token, starting at the beginning.
         ps_lengths: an Ints1d specifying the lengths of search results (from the beginning) to be hashed. For
             example, if *ps_lengths==[1, 2]*, *ps_search=="aC" and *cs==False*, the searched strings hashed for
             "spaCy" would be "a" and "ac".
-        ss_search: a byte array containing characters to search for within each token, starting at the end.
-        ss_lookup: a byte array containing characters that are added to the result string when a character at
-            the corresponding position in *ss_search* is matched. Having separate search and lookup arrays enables
-            case-insensitivity to be handled efficiently.
-        ss_l: the number of characters in *ss_search* and hence also in *ss_lookup*
+        ss_1byte_ch ... ss_4byte_ch: byte arrays containing, in order, the 1-, 2-, 3- and 4-byte-wide UTF-8
+            characters to search for within each token, starting at the end.
         ss_lengths: an integer list specifying the lengths of search results (from the end) to be hashed. For
             example, if *ss_lengths==[1, 2]*, *ss_search=="aC" and *cs==False*, the searched strings hashed for
             "spaCy" would be "c" and "ca".
-
-        For a document with tokens ["spaCy", "and", "Prodigy"], the NumPy array returned by
-        *get_character_combination_hashes(True, [2], [2, 4, 6], "", "", 0, [], "yC", "yC", 2, [1, 2])*
-        would correspond to
-
-        [[hash("sp"), [hash("Cy"), hash("paCy"), hash(" spaCy"), hash("y"), hash("yC")],
-        [hash("an"), hash("nd"), hash(" and", hash(" and"), hash(" "), hash(" "))],
-        [hash("Pr") ,hash("gy"), hash("digy"), hash("rodigy"), hash("y"), hash("y ")]]
         """
-        # Encode the document text
-        cdef bytes encoded_text = self.text.encode("utf-32le")
-        cdef char* intermediate_text = encoded_text
-        cdef Py_UCS4* text_buf = <Py_UCS4*>intermediate_text
-
         # Define the result array and work out what is used for what in axis 1
         cdef int num_toks = len(self)
         cdef int p_h_num = p_lengths.shape[0]
         cdef int s_h_num = s_lengths.shape[0], s_h_end = p_h_num + s_h_num
         cdef int ps_h_num = ps_lengths.shape[0], ps_h_end = s_h_end + ps_h_num
         cdef int ss_h_num = ss_lengths.shape[0], ss_h_end = ps_h_end + ss_h_num
-        cdef np.ndarray[np.int64_t, ndim=2] hashes = numpy.empty((num_toks, ss_h_end), dtype="int64")
+        cdef np.ndarray[np.int64_t, ndim=2] hashes
+        hashes = numpy.empty((num_toks, ss_h_end), dtype="int64")
 
         # Determine the maximum possible lengths of the affixes to work out how big the buffers need to be
         cdef int p_max_l = max(p_lengths) if p_h_num > 0 else 0
         cdef int s_max_l = max(s_lengths) if s_h_num > 0 else 0
         cdef int ps_max_l = max(ps_lengths) if ps_h_num > 0 else 0
         cdef int ss_max_l = max(ss_lengths) if ss_h_num > 0 else 0
 
-        # Define / allocate buffer (pr/sr: result buffers)
-        cdef int aff_buf_l = p_max_l + s_max_l
-        cdef Py_UCS4* aff_buf = <Py_UCS4*>self.mem.alloc(aff_buf_l, sizeof(Py_UCS4))
-        cdef Py_UCS4* ps_s_buf = <Py_UCS4*>ps_search
-        cdef Py_UCS4* ps_l_buf = <Py_UCS4*>ps_lookup
-        cdef Py_UCS4* ps_r_buf = <Py_UCS4*>self.mem.alloc(ps_max_l, sizeof(Py_UCS4))
-        cdef Py_UCS4* ss_s_buf = <Py_UCS4*>ss_search
-        cdef Py_UCS4* ss_l_buf = <Py_UCS4*>ss_lookup
-        cdef Py_UCS4* ss_r_buf = <Py_UCS4*>self.mem.alloc(ss_max_l, sizeof(Py_UCS4))
+        # Define / allocate buffers
+        cdef int aff_l = p_max_l + s_max_l
+        cdef char* aff_len_buf = <char*>self.mem.alloc(aff_l, 1)
+        cdef unsigned char* ps_res_buf = <unsigned char*>self.mem.alloc(ps_max_l, 4)
+        cdef unsigned char* ps_len_buf = <unsigned char*>self.mem.alloc(ps_max_l, 1)
+        cdef unsigned char* ss_res_buf = <unsigned char*>self.mem.alloc(ss_max_l, 4)
+        cdef unsigned char* ss_len_buf = <unsigned char*>self.mem.alloc(ss_max_l, 1)
 
         # Define memory views on length arrays
         cdef int[:] p_lengths_v = p_lengths
@@ -1829,38 +1811,51 @@ cdef class Doc:
 
         # Define working variables
         cdef TokenC tok_c
-        cdef int tok_i, tok_idx, tok_len, aff_len
+        cdef int tok_i, offset, aff_len
+        cdef uint64_t hash_val
+        cdef attr_t num_tok_attr
+        cdef const unsigned char[:] tok_str
 
         for tok_i in range(num_toks):
             tok_c = self.c[tok_i]
-            tok_idx = tok_c.idx
-            tok_len = tok_c.lex.length
+            num_tok_attr = tok_c.lex.orth if cs else tok_c.lex.lower
+            tok_str = self.vocab.strings.utf8_view(num_tok_attr)
 
-            if aff_buf_l > 0:
-                _set_affixes(text_buf, tok_idx, tok_len, aff_buf, p_max_l, s_max_l, not cs)
-
+            if aff_l > 0:
+                _set_affix_lengths(tok_str, aff_len_buf, p_max_l, s_max_l)
             for hash_idx in range(p_h_num):
-                hashes[tok_i, hash_idx] = hash32(aff_buf, p_lengths_v[hash_idx] * sizeof(Py_UCS4), 0)
+                offset = aff_len_buf[p_lengths_v[hash_idx]]
+                if offset > 0:
+                    hash_val = hash32(<void*>&tok_str[0], offset, 0)
+                    hashes[tok_i, hash_idx] = hash_val
 
             for hash_idx in range(p_h_num, s_h_end):
-                aff_len = s_lengths_v[hash_idx - p_h_num]
-                hashes[tok_i, hash_idx] = hash32(aff_buf + aff_buf_l - aff_len, aff_len * sizeof(Py_UCS4), 0)
+                offset = s_lengths_v[hash_idx - p_h_num]
+                if offset > 0:
+                    hash_val = hash32(<void*>&tok_str[len(tok_str) - offset], offset, 0)
+                    hashes[tok_i, hash_idx] = hash_val
 
-            if ps_h_num > 0:
-                _search_for_chars(text_buf, tok_idx, tok_len, ps_s_buf, ps_l_buf, ps_l, ps_r_buf, ps_max_l, False)
+            if (
+                ps_h_num > 0 and
+                _search_for_chars(tok_str, ps_1byte_ch, ps_2byte_ch, ps_3byte_ch, ps_4byte_ch, ps_res_buf, ps_len_buf, False)
+            ):
                 for hash_idx in range(s_h_end, ps_h_end):
                     aff_len = ps_lengths_v[hash_idx - s_h_end]
-                    hashes[tok_i, hash_idx] = hash32(ps_r_buf, aff_len * sizeof(Py_UCS4), 0)
+                    hashes[tok_i, hash_idx] = hash32(<void*>ps_res_buf, ps_len_buf[aff_len - 1], 0)
 
-            if ss_h_num > 0:
-                _search_for_chars(text_buf, tok_idx, tok_len, ss_s_buf, ss_l_buf, ss_l, ss_r_buf, ss_max_l, True)
+            if (
+                ss_h_num > 0 and
+                _search_for_chars(tok_str, ss_1byte_ch, ss_2byte_ch, ss_3byte_ch, ss_4byte_ch, ss_res_buf, ss_len_buf, True)
+            ):
                 for hash_idx in range(ps_h_end, ss_h_end):
                     aff_len = ss_lengths_v[hash_idx - ps_h_end]
-                    hashes[tok_i, hash_idx] = hash32(ss_r_buf, aff_len * sizeof(Py_UCS4), 0)
+                    hashes[tok_i, hash_idx] = hash32(<void*>ss_res_buf, ss_len_buf[aff_len - 1], 0)
 
-        self.mem.free(aff_buf)
-        self.mem.free(ps_r_buf)
-        self.mem.free(ss_r_buf)
+        self.mem.free(aff_len_buf)
+        self.mem.free(ps_res_buf)
+        self.mem.free(ps_len_buf)
+        self.mem.free(ss_res_buf)
+        self.mem.free(ss_len_buf)
         return hashes
 
     @staticmethod
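
The hash matrix packs four feature groups along axis 1 in a fixed order. A pure-Python sketch of the column bookkeeping mirroring the `*_h_num` / `*_h_end` arithmetic above (`column_layout` is an illustrative name):

    # Sketch of the axis-1 layout computed above: prefix hashes, then suffix,
    # then prefix-search, then suffix-search columns.
    def column_layout(p_lengths, s_lengths, ps_lengths, ss_lengths):
        p_h_num = len(p_lengths)
        s_h_end = p_h_num + len(s_lengths)
        ps_h_end = s_h_end + len(ps_lengths)
        ss_h_end = ps_h_end + len(ss_lengths)
        return {
            "prefix": range(0, p_h_num),
            "suffix": range(p_h_num, s_h_end),
            "pref_search": range(s_h_end, ps_h_end),
            "suff_search": range(ps_h_end, ss_h_end),
            "total_columns": ss_h_end,
        }

    # Two prefix lengths plus one suffix-search length give three columns.
    assert column_layout([1, 3], [], [], [2])["total_columns"] == 3
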
@@ -2044,34 +2039,13 @@ cdef int [:,:] _get_lca_matrix(Doc doc, int start, int end):
     return lca_matrix
 
 
-cdef void _copy_chars(
-    Py_UCS4* target,
-    const Py_UCS4* source,
-    const int length,
-    const bint to_lower
-):
-    """Copies *length* Py_UCS4 characters from *source* to *target*. If *to_lower==True*, converts
-    any upper-case characters to lower case within the target buffer.
-    """
-    cdef int idx
-
-    memcpy(target, source, length * sizeof(Py_UCS4))
-    if to_lower:
-        for idx in range(length):
-            if Py_UNICODE_ISUPPER(target[idx]):
-                target[idx] = Py_UNICODE_TOLOWER(target[idx])
-
-
-cdef void _set_affixes(
-    const Py_UCS4* text_buf,
-    const int tok_idx,
-    const int tok_len,
-    Py_UCS4* aff_buf,
+cdef void _set_affix_lengths(
+    const unsigned char[:] text_buf,
+    char* aff_len_buf,
     const int pref_len,
     const int suff_len,
-    const bint to_lower
-):
-    """ Populate a buffer of length pref+suff with the first pref and the last suff characters of a word within a string.
+) nogil:
+    """ Populate *aff_len_buf* with the byte offsets that delimit the first *pref_len* and the last *suff_len*
+    characters of the UTF-8 string in *text_buf*.
     If the word is shorter than pref and/or suff, the empty character positions in the middle are filled with zeros.
 
-    text_buf: a pointer to a UTF-32LE representation of the containing string.
+    text_buf: the UTF-8 representation of the token string.
@@ -2082,41 +2056,41 @@ cdef void _set_affixes(
-    suff_len: the length of the suffix.
-    to_lower: if *True*, any upper case characters in either affix are converted to lower case.
+    suff_len: the number of suffix characters to delimit.
     """
-    cdef int aff_buf_idx = 0, aff_buf_len = pref_len + suff_len, in_word_idx, filled_pref_len
-
-    if pref_len > 0:
-        filled_pref_len = pref_len if pref_len < tok_len else tok_len
-        _copy_chars(aff_buf, text_buf + tok_idx, filled_pref_len, to_lower)
-        aff_buf_idx = filled_pref_len
+    cdef int text_buf_idx = 0, aff_len_buf_idx = 0, text_buf_len = len(text_buf)
 
-    if tok_len < pref_len:
-        memset(aff_buf + aff_buf_idx, 0, sizeof(Py_UCS4) * (pref_len - tok_len))
-        aff_buf_idx = aff_buf_len - suff_len
+    while aff_len_buf_idx < pref_len:
+        if (text_buf[text_buf_idx] >> 6) ^ 2 != 0:  # not a continuation byte
+            aff_len_buf[aff_len_buf_idx] = text_buf_idx + 1
+            aff_len_buf_idx += 1
+        text_buf_idx += 1
+        if text_buf_idx == text_buf_len:
+            break
 
-    if tok_len < suff_len:
-        memset(aff_buf + aff_buf_idx, 0, sizeof(Py_UCS4) * (suff_len - tok_len))
-        aff_buf_idx = aff_buf_len - tok_len
+    if aff_len_buf_idx < pref_len:
+        memset(aff_len_buf + aff_len_buf_idx, 0, pref_len - aff_len_buf_idx)
+        aff_len_buf_idx = pref_len
 
-    if suff_len > 0:
-        # in_word_idx: the index within the token where the suffix starts
-        in_word_idx = aff_buf_idx + tok_len - aff_buf_len
-        if in_word_idx < pref_len:
-            memcpy(aff_buf + aff_buf_idx, aff_buf + in_word_idx, sizeof(Py_UCS4) * (filled_pref_len - in_word_idx))
-            aff_buf_idx += filled_pref_len - in_word_idx
-            in_word_idx = aff_buf_idx + tok_len - aff_buf_len
-        if aff_buf_idx < aff_buf_len:
-            _copy_chars(aff_buf + aff_buf_idx, text_buf + tok_idx + in_word_idx, aff_buf_len - aff_buf_idx, to_lower)
+    text_buf_idx = 1
+    while aff_len_buf_idx < pref_len + suff_len:
+        if (text_buf[text_buf_len - text_buf_idx] >> 6) ^ 2 != 0:  # not a continuation byte
+            aff_len_buf[aff_len_buf_idx] = text_buf_len - text_buf_idx
+            aff_len_buf_idx += 1
+        text_buf_idx += 1
+        if text_buf_idx > text_buf_len:
+            break
+
+    if aff_len_buf_idx < pref_len + suff_len:
+        memset(aff_len_buf + aff_len_buf_idx, 0, pref_len + suff_len - aff_len_buf_idx)
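
`_set_affix_lengths` finds character boundaries in UTF-8: a byte is a continuation byte exactly when its top two bits are `10`, which is what `(byte >> 6) ^ 2 != 0` tests for (bitwise operators bind tighter than comparisons in Python and Cython). A pure-Python sketch of the prefix pass under that rule (`prefix_byte_lengths` is an illustrative name):

    # Pure-Python sketch of the prefix pass above: record one entry per
    # character start; a byte is a continuation iff (byte >> 6) == 0b10.
    def prefix_byte_lengths(token_utf8: bytes, pref_len: int):
        lengths, idx = [], 0
        while len(lengths) < pref_len and idx < len(token_utf8):
            if (token_utf8[idx] >> 6) ^ 2 != 0:  # not a continuation byte
                lengths.append(idx + 1)
            idx += 1
        lengths += [0] * (pref_len - len(lengths))  # zero-fill, as memset does
        return lengths

    # "äb" is encoded as C3 A4 62: entries at the two character starts only.
    assert prefix_byte_lengths("äb".encode("utf-8"), 3) == [1, 3, 0]
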
""" - cdef int aff_buf_idx = 0, aff_buf_len = pref_len + suff_len, in_word_idx, filled_pref_len - - if pref_len > 0: - filled_pref_len = pref_len if pref_len < tok_len else tok_len - _copy_chars(aff_buf, text_buf + tok_idx, filled_pref_len, to_lower) - aff_buf_idx = filled_pref_len + cdef int text_buf_idx = 0, aff_len_buf_idx = 0, text_buf_len = len(text_buf) - if tok_len < pref_len: - memset(aff_buf + aff_buf_idx, 0, sizeof(Py_UCS4) * (pref_len - tok_len)) - aff_buf_idx = aff_buf_len - suff_len + while aff_len_buf_idx < pref_len: + if (text_buf[text_buf_idx] >> 6) ^ 2 != 0: # not a continuation character + aff_len_buf[aff_len_buf_idx] = text_buf_idx + 1 + aff_len_buf_idx += 1 + text_buf_idx += 1 + if text_buf_idx == len(text_buf): + break - if tok_len < suff_len: - memset(aff_buf + aff_buf_idx, 0, sizeof(Py_UCS4) * (suff_len - tok_len)) - aff_buf_idx = aff_buf_len - tok_len + if aff_len_buf_idx < pref_len: + memset(aff_len_buf + aff_len_buf_idx, 0, pref_len - aff_len_buf_idx) + aff_len_buf_idx = pref_len - if suff_len > 0: - # in_word_idx: the index within the token where the suffix starts - in_word_idx = aff_buf_idx + tok_len - aff_buf_len - if in_word_idx < pref_len: - memcpy(aff_buf + aff_buf_idx, aff_buf + in_word_idx, sizeof(Py_UCS4) * (filled_pref_len - in_word_idx)) - aff_buf_idx += filled_pref_len - in_word_idx - in_word_idx = aff_buf_idx + tok_len - aff_buf_len - if aff_buf_idx < aff_buf_len: - _copy_chars(aff_buf + aff_buf_idx, text_buf + tok_idx + in_word_idx, aff_buf_len - aff_buf_idx, to_lower) + text_buf_idx = 1 + while aff_len_buf_idx < pref_len + suff_len: + if (text_buf[text_buf_len - text_buf_idx] >> 6) ^ 2 != 0: # not a continuation character + aff_len_buf[aff_len_buf_idx] = text_buf_len - text_buf_idx + aff_len_buf_idx += 1 + text_buf_idx += 1 + if text_buf_idx > text_buf_len: + break + + if aff_len_buf_idx < pref_len + suff_len: + memset(aff_len_buf + aff_len_buf_idx, 0, suff_len - aff_len_buf_idx) -cdef void _search_for_chars( - const Py_UCS4* text_buf, - const int tok_idx, - const int tok_len, - Py_UCS4* search_buf, - Py_UCS4* lookup_buf, - const int search_buf_len, - Py_UCS4* result_buf, - const int result_buf_len, +cdef bint _search_for_chars( + const unsigned char[:] tok_str, + const unsigned char[:] s_1byte_ch, + const unsigned char[:] s_2byte_ch, + const unsigned char[:] s_3byte_ch, + const unsigned char[:] s_4byte_ch, + unsigned char* res_buf, + unsigned char* len_buf, bint suffs_not_prefs ) nogil: """ Search a word within a string for characters within *search_buf*, starting at the beginning or @@ -2133,6 +2107,8 @@ cdef void _search_for_chars( result_buf: the buffer in which to place the results. result_buf_len: the length of *result_buf*. suffs_not_prefs: if *True*, searching starts from the end of the word; if *False*, from the beginning. + + Returns *True* if at least one character from *search_buf* was found in the word, otherwise *False*. 
""" cdef int result_buf_idx = 0, text_string_idx = tok_idx + (tok_len - 1) if suffs_not_prefs else tok_idx cdef int search_buf_idx @@ -2158,6 +2134,8 @@ cdef void _search_for_chars( # fill in any unused characters in the result buffer with zeros if result_buf_idx < result_buf_len: memset(result_buf + result_buf_idx, 0, (result_buf_len - result_buf_idx) * sizeof(Py_UCS4)) + + return result_buf_idx > 0 def pickle_doc(doc): diff --git a/spacy/util.py b/spacy/util.py index c3add9fc9..4a656b61b 100644 --- a/spacy/util.py +++ b/spacy/util.py @@ -1737,69 +1737,42 @@ def all_equal(iterable): return next(g, True) and not next(g, False) -def get_arrays_for_search_chars( +def get_search_char_byte_arrays( search_chars: str, case_sensitive: bool -) -> Tuple[bytes, bytes]: +) -> Tuple[bytes, bytes, bytes, bytes]: """ - This function supports the rich feature extractor. It returns search byte arrays with - 4-byte character width that are used for comparison when searching document texts - for search characters. The encoding is little-endian regardless of architecture, as - this is what is expected by the murmurhash library used downstream. - - Alongside the "search array" against which words from document texts are compared - is the "lookup array". When a character from the search array is matched, - the character at the corresponding position in the lookup array is added to the - sequence that then goes on to be hashed. This enables case-sensitivity - to be handled without converting the case of the words being searched: if - *case_sensitive==False*, the lower- or uppercase counterparts of any characters that - have case are added to the search array, and both the original character and its - other-cased counterpart map to the lower-case version in the lookup array. + This function supports the rich feature extractor. It splits the UTF-8 representation + of *search_chars* into separate byte arrays containing 1-, 2-, 3-, and 4-byte + characters respectively. Any duplicates in *search_chars* are removed, and *search_chars* + is converted to lower case if *case_sensitive==False*. """ - def encode(ch: str) -> bytes: - """ - ch: a single character - """ - return ch.encode("UTF-32LE") - - def add_to_arrays( - search: List[bytes], lookup: List[bytes], ch: str - ) -> None: - """Add the byte representations of *ch* to the two byte array lists. - """ - this_char_bytes = encode(ch) - if not case_sensitive and ch.islower(): - if this_char_bytes not in search: - search.append(this_char_bytes) - lookup.append(this_char_bytes) - upper_char_bytes = encode(ch.upper()) - if upper_char_bytes not in search: - search.append(upper_char_bytes) - lookup.append(this_char_bytes) - elif not case_sensitive and ch.isupper(): - lower_char_bytes = encode(ch.lower()) - if this_char_bytes not in search: - search.append(this_char_bytes) - lookup.append(lower_char_bytes) - if lower_char_bytes not in search: - search.append(lower_char_bytes) - lookup.append(lower_char_bytes) - elif this_char_bytes not in search: - search.append(this_char_bytes) - lookup.append(this_char_bytes) - - def get_ordered_raw_bytes( - search: List[bytes], lookup: List[bytes] - ) -> Tuple[bytes, bytes]: - """Flatten the two lists, ordering both by the entries in *search*. 
- """ - num_search = [list(entry) for entry in search] - search = [entry for _, entry in sorted(zip(num_search, search))] - lookup = [entry for _, entry in sorted(zip(num_search, lookup))] - return b"".join(search), b"".join(lookup) - - search: List[bytes] = [] - lookup: List[bytes] = [] - for ch in search_chars: - add_to_arrays(search, lookup, ch) - return get_ordered_raw_bytes(search, lookup) + sc1 = bytearray() + sc2 = bytearray() + sc3 = bytearray() + sc4 = bytearray() + if not case_sensitive: + search_chars = search_chars.lower() + ordered_search_chars = "".join(sorted(set(search_chars))) + encoded_search_char_bytes = ordered_search_chars.encode("UTF-8") + working_start = 0 + for idx in range(len(encoded_search_char_bytes) + 1): + if idx == 0: + continue + if ( + idx == len(encoded_search_char_bytes) + or encoded_search_char_bytes[idx] & 0xC0 != 0x80 # not continuation byte + ): + char_length = idx - working_start + if char_length == 1: + sc1.extend(encoded_search_char_bytes[working_start:idx]) + elif char_length == 2: + sc2.extend(encoded_search_char_bytes[working_start:idx]) + elif char_length == 3: + sc3.extend(encoded_search_char_bytes[working_start:idx]) + elif char_length == 4: + sc4.extend(encoded_search_char_bytes[working_start:idx]) + else: + raise RuntimeError(Errors.E1049) + working_start = idx + return bytes(sc1), bytes(sc2), bytes(sc3), bytes(sc4)