Mirror of https://github.com/explosion/spaCy.git (synced 2025-08-02 11:20:19 +03:00)

Commit a1b8697aab — "Changes after review discussion — intermed. state"
Parent: 7d8258bec8
spacy/errors.py

@@ -214,6 +214,8 @@ class Warnings(metaclass=ErrorsWithCodes):
             "is a Cython extension type.")
     W123 = ("Argument {arg} with value {arg_value} is used instead of {config_value} as specified in the config. Be "
             "aware that this might affect other components in your pipeline.")
+    W124 = ("Using the features PREFIX and/or SUFFIX in a RichFeatureExtractor configuration may lead to the same "
+            "information being fed forward twice if prefixes and suffixes of corresponding lengths are specified.")


 class Errors(metaclass=ErrorsWithCodes):

@@ -953,6 +955,8 @@ class Errors(metaclass=ErrorsWithCodes):
     E1046 = ("{cls_name} is an abstract class and cannot be instantiated. If you are looking for spaCy's default "
              "knowledge base, use `InMemoryLookupKB`.")
     E1047 = ("Invalid rich group config '{label}'.")
+    E1048 = ("Length > 63 in rich group config '{label}'.")
+    E1049 = ("Error splitting UTF-8 byte string into separate characters.")


     # Deprecated model shortcuts, only used in errors and warnings
spacy/ml/models/tok2vec.py

@@ -1,5 +1,6 @@
 from encodings import search_function
 from typing import Optional, List, Union, cast
+import warnings
 from spacy.ml.richfeatureextractor import RichFeatureExtractor
 from thinc.types import Floats2d, Ints2d, Ragged
 from thinc.api import chain, clone, concatenate, with_array, with_padded

@@ -8,7 +9,7 @@ from thinc.api import expand_window, residual, Maxout, Mish, PyTorchLSTM
 
 from ...tokens import Doc
 from ...util import registry
-from ...errors import Errors
+from ...errors import Errors, Warnings
 from ...ml import _character_embed
 from ..staticvectors import StaticVectors
 from ..featureextractor import FeatureExtractor

@@ -207,6 +208,8 @@ def _verify_rich_config_group(
         raise ValueError(Errors.E1047.format(label=label))
     elif search_chars is not None:
         raise ValueError(Errors.E1047.format(label=label))
+    if lengths is not None and max(lengths) > 63:
+        raise ValueError(Errors.E1048.format(label=label))


 @registry.architectures("spacy.RichMultiHashEmbed.v1")
@@ -246,13 +249,13 @@ def RichMultiHashEmbed(
     depending on the presence of some other letter before or after it, e.g. German
     plural nouns where the final two vowels are `ä-e` regularly correspond to
     singular lemmas where the `e` is no longer present and the `ä` has become `a`.
     For most languages used with spaCy, searching is likely to be useful starting
     at the end (`suff_*`), but the ability to search from the beginning (`pref_*`)
     is also offered for completeness. Search characters should consist of all
     characters that regularly alternate with other characters in the language in
     question or whose presence before or after characters that would otherwise
     alternate prevents the alternation from occurring, e.g. an `ä` in a German
     plural noun does not become `a` if it is the third or fourth vowel from the
     end of the word.
 
     width (int): The output width. Also used as the width of the embedding tables.

@@ -263,27 +266,27 @@ def RichMultiHashEmbed(
         same length as attrs.
     include_static_vectors (bool): Whether to also use static word vectors.
         Requires a vectors table to be loaded in the Doc objects' vocab.
     case_sensitive (bool): Whether lower-case and upper-case letters should be
         distinguished when generating the character combinations to use as features.
     pref_lengths (Optional[List[int]]): The lengths of prefixes to use as features
         for each word, e.g. for the word `spaCy`:
         `[1, 3]` would lead to `s` and `spa` being used as features.
     pref_rows (Optional[List[int]]): The number of rows for each of `pref_lengths`.
     suff_lengths (Optional[List[int]]): The lengths of suffixes to use as features
         for each word, e.g. for the word `spaCy`:
         `[1, 3]` would lead to `y` and `aCy` being used as features.
     suff_rows (Optional[List[int]]): The number of rows for each of `suff_lengths`.
     pref_search_chars (Optional[str]): A string containing characters to search for
         starting from the beginning of each word.
     pref_search_lengths (Optional[List[int]]): The lengths of search result strings
         to use as features, where the searches start from the beginning of each word.
     pref_search_rows (Optional[List[int]]): The number of rows for each of
         `pref_search_lengths`.
     suff_search_chars (Optional[str]): A string containing characters to search for
         starting from the end of each word.
     suff_search_lengths (Optional[List[int]]): The lengths of search result strings
         to use as features, where the searches start from the end of each word.
     suff_search_rows (Optional[List[int]]): The number of rows for each of
         `suff_search_lengths`.
     """
 
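Note: the prefix/suffix search semantics documented above can be sketched in plain Python as follows (an illustration only, not code from this commit; the actual implementation in doc.pyx below operates on UTF-8 bytes):

def search_result(word, search_chars, lengths, from_end):
    # Collect the characters of *word* that occur in *search_chars*,
    # scanning from the end or from the beginning, then keep the first
    # *length* collected characters for each requested result length.
    scan = reversed(word) if from_end else word
    found = "".join(ch for ch in scan if ch in search_chars)
    return [found[:length] for length in lengths]

# Mirrors the docstring example: searching "spaCy" case-insensitively for
# "aC" from the beginning with lengths [1, 2] yields "a" and "ac".
assert search_result("spacy", "ac", [1, 2], from_end=False) == ["a", "ac"]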
@@ -313,6 +316,9 @@ def RichMultiHashEmbed(
         case_sensitive,
     )
 
+    if "PREFIX" in attrs or "SUFFIX" in attrs:
+        warnings.warn(Warnings.W124)
+
     if pref_rows is not None:
         rows.extend(pref_rows)
     if suff_rows is not None:
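Note: W124 fires here because spaCy's built-in PREFIX and SUFFIX lexical attributes are themselves fixed-length affixes (by default the first character and the last three characters of the token), so combining them with overlapping pref_lengths/suff_lengths feeds the same substrings forward twice:

word = "spacy"
assert word[:1] == "s"     # PREFIX attribute: first character by default
assert word[-3:] == "acy"  # SUFFIX attribute: last three characters by default
# With pref_lengths=[1] or suff_lengths=[3] the rich extractor would hash
# these same strings a second time — hence the new warning.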
spacy/ml/richfeatureextractor.py

@@ -1,5 +1,7 @@
 from typing import List, Optional, Callable, Tuple
-from ..util import get_arrays_for_search_chars
+from spacy.util import get_search_char_byte_arrays
+
+# from ..util import get_arrays_for_search_chars
 from thinc.types import Ints1d, Ints2d
 from thinc.api import Model, registry, get_current_ops

@@ -19,17 +21,23 @@ def RichFeatureExtractor(
 ) -> Model[List[Doc], List[Ints2d]]:
     ops = get_current_ops()
     if pref_search_chars is not None:
-        pref_search, pref_lookup = get_arrays_for_search_chars(
-            pref_search_chars, case_sensitive
-        )
+        (
+            ps_1byte_ch,
+            ps_2byte_ch,
+            ps_3byte_ch,
+            ps_4byte_ch,
+        ) = get_search_char_byte_arrays(pref_search_chars, case_sensitive)
     else:
-        pref_search, pref_lookup = bytes(), bytes()
+        ps_1byte_ch = ps_2byte_ch = ps_3byte_ch = ps_4byte_ch = bytes()
     if suff_search_chars is not None:
-        suff_search, suff_lookup = get_arrays_for_search_chars(
-            suff_search_chars, case_sensitive
-        )
+        (
+            ss_1byte_ch,
+            ss_2byte_ch,
+            ss_3byte_ch,
+            ss_4byte_ch,
+        ) = get_search_char_byte_arrays(suff_search_chars, case_sensitive)
     else:
-        suff_search, suff_lookup = bytes(), bytes()
+        ss_1byte_ch = ss_2byte_ch = ss_3byte_ch = ss_4byte_ch = bytes()
     return Model(
         "extract_character_combination_hashes",
         forward,
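Note: for orientation, a call to the factory might look roughly like this. This is a sketch only — the full signature of RichFeatureExtractor is not visible in this diff, so the keyword names below are inferred from the body above:

extractor = RichFeatureExtractor(
    case_sensitive=False,
    pref_lengths=[1, 3],         # hash 1- and 3-character prefixes
    suff_lengths=[2, 3],         # hash 2- and 3-character suffixes
    pref_search_chars=None,      # no prefix search
    pref_search_lengths=None,
    suff_search_chars="äöü",     # scan for umlauts from the end of each word
    suff_search_lengths=[1, 2],
)
# The returned thinc Model stores the four per-width byte arrays produced
# by get_search_char_byte_arrays in its attrs, as shown in the next hunk.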
@@ -41,19 +49,17 @@ def RichFeatureExtractor(
             "suff_lengths": ops.asarray1i(suff_lengths)
             if suff_lengths is not None
             else ops.asarray1i([]),
-            "pref_search": pref_search,
-            "pref_lookup": pref_lookup,
-            "pref_search_char_len": len(pref_search) / 4
-            if pref_search_chars is not None
-            else 0,
+            "pref_search_1_byte": ps_1byte_ch,
+            "pref_search_2_bytes": ps_2byte_ch,
+            "pref_search_3_bytes": ps_3byte_ch,
+            "pref_search_4_bytes": ps_4byte_ch,
             "pref_search_lengths": ops.asarray1i(pref_search_lengths)
             if pref_search_lengths is not None
             else ops.asarray1i([]),
-            "suff_search": suff_search,
-            "suff_lookup": suff_lookup,
-            "suff_search_char_len": len(suff_search) / 4
-            if suff_search_chars is not None
-            else 0,
+            "suff_search_1_byte": ss_1byte_ch,
+            "suff_search_2_bytes": ss_2byte_ch,
+            "suff_search_3_bytes": ss_3byte_ch,
+            "suff_search_4_bytes": ss_4byte_ch,
             "suff_search_lengths": ops.asarray1i(suff_search_lengths)
             if suff_search_lengths is not None
             else ops.asarray1i([]),

@@ -68,13 +74,15 @@ def forward(
     case_sensitive: bool = model.attrs["case_sensitive"]
     pref_lengths: Ints1d = model.attrs["pref_lengths"]
     suff_lengths: Ints1d = model.attrs["suff_lengths"]
-    pref_search: bytes = model.attrs["pref_search"]
-    pref_lookup: bytes = model.attrs["pref_lookup"]
-    pref_search_char_len: int = model.attrs["pref_search_char_len"]
+    ps_1byte_ch: bytes = model.attrs["pref_search_1_byte"]
+    ps_2byte_ch: bytes = model.attrs["pref_search_2_bytes"]
+    ps_3byte_ch: bytes = model.attrs["pref_search_3_bytes"]
+    ps_4byte_ch: bytes = model.attrs["pref_search_4_bytes"]
     pref_search_lengths: Ints1d = model.attrs["pref_search_lengths"]
-    suff_search: bytes = model.attrs["suff_search"]
-    suff_lookup: bytes = model.attrs["suff_lookup"]
-    suff_search_char_len: int = model.attrs["suff_search_char_len"]
+    ss_1byte_ch: bytes = model.attrs["suff_search_1_byte"]
+    ss_2byte_ch: bytes = model.attrs["suff_search_2_bytes"]
+    ss_3byte_ch: bytes = model.attrs["suff_search_3_bytes"]
+    ss_4byte_ch: bytes = model.attrs["suff_search_4_bytes"]
     suff_search_lengths: Ints1d = model.attrs["suff_search_lengths"]
     features: List[Ints2d] = []
     for doc in docs:

@@ -82,13 +90,15 @@ def forward(
             cs=case_sensitive,
             p_lengths=pref_lengths,
             s_lengths=suff_lengths,
-            ps_search=pref_search,
-            ps_lookup=pref_lookup,
-            ps_l=pref_search_char_len,
+            ps_1byte_ch=ps_1byte_ch,
+            ps_2byte_ch=ps_2byte_ch,
+            ps_3byte_ch=ps_3byte_ch,
+            ps_4byte_ch=ps_4byte_ch,
             ps_lengths=pref_search_lengths,
-            ss_search=suff_search,
-            ss_lookup=suff_lookup,
-            ss_l=suff_search_char_len,
+            ss_1byte_ch=ss_1byte_ch,
+            ss_2byte_ch=ss_2byte_ch,
+            ss_3byte_ch=ss_3byte_ch,
+            ss_4byte_ch=ss_4byte_ch,
             ss_lengths=suff_search_lengths,
         )
         features.append(ops.asarray2i(hashes))
spacy/strings.pxd

@@ -27,3 +27,4 @@ cdef class StringStore:
 
     cdef const Utf8Str* intern_unicode(self, str py_string)
     cdef const Utf8Str* _intern_utf8(self, char* utf8_string, int length, hash_t* precalculated_hash)
+    cdef const unsigned char[:] utf8_view(self, attr_t hash_val)
spacy/strings.pyx

@@ -315,3 +315,25 @@ cdef class StringStore:
         self._map.set(key, value)
         self.keys.push_back(key)
         return value
+
+    cdef const unsigned char[:] utf8_view(self, attr_t hash_val):
+        if hash_val == 0:
+            return ""
+        elif hash_val < len(SYMBOLS_BY_INT):
+            return SYMBOLS_BY_INT[hash_val]
+        cdef Utf8Str* string = <Utf8Str*>self._map.get(hash_val)
+        cdef int i, length
+        if string.s[0] < sizeof(string.s) and string.s[0] != 0:
+            return string.s[1:string.s[0]+1]
+        elif string.p[0] < 255:
+            return string.p[1:string.p[0]+1]
+        else:
+            i = 0
+            length = 0
+            while string.p[i] == 255:
+                i += 1
+                length += 255
+            length += string.p[i]
+            i += 1
+            return string.p[i:length + i]
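Note: utf8_view decodes the StringStore's length-prefixed Utf8Str layout: short strings are stored inline in *s* with the length in s[0], while longer strings live in *p*, where a length under 255 takes one byte and larger lengths are encoded as a run of 255-bytes plus a remainder. A pure-Python sketch of the long-string decoding (an illustration, assuming that layout):

def decode_length_prefixed(p: bytes) -> bytes:
    i = 0
    length = 0
    while p[i] == 255:   # each leading 255 byte contributes 255 to the length
        i += 1
        length += 255
    length += p[i]       # the next byte holds the remainder
    i += 1
    return p[i:i + length]

assert decode_length_prefixed(bytes([3]) + b"abc") == b"abc"
assert decode_length_prefixed(bytes([255, 1]) + bytes(256)) == bytes(256)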
spacy/tests (exact test-file path not preserved in this capture):

@@ -1,55 +1,57 @@
 import spacy
+import pytest
 
 
-def test_get_arrays_for_search_chars_width_2_not_case_sensitive():
-    (
-        search,
-        lookup,
-    ) = spacy.util.get_arrays_for_search_chars("bféwfw", False)
-    assert (
-        lookup
-        == b"b\x00\x00\x00f\x00\x00\x00w\x00\x00\x00b\x00\x00\x00f\x00\x00\x00w\x00\x00\x00\xe9\x00\x00\x00\xe9\x00\x00\x00"
-    )
-
-    assert (
-        search
-        == b"B\x00\x00\x00F\x00\x00\x00W\x00\x00\x00b\x00\x00\x00f\x00\x00\x00w\x00\x00\x00\xc9\x00\x00\x00\xe9\x00\x00\x00"
-    )
-
-
-def test_get_arrays_for_search_chars_width_2_case_sensitive():
-    (
-        search,
-        lookup,
-    ) = spacy.util.get_arrays_for_search_chars("bféwfw", True)
-    assert (
-        lookup == search == b"b\x00\x00\x00f\x00\x00\x00w\x00\x00\x00\xe9\x00\x00\x00"
-    )
-
-
-def test_get_arrays_for_search_chars_width_4_not_case_sensitive():
-    (
-        search,
-        lookup,
-    ) = spacy.util.get_arrays_for_search_chars("bfé𐌞wf𐌞wÉ", False)
-    assert (
-        search
-        == b"\x1e\x03\x01\x00B\x00\x00\x00F\x00\x00\x00W\x00\x00\x00b\x00\x00\x00f\x00\x00\x00w\x00\x00\x00\xc9\x00\x00\x00\xe9\x00\x00\x00"
-    )
-
-    assert (
-        lookup
-        == b"\x1e\x03\x01\x00b\x00\x00\x00f\x00\x00\x00w\x00\x00\x00b\x00\x00\x00f\x00\x00\x00w\x00\x00\x00\xe9\x00\x00\x00\xe9\x00\x00\x00"
-    )
-
-
-def test_get_arrays_for_search_chars_width_4_case_sensitive():
-    (
-        search,
-        lookup,
-    ) = spacy.util.get_arrays_for_search_chars("bfé𐌞wf𐌞wÉ", True)
-    assert search == lookup
-    assert (
-        lookup
-        == b"\x1e\x03\x01\x00b\x00\x00\x00f\x00\x00\x00w\x00\x00\x00\xc9\x00\x00\x00\xe9\x00\x00\x00"
-    )
+@pytest.mark.parametrize("case_sensitive", [True, False])
+def test_get_search_char_byte_arrays_1_width_only(case_sensitive):
+    sc1, sc2, sc3, sc4 = spacy.util.get_search_char_byte_arrays("zzaaEP", case_sensitive)
+    if case_sensitive:
+        assert sc1 == b"EPaz"
+    else:
+        assert sc1 == b"aepz"
+    assert sc2 == b""
+    assert sc3 == b""
+    assert sc4 == b""
+
+
+@pytest.mark.parametrize("case_sensitive", [True, False])
+def test_get_search_char_byte_arrays_4_width_only(case_sensitive):
+    sc1, sc2, sc3, sc4 = spacy.util.get_search_char_byte_arrays("𐌞", case_sensitive)
+    assert sc1 == b""
+    assert sc2 == b""
+    assert sc3 == b""
+    assert sc4 == "𐌞".encode("utf-8")
+
+
+@pytest.mark.parametrize("case_sensitive", [True, False])
+def test_get_search_char_byte_arrays_all_widths(case_sensitive):
+    sc1, sc2, sc3, sc4 = spacy.util.get_search_char_byte_arrays("𐌞Éabé—B𐌞", case_sensitive)
+    if case_sensitive:
+        assert sc1 == b"Bab"
+        assert sc2 == "Éé".encode("utf-8")
+    else:
+        assert sc1 == b"ab"
+        assert sc2 == "é".encode("utf-8")
+    assert sc3 == "—".encode("utf-8")
+    assert sc4 == "𐌞".encode("utf-8")
+
+
+@pytest.mark.parametrize("case_sensitive", [True, False])
+def test_turkish_i_with_dot(case_sensitive):
+    sc1, sc2, sc3, sc4 = spacy.util.get_search_char_byte_arrays("İ", case_sensitive)
+    if case_sensitive:
+        assert sc2 == "İ".encode("utf-8")
+        assert sc1 == sc3 == sc4 == b""
+    else:
+        assert sc1 == b"i"
+        assert sc2 == b"\xcc\x87"
+        assert sc3 == sc4 == b""
+
+
+@pytest.mark.parametrize("case_sensitive", [True, False])
+def test_turkish_i_with_dot_and_normal_i(case_sensitive):
+    sc1, sc2, sc3, sc4 = spacy.util.get_search_char_byte_arrays("İI", case_sensitive)
+    if case_sensitive:
+        assert sc1 == b"I"
+        assert sc2 == "İ".encode("utf-8")
+        assert sc3 == sc4 == b""
+    else:
+        assert sc1 == b"i"
+        assert sc2 == b"\xcc\x87"
+        assert sc3 == sc4 == b""
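Note on the Turkish tests: Python lower-cases "İ" (U+0130) to "i" followed by a combining dot above (U+0307), which is why the case-insensitive expectations split across two byte widths:

assert "İ".lower() == "i\u0307"                  # two code points
assert "\u0307".encode("utf-8") == b"\xcc\x87"   # the 2-byte combining mark
# Hence sc1 == b"i" and sc2 == b"\xcc\x87" in the case-insensitive branches.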
spacy/tokens/doc.pxd

@@ -38,34 +38,22 @@ cdef int token_by_end(const TokenC* tokens, int length, int end_char) except -2
 cdef int [:,:] _get_lca_matrix(Doc, int start, int end)
 
 
-cdef void _copy_chars(
-    Py_UCS4* target,
-    const Py_UCS4* source,
-    const int length,
-    const bint to_lower
-)
-
-
-cdef void _set_affixes(
-    const Py_UCS4* text_buf,
-    const int tok_idx,
-    const int tok_len,
-    Py_UCS4* aff_buf,
+cdef void _set_affix_lengths(
+    const unsigned char[:] text_buf,
+    char* aff_len_buf,
     const int pref_len,
     const int suff_len,
-    const bint to_lower
-)
+) nogil
 
 
-cdef void _search_for_chars(
-    const Py_UCS4* text_buf,
-    const int tok_idx,
-    const int tok_len,
-    Py_UCS4* search_buf,
-    Py_UCS4* lookup_buf,
-    const int search_buf_len,
-    Py_UCS4* result_buf,
-    const int result_buf_len,
+cdef bint _search_for_chars(
+    const unsigned char[:] tok_str,
+    const unsigned char[:] s_1byte_ch,
+    const unsigned char[:] s_2byte_ch,
+    const unsigned char[:] s_3byte_ch,
+    const unsigned char[:] s_4byte_ch,
+    unsigned char* res_buf,
+    unsigned char* len_buf,
     bint suffs_not_prefs
 ) nogil
spacy/tokens/doc.pyi

@@ -126,7 +126,7 @@ class Doc:
         blocked: Optional[List[Span]] = ...,
         missing: Optional[List[Span]] = ...,
         outside: Optional[List[Span]] = ...,
-        default: str = ...
+        default: str = ...,
     ) -> None: ...
     @property
     def noun_chunks(self) -> Iterator[Span]: ...

@@ -178,16 +178,18 @@ class Doc:
         self,
         *,
         cs: bool,
-        pref_lengths: Ints1d,
-        suff_lengths: Ints1d,
-        pref_search_chars: str,
-        pref_lookup_chars: str,
-        pref_search_char_length: int,
-        pref_search_lengths: Ints1d,
-        suff_search_chars: str,
-        suff_lookup_chars: str,
-        suff_search_char_length: int,
-        suff_search_lengths: Ints1d,
+        p_lengths: Ints1d,
+        s_lengths: Ints1d,
+        ps_1byte_ch: bytes,
+        ps_2byte_ch: bytes,
+        ps_3byte_ch: bytes,
+        ps_4byte_ch: bytes,
+        ps_lengths: Ints1d,
+        ss_1byte_ch: bytes,
+        ss_2byte_ch: bytes,
+        ss_3byte_ch: bytes,
+        ss_4byte_ch: bytes,
+        ss_lengths: Ints1d,
     ) -> Ints2d: ...
     @staticmethod
     def _get_array_attrs() -> Tuple[Any]: ...
spacy/tokens/doc.pyx

@@ -1736,18 +1736,20 @@ cdef class Doc:
         return output
 
 
-    def get_character_combination_hashes(self,
+    def np.ndarray get_character_combination_hashes(self,
         *,
         const bint cs,
         np.ndarray p_lengths,
         np.ndarray s_lengths,
-        const char* ps_search,
-        const char* ps_lookup,
-        const int ps_l,
+        const unsigned char[:] ps_1byte_ch,
+        const unsigned char[:] ps_2byte_ch,
+        const unsigned char[:] ps_3byte_ch,
+        const unsigned char[:] ps_4byte_ch,
         np.ndarray ps_lengths,
-        const char* ss_search,
-        const char* ss_lookup,
-        const int ss_l,
+        const unsigned char[:] ss_1byte_ch,
+        const unsigned char[:] ss_2byte_ch,
+        const unsigned char[:] ss_3byte_ch,
+        const unsigned char[:] ss_4byte_ch,
         np.ndarray ss_lengths,
     ):
         """

@@ -1766,44 +1768,26 @@ cdef class Doc:
             the prefixes hashed for "spaCy" would be "sp" and "spa".
         s_lengths: an Ints1d specifying the lengths of suffixes to be hashed. For example, if *s_lengths==[2, 3]* and
             *cs == True*, the suffixes hashed for "spaCy" would be "Cy" and "aCy".
-        ps_search: a byte array containing characters to search for within each token, starting at the beginning.
-        ps_lookup: a byte array containing characters that are added to the result string when a character at
-            the corresponding position in *ps_search* is matched. Having separate search and lookup arrays enables
-            case-insensitivity to be handled efficiently.
-        ps_search_l: the number of characters in *ps_search* and hence also in *ps_lookup*
+        ps_<n>byte_ch: a byte array containing in order n-byte-wide characters to search for within each token,
+            starting at the beginning.
         ps_lengths: an Ints1d specifying the lengths of search results (from the beginning) to be hashed. For example, if
             *ps_lengths==[1, 2]*, *ps_search=="aC"* and *cs==False*, the searched strings hashed for
             "spaCy" would be "a" and "ac".
-        ss_search: a byte array containing characters to search for within each token, starting at the end.
-        ss_lookup: a byte array containing characters that are added to the result string when a character at
-            the corresponding position in *ss_search* is matched. Having separate search and lookup arrays enables
-            case-insensitivity to be handled efficiently.
-        ss_l: the number of characters in *ss_search* and hence also in *ss_lookup*
+        ss_<n>byte_ch: a byte array containing in order n-byte-wide characters to search for within each token,
+            starting at the end.
         ss_lengths: an integer list specifying the lengths of search results (from the end) to be hashed. For example, if
             *ss_lengths==[1, 2]*, *ss_search=="aC"* and *cs==False*, the searched strings hashed for
             "spaCy" would be "c" and "ca".
-
-        For a document with tokens ["spaCy", "and", "Prodigy"], the NumPy array returned by
-        *get_character_combination_hashes(True, [2], [2, 4, 6], "", "", 0, [], "yC", "yC", 2, [1, 2])*
-        would correspond to
-
-        [[hash("sp"), [hash("Cy"), hash("paCy"), hash(" spaCy"), hash("y"), hash("yC")],
-        [hash("an"), hash("nd"), hash(" and", hash(" and"), hash(" "), hash(" "))],
-        [hash("Pr") ,hash("gy"), hash("digy"), hash("rodigy"), hash("y"), hash("y ")]]
         """
-
-        # Encode the document text
-        cdef bytes encoded_text = self.text.encode("utf-32le")
-        cdef char* intermediate_text = encoded_text
-        cdef Py_UCS4* text_buf = <Py_UCS4*> intermediate_text
-
         # Define the result array and work out what is used for what in axis 1
         cdef int num_toks = len(self)
         cdef int p_h_num = p_lengths.shape[0]
         cdef int s_h_num = s_lengths.shape[0], s_h_end = p_h_num + s_h_num
         cdef int ps_h_num = ps_lengths.shape[0], ps_h_end = s_h_end + ps_h_num
         cdef int ss_h_num = ss_lengths.shape[0], ss_h_end = ps_h_end + ss_h_num
-        cdef np.ndarray[np.int64_t, ndim=2] hashes = numpy.empty((num_toks, ss_h_end), dtype="int64")
+        cdef np.ndarray[np.int64_t, ndim=2] hashes
+        hashes = numpy.empty((num_toks, ss_h_end), dtype="int64")
 
         # Determine the maximum possible lengths of the affixes to work out how big the buffers need to be
         cdef int p_max_l = max(p_lengths) if p_h_num > 0 else 0
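Note: the index arithmetic above lays each row of the result out as [prefix hashes | suffix hashes | prefix-search hashes | suffix-search hashes]. For example (illustrative lengths):

# With p_lengths=[1, 3], s_lengths=[2], ps_lengths=[], ss_lengths=[1, 2]:
#   p_h_num=2, s_h_end=3, ps_h_end=3, ss_h_end=5
# so row i of *hashes* holds
#   [hash(1-char prefix), hash(3-char prefix), hash(2-char suffix),
#    hash(suffix-search result of length 1), hash(suffix-search result of length 2)]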
@@ -1811,15 +1795,13 @@ cdef class Doc:
         cdef int ps_max_l = max(ps_lengths) if ps_h_num > 0 else 0
         cdef int ss_max_l = max(ss_lengths) if ss_h_num > 0 else 0
 
-        # Define / allocate buffer (pr/sr: result buffers)
-        cdef int aff_buf_l = p_max_l + s_max_l
-        cdef Py_UCS4* aff_buf = <Py_UCS4*> self.mem.alloc(aff_buf_l, sizeof(Py_UCS4))
-        cdef Py_UCS4* ps_s_buf = <Py_UCS4*> ps_search
-        cdef Py_UCS4* ps_l_buf = <Py_UCS4*> ps_lookup
-        cdef Py_UCS4* ps_r_buf = <Py_UCS4*> self.mem.alloc(ps_max_l, sizeof(Py_UCS4))
-        cdef Py_UCS4* ss_s_buf = <Py_UCS4*> ss_search
-        cdef Py_UCS4* ss_l_buf = <Py_UCS4*> ss_lookup
-        cdef Py_UCS4* ss_r_buf = <Py_UCS4*> self.mem.alloc(ss_max_l, sizeof(Py_UCS4))
+        # Define / allocate buffers
+        cdef int aff_l = p_max_l + s_max_l
+        cdef char* aff_len_buf = self.mem.alloc(aff_l, 1)
+        cdef char* ps_res_buf = self.mem.alloc(ps_max_l, 4)
+        cdef char* ps_len_buf = self.mem.alloc(ps_max_l, 1)
+        cdef char* ss_res_buf = self.mem.alloc(ss_max_l, 4)
+        cdef char* ss_len_buf = self.mem.alloc(ss_max_l, 1)
 
         # Define memory views on length arrays
         cdef int[:] p_lengths_v = p_lengths

@@ -1829,38 +1811,51 @@ cdef class Doc:
 
         # Define working variables
         cdef TokenC tok_c
-        cdef int tok_i, tok_idx, tok_len, aff_len
+        cdef int tok_i, offset
+        cdef uint64_t hash_val
+        cdef attr_t num_tok_attr
+        cdef const unsigned char[:] tok_str
 
         for tok_i in range(num_toks):
             tok_c = self.c[tok_i]
-            tok_idx = tok_c.idx
-            tok_len = tok_c.lex.length
+            num_tok_attr = tok_c.lex.orth if cs else tok_c.lex.lower
+            tok_str = self.vocab.strings.utf8_view(num_tok_attr)
 
-            if aff_buf_l > 0:
-                _set_affixes(text_buf, tok_idx, tok_len, aff_buf, p_max_l, s_max_l, not cs)
+            if aff_l > 0:
+                _set_affix_lengths(tok_str, aff_len_buf, p_max_l, s_max_l)
 
             for hash_idx in range(p_h_num):
-                hashes[tok_i, hash_idx] = hash32(aff_buf, p_lengths_v[hash_idx] * sizeof(Py_UCS4), 0)
+                offset = aff_len_buf[p_lengths_v[hash_idx]]
+                if offset > 0:
+                    hash_val = hash32(<void*> &tok_str[0], offset, 0)
+                    hashes[tok_i, hash_idx] = hash_val
 
             for hash_idx in range(p_h_num, s_h_end):
-                aff_len = s_lengths_v[hash_idx - p_h_num]
-                hashes[tok_i, hash_idx] = hash32(aff_buf + aff_buf_l - aff_len, aff_len * sizeof(Py_UCS4), 0)
+                offset = s_lengths_v[hash_idx - p_h_num]
+                if offset > 0:
+                    hash_val = hash32(<void*> &tok_str[len(tok_str) - offset], offset, 0)
+                    hashes[tok_i, hash_idx] = hash_val
 
-            if ps_h_num > 0:
-                _search_for_chars(text_buf, tok_idx, tok_len, ps_s_buf, ps_l_buf, ps_l, ps_r_buf, ps_max_l, False)
+            if (
+                ps_h_num > 0 and
+                _search_for_chars(tok_str, ps_1byte_ch, ps_2byte_ch, ps_3byte_ch, ps_4byte_ch, ps_res_buf, ps_len_buf, False)
+            ):
                 for hash_idx in range(s_h_end, ps_h_end):
                     aff_len = ps_lengths_v[hash_idx - s_h_end]
                     hashes[tok_i, hash_idx] = hash32(ps_r_buf, aff_len * sizeof(Py_UCS4), 0)
 
-            if ss_h_num > 0:
-                _search_for_chars(text_buf, tok_idx, tok_len, ss_s_buf, ss_l_buf, ss_l, ss_r_buf, ss_max_l, True)
+            if (
+                ss_h_num > 0 and
+                _search_for_chars(tok_str, ss_1byte_ch, ss_2byte_ch, ss_3byte_ch, ss_4byte_ch, ss_res_buf, ss_len_buf, True)
+            ):
                 for hash_idx in range(ps_h_end, ss_h_end):
                     aff_len = ss_lengths_v[hash_idx - ps_h_end]
                     hashes[tok_i, hash_idx] = hash32(ss_r_buf, aff_len * sizeof(Py_UCS4), 0)
 
-        self.mem.free(aff_buf)
-        self.mem.free(ps_r_buf)
-        self.mem.free(ss_r_buf)
+        self.mem.free(aff_len_buf)
+        self.mem.free(ps_res_buf)
+        self.mem.free(ps_len_buf)
+        self.mem.free(ss_res_buf)
+        self.mem.free(ss_len_buf)
         return hashes
 
     @staticmethod
@@ -2044,34 +2039,13 @@ cdef int [:,:] _get_lca_matrix(Doc doc, int start, int end):
     return lca_matrix
 
 
-cdef void _copy_chars(
-    Py_UCS4* target,
-    const Py_UCS4* source,
-    const int length,
-    const bint to_lower
-):
-    """Copies *length* Py_UCS4 characters from *source* to *target*. If *to_lower==True*, converts
-    any upper-case characters to lower case within the target buffer.
-    """
-    cdef int idx
-
-    memcpy(target, source, length * sizeof(Py_UCS4))
-    if to_lower:
-        for idx in range(length):
-            if Py_UNICODE_ISUPPER(target[idx]):
-                target[idx] = Py_UNICODE_TOLOWER(target[idx])
-
-
-cdef void _set_affixes(
-    const Py_UCS4* text_buf,
-    const int tok_idx,
-    const int tok_len,
-    Py_UCS4* aff_buf,
+cdef void _set_affix_lengths(
+    const unsigned char[:] text_buf,
+    char* aff_len_buf,
     const int pref_len,
     const int suff_len,
-    const bint to_lower
-):
-    """ Populate a buffer of length pref+suff with the first pref and the last suff characters of a word within a string.
+) nogil:
+    """ TODO : Populate a buffer of length pref+suff with the first pref and the last suff characters of a word within a string.
 
     If the word is shorter than pref and/or suff, the empty character positions in the middle are filled with zeros.
 
     text_buf: a pointer to a UTF-32LE representation of the containing string.

@@ -2082,41 +2056,41 @@ cdef void _set_affixes(
     suff_len: the length of the suffix.
     to_lower: if *True*, any upper case characters in either affix are converted to lower case.
     """
-    cdef int aff_buf_idx = 0, aff_buf_len = pref_len + suff_len, in_word_idx, filled_pref_len
+    cdef int text_buf_idx = 0, aff_len_buf_idx = 0, text_buf_len = len(text_buf)
 
-    if pref_len > 0:
-        filled_pref_len = pref_len if pref_len < tok_len else tok_len
-        _copy_chars(aff_buf, text_buf + tok_idx, filled_pref_len, to_lower)
-        aff_buf_idx = filled_pref_len
-
-    if tok_len < pref_len:
-        memset(aff_buf + aff_buf_idx, 0, sizeof(Py_UCS4) * (pref_len - tok_len))
-        aff_buf_idx = aff_buf_len - suff_len
+    while aff_len_buf_idx < pref_len:
+        if (text_buf[text_buf_idx] >> 6) ^ 2 != 0:  # not a continuation character
+            aff_len_buf[aff_len_buf_idx] = text_buf_idx + 1
+            aff_len_buf_idx += 1
+        text_buf_idx += 1
+        if text_buf_idx == len(text_buf):
+            break
 
-    if tok_len < suff_len:
-        memset(aff_buf + aff_buf_idx, 0, sizeof(Py_UCS4) * (suff_len - tok_len))
-        aff_buf_idx = aff_buf_len - tok_len
+    if aff_len_buf_idx < pref_len:
+        memset(aff_len_buf + aff_len_buf_idx, 0, pref_len - aff_len_buf_idx)
+        aff_len_buf_idx = pref_len
 
-    if suff_len > 0:
-        # in_word_idx: the index within the token where the suffix starts
-        in_word_idx = aff_buf_idx + tok_len - aff_buf_len
-        if in_word_idx < pref_len:
-            memcpy(aff_buf + aff_buf_idx, aff_buf + in_word_idx, sizeof(Py_UCS4) * (filled_pref_len - in_word_idx))
-            aff_buf_idx += filled_pref_len - in_word_idx
-            in_word_idx = aff_buf_idx + tok_len - aff_buf_len
-        if aff_buf_idx < aff_buf_len:
-            _copy_chars(aff_buf + aff_buf_idx, text_buf + tok_idx + in_word_idx, aff_buf_len - aff_buf_idx, to_lower)
+    text_buf_idx = 1
+    while aff_len_buf_idx < pref_len + suff_len:
+        if (text_buf[text_buf_len - text_buf_idx] >> 6) ^ 2 != 0:  # not a continuation character
+            aff_len_buf[aff_len_buf_idx] = text_buf_len - text_buf_idx
+            aff_len_buf_idx += 1
+        text_buf_idx += 1
+        if text_buf_idx > text_buf_len:
+            break
+
+    if aff_len_buf_idx < pref_len + suff_len:
+        memset(aff_len_buf + aff_len_buf_idx, 0, suff_len - aff_len_buf_idx)
 
 
-cdef void _search_for_chars(
-    const Py_UCS4* text_buf,
-    const int tok_idx,
-    const int tok_len,
-    Py_UCS4* search_buf,
-    Py_UCS4* lookup_buf,
-    const int search_buf_len,
-    Py_UCS4* result_buf,
-    const int result_buf_len,
+cdef bint _search_for_chars(
+    const unsigned char[:] tok_str,
+    const unsigned char[:] s_1byte_ch,
+    const unsigned char[:] s_2byte_ch,
+    const unsigned char[:] s_3byte_ch,
+    const unsigned char[:] s_4byte_ch,
+    unsigned char* res_buf,
+    unsigned char* len_buf,
     bint suffs_not_prefs
 ) nogil:
     """ Search a word within a string for characters within *search_buf*, starting at the beginning or
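Note: the boundary test (b >> 6) ^ 2 != 0 used in _set_affix_lengths works because UTF-8 continuation bytes all have the bit pattern 10xxxxxx:

def is_continuation(b: int) -> bool:
    # 10xxxxxx >> 6 == 0b10 == 2, and 2 ^ 2 == 0
    return (b >> 6) ^ 2 == 0

encoded = "Hä".encode("utf-8")   # 0x48 0xC3 0xA4
assert [is_continuation(b) for b in encoded] == [False, False, True]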
@@ -2133,6 +2107,8 @@ cdef void _search_for_chars(
     result_buf: the buffer in which to place the results.
     result_buf_len: the length of *result_buf*.
     suffs_not_prefs: if *True*, searching starts from the end of the word; if *False*, from the beginning.
+
+    Returns *True* if at least one character from *search_buf* was found in the word, otherwise *False*.
     """
     cdef int result_buf_idx = 0, text_string_idx = tok_idx + (tok_len - 1) if suffs_not_prefs else tok_idx
     cdef int search_buf_idx

@@ -2158,6 +2134,8 @@ cdef void _search_for_chars(
     # fill in any unused characters in the result buffer with zeros
     if result_buf_idx < result_buf_len:
         memset(result_buf + result_buf_idx, 0, (result_buf_len - result_buf_idx) * sizeof(Py_UCS4))
+
+    return result_buf_idx > 0


 def pickle_doc(doc):
spacy/util.py

@@ -1737,69 +1737,42 @@ def all_equal(iterable):
     return next(g, True) and not next(g, False)
 
 
-def get_arrays_for_search_chars(
-    search_chars: str, case_sensitive: bool
-) -> Tuple[bytes, bytes]:
-    """
-    This function supports the rich feature extractor. It returns search byte arrays with
-    4-byte character width that are used for comparison when searching document texts
-    for search characters. The encoding is little-endian regardless of architecture, as
-    this is what is expected by the murmurhash library used downstream.
-
-    Alongside the "search array" against which words from document texts are compared
-    is the "lookup array". When a character from the search array is matched,
-    the character at the corresponding position in the lookup array is added to the
-    sequence that then goes on to be hashed. This enables case-sensitivity
-    to be handled without converting the case of the words being searched: if
-    *case_sensitive==False*, the lower- or uppercase counterparts of any characters that
-    have case are added to the search array, and both the original character and its
-    other-cased counterpart map to the lower-case version in the lookup array.
-    """
-
-    def encode(ch: str) -> bytes:
-        """
-        ch: a single character
-        """
-        return ch.encode("UTF-32LE")
-
-    def add_to_arrays(
-        search: List[bytes], lookup: List[bytes], ch: str
-    ) -> None:
-        """Add the byte representations of *ch* to the two byte array lists.
-        """
-        this_char_bytes = encode(ch)
-        if not case_sensitive and ch.islower():
-            if this_char_bytes not in search:
-                search.append(this_char_bytes)
-                lookup.append(this_char_bytes)
-            upper_char_bytes = encode(ch.upper())
-            if upper_char_bytes not in search:
-                search.append(upper_char_bytes)
-                lookup.append(this_char_bytes)
-        elif not case_sensitive and ch.isupper():
-            lower_char_bytes = encode(ch.lower())
-            if this_char_bytes not in search:
-                search.append(this_char_bytes)
-                lookup.append(lower_char_bytes)
-            if lower_char_bytes not in search:
-                search.append(lower_char_bytes)
-                lookup.append(lower_char_bytes)
-        elif this_char_bytes not in search:
-            search.append(this_char_bytes)
-            lookup.append(this_char_bytes)
-
-    def get_ordered_raw_bytes(
-        search: List[bytes], lookup: List[bytes]
-    ) -> Tuple[bytes, bytes]:
-        """Flatten the two lists, ordering both by the entries in *search*.
-        """
-        num_search = [list(entry) for entry in search]
-        search = [entry for _, entry in sorted(zip(num_search, search))]
-        lookup = [entry for _, entry in sorted(zip(num_search, lookup))]
-        return b"".join(search), b"".join(lookup)
-
-    search: List[bytes] = []
-    lookup: List[bytes] = []
-    for ch in search_chars:
-        add_to_arrays(search, lookup, ch)
-    return get_ordered_raw_bytes(search, lookup)
+def get_search_char_byte_arrays(
+    search_chars: str, case_sensitive: bool
+) -> Tuple[bytes, bytes, bytes, bytes]:
+    """
+    This function supports the rich feature extractor. It splits the UTF-8 representation
+    of *search_chars* into separate byte arrays containing 1-, 2-, 3-, and 4-byte
+    characters respectively. Any duplicates in *search_chars* are removed, and *search_chars*
+    is converted to lower case if *case_sensitive==False*.
+    """
+    sc1 = bytearray()
+    sc2 = bytearray()
+    sc3 = bytearray()
+    sc4 = bytearray()
+    if not case_sensitive:
+        search_chars = search_chars.lower()
+    ordered_search_chars = "".join(sorted(set(search_chars)))
+    encoded_search_char_bytes = ordered_search_chars.encode("UTF-8")
+    working_start = 0
+    for idx in range(len(encoded_search_char_bytes) + 1):
+        if idx == 0:
+            continue
+        if (
+            idx == len(encoded_search_char_bytes)
+            or encoded_search_char_bytes[idx] & 0xC0 != 0x80  # not a continuation byte
+        ):
+            char_length = idx - working_start
+            if char_length == 1:
+                sc1.extend(encoded_search_char_bytes[working_start:idx])
+            elif char_length == 2:
+                sc2.extend(encoded_search_char_bytes[working_start:idx])
+            elif char_length == 3:
+                sc3.extend(encoded_search_char_bytes[working_start:idx])
+            elif char_length == 4:
+                sc4.extend(encoded_search_char_bytes[working_start:idx])
+            else:
+                raise RuntimeError(Errors.E1049)
+            working_start = idx
+    return bytes(sc1), bytes(sc2), bytes(sc3), bytes(sc4)
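Note: the split performed by the new helper follows directly from UTF-8 character widths ("a" is 1 byte, "é" 2, "—" 3, "𐌞" 4):

sc1, sc2, sc3, sc4 = get_search_char_byte_arrays("a—é𐌞A", case_sensitive=False)
assert sc1 == b"a"                    # "A" lower-cases to the duplicate "a"
assert sc2 == "é".encode("utf-8")
assert sc3 == "—".encode("utf-8")
assert sc4 == "𐌞".encode("utf-8")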