mirror of https://github.com/explosion/spaCy.git
synced 2025-08-02 11:20:19 +03:00

Changes after review discussion — intermediate state

This commit is contained in:
parent 7d8258bec8
commit a1b8697aab
@@ -214,6 +214,8 @@ class Warnings(metaclass=ErrorsWithCodes):
            "is a Cython extension type.")
    W123 = ("Argument {arg} with value {arg_value} is used instead of {config_value} as specified in the config. Be "
            "aware that this might affect other components in your pipeline.")
    W124 = ("Using the features PREFIX and/or SUFFIX in a RichFeatureExtractor configuration may lead to the same "
            "information being fed forward twice if prefixes and suffixes of corresponding lengths are specified.")


class Errors(metaclass=ErrorsWithCodes):
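W124 flags a redundancy risk: spaCy's built-in PREFIX and SUFFIX lexical attributes already encode the first character and the last three characters of each token, so rich prefixes and suffixes of matching lengths duplicate them. A minimal illustration in plain Python (the attribute semantics are standard spaCy; the rich lengths chosen here are hypothetical):

    token = "spaCy"
    prefix_attr = token[:1]    # "s": what the PREFIX attribute encodes
    suffix_attr = token[-3:]   # "aCy": what the SUFFIX attribute encodes
    rich_prefixes = [token[:n] for n in (1, 2)]   # ["s", "sp"]
    rich_suffixes = [token[-n:] for n in (3, 4)]  # ["aCy", "paCy"]
    # prefix_attr duplicates rich_prefixes[0] and suffix_attr duplicates
    # rich_suffixes[0]: the same information is fed forward twice.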
@@ -953,6 +955,8 @@ class Errors(metaclass=ErrorsWithCodes):
    E1046 = ("{cls_name} is an abstract class and cannot be instantiated. If you are looking for spaCy's default "
             "knowledge base, use `InMemoryLookupKB`.")
    E1047 = ("Invalid rich group config '{label}'.")
    E1048 = ("Length > 63 in rich group config '{label}'.")
    E1049 = ("Error splitting UTF-8 byte string into separate characters.")


# Deprecated model shortcuts, only used in errors and warnings
@@ -1,5 +1,6 @@
from encodings import search_function
from typing import Optional, List, Union, cast
import warnings
from spacy.ml.richfeatureextractor import RichFeatureExtractor
from thinc.types import Floats2d, Ints2d, Ragged
from thinc.api import chain, clone, concatenate, with_array, with_padded
@@ -8,7 +9,7 @@ from thinc.api import expand_window, residual, Maxout, Mish, PyTorchLSTM

from ...tokens import Doc
from ...util import registry
from ...errors import Errors
from ...errors import Errors, Warnings
from ...ml import _character_embed
from ..staticvectors import StaticVectors
from ..featureextractor import FeatureExtractor
@@ -207,6 +208,8 @@ def _verify_rich_config_group(
        raise ValueError(Errors.E1047.format(label=label))
    elif search_chars is not None:
        raise ValueError(Errors.E1047.format(label=label))
    if lengths is not None and max(lengths) > 63:
        raise ValueError(Errors.E1048.format(label=label))


@registry.architectures("spacy.RichMultiHashEmbed.v1")
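A self-contained sketch of the checks this hunk performs, with a group treated as a pair of optional lengths and search characters. The first E1047 condition falls outside the visible hunk, so the combination rule below is an assumption:

    from typing import List, Optional

    def verify_rich_config_group(
        label: str, lengths: Optional[List[int]], search_chars: Optional[str]
    ) -> None:
        # Assumed rule: search characters without lengths (or vice versa) is invalid.
        if lengths is None and search_chars is not None:
            raise ValueError(f"Invalid rich group config '{label}'.")
        # Lengths above 63 are rejected outright (E1048).
        if lengths is not None and max(lengths) > 63:
            raise ValueError(f"Length > 63 in rich group config '{label}'.")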
@@ -313,6 +316,9 @@ def RichMultiHashEmbed(
        case_sensitive,
    )

    if "PREFIX" in attrs or "SUFFIX" in attrs:
        warnings.warn(Warnings.W124)

    if pref_rows is not None:
        rows.extend(pref_rows)
    if suff_rows is not None:
@@ -1,5 +1,7 @@
from typing import List, Optional, Callable, Tuple
from ..util import get_arrays_for_search_chars
from spacy.util import get_search_char_byte_arrays

# from ..util import get_arrays_for_search_chars
from thinc.types import Ints1d, Ints2d
from thinc.api import Model, registry, get_current_ops
@@ -19,17 +21,23 @@ def RichFeatureExtractor(
) -> Model[List[Doc], List[Ints2d]]:
    ops = get_current_ops()
    if pref_search_chars is not None:
        pref_search, pref_lookup = get_arrays_for_search_chars(
            pref_search_chars, case_sensitive
        )
        (
            ps_1byte_ch,
            ps_2byte_ch,
            ps_3byte_ch,
            ps_4byte_ch,
        ) = get_search_char_byte_arrays(pref_search_chars, case_sensitive)
    else:
        pref_search, pref_lookup = bytes(), bytes()
        ps_1byte_ch = ps_2byte_ch = ps_3byte_ch = ps_4byte_ch = bytes()
    if suff_search_chars is not None:
        suff_search, suff_lookup = get_arrays_for_search_chars(
            suff_search_chars, case_sensitive
        )
        (
            ss_1byte_ch,
            ss_2byte_ch,
            ss_3byte_ch,
            ss_4byte_ch,
        ) = get_search_char_byte_arrays(suff_search_chars, case_sensitive)
    else:
        suff_search, suff_lookup = bytes(), bytes()
        ss_1byte_ch = ss_2byte_ch = ss_3byte_ch = ss_4byte_ch = bytes()
    return Model(
        "extract_character_combination_hashes",
        forward,
@@ -41,19 +49,17 @@ def RichFeatureExtractor(
            "suff_lengths": ops.asarray1i(suff_lengths)
            if suff_lengths is not None
            else ops.asarray1i([]),
            "pref_search": pref_search,
            "pref_lookup": pref_lookup,
            "pref_search_char_len": len(pref_search) / 4
            if pref_search_chars is not None
            else 0,
            "pref_search_1_byte": ps_1byte_ch,
            "pref_search_2_bytes": ps_2byte_ch,
            "pref_search_3_bytes": ps_3byte_ch,
            "pref_search_4_bytes": ps_4byte_ch,
            "pref_search_lengths": ops.asarray1i(pref_search_lengths)
            if pref_search_lengths is not None
            else ops.asarray1i([]),
            "suff_search": suff_search,
            "suff_lookup": suff_lookup,
            "suff_search_char_len": len(suff_search) / 4
            if suff_search_chars is not None
            else 0,
            "suff_search_1_byte": ss_1byte_ch,
            "suff_search_2_bytes": ss_2byte_ch,
            "suff_search_3_bytes": ss_3byte_ch,
            "suff_search_4_bytes": ss_4byte_ch,
            "suff_search_lengths": ops.asarray1i(suff_search_lengths)
            if suff_search_lengths is not None
            else ops.asarray1i([]),
@@ -68,13 +74,15 @@ def forward(
    case_sensitive: bool = model.attrs["case_sensitive"]
    pref_lengths: Ints1d = model.attrs["pref_lengths"]
    suff_lengths: Ints1d = model.attrs["suff_lengths"]
    pref_search: bytes = model.attrs["pref_search"]
    pref_lookup: bytes = model.attrs["pref_lookup"]
    pref_search_char_len: int = model.attrs["pref_search_char_len"]
    ps_1byte_ch: bytes = model.attrs["pref_search_1_byte"]
    ps_2byte_ch: bytes = model.attrs["pref_search_2_bytes"]
    ps_3byte_ch: bytes = model.attrs["pref_search_3_bytes"]
    ps_4byte_ch: bytes = model.attrs["pref_search_4_bytes"]
    pref_search_lengths: Ints1d = model.attrs["pref_search_lengths"]
    suff_search: bytes = model.attrs["suff_search"]
    suff_lookup: bytes = model.attrs["suff_lookup"]
    suff_search_char_len: int = model.attrs["suff_search_char_len"]
    ss_1byte_ch: bytes = model.attrs["suff_search_1_byte"]
    ss_2byte_ch: bytes = model.attrs["suff_search_2_bytes"]
    ss_3byte_ch: bytes = model.attrs["suff_search_3_bytes"]
    ss_4byte_ch: bytes = model.attrs["suff_search_4_bytes"]
    suff_search_lengths: Ints1d = model.attrs["suff_search_lengths"]
    features: List[Ints2d] = []
    for doc in docs:
@@ -82,13 +90,15 @@ def forward(
            cs=case_sensitive,
            p_lengths=pref_lengths,
            s_lengths=suff_lengths,
            ps_search=pref_search,
            ps_lookup=pref_lookup,
            ps_l=pref_search_char_len,
            ps_1byte_ch=ps_1byte_ch,
            ps_2byte_ch=ps_2byte_ch,
            ps_3byte_ch=ps_3byte_ch,
            ps_4byte_ch=ps_4byte_ch,
            ps_lengths=pref_search_lengths,
            ss_search=suff_search,
            ss_lookup=suff_lookup,
            ss_l=suff_search_char_len,
            ss_1byte_ch=ss_1byte_ch,
            ss_2byte_ch=ss_2byte_ch,
            ss_3byte_ch=ss_3byte_ch,
            ss_4byte_ch=ss_4byte_ch,
            ss_lengths=suff_search_lengths,
        )
        features.append(ops.asarray2i(hashes))
@@ -27,3 +27,4 @@ cdef class StringStore:

    cdef const Utf8Str* intern_unicode(self, str py_string)
    cdef const Utf8Str* _intern_utf8(self, char* utf8_string, int length, hash_t* precalculated_hash)
    cdef const unsigned char[:] utf8_view(self, attr_t hash_val)
@@ -315,3 +315,25 @@ cdef class StringStore:
        self._map.set(key, value)
        self.keys.push_back(key)
        return value

    cdef const unsigned char[:] utf8_view(self, attr_t hash_val):
        if hash_val == 0:
            return b""
        elif hash_val < len(SYMBOLS_BY_INT):
            return SYMBOLS_BY_INT[hash_val].encode("utf-8")
        cdef Utf8Str* string = <Utf8Str*>self._map.get(hash_val)
        cdef int i, length
        if string.s[0] < sizeof(string.s) and string.s[0] != 0:
            return string.s[1:string.s[0]+1]
        elif string.p[0] < 255:
            return string.p[1:string.p[0]+1]
        else:
            i = 0
            length = 0
            while string.p[i] == 255:
                i += 1
                length += 255
            length += string.p[i]
            i += 1
            return string.p[i:length + i]
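The final branch of utf8_view decodes the store's variable-length prefix for long strings: the byte length is stored as a run of 255 bytes plus one remainder byte. A plain-Python rendering of that decoding step, for illustration only:

    def decode_length_prefix(p: bytes) -> tuple:
        # Returns (payload_offset, payload_length): any number of 255 bytes,
        # each contributing 255 to the length, then a final remainder byte.
        i = 0
        length = 0
        while p[i] == 255:
            i += 1
            length += 255
        length += p[i]
        return i + 1, length

    assert decode_length_prefix(bytes([255, 255, 10])) == (3, 520)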
@@ -1,55 +1,57 @@
import spacy
import pytest


def test_get_arrays_for_search_chars_width_2_not_case_sensitive():
    (
        search,
        lookup,
    ) = spacy.util.get_arrays_for_search_chars("bféwfw", False)
    assert (
        lookup
        == b"b\x00\x00\x00f\x00\x00\x00w\x00\x00\x00b\x00\x00\x00f\x00\x00\x00w\x00\x00\x00\xe9\x00\x00\x00\xe9\x00\x00\x00"
    )
@pytest.mark.parametrize("case_sensitive", [True, False])
def test_get_search_char_byte_arrays_1_width_only(case_sensitive):
    sc1, sc2, sc3, sc4 = spacy.util.get_search_char_byte_arrays("zzaaEP", case_sensitive)
    if case_sensitive:
        assert sc1 == b"EPaz"
    else:
        assert sc1 == b"aepz"
    assert sc2 == b""
    assert sc3 == b""
    assert sc4 == b""

    assert (
        search
        == b"B\x00\x00\x00F\x00\x00\x00W\x00\x00\x00b\x00\x00\x00f\x00\x00\x00w\x00\x00\x00\xc9\x00\x00\x00\xe9\x00\x00\x00"
    )
@pytest.mark.parametrize("case_sensitive", [True, False])
def test_get_search_char_byte_arrays_4_width_only(case_sensitive):
    sc1, sc2, sc3, sc4 = spacy.util.get_search_char_byte_arrays("𐌞", case_sensitive)
    assert sc1 == b""
    assert sc2 == b""
    assert sc3 == b""
    assert sc4 == "𐌞".encode("utf-8")

@pytest.mark.parametrize("case_sensitive", [True, False])
def test_get_search_char_byte_arrays_all_widths(case_sensitive):
    sc1, sc2, sc3, sc4 = spacy.util.get_search_char_byte_arrays("𐌞Éabé—B𐌞", case_sensitive)
    if case_sensitive:
        assert sc1 == b"Bab"
        assert sc2 == "Éé".encode("utf-8")
    else:
        assert sc1 == b"ab"
        assert sc2 == "é".encode("utf-8")
    assert sc3 == "—".encode("utf-8")
    assert sc4 == "𐌞".encode("utf-8")

def test_get_arrays_for_search_chars_width_2_case_sensitive():
    (
        search,
        lookup,
    ) = spacy.util.get_arrays_for_search_chars("bféwfw", True)
    assert (
        lookup == search == b"b\x00\x00\x00f\x00\x00\x00w\x00\x00\x00\xe9\x00\x00\x00"
    )
@pytest.mark.parametrize("case_sensitive", [True, False])
def test_turkish_i_with_dot(case_sensitive):
    sc1, sc2, sc3, sc4 = spacy.util.get_search_char_byte_arrays("İ", case_sensitive)
    if case_sensitive:
        assert sc2 == "İ".encode("utf-8")
        assert sc1 == sc3 == sc4 == b""
    else:
        assert sc1 == b"i"
        assert sc2 == b"\xcc\x87"
        assert sc3 == sc4 == b""


def test_get_arrays_for_search_chars_width_4_not_case_sensitive():
    (
        search,
        lookup,
    ) = spacy.util.get_arrays_for_search_chars("bfé𐌞wf𐌞wÉ", False)
    assert (
        search
        == b"\x1e\x03\x01\x00B\x00\x00\x00F\x00\x00\x00W\x00\x00\x00b\x00\x00\x00f\x00\x00\x00w\x00\x00\x00\xc9\x00\x00\x00\xe9\x00\x00\x00"
    )

    assert (
        lookup
        == b"\x1e\x03\x01\x00b\x00\x00\x00f\x00\x00\x00w\x00\x00\x00b\x00\x00\x00f\x00\x00\x00w\x00\x00\x00\xe9\x00\x00\x00\xe9\x00\x00\x00"
    )


def test_get_arrays_for_search_chars_width_4_case_sensitive():
    (
        search,
        lookup,
    ) = spacy.util.get_arrays_for_search_chars("bfé𐌞wf𐌞wÉ", True)
    assert search == lookup
    assert (
        lookup
        == b"\x1e\x03\x01\x00b\x00\x00\x00f\x00\x00\x00w\x00\x00\x00\xc9\x00\x00\x00\xe9\x00\x00\x00"
    )
@pytest.mark.parametrize("case_sensitive", [True, False])
def test_turkish_i_with_dot_and_normal_i(case_sensitive):
    sc1, sc2, sc3, sc4 = spacy.util.get_search_char_byte_arrays("İI", case_sensitive)
    if case_sensitive:
        assert sc1 == b"I"
        assert sc2 == "İ".encode("utf-8")
        assert sc3 == sc4 == b""
    else:
        assert sc1 == b"i"
        assert sc2 == b"\xcc\x87"
        assert sc3 == sc4 == b""
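The Turkish tests rest on a Unicode quirk worth spelling out: lowercasing U+0130 ("İ", LATIN CAPITAL LETTER I WITH DOT ABOVE) produces two code points, a plain "i" followed by the combining dot above U+0307, which is why the case-insensitive expectations span two of the returned byte arrays:

    s = "\u0130"            # "İ"
    lowered = s.lower()     # "i" followed by "\u0307" (combining dot above)
    assert [c.encode("utf-8") for c in lowered] == [b"i", b"\xcc\x87"]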
@@ -38,34 +38,22 @@ cdef int token_by_end(const TokenC* tokens, int length, int end_char) except -2
cdef int [:,:] _get_lca_matrix(Doc, int start, int end)


cdef void _copy_chars(
    Py_UCS4* target,
    const Py_UCS4* source,
    const int length,
    const bint to_lower
)


cdef void _set_affixes(
    const Py_UCS4* text_buf,
    const int tok_idx,
    const int tok_len,
    Py_UCS4* aff_buf,
cdef void _set_affix_lengths(
    const unsigned char[:] text_buf,
    char* aff_len_buf,
    const int pref_len,
    const int suff_len,
    const bint to_lower
)
) nogil


cdef void _search_for_chars(
    const Py_UCS4* text_buf,
    const int tok_idx,
    const int tok_len,
    Py_UCS4* search_buf,
    Py_UCS4* lookup_buf,
    const int search_buf_len,
    Py_UCS4* result_buf,
    const int result_buf_len,
cdef bint _search_for_chars(
    const unsigned char[:] tok_str,
    const unsigned char[:] s_1byte_ch,
    const unsigned char[:] s_2byte_ch,
    const unsigned char[:] s_3byte_ch,
    const unsigned char[:] s_4byte_ch,
    unsigned char* res_buf,
    unsigned char* len_buf,
    bint suffs_not_prefs
) nogil
@@ -126,7 +126,7 @@ class Doc:
        blocked: Optional[List[Span]] = ...,
        missing: Optional[List[Span]] = ...,
        outside: Optional[List[Span]] = ...,
        default: str = ...
        default: str = ...,
    ) -> None: ...
    @property
    def noun_chunks(self) -> Iterator[Span]: ...
@@ -178,16 +178,18 @@ class Doc:
        self,
        *,
        cs: bool,
        pref_lengths: Ints1d,
        suff_lengths: Ints1d,
        pref_search_chars: str,
        pref_lookup_chars: str,
        pref_search_char_length: int,
        pref_search_lengths: Ints1d,
        suff_search_chars: str,
        suff_lookup_chars: str,
        suff_search_char_length: int,
        suff_search_lengths: Ints1d,
        p_lengths: Ints1d,
        s_lengths: Ints1d,
        ps_1byte_ch: bytes,
        ps_2byte_ch: bytes,
        ps_3byte_ch: bytes,
        ps_4byte_ch: bytes,
        ps_lengths: Ints1d,
        ss_1byte_ch: bytes,
        ss_2byte_ch: bytes,
        ss_3byte_ch: bytes,
        ss_4byte_ch: bytes,
        ss_lengths: Ints1d,
    ) -> Ints2d: ...
    @staticmethod
    def _get_array_attrs() -> Tuple[Any]: ...
@@ -1736,18 +1736,20 @@ cdef class Doc:
        return output


    def get_character_combination_hashes(self,
    def np.ndarray get_character_combination_hashes(self,
        *,
        const bint cs,
        np.ndarray p_lengths,
        np.ndarray s_lengths,
        const char* ps_search,
        const char* ps_lookup,
        const int ps_l,
        const unsigned char[:] ps_1byte_ch,
        const unsigned char[:] ps_2byte_ch,
        const unsigned char[:] ps_3byte_ch,
        const unsigned char[:] ps_4byte_ch,
        np.ndarray ps_lengths,
        const char* ss_search,
        const char* ss_lookup,
        const int ss_l,
        const unsigned char[:] ss_1byte_ch,
        const unsigned char[:] ss_2byte_ch,
        const unsigned char[:] ss_3byte_ch,
        const unsigned char[:] ss_4byte_ch,
        np.ndarray ss_lengths,
    ):
        """
@@ -1766,44 +1768,26 @@ cdef class Doc:
            the prefixes hashed for "spaCy" would be "sp" and "spa".
        s_lengths: an Ints1d specifying the lengths of suffixes to be hashed. For example, if *s_lengths==[2, 3]* and
            *cs == True*, the suffixes hashed for "spaCy" would be "Cy" and "aCy".
        ps_search: a byte array containing characters to search for within each token, starting at the beginning.
        ps_lookup: a byte array containing characters that are added to the result string when a character at
            the corresponding position in *ps_search* is matched. Having separate search and lookup arrays enables
            case-insensitivity to be handled efficiently.
        ps_search_l: the number of characters in *ps_search* and hence also in *ps_lookup*.
        ps_<n>byte_ch: a byte array containing, in order, the n-byte-wide characters to search for within each token,
            starting at the beginning.
        ps_lengths: an Ints1d specifying the lengths of search results (from the beginning) to be hashed. For example, if
            *ps_lengths==[1, 2]*, *ps_search=="aC"* and *cs==False*, the searched strings hashed for
            "spaCy" would be "a" and "ac".
        ss_search: a byte array containing characters to search for within each token, starting at the end.
        ss_lookup: a byte array containing characters that are added to the result string when a character at
            the corresponding position in *ss_search* is matched. Having separate search and lookup arrays enables
            case-insensitivity to be handled efficiently.
        ss_l: the number of characters in *ss_search* and hence also in *ss_lookup*.
        ss_<n>byte_ch: a byte array containing, in order, the n-byte-wide characters to search for within each token,
            starting at the end.
        ss_lengths: an Ints1d specifying the lengths of search results (from the end) to be hashed. For example, if
            *ss_lengths==[1, 2]*, *ss_search=="aC"* and *cs==False*, the searched strings hashed for
            "spaCy" would be "c" and "ca".

        For a document with tokens ["spaCy", "and", "Prodigy"], the NumPy array returned by
        *get_character_combination_hashes(True, [2], [2, 4, 6], "", "", 0, [], "yC", "yC", 2, [1, 2])*
        would correspond to

        [[hash("sp"), hash("Cy"), hash("paCy"), hash(" spaCy"), hash("y"), hash("yC")],
         [hash("an"), hash("nd"), hash(" and"), hash("   and"), hash(" "), hash("  ")],
         [hash("Pr"), hash("gy"), hash("digy"), hash("rodigy"), hash("y"), hash("y ")]]
        """

        # Encode the document text
        cdef bytes encoded_text = self.text.encode("utf-32le")
        cdef char* intermediate_text = encoded_text
        cdef Py_UCS4* text_buf = <Py_UCS4*> intermediate_text

        # Define the result array and work out what is used for what in axis 1
        cdef int num_toks = len(self)
        cdef int p_h_num = p_lengths.shape[0]
        cdef int s_h_num = s_lengths.shape[0], s_h_end = p_h_num + s_h_num
        cdef int ps_h_num = ps_lengths.shape[0], ps_h_end = s_h_end + ps_h_num
        cdef int ss_h_num = ss_lengths.shape[0], ss_h_end = ps_h_end + ss_h_num
        cdef np.ndarray[np.int64_t, ndim=2] hashes = numpy.empty((num_toks, ss_h_end), dtype="int64")
        cdef np.ndarray[np.int64_t, ndim=2] hashes
        hashes = numpy.empty((num_toks, ss_h_end), dtype="int64")

        # Determine the maximum possible lengths of the affixes to work out how big the buffers need to be
        cdef int p_max_l = max(p_lengths) if p_h_num > 0 else 0
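A pure-Python sketch of the affix hashing the docstring describes, with Python's built-in hash standing in for the murmurhash-based hash32 and the padding behaviour inferred from the example above (both are illustrative assumptions):

    def affix_hashes(token, p_lengths, s_lengths):
        # Prefixes are padded at the end and suffixes at the front with zero
        # characters, so every requested length yields exactly one hash.
        prefixes = [token[:n].ljust(n, "\0") for n in p_lengths]
        suffixes = [token[-n:].rjust(n, "\0") for n in s_lengths]
        return [hash(s) for s in prefixes + suffixes]

    # With the docstring's example: the length-6 suffix of "spaCy" is
    # "\0spaCy", written there as " spaCy".
    assert len(affix_hashes("spaCy", [2], [2, 4, 6])) == 4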
@@ -1811,15 +1795,13 @@ cdef class Doc:
        cdef int ps_max_l = max(ps_lengths) if ps_h_num > 0 else 0
        cdef int ss_max_l = max(ss_lengths) if ss_h_num > 0 else 0

        # Define / allocate buffer (pr/sr: result buffers)
        cdef int aff_buf_l = p_max_l + s_max_l
        cdef Py_UCS4* aff_buf = <Py_UCS4*> self.mem.alloc(aff_buf_l, sizeof(Py_UCS4))
        cdef Py_UCS4* ps_s_buf = <Py_UCS4*> ps_search
        cdef Py_UCS4* ps_l_buf = <Py_UCS4*> ps_lookup
        cdef Py_UCS4* ps_r_buf = <Py_UCS4*> self.mem.alloc(ps_max_l, sizeof(Py_UCS4))
        cdef Py_UCS4* ss_s_buf = <Py_UCS4*> ss_search
        cdef Py_UCS4* ss_l_buf = <Py_UCS4*> ss_lookup
        cdef Py_UCS4* ss_r_buf = <Py_UCS4*> self.mem.alloc(ss_max_l, sizeof(Py_UCS4))
        # Define / allocate buffers
        cdef int aff_l = p_max_l + s_max_l
        cdef char* aff_len_buf = self.mem.alloc(aff_l, 1)
        cdef char* ps_res_buf = self.mem.alloc(ps_max_l, 4)
        cdef char* ps_len_buf = self.mem.alloc(ps_max_l, 1)
        cdef char* ss_res_buf = self.mem.alloc(ss_max_l, 4)
        cdef char* ss_len_buf = self.mem.alloc(ss_max_l, 1)

        # Define memory views on length arrays
        cdef int[:] p_lengths_v = p_lengths
@@ -1829,38 +1811,51 @@ cdef class Doc:

        # Define working variables
        cdef TokenC tok_c
        cdef int tok_i, tok_idx, tok_len, aff_len
        cdef int tok_i, offset
        cdef uint64_t hash_val
        cdef attr_t num_tok_attr
        cdef const unsigned char[:] tok_str

        for tok_i in range(num_toks):
            tok_c = self.c[tok_i]
            tok_idx = tok_c.idx
            tok_len = tok_c.lex.length

            if aff_buf_l > 0:
                _set_affixes(text_buf, tok_idx, tok_len, aff_buf, p_max_l, s_max_l, not cs)
            num_tok_attr = tok_c.lex.orth if cs else tok_c.lex.lower
            tok_str = self.vocab.strings.utf8_view(num_tok_attr)

            if aff_l > 0:
                _set_affix_lengths(tok_str, aff_len_buf, p_max_l, s_max_l)
            for hash_idx in range(p_h_num):
                hashes[tok_i, hash_idx] = hash32(aff_buf, p_lengths_v[hash_idx] * sizeof(Py_UCS4), 0)
                offset = aff_len_buf[p_lengths_v[hash_idx]]
                if offset > 0:
                    hash_val = hash32(<void*> &tok_str[0], offset, 0)
                    hashes[tok_i, hash_idx] = hash_val

            for hash_idx in range(p_h_num, s_h_end):
                aff_len = s_lengths_v[hash_idx - p_h_num]
                hashes[tok_i, hash_idx] = hash32(aff_buf + aff_buf_l - aff_len, aff_len * sizeof(Py_UCS4), 0)
                offset = s_lengths_v[hash_idx - p_h_num]
                if offset > 0:
                    hash_val = hash32(<void*> &tok_str[len(tok_str) - offset], offset, 0)
                    hashes[tok_i, hash_idx] = hash_val

            if ps_h_num > 0:
                _search_for_chars(text_buf, tok_idx, tok_len, ps_s_buf, ps_l_buf, ps_l, ps_r_buf, ps_max_l, False)
            if (
                ps_h_num > 0 and
                _search_for_chars(tok_str, ps_1byte_ch, ps_2byte_ch, ps_3byte_ch, ps_4byte_ch, ps_res_buf, ps_len_buf, False)
            ):
                for hash_idx in range(s_h_end, ps_h_end):
                    aff_len = ps_lengths_v[hash_idx - s_h_end]
                    hashes[tok_i, hash_idx] = hash32(ps_r_buf, aff_len * sizeof(Py_UCS4), 0)

            if ss_h_num > 0:
                _search_for_chars(text_buf, tok_idx, tok_len, ss_s_buf, ss_l_buf, ss_l, ss_r_buf, ss_max_l, True)
            if (
                ss_h_num > 0 and
                _search_for_chars(tok_str, ss_1byte_ch, ss_2byte_ch, ss_3byte_ch, ss_4byte_ch, ss_res_buf, ss_len_buf, True)
            ):
                for hash_idx in range(ps_h_end, ss_h_end):
                    aff_len = ss_lengths_v[hash_idx - ps_h_end]
                    hashes[tok_i, hash_idx] = hash32(ss_r_buf, aff_len * sizeof(Py_UCS4), 0)

        self.mem.free(aff_buf)
        self.mem.free(ps_r_buf)
        self.mem.free(ss_r_buf)
        self.mem.free(aff_len_buf)
        self.mem.free(ps_res_buf)
        self.mem.free(ps_len_buf)
        self.mem.free(ss_res_buf)
        self.mem.free(ss_len_buf)
        return hashes

    @staticmethod
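The axis-1 bookkeeping in this loop packs four groups of hashes into each row of the result; a small sketch of the offsets, reusing the hunk's own names:

    # Row layout of the (num_toks, ss_h_end) result array:
    #   [0, p_h_num)          prefix hashes
    #   [p_h_num, s_h_end)    suffix hashes
    #   [s_h_end, ps_h_end)   prefix-search hashes
    #   [ps_h_end, ss_h_end)  suffix-search hashes
    p_h_num = len(p_lengths)
    s_h_end = p_h_num + len(s_lengths)
    ps_h_end = s_h_end + len(ps_lengths)
    ss_h_end = ps_h_end + len(ss_lengths)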
@@ -2044,34 +2039,13 @@ cdef int [:,:] _get_lca_matrix(Doc doc, int start, int end):
    return lca_matrix


cdef void _copy_chars(
    Py_UCS4* target,
    const Py_UCS4* source,
    const int length,
    const bint to_lower
):
    """Copies *length* Py_UCS4 characters from *source* to *target*. If *to_lower==True*, converts
    any upper-case characters to lower case within the target buffer.
    """
    cdef int idx

    memcpy(target, source, length * sizeof(Py_UCS4))
    if to_lower:
        for idx in range(length):
            if Py_UNICODE_ISUPPER(target[idx]):
                target[idx] = Py_UNICODE_TOLOWER(target[idx])


cdef void _set_affixes(
    const Py_UCS4* text_buf,
    const int tok_idx,
    const int tok_len,
    Py_UCS4* aff_buf,
cdef void _set_affix_lengths(
    const unsigned char[:] text_buf,
    char* aff_len_buf,
    const int pref_len,
    const int suff_len,
    const bint to_lower
):
    """ Populate a buffer of length pref+suff with the first pref and the last suff characters of a word within a string.
) nogil:
    """ TODO: Populate a buffer of length pref+suff with the first pref and the last suff characters of a word within a string.
    If the word is shorter than pref and/or suff, the empty character positions in the middle are filled with zeros.

    text_buf: a pointer to a UTF-32LE representation of the containing string.
@@ -2082,41 +2056,41 @@ cdef void _set_affixes(
    suff_len: the length of the suffix.
    to_lower: if *True*, any upper case characters in either affix are converted to lower case.
    """
    cdef int aff_buf_idx = 0, aff_buf_len = pref_len + suff_len, in_word_idx, filled_pref_len
    cdef int text_buf_idx = 0, aff_len_buf_idx = 0, text_buf_len = len(text_buf)

    if pref_len > 0:
        filled_pref_len = pref_len if pref_len < tok_len else tok_len
        _copy_chars(aff_buf, text_buf + tok_idx, filled_pref_len, to_lower)
        aff_buf_idx = filled_pref_len
    while aff_len_buf_idx < pref_len:
        if (text_buf[text_buf_idx] >> 6) ^ 2 != 0:  # not a continuation byte
            aff_len_buf[aff_len_buf_idx] = text_buf_idx + 1
            aff_len_buf_idx += 1
        text_buf_idx += 1
        if text_buf_idx == len(text_buf):
            break

    if tok_len < pref_len:
        memset(aff_buf + aff_buf_idx, 0, sizeof(Py_UCS4) * (pref_len - tok_len))
        aff_buf_idx = aff_buf_len - suff_len
    if aff_len_buf_idx < pref_len:
        memset(aff_len_buf + aff_len_buf_idx, 0, pref_len - aff_len_buf_idx)
        aff_len_buf_idx = pref_len

    if tok_len < suff_len:
        memset(aff_buf + aff_buf_idx, 0, sizeof(Py_UCS4) * (suff_len - tok_len))
        aff_buf_idx = aff_buf_len - tok_len
    text_buf_idx = 1
    while aff_len_buf_idx < pref_len + suff_len:
        if (text_buf[text_buf_len - text_buf_idx] >> 6) ^ 2 != 0:  # not a continuation byte
            aff_len_buf[aff_len_buf_idx] = text_buf_len - text_buf_idx
            aff_len_buf_idx += 1
        text_buf_idx += 1
        if text_buf_idx > text_buf_len:
            break

    if suff_len > 0:
        # in_word_idx: the index within the token where the suffix starts
        in_word_idx = aff_buf_idx + tok_len - aff_buf_len
        if in_word_idx < pref_len:
            memcpy(aff_buf + aff_buf_idx, aff_buf + in_word_idx, sizeof(Py_UCS4) * (filled_pref_len - in_word_idx))
            aff_buf_idx += filled_pref_len - in_word_idx
            in_word_idx = aff_buf_idx + tok_len - aff_buf_len
        if aff_buf_idx < aff_buf_len:
            _copy_chars(aff_buf + aff_buf_idx, text_buf + tok_idx + in_word_idx, aff_buf_len - aff_buf_idx, to_lower)
    if aff_len_buf_idx < pref_len + suff_len:
        memset(aff_len_buf + aff_len_buf_idx, 0, pref_len + suff_len - aff_len_buf_idx)


cdef void _search_for_chars(
    const Py_UCS4* text_buf,
    const int tok_idx,
    const int tok_len,
    Py_UCS4* search_buf,
    Py_UCS4* lookup_buf,
    const int search_buf_len,
    Py_UCS4* result_buf,
    const int result_buf_len,
cdef bint _search_for_chars(
    const unsigned char[:] tok_str,
    const unsigned char[:] s_1byte_ch,
    const unsigned char[:] s_2byte_ch,
    const unsigned char[:] s_3byte_ch,
    const unsigned char[:] s_4byte_ch,
    unsigned char* res_buf,
    unsigned char* len_buf,
    bint suffs_not_prefs
) nogil:
    """ Search a word within a string for characters within *search_buf*, starting at the beginning or
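Both scans above rely on one UTF-8 property: every byte of a multi-byte character except the first has the bit pattern 10xxxxxx, so (b >> 6) ^ 2 != 0 holds exactly for bytes that start a character. A plain-Python version of the prefix scan's intent (the exact bookkeeping in this intermediate state differs slightly):

    def prefix_byte_lengths(tok: bytes, pref_len: int) -> list:
        # lengths[k] = number of bytes covering the first k+1 characters,
        # padded with zeros when the token has fewer characters.
        lengths = []
        for idx in range(1, len(tok) + 1):
            if idx == len(tok) or (tok[idx] >> 6) ^ 2 != 0:
                lengths.append(idx)
                if len(lengths) == pref_len:
                    break
        return lengths + [0] * (pref_len - len(lengths))

    assert prefix_byte_lengths("éabc".encode("utf-8"), 3) == [2, 3, 4]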
|
@ -2133,6 +2107,8 @@ cdef void _search_for_chars(
|
|||
result_buf: the buffer in which to place the results.
|
||||
result_buf_len: the length of *result_buf*.
|
||||
suffs_not_prefs: if *True*, searching starts from the end of the word; if *False*, from the beginning.
|
||||
|
||||
Returns *True* if at least one character from *search_buf* was found in the word, otherwise *False*.
|
||||
"""
|
||||
cdef int result_buf_idx = 0, text_string_idx = tok_idx + (tok_len - 1) if suffs_not_prefs else tok_idx
|
||||
cdef int search_buf_idx
|
||||
|
@@ -2159,6 +2135,8 @@ cdef void _search_for_chars(
    if result_buf_idx < result_buf_len:
        memset(result_buf + result_buf_idx, 0, (result_buf_len - result_buf_idx) * sizeof(Py_UCS4))

    return result_buf_idx > 0


def pickle_doc(doc):
    bytes_data = doc.to_bytes(exclude=["vocab", "user_data", "user_hooks"])
@@ -1737,69 +1737,42 @@ def all_equal(iterable):
    return next(g, True) and not next(g, False)


def get_arrays_for_search_chars(
def get_search_char_byte_arrays(
    search_chars: str, case_sensitive: bool
) -> Tuple[bytes, bytes]:
) -> Tuple[bytes, bytes, bytes, bytes]:
    """
    This function supports the rich feature extractor. It returns search byte arrays with
    4-byte character width that are used for comparison when searching document texts
    for search characters. The encoding is little-endian regardless of architecture, as
    this is what is expected by the murmurhash library used downstream.

    Alongside the "search array" against which words from document texts are compared
    is the "lookup array". When a character from the search array is matched,
    the character at the corresponding position in the lookup array is added to the
    sequence that then goes on to be hashed. This enables case-sensitivity
    to be handled without converting the case of the words being searched: if
    *case_sensitive==False*, the lower- or uppercase counterparts of any characters that
    have case are added to the search array, and both the original character and its
    other-cased counterpart map to the lower-case version in the lookup array.
    This function supports the rich feature extractor. It splits the UTF-8 representation
    of *search_chars* into separate byte arrays containing 1-, 2-, 3-, and 4-byte
    characters respectively. Any duplicates in *search_chars* are removed, and *search_chars*
    is converted to lower case if *case_sensitive==False*.
    """

    def encode(ch: str) -> bytes:
        """
        ch: a single character
        """
        return ch.encode("UTF-32LE")

    def add_to_arrays(
        search: List[bytes], lookup: List[bytes], ch: str
    ) -> None:
        """Add the byte representations of *ch* to the two byte array lists.
        """
        this_char_bytes = encode(ch)
        if not case_sensitive and ch.islower():
            if this_char_bytes not in search:
                search.append(this_char_bytes)
                lookup.append(this_char_bytes)
            upper_char_bytes = encode(ch.upper())
            if upper_char_bytes not in search:
                search.append(upper_char_bytes)
                lookup.append(this_char_bytes)
        elif not case_sensitive and ch.isupper():
            lower_char_bytes = encode(ch.lower())
            if this_char_bytes not in search:
                search.append(this_char_bytes)
                lookup.append(lower_char_bytes)
            if lower_char_bytes not in search:
                search.append(lower_char_bytes)
                lookup.append(lower_char_bytes)
        elif this_char_bytes not in search:
            search.append(this_char_bytes)
            lookup.append(this_char_bytes)

    def get_ordered_raw_bytes(
        search: List[bytes], lookup: List[bytes]
    ) -> Tuple[bytes, bytes]:
        """Flatten the two lists, ordering both by the entries in *search*.
        """
        num_search = [list(entry) for entry in search]
        search = [entry for _, entry in sorted(zip(num_search, search))]
        lookup = [entry for _, entry in sorted(zip(num_search, lookup))]
        return b"".join(search), b"".join(lookup)

    search: List[bytes] = []
    lookup: List[bytes] = []
    for ch in search_chars:
        add_to_arrays(search, lookup, ch)
    return get_ordered_raw_bytes(search, lookup)
    sc1 = bytearray()
    sc2 = bytearray()
    sc3 = bytearray()
    sc4 = bytearray()
    if not case_sensitive:
        search_chars = search_chars.lower()
    ordered_search_chars = "".join(sorted(set(search_chars)))
    encoded_search_char_bytes = ordered_search_chars.encode("UTF-8")
    working_start = 0
    for idx in range(len(encoded_search_char_bytes) + 1):
        if idx == 0:
            continue
        if (
            idx == len(encoded_search_char_bytes)
            or encoded_search_char_bytes[idx] & 0xC0 != 0x80  # not a continuation byte
        ):
            char_length = idx - working_start
            if char_length == 1:
                sc1.extend(encoded_search_char_bytes[working_start:idx])
            elif char_length == 2:
                sc2.extend(encoded_search_char_bytes[working_start:idx])
            elif char_length == 3:
                sc3.extend(encoded_search_char_bytes[working_start:idx])
            elif char_length == 4:
                sc4.extend(encoded_search_char_bytes[working_start:idx])
            else:
                raise RuntimeError(Errors.E1049)
            working_start = idx
    return bytes(sc1), bytes(sc2), bytes(sc3), bytes(sc4)
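A quick usage sketch of the new helper, with expectations taken from the tests earlier in this commit:

    from spacy.util import get_search_char_byte_arrays

    sc1, sc2, sc3, sc4 = get_search_char_byte_arrays("zzaaEP", False)
    # Duplicates removed, lower-cased, sorted, then bucketed by UTF-8 width:
    assert sc1 == b"aepz" and sc2 == sc3 == sc4 == b""

    sc1, sc2, sc3, sc4 = get_search_char_byte_arrays("𐌞Éabé—B𐌞", True)
    assert sc1 == b"Bab"                # 1-byte (ASCII) characters
    assert sc2 == "Éé".encode("utf-8")  # 2-byte characters
    assert sc3 == "—".encode("utf-8")   # 3-byte characters
    assert sc4 == "𐌞".encode("utf-8")   # 4-byte characters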