Changes after review discussion — intermediate state

richardpaulhudson 2022-10-27 18:03:25 +02:00
parent 7d8258bec8
commit a1b8697aab
10 changed files with 294 additions and 308 deletions

View File: spacy/errors.py

@@ -214,6 +214,8 @@ class Warnings(metaclass=ErrorsWithCodes):
             "is a Cython extension type.")
     W123 = ("Argument {arg} with value {arg_value} is used instead of {config_value} as specified in the config. Be "
             "aware that this might affect other components in your pipeline.")
+    W124 = ("Using the features PREFIX and/or SUFFIX in a RichFeatureExtractor configuration may lead to the same "
+            "information being fed forward twice if prefixes and suffixes of corresponding lengths are specified.")

 class Errors(metaclass=ErrorsWithCodes):
@@ -953,6 +955,8 @@ class Errors(metaclass=ErrorsWithCodes):
     E1046 = ("{cls_name} is an abstract class and cannot be instantiated. If you are looking for spaCy's default "
              "knowledge base, use `InMemoryLookupKB`.")
     E1047 = ("Invalid rich group config '{label}'.")
+    E1048 = ("Length > 63 in rich group config '{label}'.")
+    E1049 = ("Error splitting UTF-8 byte string into separate characters.")

     # Deprecated model shortcuts, only used in errors and warnings
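Both new codes follow the existing `ErrorsWithCodes` pattern: attribute access yields the message prefixed with its code, and `.format()` fills the placeholders. A minimal sketch of how a caller would raise E1048 (the 63-character cap plausibly keeps the byte length of any UTF-8 affix within a single length byte, since 63 × 4 = 252 < 256, though the commit does not state this rationale):

    from spacy.errors import Errors

    lengths = [2, 3, 64]
    if max(lengths) > 63:
        # Renders as "[E1048] Length > 63 in rich group config 'suff_search_lengths'."
        raise ValueError(Errors.E1048.format(label="suff_search_lengths"))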

View File: spacy/ml/models/tok2vec.py

@@ -1,5 +1,6 @@
 from encodings import search_function
 from typing import Optional, List, Union, cast
+import warnings
 from spacy.ml.richfeatureextractor import RichFeatureExtractor
 from thinc.types import Floats2d, Ints2d, Ragged
 from thinc.api import chain, clone, concatenate, with_array, with_padded
@@ -8,7 +9,7 @@ from thinc.api import expand_window, residual, Maxout, Mish, PyTorchLSTM
 from ...tokens import Doc
 from ...util import registry
-from ...errors import Errors
+from ...errors import Errors, Warnings
 from ...ml import _character_embed
 from ..staticvectors import StaticVectors
 from ..featureextractor import FeatureExtractor
@@ -207,6 +208,8 @@ def _verify_rich_config_group(
         raise ValueError(Errors.E1047.format(label=label))
     elif search_chars is not None:
         raise ValueError(Errors.E1047.format(label=label))
+    if lengths is not None and max(lengths) > 63:
+        raise ValueError(Errors.E1048.format(label=label))


 @registry.architectures("spacy.RichMultiHashEmbed.v1")
@@ -246,13 +249,13 @@ def RichMultiHashEmbed(
     depending on the presence of some other letter before or after it, e.g. German
     plural nouns where the final two vowels are `ä-e` regularly correspond to
     singular lemmas where the `e` is no longer present and the `ä` has become `a`.
     For most languages used with spaCy, searching is likely to be useful starting
     at the end (`suff_*`), but the ability to search from the beginning (`pref_*`)
     is also offered for completeness. Search characters should consist of all
     characters that regularly alternate with other characters in the language in
     question or whose presence before or after characters that would otherwise
     alternate prevents the alternation from occurring, e.g. an `ä` in a German
     plural noun does not become `a` if it is the third or fourth vowel from the
     end of the word.

     width (int): The output width. Also used as the width of the embedding tables.
@@ -263,27 +266,27 @@ def RichMultiHashEmbed(
         same length as attrs.
     include_static_vectors (bool): Whether to also use static word vectors.
         Requires a vectors table to be loaded in the Doc objects' vocab.
     case_sensitive (bool): Whether lower-case and upper-case letters should be
         distinguished when generating the character combinations to use as features.
     pref_lengths (Optional[List[int]]): The lengths of prefixes to use as features
         for each word, e.g. for the word `spaCy`:
         `[1, 3]` would lead to `s` and `spa` being used as features.
     pref_rows (Optional[List[int]]): The number of rows for each of `pref_lengths`.
     suff_lengths (Optional[List[int]]): The lengths of suffixes to use as features
         for each word, e.g. for the word `spaCy`:
         `[1, 3]` would lead to `y` and `aCy` being used as features.
     suff_rows (Optional[List[int]]): The number of rows for each of `suff_lengths`.
     pref_search_chars (Optional[str]): A string containing characters to search for
         starting from the beginning of each word.
     pref_search_lengths (Optional[List[int]]): The lengths of search result strings
         to use as features, where the searches start from the beginning of each word.
     pref_search_rows (Optional[List[int]]): The number of rows for each of
         `pref_search_lengths`.
     suff_search_chars (Optional[str]): A string containing characters to search for
         starting from the end of each word.
     suff_search_lengths (Optional[List[int]]): The lengths of search result strings
         to use as features, where the searches start from the end of each word.
     suff_search_rows (Optional[List[int]]): The number of rows for each of
         `suff_search_lengths`.
     """
@@ -313,6 +316,9 @@ def RichMultiHashEmbed(
         case_sensitive,
     )

+    if "PREFIX" in attrs or "SUFFIX" in attrs:
+        warnings.warn(Warnings.W124)
+
     if pref_rows is not None:
         rows.extend(pref_rows)
     if suff_rows is not None:
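Together the two additions validate the architecture's inputs: affix lengths over 63 now raise E1048 in `_verify_rich_config_group`, and combining the classic PREFIX/SUFFIX attributes with explicit affix lengths emits W124. A hypothetical call that would trigger the warning (argument values are illustrative, not defaults):

    import warnings
    from spacy.ml.models.tok2vec import RichMultiHashEmbed

    with warnings.catch_warnings(record=True) as caught:
        warnings.simplefilter("always")
        RichMultiHashEmbed(
            width=96,
            attrs=["NORM", "PREFIX"],  # PREFIX overlaps with pref_lengths below
            rows=[5000, 2500],
            include_static_vectors=False,
            case_sensitive=False,
            pref_lengths=[1, 3],  # may duplicate what PREFIX already feeds forward
            pref_rows=[1000, 1000],
        )
    assert any("W124" in str(w.message) for w in caught)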

View File: spacy/ml/richfeatureextractor.py

@@ -1,5 +1,7 @@
 from typing import List, Optional, Callable, Tuple
-from ..util import get_arrays_for_search_chars
+from spacy.util import get_search_char_byte_arrays
+# from ..util import get_arrays_for_search_chars
 from thinc.types import Ints1d, Ints2d
 from thinc.api import Model, registry, get_current_ops
@@ -19,17 +21,23 @@ def RichFeatureExtractor(
 ) -> Model[List[Doc], List[Ints2d]]:
     ops = get_current_ops()
     if pref_search_chars is not None:
-        pref_search, pref_lookup = get_arrays_for_search_chars(
-            pref_search_chars, case_sensitive
-        )
+        (
+            ps_1byte_ch,
+            ps_2byte_ch,
+            ps_3byte_ch,
+            ps_4byte_ch,
+        ) = get_search_char_byte_arrays(pref_search_chars, case_sensitive)
     else:
-        pref_search, pref_lookup = bytes(), bytes()
+        ps_1byte_ch = ps_2byte_ch = ps_3byte_ch = ps_4byte_ch = bytes()
     if suff_search_chars is not None:
-        suff_search, suff_lookup = get_arrays_for_search_chars(
-            suff_search_chars, case_sensitive
-        )
+        (
+            ss_1byte_ch,
+            ss_2byte_ch,
+            ss_3byte_ch,
+            ss_4byte_ch,
+        ) = get_search_char_byte_arrays(suff_search_chars, case_sensitive)
     else:
-        suff_search, suff_lookup = bytes(), bytes()
+        ss_1byte_ch = ss_2byte_ch = ss_3byte_ch = ss_4byte_ch = bytes()
     return Model(
         "extract_character_combination_hashes",
         forward,
@@ -41,19 +49,17 @@ def RichFeatureExtractor(
             "suff_lengths": ops.asarray1i(suff_lengths)
             if suff_lengths is not None
             else ops.asarray1i([]),
-            "pref_search": pref_search,
-            "pref_lookup": pref_lookup,
-            "pref_search_char_len": len(pref_search) / 4
-            if pref_search_chars is not None
-            else 0,
+            "pref_search_1_byte": ps_1byte_ch,
+            "pref_search_2_bytes": ps_2byte_ch,
+            "pref_search_3_bytes": ps_3byte_ch,
+            "pref_search_4_bytes": ps_4byte_ch,
             "pref_search_lengths": ops.asarray1i(pref_search_lengths)
             if pref_search_lengths is not None
             else ops.asarray1i([]),
-            "suff_search": suff_search,
-            "suff_lookup": suff_lookup,
-            "suff_search_char_len": len(suff_search) / 4
-            if suff_search_chars is not None
-            else 0,
+            "suff_search_1_byte": ss_1byte_ch,
+            "suff_search_2_bytes": ss_2byte_ch,
+            "suff_search_3_bytes": ss_3byte_ch,
+            "suff_search_4_bytes": ss_4byte_ch,
             "suff_search_lengths": ops.asarray1i(suff_search_lengths)
             if suff_search_lengths is not None
             else ops.asarray1i([]),
@@ -68,13 +74,15 @@ def forward(
     case_sensitive: bool = model.attrs["case_sensitive"]
     pref_lengths: Ints1d = model.attrs["pref_lengths"]
     suff_lengths: Ints1d = model.attrs["suff_lengths"]
-    pref_search: bytes = model.attrs["pref_search"]
-    pref_lookup: bytes = model.attrs["pref_lookup"]
-    pref_search_char_len: int = model.attrs["pref_search_char_len"]
+    ps_1byte_ch: bytes = model.attrs["pref_search_1_byte"]
+    ps_2byte_ch: bytes = model.attrs["pref_search_2_bytes"]
+    ps_3byte_ch: bytes = model.attrs["pref_search_3_bytes"]
+    ps_4byte_ch: bytes = model.attrs["pref_search_4_bytes"]
     pref_search_lengths: Ints1d = model.attrs["pref_search_lengths"]
-    suff_search: bytes = model.attrs["suff_search"]
-    suff_lookup: bytes = model.attrs["suff_lookup"]
-    suff_search_char_len: int = model.attrs["suff_search_char_len"]
+    ss_1byte_ch: bytes = model.attrs["suff_search_1_byte"]
+    ss_2byte_ch: bytes = model.attrs["suff_search_2_bytes"]
+    ss_3byte_ch: bytes = model.attrs["suff_search_3_bytes"]
+    ss_4byte_ch: bytes = model.attrs["suff_search_4_bytes"]
     suff_search_lengths: Ints1d = model.attrs["suff_search_lengths"]
     features: List[Ints2d] = []
     for doc in docs:
@@ -82,13 +90,15 @@ def forward(
             cs=case_sensitive,
             p_lengths=pref_lengths,
             s_lengths=suff_lengths,
-            ps_search=pref_search,
-            ps_lookup=pref_lookup,
-            ps_l=pref_search_char_len,
+            ps_1byte_ch=ps_1byte_ch,
+            ps_2byte_ch=ps_2byte_ch,
+            ps_3byte_ch=ps_3byte_ch,
+            ps_4byte_ch=ps_4byte_ch,
             ps_lengths=pref_search_lengths,
-            ss_search=suff_search,
-            ss_lookup=suff_lookup,
-            ss_l=suff_search_char_len,
+            ss_1byte_ch=ss_1byte_ch,
+            ss_2byte_ch=ss_2byte_ch,
+            ss_3byte_ch=ss_3byte_ch,
+            ss_4byte_ch=ss_4byte_ch,
             ss_lengths=suff_search_lengths,
         )
         features.append(ops.asarray2i(hashes))
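The old two-array scheme (search plus lookup, fixed 4-byte characters) is thus replaced by four plain UTF-8 byte arrays, one per character width, stored under self-describing `attrs` keys and passed straight through to `Doc.get_character_combination_hashes`. A sketch of what the refactored factory stores, assuming the remaining arguments default to `None` and that `get_search_char_byte_arrays` behaves as pinned down by the tests further below:

    from spacy.ml.richfeatureextractor import RichFeatureExtractor

    model = RichFeatureExtractor(
        case_sensitive=False,
        suff_search_chars="eé",
        suff_search_lengths=[1, 2],
    )
    assert model.attrs["suff_search_1_byte"] == b"e"
    assert model.attrs["suff_search_2_bytes"] == "é".encode("utf-8")
    assert model.attrs["suff_search_3_bytes"] == b""
    assert model.attrs["suff_search_4_bytes"] == b""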

View File: spacy/strings.pxd

@@ -27,3 +27,4 @@ cdef class StringStore:
     cdef const Utf8Str* intern_unicode(self, str py_string)
     cdef const Utf8Str* _intern_utf8(self, char* utf8_string, int length, hash_t* precalculated_hash)
+    cdef const unsigned char[:] utf8_view(self, attr_t hash_val)

View File: spacy/strings.pyx

@@ -315,3 +315,25 @@ cdef class StringStore:
             self._map.set(key, value)
             self.keys.push_back(key)
         return value
+
+    cdef const unsigned char[:] utf8_view(self, attr_t hash_val):
+        if hash_val == 0:
+            return ""
+        elif hash_val < len(SYMBOLS_BY_INT):
+            return SYMBOLS_BY_INT[hash_val]
+        cdef Utf8Str* string = <Utf8Str*>self._map.get(hash_val)
+        cdef int i, length
+        if string.s[0] < sizeof(string.s) and string.s[0] != 0:
+            return string.s[1:string.s[0]+1]
+        elif string.p[0] < 255:
+            return string.p[1:string.p[0]+1]
+        else:
+            i = 0
+            length = 0
+            while string.p[i] == 255:
+                i += 1
+                length += 255
+            length += string.p[i]
+            i += 1
+            return string.p[i:length + i]
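`utf8_view` returns a view over the store's internal UTF-8 data instead of decoding it to a `str`, mirroring the length encoding that the existing `decode_Utf8Str` helper in this file uses: short strings keep their data inline in `s` with the length in `s[0]`, medium strings store a one-byte length at `p[0]`, and longer strings prefix the data with a run of 255 bytes plus a remainder byte. A pure-Python model of that long-string length decoding (a hypothetical helper, for illustration only):

    def decode_length(p: bytes) -> tuple:
        # Return (data_offset, byte_length) for the long-string layout.
        i, length = 0, 0
        while p[i] == 255:  # each leading 255 byte contributes 255 to the length
            i += 1
            length += 255
        length += p[i]  # the first non-255 byte holds the remainder
        return i + 1, length

    # A 600-byte string is prefixed 255, 255, 90 (255 + 255 + 90 == 600):
    assert decode_length(bytes([255, 255, 90]) + b"x" * 600) == (3, 600)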

View File: tests for get_search_char_byte_arrays

@@ -1,55 +1,57 @@
 import spacy
+import pytest


-def test_get_arrays_for_search_chars_width_2_not_case_sensitive():
-    (
-        search,
-        lookup,
-    ) = spacy.util.get_arrays_for_search_chars("bféwfw", False)
-    assert (
-        lookup
-        == b"b\x00\x00\x00f\x00\x00\x00w\x00\x00\x00b\x00\x00\x00f\x00\x00\x00w\x00\x00\x00\xe9\x00\x00\x00\xe9\x00\x00\x00"
-    )
-    assert (
-        search
-        == b"B\x00\x00\x00F\x00\x00\x00W\x00\x00\x00b\x00\x00\x00f\x00\x00\x00w\x00\x00\x00\xc9\x00\x00\x00\xe9\x00\x00\x00"
-    )
-
-
-def test_get_arrays_for_search_chars_width_2_case_sensitive():
-    (
-        search,
-        lookup,
-    ) = spacy.util.get_arrays_for_search_chars("bféwfw", True)
-    assert (
-        lookup == search == b"b\x00\x00\x00f\x00\x00\x00w\x00\x00\x00\xe9\x00\x00\x00"
-    )
-
-
-def test_get_arrays_for_search_chars_width_4_not_case_sensitive():
-    (
-        search,
-        lookup,
-    ) = spacy.util.get_arrays_for_search_chars("bfé𐌞wf𐌞wÉ", False)
-    assert (
-        search
-        == b"\x1e\x03\x01\x00B\x00\x00\x00F\x00\x00\x00W\x00\x00\x00b\x00\x00\x00f\x00\x00\x00w\x00\x00\x00\xc9\x00\x00\x00\xe9\x00\x00\x00"
-    )
-    assert (
-        lookup
-        == b"\x1e\x03\x01\x00b\x00\x00\x00f\x00\x00\x00w\x00\x00\x00b\x00\x00\x00f\x00\x00\x00w\x00\x00\x00\xe9\x00\x00\x00\xe9\x00\x00\x00"
-    )
-
-
-def test_get_arrays_for_search_chars_width_4_case_sensitive():
-    (
-        search,
-        lookup,
-    ) = spacy.util.get_arrays_for_search_chars("bfé𐌞wf𐌞wÉ", True)
-    assert search == lookup
-    assert (
-        lookup
-        == b"\x1e\x03\x01\x00b\x00\x00\x00f\x00\x00\x00w\x00\x00\x00\xc9\x00\x00\x00\xe9\x00\x00\x00"
-    )
+@pytest.mark.parametrize("case_sensitive", [True, False])
+def test_get_search_char_byte_arrays_1_width_only(case_sensitive):
+    sc1, sc2, sc3, sc4 = spacy.util.get_search_char_byte_arrays("zzaaEP", case_sensitive)
+    if case_sensitive:
+        assert sc1 == b"EPaz"
+    else:
+        assert sc1 == b"aepz"
+    assert sc2 == b""
+    assert sc3 == b""
+    assert sc4 == b""
+
+
+@pytest.mark.parametrize("case_sensitive", [True, False])
+def test_get_search_char_byte_arrays_4_width_only(case_sensitive):
+    sc1, sc2, sc3, sc4 = spacy.util.get_search_char_byte_arrays("𐌞", case_sensitive)
+    assert sc1 == b""
+    assert sc2 == b""
+    assert sc3 == b""
+    assert sc4 == "𐌞".encode("utf-8")
+
+
+@pytest.mark.parametrize("case_sensitive", [True, False])
+def test_get_search_char_byte_arrays_all_widths(case_sensitive):
+    sc1, sc2, sc3, sc4 = spacy.util.get_search_char_byte_arrays("𐌞Éabé—B𐌞", case_sensitive)
+    if case_sensitive:
+        assert sc1 == b"Bab"
+        assert sc2 == "Éé".encode("utf-8")
+    else:
+        assert sc1 == b"ab"
+        assert sc2 == "é".encode("utf-8")
+    assert sc3 == "—".encode("utf-8")
+    assert sc4 == "𐌞".encode("utf-8")
+
+
+@pytest.mark.parametrize("case_sensitive", [True, False])
+def test_turkish_i_with_dot(case_sensitive):
+    sc1, sc2, sc3, sc4 = spacy.util.get_search_char_byte_arrays("İ", case_sensitive)
+    if case_sensitive:
+        assert sc2 == "İ".encode("utf-8")
+        assert sc1 == sc3 == sc4 == b""
+    else:
+        assert sc1 == b"i"
+        assert sc2 == b"\xcc\x87"
+        assert sc3 == sc4 == b""
+
+
+@pytest.mark.parametrize("case_sensitive", [True, False])
+def test_turkish_i_with_dot_and_normal_i(case_sensitive):
+    sc1, sc2, sc3, sc4 = spacy.util.get_search_char_byte_arrays("İI", case_sensitive)
+    if case_sensitive:
+        assert sc1 == b"I"
+        assert sc2 == "İ".encode("utf-8")
+        assert sc3 == sc4 == b""
+    else:
+        assert sc1 == b"i"
+        assert sc2 == b"\xcc\x87"
+        assert sc3 == sc4 == b""
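The Turkish expectations follow directly from Unicode case mapping in Python: lower-casing U+0130 (İ, dotted capital I) yields two code points, a plain `i` followed by the combining dot above (U+0307), whose UTF-8 encoding is the two-byte sequence `\xcc\x87`:

    assert "İ".lower() == "i\u0307"  # U+0130 lower-cases to two code points
    assert "\u0307".encode("utf-8") == b"\xcc\x87"  # the combining dot is 2 bytes wide

This is why the case-insensitive branch expects entries in both the 1-byte and the 2-byte arrays from a single input character.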

View File: spacy/tokens/doc.pxd

@@ -38,34 +38,22 @@ cdef int token_by_end(const TokenC* tokens, int length, int end_char) except -2

 cdef int [:,:] _get_lca_matrix(Doc, int start, int end)

-cdef void _copy_chars(
-    Py_UCS4* target,
-    const Py_UCS4* source,
-    const int length,
-    const bint to_lower
-)
-
-cdef void _set_affixes(
-    const Py_UCS4* text_buf,
-    const int tok_idx,
-    const int tok_len,
-    Py_UCS4* aff_buf,
+cdef void _set_affix_lengths(
+    const unsigned char[:] text_buf,
+    char* aff_len_buf,
     const int pref_len,
     const int suff_len,
-    const bint to_lower
-)
+) nogil

-cdef void _search_for_chars(
-    const Py_UCS4* text_buf,
-    const int tok_idx,
-    const int tok_len,
-    Py_UCS4* search_buf,
-    Py_UCS4* lookup_buf,
-    const int search_buf_len,
-    Py_UCS4* result_buf,
-    const int result_buf_len,
+cdef bint _search_for_chars(
+    const unsigned char[:] tok_str,
+    const unsigned char[:] s_1byte_ch,
+    const unsigned char[:] s_2byte_ch,
+    const unsigned char[:] s_3byte_ch,
+    const unsigned char[:] s_4byte_ch,
+    unsigned char* res_buf,
+    unsigned char* len_buf,
     bint suffs_not_prefs
 ) nogil
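The declarations show the shape of the new approach: instead of comparing UCS-4 code points against a search buffer with a parallel lookup buffer, each token is kept as UTF-8 and each of its characters is compared only against the search array of the matching byte width, while `res_buf` collects matched characters and `len_buf` their cumulative byte lengths. A rough Python model of the matching loop implied by the signature (an interpretation, not the Cython implementation):

    def search_for_chars(tok: bytes, by_width: dict, max_results: int, suffs_not_prefs: bool) -> bytes:
        # Split the token's UTF-8 bytes into characters.
        chars, i = [], 0
        while i < len(tok):
            j = i + 1
            while j < len(tok) and tok[j] & 0xC0 == 0x80:  # skip continuation bytes
                j += 1
            chars.append(tok[i:j])
            i = j
        if suffs_not_prefs:  # search from the end of the word
            chars.reverse()
        matched = []
        for ch in chars:
            cands = by_width.get(len(ch), b"")
            # Compare only against search characters of the same byte width.
            if any(ch == cands[k:k + len(ch)] for k in range(0, len(cands), len(ch))):
                matched.append(ch)
                if len(matched) == max_results:
                    break
        return b"".join(matched)

    # Suffix search of "spacy" for the characters {a, c, y}:
    assert search_for_chars(b"spacy", {1: b"acy"}, 2, True) == b"yc"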

View File: spacy/tokens/doc.pyi

@@ -126,7 +126,7 @@ class Doc:
         blocked: Optional[List[Span]] = ...,
         missing: Optional[List[Span]] = ...,
         outside: Optional[List[Span]] = ...,
-        default: str = ...
+        default: str = ...,
     ) -> None: ...
     @property
     def noun_chunks(self) -> Iterator[Span]: ...
@@ -178,16 +178,18 @@ class Doc:
         self,
         *,
         cs: bool,
-        pref_lengths: Ints1d,
-        suff_lengths: Ints1d,
-        pref_search_chars: str,
-        pref_lookup_chars: str,
-        pref_search_char_length: int,
-        pref_search_lengths: Ints1d,
-        suff_search_chars: str,
-        suff_lookup_chars: str,
-        suff_search_char_length: int,
-        suff_search_lengths: Ints1d,
+        p_lengths: Ints1d,
+        s_lengths: Ints1d,
+        ps_1byte_ch: bytes,
+        ps_2byte_ch: bytes,
+        ps_3byte_ch: bytes,
+        ps_4byte_ch: bytes,
+        ps_lengths: Ints1d,
+        ss_1byte_ch: bytes,
+        ss_2byte_ch: bytes,
+        ss_3byte_ch: bytes,
+        ss_4byte_ch: bytes,
+        ss_lengths: Ints1d,
     ) -> Ints2d: ...
     @staticmethod
     def _get_array_attrs() -> Tuple[Any]: ...

View File: spacy/tokens/doc.pyx

@@ -1736,18 +1736,20 @@ cdef class Doc:
         return output

-    def get_character_combination_hashes(self,
+    def np.ndarray get_character_combination_hashes(self,
         *,
         const bint cs,
         np.ndarray p_lengths,
         np.ndarray s_lengths,
-        const char* ps_search,
-        const char* ps_lookup,
-        const int ps_l,
+        const unsigned char[:] ps_1byte_ch,
+        const unsigned char[:] ps_2byte_ch,
+        const unsigned char[:] ps_3byte_ch,
+        const unsigned char[:] ps_4byte_ch,
         np.ndarray ps_lengths,
-        const char* ss_search,
-        const char* ss_lookup,
-        const int ss_l,
+        const unsigned char[:] ss_1byte_ch,
+        const unsigned char[:] ss_2byte_ch,
+        const unsigned char[:] ss_3byte_ch,
+        const unsigned char[:] ss_4byte_ch,
         np.ndarray ss_lengths,
     ):
         """
@@ -1766,44 +1768,26 @@ cdef class Doc:
             the prefixes hashed for "spaCy" would be "sp" and "spa".
         s_lengths: an Ints1d specifying the lengths of suffixes to be hashed. For example, if *s_lengths==[2, 3]* and
             *cs == True*, the suffixes hashed for "spaCy" would be "Cy" and "aCy".
-        ps_search: a byte array containing characters to search for within each token, starting at the beginning.
-        ps_lookup: a byte array containing characters that are added to the result string when a character at
-            the corresponding position in *ps_search* is matched. Having separate search and lookup arrays enables
-            case-insensitivity to be handled efficiently.
-        ps_search_l: the number of characters in *ps_search* and hence also in *ps_lookup*
+        ps_<n>byte_ch: a byte array containing, in order, the n-byte-wide characters to search for within each
+            token, starting at the beginning.
         ps_lengths: an Ints1d specifying the lengths of search results (from the beginning) to be hashed. For example, if
             *ps_lengths==[1, 2]*, *ps_search=="aC"* and *cs==False*, the searched strings hashed for
             "spaCy" would be "a" and "ac".
-        ss_search: a byte array containing characters to search for within each token, starting at the end.
-        ss_lookup: a byte array containing characters that are added to the result string when a character at
-            the corresponding position in *ss_search* is matched. Having separate search and lookup arrays enables
-            case-insensitivity to be handled efficiently.
-        ss_l: the number of characters in *ss_search* and hence also in *ss_lookup*
+        ss_<n>byte_ch: a byte array containing, in order, the n-byte-wide characters to search for within each
+            token, starting at the end.
         ss_lengths: an Ints1d specifying the lengths of search results (from the end) to be hashed. For example, if
             *ss_lengths==[1, 2]*, *ss_search=="aC"* and *cs==False*, the searched strings hashed for
             "spaCy" would be "c" and "ca".
-
-        For a document with tokens ["spaCy", "and", "Prodigy"], the NumPy array returned by
-        *get_character_combination_hashes(True, [2], [2, 4, 6], "", "", 0, [], "yC", "yC", 2, [1, 2])*
-        would correspond to
-
-        [[hash("sp"), hash("Cy"), hash("paCy"), hash(" spaCy"), hash("y"), hash("yC")],
-        [hash("an"), hash("nd"), hash(" and"), hash(" and"), hash(" "), hash(" ")],
-        [hash("Pr"), hash("gy"), hash("digy"), hash("rodigy"), hash("y"), hash("y ")]]
         """
-        # Encode the document text
-        cdef bytes encoded_text = self.text.encode("utf-32le")
-        cdef char* intermediate_text = encoded_text
-        cdef Py_UCS4* text_buf = <Py_UCS4*> intermediate_text

         # Define the result array and work out what is used for what in axis 1
         cdef int num_toks = len(self)
         cdef int p_h_num = p_lengths.shape[0]
         cdef int s_h_num = s_lengths.shape[0], s_h_end = p_h_num + s_h_num
         cdef int ps_h_num = ps_lengths.shape[0], ps_h_end = s_h_end + ps_h_num
         cdef int ss_h_num = ss_lengths.shape[0], ss_h_end = ps_h_end + ss_h_num
-        cdef np.ndarray[np.int64_t, ndim=2] hashes = numpy.empty((num_toks, ss_h_end), dtype="int64")
+        cdef np.ndarray[np.int64_t, ndim=2] hashes
+        hashes = numpy.empty((num_toks, ss_h_end), dtype="int64")

         # Determine the maximum possible lengths of the affixes to work out how big the buffers need to be
         cdef int p_max_l = max(p_lengths) if p_h_num > 0 else 0
@@ -1811,15 +1795,13 @@ cdef class Doc:
         cdef int ps_max_l = max(ps_lengths) if ps_h_num > 0 else 0
         cdef int ss_max_l = max(ss_lengths) if ss_h_num > 0 else 0

-        # Define / allocate buffer (pr/sr: result buffers)
-        cdef int aff_buf_l = p_max_l + s_max_l
-        cdef Py_UCS4* aff_buf = <Py_UCS4*> self.mem.alloc(aff_buf_l, sizeof(Py_UCS4))
-        cdef Py_UCS4* ps_s_buf = <Py_UCS4*> ps_search
-        cdef Py_UCS4* ps_l_buf = <Py_UCS4*> ps_lookup
-        cdef Py_UCS4* ps_r_buf = <Py_UCS4*> self.mem.alloc(ps_max_l, sizeof(Py_UCS4))
-        cdef Py_UCS4* ss_s_buf = <Py_UCS4*> ss_search
-        cdef Py_UCS4* ss_l_buf = <Py_UCS4*> ss_lookup
-        cdef Py_UCS4* ss_r_buf = <Py_UCS4*> self.mem.alloc(ss_max_l, sizeof(Py_UCS4))
+        # Define / allocate buffers
+        cdef int aff_l = p_max_l + s_max_l
+        cdef char* aff_len_buf = self.mem.alloc(aff_l, 1)
+        cdef char* ps_res_buf = self.mem.alloc(ps_max_l, 4)
+        cdef char* ps_len_buf = self.mem.alloc(ps_max_l, 1)
+        cdef char* ss_res_buf = self.mem.alloc(ss_max_l, 4)
+        cdef char* ss_len_buf = self.mem.alloc(ss_max_l, 1)

         # Define memory views on length arrays
         cdef int[:] p_lengths_v = p_lengths
@@ -1829,38 +1811,51 @@ cdef class Doc:
         # Define working variables
         cdef TokenC tok_c
-        cdef int tok_i, tok_idx, tok_len, aff_len
+        cdef int tok_i, offset
+        cdef uint64_t hash_val
+        cdef attr_t num_tok_attr
+        cdef const unsigned char[:] tok_str

         for tok_i in range(num_toks):
             tok_c = self.c[tok_i]
-            tok_idx = tok_c.idx
-            tok_len = tok_c.lex.length
+            num_tok_attr = tok_c.lex.orth if cs else tok_c.lex.lower
+            tok_str = self.vocab.strings.utf8_view(num_tok_attr)

-            if aff_buf_l > 0:
-                _set_affixes(text_buf, tok_idx, tok_len, aff_buf, p_max_l, s_max_l, not cs)
+            if aff_l > 0:
+                _set_affix_lengths(tok_str, aff_len_buf, p_max_l, s_max_l)

             for hash_idx in range(p_h_num):
-                hashes[tok_i, hash_idx] = hash32(aff_buf, p_lengths_v[hash_idx] * sizeof(Py_UCS4), 0)
+                offset = aff_len_buf[p_lengths_v[hash_idx]]
+                if offset > 0:
+                    hash_val = hash32(<void*> &tok_str[0], offset, 0)
+                    hashes[tok_i, hash_idx] = hash_val

             for hash_idx in range(p_h_num, s_h_end):
-                aff_len = s_lengths_v[hash_idx - p_h_num]
-                hashes[tok_i, hash_idx] = hash32(aff_buf + aff_buf_l - aff_len, aff_len * sizeof(Py_UCS4), 0)
+                offset = s_lengths_v[hash_idx - p_h_num]
+                if offset > 0:
+                    hash_val = hash32(<void*> &tok_str[len(tok_str) - offset], offset, 0)
+                    hashes[tok_i, hash_idx] = hash_val

-            if ps_h_num > 0:
-                _search_for_chars(text_buf, tok_idx, tok_len, ps_s_buf, ps_l_buf, ps_l, ps_r_buf, ps_max_l, False)
+            if (
+                ps_h_num > 0 and
+                _search_for_chars(tok_str, ps_1byte_ch, ps_2byte_ch, ps_3byte_ch, ps_4byte_ch, ps_res_buf, ps_len_buf, False)
+            ):
                 for hash_idx in range(s_h_end, ps_h_end):
                     aff_len = ps_lengths_v[hash_idx - s_h_end]
                     hashes[tok_i, hash_idx] = hash32(ps_r_buf, aff_len * sizeof(Py_UCS4), 0)

-            if ss_h_num > 0:
-                _search_for_chars(text_buf, tok_idx, tok_len, ss_s_buf, ss_l_buf, ss_l, ss_r_buf, ss_max_l, True)
+            if (
+                ss_h_num > 0 and
+                _search_for_chars(tok_str, ss_1byte_ch, ss_2byte_ch, ss_3byte_ch, ss_4byte_ch, ss_res_buf, ss_len_buf, True)
+            ):
                 for hash_idx in range(ps_h_end, ss_h_end):
                     aff_len = ss_lengths_v[hash_idx - ps_h_end]
                     hashes[tok_i, hash_idx] = hash32(ss_r_buf, aff_len * sizeof(Py_UCS4), 0)

-        self.mem.free(aff_buf)
-        self.mem.free(ps_r_buf)
-        self.mem.free(ss_r_buf)
+        self.mem.free(aff_len_buf)
+        self.mem.free(ps_res_buf)
+        self.mem.free(ps_len_buf)
+        self.mem.free(ss_res_buf)
+        self.mem.free(ss_len_buf)
         return hashes

     @staticmethod
@@ -2044,34 +2039,13 @@ cdef int [:,:] _get_lca_matrix(Doc doc, int start, int end):
     return lca_matrix

-cdef void _copy_chars(
-    Py_UCS4* target,
-    const Py_UCS4* source,
-    const int length,
-    const bint to_lower
-):
-    """Copies *length* Py_UCS4 characters from *source* to *target*. If *to_lower==True*, converts
-    any upper-case characters to lower case within the target buffer.
-    """
-    cdef int idx
-    memcpy(target, source, length * sizeof(Py_UCS4))
-    if to_lower:
-        for idx in range(length):
-            if Py_UNICODE_ISUPPER(target[idx]):
-                target[idx] = Py_UNICODE_TOLOWER(target[idx])
-
-
-cdef void _set_affixes(
-    const Py_UCS4* text_buf,
-    const int tok_idx,
-    const int tok_len,
-    Py_UCS4* aff_buf,
+cdef void _set_affix_lengths(
+    const unsigned char[:] text_buf,
+    char* aff_len_buf,
     const int pref_len,
     const int suff_len,
-    const bint to_lower
-):
-    """ Populate a buffer of length pref+suff with the first pref and the last suff characters of a word within a string.
+) nogil:
+    """ TODO: Populate a buffer of length pref+suff with the first pref and the last suff characters of a word within a string.
     If the word is shorter than pref and/or suff, the empty character positions in the middle are filled with zeros.

     text_buf: a pointer to a UTF-32LE representation of the containing string.
@@ -2082,41 +2056,41 @@ cdef void _set_affixes(
     suff_len: the length of the suffix.
     to_lower: if *True*, any upper case characters in either affix are converted to lower case.
     """
-    cdef int aff_buf_idx = 0, aff_buf_len = pref_len + suff_len, in_word_idx, filled_pref_len
-    if pref_len > 0:
-        filled_pref_len = pref_len if pref_len < tok_len else tok_len
-        _copy_chars(aff_buf, text_buf + tok_idx, filled_pref_len, to_lower)
-        aff_buf_idx = filled_pref_len
-
-    if tok_len < pref_len:
-        memset(aff_buf + aff_buf_idx, 0, sizeof(Py_UCS4) * (pref_len - tok_len))
-        aff_buf_idx = aff_buf_len - suff_len
-
-    if tok_len < suff_len:
-        memset(aff_buf + aff_buf_idx, 0, sizeof(Py_UCS4) * (suff_len - tok_len))
-        aff_buf_idx = aff_buf_len - tok_len
-
-    if suff_len > 0:
-        # in_word_idx: the index within the token where the suffix starts
-        in_word_idx = aff_buf_idx + tok_len - aff_buf_len
-        if in_word_idx < pref_len:
-            memcpy(aff_buf + aff_buf_idx, aff_buf + in_word_idx, sizeof(Py_UCS4) * (filled_pref_len - in_word_idx))
-            aff_buf_idx += filled_pref_len - in_word_idx
-            in_word_idx = aff_buf_idx + tok_len - aff_buf_len
-        if aff_buf_idx < aff_buf_len:
-            _copy_chars(aff_buf + aff_buf_idx, text_buf + tok_idx + in_word_idx, aff_buf_len - aff_buf_idx, to_lower)
+    cdef int text_buf_idx = 0, aff_len_buf_idx = 0, text_buf_len = len(text_buf)
+
+    while aff_len_buf_idx < pref_len:
+        if (text_buf[text_buf_idx] >> 6) ^ 2 != 0:  # not a continuation byte
+            aff_len_buf[aff_len_buf_idx] = text_buf_idx + 1
+            aff_len_buf_idx += 1
+        text_buf_idx += 1
+        if text_buf_idx == len(text_buf):
+            break
+
+    if aff_len_buf_idx < pref_len:
+        memset(aff_len_buf + aff_len_buf_idx, 0, pref_len - aff_len_buf_idx)
+        aff_len_buf_idx = pref_len
+
+    text_buf_idx = 1
+    while aff_len_buf_idx < pref_len + suff_len:
+        if (text_buf[text_buf_len - text_buf_idx] >> 6) ^ 2 != 0:  # not a continuation byte
+            aff_len_buf[aff_len_buf_idx] = text_buf_len - text_buf_idx
+            aff_len_buf_idx += 1
+        text_buf_idx += 1
+        if text_buf_idx > text_buf_len:
+            break
+
+    if aff_len_buf_idx < pref_len + suff_len:
+        memset(aff_len_buf + aff_len_buf_idx, 0, pref_len + suff_len - aff_len_buf_idx)
-cdef void _search_for_chars(
-    const Py_UCS4* text_buf,
-    const int tok_idx,
-    const int tok_len,
-    Py_UCS4* search_buf,
-    Py_UCS4* lookup_buf,
-    const int search_buf_len,
-    Py_UCS4* result_buf,
-    const int result_buf_len,
+cdef bint _search_for_chars(
+    const unsigned char[:] tok_str,
+    const unsigned char[:] s_1byte_ch,
+    const unsigned char[:] s_2byte_ch,
+    const unsigned char[:] s_3byte_ch,
+    const unsigned char[:] s_4byte_ch,
+    unsigned char* res_buf,
+    unsigned char* len_buf,
     bint suffs_not_prefs
 ) nogil:
     """ Search a word within a string for characters within *search_buf*, starting at the beginning or
@@ -2133,6 +2107,8 @@ cdef void _search_for_chars(
     result_buf: the buffer in which to place the results.
     result_buf_len: the length of *result_buf*.
     suffs_not_prefs: if *True*, searching starts from the end of the word; if *False*, from the beginning.
+
+    Returns *True* if at least one character from *search_buf* was found in the word, otherwise *False*.
     """
     cdef int result_buf_idx = 0, text_string_idx = tok_idx + (tok_len - 1) if suffs_not_prefs else tok_idx
     cdef int search_buf_idx
@@ -2158,6 +2134,8 @@ cdef void _search_for_chars(
     # fill in any unused characters in the result buffer with zeros
     if result_buf_idx < result_buf_len:
         memset(result_buf + result_buf_idx, 0, (result_buf_len - result_buf_idx) * sizeof(Py_UCS4))
+
+    return result_buf_idx > 0


 def pickle_doc(doc):
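The column layout of the returned array is unchanged by the refactor: prefix-hash columns come first, then suffix, then prefix-search, then suffix-search columns. The bookkeeping above, restated in plain Python for one illustrative configuration:

    p_lengths, s_lengths = [2], [2, 4]
    ps_lengths, ss_lengths = [], [1, 2]
    p_h_num = len(p_lengths)
    s_h_end = p_h_num + len(s_lengths)
    ps_h_end = s_h_end + len(ps_lengths)
    ss_h_end = ps_h_end + len(ss_lengths)
    # hashes has shape (num_tokens, ss_h_end); here the five columns are
    # [pref(2) | suff(2), suff(4) | (no prefix searches) | ss(1), ss(2)]
    assert (p_h_num, s_h_end, ps_h_end, ss_h_end) == (1, 3, 3, 5)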

View File: spacy/util.py

@@ -1737,69 +1737,42 @@ def all_equal(iterable):
     return next(g, True) and not next(g, False)

-def get_arrays_for_search_chars(
+def get_search_char_byte_arrays(
     search_chars: str, case_sensitive: bool
-) -> Tuple[bytes, bytes]:
+) -> Tuple[bytes, bytes, bytes, bytes]:
     """
-    This function supports the rich feature extractor. It returns search byte arrays with
-    4-byte character width that are used for comparison when searching document texts
-    for search characters. The encoding is little-endian regardless of architecture, as
-    this is what is expected by the murmurhash library used downstream.
-
-    Alongside the "search array" against which words from document texts are compared
-    is the "lookup array". When a character from the search array is matched,
-    the character at the corresponding position in the lookup array is added to the
-    sequence that then goes on to be hashed. This enables case-sensitivity
-    to be handled without converting the case of the words being searched: if
-    *case_sensitive==False*, the lower- or uppercase counterparts of any characters that
-    have case are added to the search array, and both the original character and its
-    other-cased counterpart map to the lower-case version in the lookup array.
+    This function supports the rich feature extractor. It splits the UTF-8 representation
+    of *search_chars* into separate byte arrays containing 1-, 2-, 3-, and 4-byte
+    characters respectively. Any duplicates in *search_chars* are removed, and *search_chars*
+    is converted to lower case if *case_sensitive==False*.
     """
-
-    def encode(ch: str) -> bytes:
-        """
-        ch: a single character
-        """
-        return ch.encode("UTF-32LE")
-
-    def add_to_arrays(
-        search: List[bytes], lookup: List[bytes], ch: str
-    ) -> None:
-        """Add the byte representations of *ch* to the two byte array lists.
-        """
-        this_char_bytes = encode(ch)
-        if not case_sensitive and ch.islower():
-            if this_char_bytes not in search:
-                search.append(this_char_bytes)
-                lookup.append(this_char_bytes)
-            upper_char_bytes = encode(ch.upper())
-            if upper_char_bytes not in search:
-                search.append(upper_char_bytes)
-                lookup.append(this_char_bytes)
-        elif not case_sensitive and ch.isupper():
-            lower_char_bytes = encode(ch.lower())
-            if this_char_bytes not in search:
-                search.append(this_char_bytes)
-                lookup.append(lower_char_bytes)
-            if lower_char_bytes not in search:
-                search.append(lower_char_bytes)
-                lookup.append(lower_char_bytes)
-        elif this_char_bytes not in search:
-            search.append(this_char_bytes)
-            lookup.append(this_char_bytes)
-
-    def get_ordered_raw_bytes(
-        search: List[bytes], lookup: List[bytes]
-    ) -> Tuple[bytes, bytes]:
-        """Flatten the two lists, ordering both by the entries in *search*.
-        """
-        num_search = [list(entry) for entry in search]
-        search = [entry for _, entry in sorted(zip(num_search, search))]
-        lookup = [entry for _, entry in sorted(zip(num_search, lookup))]
-        return b"".join(search), b"".join(lookup)
-
-    search: List[bytes] = []
-    lookup: List[bytes] = []
-    for ch in search_chars:
-        add_to_arrays(search, lookup, ch)
-    return get_ordered_raw_bytes(search, lookup)
+    sc1 = bytearray()
+    sc2 = bytearray()
+    sc3 = bytearray()
+    sc4 = bytearray()
+    if not case_sensitive:
+        search_chars = search_chars.lower()
+    ordered_search_chars = "".join(sorted(set(search_chars)))
+    encoded_search_char_bytes = ordered_search_chars.encode("UTF-8")
+    working_start = 0
+    for idx in range(len(encoded_search_char_bytes) + 1):
+        if idx == 0:
+            continue
+        if (
+            idx == len(encoded_search_char_bytes)
+            or encoded_search_char_bytes[idx] & 0xC0 != 0x80  # not a continuation byte
+        ):
+            char_length = idx - working_start
+            if char_length == 1:
+                sc1.extend(encoded_search_char_bytes[working_start:idx])
+            elif char_length == 2:
+                sc2.extend(encoded_search_char_bytes[working_start:idx])
+            elif char_length == 3:
+                sc3.extend(encoded_search_char_bytes[working_start:idx])
+            elif char_length == 4:
+                sc4.extend(encoded_search_char_bytes[working_start:idx])
+            else:
+                raise RuntimeError(Errors.E1049)
+            working_start = idx
+    return bytes(sc1), bytes(sc2), bytes(sc3), bytes(sc4)
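Behaviour of the rewritten helper, as pinned down by the tests earlier in this commit:

    from spacy.util import get_search_char_byte_arrays

    sc1, sc2, sc3, sc4 = get_search_char_byte_arrays("zzaaEP", case_sensitive=False)
    assert (sc1, sc2, sc3, sc4) == (b"aepz", b"", b"", b"")  # deduplicated, lower-cased, sorted

    sc1, sc2, sc3, sc4 = get_search_char_byte_arrays("𐌞", case_sensitive=True)
    assert sc4 == "𐌞".encode("utf-8")  # a single 4-byte character lands in the fourth array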