Intermediate state

This commit is contained in:
richardpaulhudson 2022-10-20 21:48:53 +02:00
parent 2707d30ce0
commit f7d9942e7c
10 changed files with 377 additions and 329 deletions

View File

@ -946,7 +946,6 @@ class Errors(metaclass=ErrorsWithCodes):
"{value}.") "{value}.")
E1044 = ("Search characters for '{label}' may not contain upper-case chars where case_sensitive==False.") E1044 = ("Search characters for '{label}' may not contain upper-case chars where case_sensitive==False.")
E1045 = ("Invalid rich group config '{label}'.") E1045 = ("Invalid rich group config '{label}'.")
E1046 = ("Search characters may not contain characters that occupy four bytes in UTF-16.")
# Deprecated model shortcuts, only used in errors and warnings

View File

@ -260,27 +260,6 @@ def RichMultiHashEmbed(
prevents the alternation from occurring, e.g. an `ä` in a German plural noun does
not become `a` if it is the third or fourth vowel from the end of the word.
Internally, the model converts each token string to UTF-16 and assumes that each
character from the string occupies two bytes. This assumption holds for all
characters in the Basic Multilingual Plane, which encompasses all characters that
are ever likely to be of interest when extracting features. There are, however,
characters like emojis that are in the Extended Multilingual Plane and occupy
four bytes, although importantly neither of the two byte pairs that make up such
a representation can be a valid two-byte character in its own right. The
following considerations apply to the processing of four-byte characters:
- An exceptional four-byte character within a text consisting mostly of two-byte
characters will probably be ignored by the neural network accepting the
embedding layer as not matching any of the learned features.
- If anyone did want to train a model for a language like Lycian that is
generally written in four-byte characters, prefix and suffix features can
still be extracted, but the length specifications should all be doubled, i.e.
`[2,4,6]` to extract one-, two- and three-character affixes. In such a
situation length specifications that are odd numbers would serve no useful
purpose since they would refer to half-characters.
- Four-byte characters are not accepted within search character specification
strings and lead to an error being thrown.
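The behaviour described in the removed passage above can be illustrated with a small, self-contained Python sketch (editorial illustration only, not part of the commit): characters outside the Basic Multilingual Plane occupy two UTF-16 code units, which is why the length specifications would need to be doubled for scripts written in such characters.

```python
# Editorial illustration: count UTF-16 code units (two bytes each) per character.
def utf16_code_units(ch: str) -> int:
    # Characters in the Basic Multilingual Plane take two bytes in UTF-16LE;
    # characters outside it take four (a surrogate pair).
    return len(ch.encode("utf-16-le")) // 2

assert utf16_code_units("a") == 1    # BMP character
assert utf16_code_units("🙂") == 2   # emoji outside the BMP: a surrogate pair
assert utf16_code_units("𐌞") == 2    # Old Italic letter, also outside the BMP

# Hence a length specification of [2, 4, 6] corresponds to one-, two- and
# three-character affixes for a script written in four-byte characters.
```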
width (int): The output width. Also used as the width of the embedding tables.
Recommended values are between 64 and 300.
attrs (list of attr IDs): The token attributes to embed. A separate

View File

@ -1,7 +1,7 @@
from typing import List, Optional, Callable, Tuple from typing import List, Optional, Callable, Tuple
from ..util import get_byte_arrays_for_search_chars from ..util import get_arrays_for_search_chars
from thinc.types import Ints2d from thinc.types import Ints1d, Ints2d
from thinc.api import Model, registry from thinc.api import Model, registry, get_current_ops
from ..tokens import Doc from ..tokens import Doc
@ -17,33 +17,46 @@ def RichFeatureExtractor(
suff_search_chars: Optional[str] = None, suff_search_chars: Optional[str] = None,
suff_search_lengths: Optional[List[int]] = None, suff_search_lengths: Optional[List[int]] = None,
) -> Model[List[Doc], List[Ints2d]]: ) -> Model[List[Doc], List[Ints2d]]:
ops = get_current_ops()
if pref_search_chars is not None: if pref_search_chars is not None:
pref_search, pref_ref = get_byte_arrays_for_search_chars(pref_search_chars, case_sensitive) pref_search, pref_lookup = get_arrays_for_search_chars(
pref_search_chars, case_sensitive
)
else: else:
pref_search, pref_ref = bytes(), bytes() pref_search, pref_lookup = bytes(), bytes()
if suff_search_chars is not None: if suff_search_chars is not None:
suff_search, suff_ref = get_byte_arrays_for_search_chars(suff_search_chars, case_sensitive) suff_search, suff_lookup = get_arrays_for_search_chars(
suff_search_chars, case_sensitive
)
else: else:
suff_search, suff_ref = bytes(), bytes() suff_search, suff_lookup = bytes(), bytes()
return Model( return Model(
"extract_character_combination_hashes", "extract_character_combination_hashes",
forward, forward,
attrs={ attrs={
"case_sensitive": case_sensitive, "case_sensitive": case_sensitive,
"pref_lengths": pref_lengths if pref_lengths is not None else [], "pref_lengths": ops.asarray1i(pref_lengths)
"suff_lengths": suff_lengths if suff_lengths is not None else [], if pref_lengths is not None
else ops.asarray1i([]),
"suff_lengths": ops.asarray1i(suff_lengths)
if suff_lengths is not None
else ops.asarray1i([]),
"pref_search": pref_search, "pref_search": pref_search,
"pref_ref": pref_ref, "pref_lookup": pref_lookup,
"pref_s_char_l": len(pref_search) / 4 if pref_search_chars is not None else 0, "pref_search_char_len": len(pref_search) / 4
"pref_search_lengths": pref_search_lengths if pref_search_chars is not None
else 0,
"pref_search_lengths": ops.asarray1i(pref_search_lengths)
if pref_search_lengths is not None if pref_search_lengths is not None
else [], else ops.asarray1i([]),
"suff_search": suff_search, "suff_search": suff_search,
"suff_ref": suff_ref, "suff_lookup": suff_lookup,
"suff_s_char_l": len(suff_search) / 4 if suff_search_chars is not None else 0, "suff_search_char_len": len(suff_search) / 4
"suff_search_lengths": suff_search_lengths if suff_search_chars is not None
else 0,
"suff_search_lengths": ops.asarray1i(suff_search_lengths)
if suff_search_lengths is not None if suff_search_lengths is not None
else [], else ops.asarray1i([]),
}, },
) )
@ -53,30 +66,30 @@ def forward(
) -> Tuple[List[Ints2d], Callable]: ) -> Tuple[List[Ints2d], Callable]:
ops = model.ops ops = model.ops
case_sensitive: bool = model.attrs["case_sensitive"] case_sensitive: bool = model.attrs["case_sensitive"]
pref_lengths: List[int] = model.attrs["pref_lengths"] pref_lengths: Ints1d = model.attrs["pref_lengths"]
suff_lengths: List[int] = model.attrs["suff_lengths"] suff_lengths: Ints1d = model.attrs["suff_lengths"]
pref_search: bytes = model.attrs["pref_search"] pref_search: bytes = model.attrs["pref_search"]
pref_ref: bytes = model.attrs["pref_ref"] pref_lookup: bytes = model.attrs["pref_lookup"]
pref_s_char_l: int = model.attr["pref_s_char_l"] pref_search_char_len: int = model.attrs["pref_search_char_len"]
pref_search_lengths: List[int] = model.attrs["pref_search_lengths"] pref_search_lengths: Ints1d = model.attrs["pref_search_lengths"]
suff_search: bytes = model.attrs["suff_search"] suff_search: bytes = model.attrs["suff_search"]
suff_ref: bytes = model.attrs["suff_ref"] suff_lookup: bytes = model.attrs["suff_lookup"]
suff_s_char_l: int = model.attr["suff_s_char_l"] suff_search_char_len: int = model.attrs["suff_search_char_len"]
suff_search_lengths: List[int] = model.attrs["suff_search_lengths"] suff_search_lengths: Ints1d = model.attrs["suff_search_lengths"]
features: List[Ints2d] = [] features: List[Ints2d] = []
for doc in docs: for doc in docs:
hashes = doc.get_character_combination_hashes( hashes = doc.get_character_combination_hashes(
case_sensitive=case_sensitive, cs=case_sensitive,
pref_lengths=pref_lengths, p_lengths=pref_lengths,
suff_lengths=suff_lengths, s_lengths=suff_lengths,
pref_search=pref_search, ps_search=pref_search,
pref_ref=pref_ref, ps_lookup=pref_lookup,
pref_s_char_l=pref_s_char_l, ps_l=pref_search_char_len,
pref_search_lengths=pref_search_lengths, ps_lengths=pref_search_lengths,
suff_search=suff_search, ss_search=suff_search,
suff_ref=suff_ref, ss_lookup=suff_lookup,
suff_s_char_l=suff_s_char_l, ss_l=suff_search_char_len,
suff_search_lengths=suff_search_lengths, ss_lengths=suff_search_lengths,
) )
features.append(ops.asarray2i(hashes)) features.append(ops.asarray2i(hashes))

View File

@ -14,7 +14,7 @@ from spacy.lang.xx import MultiLanguage
from spacy.language import Language from spacy.language import Language
from spacy.lexeme import Lexeme from spacy.lexeme import Lexeme
from spacy.tokens import Doc, Span, SpanGroup, Token from spacy.tokens import Doc, Span, SpanGroup, Token
from spacy.util import get_byte_arrays_for_search_chars from spacy.util import get_arrays_for_search_chars
from spacy.vocab import Vocab from spacy.vocab import Vocab
from .test_underscore import clean_underscore # noqa: F401 from .test_underscore import clean_underscore # noqa: F401
@ -1004,21 +1004,22 @@ def _get_unsigned_32_bit_hash(input: str) -> int:
@pytest.mark.parametrize("case_sensitive", [True, False]) @pytest.mark.parametrize("case_sensitive", [True, False])
def test_get_character_combination_hashes_good_case(en_tokenizer, case_sensitive): def test_get_character_combination_hashes_good_case(en_tokenizer, case_sensitive):
doc = en_tokenizer("spaCy✨ and Prodigy") doc = en_tokenizer("spaCy✨ and Prodigy")
suff_search, suff_ref = get_byte_arrays_for_search_chars("xx✨rp", case_sensitive) ops = get_current_ops()
pref_search, pref_lookup = get_arrays_for_search_chars("Rp", case_sensitive)
suff_search, suff_lookup = get_arrays_for_search_chars("xx✨rp", case_sensitive)
hashes = doc.get_character_combination_hashes( hashes = doc.get_character_combination_hashes(
cs=case_sensitive, cs=case_sensitive,
pref_lengths=[1, 4, 3], p_lengths=ops.asarray1i([1, 4, 3]),
suff_lengths=[2, 3, 4, 5], s_lengths=ops.asarray1i([2, 3, 4, 5]),
pref_search=bytes(), ps_search=pref_search,
pref_ref=bytes(), ps_lookup=pref_lookup,
pref_s_char_l = 0, ps_l=2 if case_sensitive else 4,
pref_search_lengths=[2], ps_lengths=ops.asarray1i([2]),
suff_search=suff_search, ss_search=suff_search,
suff_ref=suff_ref, ss_lookup=suff_lookup,
suff_s_char_l=5 if case_sensitive else 9, ss_l=5 if case_sensitive else 9,
suff_search_lengths=[2,1], ss_lengths=ops.asarray1i([2, 1]),
) )
assert hashes[0][0] == _get_unsigned_32_bit_hash("s") assert hashes[0][0] == _get_unsigned_32_bit_hash("s")
@ -1035,7 +1036,7 @@ def test_get_character_combination_hashes_good_case(en_tokenizer, case_sensitive
"spaCy" if case_sensitive else "spacy" "spaCy" if case_sensitive else "spacy"
) )
assert hashes[0][7] == _get_unsigned_32_bit_hash(" ") assert hashes[0][7] == _get_unsigned_32_bit_hash("p ")
assert hashes[0][8] == _get_unsigned_32_bit_hash("p ") assert hashes[0][8] == _get_unsigned_32_bit_hash("p ")
assert hashes[0][9] == _get_unsigned_32_bit_hash("p") assert hashes[0][9] == _get_unsigned_32_bit_hash("p")
assert hashes[1][0] == _get_unsigned_32_bit_hash("") assert hashes[1][0] == _get_unsigned_32_bit_hash("")
@ -1067,7 +1068,7 @@ def test_get_character_combination_hashes_good_case(en_tokenizer, case_sensitive
assert hashes[3][4] == _get_unsigned_32_bit_hash("igy") assert hashes[3][4] == _get_unsigned_32_bit_hash("igy")
assert hashes[3][5] == _get_unsigned_32_bit_hash("digy") assert hashes[3][5] == _get_unsigned_32_bit_hash("digy")
assert hashes[3][6] == _get_unsigned_32_bit_hash("odigy") assert hashes[3][6] == _get_unsigned_32_bit_hash("odigy")
assert hashes[3][7] == _get_unsigned_32_bit_hash(" ") assert hashes[3][7] == _get_unsigned_32_bit_hash(" " if case_sensitive else "pr")
assert hashes[3][9] == _get_unsigned_32_bit_hash("r") assert hashes[3][9] == _get_unsigned_32_bit_hash("r")
@ -1077,73 +1078,93 @@ def test_get_character_combination_hashes_good_case(en_tokenizer, case_sensitive
assert hashes[3][8] == _get_unsigned_32_bit_hash("rp") assert hashes[3][8] == _get_unsigned_32_bit_hash("rp")
# check values are the same cross-platform
assert hashes[0][1] == 753329845 if case_sensitive else 18446744071614199016 if case_sensitive:
assert hashes[1][3] == 3425774424 assert hashes[0][1] == 3712103410
assert hashes[2][8] == 3076404432 else:
assert hashes[0][1] == 307339932
assert hashes[1][3] == 2414314354
assert hashes[2][8] == 1669671676
def test_get_character_combination_hashes_4_byte_char_at_end(en_tokenizer): def test_get_character_combination_hashes_good_case_partial(en_tokenizer):
doc = en_tokenizer("and𐌞") doc = en_tokenizer("spaCy✨ and Prodigy")
suff_search, suff_ref = get_byte_arrays_for_search_chars("a", True) ops = get_current_ops()
pref_search, pref_lookup = get_arrays_for_search_chars("rp", False)
hashes = doc.get_character_combination_hashes( hashes = doc.get_character_combination_hashes(
cs=True, cs=False,
pref_lengths=[], p_lengths=ops.asarray1i([]),
suff_lengths=[1, 2, 3], s_lengths=ops.asarray1i([2, 3, 4, 5]),
pref_search=bytes(), ps_search=pref_search,
pref_ref=bytes(), ps_lookup=pref_lookup,
pref_s_char_l = 0, ps_l=4,
pref_search_lengths=[], ps_lengths=ops.asarray1i([2]),
suff_search=suff_search, ss_search=bytes(),
suff_ref=suff_ref, ss_lookup=bytes(),
suff_s_char_l=1, ss_l=0,
suff_search_lengths=[1], ss_lengths=ops.asarray1i([]),
) )
assert hashes[0][0] == _get_unsigned_32_bit_hash("𐌞")
assert hashes[0][1] == _get_unsigned_32_bit_hash("d𐌞") assert hashes[0][0] == _get_unsigned_32_bit_hash("cy")
assert hashes[0][2] == _get_unsigned_32_bit_hash("nd𐌞") assert hashes[0][1] == _get_unsigned_32_bit_hash("acy")
assert hashes[0][3] == _get_unsigned_32_bit_hash("a") assert hashes[0][2] == _get_unsigned_32_bit_hash("pacy")
assert hashes[0][3] == _get_unsigned_32_bit_hash("spacy")
assert hashes[0][4] == _get_unsigned_32_bit_hash("p ")
assert hashes[1][0] == _get_unsigned_32_bit_hash("")
assert hashes[1][1] == _get_unsigned_32_bit_hash("")
assert hashes[1][2] == _get_unsigned_32_bit_hash("")
assert hashes[1][3] == _get_unsigned_32_bit_hash("")
assert hashes[1][4] == _get_unsigned_32_bit_hash(" ")
assert hashes[2][0] == _get_unsigned_32_bit_hash("nd")
assert hashes[2][1] == _get_unsigned_32_bit_hash("and")
assert hashes[2][2] == _get_unsigned_32_bit_hash(" and")
assert hashes[2][3] == _get_unsigned_32_bit_hash(" and")
assert hashes[2][4] == _get_unsigned_32_bit_hash(" ")
assert hashes[3][0] == _get_unsigned_32_bit_hash("gy")
assert hashes[3][1] == _get_unsigned_32_bit_hash("igy")
assert hashes[3][2] == _get_unsigned_32_bit_hash("digy")
assert hashes[3][3] == _get_unsigned_32_bit_hash("odigy")
assert hashes[3][4] == _get_unsigned_32_bit_hash("pr")
def test_get_character_combination_hashes_4_byte_char_in_middle(en_tokenizer):
doc = en_tokenizer("and𐌞a")
hashes = doc.get_character_combination_hashes(
case_sensitive=False,
pref_lengths=[],
suff_lengths=[1, 2, 3, 4],
pref_search_chars="",
pref_search_lengths=[],
suff_search_chars="a",
suff_search_lengths=[1, 2],
)
assert hashes[0][0] == _get_unsigned_32_bit_hash("a")
assert hashes[0][2] == _get_unsigned_32_bit_hash("𐌞a")
assert hashes[0][3] == _get_unsigned_32_bit_hash("d𐌞a")
assert hashes[0][4] == _get_unsigned_32_bit_hash("a")
assert hashes[0][5] == _get_unsigned_32_bit_hash("aa")
def test_get_character_combination_hashes_4_byte_special_char(en_tokenizer): def test_get_character_combination_hashes_copying_in_middle(en_tokenizer):
doc = en_tokenizer("and𐌞") doc = en_tokenizer("sp𐌞Cé")
with pytest.raises(ValueError): ops = get_current_ops()
doc.get_character_combination_hashes(
case_sensitive=True, for p_length in range(1, 8):
pref_lengths=[], for s_length in range(1, 8):
suff_lengths=[2, 3, 4, 5], hashes = doc.get_character_combination_hashes(
pref_search_chars="", cs=False,
pref_search_lengths=[], p_lengths=ops.asarray1i([p_length]),
suff_search_chars="𐌞", s_lengths=ops.asarray1i([s_length]),
suff_search_lengths=[2], ps_search=bytes(),
) ps_lookup=bytes(),
ps_l=0,
ps_lengths=ops.asarray1i([]),
ss_search=bytes(),
ss_lookup=bytes(),
ss_l=0,
ss_lengths=ops.asarray1i([]),
)
assert hashes[0][0] == _get_unsigned_32_bit_hash("sp𐌞cé "[:p_length])
assert hashes[0][1] == _get_unsigned_32_bit_hash(" sp𐌞cé"[8 - s_length :])
def test_character_combination_hashes_empty_lengths(en_tokenizer): def test_character_combination_hashes_empty_lengths(en_tokenizer):
doc = en_tokenizer("and𐌞") doc = en_tokenizer("and𐌞")
assert doc.get_character_combination_hashes( ops = get_current_ops()
case_sensitive=True, hashes = doc.get_character_combination_hashes(
pref_lengths=[], cs=True,
suff_lengths=[], p_lengths=ops.asarray1i([]),
pref_search_chars="", s_lengths=ops.asarray1i([]),
pref_search_lengths=[], ps_search=bytes(),
suff_search_chars="", ps_lookup=bytes(),
suff_search_lengths=[], ps_l=0,
ps_lengths=ops.asarray1i([]),
ss_search=bytes(),
ss_lookup=bytes(),
ss_l=0,
ss_lengths=ops.asarray1i([]),
).shape == (1, 0) ).shape == (1, 0)

View File

@ -1,13 +1,13 @@
import spacy import spacy
def test_get_byte_arrays_for_search_chars_width_2_not_case_sensitive(): def test_get_arrays_for_search_chars_width_2_not_case_sensitive():
( (
search, search,
ref, lookup,
) = spacy.util.get_byte_arrays_for_search_chars("bféwfw", False) ) = spacy.util.get_arrays_for_search_chars("bféwfw", False)
assert ( assert (
ref lookup
== b"b\x00\x00\x00f\x00\x00\x00w\x00\x00\x00b\x00\x00\x00f\x00\x00\x00w\x00\x00\x00\xe9\x00\x00\x00\xe9\x00\x00\x00" == b"b\x00\x00\x00f\x00\x00\x00w\x00\x00\x00b\x00\x00\x00f\x00\x00\x00w\x00\x00\x00\xe9\x00\x00\x00\xe9\x00\x00\x00"
) )
@ -17,39 +17,39 @@ def test_get_byte_arrays_for_search_chars_width_2_not_case_sensitive():
) )
def test_get_byte_arrays_for_search_chars_width_2_case_sensitive(): def test_get_arrays_for_search_chars_width_2_case_sensitive():
( (
search, search,
ref, lookup,
) = spacy.util.get_byte_arrays_for_search_chars("bféwfw", True) ) = spacy.util.get_arrays_for_search_chars("bféwfw", True)
assert ( assert (
ref == search == b"b\x00\x00\x00f\x00\x00\x00w\x00\x00\x00\xe9\x00\x00\x00" lookup == search == b"b\x00\x00\x00f\x00\x00\x00w\x00\x00\x00\xe9\x00\x00\x00"
) )
def test_get_byte_arrays_for_search_chars_width_4_not_case_sensitive(): def test_get_arrays_for_search_chars_width_4_not_case_sensitive():
( (
search, search,
ref, lookup,
) = spacy.util.get_byte_arrays_for_search_chars("bfé𐌞wf𐌞wÉ", False) ) = spacy.util.get_arrays_for_search_chars("bfé𐌞wf𐌞wÉ", False)
assert ( assert (
search search
== b"\x1e\x03\x01\x00B\x00\x00\x00F\x00\x00\x00W\x00\x00\x00b\x00\x00\x00f\x00\x00\x00w\x00\x00\x00\xc9\x00\x00\x00\xe9\x00\x00\x00" == b"\x1e\x03\x01\x00B\x00\x00\x00F\x00\x00\x00W\x00\x00\x00b\x00\x00\x00f\x00\x00\x00w\x00\x00\x00\xc9\x00\x00\x00\xe9\x00\x00\x00"
) )
assert ( assert (
ref lookup
== b"\x1e\x03\x01\x00b\x00\x00\x00f\x00\x00\x00w\x00\x00\x00b\x00\x00\x00f\x00\x00\x00w\x00\x00\x00\xe9\x00\x00\x00\xe9\x00\x00\x00" == b"\x1e\x03\x01\x00b\x00\x00\x00f\x00\x00\x00w\x00\x00\x00b\x00\x00\x00f\x00\x00\x00w\x00\x00\x00\xe9\x00\x00\x00\xe9\x00\x00\x00"
) )
def test_get_byte_arrays_for_search_chars_width_4_case_sensitive(): def test_get_arrays_for_search_chars_width_4_case_sensitive():
( (
search, search,
ref, lookup,
) = spacy.util.get_byte_arrays_for_search_chars("bfé𐌞wf𐌞wÉ", True) ) = spacy.util.get_arrays_for_search_chars("bfé𐌞wf𐌞wÉ", True)
assert search == ref assert search == lookup
assert ( assert (
ref lookup
== b"\x1e\x03\x01\x00b\x00\x00\x00f\x00\x00\x00w\x00\x00\x00\xc9\x00\x00\x00\xe9\x00\x00\x00" == b"\x1e\x03\x01\x00b\x00\x00\x00f\x00\x00\x00w\x00\x00\x00\xc9\x00\x00\x00\xe9\x00\x00\x00"
) )

View File

@ -18,6 +18,11 @@ ctypedef fused LexemeOrToken:
const_TokenC_ptr const_TokenC_ptr
cdef extern from "unicodeobject.h":
bint Py_UNICODE_ISUPPER(Py_UCS4 ch)
Py_UCS4 Py_UNICODE_TOLOWER(Py_UCS4 ch)
cdef int set_children_from_heads(TokenC* tokens, int start, int end) except -1 cdef int set_children_from_heads(TokenC* tokens, int start, int end) except -1
@ -33,25 +38,34 @@ cdef int token_by_end(const TokenC* tokens, int length, int end_char) except -2
cdef int [:,:] _get_lca_matrix(Doc, int start, int end) cdef int [:,:] _get_lca_matrix(Doc, int start, int end)
cdef void _populate_aff_buf( cdef void _copy_chars(
Py_UCS4* target,
const Py_UCS4* source,
const int length,
const bint to_lower
)
cdef void _set_affixes(
const Py_UCS4* text_buf, const Py_UCS4* text_buf,
const int tok_idx, const int tok_idx,
const int tok_len, const int tok_len,
Py_UCS4* aff_buf, Py_UCS4* aff_buf,
const int pref_length, const int pref_len,
const int suff_length, const int suff_len,
const bint to_lower const bint to_lower
) )
cdef void _populate_search_buf(
cdef void _search_for_chars(
const Py_UCS4* text_buf, const Py_UCS4* text_buf,
const int tok_idx, const int tok_idx,
const int tok_len, const int tok_len,
Py_UCS4* search_buf, Py_UCS4* search_buf,
Py_UCS4* ref_buf, Py_UCS4* lookup_buf,
const int search_buf_len, const int search_buf_len,
Py_UCS4* finding_buf, Py_UCS4* result_buf,
const int finding_buf_len, const int result_buf_len,
bint suffs_not_prefs bint suffs_not_prefs
) )

View File

@ -1,7 +1,7 @@
from typing import Callable, Protocol, Iterable, Iterator, Optional from typing import Callable, Protocol, Iterable, Iterator, Optional
from typing import Union, Tuple, List, Dict, Any, overload from typing import Union, Tuple, List, Dict, Any, overload
from cymem.cymem import Pool from cymem.cymem import Pool
from thinc.types import Floats1d, Floats2d, Ints2d from thinc.types import Floats1d, Floats2d, Ints1d, Ints2d
from .span import Span from .span import Span
from .token import Token from .token import Token
from ._dict_proxies import SpanGroups from ._dict_proxies import SpanGroups
@ -177,17 +177,17 @@ class Doc:
def get_character_combination_hashes( def get_character_combination_hashes(
self, self,
*, *,
case_sensitive: bool, cs: bool,
pref_lengths: List[int], pref_lengths: Ints1d,
suff_lengths: List[int], suff_lengths: Ints1d,
pref_search_chars: str, pref_search_chars: str,
pref_ref_chars: str, pref_lookup_chars: str,
pref_search_char_length: int, pref_search_char_length: int,
pref_search_lengths: List[int], pref_search_lengths: Ints1d,
suff_search_chars: str, suff_search_chars: str,
suff_ref_chars: str, suff_lookup_chars: str,
suff_search_char_length: int, suff_search_char_length: int,
suff_search_lengths: List[int], suff_search_lengths: Ints1d,
) -> Ints2d: ... ) -> Ints2d: ...
@staticmethod @staticmethod
def _get_array_attrs() -> Tuple[Any]: ... def _get_array_attrs() -> Tuple[Any]: ...

View File

@ -3,6 +3,7 @@ from typing import Set, List
cimport cython cimport cython
cimport numpy as np cimport numpy as np
from cpython cimport array
from libc.string cimport memcpy, memcmp, memset from libc.string cimport memcpy, memcmp, memset
from libc.math cimport sqrt from libc.math cimport sqrt
from libc.stdint cimport int32_t, uint64_t from libc.stdint cimport int32_t, uint64_t
@ -105,16 +106,6 @@ class SetEntsDefault(str, Enum):
return list(cls.__members__.keys()) return list(cls.__members__.keys())
cdef extern from "unicodeobject.h":
Py_UCS4 PyUnicode_READ(int kind, void *data, int index)
void* PyUnicode_DATA(void* o)
void PyUnicode_READY(void * o)
int PyUnicode_KIND(void *data)
int PyUnicode_IS_COMPACT(void *data)
Py_UCS4 Py_UNICODE_TOLOWER(Py_UCS4 ch)
cdef class Doc: cdef class Doc:
"""A sequence of Token objects. Access sentences and named entities, export """A sequence of Token objects. Access sentences and named entities, export
annotations to numpy arrays, losslessly serialize to compressed binary annotations to numpy arrays, losslessly serialize to compressed binary
@ -1745,103 +1736,129 @@ cdef class Doc:
return output return output
def get_character_combination_hashes( def get_character_combination_hashes(self,
self,
*, *,
bint cs, const bint cs,
pref_lengths: List[int], np.ndarray p_lengths,
suff_lengths: List[int], np.ndarray s_lengths,
char* pref_search, const char* ps_search,
char* pref_ref, const char* ps_lookup,
int pref_s_char_l, const int ps_l,
pref_search_lengths: List[int], np.ndarray ps_lengths,
char* suff_search, const char* ss_search,
char* suff_ref, const char* ss_lookup,
int suff_s_char_l, const int ss_l,
suff_search_lengths: List[int], np.ndarray ss_lengths,
): ):
""" """
Returns a 2D NumPy array where the rows represent tokens and the columns represent hashes of various character combinations Returns a 2D NumPy array where the rows represent tokens and the columns represent hashes of various character combinations
derived from the string (text/orth) of each token. derived from the raw text of each token.
Generally:
p_ variables relate to prefixes (affixes starting at the beginning of the word)
s_ variables relate to suffixes (affixes starting at the end of the word)
ps_ variables relate to searches starting at the beginning of the word
ss_ variables relate to searches starting at the end of the word
cs: if *False*, the lower-case version of each token string is used as the basis for generating hashes. Note that cs: if *False*, hashes are generated based on the lower-case version of each token.
if *cs==False*, upper-case characters in *search_chars* will not be found in token strings. p_lengths: an Ints1d specifying the lengths of prefixes to be hashed. For example, if *p_lengths==[2, 3]*,
pref_lengths: an integer list specifying the lengths of prefixes to be hashed. For example, if *pref_lengths==[2, 3]*,
the prefixes hashed for "spaCy" would be "sp" and "spa". the prefixes hashed for "spaCy" would be "sp" and "spa".
suff_lengths: an integer list specifying the lengths of suffixes to be hashed. For example, if *suff_lengths==[2, 3]* and s_lengths: an Ints1d specifying the lengths of suffixes to be hashed. For example, if *s_lengths==[2, 3]* and
*case_sensitive == True*, the suffixes hashed for "spaCy" would be "Cy" and "aCy". *cs == True*, the suffixes hashed for "spaCy" would be "Cy" and "aCy".
pref_search_chars: a string containing characters to search for within each token, starting at the beginning. ps_search: a byte array containing characters to search for within each token, starting at the beginning.
pref_search_lengths: an integer list specifying the lengths of search results to be hashed. For example if ps_lookup: a byte array containing characters that are added to the result string when a character at
*pref_search_lengths==[1, 2]*, *pref_search_chars=="aC" and *cs==False*, the searched strings hashed for the corresponding position in *ps_search* is matched. Having separate search and lookup arrays enables
case-insensitivity to be handled efficiently.
ps_l: the number of characters in *ps_search* and hence also in *ps_lookup*
ps_lengths: an Ints1d specifying the lengths of search results to be hashed. For example if
*ps_lengths==[1, 2]*, *ps_search=="aC" and *cs==False*, the searched strings hashed for
"spaCy" would be "a" and "ac". "spaCy" would be "a" and "ac".
suff_search_chars: a string containing characters to search for within each token, starting at the end. ss_search: a byte array containing characters to search for within each token, starting at the end.
suff_search_lengths: an integer list specifying the lengths of search results to be hashed. For example if ss_lookup: a byte array containing characters that are added to the result string when a character at
the corresponding position in *ss_search* is matched. Having separate search and lookup arrays enables
case-insensitivity to be handled efficiently.
ss_l: the number of characters in *ss_search* and hence also in *ss_lookup*
ss_lengths: an integer list specifying the lengths of search results to be hashed. For example if
*suff_search_lengths==[1, 2]*, *suff_search_chars=="aC" and *cs==False*, the searched strings hashed for *suff_search_lengths==[1, 2]*, *suff_search_chars=="aC" and *cs==False*, the searched strings hashed for
"spaCy" would be "c" and "ca". "spaCy" would be "c" and "ca".
For a document with tokens ["spaCy", "and", "Prodigy"], the NumPy array returned by For a document with tokens ["spaCy", "and", "Prodigy"], the NumPy array returned by
*get_character_combination_hashes(True, [2], [2, 4, 6], "yC", [1], [2])* would correspond to *get_character_combination_hashes(True, [2], [2, 4, 6], "", "", 0, [], "yC", "yC", 2, [1, 2])* would correspond to
[[hash("sp"), [hash("Cy"), hash("paCy"), hash("spaCy"), hash("y"), hash("yC")], [[hash("sp"), [hash("Cy"), hash("paCy"), hash(" spaCy"), hash("y"), hash("yC")],
[hash("an"), hash("nd"), hash("and", hash("and"), hash(" "), hash(" "))], [hash("an"), hash("nd"), hash(" and", hash(" and"), hash(" "), hash(" "))],
[hash("Pr") ,hash("gy"), hash("digy"), hash("rodigy"), hash("y"), hash("y ")]] [hash("Pr") ,hash("gy"), hash("digy"), hash("rodigy"), hash("y"), hash("y ")]]
""" """
cdef int max_pref_l = max(pref_lengths) if len(pref_lengths) > 0 else 0 # Encode the document text
cdef int max_suff_l = max(suff_lengths) if len(suff_lengths) > 0 else 0
cdef int aff_buf_l = max_pref_l + max_suff_l
cdef int max_s_pref_l = max(pref_search_lengths) if len(pref_search_lengths) > 0 else 0
cdef int max_s_suff_l = max(suff_search_lengths) if len(suff_search_lengths) > 0 else 0
cdef Py_UCS4* aff_buf = <Py_UCS4*>self.mem.alloc(4, aff_buf_l)
cdef Py_UCS4* pref_s_buf = <Py_UCS4*>pref_search
cdef Py_UCS4* pref_r_buf = <Py_UCS4*>pref_ref
cdef Py_UCS4* pref_f_buf = <Py_UCS4*>self.mem.alloc(4, max_s_pref_l)
cdef Py_UCS4* suff_s_buf = <Py_UCS4*>suff_search
cdef Py_UCS4* suff_r_buf = <Py_UCS4*>suff_ref
cdef Py_UCS4* suff_f_buf = <Py_UCS4*>self.mem.alloc(4, max_s_suff_l)
cdef bytes encoded_text = self.text.encode("utf-32le") cdef bytes encoded_text = self.text.encode("utf-32le")
cdef char* intermediate_text = encoded_text cdef char* intermediate_text = encoded_text
cdef Py_UCS4* text_buf = <Py_UCS4*> intermediate_text cdef Py_UCS4* text_buf = <Py_UCS4*> intermediate_text
cdef unsigned int num_toks = len(self), aff_len # Define the result array and work out what is used for what in axis 1
cdef unsigned int h_pref_n = len(pref_lengths) cdef int num_toks = len(self)
cdef unsigned int h_suff_n = len(suff_lengths), h_suff_end_idx = len(pref_lengths) + len(suff_lengths) cdef int p_h_num = p_lengths.shape[0]
cdef unsigned int h_pref_s_n = len(pref_search_lengths), h_pref_s_end_idx = h_suff_end_idx + h_pref_s_n cdef int s_h_num = s_lengths.shape[0], s_h_end = p_h_num + s_h_num
cdef unsigned int h_suff_s_n = len(suff_search_lengths), h_suff_s_end_idx = h_pref_s_end_idx + h_suff_s_n cdef int ps_h_num = ps_lengths.shape[0], ps_h_end = s_h_end + ps_h_num
cdef np.ndarray[np.int64_t, ndim=2] hashes = numpy.empty((num_toks, h_suff_s_end_idx), dtype="int64") cdef int ss_h_num = ss_lengths.shape[0], ss_h_end = ps_h_end + ss_h_num
cdef np.ndarray[np.int64_t, ndim=2] hashes = numpy.empty((num_toks, ss_h_end), dtype="int64")
# Determine the maximum possible lengths of the affixes to work out how big the buffers need to be
cdef int p_max_l = max(p_lengths) if p_h_num > 0 else 0
cdef int s_max_l = max(s_lengths) if s_h_num > 0 else 0
cdef int ps_max_l = max(ps_lengths) if ps_h_num > 0 else 0
cdef int ss_max_l = max(ss_lengths) if ss_h_num > 0 else 0
# Define / allocate buffer (pr/sr: result buffers)
cdef int aff_buf_l = p_max_l + s_max_l
cdef Py_UCS4* aff_buf = <Py_UCS4*> self.mem.alloc(aff_buf_l, sizeof(Py_UCS4))
cdef Py_UCS4* ps_buf = <Py_UCS4*> ps_search
cdef Py_UCS4* pl_buf = <Py_UCS4*> ps_lookup
cdef Py_UCS4* pr_buf = <Py_UCS4*> self.mem.alloc(ps_max_l, sizeof(Py_UCS4))
cdef Py_UCS4* ss_buf = <Py_UCS4*> ss_search
cdef Py_UCS4* sl_buf = <Py_UCS4*> ss_lookup
cdef Py_UCS4* sr_buf = <Py_UCS4*> self.mem.alloc(ss_max_l, sizeof(Py_UCS4))
# Define memory views on length arrays
cdef int[:] p_v = p_lengths
cdef int[:] s_v = s_lengths
cdef int[:] ps_v = ps_lengths
cdef int[:] ss_v = ss_lengths
# Define working variables
cdef TokenC tok_c cdef TokenC tok_c
cdef int tok_i, tok_idx, tok_len, aff_len
for tok_i in range(num_toks): for tok_i in range(num_toks):
tok_c = self.c[tok_i] tok_c = self.c[tok_i]
tok_idx = tok_c.idx tok_idx = tok_c.idx
tok_len = tok_c.lex.length tok_len = tok_c.lex.length
_populate_aff_buf(text_buf, tok_idx, tok_len, aff_buf, max_pref_l, max_suff_l, not cs) if aff_buf_l > 0:
_populate_search_buf(text_buf, tok_idx, tok_len, pref_s_buf, pref_r_buf, pref_s_char_l, pref_f_buf, max_s_pref_l, False) _set_affixes(text_buf, tok_idx, tok_len, aff_buf, p_max_l, s_max_l, not cs)
_populate_search_buf(text_buf, tok_idx, tok_len, suff_s_buf, suff_r_buf, suff_s_char_l, suff_f_buf, max_s_suff_l, True)
for hash_idx in range(h_pref_n):
aff_len = pref_lengths[hash_idx]
hashes[tok_i, hash_idx] = hash32(aff_buf, aff_len * 4, 0)
for hash_idx in range(h_pref_n, h_suff_end_idx):
aff_len = suff_lengths[hash_idx - h_pref_n]
hashes[tok_i, hash_idx] = hash32(aff_buf + aff_buf_l - aff_len, aff_len * 4, 0)
for hash_idx in range(h_suff_end_idx, h_pref_s_end_idx): for hash_idx in range(p_h_num):
aff_len = pref_search_lengths[hash_idx - h_suff_end_idx] hashes[tok_i, hash_idx] = hash32(aff_buf, p_v[hash_idx] * sizeof(Py_UCS4), 0)
hashes[tok_i, hash_idx] = hash32(pref_f_buf, aff_len * 4, 0)
for hash_idx in range(p_h_num, s_h_end):
aff_len = s_v[hash_idx - p_h_num]
hashes[tok_i, hash_idx] = hash32(aff_buf + aff_buf_l - aff_len, aff_len * sizeof(Py_UCS4), 0)
if ps_h_num > 0:
_search_for_chars(text_buf, tok_idx, tok_len, ps_buf, pl_buf, ps_l, pr_buf, ps_max_l, False)
for hash_idx in range(s_h_end, ps_h_end):
aff_len = ps_v[hash_idx - s_h_end]
hashes[tok_i, hash_idx] = hash32(pr_buf, aff_len * sizeof(Py_UCS4), 0)
for hash_idx in range(h_pref_s_end_idx, h_suff_s_end_idx): if ss_h_num > 0:
aff_len = suff_search_lengths[hash_idx - h_pref_s_end_idx] _search_for_chars(text_buf, tok_idx, tok_len, ss_buf, sl_buf, ss_l, sr_buf, ss_max_l, True)
hashes[tok_i, hash_idx] = hash32(suff_f_buf, aff_len * 4, 0) for hash_idx in range(ps_h_end, ss_h_end):
aff_len = ss_v[hash_idx - ps_h_end]
hashes[tok_i, hash_idx] = hash32(sr_buf, aff_len * sizeof(Py_UCS4), 0)
self.mem.free(aff_buf) self.mem.free(aff_buf)
self.mem.free(pref_f_buf) self.mem.free(pr_buf)
self.mem.free(suff_f_buf) self.mem.free(sr_buf)
return hashes return hashes
@staticmethod @staticmethod
@ -2025,76 +2042,103 @@ cdef int [:,:] _get_lca_matrix(Doc doc, int start, int end):
return lca_matrix return lca_matrix
cdef void _populate_aff_buf( cdef void _copy_chars(
Py_UCS4* target,
const Py_UCS4* source,
const int length,
const bint to_lower
):
"""Copies *length* Py_UCS4 characters from *source* to *target*. If *to_lower==True*, converts
any upper-case characters to lower case within the target buffer.
"""
memcpy(target, source, length * sizeof(Py_UCS4))
cdef int idx
if to_lower:
for idx in range(length):
if Py_UNICODE_ISUPPER(target[idx]):
target[idx] = Py_UNICODE_TOLOWER(target[idx])
cdef void _set_affixes(
const Py_UCS4* text_buf, const Py_UCS4* text_buf,
const int tok_idx, const int tok_idx,
const int tok_len, const int tok_len,
Py_UCS4* aff_buf, Py_UCS4* aff_buf,
const int pref_length, const int pref_len,
const int suff_length, const int suff_len,
const bint to_lower const bint to_lower
): ):
""" Populate a buffer of length p+s with the first p and the last s characters of a word within a string. """ Populate a buffer of length pref+suff with the first pref and the last suff characters of a word within a string.
If the word is shorter than p and/or s, the empty character positions in the middle are filled with zeros. If the word is shorter than pref and/or suff, the empty character positions in the middle are filled with zeros.
str_data_ptr: a pointer to the raw data in the containing string, which must be in canonical text_buf: a pointer to a UTF-32LE representation of the containing string.
Unicode form (see PEP 393). tok_idx: the index of the first character of the word within the containing string.
kind: the number of bytes occupied by each character in the containing string. tok_len: the length of the word.
word_idx: the index of the first character of the word within the containing string.
word_len: the length of the word.
aff_buf: the buffer to populate. aff_buf: the buffer to populate.
pref_length: the length of the prefix. pref_len: the length of the prefix.
suff_length: the length of the suffix. suff_len: the length of the suffix.
to_lower: if *True*, any upper case characters in either affix are converted to lower case. to_lower: if *True*, any upper case characters in either affix are converted to lower case.
""" """
cdef int aff_buf_idx = 0, buf_size = pref_length + suff_length, in_word_idx cdef int aff_buf_idx = 0, aff_buf_len = pref_len + suff_len, in_word_idx, filled_pref_len
while aff_buf_idx < pref_length and aff_buf_idx < tok_len: if pref_len > 0:
filled_pref_len = pref_len if pref_len < tok_len else tok_len
_copy_chars(aff_buf, text_buf + tok_idx, filled_pref_len, to_lower)
aff_buf_idx = filled_pref_len
memcpy(aff_buf + aff_buf_idx, text_buf + tok_idx + aff_buf_idx, 4) if tok_len < pref_len:
if to_lower: memset(aff_buf + aff_buf_idx, 0, sizeof(Py_UCS4) * (pref_len - tok_len))
aff_buf[aff_buf_idx] = Py_UNICODE_TOLOWER(aff_buf[aff_buf_idx]) aff_buf_idx = aff_buf_len - suff_len
aff_buf_idx += 1 if tok_len < suff_len:
memset(aff_buf + aff_buf_idx, 0, sizeof(Py_UCS4) * (suff_len - tok_len))
aff_buf_idx = aff_buf_len - tok_len
if aff_buf_idx < buf_size - tok_len: if suff_len > 0:
# fill out the empty middle part of the buffer with zeros in_word_idx = aff_buf_idx + tok_len - aff_buf_len
memset(aff_buf, 0, buf_size - suff_length - aff_buf_idx) if in_word_idx < pref_len:
memcpy(aff_buf + aff_buf_idx, aff_buf + in_word_idx, sizeof(Py_UCS4) * (filled_pref_len - in_word_idx))
aff_buf_idx += filled_pref_len - in_word_idx
if aff_buf_idx < aff_buf_len:
_copy_chars(aff_buf + aff_buf_idx, text_buf + tok_idx + in_word_idx, aff_buf_len - aff_buf_idx, to_lower)
while aff_buf_idx < buf_size:
in_word_idx = aff_buf_idx + tok_len - buf_size
# for suffixes we have to track the in-word index separately from the in-buffer index
if in_word_idx < pref_length:
# we've already retrieved this character as part of the prefix, so copy it from there
# as that's quicker than retrieving it from the input string a second time
memcpy(aff_buf + aff_buf_idx, aff_buf + in_word_idx, 4)
else:
memcpy(aff_buf + aff_buf_idx, text_buf + tok_idx + in_word_idx, 4)
if to_lower:
aff_buf[aff_buf_idx] = Py_UNICODE_TOLOWER(aff_buf[aff_buf_idx])
aff_buf_idx += 1
cdef void _populate_search_buf( cdef void _search_for_chars(
const Py_UCS4* text_buf, const Py_UCS4* text_buf,
const int tok_idx, const int tok_idx,
const int tok_len, const int tok_len,
Py_UCS4* search_buf, Py_UCS4* search_buf,
Py_UCS4* ref_buf, Py_UCS4* lookup_buf,
const int search_buf_len, const int search_buf_len,
Py_UCS4* finding_buf, Py_UCS4* result_buf,
const int finding_buf_len, const int result_buf_len,
bint suffs_not_prefs bint suffs_not_prefs
): ):
cdef unsigned int finding_buf_idx = 0, text_string_idx = tok_idx + (tok_len - 1) if suffs_not_prefs else tok_idx """ Search a word within a string for characters within *search_buf*, starting at the beginning or
cdef unsigned int search_buf_idx end depending on the value of *suffs_not_prefs*. Wherever a character from *search_buf* matches,
cdef int cmp_res the corresponding character from *lookup_buf* is added to *result_buf*.
while finding_buf_idx < finding_buf_len: text_buf: a pointer to a UTF-32LE representation of the containing string.
tok_idx: the index of the first character of the word within the containing string.
tok_len: the length of the word.
search_buf: the characters to search for (ordered).
lookup_buf: characters corresponding to *search_buf* to add to *result_buf* in the case of a match.
Having separate search and lookup arrays enables case-insensitivity to be handled efficiently.
search_buf_len: the length of *search_buf* and hence also of *lookup_buf*.
result_buf: the buffer in which to place the results.
result_buf_len: the length of *result_buf*.
suffs_not_prefs: if *True*, searching starts from the end of the word; if *False*, from the beginning.
"""
cdef int result_buf_idx = 0, text_string_idx = tok_idx + (tok_len - 1) if suffs_not_prefs else tok_idx
cdef int search_buf_idx
cdef int cmp_result
while result_buf_idx < result_buf_len:
for search_buf_idx in range (search_buf_len): for search_buf_idx in range (search_buf_len):
cmp_res = memcmp(search_buf + search_buf_idx, text_buf + text_string_idx, 4) cmp_result = memcmp(search_buf + search_buf_idx, text_buf + text_string_idx, sizeof(Py_UCS4))
if cmp_res == 0: if cmp_result == 0:
memcpy(finding_buf + finding_buf_idx, ref_buf + search_buf_idx, 4) memcpy(result_buf + result_buf_idx, lookup_buf + search_buf_idx, sizeof(Py_UCS4))
finding_buf_idx += 1 result_buf_idx += 1
if cmp_res >= 0: if cmp_result >= 0:
break break
if suffs_not_prefs: if suffs_not_prefs:
if text_string_idx <= tok_idx: if text_string_idx <= tok_idx:
@ -2105,11 +2149,11 @@ cdef void _populate_search_buf(
if text_string_idx >= tok_idx + tok_len: if text_string_idx >= tok_idx + tok_len:
break break
if finding_buf_idx < finding_buf_len: # fill in any unused characters in the result buffer with zeros
memset(finding_buf + finding_buf_idx, 0, finding_buf_len - finding_buf_idx) if result_buf_idx < result_buf_len:
memset(result_buf + result_buf_idx, 0, (result_buf_len - result_buf_idx) * sizeof(Py_UCS4))
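The search/lookup mechanism implemented by `_search_for_chars` and described in its docstring can be sketched in plain Python as follows (illustrative only; buffer management and the sorted-array early exit are omitted):

```python
# Editorial sketch of the search/lookup mechanism in plain Python.
def search_for_chars(word: str, search: str, lookup: str, max_len: int,
                     suffs_not_prefs: bool) -> str:
    # Scan the word (from the end if suffs_not_prefs); whenever a character from
    # *search* matches, emit its counterpart from *lookup*, up to max_len results.
    result = []
    for ch in (reversed(word) if suffs_not_prefs else word):
        if len(result) >= max_len:
            break
        if ch in search:
            result.append(lookup[search.index(ch)])
    return "".join(result)

# With search "aC" and lookup "ac" (matches on either character map to lower case):
# search_for_chars("spaCy", "aC", "ac", 2, False) -> "ac"
# search_for_chars("spaCy", "aC", "ac", 2, True)  -> "ca"
```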
def pickle_doc(doc): def pickle_doc(doc):
bytes_data = doc.to_bytes(exclude=["vocab", "user_data", "user_hooks"]) bytes_data = doc.to_bytes(exclude=["vocab", "user_data", "user_hooks"])
hooks_and_data = (doc.user_data, doc.user_hooks, doc.user_span_hooks, hooks_and_data = (doc.user_data, doc.user_hooks, doc.user_span_hooks,

View File

@ -1737,7 +1737,7 @@ def all_equal(iterable):
return next(g, True) and not next(g, False) return next(g, True) and not next(g, False)
def get_byte_arrays_for_search_chars( def get_arrays_for_search_chars(
search_chars: str, case_sensitive: bool search_chars: str, case_sensitive: bool
) -> Tuple[bytes, bytes]: ) -> Tuple[bytes, bytes]:
""" """
@ -1746,14 +1746,14 @@ def get_byte_arrays_for_search_chars(
for search characters. The encoding is little-endian regardless of architecture, as for search characters. The encoding is little-endian regardless of architecture, as
this is what is expected by the murmurhash library used downstream. this is what is expected by the murmurhash library used downstream.
Alongside the "search byte array" against which words from document texts are compared Alongside the "search array" against which words from document texts are compared
is the "ref byte array". When a character from the search byte array is matched, is the "lookup array". When a character from the search array is matched,
the character at the corresponding position in the ref byte array is added to the the character at the corresponding position in the lookup array is added to the
byte sequence of the configured length that is then hashed. This enables case-sensitivity sequence that then goes on to be hashed. This enables case-sensitivity
to be handled without converting the case of the words being searched: if to be handled without converting the case of the words being searched: if
*case_sensitive==False*, the lower- or uppercase counterparts of any characters that *case_sensitive==False*, the lower- or uppercase counterparts of any characters that
have case are added to the search byte arrays, and both the original character and its have case are added to the search array, and both the original character and its
other-cased counterpart map to the lower-case version in the ref byte array. other-cased counterpart map to the lower-case version in the lookup array.
""" """
def encode(ch: str) -> bytes: def encode(ch: str) -> bytes:
@ -1762,8 +1762,8 @@ def get_byte_arrays_for_search_chars(
""" """
return ch.encode("UTF-32LE") return ch.encode("UTF-32LE")
def add_to_byte_arrays( def add_to_arrays(
search: List[bytes], ref: List[bytes], ch: str search: List[bytes], lookup: List[bytes], ch: str
) -> None: ) -> None:
"""Add the byte representations of *ch* to the two byte array lists. """Add the byte representations of *ch* to the two byte array lists.
""" """
@ -1771,36 +1771,36 @@ def get_byte_arrays_for_search_chars(
if not case_sensitive and ch.islower(): if not case_sensitive and ch.islower():
if this_char_bytes not in search: if this_char_bytes not in search:
search.append(this_char_bytes) search.append(this_char_bytes)
ref.append(this_char_bytes) lookup.append(this_char_bytes)
upper_char_bytes = encode(ch.upper()) upper_char_bytes = encode(ch.upper())
if upper_char_bytes not in search: if upper_char_bytes not in search:
search.append(upper_char_bytes) search.append(upper_char_bytes)
ref.append(this_char_bytes) lookup.append(this_char_bytes)
elif not case_sensitive and ch.isupper(): elif not case_sensitive and ch.isupper():
lower_char_bytes = encode(ch.lower()) lower_char_bytes = encode(ch.lower())
if this_char_bytes not in search: if this_char_bytes not in search:
search.append(this_char_bytes) search.append(this_char_bytes)
ref.append(lower_char_bytes) lookup.append(lower_char_bytes)
if lower_char_bytes not in search: if lower_char_bytes not in search:
search.append(lower_char_bytes) search.append(lower_char_bytes)
ref.append(lower_char_bytes) lookup.append(lower_char_bytes)
elif this_char_bytes not in search: elif this_char_bytes not in search:
search.append(this_char_bytes) search.append(this_char_bytes)
ref.append(this_char_bytes) lookup.append(this_char_bytes)
def get_ordered_raw_bytes( def get_ordered_raw_bytes(
search: List[bytes], ref: List[bytes] search: List[bytes], lookup: List[bytes]
) -> Tuple[bytes, bytes]: ) -> Tuple[bytes, bytes]:
"""Flatten the two lists, ordering both by the entries in *search* """Flatten the two lists, ordering both by the entries in *search*
using the native endianness of the platform. using the native endianness of the platform.
""" """
num_search = [list(entry) for entry in search] num_search = [list(entry) for entry in search]
search = [entry for _, entry in sorted(zip(num_search, search))] search = [entry for _, entry in sorted(zip(num_search, search))]
ref = [entry for _, entry in sorted(zip(num_search, ref))] lookup = [entry for _, entry in sorted(zip(num_search, lookup))]
return b"".join(search), b"".join(ref) return b"".join(search), b"".join(lookup)
search: List[bytes] = [] search: List[bytes] = []
ref: List[bytes] = [] lookup: List[bytes] = []
for ch in search_chars: for ch in search_chars:
add_to_byte_arrays(search, ref, ch) add_to_arrays(search, lookup, ch)
return get_ordered_raw_bytes(search, ref) return get_ordered_raw_bytes(search, lookup)
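Based on the behaviour described in the docstring of `get_arrays_for_search_chars`, a quick illustrative check (editorial example, decoding the returned UTF-32LE byte arrays back into characters) might look like this:

```python
# Editorial example: with case_sensitive=False, both cases of each character are
# searchable, and every match maps to the lower-case character in the lookup array.
import spacy

search, lookup = spacy.util.get_arrays_for_search_chars("aC", False)
assert search.decode("utf-32-le") == "ACac"
assert lookup.decode("utf-32-le") == "acac"
```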

View File

@ -218,28 +218,6 @@ whose presence before or after characters that would otherwise alternate
prevents the alternation from occurring, e.g. an `ä` in a German plural noun
does not become `a` if it is the third or fourth vowel from the end of the word.
Internally, the model converts each token string to
[UTF-16](https://www.ietf.org/rfc/rfc2781.txt) and assumes that each character
from the string occupies two bytes. This assumption holds for all characters in
the Basic Multilingual Plane, which encompasses all characters that are ever
likely to be of interest when extracting features. There are, however,
characters like emojis that are in the Extended Multilingual Plane and occupy
four bytes, although importantly neither of the two byte pairs that make up such
a representation can be a valid two-byte character in its own right. The
following considerations apply to the processing of four-byte characters:
- An exceptional four-byte character within a text consisting mostly of two-byte
characters will probably be ignored by the neural network accepting the
embedding layer as not matching any of the learned features.
- If anyone did want to train a model for a language like Lycian that is
generally written in four-byte characters, prefix and suffix features can
still be extracted, but the length specifications should all be doubled, i.e.
`[2,4,6]` to extract one-, two- and three-character affixes. In such a
situation length specifications that are odd numbers would serve no useful
purpose since they would refer to half-characters.
- Four-byte characters are not accepted within search character specification
strings and lead to an error being thrown.
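As an editorial illustration of the claim about surrogate pairs above (not part of the documentation itself), the following Python snippet shows that a character outside the Basic Multilingual Plane encodes as two UTF-16 code units, neither of which is a valid character on its own:

```python
# Editorial illustration: a character outside the Basic Multilingual Plane becomes
# a UTF-16 surrogate pair; neither half is a valid stand-alone character.
units = "𐌞".encode("utf-16-le")
assert len(units) == 4
high = int.from_bytes(units[:2], "little")
low = int.from_bytes(units[2:], "little")
assert 0xD800 <= high <= 0xDBFF  # high surrogate
assert 0xDC00 <= low <= 0xDFFF   # low surrogate
```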
| Name | Description |
| ------------------------ | -------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
| `width` | The output width. Also used as the width of the embedding tables. Recommended values are between `64` and `300`. If static vectors are included, a learned linear layer is used to map the vectors to the specified width before concatenating it with the other embedding outputs. A single maxout layer is then used to reduce the concatenated vectors to the final width. ~~int~~ |