Intermediate state

richardpaulhudson 2022-10-20 21:48:53 +02:00
parent 2707d30ce0
commit f7d9942e7c
10 changed files with 377 additions and 329 deletions

View File

@@ -946,7 +946,6 @@ class Errors(metaclass=ErrorsWithCodes):
"{value}.")
E1044 = ("Search characters for '{label}' may not contain upper-case chars where case_sensitive==False.")
E1045 = ("Invalid rich group config '{label}'.")
E1046 = ("Search characters may not contain characters that occupy four bytes in UTF-16.")
# Deprecated model shortcuts, only used in errors and warnings

View File

@@ -260,27 +260,6 @@ def RichMultiHashEmbed(
prevents the alternation from occurring, e.g. an `ä` in a German plural noun does
not become `a` if it is the third or fourth vowel from the end of the word.
Internally, the model converts each token string to UTF-16 and assumes that each
character from the string occupies two bytes. This assumption holds for all
characters in the Basic Multilingual Plane, which encompasses all characters that
are ever likely to be of interest when extracting features. There are, however,
characters such as emoji that lie in the supplementary planes and occupy
four bytes; importantly, neither of the two byte pairs that make up such
a representation can be a valid two-byte character in its own right. The
following considerations apply to the processing of four-byte characters:
- An exceptional four-byte character within a text consisting mostly of two-byte
characters will probably be ignored by the neural network accepting the
embedding layer as not matching any of the learned features.
- If anyone did want to train a model for a language like Lycian that is
generally written in four-byte characters, prefix and suffix features can
still be extracted, but the length specifications should all be doubled, i.e.
`[2,4,6]` to extract one-, two- and three-character affixes. In such a
situation length specifications that are odd numbers would serve no useful
purpose since they would refer to half-characters.
- Four-byte characters are not accepted within search character specification
strings and lead to an error being thrown.
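The notes removed above describe the UTF-16 assumption that this commit drops in favour of a UTF-32LE representation (see `encode("utf-32le")` in the `doc.pyx` changes below). A minimal illustration of the difference in encoding widths, not part of the commit:

```python
# BMP characters occupy one 2-byte code unit in UTF-16; supplementary-plane characters
# such as "𐌞" occupy two code units. In UTF-32 every character occupies four bytes.
for text in ("spaCy", "𐌞"):
    utf16_units = len(text.encode("utf-16-le")) // 2
    utf32_units = len(text.encode("utf-32-le")) // 4
    print(text, len(text), utf16_units, utf32_units)
# spaCy 5 5 5
# 𐌞 1 2 1
```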
width (int): The output width. Also used as the width of the embedding tables.
Recommended values are between 64 and 300.
attrs (list of attr IDs): The token attributes to embed. A separate

View File

@@ -1,7 +1,7 @@
from typing import List, Optional, Callable, Tuple
from ..util import get_byte_arrays_for_search_chars
from thinc.types import Ints2d
from thinc.api import Model, registry
from ..util import get_arrays_for_search_chars
from thinc.types import Ints1d, Ints2d
from thinc.api import Model, registry, get_current_ops
from ..tokens import Doc
@@ -17,33 +17,46 @@ def RichFeatureExtractor(
suff_search_chars: Optional[str] = None,
suff_search_lengths: Optional[List[int]] = None,
) -> Model[List[Doc], List[Ints2d]]:
ops = get_current_ops()
if pref_search_chars is not None:
pref_search, pref_ref = get_byte_arrays_for_search_chars(pref_search_chars, case_sensitive)
pref_search, pref_lookup = get_arrays_for_search_chars(
pref_search_chars, case_sensitive
)
else:
pref_search, pref_ref = bytes(), bytes()
pref_search, pref_lookup = bytes(), bytes()
if suff_search_chars is not None:
suff_search, suff_ref = get_byte_arrays_for_search_chars(suff_search_chars, case_sensitive)
suff_search, suff_lookup = get_arrays_for_search_chars(
suff_search_chars, case_sensitive
)
else:
suff_search, suff_ref = bytes(), bytes()
suff_search, suff_lookup = bytes(), bytes()
return Model(
"extract_character_combination_hashes",
forward,
attrs={
"case_sensitive": case_sensitive,
"pref_lengths": pref_lengths if pref_lengths is not None else [],
"suff_lengths": suff_lengths if suff_lengths is not None else [],
"pref_lengths": ops.asarray1i(pref_lengths)
if pref_lengths is not None
else ops.asarray1i([]),
"suff_lengths": ops.asarray1i(suff_lengths)
if suff_lengths is not None
else ops.asarray1i([]),
"pref_search": pref_search,
"pref_ref": pref_ref,
"pref_s_char_l": len(pref_search) / 4 if pref_search_chars is not None else 0,
"pref_search_lengths": pref_search_lengths
"pref_lookup": pref_lookup,
"pref_search_char_len": len(pref_search) / 4
if pref_search_chars is not None
else 0,
"pref_search_lengths": ops.asarray1i(pref_search_lengths)
if pref_search_lengths is not None
else [],
else ops.asarray1i([]),
"suff_search": suff_search,
"suff_ref": suff_ref,
"suff_s_char_l": len(suff_search) / 4 if suff_search_chars is not None else 0,
"suff_search_lengths": suff_search_lengths
"suff_lookup": suff_lookup,
"suff_search_char_len": len(suff_search) / 4
if suff_search_chars is not None
else 0,
"suff_search_lengths": ops.asarray1i(suff_search_lengths)
if suff_search_lengths is not None
else [],
else ops.asarray1i([]),
},
)
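The `len(pref_search) / 4` and `len(suff_search) / 4` attributes above rely on the search arrays being UTF-32LE, i.e. four bytes per character. A hedged sketch of that relationship, assuming this branch's `get_arrays_for_search_chars`:

```python
from spacy.util import get_arrays_for_search_chars

# Case-insensitive search characters: both cases of "r" and "p" are added to the
# search array, so four characters -> sixteen bytes.
search, lookup = get_arrays_for_search_chars("rp", case_sensitive=False)
assert len(search) == len(lookup) == 16
print(len(search) // 4)  # 4 search characters
```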
@@ -53,30 +66,30 @@ def forward(
) -> Tuple[List[Ints2d], Callable]:
ops = model.ops
case_sensitive: bool = model.attrs["case_sensitive"]
pref_lengths: List[int] = model.attrs["pref_lengths"]
suff_lengths: List[int] = model.attrs["suff_lengths"]
pref_lengths: Ints1d = model.attrs["pref_lengths"]
suff_lengths: Ints1d = model.attrs["suff_lengths"]
pref_search: bytes = model.attrs["pref_search"]
pref_ref: bytes = model.attrs["pref_ref"]
pref_s_char_l: int = model.attr["pref_s_char_l"]
pref_search_lengths: List[int] = model.attrs["pref_search_lengths"]
pref_lookup: bytes = model.attrs["pref_lookup"]
pref_search_char_len: int = model.attrs["pref_search_char_len"]
pref_search_lengths: Ints1d = model.attrs["pref_search_lengths"]
suff_search: bytes = model.attrs["suff_search"]
suff_ref: bytes = model.attrs["suff_ref"]
suff_s_char_l: int = model.attr["suff_s_char_l"]
suff_search_lengths: List[int] = model.attrs["suff_search_lengths"]
suff_lookup: bytes = model.attrs["suff_lookup"]
suff_search_char_len: int = model.attrs["suff_search_char_len"]
suff_search_lengths: Ints1d = model.attrs["suff_search_lengths"]
features: List[Ints2d] = []
for doc in docs:
hashes = doc.get_character_combination_hashes(
case_sensitive=case_sensitive,
pref_lengths=pref_lengths,
suff_lengths=suff_lengths,
pref_search=pref_search,
pref_ref=pref_ref,
pref_s_char_l=pref_s_char_l,
pref_search_lengths=pref_search_lengths,
suff_search=suff_search,
suff_ref=suff_ref,
suff_s_char_l=suff_s_char_l,
suff_search_lengths=suff_search_lengths,
cs=case_sensitive,
p_lengths=pref_lengths,
s_lengths=suff_lengths,
ps_search=pref_search,
ps_lookup=pref_lookup,
ps_l=pref_search_char_len,
ps_lengths=pref_search_lengths,
ss_search=suff_search,
ss_lookup=suff_lookup,
ss_l=suff_search_char_len,
ss_lengths=suff_search_lengths,
)
features.append(ops.asarray2i(hashes))
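A hedged usage sketch of the layer as reworked above; the import path is hypothetical, and the call assumes this branch, where `Doc.get_character_combination_hashes` accepts the keyword arguments used in `forward`:

```python
import spacy
from spacy.ml.richfeatureextractor import RichFeatureExtractor  # hypothetical path

nlp = spacy.blank("en")
docs = [nlp("spaCy and Prodigy")]

model = RichFeatureExtractor(
    case_sensitive=False,
    pref_lengths=[1, 3],
    suff_lengths=[2, 3],
    suff_search_chars="aeiou",
    suff_search_lengths=[1, 2],
)
hashes = model.predict(docs)
print(hashes[0].shape)  # (3, 6): three tokens, 2 + 2 + 0 + 2 hash columns
```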

View File

@@ -14,7 +14,7 @@ from spacy.lang.xx import MultiLanguage
from spacy.language import Language
from spacy.lexeme import Lexeme
from spacy.tokens import Doc, Span, SpanGroup, Token
from spacy.util import get_byte_arrays_for_search_chars
from spacy.util import get_arrays_for_search_chars
from spacy.vocab import Vocab
from .test_underscore import clean_underscore # noqa: F401
@@ -1004,21 +1004,22 @@ def _get_unsigned_32_bit_hash(input: str) -> int:
@pytest.mark.parametrize("case_sensitive", [True, False])
def test_get_character_combination_hashes_good_case(en_tokenizer, case_sensitive):
doc = en_tokenizer("spaCy✨ and Prodigy")
suff_search, suff_ref = get_byte_arrays_for_search_chars("xx✨rp", case_sensitive)
ops = get_current_ops()
pref_search, pref_lookup = get_arrays_for_search_chars("Rp", case_sensitive)
suff_search, suff_lookup = get_arrays_for_search_chars("xx✨rp", case_sensitive)
hashes = doc.get_character_combination_hashes(
cs=case_sensitive,
pref_lengths=[1, 4, 3],
suff_lengths=[2, 3, 4, 5],
pref_search=bytes(),
pref_ref=bytes(),
pref_s_char_l = 0,
pref_search_lengths=[2],
suff_search=suff_search,
suff_ref=suff_ref,
suff_s_char_l=5 if case_sensitive else 9,
suff_search_lengths=[2,1],
p_lengths=ops.asarray1i([1, 4, 3]),
s_lengths=ops.asarray1i([2, 3, 4, 5]),
ps_search=pref_search,
ps_lookup=pref_lookup,
ps_l=2 if case_sensitive else 4,
ps_lengths=ops.asarray1i([2]),
ss_search=suff_search,
ss_lookup=suff_lookup,
ss_l=5 if case_sensitive else 9,
ss_lengths=ops.asarray1i([2, 1]),
)
assert hashes[0][0] == _get_unsigned_32_bit_hash("s")
@@ -1035,7 +1036,7 @@ def test_get_character_combination_hashes_good_case(en_tokenizer, case_sensitive
"spaCy" if case_sensitive else "spacy"
)
assert hashes[0][7] == _get_unsigned_32_bit_hash(" ")
assert hashes[0][7] == _get_unsigned_32_bit_hash("p ")
assert hashes[0][8] == _get_unsigned_32_bit_hash("p ")
assert hashes[0][9] == _get_unsigned_32_bit_hash("p")
assert hashes[1][0] == _get_unsigned_32_bit_hash("")
@@ -1067,7 +1068,7 @@ def test_get_character_combination_hashes_good_case(en_tokenizer, case_sensitive
assert hashes[3][4] == _get_unsigned_32_bit_hash("igy")
assert hashes[3][5] == _get_unsigned_32_bit_hash("digy")
assert hashes[3][6] == _get_unsigned_32_bit_hash("odigy")
assert hashes[3][7] == _get_unsigned_32_bit_hash(" ")
assert hashes[3][7] == _get_unsigned_32_bit_hash(" " if case_sensitive else "pr")
assert hashes[3][9] == _get_unsigned_32_bit_hash("r")
@@ -1077,73 +1078,93 @@ def test_get_character_combination_hashes_good_case(en_tokenizer, case_sensitive
assert hashes[3][8] == _get_unsigned_32_bit_hash("rp")
# check values are the same cross-platform
assert hashes[0][1] == 753329845 if case_sensitive else 18446744071614199016
assert hashes[1][3] == 3425774424
assert hashes[2][8] == 3076404432
if case_sensitive:
assert hashes[0][1] == 3712103410
else:
assert hashes[0][1] == 307339932
assert hashes[1][3] == 2414314354
assert hashes[2][8] == 1669671676
def test_get_character_combination_hashes_4_byte_char_at_end(en_tokenizer):
doc = en_tokenizer("and𐌞")
suff_search, suff_ref = get_byte_arrays_for_search_chars("a", True)
def test_get_character_combination_hashes_good_case_partial(en_tokenizer):
doc = en_tokenizer("spaCy✨ and Prodigy")
ops = get_current_ops()
pref_search, pref_lookup = get_arrays_for_search_chars("rp", False)
hashes = doc.get_character_combination_hashes(
cs=True,
pref_lengths=[],
suff_lengths=[1, 2, 3],
pref_search=bytes(),
pref_ref=bytes(),
pref_s_char_l = 0,
pref_search_lengths=[],
suff_search=suff_search,
suff_ref=suff_ref,
suff_s_char_l=1,
suff_search_lengths=[1],
cs=False,
p_lengths=ops.asarray1i([]),
s_lengths=ops.asarray1i([2, 3, 4, 5]),
ps_search=pref_search,
ps_lookup=pref_lookup,
ps_l=4,
ps_lengths=ops.asarray1i([2]),
ss_search=bytes(),
ss_lookup=bytes(),
ss_l=0,
ss_lengths=ops.asarray1i([]),
)
assert hashes[0][0] == _get_unsigned_32_bit_hash("𐌞")
assert hashes[0][1] == _get_unsigned_32_bit_hash("d𐌞")
assert hashes[0][2] == _get_unsigned_32_bit_hash("nd𐌞")
assert hashes[0][3] == _get_unsigned_32_bit_hash("a")
assert hashes[0][0] == _get_unsigned_32_bit_hash("cy")
assert hashes[0][1] == _get_unsigned_32_bit_hash("acy")
assert hashes[0][2] == _get_unsigned_32_bit_hash("pacy")
assert hashes[0][3] == _get_unsigned_32_bit_hash("spacy")
assert hashes[0][4] == _get_unsigned_32_bit_hash("p ")
assert hashes[1][0] == _get_unsigned_32_bit_hash("")
assert hashes[1][1] == _get_unsigned_32_bit_hash("")
assert hashes[1][2] == _get_unsigned_32_bit_hash("")
assert hashes[1][3] == _get_unsigned_32_bit_hash("")
assert hashes[1][4] == _get_unsigned_32_bit_hash(" ")
assert hashes[2][0] == _get_unsigned_32_bit_hash("nd")
assert hashes[2][1] == _get_unsigned_32_bit_hash("and")
assert hashes[2][2] == _get_unsigned_32_bit_hash(" and")
assert hashes[2][3] == _get_unsigned_32_bit_hash(" and")
assert hashes[2][4] == _get_unsigned_32_bit_hash(" ")
assert hashes[3][0] == _get_unsigned_32_bit_hash("gy")
assert hashes[3][1] == _get_unsigned_32_bit_hash("igy")
assert hashes[3][2] == _get_unsigned_32_bit_hash("digy")
assert hashes[3][3] == _get_unsigned_32_bit_hash("odigy")
assert hashes[3][4] == _get_unsigned_32_bit_hash("pr")
def test_get_character_combination_hashes_4_byte_char_in_middle(en_tokenizer):
doc = en_tokenizer("and𐌞a")
def test_get_character_combination_hashes_copying_in_middle(en_tokenizer):
doc = en_tokenizer("sp𐌞Cé")
ops = get_current_ops()
for p_length in range(1, 8):
for s_length in range(1, 8):
hashes = doc.get_character_combination_hashes(
case_sensitive=False,
pref_lengths=[],
suff_lengths=[1, 2, 3, 4],
pref_search_chars="",
pref_search_lengths=[],
suff_search_chars="a",
suff_search_lengths=[1, 2],
cs=False,
p_lengths=ops.asarray1i([p_length]),
s_lengths=ops.asarray1i([s_length]),
ps_search=bytes(),
ps_lookup=bytes(),
ps_l=0,
ps_lengths=ops.asarray1i([]),
ss_search=bytes(),
ss_lookup=bytes(),
ss_l=0,
ss_lengths=ops.asarray1i([]),
)
assert hashes[0][0] == _get_unsigned_32_bit_hash("a")
assert hashes[0][2] == _get_unsigned_32_bit_hash("𐌞a")
assert hashes[0][3] == _get_unsigned_32_bit_hash("d𐌞a")
assert hashes[0][4] == _get_unsigned_32_bit_hash("a")
assert hashes[0][5] == _get_unsigned_32_bit_hash("aa")
def test_get_character_combination_hashes_4_byte_special_char(en_tokenizer):
doc = en_tokenizer("and𐌞")
with pytest.raises(ValueError):
doc.get_character_combination_hashes(
case_sensitive=True,
pref_lengths=[],
suff_lengths=[2, 3, 4, 5],
pref_search_chars="",
pref_search_lengths=[],
suff_search_chars="𐌞",
suff_search_lengths=[2],
)
assert hashes[0][0] == _get_unsigned_32_bit_hash("sp𐌞cé "[:p_length])
assert hashes[0][1] == _get_unsigned_32_bit_hash(" sp𐌞cé"[8 - s_length :])
def test_character_combination_hashes_empty_lengths(en_tokenizer):
doc = en_tokenizer("and𐌞")
assert doc.get_character_combination_hashes(
case_sensitive=True,
pref_lengths=[],
suff_lengths=[],
pref_search_chars="",
pref_search_lengths=[],
suff_search_chars="",
suff_search_lengths=[],
ops = get_current_ops()
hashes = doc.get_character_combination_hashes(
cs=True,
p_lengths=ops.asarray1i([]),
s_lengths=ops.asarray1i([]),
ps_search=bytes(),
ps_lookup=bytes(),
ps_l=0,
ps_lengths=ops.asarray1i([]),
ss_search=bytes(),
ss_lookup=bytes(),
ss_l=0,
ss_lengths=ops.asarray1i([]),
).shape == (1, 0)
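The updated tests pass length specifications as `Ints1d` arrays rather than plain Python lists, matching the new model attributes. The conversion itself is just:

```python
from thinc.api import get_current_ops

ops = get_current_ops()
lengths = ops.asarray1i([2, 3, 4, 5])  # 1-dimensional int32 array
print(lengths.dtype, lengths.shape)    # int32 (4,)
```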

View File

@@ -1,13 +1,13 @@
import spacy
def test_get_byte_arrays_for_search_chars_width_2_not_case_sensitive():
def test_get_arrays_for_search_chars_width_2_not_case_sensitive():
(
search,
ref,
) = spacy.util.get_byte_arrays_for_search_chars("bféwfw", False)
lookup,
) = spacy.util.get_arrays_for_search_chars("bféwfw", False)
assert (
ref
lookup
== b"b\x00\x00\x00f\x00\x00\x00w\x00\x00\x00b\x00\x00\x00f\x00\x00\x00w\x00\x00\x00\xe9\x00\x00\x00\xe9\x00\x00\x00"
)
@@ -17,39 +17,39 @@ def test_get_byte_arrays_for_search_chars_width_2_not_case_sensitive():
)
def test_get_byte_arrays_for_search_chars_width_2_case_sensitive():
def test_get_arrays_for_search_chars_width_2_case_sensitive():
(
search,
ref,
) = spacy.util.get_byte_arrays_for_search_chars("bféwfw", True)
lookup,
) = spacy.util.get_arrays_for_search_chars("bféwfw", True)
assert (
ref == search == b"b\x00\x00\x00f\x00\x00\x00w\x00\x00\x00\xe9\x00\x00\x00"
lookup == search == b"b\x00\x00\x00f\x00\x00\x00w\x00\x00\x00\xe9\x00\x00\x00"
)
def test_get_byte_arrays_for_search_chars_width_4_not_case_sensitive():
def test_get_arrays_for_search_chars_width_4_not_case_sensitive():
(
search,
ref,
) = spacy.util.get_byte_arrays_for_search_chars("bfé𐌞wf𐌞wÉ", False)
lookup,
) = spacy.util.get_arrays_for_search_chars("bfé𐌞wf𐌞wÉ", False)
assert (
search
== b"\x1e\x03\x01\x00B\x00\x00\x00F\x00\x00\x00W\x00\x00\x00b\x00\x00\x00f\x00\x00\x00w\x00\x00\x00\xc9\x00\x00\x00\xe9\x00\x00\x00"
)
assert (
ref
lookup
== b"\x1e\x03\x01\x00b\x00\x00\x00f\x00\x00\x00w\x00\x00\x00b\x00\x00\x00f\x00\x00\x00w\x00\x00\x00\xe9\x00\x00\x00\xe9\x00\x00\x00"
)
def test_get_byte_arrays_for_search_chars_width_4_case_sensitive():
def test_get_arrays_for_search_chars_width_4_case_sensitive():
(
search,
ref,
) = spacy.util.get_byte_arrays_for_search_chars("bfé𐌞wf𐌞wÉ", True)
assert search == ref
lookup,
) = spacy.util.get_arrays_for_search_chars("bfé𐌞wf𐌞wÉ", True)
assert search == lookup
assert (
ref
lookup
== b"\x1e\x03\x01\x00b\x00\x00\x00f\x00\x00\x00w\x00\x00\x00\xc9\x00\x00\x00\xe9\x00\x00\x00"
)
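The expected byte strings in these tests are simply UTF-32LE encodings, four bytes per character; the case-sensitive width-2 value above can be checked directly (not part of the commit):

```python
search = b"b\x00\x00\x00f\x00\x00\x00w\x00\x00\x00\xe9\x00\x00\x00"
print(search.decode("utf-32-le"))  # bfwé
print(len(search) // 4)            # 4 characters
```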

View File

@@ -18,6 +18,11 @@ ctypedef fused LexemeOrToken:
const_TokenC_ptr
cdef extern from "unicodeobject.h":
bint Py_UNICODE_ISUPPER(Py_UCS4 ch)
Py_UCS4 Py_UNICODE_TOLOWER(Py_UCS4 ch)
cdef int set_children_from_heads(TokenC* tokens, int start, int end) except -1
@@ -33,25 +38,34 @@ cdef int token_by_end(const TokenC* tokens, int length, int end_char) except -2
cdef int [:,:] _get_lca_matrix(Doc, int start, int end)
cdef void _populate_aff_buf(
cdef void _copy_chars(
Py_UCS4* target,
const Py_UCS4* source,
const int length,
const bint to_lower
)
cdef void _set_affixes(
const Py_UCS4* text_buf,
const int tok_idx,
const int tok_len,
Py_UCS4* aff_buf,
const int pref_length,
const int suff_length,
const int pref_len,
const int suff_len,
const bint to_lower
)
cdef void _populate_search_buf(
cdef void _search_for_chars(
const Py_UCS4* text_buf,
const int tok_idx,
const int tok_len,
Py_UCS4* search_buf,
Py_UCS4* ref_buf,
Py_UCS4* lookup_buf,
const int search_buf_len,
Py_UCS4* finding_buf,
const int finding_buf_len,
Py_UCS4* result_buf,
const int result_buf_len,
bint suffs_not_prefs
)

View File

@ -1,7 +1,7 @@
from typing import Callable, Protocol, Iterable, Iterator, Optional
from typing import Union, Tuple, List, Dict, Any, overload
from cymem.cymem import Pool
from thinc.types import Floats1d, Floats2d, Ints2d
from thinc.types import Floats1d, Floats2d, Ints1d, Ints2d
from .span import Span
from .token import Token
from ._dict_proxies import SpanGroups
@@ -177,17 +177,17 @@ class Doc:
def get_character_combination_hashes(
self,
*,
case_sensitive: bool,
pref_lengths: List[int],
suff_lengths: List[int],
cs: bool,
pref_lengths: Ints1d,
suff_lengths: Ints1d,
pref_search_chars: str,
pref_ref_chars: str,
pref_lookup_chars: str,
pref_search_char_length: int,
pref_search_lengths: List[int],
pref_search_lengths: Ints1d,
suff_search_chars: str,
suff_ref_chars: str,
suff_lookup_chars: str,
suff_search_char_length: int,
suff_search_lengths: List[int],
suff_search_lengths: Ints1d,
) -> Ints2d: ...
@staticmethod
def _get_array_attrs() -> Tuple[Any]: ...

View File

@@ -3,6 +3,7 @@ from typing import Set, List
cimport cython
cimport numpy as np
from cpython cimport array
from libc.string cimport memcpy, memcmp, memset
from libc.math cimport sqrt
from libc.stdint cimport int32_t, uint64_t
@@ -105,16 +106,6 @@ class SetEntsDefault(str, Enum):
return list(cls.__members__.keys())
cdef extern from "unicodeobject.h":
Py_UCS4 PyUnicode_READ(int kind, void *data, int index)
void* PyUnicode_DATA(void* o)
void PyUnicode_READY(void * o)
int PyUnicode_KIND(void *data)
int PyUnicode_IS_COMPACT(void *data)
Py_UCS4 Py_UNICODE_TOLOWER(Py_UCS4 ch)
cdef class Doc:
"""A sequence of Token objects. Access sentences and named entities, export
annotations to numpy arrays, losslessly serialize to compressed binary
@@ -1745,103 +1736,129 @@ cdef class Doc:
return output
def get_character_combination_hashes(
self,
def get_character_combination_hashes(self,
*,
bint cs,
pref_lengths: List[int],
suff_lengths: List[int],
char* pref_search,
char* pref_ref,
int pref_s_char_l,
pref_search_lengths: List[int],
char* suff_search,
char* suff_ref,
int suff_s_char_l,
suff_search_lengths: List[int],
const bint cs,
np.ndarray p_lengths,
np.ndarray s_lengths,
const char* ps_search,
const char* ps_lookup,
const int ps_l,
np.ndarray ps_lengths,
const char* ss_search,
const char* ss_lookup,
const int ss_l,
np.ndarray ss_lengths,
):
"""
Returns a 2D NumPy array where the rows represent tokens and the columns represent hashes of various character combinations
derived from the string (text/orth) of each token.
derived from the raw text of each token.
cs: if *False*, the lower-case version of each token string is used as the basis for generating hashes. Note that
if *cs==False*, upper-case characters in *search_chars* will not be found in token strings.
pref_lengths: an integer list specifying the lengths of prefixes to be hashed. For example, if *pref_lengths==[2, 3]*,
Generally:
p_ variables relate to prefixes (affixes starting at the beginning of the word)
s_ variables relate to suffixes (affixes starting at the end of the word)
ps_ variables relate to searches starting at the beginning of the word
ss_ variables relate to searches starting at the end of the word
cs: if *False*, hashes are generated based on the lower-case version of each token.
p_lengths: an Ints1d specifying the lengths of prefixes to be hashed. For example, if *p_lengths==[2, 3]*,
the prefixes hashed for "spaCy" would be "sp" and "spa".
suff_lengths: an integer list specifying the lengths of suffixes to be hashed. For example, if *suff_lengths==[2, 3]* and
*case_sensitive == True*, the suffixes hashed for "spaCy" would be "Cy" and "aCy".
pref_search_chars: a string containing characters to search for within each token, starting at the beginning.
pref_search_lengths: an integer list specifying the lengths of search results to be hashed. For example if
*pref_search_lengths==[1, 2]*, *pref_search_chars=="aC" and *cs==False*, the searched strings hashed for
s_lengths: an Ints1d specifying the lengths of suffixes to be hashed. For example, if *s_lengths==[2, 3]* and
*cs == True*, the suffixes hashed for "spaCy" would be "Cy" and "aCy".
ps_search: a byte array containing characters to search for within each token, starting at the beginning.
ps_lookup: a byte array containing characters that are added to the result string when a character at
the corresponding position in *ps_search* is matched. Having separate search and lookup arrays enables
case-insensitivity to be handled efficiently.
ps_l: the number of characters in *ps_search* and hence also in *ps_lookup*
ps_lengths: an Ints1d specifying the lengths of search results to be hashed. For example, if
*ps_lengths==[1, 2]*, *ps_search=="aC"* and *cs==False*, the searched strings hashed for
"spaCy" would be "a" and "ac".
suff_search_chars: a string containing characters to search for within each token, starting at the end.
suff_search_lengths: an integer list specifying the lengths of search results to be hashed. For example if
ss_search: a byte array containing characters to search for within each token, starting at the end.
ss_lookup: a byte array containing characters that are added to the result string when a character at
the corresponding position in *ss_search* is matched. Having separate search and lookup arrays enables
case-insensitivity to be handled efficiently.
ss_l: the number of characters in *ss_search* and hence also in *ss_lookup*
ss_lengths: an Ints1d specifying the lengths of search results to be hashed. For example, if
*ss_lengths==[1, 2]*, *ss_search=="aC"* and *cs==False*, the searched strings hashed for
"spaCy" would be "c" and "ca".
For a document with tokens ["spaCy", "and", "Prodigy"], the NumPy array returned by
*get_character_combination_hashes(True, [2], [2, 4, 6], "yC", [1], [2])* would correspond to
*get_character_combination_hashes(True, [2], [2, 4, 6], "", "", 0, [], "yC", "yC", 2, [1, 2])* would correspond to
[[hash("sp"), [hash("Cy"), hash("paCy"), hash("spaCy"), hash("y"), hash("yC")],
[hash("an"), hash("nd"), hash("and", hash("and"), hash(" "), hash(" "))],
[[hash("sp"), [hash("Cy"), hash("paCy"), hash(" spaCy"), hash("y"), hash("yC")],
[hash("an"), hash("nd"), hash(" and", hash(" and"), hash(" "), hash(" "))],
[hash("Pr") ,hash("gy"), hash("digy"), hash("rodigy"), hash("y"), hash("y ")]]
"""
cdef int max_pref_l = max(pref_lengths) if len(pref_lengths) > 0 else 0
cdef int max_suff_l = max(suff_lengths) if len(suff_lengths) > 0 else 0
cdef int aff_buf_l = max_pref_l + max_suff_l
cdef int max_s_pref_l = max(pref_search_lengths) if len(pref_search_lengths) > 0 else 0
cdef int max_s_suff_l = max(suff_search_lengths) if len(suff_search_lengths) > 0 else 0
cdef Py_UCS4* aff_buf = <Py_UCS4*>self.mem.alloc(4, aff_buf_l)
cdef Py_UCS4* pref_s_buf = <Py_UCS4*>pref_search
cdef Py_UCS4* pref_r_buf = <Py_UCS4*>pref_ref
cdef Py_UCS4* pref_f_buf = <Py_UCS4*>self.mem.alloc(4, max_s_pref_l)
cdef Py_UCS4* suff_s_buf = <Py_UCS4*>suff_search
cdef Py_UCS4* suff_r_buf = <Py_UCS4*>suff_ref
cdef Py_UCS4* suff_f_buf = <Py_UCS4*>self.mem.alloc(4, max_s_suff_l)
# Encode the document text
cdef bytes encoded_text = self.text.encode("utf-32le")
cdef char* intermediate_text = encoded_text
cdef Py_UCS4* text_buf = <Py_UCS4*> intermediate_text
cdef unsigned int num_toks = len(self), aff_len
cdef unsigned int h_pref_n = len(pref_lengths)
cdef unsigned int h_suff_n = len(suff_lengths), h_suff_end_idx = len(pref_lengths) + len(suff_lengths)
cdef unsigned int h_pref_s_n = len(pref_search_lengths), h_pref_s_end_idx = h_suff_end_idx + h_pref_s_n
cdef unsigned int h_suff_s_n = len(suff_search_lengths), h_suff_s_end_idx = h_pref_s_end_idx + h_suff_s_n
cdef np.ndarray[np.int64_t, ndim=2] hashes = numpy.empty((num_toks, h_suff_s_end_idx), dtype="int64")
# Define the result array and work out what is used for what in axis 1
cdef int num_toks = len(self)
cdef int p_h_num = p_lengths.shape[0]
cdef int s_h_num = s_lengths.shape[0], s_h_end = p_h_num + s_h_num
cdef int ps_h_num = ps_lengths.shape[0], ps_h_end = s_h_end + ps_h_num
cdef int ss_h_num = ss_lengths.shape[0], ss_h_end = ps_h_end + ss_h_num
cdef np.ndarray[np.int64_t, ndim=2] hashes = numpy.empty((num_toks, ss_h_end), dtype="int64")
# Determine the maximum possible lengths of the affixes to work out how big the buffers need to be
cdef int p_max_l = max(p_lengths) if p_h_num > 0 else 0
cdef int s_max_l = max(s_lengths) if s_h_num > 0 else 0
cdef int ps_max_l = max(ps_lengths) if ps_h_num > 0 else 0
cdef int ss_max_l = max(ss_lengths) if ss_h_num > 0 else 0
# Define / allocate buffer (pr/sr: result buffers)
cdef int aff_buf_l = p_max_l + s_max_l
cdef Py_UCS4* aff_buf = <Py_UCS4*> self.mem.alloc(aff_buf_l, sizeof(Py_UCS4))
cdef Py_UCS4* ps_buf = <Py_UCS4*> ps_search
cdef Py_UCS4* pl_buf = <Py_UCS4*> ps_lookup
cdef Py_UCS4* pr_buf = <Py_UCS4*> self.mem.alloc(ps_max_l, sizeof(Py_UCS4))
cdef Py_UCS4* ss_buf = <Py_UCS4*> ss_search
cdef Py_UCS4* sl_buf = <Py_UCS4*> ss_lookup
cdef Py_UCS4* sr_buf = <Py_UCS4*> self.mem.alloc(ss_max_l, sizeof(Py_UCS4))
# Define memory views on length arrays
cdef int[:] p_v = p_lengths
cdef int[:] s_v = s_lengths
cdef int[:] ps_v = ps_lengths
cdef int[:] ss_v = ss_lengths
# Define working variables
cdef TokenC tok_c
cdef int tok_i, tok_idx, tok_len, aff_len
for tok_i in range(num_toks):
tok_c = self.c[tok_i]
tok_idx = tok_c.idx
tok_len = tok_c.lex.length
_populate_aff_buf(text_buf, tok_idx, tok_len, aff_buf, max_pref_l, max_suff_l, not cs)
_populate_search_buf(text_buf, tok_idx, tok_len, pref_s_buf, pref_r_buf, pref_s_char_l, pref_f_buf, max_s_pref_l, False)
_populate_search_buf(text_buf, tok_idx, tok_len, suff_s_buf, suff_r_buf, suff_s_char_l, suff_f_buf, max_s_suff_l, True)
if aff_buf_l > 0:
_set_affixes(text_buf, tok_idx, tok_len, aff_buf, p_max_l, s_max_l, not cs)
for hash_idx in range(h_pref_n):
aff_len = pref_lengths[hash_idx]
hashes[tok_i, hash_idx] = hash32(aff_buf, aff_len * 4, 0)
for hash_idx in range(p_h_num):
hashes[tok_i, hash_idx] = hash32(aff_buf, p_v[hash_idx] * sizeof(Py_UCS4), 0)
for hash_idx in range(h_pref_n, h_suff_end_idx):
aff_len = suff_lengths[hash_idx - h_pref_n]
hashes[tok_i, hash_idx] = hash32(aff_buf + aff_buf_l - aff_len, aff_len * 4, 0)
for hash_idx in range(p_h_num, s_h_end):
aff_len = s_v[hash_idx - p_h_num]
hashes[tok_i, hash_idx] = hash32(aff_buf + aff_buf_l - aff_len, aff_len * sizeof(Py_UCS4), 0)
for hash_idx in range(h_suff_end_idx, h_pref_s_end_idx):
aff_len = pref_search_lengths[hash_idx - h_suff_end_idx]
hashes[tok_i, hash_idx] = hash32(pref_f_buf, aff_len * 4, 0)
if ps_h_num > 0:
_search_for_chars(text_buf, tok_idx, tok_len, ps_buf, pl_buf, ps_l, pr_buf, ps_max_l, False)
for hash_idx in range(s_h_end, ps_h_end):
aff_len = ps_v[hash_idx - s_h_end]
hashes[tok_i, hash_idx] = hash32(pr_buf, aff_len * sizeof(Py_UCS4), 0)
for hash_idx in range(h_pref_s_end_idx, h_suff_s_end_idx):
aff_len = suff_search_lengths[hash_idx - h_pref_s_end_idx]
hashes[tok_i, hash_idx] = hash32(suff_f_buf, aff_len * 4, 0)
if ss_h_num > 0:
_search_for_chars(text_buf, tok_idx, tok_len, ss_buf, sl_buf, ss_l, sr_buf, ss_max_l, True)
for hash_idx in range(ps_h_end, ss_h_end):
aff_len = ss_v[hash_idx - ps_h_end]
hashes[tok_i, hash_idx] = hash32(sr_buf, aff_len * sizeof(Py_UCS4), 0)
self.mem.free(aff_buf)
self.mem.free(pref_f_buf)
self.mem.free(suff_f_buf)
self.mem.free(pr_buf)
self.mem.free(sr_buf)
return hashes
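A hedged usage sketch of the reworked method, assuming this branch; it shows the column layout implied by the index arithmetic above (prefix hashes, then suffix hashes, then prefix-search hashes, then suffix-search hashes):

```python
import spacy
from thinc.api import get_current_ops

nlp = spacy.blank("en")
doc = nlp("spaCy and Prodigy")
ops = get_current_ops()

hashes = doc.get_character_combination_hashes(
    cs=False,
    p_lengths=ops.asarray1i([1, 3]),     # columns 0-1: prefix hashes
    s_lengths=ops.asarray1i([2, 3, 4]),  # columns 2-4: suffix hashes
    ps_search=bytes(), ps_lookup=bytes(), ps_l=0,
    ps_lengths=ops.asarray1i([]),        # no prefix-search columns
    ss_search=bytes(), ss_lookup=bytes(), ss_l=0,
    ss_lengths=ops.asarray1i([]),        # no suffix-search columns
)
print(hashes.shape)  # (3, 5)
```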
@staticmethod
@@ -2025,76 +2042,103 @@ cdef int [:,:] _get_lca_matrix(Doc doc, int start, int end):
return lca_matrix
cdef void _populate_aff_buf(
cdef void _copy_chars(
Py_UCS4* target,
const Py_UCS4* source,
const int length,
const bint to_lower
):
"""Copies *length* Py_UCS4 characters from *source* to *target*. If *to_lower==True*, converts
any upper-case characters to lower case within the target buffer.
"""
memcpy(target, source, length * sizeof(Py_UCS4))
cdef int idx
if to_lower:
for idx in range(length):
if Py_UNICODE_ISUPPER(target[idx]):
target[idx] = Py_UNICODE_TOLOWER(target[idx])
cdef void _set_affixes(
const Py_UCS4* text_buf,
const int tok_idx,
const int tok_len,
Py_UCS4* aff_buf,
const int pref_length,
const int suff_length,
const int pref_len,
const int suff_len,
const bint to_lower
):
""" Populate a buffer of length p+s with the first p and the last s characters of a word within a string.
If the word is shorter than p and/or s, the empty character positions in the middle are filled with zeros.
""" Populate a buffer of length pref+suff with the first pref and the last suff characters of a word within a string.
If the word is shorter than pref and/or suff, the empty character positions in the middle are filled with zeros.
str_data_ptr: a pointer to the raw data in the containing string, which must be in canonical
Unicode form (see PEP 393).
kind: the number of bytes occupied by each character in the containing string.
word_idx: the index of the first character of the word within the containing string.
word_len: the length of the word.
text_buf: a pointer to a UTF-32LE representation of the containing string.
tok_idx: the index of the first character of the word within the containing string.
tok_len: the length of the word.
aff_buf: the buffer to populate.
pref_length: the length of the prefix.
suff_length: the length of the suffix.
pref_len: the length of the prefix.
suff_len: the length of the suffix.
to_lower: if *True*, any upper case characters in either affix are converted to lower case.
"""
cdef int aff_buf_idx = 0, buf_size = pref_length + suff_length, in_word_idx
cdef int aff_buf_idx = 0, aff_buf_len = pref_len + suff_len, in_word_idx, filled_pref_len
while aff_buf_idx < pref_length and aff_buf_idx < tok_len:
if pref_len > 0:
filled_pref_len = pref_len if pref_len < tok_len else tok_len
_copy_chars(aff_buf, text_buf + tok_idx, filled_pref_len, to_lower)
aff_buf_idx = filled_pref_len
memcpy(aff_buf + aff_buf_idx, text_buf + tok_idx + aff_buf_idx, 4)
if to_lower:
aff_buf[aff_buf_idx] = Py_UNICODE_TOLOWER(aff_buf[aff_buf_idx])
aff_buf_idx += 1
if tok_len < pref_len:
memset(aff_buf + aff_buf_idx, 0, sizeof(Py_UCS4) * (pref_len - tok_len))
aff_buf_idx = aff_buf_len - suff_len
if tok_len < suff_len:
memset(aff_buf + aff_buf_idx, 0, sizeof(Py_UCS4) * (suff_len - tok_len))
aff_buf_idx = aff_buf_len - tok_len
if aff_buf_idx < buf_size - tok_len:
# fill out the empty middle part of the buffer with zeros
memset(aff_buf, 0, buf_size - suff_length - aff_buf_idx)
if suff_len > 0:
in_word_idx = aff_buf_idx + tok_len - aff_buf_len
if in_word_idx < pref_len:
memcpy(aff_buf + aff_buf_idx, aff_buf + in_word_idx, sizeof(Py_UCS4) * (filled_pref_len - in_word_idx))
aff_buf_idx += filled_pref_len - in_word_idx
if aff_buf_idx < aff_buf_len:
_copy_chars(aff_buf + aff_buf_idx, text_buf + tok_idx + in_word_idx, aff_buf_len - aff_buf_idx, to_lower)
while aff_buf_idx < buf_size:
in_word_idx = aff_buf_idx + tok_len - buf_size
# for suffixes we have to track the in-word index separately from the in-buffer index
if in_word_idx < pref_length:
# we've already retrieved this character as part of the prefix, so copy it from there
# as that's quicker than retrieving it from the input string a second time
memcpy(aff_buf + aff_buf_idx, aff_buf + in_word_idx, 4)
else:
memcpy(aff_buf + aff_buf_idx, text_buf + tok_idx + in_word_idx, 4)
if to_lower:
aff_buf[aff_buf_idx] = Py_UNICODE_TOLOWER(aff_buf[aff_buf_idx])
aff_buf_idx += 1
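A pure-Python illustration, an assumption rather than code from the commit, of the buffer layout `_set_affixes` is documented to produce: the first `pref_len` characters at the start, the last `suff_len` characters right-aligned at the end, and zeros filling the gap when the token is shorter than an affix:

```python
def set_affixes(token: str, pref_len: int, suff_len: int) -> str:
    buf = ["\0"] * (pref_len + suff_len)
    prefix = token[:pref_len]
    suffix = token[-suff_len:] if suff_len else ""
    buf[:len(prefix)] = prefix                       # prefix at the start
    if suffix:
        buf[pref_len + suff_len - len(suffix):] = suffix  # suffix right-aligned
    return "".join(buf)

print(repr(set_affixes("spacy", 3, 4)))  # 'spapacy'
print(repr(set_affixes("an", 3, 4)))     # 'an\x00\x00\x00an'
```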
cdef void _populate_search_buf(
cdef void _search_for_chars(
const Py_UCS4* text_buf,
const int tok_idx,
const int tok_len,
Py_UCS4* search_buf,
Py_UCS4* ref_buf,
Py_UCS4* lookup_buf,
const int search_buf_len,
Py_UCS4* finding_buf,
const int finding_buf_len,
Py_UCS4* result_buf,
const int result_buf_len,
bint suffs_not_prefs
):
cdef unsigned int finding_buf_idx = 0, text_string_idx = tok_idx + (tok_len - 1) if suffs_not_prefs else tok_idx
cdef unsigned int search_buf_idx
cdef int cmp_res
""" Search a word within a string for characters within *search_buf*, starting at the beginning or
end depending on the value of *suffs_not_prefs*. Wherever a character from *search_buf* matches,
the corresponding character from *lookup_buf* is added to *result_buf*.
while finding_buf_idx < finding_buf_len:
text_buf: a pointer to a UTF-32LE representation of the containing string.
tok_idx: the index of the first character of the word within the containing string.
tok_len: the length of the word.
search_buf: the characters to search for (ordered).
lookup_buf: characters corresponding to *search_buf* to add to *result_buf* in the case of a match.
Having separate search and lookup arrays enables case-insensitivity to be handled efficiently.
search_buf_len: the length of *search_buf* and hence also of *lookup_buf*.
result_buf: the buffer in which to place the results.
result_buf_len: the length of *result_buf*.
suffs_not_prefs: if *True*, searching starts from the end of the word; if *False*, from the beginning.
"""
cdef int result_buf_idx = 0, text_string_idx = tok_idx + (tok_len - 1) if suffs_not_prefs else tok_idx
cdef int search_buf_idx
cdef int cmp_result
while result_buf_idx < result_buf_len:
for search_buf_idx in range (search_buf_len):
cmp_res = memcmp(search_buf + search_buf_idx, text_buf + text_string_idx, 4)
if cmp_res == 0:
memcpy(finding_buf + finding_buf_idx, ref_buf + search_buf_idx, 4)
finding_buf_idx += 1
if cmp_res >= 0:
cmp_result = memcmp(search_buf + search_buf_idx, text_buf + text_string_idx, sizeof(Py_UCS4))
if cmp_result == 0:
memcpy(result_buf + result_buf_idx, lookup_buf + search_buf_idx, sizeof(Py_UCS4))
result_buf_idx += 1
if cmp_result >= 0:
break
if suffs_not_prefs:
if text_string_idx <= tok_idx:
@@ -2105,9 +2149,9 @@ cdef void _populate_search_buf(
if text_string_idx >= tok_idx + tok_len:
break
if finding_buf_idx < finding_buf_len:
memset(finding_buf + finding_buf_idx, 0, finding_buf_len - finding_buf_idx)
# fill in any unused characters in the result buffer with zeros
if result_buf_idx < result_buf_len:
memset(result_buf + result_buf_idx, 0, (result_buf_len - result_buf_idx) * sizeof(Py_UCS4))
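A rough Python counterpart, again an assumption, of the behaviour documented for `_search_for_chars`: scan the token from the start or from the end, emit the lookup character for every match against the search characters, and zero-pad the rest of the result:

```python
def search_for_chars(token: str, search: str, lookup: str, result_len: int,
                     suffs_not_prefs: bool) -> str:
    chars = reversed(token) if suffs_not_prefs else token
    found = [lookup[search.index(ch)] for ch in chars if ch in search]
    found = found[:result_len]
    return "".join(found) + "\0" * (result_len - len(found))

# Case-insensitive search for "rp": search="PRpr", lookup="prpr" (cf. util.py below).
print(repr(search_for_chars("Prodigy", "PRpr", "prpr", 2, False)))  # 'pr'
print(repr(search_for_chars("spaCy", "PRpr", "prpr", 2, True)))     # 'p\x00'
```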
def pickle_doc(doc):

View File

@@ -1737,7 +1737,7 @@ def all_equal(iterable):
return next(g, True) and not next(g, False)
def get_byte_arrays_for_search_chars(
def get_arrays_for_search_chars(
search_chars: str, case_sensitive: bool
) -> Tuple[bytes, bytes]:
"""
@@ -1746,14 +1746,14 @@ def get_byte_arrays_for_search_chars(
for search characters. The encoding is little-endian regardless of architecture, as
this is what is expected by the murmurhash library used downstream.
Alongside the "search byte array" against which words from document texts are compared
is the "ref byte array". When a character from the search byte array is matched,
the character at the corresponding position in the ref byte array is added to the
byte sequence of the configured length that is then hashed. This enables case-sensitivity
Alongside the "search array" against which words from document texts are compared
is the "lookup array". When a character from the search array is matched,
the character at the corresponding position in the lookup array is added to the
sequence that then goes on to be hashed. This enables case-sensitivity
to be handled without converting the case of the words being searched: if
*case_sensitive==False*, the lower- or uppercase counterparts of any characters that
have case are added to the search byte arrays, and both the original character and its
other-cased counterpart map to the lower-case version in the ref byte array.
have case are added to the search array, and both the original character and its
other-cased counterpart map to the lower-case version in the lookup array.
"""
def encode(ch: str) -> bytes:
@@ -1762,8 +1762,8 @@ def get_byte_arrays_for_search_chars(
"""
return ch.encode("UTF-32LE")
def add_to_byte_arrays(
search: List[bytes], ref: List[bytes], ch: str
def add_to_arrays(
search: List[bytes], lookup: List[bytes], ch: str
) -> None:
"""Add the byte representations of *ch* to the two byte array lists.
"""
@@ -1771,36 +1771,36 @@ def get_byte_arrays_for_search_chars(
if not case_sensitive and ch.islower():
if this_char_bytes not in search:
search.append(this_char_bytes)
ref.append(this_char_bytes)
lookup.append(this_char_bytes)
upper_char_bytes = encode(ch.upper())
if upper_char_bytes not in search:
search.append(upper_char_bytes)
ref.append(this_char_bytes)
lookup.append(this_char_bytes)
elif not case_sensitive and ch.isupper():
lower_char_bytes = encode(ch.lower())
if this_char_bytes not in search:
search.append(this_char_bytes)
ref.append(lower_char_bytes)
lookup.append(lower_char_bytes)
if lower_char_bytes not in search:
search.append(lower_char_bytes)
ref.append(lower_char_bytes)
lookup.append(lower_char_bytes)
elif this_char_bytes not in search:
search.append(this_char_bytes)
ref.append(this_char_bytes)
lookup.append(this_char_bytes)
def get_ordered_raw_bytes(
search: List[bytes], ref: List[bytes]
search: List[bytes], lookup: List[bytes]
) -> Tuple[bytes, bytes]:
"""Flatten the two lists, ordering both by the entries in *search*
using the native endianness of the platform.
"""
num_search = [list(entry) for entry in search]
search = [entry for _, entry in sorted(zip(num_search, search))]
ref = [entry for _, entry in sorted(zip(num_search, ref))]
return b"".join(search), b"".join(ref)
lookup = [entry for _, entry in sorted(zip(num_search, lookup))]
return b"".join(search), b"".join(lookup)
search: List[bytes] = []
ref: List[bytes] = []
lookup: List[bytes] = []
for ch in search_chars:
add_to_byte_arrays(search, ref, ch)
return get_ordered_raw_bytes(search, ref)
add_to_arrays(search, lookup, ch)
return get_ordered_raw_bytes(search, lookup)
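A quick check, not part of the commit, tying the function above back to the byte strings asserted in the tests earlier: when `case_sensitive` is `False`, the search array holds both cases of each character sorted by code point, while the lookup array maps every entry to its lower-case form, both encoded as UTF-32LE:

```python
from spacy.util import get_arrays_for_search_chars

search, lookup = get_arrays_for_search_chars("bféwfw", False)
print(search.decode("utf-32-le"))  # BFWbfwÉé
print(lookup.decode("utf-32-le"))  # bfwbfwéé
```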

View File

@@ -218,28 +218,6 @@ whose presence before or after characters that would otherwise alternate
prevents the alternation from occurring, e.g. an `ä` in a German plural noun
does not become `a` if it is the third or fourth vowel from the end of the word.
Internally, the model converts each token string to
[UTF-16](https://www.ietf.org/rfc/rfc2781.txt) and assumes that each character
from the string occupies two bytes. This assumption holds for all characters in
the Basic Multilingual Plane, which encompasses all characters that are ever
likely to be of interest when extracting features. There are, however,
characters such as emoji that lie in the supplementary planes and occupy
four bytes; importantly, neither of the two byte pairs that make up such
a representation can be a valid two-byte character in its own right. The
following considerations apply to the processing of four-byte characters:
- An exceptional four-byte character within a text consisting mostly of two-byte
characters will probably be ignored by the neural network accepting the
embedding layer as not matching any of the learned features.
- If anyone did want to train a model for a language like Lycian that is
generally written in four-byte characters, prefix and suffix features can
still be extracted, but the length specifications should all be doubled, i.e.
`[2,4,6]` to extract one-, two- and three-character affixes. In such a
situation length specifications that are odd numbers would serve no useful
purpose since they would refer to half-characters.
- Four-byte characters are not accepted within search character specification
strings and lead to an error being thrown.
| Name | Description |
| ------------------------ | -------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
| `width` | The output width. Also used as the width of the embedding tables. Recommended values are between `64` and `300`. If static vectors are included, a learned linear layer is used to map the vectors to the specified width before concatenating it with the other embedding outputs. A single maxout layer is then used to reduce the concatenated vectors to the final width. ~~int~~ |