Mirror of https://github.com/explosion/spaCy.git

Intermediate state

commit f7d9942e7c, parent 2707d30ce0
@@ -946,7 +946,6 @@ class Errors(metaclass=ErrorsWithCodes):
              "{value}.")
     E1044 = ("Search characters for '{label}' may not contain upper-case chars where case_sensitive==False.")
     E1045 = ("Invalid rich group config '{label}'.")
-    E1046 = ("Search characters may not contain characters that occupy four bytes in UTF-16.")


     # Deprecated model shortcuts, only used in errors and warnings
@@ -260,27 +260,6 @@ def RichMultiHashEmbed(
     prevents the alternation from occurring, e.g. an `ä` in a German plural noun does
     not become `a` if it is the third or fourth vowel from the end of the word.

-    Internally, the model converts each token string to UTF-16 and assumes that each
-    character from the string occupies two bytes. This assumption holds for all
-    characters in the Basic Multilingual Plane, which encompasses all characters that
-    are ever likely to be of interest when extracting features. There are, however,
-    characters like emojis that are in the Extended Multilingual Plane and occupy
-    four bytes, although importantly neither of the two byte pairs that make up such
-    a representation can be a valid two-byte character in its own right. The
-    following considerations apply to the processing of four-byte characters:
-
-    - An exceptional four-byte character within a text consisting mostly of two-byte
-      characters will probably be ignored by the neural network accepting the
-      embedding layer as not matching any of the learned features.
-    - If anyone did want to train a model for a language like Lycian that is
-      generally written in four-byte characters, prefix and suffix features can
-      still be extracted, but the length specifications should all be doubled, i.e.
-      `[2,4,6]` to extract one-, two- and three-character affixes. In such a
-      situation length specifications that are odd numbers would serve no useful
-      purpose since they would refer to half-characters.
-    - Four-byte characters are not accepted within search character specification
-      strings and lead to an error being thrown.
-
     width (int): The output width. Also used as the width of the embedding tables.
         Recommended values are between 64 and 300.
     attrs (list of attr IDs): The token attributes to embed. A separate
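The likely reason for the two removals above: the implementation in this commit works on a UTF-32LE buffer (see `self.text.encode("utf-32le")` and the `Py_UCS4` pointers in the doc.pyx hunk further down), where every character, including those outside the Basic Multilingual Plane, occupies exactly four bytes, so the UTF-16 surrogate-pair caveats and the E1046 error become unnecessary. A quick illustration:

    >>> "𐌞".encode("utf-16-le")        # outside the BMP: a surrogate pair, 4 bytes
    b'\x00\xd8\x1e\xdf'
    >>> len("✨".encode("utf-16-le"))   # BMP character: 2 bytes
    2
    >>> len("𐌞".encode("utf-32-le")), len("✨".encode("utf-32-le"))  # UTF-32: always 4
    (4, 4)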
@@ -1,7 +1,7 @@
 from typing import List, Optional, Callable, Tuple
-from ..util import get_byte_arrays_for_search_chars
-from thinc.types import Ints2d
-from thinc.api import Model, registry
+from ..util import get_arrays_for_search_chars
+from thinc.types import Ints1d, Ints2d
+from thinc.api import Model, registry, get_current_ops

 from ..tokens import Doc

@@ -17,33 +17,46 @@ def RichFeatureExtractor(
     suff_search_chars: Optional[str] = None,
     suff_search_lengths: Optional[List[int]] = None,
 ) -> Model[List[Doc], List[Ints2d]]:
+    ops = get_current_ops()
     if pref_search_chars is not None:
-        pref_search, pref_ref = get_byte_arrays_for_search_chars(pref_search_chars, case_sensitive)
+        pref_search, pref_lookup = get_arrays_for_search_chars(
+            pref_search_chars, case_sensitive
+        )
     else:
-        pref_search, pref_ref = bytes(), bytes()
+        pref_search, pref_lookup = bytes(), bytes()
     if suff_search_chars is not None:
-        suff_search, suff_ref = get_byte_arrays_for_search_chars(suff_search_chars, case_sensitive)
+        suff_search, suff_lookup = get_arrays_for_search_chars(
+            suff_search_chars, case_sensitive
+        )
     else:
-        suff_search, suff_ref = bytes(), bytes()
+        suff_search, suff_lookup = bytes(), bytes()
     return Model(
         "extract_character_combination_hashes",
         forward,
         attrs={
             "case_sensitive": case_sensitive,
-            "pref_lengths": pref_lengths if pref_lengths is not None else [],
-            "suff_lengths": suff_lengths if suff_lengths is not None else [],
+            "pref_lengths": ops.asarray1i(pref_lengths)
+            if pref_lengths is not None
+            else ops.asarray1i([]),
+            "suff_lengths": ops.asarray1i(suff_lengths)
+            if suff_lengths is not None
+            else ops.asarray1i([]),
             "pref_search": pref_search,
-            "pref_ref": pref_ref,
-            "pref_s_char_l": len(pref_search) / 4 if pref_search_chars is not None else 0,
-            "pref_search_lengths": pref_search_lengths
+            "pref_lookup": pref_lookup,
+            "pref_search_char_len": len(pref_search) / 4
+            if pref_search_chars is not None
+            else 0,
+            "pref_search_lengths": ops.asarray1i(pref_search_lengths)
             if pref_search_lengths is not None
-            else [],
+            else ops.asarray1i([]),
             "suff_search": suff_search,
-            "suff_ref": suff_ref,
-            "suff_s_char_l": len(suff_search) / 4 if suff_search_chars is not None else 0,
-            "suff_search_lengths": suff_search_lengths
+            "suff_lookup": suff_lookup,
+            "suff_search_char_len": len(suff_search) / 4
+            if suff_search_chars is not None
+            else 0,
+            "suff_search_lengths": ops.asarray1i(suff_search_lengths)
             if suff_search_lengths is not None
-            else [],
+            else ops.asarray1i([]),
         },
     )

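The main mechanical change in this factory is that length specifications are now stored on the model as Thinc integer arrays rather than Python lists, which lets the Cython layer take typed `int[:]` memoryviews over them. For illustration (standard Thinc API):

    from thinc.api import get_current_ops

    ops = get_current_ops()
    lengths = ops.asarray1i([1, 3, 5])   # Ints1d: a contiguous int32 array
    assert lengths.shape == (3,) and str(lengths.dtype) == "int32"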
@@ -53,30 +66,30 @@ def forward(
 ) -> Tuple[List[Ints2d], Callable]:
     ops = model.ops
     case_sensitive: bool = model.attrs["case_sensitive"]
-    pref_lengths: List[int] = model.attrs["pref_lengths"]
-    suff_lengths: List[int] = model.attrs["suff_lengths"]
+    pref_lengths: Ints1d = model.attrs["pref_lengths"]
+    suff_lengths: Ints1d = model.attrs["suff_lengths"]
     pref_search: bytes = model.attrs["pref_search"]
-    pref_ref: bytes = model.attrs["pref_ref"]
-    pref_s_char_l: int = model.attr["pref_s_char_l"]
-    pref_search_lengths: List[int] = model.attrs["pref_search_lengths"]
+    pref_lookup: bytes = model.attrs["pref_lookup"]
+    pref_search_char_len: int = model.attrs["pref_search_char_len"]
+    pref_search_lengths: Ints1d = model.attrs["pref_search_lengths"]
     suff_search: bytes = model.attrs["suff_search"]
-    suff_ref: bytes = model.attrs["suff_ref"]
-    suff_s_char_l: int = model.attr["suff_s_char_l"]
-    suff_search_lengths: List[int] = model.attrs["suff_search_lengths"]
+    suff_lookup: bytes = model.attrs["suff_lookup"]
+    suff_search_char_len: int = model.attrs["suff_search_char_len"]
+    suff_search_lengths: Ints1d = model.attrs["suff_search_lengths"]
     features: List[Ints2d] = []
     for doc in docs:
         hashes = doc.get_character_combination_hashes(
-            case_sensitive=case_sensitive,
-            pref_lengths=pref_lengths,
-            suff_lengths=suff_lengths,
-            pref_search=pref_search,
-            pref_ref=pref_ref,
-            pref_s_char_l=pref_s_char_l,
-            pref_search_lengths=pref_search_lengths,
-            suff_search=suff_search,
-            suff_ref=suff_ref,
-            suff_s_char_l=suff_s_char_l,
-            suff_search_lengths=suff_search_lengths,
+            cs=case_sensitive,
+            p_lengths=pref_lengths,
+            s_lengths=suff_lengths,
+            ps_search=pref_search,
+            ps_lookup=pref_lookup,
+            ps_l=pref_search_char_len,
+            ps_lengths=pref_search_lengths,
+            ss_search=suff_search,
+            ss_lookup=suff_lookup,
+            ss_l=suff_search_char_len,
+            ss_lengths=suff_search_lengths,
         )
         features.append(ops.asarray2i(hashes))

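For orientation, the hash array each document contributes has one row per token, with columns grouped in the order of the four keyword groups above; this follows from the index arithmetic (`p_h_num`, `s_h_end`, `ps_h_end`, `ss_h_end`) in the doc.pyx hunk further down:

    # n_p = len(p_lengths), n_s = len(s_lengths), n_ps = len(ps_lengths), n_ss = len(ss_lengths)
    # hashes[:, :n_p]                          prefix hashes
    # hashes[:, n_p : n_p + n_s]               suffix hashes
    # hashes[:, n_p + n_s : n_p + n_s + n_ps]  prefix-search hashes
    # hashes[:, n_p + n_s + n_ps :]            suffix-search hashes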
@@ -14,7 +14,7 @@ from spacy.lang.xx import MultiLanguage
 from spacy.language import Language
 from spacy.lexeme import Lexeme
 from spacy.tokens import Doc, Span, SpanGroup, Token
-from spacy.util import get_byte_arrays_for_search_chars
+from spacy.util import get_arrays_for_search_chars
 from spacy.vocab import Vocab

 from .test_underscore import clean_underscore  # noqa: F401
@@ -1004,21 +1004,22 @@ def _get_unsigned_32_bit_hash(input: str) -> int:

 @pytest.mark.parametrize("case_sensitive", [True, False])
 def test_get_character_combination_hashes_good_case(en_tokenizer, case_sensitive):

     doc = en_tokenizer("spaCy✨ and Prodigy")
-    suff_search, suff_ref = get_byte_arrays_for_search_chars("xx✨rp", case_sensitive)
+    ops = get_current_ops()
+    pref_search, pref_lookup = get_arrays_for_search_chars("Rp", case_sensitive)
+    suff_search, suff_lookup = get_arrays_for_search_chars("xx✨rp", case_sensitive)
     hashes = doc.get_character_combination_hashes(
         cs=case_sensitive,
-        pref_lengths=[1, 4, 3],
-        suff_lengths=[2, 3, 4, 5],
-        pref_search=bytes(),
-        pref_ref=bytes(),
-        pref_s_char_l = 0,
-        pref_search_lengths=[2],
-        suff_search=suff_search,
-        suff_ref=suff_ref,
-        suff_s_char_l=5 if case_sensitive else 9,
-        suff_search_lengths=[2,1],
+        p_lengths=ops.asarray1i([1, 4, 3]),
+        s_lengths=ops.asarray1i([2, 3, 4, 5]),
+        ps_search=pref_search,
+        ps_lookup=pref_lookup,
+        ps_l=2 if case_sensitive else 4,
+        ps_lengths=ops.asarray1i([2]),
+        ss_search=suff_search,
+        ss_lookup=suff_lookup,
+        ss_l=5 if case_sensitive else 9,
+        ss_lengths=ops.asarray1i([2, 1]),
     )

     assert hashes[0][0] == _get_unsigned_32_bit_hash("s")
@@ -1035,7 +1036,7 @@ def test_get_character_combination_hashes_good_case(en_tokenizer, case_sensitive
         "spaCy" if case_sensitive else "spacy"
     )

-    assert hashes[0][7] == _get_unsigned_32_bit_hash("  ")
+    assert hashes[0][7] == _get_unsigned_32_bit_hash("p ")
     assert hashes[0][8] == _get_unsigned_32_bit_hash("p ")
     assert hashes[0][9] == _get_unsigned_32_bit_hash("p")
     assert hashes[1][0] == _get_unsigned_32_bit_hash("✨")
|
@ -1067,7 +1068,7 @@ def test_get_character_combination_hashes_good_case(en_tokenizer, case_sensitive
|
||||||
assert hashes[3][4] == _get_unsigned_32_bit_hash("igy")
|
assert hashes[3][4] == _get_unsigned_32_bit_hash("igy")
|
||||||
assert hashes[3][5] == _get_unsigned_32_bit_hash("digy")
|
assert hashes[3][5] == _get_unsigned_32_bit_hash("digy")
|
||||||
assert hashes[3][6] == _get_unsigned_32_bit_hash("odigy")
|
assert hashes[3][6] == _get_unsigned_32_bit_hash("odigy")
|
||||||
assert hashes[3][7] == _get_unsigned_32_bit_hash(" ")
|
assert hashes[3][7] == _get_unsigned_32_bit_hash(" " if case_sensitive else "pr")
|
||||||
|
|
||||||
assert hashes[3][9] == _get_unsigned_32_bit_hash("r")
|
assert hashes[3][9] == _get_unsigned_32_bit_hash("r")
|
||||||
|
|
||||||
|
@@ -1077,73 +1078,93 @@ def test_get_character_combination_hashes_good_case(en_tokenizer, case_sensitive
     assert hashes[3][8] == _get_unsigned_32_bit_hash("rp")

     # check values are the same cross-platform
-    assert hashes[0][1] == 753329845 if case_sensitive else 18446744071614199016
-    assert hashes[1][3] == 3425774424
-    assert hashes[2][8] == 3076404432
+    if case_sensitive:
+        assert hashes[0][1] == 3712103410
+    else:
+        assert hashes[0][1] == 307339932
+        assert hashes[1][3] == 2414314354
+        assert hashes[2][8] == 1669671676


-def test_get_character_combination_hashes_4_byte_char_at_end(en_tokenizer):
-    doc = en_tokenizer("and𐌞")
-    suff_search, suff_ref = get_byte_arrays_for_search_chars("a", True)
+def test_get_character_combination_hashes_good_case_partial(en_tokenizer):
+    doc = en_tokenizer("spaCy✨ and Prodigy")
+    ops = get_current_ops()
+    pref_search, pref_lookup = get_arrays_for_search_chars("rp", False)
     hashes = doc.get_character_combination_hashes(
-        cs=True,
-        pref_lengths=[],
-        suff_lengths=[1, 2, 3],
-        pref_search=bytes(),
-        pref_ref=bytes(),
-        pref_s_char_l = 0,
-        pref_search_lengths=[],
-        suff_search=suff_search,
-        suff_ref=suff_ref,
-        suff_s_char_l=1,
-        suff_search_lengths=[1],
+        cs=False,
+        p_lengths=ops.asarray1i([]),
+        s_lengths=ops.asarray1i([2, 3, 4, 5]),
+        ps_search=pref_search,
+        ps_lookup=pref_lookup,
+        ps_l=4,
+        ps_lengths=ops.asarray1i([2]),
+        ss_search=bytes(),
+        ss_lookup=bytes(),
+        ss_l=0,
+        ss_lengths=ops.asarray1i([]),
     )
-    assert hashes[0][0] == _get_unsigned_32_bit_hash("𐌞")
-    assert hashes[0][1] == _get_unsigned_32_bit_hash("d𐌞")
-    assert hashes[0][2] == _get_unsigned_32_bit_hash("nd𐌞")
-    assert hashes[0][3] == _get_unsigned_32_bit_hash("a")
-
-
-def test_get_character_combination_hashes_4_byte_char_in_middle(en_tokenizer):
-    doc = en_tokenizer("and𐌞a")
-    hashes = doc.get_character_combination_hashes(
-        case_sensitive=False,
-        pref_lengths=[],
-        suff_lengths=[1, 2, 3, 4],
-        pref_search_chars="",
-        pref_search_lengths=[],
-        suff_search_chars="a",
-        suff_search_lengths=[1, 2],
-    )
-    assert hashes[0][0] == _get_unsigned_32_bit_hash("a")
-    assert hashes[0][2] == _get_unsigned_32_bit_hash("𐌞a")
-    assert hashes[0][3] == _get_unsigned_32_bit_hash("d𐌞a")
-    assert hashes[0][4] == _get_unsigned_32_bit_hash("a")
-    assert hashes[0][5] == _get_unsigned_32_bit_hash("aa")
-
-
-def test_get_character_combination_hashes_4_byte_special_char(en_tokenizer):
-    doc = en_tokenizer("and𐌞")
-    with pytest.raises(ValueError):
-        doc.get_character_combination_hashes(
-            case_sensitive=True,
-            pref_lengths=[],
-            suff_lengths=[2, 3, 4, 5],
-            pref_search_chars="",
-            pref_search_lengths=[],
-            suff_search_chars="𐌞",
-            suff_search_lengths=[2],
-        )
+
+    assert hashes[0][0] == _get_unsigned_32_bit_hash("cy")
+    assert hashes[0][1] == _get_unsigned_32_bit_hash("acy")
+    assert hashes[0][2] == _get_unsigned_32_bit_hash("pacy")
+    assert hashes[0][3] == _get_unsigned_32_bit_hash("spacy")
+    assert hashes[0][4] == _get_unsigned_32_bit_hash("p ")
+    assert hashes[1][0] == _get_unsigned_32_bit_hash(" ✨")
+    assert hashes[1][1] == _get_unsigned_32_bit_hash("  ✨")
+    assert hashes[1][2] == _get_unsigned_32_bit_hash("   ✨")
+    assert hashes[1][3] == _get_unsigned_32_bit_hash("    ✨")
+    assert hashes[1][4] == _get_unsigned_32_bit_hash("  ")
+    assert hashes[2][0] == _get_unsigned_32_bit_hash("nd")
+    assert hashes[2][1] == _get_unsigned_32_bit_hash("and")
+    assert hashes[2][2] == _get_unsigned_32_bit_hash(" and")
+    assert hashes[2][3] == _get_unsigned_32_bit_hash("  and")
+    assert hashes[2][4] == _get_unsigned_32_bit_hash("  ")
+    assert hashes[3][0] == _get_unsigned_32_bit_hash("gy")
+    assert hashes[3][1] == _get_unsigned_32_bit_hash("igy")
+    assert hashes[3][2] == _get_unsigned_32_bit_hash("digy")
+    assert hashes[3][3] == _get_unsigned_32_bit_hash("odigy")
+    assert hashes[3][4] == _get_unsigned_32_bit_hash("pr")
+
+
+def test_get_character_combination_hashes_copying_in_middle(en_tokenizer):
+    doc = en_tokenizer("sp𐌞Cé")
+    ops = get_current_ops()
+
+    for p_length in range(1, 8):
+        for s_length in range(1, 8):
+            hashes = doc.get_character_combination_hashes(
+                cs=False,
+                p_lengths=ops.asarray1i([p_length]),
+                s_lengths=ops.asarray1i([s_length]),
+                ps_search=bytes(),
+                ps_lookup=bytes(),
+                ps_l=0,
+                ps_lengths=ops.asarray1i([]),
+                ss_search=bytes(),
+                ss_lookup=bytes(),
+                ss_l=0,
+                ss_lengths=ops.asarray1i([]),
+            )
+
+            assert hashes[0][0] == _get_unsigned_32_bit_hash("sp𐌞cé  "[:p_length])
+            assert hashes[0][1] == _get_unsigned_32_bit_hash("   sp𐌞cé"[8 - s_length :])


 def test_character_combination_hashes_empty_lengths(en_tokenizer):
     doc = en_tokenizer("and𐌞")
-    assert doc.get_character_combination_hashes(
-        case_sensitive=True,
-        pref_lengths=[],
-        suff_lengths=[],
-        pref_search_chars="",
-        pref_search_lengths=[],
-        suff_search_chars="",
-        suff_search_lengths=[],
+    ops = get_current_ops()
+    hashes = doc.get_character_combination_hashes(
+        cs=True,
+        p_lengths=ops.asarray1i([]),
+        s_lengths=ops.asarray1i([]),
+        ps_search=bytes(),
+        ps_lookup=bytes(),
+        ps_l=0,
+        ps_lengths=ops.asarray1i([]),
+        ss_search=bytes(),
+        ss_lookup=bytes(),
+        ss_l=0,
+        ss_lengths=ops.asarray1i([]),
     ).shape == (1, 0)

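A plain-Python sanity check for the search expectations in the partial test above (illustrative only; the padding character follows the literals as rendered above):

    def search_from_start(word: str, chars: str = "rp", n: int = 2, pad: str = " ") -> str:
        # Collect characters of the lower-cased word that occur in `chars`,
        # scanning from the start, then pad the result out to length n.
        found = [ch for ch in word.lower() if ch in chars]
        return "".join(found[:n]).ljust(n, pad)

    assert search_from_start("spaCy") == "p "     # hashes[0][4]
    assert search_from_start("and") == "  "       # hashes[2][4]
    assert search_from_start("Prodigy") == "pr"   # hashes[3][4]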
@@ -1,13 +1,13 @@
 import spacy


-def test_get_byte_arrays_for_search_chars_width_2_not_case_sensitive():
+def test_get_arrays_for_search_chars_width_2_not_case_sensitive():
     (
         search,
-        ref,
-    ) = spacy.util.get_byte_arrays_for_search_chars("bféwfw", False)
+        lookup,
+    ) = spacy.util.get_arrays_for_search_chars("bféwfw", False)
     assert (
-        ref
+        lookup
         == b"b\x00\x00\x00f\x00\x00\x00w\x00\x00\x00b\x00\x00\x00f\x00\x00\x00w\x00\x00\x00\xe9\x00\x00\x00\xe9\x00\x00\x00"
     )

@@ -17,39 +17,39 @@ def test_get_byte_arrays_for_search_chars_width_2_not_case_sensitive():
     )


-def test_get_byte_arrays_for_search_chars_width_2_case_sensitive():
+def test_get_arrays_for_search_chars_width_2_case_sensitive():
     (
         search,
-        ref,
-    ) = spacy.util.get_byte_arrays_for_search_chars("bféwfw", True)
+        lookup,
+    ) = spacy.util.get_arrays_for_search_chars("bféwfw", True)
     assert (
-        ref == search == b"b\x00\x00\x00f\x00\x00\x00w\x00\x00\x00\xe9\x00\x00\x00"
+        lookup == search == b"b\x00\x00\x00f\x00\x00\x00w\x00\x00\x00\xe9\x00\x00\x00"
     )


-def test_get_byte_arrays_for_search_chars_width_4_not_case_sensitive():
+def test_get_arrays_for_search_chars_width_4_not_case_sensitive():
     (
         search,
-        ref,
-    ) = spacy.util.get_byte_arrays_for_search_chars("bfé𐌞wf𐌞wÉ", False)
+        lookup,
+    ) = spacy.util.get_arrays_for_search_chars("bfé𐌞wf𐌞wÉ", False)
     assert (
         search
         == b"\x1e\x03\x01\x00B\x00\x00\x00F\x00\x00\x00W\x00\x00\x00b\x00\x00\x00f\x00\x00\x00w\x00\x00\x00\xc9\x00\x00\x00\xe9\x00\x00\x00"
     )

     assert (
-        ref
+        lookup
         == b"\x1e\x03\x01\x00b\x00\x00\x00f\x00\x00\x00w\x00\x00\x00b\x00\x00\x00f\x00\x00\x00w\x00\x00\x00\xe9\x00\x00\x00\xe9\x00\x00\x00"
     )


-def test_get_byte_arrays_for_search_chars_width_4_case_sensitive():
+def test_get_arrays_for_search_chars_width_4_case_sensitive():
     (
         search,
-        ref,
-    ) = spacy.util.get_byte_arrays_for_search_chars("bfé𐌞wf𐌞wÉ", True)
-    assert search == ref
+        lookup,
+    ) = spacy.util.get_arrays_for_search_chars("bfé𐌞wf𐌞wÉ", True)
+    assert search == lookup
     assert (
-        ref
+        lookup
         == b"\x1e\x03\x01\x00b\x00\x00\x00f\x00\x00\x00w\x00\x00\x00\xc9\x00\x00\x00\xe9\x00\x00\x00"
     )
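To decode the expected byte strings in these tests: every four bytes form one UTF-32LE character, and entries are ordered by the search array's raw byte values, which is why 𐌞 (U+1031E) sorts first:

    >>> "𐌞".encode("utf-32-le").hex()   # the b"\x1e\x03\x01\x00" at the start above
    '1e030100'
    >>> "É".encode("utf-32-le").hex(), "é".encode("utf-32-le").hex()
    ('c9000000', 'e9000000')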
@@ -18,6 +18,11 @@ ctypedef fused LexemeOrToken:
     const_TokenC_ptr


+cdef extern from "unicodeobject.h":
+    bint Py_UNICODE_ISUPPER(Py_UCS4 ch)
+    Py_UCS4 Py_UNICODE_TOLOWER(Py_UCS4 ch)
+
+
 cdef int set_children_from_heads(TokenC* tokens, int start, int end) except -1

@@ -33,25 +38,34 @@ cdef int token_by_end(const TokenC* tokens, int length, int end_char) except -2
 cdef int [:,:] _get_lca_matrix(Doc, int start, int end)


-cdef void _populate_aff_buf(
+cdef void _copy_chars(
+    Py_UCS4* target,
+    const Py_UCS4* source,
+    const int length,
+    const bint to_lower
+)
+
+
+cdef void _set_affixes(
     const Py_UCS4* text_buf,
     const int tok_idx,
     const int tok_len,
     Py_UCS4* aff_buf,
-    const int pref_length,
-    const int suff_length,
+    const int pref_len,
+    const int suff_len,
     const bint to_lower
 )

-cdef void _populate_search_buf(
+
+cdef void _search_for_chars(
     const Py_UCS4* text_buf,
     const int tok_idx,
     const int tok_len,
     Py_UCS4* search_buf,
-    Py_UCS4* ref_buf,
+    Py_UCS4* lookup_buf,
     const int search_buf_len,
-    Py_UCS4* finding_buf,
-    const int finding_buf_len,
+    Py_UCS4* result_buf,
+    const int result_buf_len,
     bint suffs_not_prefs
 )

@@ -1,7 +1,7 @@
 from typing import Callable, Protocol, Iterable, Iterator, Optional
 from typing import Union, Tuple, List, Dict, Any, overload
 from cymem.cymem import Pool
-from thinc.types import Floats1d, Floats2d, Ints2d
+from thinc.types import Floats1d, Floats2d, Ints1d, Ints2d
 from .span import Span
 from .token import Token
 from ._dict_proxies import SpanGroups
@@ -177,17 +177,17 @@ class Doc:
     def get_character_combination_hashes(
         self,
         *,
-        case_sensitive: bool,
-        pref_lengths: List[int],
-        suff_lengths: List[int],
+        cs: bool,
+        pref_lengths: Ints1d,
+        suff_lengths: Ints1d,
         pref_search_chars: str,
-        pref_ref_chars: str,
+        pref_lookup_chars: str,
         pref_search_char_length: int,
-        pref_search_lengths: List[int],
+        pref_search_lengths: Ints1d,
         suff_search_chars: str,
-        suff_ref_chars: str,
+        suff_lookup_chars: str,
         suff_search_char_length: int,
-        suff_search_lengths: List[int],
+        suff_search_lengths: Ints1d,
     ) -> Ints2d: ...
     @staticmethod
     def _get_array_attrs() -> Tuple[Any]: ...
@@ -3,6 +3,7 @@ from typing import Set, List

 cimport cython
 cimport numpy as np
+from cpython cimport array
 from libc.string cimport memcpy, memcmp, memset
 from libc.math cimport sqrt
 from libc.stdint cimport int32_t, uint64_t
@@ -105,16 +106,6 @@ class SetEntsDefault(str, Enum):
         return list(cls.__members__.keys())


-cdef extern from "unicodeobject.h":
-    Py_UCS4 PyUnicode_READ(int kind, void *data, int index)
-    void* PyUnicode_DATA(void* o)
-    void PyUnicode_READY(void * o)
-    int PyUnicode_KIND(void *data)
-    int PyUnicode_IS_COMPACT(void *data)
-
-    Py_UCS4 Py_UNICODE_TOLOWER(Py_UCS4 ch)
-
-
 cdef class Doc:
     """A sequence of Token objects. Access sentences and named entities, export
     annotations to numpy arrays, losslessly serialize to compressed binary
@@ -1745,103 +1736,129 @@ cdef class Doc:
         return output


-    def get_character_combination_hashes(
-        self,
+    def get_character_combination_hashes(self,
         *,
-        bint cs,
-        pref_lengths: List[int],
-        suff_lengths: List[int],
-        char* pref_search,
-        char* pref_ref,
-        int pref_s_char_l,
-        pref_search_lengths: List[int],
-        char* suff_search,
-        char* suff_ref,
-        int suff_s_char_l,
-        suff_search_lengths: List[int],
+        const bint cs,
+        np.ndarray p_lengths,
+        np.ndarray s_lengths,
+        const char* ps_search,
+        const char* ps_lookup,
+        const int ps_l,
+        np.ndarray ps_lengths,
+        const char* ss_search,
+        const char* ss_lookup,
+        const int ss_l,
+        np.ndarray ss_lengths,
     ):
         """
         Returns a 2D NumPy array where the rows represent tokens and the columns represent hashes of various character combinations
-        derived from the string (text/orth) of each token.
+        derived from the raw text of each token.

+        Generally:
+        p_ variables relate to prefixes (affixes starting at the beginning of the word)
+        s_ variables relate to suffixes (affixes starting at the end of the word)
+        ps_ variables relate to searches starting at the beginning of the word
+        ss_ variables relate to searches starting at the end of the word
+
-        cs: if *False*, the lower-case version of each token string is used as the basis for generating hashes. Note that
-            if *cs==False*, upper-case characters in *search_chars* will not be found in token strings.
-        pref_lengths: an integer list specifying the lengths of prefixes to be hashed. For example, if *pref_lengths==[2, 3]*,
+        cs: if *False*, hashes are generated based on the lower-case version of each token.
+        p_lengths: an Ints1d specifying the lengths of prefixes to be hashed. For example, if *p_lengths==[2, 3]*,
             the prefixes hashed for "spaCy" would be "sp" and "spa".
-        suff_lengths: an integer list specifying the lengths of suffixes to be hashed. For example, if *suff_lengths==[2, 3]* and
-            *case_sensitive == True*, the suffixes hashed for "spaCy" would be "Cy" and "aCy".
-        pref_search_chars: a string containing characters to search for within each token, starting at the beginning.
-        pref_search_lengths: an integer list specifying the lengths of search results to be hashed. For example if
-            *pref_search_lengths==[1, 2]*, *pref_search_chars=="aC"* and *cs==False*, the searched strings hashed for
+        s_lengths: an Ints1d specifying the lengths of suffixes to be hashed. For example, if *s_lengths==[2, 3]* and
+            *cs == True*, the suffixes hashed for "spaCy" would be "Cy" and "aCy".
+        ps_search: a byte array containing characters to search for within each token, starting at the beginning.
+        ps_lookup: a byte array containing characters that are added to the result string when a character at
+            the corresponding position in *ps_search* is matched. Having separate search and lookup arrays enables
+            case-insensitivity to be handled efficiently.
+        ps_l: the number of characters in *ps_search* and hence also in *ps_lookup*
+        ps_lengths: an Ints1d specifying the lengths of search results to be hashed. For example if
+            *ps_lengths==[1, 2]*, *ps_search=="aC"* and *cs==False*, the searched strings hashed for
             "spaCy" would be "a" and "ac".
-        suff_search_chars: a string containing characters to search for within each token, starting at the end.
-        suff_search_lengths: an integer list specifying the lengths of search results to be hashed. For example if
+        ss_search: a byte array containing characters to search for within each token, starting at the end.
+        ss_lookup: a byte array containing characters that are added to the result string when a character at
+            the corresponding position in *ss_search* is matched. Having separate search and lookup arrays enables
+            case-insensitivity to be handled efficiently.
+        ss_l: the number of characters in *ss_search* and hence also in *ss_lookup*
+        ss_lengths: an integer list specifying the lengths of search results to be hashed. For example if
             *suff_search_lengths==[1, 2]*, *suff_search_chars=="aC"* and *cs==False*, the searched strings hashed for
             "spaCy" would be "c" and "ca".

         For a document with tokens ["spaCy", "and", "Prodigy"], the NumPy array returned by
-        *get_character_combination_hashes(True, [2], [2, 4, 6], "yC", [1], [2])* would correspond to
+        *get_character_combination_hashes(True, [2], [2, 4, 6], "", "", 0, [], "yC", "yC", 2, [1, 2])* would correspond to

-        [[hash("sp"), hash("Cy"), hash("paCy"), hash("spaCy"), hash("y"), hash("yC")],
-        [hash("an"), hash("nd"), hash("and"), hash("and"), hash(" "), hash("  ")],
+        [[hash("sp"), hash("Cy"), hash("paCy"), hash(" spaCy"), hash("y"), hash("yC")],
+        [hash("an"), hash("nd"), hash(" and"), hash("   and"), hash(" "), hash("  ")],
         [hash("Pr") ,hash("gy"), hash("digy"), hash("rodigy"), hash("y"), hash("y ")]]
         """

-        cdef int max_pref_l = max(pref_lengths) if len(pref_lengths) > 0 else 0
-        cdef int max_suff_l = max(suff_lengths) if len(suff_lengths) > 0 else 0
-        cdef int aff_buf_l = max_pref_l + max_suff_l
-        cdef int max_s_pref_l = max(pref_search_lengths) if len(pref_search_lengths) > 0 else 0
-        cdef int max_s_suff_l = max(suff_search_lengths) if len(suff_search_lengths) > 0 else 0
-
-        cdef Py_UCS4* aff_buf = <Py_UCS4*>self.mem.alloc(4, aff_buf_l)
-        cdef Py_UCS4* pref_s_buf = <Py_UCS4*>pref_search
-        cdef Py_UCS4* pref_r_buf = <Py_UCS4*>pref_ref
-        cdef Py_UCS4* pref_f_buf = <Py_UCS4*>self.mem.alloc(4, max_s_pref_l)
-        cdef Py_UCS4* suff_s_buf = <Py_UCS4*>suff_search
-        cdef Py_UCS4* suff_r_buf = <Py_UCS4*>suff_ref
-        cdef Py_UCS4* suff_f_buf = <Py_UCS4*>self.mem.alloc(4, max_s_suff_l)
-
+        # Encode the document text
         cdef bytes encoded_text = self.text.encode("utf-32le")
         cdef char* intermediate_text = encoded_text
        cdef Py_UCS4* text_buf = <Py_UCS4*> intermediate_text

-        cdef unsigned int num_toks = len(self), aff_len
-        cdef unsigned int h_pref_n = len(pref_lengths)
-        cdef unsigned int h_suff_n = len(suff_lengths), h_suff_end_idx = len(pref_lengths) + len(suff_lengths)
-        cdef unsigned int h_pref_s_n = len(pref_search_lengths), h_pref_s_end_idx = h_suff_end_idx + h_pref_s_n
-        cdef unsigned int h_suff_s_n = len(suff_search_lengths), h_suff_s_end_idx = h_pref_s_end_idx + h_suff_s_n
-        cdef np.ndarray[np.int64_t, ndim=2] hashes = numpy.empty((num_toks, h_suff_s_end_idx), dtype="int64")
+        # Define the result array and work out what is used for what in axis 1
+        cdef int num_toks = len(self)
+        cdef int p_h_num = p_lengths.shape[0]
+        cdef int s_h_num = s_lengths.shape[0], s_h_end = p_h_num + s_h_num
+        cdef int ps_h_num = ps_lengths.shape[0], ps_h_end = s_h_end + ps_h_num
+        cdef int ss_h_num = ss_lengths.shape[0], ss_h_end = ps_h_end + ss_h_num
+        cdef np.ndarray[np.int64_t, ndim=2] hashes = numpy.empty((num_toks, ss_h_end), dtype="int64")

+        # Determine the maximum possible lengths of the affixes to work out how big the buffers need to be
+        cdef int p_max_l = max(p_lengths) if p_h_num > 0 else 0
+        cdef int s_max_l = max(s_lengths) if s_h_num > 0 else 0
+        cdef int ps_max_l = max(ps_lengths) if ps_h_num > 0 else 0
+        cdef int ss_max_l = max(ss_lengths) if ss_h_num > 0 else 0
+
+        # Define / allocate buffers (pr/sr: result buffers)
+        cdef int aff_buf_l = p_max_l + s_max_l
+        cdef Py_UCS4* aff_buf = <Py_UCS4*> self.mem.alloc(aff_buf_l, sizeof(Py_UCS4))
+        cdef Py_UCS4* ps_buf = <Py_UCS4*> ps_search
+        cdef Py_UCS4* pl_buf = <Py_UCS4*> ps_lookup
+        cdef Py_UCS4* pr_buf = <Py_UCS4*> self.mem.alloc(ps_max_l, sizeof(Py_UCS4))
+        cdef Py_UCS4* ss_buf = <Py_UCS4*> ss_search
+        cdef Py_UCS4* sl_buf = <Py_UCS4*> ss_lookup
+        cdef Py_UCS4* sr_buf = <Py_UCS4*> self.mem.alloc(ss_max_l, sizeof(Py_UCS4))
+
+        # Define memory views on length arrays
+        cdef int[:] p_v = p_lengths
+        cdef int[:] s_v = s_lengths
+        cdef int[:] ps_v = ps_lengths
+        cdef int[:] ss_v = ss_lengths
+
+        # Define working variables
         cdef TokenC tok_c
+        cdef int tok_i, tok_idx, tok_len, aff_len

         for tok_i in range(num_toks):
             tok_c = self.c[tok_i]
             tok_idx = tok_c.idx
             tok_len = tok_c.lex.length

-            _populate_aff_buf(text_buf, tok_idx, tok_len, aff_buf, max_pref_l, max_suff_l, not cs)
-            _populate_search_buf(text_buf, tok_idx, tok_len, pref_s_buf, pref_r_buf, pref_s_char_l, pref_f_buf, max_s_pref_l, False)
-            _populate_search_buf(text_buf, tok_idx, tok_len, suff_s_buf, suff_r_buf, suff_s_char_l, suff_f_buf, max_s_suff_l, True)
-
-            for hash_idx in range(h_pref_n):
-                aff_len = pref_lengths[hash_idx]
-                hashes[tok_i, hash_idx] = hash32(aff_buf, aff_len * 4, 0)
-
-            for hash_idx in range(h_pref_n, h_suff_end_idx):
-                aff_len = suff_lengths[hash_idx - h_pref_n]
-                hashes[tok_i, hash_idx] = hash32(aff_buf + aff_buf_l - aff_len, aff_len * 4, 0)
-
-            for hash_idx in range(h_suff_end_idx, h_pref_s_end_idx):
-                aff_len = pref_search_lengths[hash_idx - h_suff_end_idx]
-                hashes[tok_i, hash_idx] = hash32(pref_f_buf, aff_len * 4, 0)
+            if aff_buf_l > 0:
+                _set_affixes(text_buf, tok_idx, tok_len, aff_buf, p_max_l, s_max_l, not cs)
+
+            for hash_idx in range(p_h_num):
+                hashes[tok_i, hash_idx] = hash32(aff_buf, p_v[hash_idx] * sizeof(Py_UCS4), 0)
+
+            for hash_idx in range(p_h_num, s_h_end):
+                aff_len = s_v[hash_idx - p_h_num]
+                hashes[tok_i, hash_idx] = hash32(aff_buf + aff_buf_l - aff_len, aff_len * sizeof(Py_UCS4), 0)
+
+            if ps_h_num > 0:
+                _search_for_chars(text_buf, tok_idx, tok_len, ps_buf, pl_buf, ps_l, pr_buf, ps_max_l, False)
+                for hash_idx in range(s_h_end, ps_h_end):
+                    aff_len = ps_v[hash_idx - s_h_end]
+                    hashes[tok_i, hash_idx] = hash32(pr_buf, aff_len * sizeof(Py_UCS4), 0)

-            for hash_idx in range(h_pref_s_end_idx, h_suff_s_end_idx):
-                aff_len = suff_search_lengths[hash_idx - h_pref_s_end_idx]
-                hashes[tok_i, hash_idx] = hash32(suff_f_buf, aff_len * 4, 0)
+            if ss_h_num > 0:
+                _search_for_chars(text_buf, tok_idx, tok_len, ss_buf, sl_buf, ss_l, sr_buf, ss_max_l, True)
+                for hash_idx in range(ps_h_end, ss_h_end):
+                    aff_len = ss_v[hash_idx - ps_h_end]
+                    hashes[tok_i, hash_idx] = hash32(sr_buf, aff_len * sizeof(Py_UCS4), 0)

         self.mem.free(aff_buf)
-        self.mem.free(pref_f_buf)
-        self.mem.free(suff_f_buf)
+        self.mem.free(pr_buf)
+        self.mem.free(sr_buf)
         return hashes

     @staticmethod
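Pulling the pieces together, the new signature is exercised roughly as in the tests above (a sketch assembled from the test code in this commit, not code from the commit itself; `en_tokenizer` is the pytest fixture used in the test file):

    from thinc.api import get_current_ops
    from spacy.util import get_arrays_for_search_chars

    ops = get_current_ops()
    doc = en_tokenizer("spaCy✨ and Prodigy")
    ss_search, ss_lookup = get_arrays_for_search_chars("rp", False)
    hashes = doc.get_character_combination_hashes(
        cs=False,
        p_lengths=ops.asarray1i([2, 3]),           # two prefix columns
        s_lengths=ops.asarray1i([2]),              # one suffix column
        ps_search=bytes(), ps_lookup=bytes(),      # no prefix search
        ps_l=0,
        ps_lengths=ops.asarray1i([]),
        ss_search=ss_search, ss_lookup=ss_lookup,  # search for r/p from the end
        ss_l=len(ss_search) // 4,                  # character count = UTF-32 bytes / 4
        ss_lengths=ops.asarray1i([1]),             # one suffix-search column
    )
    assert hashes.shape == (len(doc), 4)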
@@ -2025,76 +2042,103 @@ cdef int [:,:] _get_lca_matrix(Doc doc, int start, int end):
     return lca_matrix


-cdef void _populate_aff_buf(
+cdef void _copy_chars(
+    Py_UCS4* target,
+    const Py_UCS4* source,
+    const int length,
+    const bint to_lower
+):
+    """Copies *length* Py_UCS4 characters from *source* to *target*. If *to_lower==True*, converts
+    any upper-case characters to lower case within the target buffer.
+    """
+    memcpy(target, source, length * sizeof(Py_UCS4))
+    cdef int idx
+    if to_lower:
+        for idx in range(length):
+            if Py_UNICODE_ISUPPER(target[idx]):
+                target[idx] = Py_UNICODE_TOLOWER(target[idx])
+
+
+cdef void _set_affixes(
     const Py_UCS4* text_buf,
     const int tok_idx,
     const int tok_len,
     Py_UCS4* aff_buf,
-    const int pref_length,
-    const int suff_length,
+    const int pref_len,
+    const int suff_len,
     const bint to_lower
 ):
-    """ Populate a buffer of length p+s with the first p and the last s characters of a word within a string.
-    If the word is shorter than p and/or s, the empty character positions in the middle are filled with zeros.
+    """ Populate a buffer of length pref+suff with the first pref and the last suff characters of a word within a string.
+    If the word is shorter than pref and/or suff, the empty character positions in the middle are filled with zeros.

-    str_data_ptr: a pointer to the raw data in the containing string, which must be in canonical
-        Unicode form (see PEP 393).
-    kind: the number of bytes occupied by each character in the containing string.
-    word_idx: the index of the first character of the word within the containing string.
-    word_len: the length of the word.
+    text_buf: a pointer to a UTF-32LE representation of the containing string.
+    tok_idx: the index of the first character of the word within the containing string.
+    tok_len: the length of the word.
     aff_buf: the buffer to populate.
-    pref_length: the length of the prefix.
-    suff_length: the length of the suffix.
+    pref_len: the length of the prefix.
+    suff_len: the length of the suffix.
     to_lower: if *True*, any upper case characters in either affix are converted to lower case.
     """
-    cdef int aff_buf_idx = 0, buf_size = pref_length + suff_length, in_word_idx
-
-    while aff_buf_idx < pref_length and aff_buf_idx < tok_len:
-        memcpy(aff_buf + aff_buf_idx, text_buf + tok_idx + aff_buf_idx, 4)
-        if to_lower:
-            aff_buf[aff_buf_idx] = Py_UNICODE_TOLOWER(aff_buf[aff_buf_idx])
-        aff_buf_idx += 1
-
-    if aff_buf_idx < buf_size - tok_len:
-        # fill out the empty middle part of the buffer with zeros
-        memset(aff_buf, 0, buf_size - suff_length - aff_buf_idx)
-
-    while aff_buf_idx < buf_size:
-        in_word_idx = aff_buf_idx + tok_len - buf_size
-        # for suffixes we have to track the in-word index separately from the in-buffer index
-        if in_word_idx < pref_length:
-            # we've already retrieved this character as part of the prefix, so copy it from there
-            # as that's quicker than retrieving it from the input string a second time
-            memcpy(aff_buf + aff_buf_idx, aff_buf + in_word_idx, 4)
-        else:
-            memcpy(aff_buf + aff_buf_idx, text_buf + tok_idx + in_word_idx, 4)
-            if to_lower:
-                aff_buf[aff_buf_idx] = Py_UNICODE_TOLOWER(aff_buf[aff_buf_idx])
-        aff_buf_idx += 1
+    cdef int aff_buf_idx = 0, aff_buf_len = pref_len + suff_len, in_word_idx, filled_pref_len
+
+    if pref_len > 0:
+        filled_pref_len = pref_len if pref_len < tok_len else tok_len
+        _copy_chars(aff_buf, text_buf + tok_idx, filled_pref_len, to_lower)
+        aff_buf_idx = filled_pref_len
+
+    if tok_len < pref_len:
+        memset(aff_buf + aff_buf_idx, 0, sizeof(Py_UCS4) * (pref_len - tok_len))
+        aff_buf_idx = aff_buf_len - suff_len
+    if tok_len < suff_len:
+        memset(aff_buf + aff_buf_idx, 0, sizeof(Py_UCS4) * (suff_len - tok_len))
+        aff_buf_idx = aff_buf_len - tok_len
+
+    if suff_len > 0:
+        in_word_idx = aff_buf_idx + tok_len - aff_buf_len
+        if in_word_idx < pref_len:
+            memcpy(aff_buf + aff_buf_idx, aff_buf + in_word_idx, sizeof(Py_UCS4) * (filled_pref_len - in_word_idx))
+            aff_buf_idx += filled_pref_len - in_word_idx
+        if aff_buf_idx < aff_buf_len:
+            _copy_chars(aff_buf + aff_buf_idx, text_buf + tok_idx + in_word_idx, aff_buf_len - aff_buf_idx, to_lower)


-cdef void _populate_search_buf(
+cdef void _search_for_chars(
     const Py_UCS4* text_buf,
     const int tok_idx,
     const int tok_len,
     Py_UCS4* search_buf,
-    Py_UCS4* ref_buf,
+    Py_UCS4* lookup_buf,
     const int search_buf_len,
-    Py_UCS4* finding_buf,
-    const int finding_buf_len,
+    Py_UCS4* result_buf,
+    const int result_buf_len,
     bint suffs_not_prefs
 ):
-    cdef unsigned int finding_buf_idx = 0, text_string_idx = tok_idx + (tok_len - 1) if suffs_not_prefs else tok_idx
-    cdef unsigned int search_buf_idx
-    cdef int cmp_res
-
-    while finding_buf_idx < finding_buf_len:
+    """ Search a word within a string for characters within *search_buf*, starting at the beginning or
+    end depending on the value of *suffs_not_prefs*. Wherever a character from *search_buf* matches,
+    the corresponding character from *lookup_buf* is added to *result_buf*.
+
+    text_buf: a pointer to a UTF-32LE representation of the containing string.
+    tok_idx: the index of the first character of the word within the containing string.
+    tok_len: the length of the word.
+    search_buf: the characters to search for (ordered).
+    lookup_buf: characters corresponding to *search_buf* to add to *result_buf* in the case of a match.
+        Having separate search and lookup arrays enables case-insensitivity to be handled efficiently.
+    search_buf_len: the length of *search_buf* and hence also of *lookup_buf*.
+    result_buf: the buffer in which to place the results.
+    result_buf_len: the length of *result_buf*.
+    suffs_not_prefs: if *True*, searching starts from the end of the word; if *False*, from the beginning.
+    """
+    cdef int result_buf_idx = 0, text_string_idx = tok_idx + (tok_len - 1) if suffs_not_prefs else tok_idx
+    cdef int search_buf_idx
+    cdef int cmp_result
+
+    while result_buf_idx < result_buf_len:
         for search_buf_idx in range(search_buf_len):
-            cmp_res = memcmp(search_buf + search_buf_idx, text_buf + text_string_idx, 4)
-            if cmp_res == 0:
-                memcpy(finding_buf + finding_buf_idx, ref_buf + search_buf_idx, 4)
-                finding_buf_idx += 1
-            if cmp_res >= 0:
+            cmp_result = memcmp(search_buf + search_buf_idx, text_buf + text_string_idx, sizeof(Py_UCS4))
+            if cmp_result == 0:
+                memcpy(result_buf + result_buf_idx, lookup_buf + search_buf_idx, sizeof(Py_UCS4))
+                result_buf_idx += 1
+            if cmp_result >= 0:
                 break
         if suffs_not_prefs:
             if text_string_idx <= tok_idx:
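To make the affix-buffer layout concrete, here is a rough pure-Python model of the buffer `_set_affixes` is documented to produce (an illustration of the docstring, not code from the commit; the real Cython routine additionally reuses already-copied prefix characters instead of re-reading the text):

    def set_affixes_model(word: str, pref_len: int, suff_len: int, to_lower: bool = True) -> str:
        # Buffer = [first pref_len chars][last suff_len chars]; if the word is
        # shorter than either region, the gap in the middle is zero-filled.
        if to_lower:
            word = word.lower()
        pref = word[:pref_len].ljust(pref_len, "\0")
        suff = word[-suff_len:].rjust(suff_len, "\0") if suff_len > 0 else ""
        return pref + suff

    assert set_affixes_model("spaCy", 3, 4) == "spa" + "pacy"
    assert set_affixes_model("and", 2, 4) == "an" + "\0and"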
@@ -2105,11 +2149,11 @@ cdef void _populate_search_buf(
         if text_string_idx >= tok_idx + tok_len:
             break

-    if finding_buf_idx < finding_buf_len:
-        memset(finding_buf + finding_buf_idx, 0, finding_buf_len - finding_buf_idx)
+    # fill in any unused characters in the result buffer with zeros
+    if result_buf_idx < result_buf_len:
+        memset(result_buf + result_buf_idx, 0, (result_buf_len - result_buf_idx) * sizeof(Py_UCS4))


 def pickle_doc(doc):
     bytes_data = doc.to_bytes(exclude=["vocab", "user_data", "user_hooks"])
     hooks_and_data = (doc.user_data, doc.user_hooks, doc.user_span_hooks,
@@ -1737,7 +1737,7 @@ def all_equal(iterable):
     return next(g, True) and not next(g, False)


-def get_byte_arrays_for_search_chars(
+def get_arrays_for_search_chars(
     search_chars: str, case_sensitive: bool
 ) -> Tuple[bytes, bytes]:
     """

@@ -1746,14 +1746,14 @@ def get_byte_arrays_for_search_chars(
     for search characters. The encoding is little-endian regardless of architecture, as
     this is what is expected by the murmurhash library used downstream.

-    Alongside the "search byte array" against which words from document texts are compared
-    is the "ref byte array". When a character from the search byte array is matched,
-    the character at the corresponding position in the ref byte array is added to the
-    byte sequence of the configured length that is then hashed. This enables case-sensitivity
+    Alongside the "search array" against which words from document texts are compared
+    is the "lookup array". When a character from the search array is matched,
+    the character at the corresponding position in the lookup array is added to the
+    sequence that then goes on to be hashed. This enables case-sensitivity
     to be handled without converting the case of the words being searched: if
     *case_sensitive==False*, the lower- or uppercase counterparts of any characters that
-    have case are added to the search byte arrays, and both the original character and its
-    other-cased counterpart map to the lower-case version in the ref byte array.
+    have case are added to the search array, and both the original character and its
+    other-cased counterpart map to the lower-case version in the lookup array.
     """

     def encode(ch: str) -> bytes:

@@ -1762,8 +1762,8 @@ def get_byte_arrays_for_search_chars(
         """
         return ch.encode("UTF-32LE")

-    def add_to_byte_arrays(
-        search: List[bytes], ref: List[bytes], ch: str
+    def add_to_arrays(
+        search: List[bytes], lookup: List[bytes], ch: str
     ) -> None:
         """Add the byte representations of *ch* to the two byte array lists.
         """

@@ -1771,36 +1771,36 @@ def get_byte_arrays_for_search_chars(
         if not case_sensitive and ch.islower():
             if this_char_bytes not in search:
                 search.append(this_char_bytes)
-                ref.append(this_char_bytes)
+                lookup.append(this_char_bytes)
             upper_char_bytes = encode(ch.upper())
             if upper_char_bytes not in search:
                 search.append(upper_char_bytes)
-                ref.append(this_char_bytes)
+                lookup.append(this_char_bytes)
         elif not case_sensitive and ch.isupper():
             lower_char_bytes = encode(ch.lower())
             if this_char_bytes not in search:
                 search.append(this_char_bytes)
-                ref.append(lower_char_bytes)
+                lookup.append(lower_char_bytes)
             if lower_char_bytes not in search:
                 search.append(lower_char_bytes)
-                ref.append(lower_char_bytes)
+                lookup.append(lower_char_bytes)
         elif this_char_bytes not in search:
             search.append(this_char_bytes)
-            ref.append(this_char_bytes)
+            lookup.append(this_char_bytes)

     def get_ordered_raw_bytes(
-        search: List[bytes], ref: List[bytes]
+        search: List[bytes], lookup: List[bytes]
     ) -> Tuple[bytes, bytes]:
         """Flatten the two lists, ordering both by the entries in *search*
         using the native endianness of the platform.
         """
         num_search = [list(entry) for entry in search]
         search = [entry for _, entry in sorted(zip(num_search, search))]
-        ref = [entry for _, entry in sorted(zip(num_search, ref))]
-        return b"".join(search), b"".join(ref)
+        lookup = [entry for _, entry in sorted(zip(num_search, lookup))]
+        return b"".join(search), b"".join(lookup)

     search: List[bytes] = []
-    ref: List[bytes] = []
+    lookup: List[bytes] = []
     for ch in search_chars:
-        add_to_byte_arrays(search, ref, ch)
-    return get_ordered_raw_bytes(search, ref)
+        add_to_arrays(search, lookup, ch)
+    return get_ordered_raw_bytes(search, lookup)
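A worked example of the search/lookup pairing (spelled out from the logic above; with case_sensitive=False both cases of each character become searchable and both map back to the lower-case form):

    >>> import spacy
    >>> search, lookup = spacy.util.get_arrays_for_search_chars("bf", False)
    >>> search   # "B", "F", "b", "f": upper case sorts first in raw byte order
    b'B\x00\x00\x00F\x00\x00\x00b\x00\x00\x00f\x00\x00\x00'
    >>> lookup   # each position maps to the lower-case counterpart
    b'b\x00\x00\x00f\x00\x00\x00b\x00\x00\x00f\x00\x00\x00'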
@@ -218,28 +218,6 @@ whose presence before or after characters that would otherwise alternate
 prevents the alternation from occurring, e.g. an `ä` in a German plural noun
 does not become `a` if it is the third or fourth vowel from the end of the word.

-Internally, the model converts each token string to
-[UTF-16](https://www.ietf.org/rfc/rfc2781.txt) and assumes that each character
-from the string occupies two bytes. This assumption holds for all characters in
-the Basic Multilingual Plane, which encompasses all characters that are ever
-likely to be of interest when extracting features. There are, however,
-characters like emojis that are in the Extended Multilingual Plane and occupy
-four bytes, although importantly neither of the two byte pairs that make up such
-a representation can be a valid two-byte character in its own right. The
-following considerations apply to the processing of four-byte characters:
-
-- An exceptional four-byte character within a text consisting mostly of two-byte
-  characters will probably be ignored by the neural network accepting the
-  embedding layer as not matching any of the learned features.
-- If anyone did want to train a model for a language like Lycian that is
-  generally written in four-byte characters, prefix and suffix features can
-  still be extracted, but the length specifications should all be doubled, i.e.
-  `[2,4,6]` to extract one-, two- and three-character affixes. In such a
-  situation length specifications that are odd numbers would serve no useful
-  purpose since they would refer to half-characters.
-- Four-byte characters are not accepted within search character specification
-  strings and lead to an error being thrown.
-
 | Name | Description |
 | ------------------------ | -------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
 | `width` | The output width. Also used as the width of the embedding tables. Recommended values are between `64` and `300`. If static vectors are included, a learned linear layer is used to map the vectors to the specified width before concatenating it with the other embedding outputs. A single maxout layer is then used to reduce the concatenated vectors to the final width. ~~int~~ |