Remove search functionality

richardpaulhudson 2023-01-10 13:06:50 +01:00
parent 2116a71962
commit e95dd432d1
9 changed files with 10 additions and 530 deletions

View File

@@ -193,23 +193,14 @@ def _verify_rich_config_group(
label: str,
lengths: Optional[List[int]],
rows: Optional[List[int]],
search_chars: Optional[str],
is_search_char_group: bool,
) -> None:
if lengths is not None or rows is not None:
if is_search_char_group and (search_chars is None or len(search_chars) == 0):
raise ValueError(Errors.E1048.format(label=label))
if search_chars is not None and len(search_chars) > 63:
raise ValueError(Errors.E1049.format(label=label))
if lengths is None or rows is None:
raise ValueError(Errors.E1048.format(label=label))
if len(lengths) != len(rows):
raise ValueError(Errors.E1048.format(label=label))
if any([length < 1 for length in lengths]):
raise ValueError(Errors.E1048.format(label=label))
elif search_chars is not None:
raise ValueError(Errors.E1048.format(label=label))
if lengths is not None:
if lengths[-1] > 63:
raise ValueError(Errors.E1049.format(label=label))
if len(lengths) != len(set(lengths)) or lengths != sorted(lengths):
@@ -227,12 +218,6 @@ def RichMultiHashEmbed(
pref_rows: Optional[List[int]] = None,
suff_lengths: Optional[List[int]] = None,
suff_rows: Optional[List[int]] = None,
pref_search_chars: Optional[str] = None,
pref_search_lengths: Optional[List[int]] = None,
pref_search_rows: Optional[List[int]] = None,
suff_search_chars: Optional[str] = None,
suff_search_lengths: Optional[List[int]] = None,
suff_search_rows: Optional[List[int]] = None,
) -> Model[List[Doc], List[Floats2d]]:
"""
Construct an embedding layer with the features of `MultiHashEmbed` (see above)
@@ -240,37 +225,12 @@ def RichMultiHashEmbed(
The fixed-length `PREFIX` and `SUFFIX` features used in `MultiHashEmbed`
are sometimes not rich enough when working with languages with complex morphology,
and this layer allows the specification of multiple prefixes and suffixes
of any lengths.
Additionally, it is possible to use as features the results of character
searches of specified lengths. A list of search characters is specified; the
characters in each word are examined in order starting at the beginning or at
the end; and each character that matches one of the search characters is added,
in order, to the string to be used as a feature. The search continues until
either the search result string is full or the whole word has been examined.
This is useful because some languages exhibit morphological alternations where
one letter or letters regularly alternate with another letter or letters
depending on the presence of some other letter before or after it, e.g. German
plural nouns where the final two vowels are `ä-e` regularly correspond to
singular lemmas where the `e` is no longer present and the `ä` has become `a`,
e.g. `die Bäche` (plural) vs. `der Bach` (singular).
For most languages used with spaCy, searching is likely to be useful starting
at the end (`suff_*`), but the ability to search from the beginning (`pref_*`)
is also offered for completeness. Search characters should consist of all
characters that regularly alternate with other characters in the language in
question or whose presence before or after characters that would otherwise
alternate prevents the alternation from occurring, e.g. an `ä` in a German
plural noun does not become `a` if it is the third or fourth vowel from the
end of the word.
of any lengths. Arrays specifying lengths must be in ascending order.
There are a few rare situations where a graphical character is expressed as
more than one UTF-8 character, e.g. *i* when representing the lower-case form
of the Turkish letter *İ*. Such situations are supported, but the lengths of
prefixes, suffixes and character search results may need to be increased
accordingly.
All arrays specifying lengths must be in ascending order.
prefixes and suffixes may need to be increased accordingly.
width (int): The output width. Also used as the width of the embedding tables.
Recommended values are between 64 and 300.
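To make the removed search feature concrete, here is a minimal pure-Python sketch of the behaviour the docstring above describes. It is illustrative only: `search_feature` is a hypothetical helper, not part of the spaCy API, and the real work was done by the Cython `_search_for_chars` removed further down in this commit.

```python
def search_feature(word: str, search_chars: str, length: int, from_end: bool = False) -> str:
    # Scan `word` from the beginning or the end; every character that occurs in
    # `search_chars` is appended to the result, until `length` characters have
    # been collected or the whole word has been examined.
    result = []
    for ch in (reversed(word) if from_end else word):
        if ch in search_chars:
            result.append(ch)
            if len(result) == length:
                break
    return "".join(result)


# German plural "Bäche" vs. singular "Bach": searching from the end for the
# alternating vowels surfaces the "ä-e" / "a" difference described above.
assert search_feature("Bäche", "aäeiouü", 2, from_end=True) == "eä"
assert search_feature("Bach", "aäeiouü", 2, from_end=True) == "a"
```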
@@ -290,39 +250,13 @@ def RichMultiHashEmbed(
for each word, e.g. for the word `spaCy`:
`[1, 3]` would lead to `y` and `yCa` being used as features.
suff_rows (Optional[List[int]]): The number of rows for each of `suff_lengths`.
pref_search_chars (Optional[str]): A string containing characters to search for
starting from the beginning of each word.
pref_search_lengths (Optional[List[int]]): The lengths of search result strings
to use as features, where the searches start from the beginning of each word.
pref_search_rows (Optional[List[int]]): The number of rows for each of
`pref_search_lengths`.
suff_search_chars (Optional[str]): A string containing characters to search for
starting from the end of each word.
suff_search_lengths (Optional[List[int]]): The lengths of search result strings
to use as features, where the searches start from the end of each word.
suff_search_rows (Optional[List[int]]): The number of rows for each of
`suff_search_lengths`.
"""
if len(rows) != len(attrs):
raise ValueError(f"Mismatched lengths: {len(rows)} vs {len(attrs)}")
_verify_rich_config_group("prefix", pref_lengths, pref_rows, None, False)
_verify_rich_config_group("suffix", suff_lengths, suff_rows, None, False)
_verify_rich_config_group(
"prefix search",
pref_search_lengths,
pref_search_rows,
pref_search_chars,
True,
)
_verify_rich_config_group(
"suffix search",
suff_search_lengths,
suff_search_rows,
suff_search_chars,
True,
)
_verify_rich_config_group("prefix", pref_lengths, pref_rows)
_verify_rich_config_group("suffix", suff_lengths, suff_rows)
if "PREFIX" in attrs or "SUFFIX" in attrs:
warnings.warn(Warnings.W124)
@@ -331,10 +265,6 @@ def RichMultiHashEmbed(
rows.extend(pref_rows)
if suff_rows is not None:
rows.extend(suff_rows)
if pref_search_rows is not None:
rows.extend(pref_search_rows)
if suff_search_rows is not None:
rows.extend(suff_search_rows)
embeddings: List[Model[Ints2d, Floats2d]] = [
HashEmbed(width, row, column=i, seed=i + 7, dropout=0.0)
@@ -350,10 +280,6 @@ def RichMultiHashEmbed(
case_sensitive=case_sensitive,
pref_lengths=pref_lengths,
suff_lengths=suff_lengths,
pref_search_chars=pref_search_chars,
pref_search_lengths=pref_search_lengths,
suff_search_chars=suff_search_chars,
suff_search_lengths=suff_search_lengths,
),
)

View File

@@ -2,7 +2,6 @@ from typing import List, Optional, Callable, Tuple
from thinc.types import Ints2d
from thinc.api import Model, registry, get_current_ops
from ..tokens import Doc
from ..util import get_search_char_byte_arrays
@registry.layers("spacy.RichFeatureExtractor.v1")
@@ -11,27 +10,7 @@ def RichFeatureExtractor(
case_sensitive: bool,
pref_lengths: Optional[List[int]] = None,
suff_lengths: Optional[List[int]] = None,
pref_search_chars: Optional[str] = None,
pref_search_lengths: Optional[List[int]] = None,
suff_search_chars: Optional[str] = None,
suff_search_lengths: Optional[List[int]] = None,
) -> Model[List[Doc], List[Ints2d]]:
ops = get_current_ops()
if pref_search_chars is not None:
ps_search_chars, ps_width_offsets = get_search_char_byte_arrays(
pref_search_chars, case_sensitive
)
else:
ps_search_chars = bytes()
ps_width_offsets = bytes()
if suff_search_chars is not None:
ss_search_chars, ss_width_offsets = get_search_char_byte_arrays(
suff_search_chars, case_sensitive
)
else:
ss_search_chars = bytes()
ss_width_offsets = bytes()
return Model(
"extract_character_combination_hashes",
forward,
@@ -39,16 +18,6 @@ def RichFeatureExtractor(
"case_sensitive": case_sensitive,
"p_lengths": bytes(pref_lengths) if pref_lengths is not None else bytes(),
"s_lengths": bytes(suff_lengths) if suff_lengths is not None else bytes(),
"ps_search_chars": ps_search_chars,
"ps_width_offsets": ps_width_offsets,
"ps_lengths": bytes(pref_search_lengths)
if pref_search_lengths is not None
else bytes(),
"ss_search_chars": ss_search_chars,
"ss_width_offsets": ss_width_offsets,
"ss_lengths": bytes(suff_search_lengths)
if suff_search_lengths is not None
else bytes(),
},
)
@@ -60,24 +29,12 @@ def forward(
case_sensitive: bool = model.attrs["case_sensitive"]
p_lengths: bytes = model.attrs["p_lengths"]
s_lengths: bytes = model.attrs["s_lengths"]
ps_search_chars: bytes = model.attrs["ps_search_chars"]
ps_width_offsets: bytes = model.attrs["ps_width_offsets"]
ps_lengths: bytes = model.attrs["ps_lengths"]
ss_search_chars: bytes = model.attrs["ss_search_chars"]
ss_width_offsets: bytes = model.attrs["ss_width_offsets"]
ss_lengths: bytes = model.attrs["ss_lengths"]
features: List[Ints2d] = []
for doc in docs:
hashes = doc.get_character_combination_hashes(
case_sensitive=case_sensitive,
p_lengths=p_lengths,
s_lengths=s_lengths,
ps_search_chars=ps_search_chars,
ps_width_offsets=ps_width_offsets,
ps_lengths=ps_lengths,
ss_search_chars=ss_search_chars,
ss_width_offsets=ss_width_offsets,
ss_lengths=ss_lengths,
)
features.append(ops.asarray2i(hashes, dtype="uint64"))
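Note how the length lists are handed to the Cython layer as plain byte strings: each length occupies a single byte, and the Cython side recovers the count with `strlen` (see `doc.pyx` below), which is why the config validation earlier in this diff rejects zero lengths and keeps lengths small. A quick illustration:

```python
# bytes((1, 3)) encodes the two affix lengths 1 and 3 as single bytes.
p_lengths = bytes((1, 3))
assert p_lengths == b"\x01\x03"
assert len(p_lengths) == 2
# An empty specification is simply an empty byte string.
assert bytes() == b""
```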

View File

@@ -15,7 +15,6 @@ from spacy.lang.xx import MultiLanguage
from spacy.language import Language
from spacy.lexeme import Lexeme
from spacy.tokens import Doc, Span, SpanGroup, Token
from spacy.util import get_search_char_byte_arrays
from spacy.vocab import Vocab
from .test_underscore import clean_underscore # noqa: F401
@@ -1450,12 +1449,6 @@ def _encode_and_hash(input: str, *, reverse: bool = False) -> int:
def test_get_character_combination_hashes_good_case(en_tokenizer, case_sensitive):
doc = en_tokenizer("spaCy✨ and Prodigy")
ps_search_chars, ps_width_offsets = get_search_char_byte_arrays(
"Rp", case_sensitive
)
ss_search_chars, ss_width_offsets = get_search_char_byte_arrays(
"xx✨rp", case_sensitive
)
hashes = doc.get_character_combination_hashes(
case_sensitive=case_sensitive,
p_lengths=bytes(
@@ -1473,17 +1466,6 @@ def test_get_character_combination_hashes_good_case(en_tokenizer, case_sensitive
5,
)
),
ps_search_chars=ps_search_chars,
ps_width_offsets=ps_width_offsets,
ps_lengths=bytes((2,)),
ss_search_chars=ss_search_chars,
ss_width_offsets=ss_width_offsets,
ss_lengths=bytes(
(
1,
2,
)
),
)
assert hashes[0][0] == _encode_and_hash("s")
assert hashes[0][1] == _encode_and_hash("spa")
@@ -1492,9 +1474,6 @@ def test_get_character_combination_hashes_good_case(en_tokenizer, case_sensitive
assert hashes[0][4] == _encode_and_hash("yCa" if case_sensitive else "yca")
assert hashes[0][5] == _encode_and_hash("yCap" if case_sensitive else "ycap")
assert hashes[0][6] == _encode_and_hash("yCaps" if case_sensitive else "ycaps")
assert hashes[0][7] == _encode_and_hash("p")
assert hashes[0][8] == _encode_and_hash("p")
assert hashes[0][9] == _encode_and_hash("p")
assert hashes[1][0] == _encode_and_hash("")
assert hashes[1][1] == _encode_and_hash("")
assert hashes[1][2] == _encode_and_hash("")
@@ -1502,9 +1481,6 @@ def test_get_character_combination_hashes_good_case(en_tokenizer, case_sensitive
assert hashes[1][4] == _encode_and_hash("", reverse=True)
assert hashes[1][5] == _encode_and_hash("", reverse=True)
assert hashes[1][6] == _encode_and_hash("", reverse=True)
assert hashes[1][7] == EMPTY_HASH_VALUE
assert hashes[1][8] == _encode_and_hash("")
assert hashes[1][9] == _encode_and_hash("")
assert hashes[2][0] == _encode_and_hash("a")
assert hashes[2][1] == _encode_and_hash("and")
assert hashes[2][2] == _encode_and_hash("and")
@@ -1512,9 +1488,6 @@ def test_get_character_combination_hashes_good_case(en_tokenizer, case_sensitive
assert hashes[2][4] == _encode_and_hash("dna")
assert hashes[2][5] == _encode_and_hash("dna")
assert hashes[2][6] == _encode_and_hash("dna")
assert hashes[2][7] == EMPTY_HASH_VALUE
assert hashes[2][8] == EMPTY_HASH_VALUE
assert hashes[2][9] == EMPTY_HASH_VALUE
assert hashes[3][0] == _encode_and_hash("P" if case_sensitive else "p")
assert hashes[3][1] == _encode_and_hash("Pro" if case_sensitive else "pro")
assert hashes[3][2] == _encode_and_hash("Prod" if case_sensitive else "prod")
@@ -1522,21 +1495,10 @@ def test_get_character_combination_hashes_good_case(en_tokenizer, case_sensitive
assert hashes[3][4] == _encode_and_hash("ygi")
assert hashes[3][5] == _encode_and_hash("ygid")
assert hashes[3][6] == _encode_and_hash("ygido")
assert hashes[3][7] == (
EMPTY_HASH_VALUE if case_sensitive else _encode_and_hash("pr")
)
assert hashes[3][8] == _encode_and_hash("r")
if case_sensitive:
assert hashes[3][9] == _encode_and_hash("r")
else:
assert hashes[3][9] == _encode_and_hash("rp")
def test_get_character_combination_hashes_good_case_partial(en_tokenizer):
doc = en_tokenizer("spaCy✨ and Prodigy")
ps_search_chars, ps_width_offsets = get_search_char_byte_arrays("rp", False)
hashes = doc.get_character_combination_hashes(
case_sensitive=False,
p_lengths=bytes(),
@@ -1548,34 +1510,24 @@ def test_get_character_combination_hashes_good_case_partial(en_tokenizer):
5,
)
),
ps_search_chars=ps_search_chars,
ps_width_offsets=ps_width_offsets,
ps_lengths=bytes((2,)),
ss_search_chars=bytes(),
ss_width_offsets=bytes(),
ss_lengths=bytes(),
)
assert hashes[0][0] == _encode_and_hash("yc")
assert hashes[0][1] == _encode_and_hash("yca")
assert hashes[0][2] == _encode_and_hash("ycap")
assert hashes[0][3] == _encode_and_hash("ycaps")
assert hashes[0][4] == _encode_and_hash("p")
assert hashes[1][0] == _encode_and_hash("", reverse=True)
assert hashes[1][1] == _encode_and_hash("", reverse=True)
assert hashes[1][2] == _encode_and_hash("", reverse=True)
assert hashes[1][3] == _encode_and_hash("", reverse=True)
assert hashes[1][4] == EMPTY_HASH_VALUE
assert hashes[2][0] == _encode_and_hash("dn")
assert hashes[2][1] == _encode_and_hash("dna")
assert hashes[2][2] == _encode_and_hash("dna")
assert hashes[2][3] == _encode_and_hash("dna")
assert hashes[2][4] == EMPTY_HASH_VALUE
assert hashes[3][0] == _encode_and_hash("yg")
assert hashes[3][1] == _encode_and_hash("ygi")
assert hashes[3][2] == _encode_and_hash("ygid")
assert hashes[3][3] == _encode_and_hash("ygido")
assert hashes[3][4] == _encode_and_hash("pr")
def test_get_character_combination_hashes_various_lengths(en_tokenizer):
@@ -1588,12 +1540,6 @@ def test_get_character_combination_hashes_various_lengths(en_tokenizer):
case_sensitive=False,
p_lengths=bytes((p_length,)),
s_lengths=bytes((s_length,)),
ps_search_chars=bytes(),
ps_width_offsets=bytes(),
ps_lengths=bytes(),
ss_search_chars=bytes(),
ss_width_offsets=bytes(),
ss_lengths=bytes(),
)
assert hashes[0][0] == _encode_and_hash("sp𐌞cé"[:p_length])
@@ -1605,7 +1551,6 @@ def test_get_character_combination_hashes_turkish_i_with_dot(
en_tokenizer, case_sensitive
):
doc = en_tokenizer("İ".lower() + "İ")
search_chars, width_offsets = get_search_char_byte_arrays("İ", case_sensitive)
hashes = doc.get_character_combination_hashes(
case_sensitive=case_sensitive,
p_lengths=bytes(
@@ -1624,26 +1569,6 @@ def test_get_character_combination_hashes_turkish_i_with_dot(
4,
)
),
ps_search_chars=search_chars,
ps_width_offsets=width_offsets,
ps_lengths=bytes(
(
1,
2,
3,
4,
)
),
ss_search_chars=search_chars,
ss_width_offsets=width_offsets,
ss_lengths=bytes(
(
1,
2,
3,
4,
)
),
)
COMBINING_DOT_ABOVE = b"\xcc\x87".decode("UTF-8")
@@ -1656,10 +1581,6 @@ def test_get_character_combination_hashes_turkish_i_with_dot(
assert hashes[0][5] == _encode_and_hash(COMBINING_DOT_ABOVE + "İ", reverse=True)
assert hashes[0][6] == _encode_and_hash("İ".lower() + "İ", reverse=True)
assert hashes[0][7] == _encode_and_hash("İ".lower() + "İ", reverse=True)
assert hashes[0][8] == _encode_and_hash("İ")
assert hashes[0][9] == _encode_and_hash("İ")
assert hashes[0][12] == _encode_and_hash("İ")
assert hashes[0][13] == _encode_and_hash("İ")
else:
assert hashes[0][2] == _encode_and_hash("İ".lower() + "i")
@@ -1670,16 +1591,6 @@ def test_get_character_combination_hashes_turkish_i_with_dot(
COMBINING_DOT_ABOVE + "İ".lower(), reverse=True
)
assert hashes[0][7] == _encode_and_hash("İ".lower() * 2, reverse=True)
assert hashes[0][8] == _encode_and_hash("i")
assert hashes[0][9] == _encode_and_hash("İ".lower())
assert hashes[0][10] == _encode_and_hash("İ".lower() + "i")
assert hashes[0][11] == _encode_and_hash("İ".lower() * 2)
assert hashes[0][12] == _encode_and_hash(COMBINING_DOT_ABOVE)
assert hashes[0][13] == _encode_and_hash(COMBINING_DOT_ABOVE + "i")
assert hashes[0][14] == _encode_and_hash(
COMBINING_DOT_ABOVE + "i" + COMBINING_DOT_ABOVE
)
assert hashes[0][15] == _encode_and_hash((COMBINING_DOT_ABOVE + "i") * 2)
@pytest.mark.parametrize("case_sensitive", [True, False])
@@ -1693,33 +1604,17 @@ def test_get_character_combination_hashes_string_store_spec_cases(
assert len(long_word) > 255
doc = en_tokenizer(" ".join((symbol, short_word, normal_word, long_word)))
assert len(doc) == 4
ps_search_chars, ps_width_offsets = get_search_char_byte_arrays("E", case_sensitive)
hashes = doc.get_character_combination_hashes(
case_sensitive=case_sensitive,
p_lengths=bytes((2,)),
s_lengths=bytes((2,)),
ps_search_chars=ps_search_chars,
ps_width_offsets=ps_width_offsets,
ps_lengths=bytes((2,)),
ss_search_chars=bytes(),
ss_width_offsets=bytes(),
ss_lengths=bytes(),
)
assert hashes[0][0] == _encode_and_hash("FL" if case_sensitive else "fl")
assert hashes[0][1] == _encode_and_hash("91")
assert hashes[0][2] == EMPTY_HASH_VALUE
assert hashes[1][0] == _encode_and_hash("be")
assert hashes[1][1] == _encode_and_hash("ee")
if case_sensitive:
assert hashes[1][2] == EMPTY_HASH_VALUE
else:
assert hashes[1][2] == _encode_and_hash("ee")
assert hashes[2][0] == hashes[3][0] == _encode_and_hash("se")
assert hashes[2][1] == hashes[3][1] == _encode_and_hash("yt")
if case_sensitive:
assert hashes[2][2] == hashes[3][2] == EMPTY_HASH_VALUE
else:
assert hashes[2][2] == hashes[3][2] == _encode_and_hash("ee")
def test_character_combination_hashes_empty_lengths(en_tokenizer):
@@ -1728,10 +1623,4 @@ def test_character_combination_hashes_empty_lengths(en_tokenizer):
case_sensitive=True,
p_lengths=bytes(),
s_lengths=bytes(),
ps_search_chars=bytes(),
ps_width_offsets=bytes(),
ps_lengths=bytes(),
ss_search_chars=bytes(),
ss_width_offsets=bytes(),
ss_lengths=bytes(),
).shape == (1, 0)

View File

@@ -1,86 +0,0 @@
import spacy
import pytest
@pytest.mark.parametrize("case_sensitive", [True, False])
def test_get_search_char_byte_arrays_1_width_only(case_sensitive):
search_chars, width_offsets = spacy.util.get_search_char_byte_arrays(
"zzaaEP", case_sensitive
)
if case_sensitive:
assert search_chars == b"EPaz"
else:
assert search_chars == b"aepz"
assert width_offsets == b"\x00\x04\x04\x04\x04"
@pytest.mark.parametrize("case_sensitive", [True, False])
def test_get_search_char_byte_arrays_4_width_only(case_sensitive):
search_chars, width_offsets = spacy.util.get_search_char_byte_arrays(
"𐌞", case_sensitive
)
assert search_chars == "𐌞".encode("utf-8")
assert width_offsets == b"\x00\x00\x00\x00\x04"
@pytest.mark.parametrize("case_sensitive", [True, False])
def test_get_search_char_byte_arrays_all_widths(case_sensitive):
search_chars, width_offsets = spacy.util.get_search_char_byte_arrays(
"𐌞Éabé—B𐌞", case_sensitive
)
if case_sensitive:
assert search_chars == "BabÉé—𐌞".encode("utf-8")
assert width_offsets == b"\x00\x03\x07\x0a\x0e"
else:
assert search_chars == "abé—𐌞".encode("utf-8")
assert width_offsets == b"\x00\x02\x04\x07\x0b"
@pytest.mark.parametrize("case_sensitive", [True, False])
def test_get_search_char_byte_arrays_widths_1_and_3(case_sensitive):
search_chars, width_offsets = spacy.util.get_search_char_byte_arrays(
"B—", case_sensitive
)
if case_sensitive:
assert search_chars == "B—".encode("utf-8")
else:
assert search_chars == "b—".encode("utf-8")
assert width_offsets == b"\x00\x01\x01\x04\x04"
@pytest.mark.parametrize("case_sensitive", [True, False])
def test_get_search_char_byte_arrays_widths_1_and_4(case_sensitive):
search_chars, width_offsets = spacy.util.get_search_char_byte_arrays(
"B𐌞", case_sensitive
)
if case_sensitive:
assert search_chars == "B𐌞".encode("utf-8")
else:
assert search_chars == "b𐌞".encode("utf-8")
assert width_offsets == b"\x00\x01\x01\x01\x05"
@pytest.mark.parametrize("case_sensitive", [True, False])
def test_turkish_i_with_dot(case_sensitive):
search_chars, width_offsets = spacy.util.get_search_char_byte_arrays(
"İ", case_sensitive
)
if case_sensitive:
assert search_chars == "İ".encode("utf-8")
assert width_offsets == b"\x00\x00\x02\x02\x02"
else:
assert search_chars == b"i\xcc\x87"
assert width_offsets == b"\x00\x01\x03\x03\x03"
@pytest.mark.parametrize("case_sensitive", [True, False])
def test_turkish_i_with_dot_and_normal_i(case_sensitive):
search_chars, width_offsets = spacy.util.get_search_char_byte_arrays(
"İI", case_sensitive
)
if case_sensitive:
assert search_chars == "".encode("utf-8")
assert width_offsets == b"\x00\x01\x03\x03\x03"
else:
assert search_chars == b"i\xcc\x87"
assert width_offsets == b"\x00\x01\x03\x03\x03"

View File

@@ -50,18 +50,6 @@ cdef void _set_suffix_lengths(
) nogil
cdef void _search_for_chars(
const unsigned char* tok_str,
const int tok_str_l,
const unsigned char* search_chars,
const unsigned char* width_offsets,
const int max_res_l,
const bint suffs_not_prefs,
unsigned char* res_buf,
unsigned char* l_buf,
) nogil
cdef int _write_hashes(
const unsigned char* res_buf,
const unsigned char* aff_l_buf,

View File

@@ -180,12 +180,6 @@ class Doc:
case_sensitive: bool,
p_lengths: bytes,
s_lengths: bytes,
ps_search_chars: bytes,
ps_width_offsets: bytes,
ps_lengths: bytes,
ss_search_chars: bytes,
ss_width_offsets: bytes,
ss_lengths: bytes,
) -> Ints2d: ...
@staticmethod
def _get_array_attrs() -> Tuple[Any]: ...

View File

@@ -1751,43 +1751,16 @@ cdef class Doc:
const bint case_sensitive,
const unsigned char* p_lengths,
const unsigned char* s_lengths,
const unsigned char* ps_search_chars,
const unsigned char* ps_width_offsets,
const unsigned char* ps_lengths,
const unsigned char* ss_search_chars,
const unsigned char* ss_width_offsets,
const unsigned char* ss_lengths,
):
"""
Returns a 2D NumPy array where the rows represent tokens and the columns represent hashes of various character combinations
derived from the raw text of each token.
Generally:
p_ variables relate to prefixes (affixes starting at the beginning of the word)
s_ variables relate to suffixes (affixes starting at the end of the word)
ps_ variables relate to searches starting at the beginning of the word
ss_ variables relate to searches starting at the end of the word
cs: if *False*, hashes are generated based on the lower-case version of each token.
case_sensitive: if *False*, hashes are generated based on the lower-case version of each token.
p_lengths: an array of single-byte values specifying the lengths of prefixes to be hashed in ascending order.
For example, if *p_lengths==[2, 3]*, the prefixes hashed for "spaCy" would be "sp" and "spa".
s_lengths: an array of single-byte values specifying the lengths of suffixes to be hashed in ascending order.
For example, if *s_lengths==[2, 3]* and *cs == True*, the suffixes hashed for "spaCy" would be "yC" and "yCa".
ps_search_chars: a byte array containing, in numerical order, UTF-8 characters to search for within each token,
starting at the beginning.
ps_width_offsets: an array of single-byte values [1-char-start, 2-char-start, 3-char-start, 4-char-start, 4-char-end]
specifying the offsets within *ps_search_chars* that contain UTF-8 characters with the specified widths.
ps_lengths: an array of single-byte values specifying the lengths of search results (from the beginning) to be hashed
in ascending order. For example, if *ps_lengths==[1, 2]*, *ps_search=="aC"* and *cs==False*, the searched strings
hashed for "spaCy" would be "a" and "ac".
ss_search_chars: a byte array containing, in numerical order, UTF-8 characters to search for within each token,
starting at the end.
ss_width_offsets: an array of single-byte values [1-char-start, 2-char-start, 3-char-start, 4-char-start, 4-char-end]
specifying the offsets within *ss_search_chars* that contain UTF-8 characters with the specified widths.
ss_lengths: an array of single-byte values specifying the lengths of search results (from the end) to be hashed
in ascending order. For example, if *ss_lengths==[1, 2]*, *ss_search=="aC"* and *cs==False*, the searched strings
hashed for "spaCy" would be "c" and "ca".
Many of the buffers passed into and used by this method contain single-byte numerical values. This takes advantage of
the fact that we are hashing short affixes and searching for small groups of characters. The calling code is responsible
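As a usage sketch, mirroring the tests earlier in this diff and assuming the pre-removal signature shown here (the experimental `get_character_combination_hashes` method exists only on this branch, not in released spaCy):

```python
import spacy

nlp = spacy.blank("en")
doc = nlp("spaCy")

hashes = doc.get_character_combination_hashes(
    case_sensitive=False,
    p_lengths=bytes((1, 3)),   # prefixes "s" and "spa"
    s_lengths=bytes((2, 3)),   # reversed suffixes "yc" and "yca"
    ps_search_chars=bytes(),   # no prefix search
    ps_width_offsets=bytes(),
    ps_lengths=bytes(),
    ss_search_chars=bytes(),   # no suffix search
    ss_width_offsets=bytes(),
    ss_lengths=bytes(),
)
# One row per token, one column per requested length value.
assert hashes.shape == (1, 4)
```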
@@ -1801,27 +1774,16 @@ cdef class Doc:
# Work out lengths
cdef int p_lengths_l = strlen(<char*> p_lengths)
cdef int s_lengths_l = strlen(<char*> s_lengths)
cdef int ps_lengths_l = strlen(<char*> ps_lengths)
cdef int ss_lengths_l = strlen(<char*> ss_lengths)
cdef int hashes_per_tok = p_lengths_l + s_lengths_l + ps_lengths_l + ss_lengths_l
cdef int p_max_l = p_lengths[p_lengths_l - 1] if p_lengths_l > 0 else 0
cdef int s_max_l = s_lengths[s_lengths_l - 1] if s_lengths_l > 0 else 0
cdef int ps_max_l = ps_lengths[ps_lengths_l - 1] if ps_lengths_l > 0 else 0
cdef int ss_max_l = ss_lengths[ss_lengths_l - 1] if ss_lengths_l > 0 else 0
# Define / allocate buffers
cdef Pool mem = Pool()
cdef unsigned char* pref_l_buf = <unsigned char*> mem.alloc(p_max_l, sizeof(char))
cdef unsigned char* suff_l_buf = <unsigned char*> mem.alloc(s_max_l, sizeof(char))
cdef unsigned char* ps_res_buf = <unsigned char*> mem.alloc(ps_max_l,
MAX_UTF8_CHAR_BYTE_WIDTH * sizeof(char))
cdef unsigned char* ps_l_buf = <unsigned char*> mem.alloc(ps_max_l, sizeof(char))
cdef unsigned char* ss_res_buf = <unsigned char*> mem.alloc(ss_max_l,
MAX_UTF8_CHAR_BYTE_WIDTH * sizeof(char))
cdef unsigned char* ss_l_buf = <unsigned char*> mem.alloc(ss_max_l, sizeof(char))
cdef int doc_l = self.length
cdef np.ndarray[np.uint64_t, ndim=2] hashes = numpy.empty(
(doc_l, hashes_per_tok), dtype="uint64")
(doc_l, p_lengths_l + s_lengths_l), dtype="uint64")
cdef np.uint64_t* hashes_ptr = <np.uint64_t*> hashes.data
# Define working variables
@@ -1851,17 +1813,7 @@ cdef class Doc:
if s_max_l > 0:
_set_suffix_lengths(tok_str, tok_str_l, s_max_l, suff_l_buf)
hashes_ptr += _write_hashes(tok_str, s_lengths, suff_l_buf, tok_str_l - 1, hashes_ptr)
if ps_max_l > 0:
_search_for_chars(tok_str, tok_str_l, ps_search_chars, ps_width_offsets,
ps_max_l, False, ps_res_buf, ps_l_buf)
hashes_ptr += _write_hashes(ps_res_buf, ps_lengths, ps_l_buf, 0, hashes_ptr)
if ss_max_l > 0:
_search_for_chars(tok_str, tok_str_l, ss_search_chars, ss_width_offsets,
ss_max_l, True, ss_res_buf, ss_l_buf)
hashes_ptr += _write_hashes(ss_res_buf, ss_lengths, ss_l_buf, 0, hashes_ptr)
return hashes
@@ -2111,73 +2063,6 @@ cdef void _set_suffix_lengths(
memset(suff_l_buf + suff_l_buf_idx, suff_l_buf[suff_l_buf_idx - 1], s_max_l - suff_l_buf_idx)
cdef void _search_for_chars(
const unsigned char* tok_str,
const int tok_str_l,
const unsigned char* search_chars,
const unsigned char* width_offsets,
const int max_res_l,
const bint suffs_not_prefs,
unsigned char* res_buf,
unsigned char* l_buf,
) nogil:
""" Search *tok_str* within a string for characters within *search_chars*, starting at the
beginning or end depending on the value of *suffs_not_prefs*. Wherever a character matches,
it is added to *res_buf* and the byte length up to that point is added to *l_buf*. When nothing
more is found, the remainder of *l_buf* is populated with the byte length from the last result,
which may be *0* if the search was not successful.
tok_str: a UTF-8 representation of a string.
tok_str_l: the length of *tok_str*.
search_chars: a byte array containing, in numerical order, UTF-8 characters to search for within *tok_str*.
width_offsets: an array of single-byte values [1-char-start, 2-char-start, 3-char-start, 4-char-start, 4-char-end]
specifying the offsets within *search_chars* that contain UTF-8 characters with the specified widths.
max_res_l: the maximum number of found characters to place in *res_buf*.
suffs_not_prefs: if *True*, searching starts from the end of the word;
if *False*, from the beginning.
res_buf: the buffer in which to place the search results.
l_buf: a buffer of length *max_res_l* in which to store the end byte offsets of the found characters.
The calling code ensures that lengths greater than 255 cannot occur.
"""
cdef int res_buf_idx = 0, l_buf_idx = 0, ch_wdth, tok_start_idx, search_char_idx, end_search_idx
cdef int last_tok_str_idx = tok_str_l if suffs_not_prefs else 0
cdef int this_tok_str_idx = tok_str_l - 1 if suffs_not_prefs else 1
while this_tok_str_idx >= 0 and this_tok_str_idx <= tok_str_l:
if (
(this_tok_str_idx == tok_str_l) or
((tok_str[this_tok_str_idx] & 0xc0) != 0x80) # not continuation character, always applies to [0].
):
if this_tok_str_idx > last_tok_str_idx:
ch_wdth = this_tok_str_idx - last_tok_str_idx
else:
ch_wdth = last_tok_str_idx - this_tok_str_idx
tok_start_idx = this_tok_str_idx if suffs_not_prefs else last_tok_str_idx
search_char_idx = width_offsets[ch_wdth - 1]
end_search_idx = width_offsets[ch_wdth]
while search_char_idx < end_search_idx:
cmp_result = memcmp(&tok_str[tok_start_idx], &search_chars[search_char_idx], ch_wdth)
if cmp_result == 0:
memcpy(res_buf + res_buf_idx, &search_chars[search_char_idx], ch_wdth)
res_buf_idx += ch_wdth
l_buf[l_buf_idx] = res_buf_idx
l_buf_idx += 1
if l_buf_idx == max_res_l:
return
if cmp_result <= 0:
break
search_char_idx += ch_wdth
last_tok_str_idx = this_tok_str_idx
if suffs_not_prefs:
this_tok_str_idx -= 1
else:
this_tok_str_idx += 1
# fill in unused characters in the length buffer
memset(l_buf + l_buf_idx, res_buf_idx, max_res_l - l_buf_idx)
cdef uint64_t FNV1A_OFFSET_BASIS = 0xcbf29ce484222325
cdef uint64_t FNV1A_PRIME = 0x00000100000001B3
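The two constants above are the standard 64-bit FNV-1a parameters; a plain-Python equivalent of the hash they parameterise, shown here only to illustrate what `_write_hashes` presumably computes over the UTF-8 affix bytes:

```python
FNV1A_OFFSET_BASIS = 0xCBF29CE484222325
FNV1A_PRIME = 0x00000100000001B3


def fnv1a_64(data: bytes) -> int:
    # Standard 64-bit FNV-1a: XOR each byte into the hash, then multiply by the
    # prime, keeping the value within 64 bits.
    h = FNV1A_OFFSET_BASIS
    for b in data:
        h ^= b
        h = (h * FNV1A_PRIME) & 0xFFFFFFFFFFFFFFFF
    return h


print(hex(fnv1a_64("spa".encode("utf-8"))))
```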

View File

@@ -1736,44 +1736,3 @@ def all_equal(iterable):
(or if the input is an empty sequence), False otherwise."""
g = itertools.groupby(iterable)
return next(g, True) and not next(g, False)
def get_search_char_byte_arrays(
search_char_string: str, case_sensitive: bool
) -> Tuple[bytes, bytes]:
"""
This function supports *RichMultiHashEmbed*. It orders the characters in
*search_char_string*, removing any duplicates, encodes them as UTF-8, and
returns the result buffer together with a byte array containing the offsets
where the characters of various byte lengths start within the result buffer,
i.e.
<1-byte-start>, <2-byte-start>, <3-byte-start>, <4-byte-start>, <4-byte-end>.
If the result buffer does not contain any characters of length *n*,
<n_byte_start> == <n+1_byte_start>.
"""
if not case_sensitive:
search_char_string = search_char_string.lower()
ordered_search_char_string = "".join(sorted(set(search_char_string)))
search_chars = ordered_search_char_string.encode("UTF-8")
width_offsets = [-1] * 5
working_start = 0
working_width = 0
for idx in range(1, len(search_chars) + 1):
if (
idx == len(search_chars)
or search_chars[idx] & 0xC0 != 0x80 # not continuation byte
):
this_width = idx - working_start
if this_width > 4 or this_width < working_width:
raise RuntimeError(Errors.E1051)
if this_width > working_width:
for i in range(working_width, 5):
width_offsets[i] = working_start
working_width = this_width
working_start = idx
for i in range(this_width, 5):
width_offsets[i] = idx
return search_chars, bytes(width_offsets)
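For example, using the values from the tests above (the helper is removed by this commit, so this reflects the pre-removal behaviour):

```python
from spacy.util import get_search_char_byte_arrays

# Duplicates are dropped and the characters are sorted, so a case-sensitive
# call for "zzaaEP" yields the four one-byte characters "EPaz".
search_chars, width_offsets = get_search_char_byte_arrays("zzaaEP", True)
assert search_chars == b"EPaz"
# All four characters are one byte wide, so the 2-, 3- and 4-byte regions of
# the buffer are empty and their offsets all point at the end of the 1-byte run.
assert width_offsets == b"\x00\x04\x04\x04\x04"
```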

View File

@@ -186,9 +186,6 @@ updated).
> pref_rows = [10000,10000,10000]
> suff_lengths = [2, 3, 4, 5]
> suff_rows = [10000,10000,10000,10000]
> suff_search_chars = "aeiouäöüß"
> suff_search_lengths = [2, 3]
> suff_search_rows = [10000,10000]
> ```
Construct an embedding layer with the features of
@@ -198,35 +195,12 @@ features extracted from various positions in each token string. The fixed-length
`PREFIX` and `SUFFIX` features used in [MultiHashEmbed](#spacymultihashembedv2-multihashembed) are sometimes not rich
enough when working with languages with complex morphology, and this layer
allows the specification of multiple prefixes and suffixes of any lengths.
Additionally, it is possible to use as features the results of character
searches of specified lengths. A list of search characters is specified; the
characters in each word are examined in order starting at the beginning or at
the end; and each character that matches one of the search characters is added,
in order, to the string to be used as a feature. The search continues until
either the search result string is full or the whole word has been examined.
This is useful because some languages exhibit morphological alternations where
one letter or letters regularly alternate with another letter or letters
depending on the presence of some other letter before or after it, e.g. German
plural nouns where the final two vowels are `ä-e` regularly correspond to
singular lemmas where the `e` is no longer present and the `ä` has become `a`,
e.g. `die Bäche` (plural) vs. `der Bach` (singular).
For most languages used with spaCy, searching is likely to be useful starting at
the end (`suff_*`), but the ability to search from the beginning (`pref_*`) is
also offered for completeness. Search characters should consist of all
characters that regularly alternate with other characters in the language in
question or whose presence before or after characters that would otherwise
alternate prevents the alternation from occurring, e.g. an `ä` in a German
plural noun does not become `a` if it is the third or fourth vowel from the end
of the word.
Arrays specifying lengths must be in ascending order.
There are a few rare situations where a graphical character is expressed as more
than one UTF-8 character, e.g. _i_ when representing the lower-case form of the
Turkish letter _İ_. Such situations are supported, but the lengths of prefixes,
suffixes and character search results may need to be increased accordingly.
All arrays specifying lengths must be in ascending order.
Turkish letter _İ_. Such situations are supported, but the lengths of prefixes
and suffixes may need to be increased accordingly.
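A quick Python illustration of the _İ_ case mentioned above:

```python
# Lower-casing the Turkish dotted capital İ yields two Unicode code points:
# "i" followed by U+0307 COMBINING DOT ABOVE (three bytes in UTF-8 overall).
lowered = "İ".lower()
assert len(lowered) == 2
assert lowered.encode("utf-8") == b"i\xcc\x87"
# A length-1 suffix would therefore capture only the combining dot, so lengths
# may need to be increased to cover such characters in full.
```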
| Name | Description |
| ------------------------ | -------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
@@ -239,12 +213,6 @@ All arrays specifying lengths must be in ascending order.
| `pref_rows` | The number of rows for each of `pref_lengths`. ~~Optional[List[int]]~~ |
| `suff_lengths` | The lengths of suffixes to use as features for each word, e.g. for the word `spaCy`: `[1, 3]` would lead to `y` and `yCa` being used as features. ~~Optional[List[int]]~~ |
| `suff_rows` | The number of rows for each of `suff_lengths`. ~~Optional[List[int]]~~ |
| `pref_search_chars` | A string containing characters to search for starting from the beginning of each word. ~~Optional[str]~~ |
| `pref_search_lengths` | The lengths of search result strings to use as features, where the searches start from the beginning of each word. ~~Optional[List[int]]~~ |
| `pref_search_rows` | The number of rows for each of `pref_search_lengths`. ~~Optional[List[int]]~~ |
| `suff_search_chars` | A string containing characters to search for starting from the end of each word. ~~Optional[str]~~ |
| `suff_search_lengths` | The lengths of search result strings to use as features, where the searches start from the end of each word. ~~Optional[List[int]]~~ |
| `suff_search_rows` | The number of rows for each of `suff_search_lengths`. ~~Optional[List[int]]~~ |
| **CREATES** | The model using the architecture. ~~Model[List[Doc], List[Floats2d]]~~ |
### spacy.CharacterEmbed.v2 {#CharacterEmbed}