Mirror of https://github.com/explosion/spaCy.git (synced 2025-08-02 19:30:19 +03:00)

Remove search functionality

Commit e95dd432d1 (parent 2116a71962)
@@ -193,23 +193,14 @@ def _verify_rich_config_group(
     label: str,
     lengths: Optional[List[int]],
     rows: Optional[List[int]],
-    search_chars: Optional[str],
-    is_search_char_group: bool,
 ) -> None:
     if lengths is not None or rows is not None:
-        if is_search_char_group and (search_chars is None or len(search_chars) == 0):
-            raise ValueError(Errors.E1048.format(label=label))
-        if search_chars is not None and len(search_chars) > 63:
-            raise ValueError(Errors.E1049.format(label=label))
         if lengths is None or rows is None:
             raise ValueError(Errors.E1048.format(label=label))
         if len(lengths) != len(rows):
             raise ValueError(Errors.E1048.format(label=label))
         if any([length < 1 for length in lengths]):
             raise ValueError(Errors.E1048.format(label=label))
-    elif search_chars is not None:
-        raise ValueError(Errors.E1048.format(label=label))
-    if lengths is not None:
         if lengths[-1] > 63:
             raise ValueError(Errors.E1049.format(label=label))
         if len(lengths) != len(set(lengths)) or lengths != sorted(lengths):
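(Note: the surviving checks boil down to a handful of constraints on each lengths/rows pair. The following is a minimal standalone sketch of that validation with generic error messages instead of spaCy's `Errors.E1048`/`E1049`; it is an illustration, not the project code.)

```python
from typing import List, Optional


def verify_group(label: str, lengths: Optional[List[int]], rows: Optional[List[int]]) -> None:
    # Sketch of what _verify_rich_config_group enforces after this commit.
    if lengths is None and rows is None:
        return  # the whole group may be omitted
    if lengths is None or rows is None:
        raise ValueError(f"{label}: lengths and rows must be specified together")
    if len(lengths) != len(rows):
        raise ValueError(f"{label}: lengths and rows must have the same number of entries")
    if any(length < 1 for length in lengths):
        raise ValueError(f"{label}: lengths must be positive")
    if lengths[-1] > 63:
        raise ValueError(f"{label}: lengths may not exceed 63")
    if len(lengths) != len(set(lengths)) or lengths != sorted(lengths):
        raise ValueError(f"{label}: lengths must be unique and in ascending order")


verify_group("prefix", [1, 3, 4], [10000, 10000, 10000])  # passes silently
```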
@@ -227,12 +218,6 @@ def RichMultiHashEmbed(
     pref_rows: Optional[List[int]] = None,
     suff_lengths: Optional[List[int]] = None,
     suff_rows: Optional[List[int]] = None,
-    pref_search_chars: Optional[str] = None,
-    pref_search_lengths: Optional[List[int]] = None,
-    pref_search_rows: Optional[List[int]] = None,
-    suff_search_chars: Optional[str] = None,
-    suff_search_lengths: Optional[List[int]] = None,
-    suff_search_rows: Optional[List[int]] = None,
 ) -> Model[List[Doc], List[Floats2d]]:
     """
     Construct an embedding layer with the features of `MultiHashEmbed` (see above)
@@ -240,37 +225,12 @@ def RichMultiHashEmbed(
     The fixed-length `PREFIX` and `SUFFIX` features used in `MultiHashEmbed`
     are sometimes not rich enough when working with languages with complex morphology,
     and this layer allows the specification of multiple prefixes and suffixes
-    of any lengths.
-
-    Additionally, it is possible to use as features the results of character
-    searches of specified lengths. A list of search characters is specified; the
-    characters in each word are examined in order starting at the beginning or at
-    the end; and each character that matches one of the search characters is added,
-    in order, to the string to be used as a feature. The search continues until
-    either the search result string is full or the whole word has been examined.
-    This is useful because some languages exhibit morphological alternations where
-    one letter or letters regularly alternate with another letter or letters
-    depending on the presence of some other letter before or after it, e.g. German
-    plural nouns where the final two vowels are `ä-e` regularly correspond to
-    singular lemmas where the `e` is no longer present and the `ä` has become `a`,
-    e.g. `die Bäche` (plural) vs. `der Bach` (singular).
-
-    For most languages used with spaCy, searching is likely to be useful starting
-    at the end (`suff_*`), but the ability to search from the beginning (`pref_*`)
-    is also offered for completeness. Search characters should consist of all
-    characters that regularly alternate with other characters in the language in
-    question or whose presence before or after characters that would otherwise
-    alternate prevents the alternation from occurring, e.g. an `ä` in a German
-    plural noun does not become `a` if it is the third or fourth vowel from the
-    end of the word.
+    of any lengths. Arrays specifying lengths must be in ascending order.

     There are a few rare situations where a graphical character is expressed as
     more than one UTF-8 character, e.g. *i* when representing the lower-case form
     of the Turkish letter *İ*. Such situations are supported, but the lengths of
-    prefixes, suffixes and character search results may need to be increased
-    accordingly.
-
-    All arrays specifying lengths must be in ascending order.
+    prefixes and suffixes may need to be increased accordingly.

     width (int): The output width. Also used as the width of the embedding tables.
         Recommended values are between 64 and 300.
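(Note: the removed paragraphs describe the character-search feature in prose. As a plain illustration of that documented behaviour — scan the word from one end, keep each character that belongs to the search set, stop when the result string is full — here is a small sketch with hypothetical names; it is not the deleted Cython implementation.)

```python
def char_search(word: str, search_chars: str, max_len: int, from_end: bool = True) -> str:
    # Collect, in the order encountered, characters of `word` that occur in
    # `search_chars`, scanning from the end (suffix search) or the beginning.
    scan = reversed(word) if from_end else iter(word)
    found = [ch for ch in scan if ch in search_chars]
    return "".join(found[:max_len])


# German alternation from the removed docstring, using the vowel set from the
# removed documentation example ("aeiouäöüß"):
print(char_search("Bäche".lower(), "aeiouäöüß", max_len=2))  # "eä" (plural)
print(char_search("Bach".lower(), "aeiouäöüß", max_len=2))   # "a"  (singular)
```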
@@ -290,39 +250,13 @@ def RichMultiHashEmbed(
         for each word, e.g. for the word `spaCy`:
         `[1, 3]` would lead to `y` and `yCa` being used as features.
     suff_rows (Optional[List[int]]): The number of rows for each of `suff_lengths`.
-    pref_search_chars (Optional[str]): A string containing characters to search for
-        starting from the beginning of each word.
-    pref_search_lengths (Optional[List[int]]): The lengths of search result strings
-        to use as features, where the searches start from the beginning of each word.
-    pref_search_rows (Optional[List[int]]): The number of rows for each of
-        `pref_search_lengths`.
-    suff_search_chars (Optional[str]): A string containing characters to search for
-        starting from the end of each word.
-    suff_search_lengths (Optional[List[int]]): The lengths of search result strings
-        to use as features, where the searches start from the end of each word.
-    suff_search_rows (Optional[List[int]]): The number of rows for each of
-        `suff_search_lengths`.
     """

     if len(rows) != len(attrs):
         raise ValueError(f"Mismatched lengths: {len(rows)} vs {len(attrs)}")

-    _verify_rich_config_group("prefix", pref_lengths, pref_rows, None, False)
-    _verify_rich_config_group("suffix", suff_lengths, suff_rows, None, False)
-    _verify_rich_config_group(
-        "prefix search",
-        pref_search_lengths,
-        pref_search_rows,
-        pref_search_chars,
-        True,
-    )
-    _verify_rich_config_group(
-        "suffix search",
-        suff_search_lengths,
-        suff_search_rows,
-        suff_search_chars,
-        True,
-    )
+    _verify_rich_config_group("prefix", pref_lengths, pref_rows)
+    _verify_rich_config_group("suffix", suff_lengths, suff_rows)

     if "PREFIX" in attrs or "SUFFIX" in attrs:
         warnings.warn(Warnings.W124)
@@ -331,10 +265,6 @@ def RichMultiHashEmbed(
         rows.extend(pref_rows)
     if suff_rows is not None:
         rows.extend(suff_rows)
-    if pref_search_rows is not None:
-        rows.extend(pref_search_rows)
-    if suff_search_rows is not None:
-        rows.extend(suff_search_rows)

     embeddings: List[Model[Ints2d, Floats2d]] = [
         HashEmbed(width, row, column=i, seed=i + 7, dropout=0.0)
@@ -350,10 +280,6 @@ def RichMultiHashEmbed(
             case_sensitive=case_sensitive,
             pref_lengths=pref_lengths,
             suff_lengths=suff_lengths,
-            pref_search_chars=pref_search_chars,
-            pref_search_lengths=pref_search_lengths,
-            suff_search_chars=suff_search_chars,
-            suff_search_lengths=suff_search_lengths,
         ),
     )

@@ -2,7 +2,6 @@ from typing import List, Optional, Callable, Tuple
 from thinc.types import Ints2d
 from thinc.api import Model, registry, get_current_ops
 from ..tokens import Doc
-from ..util import get_search_char_byte_arrays


 @registry.layers("spacy.RichFeatureExtractor.v1")
@@ -11,27 +10,7 @@ def RichFeatureExtractor(
     case_sensitive: bool,
     pref_lengths: Optional[List[int]] = None,
     suff_lengths: Optional[List[int]] = None,
-    pref_search_chars: Optional[str] = None,
-    pref_search_lengths: Optional[List[int]] = None,
-    suff_search_chars: Optional[str] = None,
-    suff_search_lengths: Optional[List[int]] = None,
 ) -> Model[List[Doc], List[Ints2d]]:
-    ops = get_current_ops()
-    if pref_search_chars is not None:
-        ps_search_chars, ps_width_offsets = get_search_char_byte_arrays(
-            pref_search_chars, case_sensitive
-        )
-    else:
-        ps_search_chars = bytes()
-        ps_width_offsets = bytes()
-    if suff_search_chars is not None:
-
-        ss_search_chars, ss_width_offsets = get_search_char_byte_arrays(
-            suff_search_chars, case_sensitive
-        )
-    else:
-        ss_search_chars = bytes()
-        ss_width_offsets = bytes()
     return Model(
         "extract_character_combination_hashes",
         forward,
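(Note: a rough usage sketch of the slimmed-down layer. It assumes the registered name from the decorator above and that the remaining keyword arguments behave as shown in the signature; the construction path itself is not part of the diff, and the layer only exists on this feature branch.)

```python
import spacy
from thinc.api import registry

# Look the factory up by the name registered above.
make_extractor = registry.layers.get("spacy.RichFeatureExtractor.v1")
extractor = make_extractor(case_sensitive=False, pref_lengths=[1, 3], suff_lengths=[2, 4])

nlp = spacy.blank("en")
doc = nlp("spaCy rocks")
(hashes,) = extractor.predict([doc])  # one row per token, one uint64 column per requested length
print(hashes.shape)                   # expected: (2, 4)
```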
@@ -39,16 +18,6 @@ def RichFeatureExtractor(
             "case_sensitive": case_sensitive,
             "p_lengths": bytes(pref_lengths) if pref_lengths is not None else bytes(),
             "s_lengths": bytes(suff_lengths) if suff_lengths is not None else bytes(),
-            "ps_search_chars": ps_search_chars,
-            "ps_width_offsets": ps_width_offsets,
-            "ps_lengths": bytes(pref_search_lengths)
-            if pref_search_lengths is not None
-            else bytes(),
-            "ss_search_chars": ss_search_chars,
-            "ss_width_offsets": ss_width_offsets,
-            "ss_lengths": bytes(suff_search_lengths)
-            if suff_search_lengths is not None
-            else bytes(),
         },
     )

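(Note on the `bytes(...)` conversions kept above: passing a list of small integers to `bytes()` produces a buffer with one byte per requested length, which is what the Cython side later measures with `strlen` — presumably also why a length of 0 is rejected by the validation. Plain Python, independent of spaCy:)

```python
p_lengths = bytes([1, 3, 5])
print(p_lengths)       # b'\x01\x03\x05'
print(len(p_lengths))  # 3 -> three prefix lengths, one byte each
```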
@@ -60,24 +29,12 @@ def forward(
     case_sensitive: bool = model.attrs["case_sensitive"]
     p_lengths: bytes = model.attrs["p_lengths"]
     s_lengths: bytes = model.attrs["s_lengths"]
-    ps_search_chars: bytes = model.attrs["ps_search_chars"]
-    ps_width_offsets: bytes = model.attrs["ps_width_offsets"]
-    ps_lengths: bytes = model.attrs["ps_lengths"]
-    ss_search_chars: bytes = model.attrs["ss_search_chars"]
-    ss_width_offsets: bytes = model.attrs["ss_width_offsets"]
-    ss_lengths: bytes = model.attrs["ss_lengths"]
     features: List[Ints2d] = []
     for doc in docs:
         hashes = doc.get_character_combination_hashes(
             case_sensitive=case_sensitive,
             p_lengths=p_lengths,
             s_lengths=s_lengths,
-            ps_search_chars=ps_search_chars,
-            ps_width_offsets=ps_width_offsets,
-            ps_lengths=ps_lengths,
-            ss_search_chars=ss_search_chars,
-            ss_width_offsets=ss_width_offsets,
-            ss_lengths=ss_lengths,
         )
         features.append(ops.asarray2i(hashes, dtype="uint64"))

@@ -15,7 +15,6 @@ from spacy.lang.xx import MultiLanguage
 from spacy.language import Language
 from spacy.lexeme import Lexeme
 from spacy.tokens import Doc, Span, SpanGroup, Token
-from spacy.util import get_search_char_byte_arrays
 from spacy.vocab import Vocab

 from .test_underscore import clean_underscore  # noqa: F401
@@ -1450,12 +1449,6 @@ def _encode_and_hash(input: str, *, reverse: bool = False) -> int:
 def test_get_character_combination_hashes_good_case(en_tokenizer, case_sensitive):
     doc = en_tokenizer("spaCy✨ and Prodigy")

-    ps_search_chars, ps_width_offsets = get_search_char_byte_arrays(
-        "Rp", case_sensitive
-    )
-    ss_search_chars, ss_width_offsets = get_search_char_byte_arrays(
-        "xx✨rp", case_sensitive
-    )
     hashes = doc.get_character_combination_hashes(
         case_sensitive=case_sensitive,
         p_lengths=bytes(
@@ -1473,17 +1466,6 @@ def test_get_character_combination_hashes_good_case(en_tokenizer, case_sensitive
                 5,
             )
         ),
-        ps_search_chars=ps_search_chars,
-        ps_width_offsets=ps_width_offsets,
-        ps_lengths=bytes((2,)),
-        ss_search_chars=ss_search_chars,
-        ss_width_offsets=ss_width_offsets,
-        ss_lengths=bytes(
-            (
-                1,
-                2,
-            )
-        ),
     )
     assert hashes[0][0] == _encode_and_hash("s")
     assert hashes[0][1] == _encode_and_hash("spa")
@@ -1492,9 +1474,6 @@ def test_get_character_combination_hashes_good_case(en_tokenizer, case_sensitive
     assert hashes[0][4] == _encode_and_hash("yCa" if case_sensitive else "yca")
     assert hashes[0][5] == _encode_and_hash("yCap" if case_sensitive else "ycap")
     assert hashes[0][6] == _encode_and_hash("yCaps" if case_sensitive else "ycaps")
-    assert hashes[0][7] == _encode_and_hash("p")
-    assert hashes[0][8] == _encode_and_hash("p")
-    assert hashes[0][9] == _encode_and_hash("p")
     assert hashes[1][0] == _encode_and_hash("✨")
     assert hashes[1][1] == _encode_and_hash("✨")
     assert hashes[1][2] == _encode_and_hash("✨")
@@ -1502,9 +1481,6 @@ def test_get_character_combination_hashes_good_case(en_tokenizer, case_sensitive
     assert hashes[1][4] == _encode_and_hash("✨", reverse=True)
     assert hashes[1][5] == _encode_and_hash("✨", reverse=True)
     assert hashes[1][6] == _encode_and_hash("✨", reverse=True)
-    assert hashes[1][7] == EMPTY_HASH_VALUE
-    assert hashes[1][8] == _encode_and_hash("✨")
-    assert hashes[1][9] == _encode_and_hash("✨")
     assert hashes[2][0] == _encode_and_hash("a")
     assert hashes[2][1] == _encode_and_hash("and")
     assert hashes[2][2] == _encode_and_hash("and")
@@ -1512,9 +1488,6 @@ def test_get_character_combination_hashes_good_case(en_tokenizer, case_sensitive
     assert hashes[2][4] == _encode_and_hash("dna")
     assert hashes[2][5] == _encode_and_hash("dna")
     assert hashes[2][6] == _encode_and_hash("dna")
-    assert hashes[2][7] == EMPTY_HASH_VALUE
-    assert hashes[2][8] == EMPTY_HASH_VALUE
-    assert hashes[2][9] == EMPTY_HASH_VALUE
     assert hashes[3][0] == _encode_and_hash("P" if case_sensitive else "p")
     assert hashes[3][1] == _encode_and_hash("Pro" if case_sensitive else "pro")
     assert hashes[3][2] == _encode_and_hash("Prod" if case_sensitive else "prod")
@@ -1522,21 +1495,10 @@ def test_get_character_combination_hashes_good_case(en_tokenizer, case_sensitive
     assert hashes[3][4] == _encode_and_hash("ygi")
     assert hashes[3][5] == _encode_and_hash("ygid")
     assert hashes[3][6] == _encode_and_hash("ygido")
-    assert (
-        hashes[3][7] == EMPTY_HASH_VALUE if case_sensitive else _encode_and_hash("pr")
-    )
-
-    assert hashes[3][8] == _encode_and_hash("r")
-
-    if case_sensitive:
-        assert hashes[3][9] == _encode_and_hash("r")
-    else:
-        assert hashes[3][9] == _encode_and_hash("rp")


 def test_get_character_combination_hashes_good_case_partial(en_tokenizer):
     doc = en_tokenizer("spaCy✨ and Prodigy")
-    ps_search_chars, ps_width_offsets = get_search_char_byte_arrays("rp", False)
     hashes = doc.get_character_combination_hashes(
         case_sensitive=False,
         p_lengths=bytes(),
@@ -1548,34 +1510,24 @@ def test_get_character_combination_hashes_good_case_partial(en_tokenizer):
                 5,
             )
         ),
-        ps_search_chars=ps_search_chars,
-        ps_width_offsets=ps_width_offsets,
-        ps_lengths=bytes((2,)),
-        ss_search_chars=bytes(),
-        ss_width_offsets=bytes(),
-        ss_lengths=bytes(),
     )

     assert hashes[0][0] == _encode_and_hash("yc")
     assert hashes[0][1] == _encode_and_hash("yca")
     assert hashes[0][2] == _encode_and_hash("ycap")
     assert hashes[0][3] == _encode_and_hash("ycaps")
-    assert hashes[0][4] == _encode_and_hash("p")
     assert hashes[1][0] == _encode_and_hash("✨", reverse=True)
     assert hashes[1][1] == _encode_and_hash("✨", reverse=True)
     assert hashes[1][2] == _encode_and_hash("✨", reverse=True)
     assert hashes[1][3] == _encode_and_hash("✨", reverse=True)
-    assert hashes[1][4] == EMPTY_HASH_VALUE
     assert hashes[2][0] == _encode_and_hash("dn")
     assert hashes[2][1] == _encode_and_hash("dna")
     assert hashes[2][2] == _encode_and_hash("dna")
     assert hashes[2][3] == _encode_and_hash("dna")
-    assert hashes[2][4] == EMPTY_HASH_VALUE
     assert hashes[3][0] == _encode_and_hash("yg")
     assert hashes[3][1] == _encode_and_hash("ygi")
     assert hashes[3][2] == _encode_and_hash("ygid")
     assert hashes[3][3] == _encode_and_hash("ygido")
-    assert hashes[3][4] == _encode_and_hash("pr")


 def test_get_character_combination_hashes_various_lengths(en_tokenizer):
@@ -1588,12 +1540,6 @@ def test_get_character_combination_hashes_various_lengths(en_tokenizer):
             case_sensitive=False,
             p_lengths=bytes((p_length,)),
             s_lengths=bytes((s_length,)),
-            ps_search_chars=bytes(),
-            ps_width_offsets=bytes(),
-            ps_lengths=bytes(),
-            ss_search_chars=bytes(),
-            ss_width_offsets=bytes(),
-            ss_lengths=bytes(),
         )

         assert hashes[0][0] == _encode_and_hash("sp𐌞cé"[:p_length])
@@ -1605,7 +1551,6 @@ def test_get_character_combination_hashes_turkish_i_with_dot(
     en_tokenizer, case_sensitive
 ):
     doc = en_tokenizer("İ".lower() + "İ")
-    search_chars, width_offsets = get_search_char_byte_arrays("İ", case_sensitive)
     hashes = doc.get_character_combination_hashes(
         case_sensitive=case_sensitive,
         p_lengths=bytes(
@@ -1624,26 +1569,6 @@ def test_get_character_combination_hashes_turkish_i_with_dot(
                 4,
             )
         ),
-        ps_search_chars=search_chars,
-        ps_width_offsets=width_offsets,
-        ps_lengths=bytes(
-            (
-                1,
-                2,
-                3,
-                4,
-            )
-        ),
-        ss_search_chars=search_chars,
-        ss_width_offsets=width_offsets,
-        ss_lengths=bytes(
-            (
-                1,
-                2,
-                3,
-                4,
-            )
-        ),
     )

     COMBINING_DOT_ABOVE = b"\xcc\x87".decode("UTF-8")
@@ -1656,10 +1581,6 @@ def test_get_character_combination_hashes_turkish_i_with_dot(
         assert hashes[0][5] == _encode_and_hash(COMBINING_DOT_ABOVE + "İ", reverse=True)
         assert hashes[0][6] == _encode_and_hash("İ".lower() + "İ", reverse=True)
         assert hashes[0][7] == _encode_and_hash("İ".lower() + "İ", reverse=True)
-        assert hashes[0][8] == _encode_and_hash("İ")
-        assert hashes[0][9] == _encode_and_hash("İ")
-        assert hashes[0][12] == _encode_and_hash("İ")
-        assert hashes[0][13] == _encode_and_hash("İ")

     else:
         assert hashes[0][2] == _encode_and_hash("İ".lower() + "i")
@@ -1670,16 +1591,6 @@ def test_get_character_combination_hashes_turkish_i_with_dot(
             COMBINING_DOT_ABOVE + "İ".lower(), reverse=True
         )
         assert hashes[0][7] == _encode_and_hash("İ".lower() * 2, reverse=True)
-        assert hashes[0][8] == _encode_and_hash("i")
-        assert hashes[0][9] == _encode_and_hash("İ".lower())
-        assert hashes[0][10] == _encode_and_hash("İ".lower() + "i")
-        assert hashes[0][11] == _encode_and_hash("İ".lower() * 2)
-        assert hashes[0][12] == _encode_and_hash(COMBINING_DOT_ABOVE)
-        assert hashes[0][13] == _encode_and_hash(COMBINING_DOT_ABOVE + "i")
-        assert hashes[0][14] == _encode_and_hash(
-            COMBINING_DOT_ABOVE + "i" + COMBINING_DOT_ABOVE
-        )
-        assert hashes[0][15] == _encode_and_hash((COMBINING_DOT_ABOVE + "i") * 2)


 @pytest.mark.parametrize("case_sensitive", [True, False])
@@ -1693,33 +1604,17 @@ def test_get_character_combination_hashes_string_store_spec_cases(
     assert len(long_word) > 255
     doc = en_tokenizer(" ".join((symbol, short_word, normal_word, long_word)))
     assert len(doc) == 4
-    ps_search_chars, ps_width_offsets = get_search_char_byte_arrays("E", case_sensitive)
     hashes = doc.get_character_combination_hashes(
         case_sensitive=case_sensitive,
         p_lengths=bytes((2,)),
         s_lengths=bytes((2,)),
-        ps_search_chars=ps_search_chars,
-        ps_width_offsets=ps_width_offsets,
-        ps_lengths=bytes((2,)),
-        ss_search_chars=bytes(),
-        ss_width_offsets=bytes(),
-        ss_lengths=bytes(),
     )
     assert hashes[0][0] == _encode_and_hash("FL" if case_sensitive else "fl")
     assert hashes[0][1] == _encode_and_hash("91")
-    assert hashes[0][2] == EMPTY_HASH_VALUE
     assert hashes[1][0] == _encode_and_hash("be")
     assert hashes[1][1] == _encode_and_hash("ee")
-    if case_sensitive:
-        assert hashes[1][2] == EMPTY_HASH_VALUE
-    else:
-        assert hashes[1][2] == _encode_and_hash("ee")
     assert hashes[2][0] == hashes[3][0] == _encode_and_hash("se")
     assert hashes[2][1] == hashes[3][1] == _encode_and_hash("yt")
-    if case_sensitive:
-        assert hashes[2][2] == hashes[3][2] == EMPTY_HASH_VALUE
-    else:
-        assert hashes[2][2] == hashes[3][2] == _encode_and_hash("ee")


 def test_character_combination_hashes_empty_lengths(en_tokenizer):
@@ -1728,10 +1623,4 @@ def test_character_combination_hashes_empty_lengths(en_tokenizer):
         case_sensitive=True,
         p_lengths=bytes(),
         s_lengths=bytes(),
-        ps_search_chars=bytes(),
-        ps_width_offsets=bytes(),
-        ps_lengths=bytes(),
-        ss_search_chars=bytes(),
-        ss_width_offsets=bytes(),
-        ss_lengths=bytes(),
     ).shape == (1, 0)
@@ -1,86 +0,0 @@
-import spacy
-import pytest
-
-
-@pytest.mark.parametrize("case_sensitive", [True, False])
-def test_get_search_char_byte_arrays_1_width_only(case_sensitive):
-    search_chars, width_offsets = spacy.util.get_search_char_byte_arrays(
-        "zzaaEP", case_sensitive
-    )
-    if case_sensitive:
-        assert search_chars == b"EPaz"
-    else:
-        assert search_chars == b"aepz"
-    assert width_offsets == b"\x00\x04\x04\x04\x04"
-
-
-@pytest.mark.parametrize("case_sensitive", [True, False])
-def test_get_search_char_byte_arrays_4_width_only(case_sensitive):
-    search_chars, width_offsets = spacy.util.get_search_char_byte_arrays(
-        "𐌞", case_sensitive
-    )
-    assert search_chars == "𐌞".encode("utf-8")
-    assert width_offsets == b"\x00\x00\x00\x00\x04"
-
-
-@pytest.mark.parametrize("case_sensitive", [True, False])
-def test_get_search_char_byte_arrays_all_widths(case_sensitive):
-    search_chars, width_offsets = spacy.util.get_search_char_byte_arrays(
-        "𐌞Éabé—B𐌞", case_sensitive
-    )
-    if case_sensitive:
-        assert search_chars == "BabÉé—𐌞".encode("utf-8")
-        assert width_offsets == b"\x00\x03\x07\x0a\x0e"
-    else:
-        assert search_chars == "abé—𐌞".encode("utf-8")
-        assert width_offsets == b"\x00\x02\x04\x07\x0b"
-
-
-@pytest.mark.parametrize("case_sensitive", [True, False])
-def test_get_search_char_byte_arrays_widths_1_and_3(case_sensitive):
-    search_chars, width_offsets = spacy.util.get_search_char_byte_arrays(
-        "B—", case_sensitive
-    )
-    if case_sensitive:
-        assert search_chars == "B—".encode("utf-8")
-    else:
-        assert search_chars == "b—".encode("utf-8")
-    assert width_offsets == b"\x00\x01\x01\x04\x04"
-
-
-@pytest.mark.parametrize("case_sensitive", [True, False])
-def test_get_search_char_byte_arrays_widths_1_and_4(case_sensitive):
-    search_chars, width_offsets = spacy.util.get_search_char_byte_arrays(
-        "B𐌞", case_sensitive
-    )
-    if case_sensitive:
-        assert search_chars == "B𐌞".encode("utf-8")
-    else:
-        assert search_chars == "b𐌞".encode("utf-8")
-    assert width_offsets == b"\x00\x01\x01\x01\x05"
-
-
-@pytest.mark.parametrize("case_sensitive", [True, False])
-def test_turkish_i_with_dot(case_sensitive):
-    search_chars, width_offsets = spacy.util.get_search_char_byte_arrays(
-        "İ", case_sensitive
-    )
-    if case_sensitive:
-        assert search_chars == "İ".encode("utf-8")
-        assert width_offsets == b"\x00\x00\x02\x02\x02"
-    else:
-        assert search_chars == b"i\xcc\x87"
-        assert width_offsets == b"\x00\x01\x03\x03\x03"
-
-
-@pytest.mark.parametrize("case_sensitive", [True, False])
-def test_turkish_i_with_dot_and_normal_i(case_sensitive):
-    search_chars, width_offsets = spacy.util.get_search_char_byte_arrays(
-        "İI", case_sensitive
-    )
-    if case_sensitive:
-        assert search_chars == "Iİ".encode("utf-8")
-        assert width_offsets == b"\x00\x01\x03\x03\x03"
-    else:
-        assert search_chars == b"i\xcc\x87"
-        assert width_offsets == b"\x00\x01\x03\x03\x03"
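(Note: the deleted tests above pin down the `(search_chars, width_offsets)` format: the characters are sorted and UTF-8 encoded, and `width_offsets[i]` marks where the (i+1)-byte-wide characters start, with `width_offsets[4]` as the end. A self-contained sketch of decoding such a pair, using the expected values from the case-insensitive "all widths" test:)

```python
search_chars = "abé—𐌞".encode("utf-8")
width_offsets = b"\x00\x02\x04\x07\x0b"  # from the deleted test above

for width in range(1, 5):
    start, end = width_offsets[width - 1], width_offsets[width]
    chunk = search_chars[start:end]
    chars = [chunk[i:i + width].decode("utf-8") for i in range(0, len(chunk), width)]
    print(width, chars)
# 1 ['a', 'b']
# 2 ['é']
# 3 ['—']
# 4 ['𐌞']
```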
@@ -50,18 +50,6 @@ cdef void _set_suffix_lengths(
 ) nogil


-cdef void _search_for_chars(
-    const unsigned char* tok_str,
-    const int tok_str_l,
-    const unsigned char* search_chars,
-    const unsigned char* width_offsets,
-    const int max_res_l,
-    const bint suffs_not_prefs,
-    unsigned char* res_buf,
-    unsigned char* l_buf,
-) nogil
-
-
 cdef int _write_hashes(
     const unsigned char* res_buf,
     const unsigned char* aff_l_buf,
@@ -180,12 +180,6 @@ class Doc:
         case_sensitive: bool,
         p_lengths: bytes,
         s_lengths: bytes,
-        ps_search_chars: bytes,
-        ps_width_offsets: bytes,
-        ps_lengths: bytes,
-        ss_search_chars: bytes,
-        ss_width_offsets: bytes,
-        ss_lengths: bytes,
     ) -> Ints2d: ...
     @staticmethod
     def _get_array_attrs() -> Tuple[Any]: ...
@@ -1751,43 +1751,16 @@ cdef class Doc:
         const bint case_sensitive,
         const unsigned char* p_lengths,
         const unsigned char* s_lengths,
-        const unsigned char* ps_search_chars,
-        const unsigned char* ps_width_offsets,
-        const unsigned char* ps_lengths,
-        const unsigned char* ss_search_chars,
-        const unsigned char* ss_width_offsets,
-        const unsigned char* ss_lengths,
     ):
         """
         Returns a 2D NumPy array where the rows represent tokens and the columns represent hashes of various character combinations
        derived from the raw text of each token.

-        Generally:
-
-        p_ variables relate to prefixes (affixes starting at the beginning of the word)
-        s_ variables relate to suffixes (affixes starting at the end of the word)
-        ps_ variables relate to searches starting at the beginning of the word
-        ss_ variables relate to searches starting at the end of the word
-
-        cs: if *False*, hashes are generated based on the lower-case version of each token.
+        case_sensitive: if *False*, hashes are generated based on the lower-case version of each token.
         p_lengths: an array of single-byte values specifying the lengths of prefixes to be hashed in ascending order.
             For example, if *p_lengths==[2, 3]*, the prefixes hashed for "spaCy" would be "sp" and "spa".
         s_lengths: an array of single-byte values specifying the lengths of suffixes to be hashed in ascending order.
             For example, if *s_lengths==[2, 3]* and *cs == True*, the suffixes hashed for "spaCy" would be "yC" and "yCa".
-        ps_search_chars: a byte array containing, in numerical order, UTF-8 characters to search for within each token,
-            starting at the beginning.
-        ps_width_offsets: an array of single-byte values [1-char-start, 2-char-start, 3-char-start, 4-char-start, 4-char-end]
-            specifying the offsets within *ps_search_chars* that contain UTF-8 characters with the specified widths.
-        ps_lengths: an array of single-byte values specifying the lengths of search results (from the beginning) to be hashed
-            in ascending order. For example, if *ps_lengths==[1, 2]*, *ps_search=="aC" and *cs==False*, the searched strings
-            hashed for "spaCy" would be "a" and "ac".
-        ss_search_chars: a byte array containing, in numerical order, UTF-8 characters to search for within each token,
-            starting at the end.
-        ss_width_offsets: an array of single-byte values [1-char-start, 2-char-start, 3-char-start, 4-char-start, 4-char-end]
-            specifying the offsets within *ss_search_chars* that contain UTF-8 characters with the specified widths.
-        ss_lengths: an array of single-byte values specifying the lengths of search results (from the end) to be hashed
-            in ascending order. For example, if *ss_lengths==[1, 2]*, *ss_search=="aC" and *cs==False*, the searched strings
-            hashed for "spaCy" would be "c" and "ca".

         Many of the buffers passed into and used by this method contain single-byte numerical values. This takes advantage of
         the fact that we are hashing short affixes and searching for small groups of characters. The calling code is responsible
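(Note: with the search parameters gone, a call to the method documented above reduces to the two length buffers. A sketch based on the docstring examples — it assumes a build of this feature branch, since `get_character_combination_hashes` is not part of released spaCy:)

```python
import spacy

nlp = spacy.blank("en")
doc = nlp("spaCy")
hashes = doc.get_character_combination_hashes(
    case_sensitive=True,
    p_lengths=bytes((2, 3)),  # hash the prefixes "sp" and "spa"
    s_lengths=bytes((2, 3)),  # hash the suffixes "yC" and "yCa"
)
print(hashes.shape)  # (1, 4): one token, four uint64 hash columns
```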
@@ -1801,27 +1774,16 @@ cdef class Doc:
         # Work out lengths
         cdef int p_lengths_l = strlen(<char*> p_lengths)
         cdef int s_lengths_l = strlen(<char*> s_lengths)
-        cdef int ps_lengths_l = strlen(<char*> ps_lengths)
-        cdef int ss_lengths_l = strlen(<char*> ss_lengths)
-        cdef int hashes_per_tok = p_lengths_l + s_lengths_l + ps_lengths_l + ss_lengths_l
         cdef int p_max_l = p_lengths[p_lengths_l - 1] if p_lengths_l > 0 else 0
         cdef int s_max_l = s_lengths[s_lengths_l - 1] if s_lengths_l > 0 else 0
-        cdef int ps_max_l = ps_lengths[ps_lengths_l - 1] if ps_lengths_l > 0 else 0
-        cdef int ss_max_l = ss_lengths[ss_lengths_l - 1] if ss_lengths_l > 0 else 0

         # Define / allocate buffers
         cdef Pool mem = Pool()
         cdef unsigned char* pref_l_buf = <unsigned char*> mem.alloc(p_max_l, sizeof(char))
         cdef unsigned char* suff_l_buf = <unsigned char*> mem.alloc(s_max_l, sizeof(char))
-        cdef unsigned char* ps_res_buf = <unsigned char*> mem.alloc(ps_max_l,
-            MAX_UTF8_CHAR_BYTE_WIDTH * sizeof(char))
-        cdef unsigned char* ps_l_buf = <unsigned char*> mem.alloc(ps_max_l, sizeof(char))
-        cdef unsigned char* ss_res_buf = <unsigned char*> mem.alloc(ss_max_l,
-            MAX_UTF8_CHAR_BYTE_WIDTH * sizeof(char))
-        cdef unsigned char* ss_l_buf = <unsigned char*> mem.alloc(ss_max_l, sizeof(char))
         cdef int doc_l = self.length
         cdef np.ndarray[np.uint64_t, ndim=2] hashes = numpy.empty(
-            (doc_l, hashes_per_tok), dtype="uint64")
+            (doc_l, p_lengths_l + s_lengths_l), dtype="uint64")
         cdef np.uint64_t* hashes_ptr = <np.uint64_t*> hashes.data

         # Define working variables
@@ -1852,16 +1814,6 @@ cdef class Doc:
                 _set_suffix_lengths(tok_str, tok_str_l, s_max_l, suff_l_buf)
                 hashes_ptr += _write_hashes(tok_str, s_lengths, suff_l_buf, tok_str_l - 1, hashes_ptr)
-
-            if ps_max_l > 0:
-                _search_for_chars(tok_str, tok_str_l, ps_search_chars, ps_width_offsets,
-                    ps_max_l, False, ps_res_buf, ps_l_buf)
-                hashes_ptr += _write_hashes(ps_res_buf, ps_lengths, ps_l_buf, 0, hashes_ptr)
-
-            if ss_max_l > 0:
-                _search_for_chars(tok_str, tok_str_l, ss_search_chars, ss_width_offsets,
-                    ss_max_l, True, ss_res_buf, ss_l_buf)
-                hashes_ptr += _write_hashes(ss_res_buf, ss_lengths, ss_l_buf, 0, hashes_ptr)

         return hashes


@@ -2111,73 +2063,6 @@ cdef void _set_suffix_lengths(
     memset(suff_l_buf + suff_l_buf_idx, suff_l_buf[suff_l_buf_idx - 1], s_max_l - suff_l_buf_idx)


-cdef void _search_for_chars(
-    const unsigned char* tok_str,
-    const int tok_str_l,
-    const unsigned char* search_chars,
-    const unsigned char* width_offsets,
-    const int max_res_l,
-    const bint suffs_not_prefs,
-    unsigned char* res_buf,
-    unsigned char* l_buf,
-) nogil:
-    """ Search *tok_str* within a string for characters within *search_chars*, starting at the
-    beginning or end depending on the value of *suffs_not_prefs*. Wherever a character matches,
-    it is added to *res_buf* and the byte length up to that point is added to *l_buf*. When nothing
-    more is found, the remainder of *l_buf* is populated wth the byte length from the last result,
-    which may be *0* if the search was not successful.
-
-    tok_str: a UTF-8 representation of a string.
-    tok_str_l: the length of *tok_str*.
-    search_chars: a byte array containing, in numerical order, UTF-8 characters to search for within *tok_str*.
-    width_offsets: an array of single-byte values [1-char-start, 2-char-start, 3-char-start, 4-char-start, 4-char-end]
-        specifying the offsets within *search_chars* that contain UTF-8 characters with the specified widths.
-    max_res_l: the maximum number of found characters to place in *res_buf*.
-    suffs_not_prefs: if *True*, searching starts from the end of the word;
-        if *False*, from the beginning.
-    res_buf: the buffer in which to place the search results.
-    l_buf: a buffer of length *max_res_l* in which to store the end byte offsets of the found characters.
-        The calling code ensures that lengths greater than 255 cannot occur.
-    """
-    cdef int res_buf_idx = 0, l_buf_idx = 0, ch_wdth, tok_start_idx, search_char_idx, end_search_idx
-    cdef int last_tok_str_idx = tok_str_l if suffs_not_prefs else 0
-    cdef int this_tok_str_idx = tok_str_l - 1 if suffs_not_prefs else 1
-
-    while this_tok_str_idx >= 0 and this_tok_str_idx <= tok_str_l:
-        if (
-            (this_tok_str_idx == tok_str_l) or
-            ((tok_str[this_tok_str_idx] & 0xc0) != 0x80) # not continuation character, always applies to [0].
-        ):
-            if this_tok_str_idx > last_tok_str_idx:
-                ch_wdth = this_tok_str_idx - last_tok_str_idx
-            else:
-                ch_wdth = last_tok_str_idx - this_tok_str_idx
-
-            tok_start_idx = this_tok_str_idx if suffs_not_prefs else last_tok_str_idx
-            search_char_idx = width_offsets[ch_wdth - 1]
-            end_search_idx = width_offsets[ch_wdth]
-            while search_char_idx < end_search_idx:
-                cmp_result = memcmp(&tok_str[tok_start_idx], &search_chars[search_char_idx], ch_wdth)
-                if cmp_result == 0:
-                    memcpy(res_buf + res_buf_idx, &search_chars[search_char_idx], ch_wdth)
-                    res_buf_idx += ch_wdth
-                    l_buf[l_buf_idx] = res_buf_idx
-                    l_buf_idx += 1
-                    if l_buf_idx == max_res_l:
-                        return
-                if cmp_result <= 0:
-                    break
-                search_char_idx += ch_wdth
-            last_tok_str_idx = this_tok_str_idx
-        if suffs_not_prefs:
-            this_tok_str_idx -= 1
-        else:
-            this_tok_str_idx += 1
-
-    # fill in unused characters in the length buffer
-    memset(l_buf + l_buf_idx, res_buf_idx, max_res_l - l_buf_idx)
-
-
 cdef uint64_t FNV1A_OFFSET_BASIS = 0xcbf29ce484222325
 cdef uint64_t FNV1A_PRIME = 0x00000100000001B3

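(Note: the two constants kept in the context lines above are the standard 64-bit FNV-1a parameters. For reference, this is how FNV-1a combines bytes into a hash — a generic sketch, not the deleted Cython hashing code, though the affix hashes are presumably computed this way over each affix's UTF-8 bytes:)

```python
FNV1A_OFFSET_BASIS = 0xCBF29CE484222325
FNV1A_PRIME = 0x00000100000001B3


def fnv1a_64(data: bytes) -> int:
    # XOR each byte into the hash, then multiply by the prime, modulo 2**64.
    h = FNV1A_OFFSET_BASIS
    for byte in data:
        h ^= byte
        h = (h * FNV1A_PRIME) % (1 << 64)
    return h


print(hex(fnv1a_64("spa".encode("utf8"))))
```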
@@ -1736,44 +1736,3 @@ def all_equal(iterable):
     (or if the input is an empty sequence), False otherwise."""
     g = itertools.groupby(iterable)
     return next(g, True) and not next(g, False)
-
-
-def get_search_char_byte_arrays(
-    search_char_string: str, case_sensitive: bool
-) -> Tuple[bytes, bytes]:
-    """
-    This function supports *RichMultiHashEmbed*. It orders the characters in
-    *search_char_string*, removing any duplicates, encodes them as UTF-8, and
-    returns the result bufer together with a byte array containing the offsets
-    where the characters of various byte lengths start within the result buffer,
-    i.e.
-
-    <1-byte-start>, <2-byte-start>, <3-byte-start>, <4-byte-start>, <4-byte-end>.
-
-    If the result buffer does not contain any characters of length *n*,
-    <n_byte_start> == <n+1_byte_start>.
-    """
-
-    if not case_sensitive:
-        search_char_string = search_char_string.lower()
-    ordered_search_char_string = "".join(sorted(set(search_char_string)))
-    search_chars = ordered_search_char_string.encode("UTF-8")
-    width_offsets = [-1] * 5
-    working_start = 0
-    working_width = 0
-    for idx in range(1, len(search_chars) + 1):
-        if (
-            idx == len(search_chars)
-            or search_chars[idx] & 0xC0 != 0x80  # not continuation byte
-        ):
-            this_width = idx - working_start
-            if this_width > 4 or this_width < working_width:
-                raise RuntimeError(Errors.E1051)
-            if this_width > working_width:
-                for i in range(working_width, 5):
-                    width_offsets[i] = working_start
-                working_width = this_width
-            working_start = idx
-            for i in range(this_width, 5):
-                width_offsets[i] = idx
-    return search_chars, bytes((width_offsets))
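(Note: for reference, the deleted helper's behaviour is fixed by the tests removed earlier in this commit. On the parent commit 2116a71962 the following held — shown as an illustration only, since `get_search_char_byte_arrays` no longer exists after this change:)

```python
import spacy

search_chars, width_offsets = spacy.util.get_search_char_byte_arrays("zzaaEP", False)
assert search_chars == b"aepz"                   # deduplicated, lower-cased, sorted
assert width_offsets == b"\x00\x04\x04\x04\x04"  # all four characters are one byte wide

search_chars, width_offsets = spacy.util.get_search_char_byte_arrays("İ", False)
assert search_chars == b"i\xcc\x87"              # "İ".lower() is "i" + combining dot above
assert width_offsets == b"\x00\x01\x03\x03\x03"  # one 1-byte char, then one 2-byte char
```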
@@ -186,9 +186,6 @@ updated).
 > pref_rows = [10000,10000,10000]
 > suff_lengths = [2, 3, 4, 5]
 > suff_rows = [10000,10000,10000,10000]
-> suff_search_chars = "aeiouäöüß"
-> suff_search_lengths = [2, 3]
-> suff_search_rows = [10000,10000]
 > ```

 Construct an embedding layer with the features of
@@ -198,35 +195,12 @@ features extracted from various positions in each token string. The fixed-length
 [MultiHashEmbed](#spacymultihashembedv2-multihashembed) are sometimes not rich
 enough when working with languages with complex morphology, and this layer
 allows the specification of multiple prefixes and suffixes of any lengths.
-
-Additionally, it is possible to use as features the results of character
-searches of specified lengths. A list of search characters is specified; the
-characters in each word are examined in order starting at the beginning or at
-the end; and each character that matches one of the search characters is added,
-in order, to the string to be used as a feature. The search continues until
-either the search result string is full or the whole word has been examined.
-This is useful because some languages exhibit morphological alternations where
-one letter or letters regularly alternate with another letter or letters
-depending on the presence of some other letter before or after it, e.g. German
-plural nouns where the final two vowels are `ä-e` regularly correspond to
-singular lemmas where the `e` is no longer present and the `ä` has become `a`,
-e.g. `die Bäche` (plural) vs. `der Bach` (singular).
-
-For most languages used with spaCy, searching is likely to be useful starting at
-the end (`suff_*`), but the ability to search from the beginning (`pref_*`) is
-also offered for completeness. Search characters should consist of all
-characters that regularly alternate with other characters in the language in
-question or whose presence before or after characters that would otherwise
-alternate prevents the alternation from occurring, e.g. an `ä` in a German
-plural noun does not become `a` if it is the third or fourth vowel from the end
-of the word.
+Arrays specifying lengths must be in ascending order.

 There are a few rare situations where a graphical character is expressed as more
 than one UTF-8 character, e.g. _i_ when representing the lower-case form of the
-Turkish letter _İ_. Such situations are supported, but the lengths of prefixes,
-suffixes and character search results may need to be increased accordingly.
-
-All arrays specifying lengths must be in ascending order.
+Turkish letter _İ_. Such situations are supported, but the lengths of prefixes
+and suffixes may need to be increased accordingly.

 | Name | Description |
 | ------------------------ | -------------------------------------------------------------------------------------------------------------------------------------------------------------- |
@@ -239,12 +213,6 @@ All arrays specifying lengths must be in ascending order.
 | `pref_rows` | The number of rows for each of `pref_lengths`. ~~Optional[List[int]~~ |
 | `suff_lengths` | The lengths of suffixes to use as features for each word, e.g. for the word `spaCy`: `[1, 3]` would lead to `y` and `yCa` being used as features. ~~Optional[List[int]~~ |
 | `suff_rows` | The number of rows for each of `suff_lengths`. ~~Optional[List[int]~~ |
-| `pref_search_chars` | A string containing characters to search for starting from the beginning of each word. ~~Optional[str]~~ |
-| `pref_search_lengths` | The lengths of search result strings to use as features, where the searches start from the beginning of each word. ~~Optional[List[int]]~~ |
-| `pref_search_rows` | The number of rows for each of `pref_search_lengths`. ~~Optional[List[int]~~ |
-| `suff_search_chars` | A string containing characters to search for starting from the end of each word. ~~Optional[str]~~ |
-| `suff_search_lengths` | The lengths of search result strings to use as features, where the searches start from the end of each word. ~~Optional[List[int]]~~ |
-| `suff_search_rows` | The number of rows for each of `suff_search_lengths`. ~~Optional[List[int]~~ |
 | **CREATES** | The model using the architecture. ~~Model[List[Doc], List[Floats2d]]~~ |

 ### spacy.CharacterEmbed.v2 {#CharacterEmbed}