From e95dd432d15ee8408196cdb12860416e7238737b Mon Sep 17 00:00:00 2001 From: richardpaulhudson Date: Tue, 10 Jan 2023 13:06:50 +0100 Subject: [PATCH] Remove search functionality --- spacy/ml/models/tok2vec.py | 82 +------------------- spacy/ml/richfeatureextractor.py | 43 ----------- spacy/tests/doc/test_doc_api.py | 111 --------------------------- spacy/tests/test_util.py | 86 --------------------- spacy/tokens/doc.pxd | 12 --- spacy/tokens/doc.pyi | 6 -- spacy/tokens/doc.pyx | 121 +----------------------------- spacy/util.py | 41 ---------- website/docs/api/architectures.md | 38 +--------- 9 files changed, 10 insertions(+), 530 deletions(-) delete mode 100644 spacy/tests/test_util.py diff --git a/spacy/ml/models/tok2vec.py b/spacy/ml/models/tok2vec.py index 35cc10bd5..b8ff9ce87 100644 --- a/spacy/ml/models/tok2vec.py +++ b/spacy/ml/models/tok2vec.py @@ -193,23 +193,14 @@ def _verify_rich_config_group( label: str, lengths: Optional[List[int]], rows: Optional[List[int]], - search_chars: Optional[str], - is_search_char_group: bool, ) -> None: if lengths is not None or rows is not None: - if is_search_char_group and (search_chars is None or len(search_chars) == 0): - raise ValueError(Errors.E1048.format(label=label)) - if search_chars is not None and len(search_chars) > 63: - raise ValueError(Errors.E1049.format(label=label)) if lengths is None or rows is None: raise ValueError(Errors.E1048.format(label=label)) if len(lengths) != len(rows): raise ValueError(Errors.E1048.format(label=label)) if any([length < 1 for length in lengths]): raise ValueError(Errors.E1048.format(label=label)) - elif search_chars is not None: - raise ValueError(Errors.E1048.format(label=label)) - if lengths is not None: if lengths[-1] > 63: raise ValueError(Errors.E1049.format(label=label)) if len(lengths) != len(set(lengths)) or lengths != sorted(lengths): @@ -227,12 +218,6 @@ def RichMultiHashEmbed( pref_rows: Optional[List[int]] = None, suff_lengths: Optional[List[int]] = None, suff_rows: Optional[List[int]] = None, - pref_search_chars: Optional[str] = None, - pref_search_lengths: Optional[List[int]] = None, - pref_search_rows: Optional[List[int]] = None, - suff_search_chars: Optional[str] = None, - suff_search_lengths: Optional[List[int]] = None, - suff_search_rows: Optional[List[int]] = None, ) -> Model[List[Doc], List[Floats2d]]: """ Construct an embedding layer with the features of `MultiHashEmbed` (see above) @@ -240,37 +225,12 @@ def RichMultiHashEmbed( The fixed-length `PREFIX` and `SUFFIX` features used in `MultiHashEmbed` are sometimes not rich enough when working with languages with complex morphology, and this layer allows the specification of multiple prefixes and suffixes - of any lengths. - - Additionally, it is possible to use as features the results of character - searches of specified lengths. A list of search characters is specified; the - characters in each word are examined in order starting at the beginning or at - the end; and each character that matches one of the search characters is added, - in order, to the string to be used as a feature. The search continues until - either the search result string is full or the whole word has been examined. - This is useful because some languages exhibit morphological alternations where - one letter or letters regularly alternate with another letter or letters - depending on the presence of some other letter before or after it, e.g. 
German - plural nouns where the final two vowels are `ä-e` regularly correspond to - singular lemmas where the `e` is no longer present and the `ä` has become `a`, - e.g. `die Bäche` (plural) vs. `der Bach` (singular). - - For most languages used with spaCy, searching is likely to be useful starting - at the end (`suff_*`), but the ability to search from the beginning (`pref_*`) - is also offered for completeness. Search characters should consist of all - characters that regularly alternate with other characters in the language in - question or whose presence before or after characters that would otherwise - alternate prevents the alternation from occurring, e.g. an `ä` in a German - plural noun does not become `a` if it is the third or fourth vowel from the - end of the word. + of any lengths. Arrays specifying lengths must be in ascending order. There are a few rare situations where a graphical character is expressed as more than one UTF-8 character, e.g. *i* when representing the lower-case form of the Turkish letter *İ*. Such situations are supported, but the lengths of - prefixes, suffixes and character search results may need to be increased - accordingly. - - All arrays specifying lengths must be in ascending order. + prefixes and suffixes may need to be increased accordingly. width (int): The output width. Also used as the width of the embedding tables. Recommended values are between 64 and 300. @@ -290,39 +250,13 @@ def RichMultiHashEmbed( for each word, e.g. for the word `spaCy`: `[1, 3]` would lead to `y` and `yCa` being used as features. suff_rows (Optional[List[int]]): The number of rows for each of `suff_lengths`. - pref_search_chars (Optional[str]): A string containing characters to search for - starting from the beginning of each word. - pref_search_lengths (Optional[List[int]]): The lengths of search result strings - to use as features, where the searches start from the beginning of each word. - pref_search_rows (Optional[List[int]]): The number of rows for each of - `pref_search_lengths`. - suff_search_chars (Optional[str]): A string containing characters to search for - starting from the end of each word. - suff_search_lengths (Optional[List[int]]): The lengths of search result strings - to use as features, where the searches start from the end of each word. - suff_search_rows (Optional[List[int]]): The number of rows for each of - `suff_search_lengths`. 
""" if len(rows) != len(attrs): raise ValueError(f"Mismatched lengths: {len(rows)} vs {len(attrs)}") - _verify_rich_config_group("prefix", pref_lengths, pref_rows, None, False) - _verify_rich_config_group("suffix", suff_lengths, suff_rows, None, False) - _verify_rich_config_group( - "prefix search", - pref_search_lengths, - pref_search_rows, - pref_search_chars, - True, - ) - _verify_rich_config_group( - "suffix search", - suff_search_lengths, - suff_search_rows, - suff_search_chars, - True, - ) + _verify_rich_config_group("prefix", pref_lengths, pref_rows) + _verify_rich_config_group("suffix", suff_lengths, suff_rows) if "PREFIX" in attrs or "SUFFIX" in attrs: warnings.warn(Warnings.W124) @@ -331,10 +265,6 @@ def RichMultiHashEmbed( rows.extend(pref_rows) if suff_rows is not None: rows.extend(suff_rows) - if pref_search_rows is not None: - rows.extend(pref_search_rows) - if suff_search_rows is not None: - rows.extend(suff_search_rows) embeddings: List[Model[Ints2d, Floats2d]] = [ HashEmbed(width, row, column=i, seed=i + 7, dropout=0.0) @@ -350,10 +280,6 @@ def RichMultiHashEmbed( case_sensitive=case_sensitive, pref_lengths=pref_lengths, suff_lengths=suff_lengths, - pref_search_chars=pref_search_chars, - pref_search_lengths=pref_search_lengths, - suff_search_chars=suff_search_chars, - suff_search_lengths=suff_search_lengths, ), ) diff --git a/spacy/ml/richfeatureextractor.py b/spacy/ml/richfeatureextractor.py index 50ccab6c2..53df964c4 100644 --- a/spacy/ml/richfeatureextractor.py +++ b/spacy/ml/richfeatureextractor.py @@ -2,7 +2,6 @@ from typing import List, Optional, Callable, Tuple from thinc.types import Ints2d from thinc.api import Model, registry, get_current_ops from ..tokens import Doc -from ..util import get_search_char_byte_arrays @registry.layers("spacy.RichFeatureExtractor.v1") @@ -11,27 +10,7 @@ def RichFeatureExtractor( case_sensitive: bool, pref_lengths: Optional[List[int]] = None, suff_lengths: Optional[List[int]] = None, - pref_search_chars: Optional[str] = None, - pref_search_lengths: Optional[List[int]] = None, - suff_search_chars: Optional[str] = None, - suff_search_lengths: Optional[List[int]] = None, ) -> Model[List[Doc], List[Ints2d]]: - ops = get_current_ops() - if pref_search_chars is not None: - ps_search_chars, ps_width_offsets = get_search_char_byte_arrays( - pref_search_chars, case_sensitive - ) - else: - ps_search_chars = bytes() - ps_width_offsets = bytes() - if suff_search_chars is not None: - - ss_search_chars, ss_width_offsets = get_search_char_byte_arrays( - suff_search_chars, case_sensitive - ) - else: - ss_search_chars = bytes() - ss_width_offsets = bytes() return Model( "extract_character_combination_hashes", forward, @@ -39,16 +18,6 @@ def RichFeatureExtractor( "case_sensitive": case_sensitive, "p_lengths": bytes(pref_lengths) if pref_lengths is not None else bytes(), "s_lengths": bytes(suff_lengths) if suff_lengths is not None else bytes(), - "ps_search_chars": ps_search_chars, - "ps_width_offsets": ps_width_offsets, - "ps_lengths": bytes(pref_search_lengths) - if pref_search_lengths is not None - else bytes(), - "ss_search_chars": ss_search_chars, - "ss_width_offsets": ss_width_offsets, - "ss_lengths": bytes(suff_search_lengths) - if suff_search_lengths is not None - else bytes(), }, ) @@ -60,24 +29,12 @@ def forward( case_sensitive: bool = model.attrs["case_sensitive"] p_lengths: bytes = model.attrs["p_lengths"] s_lengths: bytes = model.attrs["s_lengths"] - ps_search_chars: bytes = model.attrs["ps_search_chars"] - ps_width_offsets: bytes = 
model.attrs["ps_width_offsets"] - ps_lengths: bytes = model.attrs["ps_lengths"] - ss_search_chars: bytes = model.attrs["ss_search_chars"] - ss_width_offsets: bytes = model.attrs["ss_width_offsets"] - ss_lengths: bytes = model.attrs["ss_lengths"] features: List[Ints2d] = [] for doc in docs: hashes = doc.get_character_combination_hashes( case_sensitive=case_sensitive, p_lengths=p_lengths, s_lengths=s_lengths, - ps_search_chars=ps_search_chars, - ps_width_offsets=ps_width_offsets, - ps_lengths=ps_lengths, - ss_search_chars=ss_search_chars, - ss_width_offsets=ss_width_offsets, - ss_lengths=ss_lengths, ) features.append(ops.asarray2i(hashes, dtype="uint64")) diff --git a/spacy/tests/doc/test_doc_api.py b/spacy/tests/doc/test_doc_api.py index ac4ddcb01..610570f53 100644 --- a/spacy/tests/doc/test_doc_api.py +++ b/spacy/tests/doc/test_doc_api.py @@ -15,7 +15,6 @@ from spacy.lang.xx import MultiLanguage from spacy.language import Language from spacy.lexeme import Lexeme from spacy.tokens import Doc, Span, SpanGroup, Token -from spacy.util import get_search_char_byte_arrays from spacy.vocab import Vocab from .test_underscore import clean_underscore # noqa: F401 @@ -1450,12 +1449,6 @@ def _encode_and_hash(input: str, *, reverse: bool = False) -> int: def test_get_character_combination_hashes_good_case(en_tokenizer, case_sensitive): doc = en_tokenizer("spaCy✨ and Prodigy") - ps_search_chars, ps_width_offsets = get_search_char_byte_arrays( - "Rp", case_sensitive - ) - ss_search_chars, ss_width_offsets = get_search_char_byte_arrays( - "xx✨rp", case_sensitive - ) hashes = doc.get_character_combination_hashes( case_sensitive=case_sensitive, p_lengths=bytes( @@ -1473,17 +1466,6 @@ def test_get_character_combination_hashes_good_case(en_tokenizer, case_sensitive 5, ) ), - ps_search_chars=ps_search_chars, - ps_width_offsets=ps_width_offsets, - ps_lengths=bytes((2,)), - ss_search_chars=ss_search_chars, - ss_width_offsets=ss_width_offsets, - ss_lengths=bytes( - ( - 1, - 2, - ) - ), ) assert hashes[0][0] == _encode_and_hash("s") assert hashes[0][1] == _encode_and_hash("spa") @@ -1492,9 +1474,6 @@ def test_get_character_combination_hashes_good_case(en_tokenizer, case_sensitive assert hashes[0][4] == _encode_and_hash("yCa" if case_sensitive else "yca") assert hashes[0][5] == _encode_and_hash("yCap" if case_sensitive else "ycap") assert hashes[0][6] == _encode_and_hash("yCaps" if case_sensitive else "ycaps") - assert hashes[0][7] == _encode_and_hash("p") - assert hashes[0][8] == _encode_and_hash("p") - assert hashes[0][9] == _encode_and_hash("p") assert hashes[1][0] == _encode_and_hash("✨") assert hashes[1][1] == _encode_and_hash("✨") assert hashes[1][2] == _encode_and_hash("✨") @@ -1502,9 +1481,6 @@ def test_get_character_combination_hashes_good_case(en_tokenizer, case_sensitive assert hashes[1][4] == _encode_and_hash("✨", reverse=True) assert hashes[1][5] == _encode_and_hash("✨", reverse=True) assert hashes[1][6] == _encode_and_hash("✨", reverse=True) - assert hashes[1][7] == EMPTY_HASH_VALUE - assert hashes[1][8] == _encode_and_hash("✨") - assert hashes[1][9] == _encode_and_hash("✨") assert hashes[2][0] == _encode_and_hash("a") assert hashes[2][1] == _encode_and_hash("and") assert hashes[2][2] == _encode_and_hash("and") @@ -1512,9 +1488,6 @@ def test_get_character_combination_hashes_good_case(en_tokenizer, case_sensitive assert hashes[2][4] == _encode_and_hash("dna") assert hashes[2][5] == _encode_and_hash("dna") assert hashes[2][6] == _encode_and_hash("dna") - assert hashes[2][7] == EMPTY_HASH_VALUE - assert 
hashes[2][8] == EMPTY_HASH_VALUE - assert hashes[2][9] == EMPTY_HASH_VALUE assert hashes[3][0] == _encode_and_hash("P" if case_sensitive else "p") assert hashes[3][1] == _encode_and_hash("Pro" if case_sensitive else "pro") assert hashes[3][2] == _encode_and_hash("Prod" if case_sensitive else "prod") @@ -1522,21 +1495,10 @@ def test_get_character_combination_hashes_good_case(en_tokenizer, case_sensitive assert hashes[3][4] == _encode_and_hash("ygi") assert hashes[3][5] == _encode_and_hash("ygid") assert hashes[3][6] == _encode_and_hash("ygido") - assert ( - hashes[3][7] == EMPTY_HASH_VALUE if case_sensitive else _encode_and_hash("pr") - ) - - assert hashes[3][8] == _encode_and_hash("r") - - if case_sensitive: - assert hashes[3][9] == _encode_and_hash("r") - else: - assert hashes[3][9] == _encode_and_hash("rp") def test_get_character_combination_hashes_good_case_partial(en_tokenizer): doc = en_tokenizer("spaCy✨ and Prodigy") - ps_search_chars, ps_width_offsets = get_search_char_byte_arrays("rp", False) hashes = doc.get_character_combination_hashes( case_sensitive=False, p_lengths=bytes(), @@ -1548,34 +1510,24 @@ def test_get_character_combination_hashes_good_case_partial(en_tokenizer): 5, ) ), - ps_search_chars=ps_search_chars, - ps_width_offsets=ps_width_offsets, - ps_lengths=bytes((2,)), - ss_search_chars=bytes(), - ss_width_offsets=bytes(), - ss_lengths=bytes(), ) assert hashes[0][0] == _encode_and_hash("yc") assert hashes[0][1] == _encode_and_hash("yca") assert hashes[0][2] == _encode_and_hash("ycap") assert hashes[0][3] == _encode_and_hash("ycaps") - assert hashes[0][4] == _encode_and_hash("p") assert hashes[1][0] == _encode_and_hash("✨", reverse=True) assert hashes[1][1] == _encode_and_hash("✨", reverse=True) assert hashes[1][2] == _encode_and_hash("✨", reverse=True) assert hashes[1][3] == _encode_and_hash("✨", reverse=True) - assert hashes[1][4] == EMPTY_HASH_VALUE assert hashes[2][0] == _encode_and_hash("dn") assert hashes[2][1] == _encode_and_hash("dna") assert hashes[2][2] == _encode_and_hash("dna") assert hashes[2][3] == _encode_and_hash("dna") - assert hashes[2][4] == EMPTY_HASH_VALUE assert hashes[3][0] == _encode_and_hash("yg") assert hashes[3][1] == _encode_and_hash("ygi") assert hashes[3][2] == _encode_and_hash("ygid") assert hashes[3][3] == _encode_and_hash("ygido") - assert hashes[3][4] == _encode_and_hash("pr") def test_get_character_combination_hashes_various_lengths(en_tokenizer): @@ -1588,12 +1540,6 @@ def test_get_character_combination_hashes_various_lengths(en_tokenizer): case_sensitive=False, p_lengths=bytes((p_length,)), s_lengths=bytes((s_length,)), - ps_search_chars=bytes(), - ps_width_offsets=bytes(), - ps_lengths=bytes(), - ss_search_chars=bytes(), - ss_width_offsets=bytes(), - ss_lengths=bytes(), ) assert hashes[0][0] == _encode_and_hash("sp𐌞cé"[:p_length]) @@ -1605,7 +1551,6 @@ def test_get_character_combination_hashes_turkish_i_with_dot( en_tokenizer, case_sensitive ): doc = en_tokenizer("İ".lower() + "İ") - search_chars, width_offsets = get_search_char_byte_arrays("İ", case_sensitive) hashes = doc.get_character_combination_hashes( case_sensitive=case_sensitive, p_lengths=bytes( @@ -1624,26 +1569,6 @@ def test_get_character_combination_hashes_turkish_i_with_dot( 4, ) ), - ps_search_chars=search_chars, - ps_width_offsets=width_offsets, - ps_lengths=bytes( - ( - 1, - 2, - 3, - 4, - ) - ), - ss_search_chars=search_chars, - ss_width_offsets=width_offsets, - ss_lengths=bytes( - ( - 1, - 2, - 3, - 4, - ) - ), ) COMBINING_DOT_ABOVE = b"\xcc\x87".decode("UTF-8") @@ 
-1656,10 +1581,6 @@ def test_get_character_combination_hashes_turkish_i_with_dot( assert hashes[0][5] == _encode_and_hash(COMBINING_DOT_ABOVE + "İ", reverse=True) assert hashes[0][6] == _encode_and_hash("İ".lower() + "İ", reverse=True) assert hashes[0][7] == _encode_and_hash("İ".lower() + "İ", reverse=True) - assert hashes[0][8] == _encode_and_hash("İ") - assert hashes[0][9] == _encode_and_hash("İ") - assert hashes[0][12] == _encode_and_hash("İ") - assert hashes[0][13] == _encode_and_hash("İ") else: assert hashes[0][2] == _encode_and_hash("İ".lower() + "i") @@ -1670,16 +1591,6 @@ def test_get_character_combination_hashes_turkish_i_with_dot( COMBINING_DOT_ABOVE + "İ".lower(), reverse=True ) assert hashes[0][7] == _encode_and_hash("İ".lower() * 2, reverse=True) - assert hashes[0][8] == _encode_and_hash("i") - assert hashes[0][9] == _encode_and_hash("İ".lower()) - assert hashes[0][10] == _encode_and_hash("İ".lower() + "i") - assert hashes[0][11] == _encode_and_hash("İ".lower() * 2) - assert hashes[0][12] == _encode_and_hash(COMBINING_DOT_ABOVE) - assert hashes[0][13] == _encode_and_hash(COMBINING_DOT_ABOVE + "i") - assert hashes[0][14] == _encode_and_hash( - COMBINING_DOT_ABOVE + "i" + COMBINING_DOT_ABOVE - ) - assert hashes[0][15] == _encode_and_hash((COMBINING_DOT_ABOVE + "i") * 2) @pytest.mark.parametrize("case_sensitive", [True, False]) @@ -1693,33 +1604,17 @@ def test_get_character_combination_hashes_string_store_spec_cases( assert len(long_word) > 255 doc = en_tokenizer(" ".join((symbol, short_word, normal_word, long_word))) assert len(doc) == 4 - ps_search_chars, ps_width_offsets = get_search_char_byte_arrays("E", case_sensitive) hashes = doc.get_character_combination_hashes( case_sensitive=case_sensitive, p_lengths=bytes((2,)), s_lengths=bytes((2,)), - ps_search_chars=ps_search_chars, - ps_width_offsets=ps_width_offsets, - ps_lengths=bytes((2,)), - ss_search_chars=bytes(), - ss_width_offsets=bytes(), - ss_lengths=bytes(), ) assert hashes[0][0] == _encode_and_hash("FL" if case_sensitive else "fl") assert hashes[0][1] == _encode_and_hash("91") - assert hashes[0][2] == EMPTY_HASH_VALUE assert hashes[1][0] == _encode_and_hash("be") assert hashes[1][1] == _encode_and_hash("ee") - if case_sensitive: - assert hashes[1][2] == EMPTY_HASH_VALUE - else: - assert hashes[1][2] == _encode_and_hash("ee") assert hashes[2][0] == hashes[3][0] == _encode_and_hash("se") assert hashes[2][1] == hashes[3][1] == _encode_and_hash("yt") - if case_sensitive: - assert hashes[2][2] == hashes[3][2] == EMPTY_HASH_VALUE - else: - assert hashes[2][2] == hashes[3][2] == _encode_and_hash("ee") def test_character_combination_hashes_empty_lengths(en_tokenizer): @@ -1728,10 +1623,4 @@ def test_character_combination_hashes_empty_lengths(en_tokenizer): case_sensitive=True, p_lengths=bytes(), s_lengths=bytes(), - ps_search_chars=bytes(), - ps_width_offsets=bytes(), - ps_lengths=bytes(), - ss_search_chars=bytes(), - ss_width_offsets=bytes(), - ss_lengths=bytes(), ).shape == (1, 0) diff --git a/spacy/tests/test_util.py b/spacy/tests/test_util.py deleted file mode 100644 index 148bac7cd..000000000 --- a/spacy/tests/test_util.py +++ /dev/null @@ -1,86 +0,0 @@ -import spacy -import pytest - - -@pytest.mark.parametrize("case_sensitive", [True, False]) -def test_get_search_char_byte_arrays_1_width_only(case_sensitive): - search_chars, width_offsets = spacy.util.get_search_char_byte_arrays( - "zzaaEP", case_sensitive - ) - if case_sensitive: - assert search_chars == b"EPaz" - else: - assert search_chars == b"aepz" - assert 
width_offsets == b"\x00\x04\x04\x04\x04" - - -@pytest.mark.parametrize("case_sensitive", [True, False]) -def test_get_search_char_byte_arrays_4_width_only(case_sensitive): - search_chars, width_offsets = spacy.util.get_search_char_byte_arrays( - "𐌞", case_sensitive - ) - assert search_chars == "𐌞".encode("utf-8") - assert width_offsets == b"\x00\x00\x00\x00\x04" - - -@pytest.mark.parametrize("case_sensitive", [True, False]) -def test_get_search_char_byte_arrays_all_widths(case_sensitive): - search_chars, width_offsets = spacy.util.get_search_char_byte_arrays( - "𐌞Éabé—B𐌞", case_sensitive - ) - if case_sensitive: - assert search_chars == "BabÉé—𐌞".encode("utf-8") - assert width_offsets == b"\x00\x03\x07\x0a\x0e" - else: - assert search_chars == "abé—𐌞".encode("utf-8") - assert width_offsets == b"\x00\x02\x04\x07\x0b" - - -@pytest.mark.parametrize("case_sensitive", [True, False]) -def test_get_search_char_byte_arrays_widths_1_and_3(case_sensitive): - search_chars, width_offsets = spacy.util.get_search_char_byte_arrays( - "B—", case_sensitive - ) - if case_sensitive: - assert search_chars == "B—".encode("utf-8") - else: - assert search_chars == "b—".encode("utf-8") - assert width_offsets == b"\x00\x01\x01\x04\x04" - - -@pytest.mark.parametrize("case_sensitive", [True, False]) -def test_get_search_char_byte_arrays_widths_1_and_4(case_sensitive): - search_chars, width_offsets = spacy.util.get_search_char_byte_arrays( - "B𐌞", case_sensitive - ) - if case_sensitive: - assert search_chars == "B𐌞".encode("utf-8") - else: - assert search_chars == "b𐌞".encode("utf-8") - assert width_offsets == b"\x00\x01\x01\x01\x05" - - -@pytest.mark.parametrize("case_sensitive", [True, False]) -def test_turkish_i_with_dot(case_sensitive): - search_chars, width_offsets = spacy.util.get_search_char_byte_arrays( - "İ", case_sensitive - ) - if case_sensitive: - assert search_chars == "İ".encode("utf-8") - assert width_offsets == b"\x00\x00\x02\x02\x02" - else: - assert search_chars == b"i\xcc\x87" - assert width_offsets == b"\x00\x01\x03\x03\x03" - - -@pytest.mark.parametrize("case_sensitive", [True, False]) -def test_turkish_i_with_dot_and_normal_i(case_sensitive): - search_chars, width_offsets = spacy.util.get_search_char_byte_arrays( - "İI", case_sensitive - ) - if case_sensitive: - assert search_chars == "Iİ".encode("utf-8") - assert width_offsets == b"\x00\x01\x03\x03\x03" - else: - assert search_chars == b"i\xcc\x87" - assert width_offsets == b"\x00\x01\x03\x03\x03" diff --git a/spacy/tokens/doc.pxd b/spacy/tokens/doc.pxd index fc431b151..e6a554e41 100644 --- a/spacy/tokens/doc.pxd +++ b/spacy/tokens/doc.pxd @@ -50,18 +50,6 @@ cdef void _set_suffix_lengths( ) nogil -cdef void _search_for_chars( - const unsigned char* tok_str, - const int tok_str_l, - const unsigned char* search_chars, - const unsigned char* width_offsets, - const int max_res_l, - const bint suffs_not_prefs, - unsigned char* res_buf, - unsigned char* l_buf, -) nogil - - cdef int _write_hashes( const unsigned char* res_buf, const unsigned char* aff_l_buf, diff --git a/spacy/tokens/doc.pyi b/spacy/tokens/doc.pyi index 91c6b5479..fdf515c8f 100644 --- a/spacy/tokens/doc.pyi +++ b/spacy/tokens/doc.pyi @@ -180,12 +180,6 @@ class Doc: case_sensitive: bool, p_lengths: bytes, s_lengths: bytes, - ps_search_chars: bytes, - ps_width_offsets: bytes, - ps_lengths: bytes, - ss_search_chars: bytes, - ss_width_offsets: bytes, - ss_lengths: bytes, ) -> Ints2d: ... @staticmethod def _get_array_attrs() -> Tuple[Any]: ... 
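With the `ps_*`/`ss_*` arguments gone from the `doc.pyi` stub above, `Doc.get_character_combination_hashes` takes only the case flag and the two affix-length byte arrays. A minimal sketch of the surviving call pattern, mirroring the updated tests earlier in the patch; the blank pipeline and example text are illustrative assumptions, and the method exists only on this development branch:

```python
# Sketch: the reduced hashing API after this patch (branch-only method).
import spacy

nlp = spacy.blank("en")          # any pipeline with a tokenizer will do
doc = nlp("spaCy and Prodigy")   # 3 tokens

# Each byte in p_lengths/s_lengths is one affix length, in ascending order.
hashes = doc.get_character_combination_hashes(
    case_sensitive=False,
    p_lengths=bytes((1, 3)),  # hash the 1- and 3-character prefixes
    s_lengths=bytes((2,)),    # hash the 2-character suffixes
)

# One row per token, one column per requested length (2 prefix + 1 suffix),
# matching the (doc_l, p_lengths_l + s_lengths_l) shape allocated in doc.pyx.
assert hashes.shape == (len(doc), 3)
```
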
diff --git a/spacy/tokens/doc.pyx b/spacy/tokens/doc.pyx index b5ee0676d..93c6697e4 100644 --- a/spacy/tokens/doc.pyx +++ b/spacy/tokens/doc.pyx @@ -1751,43 +1751,16 @@ cdef class Doc: const bint case_sensitive, const unsigned char* p_lengths, const unsigned char* s_lengths, - const unsigned char* ps_search_chars, - const unsigned char* ps_width_offsets, - const unsigned char* ps_lengths, - const unsigned char* ss_search_chars, - const unsigned char* ss_width_offsets, - const unsigned char* ss_lengths, ): """ Returns a 2D NumPy array where the rows represent tokens and the columns represent hashes of various character combinations derived from the raw text of each token. - Generally: - - p_ variables relate to prefixes (affixes starting at the beginning of the word) - s_ variables relate to suffixes (affixes starting at the end of the word) - ps_ variables relate to searches starting at the beginning of the word - ss_ variables relate to searches starting at the end of the word - - cs: if *False*, hashes are generated based on the lower-case version of each token. + case_sensitive: if *False*, hashes are generated based on the lower-case version of each token. p_lengths: an array of single-byte values specifying the lengths of prefixes to be hashed in ascending order. For example, if *p_lengths==[2, 3]*, the prefixes hashed for "spaCy" would be "sp" and "spa". s_lengths: an array of single-byte values specifying the lengths of suffixes to be hashed in ascending order. For example, if *s_lengths==[2, 3]* and *cs == True*, the suffixes hashed for "spaCy" would be "yC" and "yCa". - ps_search_chars: a byte array containing, in numerical order, UTF-8 characters to search for within each token, - starting at the beginning. - ps_width_offsets: an array of single-byte values [1-char-start, 2-char-start, 3-char-start, 4-char-start, 4-char-end] - specifying the offsets within *ps_search_chars* that contain UTF-8 characters with the specified widths. - ps_lengths: an array of single-byte values specifying the lengths of search results (from the beginning) to be hashed - in ascending order. For example, if *ps_lengths==[1, 2]*, *ps_search=="aC" and *cs==False*, the searched strings - hashed for "spaCy" would be "a" and "ac". - ss_search_chars: a byte array containing, in numerical order, UTF-8 characters to search for within each token, - starting at the end. - ss_width_offsets: an array of single-byte values [1-char-start, 2-char-start, 3-char-start, 4-char-start, 4-char-end] - specifying the offsets within *ss_search_chars* that contain UTF-8 characters with the specified widths. - ss_lengths: an array of single-byte values specifying the lengths of search results (from the end) to be hashed - in ascending order. For example, if *ss_lengths==[1, 2]*, *ss_search=="aC" and *cs==False*, the searched strings - hashed for "spaCy" would be "c" and "ca". Many of the buffers passed into and used by this method contain single-byte numerical values. This takes advantage of the fact that we are hashing short affixes and searching for small groups of characters. 
The calling code is responsible
@@ -1801,27 +1774,16 @@ cdef class Doc:
 
         # Work out lengths
         cdef int p_lengths_l = strlen(<char*> p_lengths)
         cdef int s_lengths_l = strlen(<char*> s_lengths)
-        cdef int ps_lengths_l = strlen(<char*> ps_lengths)
-        cdef int ss_lengths_l = strlen(<char*> ss_lengths)
-        cdef int hashes_per_tok = p_lengths_l + s_lengths_l + ps_lengths_l + ss_lengths_l
         cdef int p_max_l = p_lengths[p_lengths_l - 1] if p_lengths_l > 0 else 0
         cdef int s_max_l = s_lengths[s_lengths_l - 1] if s_lengths_l > 0 else 0
-        cdef int ps_max_l = ps_lengths[ps_lengths_l - 1] if ps_lengths_l > 0 else 0
-        cdef int ss_max_l = ss_lengths[ss_lengths_l - 1] if ss_lengths_l > 0 else 0
 
         # Define / allocate buffers
         cdef Pool mem = Pool()
         cdef unsigned char* pref_l_buf = <unsigned char*>mem.alloc(p_max_l, sizeof(char))
         cdef unsigned char* suff_l_buf = <unsigned char*>mem.alloc(s_max_l, sizeof(char))
-        cdef unsigned char* ps_res_buf = <unsigned char*>mem.alloc(ps_max_l,
-            MAX_UTF8_CHAR_BYTE_WIDTH * sizeof(char))
-        cdef unsigned char* ps_l_buf = <unsigned char*>mem.alloc(ps_max_l, sizeof(char))
-        cdef unsigned char* ss_res_buf = <unsigned char*>mem.alloc(ss_max_l,
-            MAX_UTF8_CHAR_BYTE_WIDTH * sizeof(char))
-        cdef unsigned char* ss_l_buf = <unsigned char*>mem.alloc(ss_max_l, sizeof(char))
         cdef int doc_l = self.length
         cdef np.ndarray[np.uint64_t, ndim=2] hashes = numpy.empty(
-            (doc_l, hashes_per_tok), dtype="uint64")
+            (doc_l, p_lengths_l + s_lengths_l), dtype="uint64")
         cdef np.uint64_t* hashes_ptr = <np.uint64_t*>hashes.data
 
         # Define working variables
@@ -1851,17 +1813,7 @@ cdef class Doc:
             if s_max_l > 0:
                 _set_suffix_lengths(tok_str, tok_str_l, s_max_l, suff_l_buf)
                 hashes_ptr += _write_hashes(tok_str, s_lengths, suff_l_buf, tok_str_l - 1, hashes_ptr)
-
-            if ps_max_l > 0:
-                _search_for_chars(tok_str, tok_str_l, ps_search_chars, ps_width_offsets,
-                    ps_max_l, False, ps_res_buf, ps_l_buf)
-                hashes_ptr += _write_hashes(ps_res_buf, ps_lengths, ps_l_buf, 0, hashes_ptr)
-
-            if ss_max_l > 0:
-                _search_for_chars(tok_str, tok_str_l, ss_search_chars, ss_width_offsets,
-                    ss_max_l, True, ss_res_buf, ss_l_buf)
-                hashes_ptr += _write_hashes(ss_res_buf, ss_lengths, ss_l_buf, 0, hashes_ptr)
-
+
         return hashes
 
 
@@ -2111,73 +2063,6 @@ cdef void _set_suffix_lengths(
     memset(suff_l_buf + suff_l_buf_idx, suff_l_buf[suff_l_buf_idx - 1], s_max_l - suff_l_buf_idx)
 
 
-cdef void _search_for_chars(
-    const unsigned char* tok_str,
-    const int tok_str_l,
-    const unsigned char* search_chars,
-    const unsigned char* width_offsets,
-    const int max_res_l,
-    const bint suffs_not_prefs,
-    unsigned char* res_buf,
-    unsigned char* l_buf,
-) nogil:
-    """ Search the string *tok_str* for characters within *search_chars*, starting at the
-    beginning or end depending on the value of *suffs_not_prefs*. Wherever a character matches,
-    it is added to *res_buf* and the byte length up to that point is added to *l_buf*. When nothing
-    more is found, the remainder of *l_buf* is populated with the byte length from the last result,
-    which may be *0* if the search was not successful.
-
-    tok_str: a UTF-8 representation of a string.
-    tok_str_l: the length of *tok_str*.
-    search_chars: a byte array containing, in numerical order, UTF-8 characters to search for within *tok_str*.
-    width_offsets: an array of single-byte values [1-char-start, 2-char-start, 3-char-start, 4-char-start, 4-char-end]
-        specifying the offsets within *search_chars* that contain UTF-8 characters with the specified widths.
-    max_res_l: the maximum number of found characters to place in *res_buf*.
-    suffs_not_prefs: if *True*, searching starts from the end of the word;
-        if *False*, from the beginning.
-    res_buf: the buffer in which to place the search results.
-    l_buf: a buffer of length *max_res_l* in which to store the end byte offsets of the found characters.
-        The calling code ensures that lengths greater than 255 cannot occur.
-    """
-    cdef int res_buf_idx = 0, l_buf_idx = 0, ch_wdth, tok_start_idx, search_char_idx, end_search_idx
-    cdef int last_tok_str_idx = tok_str_l if suffs_not_prefs else 0
-    cdef int this_tok_str_idx = tok_str_l - 1 if suffs_not_prefs else 1
-
-    while this_tok_str_idx >= 0 and this_tok_str_idx <= tok_str_l:
-        if (
-            (this_tok_str_idx == tok_str_l) or
-            ((tok_str[this_tok_str_idx] & 0xc0) != 0x80) # not continuation character, always applies to [0].
-        ):
-            if this_tok_str_idx > last_tok_str_idx:
-                ch_wdth = this_tok_str_idx - last_tok_str_idx
-            else:
-                ch_wdth = last_tok_str_idx - this_tok_str_idx
-
-            tok_start_idx = this_tok_str_idx if suffs_not_prefs else last_tok_str_idx
-            search_char_idx = width_offsets[ch_wdth - 1]
-            end_search_idx = width_offsets[ch_wdth]
-            while search_char_idx < end_search_idx:
-                cmp_result = memcmp(&tok_str[tok_start_idx], &search_chars[search_char_idx], ch_wdth)
-                if cmp_result == 0:
-                    memcpy(res_buf + res_buf_idx, &search_chars[search_char_idx], ch_wdth)
-                    res_buf_idx += ch_wdth
-                    l_buf[l_buf_idx] = res_buf_idx
-                    l_buf_idx += 1
-                    if l_buf_idx == max_res_l:
-                        return
-                if cmp_result <= 0:
-                    break
-                search_char_idx += ch_wdth
-            last_tok_str_idx = this_tok_str_idx
-        if suffs_not_prefs:
-            this_tok_str_idx -= 1
-        else:
-            this_tok_str_idx += 1
-
-    # fill in unused characters in the length buffer
-    memset(l_buf + l_buf_idx, res_buf_idx, max_res_l - l_buf_idx)
-
-
 cdef uint64_t FNV1A_OFFSET_BASIS = 0xcbf29ce484222325
 cdef uint64_t FNV1A_PRIME = 0x00000100000001B3
 
diff --git a/spacy/util.py b/spacy/util.py
index 0373e2219..8d211a9a5 100644
--- a/spacy/util.py
+++ b/spacy/util.py
@@ -1736,44 +1736,3 @@ def all_equal(iterable):
     (or if the input is an empty sequence), False otherwise."""
     g = itertools.groupby(iterable)
     return next(g, True) and not next(g, False)
-
-
-def get_search_char_byte_arrays(
-    search_char_string: str, case_sensitive: bool
-) -> Tuple[bytes, bytes]:
-    """
-    This function supports *RichMultiHashEmbed*. It orders the characters in
-    *search_char_string*, removing any duplicates, encodes them as UTF-8, and
-    returns the result buffer together with a byte array containing the offsets
-    where the characters of various byte lengths start within the result buffer,
-    i.e.
-
-    <1-byte-start>, <2-byte-start>, <3-byte-start>, <4-byte-start>, <4-byte-end>.
-
-    If the result buffer does not contain any characters of length *n*,
-    <n-byte-start> == <n+1-byte-start>.
-    """
-
-    if not case_sensitive:
-        search_char_string = search_char_string.lower()
-    ordered_search_char_string = "".join(sorted(set(search_char_string)))
-    search_chars = ordered_search_char_string.encode("UTF-8")
-    width_offsets = [-1] * 5
-    working_start = 0
-    working_width = 0
-    for idx in range(1, len(search_chars) + 1):
-        if (
-            idx == len(search_chars)
-            or search_chars[idx] & 0xC0 != 0x80  # not continuation byte
-        ):
-            this_width = idx - working_start
-            if this_width > 4 or this_width < working_width:
-                raise RuntimeError(Errors.E1051)
-            if this_width > working_width:
-                for i in range(working_width, 5):
-                    width_offsets[i] = working_start
-                working_width = this_width
-            working_start = idx
-    for i in range(this_width, 5):
-        width_offsets[i] = idx
-    return search_chars, bytes((width_offsets))
diff --git a/website/docs/api/architectures.md b/website/docs/api/architectures.md
index fbb2e4319..fd31f1f2a 100644
--- a/website/docs/api/architectures.md
+++ b/website/docs/api/architectures.md
@@ -186,9 +186,6 @@ updated).
 > pref_rows = [10000,10000,10000]
 > suff_lengths = [2, 3, 4, 5]
 > suff_rows = [10000,10000,10000,10000]
-> suff_search_chars = "aeiouäöüß"
-> suff_search_lengths = [2, 3]
-> suff_search_rows = [10000,10000]
 > ```
 
 Construct an embedding layer with the features of
@@ -198,35 +195,12 @@ features extracted from various positions in each token string. The
 fixed-length `PREFIX` and `SUFFIX` features used in
 [MultiHashEmbed](#spacymultihashembedv2-multihashembed) are sometimes not rich
 enough when working with languages with complex morphology, and this layer
 allows the specification of multiple prefixes and suffixes of any lengths.
-
-Additionally, it is possible to use as features the results of character
-searches of specified lengths. A list of search characters is specified; the
-characters in each word are examined in order starting at the beginning or at
-the end; and each character that matches one of the search characters is added,
-in order, to the string to be used as a feature. The search continues until
-either the search result string is full or the whole word has been examined.
-This is useful because some languages exhibit morphological alternations where
-one letter or letters regularly alternate with another letter or letters
-depending on the presence of some other letter before or after it, e.g. German
-plural nouns where the final two vowels are `ä-e` regularly correspond to
-singular lemmas where the `e` is no longer present and the `ä` has become `a`,
-e.g. `die Bäche` (plural) vs. `der Bach` (singular).
-
-For most languages used with spaCy, searching is likely to be useful starting at
-the end (`suff_*`), but the ability to search from the beginning (`pref_*`) is
-also offered for completeness. Search characters should consist of all
-characters that regularly alternate with other characters in the language in
-question or whose presence before or after characters that would otherwise
-alternate prevents the alternation from occurring, e.g. an `ä` in a German
-plural noun does not become `a` if it is the third or fourth vowel from the end
-of the word.
+Arrays specifying lengths must be in ascending order.
 
 There are a few rare situations where a graphical character is expressed as more
 than one UTF-8 character, e.g. _i_ when representing the lower-case form of the
-Turkish letter _İ_. Such situations are supported, but the lengths of prefixes,
-suffixes and character search results may need to be increased accordingly.
-
-All arrays specifying lengths must be in ascending order.
+Turkish letter _İ_. Such situations are supported, but the lengths of prefixes
+and suffixes may need to be increased accordingly.
 
 | Name                     | Description |
 | ------------------------ | -------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
 | `pref_rows`              | The number of rows for each of `pref_lengths`. ~~Optional[List[int]]~~ |
 | `suff_lengths`           | The lengths of suffixes to use as features for each word, e.g. for the word `spaCy`: `[1, 3]` would lead to `y` and `yCa` being used as features. ~~Optional[List[int]]~~ |
 | `suff_rows`              | The number of rows for each of `suff_lengths`. ~~Optional[List[int]]~~ |
-| `pref_search_chars`      | A string containing characters to search for starting from the beginning of each word. ~~Optional[str]~~ |
-| `pref_search_lengths`    | The lengths of search result strings to use as features, where the searches start from the beginning of each word. ~~Optional[List[int]]~~ |
-| `pref_search_rows`       | The number of rows for each of `pref_search_lengths`. ~~Optional[List[int]]~~ |
-| `suff_search_chars`      | A string containing characters to search for starting from the end of each word. ~~Optional[str]~~ |
-| `suff_search_lengths`    | The lengths of search result strings to use as features, where the searches start from the end of each word. ~~Optional[List[int]]~~ |
-| `suff_search_rows`       | The number of rows for each of `suff_search_lengths`. ~~Optional[List[int]]~~ |
 | **CREATES**              | The model using the architecture. ~~Model[List[Doc], List[Floats2d]]~~ |
 
 ### spacy.CharacterEmbed.v2 {#CharacterEmbed}
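After this change, the docs example block at the top of the architectures.md hunk reduces to prefix and suffix settings only. A sketch of what a complete post-patch embedding block might look like; only the `pref_*`/`suff_*` keys are grounded in the docs example above, while the section path, registered name, `width`, `attrs` and `rows` values follow `MultiHashEmbed` conventions and are illustrative assumptions:

```ini
# Hypothetical config block; assumes the layer is registered as
# "spacy.RichMultiHashEmbed.v1" and embedded in a tok2vec component.
[components.tok2vec.model.embed]
@architectures = "spacy.RichMultiHashEmbed.v1"
width = 96
attrs = ["NORM"]
rows = [5000]
case_sensitive = false
pref_lengths = [2, 3, 4]
pref_rows = [10000, 10000, 10000]
suff_lengths = [2, 3, 4, 5]
suff_rows = [10000, 10000, 10000, 10000]
```

Per the surviving checks in `_verify_rich_config_group`, each `*_lengths` list must be strictly ascending with values of at least 1 and at most 63, and must be accompanied by a `*_rows` list of the same size.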