Final touches

This commit is contained in:
parent a972791c9a
commit 35d0c217d2
@@ -248,11 +248,13 @@ def RichMultiHashEmbed(
     the end; and each character that matches one of the search characters is added,
     in order, to the string to be used as a feature. The search continues until
     either the search result string is full or the whole word has been examined.
-    This is useful because many languages exhibit morphological alternations where
+    This is useful because some languages exhibit morphological alternations where
     one letter or letters regularly alternate with another letter or letters
     depending on the presence of some other letter before or after it, e.g. German
     plural nouns where the final two vowels are `ä-e` regularly correspond to
-    singular lemmas where the `e` is no longer present and the `ä` has become `a`.
+    singular lemmas where the `e` is no longer present and the `ä` has become `a`,
+    e.g. `die Bäche` (plural) vs. `der Bach` (singular).

     For most languages used with spaCy, searching is likely to be useful starting
     at the end (`suff_*`), but the ability to search from the beginning (`pref_*`)
     is also offered for completeness. Search characters should consist of all
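To make the search behaviour described in this docstring concrete, here is a minimal pure-Python sketch; the real extractor works on UTF-8 bytes in Cython, and the function name and exact tie-breaking here are illustrative assumptions, not the library's API:

```python
def search_chars(word: str, search: str, max_len: int, suffs_not_prefs: bool = True) -> str:
    # Examine the word's characters in order, from the end (suffix search)
    # or the beginning; keep each character that occurs in the search set
    # until the result string is full or the whole word has been examined.
    source = reversed(word) if suffs_not_prefs else word
    result = ""
    for ch in source:
        if ch in search:
            result += ch
            if len(result) == max_len:
                break
    return result

# The German alternation from the docstring: the plural "Bäche" surfaces
# "e" and "ä" when searching vowels from the end, while the singular
# "Bach" surfaces only "a", exposing the ä-e/a correspondence.
assert search_chars("Bäche", "aäeou", 2) == "eä"
assert search_chars("Bach", "aäeou", 2) == "a"
```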
@@ -268,7 +270,7 @@ def RichMultiHashEmbed(
     prefixes, suffixes and character search results may need to be increased
     accordingly.

-    All lengths must be specified in ascending order.
+    All arrays specifying lengths must be in ascending order.

     width (int): The output width. Also used as the width of the embedding tables.
         Recommended values are between 64 and 300.
@@ -286,7 +288,7 @@ def RichMultiHashEmbed(
     pref_rows (Optional[List[int]]): The number of rows for each of `pref_lengths`.
     suff_lengths (Optional[List[int]]): The lengths of suffixes to use as features
         for each word, e.g. for the word `spaCy`:
-        `[1, 3]` would lead to `y` and `aCy` being used as features.
+        `[1, 3]` would lead to `y` and `yCa` being used as features.
     suff_rows (Optional[List[int]]): The number of rows for each of `suff_lengths`.
     pref_search_chars (Optional[str]): A string containing characters to search for
         starting from the beginning of each word.
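The `aCy` to `yCa` correction above appears to reflect that suffix features are read right-to-left from the end of the word; a tiny sketch of that reading (the helper name is hypothetical):

```python
def suffix_features(word: str, suff_lengths: list) -> list:
    # Suffixes are read right-to-left: "spaCy" reversed is "yCaps",
    # so length 3 yields "yCa" rather than "aCy".
    reversed_word = word[::-1]
    return [reversed_word[:n] for n in suff_lengths]

assert suffix_features("spaCy", [1, 3]) == ["y", "yCa"]
```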
@@ -1,11 +1,8 @@
 from typing import List, Optional, Callable, Tuple
-from spacy.util import get_search_char_byte_arrays
-
-# from ..util import get_arrays_for_search_chars
 from thinc.types import Ints2d
 from thinc.api import Model, registry, get_current_ops

 from ..tokens import Doc
+from ..util import get_search_char_byte_arrays


 @registry.layers("spacy.RichFeatureExtractor.v1")
@@ -21,13 +18,17 @@ def RichFeatureExtractor(
 ) -> Model[List[Doc], List[Ints2d]]:
     ops = get_current_ops()
     if pref_search_chars is not None:
-        ps_search_chars, ps_width_offsets = get_search_char_byte_arrays(pref_search_chars, case_sensitive)
+        ps_search_chars, ps_width_offsets = get_search_char_byte_arrays(
+            pref_search_chars, case_sensitive
+        )
     else:
         ps_search_chars = bytes()
         ps_width_offsets = bytes()
     if suff_search_chars is not None:
-        ss_search_chars, ss_width_offsets = get_search_char_byte_arrays(suff_search_chars, case_sensitive)
+        ss_search_chars, ss_width_offsets = get_search_char_byte_arrays(
+            suff_search_chars, case_sensitive
+        )
     else:
         ss_search_chars = bytes()
         ss_width_offsets = bytes()
@@ -36,12 +37,8 @@ def RichFeatureExtractor(
         forward,
         attrs={
             "case_sensitive": case_sensitive,
-            "p_lengths": bytes(pref_lengths)
-            if pref_lengths is not None
-            else bytes(),
-            "s_lengths": bytes(suff_lengths)
-            if suff_lengths is not None
-            else bytes(),
+            "p_lengths": bytes(pref_lengths) if pref_lengths is not None else bytes(),
+            "s_lengths": bytes(suff_lengths) if suff_lengths is not None else bytes(),
             "ps_search_chars": ps_search_chars,
             "ps_width_offsets": ps_width_offsets,
             "ps_lengths": bytes(pref_search_lengths)
@@ -16,7 +16,6 @@ from spacy.lang.xx import MultiLanguage
 from spacy.language import Language
 from spacy.lexeme import Lexeme
 from spacy.tokens import Doc, Span, SpanGroup, Token
-from spacy.tokens.doc import get_fnv1a_hash
 from spacy.util import get_search_char_byte_arrays
 from spacy.vocab import Vocab

@@ -998,18 +997,32 @@ def test_doc_spans_setdefault(en_tokenizer):


-EMPTY_HASH_VALUE = 0xCBF29CE484222325
+FNV1A_OFFSET_BASIS = 0xCBF29CE484222325
+FNV1A_PRIME = 0x00000100000001B3
+
+
+def _get_fnv1a_hash(input: bytes) -> int:
+    hash_val = FNV1A_OFFSET_BASIS
+    length = len(input)
+    offset = 0
+
+    while offset < length:
+        hash_val ^= input[offset]
+        hash_val *= FNV1A_PRIME
+        hash_val %= 2**64
+        offset += 1
+    return hash_val


 def test_fnv1a_hash():
     """Checks the conformity of the 64-bit FNV1A implementation with
     http://www.isthe.com/chongo/src/fnv/test_fnv.c.
-    The method called here is only used in testing; in production
-    code, the hashing is performed in a fashion that is interweaved
-    with other logic. The conformity of the production code is
-    demonstrated by the character combination hash tests, where
-    hashes produced by the production code are tested for equality
-    against hashes produced by the test code.
-    s"""
+    The method called here, _get_fnv1a_hash(), is only used in testing;
+    in production code, the hashing is performed in a fashion that is interwoven
+    with other logic. The conformity of the production code is demonstrated by the
+    character combination hash tests, where hashes produced by the production code
+    are tested for equality against hashes produced by _get_fnv1a_hash().
+    """
     INPUTS = [
         b"",
         b"a",
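The first two isthe.com test vectors are well known and can be checked directly against the `_get_fnv1a_hash` helper defined above:

```python
# Published 64-bit FNV-1a test vectors from
# http://www.isthe.com/chongo/src/fnv/test_fnv.c
assert _get_fnv1a_hash(b"") == 0xCBF29CE484222325  # the offset basis itself
assert _get_fnv1a_hash(b"a") == 0xAF63DC4C8601EC8C
```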
@@ -1424,14 +1437,14 @@ def test_fnv1a_hash():

     assert len(INPUTS) == len(OUTPUTS)
     for i in range(len(INPUTS)):
-        assert get_fnv1a_hash(INPUTS[i]) == OUTPUTS[i]
+        assert _get_fnv1a_hash(INPUTS[i]) == OUTPUTS[i]


 def _encode_and_hash(input: str, *, reverse: bool = False) -> int:
     encoded_input = input.encode("UTF-8")
     if reverse:
         encoded_input = encoded_input[::-1]
-    return get_fnv1a_hash(encoded_input)
+    return _get_fnv1a_hash(encoded_input)


 @pytest.mark.parametrize("case_sensitive", [True, False])
@@ -1566,7 +1579,7 @@ def test_get_character_combination_hashes_good_case_partial(en_tokenizer):
     assert hashes[3][4] == _encode_and_hash("pr")


-def test_get_character_combination_hashes_copying_in_middle(en_tokenizer):
+def test_get_character_combination_hashes_various_lengths(en_tokenizer):
     doc = en_tokenizer("sp𐌞Cé")

     for p_length in range(1, 8):
@@ -6,8 +6,6 @@ from ..structs cimport TokenC, LexemeC, SpanC
 from ..typedefs cimport attr_t
 from ..attrs cimport attr_id_t

-from libc.stdint cimport uint32_t
-

 cdef attr_t get_token_attr(const TokenC* token, attr_id_t feat_name) nogil
 cdef attr_t get_token_attr_for_matcher(const TokenC* token, attr_id_t feat_name) nogil
@@ -20,10 +18,6 @@ ctypedef fused LexemeOrToken:
     const_TokenC_ptr


-cdef extern from "unicodeobject.h":
-    bint Py_UNICODE_ISUPPER(Py_UCS4 ch)
-    Py_UCS4 Py_UNICODE_TOLOWER(Py_UCS4 ch)
-

 cdef int set_children_from_heads(TokenC* tokens, int start, int end) except -1
@@ -1,7 +1,7 @@
 from typing import Callable, Protocol, Iterable, Iterator, Optional
 from typing import Union, Tuple, List, Dict, Any, overload
 from cymem.cymem import Pool
-from thinc.types import Floats1d, Floats2d, Ints1d, Ints2d
+from thinc.types import Floats1d, Floats2d, Ints2d
 from .span import Span
 from .token import Token
 from ._dict_proxies import SpanGroups
@@ -126,7 +126,7 @@ class Doc:
         blocked: Optional[List[Span]] = ...,
         missing: Optional[List[Span]] = ...,
         outside: Optional[List[Span]] = ...,
-        default: str = ...,
+        default: str = ...
     ) -> None: ...
     @property
     def noun_chunks(self) -> Iterator[Span]: ...
@@ -190,6 +190,3 @@ class Doc:

     @staticmethod
     def _get_array_attrs() -> Tuple[Any]: ...
-
-def get_fnv1a_hash(input: bytes) -> int: ...
-
@@ -3,7 +3,6 @@ from typing import Set, List

 cimport cython
 cimport numpy as np
-from cpython cimport array
 from libc.string cimport memcpy, memcmp, memset, strlen
 from libc.math cimport sqrt
 from libc.stdint cimport int32_t, uint64_t
@@ -955,7 +954,7 @@ cdef class Doc:
         cdef int i, j
         cdef attr_id_t feature
         cdef np.ndarray[attr_t, ndim=2] output
-        # Handle scalar/list inputs of cdef np.strings/ints for py_attr_ids
+        # Handle scalar/list inputs of strings/ints for py_attr_ids
         # See also #3064
         if isinstance(py_attr_ids, str):
             # Handle inputs like doc.to_array('ORTH')
@@ -1780,7 +1779,7 @@ cdef class Doc:

         Many of the buffers passed into and used by this method contain single-byte numerical values. This takes advantage of
         the fact that we are hashing short affixes and searching for small groups of characters. The calling code is responsible
-        for ensuring that lengths being passed in cannot exceed 63 and hence that resulting values with a maximum of four-byte
+        for ensuring that lengths being passed in cannot exceed 63 and hence that resulting values with maximally four-byte
         character widths can never exceed 255.

         Note that this method performs no data validation itself as it expects the calling code will already have done so, and
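The 255 bound follows from simple arithmetic: lengths are capped at 63 and a UTF-8 character occupies at most four bytes, so the resulting byte offsets fit in a single byte (a quick check, not part of the source):

```python
MAX_LENGTH = 63      # cap enforced by the calling code
MAX_UTF8_WIDTH = 4   # widest possible UTF-8 character
assert MAX_LENGTH * MAX_UTF8_WIDTH == 252 <= 255
```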
@@ -2117,7 +2116,7 @@ cdef void _search_for_chars(
     suffs_not_prefs: if *True*, searching starts from the end of the word;
         if *False*, from the beginning.
     res_buf: the buffer in which to place the search results.
-    l_buf: a buffer of length *max_res_l* in which to store the byte lengths.
+    l_buf: a buffer of length *max_res_l* in which to store the end byte offsets of the found characters.
         The calling code ensures that lengths greater than 255 cannot occur.
     """
     cdef int res_buf_idx = 0, l_buf_idx = 0, ch_wdth, tok_start_idx, search_char_idx, end_search_idx
@@ -2162,17 +2161,6 @@ cdef void _search_for_chars(
 cdef uint64_t FNV1A_OFFSET_BASIS = 0xcbf29ce484222325
 cdef uint64_t FNV1A_PRIME = 0x00000100000001B3

-def get_fnv1a_hash(input: bytes):
-    """ Python-callable method to facilitate testing. """
-    cdef uint64_t hash_val = FNV1A_OFFSET_BASIS
-    cdef int length = len(input), offset = 0
-
-    while offset < length:
-        hash_val ^= input[offset]
-        hash_val *= FNV1A_PRIME
-        offset += 1
-    return hash_val
-

 @cython.boundscheck(False)  # Deactivate bounds checking
 cdef int _write_hashes(
@@ -1741,14 +1741,15 @@ def get_search_char_byte_arrays(
     search_char_string: str, case_sensitive: bool
 ) -> Tuple[bytes, bytes]:
     """
-    This function supports the rich feature extractor. It orders the characters in
-    *search_char_string*, removing any duplicates, encodes them with UTF-8, and
-    returns the result together with a byte array containing the offsets where the
-    characters of various byte lengths start within the result, i.e.
+    This function supports *RichMultiHashEmbed*. It orders the characters in
+    *search_char_string*, removing any duplicates, encodes them as UTF-8, and
+    returns the result buffer together with a byte array containing the offsets
+    where the characters of various byte lengths start within the result buffer,
+    i.e.

     <1-byte-start>, <2-byte-start>, <3-byte-start>, <4-byte-start>, <4-byte-end>.

-    If the string does not contain any characters of length *n*,
+    If the result buffer does not contain any characters of length *n*,
     <n_byte_start> == <n+1_byte_start>.
     """
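A rough pure-Python sketch of the behaviour this docstring describes; the real helper lives in `spacy.util`, and the casefolding and ordering details here are assumptions:

```python
from typing import Tuple

def _sketch_search_char_byte_arrays(
    search_char_string: str, case_sensitive: bool
) -> Tuple[bytes, bytes]:
    chars = search_char_string if case_sensitive else search_char_string.lower()
    # Deduplicate, then order by UTF-8 byte length (1-4) and byte value.
    encoded = sorted({c.encode("utf-8") for c in chars}, key=lambda e: (len(e), e))
    result = b"".join(encoded)
    offsets, pos = [], 0
    for width in (1, 2, 3, 4):
        offsets.append(pos)              # <width-byte-start>
        pos += sum(len(e) for e in encoded if len(e) == width)
    offsets.append(pos)                  # <4-byte-end>
    return result, bytes(offsets)        # assumes each offset fits in a byte

# German vowels: five 1-byte characters, then three 2-byte umlauts.
chars, offs = _sketch_search_char_byte_arrays("äöüaeiou", case_sensitive=False)
assert chars == "aeiouäöü".encode("utf-8")
assert offs == bytes([0, 5, 11, 11, 11])
```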
@@ -205,11 +205,13 @@ characters in each word are examined in order starting at the beginning or at
 the end; and each character that matches one of the search characters is added,
 in order, to the string to be used as a feature. The search continues until
 either the search result string is full or the whole word has been examined.
-This is useful because many languages exhibit morphological alternations where
+This is useful because some languages exhibit morphological alternations where
 one letter or letters regularly alternate with another letter or letters
 depending on the presence of some other letter before or after it, e.g. German
 plural nouns where the final two vowels are `ä-e` regularly correspond to
-singular lemmas where the `e` is no longer present and the `ä` has become `a`.
+singular lemmas where the `e` is no longer present and the `ä` has become `a`,
+e.g. `die Bäche` (plural) vs. `der Bach` (singular).

 For most languages used with spaCy, searching is likely to be useful starting at
 the end (`suff_*`), but the ability to search from the beginning (`pref_*`) is
 also offered for completeness. Search characters should consist of all
@@ -224,7 +226,7 @@ than one UTF-8 character, e.g. _i_ when representing the lower-case form of the
 Turkish letter _İ_. Such situations are supported, but the lengths of prefixes,
 suffixes and character search results may need to be increased accordingly.

-All lengths must be specified in ascending order.
+All arrays specifying lengths must be in ascending order.

 | Name | Description |
 | ---- | ----------- |
|
@ -235,7 +237,7 @@ All lengths must be specified in ascending order.
|
|||
| `case_sensitive` | Whether lower-case and upper-case letters should be distinguished when generating the character combinations to use as features. ~~bool~~ |
|
||||
| `pref_lengths` | The lengths of prefixes to use as features for each word, e.g. for the word `spaCy`: `[1, 3]` would lead to `s` and `spa` being used as features. ~~Optional[List[int]~~ |
|
||||
| `pref_rows` | The number of rows for each of `pref_lengths`. ~~Optional[List[int]~~ |
|
||||
| `suff_lengths` | The lengths of suffixes to use as features for each word, e.g. for the word `spaCy`: `[1, 3]` would lead to `y` and `aCy` being used as features. ~~Optional[List[int]~~ |
|
||||
| `suff_lengths` | The lengths of suffixes to use as features for each word, e.g. for the word `spaCy`: `[1, 3]` would lead to `y` and `yCa` being used as features. ~~Optional[List[int]~~ |
|
||||
| `suff_rows` | The number of rows for each of `suff_lengths`. ~~Optional[List[int]~~ |
|
||||
| `pref_search_chars` | A string containing characters to search for starting from the beginning of each word. ~~Optional[str]~~ |
|
||||
| `pref_search_lengths` | The lengths of search result strings to use as features, where the searches start from the beginning of each word. ~~Optional[List[int]]~~ |
|
||||
|
|