Major refactoring

This commit is contained in:
richardpaulhudson 2022-10-21 12:01:24 +02:00
parent f7d9942e7c
commit 42b7b8d509
6 changed files with 73 additions and 70 deletions

View File

@ -224,11 +224,11 @@ def RichMultiHashEmbed(
case_sensitive: bool,
pref_lengths: Optional[List[int]] = None,
pref_rows: Optional[List[int]] = None,
suff_lengths: Optional[List[int]] = None,
suff_rows: Optional[List[int]] = None,
pref_search_chars: Optional[str] = None,
pref_search_lengths: Optional[List[int]] = None,
pref_search_rows: Optional[List[int]] = None,
suff_lengths: Optional[List[int]] = None,
suff_rows: Optional[List[int]] = None,
suff_search_chars: Optional[str] = None,
suff_search_lengths: Optional[List[int]] = None,
suff_search_rows: Optional[List[int]] = None,
@ -252,13 +252,14 @@ def RichMultiHashEmbed(
depending on the presence of some other letter before or after it, e.g. German
plural nouns where the final two vowels are `ä-e` regularly correspond to
singular lemmas where the `e` is no longer present and the `ä` has become `a`.
For most languages, searching is likely to be useful starting at the end
(`suff_*`), but the ability to search from the beginning (`pref_*`) is also
offered for completeness. Search characters should consist of all characters
that regularly alternate with other characters in the language in question or
whose presence before or after characters that would otherwise alternate
prevents the alternation from occurring, e.g. an `ä` in a German plural noun does
not become `a` if it is the third or fourth vowel from the end of the word.
For most languages used with spaCy, searching is likely to be useful starting
at the end (`suff_*`), but the ability to search from the beginning (`pref_*`)
is also offered for completeness. Search characters should consist of all
characters that regularly alternate with other characters in the language in
question or whose presence before or after characters that would otherwise
alternate prevents the alternation from occurring, e.g. an `ä` in a German
plural noun does not become `a` if it is the third or fourth vowel from the
end of the word.
width (int): The output width. Also used as the width of the embedding tables.
Recommended values are between 64 and 300.
@ -274,22 +275,18 @@ def RichMultiHashEmbed(
for each word, e.g. for the word `spaCy`:
`[1, 3]` would lead to `s` and `spa` being used as features.
pref_rows (Optional[List[int]]): The number of rows for each of `pref_lengths`.
pref_search_chars (Optional[str]): A string containing characters to search for
starting from the beginning of each word. May not contain characters that
occupy four bytes in UTF-16; if `case_sensitive==True`, may not contain
upper-case letters.
pref_search_lengths (Optional[List[int]]): The lengths of search result strings
to use as features, where the searches start from the beginning of each word.
pref_search_rows (Optional[List[int]]): The number of rows for each of
`pref_search_lengths`.
suff_lengths (Optional[List[int]]): The lengths of suffixes to use as features
for each word, e.g. for the word `spaCy`:
`[1, 3]` would lead to `y` and `aCy` being used as features.
suff_rows (Optional[List[int]]): The number of rows for each of `suff_lengths`.
pref_search_chars (Optional[str]): A string containing characters to search for
starting from the beginning of each word.
pref_search_lengths (Optional[List[int]]): The lengths of search result strings
to use as features, where the searches start from the beginning of each word.
pref_search_rows (Optional[List[int]]): The number of rows for each of
`pref_search_lengths`.
suff_search_chars (Optional[str]): A string containing characters to search for
starting from the end of each word. May not contain characters that
occupy four bytes in UTF-16; if `case_sensitive==True`, may not contain
upper-case letters.
starting from the end of each word.
suff_search_lengths (Optional[List[int]]): The lengths of search result strings
to use as features, where the searches start from the end of each word.
suff_search_rows (Optional[List[int]]): The number of rows for each of
@ -302,6 +299,9 @@ def RichMultiHashEmbed(
_verify_rich_config_group(
"prefix", pref_lengths, pref_rows, None, False, case_sensitive
)
_verify_rich_config_group(
"suffix", suff_lengths, suff_rows, None, False, case_sensitive
)
_verify_rich_config_group(
"prefix search",
pref_search_lengths,
@ -310,9 +310,6 @@ def RichMultiHashEmbed(
True,
case_sensitive,
)
_verify_rich_config_group(
"suffix", suff_lengths, suff_rows, None, False, case_sensitive
)
_verify_rich_config_group(
"suffix search",
suff_search_lengths,
@ -324,10 +321,10 @@ def RichMultiHashEmbed(
if pref_rows is not None:
rows.extend(pref_rows)
if pref_search_rows is not None:
rows.extend(pref_search_rows)
if suff_rows is not None:
rows.extend(suff_rows)
if pref_search_rows is not None:
rows.extend(pref_search_rows)
if suff_search_rows is not None:
rows.extend(suff_search_rows)
@ -344,9 +341,9 @@ def RichMultiHashEmbed(
RichFeatureExtractor(
case_sensitive=case_sensitive,
pref_lengths=pref_lengths,
suff_lengths=suff_lengths,
pref_search_chars=pref_search_chars,
pref_search_lengths=pref_search_lengths,
suff_lengths=suff_lengths,
suff_search_chars=suff_search_chars,
suff_search_lengths=suff_search_lengths,
),

View File

@ -11,9 +11,9 @@ def RichFeatureExtractor(
*,
case_sensitive: bool,
pref_lengths: Optional[List[int]] = None,
suff_lengths: Optional[List[int]] = None,
pref_search_chars: Optional[str] = None,
pref_search_lengths: Optional[List[int]] = None,
suff_lengths: Optional[List[int]] = None,
suff_search_chars: Optional[str] = None,
suff_search_lengths: Optional[List[int]] = None,
) -> Model[List[Doc], List[Ints2d]]:

View File

@ -67,7 +67,7 @@ cdef void _search_for_chars(
Py_UCS4* result_buf,
const int result_buf_len,
bint suffs_not_prefs
)
) nogil
cdef class Doc:

View File

@ -1755,6 +1755,7 @@ cdef class Doc:
derived from the raw text of each token.
Generally:
p_ variables relate to prefixes (affixes starting at the beginning of the word)
s_ variables relate to suffixes (affixes starting at the end of the word)
ps_ variables relate to searches starting at the beginning of the word
@ -1770,7 +1771,7 @@ cdef class Doc:
the corresponding position in *ps_search* is matched. Having separate search and lookup arrays enables
case-insensitivity to be handled efficiently.
ps_search_l: the number of characters in *ps_search* and hence also in *ps_lookup*
ps_lengths: an Ints1d specifying the lengths of search results to be hashed. For example if
ps_lengths: an Ints1d specifying the lengths of search results (from the beginning) to be hashed. For example, if
*ps_lengths==[1, 2]*, *ps_search=="aC"* and *cs==False*, the searched strings hashed for
"spaCy" would be "a" and "ac".
ss_search: a byte array containing characters to search for within each token, starting at the end.
@ -1778,12 +1779,13 @@ cdef class Doc:
the corresponding position in *ss_search* is matched. Having separate search and lookup arrays enables
case-insensitivity to be handled efficiently.
ss_l: the number of characters in *ss_search* and hence also in *ss_lookup*
ss_lengths: an integer list specifying the lengths of search results to be hashed. For example if
*suff_search_lengths==[1, 2]*, *suff_search_chars=="aC"* and *cs==False*, the searched strings hashed for
ss_lengths: an integer list specifying the lengths of search results (from the end) to be hashed. For example, if
*ss_lengths==[1, 2]*, *ss_search=="aC"* and *cs==False*, the searched strings hashed for
"spaCy" would be "c" and "ca".
For a document with tokens ["spaCy", "and", "Prodigy"], the NumPy array returned by
*get_character_combination_hashes(True, [2], [2, 4, 6], "", "", 0, [], "yC", "yC", 2, [1, 2])* would correspond to
*get_character_combination_hashes(True, [2], [2, 4, 6], "", "", 0, [], "yC", "yC", 2, [1, 2])*
would correspond to
[[hash("sp"), [hash("Cy"), hash("paCy"), hash(" spaCy"), hash("y"), hash("yC")],
[hash("an"), hash("nd"), hash(" and"), hash(" and"), hash(" "), hash(" ")],
@ -1812,18 +1814,18 @@ cdef class Doc:
# Define / allocate buffer (pr/sr: result buffers)
cdef int aff_buf_l = p_max_l + s_max_l
cdef Py_UCS4* aff_buf = <Py_UCS4*> self.mem.alloc(aff_buf_l, sizeof(Py_UCS4))
cdef Py_UCS4* ps_buf = <Py_UCS4*> ps_search
cdef Py_UCS4* pl_buf = <Py_UCS4*> ps_lookup
cdef Py_UCS4* pr_buf = <Py_UCS4*> self.mem.alloc(ps_max_l, sizeof(Py_UCS4))
cdef Py_UCS4* ss_buf = <Py_UCS4*> ss_search
cdef Py_UCS4* sl_buf = <Py_UCS4*> ss_lookup
cdef Py_UCS4* sr_buf = <Py_UCS4*> self.mem.alloc(ss_max_l, sizeof(Py_UCS4))
cdef Py_UCS4* ps_s_buf = <Py_UCS4*> ps_search
cdef Py_UCS4* ps_l_buf = <Py_UCS4*> ps_lookup
cdef Py_UCS4* ps_r_buf = <Py_UCS4*> self.mem.alloc(ps_max_l, sizeof(Py_UCS4))
cdef Py_UCS4* ss_s_buf = <Py_UCS4*> ss_search
cdef Py_UCS4* ss_l_buf = <Py_UCS4*> ss_lookup
cdef Py_UCS4* ss_r_buf = <Py_UCS4*> self.mem.alloc(ss_max_l, sizeof(Py_UCS4))
# Define memory views on length arrays
cdef int[:] p_v = p_lengths
cdef int[:] s_v = s_lengths
cdef int[:] ps_v = ps_lengths
cdef int[:] ss_v = ss_lengths
cdef int[:] p_lengths_v = p_lengths
cdef int[:] s_lengths_v = s_lengths
cdef int[:] ps_lengths_v = ps_lengths
cdef int[:] ss_lengths_v = ss_lengths
# Define working variables
cdef TokenC tok_c
@ -1838,27 +1840,27 @@ cdef class Doc:
_set_affixes(text_buf, tok_idx, tok_len, aff_buf, p_max_l, s_max_l, not cs)
for hash_idx in range(p_h_num):
hashes[tok_i, hash_idx] = hash32(aff_buf, p_v[hash_idx] * sizeof(Py_UCS4), 0)
hashes[tok_i, hash_idx] = hash32(aff_buf, p_lengths_v[hash_idx] * sizeof(Py_UCS4), 0)
for hash_idx in range(p_h_num, s_h_end):
aff_len = s_v[hash_idx - p_h_num]
aff_len = s_lengths_v[hash_idx - p_h_num]
hashes[tok_i, hash_idx] = hash32(aff_buf + aff_buf_l - aff_len, aff_len * sizeof(Py_UCS4), 0)
if ps_h_num > 0:
_search_for_chars(text_buf, tok_idx, tok_len, ps_buf, pl_buf, ps_l, pr_buf, ps_max_l, False)
_search_for_chars(text_buf, tok_idx, tok_len, ps_s_buf, ps_l_buf, ps_l, ps_r_buf, ps_max_l, False)
for hash_idx in range(s_h_end, ps_h_end):
aff_len = ps_v[hash_idx - s_h_end]
hashes[tok_i, hash_idx] = hash32(pr_buf, aff_len * sizeof(Py_UCS4), 0)
aff_len = ps_lengths_v[hash_idx - s_h_end]
hashes[tok_i, hash_idx] = hash32(ps_r_buf, aff_len * sizeof(Py_UCS4), 0)
if ss_h_num > 0:
_search_for_chars(text_buf, tok_idx, tok_len, ss_buf, sl_buf, ss_l, sr_buf, ss_max_l, True)
_search_for_chars(text_buf, tok_idx, tok_len, ss_s_buf, ss_l_buf, ss_l, ss_r_buf, ss_max_l, True)
for hash_idx in range(ps_h_end, ss_h_end):
aff_len = ss_v[hash_idx - ps_h_end]
hashes[tok_i, hash_idx] = hash32(sr_buf, aff_len * sizeof(Py_UCS4), 0)
aff_len = ss_lengths_v[hash_idx - ps_h_end]
hashes[tok_i, hash_idx] = hash32(ss_r_buf, aff_len * sizeof(Py_UCS4), 0)
self.mem.free(aff_buf)
self.mem.free(pr_buf)
self.mem.free(sr_buf)
self.mem.free(ps_r_buf)
self.mem.free(ss_r_buf)
return hashes
@staticmethod
@ -2051,8 +2053,9 @@ cdef void _copy_chars(
"""Copies *length* Py_UCS4 characters from *source* to *target*. If *to_lower==True*, converts
any upper-case characters to lower case within the target buffer.
"""
memcpy(target, source, length * sizeof(Py_UCS4))
cdef int idx
memcpy(target, source, length * sizeof(Py_UCS4))
if to_lower:
for idx in range(length):
if Py_UNICODE_ISUPPER(target[idx]):
@ -2089,15 +2092,18 @@ cdef void _set_affixes(
if tok_len < pref_len:
memset(aff_buf + aff_buf_idx, 0, sizeof(Py_UCS4) * (pref_len - tok_len))
aff_buf_idx = aff_buf_len - suff_len
if tok_len < suff_len:
memset(aff_buf + aff_buf_idx, 0, sizeof(Py_UCS4) * (suff_len - tok_len))
aff_buf_idx = aff_buf_len - tok_len
if suff_len > 0:
# in_word_idx: the index within the token where the suffix starts
in_word_idx = aff_buf_idx + tok_len - aff_buf_len
if in_word_idx < pref_len:
memcpy(aff_buf + aff_buf_idx, aff_buf + in_word_idx, sizeof(Py_UCS4) * (filled_pref_len - in_word_idx))
aff_buf_idx += filled_pref_len - in_word_idx
in_word_idx = aff_buf_idx + tok_len - aff_buf_len
if aff_buf_idx < aff_buf_len:
_copy_chars(aff_buf + aff_buf_idx, text_buf + tok_idx + in_word_idx, aff_buf_len - aff_buf_idx, to_lower)
@ -2112,7 +2118,7 @@ cdef void _search_for_chars(
Py_UCS4* result_buf,
const int result_buf_len,
bint suffs_not_prefs
):
) nogil:
""" Search a word within a string for characters within *search_buf*, starting at the beginning or
end depending on the value of *suffs_not_prefs*. Wherever a character from *search_buf* matches,
the corresponding character from *lookup_buf* is added to *result_buf*.

View File

@ -1791,8 +1791,7 @@ def get_arrays_for_search_chars(
def get_ordered_raw_bytes(
search: List[bytes], lookup: List[bytes]
) -> Tuple[bytes, bytes]:
"""Flatten the two lists, ordering both by the entries in *search*
using the native endianness of the platform.
"""Flatten the two lists, ordering both by the entries in *search*.
"""
num_search = [list(entry) for entry in search]
search = [entry for _, entry in sorted(zip(num_search, search))]

View File

@ -210,13 +210,14 @@ one letter or letters regularly alternate with another letter or letters
depending on the presence of some other letter before or after it, e.g. German
plural nouns where the final two vowels are `ä-e` regularly correspond to
singular lemmas where the `e` is no longer present and the `ä` has become `a`.
For most languages, searching is likely to be useful starting at the end
(`suff_*`), but the ability to search from the beginning (`pref_*`) is also
offered for completeness. Search characters should consist of all characters
that regularly alternate with other characters in the language in question or
whose presence before or after characters that would otherwise alternate
prevents the alternation from occurring, e.g. an `ä` in a German plural noun
does not become `a` if it is the third or fourth vowel from the end of the word.
For most languages used with spaCy, searching is likely to be useful starting
at the end (`suff_*`), but the ability to search from the beginning (`pref_*`)
is also offered for completeness. Search characters should consist of all
characters that regularly alternate with other characters in the language in
question or whose presence before or after characters that would otherwise
alternate prevents the alternation from occurring, e.g. an `ä` in a German
plural noun does not become `a` if it is the third or fourth vowel from the
end of the word.
| Name | Description |
| ------------------------ | -------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
@ -226,13 +227,13 @@ does not become `a` if it is the third or fourth vowel from the end of the word.
| `include_static_vectors` | Whether to also use static word vectors. Requires a vectors table to be loaded in the [`Doc`](/api/doc) objects' vocab. ~~bool~~ |
| `case_sensitive` | Whether lower-case and upper-case letters should be distinguished when generating the character combinations to use as features. ~~bool~~ |
| `pref_lengths` | The lengths of prefixes to use as features for each word, e.g. for the word `spaCy`: `[1, 3]` would lead to `s` and `spa` being used as features. ~~Optional[List[int]]~~ |
| `suff_lengths` | The lengths of suffixes to use as features for each word, e.g. for the word `spaCy`: `[1, 3]` would lead to `y` and `aCy` being used as features. ~~Optional[List[int]]~~ |
| `pref_rows` | The number of rows for each of `pref_lengths`. ~~Optional[List[int]]~~ |
| `pref_search_chars` | A string containing characters to search for starting from the beginning of each word. May not contain characters that occupy four bytes in UTF-16; if `case_sensitive==True`, may not contain upper-case letters. ~~Optional[str]~~ |
| `pref_search_chars` | A string containing characters to search for starting from the beginning of each word. ~~Optional[str]~~ |
| `pref_search_lengths` | The lengths of search result strings to use as features, where the searches start from the beginning of each word. ~~Optional[List[int]]~~ |
| `pref_search_rows` | The number of rows for each of `pref_search_lengths`. ~~Optional[List[int]]~~ |
| `suff_lengths` | The lengths of suffixes to use as features for each word, e.g. for the word `spaCy`: `[1, 3]` would lead to `y` and `aCy` being used as features. ~~Optional[List[int]]~~ |
| `suff_rows` | The number of rows for each of `suff_lengths`. ~~Optional[List[int]]~~ |
| `suff_search_chars` | A string containing characters to search for starting from the end of each word. May not contain characters that occupy four bytes in UTF-16; if `case_sensitive==True`, may not contain upper-case letters. ~~Optional[str]~~ |
| `suff_search_chars` | A string containing characters to search for starting from the end of each word. ~~Optional[str]~~ |
| `suff_search_lengths` | The lengths of search result strings to use as features, where the searches start from the end of each word. ~~Optional[List[int]]~~ |
| `suff_search_rows` | The number of rows for each of `suff_search_lengths`. ~~Optional[List[int]]~~ |
| **CREATES** | The model using the architecture. ~~Model[List[Doc], List[Floats2d]]~~ |