Python code and documentation

This commit is contained in:
richardpaulhudson 2022-10-06 15:10:27 +02:00
parent 06fe50a12d
commit 581f380c00
4 changed files with 247 additions and 160 deletions

View File

@@ -1,61 +0,0 @@
from typing import List, Optional, Callable, Tuple
from thinc.types import Ints2d
from thinc.api import Model, registry, get_current_ops
from ..tokens import Doc


@registry.layers("spacy.AffixExtractor.v1")
def AffixExtractor(
*,
suffs_not_prefs: bool,
case_sensitive: bool,
len_start: Optional[int],
len_end: Optional[int],
special_chars: Optional[str],
sc_len_start: Optional[int],
sc_len_end: Optional[int],
) -> Model[List[Doc], List[Ints2d]]:
return Model(
"extract_affixes",
forward,
attrs={
"suffs_not_prefs": suffs_not_prefs,
"case_sensitive": case_sensitive,
"len_start": len_start if len_start is not None else 0,
"len_end": len_end if len_end is not None else 0,
"special_chars": special_chars if special_chars is not None else "",
"sc_len_start": sc_len_start if sc_len_start is not None else 0,
"sc_len_end": sc_len_end if sc_len_end is not None else 0,
},
)


def forward(
model: Model[List[Doc], List[Ints2d]], docs, is_train: bool
) -> Tuple[List[Ints2d], Callable]:
suffs_not_prefs: bool = model.attrs["suffs_not_prefs"]
case_sensitive: bool = model.attrs["case_sensitive"]
len_start: int = model.attrs["len_start"]
len_end: int = model.attrs["len_end"]
special_chars: str = model.attrs["special_chars"]
sc_len_start: int = model.attrs["sc_len_start"]
sc_len_end: int = model.attrs["sc_len_end"]
features: List[Ints2d] = []
# for doc in docs:
# features.append(
# model.ops.asarray2i(
# doc.get_affix_hashes(
# suffs_not_prefs,
# case_sensitive,
# len_start,
# len_end,
# special_chars,
# sc_len_start,
# sc_len_end,
# )
# )
# )
backprop: Callable[[List[Ints2d]], List] = lambda d_features: []
return features, backprop

View File

@@ -1,6 +1,7 @@
+from encodings import search_function
from typing import Optional, List, Union, cast
-from spacy.ml.affixextractor import AffixExtractor
-from thinc.types import Floats2d, Ints2d, Ragged, Ints1d
+from spacy.ml.richfeatureextractor import RichFeatureExtractor
+from thinc.types import Floats2d, Ints2d, Ragged
from thinc.api import chain, clone, concatenate, with_array, with_padded
from thinc.api import Model, noop, list2ragged, ragged2list, HashEmbed
from thinc.api import expand_window, residual, Maxout, Mish, PyTorchLSTM
@@ -187,52 +188,50 @@ def MultiHashEmbed(
    return model


-def process_affix_config_group(
-    label: str,
-    start_len: Optional[int],
-    end_len: Optional[int],
-    rows: Optional[List[int]],
-    scs: Optional[str],
-    is_sc: bool,
-) -> List[int]:
-    if start_len is not None or end_len is not None or rows is not None:
-        if start_len is None or end_len is None or rows is None:
-            raise ValueError(Errors.E1045.format(label=label))
-        if start_len < 0 or end_len < start_len + 1:
-            raise ValueError(Errors.E1045.format(label=label))
-        if is_sc and scs is None:
-            raise ValueError(Errors.E1045.format(label=label))
-        if scs is not None and scs != scs.lower():
-            raise ValueError(Errors.E1044.format(label=label))
-        if len(rows) != end_len - start_len:
-            raise ValueError(Errors.E1045.format(label=label))
-    elif scs is not None:
-        raise ValueError(Errors.E1045.format(label=label))
-    return rows if rows is not None else []
+def verify_rich_config_group(
+    label: str,
+    lengths: Optional[List[int]],
+    rows: Optional[List[int]],
+    search_chars: Optional[str],
+    is_search_char_group: bool,
+    case_sensitive: bool,
+) -> None:
+    if lengths is not None or rows is not None:
+        if is_search_char_group and (search_chars is None or len(search_chars) == 0):
+            raise ValueError(Errors.E1045.format(label=label))
+        if lengths is None or rows is None:
+            raise ValueError(Errors.E1045.format(label=label))
+        if len(lengths) != len(rows):
+            raise ValueError(Errors.E1045.format(label=label))
+        if any([length < 1 for length in lengths]):
+            raise ValueError(Errors.E1045.format(label=label))
+        if (
+            not case_sensitive
+            and search_chars is not None
+            and search_chars != search_chars.lower()
+        ):
+            raise ValueError(Errors.E1044.format(label=label))
+    elif search_chars is not None:
+        raise ValueError(Errors.E1045.format(label=label))
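For illustration, a minimal sketch of how the new validation behaves, with hypothetical values (in this changeset `Errors.E1045` covers missing or mismatched lengths/rows and `Errors.E1044` covers upper-case search characters in case-insensitive mode):

```python
# Valid: lengths and rows align, and the search group supplies search chars.
verify_rich_config_group(
    "suffix search",
    [2, 3],        # lengths
    [2000, 2000],  # rows
    "aeiou",       # search_chars
    True,          # is_search_char_group
    False,         # case_sensitive
)
# Raises E1045: two lengths but only one row count.
verify_rich_config_group("suffix", [2, 3], [2000], None, False, False)
# Raises E1044: upper-case search characters although case_sensitive is False.
verify_rich_config_group("suffix search", [2], [2000], "AEIOU", True, False)
```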
-@registry.architectures("spacy.AffixMultiHashEmbed.v1")
-def AffixMultiHashEmbed(
+@registry.architectures("spacy.RichMultiHashEmbed.v1")
+def RichMultiHashEmbed(
    width: int,
    attrs: List[Union[str, int]],
    rows: List[int],
    include_static_vectors: bool,
    *,
-    affix_case_sensitive: bool,
-    suffix_start_len: Optional[int] = None,
-    suffix_end_len: Optional[int] = None,
-    suffix_rows: Optional[List[int]] = None,
-    suffix_scs: Optional[str] = None,
-    suffix_sc_start_len: Optional[int] = None,
-    suffix_sc_end_len: Optional[int] = None,
-    suffix_sc_rows: Optional[List[int]] = None,
-    prefix_start_len: Optional[int] = None,
-    prefix_end_len: Optional[int] = None,
-    prefix_rows: Optional[List[int]] = None,
-    prefix_scs: Optional[str] = None,
-    prefix_sc_start_len: Optional[int] = None,
-    prefix_sc_end_len: Optional[int] = None,
-    prefix_sc_rows: Optional[List[int]] = None,
+    case_sensitive: bool,
+    pref_lengths: Optional[List[int]] = None,
+    pref_rows: Optional[List[int]] = None,
+    pref_search_chars: Optional[str] = None,
+    pref_search_lengths: Optional[List[int]] = None,
+    pref_search_rows: Optional[List[int]] = None,
+    suff_lengths: Optional[List[int]] = None,
+    suff_rows: Optional[List[int]] = None,
+    suff_search_chars: Optional[str] = None,
+    suff_search_lengths: Optional[List[int]] = None,
+    suff_search_rows: Optional[List[int]] = None,
) -> Model[List[Doc], List[Floats2d]]:
"""
@@ -242,74 +241,62 @@ def AffixMultiHashEmbed(
if len(rows) != len(attrs):
raise ValueError(f"Mismatched lengths: {len(rows)} vs {len(attrs)}")
-    rows.extend(
-        process_affix_config_group(
-            "prefix", prefix_start_len, prefix_end_len, prefix_rows, None, False
-        )
-    )
-    rows.extend(
-        process_affix_config_group(
-            "prefix_sc",
-            prefix_sc_start_len,
-            prefix_sc_end_len,
-            prefix_sc_rows,
-            prefix_scs,
-            True,
-        )
-    )
-    rows.extend(
-        process_affix_config_group(
-            "suffix", suffix_start_len, suffix_end_len, suffix_rows, None, False
-        )
-    )
-    rows.extend(
-        process_affix_config_group(
-            "suffix_sc",
-            suffix_sc_start_len,
-            suffix_sc_end_len,
-            suffix_sc_rows,
-            suffix_scs,
-            True,
-        )
-    )
-    embeddings = [  # type:ignore
-        HashEmbed(width, row, column=i, seed=i + 7, dropout=0.0)  # type: ignore
+    verify_rich_config_group(
+        "prefix", pref_lengths, pref_rows, None, False, case_sensitive
+    )
+    verify_rich_config_group(
+        "prefix search",
+        pref_search_lengths,
+        pref_search_rows,
+        pref_search_chars,
+        True,
+        case_sensitive,
+    )
+    verify_rich_config_group(
+        "suffix", suff_lengths, suff_rows, None, False, case_sensitive
+    )
+    verify_rich_config_group(
+        "suffix search",
+        suff_search_lengths,
+        suff_search_rows,
+        suff_search_chars,
+        True,
+        case_sensitive,
+    )
+    if pref_rows is not None:
+        rows.extend(pref_rows)
+    if pref_search_rows is not None:
+        rows.extend(pref_search_rows)
+    if suff_rows is not None:
+        rows.extend(suff_rows)
+    if suff_search_rows is not None:
+        rows.extend(suff_search_rows)
+    embeddings: List[Model[Ints2d, Floats2d]] = [
+        HashEmbed(width, row, column=i, seed=i + 7, dropout=0.0)
for i, row in enumerate(rows)
]
concat_size = width * (len(embeddings) + include_static_vectors)
max_out: Model[Ragged, Ragged] = with_array(
Maxout(width, concat_size, nP=3, dropout=0.0, normalize=True)
)
-    extractors = [FeatureExtractor(attrs)]
-    if prefix_start_len is not None or prefix_sc_start_len is not None:
-        extractors.append(
-            AffixExtractor(
-                suffs_not_prefs=False,
-                case_sensitive=affix_case_sensitive,
-                len_start=prefix_start_len,
-                len_end=prefix_end_len,
-                special_chars=prefix_scs,
-                sc_len_start=prefix_sc_start_len,
-                sc_len_end=prefix_sc_end_len,
-            )
-        )
-    if suffix_start_len is not None or suffix_sc_start_len is not None:
-        extractors.append(
-            AffixExtractor(
-                suffs_not_prefs=True,
-                case_sensitive=affix_case_sensitive,
-                len_start=suffix_start_len,
-                len_end=suffix_end_len,
-                special_chars=suffix_scs,
-                sc_len_start=suffix_sc_start_len,
-                sc_len_end=suffix_sc_end_len,
-            )
-        )
+    extractors = concatenate(
+        FeatureExtractor(attrs),
+        RichFeatureExtractor(
+            case_sensitive=case_sensitive,
+            pref_lengths=pref_lengths,
+            pref_search_chars=pref_search_chars,
+            pref_search_lengths=pref_search_lengths,
+            suff_lengths=suff_lengths,
+            suff_search_chars=suff_search_chars,
+            suff_search_lengths=suff_search_lengths,
+        ),
+    )
-    if include_static_vectors:
-        feature_extractor: Model[List[Doc], Ragged] = chain(  # type: ignore
-            concatenate(*extractors),
+    if include_static_vectors:
+        feature_extractor: Model[List[Doc], Ragged] = chain(
+            extractors,
cast(Model[List[Ints2d], Ragged], list2ragged()),
with_array(concatenate(*embeddings)),
)
@@ -322,8 +309,8 @@ def AffixMultiHashEmbed(
ragged2list(),
)
else:
-        model = chain(  # type: ignore
-            concatenate(*extractors),
+        model = chain(
+            extractors,
cast(Model[List[Ints2d], Ragged], list2ragged()),
with_array(concatenate(*embeddings)),
max_out,

View File

@@ -0,0 +1,72 @@
from typing import List, Optional, Callable, Tuple
from thinc.types import Ints2d
from thinc.api import Model, registry
from ..tokens import Doc


@registry.layers("spacy.RichFeatureExtractor.v1")
def RichFeatureExtractor(
*,
case_sensitive: bool,
pref_lengths: Optional[List[int]] = None,
pref_search_chars: Optional[str] = None,
pref_search_lengths: Optional[List[int]] = None,
suff_lengths: Optional[List[int]] = None,
suff_search_chars: Optional[str] = None,
suff_search_lengths: Optional[List[int]] = None,
) -> Model[List[Doc], List[Ints2d]]:
return Model(
"extract_character_combination_hashes",
forward,
attrs={
"case_sensitive": case_sensitive,
"pref_lengths": pref_lengths if pref_lengths is not None else [],
"pref_search_chars": pref_search_chars
if pref_search_chars is not None
else "",
"pref_search_lengths": pref_search_lengths
if pref_search_lengths is not None
else [],
"suff_lengths": suff_lengths if suff_lengths is not None else [],
"suff_search_chars": suff_search_chars
if suff_search_chars is not None
else "",
"suff_search_lengths": suff_search_lengths
if suff_search_lengths is not None
else [],
},
)


def forward(
model: Model[List[Doc], List[Ints2d]], docs, is_train: bool
) -> Tuple[List[Ints2d], Callable]:
ops = model.ops
case_sensitive: bool = model.attrs["case_sensitive"]
pref_lengths: List[int] = model.attrs["pref_lengths"]
pref_search_chars: str = model.attrs["pref_search_chars"]
pref_search_lengths: List[int] = model.attrs["pref_search_lengths"]
suff_lengths: List[int] = model.attrs["suff_lengths"]
suff_search_chars: str = model.attrs["suff_search_chars"]
suff_search_lengths: List[int] = model.attrs["suff_search_lengths"]
features: List[Ints2d] = []
for doc in docs:
prefix_hashes = doc.get_character_combination_hashes(
case_sensitive=case_sensitive,
suffs_not_prefs=False,
affix_lengths=pref_lengths,
search_chars=pref_search_chars,
search_lengths=pref_search_lengths,
)
suffix_hashes = doc.get_character_combination_hashes(
case_sensitive=case_sensitive,
suffs_not_prefs=True,
affix_lengths=suff_lengths,
search_chars=suff_search_chars,
search_lengths=suff_search_lengths,
)
features.append(ops.asarray2i(ops.xp.hstack([prefix_hashes, suffix_hashes])))
backprop: Callable[[List[Ints2d]], List] = lambda d_features: []
return features, backprop
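For illustration, a minimal sketch of how this layer might be driven on its own. It assumes a build in which the `Doc.get_character_combination_hashes` method from this changeset is available:

```python
import spacy
from spacy.ml.richfeatureextractor import RichFeatureExtractor

nlp = spacy.blank("de")
extractor = RichFeatureExtractor(
    case_sensitive=False,
    suff_lengths=[2, 3, 4, 5],
    suff_search_chars="aeiouäöüyß",
    suff_search_lengths=[2, 3],
)
docs = [nlp("Die Bäume wachsen")]
# Calling a thinc Model returns (output, backprop); backprop is a no-op here
# because the layer has no trainable weights.
hashes, backprop = extractor(docs, is_train=False)
print(hashes[0].shape)  # (number of tokens, number of hash features)
```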

View File

@@ -169,6 +169,95 @@ updated).
| `include_static_vectors` | Whether to also use static word vectors. Requires a vectors table to be loaded in the [`Doc`](/api/doc) objects' vocab. ~~bool~~ |
| **CREATES** | The model using the architecture. ~~Model[List[Doc], List[Floats2d]]~~ |
### spacy.RichMultiHashEmbed.v1 {#RichMultiHashEmbed}
> #### Example config
>
> ```ini
> [model]
> @architectures = "spacy.RichMultiHashEmbed.v1"
> width = 64
> attrs = ["LOWER", "SHAPE"]
> rows = [2000, 1000]
> include_static_vectors = false
> case_sensitive = false
> pref_lengths = [2, 3, 5]
> pref_rows = [2000, 2000, 2000]
> suff_lengths = [2, 3, 4, 5]
> suff_rows = [2000, 2000, 2000, 2000]
> suff_search_chars = "aeiouäöüyß"
> suff_search_lengths = [2, 3]
> suff_search_rows = [2000, 2000]
> ```
Construct an embedding layer with the features of
[MultiHashEmbed](#spacymultihashembedv2-multihashembed) plus more detailed
features extracted from various positions in each token string. The fixed-length
`PREFIX` and `SUFFIX` features used in
[MultiHashEmbed](#spacymultihashembedv2-multihashembed) are sometimes not rich
enough when working with languages with complex morphology, and this layer
allows the specification of multiple prefixes and suffixes of any length.
Additionally, it is possible to hash the results of character searches of
specified lengths. A list of search characters is specified; the characters in
each word are examined in order starting at the beginning or at the end, and
each character that matches one of the search characters is added in order to
the string to be hashed. The search continues until either the search result
string is full or the whole word has been examined. This feature is useful
because many languages exhibit morphological alternations where one letter or
letters regularly alternate with another letter or letters depending on the
presence of some other letter before or after it, e.g. German plural nouns whose
final two vowels are `ä-e` regularly correspond to singular lemmas in which the
`e` is no longer present and the `ä` has become `a`. For most languages,
searching is likely to be useful starting at the end (`suff_*`), but the ability
to search from the beginning (`pref_*`) is also offered for completeness. Search
characters should consist of all characters that regularly alternate with other
characters in the language in question or whose presence before or after
characters that would otherwise alternate prevents the alternation from
occurring, e.g. an `ä` in a German plural noun does not become `a` if it is the
third or fourth vowel from the end of the word.
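As an illustrative sketch (the helper below is not part of spaCy, and the real
implementation works on UTF-16 bytes rather than Python strings), the suffix
search just described corresponds to the following:

```python
def search_string(word: str, search_chars: str, result_len: int) -> str:
    """Collect the characters of `word` that occur in `search_chars`,
    scanning from the end, until the result is full or the word is
    exhausted."""
    result = ""
    for char in reversed(word):
        if char in search_chars:
            result += char
        if len(result) == result_len:
            break
    return result

# For the German plural "Bäume", scanning from the end finds "e", then "u":
print(search_string("Bäume", "aeiouäöüyß", 2))  # "eu"
print(search_string("Bäume", "aeiouäöüyß", 3))  # "euä"
```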
Internally, the model converts each token string to
[UTF-16](https://www.ietf.org/rfc/rfc2781.txt) and assumes that each character
from the string occupies two bytes. This assumption holds for all characters in
the Basic Multilingual Plane, which encompasses all characters that are ever
likely to be of interest when extracting features. There are, however,
characters like emojis that are in the Supplementary Multilingual Plane and occupy
four bytes, although importantly neither of the two byte pairs that make up such
a representation can be a valid two-byte character in its own right. The
following considerations apply to the processing of four-byte characters:
- An exceptional four-byte character within a text consisting mostly of two-byte
  characters will probably be ignored by the neural network that consumes the
  embedding layer's output, since it will not match any of the learned features.
- If anyone did want to train a model for a language like Lycian that is
generally written in four-byte characters, prefix and suffix features can
still be extracted, but the length specifications should all be doubled, i.e.
`[2,4,6]` to extract one-, two- and three-character affixes. In such a
situation length specifications that are odd numbers would serve no useful
purpose since they would refer to half-characters.
- Four-byte characters are not accepted within search character specification
  strings and cause an error to be raised.
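The two-byte assumption is easy to check from Python, where `str.encode`
exposes the UTF-16 representation directly:

```python
for char in ["a", "ä", "好", "😀"]:
    print(char, len(char.encode("utf-16-le")))
# a 2   (Basic Latin, BMP)
# ä 2   (Latin-1 Supplement, BMP)
# 好 2   (CJK Unified Ideographs, BMP)
# 😀 4   (emoji, Supplementary Multilingual Plane: a surrogate pair)
```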
| Name | Description |
| ------------------------ | -------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
| `width` | The output width. Also used as the width of the embedding tables. Recommended values are between `64` and `300`. If static vectors are included, a learned linear layer is used to map the vectors to the specified width before concatenating it with the other embedding outputs. A single maxout layer is then used to reduce the concatenated vectors to the final width. ~~int~~ |
| `attrs` | The token attributes to embed. A separate embedding table will be constructed for each attribute. ~~List[Union[int, str]]~~ |
| `rows` | The number of rows for each embedding table. The layer needs surprisingly few rows, due to its use of the hashing trick: generally between `1000` and `10000` rows is sufficient, even for very large vocabularies. A number of rows must be specified for each table, so the `rows` list must be of the same length as the `attrs` parameter. ~~List[int]~~ |
| `include_static_vectors` | Whether to also use static word vectors. Requires a vectors table to be loaded in the [`Doc`](/api/doc) objects' vocab. ~~bool~~ |
| `case_sensitive` | Whether lower-case and upper-case letters should be distinguished when generating the character combinations to hash. ~~bool~~ |
| `pref_lengths` | The lengths of prefixes to hash for each word, e.g. for the word `spaCy`: `[1, 3]` would lead to hashes being generated for `s` and `spa`. ~~Optional[List[int]]~~ |
| `pref_rows` | The number of rows for each of `pref_lengths`. ~~Optional[List[int]]~~ |
| `pref_search_chars` | A string containing characters to search for starting from the beginning of each word. May not contain characters that occupy four bytes in UTF-16; if `case_sensitive == False`, may not contain upper-case letters. ~~Optional[str]~~ |
| `pref_search_lengths` | The lengths of search result strings to hash, where the searches start from the beginning of each word. ~~Optional[List[int]]~~ |
| `pref_search_rows` | The number of rows for each of `pref_search_lengths`. ~~Optional[List[int]]~~ |
| `suff_lengths` | The lengths of suffixes to hash for each word, e.g. for the word `spaCy`: `[1, 3]` would lead to hashes being generated for `y` and `aCy`. ~~Optional[List[int]]~~ |
| `suff_rows` | The number of rows for each of `suff_lengths`. ~~Optional[List[int]]~~ |
| `suff_search_chars` | A string containing characters to search for starting from the end of each word. May not contain characters that occupy four bytes in UTF-16; if `case_sensitive == False`, may not contain upper-case letters. ~~Optional[str]~~ |
| `suff_search_lengths` | The lengths of search result strings to hash, where the searches start from the end of each word. ~~Optional[List[int]]~~ |
| `suff_search_rows` | The number of rows for each of `suff_search_lengths`. ~~Optional[List[int]]~~ |
| **CREATES** | The model using the architecture. ~~Model[List[Doc], List[Floats2d]]~~ |
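For example, the strings fed to the hash function for a given word's prefixes
and suffixes can be previewed with ordinary slicing (with
`case_sensitive = false` the word would be lower-cased first):

```python
word = "spaCy"
pref_lengths = [1, 3]
suff_lengths = [1, 3]
print([word[:n] for n in pref_lengths])   # ['s', 'spa']
print([word[-n:] for n in suff_lengths])  # ['y', 'aCy']
```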
### spacy.CharacterEmbed.v2 {#CharacterEmbed}
> #### Example config
@@ -587,8 +676,8 @@ consists of either two or three subnetworks:
run once for each batch.
- **lower**: Construct a feature-specific vector for each `(token, feature)`
pair. This is also run once for each batch. Constructing the state
-  representation is then a matter of summing the component features and
-  applying the non-linearity.
+  representation is then a matter of summing the component features and applying
+  the non-linearity.
- **upper** (optional): A feed-forward network that predicts scores from the
state representation. If not present, the output from the lower model is used
as action scores directly.
@@ -628,8 +717,8 @@ same signature, but the `use_upper` argument was `True` by default.
> ```
Build a tagger model, using a provided token-to-vector component. The tagger
-model adds a linear layer with softmax activation to predict scores given
-the token vectors.
+model adds a linear layer with softmax activation to predict scores given the
+token vectors.
| Name | Description |
| ----------- | ------------------------------------------------------------------------------------------ |
@@ -920,5 +1009,5 @@ A function that reads an existing `KnowledgeBase` from file.
A function that takes as input a [`KnowledgeBase`](/api/kb) and a
[`Span`](/api/span) object denoting a named entity, and returns a list of
plausible [`Candidate`](/api/kb/#candidate) objects. The default
-`CandidateGenerator` uses the text of a mention to find its potential
-aliases in the `KnowledgeBase`. Note that this function is case-dependent.
+`CandidateGenerator` uses the text of a mention to find its potential aliases in
+the `KnowledgeBase`. Note that this function is case-dependent.