diff --git a/spacy/ml/affixextractor.py b/spacy/ml/affixextractor.py deleted file mode 100644 index 55c3a3a71..000000000 --- a/spacy/ml/affixextractor.py +++ /dev/null @@ -1,61 +0,0 @@ -from typing import List, Optional, Callable, Tuple -from thinc.types import Ints2d -from thinc.api import Model, registry, get_current_ops - -from ..tokens import Doc - - -@registry.layers("spacy.AffixExtractor.v1") -def AffixExtractor( - *, - suffs_not_prefs: bool, - case_sensitive: bool, - len_start: Optional[int], - len_end: Optional[int], - special_chars: Optional[str], - sc_len_start: Optional[int], - sc_len_end: Optional[int], -) -> Model[List[Doc], List[Ints2d]]: - return Model( - "extract_affixes", - forward, - attrs={ - "suffs_not_prefs": suffs_not_prefs, - "case_sensitive": case_sensitive, - "len_start": len_start if len_start is not None else 0, - "len_end": len_end if len_end is not None else 0, - "special_chars": special_chars if special_chars is not None else "", - "sc_len_start": sc_len_start if sc_len_start is not None else 0, - "sc_len_end": sc_len_end if sc_len_end is not None else 0, - }, - ) - - -def forward( - model: Model[List[Doc], List[Ints2d]], docs, is_train: bool -) -> Tuple[List[Ints2d], Callable]: - suffs_not_prefs: bool = model.attrs["suffs_not_prefs"] - case_sensitive: bool = model.attrs["case_sensitive"] - len_start: int = model.attrs["len_start"] - len_end: int = model.attrs["len_end"] - special_chars: str = model.attrs["special_chars"] - sc_len_start: int = model.attrs["sc_len_start"] - sc_len_end: int = model.attrs["sc_len_end"] - features: List[Ints2d] = [] -# for doc in docs: -# features.append( -# model.ops.asarray2i( -# doc.get_affix_hashes( -# suffs_not_prefs, -# case_sensitive, -# len_start, -# len_end, -# special_chars, -# sc_len_start, -# sc_len_end, -# ) -# ) - # ) - - backprop: Callable[[List[Ints2d]], List] = lambda d_features: [] - return features, backprop diff --git a/spacy/ml/models/tok2vec.py b/spacy/ml/models/tok2vec.py index 
2a642e88b..538ce36c4 100644 --- a/spacy/ml/models/tok2vec.py +++ b/spacy/ml/models/tok2vec.py @@ -1,6 +1,7 @@ +from encodings import search_function from typing import Optional, List, Union, cast -from spacy.ml.affixextractor import AffixExtractor -from thinc.types import Floats2d, Ints2d, Ragged, Ints1d +from spacy.ml.richfeatureextractor import RichFeatureExtractor +from thinc.types import Floats2d, Ints2d, Ragged from thinc.api import chain, clone, concatenate, with_array, with_padded from thinc.api import Model, noop, list2ragged, ragged2list, HashEmbed from thinc.api import expand_window, residual, Maxout, Mish, PyTorchLSTM @@ -187,52 +188,50 @@ def MultiHashEmbed( return model -def process_affix_config_group( +def verify_rich_config_group( label: str, - start_len: Optional[int], - end_len: Optional[int], + lengths: Optional[List[int]], rows: Optional[List[int]], - scs: Optional[str], - is_sc: bool, -) -> List[int]: - if start_len is not None or end_len is not None or rows is not None: - if start_len is None or end_len is None or rows is None: + search_chars: Optional[str], + is_search_char_group: bool, + case_sensitive: bool, +) -> None: + if lengths is not None or rows is not None: + if is_search_char_group and (search_chars is None or len(search_chars) == 0): raise ValueError(Errors.E1045.format(label=label)) - if start_len < 0 or end_len < start_len + 1: + if lengths is None or rows is None: raise ValueError(Errors.E1045.format(label=label)) - if is_sc and scs is None: + if len(lengths) != len(rows): raise ValueError(Errors.E1045.format(label=label)) - if scs is not None and scs != scs.lower(): + if any([length < 1 for length in lengths]): + raise ValueError(Errors.E1045.format(label=label)) + if ( + not case_sensitive + and search_chars is not None + and search_chars != search_chars.lower() + ): raise ValueError(Errors.E1044.format(label=label)) - if len(rows) != end_len - start_len: - raise ValueError(Errors.E1045.format(label=label)) - elif scs is not 
None: + elif search_chars is not None: raise ValueError(Errors.E1045.format(label=label)) - return rows if rows is not None else [] -@registry.architectures("spacy.AffixMultiHashEmbed.v1") -def AffixMultiHashEmbed( +@registry.architectures("spacy.RichMultiHashEmbed.v1") +def RichMultiHashEmbed( width: int, attrs: List[Union[str, int]], rows: List[int], include_static_vectors: bool, - *, - affix_case_sensitive: bool, - suffix_start_len: Optional[int] = None, - suffix_end_len: Optional[int] = None, - suffix_rows: Optional[List[int]] = None, - suffix_scs: Optional[str] = None, - suffix_sc_start_len: Optional[int] = None, - suffix_sc_end_len: Optional[int] = None, - suffix_sc_rows: Optional[List[int]] = None, - prefix_start_len: Optional[int] = None, - prefix_end_len: Optional[int] = None, - prefix_rows: Optional[List[int]] = None, - prefix_scs: Optional[str] = None, - prefix_sc_start_len: Optional[int] = None, - prefix_sc_end_len: Optional[int] = None, - prefix_sc_rows: Optional[List[int]] = None, + case_sensitive: bool, + pref_lengths: Optional[List[int]] = None, + pref_rows: Optional[List[int]] = None, + pref_search_chars: Optional[str] = None, + pref_search_lengths: Optional[List[int]] = None, + pref_search_rows: Optional[List[int]] = None, + suff_lengths: Optional[List[int]] = None, + suff_rows: Optional[List[int]] = None, + suff_search_chars: Optional[str] = None, + suff_search_lengths: Optional[List[int]] = None, + suff_search_rows: Optional[List[int]] = None, ) -> Model[List[Doc], List[Floats2d]]: """ @@ -242,74 +241,62 @@ def AffixMultiHashEmbed( if len(rows) != len(attrs): raise ValueError(f"Mismatched lengths: {len(rows)} vs {len(attrs)}") - rows.extend( - process_affix_config_group( - "prefix", prefix_start_len, prefix_end_len, prefix_rows, None, False - ) + verify_rich_config_group( + "prefix", pref_lengths, pref_rows, None, False, case_sensitive ) - rows.extend( - process_affix_config_group( - "prefix_sc", - prefix_sc_start_len, - prefix_sc_end_len, - 
prefix_sc_rows, - prefix_scs, - True, - ) + verify_rich_config_group( + "prefix search", + pref_search_lengths, + pref_search_rows, + pref_search_chars, + True, + case_sensitive, ) - rows.extend( - process_affix_config_group( - "suffix", suffix_start_len, suffix_end_len, suffix_rows, None, False - ) + verify_rich_config_group( + "suffix", suff_lengths, suff_rows, None, False, case_sensitive ) - rows.extend( - process_affix_config_group( - "suffix_sc", - suffix_sc_start_len, - suffix_sc_end_len, - suffix_sc_rows, - suffix_scs, - True, - ) + verify_rich_config_group( + "suffix search", + suff_search_lengths, + suff_search_rows, + suff_search_chars, + True, + case_sensitive, ) - embeddings = [ # type:ignore - HashEmbed(width, row, column=i, seed=i + 7, dropout=0.0) # type: ignore + if pref_rows is not None: + rows.extend(pref_rows) + if pref_search_rows is not None: + rows.extend(pref_search_rows) + if suff_rows is not None: + rows.extend(suff_rows) + if suff_search_rows is not None: + rows.extend(suff_search_rows) + + embeddings: List[Model[Ints2d, Floats2d]] = [ + HashEmbed(width, row, column=i, seed=i + 7, dropout=0.0) for i, row in enumerate(rows) ] concat_size = width * (len(embeddings) + include_static_vectors) max_out: Model[Ragged, Ragged] = with_array( Maxout(width, concat_size, nP=3, dropout=0.0, normalize=True) ) - extractors = [FeatureExtractor(attrs)] - if prefix_start_len is not None or prefix_sc_start_len is not None: - extractors.append( - AffixExtractor( - suffs_not_prefs=False, - case_sensitive=affix_case_sensitive, - len_start=prefix_start_len, - len_end=prefix_end_len, - special_chars=prefix_scs, - sc_len_start=prefix_sc_start_len, - sc_len_end=prefix_sc_end_len, - ) - ) - if suffix_start_len is not None or suffix_sc_start_len is not None: - extractors.append( - AffixExtractor( - suffs_not_prefs=True, - case_sensitive=affix_case_sensitive, - len_start=suffix_start_len, - len_end=suffix_end_len, - special_chars=suffix_scs, - 
sc_len_start=suffix_sc_start_len, - sc_len_end=suffix_sc_end_len, - ) - ) + extractors = concatenate( + FeatureExtractor(attrs), + RichFeatureExtractor( + case_sensitive=case_sensitive, + pref_lengths=pref_lengths, + pref_search_chars=pref_search_chars, + pref_search_lengths=pref_search_lengths, + suff_lengths=suff_lengths, + suff_search_chars=suff_search_chars, + suff_search_lengths=suff_search_lengths, + ), + ) - if include_static_vectors: - feature_extractor: Model[List[Doc], Ragged] = chain( # type: ignore - concatenate(*extractors), + if include_static_vectors: + feature_extractor: Model[List[Doc], Ragged] = chain( + extractors, cast(Model[List[Ints2d], Ragged], list2ragged()), with_array(concatenate(*embeddings)), ) @@ -322,8 +309,8 @@ def AffixMultiHashEmbed( ragged2list(), ) else: - model = chain( # type: ignore - concatenate(*extractors), + model = chain( + extractors, cast(Model[List[Ints2d], Ragged], list2ragged()), with_array(concatenate(*embeddings)), max_out, diff --git a/spacy/ml/richfeatureextractor.py b/spacy/ml/richfeatureextractor.py new file mode 100644 index 000000000..8c9e6b5e0 --- /dev/null +++ b/spacy/ml/richfeatureextractor.py @@ -0,0 +1,72 @@ +from typing import List, Optional, Callable, Tuple +from thinc.types import Ints2d +from thinc.api import Model, registry + +from ..tokens import Doc + + +@registry.layers("spacy.RichFeatureExtractor.v1") +def RichFeatureExtractor( + *, + case_sensitive: bool, + pref_lengths: Optional[List[int]] = None, + pref_search_chars: Optional[str] = None, + pref_search_lengths: Optional[List[int]] = None, + suff_lengths: Optional[List[int]] = None, + suff_search_chars: Optional[str] = None, + suff_search_lengths: Optional[List[int]] = None, +) -> Model[List[Doc], List[Ints2d]]: + return Model( + "extract_character_combination_hashes", + forward, + attrs={ + "case_sensitive": case_sensitive, + "pref_lengths": pref_lengths if pref_lengths is not None else [], + "pref_search_chars": pref_search_chars + if 
pref_search_chars is not None + else "", + "pref_search_lengths": pref_search_lengths + if pref_search_lengths is not None + else [], + "suff_lengths": suff_lengths if suff_lengths is not None else [], + "suff_search_chars": suff_search_chars + if suff_search_chars is not None + else "", + "suff_search_lengths": suff_search_lengths + if suff_search_lengths is not None + else [], + }, + ) + + +def forward( + model: Model[List[Doc], List[Ints2d]], docs, is_train: bool +) -> Tuple[List[Ints2d], Callable]: + ops = model.ops + case_sensitive: bool = model.attrs["case_sensitive"] + pref_lengths: List[int] = model.attrs["pref_lengths"] + pref_search_chars: str = model.attrs["pref_search_chars"] + pref_search_lengths: List[int] = model.attrs["pref_search_lengths"] + suff_lengths: List[int] = model.attrs["suff_lengths"] + suff_search_chars: str = model.attrs["suff_search_chars"] + suff_search_lengths: List[int] = model.attrs["suff_search_lengths"] + features: List[Ints2d] = [] + for doc in docs: + prefix_hashes = doc.get_character_combination_hashes( + case_sensitive=case_sensitive, + suffs_not_prefs=False, + affix_lengths=pref_lengths, + search_chars=pref_search_chars, + search_lengths=pref_search_lengths, + ) + suffix_hashes = doc.get_character_combination_hashes( + case_sensitive=case_sensitive, + suffs_not_prefs=True, + affix_lengths=suff_lengths, + search_chars=suff_search_chars, + search_lengths=suff_search_lengths, + ) + features.append(ops.asarray2i(ops.xp.hstack([prefix_hashes, suffix_hashes]))) + + backprop: Callable[[List[Ints2d]], List] = lambda d_features: [] + return features, backprop diff --git a/website/docs/api/architectures.md b/website/docs/api/architectures.md index 2537faff6..32e037649 100644 --- a/website/docs/api/architectures.md +++ b/website/docs/api/architectures.md @@ -169,6 +169,95 @@ updated). | `include_static_vectors` | Whether to also use static word vectors. Requires a vectors table to be loaded in the [`Doc`](/api/doc) objects' vocab. 
~~bool~~ | | **CREATES** | The model using the architecture. ~~Model[List[Doc], List[Floats2d]]~~ | +### spacy.RichMultiHashEmbed.v1 {#RichMultiHashEmbed} + +> #### Example config +> +> ```ini +> [model] +> @architectures = "spacy.RichMultiHashEmbed.v1" +> width = 64 +> attrs = ["LOWER","SHAPE"] +> rows = [2000,1000] +> include_static_vectors = "False" +> case_sensitive = "False" +> pref_lengths = [2, 3, 5] +> pref_rows = [2000,2000,2000] +> suff_lengths = [2, 3, 4, 5] +> suff_rows = [2000,2000,2000,2000] +> suff_search_chars = "aeiouäöüyß" +> suff_search_lengths = [2, 3] +> suff_search_rows = [2000,2000] +> ``` + +Construct an embedding layer with the features of +[MultiHashEmbed](#spacymultihashembedv2-multihashembed) plus more detailed +features extracted from various positions in each token string. The fixed-length +`PREFIX` and `SUFFIX` features used in +[MultiHashEmbed](#spacymultihashembedv2-multihashembed) are sometimes not rich +enough when working with languages with complex morphology, and this layer +allows the specification of multiple prefixes and suffixes of any lengths. + +Additionally, it is possible to hash the results of character searches of +specified lengths. A list of search characters is specified; the characters in +each word are examined in order starting at the beginning or at the end, and +each character that matches one of the search characters is added in order to +the string to be hashed. The search continues until either the search result +string is full or the whole word has been examined. This feature is useful +because many languages exhibit morphological alternations where one letter or +letters regularly alternate with another letter or letters depending on the +presence of some other letter before or after it, e.g. German plural nouns where +the final two vowels are `ä-e` regularly correspond to singular lemmas where the +`e` is no longer present and the `ä` has become `a`. 
For most languages, +searching is likely to be useful starting at the end (`suff_*`), but the ability +to search from the beginning (`pref_*`) is also offered for completeness. Search +characters should consist of all characters that regularly alternate with other +characters in the language in question or whose presence before or after +characters that would otherwise alternate prevents the alternation from +occurring, e.g. an `ä` in a German plural noun does not become `a` if it is the +third or fourth vowel from the end of the word. + +Internally, the model converts each token string to +[UTF-16](https://www.ietf.org/rfc/rfc2781.txt) and assumes that each character +from the string occupies two bytes. This assumption holds for all characters in +the Basic Multilingual Plane, which encompasses all characters that are ever +likely to be of interest when extracting features. There are, however, +characters like emojis that are in the Supplementary Multilingual Plane and occupy +four bytes, although importantly neither of the two byte pairs that make up such +a representation can be a valid two-byte character in its own right. The +following considerations apply to the processing of four-byte characters: + +- An exceptional four-byte character within a text consisting mostly of two-byte + characters will probably be ignored by the neural network accepting the + embedding layer as not matching any of the learned features. +- If anyone did want to train a model for a language like Lycian that is + generally written in four-byte characters, prefix and suffix features can + still be extracted, but the length specifications should all be doubled, i.e. + `[2,4,6]` to extract one-, two- and three-character affixes. In such a + situation length specifications that are odd numbers would serve no useful + purpose since they would refer to half-characters. +- Four-byte characters are not accepted within search character specification + strings and lead to an error being thrown. 
+ +| Name | Description | +| ------------------------ | -------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | +| `width` | The output width. Also used as the width of the embedding tables. Recommended values are between `64` and `300`. If static vectors are included, a learned linear layer is used to map the vectors to the specified width before concatenating it with the other embedding outputs. A single maxout layer is then used to reduce the concatenated vectors to the final width. ~~int~~ | +| `attrs` | The token attributes to embed. A separate embedding table will be constructed for each attribute. ~~List[Union[int, str]]~~ | +| `rows` | The number of rows for each embedding tables. Can be low, due to the hashing trick. Recommended values are between `1000` and `10000`. The layer needs surprisingly few rows, due to its use of the hashing trick. Generally between 2000 and 10000 rows is sufficient, even for very large vocabularies. A number of rows must be specified for each table, so the `rows` list must be of the same length as the `attrs` parameter. ~~List[int]~~ | +| `include_static_vectors` | Whether to also use static word vectors. Requires a vectors table to be loaded in the [`Doc`](/api/doc) objects' vocab. ~~bool~~ | +| `case_sensitive` | Whether lower-case and upper-case letters should be distinguished when generating the character combinations to hash. ~~bool~~ | +| `pref_lengths` | The lengths of prefixes to hash for each word, e.g. for the word `spaCy`: `[1, 3]` would lead to hashes being generated for `s` and `spa`. 
~~Optional[List[int]~~ | +| `pref_rows` | The number of rows for each of `pref_lengths`. ~~Optional[List[int]~~ | +| `pref_search_chars` | A string containing characters to search for starting from the beginning of each word. May not contain characters that occupy four bytes in UTF-16; if `case_sensitive==True`, may not contain upper-case letters. ~~Optional[str]~~ | +| `pref_search_lengths` | The lengths of search result strings to hash, where the searches start from the beginning of each word. ~~Optional[List[int]]~~ | +| `pref_search_rows` | The number of rows for each of `pref_search_lengths`. ~~Optional[List[int]~~ | +| `suff_lengths` | The lengths of suffixes to hash for each word, e.g. for the word `spaCy`: `[1, 3]` would lead to hashes being generated for `y` and `aCy`. ~~Optional[List[int]~~ | +| `suff_rows` | The number of rows for each of `suff_lengths`. ~~Optional[List[int]~~ | +| `suff_search_chars` | A string containing characters to search for starting from the end of each word. May not contain characters that occupy four bytes in UTF-16; if `case_sensitive==True`, may not contain upper-case letters. ~~Optional[str]~~ | +| `suff_search_lengths` | The lengths of search result strings to hash, where the searches start from the end of each word. ~~Optional[List[int]]~~ | +| `suff_search_rows` | The number of rows for each of `suff_search_lengths`. ~~Optional[List[int]~~ | +| **CREATES** | The model using the architecture. ~~Model[List[Doc], List[Floats2d]]~~ | + ### spacy.CharacterEmbed.v2 {#CharacterEmbed} > #### Example config @@ -587,8 +676,8 @@ consists of either two or three subnetworks: run once for each batch. - **lower**: Construct a feature-specific vector for each `(token, feature)` pair. This is also run once for each batch. Constructing the state - representation is then a matter of summing the component features and - applying the non-linearity. 
+ representation is then a matter of summing the component features and applying + the non-linearity. - **upper** (optional): A feed-forward network that predicts scores from the state representation. If not present, the output from the lower model is used as action scores directly. @@ -628,8 +717,8 @@ same signature, but the `use_upper` argument was `True` by default. > ``` Build a tagger model, using a provided token-to-vector component. The tagger -model adds a linear layer with softmax activation to predict scores given -the token vectors. +model adds a linear layer with softmax activation to predict scores given the +token vectors. | Name | Description | | ----------- | ------------------------------------------------------------------------------------------ | @@ -920,5 +1009,5 @@ A function that reads an existing `KnowledgeBase` from file. A function that takes as input a [`KnowledgeBase`](/api/kb) and a [`Span`](/api/span) object denoting a named entity, and returns a list of plausible [`Candidate`](/api/kb/#candidate) objects. The default -`CandidateGenerator` uses the text of a mention to find its potential -aliases in the `KnowledgeBase`. Note that this function is case-dependent. +`CandidateGenerator` uses the text of a mention to find its potential aliases in +the `KnowledgeBase`. Note that this function is case-dependent.