Python code and documentation

This commit is contained in:
richardpaulhudson 2022-10-06 15:10:27 +02:00
parent 06fe50a12d
commit 581f380c00
4 changed files with 247 additions and 160 deletions

View File

@@ -1,61 +0,0 @@
from typing import List, Optional, Callable, Tuple
from thinc.types import Ints2d
from thinc.api import Model, registry, get_current_ops
from ..tokens import Doc


@registry.layers("spacy.AffixExtractor.v1")
def AffixExtractor(
*,
suffs_not_prefs: bool,
case_sensitive: bool,
len_start: Optional[int],
len_end: Optional[int],
special_chars: Optional[str],
sc_len_start: Optional[int],
sc_len_end: Optional[int],
) -> Model[List[Doc], List[Ints2d]]:
return Model(
"extract_affixes",
forward,
attrs={
"suffs_not_prefs": suffs_not_prefs,
"case_sensitive": case_sensitive,
"len_start": len_start if len_start is not None else 0,
"len_end": len_end if len_end is not None else 0,
"special_chars": special_chars if special_chars is not None else "",
"sc_len_start": sc_len_start if sc_len_start is not None else 0,
"sc_len_end": sc_len_end if sc_len_end is not None else 0,
},
)


def forward(
model: Model[List[Doc], List[Ints2d]], docs, is_train: bool
) -> Tuple[List[Ints2d], Callable]:
suffs_not_prefs: bool = model.attrs["suffs_not_prefs"]
case_sensitive: bool = model.attrs["case_sensitive"]
len_start: int = model.attrs["len_start"]
len_end: int = model.attrs["len_end"]
special_chars: str = model.attrs["special_chars"]
sc_len_start: int = model.attrs["sc_len_start"]
sc_len_end: int = model.attrs["sc_len_end"]
features: List[Ints2d] = []
# for doc in docs:
# features.append(
# model.ops.asarray2i(
# doc.get_affix_hashes(
# suffs_not_prefs,
# case_sensitive,
# len_start,
# len_end,
# special_chars,
# sc_len_start,
# sc_len_end,
# )
# )
# )
backprop: Callable[[List[Ints2d]], List] = lambda d_features: []
return features, backprop

View File

@@ -1,6 +1,7 @@
+from encodings import search_function
from typing import Optional, List, Union, cast
-from spacy.ml.affixextractor import AffixExtractor
-from thinc.types import Floats2d, Ints2d, Ragged, Ints1d
+from spacy.ml.richfeatureextractor import RichFeatureExtractor
+from thinc.types import Floats2d, Ints2d, Ragged
from thinc.api import chain, clone, concatenate, with_array, with_padded
from thinc.api import Model, noop, list2ragged, ragged2list, HashEmbed
from thinc.api import expand_window, residual, Maxout, Mish, PyTorchLSTM
@@ -187,52 +188,50 @@ def MultiHashEmbed(
    return model


-def process_affix_config_group(
-    label: str,
-    start_len: Optional[int],
-    end_len: Optional[int],
-    rows: Optional[List[int]],
-    scs: Optional[str],
-    is_sc: bool,
-) -> List[int]:
-    if start_len is not None or end_len is not None or rows is not None:
-        if start_len is None or end_len is None or rows is None:
-            raise ValueError(Errors.E1045.format(label=label))
-        if start_len < 0 or end_len < start_len + 1:
-            raise ValueError(Errors.E1045.format(label=label))
-        if is_sc and scs is None:
-            raise ValueError(Errors.E1045.format(label=label))
-        if scs is not None and scs != scs.lower():
-            raise ValueError(Errors.E1044.format(label=label))
-        if len(rows) != end_len - start_len:
-            raise ValueError(Errors.E1045.format(label=label))
-    elif scs is not None:
-        raise ValueError(Errors.E1045.format(label=label))
-    return rows if rows is not None else []
+def verify_rich_config_group(
+    label: str,
+    lengths: Optional[List[int]],
+    rows: Optional[List[int]],
+    search_chars: Optional[str],
+    is_search_char_group: bool,
+    case_sensitive: bool,
+) -> None:
+    if lengths is not None or rows is not None:
+        if is_search_char_group and (search_chars is None or len(search_chars) == 0):
+            raise ValueError(Errors.E1045.format(label=label))
+        if lengths is None or rows is None:
+            raise ValueError(Errors.E1045.format(label=label))
+        if len(lengths) != len(rows):
+            raise ValueError(Errors.E1045.format(label=label))
+        if any([length < 1 for length in lengths]):
+            raise ValueError(Errors.E1045.format(label=label))
+        if (
+            not case_sensitive
+            and search_chars is not None
+            and search_chars != search_chars.lower()
+        ):
+            raise ValueError(Errors.E1044.format(label=label))
+    elif search_chars is not None:
+        raise ValueError(Errors.E1045.format(label=label))
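For illustration, a minimal sketch of how the new validation behaves, with hypothetical values (in this changeset `Errors.E1045` covers missing or mismatched lengths/rows and `Errors.E1044` covers upper-case search characters in case-insensitive mode):

```python
# Valid: lengths and rows align, and the search group supplies search chars.
verify_rich_config_group(
    "suffix search",
    [2, 3],        # lengths
    [2000, 2000],  # rows
    "aeiou",       # search_chars
    True,          # is_search_char_group
    False,         # case_sensitive
)
# Raises E1045: two lengths but only one row count.
verify_rich_config_group("suffix", [2, 3], [2000], None, False, False)
# Raises E1044: upper-case search characters although case_sensitive is False.
verify_rich_config_group("suffix search", [2], [2000], "AEIOU", True, False)
```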
-@registry.architectures("spacy.AffixMultiHashEmbed.v1")
-def AffixMultiHashEmbed(
+@registry.architectures("spacy.RichMultiHashEmbed.v1")
+def RichMultiHashEmbed(
    width: int,
    attrs: List[Union[str, int]],
    rows: List[int],
    include_static_vectors: bool,
    *,
-    affix_case_sensitive: bool,
-    suffix_start_len: Optional[int] = None,
-    suffix_end_len: Optional[int] = None,
-    suffix_rows: Optional[List[int]] = None,
-    suffix_scs: Optional[str] = None,
-    suffix_sc_start_len: Optional[int] = None,
-    suffix_sc_end_len: Optional[int] = None,
-    suffix_sc_rows: Optional[List[int]] = None,
-    prefix_start_len: Optional[int] = None,
-    prefix_end_len: Optional[int] = None,
-    prefix_rows: Optional[List[int]] = None,
-    prefix_scs: Optional[str] = None,
-    prefix_sc_start_len: Optional[int] = None,
-    prefix_sc_end_len: Optional[int] = None,
-    prefix_sc_rows: Optional[List[int]] = None,
+    case_sensitive: bool,
+    pref_lengths: Optional[List[int]] = None,
+    pref_rows: Optional[List[int]] = None,
+    pref_search_chars: Optional[str] = None,
+    pref_search_lengths: Optional[List[int]] = None,
+    pref_search_rows: Optional[List[int]] = None,
+    suff_lengths: Optional[List[int]] = None,
+    suff_rows: Optional[List[int]] = None,
+    suff_search_chars: Optional[str] = None,
+    suff_search_lengths: Optional[List[int]] = None,
+    suff_search_rows: Optional[List[int]] = None,
) -> Model[List[Doc], List[Floats2d]]:
"""
@@ -242,74 +241,62 @@ def AffixMultiHashEmbed(
if len(rows) != len(attrs):
raise ValueError(f"Mismatched lengths: {len(rows)} vs {len(attrs)}")
-    rows.extend(
-        process_affix_config_group(
-            "prefix", prefix_start_len, prefix_end_len, prefix_rows, None, False
-        )
-    )
-    rows.extend(
-        process_affix_config_group(
-            "prefix_sc",
-            prefix_sc_start_len,
-            prefix_sc_end_len,
-            prefix_sc_rows,
-            prefix_scs,
-            True,
-        )
-    )
-    rows.extend(
-        process_affix_config_group(
-            "suffix", suffix_start_len, suffix_end_len, suffix_rows, None, False
-        )
-    )
-    rows.extend(
-        process_affix_config_group(
-            "suffix_sc",
-            suffix_sc_start_len,
-            suffix_sc_end_len,
-            suffix_sc_rows,
-            suffix_scs,
-            True,
-        )
-    )
-    embeddings = [  # type:ignore
-        HashEmbed(width, row, column=i, seed=i + 7, dropout=0.0)  # type: ignore
+    verify_rich_config_group(
+        "prefix", pref_lengths, pref_rows, None, False, case_sensitive
+    )
+    verify_rich_config_group(
+        "prefix search",
+        pref_search_lengths,
+        pref_search_rows,
+        pref_search_chars,
+        True,
+        case_sensitive,
+    )
+    verify_rich_config_group(
+        "suffix", suff_lengths, suff_rows, None, False, case_sensitive
+    )
+    verify_rich_config_group(
+        "suffix search",
+        suff_search_lengths,
+        suff_search_rows,
+        suff_search_chars,
+        True,
+        case_sensitive,
+    )
+    if pref_rows is not None:
+        rows.extend(pref_rows)
+    if pref_search_rows is not None:
+        rows.extend(pref_search_rows)
+    if suff_rows is not None:
+        rows.extend(suff_rows)
+    if suff_search_rows is not None:
+        rows.extend(suff_search_rows)
+    embeddings: List[Model[Ints2d, Floats2d]] = [
+        HashEmbed(width, row, column=i, seed=i + 7, dropout=0.0)
for i, row in enumerate(rows)
]
concat_size = width * (len(embeddings) + include_static_vectors)
max_out: Model[Ragged, Ragged] = with_array(
Maxout(width, concat_size, nP=3, dropout=0.0, normalize=True)
)
-    extractors = [FeatureExtractor(attrs)]
-    if prefix_start_len is not None or prefix_sc_start_len is not None:
-        extractors.append(
-            AffixExtractor(
-                suffs_not_prefs=False,
-                case_sensitive=affix_case_sensitive,
-                len_start=prefix_start_len,
-                len_end=prefix_end_len,
-                special_chars=prefix_scs,
-                sc_len_start=prefix_sc_start_len,
-                sc_len_end=prefix_sc_end_len,
-            )
-        )
-    if suffix_start_len is not None or suffix_sc_start_len is not None:
-        extractors.append(
-            AffixExtractor(
-                suffs_not_prefs=True,
-                case_sensitive=affix_case_sensitive,
-                len_start=suffix_start_len,
-                len_end=suffix_end_len,
-                special_chars=suffix_scs,
-                sc_len_start=suffix_sc_start_len,
-                sc_len_end=suffix_sc_end_len,
-            )
-        )
+    extractors = concatenate(
+        FeatureExtractor(attrs),
+        RichFeatureExtractor(
+            case_sensitive=case_sensitive,
+            pref_lengths=pref_lengths,
+            pref_search_chars=pref_search_chars,
+            pref_search_lengths=pref_search_lengths,
+            suff_lengths=suff_lengths,
+            suff_search_chars=suff_search_chars,
+            suff_search_lengths=suff_search_lengths,
+        ),
+    )
-    if include_static_vectors:
-        feature_extractor: Model[List[Doc], Ragged] = chain(  # type: ignore
-            concatenate(*extractors),
+    if include_static_vectors:
+        feature_extractor: Model[List[Doc], Ragged] = chain(
+            extractors,
cast(Model[List[Ints2d], Ragged], list2ragged()),
with_array(concatenate(*embeddings)),
)
@@ -322,8 +309,8 @@ def AffixMultiHashEmbed(
ragged2list(),
)
else:
-        model = chain(  # type: ignore
-            concatenate(*extractors),
+        model = chain(
+            extractors,
cast(Model[List[Ints2d], Ragged], list2ragged()),
with_array(concatenate(*embeddings)),
max_out,

View File

@@ -0,0 +1,72 @@
from typing import List, Optional, Callable, Tuple
from thinc.types import Ints2d
from thinc.api import Model, registry
from ..tokens import Doc


@registry.layers("spacy.RichFeatureExtractor.v1")
def RichFeatureExtractor(
*,
case_sensitive: bool,
pref_lengths: Optional[List[int]] = None,
pref_search_chars: Optional[str] = None,
pref_search_lengths: Optional[List[int]] = None,
suff_lengths: Optional[List[int]] = None,
suff_search_chars: Optional[str] = None,
suff_search_lengths: Optional[List[int]] = None,
) -> Model[List[Doc], List[Ints2d]]:
return Model(
"extract_character_combination_hashes",
forward,
attrs={
"case_sensitive": case_sensitive,
"pref_lengths": pref_lengths if pref_lengths is not None else [],
"pref_search_chars": pref_search_chars
if pref_search_chars is not None
else "",
"pref_search_lengths": pref_search_lengths
if pref_search_lengths is not None
else [],
"suff_lengths": suff_lengths if suff_lengths is not None else [],
"suff_search_chars": suff_search_chars
if suff_search_chars is not None
else "",
"suff_search_lengths": suff_search_lengths
if suff_search_lengths is not None
else [],
},
)


def forward(
model: Model[List[Doc], List[Ints2d]], docs, is_train: bool
) -> Tuple[List[Ints2d], Callable]:
ops = model.ops
case_sensitive: bool = model.attrs["case_sensitive"]
pref_lengths: List[int] = model.attrs["pref_lengths"]
pref_search_chars: str = model.attrs["pref_search_chars"]
pref_search_lengths: List[int] = model.attrs["pref_search_lengths"]
suff_lengths: List[int] = model.attrs["suff_lengths"]
suff_search_chars: str = model.attrs["suff_search_chars"]
suff_search_lengths: List[int] = model.attrs["suff_search_lengths"]
features: List[Ints2d] = []
for doc in docs:
prefix_hashes = doc.get_character_combination_hashes(
case_sensitive=case_sensitive,
suffs_not_prefs=False,
affix_lengths=pref_lengths,
search_chars=pref_search_chars,
search_lengths=pref_search_lengths,
)
suffix_hashes = doc.get_character_combination_hashes(
case_sensitive=case_sensitive,
suffs_not_prefs=True,
affix_lengths=suff_lengths,
search_chars=suff_search_chars,
search_lengths=suff_search_lengths,
)
features.append(ops.asarray2i(ops.xp.hstack([prefix_hashes, suffix_hashes])))
backprop: Callable[[List[Ints2d]], List] = lambda d_features: []
return features, backprop
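For illustration, a minimal sketch of how this layer might be driven on its own. It assumes a build in which the `Doc.get_character_combination_hashes` method from this changeset is available:

```python
import spacy
from spacy.ml.richfeatureextractor import RichFeatureExtractor

nlp = spacy.blank("de")
extractor = RichFeatureExtractor(
    case_sensitive=False,
    suff_lengths=[2, 3, 4, 5],
    suff_search_chars="aeiouäöüyß",
    suff_search_lengths=[2, 3],
)
docs = [nlp("Die Bäume wachsen")]
# Calling a thinc Model returns (output, backprop); backprop is a no-op here
# because the layer has no trainable weights.
hashes, backprop = extractor(docs, is_train=False)
print(hashes[0].shape)  # (number of tokens, number of hash features)
```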

View File

@@ -169,6 +169,95 @@ updated).
| `include_static_vectors` | Whether to also use static word vectors. Requires a vectors table to be loaded in the [`Doc`](/api/doc) objects' vocab. ~~bool~~ |
| **CREATES** | The model using the architecture. ~~Model[List[Doc], List[Floats2d]]~~ |
### spacy.RichMultiHashEmbed.v1 {#RichMultiHashEmbed}
> #### Example config
>
> ```ini
> [model]
> @architectures = "spacy.RichMultiHashEmbed.v1"
> width = 64
> attrs = ["LOWER", "SHAPE"]
> rows = [2000, 1000]
> include_static_vectors = false
> case_sensitive = false
> pref_lengths = [2, 3, 5]
> pref_rows = [2000, 2000, 2000]
> suff_lengths = [2, 3, 4, 5]
> suff_rows = [2000, 2000, 2000, 2000]
> suff_search_chars = "aeiouäöüyß"
> suff_search_lengths = [2, 3]
> suff_search_rows = [2000, 2000]
> ```
Construct an embedding layer with the features of
[MultiHashEmbed](#spacymultihashembedv2-multihashembed) plus more detailed
features extracted from various positions in each token string. The fixed-length
`PREFIX` and `SUFFIX` features used in
[MultiHashEmbed](#spacymultihashembedv2-multihashembed) are sometimes not rich
enough when working with languages with complex morphology, and this layer
allows the specification of multiple prefixes and suffixes of any length.
Additionally, it is possible to hash the results of character searches of
specified lengths. A list of search characters is specified; the characters in
each word are examined in order starting at the beginning or at the end, and
each character that matches one of the search characters is added in order to
the string to be hashed. The search continues until either the search result
string is full or the whole word has been examined. This feature is useful
because many languages exhibit morphological alternations where one letter or
letters regularly alternate with another letter or letters depending on the
presence of some other letter before or after it, e.g. German plural nouns whose
final two vowels are `ä-e` regularly correspond to singular lemmas in which the
`e` is no longer present and the `ä` has become `a`. For most languages,
searching is likely to be useful starting at the end (`suff_*`), but the ability
to search from the beginning (`pref_*`) is also offered for completeness. Search
characters should consist of all characters that regularly alternate with other
characters in the language in question or whose presence before or after
characters that would otherwise alternate prevents the alternation from
occurring, e.g. an `ä` in a German plural noun does not become `a` if it is the
third or fourth vowel from the end of the word.
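As an illustrative sketch (the helper below is not part of spaCy, and the real
implementation works on UTF-16 bytes rather than Python strings), the suffix
search just described corresponds to the following:

```python
def search_string(word: str, search_chars: str, result_len: int) -> str:
    """Collect the characters of `word` that occur in `search_chars`,
    scanning from the end, until the result is full or the word is
    exhausted."""
    result = ""
    for char in reversed(word):
        if char in search_chars:
            result += char
        if len(result) == result_len:
            break
    return result

# For the German plural "Bäume", scanning from the end finds "e", then "u":
print(search_string("Bäume", "aeiouäöüyß", 2))  # "eu"
print(search_string("Bäume", "aeiouäöüyß", 3))  # "euä"
```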
Internally, the model converts each token string to
[UTF-16](https://www.ietf.org/rfc/rfc2781.txt) and assumes that each character
from the string occupies two bytes. This assumption holds for all characters in
the Basic Multilingual Plane, which encompasses all characters that are ever
likely to be of interest when extracting features. There are, however,
characters like emojis that are in the Supplementary Multilingual Plane and occupy
four bytes, although importantly neither of the two byte pairs that make up such
a representation can be a valid two-byte character in its own right. The
following considerations apply to the processing of four-byte characters:
- An exceptional four-byte character within a text consisting mostly of two-byte
  characters will probably be ignored by the neural network that consumes the
  embedding layer's output, since it will not match any of the learned features.
- If anyone did want to train a model for a language like Lycian that is
generally written in four-byte characters, prefix and suffix features can
still be extracted, but the length specifications should all be doubled, i.e.
`[2,4,6]` to extract one-, two- and three-character affixes. In such a
situation length specifications that are odd numbers would serve no useful
purpose since they would refer to half-characters.
- Four-byte characters are not accepted within search character specification
  strings and cause an error to be raised.
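The two-byte assumption is easy to check from Python, where `str.encode`
exposes the UTF-16 representation directly:

```python
for char in ["a", "ä", "好", "😀"]:
    print(char, len(char.encode("utf-16-le")))
# a 2   (Basic Latin, BMP)
# ä 2   (Latin-1 Supplement, BMP)
# 好 2   (CJK Unified Ideographs, BMP)
# 😀 4   (emoji, Supplementary Multilingual Plane: a surrogate pair)
```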
| Name | Description |
| ------------------------ | -------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
| `width` | The output width. Also used as the width of the embedding tables. Recommended values are between `64` and `300`. If static vectors are included, a learned linear layer is used to map the vectors to the specified width before concatenating it with the other embedding outputs. A single maxout layer is then used to reduce the concatenated vectors to the final width. ~~int~~ |
| `attrs` | The token attributes to embed. A separate embedding table will be constructed for each attribute. ~~List[Union[int, str]]~~ |
| `rows` | The number of rows for each embedding table. The layer needs surprisingly few rows, due to its use of the hashing trick: generally between `1000` and `10000` rows is sufficient, even for very large vocabularies. A number of rows must be specified for each table, so the `rows` list must be of the same length as the `attrs` parameter. ~~List[int]~~ |
| `include_static_vectors` | Whether to also use static word vectors. Requires a vectors table to be loaded in the [`Doc`](/api/doc) objects' vocab. ~~bool~~ |
| `case_sensitive` | Whether lower-case and upper-case letters should be distinguished when generating the character combinations to hash. ~~bool~~ |
| `pref_lengths` | The lengths of prefixes to hash for each word, e.g. for the word `spaCy`: `[1, 3]` would lead to hashes being generated for `s` and `spa`. ~~Optional[List[int]]~~ |
| `pref_rows` | The number of rows for each of `pref_lengths`. ~~Optional[List[int]]~~ |
| `pref_search_chars` | A string containing characters to search for starting from the beginning of each word. May not contain characters that occupy four bytes in UTF-16; if `case_sensitive == False`, may not contain upper-case letters. ~~Optional[str]~~ |
| `pref_search_lengths` | The lengths of search result strings to hash, where the searches start from the beginning of each word. ~~Optional[List[int]]~~ |
| `pref_search_rows` | The number of rows for each of `pref_search_lengths`. ~~Optional[List[int]]~~ |
| `suff_lengths` | The lengths of suffixes to hash for each word, e.g. for the word `spaCy`: `[1, 3]` would lead to hashes being generated for `y` and `aCy`. ~~Optional[List[int]]~~ |
| `suff_rows` | The number of rows for each of `suff_lengths`. ~~Optional[List[int]]~~ |
| `suff_search_chars` | A string containing characters to search for starting from the end of each word. May not contain characters that occupy four bytes in UTF-16; if `case_sensitive == False`, may not contain upper-case letters. ~~Optional[str]~~ |
| `suff_search_lengths` | The lengths of search result strings to hash, where the searches start from the end of each word. ~~Optional[List[int]]~~ |
| `suff_search_rows` | The number of rows for each of `suff_search_lengths`. ~~Optional[List[int]]~~ |
| **CREATES** | The model using the architecture. ~~Model[List[Doc], List[Floats2d]]~~ |
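For example, the strings fed to the hash function for a given word's prefixes
and suffixes can be previewed with ordinary slicing (with
`case_sensitive = false` the word would be lower-cased first):

```python
word = "spaCy"
pref_lengths = [1, 3]
suff_lengths = [1, 3]
print([word[:n] for n in pref_lengths])   # ['s', 'spa']
print([word[-n:] for n in suff_lengths])  # ['y', 'aCy']
```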
### spacy.CharacterEmbed.v2 {#CharacterEmbed}
> #### Example config
@@ -587,8 +676,8 @@ consists of either two or three subnetworks:
run once for each batch.
- **lower**: Construct a feature-specific vector for each `(token, feature)`
pair. This is also run once for each batch. Constructing the state
-  representation is then a matter of summing the component features and
-  applying the non-linearity.
+  representation is then a matter of summing the component features and applying
+  the non-linearity.
- **upper** (optional): A feed-forward network that predicts scores from the
state representation. If not present, the output from the lower model is used
as action scores directly.
@@ -628,8 +717,8 @@ same signature, but the `use_upper` argument was `True` by default.
> ```
Build a tagger model, using a provided token-to-vector component. The tagger
-model adds a linear layer with softmax activation to predict scores given
-the token vectors.
+model adds a linear layer with softmax activation to predict scores given the
+token vectors.
| Name | Description |
| ----------- | ------------------------------------------------------------------------------------------ |
@@ -920,5 +1009,5 @@ A function that reads an existing `KnowledgeBase` from file.
A function that takes as input a [`KnowledgeBase`](/api/kb) and a
[`Span`](/api/span) object denoting a named entity, and returns a list of
plausible [`Candidate`](/api/kb/#candidate) objects. The default
-`CandidateGenerator` uses the text of a mention to find its potential
-aliases in the `KnowledgeBase`. Note that this function is case-dependent.
+`CandidateGenerator` uses the text of a mention to find its potential aliases in
+the `KnowledgeBase`. Note that this function is case-dependent.