mirror of https://github.com/explosion/spaCy.git (synced 2025-08-02 19:30:19 +03:00)
Python code and documentation
This commit is contained in:
parent
06fe50a12d
commit
581f380c00
spacy/ml/affixextractor.py (deleted)

@@ -1,61 +0,0 @@
-from typing import List, Optional, Callable, Tuple
-from thinc.types import Ints2d
-from thinc.api import Model, registry, get_current_ops
-
-from ..tokens import Doc
-
-
-@registry.layers("spacy.AffixExtractor.v1")
-def AffixExtractor(
-    *,
-    suffs_not_prefs: bool,
-    case_sensitive: bool,
-    len_start: Optional[int],
-    len_end: Optional[int],
-    special_chars: Optional[str],
-    sc_len_start: Optional[int],
-    sc_len_end: Optional[int],
-) -> Model[List[Doc], List[Ints2d]]:
-    return Model(
-        "extract_affixes",
-        forward,
-        attrs={
-            "suffs_not_prefs": suffs_not_prefs,
-            "case_sensitive": case_sensitive,
-            "len_start": len_start if len_start is not None else 0,
-            "len_end": len_end if len_end is not None else 0,
-            "special_chars": special_chars if special_chars is not None else "",
-            "sc_len_start": sc_len_start if sc_len_start is not None else 0,
-            "sc_len_end": sc_len_end if sc_len_end is not None else 0,
-        },
-    )
-
-
-def forward(
-    model: Model[List[Doc], List[Ints2d]], docs, is_train: bool
-) -> Tuple[List[Ints2d], Callable]:
-    suffs_not_prefs: bool = model.attrs["suffs_not_prefs"]
-    case_sensitive: bool = model.attrs["case_sensitive"]
-    len_start: int = model.attrs["len_start"]
-    len_end: int = model.attrs["len_end"]
-    special_chars: str = model.attrs["special_chars"]
-    sc_len_start: int = model.attrs["sc_len_start"]
-    sc_len_end: int = model.attrs["sc_len_end"]
-    features: List[Ints2d] = []
-    # for doc in docs:
-    #     features.append(
-    #         model.ops.asarray2i(
-    #             doc.get_affix_hashes(
-    #                 suffs_not_prefs,
-    #                 case_sensitive,
-    #                 len_start,
-    #                 len_end,
-    #                 special_chars,
-    #                 sc_len_start,
-    #                 sc_len_end,
-    #             )
-    #         )
-    #     )
-
-    backprop: Callable[[List[Ints2d]], List] = lambda d_features: []
-    return features, backprop
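The deleted layer follows thinc's usual pattern for static feature extractors: a parameter-free `Model` whose forward pass reads values from each `Doc` and whose backprop is a no-op. Below is a minimal runnable sketch of the same pattern, using a hypothetical token-length feature in place of the affix hashes (the layer and feature here are illustrative, not part of the commit):

```python
from typing import Callable, List, Tuple

import spacy
from thinc.api import Model
from thinc.types import Ints2d
from spacy.tokens import Doc


def token_length_forward(
    model: Model[List[Doc], List[Ints2d]], docs: List[Doc], is_train: bool
) -> Tuple[List[Ints2d], Callable]:
    # One feature column per token: its length in characters.
    features = [model.ops.asarray2i([[len(token)] for token in doc]) for doc in docs]
    # Static features: there is nothing to backpropagate into.
    backprop: Callable[[List[Ints2d]], List] = lambda d_features: []
    return features, backprop


def TokenLengthExtractor() -> Model[List[Doc], List[Ints2d]]:
    return Model("extract_token_lengths", token_length_forward)


nlp = spacy.blank("en")
print(TokenLengthExtractor().predict([nlp("spaCy extracts affixes")]))
# [array([[5], [8], [7]], dtype=int32)] -- roughly; exact dtype depends on ops
```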
spacy/ml/models/tok2vec.py

@@ -1,6 +1,7 @@
+from encodings import search_function
 from typing import Optional, List, Union, cast
-from spacy.ml.affixextractor import AffixExtractor
-from thinc.types import Floats2d, Ints2d, Ragged, Ints1d
+from spacy.ml.richfeatureextractor import RichFeatureExtractor
+from thinc.types import Floats2d, Ints2d, Ragged
 from thinc.api import chain, clone, concatenate, with_array, with_padded
 from thinc.api import Model, noop, list2ragged, ragged2list, HashEmbed
 from thinc.api import expand_window, residual, Maxout, Mish, PyTorchLSTM
@@ -187,52 +188,50 @@ def MultiHashEmbed(
     return model


-def process_affix_config_group(
+def verify_rich_config_group(
     label: str,
-    start_len: Optional[int],
-    end_len: Optional[int],
+    lengths: Optional[List[int]],
     rows: Optional[List[int]],
-    scs: Optional[str],
-    is_sc: bool,
-) -> List[int]:
-    if start_len is not None or end_len is not None or rows is not None:
-        if start_len is None or end_len is None or rows is None:
-            raise ValueError(Errors.E1045.format(label=label))
-        if start_len < 0 or end_len < start_len + 1:
-            raise ValueError(Errors.E1045.format(label=label))
-        if is_sc and scs is None:
-            raise ValueError(Errors.E1045.format(label=label))
-        if scs is not None and scs != scs.lower():
-            raise ValueError(Errors.E1044.format(label=label))
-        if len(rows) != end_len - start_len:
-            raise ValueError(Errors.E1045.format(label=label))
-    elif scs is not None:
-        raise ValueError(Errors.E1045.format(label=label))
-    return rows if rows is not None else []
+    search_chars: Optional[str],
+    is_search_char_group: bool,
+    case_sensitive: bool,
+) -> None:
+    if lengths is not None or rows is not None:
+        if is_search_char_group and (search_chars is None or len(search_chars) == 0):
+            raise ValueError(Errors.E1045.format(label=label))
+        if lengths is None or rows is None:
+            raise ValueError(Errors.E1045.format(label=label))
+        if len(lengths) != len(rows):
+            raise ValueError(Errors.E1045.format(label=label))
+        if any([length < 1 for length in lengths]):
+            raise ValueError(Errors.E1045.format(label=label))
+        if (
+            not case_sensitive
+            and search_chars is not None
+            and search_chars != search_chars.lower()
+        ):
+            raise ValueError(Errors.E1044.format(label=label))
+    elif search_chars is not None:
+        raise ValueError(Errors.E1045.format(label=label))


-@registry.architectures("spacy.AffixMultiHashEmbed.v1")
-def AffixMultiHashEmbed(
+@registry.architectures("spacy.RichMultiHashEmbed.v1")
+def RichMultiHashEmbed(
     width: int,
     attrs: List[Union[str, int]],
     rows: List[int],
     include_static_vectors: bool,
     *,
-    affix_case_sensitive: bool,
-    suffix_start_len: Optional[int] = None,
-    suffix_end_len: Optional[int] = None,
-    suffix_rows: Optional[List[int]] = None,
-    suffix_scs: Optional[str] = None,
-    suffix_sc_start_len: Optional[int] = None,
-    suffix_sc_end_len: Optional[int] = None,
-    suffix_sc_rows: Optional[List[int]] = None,
-    prefix_start_len: Optional[int] = None,
-    prefix_end_len: Optional[int] = None,
-    prefix_rows: Optional[List[int]] = None,
-    prefix_scs: Optional[str] = None,
-    prefix_sc_start_len: Optional[int] = None,
-    prefix_sc_end_len: Optional[int] = None,
-    prefix_sc_rows: Optional[List[int]] = None,
+    case_sensitive: bool,
+    pref_lengths: Optional[List[int]] = None,
+    pref_rows: Optional[List[int]] = None,
+    pref_search_chars: Optional[str] = None,
+    pref_search_lengths: Optional[List[int]] = None,
+    pref_search_rows: Optional[List[int]] = None,
+    suff_lengths: Optional[List[int]] = None,
+    suff_rows: Optional[List[int]] = None,
+    suff_search_chars: Optional[str] = None,
+    suff_search_lengths: Optional[List[int]] = None,
+    suff_search_rows: Optional[List[int]] = None,
 ) -> Model[List[Doc], List[Floats2d]]:
     """
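The new validator enforces paired `lengths`/`rows` lists of equal size, positive lengths, a non-empty search-character string for search groups, and lower-case search characters when `case_sensitive` is off. A self-contained restatement of those checks as a boolean helper (an illustrative sketch, not code from the commit):

```python
from typing import List, Optional


def is_valid_rich_config_group(
    lengths: Optional[List[int]],
    rows: Optional[List[int]],
    search_chars: Optional[str],
    is_search_char_group: bool,
    case_sensitive: bool,
) -> bool:
    # Nothing configured for this group: search characters alone are invalid.
    if lengths is None and rows is None:
        return search_chars is None
    # Lengths and rows must both be present, pair up, and be positive.
    if lengths is None or rows is None or len(lengths) != len(rows):
        return False
    if any(length < 1 for length in lengths):
        return False
    # Search groups need a non-empty search-character string.
    if is_search_char_group and not search_chars:
        return False
    # Upper-case search characters make no sense in case-insensitive mode.
    if not case_sensitive and search_chars and search_chars != search_chars.lower():
        return False
    return True


print(is_valid_rich_config_group([2, 3], [2000, 2000], "aeiou", True, False))  # True
print(is_valid_rich_config_group([2, 3], [2000], "aeiou", True, False))        # False
```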
@@ -242,74 +241,62 @@ def AffixMultiHashEmbed(
     if len(rows) != len(attrs):
         raise ValueError(f"Mismatched lengths: {len(rows)} vs {len(attrs)}")

-    rows.extend(
-        process_affix_config_group(
-            "prefix", prefix_start_len, prefix_end_len, prefix_rows, None, False
-        )
-    )
-    rows.extend(
-        process_affix_config_group(
-            "prefix_sc",
-            prefix_sc_start_len,
-            prefix_sc_end_len,
-            prefix_sc_rows,
-            prefix_scs,
-            True,
-        )
-    )
-    rows.extend(
-        process_affix_config_group(
-            "suffix", suffix_start_len, suffix_end_len, suffix_rows, None, False
-        )
-    )
-    rows.extend(
-        process_affix_config_group(
-            "suffix_sc",
-            suffix_sc_start_len,
-            suffix_sc_end_len,
-            suffix_sc_rows,
-            suffix_scs,
-            True,
-        )
-    )
+    verify_rich_config_group(
+        "prefix", pref_lengths, pref_rows, None, False, case_sensitive
+    )
+    verify_rich_config_group(
+        "prefix search",
+        pref_search_lengths,
+        pref_search_rows,
+        pref_search_chars,
+        True,
+        case_sensitive,
+    )
+    verify_rich_config_group(
+        "suffix", suff_lengths, suff_rows, None, False, case_sensitive
+    )
+    verify_rich_config_group(
+        "suffix search",
+        suff_search_lengths,
+        suff_search_rows,
+        suff_search_chars,
+        True,
+        case_sensitive,
+    )

-    embeddings = [  # type:ignore
-        HashEmbed(width, row, column=i, seed=i + 7, dropout=0.0)  # type: ignore
+    if pref_rows is not None:
+        rows.extend(pref_rows)
+    if pref_search_rows is not None:
+        rows.extend(pref_search_rows)
+    if suff_rows is not None:
+        rows.extend(suff_rows)
+    if suff_search_rows is not None:
+        rows.extend(suff_search_rows)
+
+    embeddings: List[Model[Ints2d, Floats2d]] = [
+        HashEmbed(width, row, column=i, seed=i + 7, dropout=0.0)
         for i, row in enumerate(rows)
     ]
     concat_size = width * (len(embeddings) + include_static_vectors)
     max_out: Model[Ragged, Ragged] = with_array(
         Maxout(width, concat_size, nP=3, dropout=0.0, normalize=True)
     )
-    extractors = [FeatureExtractor(attrs)]
-    if prefix_start_len is not None or prefix_sc_start_len is not None:
-        extractors.append(
-            AffixExtractor(
-                suffs_not_prefs=False,
-                case_sensitive=affix_case_sensitive,
-                len_start=prefix_start_len,
-                len_end=prefix_end_len,
-                special_chars=prefix_scs,
-                sc_len_start=prefix_sc_start_len,
-                sc_len_end=prefix_sc_end_len,
-            )
-        )
-    if suffix_start_len is not None or suffix_sc_start_len is not None:
-        extractors.append(
-            AffixExtractor(
-                suffs_not_prefs=True,
-                case_sensitive=affix_case_sensitive,
-                len_start=suffix_start_len,
-                len_end=suffix_end_len,
-                special_chars=suffix_scs,
-                sc_len_start=suffix_sc_start_len,
-                sc_len_end=suffix_sc_end_len,
-            )
-        )
+    extractors = concatenate(
+        FeatureExtractor(attrs),
+        RichFeatureExtractor(
+            case_sensitive=case_sensitive,
+            pref_lengths=pref_lengths,
+            pref_search_chars=pref_search_chars,
+            pref_search_lengths=pref_search_lengths,
+            suff_lengths=suff_lengths,
+            suff_search_chars=suff_search_chars,
+            suff_search_lengths=suff_search_lengths,
+        ),
+    )

-    if include_static_vectors:
-        feature_extractor: Model[List[Doc], Ragged] = chain(  # type: ignore
-            concatenate(*extractors),
+    if include_static_vectors:
+        feature_extractor: Model[List[Doc], Ragged] = chain(
+            extractors,
             cast(Model[List[Ints2d], Ragged], list2ragged()),
             with_array(concatenate(*embeddings)),
         )
@@ -322,8 +309,8 @@ def AffixMultiHashEmbed(
             ragged2list(),
         )
     else:
-        model = chain(  # type: ignore
-            concatenate(*extractors),
+        model = chain(
+            extractors,
             cast(Model[List[Ints2d], Ragged], list2ragged()),
             with_array(concatenate(*embeddings)),
             max_out,
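Both branches above follow the same thinc layout as `MultiHashEmbed`: the extractor output is flattened with `list2ragged`, the hashed embeddings run under `with_array`, and a `Maxout` reduces the concatenated width. A minimal runnable sketch of that chaining pattern, built only from stock thinc layers (the sizes and the single-embedding simplification are illustrative, not the architecture above):

```python
import numpy
from thinc.api import HashEmbed, Maxout, chain, list2ragged, ragged2list, with_array

width = 8
model = chain(
    list2ragged(),                                        # List[Ints2d] -> Ragged
    with_array(HashEmbed(width, 100, column=0)),          # hash ids -> vectors
    with_array(Maxout(width, width, nP=3, normalize=True)),  # reduce/normalize
    ragged2list(),                                        # Ragged -> List[Floats2d]
)
model.initialize()

# One "doc" of 3 tokens, each with a single hashed feature id.
X = [numpy.asarray([[11], [42], [7]], dtype="uint64")]
Y = model.predict(X)
print(Y[0].shape)  # (3, 8)
```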
spacy/ml/richfeatureextractor.py (new file, 72 lines)

@@ -0,0 +1,72 @@
+from typing import List, Optional, Callable, Tuple
+from thinc.types import Ints2d
+from thinc.api import Model, registry
+
+from ..tokens import Doc
+
+
+@registry.layers("spacy.RichFeatureExtractor.v1")
+def RichFeatureExtractor(
+    *,
+    case_sensitive: bool,
+    pref_lengths: Optional[List[int]] = None,
+    pref_search_chars: Optional[str] = None,
+    pref_search_lengths: Optional[List[int]] = None,
+    suff_lengths: Optional[List[int]] = None,
+    suff_search_chars: Optional[str] = None,
+    suff_search_lengths: Optional[List[int]] = None,
+) -> Model[List[Doc], List[Ints2d]]:
+    return Model(
+        "extract_character_combination_hashes",
+        forward,
+        attrs={
+            "case_sensitive": case_sensitive,
+            "pref_lengths": pref_lengths if pref_lengths is not None else [],
+            "pref_search_chars": pref_search_chars
+            if pref_search_chars is not None
+            else "",
+            "pref_search_lengths": pref_search_lengths
+            if pref_search_lengths is not None
+            else [],
+            "suff_lengths": suff_lengths if suff_lengths is not None else [],
+            "suff_search_chars": suff_search_chars
+            if suff_search_chars is not None
+            else "",
+            "suff_search_lengths": suff_search_lengths
+            if suff_search_lengths is not None
+            else [],
+        },
+    )
+
+
+def forward(
+    model: Model[List[Doc], List[Ints2d]], docs, is_train: bool
+) -> Tuple[List[Ints2d], Callable]:
+    ops = model.ops
+    case_sensitive: bool = model.attrs["case_sensitive"]
+    pref_lengths: List[int] = model.attrs["pref_lengths"]
+    pref_search_chars: str = model.attrs["pref_search_chars"]
+    pref_search_lengths: List[int] = model.attrs["pref_search_lengths"]
+    suff_lengths: List[int] = model.attrs["suff_lengths"]
+    suff_search_chars: str = model.attrs["suff_search_chars"]
+    suff_search_lengths: List[int] = model.attrs["suff_search_lengths"]
+    features: List[Ints2d] = []
+    for doc in docs:
+        prefix_hashes = doc.get_character_combination_hashes(
+            case_sensitive=case_sensitive,
+            suffs_not_prefs=False,
+            affix_lengths=pref_lengths,
+            search_chars=pref_search_chars,
+            search_lengths=pref_search_lengths,
+        )
+        suffix_hashes = doc.get_character_combination_hashes(
+            case_sensitive=case_sensitive,
+            suffs_not_prefs=True,
+            affix_lengths=suff_lengths,
+            search_chars=suff_search_chars,
+            search_lengths=suff_search_lengths,
+        )
+        features.append(ops.asarray2i(ops.xp.hstack([prefix_hashes, suffix_hashes])))
+
+    backprop: Callable[[List[Ints2d]], List] = lambda d_features: []
+    return features, backprop
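In `forward`, each document's prefix and suffix hash arrays are joined column-wise with `ops.xp.hstack` before being appended. A tiny sketch of that step with made-up hash values (one row per token, one column per configured feature):

```python
import numpy

prefix_hashes = numpy.asarray([[11, 12], [21, 22]])  # 2 tokens x 2 prefix features
suffix_hashes = numpy.asarray([[13], [23]])          # 2 tokens x 1 suffix feature
print(numpy.hstack([prefix_hashes, suffix_hashes]))
# [[11 12 13]
#  [21 22 23]]
```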
@@ -169,6 +169,95 @@ updated).
 | `include_static_vectors` | Whether to also use static word vectors. Requires a vectors table to be loaded in the [`Doc`](/api/doc) objects' vocab. ~~bool~~ |
 | **CREATES**              | The model using the architecture. ~~Model[List[Doc], List[Floats2d]]~~ |

+### spacy.RichMultiHashEmbed.v1 {#RichMultiHashEmbed}
+
+> #### Example config
+>
+> ```ini
+> [model]
+> @architectures = "spacy.RichMultiHashEmbed.v1"
+> width = 64
+> attrs = ["LOWER", "SHAPE"]
+> rows = [2000, 1000]
+> include_static_vectors = false
+> case_sensitive = false
+> pref_lengths = [2, 3, 5]
+> pref_rows = [2000, 2000, 2000]
+> suff_lengths = [2, 3, 4, 5]
+> suff_rows = [2000, 2000, 2000, 2000]
+> suff_search_chars = "aeiouäöüyß"
+> suff_search_lengths = [2, 3]
+> suff_search_rows = [2000, 2000]
+> ```
+
+Construct an embedding layer with the features of
+[MultiHashEmbed](#spacymultihashembedv2-multihashembed) plus more detailed
+features extracted from various positions in each token string. The fixed-length
+`PREFIX` and `SUFFIX` features used in
+[MultiHashEmbed](#spacymultihashembedv2-multihashembed) are sometimes not rich
+enough when working with languages with complex morphology, and this layer
+allows the specification of multiple prefixes and suffixes of any length.
+
+Additionally, it is possible to hash the results of character searches of
+specified lengths. A list of search characters is specified; the characters in
+each word are examined in order, starting either at the beginning or at the end,
+and each character that matches one of the search characters is appended, in
+order, to the string to be hashed. The search continues until either the search
+result string is full or the whole word has been examined. This feature is
+useful because many languages exhibit morphological alternations in which one
+letter or group of letters regularly alternates with another depending on the
+presence of some other letter before or after it, e.g. German plural nouns whose
+final two vowels are `ä-e` regularly correspond to singular lemmas where the `e`
+is no longer present and the `ä` has become `a`. For most languages, searching
+is likely to be useful starting at the end (`suff_*`), but the ability to search
+from the beginning (`pref_*`) is also offered for completeness. Search
+characters should consist of all characters that regularly alternate with other
+characters in the language in question, or whose presence before or after
+characters that would otherwise alternate prevents the alternation from
+occurring, e.g. an `ä` in a German plural noun does not become `a` if it is the
+third or fourth vowel from the end of the word.
+
+Internally, the model converts each token string to
+[UTF-16](https://www.ietf.org/rfc/rfc2781.txt) and assumes that each character
+from the string occupies two bytes. This assumption holds for all characters in
+the Basic Multilingual Plane, which encompasses all characters that are ever
+likely to be of interest when extracting features. There are, however,
+characters like emojis that belong to the supplementary planes and occupy four
+bytes, although importantly neither of the two byte pairs that make up such a
+representation can be a valid two-byte character in its own right. The following
+considerations apply to the processing of four-byte characters:
+
+- An exceptional four-byte character within a text consisting mostly of two-byte
+  characters will probably be ignored by the neural network accepting the
+  embedding layer as not matching any of the learned features.
+- If anyone did want to train a model for a language like Lycian that is
+  generally written in four-byte characters, prefix and suffix features can
+  still be extracted, but the length specifications should all be doubled, e.g.
+  `[2, 4, 6]` to extract one-, two- and three-character affixes. In such a
+  situation, length specifications that are odd numbers would serve no useful
+  purpose since they would refer to half-characters.
+- Four-byte characters are not accepted within search character specification
+  strings and lead to an error being raised.
+
+| Name                     | Description |
+| ------------------------ | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ |
+| `width`                  | The output width. Also used as the width of the embedding tables. Recommended values are between `64` and `300`. If static vectors are included, a learned linear layer is used to map the vectors to the specified width before concatenating it with the other embedding outputs. A single maxout layer is then used to reduce the concatenated vectors to the final width. ~~int~~ |
+| `attrs`                  | The token attributes to embed. A separate embedding table will be constructed for each attribute. ~~List[Union[int, str]]~~ |
+| `rows`                   | The number of rows for each embedding table. The layer needs surprisingly few rows, due to its use of the hashing trick; between `1000` and `10000` rows is generally sufficient, even for very large vocabularies. A number of rows must be specified for each table, so the `rows` list must be of the same length as the `attrs` parameter. ~~List[int]~~ |
+| `include_static_vectors` | Whether to also use static word vectors. Requires a vectors table to be loaded in the [`Doc`](/api/doc) objects' vocab. ~~bool~~ |
+| `case_sensitive`         | Whether lower-case and upper-case letters should be distinguished when generating the character combinations to hash. ~~bool~~ |
+| `pref_lengths`           | The lengths of prefixes to hash for each word, e.g. for the word `spaCy`: `[1, 3]` would lead to hashes being generated for `s` and `spa`. ~~Optional[List[int]]~~ |
+| `pref_rows`              | The number of rows for each of `pref_lengths`. ~~Optional[List[int]]~~ |
+| `pref_search_chars`      | A string containing characters to search for starting from the beginning of each word. May not contain characters that occupy four bytes in UTF-16; if `case_sensitive==False`, may not contain upper-case letters. ~~Optional[str]~~ |
+| `pref_search_lengths`    | The lengths of search result strings to hash, where the searches start from the beginning of each word. ~~Optional[List[int]]~~ |
+| `pref_search_rows`       | The number of rows for each of `pref_search_lengths`. ~~Optional[List[int]]~~ |
+| `suff_lengths`           | The lengths of suffixes to hash for each word, e.g. for the word `spaCy`: `[1, 3]` would lead to hashes being generated for `y` and `aCy`. ~~Optional[List[int]]~~ |
+| `suff_rows`              | The number of rows for each of `suff_lengths`. ~~Optional[List[int]]~~ |
+| `suff_search_chars`      | A string containing characters to search for starting from the end of each word. May not contain characters that occupy four bytes in UTF-16; if `case_sensitive==False`, may not contain upper-case letters. ~~Optional[str]~~ |
+| `suff_search_lengths`    | The lengths of search result strings to hash, where the searches start from the end of each word. ~~Optional[List[int]]~~ |
+| `suff_search_rows`       | The number of rows for each of `suff_search_lengths`. ~~Optional[List[int]]~~ |
+| **CREATES**              | The model using the architecture. ~~Model[List[Doc], List[Floats2d]]~~ |
+
 ### spacy.CharacterEmbed.v2 {#CharacterEmbed}

 > #### Example config
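As a worked illustration of the affix and search-string features documented above (hypothetical pure-Python helpers, not spaCy API; the real implementation hashes these strings inside `Doc.get_character_combination_hashes`):

```python
def prefixes(word: str, lengths: list) -> list:
    return [word[:n] for n in lengths]

def suffixes(word: str, lengths: list) -> list:
    return [word[-n:] for n in lengths]

def search_from_end(word: str, search_chars: str, length: int) -> str:
    # Scan characters from the end of the word; keep those in the search set,
    # in scan order, until the result is full or the word is exhausted.
    found = [c for c in reversed(word) if c in search_chars]
    return "".join(found[:length])

print(prefixes("Bäume", [1, 3]))                  # ['B', 'Bäu']
print(suffixes("Bäume", [2, 3, 4, 5]))            # ['me', 'ume', 'äume', 'Bäume']
print(search_from_end("Bäume", "aeiouäöüyß", 2))  # 'eu'

# The documented UTF-16 assumption: BMP characters occupy two bytes each,
# while emojis and other supplementary-plane characters occupy four.
print(len("ä".encode("utf-16-le")))   # 2
print(len("🙂".encode("utf-16-le")))  # 4
```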
@@ -587,8 +676,8 @@ consists of either two or three subnetworks:
   run once for each batch.
 - **lower**: Construct a feature-specific vector for each `(token, feature)`
   pair. This is also run once for each batch. Constructing the state
-  representation is then a matter of summing the component features and
-  applying the non-linearity.
+  representation is then a matter of summing the component features and applying
+  the non-linearity.
 - **upper** (optional): A feed-forward network that predicts scores from the
   state representation. If not present, the output from the lower model is used
   as action scores directly.
@@ -628,8 +717,8 @@ same signature, but the `use_upper` argument was `True` by default.
 > ```

 Build a tagger model, using a provided token-to-vector component. The tagger
-model adds a linear layer with softmax activation to predict scores given
-the token vectors.
+model adds a linear layer with softmax activation to predict scores given the
+token vectors.

 | Name        | Description |
 | ----------- | -------------------------------------------------------------------------------------------- |
@@ -920,5 +1009,5 @@ A function that reads an existing `KnowledgeBase` from file.
 A function that takes as input a [`KnowledgeBase`](/api/kb) and a
 [`Span`](/api/span) object denoting a named entity, and returns a list of
 plausible [`Candidate`](/api/kb/#candidate) objects. The default
-`CandidateGenerator` uses the text of a mention to find its potential
-aliases in the `KnowledgeBase`. Note that this function is case-dependent.
+`CandidateGenerator` uses the text of a mention to find its potential aliases in
+the `KnowledgeBase`. Note that this function is case-dependent.