Remove component factory functions from component files

This commit is contained in:
Matthew Honnibal 2025-05-20 18:29:31 +02:00
parent 6f1a65a8bd
commit fd91e8a0e4
18 changed files with 0 additions and 662 deletions

View File

@ -22,10 +22,6 @@ TagMapType = Dict[str, Dict[Union[int, str], Union[int, str]]]
MorphRulesType = Dict[str, Dict[str, Dict[Union[int, str], Union[int, str]]]]
def make_attribute_ruler(
nlp: Language, name: str, validate: bool, scorer: Optional[Callable]
):
return AttributeRuler(nlp.vocab, name, validate=validate, scorer=scorer)
def attribute_ruler_score(examples: Iterable[Example], **kwargs) -> Dict[str, Any]:

View File

@ -39,145 +39,6 @@ subword_features = true
DEFAULT_PARSER_MODEL = Config().from_str(default_model_config)["model"]
def make_parser(
nlp: Language,
name: str,
model: Model,
moves: Optional[TransitionSystem],
update_with_oracle_cut_size: int,
learn_tokens: bool,
min_action_freq: int,
scorer: Optional[Callable],
):
"""Create a transition-based DependencyParser component. The dependency parser
jointly learns sentence segmentation and labelled dependency parsing, and can
optionally learn to merge tokens that had been over-segmented by the tokenizer.
The parser uses a variant of the non-monotonic arc-eager transition-system
described by Honnibal and Johnson (2014), with the addition of a "break"
transition to perform the sentence segmentation. Nivre's pseudo-projective
dependency transformation is used to allow the parser to predict
non-projective parses.
The parser is trained using an imitation learning objective. The parser follows
the actions predicted by the current weights, and at each state, determines
which actions are compatible with the optimal parse that could be reached
from the current state. The weights such that the scores assigned to the
set of optimal actions is increased, while scores assigned to other
actions are decreased. Note that more than one action may be optimal for
a given state.
model (Model): The model for the transition-based parser. The model needs
to have a specific substructure of named components --- see the
spacy.ml.tb_framework.TransitionModel for details.
moves (Optional[TransitionSystem]): This defines how the parse-state is created,
updated and evaluated. If 'moves' is None, a new instance is
created with `self.TransitionSystem()`. Defaults to `None`.
update_with_oracle_cut_size (int): During training, cut long sequences into
shorter segments by creating intermediate states based on the gold-standard
history. The model is not very sensitive to this parameter, so you usually
won't need to change it. 100 is a good default.
learn_tokens (bool): Whether to learn to merge subtokens that are split
relative to the gold standard. Experimental.
min_action_freq (int): The minimum frequency of labelled actions to retain.
Rarer labelled actions have their label backed-off to "dep". While this
primarily affects the label accuracy, it can also affect the attachment
structure, as the labels are used to represent the pseudo-projectivity
transformation.
scorer (Optional[Callable]): The scoring method.
"""
return DependencyParser(
nlp.vocab,
model,
name,
moves=moves,
update_with_oracle_cut_size=update_with_oracle_cut_size,
multitasks=[],
learn_tokens=learn_tokens,
min_action_freq=min_action_freq,
beam_width=1,
beam_density=0.0,
beam_update_prob=0.0,
# At some point in the future we can try to implement support for
# partial annotations, perhaps only in the beam objective.
incorrect_spans_key=None,
scorer=scorer,
)
def make_beam_parser(
nlp: Language,
name: str,
model: Model,
moves: Optional[TransitionSystem],
update_with_oracle_cut_size: int,
learn_tokens: bool,
min_action_freq: int,
beam_width: int,
beam_density: float,
beam_update_prob: float,
scorer: Optional[Callable],
):
"""Create a transition-based DependencyParser component that uses beam-search.
The dependency parser jointly learns sentence segmentation and labelled
dependency parsing, and can optionally learn to merge tokens that had been
over-segmented by the tokenizer.
The parser uses a variant of the non-monotonic arc-eager transition-system
described by Honnibal and Johnson (2014), with the addition of a "break"
transition to perform the sentence segmentation. Nivre's pseudo-projective
dependency transformation is used to allow the parser to predict
non-projective parses.
The parser is trained using a global objective. That is, it learns to assign
probabilities to whole parses.
model (Model): The model for the transition-based parser. The model needs
to have a specific substructure of named components --- see the
spacy.ml.tb_framework.TransitionModel for details.
moves (Optional[TransitionSystem]): This defines how the parse-state is created,
updated and evaluated. If 'moves' is None, a new instance is
created with `self.TransitionSystem()`. Defaults to `None`.
update_with_oracle_cut_size (int): During training, cut long sequences into
shorter segments by creating intermediate states based on the gold-standard
history. The model is not very sensitive to this parameter, so you usually
won't need to change it. 100 is a good default.
beam_width (int): The number of candidate analyses to maintain.
beam_density (float): The minimum ratio between the scores of the first and
last candidates in the beam. This allows the parser to avoid exploring
candidates that are too far behind. This is mostly intended to improve
efficiency, but it can also improve accuracy as deeper search is not
always better.
beam_update_prob (float): The chance of making a beam update, instead of a
greedy update. Greedy updates are an approximation for the beam updates,
and are faster to compute.
learn_tokens (bool): Whether to learn to merge subtokens that are split
relative to the gold standard. Experimental.
min_action_freq (int): The minimum frequency of labelled actions to retain.
Rarer labelled actions have their label backed-off to "dep". While this
primarily affects the label accuracy, it can also affect the attachment
structure, as the labels are used to represent the pseudo-projectivity
transformation.
"""
return DependencyParser(
nlp.vocab,
model,
name,
moves=moves,
update_with_oracle_cut_size=update_with_oracle_cut_size,
beam_width=beam_width,
beam_density=beam_density,
beam_update_prob=beam_update_prob,
multitasks=[],
learn_tokens=learn_tokens,
min_action_freq=min_action_freq,
# At some point in the future we can try to implement support for
# partial annotations, perhaps only in the beam objective.
incorrect_spans_key=None,
scorer=scorer,
)
def parser_score(examples, **kwargs):
"""Score a batch of examples.

View File

@ -39,27 +39,6 @@ subword_features = true
DEFAULT_EDIT_TREE_LEMMATIZER_MODEL = Config().from_str(default_model_config)["model"]
def make_edit_tree_lemmatizer(
nlp: Language,
name: str,
model: Model,
backoff: Optional[str],
min_tree_freq: int,
overwrite: bool,
top_k: int,
scorer: Optional[Callable],
):
"""Construct an EditTreeLemmatizer component."""
return EditTreeLemmatizer(
nlp.vocab,
model,
name,
backoff=backoff,
min_tree_freq=min_tree_freq,
overwrite=overwrite,
top_k=top_k,
scorer=scorer,
)
class EditTreeLemmatizer(TrainablePipe):

View File

@ -40,84 +40,6 @@ subword_features = true
DEFAULT_NEL_MODEL = Config().from_str(default_model_config)["model"]
def make_entity_linker(
nlp: Language,
name: str,
model: Model,
*,
labels_discard: Iterable[str],
n_sents: int,
incl_prior: bool,
incl_context: bool,
entity_vector_length: int,
get_candidates: Callable[[KnowledgeBase, Span], Iterable[Candidate]],
get_candidates_batch: Callable[
[KnowledgeBase, Iterable[Span]], Iterable[Iterable[Candidate]]
],
generate_empty_kb: Callable[[Vocab, int], KnowledgeBase],
overwrite: bool,
scorer: Optional[Callable],
use_gold_ents: bool,
candidates_batch_size: int,
threshold: Optional[float] = None,
):
"""Construct an EntityLinker component.
model (Model[List[Doc], Floats2d]): A model that learns document vector
representations. Given a batch of Doc objects, it should return a single
array, with one row per item in the batch.
labels_discard (Iterable[str]): NER labels that will automatically get a "NIL" prediction.
n_sents (int): The number of neighbouring sentences to take into account.
incl_prior (bool): Whether or not to include prior probabilities from the KB in the model.
incl_context (bool): Whether or not to include the local context in the model.
entity_vector_length (int): Size of encoding vectors in the KB.
get_candidates (Callable[[KnowledgeBase, Span], Iterable[Candidate]]): Function that
produces a list of candidates, given a certain knowledge base and a textual mention.
get_candidates_batch (
Callable[[KnowledgeBase, Iterable[Span]], Iterable[Iterable[Candidate]]], Iterable[Candidate]]
): Function that produces a list of candidates, given a certain knowledge base and several textual mentions.
generate_empty_kb (Callable[[Vocab, int], KnowledgeBase]): Callable returning empty KnowledgeBase.
scorer (Optional[Callable]): The scoring method.
use_gold_ents (bool): Whether to copy entities from gold docs during training or not. If false, another
component must provide entity annotations.
candidates_batch_size (int): Size of batches for entity candidate generation.
threshold (Optional[float]): Confidence threshold for entity predictions. If confidence is below the threshold,
prediction is discarded. If None, predictions are not filtered by any threshold.
"""
if not model.attrs.get("include_span_maker", False):
# The only difference in arguments here is that use_gold_ents and threshold aren't available.
return EntityLinker_v1(
nlp.vocab,
model,
name,
labels_discard=labels_discard,
n_sents=n_sents,
incl_prior=incl_prior,
incl_context=incl_context,
entity_vector_length=entity_vector_length,
get_candidates=get_candidates,
overwrite=overwrite,
scorer=scorer,
)
return EntityLinker(
nlp.vocab,
model,
name,
labels_discard=labels_discard,
n_sents=n_sents,
incl_prior=incl_prior,
incl_context=incl_context,
entity_vector_length=entity_vector_length,
get_candidates=get_candidates,
get_candidates_batch=get_candidates_batch,
generate_empty_kb=generate_empty_kb,
overwrite=overwrite,
scorer=scorer,
use_gold_ents=use_gold_ents,
candidates_batch_size=candidates_batch_size,
threshold=threshold,
)
def entity_linker_score(examples, **kwargs):

View File

@ -19,26 +19,6 @@ DEFAULT_ENT_ID_SEP = "||"
PatternType = Dict[str, Union[str, List[Dict[str, Any]]]]
def make_entity_ruler(
nlp: Language,
name: str,
phrase_matcher_attr: Optional[Union[int, str]],
matcher_fuzzy_compare: Callable,
validate: bool,
overwrite_ents: bool,
ent_id_sep: str,
scorer: Optional[Callable],
):
return EntityRuler(
nlp,
name,
phrase_matcher_attr=phrase_matcher_attr,
matcher_fuzzy_compare=matcher_fuzzy_compare,
validate=validate,
overwrite_ents=overwrite_ents,
ent_id_sep=ent_id_sep,
scorer=scorer,
)
def entity_ruler_score(examples, **kwargs):

View File

@ -73,10 +73,6 @@ def merge_subtokens(doc: Doc, label: str = "subtok") -> Doc:
return doc
def make_token_splitter(
nlp: Language, name: str, *, min_length: int = 0, split_length: int = 0
):
return TokenSplitter(min_length=min_length, split_length=split_length)
class TokenSplitter:
@ -136,8 +132,6 @@ class TokenSplitter:
util.from_disk(path, serializers, [])
def make_doc_cleaner(nlp: Language, name: str, *, attrs: Dict[str, Any], silent: bool):
return DocCleaner(attrs, silent=silent)
class DocCleaner:

View File

@ -16,17 +16,6 @@ from ..vocab import Vocab
from .pipe import Pipe
def make_lemmatizer(
nlp: Language,
model: Optional[Model],
name: str,
mode: str,
overwrite: bool,
scorer: Optional[Callable],
):
return Lemmatizer(
nlp.vocab, model, name, mode=mode, overwrite=overwrite, scorer=scorer
)
def lemmatizer_score(examples: Iterable[Example], **kwargs) -> Dict[str, Any]:

View File

@ -47,18 +47,6 @@ maxout_pieces = 3
DEFAULT_MORPH_MODEL = Config().from_str(default_model_config)["model"]
def make_morphologizer(
nlp: Language,
model: Model,
name: str,
overwrite: bool,
extend: bool,
label_smoothing: float,
scorer: Optional[Callable],
):
return Morphologizer(nlp.vocab, model, name, overwrite=overwrite, extend=extend, label_smoothing=label_smoothing, scorer=scorer)
def morphologizer_score(examples, **kwargs):
def morph_key_getter(token, attr):
return getattr(token, attr).key

View File

@ -30,10 +30,6 @@ subword_features = true
DEFAULT_MT_MODEL = Config().from_str(default_model_config)["model"]
def make_nn_labeller(nlp: Language, name: str, model: Model, labels: Optional[dict], target: str):
return MultitaskObjective(nlp.vocab, model, name)
class MultitaskObjective(Tagger):
"""Experimental: Assist training of a parser or tagger, by training a
side-objective.

View File

@ -36,121 +36,6 @@ subword_features = true
DEFAULT_NER_MODEL = Config().from_str(default_model_config)["model"]
def make_ner(
nlp: Language,
name: str,
model: Model,
moves: Optional[TransitionSystem],
update_with_oracle_cut_size: int,
incorrect_spans_key: Optional[str],
scorer: Optional[Callable],
):
"""Create a transition-based EntityRecognizer component. The entity recognizer
identifies non-overlapping labelled spans of tokens.
The transition-based algorithm used encodes certain assumptions that are
effective for "traditional" named entity recognition tasks, but may not be
a good fit for every span identification problem. Specifically, the loss
function optimizes for whole entity accuracy, so if your inter-annotator
agreement on boundary tokens is low, the component will likely perform poorly
on your problem. The transition-based algorithm also assumes that the most
decisive information about your entities will be close to their initial tokens.
If your entities are long and characterised by tokens in their middle, the
component will likely do poorly on your task.
model (Model): The model for the transition-based parser. The model needs
to have a specific substructure of named components --- see the
spacy.ml.tb_framework.TransitionModel for details.
moves (Optional[TransitionSystem]): This defines how the parse-state is created,
updated and evaluated. If 'moves' is None, a new instance is
created with `self.TransitionSystem()`. Defaults to `None`.
update_with_oracle_cut_size (int): During training, cut long sequences into
shorter segments by creating intermediate states based on the gold-standard
history. The model is not very sensitive to this parameter, so you usually
won't need to change it. 100 is a good default.
incorrect_spans_key (Optional[str]): Identifies spans that are known
to be incorrect entity annotations. The incorrect entity annotations
can be stored in the span group, under this key.
scorer (Optional[Callable]): The scoring method.
"""
return EntityRecognizer(
nlp.vocab,
model,
name,
moves=moves,
update_with_oracle_cut_size=update_with_oracle_cut_size,
incorrect_spans_key=incorrect_spans_key,
multitasks=[],
beam_width=1,
beam_density=0.0,
beam_update_prob=0.0,
scorer=scorer,
)
def make_beam_ner(
nlp: Language,
name: str,
model: Model,
moves: Optional[TransitionSystem],
update_with_oracle_cut_size: int,
beam_width: int,
beam_density: float,
beam_update_prob: float,
incorrect_spans_key: Optional[str],
scorer: Optional[Callable],
):
"""Create a transition-based EntityRecognizer component that uses beam-search.
The entity recognizer identifies non-overlapping labelled spans of tokens.
The transition-based algorithm used encodes certain assumptions that are
effective for "traditional" named entity recognition tasks, but may not be
a good fit for every span identification problem. Specifically, the loss
function optimizes for whole entity accuracy, so if your inter-annotator
agreement on boundary tokens is low, the component will likely perform poorly
on your problem. The transition-based algorithm also assumes that the most
decisive information about your entities will be close to their initial tokens.
If your entities are long and characterised by tokens in their middle, the
component will likely do poorly on your task.
model (Model): The model for the transition-based parser. The model needs
to have a specific substructure of named components --- see the
spacy.ml.tb_framework.TransitionModel for details.
moves (Optional[TransitionSystem]): This defines how the parse-state is created,
updated and evaluated. If 'moves' is None, a new instance is
created with `self.TransitionSystem()`. Defaults to `None`.
update_with_oracle_cut_size (int): During training, cut long sequences into
shorter segments by creating intermediate states based on the gold-standard
history. The model is not very sensitive to this parameter, so you usually
won't need to change it. 100 is a good default.
beam_width (int): The number of candidate analyses to maintain.
beam_density (float): The minimum ratio between the scores of the first and
last candidates in the beam. This allows the parser to avoid exploring
candidates that are too far behind. This is mostly intended to improve
efficiency, but it can also improve accuracy as deeper search is not
always better.
beam_update_prob (float): The chance of making a beam update, instead of a
greedy update. Greedy updates are an approximation for the beam updates,
and are faster to compute.
incorrect_spans_key (Optional[str]): Optional key into span groups of
entities known to be non-entities.
scorer (Optional[Callable]): The scoring method.
"""
return EntityRecognizer(
nlp.vocab,
model,
name,
moves=moves,
update_with_oracle_cut_size=update_with_oracle_cut_size,
multitasks=[],
beam_width=beam_width,
beam_density=beam_density,
beam_update_prob=beam_update_prob,
incorrect_spans_key=incorrect_spans_key,
scorer=scorer,
)
def ner_score(examples, **kwargs):
return get_ner_prf(examples, **kwargs)

View File

@ -34,10 +34,6 @@ subword_features = true
DEFAULT_SENTER_MODEL = Config().from_str(default_model_config)["model"]
def make_senter(nlp: Language, name: str, model: Model, overwrite: bool, scorer: Optional[Callable]):
return SentenceRecognizer(nlp.vocab, model, name, overwrite=overwrite, scorer=scorer)
def senter_score(examples, **kwargs):
def has_sents(doc):
return doc.has_annotation("SENT_START")

View File

@ -41,43 +41,6 @@ depth = 4
DEFAULT_SPAN_FINDER_MODEL = Config().from_str(span_finder_default_config)["model"]
def make_span_finder(
nlp: Language,
name: str,
model: Model[Iterable[Doc], Floats2d],
spans_key: str,
threshold: float,
max_length: Optional[int],
min_length: Optional[int],
scorer: Optional[Callable],
) -> "SpanFinder":
"""Create a SpanFinder component. The component predicts whether a token is
the start or the end of a potential span.
model (Model[List[Doc], Floats2d]): A model instance that
is given a list of documents and predicts a probability for each token.
spans_key (str): Key of the doc.spans dict to save the spans under. During
initialization and training, the component will look for spans on the
reference document under the same key.
threshold (float): Minimum probability to consider a prediction positive.
max_length (Optional[int]): Maximum length of the produced spans, defaults
to None meaning unlimited length.
min_length (Optional[int]): Minimum length of the produced spans, defaults
to None meaning shortest span length is 1.
scorer (Optional[Callable]): The scoring method. Defaults to
Scorer.score_spans for the Doc.spans[spans_key] with overlapping
spans allowed.
"""
return SpanFinder(
nlp,
model=model,
threshold=threshold,
name=name,
scorer=scorer,
max_length=max_length,
min_length=min_length,
spans_key=spans_key,
)
def make_span_finder_scorer():

View File

@ -32,61 +32,8 @@ PatternType = Dict[str, Union[str, List[Dict[str, Any]]]]
DEFAULT_SPANS_KEY = "ruler"
def make_entity_ruler(
nlp: Language,
name: str,
phrase_matcher_attr: Optional[Union[int, str]],
matcher_fuzzy_compare: Callable,
validate: bool,
overwrite_ents: bool,
scorer: Optional[Callable],
ent_id_sep: str,
):
if overwrite_ents:
ents_filter = prioritize_new_ents_filter
else:
ents_filter = prioritize_existing_ents_filter
return SpanRuler(
nlp,
name,
spans_key=None,
spans_filter=None,
annotate_ents=True,
ents_filter=ents_filter,
phrase_matcher_attr=phrase_matcher_attr,
matcher_fuzzy_compare=matcher_fuzzy_compare,
validate=validate,
overwrite=False,
scorer=scorer,
)
def make_span_ruler(
nlp: Language,
name: str,
spans_key: Optional[str],
spans_filter: Optional[Callable[[Iterable[Span], Iterable[Span]], Iterable[Span]]],
annotate_ents: bool,
ents_filter: Callable[[Iterable[Span], Iterable[Span]], Iterable[Span]],
phrase_matcher_attr: Optional[Union[int, str]],
matcher_fuzzy_compare: Callable,
validate: bool,
overwrite: bool,
scorer: Optional[Callable],
):
return SpanRuler(
nlp,
name,
spans_key=spans_key,
spans_filter=spans_filter,
annotate_ents=annotate_ents,
ents_filter=ents_filter,
phrase_matcher_attr=phrase_matcher_attr,
matcher_fuzzy_compare=matcher_fuzzy_compare,
validate=validate,
overwrite=overwrite,
scorer=scorer,
)
def prioritize_new_ents_filter(

View File

@ -157,108 +157,8 @@ def build_preset_spans_suggester(spans_key: str) -> Suggester:
return partial(preset_spans_suggester, spans_key=spans_key)
def make_spancat(
nlp: Language,
name: str,
suggester: Suggester,
model: Model[Tuple[List[Doc], Ragged], Floats2d],
spans_key: str,
scorer: Optional[Callable],
threshold: float,
max_positive: Optional[int],
) -> "SpanCategorizer":
"""Create a SpanCategorizer component and configure it for multi-label
classification to be able to assign multiple labels for each span.
The span categorizer consists of two
parts: a suggester function that proposes candidate spans, and a labeller
model that predicts one or more labels for each span.
name (str): The component instance name, used to add entries to the
losses during training.
suggester (Callable[[Iterable[Doc], Optional[Ops]], Ragged]): A function that suggests spans.
Spans are returned as a ragged array with two integer columns, for the
start and end positions.
model (Model[Tuple[List[Doc], Ragged], Floats2d]): A model instance that
is given a list of documents and (start, end) indices representing
candidate span offsets. The model predicts a probability for each category
for each span.
spans_key (str): Key of the doc.spans dict to save the spans under. During
initialization and training, the component will look for spans on the
reference document under the same key.
scorer (Optional[Callable]): The scoring method. Defaults to
Scorer.score_spans for the Doc.spans[spans_key] with overlapping
spans allowed.
threshold (float): Minimum probability to consider a prediction positive.
Spans with a positive prediction will be saved on the Doc. Defaults to
0.5.
max_positive (Optional[int]): Maximum number of labels to consider positive
per span. Defaults to None, indicating no limit.
"""
return SpanCategorizer(
nlp.vocab,
model=model,
suggester=suggester,
name=name,
spans_key=spans_key,
negative_weight=None,
allow_overlap=True,
max_positive=max_positive,
threshold=threshold,
scorer=scorer,
add_negative_label=False,
)
def make_spancat_singlelabel(
nlp: Language,
name: str,
suggester: Suggester,
model: Model[Tuple[List[Doc], Ragged], Floats2d],
spans_key: str,
negative_weight: float,
allow_overlap: bool,
scorer: Optional[Callable],
) -> "SpanCategorizer":
"""Create a SpanCategorizer component and configure it for multi-class
classification. With this configuration each span can get at most one
label. The span categorizer consists of two
parts: a suggester function that proposes candidate spans, and a labeller
model that predicts one or more labels for each span.
name (str): The component instance name, used to add entries to the
losses during training.
suggester (Callable[[Iterable[Doc], Optional[Ops]], Ragged]): A function that suggests spans.
Spans are returned as a ragged array with two integer columns, for the
start and end positions.
model (Model[Tuple[List[Doc], Ragged], Floats2d]): A model instance that
is given a list of documents and (start, end) indices representing
candidate span offsets. The model predicts a probability for each category
for each span.
spans_key (str): Key of the doc.spans dict to save the spans under. During
initialization and training, the component will look for spans on the
reference document under the same key.
scorer (Optional[Callable]): The scoring method. Defaults to
Scorer.score_spans for the Doc.spans[spans_key] with overlapping
spans allowed.
negative_weight (float): Multiplier for the loss terms.
Can be used to downweight the negative samples if there are too many.
allow_overlap (bool): If True the data is assumed to contain overlapping spans.
Otherwise it produces non-overlapping spans greedily prioritizing
higher assigned label scores.
"""
return SpanCategorizer(
nlp.vocab,
model=model,
suggester=suggester,
name=name,
spans_key=spans_key,
negative_weight=negative_weight,
allow_overlap=allow_overlap,
max_positive=1,
add_negative_label=True,
threshold=None,
scorer=scorer,
)
def spancat_score(examples: Iterable[Example], **kwargs) -> Dict[str, Any]:

View File

@ -35,25 +35,6 @@ subword_features = true
DEFAULT_TAGGER_MODEL = Config().from_str(default_model_config)["model"]
def make_tagger(
nlp: Language,
name: str,
model: Model,
overwrite: bool,
scorer: Optional[Callable],
neg_prefix: str,
label_smoothing: float,
):
"""Construct a part-of-speech tagger component.
model (Model[List[Doc], List[Floats2d]]): A model instance that predicts
the tag probabilities. The output vectors should match the number of tags
in size, and be normalized as probabilities (all scores between 0 and 1,
with the rows summing to 1).
"""
return Tagger(nlp.vocab, model, name, overwrite=overwrite, scorer=scorer, neg_prefix=neg_prefix, label_smoothing=label_smoothing)
def tagger_score(examples, **kwargs):
return Scorer.score_token_attr(examples, "tag", **kwargs)

View File

@ -74,23 +74,6 @@ subword_features = true
"""
def make_textcat(
nlp: Language,
name: str,
model: Model[List[Doc], List[Floats2d]],
threshold: float,
scorer: Optional[Callable],
) -> "TextCategorizer":
"""Create a TextCategorizer component. The text categorizer predicts categories
over a whole document. It can learn one or more labels, and the labels are considered
to be mutually exclusive (i.e. one true label per doc).
model (Model[List[Doc], List[Floats2d]]): A model instance that predicts
scores for each category.
threshold (float): Cutoff to consider a prediction "positive".
scorer (Optional[Callable]): The scoring method.
"""
return TextCategorizer(nlp.vocab, model, name, threshold=threshold, scorer=scorer)
def textcat_score(examples: Iterable[Example], **kwargs) -> Dict[str, Any]:

View File

@ -72,26 +72,6 @@ subword_features = true
"""
def make_multilabel_textcat(
nlp: Language,
name: str,
model: Model[List[Doc], List[Floats2d]],
threshold: float,
scorer: Optional[Callable],
) -> "MultiLabel_TextCategorizer":
"""Create a MultiLabel_TextCategorizer component. The text categorizer predicts categories
over a whole document. It can learn one or more labels, and the labels are considered
to be non-mutually exclusive, which means that there can be zero or more labels
per doc).
model (Model[List[Doc], List[Floats2d]]): A model instance that predicts
scores for each category.
threshold (float): Cutoff to consider a prediction "positive".
scorer (Optional[Callable]): The scoring method.
"""
return MultiLabel_TextCategorizer(
nlp.vocab, model, name, threshold=threshold, scorer=scorer
)
def textcat_multilabel_score(examples: Iterable[Example], **kwargs) -> Dict[str, Any]:

View File

@ -24,8 +24,6 @@ subword_features = true
DEFAULT_TOK2VEC_MODEL = Config().from_str(default_model_config)["model"]
def make_tok2vec(nlp: Language, name: str, model: Model) -> "Tok2Vec":
return Tok2Vec(nlp.vocab, model, name)
class Tok2Vec(TrainablePipe):