diff --git a/spacy/pipeline/attributeruler.py b/spacy/pipeline/attributeruler.py
index 5b5de78ef..f10d83ca3 100644
--- a/spacy/pipeline/attributeruler.py
+++ b/spacy/pipeline/attributeruler.py
@@ -22,10 +22,6 @@ TagMapType = Dict[str, Dict[Union[int, str], Union[int, str]]]
 MorphRulesType = Dict[str, Dict[str, Dict[Union[int, str], Union[int, str]]]]
 
 
-def make_attribute_ruler(
-    nlp: Language, name: str, validate: bool, scorer: Optional[Callable]
-):
-    return AttributeRuler(nlp.vocab, name, validate=validate, scorer=scorer)
 
 
 def attribute_ruler_score(examples: Iterable[Example], **kwargs) -> Dict[str, Any]:
diff --git a/spacy/pipeline/dep_parser.pyx b/spacy/pipeline/dep_parser.pyx
index 42d50dde6..8e0285636 100644
--- a/spacy/pipeline/dep_parser.pyx
+++ b/spacy/pipeline/dep_parser.pyx
@@ -39,145 +39,6 @@ subword_features = true
 
 DEFAULT_PARSER_MODEL = Config().from_str(default_model_config)["model"]
 
 
-def make_parser(
-    nlp: Language,
-    name: str,
-    model: Model,
-    moves: Optional[TransitionSystem],
-    update_with_oracle_cut_size: int,
-    learn_tokens: bool,
-    min_action_freq: int,
-    scorer: Optional[Callable],
-):
-    """Create a transition-based DependencyParser component. The dependency parser
-    jointly learns sentence segmentation and labelled dependency parsing, and can
-    optionally learn to merge tokens that had been over-segmented by the tokenizer.
-
-    The parser uses a variant of the non-monotonic arc-eager transition-system
-    described by Honnibal and Johnson (2014), with the addition of a "break"
-    transition to perform the sentence segmentation. Nivre's pseudo-projective
-    dependency transformation is used to allow the parser to predict
-    non-projective parses.
-
-    The parser is trained using an imitation learning objective. The parser follows
-    the actions predicted by the current weights, and at each state, determines
-    which actions are compatible with the optimal parse that could be reached
-    from the current state. The weights are updated such that the scores assigned
-    to the set of optimal actions are increased, while scores assigned to other
-    actions are decreased. Note that more than one action may be optimal for
-    a given state.
-
-    model (Model): The model for the transition-based parser. The model needs
-        to have a specific substructure of named components --- see the
-        spacy.ml.tb_framework.TransitionModel for details.
-    moves (Optional[TransitionSystem]): This defines how the parse-state is created,
-        updated and evaluated. If 'moves' is None, a new instance is
-        created with `self.TransitionSystem()`. Defaults to `None`.
-    update_with_oracle_cut_size (int): During training, cut long sequences into
-        shorter segments by creating intermediate states based on the gold-standard
-        history. The model is not very sensitive to this parameter, so you usually
-        won't need to change it. 100 is a good default.
-    learn_tokens (bool): Whether to learn to merge subtokens that are split
-        relative to the gold standard. Experimental.
-    min_action_freq (int): The minimum frequency of labelled actions to retain.
-        Rarer labelled actions have their label backed-off to "dep". While this
-        primarily affects the label accuracy, it can also affect the attachment
-        structure, as the labels are used to represent the pseudo-projectivity
-        transformation.
-    scorer (Optional[Callable]): The scoring method.
-    """
-    return DependencyParser(
-        nlp.vocab,
-        model,
-        name,
-        moves=moves,
-        update_with_oracle_cut_size=update_with_oracle_cut_size,
-        multitasks=[],
-        learn_tokens=learn_tokens,
-        min_action_freq=min_action_freq,
-        beam_width=1,
-        beam_density=0.0,
-        beam_update_prob=0.0,
-        # At some point in the future we can try to implement support for
-        # partial annotations, perhaps only in the beam objective.
-        incorrect_spans_key=None,
-        scorer=scorer,
-    )
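The removed function only forwarded its arguments to the DependencyParser constructor, so, assuming the "parser" factory registration survives this refactor, users reach the same constructor through add_pipe. A minimal sketch; the config keys mirror the deleted signature and the values shown are the documented defaults:

import spacy

nlp = spacy.blank("en")
# Each config key corresponds one-to-one to a removed make_parser argument.
parser = nlp.add_pipe(
    "parser",
    config={
        "update_with_oracle_cut_size": 100,
        "learn_tokens": False,
        "min_action_freq": 30,
    },
)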
- """ - return DependencyParser( - nlp.vocab, - model, - name, - moves=moves, - update_with_oracle_cut_size=update_with_oracle_cut_size, - multitasks=[], - learn_tokens=learn_tokens, - min_action_freq=min_action_freq, - beam_width=1, - beam_density=0.0, - beam_update_prob=0.0, - # At some point in the future we can try to implement support for - # partial annotations, perhaps only in the beam objective. - incorrect_spans_key=None, - scorer=scorer, - ) - - -def make_beam_parser( - nlp: Language, - name: str, - model: Model, - moves: Optional[TransitionSystem], - update_with_oracle_cut_size: int, - learn_tokens: bool, - min_action_freq: int, - beam_width: int, - beam_density: float, - beam_update_prob: float, - scorer: Optional[Callable], -): - """Create a transition-based DependencyParser component that uses beam-search. - The dependency parser jointly learns sentence segmentation and labelled - dependency parsing, and can optionally learn to merge tokens that had been - over-segmented by the tokenizer. - - The parser uses a variant of the non-monotonic arc-eager transition-system - described by Honnibal and Johnson (2014), with the addition of a "break" - transition to perform the sentence segmentation. Nivre's pseudo-projective - dependency transformation is used to allow the parser to predict - non-projective parses. - - The parser is trained using a global objective. That is, it learns to assign - probabilities to whole parses. - - model (Model): The model for the transition-based parser. The model needs - to have a specific substructure of named components --- see the - spacy.ml.tb_framework.TransitionModel for details. - moves (Optional[TransitionSystem]): This defines how the parse-state is created, - updated and evaluated. If 'moves' is None, a new instance is - created with `self.TransitionSystem()`. Defaults to `None`. - update_with_oracle_cut_size (int): During training, cut long sequences into - shorter segments by creating intermediate states based on the gold-standard - history. The model is not very sensitive to this parameter, so you usually - won't need to change it. 100 is a good default. - beam_width (int): The number of candidate analyses to maintain. - beam_density (float): The minimum ratio between the scores of the first and - last candidates in the beam. This allows the parser to avoid exploring - candidates that are too far behind. This is mostly intended to improve - efficiency, but it can also improve accuracy as deeper search is not - always better. - beam_update_prob (float): The chance of making a beam update, instead of a - greedy update. Greedy updates are an approximation for the beam updates, - and are faster to compute. - learn_tokens (bool): Whether to learn to merge subtokens that are split - relative to the gold standard. Experimental. - min_action_freq (int): The minimum frequency of labelled actions to retain. - Rarer labelled actions have their label backed-off to "dep". While this - primarily affects the label accuracy, it can also affect the attachment - structure, as the labels are used to represent the pseudo-projectivity - transformation. 
- """ - return DependencyParser( - nlp.vocab, - model, - name, - moves=moves, - update_with_oracle_cut_size=update_with_oracle_cut_size, - beam_width=beam_width, - beam_density=beam_density, - beam_update_prob=beam_update_prob, - multitasks=[], - learn_tokens=learn_tokens, - min_action_freq=min_action_freq, - # At some point in the future we can try to implement support for - # partial annotations, perhaps only in the beam objective. - incorrect_spans_key=None, - scorer=scorer, - ) - - def parser_score(examples, **kwargs): """Score a batch of examples. diff --git a/spacy/pipeline/edit_tree_lemmatizer.py b/spacy/pipeline/edit_tree_lemmatizer.py index f8ae2cba3..782ff9705 100644 --- a/spacy/pipeline/edit_tree_lemmatizer.py +++ b/spacy/pipeline/edit_tree_lemmatizer.py @@ -39,27 +39,6 @@ subword_features = true DEFAULT_EDIT_TREE_LEMMATIZER_MODEL = Config().from_str(default_model_config)["model"] -def make_edit_tree_lemmatizer( - nlp: Language, - name: str, - model: Model, - backoff: Optional[str], - min_tree_freq: int, - overwrite: bool, - top_k: int, - scorer: Optional[Callable], -): - """Construct an EditTreeLemmatizer component.""" - return EditTreeLemmatizer( - nlp.vocab, - model, - name, - backoff=backoff, - min_tree_freq=min_tree_freq, - overwrite=overwrite, - top_k=top_k, - scorer=scorer, - ) class EditTreeLemmatizer(TrainablePipe): diff --git a/spacy/pipeline/entity_linker.py b/spacy/pipeline/entity_linker.py index 65293a301..9510a9a59 100644 --- a/spacy/pipeline/entity_linker.py +++ b/spacy/pipeline/entity_linker.py @@ -40,84 +40,6 @@ subword_features = true DEFAULT_NEL_MODEL = Config().from_str(default_model_config)["model"] -def make_entity_linker( - nlp: Language, - name: str, - model: Model, - *, - labels_discard: Iterable[str], - n_sents: int, - incl_prior: bool, - incl_context: bool, - entity_vector_length: int, - get_candidates: Callable[[KnowledgeBase, Span], Iterable[Candidate]], - get_candidates_batch: Callable[ - [KnowledgeBase, Iterable[Span]], Iterable[Iterable[Candidate]] - ], - generate_empty_kb: Callable[[Vocab, int], KnowledgeBase], - overwrite: bool, - scorer: Optional[Callable], - use_gold_ents: bool, - candidates_batch_size: int, - threshold: Optional[float] = None, -): - """Construct an EntityLinker component. - - model (Model[List[Doc], Floats2d]): A model that learns document vector - representations. Given a batch of Doc objects, it should return a single - array, with one row per item in the batch. - labels_discard (Iterable[str]): NER labels that will automatically get a "NIL" prediction. - n_sents (int): The number of neighbouring sentences to take into account. - incl_prior (bool): Whether or not to include prior probabilities from the KB in the model. - incl_context (bool): Whether or not to include the local context in the model. - entity_vector_length (int): Size of encoding vectors in the KB. - get_candidates (Callable[[KnowledgeBase, Span], Iterable[Candidate]]): Function that - produces a list of candidates, given a certain knowledge base and a textual mention. - get_candidates_batch ( - Callable[[KnowledgeBase, Iterable[Span]], Iterable[Iterable[Candidate]]], Iterable[Candidate]] - ): Function that produces a list of candidates, given a certain knowledge base and several textual mentions. - generate_empty_kb (Callable[[Vocab, int], KnowledgeBase]): Callable returning empty KnowledgeBase. - scorer (Optional[Callable]): The scoring method. - use_gold_ents (bool): Whether to copy entities from gold docs during training or not. 
-
-
 def entity_linker_score(examples, **kwargs):
diff --git a/spacy/pipeline/entityruler.py b/spacy/pipeline/entityruler.py
index 22df8065d..e05227962 100644
--- a/spacy/pipeline/entityruler.py
+++ b/spacy/pipeline/entityruler.py
@@ -19,26 +19,6 @@ DEFAULT_ENT_ID_SEP = "||"
 PatternType = Dict[str, Union[str, List[Dict[str, Any]]]]
 
 
-def make_entity_ruler(
-    nlp: Language,
-    name: str,
-    phrase_matcher_attr: Optional[Union[int, str]],
-    matcher_fuzzy_compare: Callable,
-    validate: bool,
-    overwrite_ents: bool,
-    ent_id_sep: str,
-    scorer: Optional[Callable],
-):
-    return EntityRuler(
-        nlp,
-        name,
-        phrase_matcher_attr=phrase_matcher_attr,
-        matcher_fuzzy_compare=matcher_fuzzy_compare,
-        validate=validate,
-        overwrite_ents=overwrite_ents,
-        ent_id_sep=ent_id_sep,
-        scorer=scorer,
-    )
 
 
 def entity_ruler_score(examples, **kwargs):
diff --git a/spacy/pipeline/functions.py b/spacy/pipeline/functions.py
index e788979cf..738e9774b 100644
--- a/spacy/pipeline/functions.py
+++ b/spacy/pipeline/functions.py
@@ -73,10 +73,6 @@ def merge_subtokens(doc: Doc, label: str = "subtok") -> Doc:
     return doc
 
 
-def make_token_splitter(
-    nlp: Language, name: str, *, min_length: int = 0, split_length: int = 0
-):
-    return TokenSplitter(min_length=min_length, split_length=split_length)
 
 
 class TokenSplitter:
@@ -136,8 +132,6 @@ class TokenSplitter:
         util.from_disk(path, serializers, [])
 
 
-def make_doc_cleaner(nlp: Language, name: str, *, attrs: Dict[str, Any], silent: bool):
-    return DocCleaner(attrs, silent=silent)
 
 
 class DocCleaner:
diff --git a/spacy/pipeline/lemmatizer.py b/spacy/pipeline/lemmatizer.py
index f737b84b5..c67558d28 100644
--- a/spacy/pipeline/lemmatizer.py
+++ b/spacy/pipeline/lemmatizer.py
@@ -16,17 +16,6 @@ from ..vocab import Vocab
 from .pipe import Pipe
 
 
-def make_lemmatizer(
-    nlp: Language,
-    model: Optional[Model],
-    name: str,
-    mode: str,
-    overwrite: bool,
-    scorer: Optional[Callable],
-):
-    return Lemmatizer(
-        nlp.vocab, model, name, mode=mode, overwrite=overwrite, scorer=scorer
-    )
 
 
 def lemmatizer_score(examples: Iterable[Example], **kwargs) -> Dict[str, Any]:
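Each of the thin wrappers above forwarded straight to its class constructor, so user-facing behaviour is unchanged as long as the factory names stay registered. A minimal sketch for the lemmatizer, under that assumption; "rule" is one of the built-in modes and requires lookups data at initialization:

import spacy

nlp = spacy.blank("en")
# Equivalent to what make_lemmatizer did: build a Lemmatizer with a mode.
lemmatizer = nlp.add_pipe("lemmatizer", config={"mode": "rule", "overwrite": False})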
diff --git a/spacy/pipeline/morphologizer.pyx b/spacy/pipeline/morphologizer.pyx
index 937bd00da..c2b116fae 100644
--- a/spacy/pipeline/morphologizer.pyx
+++ b/spacy/pipeline/morphologizer.pyx
@@ -47,18 +47,6 @@
 maxout_pieces = 3
 
 DEFAULT_MORPH_MODEL = Config().from_str(default_model_config)["model"]
 
 
-def make_morphologizer(
-    nlp: Language,
-    model: Model,
-    name: str,
-    overwrite: bool,
-    extend: bool,
-    label_smoothing: float,
-    scorer: Optional[Callable],
-):
-    return Morphologizer(nlp.vocab, model, name, overwrite=overwrite, extend=extend, label_smoothing=label_smoothing, scorer=scorer)
-
-
 def morphologizer_score(examples, **kwargs):
     def morph_key_getter(token, attr):
         return getattr(token, attr).key
 
diff --git a/spacy/pipeline/multitask.pyx b/spacy/pipeline/multitask.pyx
index a7fdbd9b4..83ffb4526 100644
--- a/spacy/pipeline/multitask.pyx
+++ b/spacy/pipeline/multitask.pyx
@@ -30,10 +30,6 @@ subword_features = true
 
 DEFAULT_MT_MODEL = Config().from_str(default_model_config)["model"]
 
 
-def make_nn_labeller(nlp: Language, name: str, model: Model, labels: Optional[dict], target: str):
-    return MultitaskObjective(nlp.vocab, model, name)
-
-
 class MultitaskObjective(Tagger):
     """Experimental: Assist training of a parser or tagger, by training a
     side-objective.
 
diff --git a/spacy/pipeline/ner.pyx b/spacy/pipeline/ner.pyx
index 548f4b966..ef24035f7 100644
--- a/spacy/pipeline/ner.pyx
+++ b/spacy/pipeline/ner.pyx
@@ -36,121 +36,6 @@ subword_features = true
 
 DEFAULT_NER_MODEL = Config().from_str(default_model_config)["model"]
 
 
-def make_ner(
-    nlp: Language,
-    name: str,
-    model: Model,
-    moves: Optional[TransitionSystem],
-    update_with_oracle_cut_size: int,
-    incorrect_spans_key: Optional[str],
-    scorer: Optional[Callable],
-):
-    """Create a transition-based EntityRecognizer component. The entity recognizer
-    identifies non-overlapping labelled spans of tokens.
-
-    The transition-based algorithm used encodes certain assumptions that are
-    effective for "traditional" named entity recognition tasks, but may not be
-    a good fit for every span identification problem. Specifically, the loss
-    function optimizes for whole entity accuracy, so if your inter-annotator
-    agreement on boundary tokens is low, the component will likely perform poorly
-    on your problem. The transition-based algorithm also assumes that the most
-    decisive information about your entities will be close to their initial tokens.
-    If your entities are long and characterised by tokens in their middle, the
-    component will likely do poorly on your task.
-
-    model (Model): The model for the transition-based parser. The model needs
-        to have a specific substructure of named components --- see the
-        spacy.ml.tb_framework.TransitionModel for details.
-    moves (Optional[TransitionSystem]): This defines how the parse-state is created,
-        updated and evaluated. If 'moves' is None, a new instance is
-        created with `self.TransitionSystem()`. Defaults to `None`.
-    update_with_oracle_cut_size (int): During training, cut long sequences into
-        shorter segments by creating intermediate states based on the gold-standard
-        history. The model is not very sensitive to this parameter, so you usually
-        won't need to change it. 100 is a good default.
-    incorrect_spans_key (Optional[str]): Identifies spans that are known
-        to be incorrect entity annotations. The incorrect entity annotations
-        can be stored in the span group, under this key.
-    scorer (Optional[Callable]): The scoring method.
-    """
-    return EntityRecognizer(
-        nlp.vocab,
-        model,
-        name,
-        moves=moves,
-        update_with_oracle_cut_size=update_with_oracle_cut_size,
-        incorrect_spans_key=incorrect_spans_key,
-        multitasks=[],
-        beam_width=1,
-        beam_density=0.0,
-        beam_update_prob=0.0,
-        scorer=scorer,
-    )
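As with the parser, the greedy NER construction remains reachable through the registered "ner" factory name, assuming that registration is preserved. A minimal sketch; the span-group key name is a hypothetical example:

import spacy

nlp = spacy.blank("en")
# Known-incorrect entity annotations can be stored under this span-group key.
ner = nlp.add_pipe("ner", config={"incorrect_spans_key": "incorrect_spans"})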
- """ - return EntityRecognizer( - nlp.vocab, - model, - name, - moves=moves, - update_with_oracle_cut_size=update_with_oracle_cut_size, - incorrect_spans_key=incorrect_spans_key, - multitasks=[], - beam_width=1, - beam_density=0.0, - beam_update_prob=0.0, - scorer=scorer, - ) - - -def make_beam_ner( - nlp: Language, - name: str, - model: Model, - moves: Optional[TransitionSystem], - update_with_oracle_cut_size: int, - beam_width: int, - beam_density: float, - beam_update_prob: float, - incorrect_spans_key: Optional[str], - scorer: Optional[Callable], -): - """Create a transition-based EntityRecognizer component that uses beam-search. - The entity recognizer identifies non-overlapping labelled spans of tokens. - - The transition-based algorithm used encodes certain assumptions that are - effective for "traditional" named entity recognition tasks, but may not be - a good fit for every span identification problem. Specifically, the loss - function optimizes for whole entity accuracy, so if your inter-annotator - agreement on boundary tokens is low, the component will likely perform poorly - on your problem. The transition-based algorithm also assumes that the most - decisive information about your entities will be close to their initial tokens. - If your entities are long and characterised by tokens in their middle, the - component will likely do poorly on your task. - - model (Model): The model for the transition-based parser. The model needs - to have a specific substructure of named components --- see the - spacy.ml.tb_framework.TransitionModel for details. - moves (Optional[TransitionSystem]): This defines how the parse-state is created, - updated and evaluated. If 'moves' is None, a new instance is - created with `self.TransitionSystem()`. Defaults to `None`. - update_with_oracle_cut_size (int): During training, cut long sequences into - shorter segments by creating intermediate states based on the gold-standard - history. The model is not very sensitive to this parameter, so you usually - won't need to change it. 100 is a good default. - beam_width (int): The number of candidate analyses to maintain. - beam_density (float): The minimum ratio between the scores of the first and - last candidates in the beam. This allows the parser to avoid exploring - candidates that are too far behind. This is mostly intended to improve - efficiency, but it can also improve accuracy as deeper search is not - always better. - beam_update_prob (float): The chance of making a beam update, instead of a - greedy update. Greedy updates are an approximation for the beam updates, - and are faster to compute. - incorrect_spans_key (Optional[str]): Optional key into span groups of - entities known to be non-entities. - scorer (Optional[Callable]): The scoring method. 
- """ - return EntityRecognizer( - nlp.vocab, - model, - name, - moves=moves, - update_with_oracle_cut_size=update_with_oracle_cut_size, - multitasks=[], - beam_width=beam_width, - beam_density=beam_density, - beam_update_prob=beam_update_prob, - incorrect_spans_key=incorrect_spans_key, - scorer=scorer, - ) - - def ner_score(examples, **kwargs): return get_ner_prf(examples, **kwargs) diff --git a/spacy/pipeline/senter.pyx b/spacy/pipeline/senter.pyx index c8e09c5ab..4aea61a9b 100644 --- a/spacy/pipeline/senter.pyx +++ b/spacy/pipeline/senter.pyx @@ -34,10 +34,6 @@ subword_features = true DEFAULT_SENTER_MODEL = Config().from_str(default_model_config)["model"] -def make_senter(nlp: Language, name: str, model: Model, overwrite: bool, scorer: Optional[Callable]): - return SentenceRecognizer(nlp.vocab, model, name, overwrite=overwrite, scorer=scorer) - - def senter_score(examples, **kwargs): def has_sents(doc): return doc.has_annotation("SENT_START") diff --git a/spacy/pipeline/span_finder.py b/spacy/pipeline/span_finder.py index 709a67b7f..36283044b 100644 --- a/spacy/pipeline/span_finder.py +++ b/spacy/pipeline/span_finder.py @@ -41,43 +41,6 @@ depth = 4 DEFAULT_SPAN_FINDER_MODEL = Config().from_str(span_finder_default_config)["model"] -def make_span_finder( - nlp: Language, - name: str, - model: Model[Iterable[Doc], Floats2d], - spans_key: str, - threshold: float, - max_length: Optional[int], - min_length: Optional[int], - scorer: Optional[Callable], -) -> "SpanFinder": - """Create a SpanFinder component. The component predicts whether a token is - the start or the end of a potential span. - - model (Model[List[Doc], Floats2d]): A model instance that - is given a list of documents and predicts a probability for each token. - spans_key (str): Key of the doc.spans dict to save the spans under. During - initialization and training, the component will look for spans on the - reference document under the same key. - threshold (float): Minimum probability to consider a prediction positive. - max_length (Optional[int]): Maximum length of the produced spans, defaults - to None meaning unlimited length. - min_length (Optional[int]): Minimum length of the produced spans, defaults - to None meaning shortest span length is 1. - scorer (Optional[Callable]): The scoring method. Defaults to - Scorer.score_spans for the Doc.spans[spans_key] with overlapping - spans allowed. 
- """ - return SpanFinder( - nlp, - model=model, - threshold=threshold, - name=name, - scorer=scorer, - max_length=max_length, - min_length=min_length, - spans_key=spans_key, - ) def make_span_finder_scorer(): diff --git a/spacy/pipeline/span_ruler.py b/spacy/pipeline/span_ruler.py index 1f9ab2622..cffd6036b 100644 --- a/spacy/pipeline/span_ruler.py +++ b/spacy/pipeline/span_ruler.py @@ -32,61 +32,8 @@ PatternType = Dict[str, Union[str, List[Dict[str, Any]]]] DEFAULT_SPANS_KEY = "ruler" -def make_entity_ruler( - nlp: Language, - name: str, - phrase_matcher_attr: Optional[Union[int, str]], - matcher_fuzzy_compare: Callable, - validate: bool, - overwrite_ents: bool, - scorer: Optional[Callable], - ent_id_sep: str, -): - if overwrite_ents: - ents_filter = prioritize_new_ents_filter - else: - ents_filter = prioritize_existing_ents_filter - return SpanRuler( - nlp, - name, - spans_key=None, - spans_filter=None, - annotate_ents=True, - ents_filter=ents_filter, - phrase_matcher_attr=phrase_matcher_attr, - matcher_fuzzy_compare=matcher_fuzzy_compare, - validate=validate, - overwrite=False, - scorer=scorer, - ) -def make_span_ruler( - nlp: Language, - name: str, - spans_key: Optional[str], - spans_filter: Optional[Callable[[Iterable[Span], Iterable[Span]], Iterable[Span]]], - annotate_ents: bool, - ents_filter: Callable[[Iterable[Span], Iterable[Span]], Iterable[Span]], - phrase_matcher_attr: Optional[Union[int, str]], - matcher_fuzzy_compare: Callable, - validate: bool, - overwrite: bool, - scorer: Optional[Callable], -): - return SpanRuler( - nlp, - name, - spans_key=spans_key, - spans_filter=spans_filter, - annotate_ents=annotate_ents, - ents_filter=ents_filter, - phrase_matcher_attr=phrase_matcher_attr, - matcher_fuzzy_compare=matcher_fuzzy_compare, - validate=validate, - overwrite=overwrite, - scorer=scorer, - ) def prioritize_new_ents_filter( diff --git a/spacy/pipeline/spancat.py b/spacy/pipeline/spancat.py index 3d49bef2a..f69a4d8cd 100644 --- a/spacy/pipeline/spancat.py +++ b/spacy/pipeline/spancat.py @@ -157,108 +157,8 @@ def build_preset_spans_suggester(spans_key: str) -> Suggester: return partial(preset_spans_suggester, spans_key=spans_key) -def make_spancat( - nlp: Language, - name: str, - suggester: Suggester, - model: Model[Tuple[List[Doc], Ragged], Floats2d], - spans_key: str, - scorer: Optional[Callable], - threshold: float, - max_positive: Optional[int], -) -> "SpanCategorizer": - """Create a SpanCategorizer component and configure it for multi-label - classification to be able to assign multiple labels for each span. - The span categorizer consists of two - parts: a suggester function that proposes candidate spans, and a labeller - model that predicts one or more labels for each span. - - name (str): The component instance name, used to add entries to the - losses during training. - suggester (Callable[[Iterable[Doc], Optional[Ops]], Ragged]): A function that suggests spans. - Spans are returned as a ragged array with two integer columns, for the - start and end positions. - model (Model[Tuple[List[Doc], Ragged], Floats2d]): A model instance that - is given a list of documents and (start, end) indices representing - candidate span offsets. The model predicts a probability for each category - for each span. - spans_key (str): Key of the doc.spans dict to save the spans under. During - initialization and training, the component will look for spans on the - reference document under the same key. - scorer (Optional[Callable]): The scoring method. 
-
-
-def make_spancat_singlelabel(
-    nlp: Language,
-    name: str,
-    suggester: Suggester,
-    model: Model[Tuple[List[Doc], Ragged], Floats2d],
-    spans_key: str,
-    negative_weight: float,
-    allow_overlap: bool,
-    scorer: Optional[Callable],
-) -> "SpanCategorizer":
-    """Create a SpanCategorizer component and configure it for multi-class
-    classification. With this configuration each span can get at most one
-    label. The span categorizer consists of two parts: a suggester function
-    that proposes candidate spans, and a labeller model that predicts at most
-    one label for each span.
-
-    name (str): The component instance name, used to add entries to the
-        losses during training.
-    suggester (Callable[[Iterable[Doc], Optional[Ops]], Ragged]): A function that suggests spans.
-        Spans are returned as a ragged array with two integer columns, for the
-        start and end positions.
-    model (Model[Tuple[List[Doc], Ragged], Floats2d]): A model instance that
-        is given a list of documents and (start, end) indices representing
-        candidate span offsets. The model predicts a probability for each category
-        for each span.
-    spans_key (str): Key of the doc.spans dict to save the spans under. During
-        initialization and training, the component will look for spans on the
-        reference document under the same key.
-    scorer (Optional[Callable]): The scoring method. Defaults to
-        Scorer.score_spans for the Doc.spans[spans_key] with overlapping
-        spans allowed.
-    negative_weight (float): Multiplier for the loss terms.
-        Can be used to downweight the negative samples if there are too many.
-    allow_overlap (bool): If True the data is assumed to contain overlapping spans.
-        Otherwise it produces non-overlapping spans greedily prioritizing
-        higher assigned label scores.
-    """
-    return SpanCategorizer(
-        nlp.vocab,
-        model=model,
-        suggester=suggester,
-        name=name,
-        spans_key=spans_key,
-        negative_weight=negative_weight,
-        allow_overlap=allow_overlap,
-        max_positive=1,
-        add_negative_label=True,
-        threshold=None,
-        scorer=scorer,
-    )
 
 
 def spancat_score(examples: Iterable[Example], **kwargs) -> Dict[str, Any]:
diff --git a/spacy/pipeline/tagger.pyx b/spacy/pipeline/tagger.pyx
index f0085d3ff..35b60a72c 100644
--- a/spacy/pipeline/tagger.pyx
+++ b/spacy/pipeline/tagger.pyx
@@ -35,25 +35,6 @@ subword_features = true
 
 DEFAULT_TAGGER_MODEL = Config().from_str(default_model_config)["model"]
 
 
-def make_tagger(
-    nlp: Language,
-    name: str,
-    model: Model,
-    overwrite: bool,
-    scorer: Optional[Callable],
-    neg_prefix: str,
-    label_smoothing: float,
-):
-    """Construct a part-of-speech tagger component.
-
-    model (Model[List[Doc], List[Floats2d]]): A model instance that predicts
-        the tag probabilities. The output vectors should match the number of tags
-        in size, and be normalized as probabilities (all scores between 0 and 1,
-        with the rows summing to 1).
-    """
-    return Tagger(nlp.vocab, model, name, overwrite=overwrite, scorer=scorer, neg_prefix=neg_prefix, label_smoothing=label_smoothing)
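A minimal sketch for the tagger through its registered factory, assuming the registration is kept; label_smoothing appears in the removed signature, and the value shown is illustrative:

import spacy

nlp = spacy.blank("en")
# label_smoothing mirrors the removed make_tagger parameter.
tagger = nlp.add_pipe("tagger", config={"label_smoothing": 0.05})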
- """ - return Tagger(nlp.vocab, model, name, overwrite=overwrite, scorer=scorer, neg_prefix=neg_prefix, label_smoothing=label_smoothing) - - def tagger_score(examples, **kwargs): return Scorer.score_token_attr(examples, "tag", **kwargs) diff --git a/spacy/pipeline/textcat.py b/spacy/pipeline/textcat.py index 98393355f..21c5e397a 100644 --- a/spacy/pipeline/textcat.py +++ b/spacy/pipeline/textcat.py @@ -74,23 +74,6 @@ subword_features = true """ -def make_textcat( - nlp: Language, - name: str, - model: Model[List[Doc], List[Floats2d]], - threshold: float, - scorer: Optional[Callable], -) -> "TextCategorizer": - """Create a TextCategorizer component. The text categorizer predicts categories - over a whole document. It can learn one or more labels, and the labels are considered - to be mutually exclusive (i.e. one true label per doc). - - model (Model[List[Doc], List[Floats2d]]): A model instance that predicts - scores for each category. - threshold (float): Cutoff to consider a prediction "positive". - scorer (Optional[Callable]): The scoring method. - """ - return TextCategorizer(nlp.vocab, model, name, threshold=threshold, scorer=scorer) def textcat_score(examples: Iterable[Example], **kwargs) -> Dict[str, Any]: diff --git a/spacy/pipeline/textcat_multilabel.py b/spacy/pipeline/textcat_multilabel.py index f1306f92c..19df66c3d 100644 --- a/spacy/pipeline/textcat_multilabel.py +++ b/spacy/pipeline/textcat_multilabel.py @@ -72,26 +72,6 @@ subword_features = true """ -def make_multilabel_textcat( - nlp: Language, - name: str, - model: Model[List[Doc], List[Floats2d]], - threshold: float, - scorer: Optional[Callable], -) -> "MultiLabel_TextCategorizer": - """Create a MultiLabel_TextCategorizer component. The text categorizer predicts categories - over a whole document. It can learn one or more labels, and the labels are considered - to be non-mutually exclusive, which means that there can be zero or more labels - per doc). - - model (Model[List[Doc], List[Floats2d]]): A model instance that predicts - scores for each category. - threshold (float): Cutoff to consider a prediction "positive". - scorer (Optional[Callable]): The scoring method. - """ - return MultiLabel_TextCategorizer( - nlp.vocab, model, name, threshold=threshold, scorer=scorer - ) def textcat_multilabel_score(examples: Iterable[Example], **kwargs) -> Dict[str, Any]: diff --git a/spacy/pipeline/tok2vec.py b/spacy/pipeline/tok2vec.py index 22c30d548..e4ae2520f 100644 --- a/spacy/pipeline/tok2vec.py +++ b/spacy/pipeline/tok2vec.py @@ -24,8 +24,6 @@ subword_features = true DEFAULT_TOK2VEC_MODEL = Config().from_str(default_model_config)["model"] -def make_tok2vec(nlp: Language, name: str, model: Model) -> "Tok2Vec": - return Tok2Vec(nlp.vocab, model, name) class Tok2Vec(TrainablePipe):