mirror of
				https://github.com/explosion/spaCy.git
				synced 2025-11-04 01:48:04 +03:00 
			
		
		
		
	multi-label textcat component (#6474)
* multi-label textcat component * formatting * fix comment * cleanup * fix from #6481 * random edit to push the tests * add explicit error when textcat is called with multi-label gold data * fix error nr * small fix
This commit is contained in:
		
							parent
							
								
									1a77607036
								
							
						
					
					
						commit
						afc5714d32
					
				| 
						 | 
				
			
			@ -169,7 +169,7 @@ def init_config(
 | 
			
		|||
        "Hardware": variables["hardware"].upper(),
 | 
			
		||||
        "Transformer": template_vars.transformer.get("name", False),
 | 
			
		||||
    }
 | 
			
		||||
    msg.info("Generated template specific for your use case")
 | 
			
		||||
    msg.info("Generated config template specific for your use case")
 | 
			
		||||
    for label, value in use_case.items():
 | 
			
		||||
        msg.text(f"- {label}: {value}")
 | 
			
		||||
    with show_validation_error(hint_fill=False):
 | 
			
		||||
| 
						 | 
				
			
			
 | 
			
		|||
| 
						 | 
				
			
			@ -149,13 +149,44 @@ grad_factor = 1.0
 | 
			
		|||
 | 
			
		||||
[components.textcat.model.linear_model]
 | 
			
		||||
@architectures = "spacy.TextCatBOW.v1"
 | 
			
		||||
exclusive_classes = false
 | 
			
		||||
exclusive_classes = true
 | 
			
		||||
ngram_size = 1
 | 
			
		||||
no_output_layer = false
 | 
			
		||||
 | 
			
		||||
{% else -%}
 | 
			
		||||
[components.textcat.model]
 | 
			
		||||
@architectures = "spacy.TextCatBOW.v1"
 | 
			
		||||
exclusive_classes = true
 | 
			
		||||
ngram_size = 1
 | 
			
		||||
no_output_layer = false
 | 
			
		||||
{%- endif %}
 | 
			
		||||
{%- endif %}
 | 
			
		||||
 | 
			
		||||
{% if "textcat_multilabel" in components %}
 | 
			
		||||
[components.textcat_multilabel]
 | 
			
		||||
factory = "textcat_multilabel"
 | 
			
		||||
 | 
			
		||||
{% if optimize == "accuracy" %}
 | 
			
		||||
[components.textcat_multilabel.model]
 | 
			
		||||
@architectures = "spacy.TextCatEnsemble.v2"
 | 
			
		||||
nO = null
 | 
			
		||||
 | 
			
		||||
[components.textcat_multilabel.model.tok2vec]
 | 
			
		||||
@architectures = "spacy-transformers.TransformerListener.v1"
 | 
			
		||||
grad_factor = 1.0
 | 
			
		||||
 | 
			
		||||
[components.textcat_multilabel.model.tok2vec.pooling]
 | 
			
		||||
@layers = "reduce_mean.v1"
 | 
			
		||||
 | 
			
		||||
[components.textcat_multilabel.model.linear_model]
 | 
			
		||||
@architectures = "spacy.TextCatBOW.v1"
 | 
			
		||||
exclusive_classes = false
 | 
			
		||||
ngram_size = 1
 | 
			
		||||
no_output_layer = false
 | 
			
		||||
 | 
			
		||||
{% else -%}
 | 
			
		||||
[components.textcat_multilabel.model]
 | 
			
		||||
@architectures = "spacy.TextCatBOW.v1"
 | 
			
		||||
exclusive_classes = false
 | 
			
		||||
ngram_size = 1
 | 
			
		||||
no_output_layer = false
 | 
			
		||||
| 
						 | 
				
			
			@ -288,13 +319,41 @@ width = ${components.tok2vec.model.encode.width}
 | 
			
		|||
 | 
			
		||||
[components.textcat.model.linear_model]
 | 
			
		||||
@architectures = "spacy.TextCatBOW.v1"
 | 
			
		||||
exclusive_classes = false
 | 
			
		||||
exclusive_classes = true
 | 
			
		||||
ngram_size = 1
 | 
			
		||||
no_output_layer = false
 | 
			
		||||
 | 
			
		||||
{% else -%}
 | 
			
		||||
[components.textcat.model]
 | 
			
		||||
@architectures = "spacy.TextCatBOW.v1"
 | 
			
		||||
exclusive_classes = true
 | 
			
		||||
ngram_size = 1
 | 
			
		||||
no_output_layer = false
 | 
			
		||||
{%- endif %}
 | 
			
		||||
{%- endif %}
 | 
			
		||||
 | 
			
		||||
{% if "textcat_multilabel" in components %}
 | 
			
		||||
[components.textcat_multilabel]
 | 
			
		||||
factory = "textcat_multilabel"
 | 
			
		||||
 | 
			
		||||
{% if optimize == "accuracy" %}
 | 
			
		||||
[components.textcat_multilabel.model]
 | 
			
		||||
@architectures = "spacy.TextCatEnsemble.v2"
 | 
			
		||||
nO = null
 | 
			
		||||
 | 
			
		||||
[components.textcat_multilabel.model.tok2vec]
 | 
			
		||||
@architectures = "spacy.Tok2VecListener.v1"
 | 
			
		||||
width = ${components.tok2vec.model.encode.width}
 | 
			
		||||
 | 
			
		||||
[components.textcat_multilabel.model.linear_model]
 | 
			
		||||
@architectures = "spacy.TextCatBOW.v1"
 | 
			
		||||
exclusive_classes = false
 | 
			
		||||
ngram_size = 1
 | 
			
		||||
no_output_layer = false
 | 
			
		||||
 | 
			
		||||
{% else -%}
 | 
			
		||||
[components.textcat_multilabel.model]
 | 
			
		||||
@architectures = "spacy.TextCatBOW.v1"
 | 
			
		||||
exclusive_classes = false
 | 
			
		||||
ngram_size = 1
 | 
			
		||||
no_output_layer = false
 | 
			
		||||
| 
						 | 
				
			
			@ -303,7 +362,7 @@ no_output_layer = false
 | 
			
		|||
{% endif %}
 | 
			
		||||
 | 
			
		||||
{% for pipe in components %}
 | 
			
		||||
{% if pipe not in ["tagger", "morphologizer", "parser", "ner", "textcat", "entity_linker"] %}
 | 
			
		||||
{% if pipe not in ["tagger", "morphologizer", "parser", "ner", "textcat", "textcat_multilabel", "entity_linker"] %}
 | 
			
		||||
{# Other components defined by the user: we just assume they're factories #}
 | 
			
		||||
[components.{{ pipe }}]
 | 
			
		||||
factory = "{{ pipe }}"
 | 
			
		||||
| 
						 | 
				
			
			
 | 
			
		|||
| 
						 | 
				
			
			@ -463,6 +463,10 @@ class Errors:
 | 
			
		|||
            "issue tracker: http://github.com/explosion/spaCy/issues")
 | 
			
		||||
 | 
			
		||||
    # TODO: fix numbering after merging develop into master
 | 
			
		||||
    E895 = ("The 'textcat' component received gold-standard annotations with "
 | 
			
		||||
            "multiple labels per document. In spaCy 3 you should use the "
 | 
			
		||||
            "'textcat_multilabel' component for this instead. "
 | 
			
		||||
            "Example of an offending annotation: {value}")
 | 
			
		||||
    E896 = ("There was an error using the static vectors. Ensure that the vectors "
 | 
			
		||||
            "of the vocab are properly initialized, or set 'include_static_vectors' "
 | 
			
		||||
            "to False.")
 | 
			
		||||
| 
						 | 
				
			
			
 | 
			
		|||
| 
						 | 
				
			
			@ -11,6 +11,7 @@ from .senter import SentenceRecognizer
 | 
			
		|||
from .sentencizer import Sentencizer
 | 
			
		||||
from .tagger import Tagger
 | 
			
		||||
from .textcat import TextCategorizer
 | 
			
		||||
from .textcat_multilabel import MultiLabel_TextCategorizer
 | 
			
		||||
from .tok2vec import Tok2Vec
 | 
			
		||||
from .functions import merge_entities, merge_noun_chunks, merge_subtokens
 | 
			
		||||
 | 
			
		||||
| 
						 | 
				
			
			@ -22,13 +23,14 @@ __all__ = [
 | 
			
		|||
    "EntityRuler",
 | 
			
		||||
    "Morphologizer",
 | 
			
		||||
    "Lemmatizer",
 | 
			
		||||
    "TrainablePipe",
 | 
			
		||||
    "MultiLabel_TextCategorizer",
 | 
			
		||||
    "Pipe",
 | 
			
		||||
    "SentenceRecognizer",
 | 
			
		||||
    "Sentencizer",
 | 
			
		||||
    "Tagger",
 | 
			
		||||
    "TextCategorizer",
 | 
			
		||||
    "Tok2Vec",
 | 
			
		||||
    "TrainablePipe",
 | 
			
		||||
    "merge_entities",
 | 
			
		||||
    "merge_noun_chunks",
 | 
			
		||||
    "merge_subtokens",
 | 
			
		||||
| 
						 | 
				
			
			
 | 
			
		|||
| 
						 | 
				
			
			@ -14,7 +14,7 @@ from ..tokens import Doc
 | 
			
		|||
from ..vocab import Vocab
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
default_model_config = """
 | 
			
		||||
single_label_default_config = """
 | 
			
		||||
[model]
 | 
			
		||||
@architectures = "spacy.TextCatEnsemble.v2"
 | 
			
		||||
 | 
			
		||||
| 
						 | 
				
			
			@ -37,24 +37,24 @@ depth = 2
 | 
			
		|||
 | 
			
		||||
[model.linear_model]
 | 
			
		||||
@architectures = "spacy.TextCatBOW.v1"
 | 
			
		||||
exclusive_classes = false
 | 
			
		||||
exclusive_classes = true
 | 
			
		||||
ngram_size = 1
 | 
			
		||||
no_output_layer = false
 | 
			
		||||
"""
 | 
			
		||||
DEFAULT_TEXTCAT_MODEL = Config().from_str(default_model_config)["model"]
 | 
			
		||||
DEFAULT_SINGLE_TEXTCAT_MODEL = Config().from_str(single_label_default_config)["model"]
 | 
			
		||||
 | 
			
		||||
bow_model_config = """
 | 
			
		||||
single_label_bow_config = """
 | 
			
		||||
[model]
 | 
			
		||||
@architectures = "spacy.TextCatBOW.v1"
 | 
			
		||||
exclusive_classes = false
 | 
			
		||||
exclusive_classes = true
 | 
			
		||||
ngram_size = 1
 | 
			
		||||
no_output_layer = false
 | 
			
		||||
"""
 | 
			
		||||
 | 
			
		||||
cnn_model_config = """
 | 
			
		||||
single_label_cnn_config = """
 | 
			
		||||
[model]
 | 
			
		||||
@architectures = "spacy.TextCatCNN.v1"
 | 
			
		||||
exclusive_classes = false
 | 
			
		||||
exclusive_classes = true
 | 
			
		||||
 | 
			
		||||
[model.tok2vec]
 | 
			
		||||
@architectures = "spacy.HashEmbedCNN.v1"
 | 
			
		||||
| 
						 | 
				
			
			@ -71,7 +71,7 @@ subword_features = true
 | 
			
		|||
@Language.factory(
 | 
			
		||||
    "textcat",
 | 
			
		||||
    assigns=["doc.cats"],
 | 
			
		||||
    default_config={"threshold": 0.5, "model": DEFAULT_TEXTCAT_MODEL},
 | 
			
		||||
    default_config={"threshold": 0.5, "model": DEFAULT_SINGLE_TEXTCAT_MODEL},
 | 
			
		||||
    default_score_weights={
 | 
			
		||||
        "cats_score": 1.0,
 | 
			
		||||
        "cats_score_desc": None,
 | 
			
		||||
| 
						 | 
				
			
			@ -103,7 +103,7 @@ def make_textcat(
 | 
			
		|||
 | 
			
		||||
 | 
			
		||||
class TextCategorizer(TrainablePipe):
 | 
			
		||||
    """Pipeline component for text classification.
 | 
			
		||||
    """Pipeline component for single-label text classification.
 | 
			
		||||
 | 
			
		||||
    DOCS: https://nightly.spacy.io/api/textcategorizer
 | 
			
		||||
    """
 | 
			
		||||
| 
						 | 
				
			
			@ -111,7 +111,7 @@ class TextCategorizer(TrainablePipe):
 | 
			
		|||
    def __init__(
 | 
			
		||||
        self, vocab: Vocab, model: Model, name: str = "textcat", *, threshold: float
 | 
			
		||||
    ) -> None:
 | 
			
		||||
        """Initialize a text categorizer.
 | 
			
		||||
        """Initialize a text categorizer for single-label classification.
 | 
			
		||||
 | 
			
		||||
        vocab (Vocab): The shared vocabulary.
 | 
			
		||||
        model (thinc.api.Model): The Thinc Model powering the pipeline component.
 | 
			
		||||
| 
						 | 
				
			
			@ -214,6 +214,7 @@ class TextCategorizer(TrainablePipe):
 | 
			
		|||
            losses = {}
 | 
			
		||||
        losses.setdefault(self.name, 0.0)
 | 
			
		||||
        validate_examples(examples, "TextCategorizer.update")
 | 
			
		||||
        self._validate_categories(examples)
 | 
			
		||||
        if not any(len(eg.predicted) if eg.predicted else 0 for eg in examples):
 | 
			
		||||
            # Handle cases where there are no tokens in any docs.
 | 
			
		||||
            return losses
 | 
			
		||||
| 
						 | 
				
			
			@ -256,6 +257,7 @@ class TextCategorizer(TrainablePipe):
 | 
			
		|||
        if self._rehearsal_model is None:
 | 
			
		||||
            return losses
 | 
			
		||||
        validate_examples(examples, "TextCategorizer.rehearse")
 | 
			
		||||
        self._validate_categories(examples)
 | 
			
		||||
        docs = [eg.predicted for eg in examples]
 | 
			
		||||
        if not any(len(doc) for doc in docs):
 | 
			
		||||
            # Handle cases where there are no tokens in any docs.
 | 
			
		||||
| 
						 | 
				
			
			@ -296,6 +298,7 @@ class TextCategorizer(TrainablePipe):
 | 
			
		|||
        DOCS: https://nightly.spacy.io/api/textcategorizer#get_loss
 | 
			
		||||
        """
 | 
			
		||||
        validate_examples(examples, "TextCategorizer.get_loss")
 | 
			
		||||
        self._validate_categories(examples)
 | 
			
		||||
        truths, not_missing = self._examples_to_truth(examples)
 | 
			
		||||
        not_missing = self.model.ops.asarray(not_missing)
 | 
			
		||||
        d_scores = (scores - truths) / scores.shape[0]
 | 
			
		||||
| 
						 | 
				
			
			@ -341,6 +344,7 @@ class TextCategorizer(TrainablePipe):
 | 
			
		|||
        DOCS: https://nightly.spacy.io/api/textcategorizer#initialize
 | 
			
		||||
        """
 | 
			
		||||
        validate_get_examples(get_examples, "TextCategorizer.initialize")
 | 
			
		||||
        self._validate_categories(get_examples())
 | 
			
		||||
        if labels is None:
 | 
			
		||||
            for example in get_examples():
 | 
			
		||||
                for cat in example.y.cats:
 | 
			
		||||
| 
						 | 
				
			
			@ -373,12 +377,20 @@ class TextCategorizer(TrainablePipe):
 | 
			
		|||
        DOCS: https://nightly.spacy.io/api/textcategorizer#score
 | 
			
		||||
        """
 | 
			
		||||
        validate_examples(examples, "TextCategorizer.score")
 | 
			
		||||
        self._validate_categories(examples)
 | 
			
		||||
        return Scorer.score_cats(
 | 
			
		||||
            examples,
 | 
			
		||||
            "cats",
 | 
			
		||||
            labels=self.labels,
 | 
			
		||||
            multi_label=self.model.attrs["multi_label"],
 | 
			
		||||
            multi_label=False,
 | 
			
		||||
            positive_label=self.cfg["positive_label"],
 | 
			
		||||
            threshold=self.cfg["threshold"],
 | 
			
		||||
            **kwargs,
 | 
			
		||||
        )
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
    def _validate_categories(self, examples: List[Example]):
 | 
			
		||||
        """Check whether the provided examples all have single-label cats annotations."""
 | 
			
		||||
        for ex in examples:
 | 
			
		||||
            if list(ex.reference.cats.values()).count(1.0) > 1:
 | 
			
		||||
                raise ValueError(Errors.E895.format(value=ex.reference.cats))
 | 
			
		||||
| 
						 | 
				
			
			
 | 
			
		|||
							
								
								
									
										191
									
								
								spacy/pipeline/textcat_multilabel.py
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										191
									
								
								spacy/pipeline/textcat_multilabel.py
									
									
									
									
									
										Normal file
									
								
							| 
						 | 
				
			
			@ -0,0 +1,191 @@
 | 
			
		|||
from itertools import islice
 | 
			
		||||
from typing import Iterable, Optional, Dict, List, Callable, Any
 | 
			
		||||
 | 
			
		||||
from thinc.api import Model, Config
 | 
			
		||||
from thinc.types import Floats2d
 | 
			
		||||
 | 
			
		||||
from ..language import Language
 | 
			
		||||
from ..training import Example, validate_examples, validate_get_examples
 | 
			
		||||
from ..errors import Errors
 | 
			
		||||
from ..scorer import Scorer
 | 
			
		||||
from ..tokens import Doc
 | 
			
		||||
from ..vocab import Vocab
 | 
			
		||||
from .textcat import TextCategorizer
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
multi_label_default_config = """
 | 
			
		||||
[model]
 | 
			
		||||
@architectures = "spacy.TextCatEnsemble.v2"
 | 
			
		||||
 | 
			
		||||
[model.tok2vec]
 | 
			
		||||
@architectures = "spacy.Tok2Vec.v1"
 | 
			
		||||
 | 
			
		||||
[model.tok2vec.embed]
 | 
			
		||||
@architectures = "spacy.MultiHashEmbed.v1"
 | 
			
		||||
width = 64
 | 
			
		||||
rows = [2000, 2000, 1000, 1000, 1000, 1000]
 | 
			
		||||
attrs = ["ORTH", "LOWER", "PREFIX", "SUFFIX", "SHAPE", "ID"]
 | 
			
		||||
include_static_vectors = false
 | 
			
		||||
 | 
			
		||||
[model.tok2vec.encode]
 | 
			
		||||
@architectures = "spacy.MaxoutWindowEncoder.v1"
 | 
			
		||||
width = ${model.tok2vec.embed.width}
 | 
			
		||||
window_size = 1
 | 
			
		||||
maxout_pieces = 3
 | 
			
		||||
depth = 2
 | 
			
		||||
 | 
			
		||||
[model.linear_model]
 | 
			
		||||
@architectures = "spacy.TextCatBOW.v1"
 | 
			
		||||
exclusive_classes = false
 | 
			
		||||
ngram_size = 1
 | 
			
		||||
no_output_layer = false
 | 
			
		||||
"""
 | 
			
		||||
DEFAULT_MULTI_TEXTCAT_MODEL = Config().from_str(multi_label_default_config)["model"]
 | 
			
		||||
 | 
			
		||||
multi_label_bow_config = """
 | 
			
		||||
[model]
 | 
			
		||||
@architectures = "spacy.TextCatBOW.v1"
 | 
			
		||||
exclusive_classes = false
 | 
			
		||||
ngram_size = 1
 | 
			
		||||
no_output_layer = false
 | 
			
		||||
"""
 | 
			
		||||
 | 
			
		||||
multi_label_cnn_config = """
 | 
			
		||||
[model]
 | 
			
		||||
@architectures = "spacy.TextCatCNN.v1"
 | 
			
		||||
exclusive_classes = false
 | 
			
		||||
 | 
			
		||||
[model.tok2vec]
 | 
			
		||||
@architectures = "spacy.HashEmbedCNN.v1"
 | 
			
		||||
pretrained_vectors = null
 | 
			
		||||
width = 96
 | 
			
		||||
depth = 4
 | 
			
		||||
embed_size = 2000
 | 
			
		||||
window_size = 1
 | 
			
		||||
maxout_pieces = 3
 | 
			
		||||
subword_features = true
 | 
			
		||||
"""
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
@Language.factory(
 | 
			
		||||
    "textcat_multilabel",
 | 
			
		||||
    assigns=["doc.cats"],
 | 
			
		||||
    default_config={"threshold": 0.5, "model": DEFAULT_MULTI_TEXTCAT_MODEL},
 | 
			
		||||
    default_score_weights={
 | 
			
		||||
        "cats_score": 1.0,
 | 
			
		||||
        "cats_score_desc": None,
 | 
			
		||||
        "cats_micro_p": None,
 | 
			
		||||
        "cats_micro_r": None,
 | 
			
		||||
        "cats_micro_f": None,
 | 
			
		||||
        "cats_macro_p": None,
 | 
			
		||||
        "cats_macro_r": None,
 | 
			
		||||
        "cats_macro_f": None,
 | 
			
		||||
        "cats_macro_auc": None,
 | 
			
		||||
        "cats_f_per_type": None,
 | 
			
		||||
        "cats_macro_auc_per_type": None,
 | 
			
		||||
    },
 | 
			
		||||
)
 | 
			
		||||
def make_multilabel_textcat(
 | 
			
		||||
    nlp: Language, name: str, model: Model[List[Doc], List[Floats2d]], threshold: float
 | 
			
		||||
) -> "TextCategorizer":
 | 
			
		||||
    """Create a TextCategorizer compoment. The text categorizer predicts categories
 | 
			
		||||
    over a whole document. It can learn one or more labels, and the labels can
 | 
			
		||||
    be mutually exclusive (i.e. one true label per doc) or non-mutually exclusive
 | 
			
		||||
    (i.e. zero or more labels may be true per doc). The multi-label setting is
 | 
			
		||||
    controlled by the model instance that's provided.
 | 
			
		||||
 | 
			
		||||
    model (Model[List[Doc], List[Floats2d]]): A model instance that predicts
 | 
			
		||||
        scores for each category.
 | 
			
		||||
    threshold (float): Cutoff to consider a prediction "positive".
 | 
			
		||||
    """
 | 
			
		||||
    return MultiLabel_TextCategorizer(nlp.vocab, model, name, threshold=threshold)
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
class MultiLabel_TextCategorizer(TextCategorizer):
 | 
			
		||||
    """Pipeline component for multi-label text classification.
 | 
			
		||||
 | 
			
		||||
    DOCS: https://nightly.spacy.io/api/multilabel_textcategorizer
 | 
			
		||||
    """
 | 
			
		||||
 | 
			
		||||
    def __init__(
 | 
			
		||||
        self,
 | 
			
		||||
        vocab: Vocab,
 | 
			
		||||
        model: Model,
 | 
			
		||||
        name: str = "textcat_multilabel",
 | 
			
		||||
        *,
 | 
			
		||||
        threshold: float,
 | 
			
		||||
    ) -> None:
 | 
			
		||||
        """Initialize a text categorizer for multi-label classification.
 | 
			
		||||
 | 
			
		||||
        vocab (Vocab): The shared vocabulary.
 | 
			
		||||
        model (thinc.api.Model): The Thinc Model powering the pipeline component.
 | 
			
		||||
        name (str): The component instance name, used to add entries to the
 | 
			
		||||
            losses during training.
 | 
			
		||||
        threshold (float): Cutoff to consider a prediction "positive".
 | 
			
		||||
 | 
			
		||||
        DOCS: https://nightly.spacy.io/api/multilabel_textcategorizer#init
 | 
			
		||||
        """
 | 
			
		||||
        self.vocab = vocab
 | 
			
		||||
        self.model = model
 | 
			
		||||
        self.name = name
 | 
			
		||||
        self._rehearsal_model = None
 | 
			
		||||
        cfg = {"labels": [], "threshold": threshold}
 | 
			
		||||
        self.cfg = dict(cfg)
 | 
			
		||||
 | 
			
		||||
    def initialize(
 | 
			
		||||
        self,
 | 
			
		||||
        get_examples: Callable[[], Iterable[Example]],
 | 
			
		||||
        *,
 | 
			
		||||
        nlp: Optional[Language] = None,
 | 
			
		||||
        labels: Optional[Dict] = None,
 | 
			
		||||
    ):
 | 
			
		||||
        """Initialize the pipe for training, using a representative set
 | 
			
		||||
        of data examples.
 | 
			
		||||
 | 
			
		||||
        get_examples (Callable[[], Iterable[Example]]): Function that
 | 
			
		||||
            returns a representative sample of gold-standard Example objects.
 | 
			
		||||
        nlp (Language): The current nlp object the component is part of.
 | 
			
		||||
        labels: The labels to add to the component, typically generated by the
 | 
			
		||||
            `init labels` command. If no labels are provided, the get_examples
 | 
			
		||||
            callback is used to extract the labels from the data.
 | 
			
		||||
 | 
			
		||||
        DOCS: https://nightly.spacy.io/api/multilabel_textcategorizer#initialize
 | 
			
		||||
        """
 | 
			
		||||
        validate_get_examples(get_examples, "MultiLabel_TextCategorizer.initialize")
 | 
			
		||||
        if labels is None:
 | 
			
		||||
            for example in get_examples():
 | 
			
		||||
                for cat in example.y.cats:
 | 
			
		||||
                    self.add_label(cat)
 | 
			
		||||
        else:
 | 
			
		||||
            for label in labels:
 | 
			
		||||
                self.add_label(label)
 | 
			
		||||
        subbatch = list(islice(get_examples(), 10))
 | 
			
		||||
        doc_sample = [eg.reference for eg in subbatch]
 | 
			
		||||
        label_sample, _ = self._examples_to_truth(subbatch)
 | 
			
		||||
        self._require_labels()
 | 
			
		||||
        assert len(doc_sample) > 0, Errors.E923.format(name=self.name)
 | 
			
		||||
        assert len(label_sample) > 0, Errors.E923.format(name=self.name)
 | 
			
		||||
        self.model.initialize(X=doc_sample, Y=label_sample)
 | 
			
		||||
 | 
			
		||||
    def score(self, examples: Iterable[Example], **kwargs) -> Dict[str, Any]:
 | 
			
		||||
        """Score a batch of examples.
 | 
			
		||||
 | 
			
		||||
        examples (Iterable[Example]): The examples to score.
 | 
			
		||||
        RETURNS (Dict[str, Any]): The scores, produced by Scorer.score_cats.
 | 
			
		||||
 | 
			
		||||
        DOCS: https://nightly.spacy.io/api/multilabel_textcategorizer#score
 | 
			
		||||
        """
 | 
			
		||||
        validate_examples(examples, "MultiLabel_TextCategorizer.score")
 | 
			
		||||
        return Scorer.score_cats(
 | 
			
		||||
            examples,
 | 
			
		||||
            "cats",
 | 
			
		||||
            labels=self.labels,
 | 
			
		||||
            multi_label=True,
 | 
			
		||||
            threshold=self.cfg["threshold"],
 | 
			
		||||
            **kwargs,
 | 
			
		||||
        )
 | 
			
		||||
 | 
			
		||||
    def _validate_categories(self, examples: List[Example]):
 | 
			
		||||
        """This component allows any type of single- or multi-label annotations.
 | 
			
		||||
        This method overwrites the more strict one from 'textcat'. """
 | 
			
		||||
        pass
 | 
			
		||||
| 
						 | 
				
			
			@ -460,7 +460,7 @@ class Scorer:
 | 
			
		|||
                gold_label, gold_score = max(gold_cats, key=lambda it: it[1])
 | 
			
		||||
                if gold_score is not None and gold_score > 0:
 | 
			
		||||
                    f_per_type[gold_label].fn += 1
 | 
			
		||||
            else:
 | 
			
		||||
            elif pred_cats:
 | 
			
		||||
                pred_label, pred_score = max(pred_cats, key=lambda it: it[1])
 | 
			
		||||
                if pred_score >= threshold:
 | 
			
		||||
                    f_per_type[pred_label].fp += 1
 | 
			
		||||
| 
						 | 
				
			
			
 | 
			
		|||
| 
						 | 
				
			
			@ -15,15 +15,31 @@ from spacy.training import Example
 | 
			
		|||
from ..util import make_tempdir
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
TRAIN_DATA = [
 | 
			
		||||
TRAIN_DATA_SINGLE_LABEL = [
 | 
			
		||||
    ("I'm so happy.", {"cats": {"POSITIVE": 1.0, "NEGATIVE": 0.0}}),
 | 
			
		||||
    ("I'm so angry", {"cats": {"POSITIVE": 0.0, "NEGATIVE": 1.0}}),
 | 
			
		||||
]
 | 
			
		||||
 | 
			
		||||
TRAIN_DATA_MULTI_LABEL = [
 | 
			
		||||
    ("I'm angry and confused", {"cats": {"ANGRY": 1.0, "CONFUSED": 1.0, "HAPPY": 0.0}}),
 | 
			
		||||
    ("I'm confused but happy", {"cats": {"ANGRY": 0.0, "CONFUSED": 1.0, "HAPPY": 1.0}}),
 | 
			
		||||
]
 | 
			
		||||
 | 
			
		||||
def make_get_examples(nlp):
 | 
			
		||||
 | 
			
		||||
def make_get_examples_single_label(nlp):
 | 
			
		||||
    train_examples = []
 | 
			
		||||
    for t in TRAIN_DATA:
 | 
			
		||||
    for t in TRAIN_DATA_SINGLE_LABEL:
 | 
			
		||||
        train_examples.append(Example.from_dict(nlp.make_doc(t[0]), t[1]))
 | 
			
		||||
 | 
			
		||||
    def get_examples():
 | 
			
		||||
        return train_examples
 | 
			
		||||
 | 
			
		||||
    return get_examples
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
def make_get_examples_multi_label(nlp):
 | 
			
		||||
    train_examples = []
 | 
			
		||||
    for t in TRAIN_DATA_MULTI_LABEL:
 | 
			
		||||
        train_examples.append(Example.from_dict(nlp.make_doc(t[0]), t[1]))
 | 
			
		||||
 | 
			
		||||
    def get_examples():
 | 
			
		||||
| 
						 | 
				
			
			@ -85,49 +101,75 @@ def test_textcat_learns_multilabel():
 | 
			
		|||
                    assert score > 0.5
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
def test_label_types():
 | 
			
		||||
@pytest.mark.parametrize("name", ["textcat", "textcat_multilabel"])
 | 
			
		||||
def test_label_types(name):
 | 
			
		||||
    nlp = Language()
 | 
			
		||||
    textcat = nlp.add_pipe("textcat")
 | 
			
		||||
    textcat = nlp.add_pipe(name)
 | 
			
		||||
    textcat.add_label("answer")
 | 
			
		||||
    with pytest.raises(ValueError):
 | 
			
		||||
        textcat.add_label(9)
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
def test_no_label():
 | 
			
		||||
@pytest.mark.parametrize("name", ["textcat", "textcat_multilabel"])
 | 
			
		||||
def test_no_label(name):
 | 
			
		||||
    nlp = Language()
 | 
			
		||||
    nlp.add_pipe("textcat")
 | 
			
		||||
    nlp.add_pipe(name)
 | 
			
		||||
    with pytest.raises(ValueError):
 | 
			
		||||
        nlp.initialize()
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
def test_implicit_label():
 | 
			
		||||
@pytest.mark.parametrize(
 | 
			
		||||
    "name,get_examples",
 | 
			
		||||
    [
 | 
			
		||||
        ("textcat", make_get_examples_single_label),
 | 
			
		||||
        ("textcat_multilabel", make_get_examples_multi_label),
 | 
			
		||||
    ],
 | 
			
		||||
)
 | 
			
		||||
def test_implicit_label(name, get_examples):
 | 
			
		||||
    nlp = Language()
 | 
			
		||||
    nlp.add_pipe("textcat")
 | 
			
		||||
    nlp.initialize(get_examples=make_get_examples(nlp))
 | 
			
		||||
    nlp.add_pipe(name)
 | 
			
		||||
    nlp.initialize(get_examples=get_examples(nlp))
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
def test_no_resize():
 | 
			
		||||
@pytest.mark.parametrize("name", ["textcat", "textcat_multilabel"])
 | 
			
		||||
def test_no_resize(name):
 | 
			
		||||
    nlp = Language()
 | 
			
		||||
    textcat = nlp.add_pipe("textcat")
 | 
			
		||||
    textcat = nlp.add_pipe(name)
 | 
			
		||||
    textcat.add_label("POSITIVE")
 | 
			
		||||
    textcat.add_label("NEGATIVE")
 | 
			
		||||
    nlp.initialize()
 | 
			
		||||
    assert textcat.model.get_dim("nO") == 2
 | 
			
		||||
    assert textcat.model.get_dim("nO") >= 2
 | 
			
		||||
    # this throws an error because the textcat can't be resized after initialization
 | 
			
		||||
    with pytest.raises(ValueError):
 | 
			
		||||
        textcat.add_label("NEUTRAL")
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
def test_initialize_examples():
 | 
			
		||||
def test_error_with_multi_labels():
 | 
			
		||||
    nlp = Language()
 | 
			
		||||
    textcat = nlp.add_pipe("textcat")
 | 
			
		||||
    for text, annotations in TRAIN_DATA:
 | 
			
		||||
    train_examples = []
 | 
			
		||||
    for text, annotations in TRAIN_DATA_MULTI_LABEL:
 | 
			
		||||
        train_examples.append(Example.from_dict(nlp.make_doc(text), annotations))
 | 
			
		||||
    with pytest.raises(ValueError):
 | 
			
		||||
        optimizer = nlp.initialize(get_examples=lambda: train_examples)
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
@pytest.mark.parametrize(
 | 
			
		||||
    "name,get_examples, train_data",
 | 
			
		||||
    [
 | 
			
		||||
        ("textcat", make_get_examples_single_label, TRAIN_DATA_SINGLE_LABEL),
 | 
			
		||||
        ("textcat_multilabel", make_get_examples_multi_label, TRAIN_DATA_MULTI_LABEL),
 | 
			
		||||
    ],
 | 
			
		||||
)
 | 
			
		||||
def test_initialize_examples(name, get_examples, train_data):
 | 
			
		||||
    nlp = Language()
 | 
			
		||||
    textcat = nlp.add_pipe(name)
 | 
			
		||||
    for text, annotations in train_data:
 | 
			
		||||
        for label, value in annotations.get("cats").items():
 | 
			
		||||
            textcat.add_label(label)
 | 
			
		||||
    # you shouldn't really call this more than once, but for testing it should be fine
 | 
			
		||||
    nlp.initialize()
 | 
			
		||||
    get_examples = make_get_examples(nlp)
 | 
			
		||||
    nlp.initialize(get_examples=get_examples)
 | 
			
		||||
    nlp.initialize(get_examples=get_examples(nlp))
 | 
			
		||||
    with pytest.raises(TypeError):
 | 
			
		||||
        nlp.initialize(get_examples=lambda: None)
 | 
			
		||||
    with pytest.raises(TypeError):
 | 
			
		||||
| 
						 | 
				
			
			@ -138,12 +180,10 @@ def test_overfitting_IO():
 | 
			
		|||
    # Simple test to try and quickly overfit the single-label textcat component - ensuring the ML models work correctly
 | 
			
		||||
    fix_random_seed(0)
 | 
			
		||||
    nlp = English()
 | 
			
		||||
    nlp.config["initialize"]["components"]["textcat"] = {"positive_label": "POSITIVE"}
 | 
			
		||||
    # Set exclusive labels
 | 
			
		||||
    config = {"model": {"linear_model": {"exclusive_classes": True}}}
 | 
			
		||||
    textcat = nlp.add_pipe("textcat", config=config)
 | 
			
		||||
    textcat = nlp.add_pipe("textcat")
 | 
			
		||||
 | 
			
		||||
    train_examples = []
 | 
			
		||||
    for text, annotations in TRAIN_DATA:
 | 
			
		||||
    for text, annotations in TRAIN_DATA_SINGLE_LABEL:
 | 
			
		||||
        train_examples.append(Example.from_dict(nlp.make_doc(text), annotations))
 | 
			
		||||
    optimizer = nlp.initialize(get_examples=lambda: train_examples)
 | 
			
		||||
    assert textcat.model.get_dim("nO") == 2
 | 
			
		||||
| 
						 | 
				
			
			@ -172,6 +212,8 @@ def test_overfitting_IO():
 | 
			
		|||
    # Test scoring
 | 
			
		||||
    scores = nlp.evaluate(train_examples)
 | 
			
		||||
    assert scores["cats_micro_f"] == 1.0
 | 
			
		||||
    assert scores["cats_macro_f"] == 1.0
 | 
			
		||||
    assert scores["cats_macro_auc"] == 1.0
 | 
			
		||||
    assert scores["cats_score"] == 1.0
 | 
			
		||||
    assert "cats_score_desc" in scores
 | 
			
		||||
 | 
			
		||||
| 
						 | 
				
			
			@ -192,7 +234,7 @@ def test_overfitting_IO_multi():
 | 
			
		|||
    config = {"model": {"linear_model": {"exclusive_classes": False}}}
 | 
			
		||||
    textcat = nlp.add_pipe("textcat", config=config)
 | 
			
		||||
    train_examples = []
 | 
			
		||||
    for text, annotations in TRAIN_DATA:
 | 
			
		||||
    for text, annotations in TRAIN_DATA_MULTI_LABEL:
 | 
			
		||||
        train_examples.append(Example.from_dict(nlp.make_doc(text), annotations))
 | 
			
		||||
    optimizer = nlp.initialize(get_examples=lambda: train_examples)
 | 
			
		||||
    assert textcat.model.get_dim("nO") == 2
 | 
			
		||||
| 
						 | 
				
			
			@ -231,27 +273,75 @@ def test_overfitting_IO_multi():
 | 
			
		|||
    assert_equal(batch_cats_1, no_batch_cats)
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
def test_overfitting_IO_multi():
 | 
			
		||||
    # Simple test to try and quickly overfit the multi-label textcat component - ensuring the ML models work correctly
 | 
			
		||||
    fix_random_seed(0)
 | 
			
		||||
    nlp = English()
 | 
			
		||||
    textcat = nlp.add_pipe("textcat_multilabel")
 | 
			
		||||
 | 
			
		||||
    train_examples = []
 | 
			
		||||
    for text, annotations in TRAIN_DATA_MULTI_LABEL:
 | 
			
		||||
        train_examples.append(Example.from_dict(nlp.make_doc(text), annotations))
 | 
			
		||||
    optimizer = nlp.initialize(get_examples=lambda: train_examples)
 | 
			
		||||
    assert textcat.model.get_dim("nO") == 3
 | 
			
		||||
 | 
			
		||||
    for i in range(100):
 | 
			
		||||
        losses = {}
 | 
			
		||||
        nlp.update(train_examples, sgd=optimizer, losses=losses)
 | 
			
		||||
    assert losses["textcat_multilabel"] < 0.01
 | 
			
		||||
 | 
			
		||||
    # test the trained model
 | 
			
		||||
    test_text = "I am confused but happy."
 | 
			
		||||
    doc = nlp(test_text)
 | 
			
		||||
    cats = doc.cats
 | 
			
		||||
    assert cats["HAPPY"] > 0.9
 | 
			
		||||
    assert cats["CONFUSED"] > 0.9
 | 
			
		||||
 | 
			
		||||
    # Also test the results are still the same after IO
 | 
			
		||||
    with make_tempdir() as tmp_dir:
 | 
			
		||||
        nlp.to_disk(tmp_dir)
 | 
			
		||||
        nlp2 = util.load_model_from_path(tmp_dir)
 | 
			
		||||
        doc2 = nlp2(test_text)
 | 
			
		||||
        cats2 = doc2.cats
 | 
			
		||||
        assert cats2["HAPPY"] > 0.9
 | 
			
		||||
        assert cats2["CONFUSED"] > 0.9
 | 
			
		||||
 | 
			
		||||
    # Test scoring
 | 
			
		||||
    scores = nlp.evaluate(train_examples)
 | 
			
		||||
    assert scores["cats_micro_f"] == 1.0
 | 
			
		||||
    assert scores["cats_macro_f"] == 1.0
 | 
			
		||||
    assert "cats_score_desc" in scores
 | 
			
		||||
 | 
			
		||||
    # Make sure that running pipe twice, or comparing to call, always amounts to the same predictions
 | 
			
		||||
    texts = ["Just a sentence.", "I like green eggs.", "I am happy.", "I eat ham."]
 | 
			
		||||
    batch_deps_1 = [doc.cats for doc in nlp.pipe(texts)]
 | 
			
		||||
    batch_deps_2 = [doc.cats for doc in nlp.pipe(texts)]
 | 
			
		||||
    no_batch_deps = [doc.cats for doc in [nlp(text) for text in texts]]
 | 
			
		||||
    assert_equal(batch_deps_1, batch_deps_2)
 | 
			
		||||
    assert_equal(batch_deps_1, no_batch_deps)
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
# fmt: off
 | 
			
		||||
@pytest.mark.parametrize(
 | 
			
		||||
    "textcat_config",
 | 
			
		||||
    "name,train_data,textcat_config",
 | 
			
		||||
    [
 | 
			
		||||
        {"@architectures": "spacy.TextCatBOW.v1", "exclusive_classes": False, "ngram_size": 1, "no_output_layer": False},
 | 
			
		||||
        {"@architectures": "spacy.TextCatBOW.v1", "exclusive_classes": True, "ngram_size": 4, "no_output_layer": False},
 | 
			
		||||
        {"@architectures": "spacy.TextCatBOW.v1", "exclusive_classes": False, "ngram_size": 3, "no_output_layer": True},
 | 
			
		||||
        {"@architectures": "spacy.TextCatBOW.v1", "exclusive_classes": True, "ngram_size": 2, "no_output_layer": True},
 | 
			
		||||
        {"@architectures": "spacy.TextCatEnsemble.v2", "tok2vec": DEFAULT_TOK2VEC_MODEL, "linear_model": {"@architectures": "spacy.TextCatBOW.v1", "exclusive_classes": False, "ngram_size": 1, "no_output_layer": False}},
 | 
			
		||||
        {"@architectures": "spacy.TextCatEnsemble.v2", "tok2vec": DEFAULT_TOK2VEC_MODEL, "linear_model": {"@architectures": "spacy.TextCatBOW.v1", "exclusive_classes": True, "ngram_size": 5, "no_output_layer": False}},
 | 
			
		||||
        {"@architectures": "spacy.TextCatCNN.v1", "tok2vec": DEFAULT_TOK2VEC_MODEL, "exclusive_classes": True},
 | 
			
		||||
        {"@architectures": "spacy.TextCatCNN.v1", "tok2vec": DEFAULT_TOK2VEC_MODEL, "exclusive_classes": False},
 | 
			
		||||
        ("textcat_multilabel", TRAIN_DATA_MULTI_LABEL, {"@architectures": "spacy.TextCatBOW.v1", "exclusive_classes": False, "ngram_size": 1, "no_output_layer": False}),
 | 
			
		||||
        ("textcat", TRAIN_DATA_SINGLE_LABEL, {"@architectures": "spacy.TextCatBOW.v1", "exclusive_classes": True, "ngram_size": 4, "no_output_layer": False}),
 | 
			
		||||
        ("textcat_multilabel", TRAIN_DATA_MULTI_LABEL, {"@architectures": "spacy.TextCatBOW.v1", "exclusive_classes": False, "ngram_size": 3, "no_output_layer": True}),
 | 
			
		||||
        ("textcat", TRAIN_DATA_SINGLE_LABEL, {"@architectures": "spacy.TextCatBOW.v1", "exclusive_classes": True, "ngram_size": 2, "no_output_layer": True}),
 | 
			
		||||
        ("textcat_multilabel", TRAIN_DATA_MULTI_LABEL, {"@architectures": "spacy.TextCatEnsemble.v2", "tok2vec": DEFAULT_TOK2VEC_MODEL, "linear_model": {"@architectures": "spacy.TextCatBOW.v1", "exclusive_classes": False, "ngram_size": 1, "no_output_layer": False}}),
 | 
			
		||||
        ("textcat", TRAIN_DATA_SINGLE_LABEL, {"@architectures": "spacy.TextCatEnsemble.v2", "tok2vec": DEFAULT_TOK2VEC_MODEL, "linear_model": {"@architectures": "spacy.TextCatBOW.v1", "exclusive_classes": True, "ngram_size": 5, "no_output_layer": False}}),
 | 
			
		||||
        ("textcat", TRAIN_DATA_SINGLE_LABEL, {"@architectures": "spacy.TextCatCNN.v1", "tok2vec": DEFAULT_TOK2VEC_MODEL, "exclusive_classes": True}),
 | 
			
		||||
        ("textcat_multilabel", TRAIN_DATA_MULTI_LABEL, {"@architectures": "spacy.TextCatCNN.v1", "tok2vec": DEFAULT_TOK2VEC_MODEL, "exclusive_classes": False}),
 | 
			
		||||
    ],
 | 
			
		||||
)
 | 
			
		||||
# fmt: on
 | 
			
		||||
def test_textcat_configs(textcat_config):
 | 
			
		||||
def test_textcat_configs(name, train_data, textcat_config):
 | 
			
		||||
    pipe_config = {"model": textcat_config}
 | 
			
		||||
    nlp = English()
 | 
			
		||||
    textcat = nlp.add_pipe("textcat", config=pipe_config)
 | 
			
		||||
    textcat = nlp.add_pipe(name, config=pipe_config)
 | 
			
		||||
    train_examples = []
 | 
			
		||||
    for text, annotations in TRAIN_DATA:
 | 
			
		||||
    for text, annotations in train_data:
 | 
			
		||||
        train_examples.append(Example.from_dict(nlp.make_doc(text), annotations))
 | 
			
		||||
        for label, value in annotations.get("cats").items():
 | 
			
		||||
            textcat.add_label(label)
 | 
			
		||||
| 
						 | 
				
			
			@ -264,15 +354,24 @@ def test_textcat_configs(textcat_config):
 | 
			
		|||
def test_positive_class():
 | 
			
		||||
    nlp = English()
 | 
			
		||||
    textcat = nlp.add_pipe("textcat")
 | 
			
		||||
    get_examples = make_get_examples(nlp)
 | 
			
		||||
    get_examples = make_get_examples_single_label(nlp)
 | 
			
		||||
    textcat.initialize(get_examples, labels=["POS", "NEG"], positive_label="POS")
 | 
			
		||||
    assert textcat.labels == ("POS", "NEG")
 | 
			
		||||
    assert textcat.cfg["positive_label"] == "POS"
 | 
			
		||||
 | 
			
		||||
    textcat_multilabel = nlp.add_pipe("textcat_multilabel")
 | 
			
		||||
    get_examples = make_get_examples_multi_label(nlp)
 | 
			
		||||
    with pytest.raises(TypeError):
 | 
			
		||||
        textcat_multilabel.initialize(get_examples, labels=["POS", "NEG"], positive_label="POS")
 | 
			
		||||
    textcat_multilabel.initialize(get_examples, labels=["FICTION", "DRAMA"])
 | 
			
		||||
    assert textcat_multilabel.labels == ("FICTION", "DRAMA")
 | 
			
		||||
    assert "positive_label" not in textcat_multilabel.cfg
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
def test_positive_class_not_present():
 | 
			
		||||
    nlp = English()
 | 
			
		||||
    textcat = nlp.add_pipe("textcat")
 | 
			
		||||
    get_examples = make_get_examples(nlp)
 | 
			
		||||
    get_examples = make_get_examples_single_label(nlp)
 | 
			
		||||
    with pytest.raises(ValueError):
 | 
			
		||||
        textcat.initialize(get_examples, labels=["SOME", "THING"], positive_label="POS")
 | 
			
		||||
 | 
			
		||||
| 
						 | 
				
			
			@ -280,11 +379,9 @@ def test_positive_class_not_present():
 | 
			
		|||
def test_positive_class_not_binary():
 | 
			
		||||
    nlp = English()
 | 
			
		||||
    textcat = nlp.add_pipe("textcat")
 | 
			
		||||
    get_examples = make_get_examples(nlp)
 | 
			
		||||
    get_examples = make_get_examples_multi_label(nlp)
 | 
			
		||||
    with pytest.raises(ValueError):
 | 
			
		||||
        textcat.initialize(
 | 
			
		||||
            get_examples, labels=["SOME", "THING", "POS"], positive_label="POS"
 | 
			
		||||
        )
 | 
			
		||||
        textcat.initialize(get_examples, labels=["SOME", "THING", "POS"], positive_label="POS")
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
def test_textcat_evaluation():
 | 
			
		||||
| 
						 | 
				
			
			
 | 
			
		|||
| 
						 | 
				
			
			@ -2,8 +2,11 @@ import pytest
 | 
			
		|||
from thinc.api import Config, fix_random_seed
 | 
			
		||||
 | 
			
		||||
from spacy.lang.en import English
 | 
			
		||||
from spacy.pipeline.textcat import default_model_config, bow_model_config
 | 
			
		||||
from spacy.pipeline.textcat import cnn_model_config
 | 
			
		||||
from spacy.pipeline.textcat import single_label_default_config, single_label_bow_config
 | 
			
		||||
from spacy.pipeline.textcat import single_label_cnn_config
 | 
			
		||||
from spacy.pipeline.textcat_multilabel import multi_label_default_config
 | 
			
		||||
from spacy.pipeline.textcat_multilabel import multi_label_bow_config
 | 
			
		||||
from spacy.pipeline.textcat_multilabel import multi_label_cnn_config
 | 
			
		||||
from spacy.tokens import Span
 | 
			
		||||
from spacy import displacy
 | 
			
		||||
from spacy.pipeline import merge_entities
 | 
			
		||||
| 
						 | 
				
			
			@ -11,7 +14,15 @@ from spacy.training import Example
 | 
			
		|||
 | 
			
		||||
 | 
			
		||||
@pytest.mark.parametrize(
 | 
			
		||||
    "textcat_config", [default_model_config, bow_model_config, cnn_model_config]
 | 
			
		||||
    "textcat_config",
 | 
			
		||||
    [
 | 
			
		||||
        single_label_default_config,
 | 
			
		||||
        single_label_bow_config,
 | 
			
		||||
        single_label_cnn_config,
 | 
			
		||||
        multi_label_default_config,
 | 
			
		||||
        multi_label_bow_config,
 | 
			
		||||
        multi_label_cnn_config,
 | 
			
		||||
    ],
 | 
			
		||||
)
 | 
			
		||||
def test_issue5551(textcat_config):
 | 
			
		||||
    """Test that after fixing the random seed, the results of the pipeline are truly identical"""
 | 
			
		||||
| 
						 | 
				
			
			
 | 
			
		|||
| 
						 | 
				
			
			@ -4,7 +4,7 @@ from spacy.pipeline import Tagger, DependencyParser, EntityRecognizer
 | 
			
		|||
from spacy.pipeline import TextCategorizer, SentenceRecognizer, TrainablePipe
 | 
			
		||||
from spacy.pipeline.dep_parser import DEFAULT_PARSER_MODEL
 | 
			
		||||
from spacy.pipeline.tagger import DEFAULT_TAGGER_MODEL
 | 
			
		||||
from spacy.pipeline.textcat import DEFAULT_TEXTCAT_MODEL
 | 
			
		||||
from spacy.pipeline.textcat import DEFAULT_SINGLE_TEXTCAT_MODEL
 | 
			
		||||
from spacy.pipeline.senter import DEFAULT_SENTER_MODEL
 | 
			
		||||
from spacy.lang.en import English
 | 
			
		||||
from thinc.api import Linear
 | 
			
		||||
| 
						 | 
				
			
			@ -188,7 +188,7 @@ def test_serialize_tagger_strings(en_vocab, de_vocab, taggers):
 | 
			
		|||
 | 
			
		||||
def test_serialize_textcat_empty(en_vocab):
 | 
			
		||||
    # See issue #1105
 | 
			
		||||
    cfg = {"model": DEFAULT_TEXTCAT_MODEL}
 | 
			
		||||
    cfg = {"model": DEFAULT_SINGLE_TEXTCAT_MODEL}
 | 
			
		||||
    model = registry.resolve(cfg, validate=True)["model"]
 | 
			
		||||
    textcat = TextCategorizer(en_vocab, model, threshold=0.5)
 | 
			
		||||
    textcat.to_bytes(exclude=["vocab"])
 | 
			
		||||
| 
						 | 
				
			
			
 | 
			
		|||
| 
						 | 
				
			
			@ -51,7 +51,7 @@ def test_readers():
 | 
			
		|||
    for example in train_corpus(nlp):
 | 
			
		||||
        nlp.update([example], sgd=optimizer)
 | 
			
		||||
    scores = nlp.evaluate(list(dev_corpus(nlp)))
 | 
			
		||||
    assert scores["cats_score"] == 0.0
 | 
			
		||||
    assert scores["cats_macro_auc"] == 0.0
 | 
			
		||||
    # ensure the pipeline runs
 | 
			
		||||
    doc = nlp("Quick test")
 | 
			
		||||
    assert doc.cats
 | 
			
		||||
| 
						 | 
				
			
			
 | 
			
		|||
| 
						 | 
				
			
			@ -94,7 +94,7 @@ Defines the `nlp` object, its tokenizer and
 | 
			
		|||
>
 | 
			
		||||
> [components.textcat.model]
 | 
			
		||||
> @architectures = "spacy.TextCatBOW.v1"
 | 
			
		||||
> exclusive_classes = false
 | 
			
		||||
> exclusive_classes = true
 | 
			
		||||
> ngram_size = 1
 | 
			
		||||
> no_output_layer = false
 | 
			
		||||
> ```
 | 
			
		||||
| 
						 | 
				
			
			
 | 
			
		|||
							
								
								
									
										454
									
								
								website/docs/api/multilabel_textcategorizer.md
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										454
									
								
								website/docs/api/multilabel_textcategorizer.md
									
									
									
									
									
										Normal file
									
								
							| 
						 | 
				
			
			@ -0,0 +1,454 @@
 | 
			
		|||
---
 | 
			
		||||
title: Multi-label TextCategorizer
 | 
			
		||||
tag: class
 | 
			
		||||
source: spacy/pipeline/textcat_multilabel.py
 | 
			
		||||
new: 3
 | 
			
		||||
teaser: 'Pipeline component for multi-label text classification'
 | 
			
		||||
api_base_class: /api/pipe
 | 
			
		||||
api_string_name: textcat_multilabel
 | 
			
		||||
api_trainable: true
 | 
			
		||||
---
 | 
			
		||||
 | 
			
		||||
The text categorizer predicts **categories over a whole document**. It 
 | 
			
		||||
learns non-mutually exclusive labels, which means that zero or more labels 
 | 
			
		||||
may be true per document.
 | 
			
		||||
 | 
			
		||||
## Config and implementation {#config}
 | 
			
		||||
 | 
			
		||||
The default config is defined by the pipeline component factory and describes
 | 
			
		||||
how the component should be configured. You can override its settings via the
 | 
			
		||||
`config` argument on [`nlp.add_pipe`](/api/language#add_pipe) or in your
 | 
			
		||||
[`config.cfg` for training](/usage/training#config). See the
 | 
			
		||||
[model architectures](/api/architectures) documentation for details on the
 | 
			
		||||
architectures and their arguments and hyperparameters.
 | 
			
		||||
 | 
			
		||||
> #### Example
 | 
			
		||||
>
 | 
			
		||||
> ```python
 | 
			
		||||
> from spacy.pipeline.textcat_multilabel import DEFAULT_MULTI_TEXTCAT_MODEL
 | 
			
		||||
> config = {
 | 
			
		||||
>    "threshold": 0.5,
 | 
			
		||||
>    "model": DEFAULT_MULTI_TEXTCAT_MODEL,
 | 
			
		||||
> }
 | 
			
		||||
> nlp.add_pipe("textcat_multilabel", config=config)
 | 
			
		||||
> ```
 | 
			
		||||
 | 
			
		||||
| Setting     | Description                                                                                                                                                      |
 | 
			
		||||
| ----------- | ---------------------------------------------------------------------------------------------------------------------------------------------------------------- |
 | 
			
		||||
| `threshold` | Cutoff to consider a prediction "positive", relevant when printing accuracy results. ~~float~~                                                                   |
 | 
			
		||||
| `model`     | A model instance that predicts scores for each category. Defaults to [TextCatEnsemble](/api/architectures#TextCatEnsemble). ~~Model[List[Doc], List[Floats2d]]~~ |
 | 
			
		||||
 | 
			
		||||
```python
 | 
			
		||||
%%GITHUB_SPACY/spacy/pipeline/textcat_multilabel.py
 | 
			
		||||
```
 | 
			
		||||
 | 
			
		||||
## MultiLabel_TextCategorizer.\_\_init\_\_ {#init tag="method"}
 | 
			
		||||
 | 
			
		||||
> #### Example
 | 
			
		||||
>
 | 
			
		||||
> ```python
 | 
			
		||||
> # Construction via add_pipe with default model
 | 
			
		||||
> textcat = nlp.add_pipe("textcat_multilabel")
 | 
			
		||||
>
 | 
			
		||||
> # Construction via add_pipe with custom model
 | 
			
		||||
> config = {"model": {"@architectures": "my_textcat"}}
 | 
			
		||||
> parser = nlp.add_pipe("textcat_multilabel", config=config)
 | 
			
		||||
>
 | 
			
		||||
> # Construction from class
 | 
			
		||||
> from spacy.pipeline import MultiLabel_TextCategorizer
 | 
			
		||||
> textcat = MultiLabel_TextCategorizer(nlp.vocab, model, threshold=0.5)
 | 
			
		||||
> ```
 | 
			
		||||
 | 
			
		||||
Create a new pipeline instance. In your application, you would normally use a
 | 
			
		||||
shortcut for this and instantiate the component using its string name and
 | 
			
		||||
[`nlp.add_pipe`](/api/language#create_pipe).
 | 
			
		||||
 | 
			
		||||
| Name           | Description                                                                                                                |
 | 
			
		||||
| -------------- | -------------------------------------------------------------------------------------------------------------------------- |
 | 
			
		||||
| `vocab`        | The shared vocabulary. ~~Vocab~~                                                                                           |
 | 
			
		||||
| `model`        | The Thinc [`Model`](https://thinc.ai/docs/api-model) powering the pipeline component. ~~Model[List[Doc], List[Floats2d]]~~ |
 | 
			
		||||
| `name`         | String name of the component instance. Used to add entries to the `losses` during training. ~~str~~                        |
 | 
			
		||||
| _keyword-only_ |                                                                                                                            |
 | 
			
		||||
| `threshold`    | Cutoff to consider a prediction "positive", relevant when printing accuracy results. ~~float~~                             |
 | 
			
		||||
 | 
			
		||||
## MultiLabel_TextCategorizer.\_\_call\_\_ {#call tag="method"}
 | 
			
		||||
 | 
			
		||||
Apply the pipe to one document. The document is modified in place, and returned.
 | 
			
		||||
This usually happens under the hood when the `nlp` object is called on a text
 | 
			
		||||
and all pipeline components are applied to the `Doc` in order. Both
 | 
			
		||||
[`__call__`](/api/multilabel_textcategorizer#call) and [`pipe`](/api/multilabel_textcategorizer#pipe)
 | 
			
		||||
delegate to the [`predict`](/api/multilabel_textcategorizer#predict) and
 | 
			
		||||
[`set_annotations`](/api/multilabel_textcategorizer#set_annotations) methods.
 | 
			
		||||
 | 
			
		||||
> #### Example
 | 
			
		||||
>
 | 
			
		||||
> ```python
 | 
			
		||||
> doc = nlp("This is a sentence.")
 | 
			
		||||
> textcat = nlp.add_pipe("textcat_multilabel")
 | 
			
		||||
> # This usually happens under the hood
 | 
			
		||||
> processed = textcat(doc)
 | 
			
		||||
> ```
 | 
			
		||||
 | 
			
		||||
| Name        | Description                      |
 | 
			
		||||
| ----------- | -------------------------------- |
 | 
			
		||||
| `doc`       | The document to process. ~~Doc~~ |
 | 
			
		||||
| **RETURNS** | The processed document. ~~Doc~~  |
 | 
			
		||||
 | 
			
		||||
## MultiLabel_TextCategorizer.pipe {#pipe tag="method"}
 | 
			
		||||
 | 
			
		||||
Apply the pipe to a stream of documents. This usually happens under the hood
 | 
			
		||||
when the `nlp` object is called on a text and all pipeline components are
 | 
			
		||||
applied to the `Doc` in order. Both [`__call__`](/api/multilabel_textcategorizer#call) and
 | 
			
		||||
[`pipe`](/api/multilabel_textcategorizer#pipe) delegate to the
 | 
			
		||||
[`predict`](/api/multilabel_textcategorizer#predict) and
 | 
			
		||||
[`set_annotations`](/api/multilabel_textcategorizer#set_annotations) methods.
 | 
			
		||||
 | 
			
		||||
> #### Example
 | 
			
		||||
>
 | 
			
		||||
> ```python
 | 
			
		||||
> textcat = nlp.add_pipe("textcat_multilabel")
 | 
			
		||||
> for doc in textcat.pipe(docs, batch_size=50):
 | 
			
		||||
>     pass
 | 
			
		||||
> ```
 | 
			
		||||
 | 
			
		||||
| Name           | Description                                                   |
 | 
			
		||||
| -------------- | ------------------------------------------------------------- |
 | 
			
		||||
| `stream`       | A stream of documents. ~~Iterable[Doc]~~                      |
 | 
			
		||||
| _keyword-only_ |                                                               |
 | 
			
		||||
| `batch_size`   | The number of documents to buffer. Defaults to `128`. ~~int~~ |
 | 
			
		||||
| **YIELDS**     | The processed documents in order. ~~Doc~~                     |
 | 
			
		||||
 | 
			
		||||
## MultiLabel_TextCategorizer.initialize {#initialize tag="method" new="3"}
 | 
			
		||||
 | 
			
		||||
Initialize the component for training. `get_examples` should be a function that
 | 
			
		||||
returns an iterable of [`Example`](/api/example) objects. The data examples are
 | 
			
		||||
used to **initialize the model** of the component and can either be the full
 | 
			
		||||
training data or a representative sample. Initialization includes validating the
 | 
			
		||||
network,
 | 
			
		||||
[inferring missing shapes](https://thinc.ai/docs/usage-models#validation) and
 | 
			
		||||
setting up the label scheme based on the data. This method is typically called
 | 
			
		||||
by [`Language.initialize`](/api/language#initialize) and lets you customize
 | 
			
		||||
arguments it receives via the
 | 
			
		||||
[`[initialize.components]`](/api/data-formats#config-initialize) block in the
 | 
			
		||||
config.
 | 
			
		||||
 | 
			
		||||
<Infobox variant="warning" title="Changed in v3.0" id="begin_training">
 | 
			
		||||
 | 
			
		||||
This method was previously called `begin_training`.
 | 
			
		||||
 | 
			
		||||
</Infobox>
 | 
			
		||||
 | 
			
		||||
> #### Example
 | 
			
		||||
>
 | 
			
		||||
> ```python
 | 
			
		||||
> textcat = nlp.add_pipe("textcat_multilabel")
 | 
			
		||||
> textcat.initialize(lambda: [], nlp=nlp)
 | 
			
		||||
> ```
 | 
			
		||||
>
 | 
			
		||||
> ```ini
 | 
			
		||||
> ### config.cfg
 | 
			
		||||
> [initialize.components.textcat_multilabel]
 | 
			
		||||
>
 | 
			
		||||
> [initialize.components.textcat_multilabel.labels]
 | 
			
		||||
> @readers = "spacy.read_labels.v1"
 | 
			
		||||
> path = "corpus/labels/textcat.json
 | 
			
		||||
> ```
 | 
			
		||||
 | 
			
		||||
| Name             | Description                                                                                                                                                                                                                                                                                                                                                                                                |
 | 
			
		||||
| ---------------- | ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
 | 
			
		||||
| `get_examples`   | Function that returns gold-standard annotations in the form of [`Example`](/api/example) objects. ~~Callable[[], Iterable[Example]]~~                                                                                                                                                                                                                                                                      |
 | 
			
		||||
| _keyword-only_   |                                                                                                                                                                                                                                                                                                                                                                                                            |
 | 
			
		||||
| `nlp`            | The current `nlp` object. Defaults to `None`. ~~Optional[Language]~~                                                                                                                                                                                                                                                                                                                                       |
 | 
			
		||||
| `labels`         | The label information to add to the component, as provided by the [`label_data`](#label_data) property after initialization. To generate a reusable JSON file from your data, you should run the [`init labels`](/api/cli#init-labels) command. If no labels are provided, the `get_examples` callback is used to extract the labels from the data, which may be a lot slower. ~~Optional[Iterable[str]]~~ |
 | 
			
		||||
 | 
			
		||||
## MultiLabel_TextCategorizer.predict {#predict tag="method"}
 | 
			
		||||
 | 
			
		||||
Apply the component's model to a batch of [`Doc`](/api/doc) objects without
 | 
			
		||||
modifying them.
 | 
			
		||||
 | 
			
		||||
> #### Example
 | 
			
		||||
>
 | 
			
		||||
> ```python
 | 
			
		||||
> textcat = nlp.add_pipe("textcat_multilabel")
 | 
			
		||||
> scores = textcat.predict([doc1, doc2])
 | 
			
		||||
> ```
 | 
			
		||||
 | 
			
		||||
| Name        | Description                                 |
 | 
			
		||||
| ----------- | ------------------------------------------- |
 | 
			
		||||
| `docs`      | The documents to predict. ~~Iterable[Doc]~~ |
 | 
			
		||||
| **RETURNS** | The model's prediction for each document.   |
 | 
			
		||||
 | 
			
		||||
## MultiLabel_TextCategorizer.set_annotations {#set_annotations tag="method"}
 | 
			
		||||
 | 
			
		||||
Modify a batch of [`Doc`](/api/doc) objects using pre-computed scores.
 | 
			
		||||
 | 
			
		||||
> #### Example
 | 
			
		||||
>
 | 
			
		||||
> ```python
 | 
			
		||||
> textcat = nlp.add_pipe("textcat_multilabel")
 | 
			
		||||
> scores = textcat.predict(docs)
 | 
			
		||||
> textcat.set_annotations(docs, scores)
 | 
			
		||||
> ```
 | 
			
		||||
 | 
			
		||||
| Name     | Description                                               |
 | 
			
		||||
| -------- | --------------------------------------------------------- |
 | 
			
		||||
| `docs`   | The documents to modify. ~~Iterable[Doc]~~                |
 | 
			
		||||
| `scores` | The scores to set, produced by `MultiLabel_TextCategorizer.predict`. |
 | 
			
		||||
 | 
			
		||||
## MultiLabel_TextCategorizer.update {#update tag="method"}
 | 
			
		||||
 | 
			
		||||
Learn from a batch of [`Example`](/api/example) objects containing the
 | 
			
		||||
predictions and gold-standard annotations, and update the component's model.
 | 
			
		||||
Delegates to [`predict`](/api/multilabel_textcategorizer#predict) and
 | 
			
		||||
[`get_loss`](/api/multilabel_textcategorizer#get_loss).
 | 
			
		||||
 | 
			
		||||
> #### Example
 | 
			
		||||
>
 | 
			
		||||
> ```python
 | 
			
		||||
> textcat = nlp.add_pipe("textcat_multilabel")
 | 
			
		||||
> optimizer = nlp.initialize()
 | 
			
		||||
> losses = textcat.update(examples, sgd=optimizer)
 | 
			
		||||
> ```
 | 
			
		||||
 | 
			
		||||
| Name              | Description                                                                                                                        |
 | 
			
		||||
| ----------------- | ---------------------------------------------------------------------------------------------------------------------------------- |
 | 
			
		||||
| `examples`        | A batch of [`Example`](/api/example) objects to learn from. ~~Iterable[Example]~~                                                  |
 | 
			
		||||
| _keyword-only_    |                                                                                                                                    |
 | 
			
		||||
| `drop`            | The dropout rate. ~~float~~                                                                                                        |
 | 
			
		||||
| `set_annotations` | Whether or not to update the `Example` objects with the predictions, delegating to [`set_annotations`](#set_annotations). ~~bool~~ |
 | 
			
		||||
| `sgd`             | An optimizer. Will be created via [`create_optimizer`](#create_optimizer) if not set. ~~Optional[Optimizer]~~                      |
 | 
			
		||||
| `losses`          | Optional record of the loss during training. Updated using the component name as the key. ~~Optional[Dict[str, float]]~~           |
 | 
			
		||||
| **RETURNS**       | The updated `losses` dictionary. ~~Dict[str, float]~~                                                                              |
 | 
			
		||||
 | 
			
		||||
## MultiLabel_TextCategorizer.rehearse {#rehearse tag="method,experimental" new="3"}
 | 
			
		||||
 | 
			
		||||
Perform a "rehearsal" update from a batch of data. Rehearsal updates teach the
 | 
			
		||||
current model to make predictions similar to an initial model to try to address
 | 
			
		||||
the "catastrophic forgetting" problem. This feature is experimental.
 | 
			
		||||
 | 
			
		||||
> #### Example
 | 
			
		||||
>
 | 
			
		||||
> ```python
 | 
			
		||||
> textcat = nlp.add_pipe("textcat_multilabel")
 | 
			
		||||
> optimizer = nlp.resume_training()
 | 
			
		||||
> losses = textcat.rehearse(examples, sgd=optimizer)
 | 
			
		||||
> ```
 | 
			
		||||
 | 
			
		||||
| Name           | Description                                                                                                              |
 | 
			
		||||
| -------------- | ------------------------------------------------------------------------------------------------------------------------ |
 | 
			
		||||
| `examples`     | A batch of [`Example`](/api/example) objects to learn from. ~~Iterable[Example]~~                                        |
 | 
			
		||||
| _keyword-only_ |                                                                                                                          |
 | 
			
		||||
| `drop`         | The dropout rate. ~~float~~                                                                                              |
 | 
			
		||||
| `sgd`          | An optimizer. Will be created via [`create_optimizer`](#create_optimizer) if not set. ~~Optional[Optimizer]~~            |
 | 
			
		||||
| `losses`       | Optional record of the loss during training. Updated using the component name as the key. ~~Optional[Dict[str, float]]~~ |
 | 
			
		||||
| **RETURNS**    | The updated `losses` dictionary. ~~Dict[str, float]~~                                                                    |
 | 
			
		||||
 | 
			
		||||
## MultiLabel_TextCategorizer.get_loss {#get_loss tag="method"}
 | 
			
		||||
 | 
			
		||||
Find the loss and gradient of loss for the batch of documents and their
 | 
			
		||||
predicted scores.
 | 
			
		||||
 | 
			
		||||
> #### Example
 | 
			
		||||
>
 | 
			
		||||
> ```python
 | 
			
		||||
> textcat = nlp.add_pipe("textcat_multilabel")
 | 
			
		||||
> scores = textcat.predict([eg.predicted for eg in examples])
 | 
			
		||||
> loss, d_loss = textcat.get_loss(examples, scores)
 | 
			
		||||
> ```
 | 
			
		||||
 | 
			
		||||
| Name        | Description                                                                 |
 | 
			
		||||
| ----------- | --------------------------------------------------------------------------- |
 | 
			
		||||
| `examples`  | The batch of examples. ~~Iterable[Example]~~                                |
 | 
			
		||||
| `scores`    | Scores representing the model's predictions.                                |
 | 
			
		||||
| **RETURNS** | The loss and the gradient, i.e. `(loss, gradient)`. ~~Tuple[float, float]~~ |
 | 
			
		||||
 | 
			
		||||
## MultiLabel_TextCategorizer.score {#score tag="method" new="3"}
 | 
			
		||||
 | 
			
		||||
Score a batch of examples.
 | 
			
		||||
 | 
			
		||||
> #### Example
 | 
			
		||||
>
 | 
			
		||||
> ```python
 | 
			
		||||
> scores = textcat.score(examples)
 | 
			
		||||
> ```
 | 
			
		||||
 | 
			
		||||
| Name             | Description                                                                                                          |
 | 
			
		||||
| ---------------- | -------------------------------------------------------------------------------------------------------------------- |
 | 
			
		||||
| `examples`       | The examples to score. ~~Iterable[Example]~~                                                                         |
 | 
			
		||||
| _keyword-only_   |                                                                                                                      |
 | 
			
		||||
| **RETURNS**      | The scores, produced by [`Scorer.score_cats`](/api/scorer#score_cats). ~~Dict[str, Union[float, Dict[str, float]]]~~ |
 | 
			
		||||
 | 
			
		||||
## MultiLabel_TextCategorizer.create_optimizer {#create_optimizer tag="method"}
 | 
			
		||||
 | 
			
		||||
Create an optimizer for the pipeline component.
 | 
			
		||||
 | 
			
		||||
> #### Example
 | 
			
		||||
>
 | 
			
		||||
> ```python
 | 
			
		||||
> textcat = nlp.add_pipe("textcat")
 | 
			
		||||
> optimizer = textcat.create_optimizer()
 | 
			
		||||
> ```
 | 
			
		||||
 | 
			
		||||
| Name        | Description                  |
 | 
			
		||||
| ----------- | ---------------------------- |
 | 
			
		||||
| **RETURNS** | The optimizer. ~~Optimizer~~ |
 | 
			
		||||
 | 
			
		||||
## MultiLabel_TextCategorizer.use_params {#use_params tag="method, contextmanager"}
 | 
			
		||||
 | 
			
		||||
Modify the pipe's model to use the given parameter values.
 | 
			
		||||
 | 
			
		||||
> #### Example
 | 
			
		||||
>
 | 
			
		||||
> ```python
 | 
			
		||||
> textcat = nlp.add_pipe("textcat")
 | 
			
		||||
> with textcat.use_params(optimizer.averages):
 | 
			
		||||
>     textcat.to_disk("/best_model")
 | 
			
		||||
> ```
 | 
			
		||||
 | 
			
		||||
| Name     | Description                                        |
 | 
			
		||||
| -------- | -------------------------------------------------- |
 | 
			
		||||
| `params` | The parameter values to use in the model. ~~dict~~ |
 | 
			
		||||
 | 
			
		||||
## MultiLabel_TextCategorizer.add_label {#add_label tag="method"}
 | 
			
		||||
 | 
			
		||||
Add a new label to the pipe. Raises an error if the output dimension is already
 | 
			
		||||
set, or if the model has already been fully [initialized](#initialize). Note
 | 
			
		||||
that you don't have to call this method if you provide a **representative data
 | 
			
		||||
sample** to the [`initialize`](#initialize) method. In this case, all labels
 | 
			
		||||
found in the sample will be automatically added to the model, and the output
 | 
			
		||||
dimension will be [inferred](/usage/layers-architectures#thinc-shape-inference)
 | 
			
		||||
automatically.
 | 
			
		||||
 | 
			
		||||
> #### Example
 | 
			
		||||
>
 | 
			
		||||
> ```python
 | 
			
		||||
> textcat = nlp.add_pipe("textcat")
 | 
			
		||||
> textcat.add_label("MY_LABEL")
 | 
			
		||||
> ```
 | 
			
		||||
 | 
			
		||||
| Name        | Description                                                 |
 | 
			
		||||
| ----------- | ----------------------------------------------------------- |
 | 
			
		||||
| `label`     | The label to add. ~~str~~                                   |
 | 
			
		||||
| **RETURNS** | `0` if the label is already present, otherwise `1`. ~~int~~ |
 | 
			
		||||
 | 
			
		||||
## MultiLabel_TextCategorizer.to_disk {#to_disk tag="method"}
 | 
			
		||||
 | 
			
		||||
Serialize the pipe to disk.
 | 
			
		||||
 | 
			
		||||
> #### Example
 | 
			
		||||
>
 | 
			
		||||
> ```python
 | 
			
		||||
> textcat = nlp.add_pipe("textcat")
 | 
			
		||||
> textcat.to_disk("/path/to/textcat")
 | 
			
		||||
> ```
 | 
			
		||||
 | 
			
		||||
| Name           | Description                                                                                                                                |
 | 
			
		||||
| -------------- | ------------------------------------------------------------------------------------------------------------------------------------------ |
 | 
			
		||||
| `path`         | A path to a directory, which will be created if it doesn't exist. Paths may be either strings or `Path`-like objects. ~~Union[str, Path]~~ |
 | 
			
		||||
| _keyword-only_ |                                                                                                                                            |
 | 
			
		||||
| `exclude`      | String names of [serialization fields](#serialization-fields) to exclude. ~~Iterable[str]~~                                                |
 | 
			
		||||
 | 
			
		||||
## MultiLabel_TextCategorizer.from_disk {#from_disk tag="method"}
 | 
			
		||||
 | 
			
		||||
Load the pipe from disk. Modifies the object in place and returns it.
 | 
			
		||||
 | 
			
		||||
> #### Example
 | 
			
		||||
>
 | 
			
		||||
> ```python
 | 
			
		||||
> textcat = nlp.add_pipe("textcat")
 | 
			
		||||
> textcat.from_disk("/path/to/textcat")
 | 
			
		||||
> ```
 | 
			
		||||
 | 
			
		||||
| Name           | Description                                                                                     |
 | 
			
		||||
| -------------- | ----------------------------------------------------------------------------------------------- |
 | 
			
		||||
| `path`         | A path to a directory. Paths may be either strings or `Path`-like objects. ~~Union[str, Path]~~ |
 | 
			
		||||
| _keyword-only_ |                                                                                                 |
 | 
			
		||||
| `exclude`      | String names of [serialization fields](#serialization-fields) to exclude. ~~Iterable[str]~~     |
 | 
			
		||||
| **RETURNS**    | The modified `MultiLabel_TextCategorizer` object. ~~MultiLabel_TextCategorizer~~                                      |
 | 
			
		||||
 | 
			
		||||
## MultiLabel_TextCategorizer.to_bytes {#to_bytes tag="method"}
 | 
			
		||||
 | 
			
		||||
> #### Example
 | 
			
		||||
>
 | 
			
		||||
> ```python
 | 
			
		||||
> textcat = nlp.add_pipe("textcat")
 | 
			
		||||
> textcat_bytes = textcat.to_bytes()
 | 
			
		||||
> ```
 | 
			
		||||
 | 
			
		||||
Serialize the pipe to a bytestring.
 | 
			
		||||
 | 
			
		||||
| Name           | Description                                                                                 |
 | 
			
		||||
| -------------- | ------------------------------------------------------------------------------------------- |
 | 
			
		||||
| _keyword-only_ |                                                                                             |
 | 
			
		||||
| `exclude`      | String names of [serialization fields](#serialization-fields) to exclude. ~~Iterable[str]~~ |
 | 
			
		||||
| **RETURNS**    | The serialized form of the `MultiLabel_TextCategorizer` object. ~~bytes~~                              |
 | 
			
		||||
 | 
			
		||||
## MultiLabel_TextCategorizer.from_bytes {#from_bytes tag="method"}
 | 
			
		||||
 | 
			
		||||
Load the pipe from a bytestring. Modifies the object in place and returns it.
 | 
			
		||||
 | 
			
		||||
> #### Example
 | 
			
		||||
>
 | 
			
		||||
> ```python
 | 
			
		||||
> textcat_bytes = textcat.to_bytes()
 | 
			
		||||
> textcat = nlp.add_pipe("textcat")
 | 
			
		||||
> textcat.from_bytes(textcat_bytes)
 | 
			
		||||
> ```
 | 
			
		||||
 | 
			
		||||
| Name           | Description                                                                                 |
 | 
			
		||||
| -------------- | ------------------------------------------------------------------------------------------- |
 | 
			
		||||
| `bytes_data`   | The data to load from. ~~bytes~~                                                            |
 | 
			
		||||
| _keyword-only_ |                                                                                             |
 | 
			
		||||
| `exclude`      | String names of [serialization fields](#serialization-fields) to exclude. ~~Iterable[str]~~ |
 | 
			
		||||
| **RETURNS**    | The `MultiLabel_TextCategorizer` object. ~~MultiLabel_TextCategorizer~~                                           |
 | 
			
		||||
 | 
			
		||||
## MultiLabel_TextCategorizer.labels {#labels tag="property"}
 | 
			
		||||
 | 
			
		||||
The labels currently added to the component.
 | 
			
		||||
 | 
			
		||||
> #### Example
 | 
			
		||||
>
 | 
			
		||||
> ```python
 | 
			
		||||
> textcat.add_label("MY_LABEL")
 | 
			
		||||
> assert "MY_LABEL" in textcat.labels
 | 
			
		||||
> ```
 | 
			
		||||
 | 
			
		||||
| Name        | Description                                            |
 | 
			
		||||
| ----------- | ------------------------------------------------------ |
 | 
			
		||||
| **RETURNS** | The labels added to the component. ~~Tuple[str, ...]~~ |
 | 
			
		||||
 | 
			
		||||
## MultiLabel_TextCategorizer.label_data {#label_data tag="property" new="3"}
 | 
			
		||||
 | 
			
		||||
The labels currently added to the component and their internal meta information.
 | 
			
		||||
This is the data generated by [`init labels`](/api/cli#init-labels) and used by
 | 
			
		||||
[`MultiLabel_TextCategorizer.initialize`](/api/multilabel_textcategorizer#initialize) to initialize
 | 
			
		||||
the model with a pre-defined label set.
 | 
			
		||||
 | 
			
		||||
> #### Example
 | 
			
		||||
>
 | 
			
		||||
> ```python
 | 
			
		||||
> labels = textcat.label_data
 | 
			
		||||
> textcat.initialize(lambda: [], nlp=nlp, labels=labels)
 | 
			
		||||
> ```
 | 
			
		||||
 | 
			
		||||
| Name        | Description                                                |
 | 
			
		||||
| ----------- | ---------------------------------------------------------- |
 | 
			
		||||
| **RETURNS** | The label data added to the component. ~~Tuple[str, ...]~~ |
 | 
			
		||||
 | 
			
		||||
## Serialization fields {#serialization-fields}
 | 
			
		||||
 | 
			
		||||
During serialization, spaCy will export several data fields used to restore
 | 
			
		||||
different aspects of the object. If needed, you can exclude them from
 | 
			
		||||
serialization by passing in the string names via the `exclude` argument.
 | 
			
		||||
 | 
			
		||||
> #### Example
 | 
			
		||||
>
 | 
			
		||||
> ```python
 | 
			
		||||
> data = textcat.to_disk("/path", exclude=["vocab"])
 | 
			
		||||
> ```
 | 
			
		||||
 | 
			
		||||
| Name    | Description                                                    |
 | 
			
		||||
| ------- | -------------------------------------------------------------- |
 | 
			
		||||
| `vocab` | The shared [`Vocab`](/api/vocab).                              |
 | 
			
		||||
| `cfg`   | The config file. You usually don't want to exclude this.       |
 | 
			
		||||
| `model` | The binary model data. You usually don't want to exclude this. |
 | 
			
		||||
| 
						 | 
				
			
			@ -3,17 +3,15 @@ title: TextCategorizer
 | 
			
		|||
tag: class
 | 
			
		||||
source: spacy/pipeline/textcat.py
 | 
			
		||||
new: 2
 | 
			
		||||
teaser: 'Pipeline component for text classification'
 | 
			
		||||
teaser: 'Pipeline component for single-label text classification'
 | 
			
		||||
api_base_class: /api/pipe
 | 
			
		||||
api_string_name: textcat
 | 
			
		||||
api_trainable: true
 | 
			
		||||
---
 | 
			
		||||
 | 
			
		||||
The text categorizer predicts **categories over a whole document**. It can learn
 | 
			
		||||
one or more labels, and the labels can be mutually exclusive (i.e. one true
 | 
			
		||||
label per document) or non-mutually exclusive (i.e. zero or more labels may be
 | 
			
		||||
true per document). The multi-label setting is controlled by the model instance
 | 
			
		||||
that's provided.
 | 
			
		||||
one or more labels, and the labels are mutually exclusive - there is exactly one 
 | 
			
		||||
true label per document. 
 | 
			
		||||
 | 
			
		||||
## Config and implementation {#config}
 | 
			
		||||
 | 
			
		||||
| 
						 | 
				
			
			@ -27,10 +25,10 @@ architectures and their arguments and hyperparameters.
 | 
			
		|||
> #### Example
 | 
			
		||||
>
 | 
			
		||||
> ```python
 | 
			
		||||
> from spacy.pipeline.textcat import DEFAULT_TEXTCAT_MODEL
 | 
			
		||||
> from spacy.pipeline.textcat import DEFAULT_SINGLE_TEXTCAT_MODEL
 | 
			
		||||
> config = {
 | 
			
		||||
>    "threshold": 0.5,
 | 
			
		||||
>    "model": DEFAULT_TEXTCAT_MODEL,
 | 
			
		||||
>    "model": DEFAULT_SINGLE_TEXTCAT_MODEL,
 | 
			
		||||
> }
 | 
			
		||||
> nlp.add_pipe("textcat", config=config)
 | 
			
		||||
> ```
 | 
			
		||||
| 
						 | 
				
			
			@ -280,7 +278,6 @@ Score a batch of examples.
 | 
			
		|||
| ---------------- | -------------------------------------------------------------------------------------------------------------------- |
 | 
			
		||||
| `examples`       | The examples to score. ~~Iterable[Example]~~                                                                         |
 | 
			
		||||
| _keyword-only_   |                                                                                                                      |
 | 
			
		||||
| `positive_label` | Optional positive label. ~~Optional[str]~~                                                                           |
 | 
			
		||||
| **RETURNS**      | The scores, produced by [`Scorer.score_cats`](/api/scorer#score_cats). ~~Dict[str, Union[float, Dict[str, float]]]~~ |
 | 
			
		||||
 | 
			
		||||
## TextCategorizer.create_optimizer {#create_optimizer tag="method"}
 | 
			
		||||
| 
						 | 
				
			
			
 | 
			
		|||
| 
						 | 
				
			
			@ -152,7 +152,7 @@ depth = 2
 | 
			
		|||
 | 
			
		||||
[components.textcat.model.linear_model]
 | 
			
		||||
@architectures = "spacy.TextCatBOW.v1"
 | 
			
		||||
exclusive_classes = false
 | 
			
		||||
exclusive_classes = true
 | 
			
		||||
ngram_size = 1
 | 
			
		||||
no_output_layer = false
 | 
			
		||||
```
 | 
			
		||||
| 
						 | 
				
			
			@ -170,7 +170,7 @@ labels = []
 | 
			
		|||
 | 
			
		||||
[components.textcat.model]
 | 
			
		||||
@architectures = "spacy.TextCatBOW.v1"
 | 
			
		||||
exclusive_classes = false
 | 
			
		||||
exclusive_classes = true
 | 
			
		||||
ngram_size = 1
 | 
			
		||||
no_output_layer = false
 | 
			
		||||
nO = null
 | 
			
		||||
| 
						 | 
				
			
			
 | 
			
		|||
		Loading…
	
		Reference in New Issue
	
	Block a user