textcat scoring fix and multi_label docs (#6974)

* add multi-label textcat to menu * add infobox on textcat API * add info to v3 migration guide * small edits * further fixes in doc strings * add infobox to textcat architectures * add textcat_multilabel to overview of built-in components * spelling * fix unrelated warn msg * Add textcat_multilabel to quickstart [ci skip] * remove separate documentation page for multilabel_textcategorizer * small edits * positive label clarification * avoid duplicating information in self.cfg and fix textcat.score * fix multilabel textcat too * revert threshold to storage in cfg * revert threshold stuff for multi-textcat Co-authored-by: Ines Montani <ines@ines.io>
2025-09-17 01:22:37 +03:00 · 2021-03-09 13:04:22 +01:00 · 2021-03-09 13:04:22 +01:00 · 932887b950
commit 932887b950
parent 39de3602e0
10 changed files with 191 additions and 520 deletions
--- a/spacy/cli/download.py
+++ b/spacy/cli/download.py
@ -60,7 +60,7 @@ def download(model: str, direct: bool = False, sdist: bool = False, *pip_args) -
        model_name = model
        if model in OLD_MODEL_SHORTCUTS:
            msg.warn(
-                f"As of spaCy v3.0, shortcuts like '{model}' are deprecated. Please"
+                f"As of spaCy v3.0, shortcuts like '{model}' are deprecated. Please "
                f"use the full pipeline package name '{OLD_MODEL_SHORTCUTS[model]}' instead."
            )
            model_name = OLD_MODEL_SHORTCUTS[model]
--- a/spacy/pipeline/textcat.py
+++ b/spacy/pipeline/textcat.py
@ -88,11 +88,9 @@ subword_features = true
 def make_textcat(
    nlp: Language, name: str, model: Model[List[Doc], List[Floats2d]], threshold: float
 ) -> "TextCategorizer":
-    """Create a TextCategorizer compoment. The text categorizer predicts categories
-    over a whole document. It can learn one or more labels, and the labels can
-    be mutually exclusive (i.e. one true label per doc) or non-mutually exclusive
-    (i.e. zero or more labels may be true per doc). The multi-label setting is
-    controlled by the model instance that's provided.
+    """Create a TextCategorizer component. The text categorizer predicts categories
+    over a whole document. It can learn one or more labels, and the labels are considered
+    to be mutually exclusive (i.e. one true label per doc).

    model (Model[List[Doc], List[Floats2d]]): A model instance that predicts
        scores for each category.
@ -317,9 +315,11 @@ class TextCategorizer(TrainablePipe):
        get_examples (Callable[[], Iterable[Example]]): Function that
            returns a representative sample of gold-standard Example objects.
        nlp (Language): The current nlp object the component is part of.
-        labels: The labels to add to the component, typically generated by the
+        labels (Optional[Iterable[str]]): The labels to add to the component, typically generated by the
            `init labels` command. If no labels are provided, the get_examples
            callback is used to extract the labels from the data.
+        positive_label (Optional[str]): The positive label for a binary task with exclusive classes,
+            `None` otherwise and by default.

        DOCS: https://spacy.io/api/textcategorizer#initialize
        """
@ -358,13 +358,13 @@ class TextCategorizer(TrainablePipe):
        """
        validate_examples(examples, "TextCategorizer.score")
        self._validate_categories(examples)
+        kwargs.setdefault("threshold", self.cfg["threshold"])
+        kwargs.setdefault("positive_label", self.cfg["positive_label"])
        return Scorer.score_cats(
            examples,
            "cats",
            labels=self.labels,
            multi_label=False,
-            positive_label=self.cfg["positive_label"],
-            threshold=self.cfg["threshold"],
            **kwargs,
        )

--- a/spacy/pipeline/textcat_multilabel.py
+++ b/spacy/pipeline/textcat_multilabel.py
@ -88,11 +88,10 @@ subword_features = true
 def make_multilabel_textcat(
    nlp: Language, name: str, model: Model[List[Doc], List[Floats2d]], threshold: float
 ) -> "TextCategorizer":
-    """Create a TextCategorizer compoment. The text categorizer predicts categories
-    over a whole document. It can learn one or more labels, and the labels can
-    be mutually exclusive (i.e. one true label per doc) or non-mutually exclusive
-    (i.e. zero or more labels may be true per doc). The multi-label setting is
-    controlled by the model instance that's provided.
+    """Create a TextCategorizer component. The text categorizer predicts categories
+    over a whole document. It can learn one or more labels, and the labels are considered
+    to be non-mutually exclusive, which means that there can be zero or more labels
+    per doc.

    model (Model[List[Doc], List[Floats2d]]): A model instance that predicts
        scores for each category.
@ -104,7 +103,7 @@ def make_multilabel_textcat(
 class MultiLabel_TextCategorizer(TextCategorizer):
    """Pipeline component for multi-label text classification.

-    DOCS: https://spacy.io/api/multilabel_textcategorizer
+    DOCS: https://spacy.io/api/textcategorizer
    """

    def __init__(
@ -123,7 +122,7 @@ class MultiLabel_TextCategorizer(TextCategorizer):
            losses during training.
        threshold (float): Cutoff to consider a prediction "positive".

-        DOCS: https://spacy.io/api/multilabel_textcategorizer#init
+        DOCS: https://spacy.io/api/textcategorizer#init
        """
        self.vocab = vocab
        self.model = model
@ -149,7 +148,7 @@ class MultiLabel_TextCategorizer(TextCategorizer):
            `init labels` command. If no labels are provided, the get_examples
            callback is used to extract the labels from the data.

-        DOCS: https://spacy.io/api/multilabel_textcategorizer#initialize
+        DOCS: https://spacy.io/api/textcategorizer#initialize
        """
        validate_get_examples(get_examples, "MultiLabel_TextCategorizer.initialize")
        if labels is None:
@ -173,15 +172,15 @@ class MultiLabel_TextCategorizer(TextCategorizer):
        examples (Iterable[Example]): The examples to score.
        RETURNS (Dict[str, Any]): The scores, produced by Scorer.score_cats.

-        DOCS: https://spacy.io/api/multilabel_textcategorizer#score
+        DOCS: https://spacy.io/api/textcategorizer#score
        """
        validate_examples(examples, "MultiLabel_TextCategorizer.score")
+        kwargs.setdefault("threshold", self.cfg["threshold"])
        return Scorer.score_cats(
            examples,
            "cats",
            labels=self.labels,
            multi_label=True,
-            threshold=self.cfg["threshold"],
            **kwargs,
        )

--- a/spacy/tests/pipeline/test_textcat.py
+++ b/spacy/tests/pipeline/test_textcat.py
@ -370,3 +370,51 @@ def test_textcat_evaluation():

    assert scores["cats_micro_p"] == 4 / 5
    assert scores["cats_micro_r"] == 4 / 6
+
+
+def test_textcat_threshold():
+    # Ensure the scorer can be called with a different threshold
+    nlp = English()
+    nlp.add_pipe("textcat")
+
+    train_examples = []
+    for text, annotations in TRAIN_DATA_SINGLE_LABEL:
+        train_examples.append(Example.from_dict(nlp.make_doc(text), annotations))
+    nlp.initialize(get_examples=lambda: train_examples)
+
+    # score the model (it's not actually trained but that doesn't matter)
+    scores = nlp.evaluate(train_examples)
+    assert 0 <= scores["cats_score"] <= 1
+
+    scores = nlp.evaluate(train_examples, scorer_cfg={"threshold": 1.0})
+    assert scores["cats_f_per_type"]["POSITIVE"]["r"] == 0
+
+    scores = nlp.evaluate(train_examples, scorer_cfg={"threshold": 0})
+    macro_f = scores["cats_score"]
+    assert scores["cats_f_per_type"]["POSITIVE"]["r"] == 1.0
+
+    scores = nlp.evaluate(train_examples, scorer_cfg={"threshold": 0, "positive_label": "POSITIVE"})
+    pos_f = scores["cats_score"]
+    assert scores["cats_f_per_type"]["POSITIVE"]["r"] == 1.0
+    assert pos_f > macro_f
+
+
+def test_textcat_multi_threshold():
+    # Ensure the scorer can be called with a different threshold
+    nlp = English()
+    nlp.add_pipe("textcat_multilabel")
+
+    train_examples = []
+    for text, annotations in TRAIN_DATA_SINGLE_LABEL:
+        train_examples.append(Example.from_dict(nlp.make_doc(text), annotations))
+    nlp.initialize(get_examples=lambda: train_examples)
+
+    # score the model (it's not actually trained but that doesn't matter)
+    scores = nlp.evaluate(train_examples)
+    assert 0 <= scores["cats_score"] <= 1
+
+    scores = nlp.evaluate(train_examples, scorer_cfg={"threshold": 1.0})
+    assert scores["cats_f_per_type"]["POSITIVE"]["r"] == 0
+
+    scores = nlp.evaluate(train_examples, scorer_cfg={"threshold": 0})
+    assert scores["cats_f_per_type"]["POSITIVE"]["r"] == 1.0
--- a/website/docs/api/architectures.md
+++ b/website/docs/api/architectures.md
@ -589,6 +589,17 @@ several different built-in architectures. It is recommended to experiment with
 different architectures and settings to determine what works best on your
 specific data and challenge.

+<Infobox title="Single-label vs. multi-label classification" variant="warning">
+
+When the architecture for a text classification challenge contains a setting for
+`exclusive_classes`, it is important to use the correct value for the correct
+pipeline component. The `textcat` component should always be used for
+single-label use-cases where `exclusive_classes = true`, while the
+`textcat_multilabel` should be used for multi-label settings with
+`exclusive_classes = false`.
+
+</Infobox>
+
 ### spacy.TextCatEnsemble.v2 {#TextCatEnsemble}

 > #### Example Config
--- a/website/docs/api/multilabel_textcategorizer.md
+++ b/website/docs/api/multilabel_textcategorizer.md
@ -1,453 +0,0 @@
---
-title: Multi-label TextCategorizer
-tag: class
-source: spacy/pipeline/textcat_multilabel.py
-new: 3
-teaser: 'Pipeline component for multi-label text classification'
-api_base_class: /api/pipe
-api_string_name: textcat_multilabel
-api_trainable: true
---
-
-The text categorizer predicts **categories over a whole document**. It 
-learns non-mutually exclusive labels, which means that zero or more labels 
-may be true per document.
-
-## Config and implementation {#config}
-
-The default config is defined by the pipeline component factory and describes
-how the component should be configured. You can override its settings via the
-`config` argument on [`nlp.add_pipe`](/api/language#add_pipe) or in your
-[`config.cfg` for training](/usage/training#config). See the
-[model architectures](/api/architectures) documentation for details on the
-architectures and their arguments and hyperparameters.
-
-> #### Example
->
-> ```python
-> from spacy.pipeline.textcat_multilabel import DEFAULT_MULTI_TEXTCAT_MODEL
-> config = {
->    "threshold": 0.5,
->    "model": DEFAULT_MULTI_TEXTCAT_MODEL,
-> }
-> nlp.add_pipe("textcat_multilabel", config=config)
-> ```
-
-| Setting     | Description                                                                                                                                                      |
-| ----------- | ---------------------------------------------------------------------------------------------------------------------------------------------------------------- |
-| `threshold` | Cutoff to consider a prediction "positive", relevant when printing accuracy results. ~~float~~                                                                   |
-| `model`     | A model instance that predicts scores for each category. Defaults to [TextCatEnsemble](/api/architectures#TextCatEnsemble). ~~Model[List[Doc], List[Floats2d]]~~ |
-
-```python
-%%GITHUB_SPACY/spacy/pipeline/textcat_multilabel.py
-```
-
-## MultiLabel_TextCategorizer.\_\_init\_\_ {#init tag="method"}
-
-> #### Example
->
-> ```python
-> # Construction via add_pipe with default model
-> textcat = nlp.add_pipe("textcat_multilabel")
->
-> # Construction via add_pipe with custom model
-> config = {"model": {"@architectures": "my_textcat"}}
-> parser = nlp.add_pipe("textcat_multilabel", config=config)
->
-> # Construction from class
-> from spacy.pipeline import MultiLabel_TextCategorizer
-> textcat = MultiLabel_TextCategorizer(nlp.vocab, model, threshold=0.5)
-> ```
-
-Create a new pipeline instance. In your application, you would normally use a
-shortcut for this and instantiate the component using its string name and
-[`nlp.add_pipe`](/api/language#create_pipe).
-
-| Name           | Description                                                                                                                |
-| -------------- | -------------------------------------------------------------------------------------------------------------------------- |
-| `vocab`        | The shared vocabulary. ~~Vocab~~                                                                                           |
-| `model`        | The Thinc [`Model`](https://thinc.ai/docs/api-model) powering the pipeline component. ~~Model[List[Doc], List[Floats2d]]~~ |
-| `name`         | String name of the component instance. Used to add entries to the `losses` during training. ~~str~~                        |
-| _keyword-only_ |                                                                                                                            |
-| `threshold`    | Cutoff to consider a prediction "positive", relevant when printing accuracy results. ~~float~~                             |
-
-## MultiLabel_TextCategorizer.\_\_call\_\_ {#call tag="method"}
-
-Apply the pipe to one document. The document is modified in place, and returned.
-This usually happens under the hood when the `nlp` object is called on a text
-and all pipeline components are applied to the `Doc` in order. Both
-[`__call__`](/api/multilabel_textcategorizer#call) and [`pipe`](/api/multilabel_textcategorizer#pipe)
-delegate to the [`predict`](/api/multilabel_textcategorizer#predict) and
-[`set_annotations`](/api/multilabel_textcategorizer#set_annotations) methods.
-
-> #### Example
->
-> ```python
-> doc = nlp("This is a sentence.")
-> textcat = nlp.add_pipe("textcat_multilabel")
-> # This usually happens under the hood
-> processed = textcat(doc)
-> ```
-
-| Name        | Description                      |
-| ----------- | -------------------------------- |
-| `doc`       | The document to process. ~~Doc~~ |
-| **RETURNS** | The processed document. ~~Doc~~  |
-
-## MultiLabel_TextCategorizer.pipe {#pipe tag="method"}
-
-Apply the pipe to a stream of documents. This usually happens under the hood
-when the `nlp` object is called on a text and all pipeline components are
-applied to the `Doc` in order. Both [`__call__`](/api/multilabel_textcategorizer#call) and
-[`pipe`](/api/multilabel_textcategorizer#pipe) delegate to the
-[`predict`](/api/multilabel_textcategorizer#predict) and
-[`set_annotations`](/api/multilabel_textcategorizer#set_annotations) methods.
-
-> #### Example
->
-> ```python
-> textcat = nlp.add_pipe("textcat_multilabel")
-> for doc in textcat.pipe(docs, batch_size=50):
->     pass
-> ```
-
-| Name           | Description                                                   |
-| -------------- | ------------------------------------------------------------- |
-| `stream`       | A stream of documents. ~~Iterable[Doc]~~                      |
-| _keyword-only_ |                                                               |
-| `batch_size`   | The number of documents to buffer. Defaults to `128`. ~~int~~ |
-| **YIELDS**     | The processed documents in order. ~~Doc~~                     |
-
-## MultiLabel_TextCategorizer.initialize {#initialize tag="method" new="3"}
-
-Initialize the component for training. `get_examples` should be a function that
-returns an iterable of [`Example`](/api/example) objects. The data examples are
-used to **initialize the model** of the component and can either be the full
-training data or a representative sample. Initialization includes validating the
-network,
-[inferring missing shapes](https://thinc.ai/docs/usage-models#validation) and
-setting up the label scheme based on the data. This method is typically called
-by [`Language.initialize`](/api/language#initialize) and lets you customize
-arguments it receives via the
-[`[initialize.components]`](/api/data-formats#config-initialize) block in the
-config.
-
-<Infobox variant="warning" title="Changed in v3.0" id="begin_training">
-
-This method was previously called `begin_training`.
-
-</Infobox>
-
-> #### Example
->
-> ```python
-> textcat = nlp.add_pipe("textcat_multilabel")
-> textcat.initialize(lambda: [], nlp=nlp)
-> ```
->
-> ```ini
-> ### config.cfg
-> [initialize.components.textcat_multilabel]
->
-> [initialize.components.textcat_multilabel.labels]
-> @readers = "spacy.read_labels.v1"
-> path = "corpus/labels/textcat.json
-> ```
-
-| Name             | Description                                                                                                                                                                                                                                                                                                                                                                                                |
-| ---------------- | ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
-| `get_examples`   | Function that returns gold-standard annotations in the form of [`Example`](/api/example) objects. ~~Callable[[], Iterable[Example]]~~                                                                                                                                                                                                                                                                      |
-| _keyword-only_   |                                                                                                                                                                                                                                                                                                                                                                                                            |
-| `nlp`            | The current `nlp` object. Defaults to `None`. ~~Optional[Language]~~                                                                                                                                                                                                                                                                                                                                       |
-| `labels`         | The label information to add to the component, as provided by the [`label_data`](#label_data) property after initialization. To generate a reusable JSON file from your data, you should run the [`init labels`](/api/cli#init-labels) command. If no labels are provided, the `get_examples` callback is used to extract the labels from the data, which may be a lot slower. ~~Optional[Iterable[str]]~~ |
-
-## MultiLabel_TextCategorizer.predict {#predict tag="method"}
-
-Apply the component's model to a batch of [`Doc`](/api/doc) objects without
-modifying them.
-
-> #### Example
->
-> ```python
-> textcat = nlp.add_pipe("textcat_multilabel")
-> scores = textcat.predict([doc1, doc2])
-> ```
-
-| Name        | Description                                 |
-| ----------- | ------------------------------------------- |
-| `docs`      | The documents to predict. ~~Iterable[Doc]~~ |
-| **RETURNS** | The model's prediction for each document.   |
-
-## MultiLabel_TextCategorizer.set_annotations {#set_annotations tag="method"}
-
-Modify a batch of [`Doc`](/api/doc) objects using pre-computed scores.
-
-> #### Example
->
-> ```python
-> textcat = nlp.add_pipe("textcat_multilabel")
-> scores = textcat.predict(docs)
-> textcat.set_annotations(docs, scores)
-> ```
-
-| Name     | Description                                               |
-| -------- | --------------------------------------------------------- |
-| `docs`   | The documents to modify. ~~Iterable[Doc]~~                |
-| `scores` | The scores to set, produced by `MultiLabel_TextCategorizer.predict`. |
-
-## MultiLabel_TextCategorizer.update {#update tag="method"}
-
-Learn from a batch of [`Example`](/api/example) objects containing the
-predictions and gold-standard annotations, and update the component's model.
-Delegates to [`predict`](/api/multilabel_textcategorizer#predict) and
-[`get_loss`](/api/multilabel_textcategorizer#get_loss).
-
-> #### Example
->
-> ```python
-> textcat = nlp.add_pipe("textcat_multilabel")
-> optimizer = nlp.initialize()
-> losses = textcat.update(examples, sgd=optimizer)
-> ```
-
-| Name              | Description                                                                                                                        |
-| ----------------- | ---------------------------------------------------------------------------------------------------------------------------------- |
-| `examples`        | A batch of [`Example`](/api/example) objects to learn from. ~~Iterable[Example]~~                                                  |
-| _keyword-only_    |                                                                                                                                    |
-| `drop`            | The dropout rate. ~~float~~                                                                                                        |
-| `sgd`             | An optimizer. Will be created via [`create_optimizer`](#create_optimizer) if not set. ~~Optional[Optimizer]~~                      |
-| `losses`          | Optional record of the loss during training. Updated using the component name as the key. ~~Optional[Dict[str, float]]~~           |
-| **RETURNS**       | The updated `losses` dictionary. ~~Dict[str, float]~~                                                                              |
-
-## MultiLabel_TextCategorizer.rehearse {#rehearse tag="method,experimental" new="3"}
-
-Perform a "rehearsal" update from a batch of data. Rehearsal updates teach the
-current model to make predictions similar to an initial model to try to address
-the "catastrophic forgetting" problem. This feature is experimental.
-
-> #### Example
->
-> ```python
-> textcat = nlp.add_pipe("textcat_multilabel")
-> optimizer = nlp.resume_training()
-> losses = textcat.rehearse(examples, sgd=optimizer)
-> ```
-
-| Name           | Description                                                                                                              |
-| -------------- | ------------------------------------------------------------------------------------------------------------------------ |
-| `examples`     | A batch of [`Example`](/api/example) objects to learn from. ~~Iterable[Example]~~                                        |
-| _keyword-only_ |                                                                                                                          |
-| `drop`         | The dropout rate. ~~float~~                                                                                              |
-| `sgd`          | An optimizer. Will be created via [`create_optimizer`](#create_optimizer) if not set. ~~Optional[Optimizer]~~            |
-| `losses`       | Optional record of the loss during training. Updated using the component name as the key. ~~Optional[Dict[str, float]]~~ |
-| **RETURNS**    | The updated `losses` dictionary. ~~Dict[str, float]~~                                                                    |
-
-## MultiLabel_TextCategorizer.get_loss {#get_loss tag="method"}
-
-Find the loss and gradient of loss for the batch of documents and their
-predicted scores.
-
-> #### Example
->
-> ```python
-> textcat = nlp.add_pipe("textcat_multilabel")
-> scores = textcat.predict([eg.predicted for eg in examples])
-> loss, d_loss = textcat.get_loss(examples, scores)
-> ```
-
-| Name        | Description                                                                 |
-| ----------- | --------------------------------------------------------------------------- |
-| `examples`  | The batch of examples. ~~Iterable[Example]~~                                |
-| `scores`    | Scores representing the model's predictions.                                |
-| **RETURNS** | The loss and the gradient, i.e. `(loss, gradient)`. ~~Tuple[float, float]~~ |
-
-## MultiLabel_TextCategorizer.score {#score tag="method" new="3"}
-
-Score a batch of examples.
-
-> #### Example
->
-> ```python
-> scores = textcat.score(examples)
-> ```
-
-| Name             | Description                                                                                                          |
-| ---------------- | -------------------------------------------------------------------------------------------------------------------- |
-| `examples`       | The examples to score. ~~Iterable[Example]~~                                                                         |
-| _keyword-only_   |                                                                                                                      |
-| **RETURNS**      | The scores, produced by [`Scorer.score_cats`](/api/scorer#score_cats). ~~Dict[str, Union[float, Dict[str, float]]]~~ |
-
-## MultiLabel_TextCategorizer.create_optimizer {#create_optimizer tag="method"}
-
-Create an optimizer for the pipeline component.
-
-> #### Example
->
-> ```python
-> textcat = nlp.add_pipe("textcat")
-> optimizer = textcat.create_optimizer()
-> ```
-
-| Name        | Description                  |
-| ----------- | ---------------------------- |
-| **RETURNS** | The optimizer. ~~Optimizer~~ |
-
-## MultiLabel_TextCategorizer.use_params {#use_params tag="method, contextmanager"}
-
-Modify the pipe's model to use the given parameter values.
-
-> #### Example
->
-> ```python
-> textcat = nlp.add_pipe("textcat")
-> with textcat.use_params(optimizer.averages):
->     textcat.to_disk("/best_model")
-> ```
-
-| Name     | Description                                        |
-| -------- | -------------------------------------------------- |
-| `params` | The parameter values to use in the model. ~~dict~~ |
-
-## MultiLabel_TextCategorizer.add_label {#add_label tag="method"}
-
-Add a new label to the pipe. Raises an error if the output dimension is already
-set, or if the model has already been fully [initialized](#initialize). Note
-that you don't have to call this method if you provide a **representative data
-sample** to the [`initialize`](#initialize) method. In this case, all labels
-found in the sample will be automatically added to the model, and the output
-dimension will be [inferred](/usage/layers-architectures#thinc-shape-inference)
-automatically.
-
-> #### Example
->
-> ```python
-> textcat = nlp.add_pipe("textcat")
-> textcat.add_label("MY_LABEL")
-> ```
-
-| Name        | Description                                                 |
-| ----------- | ----------------------------------------------------------- |
-| `label`     | The label to add. ~~str~~                                   |
-| **RETURNS** | `0` if the label is already present, otherwise `1`. ~~int~~ |
-
-## MultiLabel_TextCategorizer.to_disk {#to_disk tag="method"}
-
-Serialize the pipe to disk.
-
-> #### Example
->
-> ```python
-> textcat = nlp.add_pipe("textcat")
-> textcat.to_disk("/path/to/textcat")
-> ```
-
-| Name           | Description                                                                                                                                |
-| -------------- | ------------------------------------------------------------------------------------------------------------------------------------------ |
-| `path`         | A path to a directory, which will be created if it doesn't exist. Paths may be either strings or `Path`-like objects. ~~Union[str, Path]~~ |
-| _keyword-only_ |                                                                                                                                            |
-| `exclude`      | String names of [serialization fields](#serialization-fields) to exclude. ~~Iterable[str]~~                                                |
-
-## MultiLabel_TextCategorizer.from_disk {#from_disk tag="method"}
-
-Load the pipe from disk. Modifies the object in place and returns it.
-
-> #### Example
->
-> ```python
-> textcat = nlp.add_pipe("textcat")
-> textcat.from_disk("/path/to/textcat")
-> ```
-
-| Name           | Description                                                                                     |
-| -------------- | ----------------------------------------------------------------------------------------------- |
-| `path`         | A path to a directory. Paths may be either strings or `Path`-like objects. ~~Union[str, Path]~~ |
-| _keyword-only_ |                                                                                                 |
-| `exclude`      | String names of [serialization fields](#serialization-fields) to exclude. ~~Iterable[str]~~     |
-| **RETURNS**    | The modified `MultiLabel_TextCategorizer` object. ~~MultiLabel_TextCategorizer~~                                      |
-
-## MultiLabel_TextCategorizer.to_bytes {#to_bytes tag="method"}
-
-> #### Example
->
-> ```python
-> textcat = nlp.add_pipe("textcat")
-> textcat_bytes = textcat.to_bytes()
-> ```
-
-Serialize the pipe to a bytestring.
-
-| Name           | Description                                                                                 |
-| -------------- | ------------------------------------------------------------------------------------------- |
-| _keyword-only_ |                                                                                             |
-| `exclude`      | String names of [serialization fields](#serialization-fields) to exclude. ~~Iterable[str]~~ |
-| **RETURNS**    | The serialized form of the `MultiLabel_TextCategorizer` object. ~~bytes~~                              |
-
-## MultiLabel_TextCategorizer.from_bytes {#from_bytes tag="method"}
-
-Load the pipe from a bytestring. Modifies the object in place and returns it.
-
-> #### Example
->
-> ```python
-> textcat_bytes = textcat.to_bytes()
-> textcat = nlp.add_pipe("textcat")
-> textcat.from_bytes(textcat_bytes)
-> ```
-
-| Name           | Description                                                                                 |
-| -------------- | ------------------------------------------------------------------------------------------- |
-| `bytes_data`   | The data to load from. ~~bytes~~                                                            |
-| _keyword-only_ |                                                                                             |
-| `exclude`      | String names of [serialization fields](#serialization-fields) to exclude. ~~Iterable[str]~~ |
-| **RETURNS**    | The `MultiLabel_TextCategorizer` object. ~~MultiLabel_TextCategorizer~~                                           |
-
-## MultiLabel_TextCategorizer.labels {#labels tag="property"}
-
-The labels currently added to the component.
-
-> #### Example
->
-> ```python
-> textcat.add_label("MY_LABEL")
-> assert "MY_LABEL" in textcat.labels
-> ```
-
-| Name        | Description                                            |
-| ----------- | ------------------------------------------------------ |
-| **RETURNS** | The labels added to the component. ~~Tuple[str, ...]~~ |
-
-## MultiLabel_TextCategorizer.label_data {#label_data tag="property" new="3"}
-
-The labels currently added to the component and their internal meta information.
-This is the data generated by [`init labels`](/api/cli#init-labels) and used by
-[`MultiLabel_TextCategorizer.initialize`](/api/multilabel_textcategorizer#initialize) to initialize
-the model with a pre-defined label set.
-
-> #### Example
->
-> ```python
-> labels = textcat.label_data
-> textcat.initialize(lambda: [], nlp=nlp, labels=labels)
-> ```
-
-| Name        | Description                                                |
-| ----------- | ---------------------------------------------------------- |
-| **RETURNS** | The label data added to the component. ~~Tuple[str, ...]~~ |
-
-## Serialization fields {#serialization-fields}
-
-During serialization, spaCy will export several data fields used to restore
-different aspects of the object. If needed, you can exclude them from
-serialization by passing in the string names via the `exclude` argument.
-
-> #### Example
->
-> ```python
-> data = textcat.to_disk("/path", exclude=["vocab"])
-> ```
-
-| Name    | Description                                                    |
-| ------- | -------------------------------------------------------------- |
-| `vocab` | The shared [`Vocab`](/api/vocab).                              |
-| `cfg`   | The config file. You usually don't want to exclude this.       |
-| `model` | The binary model data. You usually don't want to exclude this. |
--- a/website/docs/api/textcategorizer.md
+++ b/website/docs/api/textcategorizer.md
@ -3,15 +3,30 @@ title: TextCategorizer
 tag: class
 source: spacy/pipeline/textcat.py
 new: 2
-teaser: 'Pipeline component for single-label text classification'
+teaser: 'Pipeline component for text classification'
 api_base_class: /api/pipe
 api_string_name: textcat
 api_trainable: true
 ---

-The text categorizer predicts **categories over a whole document**. It can learn
-one or more labels, and the labels are mutually exclusive - there is exactly one 
-true label per document. 
+The text categorizer predicts **categories over a whole document** and comes in
+two flavours: `textcat` and `textcat_multilabel`. When you need to predict
+exactly one true label per document, use the `textcat` component, which has
+mutually exclusive labels. If you want to perform multi-label classification and
+predict zero, one or more labels per document, use the `textcat_multilabel`
+component instead.
+
+Both components are documented on this page.
+
+<Infobox title="Migration from v2" variant="warning">
+
+In spaCy v2, the `textcat` component could also perform **multi-label
+classification**, and even used this setting by default. Since v3.0, the
+component `textcat_multilabel` should be used for multi-label classification
+instead. The `textcat` component is now used for mutually exclusive classes
+only.
+
+</Infobox>

 ## Config and implementation {#config}

@ -22,7 +37,7 @@ how the component should be configured. You can override its settings via the
 [model architectures](/api/architectures) documentation for details on the
 architectures and their arguments and hyperparameters.

-> #### Example
+> #### Example (textcat)
 >
 > ```python
 > from spacy.pipeline.textcat import DEFAULT_SINGLE_TEXTCAT_MODEL
@ -33,6 +48,17 @@ architectures and their arguments and hyperparameters.
 > nlp.add_pipe("textcat", config=config)
 > ```

+> #### Example (textcat_multilabel)
+>
+> ```python
+> from spacy.pipeline.textcat_multilabel import DEFAULT_MULTI_TEXTCAT_MODEL
+> config = {
+>    "threshold": 0.5,
+>    "model": DEFAULT_MULTI_TEXTCAT_MODEL,
+> }
+> nlp.add_pipe("textcat_multilabel", config=config)
+> ```
+
 | Setting     | Description                                                                                                                                                      |
 | ----------- | ---------------------------------------------------------------------------------------------------------------------------------------------------------------- |
 | `threshold` | Cutoff to consider a prediction "positive", relevant when printing accuracy results. ~~float~~                                                                   |
@ -48,6 +74,7 @@ architectures and their arguments and hyperparameters.
 >
 > ```python
 > # Construction via add_pipe with default model
+> # Use 'textcat_multilabel' for multi-label classification
 > textcat = nlp.add_pipe("textcat")
 >
 > # Construction via add_pipe with custom model
@ -55,6 +82,7 @@ architectures and their arguments and hyperparameters.
 > parser = nlp.add_pipe("textcat", config=config)
 >
 > # Construction from class
+> # Use 'MultiLabel_TextCategorizer' for multi-label classification
 > from spacy.pipeline import TextCategorizer
 > textcat = TextCategorizer(nlp.vocab, model, threshold=0.5)
 > ```
@ -161,7 +189,7 @@ This method was previously called `begin_training`.
 | _keyword-only_   |                                                                                                                                                                                                                                                                                                                                                                                                            |
 | `nlp`            | The current `nlp` object. Defaults to `None`. ~~Optional[Language]~~                                                                                                                                                                                                                                                                                                                                       |
 | `labels`         | The label information to add to the component, as provided by the [`label_data`](#label_data) property after initialization. To generate a reusable JSON file from your data, you should run the [`init labels`](/api/cli#init-labels) command. If no labels are provided, the `get_examples` callback is used to extract the labels from the data, which may be a lot slower. ~~Optional[Iterable[str]]~~ |
-| `positive_label` | The positive label for a binary task with exclusive classes, None otherwise and by default. ~~Optional[str]~~                                                                                                                                                                                                                                                                                              |
+| `positive_label` | The positive label for a binary task with exclusive classes, `None` otherwise and by default. This parameter is not available when using the `textcat_multilabel` component. ~~Optional[str]~~                                                                                                                                                                                                             |

 ## TextCategorizer.predict {#predict tag="method"}

@ -213,7 +241,7 @@ Delegates to [`predict`](/api/textcategorizer#predict) and
 > ```

 | Name           | Description                                                                                                              |
-| ----------------- | ---------------------------------------------------------------------------------------------------------------------------------- |
+| -------------- | ------------------------------------------------------------------------------------------------------------------------ |
 | `examples`     | A batch of [`Example`](/api/example) objects to learn from. ~~Iterable[Example]~~                                        |
 | _keyword-only_ |                                                                                                                          |
 | `drop`         | The dropout rate. ~~float~~                                                                                              |
@ -274,7 +302,7 @@ Score a batch of examples.
 > ```

 | Name           | Description                                                                                                          |
-| ---------------- | -------------------------------------------------------------------------------------------------------------------- |
+| -------------- | -------------------------------------------------------------------------------------------------------------------- |
 | `examples`     | The examples to score. ~~Iterable[Example]~~                                                                         |
 | _keyword-only_ |                                                                                                                      |
 | **RETURNS**    | The scores, produced by [`Scorer.score_cats`](/api/scorer#score_cats). ~~Dict[str, Union[float, Dict[str, float]]]~~ |
--- a/website/docs/usage/processing-pipelines.md
+++ b/website/docs/usage/processing-pipelines.md
@ -224,13 +224,14 @@ available pipeline components and component functions.
 > ```

 | String name          | Component                                            | Description                                                                               |
-| ----------------- | ----------------------------------------------- | ----------------------------------------------------------------------------------------- |
+| -------------------- | ---------------------------------------------------- | ----------------------------------------------------------------------------------------- |
 | `tagger`             | [`Tagger`](/api/tagger)                              | Assign part-of-speech-tags.                                                               |
 | `parser`             | [`DependencyParser`](/api/dependencyparser)          | Assign dependency labels.                                                                 |
 | `ner`                | [`EntityRecognizer`](/api/entityrecognizer)          | Assign named entities.                                                                    |
 | `entity_linker`      | [`EntityLinker`](/api/entitylinker)                  | Assign knowledge base IDs to named entities. Should be added after the entity recognizer. |
 | `entity_ruler`       | [`EntityRuler`](/api/entityruler)                    | Assign named entities based on pattern rules and dictionaries.                            |
-| `textcat`         | [`TextCategorizer`](/api/textcategorizer)       | Assign text categories.                                                                   |
+| `textcat`            | [`TextCategorizer`](/api/textcategorizer)            | Assign text categories: exactly one category is predicted per document.                   |
+| `textcat_multilabel` | [`MultiLabel_TextCategorizer`](/api/textcategorizer) | Assign text categories in a multi-label setting: zero, one or more labels per document.   |
 | `lemmatizer`         | [`Lemmatizer`](/api/lemmatizer)                      | Assign base forms to words.                                                               |
 | `morphologizer`      | [`Morphologizer`](/api/morphologizer)                | Assign morphological features and coarse-grained POS tags.                                |
 | `attribute_ruler`    | [`AttributeRuler`](/api/attributeruler)              | Assign token attribute mappings and rule-based exceptions.                                |
@ -400,8 +401,8 @@ vectors available – otherwise, it won't be able to make the same predictions.
 > ```
 >
 > By default, sourced components will be updated with your data during training.
-> If you want to preserve the component as-is, you can "freeze" it if the pipeline 
-> is not using a shared `Tok2Vec` layer:
+> If you want to preserve the component as-is, you can "freeze" it if the
+> pipeline is not using a shared `Tok2Vec` layer:
 >
 > ```ini
 > [training]
@ -1244,7 +1245,7 @@ labels = []
 # the argument "model"
 [components.textcat.model]
@architectures = "spacy.TextCatBOW.v1"
-exclusive_classes = false
+exclusive_classes = true
 ngram_size = 1
 no_output_layer = false

--- a/website/docs/usage/v3.md
+++ b/website/docs/usage/v3.md
@ -321,13 +321,14 @@ add to your pipeline and customize for your use case:
 > ```

 | Name                                                  | Description                                                                                                                                                                                                             |
-| ----------------------------------------------- | ----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
+| ----------------------------------------------------- | ----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
 | [`SentenceRecognizer`](/api/sentencerecognizer)       | Trainable component for sentence segmentation.                                                                                                                                                                          |
 | [`Morphologizer`](/api/morphologizer)                 | Trainable component to predict morphological features.                                                                                                                                                                  |
 | [`Lemmatizer`](/api/lemmatizer)                       | Standalone component for rule-based and lookup lemmatization.                                                                                                                                                           |
 | [`AttributeRuler`](/api/attributeruler)               | Component for setting token attributes using match patterns.                                                                                                                                                            |
 | [`Transformer`](/api/transformer)                     | Component for using [transformer models](/usage/embeddings-transformers) in your pipeline, accessing outputs and aligning tokens. Provided via [`spacy-transformers`](https://github.com/explosion/spacy-transformers). |
 | [`TrainablePipe`](/api/pipe)                          | Base class for trainable pipeline components.                                                                                                                                                                           |
+| [`Multi-label TextCategorizer`](/api/textcategorizer) | Trainable component for multi-label text classification.                                                                                                                                                                |

 <Infobox title="Details & Documentation" emoji="📖" list>

@ -592,6 +593,10 @@ Note that spaCy v3.0 now requires **Python 3.6+**.
 - Various keyword arguments across functions and methods are now explicitly
  declared as **keyword-only** arguments. Those arguments are documented
  accordingly across the API reference using the <Tag>keyword-only</Tag> tag.
+- The `textcat` pipeline component is now only applicable for classification of
+  mutually exclusive classes - i.e. one predicted class per input sentence or
+  document. To perform multi-label classification, use the new
+  `textcat_multilabel` component instead.

 ### Removed or renamed API {#incompat-removed}

--- a/website/src/widgets/quickstart-training.js
+++ b/website/src/widgets/quickstart-training.js
@ -9,6 +9,7 @@ import { htmlToReact } from '../components/util'
 const DEFAULT_LANG = 'en'
 const DEFAULT_HARDWARE = 'cpu'
 const DEFAULT_OPT = 'efficiency'
+const DEFAULT_TEXTCAT_EXCLUSIVE = true
 const COMPONENTS = ['tagger', 'parser', 'ner', 'textcat']
 const COMMENT = `# This is an auto-generated partial config. To use it with 'spacy train'
 # you can run spacy init fill-config to auto-fill all default settings:
@ -27,6 +28,19 @@ const DATA = [
        options: COMPONENTS.map(id => ({ id, title: id })),
        multiple: true,
    },
+    {
+        id: 'textcat',
+        title: 'Text Classification',
+        multiple: true,
+        options: [
+            {
+                id: 'exclusive',
+                title: 'exclusive categories',
+                checked: DEFAULT_TEXTCAT_EXCLUSIVE,
+                help: 'only one label can apply',
+            },
+        ],
+    },
    {
        id: 'hardware',
        title: 'Hardware',
@ -49,14 +63,28 @@ const DATA = [

 export default function QuickstartTraining({ id, title, download = 'base_config.cfg' }) {
    const [lang, setLang] = useState(DEFAULT_LANG)
+    const [_components, _setComponents] = useState([])
    const [components, setComponents] = useState([])
    const [[hardware], setHardware] = useState([DEFAULT_HARDWARE])
    const [[optimize], setOptimize] = useState([DEFAULT_OPT])
+    const [textcatExclusive, setTextcatExclusive] = useState(DEFAULT_TEXTCAT_EXCLUSIVE)
+
+    function updateComponents(value, isExclusive) {
+        _setComponents(value)
+        const updated = value.map(c => (c === 'textcat' && !isExclusive ? 'textcat_multilabel' : c))
+        setComponents(updated)
+    }
+
    const setters = {
        lang: setLang,
-        components: setComponents,
+        components: v => updateComponents(v, textcatExclusive),
        hardware: setHardware,
        optimize: setOptimize,
+        textcat: v => {
+            const isExclusive = v.includes('exclusive')
+            setTextcatExclusive(isExclusive)
+            updateComponents(_components, isExclusive)
+        },
    }
    const reco = GENERATOR_DATA[lang] || GENERATOR_DATA.__default__
    const content = generator({
@ -78,20 +106,24 @@ export default function QuickstartTraining({ id, title, download = 'base_config.
        <StaticQuery
            query={query}
            render={({ site }) => {
+                let data = DATA
                const langs = site.siteMetadata.languages
-                DATA[0].dropdown = langs
+                data[0].dropdown = langs
                    .map(({ name, code }) => ({
                        id: code,
                        title: name,
                    }))
                    .sort((a, b) => a.title.localeCompare(b.title))
+                if (!_components.includes('textcat')) {
+                    data = data.filter(({ id }) => id !== 'textcat')
+                }
                return (
                    <Quickstart
                        id="quickstart-widget"
                        Container="div"
                        download={download}
                        rawContent={rawContent}
-                        data={DATA}
+                        data={data}
                        title={title}
                        id={id}
                        setters={setters}