From 29906884c5de57c73f5512452a7eb871061fb96c Mon Sep 17 00:00:00 2001
From: Adriane Boyd <adrianeboyd@gmail.com>
Date: Tue, 6 Jul 2021 12:35:22 +0200
Subject: [PATCH] Raise an error for textcat with <2 labels (#8584)

* Raise an error for textcat with <2 labels

Raise an error when a `textcat` component is initialized with fewer than
two labels.

* Add similar note to docs

* Update positive_label description in API docs
---
 spacy/errors.py                      | 5 +++++
 spacy/pipeline/textcat.py            | 2 ++
 spacy/tests/pipeline/test_textcat.py | 6 ++++++
 website/docs/api/textcategorizer.md  | 9 +++++----
 4 files changed, 18 insertions(+), 4 deletions(-)

diff --git a/spacy/errors.py b/spacy/errors.py
index 4c0ccfb8f..2173dd58a 100644
--- a/spacy/errors.py
+++ b/spacy/errors.py
@@ -521,6 +521,11 @@ class Errors:
     E202 = ("Unsupported alignment mode '{mode}'. Supported modes: {modes}.")
 
     # New errors added in v3.x
+    E867 = ("The 'textcat' component requires at least two labels because it "
+            "uses mutually exclusive classes where exactly one label is True "
+            "for each doc. For binary classification tasks, you can use two "
+            "labels with 'textcat' (LABEL / NOT_LABEL) or alternatively, you "
+            "can use the 'textcat_multilabel' component with one label.")
     E868 = ("Found a conflicting gold annotation in a reference document, "
             "with the following char-based span occurring both in the gold ents "
             "as well as in the negative spans: {span}.")
diff --git a/spacy/pipeline/textcat.py b/spacy/pipeline/textcat.py
index 72a6dcd61..0dde5de82 100644
--- a/spacy/pipeline/textcat.py
+++ b/spacy/pipeline/textcat.py
@@ -336,6 +336,8 @@ class TextCategorizer(TrainablePipe):
         else:
             for label in labels:
                 self.add_label(label)
+        if len(self.labels) < 2:
+            raise ValueError(Errors.E867)
         if positive_label is not None:
             if positive_label not in self.labels:
                 err = Errors.E920.format(pos_label=positive_label, labels=self.labels)
diff --git a/spacy/tests/pipeline/test_textcat.py b/spacy/tests/pipeline/test_textcat.py
index fdb44b412..b134b8508 100644
--- a/spacy/tests/pipeline/test_textcat.py
+++ b/spacy/tests/pipeline/test_textcat.py
@@ -108,6 +108,12 @@ def test_label_types(name):
     textcat.add_label("answer")
     with pytest.raises(ValueError):
         textcat.add_label(9)
+    # textcat requires at least two labels
+    if name == "textcat":
+        with pytest.raises(ValueError):
+            nlp.initialize()
+    else:
+        nlp.initialize()
 
 
 @pytest.mark.parametrize("name", ["textcat", "textcat_multilabel"])
diff --git a/website/docs/api/textcategorizer.md b/website/docs/api/textcategorizer.md
index fdd235b85..baa30ae01 100644
--- a/website/docs/api/textcategorizer.md
+++ b/website/docs/api/textcategorizer.md
@@ -10,11 +10,12 @@ api_trainable: true
 ---
 
 The text categorizer predicts **categories over a whole document**. and comes in
-two flavours: `textcat` and `textcat_multilabel`. When you need to predict
+two flavors: `textcat` and `textcat_multilabel`. When you need to predict
 exactly one true label per document, use the `textcat` which has mutually
 exclusive labels. If you want to perform multi-label classification and predict
-zero, one or more labels per document, use the `textcat_multilabel` component
-instead.
+zero, one or more true labels per document, use the `textcat_multilabel`
+component instead. For a binary classification task, you can use `textcat` with
+**two** labels or `textcat_multilabel` with **one** label.
 
 Both components are documented on this page.
 
@@ -189,7 +190,7 @@ This method was previously called `begin_training`.
 | _keyword-only_   |                                                                                                                                                                                                                                                                                                                                                                                                            |
 | `nlp`            | The current `nlp` object. Defaults to `None`. ~~Optional[Language]~~                                                                                                                                                                                                                                                                                                                                       |
 | `labels`         | The label information to add to the component, as provided by the [`label_data`](#label_data) property after initialization. To generate a reusable JSON file from your data, you should run the [`init labels`](/api/cli#init-labels) command. If no labels are provided, the `get_examples` callback is used to extract the labels from the data, which may be a lot slower. ~~Optional[Iterable[str]]~~ |
-| `positive_label` | The positive label for a binary task with exclusive classes, `None` otherwise and by default. This parameter is not available when using the `textcat_multilabel` component. ~~Optional[str]~~                                                                                                                                                                                                             |
+| `positive_label` | The positive label for a binary task with exclusive classes, `None` otherwise and by default. This parameter is only used during scoring. It is not available when using the `textcat_multilabel` component. ~~Optional[str]~~                                                                                                                                                                             |
 
 ## TextCategorizer.predict {#predict tag="method"}