From 29906884c5de57c73f5512452a7eb871061fb96c Mon Sep 17 00:00:00 2001 From: Adriane Boyd Date: Tue, 6 Jul 2021 12:35:22 +0200 Subject: [PATCH] Raise an error for textcat with <2 labels (#8584) * Raise an error for textcat with <2 labels Raise an error if initializing a `textcat` component without at least two labels. * Add similar note to docs * Update positive_label description in API docs --- spacy/errors.py | 5 +++++ spacy/pipeline/textcat.py | 2 ++ spacy/tests/pipeline/test_textcat.py | 6 ++++++ website/docs/api/textcategorizer.md | 9 +++++---- 4 files changed, 18 insertions(+), 4 deletions(-) diff --git a/spacy/errors.py b/spacy/errors.py index 4c0ccfb8f..2173dd58a 100644 --- a/spacy/errors.py +++ b/spacy/errors.py @@ -521,6 +521,11 @@ class Errors: E202 = ("Unsupported alignment mode '{mode}'. Supported modes: {modes}.") # New errors added in v3.x + E867 = ("The 'textcat' component requires at least two labels because it " + "uses mutually exclusive classes where exactly one label is True " + "for each doc. For binary classification tasks, you can use two " + "labels with 'textcat' (LABEL / NOT_LABEL) or alternatively, you " + "can use the 'textcat_multilabel' component with one label.") E868 = ("Found a conflicting gold annotation in a reference document, " "with the following char-based span occurring both in the gold ents " "as well as in the negative spans: {span}.") diff --git a/spacy/pipeline/textcat.py b/spacy/pipeline/textcat.py index 72a6dcd61..0dde5de82 100644 --- a/spacy/pipeline/textcat.py +++ b/spacy/pipeline/textcat.py @@ -336,6 +336,8 @@ class TextCategorizer(TrainablePipe): else: for label in labels: self.add_label(label) + if len(self.labels) < 2: + raise ValueError(Errors.E867) if positive_label is not None: if positive_label not in self.labels: err = Errors.E920.format(pos_label=positive_label, labels=self.labels) diff --git a/spacy/tests/pipeline/test_textcat.py b/spacy/tests/pipeline/test_textcat.py index fdb44b412..b134b8508 100644 --- a/spacy/tests/pipeline/test_textcat.py +++ b/spacy/tests/pipeline/test_textcat.py @@ -108,6 +108,12 @@ def test_label_types(name): textcat.add_label("answer") with pytest.raises(ValueError): textcat.add_label(9) + # textcat requires at least two labels + if name == "textcat": + with pytest.raises(ValueError): + nlp.initialize() + else: + nlp.initialize() @pytest.mark.parametrize("name", ["textcat", "textcat_multilabel"]) diff --git a/website/docs/api/textcategorizer.md b/website/docs/api/textcategorizer.md index fdd235b85..baa30ae01 100644 --- a/website/docs/api/textcategorizer.md +++ b/website/docs/api/textcategorizer.md @@ -10,11 +10,12 @@ api_trainable: true --- The text categorizer predicts **categories over a whole document**. and comes in -two flavours: `textcat` and `textcat_multilabel`. When you need to predict +two flavors: `textcat` and `textcat_multilabel`. When you need to predict exactly one true label per document, use the `textcat` which has mutually exclusive labels. If you want to perform multi-label classification and predict -zero, one or more labels per document, use the `textcat_multilabel` component -instead. +zero, one or more true labels per document, use the `textcat_multilabel` +component instead. For a binary classification task, you can use `textcat` with +**two** labels or `textcat_multilabel` with **one** label. Both components are documented on this page. @@ -189,7 +190,7 @@ This method was previously called `begin_training`. | _keyword-only_ | | | `nlp` | The current `nlp` object. Defaults to `None`. ~~Optional[Language]~~ | | `labels` | The label information to add to the component, as provided by the [`label_data`](#label_data) property after initialization. To generate a reusable JSON file from your data, you should run the [`init labels`](/api/cli#init-labels) command. If no labels are provided, the `get_examples` callback is used to extract the labels from the data, which may be a lot slower. ~~Optional[Iterable[str]]~~ | -| `positive_label` | The positive label for a binary task with exclusive classes, `None` otherwise and by default. This parameter is not available when using the `textcat_multilabel` component. ~~Optional[str]~~ | +| `positive_label` | The positive label for a binary task with exclusive classes, `None` otherwise and by default. This parameter is only used during scoring. It is not available when using the `textcat_multilabel` component. ~~Optional[str]~~ | ## TextCategorizer.predict {#predict tag="method"}