mirror of
https://github.com/explosion/spaCy.git
synced 2024-12-25 17:36:30 +03:00
Raise an error for textcat with <2 labels (#8584)
* Raise an error for textcat with <2 labels

  Raise an error if initializing a `textcat` component without at least two labels.

* Add similar note to docs

* Update positive_label description in API docs
This commit is contained in:
parent
3b1d5350d0
commit
29906884c5
|
@ -521,6 +521,11 @@ class Errors:
|
||||||
E202 = ("Unsupported alignment mode '{mode}'. Supported modes: {modes}.")
|
E202 = ("Unsupported alignment mode '{mode}'. Supported modes: {modes}.")
|
||||||
|
|
||||||
# New errors added in v3.x
|
# New errors added in v3.x
|
||||||
|
E867 = ("The 'textcat' component requires at least two labels because it "
|
||||||
|
"uses mutually exclusive classes where exactly one label is True "
|
||||||
|
"for each doc. For binary classification tasks, you can use two "
|
||||||
|
"labels with 'textcat' (LABEL / NOT_LABEL) or alternatively, you "
|
||||||
|
"can use the 'textcat_multilabel' component with one label.")
|
||||||
E868 = ("Found a conflicting gold annotation in a reference document, "
|
E868 = ("Found a conflicting gold annotation in a reference document, "
|
||||||
"with the following char-based span occurring both in the gold ents "
|
"with the following char-based span occurring both in the gold ents "
|
||||||
"as well as in the negative spans: {span}.")
|
"as well as in the negative spans: {span}.")
|
||||||
|
|
|
@ -336,6 +336,8 @@ class TextCategorizer(TrainablePipe):
|
||||||
else:
|
else:
|
||||||
for label in labels:
|
for label in labels:
|
||||||
self.add_label(label)
|
self.add_label(label)
|
||||||
|
if len(self.labels) < 2:
|
||||||
|
raise ValueError(Errors.E867)
|
||||||
if positive_label is not None:
|
if positive_label is not None:
|
||||||
if positive_label not in self.labels:
|
if positive_label not in self.labels:
|
||||||
err = Errors.E920.format(pos_label=positive_label, labels=self.labels)
|
err = Errors.E920.format(pos_label=positive_label, labels=self.labels)
|
||||||
|
|
|
@ -108,6 +108,12 @@ def test_label_types(name):
|
||||||
textcat.add_label("answer")
|
textcat.add_label("answer")
|
||||||
with pytest.raises(ValueError):
|
with pytest.raises(ValueError):
|
||||||
textcat.add_label(9)
|
textcat.add_label(9)
|
||||||
|
# textcat requires at least two labels
|
||||||
|
if name == "textcat":
|
||||||
|
with pytest.raises(ValueError):
|
||||||
|
nlp.initialize()
|
||||||
|
else:
|
||||||
|
nlp.initialize()
|
||||||
|
|
||||||
|
|
||||||
@pytest.mark.parametrize("name", ["textcat", "textcat_multilabel"])
|
@pytest.mark.parametrize("name", ["textcat", "textcat_multilabel"])
|
||||||
|
|
|
@ -10,11 +10,12 @@ api_trainable: true
|
||||||
---
|
---
|
||||||
|
|
||||||
The text categorizer predicts **categories over a whole document** and comes in
|
The text categorizer predicts **categories over a whole document** and comes in
|
||||||
two flavours: `textcat` and `textcat_multilabel`. When you need to predict
|
two flavors: `textcat` and `textcat_multilabel`. When you need to predict
|
||||||
exactly one true label per document, use the `textcat` component, which has mutually
|
exactly one true label per document, use the `textcat` component, which has mutually
|
||||||
exclusive labels. If you want to perform multi-label classification and predict
|
exclusive labels. If you want to perform multi-label classification and predict
|
||||||
zero, one or more labels per document, use the `textcat_multilabel` component
|
zero, one or more true labels per document, use the `textcat_multilabel`
|
||||||
instead.
|
component instead. For a binary classification task, you can use `textcat` with
|
||||||
|
**two** labels or `textcat_multilabel` with **one** label.
|
||||||
|
|
||||||
Both components are documented on this page.
|
Both components are documented on this page.
|
||||||
|
|
||||||
|
@ -189,7 +190,7 @@ This method was previously called `begin_training`.
|
||||||
| _keyword-only_ | |
|
| _keyword-only_ | |
|
||||||
| `nlp` | The current `nlp` object. Defaults to `None`. ~~Optional[Language]~~ |
|
| `nlp` | The current `nlp` object. Defaults to `None`. ~~Optional[Language]~~ |
|
||||||
| `labels` | The label information to add to the component, as provided by the [`label_data`](#label_data) property after initialization. To generate a reusable JSON file from your data, you should run the [`init labels`](/api/cli#init-labels) command. If no labels are provided, the `get_examples` callback is used to extract the labels from the data, which may be a lot slower. ~~Optional[Iterable[str]]~~ |
|
| `labels` | The label information to add to the component, as provided by the [`label_data`](#label_data) property after initialization. To generate a reusable JSON file from your data, you should run the [`init labels`](/api/cli#init-labels) command. If no labels are provided, the `get_examples` callback is used to extract the labels from the data, which may be a lot slower. ~~Optional[Iterable[str]]~~ |
|
||||||
| `positive_label` | The positive label for a binary task with exclusive classes, `None` otherwise and by default. This parameter is not available when using the `textcat_multilabel` component. ~~Optional[str]~~ |
|
| `positive_label` | The positive label for a binary task with exclusive classes, `None` otherwise and by default. This parameter is only used during scoring. It is not available when using the `textcat_multilabel` component. ~~Optional[str]~~ |
|
||||||
|
|
||||||
## TextCategorizer.predict {#predict tag="method"}
|
## TextCategorizer.predict {#predict tag="method"}
|
||||||
|
|
||||||
|
|
Loading…
Reference in New Issue
Block a user