From 3216a33149ad26767640a2ff276870c49e349b10 Mon Sep 17 00:00:00 2001 From: Sofie Van Landeghem Date: Mon, 14 Sep 2020 17:08:00 +0200 Subject: [PATCH] positive_label config for textcat (#6062) * hook up positive_label in textcat * unit tests * documentation * formatting * tests * fix typo * move verify_config to after begin_training * revert accidental commit --- spacy/cli/train.py | 22 ++++----- spacy/errors.py | 5 ++ spacy/pipeline/textcat.py | 47 ++++++++++++------- spacy/tests/pipeline/test_textcat.py | 34 +++++++++++++- .../serialize/test_serialize_pipeline.py | 2 +- website/docs/api/textcategorizer.md | 30 ++++++------ 6 files changed, 93 insertions(+), 47 deletions(-) diff --git a/spacy/cli/train.py b/spacy/cli/train.py index 0bc493e56..ae4a8455e 100644 --- a/spacy/cli/train.py +++ b/spacy/cli/train.py @@ -89,7 +89,6 @@ def train( nlp, config = util.load_model_from_config(config) if config["training"]["vectors"] is not None: util.load_vectors_into_model(nlp, config["training"]["vectors"]) - verify_config(nlp) raw_text, tag_map, morph_rules, weights_data = load_from_paths(config) T_cfg = config["training"] optimizer = T_cfg["optimizer"] @@ -108,6 +107,8 @@ def train( nlp.resume_training(sgd=optimizer) with nlp.select_pipes(disable=[*frozen_components, *resume_components]): nlp.begin_training(lambda: train_corpus(nlp), sgd=optimizer) + # Verify the config after calling 'begin_training' to ensure labels are properly initialized + verify_config(nlp) if tag_map: # Replace tag map with provided mapping @@ -401,7 +402,7 @@ def verify_cli_args(config_path: Path, output_path: Optional[Path] = None) -> No def verify_config(nlp: Language) -> None: - """Perform additional checks based on the config and loaded nlp object.""" + """Perform additional checks based on the config, loaded nlp object and training data.""" # TODO: maybe we should validate based on the actual components, the list # in config["nlp"]["pipeline"] instead? 
for pipe_config in nlp.config["components"].values(): @@ -415,18 +416,13 @@ def verify_textcat_config(nlp: Language, pipe_config: Dict[str, Any]) -> None: # if 'positive_label' is provided: double check whether it's in the data and # the task is binary if pipe_config.get("positive_label"): - textcat_labels = nlp.get_pipe("textcat").cfg.get("labels", []) + textcat_labels = nlp.get_pipe("textcat").labels pos_label = pipe_config.get("positive_label") if pos_label not in textcat_labels: - msg.fail( - f"The textcat's 'positive_label' config setting '{pos_label}' " - f"does not match any label in the training data.", - exits=1, + raise ValueError( + Errors.E920.format(pos_label=pos_label, labels=textcat_labels) ) - if len(textcat_labels) != 2: - msg.fail( - f"A textcat 'positive_label' '{pos_label}' was " - f"provided for training data that does not appear to be a " - f"binary classification problem with two labels.", - exits=1, + if len(list(textcat_labels)) != 2: + raise ValueError( + Errors.E919.format(pos_label=pos_label, labels=textcat_labels) ) diff --git a/spacy/errors.py b/spacy/errors.py index 8f95609a6..f857bea52 100644 --- a/spacy/errors.py +++ b/spacy/errors.py @@ -480,6 +480,11 @@ class Errors: E201 = ("Span index out of range.") # TODO: fix numbering after merging develop into master + E919 = ("A textcat 'positive_label' '{pos_label}' was provided for training " + "data that does not appear to be a binary classification problem " + "with two labels. Labels found: {labels}") + E920 = ("The textcat's 'positive_label' config setting '{pos_label}' " + "does not match any label in the training data. Labels found: {labels}") E921 = ("The method 'set_output' can only be called on components that have " "a Model with a 'resize_output' attribute. 
Otherwise, the output " "layer can not be dynamically changed.") diff --git a/spacy/pipeline/textcat.py b/spacy/pipeline/textcat.py index 22d1de08f..3f6250680 100644 --- a/spacy/pipeline/textcat.py +++ b/spacy/pipeline/textcat.py @@ -56,7 +56,12 @@ subword_features = true @Language.factory( "textcat", assigns=["doc.cats"], - default_config={"labels": [], "threshold": 0.5, "model": DEFAULT_TEXTCAT_MODEL}, + default_config={ + "labels": [], + "threshold": 0.5, + "positive_label": None, + "model": DEFAULT_TEXTCAT_MODEL, + }, scores=[ "cats_score", "cats_score_desc", @@ -74,8 +79,9 @@ def make_textcat( nlp: Language, name: str, model: Model[List[Doc], List[Floats2d]], - labels: Iterable[str], + labels: List[str], threshold: float, + positive_label: Optional[str], ) -> "TextCategorizer": """Create a TextCategorizer compoment. The text categorizer predicts categories over a whole document. It can learn one or more labels, and the labels can @@ -88,8 +94,16 @@ def make_textcat( labels (list): A list of categories to learn. If empty, the model infers the categories from the data. threshold (float): Cutoff to consider a prediction "positive". + positive_label (Optional[str]): The positive label for a binary task with exclusive classes, None otherwise. """ - return TextCategorizer(nlp.vocab, model, name, labels=labels, threshold=threshold) + return TextCategorizer( + nlp.vocab, + model, + name, + labels=labels, + threshold=threshold, + positive_label=positive_label, + ) class TextCategorizer(Pipe): @@ -104,8 +118,9 @@ class TextCategorizer(Pipe): model: Model, name: str = "textcat", *, - labels: Iterable[str], + labels: List[str], threshold: float, + positive_label: Optional[str], ) -> None: """Initialize a text categorizer. @@ -113,8 +128,9 @@ class TextCategorizer(Pipe): model (thinc.api.Model): The Thinc Model powering the pipeline component. name (str): The component instance name, used to add entries to the losses during training. 
- labels (Iterable[str]): The labels to use. + labels (List[str]): The labels to use. threshold (float): Cutoff to consider a prediction "positive". + positive_label (Optional[str]): The positive label for a binary task with exclusive classes, None otherwise. DOCS: https://nightly.spacy.io/api/textcategorizer#init """ @@ -122,7 +138,11 @@ class TextCategorizer(Pipe): self.model = model self.name = name self._rehearsal_model = None - cfg = {"labels": labels, "threshold": threshold} + cfg = { + "labels": labels, + "threshold": threshold, + "positive_label": positive_label, + } self.cfg = dict(cfg) @property @@ -131,10 +151,10 @@ class TextCategorizer(Pipe): DOCS: https://nightly.spacy.io/api/textcategorizer#labels """ - return tuple(self.cfg.setdefault("labels", [])) + return tuple(self.cfg["labels"]) @labels.setter - def labels(self, value: Iterable[str]) -> None: + def labels(self, value: List[str]) -> None: self.cfg["labels"] = tuple(value) def pipe(self, stream: Iterable[Doc], *, batch_size: int = 128) -> Iterator[Doc]: @@ -353,17 +373,10 @@ class TextCategorizer(Pipe): sgd = self.create_optimizer() return sgd - def score( - self, - examples: Iterable[Example], - *, - positive_label: Optional[str] = None, - **kwargs, - ) -> Dict[str, Any]: + def score(self, examples: Iterable[Example], **kwargs) -> Dict[str, Any]: """Score a batch of examples. examples (Iterable[Example]): The examples to score. - positive_label (str): Optional positive label. RETURNS (Dict[str, Any]): The scores, produced by Scorer.score_cats. 
DOCS: https://nightly.spacy.io/api/textcategorizer#score @@ -374,7 +387,7 @@ class TextCategorizer(Pipe): "cats", labels=self.labels, multi_label=self.model.attrs["multi_label"], - positive_label=positive_label, + positive_label=self.cfg["positive_label"], threshold=self.cfg["threshold"], **kwargs, ) diff --git a/spacy/tests/pipeline/test_textcat.py b/spacy/tests/pipeline/test_textcat.py index d12a7211a..99b5132ca 100644 --- a/spacy/tests/pipeline/test_textcat.py +++ b/spacy/tests/pipeline/test_textcat.py @@ -10,6 +10,7 @@ from spacy.tokens import Doc from spacy.pipeline.tok2vec import DEFAULT_TOK2VEC_MODEL from ..util import make_tempdir +from ...cli.train import verify_textcat_config from ...training import Example @@ -130,7 +131,10 @@ def test_overfitting_IO(): fix_random_seed(0) nlp = English() # Set exclusive labels - textcat = nlp.add_pipe("textcat", config={"model": {"exclusive_classes": True}}) + textcat = nlp.add_pipe( + "textcat", + config={"model": {"exclusive_classes": True}, "positive_label": "POSITIVE"}, + ) train_examples = [] for text, annotations in TRAIN_DATA: train_examples.append(Example.from_dict(nlp.make_doc(text), annotations)) @@ -159,7 +163,7 @@ def test_overfitting_IO(): assert cats2["POSITIVE"] + cats2["NEGATIVE"] == pytest.approx(1.0, 0.001) # Test scoring - scores = nlp.evaluate(train_examples, scorer_cfg={"positive_label": "POSITIVE"}) + scores = nlp.evaluate(train_examples) assert scores["cats_micro_f"] == 1.0 assert scores["cats_score"] == 1.0 assert "cats_score_desc" in scores @@ -194,3 +198,29 @@ def test_textcat_configs(textcat_config): for i in range(5): losses = {} nlp.update(train_examples, sgd=optimizer, losses=losses) + + +def test_positive_class(): + nlp = English() + pipe_config = {"positive_label": "POS", "labels": ["POS", "NEG"]} + textcat = nlp.add_pipe("textcat", config=pipe_config) + assert textcat.labels == ("POS", "NEG") + verify_textcat_config(nlp, pipe_config) + + +def test_positive_class_not_present(): + nlp = 
English() + pipe_config = {"positive_label": "POS", "labels": ["SOME", "THING"]} + textcat = nlp.add_pipe("textcat", config=pipe_config) + assert textcat.labels == ("SOME", "THING") + with pytest.raises(ValueError): + verify_textcat_config(nlp, pipe_config) + + +def test_positive_class_not_binary(): + nlp = English() + pipe_config = {"positive_label": "POS", "labels": ["SOME", "THING", "POS"]} + textcat = nlp.add_pipe("textcat", config=pipe_config) + assert textcat.labels == ("SOME", "THING", "POS") + with pytest.raises(ValueError): + verify_textcat_config(nlp, pipe_config) diff --git a/spacy/tests/serialize/test_serialize_pipeline.py b/spacy/tests/serialize/test_serialize_pipeline.py index e621aebd8..eedad31e0 100644 --- a/spacy/tests/serialize/test_serialize_pipeline.py +++ b/spacy/tests/serialize/test_serialize_pipeline.py @@ -136,7 +136,7 @@ def test_serialize_textcat_empty(en_vocab): # See issue #1105 cfg = {"model": DEFAULT_TEXTCAT_MODEL} model = registry.make_from_config(cfg, validate=True)["model"] - textcat = TextCategorizer(en_vocab, model, labels=["ENTITY", "ACTION", "MODIFIER"], threshold=0.5) + textcat = TextCategorizer(en_vocab, model, labels=["ENTITY", "ACTION", "MODIFIER"], threshold=0.5, positive_label=None) textcat.to_bytes(exclude=["vocab"]) diff --git a/website/docs/api/textcategorizer.md b/website/docs/api/textcategorizer.md index 95f4c6c1a..b68039094 100644 --- a/website/docs/api/textcategorizer.md +++ b/website/docs/api/textcategorizer.md @@ -36,11 +36,12 @@ architectures and their arguments and hyperparameters. > nlp.add_pipe("textcat", config=config) > ``` -| Setting | Description | -| ----------- | ---------------------------------------------------------------------------------------------------------------------------------------------------------------- | -| `labels` | A list of categories to learn. If empty, the model infers the categories from the data. Defaults to `[]`. 
~~Iterable[str]~~ | -| `threshold` | Cutoff to consider a prediction "positive", relevant when printing accuracy results. ~~float~~ | -| `model` | A model instance that predicts scores for each category. Defaults to [TextCatEnsemble](/api/architectures#TextCatEnsemble). ~~Model[List[Doc], List[Floats2d]]~~ | +| Setting | Description | +| ---------------- | ---------------------------------------------------------------------------------------------------------------------------------------------------------------- | +| `labels` | A list of categories to learn. If empty, the model infers the categories from the data. Defaults to `[]`. ~~Iterable[str]~~ | +| `threshold` | Cutoff to consider a prediction "positive", relevant when printing accuracy results. ~~float~~ | +| `positive_label` | The positive label for a binary task with exclusive classes, None otherwise and by default. ~~Optional[str]~~ | +| `model` | A model instance that predicts scores for each category. Defaults to [TextCatEnsemble](/api/architectures#TextCatEnsemble). ~~Model[List[Doc], List[Floats2d]]~~ | ```python %%GITHUB_SPACY/spacy/pipeline/textcat.py @@ -60,21 +61,22 @@ architectures and their arguments and hyperparameters. > > # Construction from class > from spacy.pipeline import TextCategorizer -> textcat = TextCategorizer(nlp.vocab, model, labels=[], threshold=0.5) +> textcat = TextCategorizer(nlp.vocab, model, labels=[], threshold=0.5, positive_label="POS") > ``` Create a new pipeline instance. In your application, you would normally use a shortcut for this and instantiate the component using its string name and [`nlp.add_pipe`](/api/language#create_pipe). -| Name | Description | -| -------------- | -------------------------------------------------------------------------------------------------------------------------- | -| `vocab` | The shared vocabulary. ~~Vocab~~ | -| `model` | The Thinc [`Model`](https://thinc.ai/docs/api-model) powering the pipeline component. 
~~Model[List[Doc], List[Floats2d]]~~ | -| `name` | String name of the component instance. Used to add entries to the `losses` during training. ~~str~~ | -| _keyword-only_ | | -| `labels` | The labels to use. ~~Iterable[str]~~ | -| `threshold` | Cutoff to consider a prediction "positive", relevant when printing accuracy results. ~~float~~ | +| Name | Description | +| ---------------- | -------------------------------------------------------------------------------------------------------------------------- | +| `vocab` | The shared vocabulary. ~~Vocab~~ | +| `model` | The Thinc [`Model`](https://thinc.ai/docs/api-model) powering the pipeline component. ~~Model[List[Doc], List[Floats2d]]~~ | +| `name` | String name of the component instance. Used to add entries to the `losses` during training. ~~str~~ | +| _keyword-only_ | | +| `labels` | The labels to use. ~~Iterable[str]~~ | +| `threshold` | Cutoff to consider a prediction "positive", relevant when printing accuracy results. ~~float~~ | +| `positive_label` | The positive label for a binary task with exclusive classes, None otherwise. ~~Optional[str]~~ | ## TextCategorizer.\_\_call\_\_ {#call tag="method"}