From afc5714d32a191f25ffbdbe8ad6697f0580184a2 Mon Sep 17 00:00:00 2001 From: Sofie Van Landeghem Date: Wed, 6 Jan 2021 03:07:14 +0100 Subject: [PATCH] multi-label textcat component (#6474) * multi-label textcat component * formatting * fix comment * cleanup * fix from #6481 * random edit to push the tests * add explicit error when textcat is called with multi-label gold data * fix error nr * small fix --- spacy/cli/init_config.py | 2 +- spacy/cli/templates/quickstart_training.jinja | 65 ++- spacy/errors.py | 4 + spacy/pipeline/__init__.py | 4 +- spacy/pipeline/textcat.py | 34 +- spacy/pipeline/textcat_multilabel.py | 191 ++++++++ spacy/scorer.py | 2 +- spacy/tests/pipeline/test_textcat.py | 179 +++++-- spacy/tests/regression/test_issue5501-6000.py | 17 +- .../serialize/test_serialize_pipeline.py | 4 +- spacy/tests/training/test_readers.py | 2 +- website/docs/api/data-formats.md | 2 +- .../docs/api/multilabel_textcategorizer.md | 454 ++++++++++++++++++ website/docs/api/textcategorizer.md | 13 +- website/docs/usage/layers-architectures.md | 4 +- 15 files changed, 902 insertions(+), 75 deletions(-) create mode 100644 spacy/pipeline/textcat_multilabel.py create mode 100644 website/docs/api/multilabel_textcategorizer.md diff --git a/spacy/cli/init_config.py b/spacy/cli/init_config.py index 65d90fc79..e862454f7 100644 --- a/spacy/cli/init_config.py +++ b/spacy/cli/init_config.py @@ -169,7 +169,7 @@ def init_config( "Hardware": variables["hardware"].upper(), "Transformer": template_vars.transformer.get("name", False), } - msg.info("Generated template specific for your use case") + msg.info("Generated config template specific for your use case") for label, value in use_case.items(): msg.text(f"- {label}: {value}") with show_validation_error(hint_fill=False): diff --git a/spacy/cli/templates/quickstart_training.jinja b/spacy/cli/templates/quickstart_training.jinja index bce3738b0..71ba5f6bc 100644 --- a/spacy/cli/templates/quickstart_training.jinja +++ b/spacy/cli/templates/quickstart_training.jinja @@ -149,13 +149,44 @@ grad_factor = 1.0 [components.textcat.model.linear_model] @architectures = "spacy.TextCatBOW.v1" -exclusive_classes = false +exclusive_classes = true ngram_size = 1 no_output_layer = false {% else -%} [components.textcat.model] @architectures = "spacy.TextCatBOW.v1" +exclusive_classes = true +ngram_size = 1 +no_output_layer = false +{%- endif %} +{%- endif %} + +{% if "textcat_multilabel" in components %} +[components.textcat_multilabel] +factory = "textcat_multilabel" + +{% if optimize == "accuracy" %} +[components.textcat_multilabel.model] +@architectures = "spacy.TextCatEnsemble.v2" +nO = null + +[components.textcat_multilabel.model.tok2vec] +@architectures = "spacy-transformers.TransformerListener.v1" +grad_factor = 1.0 + +[components.textcat_multilabel.model.tok2vec.pooling] +@layers = "reduce_mean.v1" + +[components.textcat_multilabel.model.linear_model] +@architectures = "spacy.TextCatBOW.v1" +exclusive_classes = false +ngram_size = 1 +no_output_layer = false + +{% else -%} +[components.textcat_multilabel.model] +@architectures = "spacy.TextCatBOW.v1" exclusive_classes = false ngram_size = 1 no_output_layer = false @@ -288,13 +319,41 @@ width = ${components.tok2vec.model.encode.width} [components.textcat.model.linear_model] @architectures = "spacy.TextCatBOW.v1" -exclusive_classes = false +exclusive_classes = true ngram_size = 1 no_output_layer = false {% else -%} [components.textcat.model] @architectures = "spacy.TextCatBOW.v1" +exclusive_classes = true +ngram_size = 1 +no_output_layer = false +{%- endif %} +{%- endif %} + +{% if "textcat_multilabel" in components %} +[components.textcat_multilabel] +factory = "textcat_multilabel" + +{% if optimize == "accuracy" %} +[components.textcat_multilabel.model] +@architectures = "spacy.TextCatEnsemble.v2" +nO = null + +[components.textcat_multilabel.model.tok2vec] +@architectures = "spacy.Tok2VecListener.v1" +width = ${components.tok2vec.model.encode.width} + +[components.textcat_multilabel.model.linear_model] +@architectures = "spacy.TextCatBOW.v1" +exclusive_classes = false +ngram_size = 1 +no_output_layer = false + +{% else -%} +[components.textcat_multilabel.model] +@architectures = "spacy.TextCatBOW.v1" exclusive_classes = false ngram_size = 1 no_output_layer = false @@ -303,7 +362,7 @@ no_output_layer = false {% endif %} {% for pipe in components %} -{% if pipe not in ["tagger", "morphologizer", "parser", "ner", "textcat", "entity_linker"] %} +{% if pipe not in ["tagger", "morphologizer", "parser", "ner", "textcat", "textcat_multilabel", "entity_linker"] %} {# Other components defined by the user: we just assume they're factories #} [components.{{ pipe }}] factory = "{{ pipe }}" diff --git a/spacy/errors.py b/spacy/errors.py index 6d3591ab5..a286a5ac3 100644 --- a/spacy/errors.py +++ b/spacy/errors.py @@ -463,6 +463,10 @@ class Errors: "issue tracker: http://github.com/explosion/spaCy/issues") # TODO: fix numbering after merging develop into master + E895 = ("The 'textcat' component received gold-standard annotations with " + "multiple labels per document. In spaCy 3 you should use the " + "'textcat_multilabel' component for this instead. " + "Example of an offending annotation: {value}") E896 = ("There was an error using the static vectors. Ensure that the vectors " "of the vocab are properly initialized, or set 'include_static_vectors' " "to False.") diff --git a/spacy/pipeline/__init__.py b/spacy/pipeline/__init__.py index cec5b4eb5..1fa53a556 100644 --- a/spacy/pipeline/__init__.py +++ b/spacy/pipeline/__init__.py @@ -11,6 +11,7 @@ from .senter import SentenceRecognizer from .sentencizer import Sentencizer from .tagger import Tagger from .textcat import TextCategorizer +from .textcat_multilabel import MultiLabel_TextCategorizer from .tok2vec import Tok2Vec from .functions import merge_entities, merge_noun_chunks, merge_subtokens @@ -22,13 +23,14 @@ __all__ = [ "EntityRuler", "Morphologizer", "Lemmatizer", - "TrainablePipe", + "MultiLabel_TextCategorizer", "Pipe", "SentenceRecognizer", "Sentencizer", "Tagger", "TextCategorizer", "Tok2Vec", + "TrainablePipe", "merge_entities", "merge_noun_chunks", "merge_subtokens", diff --git a/spacy/pipeline/textcat.py b/spacy/pipeline/textcat.py index 0781a000c..8a5d6938e 100644 --- a/spacy/pipeline/textcat.py +++ b/spacy/pipeline/textcat.py @@ -14,7 +14,7 @@ from ..tokens import Doc from ..vocab import Vocab -default_model_config = """ +single_label_default_config = """ [model] @architectures = "spacy.TextCatEnsemble.v2" @@ -37,24 +37,24 @@ depth = 2 [model.linear_model] @architectures = "spacy.TextCatBOW.v1" -exclusive_classes = false +exclusive_classes = true ngram_size = 1 no_output_layer = false """ -DEFAULT_TEXTCAT_MODEL = Config().from_str(default_model_config)["model"] +DEFAULT_SINGLE_TEXTCAT_MODEL = Config().from_str(single_label_default_config)["model"] -bow_model_config = """ +single_label_bow_config = """ [model] @architectures = "spacy.TextCatBOW.v1" -exclusive_classes = false +exclusive_classes = true ngram_size = 1 no_output_layer = false """ -cnn_model_config = """ +single_label_cnn_config = """ [model] @architectures = "spacy.TextCatCNN.v1" -exclusive_classes = false +exclusive_classes = true [model.tok2vec] @architectures = "spacy.HashEmbedCNN.v1" @@ -71,7 +71,7 @@ subword_features = true @Language.factory( "textcat", assigns=["doc.cats"], - default_config={"threshold": 0.5, "model": DEFAULT_TEXTCAT_MODEL}, + default_config={"threshold": 0.5, "model": DEFAULT_SINGLE_TEXTCAT_MODEL}, default_score_weights={ "cats_score": 1.0, "cats_score_desc": None, @@ -103,7 +103,7 @@ def make_textcat( class TextCategorizer(TrainablePipe): - """Pipeline component for text classification. + """Pipeline component for single-label text classification. DOCS: https://nightly.spacy.io/api/textcategorizer """ @@ -111,7 +111,7 @@ class TextCategorizer(TrainablePipe): def __init__( self, vocab: Vocab, model: Model, name: str = "textcat", *, threshold: float ) -> None: - """Initialize a text categorizer. + """Initialize a text categorizer for single-label classification. vocab (Vocab): The shared vocabulary. model (thinc.api.Model): The Thinc Model powering the pipeline component. @@ -214,6 +214,7 @@ class TextCategorizer(TrainablePipe): losses = {} losses.setdefault(self.name, 0.0) validate_examples(examples, "TextCategorizer.update") + self._validate_categories(examples) if not any(len(eg.predicted) if eg.predicted else 0 for eg in examples): # Handle cases where there are no tokens in any docs. return losses @@ -256,6 +257,7 @@ class TextCategorizer(TrainablePipe): if self._rehearsal_model is None: return losses validate_examples(examples, "TextCategorizer.rehearse") + self._validate_categories(examples) docs = [eg.predicted for eg in examples] if not any(len(doc) for doc in docs): # Handle cases where there are no tokens in any docs. @@ -296,6 +298,7 @@ class TextCategorizer(TrainablePipe): DOCS: https://nightly.spacy.io/api/textcategorizer#get_loss """ validate_examples(examples, "TextCategorizer.get_loss") + self._validate_categories(examples) truths, not_missing = self._examples_to_truth(examples) not_missing = self.model.ops.asarray(not_missing) d_scores = (scores - truths) / scores.shape[0] @@ -341,6 +344,7 @@ class TextCategorizer(TrainablePipe): DOCS: https://nightly.spacy.io/api/textcategorizer#initialize """ validate_get_examples(get_examples, "TextCategorizer.initialize") + self._validate_categories(get_examples()) if labels is None: for example in get_examples(): for cat in example.y.cats: @@ -373,12 +377,20 @@ class TextCategorizer(TrainablePipe): DOCS: https://nightly.spacy.io/api/textcategorizer#score """ validate_examples(examples, "TextCategorizer.score") + self._validate_categories(examples) return Scorer.score_cats( examples, "cats", labels=self.labels, - multi_label=self.model.attrs["multi_label"], + multi_label=False, positive_label=self.cfg["positive_label"], threshold=self.cfg["threshold"], **kwargs, ) + + + def _validate_categories(self, examples: List[Example]): + """Check whether the provided examples all have single-label cats annotations.""" + for ex in examples: + if list(ex.reference.cats.values()).count(1.0) > 1: + raise ValueError(Errors.E895.format(value=ex.reference.cats)) diff --git a/spacy/pipeline/textcat_multilabel.py b/spacy/pipeline/textcat_multilabel.py new file mode 100644 index 000000000..fc8920414 --- /dev/null +++ b/spacy/pipeline/textcat_multilabel.py @@ -0,0 +1,191 @@ +from itertools import islice +from typing import Iterable, Optional, Dict, List, Callable, Any + +from thinc.api import Model, Config +from thinc.types import Floats2d + +from ..language import Language +from ..training import Example, validate_examples, validate_get_examples +from ..errors import Errors +from ..scorer import Scorer +from ..tokens import Doc +from ..vocab import Vocab +from .textcat import TextCategorizer + + +multi_label_default_config = """ +[model] +@architectures = "spacy.TextCatEnsemble.v2" + +[model.tok2vec] +@architectures = "spacy.Tok2Vec.v1" + +[model.tok2vec.embed] +@architectures = "spacy.MultiHashEmbed.v1" +width = 64 +rows = [2000, 2000, 1000, 1000, 1000, 1000] +attrs = ["ORTH", "LOWER", "PREFIX", "SUFFIX", "SHAPE", "ID"] +include_static_vectors = false + +[model.tok2vec.encode] +@architectures = "spacy.MaxoutWindowEncoder.v1" +width = ${model.tok2vec.embed.width} +window_size = 1 +maxout_pieces = 3 +depth = 2 + +[model.linear_model] +@architectures = "spacy.TextCatBOW.v1" +exclusive_classes = false +ngram_size = 1 +no_output_layer = false +""" +DEFAULT_MULTI_TEXTCAT_MODEL = Config().from_str(multi_label_default_config)["model"] + +multi_label_bow_config = """ +[model] +@architectures = "spacy.TextCatBOW.v1" +exclusive_classes = false +ngram_size = 1 +no_output_layer = false +""" + +multi_label_cnn_config = """ +[model] +@architectures = "spacy.TextCatCNN.v1" +exclusive_classes = false + +[model.tok2vec] +@architectures = "spacy.HashEmbedCNN.v1" +pretrained_vectors = null +width = 96 +depth = 4 +embed_size = 2000 +window_size = 1 +maxout_pieces = 3 +subword_features = true +""" + + +@Language.factory( + "textcat_multilabel", + assigns=["doc.cats"], + default_config={"threshold": 0.5, "model": DEFAULT_MULTI_TEXTCAT_MODEL}, + default_score_weights={ + "cats_score": 1.0, + "cats_score_desc": None, + "cats_micro_p": None, + "cats_micro_r": None, + "cats_micro_f": None, + "cats_macro_p": None, + "cats_macro_r": None, + "cats_macro_f": None, + "cats_macro_auc": None, + "cats_f_per_type": None, + "cats_macro_auc_per_type": None, + }, +) +def make_multilabel_textcat( + nlp: Language, name: str, model: Model[List[Doc], List[Floats2d]], threshold: float +) -> "TextCategorizer": + """Create a TextCategorizer compoment. The text categorizer predicts categories + over a whole document. It can learn one or more labels, and the labels can + be mutually exclusive (i.e. one true label per doc) or non-mutually exclusive + (i.e. zero or more labels may be true per doc). The multi-label setting is + controlled by the model instance that's provided. + + model (Model[List[Doc], List[Floats2d]]): A model instance that predicts + scores for each category. + threshold (float): Cutoff to consider a prediction "positive". + """ + return MultiLabel_TextCategorizer(nlp.vocab, model, name, threshold=threshold) + + +class MultiLabel_TextCategorizer(TextCategorizer): + """Pipeline component for multi-label text classification. + + DOCS: https://nightly.spacy.io/api/multilabel_textcategorizer + """ + + def __init__( + self, + vocab: Vocab, + model: Model, + name: str = "textcat_multilabel", + *, + threshold: float, + ) -> None: + """Initialize a text categorizer for multi-label classification. + + vocab (Vocab): The shared vocabulary. + model (thinc.api.Model): The Thinc Model powering the pipeline component. + name (str): The component instance name, used to add entries to the + losses during training. + threshold (float): Cutoff to consider a prediction "positive". + + DOCS: https://nightly.spacy.io/api/multilabel_textcategorizer#init + """ + self.vocab = vocab + self.model = model + self.name = name + self._rehearsal_model = None + cfg = {"labels": [], "threshold": threshold} + self.cfg = dict(cfg) + + def initialize( + self, + get_examples: Callable[[], Iterable[Example]], + *, + nlp: Optional[Language] = None, + labels: Optional[Dict] = None, + ): + """Initialize the pipe for training, using a representative set + of data examples. + + get_examples (Callable[[], Iterable[Example]]): Function that + returns a representative sample of gold-standard Example objects. + nlp (Language): The current nlp object the component is part of. + labels: The labels to add to the component, typically generated by the + `init labels` command. If no labels are provided, the get_examples + callback is used to extract the labels from the data. + + DOCS: https://nightly.spacy.io/api/multilabel_textcategorizer#initialize + """ + validate_get_examples(get_examples, "MultiLabel_TextCategorizer.initialize") + if labels is None: + for example in get_examples(): + for cat in example.y.cats: + self.add_label(cat) + else: + for label in labels: + self.add_label(label) + subbatch = list(islice(get_examples(), 10)) + doc_sample = [eg.reference for eg in subbatch] + label_sample, _ = self._examples_to_truth(subbatch) + self._require_labels() + assert len(doc_sample) > 0, Errors.E923.format(name=self.name) + assert len(label_sample) > 0, Errors.E923.format(name=self.name) + self.model.initialize(X=doc_sample, Y=label_sample) + + def score(self, examples: Iterable[Example], **kwargs) -> Dict[str, Any]: + """Score a batch of examples. + + examples (Iterable[Example]): The examples to score. + RETURNS (Dict[str, Any]): The scores, produced by Scorer.score_cats. + + DOCS: https://nightly.spacy.io/api/multilabel_textcategorizer#score + """ + validate_examples(examples, "MultiLabel_TextCategorizer.score") + return Scorer.score_cats( + examples, + "cats", + labels=self.labels, + multi_label=True, + threshold=self.cfg["threshold"], + **kwargs, + ) + + def _validate_categories(self, examples: List[Example]): + """This component allows any type of single- or multi-label annotations. + This method overwrites the more strict one from 'textcat'. """ + pass diff --git a/spacy/scorer.py b/spacy/scorer.py index 1c1a56371..dd0cde77b 100644 --- a/spacy/scorer.py +++ b/spacy/scorer.py @@ -460,7 +460,7 @@ class Scorer: gold_label, gold_score = max(gold_cats, key=lambda it: it[1]) if gold_score is not None and gold_score > 0: f_per_type[gold_label].fn += 1 - else: + elif pred_cats: pred_label, pred_score = max(pred_cats, key=lambda it: it[1]) if pred_score >= threshold: f_per_type[pred_label].fp += 1 diff --git a/spacy/tests/pipeline/test_textcat.py b/spacy/tests/pipeline/test_textcat.py index 733535b32..04a3eb27d 100644 --- a/spacy/tests/pipeline/test_textcat.py +++ b/spacy/tests/pipeline/test_textcat.py @@ -15,15 +15,31 @@ from spacy.training import Example from ..util import make_tempdir -TRAIN_DATA = [ +TRAIN_DATA_SINGLE_LABEL = [ ("I'm so happy.", {"cats": {"POSITIVE": 1.0, "NEGATIVE": 0.0}}), ("I'm so angry", {"cats": {"POSITIVE": 0.0, "NEGATIVE": 1.0}}), ] +TRAIN_DATA_MULTI_LABEL = [ + ("I'm angry and confused", {"cats": {"ANGRY": 1.0, "CONFUSED": 1.0, "HAPPY": 0.0}}), + ("I'm confused but happy", {"cats": {"ANGRY": 0.0, "CONFUSED": 1.0, "HAPPY": 1.0}}), +] -def make_get_examples(nlp): + +def make_get_examples_single_label(nlp): train_examples = [] - for t in TRAIN_DATA: + for t in TRAIN_DATA_SINGLE_LABEL: + train_examples.append(Example.from_dict(nlp.make_doc(t[0]), t[1])) + + def get_examples(): + return train_examples + + return get_examples + + +def make_get_examples_multi_label(nlp): + train_examples = [] + for t in TRAIN_DATA_MULTI_LABEL: train_examples.append(Example.from_dict(nlp.make_doc(t[0]), t[1])) def get_examples(): @@ -85,49 +101,75 @@ def test_textcat_learns_multilabel(): assert score > 0.5 -def test_label_types(): +@pytest.mark.parametrize("name", ["textcat", "textcat_multilabel"]) +def test_label_types(name): nlp = Language() - textcat = nlp.add_pipe("textcat") + textcat = nlp.add_pipe(name) textcat.add_label("answer") with pytest.raises(ValueError): textcat.add_label(9) -def test_no_label(): +@pytest.mark.parametrize("name", ["textcat", "textcat_multilabel"]) +def test_no_label(name): nlp = Language() - nlp.add_pipe("textcat") + nlp.add_pipe(name) with pytest.raises(ValueError): nlp.initialize() -def test_implicit_label(): +@pytest.mark.parametrize( + "name,get_examples", + [ + ("textcat", make_get_examples_single_label), + ("textcat_multilabel", make_get_examples_multi_label), + ], +) +def test_implicit_label(name, get_examples): nlp = Language() - nlp.add_pipe("textcat") - nlp.initialize(get_examples=make_get_examples(nlp)) + nlp.add_pipe(name) + nlp.initialize(get_examples=get_examples(nlp)) -def test_no_resize(): +@pytest.mark.parametrize("name", ["textcat", "textcat_multilabel"]) +def test_no_resize(name): nlp = Language() - textcat = nlp.add_pipe("textcat") + textcat = nlp.add_pipe(name) textcat.add_label("POSITIVE") textcat.add_label("NEGATIVE") nlp.initialize() - assert textcat.model.get_dim("nO") == 2 + assert textcat.model.get_dim("nO") >= 2 # this throws an error because the textcat can't be resized after initialization with pytest.raises(ValueError): textcat.add_label("NEUTRAL") -def test_initialize_examples(): +def test_error_with_multi_labels(): nlp = Language() textcat = nlp.add_pipe("textcat") - for text, annotations in TRAIN_DATA: + train_examples = [] + for text, annotations in TRAIN_DATA_MULTI_LABEL: + train_examples.append(Example.from_dict(nlp.make_doc(text), annotations)) + with pytest.raises(ValueError): + optimizer = nlp.initialize(get_examples=lambda: train_examples) + + +@pytest.mark.parametrize( + "name,get_examples, train_data", + [ + ("textcat", make_get_examples_single_label, TRAIN_DATA_SINGLE_LABEL), + ("textcat_multilabel", make_get_examples_multi_label, TRAIN_DATA_MULTI_LABEL), + ], +) +def test_initialize_examples(name, get_examples, train_data): + nlp = Language() + textcat = nlp.add_pipe(name) + for text, annotations in train_data: for label, value in annotations.get("cats").items(): textcat.add_label(label) # you shouldn't really call this more than once, but for testing it should be fine nlp.initialize() - get_examples = make_get_examples(nlp) - nlp.initialize(get_examples=get_examples) + nlp.initialize(get_examples=get_examples(nlp)) with pytest.raises(TypeError): nlp.initialize(get_examples=lambda: None) with pytest.raises(TypeError): @@ -138,12 +180,10 @@ def test_overfitting_IO(): # Simple test to try and quickly overfit the single-label textcat component - ensuring the ML models work correctly fix_random_seed(0) nlp = English() - nlp.config["initialize"]["components"]["textcat"] = {"positive_label": "POSITIVE"} - # Set exclusive labels - config = {"model": {"linear_model": {"exclusive_classes": True}}} - textcat = nlp.add_pipe("textcat", config=config) + textcat = nlp.add_pipe("textcat") + train_examples = [] - for text, annotations in TRAIN_DATA: + for text, annotations in TRAIN_DATA_SINGLE_LABEL: train_examples.append(Example.from_dict(nlp.make_doc(text), annotations)) optimizer = nlp.initialize(get_examples=lambda: train_examples) assert textcat.model.get_dim("nO") == 2 @@ -172,6 +212,8 @@ def test_overfitting_IO(): # Test scoring scores = nlp.evaluate(train_examples) assert scores["cats_micro_f"] == 1.0 + assert scores["cats_macro_f"] == 1.0 + assert scores["cats_macro_auc"] == 1.0 assert scores["cats_score"] == 1.0 assert "cats_score_desc" in scores @@ -192,7 +234,7 @@ def test_overfitting_IO_multi(): config = {"model": {"linear_model": {"exclusive_classes": False}}} textcat = nlp.add_pipe("textcat", config=config) train_examples = [] - for text, annotations in TRAIN_DATA: + for text, annotations in TRAIN_DATA_MULTI_LABEL: train_examples.append(Example.from_dict(nlp.make_doc(text), annotations)) optimizer = nlp.initialize(get_examples=lambda: train_examples) assert textcat.model.get_dim("nO") == 2 @@ -231,27 +273,75 @@ def test_overfitting_IO_multi(): assert_equal(batch_cats_1, no_batch_cats) +def test_overfitting_IO_multi(): + # Simple test to try and quickly overfit the multi-label textcat component - ensuring the ML models work correctly + fix_random_seed(0) + nlp = English() + textcat = nlp.add_pipe("textcat_multilabel") + + train_examples = [] + for text, annotations in TRAIN_DATA_MULTI_LABEL: + train_examples.append(Example.from_dict(nlp.make_doc(text), annotations)) + optimizer = nlp.initialize(get_examples=lambda: train_examples) + assert textcat.model.get_dim("nO") == 3 + + for i in range(100): + losses = {} + nlp.update(train_examples, sgd=optimizer, losses=losses) + assert losses["textcat_multilabel"] < 0.01 + + # test the trained model + test_text = "I am confused but happy." + doc = nlp(test_text) + cats = doc.cats + assert cats["HAPPY"] > 0.9 + assert cats["CONFUSED"] > 0.9 + + # Also test the results are still the same after IO + with make_tempdir() as tmp_dir: + nlp.to_disk(tmp_dir) + nlp2 = util.load_model_from_path(tmp_dir) + doc2 = nlp2(test_text) + cats2 = doc2.cats + assert cats2["HAPPY"] > 0.9 + assert cats2["CONFUSED"] > 0.9 + + # Test scoring + scores = nlp.evaluate(train_examples) + assert scores["cats_micro_f"] == 1.0 + assert scores["cats_macro_f"] == 1.0 + assert "cats_score_desc" in scores + + # Make sure that running pipe twice, or comparing to call, always amounts to the same predictions + texts = ["Just a sentence.", "I like green eggs.", "I am happy.", "I eat ham."] + batch_deps_1 = [doc.cats for doc in nlp.pipe(texts)] + batch_deps_2 = [doc.cats for doc in nlp.pipe(texts)] + no_batch_deps = [doc.cats for doc in [nlp(text) for text in texts]] + assert_equal(batch_deps_1, batch_deps_2) + assert_equal(batch_deps_1, no_batch_deps) + + # fmt: off @pytest.mark.parametrize( - "textcat_config", + "name,train_data,textcat_config", [ - {"@architectures": "spacy.TextCatBOW.v1", "exclusive_classes": False, "ngram_size": 1, "no_output_layer": False}, - {"@architectures": "spacy.TextCatBOW.v1", "exclusive_classes": True, "ngram_size": 4, "no_output_layer": False}, - {"@architectures": "spacy.TextCatBOW.v1", "exclusive_classes": False, "ngram_size": 3, "no_output_layer": True}, - {"@architectures": "spacy.TextCatBOW.v1", "exclusive_classes": True, "ngram_size": 2, "no_output_layer": True}, - {"@architectures": "spacy.TextCatEnsemble.v2", "tok2vec": DEFAULT_TOK2VEC_MODEL, "linear_model": {"@architectures": "spacy.TextCatBOW.v1", "exclusive_classes": False, "ngram_size": 1, "no_output_layer": False}}, - {"@architectures": "spacy.TextCatEnsemble.v2", "tok2vec": DEFAULT_TOK2VEC_MODEL, "linear_model": {"@architectures": "spacy.TextCatBOW.v1", "exclusive_classes": True, "ngram_size": 5, "no_output_layer": False}}, - {"@architectures": "spacy.TextCatCNN.v1", "tok2vec": DEFAULT_TOK2VEC_MODEL, "exclusive_classes": True}, - {"@architectures": "spacy.TextCatCNN.v1", "tok2vec": DEFAULT_TOK2VEC_MODEL, "exclusive_classes": False}, + ("textcat_multilabel", TRAIN_DATA_MULTI_LABEL, {"@architectures": "spacy.TextCatBOW.v1", "exclusive_classes": False, "ngram_size": 1, "no_output_layer": False}), + ("textcat", TRAIN_DATA_SINGLE_LABEL, {"@architectures": "spacy.TextCatBOW.v1", "exclusive_classes": True, "ngram_size": 4, "no_output_layer": False}), + ("textcat_multilabel", TRAIN_DATA_MULTI_LABEL, {"@architectures": "spacy.TextCatBOW.v1", "exclusive_classes": False, "ngram_size": 3, "no_output_layer": True}), + ("textcat", TRAIN_DATA_SINGLE_LABEL, {"@architectures": "spacy.TextCatBOW.v1", "exclusive_classes": True, "ngram_size": 2, "no_output_layer": True}), + ("textcat_multilabel", TRAIN_DATA_MULTI_LABEL, {"@architectures": "spacy.TextCatEnsemble.v2", "tok2vec": DEFAULT_TOK2VEC_MODEL, "linear_model": {"@architectures": "spacy.TextCatBOW.v1", "exclusive_classes": False, "ngram_size": 1, "no_output_layer": False}}), + ("textcat", TRAIN_DATA_SINGLE_LABEL, {"@architectures": "spacy.TextCatEnsemble.v2", "tok2vec": DEFAULT_TOK2VEC_MODEL, "linear_model": {"@architectures": "spacy.TextCatBOW.v1", "exclusive_classes": True, "ngram_size": 5, "no_output_layer": False}}), + ("textcat", TRAIN_DATA_SINGLE_LABEL, {"@architectures": "spacy.TextCatCNN.v1", "tok2vec": DEFAULT_TOK2VEC_MODEL, "exclusive_classes": True}), + ("textcat_multilabel", TRAIN_DATA_MULTI_LABEL, {"@architectures": "spacy.TextCatCNN.v1", "tok2vec": DEFAULT_TOK2VEC_MODEL, "exclusive_classes": False}), ], ) # fmt: on -def test_textcat_configs(textcat_config): +def test_textcat_configs(name, train_data, textcat_config): pipe_config = {"model": textcat_config} nlp = English() - textcat = nlp.add_pipe("textcat", config=pipe_config) + textcat = nlp.add_pipe(name, config=pipe_config) train_examples = [] - for text, annotations in TRAIN_DATA: + for text, annotations in train_data: train_examples.append(Example.from_dict(nlp.make_doc(text), annotations)) for label, value in annotations.get("cats").items(): textcat.add_label(label) @@ -264,15 +354,24 @@ def test_textcat_configs(textcat_config): def test_positive_class(): nlp = English() textcat = nlp.add_pipe("textcat") - get_examples = make_get_examples(nlp) + get_examples = make_get_examples_single_label(nlp) textcat.initialize(get_examples, labels=["POS", "NEG"], positive_label="POS") assert textcat.labels == ("POS", "NEG") + assert textcat.cfg["positive_label"] == "POS" + + textcat_multilabel = nlp.add_pipe("textcat_multilabel") + get_examples = make_get_examples_multi_label(nlp) + with pytest.raises(TypeError): + textcat_multilabel.initialize(get_examples, labels=["POS", "NEG"], positive_label="POS") + textcat_multilabel.initialize(get_examples, labels=["FICTION", "DRAMA"]) + assert textcat_multilabel.labels == ("FICTION", "DRAMA") + assert "positive_label" not in textcat_multilabel.cfg def test_positive_class_not_present(): nlp = English() textcat = nlp.add_pipe("textcat") - get_examples = make_get_examples(nlp) + get_examples = make_get_examples_single_label(nlp) with pytest.raises(ValueError): textcat.initialize(get_examples, labels=["SOME", "THING"], positive_label="POS") @@ -280,11 +379,9 @@ def test_positive_class_not_present(): def test_positive_class_not_binary(): nlp = English() textcat = nlp.add_pipe("textcat") - get_examples = make_get_examples(nlp) + get_examples = make_get_examples_multi_label(nlp) with pytest.raises(ValueError): - textcat.initialize( - get_examples, labels=["SOME", "THING", "POS"], positive_label="POS" - ) + textcat.initialize(get_examples, labels=["SOME", "THING", "POS"], positive_label="POS") def test_textcat_evaluation(): diff --git a/spacy/tests/regression/test_issue5501-6000.py b/spacy/tests/regression/test_issue5501-6000.py index 65fc8dda7..8d1199e98 100644 --- a/spacy/tests/regression/test_issue5501-6000.py +++ b/spacy/tests/regression/test_issue5501-6000.py @@ -2,8 +2,11 @@ import pytest from thinc.api import Config, fix_random_seed from spacy.lang.en import English -from spacy.pipeline.textcat import default_model_config, bow_model_config -from spacy.pipeline.textcat import cnn_model_config +from spacy.pipeline.textcat import single_label_default_config, single_label_bow_config +from spacy.pipeline.textcat import single_label_cnn_config +from spacy.pipeline.textcat_multilabel import multi_label_default_config +from spacy.pipeline.textcat_multilabel import multi_label_bow_config +from spacy.pipeline.textcat_multilabel import multi_label_cnn_config from spacy.tokens import Span from spacy import displacy from spacy.pipeline import merge_entities @@ -11,7 +14,15 @@ from spacy.training import Example @pytest.mark.parametrize( - "textcat_config", [default_model_config, bow_model_config, cnn_model_config] + "textcat_config", + [ + single_label_default_config, + single_label_bow_config, + single_label_cnn_config, + multi_label_default_config, + multi_label_bow_config, + multi_label_cnn_config, + ], ) def test_issue5551(textcat_config): """Test that after fixing the random seed, the results of the pipeline are truly identical""" diff --git a/spacy/tests/serialize/test_serialize_pipeline.py b/spacy/tests/serialize/test_serialize_pipeline.py index 6c24257d0..48c7082bb 100644 --- a/spacy/tests/serialize/test_serialize_pipeline.py +++ b/spacy/tests/serialize/test_serialize_pipeline.py @@ -4,7 +4,7 @@ from spacy.pipeline import Tagger, DependencyParser, EntityRecognizer from spacy.pipeline import TextCategorizer, SentenceRecognizer, TrainablePipe from spacy.pipeline.dep_parser import DEFAULT_PARSER_MODEL from spacy.pipeline.tagger import DEFAULT_TAGGER_MODEL -from spacy.pipeline.textcat import DEFAULT_TEXTCAT_MODEL +from spacy.pipeline.textcat import DEFAULT_SINGLE_TEXTCAT_MODEL from spacy.pipeline.senter import DEFAULT_SENTER_MODEL from spacy.lang.en import English from thinc.api import Linear @@ -188,7 +188,7 @@ def test_serialize_tagger_strings(en_vocab, de_vocab, taggers): def test_serialize_textcat_empty(en_vocab): # See issue #1105 - cfg = {"model": DEFAULT_TEXTCAT_MODEL} + cfg = {"model": DEFAULT_SINGLE_TEXTCAT_MODEL} model = registry.resolve(cfg, validate=True)["model"] textcat = TextCategorizer(en_vocab, model, threshold=0.5) textcat.to_bytes(exclude=["vocab"]) diff --git a/spacy/tests/training/test_readers.py b/spacy/tests/training/test_readers.py index 86166a048..1c698abcc 100644 --- a/spacy/tests/training/test_readers.py +++ b/spacy/tests/training/test_readers.py @@ -51,7 +51,7 @@ def test_readers(): for example in train_corpus(nlp): nlp.update([example], sgd=optimizer) scores = nlp.evaluate(list(dev_corpus(nlp))) - assert scores["cats_score"] == 0.0 + assert scores["cats_macro_auc"] == 0.0 # ensure the pipeline runs doc = nlp("Quick test") assert doc.cats diff --git a/website/docs/api/data-formats.md b/website/docs/api/data-formats.md index fc2cda547..4f134c808 100644 --- a/website/docs/api/data-formats.md +++ b/website/docs/api/data-formats.md @@ -94,7 +94,7 @@ Defines the `nlp` object, its tokenizer and > > [components.textcat.model] > @architectures = "spacy.TextCatBOW.v1" -> exclusive_classes = false +> exclusive_classes = true > ngram_size = 1 > no_output_layer = false > ``` diff --git a/website/docs/api/multilabel_textcategorizer.md b/website/docs/api/multilabel_textcategorizer.md new file mode 100644 index 000000000..8400c68f7 --- /dev/null +++ b/website/docs/api/multilabel_textcategorizer.md @@ -0,0 +1,454 @@ +--- +title: Multi-label TextCategorizer +tag: class +source: spacy/pipeline/textcat_multilabel.py +new: 3 +teaser: 'Pipeline component for multi-label text classification' +api_base_class: /api/pipe +api_string_name: textcat_multilabel +api_trainable: true +--- + +The text categorizer predicts **categories over a whole document**. It +learns non-mutually exclusive labels, which means that zero or more labels +may be true per document. + +## Config and implementation {#config} + +The default config is defined by the pipeline component factory and describes +how the component should be configured. You can override its settings via the +`config` argument on [`nlp.add_pipe`](/api/language#add_pipe) or in your +[`config.cfg` for training](/usage/training#config). See the +[model architectures](/api/architectures) documentation for details on the +architectures and their arguments and hyperparameters. + +> #### Example +> +> ```python +> from spacy.pipeline.textcat_multilabel import DEFAULT_MULTI_TEXTCAT_MODEL +> config = { +> "threshold": 0.5, +> "model": DEFAULT_MULTI_TEXTCAT_MODEL, +> } +> nlp.add_pipe("textcat_multilabel", config=config) +> ``` + +| Setting | Description | +| ----------- | ---------------------------------------------------------------------------------------------------------------------------------------------------------------- | +| `threshold` | Cutoff to consider a prediction "positive", relevant when printing accuracy results. ~~float~~ | +| `model` | A model instance that predicts scores for each category. Defaults to [TextCatEnsemble](/api/architectures#TextCatEnsemble). ~~Model[List[Doc], List[Floats2d]]~~ | + +```python +%%GITHUB_SPACY/spacy/pipeline/textcat_multilabel.py +``` + +## MultiLabel_TextCategorizer.\_\_init\_\_ {#init tag="method"} + +> #### Example +> +> ```python +> # Construction via add_pipe with default model +> textcat = nlp.add_pipe("textcat_multilabel") +> +> # Construction via add_pipe with custom model +> config = {"model": {"@architectures": "my_textcat"}} +> parser = nlp.add_pipe("textcat_multilabel", config=config) +> +> # Construction from class +> from spacy.pipeline import MultiLabel_TextCategorizer +> textcat = MultiLabel_TextCategorizer(nlp.vocab, model, threshold=0.5) +> ``` + +Create a new pipeline instance. In your application, you would normally use a +shortcut for this and instantiate the component using its string name and +[`nlp.add_pipe`](/api/language#create_pipe). + +| Name | Description | +| -------------- | -------------------------------------------------------------------------------------------------------------------------- | +| `vocab` | The shared vocabulary. ~~Vocab~~ | +| `model` | The Thinc [`Model`](https://thinc.ai/docs/api-model) powering the pipeline component. ~~Model[List[Doc], List[Floats2d]]~~ | +| `name` | String name of the component instance. Used to add entries to the `losses` during training. ~~str~~ | +| _keyword-only_ | | +| `threshold` | Cutoff to consider a prediction "positive", relevant when printing accuracy results. ~~float~~ | + +## MultiLabel_TextCategorizer.\_\_call\_\_ {#call tag="method"} + +Apply the pipe to one document. The document is modified in place, and returned. +This usually happens under the hood when the `nlp` object is called on a text +and all pipeline components are applied to the `Doc` in order. Both +[`__call__`](/api/multilabel_textcategorizer#call) and [`pipe`](/api/multilabel_textcategorizer#pipe) +delegate to the [`predict`](/api/multilabel_textcategorizer#predict) and +[`set_annotations`](/api/multilabel_textcategorizer#set_annotations) methods. + +> #### Example +> +> ```python +> doc = nlp("This is a sentence.") +> textcat = nlp.add_pipe("textcat_multilabel") +> # This usually happens under the hood +> processed = textcat(doc) +> ``` + +| Name | Description | +| ----------- | -------------------------------- | +| `doc` | The document to process. ~~Doc~~ | +| **RETURNS** | The processed document. ~~Doc~~ | + +## MultiLabel_TextCategorizer.pipe {#pipe tag="method"} + +Apply the pipe to a stream of documents. This usually happens under the hood +when the `nlp` object is called on a text and all pipeline components are +applied to the `Doc` in order. Both [`__call__`](/api/multilabel_textcategorizer#call) and +[`pipe`](/api/multilabel_textcategorizer#pipe) delegate to the +[`predict`](/api/multilabel_textcategorizer#predict) and +[`set_annotations`](/api/multilabel_textcategorizer#set_annotations) methods. + +> #### Example +> +> ```python +> textcat = nlp.add_pipe("textcat_multilabel") +> for doc in textcat.pipe(docs, batch_size=50): +> pass +> ``` + +| Name | Description | +| -------------- | ------------------------------------------------------------- | +| `stream` | A stream of documents. ~~Iterable[Doc]~~ | +| _keyword-only_ | | +| `batch_size` | The number of documents to buffer. Defaults to `128`. ~~int~~ | +| **YIELDS** | The processed documents in order. ~~Doc~~ | + +## MultiLabel_TextCategorizer.initialize {#initialize tag="method" new="3"} + +Initialize the component for training. `get_examples` should be a function that +returns an iterable of [`Example`](/api/example) objects. The data examples are +used to **initialize the model** of the component and can either be the full +training data or a representative sample. Initialization includes validating the +network, +[inferring missing shapes](https://thinc.ai/docs/usage-models#validation) and +setting up the label scheme based on the data. This method is typically called +by [`Language.initialize`](/api/language#initialize) and lets you customize +arguments it receives via the +[`[initialize.components]`](/api/data-formats#config-initialize) block in the +config. + + + +This method was previously called `begin_training`. + + + +> #### Example +> +> ```python +> textcat = nlp.add_pipe("textcat_multilabel") +> textcat.initialize(lambda: [], nlp=nlp) +> ``` +> +> ```ini +> ### config.cfg +> [initialize.components.textcat_multilabel] +> +> [initialize.components.textcat_multilabel.labels] +> @readers = "spacy.read_labels.v1" +> path = "corpus/labels/textcat.json +> ``` + +| Name | Description | +| ---------------- | ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | +| `get_examples` | Function that returns gold-standard annotations in the form of [`Example`](/api/example) objects. ~~Callable[[], Iterable[Example]]~~ | +| _keyword-only_ | | +| `nlp` | The current `nlp` object. Defaults to `None`. ~~Optional[Language]~~ | +| `labels` | The label information to add to the component, as provided by the [`label_data`](#label_data) property after initialization. To generate a reusable JSON file from your data, you should run the [`init labels`](/api/cli#init-labels) command. If no labels are provided, the `get_examples` callback is used to extract the labels from the data, which may be a lot slower. ~~Optional[Iterable[str]]~~ | + +## MultiLabel_TextCategorizer.predict {#predict tag="method"} + +Apply the component's model to a batch of [`Doc`](/api/doc) objects without +modifying them. + +> #### Example +> +> ```python +> textcat = nlp.add_pipe("textcat_multilabel") +> scores = textcat.predict([doc1, doc2]) +> ``` + +| Name | Description | +| ----------- | ------------------------------------------- | +| `docs` | The documents to predict. ~~Iterable[Doc]~~ | +| **RETURNS** | The model's prediction for each document. | + +## MultiLabel_TextCategorizer.set_annotations {#set_annotations tag="method"} + +Modify a batch of [`Doc`](/api/doc) objects using pre-computed scores. + +> #### Example +> +> ```python +> textcat = nlp.add_pipe("textcat_multilabel") +> scores = textcat.predict(docs) +> textcat.set_annotations(docs, scores) +> ``` + +| Name | Description | +| -------- | --------------------------------------------------------- | +| `docs` | The documents to modify. ~~Iterable[Doc]~~ | +| `scores` | The scores to set, produced by `MultiLabel_TextCategorizer.predict`. | + +## MultiLabel_TextCategorizer.update {#update tag="method"} + +Learn from a batch of [`Example`](/api/example) objects containing the +predictions and gold-standard annotations, and update the component's model. +Delegates to [`predict`](/api/multilabel_textcategorizer#predict) and +[`get_loss`](/api/multilabel_textcategorizer#get_loss). + +> #### Example +> +> ```python +> textcat = nlp.add_pipe("textcat_multilabel") +> optimizer = nlp.initialize() +> losses = textcat.update(examples, sgd=optimizer) +> ``` + +| Name | Description | +| ----------------- | ---------------------------------------------------------------------------------------------------------------------------------- | +| `examples` | A batch of [`Example`](/api/example) objects to learn from. ~~Iterable[Example]~~ | +| _keyword-only_ | | +| `drop` | The dropout rate. ~~float~~ | +| `set_annotations` | Whether or not to update the `Example` objects with the predictions, delegating to [`set_annotations`](#set_annotations). ~~bool~~ | +| `sgd` | An optimizer. Will be created via [`create_optimizer`](#create_optimizer) if not set. ~~Optional[Optimizer]~~ | +| `losses` | Optional record of the loss during training. Updated using the component name as the key. ~~Optional[Dict[str, float]]~~ | +| **RETURNS** | The updated `losses` dictionary. ~~Dict[str, float]~~ | + +## MultiLabel_TextCategorizer.rehearse {#rehearse tag="method,experimental" new="3"} + +Perform a "rehearsal" update from a batch of data. Rehearsal updates teach the +current model to make predictions similar to an initial model to try to address +the "catastrophic forgetting" problem. This feature is experimental. + +> #### Example +> +> ```python +> textcat = nlp.add_pipe("textcat_multilabel") +> optimizer = nlp.resume_training() +> losses = textcat.rehearse(examples, sgd=optimizer) +> ``` + +| Name | Description | +| -------------- | ------------------------------------------------------------------------------------------------------------------------ | +| `examples` | A batch of [`Example`](/api/example) objects to learn from. ~~Iterable[Example]~~ | +| _keyword-only_ | | +| `drop` | The dropout rate. ~~float~~ | +| `sgd` | An optimizer. Will be created via [`create_optimizer`](#create_optimizer) if not set. ~~Optional[Optimizer]~~ | +| `losses` | Optional record of the loss during training. Updated using the component name as the key. ~~Optional[Dict[str, float]]~~ | +| **RETURNS** | The updated `losses` dictionary. ~~Dict[str, float]~~ | + +## MultiLabel_TextCategorizer.get_loss {#get_loss tag="method"} + +Find the loss and gradient of loss for the batch of documents and their +predicted scores. + +> #### Example +> +> ```python +> textcat = nlp.add_pipe("textcat_multilabel") +> scores = textcat.predict([eg.predicted for eg in examples]) +> loss, d_loss = textcat.get_loss(examples, scores) +> ``` + +| Name | Description | +| ----------- | --------------------------------------------------------------------------- | +| `examples` | The batch of examples. ~~Iterable[Example]~~ | +| `scores` | Scores representing the model's predictions. | +| **RETURNS** | The loss and the gradient, i.e. `(loss, gradient)`. ~~Tuple[float, float]~~ | + +## MultiLabel_TextCategorizer.score {#score tag="method" new="3"} + +Score a batch of examples. + +> #### Example +> +> ```python +> scores = textcat.score(examples) +> ``` + +| Name | Description | +| ---------------- | -------------------------------------------------------------------------------------------------------------------- | +| `examples` | The examples to score. ~~Iterable[Example]~~ | +| _keyword-only_ | | +| **RETURNS** | The scores, produced by [`Scorer.score_cats`](/api/scorer#score_cats). ~~Dict[str, Union[float, Dict[str, float]]]~~ | + +## MultiLabel_TextCategorizer.create_optimizer {#create_optimizer tag="method"} + +Create an optimizer for the pipeline component. + +> #### Example +> +> ```python +> textcat = nlp.add_pipe("textcat") +> optimizer = textcat.create_optimizer() +> ``` + +| Name | Description | +| ----------- | ---------------------------- | +| **RETURNS** | The optimizer. ~~Optimizer~~ | + +## MultiLabel_TextCategorizer.use_params {#use_params tag="method, contextmanager"} + +Modify the pipe's model to use the given parameter values. + +> #### Example +> +> ```python +> textcat = nlp.add_pipe("textcat") +> with textcat.use_params(optimizer.averages): +> textcat.to_disk("/best_model") +> ``` + +| Name | Description | +| -------- | -------------------------------------------------- | +| `params` | The parameter values to use in the model. ~~dict~~ | + +## MultiLabel_TextCategorizer.add_label {#add_label tag="method"} + +Add a new label to the pipe. Raises an error if the output dimension is already +set, or if the model has already been fully [initialized](#initialize). Note +that you don't have to call this method if you provide a **representative data +sample** to the [`initialize`](#initialize) method. In this case, all labels +found in the sample will be automatically added to the model, and the output +dimension will be [inferred](/usage/layers-architectures#thinc-shape-inference) +automatically. + +> #### Example +> +> ```python +> textcat = nlp.add_pipe("textcat") +> textcat.add_label("MY_LABEL") +> ``` + +| Name | Description | +| ----------- | ----------------------------------------------------------- | +| `label` | The label to add. ~~str~~ | +| **RETURNS** | `0` if the label is already present, otherwise `1`. ~~int~~ | + +## MultiLabel_TextCategorizer.to_disk {#to_disk tag="method"} + +Serialize the pipe to disk. + +> #### Example +> +> ```python +> textcat = nlp.add_pipe("textcat") +> textcat.to_disk("/path/to/textcat") +> ``` + +| Name | Description | +| -------------- | ------------------------------------------------------------------------------------------------------------------------------------------ | +| `path` | A path to a directory, which will be created if it doesn't exist. Paths may be either strings or `Path`-like objects. ~~Union[str, Path]~~ | +| _keyword-only_ | | +| `exclude` | String names of [serialization fields](#serialization-fields) to exclude. ~~Iterable[str]~~ | + +## MultiLabel_TextCategorizer.from_disk {#from_disk tag="method"} + +Load the pipe from disk. Modifies the object in place and returns it. + +> #### Example +> +> ```python +> textcat = nlp.add_pipe("textcat") +> textcat.from_disk("/path/to/textcat") +> ``` + +| Name | Description | +| -------------- | ----------------------------------------------------------------------------------------------- | +| `path` | A path to a directory. Paths may be either strings or `Path`-like objects. ~~Union[str, Path]~~ | +| _keyword-only_ | | +| `exclude` | String names of [serialization fields](#serialization-fields) to exclude. ~~Iterable[str]~~ | +| **RETURNS** | The modified `MultiLabel_TextCategorizer` object. ~~MultiLabel_TextCategorizer~~ | + +## MultiLabel_TextCategorizer.to_bytes {#to_bytes tag="method"} + +> #### Example +> +> ```python +> textcat = nlp.add_pipe("textcat") +> textcat_bytes = textcat.to_bytes() +> ``` + +Serialize the pipe to a bytestring. + +| Name | Description | +| -------------- | ------------------------------------------------------------------------------------------- | +| _keyword-only_ | | +| `exclude` | String names of [serialization fields](#serialization-fields) to exclude. ~~Iterable[str]~~ | +| **RETURNS** | The serialized form of the `MultiLabel_TextCategorizer` object. ~~bytes~~ | + +## MultiLabel_TextCategorizer.from_bytes {#from_bytes tag="method"} + +Load the pipe from a bytestring. Modifies the object in place and returns it. + +> #### Example +> +> ```python +> textcat_bytes = textcat.to_bytes() +> textcat = nlp.add_pipe("textcat") +> textcat.from_bytes(textcat_bytes) +> ``` + +| Name | Description | +| -------------- | ------------------------------------------------------------------------------------------- | +| `bytes_data` | The data to load from. ~~bytes~~ | +| _keyword-only_ | | +| `exclude` | String names of [serialization fields](#serialization-fields) to exclude. ~~Iterable[str]~~ | +| **RETURNS** | The `MultiLabel_TextCategorizer` object. ~~MultiLabel_TextCategorizer~~ | + +## MultiLabel_TextCategorizer.labels {#labels tag="property"} + +The labels currently added to the component. + +> #### Example +> +> ```python +> textcat.add_label("MY_LABEL") +> assert "MY_LABEL" in textcat.labels +> ``` + +| Name | Description | +| ----------- | ------------------------------------------------------ | +| **RETURNS** | The labels added to the component. ~~Tuple[str, ...]~~ | + +## MultiLabel_TextCategorizer.label_data {#label_data tag="property" new="3"} + +The labels currently added to the component and their internal meta information. +This is the data generated by [`init labels`](/api/cli#init-labels) and used by +[`MultiLabel_TextCategorizer.initialize`](/api/multilabel_textcategorizer#initialize) to initialize +the model with a pre-defined label set. + +> #### Example +> +> ```python +> labels = textcat.label_data +> textcat.initialize(lambda: [], nlp=nlp, labels=labels) +> ``` + +| Name | Description | +| ----------- | ---------------------------------------------------------- | +| **RETURNS** | The label data added to the component. ~~Tuple[str, ...]~~ | + +## Serialization fields {#serialization-fields} + +During serialization, spaCy will export several data fields used to restore +different aspects of the object. If needed, you can exclude them from +serialization by passing in the string names via the `exclude` argument. + +> #### Example +> +> ```python +> data = textcat.to_disk("/path", exclude=["vocab"]) +> ``` + +| Name | Description | +| ------- | -------------------------------------------------------------- | +| `vocab` | The shared [`Vocab`](/api/vocab). | +| `cfg` | The config file. You usually don't want to exclude this. | +| `model` | The binary model data. You usually don't want to exclude this. | diff --git a/website/docs/api/textcategorizer.md b/website/docs/api/textcategorizer.md index 447765e15..016b9c310 100644 --- a/website/docs/api/textcategorizer.md +++ b/website/docs/api/textcategorizer.md @@ -3,17 +3,15 @@ title: TextCategorizer tag: class source: spacy/pipeline/textcat.py new: 2 -teaser: 'Pipeline component for text classification' +teaser: 'Pipeline component for single-label text classification' api_base_class: /api/pipe api_string_name: textcat api_trainable: true --- The text categorizer predicts **categories over a whole document**. It can learn -one or more labels, and the labels can be mutually exclusive (i.e. one true -label per document) or non-mutually exclusive (i.e. zero or more labels may be -true per document). The multi-label setting is controlled by the model instance -that's provided. +one or more labels, and the labels are mutually exclusive - there is exactly one +true label per document. ## Config and implementation {#config} @@ -27,10 +25,10 @@ architectures and their arguments and hyperparameters. > #### Example > > ```python -> from spacy.pipeline.textcat import DEFAULT_TEXTCAT_MODEL +> from spacy.pipeline.textcat import DEFAULT_SINGLE_TEXTCAT_MODEL > config = { > "threshold": 0.5, -> "model": DEFAULT_TEXTCAT_MODEL, +> "model": DEFAULT_SINGLE_TEXTCAT_MODEL, > } > nlp.add_pipe("textcat", config=config) > ``` @@ -280,7 +278,6 @@ Score a batch of examples. | ---------------- | -------------------------------------------------------------------------------------------------------------------- | | `examples` | The examples to score. ~~Iterable[Example]~~ | | _keyword-only_ | | -| `positive_label` | Optional positive label. ~~Optional[str]~~ | | **RETURNS** | The scores, produced by [`Scorer.score_cats`](/api/scorer#score_cats). ~~Dict[str, Union[float, Dict[str, float]]]~~ | ## TextCategorizer.create_optimizer {#create_optimizer tag="method"} diff --git a/website/docs/usage/layers-architectures.md b/website/docs/usage/layers-architectures.md index eb6f8b288..72669b5e8 100644 --- a/website/docs/usage/layers-architectures.md +++ b/website/docs/usage/layers-architectures.md @@ -152,7 +152,7 @@ depth = 2 [components.textcat.model.linear_model] @architectures = "spacy.TextCatBOW.v1" -exclusive_classes = false +exclusive_classes = true ngram_size = 1 no_output_layer = false ``` @@ -170,7 +170,7 @@ labels = [] [components.textcat.model] @architectures = "spacy.TextCatBOW.v1" -exclusive_classes = false +exclusive_classes = true ngram_size = 1 no_output_layer = false nO = null