mirror of
https://github.com/explosion/spaCy.git
synced 2024-12-24 17:06:29 +03:00
multi-label textcat component (#6474)
* multi-label textcat component * formatting * fix comment * cleanup * fix from #6481 * random edit to push the tests * add explicit error when textcat is called with multi-label gold data * fix error nr * small fix
This commit is contained in:
parent
1a77607036
commit
afc5714d32
|
@ -169,7 +169,7 @@ def init_config(
|
|||
"Hardware": variables["hardware"].upper(),
|
||||
"Transformer": template_vars.transformer.get("name", False),
|
||||
}
|
||||
msg.info("Generated template specific for your use case")
|
||||
msg.info("Generated config template specific for your use case")
|
||||
for label, value in use_case.items():
|
||||
msg.text(f"- {label}: {value}")
|
||||
with show_validation_error(hint_fill=False):
|
||||
|
|
|
@ -149,13 +149,44 @@ grad_factor = 1.0
|
|||
|
||||
[components.textcat.model.linear_model]
|
||||
@architectures = "spacy.TextCatBOW.v1"
|
||||
exclusive_classes = false
|
||||
exclusive_classes = true
|
||||
ngram_size = 1
|
||||
no_output_layer = false
|
||||
|
||||
{% else -%}
|
||||
[components.textcat.model]
|
||||
@architectures = "spacy.TextCatBOW.v1"
|
||||
exclusive_classes = true
|
||||
ngram_size = 1
|
||||
no_output_layer = false
|
||||
{%- endif %}
|
||||
{%- endif %}
|
||||
|
||||
{% if "textcat_multilabel" in components %}
|
||||
[components.textcat_multilabel]
|
||||
factory = "textcat_multilabel"
|
||||
|
||||
{% if optimize == "accuracy" %}
|
||||
[components.textcat_multilabel.model]
|
||||
@architectures = "spacy.TextCatEnsemble.v2"
|
||||
nO = null
|
||||
|
||||
[components.textcat_multilabel.model.tok2vec]
|
||||
@architectures = "spacy-transformers.TransformerListener.v1"
|
||||
grad_factor = 1.0
|
||||
|
||||
[components.textcat_multilabel.model.tok2vec.pooling]
|
||||
@layers = "reduce_mean.v1"
|
||||
|
||||
[components.textcat_multilabel.model.linear_model]
|
||||
@architectures = "spacy.TextCatBOW.v1"
|
||||
exclusive_classes = false
|
||||
ngram_size = 1
|
||||
no_output_layer = false
|
||||
|
||||
{% else -%}
|
||||
[components.textcat_multilabel.model]
|
||||
@architectures = "spacy.TextCatBOW.v1"
|
||||
exclusive_classes = false
|
||||
ngram_size = 1
|
||||
no_output_layer = false
|
||||
|
@ -288,13 +319,41 @@ width = ${components.tok2vec.model.encode.width}
|
|||
|
||||
[components.textcat.model.linear_model]
|
||||
@architectures = "spacy.TextCatBOW.v1"
|
||||
exclusive_classes = false
|
||||
exclusive_classes = true
|
||||
ngram_size = 1
|
||||
no_output_layer = false
|
||||
|
||||
{% else -%}
|
||||
[components.textcat.model]
|
||||
@architectures = "spacy.TextCatBOW.v1"
|
||||
exclusive_classes = true
|
||||
ngram_size = 1
|
||||
no_output_layer = false
|
||||
{%- endif %}
|
||||
{%- endif %}
|
||||
|
||||
{% if "textcat_multilabel" in components %}
|
||||
[components.textcat_multilabel]
|
||||
factory = "textcat_multilabel"
|
||||
|
||||
{% if optimize == "accuracy" %}
|
||||
[components.textcat_multilabel.model]
|
||||
@architectures = "spacy.TextCatEnsemble.v2"
|
||||
nO = null
|
||||
|
||||
[components.textcat_multilabel.model.tok2vec]
|
||||
@architectures = "spacy.Tok2VecListener.v1"
|
||||
width = ${components.tok2vec.model.encode.width}
|
||||
|
||||
[components.textcat_multilabel.model.linear_model]
|
||||
@architectures = "spacy.TextCatBOW.v1"
|
||||
exclusive_classes = false
|
||||
ngram_size = 1
|
||||
no_output_layer = false
|
||||
|
||||
{% else -%}
|
||||
[components.textcat_multilabel.model]
|
||||
@architectures = "spacy.TextCatBOW.v1"
|
||||
exclusive_classes = false
|
||||
ngram_size = 1
|
||||
no_output_layer = false
|
||||
|
@ -303,7 +362,7 @@ no_output_layer = false
|
|||
{% endif %}
|
||||
|
||||
{% for pipe in components %}
|
||||
{% if pipe not in ["tagger", "morphologizer", "parser", "ner", "textcat", "entity_linker"] %}
|
||||
{% if pipe not in ["tagger", "morphologizer", "parser", "ner", "textcat", "textcat_multilabel", "entity_linker"] %}
|
||||
{# Other components defined by the user: we just assume they're factories #}
|
||||
[components.{{ pipe }}]
|
||||
factory = "{{ pipe }}"
|
||||
|
|
|
@ -463,6 +463,10 @@ class Errors:
|
|||
"issue tracker: http://github.com/explosion/spaCy/issues")
|
||||
|
||||
# TODO: fix numbering after merging develop into master
|
||||
E895 = ("The 'textcat' component received gold-standard annotations with "
|
||||
"multiple labels per document. In spaCy 3 you should use the "
|
||||
"'textcat_multilabel' component for this instead. "
|
||||
"Example of an offending annotation: {value}")
|
||||
E896 = ("There was an error using the static vectors. Ensure that the vectors "
|
||||
"of the vocab are properly initialized, or set 'include_static_vectors' "
|
||||
"to False.")
|
||||
|
|
|
@ -11,6 +11,7 @@ from .senter import SentenceRecognizer
|
|||
from .sentencizer import Sentencizer
|
||||
from .tagger import Tagger
|
||||
from .textcat import TextCategorizer
|
||||
from .textcat_multilabel import MultiLabel_TextCategorizer
|
||||
from .tok2vec import Tok2Vec
|
||||
from .functions import merge_entities, merge_noun_chunks, merge_subtokens
|
||||
|
||||
|
@ -22,13 +23,14 @@ __all__ = [
|
|||
"EntityRuler",
|
||||
"Morphologizer",
|
||||
"Lemmatizer",
|
||||
"TrainablePipe",
|
||||
"MultiLabel_TextCategorizer",
|
||||
"Pipe",
|
||||
"SentenceRecognizer",
|
||||
"Sentencizer",
|
||||
"Tagger",
|
||||
"TextCategorizer",
|
||||
"Tok2Vec",
|
||||
"TrainablePipe",
|
||||
"merge_entities",
|
||||
"merge_noun_chunks",
|
||||
"merge_subtokens",
|
||||
|
|
|
@ -14,7 +14,7 @@ from ..tokens import Doc
|
|||
from ..vocab import Vocab
|
||||
|
||||
|
||||
default_model_config = """
|
||||
single_label_default_config = """
|
||||
[model]
|
||||
@architectures = "spacy.TextCatEnsemble.v2"
|
||||
|
||||
|
@ -37,24 +37,24 @@ depth = 2
|
|||
|
||||
[model.linear_model]
|
||||
@architectures = "spacy.TextCatBOW.v1"
|
||||
exclusive_classes = false
|
||||
exclusive_classes = true
|
||||
ngram_size = 1
|
||||
no_output_layer = false
|
||||
"""
|
||||
DEFAULT_TEXTCAT_MODEL = Config().from_str(default_model_config)["model"]
|
||||
DEFAULT_SINGLE_TEXTCAT_MODEL = Config().from_str(single_label_default_config)["model"]
|
||||
|
||||
bow_model_config = """
|
||||
single_label_bow_config = """
|
||||
[model]
|
||||
@architectures = "spacy.TextCatBOW.v1"
|
||||
exclusive_classes = false
|
||||
exclusive_classes = true
|
||||
ngram_size = 1
|
||||
no_output_layer = false
|
||||
"""
|
||||
|
||||
cnn_model_config = """
|
||||
single_label_cnn_config = """
|
||||
[model]
|
||||
@architectures = "spacy.TextCatCNN.v1"
|
||||
exclusive_classes = false
|
||||
exclusive_classes = true
|
||||
|
||||
[model.tok2vec]
|
||||
@architectures = "spacy.HashEmbedCNN.v1"
|
||||
|
@ -71,7 +71,7 @@ subword_features = true
|
|||
@Language.factory(
|
||||
"textcat",
|
||||
assigns=["doc.cats"],
|
||||
default_config={"threshold": 0.5, "model": DEFAULT_TEXTCAT_MODEL},
|
||||
default_config={"threshold": 0.5, "model": DEFAULT_SINGLE_TEXTCAT_MODEL},
|
||||
default_score_weights={
|
||||
"cats_score": 1.0,
|
||||
"cats_score_desc": None,
|
||||
|
@ -103,7 +103,7 @@ def make_textcat(
|
|||
|
||||
|
||||
class TextCategorizer(TrainablePipe):
|
||||
"""Pipeline component for text classification.
|
||||
"""Pipeline component for single-label text classification.
|
||||
|
||||
DOCS: https://nightly.spacy.io/api/textcategorizer
|
||||
"""
|
||||
|
@ -111,7 +111,7 @@ class TextCategorizer(TrainablePipe):
|
|||
def __init__(
|
||||
self, vocab: Vocab, model: Model, name: str = "textcat", *, threshold: float
|
||||
) -> None:
|
||||
"""Initialize a text categorizer.
|
||||
"""Initialize a text categorizer for single-label classification.
|
||||
|
||||
vocab (Vocab): The shared vocabulary.
|
||||
model (thinc.api.Model): The Thinc Model powering the pipeline component.
|
||||
|
@ -214,6 +214,7 @@ class TextCategorizer(TrainablePipe):
|
|||
losses = {}
|
||||
losses.setdefault(self.name, 0.0)
|
||||
validate_examples(examples, "TextCategorizer.update")
|
||||
self._validate_categories(examples)
|
||||
if not any(len(eg.predicted) if eg.predicted else 0 for eg in examples):
|
||||
# Handle cases where there are no tokens in any docs.
|
||||
return losses
|
||||
|
@ -256,6 +257,7 @@ class TextCategorizer(TrainablePipe):
|
|||
if self._rehearsal_model is None:
|
||||
return losses
|
||||
validate_examples(examples, "TextCategorizer.rehearse")
|
||||
self._validate_categories(examples)
|
||||
docs = [eg.predicted for eg in examples]
|
||||
if not any(len(doc) for doc in docs):
|
||||
# Handle cases where there are no tokens in any docs.
|
||||
|
@ -296,6 +298,7 @@ class TextCategorizer(TrainablePipe):
|
|||
DOCS: https://nightly.spacy.io/api/textcategorizer#get_loss
|
||||
"""
|
||||
validate_examples(examples, "TextCategorizer.get_loss")
|
||||
self._validate_categories(examples)
|
||||
truths, not_missing = self._examples_to_truth(examples)
|
||||
not_missing = self.model.ops.asarray(not_missing)
|
||||
d_scores = (scores - truths) / scores.shape[0]
|
||||
|
@ -341,6 +344,7 @@ class TextCategorizer(TrainablePipe):
|
|||
DOCS: https://nightly.spacy.io/api/textcategorizer#initialize
|
||||
"""
|
||||
validate_get_examples(get_examples, "TextCategorizer.initialize")
|
||||
self._validate_categories(get_examples())
|
||||
if labels is None:
|
||||
for example in get_examples():
|
||||
for cat in example.y.cats:
|
||||
|
@ -373,12 +377,20 @@ class TextCategorizer(TrainablePipe):
|
|||
DOCS: https://nightly.spacy.io/api/textcategorizer#score
|
||||
"""
|
||||
validate_examples(examples, "TextCategorizer.score")
|
||||
self._validate_categories(examples)
|
||||
return Scorer.score_cats(
|
||||
examples,
|
||||
"cats",
|
||||
labels=self.labels,
|
||||
multi_label=self.model.attrs["multi_label"],
|
||||
multi_label=False,
|
||||
positive_label=self.cfg["positive_label"],
|
||||
threshold=self.cfg["threshold"],
|
||||
**kwargs,
|
||||
)
|
||||
|
||||
|
||||
def _validate_categories(self, examples: List[Example]):
|
||||
"""Check whether the provided examples all have single-label cats annotations."""
|
||||
for ex in examples:
|
||||
if list(ex.reference.cats.values()).count(1.0) > 1:
|
||||
raise ValueError(Errors.E895.format(value=ex.reference.cats))
|
||||
|
|
191
spacy/pipeline/textcat_multilabel.py
Normal file
191
spacy/pipeline/textcat_multilabel.py
Normal file
|
@ -0,0 +1,191 @@
|
|||
from itertools import islice
|
||||
from typing import Iterable, Optional, Dict, List, Callable, Any
|
||||
|
||||
from thinc.api import Model, Config
|
||||
from thinc.types import Floats2d
|
||||
|
||||
from ..language import Language
|
||||
from ..training import Example, validate_examples, validate_get_examples
|
||||
from ..errors import Errors
|
||||
from ..scorer import Scorer
|
||||
from ..tokens import Doc
|
||||
from ..vocab import Vocab
|
||||
from .textcat import TextCategorizer
|
||||
|
||||
|
||||
multi_label_default_config = """
|
||||
[model]
|
||||
@architectures = "spacy.TextCatEnsemble.v2"
|
||||
|
||||
[model.tok2vec]
|
||||
@architectures = "spacy.Tok2Vec.v1"
|
||||
|
||||
[model.tok2vec.embed]
|
||||
@architectures = "spacy.MultiHashEmbed.v1"
|
||||
width = 64
|
||||
rows = [2000, 2000, 1000, 1000, 1000, 1000]
|
||||
attrs = ["ORTH", "LOWER", "PREFIX", "SUFFIX", "SHAPE", "ID"]
|
||||
include_static_vectors = false
|
||||
|
||||
[model.tok2vec.encode]
|
||||
@architectures = "spacy.MaxoutWindowEncoder.v1"
|
||||
width = ${model.tok2vec.embed.width}
|
||||
window_size = 1
|
||||
maxout_pieces = 3
|
||||
depth = 2
|
||||
|
||||
[model.linear_model]
|
||||
@architectures = "spacy.TextCatBOW.v1"
|
||||
exclusive_classes = false
|
||||
ngram_size = 1
|
||||
no_output_layer = false
|
||||
"""
|
||||
DEFAULT_MULTI_TEXTCAT_MODEL = Config().from_str(multi_label_default_config)["model"]
|
||||
|
||||
multi_label_bow_config = """
|
||||
[model]
|
||||
@architectures = "spacy.TextCatBOW.v1"
|
||||
exclusive_classes = false
|
||||
ngram_size = 1
|
||||
no_output_layer = false
|
||||
"""
|
||||
|
||||
multi_label_cnn_config = """
|
||||
[model]
|
||||
@architectures = "spacy.TextCatCNN.v1"
|
||||
exclusive_classes = false
|
||||
|
||||
[model.tok2vec]
|
||||
@architectures = "spacy.HashEmbedCNN.v1"
|
||||
pretrained_vectors = null
|
||||
width = 96
|
||||
depth = 4
|
||||
embed_size = 2000
|
||||
window_size = 1
|
||||
maxout_pieces = 3
|
||||
subword_features = true
|
||||
"""
|
||||
|
||||
|
||||
@Language.factory(
|
||||
"textcat_multilabel",
|
||||
assigns=["doc.cats"],
|
||||
default_config={"threshold": 0.5, "model": DEFAULT_MULTI_TEXTCAT_MODEL},
|
||||
default_score_weights={
|
||||
"cats_score": 1.0,
|
||||
"cats_score_desc": None,
|
||||
"cats_micro_p": None,
|
||||
"cats_micro_r": None,
|
||||
"cats_micro_f": None,
|
||||
"cats_macro_p": None,
|
||||
"cats_macro_r": None,
|
||||
"cats_macro_f": None,
|
||||
"cats_macro_auc": None,
|
||||
"cats_f_per_type": None,
|
||||
"cats_macro_auc_per_type": None,
|
||||
},
|
||||
)
|
||||
def make_multilabel_textcat(
|
||||
nlp: Language, name: str, model: Model[List[Doc], List[Floats2d]], threshold: float
|
||||
) -> "TextCategorizer":
|
||||
"""Create a TextCategorizer compoment. The text categorizer predicts categories
|
||||
over a whole document. It can learn one or more labels, and the labels can
|
||||
be mutually exclusive (i.e. one true label per doc) or non-mutually exclusive
|
||||
(i.e. zero or more labels may be true per doc). The multi-label setting is
|
||||
controlled by the model instance that's provided.
|
||||
|
||||
model (Model[List[Doc], List[Floats2d]]): A model instance that predicts
|
||||
scores for each category.
|
||||
threshold (float): Cutoff to consider a prediction "positive".
|
||||
"""
|
||||
return MultiLabel_TextCategorizer(nlp.vocab, model, name, threshold=threshold)
|
||||
|
||||
|
||||
class MultiLabel_TextCategorizer(TextCategorizer):
|
||||
"""Pipeline component for multi-label text classification.
|
||||
|
||||
DOCS: https://nightly.spacy.io/api/multilabel_textcategorizer
|
||||
"""
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
vocab: Vocab,
|
||||
model: Model,
|
||||
name: str = "textcat_multilabel",
|
||||
*,
|
||||
threshold: float,
|
||||
) -> None:
|
||||
"""Initialize a text categorizer for multi-label classification.
|
||||
|
||||
vocab (Vocab): The shared vocabulary.
|
||||
model (thinc.api.Model): The Thinc Model powering the pipeline component.
|
||||
name (str): The component instance name, used to add entries to the
|
||||
losses during training.
|
||||
threshold (float): Cutoff to consider a prediction "positive".
|
||||
|
||||
DOCS: https://nightly.spacy.io/api/multilabel_textcategorizer#init
|
||||
"""
|
||||
self.vocab = vocab
|
||||
self.model = model
|
||||
self.name = name
|
||||
self._rehearsal_model = None
|
||||
cfg = {"labels": [], "threshold": threshold}
|
||||
self.cfg = dict(cfg)
|
||||
|
||||
def initialize(
|
||||
self,
|
||||
get_examples: Callable[[], Iterable[Example]],
|
||||
*,
|
||||
nlp: Optional[Language] = None,
|
||||
labels: Optional[Dict] = None,
|
||||
):
|
||||
"""Initialize the pipe for training, using a representative set
|
||||
of data examples.
|
||||
|
||||
get_examples (Callable[[], Iterable[Example]]): Function that
|
||||
returns a representative sample of gold-standard Example objects.
|
||||
nlp (Language): The current nlp object the component is part of.
|
||||
labels: The labels to add to the component, typically generated by the
|
||||
`init labels` command. If no labels are provided, the get_examples
|
||||
callback is used to extract the labels from the data.
|
||||
|
||||
DOCS: https://nightly.spacy.io/api/multilabel_textcategorizer#initialize
|
||||
"""
|
||||
validate_get_examples(get_examples, "MultiLabel_TextCategorizer.initialize")
|
||||
if labels is None:
|
||||
for example in get_examples():
|
||||
for cat in example.y.cats:
|
||||
self.add_label(cat)
|
||||
else:
|
||||
for label in labels:
|
||||
self.add_label(label)
|
||||
subbatch = list(islice(get_examples(), 10))
|
||||
doc_sample = [eg.reference for eg in subbatch]
|
||||
label_sample, _ = self._examples_to_truth(subbatch)
|
||||
self._require_labels()
|
||||
assert len(doc_sample) > 0, Errors.E923.format(name=self.name)
|
||||
assert len(label_sample) > 0, Errors.E923.format(name=self.name)
|
||||
self.model.initialize(X=doc_sample, Y=label_sample)
|
||||
|
||||
def score(self, examples: Iterable[Example], **kwargs) -> Dict[str, Any]:
|
||||
"""Score a batch of examples.
|
||||
|
||||
examples (Iterable[Example]): The examples to score.
|
||||
RETURNS (Dict[str, Any]): The scores, produced by Scorer.score_cats.
|
||||
|
||||
DOCS: https://nightly.spacy.io/api/multilabel_textcategorizer#score
|
||||
"""
|
||||
validate_examples(examples, "MultiLabel_TextCategorizer.score")
|
||||
return Scorer.score_cats(
|
||||
examples,
|
||||
"cats",
|
||||
labels=self.labels,
|
||||
multi_label=True,
|
||||
threshold=self.cfg["threshold"],
|
||||
**kwargs,
|
||||
)
|
||||
|
||||
def _validate_categories(self, examples: List[Example]):
|
||||
"""This component allows any type of single- or multi-label annotations.
|
||||
This method overwrites the more strict one from 'textcat'. """
|
||||
pass
|
|
@ -460,7 +460,7 @@ class Scorer:
|
|||
gold_label, gold_score = max(gold_cats, key=lambda it: it[1])
|
||||
if gold_score is not None and gold_score > 0:
|
||||
f_per_type[gold_label].fn += 1
|
||||
else:
|
||||
elif pred_cats:
|
||||
pred_label, pred_score = max(pred_cats, key=lambda it: it[1])
|
||||
if pred_score >= threshold:
|
||||
f_per_type[pred_label].fp += 1
|
||||
|
|
|
@ -15,15 +15,31 @@ from spacy.training import Example
|
|||
from ..util import make_tempdir
|
||||
|
||||
|
||||
TRAIN_DATA = [
|
||||
TRAIN_DATA_SINGLE_LABEL = [
|
||||
("I'm so happy.", {"cats": {"POSITIVE": 1.0, "NEGATIVE": 0.0}}),
|
||||
("I'm so angry", {"cats": {"POSITIVE": 0.0, "NEGATIVE": 1.0}}),
|
||||
]
|
||||
|
||||
TRAIN_DATA_MULTI_LABEL = [
|
||||
("I'm angry and confused", {"cats": {"ANGRY": 1.0, "CONFUSED": 1.0, "HAPPY": 0.0}}),
|
||||
("I'm confused but happy", {"cats": {"ANGRY": 0.0, "CONFUSED": 1.0, "HAPPY": 1.0}}),
|
||||
]
|
||||
|
||||
def make_get_examples(nlp):
|
||||
|
||||
def make_get_examples_single_label(nlp):
|
||||
train_examples = []
|
||||
for t in TRAIN_DATA:
|
||||
for t in TRAIN_DATA_SINGLE_LABEL:
|
||||
train_examples.append(Example.from_dict(nlp.make_doc(t[0]), t[1]))
|
||||
|
||||
def get_examples():
|
||||
return train_examples
|
||||
|
||||
return get_examples
|
||||
|
||||
|
||||
def make_get_examples_multi_label(nlp):
|
||||
train_examples = []
|
||||
for t in TRAIN_DATA_MULTI_LABEL:
|
||||
train_examples.append(Example.from_dict(nlp.make_doc(t[0]), t[1]))
|
||||
|
||||
def get_examples():
|
||||
|
@ -85,49 +101,75 @@ def test_textcat_learns_multilabel():
|
|||
assert score > 0.5
|
||||
|
||||
|
||||
def test_label_types():
|
||||
@pytest.mark.parametrize("name", ["textcat", "textcat_multilabel"])
|
||||
def test_label_types(name):
|
||||
nlp = Language()
|
||||
textcat = nlp.add_pipe("textcat")
|
||||
textcat = nlp.add_pipe(name)
|
||||
textcat.add_label("answer")
|
||||
with pytest.raises(ValueError):
|
||||
textcat.add_label(9)
|
||||
|
||||
|
||||
def test_no_label():
|
||||
@pytest.mark.parametrize("name", ["textcat", "textcat_multilabel"])
|
||||
def test_no_label(name):
|
||||
nlp = Language()
|
||||
nlp.add_pipe("textcat")
|
||||
nlp.add_pipe(name)
|
||||
with pytest.raises(ValueError):
|
||||
nlp.initialize()
|
||||
|
||||
|
||||
def test_implicit_label():
|
||||
@pytest.mark.parametrize(
|
||||
"name,get_examples",
|
||||
[
|
||||
("textcat", make_get_examples_single_label),
|
||||
("textcat_multilabel", make_get_examples_multi_label),
|
||||
],
|
||||
)
|
||||
def test_implicit_label(name, get_examples):
|
||||
nlp = Language()
|
||||
nlp.add_pipe("textcat")
|
||||
nlp.initialize(get_examples=make_get_examples(nlp))
|
||||
nlp.add_pipe(name)
|
||||
nlp.initialize(get_examples=get_examples(nlp))
|
||||
|
||||
|
||||
def test_no_resize():
|
||||
@pytest.mark.parametrize("name", ["textcat", "textcat_multilabel"])
|
||||
def test_no_resize(name):
|
||||
nlp = Language()
|
||||
textcat = nlp.add_pipe("textcat")
|
||||
textcat = nlp.add_pipe(name)
|
||||
textcat.add_label("POSITIVE")
|
||||
textcat.add_label("NEGATIVE")
|
||||
nlp.initialize()
|
||||
assert textcat.model.get_dim("nO") == 2
|
||||
assert textcat.model.get_dim("nO") >= 2
|
||||
# this throws an error because the textcat can't be resized after initialization
|
||||
with pytest.raises(ValueError):
|
||||
textcat.add_label("NEUTRAL")
|
||||
|
||||
|
||||
def test_initialize_examples():
|
||||
def test_error_with_multi_labels():
|
||||
nlp = Language()
|
||||
textcat = nlp.add_pipe("textcat")
|
||||
for text, annotations in TRAIN_DATA:
|
||||
train_examples = []
|
||||
for text, annotations in TRAIN_DATA_MULTI_LABEL:
|
||||
train_examples.append(Example.from_dict(nlp.make_doc(text), annotations))
|
||||
with pytest.raises(ValueError):
|
||||
optimizer = nlp.initialize(get_examples=lambda: train_examples)
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
|
||||
"name,get_examples, train_data",
|
||||
[
|
||||
("textcat", make_get_examples_single_label, TRAIN_DATA_SINGLE_LABEL),
|
||||
("textcat_multilabel", make_get_examples_multi_label, TRAIN_DATA_MULTI_LABEL),
|
||||
],
|
||||
)
|
||||
def test_initialize_examples(name, get_examples, train_data):
|
||||
nlp = Language()
|
||||
textcat = nlp.add_pipe(name)
|
||||
for text, annotations in train_data:
|
||||
for label, value in annotations.get("cats").items():
|
||||
textcat.add_label(label)
|
||||
# you shouldn't really call this more than once, but for testing it should be fine
|
||||
nlp.initialize()
|
||||
get_examples = make_get_examples(nlp)
|
||||
nlp.initialize(get_examples=get_examples)
|
||||
nlp.initialize(get_examples=get_examples(nlp))
|
||||
with pytest.raises(TypeError):
|
||||
nlp.initialize(get_examples=lambda: None)
|
||||
with pytest.raises(TypeError):
|
||||
|
@ -138,12 +180,10 @@ def test_overfitting_IO():
|
|||
# Simple test to try and quickly overfit the single-label textcat component - ensuring the ML models work correctly
|
||||
fix_random_seed(0)
|
||||
nlp = English()
|
||||
nlp.config["initialize"]["components"]["textcat"] = {"positive_label": "POSITIVE"}
|
||||
# Set exclusive labels
|
||||
config = {"model": {"linear_model": {"exclusive_classes": True}}}
|
||||
textcat = nlp.add_pipe("textcat", config=config)
|
||||
textcat = nlp.add_pipe("textcat")
|
||||
|
||||
train_examples = []
|
||||
for text, annotations in TRAIN_DATA:
|
||||
for text, annotations in TRAIN_DATA_SINGLE_LABEL:
|
||||
train_examples.append(Example.from_dict(nlp.make_doc(text), annotations))
|
||||
optimizer = nlp.initialize(get_examples=lambda: train_examples)
|
||||
assert textcat.model.get_dim("nO") == 2
|
||||
|
@ -172,6 +212,8 @@ def test_overfitting_IO():
|
|||
# Test scoring
|
||||
scores = nlp.evaluate(train_examples)
|
||||
assert scores["cats_micro_f"] == 1.0
|
||||
assert scores["cats_macro_f"] == 1.0
|
||||
assert scores["cats_macro_auc"] == 1.0
|
||||
assert scores["cats_score"] == 1.0
|
||||
assert "cats_score_desc" in scores
|
||||
|
||||
|
@ -192,7 +234,7 @@ def test_overfitting_IO_multi():
|
|||
config = {"model": {"linear_model": {"exclusive_classes": False}}}
|
||||
textcat = nlp.add_pipe("textcat", config=config)
|
||||
train_examples = []
|
||||
for text, annotations in TRAIN_DATA:
|
||||
for text, annotations in TRAIN_DATA_MULTI_LABEL:
|
||||
train_examples.append(Example.from_dict(nlp.make_doc(text), annotations))
|
||||
optimizer = nlp.initialize(get_examples=lambda: train_examples)
|
||||
assert textcat.model.get_dim("nO") == 2
|
||||
|
@ -231,27 +273,75 @@ def test_overfitting_IO_multi():
|
|||
assert_equal(batch_cats_1, no_batch_cats)
|
||||
|
||||
|
||||
def test_overfitting_IO_multi():
|
||||
# Simple test to try and quickly overfit the multi-label textcat component - ensuring the ML models work correctly
|
||||
fix_random_seed(0)
|
||||
nlp = English()
|
||||
textcat = nlp.add_pipe("textcat_multilabel")
|
||||
|
||||
train_examples = []
|
||||
for text, annotations in TRAIN_DATA_MULTI_LABEL:
|
||||
train_examples.append(Example.from_dict(nlp.make_doc(text), annotations))
|
||||
optimizer = nlp.initialize(get_examples=lambda: train_examples)
|
||||
assert textcat.model.get_dim("nO") == 3
|
||||
|
||||
for i in range(100):
|
||||
losses = {}
|
||||
nlp.update(train_examples, sgd=optimizer, losses=losses)
|
||||
assert losses["textcat_multilabel"] < 0.01
|
||||
|
||||
# test the trained model
|
||||
test_text = "I am confused but happy."
|
||||
doc = nlp(test_text)
|
||||
cats = doc.cats
|
||||
assert cats["HAPPY"] > 0.9
|
||||
assert cats["CONFUSED"] > 0.9
|
||||
|
||||
# Also test the results are still the same after IO
|
||||
with make_tempdir() as tmp_dir:
|
||||
nlp.to_disk(tmp_dir)
|
||||
nlp2 = util.load_model_from_path(tmp_dir)
|
||||
doc2 = nlp2(test_text)
|
||||
cats2 = doc2.cats
|
||||
assert cats2["HAPPY"] > 0.9
|
||||
assert cats2["CONFUSED"] > 0.9
|
||||
|
||||
# Test scoring
|
||||
scores = nlp.evaluate(train_examples)
|
||||
assert scores["cats_micro_f"] == 1.0
|
||||
assert scores["cats_macro_f"] == 1.0
|
||||
assert "cats_score_desc" in scores
|
||||
|
||||
# Make sure that running pipe twice, or comparing to call, always amounts to the same predictions
|
||||
texts = ["Just a sentence.", "I like green eggs.", "I am happy.", "I eat ham."]
|
||||
batch_deps_1 = [doc.cats for doc in nlp.pipe(texts)]
|
||||
batch_deps_2 = [doc.cats for doc in nlp.pipe(texts)]
|
||||
no_batch_deps = [doc.cats for doc in [nlp(text) for text in texts]]
|
||||
assert_equal(batch_deps_1, batch_deps_2)
|
||||
assert_equal(batch_deps_1, no_batch_deps)
|
||||
|
||||
|
||||
# fmt: off
|
||||
@pytest.mark.parametrize(
|
||||
"textcat_config",
|
||||
"name,train_data,textcat_config",
|
||||
[
|
||||
{"@architectures": "spacy.TextCatBOW.v1", "exclusive_classes": False, "ngram_size": 1, "no_output_layer": False},
|
||||
{"@architectures": "spacy.TextCatBOW.v1", "exclusive_classes": True, "ngram_size": 4, "no_output_layer": False},
|
||||
{"@architectures": "spacy.TextCatBOW.v1", "exclusive_classes": False, "ngram_size": 3, "no_output_layer": True},
|
||||
{"@architectures": "spacy.TextCatBOW.v1", "exclusive_classes": True, "ngram_size": 2, "no_output_layer": True},
|
||||
{"@architectures": "spacy.TextCatEnsemble.v2", "tok2vec": DEFAULT_TOK2VEC_MODEL, "linear_model": {"@architectures": "spacy.TextCatBOW.v1", "exclusive_classes": False, "ngram_size": 1, "no_output_layer": False}},
|
||||
{"@architectures": "spacy.TextCatEnsemble.v2", "tok2vec": DEFAULT_TOK2VEC_MODEL, "linear_model": {"@architectures": "spacy.TextCatBOW.v1", "exclusive_classes": True, "ngram_size": 5, "no_output_layer": False}},
|
||||
{"@architectures": "spacy.TextCatCNN.v1", "tok2vec": DEFAULT_TOK2VEC_MODEL, "exclusive_classes": True},
|
||||
{"@architectures": "spacy.TextCatCNN.v1", "tok2vec": DEFAULT_TOK2VEC_MODEL, "exclusive_classes": False},
|
||||
("textcat_multilabel", TRAIN_DATA_MULTI_LABEL, {"@architectures": "spacy.TextCatBOW.v1", "exclusive_classes": False, "ngram_size": 1, "no_output_layer": False}),
|
||||
("textcat", TRAIN_DATA_SINGLE_LABEL, {"@architectures": "spacy.TextCatBOW.v1", "exclusive_classes": True, "ngram_size": 4, "no_output_layer": False}),
|
||||
("textcat_multilabel", TRAIN_DATA_MULTI_LABEL, {"@architectures": "spacy.TextCatBOW.v1", "exclusive_classes": False, "ngram_size": 3, "no_output_layer": True}),
|
||||
("textcat", TRAIN_DATA_SINGLE_LABEL, {"@architectures": "spacy.TextCatBOW.v1", "exclusive_classes": True, "ngram_size": 2, "no_output_layer": True}),
|
||||
("textcat_multilabel", TRAIN_DATA_MULTI_LABEL, {"@architectures": "spacy.TextCatEnsemble.v2", "tok2vec": DEFAULT_TOK2VEC_MODEL, "linear_model": {"@architectures": "spacy.TextCatBOW.v1", "exclusive_classes": False, "ngram_size": 1, "no_output_layer": False}}),
|
||||
("textcat", TRAIN_DATA_SINGLE_LABEL, {"@architectures": "spacy.TextCatEnsemble.v2", "tok2vec": DEFAULT_TOK2VEC_MODEL, "linear_model": {"@architectures": "spacy.TextCatBOW.v1", "exclusive_classes": True, "ngram_size": 5, "no_output_layer": False}}),
|
||||
("textcat", TRAIN_DATA_SINGLE_LABEL, {"@architectures": "spacy.TextCatCNN.v1", "tok2vec": DEFAULT_TOK2VEC_MODEL, "exclusive_classes": True}),
|
||||
("textcat_multilabel", TRAIN_DATA_MULTI_LABEL, {"@architectures": "spacy.TextCatCNN.v1", "tok2vec": DEFAULT_TOK2VEC_MODEL, "exclusive_classes": False}),
|
||||
],
|
||||
)
|
||||
# fmt: on
|
||||
def test_textcat_configs(textcat_config):
|
||||
def test_textcat_configs(name, train_data, textcat_config):
|
||||
pipe_config = {"model": textcat_config}
|
||||
nlp = English()
|
||||
textcat = nlp.add_pipe("textcat", config=pipe_config)
|
||||
textcat = nlp.add_pipe(name, config=pipe_config)
|
||||
train_examples = []
|
||||
for text, annotations in TRAIN_DATA:
|
||||
for text, annotations in train_data:
|
||||
train_examples.append(Example.from_dict(nlp.make_doc(text), annotations))
|
||||
for label, value in annotations.get("cats").items():
|
||||
textcat.add_label(label)
|
||||
|
@ -264,15 +354,24 @@ def test_textcat_configs(textcat_config):
|
|||
def test_positive_class():
|
||||
nlp = English()
|
||||
textcat = nlp.add_pipe("textcat")
|
||||
get_examples = make_get_examples(nlp)
|
||||
get_examples = make_get_examples_single_label(nlp)
|
||||
textcat.initialize(get_examples, labels=["POS", "NEG"], positive_label="POS")
|
||||
assert textcat.labels == ("POS", "NEG")
|
||||
assert textcat.cfg["positive_label"] == "POS"
|
||||
|
||||
textcat_multilabel = nlp.add_pipe("textcat_multilabel")
|
||||
get_examples = make_get_examples_multi_label(nlp)
|
||||
with pytest.raises(TypeError):
|
||||
textcat_multilabel.initialize(get_examples, labels=["POS", "NEG"], positive_label="POS")
|
||||
textcat_multilabel.initialize(get_examples, labels=["FICTION", "DRAMA"])
|
||||
assert textcat_multilabel.labels == ("FICTION", "DRAMA")
|
||||
assert "positive_label" not in textcat_multilabel.cfg
|
||||
|
||||
|
||||
def test_positive_class_not_present():
|
||||
nlp = English()
|
||||
textcat = nlp.add_pipe("textcat")
|
||||
get_examples = make_get_examples(nlp)
|
||||
get_examples = make_get_examples_single_label(nlp)
|
||||
with pytest.raises(ValueError):
|
||||
textcat.initialize(get_examples, labels=["SOME", "THING"], positive_label="POS")
|
||||
|
||||
|
@ -280,11 +379,9 @@ def test_positive_class_not_present():
|
|||
def test_positive_class_not_binary():
|
||||
nlp = English()
|
||||
textcat = nlp.add_pipe("textcat")
|
||||
get_examples = make_get_examples(nlp)
|
||||
get_examples = make_get_examples_multi_label(nlp)
|
||||
with pytest.raises(ValueError):
|
||||
textcat.initialize(
|
||||
get_examples, labels=["SOME", "THING", "POS"], positive_label="POS"
|
||||
)
|
||||
textcat.initialize(get_examples, labels=["SOME", "THING", "POS"], positive_label="POS")
|
||||
|
||||
|
||||
def test_textcat_evaluation():
|
||||
|
|
|
@ -2,8 +2,11 @@ import pytest
|
|||
from thinc.api import Config, fix_random_seed
|
||||
|
||||
from spacy.lang.en import English
|
||||
from spacy.pipeline.textcat import default_model_config, bow_model_config
|
||||
from spacy.pipeline.textcat import cnn_model_config
|
||||
from spacy.pipeline.textcat import single_label_default_config, single_label_bow_config
|
||||
from spacy.pipeline.textcat import single_label_cnn_config
|
||||
from spacy.pipeline.textcat_multilabel import multi_label_default_config
|
||||
from spacy.pipeline.textcat_multilabel import multi_label_bow_config
|
||||
from spacy.pipeline.textcat_multilabel import multi_label_cnn_config
|
||||
from spacy.tokens import Span
|
||||
from spacy import displacy
|
||||
from spacy.pipeline import merge_entities
|
||||
|
@ -11,7 +14,15 @@ from spacy.training import Example
|
|||
|
||||
|
||||
@pytest.mark.parametrize(
|
||||
"textcat_config", [default_model_config, bow_model_config, cnn_model_config]
|
||||
"textcat_config",
|
||||
[
|
||||
single_label_default_config,
|
||||
single_label_bow_config,
|
||||
single_label_cnn_config,
|
||||
multi_label_default_config,
|
||||
multi_label_bow_config,
|
||||
multi_label_cnn_config,
|
||||
],
|
||||
)
|
||||
def test_issue5551(textcat_config):
|
||||
"""Test that after fixing the random seed, the results of the pipeline are truly identical"""
|
||||
|
|
|
@ -4,7 +4,7 @@ from spacy.pipeline import Tagger, DependencyParser, EntityRecognizer
|
|||
from spacy.pipeline import TextCategorizer, SentenceRecognizer, TrainablePipe
|
||||
from spacy.pipeline.dep_parser import DEFAULT_PARSER_MODEL
|
||||
from spacy.pipeline.tagger import DEFAULT_TAGGER_MODEL
|
||||
from spacy.pipeline.textcat import DEFAULT_TEXTCAT_MODEL
|
||||
from spacy.pipeline.textcat import DEFAULT_SINGLE_TEXTCAT_MODEL
|
||||
from spacy.pipeline.senter import DEFAULT_SENTER_MODEL
|
||||
from spacy.lang.en import English
|
||||
from thinc.api import Linear
|
||||
|
@ -188,7 +188,7 @@ def test_serialize_tagger_strings(en_vocab, de_vocab, taggers):
|
|||
|
||||
def test_serialize_textcat_empty(en_vocab):
|
||||
# See issue #1105
|
||||
cfg = {"model": DEFAULT_TEXTCAT_MODEL}
|
||||
cfg = {"model": DEFAULT_SINGLE_TEXTCAT_MODEL}
|
||||
model = registry.resolve(cfg, validate=True)["model"]
|
||||
textcat = TextCategorizer(en_vocab, model, threshold=0.5)
|
||||
textcat.to_bytes(exclude=["vocab"])
|
||||
|
|
|
@ -51,7 +51,7 @@ def test_readers():
|
|||
for example in train_corpus(nlp):
|
||||
nlp.update([example], sgd=optimizer)
|
||||
scores = nlp.evaluate(list(dev_corpus(nlp)))
|
||||
assert scores["cats_score"] == 0.0
|
||||
assert scores["cats_macro_auc"] == 0.0
|
||||
# ensure the pipeline runs
|
||||
doc = nlp("Quick test")
|
||||
assert doc.cats
|
||||
|
|
|
@ -94,7 +94,7 @@ Defines the `nlp` object, its tokenizer and
|
|||
>
|
||||
> [components.textcat.model]
|
||||
> @architectures = "spacy.TextCatBOW.v1"
|
||||
> exclusive_classes = false
|
||||
> exclusive_classes = true
|
||||
> ngram_size = 1
|
||||
> no_output_layer = false
|
||||
> ```
|
||||
|
|
454
website/docs/api/multilabel_textcategorizer.md
Normal file
454
website/docs/api/multilabel_textcategorizer.md
Normal file
|
@ -0,0 +1,454 @@
|
|||
---
|
||||
title: Multi-label TextCategorizer
|
||||
tag: class
|
||||
source: spacy/pipeline/textcat_multilabel.py
|
||||
new: 3
|
||||
teaser: 'Pipeline component for multi-label text classification'
|
||||
api_base_class: /api/pipe
|
||||
api_string_name: textcat_multilabel
|
||||
api_trainable: true
|
||||
---
|
||||
|
||||
The text categorizer predicts **categories over a whole document**. It
|
||||
learns non-mutually exclusive labels, which means that zero or more labels
|
||||
may be true per document.
|
||||
|
||||
## Config and implementation {#config}
|
||||
|
||||
The default config is defined by the pipeline component factory and describes
|
||||
how the component should be configured. You can override its settings via the
|
||||
`config` argument on [`nlp.add_pipe`](/api/language#add_pipe) or in your
|
||||
[`config.cfg` for training](/usage/training#config). See the
|
||||
[model architectures](/api/architectures) documentation for details on the
|
||||
architectures and their arguments and hyperparameters.
|
||||
|
||||
> #### Example
|
||||
>
|
||||
> ```python
|
||||
> from spacy.pipeline.textcat_multilabel import DEFAULT_MULTI_TEXTCAT_MODEL
|
||||
> config = {
|
||||
> "threshold": 0.5,
|
||||
> "model": DEFAULT_MULTI_TEXTCAT_MODEL,
|
||||
> }
|
||||
> nlp.add_pipe("textcat_multilabel", config=config)
|
||||
> ```
|
||||
|
||||
| Setting | Description |
|
||||
| ----------- | ---------------------------------------------------------------------------------------------------------------------------------------------------------------- |
|
||||
| `threshold` | Cutoff to consider a prediction "positive", relevant when printing accuracy results. ~~float~~ |
|
||||
| `model` | A model instance that predicts scores for each category. Defaults to [TextCatEnsemble](/api/architectures#TextCatEnsemble). ~~Model[List[Doc], List[Floats2d]]~~ |
|
||||
|
||||
```python
|
||||
%%GITHUB_SPACY/spacy/pipeline/textcat_multilabel.py
|
||||
```
|
||||
|
||||
## MultiLabel_TextCategorizer.\_\_init\_\_ {#init tag="method"}
|
||||
|
||||
> #### Example
|
||||
>
|
||||
> ```python
|
||||
> # Construction via add_pipe with default model
|
||||
> textcat = nlp.add_pipe("textcat_multilabel")
|
||||
>
|
||||
> # Construction via add_pipe with custom model
|
||||
> config = {"model": {"@architectures": "my_textcat"}}
|
||||
> parser = nlp.add_pipe("textcat_multilabel", config=config)
|
||||
>
|
||||
> # Construction from class
|
||||
> from spacy.pipeline import MultiLabel_TextCategorizer
|
||||
> textcat = MultiLabel_TextCategorizer(nlp.vocab, model, threshold=0.5)
|
||||
> ```
|
||||
|
||||
Create a new pipeline instance. In your application, you would normally use a
|
||||
shortcut for this and instantiate the component using its string name and
|
||||
[`nlp.add_pipe`](/api/language#create_pipe).
|
||||
|
||||
| Name | Description |
|
||||
| -------------- | -------------------------------------------------------------------------------------------------------------------------- |
|
||||
| `vocab` | The shared vocabulary. ~~Vocab~~ |
|
||||
| `model` | The Thinc [`Model`](https://thinc.ai/docs/api-model) powering the pipeline component. ~~Model[List[Doc], List[Floats2d]]~~ |
|
||||
| `name` | String name of the component instance. Used to add entries to the `losses` during training. ~~str~~ |
|
||||
| _keyword-only_ | |
|
||||
| `threshold` | Cutoff to consider a prediction "positive", relevant when printing accuracy results. ~~float~~ |
|
||||
|
||||
## MultiLabel_TextCategorizer.\_\_call\_\_ {#call tag="method"}
|
||||
|
||||
Apply the pipe to one document. The document is modified in place, and returned.
|
||||
This usually happens under the hood when the `nlp` object is called on a text
|
||||
and all pipeline components are applied to the `Doc` in order. Both
|
||||
[`__call__`](/api/multilabel_textcategorizer#call) and [`pipe`](/api/multilabel_textcategorizer#pipe)
|
||||
delegate to the [`predict`](/api/multilabel_textcategorizer#predict) and
|
||||
[`set_annotations`](/api/multilabel_textcategorizer#set_annotations) methods.
|
||||
|
||||
> #### Example
|
||||
>
|
||||
> ```python
|
||||
> doc = nlp("This is a sentence.")
|
||||
> textcat = nlp.add_pipe("textcat_multilabel")
|
||||
> # This usually happens under the hood
|
||||
> processed = textcat(doc)
|
||||
> ```
|
||||
|
||||
| Name | Description |
|
||||
| ----------- | -------------------------------- |
|
||||
| `doc` | The document to process. ~~Doc~~ |
|
||||
| **RETURNS** | The processed document. ~~Doc~~ |
|
||||
|
||||
## MultiLabel_TextCategorizer.pipe {#pipe tag="method"}
|
||||
|
||||
Apply the pipe to a stream of documents. This usually happens under the hood
|
||||
when the `nlp` object is called on a text and all pipeline components are
|
||||
applied to the `Doc` in order. Both [`__call__`](/api/multilabel_textcategorizer#call) and
|
||||
[`pipe`](/api/multilabel_textcategorizer#pipe) delegate to the
|
||||
[`predict`](/api/multilabel_textcategorizer#predict) and
|
||||
[`set_annotations`](/api/multilabel_textcategorizer#set_annotations) methods.
|
||||
|
||||
> #### Example
|
||||
>
|
||||
> ```python
|
||||
> textcat = nlp.add_pipe("textcat_multilabel")
|
||||
> for doc in textcat.pipe(docs, batch_size=50):
|
||||
> pass
|
||||
> ```
|
||||
|
||||
| Name | Description |
|
||||
| -------------- | ------------------------------------------------------------- |
|
||||
| `stream` | A stream of documents. ~~Iterable[Doc]~~ |
|
||||
| _keyword-only_ | |
|
||||
| `batch_size` | The number of documents to buffer. Defaults to `128`. ~~int~~ |
|
||||
| **YIELDS** | The processed documents in order. ~~Doc~~ |
|
||||
|
||||
## MultiLabel_TextCategorizer.initialize {#initialize tag="method" new="3"}
|
||||
|
||||
Initialize the component for training. `get_examples` should be a function that
|
||||
returns an iterable of [`Example`](/api/example) objects. The data examples are
|
||||
used to **initialize the model** of the component and can either be the full
|
||||
training data or a representative sample. Initialization includes validating the
|
||||
network,
|
||||
[inferring missing shapes](https://thinc.ai/docs/usage-models#validation) and
|
||||
setting up the label scheme based on the data. This method is typically called
|
||||
by [`Language.initialize`](/api/language#initialize) and lets you customize
|
||||
arguments it receives via the
|
||||
[`[initialize.components]`](/api/data-formats#config-initialize) block in the
|
||||
config.
|
||||
|
||||
<Infobox variant="warning" title="Changed in v3.0" id="begin_training">
|
||||
|
||||
This method was previously called `begin_training`.
|
||||
|
||||
</Infobox>
|
||||
|
||||
> #### Example
|
||||
>
|
||||
> ```python
|
||||
> textcat = nlp.add_pipe("textcat_multilabel")
|
||||
> textcat.initialize(lambda: [], nlp=nlp)
|
||||
> ```
|
||||
>
|
||||
> ```ini
|
||||
> ### config.cfg
|
||||
> [initialize.components.textcat_multilabel]
|
||||
>
|
||||
> [initialize.components.textcat_multilabel.labels]
|
||||
> @readers = "spacy.read_labels.v1"
|
||||
> path = "corpus/labels/textcat.json
|
||||
> ```
|
||||
|
||||
| Name | Description |
|
||||
| ---------------- | ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
|
||||
| `get_examples` | Function that returns gold-standard annotations in the form of [`Example`](/api/example) objects. ~~Callable[[], Iterable[Example]]~~ |
|
||||
| _keyword-only_ | |
|
||||
| `nlp` | The current `nlp` object. Defaults to `None`. ~~Optional[Language]~~ |
|
||||
| `labels` | The label information to add to the component, as provided by the [`label_data`](#label_data) property after initialization. To generate a reusable JSON file from your data, you should run the [`init labels`](/api/cli#init-labels) command. If no labels are provided, the `get_examples` callback is used to extract the labels from the data, which may be a lot slower. ~~Optional[Iterable[str]]~~ |
|
||||
|
||||
## MultiLabel_TextCategorizer.predict {#predict tag="method"}
|
||||
|
||||
Apply the component's model to a batch of [`Doc`](/api/doc) objects without
|
||||
modifying them.
|
||||
|
||||
> #### Example
|
||||
>
|
||||
> ```python
|
||||
> textcat = nlp.add_pipe("textcat_multilabel")
|
||||
> scores = textcat.predict([doc1, doc2])
|
||||
> ```
|
||||
|
||||
| Name | Description |
|
||||
| ----------- | ------------------------------------------- |
|
||||
| `docs` | The documents to predict. ~~Iterable[Doc]~~ |
|
||||
| **RETURNS** | The model's prediction for each document. |
|
||||
|
||||
## MultiLabel_TextCategorizer.set_annotations {#set_annotations tag="method"}
|
||||
|
||||
Modify a batch of [`Doc`](/api/doc) objects using pre-computed scores.
|
||||
|
||||
> #### Example
|
||||
>
|
||||
> ```python
|
||||
> textcat = nlp.add_pipe("textcat_multilabel")
|
||||
> scores = textcat.predict(docs)
|
||||
> textcat.set_annotations(docs, scores)
|
||||
> ```
|
||||
|
||||
| Name | Description |
|
||||
| -------- | --------------------------------------------------------- |
|
||||
| `docs` | The documents to modify. ~~Iterable[Doc]~~ |
|
||||
| `scores` | The scores to set, produced by `MultiLabel_TextCategorizer.predict`. |
|
||||
|
||||
## MultiLabel_TextCategorizer.update {#update tag="method"}
|
||||
|
||||
Learn from a batch of [`Example`](/api/example) objects containing the
|
||||
predictions and gold-standard annotations, and update the component's model.
|
||||
Delegates to [`predict`](/api/multilabel_textcategorizer#predict) and
|
||||
[`get_loss`](/api/multilabel_textcategorizer#get_loss).
|
||||
|
||||
> #### Example
|
||||
>
|
||||
> ```python
|
||||
> textcat = nlp.add_pipe("textcat_multilabel")
|
||||
> optimizer = nlp.initialize()
|
||||
> losses = textcat.update(examples, sgd=optimizer)
|
||||
> ```
|
||||
|
||||
| Name | Description |
|
||||
| ----------------- | ---------------------------------------------------------------------------------------------------------------------------------- |
|
||||
| `examples` | A batch of [`Example`](/api/example) objects to learn from. ~~Iterable[Example]~~ |
|
||||
| _keyword-only_ | |
|
||||
| `drop` | The dropout rate. ~~float~~ |
|
||||
| `set_annotations` | Whether or not to update the `Example` objects with the predictions, delegating to [`set_annotations`](#set_annotations). ~~bool~~ |
|
||||
| `sgd` | An optimizer. Will be created via [`create_optimizer`](#create_optimizer) if not set. ~~Optional[Optimizer]~~ |
|
||||
| `losses` | Optional record of the loss during training. Updated using the component name as the key. ~~Optional[Dict[str, float]]~~ |
|
||||
| **RETURNS** | The updated `losses` dictionary. ~~Dict[str, float]~~ |
|
||||
|
||||
## MultiLabel_TextCategorizer.rehearse {#rehearse tag="method,experimental" new="3"}
|
||||
|
||||
Perform a "rehearsal" update from a batch of data. Rehearsal updates teach the
|
||||
current model to make predictions similar to an initial model to try to address
|
||||
the "catastrophic forgetting" problem. This feature is experimental.
|
||||
|
||||
> #### Example
|
||||
>
|
||||
> ```python
|
||||
> textcat = nlp.add_pipe("textcat_multilabel")
|
||||
> optimizer = nlp.resume_training()
|
||||
> losses = textcat.rehearse(examples, sgd=optimizer)
|
||||
> ```
|
||||
|
||||
| Name | Description |
|
||||
| -------------- | ------------------------------------------------------------------------------------------------------------------------ |
|
||||
| `examples` | A batch of [`Example`](/api/example) objects to learn from. ~~Iterable[Example]~~ |
|
||||
| _keyword-only_ | |
|
||||
| `drop` | The dropout rate. ~~float~~ |
|
||||
| `sgd` | An optimizer. Will be created via [`create_optimizer`](#create_optimizer) if not set. ~~Optional[Optimizer]~~ |
|
||||
| `losses` | Optional record of the loss during training. Updated using the component name as the key. ~~Optional[Dict[str, float]]~~ |
|
||||
| **RETURNS** | The updated `losses` dictionary. ~~Dict[str, float]~~ |
|
||||
|
||||
## MultiLabel_TextCategorizer.get_loss {#get_loss tag="method"}
|
||||
|
||||
Find the loss and gradient of loss for the batch of documents and their
|
||||
predicted scores.
|
||||
|
||||
> #### Example
|
||||
>
|
||||
> ```python
|
||||
> textcat = nlp.add_pipe("textcat_multilabel")
|
||||
> scores = textcat.predict([eg.predicted for eg in examples])
|
||||
> loss, d_loss = textcat.get_loss(examples, scores)
|
||||
> ```
|
||||
|
||||
| Name | Description |
|
||||
| ----------- | --------------------------------------------------------------------------- |
|
||||
| `examples` | The batch of examples. ~~Iterable[Example]~~ |
|
||||
| `scores` | Scores representing the model's predictions. |
|
||||
| **RETURNS** | The loss and the gradient, i.e. `(loss, gradient)`. ~~Tuple[float, float]~~ |
|
||||
|
||||
## MultiLabel_TextCategorizer.score {#score tag="method" new="3"}
|
||||
|
||||
Score a batch of examples.
|
||||
|
||||
> #### Example
|
||||
>
|
||||
> ```python
|
||||
> scores = textcat.score(examples)
|
||||
> ```
|
||||
|
||||
| Name | Description |
|
||||
| ---------------- | -------------------------------------------------------------------------------------------------------------------- |
|
||||
| `examples` | The examples to score. ~~Iterable[Example]~~ |
|
||||
| _keyword-only_ | |
|
||||
| **RETURNS** | The scores, produced by [`Scorer.score_cats`](/api/scorer#score_cats). ~~Dict[str, Union[float, Dict[str, float]]]~~ |
|
||||
|
||||
## MultiLabel_TextCategorizer.create_optimizer {#create_optimizer tag="method"}
|
||||
|
||||
Create an optimizer for the pipeline component.
|
||||
|
||||
> #### Example
|
||||
>
|
||||
> ```python
|
||||
> textcat = nlp.add_pipe("textcat")
|
||||
> optimizer = textcat.create_optimizer()
|
||||
> ```
|
||||
|
||||
| Name | Description |
|
||||
| ----------- | ---------------------------- |
|
||||
| **RETURNS** | The optimizer. ~~Optimizer~~ |
|
||||
|
||||
## MultiLabel_TextCategorizer.use_params {#use_params tag="method, contextmanager"}
|
||||
|
||||
Modify the pipe's model to use the given parameter values.
|
||||
|
||||
> #### Example
|
||||
>
|
||||
> ```python
|
||||
> textcat = nlp.add_pipe("textcat")
|
||||
> with textcat.use_params(optimizer.averages):
|
||||
> textcat.to_disk("/best_model")
|
||||
> ```
|
||||
|
||||
| Name | Description |
|
||||
| -------- | -------------------------------------------------- |
|
||||
| `params` | The parameter values to use in the model. ~~dict~~ |
|
||||
|
||||
## MultiLabel_TextCategorizer.add_label {#add_label tag="method"}
|
||||
|
||||
Add a new label to the pipe. Raises an error if the output dimension is already
|
||||
set, or if the model has already been fully [initialized](#initialize). Note
|
||||
that you don't have to call this method if you provide a **representative data
|
||||
sample** to the [`initialize`](#initialize) method. In this case, all labels
|
||||
found in the sample will be automatically added to the model, and the output
|
||||
dimension will be [inferred](/usage/layers-architectures#thinc-shape-inference)
|
||||
automatically.
|
||||
|
||||
> #### Example
|
||||
>
|
||||
> ```python
|
||||
> textcat = nlp.add_pipe("textcat")
|
||||
> textcat.add_label("MY_LABEL")
|
||||
> ```
|
||||
|
||||
| Name | Description |
|
||||
| ----------- | ----------------------------------------------------------- |
|
||||
| `label` | The label to add. ~~str~~ |
|
||||
| **RETURNS** | `0` if the label is already present, otherwise `1`. ~~int~~ |
|
||||
|
||||
## MultiLabel_TextCategorizer.to_disk {#to_disk tag="method"}
|
||||
|
||||
Serialize the pipe to disk.
|
||||
|
||||
> #### Example
|
||||
>
|
||||
> ```python
|
||||
> textcat = nlp.add_pipe("textcat")
|
||||
> textcat.to_disk("/path/to/textcat")
|
||||
> ```
|
||||
|
||||
| Name | Description |
|
||||
| -------------- | ------------------------------------------------------------------------------------------------------------------------------------------ |
|
||||
| `path` | A path to a directory, which will be created if it doesn't exist. Paths may be either strings or `Path`-like objects. ~~Union[str, Path]~~ |
|
||||
| _keyword-only_ | |
|
||||
| `exclude` | String names of [serialization fields](#serialization-fields) to exclude. ~~Iterable[str]~~ |
|
||||
|
||||
## MultiLabel_TextCategorizer.from_disk {#from_disk tag="method"}
|
||||
|
||||
Load the pipe from disk. Modifies the object in place and returns it.
|
||||
|
||||
> #### Example
|
||||
>
|
||||
> ```python
|
||||
> textcat = nlp.add_pipe("textcat")
|
||||
> textcat.from_disk("/path/to/textcat")
|
||||
> ```
|
||||
|
||||
| Name | Description |
|
||||
| -------------- | ----------------------------------------------------------------------------------------------- |
|
||||
| `path` | A path to a directory. Paths may be either strings or `Path`-like objects. ~~Union[str, Path]~~ |
|
||||
| _keyword-only_ | |
|
||||
| `exclude` | String names of [serialization fields](#serialization-fields) to exclude. ~~Iterable[str]~~ |
|
||||
| **RETURNS** | The modified `MultiLabel_TextCategorizer` object. ~~MultiLabel_TextCategorizer~~ |
|
||||
|
||||
## MultiLabel_TextCategorizer.to_bytes {#to_bytes tag="method"}
|
||||
|
||||
> #### Example
|
||||
>
|
||||
> ```python
|
||||
> textcat = nlp.add_pipe("textcat")
|
||||
> textcat_bytes = textcat.to_bytes()
|
||||
> ```
|
||||
|
||||
Serialize the pipe to a bytestring.
|
||||
|
||||
| Name | Description |
|
||||
| -------------- | ------------------------------------------------------------------------------------------- |
|
||||
| _keyword-only_ | |
|
||||
| `exclude` | String names of [serialization fields](#serialization-fields) to exclude. ~~Iterable[str]~~ |
|
||||
| **RETURNS** | The serialized form of the `MultiLabel_TextCategorizer` object. ~~bytes~~ |
|
||||
|
||||
## MultiLabel_TextCategorizer.from_bytes {#from_bytes tag="method"}
|
||||
|
||||
Load the pipe from a bytestring. Modifies the object in place and returns it.
|
||||
|
||||
> #### Example
|
||||
>
|
||||
> ```python
|
||||
> textcat_bytes = textcat.to_bytes()
|
||||
> textcat = nlp.add_pipe("textcat")
|
||||
> textcat.from_bytes(textcat_bytes)
|
||||
> ```
|
||||
|
||||
| Name | Description |
|
||||
| -------------- | ------------------------------------------------------------------------------------------- |
|
||||
| `bytes_data` | The data to load from. ~~bytes~~ |
|
||||
| _keyword-only_ | |
|
||||
| `exclude` | String names of [serialization fields](#serialization-fields) to exclude. ~~Iterable[str]~~ |
|
||||
| **RETURNS** | The `MultiLabel_TextCategorizer` object. ~~MultiLabel_TextCategorizer~~ |
|
||||
|
||||
## MultiLabel_TextCategorizer.labels {#labels tag="property"}
|
||||
|
||||
The labels currently added to the component.
|
||||
|
||||
> #### Example
|
||||
>
|
||||
> ```python
|
||||
> textcat.add_label("MY_LABEL")
|
||||
> assert "MY_LABEL" in textcat.labels
|
||||
> ```
|
||||
|
||||
| Name | Description |
|
||||
| ----------- | ------------------------------------------------------ |
|
||||
| **RETURNS** | The labels added to the component. ~~Tuple[str, ...]~~ |
|
||||
|
||||
## MultiLabel_TextCategorizer.label_data {#label_data tag="property" new="3"}
|
||||
|
||||
The labels currently added to the component and their internal meta information.
|
||||
This is the data generated by [`init labels`](/api/cli#init-labels) and used by
|
||||
[`MultiLabel_TextCategorizer.initialize`](/api/multilabel_textcategorizer#initialize) to initialize
|
||||
the model with a pre-defined label set.
|
||||
|
||||
> #### Example
|
||||
>
|
||||
> ```python
|
||||
> labels = textcat.label_data
|
||||
> textcat.initialize(lambda: [], nlp=nlp, labels=labels)
|
||||
> ```
|
||||
|
||||
| Name | Description |
|
||||
| ----------- | ---------------------------------------------------------- |
|
||||
| **RETURNS** | The label data added to the component. ~~Tuple[str, ...]~~ |
|
||||
|
||||
## Serialization fields {#serialization-fields}
|
||||
|
||||
During serialization, spaCy will export several data fields used to restore
|
||||
different aspects of the object. If needed, you can exclude them from
|
||||
serialization by passing in the string names via the `exclude` argument.
|
||||
|
||||
> #### Example
|
||||
>
|
||||
> ```python
|
||||
> data = textcat.to_disk("/path", exclude=["vocab"])
|
||||
> ```
|
||||
|
||||
| Name | Description |
|
||||
| ------- | -------------------------------------------------------------- |
|
||||
| `vocab` | The shared [`Vocab`](/api/vocab). |
|
||||
| `cfg` | The config file. You usually don't want to exclude this. |
|
||||
| `model` | The binary model data. You usually don't want to exclude this. |
|
|
@ -3,17 +3,15 @@ title: TextCategorizer
|
|||
tag: class
|
||||
source: spacy/pipeline/textcat.py
|
||||
new: 2
|
||||
teaser: 'Pipeline component for text classification'
|
||||
teaser: 'Pipeline component for single-label text classification'
|
||||
api_base_class: /api/pipe
|
||||
api_string_name: textcat
|
||||
api_trainable: true
|
||||
---
|
||||
|
||||
The text categorizer predicts **categories over a whole document**. It can learn
|
||||
one or more labels, and the labels can be mutually exclusive (i.e. one true
|
||||
label per document) or non-mutually exclusive (i.e. zero or more labels may be
|
||||
true per document). The multi-label setting is controlled by the model instance
|
||||
that's provided.
|
||||
one or more labels, and the labels are mutually exclusive - there is exactly one
|
||||
true label per document.
|
||||
|
||||
## Config and implementation {#config}
|
||||
|
||||
|
@ -27,10 +25,10 @@ architectures and their arguments and hyperparameters.
|
|||
> #### Example
|
||||
>
|
||||
> ```python
|
||||
> from spacy.pipeline.textcat import DEFAULT_TEXTCAT_MODEL
|
||||
> from spacy.pipeline.textcat import DEFAULT_SINGLE_TEXTCAT_MODEL
|
||||
> config = {
|
||||
> "threshold": 0.5,
|
||||
> "model": DEFAULT_TEXTCAT_MODEL,
|
||||
> "model": DEFAULT_SINGLE_TEXTCAT_MODEL,
|
||||
> }
|
||||
> nlp.add_pipe("textcat", config=config)
|
||||
> ```
|
||||
|
@ -280,7 +278,6 @@ Score a batch of examples.
|
|||
| ---------------- | -------------------------------------------------------------------------------------------------------------------- |
|
||||
| `examples` | The examples to score. ~~Iterable[Example]~~ |
|
||||
| _keyword-only_ | |
|
||||
| `positive_label` | Optional positive label. ~~Optional[str]~~ |
|
||||
| **RETURNS** | The scores, produced by [`Scorer.score_cats`](/api/scorer#score_cats). ~~Dict[str, Union[float, Dict[str, float]]]~~ |
|
||||
|
||||
## TextCategorizer.create_optimizer {#create_optimizer tag="method"}
|
||||
|
|
|
@ -152,7 +152,7 @@ depth = 2
|
|||
|
||||
[components.textcat.model.linear_model]
|
||||
@architectures = "spacy.TextCatBOW.v1"
|
||||
exclusive_classes = false
|
||||
exclusive_classes = true
|
||||
ngram_size = 1
|
||||
no_output_layer = false
|
||||
```
|
||||
|
@ -170,7 +170,7 @@ labels = []
|
|||
|
||||
[components.textcat.model]
|
||||
@architectures = "spacy.TextCatBOW.v1"
|
||||
exclusive_classes = false
|
||||
exclusive_classes = true
|
||||
ngram_size = 1
|
||||
no_output_layer = false
|
||||
nO = null
|
||||
|
|
Loading…
Reference in New Issue
Block a user