From 247048654315760835ff7a926c7aef3f860b8f9e Mon Sep 17 00:00:00 2001 From: Ines Montani Date: Sun, 26 Jul 2020 13:18:43 +0200 Subject: [PATCH 1/7] Allow pipeline components to set default scores and weights --- spacy/default_config.cfg | 4 +- spacy/language.py | 20 +++++++- spacy/pipeline/dep_parser.pyx | 4 +- spacy/pipeline/ner.pyx | 4 +- spacy/pipeline/senter.pyx | 4 +- spacy/pipeline/tagger.pyx | 4 +- spacy/tests/pipeline/test_pipe_factories.py | 51 ++++++++++++++++++++- spacy/util.py | 17 +++++++ 8 files changed, 100 insertions(+), 8 deletions(-) diff --git a/spacy/default_config.cfg b/spacy/default_config.cfg index 7ef4eadfc..3959524a9 100644 --- a/spacy/default_config.cfg +++ b/spacy/default_config.cfg @@ -34,8 +34,8 @@ seed = 0 accumulate_gradient = 1 use_pytorch_for_gpu_memory = false # Control how scores are printed and checkpoints are evaluated. -scores = ["speed", "tag_acc", "dep_uas", "dep_las", "ents_f"] -score_weights = {"tag_acc": 0.2, "dep_las": 0.4, "ents_f": 0.4} +scores = ["token_acc", "speed"] +score_weights = {} # These settings are invalid for the transformer models. init_tok2vec = null discard_oversize = false diff --git a/spacy/language.py b/spacy/language.py index 93d239356..928bde563 100644 --- a/spacy/language.py +++ b/spacy/language.py @@ -21,7 +21,7 @@ from .pipe_analysis import analyze_pipes, analyze_all_pipes, validate_attrs from .gold import Example from .scorer import Scorer from .util import link_vectors_to_models, create_default_optimizer, registry -from .util import SimpleFrozenDict +from .util import SimpleFrozenDict, combine_score_weights from .lang.tokenizer_exceptions import URL_MATCH, BASE_EXCEPTIONS from .lang.punctuation import TOKENIZER_PREFIXES, TOKENIZER_SUFFIXES from .lang.punctuation import TOKENIZER_INFIXES @@ -218,16 +218,24 @@ class Language: @property def config(self) -> Config: self._config.setdefault("nlp", {}) + self._config.setdefault("training", {}) self._config["nlp"]["lang"] = self.lang # We're storing the filled config for each pipeline component and so # we can populate the config again later pipeline = {} + scores = self._config["training"].get("scores", []) + score_weights = [] for pipe_name in self.pipe_names: pipe_meta = self.get_pipe_meta(pipe_name) pipe_config = self.get_pipe_config(pipe_name) pipeline[pipe_name] = {"factory": pipe_meta.factory, **pipe_config} + scores.extend(pipe_meta.scores) + if pipe_meta.score_weights: + score_weights.append(pipe_meta.score_weights) self._config["nlp"]["pipeline"] = self.pipe_names self._config["components"] = pipeline + self._config["training"]["scores"] = list(scores) + self._config["training"]["score_weights"] = combine_score_weights(score_weights) if not srsly.is_json_serializable(self._config): raise ValueError(Errors.E961.format(config=self._config)) return self._config @@ -348,6 +356,8 @@ class Language: assigns: Iterable[str] = tuple(), requires: Iterable[str] = tuple(), retokenizes: bool = False, + scores: Iterable[str] = tuple(), + score_weights: Dict[str, float] = SimpleFrozenDict(), func: Optional[Callable] = None, ) -> Callable: """Register a new pipeline component factory. 
Can be used as a decorator @@ -393,6 +403,8 @@ class Language: default_config=default_config, assigns=validate_attrs(assigns), requires=validate_attrs(requires), + scores=scores, + score_weights=score_weights, retokenizes=retokenizes, ) cls.set_factory_meta(name, factory_meta) @@ -417,6 +429,8 @@ class Language: assigns: Iterable[str] = tuple(), requires: Iterable[str] = tuple(), retokenizes: bool = False, + scores: Iterable[str] = tuple(), + score_weights: Dict[str, float] = SimpleFrozenDict(), func: Optional[Callable[[Doc], Doc]] = None, ) -> Callable: """Register a new pipeline component. Can be used for stateless function @@ -450,6 +464,8 @@ class Language: assigns=assigns, requires=requires, retokenizes=retokenizes, + scores=scores, + score_weights=score_weights, func=factory_func, ) return component_func @@ -1484,6 +1500,8 @@ class FactoryMeta: assigns: Iterable[str] = tuple() requires: Iterable[str] = tuple() retokenizes: bool = False + scores: Iterable[str] = tuple() + score_weights: Dict[str, float] = None def _get_config_overrides( diff --git a/spacy/pipeline/dep_parser.pyx b/spacy/pipeline/dep_parser.pyx index 78926a984..574b56f9a 100644 --- a/spacy/pipeline/dep_parser.pyx +++ b/spacy/pipeline/dep_parser.pyx @@ -42,7 +42,9 @@ DEFAULT_PARSER_MODEL = Config().from_str(default_model_config)["model"] "learn_tokens": False, "min_action_freq": 30, "model": DEFAULT_PARSER_MODEL, - } + }, + scores=["dep_uas", "dep_las", "sents_f"], + score_weights={"dep_uas": 0.5, "dep_las": 0.5, "sents_f": 0.0}, ) def make_parser( nlp: Language, diff --git a/spacy/pipeline/ner.pyx b/spacy/pipeline/ner.pyx index cb2ca89d8..7c8e9e5d0 100644 --- a/spacy/pipeline/ner.pyx +++ b/spacy/pipeline/ner.pyx @@ -40,7 +40,9 @@ DEFAULT_NER_MODEL = Config().from_str(default_model_config)["model"] "learn_tokens": False, "min_action_freq": 30, "model": DEFAULT_NER_MODEL, - } + }, + scores=["ents_f", "ents_r", "ents_p"], + score_weights={"ents_f": 1.0, "ents_r": 0.0, "ents_p": 0.0}, ) def make_ner( nlp: Language, diff --git a/spacy/pipeline/senter.pyx b/spacy/pipeline/senter.pyx index c065ae72f..4139e82ad 100644 --- a/spacy/pipeline/senter.pyx +++ b/spacy/pipeline/senter.pyx @@ -33,7 +33,9 @@ DEFAULT_SENTER_MODEL = Config().from_str(default_model_config)["model"] @Language.factory( "senter", assigns=["token.is_sent_start"], - default_config={"model": DEFAULT_SENTER_MODEL} + default_config={"model": DEFAULT_SENTER_MODEL}, + scores=["sents_p", "sents_r", "sents_f"], + score_weights={"sents_p": 0.0, "sents_r": 0.0, "sents_f": 1.0}, ) def make_senter(nlp: Language, name: str, model: Model): return SentenceRecognizer(nlp.vocab, model, name) diff --git a/spacy/pipeline/tagger.pyx b/spacy/pipeline/tagger.pyx index 7cc11fb84..151057e16 100644 --- a/spacy/pipeline/tagger.pyx +++ b/spacy/pipeline/tagger.pyx @@ -39,7 +39,9 @@ DEFAULT_TAGGER_MODEL = Config().from_str(default_model_config)["model"] @Language.factory( "tagger", assigns=["token.tag"], - default_config={"model": DEFAULT_TAGGER_MODEL, "set_morphology": False} + default_config={"model": DEFAULT_TAGGER_MODEL, "set_morphology": False}, + scores=["tag_acc", "pos_acc"], + score_weights={"tag_acc": 0.5, "pos_acc": 0.5}, ) def make_tagger(nlp: Language, name: str, model: Model, set_morphology: bool): return Tagger(nlp.vocab, model, name, set_morphology=set_morphology) diff --git a/spacy/tests/pipeline/test_pipe_factories.py b/spacy/tests/pipeline/test_pipe_factories.py index db090fdf2..95aefbd74 100644 --- a/spacy/tests/pipeline/test_pipe_factories.py +++ 
b/spacy/tests/pipeline/test_pipe_factories.py @@ -3,7 +3,7 @@ from spacy.language import Language from spacy.lang.en import English from spacy.lang.de import German from spacy.tokens import Doc -from spacy.util import registry, SimpleFrozenDict +from spacy.util import registry, SimpleFrozenDict, combine_score_weights from thinc.api import Model, Linear from thinc.config import ConfigValidationError from pydantic import StrictInt, StrictStr @@ -328,3 +328,52 @@ def test_language_factories_invalid(): assert len(nlp.factories) with pytest.raises(NotImplementedError): nlp.factories["foo"] = "bar" + + +@pytest.mark.parametrize( + "weights,expected", + [ + ([{"a": 1.0}, {"b": 1.0}, {"c": 1.0}], {"a": 0.33, "b": 0.33, "c": 0.33}), + ([{"a": 1.0}, {"b": 50}, {"c": 123}], {"a": 0.33, "b": 0.33, "c": 0.33}), + ( + [{"a": 0.7, "b": 0.3}, {"c": 1.0}, {"d": 0.5, "e": 0.5}], + {"a": 0.23, "b": 0.1, "c": 0.33, "d": 0.17, "e": 0.17}, + ), + ( + [{"a": 100, "b": 400}, {"c": 0.5, "d": 0.5}], + {"a": 0.1, "b": 0.4, "c": 0.25, "d": 0.25}, + ), + ], +) +def test_language_factories_combine_score_weights(weights, expected): + result = combine_score_weights(weights) + assert sum(result.values()) in (0.99, 1.0) + assert result == expected + + +def test_language_factories_scores(): + name = "test_language_factories_scores" + func = lambda doc: doc + scores1 = ["a1", "a2"] + weights1 = {"a1": 0.5, "a2": 0.5} + scores2 = ["b1", "b2", "b3"] + weights2 = {"b1": 0.2, "b2": 0.7, "b3": 0.1} + Language.component( + f"{name}1", scores=scores1, score_weights=weights1, func=func, + ) + Language.component( + f"{name}2", scores=scores2, score_weights=weights2, func=func, + ) + meta1 = Language.get_factory_meta(f"{name}1") + assert meta1.scores == scores1 + assert meta1.score_weights == weights1 + meta2 = Language.get_factory_meta(f"{name}2") + assert meta2.scores == scores2 + assert meta2.score_weights == weights2 + nlp = Language(config={"training": {"scores": ["speed"], "score_weights": {}}}) + nlp.add_pipe(f"{name}1") + nlp.add_pipe(f"{name}2") + cfg = nlp.config["training"] + assert cfg["scores"] == ["speed", *scores1, *scores2] + expected_weights = {"a1": 0.25, "a2": 0.25, "b1": 0.1, "b2": 0.35, "b3": 0.05} + assert cfg["score_weights"] == expected_weights diff --git a/spacy/util.py b/spacy/util.py index c98ce2354..7eb605cc6 100644 --- a/spacy/util.py +++ b/spacy/util.py @@ -1130,6 +1130,23 @@ def get_arg_names(func: Callable) -> List[str]: return list(set([*argspec.args, *argspec.kwonlyargs])) +def combine_score_weights(weights: List[Dict[str, float]]) -> Dict[str, float]: + """Combine and normalize score weights defined by components, e.g. + {"ents_r": 0.2, "ents_p": 0.3, "ents_f": 0.5} and {"some_other_score": 1.0}. + + weights (List[dict]): The weights defined by the components. + RETURNS (Dict[str, float]): The combined and normalized weights. 
+ """ + result = {} + for w_dict in weights: + # We need to account for weights that don't sum to 1.0 and normalize the + # score weights accordingly, then divide score by the number of components + total = sum([w for w in w_dict.values()]) + for key, value in w_dict.items(): + result[key] = round(value / total / len(weights), 2) + return result + + class DummyTokenizer: # add dummy methods for to_bytes, from_bytes, to_disk and from_disk to # allow serialization (see #1557) From 4060c2d5a6de9b9c9f349f9fe54c7c924b138f4e Mon Sep 17 00:00:00 2001 From: Ines Montani Date: Sun, 26 Jul 2020 13:40:19 +0200 Subject: [PATCH 2/7] Fix test --- spacy/tests/pipeline/test_pipe_factories.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/spacy/tests/pipeline/test_pipe_factories.py b/spacy/tests/pipeline/test_pipe_factories.py index 95aefbd74..f91952955 100644 --- a/spacy/tests/pipeline/test_pipe_factories.py +++ b/spacy/tests/pipeline/test_pipe_factories.py @@ -370,7 +370,9 @@ def test_language_factories_scores(): meta2 = Language.get_factory_meta(f"{name}2") assert meta2.scores == scores2 assert meta2.score_weights == weights2 - nlp = Language(config={"training": {"scores": ["speed"], "score_weights": {}}}) + nlp = Language() + nlp._config["training"]["scores"] = ["speed"] + nlp._config["training"]["score_weights"] = {} nlp.add_pipe(f"{name}1") nlp.add_pipe(f"{name}2") cfg = nlp.config["training"] From f8cf378be9087864328d9ad5b90bc15e78021cba Mon Sep 17 00:00:00 2001 From: Adriane Boyd Date: Mon, 27 Jul 2020 10:21:31 +0200 Subject: [PATCH 3/7] Combine weights from multiple components Combine weights from multiple components for the same score. --- spacy/util.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/spacy/util.py b/spacy/util.py index 7eb605cc6..d23874bae 100644 --- a/spacy/util.py +++ b/spacy/util.py @@ -1143,7 +1143,8 @@ def combine_score_weights(weights: List[Dict[str, float]]) -> Dict[str, float]: # score weights accordingly, then divide score by the number of components total = sum([w for w in w_dict.values()]) for key, value in w_dict.items(): - result[key] = round(value / total / len(weights), 2) + weight = round(value / total / len(weights), 2) + result[key] = result.get(key, 0.0) + weight return result From baf19fd652ebbe08ec2d1a6cc494a6f1753388de Mon Sep 17 00:00:00 2001 From: Adriane Boyd Date: Mon, 27 Jul 2020 11:17:52 +0200 Subject: [PATCH 4/7] Update cats scoring to provide overall score * Provide top-level score as `attr_score` * Provide a description of the score as `attr_score_desc` * Provide all potential scores keys, setting unused keys to `None` * Update CLI evaluate accordingly --- spacy/cli/evaluate.py | 19 +++++++++-------- spacy/scorer.py | 31 ++++++++++++++++++++++------ spacy/tests/pipeline/test_textcat.py | 2 ++ 3 files changed, 37 insertions(+), 15 deletions(-) diff --git a/spacy/cli/evaluate.py b/spacy/cli/evaluate.py index 5cdbee065..83281543a 100644 --- a/spacy/cli/evaluate.py +++ b/spacy/cli/evaluate.py @@ -82,8 +82,7 @@ def evaluate( "NER P": "ents_p", "NER R": "ents_r", "NER F": "ents_f", - "Textcat AUC": "textcat_macro_auc", - "Textcat F": "textcat_macro_f", + "Textcat": "cats_score", "Sent P": "sents_p", "Sent R": "sents_r", "Sent F": "sents_f", @@ -91,6 +90,8 @@ def evaluate( results = {} for metric, key in metrics.items(): if key in scores: + if key == "cats_score": + metric = metric + " (" + scores.get("cats_score_desc", "unk") + ")" results[metric] = f"{scores[key]*100:.2f}" data = {re.sub(r"[\s/]", "_", 
k.lower()): v for k, v in results.items()} @@ -99,12 +100,12 @@ def evaluate( if "ents_per_type" in scores: if scores["ents_per_type"]: print_ents_per_type(msg, scores["ents_per_type"]) - if "textcat_f_per_cat" in scores: - if scores["textcat_f_per_cat"]: - print_textcats_f_per_cat(msg, scores["textcat_f_per_cat"]) - if "textcat_auc_per_cat" in scores: - if scores["textcat_auc_per_cat"]: - print_textcats_auc_per_cat(msg, scores["textcat_auc_per_cat"]) + if "cats_f_per_type" in scores: + if scores["cats_f_per_type"]: + print_textcats_f_per_cat(msg, scores["cats_f_per_type"]) + if "cats_auc_per_type" in scores: + if scores["cats_auc_per_type"]: + print_textcats_auc_per_cat(msg, scores["cats_auc_per_type"]) if displacy_path: factory_names = [nlp.get_pipe_meta(pipe).factory for pipe in nlp.pipe_names] @@ -170,7 +171,7 @@ def print_textcats_f_per_cat(msg: Printer, scores: Dict[str, Dict[str, float]]) data, header=("", "P", "R", "F"), aligns=("l", "r", "r", "r"), - title="Textcat F (per type)", + title="Textcat F (per label)", ) diff --git a/spacy/scorer.py b/spacy/scorer.py index a95fe70cf..2bbf453e7 100644 --- a/spacy/scorer.py +++ b/spacy/scorer.py @@ -298,7 +298,8 @@ class Scorer: **cfg ): """Returns PRF and ROC AUC scores for a doc-level attribute with a - dict with scores for each label like Doc.cats. + dict with scores for each label like Doc.cats. The reported overall + score depends on the scorer settings. examples (Iterable[Example]): Examples to score attr (str): The attribute to score. @@ -309,11 +310,16 @@ class Scorer: Defaults to True. positive_label (str): The positive label for a binary task with exclusive classes. Defaults to None. - RETURNS (dict): A dictionary containing the scores: - for binary exclusive with positive label: attr_p/r/f, - for 3+ exclusive classes, macro-averaged fscore: attr_macro_f, - for multilabel, macro-averaged AUC: attr_macro_auc, - for all: attr_f_per_type, attr_auc_per_type + RETURNS (dict): A dictionary containing the scores, with inapplicable + scores as None: + for all: + attr_score (one of attr_f / attr_macro_f / attr_macro_auc), + attr_score_desc (text description of the overall score), + attr_f_per_type, + attr_auc_per_type + for binary exclusive with positive label: attr_p/r/f + for 3+ exclusive classes, macro-averaged fscore: attr_macro_f + for multilabel, macro-averaged AUC: attr_macro_auc """ score = PRFScore() f_per_type = dict() @@ -362,6 +368,13 @@ class Scorer: ) ) results = { + attr + "_score": None, + attr + "_score_desc": None, + attr + "_p": None, + attr + "_r": None, + attr + "_f": None, + attr + "_macro_f": None, + attr + "_macro_auc": None, attr + "_f_per_type": {k: v.to_dict() for k, v in f_per_type.items()}, attr + "_auc_per_type": {k: v.score for k, v in auc_per_type.items()}, } @@ -369,16 +382,22 @@ class Scorer: results[attr + "_p"] = score.precision results[attr + "_r"] = score.recall results[attr + "_f"] = score.fscore + results[attr + "_score"] = results[attr + "_f"] + results[attr + "_score_desc"] = "F (" + positive_label + ")" elif not multi_label: results[attr + "_macro_f"] = sum( [score.fscore for label, score in f_per_type.items()] ) / (len(f_per_type) + 1e-100) + results[attr + "_score"] = results[attr + "_macro_f"] + results[attr + "_score_desc"] = "macro F" else: results[attr + "_macro_auc"] = max( sum([score.score for label, score in auc_per_type.items()]) / (len(auc_per_type) + 1e-100), -1, ) + results[attr + "_score"] = results[attr + "_macro_auc"] + results[attr + "_score_desc"] = "macro AUC" return results 
@staticmethod diff --git a/spacy/tests/pipeline/test_textcat.py b/spacy/tests/pipeline/test_textcat.py index 5e8dab0bd..15832d4bd 100644 --- a/spacy/tests/pipeline/test_textcat.py +++ b/spacy/tests/pipeline/test_textcat.py @@ -121,6 +121,8 @@ def test_overfitting_IO(): train_examples, component_cfg={"scorer": {"positive_label": "POSITIVE"}} ) assert scores["cats_f"] == 1.0 + assert scores["cats_score"] == 1.0 + assert "cats_score_desc" in scores # fmt: off From 8bb05077771d48b41e9addb1cbd418fed7898b72 Mon Sep 17 00:00:00 2001 From: Adriane Boyd Date: Mon, 27 Jul 2020 12:27:40 +0200 Subject: [PATCH 5/7] Add and update score methods and score weights Add and update `score` methods, provided `scores`, and default weights `default_score_weights` for pipeline components. * `scores` provides all top-level keys returned by `score` (merely informative, similar to `assigns`). * `default_score_weights` provides the default weights for a default config. * The keys from `default_score_weights` determine which values will be shown in the `spacy train` output, so keys with weight `0.0` will be displayed but not counted toward the overall score. --- spacy/cli/train.py | 2 +- spacy/language.py | 17 +++++++++-------- spacy/pipeline/dep_parser.pyx | 5 +++-- spacy/pipeline/entityruler.py | 5 +++++ spacy/pipeline/morphologizer.pyx | 4 +++- spacy/pipeline/ner.pyx | 5 +++-- spacy/pipeline/sentencizer.pyx | 8 ++++++-- spacy/pipeline/senter.pyx | 6 ++++-- spacy/pipeline/simple_ner.py | 6 ++++++ spacy/pipeline/tagger.pyx | 4 ++-- spacy/pipeline/textcat.py | 2 ++ spacy/tests/pipeline/test_pipe_factories.py | 18 +++++++++--------- spacy/util.py | 7 ++++--- 13 files changed, 57 insertions(+), 32 deletions(-) diff --git a/spacy/cli/train.py b/spacy/cli/train.py index 21fd0eb72..d52762525 100644 --- a/spacy/cli/train.py +++ b/spacy/cli/train.py @@ -395,7 +395,7 @@ def subdivide_batch(batch, accumulate_gradient): def setup_printer( training: Union[Dict[str, Any], Config], nlp: Language ) -> Callable[[Dict[str, Any]], None]: - score_cols = training["scores"] + score_cols = list(training["score_weights"]) score_widths = [max(len(col), 6) for col in score_cols] loss_cols = [f"Loss {pipe}" for pipe in nlp.pipe_names] loss_widths = [max(len(col), 8) for col in loss_cols] diff --git a/spacy/language.py b/spacy/language.py index 928bde563..0465a83c0 100644 --- a/spacy/language.py +++ b/spacy/language.py @@ -230,11 +230,12 @@ class Language: pipe_config = self.get_pipe_config(pipe_name) pipeline[pipe_name] = {"factory": pipe_meta.factory, **pipe_config} scores.extend(pipe_meta.scores) - if pipe_meta.score_weights: - score_weights.append(pipe_meta.score_weights) + if pipe_meta.default_score_weights: + score_weights.append(pipe_meta.default_score_weights) self._config["nlp"]["pipeline"] = self.pipe_names self._config["components"] = pipeline - self._config["training"]["scores"] = list(scores) + self._config["training"]["scores"] = sorted(set(scores)) + combined_score_weights = combine_score_weights(score_weights) self._config["training"]["score_weights"] = combine_score_weights(score_weights) if not srsly.is_json_serializable(self._config): raise ValueError(Errors.E961.format(config=self._config)) @@ -357,7 +358,7 @@ class Language: requires: Iterable[str] = tuple(), retokenizes: bool = False, scores: Iterable[str] = tuple(), - score_weights: Dict[str, float] = SimpleFrozenDict(), + default_score_weights: Dict[str, float] = SimpleFrozenDict(), func: Optional[Callable] = None, ) -> Callable: """Register a new pipeline component 
factory. Can be used as a decorator @@ -404,7 +405,7 @@ class Language: assigns=validate_attrs(assigns), requires=validate_attrs(requires), scores=scores, - score_weights=score_weights, + default_score_weights=default_score_weights, retokenizes=retokenizes, ) cls.set_factory_meta(name, factory_meta) @@ -430,7 +431,7 @@ class Language: requires: Iterable[str] = tuple(), retokenizes: bool = False, scores: Iterable[str] = tuple(), - score_weights: Dict[str, float] = SimpleFrozenDict(), + default_score_weights: Dict[str, float] = SimpleFrozenDict(), func: Optional[Callable[[Doc], Doc]] = None, ) -> Callable: """Register a new pipeline component. Can be used for stateless function @@ -465,7 +466,7 @@ class Language: requires=requires, retokenizes=retokenizes, scores=scores, - score_weights=score_weights, + default_score_weights=default_score_weights, func=factory_func, ) return component_func @@ -1501,7 +1502,7 @@ class FactoryMeta: requires: Iterable[str] = tuple() retokenizes: bool = False scores: Iterable[str] = tuple() - score_weights: Dict[str, float] = None + default_score_weights: Dict[str, float] = None def _get_config_overrides( diff --git a/spacy/pipeline/dep_parser.pyx b/spacy/pipeline/dep_parser.pyx index 574b56f9a..42a64c412 100644 --- a/spacy/pipeline/dep_parser.pyx +++ b/spacy/pipeline/dep_parser.pyx @@ -43,8 +43,8 @@ DEFAULT_PARSER_MODEL = Config().from_str(default_model_config)["model"] "min_action_freq": 30, "model": DEFAULT_PARSER_MODEL, }, - scores=["dep_uas", "dep_las", "sents_f"], - score_weights={"dep_uas": 0.5, "dep_las": 0.5, "sents_f": 0.0}, + scores=["dep_uas", "dep_las", "dep_las_per_type", "sents_p", "sents_r", "sents_f"], + default_score_weights={"dep_uas": 0.5, "dep_las": 0.5, "sents_f": 0.0}, ) def make_parser( nlp: Language, @@ -115,4 +115,5 @@ cdef class DependencyParser(Parser): results.update(Scorer.score_spans(examples, "sents", **kwargs)) results.update(Scorer.score_deps(examples, "dep", getter=dep_getter, ignore_labels=("p", "punct"), **kwargs)) + del results["sents_per_type"] return results diff --git a/spacy/pipeline/entityruler.py b/spacy/pipeline/entityruler.py index 869968136..dc6cf4359 100644 --- a/spacy/pipeline/entityruler.py +++ b/spacy/pipeline/entityruler.py @@ -23,6 +23,8 @@ PatternType = Dict[str, Union[str, List[Dict[str, Any]]]] "overwrite_ents": False, "ent_id_sep": DEFAULT_ENT_ID_SEP, }, + scores=["ents_p", "ents_r", "ents_f", "ents_per_type"], + default_score_weights={"ents_f": 1.0, "ents_p": 0.0, "ents_r": 0.0}, ) def make_entity_ruler( nlp: Language, @@ -305,6 +307,9 @@ class EntityRuler: label = f"{label}{self.ent_id_sep}{ent_id}" return label + def score(self, examples, **kwargs): + return Scorer.score_spans(examples, "ents", **kwargs) + def from_bytes( self, patterns_bytes: bytes, exclude: Iterable[str] = tuple() ) -> "EntityRuler": diff --git a/spacy/pipeline/morphologizer.pyx b/spacy/pipeline/morphologizer.pyx index fb80a9d86..c069038d3 100644 --- a/spacy/pipeline/morphologizer.pyx +++ b/spacy/pipeline/morphologizer.pyx @@ -39,7 +39,9 @@ DEFAULT_MORPH_MODEL = Config().from_str(default_model_config)["model"] @Language.factory( "morphologizer", assigns=["token.morph", "token.pos"], - default_config={"model": DEFAULT_MORPH_MODEL} + default_config={"model": DEFAULT_MORPH_MODEL}, + scores=["pos_acc", "morph_acc", "morph_per_feat"], + default_score_weights={"pos_acc": 0.5, "morph_acc": 0.5}, ) def make_morphologizer( nlp: Language, diff --git a/spacy/pipeline/ner.pyx b/spacy/pipeline/ner.pyx index 7c8e9e5d0..1d186b8d2 100644 --- 
a/spacy/pipeline/ner.pyx +++ b/spacy/pipeline/ner.pyx @@ -41,8 +41,9 @@ DEFAULT_NER_MODEL = Config().from_str(default_model_config)["model"] "min_action_freq": 30, "model": DEFAULT_NER_MODEL, }, - scores=["ents_f", "ents_r", "ents_p"], - score_weights={"ents_f": 1.0, "ents_r": 0.0, "ents_p": 0.0}, + scores=["ents_p", "ents_r", "ents_f", "ents_per_type"], + default_score_weights={"ents_f": 1.0, "ents_p": 0.0, "ents_r": 0.0}, + ) def make_ner( nlp: Language, diff --git a/spacy/pipeline/sentencizer.pyx b/spacy/pipeline/sentencizer.pyx index 70188f856..b1ed75efc 100644 --- a/spacy/pipeline/sentencizer.pyx +++ b/spacy/pipeline/sentencizer.pyx @@ -13,7 +13,9 @@ from .. import util @Language.factory( "sentencizer", assigns=["token.is_sent_start", "doc.sents"], - default_config={"punct_chars": None} + default_config={"punct_chars": None}, + scores=["sents_p", "sents_r", "sents_f"], + default_score_weights={"sents_f": 1.0, "sents_p": 0.0, "sents_r": 0.0}, ) def make_sentencizer( nlp: Language, @@ -132,7 +134,9 @@ class Sentencizer(Pipe): doc.c[j].sent_start = -1 def score(self, examples, **kwargs): - return Scorer.score_spans(examples, "sents", **kwargs) + results = Scorer.score_spans(examples, "sents", **kwargs) + del results["sents_per_type"] + return results def to_bytes(self, exclude=tuple()): """Serialize the sentencizer to a bytestring. diff --git a/spacy/pipeline/senter.pyx b/spacy/pipeline/senter.pyx index 4139e82ad..a5363ba61 100644 --- a/spacy/pipeline/senter.pyx +++ b/spacy/pipeline/senter.pyx @@ -35,7 +35,7 @@ DEFAULT_SENTER_MODEL = Config().from_str(default_model_config)["model"] assigns=["token.is_sent_start"], default_config={"model": DEFAULT_SENTER_MODEL}, scores=["sents_p", "sents_r", "sents_f"], - score_weights={"sents_p": 0.0, "sents_r": 0.0, "sents_f": 1.0}, + default_score_weights={"sents_f": 1.0, "sents_p": 0.0, "sents_r": 0.0}, ) def make_senter(nlp: Language, name: str, model: Model): return SentenceRecognizer(nlp.vocab, model, name) @@ -108,7 +108,9 @@ class SentenceRecognizer(Tagger): raise NotImplementedError def score(self, examples, **kwargs): - return Scorer.score_spans(examples, "sents", **kwargs) + results = Scorer.score_spans(examples, "sents", **kwargs) + del results["sents_per_type"] + return results def to_bytes(self, exclude=tuple()): serialize = {} diff --git a/spacy/pipeline/simple_ner.py b/spacy/pipeline/simple_ner.py index 4086c0710..ddb85fce0 100644 --- a/spacy/pipeline/simple_ner.py +++ b/spacy/pipeline/simple_ner.py @@ -34,6 +34,9 @@ DEFAULT_SIMPLE_NER_MODEL = Config().from_str(default_model_config)["model"] "simple_ner", assigns=["doc.ents"], default_config={"labels": [], "model": DEFAULT_SIMPLE_NER_MODEL}, + scores=["ents_p", "ents_r", "ents_f", "ents_per_type"], + default_score_weights={"ents_f": 1.0, "ents_p": 0.0, "ents_r": 0.0}, + ) def make_simple_ner( nlp: Language, name: str, model: Model, labels: Iterable[str] @@ -173,6 +176,9 @@ class SimpleNER(Pipe): def init_multitask_objectives(self, *args, **kwargs): pass + def score(self, examples, **kwargs): + return Scorer.score_spans(examples, "ents", **kwargs) + def _has_ner(example: Example) -> bool: for ner_tag in example.get_aligned_ner(): diff --git a/spacy/pipeline/tagger.pyx b/spacy/pipeline/tagger.pyx index 151057e16..76f9559a4 100644 --- a/spacy/pipeline/tagger.pyx +++ b/spacy/pipeline/tagger.pyx @@ -40,8 +40,8 @@ DEFAULT_TAGGER_MODEL = Config().from_str(default_model_config)["model"] "tagger", assigns=["token.tag"], default_config={"model": DEFAULT_TAGGER_MODEL, "set_morphology": False}, - 
scores=["tag_acc", "pos_acc"], - score_weights={"tag_acc": 0.5, "pos_acc": 0.5}, + scores=["tag_acc", "pos_acc", "lemma_acc"], + default_score_weights={"tag_acc": 1.0}, ) def make_tagger(nlp: Language, name: str, model: Model, set_morphology: bool): return Tagger(nlp.vocab, model, name, set_morphology=set_morphology) diff --git a/spacy/pipeline/textcat.py b/spacy/pipeline/textcat.py index 3c0808342..9ab04553d 100644 --- a/spacy/pipeline/textcat.py +++ b/spacy/pipeline/textcat.py @@ -56,6 +56,8 @@ dropout = null "textcat", assigns=["doc.cats"], default_config={"labels": [], "model": DEFAULT_TEXTCAT_MODEL}, + scores=["cats_score", "cats_score_desc", "cats_p", "cats_r", "cats_f", "cats_macro_f", "cats_macro_auc", "cats_f_per_type", "cats_macro_auc_per_type"], + default_score_weights={"cats_score": 1.0}, ) def make_textcat( nlp: Language, name: str, model: Model, labels: Iterable[str] diff --git a/spacy/tests/pipeline/test_pipe_factories.py b/spacy/tests/pipeline/test_pipe_factories.py index f91952955..62aa91003 100644 --- a/spacy/tests/pipeline/test_pipe_factories.py +++ b/spacy/tests/pipeline/test_pipe_factories.py @@ -343,6 +343,10 @@ def test_language_factories_invalid(): [{"a": 100, "b": 400}, {"c": 0.5, "d": 0.5}], {"a": 0.1, "b": 0.4, "c": 0.25, "d": 0.25}, ), + ( + [{"a": 0.5, "b": 0.5}, {"b": 1.0}], + {"a": 0.25, "b": 0.75}, + ), ], ) def test_language_factories_combine_score_weights(weights, expected): @@ -354,28 +358,24 @@ def test_language_factories_combine_score_weights(weights, expected): def test_language_factories_scores(): name = "test_language_factories_scores" func = lambda doc: doc - scores1 = ["a1", "a2"] weights1 = {"a1": 0.5, "a2": 0.5} - scores2 = ["b1", "b2", "b3"] weights2 = {"b1": 0.2, "b2": 0.7, "b3": 0.1} Language.component( - f"{name}1", scores=scores1, score_weights=weights1, func=func, + f"{name}1", scores=list(weights1), default_score_weights=weights1, func=func, ) Language.component( - f"{name}2", scores=scores2, score_weights=weights2, func=func, + f"{name}2", scores=list(weights2), default_score_weights=weights2, func=func, ) meta1 = Language.get_factory_meta(f"{name}1") - assert meta1.scores == scores1 - assert meta1.score_weights == weights1 + assert meta1.default_score_weights == weights1 meta2 = Language.get_factory_meta(f"{name}2") - assert meta2.scores == scores2 - assert meta2.score_weights == weights2 + assert meta2.default_score_weights == weights2 nlp = Language() nlp._config["training"]["scores"] = ["speed"] nlp._config["training"]["score_weights"] = {} nlp.add_pipe(f"{name}1") nlp.add_pipe(f"{name}2") cfg = nlp.config["training"] - assert cfg["scores"] == ["speed", *scores1, *scores2] + assert cfg["scores"] == sorted(["speed", *list(weights1.keys()), *list(weights2.keys())]) expected_weights = {"a1": 0.25, "a2": 0.25, "b1": 0.1, "b2": 0.35, "b3": 0.05} assert cfg["score_weights"] == expected_weights diff --git a/spacy/util.py b/spacy/util.py index d23874bae..9c4908a78 100644 --- a/spacy/util.py +++ b/spacy/util.py @@ -1139,9 +1139,10 @@ def combine_score_weights(weights: List[Dict[str, float]]) -> Dict[str, float]: """ result = {} for w_dict in weights: - # We need to account for weights that don't sum to 1.0 and normalize the - # score weights accordingly, then divide score by the number of components - total = sum([w for w in w_dict.values()]) + # We need to account for weights that don't sum to 1.0 and normalize + # the score weights accordingly, then divide score by the number of + # components. 
+ total = sum(w_dict.values()) for key, value in w_dict.items(): weight = round(value / total / len(weights), 2) result[key] = result.get(key, 0.0) + weight From 34c92dfe636fb586aab7a59992ab90a0883c2067 Mon Sep 17 00:00:00 2001 From: Adriane Boyd Date: Mon, 27 Jul 2020 15:08:51 +0200 Subject: [PATCH 6/7] Add missing Scorer imports --- spacy/pipeline/entityruler.py | 1 + spacy/pipeline/simple_ner.py | 1 + 2 files changed, 2 insertions(+) diff --git a/spacy/pipeline/entityruler.py b/spacy/pipeline/entityruler.py index dc6cf4359..f18cec472 100644 --- a/spacy/pipeline/entityruler.py +++ b/spacy/pipeline/entityruler.py @@ -8,6 +8,7 @@ from ..errors import Errors from ..util import ensure_path, to_disk, from_disk from ..tokens import Doc, Span from ..matcher import Matcher, PhraseMatcher +from ..scorer import Scorer DEFAULT_ENT_ID_SEP = "||" diff --git a/spacy/pipeline/simple_ner.py b/spacy/pipeline/simple_ner.py index ddb85fce0..ec7ab6b7a 100644 --- a/spacy/pipeline/simple_ner.py +++ b/spacy/pipeline/simple_ner.py @@ -8,6 +8,7 @@ from ..gold import Example, spans_from_biluo_tags, iob_to_biluo, biluo_to_iob from ..tokens import Doc from ..language import Language from ..vocab import Vocab +from ..scorer import Scorer from .. import util from .pipe import Pipe From fdf09cb2313e18702b9e59d55a15a10394ca3612 Mon Sep 17 00:00:00 2001 From: Adriane Boyd Date: Mon, 27 Jul 2020 15:34:42 +0200 Subject: [PATCH 7/7] Update Scorer API docs for score_cats --- website/docs/api/scorer.md | 112 +++++++++++++++++++------------------ 1 file changed, 57 insertions(+), 55 deletions(-) diff --git a/website/docs/api/scorer.md b/website/docs/api/scorer.md index ef4396e1b..8daefd241 100644 --- a/website/docs/api/scorer.md +++ b/website/docs/api/scorer.md @@ -8,8 +8,8 @@ source: spacy/scorer.py The `Scorer` computes evaluation scores. It's typically created by [`Language.evaluate`](/api/language#evaluate). -In addition, the `Scorer` provides a number of evaluation methods for -evaluating `Token` and `Doc` attributes. +In addition, the `Scorer` provides a number of evaluation methods for evaluating +`Token` and `Doc` attributes. ## Scorer.\_\_init\_\_ {#init tag="method"} @@ -28,10 +28,10 @@ Create a new `Scorer`. > scorer = Scorer(nlp) > ``` -| Name | Type | Description | -| ------------ | -------- | ------------------------------------------------------------ | -| `nlp` | Language | The pipeline to use for scoring, where each pipeline component may provide a scoring method. If none is provided, then a default pipeline for the multi-language code `xx` is constructed containing: `senter`, `tagger`, `morphologizer`, `parser`, `ner`, `textcat`. | -| **RETURNS** | `Scorer` | The newly created object. | +| Name | Type | Description | +| ----------- | -------- | ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | +| `nlp` | Language | The pipeline to use for scoring, where each pipeline component may provide a scoring method. If none is provided, then a default pipeline for the multi-language code `xx` is constructed containing: `senter`, `tagger`, `morphologizer`, `parser`, `ner`, `textcat`. | +| **RETURNS** | `Scorer` | The newly created object. 
| ## Scorer.score {#score tag="method"} @@ -39,13 +39,13 @@ Calculate the scores for a list of [`Example`](/api/example) objects using the scoring methods provided by the components in the pipeline. The returned `Dict` contains the scores provided by the individual pipeline -components. For the scoring methods provided by the `Scorer` and use by the -core pipeline components, the individual score names start with the `Token` or -`Doc` attribute being scored: `token_acc`, `token_p/r/f`, `sents_p/r/f`, -`tag_acc`, `pos_acc`, `morph_acc`, `morph_per_feat`, `lemma_acc`, `dep_uas`, -`dep_las`, `dep_las_per_type`, `ents_p/r/f`, `ents_per_type`, -`textcat_macro_auc`, `textcat_macro_f`. - +components. For the scoring methods provided by the `Scorer` and use by the core +pipeline components, the individual score names start with the `Token` or `Doc` +attribute being scored: `token_acc`, `token_p/r/f`, `sents_p/r/f`, `tag_acc`, +`pos_acc`, `morph_acc`, `morph_per_feat`, `lemma_acc`, `dep_uas`, `dep_las`, +`dep_las_per_type`, `ents_p/r/f`, `ents_per_type`, `textcat_macro_auc`, +`textcat_macro_f`. + > #### Example > > ```python @@ -53,19 +53,20 @@ core pipeline components, the individual score names start with the `Token` or > scorer.score(examples) > ``` -| Name | Type | Description | -| ----------- | --------- | --------------------------------------------------------------------------------------------------------| +| Name | Type | Description | +| ----------- | ------------------- | --------------------------------------------------------------------------------------------- | | `examples` | `Iterable[Example]` | The `Example` objects holding both the predictions and the correct gold-standard annotations. | | **RETURNS** | `Dict` | A dictionary of scores. | + ## Scorer.score_tokenization {#score_tokenization tag="staticmethod"} Scores the tokenization: -* `token_acc`: # correct tokens / # gold tokens -* `token_p/r/f`: PRF for token character spans +- `token_acc`: # correct tokens / # gold tokens +- `token_p/r/f`: PRF for token character spans -| Name | Type | Description | -| ----------- | --------- | --------------------------------------------------------------------------------------------------------| +| Name | Type | Description | +| ----------- | ------------------- | --------------------------------------------------------------------------------------------- | | `examples` | `Iterable[Example]` | The `Example` objects holding both the predictions and the correct gold-standard annotations. | | **RETURNS** | `Dict` | A dictionary containing the scores `token_acc/p/r/f`. | @@ -73,61 +74,62 @@ Scores the tokenization: Scores a single token attribute. -| Name | Type | Description | -| ----------- | --------- | --------------------------------------------------------------------------------------------------------| -| `examples` | `Iterable[Example]` | The `Example` objects holding both the predictions and the correct gold-standard annotations. | -| `attr` | `str` | The attribute to score. | +| Name | Type | Description | +| ----------- | ------------------- | ----------------------------------------------------------------------------------------------------------------------------- | +| `examples` | `Iterable[Example]` | The `Example` objects holding both the predictions and the correct gold-standard annotations. | +| `attr` | `str` | The attribute to score. | | `getter` | `callable` | Defaults to `getattr`. 
If provided, `getter(token, attr)` should return the value of the attribute for an individual `Token`. | -| **RETURNS** | `Dict` | A dictionary containing the score `attr_acc`. | +| **RETURNS** | `Dict` | A dictionary containing the score `attr_acc`. | ## Scorer.score_token_attr_per_feat {#score_token_attr_per_feat tag="staticmethod"} -Scores a single token attribute per feature for a token attribute in UFEATS format. +Scores a single token attribute per feature for a token attribute in UFEATS +format. -| Name | Type | Description | -| ----------- | --------- | --------------------------------------------------------------------------------------------------------| -| `examples` | `Iterable[Example]` | The `Example` objects holding both the predictions and the correct gold-standard annotations. | -| `attr` | `str` | The attribute to score. | +| Name | Type | Description | +| ----------- | ------------------- | ----------------------------------------------------------------------------------------------------------------------------- | +| `examples` | `Iterable[Example]` | The `Example` objects holding both the predictions and the correct gold-standard annotations. | +| `attr` | `str` | The attribute to score. | | `getter` | `callable` | Defaults to `getattr`. If provided, `getter(token, attr)` should return the value of the attribute for an individual `Token`. | -| **RETURNS** | `Dict` | A dictionary containing the per-feature PRF scores unders the key `attr_per_feat`. | +| **RETURNS** | `Dict` | A dictionary containing the per-feature PRF scores unders the key `attr_per_feat`. | ## Scorer.score_spans {#score_spans tag="staticmethod"} Returns PRF scores for labeled or unlabeled spans. -| Name | Type | Description | -| ----------- | --------- | --------------------------------------------------------------------------------------------------------| -| `examples` | `Iterable[Example]` | The `Example` objects holding both the predictions and the correct gold-standard annotations. | -| `attr` | `str` | The attribute to score. | -| `getter` | `callable` | Defaults to `getattr`. If provided, `getter(doc, attr)` should return the `Span` objects for an individual `Doc`. | +| Name | Type | Description | +| ----------- | ------------------- | --------------------------------------------------------------------------------------------------------------------- | +| `examples` | `Iterable[Example]` | The `Example` objects holding both the predictions and the correct gold-standard annotations. | +| `attr` | `str` | The attribute to score. | +| `getter` | `callable` | Defaults to `getattr`. If provided, `getter(doc, attr)` should return the `Span` objects for an individual `Doc`. | | **RETURNS** | `Dict` | A dictionary containing the PRF scores under the keys `attr_p/r/f` and the per-type PRF scores under `attr_per_type`. | ## Scorer.score_deps {#score_deps tag="staticmethod"} Calculate the UAS, LAS, and LAS per type scores for dependency parses. -| Name | Type | Description | -| ----------- | --------- | --------------------------------------------------------------------------------------------------------| -| `examples` | `Iterable[Example]` | The `Example` objects holding both the predictions and the correct gold-standard annotations. | -| `attr` | `str` | The attribute containing the dependency label. | -| `getter` | `callable` | Defaults to `getattr`. If provided, `getter(token, attr)` should return the value of the attribute for an individual `Token`. 
| -| `head_attr` | `str` | The attribute containing the head token. | -| `head_getter` | `callable` | Defaults to `getattr`. If provided, `head_getter(token, attr)` should return the head for an individual `Token`. | -| `ignore_labels` | `Tuple` | Labels to ignore while scoring (e.g., `punct`). -| **RETURNS** | `Dict` | A dictionary containing the scores: `attr_uas`, `attr_las`, and `attr_las_per_type`. | +| Name | Type | Description | +| --------------- | ------------------- | ----------------------------------------------------------------------------------------------------------------------------- | +| `examples` | `Iterable[Example]` | The `Example` objects holding both the predictions and the correct gold-standard annotations. | +| `attr` | `str` | The attribute containing the dependency label. | +| `getter` | `callable` | Defaults to `getattr`. If provided, `getter(token, attr)` should return the value of the attribute for an individual `Token`. | +| `head_attr` | `str` | The attribute containing the head token. | +| `head_getter` | `callable` | Defaults to `getattr`. If provided, `head_getter(token, attr)` should return the head for an individual `Token`. | +| `ignore_labels` | `Tuple` | Labels to ignore while scoring (e.g., `punct`). | +| **RETURNS** | `Dict` | A dictionary containing the scores: `attr_uas`, `attr_las`, and `attr_las_per_type`. | ## Scorer.score_cats {#score_cats tag="staticmethod"} Calculate PRF and ROC AUC scores for a doc-level attribute that is a dict -containing scores for each label like `Doc.cats`. - -| Name | Type | Description | -| ----------- | --------- | --------------------------------------------------------------------------------------------------------| -| `examples` | `Iterable[Example]` | The `Example` objects holding both the predictions and the correct gold-standard annotations. | -| `attr` | `str` | The attribute to score. | -| `getter` | `callable` | Defaults to `getattr`. If provided, `getter(doc, attr)` should return the cats for an individual `Doc`. | -| labels | `Iterable[str]` | The set of possible labels. Defaults to `[]`. | -| multi_label | `bool` | Whether the attribute allows multiple labels. Defaults to `True`. | -| positive_label | `str` | The positive label for a binary task with exclusive classes. Defaults to `None`. | -| **RETURNS** | `Dict` | A dictionary containing the scores: 1) for binary exclusive with positive label: `attr_p/r/f`; 2) for 3+ exclusive classes, macro-averaged fscore: `attr_macro_f`; 3) for multilabel, macro-averaged AUC: `attr_macro_auc`; 4) for all: `attr_f_per_type`, `attr_auc_per_type` | +containing scores for each label like `Doc.cats`. The reported overall score +depends on the scorer settings. +| Name | Type | Description | +| ---------------- | ------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | +| `examples` | `Iterable[Example]` | The `Example` objects holding both the predictions and the correct gold-standard annotations. | +| `attr` | `str` | The attribute to score. | +| `getter` | `callable` | Defaults to `getattr`. If provided, `getter(doc, attr)` should return the cats for an individual `Doc`. 
| +| labels | `Iterable[str]` | The set of possible labels. Defaults to `[]`. | +| `multi_label` | `bool` | Whether the attribute allows multiple labels. Defaults to `True`. | +| `positive_label` | `str` | The positive label for a binary task with exclusive classes. Defaults to `None`. | +| **RETURNS** | `Dict` | A dictionary containing the scores, with inapplicable scores as `None`: 1) for all: `attr_score` (one of `attr_f` / `attr_macro_f` / `attr_macro_auc`), `attr_score_desc` (text description of the overall score), `attr_f_per_type`, `attr_auc_per_type`; 2) for binary exclusive with positive label: `attr_p/r/f`; 3) for 3+ exclusive classes, macro-averaged fscore: `attr_macro_f`; 4) for multilabel, macro-averaged AUC: `attr_macro_auc` |
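
The sketch below pulls together the API these patches introduce, mirroring `test_language_factories_scores` in `spacy/tests/pipeline/test_pipe_factories.py` and the `combine_score_weights` helper: a component declares the score keys it produces (`scores`) and how those keys should be weighted by default (`default_score_weights`), and `Language.config` merges the per-component weights into `training.score_weights`. This is a minimal illustration, not part of the patches themselves — component names like `my_scorer1` are made up, and it assumes a spaCy checkout that already includes the changes above.

```python
# Minimal sketch of the scores / default_score_weights API added in this
# patch series. Assumes a spaCy build containing these patches; the
# component names below are illustrative only.
from spacy.language import Language
from spacy.util import combine_score_weights


@Language.component(
    "my_scorer1",
    scores=["a1", "a2"],  # top-level keys this component's score() would return
    default_score_weights={"a1": 0.5, "a2": 0.5},
)
def my_scorer1(doc):
    return doc


@Language.component(
    "my_scorer2",
    scores=["b1", "b2", "b3"],
    default_score_weights={"b1": 0.2, "b2": 0.7, "b3": 0.1},
)
def my_scorer2(doc):
    return doc


# Each component's weights are first normalized to sum to 1.0, then divided
# by the number of components, so the combined weights also sum to ~1.0.
combined = combine_score_weights(
    [{"a1": 0.5, "a2": 0.5}, {"b1": 0.2, "b2": 0.7, "b3": 0.1}]
)
print(combined)
# {'a1': 0.25, 'a2': 0.25, 'b1': 0.1, 'b2': 0.35, 'b3': 0.05}

# The same combination happens automatically when the components are added
# to a pipeline and the config is generated.
nlp = Language()
nlp.add_pipe("my_scorer1")
nlp.add_pipe("my_scorer2")
print(nlp.config["training"]["score_weights"])  # same combined weights as above
```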