Small adjustments to Scorer and docs

2025-11-06 10:57:34 +03:00 · 2020-07-28 21:39:42 +02:00 · 2020-07-28 21:39:42 +02:00 · ac24adec73
commit ac24adec73
parent 256b24b720
2 changed files with 251 additions and 148 deletions
--- a/spacy/scorer.py
+++ b/spacy/scorer.py
@ -1,47 +1,53 @@
+from typing import Optional, Iterable, Dict, Any, Callable, Tuple, TYPE_CHECKING
 import numpy as np

+from .gold import Example
+from .tokens import Token
 from .errors import Errors
 from .util import get_lang_class
 from .morphology import Morphology

+if TYPE_CHECKING:
+    # This lets us add type hints for mypy etc. without causing circular imports
+    from .language import Language  # noqa: F401
+
+
+DEFAULT_PIPELINE = ["senter", "tagger", "morphologizer", "parser", "ner", "textcat"]
+

 class PRFScore:
-    """
-    A precision / recall / F score
-    """
+    """A precision / recall / F score."""

-    def __init__(self):
+    def __init__(self) -> None:
        self.tp = 0
        self.fp = 0
        self.fn = 0

-    def score_set(self, cand, gold):
+    def score_set(self, cand: set, gold: set) -> None:
        self.tp += len(cand.intersection(gold))
        self.fp += len(cand - gold)
        self.fn += len(gold - cand)

    @property
-    def precision(self):
+    def precision(self) -> float:
        return self.tp / (self.tp + self.fp + 1e-100)

    @property
-    def recall(self):
+    def recall(self) -> float:
        return self.tp / (self.tp + self.fn + 1e-100)

    @property
-    def fscore(self):
+    def fscore(self) -> float:
        p = self.precision
        r = self.recall
        return 2 * ((p * r) / (p + r + 1e-100))

-    def to_dict(self):
+    def to_dict(self) -> Dict[str, float]:
        return {"p": self.precision, "r": self.recall, "f": self.fscore}


 class ROCAUCScore:
-    """
-    An AUC ROC score.
-    """
+    """An AUC ROC score."""

    def __init__(self):
        self.golds = []
@ -49,7 +55,7 @@ class ROCAUCScore:
        self.saved_score = 0.0
        self.saved_score_at_len = 0

-    def score_set(self, cand, gold):
+    def score_set(self, cand, gold) -> None:
        self.cands.append(cand)
        self.golds.append(gold)

@ -70,7 +76,13 @@ class ROCAUCScore:
 class Scorer:
    """Compute evaluation scores."""

-    def __init__(self, nlp=None, **cfg):
+    def __init__(
+        self,
+        nlp: Optional["Language"] = None,
+        default_lang: str = "xx",
+        default_pipeline=DEFAULT_PIPELINE,
+        **cfg,
+    ) -> None:
        """Initialize the Scorer.
        RETURNS (Scorer): The newly created object.

@ -78,44 +90,39 @@ class Scorer:
        """
        self.nlp = nlp
        self.cfg = cfg
-
        if not nlp:
-            # create a default pipeline
-            nlp = get_lang_class("xx")()
-            nlp.add_pipe("senter")
-            nlp.add_pipe("tagger")
-            nlp.add_pipe("morphologizer")
-            nlp.add_pipe("parser")
-            nlp.add_pipe("ner")
-            nlp.add_pipe("textcat")
+            nlp = get_lang_class(default_lang)()
+            for pipe in default_pipeline:
+                nlp.add_pipe(pipe)
            self.nlp = nlp

-    def score(self, examples):
+    def score(self, examples: Iterable[Example]) -> Dict[str, Any]:
        """Evaluate a list of Examples.

        examples (Iterable[Example]): The predicted annotations + correct annotations.
        RETURNS (Dict): A dictionary of scores.
+
        DOCS: https://spacy.io/api/scorer#score
        """
        scores = {}
-
        if hasattr(self.nlp.tokenizer, "score"):
            scores.update(self.nlp.tokenizer.score(examples, **self.cfg))
        for name, component in self.nlp.pipeline:
            if hasattr(component, "score"):
                scores.update(component.score(examples, **self.cfg))
-
        return scores

    @staticmethod
-    def score_tokenization(examples, **cfg):
+    def score_tokenization(examples: Iterable[Example], **cfg) -> Dict[str, float]:
        """Returns accuracy and PRF scores for tokenization.
-
        * token_acc: # correct tokens / # gold tokens
        * token_p/r/f: PRF for token character spans

        examples (Iterable[Example]): Examples to score
-        RETURNS (dict): A dictionary containing the scores token_acc/p/r/f.
+        RETURNS (Dict[str, float]): A dictionary containing the scores
+            token_acc/p/r/f.
+
+        DOCS: https://spacy.io/api/scorer#score_tokenization
        """
        acc_score = PRFScore()
        prf_score = PRFScore()
@ -146,16 +153,24 @@ class Scorer:
        }

    @staticmethod
-    def score_token_attr(examples, attr, getter=getattr, **cfg):
+    def score_token_attr(
+        examples: Iterable[Example],
+        attr: str,
+        *,
+        getter: Callable[[Token, str], Any] = getattr,
+        **cfg,
+    ) -> Dict[str, float]:
        """Returns an accuracy score for a token-level attribute.

        examples (Iterable[Example]): Examples to score
        attr (str): The attribute to score.
-        getter (callable): Defaults to getattr. If provided,
+        getter (Callable[[Token, str], Any]): Defaults to getattr. If provided,
            getter(token, attr) should return the value of the attribute for an
            individual token.
-        RETURNS (dict): A dictionary containing the accuracy score under the
-            key attr_acc.
+        RETURNS (Dict[str, float]): A dictionary containing the accuracy score
+            under the key attr_acc.
+
+        DOCS: https://spacy.io/api/scorer#score_token_attr
        """
        tag_score = PRFScore()
        for example in examples:
@ -173,17 +188,21 @@ class Scorer:
                    gold_i = align.x2y[token.i].dataXd[0, 0]
                    pred_tags.add((gold_i, getter(token, attr)))
            tag_score.score_set(pred_tags, gold_tags)
-        return {
-            attr + "_acc": tag_score.fscore,
-        }
+        return {f"{attr}_acc": tag_score.fscore}

    @staticmethod
-    def score_token_attr_per_feat(examples, attr, getter=getattr, **cfg):
+    def score_token_attr_per_feat(
+        examples: Iterable[Example],
+        attr: str,
+        *,
+        getter: Callable[[Token, str], Any] = getattr,
+        **cfg,
+    ):
        """Return PRF scores per feat for a token attribute in UFEATS format.

        examples (Iterable[Example]): Examples to score
        attr (str): The attribute to score.
-        getter (callable): Defaults to getattr. If provided,
+        getter (Callable[[Token, str], Any]): Defaults to getattr. If provided,
            getter(token, attr) should return the value of the attribute for an
            individual token.
        RETURNS (dict): A dictionary containing the per-feat PRF scores unders
@ -224,20 +243,26 @@ class Scorer:
                per_feat[field].score_set(
                    pred_per_feat.get(field, set()), gold_per_feat.get(field, set()),
                )
-        return {
-            attr + "_per_feat": per_feat,
-        }
+        return {f"{attr}_per_feat": per_feat}

    @staticmethod
-    def score_spans(examples, attr, getter=getattr, **cfg):
+    def score_spans(
+        examples: Iterable[Example],
+        attr: str,
+        *,
+        getter: Callable[[Token, str], Any] = getattr,
+        **cfg,
+    ) -> Dict[str, Any]:
        """Returns PRF scores for labeled spans.

        examples (Iterable[Example]): Examples to score
        attr (str): The attribute to score.
-        getter (callable): Defaults to getattr. If provided,
+        getter (Callable[[Token, str], Any]): Defaults to getattr. If provided,
            getter(doc, attr) should return the spans for the individual doc.
-        RETURNS (dict): A dictionary containing the PRF scores under the
-            keys attr_p/r/f and the per-type PRF scores under attr_per_type.
+        RETURNS (Dict[str, Any]): A dictionary containing the PRF scores under
+            the keys attr_p/r/f and the per-type PRF scores under attr_per_type.
+
+        DOCS: https://spacy.io/api/scorer#score_spans
        """
        score = PRFScore()
        score_per_type = dict()
@ -257,14 +282,12 @@ class Scorer:
            # Find all predidate labels, for all and per type
            gold_spans = set()
            pred_spans = set()
-
            # Special case for ents:
            # If we have missing values in the gold, we can't easily tell
            # whether our NER predictions are true.
            # It seems bad but it's what we've always done.
            if attr == "ents" and not all(token.ent_iob != 0 for token in gold_doc):
                continue
-
            for span in getter(gold_doc, attr):
                gold_span = (span.label_, span.start, span.end - 1)
                gold_spans.add(gold_span)
@ -280,38 +303,39 @@ class Scorer:
            # Score for all labels
            score.score_set(pred_spans, gold_spans)
        results = {
-            attr + "_p": score.precision,
-            attr + "_r": score.recall,
-            attr + "_f": score.fscore,
-            attr + "_per_type": {k: v.to_dict() for k, v in score_per_type.items()},
+            f"{attr}_p": score.precision,
+            f"{attr}_r": score.recall,
+            f"{attr}_f": score.fscore,
+            f"{attr}_per_type": {k: v.to_dict() for k, v in score_per_type.items()},
        }
        return results

    @staticmethod
    def score_cats(
-        examples,
-        attr,
-        getter=getattr,
-        labels=[],
-        multi_label=True,
-        positive_label=None,
-        **cfg
-    ):
+        examples: Iterable[Example],
+        attr: str,
+        *,
+        getter: Callable[[Token, str], Any] = getattr,
+        labels: Iterable[str] = tuple(),
+        multi_label: bool = True,
+        positive_label: Optional[str] = None,
+        **cfg,
+    ) -> Dict[str, Any]:
        """Returns PRF and ROC AUC scores for a doc-level attribute with a
        dict with scores for each label like Doc.cats. The reported overall
        score depends on the scorer settings.

        examples (Iterable[Example]): Examples to score
        attr (str): The attribute to score.
-        getter (callable): Defaults to getattr. If provided,
+        getter (Callable[[Token, str], Any]): Defaults to getattr. If provided,
            getter(doc, attr) should return the values for the individual doc.
        labels (Iterable[str]): The set of possible labels. Defaults to [].
        multi_label (bool): Whether the attribute allows multiple labels.
            Defaults to True.
        positive_label (str): The positive label for a binary task with
            exclusive classes. Defaults to None.
-        RETURNS (dict): A dictionary containing the scores, with inapplicable
-                scores as None:
+        RETURNS (Dict[str, Any]): A dictionary containing the scores, with
+            inapplicable scores as None:
            for all:
                attr_score (one of attr_f / attr_macro_f / attr_macro_auc),
                attr_score_desc (text description of the overall score),
@ -320,6 +344,8 @@ class Scorer:
            for binary exclusive with positive label: attr_p/r/f
            for 3+ exclusive classes, macro-averaged fscore: attr_macro_f
            for multilabel, macro-averaged AUC: attr_macro_auc
+
+        DOCS: https://spacy.io/api/scorer#score_cats
        """
        score = PRFScore()
        f_per_type = dict()
@ -368,64 +394,67 @@ class Scorer:
                    )
                )
        results = {
-            attr + "_score": None,
-            attr + "_score_desc": None,
-            attr + "_p": None,
-            attr + "_r": None,
-            attr + "_f": None,
-            attr + "_macro_f": None,
-            attr + "_macro_auc": None,
-            attr + "_f_per_type": {k: v.to_dict() for k, v in f_per_type.items()},
-            attr + "_auc_per_type": {k: v.score for k, v in auc_per_type.items()},
+            f"{attr}_score": None,
+            f"{attr}_score_desc": None,
+            f"{attr}_p": None,
+            f"{attr}_r": None,
+            f"{attr}_f": None,
+            f"{attr}_macro_f": None,
+            f"{attr}_macro_auc": None,
+            f"{attr}_f_per_type": {k: v.to_dict() for k, v in f_per_type.items()},
+            f"{attr}_auc_per_type": {k: v.score for k, v in auc_per_type.items()},
        }
        if len(labels) == 2 and not multi_label and positive_label:
-            results[attr + "_p"] = score.precision
-            results[attr + "_r"] = score.recall
-            results[attr + "_f"] = score.fscore
-            results[attr + "_score"] = results[attr + "_f"]
-            results[attr + "_score_desc"] = "F (" + positive_label + ")"
+            results[f"{attr}_p"] = score.precision
+            results[f"{attr}_r"] = score.recall
+            results[f"{attr}_f"] = score.fscore
+            results[f"{attr}_score"] = results[f"{attr}_f"]
+            results[f"{attr}_score_desc"] = f"F ({positive_label})"
        elif not multi_label:
-            results[attr + "_macro_f"] = sum(
+            results[f"{attr}_macro_f"] = sum(
                [score.fscore for label, score in f_per_type.items()]
            ) / (len(f_per_type) + 1e-100)
-            results[attr + "_score"] = results[attr + "_macro_f"]
-            results[attr + "_score_desc"] = "macro F"
+            results[f"{attr}_score"] = results[f"{attr}_macro_f"]
+            results[f"{attr}_score_desc"] = "macro F"
        else:
-            results[attr + "_macro_auc"] = max(
+            results[f"{attr}_macro_auc"] = max(
                sum([score.score for label, score in auc_per_type.items()])
                / (len(auc_per_type) + 1e-100),
                -1,
            )
-            results[attr + "_score"] = results[attr + "_macro_auc"]
-            results[attr + "_score_desc"] = "macro AUC"
+            results[f"{attr}_score"] = results[f"{attr}_macro_auc"]
+            results[f"{attr}_score_desc"] = "macro AUC"
        return results

    @staticmethod
    def score_deps(
-        examples,
-        attr,
-        getter=getattr,
-        head_attr="head",
-        head_getter=getattr,
-        ignore_labels=tuple(),
-        **cfg
-    ):
+        examples: Iterable[Example],
+        attr: str,
+        *,
+        getter: Callable[[Token, str], Any] = getattr,
+        head_attr: str = "head",
+        head_getter: Callable[[Token, str], Any] = getattr,
+        ignore_labels: Tuple[str] = tuple(),
+        **cfg,
+    ) -> Dict[str, Any]:
        """Returns the UAS, LAS, and LAS per type scores for dependency
        parses.

        examples (Iterable[Example]): Examples to score
        attr (str): The attribute containing the dependency label.
-        getter (callable): Defaults to getattr. If provided,
+        getter (Callable[[Token, str], Any]): Defaults to getattr. If provided,
            getter(token, attr) should return the value of the attribute for an
            individual token.
        head_attr (str): The attribute containing the head token. Defaults to
            'head'.
-        head_getter (callable): Defaults to getattr. If provided,
+        head_getter (Callable[[Token, str], Any]): Defaults to getattr. If provided,
            head_getter(token, attr) should return the value of the head for an
            individual token.
        ignore_labels (Tuple): Labels to ignore while scoring (e.g., punct).
-        RETURNS (dict): A dictionary containing the scores:
+        RETURNS (Dict[str, Any]): A dictionary containing the scores:
            attr_uas, attr_las, and attr_las_per_type.
+
+        DOCS: https://spacy.io/api/scorer#score_deps
        """
        unlabelled = PRFScore()
        labelled = PRFScore()
@ -483,10 +512,11 @@ class Scorer:
                set(item[:2] for item in pred_deps), set(item[:2] for item in gold_deps)
            )
        return {
-            attr + "_uas": unlabelled.fscore,
-            attr + "_las": labelled.fscore,
-            attr
-            + "_las_per_type": {k: v.to_dict() for k, v in labelled_per_dep.items()},
+            f"{attr}_uas": unlabelled.fscore,
+            f"{attr}_las": labelled.fscore,
+            f"{attr}_las_per_type": {
+                k: v.to_dict() for k, v in labelled_per_dep.items()
+            },
        }


--- a/website/docs/api/scorer.md
+++ b/website/docs/api/scorer.md
@ -6,10 +6,9 @@ source: spacy/scorer.py
 ---

 The `Scorer` computes evaluation scores. It's typically created by
-[`Language.evaluate`](/api/language#evaluate).
-
-In addition, the `Scorer` provides a number of evaluation methods for evaluating
-`Token` and `Doc` attributes.
+[`Language.evaluate`](/api/language#evaluate). In addition, the `Scorer`
+provides a number of evaluation methods for evaluating [`Token`](/api/token) and
+[`Doc`](/api/doc) attributes.

 ## Scorer.\_\_init\_\_ {#init tag="method"}

@ -20,10 +19,10 @@ Create a new `Scorer`.
 > ```python
 > from spacy.scorer import Scorer
 >
-> # default scoring pipeline
+> # Default scoring pipeline
 > scorer = Scorer()
 >
-> # provided scoring pipeline
+> # Provided scoring pipeline
 > nlp = spacy.load("en_core_web_sm")
 > scorer = Scorer(nlp)
 > ```
@ -41,16 +40,20 @@ scoring methods provided by the components in the pipeline.
 The returned `Dict` contains the scores provided by the individual pipeline
 components. For the scoring methods provided by the `Scorer` and used by the core
 pipeline components, the individual score names start with the `Token` or `Doc`
-attribute being scored: `token_acc`, `token_p/r/f`, `sents_p/r/f`, `tag_acc`,
-`pos_acc`, `morph_acc`, `morph_per_feat`, `lemma_acc`, `dep_uas`, `dep_las`,
-`dep_las_per_type`, `ents_p/r/f`, `ents_per_type`, `textcat_macro_auc`,
-`textcat_macro_f`.
+attribute being scored:
+
+- `token_acc`, `token_p`, `token_r`, `token_f`
+- `sents_p`, `sents_r`, `sents_f`
+- `tag_acc`, `pos_acc`, `morph_acc`, `morph_per_feat`, `lemma_acc`
+- `dep_uas`, `dep_las`, `dep_las_per_type`
+- `ents_p`, `ents_r`, `ents_f`, `ents_per_type`
+- `textcat_macro_auc`, `textcat_macro_f`

 > #### Example
 >
 > ```python
 > scorer = Scorer()
-> scorer.score(examples)
+> scores = scorer.score(examples)
 > ```

 | Name        | Type                | Description                                                                                   |
@ -58,78 +61,148 @@ attribute being scored: `token_acc`, `token_p/r/f`, `sents_p/r/f`, `tag_acc`,
 | `examples`  | `Iterable[Example]` | The `Example` objects holding both the predictions and the correct gold-standard annotations. |
 | **RETURNS** | `Dict`              | A dictionary of scores.                                                                       |

-## Scorer.score_tokenization {#score_tokenization tag="staticmethod"}
+## Scorer.score_tokenization {#score_tokenization tag="staticmethod" new="3"}

 Scores the tokenization:

- `token_acc`: # correct tokens / # gold tokens
- `token_p/r/f`: PRF for token character spans
+- `token_acc`: number of correct tokens / number of gold tokens
+- `token_p`, `token_r`, `token_f`: precision, recall and F-score for token
+  character spans
+
+> #### Example
+>
+> ```python
+> scores = Scorer.score_tokenization(examples)
+> ```

 | Name        | Type                | Description                                                                                   |
 | ----------- | ------------------- | --------------------------------------------------------------------------------------------- |
 | `examples`  | `Iterable[Example]` | The `Example` objects holding both the predictions and the correct gold-standard annotations. |
-| **RETURNS** | `Dict`              | A dictionary containing the scores `token_acc/p/r/f`.                                         |
+| **RETURNS** | `Dict`              | A dictionary containing the scores `token_acc`, `token_p`, `token_r`, `token_f`.              |

-## Scorer.score_token_attr {#score_token_attr tag="staticmethod"}
+## Scorer.score_token_attr {#score_token_attr tag="staticmethod" new="3"}

 Scores a single token attribute.

+> #### Example
+>
+> ```python
+> scores = Scorer.score_token_attr(examples, "pos")
+> print(scores["pos_acc"])
+> ```
+
 | Name           | Type                | Description                                                                                                                   |
-| ----------- | ------------------- | ----------------------------------------------------------------------------------------------------------------------------- |
+| -------------- | ------------------- | ----------------------------------------------------------------------------------------------------------------------------- |
 | `examples`     | `Iterable[Example]` | The `Example` objects holding both the predictions and the correct gold-standard annotations.                                 |
 | `attr`         | `str`               | The attribute to score.                                                                                                       |
-| `getter`    | `callable`          | Defaults to `getattr`. If provided, `getter(token, attr)` should return the value of the attribute for an individual `Token`. |
-| **RETURNS** | `Dict`              | A dictionary containing the score `attr_acc`.                                                                                 |
+| _keyword-only_ |                     |                                                                                                                               |
+| `getter`       | `Callable`          | Defaults to `getattr`. If provided, `getter(token, attr)` should return the value of the attribute for an individual `Token`. |
+| **RETURNS**    | `Dict[str, float]`  | A dictionary containing the score `{attr}_acc`.                                                                               |

-## Scorer.score_token_attr_per_feat {#score_token_attr_per_feat tag="staticmethod"}
+## Scorer.score_token_attr_per_feat {#score_token_attr_per_feat tag="staticmethod" new="3"}

-Scores a single token attribute per feature for a token attribute in UFEATS
+Scores a single token attribute per feature for a token attribute in
+[UFEATS](https://universaldependencies.org/format.html#morphological-annotation)
 format.

+> #### Example
+>
+> ```python
+> scores = Scorer.score_token_attr_per_feat(examples, "morph")
+> print(scores["morph_per_feat"])
+> ```
+
 | Name           | Type                | Description                                                                                                                   |
-| ----------- | ------------------- | ----------------------------------------------------------------------------------------------------------------------------- |
+| -------------- | ------------------- | ----------------------------------------------------------------------------------------------------------------------------- |
 | `examples`     | `Iterable[Example]` | The `Example` objects holding both the predictions and the correct gold-standard annotations.                                 |
 | `attr`         | `str`               | The attribute to score.                                                                                                       |
-| `getter`    | `callable`          | Defaults to `getattr`. If provided, `getter(token, attr)` should return the value of the attribute for an individual `Token`. |
-| **RETURNS** | `Dict`              | A dictionary containing the per-feature PRF scores unders the key `attr_per_feat`.                                            |
+| _keyword-only_ |                     |                                                                                                                               |
+| `getter`       | `Callable`          | Defaults to `getattr`. If provided, `getter(token, attr)` should return the value of the attribute for an individual `Token`. |
+| **RETURNS**    | `Dict`              | A dictionary containing the per-feature PRF scores under the key `{attr}_per_feat`.                                           |

-## Scorer.score_spans {#score_spans tag="staticmethod"}
+## Scorer.score_spans {#score_spans tag="staticmethod" new="3"}

 Returns PRF scores for labeled or unlabeled spans.

+> #### Example
+>
+> ```python
+> scores = Scorer.score_spans(examples, "ents")
+> print(scores["ents_f"])
+> ```
+
 | Name           | Type                | Description                                                                                                                                   |
-| ----------- | ------------------- | --------------------------------------------------------------------------------------------------------------------- |
+| -------------- | ------------------- | --------------------------------------------------------------------------------------------------------------------------------------------- |
 | `examples`     | `Iterable[Example]` | The `Example` objects holding both the predictions and the correct gold-standard annotations.                                                 |
 | `attr`         | `str`               | The attribute to score.                                                                                                                       |
-| `getter`    | `callable`          | Defaults to `getattr`. If provided, `getter(doc, attr)` should return the `Span` objects for an individual `Doc`.     |
-| **RETURNS** | `Dict`              | A dictionary containing the PRF scores under the keys `attr_p/r/f` and the per-type PRF scores under `attr_per_type`. |
+| _keyword-only_ |                     |                                                                                                                                               |
+| `getter`       | `Callable`          | Defaults to `getattr`. If provided, `getter(doc, attr)` should return the `Span` objects for an individual `Doc`.                             |
+| **RETURNS**    | `Dict`              | A dictionary containing the PRF scores under the keys `{attr}_p`, `{attr}_r`, `{attr}_f` and the per-type PRF scores under `{attr}_per_type`. |

-## Scorer.score_deps {#score_deps tag="staticmethod"}
+## Scorer.score_deps {#score_deps tag="staticmethod" new="3"}

 Calculate the UAS, LAS, and LAS per type scores for dependency parses.

+> #### Example
+>
+> ```python
+> def dep_getter(token, attr):
+>     dep = getattr(token, attr)
+>     dep = token.vocab.strings.as_string(dep).lower()
+>     return dep
+>
+> scores = Scorer.score_deps(
+>     examples,
+>     "dep",
+>     getter=dep_getter,
+>     ignore_labels=("p", "punct")
+> )
+> print(scores["dep_uas"], scores["dep_las"])
+> ```
+
 | Name            | Type                | Description                                                                                                                   |
 | --------------- | ------------------- | ----------------------------------------------------------------------------------------------------------------------------- |
 | `examples`      | `Iterable[Example]` | The `Example` objects holding both the predictions and the correct gold-standard annotations.                                 |
 | `attr`          | `str`               | The attribute containing the dependency label.                                                                                |
-| `getter`        | `callable`          | Defaults to `getattr`. If provided, `getter(token, attr)` should return the value of the attribute for an individual `Token`. |
+| _keyword-only_  |                     |                                                                                                                               |
+| `getter`        | `Callable`          | Defaults to `getattr`. If provided, `getter(token, attr)` should return the value of the attribute for an individual `Token`. |
 | `head_attr`     | `str`               | The attribute containing the head token.                                                                                      |
 | `head_getter`   | `Callable`          | Defaults to `getattr`. If provided, `head_getter(token, attr)` should return the head for an individual `Token`.              |
 | `ignore_labels` | `Tuple`             | Labels to ignore while scoring (e.g., `punct`).                                                                               |
-| **RETURNS**     | `Dict`              | A dictionary containing the scores: `attr_uas`, `attr_las`, and `attr_las_per_type`.                                          |
+| **RETURNS**     | `Dict`              | A dictionary containing the scores: `{attr}_uas`, `{attr}_las`, and `{attr}_las_per_type`.                                    |

-## Scorer.score_cats {#score_cats tag="staticmethod"}
+## Scorer.score_cats {#score_cats tag="staticmethod" new="3"}

 Calculate PRF and ROC AUC scores for a doc-level attribute that is a dict
 containing scores for each label like `Doc.cats`. The reported overall score
-depends on the scorer settings.
+depends on the scorer settings:
+
+1. **all:** `{attr}_score` (one of `{attr}_f` / `{attr}_macro_f` /
+   `{attr}_macro_auc`), `{attr}_score_desc` (text description of the overall
+   score), `{attr}_f_per_type`, `{attr}_auc_per_type`
+2. **binary exclusive with positive label:** `{attr}_p`, `{attr}_r`, `{attr}_f`
+3. **3+ exclusive classes**, macro-averaged F-score: `{attr}_macro_f`
+4. **multilabel**, macro-averaged AUC: `{attr}_macro_auc`
+
+> #### Example
+>
+> ```python
+> labels = ["LABEL_A", "LABEL_B", "LABEL_C"]
+> scores = Scorer.score_cats(
+>     examples,
+>     "cats",
+>     labels=labels
+> )
+> print(scores["cats_macro_auc"])
+> ```

 | Name             | Type                | Description                                                                                             |
-| ---------------- | ------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
+| ---------------- | ------------------- | ------------------------------------------------------------------------------------------------------- |
 | `examples`       | `Iterable[Example]` | The `Example` objects holding both the predictions and the correct gold-standard annotations.           |
 | `attr`           | `str`               | The attribute to score.                                                                                 |
-| `getter`         | `callable`          | Defaults to `getattr`. If provided, `getter(doc, attr)` should return the cats for an individual `Doc`.                                                                                                                                                                                                                                                                                                                                           |
+| _keyword-only_   |                     |                                                                                                         |
+| `getter`         | `Callable`          | Defaults to `getattr`. If provided, `getter(doc, attr)` should return the cats for an individual `Doc`. |
 | `labels`         | `Iterable[str]`     | The set of possible labels. Defaults to `()`.                                                           |
 | `multi_label`    | `bool`              | Whether the attribute allows multiple labels. Defaults to `True`.                                       |
 | `positive_label` | `str`               | The positive label for a binary task with exclusive classes. Defaults to `None`.                        |
-| **RETURNS**      | `Dict`              | A dictionary containing the scores, with inapplicable scores as `None`: 1) for all: `attr_score` (one of `attr_f` / `attr_macro_f` / `attr_macro_auc`), `attr_score_desc` (text description of the overall score), `attr_f_per_type`, `attr_auc_per_type`; 2) for binary exclusive with positive label: `attr_p/r/f`; 3) for 3+ exclusive classes, macro-averaged fscore: `attr_macro_f`; 4) for multilabel, macro-averaged AUC: `attr_macro_auc` |
+| **RETURNS**      | `Dict`              | A dictionary containing the scores, with inapplicable scores as `None`.                                 |