From 12974bf4d975a8789f86ba91c7cefc31518ed9e4 Mon Sep 17 00:00:00 2001
From: Adriane Boyd <adrianeboyd@gmail.com>
Date: Fri, 29 Oct 2021 10:29:29 +0200
Subject: [PATCH] Add micro PRF for morph scoring (#9546)

* Add micro PRF for morph scoring

For pipelines where morph features are added by more than one component
and a reference training corpus may not contain all features, a micro
PRF score is more flexible than a simple accuracy score. An example is
the reading and inflection features added by the Japanese tokenizer.

* Use `morph_micro_f` as the default morph score for Japanese
morphologizers.

* Update docstring

* Fix typo in docstring

* Update Scorer API docs

* Fix results type

* Organize score list by attribute prefix
---
 spacy/lang/ja/__init__.py  |  2 +-
 spacy/scorer.py            | 30 ++++++++++++++++++++----------
 spacy/tests/test_scorer.py |  2 ++
 website/docs/api/scorer.md | 33 +++++++++++++++++++--------------
 4 files changed, 42 insertions(+), 25 deletions(-)

diff --git a/spacy/lang/ja/__init__.py b/spacy/lang/ja/__init__.py
index 127c4c8ac..81ff5b5b8 100644
--- a/spacy/lang/ja/__init__.py
+++ b/spacy/lang/ja/__init__.py
@@ -203,7 +203,7 @@ class Japanese(Language):
         "extend": True,
         "scorer": {"@scorers": "spacy.morphologizer_scorer.v1"},
     },
-    default_score_weights={"pos_acc": 0.5, "morph_acc": 0.5, "morph_per_feat": None},
+    default_score_weights={"pos_acc": 0.5, "morph_micro_f": 0.5, "morph_per_feat": None},
 )
 def make_morphologizer(
     nlp: Language,
diff --git a/spacy/scorer.py b/spacy/scorer.py
index 49d51a4b3..75e5b3317 100644
--- a/spacy/scorer.py
+++ b/spacy/scorer.py
@@ -247,18 +247,21 @@ class Scorer:
         missing_values: Set[Any] = MISSING_VALUES,  # type: ignore[assignment]
         **cfg,
     ) -> Dict[str, Any]:
-        """Return PRF scores per feat for a token attribute in UFEATS format.
+        """Return micro PRF and PRF scores per feat for a token attribute in
+        UFEATS format.
 
         examples (Iterable[Example]): Examples to score
         attr (str): The attribute to score.
         getter (Callable[[Token, str], Any]): Defaults to getattr. If provided,
             getter(token, attr) should return the value of the attribute for an
             individual token.
-        missing_values (Set[Any]): Attribute values to treat as missing annotation
-            in the reference annotation.
-        RETURNS (dict): A dictionary containing the per-feat PRF scores under
-            the key attr_per_feat.
+        missing_values (Set[Any]): Attribute values to treat as missing
+            annotation in the reference annotation.
+        RETURNS (dict): A dictionary containing the micro PRF scores under the
+            key attr_micro_p/r/f and the per-feat PRF scores under
+            attr_per_feat.
         """
+        micro_score = PRFScore()
         per_feat = {}
         for example in examples:
             pred_doc = example.predicted
@@ -300,15 +303,22 @@ class Scorer:
                                     pred_per_feat[field] = set()
                                 pred_per_feat[field].add((gold_i, feat))
             for field in per_feat:
+                micro_score.score_set(pred_per_feat.get(field, set()), gold_per_feat.get(field, set()))
                 per_feat[field].score_set(
                     pred_per_feat.get(field, set()), gold_per_feat.get(field, set())
                 )
-        score_key = f"{attr}_per_feat"
-        if any([len(v) for v in per_feat.values()]):
-            result = {k: v.to_dict() for k, v in per_feat.items()}
-            return {score_key: result}
+        result: Dict[str, Any] = {}
+        if len(micro_score) > 0:
+            result[f"{attr}_micro_p"] = micro_score.precision
+            result[f"{attr}_micro_r"] = micro_score.recall
+            result[f"{attr}_micro_f"] = micro_score.fscore
+            result[f"{attr}_per_feat"] = {k: v.to_dict() for k, v in per_feat.items()}
         else:
-            return {score_key: None}
+            result[f"{attr}_micro_p"] = None
+            result[f"{attr}_micro_r"] = None
+            result[f"{attr}_micro_f"] = None
+            result[f"{attr}_per_feat"] = None
+        return result
 
     @staticmethod
     def score_spans(
diff --git a/spacy/tests/test_scorer.py b/spacy/tests/test_scorer.py
index 16cc97f6d..6e15fa2de 100644
--- a/spacy/tests/test_scorer.py
+++ b/spacy/tests/test_scorer.py
@@ -249,6 +249,7 @@ def test_tag_score(tagged_doc):
     assert results["tag_acc"] == 1.0
     assert results["pos_acc"] == 1.0
     assert results["morph_acc"] == 1.0
+    assert results["morph_micro_f"] == 1.0
     assert results["morph_per_feat"]["NounType"]["f"] == 1.0
 
     # Gold annotation is modified
@@ -272,6 +273,7 @@ def test_tag_score(tagged_doc):
     assert results["tag_acc"] == 0.9
     assert results["pos_acc"] == 0.9
     assert results["morph_acc"] == approx(0.8)
+    assert results["morph_micro_f"] == approx(0.8461538)
     assert results["morph_per_feat"]["NounType"]["f"] == 1.0
     assert results["morph_per_feat"]["Poss"]["f"] == 0.0
     assert results["morph_per_feat"]["Number"]["f"] == approx(0.72727272)
diff --git a/website/docs/api/scorer.md b/website/docs/api/scorer.md
index da7da5f82..8dbe3b276 100644
--- a/website/docs/api/scorer.md
+++ b/website/docs/api/scorer.md
@@ -41,15 +41,20 @@ Calculate the scores for a list of [`Example`](/api/example) objects using the
 scoring methods provided by the components in the pipeline.
 
 The returned `Dict` contains the scores provided by the individual pipeline
-components. For the scoring methods provided by the `Scorer` and use by the core
-pipeline components, the individual score names start with the `Token` or `Doc`
-attribute being scored:
+components. For the scoring methods provided by the `Scorer` and used by the
+core pipeline components, the individual score names start with the `Token` or
+`Doc` attribute being scored:
 
-- `token_acc`, `token_p`, `token_r`, `token_f`,
+- `token_acc`, `token_p`, `token_r`, `token_f`
 - `sents_p`, `sents_r`, `sents_f`
-- `tag_acc`, `pos_acc`, `morph_acc`, `morph_per_feat`, `lemma_acc`
+- `tag_acc`
+- `pos_acc`
+- `morph_acc`, `morph_micro_p`, `morph_micro_r`, `morph_micro_f`,
+  `morph_per_feat`
+- `lemma_acc`
 - `dep_uas`, `dep_las`, `dep_las_per_type`
 - `ents_p`, `ents_r` `ents_f`, `ents_per_type`
+- `spans_sc_p`, `spans_sc_r`, `spans_sc_f`
 - `cats_score` (depends on config, description provided in `cats_score_desc`),
   `cats_micro_p`, `cats_micro_r`, `cats_micro_f`, `cats_macro_p`,
   `cats_macro_r`, `cats_macro_f`, `cats_macro_auc`, `cats_f_per_type`,
@@ -84,7 +89,7 @@ Docs with `has_unknown_spaces` are skipped during scoring.
 > ```
 
 | Name        | Description                                                                                                         |
 | ----------- | ------------------------------------------------------------------------------------------------------------------- |
 | `examples`  | The `Example` objects holding both the predictions and the correct gold-standard annotations. ~~Iterable[Example]~~ |
-| **RETURNS** | `Dict`                                                                                                              | A dictionary containing the scores `token_acc`, `token_p`, `token_r`, `token_f`. ~~Dict[str, float]]~~ |
+| **RETURNS** | A dictionary containing the scores `token_acc`, `token_p`, `token_r`, `token_f`. ~~Dict[str, float]~~               |
 
@@ -124,14 +129,14 @@ scoring.
 > print(scores["morph_per_feat"])
 > ```
 
-| Name             | Description                                                                                                                                                   |
-| ---------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------- |
-| `examples`       | The `Example` objects holding both the predictions and the correct gold-standard annotations. ~~Iterable[Example]~~                                           |
-| `attr`           | The attribute to score. ~~str~~                                                                                                                               |
-| _keyword-only_   |                                                                                                                                                               |
-| `getter`         | Defaults to `getattr`. If provided, `getter(token, attr)` should return the value of the attribute for an individual `Token`. ~~Callable[[Token, str], Any]~~ |
-| `missing_values` | Attribute values to treat as missing annotation in the reference annotation. Defaults to `{0, None, ""}`. ~~Set[Any]~~                                        |
-| **RETURNS**      | A dictionary containing the per-feature PRF scores under the key `{attr}_per_feat`. ~~Dict[str, Dict[str, float]]~~                                           |
+| Name             | Description                                                                                                                                                             |
+| ---------------- | ----------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
+| `examples`       | The `Example` objects holding both the predictions and the correct gold-standard annotations. ~~Iterable[Example]~~                                                     |
+| `attr`           | The attribute to score. ~~str~~                                                                                                                                         |
+| _keyword-only_   |                                                                                                                                                                         |
+| `getter`         | Defaults to `getattr`. If provided, `getter(token, attr)` should return the value of the attribute for an individual `Token`. ~~Callable[[Token, str], Any]~~           |
+| `missing_values` | Attribute values to treat as missing annotation in the reference annotation. Defaults to `{0, None, ""}`. ~~Set[Any]~~                                                  |
+| **RETURNS**      | A dictionary containing the micro PRF scores under the key `{attr}_micro_p/r/f` and the per-feature PRF scores under `{attr}_per_feat`. ~~Dict[str, Any]~~              |
 
 ## Scorer.score_spans {#score_spans tag="staticmethod" new="3"}