From 12974bf4d975a8789f86ba91c7cefc31518ed9e4 Mon Sep 17 00:00:00 2001 From: Adriane Boyd Date: Fri, 29 Oct 2021 10:29:29 +0200 Subject: [PATCH] Add micro PRF for morph scoring (#9546) * Add micro PRF for morph scoring For pipelines where morph features are added by more than one component and a reference training corpus may not contain all features, a micro PRF score is more flexible than a simple accuracy score. An example is the reading and inflection features added by the Japanese tokenizer. * Use `morph_micro_f` as the default morph score for Japanese morphologizers. * Update docstring * Fix typo in docstring * Update Scorer API docs * Fix results type * Organize score list by attribute prefix --- spacy/lang/ja/__init__.py | 2 +- spacy/scorer.py | 30 ++++++++++++++++++++---------- spacy/tests/test_scorer.py | 2 ++ website/docs/api/scorer.md | 33 +++++++++++++++++++-------------- 4 files changed, 42 insertions(+), 25 deletions(-) diff --git a/spacy/lang/ja/__init__.py b/spacy/lang/ja/__init__.py index 127c4c8ac..81ff5b5b8 100644 --- a/spacy/lang/ja/__init__.py +++ b/spacy/lang/ja/__init__.py @@ -203,7 +203,7 @@ class Japanese(Language): "extend": True, "scorer": {"@scorers": "spacy.morphologizer_scorer.v1"}, }, - default_score_weights={"pos_acc": 0.5, "morph_acc": 0.5, "morph_per_feat": None}, + default_score_weights={"pos_acc": 0.5, "morph_micro_f": 0.5, "morph_per_feat": None}, ) def make_morphologizer( nlp: Language, diff --git a/spacy/scorer.py b/spacy/scorer.py index 49d51a4b3..75e5b3317 100644 --- a/spacy/scorer.py +++ b/spacy/scorer.py @@ -247,18 +247,21 @@ class Scorer: missing_values: Set[Any] = MISSING_VALUES, # type: ignore[assignment] **cfg, ) -> Dict[str, Any]: - """Return PRF scores per feat for a token attribute in UFEATS format. + """Return micro PRF and PRF scores per feat for a token attribute in + UFEATS format. examples (Iterable[Example]): Examples to score attr (str): The attribute to score. 
getter (Callable[[Token, str], Any]): Defaults to getattr. If provided, getter(token, attr) should return the value of the attribute for an individual token. - missing_values (Set[Any]): Attribute values to treat as missing annotation - in the reference annotation. - RETURNS (dict): A dictionary containing the per-feat PRF scores under - the key attr_per_feat. + missing_values (Set[Any]): Attribute values to treat as missing + annotation in the reference annotation. + RETURNS (dict): A dictionary containing the micro PRF scores under the + key attr_micro_p/r/f and the per-feat PRF scores under + attr_per_feat. """ + micro_score = PRFScore() per_feat = {} for example in examples: pred_doc = example.predicted @@ -300,15 +303,22 @@ class Scorer: pred_per_feat[field] = set() pred_per_feat[field].add((gold_i, feat)) for field in per_feat: + micro_score.score_set(pred_per_feat.get(field, set()), gold_per_feat.get(field, set())) per_feat[field].score_set( pred_per_feat.get(field, set()), gold_per_feat.get(field, set()) ) - score_key = f"{attr}_per_feat" - if any([len(v) for v in per_feat.values()]): - result = {k: v.to_dict() for k, v in per_feat.items()} - return {score_key: result} + result: Dict[str, Any] = {} + if len(micro_score) > 0: + result[f"{attr}_micro_p"] = micro_score.precision + result[f"{attr}_micro_r"] = micro_score.recall + result[f"{attr}_micro_f"] = micro_score.fscore + result[f"{attr}_per_feat"] = {k: v.to_dict() for k, v in per_feat.items()} else: - return {score_key: None} + result[f"{attr}_micro_p"] = None + result[f"{attr}_micro_r"] = None + result[f"{attr}_micro_f"] = None + result[f"{attr}_per_feat"] = None + return result @staticmethod def score_spans( diff --git a/spacy/tests/test_scorer.py b/spacy/tests/test_scorer.py index 16cc97f6d..6e15fa2de 100644 --- a/spacy/tests/test_scorer.py +++ b/spacy/tests/test_scorer.py @@ -249,6 +249,7 @@ def test_tag_score(tagged_doc): assert results["tag_acc"] == 1.0 assert results["pos_acc"] == 1.0 assert 
results["morph_acc"] == 1.0 + assert results["morph_micro_f"] == 1.0 assert results["morph_per_feat"]["NounType"]["f"] == 1.0 # Gold annotation is modified @@ -272,6 +273,7 @@ def test_tag_score(tagged_doc): assert results["tag_acc"] == 0.9 assert results["pos_acc"] == 0.9 assert results["morph_acc"] == approx(0.8) + assert results["morph_micro_f"] == approx(0.8461538) assert results["morph_per_feat"]["NounType"]["f"] == 1.0 assert results["morph_per_feat"]["Poss"]["f"] == 0.0 assert results["morph_per_feat"]["Number"]["f"] == approx(0.72727272) diff --git a/website/docs/api/scorer.md b/website/docs/api/scorer.md index da7da5f82..8dbe3b276 100644 --- a/website/docs/api/scorer.md +++ b/website/docs/api/scorer.md @@ -41,15 +41,20 @@ Calculate the scores for a list of [`Example`](/api/example) objects using the scoring methods provided by the components in the pipeline. The returned `Dict` contains the scores provided by the individual pipeline -components. For the scoring methods provided by the `Scorer` and use by the core -pipeline components, the individual score names start with the `Token` or `Doc` -attribute being scored: +components. 
For the scoring methods provided by the `Scorer` and used by the +core pipeline components, the individual score names start with the `Token` or +`Doc` attribute being scored: -- `token_acc`, `token_p`, `token_r`, `token_f`, +- `token_acc`, `token_p`, `token_r`, `token_f` - `sents_p`, `sents_r`, `sents_f` -- `tag_acc`, `pos_acc`, `morph_acc`, `morph_per_feat`, `lemma_acc` +- `tag_acc` +- `pos_acc` +- `morph_acc`, `morph_micro_p`, `morph_micro_r`, `morph_micro_f`, + `morph_per_feat` +- `lemma_acc` - `dep_uas`, `dep_las`, `dep_las_per_type` - `ents_p`, `ents_r` `ents_f`, `ents_per_type` +- `spans_sc_p`, `spans_sc_r`, `spans_sc_f` - `cats_score` (depends on config, description provided in `cats_score_desc`), `cats_micro_p`, `cats_micro_r`, `cats_micro_f`, `cats_macro_p`, `cats_macro_r`, `cats_macro_f`, `cats_macro_auc`, `cats_f_per_type`, @@ -84,7 +89,7 @@ Docs with `has_unknown_spaces` are skipped during scoring. > ``` | Name | Description | -| ----------- | ------------------------------------------------------------------------------------------------------------------- | +| ----------- | ------------------------------------------------------------------------------------------------------------------- | ------------------------------------------------------------------------------------------------------ | | `examples` | The `Example` objects holding both the predictions and the correct gold-standard annotations. ~~Iterable[Example]~~ | | **RETURNS** | `Dict` | A dictionary containing the scores `token_acc`, `token_p`, `token_r`, `token_f`. ~~Dict[str, float]]~~ | @@ -124,14 +129,14 @@ scoring. > print(scores["morph_per_feat"]) > ``` -| Name | Description | -| ---------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------- | -| `examples` | The `Example` objects holding both the predictions and the correct gold-standard annotations. 
~~Iterable[Example]~~ | -| `attr` | The attribute to score. ~~str~~ | -| _keyword-only_ | | -| `getter` | Defaults to `getattr`. If provided, `getter(token, attr)` should return the value of the attribute for an individual `Token`. ~~Callable[[Token, str], Any]~~ | -| `missing_values` | Attribute values to treat as missing annotation in the reference annotation. Defaults to `{0, None, ""}`. ~~Set[Any]~~ | -| **RETURNS** | A dictionary containing the per-feature PRF scores under the key `{attr}_per_feat`. ~~Dict[str, Dict[str, float]]~~ | +| Name | Description | +| ---------------- | ----------------------------------------------------------------------------------------------------------------------------------------------------------------------- | +| `examples` | The `Example` objects holding both the predictions and the correct gold-standard annotations. ~~Iterable[Example]~~ | +| `attr` | The attribute to score. ~~str~~ | +| _keyword-only_ | | +| `getter` | Defaults to `getattr`. If provided, `getter(token, attr)` should return the value of the attribute for an individual `Token`. ~~Callable[[Token, str], Any]~~ | +| `missing_values` | Attribute values to treat as missing annotation in the reference annotation. Defaults to `{0, None, ""}`. ~~Set[Any]~~ | +| **RETURNS** | A dictionary containing the micro PRF scores under the key `{attr}_micro_p/r/f` and the per-feature PRF scores under `{attr}_per_feat`. ~~Dict[str, Any]~~ | ## Scorer.score_spans {#score_spans tag="staticmethod" new="3"}