Add micro PRF for morph scoring (#9546)

* Add micro PRF for morph scoring

For pipelines where morph features are added by more than one component
and a reference training corpus may not contain all features, a micro
PRF score is more flexible than a simple accuracy score. An example is
the reading and inflection features added by the Japanese tokenizer.

* Use `morph_micro_f` as the default morph score for Japanese
morphologizers.

* Update docstring

* Fix typo in docstring

* Update Scorer API docs

* Fix results type

* Organize score list by attribute prefix
This commit is contained in:
Adriane Boyd 2021-10-29 10:29:29 +02:00 committed by GitHub
parent 554fa414ec
commit 12974bf4d9
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
4 changed files with 42 additions and 25 deletions

View File

@ -203,7 +203,7 @@ class Japanese(Language):
"extend": True, "extend": True,
"scorer": {"@scorers": "spacy.morphologizer_scorer.v1"}, "scorer": {"@scorers": "spacy.morphologizer_scorer.v1"},
}, },
default_score_weights={"pos_acc": 0.5, "morph_acc": 0.5, "morph_per_feat": None}, default_score_weights={"pos_acc": 0.5, "morph_micro_f": 0.5, "morph_per_feat": None},
) )
def make_morphologizer( def make_morphologizer(
nlp: Language, nlp: Language,

View File

@ -247,18 +247,21 @@ class Scorer:
missing_values: Set[Any] = MISSING_VALUES, # type: ignore[assignment] missing_values: Set[Any] = MISSING_VALUES, # type: ignore[assignment]
**cfg, **cfg,
) -> Dict[str, Any]: ) -> Dict[str, Any]:
"""Return PRF scores per feat for a token attribute in UFEATS format. """Return micro PRF and PRF scores per feat for a token attribute in
UFEATS format.
examples (Iterable[Example]): Examples to score examples (Iterable[Example]): Examples to score
attr (str): The attribute to score. attr (str): The attribute to score.
getter (Callable[[Token, str], Any]): Defaults to getattr. If provided, getter (Callable[[Token, str], Any]): Defaults to getattr. If provided,
getter(token, attr) should return the value of the attribute for an getter(token, attr) should return the value of the attribute for an
individual token. individual token.
missing_values (Set[Any]): Attribute values to treat as missing annotation missing_values (Set[Any]): Attribute values to treat as missing
in the reference annotation. annotation in the reference annotation.
RETURNS (dict): A dictionary containing the per-feat PRF scores under RETURNS (dict): A dictionary containing the micro PRF scores under the
the key attr_per_feat. key attr_micro_p/r/f and the per-feat PRF scores under
attr_per_feat.
""" """
micro_score = PRFScore()
per_feat = {} per_feat = {}
for example in examples: for example in examples:
pred_doc = example.predicted pred_doc = example.predicted
@ -300,15 +303,22 @@ class Scorer:
pred_per_feat[field] = set() pred_per_feat[field] = set()
pred_per_feat[field].add((gold_i, feat)) pred_per_feat[field].add((gold_i, feat))
for field in per_feat: for field in per_feat:
micro_score.score_set(pred_per_feat.get(field, set()), gold_per_feat.get(field, set()))
per_feat[field].score_set( per_feat[field].score_set(
pred_per_feat.get(field, set()), gold_per_feat.get(field, set()) pred_per_feat.get(field, set()), gold_per_feat.get(field, set())
) )
score_key = f"{attr}_per_feat" result: Dict[str, Any] = {}
if any([len(v) for v in per_feat.values()]): if len(micro_score) > 0:
result = {k: v.to_dict() for k, v in per_feat.items()} result[f"{attr}_micro_p"] = micro_score.precision
return {score_key: result} result[f"{attr}_micro_r"] = micro_score.recall
result[f"{attr}_micro_f"] = micro_score.fscore
result[f"{attr}_per_feat"] = {k: v.to_dict() for k, v in per_feat.items()}
else: else:
return {score_key: None} result[f"{attr}_micro_p"] = None
result[f"{attr}_micro_r"] = None
result[f"{attr}_micro_f"] = None
result[f"{attr}_per_feat"] = None
return result
@staticmethod @staticmethod
def score_spans( def score_spans(

View File

@ -249,6 +249,7 @@ def test_tag_score(tagged_doc):
assert results["tag_acc"] == 1.0 assert results["tag_acc"] == 1.0
assert results["pos_acc"] == 1.0 assert results["pos_acc"] == 1.0
assert results["morph_acc"] == 1.0 assert results["morph_acc"] == 1.0
assert results["morph_micro_f"] == 1.0
assert results["morph_per_feat"]["NounType"]["f"] == 1.0 assert results["morph_per_feat"]["NounType"]["f"] == 1.0
# Gold annotation is modified # Gold annotation is modified
@ -272,6 +273,7 @@ def test_tag_score(tagged_doc):
assert results["tag_acc"] == 0.9 assert results["tag_acc"] == 0.9
assert results["pos_acc"] == 0.9 assert results["pos_acc"] == 0.9
assert results["morph_acc"] == approx(0.8) assert results["morph_acc"] == approx(0.8)
assert results["morph_micro_f"] == approx(0.8461538)
assert results["morph_per_feat"]["NounType"]["f"] == 1.0 assert results["morph_per_feat"]["NounType"]["f"] == 1.0
assert results["morph_per_feat"]["Poss"]["f"] == 0.0 assert results["morph_per_feat"]["Poss"]["f"] == 0.0
assert results["morph_per_feat"]["Number"]["f"] == approx(0.72727272) assert results["morph_per_feat"]["Number"]["f"] == approx(0.72727272)

View File

@ -41,15 +41,20 @@ Calculate the scores for a list of [`Example`](/api/example) objects using the
scoring methods provided by the components in the pipeline. scoring methods provided by the components in the pipeline.
The returned `Dict` contains the scores provided by the individual pipeline The returned `Dict` contains the scores provided by the individual pipeline
components. For the scoring methods provided by the `Scorer` and use by the core components. For the scoring methods provided by the `Scorer` and used by the
pipeline components, the individual score names start with the `Token` or `Doc` core pipeline components, the individual score names start with the `Token` or
attribute being scored: `Doc` attribute being scored:
- `token_acc`, `token_p`, `token_r`, `token_f`, - `token_acc`, `token_p`, `token_r`, `token_f`
- `sents_p`, `sents_r`, `sents_f` - `sents_p`, `sents_r`, `sents_f`
- `tag_acc`, `pos_acc`, `morph_acc`, `morph_per_feat`, `lemma_acc` - `tag_acc`
- `pos_acc`
- `morph_acc`, `morph_micro_p`, `morph_micro_r`, `morph_micro_f`,
`morph_per_feat`
- `lemma_acc`
- `dep_uas`, `dep_las`, `dep_las_per_type` - `dep_uas`, `dep_las`, `dep_las_per_type`
- `ents_p`, `ents_r`, `ents_f`, `ents_per_type`
- `spans_sc_p`, `spans_sc_r`, `spans_sc_f`
- `cats_score` (depends on config, description provided in `cats_score_desc`), - `cats_score` (depends on config, description provided in `cats_score_desc`),
`cats_micro_p`, `cats_micro_r`, `cats_micro_f`, `cats_macro_p`, `cats_micro_p`, `cats_micro_r`, `cats_micro_f`, `cats_macro_p`,
`cats_macro_r`, `cats_macro_f`, `cats_macro_auc`, `cats_f_per_type`, `cats_macro_r`, `cats_macro_f`, `cats_macro_auc`, `cats_f_per_type`,
@ -84,7 +89,7 @@ Docs with `has_unknown_spaces` are skipped during scoring.
> ``` > ```
| Name | Description | | Name | Description |
| ----------- | ------------------------------------------------------------------------------------------------------------------- | | ----------- | ------------------------------------------------------------------------------------------------------------------- | ------------------------------------------------------------------------------------------------------ |
| `examples` | The `Example` objects holding both the predictions and the correct gold-standard annotations. ~~Iterable[Example]~~ | | `examples` | The `Example` objects holding both the predictions and the correct gold-standard annotations. ~~Iterable[Example]~~ |
| **RETURNS** | `Dict` | A dictionary containing the scores `token_acc`, `token_p`, `token_r`, `token_f`. ~~Dict[str, float]~~ | | **RETURNS** | `Dict` | A dictionary containing the scores `token_acc`, `token_p`, `token_r`, `token_f`. ~~Dict[str, float]~~ |
@ -125,13 +130,13 @@ scoring.
> ``` > ```
| Name | Description | | Name | Description |
| ---------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------- | | ---------------- | ----------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
| `examples` | The `Example` objects holding both the predictions and the correct gold-standard annotations. ~~Iterable[Example]~~ | | `examples` | The `Example` objects holding both the predictions and the correct gold-standard annotations. ~~Iterable[Example]~~ |
| `attr` | The attribute to score. ~~str~~ | | `attr` | The attribute to score. ~~str~~ |
| _keyword-only_ | | | _keyword-only_ | |
| `getter` | Defaults to `getattr`. If provided, `getter(token, attr)` should return the value of the attribute for an individual `Token`. ~~Callable[[Token, str], Any]~~ | | `getter` | Defaults to `getattr`. If provided, `getter(token, attr)` should return the value of the attribute for an individual `Token`. ~~Callable[[Token, str], Any]~~ |
| `missing_values` | Attribute values to treat as missing annotation in the reference annotation. Defaults to `{0, None, ""}`. ~~Set[Any]~~ | | `missing_values` | Attribute values to treat as missing annotation in the reference annotation. Defaults to `{0, None, ""}`. ~~Set[Any]~~ |
| **RETURNS** | A dictionary containing the per-feature PRF scores under the key `{attr}_per_feat`. ~~Dict[str, Dict[str, float]]~~ | | **RETURNS** | A dictionary containing the micro PRF scores under the keys `{attr}_micro_p/r/f` and the per-feature PRF scores under `{attr}_per_feat`. ~~Dict[str, Dict[str, float]]~~ |
## Scorer.score_spans {#score_spans tag="staticmethod" new="3"} ## Scorer.score_spans {#score_spans tag="staticmethod" new="3"}