mirror of
https://github.com/explosion/spaCy.git
synced 2024-12-24 17:06:29 +03:00
Add scorer option to return per-component scores (#12540)
* Add scorer option to return per-component scores Add `per_component` option to `Language.evaluate` and `Scorer.score` to return scores keyed by `tokenizer` (hard-coded) or by component name. Add option to `evaluate` CLI to score by component. Per-component scores can only be saved to JSON. * Update help text and messages
This commit is contained in:
parent
88680a6eed
commit
3637148c4d
|
@ -27,6 +27,7 @@ def evaluate_cli(
|
|||
gold_preproc: bool = Opt(False, "--gold-preproc", "-G", help="Use gold preprocessing"),
|
||||
displacy_path: Optional[Path] = Opt(None, "--displacy-path", "-dp", help="Directory to output rendered parses as HTML", exists=True, file_okay=False),
|
||||
displacy_limit: int = Opt(25, "--displacy-limit", "-dl", help="Limit of parses to render as HTML"),
|
||||
per_component: bool = Opt(False, "--per-component", "-P", help="Return scores per component, only applicable when an output JSON file is specified."),
|
||||
# fmt: on
|
||||
):
|
||||
"""
|
||||
|
@ -50,6 +51,7 @@ def evaluate_cli(
|
|||
gold_preproc=gold_preproc,
|
||||
displacy_path=displacy_path,
|
||||
displacy_limit=displacy_limit,
|
||||
per_component=per_component,
|
||||
silent=False,
|
||||
)
|
||||
|
||||
|
@ -64,6 +66,7 @@ def evaluate(
|
|||
displacy_limit: int = 25,
|
||||
silent: bool = True,
|
||||
spans_key: str = "sc",
|
||||
per_component: bool = False,
|
||||
) -> Dict[str, Any]:
|
||||
msg = Printer(no_print=silent, pretty=not silent)
|
||||
fix_random_seed()
|
||||
|
@ -78,44 +81,53 @@ def evaluate(
|
|||
corpus = Corpus(data_path, gold_preproc=gold_preproc)
|
||||
nlp = util.load_model(model)
|
||||
dev_dataset = list(corpus(nlp))
|
||||
scores = nlp.evaluate(dev_dataset)
|
||||
metrics = {
|
||||
"TOK": "token_acc",
|
||||
"TAG": "tag_acc",
|
||||
"POS": "pos_acc",
|
||||
"MORPH": "morph_acc",
|
||||
"LEMMA": "lemma_acc",
|
||||
"UAS": "dep_uas",
|
||||
"LAS": "dep_las",
|
||||
"NER P": "ents_p",
|
||||
"NER R": "ents_r",
|
||||
"NER F": "ents_f",
|
||||
"TEXTCAT": "cats_score",
|
||||
"SENT P": "sents_p",
|
||||
"SENT R": "sents_r",
|
||||
"SENT F": "sents_f",
|
||||
"SPAN P": f"spans_{spans_key}_p",
|
||||
"SPAN R": f"spans_{spans_key}_r",
|
||||
"SPAN F": f"spans_{spans_key}_f",
|
||||
"SPEED": "speed",
|
||||
}
|
||||
results = {}
|
||||
data = {}
|
||||
for metric, key in metrics.items():
|
||||
if key in scores:
|
||||
if key == "cats_score":
|
||||
metric = metric + " (" + scores.get("cats_score_desc", "unk") + ")"
|
||||
if isinstance(scores[key], (int, float)):
|
||||
if key == "speed":
|
||||
results[metric] = f"{scores[key]:.0f}"
|
||||
scores = nlp.evaluate(dev_dataset, per_component=per_component)
|
||||
if per_component:
|
||||
data = scores
|
||||
if output is None:
|
||||
msg.warn(
|
||||
"The per-component option is enabled but there is no output JSON file provided to save the scores to."
|
||||
)
|
||||
else:
|
||||
msg.info("Per-component scores will be saved to output JSON file.")
|
||||
else:
|
||||
metrics = {
|
||||
"TOK": "token_acc",
|
||||
"TAG": "tag_acc",
|
||||
"POS": "pos_acc",
|
||||
"MORPH": "morph_acc",
|
||||
"LEMMA": "lemma_acc",
|
||||
"UAS": "dep_uas",
|
||||
"LAS": "dep_las",
|
||||
"NER P": "ents_p",
|
||||
"NER R": "ents_r",
|
||||
"NER F": "ents_f",
|
||||
"TEXTCAT": "cats_score",
|
||||
"SENT P": "sents_p",
|
||||
"SENT R": "sents_r",
|
||||
"SENT F": "sents_f",
|
||||
"SPAN P": f"spans_{spans_key}_p",
|
||||
"SPAN R": f"spans_{spans_key}_r",
|
||||
"SPAN F": f"spans_{spans_key}_f",
|
||||
"SPEED": "speed",
|
||||
}
|
||||
results = {}
|
||||
data = {}
|
||||
for metric, key in metrics.items():
|
||||
if key in scores:
|
||||
if key == "cats_score":
|
||||
metric = metric + " (" + scores.get("cats_score_desc", "unk") + ")"
|
||||
if isinstance(scores[key], (int, float)):
|
||||
if key == "speed":
|
||||
results[metric] = f"{scores[key]:.0f}"
|
||||
else:
|
||||
results[metric] = f"{scores[key]*100:.2f}"
|
||||
else:
|
||||
results[metric] = f"{scores[key]*100:.2f}"
|
||||
else:
|
||||
results[metric] = "-"
|
||||
data[re.sub(r"[\s/]", "_", key.lower())] = scores[key]
|
||||
results[metric] = "-"
|
||||
data[re.sub(r"[\s/]", "_", key.lower())] = scores[key]
|
||||
|
||||
msg.table(results, title="Results")
|
||||
data = handle_scores_per_type(scores, data, spans_key=spans_key, silent=silent)
|
||||
msg.table(results, title="Results")
|
||||
data = handle_scores_per_type(scores, data, spans_key=spans_key, silent=silent)
|
||||
|
||||
if displacy_path:
|
||||
factory_names = [nlp.get_pipe_meta(pipe).factory for pipe in nlp.pipe_names]
|
||||
|
|
|
@ -1372,6 +1372,7 @@ class Language:
|
|||
scorer: Optional[Scorer] = None,
|
||||
component_cfg: Optional[Dict[str, Dict[str, Any]]] = None,
|
||||
scorer_cfg: Optional[Dict[str, Any]] = None,
|
||||
per_component: bool = False,
|
||||
) -> Dict[str, Any]:
|
||||
"""Evaluate a model's pipeline components.
|
||||
|
||||
|
@ -1383,6 +1384,8 @@ class Language:
|
|||
arguments for specific components.
|
||||
scorer_cfg (dict): An optional dictionary with extra keyword arguments
|
||||
for the scorer.
|
||||
per_component (bool): Whether to return the scores keyed by component
|
||||
name. Defaults to False.
|
||||
|
||||
RETURNS (Dict[str, Any]): A dictionary of evaluation scores.
|
||||
|
||||
|
@ -1415,7 +1418,7 @@ class Language:
|
|||
for eg, doc in zip(examples, docs):
|
||||
eg.predicted = doc
|
||||
end_time = timer()
|
||||
results = scorer.score(examples)
|
||||
results = scorer.score(examples, per_component=per_component)
|
||||
n_words = sum(len(eg.predicted) for eg in examples)
|
||||
results["speed"] = n_words / (end_time - start_time)
|
||||
return results
|
||||
|
|
|
@ -121,20 +121,30 @@ class Scorer:
|
|||
nlp.add_pipe(pipe)
|
||||
self.nlp = nlp
|
||||
|
||||
def score(self, examples: Iterable[Example]) -> Dict[str, Any]:
|
||||
def score(
|
||||
self, examples: Iterable[Example], *, per_component: bool = False
|
||||
) -> Dict[str, Any]:
|
||||
"""Evaluate a list of Examples.
|
||||
|
||||
examples (Iterable[Example]): The predicted annotations + correct annotations.
|
||||
per_component (bool): Whether to return the scores keyed by component
|
||||
name. Defaults to False.
|
||||
RETURNS (Dict): A dictionary of scores.
|
||||
|
||||
DOCS: https://spacy.io/api/scorer#score
|
||||
"""
|
||||
scores = {}
|
||||
if hasattr(self.nlp.tokenizer, "score"):
|
||||
scores.update(self.nlp.tokenizer.score(examples, **self.cfg)) # type: ignore
|
||||
if per_component:
|
||||
scores["tokenizer"] = self.nlp.tokenizer.score(examples, **self.cfg)
|
||||
else:
|
||||
scores.update(self.nlp.tokenizer.score(examples, **self.cfg)) # type: ignore
|
||||
for name, component in self.nlp.pipeline:
|
||||
if hasattr(component, "score"):
|
||||
scores.update(component.score(examples, **self.cfg))
|
||||
if per_component:
|
||||
scores[name] = component.score(examples, **self.cfg)
|
||||
else:
|
||||
scores.update(component.score(examples, **self.cfg))
|
||||
return scores
|
||||
|
||||
@staticmethod
|
||||
|
|
|
@ -115,6 +115,14 @@ def test_tokenization(sented_doc):
|
|||
assert scores["token_r"] == approx(0.33333333)
|
||||
assert scores["token_f"] == 0.4
|
||||
|
||||
# per-component scoring
|
||||
scorer = Scorer()
|
||||
scores = scorer.score([example], per_component=True)
|
||||
assert scores["tokenizer"]["token_acc"] == 0.5
|
||||
assert scores["tokenizer"]["token_p"] == 0.5
|
||||
assert scores["tokenizer"]["token_r"] == approx(0.33333333)
|
||||
assert scores["tokenizer"]["token_f"] == 0.4
|
||||
|
||||
|
||||
def test_sents(sented_doc):
|
||||
scorer = Scorer()
|
||||
|
@ -278,6 +286,13 @@ def test_tag_score(tagged_doc):
|
|||
assert results["morph_per_feat"]["Poss"]["f"] == 0.0
|
||||
assert results["morph_per_feat"]["Number"]["f"] == approx(0.72727272)
|
||||
|
||||
# per-component scoring
|
||||
scorer = Scorer()
|
||||
results = scorer.score([example], per_component=True)
|
||||
assert results["tagger"]["tag_acc"] == 0.9
|
||||
assert results["morphologizer"]["pos_acc"] == 0.9
|
||||
assert results["morphologizer"]["morph_acc"] == approx(0.8)
|
||||
|
||||
|
||||
def test_partial_annotation(en_tokenizer):
|
||||
pred_doc = en_tokenizer("a b c d e")
|
||||
|
|
|
@ -1163,18 +1163,19 @@ skew. To render a sample of dependency parses in a HTML file using the
|
|||
$ python -m spacy benchmark accuracy [model] [data_path] [--output] [--code] [--gold-preproc] [--gpu-id] [--displacy-path] [--displacy-limit] [--per-component]
|
||||
```
|
||||
|
||||
| Name | Description |
|
||||
| ----------------------------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ |
|
||||
| `model` | Pipeline to evaluate. Can be a package or a path to a data directory. ~~str (positional)~~ |
|
||||
| `data_path` | Location of evaluation data in spaCy's [binary format](/api/data-formats#training). ~~Path (positional)~~ |
|
||||
| `--output`, `-o` | Output JSON file for metrics. If not set, no metrics will be exported. ~~Optional[Path] \(option)~~ |
|
||||
| `--code`, `-c` <Tag variant="new">3</Tag> | Path to Python file with additional code to be imported. Allows [registering custom functions](/usage/training#custom-functions) for new architectures. ~~Optional[Path] \(option)~~ |
|
||||
| `--gold-preproc`, `-G` | Use gold preprocessing. ~~bool (flag)~~ |
|
||||
| `--gpu-id`, `-g` | GPU to use, if any. Defaults to `-1` for CPU. ~~int (option)~~ |
|
||||
| `--displacy-path`, `-dp` | Directory to output rendered parses as HTML. If not set, no visualizations will be generated. ~~Optional[Path] \(option)~~ |
|
||||
| `--displacy-limit`, `-dl` | Number of parses to generate per file. Defaults to `25`. Keep in mind that a significantly higher number might cause the `.html` files to render slowly. ~~int (option)~~ |
|
||||
| `--help`, `-h` | Show help message and available arguments. ~~bool (flag)~~ |
|
||||
| **CREATES** | Training results and optional metrics and visualizations. |
|
||||
| Name | Description |
|
||||
| ---------------------------------------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ |
|
||||
| `model` | Pipeline to evaluate. Can be a package or a path to a data directory. ~~str (positional)~~ |
|
||||
| `data_path` | Location of evaluation data in spaCy's [binary format](/api/data-formats#training). ~~Path (positional)~~ |
|
||||
| `--output`, `-o` | Output JSON file for metrics. If not set, no metrics will be exported. ~~Optional[Path] \(option)~~ |
|
||||
| `--code`, `-c` <Tag variant="new">3</Tag> | Path to Python file with additional code to be imported. Allows [registering custom functions](/usage/training#custom-functions) for new architectures. ~~Optional[Path] \(option)~~ |
|
||||
| `--gold-preproc`, `-G` | Use gold preprocessing. ~~bool (flag)~~ |
|
||||
| `--gpu-id`, `-g` | GPU to use, if any. Defaults to `-1` for CPU. ~~int (option)~~ |
|
||||
| `--displacy-path`, `-dp` | Directory to output rendered parses as HTML. If not set, no visualizations will be generated. ~~Optional[Path] \(option)~~ |
|
||||
| `--displacy-limit`, `-dl` | Number of parses to generate per file. Defaults to `25`. Keep in mind that a significantly higher number might cause the `.html` files to render slowly. ~~int (option)~~ |
|
||||
| `--per-component`, `-P` <Tag variant="new">3.6</Tag> | Whether to return the scores keyed by component name. Defaults to `False`. ~~bool (flag)~~ |
|
||||
| `--help`, `-h` | Show help message and available arguments. ~~bool (flag)~~ |
|
||||
| **CREATES** | Training results and optional metrics and visualizations. |
|
||||
|
||||
### speed {id="benchmark-speed", version="3.5", tag="command"}
|
||||
|
||||
|
|
|
@ -382,15 +382,16 @@ objects instead of tuples of `Doc` and `GoldParse` objects.
|
|||
> print(scores)
|
||||
> ```
|
||||
|
||||
| Name | Description |
|
||||
| --------------- | ---------------------------------------------------------------------------------------------------------------------------------------------- |
|
||||
| `examples` | A batch of [`Example`](/api/example) objects to learn from. ~~Iterable[Example]~~ |
|
||||
| _keyword-only_ | |
|
||||
| `batch_size` | The batch size to use. ~~Optional[int]~~ |
|
||||
| `scorer` | Optional [`Scorer`](/api/scorer) to use. If not passed in, a new one will be created. ~~Optional[Scorer]~~ |
|
||||
| `component_cfg` | Optional dictionary of keyword arguments for components, keyed by component names. Defaults to `None`. ~~Optional[Dict[str, Dict[str, Any]]]~~ |
|
||||
| `scorer_cfg` | Optional dictionary of keyword arguments for the `Scorer`. Defaults to `None`. ~~Optional[Dict[str, Any]]~~ |
|
||||
| **RETURNS** | A dictionary of evaluation scores. ~~Dict[str, Union[float, Dict[str, float]]]~~ |
|
||||
| Name | Description |
|
||||
| -------------------------------------------- | ---------------------------------------------------------------------------------------------------------------------------------------------- |
|
||||
| `examples` | A batch of [`Example`](/api/example) objects to learn from. ~~Iterable[Example]~~ |
|
||||
| _keyword-only_ | |
|
||||
| `batch_size` | The batch size to use. ~~Optional[int]~~ |
|
||||
| `scorer` | Optional [`Scorer`](/api/scorer) to use. If not passed in, a new one will be created. ~~Optional[Scorer]~~ |
|
||||
| `component_cfg` | Optional dictionary of keyword arguments for components, keyed by component names. Defaults to `None`. ~~Optional[Dict[str, Dict[str, Any]]]~~ |
|
||||
| `scorer_cfg` | Optional dictionary of keyword arguments for the `Scorer`. Defaults to `None`. ~~Optional[Dict[str, Any]]~~ |
|
||||
| `per_component` <Tag variant="new">3.6</Tag> | Whether to return the scores keyed by component name. Defaults to `False`. ~~bool~~ |
|
||||
| **RETURNS** | A dictionary of evaluation scores. ~~Dict[str, Union[float, Dict[str, float]]]~~ |
|
||||
|
||||
## Language.use_params {id="use_params",tag="contextmanager, method"}
|
||||
|
||||
|
|
|
@ -33,7 +33,7 @@ Create a new `Scorer`.
|
|||
| `default_lang` | The language to use for a default pipeline if `nlp` is not provided. Defaults to `xx`. ~~str~~ |
|
||||
| `default_pipeline` | The pipeline components to use for a default pipeline if `nlp` is not provided. Defaults to `("senter", "tagger", "morphologizer", "parser", "ner", "textcat")`. ~~Iterable[string]~~ |
|
||||
| _keyword-only_ | |
|
||||
| `\*\*kwargs` | Any additional settings to pass on to the individual scoring methods. ~~Any~~ |
|
||||
| `**kwargs` | Any additional settings to pass on to the individual scoring methods. ~~Any~~ |
|
||||
|
||||
## Scorer.score {id="score",tag="method"}
|
||||
|
||||
|
@ -67,10 +67,12 @@ core pipeline components, the individual score names start with the `Token` or
|
|||
> scores = scorer.score(examples)
|
||||
> ```
|
||||
|
||||
| Name | Description |
|
||||
| ----------- | ------------------------------------------------------------------------------------------------------------------- |
|
||||
| `examples` | The `Example` objects holding both the predictions and the correct gold-standard annotations. ~~Iterable[Example]~~ |
|
||||
| **RETURNS** | A dictionary of scores. ~~Dict[str, Union[float, Dict[str, float]]]~~ |
|
||||
| Name | Description |
|
||||
| -------------------------------------------- | ------------------------------------------------------------------------------------------------------------------- |
|
||||
| `examples` | The `Example` objects holding both the predictions and the correct gold-standard annotations. ~~Iterable[Example]~~ |
|
||||
| _keyword-only_ | |
|
||||
| `per_component` <Tag variant="new">3.6</Tag> | Whether to return the scores keyed by component name. Defaults to `False`. ~~bool~~ |
|
||||
| **RETURNS** | A dictionary of scores. ~~Dict[str, Union[float, Dict[str, float]]]~~ |
|
||||
|
||||
## Scorer.score_tokenization {id="score_tokenization",tag="staticmethod",version="3"}
|
||||
|
||||
|
|
|
@ -469,7 +469,7 @@ factories.
|
|||
| `optimizers` | Registry for functions that create [optimizers](https://thinc.ai/docs/api-optimizers). |
|
||||
| `readers` | Registry for file and data readers, including training and evaluation data readers like [`Corpus`](/api/corpus). |
|
||||
| `schedules` | Registry for functions that create [schedules](https://thinc.ai/docs/api-schedules). |
|
||||
| `scorers` | Registry for functions that create scoring methods for user with the [`Scorer`](/api/scorer). Scoring methods are called with `Iterable[Example]` and arbitrary `\*\*kwargs` and return scores as `Dict[str, Any]`. |
|
||||
| `scorers` | Registry for functions that create scoring methods for use with the [`Scorer`](/api/scorer). Scoring methods are called with `Iterable[Example]` and arbitrary `**kwargs` and return scores as `Dict[str, Any]`. |
|
||||
| `tokenizers` | Registry for tokenizer factories. Registered functions should return a callback that receives the `nlp` object and returns a [`Tokenizer`](/api/tokenizer) or a custom callable. |
|
||||
|
||||
### spacy-transformers registry {id="registry-transformers"}
|
||||
|
|
Loading…
Reference in New Issue
Block a user