diff --git a/spacy/cli/evaluate.py b/spacy/cli/evaluate.py index 363c02cd3..9fcdd18be 100644 --- a/spacy/cli/evaluate.py +++ b/spacy/cli/evaluate.py @@ -27,6 +27,7 @@ def evaluate_cli( gold_preproc: bool = Opt(False, "--gold-preproc", "-G", help="Use gold preprocessing"), displacy_path: Optional[Path] = Opt(None, "--displacy-path", "-dp", help="Directory to output rendered parses as HTML", exists=True, file_okay=False), displacy_limit: int = Opt(25, "--displacy-limit", "-dl", help="Limit of parses to render as HTML"), + per_component: bool = Opt(False, "--per-component", "-P", help="Return scores per component, only applicable when an output JSON file is specified."), # fmt: on ): """ @@ -50,6 +51,7 @@ def evaluate_cli( gold_preproc=gold_preproc, displacy_path=displacy_path, displacy_limit=displacy_limit, + per_component=per_component, silent=False, ) @@ -64,6 +66,7 @@ def evaluate( displacy_limit: int = 25, silent: bool = True, spans_key: str = "sc", + per_component: bool = False, ) -> Dict[str, Any]: msg = Printer(no_print=silent, pretty=not silent) fix_random_seed() @@ -78,44 +81,53 @@ def evaluate( corpus = Corpus(data_path, gold_preproc=gold_preproc) nlp = util.load_model(model) dev_dataset = list(corpus(nlp)) - scores = nlp.evaluate(dev_dataset) - metrics = { - "TOK": "token_acc", - "TAG": "tag_acc", - "POS": "pos_acc", - "MORPH": "morph_acc", - "LEMMA": "lemma_acc", - "UAS": "dep_uas", - "LAS": "dep_las", - "NER P": "ents_p", - "NER R": "ents_r", - "NER F": "ents_f", - "TEXTCAT": "cats_score", - "SENT P": "sents_p", - "SENT R": "sents_r", - "SENT F": "sents_f", - "SPAN P": f"spans_{spans_key}_p", - "SPAN R": f"spans_{spans_key}_r", - "SPAN F": f"spans_{spans_key}_f", - "SPEED": "speed", - } - results = {} - data = {} - for metric, key in metrics.items(): - if key in scores: - if key == "cats_score": - metric = metric + " (" + scores.get("cats_score_desc", "unk") + ")" - if isinstance(scores[key], (int, float)): - if key == "speed": - results[metric] = f"{scores[key]:.0f}" + scores = nlp.evaluate(dev_dataset, per_component=per_component) + if per_component: + data = scores + if output is None: + msg.warn( + "The per-component option is enabled but there is no output JSON file provided to save the scores to." 
+ ) + else: + msg.info("Per-component scores will be saved to output JSON file.") + else: + metrics = { + "TOK": "token_acc", + "TAG": "tag_acc", + "POS": "pos_acc", + "MORPH": "morph_acc", + "LEMMA": "lemma_acc", + "UAS": "dep_uas", + "LAS": "dep_las", + "NER P": "ents_p", + "NER R": "ents_r", + "NER F": "ents_f", + "TEXTCAT": "cats_score", + "SENT P": "sents_p", + "SENT R": "sents_r", + "SENT F": "sents_f", + "SPAN P": f"spans_{spans_key}_p", + "SPAN R": f"spans_{spans_key}_r", + "SPAN F": f"spans_{spans_key}_f", + "SPEED": "speed", + } + results = {} + data = {} + for metric, key in metrics.items(): + if key in scores: + if key == "cats_score": + metric = metric + " (" + scores.get("cats_score_desc", "unk") + ")" + if isinstance(scores[key], (int, float)): + if key == "speed": + results[metric] = f"{scores[key]:.0f}" + else: + results[metric] = f"{scores[key]*100:.2f}" else: - results[metric] = f"{scores[key]*100:.2f}" - else: - results[metric] = "-" - data[re.sub(r"[\s/]", "_", key.lower())] = scores[key] + results[metric] = "-" + data[re.sub(r"[\s/]", "_", key.lower())] = scores[key] - msg.table(results, title="Results") - data = handle_scores_per_type(scores, data, spans_key=spans_key, silent=silent) + msg.table(results, title="Results") + data = handle_scores_per_type(scores, data, spans_key=spans_key, silent=silent) if displacy_path: factory_names = [nlp.get_pipe_meta(pipe).factory for pipe in nlp.pipe_names] diff --git a/spacy/language.py b/spacy/language.py index 9fdcf6328..289e6dd2c 100644 --- a/spacy/language.py +++ b/spacy/language.py @@ -1372,6 +1372,7 @@ class Language: scorer: Optional[Scorer] = None, component_cfg: Optional[Dict[str, Dict[str, Any]]] = None, scorer_cfg: Optional[Dict[str, Any]] = None, + per_component: bool = False, ) -> Dict[str, Any]: """Evaluate a model's pipeline components. @@ -1383,6 +1384,8 @@ class Language: arguments for specific components. scorer_cfg (dict): An optional dictionary with extra keyword arguments for the scorer. + per_component (bool): Whether to return the scores keyed by component + name. Defaults to False. RETURNS (Scorer): The scorer containing the evaluation results. @@ -1415,7 +1418,7 @@ class Language: for eg, doc in zip(examples, docs): eg.predicted = doc end_time = timer() - results = scorer.score(examples) + results = scorer.score(examples, per_component=per_component) n_words = sum(len(eg.predicted) for eg in examples) results["speed"] = n_words / (end_time - start_time) return results diff --git a/spacy/scorer.py b/spacy/scorer.py index de4f52be6..86cd00a50 100644 --- a/spacy/scorer.py +++ b/spacy/scorer.py @@ -121,20 +121,30 @@ class Scorer: nlp.add_pipe(pipe) self.nlp = nlp - def score(self, examples: Iterable[Example]) -> Dict[str, Any]: + def score( + self, examples: Iterable[Example], *, per_component: bool = False + ) -> Dict[str, Any]: """Evaluate a list of Examples. examples (Iterable[Example]): The predicted annotations + correct annotations. + per_component (bool): Whether to return the scores keyed by component + name. Defaults to False. RETURNS (Dict): A dictionary of scores. 
DOCS: https://spacy.io/api/scorer#score """ scores = {} if hasattr(self.nlp.tokenizer, "score"): - scores.update(self.nlp.tokenizer.score(examples, **self.cfg)) # type: ignore + if per_component: + scores["tokenizer"] = self.nlp.tokenizer.score(examples, **self.cfg) + else: + scores.update(self.nlp.tokenizer.score(examples, **self.cfg)) # type: ignore for name, component in self.nlp.pipeline: if hasattr(component, "score"): - scores.update(component.score(examples, **self.cfg)) + if per_component: + scores[name] = component.score(examples, **self.cfg) + else: + scores.update(component.score(examples, **self.cfg)) return scores @staticmethod diff --git a/spacy/tests/test_scorer.py b/spacy/tests/test_scorer.py index dbb47b423..4b2d22986 100644 --- a/spacy/tests/test_scorer.py +++ b/spacy/tests/test_scorer.py @@ -115,6 +115,14 @@ def test_tokenization(sented_doc): assert scores["token_r"] == approx(0.33333333) assert scores["token_f"] == 0.4 + # per-component scoring + scorer = Scorer() + scores = scorer.score([example], per_component=True) + assert scores["tokenizer"]["token_acc"] == 0.5 + assert scores["tokenizer"]["token_p"] == 0.5 + assert scores["tokenizer"]["token_r"] == approx(0.33333333) + assert scores["tokenizer"]["token_f"] == 0.4 + def test_sents(sented_doc): scorer = Scorer() @@ -278,6 +286,13 @@ def test_tag_score(tagged_doc): assert results["morph_per_feat"]["Poss"]["f"] == 0.0 assert results["morph_per_feat"]["Number"]["f"] == approx(0.72727272) + # per-component scoring + scorer = Scorer() + results = scorer.score([example], per_component=True) + assert results["tagger"]["tag_acc"] == 0.9 + assert results["morphologizer"]["pos_acc"] == 0.9 + assert results["morphologizer"]["morph_acc"] == approx(0.8) + def test_partial_annotation(en_tokenizer): pred_doc = en_tokenizer("a b c d e") diff --git a/website/docs/api/cli.mdx b/website/docs/api/cli.mdx index 05328b7eb..2c90ec6c0 100644 --- a/website/docs/api/cli.mdx +++ b/website/docs/api/cli.mdx @@ -1163,18 +1163,19 @@ skew. To render a sample of dependency parses in a HTML file using the $ python -m spacy benchmark accuracy [model] [data_path] [--output] [--code] [--gold-preproc] [--gpu-id] [--displacy-path] [--displacy-limit] ``` -| Name | Description | -| ----------------------------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ | -| `model` | Pipeline to evaluate. Can be a package or a path to a data directory. ~~str (positional)~~ | -| `data_path` | Location of evaluation data in spaCy's [binary format](/api/data-formats#training). ~~Path (positional)~~ | -| `--output`, `-o` | Output JSON file for metrics. If not set, no metrics will be exported. ~~Optional[Path] \(option)~~ | -| `--code`, `-c` 3 | Path to Python file with additional code to be imported. Allows [registering custom functions](/usage/training#custom-functions) for new architectures. ~~Optional[Path] \(option)~~ | -| `--gold-preproc`, `-G` | Use gold preprocessing. ~~bool (flag)~~ | -| `--gpu-id`, `-g` | GPU to use, if any. Defaults to `-1` for CPU. ~~int (option)~~ | -| `--displacy-path`, `-dp` | Directory to output rendered parses as HTML. If not set, no visualizations will be generated. ~~Optional[Path] \(option)~~ | -| `--displacy-limit`, `-dl` | Number of parses to generate per file. Defaults to `25`. Keep in mind that a significantly higher number might cause the `.html` files to render slowly. 
~~int (option)~~ | -| `--help`, `-h` | Show help message and available arguments. ~~bool (flag)~~ | -| **CREATES** | Training results and optional metrics and visualizations. | +| Name | Description | +| ---------------------------------------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ | +| `model` | Pipeline to evaluate. Can be a package or a path to a data directory. ~~str (positional)~~ | +| `data_path` | Location of evaluation data in spaCy's [binary format](/api/data-formats#training). ~~Path (positional)~~ | +| `--output`, `-o` | Output JSON file for metrics. If not set, no metrics will be exported. ~~Optional[Path] \(option)~~ | +| `--code`, `-c` 3 | Path to Python file with additional code to be imported. Allows [registering custom functions](/usage/training#custom-functions) for new architectures. ~~Optional[Path] \(option)~~ | +| `--gold-preproc`, `-G` | Use gold preprocessing. ~~bool (flag)~~ | +| `--gpu-id`, `-g` | GPU to use, if any. Defaults to `-1` for CPU. ~~int (option)~~ | +| `--displacy-path`, `-dp` | Directory to output rendered parses as HTML. If not set, no visualizations will be generated. ~~Optional[Path] \(option)~~ | +| `--displacy-limit`, `-dl` | Number of parses to generate per file. Defaults to `25`. Keep in mind that a significantly higher number might cause the `.html` files to render slowly. ~~int (option)~~ | +| `--per-component`, `-P` 3.6 | Whether to return the scores keyed by component name. Defaults to `False`. ~~bool (flag)~~ | +| `--help`, `-h` | Show help message and available arguments. ~~bool (flag)~~ | +| **CREATES** | Training results and optional metrics and visualizations. | ### speed {id="benchmark-speed", version="3.5", tag="command"} diff --git a/website/docs/api/language.mdx b/website/docs/api/language.mdx index 93ddd79a2..de23156b9 100644 --- a/website/docs/api/language.mdx +++ b/website/docs/api/language.mdx @@ -382,15 +382,16 @@ objects instead of tuples of `Doc` and `GoldParse` objects. > print(scores) > ``` -| Name | Description | -| --------------- | ---------------------------------------------------------------------------------------------------------------------------------------------- | -| `examples` | A batch of [`Example`](/api/example) objects to learn from. ~~Iterable[Example]~~ | -| _keyword-only_ | | -| `batch_size` | The batch size to use. ~~Optional[int]~~ | -| `scorer` | Optional [`Scorer`](/api/scorer) to use. If not passed in, a new one will be created. ~~Optional[Scorer]~~ | -| `component_cfg` | Optional dictionary of keyword arguments for components, keyed by component names. Defaults to `None`. ~~Optional[Dict[str, Dict[str, Any]]]~~ | -| `scorer_cfg` | Optional dictionary of keyword arguments for the `Scorer`. Defaults to `None`. ~~Optional[Dict[str, Any]]~~ | -| **RETURNS** | A dictionary of evaluation scores. ~~Dict[str, Union[float, Dict[str, float]]]~~ | +| Name | Description | +| -------------------------------------------- | ---------------------------------------------------------------------------------------------------------------------------------------------- | +| `examples` | A batch of [`Example`](/api/example) objects to learn from. ~~Iterable[Example]~~ | +| _keyword-only_ | | +| `batch_size` | The batch size to use. ~~Optional[int]~~ | +| `scorer` | Optional [`Scorer`](/api/scorer) to use. If not passed in, a new one will be created. 
~~Optional[Scorer]~~ | +| `component_cfg` | Optional dictionary of keyword arguments for components, keyed by component names. Defaults to `None`. ~~Optional[Dict[str, Dict[str, Any]]]~~ | +| `scorer_cfg` | Optional dictionary of keyword arguments for the `Scorer`. Defaults to `None`. ~~Optional[Dict[str, Any]]~~ | +| `per_component` 3.6 | Whether to return the scores keyed by component name. Defaults to `False`. ~~bool~~ | +| **RETURNS** | A dictionary of evaluation scores. ~~Dict[str, Union[float, Dict[str, float]]]~~ | ## Language.use_params {id="use_params",tag="contextmanager, method"} diff --git a/website/docs/api/scorer.mdx b/website/docs/api/scorer.mdx index 6f0c95f6f..9bdd0a8f4 100644 --- a/website/docs/api/scorer.mdx +++ b/website/docs/api/scorer.mdx @@ -33,7 +33,7 @@ Create a new `Scorer`. | `default_lang` | The language to use for a default pipeline if `nlp` is not provided. Defaults to `xx`. ~~str~~ | | `default_pipeline` | The pipeline components to use for a default pipeline if `nlp` is not provided. Defaults to `("senter", "tagger", "morphologizer", "parser", "ner", "textcat")`. ~~Iterable[string]~~ | | _keyword-only_ | | -| `\*\*kwargs` | Any additional settings to pass on to the individual scoring methods. ~~Any~~ | +| `**kwargs` | Any additional settings to pass on to the individual scoring methods. ~~Any~~ | ## Scorer.score {id="score",tag="method"} @@ -67,10 +67,12 @@ core pipeline components, the individual score names start with the `Token` or > scores = scorer.score(examples) > ``` -| Name | Description | -| ----------- | ------------------------------------------------------------------------------------------------------------------- | -| `examples` | The `Example` objects holding both the predictions and the correct gold-standard annotations. ~~Iterable[Example]~~ | -| **RETURNS** | A dictionary of scores. ~~Dict[str, Union[float, Dict[str, float]]]~~ | +| Name | Description | +| -------------------------------------------- | ------------------------------------------------------------------------------------------------------------------- | +| `examples` | The `Example` objects holding both the predictions and the correct gold-standard annotations. ~~Iterable[Example]~~ | +| _keyword-only_ | | +| `per_component` 3.6 | Whether to return the scores keyed by component name. Defaults to `False`. ~~bool~~ | +| **RETURNS** | A dictionary of scores. ~~Dict[str, Union[float, Dict[str, float]]]~~ | ## Scorer.score_tokenization {id="score_tokenization",tag="staticmethod",version="3"} diff --git a/website/docs/api/top-level.mdx b/website/docs/api/top-level.mdx index 6de1acdf0..64ec342cd 100644 --- a/website/docs/api/top-level.mdx +++ b/website/docs/api/top-level.mdx @@ -469,7 +469,7 @@ factories. | `optimizers` | Registry for functions that create [optimizers](https://thinc.ai/docs/api-optimizers). | | `readers` | Registry for file and data readers, including training and evaluation data readers like [`Corpus`](/api/corpus). | | `schedules` | Registry for functions that create [schedules](https://thinc.ai/docs/api-schedules). | -| `scorers` | Registry for functions that create scoring methods for user with the [`Scorer`](/api/scorer). Scoring methods are called with `Iterable[Example]` and arbitrary `\*\*kwargs` and return scores as `Dict[str, Any]`. | +| `scorers` | Registry for functions that create scoring methods for user with the [`Scorer`](/api/scorer). Scoring methods are called with `Iterable[Example]` and arbitrary `**kwargs` and return scores as `Dict[str, Any]`. 
| | `tokenizers` | Registry for tokenizer factories. Registered functions should return a callback that receives the `nlp` object and returns a [`Tokenizer`](/api/tokenizer) or a custom callable. | ### spacy-transformers registry {id="registry-transformers"}
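
A minimal usage sketch of the new option, assuming a trained pipeline such as `en_core_web_sm` and an evaluation corpus `dev.spacy` in spaCy's binary format (both names are placeholders, not part of this diff):

```python
# Sketch of the per-component scoring path introduced above.
# "en_core_web_sm" and "dev.spacy" are placeholders for any trained pipeline
# and any evaluation corpus in spaCy's binary (.spacy) format.
import spacy
from spacy.training import Corpus

nlp = spacy.load("en_core_web_sm")
corpus = Corpus("dev.spacy")
examples = list(corpus(nlp))

# Default behaviour: a single flat mapping of score names to values,
# e.g. {"token_acc": ..., "tag_acc": ..., "speed": ...}
flat_scores = nlp.evaluate(examples)

# With per_component=True: scores grouped by component name, with the
# tokenizer's scores under "tokenizer",
# e.g. {"tokenizer": {"token_acc": ...}, "tagger": {"tag_acc": ...}, ...}
grouped_scores = nlp.evaluate(examples, per_component=True)
```

The same keyword argument is available directly on `Scorer.score(examples, per_component=True)`, and on the CLI as `--per-component` / `-P` for `spacy benchmark accuracy`, where the grouped scores are only useful together with `--output`, since the console table is skipped and the scores are written to the JSON file instead.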