2020-07-28 22:39:42 +03:00
|
|
|
|
from typing import Optional, Iterable, Dict, Any, Callable, Tuple, TYPE_CHECKING
|
2019-09-15 23:31:31 +03:00
|
|
|
|
import numpy as np
|
|
|
|
|
|
2020-07-28 22:39:42 +03:00
|
|
|
|
from .gold import Example
|
2020-08-17 17:45:24 +03:00
|
|
|
|
from .tokens import Token, Doc, Span
|
2019-09-15 23:31:31 +03:00
|
|
|
|
from .errors import Errors
|
Refactor the Scorer to improve flexibility (#5731)
* Refactor the Scorer to improve flexibility
Refactor the `Scorer` to improve flexibility for arbitrary pipeline
components.
* Individual pipeline components provide their own `evaluate` methods
that score a list of `Example`s and return a dictionary of scores
* `Scorer` is initialized either:
* with a provided pipeline containing components to be scored
* with a default pipeline containing the built-in statistical
components (senter, tagger, morphologizer, parser, ner)
* `Scorer.score` evaluates a list of `Example`s and returns a dictionary
of scores referring to the scores provided by the components in the
pipeline
Significant differences:
* `tags_acc` is renamed to `tag_acc` to be consistent with `token_acc`
and the new `morph_acc`, `pos_acc`, and `lemma_acc`
* Scoring is no longer cumulative: `Scorer.score` scores a list of
examples rather than a single example and does not retain any state
about previously scored examples
* PRF values in the returned scores are no longer multiplied by 100
* Add kwargs to Morphologizer.evaluate
* Create generalized scoring methods in Scorer
* Generalized static scoring methods are added to `Scorer`
* Methods require an attribute (either on Token or Doc) that is
used to key the returned scores
Naming differences:
* `uas`, `las`, and `las_per_type` in the scores dict are renamed to
`dep_uas`, `dep_las`, and `dep_las_per_type`
Scoring differences:
* `Doc.sents` is now scored as spans rather than on sentence-initial
token positions so that `Doc.sents` and `Doc.ents` can be scored with
the same method (this lowers scores since a single incorrect sentence
start results in two incorrect spans)
* Simplify / extend hasattr check for eval method
* Add hasattr check to tokenizer scoring
* Simplify to hasattr check for component scoring
* Reset Example alignment if docs are set
Reset the Example alignment if either doc is set in case the
tokenization has changed.
* Add PRF tokenization scoring for tokens as spans
Add PRF scores for tokens as character spans. The scores are:
* token_acc: # correct tokens / # gold tokens
* token_p/r/f: PRF for (token.idx, token.idx + len(token))
* Add docstring to Scorer.score_tokenization
* Rename component.evaluate() to component.score()
* Update Scorer API docs
* Update scoring for positive_label in textcat
* Fix TextCategorizer.score kwargs
* Update Language.evaluate docs
* Update score names in default config
2020-07-25 13:53:02 +03:00
|
|
|
|
from .util import get_lang_class
|
|
|
|
|
from .morphology import Morphology
|
2015-05-27 04:18:16 +03:00
|
|
|
|
|
2020-07-28 22:39:42 +03:00
|
|
|
|
if TYPE_CHECKING:
|
|
|
|
|
# This lets us add type hints for mypy etc. without causing circular imports
|
|
|
|
|
from .language import Language # noqa: F401
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
# Component names used as the default value of Scorer.__init__'s
# `default_pipeline` argument: when no `nlp` object is supplied, each name is
# passed to `nlp.add_pipe` on the pipeline the Scorer creates for itself.
DEFAULT_PIPELINE = ["senter", "tagger", "morphologizer", "parser", "ner", "textcat"]
|
|
|
|
|
|
2015-04-05 23:29:30 +03:00
|
|
|
|
|
2020-07-12 15:03:23 +03:00
|
|
|
|
class PRFScore:
    """Precision / recall / F-score accumulated from raw match counts.

    The counters `tp`, `fp` and `fn` are increased by every call to
    `score_set`; the derived metrics are recomputed from them on access.
    """

    def __init__(self) -> None:
        # True positives, false positives and false negatives seen so far.
        self.tp = self.fp = self.fn = 0

    def score_set(self, cand: set, gold: set) -> None:
        """Add the comparison of one candidate set against one gold set.

        cand (set): The predicted items.
        gold (set): The reference items.
        """
        matched = cand.intersection(gold)
        self.tp += len(matched)
        # Predicted but not in the reference.
        self.fp += len(cand - gold)
        # In the reference but not predicted.
        self.fn += len(gold - cand)

    @property
    def precision(self) -> float:
        """tp / (tp + fp); the tiny epsilon avoids division by zero."""
        denominator = self.tp + self.fp + 1e-100
        return self.tp / denominator

    @property
    def recall(self) -> float:
        """tp / (tp + fn); the tiny epsilon avoids division by zero."""
        denominator = self.tp + self.fn + 1e-100
        return self.tp / denominator

    @property
    def fscore(self) -> float:
        """Harmonic mean of precision and recall (0.0 when both are 0)."""
        prec = self.precision
        rec = self.recall
        ratio = (prec * rec) / (prec + rec + 1e-100)
        return 2 * ratio

    def to_dict(self) -> Dict[str, float]:
        """RETURNS (Dict[str, float]): The scores keyed by "p", "r" and "f"."""
        return {"p": self.precision, "r": self.recall, "f": self.fscore}
|
|
|
|
|
|
2015-05-24 21:07:18 +03:00
|
|
|
|
|
2020-07-12 15:03:23 +03:00
|
|
|
|
class ROCAUCScore:
    """ROC AUC score computed lazily over collected (candidate, gold) pairs."""

    def __init__(self) -> None:
        self.golds = []
        self.cands = []
        # Last computed score and the number of gold entries it was based on;
        # together they let `score` skip recomputation when nothing was added.
        self.saved_score = 0.0
        self.saved_score_at_len = 0

    def score_set(self, cand, gold) -> None:
        """Record one candidate value together with its gold value."""
        self.cands.append(cand)
        self.golds.append(gold)

    @property
    def score(self):
        """Return the ROC AUC over everything recorded so far.

        The cached value is returned as long as the number of recorded gold
        values has not changed since the last computation.
        """
        if len(self.golds) != self.saved_score_at_len:
            try:
                self.saved_score = _roc_auc_score(self.golds, self.cands)
            except ValueError:
                # Raised when only one class is present in the gold values;
                # the ROC AUC score is not defined in that case.
                self.saved_score = -float("inf")
            self.saved_score_at_len = len(self.golds)
        return self.saved_score
|
|
|
|
|
|
|
|
|
|
|
2020-07-12 15:03:23 +03:00
|
|
|
|
class Scorer:
|
2019-05-24 15:06:04 +03:00
|
|
|
|
"""Compute evaluation scores."""
|
|
|
|
|
|
2020-07-28 22:39:42 +03:00
|
|
|
|
def __init__(
    self,
    nlp: Optional["Language"] = None,
    default_lang: str = "xx",
    default_pipeline=DEFAULT_PIPELINE,
    **cfg,
) -> None:
    """Initialize the Scorer.

    nlp (Optional[Language]): The pipeline whose tokenizer and components
        are used for scoring. If it is missing (or otherwise falsy), a
        pipeline is created from `default_lang` and `default_pipeline`.
    default_lang (str): Language code passed to `get_lang_class` when no
        `nlp` is given.
    default_pipeline: Component names added via `nlp.add_pipe` when no
        `nlp` is given.
    **cfg: Extra settings, stored on `self.cfg` and forwarded to the
        individual `score` methods.

    DOCS: https://spacy.io/api/scorer#init
    """
    self.cfg = cfg
    self.nlp = nlp
    if nlp:
        return
    # No usable pipeline was passed in: build the fallback one.
    fallback = get_lang_class(default_lang)()
    for pipe_name in default_pipeline:
        fallback.add_pipe(pipe_name)
    self.nlp = fallback
|
|
|
|
|
|
2020-07-28 22:39:42 +03:00
|
|
|
|
def score(self, examples: Iterable[Example]) -> Dict[str, Any]:
    """Evaluate a list of Examples.

    The tokenizer (if it has a `score` method) and then every pipeline
    component that has a `score` method are called with the examples and
    `self.cfg`; their result dicts are merged in that order, so a later
    scorer overrides an earlier one on a shared key.

    examples (Iterable[Example]): The predicted annotations + correct annotations.
    RETURNS (Dict): A dictionary of scores.

    DOCS: https://spacy.io/api/scorer#score
    """
    # The examples are traversed once per scorer. A one-shot iterator (e.g.
    # a generator) would be exhausted by the first scorer and leave nothing
    # for the rest, so materialize it. Re-iterable containers such as lists
    # are passed through unchanged.
    if iter(examples) is examples:
        examples = list(examples)
    scores = {}
    tokenizer = self.nlp.tokenizer
    if hasattr(tokenizer, "score"):
        scores.update(tokenizer.score(examples, **self.cfg))
    for _name, component in self.nlp.pipeline:
        if not hasattr(component, "score"):
            continue
        scores.update(component.score(examples, **self.cfg))
    return scores
|
2015-03-11 04:07:03 +03:00
|
|
|
|
|
Refactor the Scorer to improve flexibility (#5731)
* Refactor the Scorer to improve flexibility
Refactor the `Scorer` to improve flexibility for arbitrary pipeline
components.
* Individual pipeline components provide their own `evaluate` methods
that score a list of `Example`s and return a dictionary of scores
* `Scorer` is initialized either:
* with a provided pipeline containing components to be scored
* with a default pipeline containing the built-in statistical
components (senter, tagger, morphologizer, parser, ner)
* `Scorer.score` evaluates a list of `Example`s and returns a dictionary
of scores referring to the scores provided by the components in the
pipeline
Significant differences:
* `tags_acc` is renamed to `tag_acc` to be consistent with `token_acc`
and the new `morph_acc`, `pos_acc`, and `lemma_acc`
* Scoring is no longer cumulative: `Scorer.score` scores a list of
examples rather than a single example and does not retain any state
about previously scored examples
* PRF values in the returned scores are no longer multiplied by 100
* Add kwargs to Morphologizer.evaluate
* Create generalized scoring methods in Scorer
* Generalized static scoring methods are added to `Scorer`
* Methods require an attribute (either on Token or Doc) that is
used to key the returned scores
Naming differences:
* `uas`, `las`, and `las_per_type` in the scores dict are renamed to
`dep_uas`, `dep_las`, and `dep_las_per_type`
Scoring differences:
* `Doc.sents` is now scored as spans rather than on sentence-initial
token positions so that `Doc.sents` and `Doc.ents` can be scored with
the same method (this lowers scores since a single incorrect sentence
start results in two incorrect spans)
* Simplify / extend hasattr check for eval method
* Add hasattr check to tokenizer scoring
* Simplify to hasattr check for component scoring
* Reset Example alignment if docs are set
Reset the Example alignment if either doc is set in case the
tokenization has changed.
* Add PRF tokenization scoring for tokens as spans
Add PRF scores for tokens as character spans. The scores are:
* token_acc: # correct tokens / # gold tokens
* token_p/r/f: PRF for (token.idx, token.idx + len(token))
* Add docstring to Scorer.score_tokenization
* Rename component.evaluate() to component.score()
* Update Scorer API docs
* Update scoring for positive_label in textcat
* Fix TextCategorizer.score kwargs
* Update Language.evaluate docs
* Update score names in default config
2020-07-25 13:53:02 +03:00
|
|
|
|
@staticmethod
def score_tokenization(examples: Iterable[Example], **cfg) -> Dict[str, float]:
    """Returns accuracy and PRF scores for tokenization.

    * token_acc: # predicted tokens whose alignment to the reference doc has
      length 1 / # predicted tokens (whitespace tokens are ignored)
    * token_p/r/f: PRF for token character spans

    examples (Iterable[Example]): Examples to score
    RETURNS (Dict[str, float]): A dictionary containing the scores
        token_acc/p/r/f.

    DOCS: https://spacy.io/api/scorer#score_tokenization
    """
    acc_score = PRFScore()
    prf_score = PRFScore()
    for example in examples:
        gold_doc = example.reference
        pred_doc = example.predicted
        align = example.alignment
        # Character-offset spans (start, end) of all non-whitespace tokens.
        gold_spans = set()
        pred_spans = set()
        for token in gold_doc:
            if token.orth_.isspace():
                continue
            gold_spans.add((token.idx, token.idx + len(token)))
        for token in pred_doc:
            if token.orth_.isspace():
                continue
            pred_spans.add((token.idx, token.idx + len(token)))
            # A predicted token counts as correct only when its x2y
            # alignment has length exactly 1.
            if align.x2y.lengths[token.i] != 1:
                acc_score.fp += 1
            else:
                acc_score.tp += 1
        prf_score.score_set(pred_spans, gold_spans)
    return {
        # acc_score only ever receives tp/fp increments above (fn is never
        # touched), so the correct/total ratio is its precision. Using the
        # F-score here would mix in a recall term that is never populated
        # and overstate the accuracy.
        "token_acc": acc_score.precision,
        "token_p": prf_score.precision,
        "token_r": prf_score.recall,
        "token_f": prf_score.fscore,
    }
|
|
|
|
|
|
Refactor the Scorer to improve flexibility (#5731)
* Refactor the Scorer to improve flexibility
Refactor the `Scorer` to improve flexibility for arbitrary pipeline
components.
* Individual pipeline components provide their own `evaluate` methods
that score a list of `Example`s and return a dictionary of scores
* `Scorer` is initialized either:
* with a provided pipeline containing components to be scored
* with a default pipeline containing the built-in statistical
components (senter, tagger, morphologizer, parser, ner)
* `Scorer.score` evaluates a list of `Example`s and returns a dictionary
of scores referring to the scores provided by the components in the
pipeline
Significant differences:
* `tags_acc` is renamed to `tag_acc` to be consistent with `token_acc`
and the new `morph_acc`, `pos_acc`, and `lemma_acc`
* Scoring is no longer cumulative: `Scorer.score` scores a list of
examples rather than a single example and does not retain any state
about previously scored examples
* PRF values in the returned scores are no longer multiplied by 100
* Add kwargs to Morphologizer.evaluate
* Create generalized scoring methods in Scorer
* Generalized static scoring methods are added to `Scorer`
* Methods require an attribute (either on Token or Doc) that is
used to key the returned scores
Naming differences:
* `uas`, `las`, and `las_per_type` in the scores dict are renamed to
`dep_uas`, `dep_las`, and `dep_las_per_type`
Scoring differences:
* `Doc.sents` is now scored as spans rather than on sentence-initial
token positions so that `Doc.sents` and `Doc.ents` can be scored with
the same method (this lowers scores since a single incorrect sentence
start results in two incorrect spans)
* Simplify / extend hasattr check for eval method
* Add hasattr check to tokenizer scoring
* Simplify to hasattr check for component scoring
* Reset Example alignment if docs are set
Reset the Example alignment if either doc is set in case the
tokenization has changed.
* Add PRF tokenization scoring for tokens as spans
Add PRF scores for tokens as character spans. The scores are:
* token_acc: # correct tokens / # gold tokens
* token_p/r/f: PRF for (token.idx, token.idx + len(token))
* Add docstring to Scorer.score_tokenization
* Rename component.evaluate() to component.score()
* Update Scorer API docs
* Update scoring for positive_label in textcat
* Fix TextCategorizer.score kwargs
* Update Language.evaluate docs
* Update score names in default config
2020-07-25 13:53:02 +03:00
|
|
|
|
@staticmethod
def score_token_attr(
    examples: Iterable[Example],
    attr: str,
    *,
    getter: Callable[[Token, str], Any] = getattr,
    **cfg,
) -> Dict[str, float]:
    """Score a token-level attribute and report it under an accuracy key.

    For every example, the reference and predicted docs are each reduced to
    a set of (reference token index, attribute value) pairs. Predicted
    tokens that are whitespace, or whose x2y alignment length is not 1,
    contribute no pair. The two sets are accumulated into a single PRFScore
    and its fscore is returned.

    examples (Iterable[Example]): Examples to score
    attr (str): The attribute to score.
    getter (Callable[[Token, str], Any]): Defaults to getattr. If provided,
        getter(token, attr) should return the value of the attribute for an
        individual token.
    RETURNS (Dict[str, float]): A dictionary containing the accuracy score
        under the key attr_acc.

    DOCS: https://spacy.io/api/scorer#score_token_attr
    """
    scorer = PRFScore()
    for eg in examples:
        reference = eg.reference
        predicted = eg.predicted
        alignment = eg.alignment
        # Every reference token contributes a pair, keyed by its position.
        expected = {
            (index, getter(tok, attr)) for index, tok in enumerate(reference)
        }
        # Predicted values are re-keyed by the reference index they align to.
        observed = {
            (alignment.x2y[tok.i].dataXd[0, 0], getter(tok, attr))
            for tok in predicted
            if not tok.orth_.isspace() and alignment.x2y.lengths[tok.i] == 1
        }
        scorer.score_set(observed, expected)
    return {f"{attr}_acc": scorer.fscore}
|
2019-07-10 12:19:28 +03:00
|
|
|
|
|
Refactor the Scorer to improve flexibility (#5731)
* Refactor the Scorer to improve flexibility
Refactor the `Scorer` to improve flexibility for arbitrary pipeline
components.
* Individual pipeline components provide their own `evaluate` methods
that score a list of `Example`s and return a dictionary of scores
* `Scorer` is initialized either:
* with a provided pipeline containing components to be scored
* with a default pipeline containing the built-in statistical
components (senter, tagger, morphologizer, parser, ner)
* `Scorer.score` evaluates a list of `Example`s and returns a dictionary
of scores referring to the scores provided by the components in the
pipeline
Significant differences:
* `tags_acc` is renamed to `tag_acc` to be consistent with `token_acc`
and the new `morph_acc`, `pos_acc`, and `lemma_acc`
* Scoring is no longer cumulative: `Scorer.score` scores a list of
examples rather than a single example and does not retain any state
about previously scored examples
* PRF values in the returned scores are no longer multiplied by 100
* Add kwargs to Morphologizer.evaluate
* Create generalized scoring methods in Scorer
* Generalized static scoring methods are added to `Scorer`
* Methods require an attribute (either on Token or Doc) that is
used to key the returned scores
Naming differences:
* `uas`, `las`, and `las_per_type` in the scores dict are renamed to
`dep_uas`, `dep_las`, and `dep_las_per_type`
Scoring differences:
* `Doc.sents` is now scored as spans rather than on sentence-initial
token positions so that `Doc.sents` and `Doc.ents` can be scored with
the same method (this lowers scores since a single incorrect sentence
start results in two incorrect spans)
* Simplify / extend hasattr check for eval method
* Add hasattr check to tokenizer scoring
* Simplify to hasattr check for component scoring
* Reset Example alignment if docs are set
Reset the Example alignment if either doc is set in case the
tokenization has changed.
* Add PRF tokenization scoring for tokens as spans
Add PRF scores for tokens as character spans. The scores are:
* token_acc: # correct tokens / # gold tokens
* token_p/r/f: PRF for (token.idx, token.idx + len(token))
* Add docstring to Scorer.score_tokenization
* Rename component.evaluate() to component.score()
* Update Scorer API docs
* Update scoring for positive_label in textcat
* Fix TextCategorizer.score kwargs
* Update Language.evaluate docs
* Update score names in default config
2020-07-25 13:53:02 +03:00
|
|
|
|
@staticmethod
def score_token_attr_per_feat(
    examples: Iterable[Example],
    attr: str,
    *,
    getter: Callable[[Token, str], Any] = getattr,
    **cfg,
) -> Dict[str, Any]:
    """Return PRF scores per feat for a token attribute in UFEATS format.

    The string value of the attribute is split into individual features
    with Morphology.FEATURE_SEP, and each feature into field and value with
    Morphology.FIELD_SEP. Scores are accumulated per field over
    (reference token index, feature) pairs.

    examples (Iterable[Example]): Examples to score
    attr (str): The attribute to score.
    getter (Callable[[Token, str], Any]): Defaults to getattr. If provided,
        getter(token, attr) should return the value of the attribute for an
        individual token.
    RETURNS (dict): A dictionary containing the per-feat PRF scores under
        the key attr_per_feat.
    """
    per_feat = {}
    for example in examples:
        pred_doc = example.predicted
        gold_doc = example.reference
        align = example.alignment
        gold_per_feat = {}
        for gold_i, token in enumerate(gold_doc):
            morph = str(getter(token, attr))
            if morph:
                for feat in morph.split(Morphology.FEATURE_SEP):
                    # Unpacking (rather than indexing) keeps the ValueError
                    # on features that do not split into exactly two parts.
                    field, _ = feat.split(Morphology.FIELD_SEP)
                    if field not in per_feat:
                        per_feat[field] = PRFScore()
                    gold_per_feat.setdefault(field, set()).add((gold_i, feat))
        pred_per_feat = {}
        for token in pred_doc:
            if token.orth_.isspace():
                continue
            # Only predicted tokens whose x2y alignment has length 1 can be
            # keyed by a single reference token index.
            if align.x2y.lengths[token.i] == 1:
                gold_i = align.x2y[token.i].dataXd[0, 0]
                morph = str(getter(token, attr))
                if morph:
                    # Same separators as for the reference doc above, instead
                    # of hard-coded literals, so both sides are always parsed
                    # identically.
                    for feat in morph.split(Morphology.FEATURE_SEP):
                        field, _ = feat.split(Morphology.FIELD_SEP)
                        if field not in per_feat:
                            per_feat[field] = PRFScore()
                        pred_per_feat.setdefault(field, set()).add((gold_i, feat))
        # Every field seen so far is scored for this example; fields missing
        # from this example are scored with empty sets.
        for field, score in per_feat.items():
            score.score_set(
                pred_per_feat.get(field, set()), gold_per_feat.get(field, set()),
            )
    result = {k: v.to_dict() for k, v in per_feat.items()}
    return {f"{attr}_per_feat": result}
|
2020-06-12 03:02:07 +03:00
|
|
|
|
|
Refactor the Scorer to improve flexibility (#5731)
* Refactor the Scorer to improve flexibility
Refactor the `Scorer` to improve flexibility for arbitrary pipeline
components.
* Individual pipeline components provide their own `evaluate` methods
that score a list of `Example`s and return a dictionary of scores
* `Scorer` is initialized either:
* with a provided pipeline containing components to be scored
* with a default pipeline containing the built-in statistical
components (senter, tagger, morphologizer, parser, ner)
* `Scorer.score` evaluates a list of `Example`s and returns a dictionary
of scores referring to the scores provided by the components in the
pipeline
Significant differences:
* `tags_acc` is renamed to `tag_acc` to be consistent with `token_acc`
and the new `morph_acc`, `pos_acc`, and `lemma_acc`
* Scoring is no longer cumulative: `Scorer.score` scores a list of
examples rather than a single example and does not retain any state
about previously scored examples
* PRF values in the returned scores are no longer multiplied by 100
* Add kwargs to Morphologizer.evaluate
* Create generalized scoring methods in Scorer
* Generalized static scoring methods are added to `Scorer`
* Methods require an attribute (either on Token or Doc) that is
used to key the returned scores
Naming differences:
* `uas`, `las`, and `las_per_type` in the scores dict are renamed to
`dep_uas`, `dep_las`, and `dep_las_per_type`
Scoring differences:
* `Doc.sents` is now scored as spans rather than on sentence-initial
token positions so that `Doc.sents` and `Doc.ents` can be scored with
the same method (this lowers scores since a single incorrect sentence
start results in two incorrect spans)
* Simplify / extend hasattr check for eval method
* Add hasattr check to tokenizer scoring
* Simplify to hasattr check for component scoring
* Reset Example alignment if docs are set
Reset the Example alignment if either doc is set in case the
tokenization has changed.
* Add PRF tokenization scoring for tokens as spans
Add PRF scores for tokens as character spans. The scores are:
* token_acc: # correct tokens / # gold tokens
* token_p/r/f: PRF for (token.idx, token.idx + len(token))
* Add docstring to Scorer.score_tokenization
* Rename component.evaluate() to component.score()
* Update Scorer API docs
* Update scoring for positive_label in textcat
* Fix TextCategorizer.score kwargs
* Update Language.evaluate docs
* Update score names in default config
2020-07-25 13:53:02 +03:00
|
|
|
|
@staticmethod
def score_spans(
    examples: Iterable[Example],
    attr: str,
    *,
    getter: Callable[[Doc, str], Iterable[Span]] = getattr,
    **cfg,
) -> Dict[str, Any]:
    """Compute precision, recall and F-score for labeled spans, both
    overall and broken down by span label.

    examples (Iterable[Example]): Examples to score.
    attr (str): Name of the span attribute to score. It is passed to the
        getter and used as the prefix of the keys in the returned dict.
    getter (Callable[[Doc, str], Iterable[Span]]): Called as
        getter(doc, attr) to obtain the spans of a single doc. Defaults to
        getattr.
    **cfg: Additional keyword arguments. Not used by this method.
    RETURNS (Dict[str, Any]): The overall scores under the keys
        attr_p, attr_r and attr_f, and a dict of per-label scores under
        attr_per_type.

    DOCS: https://spacy.io/api/scorer#score_spans
    """
    overall = PRFScore()
    per_label_scores = {}
    for example in examples:
        pred_doc = example.predicted
        gold_doc = example.reference
        # Every label that occurs in either the reference or the prediction.
        labels = {span.label_ for span in getter(gold_doc, attr)}
        labels |= {span.label_ for span in getter(pred_doc, attr)}
        # Make sure each label has a running per-label score. This happens
        # before the skip below, so labels of skipped examples are still
        # reported (with whatever counts they have accumulated so far).
        for label in labels:
            if label not in per_label_scores:
                per_label_scores[label] = PRFScore()
        # Special case for ents: if any reference token has ent_iob == 0,
        # the example is left out of the counts entirely, because with
        # missing gold values we can't easily tell whether the predicted
        # entities are correct.
        if attr == "ents" and any(token.ent_iob == 0 for token in gold_doc):
            continue
        # Reference spans, keyed as (label, start, end - 1), collected for
        # all labels together and per label.
        gold_spans = set()
        gold_by_label = {label: set() for label in labels}
        for span in getter(gold_doc, attr):
            key = (span.label_, span.start, span.end - 1)
            gold_spans.add(key)
            gold_by_label[span.label_].add(key)
        # Predicted spans, after passing them through the example's
        # x2y span alignment (presumably this maps them onto the reference
        # tokenization -- confirm against Example.get_aligned_spans_x2y).
        pred_spans = set()
        pred_by_label = {label: set() for label in labels}
        for span in example.get_aligned_spans_x2y(getter(pred_doc, attr)):
            key = (span.label_, span.start, span.end - 1)
            pred_spans.add(key)
            pred_by_label[span.label_].add(key)
        # Update the per-label scores for the labels seen in this example.
        for label, label_score in per_label_scores.items():
            if label not in pred_by_label:
                continue
            label_score.score_set(pred_by_label[label], gold_by_label[label])
        # Update the score over all labels.
        overall.score_set(pred_spans, gold_spans)
    return {
        f"{attr}_p": overall.precision,
        f"{attr}_r": overall.recall,
        f"{attr}_f": overall.fscore,
        f"{attr}_per_type": {
            label: label_score.to_dict()
            for label, label_score in per_label_scores.items()
        },
    }
|
|
|
|
|
|
|
|
|
|
@staticmethod
|
|
|
|
|
def score_cats(
|
2020-07-28 22:39:42 +03:00
|
|
|
|
examples: Iterable[Example],
|
|
|
|
|
attr: str,
|
|
|
|
|
*,
|
2020-07-29 11:39:33 +03:00
|
|
|
|
getter: Callable[[Doc, str], Any] = getattr,
|
2020-07-28 22:39:42 +03:00
|
|
|
|
labels: Iterable[str] = tuple(),
|
|
|
|
|
multi_label: bool = True,
|
|
|
|
|
positive_label: Optional[str] = None,
|
2020-08-06 17:24:13 +03:00
|
|
|
|
threshold: Optional[float] = None,
|
2020-07-28 22:39:42 +03:00
|
|
|
|
**cfg,
|
|
|
|
|
) -> Dict[str, Any]:
|
Refactor the Scorer to improve flexibility (#5731)
* Refactor the Scorer to improve flexibility
Refactor the `Scorer` to improve flexibility for arbitrary pipeline
components.
* Individual pipeline components provide their own `evaluate` methods
that score a list of `Example`s and return a dictionary of scores
* `Scorer` is initialized either:
* with a provided pipeline containing components to be scored
* with a default pipeline containing the built-in statistical
components (senter, tagger, morphologizer, parser, ner)
* `Scorer.score` evaluates a list of `Example`s and returns a dictionary
of scores referring to the scores provided by the components in the
pipeline
Significant differences:
* `tags_acc` is renamed to `tag_acc` to be consistent with `token_acc`
and the new `morph_acc`, `pos_acc`, and `lemma_acc`
* Scoring is no longer cumulative: `Scorer.score` scores a list of
examples rather than a single example and does not retain any state
about previously scored examples
* PRF values in the returned scores are no longer multiplied by 100
* Add kwargs to Morphologizer.evaluate
* Create generalized scoring methods in Scorer
* Generalized static scoring methods are added to `Scorer`
* Methods require an attribute (either on Token or Doc) that is
used to key the returned scores
Naming differences:
* `uas`, `las`, and `las_per_type` in the scores dict are renamed to
`dep_uas`, `dep_las`, and `dep_las_per_type`
Scoring differences:
* `Doc.sents` is now scored as spans rather than on sentence-initial
token positions so that `Doc.sents` and `Doc.ents` can be scored with
the same method (this lowers scores since a single incorrect sentence
start results in two incorrect spans)
* Simplify / extend hasattr check for eval method
* Add hasattr check to tokenizer scoring
* Simplify to hasattr check for component scoring
* Reset Example alignment if docs are set
Reset the Example alignment if either doc is set in case the
tokenization has changed.
* Add PRF tokenization scoring for tokens as spans
Add PRF scores for tokens as character spans. The scores are:
* token_acc: # correct tokens / # gold tokens
* token_p/r/f: PRF for (token.idx, token.idx + len(token))
* Add docstring to Scorer.score_tokenization
* Rename component.evaluate() to component.score()
* Update Scorer API docs
* Update scoring for positive_label in textcat
* Fix TextCategorizer.score kwargs
* Update Language.evaluate docs
* Update score names in default config
2020-07-25 13:53:02 +03:00
|
|
|
|
"""Returns PRF and ROC AUC scores for a doc-level attribute with a
|
2020-07-27 12:17:52 +03:00
|
|
|
|
dict with scores for each label like Doc.cats. The reported overall
|
|
|
|
|
score depends on the scorer settings.
|
Refactor the Scorer to improve flexibility (#5731)
* Refactor the Scorer to improve flexibility
Refactor the `Scorer` to improve flexibility for arbitrary pipeline
components.
* Individual pipeline components provide their own `evaluate` methods
that score a list of `Example`s and return a dictionary of scores
* `Scorer` is initialized either:
* with a provided pipeline containing components to be scored
* with a default pipeline containing the built-in statistical
components (senter, tagger, morphologizer, parser, ner)
* `Scorer.score` evaluates a list of `Example`s and returns a dictionary
of scores referring to the scores provided by the components in the
pipeline
Significant differences:
* `tags_acc` is renamed to `tag_acc` to be consistent with `token_acc`
and the new `morph_acc`, `pos_acc`, and `lemma_acc`
* Scoring is no longer cumulative: `Scorer.score` scores a list of
examples rather than a single example and does not retain any state
about previously scored examples
* PRF values in the returned scores are no longer multiplied by 100
* Add kwargs to Morphologizer.evaluate
* Create generalized scoring methods in Scorer
* Generalized static scoring methods are added to `Scorer`
* Methods require an attribute (either on Token or Doc) that is
used to key the returned scores
Naming differences:
* `uas`, `las`, and `las_per_type` in the scores dict are renamed to
`dep_uas`, `dep_las`, and `dep_las_per_type`
Scoring differences:
* `Doc.sents` is now scored as spans rather than on sentence-initial
token positions so that `Doc.sents` and `Doc.ents` can be scored with
the same method (this lowers scores since a single incorrect sentence
start results in two incorrect spans)
* Simplify / extend hasattr check for eval method
* Add hasattr check to tokenizer scoring
* Simplify to hasattr check for component scoring
* Reset Example alignment if docs are set
Reset the Example alignment if either doc is set in case the
tokenization has changed.
* Add PRF tokenization scoring for tokens as spans
Add PRF scores for tokens as character spans. The scores are:
* token_acc: # correct tokens / # gold tokens
* token_p/r/f: PRF for (token.idx, token.idx + len(token))
* Add docstring to Scorer.score_tokenization
* Rename component.evaluate() to component.score()
* Update Scorer API docs
* Update scoring for positive_label in textcat
* Fix TextCategorizer.score kwargs
* Update Language.evaluate docs
* Update score names in default config
2020-07-25 13:53:02 +03:00
|
|
|
|
|
|
|
|
|
examples (Iterable[Example]): Examples to score
|
|
|
|
|
attr (str): The attribute to score.
|
2020-07-29 11:39:33 +03:00
|
|
|
|
getter (Callable[[Doc, str], Any]): Defaults to getattr. If provided,
|
Refactor the Scorer to improve flexibility (#5731)
* Refactor the Scorer to improve flexibility
Refactor the `Scorer` to improve flexibility for arbitrary pipeline
components.
* Individual pipeline components provide their own `evaluate` methods
that score a list of `Example`s and return a dictionary of scores
* `Scorer` is initialized either:
* with a provided pipeline containing components to be scored
* with a default pipeline containing the built-in statistical
components (senter, tagger, morphologizer, parser, ner)
* `Scorer.score` evaluates a list of `Example`s and returns a dictionary
of scores referring to the scores provided by the components in the
pipeline
Significant differences:
* `tags_acc` is renamed to `tag_acc` to be consistent with `token_acc`
and the new `morph_acc`, `pos_acc`, and `lemma_acc`
* Scoring is no longer cumulative: `Scorer.score` scores a list of
examples rather than a single example and does not retain any state
about previously scored examples
* PRF values in the returned scores are no longer multiplied by 100
* Add kwargs to Morphologizer.evaluate
* Create generalized scoring methods in Scorer
* Generalized static scoring methods are added to `Scorer`
* Methods require an attribute (either on Token or Doc) that is
used to key the returned scores
Naming differences:
* `uas`, `las`, and `las_per_type` in the scores dict are renamed to
`dep_uas`, `dep_las`, and `dep_las_per_type`
Scoring differences:
* `Doc.sents` is now scored as spans rather than on sentence-initial
token positions so that `Doc.sents` and `Doc.ents` can be scored with
the same method (this lowers scores since a single incorrect sentence
start results in two incorrect spans)
* Simplify / extend hasattr check for eval method
* Add hasattr check to tokenizer scoring
* Simplify to hasattr check for component scoring
* Reset Example alignment if docs are set
Reset the Example alignment if either doc is set in case the
tokenization has changed.
* Add PRF tokenization scoring for tokens as spans
Add PRF scores for tokens as character spans. The scores are:
* token_acc: # correct tokens / # gold tokens
* token_p/r/f: PRF for (token.idx, token.idx + len(token))
* Add docstring to Scorer.score_tokenization
* Rename component.evaluate() to component.score()
* Update Scorer API docs
* Update scoring for positive_label in textcat
* Fix TextCategorizer.score kwargs
* Update Language.evaluate docs
* Update score names in default config
2020-07-25 13:53:02 +03:00
|
|
|
|
getter(doc, attr) should return the values for the individual doc.
|
|
|
|
|
labels (Iterable[str]): The set of possible labels. Defaults to [].
|
|
|
|
|
multi_label (bool): Whether the attribute allows multiple labels.
|
|
|
|
|
Defaults to True.
|
|
|
|
|
positive_label (str): The positive label for a binary task with
|
|
|
|
|
exclusive classes. Defaults to None.
|
2020-08-06 17:24:13 +03:00
|
|
|
|
threshold (float): Cutoff to consider a prediction "positive". Defaults
|
|
|
|
|
to 0.5 for multi-label, and 0.0 (i.e. whatever's highest scoring)
|
|
|
|
|
otherwise.
|
2020-07-28 22:39:42 +03:00
|
|
|
|
RETURNS (Dict[str, Any]): A dictionary containing the scores, with
|
|
|
|
|
inapplicable scores as None:
|
2020-07-27 12:17:52 +03:00
|
|
|
|
for all:
|
2020-08-06 17:24:13 +03:00
|
|
|
|
attr_score (one of attr_micro_f / attr_macro_f / attr_macro_auc),
|
2020-07-27 12:17:52 +03:00
|
|
|
|
attr_score_desc (text description of the overall score),
|
2020-08-06 17:24:13 +03:00
|
|
|
|
attr_micro_f,
|
|
|
|
|
attr_macro_f,
|
|
|
|
|
attr_auc,
|
2020-07-27 12:17:52 +03:00
|
|
|
|
attr_f_per_type,
|
|
|
|
|
attr_auc_per_type
|
2020-07-28 22:39:42 +03:00
|
|
|
|
|
|
|
|
|
DOCS: https://spacy.io/api/scorer#score_cats
|
2019-05-24 15:06:04 +03:00
|
|
|
|
"""
|
2020-08-06 17:24:13 +03:00
|
|
|
|
if threshold is None:
|
|
|
|
|
threshold = 0.5 if multi_label else 0.0
|
|
|
|
|
f_per_type = {label: PRFScore() for label in labels}
|
|
|
|
|
auc_per_type = {label: ROCAUCScore() for label in labels}
|
|
|
|
|
labels = set(labels)
|
|
|
|
|
if labels:
|
|
|
|
|
for eg in examples:
|
|
|
|
|
labels.update(eg.predicted.cats.keys())
|
|
|
|
|
labels.update(eg.reference.cats.keys())
|
Refactor the Scorer to improve flexibility (#5731)
* Refactor the Scorer to improve flexibility
Refactor the `Scorer` to improve flexibility for arbitrary pipeline
components.
* Individual pipeline components provide their own `evaluate` methods
that score a list of `Example`s and return a dictionary of scores
* `Scorer` is initialized either:
* with a provided pipeline containing components to be scored
* with a default pipeline containing the built-in statistical
components (senter, tagger, morphologizer, parser, ner)
* `Scorer.score` evaluates a list of `Example`s and returns a dictionary
of scores referring to the scores provided by the components in the
pipeline
Significant differences:
* `tags_acc` is renamed to `tag_acc` to be consistent with `token_acc`
and the new `morph_acc`, `pos_acc`, and `lemma_acc`
* Scoring is no longer cumulative: `Scorer.score` scores a list of
examples rather than a single example and does not retain any state
about previously scored examples
* PRF values in the returned scores are no longer multiplied by 100
* Add kwargs to Morphologizer.evaluate
* Create generalized scoring methods in Scorer
* Generalized static scoring methods are added to `Scorer`
* Methods require an attribute (either on Token or Doc) that is
used to key the returned scores
Naming differences:
* `uas`, `las`, and `las_per_type` in the scores dict are renamed to
`dep_uas`, `dep_las`, and `dep_las_per_type`
Scoring differences:
* `Doc.sents` is now scored as spans rather than on sentence-initial
token positions so that `Doc.sents` and `Doc.ents` can be scored with
the same method (this lowers scores since a single incorrect sentence
start results in two incorrect spans)
* Simplify / extend hasattr check for eval method
* Add hasattr check to tokenizer scoring
* Simplify to hasattr check for component scoring
* Reset Example alignment if docs are set
Reset the Example alignment if either doc is set in case the
tokenization has changed.
* Add PRF tokenization scoring for tokens as spans
Add PRF scores for tokens as character spans. The scores are:
* token_acc: # correct tokens / # gold tokens
* token_p/r/f: PRF for (token.idx, token.idx + len(token))
* Add docstring to Scorer.score_tokenization
* Rename component.evaluate() to component.score()
* Update Scorer API docs
* Update scoring for positive_label in textcat
* Fix TextCategorizer.score kwargs
* Update Language.evaluate docs
* Update score names in default config
2020-07-25 13:53:02 +03:00
|
|
|
|
for example in examples:
|
2020-08-06 17:24:13 +03:00
|
|
|
|
# Through this loop, None in the gold_cats indicates missing label.
|
|
|
|
|
pred_cats = getter(example.predicted, attr)
|
|
|
|
|
gold_cats = getter(example.reference, attr)
|
|
|
|
|
|
|
|
|
|
# I think the AUC metric is applicable regardless of whether we're
|
|
|
|
|
# doing multi-label classification? Unsure. If not, move this into
|
|
|
|
|
# the elif pred_cats and gold_cats block below.
|
|
|
|
|
for label in labels:
|
|
|
|
|
pred_score = pred_cats.get(label, 0.0)
|
|
|
|
|
gold_score = gold_cats.get(label, 0.0)
|
|
|
|
|
if gold_score is not None:
|
|
|
|
|
auc_per_type[label].score_set(pred_score, gold_score)
|
|
|
|
|
if multi_label:
|
|
|
|
|
for label in labels:
|
|
|
|
|
pred_score = pred_cats.get(label, 0.0)
|
|
|
|
|
gold_score = gold_cats.get(label, 0.0)
|
|
|
|
|
if gold_score is not None:
|
|
|
|
|
if pred_score >= threshold and gold_score > 0:
|
|
|
|
|
f_per_type[label].tp += 1
|
|
|
|
|
elif pred_score >= threshold and gold_score == 0:
|
|
|
|
|
f_per_type[label].fp += 1
|
|
|
|
|
elif pred_score < threshold and gold_score > 0:
|
|
|
|
|
f_per_type[label].fn += 1
|
|
|
|
|
elif pred_cats and gold_cats:
|
|
|
|
|
# Get the highest-scoring for each.
|
|
|
|
|
pred_label, pred_score = max(pred_cats.items(), key=lambda it: it[1])
|
|
|
|
|
gold_label, gold_score = max(gold_cats.items(), key=lambda it: it[1])
|
|
|
|
|
if gold_score is not None:
|
|
|
|
|
if pred_label == gold_label and pred_score >= threshold:
|
|
|
|
|
f_per_type[pred_label].tp += 1
|
|
|
|
|
else:
|
|
|
|
|
f_per_type[gold_label].fn += 1
|
|
|
|
|
if pred_score >= threshold:
|
|
|
|
|
f_per_type[pred_label].fp += 1
|
|
|
|
|
elif gold_cats:
|
|
|
|
|
gold_label, gold_score = max(gold_cats, key=lambda it: it[1])
|
|
|
|
|
if gold_score is not None and gold_score > 0:
|
|
|
|
|
f_per_type[gold_label].fn += 1
|
|
|
|
|
else:
|
|
|
|
|
pred_label, pred_score = max(pred_cats, key=lambda it: it[1])
|
|
|
|
|
if pred_score >= threshold:
|
|
|
|
|
f_per_type[pred_label].fp += 1
|
|
|
|
|
micro_prf = PRFScore()
|
|
|
|
|
for label_prf in f_per_type.values():
|
|
|
|
|
micro_prf.tp = label_prf.tp
|
|
|
|
|
micro_prf.fn = label_prf.fn
|
|
|
|
|
micro_prf.fp = label_prf.fp
|
|
|
|
|
n_cats = len(f_per_type) + 1e-100
|
|
|
|
|
macro_p = sum(prf.precision for prf in f_per_type.values()) / n_cats
|
|
|
|
|
macro_r = sum(prf.recall for prf in f_per_type.values()) / n_cats
|
|
|
|
|
macro_f = sum(prf.fscore for prf in f_per_type.values()) / n_cats
|
Refactor the Scorer to improve flexibility (#5731)
* Refactor the Scorer to improve flexibility
Refactor the `Scorer` to improve flexibility for arbitrary pipeline
components.
* Individual pipeline components provide their own `evaluate` methods
that score a list of `Example`s and return a dictionary of scores
* `Scorer` is initialized either:
* with a provided pipeline containing components to be scored
* with a default pipeline containing the built-in statistical
components (senter, tagger, morphologizer, parser, ner)
* `Scorer.score` evaluates a list of `Example`s and returns a dictionary
of scores referring to the scores provided by the components in the
pipeline
Significant differences:
* `tags_acc` is renamed to `tag_acc` to be consistent with `token_acc`
and the new `morph_acc`, `pos_acc`, and `lemma_acc`
* Scoring is no longer cumulative: `Scorer.score` scores a list of
examples rather than a single example and does not retain any state
about previously scored examples
* PRF values in the returned scores are no longer multiplied by 100
* Add kwargs to Morphologizer.evaluate
* Create generalized scoring methods in Scorer
* Generalized static scoring methods are added to `Scorer`
* Methods require an attribute (either on Token or Doc) that is
used to key the returned scores
Naming differences:
* `uas`, `las`, and `las_per_type` in the scores dict are renamed to
`dep_uas`, `dep_las`, and `dep_las_per_type`
Scoring differences:
* `Doc.sents` is now scored as spans rather than on sentence-initial
token positions so that `Doc.sents` and `Doc.ents` can be scored with
the same method (this lowers scores since a single incorrect sentence
start results in two incorrect spans)
* Simplify / extend hasattr check for eval method
* Add hasattr check to tokenizer scoring
* Simplify to hasattr check for component scoring
* Reset Example alignment if docs are set
Reset the Example alignment if either doc is set in case the
tokenization has changed.
* Add PRF tokenization scoring for tokens as spans
Add PRF scores for tokens as character spans. The scores are:
* token_acc: # correct tokens / # gold tokens
* token_p/r/f: PRF for (token.idx, token.idx + len(token))
* Add docstring to Scorer.score_tokenization
* Rename component.evaluate() to component.score()
* Update Scorer API docs
* Update scoring for positive_label in textcat
* Fix TextCategorizer.score kwargs
* Update Language.evaluate docs
* Update score names in default config
2020-07-25 13:53:02 +03:00
|
|
|
|
results = {
|
2020-07-28 22:39:42 +03:00
|
|
|
|
f"{attr}_score": None,
|
|
|
|
|
f"{attr}_score_desc": None,
|
2020-08-06 17:24:13 +03:00
|
|
|
|
f"{attr}_micro_p": micro_prf.precision,
|
|
|
|
|
f"{attr}_micro_r": micro_prf.recall,
|
|
|
|
|
f"{attr}_micro_f": micro_prf.fscore,
|
|
|
|
|
f"{attr}_macro_p": macro_p,
|
|
|
|
|
f"{attr}_macro_r": macro_r,
|
|
|
|
|
f"{attr}_macro_f": macro_f,
|
2020-07-28 22:39:42 +03:00
|
|
|
|
f"{attr}_macro_auc": None,
|
|
|
|
|
f"{attr}_f_per_type": {k: v.to_dict() for k, v in f_per_type.items()},
|
|
|
|
|
f"{attr}_auc_per_type": {k: v.score for k, v in auc_per_type.items()},
|
Refactor the Scorer to improve flexibility (#5731)
* Refactor the Scorer to improve flexibility
Refactor the `Scorer` to improve flexibility for arbitrary pipeline
components.
* Individual pipeline components provide their own `evaluate` methods
that score a list of `Example`s and return a dictionary of scores
* `Scorer` is initialized either:
* with a provided pipeline containing components to be scored
* with a default pipeline containing the built-in statistical
components (senter, tagger, morphologizer, parser, ner)
* `Scorer.score` evaluates a list of `Example`s and returns a dictionary
of scores referring to the scores provided by the components in the
pipeline
Significant differences:
* `tags_acc` is renamed to `tag_acc` to be consistent with `token_acc`
and the new `morph_acc`, `pos_acc`, and `lemma_acc`
* Scoring is no longer cumulative: `Scorer.score` scores a list of
examples rather than a single example and does not retain any state
about previously scored examples
* PRF values in the returned scores are no longer multiplied by 100
* Add kwargs to Morphologizer.evaluate
* Create generalized scoring methods in Scorer
* Generalized static scoring methods are added to `Scorer`
* Methods require an attribute (either on Token or Doc) that is
used to key the returned scores
Naming differences:
* `uas`, `las`, and `las_per_type` in the scores dict are renamed to
`dep_uas`, `dep_las`, and `dep_las_per_type`
Scoring differences:
* `Doc.sents` is now scored as spans rather than on sentence-initial
token positions so that `Doc.sents` and `Doc.ents` can be scored with
the same method (this lowers scores since a single incorrect sentence
start results in two incorrect spans)
* Simplify / extend hasattr check for eval method
* Add hasattr check to tokenizer scoring
* Simplify to hasattr check for component scoring
* Reset Example alignment if docs are set
Reset the Example alignment if either doc is set in case the
tokenization has changed.
* Add PRF tokenization scoring for tokens as spans
Add PRF scores for tokens as character spans. The scores are:
* token_acc: # correct tokens / # gold tokens
* token_p/r/f: PRF for (token.idx, token.idx + len(token))
* Add docstring to Scorer.score_tokenization
* Rename component.evaluate() to component.score()
* Update Scorer API docs
* Update scoring for positive_label in textcat
* Fix TextCategorizer.score kwargs
* Update Language.evaluate docs
* Update score names in default config
2020-07-25 13:53:02 +03:00
|
|
|
|
}
|
|
|
|
|
if len(labels) == 2 and not multi_label and positive_label:
|
2020-08-09 23:36:23 +03:00
|
|
|
|
positive_label_f = results[f"{attr}_f_per_type"][positive_label]["f"]
|
2020-08-06 17:24:13 +03:00
|
|
|
|
results[f"{attr}_score"] = positive_label_f
|
2020-07-28 22:39:42 +03:00
|
|
|
|
results[f"{attr}_score_desc"] = f"F ({positive_label})"
|
Refactor the Scorer to improve flexibility (#5731)
* Refactor the Scorer to improve flexibility
Refactor the `Scorer` to improve flexibility for arbitrary pipeline
components.
* Individual pipeline components provide their own `evaluate` methods
that score a list of `Example`s and return a dictionary of scores
* `Scorer` is initialized either:
* with a provided pipeline containing components to be scored
* with a default pipeline containing the built-in statistical
components (senter, tagger, morphologizer, parser, ner)
* `Scorer.score` evaluates a list of `Example`s and returns a dictionary
of scores referring to the scores provided by the components in the
pipeline
Significant differences:
* `tags_acc` is renamed to `tag_acc` to be consistent with `token_acc`
and the new `morph_acc`, `pos_acc`, and `lemma_acc`
* Scoring is no longer cumulative: `Scorer.score` scores a list of
examples rather than a single example and does not retain any state
about previously scored examples
* PRF values in the returned scores are no longer multiplied by 100
* Add kwargs to Morphologizer.evaluate
* Create generalized scoring methods in Scorer
* Generalized static scoring methods are added to `Scorer`
* Methods require an attribute (either on Token or Doc) that is
used to key the returned scores
Naming differences:
* `uas`, `las`, and `las_per_type` in the scores dict are renamed to
`dep_uas`, `dep_las`, and `dep_las_per_type`
Scoring differences:
* `Doc.sents` is now scored as spans rather than on sentence-initial
token positions so that `Doc.sents` and `Doc.ents` can be scored with
the same method (this lowers scores since a single incorrect sentence
start results in two incorrect spans)
* Simplify / extend hasattr check for eval method
* Add hasattr check to tokenizer scoring
* Simplify to hasattr check for component scoring
* Reset Example alignment if docs are set
Reset the Example alignment if either doc is set in case the
tokenization has changed.
* Add PRF tokenization scoring for tokens as spans
Add PRF scores for tokens as character spans. The scores are:
* token_acc: # correct tokens / # gold tokens
* token_p/r/f: PRF for (token.idx, token.idx + len(token))
* Add docstring to Scorer.score_tokenization
* Rename component.evaluate() to component.score()
* Update Scorer API docs
* Update scoring for positive_label in textcat
* Fix TextCategorizer.score kwargs
* Update Language.evaluate docs
* Update score names in default config
2020-07-25 13:53:02 +03:00
|
|
|
|
elif not multi_label:
|
2020-07-28 22:39:42 +03:00
|
|
|
|
results[f"{attr}_score"] = results[f"{attr}_macro_f"]
|
|
|
|
|
results[f"{attr}_score_desc"] = "macro F"
|
Refactor the Scorer to improve flexibility (#5731)
* Refactor the Scorer to improve flexibility
Refactor the `Scorer` to improve flexibility for arbitrary pipeline
components.
* Individual pipeline components provide their own `evaluate` methods
that score a list of `Example`s and return a dictionary of scores
* `Scorer` is initialized either:
* with a provided pipeline containing components to be scored
* with a default pipeline containing the built-in statistical
components (senter, tagger, morphologizer, parser, ner)
* `Scorer.score` evaluates a list of `Example`s and returns a dictionary
of scores referring to the scores provided by the components in the
pipeline
Significant differences:
* `tags_acc` is renamed to `tag_acc` to be consistent with `token_acc`
and the new `morph_acc`, `pos_acc`, and `lemma_acc`
* Scoring is no longer cumulative: `Scorer.score` scores a list of
examples rather than a single example and does not retain any state
about previously scored examples
* PRF values in the returned scores are no longer multiplied by 100
* Add kwargs to Morphologizer.evaluate
* Create generalized scoring methods in Scorer
* Generalized static scoring methods are added to `Scorer`
* Methods require an attribute (either on Token or Doc) that is
used to key the returned scores
Naming differences:
* `uas`, `las`, and `las_per_type` in the scores dict are renamed to
`dep_uas`, `dep_las`, and `dep_las_per_type`
Scoring differences:
* `Doc.sents` is now scored as spans rather than on sentence-initial
token positions so that `Doc.sents` and `Doc.ents` can be scored with
the same method (this lowers scores since a single incorrect sentence
start results in two incorrect spans)
* Simplify / extend hasattr check for eval method
* Add hasattr check to tokenizer scoring
* Simplify to hasattr check for component scoring
* Reset Example alignment if docs are set
Reset the Example alignment if either doc is set in case the
tokenization has changed.
* Add PRF tokenization scoring for tokens as spans
Add PRF scores for tokens as character spans. The scores are:
* token_acc: # correct tokens / # gold tokens
* token_p/r/f: PRF for (token.idx, token.idx + len(token))
* Add docstring to Scorer.score_tokenization
* Rename component.evaluate() to component.score()
* Update Scorer API docs
* Update scoring for positive_label in textcat
* Fix TextCategorizer.score kwargs
* Update Language.evaluate docs
* Update score names in default config
2020-07-25 13:53:02 +03:00
|
|
|
|
else:
|
2020-07-28 22:39:42 +03:00
|
|
|
|
results[f"{attr}_score"] = results[f"{attr}_macro_auc"]
|
|
|
|
|
results[f"{attr}_score_desc"] = "macro AUC"
|
Refactor the Scorer to improve flexibility (#5731)
* Refactor the Scorer to improve flexibility
Refactor the `Scorer` to improve flexibility for arbitrary pipeline
components.
* Individual pipeline components provide their own `evaluate` methods
that score a list of `Example`s and return a dictionary of scores
* `Scorer` is initialized either:
* with a provided pipeline containing components to be scored
* with a default pipeline containing the built-in statistical
components (senter, tagger, morphologizer, parser, ner)
* `Scorer.score` evaluates a list of `Example`s and returns a dictionary
of scores referring to the scores provided by the components in the
pipeline
Significant differences:
* `tags_acc` is renamed to `tag_acc` to be consistent with `token_acc`
and the new `morph_acc`, `pos_acc`, and `lemma_acc`
* Scoring is no longer cumulative: `Scorer.score` scores a list of
examples rather than a single example and does not retain any state
about previously scored examples
* PRF values in the returned scores are no longer multiplied by 100
* Add kwargs to Morphologizer.evaluate
* Create generalized scoring methods in Scorer
* Generalized static scoring methods are added to `Scorer`
* Methods require an attribute (either on Token or Doc) that is
used to key the returned scores
Naming differences:
* `uas`, `las`, and `las_per_type` in the scores dict are renamed to
`dep_uas`, `dep_las`, and `dep_las_per_type`
Scoring differences:
* `Doc.sents` is now scored as spans rather than on sentence-initial
token positions so that `Doc.sents` and `Doc.ents` can be scored with
the same method (this lowers scores since a single incorrect sentence
start results in two incorrect spans)
* Simplify / extend hasattr check for eval method
* Add hasattr check to tokenizer scoring
* Simplify to hasattr check for component scoring
* Reset Example alignment if docs are set
Reset the Example alignment if either doc is set in case the
tokenization has changed.
* Add PRF tokenization scoring for tokens as spans
Add PRF scores for tokens as character spans. The scores are:
* token_acc: # correct tokens / # gold tokens
* token_p/r/f: PRF for (token.idx, token.idx + len(token))
* Add docstring to Scorer.score_tokenization
* Rename component.evaluate() to component.score()
* Update Scorer API docs
* Update scoring for positive_label in textcat
* Fix TextCategorizer.score kwargs
* Update Language.evaluate docs
* Update score names in default config
2020-07-25 13:53:02 +03:00
|
|
|
|
return results
|
|
|
|
|
|
|
|
|
|
@staticmethod
def score_deps(
    examples: Iterable[Example],
    attr: str,
    *,
    getter: Callable[[Token, str], Any] = getattr,
    head_attr: str = "head",
    head_getter: Callable[[Token, str], Token] = getattr,
    ignore_labels: Tuple[str] = tuple(),
    **cfg,
) -> Dict[str, Any]:
    """Compute the unlabelled (UAS) and labelled (LAS) attachment scores
    for dependency parses, plus a per-label breakdown of the labelled score.

    examples (Iterable[Example]): The examples to score.
    attr (str): Name of the attribute holding the dependency label. It is
        also used as the prefix of the keys in the returned dictionary.
    getter (Callable[[Token, str], Any]): Called as getter(token, attr) to
        read the dependency label of a single token. Defaults to getattr.
    head_attr (str): Name of the attribute holding the head token. Defaults
        to 'head'.
    head_getter (Callable[[Token, str], Token]): Called as
        head_getter(token, head_attr) to read the head of a single token.
        Defaults to getattr.
    ignore_labels (Tuple): Labels that are left out of the scores
        (e.g., punct).
    RETURNS (Dict[str, Any]): A dictionary containing the scores:
        attr_uas, attr_las, and attr_las_per_type.

    DOCS: https://spacy.io/api/scorer#score_deps
    """
    uas_score = PRFScore()
    las_score = PRFScore()
    # One PRFScore per label, created the first time a label is seen in
    # either the reference or the predicted doc.
    las_by_label = {}
    for example in examples:
        reference = example.reference
        predicted = example.predicted
        alignment = example.alignment

        # Collect the reference arcs as (token index, head index, label).
        gold_arcs = set()
        gold_arcs_by_label = {}
        for gold_index, gold_token in enumerate(reference):
            label = getter(gold_token, attr)
            gold_head_token = head_getter(gold_token, head_attr)
            if label in ignore_labels:
                continue
            arc = (gold_index, gold_head_token.i, label)
            gold_arcs.add(arc)
            if label not in las_by_label:
                las_by_label[label] = PRFScore()
            gold_arcs_by_label.setdefault(label, set()).add(arc)

        # Collect the predicted arcs, expressed in reference token indices
        # so that they can be compared against the reference arcs directly.
        pred_arcs = set()
        pred_arcs_by_label = {}
        for pred_token in predicted:
            if pred_token.orth_.isspace():
                continue
            # Only tokens aligned to exactly one reference token get an index.
            gold_index = (
                alignment.x2y[pred_token.i].dataXd[0, 0]
                if alignment.x2y.lengths[pred_token.i] == 1
                else None
            )
            label = getter(pred_token, attr)
            pred_head_token = head_getter(pred_token, head_attr)
            if label in ignore_labels or not pred_token.orth_.strip():
                continue
            gold_head_index = (
                alignment.x2y[pred_head_token.i].dataXd[0, 0]
                if alignment.x2y.lengths[pred_head_token.i] == 1
                else None
            )
            if gold_index is None or gold_head_index is None:
                # Arcs with an unaligned end cannot go into the set: several
                # of them would collapse into a single (None, None) entry.
                # Count each one directly as a false positive instead.
                uas_score.fp += 1
                las_score.fp += 1
                continue
            arc = (gold_index, gold_head_index, label)
            pred_arcs.add(arc)
            if label not in las_by_label:
                las_by_label[label] = PRFScore()
            pred_arcs_by_label.setdefault(label, set()).add(arc)

        las_score.score_set(pred_arcs, gold_arcs)
        for label, label_score in las_by_label.items():
            label_score.score_set(
                pred_arcs_by_label.get(label, set()),
                gold_arcs_by_label.get(label, set()),
            )
        # The unlabelled score compares the arcs with the label dropped.
        uas_score.score_set(
            {arc[:2] for arc in pred_arcs}, {arc[:2] for arc in gold_arcs}
        )
    per_type = {label: score.to_dict() for label, score in las_by_label.items()}
    return {
        f"{attr}_uas": uas_score.fscore,
        f"{attr}_las": las_score.fscore,
        f"{attr}_las_per_type": per_type,
    }
|
2019-09-15 23:31:31 +03:00
|
|
|
|
|
|
|
|
|
|
|
|
|
|
#############################################################################
|
|
|
|
|
#
|
|
|
|
|
# The following implementation of roc_auc_score() is adapted from
|
|
|
|
|
# scikit-learn, which is distributed under the following license:
|
|
|
|
|
#
|
|
|
|
|
# New BSD License
|
|
|
|
|
#
|
|
|
|
|
# Copyright (c) 2007–2019 The scikit-learn developers.
|
|
|
|
|
# All rights reserved.
|
|
|
|
|
#
|
|
|
|
|
#
|
|
|
|
|
# Redistribution and use in source and binary forms, with or without
|
|
|
|
|
# modification, are permitted provided that the following conditions are met:
|
|
|
|
|
#
|
|
|
|
|
# a. Redistributions of source code must retain the above copyright notice,
|
|
|
|
|
# this list of conditions and the following disclaimer.
|
|
|
|
|
# b. Redistributions in binary form must reproduce the above copyright
|
|
|
|
|
# notice, this list of conditions and the following disclaimer in the
|
|
|
|
|
# documentation and/or other materials provided with the distribution.
|
|
|
|
|
# c. Neither the name of the Scikit-learn Developers nor the names of
|
|
|
|
|
# its contributors may be used to endorse or promote products
|
|
|
|
|
# derived from this software without specific prior written
|
2019-09-18 20:56:55 +03:00
|
|
|
|
# permission.
|
2019-09-15 23:31:31 +03:00
|
|
|
|
#
|
|
|
|
|
#
|
|
|
|
|
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
|
|
|
|
# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
|
|
|
|
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
|
|
|
|
# ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE FOR
|
|
|
|
|
# ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
|
|
|
|
# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
|
|
|
|
|
# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
|
|
|
|
|
# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
|
|
|
|
|
# LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
|
|
|
|
|
# OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH
|
|
|
|
|
# DAMAGE.
|
|
|
|
|
|
2019-09-18 20:56:55 +03:00
|
|
|
|
|
2019-09-15 23:31:31 +03:00
|
|
|
|
def _roc_auc_score(y_true, y_score):
    """Return the area under the ROC curve for a binary classification task.

    Only the binary case is handled: ``y_true`` must contain exactly two
    distinct values. The ROC points are produced by ``_roc_curve`` and then
    integrated by ``_auc``.

    Parameters
    ----------
    y_true : array-like
        True labels, one per sample. Must hold exactly two distinct values;
        the curve helpers treat the value ``1`` as the positive class.

    y_score : array-like
        Target scores, one per sample: probability estimates of the positive
        class, confidence values, or non-thresholded decision values.

    Returns
    -------
    auc : float

    Raises
    ------
    ValueError
        With message ``Errors.E165`` when ``y_true`` does not contain exactly
        two distinct values.

    References
    ----------
    .. [1] `Wikipedia entry for the Receiver operating characteristic
            <https://en.wikipedia.org/wiki/Receiver_operating_characteristic>`_

    .. [2] Fawcett T. An introduction to ROC analysis[J]. Pattern Recognition
           Letters, 2006, 27(8):861-874.

    .. [3] `Analyzing a portion of the ROC curve. McClish, 1989
            <https://www.ncbi.nlm.nih.gov/pubmed/2668680>`_
    """
    distinct_labels = np.unique(y_true)
    if distinct_labels.size != 2:
        raise ValueError(Errors.E165)
    false_pos_rate, true_pos_rate, _thresholds = _roc_curve(y_true, y_score)
    return _auc(false_pos_rate, true_pos_rate)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def _roc_curve(y_true, y_score):
    """Compute the points of a Receiver Operating Characteristic (ROC) curve.

    Note: this implementation is restricted to the binary classification task.

    Parameters
    ----------
    y_true : array-like
        True binary labels, one per sample. The value ``1`` is treated as the
        positive class by ``_binary_clf_curve``.

    y_score : array-like
        Target scores, one per sample: probability estimates of the positive
        class, confidence values, or non-thresholded decision values.

    Returns
    -------
    fpr : array, shape = [n_thresholds]
        Increasing false positive rates such that element i is the false
        positive rate of predictions with score >= thresholds[i]. Filled with
        NaN when there are no negative samples.

    tpr : array, shape = [n_thresholds]
        Increasing true positive rates such that element i is the true
        positive rate of predictions with score >= thresholds[i]. Filled with
        NaN when there are no positive samples.

    thresholds : array, shape = [n_thresholds]
        Decreasing score thresholds used to compute fpr and tpr.
        ``thresholds[0]`` represents no instances being predicted positive
        and is set to the highest score plus one.

    References
    ----------
    .. [1] `Wikipedia entry for the Receiver operating characteristic
            <https://en.wikipedia.org/wiki/Receiver_operating_characteristic>`_

    .. [2] Fawcett T. An introduction to ROC analysis[J]. Pattern Recognition
           Letters, 2006, 27(8):861-874.
    """
    false_pos, true_pos, cutoffs = _binary_clf_curve(y_true, y_score)

    # Prepend one synthetic point so that the curve starts at (0, 0); its
    # threshold lies just above the largest observed score.
    false_pos = np.r_[0, false_pos]
    true_pos = np.r_[0, true_pos]
    cutoffs = np.r_[cutoffs[0] + 1, cutoffs]

    def _to_rate(counts):
        # The last cumulative count is the class total; without any sample of
        # that class the rate is undefined, so NaN is reported throughout.
        total = counts[-1]
        if total <= 0:
            return np.repeat(np.nan, counts.shape)
        return counts / total

    return _to_rate(false_pos), _to_rate(true_pos), cutoffs
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def _binary_clf_curve(y_true, y_score):
    """Count true and false positives at every distinct score threshold.

    Parameters
    ----------
    y_true : array-like
        True targets of binary classification. Samples equal to ``1.0`` are
        the positive class; every other value counts as negative.

    y_score : array-like
        Estimated probabilities or decision function values.

    Returns
    -------
    fps : array, shape = [n_thresholds]
        Count of false positives: element i is the number of negative samples
        with a score >= thresholds[i]. ``fps[-1]`` is the total number of
        negative samples.

    tps : array, shape = [n_thresholds]
        Increasing count of true positives: element i is the number of
        positive samples with a score >= thresholds[i]. ``tps[-1]`` is the
        total number of positive samples.

    thresholds : array, shape = [n_thresholds]
        Decreasing distinct score values.
    """
    labels = np.ravel(y_true)
    scores = np.ravel(y_score)

    # Boolean mask of the positive class (label 1.0).
    is_positive = labels == 1.0

    # Stable ascending sort, reversed, puts the highest scores first.
    order = np.argsort(scores, kind="mergesort")[::-1]
    scores = scores[order]
    is_positive = is_positive[order]

    # Scores usually contain ties. Keep only the position of the last sample
    # in each run of equal scores, plus the final position so that the curve
    # always reaches its end point.
    run_ends = np.nonzero(np.diff(scores))[0]
    threshold_idxs = np.r_[run_ends, is_positive.size - 1]

    # Running number of positives seen as the threshold decreases; each
    # sample carries unit weight.
    cumulative_pos = _stable_cumsum(is_positive * 1.0)
    tps = cumulative_pos[threshold_idxs]
    # (1 + index) samples lie at or above each threshold; the non-positives
    # among them are the false positives.
    fps = 1 + threshold_idxs - tps
    return fps, tps, scores[threshold_idxs]
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def _stable_cumsum(arr, axis=None, rtol=1e-05, atol=1e-08):
    """Cumulative sum in float64, verified against a direct float64 sum.

    Parameters
    ----------
    arr : array-like
        Values to be cumulatively summed.
    axis : int, optional
        Axis along which the cumulative sum is computed.
        The default (None) is to compute the cumsum over the flattened array.
    rtol : float
        Relative tolerance, see ``np.isclose``
    atol : float
        Absolute tolerance, see ``np.isclose``

    Returns
    -------
    out : ndarray of float64
        The cumulative sum. An input without elements yields the (empty)
        cumulative sum unchanged.

    Raises
    ------
    ValueError
        With message ``Errors.E163`` when the last cumulative value is not
        close to the directly computed sum.
    """
    out = np.cumsum(arr, axis=axis, dtype=np.float64)
    if out.size == 0:
        # Nothing was accumulated, so there is no final value to verify.
        # Without this guard ``take(-1)`` raises IndexError on an empty axis.
        return out
    expected = np.sum(arr, axis=axis, dtype=np.float64)
    final_values = out.take(-1, axis=axis)
    totals_match = np.isclose(
        final_values, expected, rtol=rtol, atol=atol, equal_nan=True
    )
    if not np.all(totals_match):
        raise ValueError(Errors.E163)
    return out
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def _auc(x, y):
    """Compute Area Under the Curve (AUC) using the trapezoidal rule

    This is a general function, given points on a curve. For computing the
    area under the ROC-curve, see :func:`_roc_auc_score`.

    Parameters
    ----------
    x : array, shape = [n]
        x coordinates. These must be either monotonic increasing or monotonic
        decreasing.
    y : array, shape = [n]
        y coordinates.

    Returns
    -------
    auc : float

    Raises
    ------
    ValueError
        With message ``Errors.E164`` when ``x`` is neither monotonic
        increasing nor monotonic decreasing.
    """
    x = np.ravel(x)
    y = np.ravel(y)

    # Integrating over decreasing x yields a negated area; ``direction``
    # flips the sign back.
    direction = 1
    dx = np.diff(x)
    if np.any(dx < 0):
        if np.all(dx <= 0):
            direction = -1
        else:
            # NOTE(review): the E164 template is not visible from here; x is
            # passed both positionally and by name so that either placeholder
            # style renders instead of failing inside ``format``.
            raise ValueError(Errors.E164.format(x, x=x))

    # ``np.trapz`` is deprecated since NumPy 2.0 in favour of
    # ``np.trapezoid``; prefer the new name and fall back on older releases.
    trapezoid = getattr(np, "trapezoid", None)
    if trapezoid is None:
        trapezoid = np.trapz
    area = direction * trapezoid(y, x)
    if isinstance(area, np.memmap):
        # Reductions such as .sum used internally by the trapezoidal rule do
        # not return a scalar by default for numpy.memmap instances contrary
        # to regular numpy.ndarray instances.
        area = area.dtype.type(area)
    return area
|