2019-09-15 23:31:31 +03:00
|
|
|
from numpy.testing import assert_almost_equal, assert_array_almost_equal
|
|
|
|
import pytest
|
2019-08-01 18:15:36 +03:00
|
|
|
from pytest import approx
|
2020-09-09 11:31:03 +03:00
|
|
|
from spacy.training import Example
|
2020-09-22 12:50:19 +03:00
|
|
|
from spacy.training.iob_utils import offsets_to_biluo_tags
|
2021-04-08 10:34:14 +03:00
|
|
|
from spacy.scorer import Scorer, ROCAUCScore, PRFScore
|
2019-09-15 23:31:31 +03:00
|
|
|
from spacy.scorer import _roc_auc_score, _roc_curve
|
2020-04-02 15:46:32 +03:00
|
|
|
from spacy.lang.en import English
|
2021-04-08 13:19:17 +03:00
|
|
|
from spacy.tokens import Doc, Span
|
2019-08-01 18:15:36 +03:00
|
|
|
|
2020-06-26 20:34:12 +03:00
|
|
|
|
2019-10-31 23:18:16 +03:00
|
|
|
# Dependency-parse test data: a list of [text, annotations] pairs.
# The text is split on single spaces by the test that consumes it, so "heads"
# and "deps" each carry one entry per space-separated token (11 here).
test_las_apple = [
    [
        "Apple is looking at buying U.K. startup for $ 1 billion",
        {
            "heads": [2, 2, 2, 2, 3, 6, 4, 4, 10, 10, 7],
            "deps": [
                "nsubj",
                "aux",
                "ROOT",
                "prep",
                "pcomp",
                "compound",
                "dobj",
                "prep",
                "quantmod",
                "compound",
                "pobj",
            ],
        },
    ]
]
|
|
|
|
|
2019-08-01 18:15:36 +03:00
|
|
|
# NER test data: [text, annotations] pairs. Each entity is
# [start_char, end_char, label]; the two spans below cover "100" and "200".
test_ner_cardinal = [
    ["100 - 200", {"entities": [[0, 3, "CARDINAL"], [6, 9, "CARDINAL"]]}]
]
|
|
|
|
|
|
|
|
# NER test data: [text, annotations] pairs. Each entity is a
# (start_char, end_char, label) tuple; the spans below cover "Apple", "U.K."
# and "$1 billion" in the text.
test_ner_apple = [
    [
        "Apple is looking at buying U.K. startup for $1 billion",
        {"entities": [(0, 5, "ORG"), (27, 31, "GPE"), (44, 54, "MONEY")]},
    ]
]
|
|
|
|
|
2020-06-20 15:15:04 +03:00
|
|
|
|
2020-04-02 15:46:32 +03:00
|
|
|
@pytest.fixture
def tagged_doc():
    """Return an English doc with hand-assigned tags, POS and morphology.

    The text is run through a blank ``English`` pipeline and each of the
    first ``len(tags)`` tokens gets its fine-grained tag, coarse POS and
    morphological features set from the parallel lists below. Every token
    except the first is marked as not starting a sentence.
    """
    text = "Sarah's sister flew to Silicon Valley via London."
    tags = ["NNP", "POS", "NN", "VBD", "IN", "NNP", "NNP", "IN", "NNP", "."]
    pos = [
        "PROPN",
        "PART",
        "NOUN",
        "VERB",
        "ADP",
        "PROPN",
        "PROPN",
        "ADP",
        "PROPN",
        "PUNCT",
    ]
    morphs = [
        "NounType=prop|Number=sing",
        "Poss=yes",
        "Number=sing",
        "Tense=past|VerbForm=fin",
        "",
        "NounType=prop|Number=sing",
        "NounType=prop|Number=sing",
        "",
        "NounType=prop|Number=sing",
        "PunctType=peri",
    ]
    nlp = English()
    doc = nlp(text)
    # Index into the doc (rather than zipping over it) so that a tokenization
    # yielding fewer tokens than annotations fails loudly instead of silently
    # dropping the trailing annotations.
    for i, (tag, pos_value, morph) in enumerate(zip(tags, pos, morphs)):
        token = doc[i]
        token.tag_ = tag
        token.pos_ = pos_value
        token.set_morph(morph)
        if i > 0:
            token.is_sent_start = False
    return doc
|
|
|
|
|
2019-08-18 16:09:16 +03:00
|
|
|
|
Refactor the Scorer to improve flexibility (#5731)
* Refactor the Scorer to improve flexibility
Refactor the `Scorer` to improve flexibility for arbitrary pipeline
components.
* Individual pipeline components provide their own `evaluate` methods
that score a list of `Example`s and return a dictionary of scores
* `Scorer` is initialized either:
* with a provided pipeline containing components to be scored
* with a default pipeline containing the built-in statistical
components (senter, tagger, morphologizer, parser, ner)
* `Scorer.score` evaluates a list of `Example`s and returns a dictionary
of scores referring to the scores provided by the components in the
pipeline
Significant differences:
* `tags_acc` is renamed to `tag_acc` to be consistent with `token_acc`
and the new `morph_acc`, `pos_acc`, and `lemma_acc`
* Scoring is no longer cumulative: `Scorer.score` scores a list of
examples rather than a single example and does not retain any state
about previously scored examples
* PRF values in the returned scores are no longer multiplied by 100
* Add kwargs to Morphologizer.evaluate
* Create generalized scoring methods in Scorer
* Generalized static scoring methods are added to `Scorer`
* Methods require an attribute (either on Token or Doc) that is
used to key the returned scores
Naming differences:
* `uas`, `las`, and `las_per_type` in the scores dict are renamed to
`dep_uas`, `dep_las`, and `dep_las_per_type`
Scoring differences:
* `Doc.sents` is now scored as spans rather than on sentence-initial
token positions so that `Doc.sents` and `Doc.ents` can be scored with
the same method (this lowers scores since a single incorrect sentence
start results in two incorrect spans)
* Simplify / extend hasattr check for eval method
* Add hasattr check to tokenizer scoring
* Simplify to hasattr check for component scoring
* Reset Example alignment if docs are set
Reset the Example alignment if either doc is set in case the
tokenization has changed.
* Add PRF tokenization scoring for tokens as spans
Add PRF scores for tokens as character spans. The scores are:
* token_acc: # correct tokens / # gold tokens
* token_p/r/f: PRF for (token.idx, token.idx + len(token))
* Add docstring to Scorer.score_tokenization
* Rename component.evaluate() to component.score()
* Update Scorer API docs
* Update scoring for positive_label in textcat
* Fix TextCategorizer.score kwargs
* Update Language.evaluate docs
* Update score names in default config
2020-07-25 13:53:02 +03:00
|
|
|
@pytest.fixture
def sented_doc():
    """Return an English doc whose sentence starts are set on every third token.

    The text is tokenized with a blank ``English`` pipeline; tokens at indices
    0, 3, 6, ... are flagged as sentence starts and all others as
    non-starts.
    """
    text = "One sentence. Two sentences. Three sentences."
    nlp = English()
    doc = nlp(text)
    for i, token in enumerate(doc):
        # True on indices divisible by three, False everywhere else.
        token.is_sent_start = i % 3 == 0
    return doc
|
|
|
|
|
|
|
|
|
|
|
|
def test_tokenization(sented_doc):
    """Check token accuracy / PRF for matching and mismatching tokenization."""
    scorer = Scorer()
    gold = {"sent_starts": [t.sent_start for t in sented_doc]}
    example = Example.from_dict(sented_doc, gold)
    scores = scorer.score([example])
    assert scores["token_acc"] == 1.0

    # Replace the prediction with a coarser tokenization in which the final
    # period stays attached to the preceding word.
    nlp = English()
    example.predicted = Doc(
        nlp.vocab,
        words=["One", "sentence.", "Two", "sentences.", "Three", "sentences."],
        spaces=[True, True, True, True, True, False],
    )
    example.predicted[1].is_sent_start = False
    scores = scorer.score([example])
    assert scores["token_acc"] == 0.5
    assert scores["token_p"] == 0.5
    assert scores["token_r"] == approx(0.33333333)
    # F is derived from the inexact recall above, so compare approximately.
    assert scores["token_f"] == approx(0.4)

    # per-component scoring
    scorer = Scorer()
    scores = scorer.score([example], per_component=True)
    assert scores["tokenizer"]["token_acc"] == 0.5
    assert scores["tokenizer"]["token_p"] == 0.5
    assert scores["tokenizer"]["token_r"] == approx(0.33333333)
    assert scores["tokenizer"]["token_f"] == approx(0.4)
|
|
|
|
|
Refactor the Scorer to improve flexibility (#5731)
* Refactor the Scorer to improve flexibility
Refactor the `Scorer` to improve flexibility for arbitrary pipeline
components.
* Individual pipeline components provide their own `evaluate` methods
that score a list of `Example`s and return a dictionary of scores
* `Scorer` is initialized either:
* with a provided pipeline containing components to be scored
* with a default pipeline containing the built-in statistical
components (senter, tagger, morphologizer, parser, ner)
* `Scorer.score` evaluates a list of `Example`s and returns a dictionary
of scores referring to the scores provided by the components in the
pipeline
Significant differences:
* `tags_acc` is renamed to `tag_acc` to be consistent with `token_acc`
and the new `morph_acc`, `pos_acc`, and `lemma_acc`
* Scoring is no longer cumulative: `Scorer.score` scores a list of
examples rather than a single example and does not retain any state
about previously scored examples
* PRF values in the returned scores are no longer multiplied by 100
* Add kwargs to Morphologizer.evaluate
* Create generalized scoring methods in Scorer
* Generalized static scoring methods are added to `Scorer`
* Methods require an attribute (either on Token or Doc) that is
used to key the returned scores
Naming differences:
* `uas`, `las`, and `las_per_type` in the scores dict are renamed to
`dep_uas`, `dep_las`, and `dep_las_per_type`
Scoring differences:
* `Doc.sents` is now scored as spans rather than on sentence-initial
token positions so that `Doc.sents` and `Doc.ents` can be scored with
the same method (this lowers scores since a single incorrect sentence
start results in two incorrect spans)
* Simplify / extend hasattr check for eval method
* Add hasattr check to tokenizer scoring
* Simplify to hasattr check for component scoring
* Reset Example alignment if docs are set
Reset the Example alignment if either doc is set in case the
tokenization has changed.
* Add PRF tokenization scoring for tokens as spans
Add PRF scores for tokens as character spans. The scores are:
* token_acc: # correct tokens / # gold tokens
* token_p/r/f: PRF for (token.idx, token.idx + len(token))
* Add docstring to Scorer.score_tokenization
* Rename component.evaluate() to component.score()
* Update Scorer API docs
* Update scoring for positive_label in textcat
* Fix TextCategorizer.score kwargs
* Update Language.evaluate docs
* Update score names in default config
2020-07-25 13:53:02 +03:00
|
|
|
|
|
|
|
def test_sents(sented_doc):
    """Check the sentence F-score for identical and shifted sentence starts."""
    scorer = Scorer()
    gold = {"sent_starts": [t.sent_start for t in sented_doc]}
    example = Example.from_dict(sented_doc, gold)
    scores = scorer.score([example])
    assert scores["sents_f"] == 1.0

    # One sentence start is moved
    # (from token 3 to token 4 in the gold annotation only)
    gold["sent_starts"][3] = 0
    gold["sent_starts"][4] = 1
    example = Example.from_dict(sented_doc, gold)
    scores = scorer.score([example])
    assert scores["sents_f"] == approx(0.3333333)
|
|
|
|
|
|
|
|
|
2019-10-31 23:18:16 +03:00
|
|
|
def _build_las_examples(vocab, first_dep=None):
    """Build one Example per entry of ``test_las_apple``.

    The predicted doc is created from the space-split text with the gold heads
    and deps. If ``first_dep`` is given, the first token's dependency label in
    the predicted doc is overwritten with it before the Example is created.
    """
    examples = []
    for input_, annot in test_las_apple:
        doc = Doc(
            vocab, words=input_.split(" "), heads=annot["heads"], deps=annot["deps"]
        )
        gold = {"heads": annot["heads"], "deps": annot["deps"]}
        if first_dep is not None:
            doc[0].dep_ = first_dep
        examples.append(Example.from_dict(doc, gold))
    return examples


def test_las_per_type(en_vocab):
    """Check UAS/LAS and per-label PRF for a perfect and a one-error parse."""
    # Gold and Doc are identical
    scorer = Scorer()
    results = scorer.score(_build_las_examples(en_vocab))

    assert results["dep_uas"] == 1.0
    assert results["dep_las"] == 1.0
    assert results["dep_las_per_type"]["nsubj"]["p"] == 1.0
    assert results["dep_las_per_type"]["nsubj"]["r"] == 1.0
    assert results["dep_las_per_type"]["nsubj"]["f"] == 1.0
    assert results["dep_las_per_type"]["compound"]["p"] == 1.0
    assert results["dep_las_per_type"]["compound"]["r"] == 1.0
    assert results["dep_las_per_type"]["compound"]["f"] == 1.0

    # One dep is incorrect in Doc
    # (the first token's gold label "nsubj" is predicted as "compound";
    # heads are untouched, so UAS stays at 1.0)
    scorer = Scorer()
    results = scorer.score(_build_las_examples(en_vocab, first_dep="compound"))

    assert results["dep_uas"] == 1.0
    assert_almost_equal(results["dep_las"], 0.9090909)
    assert results["dep_las_per_type"]["nsubj"]["p"] == 0
    assert results["dep_las_per_type"]["nsubj"]["r"] == 0
    assert results["dep_las_per_type"]["nsubj"]["f"] == 0
    assert_almost_equal(results["dep_las_per_type"]["compound"]["p"], 0.666666666)
    assert results["dep_las_per_type"]["compound"]["r"] == 1.0
    # F is derived from the inexact precision above, so compare approximately.
    assert results["dep_las_per_type"]["compound"]["f"] == approx(0.8)
|
2019-10-31 23:18:16 +03:00
|
|
|
|
|
|
|
|
2019-08-01 18:15:36 +03:00
|
|
|
def test_ner_per_type(en_vocab):
    """Check overall and per-entity-type NER scores returned by ``Scorer.score``.

    Two scenarios are scored:

    1. The predicted entities are identical to the gold annotation in
       ``test_ner_cardinal``; every score is expected to be 1.0.
    2. For ``test_ner_apple`` the predicted doc has one missing and one extra
       entity (the MONEY entity is absent and an additional ORG is present),
       so the overall scores are 2/3 and the per-type scores differ per label.
    """

    def build_examples(samples, predicted_tags):
        # Pair a predicted Doc tagged with `predicted_tags` with the gold
        # entity offsets of every (text, annotation) sample.
        built = []
        for text, gold in samples:
            predicted = Doc(en_vocab, words=text.split(" "), ents=predicted_tags)
            gold_tags = offsets_to_biluo_tags(predicted, gold["entities"])
            eg = Example.from_dict(predicted, {"entities": gold_tags})
            # a hack for sentence boundaries
            eg.predicted[1].is_sent_start = False
            eg.reference[1].is_sent_start = False
            built.append(eg)
        return built

    # Gold and Doc are identical
    scorer = Scorer()
    results = scorer.score(
        build_examples(test_ner_cardinal, ["B-CARDINAL", "O", "B-CARDINAL"])
    )

    for key in ("ents_p", "ents_r", "ents_f"):
        assert results[key] == 1.0
    cardinal_scores = results["ents_per_type"]["CARDINAL"]
    for key in ("p", "r", "f"):
        assert cardinal_scores[key] == 1.0

    # Doc has one missing and one extra entity
    # Entity type MONEY is not present in Doc
    scorer = Scorer()
    apple_tags = ["B-ORG", "O", "O", "O", "O", "B-GPE", "B-ORG", "O", "O", "O"]
    results = scorer.score(build_examples(test_ner_apple, apple_tags))

    for key in ("ents_p", "ents_r", "ents_f"):
        assert results[key] == approx(0.6666666)

    per_type = results["ents_per_type"]
    for label in ("GPE", "MONEY", "ORG"):
        assert label in per_type

    # GPE is predicted exactly as annotated.
    for key in ("p", "r", "f"):
        assert per_type["GPE"][key] == 1.0
    # MONEY is never predicted, so all of its scores are zero.
    for key in ("p", "r", "f"):
        assert per_type["MONEY"][key] == 0
    # One of the two predicted ORG entities is spurious.
    org_scores = per_type["ORG"]
    assert org_scores["p"] == 0.5
    assert org_scores["r"] == 1.0
    assert org_scores["f"] == approx(0.6666666)
|
2019-09-15 23:31:31 +03:00
|
|
|
|
|
|
|
|
2020-04-02 15:46:32 +03:00
|
|
|
def test_tag_score(tagged_doc):
    """Check tag, pos and morph scores for identical and for altered gold data."""
    # Case 1: the reference annotation is read straight off the predicted doc
    matching_gold = {
        "tags": [token.tag_ for token in tagged_doc],
        "pos": [token.pos_ for token in tagged_doc],
        "morphs": [str(token.morph) for token in tagged_doc],
        "sent_starts": [1 if token.is_sent_start else -1 for token in tagged_doc],
    }
    matching_example = Example.from_dict(tagged_doc, matching_gold)
    matching_scores = Scorer().score([matching_example])

    for metric in ("tag_acc", "pos_acc", "morph_acc", "morph_micro_f"):
        assert matching_scores[metric] == 1.0
    assert matching_scores["morph_per_feat"]["NounType"]["f"] == 1.0

    # Case 2: one tag, one pos value and two morph values differ from the doc
    altered_tags = [token.tag_ for token in tagged_doc]
    altered_tags[0] = "NN"
    altered_pos = [token.pos_ for token in tagged_doc]
    altered_pos[1] = "X"
    altered_morphs = [str(token.morph) for token in tagged_doc]
    altered_morphs[1] = "Number=sing"
    altered_morphs[2] = "Number=plur"
    altered_gold = {
        "tags": altered_tags,
        "pos": altered_pos,
        "morphs": altered_morphs,
        # sentence starts are reused unchanged from the first case
        "sent_starts": matching_gold["sent_starts"],
    }
    altered_example = Example.from_dict(tagged_doc, altered_gold)
    altered_scores = Scorer().score([altered_example])

    assert altered_scores["tag_acc"] == 0.9
    assert altered_scores["pos_acc"] == 0.9
    assert altered_scores["morph_acc"] == approx(0.8)
    assert altered_scores["morph_micro_f"] == approx(0.8461538)
    per_feature = altered_scores["morph_per_feat"]
    assert per_feature["NounType"]["f"] == 1.0
    assert per_feature["Poss"]["f"] == 0.0
    assert per_feature["Number"]["f"] == approx(0.72727272)

    # Case 3: same altered example, scores grouped per component
    grouped_scores = Scorer().score([altered_example], per_component=True)
    assert grouped_scores["tagger"]["tag_acc"] == 0.9
    assert grouped_scores["morphologizer"]["pos_acc"] == 0.9
    assert grouped_scores["morphologizer"]["morph_acc"] == approx(0.8)
|
|
|
|
|
2020-04-02 15:46:32 +03:00
|
|
|
|
2020-11-03 17:47:18 +03:00
|
|
|
def test_partial_annotation(en_tokenizer):
    """Score a one-token-annotated prediction against three reference docs."""
    text = "a b c d e"

    # Only the first predicted token carries annotation
    predicted = en_tokenizer(text)
    annotated = predicted[0]
    annotated.tag_ = "A"
    annotated.pos_ = "X"
    annotated.set_morph("Feat=Val")
    annotated.dep_ = "dep"

    def fresh_reference():
        # New reference doc over the same text, flagged as having unknown spaces
        reference = en_tokenizer(text)
        reference.has_unknown_spaces = True
        return reference

    def score_against(reference):
        # Pair the shared prediction with `reference` and score with a new Scorer
        return Scorer().score([Example(predicted, reference)])

    # unannotated reference
    scores = score_against(fresh_reference())
    for name, value in scores.items():
        # cats doesn't have an unset state
        if not name.startswith("cats"):
            assert value is None

    # partially annotated reference, not overlapping with predicted annotation
    disjoint = fresh_reference()
    disjoint[1].tag_ = "A"
    disjoint[1].pos_ = "X"
    disjoint[1].set_morph("Feat=Val")
    disjoint[1].dep_ = "dep"
    scores = score_against(disjoint)
    assert scores["token_acc"] is None
    assert scores["tag_acc"] == 0.0
    assert scores["pos_acc"] == 0.0
    assert scores["morph_acc"] == 0.0
    assert scores["dep_uas"] == 1.0
    assert scores["dep_las"] == 0.0
    assert scores["sents_f"] is None

    # partially annotated reference, overlapping with predicted annotation
    overlapping = fresh_reference()
    overlapping[0].tag_ = "A"
    overlapping[0].pos_ = "X"
    overlapping[1].set_morph("Feat=Val")
    overlapping[1].dep_ = "dep"
    scores = score_against(overlapping)
    assert scores["token_acc"] is None
    assert scores["tag_acc"] == 1.0
    assert scores["pos_acc"] == 1.0
    assert scores["morph_acc"] == 0.0
    assert scores["dep_uas"] == 1.0
    assert scores["dep_las"] == 0.0
    assert scores["sents_f"] is None
|
2020-11-03 17:47:18 +03:00
|
|
|
|
|
|
|
|
2019-09-15 23:31:31 +03:00
|
|
|
def test_roc_auc_score():
    """Exercise _roc_curve, _roc_auc_score and the ROCAUCScore wrapper."""
    # Binary classification, toy tests from scikit-learn test suite.
    # Each row: (y_true, y_score, expected first curve array,
    #            expected second curve array, expected AUC)
    toy_cases = [
        ([0, 1], [0, 1], [0, 0, 1], [0, 1, 1], 1.0),
        ([0, 1], [1, 0], [0, 1, 1], [0, 0, 1], 0.0),
        ([1, 0], [1, 1], [0, 1], [0, 1], 0.5),
        ([1, 0], [1, 0], [0, 0, 1], [0, 1, 1], 1.0),
        ([1, 0], [0.5, 0.5], [0, 1], [0, 1], 0.5),
    ]
    for y_true, y_score, want_first, want_second, want_auc in toy_cases:
        got_first, got_second, _ = _roc_curve(y_true, y_score)
        got_auc = _roc_auc_score(y_true, y_score)
        assert_array_almost_equal(got_first, want_first)
        assert_array_almost_equal(got_second, want_second)
        assert_almost_equal(got_auc, want_auc)

    # same result as the last toy case with ROCAUCScore wrapper
    wrapper = ROCAUCScore()
    wrapper.score_set(0.5, 1)
    wrapper.score_set(0.5, 0)
    assert_almost_equal(wrapper.score, 0.5)

    # check that ValueError is raised when only a single class is present,
    # both from the function and from the wrapper's score attribute
    for only_label in (0, 1):
        y_true = [only_label, only_label]
        y_score = [0.25, 0.75]
        with pytest.raises(ValueError):
            _roc_auc_score(y_true, y_score)

        wrapper = ROCAUCScore()
        for value in y_score:
            wrapper.score_set(value, only_label)
        with pytest.raises(ValueError):
            _ = wrapper.score  # noqa: F841
|
2021-04-08 10:34:14 +03:00
|
|
|
|
|
|
|
|
2021-04-08 13:19:17 +03:00
|
|
|
def test_score_spans():
    """Score span groups with and without overlap handling and labels."""
    nlp = English()
    sentence = "This is just a random sentence."
    key = "my_spans"
    precision_key = f"{key}_p"
    recall_key = f"{key}_r"
    per_type_key = f"{key}_per_type"

    gold = nlp.make_doc(sentence)
    pred = nlp.make_doc(sentence)
    # The first two spans start at the same character, so they overlap
    spans = [
        gold.char_span(begin, end, label=label)
        for begin, end, label in ((0, 4, "PERSON"), (0, 7, "ORG"), (8, 12, "ORG"))
    ]
    gold.spans[key] = spans

    def span_getter(doc, span_key):
        group = doc.spans[span_key]
        return group

    # Predict exactly the same, but overlapping spans will be discarded
    pred.spans[key] = gold.spans[key].copy(doc=pred)
    scores = Scorer.score_spans([Example(pred, gold)], attr=key, getter=span_getter)
    assert scores[precision_key] == 1.0
    assert scores[recall_key] < 1.0

    # Allow overlapping, now both precision and recall should be 100%
    pred.spans[key] = gold.spans[key].copy(doc=pred)
    scores = Scorer.score_spans(
        [Example(pred, gold)], attr=key, getter=span_getter, allow_overlap=True
    )
    assert scores[precision_key] == 1.0
    assert scores[recall_key] == 1.0

    # Change the predicted labels
    pred.spans[key] = [
        Span(pred, span.start, span.end, label="WRONG") for span in spans
    ]
    relabeled = Example(pred, gold)
    scores = Scorer.score_spans(
        [relabeled], attr=key, getter=span_getter, allow_overlap=True
    )
    assert scores[precision_key] == 0.0
    assert scores[recall_key] == 0.0
    assert per_type_key in scores

    # Discard labels from the evaluation
    scores = Scorer.score_spans(
        [relabeled], attr=key, getter=span_getter, allow_overlap=True, labeled=False
    )
    assert scores[precision_key] == 1.0
    assert scores[recall_key] == 1.0
    assert per_type_key not in scores
|
|
|
|
|
|
|
|
|
2021-04-08 10:34:14 +03:00
|
|
|
def test_prf_score():
    """Check PRFScore values for single sets, for addition and for in-place addition."""

    def prf(score):
        # Bundle the three metrics so they can be compared in one assertion
        return (score.precision, score.recall, score.fscore)

    candidates = {"hi", "ho"}
    partly_matching_gold = {"yo", "hi"}
    empty_gold = set()

    partial = PRFScore()
    partial.score_set(cand=candidates, gold=partly_matching_gold)
    assert prf(partial) == approx((0.5, 0.5, 0.5))

    missed = PRFScore()
    missed.score_set(cand=candidates, gold=empty_gold)
    assert prf(missed) == approx((0.0, 0.0, 0.0))

    # `+` yields a combined score
    combined = partial + missed
    assert prf(combined) == approx((0.25, 0.5, 0.33333333))

    # `+=` must give the same numbers as `+`
    partial += missed
    assert prf(partial) == approx(prf(combined))
|
2022-11-02 17:35:04 +03:00
|
|
|
|
|
|
|
|
|
|
|
def test_score_cats(en_tokenizer):
    """Check how the threshold affects Scorer.score_cats in both label modes."""
    text = "some text"
    gold_doc = en_tokenizer(text)
    gold_doc.cats = {"POSITIVE": 1.0, "NEGATIVE": 0.0}
    pred_doc = en_tokenizer(text)
    pred_doc.cats = {"POSITIVE": 0.75, "NEGATIVE": 0.25}
    examples = [Example(pred_doc, gold_doc)]
    labels = list(gold_doc.cats.keys())

    def exclusive_scores(threshold):
        # Single-label scoring with "POSITIVE" as the positive label
        return Scorer.score_cats(
            examples,
            "cats",
            labels=labels,
            multi_label=False,
            positive_label="POSITIVE",
            threshold=threshold,
        )

    def multilabel_scores(threshold):
        # Multi-label scoring, no positive label given
        return Scorer.score_cats(
            examples,
            "cats",
            labels=labels,
            multi_label=True,
            threshold=threshold,
        )

    # threshold is ignored for multi_label=False
    low_threshold = exclusive_scores(0.1)
    high_threshold = exclusive_scores(0.9)
    assert low_threshold["cats_score"] == 1.0
    assert high_threshold["cats_score"] == 1.0
    assert low_threshold == high_threshold

    # threshold is relevant for multi_label=True
    assert multilabel_scores(0.9)["cats_macro_f"] == 0.0
    assert multilabel_scores(0.1)["cats_macro_f"] == 0.5
|