spaCy/spacy/tests/test_scorer.py

145 lines
4.6 KiB
Python
Raw Normal View History

# coding: utf-8
from __future__ import unicode_literals
Add textcat to train CLI (#4226) * Add doc.cats to spacy.gold at the paragraph level Support `doc.cats` as `"cats": [{"label": string, "value": number}]` in the spacy JSON training format at the paragraph level. * `spacy.gold.docs_to_json()` writes `docs.cats` * `GoldCorpus` reads in cats in each `GoldParse` * Update instances of gold_tuples to handle cats Update iteration over gold_tuples / gold_parses to handle addition of cats at the paragraph level. * Add textcat to train CLI * Add textcat options to train CLI * Add textcat labels in `TextCategorizer.begin_training()` * Add textcat evaluation to `Scorer`: * For binary exclusive classes with provided label: F1 for label * For 2+ exclusive classes: F1 macro average * For multilabel (not exclusive): ROC AUC macro average (currently relying on sklearn) * Provide user info on textcat evaluation settings, potential incompatibilities * Provide pipeline to Scorer in `Language.evaluate` for textcat config * Customize train CLI output to include only metrics relevant to current pipeline * Add textcat evaluation to evaluate CLI * Fix handling of unset arguments and config params Fix handling of unset arguments and model confiug parameters in Scorer initialization. * Temporarily add sklearn requirement * Remove sklearn version number * Improve Scorer handling of models without textcats * Fixing Scorer handling of models without textcats * Update Scorer output for python 2.7 * Modify inf in Scorer for python 2.7 * Auto-format Also make small adjustments to make auto-formatting with black easier and produce nicer results * Move error message to Errors * Update documentation * Add cats to annotation JSON format [ci skip] * Fix tpl flag and docs [ci skip] * Switch to internal roc_auc_score Switch to internal `roc_auc_score()` adapted from scikit-learn. * Add AUCROCScore tests and improve errors/warnings * Add tests for AUCROCScore and roc_auc_score * Add missing error for only positive/negative values * Remove unnecessary warnings and errors * Make reduced roc_auc_score functions private Because most of the checks and warnings have been stripped for the internal functions and access is only intended through `ROCAUCScore`, make the functions for roc_auc_score adapted from scikit-learn private. * Check that data corresponds with multilabel flag Check that the training instances correspond with the multilabel flag, adding the multilabel flag if required. * Add textcat score to early stopping check * Add more checks to debug-data for textcat * Add example training data for textcat * Add more checks to textcat train CLI * Check configuration when extending base model * Fix typos * Update textcat example data * Provide licensing details and licenses for data * Remove two labels with no positive instances from jigsaw-toxic-comment data. Co-authored-by: Ines Montani <ines@ines.io>
2019-09-15 23:31:31 +03:00
import numpy as np
from numpy.testing import assert_almost_equal, assert_array_almost_equal
import pytest
from pytest import approx
Add textcat to train CLI (#4226) * Add doc.cats to spacy.gold at the paragraph level Support `doc.cats` as `"cats": [{"label": string, "value": number}]` in the spacy JSON training format at the paragraph level. * `spacy.gold.docs_to_json()` writes `docs.cats` * `GoldCorpus` reads in cats in each `GoldParse` * Update instances of gold_tuples to handle cats Update iteration over gold_tuples / gold_parses to handle addition of cats at the paragraph level. * Add textcat to train CLI * Add textcat options to train CLI * Add textcat labels in `TextCategorizer.begin_training()` * Add textcat evaluation to `Scorer`: * For binary exclusive classes with provided label: F1 for label * For 2+ exclusive classes: F1 macro average * For multilabel (not exclusive): ROC AUC macro average (currently relying on sklearn) * Provide user info on textcat evaluation settings, potential incompatibilities * Provide pipeline to Scorer in `Language.evaluate` for textcat config * Customize train CLI output to include only metrics relevant to current pipeline * Add textcat evaluation to evaluate CLI * Fix handling of unset arguments and config params Fix handling of unset arguments and model confiug parameters in Scorer initialization. * Temporarily add sklearn requirement * Remove sklearn version number * Improve Scorer handling of models without textcats * Fixing Scorer handling of models without textcats * Update Scorer output for python 2.7 * Modify inf in Scorer for python 2.7 * Auto-format Also make small adjustments to make auto-formatting with black easier and produce nicer results * Move error message to Errors * Update documentation * Add cats to annotation JSON format [ci skip] * Fix tpl flag and docs [ci skip] * Switch to internal roc_auc_score Switch to internal `roc_auc_score()` adapted from scikit-learn. * Add AUCROCScore tests and improve errors/warnings * Add tests for AUCROCScore and roc_auc_score * Add missing error for only positive/negative values * Remove unnecessary warnings and errors * Make reduced roc_auc_score functions private Because most of the checks and warnings have been stripped for the internal functions and access is only intended through `ROCAUCScore`, make the functions for roc_auc_score adapted from scikit-learn private. * Check that data corresponds with multilabel flag Check that the training instances correspond with the multilabel flag, adding the multilabel flag if required. * Add textcat score to early stopping check * Add more checks to debug-data for textcat * Add example training data for textcat * Add more checks to textcat train CLI * Check configuration when extending base model * Fix typos * Update textcat example data * Provide licensing details and licenses for data * Remove two labels with no positive instances from jigsaw-toxic-comment data. Co-authored-by: Ines Montani <ines@ines.io>
2019-09-15 23:31:31 +03:00
from spacy.errors import Errors
from spacy.gold import GoldParse
Add textcat to train CLI (#4226) * Add doc.cats to spacy.gold at the paragraph level Support `doc.cats` as `"cats": [{"label": string, "value": number}]` in the spacy JSON training format at the paragraph level. * `spacy.gold.docs_to_json()` writes `docs.cats` * `GoldCorpus` reads in cats in each `GoldParse` * Update instances of gold_tuples to handle cats Update iteration over gold_tuples / gold_parses to handle addition of cats at the paragraph level. * Add textcat to train CLI * Add textcat options to train CLI * Add textcat labels in `TextCategorizer.begin_training()` * Add textcat evaluation to `Scorer`: * For binary exclusive classes with provided label: F1 for label * For 2+ exclusive classes: F1 macro average * For multilabel (not exclusive): ROC AUC macro average (currently relying on sklearn) * Provide user info on textcat evaluation settings, potential incompatibilities * Provide pipeline to Scorer in `Language.evaluate` for textcat config * Customize train CLI output to include only metrics relevant to current pipeline * Add textcat evaluation to evaluate CLI * Fix handling of unset arguments and config params Fix handling of unset arguments and model confiug parameters in Scorer initialization. * Temporarily add sklearn requirement * Remove sklearn version number * Improve Scorer handling of models without textcats * Fixing Scorer handling of models without textcats * Update Scorer output for python 2.7 * Modify inf in Scorer for python 2.7 * Auto-format Also make small adjustments to make auto-formatting with black easier and produce nicer results * Move error message to Errors * Update documentation * Add cats to annotation JSON format [ci skip] * Fix tpl flag and docs [ci skip] * Switch to internal roc_auc_score Switch to internal `roc_auc_score()` adapted from scikit-learn. * Add AUCROCScore tests and improve errors/warnings * Add tests for AUCROCScore and roc_auc_score * Add missing error for only positive/negative values * Remove unnecessary warnings and errors * Make reduced roc_auc_score functions private Because most of the checks and warnings have been stripped for the internal functions and access is only intended through `ROCAUCScore`, make the functions for roc_auc_score adapted from scikit-learn private. * Check that data corresponds with multilabel flag Check that the training instances correspond with the multilabel flag, adding the multilabel flag if required. * Add textcat score to early stopping check * Add more checks to debug-data for textcat * Add example training data for textcat * Add more checks to textcat train CLI * Check configuration when extending base model * Fix typos * Update textcat example data * Provide licensing details and licenses for data * Remove two labels with no positive instances from jigsaw-toxic-comment data. Co-authored-by: Ines Montani <ines@ines.io>
2019-09-15 23:31:31 +03:00
from spacy.scorer import Scorer, ROCAUCScore
from spacy.scorer import _roc_auc_score, _roc_curve
from .util import get_doc
test_ner_cardinal = [
2019-08-18 16:09:16 +03:00
["100 - 200", {"entities": [[0, 3, "CARDINAL"], [6, 9, "CARDINAL"]]}]
]
test_ner_apple = [
[
"Apple is looking at buying U.K. startup for $1 billion",
2019-08-18 16:09:16 +03:00
{"entities": [(0, 5, "ORG"), (27, 31, "GPE"), (44, 54, "MONEY")]},
]
]
2019-08-18 16:09:16 +03:00
def test_ner_per_type(en_vocab):
# Gold and Doc are identical
scorer = Scorer()
for input_, annot in test_ner_cardinal:
2019-08-18 16:09:16 +03:00
doc = get_doc(
en_vocab,
words=input_.split(" "),
ents=[[0, 1, "CARDINAL"], [2, 3, "CARDINAL"]],
)
gold = GoldParse(doc, entities=annot["entities"])
scorer.score(doc, gold)
results = scorer.scores
2019-08-18 16:09:16 +03:00
assert results["ents_p"] == 100
assert results["ents_f"] == 100
assert results["ents_r"] == 100
assert results["ents_per_type"]["CARDINAL"]["p"] == 100
assert results["ents_per_type"]["CARDINAL"]["f"] == 100
assert results["ents_per_type"]["CARDINAL"]["r"] == 100
# Doc has one missing and one extra entity
# Entity type MONEY is not present in Doc
scorer = Scorer()
for input_, annot in test_ner_apple:
2019-08-18 16:09:16 +03:00
doc = get_doc(
en_vocab,
words=input_.split(" "),
ents=[[0, 1, "ORG"], [5, 6, "GPE"], [6, 7, "ORG"]],
)
gold = GoldParse(doc, entities=annot["entities"])
scorer.score(doc, gold)
results = scorer.scores
2019-08-18 16:09:16 +03:00
assert results["ents_p"] == approx(66.66666)
assert results["ents_r"] == approx(66.66666)
assert results["ents_f"] == approx(66.66666)
assert "GPE" in results["ents_per_type"]
assert "MONEY" in results["ents_per_type"]
assert "ORG" in results["ents_per_type"]
assert results["ents_per_type"]["GPE"]["p"] == 100
assert results["ents_per_type"]["GPE"]["r"] == 100
assert results["ents_per_type"]["GPE"]["f"] == 100
assert results["ents_per_type"]["MONEY"]["p"] == 0
assert results["ents_per_type"]["MONEY"]["r"] == 0
assert results["ents_per_type"]["MONEY"]["f"] == 0
assert results["ents_per_type"]["ORG"]["p"] == 50
assert results["ents_per_type"]["ORG"]["r"] == 100
assert results["ents_per_type"]["ORG"]["f"] == approx(66.66666)
Add textcat to train CLI (#4226) * Add doc.cats to spacy.gold at the paragraph level Support `doc.cats` as `"cats": [{"label": string, "value": number}]` in the spacy JSON training format at the paragraph level. * `spacy.gold.docs_to_json()` writes `docs.cats` * `GoldCorpus` reads in cats in each `GoldParse` * Update instances of gold_tuples to handle cats Update iteration over gold_tuples / gold_parses to handle addition of cats at the paragraph level. * Add textcat to train CLI * Add textcat options to train CLI * Add textcat labels in `TextCategorizer.begin_training()` * Add textcat evaluation to `Scorer`: * For binary exclusive classes with provided label: F1 for label * For 2+ exclusive classes: F1 macro average * For multilabel (not exclusive): ROC AUC macro average (currently relying on sklearn) * Provide user info on textcat evaluation settings, potential incompatibilities * Provide pipeline to Scorer in `Language.evaluate` for textcat config * Customize train CLI output to include only metrics relevant to current pipeline * Add textcat evaluation to evaluate CLI * Fix handling of unset arguments and config params Fix handling of unset arguments and model confiug parameters in Scorer initialization. * Temporarily add sklearn requirement * Remove sklearn version number * Improve Scorer handling of models without textcats * Fixing Scorer handling of models without textcats * Update Scorer output for python 2.7 * Modify inf in Scorer for python 2.7 * Auto-format Also make small adjustments to make auto-formatting with black easier and produce nicer results * Move error message to Errors * Update documentation * Add cats to annotation JSON format [ci skip] * Fix tpl flag and docs [ci skip] * Switch to internal roc_auc_score Switch to internal `roc_auc_score()` adapted from scikit-learn. * Add AUCROCScore tests and improve errors/warnings * Add tests for AUCROCScore and roc_auc_score * Add missing error for only positive/negative values * Remove unnecessary warnings and errors * Make reduced roc_auc_score functions private Because most of the checks and warnings have been stripped for the internal functions and access is only intended through `ROCAUCScore`, make the functions for roc_auc_score adapted from scikit-learn private. * Check that data corresponds with multilabel flag Check that the training instances correspond with the multilabel flag, adding the multilabel flag if required. * Add textcat score to early stopping check * Add more checks to debug-data for textcat * Add example training data for textcat * Add more checks to textcat train CLI * Check configuration when extending base model * Fix typos * Update textcat example data * Provide licensing details and licenses for data * Remove two labels with no positive instances from jigsaw-toxic-comment data. Co-authored-by: Ines Montani <ines@ines.io>
2019-09-15 23:31:31 +03:00
def test_roc_auc_score():
# Binary classification, toy tests from scikit-learn test suite
y_true = [0, 1]
y_score = [0, 1]
tpr, fpr, _ = _roc_curve(y_true, y_score)
roc_auc = _roc_auc_score(y_true, y_score)
assert_array_almost_equal(tpr, [0, 0, 1])
assert_array_almost_equal(fpr, [0, 1, 1])
assert_almost_equal(roc_auc, 1.)
y_true = [0, 1]
y_score = [1, 0]
tpr, fpr, _ = _roc_curve(y_true, y_score)
roc_auc = _roc_auc_score(y_true, y_score)
assert_array_almost_equal(tpr, [0, 1, 1])
assert_array_almost_equal(fpr, [0, 0, 1])
assert_almost_equal(roc_auc, 0.)
y_true = [1, 0]
y_score = [1, 1]
tpr, fpr, _ = _roc_curve(y_true, y_score)
roc_auc = _roc_auc_score(y_true, y_score)
assert_array_almost_equal(tpr, [0, 1])
assert_array_almost_equal(fpr, [0, 1])
assert_almost_equal(roc_auc, 0.5)
y_true = [1, 0]
y_score = [1, 0]
tpr, fpr, _ = _roc_curve(y_true, y_score)
roc_auc = _roc_auc_score(y_true, y_score)
assert_array_almost_equal(tpr, [0, 0, 1])
assert_array_almost_equal(fpr, [0, 1, 1])
assert_almost_equal(roc_auc, 1.)
y_true = [1, 0]
y_score = [0.5, 0.5]
tpr, fpr, _ = _roc_curve(y_true, y_score)
roc_auc = _roc_auc_score(y_true, y_score)
assert_array_almost_equal(tpr, [0, 1])
assert_array_almost_equal(fpr, [0, 1])
assert_almost_equal(roc_auc, .5)
# same result as above with ROCAUCScore wrapper
score = ROCAUCScore()
score.score_set(0.5, 1)
score.score_set(0.5, 0)
assert_almost_equal(score.score, .5)
# check that errors are raised in undefined cases and score is -inf
y_true = [0, 0]
y_score = [0.25, 0.75]
with pytest.raises(ValueError):
_roc_auc_score(y_true, y_score)
score = ROCAUCScore()
score.score_set(0.25, 0)
score.score_set(0.75, 0)
assert score.score == -float("inf")
y_true = [1, 1]
y_score = [0.25, 0.75]
with pytest.raises(ValueError):
_roc_auc_score(y_true, y_score)
score = ROCAUCScore()
score.score_set(0.25, 1)
score.score_set(0.75, 1)
assert score.score == -float("inf")