spaCy/spacy/scorer.py

from typing import Optional, Iterable, Dict, Any, Callable, TYPE_CHECKING
import numpy as np

from .training import Example
from .tokens import Token, Doc, Span
from .errors import Errors
from .util import get_lang_class, SimpleFrozenList
from .morphology import Morphology

if TYPE_CHECKING:
    # This lets us add type hints for mypy etc. without causing circular imports
    from .language import Language  # noqa: F401


DEFAULT_PIPELINE = ["senter", "tagger", "morphologizer", "parser", "ner", "textcat"]


class PRFScore:
    """A precision / recall / F score."""

    def __init__(self) -> None:
        self.tp = 0
        self.fp = 0
        self.fn = 0

    def score_set(self, cand: set, gold: set) -> None:
        self.tp += len(cand.intersection(gold))
        self.fp += len(cand - gold)
        self.fn += len(gold - cand)

    @property
    def precision(self) -> float:
        return self.tp / (self.tp + self.fp + 1e-100)

    @property
    def recall(self) -> float:
        return self.tp / (self.tp + self.fn + 1e-100)

    @property
    def fscore(self) -> float:
        p = self.precision
        r = self.recall
        return 2 * ((p * r) / (p + r + 1e-100))

    def to_dict(self) -> Dict[str, float]:
        return {"p": self.precision, "r": self.recall, "f": self.fscore}


class ROCAUCScore:
    """An AUC ROC score."""

    def __init__(self) -> None:
        self.golds = []
        self.cands = []
        self.saved_score = 0.0
        self.saved_score_at_len = 0

    def score_set(self, cand, gold) -> None:
        self.cands.append(cand)
        self.golds.append(gold)

    @property
    def score(self):
        if len(self.golds) == self.saved_score_at_len:
            return self.saved_score
        try:
            self.saved_score = _roc_auc_score(self.golds, self.cands)
        # catch ValueError: Only one class present in y_true.
        # ROC AUC score is not defined in that case.
        except ValueError:
            self.saved_score = -float("inf")
        self.saved_score_at_len = len(self.golds)
        return self.saved_score


class Scorer:
    """Compute evaluation scores."""

    def __init__(
        self,
        nlp: Optional["Language"] = None,
        default_lang: str = "xx",
        default_pipeline=DEFAULT_PIPELINE,
        **cfg,
    ) -> None:
        """Initialize the Scorer.

        DOCS: https://nightly.spacy.io/api/scorer#init
        """
        self.nlp = nlp
        self.cfg = cfg
        if not nlp:
            nlp = get_lang_class(default_lang)()
            for pipe in default_pipeline:
                nlp.add_pipe(pipe)
            self.nlp = nlp

    def score(self, examples: Iterable[Example]) -> Dict[str, Any]:
        """Evaluate a list of Examples.

        examples (Iterable[Example]): The predicted annotations + correct annotations.
        RETURNS (Dict): A dictionary of scores.

        DOCS: https://nightly.spacy.io/api/scorer#score
        """
        scores = {}
        if hasattr(self.nlp.tokenizer, "score"):
            scores.update(self.nlp.tokenizer.score(examples, **self.cfg))
        for name, component in self.nlp.pipeline:
            if hasattr(component, "score"):
                scores.update(component.score(examples, **self.cfg))
        return scores

    @staticmethod
    def score_tokenization(examples: Iterable[Example], **cfg) -> Dict[str, float]:
        """Returns accuracy and PRF scores for tokenization.
        * token_acc: # correct tokens / # gold tokens
        * token_p/r/f: PRF for token character spans

        examples (Iterable[Example]): Examples to score
        RETURNS (Dict[str, float]): A dictionary containing the scores
            token_acc/p/r/f.

        DOCS: https://nightly.spacy.io/api/scorer#score_tokenization
        """
        acc_score = PRFScore()
        prf_score = PRFScore()
        for example in examples:
            gold_doc = example.reference
            pred_doc = example.predicted
            align = example.alignment
            gold_spans = set()
            pred_spans = set()
            for token in gold_doc:
                if token.orth_.isspace():
                    continue
                gold_spans.add((token.idx, token.idx + len(token)))
            for token in pred_doc:
                if token.orth_.isspace():
                    continue
                pred_spans.add((token.idx, token.idx + len(token)))
                if align.x2y.lengths[token.i] != 1:
                    acc_score.fp += 1
                else:
                    acc_score.tp += 1
            prf_score.score_set(pred_spans, gold_spans)
        return {
            "token_acc": acc_score.fscore,
            "token_p": prf_score.precision,
            "token_r": prf_score.recall,
            "token_f": prf_score.fscore,
        }

    @staticmethod
    def score_token_attr(
        examples: Iterable[Example],
        attr: str,
        *,
        getter: Callable[[Token, str], Any] = getattr,
        **cfg,
    ) -> Dict[str, float]:
        """Returns an accuracy score for a token-level attribute.

        examples (Iterable[Example]): Examples to score
        attr (str): The attribute to score.
        getter (Callable[[Token, str], Any]): Defaults to getattr. If provided,
            getter(token, attr) should return the value of the attribute for an
            individual token.
        RETURNS (Dict[str, float]): A dictionary containing the accuracy score
            under the key attr_acc.

        DOCS: https://nightly.spacy.io/api/scorer#score_token_attr
        """
        tag_score = PRFScore()
        for example in examples:
            gold_doc = example.reference
            pred_doc = example.predicted
            align = example.alignment
            gold_tags = set()
            for gold_i, token in enumerate(gold_doc):
                gold_tags.add((gold_i, getter(token, attr)))
            pred_tags = set()
            for token in pred_doc:
                if token.orth_.isspace():
                    continue
                if align.x2y.lengths[token.i] == 1:
                    gold_i = align.x2y[token.i].dataXd[0, 0]
                    pred_tags.add((gold_i, getter(token, attr)))
            tag_score.score_set(pred_tags, gold_tags)
        return {f"{attr}_acc": tag_score.fscore}

    @staticmethod
    def score_token_attr_per_feat(
        examples: Iterable[Example],
        attr: str,
        *,
        getter: Callable[[Token, str], Any] = getattr,
        **cfg,
    ):
        """Return PRF scores per feat for a token attribute in UFEATS format.

        examples (Iterable[Example]): Examples to score
        attr (str): The attribute to score.
        getter (Callable[[Token, str], Any]): Defaults to getattr. If provided,
            getter(token, attr) should return the value of the attribute for an
            individual token.
        RETURNS (dict): A dictionary containing the per-feat PRF scores unders
            the key attr_per_feat.
        """
        per_feat = {}
        for example in examples:
            pred_doc = example.predicted
            gold_doc = example.reference
            align = example.alignment
            gold_per_feat = {}
            for gold_i, token in enumerate(gold_doc):
                morph = str(getter(token, attr))
                if morph:
                    for feat in morph.split(Morphology.FEATURE_SEP):
                        field, values = feat.split(Morphology.FIELD_SEP)
                        if field not in per_feat:
                            per_feat[field] = PRFScore()
                        if field not in gold_per_feat:
                            gold_per_feat[field] = set()
                        gold_per_feat[field].add((gold_i, feat))
            pred_per_feat = {}
            for token in pred_doc:
                if token.orth_.isspace():
                    continue
                if align.x2y.lengths[token.i] == 1:
                    gold_i = align.x2y[token.i].dataXd[0, 0]
                    morph = str(getter(token, attr))
                    if morph:
                        for feat in morph.split("|"):
                            field, values = feat.split("=")
                            if field not in per_feat:
                                per_feat[field] = PRFScore()
                            if field not in pred_per_feat:
                                pred_per_feat[field] = set()
                            pred_per_feat[field].add((gold_i, feat))
            for field in per_feat:
                per_feat[field].score_set(
                    pred_per_feat.get(field, set()), gold_per_feat.get(field, set()),
                )
        result = {k: v.to_dict() for k, v in per_feat.items()}
        return {f"{attr}_per_feat": result}

    @staticmethod
    def score_spans(
        examples: Iterable[Example],
        attr: str,
        *,
        getter: Callable[[Doc, str], Iterable[Span]] = getattr,
        **cfg,
    ) -> Dict[str, Any]:
        """Returns PRF scores for labeled spans.

        examples (Iterable[Example]): Examples to score
        attr (str): The attribute to score.
        getter (Callable[[Doc, str], Iterable[Span]]): Defaults to getattr. If
            provided, getter(doc, attr) should return the spans for the
            individual doc.
        RETURNS (Dict[str, Any]): A dictionary containing the PRF scores under
            the keys attr_p/r/f and the per-type PRF scores under attr_per_type.

        DOCS: https://nightly.spacy.io/api/scorer#score_spans
        """
        score = PRFScore()
        score_per_type = dict()
        for example in examples:
            pred_doc = example.predicted
            gold_doc = example.reference
            # Find all labels in gold and doc
            labels = set(
                [k.label_ for k in getter(gold_doc, attr)]
                + [k.label_ for k in getter(pred_doc, attr)]
            )
            # Set up all labels for per type scoring and prepare gold per type
            gold_per_type = {label: set() for label in labels}
            for label in labels:
                if label not in score_per_type:
                    score_per_type[label] = PRFScore()
            # Find all predidate labels, for all and per type
            gold_spans = set()
            pred_spans = set()
            # Special case for ents:
            # If we have missing values in the gold, we can't easily tell
            # whether our NER predictions are true.
            # It seems bad but it's what we've always done.
            if attr == "ents" and not all(token.ent_iob != 0 for token in gold_doc):
                continue
            for span in getter(gold_doc, attr):
                gold_span = (span.label_, span.start, span.end - 1)
                gold_spans.add(gold_span)
                gold_per_type[span.label_].add((span.label_, span.start, span.end - 1))
            pred_per_type = {label: set() for label in labels}
            for span in example.get_aligned_spans_x2y(getter(pred_doc, attr)):
                pred_spans.add((span.label_, span.start, span.end - 1))
                pred_per_type[span.label_].add((span.label_, span.start, span.end - 1))
            # Scores per label
            for k, v in score_per_type.items():
                if k in pred_per_type:
                    v.score_set(pred_per_type[k], gold_per_type[k])
            # Score for all labels
            score.score_set(pred_spans, gold_spans)
        results = {
            f"{attr}_p": score.precision,
            f"{attr}_r": score.recall,
            f"{attr}_f": score.fscore,
            f"{attr}_per_type": {k: v.to_dict() for k, v in score_per_type.items()},
        }
        return results

    @staticmethod
    def score_cats(
        examples: Iterable[Example],
        attr: str,
        *,
        getter: Callable[[Doc, str], Any] = getattr,
        labels: Iterable[str] = SimpleFrozenList(),
        multi_label: bool = True,
        positive_label: Optional[str] = None,
        threshold: Optional[float] = None,
        **cfg,
    ) -> Dict[str, Any]:
        """Returns PRF and ROC AUC scores for a doc-level attribute with a
        dict with scores for each label like Doc.cats. The reported overall
        score depends on the scorer settings.

        examples (Iterable[Example]): Examples to score
        attr (str): The attribute to score.
        getter (Callable[[Doc, str], Any]): Defaults to getattr. If provided,
            getter(doc, attr) should return the values for the individual doc.
        labels (Iterable[str]): The set of possible labels. Defaults to [].
        multi_label (bool): Whether the attribute allows multiple labels.
            Defaults to True.
        positive_label (str): The positive label for a binary task with
            exclusive classes. Defaults to None.
        threshold (float): Cutoff to consider a prediction "positive". Defaults
            to 0.5 for multi-label, and 0.0 (i.e. whatever's highest scoring)
            otherwise.
        RETURNS (Dict[str, Any]): A dictionary containing the scores, with
            inapplicable scores as None:
            for all:
                attr_score (one of attr_micro_f / attr_macro_f / attr_macro_auc),
                attr_score_desc (text description of the overall score),
                attr_micro_f,
                attr_macro_f,
                attr_auc,
                attr_f_per_type,
                attr_auc_per_type

        DOCS: https://nightly.spacy.io/api/scorer#score_cats
        """
        if threshold is None:
            threshold = 0.5 if multi_label else 0.0
        f_per_type = {label: PRFScore() for label in labels}
        auc_per_type = {label: ROCAUCScore() for label in labels}
        labels = set(labels)
        if labels:
            for eg in examples:
                labels.update(eg.predicted.cats.keys())
                labels.update(eg.reference.cats.keys())
        for example in examples:
            # Through this loop, None in the gold_cats indicates missing label.
            pred_cats = getter(example.predicted, attr)
            gold_cats = getter(example.reference, attr)

            # I think the AUC metric is applicable regardless of whether we're
            # doing multi-label classification? Unsure. If not, move this into
            # the elif pred_cats and gold_cats block below.
            for label in labels:
                pred_score = pred_cats.get(label, 0.0)
                gold_score = gold_cats.get(label, 0.0)
                if gold_score is not None:
                    auc_per_type[label].score_set(pred_score, gold_score)
            if multi_label:
                for label in labels:
                    pred_score = pred_cats.get(label, 0.0)
                    gold_score = gold_cats.get(label, 0.0)
                    if gold_score is not None:
                        if pred_score >= threshold and gold_score > 0:
                            f_per_type[label].tp += 1
                        elif pred_score >= threshold and gold_score == 0:
                            f_per_type[label].fp += 1
                        elif pred_score < threshold and gold_score > 0:
                            f_per_type[label].fn += 1
            elif pred_cats and gold_cats:
                # Get the highest-scoring for each.
                pred_label, pred_score = max(pred_cats.items(), key=lambda it: it[1])
                gold_label, gold_score = max(gold_cats.items(), key=lambda it: it[1])
                if gold_score is not None:
                    if pred_label == gold_label and pred_score >= threshold:
                        f_per_type[pred_label].tp += 1
                    else:
                        f_per_type[gold_label].fn += 1
                        if pred_score >= threshold:
                            f_per_type[pred_label].fp += 1
            elif gold_cats:
                gold_label, gold_score = max(gold_cats, key=lambda it: it[1])
                if gold_score is not None and gold_score > 0:
                    f_per_type[gold_label].fn += 1
            else:
                pred_label, pred_score = max(pred_cats, key=lambda it: it[1])
                if pred_score >= threshold:
                    f_per_type[pred_label].fp += 1
        micro_prf = PRFScore()
        for label_prf in f_per_type.values():
            micro_prf.tp = label_prf.tp
            micro_prf.fn = label_prf.fn
            micro_prf.fp = label_prf.fp
        n_cats = len(f_per_type) + 1e-100
        macro_p = sum(prf.precision for prf in f_per_type.values()) / n_cats
        macro_r = sum(prf.recall for prf in f_per_type.values()) / n_cats
        macro_f = sum(prf.fscore for prf in f_per_type.values()) / n_cats
        macro_auc = sum(auc.score for auc in auc_per_type.values()) / n_cats
        results = {
            f"{attr}_score": None,
            f"{attr}_score_desc": None,
            f"{attr}_micro_p": micro_prf.precision,
            f"{attr}_micro_r": micro_prf.recall,
            f"{attr}_micro_f": micro_prf.fscore,
            f"{attr}_macro_p": macro_p,
            f"{attr}_macro_r": macro_r,
            f"{attr}_macro_f": macro_f,
            f"{attr}_macro_auc": macro_auc,
            f"{attr}_f_per_type": {k: v.to_dict() for k, v in f_per_type.items()},
            f"{attr}_auc_per_type": {k: v.score for k, v in auc_per_type.items()},
        }
        if len(labels) == 2 and not multi_label and positive_label:
            positive_label_f = results[f"{attr}_f_per_type"][positive_label]["f"]
            results[f"{attr}_score"] = positive_label_f
            results[f"{attr}_score_desc"] = f"F ({positive_label})"
        elif not multi_label:
            results[f"{attr}_score"] = results[f"{attr}_macro_f"]
            results[f"{attr}_score_desc"] = "macro F"
        else:
            results[f"{attr}_score"] = results[f"{attr}_macro_auc"]
            results[f"{attr}_score_desc"] = "macro AUC"
        return results

    @staticmethod
    def score_deps(
        examples: Iterable[Example],
        attr: str,
        *,
        getter: Callable[[Token, str], Any] = getattr,
        head_attr: str = "head",
        head_getter: Callable[[Token, str], Token] = getattr,
        ignore_labels: Iterable[str] = SimpleFrozenList(),
        **cfg,
    ) -> Dict[str, Any]:
        """Returns the UAS, LAS, and LAS per type scores for dependency
        parses.

        examples (Iterable[Example]): Examples to score
        attr (str): The attribute containing the dependency label.
        getter (Callable[[Token, str], Any]): Defaults to getattr. If provided,
            getter(token, attr) should return the value of the attribute for an
            individual token.
        head_attr (str): The attribute containing the head token. Defaults to
            'head'.
        head_getter (Callable[[Token, str], Token]): Defaults to getattr. If provided,
            head_getter(token, attr) should return the value of the head for an
            individual token.
        ignore_labels (Tuple): Labels to ignore while scoring (e.g., punct).
        RETURNS (Dict[str, Any]): A dictionary containing the scores:
            attr_uas, attr_las, and attr_las_per_type.

        DOCS: https://nightly.spacy.io/api/scorer#score_deps
        """
        unlabelled = PRFScore()
        labelled = PRFScore()
        labelled_per_dep = dict()
        for example in examples:
            gold_doc = example.reference
            pred_doc = example.predicted
            align = example.alignment
            gold_deps = set()
            gold_deps_per_dep = {}
            for gold_i, token in enumerate(gold_doc):
                dep = getter(token, attr)
                head = head_getter(token, head_attr)
                if dep not in ignore_labels:
                    gold_deps.add((gold_i, head.i, dep))
                    if dep not in labelled_per_dep:
                        labelled_per_dep[dep] = PRFScore()
                    if dep not in gold_deps_per_dep:
                        gold_deps_per_dep[dep] = set()
                    gold_deps_per_dep[dep].add((gold_i, head.i, dep))
            pred_deps = set()
            pred_deps_per_dep = {}
            for token in pred_doc:
                if token.orth_.isspace():
                    continue
                if align.x2y.lengths[token.i] != 1:
                    gold_i = None
                else:
                    gold_i = align.x2y[token.i].dataXd[0, 0]
                dep = getter(token, attr)
                head = head_getter(token, head_attr)
                if dep not in ignore_labels and token.orth_.strip():
                    if align.x2y.lengths[head.i] == 1:
                        gold_head = align.x2y[head.i].dataXd[0, 0]
                    else:
                        gold_head = None
                    # None is indistinct, so we can't just add it to the set
                    # Multiple (None, None) deps are possible
                    if gold_i is None or gold_head is None:
                        unlabelled.fp += 1
                        labelled.fp += 1
                    else:
                        pred_deps.add((gold_i, gold_head, dep))
                        if dep not in labelled_per_dep:
                            labelled_per_dep[dep] = PRFScore()
                        if dep not in pred_deps_per_dep:
                            pred_deps_per_dep[dep] = set()
                        pred_deps_per_dep[dep].add((gold_i, gold_head, dep))
            labelled.score_set(pred_deps, gold_deps)
            for dep in labelled_per_dep:
                labelled_per_dep[dep].score_set(
                    pred_deps_per_dep.get(dep, set()), gold_deps_per_dep.get(dep, set())
                )
            unlabelled.score_set(
                set(item[:2] for item in pred_deps), set(item[:2] for item in gold_deps)
            )
        return {
            f"{attr}_uas": unlabelled.fscore,
            f"{attr}_las": labelled.fscore,
            f"{attr}_las_per_type": {
                k: v.to_dict() for k, v in labelled_per_dep.items()
            },
        }


#############################################################################
#
# The following implementation of roc_auc_score() is adapted from
# scikit-learn, which is distributed under the following license:
#
# New BSD License
#
# Copyright (c) 2007–2019 The scikit-learn developers.
# All rights reserved.
#
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are met:
#
#   a. Redistributions of source code must retain the above copyright notice,
#      this list of conditions and the following disclaimer.
#   b. Redistributions in binary form must reproduce the above copyright
#      notice, this list of conditions and the following disclaimer in the
#      documentation and/or other materials provided with the distribution.
#   c. Neither the name of the Scikit-learn Developers  nor the names of
#      its contributors may be used to endorse or promote products
#      derived from this software without specific prior written
#      permission.
#
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
# ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE FOR
# ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
# LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
# OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH
# DAMAGE.


def _roc_auc_score(y_true, y_score):
    """Compute Area Under the Receiver Operating Characteristic Curve (ROC AUC)
    from prediction scores.

    Note: this implementation is restricted to the binary classification task

    Parameters
    ----------
    y_true : array, shape = [n_samples] or [n_samples, n_classes]
        True binary labels or binary label indicators.
        The multiclass case expects shape = [n_samples] and labels
        with values in ``range(n_classes)``.

    y_score : array, shape = [n_samples] or [n_samples, n_classes]
        Target scores, can either be probability estimates of the positive
        class, confidence values, or non-thresholded measure of decisions
        (as returned by "decision_function" on some classifiers). For binary
        y_true, y_score is supposed to be the score of the class with greater
        label. The multiclass case expects shape = [n_samples, n_classes]
        where the scores correspond to probability estimates.

    Returns
    -------
    auc : float

    References
    ----------
    .. [1] `Wikipedia entry for the Receiver operating characteristic
            <https://en.wikipedia.org/wiki/Receiver_operating_characteristic>`_

    .. [2] Fawcett T. An introduction to ROC analysis[J]. Pattern Recognition
           Letters, 2006, 27(8):861-874.

    .. [3] `Analyzing a portion of the ROC curve. McClish, 1989
            <https://www.ncbi.nlm.nih.gov/pubmed/2668680>`_
    """
    if len(np.unique(y_true)) != 2:
        raise ValueError(Errors.E165)
    fpr, tpr, _ = _roc_curve(y_true, y_score)
    return _auc(fpr, tpr)


def _roc_curve(y_true, y_score):
    """Compute Receiver operating characteristic (ROC)

    Note: this implementation is restricted to the binary classification task.

    Parameters
    ----------

    y_true : array, shape = [n_samples]
        True binary labels. If labels are not either {-1, 1} or {0, 1}, then
        pos_label should be explicitly given.

    y_score : array, shape = [n_samples]
        Target scores, can either be probability estimates of the positive
        class, confidence values, or non-thresholded measure of decisions
        (as returned by "decision_function" on some classifiers).

    Returns
    -------
    fpr : array, shape = [>2]
        Increasing false positive rates such that element i is the false
        positive rate of predictions with score >= thresholds[i].

    tpr : array, shape = [>2]
        Increasing true positive rates such that element i is the true
        positive rate of predictions with score >= thresholds[i].

    thresholds : array, shape = [n_thresholds]
        Decreasing thresholds on the decision function used to compute
        fpr and tpr. `thresholds[0]` represents no instances being predicted
        and is arbitrarily set to `max(y_score) + 1`.

    Notes
    -----
    Since the thresholds are sorted from low to high values, they
    are reversed upon returning them to ensure they correspond to both ``fpr``
    and ``tpr``, which are sorted in reversed order during their calculation.

    References
    ----------
    .. [1] `Wikipedia entry for the Receiver operating characteristic
            <https://en.wikipedia.org/wiki/Receiver_operating_characteristic>`_

    .. [2] Fawcett T. An introduction to ROC analysis[J]. Pattern Recognition
           Letters, 2006, 27(8):861-874.
    """
    fps, tps, thresholds = _binary_clf_curve(y_true, y_score)

    # Add an extra threshold position
    # to make sure that the curve starts at (0, 0)
    tps = np.r_[0, tps]
    fps = np.r_[0, fps]
    thresholds = np.r_[thresholds[0] + 1, thresholds]

    if fps[-1] <= 0:
        fpr = np.repeat(np.nan, fps.shape)
    else:
        fpr = fps / fps[-1]

    if tps[-1] <= 0:
        tpr = np.repeat(np.nan, tps.shape)
    else:
        tpr = tps / tps[-1]

    return fpr, tpr, thresholds


def _binary_clf_curve(y_true, y_score):
    """Calculate true and false positives per binary classification threshold.

    Parameters
    ----------
    y_true : array, shape = [n_samples]
        True targets of binary classification

    y_score : array, shape = [n_samples]
        Estimated probabilities or decision function

    Returns
    -------
    fps : array, shape = [n_thresholds]
        A count of false positives, at index i being the number of negative
        samples assigned a score >= thresholds[i]. The total number of
        negative samples is equal to fps[-1] (thus true negatives are given by
        fps[-1] - fps).

    tps : array, shape = [n_thresholds <= len(np.unique(y_score))]
        An increasing count of true positives, at index i being the number
        of positive samples assigned a score >= thresholds[i]. The total
        number of positive samples is equal to tps[-1] (thus false negatives
        are given by tps[-1] - tps).

    thresholds : array, shape = [n_thresholds]
        Decreasing score values.
    """
    pos_label = 1.0

    y_true = np.ravel(y_true)
    y_score = np.ravel(y_score)

    # make y_true a boolean vector
    y_true = y_true == pos_label

    # sort scores and corresponding truth values
    desc_score_indices = np.argsort(y_score, kind="mergesort")[::-1]
    y_score = y_score[desc_score_indices]
    y_true = y_true[desc_score_indices]
    weight = 1.0

    # y_score typically has many tied values. Here we extract
    # the indices associated with the distinct values. We also
    # concatenate a value for the end of the curve.
    distinct_value_indices = np.where(np.diff(y_score))[0]
    threshold_idxs = np.r_[distinct_value_indices, y_true.size - 1]

    # accumulate the true positives with decreasing threshold
    tps = _stable_cumsum(y_true * weight)[threshold_idxs]
    fps = 1 + threshold_idxs - tps
    return fps, tps, y_score[threshold_idxs]


def _stable_cumsum(arr, axis=None, rtol=1e-05, atol=1e-08):
    """Use high precision for cumsum and check that final value matches sum

    Parameters
    ----------
    arr : array-like
        To be cumulatively summed as flat
    axis : int, optional
        Axis along which the cumulative sum is computed.
        The default (None) is to compute the cumsum over the flattened array.
    rtol : float
        Relative tolerance, see ``np.allclose``
    atol : float
        Absolute tolerance, see ``np.allclose``
    """
    out = np.cumsum(arr, axis=axis, dtype=np.float64)
    expected = np.sum(arr, axis=axis, dtype=np.float64)
    if not np.all(
        np.isclose(
            out.take(-1, axis=axis), expected, rtol=rtol, atol=atol, equal_nan=True
        )
    ):
        raise ValueError(Errors.E163)
    return out


def _auc(x, y):
    """Compute Area Under the Curve (AUC) using the trapezoidal rule

    This is a general function, given points on a curve.  For computing the
    area under the ROC-curve, see :func:`roc_auc_score`.

    Parameters
    ----------
    x : array, shape = [n]
        x coordinates. These must be either monotonic increasing or monotonic
        decreasing.
    y : array, shape = [n]
        y coordinates.

    Returns
    -------
    auc : float
    """
    x = np.ravel(x)
    y = np.ravel(y)

    direction = 1
    dx = np.diff(x)
    if np.any(dx < 0):
        if np.all(dx <= 0):
            direction = -1
        else:
            raise ValueError(Errors.E164.format(x))

    area = direction * np.trapz(y, x)
    if isinstance(area, np.memmap):
        # Reductions such as .sum used internally in np.trapz do not return a
        # scalar by default for numpy.memmap instances contrary to
        # regular numpy.ndarray instances.
        area = area.dtype.type(area)
    return area
-												Use frozen list with custom errors

We don't want to break backwards compatibility too much but we also want to provide the best possible UX

											
										
										
											2020-08-29 16:20:11 +03:00
+								from typing import Optional, Iterable, Dict, Any, Callable, TYPE_CHECKING
-												Add textcat to train CLI (#4226)

* Add doc.cats to spacy.gold at the paragraph level

Support `doc.cats` as `"cats": [{"label": string, "value": number}]` in
the spacy JSON training format at the paragraph level.

* `spacy.gold.docs_to_json()` writes `docs.cats`

* `GoldCorpus` reads in cats in each `GoldParse`

* Update instances of gold_tuples to handle cats

Update iteration over gold_tuples / gold_parses to handle addition of
cats at the paragraph level.

* Add textcat to train CLI

* Add textcat options to train CLI
* Add textcat labels in `TextCategorizer.begin_training()`
* Add textcat evaluation to `Scorer`:
  * For binary exclusive classes with provided label: F1 for label
  * For 2+ exclusive classes: F1 macro average
  * For multilabel (not exclusive): ROC AUC macro average (currently
relying on sklearn)
* Provide user info on textcat evaluation settings, potential
incompatibilities
* Provide pipeline to Scorer in `Language.evaluate` for textcat config
* Customize train CLI output to include only metrics relevant to current
pipeline
* Add textcat evaluation to evaluate CLI

* Fix handling of unset arguments and config params

Fix handling of unset arguments and model confiug parameters in Scorer
initialization.

* Temporarily add sklearn requirement

* Remove sklearn version number

* Improve Scorer handling of models without textcats

* Fixing Scorer handling of models without textcats

* Update Scorer output for python 2.7

* Modify inf in Scorer for python 2.7

* Auto-format

Also make small adjustments to make auto-formatting with black easier and produce nicer results

* Move error message to Errors

* Update documentation

* Add cats to annotation JSON format [ci skip]

* Fix tpl flag and docs [ci skip]

* Switch to internal roc_auc_score

Switch to internal `roc_auc_score()` adapted from scikit-learn.

* Add AUCROCScore tests and improve errors/warnings

* Add tests for AUCROCScore and roc_auc_score
* Add missing error for only positive/negative values
* Remove unnecessary warnings and errors

* Make reduced roc_auc_score functions private

Because most of the checks and warnings have been stripped for the
internal functions and access is only intended through `ROCAUCScore`,
make the functions for roc_auc_score adapted from scikit-learn private.

* Check that data corresponds with multilabel flag

Check that the training instances correspond with the multilabel flag,
adding the multilabel flag if required.

* Add textcat score to early stopping check

* Add more checks to debug-data for textcat

* Add example training data for textcat

* Add more checks to textcat train CLI

* Check configuration when extending base model
* Fix typos

* Update textcat example data

* Provide licensing details and licenses for data
* Remove two labels with no positive instances from jigsaw-toxic-comment
data.


Co-authored-by: Ines Montani <ines@ines.io>
											
										
										
											2019-09-15 23:31:31 +03:00
+								import numpy as np
-												Renaming gold & annotation_setter (#6042)

* version bump to 3.0.0a16

* rename "gold" folder to "training"

* rename 'annotation_setter' to 'set_extra_annotations'

* formatting
											
										
										
											2020-09-09 11:31:03 +03:00
+								from .training import Example
-												Update docs, types and API consistency

											
										
										
											2020-08-17 17:45:24 +03:00
+								from .tokens import Token, Doc, Span
-												Add textcat to train CLI (#4226)

* Add doc.cats to spacy.gold at the paragraph level

Support `doc.cats` as `"cats": [{"label": string, "value": number}]` in
the spacy JSON training format at the paragraph level.

* `spacy.gold.docs_to_json()` writes `docs.cats`

* `GoldCorpus` reads in cats in each `GoldParse`

* Update instances of gold_tuples to handle cats

Update iteration over gold_tuples / gold_parses to handle addition of
cats at the paragraph level.

* Add textcat to train CLI

* Add textcat options to train CLI
* Add textcat labels in `TextCategorizer.begin_training()`
* Add textcat evaluation to `Scorer`:
  * For binary exclusive classes with provided label: F1 for label
  * For 2+ exclusive classes: F1 macro average
  * For multilabel (not exclusive): ROC AUC macro average (currently
relying on sklearn)
* Provide user info on textcat evaluation settings, potential
incompatibilities
* Provide pipeline to Scorer in `Language.evaluate` for textcat config
* Customize train CLI output to include only metrics relevant to current
pipeline
* Add textcat evaluation to evaluate CLI

* Fix handling of unset arguments and config params

Fix handling of unset arguments and model confiug parameters in Scorer
initialization.

* Temporarily add sklearn requirement

* Remove sklearn version number

* Improve Scorer handling of models without textcats

* Fixing Scorer handling of models without textcats

* Update Scorer output for python 2.7

* Modify inf in Scorer for python 2.7

* Auto-format

Also make small adjustments to make auto-formatting with black easier and produce nicer results

* Move error message to Errors

* Update documentation

* Add cats to annotation JSON format [ci skip]

* Fix tpl flag and docs [ci skip]

* Switch to internal roc_auc_score

Switch to internal `roc_auc_score()` adapted from scikit-learn.

* Add AUCROCScore tests and improve errors/warnings

* Add tests for AUCROCScore and roc_auc_score
* Add missing error for only positive/negative values
* Remove unnecessary warnings and errors

* Make reduced roc_auc_score functions private

Because most of the checks and warnings have been stripped for the
internal functions and access is only intended through `ROCAUCScore`,
make the functions for roc_auc_score adapted from scikit-learn private.

* Check that data corresponds with multilabel flag

Check that the training instances correspond with the multilabel flag,
adding the multilabel flag if required.

* Add textcat score to early stopping check

* Add more checks to debug-data for textcat

* Add example training data for textcat

* Add more checks to textcat train CLI

* Check configuration when extending base model
* Fix typos

* Update textcat example data

* Provide licensing details and licenses for data
* Remove two labels with no positive instances from jigsaw-toxic-comment
data.


Co-authored-by: Ines Montani <ines@ines.io>
											
										
										
											2019-09-15 23:31:31 +03:00
+								from .errors import Errors
-												Use frozen list with custom errors

We don't want to break backwards compatibility too much but we also want to provide the best possible UX

											
										
										
											2020-08-29 16:20:11 +03:00
+								from .util import get_lang_class, SimpleFrozenList
-												Refactor the Scorer to improve flexibility (#5731)

* Refactor the Scorer to improve flexibility

Refactor the `Scorer` to improve flexibility for arbitrary pipeline
components.

* Individual pipeline components provide their own `evaluate` methods
that score a list of `Example`s and return a dictionary of scores
* `Scorer` is initialized either:
  * with a provided pipeline containing components to be scored
  * with a default pipeline containing the built-in statistical
    components (senter, tagger, morphologizer, parser, ner)
* `Scorer.score` evaluates a list of `Example`s and returns a dictionary
of scores referring to the scores provided by the components in the
pipeline

Significant differences:

* `tags_acc` is renamed to `tag_acc` to be consistent with `token_acc`
and the new `morph_acc`, `pos_acc`, and `lemma_acc`
* Scoring is no longer cumulative: `Scorer.score` scores a list of
examples rather than a single example and does not retain any state
about previously scored examples
* PRF values in the returned scores are no longer multiplied by 100

* Add kwargs to Morphologizer.evaluate

* Create generalized scoring methods in Scorer

* Generalized static scoring methods are added to `Scorer`
  * Methods require an attribute (either on Token or Doc) that is
used to key the returned scores

Naming differences:

* `uas`, `las`, and `las_per_type` in the scores dict are renamed to
`dep_uas`, `dep_las`, and `dep_las_per_type`

Scoring differences:

* `Doc.sents` is now scored as spans rather than on sentence-initial
token positions so that `Doc.sents` and `Doc.ents` can be scored with
the same method (this lowers scores since a single incorrect sentence
start results in two incorrect spans)

* Simplify / extend hasattr check for eval method

* Add hasattr check to tokenizer scoring
* Simplify to hasattr check for component scoring

* Reset Example alignment if docs are set

Reset the Example alignment if either doc is set in case the
tokenization has changed.

* Add PRF tokenization scoring for tokens as spans

Add PRF scores for tokens as character spans. The scores are:

* token_acc: # correct tokens / # gold tokens
* token_p/r/f: PRF for (token.idx, token.idx + len(token))

* Add docstring to Scorer.score_tokenization

* Rename component.evaluate() to component.score()

* Update Scorer API docs

* Update scoring for positive_label in textcat

* Fix TextCategorizer.score kwargs

* Update Language.evaluate docs

* Update score names in default config
											
										
										
											2020-07-25 13:53:02 +03:00
+								from .morphology import Morphology
-												* Fix evaluation of NER in scorer.py

											
										
										
											2015-05-27 04:18:16 +03:00
-												Small adjustments to Scorer and docs

											
										
										
											2020-07-28 22:39:42 +03:00
+								if TYPE_CHECKING:
 								    # This lets us add type hints for mypy etc. without causing circular imports
 								    from .language import Language  # noqa: F401
 								DEFAULT_PIPELINE = ["senter", "tagger", "morphologizer", "parser", "ner", "textcat"]
-												* Print parse if verbose in scorer

											
										
										
											2015-04-05 23:29:30 +03:00
-												Remove object subclassing

											
										
										
											2020-07-12 15:03:23 +03:00
+								class PRFScore:
-												Small adjustments to Scorer and docs

											
										
										
											2020-07-28 22:39:42 +03:00
+								    """A precision / recall / F score."""
-												💫 Tidy up and auto-format .py files (#2983)

<!--- Provide a general summary of your changes in the title. -->

## Description
- [x] Use [`black`](https://github.com/ambv/black) to auto-format all `.py` files.
- [x] Update flake8 config to exclude very large files (lemmatization tables etc.)
- [x] Update code to be compatible with flake8 rules
- [x] Fix various small bugs, inconsistencies and messy stuff in the language data
- [x] Update docs to explain new code style (`black`, `flake8`, when to use `# fmt: off` and `# fmt: on` and what `# noqa` means)

Once #2932 is merged, which auto-formats and tidies up the CLI, we'll be able to run `flake8 spacy` actually get meaningful results.

At the moment, the code style and linting isn't applied automatically, but I'm hoping that the new [GitHub Actions](https://github.com/features/actions) will let us auto-format pull requests and post comments with relevant linting information.

### Types of change
enhancement, code style

## Checklist
<!--- Before you submit the PR, go over this checklist and make sure you can
tick off all the boxes. [] -> [x] -->
- [x] I have submitted the spaCy Contributor Agreement.
- [x] I ran the tests, and all new and existing tests passed.
- [x] My changes don't require a change to the documentation, or if they do, I've added all required information.

											
										
										
											2018-11-30 19:03:03 +03:00
-												Small adjustments to Scorer and docs

											
										
										
											2020-07-28 22:39:42 +03:00
+								    def __init__(self) -> None:
-												* Update spacy.scorer, to use P/R/F to support tokenization errors

											
										
										
											2015-05-24 21:07:18 +03:00
+								        self.tp = 0
 								        self.fp = 0
 								        self.fn = 0
-												Small adjustments to Scorer and docs

											
										
										
											2020-07-28 22:39:42 +03:00
+								    def score_set(self, cand: set, gold: set) -> None:
-												* Update spacy.scorer, to use P/R/F to support tokenization errors

											
										
										
											2015-05-24 21:07:18 +03:00
+								        self.tp += len(cand.intersection(gold))
 								        self.fp += len(cand - gold)
 								        self.fn += len(gold - cand)
 								    @property
-												Small adjustments to Scorer and docs

											
										
										
											2020-07-28 22:39:42 +03:00
+								    def precision(self) -> float:
-												Fix the git "sparse checkout" functionality (#5973)

* Fix the git sparse checkout functionality

* Format
											
										
										
											2020-08-26 05:00:14 +03:00
+								        return self.tp / (self.tp + self.fp + 1e-100)
-												* Update spacy.scorer, to use P/R/F to support tokenization errors

											
										
										
											2015-05-24 21:07:18 +03:00
 								    @property
-												Small adjustments to Scorer and docs

											
										
										
											2020-07-28 22:39:42 +03:00
+								    def recall(self) -> float:
-												Fix the git "sparse checkout" functionality (#5973)

* Fix the git sparse checkout functionality

* Format
											
										
										
											2020-08-26 05:00:14 +03:00
+								        return self.tp / (self.tp + self.fn + 1e-100)
-												* Update spacy.scorer, to use P/R/F to support tokenization errors

											
										
										
											2015-05-24 21:07:18 +03:00
 								    @property
-												Small adjustments to Scorer and docs

											
										
										
											2020-07-28 22:39:42 +03:00
+								    def fscore(self) -> float:
-												* Update spacy.scorer, to use P/R/F to support tokenization errors

											
										
										
											2015-05-24 21:07:18 +03:00
+								        p = self.precision
 								        r = self.recall
-												Various small tweaks to project CLI (#5965)

* Fix up/download of http and local paths

* Support git_sparse_checkout for assets

* Fix scorer

* Handle already-present directories for git assets

* Improve convert command

* Fix support for existant files in git assets

* Support branches in git sparse checkout

* Format

* Fix git assets

* Document git block in assets

* Fix test

* Fix test

* Revert "Fix test"

This reverts commit cf3097260fd2205604981dcee1030f835a548a3f.

* Revert "Fix test"

This reverts commit 964d636e2782ed4660942dffc905cb7e739659d6.

* Dont multiply p/r/f by 100

* Display scores * 100 during training
											
										
										
											2020-08-25 01:30:52 +03:00
+								        return 2 * ((p * r) / (p + r + 1e-100))
-												* Update spacy.scorer, to use P/R/F to support tokenization errors

											
										
										
											2015-05-24 21:07:18 +03:00
-												Small adjustments to Scorer and docs

											
										
										
											2020-07-28 22:39:42 +03:00
+								    def to_dict(self) -> Dict[str, float]:
-												Refactor the Scorer to improve flexibility (#5731)

* Refactor the Scorer to improve flexibility

Refactor the `Scorer` to improve flexibility for arbitrary pipeline
components.

* Individual pipeline components provide their own `evaluate` methods
that score a list of `Example`s and return a dictionary of scores
* `Scorer` is initialized either:
  * with a provided pipeline containing components to be scored
  * with a default pipeline containing the built-in statistical
    components (senter, tagger, morphologizer, parser, ner)
* `Scorer.score` evaluates a list of `Example`s and returns a dictionary
of scores referring to the scores provided by the components in the
pipeline

Significant differences:

* `tags_acc` is renamed to `tag_acc` to be consistent with `token_acc`
and the new `morph_acc`, `pos_acc`, and `lemma_acc`
* Scoring is no longer cumulative: `Scorer.score` scores a list of
examples rather than a single example and does not retain any state
about previously scored examples
* PRF values in the returned scores are no longer multiplied by 100

* Add kwargs to Morphologizer.evaluate

* Create generalized scoring methods in Scorer

* Generalized static scoring methods are added to `Scorer`
  * Methods require an attribute (either on Token or Doc) that is
used to key the returned scores

Naming differences:

* `uas`, `las`, and `las_per_type` in the scores dict are renamed to
`dep_uas`, `dep_las`, and `dep_las_per_type`

Scoring differences:

* `Doc.sents` is now scored as spans rather than on sentence-initial
token positions so that `Doc.sents` and `Doc.ents` can be scored with
the same method (this lowers scores since a single incorrect sentence
start results in two incorrect spans)

* Simplify / extend hasattr check for eval method

* Add hasattr check to tokenizer scoring
* Simplify to hasattr check for component scoring

* Reset Example alignment if docs are set

Reset the Example alignment if either doc is set in case the
tokenization has changed.

* Add PRF tokenization scoring for tokens as spans

Add PRF scores for tokens as character spans. The scores are:

* token_acc: # correct tokens / # gold tokens
* token_p/r/f: PRF for (token.idx, token.idx + len(token))

* Add docstring to Scorer.score_tokenization

* Rename component.evaluate() to component.score()

* Update Scorer API docs

* Update scoring for positive_label in textcat

* Fix TextCategorizer.score kwargs

* Update Language.evaluate docs

* Update score names in default config
											
										
										
											2020-07-25 13:53:02 +03:00
+								        return {"p": self.precision, "r": self.recall, "f": self.fscore}
-												* Update spacy.scorer, to use P/R/F to support tokenization errors

											
										
										
											2015-05-24 21:07:18 +03:00
-												Remove object subclassing

											
										
										
											2020-07-12 15:03:23 +03:00
+								class ROCAUCScore:
-												Small adjustments to Scorer and docs

											
										
										
											2020-07-28 22:39:42 +03:00
+								    """An AUC ROC score."""
-												Add textcat to train CLI (#4226)

* Add doc.cats to spacy.gold at the paragraph level

Support `doc.cats` as `"cats": [{"label": string, "value": number}]` in
the spacy JSON training format at the paragraph level.

* `spacy.gold.docs_to_json()` writes `docs.cats`

* `GoldCorpus` reads in cats in each `GoldParse`

* Update instances of gold_tuples to handle cats

Update iteration over gold_tuples / gold_parses to handle addition of
cats at the paragraph level.

* Add textcat to train CLI

* Add textcat options to train CLI
* Add textcat labels in `TextCategorizer.begin_training()`
* Add textcat evaluation to `Scorer`:
  * For binary exclusive classes with provided label: F1 for label
  * For 2+ exclusive classes: F1 macro average
  * For multilabel (not exclusive): ROC AUC macro average (currently
relying on sklearn)
* Provide user info on textcat evaluation settings, potential
incompatibilities
* Provide pipeline to Scorer in `Language.evaluate` for textcat config
* Customize train CLI output to include only metrics relevant to current
pipeline
* Add textcat evaluation to evaluate CLI

* Fix handling of unset arguments and config params

Fix handling of unset arguments and model confiug parameters in Scorer
initialization.

* Temporarily add sklearn requirement

* Remove sklearn version number

* Improve Scorer handling of models without textcats

* Fixing Scorer handling of models without textcats

* Update Scorer output for python 2.7

* Modify inf in Scorer for python 2.7

* Auto-format

Also make small adjustments to make auto-formatting with black easier and produce nicer results

* Move error message to Errors

* Update documentation

* Add cats to annotation JSON format [ci skip]

* Fix tpl flag and docs [ci skip]

* Switch to internal roc_auc_score

Switch to internal `roc_auc_score()` adapted from scikit-learn.

* Add AUCROCScore tests and improve errors/warnings

* Add tests for AUCROCScore and roc_auc_score
* Add missing error for only positive/negative values
* Remove unnecessary warnings and errors

* Make reduced roc_auc_score functions private

Because most of the checks and warnings have been stripped for the
internal functions and access is only intended through `ROCAUCScore`,
make the functions for roc_auc_score adapted from scikit-learn private.

* Check that data corresponds with multilabel flag

Check that the training instances correspond with the multilabel flag,
adding the multilabel flag if required.

* Add textcat score to early stopping check

* Add more checks to debug-data for textcat

* Add example training data for textcat

* Add more checks to textcat train CLI

* Check configuration when extending base model
* Fix typos

* Update textcat example data

* Provide licensing details and licenses for data
* Remove two labels with no positive instances from jigsaw-toxic-comment
data.


Co-authored-by: Ines Montani <ines@ines.io>
											
										
										
											2019-09-15 23:31:31 +03:00
-												Fix types in Scorer

											
										
										
											2020-07-29 11:39:33 +03:00
+								    def __init__(self) -> None:
-												Add textcat to train CLI (#4226)

* Add doc.cats to spacy.gold at the paragraph level

Support `doc.cats` as `"cats": [{"label": string, "value": number}]` in
the spacy JSON training format at the paragraph level.

* `spacy.gold.docs_to_json()` writes `docs.cats`

* `GoldCorpus` reads in cats in each `GoldParse`

* Update instances of gold_tuples to handle cats

Update iteration over gold_tuples / gold_parses to handle addition of
cats at the paragraph level.

* Add textcat to train CLI

* Add textcat options to train CLI
* Add textcat labels in `TextCategorizer.begin_training()`
* Add textcat evaluation to `Scorer`:
  * For binary exclusive classes with provided label: F1 for label
  * For 2+ exclusive classes: F1 macro average
  * For multilabel (not exclusive): ROC AUC macro average (currently
relying on sklearn)
* Provide user info on textcat evaluation settings, potential
incompatibilities
* Provide pipeline to Scorer in `Language.evaluate` for textcat config
* Customize train CLI output to include only metrics relevant to current
pipeline
* Add textcat evaluation to evaluate CLI

* Fix handling of unset arguments and config params

Fix handling of unset arguments and model confiug parameters in Scorer
initialization.

* Temporarily add sklearn requirement

* Remove sklearn version number

* Improve Scorer handling of models without textcats

* Fixing Scorer handling of models without textcats

* Update Scorer output for python 2.7

* Modify inf in Scorer for python 2.7

* Auto-format

Also make small adjustments to make auto-formatting with black easier and produce nicer results

* Move error message to Errors

* Update documentation

* Add cats to annotation JSON format [ci skip]

* Fix tpl flag and docs [ci skip]

* Switch to internal roc_auc_score

Switch to internal `roc_auc_score()` adapted from scikit-learn.

* Add AUCROCScore tests and improve errors/warnings

* Add tests for AUCROCScore and roc_auc_score
* Add missing error for only positive/negative values
* Remove unnecessary warnings and errors

* Make reduced roc_auc_score functions private

Because most of the checks and warnings have been stripped for the
internal functions and access is only intended through `ROCAUCScore`,
make the functions for roc_auc_score adapted from scikit-learn private.

* Check that data corresponds with multilabel flag

Check that the training instances correspond with the multilabel flag,
adding the multilabel flag if required.

* Add textcat score to early stopping check

* Add more checks to debug-data for textcat

* Add example training data for textcat

* Add more checks to textcat train CLI

* Check configuration when extending base model
* Fix typos

* Update textcat example data

* Provide licensing details and licenses for data
* Remove two labels with no positive instances from jigsaw-toxic-comment
data.


Co-authored-by: Ines Montani <ines@ines.io>
											
										
										
											2019-09-15 23:31:31 +03:00
+								        self.golds = []
 								        self.cands = []
 								        self.saved_score = 0.0
 								        self.saved_score_at_len = 0
-												Small adjustments to Scorer and docs

											
										
										
											2020-07-28 22:39:42 +03:00
+								    def score_set(self, cand, gold) -> None:
-												Add textcat to train CLI (#4226)

* Add doc.cats to spacy.gold at the paragraph level

Support `doc.cats` as `"cats": [{"label": string, "value": number}]` in
the spacy JSON training format at the paragraph level.

* `spacy.gold.docs_to_json()` writes `docs.cats`

* `GoldCorpus` reads in cats in each `GoldParse`

* Update instances of gold_tuples to handle cats

Update iteration over gold_tuples / gold_parses to handle addition of
cats at the paragraph level.

* Add textcat to train CLI

* Add textcat options to train CLI
* Add textcat labels in `TextCategorizer.begin_training()`
* Add textcat evaluation to `Scorer`:
  * For binary exclusive classes with provided label: F1 for label
  * For 2+ exclusive classes: F1 macro average
  * For multilabel (not exclusive): ROC AUC macro average (currently
relying on sklearn)
* Provide user info on textcat evaluation settings, potential
incompatibilities
* Provide pipeline to Scorer in `Language.evaluate` for textcat config
* Customize train CLI output to include only metrics relevant to current
pipeline
* Add textcat evaluation to evaluate CLI

* Fix handling of unset arguments and config params

Fix handling of unset arguments and model confiug parameters in Scorer
initialization.

* Temporarily add sklearn requirement

* Remove sklearn version number

* Improve Scorer handling of models without textcats

* Fixing Scorer handling of models without textcats

* Update Scorer output for python 2.7

* Modify inf in Scorer for python 2.7

* Auto-format

Also make small adjustments to make auto-formatting with black easier and produce nicer results

* Move error message to Errors

* Update documentation

* Add cats to annotation JSON format [ci skip]

* Fix tpl flag and docs [ci skip]

* Switch to internal roc_auc_score

Switch to internal `roc_auc_score()` adapted from scikit-learn.

* Add AUCROCScore tests and improve errors/warnings

* Add tests for AUCROCScore and roc_auc_score
* Add missing error for only positive/negative values
* Remove unnecessary warnings and errors

* Make reduced roc_auc_score functions private

Because most of the checks and warnings have been stripped for the
internal functions and access is only intended through `ROCAUCScore`,
make the functions for roc_auc_score adapted from scikit-learn private.

* Check that data corresponds with multilabel flag

Check that the training instances correspond with the multilabel flag,
adding the multilabel flag if required.

* Add textcat score to early stopping check

* Add more checks to debug-data for textcat

* Add example training data for textcat

* Add more checks to textcat train CLI

* Check configuration when extending base model
* Fix typos

* Update textcat example data

* Provide licensing details and licenses for data
* Remove two labels with no positive instances from jigsaw-toxic-comment
data.


Co-authored-by: Ines Montani <ines@ines.io>
											
										
										
											2019-09-15 23:31:31 +03:00
+								        self.cands.append(cand)
 								        self.golds.append(gold)
 								    @property
 								    def score(self):
 								        if len(self.golds) == self.saved_score_at_len:
 								            return self.saved_score
 								        try:
 								            self.saved_score = _roc_auc_score(self.golds, self.cands)
 								        # catch ValueError: Only one class present in y_true.
 								        # ROC AUC score is not defined in that case.
-												Make except more explicit

											
										
										
											2019-09-18 20:57:08 +03:00
+								        except ValueError:
-												Add textcat to train CLI (#4226)

* Add doc.cats to spacy.gold at the paragraph level

Support `doc.cats` as `"cats": [{"label": string, "value": number}]` in
the spacy JSON training format at the paragraph level.

* `spacy.gold.docs_to_json()` writes `docs.cats`

* `GoldCorpus` reads in cats in each `GoldParse`

* Update instances of gold_tuples to handle cats

Update iteration over gold_tuples / gold_parses to handle addition of
cats at the paragraph level.

* Add textcat to train CLI

* Add textcat options to train CLI
* Add textcat labels in `TextCategorizer.begin_training()`
* Add textcat evaluation to `Scorer`:
  * For binary exclusive classes with provided label: F1 for label
  * For 2+ exclusive classes: F1 macro average
  * For multilabel (not exclusive): ROC AUC macro average (currently
relying on sklearn)
* Provide user info on textcat evaluation settings, potential
incompatibilities
* Provide pipeline to Scorer in `Language.evaluate` for textcat config
* Customize train CLI output to include only metrics relevant to current
pipeline
* Add textcat evaluation to evaluate CLI

* Fix handling of unset arguments and config params

Fix handling of unset arguments and model confiug parameters in Scorer
initialization.

* Temporarily add sklearn requirement

* Remove sklearn version number

* Improve Scorer handling of models without textcats

* Fixing Scorer handling of models without textcats

* Update Scorer output for python 2.7

* Modify inf in Scorer for python 2.7

* Auto-format

Also make small adjustments to make auto-formatting with black easier and produce nicer results

* Move error message to Errors

* Update documentation

* Add cats to annotation JSON format [ci skip]

* Fix tpl flag and docs [ci skip]

* Switch to internal roc_auc_score

Switch to internal `roc_auc_score()` adapted from scikit-learn.

* Add AUCROCScore tests and improve errors/warnings

* Add tests for AUCROCScore and roc_auc_score
* Add missing error for only positive/negative values
* Remove unnecessary warnings and errors

* Make reduced roc_auc_score functions private

Because most of the checks and warnings have been stripped for the
internal functions and access is only intended through `ROCAUCScore`,
make the functions for roc_auc_score adapted from scikit-learn private.

* Check that data corresponds with multilabel flag

Check that the training instances correspond with the multilabel flag,
adding the multilabel flag if required.

* Add textcat score to early stopping check

* Add more checks to debug-data for textcat

* Add example training data for textcat

* Add more checks to textcat train CLI

* Check configuration when extending base model
* Fix typos

* Update textcat example data

* Provide licensing details and licenses for data
* Remove two labels with no positive instances from jigsaw-toxic-comment
data.


Co-authored-by: Ines Montani <ines@ines.io>
											
										
										
											2019-09-15 23:31:31 +03:00
+								            self.saved_score = -float("inf")
 								        self.saved_score_at_len = len(self.golds)
 								        return self.saved_score
-												Remove object subclassing

											
										
										
											2020-07-12 15:03:23 +03:00
+								class Scorer:
-												Update Scorer and add API docs

											
										
										
											2019-05-24 15:06:04 +03:00
+								    """Compute evaluation scores."""
-												Small adjustments to Scorer and docs

											
										
										
											2020-07-28 22:39:42 +03:00
+								    def __init__(
 								        self,
 								        nlp: Optional["Language"] = None,
 								        default_lang: str = "xx",
 								        default_pipeline=DEFAULT_PIPELINE,
 								        **cfg,
 								    ) -> None:
-												Update Scorer and add API docs

											
										
										
											2019-05-24 15:06:04 +03:00
+								        """Initialize the Scorer.
-												Update docs links in codebase

											
										
										
											2020-09-04 13:58:50 +03:00
+								        DOCS: https://nightly.spacy.io/api/scorer#init
-												Update Scorer and add API docs

											
										
										
											2019-05-24 15:06:04 +03:00
+								        """
-												Refactor the Scorer to improve flexibility (#5731)

* Refactor the Scorer to improve flexibility

Refactor the `Scorer` to improve flexibility for arbitrary pipeline
components.

* Individual pipeline components provide their own `evaluate` methods
that score a list of `Example`s and return a dictionary of scores
* `Scorer` is initialized either:
  * with a provided pipeline containing components to be scored
  * with a default pipeline containing the built-in statistical
    components (senter, tagger, morphologizer, parser, ner)
* `Scorer.score` evaluates a list of `Example`s and returns a dictionary
of scores referring to the scores provided by the components in the
pipeline

Significant differences:

* `tags_acc` is renamed to `tag_acc` to be consistent with `token_acc`
and the new `morph_acc`, `pos_acc`, and `lemma_acc`
* Scoring is no longer cumulative: `Scorer.score` scores a list of
examples rather than a single example and does not retain any state
about previously scored examples
* PRF values in the returned scores are no longer multiplied by 100

* Add kwargs to Morphologizer.evaluate

* Create generalized scoring methods in Scorer

* Generalized static scoring methods are added to `Scorer`
  * Methods require an attribute (either on Token or Doc) that is
used to key the returned scores

Naming differences:

* `uas`, `las`, and `las_per_type` in the scores dict are renamed to
`dep_uas`, `dep_las`, and `dep_las_per_type`

Scoring differences:

* `Doc.sents` is now scored as spans rather than on sentence-initial
token positions so that `Doc.sents` and `Doc.ents` can be scored with
the same method (this lowers scores since a single incorrect sentence
start results in two incorrect spans)

* Simplify / extend hasattr check for eval method

* Add hasattr check to tokenizer scoring
* Simplify to hasattr check for component scoring

* Reset Example alignment if docs are set

Reset the Example alignment if either doc is set in case the
tokenization has changed.

* Add PRF tokenization scoring for tokens as spans

Add PRF scores for tokens as character spans. The scores are:

* token_acc: # correct tokens / # gold tokens
* token_p/r/f: PRF for (token.idx, token.idx + len(token))

* Add docstring to Scorer.score_tokenization

* Rename component.evaluate() to component.score()

* Update Scorer API docs

* Update scoring for positive_label in textcat

* Fix TextCategorizer.score kwargs

* Update Language.evaluate docs

* Update score names in default config
											
										
										
											2020-07-25 13:53:02 +03:00
+								        self.nlp = nlp
 								        self.cfg = cfg
 								        if not nlp:
-												Small adjustments to Scorer and docs

											
										
										
											2020-07-28 22:39:42 +03:00
+								            nlp = get_lang_class(default_lang)()
 								            for pipe in default_pipeline:
 								                nlp.add_pipe(pipe)
-												Refactor the Scorer to improve flexibility (#5731)

* Refactor the Scorer to improve flexibility

Refactor the `Scorer` to improve flexibility for arbitrary pipeline
components.

* Individual pipeline components provide their own `evaluate` methods
that score a list of `Example`s and return a dictionary of scores
* `Scorer` is initialized either:
  * with a provided pipeline containing components to be scored
  * with a default pipeline containing the built-in statistical
    components (senter, tagger, morphologizer, parser, ner)
* `Scorer.score` evaluates a list of `Example`s and returns a dictionary
of scores referring to the scores provided by the components in the
pipeline

Significant differences:

* `tags_acc` is renamed to `tag_acc` to be consistent with `token_acc`
and the new `morph_acc`, `pos_acc`, and `lemma_acc`
* Scoring is no longer cumulative: `Scorer.score` scores a list of
examples rather than a single example and does not retain any state
about previously scored examples
* PRF values in the returned scores are no longer multiplied by 100

* Add kwargs to Morphologizer.evaluate

* Create generalized scoring methods in Scorer

* Generalized static scoring methods are added to `Scorer`
  * Methods require an attribute (either on Token or Doc) that is
used to key the returned scores

Naming differences:

* `uas`, `las`, and `las_per_type` in the scores dict are renamed to
`dep_uas`, `dep_las`, and `dep_las_per_type`

Scoring differences:

* `Doc.sents` is now scored as spans rather than on sentence-initial
token positions so that `Doc.sents` and `Doc.ents` can be scored with
the same method (this lowers scores since a single incorrect sentence
start results in two incorrect spans)

* Simplify / extend hasattr check for eval method

* Add hasattr check to tokenizer scoring
* Simplify to hasattr check for component scoring

* Reset Example alignment if docs are set

Reset the Example alignment if either doc is set in case the
tokenization has changed.

* Add PRF tokenization scoring for tokens as spans

Add PRF scores for tokens as character spans. The scores are:

* token_acc: # correct tokens / # gold tokens
* token_p/r/f: PRF for (token.idx, token.idx + len(token))

* Add docstring to Scorer.score_tokenization

* Rename component.evaluate() to component.score()

* Update Scorer API docs

* Update scoring for positive_label in textcat

* Fix TextCategorizer.score kwargs

* Update Language.evaluate docs

* Update score names in default config
											
										
										
											2020-07-25 13:53:02 +03:00
+								            self.nlp = nlp
-												Small adjustments to Scorer and docs

											
										
										
											2020-07-28 22:39:42 +03:00
+								    def score(self, examples: Iterable[Example]) -> Dict[str, Any]:
-												Refactor the Scorer to improve flexibility (#5731)

* Refactor the Scorer to improve flexibility

Refactor the `Scorer` to improve flexibility for arbitrary pipeline
components.

* Individual pipeline components provide their own `evaluate` methods
that score a list of `Example`s and return a dictionary of scores
* `Scorer` is initialized either:
  * with a provided pipeline containing components to be scored
  * with a default pipeline containing the built-in statistical
    components (senter, tagger, morphologizer, parser, ner)
* `Scorer.score` evaluates a list of `Example`s and returns a dictionary
of scores referring to the scores provided by the components in the
pipeline

Significant differences:

* `tags_acc` is renamed to `tag_acc` to be consistent with `token_acc`
and the new `morph_acc`, `pos_acc`, and `lemma_acc`
* Scoring is no longer cumulative: `Scorer.score` scores a list of
examples rather than a single example and does not retain any state
about previously scored examples
* PRF values in the returned scores are no longer multiplied by 100

* Add kwargs to Morphologizer.evaluate

* Create generalized scoring methods in Scorer

* Generalized static scoring methods are added to `Scorer`
  * Methods require an attribute (either on Token or Doc) that is
used to key the returned scores

Naming differences:

* `uas`, `las`, and `las_per_type` in the scores dict are renamed to
`dep_uas`, `dep_las`, and `dep_las_per_type`

Scoring differences:

* `Doc.sents` is now scored as spans rather than on sentence-initial
token positions so that `Doc.sents` and `Doc.ents` can be scored with
the same method (this lowers scores since a single incorrect sentence
start results in two incorrect spans)

* Simplify / extend hasattr check for eval method

* Add hasattr check to tokenizer scoring
* Simplify to hasattr check for component scoring

* Reset Example alignment if docs are set

Reset the Example alignment if either doc is set in case the
tokenization has changed.

* Add PRF tokenization scoring for tokens as spans

Add PRF scores for tokens as character spans. The scores are:

* token_acc: # correct tokens / # gold tokens
* token_p/r/f: PRF for (token.idx, token.idx + len(token))

* Add docstring to Scorer.score_tokenization

* Rename component.evaluate() to component.score()

* Update Scorer API docs

* Update scoring for positive_label in textcat

* Fix TextCategorizer.score kwargs

* Update Language.evaluate docs

* Update score names in default config
											
										
										
											2020-07-25 13:53:02 +03:00
+								        """Evaluate a list of Examples.
 								        examples (Iterable[Example]): The predicted annotations + correct annotations.
 								        RETURNS (Dict): A dictionary of scores.
-												Small adjustments to Scorer and docs

											
										
										
											2020-07-28 22:39:42 +03:00
-												Update docs links in codebase

											
										
										
											2020-09-04 13:58:50 +03:00
+								        DOCS: https://nightly.spacy.io/api/scorer#score
-												Add a tagger-based SentenceRecognizer (#4713)

* Add sent_starts to GoldParse

* Add SentTagger pipeline component

Add `SentTagger` pipeline component as a subclass of `Tagger`.

* Model reduces default parameters from `Tagger` to be small and fast
* Hard-coded set of two labels:
  * S (1): token at beginning of sentence
  * I (0): all other sentence positions
* Sets `token.sent_start` values

* Add sentence segmentation to Scorer

Report `sent_p/r/f` for sentence boundaries, which may be provided by
various pipeline components.

* Add sentence segmentation to CLI evaluate

* Add senttagger metrics/scoring to train CLI

* Rename SentTagger to SentenceRecognizer

* Add SentenceRecognizer to spacy.pipes imports

* Add SentenceRecognizer serialization test

* Shorten component name to sentrec

* Remove duplicates from train CLI output metrics

											
										
										
											2019-11-28 13:10:07 +03:00
+								        """
-												Refactor the Scorer to improve flexibility (#5731)

* Refactor the Scorer to improve flexibility

Refactor the `Scorer` to improve flexibility for arbitrary pipeline
components.

* Individual pipeline components provide their own `evaluate` methods
that score a list of `Example`s and return a dictionary of scores
* `Scorer` is initialized either:
  * with a provided pipeline containing components to be scored
  * with a default pipeline containing the built-in statistical
    components (senter, tagger, morphologizer, parser, ner)
* `Scorer.score` evaluates a list of `Example`s and returns a dictionary
of scores referring to the scores provided by the components in the
pipeline

Significant differences:

* `tags_acc` is renamed to `tag_acc` to be consistent with `token_acc`
and the new `morph_acc`, `pos_acc`, and `lemma_acc`
* Scoring is no longer cumulative: `Scorer.score` scores a list of
examples rather than a single example and does not retain any state
about previously scored examples
* PRF values in the returned scores are no longer multiplied by 100

* Add kwargs to Morphologizer.evaluate

* Create generalized scoring methods in Scorer

* Generalized static scoring methods are added to `Scorer`
  * Methods require an attribute (either on Token or Doc) that is
used to key the returned scores

Naming differences:

* `uas`, `las`, and `las_per_type` in the scores dict are renamed to
`dep_uas`, `dep_las`, and `dep_las_per_type`

Scoring differences:

* `Doc.sents` is now scored as spans rather than on sentence-initial
token positions so that `Doc.sents` and `Doc.ents` can be scored with
the same method (this lowers scores since a single incorrect sentence
start results in two incorrect spans)

* Simplify / extend hasattr check for eval method

* Add hasattr check to tokenizer scoring
* Simplify to hasattr check for component scoring

* Reset Example alignment if docs are set

Reset the Example alignment if either doc is set in case the
tokenization has changed.

* Add PRF tokenization scoring for tokens as spans

Add PRF scores for tokens as character spans. The scores are:

* token_acc: # correct tokens / # gold tokens
* token_p/r/f: PRF for (token.idx, token.idx + len(token))

* Add docstring to Scorer.score_tokenization

* Rename component.evaluate() to component.score()

* Update Scorer API docs

* Update scoring for positive_label in textcat

* Fix TextCategorizer.score kwargs

* Update Language.evaluate docs

* Update score names in default config
											
										
										
											2020-07-25 13:53:02 +03:00
+								        scores = {}
 								        if hasattr(self.nlp.tokenizer, "score"):
 								            scores.update(self.nlp.tokenizer.score(examples, **self.cfg))
 								        for name, component in self.nlp.pipeline:
 								            if hasattr(component, "score"):
 								                scores.update(component.score(examples, **self.cfg))
 								        return scores
-												* Add scorer script

											
										
										
											2015-03-11 04:07:03 +03:00
-												Refactor the Scorer to improve flexibility (#5731)

* Refactor the Scorer to improve flexibility

Refactor the `Scorer` to improve flexibility for arbitrary pipeline
components.

* Individual pipeline components provide their own `evaluate` methods
that score a list of `Example`s and return a dictionary of scores
* `Scorer` is initialized either:
  * with a provided pipeline containing components to be scored
  * with a default pipeline containing the built-in statistical
    components (senter, tagger, morphologizer, parser, ner)
* `Scorer.score` evaluates a list of `Example`s and returns a dictionary
of scores referring to the scores provided by the components in the
pipeline

Significant differences:

* `tags_acc` is renamed to `tag_acc` to be consistent with `token_acc`
and the new `morph_acc`, `pos_acc`, and `lemma_acc`
* Scoring is no longer cumulative: `Scorer.score` scores a list of
examples rather than a single example and does not retain any state
about previously scored examples
* PRF values in the returned scores are no longer multiplied by 100

* Add kwargs to Morphologizer.evaluate

* Create generalized scoring methods in Scorer

* Generalized static scoring methods are added to `Scorer`
  * Methods require an attribute (either on Token or Doc) that is
used to key the returned scores

Naming differences:

* `uas`, `las`, and `las_per_type` in the scores dict are renamed to
`dep_uas`, `dep_las`, and `dep_las_per_type`

Scoring differences:

* `Doc.sents` is now scored as spans rather than on sentence-initial
token positions so that `Doc.sents` and `Doc.ents` can be scored with
the same method (this lowers scores since a single incorrect sentence
start results in two incorrect spans)

* Simplify / extend hasattr check for eval method

* Add hasattr check to tokenizer scoring
* Simplify to hasattr check for component scoring

* Reset Example alignment if docs are set

Reset the Example alignment if either doc is set in case the
tokenization has changed.

* Add PRF tokenization scoring for tokens as spans

Add PRF scores for tokens as character spans. The scores are:

* token_acc: # correct tokens / # gold tokens
* token_p/r/f: PRF for (token.idx, token.idx + len(token))

* Add docstring to Scorer.score_tokenization

* Rename component.evaluate() to component.score()

* Update Scorer API docs

* Update scoring for positive_label in textcat

* Fix TextCategorizer.score kwargs

* Update Language.evaluate docs

* Update score names in default config
											
										
										
											2020-07-25 13:53:02 +03:00
+								    @staticmethod
-												Small adjustments to Scorer and docs

											
										
										
											2020-07-28 22:39:42 +03:00
+								    def score_tokenization(examples: Iterable[Example], **cfg) -> Dict[str, float]:
-												Refactor the Scorer to improve flexibility (#5731)

* Refactor the Scorer to improve flexibility

Refactor the `Scorer` to improve flexibility for arbitrary pipeline
components.

* Individual pipeline components provide their own `evaluate` methods
that score a list of `Example`s and return a dictionary of scores
* `Scorer` is initialized either:
  * with a provided pipeline containing components to be scored
  * with a default pipeline containing the built-in statistical
    components (senter, tagger, morphologizer, parser, ner)
* `Scorer.score` evaluates a list of `Example`s and returns a dictionary
of scores referring to the scores provided by the components in the
pipeline

Significant differences:

* `tags_acc` is renamed to `tag_acc` to be consistent with `token_acc`
and the new `morph_acc`, `pos_acc`, and `lemma_acc`
* Scoring is no longer cumulative: `Scorer.score` scores a list of
examples rather than a single example and does not retain any state
about previously scored examples
* PRF values in the returned scores are no longer multiplied by 100

* Add kwargs to Morphologizer.evaluate

* Create generalized scoring methods in Scorer

* Generalized static scoring methods are added to `Scorer`
  * Methods require an attribute (either on Token or Doc) that is
used to key the returned scores

Naming differences:

* `uas`, `las`, and `las_per_type` in the scores dict are renamed to
`dep_uas`, `dep_las`, and `dep_las_per_type`

Scoring differences:

* `Doc.sents` is now scored as spans rather than on sentence-initial
token positions so that `Doc.sents` and `Doc.ents` can be scored with
the same method (this lowers scores since a single incorrect sentence
start results in two incorrect spans)

* Simplify / extend hasattr check for eval method

* Add hasattr check to tokenizer scoring
* Simplify to hasattr check for component scoring

* Reset Example alignment if docs are set

Reset the Example alignment if either doc is set in case the
tokenization has changed.

* Add PRF tokenization scoring for tokens as spans

Add PRF scores for tokens as character spans. The scores are:

* token_acc: # correct tokens / # gold tokens
* token_p/r/f: PRF for (token.idx, token.idx + len(token))

* Add docstring to Scorer.score_tokenization

* Rename component.evaluate() to component.score()

* Update Scorer API docs

* Update scoring for positive_label in textcat

* Fix TextCategorizer.score kwargs

* Update Language.evaluate docs

* Update score names in default config
											
										
										
											2020-07-25 13:53:02 +03:00
+								        """Returns accuracy and PRF scores for tokenization.
 								        * token_acc: # correct tokens / # gold tokens
 								        * token_p/r/f: PRF for token character spans
-												* Add scorer script

											
										
										
											2015-03-11 04:07:03 +03:00
-												Refactor the Scorer to improve flexibility (#5731)

* Refactor the Scorer to improve flexibility

Refactor the `Scorer` to improve flexibility for arbitrary pipeline
components.

* Individual pipeline components provide their own `evaluate` methods
that score a list of `Example`s and return a dictionary of scores
* `Scorer` is initialized either:
  * with a provided pipeline containing components to be scored
  * with a default pipeline containing the built-in statistical
    components (senter, tagger, morphologizer, parser, ner)
* `Scorer.score` evaluates a list of `Example`s and returns a dictionary
of scores referring to the scores provided by the components in the
pipeline

Significant differences:

* `tags_acc` is renamed to `tag_acc` to be consistent with `token_acc`
and the new `morph_acc`, `pos_acc`, and `lemma_acc`
* Scoring is no longer cumulative: `Scorer.score` scores a list of
examples rather than a single example and does not retain any state
about previously scored examples
* PRF values in the returned scores are no longer multiplied by 100

* Add kwargs to Morphologizer.evaluate

* Create generalized scoring methods in Scorer

* Generalized static scoring methods are added to `Scorer`
  * Methods require an attribute (either on Token or Doc) that is
used to key the returned scores

Naming differences:

* `uas`, `las`, and `las_per_type` in the scores dict are renamed to
`dep_uas`, `dep_las`, and `dep_las_per_type`

Scoring differences:

* `Doc.sents` is now scored as spans rather than on sentence-initial
token positions so that `Doc.sents` and `Doc.ents` can be scored with
the same method (this lowers scores since a single incorrect sentence
start results in two incorrect spans)

* Simplify / extend hasattr check for eval method

* Add hasattr check to tokenizer scoring
* Simplify to hasattr check for component scoring

* Reset Example alignment if docs are set

Reset the Example alignment if either doc is set in case the
tokenization has changed.

* Add PRF tokenization scoring for tokens as spans

Add PRF scores for tokens as character spans. The scores are:

* token_acc: # correct tokens / # gold tokens
* token_p/r/f: PRF for (token.idx, token.idx + len(token))

* Add docstring to Scorer.score_tokenization

* Rename component.evaluate() to component.score()

* Update Scorer API docs

* Update scoring for positive_label in textcat

* Fix TextCategorizer.score kwargs

* Update Language.evaluate docs

* Update score names in default config
											
										
										
											2020-07-25 13:53:02 +03:00
+								        examples (Iterable[Example]): Examples to score
-												Small adjustments to Scorer and docs

											
										
										
											2020-07-28 22:39:42 +03:00
+								        RETURNS (Dict[str, float]): A dictionary containing the scores
 								            token_acc/p/r/f.
-												Update docs links in codebase

											
										
										
											2020-09-04 13:58:50 +03:00
+								        DOCS: https://nightly.spacy.io/api/scorer#score_tokenization
-												Add LAS per dependency to Scorer (#4560)


											
										
										
											2019-10-31 23:18:16 +03:00
+								        """
-												Refactor the Scorer to improve flexibility (#5731)

* Refactor the Scorer to improve flexibility

Refactor the `Scorer` to improve flexibility for arbitrary pipeline
components.

* Individual pipeline components provide their own `evaluate` methods
that score a list of `Example`s and return a dictionary of scores
* `Scorer` is initialized either:
  * with a provided pipeline containing components to be scored
  * with a default pipeline containing the built-in statistical
    components (senter, tagger, morphologizer, parser, ner)
* `Scorer.score` evaluates a list of `Example`s and returns a dictionary
of scores referring to the scores provided by the components in the
pipeline

Significant differences:

* `tags_acc` is renamed to `tag_acc` to be consistent with `token_acc`
and the new `morph_acc`, `pos_acc`, and `lemma_acc`
* Scoring is no longer cumulative: `Scorer.score` scores a list of
examples rather than a single example and does not retain any state
about previously scored examples
* PRF values in the returned scores are no longer multiplied by 100

* Add kwargs to Morphologizer.evaluate

* Create generalized scoring methods in Scorer

* Generalized static scoring methods are added to `Scorer`
  * Methods require an attribute (either on Token or Doc) that is
used to key the returned scores

Naming differences:

* `uas`, `las`, and `las_per_type` in the scores dict are renamed to
`dep_uas`, `dep_las`, and `dep_las_per_type`

Scoring differences:

* `Doc.sents` is now scored as spans rather than on sentence-initial
token positions so that `Doc.sents` and `Doc.ents` can be scored with
the same method (this lowers scores since a single incorrect sentence
start results in two incorrect spans)

* Simplify / extend hasattr check for eval method

* Add hasattr check to tokenizer scoring
* Simplify to hasattr check for component scoring

* Reset Example alignment if docs are set

Reset the Example alignment if either doc is set in case the
tokenization has changed.

* Add PRF tokenization scoring for tokens as spans

Add PRF scores for tokens as character spans. The scores are:

* token_acc: # correct tokens / # gold tokens
* token_p/r/f: PRF for (token.idx, token.idx + len(token))

* Add docstring to Scorer.score_tokenization

* Rename component.evaluate() to component.score()

* Update Scorer API docs

* Update scoring for positive_label in textcat

* Fix TextCategorizer.score kwargs

* Update Language.evaluate docs

* Update score names in default config
											
										
										
											2020-07-25 13:53:02 +03:00
+								        acc_score = PRFScore()
 								        prf_score = PRFScore()
 								        for example in examples:
 								            gold_doc = example.reference
 								            pred_doc = example.predicted
 								            align = example.alignment
 								            gold_spans = set()
 								            pred_spans = set()
 								            for token in gold_doc:
 								                if token.orth_.isspace():
 								                    continue
 								                gold_spans.add((token.idx, token.idx + len(token)))
 								            for token in pred_doc:
 								                if token.orth_.isspace():
 								                    continue
 								                pred_spans.add((token.idx, token.idx + len(token)))
 								                if align.x2y.lengths[token.i] != 1:
 								                    acc_score.fp += 1
 								                else:
 								                    acc_score.tp += 1
 								            prf_score.score_set(pred_spans, gold_spans)
-												Add LAS per dependency to Scorer (#4560)


											
										
										
											2019-10-31 23:18:16 +03:00
+								        return {
-												Update scorer

											
										
										
											2020-08-25 03:42:47 +03:00
+								            "token_acc": acc_score.fscore,
 								            "token_p": prf_score.precision,
 								            "token_r": prf_score.recall,
 								            "token_f": prf_score.fscore,
-												Add LAS per dependency to Scorer (#4560)


											
										
										
											2019-10-31 23:18:16 +03:00
+								        }
-												Refactor the Scorer to improve flexibility (#5731)

* Refactor the Scorer to improve flexibility

Refactor the `Scorer` to improve flexibility for arbitrary pipeline
components.

* Individual pipeline components provide their own `evaluate` methods
that score a list of `Example`s and return a dictionary of scores
* `Scorer` is initialized either:
  * with a provided pipeline containing components to be scored
  * with a default pipeline containing the built-in statistical
    components (senter, tagger, morphologizer, parser, ner)
* `Scorer.score` evaluates a list of `Example`s and returns a dictionary
of scores referring to the scores provided by the components in the
pipeline

Significant differences:

* `tags_acc` is renamed to `tag_acc` to be consistent with `token_acc`
and the new `morph_acc`, `pos_acc`, and `lemma_acc`
* Scoring is no longer cumulative: `Scorer.score` scores a list of
examples rather than a single example and does not retain any state
about previously scored examples
* PRF values in the returned scores are no longer multiplied by 100

* Add kwargs to Morphologizer.evaluate

* Create generalized scoring methods in Scorer

* Generalized static scoring methods are added to `Scorer`
  * Methods require an attribute (either on Token or Doc) that is
used to key the returned scores

Naming differences:

* `uas`, `las`, and `las_per_type` in the scores dict are renamed to
`dep_uas`, `dep_las`, and `dep_las_per_type`

Scoring differences:

* `Doc.sents` is now scored as spans rather than on sentence-initial
token positions so that `Doc.sents` and `Doc.ents` can be scored with
the same method (this lowers scores since a single incorrect sentence
start results in two incorrect spans)

* Simplify / extend hasattr check for eval method

* Add hasattr check to tokenizer scoring
* Simplify to hasattr check for component scoring

* Reset Example alignment if docs are set

Reset the Example alignment if either doc is set in case the
tokenization has changed.

* Add PRF tokenization scoring for tokens as spans

Add PRF scores for tokens as character spans. The scores are:

* token_acc: # correct tokens / # gold tokens
* token_p/r/f: PRF for (token.idx, token.idx + len(token))

* Add docstring to Scorer.score_tokenization

* Rename component.evaluate() to component.score()

* Update Scorer API docs

* Update scoring for positive_label in textcat

* Fix TextCategorizer.score kwargs

* Update Language.evaluate docs

* Update score names in default config
											
										
										
											2020-07-25 13:53:02 +03:00
+								    @staticmethod
-												Small adjustments to Scorer and docs

											
										
										
											2020-07-28 22:39:42 +03:00
+								    def score_token_attr(
 								        examples: Iterable[Example],
 								        attr: str,
 								        *,
 								        getter: Callable[[Token, str], Any] = getattr,
 								        **cfg,
 								    ) -> Dict[str, float]:
-												Refactor the Scorer to improve flexibility (#5731)

* Refactor the Scorer to improve flexibility

Refactor the `Scorer` to improve flexibility for arbitrary pipeline
components.

* Individual pipeline components provide their own `evaluate` methods
that score a list of `Example`s and return a dictionary of scores
* `Scorer` is initialized either:
  * with a provided pipeline containing components to be scored
  * with a default pipeline containing the built-in statistical
    components (senter, tagger, morphologizer, parser, ner)
* `Scorer.score` evaluates a list of `Example`s and returns a dictionary
of scores referring to the scores provided by the components in the
pipeline

Significant differences:

* `tags_acc` is renamed to `tag_acc` to be consistent with `token_acc`
and the new `morph_acc`, `pos_acc`, and `lemma_acc`
* Scoring is no longer cumulative: `Scorer.score` scores a list of
examples rather than a single example and does not retain any state
about previously scored examples
* PRF values in the returned scores are no longer multiplied by 100

* Add kwargs to Morphologizer.evaluate

* Create generalized scoring methods in Scorer

* Generalized static scoring methods are added to `Scorer`
  * Methods require an attribute (either on Token or Doc) that is
used to key the returned scores

Naming differences:

* `uas`, `las`, and `las_per_type` in the scores dict are renamed to
`dep_uas`, `dep_las`, and `dep_las_per_type`

Scoring differences:

* `Doc.sents` is now scored as spans rather than on sentence-initial
token positions so that `Doc.sents` and `Doc.ents` can be scored with
the same method (this lowers scores since a single incorrect sentence
start results in two incorrect spans)

* Simplify / extend hasattr check for eval method

* Add hasattr check to tokenizer scoring
* Simplify to hasattr check for component scoring

* Reset Example alignment if docs are set

Reset the Example alignment if either doc is set in case the
tokenization has changed.

* Add PRF tokenization scoring for tokens as spans

Add PRF scores for tokens as character spans. The scores are:

* token_acc: # correct tokens / # gold tokens
* token_p/r/f: PRF for (token.idx, token.idx + len(token))

* Add docstring to Scorer.score_tokenization

* Rename component.evaluate() to component.score()

* Update Scorer API docs

* Update scoring for positive_label in textcat

* Fix TextCategorizer.score kwargs

* Update Language.evaluate docs

* Update score names in default config
											
										
										
											2020-07-25 13:53:02 +03:00
+								        """Returns an accuracy score for a token-level attribute.
 								        examples (Iterable[Example]): Examples to score
 								        attr (str): The attribute to score.
-												Small adjustments to Scorer and docs

											
										
										
											2020-07-28 22:39:42 +03:00
+								        getter (Callable[[Token, str], Any]): Defaults to getattr. If provided,
-												Refactor the Scorer to improve flexibility (#5731)

* Refactor the Scorer to improve flexibility

Refactor the `Scorer` to improve flexibility for arbitrary pipeline
components.

* Individual pipeline components provide their own `evaluate` methods
that score a list of `Example`s and return a dictionary of scores
* `Scorer` is initialized either:
  * with a provided pipeline containing components to be scored
  * with a default pipeline containing the built-in statistical
    components (senter, tagger, morphologizer, parser, ner)
* `Scorer.score` evaluates a list of `Example`s and returns a dictionary
of scores referring to the scores provided by the components in the
pipeline

Significant differences:

* `tags_acc` is renamed to `tag_acc` to be consistent with `token_acc`
and the new `morph_acc`, `pos_acc`, and `lemma_acc`
* Scoring is no longer cumulative: `Scorer.score` scores a list of
examples rather than a single example and does not retain any state
about previously scored examples
* PRF values in the returned scores are no longer multiplied by 100

* Add kwargs to Morphologizer.evaluate

* Create generalized scoring methods in Scorer

* Generalized static scoring methods are added to `Scorer`
  * Methods require an attribute (either on Token or Doc) that is
used to key the returned scores

Naming differences:

* `uas`, `las`, and `las_per_type` in the scores dict are renamed to
`dep_uas`, `dep_las`, and `dep_las_per_type`

Scoring differences:

* `Doc.sents` is now scored as spans rather than on sentence-initial
token positions so that `Doc.sents` and `Doc.ents` can be scored with
the same method (this lowers scores since a single incorrect sentence
start results in two incorrect spans)

* Simplify / extend hasattr check for eval method

* Add hasattr check to tokenizer scoring
* Simplify to hasattr check for component scoring

* Reset Example alignment if docs are set

Reset the Example alignment if either doc is set in case the
tokenization has changed.

* Add PRF tokenization scoring for tokens as spans

Add PRF scores for tokens as character spans. The scores are:

* token_acc: # correct tokens / # gold tokens
* token_p/r/f: PRF for (token.idx, token.idx + len(token))

* Add docstring to Scorer.score_tokenization

* Rename component.evaluate() to component.score()

* Update Scorer API docs

* Update scoring for positive_label in textcat

* Fix TextCategorizer.score kwargs

* Update Language.evaluate docs

* Update score names in default config
											
										
										
											2020-07-25 13:53:02 +03:00
+								            getter(token, attr) should return the value of the attribute for an
 								            individual token.
-												Small adjustments to Scorer and docs

											
										
										
											2020-07-28 22:39:42 +03:00
+								        RETURNS (Dict[str, float]): A dictionary containing the accuracy score
 								            under the key attr_acc.
-												Update docs links in codebase

											
										
										
											2020-09-04 13:58:50 +03:00
+								        DOCS: https://nightly.spacy.io/api/scorer#score_token_attr
-												Update Scorer.ents_per_type

											
										
										
											2019-07-10 12:19:28 +03:00
+								        """
-												Refactor the Scorer to improve flexibility (#5731)

* Refactor the Scorer to improve flexibility

Refactor the `Scorer` to improve flexibility for arbitrary pipeline
components.

* Individual pipeline components provide their own `evaluate` methods
that score a list of `Example`s and return a dictionary of scores
* `Scorer` is initialized either:
  * with a provided pipeline containing components to be scored
  * with a default pipeline containing the built-in statistical
    components (senter, tagger, morphologizer, parser, ner)
* `Scorer.score` evaluates a list of `Example`s and returns a dictionary
of scores referring to the scores provided by the components in the
pipeline

Significant differences:

* `tags_acc` is renamed to `tag_acc` to be consistent with `token_acc`
and the new `morph_acc`, `pos_acc`, and `lemma_acc`
* Scoring is no longer cumulative: `Scorer.score` scores a list of
examples rather than a single example and does not retain any state
about previously scored examples
* PRF values in the returned scores are no longer multiplied by 100

* Add kwargs to Morphologizer.evaluate

* Create generalized scoring methods in Scorer

* Generalized static scoring methods are added to `Scorer`
  * Methods require an attribute (either on Token or Doc) that is
used to key the returned scores

Naming differences:

* `uas`, `las`, and `las_per_type` in the scores dict are renamed to
`dep_uas`, `dep_las`, and `dep_las_per_type`

Scoring differences:

* `Doc.sents` is now scored as spans rather than on sentence-initial
token positions so that `Doc.sents` and `Doc.ents` can be scored with
the same method (this lowers scores since a single incorrect sentence
start results in two incorrect spans)

* Simplify / extend hasattr check for eval method

* Add hasattr check to tokenizer scoring
* Simplify to hasattr check for component scoring

* Reset Example alignment if docs are set

Reset the Example alignment if either doc is set in case the
tokenization has changed.

* Add PRF tokenization scoring for tokens as spans

Add PRF scores for tokens as character spans. The scores are:

* token_acc: # correct tokens / # gold tokens
* token_p/r/f: PRF for (token.idx, token.idx + len(token))

* Add docstring to Scorer.score_tokenization

* Rename component.evaluate() to component.score()

* Update Scorer API docs

* Update scoring for positive_label in textcat

* Fix TextCategorizer.score kwargs

* Update Language.evaluate docs

* Update score names in default config
											
										
										
											2020-07-25 13:53:02 +03:00
+								        tag_score = PRFScore()
 								        for example in examples:
 								            gold_doc = example.reference
 								            pred_doc = example.predicted
 								            align = example.alignment
 								            gold_tags = set()
 								            for gold_i, token in enumerate(gold_doc):
 								                gold_tags.add((gold_i, getter(token, attr)))
 								            pred_tags = set()
 								            for token in pred_doc:
 								                if token.orth_.isspace():
 								                    continue
 								                if align.x2y.lengths[token.i] == 1:
 								                    gold_i = align.x2y[token.i].dataXd[0, 0]
 								                    pred_tags.add((gold_i, getter(token, attr)))
 								            tag_score.score_set(pred_tags, gold_tags)
-												Small adjustments to Scorer and docs

											
										
										
											2020-07-28 22:39:42 +03:00
+								        return {f"{attr}_acc": tag_score.fscore}
-												Update Scorer.ents_per_type

											
										
										
											2019-07-10 12:19:28 +03:00
-												Refactor the Scorer to improve flexibility (#5731)

* Refactor the Scorer to improve flexibility

Refactor the `Scorer` to improve flexibility for arbitrary pipeline
components.

* Individual pipeline components provide their own `evaluate` methods
that score a list of `Example`s and return a dictionary of scores
* `Scorer` is initialized either:
  * with a provided pipeline containing components to be scored
  * with a default pipeline containing the built-in statistical
    components (senter, tagger, morphologizer, parser, ner)
* `Scorer.score` evaluates a list of `Example`s and returns a dictionary
of scores referring to the scores provided by the components in the
pipeline

Significant differences:

* `tags_acc` is renamed to `tag_acc` to be consistent with `token_acc`
and the new `morph_acc`, `pos_acc`, and `lemma_acc`
* Scoring is no longer cumulative: `Scorer.score` scores a list of
examples rather than a single example and does not retain any state
about previously scored examples
* PRF values in the returned scores are no longer multiplied by 100

* Add kwargs to Morphologizer.evaluate

* Create generalized scoring methods in Scorer

* Generalized static scoring methods are added to `Scorer`
  * Methods require an attribute (either on Token or Doc) that is
used to key the returned scores

Naming differences:

* `uas`, `las`, and `las_per_type` in the scores dict are renamed to
`dep_uas`, `dep_las`, and `dep_las_per_type`

Scoring differences:

* `Doc.sents` is now scored as spans rather than on sentence-initial
token positions so that `Doc.sents` and `Doc.ents` can be scored with
the same method (this lowers scores since a single incorrect sentence
start results in two incorrect spans)

* Simplify / extend hasattr check for eval method

* Add hasattr check to tokenizer scoring
* Simplify to hasattr check for component scoring

* Reset Example alignment if docs are set

Reset the Example alignment if either doc is set in case the
tokenization has changed.

* Add PRF tokenization scoring for tokens as spans

Add PRF scores for tokens as character spans. The scores are:

* token_acc: # correct tokens / # gold tokens
* token_p/r/f: PRF for (token.idx, token.idx + len(token))

* Add docstring to Scorer.score_tokenization

* Rename component.evaluate() to component.score()

* Update Scorer API docs

* Update scoring for positive_label in textcat

* Fix TextCategorizer.score kwargs

* Update Language.evaluate docs

* Update score names in default config
											
										
										
											2020-07-25 13:53:02 +03:00
+								    @staticmethod
-												Small adjustments to Scorer and docs

											
										
										
											2020-07-28 22:39:42 +03:00
+								    def score_token_attr_per_feat(
 								        examples: Iterable[Example],
 								        attr: str,
 								        *,
 								        getter: Callable[[Token, str], Any] = getattr,
 								        **cfg,
 								    ):
-												Refactor the Scorer to improve flexibility (#5731)

* Refactor the Scorer to improve flexibility

Refactor the `Scorer` to improve flexibility for arbitrary pipeline
components.

* Individual pipeline components provide their own `evaluate` methods
that score a list of `Example`s and return a dictionary of scores
* `Scorer` is initialized either:
  * with a provided pipeline containing components to be scored
  * with a default pipeline containing the built-in statistical
    components (senter, tagger, morphologizer, parser, ner)
* `Scorer.score` evaluates a list of `Example`s and returns a dictionary
of scores referring to the scores provided by the components in the
pipeline

Significant differences:

* `tags_acc` is renamed to `tag_acc` to be consistent with `token_acc`
and the new `morph_acc`, `pos_acc`, and `lemma_acc`
* Scoring is no longer cumulative: `Scorer.score` scores a list of
examples rather than a single example and does not retain any state
about previously scored examples
* PRF values in the returned scores are no longer multiplied by 100

* Add kwargs to Morphologizer.evaluate

* Create generalized scoring methods in Scorer

* Generalized static scoring methods are added to `Scorer`
  * Methods require an attribute (either on Token or Doc) that is
used to key the returned scores

Naming differences:

* `uas`, `las`, and `las_per_type` in the scores dict are renamed to
`dep_uas`, `dep_las`, and `dep_las_per_type`

Scoring differences:

* `Doc.sents` is now scored as spans rather than on sentence-initial
token positions so that `Doc.sents` and `Doc.ents` can be scored with
the same method (this lowers scores since a single incorrect sentence
start results in two incorrect spans)

* Simplify / extend hasattr check for eval method

* Add hasattr check to tokenizer scoring
* Simplify to hasattr check for component scoring

* Reset Example alignment if docs are set

Reset the Example alignment if either doc is set in case the
tokenization has changed.

* Add PRF tokenization scoring for tokens as spans

Add PRF scores for tokens as character spans. The scores are:

* token_acc: # correct tokens / # gold tokens
* token_p/r/f: PRF for (token.idx, token.idx + len(token))

* Add docstring to Scorer.score_tokenization

* Rename component.evaluate() to component.score()

* Update Scorer API docs

* Update scoring for positive_label in textcat

* Fix TextCategorizer.score kwargs

* Update Language.evaluate docs

* Update score names in default config
											
										
										
											2020-07-25 13:53:02 +03:00
+								        """Return PRF scores per feat for a token attribute in UFEATS format.
 								        examples (Iterable[Example]): Examples to score
 								        attr (str): The attribute to score.
-												Small adjustments to Scorer and docs

											
										
										
											2020-07-28 22:39:42 +03:00
+								        getter (Callable[[Token, str], Any]): Defaults to getattr. If provided,
-												Refactor the Scorer to improve flexibility (#5731)

* Refactor the Scorer to improve flexibility

Refactor the `Scorer` to improve flexibility for arbitrary pipeline
components.

* Individual pipeline components provide their own `evaluate` methods
that score a list of `Example`s and return a dictionary of scores
* `Scorer` is initialized either:
  * with a provided pipeline containing components to be scored
  * with a default pipeline containing the built-in statistical
    components (senter, tagger, morphologizer, parser, ner)
* `Scorer.score` evaluates a list of `Example`s and returns a dictionary
of scores referring to the scores provided by the components in the
pipeline

Significant differences:

* `tags_acc` is renamed to `tag_acc` to be consistent with `token_acc`
and the new `morph_acc`, `pos_acc`, and `lemma_acc`
* Scoring is no longer cumulative: `Scorer.score` scores a list of
examples rather than a single example and does not retain any state
about previously scored examples
* PRF values in the returned scores are no longer multiplied by 100

* Add kwargs to Morphologizer.evaluate

* Create generalized scoring methods in Scorer

* Generalized static scoring methods are added to `Scorer`
  * Methods require an attribute (either on Token or Doc) that is
used to key the returned scores

Naming differences:

* `uas`, `las`, and `las_per_type` in the scores dict are renamed to
`dep_uas`, `dep_las`, and `dep_las_per_type`

Scoring differences:

* `Doc.sents` is now scored as spans rather than on sentence-initial
token positions so that `Doc.sents` and `Doc.ents` can be scored with
the same method (this lowers scores since a single incorrect sentence
start results in two incorrect spans)

* Simplify / extend hasattr check for eval method

* Add hasattr check to tokenizer scoring
* Simplify to hasattr check for component scoring

* Reset Example alignment if docs are set

Reset the Example alignment if either doc is set in case the
tokenization has changed.

* Add PRF tokenization scoring for tokens as spans

Add PRF scores for tokens as character spans. The scores are:

* token_acc: # correct tokens / # gold tokens
* token_p/r/f: PRF for (token.idx, token.idx + len(token))

* Add docstring to Scorer.score_tokenization

* Rename component.evaluate() to component.score()

* Update Scorer API docs

* Update scoring for positive_label in textcat

* Fix TextCategorizer.score kwargs

* Update Language.evaluate docs

* Update score names in default config
											
										
										
											2020-07-25 13:53:02 +03:00
+								            getter(token, attr) should return the value of the attribute for an
 								            individual token.
 								        RETURNS (dict): A dictionary containing the per-feat PRF scores unders
 								            the key attr_per_feat.
-												Add textcat to train CLI (#4226)

* Add doc.cats to spacy.gold at the paragraph level

Support `doc.cats` as `"cats": [{"label": string, "value": number}]` in
the spacy JSON training format at the paragraph level.

* `spacy.gold.docs_to_json()` writes `docs.cats`

* `GoldCorpus` reads in cats in each `GoldParse`

* Update instances of gold_tuples to handle cats

Update iteration over gold_tuples / gold_parses to handle addition of
cats at the paragraph level.

* Add textcat to train CLI

* Add textcat options to train CLI
* Add textcat labels in `TextCategorizer.begin_training()`
* Add textcat evaluation to `Scorer`:
  * For binary exclusive classes with provided label: F1 for label
  * For 2+ exclusive classes: F1 macro average
  * For multilabel (not exclusive): ROC AUC macro average (currently
relying on sklearn)
* Provide user info on textcat evaluation settings, potential
incompatibilities
* Provide pipeline to Scorer in `Language.evaluate` for textcat config
* Customize train CLI output to include only metrics relevant to current
pipeline
* Add textcat evaluation to evaluate CLI

* Fix handling of unset arguments and config params

Fix handling of unset arguments and model confiug parameters in Scorer
initialization.

* Temporarily add sklearn requirement

* Remove sklearn version number

* Improve Scorer handling of models without textcats

* Fixing Scorer handling of models without textcats

* Update Scorer output for python 2.7

* Modify inf in Scorer for python 2.7

* Auto-format

Also make small adjustments to make auto-formatting with black easier and produce nicer results

* Move error message to Errors

* Update documentation

* Add cats to annotation JSON format [ci skip]

* Fix tpl flag and docs [ci skip]

* Switch to internal roc_auc_score

Switch to internal `roc_auc_score()` adapted from scikit-learn.

* Add AUCROCScore tests and improve errors/warnings

* Add tests for AUCROCScore and roc_auc_score
* Add missing error for only positive/negative values
* Remove unnecessary warnings and errors

* Make reduced roc_auc_score functions private

Because most of the checks and warnings have been stripped for the
internal functions and access is only intended through `ROCAUCScore`,
make the functions for roc_auc_score adapted from scikit-learn private.

* Check that data corresponds with multilabel flag

Check that the training instances correspond with the multilabel flag,
adding the multilabel flag if required.

* Add textcat score to early stopping check

* Add more checks to debug-data for textcat

* Add example training data for textcat

* Add more checks to textcat train CLI

* Check configuration when extending base model
* Fix typos

* Update textcat example data

* Provide licensing details and licenses for data
* Remove two labels with no positive instances from jigsaw-toxic-comment
data.


Co-authored-by: Ines Montani <ines@ines.io>
											
										
										
											2019-09-15 23:31:31 +03:00
+								        """
-												Refactor the Scorer to improve flexibility (#5731)

* Refactor the Scorer to improve flexibility

Refactor the `Scorer` to improve flexibility for arbitrary pipeline
components.

* Individual pipeline components provide their own `evaluate` methods
that score a list of `Example`s and return a dictionary of scores
* `Scorer` is initialized either:
  * with a provided pipeline containing components to be scored
  * with a default pipeline containing the built-in statistical
    components (senter, tagger, morphologizer, parser, ner)
* `Scorer.score` evaluates a list of `Example`s and returns a dictionary
of scores referring to the scores provided by the components in the
pipeline

Significant differences:

* `tags_acc` is renamed to `tag_acc` to be consistent with `token_acc`
and the new `morph_acc`, `pos_acc`, and `lemma_acc`
* Scoring is no longer cumulative: `Scorer.score` scores a list of
examples rather than a single example and does not retain any state
about previously scored examples
* PRF values in the returned scores are no longer multiplied by 100

* Add kwargs to Morphologizer.evaluate

* Create generalized scoring methods in Scorer

* Generalized static scoring methods are added to `Scorer`
  * Methods require an attribute (either on Token or Doc) that is
used to key the returned scores

Naming differences:

* `uas`, `las`, and `las_per_type` in the scores dict are renamed to
`dep_uas`, `dep_las`, and `dep_las_per_type`

Scoring differences:

* `Doc.sents` is now scored as spans rather than on sentence-initial
token positions so that `Doc.sents` and `Doc.ents` can be scored with
the same method (this lowers scores since a single incorrect sentence
start results in two incorrect spans)

* Simplify / extend hasattr check for eval method

* Add hasattr check to tokenizer scoring
* Simplify to hasattr check for component scoring

* Reset Example alignment if docs are set

Reset the Example alignment if either doc is set in case the
tokenization has changed.

* Add PRF tokenization scoring for tokens as spans

Add PRF scores for tokens as character spans. The scores are:

* token_acc: # correct tokens / # gold tokens
* token_p/r/f: PRF for (token.idx, token.idx + len(token))

* Add docstring to Scorer.score_tokenization

* Rename component.evaluate() to component.score()

* Update Scorer API docs

* Update scoring for positive_label in textcat

* Fix TextCategorizer.score kwargs

* Update Language.evaluate docs

* Update score names in default config
											
										
										
											2020-07-25 13:53:02 +03:00
+								        per_feat = {}
 								        for example in examples:
 								            pred_doc = example.predicted
 								            gold_doc = example.reference
 								            align = example.alignment
 								            gold_per_feat = {}
 								            for gold_i, token in enumerate(gold_doc):
 								                morph = str(getter(token, attr))
 								                if morph:
 								                    for feat in morph.split(Morphology.FEATURE_SEP):
 								                        field, values = feat.split(Morphology.FIELD_SEP)
 								                        if field not in per_feat:
 								                            per_feat[field] = PRFScore()
 								                        if field not in gold_per_feat:
 								                            gold_per_feat[field] = set()
 								                        gold_per_feat[field].add((gold_i, feat))
 								            pred_per_feat = {}
 								            for token in pred_doc:
 								                if token.orth_.isspace():
 								                    continue
 								                if align.x2y.lengths[token.i] == 1:
 								                    gold_i = align.x2y[token.i].dataXd[0, 0]
 								                    morph = str(getter(token, attr))
 								                    if morph:
 								                        for feat in morph.split("|"):
 								                            field, values = feat.split("=")
 								                            if field not in per_feat:
 								                                per_feat[field] = PRFScore()
 								                            if field not in pred_per_feat:
 								                                pred_per_feat[field] = set()
 								                            pred_per_feat[field].add((gold_i, feat))
 								            for field in per_feat:
 								                per_feat[field].score_set(
 								                    pred_per_feat.get(field, set()), gold_per_feat.get(field, set()),
 								                )
-												Fix return values for per feat score (#5885)

* Fix return values for per feat score

Convert `PRFScore` to dict as other per type scores.

* Update tests accordingly
											
										
										
											2020-08-06 16:14:47 +03:00
+								        result = {k: v.to_dict() for k, v in per_feat.items()}
 								        return {f"{attr}_per_feat": result}
-												train is from-config by default (#5575)

* verbose and tag_map options

* adding init_tok2vec option and only changing the tok2vec that is specified

* adding omit_extra_lookups and verifying textcat config

* wip

* pretrain bugfix

* add replace and resume options

* train_textcat fix

* raw text functionality

* improve UX when KeyError or when input data can't be parsed

* avoid unnecessary access to goldparse in TextCat pipe

* save performance information in nlp.meta

* add noise_level to config

* move nn_parser's defaults to config file

* multitask in config - doesn't work yet

* scorer offering both F and AUC options, need to be specified in config

* add textcat verification code from old train script

* small fixes to config files

* clean up

* set default config for ner/parser to allow create_pipe to work as before

* two more test fixes

* small fixes

* cleanup

* fix NER pickling + additional unit test

* create_pipe as before
											
										
										
											2020-06-12 03:02:07 +03:00
-												Refactor the Scorer to improve flexibility (#5731)

* Refactor the Scorer to improve flexibility

Refactor the `Scorer` to improve flexibility for arbitrary pipeline
components.

* Individual pipeline components provide their own `evaluate` methods
that score a list of `Example`s and return a dictionary of scores
* `Scorer` is initialized either:
  * with a provided pipeline containing components to be scored
  * with a default pipeline containing the built-in statistical
    components (senter, tagger, morphologizer, parser, ner)
* `Scorer.score` evaluates a list of `Example`s and returns a dictionary
of scores referring to the scores provided by the components in the
pipeline

Significant differences:

* `tags_acc` is renamed to `tag_acc` to be consistent with `token_acc`
and the new `morph_acc`, `pos_acc`, and `lemma_acc`
* Scoring is no longer cumulative: `Scorer.score` scores a list of
examples rather than a single example and does not retain any state
about previously scored examples
* PRF values in the returned scores are no longer multiplied by 100

* Add kwargs to Morphologizer.evaluate

* Create generalized scoring methods in Scorer

* Generalized static scoring methods are added to `Scorer`
  * Methods require an attribute (either on Token or Doc) that is
used to key the returned scores

Naming differences:

* `uas`, `las`, and `las_per_type` in the scores dict are renamed to
`dep_uas`, `dep_las`, and `dep_las_per_type`

Scoring differences:

* `Doc.sents` is now scored as spans rather than on sentence-initial
token positions so that `Doc.sents` and `Doc.ents` can be scored with
the same method (this lowers scores since a single incorrect sentence
start results in two incorrect spans)

* Simplify / extend hasattr check for eval method

* Add hasattr check to tokenizer scoring
* Simplify to hasattr check for component scoring

* Reset Example alignment if docs are set

Reset the Example alignment if either doc is set in case the
tokenization has changed.

* Add PRF tokenization scoring for tokens as spans

Add PRF scores for tokens as character spans. The scores are:

* token_acc: # correct tokens / # gold tokens
* token_p/r/f: PRF for (token.idx, token.idx + len(token))

* Add docstring to Scorer.score_tokenization

* Rename component.evaluate() to component.score()

* Update Scorer API docs

* Update scoring for positive_label in textcat

* Fix TextCategorizer.score kwargs

* Update Language.evaluate docs

* Update score names in default config
											
										
										
											2020-07-25 13:53:02 +03:00
+								    @staticmethod
-												Small adjustments to Scorer and docs

											
										
										
											2020-07-28 22:39:42 +03:00
+								    def score_spans(
 								        examples: Iterable[Example],
 								        attr: str,
 								        *,
-												Update docs, types and API consistency

											
										
										
											2020-08-17 17:45:24 +03:00
+								        getter: Callable[[Doc, str], Iterable[Span]] = getattr,
-												Small adjustments to Scorer and docs

											
										
										
											2020-07-28 22:39:42 +03:00
+								        **cfg,
 								    ) -> Dict[str, Any]:
-												Refactor the Scorer to improve flexibility (#5731)

* Refactor the Scorer to improve flexibility

Refactor the `Scorer` to improve flexibility for arbitrary pipeline
components.

* Individual pipeline components provide their own `evaluate` methods
that score a list of `Example`s and return a dictionary of scores
* `Scorer` is initialized either:
  * with a provided pipeline containing components to be scored
  * with a default pipeline containing the built-in statistical
    components (senter, tagger, morphologizer, parser, ner)
* `Scorer.score` evaluates a list of `Example`s and returns a dictionary
of scores referring to the scores provided by the components in the
pipeline

Significant differences:

* `tags_acc` is renamed to `tag_acc` to be consistent with `token_acc`
and the new `morph_acc`, `pos_acc`, and `lemma_acc`
* Scoring is no longer cumulative: `Scorer.score` scores a list of
examples rather than a single example and does not retain any state
about previously scored examples
* PRF values in the returned scores are no longer multiplied by 100

* Add kwargs to Morphologizer.evaluate

* Create generalized scoring methods in Scorer

* Generalized static scoring methods are added to `Scorer`
  * Methods require an attribute (either on Token or Doc) that is
used to key the returned scores

Naming differences:

* `uas`, `las`, and `las_per_type` in the scores dict are renamed to
`dep_uas`, `dep_las`, and `dep_las_per_type`

Scoring differences:

* `Doc.sents` is now scored as spans rather than on sentence-initial
token positions so that `Doc.sents` and `Doc.ents` can be scored with
the same method (this lowers scores since a single incorrect sentence
start results in two incorrect spans)

* Simplify / extend hasattr check for eval method

* Add hasattr check to tokenizer scoring
* Simplify to hasattr check for component scoring

* Reset Example alignment if docs are set

Reset the Example alignment if either doc is set in case the
tokenization has changed.

* Add PRF tokenization scoring for tokens as spans

Add PRF scores for tokens as character spans. The scores are:

* token_acc: # correct tokens / # gold tokens
* token_p/r/f: PRF for (token.idx, token.idx + len(token))

* Add docstring to Scorer.score_tokenization

* Rename component.evaluate() to component.score()

* Update Scorer API docs

* Update scoring for positive_label in textcat

* Fix TextCategorizer.score kwargs

* Update Language.evaluate docs

* Update score names in default config
											
										
										
											2020-07-25 13:53:02 +03:00
+								        """Returns PRF scores for labeled spans.
-												Add textcat to train CLI (#4226)

* Add doc.cats to spacy.gold at the paragraph level

Support `doc.cats` as `"cats": [{"label": string, "value": number}]` in
the spacy JSON training format at the paragraph level.

* `spacy.gold.docs_to_json()` writes `docs.cats`

* `GoldCorpus` reads in cats in each `GoldParse`

* Update instances of gold_tuples to handle cats

Update iteration over gold_tuples / gold_parses to handle addition of
cats at the paragraph level.

* Add textcat to train CLI

* Add textcat options to train CLI
* Add textcat labels in `TextCategorizer.begin_training()`
* Add textcat evaluation to `Scorer`:
  * For binary exclusive classes with provided label: F1 for label
  * For 2+ exclusive classes: F1 macro average
  * For multilabel (not exclusive): ROC AUC macro average (currently
relying on sklearn)
* Provide user info on textcat evaluation settings, potential
incompatibilities
* Provide pipeline to Scorer in `Language.evaluate` for textcat config
* Customize train CLI output to include only metrics relevant to current
pipeline
* Add textcat evaluation to evaluate CLI

* Fix handling of unset arguments and config params

Fix handling of unset arguments and model confiug parameters in Scorer
initialization.

* Temporarily add sklearn requirement

* Remove sklearn version number

* Improve Scorer handling of models without textcats

* Fixing Scorer handling of models without textcats

* Update Scorer output for python 2.7

* Modify inf in Scorer for python 2.7

* Auto-format

Also make small adjustments to make auto-formatting with black easier and produce nicer results

* Move error message to Errors

* Update documentation

* Add cats to annotation JSON format [ci skip]

* Fix tpl flag and docs [ci skip]

* Switch to internal roc_auc_score

Switch to internal `roc_auc_score()` adapted from scikit-learn.

* Add AUCROCScore tests and improve errors/warnings

* Add tests for AUCROCScore and roc_auc_score
* Add missing error for only positive/negative values
* Remove unnecessary warnings and errors

* Make reduced roc_auc_score functions private

Because most of the checks and warnings have been stripped for the
internal functions and access is only intended through `ROCAUCScore`,
make the functions for roc_auc_score adapted from scikit-learn private.

* Check that data corresponds with multilabel flag

Check that the training instances correspond with the multilabel flag,
adding the multilabel flag if required.

* Add textcat score to early stopping check

* Add more checks to debug-data for textcat

* Add example training data for textcat

* Add more checks to textcat train CLI

* Check configuration when extending base model
* Fix typos

* Update textcat example data

* Provide licensing details and licenses for data
* Remove two labels with no positive instances from jigsaw-toxic-comment
data.


Co-authored-by: Ines Montani <ines@ines.io>
											
										
										
											2019-09-15 23:31:31 +03:00
-												Refactor the Scorer to improve flexibility (#5731)

* Refactor the Scorer to improve flexibility

Refactor the `Scorer` to improve flexibility for arbitrary pipeline
components.

* Individual pipeline components provide their own `evaluate` methods
that score a list of `Example`s and return a dictionary of scores
* `Scorer` is initialized either:
  * with a provided pipeline containing components to be scored
  * with a default pipeline containing the built-in statistical
    components (senter, tagger, morphologizer, parser, ner)
* `Scorer.score` evaluates a list of `Example`s and returns a dictionary
of scores referring to the scores provided by the components in the
pipeline

Significant differences:

* `tags_acc` is renamed to `tag_acc` to be consistent with `token_acc`
and the new `morph_acc`, `pos_acc`, and `lemma_acc`
* Scoring is no longer cumulative: `Scorer.score` scores a list of
examples rather than a single example and does not retain any state
about previously scored examples
* PRF values in the returned scores are no longer multiplied by 100

* Add kwargs to Morphologizer.evaluate

* Create generalized scoring methods in Scorer

* Generalized static scoring methods are added to `Scorer`
  * Methods require an attribute (either on Token or Doc) that is
used to key the returned scores

Naming differences:

* `uas`, `las`, and `las_per_type` in the scores dict are renamed to
`dep_uas`, `dep_las`, and `dep_las_per_type`

Scoring differences:

* `Doc.sents` is now scored as spans rather than on sentence-initial
token positions so that `Doc.sents` and `Doc.ents` can be scored with
the same method (this lowers scores since a single incorrect sentence
start results in two incorrect spans)

* Simplify / extend hasattr check for eval method

* Add hasattr check to tokenizer scoring
* Simplify to hasattr check for component scoring

* Reset Example alignment if docs are set

Reset the Example alignment if either doc is set in case the
tokenization has changed.

* Add PRF tokenization scoring for tokens as spans

Add PRF scores for tokens as character spans. The scores are:

* token_acc: # correct tokens / # gold tokens
* token_p/r/f: PRF for (token.idx, token.idx + len(token))

* Add docstring to Scorer.score_tokenization

* Rename component.evaluate() to component.score()

* Update Scorer API docs

* Update scoring for positive_label in textcat

* Fix TextCategorizer.score kwargs

* Update Language.evaluate docs

* Update score names in default config
											
										
										
											2020-07-25 13:53:02 +03:00
+								        examples (Iterable[Example]): Examples to score
 								        attr (str): The attribute to score.
-												Update docs, types and API consistency

											
										
										
											2020-08-17 17:45:24 +03:00
+								        getter (Callable[[Doc, str], Iterable[Span]]): Defaults to getattr. If
 								            provided, getter(doc, attr) should return the spans for the
 								            individual doc.
-												Small adjustments to Scorer and docs

											
										
										
											2020-07-28 22:39:42 +03:00
+								        RETURNS (Dict[str, Any]): A dictionary containing the PRF scores under
 								            the keys attr_p/r/f and the per-type PRF scores under attr_per_type.
-												Update docs links in codebase

											
										
										
											2020-09-04 13:58:50 +03:00
+								        DOCS: https://nightly.spacy.io/api/scorer#score_spans
-												Update Scorer and add API docs

											
										
										
											2019-05-24 15:06:04 +03:00
+								        """
-												Refactor the Scorer to improve flexibility (#5731)

* Refactor the Scorer to improve flexibility

Refactor the `Scorer` to improve flexibility for arbitrary pipeline
components.

* Individual pipeline components provide their own `evaluate` methods
that score a list of `Example`s and return a dictionary of scores
* `Scorer` is initialized either:
  * with a provided pipeline containing components to be scored
  * with a default pipeline containing the built-in statistical
    components (senter, tagger, morphologizer, parser, ner)
* `Scorer.score` evaluates a list of `Example`s and returns a dictionary
of scores referring to the scores provided by the components in the
pipeline

Significant differences:

* `tags_acc` is renamed to `tag_acc` to be consistent with `token_acc`
and the new `morph_acc`, `pos_acc`, and `lemma_acc`
* Scoring is no longer cumulative: `Scorer.score` scores a list of
examples rather than a single example and does not retain any state
about previously scored examples
* PRF values in the returned scores are no longer multiplied by 100

* Add kwargs to Morphologizer.evaluate

* Create generalized scoring methods in Scorer

* Generalized static scoring methods are added to `Scorer`
  * Methods require an attribute (either on Token or Doc) that is
used to key the returned scores

Naming differences:

* `uas`, `las`, and `las_per_type` in the scores dict are renamed to
`dep_uas`, `dep_las`, and `dep_las_per_type`

Scoring differences:

* `Doc.sents` is now scored as spans rather than on sentence-initial
token positions so that `Doc.sents` and `Doc.ents` can be scored with
the same method (this lowers scores since a single incorrect sentence
start results in two incorrect spans)

* Simplify / extend hasattr check for eval method

* Add hasattr check to tokenizer scoring
* Simplify to hasattr check for component scoring

* Reset Example alignment if docs are set

Reset the Example alignment if either doc is set in case the
tokenization has changed.

* Add PRF tokenization scoring for tokens as spans

Add PRF scores for tokens as character spans. The scores are:

* token_acc: # correct tokens / # gold tokens
* token_p/r/f: PRF for (token.idx, token.idx + len(token))

* Add docstring to Scorer.score_tokenization

* Rename component.evaluate() to component.score()

* Update Scorer API docs

* Update scoring for positive_label in textcat

* Fix TextCategorizer.score kwargs

* Update Language.evaluate docs

* Update score names in default config
											
										
										
											2020-07-25 13:53:02 +03:00
+								        score = PRFScore()
 								        score_per_type = dict()
 								        for example in examples:
 								            pred_doc = example.predicted
 								            gold_doc = example.reference
 								            # Find all labels in gold and doc
 								            labels = set(
 								                [k.label_ for k in getter(gold_doc, attr)]
 								                + [k.label_ for k in getter(pred_doc, attr)]
 								            )
 								            # Set up all labels for per type scoring and prepare gold per type
 								            gold_per_type = {label: set() for label in labels}
 								            for label in labels:
 								                if label not in score_per_type:
 								                    score_per_type[label] = PRFScore()
 								            # Find all predidate labels, for all and per type
 								            gold_spans = set()
 								            pred_spans = set()
 								            # Special case for ents:
 								            # If we have missing values in the gold, we can't easily tell
 								            # whether our NER predictions are true.
 								            # It seems bad but it's what we've always done.
 								            if attr == "ents" and not all(token.ent_iob != 0 for token in gold_doc):
 								                continue
 								            for span in getter(gold_doc, attr):
 								                gold_span = (span.label_, span.start, span.end - 1)
 								                gold_spans.add(gold_span)
 								                gold_per_type[span.label_].add((span.label_, span.start, span.end - 1))
 								            pred_per_type = {label: set() for label in labels}
 								            for span in example.get_aligned_spans_x2y(getter(pred_doc, attr)):
 								                pred_spans.add((span.label_, span.start, span.end - 1))
 								                pred_per_type[span.label_].add((span.label_, span.start, span.end - 1))
 								            # Scores per label
 								            for k, v in score_per_type.items():
 								                if k in pred_per_type:
 								                    v.score_set(pred_per_type[k], gold_per_type[k])
 								            # Score for all labels
 								            score.score_set(pred_spans, gold_spans)
 								        results = {
-												Small adjustments to Scorer and docs

											
										
										
											2020-07-28 22:39:42 +03:00
+								            f"{attr}_p": score.precision,
 								            f"{attr}_r": score.recall,
 								            f"{attr}_f": score.fscore,
 								            f"{attr}_per_type": {k: v.to_dict() for k, v in score_per_type.items()},
-												Refactor the Scorer to improve flexibility (#5731)

* Refactor the Scorer to improve flexibility

Refactor the `Scorer` to improve flexibility for arbitrary pipeline
components.

* Individual pipeline components provide their own `evaluate` methods
that score a list of `Example`s and return a dictionary of scores
* `Scorer` is initialized either:
  * with a provided pipeline containing components to be scored
  * with a default pipeline containing the built-in statistical
    components (senter, tagger, morphologizer, parser, ner)
* `Scorer.score` evaluates a list of `Example`s and returns a dictionary
of scores referring to the scores provided by the components in the
pipeline

Significant differences:

* `tags_acc` is renamed to `tag_acc` to be consistent with `token_acc`
and the new `morph_acc`, `pos_acc`, and `lemma_acc`
* Scoring is no longer cumulative: `Scorer.score` scores a list of
examples rather than a single example and does not retain any state
about previously scored examples
* PRF values in the returned scores are no longer multiplied by 100

* Add kwargs to Morphologizer.evaluate

* Create generalized scoring methods in Scorer

* Generalized static scoring methods are added to `Scorer`
  * Methods require an attribute (either on Token or Doc) that is
used to key the returned scores

Naming differences:

* `uas`, `las`, and `las_per_type` in the scores dict are renamed to
`dep_uas`, `dep_las`, and `dep_las_per_type`

Scoring differences:

* `Doc.sents` is now scored as spans rather than on sentence-initial
token positions so that `Doc.sents` and `Doc.ents` can be scored with
the same method (this lowers scores since a single incorrect sentence
start results in two incorrect spans)

* Simplify / extend hasattr check for eval method

* Add hasattr check to tokenizer scoring
* Simplify to hasattr check for component scoring

* Reset Example alignment if docs are set

Reset the Example alignment if either doc is set in case the
tokenization has changed.

* Add PRF tokenization scoring for tokens as spans

Add PRF scores for tokens as character spans. The scores are:

* token_acc: # correct tokens / # gold tokens
* token_p/r/f: PRF for (token.idx, token.idx + len(token))

* Add docstring to Scorer.score_tokenization

* Rename component.evaluate() to component.score()

* Update Scorer API docs

* Update scoring for positive_label in textcat

* Fix TextCategorizer.score kwargs

* Update Language.evaluate docs

* Update score names in default config
											
										
										
											2020-07-25 13:53:02 +03:00
+								        }
 								        return results
 								    @staticmethod
 								    def score_cats(
-												Small adjustments to Scorer and docs

											
										
										
											2020-07-28 22:39:42 +03:00
+								        examples: Iterable[Example],
 								        attr: str,
 								        *,
-												Fix types in Scorer

											
										
										
											2020-07-29 11:39:33 +03:00
+								        getter: Callable[[Doc, str], Any] = getattr,
-												Use frozen list with custom errors

We don't want to break backwards compatibility too much but we also want to provide the best possible UX

											
										
										
											2020-08-29 16:20:11 +03:00
+								        labels: Iterable[str] = SimpleFrozenList(),
-												Small adjustments to Scorer and docs

											
										
										
											2020-07-28 22:39:42 +03:00
+								        multi_label: bool = True,
 								        positive_label: Optional[str] = None,
-												Be less choosy about reporting textcat scores (#5879)

* Set textcat scores more consistently

* Refactor textcat scores

* Fixes to scorer

* Add comments

* Add threshold

* Rename just 'f' to micro_f in textcat scorer

* Fix textcat score for two-class

* Fix syntax

* Fix textcat score

* Fix docstring
											
										
										
											2020-08-06 17:24:13 +03:00
+								        threshold: Optional[float] = None,
-												Small adjustments to Scorer and docs

											
										
										
											2020-07-28 22:39:42 +03:00
+								        **cfg,
 								    ) -> Dict[str, Any]:
-												Refactor the Scorer to improve flexibility (#5731)

* Refactor the Scorer to improve flexibility

Refactor the `Scorer` to improve flexibility for arbitrary pipeline
components.

* Individual pipeline components provide their own `evaluate` methods
that score a list of `Example`s and return a dictionary of scores
* `Scorer` is initialized either:
  * with a provided pipeline containing components to be scored
  * with a default pipeline containing the built-in statistical
    components (senter, tagger, morphologizer, parser, ner)
* `Scorer.score` evaluates a list of `Example`s and returns a dictionary
of scores referring to the scores provided by the components in the
pipeline

Significant differences:

* `tags_acc` is renamed to `tag_acc` to be consistent with `token_acc`
and the new `morph_acc`, `pos_acc`, and `lemma_acc`
* Scoring is no longer cumulative: `Scorer.score` scores a list of
examples rather than a single example and does not retain any state
about previously scored examples
* PRF values in the returned scores are no longer multiplied by 100

* Add kwargs to Morphologizer.evaluate

* Create generalized scoring methods in Scorer

* Generalized static scoring methods are added to `Scorer`
  * Methods require an attribute (either on Token or Doc) that is
used to key the returned scores

Naming differences:

* `uas`, `las`, and `las_per_type` in the scores dict are renamed to
`dep_uas`, `dep_las`, and `dep_las_per_type`

Scoring differences:

* `Doc.sents` is now scored as spans rather than on sentence-initial
token positions so that `Doc.sents` and `Doc.ents` can be scored with
the same method (this lowers scores since a single incorrect sentence
start results in two incorrect spans)

* Simplify / extend hasattr check for eval method

* Add hasattr check to tokenizer scoring
* Simplify to hasattr check for component scoring

* Reset Example alignment if docs are set

Reset the Example alignment if either doc is set in case the
tokenization has changed.

* Add PRF tokenization scoring for tokens as spans

Add PRF scores for tokens as character spans. The scores are:

* token_acc: # correct tokens / # gold tokens
* token_p/r/f: PRF for (token.idx, token.idx + len(token))

* Add docstring to Scorer.score_tokenization

* Rename component.evaluate() to component.score()

* Update Scorer API docs

* Update scoring for positive_label in textcat

* Fix TextCategorizer.score kwargs

* Update Language.evaluate docs

* Update score names in default config
											
										
										
											2020-07-25 13:53:02 +03:00
+								        """Returns PRF and ROC AUC scores for a doc-level attribute with a
-												Update cats scoring to provide overall score

* Provide top-level score as `attr_score`
* Provide a description of the score as `attr_score_desc`
* Provide all potential scores keys, setting unused keys to `None`
* Update CLI evaluate accordingly

											
										
										
											2020-07-27 12:17:52 +03:00
+								        dict with scores for each label like Doc.cats. The reported overall
 								        score depends on the scorer settings.
-												Refactor the Scorer to improve flexibility (#5731)

* Refactor the Scorer to improve flexibility

Refactor the `Scorer` to improve flexibility for arbitrary pipeline
components.

* Individual pipeline components provide their own `evaluate` methods
that score a list of `Example`s and return a dictionary of scores
* `Scorer` is initialized either:
  * with a provided pipeline containing components to be scored
  * with a default pipeline containing the built-in statistical
    components (senter, tagger, morphologizer, parser, ner)
* `Scorer.score` evaluates a list of `Example`s and returns a dictionary
of scores referring to the scores provided by the components in the
pipeline

Significant differences:

* `tags_acc` is renamed to `tag_acc` to be consistent with `token_acc`
and the new `morph_acc`, `pos_acc`, and `lemma_acc`
* Scoring is no longer cumulative: `Scorer.score` scores a list of
examples rather than a single example and does not retain any state
about previously scored examples
* PRF values in the returned scores are no longer multiplied by 100

* Add kwargs to Morphologizer.evaluate

* Create generalized scoring methods in Scorer

* Generalized static scoring methods are added to `Scorer`
  * Methods require an attribute (either on Token or Doc) that is
used to key the returned scores

Naming differences:

* `uas`, `las`, and `las_per_type` in the scores dict are renamed to
`dep_uas`, `dep_las`, and `dep_las_per_type`

Scoring differences:

* `Doc.sents` is now scored as spans rather than on sentence-initial
token positions so that `Doc.sents` and `Doc.ents` can be scored with
the same method (this lowers scores since a single incorrect sentence
start results in two incorrect spans)

* Simplify / extend hasattr check for eval method

* Add hasattr check to tokenizer scoring
* Simplify to hasattr check for component scoring

* Reset Example alignment if docs are set

Reset the Example alignment if either doc is set in case the
tokenization has changed.

* Add PRF tokenization scoring for tokens as spans

Add PRF scores for tokens as character spans. The scores are:

* token_acc: # correct tokens / # gold tokens
* token_p/r/f: PRF for (token.idx, token.idx + len(token))

* Add docstring to Scorer.score_tokenization

* Rename component.evaluate() to component.score()

* Update Scorer API docs

* Update scoring for positive_label in textcat

* Fix TextCategorizer.score kwargs

* Update Language.evaluate docs

* Update score names in default config
											
										
										
											2020-07-25 13:53:02 +03:00
 								        examples (Iterable[Example]): Examples to score
 								        attr (str): The attribute to score.
-												Fix types in Scorer

											
										
										
											2020-07-29 11:39:33 +03:00
+								        getter (Callable[[Doc, str], Any]): Defaults to getattr. If provided,
-												Refactor the Scorer to improve flexibility (#5731)

* Refactor the Scorer to improve flexibility

Refactor the `Scorer` to improve flexibility for arbitrary pipeline
components.

* Individual pipeline components provide their own `evaluate` methods
that score a list of `Example`s and return a dictionary of scores
* `Scorer` is initialized either:
  * with a provided pipeline containing components to be scored
  * with a default pipeline containing the built-in statistical
    components (senter, tagger, morphologizer, parser, ner)
* `Scorer.score` evaluates a list of `Example`s and returns a dictionary
of scores referring to the scores provided by the components in the
pipeline

Significant differences:

* `tags_acc` is renamed to `tag_acc` to be consistent with `token_acc`
and the new `morph_acc`, `pos_acc`, and `lemma_acc`
* Scoring is no longer cumulative: `Scorer.score` scores a list of
examples rather than a single example and does not retain any state
about previously scored examples
* PRF values in the returned scores are no longer multiplied by 100

* Add kwargs to Morphologizer.evaluate

* Create generalized scoring methods in Scorer

* Generalized static scoring methods are added to `Scorer`
  * Methods require an attribute (either on Token or Doc) that is
used to key the returned scores

Naming differences:

* `uas`, `las`, and `las_per_type` in the scores dict are renamed to
`dep_uas`, `dep_las`, and `dep_las_per_type`

Scoring differences:

* `Doc.sents` is now scored as spans rather than on sentence-initial
token positions so that `Doc.sents` and `Doc.ents` can be scored with
the same method (this lowers scores since a single incorrect sentence
start results in two incorrect spans)

* Simplify / extend hasattr check for eval method

* Add hasattr check to tokenizer scoring
* Simplify to hasattr check for component scoring

* Reset Example alignment if docs are set

Reset the Example alignment if either doc is set in case the
tokenization has changed.

* Add PRF tokenization scoring for tokens as spans

Add PRF scores for tokens as character spans. The scores are:

* token_acc: # correct tokens / # gold tokens
* token_p/r/f: PRF for (token.idx, token.idx + len(token))

* Add docstring to Scorer.score_tokenization

* Rename component.evaluate() to component.score()

* Update Scorer API docs

* Update scoring for positive_label in textcat

* Fix TextCategorizer.score kwargs

* Update Language.evaluate docs

* Update score names in default config
											
										
										
											2020-07-25 13:53:02 +03:00
+								            getter(doc, attr) should return the values for the individual doc.
 								        labels (Iterable[str]): The set of possible labels. Defaults to [].
 								        multi_label (bool): Whether the attribute allows multiple labels.
 								            Defaults to True.
 								        positive_label (str): The positive label for a binary task with
 								            exclusive classes. Defaults to None.
-												Be less choosy about reporting textcat scores (#5879)

* Set textcat scores more consistently

* Refactor textcat scores

* Fixes to scorer

* Add comments

* Add threshold

* Rename just 'f' to micro_f in textcat scorer

* Fix textcat score for two-class

* Fix syntax

* Fix textcat score

* Fix docstring
											
										
										
											2020-08-06 17:24:13 +03:00
+								        threshold (float): Cutoff to consider a prediction "positive". Defaults
 								            to 0.5 for multi-label, and 0.0 (i.e. whatever's highest scoring)
 								            otherwise.
-												Small adjustments to Scorer and docs

											
										
										
											2020-07-28 22:39:42 +03:00
+								        RETURNS (Dict[str, Any]): A dictionary containing the scores, with
 								            inapplicable scores as None:
-												Update cats scoring to provide overall score

* Provide top-level score as `attr_score`
* Provide a description of the score as `attr_score_desc`
* Provide all potential scores keys, setting unused keys to `None`
* Update CLI evaluate accordingly

											
										
										
											2020-07-27 12:17:52 +03:00
+								            for all:
-												Be less choosy about reporting textcat scores (#5879)

* Set textcat scores more consistently

* Refactor textcat scores

* Fixes to scorer

* Add comments

* Add threshold

* Rename just 'f' to micro_f in textcat scorer

* Fix textcat score for two-class

* Fix syntax

* Fix textcat score

* Fix docstring
											
										
										
											2020-08-06 17:24:13 +03:00
+								                attr_score (one of attr_micro_f / attr_macro_f / attr_macro_auc),
-												Update cats scoring to provide overall score

* Provide top-level score as `attr_score`
* Provide a description of the score as `attr_score_desc`
* Provide all potential scores keys, setting unused keys to `None`
* Update CLI evaluate accordingly

											
										
										
											2020-07-27 12:17:52 +03:00
+								                attr_score_desc (text description of the overall score),
-												Be less choosy about reporting textcat scores (#5879)

* Set textcat scores more consistently

* Refactor textcat scores

* Fixes to scorer

* Add comments

* Add threshold

* Rename just 'f' to micro_f in textcat scorer

* Fix textcat score for two-class

* Fix syntax

* Fix textcat score

* Fix docstring
											
										
										
											2020-08-06 17:24:13 +03:00
+								                attr_micro_f,
 								                attr_macro_f,
 								                attr_auc,
-												Update cats scoring to provide overall score

* Provide top-level score as `attr_score`
* Provide a description of the score as `attr_score_desc`
* Provide all potential scores keys, setting unused keys to `None`
* Update CLI evaluate accordingly

											
										
										
											2020-07-27 12:17:52 +03:00
+								                attr_f_per_type,
 								                attr_auc_per_type
-												Small adjustments to Scorer and docs

											
										
										
											2020-07-28 22:39:42 +03:00
-												Update docs links in codebase

											
										
										
											2020-09-04 13:58:50 +03:00
+								        DOCS: https://nightly.spacy.io/api/scorer#score_cats
-												Update Scorer and add API docs

											
										
										
											2019-05-24 15:06:04 +03:00
+								        """
-												Be less choosy about reporting textcat scores (#5879)

* Set textcat scores more consistently

* Refactor textcat scores

* Fixes to scorer

* Add comments

* Add threshold

* Rename just 'f' to micro_f in textcat scorer

* Fix textcat score for two-class

* Fix syntax

* Fix textcat score

* Fix docstring
											
										
										
											2020-08-06 17:24:13 +03:00
+								        if threshold is None:
 								            threshold = 0.5 if multi_label else 0.0
 								        f_per_type = {label: PRFScore() for label in labels}
 								        auc_per_type = {label: ROCAUCScore() for label in labels}
 								        labels = set(labels)
 								        if labels:
 								            for eg in examples:
 								                labels.update(eg.predicted.cats.keys())
 								                labels.update(eg.reference.cats.keys())
-												Refactor the Scorer to improve flexibility (#5731)

* Refactor the Scorer to improve flexibility

Refactor the `Scorer` to improve flexibility for arbitrary pipeline
components.

* Individual pipeline components provide their own `evaluate` methods
that score a list of `Example`s and return a dictionary of scores
* `Scorer` is initialized either:
  * with a provided pipeline containing components to be scored
  * with a default pipeline containing the built-in statistical
    components (senter, tagger, morphologizer, parser, ner)
* `Scorer.score` evaluates a list of `Example`s and returns a dictionary
of scores referring to the scores provided by the components in the
pipeline

Significant differences:

* `tags_acc` is renamed to `tag_acc` to be consistent with `token_acc`
and the new `morph_acc`, `pos_acc`, and `lemma_acc`
* Scoring is no longer cumulative: `Scorer.score` scores a list of
examples rather than a single example and does not retain any state
about previously scored examples
* PRF values in the returned scores are no longer multiplied by 100

* Add kwargs to Morphologizer.evaluate

* Create generalized scoring methods in Scorer

* Generalized static scoring methods are added to `Scorer`
  * Methods require an attribute (either on Token or Doc) that is
used to key the returned scores

Naming differences:

* `uas`, `las`, and `las_per_type` in the scores dict are renamed to
`dep_uas`, `dep_las`, and `dep_las_per_type`

Scoring differences:

* `Doc.sents` is now scored as spans rather than on sentence-initial
token positions so that `Doc.sents` and `Doc.ents` can be scored with
the same method (this lowers scores since a single incorrect sentence
start results in two incorrect spans)

* Simplify / extend hasattr check for eval method

* Add hasattr check to tokenizer scoring
* Simplify to hasattr check for component scoring

* Reset Example alignment if docs are set

Reset the Example alignment if either doc is set in case the
tokenization has changed.

* Add PRF tokenization scoring for tokens as spans

Add PRF scores for tokens as character spans. The scores are:

* token_acc: # correct tokens / # gold tokens
* token_p/r/f: PRF for (token.idx, token.idx + len(token))

* Add docstring to Scorer.score_tokenization

* Rename component.evaluate() to component.score()

* Update Scorer API docs

* Update scoring for positive_label in textcat

* Fix TextCategorizer.score kwargs

* Update Language.evaluate docs

* Update score names in default config
											
										
										
											2020-07-25 13:53:02 +03:00
+								        for example in examples:
-												Be less choosy about reporting textcat scores (#5879)

* Set textcat scores more consistently

* Refactor textcat scores

* Fixes to scorer

* Add comments

* Add threshold

* Rename just 'f' to micro_f in textcat scorer

* Fix textcat score for two-class

* Fix syntax

* Fix textcat score

* Fix docstring
											
										
										
											2020-08-06 17:24:13 +03:00
+								            # Through this loop, None in the gold_cats indicates missing label.
 								            pred_cats = getter(example.predicted, attr)
 								            gold_cats = getter(example.reference, attr)
 								            # I think the AUC metric is applicable regardless of whether we're
 								            # doing multi-label classification? Unsure. If not, move this into
 								            # the elif pred_cats and gold_cats block below.
 								            for label in labels:
 								                pred_score = pred_cats.get(label, 0.0)
 								                gold_score = gold_cats.get(label, 0.0)
 								                if gold_score is not None:
 								                    auc_per_type[label].score_set(pred_score, gold_score)
 								            if multi_label:
 								                for label in labels:
 								                    pred_score = pred_cats.get(label, 0.0)
 								                    gold_score = gold_cats.get(label, 0.0)
 								                    if gold_score is not None:
 								                        if pred_score >= threshold and gold_score > 0:
 								                            f_per_type[label].tp += 1
 								                        elif pred_score >= threshold and gold_score == 0:
 								                            f_per_type[label].fp += 1
 								                        elif pred_score < threshold and gold_score > 0:
 								                            f_per_type[label].fn += 1
 								            elif pred_cats and gold_cats:
 								                # Get the highest-scoring for each.
 								                pred_label, pred_score = max(pred_cats.items(), key=lambda it: it[1])
 								                gold_label, gold_score = max(gold_cats.items(), key=lambda it: it[1])
 								                if gold_score is not None:
 								                    if pred_label == gold_label and pred_score >= threshold:
 								                        f_per_type[pred_label].tp += 1
 								                    else:
 								                        f_per_type[gold_label].fn += 1
 								                        if pred_score >= threshold:
 								                            f_per_type[pred_label].fp += 1
 								            elif gold_cats:
 								                gold_label, gold_score = max(gold_cats, key=lambda it: it[1])
 								                if gold_score is not None and gold_score > 0:
 								                    f_per_type[gold_label].fn += 1
 								            else:
 								                pred_label, pred_score = max(pred_cats, key=lambda it: it[1])
 								                if pred_score >= threshold:
 								                    f_per_type[pred_label].fp += 1
 								        micro_prf = PRFScore()
 								        for label_prf in f_per_type.values():
 								            micro_prf.tp = label_prf.tp
 								            micro_prf.fn = label_prf.fn
 								            micro_prf.fp = label_prf.fp
 								        n_cats = len(f_per_type) + 1e-100
 								        macro_p = sum(prf.precision for prf in f_per_type.values()) / n_cats
 								        macro_r = sum(prf.recall for prf in f_per_type.values()) / n_cats
 								        macro_f = sum(prf.fscore for prf in f_per_type.values()) / n_cats
-												Set macro AUC score in Scorer.score_cats

											
										
										
											2020-08-26 11:49:30 +03:00
+								        macro_auc = sum(auc.score for auc in auc_per_type.values()) / n_cats
-												Refactor the Scorer to improve flexibility (#5731)

* Refactor the Scorer to improve flexibility

Refactor the `Scorer` to improve flexibility for arbitrary pipeline
components.

* Individual pipeline components provide their own `evaluate` methods
that score a list of `Example`s and return a dictionary of scores
* `Scorer` is initialized either:
  * with a provided pipeline containing components to be scored
  * with a default pipeline containing the built-in statistical
    components (senter, tagger, morphologizer, parser, ner)
* `Scorer.score` evaluates a list of `Example`s and returns a dictionary
of scores referring to the scores provided by the components in the
pipeline

Significant differences:

* `tags_acc` is renamed to `tag_acc` to be consistent with `token_acc`
and the new `morph_acc`, `pos_acc`, and `lemma_acc`
* Scoring is no longer cumulative: `Scorer.score` scores a list of
examples rather than a single example and does not retain any state
about previously scored examples
* PRF values in the returned scores are no longer multiplied by 100

* Add kwargs to Morphologizer.evaluate

* Create generalized scoring methods in Scorer

* Generalized static scoring methods are added to `Scorer`
  * Methods require an attribute (either on Token or Doc) that is
used to key the returned scores

Naming differences:

* `uas`, `las`, and `las_per_type` in the scores dict are renamed to
`dep_uas`, `dep_las`, and `dep_las_per_type`

Scoring differences:

* `Doc.sents` is now scored as spans rather than on sentence-initial
token positions so that `Doc.sents` and `Doc.ents` can be scored with
the same method (this lowers scores since a single incorrect sentence
start results in two incorrect spans)

* Simplify / extend hasattr check for eval method

* Add hasattr check to tokenizer scoring
* Simplify to hasattr check for component scoring

* Reset Example alignment if docs are set

Reset the Example alignment if either doc is set in case the
tokenization has changed.

* Add PRF tokenization scoring for tokens as spans

Add PRF scores for tokens as character spans. The scores are:

* token_acc: # correct tokens / # gold tokens
* token_p/r/f: PRF for (token.idx, token.idx + len(token))

* Add docstring to Scorer.score_tokenization

* Rename component.evaluate() to component.score()

* Update Scorer API docs

* Update scoring for positive_label in textcat

* Fix TextCategorizer.score kwargs

* Update Language.evaluate docs

* Update score names in default config
											
										
										
											2020-07-25 13:53:02 +03:00
+								        results = {
-												Small adjustments to Scorer and docs

											
										
										
											2020-07-28 22:39:42 +03:00
+								            f"{attr}_score": None,
 								            f"{attr}_score_desc": None,
-												Be less choosy about reporting textcat scores (#5879)

* Set textcat scores more consistently

* Refactor textcat scores

* Fixes to scorer

* Add comments

* Add threshold

* Rename just 'f' to micro_f in textcat scorer

* Fix textcat score for two-class

* Fix syntax

* Fix textcat score

* Fix docstring
											
										
										
											2020-08-06 17:24:13 +03:00
+								            f"{attr}_micro_p": micro_prf.precision,
 								            f"{attr}_micro_r": micro_prf.recall,
 								            f"{attr}_micro_f": micro_prf.fscore,
 								            f"{attr}_macro_p": macro_p,
 								            f"{attr}_macro_r": macro_r,
 								            f"{attr}_macro_f": macro_f,
-												Set macro AUC score in Scorer.score_cats

											
										
										
											2020-08-26 11:49:30 +03:00
+								            f"{attr}_macro_auc": macro_auc,
-												Small adjustments to Scorer and docs

											
										
										
											2020-07-28 22:39:42 +03:00
+								            f"{attr}_f_per_type": {k: v.to_dict() for k, v in f_per_type.items()},
 								            f"{attr}_auc_per_type": {k: v.score for k, v in auc_per_type.items()},
-												Refactor the Scorer to improve flexibility (#5731)

* Refactor the Scorer to improve flexibility

Refactor the `Scorer` to improve flexibility for arbitrary pipeline
components.

* Individual pipeline components provide their own `evaluate` methods
that score a list of `Example`s and return a dictionary of scores
* `Scorer` is initialized either:
  * with a provided pipeline containing components to be scored
  * with a default pipeline containing the built-in statistical
    components (senter, tagger, morphologizer, parser, ner)
* `Scorer.score` evaluates a list of `Example`s and returns a dictionary
of scores referring to the scores provided by the components in the
pipeline

Significant differences:

* `tags_acc` is renamed to `tag_acc` to be consistent with `token_acc`
and the new `morph_acc`, `pos_acc`, and `lemma_acc`
* Scoring is no longer cumulative: `Scorer.score` scores a list of
examples rather than a single example and does not retain any state
about previously scored examples
* PRF values in the returned scores are no longer multiplied by 100

* Add kwargs to Morphologizer.evaluate

* Create generalized scoring methods in Scorer

* Generalized static scoring methods are added to `Scorer`
  * Methods require an attribute (either on Token or Doc) that is
used to key the returned scores

Naming differences:

* `uas`, `las`, and `las_per_type` in the scores dict are renamed to
`dep_uas`, `dep_las`, and `dep_las_per_type`

Scoring differences:

* `Doc.sents` is now scored as spans rather than on sentence-initial
token positions so that `Doc.sents` and `Doc.ents` can be scored with
the same method (this lowers scores since a single incorrect sentence
start results in two incorrect spans)

* Simplify / extend hasattr check for eval method

* Add hasattr check to tokenizer scoring
* Simplify to hasattr check for component scoring

* Reset Example alignment if docs are set

Reset the Example alignment if either doc is set in case the
tokenization has changed.

* Add PRF tokenization scoring for tokens as spans

Add PRF scores for tokens as character spans. The scores are:

* token_acc: # correct tokens / # gold tokens
* token_p/r/f: PRF for (token.idx, token.idx + len(token))

* Add docstring to Scorer.score_tokenization

* Rename component.evaluate() to component.score()

* Update Scorer API docs

* Update scoring for positive_label in textcat

* Fix TextCategorizer.score kwargs

* Update Language.evaluate docs

* Update score names in default config
											
										
										
											2020-07-25 13:53:02 +03:00
+								        }
 								        if len(labels) == 2 and not multi_label and positive_label:
-												Tidy up and auto-format

											
										
										
											2020-08-09 23:36:23 +03:00
+								            positive_label_f = results[f"{attr}_f_per_type"][positive_label]["f"]
-												Be less choosy about reporting textcat scores (#5879)

* Set textcat scores more consistently

* Refactor textcat scores

* Fixes to scorer

* Add comments

* Add threshold

* Rename just 'f' to micro_f in textcat scorer

* Fix textcat score for two-class

* Fix syntax

* Fix textcat score

* Fix docstring
											
										
										
											2020-08-06 17:24:13 +03:00
+								            results[f"{attr}_score"] = positive_label_f
-												Small adjustments to Scorer and docs

											
										
										
											2020-07-28 22:39:42 +03:00
+								            results[f"{attr}_score_desc"] = f"F ({positive_label})"
-												Refactor the Scorer to improve flexibility (#5731)

* Refactor the Scorer to improve flexibility

Refactor the `Scorer` to improve flexibility for arbitrary pipeline
components.

* Individual pipeline components provide their own `evaluate` methods
that score a list of `Example`s and return a dictionary of scores
* `Scorer` is initialized either:
  * with a provided pipeline containing components to be scored
  * with a default pipeline containing the built-in statistical
    components (senter, tagger, morphologizer, parser, ner)
* `Scorer.score` evaluates a list of `Example`s and returns a dictionary
of scores referring to the scores provided by the components in the
pipeline

Significant differences:

* `tags_acc` is renamed to `tag_acc` to be consistent with `token_acc`
and the new `morph_acc`, `pos_acc`, and `lemma_acc`
* Scoring is no longer cumulative: `Scorer.score` scores a list of
examples rather than a single example and does not retain any state
about previously scored examples
* PRF values in the returned scores are no longer multiplied by 100

* Add kwargs to Morphologizer.evaluate

* Create generalized scoring methods in Scorer

* Generalized static scoring methods are added to `Scorer`
  * Methods require an attribute (either on Token or Doc) that is
used to key the returned scores

Naming differences:

* `uas`, `las`, and `las_per_type` in the scores dict are renamed to
`dep_uas`, `dep_las`, and `dep_las_per_type`

Scoring differences:

* `Doc.sents` is now scored as spans rather than on sentence-initial
token positions so that `Doc.sents` and `Doc.ents` can be scored with
the same method (this lowers scores since a single incorrect sentence
start results in two incorrect spans)

* Simplify / extend hasattr check for eval method

* Add hasattr check to tokenizer scoring
* Simplify to hasattr check for component scoring

* Reset Example alignment if docs are set

Reset the Example alignment if either doc is set in case the
tokenization has changed.

* Add PRF tokenization scoring for tokens as spans

Add PRF scores for tokens as character spans. The scores are:

* token_acc: # correct tokens / # gold tokens
* token_p/r/f: PRF for (token.idx, token.idx + len(token))

* Add docstring to Scorer.score_tokenization

* Rename component.evaluate() to component.score()

* Update Scorer API docs

* Update scoring for positive_label in textcat

* Fix TextCategorizer.score kwargs

* Update Language.evaluate docs

* Update score names in default config
											
										
										
											2020-07-25 13:53:02 +03:00
+								        elif not multi_label:
-												Small adjustments to Scorer and docs

											
										
										
											2020-07-28 22:39:42 +03:00
+								            results[f"{attr}_score"] = results[f"{attr}_macro_f"]
 								            results[f"{attr}_score_desc"] = "macro F"
-												Refactor the Scorer to improve flexibility (#5731)

* Refactor the Scorer to improve flexibility

Refactor the `Scorer` to improve flexibility for arbitrary pipeline
components.

* Individual pipeline components provide their own `evaluate` methods
that score a list of `Example`s and return a dictionary of scores
* `Scorer` is initialized either:
  * with a provided pipeline containing components to be scored
  * with a default pipeline containing the built-in statistical
    components (senter, tagger, morphologizer, parser, ner)
* `Scorer.score` evaluates a list of `Example`s and returns a dictionary
of scores referring to the scores provided by the components in the
pipeline

Significant differences:

* `tags_acc` is renamed to `tag_acc` to be consistent with `token_acc`
and the new `morph_acc`, `pos_acc`, and `lemma_acc`
* Scoring is no longer cumulative: `Scorer.score` scores a list of
examples rather than a single example and does not retain any state
about previously scored examples
* PRF values in the returned scores are no longer multiplied by 100

* Add kwargs to Morphologizer.evaluate

* Create generalized scoring methods in Scorer

* Generalized static scoring methods are added to `Scorer`
  * Methods require an attribute (either on Token or Doc) that is
used to key the returned scores

Naming differences:

* `uas`, `las`, and `las_per_type` in the scores dict are renamed to
`dep_uas`, `dep_las`, and `dep_las_per_type`

Scoring differences:

* `Doc.sents` is now scored as spans rather than on sentence-initial
token positions so that `Doc.sents` and `Doc.ents` can be scored with
the same method (this lowers scores since a single incorrect sentence
start results in two incorrect spans)

* Simplify / extend hasattr check for eval method

* Add hasattr check to tokenizer scoring
* Simplify to hasattr check for component scoring

* Reset Example alignment if docs are set

Reset the Example alignment if either doc is set in case the
tokenization has changed.

* Add PRF tokenization scoring for tokens as spans

Add PRF scores for tokens as character spans. The scores are:

* token_acc: # correct tokens / # gold tokens
* token_p/r/f: PRF for (token.idx, token.idx + len(token))

* Add docstring to Scorer.score_tokenization

* Rename component.evaluate() to component.score()

* Update Scorer API docs

* Update scoring for positive_label in textcat

* Fix TextCategorizer.score kwargs

* Update Language.evaluate docs

* Update score names in default config
											
										
										
											2020-07-25 13:53:02 +03:00
+								        else:
-												Small adjustments to Scorer and docs

											
										
										
											2020-07-28 22:39:42 +03:00
+								            results[f"{attr}_score"] = results[f"{attr}_macro_auc"]
 								            results[f"{attr}_score_desc"] = "macro AUC"
-												Refactor the Scorer to improve flexibility (#5731)

* Refactor the Scorer to improve flexibility

Refactor the `Scorer` to improve flexibility for arbitrary pipeline
components.

* Individual pipeline components provide their own `evaluate` methods
that score a list of `Example`s and return a dictionary of scores
* `Scorer` is initialized either:
  * with a provided pipeline containing components to be scored
  * with a default pipeline containing the built-in statistical
    components (senter, tagger, morphologizer, parser, ner)
* `Scorer.score` evaluates a list of `Example`s and returns a dictionary
of scores referring to the scores provided by the components in the
pipeline

Significant differences:

* `tags_acc` is renamed to `tag_acc` to be consistent with `token_acc`
and the new `morph_acc`, `pos_acc`, and `lemma_acc`
* Scoring is no longer cumulative: `Scorer.score` scores a list of
examples rather than a single example and does not retain any state
about previously scored examples
* PRF values in the returned scores are no longer multiplied by 100

* Add kwargs to Morphologizer.evaluate

* Create generalized scoring methods in Scorer

* Generalized static scoring methods are added to `Scorer`
  * Methods require an attribute (either on Token or Doc) that is
used to key the returned scores

Naming differences:

* `uas`, `las`, and `las_per_type` in the scores dict are renamed to
`dep_uas`, `dep_las`, and `dep_las_per_type`

Scoring differences:

* `Doc.sents` is now scored as spans rather than on sentence-initial
token positions so that `Doc.sents` and `Doc.ents` can be scored with
the same method (this lowers scores since a single incorrect sentence
start results in two incorrect spans)

* Simplify / extend hasattr check for eval method

* Add hasattr check to tokenizer scoring
* Simplify to hasattr check for component scoring

* Reset Example alignment if docs are set

Reset the Example alignment if either doc is set in case the
tokenization has changed.

* Add PRF tokenization scoring for tokens as spans

Add PRF scores for tokens as character spans. The scores are:

* token_acc: # correct tokens / # gold tokens
* token_p/r/f: PRF for (token.idx, token.idx + len(token))

* Add docstring to Scorer.score_tokenization

* Rename component.evaluate() to component.score()

* Update Scorer API docs

* Update scoring for positive_label in textcat

* Fix TextCategorizer.score kwargs

* Update Language.evaluate docs

* Update score names in default config
											
										
										
											2020-07-25 13:53:02 +03:00
+								        return results
 								    @staticmethod
 								    def score_deps(
-												Small adjustments to Scorer and docs

											
										
										
											2020-07-28 22:39:42 +03:00
+								        examples: Iterable[Example],
 								        attr: str,
 								        *,
 								        getter: Callable[[Token, str], Any] = getattr,
 								        head_attr: str = "head",
-												Update docs, types and API consistency

											
										
										
											2020-08-17 17:45:24 +03:00
+								        head_getter: Callable[[Token, str], Token] = getattr,
-												Use frozen list with custom errors

We don't want to break backwards compatibility too much but we also want to provide the best possible UX

											
										
										
											2020-08-29 16:20:11 +03:00
+								        ignore_labels: Iterable[str] = SimpleFrozenList(),
-												Small adjustments to Scorer and docs

											
										
										
											2020-07-28 22:39:42 +03:00
+								        **cfg,
 								    ) -> Dict[str, Any]:
-												Refactor the Scorer to improve flexibility (#5731)

* Refactor the Scorer to improve flexibility

Refactor the `Scorer` to improve flexibility for arbitrary pipeline
components.

* Individual pipeline components provide their own `evaluate` methods
that score a list of `Example`s and return a dictionary of scores
* `Scorer` is initialized either:
  * with a provided pipeline containing components to be scored
  * with a default pipeline containing the built-in statistical
    components (senter, tagger, morphologizer, parser, ner)
* `Scorer.score` evaluates a list of `Example`s and returns a dictionary
of scores referring to the scores provided by the components in the
pipeline

Significant differences:

* `tags_acc` is renamed to `tag_acc` to be consistent with `token_acc`
and the new `morph_acc`, `pos_acc`, and `lemma_acc`
* Scoring is no longer cumulative: `Scorer.score` scores a list of
examples rather than a single example and does not retain any state
about previously scored examples
* PRF values in the returned scores are no longer multiplied by 100

* Add kwargs to Morphologizer.evaluate

* Create generalized scoring methods in Scorer

* Generalized static scoring methods are added to `Scorer`
  * Methods require an attribute (either on Token or Doc) that is
used to key the returned scores

Naming differences:

* `uas`, `las`, and `las_per_type` in the scores dict are renamed to
`dep_uas`, `dep_las`, and `dep_las_per_type`

Scoring differences:

* `Doc.sents` is now scored as spans rather than on sentence-initial
token positions so that `Doc.sents` and `Doc.ents` can be scored with
the same method (this lowers scores since a single incorrect sentence
start results in two incorrect spans)

* Simplify / extend hasattr check for eval method

* Add hasattr check to tokenizer scoring
* Simplify to hasattr check for component scoring

* Reset Example alignment if docs are set

Reset the Example alignment if either doc is set in case the
tokenization has changed.

* Add PRF tokenization scoring for tokens as spans

Add PRF scores for tokens as character spans. The scores are:

* token_acc: # correct tokens / # gold tokens
* token_p/r/f: PRF for (token.idx, token.idx + len(token))

* Add docstring to Scorer.score_tokenization

* Rename component.evaluate() to component.score()

* Update Scorer API docs

* Update scoring for positive_label in textcat

* Fix TextCategorizer.score kwargs

* Update Language.evaluate docs

* Update score names in default config
											
										
										
											2020-07-25 13:53:02 +03:00
+								        """Returns the UAS, LAS, and LAS per type scores for dependency
 								        parses.
 								        examples (Iterable[Example]): Examples to score
 								        attr (str): The attribute containing the dependency label.
-												Small adjustments to Scorer and docs

											
										
										
											2020-07-28 22:39:42 +03:00
+								        getter (Callable[[Token, str], Any]): Defaults to getattr. If provided,
-												Refactor the Scorer to improve flexibility (#5731)

* Refactor the Scorer to improve flexibility

Refactor the `Scorer` to improve flexibility for arbitrary pipeline
components.

* Individual pipeline components provide their own `evaluate` methods
that score a list of `Example`s and return a dictionary of scores
* `Scorer` is initialized either:
  * with a provided pipeline containing components to be scored
  * with a default pipeline containing the built-in statistical
    components (senter, tagger, morphologizer, parser, ner)
* `Scorer.score` evaluates a list of `Example`s and returns a dictionary
of scores referring to the scores provided by the components in the
pipeline

Significant differences:

* `tags_acc` is renamed to `tag_acc` to be consistent with `token_acc`
and the new `morph_acc`, `pos_acc`, and `lemma_acc`
* Scoring is no longer cumulative: `Scorer.score` scores a list of
examples rather than a single example and does not retain any state
about previously scored examples
* PRF values in the returned scores are no longer multiplied by 100

* Add kwargs to Morphologizer.evaluate

* Create generalized scoring methods in Scorer

* Generalized static scoring methods are added to `Scorer`
  * Methods require an attribute (either on Token or Doc) that is
used to key the returned scores

Naming differences:

* `uas`, `las`, and `las_per_type` in the scores dict are renamed to
`dep_uas`, `dep_las`, and `dep_las_per_type`

Scoring differences:

* `Doc.sents` is now scored as spans rather than on sentence-initial
token positions so that `Doc.sents` and `Doc.ents` can be scored with
the same method (this lowers scores since a single incorrect sentence
start results in two incorrect spans)

* Simplify / extend hasattr check for eval method

* Add hasattr check to tokenizer scoring
* Simplify to hasattr check for component scoring

* Reset Example alignment if docs are set

Reset the Example alignment if either doc is set in case the
tokenization has changed.

* Add PRF tokenization scoring for tokens as spans

Add PRF scores for tokens as character spans. The scores are:

* token_acc: # correct tokens / # gold tokens
* token_p/r/f: PRF for (token.idx, token.idx + len(token))

* Add docstring to Scorer.score_tokenization

* Rename component.evaluate() to component.score()

* Update Scorer API docs

* Update scoring for positive_label in textcat

* Fix TextCategorizer.score kwargs

* Update Language.evaluate docs

* Update score names in default config
											
										
										
											2020-07-25 13:53:02 +03:00
+								            getter(token, attr) should return the value of the attribute for an
 								            individual token.
 								        head_attr (str): The attribute containing the head token. Defaults to
 								            'head'.
-												Update docs, types and API consistency

											
										
										
											2020-08-17 17:45:24 +03:00
+								        head_getter (Callable[[Token, str], Token]): Defaults to getattr. If provided,
-												Refactor the Scorer to improve flexibility (#5731)

* Refactor the Scorer to improve flexibility

Refactor the `Scorer` to improve flexibility for arbitrary pipeline
components.

* Individual pipeline components provide their own `evaluate` methods
that score a list of `Example`s and return a dictionary of scores
* `Scorer` is initialized either:
  * with a provided pipeline containing components to be scored
  * with a default pipeline containing the built-in statistical
    components (senter, tagger, morphologizer, parser, ner)
* `Scorer.score` evaluates a list of `Example`s and returns a dictionary
of scores referring to the scores provided by the components in the
pipeline

Significant differences:

* `tags_acc` is renamed to `tag_acc` to be consistent with `token_acc`
and the new `morph_acc`, `pos_acc`, and `lemma_acc`
* Scoring is no longer cumulative: `Scorer.score` scores a list of
examples rather than a single example and does not retain any state
about previously scored examples
* PRF values in the returned scores are no longer multiplied by 100

* Add kwargs to Morphologizer.evaluate

* Create generalized scoring methods in Scorer

* Generalized static scoring methods are added to `Scorer`
  * Methods require an attribute (either on Token or Doc) that is
used to key the returned scores

Naming differences:

* `uas`, `las`, and `las_per_type` in the scores dict are renamed to
`dep_uas`, `dep_las`, and `dep_las_per_type`

Scoring differences:

* `Doc.sents` is now scored as spans rather than on sentence-initial
token positions so that `Doc.sents` and `Doc.ents` can be scored with
the same method (this lowers scores since a single incorrect sentence
start results in two incorrect spans)

* Simplify / extend hasattr check for eval method

* Add hasattr check to tokenizer scoring
* Simplify to hasattr check for component scoring

* Reset Example alignment if docs are set

Reset the Example alignment if either doc is set in case the
tokenization has changed.

* Add PRF tokenization scoring for tokens as spans

Add PRF scores for tokens as character spans. The scores are:

* token_acc: # correct tokens / # gold tokens
* token_p/r/f: PRF for (token.idx, token.idx + len(token))

* Add docstring to Scorer.score_tokenization

* Rename component.evaluate() to component.score()

* Update Scorer API docs

* Update scoring for positive_label in textcat

* Fix TextCategorizer.score kwargs

* Update Language.evaluate docs

* Update score names in default config
											
										
										
											2020-07-25 13:53:02 +03:00
+								            head_getter(token, attr) should return the value of the head for an
 								            individual token.
 								        ignore_labels (Tuple): Labels to ignore while scoring (e.g., punct).
-												Small adjustments to Scorer and docs

											
										
										
											2020-07-28 22:39:42 +03:00
+								        RETURNS (Dict[str, Any]): A dictionary containing the scores:
-												Refactor the Scorer to improve flexibility (#5731)

* Refactor the Scorer to improve flexibility

Refactor the `Scorer` to improve flexibility for arbitrary pipeline
components.

* Individual pipeline components provide their own `evaluate` methods
that score a list of `Example`s and return a dictionary of scores
* `Scorer` is initialized either:
  * with a provided pipeline containing components to be scored
  * with a default pipeline containing the built-in statistical
    components (senter, tagger, morphologizer, parser, ner)
* `Scorer.score` evaluates a list of `Example`s and returns a dictionary
of scores referring to the scores provided by the components in the
pipeline

Significant differences:

* `tags_acc` is renamed to `tag_acc` to be consistent with `token_acc`
and the new `morph_acc`, `pos_acc`, and `lemma_acc`
* Scoring is no longer cumulative: `Scorer.score` scores a list of
examples rather than a single example and does not retain any state
about previously scored examples
* PRF values in the returned scores are no longer multiplied by 100

* Add kwargs to Morphologizer.evaluate

* Create generalized scoring methods in Scorer

* Generalized static scoring methods are added to `Scorer`
  * Methods require an attribute (either on Token or Doc) that is
used to key the returned scores

Naming differences:

* `uas`, `las`, and `las_per_type` in the scores dict are renamed to
`dep_uas`, `dep_las`, and `dep_las_per_type`

Scoring differences:

* `Doc.sents` is now scored as spans rather than on sentence-initial
token positions so that `Doc.sents` and `Doc.ents` can be scored with
the same method (this lowers scores since a single incorrect sentence
start results in two incorrect spans)

* Simplify / extend hasattr check for eval method

* Add hasattr check to tokenizer scoring
* Simplify to hasattr check for component scoring

* Reset Example alignment if docs are set

Reset the Example alignment if either doc is set in case the
tokenization has changed.

* Add PRF tokenization scoring for tokens as spans

Add PRF scores for tokens as character spans. The scores are:

* token_acc: # correct tokens / # gold tokens
* token_p/r/f: PRF for (token.idx, token.idx + len(token))

* Add docstring to Scorer.score_tokenization

* Rename component.evaluate() to component.score()

* Update Scorer API docs

* Update scoring for positive_label in textcat

* Fix TextCategorizer.score kwargs

* Update Language.evaluate docs

* Update score names in default config
											
										
										
											2020-07-25 13:53:02 +03:00
+								            attr_uas, attr_las, and attr_las_per_type.
-												Small adjustments to Scorer and docs

											
										
										
											2020-07-28 22:39:42 +03:00
-												Update docs links in codebase

											
										
										
											2020-09-04 13:58:50 +03:00
+								        DOCS: https://nightly.spacy.io/api/scorer#score_deps
-												Refactor the Scorer to improve flexibility (#5731)

* Refactor the Scorer to improve flexibility

Refactor the `Scorer` to improve flexibility for arbitrary pipeline
components.

* Individual pipeline components provide their own `evaluate` methods
that score a list of `Example`s and return a dictionary of scores
* `Scorer` is initialized either:
  * with a provided pipeline containing components to be scored
  * with a default pipeline containing the built-in statistical
    components (senter, tagger, morphologizer, parser, ner)
* `Scorer.score` evaluates a list of `Example`s and returns a dictionary
of scores referring to the scores provided by the components in the
pipeline

Significant differences:

* `tags_acc` is renamed to `tag_acc` to be consistent with `token_acc`
and the new `morph_acc`, `pos_acc`, and `lemma_acc`
* Scoring is no longer cumulative: `Scorer.score` scores a list of
examples rather than a single example and does not retain any state
about previously scored examples
* PRF values in the returned scores are no longer multiplied by 100

* Add kwargs to Morphologizer.evaluate

* Create generalized scoring methods in Scorer

* Generalized static scoring methods are added to `Scorer`
  * Methods require an attribute (either on Token or Doc) that is
used to key the returned scores

Naming differences:

* `uas`, `las`, and `las_per_type` in the scores dict are renamed to
`dep_uas`, `dep_las`, and `dep_las_per_type`

Scoring differences:

* `Doc.sents` is now scored as spans rather than on sentence-initial
token positions so that `Doc.sents` and `Doc.ents` can be scored with
the same method (this lowers scores since a single incorrect sentence
start results in two incorrect spans)

* Simplify / extend hasattr check for eval method

* Add hasattr check to tokenizer scoring
* Simplify to hasattr check for component scoring

* Reset Example alignment if docs are set

Reset the Example alignment if either doc is set in case the
tokenization has changed.

* Add PRF tokenization scoring for tokens as spans

Add PRF scores for tokens as character spans. The scores are:

* token_acc: # correct tokens / # gold tokens
* token_p/r/f: PRF for (token.idx, token.idx + len(token))

* Add docstring to Scorer.score_tokenization

* Rename component.evaluate() to component.score()

* Update Scorer API docs

* Update scoring for positive_label in textcat

* Fix TextCategorizer.score kwargs

* Update Language.evaluate docs

* Update score names in default config
											
										
										
											2020-07-25 13:53:02 +03:00
+								        """
 								        unlabelled = PRFScore()
 								        labelled = PRFScore()
 								        labelled_per_dep = dict()
 								        for example in examples:
 								            gold_doc = example.reference
 								            pred_doc = example.predicted
 								            align = example.alignment
 								            gold_deps = set()
 								            gold_deps_per_dep = {}
 								            for gold_i, token in enumerate(gold_doc):
 								                dep = getter(token, attr)
 								                head = head_getter(token, head_attr)
 								                if dep not in ignore_labels:
 								                    gold_deps.add((gold_i, head.i, dep))
 								                    if dep not in labelled_per_dep:
 								                        labelled_per_dep[dep] = PRFScore()
 								                    if dep not in gold_deps_per_dep:
 								                        gold_deps_per_dep[dep] = set()
 								                    gold_deps_per_dep[dep].add((gold_i, head.i, dep))
 								            pred_deps = set()
 								            pred_deps_per_dep = {}
 								            for token in pred_doc:
 								                if token.orth_.isspace():
 								                    continue
 								                if align.x2y.lengths[token.i] != 1:
 								                    gold_i = None
 								                else:
 								                    gold_i = align.x2y[token.i].dataXd[0, 0]
 								                dep = getter(token, attr)
 								                head = head_getter(token, head_attr)
 								                if dep not in ignore_labels and token.orth_.strip():
 								                    if align.x2y.lengths[head.i] == 1:
 								                        gold_head = align.x2y[head.i].dataXd[0, 0]
 								                    else:
 								                        gold_head = None
 								                    # None is indistinct, so we can't just add it to the set
 								                    # Multiple (None, None) deps are possible
 								                    if gold_i is None or gold_head is None:
 								                        unlabelled.fp += 1
 								                        labelled.fp += 1
 								                    else:
 								                        pred_deps.add((gold_i, gold_head, dep))
 								                        if dep not in labelled_per_dep:
 								                            labelled_per_dep[dep] = PRFScore()
 								                        if dep not in pred_deps_per_dep:
 								                            pred_deps_per_dep[dep] = set()
 								                        pred_deps_per_dep[dep].add((gold_i, gold_head, dep))
 								            labelled.score_set(pred_deps, gold_deps)
 								            for dep in labelled_per_dep:
 								                labelled_per_dep[dep].score_set(
 								                    pred_deps_per_dep.get(dep, set()), gold_deps_per_dep.get(dep, set())
 								                )
 								            unlabelled.score_set(
 								                set(item[:2] for item in pred_deps), set(item[:2] for item in gold_deps)
-												Add textcat to train CLI (#4226)

* Add doc.cats to spacy.gold at the paragraph level

Support `doc.cats` as `"cats": [{"label": string, "value": number}]` in
the spacy JSON training format at the paragraph level.

* `spacy.gold.docs_to_json()` writes `docs.cats`

* `GoldCorpus` reads in cats in each `GoldParse`

* Update instances of gold_tuples to handle cats

Update iteration over gold_tuples / gold_parses to handle addition of
cats at the paragraph level.

* Add textcat to train CLI

* Add textcat options to train CLI
* Add textcat labels in `TextCategorizer.begin_training()`
* Add textcat evaluation to `Scorer`:
  * For binary exclusive classes with provided label: F1 for label
  * For 2+ exclusive classes: F1 macro average
  * For multilabel (not exclusive): ROC AUC macro average (currently
relying on sklearn)
* Provide user info on textcat evaluation settings, potential
incompatibilities
* Provide pipeline to Scorer in `Language.evaluate` for textcat config
* Customize train CLI output to include only metrics relevant to current
pipeline
* Add textcat evaluation to evaluate CLI

* Fix handling of unset arguments and config params

Fix handling of unset arguments and model confiug parameters in Scorer
initialization.

* Temporarily add sklearn requirement

* Remove sklearn version number

* Improve Scorer handling of models without textcats

* Fixing Scorer handling of models without textcats

* Update Scorer output for python 2.7

* Modify inf in Scorer for python 2.7

* Auto-format

Also make small adjustments to make auto-formatting with black easier and produce nicer results

* Move error message to Errors

* Update documentation

* Add cats to annotation JSON format [ci skip]

* Fix tpl flag and docs [ci skip]

* Switch to internal roc_auc_score

Switch to internal `roc_auc_score()` adapted from scikit-learn.

* Add AUCROCScore tests and improve errors/warnings

* Add tests for AUCROCScore and roc_auc_score
* Add missing error for only positive/negative values
* Remove unnecessary warnings and errors

* Make reduced roc_auc_score functions private

Because most of the checks and warnings have been stripped for the
internal functions and access is only intended through `ROCAUCScore`,
make the functions for roc_auc_score adapted from scikit-learn private.

* Check that data corresponds with multilabel flag

Check that the training instances correspond with the multilabel flag,
adding the multilabel flag if required.

* Add textcat score to early stopping check

* Add more checks to debug-data for textcat

* Add example training data for textcat

* Add more checks to textcat train CLI

* Check configuration when extending base model
* Fix typos

* Update textcat example data

* Provide licensing details and licenses for data
* Remove two labels with no positive instances from jigsaw-toxic-comment
data.


Co-authored-by: Ines Montani <ines@ines.io>
											
										
										
											2019-09-15 23:31:31 +03:00
+								            )
-												Refactor the Scorer to improve flexibility (#5731)

* Refactor the Scorer to improve flexibility

Refactor the `Scorer` to improve flexibility for arbitrary pipeline
components.

* Individual pipeline components provide their own `evaluate` methods
that score a list of `Example`s and return a dictionary of scores
* `Scorer` is initialized either:
  * with a provided pipeline containing components to be scored
  * with a default pipeline containing the built-in statistical
    components (senter, tagger, morphologizer, parser, ner)
* `Scorer.score` evaluates a list of `Example`s and returns a dictionary
of scores referring to the scores provided by the components in the
pipeline

Significant differences:

* `tags_acc` is renamed to `tag_acc` to be consistent with `token_acc`
and the new `morph_acc`, `pos_acc`, and `lemma_acc`
* Scoring is no longer cumulative: `Scorer.score` scores a list of
examples rather than a single example and does not retain any state
about previously scored examples
* PRF values in the returned scores are no longer multiplied by 100

* Add kwargs to Morphologizer.evaluate

* Create generalized scoring methods in Scorer

* Generalized static scoring methods are added to `Scorer`
  * Methods require an attribute (either on Token or Doc) that is
used to key the returned scores

Naming differences:

* `uas`, `las`, and `las_per_type` in the scores dict are renamed to
`dep_uas`, `dep_las`, and `dep_las_per_type`

Scoring differences:

* `Doc.sents` is now scored as spans rather than on sentence-initial
token positions so that `Doc.sents` and `Doc.ents` can be scored with
the same method (this lowers scores since a single incorrect sentence
start results in two incorrect spans)

* Simplify / extend hasattr check for eval method

* Add hasattr check to tokenizer scoring
* Simplify to hasattr check for component scoring

* Reset Example alignment if docs are set

Reset the Example alignment if either doc is set in case the
tokenization has changed.

* Add PRF tokenization scoring for tokens as spans

Add PRF scores for tokens as character spans. The scores are:

* token_acc: # correct tokens / # gold tokens
* token_p/r/f: PRF for (token.idx, token.idx + len(token))

* Add docstring to Scorer.score_tokenization

* Rename component.evaluate() to component.score()

* Update Scorer API docs

* Update scoring for positive_label in textcat

* Fix TextCategorizer.score kwargs

* Update Language.evaluate docs

* Update score names in default config
											
										
										
											2020-07-25 13:53:02 +03:00
+								        return {
-												Small adjustments to Scorer and docs

											
										
										
											2020-07-28 22:39:42 +03:00
+								            f"{attr}_uas": unlabelled.fscore,
 								            f"{attr}_las": labelled.fscore,
 								            f"{attr}_las_per_type": {
 								                k: v.to_dict() for k, v in labelled_per_dep.items()
 								            },
-												Refactor the Scorer to improve flexibility (#5731)

* Refactor the Scorer to improve flexibility

Refactor the `Scorer` to improve flexibility for arbitrary pipeline
components.

* Individual pipeline components provide their own `evaluate` methods
that score a list of `Example`s and return a dictionary of scores
* `Scorer` is initialized either:
  * with a provided pipeline containing components to be scored
  * with a default pipeline containing the built-in statistical
    components (senter, tagger, morphologizer, parser, ner)
* `Scorer.score` evaluates a list of `Example`s and returns a dictionary
of scores referring to the scores provided by the components in the
pipeline

Significant differences:

* `tags_acc` is renamed to `tag_acc` to be consistent with `token_acc`
and the new `morph_acc`, `pos_acc`, and `lemma_acc`
* Scoring is no longer cumulative: `Scorer.score` scores a list of
examples rather than a single example and does not retain any state
about previously scored examples
* PRF values in the returned scores are no longer multiplied by 100

* Add kwargs to Morphologizer.evaluate

* Create generalized scoring methods in Scorer

* Generalized static scoring methods are added to `Scorer`
  * Methods require an attribute (either on Token or Doc) that is
used to key the returned scores

Naming differences:

* `uas`, `las`, and `las_per_type` in the scores dict are renamed to
`dep_uas`, `dep_las`, and `dep_las_per_type`

Scoring differences:

* `Doc.sents` is now scored as spans rather than on sentence-initial
token positions so that `Doc.sents` and `Doc.ents` can be scored with
the same method (this lowers scores since a single incorrect sentence
start results in two incorrect spans)

* Simplify / extend hasattr check for eval method

* Add hasattr check to tokenizer scoring
* Simplify to hasattr check for component scoring

* Reset Example alignment if docs are set

Reset the Example alignment if either doc is set in case the
tokenization has changed.

* Add PRF tokenization scoring for tokens as spans

Add PRF scores for tokens as character spans. The scores are:

* token_acc: # correct tokens / # gold tokens
* token_p/r/f: PRF for (token.idx, token.idx + len(token))

* Add docstring to Scorer.score_tokenization

* Rename component.evaluate() to component.score()

* Update Scorer API docs

* Update scoring for positive_label in textcat

* Fix TextCategorizer.score kwargs

* Update Language.evaluate docs

* Update score names in default config
											
										
										
											2020-07-25 13:53:02 +03:00
+								        }
-												Add textcat to train CLI (#4226)

* Add doc.cats to spacy.gold at the paragraph level

Support `doc.cats` as `"cats": [{"label": string, "value": number}]` in
the spacy JSON training format at the paragraph level.

* `spacy.gold.docs_to_json()` writes `docs.cats`

* `GoldCorpus` reads in cats in each `GoldParse`

* Update instances of gold_tuples to handle cats

Update iteration over gold_tuples / gold_parses to handle addition of
cats at the paragraph level.

* Add textcat to train CLI

* Add textcat options to train CLI
* Add textcat labels in `TextCategorizer.begin_training()`
* Add textcat evaluation to `Scorer`:
  * For binary exclusive classes with provided label: F1 for label
  * For 2+ exclusive classes: F1 macro average
  * For multilabel (not exclusive): ROC AUC macro average (currently
relying on sklearn)
* Provide user info on textcat evaluation settings, potential
incompatibilities
* Provide pipeline to Scorer in `Language.evaluate` for textcat config
* Customize train CLI output to include only metrics relevant to current
pipeline
* Add textcat evaluation to evaluate CLI

* Fix handling of unset arguments and config params

Fix handling of unset arguments and model confiug parameters in Scorer
initialization.

* Temporarily add sklearn requirement

* Remove sklearn version number

* Improve Scorer handling of models without textcats

* Fixing Scorer handling of models without textcats

* Update Scorer output for python 2.7

* Modify inf in Scorer for python 2.7

* Auto-format

Also make small adjustments to make auto-formatting with black easier and produce nicer results

* Move error message to Errors

* Update documentation

* Add cats to annotation JSON format [ci skip]

* Fix tpl flag and docs [ci skip]

* Switch to internal roc_auc_score

Switch to internal `roc_auc_score()` adapted from scikit-learn.

* Add AUCROCScore tests and improve errors/warnings

* Add tests for AUCROCScore and roc_auc_score
* Add missing error for only positive/negative values
* Remove unnecessary warnings and errors

* Make reduced roc_auc_score functions private

Because most of the checks and warnings have been stripped for the
internal functions and access is only intended through `ROCAUCScore`,
make the functions for roc_auc_score adapted from scikit-learn private.

* Check that data corresponds with multilabel flag

Check that the training instances correspond with the multilabel flag,
adding the multilabel flag if required.

* Add textcat score to early stopping check

* Add more checks to debug-data for textcat

* Add example training data for textcat

* Add more checks to textcat train CLI

* Check configuration when extending base model
* Fix typos

* Update textcat example data

* Provide licensing details and licenses for data
* Remove two labels with no positive instances from jigsaw-toxic-comment
data.


Co-authored-by: Ines Montani <ines@ines.io>
											
										
										
											2019-09-15 23:31:31 +03:00
 								#############################################################################
 								#
 								# The following implementation of roc_auc_score() is adapted from
 								# scikit-learn, which is distributed under the following license:
 								#
 								# New BSD License
 								#
 								# Copyright (c) 2007–2019 The scikit-learn developers.
 								# All rights reserved.
 								#
 								#
 								# Redistribution and use in source and binary forms, with or without
 								# modification, are permitted provided that the following conditions are met:
 								#
 								#   a. Redistributions of source code must retain the above copyright notice,
 								#      this list of conditions and the following disclaimer.
 								#   b. Redistributions in binary form must reproduce the above copyright
 								#      notice, this list of conditions and the following disclaimer in the
 								#      documentation and/or other materials provided with the distribution.
 								#   c. Neither the name of the Scikit-learn Developers  nor the names of
 								#      its contributors may be used to endorse or promote products
 								#      derived from this software without specific prior written
-												Auto-format

											
										
										
											2019-09-18 20:56:55 +03:00
+								#      permission.
-												Add textcat to train CLI (#4226)

* Add doc.cats to spacy.gold at the paragraph level

Support `doc.cats` as `"cats": [{"label": string, "value": number}]` in
the spacy JSON training format at the paragraph level.

* `spacy.gold.docs_to_json()` writes `docs.cats`

* `GoldCorpus` reads in cats in each `GoldParse`

* Update instances of gold_tuples to handle cats

Update iteration over gold_tuples / gold_parses to handle addition of
cats at the paragraph level.

* Add textcat to train CLI

* Add textcat options to train CLI
* Add textcat labels in `TextCategorizer.begin_training()`
* Add textcat evaluation to `Scorer`:
  * For binary exclusive classes with provided label: F1 for label
  * For 2+ exclusive classes: F1 macro average
  * For multilabel (not exclusive): ROC AUC macro average (currently
relying on sklearn)
* Provide user info on textcat evaluation settings, potential
incompatibilities
* Provide pipeline to Scorer in `Language.evaluate` for textcat config
* Customize train CLI output to include only metrics relevant to current
pipeline
* Add textcat evaluation to evaluate CLI

* Fix handling of unset arguments and config params

Fix handling of unset arguments and model confiug parameters in Scorer
initialization.

* Temporarily add sklearn requirement

* Remove sklearn version number

* Improve Scorer handling of models without textcats

* Fixing Scorer handling of models without textcats

* Update Scorer output for python 2.7

* Modify inf in Scorer for python 2.7

* Auto-format

Also make small adjustments to make auto-formatting with black easier and produce nicer results

* Move error message to Errors

* Update documentation

* Add cats to annotation JSON format [ci skip]

* Fix tpl flag and docs [ci skip]

* Switch to internal roc_auc_score

Switch to internal `roc_auc_score()` adapted from scikit-learn.

* Add AUCROCScore tests and improve errors/warnings

* Add tests for AUCROCScore and roc_auc_score
* Add missing error for only positive/negative values
* Remove unnecessary warnings and errors

* Make reduced roc_auc_score functions private

Because most of the checks and warnings have been stripped for the
internal functions and access is only intended through `ROCAUCScore`,
make the functions for roc_auc_score adapted from scikit-learn private.

* Check that data corresponds with multilabel flag

Check that the training instances correspond with the multilabel flag,
adding the multilabel flag if required.

* Add textcat score to early stopping check

* Add more checks to debug-data for textcat

* Add example training data for textcat

* Add more checks to textcat train CLI

* Check configuration when extending base model
* Fix typos

* Update textcat example data

* Provide licensing details and licenses for data
* Remove two labels with no positive instances from jigsaw-toxic-comment
data.


Co-authored-by: Ines Montani <ines@ines.io>
											
										
										
											2019-09-15 23:31:31 +03:00
+								#
 								#
 								# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
 								# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 								# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 								# ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE FOR
 								# ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 								# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
 								# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
 								# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 								# LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 								# OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH
 								# DAMAGE.
-												Auto-format

											
										
										
											2019-09-18 20:56:55 +03:00
-												Add textcat to train CLI (#4226)

* Add doc.cats to spacy.gold at the paragraph level

Support `doc.cats` as `"cats": [{"label": string, "value": number}]` in
the spacy JSON training format at the paragraph level.

* `spacy.gold.docs_to_json()` writes `docs.cats`

* `GoldCorpus` reads in cats in each `GoldParse`

* Update instances of gold_tuples to handle cats

Update iteration over gold_tuples / gold_parses to handle addition of
cats at the paragraph level.

* Add textcat to train CLI

* Add textcat options to train CLI
* Add textcat labels in `TextCategorizer.begin_training()`
* Add textcat evaluation to `Scorer`:
  * For binary exclusive classes with provided label: F1 for label
  * For 2+ exclusive classes: F1 macro average
  * For multilabel (not exclusive): ROC AUC macro average (currently
relying on sklearn)
* Provide user info on textcat evaluation settings, potential
incompatibilities
* Provide pipeline to Scorer in `Language.evaluate` for textcat config
* Customize train CLI output to include only metrics relevant to current
pipeline
* Add textcat evaluation to evaluate CLI

* Fix handling of unset arguments and config params

Fix handling of unset arguments and model confiug parameters in Scorer
initialization.

* Temporarily add sklearn requirement

* Remove sklearn version number

* Improve Scorer handling of models without textcats

* Fixing Scorer handling of models without textcats

* Update Scorer output for python 2.7

* Modify inf in Scorer for python 2.7

* Auto-format

Also make small adjustments to make auto-formatting with black easier and produce nicer results

* Move error message to Errors

* Update documentation

* Add cats to annotation JSON format [ci skip]

* Fix tpl flag and docs [ci skip]

* Switch to internal roc_auc_score

Switch to internal `roc_auc_score()` adapted from scikit-learn.

* Add AUCROCScore tests and improve errors/warnings

* Add tests for AUCROCScore and roc_auc_score
* Add missing error for only positive/negative values
* Remove unnecessary warnings and errors

* Make reduced roc_auc_score functions private

Because most of the checks and warnings have been stripped for the
internal functions and access is only intended through `ROCAUCScore`,
make the functions for roc_auc_score adapted from scikit-learn private.

* Check that data corresponds with multilabel flag

Check that the training instances correspond with the multilabel flag,
adding the multilabel flag if required.

* Add textcat score to early stopping check

* Add more checks to debug-data for textcat

* Add example training data for textcat

* Add more checks to textcat train CLI

* Check configuration when extending base model
* Fix typos

* Update textcat example data

* Provide licensing details and licenses for data
* Remove two labels with no positive instances from jigsaw-toxic-comment
data.


Co-authored-by: Ines Montani <ines@ines.io>
											
										
										
											2019-09-15 23:31:31 +03:00
+								def _roc_auc_score(y_true, y_score):
 								    """Compute Area Under the Receiver Operating Characteristic Curve (ROC AUC)
 								    from prediction scores.
 								    Note: this implementation is restricted to the binary classification task
 								    Parameters
 								    ----------
 								    y_true : array, shape = [n_samples] or [n_samples, n_classes]
 								        True binary labels or binary label indicators.
 								        The multiclass case expects shape = [n_samples] and labels
 								        with values in ``range(n_classes)``.
 								    y_score : array, shape = [n_samples] or [n_samples, n_classes]
 								        Target scores, can either be probability estimates of the positive
 								        class, confidence values, or non-thresholded measure of decisions
 								        (as returned by "decision_function" on some classifiers). For binary
 								        y_true, y_score is supposed to be the score of the class with greater
 								        label. The multiclass case expects shape = [n_samples, n_classes]
 								        where the scores correspond to probability estimates.
 								    Returns
 								    -------
 								    auc : float
 								    References
 								    ----------
 								    .. [1] `Wikipedia entry for the Receiver operating characteristic
 								            <https://en.wikipedia.org/wiki/Receiver_operating_characteristic>`_
 								    .. [2] Fawcett T. An introduction to ROC analysis[J]. Pattern Recognition
 								           Letters, 2006, 27(8):861-874.
 								    .. [3] `Analyzing a portion of the ROC curve. McClish, 1989
 								            <https://www.ncbi.nlm.nih.gov/pubmed/2668680>`_
 								    """
 								    if len(np.unique(y_true)) != 2:
 								        raise ValueError(Errors.E165)
 								    fpr, tpr, _ = _roc_curve(y_true, y_score)
 								    return _auc(fpr, tpr)
 								def _roc_curve(y_true, y_score):
 								    """Compute Receiver operating characteristic (ROC)
 								    Note: this implementation is restricted to the binary classification task.
 								    Parameters
 								    ----------
 								    y_true : array, shape = [n_samples]
 								        True binary labels. If labels are not either {-1, 1} or {0, 1}, then
 								        pos_label should be explicitly given.
 								    y_score : array, shape = [n_samples]
 								        Target scores, can either be probability estimates of the positive
 								        class, confidence values, or non-thresholded measure of decisions
 								        (as returned by "decision_function" on some classifiers).
 								    Returns
 								    -------
 								    fpr : array, shape = [>2]
 								        Increasing false positive rates such that element i is the false
 								        positive rate of predictions with score >= thresholds[i].
 								    tpr : array, shape = [>2]
 								        Increasing true positive rates such that element i is the true
 								        positive rate of predictions with score >= thresholds[i].
 								    thresholds : array, shape = [n_thresholds]
 								        Decreasing thresholds on the decision function used to compute
 								        fpr and tpr. `thresholds[0]` represents no instances being predicted
 								        and is arbitrarily set to `max(y_score) + 1`.
 								    Notes
 								    -----
 								    Since the thresholds are sorted from low to high values, they
 								    are reversed upon returning them to ensure they correspond to both ``fpr``
 								    and ``tpr``, which are sorted in reversed order during their calculation.
 								    References
 								    ----------
 								    .. [1] `Wikipedia entry for the Receiver operating characteristic
 								            <https://en.wikipedia.org/wiki/Receiver_operating_characteristic>`_
 								    .. [2] Fawcett T. An introduction to ROC analysis[J]. Pattern Recognition
 								           Letters, 2006, 27(8):861-874.
 								    """
 								    fps, tps, thresholds = _binary_clf_curve(y_true, y_score)
 								    # Add an extra threshold position
 								    # to make sure that the curve starts at (0, 0)
 								    tps = np.r_[0, tps]
 								    fps = np.r_[0, fps]
 								    thresholds = np.r_[thresholds[0] + 1, thresholds]
 								    if fps[-1] <= 0:
 								        fpr = np.repeat(np.nan, fps.shape)
 								    else:
 								        fpr = fps / fps[-1]
 								    if tps[-1] <= 0:
 								        tpr = np.repeat(np.nan, tps.shape)
 								    else:
 								        tpr = tps / tps[-1]
 								    return fpr, tpr, thresholds
 								def _binary_clf_curve(y_true, y_score):
 								    """Calculate true and false positives per binary classification threshold.
 								    Parameters
 								    ----------
 								    y_true : array, shape = [n_samples]
 								        True targets of binary classification
 								    y_score : array, shape = [n_samples]
 								        Estimated probabilities or decision function
 								    Returns
 								    -------
 								    fps : array, shape = [n_thresholds]
 								        A count of false positives, at index i being the number of negative
 								        samples assigned a score >= thresholds[i]. The total number of
 								        negative samples is equal to fps[-1] (thus true negatives are given by
 								        fps[-1] - fps).
 								    tps : array, shape = [n_thresholds <= len(np.unique(y_score))]
 								        An increasing count of true positives, at index i being the number
 								        of positive samples assigned a score >= thresholds[i]. The total
 								        number of positive samples is equal to tps[-1] (thus false negatives
 								        are given by tps[-1] - tps).
 								    thresholds : array, shape = [n_thresholds]
 								        Decreasing score values.
 								    """
-												Auto-format

											
										
										
											2019-09-18 20:56:55 +03:00
+								    pos_label = 1.0
-												Add textcat to train CLI (#4226)

* Add doc.cats to spacy.gold at the paragraph level

Support `doc.cats` as `"cats": [{"label": string, "value": number}]` in
the spacy JSON training format at the paragraph level.

* `spacy.gold.docs_to_json()` writes `docs.cats`

* `GoldCorpus` reads in cats in each `GoldParse`

* Update instances of gold_tuples to handle cats

Update iteration over gold_tuples / gold_parses to handle addition of
cats at the paragraph level.

* Add textcat to train CLI

* Add textcat options to train CLI
* Add textcat labels in `TextCategorizer.begin_training()`
* Add textcat evaluation to `Scorer`:
  * For binary exclusive classes with provided label: F1 for label
  * For 2+ exclusive classes: F1 macro average
  * For multilabel (not exclusive): ROC AUC macro average (currently
relying on sklearn)
* Provide user info on textcat evaluation settings, potential
incompatibilities
* Provide pipeline to Scorer in `Language.evaluate` for textcat config
* Customize train CLI output to include only metrics relevant to current
pipeline
* Add textcat evaluation to evaluate CLI

* Fix handling of unset arguments and config params

Fix handling of unset arguments and model confiug parameters in Scorer
initialization.

* Temporarily add sklearn requirement

* Remove sklearn version number

* Improve Scorer handling of models without textcats

* Fixing Scorer handling of models without textcats

* Update Scorer output for python 2.7

* Modify inf in Scorer for python 2.7

* Auto-format

Also make small adjustments to make auto-formatting with black easier and produce nicer results

* Move error message to Errors

* Update documentation

* Add cats to annotation JSON format [ci skip]

* Fix tpl flag and docs [ci skip]

* Switch to internal roc_auc_score

Switch to internal `roc_auc_score()` adapted from scikit-learn.

* Add AUCROCScore tests and improve errors/warnings

* Add tests for AUCROCScore and roc_auc_score
* Add missing error for only positive/negative values
* Remove unnecessary warnings and errors

* Make reduced roc_auc_score functions private

Because most of the checks and warnings have been stripped for the
internal functions and access is only intended through `ROCAUCScore`,
make the functions for roc_auc_score adapted from scikit-learn private.

* Check that data corresponds with multilabel flag

Check that the training instances correspond with the multilabel flag,
adding the multilabel flag if required.

* Add textcat score to early stopping check

* Add more checks to debug-data for textcat

* Add example training data for textcat

* Add more checks to textcat train CLI

* Check configuration when extending base model
* Fix typos

* Update textcat example data

* Provide licensing details and licenses for data
* Remove two labels with no positive instances from jigsaw-toxic-comment
data.


Co-authored-by: Ines Montani <ines@ines.io>
											
										
										
											2019-09-15 23:31:31 +03:00
 								    y_true = np.ravel(y_true)
 								    y_score = np.ravel(y_score)
 								    # make y_true a boolean vector
-												Auto-format

											
										
										
											2019-09-18 20:56:55 +03:00
+								    y_true = y_true == pos_label
-												Add textcat to train CLI (#4226)

* Add doc.cats to spacy.gold at the paragraph level

Support `doc.cats` as `"cats": [{"label": string, "value": number}]` in
the spacy JSON training format at the paragraph level.

* `spacy.gold.docs_to_json()` writes `docs.cats`

* `GoldCorpus` reads in cats in each `GoldParse`

* Update instances of gold_tuples to handle cats

Update iteration over gold_tuples / gold_parses to handle addition of
cats at the paragraph level.

* Add textcat to train CLI

* Add textcat options to train CLI
* Add textcat labels in `TextCategorizer.begin_training()`
* Add textcat evaluation to `Scorer`:
  * For binary exclusive classes with provided label: F1 for label
  * For 2+ exclusive classes: F1 macro average
  * For multilabel (not exclusive): ROC AUC macro average (currently
relying on sklearn)
* Provide user info on textcat evaluation settings, potential
incompatibilities
* Provide pipeline to Scorer in `Language.evaluate` for textcat config
* Customize train CLI output to include only metrics relevant to current
pipeline
* Add textcat evaluation to evaluate CLI

* Fix handling of unset arguments and config params

Fix handling of unset arguments and model confiug parameters in Scorer
initialization.

* Temporarily add sklearn requirement

* Remove sklearn version number

* Improve Scorer handling of models without textcats

* Fixing Scorer handling of models without textcats

* Update Scorer output for python 2.7

* Modify inf in Scorer for python 2.7

* Auto-format

Also make small adjustments to make auto-formatting with black easier and produce nicer results

* Move error message to Errors

* Update documentation

* Add cats to annotation JSON format [ci skip]

* Fix tpl flag and docs [ci skip]

* Switch to internal roc_auc_score

Switch to internal `roc_auc_score()` adapted from scikit-learn.

* Add AUCROCScore tests and improve errors/warnings

* Add tests for AUCROCScore and roc_auc_score
* Add missing error for only positive/negative values
* Remove unnecessary warnings and errors

* Make reduced roc_auc_score functions private

Because most of the checks and warnings have been stripped for the
internal functions and access is only intended through `ROCAUCScore`,
make the functions for roc_auc_score adapted from scikit-learn private.

* Check that data corresponds with multilabel flag

Check that the training instances correspond with the multilabel flag,
adding the multilabel flag if required.

* Add textcat score to early stopping check

* Add more checks to debug-data for textcat

* Add example training data for textcat

* Add more checks to textcat train CLI

* Check configuration when extending base model
* Fix typos

* Update textcat example data

* Provide licensing details and licenses for data
* Remove two labels with no positive instances from jigsaw-toxic-comment
data.


Co-authored-by: Ines Montani <ines@ines.io>
											
										
										
											2019-09-15 23:31:31 +03:00
 								    # sort scores and corresponding truth values
 								    desc_score_indices = np.argsort(y_score, kind="mergesort")[::-1]
 								    y_score = y_score[desc_score_indices]
 								    y_true = y_true[desc_score_indices]
-												Auto-format

											
										
										
											2019-09-18 20:56:55 +03:00
+								    weight = 1.0
-												Add textcat to train CLI (#4226)

* Add doc.cats to spacy.gold at the paragraph level

Support `doc.cats` as `"cats": [{"label": string, "value": number}]` in
the spacy JSON training format at the paragraph level.

* `spacy.gold.docs_to_json()` writes `docs.cats`

* `GoldCorpus` reads in cats in each `GoldParse`

* Update instances of gold_tuples to handle cats

Update iteration over gold_tuples / gold_parses to handle addition of
cats at the paragraph level.

* Add textcat to train CLI

* Add textcat options to train CLI
* Add textcat labels in `TextCategorizer.begin_training()`
* Add textcat evaluation to `Scorer`:
  * For binary exclusive classes with provided label: F1 for label
  * For 2+ exclusive classes: F1 macro average
  * For multilabel (not exclusive): ROC AUC macro average (currently
relying on sklearn)
* Provide user info on textcat evaluation settings, potential
incompatibilities
* Provide pipeline to Scorer in `Language.evaluate` for textcat config
* Customize train CLI output to include only metrics relevant to current
pipeline
* Add textcat evaluation to evaluate CLI

* Fix handling of unset arguments and config params

Fix handling of unset arguments and model confiug parameters in Scorer
initialization.

* Temporarily add sklearn requirement

* Remove sklearn version number

* Improve Scorer handling of models without textcats

* Fixing Scorer handling of models without textcats

* Update Scorer output for python 2.7

* Modify inf in Scorer for python 2.7

* Auto-format

Also make small adjustments to make auto-formatting with black easier and produce nicer results

* Move error message to Errors

* Update documentation

* Add cats to annotation JSON format [ci skip]

* Fix tpl flag and docs [ci skip]

* Switch to internal roc_auc_score

Switch to internal `roc_auc_score()` adapted from scikit-learn.

* Add AUCROCScore tests and improve errors/warnings

* Add tests for AUCROCScore and roc_auc_score
* Add missing error for only positive/negative values
* Remove unnecessary warnings and errors

* Make reduced roc_auc_score functions private

Because most of the checks and warnings have been stripped for the
internal functions and access is only intended through `ROCAUCScore`,
make the functions for roc_auc_score adapted from scikit-learn private.

* Check that data corresponds with multilabel flag

Check that the training instances correspond with the multilabel flag,
adding the multilabel flag if required.

* Add textcat score to early stopping check

* Add more checks to debug-data for textcat

* Add example training data for textcat

* Add more checks to textcat train CLI

* Check configuration when extending base model
* Fix typos

* Update textcat example data

* Provide licensing details and licenses for data
* Remove two labels with no positive instances from jigsaw-toxic-comment
data.


Co-authored-by: Ines Montani <ines@ines.io>
											
										
										
											2019-09-15 23:31:31 +03:00
 								    # y_score typically has many tied values. Here we extract
 								    # the indices associated with the distinct values. We also
 								    # concatenate a value for the end of the curve.
 								    distinct_value_indices = np.where(np.diff(y_score))[0]
 								    threshold_idxs = np.r_[distinct_value_indices, y_true.size - 1]
 								    # accumulate the true positives with decreasing threshold
 								    tps = _stable_cumsum(y_true * weight)[threshold_idxs]
 								    fps = 1 + threshold_idxs - tps
 								    return fps, tps, y_score[threshold_idxs]
 								def _stable_cumsum(arr, axis=None, rtol=1e-05, atol=1e-08):
 								    """Use high precision for cumsum and check that final value matches sum
 								    Parameters
 								    ----------
 								    arr : array-like
 								        To be cumulatively summed as flat
 								    axis : int, optional
 								        Axis along which the cumulative sum is computed.
 								        The default (None) is to compute the cumsum over the flattened array.
 								    rtol : float
 								        Relative tolerance, see ``np.allclose``
 								    atol : float
 								        Absolute tolerance, see ``np.allclose``
 								    """
 								    out = np.cumsum(arr, axis=axis, dtype=np.float64)
 								    expected = np.sum(arr, axis=axis, dtype=np.float64)
-												Auto-format

											
										
										
											2019-09-18 20:56:55 +03:00
+								    if not np.all(
 								        np.isclose(
 								            out.take(-1, axis=axis), expected, rtol=rtol, atol=atol, equal_nan=True
 								        )
 								    ):
-												Add textcat to train CLI (#4226)

* Add doc.cats to spacy.gold at the paragraph level

Support `doc.cats` as `"cats": [{"label": string, "value": number}]` in
the spacy JSON training format at the paragraph level.

* `spacy.gold.docs_to_json()` writes `docs.cats`

* `GoldCorpus` reads in cats in each `GoldParse`

* Update instances of gold_tuples to handle cats

Update iteration over gold_tuples / gold_parses to handle addition of
cats at the paragraph level.

* Add textcat to train CLI

* Add textcat options to train CLI
* Add textcat labels in `TextCategorizer.begin_training()`
* Add textcat evaluation to `Scorer`:
  * For binary exclusive classes with provided label: F1 for label
  * For 2+ exclusive classes: F1 macro average
  * For multilabel (not exclusive): ROC AUC macro average (currently
relying on sklearn)
* Provide user info on textcat evaluation settings, potential
incompatibilities
* Provide pipeline to Scorer in `Language.evaluate` for textcat config
* Customize train CLI output to include only metrics relevant to current
pipeline
* Add textcat evaluation to evaluate CLI

* Fix handling of unset arguments and config params

Fix handling of unset arguments and model confiug parameters in Scorer
initialization.

* Temporarily add sklearn requirement

* Remove sklearn version number

* Improve Scorer handling of models without textcats

* Fixing Scorer handling of models without textcats

* Update Scorer output for python 2.7

* Modify inf in Scorer for python 2.7

* Auto-format

Also make small adjustments to make auto-formatting with black easier and produce nicer results

* Move error message to Errors

* Update documentation

* Add cats to annotation JSON format [ci skip]

* Fix tpl flag and docs [ci skip]

* Switch to internal roc_auc_score

Switch to internal `roc_auc_score()` adapted from scikit-learn.

* Add AUCROCScore tests and improve errors/warnings

* Add tests for AUCROCScore and roc_auc_score
* Add missing error for only positive/negative values
* Remove unnecessary warnings and errors

* Make reduced roc_auc_score functions private

Because most of the checks and warnings have been stripped for the
internal functions and access is only intended through `ROCAUCScore`,
make the functions for roc_auc_score adapted from scikit-learn private.

* Check that data corresponds with multilabel flag

Check that the training instances correspond with the multilabel flag,
adding the multilabel flag if required.

* Add textcat score to early stopping check

* Add more checks to debug-data for textcat

* Add example training data for textcat

* Add more checks to textcat train CLI

* Check configuration when extending base model
* Fix typos

* Update textcat example data

* Provide licensing details and licenses for data
* Remove two labels with no positive instances from jigsaw-toxic-comment
data.


Co-authored-by: Ines Montani <ines@ines.io>
											
										
										
											2019-09-15 23:31:31 +03:00
+								        raise ValueError(Errors.E163)
 								    return out
 								def _auc(x, y):
 								    """Compute Area Under the Curve (AUC) using the trapezoidal rule
 								    This is a general function, given points on a curve.  For computing the
 								    area under the ROC-curve, see :func:`roc_auc_score`.
 								    Parameters
 								    ----------
 								    x : array, shape = [n]
 								        x coordinates. These must be either monotonic increasing or monotonic
 								        decreasing.
 								    y : array, shape = [n]
 								        y coordinates.
 								    Returns
 								    -------
 								    auc : float
 								    """
 								    x = np.ravel(x)
 								    y = np.ravel(y)
 								    direction = 1
 								    dx = np.diff(x)
 								    if np.any(dx < 0):
 								        if np.all(dx <= 0):
 								            direction = -1
 								        else:
 								            raise ValueError(Errors.E164.format(x))
 								    area = direction * np.trapz(y, x)
 								    if isinstance(area, np.memmap):
 								        # Reductions such as .sum used internally in np.trapz do not return a
 								        # scalar by default for numpy.memmap instances contrary to
 								        # regular numpy.ndarray instances.
 								        area = area.dtype.type(area)
 								    return area