spaCy/spacy/scorer.py

import numpy as np

from .errors import Errors
from .util import get_lang_class
from .morphology import Morphology


class PRFScore:
    """Running true-positive / false-positive / false-negative counters.

    Precision, recall and F-score are derived from the counters on access.
    A tiny constant (1e-100) is added to every denominator so that empty
    counters yield 0.0 instead of raising ZeroDivisionError.
    """

    def __init__(self):
        # All three counters start at zero.
        self.tp = self.fp = self.fn = 0

    def score_set(self, cand, gold):
        """Add the overlap statistics of a candidate set and a gold set.

        cand (set): The predicted items.
        gold (set): The reference items.
        """
        matched = cand.intersection(gold)
        self.tp += len(matched)
        spurious = cand - gold
        self.fp += len(spurious)
        missed = gold - cand
        self.fn += len(missed)

    @property
    def precision(self):
        """tp / (tp + fp), or 0.0 when nothing has been counted."""
        denominator = self.tp + self.fp + 1e-100
        return self.tp / denominator

    @property
    def recall(self):
        """tp / (tp + fn), or 0.0 when nothing has been counted."""
        denominator = self.tp + self.fn + 1e-100
        return self.tp / denominator

    @property
    def fscore(self):
        """Harmonic mean of precision and recall."""
        precision, recall = self.precision, self.recall
        ratio = (precision * recall) / (precision + recall + 1e-100)
        return 2 * ratio

    def to_dict(self):
        """RETURNS (dict): The scores under the keys "p", "r" and "f"."""
        return dict(p=self.precision, r=self.recall, f=self.fscore)


class ROCAUCScore:
    """Collects gold / candidate values and computes a ROC AUC score lazily.

    The score is cached and only recomputed when the number of collected
    gold values differs from the number seen at the last computation.
    """

    def __init__(self):
        self.golds, self.cands = [], []
        # Cache: the last score and the sample count it was computed at.
        self.saved_score = 0.0
        self.saved_score_at_len = 0

    def score_set(self, cand, gold):
        """Record one candidate value together with its gold value."""
        self.cands.append(cand)
        self.golds.append(gold)

    @property
    def score(self):
        """The ROC AUC over everything recorded so far.

        Evaluates to -inf when _roc_auc_score raises ValueError (only one
        class present in the gold values, so ROC AUC is not defined).
        """
        n_samples = len(self.golds)
        if n_samples != self.saved_score_at_len:
            try:
                self.saved_score = _roc_auc_score(self.golds, self.cands)
            except ValueError:
                self.saved_score = float("-inf")
            self.saved_score_at_len = n_samples
        return self.saved_score


class Scorer:
    """Compute evaluation scores."""

    def __init__(self, nlp=None, **cfg):
        """Initialize the Scorer.
        RETURNS (Scorer): The newly created object.

        DOCS: https://spacy.io/api/scorer#init
        """
        self.nlp = nlp
        self.cfg = cfg

        if not nlp:
            # create a default pipeline
            nlp = get_lang_class("xx")()
            nlp.add_pipe("senter")
            nlp.add_pipe("tagger")
            nlp.add_pipe("morphologizer")
            nlp.add_pipe("parser")
            nlp.add_pipe("ner")
            nlp.add_pipe("textcat")
            self.nlp = nlp

    def score(self, examples):
        """Evaluate a list of Examples.

        examples (Iterable[Example]): The predicted annotations + correct annotations.
        RETURNS (Dict): A dictionary of scores.
        DOCS: https://spacy.io/api/scorer#score
        """
        scores = {}

        if hasattr(self.nlp.tokenizer, "score"):
            scores.update(self.nlp.tokenizer.score(examples, **self.cfg))
        for name, component in self.nlp.pipeline:
            if hasattr(component, "score"):
                scores.update(component.score(examples, **self.cfg))

        return scores

    @staticmethod
    def score_tokenization(examples, **cfg):
        """Returns accuracy and PRF scores for tokenization.

        * token_acc: # correct predicted tokens / # predicted tokens, where a
          predicted token is correct if its x2y alignment length is exactly 1
        * token_p/r/f: PRF for token character spans

        Whitespace-only tokens are ignored on both sides.

        examples (Iterable[Example]): Examples to score
        RETURNS (dict): A dictionary containing the scores token_acc/p/r/f.
        """
        acc_score = PRFScore()
        prf_score = PRFScore()
        for example in examples:
            gold_doc = example.reference
            pred_doc = example.predicted
            align = example.alignment
            # Character spans (start, end) of all non-whitespace gold tokens.
            gold_spans = set()
            for token in gold_doc:
                if token.orth_.isspace():
                    continue
                gold_spans.add((token.idx, token.idx + len(token)))
            pred_spans = set()
            for token in pred_doc:
                if token.orth_.isspace():
                    continue
                pred_spans.add((token.idx, token.idx + len(token)))
                if align.x2y.lengths[token.i] == 1:
                    acc_score.tp += 1
                else:
                    acc_score.fp += 1
            prf_score.score_set(pred_spans, gold_spans)
        return {
            # acc_score only ever receives tp / fp counts (fn stays 0), so its
            # precision is the share of correct tokens. Its F-score would be
            # 2p / (1 + p), which overstates the accuracy.
            "token_acc": acc_score.precision,
            "token_p": prf_score.precision,
            "token_r": prf_score.recall,
            "token_f": prf_score.fscore,
        }

    @staticmethod
    def score_token_attr(examples, attr, getter=getattr, **cfg):
        """Returns an accuracy score for a token-level attribute.

        Gold values are keyed by gold token index. A predicted token only
        contributes if it is not whitespace and its x2y alignment length is
        exactly 1; its value is then keyed by the aligned gold index.

        examples (Iterable[Example]): Examples to score
        attr (str): The attribute to score.
        getter (callable): Defaults to getattr. If provided,
            getter(token, attr) should return the value of the attribute for an
            individual token.
        RETURNS (dict): A dictionary containing the accuracy score under the
            key attr_acc.
        """
        tag_score = PRFScore()
        for example in examples:
            reference = example.reference
            predicted = example.predicted
            alignment = example.alignment
            gold_tags = {
                (gold_idx, getter(gold_token, attr))
                for gold_idx, gold_token in enumerate(reference)
            }
            pred_tags = set()
            for pred_token in predicted:
                if pred_token.orth_.isspace():
                    continue
                if alignment.x2y.lengths[pred_token.i] != 1:
                    continue
                gold_idx = alignment.x2y[pred_token.i].dataXd[0, 0]
                pred_tags.add((gold_idx, getter(pred_token, attr)))
            tag_score.score_set(pred_tags, gold_tags)
        return {attr + "_acc": tag_score.fscore}

    @staticmethod
    def score_token_attr_per_feat(examples, attr, getter=getattr, **cfg):
        """Return PRF scores per feat for a token attribute in UFEATS format.

        The attribute value of every token is converted to a string and split
        into features on Morphology.FEATURE_SEP; each feature is split into
        field and values on Morphology.FIELD_SEP. The same separators are
        used for gold and predicted tokens.

        examples (Iterable[Example]): Examples to score
        attr (str): The attribute to score.
        getter (callable): Defaults to getattr. If provided,
            getter(token, attr) should return the value of the attribute for an
            individual token.
        RETURNS (dict): A dictionary containing the per-feat PRFScore objects
            under the key attr_per_feat.
        """
        per_feat = {}

        def collect(morph, gold_i, target):
            # File every feature of one morph string under its field, and make
            # sure each field seen has an accumulator in per_feat.
            if not morph:
                return
            for feat in morph.split(Morphology.FEATURE_SEP):
                field, _values = feat.split(Morphology.FIELD_SEP)
                if field not in per_feat:
                    per_feat[field] = PRFScore()
                target.setdefault(field, set()).add((gold_i, feat))

        for example in examples:
            pred_doc = example.predicted
            gold_doc = example.reference
            align = example.alignment
            gold_per_feat = {}
            for gold_i, token in enumerate(gold_doc):
                collect(str(getter(token, attr)), gold_i, gold_per_feat)
            pred_per_feat = {}
            for token in pred_doc:
                if token.orth_.isspace():
                    continue
                # Only tokens with an x2y alignment length of 1 are scored.
                if align.x2y.lengths[token.i] == 1:
                    gold_i = align.x2y[token.i].dataXd[0, 0]
                    collect(str(getter(token, attr)), gold_i, pred_per_feat)
            # Every known field is scored for every example, so a field that
            # is absent here still contributes empty sets.
            for field in per_feat:
                per_feat[field].score_set(
                    pred_per_feat.get(field, set()), gold_per_feat.get(field, set()),
                )
        return {
            attr + "_per_feat": per_feat,
        }

    @staticmethod
    def score_spans(examples, attr, getter=getattr, **cfg):
        """Returns PRF scores for labeled spans.

        Spans are compared as (label, start, end - 1) triples. Predicted
        spans are first mapped with example.get_aligned_spans_x2y.

        examples (Iterable[Example]): Examples to score
        attr (str): The attribute to score.
        getter (callable): Defaults to getattr. If provided,
            getter(doc, attr) should return the spans for the individual doc.
        RETURNS (dict): A dictionary containing the PRF scores under the
            keys attr_p/r/f and the per-type PRF scores under attr_per_type.
        """
        overall = PRFScore()
        per_label = {}
        for example in examples:
            pred_doc = example.predicted
            gold_doc = example.reference
            # Collect every label that occurs in the gold or predicted spans
            # and make sure each has a per-type accumulator.
            gold_labels = [span.label_ for span in getter(gold_doc, attr)]
            pred_labels = [span.label_ for span in getter(pred_doc, attr)]
            labels = set(gold_labels + pred_labels)
            for label in labels:
                per_label.setdefault(label, PRFScore())

            # Special case for ents:
            # If we have missing values in the gold, we can't easily tell
            # whether our NER predictions are true.
            # It seems bad but it's what we've always done.
            if attr == "ents" and any(token.ent_iob == 0 for token in gold_doc):
                continue

            # Gold spans, overall and per type
            gold_spans = set()
            gold_by_label = {label: set() for label in labels}
            for span in getter(gold_doc, attr):
                key = (span.label_, span.start, span.end - 1)
                gold_spans.add(key)
                gold_by_label[span.label_].add(key)

            # Predicted spans, overall and per type
            pred_spans = set()
            pred_by_label = {label: set() for label in labels}
            for span in example.get_aligned_spans_x2y(getter(pred_doc, attr)):
                key = (span.label_, span.start, span.end - 1)
                pred_spans.add(key)
                pred_by_label[span.label_].add(key)

            # Only labels that occur in this example are updated.
            for label, label_score in per_label.items():
                if label in pred_by_label:
                    label_score.score_set(pred_by_label[label], gold_by_label[label])
            overall.score_set(pred_spans, gold_spans)

        return {
            attr + "_p": overall.precision,
            attr + "_r": overall.recall,
            attr + "_f": overall.fscore,
            attr + "_per_type": {
                label: label_score.to_dict()
                for label, label_score in per_label.items()
            },
        }

    @staticmethod
    def score_cats(
        examples,
        attr,
        getter=getattr,
        labels=[],
        multi_label=True,
        positive_label=None,
        **cfg
    ):
        """Returns PRF and ROC AUC scores for a doc-level attribute with a
        dict with scores for each label like Doc.cats. The reported overall
        score depends on the scorer settings.

        An example is only scored if its gold values are non-empty and the
        gold labels, the predicted labels and `labels` are the same set.

        examples (Iterable[Example]): Examples to score
        attr (str): The attribute to score.
        getter (callable): Defaults to getattr. If provided,
            getter(doc, attr) should return the values for the individual doc.
        labels (Iterable[str]): The set of possible labels. Defaults to [].
        multi_label (bool): Whether the attribute allows multiple labels.
            Defaults to True.
        positive_label (str): The positive label for a binary task with
            exclusive classes. Defaults to None.
        RETURNS (dict): A dictionary containing the scores, with inapplicable
                scores as None:
            for all:
                attr_score (one of attr_f / attr_macro_f / attr_macro_auc),
                attr_score_desc (text description of the overall score),
                attr_f_per_type,
                attr_auc_per_type
            for binary exclusive with positive label: attr_p/r/f
            for 3+ exclusive classes, macro-averaged fscore: attr_macro_f
            for multilabel, macro-averaged AUC: attr_macro_auc
        RAISES (ValueError): If `labels` is non-empty and an example fails the
            label-set check described above (error E162).
        """
        # NOTE: the mutable default for `labels` is only read, never mutated.
        score = PRFScore()
        f_per_type = dict()
        auc_per_type = dict()
        for label in labels:
            f_per_type[label] = PRFScore()
            auc_per_type[label] = ROCAUCScore()
        # f_per_type and auc_per_type always share the same keys, so a single
        # label set is enough for the consistency check and the error message.
        model_labels = set(f_per_type)
        for example in examples:
            gold_doc = example.reference
            pred_doc = example.predicted
            gold_values = getter(gold_doc, attr)
            pred_values = getter(pred_doc, attr)
            eval_labels = set(gold_values)
            if (
                len(gold_values) > 0
                and model_labels == eval_labels
                and eval_labels == set(pred_values)
            ):
                # The highest-scoring label on each side is the exclusive
                # prediction / gold answer.
                gold_val = max(gold_values, key=gold_values.get)
                pred_val = max(pred_values, key=pred_values.get)
                if positive_label:
                    score.score_set(
                        {positive_label} & {pred_val},
                        {positive_label} & {gold_val},
                    )
                for label in eval_labels:
                    auc_per_type[label].score_set(
                        pred_values[label], gold_values[label]
                    )
                    f_per_type[label].score_set(
                        {label} & {pred_val}, {label} & {gold_val}
                    )
            elif len(model_labels) > 0:
                raise ValueError(
                    Errors.E162.format(
                        model_labels=model_labels, eval_labels=eval_labels
                    )
                )
        results = {
            attr + "_score": None,
            attr + "_score_desc": None,
            attr + "_p": None,
            attr + "_r": None,
            attr + "_f": None,
            attr + "_macro_f": None,
            attr + "_macro_auc": None,
            attr + "_f_per_type": {k: v.to_dict() for k, v in f_per_type.items()},
            attr + "_auc_per_type": {k: v.score for k, v in auc_per_type.items()},
        }
        if len(labels) == 2 and not multi_label and positive_label:
            results[attr + "_p"] = score.precision
            results[attr + "_r"] = score.recall
            results[attr + "_f"] = score.fscore
            results[attr + "_score"] = results[attr + "_f"]
            results[attr + "_score_desc"] = "F (" + positive_label + ")"
        elif not multi_label:
            # The 1e-100 keeps the average defined when there are no labels.
            results[attr + "_macro_f"] = sum(
                [label_score.fscore for label_score in f_per_type.values()]
            ) / (len(f_per_type) + 1e-100)
            results[attr + "_score"] = results[attr + "_macro_f"]
            results[attr + "_score_desc"] = "macro F"
        else:
            # Per-label AUC can be -inf when it is undefined; the average is
            # clipped at -1 in that case.
            results[attr + "_macro_auc"] = max(
                sum([label_score.score for label_score in auc_per_type.values()])
                / (len(auc_per_type) + 1e-100),
                -1,
            )
            results[attr + "_score"] = results[attr + "_macro_auc"]
            results[attr + "_score_desc"] = "macro AUC"
        return results

    @staticmethod
    def score_deps(
        examples,
        attr,
        getter=getattr,
        head_attr="head",
        head_getter=getattr,
        ignore_labels=tuple(),
        **cfg
    ):
        """Returns the UAS, LAS, and LAS per type scores for dependency
        parses.

        Arcs are compared as (token index, head index, label) triples in gold
        token indices. A predicted arc whose token or head does not have an
        x2y alignment length of exactly 1 counts as a false positive for both
        UAS and LAS.

        examples (Iterable[Example]): Examples to score
        attr (str): The attribute containing the dependency label.
        getter (callable): Defaults to getattr. If provided,
            getter(token, attr) should return the value of the attribute for an
            individual token.
        head_attr (str): The attribute containing the head token. Defaults to
            'head'.
        head_getter (callable): Defaults to getattr. If provided,
            head_getter(token, attr) should return the value of the head for an
            individual token.
        ignore_labels (Tuple): Labels to ignore while scoring (e.g., punct).
        RETURNS (dict): A dictionary containing the scores:
            attr_uas, attr_las, and attr_las_per_type.
        """
        uas = PRFScore()
        las = PRFScore()
        las_by_dep = dict()
        for example in examples:
            gold_doc = example.reference
            pred_doc = example.predicted
            align = example.alignment

            def to_gold_index(pred_index):
                # Map a predicted token index to its gold index, or None when
                # the x2y alignment length is not exactly 1.
                if align.x2y.lengths[pred_index] != 1:
                    return None
                return align.x2y[pred_index].dataXd[0, 0]

            gold_deps = set()
            gold_by_dep = {}
            for gold_i, token in enumerate(gold_doc):
                dep = getter(token, attr)
                head = head_getter(token, head_attr)
                if dep in ignore_labels:
                    continue
                arc = (gold_i, head.i, dep)
                gold_deps.add(arc)
                if dep not in las_by_dep:
                    las_by_dep[dep] = PRFScore()
                gold_by_dep.setdefault(dep, set()).add(arc)

            pred_deps = set()
            pred_by_dep = {}
            for token in pred_doc:
                if token.orth_.isspace():
                    continue
                gold_i = to_gold_index(token.i)
                dep = getter(token, attr)
                head = head_getter(token, head_attr)
                if dep in ignore_labels or not token.orth_.strip():
                    continue
                gold_head = to_gold_index(head.i)
                # None is indistinct, so we can't just add it to the set
                # Multiple (None, None) deps are possible
                if gold_i is None or gold_head is None:
                    uas.fp += 1
                    las.fp += 1
                    continue
                arc = (gold_i, gold_head, dep)
                pred_deps.add(arc)
                if dep not in las_by_dep:
                    las_by_dep[dep] = PRFScore()
                pred_by_dep.setdefault(dep, set()).add(arc)

            las.score_set(pred_deps, gold_deps)
            for dep, dep_score in las_by_dep.items():
                dep_score.score_set(
                    pred_by_dep.get(dep, set()), gold_by_dep.get(dep, set())
                )
            # Unlabelled arcs drop the label from every triple.
            uas.score_set(
                {arc[:2] for arc in pred_deps}, {arc[:2] for arc in gold_deps}
            )
        return {
            attr + "_uas": uas.fscore,
            attr + "_las": las.fscore,
            attr
            + "_las_per_type": {
                dep: dep_score.to_dict() for dep, dep_score in las_by_dep.items()
            },
        }


#############################################################################
#
# The following implementation of roc_auc_score() is adapted from
# scikit-learn, which is distributed under the following license:
#
# New BSD License
#
# Copyright (c) 2007–2019 The scikit-learn developers.
# All rights reserved.
#
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are met:
#
#   a. Redistributions of source code must retain the above copyright notice,
#      this list of conditions and the following disclaimer.
#   b. Redistributions in binary form must reproduce the above copyright
#      notice, this list of conditions and the following disclaimer in the
#      documentation and/or other materials provided with the distribution.
#   c. Neither the name of the Scikit-learn Developers  nor the names of
#      its contributors may be used to endorse or promote products
#      derived from this software without specific prior written
#      permission.
#
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
# ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE FOR
# ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
# LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
# OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH
# DAMAGE.


def _roc_auc_score(y_true, y_score):
    """Compute the Area Under the Receiver Operating Characteristic Curve
    (ROC AUC) from prediction scores.

    Note: this implementation is restricted to the binary classification task

    Parameters
    ----------
    y_true : array-like
        True binary labels. Must contain exactly two distinct values.

    y_score : array-like
        Target scores, e.g. probability estimates of the positive class,
        confidence values, or non-thresholded decision values.

    Returns
    -------
    auc : float

    Raises
    ------
    ValueError
        If ``y_true`` does not contain exactly two distinct values.

    References
    ----------
    .. [1] `Wikipedia entry for the Receiver operating characteristic
            <https://en.wikipedia.org/wiki/Receiver_operating_characteristic>`_

    .. [2] Fawcett T. An introduction to ROC analysis[J]. Pattern Recognition
           Letters, 2006, 27(8):861-874.

    .. [3] `Analyzing a portion of the ROC curve. McClish, 1989
            <https://www.ncbi.nlm.nih.gov/pubmed/2668680>`_
    """
    n_classes = np.unique(y_true).size
    if n_classes != 2:
        raise ValueError(Errors.E165)
    false_pos_rate, true_pos_rate, _thresholds = _roc_curve(y_true, y_score)
    return _auc(false_pos_rate, true_pos_rate)


def _roc_curve(y_true, y_score):
    """Compute Receiver operating characteristic (ROC)

    Note: this implementation is restricted to the binary classification task.

    Parameters
    ----------
    y_true : array, shape = [n_samples]
        True binary labels.

    y_score : array, shape = [n_samples]
        Target scores, can either be probability estimates of the positive
        class, confidence values, or non-thresholded measure of decisions.

    Returns
    -------
    fpr : array
        False positive counts divided by their final value, with a leading
        0. All NaN if the final count is not positive.

    tpr : array
        True positive counts divided by their final value, with a leading
        0. All NaN if the final count is not positive.

    thresholds : array
        The thresholds returned by ``_binary_clf_curve``, preceded by one
        extra entry equal to the first threshold plus 1.

    References
    ----------
    .. [1] `Wikipedia entry for the Receiver operating characteristic
            <https://en.wikipedia.org/wiki/Receiver_operating_characteristic>`_

    .. [2] Fawcett T. An introduction to ROC analysis[J]. Pattern Recognition
           Letters, 2006, 27(8):861-874.
    """

    def _as_rate(counts):
        # Normalise cumulative counts by their total; NaN if there is none.
        total = counts[-1]
        if total <= 0:
            return np.repeat(np.nan, counts.shape)
        return counts / total

    fps, tps, thresholds = _binary_clf_curve(y_true, y_score)

    # Add an extra threshold position
    # to make sure that the curve starts at (0, 0)
    tps = np.r_[0, tps]
    fps = np.r_[0, fps]
    thresholds = np.r_[thresholds[0] + 1, thresholds]

    fpr = _as_rate(fps)
    tpr = _as_rate(tps)
    return fpr, tpr, thresholds


def _binary_clf_curve(y_true, y_score):
    """Calculate true and false positives per binary classification threshold.

    Labels equal to 1.0 are treated as positive; everything else is negative.

    Parameters
    ----------
    y_true : array, shape = [n_samples]
        True targets of binary classification

    y_score : array, shape = [n_samples]
        Estimated probabilities or decision function

    Returns
    -------
    fps : array, shape = [n_thresholds]
        A count of false positives, at index i being the number of negative
        samples assigned a score >= thresholds[i]. The total number of
        negative samples is equal to fps[-1] (thus true negatives are given by
        fps[-1] - fps).

    tps : array, shape = [n_thresholds <= len(np.unique(y_score))]
        An increasing count of true positives, at index i being the number
        of positive samples assigned a score >= thresholds[i]. The total
        number of positive samples is equal to tps[-1] (thus false negatives
        are given by tps[-1] - tps).

    thresholds : array, shape = [n_thresholds]
        Decreasing score values.
    """
    positive = 1.0

    labels = np.ravel(y_true)
    scores = np.ravel(y_score)

    # Boolean mask of the positive samples
    is_positive = labels == positive

    # Order both arrays by decreasing score
    descending = np.argsort(scores, kind="mergesort")[::-1]
    scores = scores[descending]
    is_positive = is_positive[descending]

    # Scores are often tied: keep only the last index of each run of equal
    # scores, plus the final index for the end of the curve.
    run_ends = np.where(np.diff(scores))[0]
    cut_idxs = np.r_[run_ends, is_positive.size - 1]

    # Cumulative positives up to each cut; the remaining samples up to that
    # cut are the false positives.
    tps = _stable_cumsum(is_positive * 1.0)[cut_idxs]
    fps = 1 + cut_idxs - tps
    return fps, tps, scores[cut_idxs]


def _stable_cumsum(arr, axis=None, rtol=1e-05, atol=1e-08):
    """Use high precision for cumsum and check that final value matches sum

    Parameters
    ----------
    arr : array-like
        To be cumulatively summed as flat
    axis : int, optional
        Axis along which the cumulative sum is computed.
        The default (None) is to compute the cumsum over the flattened array.
    rtol : float
        Relative tolerance, see ``np.allclose``
    atol : float
        Absolute tolerance, see ``np.allclose``
    """
    out = np.cumsum(arr, axis=axis, dtype=np.float64)
    expected = np.sum(arr, axis=axis, dtype=np.float64)
    if not np.all(
        np.isclose(
            out.take(-1, axis=axis), expected, rtol=rtol, atol=atol, equal_nan=True
        )
    ):
        raise ValueError(Errors.E163)
    return out


def _auc(x, y):
    """Compute Area Under the Curve (AUC) using the trapezoidal rule

    This is a general function, given points on a curve.  For computing the
    area under the ROC-curve, see :func:`_roc_auc_score`.

    Parameters
    ----------
    x : array, shape = [n]
        x coordinates. These must be either monotonic increasing or monotonic
        decreasing.
    y : array, shape = [n]
        y coordinates.

    Returns
    -------
    auc : float

    Raises
    ------
    ValueError
        If ``x`` is neither monotonic increasing nor monotonic decreasing.
    """
    x = np.ravel(x)
    y = np.ravel(y)

    # A decreasing x integrates to a negative area, so the sign is flipped
    # back via `direction`.
    direction = 1
    dx = np.diff(x)
    if np.any(dx < 0):
        if np.all(dx <= 0):
            direction = -1
        else:
            raise ValueError(Errors.E164.format(x))

    # ``np.trapz`` is deprecated since NumPy 2.0 in favour of
    # ``np.trapezoid``; prefer the new name and fall back on older NumPy.
    trapezoid = getattr(np, "trapezoid", None)
    if trapezoid is None:
        trapezoid = np.trapz

    area = direction * trapezoid(y, x)
    if isinstance(area, np.memmap):
        # Reductions such as .sum used internally in the trapezoidal rule do
        # not return a scalar by default for numpy.memmap instances contrary
        # to regular numpy.ndarray instances.
        area = area.dtype.type(area)
    return area
-												Add textcat to train CLI (#4226)

* Add doc.cats to spacy.gold at the paragraph level

Support `doc.cats` as `"cats": [{"label": string, "value": number}]` in
the spacy JSON training format at the paragraph level.

* `spacy.gold.docs_to_json()` writes `docs.cats`

* `GoldCorpus` reads in cats in each `GoldParse`

* Update instances of gold_tuples to handle cats

Update iteration over gold_tuples / gold_parses to handle addition of
cats at the paragraph level.

* Add textcat to train CLI

* Add textcat options to train CLI
* Add textcat labels in `TextCategorizer.begin_training()`
* Add textcat evaluation to `Scorer`:
  * For binary exclusive classes with provided label: F1 for label
  * For 2+ exclusive classes: F1 macro average
  * For multilabel (not exclusive): ROC AUC macro average (currently
relying on sklearn)
* Provide user info on textcat evaluation settings, potential
incompatibilities
* Provide pipeline to Scorer in `Language.evaluate` for textcat config
* Customize train CLI output to include only metrics relevant to current
pipeline
* Add textcat evaluation to evaluate CLI

* Fix handling of unset arguments and config params

Fix handling of unset arguments and model config parameters in Scorer
initialization.

* Temporarily add sklearn requirement

* Remove sklearn version number

* Improve Scorer handling of models without textcats

* Fixing Scorer handling of models without textcats

* Update Scorer output for python 2.7

* Modify inf in Scorer for python 2.7

* Auto-format

Also make small adjustments to make auto-formatting with black easier and produce nicer results

* Move error message to Errors

* Update documentation

* Add cats to annotation JSON format [ci skip]

* Fix tpl flag and docs [ci skip]

* Switch to internal roc_auc_score

Switch to internal `roc_auc_score()` adapted from scikit-learn.

* Add AUCROCScore tests and improve errors/warnings

* Add tests for AUCROCScore and roc_auc_score
* Add missing error for only positive/negative values
* Remove unnecessary warnings and errors

* Make reduced roc_auc_score functions private

Because most of the checks and warnings have been stripped for the
internal functions and access is only intended through `ROCAUCScore`,
make the functions for roc_auc_score adapted from scikit-learn private.

* Check that data corresponds with multilabel flag

Check that the training instances correspond with the multilabel flag,
adding the multilabel flag if required.

* Add textcat score to early stopping check

* Add more checks to debug-data for textcat

* Add example training data for textcat

* Add more checks to textcat train CLI

* Check configuration when extending base model
* Fix typos

* Update textcat example data

* Provide licensing details and licenses for data
* Remove two labels with no positive instances from jigsaw-toxic-comment
data.


Co-authored-by: Ines Montani <ines@ines.io>
											
										
										
											2019-09-15 23:31:31 +03:00
+								import numpy as np
 								from .errors import Errors
-												Refactor the Scorer to improve flexibility (#5731)

* Refactor the Scorer to improve flexibility

Refactor the `Scorer` to improve flexibility for arbitrary pipeline
components.

* Individual pipeline components provide their own `evaluate` methods
that score a list of `Example`s and return a dictionary of scores
* `Scorer` is initialized either:
  * with a provided pipeline containing components to be scored
  * with a default pipeline containing the built-in statistical
    components (senter, tagger, morphologizer, parser, ner)
* `Scorer.score` evaluates a list of `Example`s and returns a dictionary
of scores referring to the scores provided by the components in the
pipeline

Significant differences:

* `tags_acc` is renamed to `tag_acc` to be consistent with `token_acc`
and the new `morph_acc`, `pos_acc`, and `lemma_acc`
* Scoring is no longer cumulative: `Scorer.score` scores a list of
examples rather than a single example and does not retain any state
about previously scored examples
* PRF values in the returned scores are no longer multiplied by 100

* Add kwargs to Morphologizer.evaluate

* Create generalized scoring methods in Scorer

* Generalized static scoring methods are added to `Scorer`
  * Methods require an attribute (either on Token or Doc) that is
used to key the returned scores

Naming differences:

* `uas`, `las`, and `las_per_type` in the scores dict are renamed to
`dep_uas`, `dep_las`, and `dep_las_per_type`

Scoring differences:

* `Doc.sents` is now scored as spans rather than on sentence-initial
token positions so that `Doc.sents` and `Doc.ents` can be scored with
the same method (this lowers scores since a single incorrect sentence
start results in two incorrect spans)

* Simplify / extend hasattr check for eval method

* Add hasattr check to tokenizer scoring
* Simplify to hasattr check for component scoring

* Reset Example alignment if docs are set

Reset the Example alignment if either doc is set in case the
tokenization has changed.

* Add PRF tokenization scoring for tokens as spans

Add PRF scores for tokens as character spans. The scores are:

* token_acc: # correct tokens / # gold tokens
* token_p/r/f: PRF for (token.idx, token.idx + len(token))

* Add docstring to Scorer.score_tokenization

* Rename component.evaluate() to component.score()

* Update Scorer API docs

* Update scoring for positive_label in textcat

* Fix TextCategorizer.score kwargs

* Update Language.evaluate docs

* Update score names in default config
											
										
										
											2020-07-25 13:53:02 +03:00
+								from .util import get_lang_class
 								from .morphology import Morphology
-												* Fix evaluation of NER in scorer.py

											
										
										
											2015-05-27 04:18:16 +03:00
-												* Print parse if verbose in scorer

											
										
										
											2015-04-05 23:29:30 +03:00
-												Remove object subclassing

											
										
										
											2020-07-12 15:03:23 +03:00
+								class PRFScore:
-												Use consistent formatting for docstrings

											
										
										
											2017-04-15 12:59:21 +03:00
+								    """
 								    A precision / recall / F score
 								    """
-												💫 Tidy up and auto-format .py files (#2983)

<!--- Provide a general summary of your changes in the title. -->

## Description
- [x] Use [`black`](https://github.com/ambv/black) to auto-format all `.py` files.
- [x] Update flake8 config to exclude very large files (lemmatization tables etc.)
- [x] Update code to be compatible with flake8 rules
- [x] Fix various small bugs, inconsistencies and messy stuff in the language data
- [x] Update docs to explain new code style (`black`, `flake8`, when to use `# fmt: off` and `# fmt: on` and what `# noqa` means)

Once #2932 is merged, which auto-formats and tidies up the CLI, we'll be able to run `flake8 spacy` and actually get meaningful results.

At the moment, the code style and linting isn't applied automatically, but I'm hoping that the new [GitHub Actions](https://github.com/features/actions) will let us auto-format pull requests and post comments with relevant linting information.

### Types of change
enhancement, code style

## Checklist
<!--- Before you submit the PR, go over this checklist and make sure you can
tick off all the boxes. [] -> [x] -->
- [x] I have submitted the spaCy Contributor Agreement.
- [x] I ran the tests, and all new and existing tests passed.
- [x] My changes don't require a change to the documentation, or if they do, I've added all required information.

											
										
										
											2018-11-30 19:03:03 +03:00
-												* Update spacy.scorer, to use P/R/F to support tokenization errors

											
										
										
											2015-05-24 21:07:18 +03:00
+								    def __init__(self):
 								        self.tp = 0
 								        self.fp = 0
 								        self.fn = 0
 								    def score_set(self, cand, gold):
 								        self.tp += len(cand.intersection(gold))
 								        self.fp += len(cand - gold)
 								        self.fn += len(gold - cand)
 								    @property
 								    def precision(self):
 								        return self.tp / (self.tp + self.fp + 1e-100)
 								    @property
 								    def recall(self):
 								        return self.tp / (self.tp + self.fn + 1e-100)
 								    @property
 								    def fscore(self):
 								        p = self.precision
 								        r = self.recall
 								        return 2 * ((p * r) / (p + r + 1e-100))
-												Refactor the Scorer to improve flexibility (#5731)

* Refactor the Scorer to improve flexibility

Refactor the `Scorer` to improve flexibility for arbitrary pipeline
components.

* Individual pipeline components provide their own `evaluate` methods
that score a list of `Example`s and return a dictionary of scores
* `Scorer` is initialized either:
  * with a provided pipeline containing components to be scored
  * with a default pipeline containing the built-in statistical
    components (senter, tagger, morphologizer, parser, ner)
* `Scorer.score` evaluates a list of `Example`s and returns a dictionary
of scores referring to the scores provided by the components in the
pipeline

Significant differences:

* `tags_acc` is renamed to `tag_acc` to be consistent with `token_acc`
and the new `morph_acc`, `pos_acc`, and `lemma_acc`
* Scoring is no longer cumulative: `Scorer.score` scores a list of
examples rather than a single example and does not retain any state
about previously scored examples
* PRF values in the returned scores are no longer multiplied by 100

* Add kwargs to Morphologizer.evaluate

* Create generalized scoring methods in Scorer

* Generalized static scoring methods are added to `Scorer`
  * Methods require an attribute (either on Token or Doc) that is
used to key the returned scores

Naming differences:

* `uas`, `las`, and `las_per_type` in the scores dict are renamed to
`dep_uas`, `dep_las`, and `dep_las_per_type`

Scoring differences:

* `Doc.sents` is now scored as spans rather than on sentence-initial
token positions so that `Doc.sents` and `Doc.ents` can be scored with
the same method (this lowers scores since a single incorrect sentence
start results in two incorrect spans)

* Simplify / extend hasattr check for eval method

* Add hasattr check to tokenizer scoring
* Simplify to hasattr check for component scoring

* Reset Example alignment if docs are set

Reset the Example alignment if either doc is set in case the
tokenization has changed.

* Add PRF tokenization scoring for tokens as spans

Add PRF scores for tokens as character spans. The scores are:

* token_acc: # correct tokens / # gold tokens
* token_p/r/f: PRF for (token.idx, token.idx + len(token))

* Add docstring to Scorer.score_tokenization

* Rename component.evaluate() to component.score()

* Update Scorer API docs

* Update scoring for positive_label in textcat

* Fix TextCategorizer.score kwargs

* Update Language.evaluate docs

* Update score names in default config
											
										
										
											2020-07-25 13:53:02 +03:00
+								    def to_dict(self):
 								        return {"p": self.precision, "r": self.recall, "f": self.fscore}
-												* Update spacy.scorer, to use P/R/F to support tokenization errors

											
										
										
											2015-05-24 21:07:18 +03:00
-												Remove object subclassing

											
										
										
											2020-07-12 15:03:23 +03:00
+								class ROCAUCScore:
-												Add textcat to train CLI (#4226)

* Add doc.cats to spacy.gold at the paragraph level

Support `doc.cats` as `"cats": [{"label": string, "value": number}]` in
the spacy JSON training format at the paragraph level.

* `spacy.gold.docs_to_json()` writes `docs.cats`

* `GoldCorpus` reads in cats in each `GoldParse`

* Update instances of gold_tuples to handle cats

Update iteration over gold_tuples / gold_parses to handle addition of
cats at the paragraph level.

* Add textcat to train CLI

* Add textcat options to train CLI
* Add textcat labels in `TextCategorizer.begin_training()`
* Add textcat evaluation to `Scorer`:
  * For binary exclusive classes with provided label: F1 for label
  * For 2+ exclusive classes: F1 macro average
  * For multilabel (not exclusive): ROC AUC macro average (currently
relying on sklearn)
* Provide user info on textcat evaluation settings, potential
incompatibilities
* Provide pipeline to Scorer in `Language.evaluate` for textcat config
* Customize train CLI output to include only metrics relevant to current
pipeline
* Add textcat evaluation to evaluate CLI

* Fix handling of unset arguments and config params

Fix handling of unset arguments and model config parameters in Scorer
initialization.

* Temporarily add sklearn requirement

* Remove sklearn version number

* Improve Scorer handling of models without textcats

* Fixing Scorer handling of models without textcats

* Update Scorer output for python 2.7

* Modify inf in Scorer for python 2.7

* Auto-format

Also make small adjustments to make auto-formatting with black easier and produce nicer results

* Move error message to Errors

* Update documentation

* Add cats to annotation JSON format [ci skip]

* Fix tpl flag and docs [ci skip]

* Switch to internal roc_auc_score

Switch to internal `roc_auc_score()` adapted from scikit-learn.

* Add AUCROCScore tests and improve errors/warnings

* Add tests for AUCROCScore and roc_auc_score
* Add missing error for only positive/negative values
* Remove unnecessary warnings and errors

* Make reduced roc_auc_score functions private

Because most of the checks and warnings have been stripped for the
internal functions and access is only intended through `ROCAUCScore`,
make the functions for roc_auc_score adapted from scikit-learn private.

* Check that data corresponds with multilabel flag

Check that the training instances correspond with the multilabel flag,
adding the multilabel flag if required.

* Add textcat score to early stopping check

* Add more checks to debug-data for textcat

* Add example training data for textcat

* Add more checks to textcat train CLI

* Check configuration when extending base model
* Fix typos

* Update textcat example data

* Provide licensing details and licenses for data
* Remove two labels with no positive instances from jigsaw-toxic-comment
data.


Co-authored-by: Ines Montani <ines@ines.io>
											
										
										
											2019-09-15 23:31:31 +03:00
+								    """
 								    An AUC ROC score.
 								    """
 								    def __init__(self):
 								        self.golds = []
 								        self.cands = []
 								        self.saved_score = 0.0
 								        self.saved_score_at_len = 0
 								    def score_set(self, cand, gold):
 								        self.cands.append(cand)
 								        self.golds.append(gold)
 								    @property
 								    def score(self):
 								        if len(self.golds) == self.saved_score_at_len:
 								            return self.saved_score
 								        try:
 								            self.saved_score = _roc_auc_score(self.golds, self.cands)
 								        # catch ValueError: Only one class present in y_true.
 								        # ROC AUC score is not defined in that case.
-												Make except more explicit

											
										
										
											2019-09-18 20:57:08 +03:00
+								        except ValueError:
-												Add textcat to train CLI (#4226)

* Add doc.cats to spacy.gold at the paragraph level

Support `doc.cats` as `"cats": [{"label": string, "value": number}]` in
the spacy JSON training format at the paragraph level.

* `spacy.gold.docs_to_json()` writes `docs.cats`

* `GoldCorpus` reads in cats in each `GoldParse`

* Update instances of gold_tuples to handle cats

Update iteration over gold_tuples / gold_parses to handle addition of
cats at the paragraph level.

* Add textcat to train CLI

* Add textcat options to train CLI
* Add textcat labels in `TextCategorizer.begin_training()`
* Add textcat evaluation to `Scorer`:
  * For binary exclusive classes with provided label: F1 for label
  * For 2+ exclusive classes: F1 macro average
  * For multilabel (not exclusive): ROC AUC macro average (currently
relying on sklearn)
* Provide user info on textcat evaluation settings, potential
incompatibilities
* Provide pipeline to Scorer in `Language.evaluate` for textcat config
* Customize train CLI output to include only metrics relevant to current
pipeline
* Add textcat evaluation to evaluate CLI

* Fix handling of unset arguments and config params

Fix handling of unset arguments and model config parameters in Scorer
initialization.

* Temporarily add sklearn requirement

* Remove sklearn version number

* Improve Scorer handling of models without textcats

* Fixing Scorer handling of models without textcats

* Update Scorer output for python 2.7

* Modify inf in Scorer for python 2.7

* Auto-format

Also make small adjustments to make auto-formatting with black easier and produce nicer results

* Move error message to Errors

* Update documentation

* Add cats to annotation JSON format [ci skip]

* Fix tpl flag and docs [ci skip]

* Switch to internal roc_auc_score

Switch to internal `roc_auc_score()` adapted from scikit-learn.

* Add AUCROCScore tests and improve errors/warnings

* Add tests for AUCROCScore and roc_auc_score
* Add missing error for only positive/negative values
* Remove unnecessary warnings and errors

* Make reduced roc_auc_score functions private

Because most of the checks and warnings have been stripped for the
internal functions and access is only intended through `ROCAUCScore`,
make the functions for roc_auc_score adapted from scikit-learn private.

* Check that data corresponds with multilabel flag

Check that the training instances correspond with the multilabel flag,
adding the multilabel flag if required.

* Add textcat score to early stopping check

* Add more checks to debug-data for textcat

* Add example training data for textcat

* Add more checks to textcat train CLI

* Check configuration when extending base model
* Fix typos

* Update textcat example data

* Provide licensing details and licenses for data
* Remove two labels with no positive instances from jigsaw-toxic-comment
data.


Co-authored-by: Ines Montani <ines@ines.io>
											
										
										
											2019-09-15 23:31:31 +03:00
+								            self.saved_score = -float("inf")
 								        self.saved_score_at_len = len(self.golds)
 								        return self.saved_score
-												Remove object subclassing

											
										
										
											2020-07-12 15:03:23 +03:00
+								class Scorer:
-												Update Scorer and add API docs

											
										
										
											2019-05-24 15:06:04 +03:00
+								    """Compute evaluation scores."""
-												Refactor the Scorer to improve flexibility (#5731)

* Refactor the Scorer to improve flexibility

Refactor the `Scorer` to improve flexibility for arbitrary pipeline
components.

* Individual pipeline components provide their own `evaluate` methods
that score a list of `Example`s and return a dictionary of scores
* `Scorer` is initialized either:
  * with a provided pipeline containing components to be scored
  * with a default pipeline containing the built-in statistical
    components (senter, tagger, morphologizer, parser, ner)
* `Scorer.score` evaluates a list of `Example`s and returns a dictionary
of scores referring to the scores provided by the components in the
pipeline

Significant differences:

* `tags_acc` is renamed to `tag_acc` to be consistent with `token_acc`
and the new `morph_acc`, `pos_acc`, and `lemma_acc`
* Scoring is no longer cumulative: `Scorer.score` scores a list of
examples rather than a single example and does not retain any state
about previously scored examples
* PRF values in the returned scores are no longer multiplied by 100

* Add kwargs to Morphologizer.evaluate

* Create generalized scoring methods in Scorer

* Generalized static scoring methods are added to `Scorer`
  * Methods require an attribute (either on Token or Doc) that is
used to key the returned scores

Naming differences:

* `uas`, `las`, and `las_per_type` in the scores dict are renamed to
`dep_uas`, `dep_las`, and `dep_las_per_type`

Scoring differences:

* `Doc.sents` is now scored as spans rather than on sentence-initial
token positions so that `Doc.sents` and `Doc.ents` can be scored with
the same method (this lowers scores since a single incorrect sentence
start results in two incorrect spans)

* Simplify / extend hasattr check for eval method

* Add hasattr check to tokenizer scoring
* Simplify to hasattr check for component scoring

* Reset Example alignment if docs are set

Reset the Example alignment if either doc is set in case the
tokenization has changed.

* Add PRF tokenization scoring for tokens as spans

Add PRF scores for tokens as character spans. The scores are:

* token_acc: # correct tokens / # gold tokens
* token_p/r/f: PRF for (token.idx, token.idx + len(token))

* Add docstring to Scorer.score_tokenization

* Rename component.evaluate() to component.score()

* Update Scorer API docs

* Update scoring for positive_label in textcat

* Fix TextCategorizer.score kwargs

* Update Language.evaluate docs

* Update score names in default config
											
										
										
											2020-07-25 13:53:02 +03:00
+								    def __init__(self, nlp=None, **cfg):
-												Update Scorer and add API docs

											
										
										
											2019-05-24 15:06:04 +03:00
+								        """Initialize the Scorer.
 								        RETURNS (Scorer): The newly created object.
 								        DOCS: https://spacy.io/api/scorer#init
 								        """
-												Refactor the Scorer to improve flexibility (#5731)

* Refactor the Scorer to improve flexibility

Refactor the `Scorer` to improve flexibility for arbitrary pipeline
components.

* Individual pipeline components provide their own `evaluate` methods
that score a list of `Example`s and return a dictionary of scores
* `Scorer` is initialized either:
  * with a provided pipeline containing components to be scored
  * with a default pipeline containing the built-in statistical
    components (senter, tagger, morphologizer, parser, ner)
* `Scorer.score` evaluates a list of `Example`s and returns a dictionary
of scores referring to the scores provided by the components in the
pipeline

Significant differences:

* `tags_acc` is renamed to `tag_acc` to be consistent with `token_acc`
and the new `morph_acc`, `pos_acc`, and `lemma_acc`
* Scoring is no longer cumulative: `Scorer.score` scores a list of
examples rather than a single example and does not retain any state
about previously scored examples
* PRF values in the returned scores are no longer multiplied by 100

* Add kwargs to Morphologizer.evaluate

* Create generalized scoring methods in Scorer

* Generalized static scoring methods are added to `Scorer`
  * Methods require an attribute (either on Token or Doc) that is
used to key the returned scores

Naming differences:

* `uas`, `las`, and `las_per_type` in the scores dict are renamed to
`dep_uas`, `dep_las`, and `dep_las_per_type`

Scoring differences:

* `Doc.sents` is now scored as spans rather than on sentence-initial
token positions so that `Doc.sents` and `Doc.ents` can be scored with
the same method (this lowers scores since a single incorrect sentence
start results in two incorrect spans)

* Simplify / extend hasattr check for eval method

* Add hasattr check to tokenizer scoring
* Simplify to hasattr check for component scoring

* Reset Example alignment if docs are set

Reset the Example alignment if either doc is set in case the
tokenization has changed.

* Add PRF tokenization scoring for tokens as spans

Add PRF scores for tokens as character spans. The scores are:

* token_acc: # correct tokens / # gold tokens
* token_p/r/f: PRF for (token.idx, token.idx + len(token))

* Add docstring to Scorer.score_tokenization

* Rename component.evaluate() to component.score()

* Update Scorer API docs

* Update scoring for positive_label in textcat

* Fix TextCategorizer.score kwargs

* Update Language.evaluate docs

* Update score names in default config
											
										
										
											2020-07-25 13:53:02 +03:00
+								        self.nlp = nlp
 								        self.cfg = cfg
 								        if not nlp:
 								            # create a default pipeline
 								            nlp = get_lang_class("xx")()
 								            nlp.add_pipe("senter")
 								            nlp.add_pipe("tagger")
 								            nlp.add_pipe("morphologizer")
 								            nlp.add_pipe("parser")
 								            nlp.add_pipe("ner")
 								            nlp.add_pipe("textcat")
 								            self.nlp = nlp
 								    def score(self, examples):
 								        """Evaluate a list of Examples.
 								        examples (Iterable[Example]): The predicted annotations + correct annotations.
 								        RETURNS (Dict): A dictionary of scores.
 								        DOCS: https://spacy.io/api/scorer#score
-												Add a tagger-based SentenceRecognizer (#4713)

* Add sent_starts to GoldParse

* Add SentTagger pipeline component

Add `SentTagger` pipeline component as a subclass of `Tagger`.

* Model reduces default parameters from `Tagger` to be small and fast
* Hard-coded set of two labels:
  * S (1): token at beginning of sentence
  * I (0): all other sentence positions
* Sets `token.sent_start` values

* Add sentence segmentation to Scorer

Report `sent_p/r/f` for sentence boundaries, which may be provided by
various pipeline components.

* Add sentence segmentation to CLI evaluate

* Add senttagger metrics/scoring to train CLI

* Rename SentTagger to SentenceRecognizer

* Add SentenceRecognizer to spacy.pipes imports

* Add SentenceRecognizer serialization test

* Shorten component name to sentrec

* Remove duplicates from train CLI output metrics

											
										
										
											2019-11-28 13:10:07 +03:00
+								        """
-												Refactor the Scorer to improve flexibility (#5731)

* Refactor the Scorer to improve flexibility

Refactor the `Scorer` to improve flexibility for arbitrary pipeline
components.

* Individual pipeline components provide their own `evaluate` methods
that score a list of `Example`s and return a dictionary of scores
* `Scorer` is initialized either:
  * with a provided pipeline containing components to be scored
  * with a default pipeline containing the built-in statistical
    components (senter, tagger, morphologizer, parser, ner)
* `Scorer.score` evaluates a list of `Example`s and returns a dictionary
of scores referring to the scores provided by the components in the
pipeline

Significant differences:

* `tags_acc` is renamed to `tag_acc` to be consistent with `token_acc`
and the new `morph_acc`, `pos_acc`, and `lemma_acc`
* Scoring is no longer cumulative: `Scorer.score` scores a list of
examples rather than a single example and does not retain any state
about previously scored examples
* PRF values in the returned scores are no longer multiplied by 100

* Add kwargs to Morphologizer.evaluate

* Create generalized scoring methods in Scorer

* Generalized static scoring methods are added to `Scorer`
  * Methods require an attribute (either on Token or Doc) that is
used to key the returned scores

Naming differences:

* `uas`, `las`, and `las_per_type` in the scores dict are renamed to
`dep_uas`, `dep_las`, and `dep_las_per_type`

Scoring differences:

* `Doc.sents` is now scored as spans rather than on sentence-initial
token positions so that `Doc.sents` and `Doc.ents` can be scored with
the same method (this lowers scores since a single incorrect sentence
start results in two incorrect spans)

* Simplify / extend hasattr check for eval method

* Add hasattr check to tokenizer scoring
* Simplify to hasattr check for component scoring

* Reset Example alignment if docs are set

Reset the Example alignment if either doc is set in case the
tokenization has changed.

* Add PRF tokenization scoring for tokens as spans

Add PRF scores for tokens as character spans. The scores are:

* token_acc: # correct tokens / # gold tokens
* token_p/r/f: PRF for (token.idx, token.idx + len(token))

* Add docstring to Scorer.score_tokenization

* Rename component.evaluate() to component.score()

* Update Scorer API docs

* Update scoring for positive_label in textcat

* Fix TextCategorizer.score kwargs

* Update Language.evaluate docs

* Update score names in default config
											
										
										
											2020-07-25 13:53:02 +03:00
+								        scores = {}
-												Add a tagger-based SentenceRecognizer (#4713)

* Add sent_starts to GoldParse

* Add SentTagger pipeline component

Add `SentTagger` pipeline component as a subclass of `Tagger`.

* Model reduces default parameters from `Tagger` to be small and fast
* Hard-coded set of two labels:
  * S (1): token at beginning of sentence
  * I (0): all other sentence positions
* Sets `token.sent_start` values

* Add sentence segmentation to Scorer

Report `sent_p/r/f` for sentence boundaries, which may be provided by
various pipeline components.

* Add sentence segmentation to CLI evaluate

* Add senttagger metrics/scoring to train CLI

* Rename SentTagger to SentenceRecognizer

* Add SentenceRecognizer to spacy.pipes imports

* Add SentenceRecognizer serialization test

* Shorten component name to sentrec

* Remove duplicates from train CLI output metrics

											
										
										
											2019-11-28 13:10:07 +03:00
-												Refactor the Scorer to improve flexibility (#5731)

* Refactor the Scorer to improve flexibility

Refactor the `Scorer` to improve flexibility for arbitrary pipeline
components.

* Individual pipeline components provide their own `evaluate` methods
that score a list of `Example`s and return a dictionary of scores
* `Scorer` is initialized either:
  * with a provided pipeline containing components to be scored
  * with a default pipeline containing the built-in statistical
    components (senter, tagger, morphologizer, parser, ner)
* `Scorer.score` evaluates a list of `Example`s and returns a dictionary
of scores referring to the scores provided by the components in the
pipeline

Significant differences:

* `tags_acc` is renamed to `tag_acc` to be consistent with `token_acc`
and the new `morph_acc`, `pos_acc`, and `lemma_acc`
* Scoring is no longer cumulative: `Scorer.score` scores a list of
examples rather than a single example and does not retain any state
about previously scored examples
* PRF values in the returned scores are no longer multiplied by 100

* Add kwargs to Morphologizer.evaluate

* Create generalized scoring methods in Scorer

* Generalized static scoring methods are added to `Scorer`
  * Methods require an attribute (either on Token or Doc) that is
used to key the returned scores

Naming differences:

* `uas`, `las`, and `las_per_type` in the scores dict are renamed to
`dep_uas`, `dep_las`, and `dep_las_per_type`

Scoring differences:

* `Doc.sents` is now scored as spans rather than on sentence-initial
token positions so that `Doc.sents` and `Doc.ents` can be scored with
the same method (this lowers scores since a single incorrect sentence
start results in two incorrect spans)

* Simplify / extend hasattr check for eval method

* Add hasattr check to tokenizer scoring
* Simplify to hasattr check for component scoring

* Reset Example alignment if docs are set

Reset the Example alignment if either doc is set in case the
tokenization has changed.

* Add PRF tokenization scoring for tokens as spans

Add PRF scores for tokens as character spans. The scores are:

* token_acc: # correct tokens / # gold tokens
* token_p/r/f: PRF for (token.idx, token.idx + len(token))

* Add docstring to Scorer.score_tokenization

* Rename component.evaluate() to component.score()

* Update Scorer API docs

* Update scoring for positive_label in textcat

* Fix TextCategorizer.score kwargs

* Update Language.evaluate docs

* Update score names in default config
											
										
										
											2020-07-25 13:53:02 +03:00
+								        if hasattr(self.nlp.tokenizer, "score"):
 								            scores.update(self.nlp.tokenizer.score(examples, **self.cfg))
 								        for name, component in self.nlp.pipeline:
 								            if hasattr(component, "score"):
 								                scores.update(component.score(examples, **self.cfg))
-												Add a tagger-based SentenceRecognizer (#4713)

* Add sent_starts to GoldParse

* Add SentTagger pipeline component

Add `SentTagger` pipeline component as a subclass of `Tagger`.

* Model reduces default parameters from `Tagger` to be small and fast
* Hard-coded set of two labels:
  * S (1): token at beginning of sentence
  * I (0): all other sentence positions
* Sets `token.sent_start` values

* Add sentence segmentation to Scorer

Report `sent_p/r/f` for sentence boundaries, which may be provided by
various pipeline components.

* Add sentence segmentation to CLI evaluate

* Add senttagger metrics/scoring to train CLI

* Rename SentTagger to SentenceRecognizer

* Add SentenceRecognizer to spacy.pipes imports

* Add SentenceRecognizer serialization test

* Shorten component name to sentrec

* Remove duplicates from train CLI output metrics

											
										
										
											2019-11-28 13:10:07 +03:00
-												Refactor the Scorer to improve flexibility (#5731)

* Refactor the Scorer to improve flexibility

Refactor the `Scorer` to improve flexibility for arbitrary pipeline
components.

* Individual pipeline components provide their own `evaluate` methods
that score a list of `Example`s and return a dictionary of scores
* `Scorer` is initialized either:
  * with a provided pipeline containing components to be scored
  * with a default pipeline containing the built-in statistical
    components (senter, tagger, morphologizer, parser, ner)
* `Scorer.score` evaluates a list of `Example`s and returns a dictionary
of scores referring to the scores provided by the components in the
pipeline

Significant differences:

* `tags_acc` is renamed to `tag_acc` to be consistent with `token_acc`
and the new `morph_acc`, `pos_acc`, and `lemma_acc`
* Scoring is no longer cumulative: `Scorer.score` scores a list of
examples rather than a single example and does not retain any state
about previously scored examples
* PRF values in the returned scores are no longer multiplied by 100

* Add kwargs to Morphologizer.evaluate

* Create generalized scoring methods in Scorer

* Generalized static scoring methods are added to `Scorer`
  * Methods require an attribute (either on Token or Doc) that is
used to key the returned scores

Naming differences:

* `uas`, `las`, and `las_per_type` in the scores dict are renamed to
`dep_uas`, `dep_las`, and `dep_las_per_type`

Scoring differences:

* `Doc.sents` is now scored as spans rather than on sentence-initial
token positions so that `Doc.sents` and `Doc.ents` can be scored with
the same method (this lowers scores since a single incorrect sentence
start results in two incorrect spans)

* Simplify / extend hasattr check for eval method

* Add hasattr check to tokenizer scoring
* Simplify to hasattr check for component scoring

* Reset Example alignment if docs are set

Reset the Example alignment if either doc is set in case the
tokenization has changed.

* Add PRF tokenization scoring for tokens as spans

Add PRF scores for tokens as character spans. The scores are:

* token_acc: # correct tokens / # gold tokens
* token_p/r/f: PRF for (token.idx, token.idx + len(token))

* Add docstring to Scorer.score_tokenization

* Rename component.evaluate() to component.score()

* Update Scorer API docs

* Update scoring for positive_label in textcat

* Fix TextCategorizer.score kwargs

* Update Language.evaluate docs

* Update score names in default config
											
										
										
											2020-07-25 13:53:02 +03:00
+								        return scores
-												* Add scorer script

											
										
										
											2015-03-11 04:07:03 +03:00
-												Refactor the Scorer to improve flexibility (#5731)

* Refactor the Scorer to improve flexibility

Refactor the `Scorer` to improve flexibility for arbitrary pipeline
components.

* Individual pipeline components provide their own `evaluate` methods
that score a list of `Example`s and return a dictionary of scores
* `Scorer` is initialized either:
  * with a provided pipeline containing components to be scored
  * with a default pipeline containing the built-in statistical
    components (senter, tagger, morphologizer, parser, ner)
* `Scorer.score` evaluates a list of `Example`s and returns a dictionary
of scores referring to the scores provided by the components in the
pipeline

Significant differences:

* `tags_acc` is renamed to `tag_acc` to be consistent with `token_acc`
and the new `morph_acc`, `pos_acc`, and `lemma_acc`
* Scoring is no longer cumulative: `Scorer.score` scores a list of
examples rather than a single example and does not retain any state
about previously scored examples
* PRF values in the returned scores are no longer multiplied by 100

* Add kwargs to Morphologizer.evaluate

* Create generalized scoring methods in Scorer

* Generalized static scoring methods are added to `Scorer`
  * Methods require an attribute (either on Token or Doc) that is
used to key the returned scores

Naming differences:

* `uas`, `las`, and `las_per_type` in the scores dict are renamed to
`dep_uas`, `dep_las`, and `dep_las_per_type`

Scoring differences:

* `Doc.sents` is now scored as spans rather than on sentence-initial
token positions so that `Doc.sents` and `Doc.ents` can be scored with
the same method (this lowers scores since a single incorrect sentence
start results in two incorrect spans)

* Simplify / extend hasattr check for eval method

* Add hasattr check to tokenizer scoring
* Simplify to hasattr check for component scoring

* Reset Example alignment if docs are set

Reset the Example alignment if either doc is set in case the
tokenization has changed.

* Add PRF tokenization scoring for tokens as spans

Add PRF scores for tokens as character spans. The scores are:

* token_acc: # correct tokens / # gold tokens
* token_p/r/f: PRF for (token.idx, token.idx + len(token))

* Add docstring to Scorer.score_tokenization

* Rename component.evaluate() to component.score()

* Update Scorer API docs

* Update scoring for positive_label in textcat

* Fix TextCategorizer.score kwargs

* Update Language.evaluate docs

* Update score names in default config
											
										
										
											2020-07-25 13:53:02 +03:00
+								    @staticmethod
 								    def score_tokenization(examples, **cfg):
 								        """Returns accuracy and PRF scores for tokenization.
-												* Add scorer script

											
										
										
											2015-03-11 04:07:03 +03:00
-												Refactor the Scorer to improve flexibility (#5731)

* Refactor the Scorer to improve flexibility

Refactor the `Scorer` to improve flexibility for arbitrary pipeline
components.

* Individual pipeline components provide their own `evaluate` methods
that score a list of `Example`s and return a dictionary of scores
* `Scorer` is initialized either:
  * with a provided pipeline containing components to be scored
  * with a default pipeline containing the built-in statistical
    components (senter, tagger, morphologizer, parser, ner)
* `Scorer.score` evaluates a list of `Example`s and returns a dictionary
of scores referring to the scores provided by the components in the
pipeline

Significant differences:

* `tags_acc` is renamed to `tag_acc` to be consistent with `token_acc`
and the new `morph_acc`, `pos_acc`, and `lemma_acc`
* Scoring is no longer cumulative: `Scorer.score` scores a list of
examples rather than a single example and does not retain any state
about previously scored examples
* PRF values in the returned scores are no longer multiplied by 100

* Add kwargs to Morphologizer.evaluate

* Create generalized scoring methods in Scorer

* Generalized static scoring methods are added to `Scorer`
  * Methods require an attribute (either on Token or Doc) that is
used to key the returned scores

Naming differences:

* `uas`, `las`, and `las_per_type` in the scores dict are renamed to
`dep_uas`, `dep_las`, and `dep_las_per_type`

Scoring differences:

* `Doc.sents` is now scored as spans rather than on sentence-initial
token positions so that `Doc.sents` and `Doc.ents` can be scored with
the same method (this lowers scores since a single incorrect sentence
start results in two incorrect spans)

* Simplify / extend hasattr check for eval method

* Add hasattr check to tokenizer scoring
* Simplify to hasattr check for component scoring

* Reset Example alignment if docs are set

Reset the Example alignment if either doc is set in case the
tokenization has changed.

* Add PRF tokenization scoring for tokens as spans

Add PRF scores for tokens as character spans. The scores are:

* token_acc: # correct tokens / # gold tokens
* token_p/r/f: PRF for (token.idx, token.idx + len(token))

* Add docstring to Scorer.score_tokenization

* Rename component.evaluate() to component.score()

* Update Scorer API docs

* Update scoring for positive_label in textcat

* Fix TextCategorizer.score kwargs

* Update Language.evaluate docs

* Update score names in default config
											
										
										
											2020-07-25 13:53:02 +03:00
+								        * token_acc: # correct tokens / # gold tokens
 								        * token_p/r/f: PRF for token character spans
-												* Add scorer script

											
										
										
											2015-03-11 04:07:03 +03:00
-												Refactor the Scorer to improve flexibility (#5731)

* Refactor the Scorer to improve flexibility

Refactor the `Scorer` to improve flexibility for arbitrary pipeline
components.

* Individual pipeline components provide their own `evaluate` methods
that score a list of `Example`s and return a dictionary of scores
* `Scorer` is initialized either:
  * with a provided pipeline containing components to be scored
  * with a default pipeline containing the built-in statistical
    components (senter, tagger, morphologizer, parser, ner)
* `Scorer.score` evaluates a list of `Example`s and returns a dictionary
of scores referring to the scores provided by the components in the
pipeline

Significant differences:

* `tags_acc` is renamed to `tag_acc` to be consistent with `token_acc`
and the new `morph_acc`, `pos_acc`, and `lemma_acc`
* Scoring is no longer cumulative: `Scorer.score` scores a list of
examples rather than a single example and does not retain any state
about previously scored examples
* PRF values in the returned scores are no longer multiplied by 100

* Add kwargs to Morphologizer.evaluate

* Create generalized scoring methods in Scorer

* Generalized static scoring methods are added to `Scorer`
  * Methods require an attribute (either on Token or Doc) that is
used to key the returned scores

Naming differences:

* `uas`, `las`, and `las_per_type` in the scores dict are renamed to
`dep_uas`, `dep_las`, and `dep_las_per_type`

Scoring differences:

* `Doc.sents` is now scored as spans rather than on sentence-initial
token positions so that `Doc.sents` and `Doc.ents` can be scored with
the same method (this lowers scores since a single incorrect sentence
start results in two incorrect spans)

* Simplify / extend hasattr check for eval method

* Add hasattr check to tokenizer scoring
* Simplify to hasattr check for component scoring

* Reset Example alignment if docs are set

Reset the Example alignment if either doc is set in case the
tokenization has changed.

* Add PRF tokenization scoring for tokens as spans

Add PRF scores for tokens as character spans. The scores are:

* token_acc: # correct tokens / # gold tokens
* token_p/r/f: PRF for (token.idx, token.idx + len(token))

* Add docstring to Scorer.score_tokenization

* Rename component.evaluate() to component.score()

* Update Scorer API docs

* Update scoring for positive_label in textcat

* Fix TextCategorizer.score kwargs

* Update Language.evaluate docs

* Update score names in default config
											
										
										
											2020-07-25 13:53:02 +03:00
+								        examples (Iterable[Example]): Examples to score
 								        RETURNS (dict): A dictionary containing the scores token_acc/p/r/f.
-												Add LAS per dependency to Scorer (#4560)


											
										
										
											2019-10-31 23:18:16 +03:00
+								        """
-												Refactor the Scorer to improve flexibility (#5731)

* Refactor the Scorer to improve flexibility

Refactor the `Scorer` to improve flexibility for arbitrary pipeline
components.

* Individual pipeline components provide their own `evaluate` methods
that score a list of `Example`s and return a dictionary of scores
* `Scorer` is initialized either:
  * with a provided pipeline containing components to be scored
  * with a default pipeline containing the built-in statistical
    components (senter, tagger, morphologizer, parser, ner)
* `Scorer.score` evaluates a list of `Example`s and returns a dictionary
of scores referring to the scores provided by the components in the
pipeline

Significant differences:

* `tags_acc` is renamed to `tag_acc` to be consistent with `token_acc`
and the new `morph_acc`, `pos_acc`, and `lemma_acc`
* Scoring is no longer cumulative: `Scorer.score` scores a list of
examples rather than a single example and does not retain any state
about previously scored examples
* PRF values in the returned scores are no longer multiplied by 100

* Add kwargs to Morphologizer.evaluate

* Create generalized scoring methods in Scorer

* Generalized static scoring methods are added to `Scorer`
  * Methods require an attribute (either on Token or Doc) that is
used to key the returned scores

Naming differences:

* `uas`, `las`, and `las_per_type` in the scores dict are renamed to
`dep_uas`, `dep_las`, and `dep_las_per_type`

Scoring differences:

* `Doc.sents` is now scored as spans rather than on sentence-initial
token positions so that `Doc.sents` and `Doc.ents` can be scored with
the same method (this lowers scores since a single incorrect sentence
start results in two incorrect spans)

* Simplify / extend hasattr check for eval method

* Add hasattr check to tokenizer scoring
* Simplify to hasattr check for component scoring

* Reset Example alignment if docs are set

Reset the Example alignment if either doc is set in case the
tokenization has changed.

* Add PRF tokenization scoring for tokens as spans

Add PRF scores for tokens as character spans. The scores are:

* token_acc: # correct tokens / # gold tokens
* token_p/r/f: PRF for (token.idx, token.idx + len(token))

* Add docstring to Scorer.score_tokenization

* Rename component.evaluate() to component.score()

* Update Scorer API docs

* Update scoring for positive_label in textcat

* Fix TextCategorizer.score kwargs

* Update Language.evaluate docs

* Update score names in default config
											
										
										
											2020-07-25 13:53:02 +03:00
+								        acc_score = PRFScore()
 								        prf_score = PRFScore()
 								        for example in examples:
 								            gold_doc = example.reference
 								            pred_doc = example.predicted
 								            align = example.alignment
 								            gold_spans = set()
 								            pred_spans = set()
 								            for token in gold_doc:
 								                if token.orth_.isspace():
 								                    continue
 								                gold_spans.add((token.idx, token.idx + len(token)))
 								            for token in pred_doc:
 								                if token.orth_.isspace():
 								                    continue
 								                pred_spans.add((token.idx, token.idx + len(token)))
 								                if align.x2y.lengths[token.i] != 1:
 								                    acc_score.fp += 1
 								                else:
 								                    acc_score.tp += 1
 								            prf_score.score_set(pred_spans, gold_spans)
-												Add LAS per dependency to Scorer (#4560)


											
										
										
											2019-10-31 23:18:16 +03:00
+								        return {
-												Refactor the Scorer to improve flexibility (#5731)

* Refactor the Scorer to improve flexibility

Refactor the `Scorer` to improve flexibility for arbitrary pipeline
components.

* Individual pipeline components provide their own `evaluate` methods
that score a list of `Example`s and return a dictionary of scores
* `Scorer` is initialized either:
  * with a provided pipeline containing components to be scored
  * with a default pipeline containing the built-in statistical
    components (senter, tagger, morphologizer, parser, ner)
* `Scorer.score` evaluates a list of `Example`s and returns a dictionary
of scores referring to the scores provided by the components in the
pipeline

Significant differences:

* `tags_acc` is renamed to `tag_acc` to be consistent with `token_acc`
and the new `morph_acc`, `pos_acc`, and `lemma_acc`
* Scoring is no longer cumulative: `Scorer.score` scores a list of
examples rather than a single example and does not retain any state
about previously scored examples
* PRF values in the returned scores are no longer multiplied by 100

* Add kwargs to Morphologizer.evaluate

* Create generalized scoring methods in Scorer

* Generalized static scoring methods are added to `Scorer`
  * Methods require an attribute (either on Token or Doc) that is
used to key the returned scores

Naming differences:

* `uas`, `las`, and `las_per_type` in the scores dict are renamed to
`dep_uas`, `dep_las`, and `dep_las_per_type`

Scoring differences:

* `Doc.sents` is now scored as spans rather than on sentence-initial
token positions so that `Doc.sents` and `Doc.ents` can be scored with
the same method (this lowers scores since a single incorrect sentence
start results in two incorrect spans)

* Simplify / extend hasattr check for eval method

* Add hasattr check to tokenizer scoring
* Simplify to hasattr check for component scoring

* Reset Example alignment if docs are set

Reset the Example alignment if either doc is set in case the
tokenization has changed.

* Add PRF tokenization scoring for tokens as spans

Add PRF scores for tokens as character spans. The scores are:

* token_acc: # correct tokens / # gold tokens
* token_p/r/f: PRF for (token.idx, token.idx + len(token))

* Add docstring to Scorer.score_tokenization

* Rename component.evaluate() to component.score()

* Update Scorer API docs

* Update scoring for positive_label in textcat

* Fix TextCategorizer.score kwargs

* Update Language.evaluate docs

* Update score names in default config
											
										
										
											2020-07-25 13:53:02 +03:00
+								            "token_acc": acc_score.fscore,
 								            "token_p": prf_score.precision,
 								            "token_r": prf_score.recall,
 								            "token_f": prf_score.fscore,
-												Add LAS per dependency to Scorer (#4560)


											
										
										
											2019-10-31 23:18:16 +03:00
+								        }
-												Refactor the Scorer to improve flexibility (#5731)

* Refactor the Scorer to improve flexibility

Refactor the `Scorer` to improve flexibility for arbitrary pipeline
components.

* Individual pipeline components provide their own `evaluate` methods
that score a list of `Example`s and return a dictionary of scores
* `Scorer` is initialized either:
  * with a provided pipeline containing components to be scored
  * with a default pipeline containing the built-in statistical
    components (senter, tagger, morphologizer, parser, ner)
* `Scorer.score` evaluates a list of `Example`s and returns a dictionary
of scores referring to the scores provided by the components in the
pipeline

Significant differences:

* `tags_acc` is renamed to `tag_acc` to be consistent with `token_acc`
and the new `morph_acc`, `pos_acc`, and `lemma_acc`
* Scoring is no longer cumulative: `Scorer.score` scores a list of
examples rather than a single example and does not retain any state
about previously scored examples
* PRF values in the returned scores are no longer multiplied by 100

* Add kwargs to Morphologizer.evaluate

* Create generalized scoring methods in Scorer

* Generalized static scoring methods are added to `Scorer`
  * Methods require an attribute (either on Token or Doc) that is
used to key the returned scores

Naming differences:

* `uas`, `las`, and `las_per_type` in the scores dict are renamed to
`dep_uas`, `dep_las`, and `dep_las_per_type`

Scoring differences:

* `Doc.sents` is now scored as spans rather than on sentence-initial
token positions so that `Doc.sents` and `Doc.ents` can be scored with
the same method (this lowers scores since a single incorrect sentence
start results in two incorrect spans)

* Simplify / extend hasattr check for eval method

* Add hasattr check to tokenizer scoring
* Simplify to hasattr check for component scoring

* Reset Example alignment if docs are set

Reset the Example alignment if either doc is set in case the
tokenization has changed.

* Add PRF tokenization scoring for tokens as spans

Add PRF scores for tokens as character spans. The scores are:

* token_acc: # correct tokens / # gold tokens
* token_p/r/f: PRF for (token.idx, token.idx + len(token))

* Add docstring to Scorer.score_tokenization

* Rename component.evaluate() to component.score()

* Update Scorer API docs

* Update scoring for positive_label in textcat

* Fix TextCategorizer.score kwargs

* Update Language.evaluate docs

* Update score names in default config
											
										
										
											2020-07-25 13:53:02 +03:00
+								    @staticmethod
 								    def score_token_attr(examples, attr, getter=getattr, **cfg):
 								        """Returns an accuracy score for a token-level attribute.
 								        examples (Iterable[Example]): Examples to score
 								        attr (str): The attribute to score.
 								        getter (callable): Defaults to getattr. If provided,
 								            getter(token, attr) should return the value of the attribute for an
 								            individual token.
 								        RETURNS (dict): A dictionary containing the accuracy score under the
 								            key attr_acc.
-												Update Scorer.ents_per_type

											
										
										
											2019-07-10 12:19:28 +03:00
+								        """
-												Refactor the Scorer to improve flexibility (#5731)

* Refactor the Scorer to improve flexibility

Refactor the `Scorer` to improve flexibility for arbitrary pipeline
components.

* Individual pipeline components provide their own `evaluate` methods
that score a list of `Example`s and return a dictionary of scores
* `Scorer` is initialized either:
  * with a provided pipeline containing components to be scored
  * with a default pipeline containing the built-in statistical
    components (senter, tagger, morphologizer, parser, ner)
* `Scorer.score` evaluates a list of `Example`s and returns a dictionary
of scores referring to the scores provided by the components in the
pipeline

Significant differences:

* `tags_acc` is renamed to `tag_acc` to be consistent with `token_acc`
and the new `morph_acc`, `pos_acc`, and `lemma_acc`
* Scoring is no longer cumulative: `Scorer.score` scores a list of
examples rather than a single example and does not retain any state
about previously scored examples
* PRF values in the returned scores are no longer multiplied by 100

* Add kwargs to Morphologizer.evaluate

* Create generalized scoring methods in Scorer

* Generalized static scoring methods are added to `Scorer`
  * Methods require an attribute (either on Token or Doc) that is
used to key the returned scores

Naming differences:

* `uas`, `las`, and `las_per_type` in the scores dict are renamed to
`dep_uas`, `dep_las`, and `dep_las_per_type`

Scoring differences:

* `Doc.sents` is now scored as spans rather than on sentence-initial
token positions so that `Doc.sents` and `Doc.ents` can be scored with
the same method (this lowers scores since a single incorrect sentence
start results in two incorrect spans)

* Simplify / extend hasattr check for eval method

* Add hasattr check to tokenizer scoring
* Simplify to hasattr check for component scoring

* Reset Example alignment if docs are set

Reset the Example alignment if either doc is set in case the
tokenization has changed.

* Add PRF tokenization scoring for tokens as spans

Add PRF scores for tokens as character spans. The scores are:

* token_acc: # correct tokens / # gold tokens
* token_p/r/f: PRF for (token.idx, token.idx + len(token))

* Add docstring to Scorer.score_tokenization

* Rename component.evaluate() to component.score()

* Update Scorer API docs

* Update scoring for positive_label in textcat

* Fix TextCategorizer.score kwargs

* Update Language.evaluate docs

* Update score names in default config
											
										
										
											2020-07-25 13:53:02 +03:00
+								        tag_score = PRFScore()
 								        for example in examples:
 								            gold_doc = example.reference
 								            pred_doc = example.predicted
 								            align = example.alignment
 								            gold_tags = set()
 								            for gold_i, token in enumerate(gold_doc):
 								                gold_tags.add((gold_i, getter(token, attr)))
 								            pred_tags = set()
 								            for token in pred_doc:
 								                if token.orth_.isspace():
 								                    continue
 								                if align.x2y.lengths[token.i] == 1:
 								                    gold_i = align.x2y[token.i].dataXd[0, 0]
 								                    pred_tags.add((gold_i, getter(token, attr)))
 								            tag_score.score_set(pred_tags, gold_tags)
-												Update Scorer.ents_per_type

											
										
										
											2019-07-10 12:19:28 +03:00
+								        return {
-												Refactor the Scorer to improve flexibility (#5731)

* Refactor the Scorer to improve flexibility

Refactor the `Scorer` to improve flexibility for arbitrary pipeline
components.

* Individual pipeline components provide their own `evaluate` methods
that score a list of `Example`s and return a dictionary of scores
* `Scorer` is initialized either:
  * with a provided pipeline containing components to be scored
  * with a default pipeline containing the built-in statistical
    components (senter, tagger, morphologizer, parser, ner)
* `Scorer.score` evaluates a list of `Example`s and returns a dictionary
of scores referring to the scores provided by the components in the
pipeline

Significant differences:

* `tags_acc` is renamed to `tag_acc` to be consistent with `token_acc`
and the new `morph_acc`, `pos_acc`, and `lemma_acc`
* Scoring is no longer cumulative: `Scorer.score` scores a list of
examples rather than a single example and does not retain any state
about previously scored examples
* PRF values in the returned scores are no longer multiplied by 100

* Add kwargs to Morphologizer.evaluate

* Create generalized scoring methods in Scorer

* Generalized static scoring methods are added to `Scorer`
  * Methods require an attribute (either on Token or Doc) that is
used to key the returned scores

Naming differences:

* `uas`, `las`, and `las_per_type` in the scores dict are renamed to
`dep_uas`, `dep_las`, and `dep_las_per_type`

Scoring differences:

* `Doc.sents` is now scored as spans rather than on sentence-initial
token positions so that `Doc.sents` and `Doc.ents` can be scored with
the same method (this lowers scores since a single incorrect sentence
start results in two incorrect spans)

* Simplify / extend hasattr check for eval method

* Add hasattr check to tokenizer scoring
* Simplify to hasattr check for component scoring

* Reset Example alignment if docs are set

Reset the Example alignment if either doc is set in case the
tokenization has changed.

* Add PRF tokenization scoring for tokens as spans

Add PRF scores for tokens as character spans. The scores are:

* token_acc: # correct tokens / # gold tokens
* token_p/r/f: PRF for (token.idx, token.idx + len(token))

* Add docstring to Scorer.score_tokenization

* Rename component.evaluate() to component.score()

* Update Scorer API docs

* Update scoring for positive_label in textcat

* Fix TextCategorizer.score kwargs

* Update Language.evaluate docs

* Update score names in default config
											
										
										
											2020-07-25 13:53:02 +03:00
+								            attr + "_acc": tag_score.fscore,
-												Update Scorer.ents_per_type

											
										
										
											2019-07-10 12:19:28 +03:00
+								        }
-												Refactor the Scorer to improve flexibility (#5731)

* Refactor the Scorer to improve flexibility

Refactor the `Scorer` to improve flexibility for arbitrary pipeline
components.

* Individual pipeline components provide their own `evaluate` methods
that score a list of `Example`s and return a dictionary of scores
* `Scorer` is initialized either:
  * with a provided pipeline containing components to be scored
  * with a default pipeline containing the built-in statistical
    components (senter, tagger, morphologizer, parser, ner)
* `Scorer.score` evaluates a list of `Example`s and returns a dictionary
of scores referring to the scores provided by the components in the
pipeline

Significant differences:

* `tags_acc` is renamed to `tag_acc` to be consistent with `token_acc`
and the new `morph_acc`, `pos_acc`, and `lemma_acc`
* Scoring is no longer cumulative: `Scorer.score` scores a list of
examples rather than a single example and does not retain any state
about previously scored examples
* PRF values in the returned scores are no longer multiplied by 100

* Add kwargs to Morphologizer.evaluate

* Create generalized scoring methods in Scorer

* Generalized static scoring methods are added to `Scorer`
  * Methods require an attribute (either on Token or Doc) that is
used to key the returned scores

Naming differences:

* `uas`, `las`, and `las_per_type` in the scores dict are renamed to
`dep_uas`, `dep_las`, and `dep_las_per_type`

Scoring differences:

* `Doc.sents` is now scored as spans rather than on sentence-initial
token positions so that `Doc.sents` and `Doc.ents` can be scored with
the same method (this lowers scores since a single incorrect sentence
start results in two incorrect spans)

* Simplify / extend hasattr check for eval method

* Add hasattr check to tokenizer scoring
* Simplify to hasattr check for component scoring

* Reset Example alignment if docs are set

Reset the Example alignment if either doc is set in case the
tokenization has changed.

* Add PRF tokenization scoring for tokens as spans

Add PRF scores for tokens as character spans. The scores are:

* token_acc: # correct tokens / # gold tokens
* token_p/r/f: PRF for (token.idx, token.idx + len(token))

* Add docstring to Scorer.score_tokenization

* Rename component.evaluate() to component.score()

* Update Scorer API docs

* Update scoring for positive_label in textcat

* Fix TextCategorizer.score kwargs

* Update Language.evaluate docs

* Update score names in default config
											
										
										
											2020-07-25 13:53:02 +03:00
+								    @staticmethod
 								    def score_token_attr_per_feat(examples, attr, getter=getattr, **cfg):
 								        """Return PRF scores per feat for a token attribute in UFEATS format.
 								        examples (Iterable[Example]): Examples to score
 								        attr (str): The attribute to score.
 								        getter (callable): Defaults to getattr. If provided,
 								            getter(token, attr) should return the value of the attribute for an
 								            individual token.
 								        RETURNS (dict): A dictionary containing the per-feat PRF scores under
 								            the key attr_per_feat.
-												Add textcat to train CLI (#4226)

* Add doc.cats to spacy.gold at the paragraph level

Support `doc.cats` as `"cats": [{"label": string, "value": number}]` in
the spacy JSON training format at the paragraph level.

* `spacy.gold.docs_to_json()` writes `docs.cats`

* `GoldCorpus` reads in cats in each `GoldParse`

* Update instances of gold_tuples to handle cats

Update iteration over gold_tuples / gold_parses to handle addition of
cats at the paragraph level.

* Add textcat to train CLI

* Add textcat options to train CLI
* Add textcat labels in `TextCategorizer.begin_training()`
* Add textcat evaluation to `Scorer`:
  * For binary exclusive classes with provided label: F1 for label
  * For 2+ exclusive classes: F1 macro average
  * For multilabel (not exclusive): ROC AUC macro average (currently
relying on sklearn)
* Provide user info on textcat evaluation settings, potential
incompatibilities
* Provide pipeline to Scorer in `Language.evaluate` for textcat config
* Customize train CLI output to include only metrics relevant to current
pipeline
* Add textcat evaluation to evaluate CLI

* Fix handling of unset arguments and config params

Fix handling of unset arguments and model config parameters in Scorer
initialization.

* Temporarily add sklearn requirement

* Remove sklearn version number

* Improve Scorer handling of models without textcats

* Fixing Scorer handling of models without textcats

* Update Scorer output for python 2.7

* Modify inf in Scorer for python 2.7

* Auto-format

Also make small adjustments to make auto-formatting with black easier and produce nicer results

* Move error message to Errors

* Update documentation

* Add cats to annotation JSON format [ci skip]

* Fix tpl flag and docs [ci skip]

* Switch to internal roc_auc_score

Switch to internal `roc_auc_score()` adapted from scikit-learn.

* Add AUCROCScore tests and improve errors/warnings

* Add tests for AUCROCScore and roc_auc_score
* Add missing error for only positive/negative values
* Remove unnecessary warnings and errors

* Make reduced roc_auc_score functions private

Because most of the checks and warnings have been stripped for the
internal functions and access is only intended through `ROCAUCScore`,
make the functions for roc_auc_score adapted from scikit-learn private.

* Check that data corresponds with multilabel flag

Check that the training instances correspond with the multilabel flag,
adding the multilabel flag if required.

* Add textcat score to early stopping check

* Add more checks to debug-data for textcat

* Add example training data for textcat

* Add more checks to textcat train CLI

* Check configuration when extending base model
* Fix typos

* Update textcat example data

* Provide licensing details and licenses for data
* Remove two labels with no positive instances from jigsaw-toxic-comment
data.


Co-authored-by: Ines Montani <ines@ines.io>
											
										
										
											2019-09-15 23:31:31 +03:00
+								        """
-												Refactor the Scorer to improve flexibility (#5731)

* Refactor the Scorer to improve flexibility

Refactor the `Scorer` to improve flexibility for arbitrary pipeline
components.

* Individual pipeline components provide their own `evaluate` methods
that score a list of `Example`s and return a dictionary of scores
* `Scorer` is initialized either:
  * with a provided pipeline containing components to be scored
  * with a default pipeline containing the built-in statistical
    components (senter, tagger, morphologizer, parser, ner)
* `Scorer.score` evaluates a list of `Example`s and returns a dictionary
of scores referring to the scores provided by the components in the
pipeline

Significant differences:

* `tags_acc` is renamed to `tag_acc` to be consistent with `token_acc`
and the new `morph_acc`, `pos_acc`, and `lemma_acc`
* Scoring is no longer cumulative: `Scorer.score` scores a list of
examples rather than a single example and does not retain any state
about previously scored examples
* PRF values in the returned scores are no longer multiplied by 100

* Add kwargs to Morphologizer.evaluate

* Create generalized scoring methods in Scorer

* Generalized static scoring methods are added to `Scorer`
  * Methods require an attribute (either on Token or Doc) that is
used to key the returned scores

Naming differences:

* `uas`, `las`, and `las_per_type` in the scores dict are renamed to
`dep_uas`, `dep_las`, and `dep_las_per_type`

Scoring differences:

* `Doc.sents` is now scored as spans rather than on sentence-initial
token positions so that `Doc.sents` and `Doc.ents` can be scored with
the same method (this lowers scores since a single incorrect sentence
start results in two incorrect spans)

* Simplify / extend hasattr check for eval method

* Add hasattr check to tokenizer scoring
* Simplify to hasattr check for component scoring

* Reset Example alignment if docs are set

Reset the Example alignment if either doc is set in case the
tokenization has changed.

* Add PRF tokenization scoring for tokens as spans

Add PRF scores for tokens as character spans. The scores are:

* token_acc: # correct tokens / # gold tokens
* token_p/r/f: PRF for (token.idx, token.idx + len(token))

* Add docstring to Scorer.score_tokenization

* Rename component.evaluate() to component.score()

* Update Scorer API docs

* Update scoring for positive_label in textcat

* Fix TextCategorizer.score kwargs

* Update Language.evaluate docs

* Update score names in default config
											
										
										
											2020-07-25 13:53:02 +03:00
+								        per_feat = {}
 								        for example in examples:
 								            pred_doc = example.predicted
 								            gold_doc = example.reference
 								            align = example.alignment
 								            gold_per_feat = {}
 								            for gold_i, token in enumerate(gold_doc):
 								                morph = str(getter(token, attr))
 								                if morph:
 								                    for feat in morph.split(Morphology.FEATURE_SEP):
 								                        field, values = feat.split(Morphology.FIELD_SEP)
 								                        if field not in per_feat:
 								                            per_feat[field] = PRFScore()
 								                        if field not in gold_per_feat:
 								                            gold_per_feat[field] = set()
 								                        gold_per_feat[field].add((gold_i, feat))
 								            pred_per_feat = {}
 								            for token in pred_doc:
 								                if token.orth_.isspace():
 								                    continue
 								                if align.x2y.lengths[token.i] == 1:
 								                    gold_i = align.x2y[token.i].dataXd[0, 0]
 								                    morph = str(getter(token, attr))
 								                    if morph:
 								                        for feat in morph.split("|"):
 								                            field, values = feat.split("=")
 								                            if field not in per_feat:
 								                                per_feat[field] = PRFScore()
 								                            if field not in pred_per_feat:
 								                                pred_per_feat[field] = set()
 								                            pred_per_feat[field].add((gold_i, feat))
 								            for field in per_feat:
 								                per_feat[field].score_set(
 								                    pred_per_feat.get(field, set()), gold_per_feat.get(field, set()),
 								                )
-												Add textcat to train CLI (#4226)

* Add doc.cats to spacy.gold at the paragraph level

Support `doc.cats` as `"cats": [{"label": string, "value": number}]` in
the spacy JSON training format at the paragraph level.

* `spacy.gold.docs_to_json()` writes `docs.cats`

* `GoldCorpus` reads in cats in each `GoldParse`

* Update instances of gold_tuples to handle cats

Update iteration over gold_tuples / gold_parses to handle addition of
cats at the paragraph level.

* Add textcat to train CLI

* Add textcat options to train CLI
* Add textcat labels in `TextCategorizer.begin_training()`
* Add textcat evaluation to `Scorer`:
  * For binary exclusive classes with provided label: F1 for label
  * For 2+ exclusive classes: F1 macro average
  * For multilabel (not exclusive): ROC AUC macro average (currently
relying on sklearn)
* Provide user info on textcat evaluation settings, potential
incompatibilities
* Provide pipeline to Scorer in `Language.evaluate` for textcat config
* Customize train CLI output to include only metrics relevant to current
pipeline
* Add textcat evaluation to evaluate CLI

* Fix handling of unset arguments and config params

Fix handling of unset arguments and model config parameters in Scorer
initialization.

* Temporarily add sklearn requirement

* Remove sklearn version number

* Improve Scorer handling of models without textcats

* Fixing Scorer handling of models without textcats

* Update Scorer output for python 2.7

* Modify inf in Scorer for python 2.7

* Auto-format

Also make small adjustments to make auto-formatting with black easier and produce nicer results

* Move error message to Errors

* Update documentation

* Add cats to annotation JSON format [ci skip]

* Fix tpl flag and docs [ci skip]

* Switch to internal roc_auc_score

Switch to internal `roc_auc_score()` adapted from scikit-learn.

* Add AUCROCScore tests and improve errors/warnings

* Add tests for AUCROCScore and roc_auc_score
* Add missing error for only positive/negative values
* Remove unnecessary warnings and errors

* Make reduced roc_auc_score functions private

Because most of the checks and warnings have been stripped for the
internal functions and access is only intended through `ROCAUCScore`,
make the functions for roc_auc_score adapted from scikit-learn private.

* Check that data corresponds with multilabel flag

Check that the training instances correspond with the multilabel flag,
adding the multilabel flag if required.

* Add textcat score to early stopping check

* Add more checks to debug-data for textcat

* Add example training data for textcat

* Add more checks to textcat train CLI

* Check configuration when extending base model
* Fix typos

* Update textcat example data

* Provide licensing details and licenses for data
* Remove two labels with no positive instances from jigsaw-toxic-comment
data.


Co-authored-by: Ines Montani <ines@ines.io>
											
										
										
											2019-09-15 23:31:31 +03:00
+								        return {
-												Refactor the Scorer to improve flexibility (#5731)

* Refactor the Scorer to improve flexibility

Refactor the `Scorer` to improve flexibility for arbitrary pipeline
components.

* Individual pipeline components provide their own `evaluate` methods
that score a list of `Example`s and return a dictionary of scores
* `Scorer` is initialized either:
  * with a provided pipeline containing components to be scored
  * with a default pipeline containing the built-in statistical
    components (senter, tagger, morphologizer, parser, ner)
* `Scorer.score` evaluates a list of `Example`s and returns a dictionary
of scores referring to the scores provided by the components in the
pipeline

Significant differences:

* `tags_acc` is renamed to `tag_acc` to be consistent with `token_acc`
and the new `morph_acc`, `pos_acc`, and `lemma_acc`
* Scoring is no longer cumulative: `Scorer.score` scores a list of
examples rather than a single example and does not retain any state
about previously scored examples
* PRF values in the returned scores are no longer multiplied by 100

* Add kwargs to Morphologizer.evaluate

* Create generalized scoring methods in Scorer

* Generalized static scoring methods are added to `Scorer`
  * Methods require an attribute (either on Token or Doc) that is
used to key the returned scores

Naming differences:

* `uas`, `las`, and `las_per_type` in the scores dict are renamed to
`dep_uas`, `dep_las`, and `dep_las_per_type`

Scoring differences:

* `Doc.sents` is now scored as spans rather than on sentence-initial
token positions so that `Doc.sents` and `Doc.ents` can be scored with
the same method (this lowers scores since a single incorrect sentence
start results in two incorrect spans)

* Simplify / extend hasattr check for eval method

* Add hasattr check to tokenizer scoring
* Simplify to hasattr check for component scoring

* Reset Example alignment if docs are set

Reset the Example alignment if either doc is set in case the
tokenization has changed.

* Add PRF tokenization scoring for tokens as spans

Add PRF scores for tokens as character spans. The scores are:

* token_acc: # correct tokens / # gold tokens
* token_p/r/f: PRF for (token.idx, token.idx + len(token))

* Add docstring to Scorer.score_tokenization

* Rename component.evaluate() to component.score()

* Update Scorer API docs

* Update scoring for positive_label in textcat

* Fix TextCategorizer.score kwargs

* Update Language.evaluate docs

* Update score names in default config
											
										
										
											2020-07-25 13:53:02 +03:00
+								            attr + "_per_feat": per_feat,
-												train is from-config by default (#5575)

* verbose and tag_map options

* adding init_tok2vec option and only changing the tok2vec that is specified

* adding omit_extra_lookups and verifying textcat config

* wip

* pretrain bugfix

* add replace and resume options

* train_textcat fix

* raw text functionality

* improve UX when KeyError or when input data can't be parsed

* avoid unnecessary access to goldparse in TextCat pipe

* save performance information in nlp.meta

* add noise_level to config

* move nn_parser's defaults to config file

* multitask in config - doesn't work yet

* scorer offering both F and AUC options, need to be specified in config

* add textcat verification code from old train script

* small fixes to config files

* clean up

* set default config for ner/parser to allow create_pipe to work as before

* two more test fixes

* small fixes

* cleanup

* fix NER pickling + additional unit test

* create_pipe as before
											
										
										
											2020-06-12 03:02:07 +03:00
+								        }
-												Refactor the Scorer to improve flexibility (#5731)

* Refactor the Scorer to improve flexibility

Refactor the `Scorer` to improve flexibility for arbitrary pipeline
components.

* Individual pipeline components provide their own `evaluate` methods
that score a list of `Example`s and return a dictionary of scores
* `Scorer` is initialized either:
  * with a provided pipeline containing components to be scored
  * with a default pipeline containing the built-in statistical
    components (senter, tagger, morphologizer, parser, ner)
* `Scorer.score` evaluates a list of `Example`s and returns a dictionary
of scores referring to the scores provided by the components in the
pipeline

Significant differences:

* `tags_acc` is renamed to `tag_acc` to be consistent with `token_acc`
and the new `morph_acc`, `pos_acc`, and `lemma_acc`
* Scoring is no longer cumulative: `Scorer.score` scores a list of
examples rather than a single example and does not retain any state
about previously scored examples
* PRF values in the returned scores are no longer multiplied by 100

* Add kwargs to Morphologizer.evaluate

* Create generalized scoring methods in Scorer

* Generalized static scoring methods are added to `Scorer`
  * Methods require an attribute (either on Token or Doc) that is
used to key the returned scores

Naming differences:

* `uas`, `las`, and `las_per_type` in the scores dict are renamed to
`dep_uas`, `dep_las`, and `dep_las_per_type`

Scoring differences:

* `Doc.sents` is now scored as spans rather than on sentence-initial
token positions so that `Doc.sents` and `Doc.ents` can be scored with
the same method (this lowers scores since a single incorrect sentence
start results in two incorrect spans)

* Simplify / extend hasattr check for eval method

* Add hasattr check to tokenizer scoring
* Simplify to hasattr check for component scoring

* Reset Example alignment if docs are set

Reset the Example alignment if either doc is set in case the
tokenization has changed.

* Add PRF tokenization scoring for tokens as spans

Add PRF scores for tokens as character spans. The scores are:

* token_acc: # correct tokens / # gold tokens
* token_p/r/f: PRF for (token.idx, token.idx + len(token))

* Add docstring to Scorer.score_tokenization

* Rename component.evaluate() to component.score()

* Update Scorer API docs

* Update scoring for positive_label in textcat

* Fix TextCategorizer.score kwargs

* Update Language.evaluate docs

* Update score names in default config
											
										
										
											2020-07-25 13:53:02 +03:00
+								    @staticmethod
 								    def score_spans(examples, attr, getter=getattr, **cfg):
 								        """Returns PRF scores for labeled spans.
-												Add textcat to train CLI (#4226)

* Add doc.cats to spacy.gold at the paragraph level

Support `doc.cats` as `"cats": [{"label": string, "value": number}]` in
the spacy JSON training format at the paragraph level.

* `spacy.gold.docs_to_json()` writes `docs.cats`

* `GoldCorpus` reads in cats in each `GoldParse`

* Update instances of gold_tuples to handle cats

Update iteration over gold_tuples / gold_parses to handle addition of
cats at the paragraph level.

* Add textcat to train CLI

* Add textcat options to train CLI
* Add textcat labels in `TextCategorizer.begin_training()`
* Add textcat evaluation to `Scorer`:
  * For binary exclusive classes with provided label: F1 for label
  * For 2+ exclusive classes: F1 macro average
  * For multilabel (not exclusive): ROC AUC macro average (currently
relying on sklearn)
* Provide user info on textcat evaluation settings, potential
incompatibilities
* Provide pipeline to Scorer in `Language.evaluate` for textcat config
* Customize train CLI output to include only metrics relevant to current
pipeline
* Add textcat evaluation to evaluate CLI

* Fix handling of unset arguments and config params

Fix handling of unset arguments and model config parameters in Scorer
initialization.

* Temporarily add sklearn requirement

* Remove sklearn version number

* Improve Scorer handling of models without textcats

* Fixing Scorer handling of models without textcats

* Update Scorer output for python 2.7

* Modify inf in Scorer for python 2.7

* Auto-format

Also make small adjustments to make auto-formatting with black easier and produce nicer results

* Move error message to Errors

* Update documentation

* Add cats to annotation JSON format [ci skip]

* Fix tpl flag and docs [ci skip]

* Switch to internal roc_auc_score

Switch to internal `roc_auc_score()` adapted from scikit-learn.

* Add AUCROCScore tests and improve errors/warnings

* Add tests for AUCROCScore and roc_auc_score
* Add missing error for only positive/negative values
* Remove unnecessary warnings and errors

* Make reduced roc_auc_score functions private

Because most of the checks and warnings have been stripped for the
internal functions and access is only intended through `ROCAUCScore`,
make the functions for roc_auc_score adapted from scikit-learn private.

* Check that data corresponds with multilabel flag

Check that the training instances correspond with the multilabel flag,
adding the multilabel flag if required.

* Add textcat score to early stopping check

* Add more checks to debug-data for textcat

* Add example training data for textcat

* Add more checks to textcat train CLI

* Check configuration when extending base model
* Fix typos

* Update textcat example data

* Provide licensing details and licenses for data
* Remove two labels with no positive instances from jigsaw-toxic-comment
data.


Co-authored-by: Ines Montani <ines@ines.io>
											
										
										
											2019-09-15 23:31:31 +03:00
-												Refactor the Scorer to improve flexibility (#5731)

* Refactor the Scorer to improve flexibility

Refactor the `Scorer` to improve flexibility for arbitrary pipeline
components.

* Individual pipeline components provide their own `evaluate` methods
that score a list of `Example`s and return a dictionary of scores
* `Scorer` is initialized either:
  * with a provided pipeline containing components to be scored
  * with a default pipeline containing the built-in statistical
    components (senter, tagger, morphologizer, parser, ner)
* `Scorer.score` evaluates a list of `Example`s and returns a dictionary
of scores referring to the scores provided by the components in the
pipeline

Significant differences:

* `tags_acc` is renamed to `tag_acc` to be consistent with `token_acc`
and the new `morph_acc`, `pos_acc`, and `lemma_acc`
* Scoring is no longer cumulative: `Scorer.score` scores a list of
examples rather than a single example and does not retain any state
about previously scored examples
* PRF values in the returned scores are no longer multiplied by 100

* Add kwargs to Morphologizer.evaluate

* Create generalized scoring methods in Scorer

* Generalized static scoring methods are added to `Scorer`
  * Methods require an attribute (either on Token or Doc) that is
used to key the returned scores

Naming differences:

* `uas`, `las`, and `las_per_type` in the scores dict are renamed to
`dep_uas`, `dep_las`, and `dep_las_per_type`

Scoring differences:

* `Doc.sents` is now scored as spans rather than on sentence-initial
token positions so that `Doc.sents` and `Doc.ents` can be scored with
the same method (this lowers scores since a single incorrect sentence
start results in two incorrect spans)

* Simplify / extend hasattr check for eval method

* Add hasattr check to tokenizer scoring
* Simplify to hasattr check for component scoring

* Reset Example alignment if docs are set

Reset the Example alignment if either doc is set in case the
tokenization has changed.

* Add PRF tokenization scoring for tokens as spans

Add PRF scores for tokens as character spans. The scores are:

* token_acc: # correct tokens / # gold tokens
* token_p/r/f: PRF for (token.idx, token.idx + len(token))

* Add docstring to Scorer.score_tokenization

* Rename component.evaluate() to component.score()

* Update Scorer API docs

* Update scoring for positive_label in textcat

* Fix TextCategorizer.score kwargs

* Update Language.evaluate docs

* Update score names in default config
											
										
										
											2020-07-25 13:53:02 +03:00
+								        examples (Iterable[Example]): Examples to score
 								        attr (str): The attribute to score.
 								        getter (callable): Defaults to getattr. If provided,
 								            getter(doc, attr) should return the spans for the individual doc.
 								        RETURNS (dict): A dictionary containing the PRF scores under the
 								            keys attr_p/r/f and the per-type PRF scores under attr_per_type.
-												Update Scorer and add API docs

											
										
										
											2019-05-24 15:06:04 +03:00
+								        """
-												Refactor the Scorer to improve flexibility (#5731)

* Refactor the Scorer to improve flexibility

Refactor the `Scorer` to improve flexibility for arbitrary pipeline
components.

* Individual pipeline components provide their own `evaluate` methods
that score a list of `Example`s and return a dictionary of scores
* `Scorer` is initialized either:
  * with a provided pipeline containing components to be scored
  * with a default pipeline containing the built-in statistical
    components (senter, tagger, morphologizer, parser, ner)
* `Scorer.score` evaluates a list of `Example`s and returns a dictionary
of scores referring to the scores provided by the components in the
pipeline

Significant differences:

* `tags_acc` is renamed to `tag_acc` to be consistent with `token_acc`
and the new `morph_acc`, `pos_acc`, and `lemma_acc`
* Scoring is no longer cumulative: `Scorer.score` scores a list of
examples rather than a single example and does not retain any state
about previously scored examples
* PRF values in the returned scores are no longer multiplied by 100

* Add kwargs to Morphologizer.evaluate

* Create generalized scoring methods in Scorer

* Generalized static scoring methods are added to `Scorer`
  * Methods require an attribute (either on Token or Doc) that is
used to key the returned scores

Naming differences:

* `uas`, `las`, and `las_per_type` in the scores dict are renamed to
`dep_uas`, `dep_las`, and `dep_las_per_type`

Scoring differences:

* `Doc.sents` is now scored as spans rather than on sentence-initial
token positions so that `Doc.sents` and `Doc.ents` can be scored with
the same method (this lowers scores since a single incorrect sentence
start results in two incorrect spans)

* Simplify / extend hasattr check for eval method

* Add hasattr check to tokenizer scoring
* Simplify to hasattr check for component scoring

* Reset Example alignment if docs are set

Reset the Example alignment if either doc is set in case the
tokenization has changed.

* Add PRF tokenization scoring for tokens as spans

Add PRF scores for tokens as character spans. The scores are:

* token_acc: # correct tokens / # gold tokens
* token_p/r/f: PRF for (token.idx, token.idx + len(token))

* Add docstring to Scorer.score_tokenization

* Rename component.evaluate() to component.score()

* Update Scorer API docs

* Update scoring for positive_label in textcat

* Fix TextCategorizer.score kwargs

* Update Language.evaluate docs

* Update score names in default config
											
										
										
											2020-07-25 13:53:02 +03:00
+								        score = PRFScore()
 								        score_per_type = dict()
 								        for example in examples:
 								            pred_doc = example.predicted
 								            gold_doc = example.reference
 								            # Find all labels in gold and doc
 								            labels = set(
 								                [k.label_ for k in getter(gold_doc, attr)]
 								                + [k.label_ for k in getter(pred_doc, attr)]
 								            )
 								            # Set up all labels for per type scoring and prepare gold per type
 								            gold_per_type = {label: set() for label in labels}
 								            for label in labels:
 								                if label not in score_per_type:
 								                    score_per_type[label] = PRFScore()
 								            # Find all predidate labels, for all and per type
 								            gold_spans = set()
 								            pred_spans = set()
 								            # Special case for ents:
 								            # If we have missing values in the gold, we can't easily tell
 								            # whether our NER predictions are true.
 								            # It seems bad but it's what we've always done.
 								            if attr == "ents" and not all(token.ent_iob != 0 for token in gold_doc):
 								                continue
-												Update Scorer and add API docs

											
										
										
											2019-05-24 15:06:04 +03:00
-												Refactor the Scorer to improve flexibility (#5731)

* Refactor the Scorer to improve flexibility

Refactor the `Scorer` to improve flexibility for arbitrary pipeline
components.

* Individual pipeline components provide their own `evaluate` methods
that score a list of `Example`s and return a dictionary of scores
* `Scorer` is initialized either:
  * with a provided pipeline containing components to be scored
  * with a default pipeline containing the built-in statistical
    components (senter, tagger, morphologizer, parser, ner)
* `Scorer.score` evaluates a list of `Example`s and returns a dictionary
of scores referring to the scores provided by the components in the
pipeline

Significant differences:

* `tags_acc` is renamed to `tag_acc` to be consistent with `token_acc`
and the new `morph_acc`, `pos_acc`, and `lemma_acc`
* Scoring is no longer cumulative: `Scorer.score` scores a list of
examples rather than a single example and does not retain any state
about previously scored examples
* PRF values in the returned scores are no longer multiplied by 100

* Add kwargs to Morphologizer.evaluate

* Create generalized scoring methods in Scorer

* Generalized static scoring methods are added to `Scorer`
  * Methods require an attribute (either on Token or Doc) that is
used to key the returned scores

Naming differences:

* `uas`, `las`, and `las_per_type` in the scores dict are renamed to
`dep_uas`, `dep_las`, and `dep_las_per_type`

Scoring differences:

* `Doc.sents` is now scored as spans rather than on sentence-initial
token positions so that `Doc.sents` and `Doc.ents` can be scored with
the same method (this lowers scores since a single incorrect sentence
start results in two incorrect spans)

* Simplify / extend hasattr check for eval method

* Add hasattr check to tokenizer scoring
* Simplify to hasattr check for component scoring

* Reset Example alignment if docs are set

Reset the Example alignment if either doc is set in case the
tokenization has changed.

* Add PRF tokenization scoring for tokens as spans

Add PRF scores for tokens as character spans. The scores are:

* token_acc: # correct tokens / # gold tokens
* token_p/r/f: PRF for (token.idx, token.idx + len(token))

* Add docstring to Scorer.score_tokenization

* Rename component.evaluate() to component.score()

* Update Scorer API docs

* Update scoring for positive_label in textcat

* Fix TextCategorizer.score kwargs

* Update Language.evaluate docs

* Update score names in default config
											
										
										
											2020-07-25 13:53:02 +03:00
+								            for span in getter(gold_doc, attr):
 								                gold_span = (span.label_, span.start, span.end - 1)
 								                gold_spans.add(gold_span)
 								                gold_per_type[span.label_].add((span.label_, span.start, span.end - 1))
 								            pred_per_type = {label: set() for label in labels}
 								            for span in example.get_aligned_spans_x2y(getter(pred_doc, attr)):
 								                pred_spans.add((span.label_, span.start, span.end - 1))
 								                pred_per_type[span.label_].add((span.label_, span.start, span.end - 1))
 								            # Scores per label
 								            for k, v in score_per_type.items():
 								                if k in pred_per_type:
 								                    v.score_set(pred_per_type[k], gold_per_type[k])
 								            # Score for all labels
 								            score.score_set(pred_spans, gold_spans)
 								        results = {
 								            attr + "_p": score.precision,
 								            attr + "_r": score.recall,
 								            attr + "_f": score.fscore,
 								            attr + "_per_type": {k: v.to_dict() for k, v in score_per_type.items()},
 								        }
 								        return results
 								    @staticmethod
 								    def score_cats(
 								        examples,
 								        attr,
 								        getter=getattr,
 								        labels=[],
 								        multi_label=True,
 								        positive_label=None,
 								        **cfg
 								    ):
 								        """Returns PRF and ROC AUC scores for a doc-level attribute with a
 								        dict with scores for each label like Doc.cats. The reported overall
 								        score depends on the scorer settings.

 								        examples (Iterable[Example]): Examples to score
 								        attr (str): The attribute to score.
 								        getter (callable): Defaults to getattr. If provided,
 								            getter(doc, attr) should return the values for the individual doc.
 								        labels (Iterable[str]): The set of possible labels. Defaults to [].
 								        multi_label (bool): Whether the attribute allows multiple labels.
 								            Defaults to True.
 								        positive_label (str): The positive label for a binary task with
 								            exclusive classes. Defaults to None.
 								        RETURNS (dict): A dictionary containing the scores, with inapplicable
 								                scores as None:
 								            for all:
 								                attr_score (one of attr_f / attr_macro_f / attr_macro_auc),
 								                attr_score_desc (text description of the overall score),
 								                attr_f_per_type,
 								                attr_auc_per_type
 								            for binary exclusive with positive label: attr_p/r/f
 								            for 3+ exclusive classes, macro-averaged fscore: attr_macro_f
 								            for multilabel, macro-averaged AUC: attr_macro_auc
 								        RAISES (ValueError): E162 if `labels` is non-empty and an example's
 								            gold values are empty, or the gold or predicted label set differs
 								            from `labels`.
 								        """
 								        # NOTE: the `labels=[]` default is only read, never mutated.
 								        score = PRFScore()
 								        f_per_type = {}
 								        auc_per_type = {}
 								        # Both per-label tables are always keyed by exactly the same labels.
 								        for label in labels:
 								            f_per_type[label] = PRFScore()
 								            auc_per_type[label] = ROCAUCScore()
 								        known_labels = set(f_per_type)
 								        for example in examples:
 								            gold_values = getter(example.reference, attr)
 								            pred_values = getter(example.predicted, attr)
 								            gold_labels = set(gold_values)
 								            if (
 								                len(gold_values) > 0
 								                and known_labels == gold_labels
 								                and gold_labels == set(pred_values)
 								            ):
 								                # The highest-scoring label is treated as the single
 								                # gold / predicted class for the PRF scores.
 								                gold_val = max(gold_values, key=gold_values.get)
 								                pred_val = max(pred_values, key=pred_values.get)
 								                if positive_label:
 								                    # Each side is {positive_label} if it was chosen,
 								                    # otherwise the empty set.
 								                    score.score_set(
 								                        {positive_label} & {pred_val},
 								                        {positive_label} & {gold_val},
 								                    )
 								                for label in gold_labels:
 								                    auc_per_type[label].score_set(
 								                        pred_values[label], gold_values[label]
 								                    )
 								                    f_per_type[label].score_set(
 								                        {label} & {pred_val}, {label} & {gold_val}
 								                    )
 								            elif known_labels:
 								                # Labels were provided but this example does not match them.
 								                # Examples are silently skipped only when no labels are given.
 								                raise ValueError(
 								                    Errors.E162.format(
 								                        model_labels=known_labels, eval_labels=gold_labels
 								                    )
 								                )
 								        # Every possible key is present; inapplicable scores stay None.
 								        results = {
 								            attr + "_score": None,
 								            attr + "_score_desc": None,
 								            attr + "_p": None,
 								            attr + "_r": None,
 								            attr + "_f": None,
 								            attr + "_macro_f": None,
 								            attr + "_macro_auc": None,
 								            attr + "_f_per_type": {k: v.to_dict() for k, v in f_per_type.items()},
 								            attr + "_auc_per_type": {k: v.score for k, v in auc_per_type.items()},
 								        }
 								        if len(labels) == 2 and not multi_label and positive_label:
 								            # Binary exclusive classes: PRF of the positive label.
 								            results[attr + "_p"] = score.precision
 								            results[attr + "_r"] = score.recall
 								            results[attr + "_f"] = score.fscore
 								            results[attr + "_score"] = results[attr + "_f"]
 								            results[attr + "_score_desc"] = "F (" + positive_label + ")"
 								        elif not multi_label:
 								            # Exclusive classes: macro-averaged F over all labels. The tiny
 								            # epsilon avoids a ZeroDivisionError when there are no labels.
 								            results[attr + "_macro_f"] = sum(
 								                prf.fscore for prf in f_per_type.values()
 								            ) / (len(f_per_type) + 1e-100)
 								            results[attr + "_score"] = results[attr + "_macro_f"]
 								            results[attr + "_score_desc"] = "macro F"
 								        else:
 								            # Multilabel: macro-averaged AUC, floored at -1 because a
 								            # per-label AUC is -inf when it is undefined.
 								            results[attr + "_macro_auc"] = max(
 								                sum(auc.score for auc in auc_per_type.values())
 								                / (len(auc_per_type) + 1e-100),
 								                -1,
 								            )
 								            results[attr + "_score"] = results[attr + "_macro_auc"]
 								            results[attr + "_score_desc"] = "macro AUC"
 								        return results
 								    @staticmethod
 								    def score_deps(
 								        examples,
 								        attr,
 								        getter=getattr,
 								        head_attr="head",
 								        head_getter=getattr,
 								        ignore_labels=tuple(),
 								        **cfg
 								    ):
 								        """Returns the UAS, LAS, and LAS per type scores for dependency
 								        parses.
 								        examples (Iterable[Example]): Examples to score
 								        attr (str): The attribute containing the dependency label.
 								        getter (callable): Defaults to getattr. If provided,
 								            getter(token, attr) should return the value of the attribute for an
 								            individual token.
 								        head_attr (str): The attribute containing the head token. Defaults to
 								            'head'.
 								        head_getter (callable): Defaults to getattr. If provided,
 								            head_getter(token, attr) should return the value of the head for an
 								            individual token.
 								        ignore_labels (Tuple): Labels to ignore while scoring (e.g., punct).
 								        RETURNS (dict): A dictionary containing the scores:
 								            attr_uas, attr_las, and attr_las_per_type.
 								        """
 								        unlabelled = PRFScore()
 								        labelled = PRFScore()
 								        labelled_per_dep = dict()
 								        for example in examples:
 								            gold_doc = example.reference
 								            pred_doc = example.predicted
 								            align = example.alignment
 								            gold_deps = set()
 								            gold_deps_per_dep = {}
 								            for gold_i, token in enumerate(gold_doc):
 								                dep = getter(token, attr)
 								                head = head_getter(token, head_attr)
 								                if dep not in ignore_labels:
 								                    gold_deps.add((gold_i, head.i, dep))
 								                    if dep not in labelled_per_dep:
 								                        labelled_per_dep[dep] = PRFScore()
 								                    if dep not in gold_deps_per_dep:
 								                        gold_deps_per_dep[dep] = set()
 								                    gold_deps_per_dep[dep].add((gold_i, head.i, dep))
 								            pred_deps = set()
 								            pred_deps_per_dep = {}
 								            for token in pred_doc:
 								                if token.orth_.isspace():
 								                    continue
 								                if align.x2y.lengths[token.i] != 1:
 								                    gold_i = None
 								                else:
 								                    gold_i = align.x2y[token.i].dataXd[0, 0]
 								                dep = getter(token, attr)
 								                head = head_getter(token, head_attr)
 								                if dep not in ignore_labels and token.orth_.strip():
 								                    if align.x2y.lengths[head.i] == 1:
 								                        gold_head = align.x2y[head.i].dataXd[0, 0]
 								                    else:
 								                        gold_head = None
 								                    # None is indistinct, so we can't just add it to the set
 								                    # Multiple (None, None) deps are possible
 								                    if gold_i is None or gold_head is None:
 								                        unlabelled.fp += 1
 								                        labelled.fp += 1
 								                    else:
 								                        pred_deps.add((gold_i, gold_head, dep))
 								                        if dep not in labelled_per_dep:
 								                            labelled_per_dep[dep] = PRFScore()
 								                        if dep not in pred_deps_per_dep:
 								                            pred_deps_per_dep[dep] = set()
 								                        pred_deps_per_dep[dep].add((gold_i, gold_head, dep))
 								            labelled.score_set(pred_deps, gold_deps)
 								            for dep in labelled_per_dep:
 								                labelled_per_dep[dep].score_set(
 								                    pred_deps_per_dep.get(dep, set()), gold_deps_per_dep.get(dep, set())
 								                )
 								            unlabelled.score_set(
 								                set(item[:2] for item in pred_deps), set(item[:2] for item in gold_deps)
-												Add textcat to train CLI (#4226)

* Add doc.cats to spacy.gold at the paragraph level

Support `doc.cats` as `"cats": [{"label": string, "value": number}]` in
the spacy JSON training format at the paragraph level.

* `spacy.gold.docs_to_json()` writes `docs.cats`

* `GoldCorpus` reads in cats in each `GoldParse`

* Update instances of gold_tuples to handle cats

Update iteration over gold_tuples / gold_parses to handle addition of
cats at the paragraph level.

* Add textcat to train CLI

* Add textcat options to train CLI
* Add textcat labels in `TextCategorizer.begin_training()`
* Add textcat evaluation to `Scorer`:
  * For binary exclusive classes with provided label: F1 for label
  * For 2+ exclusive classes: F1 macro average
  * For multilabel (not exclusive): ROC AUC macro average (currently
relying on sklearn)
* Provide user info on textcat evaluation settings, potential
incompatibilities
* Provide pipeline to Scorer in `Language.evaluate` for textcat config
* Customize train CLI output to include only metrics relevant to current
pipeline
* Add textcat evaluation to evaluate CLI

* Fix handling of unset arguments and config params

Fix handling of unset arguments and model config parameters in Scorer
initialization.

* Temporarily add sklearn requirement

* Remove sklearn version number

* Improve Scorer handling of models without textcats

* Fixing Scorer handling of models without textcats

* Update Scorer output for python 2.7

* Modify inf in Scorer for python 2.7

* Auto-format

Also make small adjustments to make auto-formatting with black easier and produce nicer results

* Move error message to Errors

* Update documentation

* Add cats to annotation JSON format [ci skip]

* Fix tpl flag and docs [ci skip]

* Switch to internal roc_auc_score

Switch to internal `roc_auc_score()` adapted from scikit-learn.

* Add AUCROCScore tests and improve errors/warnings

* Add tests for AUCROCScore and roc_auc_score
* Add missing error for only positive/negative values
* Remove unnecessary warnings and errors

* Make reduced roc_auc_score functions private

Because most of the checks and warnings have been stripped for the
internal functions and access is only intended through `ROCAUCScore`,
make the functions for roc_auc_score adapted from scikit-learn private.

* Check that data corresponds with multilabel flag

Check that the training instances correspond with the multilabel flag,
adding the multilabel flag if required.

* Add textcat score to early stopping check

* Add more checks to debug-data for textcat

* Add example training data for textcat

* Add more checks to textcat train CLI

* Check configuration when extending base model
* Fix typos

* Update textcat example data

* Provide licensing details and licenses for data
* Remove two labels with no positive instances from jigsaw-toxic-comment
data.


Co-authored-by: Ines Montani <ines@ines.io>
											
										
										
											2019-09-15 23:31:31 +03:00
+								            )
-												Refactor the Scorer to improve flexibility (#5731)

* Refactor the Scorer to improve flexibility

Refactor the `Scorer` to improve flexibility for arbitrary pipeline
components.

* Individual pipeline components provide their own `evaluate` methods
that score a list of `Example`s and return a dictionary of scores
* `Scorer` is initialized either:
  * with a provided pipeline containing components to be scored
  * with a default pipeline containing the built-in statistical
    components (senter, tagger, morphologizer, parser, ner)
* `Scorer.score` evaluates a list of `Example`s and returns a dictionary
of scores referring to the scores provided by the components in the
pipeline

Significant differences:

* `tags_acc` is renamed to `tag_acc` to be consistent with `token_acc`
and the new `morph_acc`, `pos_acc`, and `lemma_acc`
* Scoring is no longer cumulative: `Scorer.score` scores a list of
examples rather than a single example and does not retain any state
about previously scored examples
* PRF values in the returned scores are no longer multiplied by 100

* Add kwargs to Morphologizer.evaluate

* Create generalized scoring methods in Scorer

* Generalized static scoring methods are added to `Scorer`
  * Methods require an attribute (either on Token or Doc) that is
used to key the returned scores

Naming differences:

* `uas`, `las`, and `las_per_type` in the scores dict are renamed to
`dep_uas`, `dep_las`, and `dep_las_per_type`

Scoring differences:

* `Doc.sents` is now scored as spans rather than on sentence-initial
token positions so that `Doc.sents` and `Doc.ents` can be scored with
the same method (this lowers scores since a single incorrect sentence
start results in two incorrect spans)

* Simplify / extend hasattr check for eval method

* Add hasattr check to tokenizer scoring
* Simplify to hasattr check for component scoring

* Reset Example alignment if docs are set

Reset the Example alignment if either doc is set in case the
tokenization has changed.

* Add PRF tokenization scoring for tokens as spans

Add PRF scores for tokens as character spans. The scores are:

* token_acc: # correct tokens / # gold tokens
* token_p/r/f: PRF for (token.idx, token.idx + len(token))

* Add docstring to Scorer.score_tokenization

* Rename component.evaluate() to component.score()

* Update Scorer API docs

* Update scoring for positive_label in textcat

* Fix TextCategorizer.score kwargs

* Update Language.evaluate docs

* Update score names in default config
											
										
										
											2020-07-25 13:53:02 +03:00
+								        return {
 								            attr + "_uas": unlabelled.fscore,
 								            attr + "_las": labelled.fscore,
 								            attr
 								            + "_las_per_type": {k: v.to_dict() for k, v in labelled_per_dep.items()},
 								        }
-												Add textcat to train CLI (#4226)

* Add doc.cats to spacy.gold at the paragraph level

Support `doc.cats` as `"cats": [{"label": string, "value": number}]` in
the spacy JSON training format at the paragraph level.

* `spacy.gold.docs_to_json()` writes `docs.cats`

* `GoldCorpus` reads in cats in each `GoldParse`

* Update instances of gold_tuples to handle cats

Update iteration over gold_tuples / gold_parses to handle addition of
cats at the paragraph level.

* Add textcat to train CLI

* Add textcat options to train CLI
* Add textcat labels in `TextCategorizer.begin_training()`
* Add textcat evaluation to `Scorer`:
  * For binary exclusive classes with provided label: F1 for label
  * For 2+ exclusive classes: F1 macro average
  * For multilabel (not exclusive): ROC AUC macro average (currently
relying on sklearn)
* Provide user info on textcat evaluation settings, potential
incompatibilities
* Provide pipeline to Scorer in `Language.evaluate` for textcat config
* Customize train CLI output to include only metrics relevant to current
pipeline
* Add textcat evaluation to evaluate CLI

* Fix handling of unset arguments and config params

Fix handling of unset arguments and model config parameters in Scorer
initialization.

* Temporarily add sklearn requirement

* Remove sklearn version number

* Improve Scorer handling of models without textcats

* Fixing Scorer handling of models without textcats

* Update Scorer output for python 2.7

* Modify inf in Scorer for python 2.7

* Auto-format

Also make small adjustments to make auto-formatting with black easier and produce nicer results

* Move error message to Errors

* Update documentation

* Add cats to annotation JSON format [ci skip]

* Fix tpl flag and docs [ci skip]

* Switch to internal roc_auc_score

Switch to internal `roc_auc_score()` adapted from scikit-learn.

* Add AUCROCScore tests and improve errors/warnings

* Add tests for AUCROCScore and roc_auc_score
* Add missing error for only positive/negative values
* Remove unnecessary warnings and errors

* Make reduced roc_auc_score functions private

Because most of the checks and warnings have been stripped for the
internal functions and access is only intended through `ROCAUCScore`,
make the functions for roc_auc_score adapted from scikit-learn private.

* Check that data corresponds with multilabel flag

Check that the training instances correspond with the multilabel flag,
adding the multilabel flag if required.

* Add textcat score to early stopping check

* Add more checks to debug-data for textcat

* Add example training data for textcat

* Add more checks to textcat train CLI

* Check configuration when extending base model
* Fix typos

* Update textcat example data

* Provide licensing details and licenses for data
* Remove two labels with no positive instances from jigsaw-toxic-comment
data.


Co-authored-by: Ines Montani <ines@ines.io>
											
										
										
											2019-09-15 23:31:31 +03:00
 								#############################################################################
 								#
 								# The following implementation of roc_auc_score() is adapted from
 								# scikit-learn, which is distributed under the following license:
 								#
 								# New BSD License
 								#
 								# Copyright (c) 2007–2019 The scikit-learn developers.
 								# All rights reserved.
 								#
 								#
 								# Redistribution and use in source and binary forms, with or without
 								# modification, are permitted provided that the following conditions are met:
 								#
 								#   a. Redistributions of source code must retain the above copyright notice,
 								#      this list of conditions and the following disclaimer.
 								#   b. Redistributions in binary form must reproduce the above copyright
 								#      notice, this list of conditions and the following disclaimer in the
 								#      documentation and/or other materials provided with the distribution.
 								#   c. Neither the name of the Scikit-learn Developers  nor the names of
 								#      its contributors may be used to endorse or promote products
 								#      derived from this software without specific prior written
-												Auto-format

											
										
										
											2019-09-18 20:56:55 +03:00
+								#      permission.
-												Add textcat to train CLI (#4226)

* Add doc.cats to spacy.gold at the paragraph level

Support `doc.cats` as `"cats": [{"label": string, "value": number}]` in
the spacy JSON training format at the paragraph level.

* `spacy.gold.docs_to_json()` writes `docs.cats`

* `GoldCorpus` reads in cats in each `GoldParse`

* Update instances of gold_tuples to handle cats

Update iteration over gold_tuples / gold_parses to handle addition of
cats at the paragraph level.

* Add textcat to train CLI

* Add textcat options to train CLI
* Add textcat labels in `TextCategorizer.begin_training()`
* Add textcat evaluation to `Scorer`:
  * For binary exclusive classes with provided label: F1 for label
  * For 2+ exclusive classes: F1 macro average
  * For multilabel (not exclusive): ROC AUC macro average (currently
relying on sklearn)
* Provide user info on textcat evaluation settings, potential
incompatibilities
* Provide pipeline to Scorer in `Language.evaluate` for textcat config
* Customize train CLI output to include only metrics relevant to current
pipeline
* Add textcat evaluation to evaluate CLI

* Fix handling of unset arguments and config params

Fix handling of unset arguments and model config parameters in Scorer
initialization.

* Temporarily add sklearn requirement

* Remove sklearn version number

* Improve Scorer handling of models without textcats

* Fixing Scorer handling of models without textcats

* Update Scorer output for python 2.7

* Modify inf in Scorer for python 2.7

* Auto-format

Also make small adjustments to make auto-formatting with black easier and produce nicer results

* Move error message to Errors

* Update documentation

* Add cats to annotation JSON format [ci skip]

* Fix tpl flag and docs [ci skip]

* Switch to internal roc_auc_score

Switch to internal `roc_auc_score()` adapted from scikit-learn.

* Add AUCROCScore tests and improve errors/warnings

* Add tests for AUCROCScore and roc_auc_score
* Add missing error for only positive/negative values
* Remove unnecessary warnings and errors

* Make reduced roc_auc_score functions private

Because most of the checks and warnings have been stripped for the
internal functions and access is only intended through `ROCAUCScore`,
make the functions for roc_auc_score adapted from scikit-learn private.

* Check that data corresponds with multilabel flag

Check that the training instances correspond with the multilabel flag,
adding the multilabel flag if required.

* Add textcat score to early stopping check

* Add more checks to debug-data for textcat

* Add example training data for textcat

* Add more checks to textcat train CLI

* Check configuration when extending base model
* Fix typos

* Update textcat example data

* Provide licensing details and licenses for data
* Remove two labels with no positive instances from jigsaw-toxic-comment
data.


Co-authored-by: Ines Montani <ines@ines.io>
											
										
										
											2019-09-15 23:31:31 +03:00
+								#
 								#
 								# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
 								# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 								# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 								# ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE FOR
 								# ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 								# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
 								# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
 								# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 								# LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 								# OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH
 								# DAMAGE.
-												Auto-format

											
										
										
											2019-09-18 20:56:55 +03:00
-												Add textcat to train CLI (#4226)

* Add doc.cats to spacy.gold at the paragraph level

Support `doc.cats` as `"cats": [{"label": string, "value": number}]` in
the spacy JSON training format at the paragraph level.

* `spacy.gold.docs_to_json()` writes `docs.cats`

* `GoldCorpus` reads in cats in each `GoldParse`

* Update instances of gold_tuples to handle cats

Update iteration over gold_tuples / gold_parses to handle addition of
cats at the paragraph level.

* Add textcat to train CLI

* Add textcat options to train CLI
* Add textcat labels in `TextCategorizer.begin_training()`
* Add textcat evaluation to `Scorer`:
  * For binary exclusive classes with provided label: F1 for label
  * For 2+ exclusive classes: F1 macro average
  * For multilabel (not exclusive): ROC AUC macro average (currently
relying on sklearn)
* Provide user info on textcat evaluation settings, potential
incompatibilities
* Provide pipeline to Scorer in `Language.evaluate` for textcat config
* Customize train CLI output to include only metrics relevant to current
pipeline
* Add textcat evaluation to evaluate CLI

* Fix handling of unset arguments and config params

Fix handling of unset arguments and model config parameters in Scorer
initialization.

* Temporarily add sklearn requirement

* Remove sklearn version number

* Improve Scorer handling of models without textcats

* Fixing Scorer handling of models without textcats

* Update Scorer output for python 2.7

* Modify inf in Scorer for python 2.7

* Auto-format

Also make small adjustments to make auto-formatting with black easier and produce nicer results

* Move error message to Errors

* Update documentation

* Add cats to annotation JSON format [ci skip]

* Fix tpl flag and docs [ci skip]

* Switch to internal roc_auc_score

Switch to internal `roc_auc_score()` adapted from scikit-learn.

* Add AUCROCScore tests and improve errors/warnings

* Add tests for AUCROCScore and roc_auc_score
* Add missing error for only positive/negative values
* Remove unnecessary warnings and errors

* Make reduced roc_auc_score functions private

Because most of the checks and warnings have been stripped for the
internal functions and access is only intended through `ROCAUCScore`,
make the functions for roc_auc_score adapted from scikit-learn private.

* Check that data corresponds with multilabel flag

Check that the training instances correspond with the multilabel flag,
adding the multilabel flag if required.

* Add textcat score to early stopping check

* Add more checks to debug-data for textcat

* Add example training data for textcat

* Add more checks to textcat train CLI

* Check configuration when extending base model
* Fix typos

* Update textcat example data

* Provide licensing details and licenses for data
* Remove two labels with no positive instances from jigsaw-toxic-comment
data.


Co-authored-by: Ines Montani <ines@ines.io>
											
										
										
											2019-09-15 23:31:31 +03:00
+								def _roc_auc_score(y_true, y_score):
 								    """Compute Area Under the Receiver Operating Characteristic Curve (ROC AUC)
 								    from prediction scores.
 								    Note: this implementation is restricted to the binary classification task
 								    Parameters
 								    ----------
 								    y_true : array, shape = [n_samples] or [n_samples, n_classes]
 								        True binary labels or binary label indicators.
 								        The multiclass case expects shape = [n_samples] and labels
 								        with values in ``range(n_classes)``.
 								    y_score : array, shape = [n_samples] or [n_samples, n_classes]
 								        Target scores, can either be probability estimates of the positive
 								        class, confidence values, or non-thresholded measure of decisions
 								        (as returned by "decision_function" on some classifiers). For binary
 								        y_true, y_score is supposed to be the score of the class with greater
 								        label. The multiclass case expects shape = [n_samples, n_classes]
 								        where the scores correspond to probability estimates.
 								    Returns
 								    -------
 								    auc : float
 								    References
 								    ----------
 								    .. [1] `Wikipedia entry for the Receiver operating characteristic
 								            <https://en.wikipedia.org/wiki/Receiver_operating_characteristic>`_
 								    .. [2] Fawcett T. An introduction to ROC analysis[J]. Pattern Recognition
 								           Letters, 2006, 27(8):861-874.
 								    .. [3] `Analyzing a portion of the ROC curve. McClish, 1989
 								            <https://www.ncbi.nlm.nih.gov/pubmed/2668680>`_
 								    """
 								    if len(np.unique(y_true)) != 2:
 								        raise ValueError(Errors.E165)
 								    fpr, tpr, _ = _roc_curve(y_true, y_score)
 								    return _auc(fpr, tpr)
 								def _roc_curve(y_true, y_score):
 								    """Compute the points of a Receiver Operating Characteristic (ROC) curve.

 								    Note: this implementation is restricted to the binary classification task.

 								    Parameters
 								    ----------
 								    y_true : array, shape = [n_samples]
 								        True binary labels.
 								    y_score : array, shape = [n_samples]
 								        Target scores, can either be probability estimates of the positive
 								        class, confidence values, or non-thresholded measure of decisions
 								        (as returned by "decision_function" on some classifiers).

 								    Returns
 								    -------
 								    fpr : array
 								        False positive counts from ``_binary_clf_curve``, prefixed with 0
 								        and divided by their final value. Filled with NaN when that final
 								        value is not positive.
 								    tpr : array
 								        True positive counts from ``_binary_clf_curve``, prefixed with 0
 								        and divided by their final value. Filled with NaN when that final
 								        value is not positive.
 								    thresholds : array
 								        Thresholds from ``_binary_clf_curve`` with one extra leading entry
 								        equal to ``thresholds[0] + 1``; that entry pairs with the added
 								        (0, 0) starting point of the curve.

 								    References
 								    ----------
 								    .. [1] `Wikipedia entry for the Receiver operating characteristic
 								            <https://en.wikipedia.org/wiki/Receiver_operating_characteristic>`_
 								    .. [2] Fawcett T. An introduction to ROC analysis[J]. Pattern Recognition
 								           Letters, 2006, 27(8):861-874.
 								    """
 								    false_pos, true_pos, cutoffs = _binary_clf_curve(y_true, y_score)
 								    # Prepend one extra position so that the curve starts at (0, 0).
 								    false_pos = np.r_[0, false_pos]
 								    true_pos = np.r_[0, true_pos]
 								    cutoffs = np.r_[cutoffs[0] + 1, cutoffs]
 								    # The last cumulative count is the total used for normalisation;
 								    # without a positive total the rate is undefined, hence NaN.
 								    total_neg = false_pos[-1]
 								    total_pos = true_pos[-1]
 								    fpr = (
 								        np.repeat(np.nan, false_pos.shape)
 								        if total_neg <= 0
 								        else false_pos / total_neg
 								    )
 								    tpr = (
 								        np.repeat(np.nan, true_pos.shape)
 								        if total_pos <= 0
 								        else true_pos / total_pos
 								    )
 								    return fpr, tpr, cutoffs
 								def _binary_clf_curve(y_true, y_score):
 								    """Calculate true and false positives per binary classification threshold.
 								    Parameters
 								    ----------
 								    y_true : array, shape = [n_samples]
 								        True targets of binary classification
 								    y_score : array, shape = [n_samples]
 								        Estimated probabilities or decision function
 								    Returns
 								    -------
 								    fps : array, shape = [n_thresholds]
 								        A count of false positives, at index i being the number of negative
 								        samples assigned a score >= thresholds[i]. The total number of
 								        negative samples is equal to fps[-1] (thus true negatives are given by
 								        fps[-1] - fps).
 								    tps : array, shape = [n_thresholds <= len(np.unique(y_score))]
 								        An increasing count of true positives, at index i being the number
 								        of positive samples assigned a score >= thresholds[i]. The total
 								        number of positive samples is equal to tps[-1] (thus false negatives
 								        are given by tps[-1] - tps).
 								    thresholds : array, shape = [n_thresholds]
 								        Decreasing score values.
 								    """
-												Auto-format

											
										
										
											2019-09-18 20:56:55 +03:00
+								    pos_label = 1.0
-												Add textcat to train CLI (#4226)

* Add doc.cats to spacy.gold at the paragraph level

Support `doc.cats` as `"cats": [{"label": string, "value": number}]` in
the spacy JSON training format at the paragraph level.

* `spacy.gold.docs_to_json()` writes `docs.cats`

* `GoldCorpus` reads in cats in each `GoldParse`

* Update instances of gold_tuples to handle cats

Update iteration over gold_tuples / gold_parses to handle addition of
cats at the paragraph level.

* Add textcat to train CLI

* Add textcat options to train CLI
* Add textcat labels in `TextCategorizer.begin_training()`
* Add textcat evaluation to `Scorer`:
  * For binary exclusive classes with provided label: F1 for label
  * For 2+ exclusive classes: F1 macro average
  * For multilabel (not exclusive): ROC AUC macro average (currently
relying on sklearn)
* Provide user info on textcat evaluation settings, potential
incompatibilities
* Provide pipeline to Scorer in `Language.evaluate` for textcat config
* Customize train CLI output to include only metrics relevant to current
pipeline
* Add textcat evaluation to evaluate CLI

* Fix handling of unset arguments and config params

Fix handling of unset arguments and model config parameters in Scorer
initialization.

* Temporarily add sklearn requirement

* Remove sklearn version number

* Improve Scorer handling of models without textcats

* Fixing Scorer handling of models without textcats

* Update Scorer output for python 2.7

* Modify inf in Scorer for python 2.7

* Auto-format

Also make small adjustments to make auto-formatting with black easier and produce nicer results

* Move error message to Errors

* Update documentation

* Add cats to annotation JSON format [ci skip]

* Fix tpl flag and docs [ci skip]

* Switch to internal roc_auc_score

Switch to internal `roc_auc_score()` adapted from scikit-learn.

* Add AUCROCScore tests and improve errors/warnings

* Add tests for AUCROCScore and roc_auc_score
* Add missing error for only positive/negative values
* Remove unnecessary warnings and errors

* Make reduced roc_auc_score functions private

Because most of the checks and warnings have been stripped for the
internal functions and access is only intended through `ROCAUCScore`,
make the functions for roc_auc_score adapted from scikit-learn private.

* Check that data corresponds with multilabel flag

Check that the training instances correspond with the multilabel flag,
adding the multilabel flag if required.

* Add textcat score to early stopping check

* Add more checks to debug-data for textcat

* Add example training data for textcat

* Add more checks to textcat train CLI

* Check configuration when extending base model
* Fix typos

* Update textcat example data

* Provide licensing details and licenses for data
* Remove two labels with no positive instances from jigsaw-toxic-comment
data.


Co-authored-by: Ines Montani <ines@ines.io>
											
										
										
											2019-09-15 23:31:31 +03:00
 								    y_true = np.ravel(y_true)
 								    y_score = np.ravel(y_score)
 								    # make y_true a boolean vector
-												Auto-format

											
										
										
											2019-09-18 20:56:55 +03:00
+								    y_true = y_true == pos_label
-												Add textcat to train CLI (#4226)

* Add doc.cats to spacy.gold at the paragraph level

Support `doc.cats` as `"cats": [{"label": string, "value": number}]` in
the spacy JSON training format at the paragraph level.

* `spacy.gold.docs_to_json()` writes `docs.cats`

* `GoldCorpus` reads in cats in each `GoldParse`

* Update instances of gold_tuples to handle cats

Update iteration over gold_tuples / gold_parses to handle addition of
cats at the paragraph level.

* Add textcat to train CLI

* Add textcat options to train CLI
* Add textcat labels in `TextCategorizer.begin_training()`
* Add textcat evaluation to `Scorer`:
  * For binary exclusive classes with provided label: F1 for label
  * For 2+ exclusive classes: F1 macro average
  * For multilabel (not exclusive): ROC AUC macro average (currently
relying on sklearn)
* Provide user info on textcat evaluation settings, potential
incompatibilities
* Provide pipeline to Scorer in `Language.evaluate` for textcat config
* Customize train CLI output to include only metrics relevant to current
pipeline
* Add textcat evaluation to evaluate CLI

* Fix handling of unset arguments and config params

Fix handling of unset arguments and model config parameters in Scorer
initialization.

* Temporarily add sklearn requirement

* Remove sklearn version number

* Improve Scorer handling of models without textcats

* Fixing Scorer handling of models without textcats

* Update Scorer output for python 2.7

* Modify inf in Scorer for python 2.7

* Auto-format

Also make small adjustments to make auto-formatting with black easier and produce nicer results

* Move error message to Errors

* Update documentation

* Add cats to annotation JSON format [ci skip]

* Fix tpl flag and docs [ci skip]

* Switch to internal roc_auc_score

Switch to internal `roc_auc_score()` adapted from scikit-learn.

* Add AUCROCScore tests and improve errors/warnings

* Add tests for AUCROCScore and roc_auc_score
* Add missing error for only positive/negative values
* Remove unnecessary warnings and errors

* Make reduced roc_auc_score functions private

Because most of the checks and warnings have been stripped for the
internal functions and access is only intended through `ROCAUCScore`,
make the functions for roc_auc_score adapted from scikit-learn private.

* Check that data corresponds with multilabel flag

Check that the training instances correspond with the multilabel flag,
adding the multilabel flag if required.

* Add textcat score to early stopping check

* Add more checks to debug-data for textcat

* Add example training data for textcat

* Add more checks to textcat train CLI

* Check configuration when extending base model
* Fix typos

* Update textcat example data

* Provide licensing details and licenses for data
* Remove two labels with no positive instances from jigsaw-toxic-comment
data.


Co-authored-by: Ines Montani <ines@ines.io>
											
										
										
											2019-09-15 23:31:31 +03:00
 								    # sort scores and corresponding truth values
 								    desc_score_indices = np.argsort(y_score, kind="mergesort")[::-1]
 								    y_score = y_score[desc_score_indices]
 								    y_true = y_true[desc_score_indices]
-												Auto-format

											
										
										
											2019-09-18 20:56:55 +03:00
+								    weight = 1.0
-												Add textcat to train CLI (#4226)

* Add doc.cats to spacy.gold at the paragraph level

Support `doc.cats` as `"cats": [{"label": string, "value": number}]` in
the spacy JSON training format at the paragraph level.

* `spacy.gold.docs_to_json()` writes `docs.cats`

* `GoldCorpus` reads in cats in each `GoldParse`

* Update instances of gold_tuples to handle cats

Update iteration over gold_tuples / gold_parses to handle addition of
cats at the paragraph level.

* Add textcat to train CLI

* Add textcat options to train CLI
* Add textcat labels in `TextCategorizer.begin_training()`
* Add textcat evaluation to `Scorer`:
  * For binary exclusive classes with provided label: F1 for label
  * For 2+ exclusive classes: F1 macro average
  * For multilabel (not exclusive): ROC AUC macro average (currently
relying on sklearn)
* Provide user info on textcat evaluation settings, potential
incompatibilities
* Provide pipeline to Scorer in `Language.evaluate` for textcat config
* Customize train CLI output to include only metrics relevant to current
pipeline
* Add textcat evaluation to evaluate CLI

* Fix handling of unset arguments and config params

Fix handling of unset arguments and model config parameters in Scorer
initialization.

* Temporarily add sklearn requirement

* Remove sklearn version number

* Improve Scorer handling of models without textcats

* Fixing Scorer handling of models without textcats

* Update Scorer output for python 2.7

* Modify inf in Scorer for python 2.7

* Auto-format

Also make small adjustments to make auto-formatting with black easier and produce nicer results

* Move error message to Errors

* Update documentation

* Add cats to annotation JSON format [ci skip]

* Fix tpl flag and docs [ci skip]

* Switch to internal roc_auc_score

Switch to internal `roc_auc_score()` adapted from scikit-learn.

* Add AUCROCScore tests and improve errors/warnings

* Add tests for AUCROCScore and roc_auc_score
* Add missing error for only positive/negative values
* Remove unnecessary warnings and errors

* Make reduced roc_auc_score functions private

Because most of the checks and warnings have been stripped for the
internal functions and access is only intended through `ROCAUCScore`,
make the functions for roc_auc_score adapted from scikit-learn private.

* Check that data corresponds with multilabel flag

Check that the training instances correspond with the multilabel flag,
adding the multilabel flag if required.

* Add textcat score to early stopping check

* Add more checks to debug-data for textcat

* Add example training data for textcat

* Add more checks to textcat train CLI

* Check configuration when extending base model
* Fix typos

* Update textcat example data

* Provide licensing details and licenses for data
* Remove two labels with no positive instances from jigsaw-toxic-comment
data.


Co-authored-by: Ines Montani <ines@ines.io>
											
										
										
											2019-09-15 23:31:31 +03:00
 								    # y_score typically has many tied values. Here we extract
 								    # the indices associated with the distinct values. We also
 								    # concatenate a value for the end of the curve.
 								    distinct_value_indices = np.where(np.diff(y_score))[0]
 								    threshold_idxs = np.r_[distinct_value_indices, y_true.size - 1]
 								    # accumulate the true positives with decreasing threshold
 								    tps = _stable_cumsum(y_true * weight)[threshold_idxs]
 								    fps = 1 + threshold_idxs - tps
 								    return fps, tps, y_score[threshold_idxs]
 								def _stable_cumsum(arr, axis=None, rtol=1e-05, atol=1e-08):
 								    """Use high precision for cumsum and check that final value matches sum.
 								    Parameters
 								    ----------
 								    arr : array-like
 								        To be cumulatively summed as flat
 								    axis : int, optional
 								        Axis along which the cumulative sum is computed.
 								        The default (None) is to compute the cumsum over the flattened array.
 								    rtol : float
 								        Relative tolerance, see ``np.isclose``
 								    atol : float
 								        Absolute tolerance, see ``np.isclose``
 								    Returns
 								    -------
 								    out : ndarray
 								        Cumulative sum of ``arr``, accumulated as float64.
 								    Raises
 								    ------
 								    ValueError
 								        Raised with ``Errors.E163`` if the last cumulative value is not
 								        close (within ``rtol`` / ``atol``) to the directly computed sum.
 								    """
 								    # Accumulate in float64 regardless of the input dtype.
 								    out = np.cumsum(arr, axis=axis, dtype=np.float64)
 								    # Reference total computed directly; compared below against the last
 								    # element of the cumulative sum.
 								    expected = np.sum(arr, axis=axis, dtype=np.float64)
-												Auto-format

											
										
										
											2019-09-18 20:56:55 +03:00
+								    if not np.all(
 								        np.isclose(
 								            out.take(-1, axis=axis), expected, rtol=rtol, atol=atol, equal_nan=True
 								        )
 								    ):
-												Add textcat to train CLI (#4226)

* Add doc.cats to spacy.gold at the paragraph level

Support `doc.cats` as `"cats": [{"label": string, "value": number}]` in
the spacy JSON training format at the paragraph level.

* `spacy.gold.docs_to_json()` writes `docs.cats`

* `GoldCorpus` reads in cats in each `GoldParse`

* Update instances of gold_tuples to handle cats

Update iteration over gold_tuples / gold_parses to handle addition of
cats at the paragraph level.

* Add textcat to train CLI

* Add textcat options to train CLI
* Add textcat labels in `TextCategorizer.begin_training()`
* Add textcat evaluation to `Scorer`:
  * For binary exclusive classes with provided label: F1 for label
  * For 2+ exclusive classes: F1 macro average
  * For multilabel (not exclusive): ROC AUC macro average (currently
relying on sklearn)
* Provide user info on textcat evaluation settings, potential
incompatibilities
* Provide pipeline to Scorer in `Language.evaluate` for textcat config
* Customize train CLI output to include only metrics relevant to current
pipeline
* Add textcat evaluation to evaluate CLI

* Fix handling of unset arguments and config params

Fix handling of unset arguments and model config parameters in Scorer
initialization.

* Temporarily add sklearn requirement

* Remove sklearn version number

* Improve Scorer handling of models without textcats

* Fixing Scorer handling of models without textcats

* Update Scorer output for python 2.7

* Modify inf in Scorer for python 2.7

* Auto-format

Also make small adjustments to make auto-formatting with black easier and produce nicer results

* Move error message to Errors

* Update documentation

* Add cats to annotation JSON format [ci skip]

* Fix tpl flag and docs [ci skip]

* Switch to internal roc_auc_score

Switch to internal `roc_auc_score()` adapted from scikit-learn.

* Add AUCROCScore tests and improve errors/warnings

* Add tests for AUCROCScore and roc_auc_score
* Add missing error for only positive/negative values
* Remove unnecessary warnings and errors

* Make reduced roc_auc_score functions private

Because most of the checks and warnings have been stripped for the
internal functions and access is only intended through `ROCAUCScore`,
make the functions for roc_auc_score adapted from scikit-learn private.

* Check that data corresponds with multilabel flag

Check that the training instances correspond with the multilabel flag,
adding the multilabel flag if required.

* Add textcat score to early stopping check

* Add more checks to debug-data for textcat

* Add example training data for textcat

* Add more checks to textcat train CLI

* Check configuration when extending base model
* Fix typos

* Update textcat example data

* Provide licensing details and licenses for data
* Remove two labels with no positive instances from jigsaw-toxic-comment
data.


Co-authored-by: Ines Montani <ines@ines.io>
											
										
										
											2019-09-15 23:31:31 +03:00
+								        raise ValueError(Errors.E163)
 								    return out
 								def _auc(x, y):
 								    """Compute Area Under the Curve (AUC) using the trapezoidal rule
 								    This is a general function, given points on a curve.  For computing the
 								    area under the ROC-curve, see :func:`roc_auc_score`.
 								    Parameters
 								    ----------
 								    x : array, shape = [n]
 								        x coordinates. These must be either monotonic increasing or monotonic
 								        decreasing.
 								    y : array, shape = [n]
 								        y coordinates.
 								    Returns
 								    -------
 								    auc : float
 								    """
 								    x = np.ravel(x)
 								    y = np.ravel(y)
 								    direction = 1
 								    dx = np.diff(x)
 								    if np.any(dx < 0):
 								        if np.all(dx <= 0):
 								            direction = -1
 								        else:
 								            raise ValueError(Errors.E164.format(x))
 								    area = direction * np.trapz(y, x)
 								    if isinstance(area, np.memmap):
 								        # Reductions such as .sum used internally in np.trapz do not return a
 								        # scalar by default for numpy.memmap instances contrary to
 								        # regular numpy.ndarray instances.
 								        area = area.dtype.type(area)
 								    return area