TextCat updates and fixes (#6263)

* small fix in example imports

* throw error when train_corpus or dev_corpus is not a string

* small fix in custom logger example

* limit macro_auc to labels with 2 annotations

* fix typo

* also create parents of output_dir if need be

* update documentation of textcat scores

* refactor TextCatEnsemble

* fix tests for new AUC definition

* bump to 3.0.0a42

* update docs

* rename to spacy.TextCatEnsemble.v2

* spacy.TextCatEnsemble.v1 in legacy

* cleanup

* small fix

* update to 3.0.0rc2

* fix import that got lost in merge

* cursed IDE

* fix two typos
Sofie Van Landeghem, 2020-10-18 14:50:41 +02:00 (committed by GitHub)
parent e2f3c4e12d, commit 75a202ce65
20 changed files with 235 additions and 127 deletions

View File

@@ -1,6 +1,6 @@
 # fmt: off
 __title__ = "spacy-nightly"
-__version__ = "3.0.0rc1"
+__version__ = "3.0.0rc2"
 __download_url__ = "https://github.com/explosion/spacy-models/releases/download"
 __compatibility__ = "https://raw.githubusercontent.com/explosion/spacy-models/master/compatibility.json"
 __projects__ = "https://github.com/explosion/projects"

View File

@@ -100,7 +100,7 @@ def init_labels_cli(
     extract the labels."""
     util.logger.setLevel(logging.DEBUG if verbose else logging.INFO)
     if not output_path.exists():
-        output_path.mkdir()
+        output_path.mkdir(parents=True)
     overrides = parse_config_overrides(ctx.args)
     import_code(code_path)
     setup_gpu(use_gpu)
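For illustration, a minimal sketch of what `parents=True` changes here (the nested path is hypothetical):

```python
from pathlib import Path

# Without parents=True, mkdir() raises FileNotFoundError when intermediate
# directories are missing; with parents=True they are created as well.
output_path = Path("training/run1/labels")
if not output_path.exists():
    output_path.mkdir(parents=True)
```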

View File

@@ -136,15 +136,19 @@ factory = "textcat"
 {% if optimize == "accuracy" %}
 [components.textcat.model]
-@architectures = "spacy.TextCatEnsemble.v1"
-exclusive_classes = false
-width = 64
-conv_depth = 2
-embed_size = 2000
-window_size = 1
-ngram_size = 1
+@architectures = "spacy.TextCatEnsemble.v2"
 nO = null

+[components.textcat.model.tok2vec]
+@architectures = "spacy-transformers.TransformerListener.v1"
+grad_factor = 1.0
+
+[components.textcat.model.linear_model]
+@architectures = "spacy.TextCatBOW.v1"
+exclusive_classes = false
+ngram_size = 1
+no_output_layer = false
+
 {% else -%}
 [components.textcat.model]
 @architectures = "spacy.TextCatBOW.v1"

@@ -271,15 +275,19 @@ factory = "textcat"
 {% if optimize == "accuracy" %}
 [components.textcat.model]
-@architectures = "spacy.TextCatEnsemble.v1"
-exclusive_classes = false
-width = 64
-conv_depth = 2
-embed_size = 2000
-window_size = 1
-ngram_size = 1
+@architectures = "spacy.TextCatEnsemble.v2"
 nO = null

+[components.textcat.model.tok2vec]
+@architectures = "spacy.Tok2VecListener.v1"
+width = ${components.tok2vec.model.encode.width}
+
+[components.textcat.model.linear_model]
+@architectures = "spacy.TextCatBOW.v1"
+exclusive_classes = false
+ngram_size = 1
+no_output_layer = false
+
 {% else -%}
 [components.textcat.model]
 @architectures = "spacy.TextCatBOW.v1"

View File

@@ -44,7 +44,7 @@ def train_cli(
     if not config_path or not config_path.exists():
         msg.fail("Config file not found", config_path, exits=1)
     if output_path is not None and not output_path.exists():
-        output_path.mkdir()
+        output_path.mkdir(parents=True)
         msg.good(f"Created output directory: {output_path}")
     overrides = parse_config_overrides(ctx.args)
     import_code(code_path)

View File

@@ -398,8 +398,8 @@ class Errors:
     E163 = ("cumsum was found to be unstable: its last element does not "
             "correspond to sum")
     E164 = ("x is neither increasing nor decreasing: {x}.")
-    E165 = ("Only one class present in y_true. ROC AUC score is not defined in "
-            "that case.")
+    E165 = ("Only one class present in the gold labels: {label}. "
+            "ROC AUC score is not defined in that case.")
     E166 = ("Can only merge DocBins with the same value for '{param}'.\n"
             "Current DocBin: {current}\nOther DocBin: {other}")
     E169 = ("Can't find module: {module}")

@@ -456,6 +456,8 @@ class Errors:
             "issue tracker: http://github.com/explosion/spaCy/issues")

     # TODO: fix numbering after merging develop into master
+    E897 = ("Field '{field}' should be a dot-notation string referring to the "
+            "relevant section in the config, but found type {type} instead.")
    E898 = ("Can't serialize trainable pipe '{name}': the `model` attribute "
             "is not set or None. If you've implemented a custom component, make "
             "sure to store the component model as `self.model` in your "

View File

@@ -1,4 +1,6 @@
-from typing import Optional
+from typing import Optional, List
+from thinc.types import Floats2d
 from thinc.api import Model, reduce_mean, Linear, list2ragged, Logistic
 from thinc.api import chain, concatenate, clone, Dropout, ParametricAttention
 from thinc.api import SparseLinear, Softmax, softmax_activation, Maxout, reduce_sum

@@ -10,12 +12,13 @@ from ...util import registry
 from ..extract_ngrams import extract_ngrams
 from ..staticvectors import StaticVectors
 from ..featureextractor import FeatureExtractor
+from ...tokens import Doc


 @registry.architectures.register("spacy.TextCatCNN.v1")
 def build_simple_cnn_text_classifier(
     tok2vec: Model, exclusive_classes: bool, nO: Optional[int] = None
-) -> Model:
+) -> Model[List[Doc], Floats2d]:
     """
     Build a simple CNN text classifier, given a token-to-vector model as inputs.
     If exclusive_classes=True, a softmax non-linearity is applied, so that the

@@ -23,15 +26,14 @@ def build_simple_cnn_text_classifier(
     is applied instead, so that outputs are in the range [0, 1].
     """
     with Model.define_operators({">>": chain}):
+        cnn = tok2vec >> list2ragged() >> reduce_mean()
         if exclusive_classes:
             output_layer = Softmax(nO=nO, nI=tok2vec.maybe_get_dim("nO"))
-            model = tok2vec >> list2ragged() >> reduce_mean() >> output_layer
+            model = cnn >> output_layer
             model.set_ref("output_layer", output_layer)
         else:
             linear_layer = Linear(nO=nO, nI=tok2vec.maybe_get_dim("nO"))
-            model = (
-                tok2vec >> list2ragged() >> reduce_mean() >> linear_layer >> Logistic()
-            )
+            model = cnn >> linear_layer >> Logistic()
             model.set_ref("output_layer", linear_layer)
     model.set_ref("tok2vec", tok2vec)
     model.set_dim("nO", nO)

@@ -45,8 +47,7 @@ def build_bow_text_classifier(
     ngram_size: int,
     no_output_layer: bool,
     nO: Optional[int] = None,
-) -> Model:
-    # Don't document this yet, I'm not sure it's right.
+) -> Model[List[Doc], Floats2d]:
     with Model.define_operators({">>": chain}):
         sparse_linear = SparseLinear(nO)
         model = extract_ngrams(ngram_size, attr=ORTH) >> sparse_linear

@@ -59,6 +60,39 @@ def build_bow_text_classifier(
     return model


+@registry.architectures.register("spacy.TextCatEnsemble.v2")
+def build_text_classifier(
+    tok2vec: Model[List[Doc], List[Floats2d]],
+    linear_model: Model[List[Doc], Floats2d],
+    nO: Optional[int] = None,
+) -> Model[List[Doc], Floats2d]:
+    exclusive_classes = not linear_model.attrs["multi_label"]
+    with Model.define_operators({">>": chain, "|": concatenate}):
+        width = tok2vec.get_dim("nO")
+        cnn_model = (
+            tok2vec
+            >> list2ragged()
+            >> ParametricAttention(width)  # TODO: benchmark performance difference of this layer
+            >> reduce_sum()
+            >> residual(Maxout(nO=width, nI=width))
+            >> Linear(nO=nO, nI=width)
+            >> Dropout(0.0)
+        )
+        nO_double = nO * 2 if nO else None
+        if exclusive_classes:
+            output_layer = Softmax(nO=nO, nI=nO_double)
+        else:
+            output_layer = Linear(nO=nO, nI=nO_double) >> Dropout(0.0) >> Logistic()
+        model = (linear_model | cnn_model) >> output_layer
+        model.set_ref("tok2vec", tok2vec)
+    if model.has_dim("nO") is not False:
+        model.set_dim("nO", nO)
+    model.set_ref("output_layer", linear_model.get_ref("output_layer"))
+    model.attrs["multi_label"] = not exclusive_classes
+    return model
+
+
+# TODO: move to legacy
 @registry.architectures.register("spacy.TextCatEnsemble.v1")
 def build_text_classifier(
     width: int,

@@ -158,11 +192,8 @@ def build_text_classifier(
 @registry.architectures.register("spacy.TextCatLowData.v1")
 def build_text_classifier_lowdata(
-    width: int,
-    pretrained_vectors: Optional[bool],
-    dropout: Optional[float],
-    nO: Optional[int] = None,
-) -> Model:
+    width: int, dropout: Optional[float], nO: Optional[int] = None
+) -> Model[List[Doc], Floats2d]:
     # Don't document this yet, I'm not sure it's right.
     # Note, before v.3, this was the default if setting "low_data" and "pretrained_dims"
     with Model.define_operators({">>": chain, "**": clone}):
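For context, a minimal sketch of wiring up the new `spacy.TextCatEnsemble.v2` from Python; the settings mirror the test configs further down in this commit and are illustrative only:

```python
from spacy.lang.en import English
from spacy.pipeline.tok2vec import DEFAULT_TOK2VEC_MODEL

# v2 receives its tok2vec and linear model as arguments instead of building
# them internally; multi-label behaviour is inherited from the linear model.
config = {
    "model": {
        "@architectures": "spacy.TextCatEnsemble.v2",
        "tok2vec": DEFAULT_TOK2VEC_MODEL,
        "linear_model": {
            "@architectures": "spacy.TextCatBOW.v1",
            "exclusive_classes": False,
            "ngram_size": 1,
            "no_output_layer": False,
        },
    }
}
nlp = English()
textcat = nlp.add_pipe("textcat", config=config)
```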

View File

@@ -106,7 +106,7 @@ def MultiHashEmbed(
 ) -> Model[List[Doc], List[Floats2d]]:
     """Construct an embedding layer that separately embeds a number of lexical
     attributes using hash embedding, concatenates the results, and passes it
-    through a feed-forward subnetwork to build a mixed representations.
+    through a feed-forward subnetwork to build a mixed representation.

     The features used can be configured with the 'attrs' argument. The suggested
     attributes are NORM, PREFIX, SUFFIX and SHAPE. This lets the model take into

View File

@@ -16,15 +16,30 @@ from ..vocab import Vocab

 default_model_config = """
 [model]
-@architectures = "spacy.TextCatEnsemble.v1"
-exclusive_classes = false
-pretrained_vectors = null
-width = 64
-conv_depth = 2
-embed_size = 2000
-window_size = 1
-ngram_size = 1
-dropout = null
+@architectures = "spacy.TextCatEnsemble.v2"
+
+[model.tok2vec]
+@architectures = "spacy.Tok2Vec.v1"
+
+[model.tok2vec.embed]
+@architectures = "spacy.MultiHashEmbed.v1"
+width = 64
+rows = [2000, 2000, 1000, 1000, 1000, 1000]
+attrs = ["ORTH", "LOWER", "PREFIX", "SUFFIX", "SHAPE", "ID"]
+include_static_vectors = false
+
+[model.tok2vec.encode]
+@architectures = "spacy.MaxoutWindowEncoder.v1"
+width = ${model.tok2vec.embed.width}
+window_size = 1
+maxout_pieces = 3
+depth = 2
+
+[model.linear_model]
+@architectures = "spacy.TextCatBOW.v1"
+exclusive_classes = false
+ngram_size = 1
+no_output_layer = false
 """
 DEFAULT_TEXTCAT_MODEL = Config().from_str(default_model_config)["model"]

@@ -60,9 +75,11 @@ subword_features = true
     default_score_weights={
         "cats_score": 1.0,
         "cats_score_desc": None,
-        "cats_p": None,
-        "cats_r": None,
-        "cats_f": None,
+        "cats_micro_p": None,
+        "cats_micro_r": None,
+        "cats_micro_f": None,
+        "cats_macro_p": None,
+        "cats_macro_r": None,
         "cats_macro_f": None,
         "cats_macro_auc": None,
         "cats_f_per_type": None,

View File

@@ -59,7 +59,9 @@ class PRFScore:

 class ROCAUCScore:
-    """An AUC ROC score."""
+    """An AUC ROC score. This is only defined for binary classification.
+    Use the method is_binary before calculating the score, otherwise it
+    may throw an error."""

     def __init__(self) -> None:
         self.golds = []

@@ -71,16 +73,16 @@ class ROCAUCScore:
         self.cands.append(cand)
         self.golds.append(gold)

+    def is_binary(self):
+        return len(np.unique(self.golds)) == 2
+
     @property
     def score(self):
+        if not self.is_binary():
+            raise ValueError(Errors.E165.format(label=set(self.golds)))
         if len(self.golds) == self.saved_score_at_len:
             return self.saved_score
-        try:
-            self.saved_score = _roc_auc_score(self.golds, self.cands)
-        # catch ValueError: Only one class present in y_true.
-        # ROC AUC score is not defined in that case.
-        except ValueError:
-            self.saved_score = -float("inf")
+        self.saved_score = _roc_auc_score(self.golds, self.cands)
         self.saved_score_at_len = len(self.golds)
         return self.saved_score

@@ -362,9 +364,13 @@ class Scorer:
         for all:
         attr_score (one of attr_micro_f / attr_macro_f / attr_macro_auc),
         attr_score_desc (text description of the overall score),
+        attr_micro_p,
+        attr_micro_r,
         attr_micro_f,
+        attr_macro_p,
+        attr_macro_r,
         attr_macro_f,
-        attr_auc,
+        attr_macro_auc,
         attr_f_per_type,
         attr_auc_per_type

@@ -431,7 +437,9 @@ class Scorer:
         macro_p = sum(prf.precision for prf in f_per_type.values()) / n_cats
         macro_r = sum(prf.recall for prf in f_per_type.values()) / n_cats
         macro_f = sum(prf.fscore for prf in f_per_type.values()) / n_cats
-        macro_auc = sum(auc.score for auc in auc_per_type.values()) / n_cats
+        # Limit macro_auc to those labels with gold annotations,
+        # but still divide by all cats to avoid artificial boosting of datasets with missing labels
+        macro_auc = sum(auc.score if auc.is_binary() else 0.0 for auc in auc_per_type.values()) / n_cats
         results = {
             f"{attr}_score": None,
             f"{attr}_score_desc": None,

@@ -443,7 +451,7 @@ class Scorer:
             f"{attr}_macro_f": macro_f,
             f"{attr}_macro_auc": macro_auc,
             f"{attr}_f_per_type": {k: v.to_dict() for k, v in f_per_type.items()},
-            f"{attr}_auc_per_type": {k: v.score for k, v in auc_per_type.items()},
+            f"{attr}_auc_per_type": {k: v.score if v.is_binary() else None for k, v in auc_per_type.items()},
         }
         if len(labels) == 2 and not multi_label and positive_label:
             positive_label_f = results[f"{attr}_f_per_type"][positive_label]["f"]

@@ -726,7 +734,7 @@ def _roc_auc_score(y_true, y_score):
     <https://www.ncbi.nlm.nih.gov/pubmed/2668680>`_
     """
     if len(np.unique(y_true)) != 2:
-        raise ValueError(Errors.E165)
+        raise ValueError(Errors.E165.format(label=np.unique(y_true)))
     fpr, tpr, _ = _roc_curve(y_true, y_score)
     return _auc(fpr, tpr)
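As a usage sketch (scores and labels invented): the new `is_binary()` check lets callers guard the `score` property, which now raises E165 instead of silently returning `-inf`:

```python
from spacy.scorer import ROCAUCScore

score = ROCAUCScore()
score.score_set(0.25, 1)  # (candidate score, gold label)
score.score_set(0.75, 1)  # only one class among the golds -> AUC undefined
if score.is_binary():
    print(score.score)
else:
    print("ROC AUC undefined: only one class present in the gold labels")
```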

View File

@@ -2,6 +2,7 @@ import pytest
 from spacy.language import Language
 from spacy.lang.en import English
 from spacy.lang.de import German
+from spacy.pipeline.tok2vec import DEFAULT_TOK2VEC_MODEL
 from spacy.tokens import Doc
 from spacy.util import registry, SimpleFrozenDict, combine_score_weights
 from thinc.api import Model, Linear, ConfigValidationError

@@ -156,15 +157,10 @@ def test_pipe_class_component_model():
     name = "test_class_component_model"
     default_config = {
         "model": {
-            "@architectures": "spacy.TextCatEnsemble.v1",
-            "exclusive_classes": False,
-            "pretrained_vectors": None,
-            "width": 64,
-            "embed_size": 2000,
-            "window_size": 1,
-            "conv_depth": 2,
-            "ngram_size": 1,
-            "dropout": None,
+            "@architectures": "spacy.TextCatEnsemble.v2",
+            "tok2vec": DEFAULT_TOK2VEC_MODEL,
+            "linear_model": {"@architectures": "spacy.TextCatBOW.v1", "exclusive_classes": False, "ngram_size": 1,
+                             "no_output_layer": False},
         },
         "value1": 10,
     }

View File

@@ -140,7 +140,7 @@ def test_overfitting_IO():
     nlp = English()
     nlp.config["initialize"]["components"]["textcat"] = {"positive_label": "POSITIVE"}
     # Set exclusive labels
-    config = {"model": {"exclusive_classes": True}}
+    config = {"model": {"linear_model": {"exclusive_classes": True}}}
     textcat = nlp.add_pipe("textcat", config=config)
     train_examples = []
     for text, annotations in TRAIN_DATA:

@@ -192,9 +192,8 @@ def test_overfitting_IO():
         {"@architectures": "spacy.TextCatBOW.v1", "exclusive_classes": True, "ngram_size": 4, "no_output_layer": False},
         {"@architectures": "spacy.TextCatBOW.v1", "exclusive_classes": False, "ngram_size": 3, "no_output_layer": True},
         {"@architectures": "spacy.TextCatBOW.v1", "exclusive_classes": True, "ngram_size": 2, "no_output_layer": True},
-        {"@architectures": "spacy.TextCatEnsemble.v1", "exclusive_classes": False, "ngram_size": 1, "pretrained_vectors": False, "width": 64, "conv_depth": 2, "embed_size": 2000, "window_size": 2, "dropout": None},
-        {"@architectures": "spacy.TextCatEnsemble.v1", "exclusive_classes": True, "ngram_size": 5, "pretrained_vectors": False, "width": 128, "conv_depth": 2, "embed_size": 2000, "window_size": 1, "dropout": None},
-        {"@architectures": "spacy.TextCatEnsemble.v1", "exclusive_classes": True, "ngram_size": 2, "pretrained_vectors": False, "width": 32, "conv_depth": 3, "embed_size": 500, "window_size": 3, "dropout": None},
+        {"@architectures": "spacy.TextCatEnsemble.v2", "tok2vec": DEFAULT_TOK2VEC_MODEL, "linear_model": {"@architectures": "spacy.TextCatBOW.v1", "exclusive_classes": False, "ngram_size": 1, "no_output_layer": False}},
+        {"@architectures": "spacy.TextCatEnsemble.v2", "tok2vec": DEFAULT_TOK2VEC_MODEL, "linear_model": {"@architectures": "spacy.TextCatBOW.v1", "exclusive_classes": True, "ngram_size": 5, "no_output_layer": False}},
         {"@architectures": "spacy.TextCatCNN.v1", "tok2vec": DEFAULT_TOK2VEC_MODEL, "exclusive_classes": True},
         {"@architectures": "spacy.TextCatCNN.v1", "tok2vec": DEFAULT_TOK2VEC_MODEL, "exclusive_classes": False},
     ],

View File

@@ -4,32 +4,23 @@ from thinc.api import fix_random_seed, Adam, set_dropout_rate
 from numpy.testing import assert_array_equal
 import numpy
 from spacy.ml.models import build_Tok2Vec_model, MultiHashEmbed, MaxoutWindowEncoder
-from spacy.ml.models import build_text_classifier, build_simple_cnn_text_classifier
+from spacy.ml.models import build_bow_text_classifier, build_simple_cnn_text_classifier
 from spacy.ml.staticvectors import StaticVectors
 from spacy.lang.en import English
 from spacy.lang.en.examples import sentences as EN_SENTENCES


-def get_textcat_kwargs():
+def get_textcat_bow_kwargs():
     return {
-        "width": 64,
-        "embed_size": 2000,
-        "pretrained_vectors": None,
-        "exclusive_classes": False,
-        "ngram_size": 1,
-        "window_size": 1,
-        "conv_depth": 2,
-        "dropout": None,
-        "nO": 7,
+        "exclusive_classes": True,
+        "ngram_size": 1,
+        "no_output_layer": False,
+        "nO": 34,
     }


 def get_textcat_cnn_kwargs():
-    return {
-        "tok2vec": test_tok2vec(),
-        "exclusive_classes": False,
-        "nO": 13,
-    }
+    return {"tok2vec": test_tok2vec(), "exclusive_classes": False, "nO": 13}


 def get_all_params(model):

@@ -105,7 +96,7 @@ def test_multi_hash_embed():
     "seed,model_func,kwargs",
     [
         (0, build_Tok2Vec_model, get_tok2vec_kwargs()),
-        (0, build_text_classifier, get_textcat_kwargs()),
+        (0, build_bow_text_classifier, get_textcat_bow_kwargs()),
         (0, build_simple_cnn_text_classifier, get_textcat_cnn_kwargs()),
     ],
 )

@@ -125,7 +116,7 @@ def test_models_initialize_consistently(seed, model_func, kwargs):
     "seed,model_func,kwargs,get_X",
     [
         (0, build_Tok2Vec_model, get_tok2vec_kwargs(), get_docs),
-        (0, build_text_classifier, get_textcat_kwargs(), get_docs),
+        (0, build_bow_text_classifier, get_textcat_bow_kwargs(), get_docs),
         (0, build_simple_cnn_text_classifier, get_textcat_cnn_kwargs(), get_docs),
     ],
 )

@@ -160,7 +151,7 @@ def test_models_predict_consistently(seed, model_func, kwargs, get_X):
     "seed,dropout,model_func,kwargs,get_X",
     [
         (0, 0.2, build_Tok2Vec_model, get_tok2vec_kwargs(), get_docs),
-        (0, 0.2, build_text_classifier, get_textcat_kwargs(), get_docs),
+        (0, 0.2, build_bow_text_classifier, get_textcat_bow_kwargs(), get_docs),
         (0, 0.2, build_simple_cnn_text_classifier, get_textcat_cnn_kwargs(), get_docs),
     ],
 )

View File

@@ -334,7 +334,8 @@ def test_roc_auc_score():
     score = ROCAUCScore()
     score.score_set(0.25, 0)
     score.score_set(0.75, 0)
-    assert score.score == -float("inf")
+    with pytest.raises(ValueError):
+        s = score.score

     y_true = [1, 1]
     y_score = [0.25, 0.75]

@@ -344,4 +345,5 @@ def test_roc_auc_score():
     score = ROCAUCScore()
     score.score_set(0.25, 1)
     score.score_set(0.75, 1)
-    assert score.score == -float("inf")
+    with pytest.raises(ValueError):
+        s = score.score

View File

@@ -51,7 +51,7 @@ def test_readers():
     for example in train_corpus(nlp):
         nlp.update([example], sgd=optimizer)
     scores = nlp.evaluate(list(dev_corpus(nlp)))
-    assert scores["cats_score"]
+    assert scores["cats_score"] == 0.0
     # ensure the pipeline runs
     doc = nlp("Quick test")
     assert doc.cats

View File

@@ -36,6 +36,10 @@ def init_nlp(config: Config, *, use_gpu: int = -1) -> "Language":
     # Resolve all training-relevant sections using the filled nlp config
     T = registry.resolve(config["training"], schema=ConfigSchemaTraining)
     dot_names = [T["train_corpus"], T["dev_corpus"]]
+    if not isinstance(T["train_corpus"], str):
+        raise ConfigValidationError(desc=Errors.E897.format(field="training.train_corpus", type=type(T["train_corpus"])))
+    if not isinstance(T["dev_corpus"], str):
+        raise ConfigValidationError(desc=Errors.E897.format(field="training.dev_corpus", type=type(T["dev_corpus"])))
     train_corpus, dev_corpus = resolve_dot_names(config, dot_names)
     optimizer = T["optimizer"]
     # Components that shouldn't be updated during training

View File

@@ -143,7 +143,7 @@ argument that connects to the shared `tok2vec` component in the pipeline.

 Construct an embedding layer that separately embeds a number of lexical
 attributes using hash embedding, concatenates the results, and passes it through
-a feed-forward subnetwork to build a mixed representations. The features used
+a feed-forward subnetwork to build a mixed representation. The features used
 can be configured with the `attrs` argument. The suggested attributes are
 `NORM`, `PREFIX`, `SUFFIX` and `SHAPE`. This lets the model take into account
 some subword information, without construction a fully character-based

@@ -516,26 +516,54 @@ several different built-in architectures. It is recommended to experiment with
 different architectures and settings to determine what works best on your
 specific data and challenge.

-### spacy.TextCatEnsemble.v1 {#TextCatEnsemble}
+### spacy.TextCatEnsemble.v2 {#TextCatEnsemble}

 > #### Example Config
 >
 > ```ini
 > [model]
-> @architectures = "spacy.TextCatEnsemble.v1"
-> exclusive_classes = false
-> pretrained_vectors = null
-> width = 64
-> embed_size = 2000
-> conv_depth = 2
-> window_size = 1
-> ngram_size = 1
-> dropout = null
+> @architectures = "spacy.TextCatEnsemble.v2"
 > nO = null
+>
+> [model.linear_model]
+> @architectures = "spacy.TextCatBOW.v1"
+> exclusive_classes = true
+> ngram_size = 1
+> no_output_layer = false
+>
+> [model.tok2vec]
+> @architectures = "spacy.Tok2Vec.v1"
+>
+> [model.tok2vec.embed]
+> @architectures = "spacy.MultiHashEmbed.v1"
+> width = 64
+> rows = [2000, 2000, 1000, 1000, 1000, 1000]
+> attrs = ["ORTH", "LOWER", "PREFIX", "SUFFIX", "SHAPE", "ID"]
+> include_static_vectors = false
+>
+> [model.tok2vec.encode]
+> @architectures = "spacy.MaxoutWindowEncoder.v1"
+> width = ${model.tok2vec.embed.width}
+> window_size = 1
+> maxout_pieces = 3
+> depth = 2
 > ```

-Stacked ensemble of a bag-of-words model and a neural network model. The neural
-network has an internal CNN Tok2Vec layer and uses attention.
+Stacked ensemble of a linear bag-of-words model and a neural network model. The
+neural network is built upon a Tok2Vec layer and uses attention. The setting for
+whether or not this model should cater for multi-label classification, is taken
+from the linear model, where it is stored in `model.attrs["multi_label"]`.
+
+| Name           | Description |
+| -------------- | ----------- |
+| `linear_model` | The linear bag-of-words model. ~~Model[List[Doc], Floats2d]~~ |
+| `tok2vec`      | The `tok2vec` layer to build the neural network upon. ~~Model[List[Doc], List[Floats2d]]~~ |
+| `nO`           | Output dimension, determined by the number of different labels. If not set, the [`TextCategorizer`](/api/textcategorizer) component will set it when `initialize` is called. ~~Optional[int]~~ |
+| **CREATES**    | The model using the architecture. ~~Model[List[Doc], Floats2d]~~ |
+
+<Accordion title="spacy.TextCatEnsemble.v1 definition" spaced>
+
+The v1 was functionally similar, but used an internal `tok2vec` instead of
+taking it as argument.

 | Name | Description |
 | ---- | ----------- |

@@ -550,6 +578,8 @@ network has an internal CNN Tok2Vec layer and uses attention.
 | `nO` | Output dimension, determined by the number of different labels. If not set, the [`TextCategorizer`](/api/textcategorizer) component will set it when `initialize` is called. ~~Optional[int]~~ |
 | **CREATES** | The model using the architecture. ~~Model[List[Doc], Floats2d]~~ |

+</Accordion>
+
 ### spacy.TextCatCNN.v1 {#TextCatCNN}

 > #### Example Config
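A hedged sketch of how the documented example config resolves into a model whose `multi_label` flag comes from the linear sub-model (the config values are copied from the example above):

```python
from thinc.api import Config
from spacy.util import registry

cfg = Config().from_str("""
[model]
@architectures = "spacy.TextCatEnsemble.v2"
nO = null

[model.linear_model]
@architectures = "spacy.TextCatBOW.v1"
exclusive_classes = true
ngram_size = 1
no_output_layer = false

[model.tok2vec]
@architectures = "spacy.Tok2Vec.v1"

[model.tok2vec.embed]
@architectures = "spacy.MultiHashEmbed.v1"
width = 64
rows = [2000, 2000, 1000, 1000, 1000, 1000]
attrs = ["ORTH", "LOWER", "PREFIX", "SUFFIX", "SHAPE", "ID"]
include_static_vectors = false

[model.tok2vec.encode]
@architectures = "spacy.MaxoutWindowEncoder.v1"
width = ${model.tok2vec.embed.width}
window_size = 1
maxout_pieces = 3
depth = 2
""")
model = registry.resolve(cfg)["model"]
print(model.attrs["multi_label"])  # False, since exclusive_classes = true
```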

View File

@@ -174,15 +174,25 @@ Calculate the UAS, LAS, and LAS per type scores for dependency parses.

 ## Scorer.score_cats {#score_cats tag="staticmethod" new="3"}

 Calculate PRF and ROC AUC scores for a doc-level attribute that is a dict
-containing scores for each label like `Doc.cats`. The reported overall score
-depends on the scorer settings:
+containing scores for each label like `Doc.cats`. The returned dictionary
+contains the following scores:

-1. **all:** `{attr}_score` (one of `{attr}_f` / `{attr}_macro_f` /
-   `{attr}_macro_auc`), `{attr}_score_desc` (text description of the overall
-   score), `{attr}_f_per_type`, `{attr}_auc_per_type`
-2. **binary exclusive with positive label:** `{attr}_p`, `{attr}_r`, `{attr}_f`
-3. **3+ exclusive classes**, macro-averaged F-score: `{attr}_macro_f`;
-4. **multilabel**, macro-averaged AUC: `{attr}_macro_auc`
+- `{attr}_micro_p`, `{attr}_micro_r` and `{attr}_micro_f`: each instance across
+  each label is weighted equally
+- `{attr}_macro_p`, `{attr}_macro_r` and `{attr}_macro_f`: the average values
+  across evaluations per label
+- `{attr}_f_per_type` and `{attr}_auc_per_type`: each contains a dictionary of
+  scores, keyed by label
+- A final `{attr}_score` and corresponding `{attr}_score_desc` (text
+  description)
+
+The reported `{attr}_score` depends on the classification properties:
+
+- **binary exclusive with positive label:** `{attr}_score` is set to the
+  F-score of the positive label
+- **3+ exclusive classes**, macro-averaged F-score:
+  `{attr}_score = {attr}_macro_f`
+- **multilabel**, macro-averaged AUC: `{attr}_score = {attr}_macro_auc`

 > #### Example
 >
View File

@@ -130,16 +130,31 @@ factory = "textcat"
 labels = []

 [components.textcat.model]
-@architectures = "spacy.TextCatEnsemble.v1"
-exclusive_classes = false
-pretrained_vectors = null
-width = 64
-conv_depth = 2
-embed_size = 2000
-window_size = 1
-ngram_size = 1
-dropout = 0
+@architectures = "spacy.TextCatEnsemble.v2"
 nO = null
+
+[components.textcat.model.tok2vec]
+@architectures = "spacy.Tok2Vec.v1"
+
+[components.textcat.model.tok2vec.embed]
+@architectures = "spacy.MultiHashEmbed.v1"
+width = 64
+rows = [2000, 2000, 1000, 1000, 1000, 1000]
+attrs = ["ORTH", "LOWER", "PREFIX", "SUFFIX", "SHAPE", "ID"]
+include_static_vectors = false
+
+[components.textcat.model.tok2vec.encode]
+@architectures = "spacy.MaxoutWindowEncoder.v1"
+width = ${components.textcat.model.tok2vec.embed.width}
+window_size = 1
+maxout_pieces = 3
+depth = 2
+
+[components.textcat.model.linear_model]
+@architectures = "spacy.TextCatBOW.v1"
+exclusive_classes = false
+ngram_size = 1
+no_output_layer = false
 ```

 spaCy has two additional built-in `textcat` architectures, and you can easily

View File

@@ -1244,15 +1244,10 @@ labels = []

 # This function is created and then passed to the "textcat" component as
 # the argument "model"
 [components.textcat.model]
-@architectures = "spacy.TextCatEnsemble.v1"
+@architectures = "spacy.TextCatBOW.v1"
 exclusive_classes = false
-pretrained_vectors = null
-width = 64
-conv_depth = 2
-embed_size = 2000
-window_size = 1
 ngram_size = 1
-dropout = null
+no_output_layer = false

 [components.other_textcat]
 factory = "textcat"

View File

@@ -717,7 +717,7 @@ tabular results to a file:

 ```python
 ### functions.py
 import sys
-from typing import IO, Tuple, Callable, Dict, Any
+from typing import IO, Tuple, Callable, Dict, Any, Optional
 import spacy
 from spacy import Language
 from pathlib import Path

@@ -729,7 +729,7 @@ def custom_logger(log_path):
         stdout: IO=sys.stdout,
         stderr: IO=sys.stderr
     ) -> Tuple[Callable, Callable]:
-        stdout.write(f"Logging to {log_path}\n")
+        stdout.write(f"Logging to {log_path}\\n")
     log_file = Path(log_path).open("w", encoding="utf8")
     log_file.write("step\\t")
     log_file.write("score\\t")