mirror of
https://github.com/explosion/spaCy.git
synced 2025-01-11 17:56:30 +03:00
TextCat updates and fixes (#6263)
* small fix in example imports * throw error when train_corpus or dev_corpus is not a string * small fix in custom logger example * limit macro_auc to labels with 2 annotations * fix typo * also create parents of output_dir if need be * update documentation of textcat scores * refactor TextCatEnsemble * fix tests for new AUC definition * bump to 3.0.0a42 * update docs * rename to spacy.TextCatEnsemble.v2 * spacy.TextCatEnsemble.v1 in legacy * cleanup * small fix * update to 3.0.0rc2 * fix import that got lost in merge * cursed IDE * fix two typos
This commit is contained in:
parent
e2f3c4e12d
commit
75a202ce65
|
@ -1,6 +1,6 @@
|
||||||
# fmt: off
|
# fmt: off
|
||||||
__title__ = "spacy-nightly"
|
__title__ = "spacy-nightly"
|
||||||
__version__ = "3.0.0rc1"
|
__version__ = "3.0.0rc2"
|
||||||
__download_url__ = "https://github.com/explosion/spacy-models/releases/download"
|
__download_url__ = "https://github.com/explosion/spacy-models/releases/download"
|
||||||
__compatibility__ = "https://raw.githubusercontent.com/explosion/spacy-models/master/compatibility.json"
|
__compatibility__ = "https://raw.githubusercontent.com/explosion/spacy-models/master/compatibility.json"
|
||||||
__projects__ = "https://github.com/explosion/projects"
|
__projects__ = "https://github.com/explosion/projects"
|
||||||
|
|
|
@ -100,7 +100,7 @@ def init_labels_cli(
|
||||||
extract the labels."""
|
extract the labels."""
|
||||||
util.logger.setLevel(logging.DEBUG if verbose else logging.INFO)
|
util.logger.setLevel(logging.DEBUG if verbose else logging.INFO)
|
||||||
if not output_path.exists():
|
if not output_path.exists():
|
||||||
output_path.mkdir()
|
output_path.mkdir(parents=True)
|
||||||
overrides = parse_config_overrides(ctx.args)
|
overrides = parse_config_overrides(ctx.args)
|
||||||
import_code(code_path)
|
import_code(code_path)
|
||||||
setup_gpu(use_gpu)
|
setup_gpu(use_gpu)
|
||||||
|
|
|
@ -136,15 +136,19 @@ factory = "textcat"
|
||||||
|
|
||||||
{% if optimize == "accuracy" %}
|
{% if optimize == "accuracy" %}
|
||||||
[components.textcat.model]
|
[components.textcat.model]
|
||||||
@architectures = "spacy.TextCatEnsemble.v1"
|
@architectures = "spacy.TextCatEnsemble.v2"
|
||||||
exclusive_classes = false
|
|
||||||
width = 64
|
|
||||||
conv_depth = 2
|
|
||||||
embed_size = 2000
|
|
||||||
window_size = 1
|
|
||||||
ngram_size = 1
|
|
||||||
nO = null
|
nO = null
|
||||||
|
|
||||||
|
[components.textcat.model.tok2vec]
|
||||||
|
@architectures = "spacy-transformers.TransformerListener.v1"
|
||||||
|
grad_factor = 1.0
|
||||||
|
|
||||||
|
[components.textcat.model.linear_model]
|
||||||
|
@architectures = "spacy.TextCatBOW.v1"
|
||||||
|
exclusive_classes = false
|
||||||
|
ngram_size = 1
|
||||||
|
no_output_layer = false
|
||||||
|
|
||||||
{% else -%}
|
{% else -%}
|
||||||
[components.textcat.model]
|
[components.textcat.model]
|
||||||
@architectures = "spacy.TextCatBOW.v1"
|
@architectures = "spacy.TextCatBOW.v1"
|
||||||
|
@ -271,15 +275,19 @@ factory = "textcat"
|
||||||
|
|
||||||
{% if optimize == "accuracy" %}
|
{% if optimize == "accuracy" %}
|
||||||
[components.textcat.model]
|
[components.textcat.model]
|
||||||
@architectures = "spacy.TextCatEnsemble.v1"
|
@architectures = "spacy.TextCatEnsemble.v2"
|
||||||
exclusive_classes = false
|
|
||||||
width = 64
|
|
||||||
conv_depth = 2
|
|
||||||
embed_size = 2000
|
|
||||||
window_size = 1
|
|
||||||
ngram_size = 1
|
|
||||||
nO = null
|
nO = null
|
||||||
|
|
||||||
|
[components.textcat.model.tok2vec]
|
||||||
|
@architectures = "spacy.Tok2VecListener.v1"
|
||||||
|
width = ${components.tok2vec.model.encode.width}
|
||||||
|
|
||||||
|
[components.textcat.model.linear_model]
|
||||||
|
@architectures = "spacy.TextCatBOW.v1"
|
||||||
|
exclusive_classes = false
|
||||||
|
ngram_size = 1
|
||||||
|
no_output_layer = false
|
||||||
|
|
||||||
{% else -%}
|
{% else -%}
|
||||||
[components.textcat.model]
|
[components.textcat.model]
|
||||||
@architectures = "spacy.TextCatBOW.v1"
|
@architectures = "spacy.TextCatBOW.v1"
|
||||||
|
|
|
@ -44,7 +44,7 @@ def train_cli(
|
||||||
if not config_path or not config_path.exists():
|
if not config_path or not config_path.exists():
|
||||||
msg.fail("Config file not found", config_path, exits=1)
|
msg.fail("Config file not found", config_path, exits=1)
|
||||||
if output_path is not None and not output_path.exists():
|
if output_path is not None and not output_path.exists():
|
||||||
output_path.mkdir()
|
output_path.mkdir(parents=True)
|
||||||
msg.good(f"Created output directory: {output_path}")
|
msg.good(f"Created output directory: {output_path}")
|
||||||
overrides = parse_config_overrides(ctx.args)
|
overrides = parse_config_overrides(ctx.args)
|
||||||
import_code(code_path)
|
import_code(code_path)
|
||||||
|
|
|
@ -398,8 +398,8 @@ class Errors:
|
||||||
E163 = ("cumsum was found to be unstable: its last element does not "
|
E163 = ("cumsum was found to be unstable: its last element does not "
|
||||||
"correspond to sum")
|
"correspond to sum")
|
||||||
E164 = ("x is neither increasing nor decreasing: {x}.")
|
E164 = ("x is neither increasing nor decreasing: {x}.")
|
||||||
E165 = ("Only one class present in y_true. ROC AUC score is not defined in "
|
E165 = ("Only one class present in the gold labels: {label}. "
|
||||||
"that case.")
|
"ROC AUC score is not defined in that case.")
|
||||||
E166 = ("Can only merge DocBins with the same value for '{param}'.\n"
|
E166 = ("Can only merge DocBins with the same value for '{param}'.\n"
|
||||||
"Current DocBin: {current}\nOther DocBin: {other}")
|
"Current DocBin: {current}\nOther DocBin: {other}")
|
||||||
E169 = ("Can't find module: {module}")
|
E169 = ("Can't find module: {module}")
|
||||||
|
@ -456,6 +456,8 @@ class Errors:
|
||||||
"issue tracker: http://github.com/explosion/spaCy/issues")
|
"issue tracker: http://github.com/explosion/spaCy/issues")
|
||||||
|
|
||||||
# TODO: fix numbering after merging develop into master
|
# TODO: fix numbering after merging develop into master
|
||||||
|
E897 = ("Field '{field}' should be a dot-notation string referring to the "
|
||||||
|
"relevant section in the config, but found type {type} instead.")
|
||||||
E898 = ("Can't serialize trainable pipe '{name}': the `model` attribute "
|
E898 = ("Can't serialize trainable pipe '{name}': the `model` attribute "
|
||||||
"is not set or None. If you've implemented a custom component, make "
|
"is not set or None. If you've implemented a custom component, make "
|
||||||
"sure to store the component model as `self.model` in your "
|
"sure to store the component model as `self.model` in your "
|
||||||
|
|
|
@ -1,4 +1,6 @@
|
||||||
from typing import Optional
|
from typing import Optional, List
|
||||||
|
|
||||||
|
from thinc.types import Floats2d
|
||||||
from thinc.api import Model, reduce_mean, Linear, list2ragged, Logistic
|
from thinc.api import Model, reduce_mean, Linear, list2ragged, Logistic
|
||||||
from thinc.api import chain, concatenate, clone, Dropout, ParametricAttention
|
from thinc.api import chain, concatenate, clone, Dropout, ParametricAttention
|
||||||
from thinc.api import SparseLinear, Softmax, softmax_activation, Maxout, reduce_sum
|
from thinc.api import SparseLinear, Softmax, softmax_activation, Maxout, reduce_sum
|
||||||
|
@ -10,12 +12,13 @@ from ...util import registry
|
||||||
from ..extract_ngrams import extract_ngrams
|
from ..extract_ngrams import extract_ngrams
|
||||||
from ..staticvectors import StaticVectors
|
from ..staticvectors import StaticVectors
|
||||||
from ..featureextractor import FeatureExtractor
|
from ..featureextractor import FeatureExtractor
|
||||||
|
from ...tokens import Doc
|
||||||
|
|
||||||
|
|
||||||
@registry.architectures.register("spacy.TextCatCNN.v1")
|
@registry.architectures.register("spacy.TextCatCNN.v1")
|
||||||
def build_simple_cnn_text_classifier(
|
def build_simple_cnn_text_classifier(
|
||||||
tok2vec: Model, exclusive_classes: bool, nO: Optional[int] = None
|
tok2vec: Model, exclusive_classes: bool, nO: Optional[int] = None
|
||||||
) -> Model:
|
) -> Model[List[Doc], Floats2d]:
|
||||||
"""
|
"""
|
||||||
Build a simple CNN text classifier, given a token-to-vector model as inputs.
|
Build a simple CNN text classifier, given a token-to-vector model as inputs.
|
||||||
If exclusive_classes=True, a softmax non-linearity is applied, so that the
|
If exclusive_classes=True, a softmax non-linearity is applied, so that the
|
||||||
|
@ -23,15 +26,14 @@ def build_simple_cnn_text_classifier(
|
||||||
is applied instead, so that outputs are in the range [0, 1].
|
is applied instead, so that outputs are in the range [0, 1].
|
||||||
"""
|
"""
|
||||||
with Model.define_operators({">>": chain}):
|
with Model.define_operators({">>": chain}):
|
||||||
|
cnn = tok2vec >> list2ragged() >> reduce_mean()
|
||||||
if exclusive_classes:
|
if exclusive_classes:
|
||||||
output_layer = Softmax(nO=nO, nI=tok2vec.maybe_get_dim("nO"))
|
output_layer = Softmax(nO=nO, nI=tok2vec.maybe_get_dim("nO"))
|
||||||
model = tok2vec >> list2ragged() >> reduce_mean() >> output_layer
|
model = cnn >> output_layer
|
||||||
model.set_ref("output_layer", output_layer)
|
model.set_ref("output_layer", output_layer)
|
||||||
else:
|
else:
|
||||||
linear_layer = Linear(nO=nO, nI=tok2vec.maybe_get_dim("nO"))
|
linear_layer = Linear(nO=nO, nI=tok2vec.maybe_get_dim("nO"))
|
||||||
model = (
|
model = cnn >> linear_layer >> Logistic()
|
||||||
tok2vec >> list2ragged() >> reduce_mean() >> linear_layer >> Logistic()
|
|
||||||
)
|
|
||||||
model.set_ref("output_layer", linear_layer)
|
model.set_ref("output_layer", linear_layer)
|
||||||
model.set_ref("tok2vec", tok2vec)
|
model.set_ref("tok2vec", tok2vec)
|
||||||
model.set_dim("nO", nO)
|
model.set_dim("nO", nO)
|
||||||
|
@ -45,8 +47,7 @@ def build_bow_text_classifier(
|
||||||
ngram_size: int,
|
ngram_size: int,
|
||||||
no_output_layer: bool,
|
no_output_layer: bool,
|
||||||
nO: Optional[int] = None,
|
nO: Optional[int] = None,
|
||||||
) -> Model:
|
) -> Model[List[Doc], Floats2d]:
|
||||||
# Don't document this yet, I'm not sure it's right.
|
|
||||||
with Model.define_operators({">>": chain}):
|
with Model.define_operators({">>": chain}):
|
||||||
sparse_linear = SparseLinear(nO)
|
sparse_linear = SparseLinear(nO)
|
||||||
model = extract_ngrams(ngram_size, attr=ORTH) >> sparse_linear
|
model = extract_ngrams(ngram_size, attr=ORTH) >> sparse_linear
|
||||||
|
@ -59,6 +60,39 @@ def build_bow_text_classifier(
|
||||||
return model
|
return model
|
||||||
|
|
||||||
|
|
||||||
|
@registry.architectures.register("spacy.TextCatEnsemble.v2")
|
||||||
|
def build_text_classifier(
|
||||||
|
tok2vec: Model[List[Doc], List[Floats2d]],
|
||||||
|
linear_model: Model[List[Doc], Floats2d],
|
||||||
|
nO: Optional[int] = None,
|
||||||
|
) -> Model[List[Doc], Floats2d]:
|
||||||
|
exclusive_classes = not linear_model.attrs["multi_label"]
|
||||||
|
with Model.define_operators({">>": chain, "|": concatenate}):
|
||||||
|
width = tok2vec.get_dim("nO")
|
||||||
|
cnn_model = (
|
||||||
|
tok2vec
|
||||||
|
>> list2ragged()
|
||||||
|
>> ParametricAttention(width) # TODO: benchmark performance difference of this layer
|
||||||
|
>> reduce_sum()
|
||||||
|
>> residual(Maxout(nO=width, nI=width))
|
||||||
|
>> Linear(nO=nO, nI=width)
|
||||||
|
>> Dropout(0.0)
|
||||||
|
)
|
||||||
|
|
||||||
|
nO_double = nO * 2 if nO else None
|
||||||
|
if exclusive_classes:
|
||||||
|
output_layer = Softmax(nO=nO, nI=nO_double)
|
||||||
|
else:
|
||||||
|
output_layer = Linear(nO=nO, nI=nO_double) >> Dropout(0.0) >> Logistic()
|
||||||
|
model = (linear_model | cnn_model) >> output_layer
|
||||||
|
model.set_ref("tok2vec", tok2vec)
|
||||||
|
if model.has_dim("nO") is not False:
|
||||||
|
model.set_dim("nO", nO)
|
||||||
|
model.set_ref("output_layer", linear_model.get_ref("output_layer"))
|
||||||
|
model.attrs["multi_label"] = not exclusive_classes
|
||||||
|
return model
|
||||||
|
|
||||||
|
# TODO: move to legacy
|
||||||
@registry.architectures.register("spacy.TextCatEnsemble.v1")
|
@registry.architectures.register("spacy.TextCatEnsemble.v1")
|
||||||
def build_text_classifier(
|
def build_text_classifier(
|
||||||
width: int,
|
width: int,
|
||||||
|
@ -158,11 +192,8 @@ def build_text_classifier(
|
||||||
|
|
||||||
@registry.architectures.register("spacy.TextCatLowData.v1")
|
@registry.architectures.register("spacy.TextCatLowData.v1")
|
||||||
def build_text_classifier_lowdata(
|
def build_text_classifier_lowdata(
|
||||||
width: int,
|
width: int, dropout: Optional[float], nO: Optional[int] = None
|
||||||
pretrained_vectors: Optional[bool],
|
) -> Model[List[Doc], Floats2d]:
|
||||||
dropout: Optional[float],
|
|
||||||
nO: Optional[int] = None,
|
|
||||||
) -> Model:
|
|
||||||
# Don't document this yet, I'm not sure it's right.
|
# Don't document this yet, I'm not sure it's right.
|
||||||
# Note, before v.3, this was the default if setting "low_data" and "pretrained_dims"
|
# Note, before v.3, this was the default if setting "low_data" and "pretrained_dims"
|
||||||
with Model.define_operators({">>": chain, "**": clone}):
|
with Model.define_operators({">>": chain, "**": clone}):
|
||||||
|
|
|
@ -106,7 +106,7 @@ def MultiHashEmbed(
|
||||||
) -> Model[List[Doc], List[Floats2d]]:
|
) -> Model[List[Doc], List[Floats2d]]:
|
||||||
"""Construct an embedding layer that separately embeds a number of lexical
|
"""Construct an embedding layer that separately embeds a number of lexical
|
||||||
attributes using hash embedding, concatenates the results, and passes it
|
attributes using hash embedding, concatenates the results, and passes it
|
||||||
through a feed-forward subnetwork to build a mixed representations.
|
through a feed-forward subnetwork to build a mixed representation.
|
||||||
|
|
||||||
The features used can be configured with the 'attrs' argument. The suggested
|
The features used can be configured with the 'attrs' argument. The suggested
|
||||||
attributes are NORM, PREFIX, SUFFIX and SHAPE. This lets the model take into
|
attributes are NORM, PREFIX, SUFFIX and SHAPE. This lets the model take into
|
||||||
|
|
|
@ -16,15 +16,30 @@ from ..vocab import Vocab
|
||||||
|
|
||||||
default_model_config = """
|
default_model_config = """
|
||||||
[model]
|
[model]
|
||||||
@architectures = "spacy.TextCatEnsemble.v1"
|
@architectures = "spacy.TextCatEnsemble.v2"
|
||||||
exclusive_classes = false
|
|
||||||
pretrained_vectors = null
|
[model.tok2vec]
|
||||||
|
@architectures = "spacy.Tok2Vec.v1"
|
||||||
|
|
||||||
|
[model.tok2vec.embed]
|
||||||
|
@architectures = "spacy.MultiHashEmbed.v1"
|
||||||
width = 64
|
width = 64
|
||||||
conv_depth = 2
|
rows = [2000, 2000, 1000, 1000, 1000, 1000]
|
||||||
embed_size = 2000
|
attrs = ["ORTH", "LOWER", "PREFIX", "SUFFIX", "SHAPE", "ID"]
|
||||||
|
include_static_vectors = false
|
||||||
|
|
||||||
|
[model.tok2vec.encode]
|
||||||
|
@architectures = "spacy.MaxoutWindowEncoder.v1"
|
||||||
|
width = ${model.tok2vec.embed.width}
|
||||||
window_size = 1
|
window_size = 1
|
||||||
|
maxout_pieces = 3
|
||||||
|
depth = 2
|
||||||
|
|
||||||
|
[model.linear_model]
|
||||||
|
@architectures = "spacy.TextCatBOW.v1"
|
||||||
|
exclusive_classes = false
|
||||||
ngram_size = 1
|
ngram_size = 1
|
||||||
dropout = null
|
no_output_layer = false
|
||||||
"""
|
"""
|
||||||
DEFAULT_TEXTCAT_MODEL = Config().from_str(default_model_config)["model"]
|
DEFAULT_TEXTCAT_MODEL = Config().from_str(default_model_config)["model"]
|
||||||
|
|
||||||
|
@ -60,9 +75,11 @@ subword_features = true
|
||||||
default_score_weights={
|
default_score_weights={
|
||||||
"cats_score": 1.0,
|
"cats_score": 1.0,
|
||||||
"cats_score_desc": None,
|
"cats_score_desc": None,
|
||||||
"cats_p": None,
|
"cats_micro_p": None,
|
||||||
"cats_r": None,
|
"cats_micro_r": None,
|
||||||
"cats_f": None,
|
"cats_micro_f": None,
|
||||||
|
"cats_macro_p": None,
|
||||||
|
"cats_macro_r": None,
|
||||||
"cats_macro_f": None,
|
"cats_macro_f": None,
|
||||||
"cats_macro_auc": None,
|
"cats_macro_auc": None,
|
||||||
"cats_f_per_type": None,
|
"cats_f_per_type": None,
|
||||||
|
|
|
@ -59,7 +59,9 @@ class PRFScore:
|
||||||
|
|
||||||
|
|
||||||
class ROCAUCScore:
|
class ROCAUCScore:
|
||||||
"""An AUC ROC score."""
|
"""An AUC ROC score. This is only defined for binary classification.
|
||||||
|
Use the method is_binary before calculating the score, otherwise it
|
||||||
|
may throw an error."""
|
||||||
|
|
||||||
def __init__(self) -> None:
|
def __init__(self) -> None:
|
||||||
self.golds = []
|
self.golds = []
|
||||||
|
@ -71,16 +73,16 @@ class ROCAUCScore:
|
||||||
self.cands.append(cand)
|
self.cands.append(cand)
|
||||||
self.golds.append(gold)
|
self.golds.append(gold)
|
||||||
|
|
||||||
|
def is_binary(self):
|
||||||
|
return len(np.unique(self.golds)) == 2
|
||||||
|
|
||||||
@property
|
@property
|
||||||
def score(self):
|
def score(self):
|
||||||
|
if not self.is_binary():
|
||||||
|
raise ValueError(Errors.E165.format(label=set(self.golds)))
|
||||||
if len(self.golds) == self.saved_score_at_len:
|
if len(self.golds) == self.saved_score_at_len:
|
||||||
return self.saved_score
|
return self.saved_score
|
||||||
try:
|
self.saved_score = _roc_auc_score(self.golds, self.cands)
|
||||||
self.saved_score = _roc_auc_score(self.golds, self.cands)
|
|
||||||
# catch ValueError: Only one class present in y_true.
|
|
||||||
# ROC AUC score is not defined in that case.
|
|
||||||
except ValueError:
|
|
||||||
self.saved_score = -float("inf")
|
|
||||||
self.saved_score_at_len = len(self.golds)
|
self.saved_score_at_len = len(self.golds)
|
||||||
return self.saved_score
|
return self.saved_score
|
||||||
|
|
||||||
|
@ -362,9 +364,13 @@ class Scorer:
|
||||||
for all:
|
for all:
|
||||||
attr_score (one of attr_micro_f / attr_macro_f / attr_macro_auc),
|
attr_score (one of attr_micro_f / attr_macro_f / attr_macro_auc),
|
||||||
attr_score_desc (text description of the overall score),
|
attr_score_desc (text description of the overall score),
|
||||||
|
attr_micro_p,
|
||||||
|
attr_micro_r,
|
||||||
attr_micro_f,
|
attr_micro_f,
|
||||||
|
attr_macro_p,
|
||||||
|
attr_macro_r,
|
||||||
attr_macro_f,
|
attr_macro_f,
|
||||||
attr_auc,
|
attr_macro_auc,
|
||||||
attr_f_per_type,
|
attr_f_per_type,
|
||||||
attr_auc_per_type
|
attr_auc_per_type
|
||||||
|
|
||||||
|
@ -431,7 +437,9 @@ class Scorer:
|
||||||
macro_p = sum(prf.precision for prf in f_per_type.values()) / n_cats
|
macro_p = sum(prf.precision for prf in f_per_type.values()) / n_cats
|
||||||
macro_r = sum(prf.recall for prf in f_per_type.values()) / n_cats
|
macro_r = sum(prf.recall for prf in f_per_type.values()) / n_cats
|
||||||
macro_f = sum(prf.fscore for prf in f_per_type.values()) / n_cats
|
macro_f = sum(prf.fscore for prf in f_per_type.values()) / n_cats
|
||||||
macro_auc = sum(auc.score for auc in auc_per_type.values()) / n_cats
|
# Limit macro_auc to those labels with gold annotations,
|
||||||
|
# but still divide by all cats to avoid artificial boosting of datasets with missing labels
|
||||||
|
macro_auc = sum(auc.score if auc.is_binary() else 0.0 for auc in auc_per_type.values()) / n_cats
|
||||||
results = {
|
results = {
|
||||||
f"{attr}_score": None,
|
f"{attr}_score": None,
|
||||||
f"{attr}_score_desc": None,
|
f"{attr}_score_desc": None,
|
||||||
|
@ -443,7 +451,7 @@ class Scorer:
|
||||||
f"{attr}_macro_f": macro_f,
|
f"{attr}_macro_f": macro_f,
|
||||||
f"{attr}_macro_auc": macro_auc,
|
f"{attr}_macro_auc": macro_auc,
|
||||||
f"{attr}_f_per_type": {k: v.to_dict() for k, v in f_per_type.items()},
|
f"{attr}_f_per_type": {k: v.to_dict() for k, v in f_per_type.items()},
|
||||||
f"{attr}_auc_per_type": {k: v.score for k, v in auc_per_type.items()},
|
f"{attr}_auc_per_type": {k: v.score if v.is_binary() else None for k, v in auc_per_type.items()},
|
||||||
}
|
}
|
||||||
if len(labels) == 2 and not multi_label and positive_label:
|
if len(labels) == 2 and not multi_label and positive_label:
|
||||||
positive_label_f = results[f"{attr}_f_per_type"][positive_label]["f"]
|
positive_label_f = results[f"{attr}_f_per_type"][positive_label]["f"]
|
||||||
|
@ -726,7 +734,7 @@ def _roc_auc_score(y_true, y_score):
|
||||||
<https://www.ncbi.nlm.nih.gov/pubmed/2668680>`_
|
<https://www.ncbi.nlm.nih.gov/pubmed/2668680>`_
|
||||||
"""
|
"""
|
||||||
if len(np.unique(y_true)) != 2:
|
if len(np.unique(y_true)) != 2:
|
||||||
raise ValueError(Errors.E165)
|
raise ValueError(Errors.E165.format(label=np.unique(y_true)))
|
||||||
fpr, tpr, _ = _roc_curve(y_true, y_score)
|
fpr, tpr, _ = _roc_curve(y_true, y_score)
|
||||||
return _auc(fpr, tpr)
|
return _auc(fpr, tpr)
|
||||||
|
|
||||||
|
|
|
@ -2,6 +2,7 @@ import pytest
|
||||||
from spacy.language import Language
|
from spacy.language import Language
|
||||||
from spacy.lang.en import English
|
from spacy.lang.en import English
|
||||||
from spacy.lang.de import German
|
from spacy.lang.de import German
|
||||||
|
from spacy.pipeline.tok2vec import DEFAULT_TOK2VEC_MODEL
|
||||||
from spacy.tokens import Doc
|
from spacy.tokens import Doc
|
||||||
from spacy.util import registry, SimpleFrozenDict, combine_score_weights
|
from spacy.util import registry, SimpleFrozenDict, combine_score_weights
|
||||||
from thinc.api import Model, Linear, ConfigValidationError
|
from thinc.api import Model, Linear, ConfigValidationError
|
||||||
|
@ -156,15 +157,10 @@ def test_pipe_class_component_model():
|
||||||
name = "test_class_component_model"
|
name = "test_class_component_model"
|
||||||
default_config = {
|
default_config = {
|
||||||
"model": {
|
"model": {
|
||||||
"@architectures": "spacy.TextCatEnsemble.v1",
|
"@architectures": "spacy.TextCatEnsemble.v2",
|
||||||
"exclusive_classes": False,
|
"tok2vec": DEFAULT_TOK2VEC_MODEL,
|
||||||
"pretrained_vectors": None,
|
"linear_model": {"@architectures": "spacy.TextCatBOW.v1", "exclusive_classes": False, "ngram_size": 1,
|
||||||
"width": 64,
|
"no_output_layer": False},
|
||||||
"embed_size": 2000,
|
|
||||||
"window_size": 1,
|
|
||||||
"conv_depth": 2,
|
|
||||||
"ngram_size": 1,
|
|
||||||
"dropout": None,
|
|
||||||
},
|
},
|
||||||
"value1": 10,
|
"value1": 10,
|
||||||
}
|
}
|
||||||
|
|
|
@ -140,7 +140,7 @@ def test_overfitting_IO():
|
||||||
nlp = English()
|
nlp = English()
|
||||||
nlp.config["initialize"]["components"]["textcat"] = {"positive_label": "POSITIVE"}
|
nlp.config["initialize"]["components"]["textcat"] = {"positive_label": "POSITIVE"}
|
||||||
# Set exclusive labels
|
# Set exclusive labels
|
||||||
config = {"model": {"exclusive_classes": True}}
|
config = {"model": {"linear_model": {"exclusive_classes": True}}}
|
||||||
textcat = nlp.add_pipe("textcat", config=config)
|
textcat = nlp.add_pipe("textcat", config=config)
|
||||||
train_examples = []
|
train_examples = []
|
||||||
for text, annotations in TRAIN_DATA:
|
for text, annotations in TRAIN_DATA:
|
||||||
|
@ -192,9 +192,8 @@ def test_overfitting_IO():
|
||||||
{"@architectures": "spacy.TextCatBOW.v1", "exclusive_classes": True, "ngram_size": 4, "no_output_layer": False},
|
{"@architectures": "spacy.TextCatBOW.v1", "exclusive_classes": True, "ngram_size": 4, "no_output_layer": False},
|
||||||
{"@architectures": "spacy.TextCatBOW.v1", "exclusive_classes": False, "ngram_size": 3, "no_output_layer": True},
|
{"@architectures": "spacy.TextCatBOW.v1", "exclusive_classes": False, "ngram_size": 3, "no_output_layer": True},
|
||||||
{"@architectures": "spacy.TextCatBOW.v1", "exclusive_classes": True, "ngram_size": 2, "no_output_layer": True},
|
{"@architectures": "spacy.TextCatBOW.v1", "exclusive_classes": True, "ngram_size": 2, "no_output_layer": True},
|
||||||
{"@architectures": "spacy.TextCatEnsemble.v1", "exclusive_classes": False, "ngram_size": 1, "pretrained_vectors": False, "width": 64, "conv_depth": 2, "embed_size": 2000, "window_size": 2, "dropout": None},
|
{"@architectures": "spacy.TextCatEnsemble.v2", "tok2vec": DEFAULT_TOK2VEC_MODEL, "linear_model": {"@architectures": "spacy.TextCatBOW.v1", "exclusive_classes": False, "ngram_size": 1, "no_output_layer": False}},
|
||||||
{"@architectures": "spacy.TextCatEnsemble.v1", "exclusive_classes": True, "ngram_size": 5, "pretrained_vectors": False, "width": 128, "conv_depth": 2, "embed_size": 2000, "window_size": 1, "dropout": None},
|
{"@architectures": "spacy.TextCatEnsemble.v2", "tok2vec": DEFAULT_TOK2VEC_MODEL, "linear_model": {"@architectures": "spacy.TextCatBOW.v1", "exclusive_classes": True, "ngram_size": 5, "no_output_layer": False}},
|
||||||
{"@architectures": "spacy.TextCatEnsemble.v1", "exclusive_classes": True, "ngram_size": 2, "pretrained_vectors": False, "width": 32, "conv_depth": 3, "embed_size": 500, "window_size": 3, "dropout": None},
|
|
||||||
{"@architectures": "spacy.TextCatCNN.v1", "tok2vec": DEFAULT_TOK2VEC_MODEL, "exclusive_classes": True},
|
{"@architectures": "spacy.TextCatCNN.v1", "tok2vec": DEFAULT_TOK2VEC_MODEL, "exclusive_classes": True},
|
||||||
{"@architectures": "spacy.TextCatCNN.v1", "tok2vec": DEFAULT_TOK2VEC_MODEL, "exclusive_classes": False},
|
{"@architectures": "spacy.TextCatCNN.v1", "tok2vec": DEFAULT_TOK2VEC_MODEL, "exclusive_classes": False},
|
||||||
],
|
],
|
||||||
|
|
|
@ -4,32 +4,23 @@ from thinc.api import fix_random_seed, Adam, set_dropout_rate
|
||||||
from numpy.testing import assert_array_equal
|
from numpy.testing import assert_array_equal
|
||||||
import numpy
|
import numpy
|
||||||
from spacy.ml.models import build_Tok2Vec_model, MultiHashEmbed, MaxoutWindowEncoder
|
from spacy.ml.models import build_Tok2Vec_model, MultiHashEmbed, MaxoutWindowEncoder
|
||||||
from spacy.ml.models import build_text_classifier, build_simple_cnn_text_classifier
|
from spacy.ml.models import build_bow_text_classifier, build_simple_cnn_text_classifier
|
||||||
from spacy.ml.staticvectors import StaticVectors
|
from spacy.ml.staticvectors import StaticVectors
|
||||||
from spacy.lang.en import English
|
from spacy.lang.en import English
|
||||||
from spacy.lang.en.examples import sentences as EN_SENTENCES
|
from spacy.lang.en.examples import sentences as EN_SENTENCES
|
||||||
|
|
||||||
|
|
||||||
def get_textcat_kwargs():
|
def get_textcat_bow_kwargs():
|
||||||
return {
|
return {
|
||||||
"width": 64,
|
"exclusive_classes": True,
|
||||||
"embed_size": 2000,
|
|
||||||
"pretrained_vectors": None,
|
|
||||||
"exclusive_classes": False,
|
|
||||||
"ngram_size": 1,
|
"ngram_size": 1,
|
||||||
"window_size": 1,
|
"no_output_layer": False,
|
||||||
"conv_depth": 2,
|
"nO": 34,
|
||||||
"dropout": None,
|
|
||||||
"nO": 7,
|
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
def get_textcat_cnn_kwargs():
|
def get_textcat_cnn_kwargs():
|
||||||
return {
|
return {"tok2vec": test_tok2vec(), "exclusive_classes": False, "nO": 13}
|
||||||
"tok2vec": test_tok2vec(),
|
|
||||||
"exclusive_classes": False,
|
|
||||||
"nO": 13,
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
def get_all_params(model):
|
def get_all_params(model):
|
||||||
|
@ -105,7 +96,7 @@ def test_multi_hash_embed():
|
||||||
"seed,model_func,kwargs",
|
"seed,model_func,kwargs",
|
||||||
[
|
[
|
||||||
(0, build_Tok2Vec_model, get_tok2vec_kwargs()),
|
(0, build_Tok2Vec_model, get_tok2vec_kwargs()),
|
||||||
(0, build_text_classifier, get_textcat_kwargs()),
|
(0, build_bow_text_classifier, get_textcat_bow_kwargs()),
|
||||||
(0, build_simple_cnn_text_classifier, get_textcat_cnn_kwargs()),
|
(0, build_simple_cnn_text_classifier, get_textcat_cnn_kwargs()),
|
||||||
],
|
],
|
||||||
)
|
)
|
||||||
|
@ -125,7 +116,7 @@ def test_models_initialize_consistently(seed, model_func, kwargs):
|
||||||
"seed,model_func,kwargs,get_X",
|
"seed,model_func,kwargs,get_X",
|
||||||
[
|
[
|
||||||
(0, build_Tok2Vec_model, get_tok2vec_kwargs(), get_docs),
|
(0, build_Tok2Vec_model, get_tok2vec_kwargs(), get_docs),
|
||||||
(0, build_text_classifier, get_textcat_kwargs(), get_docs),
|
(0, build_bow_text_classifier, get_textcat_bow_kwargs(), get_docs),
|
||||||
(0, build_simple_cnn_text_classifier, get_textcat_cnn_kwargs(), get_docs),
|
(0, build_simple_cnn_text_classifier, get_textcat_cnn_kwargs(), get_docs),
|
||||||
],
|
],
|
||||||
)
|
)
|
||||||
|
@ -160,7 +151,7 @@ def test_models_predict_consistently(seed, model_func, kwargs, get_X):
|
||||||
"seed,dropout,model_func,kwargs,get_X",
|
"seed,dropout,model_func,kwargs,get_X",
|
||||||
[
|
[
|
||||||
(0, 0.2, build_Tok2Vec_model, get_tok2vec_kwargs(), get_docs),
|
(0, 0.2, build_Tok2Vec_model, get_tok2vec_kwargs(), get_docs),
|
||||||
(0, 0.2, build_text_classifier, get_textcat_kwargs(), get_docs),
|
(0, 0.2, build_bow_text_classifier, get_textcat_bow_kwargs(), get_docs),
|
||||||
(0, 0.2, build_simple_cnn_text_classifier, get_textcat_cnn_kwargs(), get_docs),
|
(0, 0.2, build_simple_cnn_text_classifier, get_textcat_cnn_kwargs(), get_docs),
|
||||||
],
|
],
|
||||||
)
|
)
|
||||||
|
|
|
@ -334,7 +334,8 @@ def test_roc_auc_score():
|
||||||
score = ROCAUCScore()
|
score = ROCAUCScore()
|
||||||
score.score_set(0.25, 0)
|
score.score_set(0.25, 0)
|
||||||
score.score_set(0.75, 0)
|
score.score_set(0.75, 0)
|
||||||
assert score.score == -float("inf")
|
with pytest.raises(ValueError):
|
||||||
|
s = score.score
|
||||||
|
|
||||||
y_true = [1, 1]
|
y_true = [1, 1]
|
||||||
y_score = [0.25, 0.75]
|
y_score = [0.25, 0.75]
|
||||||
|
@ -344,4 +345,5 @@ def test_roc_auc_score():
|
||||||
score = ROCAUCScore()
|
score = ROCAUCScore()
|
||||||
score.score_set(0.25, 1)
|
score.score_set(0.25, 1)
|
||||||
score.score_set(0.75, 1)
|
score.score_set(0.75, 1)
|
||||||
assert score.score == -float("inf")
|
with pytest.raises(ValueError):
|
||||||
|
s = score.score
|
||||||
|
|
|
@ -51,7 +51,7 @@ def test_readers():
|
||||||
for example in train_corpus(nlp):
|
for example in train_corpus(nlp):
|
||||||
nlp.update([example], sgd=optimizer)
|
nlp.update([example], sgd=optimizer)
|
||||||
scores = nlp.evaluate(list(dev_corpus(nlp)))
|
scores = nlp.evaluate(list(dev_corpus(nlp)))
|
||||||
assert scores["cats_score"]
|
assert scores["cats_score"] == 0.0
|
||||||
# ensure the pipeline runs
|
# ensure the pipeline runs
|
||||||
doc = nlp("Quick test")
|
doc = nlp("Quick test")
|
||||||
assert doc.cats
|
assert doc.cats
|
||||||
|
|
|
@ -36,6 +36,10 @@ def init_nlp(config: Config, *, use_gpu: int = -1) -> "Language":
|
||||||
# Resolve all training-relevant sections using the filled nlp config
|
# Resolve all training-relevant sections using the filled nlp config
|
||||||
T = registry.resolve(config["training"], schema=ConfigSchemaTraining)
|
T = registry.resolve(config["training"], schema=ConfigSchemaTraining)
|
||||||
dot_names = [T["train_corpus"], T["dev_corpus"]]
|
dot_names = [T["train_corpus"], T["dev_corpus"]]
|
||||||
|
if not isinstance(T["train_corpus"], str):
|
||||||
|
raise ConfigValidationError(desc=Errors.E897.format(field="training.train_corpus", type=type(T["train_corpus"])))
|
||||||
|
if not isinstance(T["dev_corpus"], str):
|
||||||
|
raise ConfigValidationError(desc=Errors.E897.format(field="training.dev_corpus", type=type(T["dev_corpus"])))
|
||||||
train_corpus, dev_corpus = resolve_dot_names(config, dot_names)
|
train_corpus, dev_corpus = resolve_dot_names(config, dot_names)
|
||||||
optimizer = T["optimizer"]
|
optimizer = T["optimizer"]
|
||||||
# Components that shouldn't be updated during training
|
# Components that shouldn't be updated during training
|
||||||
|
|
|
@ -143,7 +143,7 @@ argument that connects to the shared `tok2vec` component in the pipeline.
|
||||||
|
|
||||||
Construct an embedding layer that separately embeds a number of lexical
|
Construct an embedding layer that separately embeds a number of lexical
|
||||||
attributes using hash embedding, concatenates the results, and passes it through
|
attributes using hash embedding, concatenates the results, and passes it through
|
||||||
a feed-forward subnetwork to build a mixed representations. The features used
|
a feed-forward subnetwork to build a mixed representation. The features used
|
||||||
can be configured with the `attrs` argument. The suggested attributes are
|
can be configured with the `attrs` argument. The suggested attributes are
|
||||||
`NORM`, `PREFIX`, `SUFFIX` and `SHAPE`. This lets the model take into account
|
`NORM`, `PREFIX`, `SUFFIX` and `SHAPE`. This lets the model take into account
|
||||||
some subword information, without construction a fully character-based
|
some subword information, without construction a fully character-based
|
||||||
|
@ -516,26 +516,54 @@ several different built-in architectures. It is recommended to experiment with
|
||||||
different architectures and settings to determine what works best on your
|
different architectures and settings to determine what works best on your
|
||||||
specific data and challenge.
|
specific data and challenge.
|
||||||
|
|
||||||
### spacy.TextCatEnsemble.v1 {#TextCatEnsemble}
|
### spacy.TextCatEnsemble.v2 {#TextCatEnsemble}
|
||||||
|
|
||||||
> #### Example Config
|
> #### Example Config
|
||||||
>
|
>
|
||||||
> ```ini
|
> ```ini
|
||||||
> [model]
|
> [model]
|
||||||
> @architectures = "spacy.TextCatEnsemble.v1"
|
> @architectures = "spacy.TextCatEnsemble.v2"
|
||||||
> exclusive_classes = false
|
|
||||||
> pretrained_vectors = null
|
|
||||||
> width = 64
|
|
||||||
> embed_size = 2000
|
|
||||||
> conv_depth = 2
|
|
||||||
> window_size = 1
|
|
||||||
> ngram_size = 1
|
|
||||||
> dropout = null
|
|
||||||
> nO = null
|
> nO = null
|
||||||
|
>
|
||||||
|
> [model.linear_model]
|
||||||
|
> @architectures = "spacy.TextCatBOW.v1"
|
||||||
|
> exclusive_classes = true
|
||||||
|
> ngram_size = 1
|
||||||
|
> no_output_layer = false
|
||||||
|
>
|
||||||
|
> [model.tok2vec]
|
||||||
|
> @architectures = "spacy.Tok2Vec.v1"
|
||||||
|
>
|
||||||
|
> [model.tok2vec.embed]
|
||||||
|
> @architectures = "spacy.MultiHashEmbed.v1"
|
||||||
|
> width = 64
|
||||||
|
> rows = [2000, 2000, 1000, 1000, 1000, 1000]
|
||||||
|
> attrs = ["ORTH", "LOWER", "PREFIX", "SUFFIX", "SHAPE", "ID"]
|
||||||
|
> include_static_vectors = false
|
||||||
|
>
|
||||||
|
> [model.tok2vec.encode]
|
||||||
|
> @architectures = "spacy.MaxoutWindowEncoder.v1"
|
||||||
|
> width = ${model.tok2vec.embed.width}
|
||||||
|
> window_size = 1
|
||||||
|
> maxout_pieces = 3
|
||||||
|
> depth = 2
|
||||||
> ```
|
> ```
|
||||||
|
|
||||||
Stacked ensemble of a bag-of-words model and a neural network model. The neural
|
Stacked ensemble of a linear bag-of-words model and a neural network model. The
|
||||||
network has an internal CNN Tok2Vec layer and uses attention.
|
neural network is built upon a Tok2Vec layer and uses attention. The setting for
|
||||||
|
whether or not this model should cater for multi-label classification, is taken
|
||||||
|
from the linear model, where it is stored in `model.attrs["multi_label"]`.
|
||||||
|
|
||||||
|
| Name | Description |
|
||||||
|
| -------------- | ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
|
||||||
|
| `linear_model` | The linear bag-of-words model. ~~Model[List[Doc], Floats2d]~~ |
|
||||||
|
| `tok2vec` | The `tok2vec` layer to build the neural network upon. ~~Model[List[Doc], List[Floats2d]]~~ |
|
||||||
|
| `nO` | Output dimension, determined by the number of different labels. If not set, the [`TextCategorizer`](/api/textcategorizer) component will set it when `initialize` is called. ~~Optional[int]~~ |
|
||||||
|
| **CREATES** | The model using the architecture. ~~Model[List[Doc], Floats2d]~~ |
|
||||||
|
|
||||||
|
<Accordion title="spacy.TextCatEnsemble.v1 definition" spaced>
|
||||||
|
|
||||||
|
The v1 was functionally similar, but used an internal `tok2vec` instead of taking it as argument.
|
||||||
|
|
||||||
| Name | Description |
|
| Name | Description |
|
||||||
| -------------------- | ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
|
| -------------------- | ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
|
||||||
|
@ -550,6 +578,8 @@ network has an internal CNN Tok2Vec layer and uses attention.
|
||||||
| `nO` | Output dimension, determined by the number of different labels. If not set, the [`TextCategorizer`](/api/textcategorizer) component will set it when `initialize` is called. ~~Optional[int]~~ |
|
| `nO` | Output dimension, determined by the number of different labels. If not set, the [`TextCategorizer`](/api/textcategorizer) component will set it when `initialize` is called. ~~Optional[int]~~ |
|
||||||
| **CREATES** | The model using the architecture. ~~Model[List[Doc], Floats2d]~~ |
|
| **CREATES** | The model using the architecture. ~~Model[List[Doc], Floats2d]~~ |
|
||||||
|
|
||||||
|
</Accordion>
|
||||||
|
|
||||||
### spacy.TextCatCNN.v1 {#TextCatCNN}
|
### spacy.TextCatCNN.v1 {#TextCatCNN}
|
||||||
|
|
||||||
> #### Example Config
|
> #### Example Config
|
||||||
|
|
|
@ -174,15 +174,25 @@ Calculate the UAS, LAS, and LAS per type scores for dependency parses.
|
||||||
## Scorer.score_cats {#score_cats tag="staticmethod" new="3"}
|
## Scorer.score_cats {#score_cats tag="staticmethod" new="3"}
|
||||||
|
|
||||||
Calculate PRF and ROC AUC scores for a doc-level attribute that is a dict
|
Calculate PRF and ROC AUC scores for a doc-level attribute that is a dict
|
||||||
containing scores for each label like `Doc.cats`. The reported overall score
|
containing scores for each label like `Doc.cats`. The returned dictionary
|
||||||
depends on the scorer settings:
|
contains the following scores:
|
||||||
|
|
||||||
1. **all:** `{attr}_score` (one of `{attr}_f` / `{attr}_macro_f` /
|
- `{attr}_micro_p`, `{attr}_micro_r` and `{attr}_micro_f`: each instance across
|
||||||
`{attr}_macro_auc`), `{attr}_score_desc` (text description of the overall
|
each label is weighted equally
|
||||||
score), `{attr}_f_per_type`, `{attr}_auc_per_type`
|
- `{attr}_macro_p`, `{attr}_macro_r` and `{attr}_macro_f`: the average values
|
||||||
2. **binary exclusive with positive label:** `{attr}_p`, `{attr}_r`, `{attr}_f`
|
across evaluations per label
|
||||||
3. **3+ exclusive classes**, macro-averaged F-score: `{attr}_macro_f`;
|
- `{attr}_f_per_type` and `{attr}_auc_per_type`: each contains a dictionary of
|
||||||
4. **multilabel**, macro-averaged AUC: `{attr}_macro_auc`
|
scores, keyed by label
|
||||||
|
- A final `{attr}_score` and corresponding `{attr}_score_desc` (text
|
||||||
|
description)
|
||||||
|
|
||||||
|
The reported `{attr}_score` depends on the classification properties:
|
||||||
|
|
||||||
|
- **binary exclusive with positive label:** `{attr}_score` is set to the F-score
|
||||||
|
of the positive label
|
||||||
|
- **3+ exclusive classes**, macro-averaged F-score:
|
||||||
|
`{attr}_score = {attr}_macro_f`
|
||||||
|
- **multilabel**, macro-averaged AUC: `{attr}_score = {attr}_macro_auc`
|
||||||
|
|
||||||
> #### Example
|
> #### Example
|
||||||
>
|
>
|
||||||
|
|
|
@ -130,16 +130,31 @@ factory = "textcat"
|
||||||
labels = []
|
labels = []
|
||||||
|
|
||||||
[components.textcat.model]
|
[components.textcat.model]
|
||||||
@architectures = "spacy.TextCatEnsemble.v1"
|
@architectures = "spacy.TextCatEnsemble.v2"
|
||||||
exclusive_classes = false
|
|
||||||
pretrained_vectors = null
|
|
||||||
width = 64
|
|
||||||
conv_depth = 2
|
|
||||||
embed_size = 2000
|
|
||||||
window_size = 1
|
|
||||||
ngram_size = 1
|
|
||||||
dropout = 0
|
|
||||||
nO = null
|
nO = null
|
||||||
|
|
||||||
|
[components.textcat.model.tok2vec]
|
||||||
|
@architectures = "spacy.Tok2Vec.v1"
|
||||||
|
|
||||||
|
[components.textcat.model.tok2vec.embed]
|
||||||
|
@architectures = "spacy.MultiHashEmbed.v1"
|
||||||
|
width = 64
|
||||||
|
rows = [2000, 2000, 1000, 1000, 1000, 1000]
|
||||||
|
attrs = ["ORTH", "LOWER", "PREFIX", "SUFFIX", "SHAPE", "ID"]
|
||||||
|
include_static_vectors = false
|
||||||
|
|
||||||
|
[components.textcat.model.tok2vec.encode]
|
||||||
|
@architectures = "spacy.MaxoutWindowEncoder.v1"
|
||||||
|
width = ${components.textcat.model.tok2vec.embed.width}
|
||||||
|
window_size = 1
|
||||||
|
maxout_pieces = 3
|
||||||
|
depth = 2
|
||||||
|
|
||||||
|
[components.textcat.model.linear_model]
|
||||||
|
@architectures = "spacy.TextCatBOW.v1"
|
||||||
|
exclusive_classes = false
|
||||||
|
ngram_size = 1
|
||||||
|
no_output_layer = false
|
||||||
```
|
```
|
||||||
|
|
||||||
spaCy has two additional built-in `textcat` architectures, and you can easily
|
spaCy has two additional built-in `textcat` architectures, and you can easily
|
||||||
|
@ -687,7 +702,7 @@ Before the model can be used, it needs to be
|
||||||
[initialized](/usage/training#initialization). This function receives a callback
|
[initialized](/usage/training#initialization). This function receives a callback
|
||||||
to access the full **training data set**, or a representative sample. This data
|
to access the full **training data set**, or a representative sample. This data
|
||||||
set can be used to deduce all **relevant labels**. Alternatively, a list of
|
set can be used to deduce all **relevant labels**. Alternatively, a list of
|
||||||
labels can be provided to `initialize`, or you can call
|
labels can be provided to `initialize`, or you can call
|
||||||
`RelationExtractor.add_label` directly. The number of labels defines the output
|
`RelationExtractor.add_label` directly. The number of labels defines the output
|
||||||
dimensionality of the network, and will be used to do
|
dimensionality of the network, and will be used to do
|
||||||
[shape inference](https://thinc.ai/docs/usage-models#validation) throughout the
|
[shape inference](https://thinc.ai/docs/usage-models#validation) throughout the
|
||||||
|
|
|
@ -1244,15 +1244,10 @@ labels = []
|
||||||
# This function is created and then passed to the "textcat" component as
|
# This function is created and then passed to the "textcat" component as
|
||||||
# the argument "model"
|
# the argument "model"
|
||||||
[components.textcat.model]
|
[components.textcat.model]
|
||||||
@architectures = "spacy.TextCatEnsemble.v1"
|
@architectures = "spacy.TextCatBOW.v1"
|
||||||
exclusive_classes = false
|
exclusive_classes = false
|
||||||
pretrained_vectors = null
|
|
||||||
width = 64
|
|
||||||
conv_depth = 2
|
|
||||||
embed_size = 2000
|
|
||||||
window_size = 1
|
|
||||||
ngram_size = 1
|
ngram_size = 1
|
||||||
dropout = null
|
no_output_layer = false
|
||||||
|
|
||||||
[components.other_textcat]
|
[components.other_textcat]
|
||||||
factory = "textcat"
|
factory = "textcat"
|
||||||
|
|
|
@ -717,7 +717,7 @@ tabular results to a file:
|
||||||
```python
|
```python
|
||||||
### functions.py
|
### functions.py
|
||||||
import sys
|
import sys
|
||||||
from typing import IO, Tuple, Callable, Dict, Any
|
from typing import IO, Tuple, Callable, Dict, Any, Optional
|
||||||
import spacy
|
import spacy
|
||||||
from spacy import Language
|
from spacy import Language
|
||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
|
@ -729,7 +729,7 @@ def custom_logger(log_path):
|
||||||
stdout: IO=sys.stdout,
|
stdout: IO=sys.stdout,
|
||||||
stderr: IO=sys.stderr
|
stderr: IO=sys.stderr
|
||||||
) -> Tuple[Callable, Callable]:
|
) -> Tuple[Callable, Callable]:
|
||||||
stdout.write(f"Logging to {log_path}\n")
|
stdout.write(f"Logging to {log_path}\\n")
|
||||||
log_file = Path(log_path).open("w", encoding="utf8")
|
log_file = Path(log_path).open("w", encoding="utf8")
|
||||||
log_file.write("step\\t")
|
log_file.write("step\\t")
|
||||||
log_file.write("score\\t")
|
log_file.write("score\\t")
|
||||||
|
|
Loading…
Reference in New Issue
Block a user