mirror of
https://github.com/explosion/spaCy.git
synced 2024-12-26 01:46:28 +03:00
Update TextCatBOW
to use the fixed SparseLinear
layer (#13149)
* Update `TextCatBOW` to use the fixed `SparseLinear` layer
A while ago, we fixed the `SparseLinear` layer to use all available parameters: https://github.com/explosion/thinc/pull/754
This change updates `TextCatBOW` to `v3` which uses the new `SparseLinear_v2` layer. This results in a sizeable improvement on a text categorization task that was tested.
While at it, this `spacy.TextCatBOW.v3` also adds the `length_exponent` option to make it possible to change the hidden size. Ideally, we'd just have an option called `length`. But the way that `TextCatBOW` uses hashes results in a non-uniform distribution of parameters when the length is not a power of two.
* Replace `TextCatBOW` `length_exponent` parameter by `length`
We now round up the length to the next power of two if it isn't a power of two.
* Remove some tests for TextCatBOW.v2
* Fix missing import
This commit is contained in:
parent
bf7c2ea99a
commit
da7ad97519
|
@ -271,8 +271,9 @@ grad_factor = 1.0
|
||||||
@layers = "reduce_mean.v1"
|
@layers = "reduce_mean.v1"
|
||||||
|
|
||||||
[components.textcat.model.linear_model]
|
[components.textcat.model.linear_model]
|
||||||
@architectures = "spacy.TextCatBOW.v2"
|
@architectures = "spacy.TextCatBOW.v3"
|
||||||
exclusive_classes = true
|
exclusive_classes = true
|
||||||
|
length = 262144
|
||||||
ngram_size = 1
|
ngram_size = 1
|
||||||
no_output_layer = false
|
no_output_layer = false
|
||||||
|
|
||||||
|
@ -308,8 +309,9 @@ grad_factor = 1.0
|
||||||
@layers = "reduce_mean.v1"
|
@layers = "reduce_mean.v1"
|
||||||
|
|
||||||
[components.textcat_multilabel.model.linear_model]
|
[components.textcat_multilabel.model.linear_model]
|
||||||
@architectures = "spacy.TextCatBOW.v2"
|
@architectures = "spacy.TextCatBOW.v3"
|
||||||
exclusive_classes = false
|
exclusive_classes = false
|
||||||
|
length = 262144
|
||||||
ngram_size = 1
|
ngram_size = 1
|
||||||
no_output_layer = false
|
no_output_layer = false
|
||||||
|
|
||||||
|
@ -542,14 +544,15 @@ nO = null
|
||||||
width = ${components.tok2vec.model.encode.width}
|
width = ${components.tok2vec.model.encode.width}
|
||||||
|
|
||||||
[components.textcat.model.linear_model]
|
[components.textcat.model.linear_model]
|
||||||
@architectures = "spacy.TextCatBOW.v2"
|
@architectures = "spacy.TextCatBOW.v3"
|
||||||
exclusive_classes = true
|
exclusive_classes = true
|
||||||
|
length = 262144
|
||||||
ngram_size = 1
|
ngram_size = 1
|
||||||
no_output_layer = false
|
no_output_layer = false
|
||||||
|
|
||||||
{% else -%}
|
{% else -%}
|
||||||
[components.textcat.model]
|
[components.textcat.model]
|
||||||
@architectures = "spacy.TextCatBOW.v2"
|
@architectures = "spacy.TextCatBOW.v3"
|
||||||
exclusive_classes = true
|
exclusive_classes = true
|
||||||
ngram_size = 1
|
ngram_size = 1
|
||||||
no_output_layer = false
|
no_output_layer = false
|
||||||
|
@ -570,15 +573,17 @@ nO = null
|
||||||
width = ${components.tok2vec.model.encode.width}
|
width = ${components.tok2vec.model.encode.width}
|
||||||
|
|
||||||
[components.textcat_multilabel.model.linear_model]
|
[components.textcat_multilabel.model.linear_model]
|
||||||
@architectures = "spacy.TextCatBOW.v2"
|
@architectures = "spacy.TextCatBOW.v3"
|
||||||
exclusive_classes = false
|
exclusive_classes = false
|
||||||
|
length = 262144
|
||||||
ngram_size = 1
|
ngram_size = 1
|
||||||
no_output_layer = false
|
no_output_layer = false
|
||||||
|
|
||||||
{% else -%}
|
{% else -%}
|
||||||
[components.textcat_multilabel.model]
|
[components.textcat_multilabel.model]
|
||||||
@architectures = "spacy.TextCatBOW.v2"
|
@architectures = "spacy.TextCatBOW.v3"
|
||||||
exclusive_classes = false
|
exclusive_classes = false
|
||||||
|
length = 262144
|
||||||
ngram_size = 1
|
ngram_size = 1
|
||||||
no_output_layer = false
|
no_output_layer = false
|
||||||
{%- endif %}
|
{%- endif %}
|
||||||
|
|
|
@ -983,6 +983,7 @@ class Errors(metaclass=ErrorsWithCodes):
|
||||||
"predicted docs when training {component}.")
|
"predicted docs when training {component}.")
|
||||||
E1055 = ("The 'replace_listener' callback expects {num_params} parameters, "
|
E1055 = ("The 'replace_listener' callback expects {num_params} parameters, "
|
||||||
"but only callbacks with one or three parameters are supported")
|
"but only callbacks with one or three parameters are supported")
|
||||||
|
E1056 = ("The `TextCatBOW` architecture expects a length of at least 1, was {length}.")
|
||||||
|
|
||||||
|
|
||||||
# Deprecated model shortcuts, only used in errors and warnings
|
# Deprecated model shortcuts, only used in errors and warnings
|
||||||
|
|
|
@ -1,5 +1,5 @@
|
||||||
from functools import partial
|
from functools import partial
|
||||||
from typing import List, Optional, cast
|
from typing import List, Optional, Tuple, cast
|
||||||
|
|
||||||
from thinc.api import (
|
from thinc.api import (
|
||||||
Dropout,
|
Dropout,
|
||||||
|
@ -12,6 +12,7 @@ from thinc.api import (
|
||||||
Relu,
|
Relu,
|
||||||
Softmax,
|
Softmax,
|
||||||
SparseLinear,
|
SparseLinear,
|
||||||
|
SparseLinear_v2,
|
||||||
chain,
|
chain,
|
||||||
clone,
|
clone,
|
||||||
concatenate,
|
concatenate,
|
||||||
|
@ -25,9 +26,10 @@ from thinc.api import (
|
||||||
)
|
)
|
||||||
from thinc.layers.chain import init as init_chain
|
from thinc.layers.chain import init as init_chain
|
||||||
from thinc.layers.resizable import resize_linear_weighted, resize_model
|
from thinc.layers.resizable import resize_linear_weighted, resize_model
|
||||||
from thinc.types import Floats2d
|
from thinc.types import ArrayXd, Floats2d
|
||||||
|
|
||||||
from ...attrs import ORTH
|
from ...attrs import ORTH
|
||||||
|
from ...errors import Errors
|
||||||
from ...tokens import Doc
|
from ...tokens import Doc
|
||||||
from ...util import registry
|
from ...util import registry
|
||||||
from ..extract_ngrams import extract_ngrams
|
from ..extract_ngrams import extract_ngrams
|
||||||
|
@ -95,10 +97,48 @@ def build_bow_text_classifier(
|
||||||
ngram_size: int,
|
ngram_size: int,
|
||||||
no_output_layer: bool,
|
no_output_layer: bool,
|
||||||
nO: Optional[int] = None,
|
nO: Optional[int] = None,
|
||||||
|
) -> Model[List[Doc], Floats2d]:
|
||||||
|
return _build_bow_text_classifier(
|
||||||
|
exclusive_classes=exclusive_classes,
|
||||||
|
ngram_size=ngram_size,
|
||||||
|
no_output_layer=no_output_layer,
|
||||||
|
nO=nO,
|
||||||
|
sparse_linear=SparseLinear(nO=nO),
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
|
@registry.architectures("spacy.TextCatBOW.v3")
|
||||||
|
def build_bow_text_classifier_v3(
|
||||||
|
exclusive_classes: bool,
|
||||||
|
ngram_size: int,
|
||||||
|
no_output_layer: bool,
|
||||||
|
length: int = 262144,
|
||||||
|
nO: Optional[int] = None,
|
||||||
|
) -> Model[List[Doc], Floats2d]:
|
||||||
|
if length < 1:
|
||||||
|
raise ValueError(Errors.E1056.format(length=length))
|
||||||
|
|
||||||
|
# Find k such that 2**(k-1) < length <= 2**k.
|
||||||
|
length = 2 ** (length - 1).bit_length()
|
||||||
|
|
||||||
|
return _build_bow_text_classifier(
|
||||||
|
exclusive_classes=exclusive_classes,
|
||||||
|
ngram_size=ngram_size,
|
||||||
|
no_output_layer=no_output_layer,
|
||||||
|
nO=nO,
|
||||||
|
sparse_linear=SparseLinear_v2(nO=nO, length=length),
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
|
def _build_bow_text_classifier(
|
||||||
|
exclusive_classes: bool,
|
||||||
|
ngram_size: int,
|
||||||
|
no_output_layer: bool,
|
||||||
|
sparse_linear: Model[Tuple[ArrayXd, ArrayXd, ArrayXd], ArrayXd],
|
||||||
|
nO: Optional[int] = None,
|
||||||
) -> Model[List[Doc], Floats2d]:
|
) -> Model[List[Doc], Floats2d]:
|
||||||
fill_defaults = {"b": 0, "W": 0}
|
fill_defaults = {"b": 0, "W": 0}
|
||||||
with Model.define_operators({">>": chain}):
|
with Model.define_operators({">>": chain}):
|
||||||
sparse_linear = SparseLinear(nO=nO)
|
|
||||||
output_layer = None
|
output_layer = None
|
||||||
if not no_output_layer:
|
if not no_output_layer:
|
||||||
fill_defaults["b"] = NEG_VALUE
|
fill_defaults["b"] = NEG_VALUE
|
||||||
|
|
|
@ -36,8 +36,9 @@ maxout_pieces = 3
|
||||||
depth = 2
|
depth = 2
|
||||||
|
|
||||||
[model.linear_model]
|
[model.linear_model]
|
||||||
@architectures = "spacy.TextCatBOW.v2"
|
@architectures = "spacy.TextCatBOW.v3"
|
||||||
exclusive_classes = true
|
exclusive_classes = true
|
||||||
|
length = 262144
|
||||||
ngram_size = 1
|
ngram_size = 1
|
||||||
no_output_layer = false
|
no_output_layer = false
|
||||||
"""
|
"""
|
||||||
|
@ -45,8 +46,9 @@ DEFAULT_SINGLE_TEXTCAT_MODEL = Config().from_str(single_label_default_config)["m
|
||||||
|
|
||||||
single_label_bow_config = """
|
single_label_bow_config = """
|
||||||
[model]
|
[model]
|
||||||
@architectures = "spacy.TextCatBOW.v2"
|
@architectures = "spacy.TextCatBOW.v3"
|
||||||
exclusive_classes = true
|
exclusive_classes = true
|
||||||
|
length = 262144
|
||||||
ngram_size = 1
|
ngram_size = 1
|
||||||
no_output_layer = false
|
no_output_layer = false
|
||||||
"""
|
"""
|
||||||
|
|
|
@ -35,8 +35,9 @@ maxout_pieces = 3
|
||||||
depth = 2
|
depth = 2
|
||||||
|
|
||||||
[model.linear_model]
|
[model.linear_model]
|
||||||
@architectures = "spacy.TextCatBOW.v2"
|
@architectures = "spacy.TextCatBOW.v3"
|
||||||
exclusive_classes = false
|
exclusive_classes = false
|
||||||
|
length = 262144
|
||||||
ngram_size = 1
|
ngram_size = 1
|
||||||
no_output_layer = false
|
no_output_layer = false
|
||||||
"""
|
"""
|
||||||
|
@ -44,7 +45,7 @@ DEFAULT_MULTI_TEXTCAT_MODEL = Config().from_str(multi_label_default_config)["mod
|
||||||
|
|
||||||
multi_label_bow_config = """
|
multi_label_bow_config = """
|
||||||
[model]
|
[model]
|
||||||
@architectures = "spacy.TextCatBOW.v2"
|
@architectures = "spacy.TextCatBOW.v3"
|
||||||
exclusive_classes = false
|
exclusive_classes = false
|
||||||
ngram_size = 1
|
ngram_size = 1
|
||||||
no_output_layer = false
|
no_output_layer = false
|
||||||
|
|
|
@ -203,7 +203,7 @@ def test_pipe_class_component_model():
|
||||||
"@architectures": "spacy.TextCatEnsemble.v2",
|
"@architectures": "spacy.TextCatEnsemble.v2",
|
||||||
"tok2vec": DEFAULT_TOK2VEC_MODEL,
|
"tok2vec": DEFAULT_TOK2VEC_MODEL,
|
||||||
"linear_model": {
|
"linear_model": {
|
||||||
"@architectures": "spacy.TextCatBOW.v2",
|
"@architectures": "spacy.TextCatBOW.v3",
|
||||||
"exclusive_classes": False,
|
"exclusive_classes": False,
|
||||||
"ngram_size": 1,
|
"ngram_size": 1,
|
||||||
"no_output_layer": False,
|
"no_output_layer": False,
|
||||||
|
|
|
@ -414,7 +414,7 @@ def test_implicit_label(name, get_examples):
|
||||||
@pytest.mark.parametrize(
|
@pytest.mark.parametrize(
|
||||||
"name,textcat_config",
|
"name,textcat_config",
|
||||||
[
|
[
|
||||||
# BOW
|
# BOW V1
|
||||||
("textcat", {"@architectures": "spacy.TextCatBOW.v1", "exclusive_classes": True, "no_output_layer": False, "ngram_size": 3}),
|
("textcat", {"@architectures": "spacy.TextCatBOW.v1", "exclusive_classes": True, "no_output_layer": False, "ngram_size": 3}),
|
||||||
("textcat", {"@architectures": "spacy.TextCatBOW.v1", "exclusive_classes": True, "no_output_layer": True, "ngram_size": 3}),
|
("textcat", {"@architectures": "spacy.TextCatBOW.v1", "exclusive_classes": True, "no_output_layer": True, "ngram_size": 3}),
|
||||||
("textcat_multilabel", {"@architectures": "spacy.TextCatBOW.v1", "exclusive_classes": False, "no_output_layer": False, "ngram_size": 3}),
|
("textcat_multilabel", {"@architectures": "spacy.TextCatBOW.v1", "exclusive_classes": False, "no_output_layer": False, "ngram_size": 3}),
|
||||||
|
@ -451,11 +451,11 @@ def test_no_resize(name, textcat_config):
|
||||||
@pytest.mark.parametrize(
|
@pytest.mark.parametrize(
|
||||||
"name,textcat_config",
|
"name,textcat_config",
|
||||||
[
|
[
|
||||||
# BOW
|
# BOW V3
|
||||||
("textcat", {"@architectures": "spacy.TextCatBOW.v2", "exclusive_classes": True, "no_output_layer": False, "ngram_size": 3}),
|
("textcat", {"@architectures": "spacy.TextCatBOW.v3", "exclusive_classes": True, "no_output_layer": False, "ngram_size": 3}),
|
||||||
("textcat", {"@architectures": "spacy.TextCatBOW.v2", "exclusive_classes": True, "no_output_layer": True, "ngram_size": 3}),
|
("textcat", {"@architectures": "spacy.TextCatBOW.v3", "exclusive_classes": True, "no_output_layer": True, "ngram_size": 3}),
|
||||||
("textcat_multilabel", {"@architectures": "spacy.TextCatBOW.v2", "exclusive_classes": False, "no_output_layer": False, "ngram_size": 3}),
|
("textcat_multilabel", {"@architectures": "spacy.TextCatBOW.v3", "exclusive_classes": False, "no_output_layer": False, "ngram_size": 3}),
|
||||||
("textcat_multilabel", {"@architectures": "spacy.TextCatBOW.v2", "exclusive_classes": False, "no_output_layer": True, "ngram_size": 3}),
|
("textcat_multilabel", {"@architectures": "spacy.TextCatBOW.v3", "exclusive_classes": False, "no_output_layer": True, "ngram_size": 3}),
|
||||||
# CNN
|
# CNN
|
||||||
("textcat", {"@architectures": "spacy.TextCatCNN.v2", "tok2vec": DEFAULT_TOK2VEC_MODEL, "exclusive_classes": True}),
|
("textcat", {"@architectures": "spacy.TextCatCNN.v2", "tok2vec": DEFAULT_TOK2VEC_MODEL, "exclusive_classes": True}),
|
||||||
("textcat_multilabel", {"@architectures": "spacy.TextCatCNN.v2", "tok2vec": DEFAULT_TOK2VEC_MODEL, "exclusive_classes": False}),
|
("textcat_multilabel", {"@architectures": "spacy.TextCatCNN.v2", "tok2vec": DEFAULT_TOK2VEC_MODEL, "exclusive_classes": False}),
|
||||||
|
@ -480,11 +480,11 @@ def test_resize(name, textcat_config):
|
||||||
@pytest.mark.parametrize(
|
@pytest.mark.parametrize(
|
||||||
"name,textcat_config",
|
"name,textcat_config",
|
||||||
[
|
[
|
||||||
# BOW
|
# BOW V3
|
||||||
("textcat", {"@architectures": "spacy.TextCatBOW.v2", "exclusive_classes": True, "no_output_layer": False, "ngram_size": 3}),
|
("textcat", {"@architectures": "spacy.TextCatBOW.v3", "exclusive_classes": True, "no_output_layer": False, "ngram_size": 3}),
|
||||||
("textcat", {"@architectures": "spacy.TextCatBOW.v2", "exclusive_classes": True, "no_output_layer": True, "ngram_size": 3}),
|
("textcat", {"@architectures": "spacy.TextCatBOW.v3", "exclusive_classes": True, "no_output_layer": True, "ngram_size": 3}),
|
||||||
("textcat_multilabel", {"@architectures": "spacy.TextCatBOW.v2", "exclusive_classes": False, "no_output_layer": False, "ngram_size": 3}),
|
("textcat_multilabel", {"@architectures": "spacy.TextCatBOW.v3", "exclusive_classes": False, "no_output_layer": False, "ngram_size": 3}),
|
||||||
("textcat_multilabel", {"@architectures": "spacy.TextCatBOW.v2", "exclusive_classes": False, "no_output_layer": True, "ngram_size": 3}),
|
("textcat_multilabel", {"@architectures": "spacy.TextCatBOW.v3", "exclusive_classes": False, "no_output_layer": True, "ngram_size": 3}),
|
||||||
# CNN
|
# CNN
|
||||||
("textcat", {"@architectures": "spacy.TextCatCNN.v2", "tok2vec": DEFAULT_TOK2VEC_MODEL, "exclusive_classes": True}),
|
("textcat", {"@architectures": "spacy.TextCatCNN.v2", "tok2vec": DEFAULT_TOK2VEC_MODEL, "exclusive_classes": True}),
|
||||||
("textcat_multilabel", {"@architectures": "spacy.TextCatCNN.v2", "tok2vec": DEFAULT_TOK2VEC_MODEL, "exclusive_classes": False}),
|
("textcat_multilabel", {"@architectures": "spacy.TextCatCNN.v2", "tok2vec": DEFAULT_TOK2VEC_MODEL, "exclusive_classes": False}),
|
||||||
|
@ -693,9 +693,14 @@ def test_overfitting_IO_multi():
|
||||||
("textcat", TRAIN_DATA_SINGLE_LABEL, {"@architectures": "spacy.TextCatBOW.v2", "exclusive_classes": True, "ngram_size": 4, "no_output_layer": False}),
|
("textcat", TRAIN_DATA_SINGLE_LABEL, {"@architectures": "spacy.TextCatBOW.v2", "exclusive_classes": True, "ngram_size": 4, "no_output_layer": False}),
|
||||||
("textcat_multilabel", TRAIN_DATA_MULTI_LABEL, {"@architectures": "spacy.TextCatBOW.v2", "exclusive_classes": False, "ngram_size": 3, "no_output_layer": True}),
|
("textcat_multilabel", TRAIN_DATA_MULTI_LABEL, {"@architectures": "spacy.TextCatBOW.v2", "exclusive_classes": False, "ngram_size": 3, "no_output_layer": True}),
|
||||||
("textcat", TRAIN_DATA_SINGLE_LABEL, {"@architectures": "spacy.TextCatBOW.v2", "exclusive_classes": True, "ngram_size": 2, "no_output_layer": True}),
|
("textcat", TRAIN_DATA_SINGLE_LABEL, {"@architectures": "spacy.TextCatBOW.v2", "exclusive_classes": True, "ngram_size": 2, "no_output_layer": True}),
|
||||||
|
# BOW V3
|
||||||
|
("textcat_multilabel", TRAIN_DATA_MULTI_LABEL, {"@architectures": "spacy.TextCatBOW.v3", "exclusive_classes": False, "ngram_size": 1, "no_output_layer": False}),
|
||||||
|
("textcat", TRAIN_DATA_SINGLE_LABEL, {"@architectures": "spacy.TextCatBOW.v3", "exclusive_classes": True, "ngram_size": 4, "no_output_layer": False}),
|
||||||
|
("textcat_multilabel", TRAIN_DATA_MULTI_LABEL, {"@architectures": "spacy.TextCatBOW.v3", "exclusive_classes": False, "ngram_size": 3, "no_output_layer": True}),
|
||||||
|
("textcat", TRAIN_DATA_SINGLE_LABEL, {"@architectures": "spacy.TextCatBOW.v3", "exclusive_classes": True, "ngram_size": 2, "no_output_layer": True}),
|
||||||
# ENSEMBLE V2
|
# ENSEMBLE V2
|
||||||
("textcat_multilabel", TRAIN_DATA_MULTI_LABEL, {"@architectures": "spacy.TextCatEnsemble.v2", "tok2vec": DEFAULT_TOK2VEC_MODEL, "linear_model": {"@architectures": "spacy.TextCatBOW.v2", "exclusive_classes": False, "ngram_size": 1, "no_output_layer": False}}),
|
("textcat_multilabel", TRAIN_DATA_MULTI_LABEL, {"@architectures": "spacy.TextCatEnsemble.v2", "tok2vec": DEFAULT_TOK2VEC_MODEL, "linear_model": {"@architectures": "spacy.TextCatBOW.v3", "exclusive_classes": False, "ngram_size": 1, "no_output_layer": False}}),
|
||||||
("textcat", TRAIN_DATA_SINGLE_LABEL, {"@architectures": "spacy.TextCatEnsemble.v2", "tok2vec": DEFAULT_TOK2VEC_MODEL, "linear_model": {"@architectures": "spacy.TextCatBOW.v2", "exclusive_classes": True, "ngram_size": 5, "no_output_layer": False}}),
|
("textcat", TRAIN_DATA_SINGLE_LABEL, {"@architectures": "spacy.TextCatEnsemble.v2", "tok2vec": DEFAULT_TOK2VEC_MODEL, "linear_model": {"@architectures": "spacy.TextCatBOW.v3", "exclusive_classes": True, "ngram_size": 5, "no_output_layer": False}}),
|
||||||
# CNN V2
|
# CNN V2
|
||||||
("textcat", TRAIN_DATA_SINGLE_LABEL, {"@architectures": "spacy.TextCatCNN.v2", "tok2vec": DEFAULT_TOK2VEC_MODEL, "exclusive_classes": True}),
|
("textcat", TRAIN_DATA_SINGLE_LABEL, {"@architectures": "spacy.TextCatCNN.v2", "tok2vec": DEFAULT_TOK2VEC_MODEL, "exclusive_classes": True}),
|
||||||
("textcat_multilabel", TRAIN_DATA_MULTI_LABEL, {"@architectures": "spacy.TextCatCNN.v2", "tok2vec": DEFAULT_TOK2VEC_MODEL, "exclusive_classes": False}),
|
("textcat_multilabel", TRAIN_DATA_MULTI_LABEL, {"@architectures": "spacy.TextCatCNN.v2", "tok2vec": DEFAULT_TOK2VEC_MODEL, "exclusive_classes": False}),
|
||||||
|
|
|
@ -238,7 +238,7 @@ def test_project_push_pull(project_dir):
|
||||||
|
|
||||||
def test_find_function_valid():
|
def test_find_function_valid():
|
||||||
# example of architecture in main code base
|
# example of architecture in main code base
|
||||||
function = "spacy.TextCatBOW.v2"
|
function = "spacy.TextCatBOW.v3"
|
||||||
result = CliRunner().invoke(app, ["find-function", function, "-r", "architectures"])
|
result = CliRunner().invoke(app, ["find-function", function, "-r", "architectures"])
|
||||||
assert f"Found registered function '{function}'" in result.stdout
|
assert f"Found registered function '{function}'" in result.stdout
|
||||||
assert "textcat.py" in result.stdout
|
assert "textcat.py" in result.stdout
|
||||||
|
@ -257,7 +257,7 @@ def test_find_function_valid():
|
||||||
|
|
||||||
def test_find_function_invalid():
|
def test_find_function_invalid():
|
||||||
# invalid registry
|
# invalid registry
|
||||||
function = "spacy.TextCatBOW.v2"
|
function = "spacy.TextCatBOW.v3"
|
||||||
registry = "foobar"
|
registry = "foobar"
|
||||||
result = CliRunner().invoke(
|
result = CliRunner().invoke(
|
||||||
app, ["find-function", function, "--registry", registry]
|
app, ["find-function", function, "--registry", registry]
|
||||||
|
|
|
@ -376,8 +376,9 @@ def test_util_dot_section():
|
||||||
factory = "textcat"
|
factory = "textcat"
|
||||||
|
|
||||||
[components.textcat.model]
|
[components.textcat.model]
|
||||||
@architectures = "spacy.TextCatBOW.v2"
|
@architectures = "spacy.TextCatBOW.v3"
|
||||||
exclusive_classes = true
|
exclusive_classes = true
|
||||||
|
length = 262144
|
||||||
ngram_size = 1
|
ngram_size = 1
|
||||||
no_output_layer = false
|
no_output_layer = false
|
||||||
"""
|
"""
|
||||||
|
|
|
@ -78,16 +78,16 @@ subword features, and a
|
||||||
[MaxoutWindowEncoder](/api/architectures#MaxoutWindowEncoder) encoding layer
|
[MaxoutWindowEncoder](/api/architectures#MaxoutWindowEncoder) encoding layer
|
||||||
consisting of a CNN and a layer-normalized maxout activation function.
|
consisting of a CNN and a layer-normalized maxout activation function.
|
||||||
|
|
||||||
| Name | Description |
|
| Name | Description |
|
||||||
| -------------------- | ----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
|
| -------------------- | --------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
|
||||||
| `width` | The width of the input and output. These are required to be the same, so that residual connections can be used. Recommended values are `96`, `128` or `300`. ~~int~~ |
|
| `width` | The width of the input and output. These are required to be the same, so that residual connections can be used. Recommended values are `96`, `128` or `300`. ~~int~~ |
|
||||||
| `depth` | The number of convolutional layers to use. Recommended values are between `2` and `8`. ~~int~~ |
|
| `depth` | The number of convolutional layers to use. Recommended values are between `2` and `8`. ~~int~~ |
|
||||||
| `embed_size` | The number of rows in the hash embedding tables. This can be surprisingly small, due to the use of the hash embeddings. Recommended values are between `2000` and `10000`. ~~int~~ |
|
| `embed_size` | The number of rows in the hash embedding tables. This can be surprisingly small, due to the use of the hash embeddings. Recommended values are between `2000` and `10000`. ~~int~~ |
|
||||||
| `window_size` | The number of tokens on either side to concatenate during the convolutions. The receptive field of the CNN will be `depth * window_size * 2 + 1`, so a 4-layer network with a window size of `2` will be sensitive to 17 words at a time. Recommended value is `1`. ~~int~~ |
|
| `window_size` | The number of tokens on either side to concatenate during the convolutions. The receptive field of the CNN will be `depth * window_size * 2 + 1`, so a 4-layer network with a window size of `2` will be sensitive to 17 words at a time. Recommended value is `1`. ~~int~~ |
|
||||||
| `maxout_pieces` | The number of pieces to use in the maxout non-linearity. If `1`, the [`Mish`](https://thinc.ai/docs/api-layers#mish) non-linearity is used instead. Recommended values are `1`-`3`. ~~int~~ |
|
| `maxout_pieces` | The number of pieces to use in the maxout non-linearity. If `1`, the [`Mish`](https://thinc.ai/docs/api-layers#mish) non-linearity is used instead. Recommended values are `1`-`3`. ~~int~~ |
|
||||||
| `subword_features` | Whether to also embed subword features, specifically the prefix, suffix and word shape. This is recommended for alphabetic languages like English, but not if single-character tokens are used for a language such as Chinese. ~~bool~~ |
|
| `subword_features` | Whether to also embed subword features, specifically the prefix, suffix and word shape. This is recommended for alphabetic languages like English, but not if single-character tokens are used for a language such as Chinese. ~~bool~~ |
|
||||||
| `pretrained_vectors` | Whether to also use static vectors. ~~bool~~ |
|
| `pretrained_vectors` | Whether to also use static vectors. ~~bool~~ |
|
||||||
| **CREATES** | The model using the architecture. ~~Model[List[Doc], List[Floats2d]]~~ |
|
| **CREATES** | The model using the architecture. ~~Model[List[Doc], List[Floats2d]]~~ |
|
||||||
|
|
||||||
### spacy.Tok2VecListener.v1 {id="Tok2VecListener"}
|
### spacy.Tok2VecListener.v1 {id="Tok2VecListener"}
|
||||||
|
|
||||||
|
@ -962,8 +962,9 @@ single-label use-cases where `exclusive_classes = true`, while the
|
||||||
> nO = null
|
> nO = null
|
||||||
>
|
>
|
||||||
> [model.linear_model]
|
> [model.linear_model]
|
||||||
> @architectures = "spacy.TextCatBOW.v2"
|
> @architectures = "spacy.TextCatBOW.v3"
|
||||||
> exclusive_classes = true
|
> exclusive_classes = true
|
||||||
|
> length = 262144
|
||||||
> ngram_size = 1
|
> ngram_size = 1
|
||||||
> no_output_layer = false
|
> no_output_layer = false
|
||||||
>
|
>
|
||||||
|
@ -1057,14 +1058,15 @@ after training.
|
||||||
|
|
||||||
</Accordion>
|
</Accordion>
|
||||||
|
|
||||||
### spacy.TextCatBOW.v2 {id="TextCatBOW"}
|
### spacy.TextCatBOW.v3 {id="TextCatBOW"}
|
||||||
|
|
||||||
> #### Example Config
|
> #### Example Config
|
||||||
>
|
>
|
||||||
> ```ini
|
> ```ini
|
||||||
> [model]
|
> [model]
|
||||||
> @architectures = "spacy.TextCatBOW.v2"
|
> @architectures = "spacy.TextCatBOW.v3"
|
||||||
> exclusive_classes = false
|
> exclusive_classes = false
|
||||||
|
> length = 262144
|
||||||
> ngram_size = 1
|
> ngram_size = 1
|
||||||
> no_output_layer = false
|
> no_output_layer = false
|
||||||
> nO = null
|
> nO = null
|
||||||
|
@ -1078,14 +1080,19 @@ the others, but may not be as accurate, especially if texts are short.
|
||||||
| `exclusive_classes` | Whether or not categories are mutually exclusive. ~~bool~~ |
|
| `exclusive_classes` | Whether or not categories are mutually exclusive. ~~bool~~ |
|
||||||
| `ngram_size` | Determines the maximum length of the n-grams in the BOW model. For instance, `ngram_size=3` would give unigram, bigram and trigram features. ~~int~~ |
|
| `ngram_size` | Determines the maximum length of the n-grams in the BOW model. For instance, `ngram_size=3` would give unigram, bigram and trigram features. ~~int~~ |
|
||||||
| `no_output_layer` | Whether or not to add an output layer to the model (`Softmax` activation if `exclusive_classes` is `True`, else `Logistic`). ~~bool~~ |
|
| `no_output_layer` | Whether or not to add an output layer to the model (`Softmax` activation if `exclusive_classes` is `True`, else `Logistic`). ~~bool~~ |
|
||||||
|
| `length` | The size of the weights vector. The length will be rounded up to the next power of two if it is not a power of two. Defaults to `262144`. ~~int~~ |
|
||||||
| `nO` | Output dimension, determined by the number of different labels. If not set, the [`TextCategorizer`](/api/textcategorizer) component will set it when `initialize` is called. ~~Optional[int]~~ |
|
| `nO` | Output dimension, determined by the number of different labels. If not set, the [`TextCategorizer`](/api/textcategorizer) component will set it when `initialize` is called. ~~Optional[int]~~ |
|
||||||
| **CREATES** | The model using the architecture. ~~Model[List[Doc], Floats2d]~~ |
|
| **CREATES** | The model using the architecture. ~~Model[List[Doc], Floats2d]~~ |
|
||||||
|
|
||||||
<Accordion title="spacy.TextCatBOW.v1 definition" spaced>
|
<Accordion title="Previous versions of spacy.TextCatBOW" spaced>
|
||||||
|
|
||||||
[TextCatBOW.v1](/api/legacy#TextCatBOW_v1) had the exact same signature, but was
|
- [TextCatBOW.v1](/api/legacy#TextCatBOW_v1) was not yet resizable. Since v2,
|
||||||
not yet resizable. Since v2, new labels can be added to this component, even
|
new labels can be added to this component, even after training.
|
||||||
after training.
|
- [TextCatBOW.v1](/api/legacy#TextCatBOW_v1) and
|
||||||
|
[TextCatBOW.v2](/api/legacy#TextCatBOW_v2) used an erroneous sparse linear
|
||||||
|
layer that only used a small number of the allocated parameters.
|
||||||
|
- [TextCatBOW.v1](/api/legacy#TextCatBOW_v1) and
|
||||||
|
[TextCatBOW.v2](/api/legacy#TextCatBOW_v2) did not have the `length` argument.
|
||||||
|
|
||||||
</Accordion>
|
</Accordion>
|
||||||
|
|
||||||
|
|
|
@ -198,7 +198,9 @@ architecture is usually less accurate than the ensemble, but runs faster.
|
||||||
|
|
||||||
Since `spacy.TextCatBOW.v2`, this architecture has become resizable, which means
|
Since `spacy.TextCatBOW.v2`, this architecture has become resizable, which means
|
||||||
that you can add labels to a previously trained textcat. `TextCatBOW` v1 did not
|
that you can add labels to a previously trained textcat. `TextCatBOW` v1 did not
|
||||||
yet support that.
|
yet support that. Versions of this model before `spacy.TextCatBOW.v3` used an
|
||||||
|
erroneous sparse linear layer that only used a small number of the allocated
|
||||||
|
parameters.
|
||||||
|
|
||||||
> #### Example Config
|
> #### Example Config
|
||||||
>
|
>
|
||||||
|
@ -222,6 +224,33 @@ the others, but may not be as accurate, especially if texts are short.
|
||||||
| `nO` | Output dimension, determined by the number of different labels. If not set, the [`TextCategorizer`](/api/textcategorizer) component will set it when `initialize` is called. ~~Optional[int]~~ |
|
| `nO` | Output dimension, determined by the number of different labels. If not set, the [`TextCategorizer`](/api/textcategorizer) component will set it when `initialize` is called. ~~Optional[int]~~ |
|
||||||
| **CREATES** | The model using the architecture. ~~Model[List[Doc], Floats2d]~~ |
|
| **CREATES** | The model using the architecture. ~~Model[List[Doc], Floats2d]~~ |
|
||||||
|
|
||||||
|
### spacy.TextCatBOW.v2 {id="TextCatBOW_v2"}
|
||||||
|
|
||||||
|
Versions of this model before `spacy.TextCatBOW.v3` used an erroneous sparse
|
||||||
|
linear layer that only used a small number of the allocated parameters.
|
||||||
|
|
||||||
|
> #### Example Config
|
||||||
|
>
|
||||||
|
> ```ini
|
||||||
|
> [model]
|
||||||
|
> @architectures = "spacy.TextCatBOW.v2"
|
||||||
|
> exclusive_classes = false
|
||||||
|
> ngram_size = 1
|
||||||
|
> no_output_layer = false
|
||||||
|
> nO = null
|
||||||
|
> ```
|
||||||
|
|
||||||
|
An n-gram "bag-of-words" model. This architecture should run much faster than
|
||||||
|
the others, but may not be as accurate, especially if texts are short.
|
||||||
|
|
||||||
|
| Name | Description |
|
||||||
|
| ------------------- | ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
|
||||||
|
| `exclusive_classes` | Whether or not categories are mutually exclusive. ~~bool~~ |
|
||||||
|
| `ngram_size` | Determines the maximum length of the n-grams in the BOW model. For instance, `ngram_size=3` would give unigram, bigram and trigram features. ~~int~~ |
|
||||||
|
| `no_output_layer` | Whether or not to add an output layer to the model (`Softmax` activation if `exclusive_classes` is `True`, else `Logistic`). ~~bool~~ |
|
||||||
|
| `nO` | Output dimension, determined by the number of different labels. If not set, the [`TextCategorizer`](/api/textcategorizer) component will set it when `initialize` is called. ~~Optional[int]~~ |
|
||||||
|
| **CREATES** | The model using the architecture. ~~Model[List[Doc], Floats2d]~~ |
|
||||||
|
|
||||||
### spacy.TransitionBasedParser.v1 {id="TransitionBasedParser_v1"}
|
### spacy.TransitionBasedParser.v1 {id="TransitionBasedParser_v1"}
|
||||||
|
|
||||||
Identical to
|
Identical to
|
||||||
|
|
|
@ -153,8 +153,9 @@ maxout_pieces = 3
|
||||||
depth = 2
|
depth = 2
|
||||||
|
|
||||||
[components.textcat.model.linear_model]
|
[components.textcat.model.linear_model]
|
||||||
@architectures = "spacy.TextCatBOW.v2"
|
@architectures = "spacy.TextCatBOW.v3"
|
||||||
exclusive_classes = true
|
exclusive_classes = true
|
||||||
|
length = 262144
|
||||||
ngram_size = 1
|
ngram_size = 1
|
||||||
no_output_layer = false
|
no_output_layer = false
|
||||||
```
|
```
|
||||||
|
@ -170,8 +171,9 @@ factory = "textcat"
|
||||||
labels = []
|
labels = []
|
||||||
|
|
||||||
[components.textcat.model]
|
[components.textcat.model]
|
||||||
@architectures = "spacy.TextCatBOW.v2"
|
@architectures = "spacy.TextCatBOW.v3"
|
||||||
exclusive_classes = true
|
exclusive_classes = true
|
||||||
|
length = 262144
|
||||||
ngram_size = 1
|
ngram_size = 1
|
||||||
no_output_layer = false
|
no_output_layer = false
|
||||||
nO = null
|
nO = null
|
||||||
|
|
|
@ -1328,8 +1328,9 @@ labels = []
|
||||||
# This function is created and then passed to the "textcat" component as
|
# This function is created and then passed to the "textcat" component as
|
||||||
# the argument "model"
|
# the argument "model"
|
||||||
[components.textcat.model]
|
[components.textcat.model]
|
||||||
@architectures = "spacy.TextCatBOW.v2"
|
@architectures = "spacy.TextCatBOW.v3"
|
||||||
exclusive_classes = true
|
exclusive_classes = true
|
||||||
|
length = 262144
|
||||||
ngram_size = 1
|
ngram_size = 1
|
||||||
no_output_layer = false
|
no_output_layer = false
|
||||||
|
|
||||||
|
|
Loading…
Reference in New Issue
Block a user