Update TextCatBOW to use the fixed SparseLinear layer (#13149)

* Update `TextCatBOW` to use the fixed `SparseLinear` layer

A while ago, we fixed the `SparseLinear` layer to use all available
parameters: https://github.com/explosion/thinc/pull/754

This change updates `TextCatBOW` to `v3` which uses the new
`SparseLinear_v2` layer. This results in a sizeable improvement on a
text categorization task that was tested.

While at it, this `spacy.TextCatBOW.v3` also adds the `length_exponent`
option to make it possible to change the hidden size. Ideally, we'd just
have an option called `length`. But the way that `TextCatBOW` uses
hashes results in a non-uniform distribution of parameters when the
length is not a power of two.

* Replace TextCatBOW `length_exponent` parameter with `length`

We now round up the length to the next power of two if it isn't
a power of two.

* Remove some tests for TextCatBOW.v2

* Fix missing import
This commit is contained in:
Daniël de Kok 2023-11-29 09:11:54 +01:00 committed by GitHub
parent bf7c2ea99a
commit da7ad97519
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
13 changed files with 144 additions and 50 deletions

View File

@ -271,8 +271,9 @@ grad_factor = 1.0
@layers = "reduce_mean.v1" @layers = "reduce_mean.v1"
[components.textcat.model.linear_model] [components.textcat.model.linear_model]
@architectures = "spacy.TextCatBOW.v2" @architectures = "spacy.TextCatBOW.v3"
exclusive_classes = true exclusive_classes = true
length = 262144
ngram_size = 1 ngram_size = 1
no_output_layer = false no_output_layer = false
@ -308,8 +309,9 @@ grad_factor = 1.0
@layers = "reduce_mean.v1" @layers = "reduce_mean.v1"
[components.textcat_multilabel.model.linear_model] [components.textcat_multilabel.model.linear_model]
@architectures = "spacy.TextCatBOW.v2" @architectures = "spacy.TextCatBOW.v3"
exclusive_classes = false exclusive_classes = false
length = 262144
ngram_size = 1 ngram_size = 1
no_output_layer = false no_output_layer = false
@ -542,14 +544,15 @@ nO = null
width = ${components.tok2vec.model.encode.width} width = ${components.tok2vec.model.encode.width}
[components.textcat.model.linear_model] [components.textcat.model.linear_model]
@architectures = "spacy.TextCatBOW.v2" @architectures = "spacy.TextCatBOW.v3"
exclusive_classes = true exclusive_classes = true
length = 262144
ngram_size = 1 ngram_size = 1
no_output_layer = false no_output_layer = false
{% else -%} {% else -%}
[components.textcat.model] [components.textcat.model]
@architectures = "spacy.TextCatBOW.v2" @architectures = "spacy.TextCatBOW.v3"
exclusive_classes = true exclusive_classes = true
ngram_size = 1 ngram_size = 1
no_output_layer = false no_output_layer = false
@ -570,15 +573,17 @@ nO = null
width = ${components.tok2vec.model.encode.width} width = ${components.tok2vec.model.encode.width}
[components.textcat_multilabel.model.linear_model] [components.textcat_multilabel.model.linear_model]
@architectures = "spacy.TextCatBOW.v2" @architectures = "spacy.TextCatBOW.v3"
exclusive_classes = false exclusive_classes = false
length = 262144
ngram_size = 1 ngram_size = 1
no_output_layer = false no_output_layer = false
{% else -%} {% else -%}
[components.textcat_multilabel.model] [components.textcat_multilabel.model]
@architectures = "spacy.TextCatBOW.v2" @architectures = "spacy.TextCatBOW.v3"
exclusive_classes = false exclusive_classes = false
length = 262144
ngram_size = 1 ngram_size = 1
no_output_layer = false no_output_layer = false
{%- endif %} {%- endif %}

View File

@ -983,6 +983,7 @@ class Errors(metaclass=ErrorsWithCodes):
"predicted docs when training {component}.") "predicted docs when training {component}.")
E1055 = ("The 'replace_listener' callback expects {num_params} parameters, " E1055 = ("The 'replace_listener' callback expects {num_params} parameters, "
"but only callbacks with one or three parameters are supported") "but only callbacks with one or three parameters are supported")
E1056 = ("The `TextCatBOW` architecture expects a length of at least 1, was {length}.")
# Deprecated model shortcuts, only used in errors and warnings # Deprecated model shortcuts, only used in errors and warnings

View File

@ -1,5 +1,5 @@
from functools import partial from functools import partial
from typing import List, Optional, cast from typing import List, Optional, Tuple, cast
from thinc.api import ( from thinc.api import (
Dropout, Dropout,
@ -12,6 +12,7 @@ from thinc.api import (
Relu, Relu,
Softmax, Softmax,
SparseLinear, SparseLinear,
SparseLinear_v2,
chain, chain,
clone, clone,
concatenate, concatenate,
@ -25,9 +26,10 @@ from thinc.api import (
) )
from thinc.layers.chain import init as init_chain from thinc.layers.chain import init as init_chain
from thinc.layers.resizable import resize_linear_weighted, resize_model from thinc.layers.resizable import resize_linear_weighted, resize_model
from thinc.types import Floats2d from thinc.types import ArrayXd, Floats2d
from ...attrs import ORTH from ...attrs import ORTH
from ...errors import Errors
from ...tokens import Doc from ...tokens import Doc
from ...util import registry from ...util import registry
from ..extract_ngrams import extract_ngrams from ..extract_ngrams import extract_ngrams
@ -95,10 +97,48 @@ def build_bow_text_classifier(
ngram_size: int, ngram_size: int,
no_output_layer: bool, no_output_layer: bool,
nO: Optional[int] = None, nO: Optional[int] = None,
) -> Model[List[Doc], Floats2d]:
return _build_bow_text_classifier(
exclusive_classes=exclusive_classes,
ngram_size=ngram_size,
no_output_layer=no_output_layer,
nO=nO,
sparse_linear=SparseLinear(nO=nO),
)
@registry.architectures("spacy.TextCatBOW.v3")
def build_bow_text_classifier_v3(
    exclusive_classes: bool,
    ngram_size: int,
    no_output_layer: bool,
    length: int = 262144,
    nO: Optional[int] = None,
) -> Model[List[Doc], Floats2d]:
    """Build the `spacy.TextCatBOW.v3` bag-of-words text classifier.

    This variant passes a `SparseLinear_v2` layer to the shared builder and
    exposes the size of that layer through `length`.

    exclusive_classes (bool): Forwarded unchanged to the shared builder.
    ngram_size (int): Forwarded unchanged to the shared builder.
    no_output_layer (bool): Forwarded unchanged to the shared builder.
    length (int): Requested length for the sparse linear layer. Values that
        are not a power of two are rounded up to the next power of two.
    nO (Optional[int]): Output dimension, passed to both the sparse linear
        layer and the shared builder.
    RETURNS (Model[List[Doc], Floats2d]): The model created by the shared
        builder.
    RAISES (ValueError): E1056 if `length` is smaller than 1.
    """
    if length < 1:
        message = Errors.E1056.format(length=length)
        raise ValueError(message)

    # Smallest exponent k with length <= 2**k. An exact power of two maps to
    # itself, everything else is rounded up.
    exponent = (length - 1).bit_length()
    rounded_length = 2**exponent

    linear_layer = SparseLinear_v2(nO=nO, length=rounded_length)
    return _build_bow_text_classifier(
        exclusive_classes=exclusive_classes,
        ngram_size=ngram_size,
        no_output_layer=no_output_layer,
        nO=nO,
        sparse_linear=linear_layer,
    )
def _build_bow_text_classifier(
exclusive_classes: bool,
ngram_size: int,
no_output_layer: bool,
sparse_linear: Model[Tuple[ArrayXd, ArrayXd, ArrayXd], ArrayXd],
nO: Optional[int] = None,
) -> Model[List[Doc], Floats2d]: ) -> Model[List[Doc], Floats2d]:
fill_defaults = {"b": 0, "W": 0} fill_defaults = {"b": 0, "W": 0}
with Model.define_operators({">>": chain}): with Model.define_operators({">>": chain}):
sparse_linear = SparseLinear(nO=nO)
output_layer = None output_layer = None
if not no_output_layer: if not no_output_layer:
fill_defaults["b"] = NEG_VALUE fill_defaults["b"] = NEG_VALUE

View File

@ -36,8 +36,9 @@ maxout_pieces = 3
depth = 2 depth = 2
[model.linear_model] [model.linear_model]
@architectures = "spacy.TextCatBOW.v2" @architectures = "spacy.TextCatBOW.v3"
exclusive_classes = true exclusive_classes = true
length = 262144
ngram_size = 1 ngram_size = 1
no_output_layer = false no_output_layer = false
""" """
@ -45,8 +46,9 @@ DEFAULT_SINGLE_TEXTCAT_MODEL = Config().from_str(single_label_default_config)["m
single_label_bow_config = """ single_label_bow_config = """
[model] [model]
@architectures = "spacy.TextCatBOW.v2" @architectures = "spacy.TextCatBOW.v3"
exclusive_classes = true exclusive_classes = true
length = 262144
ngram_size = 1 ngram_size = 1
no_output_layer = false no_output_layer = false
""" """

View File

@ -35,8 +35,9 @@ maxout_pieces = 3
depth = 2 depth = 2
[model.linear_model] [model.linear_model]
@architectures = "spacy.TextCatBOW.v2" @architectures = "spacy.TextCatBOW.v3"
exclusive_classes = false exclusive_classes = false
length = 262144
ngram_size = 1 ngram_size = 1
no_output_layer = false no_output_layer = false
""" """
@ -44,7 +45,7 @@ DEFAULT_MULTI_TEXTCAT_MODEL = Config().from_str(multi_label_default_config)["mod
multi_label_bow_config = """ multi_label_bow_config = """
[model] [model]
@architectures = "spacy.TextCatBOW.v2" @architectures = "spacy.TextCatBOW.v3"
exclusive_classes = false exclusive_classes = false
ngram_size = 1 ngram_size = 1
no_output_layer = false no_output_layer = false

View File

@ -203,7 +203,7 @@ def test_pipe_class_component_model():
"@architectures": "spacy.TextCatEnsemble.v2", "@architectures": "spacy.TextCatEnsemble.v2",
"tok2vec": DEFAULT_TOK2VEC_MODEL, "tok2vec": DEFAULT_TOK2VEC_MODEL,
"linear_model": { "linear_model": {
"@architectures": "spacy.TextCatBOW.v2", "@architectures": "spacy.TextCatBOW.v3",
"exclusive_classes": False, "exclusive_classes": False,
"ngram_size": 1, "ngram_size": 1,
"no_output_layer": False, "no_output_layer": False,

View File

@ -414,7 +414,7 @@ def test_implicit_label(name, get_examples):
@pytest.mark.parametrize( @pytest.mark.parametrize(
"name,textcat_config", "name,textcat_config",
[ [
# BOW # BOW V1
("textcat", {"@architectures": "spacy.TextCatBOW.v1", "exclusive_classes": True, "no_output_layer": False, "ngram_size": 3}), ("textcat", {"@architectures": "spacy.TextCatBOW.v1", "exclusive_classes": True, "no_output_layer": False, "ngram_size": 3}),
("textcat", {"@architectures": "spacy.TextCatBOW.v1", "exclusive_classes": True, "no_output_layer": True, "ngram_size": 3}), ("textcat", {"@architectures": "spacy.TextCatBOW.v1", "exclusive_classes": True, "no_output_layer": True, "ngram_size": 3}),
("textcat_multilabel", {"@architectures": "spacy.TextCatBOW.v1", "exclusive_classes": False, "no_output_layer": False, "ngram_size": 3}), ("textcat_multilabel", {"@architectures": "spacy.TextCatBOW.v1", "exclusive_classes": False, "no_output_layer": False, "ngram_size": 3}),
@ -451,11 +451,11 @@ def test_no_resize(name, textcat_config):
@pytest.mark.parametrize( @pytest.mark.parametrize(
"name,textcat_config", "name,textcat_config",
[ [
# BOW # BOW V3
("textcat", {"@architectures": "spacy.TextCatBOW.v2", "exclusive_classes": True, "no_output_layer": False, "ngram_size": 3}), ("textcat", {"@architectures": "spacy.TextCatBOW.v3", "exclusive_classes": True, "no_output_layer": False, "ngram_size": 3}),
("textcat", {"@architectures": "spacy.TextCatBOW.v2", "exclusive_classes": True, "no_output_layer": True, "ngram_size": 3}), ("textcat", {"@architectures": "spacy.TextCatBOW.v3", "exclusive_classes": True, "no_output_layer": True, "ngram_size": 3}),
("textcat_multilabel", {"@architectures": "spacy.TextCatBOW.v2", "exclusive_classes": False, "no_output_layer": False, "ngram_size": 3}), ("textcat_multilabel", {"@architectures": "spacy.TextCatBOW.v3", "exclusive_classes": False, "no_output_layer": False, "ngram_size": 3}),
("textcat_multilabel", {"@architectures": "spacy.TextCatBOW.v2", "exclusive_classes": False, "no_output_layer": True, "ngram_size": 3}), ("textcat_multilabel", {"@architectures": "spacy.TextCatBOW.v3", "exclusive_classes": False, "no_output_layer": True, "ngram_size": 3}),
# CNN # CNN
("textcat", {"@architectures": "spacy.TextCatCNN.v2", "tok2vec": DEFAULT_TOK2VEC_MODEL, "exclusive_classes": True}), ("textcat", {"@architectures": "spacy.TextCatCNN.v2", "tok2vec": DEFAULT_TOK2VEC_MODEL, "exclusive_classes": True}),
("textcat_multilabel", {"@architectures": "spacy.TextCatCNN.v2", "tok2vec": DEFAULT_TOK2VEC_MODEL, "exclusive_classes": False}), ("textcat_multilabel", {"@architectures": "spacy.TextCatCNN.v2", "tok2vec": DEFAULT_TOK2VEC_MODEL, "exclusive_classes": False}),
@ -480,11 +480,11 @@ def test_resize(name, textcat_config):
@pytest.mark.parametrize( @pytest.mark.parametrize(
"name,textcat_config", "name,textcat_config",
[ [
# BOW # BOW v3
("textcat", {"@architectures": "spacy.TextCatBOW.v2", "exclusive_classes": True, "no_output_layer": False, "ngram_size": 3}), ("textcat", {"@architectures": "spacy.TextCatBOW.v3", "exclusive_classes": True, "no_output_layer": False, "ngram_size": 3}),
("textcat", {"@architectures": "spacy.TextCatBOW.v2", "exclusive_classes": True, "no_output_layer": True, "ngram_size": 3}), ("textcat", {"@architectures": "spacy.TextCatBOW.v3", "exclusive_classes": True, "no_output_layer": True, "ngram_size": 3}),
("textcat_multilabel", {"@architectures": "spacy.TextCatBOW.v2", "exclusive_classes": False, "no_output_layer": False, "ngram_size": 3}), ("textcat_multilabel", {"@architectures": "spacy.TextCatBOW.v3", "exclusive_classes": False, "no_output_layer": False, "ngram_size": 3}),
("textcat_multilabel", {"@architectures": "spacy.TextCatBOW.v2", "exclusive_classes": False, "no_output_layer": True, "ngram_size": 3}), ("textcat_multilabel", {"@architectures": "spacy.TextCatBOW.v3", "exclusive_classes": False, "no_output_layer": True, "ngram_size": 3}),
# CNN # CNN
("textcat", {"@architectures": "spacy.TextCatCNN.v2", "tok2vec": DEFAULT_TOK2VEC_MODEL, "exclusive_classes": True}), ("textcat", {"@architectures": "spacy.TextCatCNN.v2", "tok2vec": DEFAULT_TOK2VEC_MODEL, "exclusive_classes": True}),
("textcat_multilabel", {"@architectures": "spacy.TextCatCNN.v2", "tok2vec": DEFAULT_TOK2VEC_MODEL, "exclusive_classes": False}), ("textcat_multilabel", {"@architectures": "spacy.TextCatCNN.v2", "tok2vec": DEFAULT_TOK2VEC_MODEL, "exclusive_classes": False}),
@ -693,9 +693,14 @@ def test_overfitting_IO_multi():
("textcat", TRAIN_DATA_SINGLE_LABEL, {"@architectures": "spacy.TextCatBOW.v2", "exclusive_classes": True, "ngram_size": 4, "no_output_layer": False}), ("textcat", TRAIN_DATA_SINGLE_LABEL, {"@architectures": "spacy.TextCatBOW.v2", "exclusive_classes": True, "ngram_size": 4, "no_output_layer": False}),
("textcat_multilabel", TRAIN_DATA_MULTI_LABEL, {"@architectures": "spacy.TextCatBOW.v2", "exclusive_classes": False, "ngram_size": 3, "no_output_layer": True}), ("textcat_multilabel", TRAIN_DATA_MULTI_LABEL, {"@architectures": "spacy.TextCatBOW.v2", "exclusive_classes": False, "ngram_size": 3, "no_output_layer": True}),
("textcat", TRAIN_DATA_SINGLE_LABEL, {"@architectures": "spacy.TextCatBOW.v2", "exclusive_classes": True, "ngram_size": 2, "no_output_layer": True}), ("textcat", TRAIN_DATA_SINGLE_LABEL, {"@architectures": "spacy.TextCatBOW.v2", "exclusive_classes": True, "ngram_size": 2, "no_output_layer": True}),
# BOW V3
("textcat_multilabel", TRAIN_DATA_MULTI_LABEL, {"@architectures": "spacy.TextCatBOW.v3", "exclusive_classes": False, "ngram_size": 1, "no_output_layer": False}),
("textcat", TRAIN_DATA_SINGLE_LABEL, {"@architectures": "spacy.TextCatBOW.v3", "exclusive_classes": True, "ngram_size": 4, "no_output_layer": False}),
("textcat_multilabel", TRAIN_DATA_MULTI_LABEL, {"@architectures": "spacy.TextCatBOW.v3", "exclusive_classes": False, "ngram_size": 3, "no_output_layer": True}),
("textcat", TRAIN_DATA_SINGLE_LABEL, {"@architectures": "spacy.TextCatBOW.v3", "exclusive_classes": True, "ngram_size": 2, "no_output_layer": True}),
# ENSEMBLE V2 # ENSEMBLE V2
("textcat_multilabel", TRAIN_DATA_MULTI_LABEL, {"@architectures": "spacy.TextCatEnsemble.v2", "tok2vec": DEFAULT_TOK2VEC_MODEL, "linear_model": {"@architectures": "spacy.TextCatBOW.v2", "exclusive_classes": False, "ngram_size": 1, "no_output_layer": False}}), ("textcat_multilabel", TRAIN_DATA_MULTI_LABEL, {"@architectures": "spacy.TextCatEnsemble.v2", "tok2vec": DEFAULT_TOK2VEC_MODEL, "linear_model": {"@architectures": "spacy.TextCatBOW.v3", "exclusive_classes": False, "ngram_size": 1, "no_output_layer": False}}),
("textcat", TRAIN_DATA_SINGLE_LABEL, {"@architectures": "spacy.TextCatEnsemble.v2", "tok2vec": DEFAULT_TOK2VEC_MODEL, "linear_model": {"@architectures": "spacy.TextCatBOW.v2", "exclusive_classes": True, "ngram_size": 5, "no_output_layer": False}}), ("textcat", TRAIN_DATA_SINGLE_LABEL, {"@architectures": "spacy.TextCatEnsemble.v2", "tok2vec": DEFAULT_TOK2VEC_MODEL, "linear_model": {"@architectures": "spacy.TextCatBOW.v3", "exclusive_classes": True, "ngram_size": 5, "no_output_layer": False}}),
# CNN V2 # CNN V2
("textcat", TRAIN_DATA_SINGLE_LABEL, {"@architectures": "spacy.TextCatCNN.v2", "tok2vec": DEFAULT_TOK2VEC_MODEL, "exclusive_classes": True}), ("textcat", TRAIN_DATA_SINGLE_LABEL, {"@architectures": "spacy.TextCatCNN.v2", "tok2vec": DEFAULT_TOK2VEC_MODEL, "exclusive_classes": True}),
("textcat_multilabel", TRAIN_DATA_MULTI_LABEL, {"@architectures": "spacy.TextCatCNN.v2", "tok2vec": DEFAULT_TOK2VEC_MODEL, "exclusive_classes": False}), ("textcat_multilabel", TRAIN_DATA_MULTI_LABEL, {"@architectures": "spacy.TextCatCNN.v2", "tok2vec": DEFAULT_TOK2VEC_MODEL, "exclusive_classes": False}),

View File

@ -238,7 +238,7 @@ def test_project_push_pull(project_dir):
def test_find_function_valid(): def test_find_function_valid():
# example of architecture in main code base # example of architecture in main code base
function = "spacy.TextCatBOW.v2" function = "spacy.TextCatBOW.v3"
result = CliRunner().invoke(app, ["find-function", function, "-r", "architectures"]) result = CliRunner().invoke(app, ["find-function", function, "-r", "architectures"])
assert f"Found registered function '{function}'" in result.stdout assert f"Found registered function '{function}'" in result.stdout
assert "textcat.py" in result.stdout assert "textcat.py" in result.stdout
@ -257,7 +257,7 @@ def test_find_function_valid():
def test_find_function_invalid(): def test_find_function_invalid():
# invalid registry # invalid registry
function = "spacy.TextCatBOW.v2" function = "spacy.TextCatBOW.v3"
registry = "foobar" registry = "foobar"
result = CliRunner().invoke( result = CliRunner().invoke(
app, ["find-function", function, "--registry", registry] app, ["find-function", function, "--registry", registry]

View File

@ -376,8 +376,9 @@ def test_util_dot_section():
factory = "textcat" factory = "textcat"
[components.textcat.model] [components.textcat.model]
@architectures = "spacy.TextCatBOW.v2" @architectures = "spacy.TextCatBOW.v3"
exclusive_classes = true exclusive_classes = true
length = 262144
ngram_size = 1 ngram_size = 1
no_output_layer = false no_output_layer = false
""" """

View File

@ -78,16 +78,16 @@ subword features, and a
[MaxoutWindowEncoder](/api/architectures#MaxoutWindowEncoder) encoding layer [MaxoutWindowEncoder](/api/architectures#MaxoutWindowEncoder) encoding layer
consisting of a CNN and a layer-normalized maxout activation function. consisting of a CNN and a layer-normalized maxout activation function.
| Name | Description | | Name | Description |
| -------------------- | ----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | | -------------------- | --------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
| `width` | The width of the input and output. These are required to be the same, so that residual connections can be used. Recommended values are `96`, `128` or `300`. ~~int~~ | | `width` | The width of the input and output. These are required to be the same, so that residual connections can be used. Recommended values are `96`, `128` or `300`. ~~int~~ |
| `depth` | The number of convolutional layers to use. Recommended values are between `2` and `8`. ~~int~~ | | `depth` | The number of convolutional layers to use. Recommended values are between `2` and `8`. ~~int~~ |
| `embed_size` | The number of rows in the hash embedding tables. This can be surprisingly small, due to the use of the hash embeddings. Recommended values are between `2000` and `10000`. ~~int~~ | | `embed_size` | The number of rows in the hash embedding tables. This can be surprisingly small, due to the use of the hash embeddings. Recommended values are between `2000` and `10000`. ~~int~~ |
| `window_size` | The number of tokens on either side to concatenate during the convolutions. The receptive field of the CNN will be `depth * window_size * 2 + 1`, so a 4-layer network with a window size of `2` will be sensitive to 17 words at a time. Recommended value is `1`. ~~int~~ | | `window_size` | The number of tokens on either side to concatenate during the convolutions. The receptive field of the CNN will be `depth * window_size * 2 + 1`, so a 4-layer network with a window size of `2` will be sensitive to 17 words at a time. Recommended value is `1`. ~~int~~ |
| `maxout_pieces` | The number of pieces to use in the maxout non-linearity. If `1`, the [`Mish`](https://thinc.ai/docs/api-layers#mish) non-linearity is used instead. Recommended values are `1`-`3`. ~~int~~ | | `maxout_pieces` | The number of pieces to use in the maxout non-linearity. If `1`, the [`Mish`](https://thinc.ai/docs/api-layers#mish) non-linearity is used instead. Recommended values are `1`-`3`. ~~int~~ |
| `subword_features` | Whether to also embed subword features, specifically the prefix, suffix and word shape. This is recommended for alphabetic languages like English, but not if single-character tokens are used for a language such as Chinese. ~~bool~~ | | `subword_features` | Whether to also embed subword features, specifically the prefix, suffix and word shape. This is recommended for alphabetic languages like English, but not if single-character tokens are used for a language such as Chinese. ~~bool~~ |
| `pretrained_vectors` | Whether to also use static vectors. ~~bool~~ | | `pretrained_vectors` | Whether to also use static vectors. ~~bool~~ |
| **CREATES** | The model using the architecture. ~~Model[List[Doc], List[Floats2d]]~~ | | **CREATES** | The model using the architecture. ~~Model[List[Doc], List[Floats2d]]~~ |
### spacy.Tok2VecListener.v1 {id="Tok2VecListener"} ### spacy.Tok2VecListener.v1 {id="Tok2VecListener"}
@ -962,8 +962,9 @@ single-label use-cases where `exclusive_classes = true`, while the
> nO = null > nO = null
> >
> [model.linear_model] > [model.linear_model]
> @architectures = "spacy.TextCatBOW.v2" > @architectures = "spacy.TextCatBOW.v3"
> exclusive_classes = true > exclusive_classes = true
> length = 262144
> ngram_size = 1 > ngram_size = 1
> no_output_layer = false > no_output_layer = false
> >
@ -1057,14 +1058,15 @@ after training.
</Accordion> </Accordion>
### spacy.TextCatBOW.v2 {id="TextCatBOW"} ### spacy.TextCatBOW.v3 {id="TextCatBOW"}
> #### Example Config > #### Example Config
> >
> ```ini > ```ini
> [model] > [model]
> @architectures = "spacy.TextCatBOW.v2" > @architectures = "spacy.TextCatBOW.v3"
> exclusive_classes = false > exclusive_classes = false
> length = 262144
> ngram_size = 1 > ngram_size = 1
> no_output_layer = false > no_output_layer = false
> nO = null > nO = null
@ -1078,14 +1080,19 @@ the others, but may not be as accurate, especially if texts are short.
| `exclusive_classes` | Whether or not categories are mutually exclusive. ~~bool~~ | | `exclusive_classes` | Whether or not categories are mutually exclusive. ~~bool~~ |
| `ngram_size` | Determines the maximum length of the n-grams in the BOW model. For instance, `ngram_size=3` would give unigram, bigram and trigram features. ~~int~~ | | `ngram_size` | Determines the maximum length of the n-grams in the BOW model. For instance, `ngram_size=3` would give unigram, bigram and trigram features. ~~int~~ |
| `no_output_layer` | Whether or not to add an output layer to the model (`Softmax` activation if `exclusive_classes` is `True`, else `Logistic`). ~~bool~~ | | `no_output_layer` | Whether or not to add an output layer to the model (`Softmax` activation if `exclusive_classes` is `True`, else `Logistic`). ~~bool~~ |
| `length` | The size of the weights vector. The length will be rounded up to the next power of two if it is not a power of two. Defaults to `262144`. ~~int~~ |
| `nO` | Output dimension, determined by the number of different labels. If not set, the [`TextCategorizer`](/api/textcategorizer) component will set it when `initialize` is called. ~~Optional[int]~~ | | `nO` | Output dimension, determined by the number of different labels. If not set, the [`TextCategorizer`](/api/textcategorizer) component will set it when `initialize` is called. ~~Optional[int]~~ |
| **CREATES** | The model using the architecture. ~~Model[List[Doc], Floats2d]~~ | | **CREATES** | The model using the architecture. ~~Model[List[Doc], Floats2d]~~ |
<Accordion title="spacy.TextCatBOW.v1 definition" spaced> <Accordion title="Previous versions of spacy.TextCatBOW" spaced>
[TextCatBOW.v1](/api/legacy#TextCatBOW_v1) had the exact same signature, but was - [TextCatBOW.v1](/api/legacy#TextCatBOW_v1) was not yet resizable. Since v2,
not yet resizable. Since v2, new labels can be added to this component, even new labels can be added to this component, even after training.
after training. - [TextCatBOW.v1](/api/legacy#TextCatBOW_v1) and
[TextCatBOW.v2](/api/legacy#TextCatBOW_v2) used an erroneous sparse linear
layer that only used a small number of the allocated parameters.
- [TextCatBOW.v1](/api/legacy#TextCatBOW_v1) and
[TextCatBOW.v2](/api/legacy#TextCatBOW_v2) did not have the `length` argument.
</Accordion> </Accordion>

View File

@ -198,7 +198,9 @@ architecture is usually less accurate than the ensemble, but runs faster.
Since `spacy.TextCatBOW.v2`, this architecture has become resizable, which means Since `spacy.TextCatBOW.v2`, this architecture has become resizable, which means
that you can add labels to a previously trained textcat. `TextCatBOW` v1 did not that you can add labels to a previously trained textcat. `TextCatBOW` v1 did not
yet support that. yet support that. Versions of this model before `spacy.TextCatBOW.v3` used an
erroneous sparse linear layer that only used a small number of the allocated
parameters.
> #### Example Config > #### Example Config
> >
@ -222,6 +224,33 @@ the others, but may not be as accurate, especially if texts are short.
| `nO` | Output dimension, determined by the number of different labels. If not set, the [`TextCategorizer`](/api/textcategorizer) component will set it when `initialize` is called. ~~Optional[int]~~ | | `nO` | Output dimension, determined by the number of different labels. If not set, the [`TextCategorizer`](/api/textcategorizer) component will set it when `initialize` is called. ~~Optional[int]~~ |
| **CREATES** | The model using the architecture. ~~Model[List[Doc], Floats2d]~~ | | **CREATES** | The model using the architecture. ~~Model[List[Doc], Floats2d]~~ |
### spacy.TextCatBOW.v2 {id="TextCatBOW_v2"}
Versions of this model before `spacy.TextCatBOW.v3` used an erroneous sparse
linear layer that only used a small number of the allocated parameters.
> #### Example Config
>
> ```ini
> [model]
> @architectures = "spacy.TextCatBOW.v2"
> exclusive_classes = false
> ngram_size = 1
> no_output_layer = false
> nO = null
> ```
An n-gram "bag-of-words" model. This architecture should run much faster than
the others, but may not be as accurate, especially if texts are short.
| Name | Description |
| ------------------- | ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
| `exclusive_classes` | Whether or not categories are mutually exclusive. ~~bool~~ |
| `ngram_size` | Determines the maximum length of the n-grams in the BOW model. For instance, `ngram_size=3` would give unigram, bigram and trigram features. ~~int~~ |
| `no_output_layer` | Whether or not to add an output layer to the model (`Softmax` activation if `exclusive_classes` is `True`, else `Logistic`). ~~bool~~ |
| `nO` | Output dimension, determined by the number of different labels. If not set, the [`TextCategorizer`](/api/textcategorizer) component will set it when `initialize` is called. ~~Optional[int]~~ |
| **CREATES** | The model using the architecture. ~~Model[List[Doc], Floats2d]~~ |
### spacy.TransitionBasedParser.v1 {id="TransitionBasedParser_v1"} ### spacy.TransitionBasedParser.v1 {id="TransitionBasedParser_v1"}
Identical to Identical to

View File

@ -153,8 +153,9 @@ maxout_pieces = 3
depth = 2 depth = 2
[components.textcat.model.linear_model] [components.textcat.model.linear_model]
@architectures = "spacy.TextCatBOW.v2" @architectures = "spacy.TextCatBOW.v3"
exclusive_classes = true exclusive_classes = true
length = 262144
ngram_size = 1 ngram_size = 1
no_output_layer = false no_output_layer = false
``` ```
@ -170,8 +171,9 @@ factory = "textcat"
labels = [] labels = []
[components.textcat.model] [components.textcat.model]
@architectures = "spacy.TextCatBOW.v2" @architectures = "spacy.TextCatBOW.v3"
exclusive_classes = true exclusive_classes = true
length = 262144
ngram_size = 1 ngram_size = 1
no_output_layer = false no_output_layer = false
nO = null nO = null

View File

@ -1328,8 +1328,9 @@ labels = []
# This function is created and then passed to the "textcat" component as # This function is created and then passed to the "textcat" component as
# the argument "model" # the argument "model"
[components.textcat.model] [components.textcat.model]
@architectures = "spacy.TextCatBOW.v2" @architectures = "spacy.TextCatBOW.v3"
exclusive_classes = true exclusive_classes = true
length = 262144
ngram_size = 1 ngram_size = 1
no_output_layer = false no_output_layer = false