From d865f9b223ee72de53f9bcdac3915fd0df2155a2 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Danie=CC=88l=20de=20Kok?= Date: Mon, 27 Nov 2023 16:15:33 +0100 Subject: [PATCH] Replace TextCatBOW `length_exponent` parameter with `length` We now round up the length to the next power of two if it isn't a power of two. --- spacy/cli/templates/quickstart_training.jinja | 10 ++++----- spacy/errors.py | 1 + spacy/ml/models/textcat.py | 10 +++++++-- spacy/pipeline/textcat.py | 4 ++-- spacy/pipeline/textcat_multilabel.py | 2 +- spacy/tests/test_misc.py | 2 +- website/docs/api/architectures.mdx | 21 +++++++++---------- website/docs/usage/layers-architectures.mdx | 4 ++-- website/docs/usage/processing-pipelines.mdx | 2 +- 9 files changed, 31 insertions(+), 25 deletions(-) diff --git a/spacy/cli/templates/quickstart_training.jinja b/spacy/cli/templates/quickstart_training.jinja index c8d09f6dc..2817147f3 100644 --- a/spacy/cli/templates/quickstart_training.jinja +++ b/spacy/cli/templates/quickstart_training.jinja @@ -273,7 +273,7 @@ grad_factor = 1.0 [components.textcat.model.linear_model] @architectures = "spacy.TextCatBOW.v3" exclusive_classes = true -length_exponent = 18 +length = 262144 ngram_size = 1 no_output_layer = false @@ -311,7 +311,7 @@ grad_factor = 1.0 [components.textcat_multilabel.model.linear_model] @architectures = "spacy.TextCatBOW.v3" exclusive_classes = false -length_exponent = 18 +length = 262144 ngram_size = 1 no_output_layer = false @@ -546,7 +546,7 @@ width = ${components.tok2vec.model.encode.width} [components.textcat.model.linear_model] @architectures = "spacy.TextCatBOW.v3" exclusive_classes = true -length_exponent = 18 +length = 262144 ngram_size = 1 no_output_layer = false @@ -575,7 +575,7 @@ width = ${components.tok2vec.model.encode.width} [components.textcat_multilabel.model.linear_model] @architectures = "spacy.TextCatBOW.v3" exclusive_classes = false -length_exponent = 18 +length = 262144 ngram_size = 1 no_output_layer = false @@ -583,7 +583,7 @@ 
no_output_layer = false [components.textcat_multilabel.model] @architectures = "spacy.TextCatBOW.v3" exclusive_classes = false -length_exponent = 18 +length = 262144 ngram_size = 1 no_output_layer = false {%- endif %} diff --git a/spacy/errors.py b/spacy/errors.py index 8b290da6d..093c65f3d 100644 --- a/spacy/errors.py +++ b/spacy/errors.py @@ -983,6 +983,7 @@ class Errors(metaclass=ErrorsWithCodes): "predicted docs when training {component}.") E1055 = ("The 'replace_listener' callback expects {num_params} parameters, " "but only callbacks with one or three parameters are supported") + E1056 = ("The `TextCatBOW` architecture expects a length of at least 1, was {length}.") # Deprecated model shortcuts, only used in errors and warnings diff --git a/spacy/ml/models/textcat.py b/spacy/ml/models/textcat.py index be88b568b..205fb82df 100644 --- a/spacy/ml/models/textcat.py +++ b/spacy/ml/models/textcat.py @@ -111,15 +111,21 @@ def build_bow_text_classifier_v3( exclusive_classes: bool, ngram_size: int, no_output_layer: bool, - length_exponent: int = 18, + length: int = 262144, nO: Optional[int] = None, ) -> Model[List[Doc], Floats2d]: + if length < 1: + raise ValueError(Errors.E1056.format(length=length)) + + # Find k such that 2**(k-1) < length <= 2**k. 
+ length = 2 ** (length - 1).bit_length() + return _build_bow_text_classifier( exclusive_classes=exclusive_classes, ngram_size=ngram_size, no_output_layer=no_output_layer, nO=nO, - sparse_linear=SparseLinear_v2(nO=nO, length=2**length_exponent), + sparse_linear=SparseLinear_v2(nO=nO, length=length), ) diff --git a/spacy/pipeline/textcat.py b/spacy/pipeline/textcat.py index 9db92f9f0..43a335c4a 100644 --- a/spacy/pipeline/textcat.py +++ b/spacy/pipeline/textcat.py @@ -38,7 +38,7 @@ depth = 2 [model.linear_model] @architectures = "spacy.TextCatBOW.v3" exclusive_classes = true -length_exponent = 18 +length = 262144 ngram_size = 1 no_output_layer = false """ @@ -48,7 +48,7 @@ single_label_bow_config = """ [model] @architectures = "spacy.TextCatBOW.v3" exclusive_classes = true -length_exponent = 18 +length = 262144 ngram_size = 1 no_output_layer = false """ diff --git a/spacy/pipeline/textcat_multilabel.py b/spacy/pipeline/textcat_multilabel.py index d90cd836e..c917cc610 100644 --- a/spacy/pipeline/textcat_multilabel.py +++ b/spacy/pipeline/textcat_multilabel.py @@ -37,7 +37,7 @@ depth = 2 [model.linear_model] @architectures = "spacy.TextCatBOW.v3" exclusive_classes = false -length_exponent = 18 +length = 262144 ngram_size = 1 no_output_layer = false """ diff --git a/spacy/tests/test_misc.py b/spacy/tests/test_misc.py index 3a8fc2139..b1b4faa88 100644 --- a/spacy/tests/test_misc.py +++ b/spacy/tests/test_misc.py @@ -378,7 +378,7 @@ def test_util_dot_section(): [components.textcat.model] @architectures = "spacy.TextCatBOW.v3" exclusive_classes = true - length_exponent = 18 + length = 262144 ngram_size = 1 no_output_layer = false """ diff --git a/website/docs/api/architectures.mdx b/website/docs/api/architectures.mdx index 0cf0d0753..9d8b3ddfa 100644 --- a/website/docs/api/architectures.mdx +++ b/website/docs/api/architectures.mdx @@ -964,7 +964,7 @@ single-label use-cases where `exclusive_classes = true`, while the > [model.linear_model] > @architectures = 
"spacy.TextCatBOW.v3" > exclusive_classes = true -> length_exponent = 18 +> length = 262144 > ngram_size = 1 > no_output_layer = false > @@ -1066,7 +1066,7 @@ after training. > [model] > @architectures = "spacy.TextCatBOW.v3" > exclusive_classes = false -> length_exponent = 18 +> length = 262144 > ngram_size = 1 > no_output_layer = false > nO = null @@ -1080,20 +1080,19 @@ the others, but may not be as accurate, especially if texts are short. | `exclusive_classes` | Whether or not categories are mutually exclusive. ~~bool~~ | | `ngram_size` | Determines the maximum length of the n-grams in the BOW model. For instance, `ngram_size=3` would give unigram, trigram and bigram features. ~~int~~ | | `no_output_layer` | Whether or not to add an output layer to the model (`Softmax` activation if `exclusive_classes` is `True`, else `Logistic`). ~~bool~~ | -| `length_exponent` | The size of the weights vector. The sizes is set to `2**length_exponent`. Defaults to `18`. ~~int~~ | +| `length` | The size of the weights vector. The length will be rounded up to the next power of two if it is not a power of two. Defaults to `262144`. ~~int~~ | | `nO` | Output dimension, determined by the number of different labels. If not set, the [`TextCategorizer`](/api/textcategorizer) component will set it when `initialize` is called. ~~Optional[int]~~ | | **CREATES** | The model using the architecture. ~~Model[List[Doc], Floats2d]~~ | -* [TextCatBOW.v1](/api/legacy#TextCatBOW_v1) was not yet resizable. Since v2, new - labels can be added to this component, even after training. -* [TextCatBOW.v1](/api/legacy#TextCatBOW_v1) and - [TextCatBOW.v2](/api/legacy#TextCatBOW_v2) used an erroneous sparse linear layer - that only used a small number of the allocated parameters. -* [TextCatBOW.v1](/api/legacy#TextCatBOW_v1) and - [TextCatBOW.v2](/api/legacy#TextCatBOW_v2) did not have the `length_exponent` - argument. +- [TextCatBOW.v1](/api/legacy#TextCatBOW_v1) was not yet resizable. 
Since v2, + new labels can be added to this component, even after training. +- [TextCatBOW.v1](/api/legacy#TextCatBOW_v1) and + [TextCatBOW.v2](/api/legacy#TextCatBOW_v2) used an erroneous sparse linear + layer that only used a small number of the allocated parameters. +- [TextCatBOW.v1](/api/legacy#TextCatBOW_v1) and + [TextCatBOW.v2](/api/legacy#TextCatBOW_v2) did not have the `length` argument. diff --git a/website/docs/usage/layers-architectures.mdx b/website/docs/usage/layers-architectures.mdx index d308b4bdb..03b85f5af 100644 --- a/website/docs/usage/layers-architectures.mdx +++ b/website/docs/usage/layers-architectures.mdx @@ -155,7 +155,7 @@ depth = 2 [components.textcat.model.linear_model] @architectures = "spacy.TextCatBOW.v3" exclusive_classes = true -length_exponent = 18 +length = 262144 ngram_size = 1 no_output_layer = false ``` @@ -173,7 +173,7 @@ labels = [] [components.textcat.model] @architectures = "spacy.TextCatBOW.v3" exclusive_classes = true -length_exponent = 18 +length = 262144 ngram_size = 1 no_output_layer = false nO = null diff --git a/website/docs/usage/processing-pipelines.mdx b/website/docs/usage/processing-pipelines.mdx index c387d9b90..3e58b251d 100644 --- a/website/docs/usage/processing-pipelines.mdx +++ b/website/docs/usage/processing-pipelines.mdx @@ -1330,7 +1330,7 @@ labels = [] [components.textcat.model] @architectures = "spacy.TextCatBOW.v3" exclusive_classes = true -length_exponent = 18 +length = 262144 ngram_size = 1 no_output_layer = false