From d865f9b223ee72de53f9bcdac3915fd0df2155a2 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Danie=CC=88l=20de=20Kok?= <me@danieldk.eu>
Date: Mon, 27 Nov 2023 16:15:33 +0100
Subject: [PATCH] Replace TextCatBOW `length_exponent` parameter by `length`

We now round up the length to the next power of two if it isn't
a power of two.
---
 spacy/cli/templates/quickstart_training.jinja | 10 ++++-----
 spacy/errors.py                               |  1 +
 spacy/ml/models/textcat.py                    | 10 +++++++--
 spacy/pipeline/textcat.py                     |  4 ++--
 spacy/pipeline/textcat_multilabel.py          |  2 +-
 spacy/tests/test_misc.py                      |  2 +-
 website/docs/api/architectures.mdx            | 21 +++++++++----------
 website/docs/usage/layers-architectures.mdx   |  4 ++--
 website/docs/usage/processing-pipelines.mdx   |  2 +-
 9 files changed, 31 insertions(+), 25 deletions(-)

diff --git a/spacy/cli/templates/quickstart_training.jinja b/spacy/cli/templates/quickstart_training.jinja
index c8d09f6dc..2817147f3 100644
--- a/spacy/cli/templates/quickstart_training.jinja
+++ b/spacy/cli/templates/quickstart_training.jinja
@@ -273,7 +273,7 @@ grad_factor = 1.0
 [components.textcat.model.linear_model]
 @architectures = "spacy.TextCatBOW.v3"
 exclusive_classes = true
-length_exponent = 18
+length = 262144
 ngram_size = 1
 no_output_layer = false
 
@@ -311,7 +311,7 @@ grad_factor = 1.0
 [components.textcat_multilabel.model.linear_model]
 @architectures = "spacy.TextCatBOW.v3"
 exclusive_classes = false
-length_exponent = 18
+length = 262144
 ngram_size = 1
 no_output_layer = false
 
@@ -546,7 +546,7 @@ width = ${components.tok2vec.model.encode.width}
 [components.textcat.model.linear_model]
 @architectures = "spacy.TextCatBOW.v3"
 exclusive_classes = true
-length_exponent = 18
+length = 262144
 ngram_size = 1
 no_output_layer = false
 
@@ -575,7 +575,7 @@ width = ${components.tok2vec.model.encode.width}
 [components.textcat_multilabel.model.linear_model]
 @architectures = "spacy.TextCatBOW.v3"
 exclusive_classes = false
-length_exponent = 18
+length = 262144
 ngram_size = 1
 no_output_layer = false
 
@@ -583,7 +583,7 @@ no_output_layer = false
 [components.textcat_multilabel.model]
 @architectures = "spacy.TextCatBOW.v3"
 exclusive_classes = false
-length_exponent = 18
+length = 262144
 ngram_size = 1
 no_output_layer = false
 {%- endif %}
diff --git a/spacy/errors.py b/spacy/errors.py
index 8b290da6d..093c65f3d 100644
--- a/spacy/errors.py
+++ b/spacy/errors.py
@@ -983,6 +983,7 @@ class Errors(metaclass=ErrorsWithCodes):
              "predicted docs when training {component}.")
     E1055 = ("The 'replace_listener' callback expects {num_params} parameters, "
              "but only callbacks with one or three parameters are supported")
+    E1056 = ("The `TextCatBOW` architecture expects a length of at least 1, was {length}.")
 
 
 # Deprecated model shortcuts, only used in errors and warnings
diff --git a/spacy/ml/models/textcat.py b/spacy/ml/models/textcat.py
index be88b568b..205fb82df 100644
--- a/spacy/ml/models/textcat.py
+++ b/spacy/ml/models/textcat.py
@@ -111,15 +111,21 @@ def build_bow_text_classifier_v3(
     exclusive_classes: bool,
     ngram_size: int,
     no_output_layer: bool,
-    length_exponent: int = 18,
+    length: int = 262144,
     nO: Optional[int] = None,
 ) -> Model[List[Doc], Floats2d]:
+    if length < 1:
+        raise ValueError(Errors.E1056.format(length=length))
+
+    # Find k such that 2**(k-1) < length <= 2**k.
+    length = 2 ** (length - 1).bit_length()
+
     return _build_bow_text_classifier(
         exclusive_classes=exclusive_classes,
         ngram_size=ngram_size,
         no_output_layer=no_output_layer,
         nO=nO,
-        sparse_linear=SparseLinear_v2(nO=nO, length=2**length_exponent),
+        sparse_linear=SparseLinear_v2(nO=nO, length=length),
     )
 
 
diff --git a/spacy/pipeline/textcat.py b/spacy/pipeline/textcat.py
index 9db92f9f0..43a335c4a 100644
--- a/spacy/pipeline/textcat.py
+++ b/spacy/pipeline/textcat.py
@@ -38,7 +38,7 @@ depth = 2
 [model.linear_model]
 @architectures = "spacy.TextCatBOW.v3"
 exclusive_classes = true
-length_exponent = 18
+length = 262144
 ngram_size = 1
 no_output_layer = false
 """
@@ -48,7 +48,7 @@ single_label_bow_config = """
 [model]
 @architectures = "spacy.TextCatBOW.v3"
 exclusive_classes = true
-length_exponent = 18
+length = 262144
 ngram_size = 1
 no_output_layer = false
 """
diff --git a/spacy/pipeline/textcat_multilabel.py b/spacy/pipeline/textcat_multilabel.py
index d90cd836e..c917cc610 100644
--- a/spacy/pipeline/textcat_multilabel.py
+++ b/spacy/pipeline/textcat_multilabel.py
@@ -37,7 +37,7 @@ depth = 2
 [model.linear_model]
 @architectures = "spacy.TextCatBOW.v3"
 exclusive_classes = false
-length_exponent = 18
+length = 262144
 ngram_size = 1
 no_output_layer = false
 """
diff --git a/spacy/tests/test_misc.py b/spacy/tests/test_misc.py
index 3a8fc2139..b1b4faa88 100644
--- a/spacy/tests/test_misc.py
+++ b/spacy/tests/test_misc.py
@@ -378,7 +378,7 @@ def test_util_dot_section():
     [components.textcat.model]
     @architectures = "spacy.TextCatBOW.v3"
     exclusive_classes = true
-    length_exponent = 18
+    length = 262144
     ngram_size = 1
     no_output_layer = false
     """
diff --git a/website/docs/api/architectures.mdx b/website/docs/api/architectures.mdx
index 0cf0d0753..9d8b3ddfa 100644
--- a/website/docs/api/architectures.mdx
+++ b/website/docs/api/architectures.mdx
@@ -964,7 +964,7 @@ single-label use-cases where `exclusive_classes = true`, while the
 > [model.linear_model]
 > @architectures = "spacy.TextCatBOW.v3"
 > exclusive_classes = true
-> length_exponent = 18
+> length = 262144
 > ngram_size = 1
 > no_output_layer = false
 >
@@ -1066,7 +1066,7 @@ after training.
 > [model]
 > @architectures = "spacy.TextCatBOW.v3"
 > exclusive_classes = false
-> length_exponent = 18
+> length = 262144
 > ngram_size = 1
 > no_output_layer = false
 > nO = null
@@ -1080,20 +1080,19 @@ the others, but may not be as accurate, especially if texts are short.
 | `exclusive_classes` | Whether or not categories are mutually exclusive. ~~bool~~                                                                                                                                     |
 | `ngram_size`        | Determines the maximum length of the n-grams in the BOW model. For instance, `ngram_size=3` would give unigram, trigram and bigram features. ~~int~~                                           |
 | `no_output_layer`   | Whether or not to add an output layer to the model (`Softmax` activation if `exclusive_classes` is `True`, else `Logistic`). ~~bool~~                                                          |
-| `length_exponent`   | The size of the weights vector. The sizes is set to `2**length_exponent`. Defaults to `18`. ~~int~~                                                                                            |
+| `length`            | The size of the weights vector. The length will be rounded up to the next power of two if it is not a power of two. Defaults to `262144`. ~~int~~                                              |
 | `nO`                | Output dimension, determined by the number of different labels. If not set, the [`TextCategorizer`](/api/textcategorizer) component will set it when `initialize` is called. ~~Optional[int]~~ |
 | **CREATES**         | The model using the architecture. ~~Model[List[Doc], Floats2d]~~                                                                                                                               |
 
 <Accordion title="Previous versions of spacy.TextCatBOW" spaced>
 
-* [TextCatBOW.v1](/api/legacy#TextCatBOW_v1) was not yet resizable. Since v2, new
-  labels can be added to this component, even after training.
-* [TextCatBOW.v1](/api/legacy#TextCatBOW_v1) and
-  [TextCatBOW.v2](/api/legacy#TextCatBOW_v2) used an erroneous sparse linear layer
-  that only used a small number of the allocated parameters.
-* [TextCatBOW.v1](/api/legacy#TextCatBOW_v1) and
-  [TextCatBOW.v2](/api/legacy#TextCatBOW_v2) did not have the `length_exponent`
-  argument.
+- [TextCatBOW.v1](/api/legacy#TextCatBOW_v1) was not yet resizable. Since v2,
+  new labels can be added to this component, even after training.
+- [TextCatBOW.v1](/api/legacy#TextCatBOW_v1) and
+  [TextCatBOW.v2](/api/legacy#TextCatBOW_v2) used an erroneous sparse linear
+  layer that only used a small number of the allocated parameters.
+- [TextCatBOW.v1](/api/legacy#TextCatBOW_v1) and
+  [TextCatBOW.v2](/api/legacy#TextCatBOW_v2) did not have the `length` argument.
 
 </Accordion>
 
diff --git a/website/docs/usage/layers-architectures.mdx b/website/docs/usage/layers-architectures.mdx
index d308b4bdb..03b85f5af 100644
--- a/website/docs/usage/layers-architectures.mdx
+++ b/website/docs/usage/layers-architectures.mdx
@@ -155,7 +155,7 @@ depth = 2
 [components.textcat.model.linear_model]
 @architectures = "spacy.TextCatBOW.v3"
 exclusive_classes = true
-length_exponent = 18
+length = 262144
 ngram_size = 1
 no_output_layer = false
 ```
@@ -173,7 +173,7 @@ labels = []
 [components.textcat.model]
 @architectures = "spacy.TextCatBOW.v3"
 exclusive_classes = true
-length_exponent = 18
+length = 262144
 ngram_size = 1
 no_output_layer = false
 nO = null
diff --git a/website/docs/usage/processing-pipelines.mdx b/website/docs/usage/processing-pipelines.mdx
index c387d9b90..3e58b251d 100644
--- a/website/docs/usage/processing-pipelines.mdx
+++ b/website/docs/usage/processing-pipelines.mdx
@@ -1330,7 +1330,7 @@ labels = []
 [components.textcat.model]
 @architectures = "spacy.TextCatBOW.v3"
 exclusive_classes = true
-length_exponent = 18
+length = 262144
 ngram_size = 1
 no_output_layer = false