mirror of
				https://github.com/explosion/spaCy.git
				synced 2025-10-25 21:21:10 +03:00 
			
		
		
		
	Update TextCatBOW to use the fixed SparseLinear layer (#13149)
				
					
				
			* Update `TextCatBOW` to use the fixed `SparseLinear` layer A while ago, we fixed the `SparseLinear` layer to use all available parameters: https://github.com/explosion/thinc/pull/754 This change updates `TextCatBOW` to `v3` which uses the new `SparseLinear_v2` layer. This results in a sizeable improvement on a text categorization task that was tested. While at it, this `spacy.TextCatBOW.v3` also adds the `length_exponent` option to make it possible to change the hidden size. Ideally, we'd just have an option called `length`. But the way that `TextCatBOW` uses hashes results in a non-uniform distribution of parameters when the length is not a power of two. * Replace TextCatBOW `length_exponent` parameter by `length` We now round up the length to the next power of two if it isn't a power of two. * Remove some tests for TextCatBOW.v2 * Fix missing import
This commit is contained in:
		
							parent
							
								
									bf7c2ea99a
								
							
						
					
					
						commit
						da7ad97519
					
				|  | @ -271,8 +271,9 @@ grad_factor = 1.0 | |||
| @layers = "reduce_mean.v1" | ||||
| 
 | ||||
| [components.textcat.model.linear_model] | ||||
| @architectures = "spacy.TextCatBOW.v2" | ||||
| @architectures = "spacy.TextCatBOW.v3" | ||||
| exclusive_classes = true | ||||
| length = 262144 | ||||
| ngram_size = 1 | ||||
| no_output_layer = false | ||||
| 
 | ||||
|  | @ -308,8 +309,9 @@ grad_factor = 1.0 | |||
| @layers = "reduce_mean.v1" | ||||
| 
 | ||||
| [components.textcat_multilabel.model.linear_model] | ||||
| @architectures = "spacy.TextCatBOW.v2" | ||||
| @architectures = "spacy.TextCatBOW.v3" | ||||
| exclusive_classes = false | ||||
| length = 262144 | ||||
| ngram_size = 1 | ||||
| no_output_layer = false | ||||
| 
 | ||||
|  | @ -542,14 +544,15 @@ nO = null | |||
| width = ${components.tok2vec.model.encode.width} | ||||
| 
 | ||||
| [components.textcat.model.linear_model] | ||||
| @architectures = "spacy.TextCatBOW.v2" | ||||
| @architectures = "spacy.TextCatBOW.v3" | ||||
| exclusive_classes = true | ||||
| length = 262144 | ||||
| ngram_size = 1 | ||||
| no_output_layer = false | ||||
| 
 | ||||
| {% else -%} | ||||
| [components.textcat.model] | ||||
| @architectures = "spacy.TextCatBOW.v2" | ||||
| @architectures = "spacy.TextCatBOW.v3" | ||||
| exclusive_classes = true | ||||
| ngram_size = 1 | ||||
| no_output_layer = false | ||||
|  | @ -570,15 +573,17 @@ nO = null | |||
| width = ${components.tok2vec.model.encode.width} | ||||
| 
 | ||||
| [components.textcat_multilabel.model.linear_model] | ||||
| @architectures = "spacy.TextCatBOW.v2" | ||||
| @architectures = "spacy.TextCatBOW.v3" | ||||
| exclusive_classes = false | ||||
| length = 262144 | ||||
| ngram_size = 1 | ||||
| no_output_layer = false | ||||
| 
 | ||||
| {% else -%} | ||||
| [components.textcat_multilabel.model] | ||||
| @architectures = "spacy.TextCatBOW.v2" | ||||
| @architectures = "spacy.TextCatBOW.v3" | ||||
| exclusive_classes = false | ||||
| length = 262144 | ||||
| ngram_size = 1 | ||||
| no_output_layer = false | ||||
| {%- endif %} | ||||
|  |  | |||
|  | @ -983,6 +983,7 @@ class Errors(metaclass=ErrorsWithCodes): | |||
|              "predicted docs when training {component}.") | ||||
|     E1055 = ("The 'replace_listener' callback expects {num_params} parameters, " | ||||
|              "but only callbacks with one or three parameters are supported") | ||||
|     E1056 = ("The `TextCatBOW` architecture expects a length of at least 1, was {length}.") | ||||
| 
 | ||||
| 
 | ||||
| # Deprecated model shortcuts, only used in errors and warnings | ||||
|  |  | |||
|  | @ -1,5 +1,5 @@ | |||
| from functools import partial | ||||
| from typing import List, Optional, cast | ||||
| from typing import List, Optional, Tuple, cast | ||||
| 
 | ||||
| from thinc.api import ( | ||||
|     Dropout, | ||||
|  | @ -12,6 +12,7 @@ from thinc.api import ( | |||
|     Relu, | ||||
|     Softmax, | ||||
|     SparseLinear, | ||||
|     SparseLinear_v2, | ||||
|     chain, | ||||
|     clone, | ||||
|     concatenate, | ||||
|  | @ -25,9 +26,10 @@ from thinc.api import ( | |||
| ) | ||||
| from thinc.layers.chain import init as init_chain | ||||
| from thinc.layers.resizable import resize_linear_weighted, resize_model | ||||
| from thinc.types import Floats2d | ||||
| from thinc.types import ArrayXd, Floats2d | ||||
| 
 | ||||
| from ...attrs import ORTH | ||||
| from ...errors import Errors | ||||
| from ...tokens import Doc | ||||
| from ...util import registry | ||||
| from ..extract_ngrams import extract_ngrams | ||||
|  | @ -95,10 +97,48 @@ def build_bow_text_classifier( | |||
|     ngram_size: int, | ||||
|     no_output_layer: bool, | ||||
|     nO: Optional[int] = None, | ||||
| ) -> Model[List[Doc], Floats2d]: | ||||
|     return _build_bow_text_classifier( | ||||
|         exclusive_classes=exclusive_classes, | ||||
|         ngram_size=ngram_size, | ||||
|         no_output_layer=no_output_layer, | ||||
|         nO=nO, | ||||
|         sparse_linear=SparseLinear(nO=nO), | ||||
|     ) | ||||
| 
 | ||||
| 
 | ||||
| @registry.architectures("spacy.TextCatBOW.v3") | ||||
| def build_bow_text_classifier_v3( | ||||
|     exclusive_classes: bool, | ||||
|     ngram_size: int, | ||||
|     no_output_layer: bool, | ||||
|     length: int = 262144, | ||||
|     nO: Optional[int] = None, | ||||
| ) -> Model[List[Doc], Floats2d]: | ||||
|     if length < 1: | ||||
|         raise ValueError(Errors.E1056.format(length=length)) | ||||
| 
 | ||||
|     # Find k such that 2**(k-1) < length <= 2**k. | ||||
|     length = 2 ** (length - 1).bit_length() | ||||
| 
 | ||||
|     return _build_bow_text_classifier( | ||||
|         exclusive_classes=exclusive_classes, | ||||
|         ngram_size=ngram_size, | ||||
|         no_output_layer=no_output_layer, | ||||
|         nO=nO, | ||||
|         sparse_linear=SparseLinear_v2(nO=nO, length=length), | ||||
|     ) | ||||
| 
 | ||||
| 
 | ||||
| def _build_bow_text_classifier( | ||||
|     exclusive_classes: bool, | ||||
|     ngram_size: int, | ||||
|     no_output_layer: bool, | ||||
|     sparse_linear: Model[Tuple[ArrayXd, ArrayXd, ArrayXd], ArrayXd], | ||||
|     nO: Optional[int] = None, | ||||
| ) -> Model[List[Doc], Floats2d]: | ||||
|     fill_defaults = {"b": 0, "W": 0} | ||||
|     with Model.define_operators({">>": chain}): | ||||
|         sparse_linear = SparseLinear(nO=nO) | ||||
|         output_layer = None | ||||
|         if not no_output_layer: | ||||
|             fill_defaults["b"] = NEG_VALUE | ||||
|  |  | |||
|  | @ -36,8 +36,9 @@ maxout_pieces = 3 | |||
| depth = 2 | ||||
| 
 | ||||
| [model.linear_model] | ||||
| @architectures = "spacy.TextCatBOW.v2" | ||||
| @architectures = "spacy.TextCatBOW.v3" | ||||
| exclusive_classes = true | ||||
| length = 262144 | ||||
| ngram_size = 1 | ||||
| no_output_layer = false | ||||
| """ | ||||
|  | @ -45,8 +46,9 @@ DEFAULT_SINGLE_TEXTCAT_MODEL = Config().from_str(single_label_default_config)["m | |||
| 
 | ||||
| single_label_bow_config = """ | ||||
| [model] | ||||
| @architectures = "spacy.TextCatBOW.v2" | ||||
| @architectures = "spacy.TextCatBOW.v3" | ||||
| exclusive_classes = true | ||||
| length = 262144 | ||||
| ngram_size = 1 | ||||
| no_output_layer = false | ||||
| """ | ||||
|  |  | |||
|  | @ -35,8 +35,9 @@ maxout_pieces = 3 | |||
| depth = 2 | ||||
| 
 | ||||
| [model.linear_model] | ||||
| @architectures = "spacy.TextCatBOW.v2" | ||||
| @architectures = "spacy.TextCatBOW.v3" | ||||
| exclusive_classes = false | ||||
| length = 262144 | ||||
| ngram_size = 1 | ||||
| no_output_layer = false | ||||
| """ | ||||
|  | @ -44,7 +45,7 @@ DEFAULT_MULTI_TEXTCAT_MODEL = Config().from_str(multi_label_default_config)["mod | |||
| 
 | ||||
| multi_label_bow_config = """ | ||||
| [model] | ||||
| @architectures = "spacy.TextCatBOW.v2" | ||||
| @architectures = "spacy.TextCatBOW.v3" | ||||
| exclusive_classes = false | ||||
| ngram_size = 1 | ||||
| no_output_layer = false | ||||
|  |  | |||
|  | @ -203,7 +203,7 @@ def test_pipe_class_component_model(): | |||
|             "@architectures": "spacy.TextCatEnsemble.v2", | ||||
|             "tok2vec": DEFAULT_TOK2VEC_MODEL, | ||||
|             "linear_model": { | ||||
|                 "@architectures": "spacy.TextCatBOW.v2", | ||||
|                 "@architectures": "spacy.TextCatBOW.v3", | ||||
|                 "exclusive_classes": False, | ||||
|                 "ngram_size": 1, | ||||
|                 "no_output_layer": False, | ||||
|  |  | |||
|  | @ -414,7 +414,7 @@ def test_implicit_label(name, get_examples): | |||
| @pytest.mark.parametrize( | ||||
|     "name,textcat_config", | ||||
|     [ | ||||
|         # BOW | ||||
|         # BOW V1 | ||||
|         ("textcat", {"@architectures": "spacy.TextCatBOW.v1", "exclusive_classes": True, "no_output_layer": False, "ngram_size": 3}), | ||||
|         ("textcat", {"@architectures": "spacy.TextCatBOW.v1", "exclusive_classes": True, "no_output_layer": True, "ngram_size": 3}), | ||||
|         ("textcat_multilabel", {"@architectures": "spacy.TextCatBOW.v1", "exclusive_classes": False, "no_output_layer": False, "ngram_size": 3}), | ||||
|  | @ -451,11 +451,11 @@ def test_no_resize(name, textcat_config): | |||
| @pytest.mark.parametrize( | ||||
|     "name,textcat_config", | ||||
|     [ | ||||
|         # BOW | ||||
|         ("textcat", {"@architectures": "spacy.TextCatBOW.v2", "exclusive_classes": True, "no_output_layer": False, "ngram_size": 3}), | ||||
|         ("textcat", {"@architectures": "spacy.TextCatBOW.v2", "exclusive_classes": True, "no_output_layer": True, "ngram_size": 3}), | ||||
|         ("textcat_multilabel", {"@architectures": "spacy.TextCatBOW.v2", "exclusive_classes": False, "no_output_layer": False, "ngram_size": 3}), | ||||
|         ("textcat_multilabel", {"@architectures": "spacy.TextCatBOW.v2", "exclusive_classes": False, "no_output_layer": True, "ngram_size": 3}), | ||||
|         # BOW V3 | ||||
|         ("textcat", {"@architectures": "spacy.TextCatBOW.v3", "exclusive_classes": True, "no_output_layer": False, "ngram_size": 3}), | ||||
|         ("textcat", {"@architectures": "spacy.TextCatBOW.v3", "exclusive_classes": True, "no_output_layer": True, "ngram_size": 3}), | ||||
|         ("textcat_multilabel", {"@architectures": "spacy.TextCatBOW.v3", "exclusive_classes": False, "no_output_layer": False, "ngram_size": 3}), | ||||
|         ("textcat_multilabel", {"@architectures": "spacy.TextCatBOW.v3", "exclusive_classes": False, "no_output_layer": True, "ngram_size": 3}), | ||||
|         # CNN | ||||
|         ("textcat", {"@architectures": "spacy.TextCatCNN.v2", "tok2vec": DEFAULT_TOK2VEC_MODEL, "exclusive_classes": True}), | ||||
|         ("textcat_multilabel", {"@architectures": "spacy.TextCatCNN.v2", "tok2vec": DEFAULT_TOK2VEC_MODEL, "exclusive_classes": False}), | ||||
|  | @ -480,11 +480,11 @@ def test_resize(name, textcat_config): | |||
| @pytest.mark.parametrize( | ||||
|     "name,textcat_config", | ||||
|     [ | ||||
|         # BOW | ||||
|         ("textcat", {"@architectures": "spacy.TextCatBOW.v2", "exclusive_classes": True, "no_output_layer": False, "ngram_size": 3}), | ||||
|         ("textcat", {"@architectures": "spacy.TextCatBOW.v2", "exclusive_classes": True, "no_output_layer": True, "ngram_size": 3}), | ||||
|         ("textcat_multilabel", {"@architectures": "spacy.TextCatBOW.v2", "exclusive_classes": False, "no_output_layer": False, "ngram_size": 3}), | ||||
|         ("textcat_multilabel", {"@architectures": "spacy.TextCatBOW.v2", "exclusive_classes": False, "no_output_layer": True, "ngram_size": 3}), | ||||
|         # BOW V3 | ||||
|         ("textcat", {"@architectures": "spacy.TextCatBOW.v3", "exclusive_classes": True, "no_output_layer": False, "ngram_size": 3}), | ||||
|         ("textcat", {"@architectures": "spacy.TextCatBOW.v3", "exclusive_classes": True, "no_output_layer": True, "ngram_size": 3}), | ||||
|         ("textcat_multilabel", {"@architectures": "spacy.TextCatBOW.v3", "exclusive_classes": False, "no_output_layer": False, "ngram_size": 3}), | ||||
|         ("textcat_multilabel", {"@architectures": "spacy.TextCatBOW.v3", "exclusive_classes": False, "no_output_layer": True, "ngram_size": 3}), | ||||
|         # CNN | ||||
|         ("textcat", {"@architectures": "spacy.TextCatCNN.v2", "tok2vec": DEFAULT_TOK2VEC_MODEL, "exclusive_classes": True}), | ||||
|         ("textcat_multilabel", {"@architectures": "spacy.TextCatCNN.v2", "tok2vec": DEFAULT_TOK2VEC_MODEL, "exclusive_classes": False}), | ||||
|  | @ -693,9 +693,14 @@ def test_overfitting_IO_multi(): | |||
|         ("textcat", TRAIN_DATA_SINGLE_LABEL, {"@architectures": "spacy.TextCatBOW.v2", "exclusive_classes": True, "ngram_size": 4, "no_output_layer": False}), | ||||
|         ("textcat_multilabel", TRAIN_DATA_MULTI_LABEL, {"@architectures": "spacy.TextCatBOW.v2", "exclusive_classes": False, "ngram_size": 3, "no_output_layer": True}), | ||||
|         ("textcat", TRAIN_DATA_SINGLE_LABEL, {"@architectures": "spacy.TextCatBOW.v2", "exclusive_classes": True, "ngram_size": 2, "no_output_layer": True}), | ||||
|         # BOW V3 | ||||
|         ("textcat_multilabel", TRAIN_DATA_MULTI_LABEL, {"@architectures": "spacy.TextCatBOW.v3", "exclusive_classes": False, "ngram_size": 1, "no_output_layer": False}), | ||||
|         ("textcat", TRAIN_DATA_SINGLE_LABEL, {"@architectures": "spacy.TextCatBOW.v3", "exclusive_classes": True, "ngram_size": 4, "no_output_layer": False}), | ||||
|         ("textcat_multilabel", TRAIN_DATA_MULTI_LABEL, {"@architectures": "spacy.TextCatBOW.v3", "exclusive_classes": False, "ngram_size": 3, "no_output_layer": True}), | ||||
|         ("textcat", TRAIN_DATA_SINGLE_LABEL, {"@architectures": "spacy.TextCatBOW.v3", "exclusive_classes": True, "ngram_size": 2, "no_output_layer": True}), | ||||
|         # ENSEMBLE V2 | ||||
|         ("textcat_multilabel", TRAIN_DATA_MULTI_LABEL, {"@architectures": "spacy.TextCatEnsemble.v2", "tok2vec": DEFAULT_TOK2VEC_MODEL, "linear_model": {"@architectures": "spacy.TextCatBOW.v2", "exclusive_classes": False, "ngram_size": 1, "no_output_layer": False}}), | ||||
|         ("textcat", TRAIN_DATA_SINGLE_LABEL, {"@architectures": "spacy.TextCatEnsemble.v2", "tok2vec": DEFAULT_TOK2VEC_MODEL, "linear_model": {"@architectures": "spacy.TextCatBOW.v2", "exclusive_classes": True, "ngram_size": 5, "no_output_layer": False}}), | ||||
|         ("textcat_multilabel", TRAIN_DATA_MULTI_LABEL, {"@architectures": "spacy.TextCatEnsemble.v2", "tok2vec": DEFAULT_TOK2VEC_MODEL, "linear_model": {"@architectures": "spacy.TextCatBOW.v3", "exclusive_classes": False, "ngram_size": 1, "no_output_layer": False}}), | ||||
|         ("textcat", TRAIN_DATA_SINGLE_LABEL, {"@architectures": "spacy.TextCatEnsemble.v2", "tok2vec": DEFAULT_TOK2VEC_MODEL, "linear_model": {"@architectures": "spacy.TextCatBOW.v3", "exclusive_classes": True, "ngram_size": 5, "no_output_layer": False}}), | ||||
|         # CNN V2 | ||||
|         ("textcat", TRAIN_DATA_SINGLE_LABEL, {"@architectures": "spacy.TextCatCNN.v2", "tok2vec": DEFAULT_TOK2VEC_MODEL, "exclusive_classes": True}), | ||||
|         ("textcat_multilabel", TRAIN_DATA_MULTI_LABEL, {"@architectures": "spacy.TextCatCNN.v2", "tok2vec": DEFAULT_TOK2VEC_MODEL, "exclusive_classes": False}), | ||||
|  |  | |||
|  | @ -238,7 +238,7 @@ def test_project_push_pull(project_dir): | |||
| 
 | ||||
| def test_find_function_valid(): | ||||
|     # example of architecture in main code base | ||||
|     function = "spacy.TextCatBOW.v2" | ||||
|     function = "spacy.TextCatBOW.v3" | ||||
|     result = CliRunner().invoke(app, ["find-function", function, "-r", "architectures"]) | ||||
|     assert f"Found registered function '{function}'" in result.stdout | ||||
|     assert "textcat.py" in result.stdout | ||||
|  | @ -257,7 +257,7 @@ def test_find_function_valid(): | |||
| 
 | ||||
| def test_find_function_invalid(): | ||||
|     # invalid registry | ||||
|     function = "spacy.TextCatBOW.v2" | ||||
|     function = "spacy.TextCatBOW.v3" | ||||
|     registry = "foobar" | ||||
|     result = CliRunner().invoke( | ||||
|         app, ["find-function", function, "--registry", registry] | ||||
|  |  | |||
|  | @ -376,8 +376,9 @@ def test_util_dot_section(): | |||
|     factory = "textcat" | ||||
| 
 | ||||
|     [components.textcat.model] | ||||
|     @architectures = "spacy.TextCatBOW.v2" | ||||
|     @architectures = "spacy.TextCatBOW.v3" | ||||
|     exclusive_classes = true | ||||
|     length = 262144 | ||||
|     ngram_size = 1 | ||||
|     no_output_layer = false | ||||
|     """ | ||||
|  |  | |||
|  | @ -78,16 +78,16 @@ subword features, and a | |||
| [MaxoutWindowEncoder](/api/architectures#MaxoutWindowEncoder) encoding layer | ||||
| consisting of a CNN and a layer-normalized maxout activation function. | ||||
| 
 | ||||
| | Name                 | Description                                                                                                                                                                                                                                                                   | | ||||
| | -------------------- | ----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | | ||||
| | `width`              | The width of the input and output. These are required to be the same, so that residual connections can be used. Recommended values are `96`, `128` or `300`. ~~int~~                                                                                                          | | ||||
| | `depth`              | The number of convolutional layers to use. Recommended values are between `2` and `8`. ~~int~~                                                                                                                                                                                | | ||||
| | `embed_size`         | The number of rows in the hash embedding tables. This can be surprisingly small, due to the use of the hash embeddings. Recommended values are between `2000` and `10000`. ~~int~~                                                                                            | | ||||
| | Name                 | Description                                                                                                                                                                                                                                                                 | | ||||
| | -------------------- | --------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | | ||||
| | `width`              | The width of the input and output. These are required to be the same, so that residual connections can be used. Recommended values are `96`, `128` or `300`. ~~int~~                                                                                                        | | ||||
| | `depth`              | The number of convolutional layers to use. Recommended values are between `2` and `8`. ~~int~~                                                                                                                                                                              | | ||||
| | `embed_size`         | The number of rows in the hash embedding tables. This can be surprisingly small, due to the use of the hash embeddings. Recommended values are between `2000` and `10000`. ~~int~~                                                                                          | | ||||
| | `window_size`        | The number of tokens on either side to concatenate during the convolutions. The receptive field of the CNN will be `depth * window_size * 2 + 1`, so a 4-layer network with a window size of `2` will be sensitive to 17 words at a time. Recommended value is `1`. ~~int~~ | | ||||
| | `maxout_pieces`      | The number of pieces to use in the maxout non-linearity. If `1`, the [`Mish`](https://thinc.ai/docs/api-layers#mish) non-linearity is used instead. Recommended values are `1`-`3`. ~~int~~                                                                                   | | ||||
| | `subword_features`   | Whether to also embed subword features, specifically the prefix, suffix and word shape. This is recommended for alphabetic languages like English, but not if single-character tokens are used for a language such as Chinese. ~~bool~~                                       | | ||||
| | `pretrained_vectors` | Whether to also use static vectors. ~~bool~~                                                                                                                                                                                                                                  | | ||||
| | **CREATES**          | The model using the architecture. ~~Model[List[Doc], List[Floats2d]]~~                                                                                                                                                                                                        | | ||||
| | `maxout_pieces`      | The number of pieces to use in the maxout non-linearity. If `1`, the [`Mish`](https://thinc.ai/docs/api-layers#mish) non-linearity is used instead. Recommended values are `1`-`3`. ~~int~~                                                                                 | | ||||
| | `subword_features`   | Whether to also embed subword features, specifically the prefix, suffix and word shape. This is recommended for alphabetic languages like English, but not if single-character tokens are used for a language such as Chinese. ~~bool~~                                     | | ||||
| | `pretrained_vectors` | Whether to also use static vectors. ~~bool~~                                                                                                                                                                                                                                | | ||||
| | **CREATES**          | The model using the architecture. ~~Model[List[Doc], List[Floats2d]]~~                                                                                                                                                                                                      | | ||||
| 
 | ||||
| ### spacy.Tok2VecListener.v1 {id="Tok2VecListener"} | ||||
| 
 | ||||
|  | @ -962,8 +962,9 @@ single-label use-cases where `exclusive_classes = true`, while the | |||
| > nO = null | ||||
| > | ||||
| > [model.linear_model] | ||||
| > @architectures = "spacy.TextCatBOW.v2" | ||||
| > @architectures = "spacy.TextCatBOW.v3" | ||||
| > exclusive_classes = true | ||||
| > length = 262144 | ||||
| > ngram_size = 1 | ||||
| > no_output_layer = false | ||||
| > | ||||
|  | @ -1057,14 +1058,15 @@ after training. | |||
| 
 | ||||
| </Accordion> | ||||
| 
 | ||||
| ### spacy.TextCatBOW.v2 {id="TextCatBOW"} | ||||
| ### spacy.TextCatBOW.v3 {id="TextCatBOW"} | ||||
| 
 | ||||
| > #### Example Config | ||||
| > | ||||
| > ```ini | ||||
| > [model] | ||||
| > @architectures = "spacy.TextCatBOW.v2" | ||||
| > @architectures = "spacy.TextCatBOW.v3" | ||||
| > exclusive_classes = false | ||||
| > length = 262144 | ||||
| > ngram_size = 1 | ||||
| > no_output_layer = false | ||||
| > nO = null | ||||
|  | @ -1078,14 +1080,19 @@ the others, but may not be as accurate, especially if texts are short. | |||
| | `exclusive_classes` | Whether or not categories are mutually exclusive. ~~bool~~                                                                                                                                     | | ||||
| | `ngram_size`        | Determines the maximum length of the n-grams in the BOW model. For instance, `ngram_size=3` would give unigram, bigram and trigram features. ~~int~~                                           | | ||||
| | `no_output_layer`   | Whether or not to add an output layer to the model (`Softmax` activation if `exclusive_classes` is `True`, else `Logistic`). ~~bool~~                                                          | | ||||
| | `length`            | The size of the weights vector. The length will be rounded up to the next power of two if it is not a power of two. Defaults to `262144`. ~~int~~                                              | | ||||
| | `nO`                | Output dimension, determined by the number of different labels. If not set, the [`TextCategorizer`](/api/textcategorizer) component will set it when `initialize` is called. ~~Optional[int]~~ | | ||||
| | **CREATES**         | The model using the architecture. ~~Model[List[Doc], Floats2d]~~                                                                                                                               | | ||||
| 
 | ||||
| <Accordion title="spacy.TextCatBOW.v1 definition" spaced> | ||||
| <Accordion title="Previous versions of spacy.TextCatBOW" spaced> | ||||
| 
 | ||||
| [TextCatBOW.v1](/api/legacy#TextCatBOW_v1) had the exact same signature, but was | ||||
| not yet resizable. Since v2, new labels can be added to this component, even | ||||
| after training. | ||||
| - [TextCatBOW.v1](/api/legacy#TextCatBOW_v1) was not yet resizable. Since v2, | ||||
|   new labels can be added to this component, even after training. | ||||
| - [TextCatBOW.v1](/api/legacy#TextCatBOW_v1) and | ||||
|   [TextCatBOW.v2](/api/legacy#TextCatBOW_v2) used an erroneous sparse linear | ||||
|   layer that only used a small number of the allocated parameters. | ||||
| - [TextCatBOW.v1](/api/legacy#TextCatBOW_v1) and | ||||
|   [TextCatBOW.v2](/api/legacy#TextCatBOW_v2) did not have the `length` argument. | ||||
| 
 | ||||
| </Accordion> | ||||
| 
 | ||||
|  |  | |||
|  | @ -198,7 +198,9 @@ architecture is usually less accurate than the ensemble, but runs faster. | |||
| 
 | ||||
| Since `spacy.TextCatBOW.v2`, this architecture has become resizable, which means | ||||
| that you can add labels to a previously trained textcat. `TextCatBOW` v1 did not | ||||
| yet support that. | ||||
| yet support that. Versions of this model before `spacy.TextCatBOW.v3` used an | ||||
| erroneous sparse linear layer that only used a small number of the allocated | ||||
| parameters. | ||||
| 
 | ||||
| > #### Example Config | ||||
| > | ||||
|  | @ -222,6 +224,33 @@ the others, but may not be as accurate, especially if texts are short. | |||
| | `nO`                | Output dimension, determined by the number of different labels. If not set, the [`TextCategorizer`](/api/textcategorizer) component will set it when `initialize` is called. ~~Optional[int]~~ | | ||||
| | **CREATES**         | The model using the architecture. ~~Model[List[Doc], Floats2d]~~                                                                                                                               | | ||||
| 
 | ||||
| ### spacy.TextCatBOW.v2 {id="TextCatBOW_v2"} | ||||
| 
 | ||||
| Versions of this model before `spacy.TextCatBOW.v3` used an erroneous sparse | ||||
| linear layer that only used a small number of the allocated parameters. | ||||
| 
 | ||||
| > #### Example Config | ||||
| > | ||||
| > ```ini | ||||
| > [model] | ||||
| > @architectures = "spacy.TextCatBOW.v2" | ||||
| > exclusive_classes = false | ||||
| > ngram_size = 1 | ||||
| > no_output_layer = false | ||||
| > nO = null | ||||
| > ``` | ||||
| 
 | ||||
| An n-gram "bag-of-words" model. This architecture should run much faster than | ||||
| the others, but may not be as accurate, especially if texts are short. | ||||
| 
 | ||||
| | Name                | Description                                                                                                                                                                                    | | ||||
| | ------------------- | ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | | ||||
| | `exclusive_classes` | Whether or not categories are mutually exclusive. ~~bool~~                                                                                                                                     | | ||||
| | `ngram_size`        | Determines the maximum length of the n-grams in the BOW model. For instance, `ngram_size=3` would give unigram, bigram and trigram features. ~~int~~                                           | | ||||
| | `no_output_layer`   | Whether or not to add an output layer to the model (`Softmax` activation if `exclusive_classes` is `True`, else `Logistic`). ~~bool~~                                                          | | ||||
| | `nO`                | Output dimension, determined by the number of different labels. If not set, the [`TextCategorizer`](/api/textcategorizer) component will set it when `initialize` is called. ~~Optional[int]~~ | | ||||
| | **CREATES**         | The model using the architecture. ~~Model[List[Doc], Floats2d]~~                                                                                                                               | | ||||
| 
 | ||||
| ### spacy.TransitionBasedParser.v1 {id="TransitionBasedParser_v1"} | ||||
| 
 | ||||
| Identical to | ||||
|  |  | |||
|  | @ -153,8 +153,9 @@ maxout_pieces = 3 | |||
| depth = 2 | ||||
| 
 | ||||
| [components.textcat.model.linear_model] | ||||
| @architectures = "spacy.TextCatBOW.v2" | ||||
| @architectures = "spacy.TextCatBOW.v3" | ||||
| exclusive_classes = true | ||||
| length = 262144 | ||||
| ngram_size = 1 | ||||
| no_output_layer = false | ||||
| ``` | ||||
|  | @ -170,8 +171,9 @@ factory = "textcat" | |||
| labels = [] | ||||
| 
 | ||||
| [components.textcat.model] | ||||
| @architectures = "spacy.TextCatBOW.v2" | ||||
| @architectures = "spacy.TextCatBOW.v3" | ||||
| exclusive_classes = true | ||||
| length = 262144 | ||||
| ngram_size = 1 | ||||
| no_output_layer = false | ||||
| nO = null | ||||
|  |  | |||
|  | @ -1328,8 +1328,9 @@ labels = [] | |||
| # This function is created and then passed to the "textcat" component as | ||||
| # the argument "model" | ||||
| [components.textcat.model] | ||||
| @architectures = "spacy.TextCatBOW.v2" | ||||
| @architectures = "spacy.TextCatBOW.v3" | ||||
| exclusive_classes = true | ||||
| length = 262144 | ||||
| ngram_size = 1 | ||||
| no_output_layer = false | ||||
| 
 | ||||
|  |  | |||
		Loading…
	
		Reference in New Issue
	
	Block a user