From 36634d2adfe270a8f240230cf652c10c1c7c6ba3 Mon Sep 17 00:00:00 2001 From: vinit Date: Mon, 6 Mar 2023 14:54:37 +0530 Subject: [PATCH] update morphologizer, tagger test --- spacy/cli/templates/quickstart_training.jinja | 3 ++- spacy/pipeline/morphologizer.pyx | 11 +++++--- spacy/tests/pipeline/test_morphologizer.py | 27 ++++++++++++++++++- spacy/tests/pipeline/test_tagger.py | 5 ++-- website/docs/api/morphologizer.mdx | 13 ++++----- 5 files changed, 45 insertions(+), 14 deletions(-) diff --git a/spacy/cli/templates/quickstart_training.jinja b/spacy/cli/templates/quickstart_training.jinja index ab61396d9..046ceb81e 100644 --- a/spacy/cli/templates/quickstart_training.jinja +++ b/spacy/cli/templates/quickstart_training.jinja @@ -69,7 +69,6 @@ grad_factor = 1.0 {% if "tagger" in components %} [components.tagger] factory = "tagger" -label_smoothing = 0.05 [components.tagger.model] @architectures = "spacy.Tagger.v2" @@ -287,6 +286,7 @@ maxout_pieces = 3 {% if "morphologizer" in components %} [components.morphologizer] factory = "morphologizer" +label_smoothing = 0.05 [components.morphologizer.model] @architectures = "spacy.Tagger.v2" @@ -300,6 +300,7 @@ width = ${components.tok2vec.model.encode.width} {% if "tagger" in components %} [components.tagger] factory = "tagger" +label_smoothing = 0.05 [components.tagger.model] @architectures = "spacy.Tagger.v2" diff --git a/spacy/pipeline/morphologizer.pyx b/spacy/pipeline/morphologizer.pyx index 24f98508f..be8f82212 100644 --- a/spacy/pipeline/morphologizer.pyx +++ b/spacy/pipeline/morphologizer.pyx @@ -52,7 +52,8 @@ DEFAULT_MORPH_MODEL = Config().from_str(default_model_config)["model"] @Language.factory( "morphologizer", assigns=["token.morph", "token.pos"], - default_config={"model": DEFAULT_MORPH_MODEL, "overwrite": True, "extend": False, "scorer": {"@scorers": "spacy.morphologizer_scorer.v1"}}, + default_config={"model": DEFAULT_MORPH_MODEL, "overwrite": True, "extend": False, + "scorer": {"@scorers": 
"spacy.morphologizer_scorer.v1"}, "label_smoothing": 0.0}, default_score_weights={"pos_acc": 0.5, "morph_acc": 0.5, "morph_per_feat": None}, ) def make_morphologizer( @@ -61,9 +62,10 @@ def make_morphologizer( name: str, overwrite: bool, extend: bool, + label_smoothing: float, scorer: Optional[Callable], ): - return Morphologizer(nlp.vocab, model, name, overwrite=overwrite, extend=extend, scorer=scorer) + return Morphologizer(nlp.vocab, model, name, overwrite=overwrite, extend=extend, label_smoothing=label_smoothing, scorer=scorer) def morphologizer_score(examples, **kwargs): @@ -94,6 +96,7 @@ class Morphologizer(Tagger): *, overwrite: bool = BACKWARD_OVERWRITE, extend: bool = BACKWARD_EXTEND, + label_smoothing: float = 0.0, scorer: Optional[Callable] = morphologizer_score, ): """Initialize a morphologizer. @@ -121,6 +124,7 @@ class Morphologizer(Tagger): "labels_pos": {}, "overwrite": overwrite, "extend": extend, + "label_smoothing": label_smoothing, } self.cfg = dict(sorted(cfg.items())) self.scorer = scorer @@ -270,7 +274,8 @@ class Morphologizer(Tagger): DOCS: https://spacy.io/api/morphologizer#get_loss """ validate_examples(examples, "Morphologizer.get_loss") - loss_func = SequenceCategoricalCrossentropy(names=self.labels, normalize=False) + loss_func = SequenceCategoricalCrossentropy(names=self.labels, normalize=False, + label_smoothing=self.cfg["label_smoothing"]) truths = [] for eg in examples: eg_truths = [] diff --git a/spacy/tests/pipeline/test_morphologizer.py b/spacy/tests/pipeline/test_morphologizer.py index 33696bfd8..8ce74ccfa 100644 --- a/spacy/tests/pipeline/test_morphologizer.py +++ b/spacy/tests/pipeline/test_morphologizer.py @@ -1,5 +1,5 @@ import pytest -from numpy.testing import assert_equal +from numpy.testing import assert_equal, assert_almost_equal from spacy import util from spacy.training import Example @@ -19,6 +19,8 @@ def test_label_types(): morphologizer.add_label(9) +TAGS = ["Feat=N", "Feat=V", "Feat=J"] + TRAIN_DATA = [ ( "I like 
green eggs", @@ -32,6 +34,29 @@ TRAIN_DATA = [ ] +def test_label_smoothing(): + nlp = Language() + morph_no_ls = nlp.add_pipe("morphologizer", "no_label_smoothing") + morph_ls = nlp.add_pipe( + "morphologizer", "label_smoothing", config=dict(label_smoothing=0.05) + ) + train_examples = [] + losses = {} + for tag in TAGS: + morph_no_ls.add_label(tag) + morph_ls.add_label(tag) + for t in TRAIN_DATA: + train_examples.append(Example.from_dict(nlp.make_doc(t[0]), t[1])) + + nlp.initialize(get_examples=lambda: train_examples) + tag_scores, bp_tag_scores = morph_ls.model.begin_update( + [eg.predicted for eg in train_examples] + ) + no_ls_grads = morph_no_ls.get_loss(train_examples, tag_scores)[1][0] + ls_grads = morph_ls.get_loss(train_examples, tag_scores)[1][0] + assert_almost_equal(ls_grads / no_ls_grads, 0.94285715) + + def test_no_label(): nlp = Language() nlp.add_pipe("morphologizer") diff --git a/spacy/tests/pipeline/test_tagger.py b/spacy/tests/pipeline/test_tagger.py index 65c4cbe0a..0cc25a64b 100644 --- a/spacy/tests/pipeline/test_tagger.py +++ b/spacy/tests/pipeline/test_tagger.py @@ -1,5 +1,5 @@ import pytest -from numpy.testing import assert_equal, assert_array_almost_equal +from numpy.testing import assert_equal, assert_almost_equal from spacy.attrs import TAG from spacy import util @@ -68,7 +68,6 @@ PARTIAL_DATA = [ def test_label_smoothing(): - util.fix_random_seed() nlp = Language() tagger_no_ls = nlp.add_pipe("tagger", "no_label_smoothing") tagger_ls = nlp.add_pipe( @@ -88,7 +87,7 @@ def test_label_smoothing(): ) no_ls_grads = tagger_no_ls.get_loss(train_examples, tag_scores)[1][0] ls_grads = tagger_ls.get_loss(train_examples, tag_scores)[1][0] - assert_array_almost_equal((ls_grads - no_ls_grads)[0], [0.05, -0.025, -0.025]) + assert_almost_equal(ls_grads / no_ls_grads, 0.925) def test_no_label(): diff --git a/website/docs/api/morphologizer.mdx b/website/docs/api/morphologizer.mdx index f097f2ae3..440061dc5 100644 --- a/website/docs/api/morphologizer.mdx 
+++ b/website/docs/api/morphologizer.mdx @@ -42,12 +42,13 @@ architectures and their arguments and hyperparameters. > nlp.add_pipe("morphologizer", config=config) > ``` -| Setting | Description | -| ---------------------------------------- | ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | -| `model` | The model to use. Defaults to [Tagger](/api/architectures#Tagger). ~~Model[List[Doc], List[Floats2d]]~~ | -| `overwrite` 3.2 | Whether the values of existing features are overwritten. Defaults to `True`. ~~bool~~ | -| `extend` 3.2 | Whether existing feature types (whose values may or may not be overwritten depending on `overwrite`) are preserved. Defaults to `False`. ~~bool~~ | -| `scorer` 3.2 | The scoring method. Defaults to [`Scorer.score_token_attr`](/api/scorer#score_token_attr) for the attributes `"pos"` and `"morph"` and [`Scorer.score_token_attr_per_feat`](/api/scorer#score_token_attr_per_feat) for the attribute `"morph"`. ~~Optional[Callable]~~ | +| Setting | Description | +| ---------------------------------------------- | ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | +| `model` | The model to use. Defaults to [Tagger](/api/architectures#Tagger). ~~Model[List[Doc], List[Floats2d]]~~ | +| `overwrite` 3.2 | Whether the values of existing features are overwritten. Defaults to `True`. ~~bool~~ | +| `extend` 3.2 | Whether existing feature types (whose values may or may not be overwritten depending on `overwrite`) are preserved. Defaults to `False`. ~~bool~~ | +| `scorer` 3.2 | The scoring method. 
Defaults to [`Scorer.score_token_attr`](/api/scorer#score_token_attr) for the attributes `"pos"` and `"morph"` and [`Scorer.score_token_attr_per_feat`](/api/scorer#score_token_attr_per_feat) for the attribute `"morph"`. ~~Optional[Callable]~~ | +| `label_smoothing` 3.6 | The label smoothing factor applied to the loss during training. Defaults to `0.0` (no label smoothing). ~~float~~ | ```python %%GITHUB_SPACY/spacy/pipeline/morphologizer.pyx