diff --git a/spacy/cli/debug_data.py b/spacy/cli/debug_data.py
index e29ee71a2..e3e952d1d 100644
--- a/spacy/cli/debug_data.py
+++ b/spacy/cli/debug_data.py
@@ -526,8 +526,8 @@ def debug_data(
     msg.info(f"{len(label_list)} label(s) in train data")
     p = np.array(counts)
     p = p / p.sum()
-    entropy = np.round((-p*np.log2(p)).sum(), 2)
-    msg.info(f"{entropy} is the train data label entropy")
+    norm_entropy = (-p * np.log2(p)).sum() / np.log2(len(label_list))
+    msg.info(f"{norm_entropy} is the normalised label entropy")
     model_labels = _get_labels_from_model(nlp, "tagger")
     labels = set(label_list)
     missing_labels = model_labels - labels
diff --git a/spacy/tests/pipeline/test_tagger.py b/spacy/tests/pipeline/test_tagger.py
index 32cdd196b..fdc0a17d5 100644
--- a/spacy/tests/pipeline/test_tagger.py
+++ b/spacy/tests/pipeline/test_tagger.py
@@ -70,8 +70,12 @@ PARTIAL_DATA = [
 def test_label_smoothing():
     util.fix_random_seed()
     nlp = Language()
-    tagger_no_ls = nlp.add_pipe("tagger", "no_label_smoothing", config=dict(label_smoothing=False))
-    tagger_ls = nlp.add_pipe("tagger", "label_smoothing", config=dict(label_smoothing=True))
+    tagger_no_ls = nlp.add_pipe(
+        "tagger", "no_label_smoothing", config=dict(label_smoothing=False)
+    )
+    tagger_ls = nlp.add_pipe(
+        "tagger", "label_smoothing", config=dict(label_smoothing=True)
+    )
     train_examples = []
     losses = {}
     for tag in TAGS:
@@ -83,7 +87,10 @@ def test_label_smoothing():
     for i in range(5):
         losses = {}
         nlp.update(train_examples, sgd=optimizer, losses=losses)
-    assert losses == {'no_label_smoothing': 1.4892945885658264, 'label_smoothing': 1.1432453989982605}
+    assert losses == {
+        "no_label_smoothing": 1.4892945885658264,
+        "label_smoothing": 1.1432453989982605,
+    }
 
 
 def test_no_label():