Fix/irreg adverbs extension (#3499)

* extended list of irreg adverbs

* added a test for the new exceptions

* fixed typo
Duygu Altinok 2019-03-28 13:23:33 +01:00 committed by Matthew Honnibal
parent 1db3e47509
commit 5a7bc6b39d
2 changed files with 24 additions and 0 deletions


@@ -5,9 +5,27 @@ from __future__ import unicode_literals
ADVERBS_IRREG = {
    "best": ("well",),
    "better": ("well",),
    "closer": ("close",),
    "closest": ("close",),
    "deeper": ("deeply",),
    "earlier": ("early",),
    "earliest": ("early",),
    "farther": ("far",),
    "further": ("far",),
    "faster": ("fast",),
    "fastest": ("fast",),
    "harder": ("hard",),
    "hardest": ("hard",),
    "longer": ("long",),
    "longest": ("long",),
    "nearer": ("near",),
    "nearest": ("near",),
    "nigher": ("nigh",),
    "nighest": ("nigh",),
    "quicker": ("quick",),
    "quickest": ("quick",),
    "slower": ("slow",),
"slowest": ("slowest",),
"sooner": ("soon",),
"soonest": ("soon",)
} }
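For context, a table like ADVERBS_IRREG is a lookup exception list: the lemmatizer consults it before applying any suffix rules, which is why only genuinely irregular comparative and superlative forms need to be listed. Below is a minimal, self-contained sketch of that exception-first lookup pattern; it is not spaCy's actual lemmatizer code, and the names IRREG and lemmatize_adverb are illustrative.

# Minimal sketch of exception-first lemmatization (illustrative only, not
# spaCy's implementation). The table maps an inflected adverb to a tuple of
# possible lemmas, mirroring the ADVERBS_IRREG format above.
IRREG = {
    "best": ("well",),
    "faster": ("fast",),
    "slowest": ("slow",),
}


def lemmatize_adverb(word):
    # Exceptions win over rules; unknown forms pass through unchanged.
    return IRREG.get(word.lower(), (word,))[0]


print(lemmatize_adverb("faster"))    # fast
print(lemmatize_adverb("best"))      # well
print(lemmatize_adverb("brightly"))  # brightly (no exception listed)

Because the fallback returns the surface form untouched, adding an entry like "slowest": ("slow",) is the only change needed to correct that form's lemma.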


@@ -124,3 +124,9 @@ def test_en_tokenizer_norm_exceptions(en_tokenizer, text, norms):
def test_en_lex_attrs_norm_exceptions(en_tokenizer, text, norm):
    tokens = en_tokenizer(text)
    assert tokens[0].norm_ == norm


@pytest.mark.parametrize("text", ["faster", "fastest", "better", "best"])
def test_en_lemmatizer_handles_irreg_adverbs(en_tokenizer, text):
    tokens = en_tokenizer(text)
    assert tokens[0].lemma_ in ["fast", "well"]
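The assertion checks membership in ["fast", "well"] because the parametrization mixes two lemma families: "faster"/"fastest" lemmatize to "fast" and "better"/"best" to "well". A hypothetical end-to-end check with a full pipeline might look like the sketch below; the model name "en_core_web_sm" is illustrative and is not part of this commit.

import spacy

# Load a full English pipeline so lemmas get assigned (model name assumed,
# not part of this commit).
nlp = spacy.load("en_core_web_sm")
for word in ["faster", "best"]:
    doc = nlp(word)
    print(word, "->", doc[0].lemma_)  # expected: "fast" and "well"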