From 5a7bc6b39deba2ead3ea9d0d6b9af3c88f9f483b Mon Sep 17 00:00:00 2001 From: Duygu Altinok Date: Thu, 28 Mar 2019 13:23:33 +0100 Subject: [PATCH] Fix/irreg adverbs extension (#3499) * extended list of irreg adverbs * added test to exceptions * fixed typo --- spacy/lang/en/lemmatizer/_adverbs_irreg.py | 18 ++++++++++++++++++ spacy/tests/lang/en/test_exceptions.py | 6 ++++++ 2 files changed, 24 insertions(+) diff --git a/spacy/lang/en/lemmatizer/_adverbs_irreg.py b/spacy/lang/en/lemmatizer/_adverbs_irreg.py index 4f0b479b8..4499b2a13 100644 --- a/spacy/lang/en/lemmatizer/_adverbs_irreg.py +++ b/spacy/lang/en/lemmatizer/_adverbs_irreg.py @@ -5,9 +5,27 @@ from __future__ import unicode_literals ADVERBS_IRREG = { "best": ("well",), "better": ("well",), + "closer": ("close",), + "closest": ("close",), "deeper": ("deeply",), + "earlier": ("early",), + "earliest": ("early",), "farther": ("far",), "further": ("far",), + "faster": ("fast",), + "fastest": ("fast",), "harder": ("hard",), "hardest": ("hard",), + "longer": ("long",), + "longest": ("long",), + "nearer": ("near",), + "nearest": ("near",), + "nigher": ("nigh",), + "nighest": ("nigh",), + "quicker": ("quick",), + "quickest": ("quick",), + "slower": ("slow",), + "slowest": ("slow",), + "sooner": ("soon",), + "soonest": ("soon",) } diff --git a/spacy/tests/lang/en/test_exceptions.py b/spacy/tests/lang/en/test_exceptions.py index 6285a9408..b360b517e 100644 --- a/spacy/tests/lang/en/test_exceptions.py +++ b/spacy/tests/lang/en/test_exceptions.py @@ -124,3 +124,9 @@ def test_en_tokenizer_norm_exceptions(en_tokenizer, text, norms): def test_en_lex_attrs_norm_exceptions(en_tokenizer, text, norm): tokens = en_tokenizer(text) assert tokens[0].norm_ == norm + + +@pytest.mark.parametrize("text", ["faster", "fastest", "better", "best"]) +def test_en_lemmatizer_handles_irreg_adverbs(en_tokenizer, text): + tokens = en_tokenizer(text) + assert tokens[0].lemma_ in ["fast", "well"]