From 87c329c7114767d8788090a3838fce0bf36822b7 Mon Sep 17 00:00:00 2001
From: Adriane Boyd
Date: Wed, 16 Sep 2020 17:37:29 +0200
Subject: [PATCH] Set rule-based lemmatizers as default (#6076)

For languages without provided models and with lemmatizer rules in
`spacy-lookups-data`, make the rule-based lemmatizer the default:
Bengali, Persian, Norwegian, Swedish
---
 spacy/lang/bn/__init__.py            | 22 ++++++++++++++++++++++
 spacy/lang/fa/__init__.py            | 22 ++++++++++++++++++++++
 spacy/lang/nb/__init__.py            | 22 ++++++++++++++++++++++
 spacy/lang/sv/__init__.py            | 23 +++++++++++++++++++++++
 spacy/tests/lang/test_lemmatizers.py |  2 +-
 5 files changed, 90 insertions(+), 1 deletion(-)

diff --git a/spacy/lang/bn/__init__.py b/spacy/lang/bn/__init__.py
index 6c1d66cba..270185a4b 100644
--- a/spacy/lang/bn/__init__.py
+++ b/spacy/lang/bn/__init__.py
@@ -1,7 +1,11 @@
+from typing import Optional
+from thinc.api import Model
 from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
 from .punctuation import TOKENIZER_PREFIXES, TOKENIZER_SUFFIXES, TOKENIZER_INFIXES
 from .stop_words import STOP_WORDS
 from ...language import Language
+from ...lookups import Lookups
+from ...pipeline import Lemmatizer
 
 
 class BengaliDefaults(Language.Defaults):
@@ -17,4 +21,22 @@ class Bengali(Language):
     Defaults = BengaliDefaults
 
 
+@Bengali.factory(
+    "lemmatizer",
+    assigns=["token.lemma"],
+    default_config={"model": None, "mode": "rule", "lookups": None},
+    scores=["lemma_acc"],
+    default_score_weights={"lemma_acc": 1.0},
+)
+def make_lemmatizer(
+    nlp: Language,
+    model: Optional[Model],
+    name: str,
+    mode: str,
+    lookups: Optional[Lookups],
+):
+    lookups = Lemmatizer.load_lookups(nlp.lang, mode, lookups)
+    return Lemmatizer(nlp.vocab, model, name, mode=mode, lookups=lookups)
+
+
 __all__ = ["Bengali"]
diff --git a/spacy/lang/fa/__init__.py b/spacy/lang/fa/__init__.py
index 7fdb9d065..244534120 100644
--- a/spacy/lang/fa/__init__.py
+++ b/spacy/lang/fa/__init__.py
@@ -1,9 +1,13 @@
+from typing import Optional
+from thinc.api import Model
 from .stop_words import STOP_WORDS
 from .lex_attrs import LEX_ATTRS
 from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
 from .punctuation import TOKENIZER_SUFFIXES
 from .syntax_iterators import SYNTAX_ITERATORS
 from ...language import Language
+from ...lookups import Lookups
+from ...pipeline import Lemmatizer
 
 
 class PersianDefaults(Language.Defaults):
@@ -20,4 +24,22 @@ class Persian(Language):
     Defaults = PersianDefaults
 
 
+@Persian.factory(
+    "lemmatizer",
+    assigns=["token.lemma"],
+    default_config={"model": None, "mode": "rule", "lookups": None},
+    scores=["lemma_acc"],
+    default_score_weights={"lemma_acc": 1.0},
+)
+def make_lemmatizer(
+    nlp: Language,
+    model: Optional[Model],
+    name: str,
+    mode: str,
+    lookups: Optional[Lookups],
+):
+    lookups = Lemmatizer.load_lookups(nlp.lang, mode, lookups)
+    return Lemmatizer(nlp.vocab, model, name, mode=mode, lookups=lookups)
+
+
 __all__ = ["Persian"]
diff --git a/spacy/lang/nb/__init__.py b/spacy/lang/nb/__init__.py
index d2bb92072..28a2f0bf2 100644
--- a/spacy/lang/nb/__init__.py
+++ b/spacy/lang/nb/__init__.py
@@ -1,9 +1,13 @@
+from typing import Optional
+from thinc.api import Model
 from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
 from .punctuation import TOKENIZER_PREFIXES, TOKENIZER_INFIXES
 from .punctuation import TOKENIZER_SUFFIXES
 from .stop_words import STOP_WORDS
 from .syntax_iterators import SYNTAX_ITERATORS
 from ...language import Language
+from ...lookups import Lookups
+from ...pipeline import Lemmatizer
 
 
 class NorwegianDefaults(Language.Defaults):
@@ -20,4 +24,22 @@ class Norwegian(Language):
     Defaults = NorwegianDefaults
 
 
+@Norwegian.factory(
+    "lemmatizer",
+    assigns=["token.lemma"],
+    default_config={"model": None, "mode": "rule", "lookups": None},
+    scores=["lemma_acc"],
+    default_score_weights={"lemma_acc": 1.0},
+)
+def make_lemmatizer(
+    nlp: Language,
+    model: Optional[Model],
+    name: str,
+    mode: str,
+    lookups: Optional[Lookups],
+):
+    lookups = Lemmatizer.load_lookups(nlp.lang, mode, lookups)
+    return Lemmatizer(nlp.vocab, model, name, mode=mode, lookups=lookups)
+
+
 __all__ = ["Norwegian"]
diff --git a/spacy/lang/sv/__init__.py b/spacy/lang/sv/__init__.py
index 0c6a1b9f4..6db74cd39 100644
--- a/spacy/lang/sv/__init__.py
+++ b/spacy/lang/sv/__init__.py
@@ -1,8 +1,13 @@
+from typing import Optional
+from thinc.api import Model
 from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
 from .stop_words import STOP_WORDS
 from .lex_attrs import LEX_ATTRS
 from .syntax_iterators import SYNTAX_ITERATORS
 from ...language import Language
+from ...lookups import Lookups
+from ...pipeline import Lemmatizer
+
 
 # Punctuation stolen from Danish
 from ..da.punctuation import TOKENIZER_INFIXES, TOKENIZER_SUFFIXES
@@ -22,4 +27,22 @@ class Swedish(Language):
     Defaults = SwedishDefaults
 
 
+@Swedish.factory(
+    "lemmatizer",
+    assigns=["token.lemma"],
+    default_config={"model": None, "mode": "rule", "lookups": None},
+    scores=["lemma_acc"],
+    default_score_weights={"lemma_acc": 1.0},
+)
+def make_lemmatizer(
+    nlp: Language,
+    model: Optional[Model],
+    name: str,
+    mode: str,
+    lookups: Optional[Lookups],
+):
+    lookups = Lemmatizer.load_lookups(nlp.lang, mode, lookups)
+    return Lemmatizer(nlp.vocab, model, name, mode=mode, lookups=lookups)
+
+
 __all__ = ["Swedish"]
diff --git a/spacy/tests/lang/test_lemmatizers.py b/spacy/tests/lang/test_lemmatizers.py
index 14c59659a..6e7f82341 100644
--- a/spacy/tests/lang/test_lemmatizers.py
+++ b/spacy/tests/lang/test_lemmatizers.py
@@ -8,7 +8,7 @@ from spacy.util import get_lang_class
 # Only include languages with no external dependencies
 # excluded: ru, uk
 # excluded for custom tables: pl
-LANGUAGES = ["el", "en", "fr", "nl"]
+LANGUAGES = ["bn", "el", "en", "fa", "fr", "nb", "nl", "sv"]
 # fmt: on
 
 