mirror of
https://github.com/explosion/spaCy.git
synced 2025-01-11 17:56:30 +03:00
Set rule-based lemmatizers as default (#6076)
For languages without provided models and with lemmatizer rules in `spacy-lookups-data`, make the rule-based lemmatizer the default: Bengali, Persian, Norwegian, Swedish
This commit is contained in:
parent
4d75040546
commit
87c329c711
|
@ -1,7 +1,11 @@
|
|||
from typing import Optional
|
||||
from thinc.api import Model
|
||||
from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
|
||||
from .punctuation import TOKENIZER_PREFIXES, TOKENIZER_SUFFIXES, TOKENIZER_INFIXES
|
||||
from .stop_words import STOP_WORDS
|
||||
from ...language import Language
|
||||
from ...lookups import Lookups
|
||||
from ...pipeline import Lemmatizer
|
||||
|
||||
|
||||
class BengaliDefaults(Language.Defaults):
|
||||
|
@ -17,4 +21,22 @@ class Bengali(Language):
|
|||
Defaults = BengaliDefaults
|
||||
|
||||
|
||||
@Bengali.factory(
|
||||
"lemmatizer",
|
||||
assigns=["token.lemma"],
|
||||
default_config={"model": None, "mode": "rule", "lookups": None},
|
||||
scores=["lemma_acc"],
|
||||
default_score_weights={"lemma_acc": 1.0},
|
||||
)
|
||||
def make_lemmatizer(
|
||||
nlp: Language,
|
||||
model: Optional[Model],
|
||||
name: str,
|
||||
mode: str,
|
||||
lookups: Optional[Lookups],
|
||||
):
|
||||
lookups = Lemmatizer.load_lookups(nlp.lang, mode, lookups)
|
||||
return Lemmatizer(nlp.vocab, model, name, mode=mode, lookups=lookups)
|
||||
|
||||
|
||||
__all__ = ["Bengali"]
|
||||
|
|
|
@ -1,9 +1,13 @@
|
|||
from typing import Optional
|
||||
from thinc.api import Model
|
||||
from .stop_words import STOP_WORDS
|
||||
from .lex_attrs import LEX_ATTRS
|
||||
from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
|
||||
from .punctuation import TOKENIZER_SUFFIXES
|
||||
from .syntax_iterators import SYNTAX_ITERATORS
|
||||
from ...language import Language
|
||||
from ...lookups import Lookups
|
||||
from ...pipeline import Lemmatizer
|
||||
|
||||
|
||||
class PersianDefaults(Language.Defaults):
|
||||
|
@ -20,4 +24,22 @@ class Persian(Language):
|
|||
Defaults = PersianDefaults
|
||||
|
||||
|
||||
@Persian.factory(
|
||||
"lemmatizer",
|
||||
assigns=["token.lemma"],
|
||||
default_config={"model": None, "mode": "rule", "lookups": None},
|
||||
scores=["lemma_acc"],
|
||||
default_score_weights={"lemma_acc": 1.0},
|
||||
)
|
||||
def make_lemmatizer(
|
||||
nlp: Language,
|
||||
model: Optional[Model],
|
||||
name: str,
|
||||
mode: str,
|
||||
lookups: Optional[Lookups],
|
||||
):
|
||||
lookups = Lemmatizer.load_lookups(nlp.lang, mode, lookups)
|
||||
return Lemmatizer(nlp.vocab, model, name, mode=mode, lookups=lookups)
|
||||
|
||||
|
||||
__all__ = ["Persian"]
|
||||
|
|
|
@ -1,9 +1,13 @@
|
|||
from typing import Optional
|
||||
from thinc.api import Model
|
||||
from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
|
||||
from .punctuation import TOKENIZER_PREFIXES, TOKENIZER_INFIXES
|
||||
from .punctuation import TOKENIZER_SUFFIXES
|
||||
from .stop_words import STOP_WORDS
|
||||
from .syntax_iterators import SYNTAX_ITERATORS
|
||||
from ...language import Language
|
||||
from ...lookups import Lookups
|
||||
from ...pipeline import Lemmatizer
|
||||
|
||||
|
||||
class NorwegianDefaults(Language.Defaults):
|
||||
|
@ -20,4 +24,22 @@ class Norwegian(Language):
|
|||
Defaults = NorwegianDefaults
|
||||
|
||||
|
||||
@Norwegian.factory(
|
||||
"lemmatizer",
|
||||
assigns=["token.lemma"],
|
||||
default_config={"model": None, "mode": "rule", "lookups": None},
|
||||
scores=["lemma_acc"],
|
||||
default_score_weights={"lemma_acc": 1.0},
|
||||
)
|
||||
def make_lemmatizer(
|
||||
nlp: Language,
|
||||
model: Optional[Model],
|
||||
name: str,
|
||||
mode: str,
|
||||
lookups: Optional[Lookups],
|
||||
):
|
||||
lookups = Lemmatizer.load_lookups(nlp.lang, mode, lookups)
|
||||
return Lemmatizer(nlp.vocab, model, name, mode=mode, lookups=lookups)
|
||||
|
||||
|
||||
__all__ = ["Norwegian"]
|
||||
|
|
|
@ -1,8 +1,13 @@
|
|||
from typing import Optional
|
||||
from thinc.api import Model
|
||||
from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
|
||||
from .stop_words import STOP_WORDS
|
||||
from .lex_attrs import LEX_ATTRS
|
||||
from .syntax_iterators import SYNTAX_ITERATORS
|
||||
from ...language import Language
|
||||
from ...lookups import Lookups
|
||||
from ...pipeline import Lemmatizer
|
||||
|
||||
|
||||
# Punctuation stolen from Danish
|
||||
from ..da.punctuation import TOKENIZER_INFIXES, TOKENIZER_SUFFIXES
|
||||
|
@ -22,4 +27,22 @@ class Swedish(Language):
|
|||
Defaults = SwedishDefaults
|
||||
|
||||
|
||||
@Swedish.factory(
|
||||
"lemmatizer",
|
||||
assigns=["token.lemma"],
|
||||
default_config={"model": None, "mode": "rule", "lookups": None},
|
||||
scores=["lemma_acc"],
|
||||
default_score_weights={"lemma_acc": 1.0},
|
||||
)
|
||||
def make_lemmatizer(
|
||||
nlp: Language,
|
||||
model: Optional[Model],
|
||||
name: str,
|
||||
mode: str,
|
||||
lookups: Optional[Lookups],
|
||||
):
|
||||
lookups = Lemmatizer.load_lookups(nlp.lang, mode, lookups)
|
||||
return Lemmatizer(nlp.vocab, model, name, mode=mode, lookups=lookups)
|
||||
|
||||
|
||||
__all__ = ["Swedish"]
|
||||
|
|
|
@ -8,7 +8,7 @@ from spacy.util import get_lang_class
|
|||
# Only include languages with no external dependencies
|
||||
# excluded: ru, uk
|
||||
# excluded for custom tables: pl
|
||||
LANGUAGES = ["el", "en", "fr", "nl"]
|
||||
LANGUAGES = ["bn", "el", "en", "fa", "fr", "nb", "nl", "sv"]
|
||||
# fmt: on
|
||||
|
||||
|
||||
|
|
Loading…
Reference in New Issue
Block a user