Mirror of https://github.com/explosion/spaCy.git
Set rule-based lemmatizers as default (#6076)
For languages without provided models and with lemmatizer rules in `spacy-lookups-data`, make the rule-based lemmatizer the default: Bengali, Persian, Norwegian, Swedish
This commit is contained in:
parent 4d75040546
commit 87c329c711
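In practice, creating a blank pipeline for one of these languages and adding a `lemmatizer` component now resolves to the rule-based factory shown in the diffs below, which loads its tables at construction time. A minimal usage sketch, assuming `spacy-lookups-data` is installed (the Swedish sentence is only an illustration):

import spacy

# After this commit, the "lemmatizer" factory for sv defaults to mode="rule";
# Lemmatizer.load_lookups pulls the rule tables from spacy-lookups-data,
# so that package must be installed for this to work.
nlp = spacy.blank("sv")
nlp.add_pipe("lemmatizer")

# Rule-based lemmatization consults coarse POS tags, so in a full pipeline
# a tagger or morphologizer should run before the lemmatizer.
doc = nlp("Jag såg katterna")
print([t.lemma_ for t in doc])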
spacy/lang/bn/__init__.py
@@ -1,7 +1,11 @@
+from typing import Optional
+from thinc.api import Model
 from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
 from .punctuation import TOKENIZER_PREFIXES, TOKENIZER_SUFFIXES, TOKENIZER_INFIXES
 from .stop_words import STOP_WORDS
 from ...language import Language
+from ...lookups import Lookups
+from ...pipeline import Lemmatizer


 class BengaliDefaults(Language.Defaults):
@@ -17,4 +21,22 @@ class Bengali(Language):
     Defaults = BengaliDefaults


+@Bengali.factory(
+    "lemmatizer",
+    assigns=["token.lemma"],
+    default_config={"model": None, "mode": "rule", "lookups": None},
+    scores=["lemma_acc"],
+    default_score_weights={"lemma_acc": 1.0},
+)
+def make_lemmatizer(
+    nlp: Language,
+    model: Optional[Model],
+    name: str,
+    mode: str,
+    lookups: Optional[Lookups],
+):
+    lookups = Lemmatizer.load_lookups(nlp.lang, mode, lookups)
+    return Lemmatizer(nlp.vocab, model, name, mode=mode, lookups=lookups)
+
+
 __all__ = ["Bengali"]
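The factory registered above repeats identically for Persian, Norwegian, and Swedish below: `default_config` pins `mode="rule"` with no explicit lookups, and `Lemmatizer.load_lookups` resolves whatever tables that mode needs from `spacy-lookups-data`. The default can still be overridden per pipeline through the normal `add_pipe` config mechanism; a minimal sketch, assuming the language also ships a `lemma_lookup` table:

import spacy

nlp = spacy.blank("bn")
# Swap the new rule-based default back to plain table lookup; load_lookups
# then fetches the "lemma_lookup" table instead of the rule tables.
nlp.add_pipe("lemmatizer", config={"mode": "lookup"})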
spacy/lang/fa/__init__.py
@@ -1,9 +1,13 @@
+from typing import Optional
+from thinc.api import Model
 from .stop_words import STOP_WORDS
 from .lex_attrs import LEX_ATTRS
 from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
 from .punctuation import TOKENIZER_SUFFIXES
 from .syntax_iterators import SYNTAX_ITERATORS
 from ...language import Language
+from ...lookups import Lookups
+from ...pipeline import Lemmatizer


 class PersianDefaults(Language.Defaults):
@@ -20,4 +24,22 @@ class Persian(Language):
     Defaults = PersianDefaults


+@Persian.factory(
+    "lemmatizer",
+    assigns=["token.lemma"],
+    default_config={"model": None, "mode": "rule", "lookups": None},
+    scores=["lemma_acc"],
+    default_score_weights={"lemma_acc": 1.0},
+)
+def make_lemmatizer(
+    nlp: Language,
+    model: Optional[Model],
+    name: str,
+    mode: str,
+    lookups: Optional[Lookups],
+):
+    lookups = Lemmatizer.load_lookups(nlp.lang, mode, lookups)
+    return Lemmatizer(nlp.vocab, model, name, mode=mode, lookups=lookups)
+
+
 __all__ = ["Persian"]
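One detail worth noting across all four factories: `assigns=["token.lemma"]` feeds spaCy's pipe analysis, so component-ordering problems (for instance, a rule lemmatizer reading POS tags that no earlier component sets) can be surfaced. A sketch using the standard analysis API:

import spacy

nlp = spacy.blank("fa")
nlp.add_pipe("lemmatizer")
# Prints a table of which attributes each component assigns and requires,
# and returns the same information as a dict.
analysis = nlp.analyze_pipes(pretty=True)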
spacy/lang/nb/__init__.py
@@ -1,9 +1,13 @@
+from typing import Optional
+from thinc.api import Model
 from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
 from .punctuation import TOKENIZER_PREFIXES, TOKENIZER_INFIXES
 from .punctuation import TOKENIZER_SUFFIXES
 from .stop_words import STOP_WORDS
 from .syntax_iterators import SYNTAX_ITERATORS
 from ...language import Language
+from ...lookups import Lookups
+from ...pipeline import Lemmatizer


 class NorwegianDefaults(Language.Defaults):
@@ -20,4 +24,22 @@ class Norwegian(Language):
     Defaults = NorwegianDefaults


+@Norwegian.factory(
+    "lemmatizer",
+    assigns=["token.lemma"],
+    default_config={"model": None, "mode": "rule", "lookups": None},
+    scores=["lemma_acc"],
+    default_score_weights={"lemma_acc": 1.0},
+)
+def make_lemmatizer(
+    nlp: Language,
+    model: Optional[Model],
+    name: str,
+    mode: str,
+    lookups: Optional[Lookups],
+):
+    lookups = Lemmatizer.load_lookups(nlp.lang, mode, lookups)
+    return Lemmatizer(nlp.vocab, model, name, mode=mode, lookups=lookups)
+
+
 __all__ = ["Norwegian"]
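Each factory also declares `scores=["lemma_acc"]` with a default weight of 1.0, so lemma accuracy is reported at evaluation time and feeds the combined pipeline score. A hedged sketch of reading the metric back through the v3 evaluation API (the Norwegian example pair is invented for illustration):

from spacy.lang.nb import Norwegian
from spacy.training import Example

nlp = Norwegian()
nlp.add_pipe("lemmatizer")  # the rule-based default registered above

# Pair a predicted doc with reference lemma annotations.
example = Example.from_dict(
    nlp.make_doc("hunden løper"), {"lemmas": ["hund", "løpe"]}
)
scores = nlp.evaluate([example])  # returns a dict of metrics
print(scores["lemma_acc"])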
spacy/lang/sv/__init__.py
@@ -1,8 +1,13 @@
+from typing import Optional
+from thinc.api import Model
 from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
 from .stop_words import STOP_WORDS
 from .lex_attrs import LEX_ATTRS
 from .syntax_iterators import SYNTAX_ITERATORS
 from ...language import Language
+from ...lookups import Lookups
+from ...pipeline import Lemmatizer
+

 # Punctuation stolen from Danish
 from ..da.punctuation import TOKENIZER_INFIXES, TOKENIZER_SUFFIXES
@@ -22,4 +27,22 @@ class Swedish(Language):
     Defaults = SwedishDefaults


+@Swedish.factory(
+    "lemmatizer",
+    assigns=["token.lemma"],
+    default_config={"model": None, "mode": "rule", "lookups": None},
+    scores=["lemma_acc"],
+    default_score_weights={"lemma_acc": 1.0},
+)
+def make_lemmatizer(
+    nlp: Language,
+    model: Optional[Model],
+    name: str,
+    mode: str,
+    lookups: Optional[Lookups],
+):
+    lookups = Lemmatizer.load_lookups(nlp.lang, mode, lookups)
+    return Lemmatizer(nlp.vocab, model, name, mode=mode, lookups=lookups)
+
+
 __all__ = ["Swedish"]
spacy/tests/lang/test_lemmatizers.py
@@ -8,7 +8,7 @@ from spacy.util import get_lang_class
 # Only include languages with no external dependencies
 # excluded: ru, uk
 # excluded for custom tables: pl
-LANGUAGES = ["el", "en", "fr", "nl"]
+LANGUAGES = ["bn", "el", "en", "fa", "fr", "nb", "nl", "sv"]
 # fmt: on


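Widening LANGUAGES means the shared lemmatizer tests now also run for bn, fa, nb, and sv. The test body is outside this diff; the following is only a hypothetical sketch of the parametrized shape such a test takes (the test name and assertion are stand-ins):

import pytest
from spacy.util import get_lang_class

LANGUAGES = ["bn", "el", "en", "fa", "fr", "nb", "nl", "sv"]

@pytest.mark.parametrize("lang", LANGUAGES)
def test_lemmatizer_init(lang):
    # Instantiating the language and adding its lemmatizer exercises
    # Lemmatizer.load_lookups against the installed spacy-lookups-data.
    nlp = get_lang_class(lang)()
    lemmatizer = nlp.add_pipe("lemmatizer")
    assert lemmatizer.mode in ("rule", "lookup")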