diff --git a/spacy/default_config.cfg b/spacy/default_config.cfg index 21dbf1798..7ba008fb6 100644 --- a/spacy/default_config.cfg +++ b/spacy/default_config.cfg @@ -1,24 +1,13 @@ [nlp] lang = null -stop_words = [] -lex_attr_getters = {} vocab_data = {} -get_noun_chunks = null pipeline = [] [nlp.tokenizer] @tokenizers = "spacy.Tokenizer.v1" -token_match = null -url_match = {"@language_data": "spacy.xx.url_match"} [nlp.lemmatizer] @lemmatizers = "spacy.Lemmatizer.v1" -data = {} - -[nlp.writing_system] -direction = "ltr" -has_case = true -has_letters = true [components] diff --git a/spacy/lang/af/__init__.py b/spacy/lang/af/__init__.py index ee187ae5a..91917daee 100644 --- a/spacy/lang/af/__init__.py +++ b/spacy/lang/af/__init__.py @@ -1,26 +1,14 @@ -from typing import Set -from thinc.api import Config - from .stop_words import STOP_WORDS from ...language import Language -from ...util import registry -DEFAULT_CONFIG = """ -[nlp] -lang = "af" -stop_words = {"@language_data": "spacy.af.stop_words"} -""" - - -@registry.language_data("spacy.af.stop_words") -def stop_words() -> Set[str]: - return STOP_WORDS +class AfrikaansDefaults(Language.Defaults): + stop_words = STOP_WORDS class Afrikaans(Language): lang = "af" - default_config = Config().from_str(DEFAULT_CONFIG) + Defaults = AfrikaansDefaults __all__ = ["Afrikaans"] diff --git a/spacy/lang/ar/__init__.py b/spacy/lang/ar/__init__.py index f387d0310..6abb65efb 100644 --- a/spacy/lang/ar/__init__.py +++ b/spacy/lang/ar/__init__.py @@ -1,46 +1,21 @@ -from typing import Set, Dict, Callable, Any -from thinc.api import Config - from .stop_words import STOP_WORDS from .lex_attrs import LEX_ATTRS from .punctuation import TOKENIZER_SUFFIXES from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS from ...language import Language -from ...util import registry - - -DEFAULT_CONFIG = """ -[nlp] -lang = "ar" -stop_words = {"@language_data": "spacy.ar.stop_words"} -lex_attr_getters = {"@language_data": "spacy.ar.lex_attr_getters"} - -[nlp.writing_system] -direction = "rtl" -has_case = false -has_letters = true -""" - - -@registry.language_data("spacy.ar.stop_words") -def stop_words() -> Set[str]: - return STOP_WORDS - - -@registry.language_data("spacy.ar.lex_attr_getters") -def lex_attr_getters() -> Dict[int, Callable[[str], Any]]: - return LEX_ATTRS class ArabicDefaults(Language.Defaults): tokenizer_exceptions = TOKENIZER_EXCEPTIONS suffixes = TOKENIZER_SUFFIXES + stop_words = STOP_WORDS + lex_attr_getters = LEX_ATTRS + writing_system = {"direction": "rtl", "has_case": False, "has_letters": True} class Arabic(Language): - lang = "ar" Defaults = ArabicDefaults - default_config = Config().from_str(DEFAULT_CONFIG) + lang = "ar" __all__ = ["Arabic"] diff --git a/spacy/lang/bg/__init__.py b/spacy/lang/bg/__init__.py index 4a31a3653..a30f49ce7 100644 --- a/spacy/lang/bg/__init__.py +++ b/spacy/lang/bg/__init__.py @@ -1,26 +1,14 @@ -from typing import Set -from thinc.api import Config - from .stop_words import STOP_WORDS from ...language import Language -from ...util import registry -DEFAULT_CONFIG = """ -[nlp] -lang = "bg" -stop_words = {"@language_data": "spacy.bg.stop_words"} -""" - - -@registry.language_data("spacy.bg.stop_words") -def stop_words() -> Set[str]: - return STOP_WORDS +class BulgarianDefaults(Language.Defaults): + stop_words = STOP_WORDS class Bulgarian(Language): lang = "bg" - default_config = Config().from_str(DEFAULT_CONFIG) + Defaults = BulgarianDefaults __all__ = ["Bulgarian"] diff --git a/spacy/lang/bn/__init__.py b/spacy/lang/bn/__init__.py 
index da2ca0c8d..6c1d66cba 100644 --- a/spacy/lang/bn/__init__.py +++ b/spacy/lang/bn/__init__.py @@ -1,31 +1,7 @@ -from typing import Set -from thinc.api import Config - from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS from .punctuation import TOKENIZER_PREFIXES, TOKENIZER_SUFFIXES, TOKENIZER_INFIXES from .stop_words import STOP_WORDS from ...language import Language -from ...util import registry - - -DEFAULT_CONFIG = """ -[nlp] -lang = "bn" -stop_words = {"@language_data": "spacy.bn.stop_words"} - -[nlp.lemmatizer] -@lemmatizers = "spacy.Lemmatizer.v1" - -[nlp.lemmatizer.data] -@language_data = "spacy-lookups-data" -lang = ${nlp:lang} -tables = ["lemma_rules"] -""" - - -@registry.language_data("spacy.bn.stop_words") -def stop_words() -> Set[str]: - return STOP_WORDS class BengaliDefaults(Language.Defaults): @@ -33,12 +9,12 @@ class BengaliDefaults(Language.Defaults): prefixes = TOKENIZER_PREFIXES suffixes = TOKENIZER_SUFFIXES infixes = TOKENIZER_INFIXES + stop_words = STOP_WORDS class Bengali(Language): lang = "bn" Defaults = BengaliDefaults - default_config = Config().from_str(DEFAULT_CONFIG) __all__ = ["Bengali"] diff --git a/spacy/lang/ca/__init__.py b/spacy/lang/ca/__init__.py index 1fe7516ad..970b23c1e 100644 --- a/spacy/lang/ca/__init__.py +++ b/spacy/lang/ca/__init__.py @@ -1,49 +1,20 @@ -from typing import Set, Dict, Callable, Any -from thinc.api import Config - from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS +from .punctuation import TOKENIZER_INFIXES from .stop_words import STOP_WORDS from .lex_attrs import LEX_ATTRS from ...language import Language -from ...util import registry -from .punctuation import TOKENIZER_INFIXES - - -DEFAULT_CONFIG = """ -[nlp] -lang = "ca" -stop_words = {"@language_data": "spacy.ca.stop_words"} -lex_attr_getters = {"@language_data": "spacy.ca.lex_attr_getters"} - -[nlp.lemmatizer] -@lemmatizers = "spacy.Lemmatizer.v1" - -[nlp.lemmatizer.data] -@language_data = "spacy-lookups-data" -lang = ${nlp:lang} -tables = ["lemma_lookup"] -""" - - -@registry.language_data("spacy.ca.stop_words") -def stop_words() -> Set[str]: - return STOP_WORDS - - -@registry.language_data("spacy.ca.lex_attr_getters") -def lex_attr_getters() -> Dict[int, Callable[[str], Any]]: - return LEX_ATTRS class CatalanDefaults(Language.Defaults): tokenizer_exceptions = TOKENIZER_EXCEPTIONS infixes = TOKENIZER_INFIXES + stop_words = STOP_WORDS + lex_attr_getters = LEX_ATTRS class Catalan(Language): lang = "ca" Defaults = CatalanDefaults - default_config = Config().from_str(DEFAULT_CONFIG) __all__ = ["Catalan"] diff --git a/spacy/lang/cs/__init__.py b/spacy/lang/cs/__init__.py index f424c83fa..a4b546b13 100644 --- a/spacy/lang/cs/__init__.py +++ b/spacy/lang/cs/__init__.py @@ -1,26 +1,14 @@ -from typing import Set -from thinc.api import Config - from .stop_words import STOP_WORDS from ...language import Language -from ...util import registry -DEFAULT_CONFIG = """ -[nlp] -lang = "cs" -stop_words = {"@language_data": "spacy.cs.stop_words"} -""" - - -@registry.language_data("spacy.cs.stop_words") -def stop_words() -> Set[str]: - return STOP_WORDS +class CzechDefaults(Language.Defaults): + stop_words = STOP_WORDS class Czech(Language): lang = "cs" - default_config = Config().from_str(DEFAULT_CONFIG) + Defaults = CzechDefaults __all__ = ["Czech"] diff --git a/spacy/lang/da/__init__.py b/spacy/lang/da/__init__.py index 4e6ee9383..8cac30b26 100644 --- a/spacy/lang/da/__init__.py +++ b/spacy/lang/da/__init__.py @@ -1,55 +1,21 @@ -from typing import Set, Dict, Callable, Any -from 
thinc.api import Config - from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS from .punctuation import TOKENIZER_INFIXES, TOKENIZER_SUFFIXES from .stop_words import STOP_WORDS from .lex_attrs import LEX_ATTRS from ...language import Language -from ...util import registry - - -DEFAULT_CONFIG = """ -[nlp] -lang = "da" -stop_words = {"@language_data": "spacy.da.stop_words"} -lex_attr_getters = {"@language_data": "spacy.da.lex_attr_getters"} - -[nlp.lemmatizer] -@lemmatizers = "spacy.Lemmatizer.v1" - -[nlp.lemmatizer.data] -@language_data = "spacy-lookups-data" -lang = ${nlp:lang} -tables = ["lemma_lookup"] - -[nlp.vocab_data] -@language_data = "spacy-lookups-data" -lang = ${nlp:lang} -tables = ["lexeme_norm"] -""" - - -@registry.language_data("spacy.da.stop_words") -def stop_words() -> Set[str]: - return STOP_WORDS - - -@registry.language_data("spacy.da.lex_attr_getters") -def lex_attr_getters() -> Dict[int, Callable[[str], Any]]: - return LEX_ATTRS class DanishDefaults(Language.Defaults): tokenizer_exceptions = TOKENIZER_EXCEPTIONS infixes = TOKENIZER_INFIXES suffixes = TOKENIZER_SUFFIXES + lex_attr_getters = LEX_ATTRS + stop_words = STOP_WORDS class Danish(Language): lang = "da" Defaults = DanishDefaults - default_config = Config().from_str(DEFAULT_CONFIG) __all__ = ["Danish"] diff --git a/spacy/lang/de/__init__.py b/spacy/lang/de/__init__.py index 58ee71247..b645d3480 100644 --- a/spacy/lang/de/__init__.py +++ b/spacy/lang/de/__init__.py @@ -1,44 +1,8 @@ -from typing import Set, Callable -from thinc.api import Config - from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS -from .punctuation import TOKENIZER_PREFIXES, TOKENIZER_SUFFIXES -from .punctuation import TOKENIZER_INFIXES +from .punctuation import TOKENIZER_PREFIXES, TOKENIZER_SUFFIXES, TOKENIZER_INFIXES from .stop_words import STOP_WORDS -from .syntax_iterators import noun_chunks +from .syntax_iterators import SYNTAX_ITERATORS from ...language import Language -from ...util import registry - - -DEFAULT_CONFIG = """ -[nlp] -lang = "de" -stop_words = {"@language_data": "spacy.de.stop_words"} -get_noun_chunks = {"@language_data": "spacy.de.get_noun_chunks"} - -[nlp.lemmatizer] -@lemmatizers = "spacy.Lemmatizer.v1" - -[nlp.lemmatizer.data] -@language_data = "spacy-lookups-data" -lang = ${nlp:lang} -tables = ["lemma_lookup"] - -[nlp.vocab_data] -@language_data = "spacy-lookups-data" -lang = ${nlp:lang} -tables = ["lexeme_norm", "lexeme_cluster", "lexeme_prob", "lexeme_settings", "orth_variants"] -""" - - -@registry.language_data("spacy.de.stop_words") -def stop_words() -> Set[str]: - return STOP_WORDS - - -@registry.language_data("spacy.de.get_noun_chunks") -def get_noun_chunks() -> Callable: - return noun_chunks class GermanDefaults(Language.Defaults): @@ -46,12 +10,13 @@ class GermanDefaults(Language.Defaults): prefixes = TOKENIZER_PREFIXES suffixes = TOKENIZER_SUFFIXES infixes = TOKENIZER_INFIXES + syntax_iterators = SYNTAX_ITERATORS + stop_words = STOP_WORDS class German(Language): lang = "de" Defaults = GermanDefaults - default_config = Config().from_str(DEFAULT_CONFIG) __all__ = ["German"] diff --git a/spacy/lang/defaults.py b/spacy/lang/defaults.py deleted file mode 100644 index 6d692d6a5..000000000 --- a/spacy/lang/defaults.py +++ /dev/null @@ -1,9 +0,0 @@ -from typing import Pattern - -from .tokenizer_exceptions import URL_MATCH -from ..util import registry - - -@registry.language_data("spacy.xx.url_match") -def url_match() -> Pattern: - return URL_MATCH diff --git a/spacy/lang/el/__init__.py b/spacy/lang/el/__init__.py 
index defe53891..c766c375e 100644 --- a/spacy/lang/el/__init__.py +++ b/spacy/lang/el/__init__.py @@ -1,69 +1,50 @@ -from typing import Set, Dict, Callable, Any +from typing import Callable from thinc.api import Config from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS from .stop_words import STOP_WORDS from .lex_attrs import LEX_ATTRS from .lemmatizer import GreekLemmatizer -from .syntax_iterators import noun_chunks +from .syntax_iterators import SYNTAX_ITERATORS from .punctuation import TOKENIZER_PREFIXES, TOKENIZER_SUFFIXES, TOKENIZER_INFIXES +from ...lookups import load_lookups from ...language import Language from ...util import registry DEFAULT_CONFIG = """ [nlp] -lang = "el" -stop_words = {"@language_data": "spacy.el.stop_words"} -lex_attr_getters = {"@language_data": "spacy.el.lex_attr_getters"} -get_noun_chunks = {"@language_data": "spacy.el.get_noun_chunks"} [nlp.lemmatizer] -@lemmatizers = "spacy.GreekLemmatizer.v1" - -[nlp.lemmatizer.data] -@language_data = "spacy-lookups-data" -lang = ${nlp:lang} -tables = ["lemma_index", "lemma_exc", "lemma_rules"] - -[nlp.vocab_data] -@language_data = "spacy-lookups-data" -lang = ${nlp:lang} -tables = ["lexeme_norm", "lexeme_prob", "lexeme_settings"] +@lemmatizers = "spacy.el.GreekLemmatizer" """ -@registry.lemmatizers("spacy.GreekLemmatizer.v1") -def create_greek_lemmatizer(data: Dict[str, dict] = {}) -> GreekLemmatizer: - return GreekLemmatizer(data=data) +@registry.lemmatizers("spacy.el.GreekLemmatizer") +def create_lemmatizer() -> Callable[[Language], GreekLemmatizer]: + tables = ["lemma_index", "lemma_exc", "lemma_rules"] + def lemmatizer_factory(nlp: Language) -> GreekLemmatizer: + lookups = load_lookups(lang=nlp.lang, tables=tables) + return GreekLemmatizer(lookups=lookups) -@registry.language_data("spacy.el.get_noun_chunks") -def get_noun_chunks() -> Callable: - return noun_chunks - - -@registry.language_data("spacy.el.stop_words") -def stop_words() -> Set[str]: - return STOP_WORDS - - -@registry.language_data("spacy.el.lex_attr_getters") -def lex_attr_getters() -> Dict[int, Callable[[str], Any]]: - return LEX_ATTRS + return lemmatizer_factory class GreekDefaults(Language.Defaults): + config = Config().from_str(DEFAULT_CONFIG) tokenizer_exceptions = TOKENIZER_EXCEPTIONS prefixes = TOKENIZER_PREFIXES suffixes = TOKENIZER_SUFFIXES infixes = TOKENIZER_INFIXES + lex_attr_getters = LEX_ATTRS + stop_words = STOP_WORDS + syntax_iterators = SYNTAX_ITERATORS class Greek(Language): lang = "el" Defaults = GreekDefaults - default_config = Config().from_str(DEFAULT_CONFIG) __all__ = ["Greek"] diff --git a/spacy/lang/en/__init__.py b/spacy/lang/en/__init__.py index ebe2d1d53..81200da27 100644 --- a/spacy/lang/en/__init__.py +++ b/spacy/lang/en/__init__.py @@ -1,68 +1,49 @@ -from typing import Set, Dict, Callable, Any +from typing import Callable from thinc.api import Config from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS from .stop_words import STOP_WORDS from .lex_attrs import LEX_ATTRS -from .syntax_iterators import noun_chunks +from .syntax_iterators import SYNTAX_ITERATORS from .lemmatizer import is_base_form from .punctuation import TOKENIZER_INFIXES from ...language import Language from ...lemmatizer import Lemmatizer +from ...lookups import load_lookups from ...util import registry DEFAULT_CONFIG = """ [nlp] -lang = "en" -stop_words = {"@language_data": "spacy.en.stop_words"} -lex_attr_getters = {"@language_data": "spacy.en.lex_attr_getters"} -get_noun_chunks = {"@language_data": "spacy.en.get_noun_chunks"} [nlp.lemmatizer] 
-@lemmatizers = "spacy.EnglishLemmatizer.v1" - -[nlp.lemmatizer.data] -@language_data = "spacy-lookups-data" -lang = ${nlp:lang} -tables = ["lemma_lookup", "lemma_rules", "lemma_exc", "lemma_index"] - -[nlp.vocab_data] -@language_data = "spacy-lookups-data" -lang = ${nlp:lang} -tables = ["lexeme_norm", "lexeme_cluster", "lexeme_prob", "lexeme_settings", "orth_variants"] +@lemmatizers = "spacy.en.EnglishLemmatizer" """ -@registry.language_data("spacy.en.stop_words") -def stop_words() -> Set[str]: - return STOP_WORDS +@registry.lemmatizers("spacy.en.EnglishLemmatizer") +def create_lemmatizer() -> Callable[[Language], Lemmatizer]: + tables = ["lemma_lookup", "lemma_rules", "lemma_exc", "lemma_index"] + def lemmatizer_factory(nlp: Language) -> Lemmatizer: + lookups = load_lookups(lang=nlp.lang, tables=tables) + return Lemmatizer(lookups=lookups, is_base_form=is_base_form) -@registry.language_data("spacy.en.lex_attr_getters") -def lex_attr_getters() -> Dict[int, Callable[[str], Any]]: - return LEX_ATTRS - - -@registry.lemmatizers("spacy.EnglishLemmatizer.v1") -def create_lemmatizer(data: Dict[str, dict] = {}) -> "Lemmatizer": - return Lemmatizer(data=data, is_base_form=is_base_form) - - -@registry.language_data("spacy.en.get_noun_chunks") -def get_noun_chunks() -> Callable: - return noun_chunks + return lemmatizer_factory class EnglishDefaults(Language.Defaults): + config = Config().from_str(DEFAULT_CONFIG) tokenizer_exceptions = TOKENIZER_EXCEPTIONS infixes = TOKENIZER_INFIXES + lex_attr_getters = LEX_ATTRS + syntax_iterators = SYNTAX_ITERATORS + stop_words = STOP_WORDS class English(Language): lang = "en" Defaults = EnglishDefaults - default_config = Config().from_str(DEFAULT_CONFIG) __all__ = ["English"] diff --git a/spacy/lang/es/__init__.py b/spacy/lang/es/__init__.py index bc378f3db..9a47855b1 100644 --- a/spacy/lang/es/__init__.py +++ b/spacy/lang/es/__init__.py @@ -1,62 +1,23 @@ -from typing import Set, Dict, Callable, Any -from thinc.config import Config - from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS from .stop_words import STOP_WORDS from .lex_attrs import LEX_ATTRS -from .syntax_iterators import noun_chunks +from .syntax_iterators import SYNTAX_ITERATORS from .punctuation import TOKENIZER_INFIXES, TOKENIZER_SUFFIXES from ...language import Language -from ...util import registry - - -DEFAULT_CONFIG = """ -[nlp] -lang = "es" -stop_words = {"@language_data": "spacy.es.stop_words"} -lex_attr_getters = {"@language_data": "spacy.es.lex_attr_getters"} -get_noun_chunks = {"@language_data": "spacy.es.get_noun_chunks"} - -[nlp.lemmatizer] -@lemmatizers = "spacy.Lemmatizer.v1" - -[nlp.lemmatizer.data] -@language_data = "spacy-lookups-data" -lang = ${nlp:lang} -tables = ["lemma_lookup"] - -[nlp.vocab_data] -@language_data = "spacy-lookups-data" -lang = ${nlp:lang} -tables = ["lexeme_cluster", "lexeme_prob", "lexeme_settings"] -""" - - -@registry.language_data("spacy.es.get_noun_chunks") -def get_noun_chunks() -> Callable: - return noun_chunks - - -@registry.language_data("spacy.es.stop_words") -def stop_words() -> Set[str]: - return STOP_WORDS - - -@registry.language_data("spacy.es.lex_attr_getters") -def lex_attr_getters() -> Dict[int, Callable[[str], Any]]: - return LEX_ATTRS class SpanishDefaults(Language.Defaults): tokenizer_exceptions = TOKENIZER_EXCEPTIONS infixes = TOKENIZER_INFIXES suffixes = TOKENIZER_SUFFIXES + lex_attr_getters = LEX_ATTRS + syntax_iterators = SYNTAX_ITERATORS + stop_words = STOP_WORDS class Spanish(Language): lang = "es" Defaults = SpanishDefaults - 
default_config = Config().from_str(DEFAULT_CONFIG) __all__ = ["Spanish"] diff --git a/spacy/lang/et/__init__.py b/spacy/lang/et/__init__.py index 38da9ab1e..9f71882d2 100644 --- a/spacy/lang/et/__init__.py +++ b/spacy/lang/et/__init__.py @@ -1,26 +1,14 @@ -from typing import Set -from thinc.api import Config - from .stop_words import STOP_WORDS from ...language import Language -from ...util import registry -DEFAULT_CONFIG = """ -[nlp] -lang = "et" -stop_words = {"@language_data": "spacy.et.stop_words"} -""" - - -@registry.language_data("spacy.et.stop_words") -def stop_words() -> Set[str]: - return STOP_WORDS +class EstonianDefaults(Language.Defaults): + stop_words = STOP_WORDS class Estonian(Language): lang = "et" - default_config = Config().from_str(DEFAULT_CONFIG) + Defaults = EstonianDefaults __all__ = ["Estonian"] diff --git a/spacy/lang/eu/__init__.py b/spacy/lang/eu/__init__.py index 4df50bca5..89550be96 100644 --- a/spacy/lang/eu/__init__.py +++ b/spacy/lang/eu/__init__.py @@ -1,41 +1,18 @@ -from typing import Set, Dict, Callable, Any -from thinc.api import Config - from .stop_words import STOP_WORDS from .lex_attrs import LEX_ATTRS from .punctuation import TOKENIZER_SUFFIXES -from ..tokenizer_exceptions import BASE_EXCEPTIONS from ...language import Language -from ...util import registry - - -DEFAULT_CONFIG = """ -[nlp] -lang = "eu" -stop_words = {"@language_data": "spacy.eu.stop_words"} -lex_attr_getters = {"@language_data": "spacy.eu.lex_attr_getters"} -""" - - -@registry.language_data("spacy.eu.stop_words") -def stop_words() -> Set[str]: - return STOP_WORDS - - -@registry.language_data("spacy.eu.lex_attr_getters") -def lex_attr_getters() -> Dict[int, Callable[[str], Any]]: - return LEX_ATTRS class BasqueDefaults(Language.Defaults): - tokenizer_exceptions = BASE_EXCEPTIONS suffixes = TOKENIZER_SUFFIXES + stop_words = STOP_WORDS + lex_attr_getters = LEX_ATTRS class Basque(Language): lang = "eu" Defaults = BasqueDefaults - default_config = Config().from_str(DEFAULT_CONFIG) __all__ = ["Basque"] diff --git a/spacy/lang/fa/__init__.py b/spacy/lang/fa/__init__.py index a1ab0712f..7fdb9d065 100644 --- a/spacy/lang/fa/__init__.py +++ b/spacy/lang/fa/__init__.py @@ -1,61 +1,23 @@ -from typing import Set, Dict, Callable, Any -from thinc.api import Config - -from ...language import Language -from ...util import registry from .stop_words import STOP_WORDS from .lex_attrs import LEX_ATTRS from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS from .punctuation import TOKENIZER_SUFFIXES -from .syntax_iterators import noun_chunks - - -DEFAULT_CONFIG = """ -[nlp] -lang = "fa" -stop_words = {"@language_data": "spacy.fa.stop_words"} -lex_attr_getters = {"@language_data": "spacy.fa.lex_attr_getters"} -get_noun_chunks = {"@language_data": "spacy.de.get_noun_chunks"} - -[nlp.writing_system] -direction = "rtl" -has_case = false -has_letters = true - -[nlp.lemmatizer] -@lemmatizers = "spacy.Lemmatizer.v1" - -[nlp.lemmatizer.data] -@language_data = "spacy-lookups-data" -lang = ${nlp:lang} -tables = ["lemma_rules", "lemma_index", "lemma_exc"] -""" - - -@registry.language_data("spacy.fa.stop_words") -def stop_words() -> Set[str]: - return STOP_WORDS - - -@registry.language_data("spacy.fa.lex_attr_getters") -def lex_attr_getters() -> Dict[int, Callable[[str], Any]]: - return LEX_ATTRS - - -@registry.language_data("spacy.fa.get_noun_chunks") -def get_noun_chunks() -> Callable: - return noun_chunks +from .syntax_iterators import SYNTAX_ITERATORS +from ...language import Language class 
PersianDefaults(Language.Defaults): tokenizer_exceptions = TOKENIZER_EXCEPTIONS suffixes = TOKENIZER_SUFFIXES + lex_attr_getters = LEX_ATTRS + syntax_iterators = SYNTAX_ITERATORS + stop_words = STOP_WORDS + writing_system = {"direction": "rtl", "has_case": False, "has_letters": True} class Persian(Language): lang = "fa" Defaults = PersianDefaults - default_config = Config().from_str(DEFAULT_CONFIG) __all__ = ["Persian"] diff --git a/spacy/lang/fi/__init__.py b/spacy/lang/fi/__init__.py index 33313aeb6..9233c6547 100644 --- a/spacy/lang/fi/__init__.py +++ b/spacy/lang/fi/__init__.py @@ -1,42 +1,21 @@ -from typing import Set, Dict, Callable, Any -from thinc.api import Config - from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS from .stop_words import STOP_WORDS from .lex_attrs import LEX_ATTRS from .punctuation import TOKENIZER_INFIXES, TOKENIZER_SUFFIXES from ...language import Language -from ...util import registry - - -DEFAULT_CONFIG = """ -[nlp] -lang = "fi" -stop_words = {"@language_data": "spacy.fi.stop_words"} -lex_attr_getters = {"@language_data": "spacy.fi.lex_attr_getters"} -""" - - -@registry.language_data("spacy.fi.stop_words") -def stop_words() -> Set[str]: - return STOP_WORDS - - -@registry.language_data("spacy.fi.lex_attr_getters") -def lex_attr_getters() -> Dict[int, Callable[[str], Any]]: - return LEX_ATTRS class FinnishDefaults(Language.Defaults): infixes = TOKENIZER_INFIXES suffixes = TOKENIZER_SUFFIXES tokenizer_exceptions = TOKENIZER_EXCEPTIONS + lex_attr_getters = LEX_ATTRS + stop_words = STOP_WORDS class Finnish(Language): lang = "fi" Defaults = FinnishDefaults - default_config = Config().from_str(DEFAULT_CONFIG) __all__ = ["Finnish"] diff --git a/spacy/lang/fr/__init__.py b/spacy/lang/fr/__init__.py index 41014aa34..a5350d422 100644 --- a/spacy/lang/fr/__init__.py +++ b/spacy/lang/fr/__init__.py @@ -1,4 +1,4 @@ -from typing import Set, Dict, Callable, Any, Pattern +from typing import Callable from thinc.api import Config from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS, TOKEN_MATCH @@ -6,69 +6,47 @@ from .punctuation import TOKENIZER_PREFIXES, TOKENIZER_INFIXES from .punctuation import TOKENIZER_SUFFIXES from .stop_words import STOP_WORDS from .lex_attrs import LEX_ATTRS +from .syntax_iterators import SYNTAX_ITERATORS from .lemmatizer import FrenchLemmatizer, is_base_form -from .syntax_iterators import noun_chunks +from ...lookups import load_lookups from ...language import Language from ...util import registry DEFAULT_CONFIG = """ [nlp] -lang = "fr" -stop_words = {"@language_data": "spacy.fr.stop_words"} -lex_attr_getters = {"@language_data": "spacy.fr.lex_attr_getters"} -get_noun_chunks = {"@language_data": "spacy.fr.get_noun_chunks"} - -[nlp.tokenizer] -@tokenizers = "spacy.Tokenizer.v1" -token_match = {"@language_data": "spacy.fr.token_match"} [nlp.lemmatizer] -@lemmatizers = "spacy.FrenchLemmatizer.v1" - -[nlp.lemmatizer.data] -@language_data = "spacy-lookups-data" -lang = ${nlp:lang} -tables = ["lemma_rules", "lemma_index", "lemma_exc", "lemma_lookup"] +@lemmatizers = "spacy.fr.FrenchLemmatizer" """ -@registry.lemmatizers("spacy.FrenchLemmatizer.v1") -def create_french_lemmatizer(data: Dict[str, dict] = {}) -> FrenchLemmatizer: - return FrenchLemmatizer(data=data, is_base_form=is_base_form) +@registry.lemmatizers("spacy.fr.FrenchLemmatizer") +def create_lemmatizer() -> Callable[[Language], FrenchLemmatizer]: + tables = ["lemma_rules", "lemma_index", "lemma_exc", "lemma_lookup"] + def lemmatizer_factory(nlp: Language) -> FrenchLemmatizer: + lookups = 
load_lookups(lang=nlp.lang, tables=tables) + return FrenchLemmatizer(lookups=lookups, is_base_form=is_base_form) -@registry.language_data("spacy.fr.token_match") -def token_match() -> Pattern: - return TOKEN_MATCH - - -@registry.language_data("spacy.fr.stop_words") -def stop_words() -> Set[str]: - return STOP_WORDS - - -@registry.language_data("spacy.fr.lex_attr_getters") -def lex_attr_getters() -> Dict[int, Callable[[str], Any]]: - return LEX_ATTRS - - -@registry.language_data("spacy.fr.get_noun_chunks") -def get_noun_chunks() -> Callable: - return noun_chunks + return lemmatizer_factory class FrenchDefaults(Language.Defaults): + config = Config().from_str(DEFAULT_CONFIG) tokenizer_exceptions = TOKENIZER_EXCEPTIONS prefixes = TOKENIZER_PREFIXES infixes = TOKENIZER_INFIXES suffixes = TOKENIZER_SUFFIXES + token_match = TOKEN_MATCH + lex_attr_getters = LEX_ATTRS + syntax_iterators = SYNTAX_ITERATORS + stop_words = STOP_WORDS class French(Language): lang = "fr" Defaults = FrenchDefaults - default_config = Config().from_str(DEFAULT_CONFIG) __all__ = ["French"] diff --git a/spacy/lang/ga/__init__.py b/spacy/lang/ga/__init__.py index 3c13f56fb..80131368b 100644 --- a/spacy/lang/ga/__init__.py +++ b/spacy/lang/ga/__init__.py @@ -1,32 +1,16 @@ -from typing import Set -from thinc.api import Config - from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS from .stop_words import STOP_WORDS from ...language import Language -from ...util import registry - - -DEFAULT_CONFIG = """ -[nlp] -lang = "ga" -stop_words = {"@language_data": "spacy.ga.stop_words"} -""" - - -@registry.language_data("spacy.ga.stop_words") -def stop_words() -> Set[str]: - return STOP_WORDS class IrishDefaults(Language.Defaults): tokenizer_exceptions = TOKENIZER_EXCEPTIONS + stop_words = STOP_WORDS class Irish(Language): lang = "ga" Defaults = IrishDefaults - default_config = Config().from_str(DEFAULT_CONFIG) __all__ = ["Irish"] diff --git a/spacy/lang/gu/__init__.py b/spacy/lang/gu/__init__.py index 3ca8bbd4c..67228ac40 100644 --- a/spacy/lang/gu/__init__.py +++ b/spacy/lang/gu/__init__.py @@ -1,26 +1,14 @@ -from typing import Set -from thinc.api import Config - from .stop_words import STOP_WORDS from ...language import Language -from ...util import registry -DEFAULT_CONFIG = """ -[nlp] -lang = "gu" -stop_words = {"@language_data": "spacy.gu.stop_words"} -""" - - -@registry.language_data("spacy.gu.stop_words") -def stop_words() -> Set[str]: - return STOP_WORDS +class GujaratiDefaults(Language.Defaults): + stop_words = STOP_WORDS class Gujarati(Language): lang = "gu" - default_config = Config().from_str(DEFAULT_CONFIG) + Defaults = GujaratiDefaults __all__ = ["Gujarati"] diff --git a/spacy/lang/he/__init__.py b/spacy/lang/he/__init__.py index cd07d405e..70bd9cf45 100644 --- a/spacy/lang/he/__init__.py +++ b/spacy/lang/he/__init__.py @@ -1,37 +1,15 @@ -from typing import Set -from thinc.api import Config - from .stop_words import STOP_WORDS -from ..tokenizer_exceptions import BASE_EXCEPTIONS from ...language import Language -from ...util import registry - - -DEFAULT_CONFIG = """ -[nlp] -lang = "he" -stop_words = {"@language_data": "spacy.he.stop_words"} - -[nlp.writing_system] -direction = "rtl" -has_case = false -has_letters = true -""" - - -@registry.language_data("spacy.he.stop_words") -def stop_words() -> Set[str]: - return STOP_WORDS class HebrewDefaults(Language.Defaults): - tokenizer_exceptions = BASE_EXCEPTIONS + stop_words = STOP_WORDS + writing_system = {"direction": "rtl", "has_case": False, "has_letters": True} class 
Hebrew(Language): lang = "he" Defaults = HebrewDefaults - default_config = Config().from_str(DEFAULT_CONFIG) __all__ = ["Hebrew"] diff --git a/spacy/lang/hi/__init__.py b/spacy/lang/hi/__init__.py index 48890c4f9..384f040c8 100644 --- a/spacy/lang/hi/__init__.py +++ b/spacy/lang/hi/__init__.py @@ -1,33 +1,16 @@ -from typing import Set, Dict, Callable, Any -from thinc.api import Config - from .stop_words import STOP_WORDS from .lex_attrs import LEX_ATTRS from ...language import Language -from ...util import registry -DEFAULT_CONFIG = """ -[nlp] -lang = "hi" -stop_words = {"@language_data": "spacy.hi.stop_words"} -lex_attr_getters = {"@language_data": "spacy.hi.lex_attr_getters"} -""" - - -@registry.language_data("spacy.hi.stop_words") -def stop_words() -> Set[str]: - return STOP_WORDS - - -@registry.language_data("spacy.hi.lex_attr_getters") -def lex_attr_getters() -> Dict[int, Callable[[str], Any]]: - return LEX_ATTRS +class HindiDefaults(Language.Defaults): + stop_words = STOP_WORDS + lex_attr_getters = LEX_ATTRS class Hindi(Language): lang = "hi" - default_config = Config().from_str(DEFAULT_CONFIG) + Defaults = HindiDefaults __all__ = ["Hindi"] diff --git a/spacy/lang/hr/__init__.py b/spacy/lang/hr/__init__.py index 54c1a8f1f..118e0946a 100644 --- a/spacy/lang/hr/__init__.py +++ b/spacy/lang/hr/__init__.py @@ -1,40 +1,14 @@ -from typing import Set -from thinc.api import Config - from .stop_words import STOP_WORDS -from ..tokenizer_exceptions import BASE_EXCEPTIONS from ...language import Language -from ...util import registry - - -DEFAULT_CONFIG = """ -[nlp] -lang = "hr" -stop_words = {"@language_data": "spacy.hr.stop_words"} - -[nlp.lemmatizer] -@lemmatizers = "spacy.Lemmatizer.v1" - -[nlp.lemmatizer.data] -@language_data = "spacy-lookups-data" -lang = ${nlp:lang} -tables = ["lemma_lookup"] -""" - - -@registry.language_data("spacy.hr.stop_words") -def stop_words() -> Set[str]: - return STOP_WORDS class CroatianDefaults(Language.Defaults): - tokenizer_exceptions = BASE_EXCEPTIONS + stop_words = STOP_WORDS class Croatian(Language): lang = "hr" Defaults = CroatianDefaults - default_config = Config().from_str(DEFAULT_CONFIG) __all__ = ["Croatian"] diff --git a/spacy/lang/hu/__init__.py b/spacy/lang/hu/__init__.py index b9f5a5c34..8962603a6 100644 --- a/spacy/lang/hu/__init__.py +++ b/spacy/lang/hu/__init__.py @@ -1,40 +1,7 @@ -from typing import Set, Pattern -from thinc.api import Config - from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS, TOKEN_MATCH from .punctuation import TOKENIZER_PREFIXES, TOKENIZER_SUFFIXES, TOKENIZER_INFIXES from .stop_words import STOP_WORDS from ...language import Language -from ...util import registry - - -DEFAULT_CONFIG = """ -[nlp] -lang = "hu" -stop_words = {"@language_data": "spacy.hu.stop_words"} - -[nlp.tokenizer] -@tokenizers = "spacy.Tokenizer.v1" -token_match = {"@language_data": "spacy.hu.token_match"} - -[nlp.lemmatizer] -@lemmatizers = "spacy.Lemmatizer.v1" - -[nlp.lemmatizer.data] -@language_data = "spacy-lookups-data" -lang = ${nlp:lang} -tables = ["lemma_lookup"] -""" - - -@registry.language_data("spacy.hu.stop_words") -def stop_words() -> Set[str]: - return STOP_WORDS - - -@registry.language_data("spacy.hu.token_match") -def token_match() -> Pattern: - return TOKEN_MATCH class HungarianDefaults(Language.Defaults): @@ -42,12 +9,13 @@ class HungarianDefaults(Language.Defaults): prefixes = TOKENIZER_PREFIXES suffixes = TOKENIZER_SUFFIXES infixes = TOKENIZER_INFIXES + token_match = TOKEN_MATCH + stop_words = STOP_WORDS class Hungarian(Language): 
lang = "hu" Defaults = HungarianDefaults - default_config = Config().from_str(DEFAULT_CONFIG) __all__ = ["Hungarian"] diff --git a/spacy/lang/hy/__init__.py b/spacy/lang/hy/__init__.py index 33bb8d08a..4577ab641 100644 --- a/spacy/lang/hy/__init__.py +++ b/spacy/lang/hy/__init__.py @@ -1,33 +1,16 @@ -from typing import Set, Dict, Callable, Any -from thinc.api import Config - from .stop_words import STOP_WORDS from .lex_attrs import LEX_ATTRS from ...language import Language -from ...util import registry -DEFAULT_CONFIG = """ -[nlp] -lang = "hy" -stop_words = {"@language_data": "spacy.hy.stop_words"} -lex_attr_getters = {"@language_data": "spacy.hy.lex_attr_getters"} -""" - - -@registry.language_data("spacy.hy.stop_words") -def stop_words() -> Set[str]: - return STOP_WORDS - - -@registry.language_data("spacy.hy.lex_attr_getters") -def lex_attr_getters() -> Dict[int, Callable[[str], Any]]: - return LEX_ATTRS +class ArmenianDefaults(Language.Defaults): + lex_attr_getters = LEX_ATTRS + stop_words = STOP_WORDS class Armenian(Language): lang = "hy" - default_config = Config().from_str(DEFAULT_CONFIG) + Defaults = ArmenianDefaults __all__ = ["Armenian"] diff --git a/spacy/lang/id/__init__.py b/spacy/lang/id/__init__.py index ecefd0a66..87373551c 100644 --- a/spacy/lang/id/__init__.py +++ b/spacy/lang/id/__init__.py @@ -1,50 +1,9 @@ -from typing import Set, Dict, Callable, Any -from thinc.config import Config - from .stop_words import STOP_WORDS from .punctuation import TOKENIZER_SUFFIXES, TOKENIZER_PREFIXES, TOKENIZER_INFIXES from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS from .lex_attrs import LEX_ATTRS -from .syntax_iterators import noun_chunks +from .syntax_iterators import SYNTAX_ITERATORS from ...language import Language -from ...util import registry - - -DEFAULT_CONFIG = """ -[nlp] -lang = "id" -stop_words = {"@language_data": "spacy.id.stop_words"} -lex_attr_getters = {"@language_data": "spacy.id.lex_attr_getters"} -get_noun_chunks = {"@language_data": "spacy.id.get_noun_chunks"} - -[nlp.lemmatizer] -@lemmatizers = "spacy.Lemmatizer.v1" - -[nlp.lemmatizer.data] -@language_data = "spacy-lookups-data" -lang = ${nlp:lang} -tables = ["lemma_lookup"] - -[nlp.vocab_data] -@language_data = "spacy-lookups-data" -lang = ${nlp:lang} -tables = ["lexeme_norm"] -""" - - -@registry.language_data("spacy.id.stop_words") -def stop_words() -> Set[str]: - return STOP_WORDS - - -@registry.language_data("spacy.id.lex_attr_getters") -def lex_attr_getters() -> Dict[int, Callable[[str], Any]]: - return LEX_ATTRS - - -@registry.language_data("spacy.id.get_noun_chunks") -def get_noun_chunks() -> Callable: - return noun_chunks class IndonesianDefaults(Language.Defaults): @@ -52,12 +11,14 @@ class IndonesianDefaults(Language.Defaults): prefixes = TOKENIZER_PREFIXES suffixes = TOKENIZER_SUFFIXES infixes = TOKENIZER_INFIXES + syntax_iterators = SYNTAX_ITERATORS + lex_attr_getters = LEX_ATTRS + stop_words = STOP_WORDS class Indonesian(Language): lang = "id" Defaults = IndonesianDefaults - default_config = Config().from_str(DEFAULT_CONFIG) __all__ = ["Indonesian"] diff --git a/spacy/lang/is/__init__.py b/spacy/lang/is/__init__.py index 82fc7e0c2..be5de5981 100644 --- a/spacy/lang/is/__init__.py +++ b/spacy/lang/is/__init__.py @@ -1,26 +1,14 @@ -from typing import Set -from thinc.api import Config - from .stop_words import STOP_WORDS from ...language import Language -from ...util import registry -DEFAULT_CONFIG = """ -[nlp] -lang = "is" -stop_words = {"@language_data": "spacy.is.stop_words"} -""" - - 
-@registry.language_data("spacy.is.stop_words") -def stop_words() -> Set[str]: - return STOP_WORDS +class IcelandicDefaults(Language.Defaults): + stop_words = STOP_WORDS class Icelandic(Language): lang = "is" - default_config = Config().from_str(DEFAULT_CONFIG) + Defaults = IcelandicDefaults __all__ = ["Icelandic"] diff --git a/spacy/lang/it/__init__.py b/spacy/lang/it/__init__.py index 107018392..25cbaa651 100644 --- a/spacy/lang/it/__init__.py +++ b/spacy/lang/it/__init__.py @@ -1,31 +1,7 @@ -from typing import Set -from thinc.api import Config - from .stop_words import STOP_WORDS from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS from .punctuation import TOKENIZER_PREFIXES, TOKENIZER_INFIXES from ...language import Language -from ...util import registry - - -DEFAULT_CONFIG = """ -[nlp] -lang = "it" -stop_words = {"@language_data": "spacy.it.stop_words"} - -[nlp.lemmatizer] -@lemmatizers = "spacy.Lemmatizer.v1" - -[nlp.lemmatizer.data] -@language_data = "spacy-lookups-data" -lang = ${nlp:lang} -tables = ["lemma_lookup"] -""" - - -@registry.language_data("spacy.it.stop_words") -def stop_words() -> Set[str]: - return STOP_WORDS class ItalianDefaults(Language.Defaults): @@ -38,7 +14,6 @@ class ItalianDefaults(Language.Defaults): class Italian(Language): lang = "it" Defaults = ItalianDefaults - default_config = Config().from_str(DEFAULT_CONFIG) __all__ = ["Italian"] diff --git a/spacy/lang/ja/__init__.py b/spacy/lang/ja/__init__.py index 642b59a4b..d435afe12 100644 --- a/spacy/lang/ja/__init__.py +++ b/spacy/lang/ja/__init__.py @@ -1,11 +1,11 @@ -from typing import Optional, Union, Dict, Any, Set, Callable +from typing import Optional, Union, Dict, Any from pathlib import Path import srsly from collections import namedtuple from thinc.api import Config from .stop_words import STOP_WORDS -from .syntax_iterators import noun_chunks +from .syntax_iterators import SYNTAX_ITERATORS from .tag_map import TAG_MAP from .tag_orth_map import TAG_ORTH_MAP from .tag_bigram_map import TAG_BIGRAM_MAP @@ -20,33 +20,15 @@ from ... 
import util DEFAULT_CONFIG = """ [nlp] -lang = "ja" -stop_words = {"@language_data": "spacy.ja.stop_words"} -get_noun_chunks = {"@language_data": "spacy.ja.get_noun_chunks"} [nlp.tokenizer] -@tokenizers = "spacy.JapaneseTokenizer.v1" +@tokenizers = "spacy.ja.JapaneseTokenizer" split_mode = null - -[nlp.writing_system] -direction = "ltr" -has_case = false -has_letters = false """ -@registry.language_data("spacy.ja.stop_words") -def stop_words() -> Set[str]: - return STOP_WORDS - - -@registry.language_data("spacy.ja.get_noun_chunks") -def get_noun_chunks() -> Callable: - return noun_chunks - - -@registry.tokenizers("spacy.JapaneseTokenizer.v1") -def create_japanese_tokenizer(split_mode: Optional[str] = None): +@registry.tokenizers("spacy.ja.JapaneseTokenizer") +def create_tokenizer(split_mode: Optional[str] = None): def japanese_tokenizer_factory(nlp): return JapaneseTokenizer(nlp, split_mode=split_mode) @@ -179,9 +161,16 @@ class JapaneseTokenizer(DummyTokenizer): return self +class JapaneseDefaults(Language.Defaults): + config = Config().from_str(DEFAULT_CONFIG) + stop_words = STOP_WORDS + syntax_iterators = SYNTAX_ITERATORS + writing_system = {"direction": "ltr", "has_case": False, "has_letters": False} + + class Japanese(Language): lang = "ja" - default_config = Config().from_str(DEFAULT_CONFIG) + Defaults = JapaneseDefaults # Hold the attributes we need with convenient names diff --git a/spacy/lang/kn/__init__.py b/spacy/lang/kn/__init__.py index c323ca5c7..8e53989e6 100644 --- a/spacy/lang/kn/__init__.py +++ b/spacy/lang/kn/__init__.py @@ -1,26 +1,14 @@ -from typing import Set -from thinc.api import Config - from .stop_words import STOP_WORDS from ...language import Language -from ...util import registry -DEFAULT_CONFIG = """ -[nlp] -lang = "kn" -stop_words = {"@language_data": "spacy.kn.stop_words"} -""" - - -@registry.language_data("spacy.kn.stop_words") -def stop_words() -> Set[str]: - return STOP_WORDS +class KannadaDefaults(Language.Defaults): + stop_words = STOP_WORDS class Kannada(Language): lang = "kn" - default_config = Config().from_str(DEFAULT_CONFIG) + Defaults = KannadaDefaults __all__ = ["Kannada"] diff --git a/spacy/lang/ko/__init__.py b/spacy/lang/ko/__init__.py index 83cd44ded..d2af9c4b1 100644 --- a/spacy/lang/ko/__init__.py +++ b/spacy/lang/ko/__init__.py @@ -1,4 +1,4 @@ -from typing import Set, Optional, Any, Dict +from typing import Optional, Any, Dict from thinc.api import Config from .stop_words import STOP_WORDS @@ -11,26 +11,14 @@ from ...util import DummyTokenizer, registry DEFAULT_CONFIG = """ [nlp] -lang = "ko" -stop_words = {"@language_data": "spacy.ko.stop_words"} [nlp.tokenizer] -@tokenizers = "spacy.KoreanTokenizer.v1" - -[nlp.writing_system] -direction = "ltr" -has_case = false -has_letters = false +@tokenizers = "spacy.ko.KoreanTokenizer" """ -@registry.language_data("spacy.ko.stop_words") -def stop_words() -> Set[str]: - return STOP_WORDS - - -@registry.tokenizers("spacy.KoreanTokenizer.v1") -def create_korean_tokenizer(): +@registry.tokenizers("spacy.ko.KoreanTokenizer") +def create_tokenizer(): def korean_tokenizer_factory(nlp): return KoreanTokenizer(nlp) @@ -74,9 +62,15 @@ class KoreanTokenizer(DummyTokenizer): yield {"surface": surface, "lemma": lemma, "tag": tag} +class KoreanDefaults(Language.Defaults): + config = Config().from_str(DEFAULT_CONFIG) + stop_words = STOP_WORDS + writing_system = {"direction": "ltr", "has_case": False, "has_letters": False} + + class Korean(Language): lang = "ko" - default_config = Config().from_str(DEFAULT_CONFIG) 
+ Defaults = KoreanDefaults def try_mecab_import() -> None: diff --git a/spacy/lang/lb/__init__.py b/spacy/lang/lb/__init__.py index 56b09208f..da6fe55d7 100644 --- a/spacy/lang/lb/__init__.py +++ b/spacy/lang/lb/__init__.py @@ -1,54 +1,20 @@ -from typing import Set, Dict, Callable, Any -from thinc.api import Config - from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS from .punctuation import TOKENIZER_INFIXES from .lex_attrs import LEX_ATTRS from .stop_words import STOP_WORDS from ...language import Language -from ...util import registry - - -DEFAULT_CONFIG = """ -[nlp] -lang = "lb" -stop_words = {"@language_data": "spacy.lb.stop_words"} -lex_attr_getters = {"@language_data": "spacy.lb.lex_attr_getters"} - -[nlp.lemmatizer] -@lemmatizers = "spacy.Lemmatizer.v1" - -[nlp.lemmatizer.data] -@language_data = "spacy-lookups-data" -lang = ${nlp:lang} -tables = ["lemma_lookup"] - -[nlp.vocab_data] -@language_data = "spacy-lookups-data" -lang = ${nlp:lang} -tables = ["lexeme_norm"] -""" - - -@registry.language_data("spacy.lb.stop_words") -def stop_words() -> Set[str]: - return STOP_WORDS - - -@registry.language_data("spacy.lb.lex_attr_getters") -def lex_attr_getters() -> Dict[int, Callable[[str], Any]]: - return LEX_ATTRS class LuxembourgishDefaults(Language.Defaults): tokenizer_exceptions = TOKENIZER_EXCEPTIONS infixes = TOKENIZER_INFIXES + lex_attr_getters = LEX_ATTRS + stop_words = STOP_WORDS class Luxembourgish(Language): lang = "lb" Defaults = LuxembourgishDefaults - default_config = Config().from_str(DEFAULT_CONFIG) __all__ = ["Luxembourgish"] diff --git a/spacy/lang/lij/__init__.py b/spacy/lang/lij/__init__.py index 02f9a72b6..5ae280324 100644 --- a/spacy/lang/lij/__init__.py +++ b/spacy/lang/lij/__init__.py @@ -1,34 +1,18 @@ -from typing import Set -from thinc.api import Config - from .stop_words import STOP_WORDS from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS from .punctuation import TOKENIZER_INFIXES from ...language import Language -from ...util import registry - - -DEFAULT_CONFIG = """ -[nlp] -lang = "lij" -stop_words = {"@language_data": "spacy.lij.stop_words"} -""" - - -@registry.language_data("spacy.lij.stop_words") -def stop_words() -> Set[str]: - return STOP_WORDS class LigurianDefaults(Language.Defaults): tokenizer_exceptions = TOKENIZER_EXCEPTIONS infixes = TOKENIZER_INFIXES + stop_words = STOP_WORDS class Ligurian(Language): lang = "lij" Defaults = LigurianDefaults - default_config = Config().from_str(DEFAULT_CONFIG) __all__ = ["Ligurian"] diff --git a/spacy/lang/lt/__init__.py b/spacy/lang/lt/__init__.py index e82c4c4e0..e395a8f62 100644 --- a/spacy/lang/lt/__init__.py +++ b/spacy/lang/lt/__init__.py @@ -1,50 +1,21 @@ -from typing import Set, Dict, Callable, Any -from thinc.api import Config - from .punctuation import TOKENIZER_INFIXES, TOKENIZER_SUFFIXES from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS from .stop_words import STOP_WORDS from .lex_attrs import LEX_ATTRS from ...language import Language -from ...util import registry - - -DEFAULT_CONFIG = """ -[nlp] -lang = "lt" -stop_words = {"@language_data": "spacy.lt.stop_words"} -lex_attr_getters = {"@language_data": "spacy.lt.lex_attr_getters"} - -[nlp.lemmatizer] -@lemmatizers = "spacy.Lemmatizer.v1" - -[nlp.lemmatizer.data] -@language_data = "spacy-lookups-data" -lang = ${nlp:lang} -tables = ["lemma_lookup"] -""" - - -@registry.language_data("spacy.lt.stop_words") -def stop_words() -> Set[str]: - return STOP_WORDS - - -@registry.language_data("spacy.lt.lex_attr_getters") -def lex_attr_getters() -> 
Dict[int, Callable[[str], Any]]: - return LEX_ATTRS class LithuanianDefaults(Language.Defaults): infixes = TOKENIZER_INFIXES suffixes = TOKENIZER_SUFFIXES tokenizer_exceptions = TOKENIZER_EXCEPTIONS + stop_words = STOP_WORDS + lex_attr_getters = LEX_ATTRS class Lithuanian(Language): lang = "lt" Defaults = LithuanianDefaults - default_config = Config().from_str(DEFAULT_CONFIG) __all__ = ["Lithuanian"] diff --git a/spacy/lang/lv/__init__.py b/spacy/lang/lv/__init__.py index e37b44b0d..142bc706e 100644 --- a/spacy/lang/lv/__init__.py +++ b/spacy/lang/lv/__init__.py @@ -1,26 +1,14 @@ -from typing import Set -from thinc.api import Config - from .stop_words import STOP_WORDS from ...language import Language -from ...util import registry -DEFAULT_CONFIG = """ -[nlp] -lang = "lv" -stop_words = {"@language_data": "spacy.lv.stop_words"} -""" - - -@registry.language_data("spacy.lv.stop_words") -def stop_words() -> Set[str]: - return STOP_WORDS +class LatvianDefaults(Language.Defaults): + stop_words = STOP_WORDS class Latvian(Language): lang = "lv" - default_config = Config().from_str(DEFAULT_CONFIG) + Defaults = LatvianDefaults __all__ = ["Latvian"] diff --git a/spacy/lang/ml/__init__.py b/spacy/lang/ml/__init__.py index e2ac0a641..166d0e061 100644 --- a/spacy/lang/ml/__init__.py +++ b/spacy/lang/ml/__init__.py @@ -1,26 +1,14 @@ -from typing import Set -from thinc.api import Config - from .stop_words import STOP_WORDS from ...language import Language -from ...util import registry -DEFAULT_CONFIG = """ -[nlp] -lang = "ml" -stop_words = {"@language_data": "spacy.ml.stop_words"} -""" - - -@registry.language_data("spacy.ml.stop_words") -def stop_words() -> Set[str]: - return STOP_WORDS +class MalayalamDefaults(Language.Defaults): + stop_words = STOP_WORDS class Malayalam(Language): lang = "ml" - default_config = Config().from_str(DEFAULT_CONFIG) + Defaults = MalayalamDefaults __all__ = ["Malayalam"] diff --git a/spacy/lang/mr/__init__.py b/spacy/lang/mr/__init__.py index 3d7c621cb..af0c49878 100644 --- a/spacy/lang/mr/__init__.py +++ b/spacy/lang/mr/__init__.py @@ -1,26 +1,14 @@ -from typing import Set -from thinc.api import Config - from .stop_words import STOP_WORDS from ...language import Language -from ...util import registry -DEFAULT_CONFIG = """ -[nlp] -lang = "af" -stop_words = {"@language_data": "spacy.mr.stop_words"} -""" - - -@registry.language_data("spacy.mr.stop_words") -def stop_words() -> Set[str]: - return STOP_WORDS +class MarathiDefaults(Language.Defaults): + stop_words = STOP_WORDS class Marathi(Language): lang = "mr" - default_config = Config().from_str(DEFAULT_CONFIG) + Defaults = MarathiDefaults __all__ = ["Marathi"] diff --git a/spacy/lang/nb/__init__.py b/spacy/lang/nb/__init__.py index f26c68e91..d2bb92072 100644 --- a/spacy/lang/nb/__init__.py +++ b/spacy/lang/nb/__init__.py @@ -1,39 +1,9 @@ -from typing import Set, Callable -from thinc.api import Config - from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS from .punctuation import TOKENIZER_PREFIXES, TOKENIZER_INFIXES from .punctuation import TOKENIZER_SUFFIXES from .stop_words import STOP_WORDS -from .syntax_iterators import noun_chunks +from .syntax_iterators import SYNTAX_ITERATORS from ...language import Language -from ...util import registry - - -DEFAULT_CONFIG = """ -[nlp] -lang = "nb" -stop_words = {"@language_data": "spacy.nb.stop_words"} -get_noun_chunks = {"@language_data": "spacy.nb.get_noun_chunks"} - -[nlp.lemmatizer] -@lemmatizers = "spacy.Lemmatizer.v1" - -[nlp.lemmatizer.data] -@language_data = 
"spacy-lookups-data" -lang = ${nlp:lang} -tables = ["lemma_lookup", "lemma_rules", "lemma_exc"] -""" - - -@registry.language_data("spacy.nb.stop_words") -def stop_words() -> Set[str]: - return STOP_WORDS - - -@registry.language_data("spacy.nb.get_noun_chunks") -def get_noun_chunks() -> Callable: - return noun_chunks class NorwegianDefaults(Language.Defaults): @@ -41,12 +11,13 @@ class NorwegianDefaults(Language.Defaults): prefixes = TOKENIZER_PREFIXES infixes = TOKENIZER_INFIXES suffixes = TOKENIZER_SUFFIXES + syntax_iterators = SYNTAX_ITERATORS + stop_words = STOP_WORDS class Norwegian(Language): lang = "nb" Defaults = NorwegianDefaults - default_config = Config().from_str(DEFAULT_CONFIG) __all__ = ["Norwegian"] diff --git a/spacy/lang/ne/__init__.py b/spacy/lang/ne/__init__.py index b72af86e4..68632e9ad 100644 --- a/spacy/lang/ne/__init__.py +++ b/spacy/lang/ne/__init__.py @@ -1,33 +1,16 @@ -from typing import Set, Dict, Callable, Any -from thinc.api import Config - from .stop_words import STOP_WORDS from .lex_attrs import LEX_ATTRS from ...language import Language -from ...util import registry -DEFAULT_CONFIG = """ -[nlp] -lang = "ne" -stop_words = {"@language_data": "spacy.ne.stop_words"} -lex_attr_getters = {"@language_data": "spacy.ne.lex_attr_getters"} -""" - - -@registry.language_data("spacy.ne.stop_words") -def stop_words() -> Set[str]: - return STOP_WORDS - - -@registry.language_data("spacy.ne.lex_attr_getters") -def lex_attr_getters() -> Dict[int, Callable[[str], Any]]: - return LEX_ATTRS +class NepaliDefaults(Language.Defaults): + stop_words = STOP_WORDS + lex_attr_getters = LEX_ATTRS class Nepali(Language): lang = "ne" - default_config = Config().from_str(DEFAULT_CONFIG) + Defaults = NepaliDefaults __all__ = ["Nepali"] diff --git a/spacy/lang/nl/__init__.py b/spacy/lang/nl/__init__.py index 9bf58fddd..d874ef7a1 100644 --- a/spacy/lang/nl/__init__.py +++ b/spacy/lang/nl/__init__.py @@ -1,4 +1,4 @@ -from typing import Set, Dict, Callable, Any +from typing import Callable from thinc.api import Config from .stop_words import STOP_WORDS @@ -7,52 +7,43 @@ from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS from .punctuation import TOKENIZER_PREFIXES, TOKENIZER_INFIXES from .punctuation import TOKENIZER_SUFFIXES from .lemmatizer import DutchLemmatizer +from ...lookups import load_lookups from ...language import Language from ...util import registry DEFAULT_CONFIG = """ [nlp] -lang = "nl" -stop_words = {"@language_data": "spacy.nl.stop_words"} -lex_attr_getters = {"@language_data": "spacy.nl.lex_attr_getters"} [nlp.lemmatizer] -@lemmatizers = "spacy.DutchLemmatizer.v1" - -[nlp.lemmatizer.data] -@language_data = "spacy-lookups-data" -lang = ${nlp:lang} -tables = ["lemma_rules", "lemma_index", "lemma_exc", "lemma_lookup"] +@lemmatizers = "spacy.nl.DutchLemmatizer" """ -@registry.language_data("spacy.nl.stop_words") -def stop_words() -> Set[str]: - return STOP_WORDS +@registry.lemmatizers("spacy.nl.DutchLemmatizer") +def create_lemmatizer() -> Callable[[Language], DutchLemmatizer]: + tables = ["lemma_rules", "lemma_index", "lemma_exc", "lemma_lookup"] + def lemmatizer_factory(nlp: Language) -> DutchLemmatizer: + lookups = load_lookups(lang=nlp.lang, tables=tables) + return DutchLemmatizer(lookups=lookups) -@registry.language_data("spacy.nl.lex_attr_getters") -def lex_attr_getters() -> Dict[int, Callable[[str], Any]]: - return LEX_ATTRS - - -@registry.lemmatizers("spacy.DutchLemmatizer.v1") -def create_dutch_lemmatizer(data: Dict[str, dict] = {}) -> DutchLemmatizer: - return 
DutchLemmatizer(data=data) + return lemmatizer_factory class DutchDefaults(Language.Defaults): + config = Config().from_str(DEFAULT_CONFIG) tokenizer_exceptions = TOKENIZER_EXCEPTIONS prefixes = TOKENIZER_PREFIXES infixes = TOKENIZER_INFIXES suffixes = TOKENIZER_SUFFIXES + lex_attr_getters = LEX_ATTRS + stop_words = STOP_WORDS class Dutch(Language): lang = "nl" Defaults = DutchDefaults - default_config = Config().from_str(DEFAULT_CONFIG) __all__ = ["Dutch"] diff --git a/spacy/lang/pl/__init__.py b/spacy/lang/pl/__init__.py index 82957dc7a..2393f1aea 100644 --- a/spacy/lang/pl/__init__.py +++ b/spacy/lang/pl/__init__.py @@ -1,4 +1,4 @@ -from typing import Set, Dict, Callable, Any +from typing import Callable from thinc.api import Config from .punctuation import TOKENIZER_PREFIXES, TOKENIZER_INFIXES @@ -7,55 +7,53 @@ from .stop_words import STOP_WORDS from .lex_attrs import LEX_ATTRS from .lemmatizer import PolishLemmatizer from ..tokenizer_exceptions import BASE_EXCEPTIONS +from ...lookups import load_lookups from ...language import Language from ...util import registry DEFAULT_CONFIG = """ [nlp] -lang = "pl" -stop_words = {"@language_data": "spacy.pl.stop_words"} -lex_attr_getters = {"@language_data": "spacy.pl.lex_attr_getters"} [nlp.lemmatizer] -@lemmatizers = "spacy.PolishLemmatizer.v1" - -[nlp.lemmatizer.data] -@language_data = "spacy-lookups-data" -lang = ${nlp:lang} -tables = ["lemma_lookup_adj", "lemma_lookup_adp", "lemma_lookup_adv", "lemma_lookup_aux", "lemma_lookup_noun", "lemma_lookup_num", "lemma_lookup_part", "lemma_lookup_pron", "lemma_lookup_verb"] +@lemmatizers = "spacy.pl.PolishLemmatizer" """ - -@registry.language_data("spacy.pl.stop_words") -def stop_words() -> Set[str]: - return STOP_WORDS +TOKENIZER_EXCEPTIONS = { + exc: val for exc, val in BASE_EXCEPTIONS.items() if not exc.endswith(".") +} -@registry.language_data("spacy.pl.lex_attr_getters") -def lex_attr_getters() -> Dict[int, Callable[[str], Any]]: - return LEX_ATTRS +@registry.lemmatizers("spacy.pl.PolishLemmatizer") +def create_lemmatizer() -> Callable[[Language], PolishLemmatizer]: + # fmt: off + tables = [ + "lemma_lookup_adj", "lemma_lookup_adp", "lemma_lookup_adv", + "lemma_lookup_aux", "lemma_lookup_noun", "lemma_lookup_num", + "lemma_lookup_part", "lemma_lookup_pron", "lemma_lookup_verb" + ] + # fmt: on + def lemmatizer_factory(nlp: Language) -> PolishLemmatizer: + lookups = load_lookups(lang=nlp.lang, tables=tables) + return PolishLemmatizer(lookups=lookups) -@registry.lemmatizers("spacy.PolishLemmatizer.v1") -def create_polish_lemmatizer(data: Dict[str, dict] = {}) -> PolishLemmatizer: - return PolishLemmatizer(data=data) + return lemmatizer_factory class PolishDefaults(Language.Defaults): - mod_base_exceptions = { - exc: val for exc, val in BASE_EXCEPTIONS.items() if not exc.endswith(".") - } - tokenizer_exceptions = mod_base_exceptions + config = Config().from_str(DEFAULT_CONFIG) + tokenizer_exceptions = TOKENIZER_EXCEPTIONS prefixes = TOKENIZER_PREFIXES infixes = TOKENIZER_INFIXES suffixes = TOKENIZER_SUFFIXES + lex_attr_getters = LEX_ATTRS + stop_words = STOP_WORDS class Polish(Language): lang = "pl" Defaults = PolishDefaults - default_config = Config().from_str(DEFAULT_CONFIG) __all__ = ["Polish"] diff --git a/spacy/lang/pt/__init__.py b/spacy/lang/pt/__init__.py index fce12393d..0447099f0 100644 --- a/spacy/lang/pt/__init__.py +++ b/spacy/lang/pt/__init__.py @@ -1,50 +1,21 @@ -from typing import Set, Dict, Callable, Any -from thinc.api import Config - from .tokenizer_exceptions import 
TOKENIZER_EXCEPTIONS from .stop_words import STOP_WORDS from .lex_attrs import LEX_ATTRS from .punctuation import TOKENIZER_INFIXES, TOKENIZER_PREFIXES from ...language import Language -from ...util import registry - - -DEFAULT_CONFIG = """ -[nlp] -lang = "pt" -stop_words = {"@language_data": "spacy.pt.stop_words"} -lex_attr_getters = {"@language_data": "spacy.pt.lex_attr_getters"} - -[nlp.lemmatizer] -@lemmatizers = "spacy.Lemmatizer.v1" - -[nlp.lemmatizer.data] -@language_data = "spacy-lookups-data" -lang = ${nlp:lang} -tables = ["lemma_lookup"] -""" - - -@registry.language_data("spacy.pt.stop_words") -def stop_words() -> Set[str]: - return STOP_WORDS - - -@registry.language_data("spacy.pt.lex_attr_getters") -def lex_attr_getters() -> Dict[int, Callable[[str], Any]]: - return LEX_ATTRS class PortugueseDefaults(Language.Defaults): tokenizer_exceptions = TOKENIZER_EXCEPTIONS infixes = TOKENIZER_INFIXES prefixes = TOKENIZER_PREFIXES + lex_attr_getters = LEX_ATTRS + stop_words = STOP_WORDS class Portuguese(Language): lang = "pt" Defaults = PortugueseDefaults - default_config = Config().from_str(DEFAULT_CONFIG) __all__ = ["Portuguese"] diff --git a/spacy/lang/punctuation.py b/spacy/lang/punctuation.py index bf7357e48..e712e71d6 100644 --- a/spacy/lang/punctuation.py +++ b/spacy/lang/punctuation.py @@ -3,7 +3,7 @@ from .char_classes import LIST_ICONS, HYPHENS, CURRENCY, UNITS from .char_classes import CONCAT_QUOTES, ALPHA_LOWER, ALPHA_UPPER, ALPHA, PUNCT -_prefixes = ( +TOKENIZER_PREFIXES = ( ["§", "%", "=", "—", "–", r"\+(?![0-9])"] + LIST_PUNCT + LIST_ELLIPSES @@ -13,7 +13,7 @@ _prefixes = ( ) -_suffixes = ( +TOKENIZER_SUFFIXES = ( LIST_PUNCT + LIST_ELLIPSES + LIST_QUOTES @@ -31,7 +31,7 @@ _suffixes = ( ] ) -_infixes = ( +TOKENIZER_INFIXES = ( LIST_ELLIPSES + LIST_ICONS + [ @@ -44,7 +44,3 @@ _infixes = ( r"(?<=[{a}0-9])[:<>=/](?=[{a}])".format(a=ALPHA), ] ) - -TOKENIZER_PREFIXES = _prefixes -TOKENIZER_SUFFIXES = _suffixes -TOKENIZER_INFIXES = _infixes diff --git a/spacy/lang/ro/__init__.py b/spacy/lang/ro/__init__.py index 881188b21..74016d3e9 100644 --- a/spacy/lang/ro/__init__.py +++ b/spacy/lang/ro/__init__.py @@ -1,49 +1,25 @@ -from typing import Set -from thinc.api import Config - from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS from .stop_words import STOP_WORDS from .punctuation import TOKENIZER_PREFIXES, TOKENIZER_INFIXES from .punctuation import TOKENIZER_SUFFIXES from ...language import Language -from ...util import registry # Lemma data note: # Original pairs downloaded from http://www.lexiconista.com/datasets/lemmatization/ # Replaced characters using cedillas with the correct ones (ș and ț) -DEFAULT_CONFIG = """ -[nlp] -lang = "ro" -stop_words = {"@language_data": "spacy.ro.stop_words"} - -[nlp.lemmatizer] -@lemmatizers = "spacy.Lemmatizer.v1" - -[nlp.lemmatizer.data] -@language_data = "spacy-lookups-data" -lang = ${nlp:lang} -tables = ["lemma_lookup"] -""" - - -@registry.language_data("spacy.ro.stop_words") -def stop_words() -> Set[str]: - return STOP_WORDS - - class RomanianDefaults(Language.Defaults): tokenizer_exceptions = TOKENIZER_EXCEPTIONS prefixes = TOKENIZER_PREFIXES suffixes = TOKENIZER_SUFFIXES infixes = TOKENIZER_INFIXES + stop_words = STOP_WORDS class Romanian(Language): lang = "ro" Defaults = RomanianDefaults - default_config = Config().from_str(DEFAULT_CONFIG) __all__ = ["Romanian"] diff --git a/spacy/lang/ru/__init__.py b/spacy/lang/ru/__init__.py index b37ac6226..5d2333edf 100644 --- a/spacy/lang/ru/__init__.py +++ b/spacy/lang/ru/__init__.py @@ -1,4 +1,4 
diff --git a/spacy/lang/ru/__init__.py b/spacy/lang/ru/__init__.py
index b37ac6226..5d2333edf 100644
--- a/spacy/lang/ru/__init__.py
+++ b/spacy/lang/ru/__init__.py
@@ -1,4 +1,4 @@
-from typing import Set, Dict, Callable, Any
+from typing import Callable
from thinc.api import Config

from .stop_words import STOP_WORDS
@@ -11,43 +11,30 @@ from ...language import Language

DEFAULT_CONFIG = """
[nlp]
-lang = "ru"
-stop_words = {"@language_data": "spacy.ru.stop_words"}
-lex_attr_getters = {"@language_data": "spacy.ru.lex_attr_getters"}

[nlp.lemmatizer]
-@lemmatizers = "spacy.RussianLemmatizer.v1"
-
-[nlp.vocab_data]
-@language_data = "spacy-lookups-data"
-lang = ${nlp:lang}
-tables = ["lexeme_norm"]
+@lemmatizers = "spacy.ru.RussianLemmatizer"
"""


-@registry.language_data("spacy.ru.stop_words")
-def stop_words() -> Set[str]:
-    return STOP_WORDS
+@registry.lemmatizers("spacy.ru.RussianLemmatizer")
+def create_lemmatizer() -> Callable[[Language], RussianLemmatizer]:
+    def lemmatizer_factory(nlp: Language) -> RussianLemmatizer:
+        return RussianLemmatizer()

-
-@registry.language_data("spacy.ru.lex_attr_getters")
-def lex_attr_getters() -> Dict[int, Callable[[str], Any]]:
-    return LEX_ATTRS
-
-
-@registry.lemmatizers("spacy.RussianLemmatizer.v1")
-def create_russian_lemmatizer() -> RussianLemmatizer:
-    return RussianLemmatizer()
+    return lemmatizer_factory


class RussianDefaults(Language.Defaults):
+    config = Config().from_str(DEFAULT_CONFIG)
    tokenizer_exceptions = TOKENIZER_EXCEPTIONS
+    lex_attr_getters = LEX_ATTRS
+    stop_words = STOP_WORDS


class Russian(Language):
    lang = "ru"
    Defaults = RussianDefaults
-    default_config = Config().from_str(DEFAULT_CONFIG)


__all__ = ["Russian"]
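# --- Illustrative sketch (editor's note, not part of this patch) ---------------
# Lemmatizers are now registered as factories that receive the nlp object and are
# selected through the [nlp.lemmatizer] config block, as with
# "spacy.ru.RussianLemmatizer" above. The registry name "my.CustomLemmatizer" and
# the factory below are hypothetical.
from spacy.language import Language
from spacy.lemmatizer import Lemmatizer
from spacy.lookups import Lookups
from spacy.util import registry


@registry.lemmatizers("my.CustomLemmatizer")
def create_custom_lemmatizer():
    def lemmatizer_factory(nlp: Language) -> Lemmatizer:
        # Build the lemmatizer lazily, once the nlp object exists
        return Lemmatizer(lookups=Lookups())

    return lemmatizer_factory

# Referenced from a language's DEFAULT_CONFIG as:
# [nlp.lemmatizer]
# @lemmatizers = "my.CustomLemmatizer"
# -------------------------------------------------------------------------------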
diff --git a/spacy/lang/si/__init__.py b/spacy/lang/si/__init__.py
index 69c4718c0..d77e3bb8b 100644
--- a/spacy/lang/si/__init__.py
+++ b/spacy/lang/si/__init__.py
@@ -1,33 +1,16 @@
-from typing import Set, Dict, Callable, Any
-from thinc.api import Config
-
from .stop_words import STOP_WORDS
from .lex_attrs import LEX_ATTRS
from ...language import Language
-from ...util import registry


-DEFAULT_CONFIG = """
-[nlp]
-lang = "si"
-stop_words = {"@language_data": "spacy.si.stop_words"}
-lex_attr_getters = {"@language_data": "spacy.si.lex_attr_getters"}
-"""
-
-
-@registry.language_data("spacy.si.stop_words")
-def stop_words() -> Set[str]:
-    return STOP_WORDS
-
-
-@registry.language_data("spacy.si.lex_attr_getters")
-def lex_attr_getters() -> Dict[int, Callable[[str], Any]]:
-    return LEX_ATTRS
+class SinhalaDefaults(Language.Defaults):
+    lex_attr_getters = LEX_ATTRS
+    stop_words = STOP_WORDS


class Sinhala(Language):
    lang = "si"
-    default_config = Config().from_str(DEFAULT_CONFIG)
+    Defaults = SinhalaDefaults


__all__ = ["Sinhala"]

diff --git a/spacy/lang/sk/__init__.py b/spacy/lang/sk/__init__.py
index c9493e829..4003c7340 100644
--- a/spacy/lang/sk/__init__.py
+++ b/spacy/lang/sk/__init__.py
@@ -1,33 +1,16 @@
-from typing import Set, Dict, Callable, Any
-from thinc.api import Config
-
from .stop_words import STOP_WORDS
from .lex_attrs import LEX_ATTRS
from ...language import Language
-from ...util import registry


-DEFAULT_CONFIG = """
-[nlp]
-lang = "sk"
-stop_words = {"@language_data": "spacy.sk.stop_words"}
-lex_attr_getters = {"@language_data": "spacy.sk.lex_attr_getters"}
-"""
-
-
-@registry.language_data("spacy.sk.stop_words")
-def stop_words() -> Set[str]:
-    return STOP_WORDS
-
-
-@registry.language_data("spacy.sk.lex_attr_getters")
-def lex_attr_getters() -> Dict[int, Callable[[str], Any]]:
-    return LEX_ATTRS
+class SlovakDefaults(Language.Defaults):
+    lex_attr_getters = LEX_ATTRS
+    stop_words = STOP_WORDS


class Slovak(Language):
    lang = "sk"
-    default_config = Config().from_str(DEFAULT_CONFIG)
+    Defaults = SlovakDefaults


__all__ = ["Slovak"]

diff --git a/spacy/lang/sl/__init__.py b/spacy/lang/sl/__init__.py
index 4f1954669..0330cc4d0 100644
--- a/spacy/lang/sl/__init__.py
+++ b/spacy/lang/sl/__init__.py
@@ -1,26 +1,14 @@
-from typing import Set
-from thinc.api import Config
-
from .stop_words import STOP_WORDS
from ...language import Language
-from ...util import registry


-DEFAULT_CONFIG = """
-[nlp]
-lang = "sl"
-stop_words = {"@language_data": "spacy.sl.stop_words"}
-"""
-
-
-@registry.language_data("spacy.sl.stop_words")
-def stop_words() -> Set[str]:
-    return STOP_WORDS
+class SlovenianDefaults(Language.Defaults):
+    stop_words = STOP_WORDS


class Slovenian(Language):
    lang = "sl"
-    default_config = Config().from_str(DEFAULT_CONFIG)
+    Defaults = SlovenianDefaults


__all__ = ["Slovenian"]

diff --git a/spacy/lang/sq/__init__.py b/spacy/lang/sq/__init__.py
index a3da6b354..a4bacfa49 100644
--- a/spacy/lang/sq/__init__.py
+++ b/spacy/lang/sq/__init__.py
@@ -1,26 +1,14 @@
-from typing import Set
-from thinc.api import Config
-
from .stop_words import STOP_WORDS
from ...language import Language
-from ...util import registry


-DEFAULT_CONFIG = """
-[nlp]
-lang = "sq"
-stop_words = {"@language_data": "spacy.sq.stop_words"}
-"""
-
-
-@registry.language_data("spacy.sq.stop_words")
-def stop_words() -> Set[str]:
-    return STOP_WORDS
+class AlbanianDefaults(Language.Defaults):
+    stop_words = STOP_WORDS


class Albanian(Language):
    lang = "sq"
-    default_config = Config().from_str(DEFAULT_CONFIG)
+    Defaults = AlbanianDefaults


__all__ = ["Albanian"]

diff --git a/spacy/lang/sr/__init__.py b/spacy/lang/sr/__init__.py
index 36703aa5f..165e54975 100644
--- a/spacy/lang/sr/__init__.py
+++ b/spacy/lang/sr/__init__.py
@@ -1,52 +1,18 @@
-from typing import Set, Dict, Callable, Any
-from thinc.api import Config
-
from .stop_words import STOP_WORDS
from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
from .lex_attrs import LEX_ATTRS
from ...language import Language
-from ...util import registry
-
-
-DEFAULT_CONFIG = """
-[nlp]
-lang = "sr"
-stop_words = {"@language_data": "spacy.sr.stop_words"}
-lex_attr_getters = {"@language_data": "spacy.sr.lex_attr_getters"}
-
-[nlp.lemmatizer]
-@lemmatizers = "spacy.Lemmatizer.v1"
-
-[nlp.lemmatizer.data]
-@language_data = "spacy-lookups-data"
-lang = ${nlp:lang}
-tables = ["lemma_lookup"]
-
-[nlp.vocab_data]
-@language_data = "spacy-lookups-data"
-lang = ${nlp:lang}
-tables = ["lexeme_norm"]
-"""
-
-
-@registry.language_data("spacy.sr.stop_words")
-def stop_words() -> Set[str]:
-    return STOP_WORDS
-
-
-@registry.language_data("spacy.sr.lex_attr_getters")
-def lex_attr_getters() -> Dict[int, Callable[[str], Any]]:
-    return LEX_ATTRS


class SerbianDefaults(Language.Defaults):
    tokenizer_exceptions = TOKENIZER_EXCEPTIONS
+    lex_attr_getters = LEX_ATTRS
+    stop_words = STOP_WORDS


class Serbian(Language):
    lang = "sr"
    Defaults = SerbianDefaults
-    default_config = Config().from_str(DEFAULT_CONFIG)


__all__ = ["Serbian"]

diff --git a/spacy/lang/sv/__init__.py b/spacy/lang/sv/__init__.py
index dc9f71ac6..0c6a1b9f4 100644
--- a/spacy/lang/sv/__init__.py
+++ b/spacy/lang/sv/__init__.py
@@ -1,59 +1,25 @@
-from typing import Set, Dict, Callable, Any
-from thinc.api import Config
-
from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
from .stop_words import STOP_WORDS
from .lex_attrs import LEX_ATTRS
+from .syntax_iterators import SYNTAX_ITERATORS
from ...language import Language
-from ...util import registry
-from .syntax_iterators import noun_chunks

# Punctuation stolen from Danish
from ..da.punctuation import TOKENIZER_INFIXES, TOKENIZER_SUFFIXES

-DEFAULT_CONFIG = """
-[nlp]
-lang = "sv"
-stop_words = {"@language_data": "spacy.sv.stop_words"}
-lex_attr_getters = {"@language_data": "spacy.sv.lex_attr_getters"}
-get_noun_chunks = {"@language_data": "spacy.sv.get_noun_chunks"}
-
-[nlp.lemmatizer]
-@lemmatizers = "spacy.Lemmatizer.v1"
-
-[nlp.lemmatizer.data]
-@language_data = "spacy-lookups-data"
-lang = ${nlp:lang}
-tables = ["lemma_lookup", "lemma_rules"]
-"""
-
-
-@registry.language_data("spacy.sv.stop_words")
-def stop_words() -> Set[str]:
-    return STOP_WORDS
-
-
-@registry.language_data("spacy.sv.lex_attr_getters")
-def lex_attr_getters() -> Dict[int, Callable[[str], Any]]:
-    return LEX_ATTRS
-
-
-@registry.language_data("spacy.sv.get_noun_chunks")
-def get_noun_chunks() -> Callable:
-    return noun_chunks
-
-
class SwedishDefaults(Language.Defaults):
    tokenizer_exceptions = TOKENIZER_EXCEPTIONS
    infixes = TOKENIZER_INFIXES
    suffixes = TOKENIZER_SUFFIXES
+    lex_attr_getters = LEX_ATTRS
+    syntax_iterators = SYNTAX_ITERATORS
+    stop_words = STOP_WORDS


class Swedish(Language):
    lang = "sv"
    Defaults = SwedishDefaults
-    default_config = Config().from_str(DEFAULT_CONFIG)


__all__ = ["Swedish"]

diff --git a/spacy/lang/ta/__init__.py b/spacy/lang/ta/__init__.py
index c429127c9..ac5fc7124 100644
--- a/spacy/lang/ta/__init__.py
+++ b/spacy/lang/ta/__init__.py
@@ -1,38 +1,16 @@
-from typing import Set, Dict, Callable, Any
-from thinc.api import Config
-
from .stop_words import STOP_WORDS
from .lex_attrs import LEX_ATTRS
from ...language import Language
-from ...util import registry


-DEFAULT_CONFIG = """
-[nlp]
-lang = "ta"
-stop_words = {"@language_data": "spacy.ta.stop_words"}
-lex_attr_getters = {"@language_data": "spacy.ta.lex_attr_getters"}
-
-[nlp.vocab_data]
-@language_data = "spacy-lookups-data"
-lang = ${nlp:lang}
-tables = ["lexeme_norm"]
-"""
-
-
-@registry.language_data("spacy.ta.stop_words")
-def stop_words() -> Set[str]:
-    return STOP_WORDS
-
-
-@registry.language_data("spacy.ta.lex_attr_getters")
-def lex_attr_getters() -> Dict[int, Callable[[str], Any]]:
-    return LEX_ATTRS
+class TamilDefaults(Language.Defaults):
+    lex_attr_getters = LEX_ATTRS
+    stop_words = STOP_WORDS


class Tamil(Language):
    lang = "ta"
-    default_config = Config().from_str(DEFAULT_CONFIG)
+    Defaults = TamilDefaults


__all__ = ["Tamil"]

diff --git a/spacy/lang/te/__init__.py b/spacy/lang/te/__init__.py
index d012d418d..e6dc80e28 100644
--- a/spacy/lang/te/__init__.py
+++ b/spacy/lang/te/__init__.py
@@ -1,33 +1,16 @@
-from typing import Set, Dict, Callable, Any
-from thinc.api import Config
-
from .stop_words import STOP_WORDS
from .lex_attrs import LEX_ATTRS
from ...language import Language
-from ...util import registry


-DEFAULT_CONFIG = """
-[nlp]
-lang = "te"
-stop_words = {"@language_data": "spacy.te.stop_words"}
-lex_attr_getters = {"@language_data": "spacy.te.lex_attr_getters"}
-"""
-
-
-@registry.language_data("spacy.te.stop_words")
-def stop_words() -> Set[str]:
-    return STOP_WORDS
-
-
-@registry.language_data("spacy.te.lex_attr_getters")
-def lex_attr_getters() -> Dict[int, Callable[[str], Any]]:
-    return LEX_ATTRS
+class TeluguDefaults(Language.Defaults):
+    lex_attr_getters = LEX_ATTRS
+    stop_words = STOP_WORDS


class Telugu(Language):
    lang = "te"
-    default_config = Config().from_str(DEFAULT_CONFIG)
+    Defaults = TeluguDefaults


__all__ = ["Telugu"]

diff --git a/spacy/lang/th/__init__.py b/spacy/lang/th/__init__.py
index 1fdf4311e..989c22a42 100644
--- a/spacy/lang/th/__init__.py
+++ b/spacy/lang/th/__init__.py
@@ -1,4 +1,3 @@
-from typing import Set, Dict, Callable, Any
from thinc.api import Config

from .stop_words import STOP_WORDS
@@ -10,31 +9,13 @@ from ...util import DummyTokenizer, registry

DEFAULT_CONFIG = """
[nlp]
-lang = "th"
-stop_words = {"@language_data": "spacy.th.stop_words"}
-lex_attr_getters = {"@language_data": "spacy.th.lex_attr_getters"}

[nlp.tokenizer]
-@tokenizers = "spacy.ThaiTokenizer.v1"
-
-[nlp.vocab_data]
-@language_data = "spacy-lookups-data"
-lang = ${nlp:lang}
-tables = ["lexeme_norm"]
+@tokenizers = "spacy.th.ThaiTokenizer"
"""


-@registry.language_data("spacy.th.stop_words")
-def stop_words() -> Set[str]:
-    return STOP_WORDS
-
-
-@registry.language_data("spacy.th.lex_attr_getters")
-def lex_attr_getters() -> Dict[int, Callable[[str], Any]]:
-    return LEX_ATTRS
-
-
-@registry.tokenizers("spacy.ThaiTokenizer.v1")
+@registry.tokenizers("spacy.th.ThaiTokenizer")
def create_thai_tokenizer():
    def thai_tokenizer_factory(nlp):
        return ThaiTokenizer(nlp)
@@ -60,9 +41,15 @@ class ThaiTokenizer(DummyTokenizer):
        return Doc(self.vocab, words=words, spaces=spaces)


+class ThaiDefaults(Language.Defaults):
+    config = Config().from_str(DEFAULT_CONFIG)
+    lex_attr_getters = LEX_ATTRS
+    stop_words = STOP_WORDS
+
+
class Thai(Language):
    lang = "th"
-    default_config = Config().from_str(DEFAULT_CONFIG)
+    Defaults = ThaiDefaults


__all__ = ["Thai"]
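# --- Illustrative sketch (editor's note, not part of this patch) ---------------
# Custom tokenizers follow the same shape as spacy.th.ThaiTokenizer above: a
# registered factory returning a DummyTokenizer subclass bound to nlp.vocab. The
# registry name "my.WhitespaceTokenizer" and the class below are hypothetical.
from spacy.tokens import Doc
from spacy.util import DummyTokenizer, registry


@registry.tokenizers("my.WhitespaceTokenizer")
def create_whitespace_tokenizer():
    def tokenizer_factory(nlp):
        return WhitespaceTokenizer(nlp)

    return tokenizer_factory


class WhitespaceTokenizer(DummyTokenizer):
    def __init__(self, nlp):
        self.vocab = nlp.vocab

    def __call__(self, text):
        # Split naively on spaces and build a Doc from the resulting words
        return Doc(self.vocab, words=text.split(" "))
# -------------------------------------------------------------------------------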
diff --git a/spacy/lang/tl/__init__.py b/spacy/lang/tl/__init__.py
index 7176e07d4..61530dc30 100644
--- a/spacy/lang/tl/__init__.py
+++ b/spacy/lang/tl/__init__.py
@@ -1,47 +1,18 @@
-from typing import Set, Dict, Callable, Any
-from thinc.api import Config
-
from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
from .stop_words import STOP_WORDS
from .lex_attrs import LEX_ATTRS
from ...language import Language
-from ...util import registry
-
-
-DEFAULT_CONFIG = """
-[nlp]
-lang = "tl"
-stop_words = {"@language_data": "spacy.tl.stop_words"}
-lex_attr_getters = {"@language_data": "spacy.tl.lex_attr_getters"}
-
-[nlp.lemmatizer]
-@lemmatizers = "spacy.Lemmatizer.v1"
-
-[nlp.lemmatizer.data]
-@language_data = "spacy-lookups-data"
-lang = ${nlp:lang}
-tables = ["lemma_lookup"]
-"""
-
-
-@registry.language_data("spacy.tl.stop_words")
-def stop_words() -> Set[str]:
-    return STOP_WORDS
-
-
-@registry.language_data("spacy.tl.lex_attr_getters")
-def lex_attr_getters() -> Dict[int, Callable[[str], Any]]:
-    return LEX_ATTRS


class TagalogDefaults(Language.Defaults):
    tokenizer_exceptions = TOKENIZER_EXCEPTIONS
+    lex_attr_getters = LEX_ATTRS
+    stop_words = STOP_WORDS


class Tagalog(Language):
    lang = "tl"
    Defaults = TagalogDefaults
-    default_config = Config().from_str(DEFAULT_CONFIG)


__all__ = ["Tagalog"]

diff --git a/spacy/lang/tr/__init__.py b/spacy/lang/tr/__init__.py
index 3bb1e0d06..70b277487 100644
--- a/spacy/lang/tr/__init__.py
+++ b/spacy/lang/tr/__init__.py
@@ -1,40 +1,16 @@
-from typing import Set
-from thinc.api import Config
-
from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
from .stop_words import STOP_WORDS
from ...language import Language
-from ...util import registry
-
-
-DEFAULT_CONFIG = """
-[nlp]
-lang = "tr"
-stop_words = {"@language_data": "spacy.tr.stop_words"}
-
-[nlp.lemmatizer]
-@lemmatizers = "spacy.Lemmatizer.v1"
-
-[nlp.lemmatizer.data]
-@language_data = "spacy-lookups-data"
-lang = ${nlp:lang}
-tables = ["lemma_lookup"]
-"""
-
-
-@registry.language_data("spacy.tr.stop_words")
-def stop_words() -> Set[str]:
-    return STOP_WORDS


class TurkishDefaults(Language.Defaults):
    tokenizer_exceptions = TOKENIZER_EXCEPTIONS
+    stop_words = STOP_WORDS


class Turkish(Language):
    lang = "tr"
    Defaults = TurkishDefaults
-    default_config = Config().from_str(DEFAULT_CONFIG)


__all__ = ["Turkish"]

diff --git a/spacy/lang/tt/__init__.py b/spacy/lang/tt/__init__.py
index d4828d96c..c8e293f29 100644
--- a/spacy/lang/tt/__init__.py
+++ b/spacy/lang/tt/__init__.py
@@ -1,41 +1,20 @@
-from typing import Set, Dict, Callable, Any
-from thinc.api import Config
-
from .lex_attrs import LEX_ATTRS
from .punctuation import TOKENIZER_INFIXES
from .stop_words import STOP_WORDS
from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
from ...language import Language
-from ...util import registry
-
-
-DEFAULT_CONFIG = """
-[nlp]
-lang = "tt"
-stop_words = {"@language_data": "spacy.tt.stop_words"}
-lex_attr_getters = {"@language_data": "spacy.tt.lex_attr_getters"}
-"""
-
-
-@registry.language_data("spacy.tt.stop_words")
-def stop_words() -> Set[str]:
-    return STOP_WORDS
-
-
-@registry.language_data("spacy.tt.lex_attr_getters")
-def lex_attr_getters() -> Dict[int, Callable[[str], Any]]:
-    return LEX_ATTRS


class TatarDefaults(Language.Defaults):
    tokenizer_exceptions = TOKENIZER_EXCEPTIONS
-    infixes = tuple(TOKENIZER_INFIXES)
+    infixes = TOKENIZER_INFIXES
+    lex_attr_getters = LEX_ATTRS
+    stop_words = STOP_WORDS


class Tatar(Language):
    lang = "tt"
    Defaults = TatarDefaults
-    default_config = Config().from_str(DEFAULT_CONFIG)


__all__ = ["Tatar"]

diff --git a/spacy/lang/uk/__init__.py b/spacy/lang/uk/__init__.py
index 24a859951..6b44a7144 100644
--- a/spacy/lang/uk/__init__.py
+++ b/spacy/lang/uk/__init__.py
@@ -1,4 +1,4 @@
-from typing import Set, Dict, Callable, Any
+from typing import Callable
from thinc.api import Config

from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
@@ -11,38 +11,30 @@ from .lemmatizer import UkrainianLemmatizer

DEFAULT_CONFIG = """
[nlp]
-lang = "uk"
-stop_words = {"@language_data": "spacy.uk.stop_words"}
-lex_attr_getters = {"@language_data": "spacy.uk.lex_attr_getters"}

[nlp.lemmatizer]
-@lemmatizers = "spacy.UkrainianLemmatizer.v1"
+@lemmatizers = "spacy.uk.UkrainianLemmatizer"
"""


-@registry.language_data("spacy.uk.stop_words")
-def stop_words() -> Set[str]:
-    return STOP_WORDS
+@registry.lemmatizers("spacy.uk.UkrainianLemmatizer")
+def create_ukrainian_lemmatizer() -> Callable[[Language], UkrainianLemmatizer]:
+    def lemmatizer_factory(nlp: Language) -> UkrainianLemmatizer:
+        return UkrainianLemmatizer()

-
-@registry.language_data("spacy.uk.lex_attr_getters")
-def lex_attr_getters() -> Dict[int, Callable[[str], Any]]:
-    return LEX_ATTRS
-
-
-@registry.lemmatizers("spacy.UkrainianLemmatizer.v1")
-def create_ukrainian_lemmatizer() -> UkrainianLemmatizer:
-    return UkrainianLemmatizer()
+    return lemmatizer_factory


class UkrainianDefaults(Language.Defaults):
+    config = Config().from_str(DEFAULT_CONFIG)
    tokenizer_exceptions = TOKENIZER_EXCEPTIONS
+    lex_attr_getters = LEX_ATTRS
+    stop_words = STOP_WORDS


class Ukrainian(Language):
    lang = "uk"
    Defaults = UkrainianDefaults
-    default_config = Config().from_str(DEFAULT_CONFIG)


__all__ = ["Ukrainian"]

diff --git a/spacy/lang/ur/__init__.py b/spacy/lang/ur/__init__.py
index db714c296..e3dee5805 100644
--- a/spacy/lang/ur/__init__.py
+++ b/spacy/lang/ur/__init__.py
@@ -1,54 +1,19 @@
-from typing import Set, Dict, Callable, Any
-from thinc.api import Config
-
from .stop_words import STOP_WORDS
from .lex_attrs import LEX_ATTRS
from .punctuation import TOKENIZER_SUFFIXES
-from ..tokenizer_exceptions import BASE_EXCEPTIONS
from ...language import Language
-from ...util import registry
-
-
-DEFAULT_CONFIG = """
-[nlp]
-lang = "ur"
-stop_words = {"@language_data": "spacy.ur.stop_words"}
-lex_attr_getters = {"@language_data": "spacy.ur.lex_attr_getters"}
-
-[nlp.writing_system]
-direction = "rtl"
-has_case = false
-has_letters = true
-
-[nlp.lemmatizer]
-@lemmatizers = "spacy.Lemmatizer.v1"
-
-[nlp.lemmatizer.data]
-@language_data = "spacy-lookups-data"
-lang = ${nlp:lang}
-tables = ["lemma_lookup"]
-"""
-
-
-@registry.language_data("spacy.ur.stop_words")
-def stop_words() -> Set[str]:
-    return STOP_WORDS
-
-
-@registry.language_data("spacy.ur.lex_attr_getters")
-def lex_attr_getters() -> Dict[int, Callable[[str], Any]]:
-    return LEX_ATTRS


class UrduDefaults(Language.Defaults):
-    tokenizer_exceptions = BASE_EXCEPTIONS
    suffixes = TOKENIZER_SUFFIXES
+    lex_attr_getters = LEX_ATTRS
+    stop_words = STOP_WORDS
+    writing_system = {"direction": "rtl", "has_case": False, "has_letters": True}


class Urdu(Language):
    lang = "ur"
    Defaults = UrduDefaults
-    default_config = Config().from_str(DEFAULT_CONFIG)


__all__ = ["Urdu"]

diff --git a/spacy/lang/vi/__init__.py b/spacy/lang/vi/__init__.py
index 2003e904b..2b06d33f7 100644
--- a/spacy/lang/vi/__init__.py
+++ b/spacy/lang/vi/__init__.py
@@ -1,4 +1,3 @@
-from typing import Set, Dict, Callable, Any
from thinc.api import Config

from ...language import Language
@@ -10,27 +9,14 @@ from .lex_attrs import LEX_ATTRS

DEFAULT_CONFIG = """
[nlp]
-lang = "vi"
-stop_words = {"@language_data": "spacy.vi.stop_words"}
-lex_attr_getters = {"@language_data": "spacy.vi.lex_attr_getters"}

[nlp.tokenizer]
-@tokenizers = "spacy.VietnameseTokenizer.v1"
+@tokenizers = "spacy.vi.VietnameseTokenizer"
use_pyvi = true
"""


-@registry.language_data("spacy.vi.stop_words")
-def stop_words() -> Set[str]:
-    return STOP_WORDS
-
-
-@registry.language_data("spacy.vi.lex_attr_getters")
-def lex_attr_getters() -> Dict[int, Callable[[str], Any]]:
-    return LEX_ATTRS
-
-
-@registry.tokenizers("spacy.VietnameseTokenizer.v1")
+@registry.tokenizers("spacy.vi.VietnameseTokenizer")
def create_vietnamese_tokenizer(use_pyvi: bool = True,):
    def vietnamese_tokenizer_factory(nlp):
        return VietnameseTokenizer(nlp, use_pyvi=use_pyvi)
@@ -68,9 +54,15 @@ class VietnameseTokenizer(DummyTokenizer):
        return Doc(self.vocab, words=words, spaces=spaces)


+class VietnameseDefaults(Language.Defaults):
+    config = Config().from_str(DEFAULT_CONFIG)
+    lex_attr_getters = LEX_ATTRS
+    stop_words = STOP_WORDS
+
+
class Vietnamese(Language):
    lang = "vi"
-    default_config = Config().from_str(DEFAULT_CONFIG)
+    Defaults = VietnameseDefaults


__all__ = ["Vietnamese"]
diff --git a/spacy/lang/xx/__init__.py b/spacy/lang/xx/__init__.py
index 2167d9a5e..aff8403ff 100644
--- a/spacy/lang/xx/__init__.py
+++ b/spacy/lang/xx/__init__.py
@@ -1,27 +1,12 @@
-from thinc.api import Config
-
-from ..tokenizer_exceptions import BASE_EXCEPTIONS
from ...language import Language


-DEFAULT_CONFIG = """
-[nlp]
-lang = "xx"
-"""
-
-
-class MultiLanguageDefaults(Language.Defaults):
-    tokenizer_exceptions = BASE_EXCEPTIONS
-
-
class MultiLanguage(Language):
    """Language class to be used for models that support multiple languages.
    This module allows models to specify their language ID as 'xx'.
    """
    lang = "xx"
-    Defaults = MultiLanguageDefaults
-    default_config = Config().from_str(DEFAULT_CONFIG)


__all__ = ["MultiLanguage"]

diff --git a/spacy/lang/yo/__init__.py b/spacy/lang/yo/__init__.py
index b739ffbd7..df6bb7d4a 100644
--- a/spacy/lang/yo/__init__.py
+++ b/spacy/lang/yo/__init__.py
@@ -1,39 +1,16 @@
-from typing import Set, Dict, Callable, Any
-from thinc.api import Config
-
from .stop_words import STOP_WORDS
from .lex_attrs import LEX_ATTRS
-from ..tokenizer_exceptions import BASE_EXCEPTIONS
from ...language import Language
-from ...util import registry
-
-
-DEFAULT_CONFIG = """
-[nlp]
-lang = "si"
-stop_words = {"@language_data": "spacy.yo.stop_words"}
-lex_attr_getters = {"@language_data": "spacy.yo.lex_attr_getters"}
-"""
-
-
-@registry.language_data("spacy.yo.stop_words")
-def stop_words() -> Set[str]:
-    return STOP_WORDS
-
-
-@registry.language_data("spacy.yo.lex_attr_getters")
-def lex_attr_getters() -> Dict[int, Callable[[str], Any]]:
-    return LEX_ATTRS


class YorubaDefaults(Language.Defaults):
-    tokenizer_exceptions = BASE_EXCEPTIONS
+    lex_attr_getters = LEX_ATTRS
+    stop_words = STOP_WORDS


class Yoruba(Language):
    lang = "yo"
    Defaults = YorubaDefaults
-    default_config = Config().from_str(DEFAULT_CONFIG)


__all__ = ["Yoruba"]

diff --git a/spacy/lang/zh/__init__.py b/spacy/lang/zh/__init__.py
index ba5489dfd..fe0613c80 100644
--- a/spacy/lang/zh/__init__.py
+++ b/spacy/lang/zh/__init__.py
@@ -1,4 +1,4 @@
-from typing import Optional, List, Set, Dict, Callable, Any
+from typing import Optional, List, Dict, Any
from enum import Enum
import tempfile
import srsly
@@ -10,7 +10,6 @@ from ...errors import Warnings, Errors
from ...language import Language
from ...tokens import Doc
from ...util import DummyTokenizer, registry
-from ..tokenizer_exceptions import BASE_EXCEPTIONS
from .lex_attrs import LEX_ATTRS
from .stop_words import STOP_WORDS
from ... import util
@@ -20,20 +19,12 @@ _PKUSEG_INSTALL_MSG = "install it with `pip install pkuseg==0.0.25` or from http

DEFAULT_CONFIG = """
[nlp]
-lang = "zh"
-stop_words = {"@language_data": "spacy.zh.stop_words"}
-lex_attr_getters = {"@language_data": "spacy.zh.lex_attr_getters"}

[nlp.tokenizer]
-@tokenizers = "spacy.ChineseTokenizer.v1"
+@tokenizers = "spacy.zh.ChineseTokenizer"
segmenter = "char"
pkuseg_model = null
pkuseg_user_dict = "default"
-
-[nlp.writing_system]
-direction = "ltr"
-has_case = false
-has_letters = false
"""

@@ -47,17 +38,7 @@ class Segmenter(str, Enum):
        return list(cls.__members__.keys())


-@registry.language_data("spacy.zh.stop_words")
-def stop_words() -> Set[str]:
-    return STOP_WORDS
-
-
-@registry.language_data("spacy.zh.lex_attr_getters")
-def lex_attr_getters() -> Dict[int, Callable[[str], Any]]:
-    return LEX_ATTRS
-
-
-@registry.tokenizers("spacy.ChineseTokenizer.v1")
+@registry.tokenizers("spacy.zh.ChineseTokenizer")
def create_chinese_tokenizer(
    segmenter: Segmenter = Segmenter.char,
    pkuseg_model: Optional[str] = None,
@@ -155,6 +136,18 @@ class ChineseTokenizer(DummyTokenizer):
            warn_msg = Warnings.W104.format(target="pkuseg", current=self.segmenter)
            warnings.warn(warn_msg)

+    def _get_config(self) -> Dict[str, Any]:
+        return {
+            "segmenter": self.segmenter,
+            "pkuseg_model": self.pkuseg_model,
+            "pkuseg_user_dict": self.pkuseg_user_dict,
+        }
+
+    def _set_config(self, config: Dict[str, Any] = {}) -> None:
+        self.segmenter = config.get("segmenter", Segmenter.char)
+        self.pkuseg_model = config.get("pkuseg_model", None)
+        self.pkuseg_user_dict = config.get("pkuseg_user_dict", "default")
+
    def to_bytes(self, **kwargs):
        pkuseg_features_b = b""
        pkuseg_weights_b = b""
@@ -175,6 +168,7 @@ class ChineseTokenizer(DummyTokenizer):
                sorted(list(self.pkuseg_seg.postprocesser.other_words)),
            )
        serializers = {
+            "cfg": lambda: srsly.json_dumps(self._get_config()),
            "pkuseg_features": lambda: pkuseg_features_b,
            "pkuseg_weights": lambda: pkuseg_weights_b,
            "pkuseg_processors": lambda: srsly.msgpack_dumps(pkuseg_processors_data),
@@ -194,6 +188,7 @@ class ChineseTokenizer(DummyTokenizer):
            pkuseg_data["processors_data"] = srsly.msgpack_loads(b)

        deserializers = {
+            "cfg": lambda b: self._set_config(srsly.json_loads(b)),
            "pkuseg_features": deserialize_pkuseg_features,
            "pkuseg_weights": deserialize_pkuseg_weights,
            "pkuseg_processors": deserialize_pkuseg_processors,
@@ -246,6 +241,7 @@ class ChineseTokenizer(DummyTokenizer):
                srsly.write_msgpack(path, data)

        serializers = {
+            "cfg": lambda p: srsly.write_json(p, self._get_config()),
            "pkuseg_model": lambda p: save_pkuseg_model(p),
            "pkuseg_processors": lambda p: save_pkuseg_processors(p),
        }
@@ -281,6 +277,7 @@ class ChineseTokenizer(DummyTokenizer):
                self.pkuseg_seg.postprocesser.other_words = set(other_words)

        serializers = {
+            "cfg": lambda p: self._set_config(srsly.read_json(p)),
            "pkuseg_model": lambda p: load_pkuseg_model(p),
            "pkuseg_processors": lambda p: load_pkuseg_processors(p),
        }
@@ -288,13 +285,15 @@ class ChineseTokenizer(DummyTokenizer):


class ChineseDefaults(Language.Defaults):
-    tokenizer_exceptions = BASE_EXCEPTIONS
+    config = Config().from_str(DEFAULT_CONFIG)
+    lex_attr_getters = LEX_ATTRS
+    stop_words = STOP_WORDS
+    writing_system = {"direction": "ltr", "has_case": False, "has_letters": False}


class Chinese(Language):
    lang = "zh"
    Defaults = ChineseDefaults
-    default_config = Config().from_str(DEFAULT_CONFIG)


def try_jieba_import(segmenter: str) -> None:
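# --- Illustrative sketch (editor's note, not part of this patch) ---------------
# The new "cfg" serializer entries mean the Chinese tokenizer's settings survive
# to_bytes()/from_bytes() and to_disk()/from_disk(). Roughly what the added
# _get_config()/_set_config() round-trip does:
import srsly

cfg = {"segmenter": "char", "pkuseg_model": None, "pkuseg_user_dict": "default"}
restored = srsly.json_loads(srsly.json_dumps(cfg))
assert restored["segmenter"] == "char"
# -------------------------------------------------------------------------------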
diff --git a/spacy/language.py b/spacy/language.py
index 99fe98a66..6d2ae3dbe 100644
--- a/spacy/language.py
+++ b/spacy/language.py
@@ -16,27 +16,25 @@
import multiprocessing as mp
from itertools import chain, cycle

from .tokens.underscore import Underscore
-from .vocab import Vocab
+from .vocab import Vocab, create_vocab
from .pipe_analysis import analyze_pipes, analyze_all_pipes, validate_attrs
from .gold import Example
from .scorer import Scorer
from .util import link_vectors_to_models, create_default_optimizer, registry
from .util import SimpleFrozenDict
+from .lang.tokenizer_exceptions import URL_MATCH, BASE_EXCEPTIONS
from .lang.punctuation import TOKENIZER_PREFIXES, TOKENIZER_SUFFIXES
from .lang.punctuation import TOKENIZER_INFIXES
from .tokens import Doc
+from .lookups import load_lookups
+from .tokenizer import Tokenizer
+from .lemmatizer import Lemmatizer
from .errors import Errors, Warnings
from .schemas import ConfigSchema
from .git_info import GIT_VERSION
from . import util
from . import about

-# We also need to import these to make sure the functions are registered
-from .tokenizer import Tokenizer  # noqa: F401
-from .lemmatizer import Lemmatizer  # noqa: F401
-from .lookups import Lookups  # noqa: F401
-from .lang import defaults  # noqa: F401
-
ENABLE_PIPELINE_ANALYSIS = False
# This is the base config will all settings (training etc.)
@@ -45,10 +43,50 @@ DEFAULT_CONFIG = Config().from_disk(DEFAULT_CONFIG_PATH)


class BaseDefaults:
-    prefixes: Tuple[Pattern, ...] = tuple(TOKENIZER_PREFIXES)
-    suffixes: Tuple[Pattern, ...] = tuple(TOKENIZER_SUFFIXES)
-    infixes: Tuple[Pattern, ...] = tuple(TOKENIZER_INFIXES)
-    tokenizer_exceptions: Dict[str, List[dict]] = {}
+    config: Config = Config()
+    tokenizer_exceptions: Dict[str, List[dict]] = BASE_EXCEPTIONS
+    prefixes: Optional[List[Union[str, Pattern]]] = TOKENIZER_PREFIXES
+    suffixes: Optional[List[Union[str, Pattern]]] = TOKENIZER_SUFFIXES
+    infixes: Optional[List[Union[str, Pattern]]] = TOKENIZER_INFIXES
+    token_match: Optional[Pattern] = None
+    url_match: Optional[Pattern] = URL_MATCH
+    syntax_iterators: Dict[str, Callable] = {}
+    lex_attr_getters: Dict[int, Callable[[str], Any]] = {}
+    stop_words = set()
+    writing_system = {"direction": "ltr", "has_case": True, "has_letters": True}
+
+
+@registry.tokenizers("spacy.Tokenizer.v1")
+def create_tokenizer() -> Callable[["Language"], Tokenizer]:
+    def tokenizer_factory(nlp: "Language") -> Tokenizer:
+        prefixes = nlp.Defaults.prefixes
+        suffixes = nlp.Defaults.suffixes
+        infixes = nlp.Defaults.infixes
+        prefix_search = util.compile_prefix_regex(prefixes).search if prefixes else None
+        suffix_search = util.compile_suffix_regex(suffixes).search if suffixes else None
+        infix_finditer = util.compile_infix_regex(infixes).finditer if infixes else None
+        return Tokenizer(
+            nlp.vocab,
+            rules=nlp.Defaults.tokenizer_exceptions,
+            prefix_search=prefix_search,
+            suffix_search=suffix_search,
+            infix_finditer=infix_finditer,
+            token_match=nlp.Defaults.token_match,
+            url_match=nlp.Defaults.url_match,
+        )
+
+    return tokenizer_factory
+
+
+@registry.lemmatizers("spacy.Lemmatizer.v1")
+def create_lemmatizer() -> Callable[["Language"], "Lemmatizer"]:
+    tables = ["lemma_lookup", "lemma_rules", "lemma_exc", "lemma_index"]
+
+    def lemmatizer_factory(nlp: "Language") -> "Lemmatizer":
+        lookups = load_lookups(lang=nlp.lang, tables=tables, strict=False)
+        return Lemmatizer(lookups=lookups)
+
+    return lemmatizer_factory


class Language:
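# --- Illustrative sketch (editor's note, not part of this patch) ---------------
# The "spacy.Tokenizer.v1" factory above builds the tokenizer entirely from
# nlp.Defaults, so a Language subclass customizes tokenization by overriding
# class attributes rather than config entries. Hedged usage sketch, assuming the
# English subclass follows the pattern shown in this patch:
from spacy.lang.en import English

nlp = English()
doc = nlp.make_doc("This is a sentence.")
print([token.text for token in doc])
# -------------------------------------------------------------------------------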
@@ -65,8 +103,8 @@ class Language:
    Defaults = BaseDefaults
    lang: str = None
    default_config = DEFAULT_CONFIG
-    factories = SimpleFrozenDict(error=Errors.E957)
+    factories = SimpleFrozenDict(error=Errors.E957)
    _factory_meta: Dict[str, "FactoryMeta"] = {}  # meta by factory

    def __init__(
@@ -75,6 +113,7 @@ class Language:
        max_length: int = 10 ** 6,
        meta: Dict[str, Any] = {},
        create_tokenizer: Optional[Callable[["Language"], Callable[[str], Doc]]] = None,
+        create_lemmatizer: Optional[Callable[["Language"], Callable]] = None,
        **kwargs,
    ):
        """Initialise a Language object.
@@ -108,7 +147,16 @@ class Language:
        if vocab is True:
            vectors_name = meta.get("vectors", {}).get("name")
-            vocab = Vocab.from_config(self._config, vectors_name=vectors_name)
+            if not create_lemmatizer:
+                lemma_cfg = {"lemmatizer": self._config["nlp"]["lemmatizer"]}
+                create_lemmatizer = registry.make_from_config(lemma_cfg)["lemmatizer"]
+            # TODO: where does the vocab data come in?
+            vocab = create_vocab(
+                self.lang,
+                self.Defaults,
+                lemmatizer=create_lemmatizer(self),
+                vectors_name=vectors_name,
+            )
        else:
            if (self.lang and vocab.lang) and (self.lang != vocab.lang):
                raise ValueError(Errors.E150.format(nlp=self.lang, vocab=vocab.lang))
@@ -126,7 +174,10 @@ class Language:

    def __init_subclass__(cls, **kwargs):
        super().__init_subclass__(**kwargs)
-        cls.default_config = util.deep_merge_configs(cls.default_config, DEFAULT_CONFIG)
+        cls.default_config = util.deep_merge_configs(
+            cls.Defaults.config, DEFAULT_CONFIG
+        )
+        cls.default_config["nlp"]["lang"] = cls.lang

    @property
    def path(self):
@@ -1226,17 +1277,16 @@ class Language:
        config = util.deep_merge_configs(config, cls.default_config)
        if "nlp" not in config:
            raise ValueError(Errors.E985.format(config=config))
-        nlp_config = config["nlp"]
-        config_lang = nlp_config["lang"]
+        config_lang = config["nlp"]["lang"]
        if cls.lang is not None and config_lang is not None and config_lang != cls.lang:
            raise ValueError(
                Errors.E958.format(
-                    bad_lang_code=nlp_config["lang"],
+                    bad_lang_code=config["nlp"]["lang"],
                    lang_code=cls.lang,
                    lang=util.get_object_name(cls),
                )
            )
-        nlp_config["lang"] = cls.lang
+        config["nlp"]["lang"] = cls.lang
        # This isn't very elegant, but we remove the [components] block here to prevent
        # it from getting resolved (causes problems because we expect to pass in
        # the nlp and name args for each component). If we're auto-filling, we're
@@ -1251,22 +1301,12 @@ class Language:
            filled["components"] = orig_pipeline
            config["components"] = orig_pipeline
        create_tokenizer = resolved["nlp"]["tokenizer"]
-        lemmatizer = resolved["nlp"]["lemmatizer"]
-        lex_attr_getters = resolved["nlp"]["lex_attr_getters"]
-        stop_words = resolved["nlp"]["stop_words"]
-        vocab_data = resolved["nlp"]["vocab_data"]
-        get_noun_chunks = resolved["nlp"]["get_noun_chunks"]
-        vocab = Vocab.from_config(
-            filled,
-            lemmatizer=lemmatizer,
-            lex_attr_getters=lex_attr_getters,
-            stop_words=stop_words,
-            vocab_data=vocab_data,
-            get_noun_chunks=get_noun_chunks,
+        create_lemmatizer = resolved["nlp"]["lemmatizer"]
+        nlp = cls(
+            create_tokenizer=create_tokenizer, create_lemmatizer=create_lemmatizer,
        )
-        nlp = cls(vocab, create_tokenizer=create_tokenizer)
        pipeline = config.get("components", {})
-        for pipe_name in nlp_config["pipeline"]:
+        for pipe_name in config["nlp"]["pipeline"]:
            if pipe_name not in pipeline:
                opts = ", ".join(pipeline.keys())
                raise ValueError(Errors.E956.format(name=pipe_name, opts=opts))

diff --git a/spacy/lemmatizer.py b/spacy/lemmatizer.py
index 8255b4b36..1cfb681f4 100644
--- a/spacy/lemmatizer.py
+++ b/spacy/lemmatizer.py
@@ -2,12 +2,6 @@ from typing import Optional, Callable, List, Dict

from .lookups import Lookups
from .parts_of_speech import NAMES as UPOS_NAMES
-from .util import registry
-
-
-@registry.lemmatizers("spacy.Lemmatizer.v1")
-def create_lemmatizer(data: Dict[str, str] = {}) -> "Lemmatizer":
-    return Lemmatizer(data=data)


class Lemmatizer:
@@ -21,7 +15,6 @@ class Lemmatizer:
    def __init__(
        self,
        lookups: Optional[Lookups] = None,
-        data: Dict[str, dict] = {},
        is_base_form: Optional[Callable] = None,
    ) -> None:
        """Initialize a Lemmatizer.
@@ -31,9 +24,6 @@ class Lemmatizer:
        RETURNS (Lemmatizer): The newly constructed object.
        """
        self.lookups = lookups if lookups is not None else Lookups()
-        for name, table in data.items():
-            if table is not None:
-                self.lookups.add_table(name, table)
        self.is_base_form = is_base_form

    def __call__(

diff --git a/spacy/lookups.py b/spacy/lookups.py
index d5def882e..e5a4a0b40 100644
--- a/spacy/lookups.py
+++ b/spacy/lookups.py
@@ -13,7 +13,9 @@ UNSET = object()


@registry.language_data("spacy-lookups-data")
-def get_lookups(lang: str, tables: List[str]) -> Optional[Dict[str, Any]]:
+def load_lookups(
+    lang: str, tables: List[str], strict: bool = True
+) -> Optional[Dict[str, Any]]:
    """Load the data from the spacy-lookups-data package for a given language,
    if available. Returns an empty dict if there's no data or if the package
    is not installed.
@@ -24,15 +26,19 @@ def get_lookups(lang: str, tables: List[str]) -> Optional[Dict[str, Any]]:
    RETURNS (Dict[str, Any]): The lookups, keyed by table name.
    """
    # TODO: import spacy_lookups_data instead of going via entry points here?
+    lookups = Lookups()
    if lang not in registry.lookups:
-        return {}
+        return lookups
    data = registry.lookups.get(lang)
-    result = {}
    for table in tables:
        if table not in data:
-            raise ValueError("TODO: unknown table")
-        result[table] = load_language_data(data[table])
-    return result
+            if strict:
+                raise ValueError("TODO: unknown table")
+            language_data = {}
+        else:
+            language_data = load_language_data(data[table])
+        lookups.add_table(table, language_data)
+    return lookups


class Lookups:

diff --git a/spacy/schemas.py b/spacy/schemas.py
index 8b6e3ebab..ad16f3233 100644
--- a/spacy/schemas.py
+++ b/spacy/schemas.py
@@ -239,11 +239,7 @@ class ConfigSchemaNlp(BaseModel):
    pipeline: List[StrictStr] = Field(..., title="The pipeline component names in order")
    tokenizer: Callable = Field(..., title="The tokenizer to use")
    lemmatizer: Callable = Field(..., title="The lemmatizer to use")
-    writing_system: ConfigSchemaNlpWritingSystem = Field(..., title="The language's writing system")
-    stop_words: Sequence[StrictStr] = Field(..., title="Stop words to mark via Token/Lexeme.is_stop")
-    lex_attr_getters: Dict[StrictStr, Callable] = Field(..., title="Custom getter functions for lexical attributes (e.g. like_num)")
    vocab_data: Dict[StrictStr, Dict[StrictStr, Any]] = Field(..., title="Vocabulary data, e.g. lexeme normalization tables")
-    get_noun_chunks: Optional[Callable] = Field(..., title="Function to extract noun phrases from a Doc")
    # fmt: on

    class Config:

diff --git a/spacy/tests/conftest.py b/spacy/tests/conftest.py
index a2e319e12..cfdb8e4ff 100644
--- a/spacy/tests/conftest.py
+++ b/spacy/tests/conftest.py
@@ -257,7 +257,7 @@ def zh_tokenizer_char():
def zh_tokenizer_jieba():
    pytest.importorskip("jieba")
    config = {
-        "@tokenizers": "spacy.ChineseTokenizer.v1",
+        "@tokenizers": "spacy.zh.ChineseTokenizer",
        "segmenter": "jieba",
    }
    nlp = get_lang_class("zh").from_config({"nlp": {"tokenizer": config}})
@@ -268,7 +268,7 @@ def zh_tokenizer_pkuseg():
    pytest.importorskip("pkuseg")
    config = {
-        "@tokenizers": "spacy.ChineseTokenizer.v1",
+        "@tokenizers": "spacy.zh.ChineseTokenizer",
        "segmenter": "pkuseg",
        "pkuseg_model": "default",
    }

diff --git a/spacy/tokenizer.pyx b/spacy/tokenizer.pyx
index 5fffa4503..6268a77ae 100644
--- a/spacy/tokenizer.pyx
+++ b/spacy/tokenizer.pyx
@@ -26,37 +26,6 @@ from .attrs import intify_attrs
from .symbols import ORTH


-@registry.tokenizers("spacy.Tokenizer.v1")
-def create_tokenizer(
-    # exceptions: Dict[str, List[dict]],
-    # prefixes: Optional[List[Union[str, Pattern]]],
-    # suffixes: Optional[List[Union[str, Pattern]]],
-    # infixes: Optional[List[Union[str, Pattern]]],
-    # We currently can't validate against Pattern because that will cause
-    # Pydantic to parse value *as* pattern
-    token_match: Optional[Any] = None,
-    url_match: Optional[Any] = None,
-) -> "Tokenizer":
-    def tokenizer_factory(nlp):
-        exceptions = nlp.Defaults.tokenizer_exceptions
-        prefixes = nlp.Defaults.prefixes
-        suffixes = nlp.Defaults.suffixes
-        infixes = nlp.Defaults.infixes
-        prefix_search = util.compile_prefix_regex(prefixes).search if prefixes else None
-        suffix_search = util.compile_suffix_regex(suffixes).search if suffixes else None
-        infix_finditer = util.compile_infix_regex(infixes).finditer if infixes else None
-        return Tokenizer(
-            nlp.vocab,
-            rules=exceptions,
-            prefix_search=prefix_search,
-            suffix_search=suffix_search,
-            infix_finditer=infix_finditer,
-            token_match=token_match,
-            url_match=url_match,
-        )
-    return tokenizer_factory
-
-
cdef class Tokenizer:
    """Segment text, and create Doc objects with the discovered segment
    boundaries.

diff --git a/spacy/vocab.pyx b/spacy/vocab.pyx
index 1a4959833..0f99a45f5 100644
--- a/spacy/vocab.pyx
+++ b/spacy/vocab.pyx
@@ -23,6 +23,33 @@ from .lang.norm_exceptions import BASE_NORMS
from .lang.lex_attrs import LEX_ATTRS, is_stop, get_lang


+def create_vocab(lang, defaults, lemmatizer=None, vocab_data={}, vectors_name=None):
+    lex_attrs = {**LEX_ATTRS, **defaults.lex_attr_getters}
+    # This is messy, but it's the minimal working fix to Issue #639.
+    lex_attrs[IS_STOP] = functools.partial(is_stop, stops=defaults.stop_words)
+    # Ensure that getter can be pickled
+    lex_attrs[LANG] = functools.partial(get_lang, lang=lang)
+    lex_attrs[NORM] = util.add_lookups(
+        lex_attrs.get(NORM, LEX_ATTRS[NORM]),
+        BASE_NORMS,
+        vocab_data.get("lexeme_norm", {}),
+    )
+    lookups = Lookups()
+    for name, data in vocab_data.items():
+        if name not in lookups:
+            data = data if data is not None else {}
+            lookups.add_table(name, data)
+    return Vocab(
+        lex_attr_getters=lex_attrs,
+        lemmatizer=lemmatizer,
+        lookups=lookups,
+        writing_system=defaults.writing_system,
+        get_noun_chunks=defaults.syntax_iterators.get("noun_chunks"),
+        vectors_name=vectors_name,
+    )
+
+
cdef class Vocab:
    """A look-up table that allows you to access `Lexeme` objects. The `Vocab`
    instance also provides access to the `StringStore`, and owns underlying
@@ -31,7 +58,7 @@ cdef class Vocab:
    DOCS: https://spacy.io/api/vocab
    """
    def __init__(self, lex_attr_getters=None, lemmatizer=None,
-                 strings=tuple(), lookups=None, tag_map={}, vocab_data={},
+                 strings=tuple(), lookups=None, tag_map={},
                 oov_prob=-20., vectors_name=None, writing_system={},
                 get_noun_chunks=None, **deprecated_kwargs):
        """Create the vocabulary.
@@ -51,10 +78,6 @@ cdef class Vocab:
        lex_attr_getters = lex_attr_getters if lex_attr_getters is not None else {}
        if lookups in (None, True, False):
            lookups = Lookups()
-            for name, data in vocab_data.items():
-                if name not in lookups:
-                    data = data if data is not None else {}
-                    lookups.add_table(name, data)
        if lemmatizer in (None, True, False):
            lemmatizer = Lemmatizer(lookups)
        self.cfg = {'oov_prob': oov_prob}
@@ -416,66 +439,6 @@ cdef class Vocab:
            orth = self.strings.add(orth)
        return orth in self.vectors

-    @classmethod
-    def from_config(
-        cls,
-        config,
-        lemmatizer=None,
-        lex_attr_getters=None,
-        stop_words=None,
-        vocab_data=None,
-        get_noun_chunks=None,
-        vectors_name=None,
-    ):
-        """Create a Vocab from a config and (currently) language defaults, i.e.
-        nlp.Defaults.
-
-        config (Dict[str, Any]): The full config.
-        lemmatizer (Callable): Optional lemmatizer.
-        vectors_name (str): Optional vectors name.
-        RETURNS (Vocab): The vocab.
-        """
-        # TODO: make this less messy – move lemmatizer out into its own pipeline
-        # component, move language defaults to config
-        lang = config["nlp"]["lang"]
-        writing_system = config["nlp"]["writing_system"]
-        if not lemmatizer:
-            lemma_cfg = {"lemmatizer": config["nlp"]["lemmatizer"]}
-            lemmatizer = registry.make_from_config(lemma_cfg)["lemmatizer"]
-        if stop_words is None:
-            stop_words_cfg = {"stop_words": config["nlp"]["stop_words"]}
-            stop_words = registry.make_from_config(stop_words_cfg)["stop_words"]
-        if vocab_data is None:
-            vocab_data_cfg = {"vocab_data": config["nlp"]["vocab_data"]}
-            vocab_data = registry.make_from_config(vocab_data_cfg)["vocab_data"]
-        if get_noun_chunks is None:
-            noun_chunks_cfg = {"get_noun_chunks": config["nlp"]["get_noun_chunks"]}
-            get_noun_chunks = registry.make_from_config(noun_chunks_cfg)["get_noun_chunks"]
-        if lex_attr_getters is None:
-            lex_attrs_cfg = {"lex_attr_getters": config["nlp"]["lex_attr_getters"]}
-            lex_attr_getters = registry.make_from_config(lex_attrs_cfg)["lex_attr_getters"]
-        lex_attrs = dict(LEX_ATTRS)
-        lex_attrs.update(lex_attr_getters)
-        # This is messy, but it's the minimal working fix to Issue #639.
-        lex_attrs[IS_STOP] = functools.partial(is_stop, stops=stop_words)
-        # Ensure that getter can be pickled
-        lex_attrs[LANG] = functools.partial(get_lang, lang=lang)
-        lex_attrs[NORM] = util.add_lookups(
-            lex_attrs.get(NORM, LEX_ATTRS[NORM]),
-            BASE_NORMS,
-            vocab_data.get("lexeme_norm", {}),
-        )
-        vocab = cls(
-            lex_attr_getters=lex_attrs,
-            vocab_data=vocab_data,
-            lemmatizer=lemmatizer,
-            writing_system=writing_system,
-            get_noun_chunks=get_noun_chunks
-        )
-        if vocab.vectors.name is None and vectors_name:
-            vocab.vectors.name = vectors_name
-        return vocab
-
    def to_disk(self, path, exclude=tuple()):
        """Save the current state to a directory.