Simplify language data and revert detailed configs

This commit is contained in:
Ines Montani 2020-07-24 14:50:26 +02:00
parent 87737a5a60
commit 38f6ea7a78
70 changed files with 414 additions and 1677 deletions

View File

@ -1,24 +1,13 @@
[nlp] [nlp]
lang = null lang = null
stop_words = []
lex_attr_getters = {}
vocab_data = {} vocab_data = {}
get_noun_chunks = null
pipeline = [] pipeline = []
[nlp.tokenizer] [nlp.tokenizer]
@tokenizers = "spacy.Tokenizer.v1" @tokenizers = "spacy.Tokenizer.v1"
token_match = null
url_match = {"@language_data": "spacy.xx.url_match"}
[nlp.lemmatizer] [nlp.lemmatizer]
@lemmatizers = "spacy.Lemmatizer.v1" @lemmatizers = "spacy.Lemmatizer.v1"
data = {}
[nlp.writing_system]
direction = "ltr"
has_case = true
has_letters = true
[components] [components]

View File

@ -1,26 +1,14 @@
from typing import Set
from thinc.api import Config
from .stop_words import STOP_WORDS from .stop_words import STOP_WORDS
from ...language import Language from ...language import Language
from ...util import registry
DEFAULT_CONFIG = """ class AfrikaansDefaults(Language.Defaults):
[nlp] stop_words = STOP_WORDS
lang = "af"
stop_words = {"@language_data": "spacy.af.stop_words"}
"""
@registry.language_data("spacy.af.stop_words")
def stop_words() -> Set[str]:
return STOP_WORDS
class Afrikaans(Language): class Afrikaans(Language):
lang = "af" lang = "af"
default_config = Config().from_str(DEFAULT_CONFIG) Defaults = AfrikaansDefaults
__all__ = ["Afrikaans"] __all__ = ["Afrikaans"]

View File

@ -1,46 +1,21 @@
from typing import Set, Dict, Callable, Any
from thinc.api import Config
from .stop_words import STOP_WORDS from .stop_words import STOP_WORDS
from .lex_attrs import LEX_ATTRS from .lex_attrs import LEX_ATTRS
from .punctuation import TOKENIZER_SUFFIXES from .punctuation import TOKENIZER_SUFFIXES
from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
from ...language import Language from ...language import Language
from ...util import registry
DEFAULT_CONFIG = """
[nlp]
lang = "ar"
stop_words = {"@language_data": "spacy.ar.stop_words"}
lex_attr_getters = {"@language_data": "spacy.ar.lex_attr_getters"}
[nlp.writing_system]
direction = "rtl"
has_case = false
has_letters = true
"""
@registry.language_data("spacy.ar.stop_words")
def stop_words() -> Set[str]:
return STOP_WORDS
@registry.language_data("spacy.ar.lex_attr_getters")
def lex_attr_getters() -> Dict[int, Callable[[str], Any]]:
return LEX_ATTRS
class ArabicDefaults(Language.Defaults): class ArabicDefaults(Language.Defaults):
tokenizer_exceptions = TOKENIZER_EXCEPTIONS tokenizer_exceptions = TOKENIZER_EXCEPTIONS
suffixes = TOKENIZER_SUFFIXES suffixes = TOKENIZER_SUFFIXES
stop_words = STOP_WORDS
lex_attr_getters = LEX_ATTRS
writing_system = {"direction": "rtl", "has_case": False, "has_letters": True}
class Arabic(Language): class Arabic(Language):
lang = "ar"
Defaults = ArabicDefaults Defaults = ArabicDefaults
default_config = Config().from_str(DEFAULT_CONFIG) lang = "ar"
__all__ = ["Arabic"] __all__ = ["Arabic"]

View File

@ -1,26 +1,14 @@
from typing import Set
from thinc.api import Config
from .stop_words import STOP_WORDS from .stop_words import STOP_WORDS
from ...language import Language from ...language import Language
from ...util import registry
DEFAULT_CONFIG = """ class BulgarianDefaults(Language.Defaults):
[nlp] stop_words = STOP_WORDS
lang = "bg"
stop_words = {"@language_data": "spacy.bg.stop_words"}
"""
@registry.language_data("spacy.bg.stop_words")
def stop_words() -> Set[str]:
return STOP_WORDS
class Bulgarian(Language): class Bulgarian(Language):
lang = "bg" lang = "bg"
default_config = Config().from_str(DEFAULT_CONFIG) Defaults = BulgarianDefaults
__all__ = ["Bulgarian"] __all__ = ["Bulgarian"]

View File

@ -1,31 +1,7 @@
from typing import Set
from thinc.api import Config
from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
from .punctuation import TOKENIZER_PREFIXES, TOKENIZER_SUFFIXES, TOKENIZER_INFIXES from .punctuation import TOKENIZER_PREFIXES, TOKENIZER_SUFFIXES, TOKENIZER_INFIXES
from .stop_words import STOP_WORDS from .stop_words import STOP_WORDS
from ...language import Language from ...language import Language
from ...util import registry
DEFAULT_CONFIG = """
[nlp]
lang = "bn"
stop_words = {"@language_data": "spacy.bn.stop_words"}
[nlp.lemmatizer]
@lemmatizers = "spacy.Lemmatizer.v1"
[nlp.lemmatizer.data]
@language_data = "spacy-lookups-data"
lang = ${nlp:lang}
tables = ["lemma_rules"]
"""
@registry.language_data("spacy.bn.stop_words")
def stop_words() -> Set[str]:
return STOP_WORDS
class BengaliDefaults(Language.Defaults): class BengaliDefaults(Language.Defaults):
@ -33,12 +9,12 @@ class BengaliDefaults(Language.Defaults):
prefixes = TOKENIZER_PREFIXES prefixes = TOKENIZER_PREFIXES
suffixes = TOKENIZER_SUFFIXES suffixes = TOKENIZER_SUFFIXES
infixes = TOKENIZER_INFIXES infixes = TOKENIZER_INFIXES
stop_words = STOP_WORDS
class Bengali(Language): class Bengali(Language):
lang = "bn" lang = "bn"
Defaults = BengaliDefaults Defaults = BengaliDefaults
default_config = Config().from_str(DEFAULT_CONFIG)
__all__ = ["Bengali"] __all__ = ["Bengali"]

View File

@ -1,49 +1,20 @@
from typing import Set, Dict, Callable, Any
from thinc.api import Config
from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
from .punctuation import TOKENIZER_INFIXES
from .stop_words import STOP_WORDS from .stop_words import STOP_WORDS
from .lex_attrs import LEX_ATTRS from .lex_attrs import LEX_ATTRS
from ...language import Language from ...language import Language
from ...util import registry
from .punctuation import TOKENIZER_INFIXES
DEFAULT_CONFIG = """
[nlp]
lang = "ca"
stop_words = {"@language_data": "spacy.ca.stop_words"}
lex_attr_getters = {"@language_data": "spacy.ca.lex_attr_getters"}
[nlp.lemmatizer]
@lemmatizers = "spacy.Lemmatizer.v1"
[nlp.lemmatizer.data]
@language_data = "spacy-lookups-data"
lang = ${nlp:lang}
tables = ["lemma_lookup"]
"""
@registry.language_data("spacy.ca.stop_words")
def stop_words() -> Set[str]:
return STOP_WORDS
@registry.language_data("spacy.ca.lex_attr_getters")
def lex_attr_getters() -> Dict[int, Callable[[str], Any]]:
return LEX_ATTRS
class CatalanDefaults(Language.Defaults): class CatalanDefaults(Language.Defaults):
tokenizer_exceptions = TOKENIZER_EXCEPTIONS tokenizer_exceptions = TOKENIZER_EXCEPTIONS
infixes = TOKENIZER_INFIXES infixes = TOKENIZER_INFIXES
stop_words = STOP_WORDS
lex_attr_getters = LEX_ATTRS
class Catalan(Language): class Catalan(Language):
lang = "ca" lang = "ca"
Defaults = CatalanDefaults Defaults = CatalanDefaults
default_config = Config().from_str(DEFAULT_CONFIG)
__all__ = ["Catalan"] __all__ = ["Catalan"]

View File

@ -1,26 +1,14 @@
from typing import Set
from thinc.api import Config
from .stop_words import STOP_WORDS from .stop_words import STOP_WORDS
from ...language import Language from ...language import Language
from ...util import registry
DEFAULT_CONFIG = """ class CzechDefaults(Language.Defaults):
[nlp] stop_words = STOP_WORDS
lang = "cs"
stop_words = {"@language_data": "spacy.cs.stop_words"}
"""
@registry.language_data("spacy.cs.stop_words")
def stop_words() -> Set[str]:
return STOP_WORDS
class Czech(Language): class Czech(Language):
lang = "cs" lang = "cs"
default_config = Config().from_str(DEFAULT_CONFIG) Defaults = CzechDefaults
__all__ = ["Czech"] __all__ = ["Czech"]

View File

@ -1,55 +1,21 @@
from typing import Set, Dict, Callable, Any
from thinc.api import Config
from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
from .punctuation import TOKENIZER_INFIXES, TOKENIZER_SUFFIXES from .punctuation import TOKENIZER_INFIXES, TOKENIZER_SUFFIXES
from .stop_words import STOP_WORDS from .stop_words import STOP_WORDS
from .lex_attrs import LEX_ATTRS from .lex_attrs import LEX_ATTRS
from ...language import Language from ...language import Language
from ...util import registry
DEFAULT_CONFIG = """
[nlp]
lang = "da"
stop_words = {"@language_data": "spacy.da.stop_words"}
lex_attr_getters = {"@language_data": "spacy.da.lex_attr_getters"}
[nlp.lemmatizer]
@lemmatizers = "spacy.Lemmatizer.v1"
[nlp.lemmatizer.data]
@language_data = "spacy-lookups-data"
lang = ${nlp:lang}
tables = ["lemma_lookup"]
[nlp.vocab_data]
@language_data = "spacy-lookups-data"
lang = ${nlp:lang}
tables = ["lexeme_norm"]
"""
@registry.language_data("spacy.da.stop_words")
def stop_words() -> Set[str]:
return STOP_WORDS
@registry.language_data("spacy.da.lex_attr_getters")
def lex_attr_getters() -> Dict[int, Callable[[str], Any]]:
return LEX_ATTRS
class DanishDefaults(Language.Defaults): class DanishDefaults(Language.Defaults):
tokenizer_exceptions = TOKENIZER_EXCEPTIONS tokenizer_exceptions = TOKENIZER_EXCEPTIONS
infixes = TOKENIZER_INFIXES infixes = TOKENIZER_INFIXES
suffixes = TOKENIZER_SUFFIXES suffixes = TOKENIZER_SUFFIXES
lex_attr_getters = LEX_ATTRS
stop_words = STOP_WORDS
class Danish(Language): class Danish(Language):
lang = "da" lang = "da"
Defaults = DanishDefaults Defaults = DanishDefaults
default_config = Config().from_str(DEFAULT_CONFIG)
__all__ = ["Danish"] __all__ = ["Danish"]

View File

@ -1,44 +1,8 @@
from typing import Set, Callable
from thinc.api import Config
from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
from .punctuation import TOKENIZER_PREFIXES, TOKENIZER_SUFFIXES from .punctuation import TOKENIZER_PREFIXES, TOKENIZER_SUFFIXES, TOKENIZER_INFIXES
from .punctuation import TOKENIZER_INFIXES
from .stop_words import STOP_WORDS from .stop_words import STOP_WORDS
from .syntax_iterators import noun_chunks from .syntax_iterators import SYNTAX_ITERATORS
from ...language import Language from ...language import Language
from ...util import registry
DEFAULT_CONFIG = """
[nlp]
lang = "de"
stop_words = {"@language_data": "spacy.de.stop_words"}
get_noun_chunks = {"@language_data": "spacy.de.get_noun_chunks"}
[nlp.lemmatizer]
@lemmatizers = "spacy.Lemmatizer.v1"
[nlp.lemmatizer.data]
@language_data = "spacy-lookups-data"
lang = ${nlp:lang}
tables = ["lemma_lookup"]
[nlp.vocab_data]
@language_data = "spacy-lookups-data"
lang = ${nlp:lang}
tables = ["lexeme_norm", "lexeme_cluster", "lexeme_prob", "lexeme_settings", "orth_variants"]
"""
@registry.language_data("spacy.de.stop_words")
def stop_words() -> Set[str]:
return STOP_WORDS
@registry.language_data("spacy.de.get_noun_chunks")
def get_noun_chunks() -> Callable:
return noun_chunks
class GermanDefaults(Language.Defaults): class GermanDefaults(Language.Defaults):
@ -46,12 +10,13 @@ class GermanDefaults(Language.Defaults):
prefixes = TOKENIZER_PREFIXES prefixes = TOKENIZER_PREFIXES
suffixes = TOKENIZER_SUFFIXES suffixes = TOKENIZER_SUFFIXES
infixes = TOKENIZER_INFIXES infixes = TOKENIZER_INFIXES
syntax_iterators = SYNTAX_ITERATORS
stop_words = STOP_WORDS
class German(Language): class German(Language):
lang = "de" lang = "de"
Defaults = GermanDefaults Defaults = GermanDefaults
default_config = Config().from_str(DEFAULT_CONFIG)
__all__ = ["German"] __all__ = ["German"]

View File

@ -1,9 +0,0 @@
from typing import Pattern
from .tokenizer_exceptions import URL_MATCH
from ..util import registry
@registry.language_data("spacy.xx.url_match")
def url_match() -> Pattern:
return URL_MATCH

View File

@ -1,69 +1,50 @@
from typing import Set, Dict, Callable, Any from typing import Callable
from thinc.api import Config from thinc.api import Config
from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
from .stop_words import STOP_WORDS from .stop_words import STOP_WORDS
from .lex_attrs import LEX_ATTRS from .lex_attrs import LEX_ATTRS
from .lemmatizer import GreekLemmatizer from .lemmatizer import GreekLemmatizer
from .syntax_iterators import noun_chunks from .syntax_iterators import SYNTAX_ITERATORS
from .punctuation import TOKENIZER_PREFIXES, TOKENIZER_SUFFIXES, TOKENIZER_INFIXES from .punctuation import TOKENIZER_PREFIXES, TOKENIZER_SUFFIXES, TOKENIZER_INFIXES
from ...lookups import load_lookups
from ...language import Language from ...language import Language
from ...util import registry from ...util import registry
DEFAULT_CONFIG = """ DEFAULT_CONFIG = """
[nlp] [nlp]
lang = "el"
stop_words = {"@language_data": "spacy.el.stop_words"}
lex_attr_getters = {"@language_data": "spacy.el.lex_attr_getters"}
get_noun_chunks = {"@language_data": "spacy.el.get_noun_chunks"}
[nlp.lemmatizer] [nlp.lemmatizer]
@lemmatizers = "spacy.GreekLemmatizer.v1" @lemmatizers = "spacy.el.GreekLemmatizer"
[nlp.lemmatizer.data]
@language_data = "spacy-lookups-data"
lang = ${nlp:lang}
tables = ["lemma_index", "lemma_exc", "lemma_rules"]
[nlp.vocab_data]
@language_data = "spacy-lookups-data"
lang = ${nlp:lang}
tables = ["lexeme_norm", "lexeme_prob", "lexeme_settings"]
""" """
@registry.lemmatizers("spacy.GreekLemmatizer.v1") @registry.lemmatizers("spacy.el.GreekLemmatizer")
def create_greek_lemmatizer(data: Dict[str, dict] = {}) -> GreekLemmatizer: def create_lemmatizer() -> Callable[[Language], GreekLemmatizer]:
return GreekLemmatizer(data=data) tables = ["lemma_index", "lemma_exc", "lemma_rules"]
def lemmatizer_factory(nlp: Language) -> GreekLemmatizer:
lookups = load_lookups(lang=nlp.lang, tables=tables)
return GreekLemmatizer(lookups=lookups)
@registry.language_data("spacy.el.get_noun_chunks") return lemmatizer_factory
def get_noun_chunks() -> Callable:
return noun_chunks
@registry.language_data("spacy.el.stop_words")
def stop_words() -> Set[str]:
return STOP_WORDS
@registry.language_data("spacy.el.lex_attr_getters")
def lex_attr_getters() -> Dict[int, Callable[[str], Any]]:
return LEX_ATTRS
class GreekDefaults(Language.Defaults): class GreekDefaults(Language.Defaults):
config = Config().from_str(DEFAULT_CONFIG)
tokenizer_exceptions = TOKENIZER_EXCEPTIONS tokenizer_exceptions = TOKENIZER_EXCEPTIONS
prefixes = TOKENIZER_PREFIXES prefixes = TOKENIZER_PREFIXES
suffixes = TOKENIZER_SUFFIXES suffixes = TOKENIZER_SUFFIXES
infixes = TOKENIZER_INFIXES infixes = TOKENIZER_INFIXES
lex_attr_getters = LEX_ATTRS
stop_words = STOP_WORDS
syntax_iterators = SYNTAX_ITERATORS
class Greek(Language): class Greek(Language):
lang = "el" lang = "el"
Defaults = GreekDefaults Defaults = GreekDefaults
default_config = Config().from_str(DEFAULT_CONFIG)
__all__ = ["Greek"] __all__ = ["Greek"]

View File

@ -1,68 +1,49 @@
from typing import Set, Dict, Callable, Any from typing import Callable
from thinc.api import Config from thinc.api import Config
from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
from .stop_words import STOP_WORDS from .stop_words import STOP_WORDS
from .lex_attrs import LEX_ATTRS from .lex_attrs import LEX_ATTRS
from .syntax_iterators import noun_chunks from .syntax_iterators import SYNTAX_ITERATORS
from .lemmatizer import is_base_form from .lemmatizer import is_base_form
from .punctuation import TOKENIZER_INFIXES from .punctuation import TOKENIZER_INFIXES
from ...language import Language from ...language import Language
from ...lemmatizer import Lemmatizer from ...lemmatizer import Lemmatizer
from ...lookups import load_lookups
from ...util import registry from ...util import registry
DEFAULT_CONFIG = """ DEFAULT_CONFIG = """
[nlp] [nlp]
lang = "en"
stop_words = {"@language_data": "spacy.en.stop_words"}
lex_attr_getters = {"@language_data": "spacy.en.lex_attr_getters"}
get_noun_chunks = {"@language_data": "spacy.en.get_noun_chunks"}
[nlp.lemmatizer] [nlp.lemmatizer]
@lemmatizers = "spacy.EnglishLemmatizer.v1" @lemmatizers = "spacy.en.EnglishLemmatizer"
[nlp.lemmatizer.data]
@language_data = "spacy-lookups-data"
lang = ${nlp:lang}
tables = ["lemma_lookup", "lemma_rules", "lemma_exc", "lemma_index"]
[nlp.vocab_data]
@language_data = "spacy-lookups-data"
lang = ${nlp:lang}
tables = ["lexeme_norm", "lexeme_cluster", "lexeme_prob", "lexeme_settings", "orth_variants"]
""" """
@registry.language_data("spacy.en.stop_words") @registry.lemmatizers("spacy.en.EnglishLemmatizer")
def stop_words() -> Set[str]: def create_lemmatizer() -> Callable[[Language], Lemmatizer]:
return STOP_WORDS tables = ["lemma_lookup", "lemma_rules", "lemma_exc", "lemma_index"]
def lemmatizer_factory(nlp: Language) -> Lemmatizer:
lookups = load_lookups(lang=nlp.lang, tables=tables)
return Lemmatizer(lookups=lookups, is_base_form=is_base_form)
@registry.language_data("spacy.en.lex_attr_getters") return lemmatizer_factory
def lex_attr_getters() -> Dict[int, Callable[[str], Any]]:
return LEX_ATTRS
@registry.lemmatizers("spacy.EnglishLemmatizer.v1")
def create_lemmatizer(data: Dict[str, dict] = {}) -> "Lemmatizer":
return Lemmatizer(data=data, is_base_form=is_base_form)
@registry.language_data("spacy.en.get_noun_chunks")
def get_noun_chunks() -> Callable:
return noun_chunks
class EnglishDefaults(Language.Defaults): class EnglishDefaults(Language.Defaults):
config = Config().from_str(DEFAULT_CONFIG)
tokenizer_exceptions = TOKENIZER_EXCEPTIONS tokenizer_exceptions = TOKENIZER_EXCEPTIONS
infixes = TOKENIZER_INFIXES infixes = TOKENIZER_INFIXES
lex_attr_getters = LEX_ATTRS
syntax_iterators = SYNTAX_ITERATORS
stop_words = STOP_WORDS
class English(Language): class English(Language):
lang = "en" lang = "en"
Defaults = EnglishDefaults Defaults = EnglishDefaults
default_config = Config().from_str(DEFAULT_CONFIG)
__all__ = ["English"] __all__ = ["English"]

View File

@ -1,62 +1,23 @@
from typing import Set, Dict, Callable, Any
from thinc.config import Config
from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
from .stop_words import STOP_WORDS from .stop_words import STOP_WORDS
from .lex_attrs import LEX_ATTRS from .lex_attrs import LEX_ATTRS
from .syntax_iterators import noun_chunks from .syntax_iterators import SYNTAX_ITERATORS
from .punctuation import TOKENIZER_INFIXES, TOKENIZER_SUFFIXES from .punctuation import TOKENIZER_INFIXES, TOKENIZER_SUFFIXES
from ...language import Language from ...language import Language
from ...util import registry
DEFAULT_CONFIG = """
[nlp]
lang = "es"
stop_words = {"@language_data": "spacy.es.stop_words"}
lex_attr_getters = {"@language_data": "spacy.es.lex_attr_getters"}
get_noun_chunks = {"@language_data": "spacy.es.get_noun_chunks"}
[nlp.lemmatizer]
@lemmatizers = "spacy.Lemmatizer.v1"
[nlp.lemmatizer.data]
@language_data = "spacy-lookups-data"
lang = ${nlp:lang}
tables = ["lemma_lookup"]
[nlp.vocab_data]
@language_data = "spacy-lookups-data"
lang = ${nlp:lang}
tables = ["lexeme_cluster", "lexeme_prob", "lexeme_settings"]
"""
@registry.language_data("spacy.es.get_noun_chunks")
def get_noun_chunks() -> Callable:
return noun_chunks
@registry.language_data("spacy.es.stop_words")
def stop_words() -> Set[str]:
return STOP_WORDS
@registry.language_data("spacy.es.lex_attr_getters")
def lex_attr_getters() -> Dict[int, Callable[[str], Any]]:
return LEX_ATTRS
class SpanishDefaults(Language.Defaults): class SpanishDefaults(Language.Defaults):
tokenizer_exceptions = TOKENIZER_EXCEPTIONS tokenizer_exceptions = TOKENIZER_EXCEPTIONS
infixes = TOKENIZER_INFIXES infixes = TOKENIZER_INFIXES
suffixes = TOKENIZER_SUFFIXES suffixes = TOKENIZER_SUFFIXES
lex_attr_getters = LEX_ATTRS
syntax_iterators = SYNTAX_ITERATORS
stop_words = STOP_WORDS
class Spanish(Language): class Spanish(Language):
lang = "es" lang = "es"
Defaults = SpanishDefaults Defaults = SpanishDefaults
default_config = Config().from_str(DEFAULT_CONFIG)
__all__ = ["Spanish"] __all__ = ["Spanish"]

View File

@ -1,26 +1,14 @@
from typing import Set
from thinc.api import Config
from .stop_words import STOP_WORDS from .stop_words import STOP_WORDS
from ...language import Language from ...language import Language
from ...util import registry
DEFAULT_CONFIG = """ class EstonianDefaults(Language.Defaults):
[nlp] stop_words = STOP_WORDS
lang = "et"
stop_words = {"@language_data": "spacy.et.stop_words"}
"""
@registry.language_data("spacy.et.stop_words")
def stop_words() -> Set[str]:
return STOP_WORDS
class Estonian(Language): class Estonian(Language):
lang = "et" lang = "et"
default_config = Config().from_str(DEFAULT_CONFIG) Defaults = EstonianDefaults
__all__ = ["Estonian"] __all__ = ["Estonian"]

View File

@ -1,41 +1,18 @@
from typing import Set, Dict, Callable, Any
from thinc.api import Config
from .stop_words import STOP_WORDS from .stop_words import STOP_WORDS
from .lex_attrs import LEX_ATTRS from .lex_attrs import LEX_ATTRS
from .punctuation import TOKENIZER_SUFFIXES from .punctuation import TOKENIZER_SUFFIXES
from ..tokenizer_exceptions import BASE_EXCEPTIONS
from ...language import Language from ...language import Language
from ...util import registry
DEFAULT_CONFIG = """
[nlp]
lang = "eu"
stop_words = {"@language_data": "spacy.eu.stop_words"}
lex_attr_getters = {"@language_data": "spacy.eu.lex_attr_getters"}
"""
@registry.language_data("spacy.eu.stop_words")
def stop_words() -> Set[str]:
return STOP_WORDS
@registry.language_data("spacy.eu.lex_attr_getters")
def lex_attr_getters() -> Dict[int, Callable[[str], Any]]:
return LEX_ATTRS
class BasqueDefaults(Language.Defaults): class BasqueDefaults(Language.Defaults):
tokenizer_exceptions = BASE_EXCEPTIONS
suffixes = TOKENIZER_SUFFIXES suffixes = TOKENIZER_SUFFIXES
stop_words = STOP_WORDS
lex_attr_getters = LEX_ATTRS
class Basque(Language): class Basque(Language):
lang = "eu" lang = "eu"
Defaults = BasqueDefaults Defaults = BasqueDefaults
default_config = Config().from_str(DEFAULT_CONFIG)
__all__ = ["Basque"] __all__ = ["Basque"]

View File

@ -1,61 +1,23 @@
from typing import Set, Dict, Callable, Any
from thinc.api import Config
from ...language import Language
from ...util import registry
from .stop_words import STOP_WORDS from .stop_words import STOP_WORDS
from .lex_attrs import LEX_ATTRS from .lex_attrs import LEX_ATTRS
from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
from .punctuation import TOKENIZER_SUFFIXES from .punctuation import TOKENIZER_SUFFIXES
from .syntax_iterators import noun_chunks from .syntax_iterators import SYNTAX_ITERATORS
from ...language import Language
DEFAULT_CONFIG = """
[nlp]
lang = "fa"
stop_words = {"@language_data": "spacy.fa.stop_words"}
lex_attr_getters = {"@language_data": "spacy.fa.lex_attr_getters"}
get_noun_chunks = {"@language_data": "spacy.de.get_noun_chunks"}
[nlp.writing_system]
direction = "rtl"
has_case = false
has_letters = true
[nlp.lemmatizer]
@lemmatizers = "spacy.Lemmatizer.v1"
[nlp.lemmatizer.data]
@language_data = "spacy-lookups-data"
lang = ${nlp:lang}
tables = ["lemma_rules", "lemma_index", "lemma_exc"]
"""
@registry.language_data("spacy.fa.stop_words")
def stop_words() -> Set[str]:
return STOP_WORDS
@registry.language_data("spacy.fa.lex_attr_getters")
def lex_attr_getters() -> Dict[int, Callable[[str], Any]]:
return LEX_ATTRS
@registry.language_data("spacy.fa.get_noun_chunks")
def get_noun_chunks() -> Callable:
return noun_chunks
class PersianDefaults(Language.Defaults): class PersianDefaults(Language.Defaults):
tokenizer_exceptions = TOKENIZER_EXCEPTIONS tokenizer_exceptions = TOKENIZER_EXCEPTIONS
suffixes = TOKENIZER_SUFFIXES suffixes = TOKENIZER_SUFFIXES
lex_attr_getters = LEX_ATTRS
syntax_iterators = SYNTAX_ITERATORS
stop_words = STOP_WORDS
writing_system = {"direction": "rtl", "has_case": False, "has_letters": True}
class Persian(Language): class Persian(Language):
lang = "fa" lang = "fa"
Defaults = PersianDefaults Defaults = PersianDefaults
default_config = Config().from_str(DEFAULT_CONFIG)
__all__ = ["Persian"] __all__ = ["Persian"]

View File

@ -1,42 +1,21 @@
from typing import Set, Dict, Callable, Any
from thinc.api import Config
from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
from .stop_words import STOP_WORDS from .stop_words import STOP_WORDS
from .lex_attrs import LEX_ATTRS from .lex_attrs import LEX_ATTRS
from .punctuation import TOKENIZER_INFIXES, TOKENIZER_SUFFIXES from .punctuation import TOKENIZER_INFIXES, TOKENIZER_SUFFIXES
from ...language import Language from ...language import Language
from ...util import registry
DEFAULT_CONFIG = """
[nlp]
lang = "fi"
stop_words = {"@language_data": "spacy.fi.stop_words"}
lex_attr_getters = {"@language_data": "spacy.fi.lex_attr_getters"}
"""
@registry.language_data("spacy.fi.stop_words")
def stop_words() -> Set[str]:
return STOP_WORDS
@registry.language_data("spacy.fi.lex_attr_getters")
def lex_attr_getters() -> Dict[int, Callable[[str], Any]]:
return LEX_ATTRS
class FinnishDefaults(Language.Defaults): class FinnishDefaults(Language.Defaults):
infixes = TOKENIZER_INFIXES infixes = TOKENIZER_INFIXES
suffixes = TOKENIZER_SUFFIXES suffixes = TOKENIZER_SUFFIXES
tokenizer_exceptions = TOKENIZER_EXCEPTIONS tokenizer_exceptions = TOKENIZER_EXCEPTIONS
lex_attr_getters = LEX_ATTRS
stop_words = STOP_WORDS
class Finnish(Language): class Finnish(Language):
lang = "fi" lang = "fi"
Defaults = FinnishDefaults Defaults = FinnishDefaults
default_config = Config().from_str(DEFAULT_CONFIG)
__all__ = ["Finnish"] __all__ = ["Finnish"]

View File

@ -1,4 +1,4 @@
from typing import Set, Dict, Callable, Any, Pattern from typing import Callable
from thinc.api import Config from thinc.api import Config
from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS, TOKEN_MATCH from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS, TOKEN_MATCH
@ -6,69 +6,47 @@ from .punctuation import TOKENIZER_PREFIXES, TOKENIZER_INFIXES
from .punctuation import TOKENIZER_SUFFIXES from .punctuation import TOKENIZER_SUFFIXES
from .stop_words import STOP_WORDS from .stop_words import STOP_WORDS
from .lex_attrs import LEX_ATTRS from .lex_attrs import LEX_ATTRS
from .syntax_iterators import SYNTAX_ITERATORS
from .lemmatizer import FrenchLemmatizer, is_base_form from .lemmatizer import FrenchLemmatizer, is_base_form
from .syntax_iterators import noun_chunks from ...lookups import load_lookups
from ...language import Language from ...language import Language
from ...util import registry from ...util import registry
DEFAULT_CONFIG = """ DEFAULT_CONFIG = """
[nlp] [nlp]
lang = "fr"
stop_words = {"@language_data": "spacy.fr.stop_words"}
lex_attr_getters = {"@language_data": "spacy.fr.lex_attr_getters"}
get_noun_chunks = {"@language_data": "spacy.fr.get_noun_chunks"}
[nlp.tokenizer]
@tokenizers = "spacy.Tokenizer.v1"
token_match = {"@language_data": "spacy.fr.token_match"}
[nlp.lemmatizer] [nlp.lemmatizer]
@lemmatizers = "spacy.FrenchLemmatizer.v1" @lemmatizers = "spacy.fr.FrenchLemmatizer"
[nlp.lemmatizer.data]
@language_data = "spacy-lookups-data"
lang = ${nlp:lang}
tables = ["lemma_rules", "lemma_index", "lemma_exc", "lemma_lookup"]
""" """
@registry.lemmatizers("spacy.FrenchLemmatizer.v1") @registry.lemmatizers("spacy.fr.FrenchLemmatizer")
def create_french_lemmatizer(data: Dict[str, dict] = {}) -> FrenchLemmatizer: def create_lemmatizer() -> Callable[[Language], FrenchLemmatizer]:
return FrenchLemmatizer(data=data, is_base_form=is_base_form) tables = ["lemma_rules", "lemma_index", "lemma_exc", "lemma_lookup"]
def lemmatizer_factory(nlp: Language) -> FrenchLemmatizer:
lookups = load_lookups(lang=nlp.lang, tables=tables)
return FrenchLemmatizer(lookups=lookups, is_base_form=is_base_form)
@registry.language_data("spacy.fr.token_match") return lemmatizer_factory
def token_match() -> Pattern:
return TOKEN_MATCH
@registry.language_data("spacy.fr.stop_words")
def stop_words() -> Set[str]:
return STOP_WORDS
@registry.language_data("spacy.fr.lex_attr_getters")
def lex_attr_getters() -> Dict[int, Callable[[str], Any]]:
return LEX_ATTRS
@registry.language_data("spacy.fr.get_noun_chunks")
def get_noun_chunks() -> Callable:
return noun_chunks
class FrenchDefaults(Language.Defaults): class FrenchDefaults(Language.Defaults):
config = Config().from_str(DEFAULT_CONFIG)
tokenizer_exceptions = TOKENIZER_EXCEPTIONS tokenizer_exceptions = TOKENIZER_EXCEPTIONS
prefixes = TOKENIZER_PREFIXES prefixes = TOKENIZER_PREFIXES
infixes = TOKENIZER_INFIXES infixes = TOKENIZER_INFIXES
suffixes = TOKENIZER_SUFFIXES suffixes = TOKENIZER_SUFFIXES
token_match = TOKEN_MATCH
lex_attr_getters = LEX_ATTRS
syntax_iterators = SYNTAX_ITERATORS
stop_words = STOP_WORDS
class French(Language): class French(Language):
lang = "fr" lang = "fr"
Defaults = FrenchDefaults Defaults = FrenchDefaults
default_config = Config().from_str(DEFAULT_CONFIG)
__all__ = ["French"] __all__ = ["French"]

View File

@ -1,32 +1,16 @@
from typing import Set
from thinc.api import Config
from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
from .stop_words import STOP_WORDS from .stop_words import STOP_WORDS
from ...language import Language from ...language import Language
from ...util import registry
DEFAULT_CONFIG = """
[nlp]
lang = "ga"
stop_words = {"@language_data": "spacy.ga.stop_words"}
"""
@registry.language_data("spacy.ga.stop_words")
def stop_words() -> Set[str]:
return STOP_WORDS
class IrishDefaults(Language.Defaults): class IrishDefaults(Language.Defaults):
tokenizer_exceptions = TOKENIZER_EXCEPTIONS tokenizer_exceptions = TOKENIZER_EXCEPTIONS
stop_words = STOP_WORDS
class Irish(Language): class Irish(Language):
lang = "ga" lang = "ga"
Defaults = IrishDefaults Defaults = IrishDefaults
default_config = Config().from_str(DEFAULT_CONFIG)
__all__ = ["Irish"] __all__ = ["Irish"]

View File

@ -1,26 +1,14 @@
from typing import Set
from thinc.api import Config
from .stop_words import STOP_WORDS from .stop_words import STOP_WORDS
from ...language import Language from ...language import Language
from ...util import registry
DEFAULT_CONFIG = """ class GujaratiDefaults(Language.Defaults):
[nlp] stop_words = STOP_WORDS
lang = "gu"
stop_words = {"@language_data": "spacy.gu.stop_words"}
"""
@registry.language_data("spacy.gu.stop_words")
def stop_words() -> Set[str]:
return STOP_WORDS
class Gujarati(Language): class Gujarati(Language):
lang = "gu" lang = "gu"
default_config = Config().from_str(DEFAULT_CONFIG) Defaults = GujaratiDefaults
__all__ = ["Gujarati"] __all__ = ["Gujarati"]

View File

@ -1,37 +1,15 @@
from typing import Set
from thinc.api import Config
from .stop_words import STOP_WORDS from .stop_words import STOP_WORDS
from ..tokenizer_exceptions import BASE_EXCEPTIONS
from ...language import Language from ...language import Language
from ...util import registry
DEFAULT_CONFIG = """
[nlp]
lang = "he"
stop_words = {"@language_data": "spacy.he.stop_words"}
[nlp.writing_system]
direction = "rtl"
has_case = false
has_letters = true
"""
@registry.language_data("spacy.he.stop_words")
def stop_words() -> Set[str]:
return STOP_WORDS
class HebrewDefaults(Language.Defaults): class HebrewDefaults(Language.Defaults):
tokenizer_exceptions = BASE_EXCEPTIONS stop_words = STOP_WORDS
writing_system = {"direction": "rtl", "has_case": False, "has_letters": True}
class Hebrew(Language): class Hebrew(Language):
lang = "he" lang = "he"
Defaults = HebrewDefaults Defaults = HebrewDefaults
default_config = Config().from_str(DEFAULT_CONFIG)
__all__ = ["Hebrew"] __all__ = ["Hebrew"]

View File

@ -1,33 +1,16 @@
from typing import Set, Dict, Callable, Any
from thinc.api import Config
from .stop_words import STOP_WORDS from .stop_words import STOP_WORDS
from .lex_attrs import LEX_ATTRS from .lex_attrs import LEX_ATTRS
from ...language import Language from ...language import Language
from ...util import registry
DEFAULT_CONFIG = """ class HindiDefaults(Language.Defaults):
[nlp] stop_words = STOP_WORDS
lang = "hi" lex_attr_getters = LEX_ATTRS
stop_words = {"@language_data": "spacy.hi.stop_words"}
lex_attr_getters = {"@language_data": "spacy.hi.lex_attr_getters"}
"""
@registry.language_data("spacy.hi.stop_words")
def stop_words() -> Set[str]:
return STOP_WORDS
@registry.language_data("spacy.hi.lex_attr_getters")
def lex_attr_getters() -> Dict[int, Callable[[str], Any]]:
return LEX_ATTRS
class Hindi(Language): class Hindi(Language):
lang = "hi" lang = "hi"
default_config = Config().from_str(DEFAULT_CONFIG) Defaults = HindiDefaults
__all__ = ["Hindi"] __all__ = ["Hindi"]

View File

@ -1,40 +1,14 @@
from typing import Set
from thinc.api import Config
from .stop_words import STOP_WORDS from .stop_words import STOP_WORDS
from ..tokenizer_exceptions import BASE_EXCEPTIONS
from ...language import Language from ...language import Language
from ...util import registry
DEFAULT_CONFIG = """
[nlp]
lang = "hr"
stop_words = {"@language_data": "spacy.hr.stop_words"}
[nlp.lemmatizer]
@lemmatizers = "spacy.Lemmatizer.v1"
[nlp.lemmatizer.data]
@language_data = "spacy-lookups-data"
lang = ${nlp:lang}
tables = ["lemma_lookup"]
"""
@registry.language_data("spacy.hr.stop_words")
def stop_words() -> Set[str]:
return STOP_WORDS
class CroatianDefaults(Language.Defaults): class CroatianDefaults(Language.Defaults):
tokenizer_exceptions = BASE_EXCEPTIONS stop_words = STOP_WORDS
class Croatian(Language): class Croatian(Language):
lang = "hr" lang = "hr"
Defaults = CroatianDefaults Defaults = CroatianDefaults
default_config = Config().from_str(DEFAULT_CONFIG)
__all__ = ["Croatian"] __all__ = ["Croatian"]

View File

@ -1,40 +1,7 @@
from typing import Set, Pattern
from thinc.api import Config
from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS, TOKEN_MATCH from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS, TOKEN_MATCH
from .punctuation import TOKENIZER_PREFIXES, TOKENIZER_SUFFIXES, TOKENIZER_INFIXES from .punctuation import TOKENIZER_PREFIXES, TOKENIZER_SUFFIXES, TOKENIZER_INFIXES
from .stop_words import STOP_WORDS from .stop_words import STOP_WORDS
from ...language import Language from ...language import Language
from ...util import registry
DEFAULT_CONFIG = """
[nlp]
lang = "hu"
stop_words = {"@language_data": "spacy.hu.stop_words"}
[nlp.tokenizer]
@tokenizers = "spacy.Tokenizer.v1"
token_match = {"@language_data": "spacy.hu.token_match"}
[nlp.lemmatizer]
@lemmatizers = "spacy.Lemmatizer.v1"
[nlp.lemmatizer.data]
@language_data = "spacy-lookups-data"
lang = ${nlp:lang}
tables = ["lemma_lookup"]
"""
@registry.language_data("spacy.hu.stop_words")
def stop_words() -> Set[str]:
return STOP_WORDS
@registry.language_data("spacy.hu.token_match")
def token_match() -> Pattern:
return TOKEN_MATCH
class HungarianDefaults(Language.Defaults): class HungarianDefaults(Language.Defaults):
@ -42,12 +9,13 @@ class HungarianDefaults(Language.Defaults):
prefixes = TOKENIZER_PREFIXES prefixes = TOKENIZER_PREFIXES
suffixes = TOKENIZER_SUFFIXES suffixes = TOKENIZER_SUFFIXES
infixes = TOKENIZER_INFIXES infixes = TOKENIZER_INFIXES
token_match = TOKEN_MATCH
stop_words = STOP_WORDS
class Hungarian(Language): class Hungarian(Language):
lang = "hu" lang = "hu"
Defaults = HungarianDefaults Defaults = HungarianDefaults
default_config = Config().from_str(DEFAULT_CONFIG)
__all__ = ["Hungarian"] __all__ = ["Hungarian"]

View File

@ -1,33 +1,16 @@
from typing import Set, Dict, Callable, Any
from thinc.api import Config
from .stop_words import STOP_WORDS from .stop_words import STOP_WORDS
from .lex_attrs import LEX_ATTRS from .lex_attrs import LEX_ATTRS
from ...language import Language from ...language import Language
from ...util import registry
DEFAULT_CONFIG = """ class ArmenianDefaults(Language.Defaults):
[nlp] lex_attr_getters = LEX_ATTRS
lang = "hy" stop_words = STOP_WORDS
stop_words = {"@language_data": "spacy.hy.stop_words"}
lex_attr_getters = {"@language_data": "spacy.hy.lex_attr_getters"}
"""
@registry.language_data("spacy.hy.stop_words")
def stop_words() -> Set[str]:
return STOP_WORDS
@registry.language_data("spacy.hy.lex_attr_getters")
def lex_attr_getters() -> Dict[int, Callable[[str], Any]]:
return LEX_ATTRS
class Armenian(Language): class Armenian(Language):
lang = "hy" lang = "hy"
default_config = Config().from_str(DEFAULT_CONFIG) Defaults = ArmenianDefaults
__all__ = ["Armenian"] __all__ = ["Armenian"]

View File

@ -1,50 +1,9 @@
from typing import Set, Dict, Callable, Any
from thinc.config import Config
from .stop_words import STOP_WORDS from .stop_words import STOP_WORDS
from .punctuation import TOKENIZER_SUFFIXES, TOKENIZER_PREFIXES, TOKENIZER_INFIXES from .punctuation import TOKENIZER_SUFFIXES, TOKENIZER_PREFIXES, TOKENIZER_INFIXES
from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
from .lex_attrs import LEX_ATTRS from .lex_attrs import LEX_ATTRS
from .syntax_iterators import noun_chunks from .syntax_iterators import SYNTAX_ITERATORS
from ...language import Language from ...language import Language
from ...util import registry
DEFAULT_CONFIG = """
[nlp]
lang = "id"
stop_words = {"@language_data": "spacy.id.stop_words"}
lex_attr_getters = {"@language_data": "spacy.id.lex_attr_getters"}
get_noun_chunks = {"@language_data": "spacy.id.get_noun_chunks"}
[nlp.lemmatizer]
@lemmatizers = "spacy.Lemmatizer.v1"
[nlp.lemmatizer.data]
@language_data = "spacy-lookups-data"
lang = ${nlp:lang}
tables = ["lemma_lookup"]
[nlp.vocab_data]
@language_data = "spacy-lookups-data"
lang = ${nlp:lang}
tables = ["lexeme_norm"]
"""
@registry.language_data("spacy.id.stop_words")
def stop_words() -> Set[str]:
return STOP_WORDS
@registry.language_data("spacy.id.lex_attr_getters")
def lex_attr_getters() -> Dict[int, Callable[[str], Any]]:
return LEX_ATTRS
@registry.language_data("spacy.id.get_noun_chunks")
def get_noun_chunks() -> Callable:
return noun_chunks
class IndonesianDefaults(Language.Defaults): class IndonesianDefaults(Language.Defaults):
@ -52,12 +11,14 @@ class IndonesianDefaults(Language.Defaults):
prefixes = TOKENIZER_PREFIXES prefixes = TOKENIZER_PREFIXES
suffixes = TOKENIZER_SUFFIXES suffixes = TOKENIZER_SUFFIXES
infixes = TOKENIZER_INFIXES infixes = TOKENIZER_INFIXES
syntax_iterators = SYNTAX_ITERATORS
lex_attr_getters = LEX_ATTRS
stop_words = STOP_WORDS
class Indonesian(Language): class Indonesian(Language):
lang = "id" lang = "id"
Defaults = IndonesianDefaults Defaults = IndonesianDefaults
default_config = Config().from_str(DEFAULT_CONFIG)
__all__ = ["Indonesian"] __all__ = ["Indonesian"]

View File

@ -1,26 +1,14 @@
from typing import Set
from thinc.api import Config
from .stop_words import STOP_WORDS from .stop_words import STOP_WORDS
from ...language import Language from ...language import Language
from ...util import registry
DEFAULT_CONFIG = """ class IcelandicDefaults(Language.Defaults):
[nlp] stop_words = STOP_WORDS
lang = "is"
stop_words = {"@language_data": "spacy.is.stop_words"}
"""
@registry.language_data("spacy.is.stop_words")
def stop_words() -> Set[str]:
return STOP_WORDS
class Icelandic(Language): class Icelandic(Language):
lang = "is" lang = "is"
default_config = Config().from_str(DEFAULT_CONFIG) Defaults = IcelandicDefaults
__all__ = ["Icelandic"] __all__ = ["Icelandic"]

View File

@ -1,31 +1,7 @@
from typing import Set
from thinc.api import Config
from .stop_words import STOP_WORDS from .stop_words import STOP_WORDS
from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
from .punctuation import TOKENIZER_PREFIXES, TOKENIZER_INFIXES from .punctuation import TOKENIZER_PREFIXES, TOKENIZER_INFIXES
from ...language import Language from ...language import Language
from ...util import registry
DEFAULT_CONFIG = """
[nlp]
lang = "it"
stop_words = {"@language_data": "spacy.it.stop_words"}
[nlp.lemmatizer]
@lemmatizers = "spacy.Lemmatizer.v1"
[nlp.lemmatizer.data]
@language_data = "spacy-lookups-data"
lang = ${nlp:lang}
tables = ["lemma_lookup"]
"""
@registry.language_data("spacy.it.stop_words")
def stop_words() -> Set[str]:
return STOP_WORDS
class ItalianDefaults(Language.Defaults): class ItalianDefaults(Language.Defaults):
@ -38,7 +14,6 @@ class ItalianDefaults(Language.Defaults):
class Italian(Language): class Italian(Language):
lang = "it" lang = "it"
Defaults = ItalianDefaults Defaults = ItalianDefaults
default_config = Config().from_str(DEFAULT_CONFIG)
__all__ = ["Italian"] __all__ = ["Italian"]

View File

@ -1,11 +1,11 @@
from typing import Optional, Union, Dict, Any, Set, Callable from typing import Optional, Union, Dict, Any
from pathlib import Path from pathlib import Path
import srsly import srsly
from collections import namedtuple from collections import namedtuple
from thinc.api import Config from thinc.api import Config
from .stop_words import STOP_WORDS from .stop_words import STOP_WORDS
from .syntax_iterators import noun_chunks from .syntax_iterators import SYNTAX_ITERATORS
from .tag_map import TAG_MAP from .tag_map import TAG_MAP
from .tag_orth_map import TAG_ORTH_MAP from .tag_orth_map import TAG_ORTH_MAP
from .tag_bigram_map import TAG_BIGRAM_MAP from .tag_bigram_map import TAG_BIGRAM_MAP
@ -20,33 +20,15 @@ from ... import util
DEFAULT_CONFIG = """ DEFAULT_CONFIG = """
[nlp] [nlp]
lang = "ja"
stop_words = {"@language_data": "spacy.ja.stop_words"}
get_noun_chunks = {"@language_data": "spacy.ja.get_noun_chunks"}
[nlp.tokenizer] [nlp.tokenizer]
@tokenizers = "spacy.JapaneseTokenizer.v1" @tokenizers = "spacy.ja.JapaneseTokenizer"
split_mode = null split_mode = null
[nlp.writing_system]
direction = "ltr"
has_case = false
has_letters = false
""" """
@registry.language_data("spacy.ja.stop_words") @registry.tokenizers("spacy.ja.JapaneseTokenizer")
def stop_words() -> Set[str]: def create_tokenizer(split_mode: Optional[str] = None):
return STOP_WORDS
@registry.language_data("spacy.ja.get_noun_chunks")
def get_noun_chunks() -> Callable:
return noun_chunks
@registry.tokenizers("spacy.JapaneseTokenizer.v1")
def create_japanese_tokenizer(split_mode: Optional[str] = None):
def japanese_tokenizer_factory(nlp): def japanese_tokenizer_factory(nlp):
return JapaneseTokenizer(nlp, split_mode=split_mode) return JapaneseTokenizer(nlp, split_mode=split_mode)
@ -179,9 +161,16 @@ class JapaneseTokenizer(DummyTokenizer):
return self return self
class JapaneseDefaults(Language.Defaults):
config = Config().from_str(DEFAULT_CONFIG)
stop_words = STOP_WORDS
syntax_iterators = SYNTAX_ITERATORS
writing_system = {"direction": "ltr", "has_case": False, "has_letters": False}
class Japanese(Language): class Japanese(Language):
lang = "ja" lang = "ja"
default_config = Config().from_str(DEFAULT_CONFIG) Defaults = JapaneseDefaults
# Hold the attributes we need with convenient names # Hold the attributes we need with convenient names

View File

@ -1,26 +1,14 @@
from typing import Set
from thinc.api import Config
from .stop_words import STOP_WORDS from .stop_words import STOP_WORDS
from ...language import Language from ...language import Language
from ...util import registry
DEFAULT_CONFIG = """ class KannadaDefaults(Language.Defaults):
[nlp] stop_words = STOP_WORDS
lang = "kn"
stop_words = {"@language_data": "spacy.kn.stop_words"}
"""
@registry.language_data("spacy.kn.stop_words")
def stop_words() -> Set[str]:
return STOP_WORDS
class Kannada(Language): class Kannada(Language):
lang = "kn" lang = "kn"
default_config = Config().from_str(DEFAULT_CONFIG) Defaults = KannadaDefaults
__all__ = ["Kannada"] __all__ = ["Kannada"]

View File

@ -1,4 +1,4 @@
from typing import Set, Optional, Any, Dict from typing import Optional, Any, Dict
from thinc.api import Config from thinc.api import Config
from .stop_words import STOP_WORDS from .stop_words import STOP_WORDS
@ -11,26 +11,14 @@ from ...util import DummyTokenizer, registry
DEFAULT_CONFIG = """ DEFAULT_CONFIG = """
[nlp] [nlp]
lang = "ko"
stop_words = {"@language_data": "spacy.ko.stop_words"}
[nlp.tokenizer] [nlp.tokenizer]
@tokenizers = "spacy.KoreanTokenizer.v1" @tokenizers = "spacy.ko.KoreanTokenizer"
[nlp.writing_system]
direction = "ltr"
has_case = false
has_letters = false
""" """
@registry.language_data("spacy.ko.stop_words") @registry.tokenizers("spacy.ko.KoreanTokenizer")
def stop_words() -> Set[str]: def create_tokenizer():
return STOP_WORDS
@registry.tokenizers("spacy.KoreanTokenizer.v1")
def create_korean_tokenizer():
def korean_tokenizer_factory(nlp): def korean_tokenizer_factory(nlp):
return KoreanTokenizer(nlp) return KoreanTokenizer(nlp)
@ -74,9 +62,15 @@ class KoreanTokenizer(DummyTokenizer):
yield {"surface": surface, "lemma": lemma, "tag": tag} yield {"surface": surface, "lemma": lemma, "tag": tag}
class KoreanDefaults(Language.Defaults):
config = Config().from_str(DEFAULT_CONFIG)
stop_words = STOP_WORDS
writing_system = {"direction": "ltr", "has_case": False, "has_letters": False}
class Korean(Language): class Korean(Language):
lang = "ko" lang = "ko"
default_config = Config().from_str(DEFAULT_CONFIG) Defaults = KoreanDefaults
def try_mecab_import() -> None: def try_mecab_import() -> None:

View File

@ -1,54 +1,20 @@
from typing import Set, Dict, Callable, Any
from thinc.api import Config
from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
from .punctuation import TOKENIZER_INFIXES from .punctuation import TOKENIZER_INFIXES
from .lex_attrs import LEX_ATTRS from .lex_attrs import LEX_ATTRS
from .stop_words import STOP_WORDS from .stop_words import STOP_WORDS
from ...language import Language from ...language import Language
from ...util import registry
DEFAULT_CONFIG = """
[nlp]
lang = "lb"
stop_words = {"@language_data": "spacy.lb.stop_words"}
lex_attr_getters = {"@language_data": "spacy.lb.lex_attr_getters"}
[nlp.lemmatizer]
@lemmatizers = "spacy.Lemmatizer.v1"
[nlp.lemmatizer.data]
@language_data = "spacy-lookups-data"
lang = ${nlp:lang}
tables = ["lemma_lookup"]
[nlp.vocab_data]
@language_data = "spacy-lookups-data"
lang = ${nlp:lang}
tables = ["lexeme_norm"]
"""
@registry.language_data("spacy.lb.stop_words")
def stop_words() -> Set[str]:
return STOP_WORDS
@registry.language_data("spacy.lb.lex_attr_getters")
def lex_attr_getters() -> Dict[int, Callable[[str], Any]]:
return LEX_ATTRS
class LuxembourgishDefaults(Language.Defaults): class LuxembourgishDefaults(Language.Defaults):
tokenizer_exceptions = TOKENIZER_EXCEPTIONS tokenizer_exceptions = TOKENIZER_EXCEPTIONS
infixes = TOKENIZER_INFIXES infixes = TOKENIZER_INFIXES
lex_attr_getters = LEX_ATTRS
stop_words = STOP_WORDS
class Luxembourgish(Language): class Luxembourgish(Language):
lang = "lb" lang = "lb"
Defaults = LuxembourgishDefaults Defaults = LuxembourgishDefaults
default_config = Config().from_str(DEFAULT_CONFIG)
__all__ = ["Luxembourgish"] __all__ = ["Luxembourgish"]

View File

@ -1,34 +1,18 @@
from typing import Set
from thinc.api import Config
from .stop_words import STOP_WORDS from .stop_words import STOP_WORDS
from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
from .punctuation import TOKENIZER_INFIXES from .punctuation import TOKENIZER_INFIXES
from ...language import Language from ...language import Language
from ...util import registry
DEFAULT_CONFIG = """
[nlp]
lang = "lij"
stop_words = {"@language_data": "spacy.lij.stop_words"}
"""
@registry.language_data("spacy.lij.stop_words")
def stop_words() -> Set[str]:
return STOP_WORDS
class LigurianDefaults(Language.Defaults): class LigurianDefaults(Language.Defaults):
tokenizer_exceptions = TOKENIZER_EXCEPTIONS tokenizer_exceptions = TOKENIZER_EXCEPTIONS
infixes = TOKENIZER_INFIXES infixes = TOKENIZER_INFIXES
stop_words = STOP_WORDS
class Ligurian(Language): class Ligurian(Language):
lang = "lij" lang = "lij"
Defaults = LigurianDefaults Defaults = LigurianDefaults
default_config = Config().from_str(DEFAULT_CONFIG)
__all__ = ["Ligurian"] __all__ = ["Ligurian"]

View File

@ -1,50 +1,21 @@
from typing import Set, Dict, Callable, Any
from thinc.api import Config
from .punctuation import TOKENIZER_INFIXES, TOKENIZER_SUFFIXES from .punctuation import TOKENIZER_INFIXES, TOKENIZER_SUFFIXES
from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
from .stop_words import STOP_WORDS from .stop_words import STOP_WORDS
from .lex_attrs import LEX_ATTRS from .lex_attrs import LEX_ATTRS
from ...language import Language from ...language import Language
from ...util import registry
DEFAULT_CONFIG = """
[nlp]
lang = "lt"
stop_words = {"@language_data": "spacy.lt.stop_words"}
lex_attr_getters = {"@language_data": "spacy.lt.lex_attr_getters"}
[nlp.lemmatizer]
@lemmatizers = "spacy.Lemmatizer.v1"
[nlp.lemmatizer.data]
@language_data = "spacy-lookups-data"
lang = ${nlp:lang}
tables = ["lemma_lookup"]
"""
@registry.language_data("spacy.lt.stop_words")
def stop_words() -> Set[str]:
return STOP_WORDS
@registry.language_data("spacy.lt.lex_attr_getters")
def lex_attr_getters() -> Dict[int, Callable[[str], Any]]:
return LEX_ATTRS
class LithuanianDefaults(Language.Defaults): class LithuanianDefaults(Language.Defaults):
infixes = TOKENIZER_INFIXES infixes = TOKENIZER_INFIXES
suffixes = TOKENIZER_SUFFIXES suffixes = TOKENIZER_SUFFIXES
tokenizer_exceptions = TOKENIZER_EXCEPTIONS tokenizer_exceptions = TOKENIZER_EXCEPTIONS
stop_words = STOP_WORDS
lex_attr_getters = LEX_ATTRS
class Lithuanian(Language): class Lithuanian(Language):
lang = "lt" lang = "lt"
Defaults = LithuanianDefaults Defaults = LithuanianDefaults
default_config = Config().from_str(DEFAULT_CONFIG)
__all__ = ["Lithuanian"] __all__ = ["Lithuanian"]

View File

@ -1,26 +1,14 @@
from typing import Set
from thinc.api import Config
from .stop_words import STOP_WORDS from .stop_words import STOP_WORDS
from ...language import Language from ...language import Language
from ...util import registry
DEFAULT_CONFIG = """ class LatvianDefaults(Language.Defaults):
[nlp] stop_words = STOP_WORDS
lang = "lv"
stop_words = {"@language_data": "spacy.lv.stop_words"}
"""
@registry.language_data("spacy.lv.stop_words")
def stop_words() -> Set[str]:
return STOP_WORDS
class Latvian(Language): class Latvian(Language):
lang = "lv" lang = "lv"
default_config = Config().from_str(DEFAULT_CONFIG) Defaults = LatvianDefaults
__all__ = ["Latvian"] __all__ = ["Latvian"]

View File

@ -1,26 +1,14 @@
from typing import Set
from thinc.api import Config
from .stop_words import STOP_WORDS from .stop_words import STOP_WORDS
from ...language import Language from ...language import Language
from ...util import registry
DEFAULT_CONFIG = """ class MalayalamDefaults(Language.Defaults):
[nlp] stop_words = STOP_WORDS
lang = "ml"
stop_words = {"@language_data": "spacy.ml.stop_words"}
"""
@registry.language_data("spacy.ml.stop_words")
def stop_words() -> Set[str]:
return STOP_WORDS
class Malayalam(Language): class Malayalam(Language):
lang = "ml" lang = "ml"
default_config = Config().from_str(DEFAULT_CONFIG) Defaults = MalayalamDefaults
__all__ = ["Malayalam"] __all__ = ["Malayalam"]

View File

@ -1,26 +1,14 @@
from typing import Set
from thinc.api import Config
from .stop_words import STOP_WORDS from .stop_words import STOP_WORDS
from ...language import Language from ...language import Language
from ...util import registry
DEFAULT_CONFIG = """ class MarathiDefaults(Language.Defaults):
[nlp] stop_words = STOP_WORDS
lang = "af"
stop_words = {"@language_data": "spacy.mr.stop_words"}
"""
@registry.language_data("spacy.mr.stop_words")
def stop_words() -> Set[str]:
return STOP_WORDS
class Marathi(Language): class Marathi(Language):
lang = "mr" lang = "mr"
default_config = Config().from_str(DEFAULT_CONFIG) Defaults = MarathiDefaults
__all__ = ["Marathi"] __all__ = ["Marathi"]

View File

@ -1,39 +1,9 @@
from typing import Set, Callable
from thinc.api import Config
from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
from .punctuation import TOKENIZER_PREFIXES, TOKENIZER_INFIXES from .punctuation import TOKENIZER_PREFIXES, TOKENIZER_INFIXES
from .punctuation import TOKENIZER_SUFFIXES from .punctuation import TOKENIZER_SUFFIXES
from .stop_words import STOP_WORDS from .stop_words import STOP_WORDS
from .syntax_iterators import noun_chunks from .syntax_iterators import SYNTAX_ITERATORS
from ...language import Language from ...language import Language
from ...util import registry
DEFAULT_CONFIG = """
[nlp]
lang = "nb"
stop_words = {"@language_data": "spacy.nb.stop_words"}
get_noun_chunks = {"@language_data": "spacy.nb.get_noun_chunks"}
[nlp.lemmatizer]
@lemmatizers = "spacy.Lemmatizer.v1"
[nlp.lemmatizer.data]
@language_data = "spacy-lookups-data"
lang = ${nlp:lang}
tables = ["lemma_lookup", "lemma_rules", "lemma_exc"]
"""
@registry.language_data("spacy.nb.stop_words")
def stop_words() -> Set[str]:
return STOP_WORDS
@registry.language_data("spacy.nb.get_noun_chunks")
def get_noun_chunks() -> Callable:
return noun_chunks
class NorwegianDefaults(Language.Defaults): class NorwegianDefaults(Language.Defaults):
@ -41,12 +11,13 @@ class NorwegianDefaults(Language.Defaults):
prefixes = TOKENIZER_PREFIXES prefixes = TOKENIZER_PREFIXES
infixes = TOKENIZER_INFIXES infixes = TOKENIZER_INFIXES
suffixes = TOKENIZER_SUFFIXES suffixes = TOKENIZER_SUFFIXES
syntax_iterators = SYNTAX_ITERATORS
stop_words = STOP_WORDS
class Norwegian(Language): class Norwegian(Language):
lang = "nb" lang = "nb"
Defaults = NorwegianDefaults Defaults = NorwegianDefaults
default_config = Config().from_str(DEFAULT_CONFIG)
__all__ = ["Norwegian"] __all__ = ["Norwegian"]

View File

@ -1,33 +1,16 @@
from typing import Set, Dict, Callable, Any
from thinc.api import Config
from .stop_words import STOP_WORDS from .stop_words import STOP_WORDS
from .lex_attrs import LEX_ATTRS from .lex_attrs import LEX_ATTRS
from ...language import Language from ...language import Language
from ...util import registry
DEFAULT_CONFIG = """ class NepaliDefaults(Language.Defaults):
[nlp] stop_words = STOP_WORDS
lang = "ne" lex_attr_getters = LEX_ATTRS
stop_words = {"@language_data": "spacy.ne.stop_words"}
lex_attr_getters = {"@language_data": "spacy.ne.lex_attr_getters"}
"""
@registry.language_data("spacy.ne.stop_words")
def stop_words() -> Set[str]:
return STOP_WORDS
@registry.language_data("spacy.ne.lex_attr_getters")
def lex_attr_getters() -> Dict[int, Callable[[str], Any]]:
return LEX_ATTRS
class Nepali(Language): class Nepali(Language):
lang = "ne" lang = "ne"
default_config = Config().from_str(DEFAULT_CONFIG) Defaults = NepaliDefaults
__all__ = ["Nepali"] __all__ = ["Nepali"]

View File

@ -1,4 +1,4 @@
from typing import Set, Dict, Callable, Any from typing import Callable
from thinc.api import Config from thinc.api import Config
from .stop_words import STOP_WORDS from .stop_words import STOP_WORDS
@ -7,52 +7,43 @@ from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
from .punctuation import TOKENIZER_PREFIXES, TOKENIZER_INFIXES from .punctuation import TOKENIZER_PREFIXES, TOKENIZER_INFIXES
from .punctuation import TOKENIZER_SUFFIXES from .punctuation import TOKENIZER_SUFFIXES
from .lemmatizer import DutchLemmatizer from .lemmatizer import DutchLemmatizer
from ...lookups import load_lookups
from ...language import Language from ...language import Language
from ...util import registry from ...util import registry
DEFAULT_CONFIG = """ DEFAULT_CONFIG = """
[nlp] [nlp]
lang = "nl"
stop_words = {"@language_data": "spacy.nl.stop_words"}
lex_attr_getters = {"@language_data": "spacy.nl.lex_attr_getters"}
[nlp.lemmatizer] [nlp.lemmatizer]
@lemmatizers = "spacy.DutchLemmatizer.v1" @lemmatizers = "spacy.nl.DutchLemmatizer"
[nlp.lemmatizer.data]
@language_data = "spacy-lookups-data"
lang = ${nlp:lang}
tables = ["lemma_rules", "lemma_index", "lemma_exc", "lemma_lookup"]
""" """
@registry.language_data("spacy.nl.stop_words") @registry.lemmatizers("spacy.nl.DutchLemmatizer")
def stop_words() -> Set[str]: def create_lemmatizer() -> Callable[[Language], DutchLemmatizer]:
return STOP_WORDS tables = ["lemma_rules", "lemma_index", "lemma_exc", "lemma_lookup"]
def lemmatizer_factory(nlp: Language) -> DutchLemmatizer:
lookups = load_lookups(lang=nlp.lang, tables=tables)
return DutchLemmatizer(lookups=lookups)
@registry.language_data("spacy.nl.lex_attr_getters") return lemmatizer_factory
def lex_attr_getters() -> Dict[int, Callable[[str], Any]]:
return LEX_ATTRS
@registry.lemmatizers("spacy.DutchLemmatizer.v1")
def create_dutch_lemmatizer(data: Dict[str, dict] = {}) -> DutchLemmatizer:
return DutchLemmatizer(data=data)
class DutchDefaults(Language.Defaults): class DutchDefaults(Language.Defaults):
config = Config().from_str(DEFAULT_CONFIG)
tokenizer_exceptions = TOKENIZER_EXCEPTIONS tokenizer_exceptions = TOKENIZER_EXCEPTIONS
prefixes = TOKENIZER_PREFIXES prefixes = TOKENIZER_PREFIXES
infixes = TOKENIZER_INFIXES infixes = TOKENIZER_INFIXES
suffixes = TOKENIZER_SUFFIXES suffixes = TOKENIZER_SUFFIXES
lex_attr_getters = LEX_ATTRS
stop_words = STOP_WORDS
class Dutch(Language): class Dutch(Language):
lang = "nl" lang = "nl"
Defaults = DutchDefaults Defaults = DutchDefaults
default_config = Config().from_str(DEFAULT_CONFIG)
__all__ = ["Dutch"] __all__ = ["Dutch"]

View File

@ -1,4 +1,4 @@
from typing import Set, Dict, Callable, Any from typing import Callable
from thinc.api import Config from thinc.api import Config
from .punctuation import TOKENIZER_PREFIXES, TOKENIZER_INFIXES from .punctuation import TOKENIZER_PREFIXES, TOKENIZER_INFIXES
@ -7,55 +7,53 @@ from .stop_words import STOP_WORDS
from .lex_attrs import LEX_ATTRS from .lex_attrs import LEX_ATTRS
from .lemmatizer import PolishLemmatizer from .lemmatizer import PolishLemmatizer
from ..tokenizer_exceptions import BASE_EXCEPTIONS from ..tokenizer_exceptions import BASE_EXCEPTIONS
from ...lookups import load_lookups
from ...language import Language from ...language import Language
from ...util import registry from ...util import registry
DEFAULT_CONFIG = """ DEFAULT_CONFIG = """
[nlp] [nlp]
lang = "pl"
stop_words = {"@language_data": "spacy.pl.stop_words"}
lex_attr_getters = {"@language_data": "spacy.pl.lex_attr_getters"}
[nlp.lemmatizer] [nlp.lemmatizer]
@lemmatizers = "spacy.PolishLemmatizer.v1" @lemmatizers = "spacy.pl.PolishLemmatizer"
[nlp.lemmatizer.data]
@language_data = "spacy-lookups-data"
lang = ${nlp:lang}
tables = ["lemma_lookup_adj", "lemma_lookup_adp", "lemma_lookup_adv", "lemma_lookup_aux", "lemma_lookup_noun", "lemma_lookup_num", "lemma_lookup_part", "lemma_lookup_pron", "lemma_lookup_verb"]
""" """
TOKENIZER_EXCEPTIONS = {
@registry.language_data("spacy.pl.stop_words") exc: val for exc, val in BASE_EXCEPTIONS.items() if not exc.endswith(".")
def stop_words() -> Set[str]: }
return STOP_WORDS
@registry.language_data("spacy.pl.lex_attr_getters") @registry.lemmatizers("spacy.pl.PolishLemmatizer")
def lex_attr_getters() -> Dict[int, Callable[[str], Any]]: def create_lemmatizer() -> Callable[[Language], PolishLemmatizer]:
return LEX_ATTRS # fmt: off
tables = [
"lemma_lookup_adj", "lemma_lookup_adp", "lemma_lookup_adv",
"lemma_lookup_aux", "lemma_lookup_noun", "lemma_lookup_num",
"lemma_lookup_part", "lemma_lookup_pron", "lemma_lookup_verb"
]
# fmt: on
def lemmatizer_factory(nlp: Language) -> PolishLemmatizer:
lookups = load_lookups(lang=nlp.lang, tables=tables)
return PolishLemmatizer(lookups=lookups)
@registry.lemmatizers("spacy.PolishLemmatizer.v1") return lemmatizer_factory
def create_polish_lemmatizer(data: Dict[str, dict] = {}) -> PolishLemmatizer:
return PolishLemmatizer(data=data)
class PolishDefaults(Language.Defaults): class PolishDefaults(Language.Defaults):
mod_base_exceptions = { config = Config().from_str(DEFAULT_CONFIG)
exc: val for exc, val in BASE_EXCEPTIONS.items() if not exc.endswith(".") tokenizer_exceptions = TOKENIZER_EXCEPTIONS
}
tokenizer_exceptions = mod_base_exceptions
prefixes = TOKENIZER_PREFIXES prefixes = TOKENIZER_PREFIXES
infixes = TOKENIZER_INFIXES infixes = TOKENIZER_INFIXES
suffixes = TOKENIZER_SUFFIXES suffixes = TOKENIZER_SUFFIXES
lex_attr_getters = LEX_ATTRS
stop_words = STOP_WORDS
class Polish(Language): class Polish(Language):
lang = "pl" lang = "pl"
Defaults = PolishDefaults Defaults = PolishDefaults
default_config = Config().from_str(DEFAULT_CONFIG)
__all__ = ["Polish"] __all__ = ["Polish"]

View File

@ -1,50 +1,21 @@
from typing import Set, Dict, Callable, Any
from thinc.api import Config
from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
from .stop_words import STOP_WORDS from .stop_words import STOP_WORDS
from .lex_attrs import LEX_ATTRS from .lex_attrs import LEX_ATTRS
from .punctuation import TOKENIZER_INFIXES, TOKENIZER_PREFIXES from .punctuation import TOKENIZER_INFIXES, TOKENIZER_PREFIXES
from ...language import Language from ...language import Language
from ...util import registry
DEFAULT_CONFIG = """
[nlp]
lang = "pt"
stop_words = {"@language_data": "spacy.pt.stop_words"}
lex_attr_getters = {"@language_data": "spacy.pt.lex_attr_getters"}
[nlp.lemmatizer]
@lemmatizers = "spacy.Lemmatizer.v1"
[nlp.lemmatizer.data]
@language_data = "spacy-lookups-data"
lang = ${nlp:lang}
tables = ["lemma_lookup"]
"""
@registry.language_data("spacy.pt.stop_words")
def stop_words() -> Set[str]:
return STOP_WORDS
@registry.language_data("spacy.pt.lex_attr_getters")
def lex_attr_getters() -> Dict[int, Callable[[str], Any]]:
return LEX_ATTRS
class PortugueseDefaults(Language.Defaults): class PortugueseDefaults(Language.Defaults):
tokenizer_exceptions = TOKENIZER_EXCEPTIONS tokenizer_exceptions = TOKENIZER_EXCEPTIONS
infixes = TOKENIZER_INFIXES infixes = TOKENIZER_INFIXES
prefixes = TOKENIZER_PREFIXES prefixes = TOKENIZER_PREFIXES
lex_attr_getters = LEX_ATTRS
stop_words = STOP_WORDS
class Portuguese(Language): class Portuguese(Language):
lang = "pt" lang = "pt"
Defaults = PortugueseDefaults Defaults = PortugueseDefaults
default_config = Config().from_str(DEFAULT_CONFIG)
__all__ = ["Portuguese"] __all__ = ["Portuguese"]

View File

@ -3,7 +3,7 @@ from .char_classes import LIST_ICONS, HYPHENS, CURRENCY, UNITS
from .char_classes import CONCAT_QUOTES, ALPHA_LOWER, ALPHA_UPPER, ALPHA, PUNCT from .char_classes import CONCAT_QUOTES, ALPHA_LOWER, ALPHA_UPPER, ALPHA, PUNCT
_prefixes = ( TOKENIZER_PREFIXES = (
["§", "%", "=", "", "", r"\+(?![0-9])"] ["§", "%", "=", "", "", r"\+(?![0-9])"]
+ LIST_PUNCT + LIST_PUNCT
+ LIST_ELLIPSES + LIST_ELLIPSES
@ -13,7 +13,7 @@ _prefixes = (
) )
_suffixes = ( TOKENIZER_SUFFIXES = (
LIST_PUNCT LIST_PUNCT
+ LIST_ELLIPSES + LIST_ELLIPSES
+ LIST_QUOTES + LIST_QUOTES
@ -31,7 +31,7 @@ _suffixes = (
] ]
) )
_infixes = ( TOKENIZER_INFIXES = (
LIST_ELLIPSES LIST_ELLIPSES
+ LIST_ICONS + LIST_ICONS
+ [ + [
@ -44,7 +44,3 @@ _infixes = (
r"(?<=[{a}0-9])[:<>=/](?=[{a}])".format(a=ALPHA), r"(?<=[{a}0-9])[:<>=/](?=[{a}])".format(a=ALPHA),
] ]
) )
TOKENIZER_PREFIXES = _prefixes
TOKENIZER_SUFFIXES = _suffixes
TOKENIZER_INFIXES = _infixes

View File

@ -1,49 +1,25 @@
from typing import Set
from thinc.api import Config
from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
from .stop_words import STOP_WORDS from .stop_words import STOP_WORDS
from .punctuation import TOKENIZER_PREFIXES, TOKENIZER_INFIXES from .punctuation import TOKENIZER_PREFIXES, TOKENIZER_INFIXES
from .punctuation import TOKENIZER_SUFFIXES from .punctuation import TOKENIZER_SUFFIXES
from ...language import Language from ...language import Language
from ...util import registry
# Lemma data note: # Lemma data note:
# Original pairs downloaded from http://www.lexiconista.com/datasets/lemmatization/ # Original pairs downloaded from http://www.lexiconista.com/datasets/lemmatization/
# Replaced characters using cedillas with the correct ones (ș and ț) # Replaced characters using cedillas with the correct ones (ș and ț)
DEFAULT_CONFIG = """
[nlp]
lang = "ro"
stop_words = {"@language_data": "spacy.ro.stop_words"}
[nlp.lemmatizer]
@lemmatizers = "spacy.Lemmatizer.v1"
[nlp.lemmatizer.data]
@language_data = "spacy-lookups-data"
lang = ${nlp:lang}
tables = ["lemma_lookup"]
"""
@registry.language_data("spacy.ro.stop_words")
def stop_words() -> Set[str]:
return STOP_WORDS
class RomanianDefaults(Language.Defaults): class RomanianDefaults(Language.Defaults):
tokenizer_exceptions = TOKENIZER_EXCEPTIONS tokenizer_exceptions = TOKENIZER_EXCEPTIONS
prefixes = TOKENIZER_PREFIXES prefixes = TOKENIZER_PREFIXES
suffixes = TOKENIZER_SUFFIXES suffixes = TOKENIZER_SUFFIXES
infixes = TOKENIZER_INFIXES infixes = TOKENIZER_INFIXES
stop_words = STOP_WORDS
class Romanian(Language): class Romanian(Language):
lang = "ro" lang = "ro"
Defaults = RomanianDefaults Defaults = RomanianDefaults
default_config = Config().from_str(DEFAULT_CONFIG)
__all__ = ["Romanian"] __all__ = ["Romanian"]

View File

@ -1,4 +1,4 @@
from typing import Set, Dict, Callable, Any from typing import Callable
from thinc.api import Config from thinc.api import Config
from .stop_words import STOP_WORDS from .stop_words import STOP_WORDS
@ -11,43 +11,30 @@ from ...language import Language
DEFAULT_CONFIG = """ DEFAULT_CONFIG = """
[nlp] [nlp]
lang = "ru"
stop_words = {"@language_data": "spacy.ru.stop_words"}
lex_attr_getters = {"@language_data": "spacy.ru.lex_attr_getters"}
[nlp.lemmatizer] [nlp.lemmatizer]
@lemmatizers = "spacy.RussianLemmatizer.v1" @lemmatizers = "spacy.ru.RussianLemmatizer"
[nlp.vocab_data]
@language_data = "spacy-lookups-data"
lang = ${nlp:lang}
tables = ["lexeme_norm"]
""" """
@registry.language_data("spacy.ru.stop_words") @registry.lemmatizers("spacy.ru.RussianLemmatizer")
def stop_words() -> Set[str]: def create_lemmatizer() -> Callable[[Language], RussianLemmatizer]:
return STOP_WORDS def lemmatizer_factory(nlp: Language) -> RussianLemmatizer:
@registry.language_data("spacy.ru.lex_attr_getters")
def lex_attr_getters() -> Dict[int, Callable[[str], Any]]:
return LEX_ATTRS
@registry.lemmatizers("spacy.RussianLemmatizer.v1")
def create_russian_lemmatizer() -> RussianLemmatizer:
return RussianLemmatizer() return RussianLemmatizer()
return lemmatizer_factory
class RussianDefaults(Language.Defaults): class RussianDefaults(Language.Defaults):
config = Config().from_str(DEFAULT_CONFIG)
tokenizer_exceptions = TOKENIZER_EXCEPTIONS tokenizer_exceptions = TOKENIZER_EXCEPTIONS
lex_attr_getters = LEX_ATTRS
stop_words = STOP_WORDS
class Russian(Language): class Russian(Language):
lang = "ru" lang = "ru"
Defaults = RussianDefaults Defaults = RussianDefaults
default_config = Config().from_str(DEFAULT_CONFIG)
__all__ = ["Russian"] __all__ = ["Russian"]

View File

@ -1,33 +1,16 @@
from typing import Set, Dict, Callable, Any
from thinc.api import Config
from .stop_words import STOP_WORDS from .stop_words import STOP_WORDS
from .lex_attrs import LEX_ATTRS from .lex_attrs import LEX_ATTRS
from ...language import Language from ...language import Language
from ...util import registry
DEFAULT_CONFIG = """ class SinhalaDefaults(Language.Defaults):
[nlp] lex_attr_getters = LEX_ATTRS
lang = "si" stop_words = STOP_WORDS
stop_words = {"@language_data": "spacy.si.stop_words"}
lex_attr_getters = {"@language_data": "spacy.si.lex_attr_getters"}
"""
@registry.language_data("spacy.si.stop_words")
def stop_words() -> Set[str]:
return STOP_WORDS
@registry.language_data("spacy.si.lex_attr_getters")
def lex_attr_getters() -> Dict[int, Callable[[str], Any]]:
return LEX_ATTRS
class Sinhala(Language): class Sinhala(Language):
lang = "si" lang = "si"
default_config = Config().from_str(DEFAULT_CONFIG) Defaults = SinhalaDefaults
__all__ = ["Sinhala"] __all__ = ["Sinhala"]

View File

@ -1,33 +1,16 @@
from typing import Set, Dict, Callable, Any
from thinc.api import Config
from .stop_words import STOP_WORDS from .stop_words import STOP_WORDS
from .lex_attrs import LEX_ATTRS from .lex_attrs import LEX_ATTRS
from ...language import Language from ...language import Language
from ...util import registry
DEFAULT_CONFIG = """ class SlovakDefaults(Language.Defaults):
[nlp] lex_attr_getters = LEX_ATTRS
lang = "sk" stop_words = STOP_WORDS
stop_words = {"@language_data": "spacy.sk.stop_words"}
lex_attr_getters = {"@language_data": "spacy.sk.lex_attr_getters"}
"""
@registry.language_data("spacy.sk.stop_words")
def stop_words() -> Set[str]:
return STOP_WORDS
@registry.language_data("spacy.sk.lex_attr_getters")
def lex_attr_getters() -> Dict[int, Callable[[str], Any]]:
return LEX_ATTRS
class Slovak(Language): class Slovak(Language):
lang = "sk" lang = "sk"
default_config = Config().from_str(DEFAULT_CONFIG) Defaults = SlovakDefaults
__all__ = ["Slovak"] __all__ = ["Slovak"]

View File

@ -1,26 +1,14 @@
from typing import Set
from thinc.api import Config
from .stop_words import STOP_WORDS from .stop_words import STOP_WORDS
from ...language import Language from ...language import Language
from ...util import registry
DEFAULT_CONFIG = """ class SlovenianDefaults(Language.Defaults):
[nlp] stop_words = STOP_WORDS
lang = "sl"
stop_words = {"@language_data": "spacy.sl.stop_words"}
"""
@registry.language_data("spacy.sl.stop_words")
def stop_words() -> Set[str]:
return STOP_WORDS
class Slovenian(Language): class Slovenian(Language):
lang = "sl" lang = "sl"
default_config = Config().from_str(DEFAULT_CONFIG) Defaults = SlovenianDefaults
__all__ = ["Slovenian"] __all__ = ["Slovenian"]

View File

@ -1,26 +1,14 @@
from typing import Set
from thinc.api import Config
from .stop_words import STOP_WORDS from .stop_words import STOP_WORDS
from ...language import Language from ...language import Language
from ...util import registry
DEFAULT_CONFIG = """ class AlbanianDefaults(Language.Defaults):
[nlp] stop_words = STOP_WORDS
lang = "sq"
stop_words = {"@language_data": "spacy.sq.stop_words"}
"""
@registry.language_data("spacy.sq.stop_words")
def stop_words() -> Set[str]:
return STOP_WORDS
class Albanian(Language): class Albanian(Language):
lang = "sq" lang = "sq"
default_config = Config().from_str(DEFAULT_CONFIG) Defaults = AlbanianDefaults
__all__ = ["Albanian"] __all__ = ["Albanian"]

View File

@ -1,52 +1,18 @@
from typing import Set, Dict, Callable, Any
from thinc.api import Config
from .stop_words import STOP_WORDS from .stop_words import STOP_WORDS
from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
from .lex_attrs import LEX_ATTRS from .lex_attrs import LEX_ATTRS
from ...language import Language from ...language import Language
from ...util import registry
DEFAULT_CONFIG = """
[nlp]
lang = "sr"
stop_words = {"@language_data": "spacy.sr.stop_words"}
lex_attr_getters = {"@language_data": "spacy.sr.lex_attr_getters"}
[nlp.lemmatizer]
@lemmatizers = "spacy.Lemmatizer.v1"
[nlp.lemmatizer.data]
@language_data = "spacy-lookups-data"
lang = ${nlp:lang}
tables = ["lemma_lookup"]
[nlp.vocab_data]
@language_data = "spacy-lookups-data"
lang = ${nlp:lang}
tables = ["lexeme_norm"]
"""
@registry.language_data("spacy.sr.stop_words")
def stop_words() -> Set[str]:
return STOP_WORDS
@registry.language_data("spacy.sr.lex_attr_getters")
def lex_attr_getters() -> Dict[int, Callable[[str], Any]]:
return LEX_ATTRS
class SerbianDefaults(Language.Defaults): class SerbianDefaults(Language.Defaults):
tokenizer_exceptions = TOKENIZER_EXCEPTIONS tokenizer_exceptions = TOKENIZER_EXCEPTIONS
lex_attr_getters = LEX_ATTRS
stop_words = STOP_WORDS
class Serbian(Language): class Serbian(Language):
lang = "sr" lang = "sr"
Defaults = SerbianDefaults Defaults = SerbianDefaults
default_config = Config().from_str(DEFAULT_CONFIG)
__all__ = ["Serbian"] __all__ = ["Serbian"]

View File

@ -1,59 +1,25 @@
from typing import Set, Dict, Callable, Any
from thinc.api import Config
from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
from .stop_words import STOP_WORDS from .stop_words import STOP_WORDS
from .lex_attrs import LEX_ATTRS from .lex_attrs import LEX_ATTRS
from .syntax_iterators import SYNTAX_ITERATORS
from ...language import Language from ...language import Language
from ...util import registry
from .syntax_iterators import noun_chunks
# Punctuation stolen from Danish # Punctuation stolen from Danish
from ..da.punctuation import TOKENIZER_INFIXES, TOKENIZER_SUFFIXES from ..da.punctuation import TOKENIZER_INFIXES, TOKENIZER_SUFFIXES
DEFAULT_CONFIG = """
[nlp]
lang = "sv"
stop_words = {"@language_data": "spacy.sv.stop_words"}
lex_attr_getters = {"@language_data": "spacy.sv.lex_attr_getters"}
get_noun_chunks = {"@language_data": "spacy.sv.get_noun_chunks"}
[nlp.lemmatizer]
@lemmatizers = "spacy.Lemmatizer.v1"
[nlp.lemmatizer.data]
@language_data = "spacy-lookups-data"
lang = ${nlp:lang}
tables = ["lemma_lookup", "lemma_rules"]
"""
@registry.language_data("spacy.sv.stop_words")
def stop_words() -> Set[str]:
return STOP_WORDS
@registry.language_data("spacy.sv.lex_attr_getters")
def lex_attr_getters() -> Dict[int, Callable[[str], Any]]:
return LEX_ATTRS
@registry.language_data("spacy.sv.get_noun_chunks")
def get_noun_chunks() -> Callable:
return noun_chunks
class SwedishDefaults(Language.Defaults): class SwedishDefaults(Language.Defaults):
tokenizer_exceptions = TOKENIZER_EXCEPTIONS tokenizer_exceptions = TOKENIZER_EXCEPTIONS
infixes = TOKENIZER_INFIXES infixes = TOKENIZER_INFIXES
suffixes = TOKENIZER_SUFFIXES suffixes = TOKENIZER_SUFFIXES
lex_attr_getters = LEX_ATTRS
syntax_iterators = SYNTAX_ITERATORS
stop_words = STOP_WORDS
class Swedish(Language): class Swedish(Language):
lang = "sv" lang = "sv"
Defaults = SwedishDefaults Defaults = SwedishDefaults
default_config = Config().from_str(DEFAULT_CONFIG)
__all__ = ["Swedish"] __all__ = ["Swedish"]

View File

@ -1,38 +1,16 @@
from typing import Set, Dict, Callable, Any
from thinc.api import Config
from .stop_words import STOP_WORDS from .stop_words import STOP_WORDS
from .lex_attrs import LEX_ATTRS from .lex_attrs import LEX_ATTRS
from ...language import Language from ...language import Language
from ...util import registry
DEFAULT_CONFIG = """ class TamilDefaults(Language.Defaults):
[nlp] lex_attr_getters = LEX_ATTRS
lang = "ta" stop_words = STOP_WORDS
stop_words = {"@language_data": "spacy.ta.stop_words"}
lex_attr_getters = {"@language_data": "spacy.ta.lex_attr_getters"}
[nlp.vocab_data]
@language_data = "spacy-lookups-data"
lang = ${nlp:lang}
tables = ["lexeme_norm"]
"""
@registry.language_data("spacy.ta.stop_words")
def stop_words() -> Set[str]:
return STOP_WORDS
@registry.language_data("spacy.ta.lex_attr_getters")
def lex_attr_getters() -> Dict[int, Callable[[str], Any]]:
return LEX_ATTRS
class Tamil(Language): class Tamil(Language):
lang = "ta" lang = "ta"
default_config = Config().from_str(DEFAULT_CONFIG) Defaults = TamilDefaults
__all__ = ["Tamil"] __all__ = ["Tamil"]

View File

@ -1,33 +1,16 @@
from typing import Set, Dict, Callable, Any
from thinc.api import Config
from .stop_words import STOP_WORDS from .stop_words import STOP_WORDS
from .lex_attrs import LEX_ATTRS from .lex_attrs import LEX_ATTRS
from ...language import Language from ...language import Language
from ...util import registry
DEFAULT_CONFIG = """ class TeluguDefaults(Language.Defaults):
[nlp] lex_attr_getters = LEX_ATTRS
lang = "te" stop_words = STOP_WORDS
stop_words = {"@language_data": "spacy.te.stop_words"}
lex_attr_getters = {"@language_data": "spacy.te.lex_attr_getters"}
"""
@registry.language_data("spacy.te.stop_words")
def stop_words() -> Set[str]:
return STOP_WORDS
@registry.language_data("spacy.te.lex_attr_getters")
def lex_attr_getters() -> Dict[int, Callable[[str], Any]]:
return LEX_ATTRS
class Telugu(Language): class Telugu(Language):
lang = "te" lang = "te"
default_config = Config().from_str(DEFAULT_CONFIG) Defaults = TeluguDefaults
__all__ = ["Telugu"] __all__ = ["Telugu"]

View File

@ -1,4 +1,3 @@
from typing import Set, Dict, Callable, Any
from thinc.api import Config from thinc.api import Config
from .stop_words import STOP_WORDS from .stop_words import STOP_WORDS
@ -10,31 +9,13 @@ from ...util import DummyTokenizer, registry
DEFAULT_CONFIG = """ DEFAULT_CONFIG = """
[nlp] [nlp]
lang = "th"
stop_words = {"@language_data": "spacy.th.stop_words"}
lex_attr_getters = {"@language_data": "spacy.th.lex_attr_getters"}
[nlp.tokenizer] [nlp.tokenizer]
@tokenizers = "spacy.ThaiTokenizer.v1" @tokenizers = "spacy.th.ThaiTokenizer"
[nlp.vocab_data]
@language_data = "spacy-lookups-data"
lang = ${nlp:lang}
tables = ["lexeme_norm"]
""" """
@registry.language_data("spacy.th.stop_words") @registry.tokenizers("spacy.th.ThaiTokenizer")
def stop_words() -> Set[str]:
return STOP_WORDS
@registry.language_data("spacy.th.lex_attr_getters")
def lex_attr_getters() -> Dict[int, Callable[[str], Any]]:
return LEX_ATTRS
@registry.tokenizers("spacy.ThaiTokenizer.v1")
def create_thai_tokenizer(): def create_thai_tokenizer():
def thai_tokenizer_factory(nlp): def thai_tokenizer_factory(nlp):
return ThaiTokenizer(nlp) return ThaiTokenizer(nlp)
@ -60,9 +41,15 @@ class ThaiTokenizer(DummyTokenizer):
return Doc(self.vocab, words=words, spaces=spaces) return Doc(self.vocab, words=words, spaces=spaces)
class ThaiDefaults(Language.Defaults):
config = Config().from_str(DEFAULT_CONFIG)
lex_attr_getters = LEX_ATTRS
stop_words = STOP_WORDS
class Thai(Language): class Thai(Language):
lang = "th" lang = "th"
default_config = Config().from_str(DEFAULT_CONFIG) Defaults = ThaiDefaults
__all__ = ["Thai"] __all__ = ["Thai"]

View File

@ -1,47 +1,18 @@
from typing import Set, Dict, Callable, Any
from thinc.api import Config
from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
from .stop_words import STOP_WORDS from .stop_words import STOP_WORDS
from .lex_attrs import LEX_ATTRS from .lex_attrs import LEX_ATTRS
from ...language import Language from ...language import Language
from ...util import registry
DEFAULT_CONFIG = """
[nlp]
lang = "tl"
stop_words = {"@language_data": "spacy.tl.stop_words"}
lex_attr_getters = {"@language_data": "spacy.tl.lex_attr_getters"}
[nlp.lemmatizer]
@lemmatizers = "spacy.Lemmatizer.v1"
[nlp.lemmatizer.data]
@language_data = "spacy-lookups-data"
lang = ${nlp:lang}
tables = ["lemma_lookup"]
"""
@registry.language_data("spacy.tl.stop_words")
def stop_words() -> Set[str]:
return STOP_WORDS
@registry.language_data("spacy.tl.lex_attr_getters")
def lex_attr_getters() -> Dict[int, Callable[[str], Any]]:
return LEX_ATTRS
class TagalogDefaults(Language.Defaults): class TagalogDefaults(Language.Defaults):
tokenizer_exceptions = TOKENIZER_EXCEPTIONS tokenizer_exceptions = TOKENIZER_EXCEPTIONS
lex_attr_getters = LEX_ATTRS
stop_words = STOP_WORDS
class Tagalog(Language): class Tagalog(Language):
lang = "tl" lang = "tl"
Defaults = TagalogDefaults Defaults = TagalogDefaults
default_config = Config().from_str(DEFAULT_CONFIG)
__all__ = ["Tagalog"] __all__ = ["Tagalog"]

View File

@ -1,40 +1,16 @@
from typing import Set
from thinc.api import Config
from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
from .stop_words import STOP_WORDS from .stop_words import STOP_WORDS
from ...language import Language from ...language import Language
from ...util import registry
DEFAULT_CONFIG = """
[nlp]
lang = "tr"
stop_words = {"@language_data": "spacy.tr.stop_words"}
[nlp.lemmatizer]
@lemmatizers = "spacy.Lemmatizer.v1"
[nlp.lemmatizer.data]
@language_data = "spacy-lookups-data"
lang = ${nlp:lang}
tables = ["lemma_lookup"]
"""
@registry.language_data("spacy.tr.stop_words")
def stop_words() -> Set[str]:
return STOP_WORDS
class TurkishDefaults(Language.Defaults): class TurkishDefaults(Language.Defaults):
tokenizer_exceptions = TOKENIZER_EXCEPTIONS tokenizer_exceptions = TOKENIZER_EXCEPTIONS
stop_words = STOP_WORDS
class Turkish(Language): class Turkish(Language):
lang = "tr" lang = "tr"
Defaults = TurkishDefaults Defaults = TurkishDefaults
default_config = Config().from_str(DEFAULT_CONFIG)
__all__ = ["Turkish"] __all__ = ["Turkish"]

View File

@ -1,41 +1,20 @@
from typing import Set, Dict, Callable, Any
from thinc.api import Config
from .lex_attrs import LEX_ATTRS from .lex_attrs import LEX_ATTRS
from .punctuation import TOKENIZER_INFIXES from .punctuation import TOKENIZER_INFIXES
from .stop_words import STOP_WORDS from .stop_words import STOP_WORDS
from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
from ...language import Language from ...language import Language
from ...util import registry
DEFAULT_CONFIG = """
[nlp]
lang = "tt"
stop_words = {"@language_data": "spacy.tt.stop_words"}
lex_attr_getters = {"@language_data": "spacy.tt.lex_attr_getters"}
"""
@registry.language_data("spacy.tt.stop_words")
def stop_words() -> Set[str]:
return STOP_WORDS
@registry.language_data("spacy.tt.lex_attr_getters")
def lex_attr_getters() -> Dict[int, Callable[[str], Any]]:
return LEX_ATTRS
class TatarDefaults(Language.Defaults): class TatarDefaults(Language.Defaults):
tokenizer_exceptions = TOKENIZER_EXCEPTIONS tokenizer_exceptions = TOKENIZER_EXCEPTIONS
infixes = tuple(TOKENIZER_INFIXES) infixes = TOKENIZER_INFIXES
lex_attr_getters = LEX_ATTRS
stop_words = STOP_WORDS
class Tatar(Language): class Tatar(Language):
lang = "tt" lang = "tt"
Defaults = TatarDefaults Defaults = TatarDefaults
default_config = Config().from_str(DEFAULT_CONFIG)
__all__ = ["Tatar"] __all__ = ["Tatar"]

View File

@ -1,4 +1,4 @@
from typing import Set, Dict, Callable, Any from typing import Callable
from thinc.api import Config from thinc.api import Config
from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
@ -11,38 +11,30 @@ from .lemmatizer import UkrainianLemmatizer
DEFAULT_CONFIG = """ DEFAULT_CONFIG = """
[nlp] [nlp]
lang = "uk"
stop_words = {"@language_data": "spacy.uk.stop_words"}
lex_attr_getters = {"@language_data": "spacy.uk.lex_attr_getters"}
[nlp.lemmatizer] [nlp.lemmatizer]
@lemmatizers = "spacy.UkrainianLemmatizer.v1" @lemmatizers = "spacy.uk.UkrainianLemmatizer"
""" """
@registry.language_data("spacy.uk.stop_words") @registry.lemmatizers("spacy.uk.UkrainianLemmatizer")
def stop_words() -> Set[str]: def create_ukrainian_lemmatizer() -> Callable[[Language], UkrainianLemmatizer]:
return STOP_WORDS def lemmatizer_factory(nlp: Language) -> UkrainianLemmatizer:
@registry.language_data("spacy.uk.lex_attr_getters")
def lex_attr_getters() -> Dict[int, Callable[[str], Any]]:
return LEX_ATTRS
@registry.lemmatizers("spacy.UkrainianLemmatizer.v1")
def create_ukrainian_lemmatizer() -> UkrainianLemmatizer:
return UkrainianLemmatizer() return UkrainianLemmatizer()
return lemmatizer_factory
class UkrainianDefaults(Language.Defaults): class UkrainianDefaults(Language.Defaults):
config = Config().from_str(DEFAULT_CONFIG)
tokenizer_exceptions = TOKENIZER_EXCEPTIONS tokenizer_exceptions = TOKENIZER_EXCEPTIONS
lex_attr_getters = LEX_ATTRS
stop_words = STOP_WORDS
class Ukrainian(Language): class Ukrainian(Language):
lang = "uk" lang = "uk"
Defaults = UkrainianDefaults Defaults = UkrainianDefaults
default_config = Config().from_str(DEFAULT_CONFIG)
__all__ = ["Ukrainian"] __all__ = ["Ukrainian"]

View File

@ -1,54 +1,19 @@
from typing import Set, Dict, Callable, Any
from thinc.api import Config
from .stop_words import STOP_WORDS from .stop_words import STOP_WORDS
from .lex_attrs import LEX_ATTRS from .lex_attrs import LEX_ATTRS
from .punctuation import TOKENIZER_SUFFIXES from .punctuation import TOKENIZER_SUFFIXES
from ..tokenizer_exceptions import BASE_EXCEPTIONS
from ...language import Language from ...language import Language
from ...util import registry
DEFAULT_CONFIG = """
[nlp]
lang = "ur"
stop_words = {"@language_data": "spacy.ur.stop_words"}
lex_attr_getters = {"@language_data": "spacy.ur.lex_attr_getters"}
[nlp.writing_system]
direction = "rtl"
has_case = false
has_letters = true
[nlp.lemmatizer]
@lemmatizers = "spacy.Lemmatizer.v1"
[nlp.lemmatizer.data]
@language_data = "spacy-lookups-data"
lang = ${nlp:lang}
tables = ["lemma_lookup"]
"""
@registry.language_data("spacy.ur.stop_words")
def stop_words() -> Set[str]:
return STOP_WORDS
@registry.language_data("spacy.ur.lex_attr_getters")
def lex_attr_getters() -> Dict[int, Callable[[str], Any]]:
return LEX_ATTRS
class UrduDefaults(Language.Defaults): class UrduDefaults(Language.Defaults):
tokenizer_exceptions = BASE_EXCEPTIONS
suffixes = TOKENIZER_SUFFIXES suffixes = TOKENIZER_SUFFIXES
lex_attr_getters = LEX_ATTRS
stop_words = STOP_WORDS
writing_system = {"direction": "rtl", "has_case": False, "has_letters": True}
class Urdu(Language): class Urdu(Language):
lang = "ur" lang = "ur"
Defaults = UrduDefaults Defaults = UrduDefaults
default_config = Config().from_str(DEFAULT_CONFIG)
__all__ = ["Urdu"] __all__ = ["Urdu"]

View File

@ -1,4 +1,3 @@
from typing import Set, Dict, Callable, Any
from thinc.api import Config from thinc.api import Config
from ...language import Language from ...language import Language
@ -10,27 +9,14 @@ from .lex_attrs import LEX_ATTRS
DEFAULT_CONFIG = """ DEFAULT_CONFIG = """
[nlp] [nlp]
lang = "vi"
stop_words = {"@language_data": "spacy.vi.stop_words"}
lex_attr_getters = {"@language_data": "spacy.vi.lex_attr_getters"}
[nlp.tokenizer] [nlp.tokenizer]
@tokenizers = "spacy.VietnameseTokenizer.v1" @tokenizers = "spacy.vi.VietnameseTokenizer"
use_pyvi = true use_pyvi = true
""" """
@registry.language_data("spacy.vi.stop_words") @registry.tokenizers("spacy.vi.VietnameseTokenizer")
def stop_words() -> Set[str]:
return STOP_WORDS
@registry.language_data("spacy.vi.lex_attr_getters")
def lex_attr_getters() -> Dict[int, Callable[[str], Any]]:
return LEX_ATTRS
@registry.tokenizers("spacy.VietnameseTokenizer.v1")
def create_vietnamese_tokenizer(use_pyvi: bool = True,): def create_vietnamese_tokenizer(use_pyvi: bool = True,):
def vietnamese_tokenizer_factory(nlp): def vietnamese_tokenizer_factory(nlp):
return VietnameseTokenizer(nlp, use_pyvi=use_pyvi) return VietnameseTokenizer(nlp, use_pyvi=use_pyvi)
@ -68,9 +54,15 @@ class VietnameseTokenizer(DummyTokenizer):
return Doc(self.vocab, words=words, spaces=spaces) return Doc(self.vocab, words=words, spaces=spaces)
class VietnameseDefaults(Language.Defaults):
config = Config().from_str(DEFAULT_CONFIG)
lex_attr_getters = LEX_ATTRS
stop_words = STOP_WORDS
class Vietnamese(Language): class Vietnamese(Language):
lang = "vi" lang = "vi"
default_config = Config().from_str(DEFAULT_CONFIG) Defaults = VietnameseDefaults
__all__ = ["Vietnamese"] __all__ = ["Vietnamese"]

View File

@ -1,27 +1,12 @@
from thinc.api import Config
from ..tokenizer_exceptions import BASE_EXCEPTIONS
from ...language import Language from ...language import Language
DEFAULT_CONFIG = """
[nlp]
lang = "xx"
"""
class MultiLanguageDefaults(Language.Defaults):
tokenizer_exceptions = BASE_EXCEPTIONS
class MultiLanguage(Language): class MultiLanguage(Language):
"""Language class to be used for models that support multiple languages. """Language class to be used for models that support multiple languages.
This module allows models to specify their language ID as 'xx'. This module allows models to specify their language ID as 'xx'.
""" """
lang = "xx" lang = "xx"
Defaults = MultiLanguageDefaults
default_config = Config().from_str(DEFAULT_CONFIG)
__all__ = ["MultiLanguage"] __all__ = ["MultiLanguage"]

View File

@ -1,39 +1,16 @@
from typing import Set, Dict, Callable, Any
from thinc.api import Config
from .stop_words import STOP_WORDS from .stop_words import STOP_WORDS
from .lex_attrs import LEX_ATTRS from .lex_attrs import LEX_ATTRS
from ..tokenizer_exceptions import BASE_EXCEPTIONS
from ...language import Language from ...language import Language
from ...util import registry
DEFAULT_CONFIG = """
[nlp]
lang = "si"
stop_words = {"@language_data": "spacy.yo.stop_words"}
lex_attr_getters = {"@language_data": "spacy.yo.lex_attr_getters"}
"""
@registry.language_data("spacy.yo.stop_words")
def stop_words() -> Set[str]:
return STOP_WORDS
@registry.language_data("spacy.yo.lex_attr_getters")
def lex_attr_getters() -> Dict[int, Callable[[str], Any]]:
return LEX_ATTRS
class YorubaDefaults(Language.Defaults): class YorubaDefaults(Language.Defaults):
tokenizer_exceptions = BASE_EXCEPTIONS lex_attr_getters = LEX_ATTRS
stop_words = STOP_WORDS
class Yoruba(Language): class Yoruba(Language):
lang = "yo" lang = "yo"
Defaults = YorubaDefaults Defaults = YorubaDefaults
default_config = Config().from_str(DEFAULT_CONFIG)
__all__ = ["Yoruba"] __all__ = ["Yoruba"]

View File

@ -1,4 +1,4 @@
from typing import Optional, List, Set, Dict, Callable, Any from typing import Optional, List, Dict, Any
from enum import Enum from enum import Enum
import tempfile import tempfile
import srsly import srsly
@ -10,7 +10,6 @@ from ...errors import Warnings, Errors
from ...language import Language from ...language import Language
from ...tokens import Doc from ...tokens import Doc
from ...util import DummyTokenizer, registry from ...util import DummyTokenizer, registry
from ..tokenizer_exceptions import BASE_EXCEPTIONS
from .lex_attrs import LEX_ATTRS from .lex_attrs import LEX_ATTRS
from .stop_words import STOP_WORDS from .stop_words import STOP_WORDS
from ... import util from ... import util
@ -20,20 +19,12 @@ _PKUSEG_INSTALL_MSG = "install it with `pip install pkuseg==0.0.25` or from http
DEFAULT_CONFIG = """ DEFAULT_CONFIG = """
[nlp] [nlp]
lang = "zh"
stop_words = {"@language_data": "spacy.zh.stop_words"}
lex_attr_getters = {"@language_data": "spacy.zh.lex_attr_getters"}
[nlp.tokenizer] [nlp.tokenizer]
@tokenizers = "spacy.ChineseTokenizer.v1" @tokenizers = "spacy.zh.ChineseTokenizer"
segmenter = "char" segmenter = "char"
pkuseg_model = null pkuseg_model = null
pkuseg_user_dict = "default" pkuseg_user_dict = "default"
[nlp.writing_system]
direction = "ltr"
has_case = false
has_letters = false
""" """
@ -47,17 +38,7 @@ class Segmenter(str, Enum):
return list(cls.__members__.keys()) return list(cls.__members__.keys())
@registry.language_data("spacy.zh.stop_words") @registry.tokenizers("spacy.zh.ChineseTokenizer")
def stop_words() -> Set[str]:
return STOP_WORDS
@registry.language_data("spacy.zh.lex_attr_getters")
def lex_attr_getters() -> Dict[int, Callable[[str], Any]]:
return LEX_ATTRS
@registry.tokenizers("spacy.ChineseTokenizer.v1")
def create_chinese_tokenizer( def create_chinese_tokenizer(
segmenter: Segmenter = Segmenter.char, segmenter: Segmenter = Segmenter.char,
pkuseg_model: Optional[str] = None, pkuseg_model: Optional[str] = None,
@ -155,6 +136,18 @@ class ChineseTokenizer(DummyTokenizer):
warn_msg = Warnings.W104.format(target="pkuseg", current=self.segmenter) warn_msg = Warnings.W104.format(target="pkuseg", current=self.segmenter)
warnings.warn(warn_msg) warnings.warn(warn_msg)
def _get_config(self) -> Dict[str, Any]:
return {
"segmenter": self.segmenter,
"pkuseg_model": self.pkuseg_model,
"pkuseg_user_dict": self.pkuseg_user_dict,
}
def _set_config(self, config: Dict[str, Any] = {}) -> None:
self.segmenter = config.get("segmenter", Segmenter.char)
self.pkuseg_model = config.get("pkuseg_model", None)
self.pkuseg_user_dict = config.get("pkuseg_user_dict", "default")
def to_bytes(self, **kwargs): def to_bytes(self, **kwargs):
pkuseg_features_b = b"" pkuseg_features_b = b""
pkuseg_weights_b = b"" pkuseg_weights_b = b""
@ -175,6 +168,7 @@ class ChineseTokenizer(DummyTokenizer):
sorted(list(self.pkuseg_seg.postprocesser.other_words)), sorted(list(self.pkuseg_seg.postprocesser.other_words)),
) )
serializers = { serializers = {
"cfg": lambda: srsly.json_dumps(self._get_config()),
"pkuseg_features": lambda: pkuseg_features_b, "pkuseg_features": lambda: pkuseg_features_b,
"pkuseg_weights": lambda: pkuseg_weights_b, "pkuseg_weights": lambda: pkuseg_weights_b,
"pkuseg_processors": lambda: srsly.msgpack_dumps(pkuseg_processors_data), "pkuseg_processors": lambda: srsly.msgpack_dumps(pkuseg_processors_data),
@ -194,6 +188,7 @@ class ChineseTokenizer(DummyTokenizer):
pkuseg_data["processors_data"] = srsly.msgpack_loads(b) pkuseg_data["processors_data"] = srsly.msgpack_loads(b)
deserializers = { deserializers = {
"cfg": lambda b: self._set_config(srsly.json_loads(b)),
"pkuseg_features": deserialize_pkuseg_features, "pkuseg_features": deserialize_pkuseg_features,
"pkuseg_weights": deserialize_pkuseg_weights, "pkuseg_weights": deserialize_pkuseg_weights,
"pkuseg_processors": deserialize_pkuseg_processors, "pkuseg_processors": deserialize_pkuseg_processors,
@ -246,6 +241,7 @@ class ChineseTokenizer(DummyTokenizer):
srsly.write_msgpack(path, data) srsly.write_msgpack(path, data)
serializers = { serializers = {
"cfg": lambda p: srsly.write_json(p, self._get_config()),
"pkuseg_model": lambda p: save_pkuseg_model(p), "pkuseg_model": lambda p: save_pkuseg_model(p),
"pkuseg_processors": lambda p: save_pkuseg_processors(p), "pkuseg_processors": lambda p: save_pkuseg_processors(p),
} }
@ -281,6 +277,7 @@ class ChineseTokenizer(DummyTokenizer):
self.pkuseg_seg.postprocesser.other_words = set(other_words) self.pkuseg_seg.postprocesser.other_words = set(other_words)
serializers = { serializers = {
"cfg": lambda p: self._set_config(srsly.read_json(p)),
"pkuseg_model": lambda p: load_pkuseg_model(p), "pkuseg_model": lambda p: load_pkuseg_model(p),
"pkuseg_processors": lambda p: load_pkuseg_processors(p), "pkuseg_processors": lambda p: load_pkuseg_processors(p),
} }
@ -288,13 +285,15 @@ class ChineseTokenizer(DummyTokenizer):
class ChineseDefaults(Language.Defaults): class ChineseDefaults(Language.Defaults):
tokenizer_exceptions = BASE_EXCEPTIONS config = Config().from_str(DEFAULT_CONFIG)
lex_attr_getters = LEX_ATTRS
stop_words = STOP_WORDS
writing_system = {"direction": "ltr", "has_case": False, "has_letters": False}
class Chinese(Language): class Chinese(Language):
lang = "zh" lang = "zh"
Defaults = ChineseDefaults Defaults = ChineseDefaults
default_config = Config().from_str(DEFAULT_CONFIG)
def try_jieba_import(segmenter: str) -> None: def try_jieba_import(segmenter: str) -> None:

View File

@ -16,27 +16,25 @@ import multiprocessing as mp
from itertools import chain, cycle from itertools import chain, cycle
from .tokens.underscore import Underscore from .tokens.underscore import Underscore
from .vocab import Vocab from .vocab import Vocab, create_vocab
from .pipe_analysis import analyze_pipes, analyze_all_pipes, validate_attrs from .pipe_analysis import analyze_pipes, analyze_all_pipes, validate_attrs
from .gold import Example from .gold import Example
from .scorer import Scorer from .scorer import Scorer
from .util import link_vectors_to_models, create_default_optimizer, registry from .util import link_vectors_to_models, create_default_optimizer, registry
from .util import SimpleFrozenDict from .util import SimpleFrozenDict
from .lang.tokenizer_exceptions import URL_MATCH, BASE_EXCEPTIONS
from .lang.punctuation import TOKENIZER_PREFIXES, TOKENIZER_SUFFIXES from .lang.punctuation import TOKENIZER_PREFIXES, TOKENIZER_SUFFIXES
from .lang.punctuation import TOKENIZER_INFIXES from .lang.punctuation import TOKENIZER_INFIXES
from .tokens import Doc from .tokens import Doc
from .lookups import load_lookups
from .tokenizer import Tokenizer
from .lemmatizer import Lemmatizer
from .errors import Errors, Warnings from .errors import Errors, Warnings
from .schemas import ConfigSchema from .schemas import ConfigSchema
from .git_info import GIT_VERSION from .git_info import GIT_VERSION
from . import util from . import util
from . import about from . import about
# We also need to import these to make sure the functions are registered
from .tokenizer import Tokenizer # noqa: F401
from .lemmatizer import Lemmatizer # noqa: F401
from .lookups import Lookups # noqa: F401
from .lang import defaults # noqa: F401
ENABLE_PIPELINE_ANALYSIS = False ENABLE_PIPELINE_ANALYSIS = False
# This is the base config will all settings (training etc.) # This is the base config will all settings (training etc.)
@ -45,10 +43,50 @@ DEFAULT_CONFIG = Config().from_disk(DEFAULT_CONFIG_PATH)
class BaseDefaults: class BaseDefaults:
prefixes: Tuple[Pattern, ...] = tuple(TOKENIZER_PREFIXES) config: Config = Config()
suffixes: Tuple[Pattern, ...] = tuple(TOKENIZER_SUFFIXES) tokenizer_exceptions: Dict[str, List[dict]] = BASE_EXCEPTIONS
infixes: Tuple[Pattern, ...] = tuple(TOKENIZER_INFIXES) prefixes: Optional[List[Union[str, Pattern]]] = TOKENIZER_PREFIXES
tokenizer_exceptions: Dict[str, List[dict]] = {} suffixes: Optional[List[Union[str, Pattern]]] = TOKENIZER_SUFFIXES
infixes: Optional[List[Union[str, Pattern]]] = TOKENIZER_INFIXES
token_match: Optional[Pattern] = None
url_match: Optional[Pattern] = URL_MATCH
syntax_iterators: Dict[str, Callable] = {}
lex_attr_getters: Dict[int, Callable[[str], Any]] = {}
stop_words = set()
writing_system = {"direction": "ltr", "has_case": True, "has_letters": True}
@registry.tokenizers("spacy.Tokenizer.v1")
def create_tokenizer() -> Callable[["Language"], Tokenizer]:
    """Registered factory for the default tokenizer.

    RETURNS (Callable[[Language], Tokenizer]): A function that receives the
        nlp object and builds a Tokenizer from the settings on nlp.Defaults.
    """

    def tokenizer_factory(nlp: "Language") -> Tokenizer:
        defaults = nlp.Defaults
        # Each affix matcher stays None when the defaults provide no rules
        # for it, so nothing is compiled from an empty/None rule set.
        prefix_search = None
        if defaults.prefixes:
            prefix_search = util.compile_prefix_regex(defaults.prefixes).search
        suffix_search = None
        if defaults.suffixes:
            suffix_search = util.compile_suffix_regex(defaults.suffixes).search
        infix_finditer = None
        if defaults.infixes:
            infix_finditer = util.compile_infix_regex(defaults.infixes).finditer
        return Tokenizer(
            nlp.vocab,
            rules=defaults.tokenizer_exceptions,
            prefix_search=prefix_search,
            suffix_search=suffix_search,
            infix_finditer=infix_finditer,
            token_match=defaults.token_match,
            url_match=defaults.url_match,
        )

    return tokenizer_factory
@registry.lemmatizers("spacy.Lemmatizer.v1")
def create_lemmatizer() -> Callable[["Language"], "Lemmatizer"]:
    """Registered factory for the default lemmatizer.

    RETURNS (Callable[[Language], Lemmatizer]): A function that receives the
        nlp object and builds a Lemmatizer from the lookup tables loaded for
        nlp.lang.
    """
    lemma_tables = ["lemma_lookup", "lemma_rules", "lemma_exc", "lemma_index"]

    def lemmatizer_factory(nlp: "Language") -> "Lemmatizer":
        # strict=False is passed so that loading is non-strict; presumably a
        # language without some of these tables does not raise here.
        return Lemmatizer(
            lookups=load_lookups(lang=nlp.lang, tables=lemma_tables, strict=False)
        )

    return lemmatizer_factory
class Language: class Language:
@ -65,8 +103,8 @@ class Language:
Defaults = BaseDefaults Defaults = BaseDefaults
lang: str = None lang: str = None
default_config = DEFAULT_CONFIG default_config = DEFAULT_CONFIG
factories = SimpleFrozenDict(error=Errors.E957)
factories = SimpleFrozenDict(error=Errors.E957)
_factory_meta: Dict[str, "FactoryMeta"] = {} # meta by factory _factory_meta: Dict[str, "FactoryMeta"] = {} # meta by factory
def __init__( def __init__(
@ -75,6 +113,7 @@ class Language:
max_length: int = 10 ** 6, max_length: int = 10 ** 6,
meta: Dict[str, Any] = {}, meta: Dict[str, Any] = {},
create_tokenizer: Optional[Callable[["Language"], Callable[[str], Doc]]] = None, create_tokenizer: Optional[Callable[["Language"], Callable[[str], Doc]]] = None,
create_lemmatizer: Optional[Callable[["Language"], Callable]] = None,
**kwargs, **kwargs,
): ):
"""Initialise a Language object. """Initialise a Language object.
@ -108,7 +147,16 @@ class Language:
if vocab is True: if vocab is True:
vectors_name = meta.get("vectors", {}).get("name") vectors_name = meta.get("vectors", {}).get("name")
vocab = Vocab.from_config(self._config, vectors_name=vectors_name) if not create_lemmatizer:
lemma_cfg = {"lemmatizer": self._config["nlp"]["lemmatizer"]}
create_lemmatizer = registry.make_from_config(lemma_cfg)["lemmatizer"]
# TODO: where does the vocab data come in?
vocab = create_vocab(
self.lang,
self.Defaults,
lemmatizer=create_lemmatizer(self),
vectors_name=vectors_name,
)
else: else:
if (self.lang and vocab.lang) and (self.lang != vocab.lang): if (self.lang and vocab.lang) and (self.lang != vocab.lang):
raise ValueError(Errors.E150.format(nlp=self.lang, vocab=vocab.lang)) raise ValueError(Errors.E150.format(nlp=self.lang, vocab=vocab.lang))
@ -126,7 +174,10 @@ class Language:
def __init_subclass__(cls, **kwargs): def __init_subclass__(cls, **kwargs):
super().__init_subclass__(**kwargs) super().__init_subclass__(**kwargs)
cls.default_config = util.deep_merge_configs(cls.default_config, DEFAULT_CONFIG) cls.default_config = util.deep_merge_configs(
cls.Defaults.config, DEFAULT_CONFIG
)
cls.default_config["nlp"]["lang"] = cls.lang
@property @property
def path(self): def path(self):
@ -1226,17 +1277,16 @@ class Language:
config = util.deep_merge_configs(config, cls.default_config) config = util.deep_merge_configs(config, cls.default_config)
if "nlp" not in config: if "nlp" not in config:
raise ValueError(Errors.E985.format(config=config)) raise ValueError(Errors.E985.format(config=config))
nlp_config = config["nlp"] config_lang = config["nlp"]["lang"]
config_lang = nlp_config["lang"]
if cls.lang is not None and config_lang is not None and config_lang != cls.lang: if cls.lang is not None and config_lang is not None and config_lang != cls.lang:
raise ValueError( raise ValueError(
Errors.E958.format( Errors.E958.format(
bad_lang_code=nlp_config["lang"], bad_lang_code=config["nlp"]["lang"],
lang_code=cls.lang, lang_code=cls.lang,
lang=util.get_object_name(cls), lang=util.get_object_name(cls),
) )
) )
nlp_config["lang"] = cls.lang config["nlp"]["lang"] = cls.lang
# This isn't very elegant, but we remove the [components] block here to prevent # This isn't very elegant, but we remove the [components] block here to prevent
# it from getting resolved (causes problems because we expect to pass in # it from getting resolved (causes problems because we expect to pass in
# the nlp and name args for each component). If we're auto-filling, we're # the nlp and name args for each component). If we're auto-filling, we're
@ -1251,22 +1301,12 @@ class Language:
filled["components"] = orig_pipeline filled["components"] = orig_pipeline
config["components"] = orig_pipeline config["components"] = orig_pipeline
create_tokenizer = resolved["nlp"]["tokenizer"] create_tokenizer = resolved["nlp"]["tokenizer"]
lemmatizer = resolved["nlp"]["lemmatizer"] create_lemmatizer = resolved["nlp"]["lemmatizer"]
lex_attr_getters = resolved["nlp"]["lex_attr_getters"] nlp = cls(
stop_words = resolved["nlp"]["stop_words"] create_tokenizer=create_tokenizer, create_lemmatizer=create_lemmatizer,
vocab_data = resolved["nlp"]["vocab_data"]
get_noun_chunks = resolved["nlp"]["get_noun_chunks"]
vocab = Vocab.from_config(
filled,
lemmatizer=lemmatizer,
lex_attr_getters=lex_attr_getters,
stop_words=stop_words,
vocab_data=vocab_data,
get_noun_chunks=get_noun_chunks,
) )
nlp = cls(vocab, create_tokenizer=create_tokenizer)
pipeline = config.get("components", {}) pipeline = config.get("components", {})
for pipe_name in nlp_config["pipeline"]: for pipe_name in config["nlp"]["pipeline"]:
if pipe_name not in pipeline: if pipe_name not in pipeline:
opts = ", ".join(pipeline.keys()) opts = ", ".join(pipeline.keys())
raise ValueError(Errors.E956.format(name=pipe_name, opts=opts)) raise ValueError(Errors.E956.format(name=pipe_name, opts=opts))

View File

@ -2,12 +2,6 @@ from typing import Optional, Callable, List, Dict
from .lookups import Lookups from .lookups import Lookups
from .parts_of_speech import NAMES as UPOS_NAMES from .parts_of_speech import NAMES as UPOS_NAMES
from .util import registry
@registry.lemmatizers("spacy.Lemmatizer.v1")
def create_lemmatizer(data: Dict[str, str] = {}) -> "Lemmatizer":
return Lemmatizer(data=data)
class Lemmatizer: class Lemmatizer:
@ -21,7 +15,6 @@ class Lemmatizer:
def __init__( def __init__(
self, self,
lookups: Optional[Lookups] = None, lookups: Optional[Lookups] = None,
data: Dict[str, dict] = {},
is_base_form: Optional[Callable] = None, is_base_form: Optional[Callable] = None,
) -> None: ) -> None:
"""Initialize a Lemmatizer. """Initialize a Lemmatizer.
@ -31,9 +24,6 @@ class Lemmatizer:
RETURNS (Lemmatizer): The newly constructed object. RETURNS (Lemmatizer): The newly constructed object.
""" """
self.lookups = lookups if lookups is not None else Lookups() self.lookups = lookups if lookups is not None else Lookups()
for name, table in data.items():
if table is not None:
self.lookups.add_table(name, table)
self.is_base_form = is_base_form self.is_base_form = is_base_form
def __call__( def __call__(

View File

@ -13,7 +13,9 @@ UNSET = object()
@registry.language_data("spacy-lookups-data") @registry.language_data("spacy-lookups-data")
def get_lookups(lang: str, tables: List[str]) -> Optional[Dict[str, Any]]: def load_lookups(
lang: str, tables: List[str], strict: bool = True
) -> Optional[Dict[str, Any]]:
"""Load the data from the spacy-lookups-data package for a given language, """Load the data from the spacy-lookups-data package for a given language,
if available. Returns an empty dict if there's no data or if the package if available. Returns an empty dict if there's no data or if the package
is not installed. is not installed.
@ -24,15 +26,19 @@ def get_lookups(lang: str, tables: List[str]) -> Optional[Dict[str, Any]]:
RETURNS (Dict[str, Any]): The lookups, keyed by table name. RETURNS (Dict[str, Any]): The lookups, keyed by table name.
""" """
# TODO: import spacy_lookups_data instead of going via entry points here? # TODO: import spacy_lookups_data instead of going via entry points here?
lookups = Lookups()
if lang not in registry.lookups: if lang not in registry.lookups:
return {} return lookups
data = registry.lookups.get(lang) data = registry.lookups.get(lang)
result = {}
for table in tables: for table in tables:
if table not in data: if table not in data:
if strict:
raise ValueError("TODO: unknown table") raise ValueError("TODO: unknown table")
result[table] = load_language_data(data[table]) language_data = {}
return result else:
language_data = load_language_data(data[table])
lookups.add_table(table, language_data)
return lookups
class Lookups: class Lookups:

View File

@ -239,11 +239,7 @@ class ConfigSchemaNlp(BaseModel):
pipeline: List[StrictStr] = Field(..., title="The pipeline component names in order") pipeline: List[StrictStr] = Field(..., title="The pipeline component names in order")
tokenizer: Callable = Field(..., title="The tokenizer to use") tokenizer: Callable = Field(..., title="The tokenizer to use")
lemmatizer: Callable = Field(..., title="The lemmatizer to use") lemmatizer: Callable = Field(..., title="The lemmatizer to use")
writing_system: ConfigSchemaNlpWritingSystem = Field(..., title="The language's writing system")
stop_words: Sequence[StrictStr] = Field(..., title="Stop words to mark via Token/Lexeme.is_stop")
lex_attr_getters: Dict[StrictStr, Callable] = Field(..., title="Custom getter functions for lexical attributes (e.g. like_num)")
vocab_data: Dict[StrictStr, Dict[StrictStr, Any]] = Field(..., title="Vocabulary data, e.g. lexeme normalization tables") vocab_data: Dict[StrictStr, Dict[StrictStr, Any]] = Field(..., title="Vocabulary data, e.g. lexeme normalization tables")
get_noun_chunks: Optional[Callable] = Field(..., title="Function to extract noun phrases from a Doc")
# fmt: on # fmt: on
class Config: class Config:

View File

@ -257,7 +257,7 @@ def zh_tokenizer_char():
def zh_tokenizer_jieba(): def zh_tokenizer_jieba():
pytest.importorskip("jieba") pytest.importorskip("jieba")
config = { config = {
"@tokenizers": "spacy.ChineseTokenizer.v1", "@tokenizers": "spacy.zh.ChineseTokenizer",
"segmenter": "jieba", "segmenter": "jieba",
} }
nlp = get_lang_class("zh").from_config({"nlp": {"tokenizer": config}}) nlp = get_lang_class("zh").from_config({"nlp": {"tokenizer": config}})
@ -268,7 +268,7 @@ def zh_tokenizer_jieba():
def zh_tokenizer_pkuseg(): def zh_tokenizer_pkuseg():
pytest.importorskip("pkuseg") pytest.importorskip("pkuseg")
config = { config = {
"@tokenizers": "spacy.ChineseTokenizer.v1", "@tokenizers": "spacy.zh.ChineseTokenizer",
"segmenter": "pkuseg", "segmenter": "pkuseg",
"pkuseg_model": "default", "pkuseg_model": "default",
} }

View File

@ -26,37 +26,6 @@ from .attrs import intify_attrs
from .symbols import ORTH from .symbols import ORTH
@registry.tokenizers("spacy.Tokenizer.v1")
def create_tokenizer(
# exceptions: Dict[str, List[dict]],
# prefixes: Optional[List[Union[str, Pattern]]],
# suffixes: Optional[List[Union[str, Pattern]]],
# infixes: Optional[List[Union[str, Pattern]]],
# We currently can't validate against Pattern because that will cause
# Pydantic to parse value *as* pattern
token_match: Optional[Any] = None,
url_match: Optional[Any] = None,
) -> "Tokenizer":
def tokenizer_factory(nlp):
exceptions = nlp.Defaults.tokenizer_exceptions
prefixes = nlp.Defaults.prefixes
suffixes = nlp.Defaults.suffixes
infixes = nlp.Defaults.infixes
prefix_search = util.compile_prefix_regex(prefixes).search if prefixes else None
suffix_search = util.compile_suffix_regex(suffixes).search if suffixes else None
infix_finditer = util.compile_infix_regex(infixes).finditer if infixes else None
return Tokenizer(
nlp.vocab,
rules=exceptions,
prefix_search=prefix_search,
suffix_search=suffix_search,
infix_finditer=infix_finditer,
token_match=token_match,
url_match=url_match,
)
return tokenizer_factory
cdef class Tokenizer: cdef class Tokenizer:
"""Segment text, and create Doc objects with the discovered segment """Segment text, and create Doc objects with the discovered segment
boundaries. boundaries.

View File

@ -23,6 +23,33 @@ from .lang.norm_exceptions import BASE_NORMS
from .lang.lex_attrs import LEX_ATTRS, is_stop, get_lang from .lang.lex_attrs import LEX_ATTRS, is_stop, get_lang
def create_vocab(lang, defaults, lemmatizer=None, vocab_data=None, vectors_name=None):
    """Create a Vocab from a language code and a language defaults object.

    lang: The language code, bound into the LANG lexical attribute getter.
    defaults: The language defaults. Its lex_attr_getters, stop_words,
        writing_system and syntax_iterators attributes are read here.
    lemmatizer: Optional lemmatizer, passed through to the Vocab.
    vocab_data (dict): Optional tables keyed by table name. Every entry is
        added as a lookups table; the "lexeme_norm" entry (if present) is
        also used for the NORM getter.
    vectors_name: Optional vectors name, passed through to the Vocab.
    RETURNS (Vocab): The newly constructed vocab.
    """
    # Use a None sentinel rather than a shared mutable {} default.
    if vocab_data is None:
        vocab_data = {}
    # Language-specific getters override the shared base getters.
    lex_attrs = {**LEX_ATTRS, **defaults.lex_attr_getters}
    # This is messy, but it's the minimal working fix to Issue #639.
    lex_attrs[IS_STOP] = functools.partial(is_stop, stops=defaults.stop_words)
    # Ensure that getter can be pickled
    lex_attrs[LANG] = functools.partial(get_lang, lang=lang)
    lex_attrs[NORM] = util.add_lookups(
        lex_attrs.get(NORM, LEX_ATTRS[NORM]),
        BASE_NORMS,
        vocab_data.get("lexeme_norm", {}),
    )
    lookups = Lookups()
    for name, data in vocab_data.items():
        if name not in lookups:
            # A table explicitly set to None is registered as an empty table.
            data = data if data is not None else {}
            lookups.add_table(name, data)
    return Vocab(
        lex_attr_getters=lex_attrs,
        lemmatizer=lemmatizer,
        lookups=lookups,
        writing_system=defaults.writing_system,
        get_noun_chunks=defaults.syntax_iterators.get("noun_chunks"),
        vectors_name=vectors_name,
    )
cdef class Vocab: cdef class Vocab:
"""A look-up table that allows you to access `Lexeme` objects. The `Vocab` """A look-up table that allows you to access `Lexeme` objects. The `Vocab`
instance also provides access to the `StringStore`, and owns underlying instance also provides access to the `StringStore`, and owns underlying
@ -31,7 +58,7 @@ cdef class Vocab:
DOCS: https://spacy.io/api/vocab DOCS: https://spacy.io/api/vocab
""" """
def __init__(self, lex_attr_getters=None, lemmatizer=None, def __init__(self, lex_attr_getters=None, lemmatizer=None,
strings=tuple(), lookups=None, tag_map={}, vocab_data={}, strings=tuple(), lookups=None, tag_map={},
oov_prob=-20., vectors_name=None, writing_system={}, oov_prob=-20., vectors_name=None, writing_system={},
get_noun_chunks=None, **deprecated_kwargs): get_noun_chunks=None, **deprecated_kwargs):
"""Create the vocabulary. """Create the vocabulary.
@ -51,10 +78,6 @@ cdef class Vocab:
lex_attr_getters = lex_attr_getters if lex_attr_getters is not None else {} lex_attr_getters = lex_attr_getters if lex_attr_getters is not None else {}
if lookups in (None, True, False): if lookups in (None, True, False):
lookups = Lookups() lookups = Lookups()
for name, data in vocab_data.items():
if name not in lookups:
data = data if data is not None else {}
lookups.add_table(name, data)
if lemmatizer in (None, True, False): if lemmatizer in (None, True, False):
lemmatizer = Lemmatizer(lookups) lemmatizer = Lemmatizer(lookups)
self.cfg = {'oov_prob': oov_prob} self.cfg = {'oov_prob': oov_prob}
@ -416,66 +439,6 @@ cdef class Vocab:
orth = self.strings.add(orth) orth = self.strings.add(orth)
return orth in self.vectors return orth in self.vectors
@classmethod
def from_config(
cls,
config,
lemmatizer=None,
lex_attr_getters=None,
stop_words=None,
vocab_data=None,
get_noun_chunks=None,
vectors_name=None,
):
"""Create a Vocab from a config and (currently) language defaults, i.e.
nlp.Defaults.
config (Dict[str, Any]): The full config.
lemmatizer (Callable): Optional lemmatizer.
vectors_name (str): Optional vectors name.
RETURNS (Vocab): The vocab.
"""
# TODO: make this less messy move lemmatizer out into its own pipeline
# component, move language defaults to config
lang = config["nlp"]["lang"]
writing_system = config["nlp"]["writing_system"]
if not lemmatizer:
lemma_cfg = {"lemmatizer": config["nlp"]["lemmatizer"]}
lemmatizer = registry.make_from_config(lemma_cfg)["lemmatizer"]
if stop_words is None:
stop_words_cfg = {"stop_words": config["nlp"]["stop_words"]}
stop_words = registry.make_from_config(stop_words_cfg)["stop_words"]
if vocab_data is None:
vocab_data_cfg = {"vocab_data": config["nlp"]["vocab_data"]}
vocab_data = registry.make_from_config(vocab_data_cfg)["vocab_data"]
if get_noun_chunks is None:
noun_chunks_cfg = {"get_noun_chunks": config["nlp"]["get_noun_chunks"]}
get_noun_chunks = registry.make_from_config(noun_chunks_cfg)["get_noun_chunks"]
if lex_attr_getters is None:
lex_attrs_cfg = {"lex_attr_getters": config["nlp"]["lex_attr_getters"]}
lex_attr_getters = registry.make_from_config(lex_attrs_cfg)["lex_attr_getters"]
lex_attrs = dict(LEX_ATTRS)
lex_attrs.update(lex_attr_getters)
# This is messy, but it's the minimal working fix to Issue #639.
lex_attrs[IS_STOP] = functools.partial(is_stop, stops=stop_words)
# Ensure that getter can be pickled
lex_attrs[LANG] = functools.partial(get_lang, lang=lang)
lex_attrs[NORM] = util.add_lookups(
lex_attrs.get(NORM, LEX_ATTRS[NORM]),
BASE_NORMS,
vocab_data.get("lexeme_norm", {}),
)
vocab = cls(
lex_attr_getters=lex_attrs,
vocab_data=vocab_data,
lemmatizer=lemmatizer,
writing_system=writing_system,
get_noun_chunks=get_noun_chunks
)
if vocab.vectors.name is None and vectors_name:
vocab.vectors.name = vectors_name
return vocab
def to_disk(self, path, exclude=tuple()): def to_disk(self, path, exclude=tuple()):
"""Save the current state to a directory. """Save the current state to a directory.