Simplify language data and revert detailed configs

Ines Montani 2020-07-24 14:50:26 +02:00
parent 87737a5a60
commit 38f6ea7a78
70 changed files with 414 additions and 1677 deletions
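
All of the per-language diffs below follow the same shape: the registry-plus-config approach (stop words, lexical attribute getters, noun chunks and writing systems exposed through `@registry.language_data` functions and referenced from each language's `DEFAULT_CONFIG` string) is replaced by plain class attributes on a `Language.Defaults` subclass. As a rough, illustrative sketch of the target pattern — a hypothetical `xx` module, not a file from this commit:

from ...language import Language
from .stop_words import STOP_WORDS
from .lex_attrs import LEX_ATTRS


class ExampleDefaults(Language.Defaults):
    # Plain class attributes replace the registered
    # "spacy.xx.stop_words" / "spacy.xx.lex_attr_getters" functions
    # and the per-language [nlp] config entries.
    stop_words = STOP_WORDS
    lex_attr_getters = LEX_ATTRS


class Example(Language):
    lang = "xx"
    Defaults = ExampleDefaults


__all__ = ["Example"]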

View File

@ -1,24 +1,13 @@
[nlp]
lang = null
stop_words = []
lex_attr_getters = {}
vocab_data = {}
get_noun_chunks = null
pipeline = []
[nlp.tokenizer]
@tokenizers = "spacy.Tokenizer.v1"
token_match = null
url_match = {"@language_data": "spacy.xx.url_match"}
[nlp.lemmatizer]
@lemmatizers = "spacy.Lemmatizer.v1"
data = {}
[nlp.writing_system]
direction = "ltr"
has_case = true
has_letters = true
[components]

View File

@ -1,26 +1,14 @@
from typing import Set
from thinc.api import Config
from .stop_words import STOP_WORDS
from ...language import Language
from ...util import registry
DEFAULT_CONFIG = """
[nlp]
lang = "af"
stop_words = {"@language_data": "spacy.af.stop_words"}
"""
@registry.language_data("spacy.af.stop_words")
def stop_words() -> Set[str]:
return STOP_WORDS
class AfrikaansDefaults(Language.Defaults):
stop_words = STOP_WORDS
class Afrikaans(Language):
lang = "af"
default_config = Config().from_str(DEFAULT_CONFIG)
Defaults = AfrikaansDefaults
__all__ = ["Afrikaans"]

View File

@ -1,46 +1,21 @@
from typing import Set, Dict, Callable, Any
from thinc.api import Config
from .stop_words import STOP_WORDS
from .lex_attrs import LEX_ATTRS
from .punctuation import TOKENIZER_SUFFIXES
from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
from ...language import Language
from ...util import registry
DEFAULT_CONFIG = """
[nlp]
lang = "ar"
stop_words = {"@language_data": "spacy.ar.stop_words"}
lex_attr_getters = {"@language_data": "spacy.ar.lex_attr_getters"}
[nlp.writing_system]
direction = "rtl"
has_case = false
has_letters = true
"""
@registry.language_data("spacy.ar.stop_words")
def stop_words() -> Set[str]:
return STOP_WORDS
@registry.language_data("spacy.ar.lex_attr_getters")
def lex_attr_getters() -> Dict[int, Callable[[str], Any]]:
return LEX_ATTRS
class ArabicDefaults(Language.Defaults):
tokenizer_exceptions = TOKENIZER_EXCEPTIONS
suffixes = TOKENIZER_SUFFIXES
stop_words = STOP_WORDS
lex_attr_getters = LEX_ATTRS
writing_system = {"direction": "rtl", "has_case": False, "has_letters": True}
class Arabic(Language):
lang = "ar"
Defaults = ArabicDefaults
default_config = Config().from_str(DEFAULT_CONFIG)
lang = "ar"
__all__ = ["Arabic"]

View File

@ -1,26 +1,14 @@
from typing import Set
from thinc.api import Config
from .stop_words import STOP_WORDS
from ...language import Language
from ...util import registry
DEFAULT_CONFIG = """
[nlp]
lang = "bg"
stop_words = {"@language_data": "spacy.bg.stop_words"}
"""
@registry.language_data("spacy.bg.stop_words")
def stop_words() -> Set[str]:
return STOP_WORDS
class BulgarianDefaults(Language.Defaults):
stop_words = STOP_WORDS
class Bulgarian(Language):
lang = "bg"
default_config = Config().from_str(DEFAULT_CONFIG)
Defaults = BulgarianDefaults
__all__ = ["Bulgarian"]

View File

@ -1,31 +1,7 @@
from typing import Set
from thinc.api import Config
from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
from .punctuation import TOKENIZER_PREFIXES, TOKENIZER_SUFFIXES, TOKENIZER_INFIXES
from .stop_words import STOP_WORDS
from ...language import Language
from ...util import registry
DEFAULT_CONFIG = """
[nlp]
lang = "bn"
stop_words = {"@language_data": "spacy.bn.stop_words"}
[nlp.lemmatizer]
@lemmatizers = "spacy.Lemmatizer.v1"
[nlp.lemmatizer.data]
@language_data = "spacy-lookups-data"
lang = ${nlp:lang}
tables = ["lemma_rules"]
"""
@registry.language_data("spacy.bn.stop_words")
def stop_words() -> Set[str]:
return STOP_WORDS
class BengaliDefaults(Language.Defaults):
@ -33,12 +9,12 @@ class BengaliDefaults(Language.Defaults):
prefixes = TOKENIZER_PREFIXES
suffixes = TOKENIZER_SUFFIXES
infixes = TOKENIZER_INFIXES
stop_words = STOP_WORDS
class Bengali(Language):
lang = "bn"
Defaults = BengaliDefaults
default_config = Config().from_str(DEFAULT_CONFIG)
__all__ = ["Bengali"]

View File

@ -1,49 +1,20 @@
from typing import Set, Dict, Callable, Any
from thinc.api import Config
from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
from .punctuation import TOKENIZER_INFIXES
from .stop_words import STOP_WORDS
from .lex_attrs import LEX_ATTRS
from ...language import Language
from ...util import registry
from .punctuation import TOKENIZER_INFIXES
DEFAULT_CONFIG = """
[nlp]
lang = "ca"
stop_words = {"@language_data": "spacy.ca.stop_words"}
lex_attr_getters = {"@language_data": "spacy.ca.lex_attr_getters"}
[nlp.lemmatizer]
@lemmatizers = "spacy.Lemmatizer.v1"
[nlp.lemmatizer.data]
@language_data = "spacy-lookups-data"
lang = ${nlp:lang}
tables = ["lemma_lookup"]
"""
@registry.language_data("spacy.ca.stop_words")
def stop_words() -> Set[str]:
return STOP_WORDS
@registry.language_data("spacy.ca.lex_attr_getters")
def lex_attr_getters() -> Dict[int, Callable[[str], Any]]:
return LEX_ATTRS
class CatalanDefaults(Language.Defaults):
tokenizer_exceptions = TOKENIZER_EXCEPTIONS
infixes = TOKENIZER_INFIXES
stop_words = STOP_WORDS
lex_attr_getters = LEX_ATTRS
class Catalan(Language):
lang = "ca"
Defaults = CatalanDefaults
default_config = Config().from_str(DEFAULT_CONFIG)
__all__ = ["Catalan"]

View File

@ -1,26 +1,14 @@
from typing import Set
from thinc.api import Config
from .stop_words import STOP_WORDS
from ...language import Language
from ...util import registry
DEFAULT_CONFIG = """
[nlp]
lang = "cs"
stop_words = {"@language_data": "spacy.cs.stop_words"}
"""
@registry.language_data("spacy.cs.stop_words")
def stop_words() -> Set[str]:
return STOP_WORDS
class CzechDefaults(Language.Defaults):
stop_words = STOP_WORDS
class Czech(Language):
lang = "cs"
default_config = Config().from_str(DEFAULT_CONFIG)
Defaults = CzechDefaults
__all__ = ["Czech"]

View File

@ -1,55 +1,21 @@
from typing import Set, Dict, Callable, Any
from thinc.api import Config
from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
from .punctuation import TOKENIZER_INFIXES, TOKENIZER_SUFFIXES
from .stop_words import STOP_WORDS
from .lex_attrs import LEX_ATTRS
from ...language import Language
from ...util import registry
DEFAULT_CONFIG = """
[nlp]
lang = "da"
stop_words = {"@language_data": "spacy.da.stop_words"}
lex_attr_getters = {"@language_data": "spacy.da.lex_attr_getters"}
[nlp.lemmatizer]
@lemmatizers = "spacy.Lemmatizer.v1"
[nlp.lemmatizer.data]
@language_data = "spacy-lookups-data"
lang = ${nlp:lang}
tables = ["lemma_lookup"]
[nlp.vocab_data]
@language_data = "spacy-lookups-data"
lang = ${nlp:lang}
tables = ["lexeme_norm"]
"""
@registry.language_data("spacy.da.stop_words")
def stop_words() -> Set[str]:
return STOP_WORDS
@registry.language_data("spacy.da.lex_attr_getters")
def lex_attr_getters() -> Dict[int, Callable[[str], Any]]:
return LEX_ATTRS
class DanishDefaults(Language.Defaults):
tokenizer_exceptions = TOKENIZER_EXCEPTIONS
infixes = TOKENIZER_INFIXES
suffixes = TOKENIZER_SUFFIXES
lex_attr_getters = LEX_ATTRS
stop_words = STOP_WORDS
class Danish(Language):
lang = "da"
Defaults = DanishDefaults
default_config = Config().from_str(DEFAULT_CONFIG)
__all__ = ["Danish"]

View File

@ -1,44 +1,8 @@
from typing import Set, Callable
from thinc.api import Config
from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
from .punctuation import TOKENIZER_PREFIXES, TOKENIZER_SUFFIXES
from .punctuation import TOKENIZER_INFIXES
from .punctuation import TOKENIZER_PREFIXES, TOKENIZER_SUFFIXES, TOKENIZER_INFIXES
from .stop_words import STOP_WORDS
from .syntax_iterators import noun_chunks
from .syntax_iterators import SYNTAX_ITERATORS
from ...language import Language
from ...util import registry
DEFAULT_CONFIG = """
[nlp]
lang = "de"
stop_words = {"@language_data": "spacy.de.stop_words"}
get_noun_chunks = {"@language_data": "spacy.de.get_noun_chunks"}
[nlp.lemmatizer]
@lemmatizers = "spacy.Lemmatizer.v1"
[nlp.lemmatizer.data]
@language_data = "spacy-lookups-data"
lang = ${nlp:lang}
tables = ["lemma_lookup"]
[nlp.vocab_data]
@language_data = "spacy-lookups-data"
lang = ${nlp:lang}
tables = ["lexeme_norm", "lexeme_cluster", "lexeme_prob", "lexeme_settings", "orth_variants"]
"""
@registry.language_data("spacy.de.stop_words")
def stop_words() -> Set[str]:
return STOP_WORDS
@registry.language_data("spacy.de.get_noun_chunks")
def get_noun_chunks() -> Callable:
return noun_chunks
class GermanDefaults(Language.Defaults):
@ -46,12 +10,13 @@ class GermanDefaults(Language.Defaults):
prefixes = TOKENIZER_PREFIXES
suffixes = TOKENIZER_SUFFIXES
infixes = TOKENIZER_INFIXES
syntax_iterators = SYNTAX_ITERATORS
stop_words = STOP_WORDS
class German(Language):
lang = "de"
Defaults = GermanDefaults
default_config = Config().from_str(DEFAULT_CONFIG)
__all__ = ["German"]

View File

@ -1,9 +0,0 @@
from typing import Pattern
from .tokenizer_exceptions import URL_MATCH
from ..util import registry
@registry.language_data("spacy.xx.url_match")
def url_match() -> Pattern:
return URL_MATCH

View File

@ -1,69 +1,50 @@
from typing import Set, Dict, Callable, Any
from typing import Callable
from thinc.api import Config
from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
from .stop_words import STOP_WORDS
from .lex_attrs import LEX_ATTRS
from .lemmatizer import GreekLemmatizer
from .syntax_iterators import noun_chunks
from .syntax_iterators import SYNTAX_ITERATORS
from .punctuation import TOKENIZER_PREFIXES, TOKENIZER_SUFFIXES, TOKENIZER_INFIXES
from ...lookups import load_lookups
from ...language import Language
from ...util import registry
DEFAULT_CONFIG = """
[nlp]
lang = "el"
stop_words = {"@language_data": "spacy.el.stop_words"}
lex_attr_getters = {"@language_data": "spacy.el.lex_attr_getters"}
get_noun_chunks = {"@language_data": "spacy.el.get_noun_chunks"}
[nlp.lemmatizer]
@lemmatizers = "spacy.GreekLemmatizer.v1"
[nlp.lemmatizer.data]
@language_data = "spacy-lookups-data"
lang = ${nlp:lang}
tables = ["lemma_index", "lemma_exc", "lemma_rules"]
[nlp.vocab_data]
@language_data = "spacy-lookups-data"
lang = ${nlp:lang}
tables = ["lexeme_norm", "lexeme_prob", "lexeme_settings"]
@lemmatizers = "spacy.el.GreekLemmatizer"
"""
@registry.lemmatizers("spacy.GreekLemmatizer.v1")
def create_greek_lemmatizer(data: Dict[str, dict] = {}) -> GreekLemmatizer:
return GreekLemmatizer(data=data)
@registry.lemmatizers("spacy.el.GreekLemmatizer")
def create_lemmatizer() -> Callable[[Language], GreekLemmatizer]:
tables = ["lemma_index", "lemma_exc", "lemma_rules"]
def lemmatizer_factory(nlp: Language) -> GreekLemmatizer:
lookups = load_lookups(lang=nlp.lang, tables=tables)
return GreekLemmatizer(lookups=lookups)
@registry.language_data("spacy.el.get_noun_chunks")
def get_noun_chunks() -> Callable:
return noun_chunks
@registry.language_data("spacy.el.stop_words")
def stop_words() -> Set[str]:
return STOP_WORDS
@registry.language_data("spacy.el.lex_attr_getters")
def lex_attr_getters() -> Dict[int, Callable[[str], Any]]:
return LEX_ATTRS
return lemmatizer_factory
class GreekDefaults(Language.Defaults):
config = Config().from_str(DEFAULT_CONFIG)
tokenizer_exceptions = TOKENIZER_EXCEPTIONS
prefixes = TOKENIZER_PREFIXES
suffixes = TOKENIZER_SUFFIXES
infixes = TOKENIZER_INFIXES
lex_attr_getters = LEX_ATTRS
stop_words = STOP_WORDS
syntax_iterators = SYNTAX_ITERATORS
class Greek(Language):
lang = "el"
Defaults = GreekDefaults
default_config = Config().from_str(DEFAULT_CONFIG)
__all__ = ["Greek"]

View File

@ -1,68 +1,49 @@
from typing import Set, Dict, Callable, Any
from typing import Callable
from thinc.api import Config
from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
from .stop_words import STOP_WORDS
from .lex_attrs import LEX_ATTRS
from .syntax_iterators import noun_chunks
from .syntax_iterators import SYNTAX_ITERATORS
from .lemmatizer import is_base_form
from .punctuation import TOKENIZER_INFIXES
from ...language import Language
from ...lemmatizer import Lemmatizer
from ...lookups import load_lookups
from ...util import registry
DEFAULT_CONFIG = """
[nlp]
lang = "en"
stop_words = {"@language_data": "spacy.en.stop_words"}
lex_attr_getters = {"@language_data": "spacy.en.lex_attr_getters"}
get_noun_chunks = {"@language_data": "spacy.en.get_noun_chunks"}
[nlp.lemmatizer]
@lemmatizers = "spacy.EnglishLemmatizer.v1"
[nlp.lemmatizer.data]
@language_data = "spacy-lookups-data"
lang = ${nlp:lang}
tables = ["lemma_lookup", "lemma_rules", "lemma_exc", "lemma_index"]
[nlp.vocab_data]
@language_data = "spacy-lookups-data"
lang = ${nlp:lang}
tables = ["lexeme_norm", "lexeme_cluster", "lexeme_prob", "lexeme_settings", "orth_variants"]
@lemmatizers = "spacy.en.EnglishLemmatizer"
"""
@registry.language_data("spacy.en.stop_words")
def stop_words() -> Set[str]:
return STOP_WORDS
@registry.lemmatizers("spacy.en.EnglishLemmatizer")
def create_lemmatizer() -> Callable[[Language], Lemmatizer]:
tables = ["lemma_lookup", "lemma_rules", "lemma_exc", "lemma_index"]
def lemmatizer_factory(nlp: Language) -> Lemmatizer:
lookups = load_lookups(lang=nlp.lang, tables=tables)
return Lemmatizer(lookups=lookups, is_base_form=is_base_form)
@registry.language_data("spacy.en.lex_attr_getters")
def lex_attr_getters() -> Dict[int, Callable[[str], Any]]:
return LEX_ATTRS
@registry.lemmatizers("spacy.EnglishLemmatizer.v1")
def create_lemmatizer(data: Dict[str, dict] = {}) -> "Lemmatizer":
return Lemmatizer(data=data, is_base_form=is_base_form)
@registry.language_data("spacy.en.get_noun_chunks")
def get_noun_chunks() -> Callable:
return noun_chunks
return lemmatizer_factory
class EnglishDefaults(Language.Defaults):
config = Config().from_str(DEFAULT_CONFIG)
tokenizer_exceptions = TOKENIZER_EXCEPTIONS
infixes = TOKENIZER_INFIXES
lex_attr_getters = LEX_ATTRS
syntax_iterators = SYNTAX_ITERATORS
stop_words = STOP_WORDS
class English(Language):
lang = "en"
Defaults = EnglishDefaults
default_config = Config().from_str(DEFAULT_CONFIG)
__all__ = ["English"]

View File

@ -1,62 +1,23 @@
from typing import Set, Dict, Callable, Any
from thinc.config import Config
from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
from .stop_words import STOP_WORDS
from .lex_attrs import LEX_ATTRS
from .syntax_iterators import noun_chunks
from .syntax_iterators import SYNTAX_ITERATORS
from .punctuation import TOKENIZER_INFIXES, TOKENIZER_SUFFIXES
from ...language import Language
from ...util import registry
DEFAULT_CONFIG = """
[nlp]
lang = "es"
stop_words = {"@language_data": "spacy.es.stop_words"}
lex_attr_getters = {"@language_data": "spacy.es.lex_attr_getters"}
get_noun_chunks = {"@language_data": "spacy.es.get_noun_chunks"}
[nlp.lemmatizer]
@lemmatizers = "spacy.Lemmatizer.v1"
[nlp.lemmatizer.data]
@language_data = "spacy-lookups-data"
lang = ${nlp:lang}
tables = ["lemma_lookup"]
[nlp.vocab_data]
@language_data = "spacy-lookups-data"
lang = ${nlp:lang}
tables = ["lexeme_cluster", "lexeme_prob", "lexeme_settings"]
"""
@registry.language_data("spacy.es.get_noun_chunks")
def get_noun_chunks() -> Callable:
return noun_chunks
@registry.language_data("spacy.es.stop_words")
def stop_words() -> Set[str]:
return STOP_WORDS
@registry.language_data("spacy.es.lex_attr_getters")
def lex_attr_getters() -> Dict[int, Callable[[str], Any]]:
return LEX_ATTRS
class SpanishDefaults(Language.Defaults):
tokenizer_exceptions = TOKENIZER_EXCEPTIONS
infixes = TOKENIZER_INFIXES
suffixes = TOKENIZER_SUFFIXES
lex_attr_getters = LEX_ATTRS
syntax_iterators = SYNTAX_ITERATORS
stop_words = STOP_WORDS
class Spanish(Language):
lang = "es"
Defaults = SpanishDefaults
default_config = Config().from_str(DEFAULT_CONFIG)
__all__ = ["Spanish"]

View File

@ -1,26 +1,14 @@
from typing import Set
from thinc.api import Config
from .stop_words import STOP_WORDS
from ...language import Language
from ...util import registry
DEFAULT_CONFIG = """
[nlp]
lang = "et"
stop_words = {"@language_data": "spacy.et.stop_words"}
"""
@registry.language_data("spacy.et.stop_words")
def stop_words() -> Set[str]:
return STOP_WORDS
class EstonianDefaults(Language.Defaults):
stop_words = STOP_WORDS
class Estonian(Language):
lang = "et"
default_config = Config().from_str(DEFAULT_CONFIG)
Defaults = EstonianDefaults
__all__ = ["Estonian"]

View File

@ -1,41 +1,18 @@
from typing import Set, Dict, Callable, Any
from thinc.api import Config
from .stop_words import STOP_WORDS
from .lex_attrs import LEX_ATTRS
from .punctuation import TOKENIZER_SUFFIXES
from ..tokenizer_exceptions import BASE_EXCEPTIONS
from ...language import Language
from ...util import registry
DEFAULT_CONFIG = """
[nlp]
lang = "eu"
stop_words = {"@language_data": "spacy.eu.stop_words"}
lex_attr_getters = {"@language_data": "spacy.eu.lex_attr_getters"}
"""
@registry.language_data("spacy.eu.stop_words")
def stop_words() -> Set[str]:
return STOP_WORDS
@registry.language_data("spacy.eu.lex_attr_getters")
def lex_attr_getters() -> Dict[int, Callable[[str], Any]]:
return LEX_ATTRS
class BasqueDefaults(Language.Defaults):
tokenizer_exceptions = BASE_EXCEPTIONS
suffixes = TOKENIZER_SUFFIXES
stop_words = STOP_WORDS
lex_attr_getters = LEX_ATTRS
class Basque(Language):
lang = "eu"
Defaults = BasqueDefaults
default_config = Config().from_str(DEFAULT_CONFIG)
__all__ = ["Basque"]

View File

@ -1,61 +1,23 @@
from typing import Set, Dict, Callable, Any
from thinc.api import Config
from ...language import Language
from ...util import registry
from .stop_words import STOP_WORDS
from .lex_attrs import LEX_ATTRS
from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
from .punctuation import TOKENIZER_SUFFIXES
from .syntax_iterators import noun_chunks
DEFAULT_CONFIG = """
[nlp]
lang = "fa"
stop_words = {"@language_data": "spacy.fa.stop_words"}
lex_attr_getters = {"@language_data": "spacy.fa.lex_attr_getters"}
get_noun_chunks = {"@language_data": "spacy.de.get_noun_chunks"}
[nlp.writing_system]
direction = "rtl"
has_case = false
has_letters = true
[nlp.lemmatizer]
@lemmatizers = "spacy.Lemmatizer.v1"
[nlp.lemmatizer.data]
@language_data = "spacy-lookups-data"
lang = ${nlp:lang}
tables = ["lemma_rules", "lemma_index", "lemma_exc"]
"""
@registry.language_data("spacy.fa.stop_words")
def stop_words() -> Set[str]:
return STOP_WORDS
@registry.language_data("spacy.fa.lex_attr_getters")
def lex_attr_getters() -> Dict[int, Callable[[str], Any]]:
return LEX_ATTRS
@registry.language_data("spacy.fa.get_noun_chunks")
def get_noun_chunks() -> Callable:
return noun_chunks
from .syntax_iterators import SYNTAX_ITERATORS
from ...language import Language
class PersianDefaults(Language.Defaults):
tokenizer_exceptions = TOKENIZER_EXCEPTIONS
suffixes = TOKENIZER_SUFFIXES
lex_attr_getters = LEX_ATTRS
syntax_iterators = SYNTAX_ITERATORS
stop_words = STOP_WORDS
writing_system = {"direction": "rtl", "has_case": False, "has_letters": True}
class Persian(Language):
lang = "fa"
Defaults = PersianDefaults
default_config = Config().from_str(DEFAULT_CONFIG)
__all__ = ["Persian"]

View File

@ -1,42 +1,21 @@
from typing import Set, Dict, Callable, Any
from thinc.api import Config
from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
from .stop_words import STOP_WORDS
from .lex_attrs import LEX_ATTRS
from .punctuation import TOKENIZER_INFIXES, TOKENIZER_SUFFIXES
from ...language import Language
from ...util import registry
DEFAULT_CONFIG = """
[nlp]
lang = "fi"
stop_words = {"@language_data": "spacy.fi.stop_words"}
lex_attr_getters = {"@language_data": "spacy.fi.lex_attr_getters"}
"""
@registry.language_data("spacy.fi.stop_words")
def stop_words() -> Set[str]:
return STOP_WORDS
@registry.language_data("spacy.fi.lex_attr_getters")
def lex_attr_getters() -> Dict[int, Callable[[str], Any]]:
return LEX_ATTRS
class FinnishDefaults(Language.Defaults):
infixes = TOKENIZER_INFIXES
suffixes = TOKENIZER_SUFFIXES
tokenizer_exceptions = TOKENIZER_EXCEPTIONS
lex_attr_getters = LEX_ATTRS
stop_words = STOP_WORDS
class Finnish(Language):
lang = "fi"
Defaults = FinnishDefaults
default_config = Config().from_str(DEFAULT_CONFIG)
__all__ = ["Finnish"]

View File

@ -1,4 +1,4 @@
from typing import Set, Dict, Callable, Any, Pattern
from typing import Callable
from thinc.api import Config
from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS, TOKEN_MATCH
@ -6,69 +6,47 @@ from .punctuation import TOKENIZER_PREFIXES, TOKENIZER_INFIXES
from .punctuation import TOKENIZER_SUFFIXES
from .stop_words import STOP_WORDS
from .lex_attrs import LEX_ATTRS
from .syntax_iterators import SYNTAX_ITERATORS
from .lemmatizer import FrenchLemmatizer, is_base_form
from .syntax_iterators import noun_chunks
from ...lookups import load_lookups
from ...language import Language
from ...util import registry
DEFAULT_CONFIG = """
[nlp]
lang = "fr"
stop_words = {"@language_data": "spacy.fr.stop_words"}
lex_attr_getters = {"@language_data": "spacy.fr.lex_attr_getters"}
get_noun_chunks = {"@language_data": "spacy.fr.get_noun_chunks"}
[nlp.tokenizer]
@tokenizers = "spacy.Tokenizer.v1"
token_match = {"@language_data": "spacy.fr.token_match"}
[nlp.lemmatizer]
@lemmatizers = "spacy.FrenchLemmatizer.v1"
[nlp.lemmatizer.data]
@language_data = "spacy-lookups-data"
lang = ${nlp:lang}
tables = ["lemma_rules", "lemma_index", "lemma_exc", "lemma_lookup"]
@lemmatizers = "spacy.fr.FrenchLemmatizer"
"""
@registry.lemmatizers("spacy.FrenchLemmatizer.v1")
def create_french_lemmatizer(data: Dict[str, dict] = {}) -> FrenchLemmatizer:
return FrenchLemmatizer(data=data, is_base_form=is_base_form)
@registry.lemmatizers("spacy.fr.FrenchLemmatizer")
def create_lemmatizer() -> Callable[[Language], FrenchLemmatizer]:
tables = ["lemma_rules", "lemma_index", "lemma_exc", "lemma_lookup"]
def lemmatizer_factory(nlp: Language) -> FrenchLemmatizer:
lookups = load_lookups(lang=nlp.lang, tables=tables)
return FrenchLemmatizer(lookups=lookups, is_base_form=is_base_form)
@registry.language_data("spacy.fr.token_match")
def token_match() -> Pattern:
return TOKEN_MATCH
@registry.language_data("spacy.fr.stop_words")
def stop_words() -> Set[str]:
return STOP_WORDS
@registry.language_data("spacy.fr.lex_attr_getters")
def lex_attr_getters() -> Dict[int, Callable[[str], Any]]:
return LEX_ATTRS
@registry.language_data("spacy.fr.get_noun_chunks")
def get_noun_chunks() -> Callable:
return noun_chunks
return lemmatizer_factory
class FrenchDefaults(Language.Defaults):
config = Config().from_str(DEFAULT_CONFIG)
tokenizer_exceptions = TOKENIZER_EXCEPTIONS
prefixes = TOKENIZER_PREFIXES
infixes = TOKENIZER_INFIXES
suffixes = TOKENIZER_SUFFIXES
token_match = TOKEN_MATCH
lex_attr_getters = LEX_ATTRS
syntax_iterators = SYNTAX_ITERATORS
stop_words = STOP_WORDS
class French(Language):
lang = "fr"
Defaults = FrenchDefaults
default_config = Config().from_str(DEFAULT_CONFIG)
__all__ = ["French"]

View File

@ -1,32 +1,16 @@
from typing import Set
from thinc.api import Config
from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
from .stop_words import STOP_WORDS
from ...language import Language
from ...util import registry
DEFAULT_CONFIG = """
[nlp]
lang = "ga"
stop_words = {"@language_data": "spacy.ga.stop_words"}
"""
@registry.language_data("spacy.ga.stop_words")
def stop_words() -> Set[str]:
return STOP_WORDS
class IrishDefaults(Language.Defaults):
tokenizer_exceptions = TOKENIZER_EXCEPTIONS
stop_words = STOP_WORDS
class Irish(Language):
lang = "ga"
Defaults = IrishDefaults
default_config = Config().from_str(DEFAULT_CONFIG)
__all__ = ["Irish"]

View File

@ -1,26 +1,14 @@
from typing import Set
from thinc.api import Config
from .stop_words import STOP_WORDS
from ...language import Language
from ...util import registry
DEFAULT_CONFIG = """
[nlp]
lang = "gu"
stop_words = {"@language_data": "spacy.gu.stop_words"}
"""
@registry.language_data("spacy.gu.stop_words")
def stop_words() -> Set[str]:
return STOP_WORDS
class GujaratiDefaults(Language.Defaults):
stop_words = STOP_WORDS
class Gujarati(Language):
lang = "gu"
default_config = Config().from_str(DEFAULT_CONFIG)
Defaults = GujaratiDefaults
__all__ = ["Gujarati"]

View File

@ -1,37 +1,15 @@
from typing import Set
from thinc.api import Config
from .stop_words import STOP_WORDS
from ..tokenizer_exceptions import BASE_EXCEPTIONS
from ...language import Language
from ...util import registry
DEFAULT_CONFIG = """
[nlp]
lang = "he"
stop_words = {"@language_data": "spacy.he.stop_words"}
[nlp.writing_system]
direction = "rtl"
has_case = false
has_letters = true
"""
@registry.language_data("spacy.he.stop_words")
def stop_words() -> Set[str]:
return STOP_WORDS
class HebrewDefaults(Language.Defaults):
tokenizer_exceptions = BASE_EXCEPTIONS
stop_words = STOP_WORDS
writing_system = {"direction": "rtl", "has_case": False, "has_letters": True}
class Hebrew(Language):
lang = "he"
Defaults = HebrewDefaults
default_config = Config().from_str(DEFAULT_CONFIG)
__all__ = ["Hebrew"]

View File

@ -1,33 +1,16 @@
from typing import Set, Dict, Callable, Any
from thinc.api import Config
from .stop_words import STOP_WORDS
from .lex_attrs import LEX_ATTRS
from ...language import Language
from ...util import registry
DEFAULT_CONFIG = """
[nlp]
lang = "hi"
stop_words = {"@language_data": "spacy.hi.stop_words"}
lex_attr_getters = {"@language_data": "spacy.hi.lex_attr_getters"}
"""
@registry.language_data("spacy.hi.stop_words")
def stop_words() -> Set[str]:
return STOP_WORDS
@registry.language_data("spacy.hi.lex_attr_getters")
def lex_attr_getters() -> Dict[int, Callable[[str], Any]]:
return LEX_ATTRS
class HindiDefaults(Language.Defaults):
stop_words = STOP_WORDS
lex_attr_getters = LEX_ATTRS
class Hindi(Language):
lang = "hi"
default_config = Config().from_str(DEFAULT_CONFIG)
Defaults = HindiDefaults
__all__ = ["Hindi"]

View File

@ -1,40 +1,14 @@
from typing import Set
from thinc.api import Config
from .stop_words import STOP_WORDS
from ..tokenizer_exceptions import BASE_EXCEPTIONS
from ...language import Language
from ...util import registry
DEFAULT_CONFIG = """
[nlp]
lang = "hr"
stop_words = {"@language_data": "spacy.hr.stop_words"}
[nlp.lemmatizer]
@lemmatizers = "spacy.Lemmatizer.v1"
[nlp.lemmatizer.data]
@language_data = "spacy-lookups-data"
lang = ${nlp:lang}
tables = ["lemma_lookup"]
"""
@registry.language_data("spacy.hr.stop_words")
def stop_words() -> Set[str]:
return STOP_WORDS
class CroatianDefaults(Language.Defaults):
tokenizer_exceptions = BASE_EXCEPTIONS
stop_words = STOP_WORDS
class Croatian(Language):
lang = "hr"
Defaults = CroatianDefaults
default_config = Config().from_str(DEFAULT_CONFIG)
__all__ = ["Croatian"]

View File

@ -1,40 +1,7 @@
from typing import Set, Pattern
from thinc.api import Config
from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS, TOKEN_MATCH
from .punctuation import TOKENIZER_PREFIXES, TOKENIZER_SUFFIXES, TOKENIZER_INFIXES
from .stop_words import STOP_WORDS
from ...language import Language
from ...util import registry
DEFAULT_CONFIG = """
[nlp]
lang = "hu"
stop_words = {"@language_data": "spacy.hu.stop_words"}
[nlp.tokenizer]
@tokenizers = "spacy.Tokenizer.v1"
token_match = {"@language_data": "spacy.hu.token_match"}
[nlp.lemmatizer]
@lemmatizers = "spacy.Lemmatizer.v1"
[nlp.lemmatizer.data]
@language_data = "spacy-lookups-data"
lang = ${nlp:lang}
tables = ["lemma_lookup"]
"""
@registry.language_data("spacy.hu.stop_words")
def stop_words() -> Set[str]:
return STOP_WORDS
@registry.language_data("spacy.hu.token_match")
def token_match() -> Pattern:
return TOKEN_MATCH
class HungarianDefaults(Language.Defaults):
@ -42,12 +9,13 @@ class HungarianDefaults(Language.Defaults):
prefixes = TOKENIZER_PREFIXES
suffixes = TOKENIZER_SUFFIXES
infixes = TOKENIZER_INFIXES
token_match = TOKEN_MATCH
stop_words = STOP_WORDS
class Hungarian(Language):
lang = "hu"
Defaults = HungarianDefaults
default_config = Config().from_str(DEFAULT_CONFIG)
__all__ = ["Hungarian"]

View File

@ -1,33 +1,16 @@
from typing import Set, Dict, Callable, Any
from thinc.api import Config
from .stop_words import STOP_WORDS
from .lex_attrs import LEX_ATTRS
from ...language import Language
from ...util import registry
DEFAULT_CONFIG = """
[nlp]
lang = "hy"
stop_words = {"@language_data": "spacy.hy.stop_words"}
lex_attr_getters = {"@language_data": "spacy.hy.lex_attr_getters"}
"""
@registry.language_data("spacy.hy.stop_words")
def stop_words() -> Set[str]:
return STOP_WORDS
@registry.language_data("spacy.hy.lex_attr_getters")
def lex_attr_getters() -> Dict[int, Callable[[str], Any]]:
return LEX_ATTRS
class ArmenianDefaults(Language.Defaults):
lex_attr_getters = LEX_ATTRS
stop_words = STOP_WORDS
class Armenian(Language):
lang = "hy"
default_config = Config().from_str(DEFAULT_CONFIG)
Defaults = ArmenianDefaults
__all__ = ["Armenian"]

View File

@ -1,50 +1,9 @@
from typing import Set, Dict, Callable, Any
from thinc.config import Config
from .stop_words import STOP_WORDS
from .punctuation import TOKENIZER_SUFFIXES, TOKENIZER_PREFIXES, TOKENIZER_INFIXES
from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
from .lex_attrs import LEX_ATTRS
from .syntax_iterators import noun_chunks
from .syntax_iterators import SYNTAX_ITERATORS
from ...language import Language
from ...util import registry
DEFAULT_CONFIG = """
[nlp]
lang = "id"
stop_words = {"@language_data": "spacy.id.stop_words"}
lex_attr_getters = {"@language_data": "spacy.id.lex_attr_getters"}
get_noun_chunks = {"@language_data": "spacy.id.get_noun_chunks"}
[nlp.lemmatizer]
@lemmatizers = "spacy.Lemmatizer.v1"
[nlp.lemmatizer.data]
@language_data = "spacy-lookups-data"
lang = ${nlp:lang}
tables = ["lemma_lookup"]
[nlp.vocab_data]
@language_data = "spacy-lookups-data"
lang = ${nlp:lang}
tables = ["lexeme_norm"]
"""
@registry.language_data("spacy.id.stop_words")
def stop_words() -> Set[str]:
return STOP_WORDS
@registry.language_data("spacy.id.lex_attr_getters")
def lex_attr_getters() -> Dict[int, Callable[[str], Any]]:
return LEX_ATTRS
@registry.language_data("spacy.id.get_noun_chunks")
def get_noun_chunks() -> Callable:
return noun_chunks
class IndonesianDefaults(Language.Defaults):
@ -52,12 +11,14 @@ class IndonesianDefaults(Language.Defaults):
prefixes = TOKENIZER_PREFIXES
suffixes = TOKENIZER_SUFFIXES
infixes = TOKENIZER_INFIXES
syntax_iterators = SYNTAX_ITERATORS
lex_attr_getters = LEX_ATTRS
stop_words = STOP_WORDS
class Indonesian(Language):
lang = "id"
Defaults = IndonesianDefaults
default_config = Config().from_str(DEFAULT_CONFIG)
__all__ = ["Indonesian"]

View File

@ -1,26 +1,14 @@
from typing import Set
from thinc.api import Config
from .stop_words import STOP_WORDS
from ...language import Language
from ...util import registry
DEFAULT_CONFIG = """
[nlp]
lang = "is"
stop_words = {"@language_data": "spacy.is.stop_words"}
"""
@registry.language_data("spacy.is.stop_words")
def stop_words() -> Set[str]:
return STOP_WORDS
class IcelandicDefaults(Language.Defaults):
stop_words = STOP_WORDS
class Icelandic(Language):
lang = "is"
default_config = Config().from_str(DEFAULT_CONFIG)
Defaults = IcelandicDefaults
__all__ = ["Icelandic"]

View File

@ -1,31 +1,7 @@
from typing import Set
from thinc.api import Config
from .stop_words import STOP_WORDS
from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
from .punctuation import TOKENIZER_PREFIXES, TOKENIZER_INFIXES
from ...language import Language
from ...util import registry
DEFAULT_CONFIG = """
[nlp]
lang = "it"
stop_words = {"@language_data": "spacy.it.stop_words"}
[nlp.lemmatizer]
@lemmatizers = "spacy.Lemmatizer.v1"
[nlp.lemmatizer.data]
@language_data = "spacy-lookups-data"
lang = ${nlp:lang}
tables = ["lemma_lookup"]
"""
@registry.language_data("spacy.it.stop_words")
def stop_words() -> Set[str]:
return STOP_WORDS
class ItalianDefaults(Language.Defaults):
@ -38,7 +14,6 @@ class ItalianDefaults(Language.Defaults):
class Italian(Language):
lang = "it"
Defaults = ItalianDefaults
default_config = Config().from_str(DEFAULT_CONFIG)
__all__ = ["Italian"]

View File

@ -1,11 +1,11 @@
from typing import Optional, Union, Dict, Any, Set, Callable
from typing import Optional, Union, Dict, Any
from pathlib import Path
import srsly
from collections import namedtuple
from thinc.api import Config
from .stop_words import STOP_WORDS
from .syntax_iterators import noun_chunks
from .syntax_iterators import SYNTAX_ITERATORS
from .tag_map import TAG_MAP
from .tag_orth_map import TAG_ORTH_MAP
from .tag_bigram_map import TAG_BIGRAM_MAP
@ -20,33 +20,15 @@ from ... import util
DEFAULT_CONFIG = """
[nlp]
lang = "ja"
stop_words = {"@language_data": "spacy.ja.stop_words"}
get_noun_chunks = {"@language_data": "spacy.ja.get_noun_chunks"}
[nlp.tokenizer]
@tokenizers = "spacy.JapaneseTokenizer.v1"
@tokenizers = "spacy.ja.JapaneseTokenizer"
split_mode = null
[nlp.writing_system]
direction = "ltr"
has_case = false
has_letters = false
"""
@registry.language_data("spacy.ja.stop_words")
def stop_words() -> Set[str]:
return STOP_WORDS
@registry.language_data("spacy.ja.get_noun_chunks")
def get_noun_chunks() -> Callable:
return noun_chunks
@registry.tokenizers("spacy.JapaneseTokenizer.v1")
def create_japanese_tokenizer(split_mode: Optional[str] = None):
@registry.tokenizers("spacy.ja.JapaneseTokenizer")
def create_tokenizer(split_mode: Optional[str] = None):
def japanese_tokenizer_factory(nlp):
return JapaneseTokenizer(nlp, split_mode=split_mode)
@ -179,9 +161,16 @@ class JapaneseTokenizer(DummyTokenizer):
return self
class JapaneseDefaults(Language.Defaults):
config = Config().from_str(DEFAULT_CONFIG)
stop_words = STOP_WORDS
syntax_iterators = SYNTAX_ITERATORS
writing_system = {"direction": "ltr", "has_case": False, "has_letters": False}
class Japanese(Language):
lang = "ja"
default_config = Config().from_str(DEFAULT_CONFIG)
Defaults = JapaneseDefaults
# Hold the attributes we need with convenient names
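
Custom tokenizers (Japanese above, Korean and Thai below) keep the factory-in-registry pattern but drop the versioned names and the per-language stop-word and writing-system config entries, which now live on `Defaults`. A minimal sketch of the renamed tokenizer entry point, with a hypothetical tokenizer class standing in for JapaneseTokenizer etc.:

from typing import Optional
from ...language import Language
from ...util import DummyTokenizer, registry


class ExampleTokenizer(DummyTokenizer):  # hypothetical stub, no real tokenization
    def __init__(self, nlp: Language, split_mode: Optional[str] = None):
        self.vocab = nlp.vocab
        self.split_mode = split_mode


@registry.tokenizers("spacy.xx.ExampleTokenizer")  # hypothetical registry name
def create_tokenizer(split_mode: Optional[str] = None):
    # The config entry ([nlp.tokenizer] @tokenizers = "spacy.xx.ExampleTokenizer")
    # carries only the registry string and settings; the factory receives the
    # nlp object when the tokenizer is actually built.
    def tokenizer_factory(nlp: Language) -> ExampleTokenizer:
        return ExampleTokenizer(nlp, split_mode=split_mode)

    return tokenizer_factory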

View File

@ -1,26 +1,14 @@
from typing import Set
from thinc.api import Config
from .stop_words import STOP_WORDS
from ...language import Language
from ...util import registry
DEFAULT_CONFIG = """
[nlp]
lang = "kn"
stop_words = {"@language_data": "spacy.kn.stop_words"}
"""
@registry.language_data("spacy.kn.stop_words")
def stop_words() -> Set[str]:
return STOP_WORDS
class KannadaDefaults(Language.Defaults):
stop_words = STOP_WORDS
class Kannada(Language):
lang = "kn"
default_config = Config().from_str(DEFAULT_CONFIG)
Defaults = KannadaDefaults
__all__ = ["Kannada"]

View File

@ -1,4 +1,4 @@
from typing import Set, Optional, Any, Dict
from typing import Optional, Any, Dict
from thinc.api import Config
from .stop_words import STOP_WORDS
@ -11,26 +11,14 @@ from ...util import DummyTokenizer, registry
DEFAULT_CONFIG = """
[nlp]
lang = "ko"
stop_words = {"@language_data": "spacy.ko.stop_words"}
[nlp.tokenizer]
@tokenizers = "spacy.KoreanTokenizer.v1"
[nlp.writing_system]
direction = "ltr"
has_case = false
has_letters = false
@tokenizers = "spacy.ko.KoreanTokenizer"
"""
@registry.language_data("spacy.ko.stop_words")
def stop_words() -> Set[str]:
return STOP_WORDS
@registry.tokenizers("spacy.KoreanTokenizer.v1")
def create_korean_tokenizer():
@registry.tokenizers("spacy.ko.KoreanTokenizer")
def create_tokenizer():
def korean_tokenizer_factory(nlp):
return KoreanTokenizer(nlp)
@ -74,9 +62,15 @@ class KoreanTokenizer(DummyTokenizer):
yield {"surface": surface, "lemma": lemma, "tag": tag}
class KoreanDefaults(Language.Defaults):
config = Config().from_str(DEFAULT_CONFIG)
stop_words = STOP_WORDS
writing_system = {"direction": "ltr", "has_case": False, "has_letters": False}
class Korean(Language):
lang = "ko"
default_config = Config().from_str(DEFAULT_CONFIG)
Defaults = KoreanDefaults
def try_mecab_import() -> None:

View File

@ -1,54 +1,20 @@
from typing import Set, Dict, Callable, Any
from thinc.api import Config
from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
from .punctuation import TOKENIZER_INFIXES
from .lex_attrs import LEX_ATTRS
from .stop_words import STOP_WORDS
from ...language import Language
from ...util import registry
DEFAULT_CONFIG = """
[nlp]
lang = "lb"
stop_words = {"@language_data": "spacy.lb.stop_words"}
lex_attr_getters = {"@language_data": "spacy.lb.lex_attr_getters"}
[nlp.lemmatizer]
@lemmatizers = "spacy.Lemmatizer.v1"
[nlp.lemmatizer.data]
@language_data = "spacy-lookups-data"
lang = ${nlp:lang}
tables = ["lemma_lookup"]
[nlp.vocab_data]
@language_data = "spacy-lookups-data"
lang = ${nlp:lang}
tables = ["lexeme_norm"]
"""
@registry.language_data("spacy.lb.stop_words")
def stop_words() -> Set[str]:
return STOP_WORDS
@registry.language_data("spacy.lb.lex_attr_getters")
def lex_attr_getters() -> Dict[int, Callable[[str], Any]]:
return LEX_ATTRS
class LuxembourgishDefaults(Language.Defaults):
tokenizer_exceptions = TOKENIZER_EXCEPTIONS
infixes = TOKENIZER_INFIXES
lex_attr_getters = LEX_ATTRS
stop_words = STOP_WORDS
class Luxembourgish(Language):
lang = "lb"
Defaults = LuxembourgishDefaults
default_config = Config().from_str(DEFAULT_CONFIG)
__all__ = ["Luxembourgish"]

View File

@ -1,34 +1,18 @@
from typing import Set
from thinc.api import Config
from .stop_words import STOP_WORDS
from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
from .punctuation import TOKENIZER_INFIXES
from ...language import Language
from ...util import registry
DEFAULT_CONFIG = """
[nlp]
lang = "lij"
stop_words = {"@language_data": "spacy.lij.stop_words"}
"""
@registry.language_data("spacy.lij.stop_words")
def stop_words() -> Set[str]:
return STOP_WORDS
class LigurianDefaults(Language.Defaults):
tokenizer_exceptions = TOKENIZER_EXCEPTIONS
infixes = TOKENIZER_INFIXES
stop_words = STOP_WORDS
class Ligurian(Language):
lang = "lij"
Defaults = LigurianDefaults
default_config = Config().from_str(DEFAULT_CONFIG)
__all__ = ["Ligurian"]

View File

@ -1,50 +1,21 @@
from typing import Set, Dict, Callable, Any
from thinc.api import Config
from .punctuation import TOKENIZER_INFIXES, TOKENIZER_SUFFIXES
from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
from .stop_words import STOP_WORDS
from .lex_attrs import LEX_ATTRS
from ...language import Language
from ...util import registry
DEFAULT_CONFIG = """
[nlp]
lang = "lt"
stop_words = {"@language_data": "spacy.lt.stop_words"}
lex_attr_getters = {"@language_data": "spacy.lt.lex_attr_getters"}
[nlp.lemmatizer]
@lemmatizers = "spacy.Lemmatizer.v1"
[nlp.lemmatizer.data]
@language_data = "spacy-lookups-data"
lang = ${nlp:lang}
tables = ["lemma_lookup"]
"""
@registry.language_data("spacy.lt.stop_words")
def stop_words() -> Set[str]:
return STOP_WORDS
@registry.language_data("spacy.lt.lex_attr_getters")
def lex_attr_getters() -> Dict[int, Callable[[str], Any]]:
return LEX_ATTRS
class LithuanianDefaults(Language.Defaults):
infixes = TOKENIZER_INFIXES
suffixes = TOKENIZER_SUFFIXES
tokenizer_exceptions = TOKENIZER_EXCEPTIONS
stop_words = STOP_WORDS
lex_attr_getters = LEX_ATTRS
class Lithuanian(Language):
lang = "lt"
Defaults = LithuanianDefaults
default_config = Config().from_str(DEFAULT_CONFIG)
__all__ = ["Lithuanian"]

View File

@ -1,26 +1,14 @@
from typing import Set
from thinc.api import Config
from .stop_words import STOP_WORDS
from ...language import Language
from ...util import registry
DEFAULT_CONFIG = """
[nlp]
lang = "lv"
stop_words = {"@language_data": "spacy.lv.stop_words"}
"""
@registry.language_data("spacy.lv.stop_words")
def stop_words() -> Set[str]:
return STOP_WORDS
class LatvianDefaults(Language.Defaults):
stop_words = STOP_WORDS
class Latvian(Language):
lang = "lv"
default_config = Config().from_str(DEFAULT_CONFIG)
Defaults = LatvianDefaults
__all__ = ["Latvian"]

View File

@ -1,26 +1,14 @@
from typing import Set
from thinc.api import Config
from .stop_words import STOP_WORDS
from ...language import Language
from ...util import registry
DEFAULT_CONFIG = """
[nlp]
lang = "ml"
stop_words = {"@language_data": "spacy.ml.stop_words"}
"""
@registry.language_data("spacy.ml.stop_words")
def stop_words() -> Set[str]:
return STOP_WORDS
class MalayalamDefaults(Language.Defaults):
stop_words = STOP_WORDS
class Malayalam(Language):
lang = "ml"
default_config = Config().from_str(DEFAULT_CONFIG)
Defaults = MalayalamDefaults
__all__ = ["Malayalam"]

View File

@ -1,26 +1,14 @@
from typing import Set
from thinc.api import Config
from .stop_words import STOP_WORDS
from ...language import Language
from ...util import registry
DEFAULT_CONFIG = """
[nlp]
lang = "af"
stop_words = {"@language_data": "spacy.mr.stop_words"}
"""
@registry.language_data("spacy.mr.stop_words")
def stop_words() -> Set[str]:
return STOP_WORDS
class MarathiDefaults(Language.Defaults):
stop_words = STOP_WORDS
class Marathi(Language):
lang = "mr"
default_config = Config().from_str(DEFAULT_CONFIG)
Defaults = MarathiDefaults
__all__ = ["Marathi"]

View File

@ -1,39 +1,9 @@
from typing import Set, Callable
from thinc.api import Config
from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
from .punctuation import TOKENIZER_PREFIXES, TOKENIZER_INFIXES
from .punctuation import TOKENIZER_SUFFIXES
from .stop_words import STOP_WORDS
from .syntax_iterators import noun_chunks
from .syntax_iterators import SYNTAX_ITERATORS
from ...language import Language
from ...util import registry
DEFAULT_CONFIG = """
[nlp]
lang = "nb"
stop_words = {"@language_data": "spacy.nb.stop_words"}
get_noun_chunks = {"@language_data": "spacy.nb.get_noun_chunks"}
[nlp.lemmatizer]
@lemmatizers = "spacy.Lemmatizer.v1"
[nlp.lemmatizer.data]
@language_data = "spacy-lookups-data"
lang = ${nlp:lang}
tables = ["lemma_lookup", "lemma_rules", "lemma_exc"]
"""
@registry.language_data("spacy.nb.stop_words")
def stop_words() -> Set[str]:
return STOP_WORDS
@registry.language_data("spacy.nb.get_noun_chunks")
def get_noun_chunks() -> Callable:
return noun_chunks
class NorwegianDefaults(Language.Defaults):
@ -41,12 +11,13 @@ class NorwegianDefaults(Language.Defaults):
prefixes = TOKENIZER_PREFIXES
infixes = TOKENIZER_INFIXES
suffixes = TOKENIZER_SUFFIXES
syntax_iterators = SYNTAX_ITERATORS
stop_words = STOP_WORDS
class Norwegian(Language):
lang = "nb"
Defaults = NorwegianDefaults
default_config = Config().from_str(DEFAULT_CONFIG)
__all__ = ["Norwegian"]

View File

@ -1,33 +1,16 @@
from typing import Set, Dict, Callable, Any
from thinc.api import Config
from .stop_words import STOP_WORDS
from .lex_attrs import LEX_ATTRS
from ...language import Language
from ...util import registry
DEFAULT_CONFIG = """
[nlp]
lang = "ne"
stop_words = {"@language_data": "spacy.ne.stop_words"}
lex_attr_getters = {"@language_data": "spacy.ne.lex_attr_getters"}
"""
@registry.language_data("spacy.ne.stop_words")
def stop_words() -> Set[str]:
return STOP_WORDS
@registry.language_data("spacy.ne.lex_attr_getters")
def lex_attr_getters() -> Dict[int, Callable[[str], Any]]:
return LEX_ATTRS
class NepaliDefaults(Language.Defaults):
stop_words = STOP_WORDS
lex_attr_getters = LEX_ATTRS
class Nepali(Language):
lang = "ne"
default_config = Config().from_str(DEFAULT_CONFIG)
Defaults = NepaliDefaults
__all__ = ["Nepali"]

View File

@ -1,4 +1,4 @@
from typing import Set, Dict, Callable, Any
from typing import Callable
from thinc.api import Config
from .stop_words import STOP_WORDS
@ -7,52 +7,43 @@ from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
from .punctuation import TOKENIZER_PREFIXES, TOKENIZER_INFIXES
from .punctuation import TOKENIZER_SUFFIXES
from .lemmatizer import DutchLemmatizer
from ...lookups import load_lookups
from ...language import Language
from ...util import registry
DEFAULT_CONFIG = """
[nlp]
lang = "nl"
stop_words = {"@language_data": "spacy.nl.stop_words"}
lex_attr_getters = {"@language_data": "spacy.nl.lex_attr_getters"}
[nlp.lemmatizer]
@lemmatizers = "spacy.DutchLemmatizer.v1"
[nlp.lemmatizer.data]
@language_data = "spacy-lookups-data"
lang = ${nlp:lang}
tables = ["lemma_rules", "lemma_index", "lemma_exc", "lemma_lookup"]
@lemmatizers = "spacy.nl.DutchLemmatizer"
"""
@registry.language_data("spacy.nl.stop_words")
def stop_words() -> Set[str]:
return STOP_WORDS
@registry.lemmatizers("spacy.nl.DutchLemmatizer")
def create_lemmatizer() -> Callable[[Language], DutchLemmatizer]:
tables = ["lemma_rules", "lemma_index", "lemma_exc", "lemma_lookup"]
def lemmatizer_factory(nlp: Language) -> DutchLemmatizer:
lookups = load_lookups(lang=nlp.lang, tables=tables)
return DutchLemmatizer(lookups=lookups)
@registry.language_data("spacy.nl.lex_attr_getters")
def lex_attr_getters() -> Dict[int, Callable[[str], Any]]:
return LEX_ATTRS
@registry.lemmatizers("spacy.DutchLemmatizer.v1")
def create_dutch_lemmatizer(data: Dict[str, dict] = {}) -> DutchLemmatizer:
return DutchLemmatizer(data=data)
return lemmatizer_factory
class DutchDefaults(Language.Defaults):
config = Config().from_str(DEFAULT_CONFIG)
tokenizer_exceptions = TOKENIZER_EXCEPTIONS
prefixes = TOKENIZER_PREFIXES
infixes = TOKENIZER_INFIXES
suffixes = TOKENIZER_SUFFIXES
lex_attr_getters = LEX_ATTRS
stop_words = STOP_WORDS
class Dutch(Language):
lang = "nl"
Defaults = DutchDefaults
default_config = Config().from_str(DEFAULT_CONFIG)
__all__ = ["Dutch"]

View File

@ -1,4 +1,4 @@
from typing import Set, Dict, Callable, Any
from typing import Callable
from thinc.api import Config
from .punctuation import TOKENIZER_PREFIXES, TOKENIZER_INFIXES
@ -7,55 +7,53 @@ from .stop_words import STOP_WORDS
from .lex_attrs import LEX_ATTRS
from .lemmatizer import PolishLemmatizer
from ..tokenizer_exceptions import BASE_EXCEPTIONS
from ...lookups import load_lookups
from ...language import Language
from ...util import registry
DEFAULT_CONFIG = """
[nlp]
lang = "pl"
stop_words = {"@language_data": "spacy.pl.stop_words"}
lex_attr_getters = {"@language_data": "spacy.pl.lex_attr_getters"}
[nlp.lemmatizer]
@lemmatizers = "spacy.PolishLemmatizer.v1"
[nlp.lemmatizer.data]
@language_data = "spacy-lookups-data"
lang = ${nlp:lang}
tables = ["lemma_lookup_adj", "lemma_lookup_adp", "lemma_lookup_adv", "lemma_lookup_aux", "lemma_lookup_noun", "lemma_lookup_num", "lemma_lookup_part", "lemma_lookup_pron", "lemma_lookup_verb"]
@lemmatizers = "spacy.pl.PolishLemmatizer"
"""
@registry.language_data("spacy.pl.stop_words")
def stop_words() -> Set[str]:
return STOP_WORDS
TOKENIZER_EXCEPTIONS = {
exc: val for exc, val in BASE_EXCEPTIONS.items() if not exc.endswith(".")
}
@registry.language_data("spacy.pl.lex_attr_getters")
def lex_attr_getters() -> Dict[int, Callable[[str], Any]]:
return LEX_ATTRS
@registry.lemmatizers("spacy.pl.PolishLemmatizer")
def create_lemmatizer() -> Callable[[Language], PolishLemmatizer]:
# fmt: off
tables = [
"lemma_lookup_adj", "lemma_lookup_adp", "lemma_lookup_adv",
"lemma_lookup_aux", "lemma_lookup_noun", "lemma_lookup_num",
"lemma_lookup_part", "lemma_lookup_pron", "lemma_lookup_verb"
]
# fmt: on
def lemmatizer_factory(nlp: Language) -> PolishLemmatizer:
lookups = load_lookups(lang=nlp.lang, tables=tables)
return PolishLemmatizer(lookups=lookups)
@registry.lemmatizers("spacy.PolishLemmatizer.v1")
def create_polish_lemmatizer(data: Dict[str, dict] = {}) -> PolishLemmatizer:
return PolishLemmatizer(data=data)
return lemmatizer_factory
class PolishDefaults(Language.Defaults):
mod_base_exceptions = {
exc: val for exc, val in BASE_EXCEPTIONS.items() if not exc.endswith(".")
}
tokenizer_exceptions = mod_base_exceptions
config = Config().from_str(DEFAULT_CONFIG)
tokenizer_exceptions = TOKENIZER_EXCEPTIONS
prefixes = TOKENIZER_PREFIXES
infixes = TOKENIZER_INFIXES
suffixes = TOKENIZER_SUFFIXES
lex_attr_getters = LEX_ATTRS
stop_words = STOP_WORDS
class Polish(Language):
lang = "pl"
Defaults = PolishDefaults
default_config = Config().from_str(DEFAULT_CONFIG)
__all__ = ["Polish"]

View File

@ -1,50 +1,21 @@
from typing import Set, Dict, Callable, Any
from thinc.api import Config
from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
from .stop_words import STOP_WORDS
from .lex_attrs import LEX_ATTRS
from .punctuation import TOKENIZER_INFIXES, TOKENIZER_PREFIXES
from ...language import Language
from ...util import registry
DEFAULT_CONFIG = """
[nlp]
lang = "pt"
stop_words = {"@language_data": "spacy.pt.stop_words"}
lex_attr_getters = {"@language_data": "spacy.pt.lex_attr_getters"}
[nlp.lemmatizer]
@lemmatizers = "spacy.Lemmatizer.v1"
[nlp.lemmatizer.data]
@language_data = "spacy-lookups-data"
lang = ${nlp:lang}
tables = ["lemma_lookup"]
"""
@registry.language_data("spacy.pt.stop_words")
def stop_words() -> Set[str]:
return STOP_WORDS
@registry.language_data("spacy.pt.lex_attr_getters")
def lex_attr_getters() -> Dict[int, Callable[[str], Any]]:
return LEX_ATTRS
class PortugueseDefaults(Language.Defaults):
tokenizer_exceptions = TOKENIZER_EXCEPTIONS
infixes = TOKENIZER_INFIXES
prefixes = TOKENIZER_PREFIXES
lex_attr_getters = LEX_ATTRS
stop_words = STOP_WORDS
class Portuguese(Language):
lang = "pt"
Defaults = PortugueseDefaults
default_config = Config().from_str(DEFAULT_CONFIG)
__all__ = ["Portuguese"]

View File

@ -3,7 +3,7 @@ from .char_classes import LIST_ICONS, HYPHENS, CURRENCY, UNITS
from .char_classes import CONCAT_QUOTES, ALPHA_LOWER, ALPHA_UPPER, ALPHA, PUNCT
_prefixes = (
TOKENIZER_PREFIXES = (
["§", "%", "=", "", "", r"\+(?![0-9])"]
+ LIST_PUNCT
+ LIST_ELLIPSES
@ -13,7 +13,7 @@ _prefixes = (
)
_suffixes = (
TOKENIZER_SUFFIXES = (
LIST_PUNCT
+ LIST_ELLIPSES
+ LIST_QUOTES
@ -31,7 +31,7 @@ _suffixes = (
]
)
_infixes = (
TOKENIZER_INFIXES = (
LIST_ELLIPSES
+ LIST_ICONS
+ [
@ -44,7 +44,3 @@ _infixes = (
r"(?<=[{a}0-9])[:<>=/](?=[{a}])".format(a=ALPHA),
]
)
TOKENIZER_PREFIXES = _prefixes
TOKENIZER_SUFFIXES = _suffixes
TOKENIZER_INFIXES = _infixes

View File

@ -1,49 +1,25 @@
from typing import Set
from thinc.api import Config
from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
from .stop_words import STOP_WORDS
from .punctuation import TOKENIZER_PREFIXES, TOKENIZER_INFIXES
from .punctuation import TOKENIZER_SUFFIXES
from ...language import Language
from ...util import registry
# Lemma data note:
# Original pairs downloaded from http://www.lexiconista.com/datasets/lemmatization/
# Replaced characters using cedillas with the correct ones (ș and ț)
DEFAULT_CONFIG = """
[nlp]
lang = "ro"
stop_words = {"@language_data": "spacy.ro.stop_words"}
[nlp.lemmatizer]
@lemmatizers = "spacy.Lemmatizer.v1"
[nlp.lemmatizer.data]
@language_data = "spacy-lookups-data"
lang = ${nlp:lang}
tables = ["lemma_lookup"]
"""
@registry.language_data("spacy.ro.stop_words")
def stop_words() -> Set[str]:
return STOP_WORDS
class RomanianDefaults(Language.Defaults):
tokenizer_exceptions = TOKENIZER_EXCEPTIONS
prefixes = TOKENIZER_PREFIXES
suffixes = TOKENIZER_SUFFIXES
infixes = TOKENIZER_INFIXES
stop_words = STOP_WORDS
class Romanian(Language):
lang = "ro"
Defaults = RomanianDefaults
default_config = Config().from_str(DEFAULT_CONFIG)
__all__ = ["Romanian"]

View File

@ -1,4 +1,4 @@
from typing import Set, Dict, Callable, Any
from typing import Callable
from thinc.api import Config
from .stop_words import STOP_WORDS
@ -11,43 +11,30 @@ from ...language import Language
DEFAULT_CONFIG = """
[nlp]
lang = "ru"
stop_words = {"@language_data": "spacy.ru.stop_words"}
lex_attr_getters = {"@language_data": "spacy.ru.lex_attr_getters"}
[nlp.lemmatizer]
@lemmatizers = "spacy.RussianLemmatizer.v1"
[nlp.vocab_data]
@language_data = "spacy-lookups-data"
lang = ${nlp:lang}
tables = ["lexeme_norm"]
@lemmatizers = "spacy.ru.RussianLemmatizer"
"""
@registry.language_data("spacy.ru.stop_words")
def stop_words() -> Set[str]:
return STOP_WORDS
@registry.lemmatizers("spacy.ru.RussianLemmatizer")
def create_lemmatizer() -> Callable[[Language], RussianLemmatizer]:
def lemmatizer_factory(nlp: Language) -> RussianLemmatizer:
return RussianLemmatizer()
@registry.language_data("spacy.ru.lex_attr_getters")
def lex_attr_getters() -> Dict[int, Callable[[str], Any]]:
return LEX_ATTRS
@registry.lemmatizers("spacy.RussianLemmatizer.v1")
def create_russian_lemmatizer() -> RussianLemmatizer:
return RussianLemmatizer()
return lemmatizer_factory
class RussianDefaults(Language.Defaults):
config = Config().from_str(DEFAULT_CONFIG)
tokenizer_exceptions = TOKENIZER_EXCEPTIONS
lex_attr_getters = LEX_ATTRS
stop_words = STOP_WORDS
class Russian(Language):
lang = "ru"
Defaults = RussianDefaults
default_config = Config().from_str(DEFAULT_CONFIG)
__all__ = ["Russian"]

View File

@ -1,33 +1,16 @@
from typing import Set, Dict, Callable, Any
from thinc.api import Config
from .stop_words import STOP_WORDS
from .lex_attrs import LEX_ATTRS
from ...language import Language
from ...util import registry
DEFAULT_CONFIG = """
[nlp]
lang = "si"
stop_words = {"@language_data": "spacy.si.stop_words"}
lex_attr_getters = {"@language_data": "spacy.si.lex_attr_getters"}
"""
@registry.language_data("spacy.si.stop_words")
def stop_words() -> Set[str]:
return STOP_WORDS
@registry.language_data("spacy.si.lex_attr_getters")
def lex_attr_getters() -> Dict[int, Callable[[str], Any]]:
return LEX_ATTRS
class SinhalaDefaults(Language.Defaults):
lex_attr_getters = LEX_ATTRS
stop_words = STOP_WORDS
class Sinhala(Language):
lang = "si"
default_config = Config().from_str(DEFAULT_CONFIG)
Defaults = SinhalaDefaults
__all__ = ["Sinhala"]

View File

@ -1,33 +1,16 @@
from typing import Set, Dict, Callable, Any
from thinc.api import Config
from .stop_words import STOP_WORDS
from .lex_attrs import LEX_ATTRS
from ...language import Language
from ...util import registry
DEFAULT_CONFIG = """
[nlp]
lang = "sk"
stop_words = {"@language_data": "spacy.sk.stop_words"}
lex_attr_getters = {"@language_data": "spacy.sk.lex_attr_getters"}
"""
@registry.language_data("spacy.sk.stop_words")
def stop_words() -> Set[str]:
return STOP_WORDS
@registry.language_data("spacy.sk.lex_attr_getters")
def lex_attr_getters() -> Dict[int, Callable[[str], Any]]:
return LEX_ATTRS
class SlovakDefaults(Language.Defaults):
lex_attr_getters = LEX_ATTRS
stop_words = STOP_WORDS
class Slovak(Language):
lang = "sk"
default_config = Config().from_str(DEFAULT_CONFIG)
Defaults = SlovakDefaults
__all__ = ["Slovak"]

View File

@ -1,26 +1,14 @@
from typing import Set
from thinc.api import Config
from .stop_words import STOP_WORDS
from ...language import Language
from ...util import registry
DEFAULT_CONFIG = """
[nlp]
lang = "sl"
stop_words = {"@language_data": "spacy.sl.stop_words"}
"""
@registry.language_data("spacy.sl.stop_words")
def stop_words() -> Set[str]:
return STOP_WORDS
class SlovenianDefaults(Language.Defaults):
stop_words = STOP_WORDS
class Slovenian(Language):
lang = "sl"
default_config = Config().from_str(DEFAULT_CONFIG)
Defaults = SlovenianDefaults
__all__ = ["Slovenian"]

View File

@ -1,26 +1,14 @@
from typing import Set
from thinc.api import Config
from .stop_words import STOP_WORDS
from ...language import Language
from ...util import registry
DEFAULT_CONFIG = """
[nlp]
lang = "sq"
stop_words = {"@language_data": "spacy.sq.stop_words"}
"""
@registry.language_data("spacy.sq.stop_words")
def stop_words() -> Set[str]:
return STOP_WORDS
class AlbanianDefaults(Language.Defaults):
stop_words = STOP_WORDS
class Albanian(Language):
lang = "sq"
default_config = Config().from_str(DEFAULT_CONFIG)
Defaults = AlbanianDefaults
__all__ = ["Albanian"]

View File

@ -1,52 +1,18 @@
from typing import Set, Dict, Callable, Any
from thinc.api import Config
from .stop_words import STOP_WORDS
from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
from .lex_attrs import LEX_ATTRS
from ...language import Language
from ...util import registry
DEFAULT_CONFIG = """
[nlp]
lang = "sr"
stop_words = {"@language_data": "spacy.sr.stop_words"}
lex_attr_getters = {"@language_data": "spacy.sr.lex_attr_getters"}
[nlp.lemmatizer]
@lemmatizers = "spacy.Lemmatizer.v1"
[nlp.lemmatizer.data]
@language_data = "spacy-lookups-data"
lang = ${nlp:lang}
tables = ["lemma_lookup"]
[nlp.vocab_data]
@language_data = "spacy-lookups-data"
lang = ${nlp:lang}
tables = ["lexeme_norm"]
"""
@registry.language_data("spacy.sr.stop_words")
def stop_words() -> Set[str]:
return STOP_WORDS
@registry.language_data("spacy.sr.lex_attr_getters")
def lex_attr_getters() -> Dict[int, Callable[[str], Any]]:
return LEX_ATTRS
class SerbianDefaults(Language.Defaults):
tokenizer_exceptions = TOKENIZER_EXCEPTIONS
lex_attr_getters = LEX_ATTRS
stop_words = STOP_WORDS
class Serbian(Language):
lang = "sr"
Defaults = SerbianDefaults
default_config = Config().from_str(DEFAULT_CONFIG)
__all__ = ["Serbian"]

View File

@ -1,59 +1,25 @@
from typing import Set, Dict, Callable, Any
from thinc.api import Config
from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
from .stop_words import STOP_WORDS
from .lex_attrs import LEX_ATTRS
from .syntax_iterators import SYNTAX_ITERATORS
from ...language import Language
from ...util import registry
from .syntax_iterators import noun_chunks
# Punctuation stolen from Danish
from ..da.punctuation import TOKENIZER_INFIXES, TOKENIZER_SUFFIXES
DEFAULT_CONFIG = """
[nlp]
lang = "sv"
stop_words = {"@language_data": "spacy.sv.stop_words"}
lex_attr_getters = {"@language_data": "spacy.sv.lex_attr_getters"}
get_noun_chunks = {"@language_data": "spacy.sv.get_noun_chunks"}
[nlp.lemmatizer]
@lemmatizers = "spacy.Lemmatizer.v1"
[nlp.lemmatizer.data]
@language_data = "spacy-lookups-data"
lang = ${nlp:lang}
tables = ["lemma_lookup", "lemma_rules"]
"""
@registry.language_data("spacy.sv.stop_words")
def stop_words() -> Set[str]:
return STOP_WORDS
@registry.language_data("spacy.sv.lex_attr_getters")
def lex_attr_getters() -> Dict[int, Callable[[str], Any]]:
return LEX_ATTRS
@registry.language_data("spacy.sv.get_noun_chunks")
def get_noun_chunks() -> Callable:
return noun_chunks
class SwedishDefaults(Language.Defaults):
tokenizer_exceptions = TOKENIZER_EXCEPTIONS
infixes = TOKENIZER_INFIXES
suffixes = TOKENIZER_SUFFIXES
lex_attr_getters = LEX_ATTRS
syntax_iterators = SYNTAX_ITERATORS
stop_words = STOP_WORDS
class Swedish(Language):
lang = "sv"
Defaults = SwedishDefaults
default_config = Config().from_str(DEFAULT_CONFIG)
__all__ = ["Swedish"]

View File

@@ -1,38 +1,16 @@
from typing import Set, Dict, Callable, Any
from thinc.api import Config
from .stop_words import STOP_WORDS
from .lex_attrs import LEX_ATTRS
from ...language import Language
from ...util import registry
DEFAULT_CONFIG = """
[nlp]
lang = "ta"
stop_words = {"@language_data": "spacy.ta.stop_words"}
lex_attr_getters = {"@language_data": "spacy.ta.lex_attr_getters"}
[nlp.vocab_data]
@language_data = "spacy-lookups-data"
lang = ${nlp:lang}
tables = ["lexeme_norm"]
"""
@registry.language_data("spacy.ta.stop_words")
def stop_words() -> Set[str]:
return STOP_WORDS
@registry.language_data("spacy.ta.lex_attr_getters")
def lex_attr_getters() -> Dict[int, Callable[[str], Any]]:
return LEX_ATTRS
class TamilDefaults(Language.Defaults):
lex_attr_getters = LEX_ATTRS
stop_words = STOP_WORDS
class Tamil(Language):
lang = "ta"
default_config = Config().from_str(DEFAULT_CONFIG)
Defaults = TamilDefaults
__all__ = ["Tamil"]

View File

@@ -1,33 +1,16 @@
from typing import Set, Dict, Callable, Any
from thinc.api import Config
from .stop_words import STOP_WORDS
from .lex_attrs import LEX_ATTRS
from ...language import Language
from ...util import registry
DEFAULT_CONFIG = """
[nlp]
lang = "te"
stop_words = {"@language_data": "spacy.te.stop_words"}
lex_attr_getters = {"@language_data": "spacy.te.lex_attr_getters"}
"""
@registry.language_data("spacy.te.stop_words")
def stop_words() -> Set[str]:
return STOP_WORDS
@registry.language_data("spacy.te.lex_attr_getters")
def lex_attr_getters() -> Dict[int, Callable[[str], Any]]:
return LEX_ATTRS
class TeluguDefaults(Language.Defaults):
lex_attr_getters = LEX_ATTRS
stop_words = STOP_WORDS
class Telugu(Language):
lang = "te"
default_config = Config().from_str(DEFAULT_CONFIG)
Defaults = TeluguDefaults
__all__ = ["Telugu"]

View File

@@ -1,4 +1,3 @@
from typing import Set, Dict, Callable, Any
from thinc.api import Config
from .stop_words import STOP_WORDS
@@ -10,31 +9,13 @@ from ...util import DummyTokenizer, registry
DEFAULT_CONFIG = """
[nlp]
lang = "th"
stop_words = {"@language_data": "spacy.th.stop_words"}
lex_attr_getters = {"@language_data": "spacy.th.lex_attr_getters"}
[nlp.tokenizer]
@tokenizers = "spacy.ThaiTokenizer.v1"
[nlp.vocab_data]
@language_data = "spacy-lookups-data"
lang = ${nlp:lang}
tables = ["lexeme_norm"]
@tokenizers = "spacy.th.ThaiTokenizer"
"""
@registry.language_data("spacy.th.stop_words")
def stop_words() -> Set[str]:
return STOP_WORDS
@registry.language_data("spacy.th.lex_attr_getters")
def lex_attr_getters() -> Dict[int, Callable[[str], Any]]:
return LEX_ATTRS
@registry.tokenizers("spacy.ThaiTokenizer.v1")
@registry.tokenizers("spacy.th.ThaiTokenizer")
def create_thai_tokenizer():
def thai_tokenizer_factory(nlp):
return ThaiTokenizer(nlp)
@@ -60,9 +41,15 @@ class ThaiTokenizer(DummyTokenizer):
return Doc(self.vocab, words=words, spaces=spaces)
class ThaiDefaults(Language.Defaults):
config = Config().from_str(DEFAULT_CONFIG)
lex_attr_getters = LEX_ATTRS
stop_words = STOP_WORDS
class Thai(Language):
lang = "th"
default_config = Config().from_str(DEFAULT_CONFIG)
Defaults = ThaiDefaults
__all__ = ["Thai"]

View File

@@ -1,47 +1,18 @@
from typing import Set, Dict, Callable, Any
from thinc.api import Config
from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
from .stop_words import STOP_WORDS
from .lex_attrs import LEX_ATTRS
from ...language import Language
from ...util import registry
DEFAULT_CONFIG = """
[nlp]
lang = "tl"
stop_words = {"@language_data": "spacy.tl.stop_words"}
lex_attr_getters = {"@language_data": "spacy.tl.lex_attr_getters"}
[nlp.lemmatizer]
@lemmatizers = "spacy.Lemmatizer.v1"
[nlp.lemmatizer.data]
@language_data = "spacy-lookups-data"
lang = ${nlp:lang}
tables = ["lemma_lookup"]
"""
@registry.language_data("spacy.tl.stop_words")
def stop_words() -> Set[str]:
return STOP_WORDS
@registry.language_data("spacy.tl.lex_attr_getters")
def lex_attr_getters() -> Dict[int, Callable[[str], Any]]:
return LEX_ATTRS
class TagalogDefaults(Language.Defaults):
tokenizer_exceptions = TOKENIZER_EXCEPTIONS
lex_attr_getters = LEX_ATTRS
stop_words = STOP_WORDS
class Tagalog(Language):
lang = "tl"
Defaults = TagalogDefaults
default_config = Config().from_str(DEFAULT_CONFIG)
__all__ = ["Tagalog"]

View File

@@ -1,40 +1,16 @@
from typing import Set
from thinc.api import Config
from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
from .stop_words import STOP_WORDS
from ...language import Language
from ...util import registry
DEFAULT_CONFIG = """
[nlp]
lang = "tr"
stop_words = {"@language_data": "spacy.tr.stop_words"}
[nlp.lemmatizer]
@lemmatizers = "spacy.Lemmatizer.v1"
[nlp.lemmatizer.data]
@language_data = "spacy-lookups-data"
lang = ${nlp:lang}
tables = ["lemma_lookup"]
"""
@registry.language_data("spacy.tr.stop_words")
def stop_words() -> Set[str]:
return STOP_WORDS
class TurkishDefaults(Language.Defaults):
tokenizer_exceptions = TOKENIZER_EXCEPTIONS
stop_words = STOP_WORDS
class Turkish(Language):
lang = "tr"
Defaults = TurkishDefaults
default_config = Config().from_str(DEFAULT_CONFIG)
__all__ = ["Turkish"]

View File

@@ -1,41 +1,20 @@
from typing import Set, Dict, Callable, Any
from thinc.api import Config
from .lex_attrs import LEX_ATTRS
from .punctuation import TOKENIZER_INFIXES
from .stop_words import STOP_WORDS
from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
from ...language import Language
from ...util import registry
DEFAULT_CONFIG = """
[nlp]
lang = "tt"
stop_words = {"@language_data": "spacy.tt.stop_words"}
lex_attr_getters = {"@language_data": "spacy.tt.lex_attr_getters"}
"""
@registry.language_data("spacy.tt.stop_words")
def stop_words() -> Set[str]:
return STOP_WORDS
@registry.language_data("spacy.tt.lex_attr_getters")
def lex_attr_getters() -> Dict[int, Callable[[str], Any]]:
return LEX_ATTRS
class TatarDefaults(Language.Defaults):
tokenizer_exceptions = TOKENIZER_EXCEPTIONS
infixes = tuple(TOKENIZER_INFIXES)
infixes = TOKENIZER_INFIXES
lex_attr_getters = LEX_ATTRS
stop_words = STOP_WORDS
class Tatar(Language):
lang = "tt"
Defaults = TatarDefaults
default_config = Config().from_str(DEFAULT_CONFIG)
__all__ = ["Tatar"]

View File

@@ -1,4 +1,4 @@
from typing import Set, Dict, Callable, Any
from typing import Callable
from thinc.api import Config
from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
@@ -11,38 +11,30 @@ from .lemmatizer import UkrainianLemmatizer
DEFAULT_CONFIG = """
[nlp]
lang = "uk"
stop_words = {"@language_data": "spacy.uk.stop_words"}
lex_attr_getters = {"@language_data": "spacy.uk.lex_attr_getters"}
[nlp.lemmatizer]
@lemmatizers = "spacy.UkrainianLemmatizer.v1"
@lemmatizers = "spacy.uk.UkrainianLemmatizer"
"""
@registry.language_data("spacy.uk.stop_words")
def stop_words() -> Set[str]:
return STOP_WORDS
@registry.lemmatizers("spacy.uk.UkrainianLemmatizer")
def create_ukrainian_lemmatizer() -> Callable[[Language], UkrainianLemmatizer]:
def lemmatizer_factory(nlp: Language) -> UkrainianLemmatizer:
return UkrainianLemmatizer()
@registry.language_data("spacy.uk.lex_attr_getters")
def lex_attr_getters() -> Dict[int, Callable[[str], Any]]:
return LEX_ATTRS
@registry.lemmatizers("spacy.UkrainianLemmatizer.v1")
def create_ukrainian_lemmatizer() -> UkrainianLemmatizer:
return UkrainianLemmatizer()
return lemmatizer_factory
class UkrainianDefaults(Language.Defaults):
config = Config().from_str(DEFAULT_CONFIG)
tokenizer_exceptions = TOKENIZER_EXCEPTIONS
lex_attr_getters = LEX_ATTRS
stop_words = STOP_WORDS
class Ukrainian(Language):
lang = "uk"
Defaults = UkrainianDefaults
default_config = Config().from_str(DEFAULT_CONFIG)
__all__ = ["Ukrainian"]

View File

@@ -1,54 +1,19 @@
from typing import Set, Dict, Callable, Any
from thinc.api import Config
from .stop_words import STOP_WORDS
from .lex_attrs import LEX_ATTRS
from .punctuation import TOKENIZER_SUFFIXES
from ..tokenizer_exceptions import BASE_EXCEPTIONS
from ...language import Language
from ...util import registry
DEFAULT_CONFIG = """
[nlp]
lang = "ur"
stop_words = {"@language_data": "spacy.ur.stop_words"}
lex_attr_getters = {"@language_data": "spacy.ur.lex_attr_getters"}
[nlp.writing_system]
direction = "rtl"
has_case = false
has_letters = true
[nlp.lemmatizer]
@lemmatizers = "spacy.Lemmatizer.v1"
[nlp.lemmatizer.data]
@language_data = "spacy-lookups-data"
lang = ${nlp:lang}
tables = ["lemma_lookup"]
"""
@registry.language_data("spacy.ur.stop_words")
def stop_words() -> Set[str]:
return STOP_WORDS
@registry.language_data("spacy.ur.lex_attr_getters")
def lex_attr_getters() -> Dict[int, Callable[[str], Any]]:
return LEX_ATTRS
class UrduDefaults(Language.Defaults):
tokenizer_exceptions = BASE_EXCEPTIONS
suffixes = TOKENIZER_SUFFIXES
lex_attr_getters = LEX_ATTRS
stop_words = STOP_WORDS
writing_system = {"direction": "rtl", "has_case": False, "has_letters": True}
class Urdu(Language):
lang = "ur"
Defaults = UrduDefaults
default_config = Config().from_str(DEFAULT_CONFIG)
__all__ = ["Urdu"]

View File

@@ -1,4 +1,3 @@
from typing import Set, Dict, Callable, Any
from thinc.api import Config
from ...language import Language
@@ -10,27 +9,14 @@ from .lex_attrs import LEX_ATTRS
DEFAULT_CONFIG = """
[nlp]
lang = "vi"
stop_words = {"@language_data": "spacy.vi.stop_words"}
lex_attr_getters = {"@language_data": "spacy.vi.lex_attr_getters"}
[nlp.tokenizer]
@tokenizers = "spacy.VietnameseTokenizer.v1"
@tokenizers = "spacy.vi.VietnameseTokenizer"
use_pyvi = true
"""
@registry.language_data("spacy.vi.stop_words")
def stop_words() -> Set[str]:
return STOP_WORDS
@registry.language_data("spacy.vi.lex_attr_getters")
def lex_attr_getters() -> Dict[int, Callable[[str], Any]]:
return LEX_ATTRS
@registry.tokenizers("spacy.VietnameseTokenizer.v1")
@registry.tokenizers("spacy.vi.VietnameseTokenizer")
def create_vietnamese_tokenizer(use_pyvi: bool = True,):
def vietnamese_tokenizer_factory(nlp):
return VietnameseTokenizer(nlp, use_pyvi=use_pyvi)
@@ -68,9 +54,15 @@ class VietnameseTokenizer(DummyTokenizer):
return Doc(self.vocab, words=words, spaces=spaces)
class VietnameseDefaults(Language.Defaults):
config = Config().from_str(DEFAULT_CONFIG)
lex_attr_getters = LEX_ATTRS
stop_words = STOP_WORDS
class Vietnamese(Language):
lang = "vi"
default_config = Config().from_str(DEFAULT_CONFIG)
Defaults = VietnameseDefaults
__all__ = ["Vietnamese"]

View File

@@ -1,27 +1,12 @@
from thinc.api import Config
from ..tokenizer_exceptions import BASE_EXCEPTIONS
from ...language import Language
DEFAULT_CONFIG = """
[nlp]
lang = "xx"
"""
class MultiLanguageDefaults(Language.Defaults):
tokenizer_exceptions = BASE_EXCEPTIONS
class MultiLanguage(Language):
"""Language class to be used for models that support multiple languages.
This module allows models to specify their language ID as 'xx'.
"""
lang = "xx"
Defaults = MultiLanguageDefaults
default_config = Config().from_str(DEFAULT_CONFIG)
__all__ = ["MultiLanguage"]

View File

@@ -1,39 +1,16 @@
from typing import Set, Dict, Callable, Any
from thinc.api import Config
from .stop_words import STOP_WORDS
from .lex_attrs import LEX_ATTRS
from ..tokenizer_exceptions import BASE_EXCEPTIONS
from ...language import Language
from ...util import registry
DEFAULT_CONFIG = """
[nlp]
lang = "si"
stop_words = {"@language_data": "spacy.yo.stop_words"}
lex_attr_getters = {"@language_data": "spacy.yo.lex_attr_getters"}
"""
@registry.language_data("spacy.yo.stop_words")
def stop_words() -> Set[str]:
return STOP_WORDS
@registry.language_data("spacy.yo.lex_attr_getters")
def lex_attr_getters() -> Dict[int, Callable[[str], Any]]:
return LEX_ATTRS
class YorubaDefaults(Language.Defaults):
tokenizer_exceptions = BASE_EXCEPTIONS
lex_attr_getters = LEX_ATTRS
stop_words = STOP_WORDS
class Yoruba(Language):
lang = "yo"
Defaults = YorubaDefaults
default_config = Config().from_str(DEFAULT_CONFIG)
__all__ = ["Yoruba"]

View File

@@ -1,4 +1,4 @@
from typing import Optional, List, Set, Dict, Callable, Any
from typing import Optional, List, Dict, Any
from enum import Enum
import tempfile
import srsly
@ -10,7 +10,6 @@ from ...errors import Warnings, Errors
from ...language import Language
from ...tokens import Doc
from ...util import DummyTokenizer, registry
from ..tokenizer_exceptions import BASE_EXCEPTIONS
from .lex_attrs import LEX_ATTRS
from .stop_words import STOP_WORDS
from ... import util
@@ -20,20 +19,12 @@ _PKUSEG_INSTALL_MSG = "install it with `pip install pkuseg==0.0.25` or from http
DEFAULT_CONFIG = """
[nlp]
lang = "zh"
stop_words = {"@language_data": "spacy.zh.stop_words"}
lex_attr_getters = {"@language_data": "spacy.zh.lex_attr_getters"}
[nlp.tokenizer]
@tokenizers = "spacy.ChineseTokenizer.v1"
@tokenizers = "spacy.zh.ChineseTokenizer"
segmenter = "char"
pkuseg_model = null
pkuseg_user_dict = "default"
[nlp.writing_system]
direction = "ltr"
has_case = false
has_letters = false
"""
@@ -47,17 +38,7 @@ class Segmenter(str, Enum):
return list(cls.__members__.keys())
@registry.language_data("spacy.zh.stop_words")
def stop_words() -> Set[str]:
return STOP_WORDS
@registry.language_data("spacy.zh.lex_attr_getters")
def lex_attr_getters() -> Dict[int, Callable[[str], Any]]:
return LEX_ATTRS
@registry.tokenizers("spacy.ChineseTokenizer.v1")
@registry.tokenizers("spacy.zh.ChineseTokenizer")
def create_chinese_tokenizer(
segmenter: Segmenter = Segmenter.char,
pkuseg_model: Optional[str] = None,
@@ -155,6 +136,18 @@ class ChineseTokenizer(DummyTokenizer):
warn_msg = Warnings.W104.format(target="pkuseg", current=self.segmenter)
warnings.warn(warn_msg)
def _get_config(self) -> Dict[str, Any]:
return {
"segmenter": self.segmenter,
"pkuseg_model": self.pkuseg_model,
"pkuseg_user_dict": self.pkuseg_user_dict,
}
def _set_config(self, config: Dict[str, Any] = {}) -> None:
self.segmenter = config.get("segmenter", Segmenter.char)
self.pkuseg_model = config.get("pkuseg_model", None)
self.pkuseg_user_dict = config.get("pkuseg_user_dict", "default")
def to_bytes(self, **kwargs):
pkuseg_features_b = b""
pkuseg_weights_b = b""
@@ -175,6 +168,7 @@ class ChineseTokenizer(DummyTokenizer):
sorted(list(self.pkuseg_seg.postprocesser.other_words)),
)
serializers = {
"cfg": lambda: srsly.json_dumps(self._get_config()),
"pkuseg_features": lambda: pkuseg_features_b,
"pkuseg_weights": lambda: pkuseg_weights_b,
"pkuseg_processors": lambda: srsly.msgpack_dumps(pkuseg_processors_data),
@@ -194,6 +188,7 @@ class ChineseTokenizer(DummyTokenizer):
pkuseg_data["processors_data"] = srsly.msgpack_loads(b)
deserializers = {
"cfg": lambda b: self._set_config(srsly.json_loads(b)),
"pkuseg_features": deserialize_pkuseg_features,
"pkuseg_weights": deserialize_pkuseg_weights,
"pkuseg_processors": deserialize_pkuseg_processors,
@@ -246,6 +241,7 @@ class ChineseTokenizer(DummyTokenizer):
srsly.write_msgpack(path, data)
serializers = {
"cfg": lambda p: srsly.write_json(p, self._get_config()),
"pkuseg_model": lambda p: save_pkuseg_model(p),
"pkuseg_processors": lambda p: save_pkuseg_processors(p),
}
@@ -281,6 +277,7 @@ class ChineseTokenizer(DummyTokenizer):
self.pkuseg_seg.postprocesser.other_words = set(other_words)
serializers = {
"cfg": lambda p: self._set_config(srsly.read_json(p)),
"pkuseg_model": lambda p: load_pkuseg_model(p),
"pkuseg_processors": lambda p: load_pkuseg_processors(p),
}
@@ -288,13 +285,15 @@ class ChineseTokenizer(DummyTokenizer):
class ChineseDefaults(Language.Defaults):
tokenizer_exceptions = BASE_EXCEPTIONS
config = Config().from_str(DEFAULT_CONFIG)
lex_attr_getters = LEX_ATTRS
stop_words = STOP_WORDS
writing_system = {"direction": "ltr", "has_case": False, "has_letters": False}
class Chinese(Language):
lang = "zh"
Defaults = ChineseDefaults
default_config = Config().from_str(DEFAULT_CONFIG)
def try_jieba_import(segmenter: str) -> None:

View File

@@ -16,27 +16,25 @@ import multiprocessing as mp
from itertools import chain, cycle
from .tokens.underscore import Underscore
from .vocab import Vocab
from .vocab import Vocab, create_vocab
from .pipe_analysis import analyze_pipes, analyze_all_pipes, validate_attrs
from .gold import Example
from .scorer import Scorer
from .util import link_vectors_to_models, create_default_optimizer, registry
from .util import SimpleFrozenDict
from .lang.tokenizer_exceptions import URL_MATCH, BASE_EXCEPTIONS
from .lang.punctuation import TOKENIZER_PREFIXES, TOKENIZER_SUFFIXES
from .lang.punctuation import TOKENIZER_INFIXES
from .tokens import Doc
from .lookups import load_lookups
from .tokenizer import Tokenizer
from .lemmatizer import Lemmatizer
from .errors import Errors, Warnings
from .schemas import ConfigSchema
from .git_info import GIT_VERSION
from . import util
from . import about
# We also need to import these to make sure the functions are registered
from .tokenizer import Tokenizer # noqa: F401
from .lemmatizer import Lemmatizer # noqa: F401
from .lookups import Lookups # noqa: F401
from .lang import defaults # noqa: F401
ENABLE_PIPELINE_ANALYSIS = False
# This is the base config with all settings (training etc.)
@@ -45,10 +43,50 @@ DEFAULT_CONFIG = Config().from_disk(DEFAULT_CONFIG_PATH)
class BaseDefaults:
prefixes: Tuple[Pattern, ...] = tuple(TOKENIZER_PREFIXES)
suffixes: Tuple[Pattern, ...] = tuple(TOKENIZER_SUFFIXES)
infixes: Tuple[Pattern, ...] = tuple(TOKENIZER_INFIXES)
tokenizer_exceptions: Dict[str, List[dict]] = {}
config: Config = Config()
tokenizer_exceptions: Dict[str, List[dict]] = BASE_EXCEPTIONS
prefixes: Optional[List[Union[str, Pattern]]] = TOKENIZER_PREFIXES
suffixes: Optional[List[Union[str, Pattern]]] = TOKENIZER_SUFFIXES
infixes: Optional[List[Union[str, Pattern]]] = TOKENIZER_INFIXES
token_match: Optional[Pattern] = None
url_match: Optional[Pattern] = URL_MATCH
syntax_iterators: Dict[str, Callable] = {}
lex_attr_getters: Dict[int, Callable[[str], Any]] = {}
stop_words = set()
writing_system = {"direction": "ltr", "has_case": True, "has_letters": True}
@registry.tokenizers("spacy.Tokenizer.v1")
def create_tokenizer() -> Callable[["Language"], Tokenizer]:
def tokenizer_factory(nlp: "Language") -> Tokenizer:
prefixes = nlp.Defaults.prefixes
suffixes = nlp.Defaults.suffixes
infixes = nlp.Defaults.infixes
prefix_search = util.compile_prefix_regex(prefixes).search if prefixes else None
suffix_search = util.compile_suffix_regex(suffixes).search if suffixes else None
infix_finditer = util.compile_infix_regex(infixes).finditer if infixes else None
return Tokenizer(
nlp.vocab,
rules=nlp.Defaults.tokenizer_exceptions,
prefix_search=prefix_search,
suffix_search=suffix_search,
infix_finditer=infix_finditer,
token_match=nlp.Defaults.token_match,
url_match=nlp.Defaults.url_match,
)
return tokenizer_factory
@registry.lemmatizers("spacy.Lemmatizer.v1")
def create_lemmatizer() -> Callable[["Language"], "Lemmatizer"]:
tables = ["lemma_lookup", "lemma_rules", "lemma_exc", "lemma_index"]
def lemmatizer_factory(nlp: "Language") -> "Lemmatizer":
lookups = load_lookups(lang=nlp.lang, tables=tables, strict=False)
return Lemmatizer(lookups=lookups)
return lemmatizer_factory
class Language:
@@ -65,8 +103,8 @@ class Language:
Defaults = BaseDefaults
lang: str = None
default_config = DEFAULT_CONFIG
factories = SimpleFrozenDict(error=Errors.E957)
factories = SimpleFrozenDict(error=Errors.E957)
_factory_meta: Dict[str, "FactoryMeta"] = {} # meta by factory
def __init__(
@@ -75,6 +113,7 @@ class Language:
max_length: int = 10 ** 6,
meta: Dict[str, Any] = {},
create_tokenizer: Optional[Callable[["Language"], Callable[[str], Doc]]] = None,
create_lemmatizer: Optional[Callable[["Language"], Callable]] = None,
**kwargs,
):
"""Initialise a Language object.
@@ -108,7 +147,16 @@ class Language:
if vocab is True:
vectors_name = meta.get("vectors", {}).get("name")
vocab = Vocab.from_config(self._config, vectors_name=vectors_name)
if not create_lemmatizer:
lemma_cfg = {"lemmatizer": self._config["nlp"]["lemmatizer"]}
create_lemmatizer = registry.make_from_config(lemma_cfg)["lemmatizer"]
# TODO: where does the vocab data come in?
vocab = create_vocab(
self.lang,
self.Defaults,
lemmatizer=create_lemmatizer(self),
vectors_name=vectors_name,
)
else:
if (self.lang and vocab.lang) and (self.lang != vocab.lang):
raise ValueError(Errors.E150.format(nlp=self.lang, vocab=vocab.lang))
@@ -126,7 +174,10 @@ class Language:
def __init_subclass__(cls, **kwargs):
super().__init_subclass__(**kwargs)
cls.default_config = util.deep_merge_configs(cls.default_config, DEFAULT_CONFIG)
cls.default_config = util.deep_merge_configs(
cls.Defaults.config, DEFAULT_CONFIG
)
cls.default_config["nlp"]["lang"] = cls.lang
@property
def path(self):
@@ -1226,17 +1277,16 @@ class Language:
config = util.deep_merge_configs(config, cls.default_config)
if "nlp" not in config:
raise ValueError(Errors.E985.format(config=config))
nlp_config = config["nlp"]
config_lang = nlp_config["lang"]
config_lang = config["nlp"]["lang"]
if cls.lang is not None and config_lang is not None and config_lang != cls.lang:
raise ValueError(
Errors.E958.format(
bad_lang_code=nlp_config["lang"],
bad_lang_code=config["nlp"]["lang"],
lang_code=cls.lang,
lang=util.get_object_name(cls),
)
)
nlp_config["lang"] = cls.lang
config["nlp"]["lang"] = cls.lang
# This isn't very elegant, but we remove the [components] block here to prevent
# it from getting resolved (causes problems because we expect to pass in
# the nlp and name args for each component). If we're auto-filling, we're
@@ -1251,22 +1301,12 @@ class Language:
filled["components"] = orig_pipeline
config["components"] = orig_pipeline
create_tokenizer = resolved["nlp"]["tokenizer"]
lemmatizer = resolved["nlp"]["lemmatizer"]
lex_attr_getters = resolved["nlp"]["lex_attr_getters"]
stop_words = resolved["nlp"]["stop_words"]
vocab_data = resolved["nlp"]["vocab_data"]
get_noun_chunks = resolved["nlp"]["get_noun_chunks"]
vocab = Vocab.from_config(
filled,
lemmatizer=lemmatizer,
lex_attr_getters=lex_attr_getters,
stop_words=stop_words,
vocab_data=vocab_data,
get_noun_chunks=get_noun_chunks,
create_lemmatizer = resolved["nlp"]["lemmatizer"]
nlp = cls(
create_tokenizer=create_tokenizer, create_lemmatizer=create_lemmatizer,
)
nlp = cls(vocab, create_tokenizer=create_tokenizer)
pipeline = config.get("components", {})
for pipe_name in nlp_config["pipeline"]:
for pipe_name in config["nlp"]["pipeline"]:
if pipe_name not in pipeline:
opts = ", ".join(pipeline.keys())
raise ValueError(Errors.E956.format(name=pipe_name, opts=opts))
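
For context, a minimal sketch of how a registered function referenced from an [nlp] block is resolved into a callable, mirroring the registry.make_from_config calls visible in the diff above; the block contents below are an assumption based on the default tokenizer name, not part of this commit:

    from spacy.util import registry

    cfg = {"tokenizer": {"@tokenizers": "spacy.Tokenizer.v1"}}
    # Resolving the block returns the factory, which takes the nlp object
    # and builds the actual Tokenizer from nlp.Defaults.
    create_tokenizer = registry.make_from_config(cfg)["tokenizer"]
    # tokenizer = create_tokenizer(nlp)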

View File

@@ -2,12 +2,6 @@ from typing import Optional, Callable, List, Dict
from .lookups import Lookups
from .parts_of_speech import NAMES as UPOS_NAMES
from .util import registry
@registry.lemmatizers("spacy.Lemmatizer.v1")
def create_lemmatizer(data: Dict[str, str] = {}) -> "Lemmatizer":
return Lemmatizer(data=data)
class Lemmatizer:
@@ -21,7 +15,6 @@ class Lemmatizer:
def __init__(
self,
lookups: Optional[Lookups] = None,
data: Dict[str, dict] = {},
is_base_form: Optional[Callable] = None,
) -> None:
"""Initialize a Lemmatizer.
@@ -31,9 +24,6 @@ class Lemmatizer:
RETURNS (Lemmatizer): The newly constructed object.
"""
self.lookups = lookups if lookups is not None else Lookups()
for name, table in data.items():
if table is not None:
self.lookups.add_table(name, table)
self.is_base_form = is_base_form
def __call__(
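
With the data argument removed, lemma tables are now passed in through a Lookups object. A hypothetical construction sketch (the toy table below is for illustration only):

    from spacy.lemmatizer import Lemmatizer
    from spacy.lookups import Lookups

    lookups = Lookups()
    lookups.add_table("lemma_lookup", {"was": "be"})  # toy lookup table
    lemmatizer = Lemmatizer(lookups=lookups)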

View File

@@ -13,7 +13,9 @@ UNSET = object()
@registry.language_data("spacy-lookups-data")
def get_lookups(lang: str, tables: List[str]) -> Optional[Dict[str, Any]]:
def load_lookups(
lang: str, tables: List[str], strict: bool = True
) -> Optional[Dict[str, Any]]:
"""Load the data from the spacy-lookups-data package for a given language,
if available. Returns an empty dict if there's no data or if the package
is not installed.
@@ -24,15 +26,19 @@ def get_lookups(lang: str, tables: List[str]) -> Optional[Dict[str, Any]]:
RETURNS (Dict[str, Any]): The lookups, keyed by table name.
"""
# TODO: import spacy_lookups_data instead of going via entry points here?
lookups = Lookups()
if lang not in registry.lookups:
return {}
return lookups
data = registry.lookups.get(lang)
result = {}
for table in tables:
if table not in data:
raise ValueError("TODO: unknown table")
result[table] = load_language_data(data[table])
return result
if strict:
raise ValueError("TODO: unknown table")
language_data = {}
else:
language_data = load_language_data(data[table])
lookups.add_table(table, language_data)
return lookups
class Lookups:
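
A hypothetical call to the reworked helper; it assumes the spacy-lookups-data package is installed and provides the requested table, and relies on non-strict loading falling back to an empty table instead of raising:

    from spacy.lookups import load_lookups

    lookups = load_lookups(lang="en", tables=["lemma_lookup"], strict=False)
    if lookups.has_table("lemma_lookup"):
        lemma_table = lookups.get_table("lemma_lookup")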

View File

@@ -239,11 +239,7 @@ class ConfigSchemaNlp(BaseModel):
pipeline: List[StrictStr] = Field(..., title="The pipeline component names in order")
tokenizer: Callable = Field(..., title="The tokenizer to use")
lemmatizer: Callable = Field(..., title="The lemmatizer to use")
writing_system: ConfigSchemaNlpWritingSystem = Field(..., title="The language's writing system")
stop_words: Sequence[StrictStr] = Field(..., title="Stop words to mark via Token/Lexeme.is_stop")
lex_attr_getters: Dict[StrictStr, Callable] = Field(..., title="Custom getter functions for lexical attributes (e.g. like_num)")
vocab_data: Dict[StrictStr, Dict[StrictStr, Any]] = Field(..., title="Vocabulary data, e.g. lexeme normalization tables")
get_noun_chunks: Optional[Callable] = Field(..., title="Function to extract noun phrases from a Doc")
# fmt: on
class Config:

View File

@@ -257,7 +257,7 @@ def zh_tokenizer_char():
def zh_tokenizer_jieba():
pytest.importorskip("jieba")
config = {
"@tokenizers": "spacy.ChineseTokenizer.v1",
"@tokenizers": "spacy.zh.ChineseTokenizer",
"segmenter": "jieba",
}
nlp = get_lang_class("zh").from_config({"nlp": {"tokenizer": config}})
@@ -268,7 +268,7 @@ def zh_tokenizer_jieba():
def zh_tokenizer_pkuseg():
pytest.importorskip("pkuseg")
config = {
"@tokenizers": "spacy.ChineseTokenizer.v1",
"@tokenizers": "spacy.zh.ChineseTokenizer",
"segmenter": "pkuseg",
"pkuseg_model": "default",
}

View File

@@ -26,37 +26,6 @@ from .attrs import intify_attrs
from .symbols import ORTH
@registry.tokenizers("spacy.Tokenizer.v1")
def create_tokenizer(
# exceptions: Dict[str, List[dict]],
# prefixes: Optional[List[Union[str, Pattern]]],
# suffixes: Optional[List[Union[str, Pattern]]],
# infixes: Optional[List[Union[str, Pattern]]],
# We currently can't validate against Pattern because that will cause
# Pydantic to parse value *as* pattern
token_match: Optional[Any] = None,
url_match: Optional[Any] = None,
) -> "Tokenizer":
def tokenizer_factory(nlp):
exceptions = nlp.Defaults.tokenizer_exceptions
prefixes = nlp.Defaults.prefixes
suffixes = nlp.Defaults.suffixes
infixes = nlp.Defaults.infixes
prefix_search = util.compile_prefix_regex(prefixes).search if prefixes else None
suffix_search = util.compile_suffix_regex(suffixes).search if suffixes else None
infix_finditer = util.compile_infix_regex(infixes).finditer if infixes else None
return Tokenizer(
nlp.vocab,
rules=exceptions,
prefix_search=prefix_search,
suffix_search=suffix_search,
infix_finditer=infix_finditer,
token_match=token_match,
url_match=url_match,
)
return tokenizer_factory
cdef class Tokenizer:
"""Segment text, and create Doc objects with the discovered segment
boundaries.

View File

@@ -23,6 +23,33 @@ from .lang.norm_exceptions import BASE_NORMS
from .lang.lex_attrs import LEX_ATTRS, is_stop, get_lang
def create_vocab(lang, defaults, lemmatizer=None, vocab_data={}, vectors_name=None):
lex_attrs = {**LEX_ATTRS, **defaults.lex_attr_getters}
# This is messy, but it's the minimal working fix to Issue #639.
lex_attrs[IS_STOP] = functools.partial(is_stop, stops=defaults.stop_words)
# Ensure that getter can be pickled
lex_attrs[LANG] = functools.partial(get_lang, lang=lang)
lex_attrs[NORM] = util.add_lookups(
lex_attrs.get(NORM, LEX_ATTRS[NORM]),
BASE_NORMS,
vocab_data.get("lexeme_norm", {}),
)
lookups = Lookups()
for name, data in vocab_data.items():
if name not in lookups:
data = data if data is not None else {}
lookups.add_table(name, data)
return Vocab(
lex_attr_getters=lex_attrs,
lemmatizer=lemmatizer,
lookups=lookups,
writing_system=defaults.writing_system,
get_noun_chunks=defaults.syntax_iterators.get("noun_chunks"),
vectors_name=vectors_name,
)
cdef class Vocab:
"""A look-up table that allows you to access `Lexeme` objects. The `Vocab`
instance also provides access to the `StringStore`, and owns underlying
@@ -31,7 +58,7 @@ cdef class Vocab:
DOCS: https://spacy.io/api/vocab
"""
def __init__(self, lex_attr_getters=None, lemmatizer=None,
strings=tuple(), lookups=None, tag_map={}, vocab_data={},
strings=tuple(), lookups=None, tag_map={},
oov_prob=-20., vectors_name=None, writing_system={},
get_noun_chunks=None, **deprecated_kwargs):
"""Create the vocabulary.
@@ -51,10 +78,6 @@ cdef class Vocab:
lex_attr_getters = lex_attr_getters if lex_attr_getters is not None else {}
if lookups in (None, True, False):
lookups = Lookups()
for name, data in vocab_data.items():
if name not in lookups:
data = data if data is not None else {}
lookups.add_table(name, data)
if lemmatizer in (None, True, False):
lemmatizer = Lemmatizer(lookups)
self.cfg = {'oov_prob': oov_prob}
@@ -416,66 +439,6 @@ cdef class Vocab:
orth = self.strings.add(orth)
return orth in self.vectors
@classmethod
def from_config(
cls,
config,
lemmatizer=None,
lex_attr_getters=None,
stop_words=None,
vocab_data=None,
get_noun_chunks=None,
vectors_name=None,
):
"""Create a Vocab from a config and (currently) language defaults, i.e.
nlp.Defaults.
config (Dict[str, Any]): The full config.
lemmatizer (Callable): Optional lemmatizer.
vectors_name (str): Optional vectors name.
RETURNS (Vocab): The vocab.
"""
# TODO: make this less messy move lemmatizer out into its own pipeline
# component, move language defaults to config
lang = config["nlp"]["lang"]
writing_system = config["nlp"]["writing_system"]
if not lemmatizer:
lemma_cfg = {"lemmatizer": config["nlp"]["lemmatizer"]}
lemmatizer = registry.make_from_config(lemma_cfg)["lemmatizer"]
if stop_words is None:
stop_words_cfg = {"stop_words": config["nlp"]["stop_words"]}
stop_words = registry.make_from_config(stop_words_cfg)["stop_words"]
if vocab_data is None:
vocab_data_cfg = {"vocab_data": config["nlp"]["vocab_data"]}
vocab_data = registry.make_from_config(vocab_data_cfg)["vocab_data"]
if get_noun_chunks is None:
noun_chunks_cfg = {"get_noun_chunks": config["nlp"]["get_noun_chunks"]}
get_noun_chunks = registry.make_from_config(noun_chunks_cfg)["get_noun_chunks"]
if lex_attr_getters is None:
lex_attrs_cfg = {"lex_attr_getters": config["nlp"]["lex_attr_getters"]}
lex_attr_getters = registry.make_from_config(lex_attrs_cfg)["lex_attr_getters"]
lex_attrs = dict(LEX_ATTRS)
lex_attrs.update(lex_attr_getters)
# This is messy, but it's the minimal working fix to Issue #639.
lex_attrs[IS_STOP] = functools.partial(is_stop, stops=stop_words)
# Ensure that getter can be pickled
lex_attrs[LANG] = functools.partial(get_lang, lang=lang)
lex_attrs[NORM] = util.add_lookups(
lex_attrs.get(NORM, LEX_ATTRS[NORM]),
BASE_NORMS,
vocab_data.get("lexeme_norm", {}),
)
vocab = cls(
lex_attr_getters=lex_attrs,
vocab_data=vocab_data,
lemmatizer=lemmatizer,
writing_system=writing_system,
get_noun_chunks=get_noun_chunks
)
if vocab.vectors.name is None and vectors_name:
vocab.vectors.name = vectors_name
return vocab
def to_disk(self, path, exclude=tuple()):
"""Save the current state to a directory.