Mirror of https://github.com/explosion/spaCy.git (synced 2024-12-24 00:46:28 +03:00)
Simplify language data and revert detailed configs
This commit is contained in:
parent 87737a5a60
commit 38f6ea7a78
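Every hunk below follows the same pattern: the registry-driven language data (a per-language DEFAULT_CONFIG string parsed with thinc's Config, plus @registry.language_data functions) is stripped out, and the same values move back to plain class attributes on a Language.Defaults subclass. A minimal sketch of the before/after shape, condensed from the Afrikaans hunk further down (not the full modules):

# Before: values exposed via the registry and a per-language config string
from typing import Set
from thinc.api import Config
from .stop_words import STOP_WORDS
from ...language import Language
from ...util import registry

DEFAULT_CONFIG = """
[nlp]
lang = "af"
stop_words = {"@language_data": "spacy.af.stop_words"}
"""

@registry.language_data("spacy.af.stop_words")
def stop_words() -> Set[str]:
    return STOP_WORDS

class Afrikaans(Language):
    lang = "af"
    default_config = Config().from_str(DEFAULT_CONFIG)

# After: values live directly on Language.Defaults
from .stop_words import STOP_WORDS
from ...language import Language

class AfrikaansDefaults(Language.Defaults):
    stop_words = STOP_WORDS

class Afrikaans(Language):
    lang = "af"
    Defaults = AfrikaansDefaults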
@@ -1,24 +1,13 @@
[nlp]
lang = null
stop_words = []
lex_attr_getters = {}
vocab_data = {}
get_noun_chunks = null
pipeline = []

[nlp.tokenizer]
@tokenizers = "spacy.Tokenizer.v1"
token_match = null
url_match = {"@language_data": "spacy.xx.url_match"}

[nlp.lemmatizer]
@lemmatizers = "spacy.Lemmatizer.v1"
data = {}

[nlp.writing_system]
direction = "ltr"
has_case = true
has_letters = true

[components]
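For orientation, the DEFAULT_CONFIG strings that appear throughout the hunks below are parsed with thinc's Config, which is what the repeated Config().from_str(DEFAULT_CONFIG) calls do. A small sketch of that round trip, with the block content condensed from the defaults above (illustrative only):

from thinc.api import Config

cfg = Config().from_str("""
[nlp]
lang = null
pipeline = []

[nlp.tokenizer]
@tokenizers = "spacy.Tokenizer.v1"
""")
assert cfg["nlp"]["pipeline"] == []  # values come back as plain Python objects
assert cfg["nlp"]["lang"] is None    # "null" parses to None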
@@ -1,26 +1,14 @@
from typing import Set
from thinc.api import Config

from .stop_words import STOP_WORDS
from ...language import Language
from ...util import registry


DEFAULT_CONFIG = """
[nlp]
lang = "af"
stop_words = {"@language_data": "spacy.af.stop_words"}
"""


@registry.language_data("spacy.af.stop_words")
def stop_words() -> Set[str]:
    return STOP_WORDS
class AfrikaansDefaults(Language.Defaults):
    stop_words = STOP_WORDS


class Afrikaans(Language):
    lang = "af"
    default_config = Config().from_str(DEFAULT_CONFIG)
    Defaults = AfrikaansDefaults


__all__ = ["Afrikaans"]
@@ -1,46 +1,21 @@
from typing import Set, Dict, Callable, Any
from thinc.api import Config

from .stop_words import STOP_WORDS
from .lex_attrs import LEX_ATTRS
from .punctuation import TOKENIZER_SUFFIXES
from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
from ...language import Language
from ...util import registry


DEFAULT_CONFIG = """
[nlp]
lang = "ar"
stop_words = {"@language_data": "spacy.ar.stop_words"}
lex_attr_getters = {"@language_data": "spacy.ar.lex_attr_getters"}

[nlp.writing_system]
direction = "rtl"
has_case = false
has_letters = true
"""


@registry.language_data("spacy.ar.stop_words")
def stop_words() -> Set[str]:
    return STOP_WORDS


@registry.language_data("spacy.ar.lex_attr_getters")
def lex_attr_getters() -> Dict[int, Callable[[str], Any]]:
    return LEX_ATTRS


class ArabicDefaults(Language.Defaults):
    tokenizer_exceptions = TOKENIZER_EXCEPTIONS
    suffixes = TOKENIZER_SUFFIXES
    stop_words = STOP_WORDS
    lex_attr_getters = LEX_ATTRS
    writing_system = {"direction": "rtl", "has_case": False, "has_letters": True}


class Arabic(Language):
    lang = "ar"
    Defaults = ArabicDefaults
    default_config = Config().from_str(DEFAULT_CONFIG)
    lang = "ar"


__all__ = ["Arabic"]
@@ -1,26 +1,14 @@
from typing import Set
from thinc.api import Config

from .stop_words import STOP_WORDS
from ...language import Language
from ...util import registry


DEFAULT_CONFIG = """
[nlp]
lang = "bg"
stop_words = {"@language_data": "spacy.bg.stop_words"}
"""


@registry.language_data("spacy.bg.stop_words")
def stop_words() -> Set[str]:
    return STOP_WORDS
class BulgarianDefaults(Language.Defaults):
    stop_words = STOP_WORDS


class Bulgarian(Language):
    lang = "bg"
    default_config = Config().from_str(DEFAULT_CONFIG)
    Defaults = BulgarianDefaults


__all__ = ["Bulgarian"]
@@ -1,31 +1,7 @@
from typing import Set
from thinc.api import Config

from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
from .punctuation import TOKENIZER_PREFIXES, TOKENIZER_SUFFIXES, TOKENIZER_INFIXES
from .stop_words import STOP_WORDS
from ...language import Language
from ...util import registry


DEFAULT_CONFIG = """
[nlp]
lang = "bn"
stop_words = {"@language_data": "spacy.bn.stop_words"}

[nlp.lemmatizer]
@lemmatizers = "spacy.Lemmatizer.v1"

[nlp.lemmatizer.data]
@language_data = "spacy-lookups-data"
lang = ${nlp:lang}
tables = ["lemma_rules"]
"""


@registry.language_data("spacy.bn.stop_words")
def stop_words() -> Set[str]:
    return STOP_WORDS


class BengaliDefaults(Language.Defaults):

@@ -33,12 +9,12 @@ class BengaliDefaults(Language.Defaults):
    prefixes = TOKENIZER_PREFIXES
    suffixes = TOKENIZER_SUFFIXES
    infixes = TOKENIZER_INFIXES
    stop_words = STOP_WORDS


class Bengali(Language):
    lang = "bn"
    Defaults = BengaliDefaults
    default_config = Config().from_str(DEFAULT_CONFIG)


__all__ = ["Bengali"]
@@ -1,49 +1,20 @@
from typing import Set, Dict, Callable, Any
from thinc.api import Config

from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
from .punctuation import TOKENIZER_INFIXES
from .stop_words import STOP_WORDS
from .lex_attrs import LEX_ATTRS
from ...language import Language
from ...util import registry
from .punctuation import TOKENIZER_INFIXES


DEFAULT_CONFIG = """
[nlp]
lang = "ca"
stop_words = {"@language_data": "spacy.ca.stop_words"}
lex_attr_getters = {"@language_data": "spacy.ca.lex_attr_getters"}

[nlp.lemmatizer]
@lemmatizers = "spacy.Lemmatizer.v1"

[nlp.lemmatizer.data]
@language_data = "spacy-lookups-data"
lang = ${nlp:lang}
tables = ["lemma_lookup"]
"""


@registry.language_data("spacy.ca.stop_words")
def stop_words() -> Set[str]:
    return STOP_WORDS


@registry.language_data("spacy.ca.lex_attr_getters")
def lex_attr_getters() -> Dict[int, Callable[[str], Any]]:
    return LEX_ATTRS


class CatalanDefaults(Language.Defaults):
    tokenizer_exceptions = TOKENIZER_EXCEPTIONS
    infixes = TOKENIZER_INFIXES
    stop_words = STOP_WORDS
    lex_attr_getters = LEX_ATTRS


class Catalan(Language):
    lang = "ca"
    Defaults = CatalanDefaults
    default_config = Config().from_str(DEFAULT_CONFIG)


__all__ = ["Catalan"]
@@ -1,26 +1,14 @@
from typing import Set
from thinc.api import Config

from .stop_words import STOP_WORDS
from ...language import Language
from ...util import registry


DEFAULT_CONFIG = """
[nlp]
lang = "cs"
stop_words = {"@language_data": "spacy.cs.stop_words"}
"""


@registry.language_data("spacy.cs.stop_words")
def stop_words() -> Set[str]:
    return STOP_WORDS
class CzechDefaults(Language.Defaults):
    stop_words = STOP_WORDS


class Czech(Language):
    lang = "cs"
    default_config = Config().from_str(DEFAULT_CONFIG)
    Defaults = CzechDefaults


__all__ = ["Czech"]
@@ -1,55 +1,21 @@
from typing import Set, Dict, Callable, Any
from thinc.api import Config

from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
from .punctuation import TOKENIZER_INFIXES, TOKENIZER_SUFFIXES
from .stop_words import STOP_WORDS
from .lex_attrs import LEX_ATTRS
from ...language import Language
from ...util import registry


DEFAULT_CONFIG = """
[nlp]
lang = "da"
stop_words = {"@language_data": "spacy.da.stop_words"}
lex_attr_getters = {"@language_data": "spacy.da.lex_attr_getters"}

[nlp.lemmatizer]
@lemmatizers = "spacy.Lemmatizer.v1"

[nlp.lemmatizer.data]
@language_data = "spacy-lookups-data"
lang = ${nlp:lang}
tables = ["lemma_lookup"]

[nlp.vocab_data]
@language_data = "spacy-lookups-data"
lang = ${nlp:lang}
tables = ["lexeme_norm"]
"""


@registry.language_data("spacy.da.stop_words")
def stop_words() -> Set[str]:
    return STOP_WORDS


@registry.language_data("spacy.da.lex_attr_getters")
def lex_attr_getters() -> Dict[int, Callable[[str], Any]]:
    return LEX_ATTRS


class DanishDefaults(Language.Defaults):
    tokenizer_exceptions = TOKENIZER_EXCEPTIONS
    infixes = TOKENIZER_INFIXES
    suffixes = TOKENIZER_SUFFIXES
    lex_attr_getters = LEX_ATTRS
    stop_words = STOP_WORDS


class Danish(Language):
    lang = "da"
    Defaults = DanishDefaults
    default_config = Config().from_str(DEFAULT_CONFIG)


__all__ = ["Danish"]
@@ -1,44 +1,8 @@
from typing import Set, Callable
from thinc.api import Config

from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
from .punctuation import TOKENIZER_PREFIXES, TOKENIZER_SUFFIXES
from .punctuation import TOKENIZER_INFIXES
from .punctuation import TOKENIZER_PREFIXES, TOKENIZER_SUFFIXES, TOKENIZER_INFIXES
from .stop_words import STOP_WORDS
from .syntax_iterators import noun_chunks
from .syntax_iterators import SYNTAX_ITERATORS
from ...language import Language
from ...util import registry


DEFAULT_CONFIG = """
[nlp]
lang = "de"
stop_words = {"@language_data": "spacy.de.stop_words"}
get_noun_chunks = {"@language_data": "spacy.de.get_noun_chunks"}

[nlp.lemmatizer]
@lemmatizers = "spacy.Lemmatizer.v1"

[nlp.lemmatizer.data]
@language_data = "spacy-lookups-data"
lang = ${nlp:lang}
tables = ["lemma_lookup"]

[nlp.vocab_data]
@language_data = "spacy-lookups-data"
lang = ${nlp:lang}
tables = ["lexeme_norm", "lexeme_cluster", "lexeme_prob", "lexeme_settings", "orth_variants"]
"""


@registry.language_data("spacy.de.stop_words")
def stop_words() -> Set[str]:
    return STOP_WORDS


@registry.language_data("spacy.de.get_noun_chunks")
def get_noun_chunks() -> Callable:
    return noun_chunks


class GermanDefaults(Language.Defaults):

@@ -46,12 +10,13 @@ class GermanDefaults(Language.Defaults):
    prefixes = TOKENIZER_PREFIXES
    suffixes = TOKENIZER_SUFFIXES
    infixes = TOKENIZER_INFIXES
    syntax_iterators = SYNTAX_ITERATORS
    stop_words = STOP_WORDS


class German(Language):
    lang = "de"
    Defaults = GermanDefaults
    default_config = Config().from_str(DEFAULT_CONFIG)


__all__ = ["German"]
@@ -1,9 +0,0 @@
from typing import Pattern

from .tokenizer_exceptions import URL_MATCH
from ..util import registry


@registry.language_data("spacy.xx.url_match")
def url_match() -> Pattern:
    return URL_MATCH
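From the Greek hunk onward (and again for English, French, Dutch and Polish), the data-taking lemmatizer constructors are replaced by registered factories that receive the nlp object and load their own lookup tables. The new shape, pulled out of the interleaved Greek hunk below for readability (table names as in that hunk):

from typing import Callable

from .lemmatizer import GreekLemmatizer
from ...language import Language
from ...lookups import load_lookups
from ...util import registry


@registry.lemmatizers("spacy.el.GreekLemmatizer")
def create_lemmatizer() -> Callable[[Language], GreekLemmatizer]:
    tables = ["lemma_index", "lemma_exc", "lemma_rules"]

    def lemmatizer_factory(nlp: Language) -> GreekLemmatizer:
        # lookups are loaded lazily, once the nlp object exists
        lookups = load_lookups(lang=nlp.lang, tables=tables)
        return GreekLemmatizer(lookups=lookups)

    return lemmatizer_factory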
@ -1,69 +1,50 @@
|
|||
from typing import Set, Dict, Callable, Any
|
||||
from typing import Callable
|
||||
from thinc.api import Config
|
||||
|
||||
from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
|
||||
from .stop_words import STOP_WORDS
|
||||
from .lex_attrs import LEX_ATTRS
|
||||
from .lemmatizer import GreekLemmatizer
|
||||
from .syntax_iterators import noun_chunks
|
||||
from .syntax_iterators import SYNTAX_ITERATORS
|
||||
from .punctuation import TOKENIZER_PREFIXES, TOKENIZER_SUFFIXES, TOKENIZER_INFIXES
|
||||
from ...lookups import load_lookups
|
||||
from ...language import Language
|
||||
from ...util import registry
|
||||
|
||||
|
||||
DEFAULT_CONFIG = """
|
||||
[nlp]
|
||||
lang = "el"
|
||||
stop_words = {"@language_data": "spacy.el.stop_words"}
|
||||
lex_attr_getters = {"@language_data": "spacy.el.lex_attr_getters"}
|
||||
get_noun_chunks = {"@language_data": "spacy.el.get_noun_chunks"}
|
||||
|
||||
[nlp.lemmatizer]
|
||||
@lemmatizers = "spacy.GreekLemmatizer.v1"
|
||||
|
||||
[nlp.lemmatizer.data]
|
||||
@language_data = "spacy-lookups-data"
|
||||
lang = ${nlp:lang}
|
||||
tables = ["lemma_index", "lemma_exc", "lemma_rules"]
|
||||
|
||||
[nlp.vocab_data]
|
||||
@language_data = "spacy-lookups-data"
|
||||
lang = ${nlp:lang}
|
||||
tables = ["lexeme_norm", "lexeme_prob", "lexeme_settings"]
|
||||
@lemmatizers = "spacy.el.GreekLemmatizer"
|
||||
"""
|
||||
|
||||
|
||||
@registry.lemmatizers("spacy.GreekLemmatizer.v1")
|
||||
def create_greek_lemmatizer(data: Dict[str, dict] = {}) -> GreekLemmatizer:
|
||||
return GreekLemmatizer(data=data)
|
||||
@registry.lemmatizers("spacy.el.GreekLemmatizer")
|
||||
def create_lemmatizer() -> Callable[[Language], GreekLemmatizer]:
|
||||
tables = ["lemma_index", "lemma_exc", "lemma_rules"]
|
||||
|
||||
def lemmatizer_factory(nlp: Language) -> GreekLemmatizer:
|
||||
lookups = load_lookups(lang=nlp.lang, tables=tables)
|
||||
return GreekLemmatizer(lookups=lookups)
|
||||
|
||||
@registry.language_data("spacy.el.get_noun_chunks")
|
||||
def get_noun_chunks() -> Callable:
|
||||
return noun_chunks
|
||||
|
||||
|
||||
@registry.language_data("spacy.el.stop_words")
|
||||
def stop_words() -> Set[str]:
|
||||
return STOP_WORDS
|
||||
|
||||
|
||||
@registry.language_data("spacy.el.lex_attr_getters")
|
||||
def lex_attr_getters() -> Dict[int, Callable[[str], Any]]:
|
||||
return LEX_ATTRS
|
||||
return lemmatizer_factory
|
||||
|
||||
|
||||
class GreekDefaults(Language.Defaults):
|
||||
config = Config().from_str(DEFAULT_CONFIG)
|
||||
tokenizer_exceptions = TOKENIZER_EXCEPTIONS
|
||||
prefixes = TOKENIZER_PREFIXES
|
||||
suffixes = TOKENIZER_SUFFIXES
|
||||
infixes = TOKENIZER_INFIXES
|
||||
lex_attr_getters = LEX_ATTRS
|
||||
stop_words = STOP_WORDS
|
||||
syntax_iterators = SYNTAX_ITERATORS
|
||||
|
||||
|
||||
class Greek(Language):
|
||||
lang = "el"
|
||||
Defaults = GreekDefaults
|
||||
default_config = Config().from_str(DEFAULT_CONFIG)
|
||||
|
||||
|
||||
__all__ = ["Greek"]
|
||||
|
|
|
@ -1,68 +1,49 @@
|
|||
from typing import Set, Dict, Callable, Any
|
||||
from typing import Callable
|
||||
from thinc.api import Config
|
||||
|
||||
from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
|
||||
from .stop_words import STOP_WORDS
|
||||
from .lex_attrs import LEX_ATTRS
|
||||
from .syntax_iterators import noun_chunks
|
||||
from .syntax_iterators import SYNTAX_ITERATORS
|
||||
from .lemmatizer import is_base_form
|
||||
from .punctuation import TOKENIZER_INFIXES
|
||||
from ...language import Language
|
||||
from ...lemmatizer import Lemmatizer
|
||||
from ...lookups import load_lookups
|
||||
from ...util import registry
|
||||
|
||||
|
||||
DEFAULT_CONFIG = """
|
||||
[nlp]
|
||||
lang = "en"
|
||||
stop_words = {"@language_data": "spacy.en.stop_words"}
|
||||
lex_attr_getters = {"@language_data": "spacy.en.lex_attr_getters"}
|
||||
get_noun_chunks = {"@language_data": "spacy.en.get_noun_chunks"}
|
||||
|
||||
[nlp.lemmatizer]
|
||||
@lemmatizers = "spacy.EnglishLemmatizer.v1"
|
||||
|
||||
[nlp.lemmatizer.data]
|
||||
@language_data = "spacy-lookups-data"
|
||||
lang = ${nlp:lang}
|
||||
tables = ["lemma_lookup", "lemma_rules", "lemma_exc", "lemma_index"]
|
||||
|
||||
[nlp.vocab_data]
|
||||
@language_data = "spacy-lookups-data"
|
||||
lang = ${nlp:lang}
|
||||
tables = ["lexeme_norm", "lexeme_cluster", "lexeme_prob", "lexeme_settings", "orth_variants"]
|
||||
@lemmatizers = "spacy.en.EnglishLemmatizer"
|
||||
"""
|
||||
|
||||
|
||||
@registry.language_data("spacy.en.stop_words")
|
||||
def stop_words() -> Set[str]:
|
||||
return STOP_WORDS
|
||||
@registry.lemmatizers("spacy.en.EnglishLemmatizer")
|
||||
def create_lemmatizer() -> Callable[[Language], Lemmatizer]:
|
||||
tables = ["lemma_lookup", "lemma_rules", "lemma_exc", "lemma_index"]
|
||||
|
||||
def lemmatizer_factory(nlp: Language) -> Lemmatizer:
|
||||
lookups = load_lookups(lang=nlp.lang, tables=tables)
|
||||
return Lemmatizer(lookups=lookups, is_base_form=is_base_form)
|
||||
|
||||
@registry.language_data("spacy.en.lex_attr_getters")
|
||||
def lex_attr_getters() -> Dict[int, Callable[[str], Any]]:
|
||||
return LEX_ATTRS
|
||||
|
||||
|
||||
@registry.lemmatizers("spacy.EnglishLemmatizer.v1")
|
||||
def create_lemmatizer(data: Dict[str, dict] = {}) -> "Lemmatizer":
|
||||
return Lemmatizer(data=data, is_base_form=is_base_form)
|
||||
|
||||
|
||||
@registry.language_data("spacy.en.get_noun_chunks")
|
||||
def get_noun_chunks() -> Callable:
|
||||
return noun_chunks
|
||||
return lemmatizer_factory
|
||||
|
||||
|
||||
class EnglishDefaults(Language.Defaults):
|
||||
config = Config().from_str(DEFAULT_CONFIG)
|
||||
tokenizer_exceptions = TOKENIZER_EXCEPTIONS
|
||||
infixes = TOKENIZER_INFIXES
|
||||
lex_attr_getters = LEX_ATTRS
|
||||
syntax_iterators = SYNTAX_ITERATORS
|
||||
stop_words = STOP_WORDS
|
||||
|
||||
|
||||
class English(Language):
|
||||
lang = "en"
|
||||
Defaults = EnglishDefaults
|
||||
default_config = Config().from_str(DEFAULT_CONFIG)
|
||||
|
||||
|
||||
__all__ = ["English"]
|
||||
|
|
|
@ -1,62 +1,23 @@
|
|||
from typing import Set, Dict, Callable, Any
|
||||
from thinc.config import Config
|
||||
|
||||
from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
|
||||
from .stop_words import STOP_WORDS
|
||||
from .lex_attrs import LEX_ATTRS
|
||||
from .syntax_iterators import noun_chunks
|
||||
from .syntax_iterators import SYNTAX_ITERATORS
|
||||
from .punctuation import TOKENIZER_INFIXES, TOKENIZER_SUFFIXES
|
||||
from ...language import Language
|
||||
from ...util import registry
|
||||
|
||||
|
||||
DEFAULT_CONFIG = """
|
||||
[nlp]
|
||||
lang = "es"
|
||||
stop_words = {"@language_data": "spacy.es.stop_words"}
|
||||
lex_attr_getters = {"@language_data": "spacy.es.lex_attr_getters"}
|
||||
get_noun_chunks = {"@language_data": "spacy.es.get_noun_chunks"}
|
||||
|
||||
[nlp.lemmatizer]
|
||||
@lemmatizers = "spacy.Lemmatizer.v1"
|
||||
|
||||
[nlp.lemmatizer.data]
|
||||
@language_data = "spacy-lookups-data"
|
||||
lang = ${nlp:lang}
|
||||
tables = ["lemma_lookup"]
|
||||
|
||||
[nlp.vocab_data]
|
||||
@language_data = "spacy-lookups-data"
|
||||
lang = ${nlp:lang}
|
||||
tables = ["lexeme_cluster", "lexeme_prob", "lexeme_settings"]
|
||||
"""
|
||||
|
||||
|
||||
@registry.language_data("spacy.es.get_noun_chunks")
|
||||
def get_noun_chunks() -> Callable:
|
||||
return noun_chunks
|
||||
|
||||
|
||||
@registry.language_data("spacy.es.stop_words")
|
||||
def stop_words() -> Set[str]:
|
||||
return STOP_WORDS
|
||||
|
||||
|
||||
@registry.language_data("spacy.es.lex_attr_getters")
|
||||
def lex_attr_getters() -> Dict[int, Callable[[str], Any]]:
|
||||
return LEX_ATTRS
|
||||
|
||||
|
||||
class SpanishDefaults(Language.Defaults):
|
||||
tokenizer_exceptions = TOKENIZER_EXCEPTIONS
|
||||
infixes = TOKENIZER_INFIXES
|
||||
suffixes = TOKENIZER_SUFFIXES
|
||||
lex_attr_getters = LEX_ATTRS
|
||||
syntax_iterators = SYNTAX_ITERATORS
|
||||
stop_words = STOP_WORDS
|
||||
|
||||
|
||||
class Spanish(Language):
|
||||
lang = "es"
|
||||
Defaults = SpanishDefaults
|
||||
default_config = Config().from_str(DEFAULT_CONFIG)
|
||||
|
||||
|
||||
__all__ = ["Spanish"]
|
||||
|
|
|
@ -1,26 +1,14 @@
|
|||
from typing import Set
|
||||
from thinc.api import Config
|
||||
|
||||
from .stop_words import STOP_WORDS
|
||||
from ...language import Language
|
||||
from ...util import registry
|
||||
|
||||
|
||||
DEFAULT_CONFIG = """
|
||||
[nlp]
|
||||
lang = "et"
|
||||
stop_words = {"@language_data": "spacy.et.stop_words"}
|
||||
"""
|
||||
|
||||
|
||||
@registry.language_data("spacy.et.stop_words")
|
||||
def stop_words() -> Set[str]:
|
||||
return STOP_WORDS
|
||||
class EstonianDefaults(Language.Defaults):
|
||||
stop_words = STOP_WORDS
|
||||
|
||||
|
||||
class Estonian(Language):
|
||||
lang = "et"
|
||||
default_config = Config().from_str(DEFAULT_CONFIG)
|
||||
Defaults = EstonianDefaults
|
||||
|
||||
|
||||
__all__ = ["Estonian"]
|
||||
|
|
|
@ -1,41 +1,18 @@
|
|||
from typing import Set, Dict, Callable, Any
|
||||
from thinc.api import Config
|
||||
|
||||
from .stop_words import STOP_WORDS
|
||||
from .lex_attrs import LEX_ATTRS
|
||||
from .punctuation import TOKENIZER_SUFFIXES
|
||||
from ..tokenizer_exceptions import BASE_EXCEPTIONS
|
||||
from ...language import Language
|
||||
from ...util import registry
|
||||
|
||||
|
||||
DEFAULT_CONFIG = """
|
||||
[nlp]
|
||||
lang = "eu"
|
||||
stop_words = {"@language_data": "spacy.eu.stop_words"}
|
||||
lex_attr_getters = {"@language_data": "spacy.eu.lex_attr_getters"}
|
||||
"""
|
||||
|
||||
|
||||
@registry.language_data("spacy.eu.stop_words")
|
||||
def stop_words() -> Set[str]:
|
||||
return STOP_WORDS
|
||||
|
||||
|
||||
@registry.language_data("spacy.eu.lex_attr_getters")
|
||||
def lex_attr_getters() -> Dict[int, Callable[[str], Any]]:
|
||||
return LEX_ATTRS
|
||||
|
||||
|
||||
class BasqueDefaults(Language.Defaults):
|
||||
tokenizer_exceptions = BASE_EXCEPTIONS
|
||||
suffixes = TOKENIZER_SUFFIXES
|
||||
stop_words = STOP_WORDS
|
||||
lex_attr_getters = LEX_ATTRS
|
||||
|
||||
|
||||
class Basque(Language):
|
||||
lang = "eu"
|
||||
Defaults = BasqueDefaults
|
||||
default_config = Config().from_str(DEFAULT_CONFIG)
|
||||
|
||||
|
||||
__all__ = ["Basque"]
|
||||
|
|
|
@ -1,61 +1,23 @@
|
|||
from typing import Set, Dict, Callable, Any
|
||||
from thinc.api import Config
|
||||
|
||||
from ...language import Language
|
||||
from ...util import registry
|
||||
from .stop_words import STOP_WORDS
|
||||
from .lex_attrs import LEX_ATTRS
|
||||
from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
|
||||
from .punctuation import TOKENIZER_SUFFIXES
|
||||
from .syntax_iterators import noun_chunks
|
||||
|
||||
|
||||
DEFAULT_CONFIG = """
|
||||
[nlp]
|
||||
lang = "fa"
|
||||
stop_words = {"@language_data": "spacy.fa.stop_words"}
|
||||
lex_attr_getters = {"@language_data": "spacy.fa.lex_attr_getters"}
|
||||
get_noun_chunks = {"@language_data": "spacy.de.get_noun_chunks"}
|
||||
|
||||
[nlp.writing_system]
|
||||
direction = "rtl"
|
||||
has_case = false
|
||||
has_letters = true
|
||||
|
||||
[nlp.lemmatizer]
|
||||
@lemmatizers = "spacy.Lemmatizer.v1"
|
||||
|
||||
[nlp.lemmatizer.data]
|
||||
@language_data = "spacy-lookups-data"
|
||||
lang = ${nlp:lang}
|
||||
tables = ["lemma_rules", "lemma_index", "lemma_exc"]
|
||||
"""
|
||||
|
||||
|
||||
@registry.language_data("spacy.fa.stop_words")
|
||||
def stop_words() -> Set[str]:
|
||||
return STOP_WORDS
|
||||
|
||||
|
||||
@registry.language_data("spacy.fa.lex_attr_getters")
|
||||
def lex_attr_getters() -> Dict[int, Callable[[str], Any]]:
|
||||
return LEX_ATTRS
|
||||
|
||||
|
||||
@registry.language_data("spacy.fa.get_noun_chunks")
|
||||
def get_noun_chunks() -> Callable:
|
||||
return noun_chunks
|
||||
from .syntax_iterators import SYNTAX_ITERATORS
|
||||
from ...language import Language
|
||||
|
||||
|
||||
class PersianDefaults(Language.Defaults):
|
||||
tokenizer_exceptions = TOKENIZER_EXCEPTIONS
|
||||
suffixes = TOKENIZER_SUFFIXES
|
||||
lex_attr_getters = LEX_ATTRS
|
||||
syntax_iterators = SYNTAX_ITERATORS
|
||||
stop_words = STOP_WORDS
|
||||
writing_system = {"direction": "rtl", "has_case": False, "has_letters": True}
|
||||
|
||||
|
||||
class Persian(Language):
|
||||
lang = "fa"
|
||||
Defaults = PersianDefaults
|
||||
default_config = Config().from_str(DEFAULT_CONFIG)
|
||||
|
||||
|
||||
__all__ = ["Persian"]
|
||||
|
|
|
@ -1,42 +1,21 @@
|
|||
from typing import Set, Dict, Callable, Any
|
||||
from thinc.api import Config
|
||||
|
||||
from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
|
||||
from .stop_words import STOP_WORDS
|
||||
from .lex_attrs import LEX_ATTRS
|
||||
from .punctuation import TOKENIZER_INFIXES, TOKENIZER_SUFFIXES
|
||||
from ...language import Language
|
||||
from ...util import registry
|
||||
|
||||
|
||||
DEFAULT_CONFIG = """
|
||||
[nlp]
|
||||
lang = "fi"
|
||||
stop_words = {"@language_data": "spacy.fi.stop_words"}
|
||||
lex_attr_getters = {"@language_data": "spacy.fi.lex_attr_getters"}
|
||||
"""
|
||||
|
||||
|
||||
@registry.language_data("spacy.fi.stop_words")
|
||||
def stop_words() -> Set[str]:
|
||||
return STOP_WORDS
|
||||
|
||||
|
||||
@registry.language_data("spacy.fi.lex_attr_getters")
|
||||
def lex_attr_getters() -> Dict[int, Callable[[str], Any]]:
|
||||
return LEX_ATTRS
|
||||
|
||||
|
||||
class FinnishDefaults(Language.Defaults):
|
||||
infixes = TOKENIZER_INFIXES
|
||||
suffixes = TOKENIZER_SUFFIXES
|
||||
tokenizer_exceptions = TOKENIZER_EXCEPTIONS
|
||||
lex_attr_getters = LEX_ATTRS
|
||||
stop_words = STOP_WORDS
|
||||
|
||||
|
||||
class Finnish(Language):
|
||||
lang = "fi"
|
||||
Defaults = FinnishDefaults
|
||||
default_config = Config().from_str(DEFAULT_CONFIG)
|
||||
|
||||
|
||||
__all__ = ["Finnish"]
|
||||
|
|
|
@ -1,4 +1,4 @@
|
|||
from typing import Set, Dict, Callable, Any, Pattern
|
||||
from typing import Callable
|
||||
from thinc.api import Config
|
||||
|
||||
from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS, TOKEN_MATCH
|
||||
|
@ -6,69 +6,47 @@ from .punctuation import TOKENIZER_PREFIXES, TOKENIZER_INFIXES
|
|||
from .punctuation import TOKENIZER_SUFFIXES
|
||||
from .stop_words import STOP_WORDS
|
||||
from .lex_attrs import LEX_ATTRS
|
||||
from .syntax_iterators import SYNTAX_ITERATORS
|
||||
from .lemmatizer import FrenchLemmatizer, is_base_form
|
||||
from .syntax_iterators import noun_chunks
|
||||
from ...lookups import load_lookups
|
||||
from ...language import Language
|
||||
from ...util import registry
|
||||
|
||||
|
||||
DEFAULT_CONFIG = """
|
||||
[nlp]
|
||||
lang = "fr"
|
||||
stop_words = {"@language_data": "spacy.fr.stop_words"}
|
||||
lex_attr_getters = {"@language_data": "spacy.fr.lex_attr_getters"}
|
||||
get_noun_chunks = {"@language_data": "spacy.fr.get_noun_chunks"}
|
||||
|
||||
[nlp.tokenizer]
|
||||
@tokenizers = "spacy.Tokenizer.v1"
|
||||
token_match = {"@language_data": "spacy.fr.token_match"}
|
||||
|
||||
[nlp.lemmatizer]
|
||||
@lemmatizers = "spacy.FrenchLemmatizer.v1"
|
||||
|
||||
[nlp.lemmatizer.data]
|
||||
@language_data = "spacy-lookups-data"
|
||||
lang = ${nlp:lang}
|
||||
tables = ["lemma_rules", "lemma_index", "lemma_exc", "lemma_lookup"]
|
||||
@lemmatizers = "spacy.fr.FrenchLemmatizer"
|
||||
"""
|
||||
|
||||
|
||||
@registry.lemmatizers("spacy.FrenchLemmatizer.v1")
|
||||
def create_french_lemmatizer(data: Dict[str, dict] = {}) -> FrenchLemmatizer:
|
||||
return FrenchLemmatizer(data=data, is_base_form=is_base_form)
|
||||
@registry.lemmatizers("spacy.fr.FrenchLemmatizer")
|
||||
def create_lemmatizer() -> Callable[[Language], FrenchLemmatizer]:
|
||||
tables = ["lemma_rules", "lemma_index", "lemma_exc", "lemma_lookup"]
|
||||
|
||||
def lemmatizer_factory(nlp: Language) -> FrenchLemmatizer:
|
||||
lookups = load_lookups(lang=nlp.lang, tables=tables)
|
||||
return FrenchLemmatizer(lookups=lookups, is_base_form=is_base_form)
|
||||
|
||||
@registry.language_data("spacy.fr.token_match")
|
||||
def token_match() -> Pattern:
|
||||
return TOKEN_MATCH
|
||||
|
||||
|
||||
@registry.language_data("spacy.fr.stop_words")
|
||||
def stop_words() -> Set[str]:
|
||||
return STOP_WORDS
|
||||
|
||||
|
||||
@registry.language_data("spacy.fr.lex_attr_getters")
|
||||
def lex_attr_getters() -> Dict[int, Callable[[str], Any]]:
|
||||
return LEX_ATTRS
|
||||
|
||||
|
||||
@registry.language_data("spacy.fr.get_noun_chunks")
|
||||
def get_noun_chunks() -> Callable:
|
||||
return noun_chunks
|
||||
return lemmatizer_factory
|
||||
|
||||
|
||||
class FrenchDefaults(Language.Defaults):
|
||||
config = Config().from_str(DEFAULT_CONFIG)
|
||||
tokenizer_exceptions = TOKENIZER_EXCEPTIONS
|
||||
prefixes = TOKENIZER_PREFIXES
|
||||
infixes = TOKENIZER_INFIXES
|
||||
suffixes = TOKENIZER_SUFFIXES
|
||||
token_match = TOKEN_MATCH
|
||||
lex_attr_getters = LEX_ATTRS
|
||||
syntax_iterators = SYNTAX_ITERATORS
|
||||
stop_words = STOP_WORDS
|
||||
|
||||
|
||||
class French(Language):
|
||||
lang = "fr"
|
||||
Defaults = FrenchDefaults
|
||||
default_config = Config().from_str(DEFAULT_CONFIG)
|
||||
|
||||
|
||||
__all__ = ["French"]
|
||||
|
|
|
@ -1,32 +1,16 @@
|
|||
from typing import Set
|
||||
from thinc.api import Config
|
||||
|
||||
from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
|
||||
from .stop_words import STOP_WORDS
|
||||
from ...language import Language
|
||||
from ...util import registry
|
||||
|
||||
|
||||
DEFAULT_CONFIG = """
|
||||
[nlp]
|
||||
lang = "ga"
|
||||
stop_words = {"@language_data": "spacy.ga.stop_words"}
|
||||
"""
|
||||
|
||||
|
||||
@registry.language_data("spacy.ga.stop_words")
|
||||
def stop_words() -> Set[str]:
|
||||
return STOP_WORDS
|
||||
|
||||
|
||||
class IrishDefaults(Language.Defaults):
|
||||
tokenizer_exceptions = TOKENIZER_EXCEPTIONS
|
||||
stop_words = STOP_WORDS
|
||||
|
||||
|
||||
class Irish(Language):
|
||||
lang = "ga"
|
||||
Defaults = IrishDefaults
|
||||
default_config = Config().from_str(DEFAULT_CONFIG)
|
||||
|
||||
|
||||
__all__ = ["Irish"]
|
||||
|
|
|
@ -1,26 +1,14 @@
|
|||
from typing import Set
|
||||
from thinc.api import Config
|
||||
|
||||
from .stop_words import STOP_WORDS
|
||||
from ...language import Language
|
||||
from ...util import registry
|
||||
|
||||
|
||||
DEFAULT_CONFIG = """
|
||||
[nlp]
|
||||
lang = "gu"
|
||||
stop_words = {"@language_data": "spacy.gu.stop_words"}
|
||||
"""
|
||||
|
||||
|
||||
@registry.language_data("spacy.gu.stop_words")
|
||||
def stop_words() -> Set[str]:
|
||||
return STOP_WORDS
|
||||
class GujaratiDefaults(Language.Defaults):
|
||||
stop_words = STOP_WORDS
|
||||
|
||||
|
||||
class Gujarati(Language):
|
||||
lang = "gu"
|
||||
default_config = Config().from_str(DEFAULT_CONFIG)
|
||||
Defaults = GujaratiDefaults
|
||||
|
||||
|
||||
__all__ = ["Gujarati"]
|
||||
|
|
|
@ -1,37 +1,15 @@
|
|||
from typing import Set
|
||||
from thinc.api import Config
|
||||
|
||||
from .stop_words import STOP_WORDS
|
||||
from ..tokenizer_exceptions import BASE_EXCEPTIONS
|
||||
from ...language import Language
|
||||
from ...util import registry
|
||||
|
||||
|
||||
DEFAULT_CONFIG = """
|
||||
[nlp]
|
||||
lang = "he"
|
||||
stop_words = {"@language_data": "spacy.he.stop_words"}
|
||||
|
||||
[nlp.writing_system]
|
||||
direction = "rtl"
|
||||
has_case = false
|
||||
has_letters = true
|
||||
"""
|
||||
|
||||
|
||||
@registry.language_data("spacy.he.stop_words")
|
||||
def stop_words() -> Set[str]:
|
||||
return STOP_WORDS
|
||||
|
||||
|
||||
class HebrewDefaults(Language.Defaults):
|
||||
tokenizer_exceptions = BASE_EXCEPTIONS
|
||||
stop_words = STOP_WORDS
|
||||
writing_system = {"direction": "rtl", "has_case": False, "has_letters": True}
|
||||
|
||||
|
||||
class Hebrew(Language):
|
||||
lang = "he"
|
||||
Defaults = HebrewDefaults
|
||||
default_config = Config().from_str(DEFAULT_CONFIG)
|
||||
|
||||
|
||||
__all__ = ["Hebrew"]
|
||||
|
|
|
@ -1,33 +1,16 @@
|
|||
from typing import Set, Dict, Callable, Any
|
||||
from thinc.api import Config
|
||||
|
||||
from .stop_words import STOP_WORDS
|
||||
from .lex_attrs import LEX_ATTRS
|
||||
from ...language import Language
|
||||
from ...util import registry
|
||||
|
||||
|
||||
DEFAULT_CONFIG = """
|
||||
[nlp]
|
||||
lang = "hi"
|
||||
stop_words = {"@language_data": "spacy.hi.stop_words"}
|
||||
lex_attr_getters = {"@language_data": "spacy.hi.lex_attr_getters"}
|
||||
"""
|
||||
|
||||
|
||||
@registry.language_data("spacy.hi.stop_words")
|
||||
def stop_words() -> Set[str]:
|
||||
return STOP_WORDS
|
||||
|
||||
|
||||
@registry.language_data("spacy.hi.lex_attr_getters")
|
||||
def lex_attr_getters() -> Dict[int, Callable[[str], Any]]:
|
||||
return LEX_ATTRS
|
||||
class HindiDefaults(Language.Defaults):
|
||||
stop_words = STOP_WORDS
|
||||
lex_attr_getters = LEX_ATTRS
|
||||
|
||||
|
||||
class Hindi(Language):
|
||||
lang = "hi"
|
||||
default_config = Config().from_str(DEFAULT_CONFIG)
|
||||
Defaults = HindiDefaults
|
||||
|
||||
|
||||
__all__ = ["Hindi"]
|
||||
|
|
|
@ -1,40 +1,14 @@
|
|||
from typing import Set
|
||||
from thinc.api import Config
|
||||
|
||||
from .stop_words import STOP_WORDS
|
||||
from ..tokenizer_exceptions import BASE_EXCEPTIONS
|
||||
from ...language import Language
|
||||
from ...util import registry
|
||||
|
||||
|
||||
DEFAULT_CONFIG = """
|
||||
[nlp]
|
||||
lang = "hr"
|
||||
stop_words = {"@language_data": "spacy.hr.stop_words"}
|
||||
|
||||
[nlp.lemmatizer]
|
||||
@lemmatizers = "spacy.Lemmatizer.v1"
|
||||
|
||||
[nlp.lemmatizer.data]
|
||||
@language_data = "spacy-lookups-data"
|
||||
lang = ${nlp:lang}
|
||||
tables = ["lemma_lookup"]
|
||||
"""
|
||||
|
||||
|
||||
@registry.language_data("spacy.hr.stop_words")
|
||||
def stop_words() -> Set[str]:
|
||||
return STOP_WORDS
|
||||
|
||||
|
||||
class CroatianDefaults(Language.Defaults):
|
||||
tokenizer_exceptions = BASE_EXCEPTIONS
|
||||
stop_words = STOP_WORDS
|
||||
|
||||
|
||||
class Croatian(Language):
|
||||
lang = "hr"
|
||||
Defaults = CroatianDefaults
|
||||
default_config = Config().from_str(DEFAULT_CONFIG)
|
||||
|
||||
|
||||
__all__ = ["Croatian"]
|
||||
|
|
|
@ -1,40 +1,7 @@
|
|||
from typing import Set, Pattern
|
||||
from thinc.api import Config
|
||||
|
||||
from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS, TOKEN_MATCH
|
||||
from .punctuation import TOKENIZER_PREFIXES, TOKENIZER_SUFFIXES, TOKENIZER_INFIXES
|
||||
from .stop_words import STOP_WORDS
|
||||
from ...language import Language
|
||||
from ...util import registry
|
||||
|
||||
|
||||
DEFAULT_CONFIG = """
|
||||
[nlp]
|
||||
lang = "hu"
|
||||
stop_words = {"@language_data": "spacy.hu.stop_words"}
|
||||
|
||||
[nlp.tokenizer]
|
||||
@tokenizers = "spacy.Tokenizer.v1"
|
||||
token_match = {"@language_data": "spacy.hu.token_match"}
|
||||
|
||||
[nlp.lemmatizer]
|
||||
@lemmatizers = "spacy.Lemmatizer.v1"
|
||||
|
||||
[nlp.lemmatizer.data]
|
||||
@language_data = "spacy-lookups-data"
|
||||
lang = ${nlp:lang}
|
||||
tables = ["lemma_lookup"]
|
||||
"""
|
||||
|
||||
|
||||
@registry.language_data("spacy.hu.stop_words")
|
||||
def stop_words() -> Set[str]:
|
||||
return STOP_WORDS
|
||||
|
||||
|
||||
@registry.language_data("spacy.hu.token_match")
|
||||
def token_match() -> Pattern:
|
||||
return TOKEN_MATCH
|
||||
|
||||
|
||||
class HungarianDefaults(Language.Defaults):
|
||||
|
@ -42,12 +9,13 @@ class HungarianDefaults(Language.Defaults):
|
|||
prefixes = TOKENIZER_PREFIXES
|
||||
suffixes = TOKENIZER_SUFFIXES
|
||||
infixes = TOKENIZER_INFIXES
|
||||
token_match = TOKEN_MATCH
|
||||
stop_words = STOP_WORDS
|
||||
|
||||
|
||||
class Hungarian(Language):
|
||||
lang = "hu"
|
||||
Defaults = HungarianDefaults
|
||||
default_config = Config().from_str(DEFAULT_CONFIG)
|
||||
|
||||
|
||||
__all__ = ["Hungarian"]
|
||||
|
|
|
@ -1,33 +1,16 @@
|
|||
from typing import Set, Dict, Callable, Any
|
||||
from thinc.api import Config
|
||||
|
||||
from .stop_words import STOP_WORDS
|
||||
from .lex_attrs import LEX_ATTRS
|
||||
from ...language import Language
|
||||
from ...util import registry
|
||||
|
||||
|
||||
DEFAULT_CONFIG = """
|
||||
[nlp]
|
||||
lang = "hy"
|
||||
stop_words = {"@language_data": "spacy.hy.stop_words"}
|
||||
lex_attr_getters = {"@language_data": "spacy.hy.lex_attr_getters"}
|
||||
"""
|
||||
|
||||
|
||||
@registry.language_data("spacy.hy.stop_words")
|
||||
def stop_words() -> Set[str]:
|
||||
return STOP_WORDS
|
||||
|
||||
|
||||
@registry.language_data("spacy.hy.lex_attr_getters")
|
||||
def lex_attr_getters() -> Dict[int, Callable[[str], Any]]:
|
||||
return LEX_ATTRS
|
||||
class ArmenianDefaults(Language.Defaults):
|
||||
lex_attr_getters = LEX_ATTRS
|
||||
stop_words = STOP_WORDS
|
||||
|
||||
|
||||
class Armenian(Language):
|
||||
lang = "hy"
|
||||
default_config = Config().from_str(DEFAULT_CONFIG)
|
||||
Defaults = ArmenianDefaults
|
||||
|
||||
|
||||
__all__ = ["Armenian"]
|
||||
|
|
|
@ -1,50 +1,9 @@
|
|||
from typing import Set, Dict, Callable, Any
|
||||
from thinc.config import Config
|
||||
|
||||
from .stop_words import STOP_WORDS
|
||||
from .punctuation import TOKENIZER_SUFFIXES, TOKENIZER_PREFIXES, TOKENIZER_INFIXES
|
||||
from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
|
||||
from .lex_attrs import LEX_ATTRS
|
||||
from .syntax_iterators import noun_chunks
|
||||
from .syntax_iterators import SYNTAX_ITERATORS
|
||||
from ...language import Language
|
||||
from ...util import registry
|
||||
|
||||
|
||||
DEFAULT_CONFIG = """
|
||||
[nlp]
|
||||
lang = "id"
|
||||
stop_words = {"@language_data": "spacy.id.stop_words"}
|
||||
lex_attr_getters = {"@language_data": "spacy.id.lex_attr_getters"}
|
||||
get_noun_chunks = {"@language_data": "spacy.id.get_noun_chunks"}
|
||||
|
||||
[nlp.lemmatizer]
|
||||
@lemmatizers = "spacy.Lemmatizer.v1"
|
||||
|
||||
[nlp.lemmatizer.data]
|
||||
@language_data = "spacy-lookups-data"
|
||||
lang = ${nlp:lang}
|
||||
tables = ["lemma_lookup"]
|
||||
|
||||
[nlp.vocab_data]
|
||||
@language_data = "spacy-lookups-data"
|
||||
lang = ${nlp:lang}
|
||||
tables = ["lexeme_norm"]
|
||||
"""
|
||||
|
||||
|
||||
@registry.language_data("spacy.id.stop_words")
|
||||
def stop_words() -> Set[str]:
|
||||
return STOP_WORDS
|
||||
|
||||
|
||||
@registry.language_data("spacy.id.lex_attr_getters")
|
||||
def lex_attr_getters() -> Dict[int, Callable[[str], Any]]:
|
||||
return LEX_ATTRS
|
||||
|
||||
|
||||
@registry.language_data("spacy.id.get_noun_chunks")
|
||||
def get_noun_chunks() -> Callable:
|
||||
return noun_chunks
|
||||
|
||||
|
||||
class IndonesianDefaults(Language.Defaults):
|
||||
|
@ -52,12 +11,14 @@ class IndonesianDefaults(Language.Defaults):
|
|||
prefixes = TOKENIZER_PREFIXES
|
||||
suffixes = TOKENIZER_SUFFIXES
|
||||
infixes = TOKENIZER_INFIXES
|
||||
syntax_iterators = SYNTAX_ITERATORS
|
||||
lex_attr_getters = LEX_ATTRS
|
||||
stop_words = STOP_WORDS
|
||||
|
||||
|
||||
class Indonesian(Language):
|
||||
lang = "id"
|
||||
Defaults = IndonesianDefaults
|
||||
default_config = Config().from_str(DEFAULT_CONFIG)
|
||||
|
||||
|
||||
__all__ = ["Indonesian"]
|
||||
|
|
|
@ -1,26 +1,14 @@
|
|||
from typing import Set
|
||||
from thinc.api import Config
|
||||
|
||||
from .stop_words import STOP_WORDS
|
||||
from ...language import Language
|
||||
from ...util import registry
|
||||
|
||||
|
||||
DEFAULT_CONFIG = """
|
||||
[nlp]
|
||||
lang = "is"
|
||||
stop_words = {"@language_data": "spacy.is.stop_words"}
|
||||
"""
|
||||
|
||||
|
||||
@registry.language_data("spacy.is.stop_words")
|
||||
def stop_words() -> Set[str]:
|
||||
return STOP_WORDS
|
||||
class IcelandicDefaults(Language.Defaults):
|
||||
stop_words = STOP_WORDS
|
||||
|
||||
|
||||
class Icelandic(Language):
|
||||
lang = "is"
|
||||
default_config = Config().from_str(DEFAULT_CONFIG)
|
||||
Defaults = IcelandicDefaults
|
||||
|
||||
|
||||
__all__ = ["Icelandic"]
|
||||
|
|
|
@ -1,31 +1,7 @@
|
|||
from typing import Set
|
||||
from thinc.api import Config
|
||||
|
||||
from .stop_words import STOP_WORDS
|
||||
from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
|
||||
from .punctuation import TOKENIZER_PREFIXES, TOKENIZER_INFIXES
|
||||
from ...language import Language
|
||||
from ...util import registry
|
||||
|
||||
|
||||
DEFAULT_CONFIG = """
|
||||
[nlp]
|
||||
lang = "it"
|
||||
stop_words = {"@language_data": "spacy.it.stop_words"}
|
||||
|
||||
[nlp.lemmatizer]
|
||||
@lemmatizers = "spacy.Lemmatizer.v1"
|
||||
|
||||
[nlp.lemmatizer.data]
|
||||
@language_data = "spacy-lookups-data"
|
||||
lang = ${nlp:lang}
|
||||
tables = ["lemma_lookup"]
|
||||
"""
|
||||
|
||||
|
||||
@registry.language_data("spacy.it.stop_words")
|
||||
def stop_words() -> Set[str]:
|
||||
return STOP_WORDS
|
||||
|
||||
|
||||
class ItalianDefaults(Language.Defaults):
|
||||
|
@ -38,7 +14,6 @@ class ItalianDefaults(Language.Defaults):
|
|||
class Italian(Language):
|
||||
lang = "it"
|
||||
Defaults = ItalianDefaults
|
||||
default_config = Config().from_str(DEFAULT_CONFIG)
|
||||
|
||||
|
||||
__all__ = ["Italian"]
|
||||
|
|
|
@ -1,11 +1,11 @@
|
|||
from typing import Optional, Union, Dict, Any, Set, Callable
|
||||
from typing import Optional, Union, Dict, Any
|
||||
from pathlib import Path
|
||||
import srsly
|
||||
from collections import namedtuple
|
||||
from thinc.api import Config
|
||||
|
||||
from .stop_words import STOP_WORDS
|
||||
from .syntax_iterators import noun_chunks
|
||||
from .syntax_iterators import SYNTAX_ITERATORS
|
||||
from .tag_map import TAG_MAP
|
||||
from .tag_orth_map import TAG_ORTH_MAP
|
||||
from .tag_bigram_map import TAG_BIGRAM_MAP
|
||||
|
@ -20,33 +20,15 @@ from ... import util
|
|||
|
||||
DEFAULT_CONFIG = """
|
||||
[nlp]
|
||||
lang = "ja"
|
||||
stop_words = {"@language_data": "spacy.ja.stop_words"}
|
||||
get_noun_chunks = {"@language_data": "spacy.ja.get_noun_chunks"}
|
||||
|
||||
[nlp.tokenizer]
|
||||
@tokenizers = "spacy.JapaneseTokenizer.v1"
|
||||
@tokenizers = "spacy.ja.JapaneseTokenizer"
|
||||
split_mode = null
|
||||
|
||||
[nlp.writing_system]
|
||||
direction = "ltr"
|
||||
has_case = false
|
||||
has_letters = false
|
||||
"""
|
||||
|
||||
|
||||
@registry.language_data("spacy.ja.stop_words")
|
||||
def stop_words() -> Set[str]:
|
||||
return STOP_WORDS
|
||||
|
||||
|
||||
@registry.language_data("spacy.ja.get_noun_chunks")
|
||||
def get_noun_chunks() -> Callable:
|
||||
return noun_chunks
|
||||
|
||||
|
||||
@registry.tokenizers("spacy.JapaneseTokenizer.v1")
|
||||
def create_japanese_tokenizer(split_mode: Optional[str] = None):
|
||||
@registry.tokenizers("spacy.ja.JapaneseTokenizer")
|
||||
def create_tokenizer(split_mode: Optional[str] = None):
|
||||
def japanese_tokenizer_factory(nlp):
|
||||
return JapaneseTokenizer(nlp, split_mode=split_mode)
|
||||
|
||||
|
@ -179,9 +161,16 @@ class JapaneseTokenizer(DummyTokenizer):
|
|||
return self
|
||||
|
||||
|
||||
class JapaneseDefaults(Language.Defaults):
|
||||
config = Config().from_str(DEFAULT_CONFIG)
|
||||
stop_words = STOP_WORDS
|
||||
syntax_iterators = SYNTAX_ITERATORS
|
||||
writing_system = {"direction": "ltr", "has_case": False, "has_letters": False}
|
||||
|
||||
|
||||
class Japanese(Language):
|
||||
lang = "ja"
|
||||
default_config = Config().from_str(DEFAULT_CONFIG)
|
||||
Defaults = JapaneseDefaults
|
||||
|
||||
|
||||
# Hold the attributes we need with convenient names
|
||||
|
|
|
@ -1,26 +1,14 @@
|
|||
from typing import Set
|
||||
from thinc.api import Config
|
||||
|
||||
from .stop_words import STOP_WORDS
|
||||
from ...language import Language
|
||||
from ...util import registry
|
||||
|
||||
|
||||
DEFAULT_CONFIG = """
|
||||
[nlp]
|
||||
lang = "kn"
|
||||
stop_words = {"@language_data": "spacy.kn.stop_words"}
|
||||
"""
|
||||
|
||||
|
||||
@registry.language_data("spacy.kn.stop_words")
|
||||
def stop_words() -> Set[str]:
|
||||
return STOP_WORDS
|
||||
class KannadaDefaults(Language.Defaults):
|
||||
stop_words = STOP_WORDS
|
||||
|
||||
|
||||
class Kannada(Language):
|
||||
lang = "kn"
|
||||
default_config = Config().from_str(DEFAULT_CONFIG)
|
||||
Defaults = KannadaDefaults
|
||||
|
||||
|
||||
__all__ = ["Kannada"]
|
||||
|
|
|
@ -1,4 +1,4 @@
|
|||
from typing import Set, Optional, Any, Dict
|
||||
from typing import Optional, Any, Dict
|
||||
from thinc.api import Config
|
||||
|
||||
from .stop_words import STOP_WORDS
|
||||
|
@ -11,26 +11,14 @@ from ...util import DummyTokenizer, registry
|
|||
|
||||
DEFAULT_CONFIG = """
|
||||
[nlp]
|
||||
lang = "ko"
|
||||
stop_words = {"@language_data": "spacy.ko.stop_words"}
|
||||
|
||||
[nlp.tokenizer]
|
||||
@tokenizers = "spacy.KoreanTokenizer.v1"
|
||||
|
||||
[nlp.writing_system]
|
||||
direction = "ltr"
|
||||
has_case = false
|
||||
has_letters = false
|
||||
@tokenizers = "spacy.ko.KoreanTokenizer"
|
||||
"""
|
||||
|
||||
|
||||
@registry.language_data("spacy.ko.stop_words")
|
||||
def stop_words() -> Set[str]:
|
||||
return STOP_WORDS
|
||||
|
||||
|
||||
@registry.tokenizers("spacy.KoreanTokenizer.v1")
|
||||
def create_korean_tokenizer():
|
||||
@registry.tokenizers("spacy.ko.KoreanTokenizer")
|
||||
def create_tokenizer():
|
||||
def korean_tokenizer_factory(nlp):
|
||||
return KoreanTokenizer(nlp)
|
||||
|
||||
|
@ -74,9 +62,15 @@ class KoreanTokenizer(DummyTokenizer):
|
|||
yield {"surface": surface, "lemma": lemma, "tag": tag}
|
||||
|
||||
|
||||
class KoreanDefaults(Language.Defaults):
|
||||
config = Config().from_str(DEFAULT_CONFIG)
|
||||
stop_words = STOP_WORDS
|
||||
writing_system = {"direction": "ltr", "has_case": False, "has_letters": False}
|
||||
|
||||
|
||||
class Korean(Language):
|
||||
lang = "ko"
|
||||
default_config = Config().from_str(DEFAULT_CONFIG)
|
||||
Defaults = KoreanDefaults
|
||||
|
||||
|
||||
def try_mecab_import() -> None:
|
||||
|
|
|
@ -1,54 +1,20 @@
|
|||
from typing import Set, Dict, Callable, Any
|
||||
from thinc.api import Config
|
||||
|
||||
from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
|
||||
from .punctuation import TOKENIZER_INFIXES
|
||||
from .lex_attrs import LEX_ATTRS
|
||||
from .stop_words import STOP_WORDS
|
||||
from ...language import Language
|
||||
from ...util import registry
|
||||
|
||||
|
||||
DEFAULT_CONFIG = """
|
||||
[nlp]
|
||||
lang = "lb"
|
||||
stop_words = {"@language_data": "spacy.lb.stop_words"}
|
||||
lex_attr_getters = {"@language_data": "spacy.lb.lex_attr_getters"}
|
||||
|
||||
[nlp.lemmatizer]
|
||||
@lemmatizers = "spacy.Lemmatizer.v1"
|
||||
|
||||
[nlp.lemmatizer.data]
|
||||
@language_data = "spacy-lookups-data"
|
||||
lang = ${nlp:lang}
|
||||
tables = ["lemma_lookup"]
|
||||
|
||||
[nlp.vocab_data]
|
||||
@language_data = "spacy-lookups-data"
|
||||
lang = ${nlp:lang}
|
||||
tables = ["lexeme_norm"]
|
||||
"""
|
||||
|
||||
|
||||
@registry.language_data("spacy.lb.stop_words")
|
||||
def stop_words() -> Set[str]:
|
||||
return STOP_WORDS
|
||||
|
||||
|
||||
@registry.language_data("spacy.lb.lex_attr_getters")
|
||||
def lex_attr_getters() -> Dict[int, Callable[[str], Any]]:
|
||||
return LEX_ATTRS
|
||||
|
||||
|
||||
class LuxembourgishDefaults(Language.Defaults):
|
||||
tokenizer_exceptions = TOKENIZER_EXCEPTIONS
|
||||
infixes = TOKENIZER_INFIXES
|
||||
lex_attr_getters = LEX_ATTRS
|
||||
stop_words = STOP_WORDS
|
||||
|
||||
|
||||
class Luxembourgish(Language):
|
||||
lang = "lb"
|
||||
Defaults = LuxembourgishDefaults
|
||||
default_config = Config().from_str(DEFAULT_CONFIG)
|
||||
|
||||
|
||||
__all__ = ["Luxembourgish"]
|
||||
|
|
|
@ -1,34 +1,18 @@
|
|||
from typing import Set
|
||||
from thinc.api import Config
|
||||
|
||||
from .stop_words import STOP_WORDS
|
||||
from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
|
||||
from .punctuation import TOKENIZER_INFIXES
|
||||
from ...language import Language
|
||||
from ...util import registry
|
||||
|
||||
|
||||
DEFAULT_CONFIG = """
|
||||
[nlp]
|
||||
lang = "lij"
|
||||
stop_words = {"@language_data": "spacy.lij.stop_words"}
|
||||
"""
|
||||
|
||||
|
||||
@registry.language_data("spacy.lij.stop_words")
|
||||
def stop_words() -> Set[str]:
|
||||
return STOP_WORDS
|
||||
|
||||
|
||||
class LigurianDefaults(Language.Defaults):
|
||||
tokenizer_exceptions = TOKENIZER_EXCEPTIONS
|
||||
infixes = TOKENIZER_INFIXES
|
||||
stop_words = STOP_WORDS
|
||||
|
||||
|
||||
class Ligurian(Language):
|
||||
lang = "lij"
|
||||
Defaults = LigurianDefaults
|
||||
default_config = Config().from_str(DEFAULT_CONFIG)
|
||||
|
||||
|
||||
__all__ = ["Ligurian"]
|
||||
|
|
|
@ -1,50 +1,21 @@
|
|||
from typing import Set, Dict, Callable, Any
|
||||
from thinc.api import Config
|
||||
|
||||
from .punctuation import TOKENIZER_INFIXES, TOKENIZER_SUFFIXES
|
||||
from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
|
||||
from .stop_words import STOP_WORDS
|
||||
from .lex_attrs import LEX_ATTRS
|
||||
from ...language import Language
|
||||
from ...util import registry
|
||||
|
||||
|
||||
DEFAULT_CONFIG = """
|
||||
[nlp]
|
||||
lang = "lt"
|
||||
stop_words = {"@language_data": "spacy.lt.stop_words"}
|
||||
lex_attr_getters = {"@language_data": "spacy.lt.lex_attr_getters"}
|
||||
|
||||
[nlp.lemmatizer]
|
||||
@lemmatizers = "spacy.Lemmatizer.v1"
|
||||
|
||||
[nlp.lemmatizer.data]
|
||||
@language_data = "spacy-lookups-data"
|
||||
lang = ${nlp:lang}
|
||||
tables = ["lemma_lookup"]
|
||||
"""
|
||||
|
||||
|
||||
@registry.language_data("spacy.lt.stop_words")
|
||||
def stop_words() -> Set[str]:
|
||||
return STOP_WORDS
|
||||
|
||||
|
||||
@registry.language_data("spacy.lt.lex_attr_getters")
|
||||
def lex_attr_getters() -> Dict[int, Callable[[str], Any]]:
|
||||
return LEX_ATTRS
|
||||
|
||||
|
||||
class LithuanianDefaults(Language.Defaults):
|
||||
infixes = TOKENIZER_INFIXES
|
||||
suffixes = TOKENIZER_SUFFIXES
|
||||
tokenizer_exceptions = TOKENIZER_EXCEPTIONS
|
||||
stop_words = STOP_WORDS
|
||||
lex_attr_getters = LEX_ATTRS
|
||||
|
||||
|
||||
class Lithuanian(Language):
|
||||
lang = "lt"
|
||||
Defaults = LithuanianDefaults
|
||||
default_config = Config().from_str(DEFAULT_CONFIG)
|
||||
|
||||
|
||||
__all__ = ["Lithuanian"]
|
||||
|
|
|
@ -1,26 +1,14 @@
|
|||
from typing import Set
|
||||
from thinc.api import Config
|
||||
|
||||
from .stop_words import STOP_WORDS
|
||||
from ...language import Language
|
||||
from ...util import registry
|
||||
|
||||
|
||||
DEFAULT_CONFIG = """
|
||||
[nlp]
|
||||
lang = "lv"
|
||||
stop_words = {"@language_data": "spacy.lv.stop_words"}
|
||||
"""
|
||||
|
||||
|
||||
@registry.language_data("spacy.lv.stop_words")
|
||||
def stop_words() -> Set[str]:
|
||||
return STOP_WORDS
|
||||
class LatvianDefaults(Language.Defaults):
|
||||
stop_words = STOP_WORDS
|
||||
|
||||
|
||||
class Latvian(Language):
|
||||
lang = "lv"
|
||||
default_config = Config().from_str(DEFAULT_CONFIG)
|
||||
Defaults = LatvianDefaults
|
||||
|
||||
|
||||
__all__ = ["Latvian"]
|
||||
|
|
|
@ -1,26 +1,14 @@
|
|||
from typing import Set
|
||||
from thinc.api import Config
|
||||
|
||||
from .stop_words import STOP_WORDS
|
||||
from ...language import Language
|
||||
from ...util import registry
|
||||
|
||||
|
||||
DEFAULT_CONFIG = """
|
||||
[nlp]
|
||||
lang = "ml"
|
||||
stop_words = {"@language_data": "spacy.ml.stop_words"}
|
||||
"""
|
||||
|
||||
|
||||
@registry.language_data("spacy.ml.stop_words")
|
||||
def stop_words() -> Set[str]:
|
||||
return STOP_WORDS
|
||||
class MalayalamDefaults(Language.Defaults):
|
||||
stop_words = STOP_WORDS
|
||||
|
||||
|
||||
class Malayalam(Language):
|
||||
lang = "ml"
|
||||
default_config = Config().from_str(DEFAULT_CONFIG)
|
||||
Defaults = MalayalamDefaults
|
||||
|
||||
|
||||
__all__ = ["Malayalam"]
|
||||
|
|
|
@ -1,26 +1,14 @@
|
|||
from typing import Set
|
||||
from thinc.api import Config
|
||||
|
||||
from .stop_words import STOP_WORDS
|
||||
from ...language import Language
|
||||
from ...util import registry
|
||||
|
||||
|
||||
DEFAULT_CONFIG = """
|
||||
[nlp]
|
||||
lang = "af"
|
||||
stop_words = {"@language_data": "spacy.mr.stop_words"}
|
||||
"""
|
||||
|
||||
|
||||
@registry.language_data("spacy.mr.stop_words")
|
||||
def stop_words() -> Set[str]:
|
||||
return STOP_WORDS
|
||||
class MarathiDefaults(Language.Defaults):
|
||||
stop_words = STOP_WORDS
|
||||
|
||||
|
||||
class Marathi(Language):
|
||||
lang = "mr"
|
||||
default_config = Config().from_str(DEFAULT_CONFIG)
|
||||
Defaults = MarathiDefaults
|
||||
|
||||
|
||||
__all__ = ["Marathi"]
|
||||
|
|
|
@ -1,39 +1,9 @@
|
|||
from typing import Set, Callable
|
||||
from thinc.api import Config
|
||||
|
||||
from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
|
||||
from .punctuation import TOKENIZER_PREFIXES, TOKENIZER_INFIXES
|
||||
from .punctuation import TOKENIZER_SUFFIXES
|
||||
from .stop_words import STOP_WORDS
|
||||
from .syntax_iterators import noun_chunks
|
||||
from .syntax_iterators import SYNTAX_ITERATORS
|
||||
from ...language import Language
|
||||
from ...util import registry
|
||||
|
||||
|
||||
DEFAULT_CONFIG = """
|
||||
[nlp]
|
||||
lang = "nb"
|
||||
stop_words = {"@language_data": "spacy.nb.stop_words"}
|
||||
get_noun_chunks = {"@language_data": "spacy.nb.get_noun_chunks"}
|
||||
|
||||
[nlp.lemmatizer]
|
||||
@lemmatizers = "spacy.Lemmatizer.v1"
|
||||
|
||||
[nlp.lemmatizer.data]
|
||||
@language_data = "spacy-lookups-data"
|
||||
lang = ${nlp:lang}
|
||||
tables = ["lemma_lookup", "lemma_rules", "lemma_exc"]
|
||||
"""
|
||||
|
||||
|
||||
@registry.language_data("spacy.nb.stop_words")
|
||||
def stop_words() -> Set[str]:
|
||||
return STOP_WORDS
|
||||
|
||||
|
||||
@registry.language_data("spacy.nb.get_noun_chunks")
|
||||
def get_noun_chunks() -> Callable:
|
||||
return noun_chunks
|
||||
|
||||
|
||||
class NorwegianDefaults(Language.Defaults):
|
||||
|
@ -41,12 +11,13 @@ class NorwegianDefaults(Language.Defaults):
|
|||
prefixes = TOKENIZER_PREFIXES
|
||||
infixes = TOKENIZER_INFIXES
|
||||
suffixes = TOKENIZER_SUFFIXES
|
||||
syntax_iterators = SYNTAX_ITERATORS
|
||||
stop_words = STOP_WORDS
|
||||
|
||||
|
||||
class Norwegian(Language):
|
||||
lang = "nb"
|
||||
Defaults = NorwegianDefaults
|
||||
default_config = Config().from_str(DEFAULT_CONFIG)
|
||||
|
||||
|
||||
__all__ = ["Norwegian"]
|
||||
|
|
|
@ -1,33 +1,16 @@
|
|||
from typing import Set, Dict, Callable, Any
|
||||
from thinc.api import Config
|
||||
|
||||
from .stop_words import STOP_WORDS
|
||||
from .lex_attrs import LEX_ATTRS
|
||||
from ...language import Language
|
||||
from ...util import registry
|
||||
|
||||
|
||||
DEFAULT_CONFIG = """
|
||||
[nlp]
|
||||
lang = "ne"
|
||||
stop_words = {"@language_data": "spacy.ne.stop_words"}
|
||||
lex_attr_getters = {"@language_data": "spacy.ne.lex_attr_getters"}
|
||||
"""
|
||||
|
||||
|
||||
@registry.language_data("spacy.ne.stop_words")
|
||||
def stop_words() -> Set[str]:
|
||||
return STOP_WORDS
|
||||
|
||||
|
||||
@registry.language_data("spacy.ne.lex_attr_getters")
|
||||
def lex_attr_getters() -> Dict[int, Callable[[str], Any]]:
|
||||
return LEX_ATTRS
|
||||
class NepaliDefaults(Language.Defaults):
|
||||
stop_words = STOP_WORDS
|
||||
lex_attr_getters = LEX_ATTRS
|
||||
|
||||
|
||||
class Nepali(Language):
|
||||
lang = "ne"
|
||||
default_config = Config().from_str(DEFAULT_CONFIG)
|
||||
Defaults = NepaliDefaults
|
||||
|
||||
|
||||
__all__ = ["Nepali"]
|
||||
|
|
|
@ -1,4 +1,4 @@
from typing import Set, Dict, Callable, Any
from typing import Callable
from thinc.api import Config

from .stop_words import STOP_WORDS

@ -7,52 +7,43 @@ from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
from .punctuation import TOKENIZER_PREFIXES, TOKENIZER_INFIXES
from .punctuation import TOKENIZER_SUFFIXES
from .lemmatizer import DutchLemmatizer
from ...lookups import load_lookups
from ...language import Language
from ...util import registry


DEFAULT_CONFIG = """
[nlp]
lang = "nl"
stop_words = {"@language_data": "spacy.nl.stop_words"}
lex_attr_getters = {"@language_data": "spacy.nl.lex_attr_getters"}

[nlp.lemmatizer]
@lemmatizers = "spacy.DutchLemmatizer.v1"

[nlp.lemmatizer.data]
@language_data = "spacy-lookups-data"
lang = ${nlp:lang}
tables = ["lemma_rules", "lemma_index", "lemma_exc", "lemma_lookup"]
@lemmatizers = "spacy.nl.DutchLemmatizer"
"""


@registry.language_data("spacy.nl.stop_words")
def stop_words() -> Set[str]:
    return STOP_WORDS
@registry.lemmatizers("spacy.nl.DutchLemmatizer")
def create_lemmatizer() -> Callable[[Language], DutchLemmatizer]:
    tables = ["lemma_rules", "lemma_index", "lemma_exc", "lemma_lookup"]

    def lemmatizer_factory(nlp: Language) -> DutchLemmatizer:
        lookups = load_lookups(lang=nlp.lang, tables=tables)
        return DutchLemmatizer(lookups=lookups)

@registry.language_data("spacy.nl.lex_attr_getters")
def lex_attr_getters() -> Dict[int, Callable[[str], Any]]:
    return LEX_ATTRS


@registry.lemmatizers("spacy.DutchLemmatizer.v1")
def create_dutch_lemmatizer(data: Dict[str, dict] = {}) -> DutchLemmatizer:
    return DutchLemmatizer(data=data)
    return lemmatizer_factory


class DutchDefaults(Language.Defaults):
    config = Config().from_str(DEFAULT_CONFIG)
    tokenizer_exceptions = TOKENIZER_EXCEPTIONS
    prefixes = TOKENIZER_PREFIXES
    infixes = TOKENIZER_INFIXES
    suffixes = TOKENIZER_SUFFIXES
    lex_attr_getters = LEX_ATTRS
    stop_words = STOP_WORDS


class Dutch(Language):
    lang = "nl"
    Defaults = DutchDefaults
    default_config = Config().from_str(DEFAULT_CONFIG)


__all__ = ["Dutch"]

@ -1,4 +1,4 @@
from typing import Set, Dict, Callable, Any
from typing import Callable
from thinc.api import Config

from .punctuation import TOKENIZER_PREFIXES, TOKENIZER_INFIXES

@ -7,55 +7,53 @@ from .stop_words import STOP_WORDS
from .lex_attrs import LEX_ATTRS
from .lemmatizer import PolishLemmatizer
from ..tokenizer_exceptions import BASE_EXCEPTIONS
from ...lookups import load_lookups
from ...language import Language
from ...util import registry


DEFAULT_CONFIG = """
[nlp]
lang = "pl"
stop_words = {"@language_data": "spacy.pl.stop_words"}
lex_attr_getters = {"@language_data": "spacy.pl.lex_attr_getters"}

[nlp.lemmatizer]
@lemmatizers = "spacy.PolishLemmatizer.v1"

[nlp.lemmatizer.data]
@language_data = "spacy-lookups-data"
lang = ${nlp:lang}
tables = ["lemma_lookup_adj", "lemma_lookup_adp", "lemma_lookup_adv", "lemma_lookup_aux", "lemma_lookup_noun", "lemma_lookup_num", "lemma_lookup_part", "lemma_lookup_pron", "lemma_lookup_verb"]
@lemmatizers = "spacy.pl.PolishLemmatizer"
"""


@registry.language_data("spacy.pl.stop_words")
def stop_words() -> Set[str]:
    return STOP_WORDS
TOKENIZER_EXCEPTIONS = {
    exc: val for exc, val in BASE_EXCEPTIONS.items() if not exc.endswith(".")
}


@registry.language_data("spacy.pl.lex_attr_getters")
def lex_attr_getters() -> Dict[int, Callable[[str], Any]]:
    return LEX_ATTRS
@registry.lemmatizers("spacy.pl.PolishLemmatizer")
def create_lemmatizer() -> Callable[[Language], PolishLemmatizer]:
    # fmt: off
    tables = [
        "lemma_lookup_adj", "lemma_lookup_adp", "lemma_lookup_adv",
        "lemma_lookup_aux", "lemma_lookup_noun", "lemma_lookup_num",
        "lemma_lookup_part", "lemma_lookup_pron", "lemma_lookup_verb"
    ]
    # fmt: on

    def lemmatizer_factory(nlp: Language) -> PolishLemmatizer:
        lookups = load_lookups(lang=nlp.lang, tables=tables)
        return PolishLemmatizer(lookups=lookups)

@registry.lemmatizers("spacy.PolishLemmatizer.v1")
def create_polish_lemmatizer(data: Dict[str, dict] = {}) -> PolishLemmatizer:
    return PolishLemmatizer(data=data)
    return lemmatizer_factory


class PolishDefaults(Language.Defaults):
    mod_base_exceptions = {
        exc: val for exc, val in BASE_EXCEPTIONS.items() if not exc.endswith(".")
    }
    tokenizer_exceptions = mod_base_exceptions
    config = Config().from_str(DEFAULT_CONFIG)
    tokenizer_exceptions = TOKENIZER_EXCEPTIONS
    prefixes = TOKENIZER_PREFIXES
    infixes = TOKENIZER_INFIXES
    suffixes = TOKENIZER_SUFFIXES
    lex_attr_getters = LEX_ATTRS
    stop_words = STOP_WORDS


class Polish(Language):
    lang = "pl"
    Defaults = PolishDefaults
    default_config = Config().from_str(DEFAULT_CONFIG)


__all__ = ["Polish"]

@ -1,50 +1,21 @@
from typing import Set, Dict, Callable, Any
from thinc.api import Config

from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
from .stop_words import STOP_WORDS
from .lex_attrs import LEX_ATTRS
from .punctuation import TOKENIZER_INFIXES, TOKENIZER_PREFIXES
from ...language import Language
from ...util import registry


DEFAULT_CONFIG = """
[nlp]
lang = "pt"
stop_words = {"@language_data": "spacy.pt.stop_words"}
lex_attr_getters = {"@language_data": "spacy.pt.lex_attr_getters"}

[nlp.lemmatizer]
@lemmatizers = "spacy.Lemmatizer.v1"

[nlp.lemmatizer.data]
@language_data = "spacy-lookups-data"
lang = ${nlp:lang}
tables = ["lemma_lookup"]
"""


@registry.language_data("spacy.pt.stop_words")
def stop_words() -> Set[str]:
    return STOP_WORDS


@registry.language_data("spacy.pt.lex_attr_getters")
def lex_attr_getters() -> Dict[int, Callable[[str], Any]]:
    return LEX_ATTRS


class PortugueseDefaults(Language.Defaults):
    tokenizer_exceptions = TOKENIZER_EXCEPTIONS
    infixes = TOKENIZER_INFIXES
    prefixes = TOKENIZER_PREFIXES
    lex_attr_getters = LEX_ATTRS
    stop_words = STOP_WORDS


class Portuguese(Language):
    lang = "pt"
    Defaults = PortugueseDefaults
    default_config = Config().from_str(DEFAULT_CONFIG)


__all__ = ["Portuguese"]

@ -3,7 +3,7 @@ from .char_classes import LIST_ICONS, HYPHENS, CURRENCY, UNITS
from .char_classes import CONCAT_QUOTES, ALPHA_LOWER, ALPHA_UPPER, ALPHA, PUNCT


_prefixes = (
TOKENIZER_PREFIXES = (
    ["§", "%", "=", "—", "–", r"\+(?![0-9])"]
    + LIST_PUNCT
    + LIST_ELLIPSES

@ -13,7 +13,7 @@ _prefixes = (
)


_suffixes = (
TOKENIZER_SUFFIXES = (
    LIST_PUNCT
    + LIST_ELLIPSES
    + LIST_QUOTES

@ -31,7 +31,7 @@ _suffixes = (
    ]
)

_infixes = (
TOKENIZER_INFIXES = (
    LIST_ELLIPSES
    + LIST_ICONS
    + [

@ -44,7 +44,3 @@ _infixes = (
        r"(?<=[{a}0-9])[:<>=/](?=[{a}])".format(a=ALPHA),
    ]
)

TOKENIZER_PREFIXES = _prefixes
TOKENIZER_SUFFIXES = _suffixes
TOKENIZER_INFIXES = _infixes

@ -1,49 +1,25 @@
from typing import Set
from thinc.api import Config

from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
from .stop_words import STOP_WORDS
from .punctuation import TOKENIZER_PREFIXES, TOKENIZER_INFIXES
from .punctuation import TOKENIZER_SUFFIXES
from ...language import Language
from ...util import registry

# Lemma data note:
# Original pairs downloaded from http://www.lexiconista.com/datasets/lemmatization/
# Replaced characters using cedillas with the correct ones (ș and ț)


DEFAULT_CONFIG = """
[nlp]
lang = "ro"
stop_words = {"@language_data": "spacy.ro.stop_words"}

[nlp.lemmatizer]
@lemmatizers = "spacy.Lemmatizer.v1"

[nlp.lemmatizer.data]
@language_data = "spacy-lookups-data"
lang = ${nlp:lang}
tables = ["lemma_lookup"]
"""


@registry.language_data("spacy.ro.stop_words")
def stop_words() -> Set[str]:
    return STOP_WORDS


class RomanianDefaults(Language.Defaults):
    tokenizer_exceptions = TOKENIZER_EXCEPTIONS
    prefixes = TOKENIZER_PREFIXES
    suffixes = TOKENIZER_SUFFIXES
    infixes = TOKENIZER_INFIXES
    stop_words = STOP_WORDS


class Romanian(Language):
    lang = "ro"
    Defaults = RomanianDefaults
    default_config = Config().from_str(DEFAULT_CONFIG)


__all__ = ["Romanian"]

@ -1,4 +1,4 @@
from typing import Set, Dict, Callable, Any
from typing import Callable
from thinc.api import Config

from .stop_words import STOP_WORDS

@ -11,43 +11,30 @@ from ...language import Language

DEFAULT_CONFIG = """
[nlp]
lang = "ru"
stop_words = {"@language_data": "spacy.ru.stop_words"}
lex_attr_getters = {"@language_data": "spacy.ru.lex_attr_getters"}

[nlp.lemmatizer]
@lemmatizers = "spacy.RussianLemmatizer.v1"

[nlp.vocab_data]
@language_data = "spacy-lookups-data"
lang = ${nlp:lang}
tables = ["lexeme_norm"]
@lemmatizers = "spacy.ru.RussianLemmatizer"
"""


@registry.language_data("spacy.ru.stop_words")
def stop_words() -> Set[str]:
    return STOP_WORDS
@registry.lemmatizers("spacy.ru.RussianLemmatizer")
def create_lemmatizer() -> Callable[[Language], RussianLemmatizer]:
    def lemmatizer_factory(nlp: Language) -> RussianLemmatizer:
        return RussianLemmatizer()


@registry.language_data("spacy.ru.lex_attr_getters")
def lex_attr_getters() -> Dict[int, Callable[[str], Any]]:
    return LEX_ATTRS


@registry.lemmatizers("spacy.RussianLemmatizer.v1")
def create_russian_lemmatizer() -> RussianLemmatizer:
    return RussianLemmatizer()
    return lemmatizer_factory


class RussianDefaults(Language.Defaults):
    config = Config().from_str(DEFAULT_CONFIG)
    tokenizer_exceptions = TOKENIZER_EXCEPTIONS
    lex_attr_getters = LEX_ATTRS
    stop_words = STOP_WORDS


class Russian(Language):
    lang = "ru"
    Defaults = RussianDefaults
    default_config = Config().from_str(DEFAULT_CONFIG)


__all__ = ["Russian"]

@ -1,33 +1,16 @@
|
|||
from typing import Set, Dict, Callable, Any
|
||||
from thinc.api import Config
|
||||
|
||||
from .stop_words import STOP_WORDS
|
||||
from .lex_attrs import LEX_ATTRS
|
||||
from ...language import Language
|
||||
from ...util import registry
|
||||
|
||||
|
||||
DEFAULT_CONFIG = """
|
||||
[nlp]
|
||||
lang = "si"
|
||||
stop_words = {"@language_data": "spacy.si.stop_words"}
|
||||
lex_attr_getters = {"@language_data": "spacy.si.lex_attr_getters"}
|
||||
"""
|
||||
|
||||
|
||||
@registry.language_data("spacy.si.stop_words")
|
||||
def stop_words() -> Set[str]:
|
||||
return STOP_WORDS
|
||||
|
||||
|
||||
@registry.language_data("spacy.si.lex_attr_getters")
|
||||
def lex_attr_getters() -> Dict[int, Callable[[str], Any]]:
|
||||
return LEX_ATTRS
|
||||
class SinhalaDefaults(Language.Defaults):
|
||||
lex_attr_getters = LEX_ATTRS
|
||||
stop_words = STOP_WORDS
|
||||
|
||||
|
||||
class Sinhala(Language):
|
||||
lang = "si"
|
||||
default_config = Config().from_str(DEFAULT_CONFIG)
|
||||
Defaults = SinhalaDefaults
|
||||
|
||||
|
||||
__all__ = ["Sinhala"]
|
||||
|
|
|
@ -1,33 +1,16 @@
|
|||
from typing import Set, Dict, Callable, Any
|
||||
from thinc.api import Config
|
||||
|
||||
from .stop_words import STOP_WORDS
|
||||
from .lex_attrs import LEX_ATTRS
|
||||
from ...language import Language
|
||||
from ...util import registry
|
||||
|
||||
|
||||
DEFAULT_CONFIG = """
|
||||
[nlp]
|
||||
lang = "sk"
|
||||
stop_words = {"@language_data": "spacy.sk.stop_words"}
|
||||
lex_attr_getters = {"@language_data": "spacy.sk.lex_attr_getters"}
|
||||
"""
|
||||
|
||||
|
||||
@registry.language_data("spacy.sk.stop_words")
|
||||
def stop_words() -> Set[str]:
|
||||
return STOP_WORDS
|
||||
|
||||
|
||||
@registry.language_data("spacy.sk.lex_attr_getters")
|
||||
def lex_attr_getters() -> Dict[int, Callable[[str], Any]]:
|
||||
return LEX_ATTRS
|
||||
class SlovakDefaults(Language.Defaults):
|
||||
lex_attr_getters = LEX_ATTRS
|
||||
stop_words = STOP_WORDS
|
||||
|
||||
|
||||
class Slovak(Language):
|
||||
lang = "sk"
|
||||
default_config = Config().from_str(DEFAULT_CONFIG)
|
||||
Defaults = SlovakDefaults
|
||||
|
||||
|
||||
__all__ = ["Slovak"]
|
||||
|
|
|
@ -1,26 +1,14 @@
|
|||
from typing import Set
|
||||
from thinc.api import Config
|
||||
|
||||
from .stop_words import STOP_WORDS
|
||||
from ...language import Language
|
||||
from ...util import registry
|
||||
|
||||
|
||||
DEFAULT_CONFIG = """
|
||||
[nlp]
|
||||
lang = "sl"
|
||||
stop_words = {"@language_data": "spacy.sl.stop_words"}
|
||||
"""
|
||||
|
||||
|
||||
@registry.language_data("spacy.sl.stop_words")
|
||||
def stop_words() -> Set[str]:
|
||||
return STOP_WORDS
|
||||
class SlovenianDefaults(Language.Defaults):
|
||||
stop_words = STOP_WORDS
|
||||
|
||||
|
||||
class Slovenian(Language):
|
||||
lang = "sl"
|
||||
default_config = Config().from_str(DEFAULT_CONFIG)
|
||||
Defaults = SlovenianDefaults
|
||||
|
||||
|
||||
__all__ = ["Slovenian"]
|
||||
|
|
|
@ -1,26 +1,14 @@
|
|||
from typing import Set
|
||||
from thinc.api import Config
|
||||
|
||||
from .stop_words import STOP_WORDS
|
||||
from ...language import Language
|
||||
from ...util import registry
|
||||
|
||||
|
||||
DEFAULT_CONFIG = """
|
||||
[nlp]
|
||||
lang = "sq"
|
||||
stop_words = {"@language_data": "spacy.sq.stop_words"}
|
||||
"""
|
||||
|
||||
|
||||
@registry.language_data("spacy.sq.stop_words")
|
||||
def stop_words() -> Set[str]:
|
||||
return STOP_WORDS
|
||||
class AlbanianDefaults(Language.Defaults):
|
||||
stop_words = STOP_WORDS
|
||||
|
||||
|
||||
class Albanian(Language):
|
||||
lang = "sq"
|
||||
default_config = Config().from_str(DEFAULT_CONFIG)
|
||||
Defaults = AlbanianDefaults
|
||||
|
||||
|
||||
__all__ = ["Albanian"]
|
||||
|
|
|
@ -1,52 +1,18 @@
|
|||
from typing import Set, Dict, Callable, Any
|
||||
from thinc.api import Config
|
||||
|
||||
from .stop_words import STOP_WORDS
|
||||
from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
|
||||
from .lex_attrs import LEX_ATTRS
|
||||
from ...language import Language
|
||||
from ...util import registry
|
||||
|
||||
|
||||
DEFAULT_CONFIG = """
|
||||
[nlp]
|
||||
lang = "sr"
|
||||
stop_words = {"@language_data": "spacy.sr.stop_words"}
|
||||
lex_attr_getters = {"@language_data": "spacy.sr.lex_attr_getters"}
|
||||
|
||||
[nlp.lemmatizer]
|
||||
@lemmatizers = "spacy.Lemmatizer.v1"
|
||||
|
||||
[nlp.lemmatizer.data]
|
||||
@language_data = "spacy-lookups-data"
|
||||
lang = ${nlp:lang}
|
||||
tables = ["lemma_lookup"]
|
||||
|
||||
[nlp.vocab_data]
|
||||
@language_data = "spacy-lookups-data"
|
||||
lang = ${nlp:lang}
|
||||
tables = ["lexeme_norm"]
|
||||
"""
|
||||
|
||||
|
||||
@registry.language_data("spacy.sr.stop_words")
|
||||
def stop_words() -> Set[str]:
|
||||
return STOP_WORDS
|
||||
|
||||
|
||||
@registry.language_data("spacy.sr.lex_attr_getters")
|
||||
def lex_attr_getters() -> Dict[int, Callable[[str], Any]]:
|
||||
return LEX_ATTRS
|
||||
|
||||
|
||||
class SerbianDefaults(Language.Defaults):
|
||||
tokenizer_exceptions = TOKENIZER_EXCEPTIONS
|
||||
lex_attr_getters = LEX_ATTRS
|
||||
stop_words = STOP_WORDS
|
||||
|
||||
|
||||
class Serbian(Language):
|
||||
lang = "sr"
|
||||
Defaults = SerbianDefaults
|
||||
default_config = Config().from_str(DEFAULT_CONFIG)
|
||||
|
||||
|
||||
__all__ = ["Serbian"]
|
||||
|
|
|
@ -1,59 +1,25 @@
|
|||
from typing import Set, Dict, Callable, Any
|
||||
from thinc.api import Config
|
||||
|
||||
from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
|
||||
from .stop_words import STOP_WORDS
|
||||
from .lex_attrs import LEX_ATTRS
|
||||
from .syntax_iterators import SYNTAX_ITERATORS
|
||||
from ...language import Language
|
||||
from ...util import registry
|
||||
from .syntax_iterators import noun_chunks
|
||||
|
||||
# Punctuation stolen from Danish
|
||||
from ..da.punctuation import TOKENIZER_INFIXES, TOKENIZER_SUFFIXES
|
||||
|
||||
|
||||
DEFAULT_CONFIG = """
|
||||
[nlp]
|
||||
lang = "sv"
|
||||
stop_words = {"@language_data": "spacy.sv.stop_words"}
|
||||
lex_attr_getters = {"@language_data": "spacy.sv.lex_attr_getters"}
|
||||
get_noun_chunks = {"@language_data": "spacy.sv.get_noun_chunks"}
|
||||
|
||||
[nlp.lemmatizer]
|
||||
@lemmatizers = "spacy.Lemmatizer.v1"
|
||||
|
||||
[nlp.lemmatizer.data]
|
||||
@language_data = "spacy-lookups-data"
|
||||
lang = ${nlp:lang}
|
||||
tables = ["lemma_lookup", "lemma_rules"]
|
||||
"""
|
||||
|
||||
|
||||
@registry.language_data("spacy.sv.stop_words")
|
||||
def stop_words() -> Set[str]:
|
||||
return STOP_WORDS
|
||||
|
||||
|
||||
@registry.language_data("spacy.sv.lex_attr_getters")
|
||||
def lex_attr_getters() -> Dict[int, Callable[[str], Any]]:
|
||||
return LEX_ATTRS
|
||||
|
||||
|
||||
@registry.language_data("spacy.sv.get_noun_chunks")
|
||||
def get_noun_chunks() -> Callable:
|
||||
return noun_chunks
|
||||
|
||||
|
||||
class SwedishDefaults(Language.Defaults):
|
||||
tokenizer_exceptions = TOKENIZER_EXCEPTIONS
|
||||
infixes = TOKENIZER_INFIXES
|
||||
suffixes = TOKENIZER_SUFFIXES
|
||||
lex_attr_getters = LEX_ATTRS
|
||||
syntax_iterators = SYNTAX_ITERATORS
|
||||
stop_words = STOP_WORDS
|
||||
|
||||
|
||||
class Swedish(Language):
|
||||
lang = "sv"
|
||||
Defaults = SwedishDefaults
|
||||
default_config = Config().from_str(DEFAULT_CONFIG)
|
||||
|
||||
|
||||
__all__ = ["Swedish"]
|
||||
|
|
|
@ -1,38 +1,16 @@
|
|||
from typing import Set, Dict, Callable, Any
|
||||
from thinc.api import Config
|
||||
|
||||
from .stop_words import STOP_WORDS
|
||||
from .lex_attrs import LEX_ATTRS
|
||||
from ...language import Language
|
||||
from ...util import registry
|
||||
|
||||
|
||||
DEFAULT_CONFIG = """
|
||||
[nlp]
|
||||
lang = "ta"
|
||||
stop_words = {"@language_data": "spacy.ta.stop_words"}
|
||||
lex_attr_getters = {"@language_data": "spacy.ta.lex_attr_getters"}
|
||||
|
||||
[nlp.vocab_data]
|
||||
@language_data = "spacy-lookups-data"
|
||||
lang = ${nlp:lang}
|
||||
tables = ["lexeme_norm"]
|
||||
"""
|
||||
|
||||
|
||||
@registry.language_data("spacy.ta.stop_words")
|
||||
def stop_words() -> Set[str]:
|
||||
return STOP_WORDS
|
||||
|
||||
|
||||
@registry.language_data("spacy.ta.lex_attr_getters")
|
||||
def lex_attr_getters() -> Dict[int, Callable[[str], Any]]:
|
||||
return LEX_ATTRS
|
||||
class TamilDefaults(Language.Defaults):
|
||||
lex_attr_getters = LEX_ATTRS
|
||||
stop_words = STOP_WORDS
|
||||
|
||||
|
||||
class Tamil(Language):
|
||||
lang = "ta"
|
||||
default_config = Config().from_str(DEFAULT_CONFIG)
|
||||
Defaults = TamilDefaults
|
||||
|
||||
|
||||
__all__ = ["Tamil"]
|
||||
|
|
|
@ -1,33 +1,16 @@
|
|||
from typing import Set, Dict, Callable, Any
|
||||
from thinc.api import Config
|
||||
|
||||
from .stop_words import STOP_WORDS
|
||||
from .lex_attrs import LEX_ATTRS
|
||||
from ...language import Language
|
||||
from ...util import registry
|
||||
|
||||
|
||||
DEFAULT_CONFIG = """
|
||||
[nlp]
|
||||
lang = "te"
|
||||
stop_words = {"@language_data": "spacy.te.stop_words"}
|
||||
lex_attr_getters = {"@language_data": "spacy.te.lex_attr_getters"}
|
||||
"""
|
||||
|
||||
|
||||
@registry.language_data("spacy.te.stop_words")
|
||||
def stop_words() -> Set[str]:
|
||||
return STOP_WORDS
|
||||
|
||||
|
||||
@registry.language_data("spacy.te.lex_attr_getters")
|
||||
def lex_attr_getters() -> Dict[int, Callable[[str], Any]]:
|
||||
return LEX_ATTRS
|
||||
class TeluguDefaults(Language.Defaults):
|
||||
lex_attr_getters = LEX_ATTRS
|
||||
stop_words = STOP_WORDS
|
||||
|
||||
|
||||
class Telugu(Language):
|
||||
lang = "te"
|
||||
default_config = Config().from_str(DEFAULT_CONFIG)
|
||||
Defaults = TeluguDefaults
|
||||
|
||||
|
||||
__all__ = ["Telugu"]
|
||||
|
|
|
@ -1,4 +1,3 @@
|
|||
from typing import Set, Dict, Callable, Any
|
||||
from thinc.api import Config
|
||||
|
||||
from .stop_words import STOP_WORDS
|
||||
|
@ -10,31 +9,13 @@ from ...util import DummyTokenizer, registry
|
|||
|
||||
DEFAULT_CONFIG = """
|
||||
[nlp]
|
||||
lang = "th"
|
||||
stop_words = {"@language_data": "spacy.th.stop_words"}
|
||||
lex_attr_getters = {"@language_data": "spacy.th.lex_attr_getters"}
|
||||
|
||||
[nlp.tokenizer]
|
||||
@tokenizers = "spacy.ThaiTokenizer.v1"
|
||||
|
||||
[nlp.vocab_data]
|
||||
@language_data = "spacy-lookups-data"
|
||||
lang = ${nlp:lang}
|
||||
tables = ["lexeme_norm"]
|
||||
@tokenizers = "spacy.th.ThaiTokenizer"
|
||||
"""
|
||||
|
||||
|
||||
@registry.language_data("spacy.th.stop_words")
|
||||
def stop_words() -> Set[str]:
|
||||
return STOP_WORDS
|
||||
|
||||
|
||||
@registry.language_data("spacy.th.lex_attr_getters")
|
||||
def lex_attr_getters() -> Dict[int, Callable[[str], Any]]:
|
||||
return LEX_ATTRS
|
||||
|
||||
|
||||
@registry.tokenizers("spacy.ThaiTokenizer.v1")
|
||||
@registry.tokenizers("spacy.th.ThaiTokenizer")
|
||||
def create_thai_tokenizer():
|
||||
def thai_tokenizer_factory(nlp):
|
||||
return ThaiTokenizer(nlp)
|
||||
|
@ -60,9 +41,15 @@ class ThaiTokenizer(DummyTokenizer):
|
|||
return Doc(self.vocab, words=words, spaces=spaces)
|
||||
|
||||
|
||||
class ThaiDefaults(Language.Defaults):
|
||||
config = Config().from_str(DEFAULT_CONFIG)
|
||||
lex_attr_getters = LEX_ATTRS
|
||||
stop_words = STOP_WORDS
|
||||
|
||||
|
||||
class Thai(Language):
|
||||
lang = "th"
|
||||
default_config = Config().from_str(DEFAULT_CONFIG)
|
||||
Defaults = ThaiDefaults
|
||||
|
||||
|
||||
__all__ = ["Thai"]
|
||||
|
|
|
@ -1,47 +1,18 @@
|
|||
from typing import Set, Dict, Callable, Any
|
||||
from thinc.api import Config
|
||||
|
||||
from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
|
||||
from .stop_words import STOP_WORDS
|
||||
from .lex_attrs import LEX_ATTRS
|
||||
from ...language import Language
|
||||
from ...util import registry
|
||||
|
||||
|
||||
DEFAULT_CONFIG = """
|
||||
[nlp]
|
||||
lang = "tl"
|
||||
stop_words = {"@language_data": "spacy.tl.stop_words"}
|
||||
lex_attr_getters = {"@language_data": "spacy.tl.lex_attr_getters"}
|
||||
|
||||
[nlp.lemmatizer]
|
||||
@lemmatizers = "spacy.Lemmatizer.v1"
|
||||
|
||||
[nlp.lemmatizer.data]
|
||||
@language_data = "spacy-lookups-data"
|
||||
lang = ${nlp:lang}
|
||||
tables = ["lemma_lookup"]
|
||||
"""
|
||||
|
||||
|
||||
@registry.language_data("spacy.tl.stop_words")
|
||||
def stop_words() -> Set[str]:
|
||||
return STOP_WORDS
|
||||
|
||||
|
||||
@registry.language_data("spacy.tl.lex_attr_getters")
|
||||
def lex_attr_getters() -> Dict[int, Callable[[str], Any]]:
|
||||
return LEX_ATTRS
|
||||
|
||||
|
||||
class TagalogDefaults(Language.Defaults):
|
||||
tokenizer_exceptions = TOKENIZER_EXCEPTIONS
|
||||
lex_attr_getters = LEX_ATTRS
|
||||
stop_words = STOP_WORDS
|
||||
|
||||
|
||||
class Tagalog(Language):
|
||||
lang = "tl"
|
||||
Defaults = TagalogDefaults
|
||||
default_config = Config().from_str(DEFAULT_CONFIG)
|
||||
|
||||
|
||||
__all__ = ["Tagalog"]
|
||||
|
|
|
@ -1,40 +1,16 @@
|
|||
from typing import Set
|
||||
from thinc.api import Config
|
||||
|
||||
from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
|
||||
from .stop_words import STOP_WORDS
|
||||
from ...language import Language
|
||||
from ...util import registry
|
||||
|
||||
|
||||
DEFAULT_CONFIG = """
|
||||
[nlp]
|
||||
lang = "tr"
|
||||
stop_words = {"@language_data": "spacy.tr.stop_words"}
|
||||
|
||||
[nlp.lemmatizer]
|
||||
@lemmatizers = "spacy.Lemmatizer.v1"
|
||||
|
||||
[nlp.lemmatizer.data]
|
||||
@language_data = "spacy-lookups-data"
|
||||
lang = ${nlp:lang}
|
||||
tables = ["lemma_lookup"]
|
||||
"""
|
||||
|
||||
|
||||
@registry.language_data("spacy.tr.stop_words")
|
||||
def stop_words() -> Set[str]:
|
||||
return STOP_WORDS
|
||||
|
||||
|
||||
class TurkishDefaults(Language.Defaults):
|
||||
tokenizer_exceptions = TOKENIZER_EXCEPTIONS
|
||||
stop_words = STOP_WORDS
|
||||
|
||||
|
||||
class Turkish(Language):
|
||||
lang = "tr"
|
||||
Defaults = TurkishDefaults
|
||||
default_config = Config().from_str(DEFAULT_CONFIG)
|
||||
|
||||
|
||||
__all__ = ["Turkish"]
|
||||
|
|
|
@ -1,41 +1,20 @@
|
|||
from typing import Set, Dict, Callable, Any
|
||||
from thinc.api import Config
|
||||
|
||||
from .lex_attrs import LEX_ATTRS
|
||||
from .punctuation import TOKENIZER_INFIXES
|
||||
from .stop_words import STOP_WORDS
|
||||
from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
|
||||
from ...language import Language
|
||||
from ...util import registry
|
||||
|
||||
|
||||
DEFAULT_CONFIG = """
|
||||
[nlp]
|
||||
lang = "tt"
|
||||
stop_words = {"@language_data": "spacy.tt.stop_words"}
|
||||
lex_attr_getters = {"@language_data": "spacy.tt.lex_attr_getters"}
|
||||
"""
|
||||
|
||||
|
||||
@registry.language_data("spacy.tt.stop_words")
|
||||
def stop_words() -> Set[str]:
|
||||
return STOP_WORDS
|
||||
|
||||
|
||||
@registry.language_data("spacy.tt.lex_attr_getters")
|
||||
def lex_attr_getters() -> Dict[int, Callable[[str], Any]]:
|
||||
return LEX_ATTRS
|
||||
|
||||
|
||||
class TatarDefaults(Language.Defaults):
|
||||
tokenizer_exceptions = TOKENIZER_EXCEPTIONS
|
||||
infixes = tuple(TOKENIZER_INFIXES)
|
||||
infixes = TOKENIZER_INFIXES
|
||||
lex_attr_getters = LEX_ATTRS
|
||||
stop_words = STOP_WORDS
|
||||
|
||||
|
||||
class Tatar(Language):
|
||||
lang = "tt"
|
||||
Defaults = TatarDefaults
|
||||
default_config = Config().from_str(DEFAULT_CONFIG)
|
||||
|
||||
|
||||
__all__ = ["Tatar"]
|
||||
|
|
|
@ -1,4 +1,4 @@
|
|||
from typing import Set, Dict, Callable, Any
|
||||
from typing import Callable
|
||||
from thinc.api import Config
|
||||
|
||||
from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
|
||||
|
@ -11,38 +11,30 @@ from .lemmatizer import UkrainianLemmatizer
|
|||
|
||||
DEFAULT_CONFIG = """
|
||||
[nlp]
|
||||
lang = "uk"
|
||||
stop_words = {"@language_data": "spacy.uk.stop_words"}
|
||||
lex_attr_getters = {"@language_data": "spacy.uk.lex_attr_getters"}
|
||||
|
||||
[nlp.lemmatizer]
|
||||
@lemmatizers = "spacy.UkrainianLemmatizer.v1"
|
||||
@lemmatizers = "spacy.uk.UkrainianLemmatizer"
|
||||
"""
|
||||
|
||||
|
||||
@registry.language_data("spacy.uk.stop_words")
|
||||
def stop_words() -> Set[str]:
|
||||
return STOP_WORDS
|
||||
@registry.lemmatizers("spacy.uk.UkrainianLemmatizer")
|
||||
def create_ukrainian_lemmatizer() -> Callable[[Language], UkrainianLemmatizer]:
|
||||
def lemmatizer_factory(nlp: Language) -> UkrainianLemmatizer:
|
||||
return UkrainianLemmatizer()
|
||||
|
||||
|
||||
@registry.language_data("spacy.uk.lex_attr_getters")
|
||||
def lex_attr_getters() -> Dict[int, Callable[[str], Any]]:
|
||||
return LEX_ATTRS
|
||||
|
||||
|
||||
@registry.lemmatizers("spacy.UkrainianLemmatizer.v1")
|
||||
def create_ukrainian_lemmatizer() -> UkrainianLemmatizer:
|
||||
return UkrainianLemmatizer()
|
||||
return lemmatizer_factory
|
||||
|
||||
|
||||
class UkrainianDefaults(Language.Defaults):
|
||||
config = Config().from_str(DEFAULT_CONFIG)
|
||||
tokenizer_exceptions = TOKENIZER_EXCEPTIONS
|
||||
lex_attr_getters = LEX_ATTRS
|
||||
stop_words = STOP_WORDS
|
||||
|
||||
|
||||
class Ukrainian(Language):
|
||||
lang = "uk"
|
||||
Defaults = UkrainianDefaults
|
||||
default_config = Config().from_str(DEFAULT_CONFIG)
|
||||
|
||||
|
||||
__all__ = ["Ukrainian"]
|
||||
|
|
|
@ -1,54 +1,19 @@
|
|||
from typing import Set, Dict, Callable, Any
|
||||
from thinc.api import Config
|
||||
|
||||
from .stop_words import STOP_WORDS
|
||||
from .lex_attrs import LEX_ATTRS
|
||||
from .punctuation import TOKENIZER_SUFFIXES
|
||||
from ..tokenizer_exceptions import BASE_EXCEPTIONS
|
||||
from ...language import Language
|
||||
from ...util import registry
|
||||
|
||||
|
||||
DEFAULT_CONFIG = """
|
||||
[nlp]
|
||||
lang = "ur"
|
||||
stop_words = {"@language_data": "spacy.ur.stop_words"}
|
||||
lex_attr_getters = {"@language_data": "spacy.ur.lex_attr_getters"}
|
||||
|
||||
[nlp.writing_system]
|
||||
direction = "rtl"
|
||||
has_case = false
|
||||
has_letters = true
|
||||
|
||||
[nlp.lemmatizer]
|
||||
@lemmatizers = "spacy.Lemmatizer.v1"
|
||||
|
||||
[nlp.lemmatizer.data]
|
||||
@language_data = "spacy-lookups-data"
|
||||
lang = ${nlp:lang}
|
||||
tables = ["lemma_lookup"]
|
||||
"""
|
||||
|
||||
|
||||
@registry.language_data("spacy.ur.stop_words")
|
||||
def stop_words() -> Set[str]:
|
||||
return STOP_WORDS
|
||||
|
||||
|
||||
@registry.language_data("spacy.ur.lex_attr_getters")
|
||||
def lex_attr_getters() -> Dict[int, Callable[[str], Any]]:
|
||||
return LEX_ATTRS
|
||||
|
||||
|
||||
class UrduDefaults(Language.Defaults):
|
||||
tokenizer_exceptions = BASE_EXCEPTIONS
|
||||
suffixes = TOKENIZER_SUFFIXES
|
||||
lex_attr_getters = LEX_ATTRS
|
||||
stop_words = STOP_WORDS
|
||||
writing_system = {"direction": "rtl", "has_case": False, "has_letters": True}
|
||||
|
||||
|
||||
class Urdu(Language):
|
||||
lang = "ur"
|
||||
Defaults = UrduDefaults
|
||||
default_config = Config().from_str(DEFAULT_CONFIG)
|
||||
|
||||
|
||||
__all__ = ["Urdu"]
|
||||
|
|
|
@ -1,4 +1,3 @@
|
|||
from typing import Set, Dict, Callable, Any
|
||||
from thinc.api import Config
|
||||
|
||||
from ...language import Language
|
||||
|
@ -10,27 +9,14 @@ from .lex_attrs import LEX_ATTRS
|
|||
|
||||
DEFAULT_CONFIG = """
|
||||
[nlp]
|
||||
lang = "vi"
|
||||
stop_words = {"@language_data": "spacy.vi.stop_words"}
|
||||
lex_attr_getters = {"@language_data": "spacy.vi.lex_attr_getters"}
|
||||
|
||||
[nlp.tokenizer]
|
||||
@tokenizers = "spacy.VietnameseTokenizer.v1"
|
||||
@tokenizers = "spacy.vi.VietnameseTokenizer"
|
||||
use_pyvi = true
|
||||
"""
|
||||
|
||||
|
||||
@registry.language_data("spacy.vi.stop_words")
|
||||
def stop_words() -> Set[str]:
|
||||
return STOP_WORDS
|
||||
|
||||
|
||||
@registry.language_data("spacy.vi.lex_attr_getters")
|
||||
def lex_attr_getters() -> Dict[int, Callable[[str], Any]]:
|
||||
return LEX_ATTRS
|
||||
|
||||
|
||||
@registry.tokenizers("spacy.VietnameseTokenizer.v1")
|
||||
@registry.tokenizers("spacy.vi.VietnameseTokenizer")
|
||||
def create_vietnamese_tokenizer(use_pyvi: bool = True,):
|
||||
def vietnamese_tokenizer_factory(nlp):
|
||||
return VietnameseTokenizer(nlp, use_pyvi=use_pyvi)
|
||||
|
@ -68,9 +54,15 @@ class VietnameseTokenizer(DummyTokenizer):
|
|||
return Doc(self.vocab, words=words, spaces=spaces)
|
||||
|
||||
|
||||
class VietnameseDefaults(Language.Defaults):
|
||||
config = Config().from_str(DEFAULT_CONFIG)
|
||||
lex_attr_getters = LEX_ATTRS
|
||||
stop_words = STOP_WORDS
|
||||
|
||||
|
||||
class Vietnamese(Language):
|
||||
lang = "vi"
|
||||
default_config = Config().from_str(DEFAULT_CONFIG)
|
||||
Defaults = VietnameseDefaults
|
||||
|
||||
|
||||
__all__ = ["Vietnamese"]
|
||||
|
|
|
@ -1,27 +1,12 @@
|
|||
from thinc.api import Config
|
||||
|
||||
from ..tokenizer_exceptions import BASE_EXCEPTIONS
|
||||
from ...language import Language
|
||||
|
||||
|
||||
DEFAULT_CONFIG = """
|
||||
[nlp]
|
||||
lang = "xx"
|
||||
"""
|
||||
|
||||
|
||||
class MultiLanguageDefaults(Language.Defaults):
|
||||
tokenizer_exceptions = BASE_EXCEPTIONS
|
||||
|
||||
|
||||
class MultiLanguage(Language):
|
||||
"""Language class to be used for models that support multiple languages.
|
||||
This module allows models to specify their language ID as 'xx'.
|
||||
"""
|
||||
|
||||
lang = "xx"
|
||||
Defaults = MultiLanguageDefaults
|
||||
default_config = Config().from_str(DEFAULT_CONFIG)
|
||||
|
||||
|
||||
__all__ = ["MultiLanguage"]
|
||||
|
|
|
@ -1,39 +1,16 @@
|
|||
from typing import Set, Dict, Callable, Any
|
||||
from thinc.api import Config
|
||||
|
||||
from .stop_words import STOP_WORDS
|
||||
from .lex_attrs import LEX_ATTRS
|
||||
from ..tokenizer_exceptions import BASE_EXCEPTIONS
|
||||
from ...language import Language
|
||||
from ...util import registry
|
||||
|
||||
|
||||
DEFAULT_CONFIG = """
|
||||
[nlp]
|
||||
lang = "si"
|
||||
stop_words = {"@language_data": "spacy.yo.stop_words"}
|
||||
lex_attr_getters = {"@language_data": "spacy.yo.lex_attr_getters"}
|
||||
"""
|
||||
|
||||
|
||||
@registry.language_data("spacy.yo.stop_words")
|
||||
def stop_words() -> Set[str]:
|
||||
return STOP_WORDS
|
||||
|
||||
|
||||
@registry.language_data("spacy.yo.lex_attr_getters")
|
||||
def lex_attr_getters() -> Dict[int, Callable[[str], Any]]:
|
||||
return LEX_ATTRS
|
||||
|
||||
|
||||
class YorubaDefaults(Language.Defaults):
|
||||
tokenizer_exceptions = BASE_EXCEPTIONS
|
||||
lex_attr_getters = LEX_ATTRS
|
||||
stop_words = STOP_WORDS
|
||||
|
||||
|
||||
class Yoruba(Language):
|
||||
lang = "yo"
|
||||
Defaults = YorubaDefaults
|
||||
default_config = Config().from_str(DEFAULT_CONFIG)
|
||||
|
||||
|
||||
__all__ = ["Yoruba"]
|
||||
|
|
|
@ -1,4 +1,4 @@
|
|||
from typing import Optional, List, Set, Dict, Callable, Any
|
||||
from typing import Optional, List, Dict, Any
|
||||
from enum import Enum
|
||||
import tempfile
|
||||
import srsly
|
||||
|
@ -10,7 +10,6 @@ from ...errors import Warnings, Errors
|
|||
from ...language import Language
|
||||
from ...tokens import Doc
|
||||
from ...util import DummyTokenizer, registry
|
||||
from ..tokenizer_exceptions import BASE_EXCEPTIONS
|
||||
from .lex_attrs import LEX_ATTRS
|
||||
from .stop_words import STOP_WORDS
|
||||
from ... import util
|
||||
|
@ -20,20 +19,12 @@ _PKUSEG_INSTALL_MSG = "install it with `pip install pkuseg==0.0.25` or from http
|
|||
|
||||
DEFAULT_CONFIG = """
|
||||
[nlp]
|
||||
lang = "zh"
|
||||
stop_words = {"@language_data": "spacy.zh.stop_words"}
|
||||
lex_attr_getters = {"@language_data": "spacy.zh.lex_attr_getters"}
|
||||
|
||||
[nlp.tokenizer]
|
||||
@tokenizers = "spacy.ChineseTokenizer.v1"
|
||||
@tokenizers = "spacy.zh.ChineseTokenizer"
|
||||
segmenter = "char"
|
||||
pkuseg_model = null
|
||||
pkuseg_user_dict = "default"
|
||||
|
||||
[nlp.writing_system]
|
||||
direction = "ltr"
|
||||
has_case = false
|
||||
has_letters = false
|
||||
"""
|
||||
|
||||
|
||||
|
@ -47,17 +38,7 @@ class Segmenter(str, Enum):
|
|||
return list(cls.__members__.keys())
|
||||
|
||||
|
||||
@registry.language_data("spacy.zh.stop_words")
|
||||
def stop_words() -> Set[str]:
|
||||
return STOP_WORDS
|
||||
|
||||
|
||||
@registry.language_data("spacy.zh.lex_attr_getters")
|
||||
def lex_attr_getters() -> Dict[int, Callable[[str], Any]]:
|
||||
return LEX_ATTRS
|
||||
|
||||
|
||||
@registry.tokenizers("spacy.ChineseTokenizer.v1")
|
||||
@registry.tokenizers("spacy.zh.ChineseTokenizer")
|
||||
def create_chinese_tokenizer(
|
||||
segmenter: Segmenter = Segmenter.char,
|
||||
pkuseg_model: Optional[str] = None,
|
||||
|
@ -155,6 +136,18 @@ class ChineseTokenizer(DummyTokenizer):
|
|||
warn_msg = Warnings.W104.format(target="pkuseg", current=self.segmenter)
|
||||
warnings.warn(warn_msg)
|
||||
|
||||
def _get_config(self) -> Dict[str, Any]:
|
||||
return {
|
||||
"segmenter": self.segmenter,
|
||||
"pkuseg_model": self.pkuseg_model,
|
||||
"pkuseg_user_dict": self.pkuseg_user_dict,
|
||||
}
|
||||
|
||||
def _set_config(self, config: Dict[str, Any] = {}) -> None:
|
||||
self.segmenter = config.get("segmenter", Segmenter.char)
|
||||
self.pkuseg_model = config.get("pkuseg_model", None)
|
||||
self.pkuseg_user_dict = config.get("pkuseg_user_dict", "default")
|
||||
|
||||
def to_bytes(self, **kwargs):
|
||||
pkuseg_features_b = b""
|
||||
pkuseg_weights_b = b""
|
||||
|
@ -175,6 +168,7 @@ class ChineseTokenizer(DummyTokenizer):
|
|||
sorted(list(self.pkuseg_seg.postprocesser.other_words)),
|
||||
)
|
||||
serializers = {
|
||||
"cfg": lambda: srsly.json_dumps(self._get_config()),
|
||||
"pkuseg_features": lambda: pkuseg_features_b,
|
||||
"pkuseg_weights": lambda: pkuseg_weights_b,
|
||||
"pkuseg_processors": lambda: srsly.msgpack_dumps(pkuseg_processors_data),
|
||||
|
@ -194,6 +188,7 @@ class ChineseTokenizer(DummyTokenizer):
|
|||
pkuseg_data["processors_data"] = srsly.msgpack_loads(b)
|
||||
|
||||
deserializers = {
|
||||
"cfg": lambda b: self._set_config(srsly.json_loads(b)),
|
||||
"pkuseg_features": deserialize_pkuseg_features,
|
||||
"pkuseg_weights": deserialize_pkuseg_weights,
|
||||
"pkuseg_processors": deserialize_pkuseg_processors,
|
||||
|
@ -246,6 +241,7 @@ class ChineseTokenizer(DummyTokenizer):
|
|||
srsly.write_msgpack(path, data)
|
||||
|
||||
serializers = {
|
||||
"cfg": lambda p: srsly.write_json(p, self._get_config()),
|
||||
"pkuseg_model": lambda p: save_pkuseg_model(p),
|
||||
"pkuseg_processors": lambda p: save_pkuseg_processors(p),
|
||||
}
|
||||
|
@ -281,6 +277,7 @@ class ChineseTokenizer(DummyTokenizer):
|
|||
self.pkuseg_seg.postprocesser.other_words = set(other_words)
|
||||
|
||||
serializers = {
|
||||
"cfg": lambda p: self._set_config(srsly.read_json(p)),
|
||||
"pkuseg_model": lambda p: load_pkuseg_model(p),
|
||||
"pkuseg_processors": lambda p: load_pkuseg_processors(p),
|
||||
}
|
||||
|
@ -288,13 +285,15 @@ class ChineseTokenizer(DummyTokenizer):
|
|||
|
||||
|
||||
class ChineseDefaults(Language.Defaults):
|
||||
tokenizer_exceptions = BASE_EXCEPTIONS
|
||||
config = Config().from_str(DEFAULT_CONFIG)
|
||||
lex_attr_getters = LEX_ATTRS
|
||||
stop_words = STOP_WORDS
|
||||
writing_system = {"direction": "ltr", "has_case": False, "has_letters": False}
|
||||
|
||||
|
||||
class Chinese(Language):
|
||||
lang = "zh"
|
||||
Defaults = ChineseDefaults
|
||||
default_config = Config().from_str(DEFAULT_CONFIG)
|
||||
|
||||
|
||||
def try_jieba_import(segmenter: str) -> None:
|
||||
|
|
|
@ -16,27 +16,25 @@ import multiprocessing as mp
|
|||
from itertools import chain, cycle
|
||||
|
||||
from .tokens.underscore import Underscore
|
||||
from .vocab import Vocab
|
||||
from .vocab import Vocab, create_vocab
|
||||
from .pipe_analysis import analyze_pipes, analyze_all_pipes, validate_attrs
|
||||
from .gold import Example
|
||||
from .scorer import Scorer
|
||||
from .util import link_vectors_to_models, create_default_optimizer, registry
|
||||
from .util import SimpleFrozenDict
|
||||
from .lang.tokenizer_exceptions import URL_MATCH, BASE_EXCEPTIONS
|
||||
from .lang.punctuation import TOKENIZER_PREFIXES, TOKENIZER_SUFFIXES
|
||||
from .lang.punctuation import TOKENIZER_INFIXES
|
||||
from .tokens import Doc
|
||||
from .lookups import load_lookups
|
||||
from .tokenizer import Tokenizer
|
||||
from .lemmatizer import Lemmatizer
|
||||
from .errors import Errors, Warnings
|
||||
from .schemas import ConfigSchema
|
||||
from .git_info import GIT_VERSION
|
||||
from . import util
|
||||
from . import about
|
||||
|
||||
# We also need to import these to make sure the functions are registered
|
||||
from .tokenizer import Tokenizer # noqa: F401
|
||||
from .lemmatizer import Lemmatizer # noqa: F401
|
||||
from .lookups import Lookups # noqa: F401
|
||||
from .lang import defaults # noqa: F401
|
||||
|
||||
|
||||
ENABLE_PIPELINE_ANALYSIS = False
|
||||
# This is the base config will all settings (training etc.)
|
||||
|
@ -45,10 +43,50 @@ DEFAULT_CONFIG = Config().from_disk(DEFAULT_CONFIG_PATH)
|
|||
|
||||
|
||||
class BaseDefaults:
|
||||
prefixes: Tuple[Pattern, ...] = tuple(TOKENIZER_PREFIXES)
|
||||
suffixes: Tuple[Pattern, ...] = tuple(TOKENIZER_SUFFIXES)
|
||||
infixes: Tuple[Pattern, ...] = tuple(TOKENIZER_INFIXES)
|
||||
tokenizer_exceptions: Dict[str, List[dict]] = {}
|
||||
config: Config = Config()
|
||||
tokenizer_exceptions: Dict[str, List[dict]] = BASE_EXCEPTIONS
|
||||
prefixes: Optional[List[Union[str, Pattern]]] = TOKENIZER_PREFIXES
|
||||
suffixes: Optional[List[Union[str, Pattern]]] = TOKENIZER_SUFFIXES
|
||||
infixes: Optional[List[Union[str, Pattern]]] = TOKENIZER_INFIXES
|
||||
token_match: Optional[Pattern] = None
|
||||
url_match: Optional[Pattern] = URL_MATCH
|
||||
syntax_iterators: Dict[str, Callable] = {}
|
||||
lex_attr_getters: Dict[int, Callable[[str], Any]] = {}
|
||||
stop_words = set()
|
||||
writing_system = {"direction": "ltr", "has_case": True, "has_letters": True}
|
||||
|
||||
|
||||
@registry.tokenizers("spacy.Tokenizer.v1")
|
||||
def create_tokenizer() -> Callable[["Language"], Tokenizer]:
|
||||
def tokenizer_factory(nlp: "Language") -> Tokenizer:
|
||||
prefixes = nlp.Defaults.prefixes
|
||||
suffixes = nlp.Defaults.suffixes
|
||||
infixes = nlp.Defaults.infixes
|
||||
prefix_search = util.compile_prefix_regex(prefixes).search if prefixes else None
|
||||
suffix_search = util.compile_suffix_regex(suffixes).search if suffixes else None
|
||||
infix_finditer = util.compile_infix_regex(infixes).finditer if infixes else None
|
||||
return Tokenizer(
|
||||
nlp.vocab,
|
||||
rules=nlp.Defaults.tokenizer_exceptions,
|
||||
prefix_search=prefix_search,
|
||||
suffix_search=suffix_search,
|
||||
infix_finditer=infix_finditer,
|
||||
token_match=nlp.Defaults.token_match,
|
||||
url_match=nlp.Defaults.url_match,
|
||||
)
|
||||
|
||||
return tokenizer_factory
|
||||
|
||||
|
||||
@registry.lemmatizers("spacy.Lemmatizer.v1")
|
||||
def create_lemmatizer() -> Callable[["Language"], "Lemmatizer"]:
|
||||
tables = ["lemma_lookup", "lemma_rules", "lemma_exc", "lemma_index"]
|
||||
|
||||
def lemmatizer_factory(nlp: "Language") -> "Lemmatizer":
|
||||
lookups = load_lookups(lang=nlp.lang, tables=tables, strict=False)
|
||||
return Lemmatizer(lookups=lookups)
|
||||
|
||||
return lemmatizer_factory
|
||||
|
||||
|
||||
class Language:
|
||||
|
@ -65,8 +103,8 @@ class Language:
|
|||
Defaults = BaseDefaults
|
||||
lang: str = None
|
||||
default_config = DEFAULT_CONFIG
|
||||
factories = SimpleFrozenDict(error=Errors.E957)
|
||||
|
||||
factories = SimpleFrozenDict(error=Errors.E957)
|
||||
_factory_meta: Dict[str, "FactoryMeta"] = {} # meta by factory
|
||||
|
||||
def __init__(
|
||||
|
@ -75,6 +113,7 @@ class Language:
|
|||
max_length: int = 10 ** 6,
|
||||
meta: Dict[str, Any] = {},
|
||||
create_tokenizer: Optional[Callable[["Language"], Callable[[str], Doc]]] = None,
|
||||
create_lemmatizer: Optional[Callable[["Language"], Callable]] = None,
|
||||
**kwargs,
|
||||
):
|
||||
"""Initialise a Language object.
|
||||
|
@ -108,7 +147,16 @@ class Language:
|
|||
|
||||
if vocab is True:
|
||||
vectors_name = meta.get("vectors", {}).get("name")
|
||||
vocab = Vocab.from_config(self._config, vectors_name=vectors_name)
|
||||
if not create_lemmatizer:
|
||||
lemma_cfg = {"lemmatizer": self._config["nlp"]["lemmatizer"]}
|
||||
create_lemmatizer = registry.make_from_config(lemma_cfg)["lemmatizer"]
|
||||
# TODO: where does the vocab data come in?
|
||||
vocab = create_vocab(
|
||||
self.lang,
|
||||
self.Defaults,
|
||||
lemmatizer=create_lemmatizer(self),
|
||||
vectors_name=vectors_name,
|
||||
)
|
||||
else:
|
||||
if (self.lang and vocab.lang) and (self.lang != vocab.lang):
|
||||
raise ValueError(Errors.E150.format(nlp=self.lang, vocab=vocab.lang))
|
||||
|
@ -126,7 +174,10 @@ class Language:
|
|||
|
||||
def __init_subclass__(cls, **kwargs):
|
||||
super().__init_subclass__(**kwargs)
|
||||
cls.default_config = util.deep_merge_configs(cls.default_config, DEFAULT_CONFIG)
|
||||
cls.default_config = util.deep_merge_configs(
|
||||
cls.Defaults.config, DEFAULT_CONFIG
|
||||
)
|
||||
cls.default_config["nlp"]["lang"] = cls.lang
|
||||
|
||||
@property
|
||||
def path(self):
|
||||
|
@ -1226,17 +1277,16 @@ class Language:
|
|||
config = util.deep_merge_configs(config, cls.default_config)
|
||||
if "nlp" not in config:
|
||||
raise ValueError(Errors.E985.format(config=config))
|
||||
nlp_config = config["nlp"]
|
||||
config_lang = nlp_config["lang"]
|
||||
config_lang = config["nlp"]["lang"]
|
||||
if cls.lang is not None and config_lang is not None and config_lang != cls.lang:
|
||||
raise ValueError(
|
||||
Errors.E958.format(
|
||||
bad_lang_code=nlp_config["lang"],
|
||||
bad_lang_code=config["nlp"]["lang"],
|
||||
lang_code=cls.lang,
|
||||
lang=util.get_object_name(cls),
|
||||
)
|
||||
)
|
||||
nlp_config["lang"] = cls.lang
|
||||
config["nlp"]["lang"] = cls.lang
|
||||
# This isn't very elegant, but we remove the [components] block here to prevent
|
||||
# it from getting resolved (causes problems because we expect to pass in
|
||||
# the nlp and name args for each component). If we're auto-filling, we're
|
||||
|
@ -1251,22 +1301,12 @@ class Language:
|
|||
filled["components"] = orig_pipeline
|
||||
config["components"] = orig_pipeline
|
||||
create_tokenizer = resolved["nlp"]["tokenizer"]
|
||||
lemmatizer = resolved["nlp"]["lemmatizer"]
|
||||
lex_attr_getters = resolved["nlp"]["lex_attr_getters"]
|
||||
stop_words = resolved["nlp"]["stop_words"]
|
||||
vocab_data = resolved["nlp"]["vocab_data"]
|
||||
get_noun_chunks = resolved["nlp"]["get_noun_chunks"]
|
||||
vocab = Vocab.from_config(
|
||||
filled,
|
||||
lemmatizer=lemmatizer,
|
||||
lex_attr_getters=lex_attr_getters,
|
||||
stop_words=stop_words,
|
||||
vocab_data=vocab_data,
|
||||
get_noun_chunks=get_noun_chunks,
|
||||
create_lemmatizer = resolved["nlp"]["lemmatizer"]
|
||||
nlp = cls(
|
||||
create_tokenizer=create_tokenizer, create_lemmatizer=create_lemmatizer,
|
||||
)
|
||||
nlp = cls(vocab, create_tokenizer=create_tokenizer)
|
||||
pipeline = config.get("components", {})
|
||||
for pipe_name in nlp_config["pipeline"]:
|
||||
for pipe_name in config["nlp"]["pipeline"]:
|
||||
if pipe_name not in pipeline:
|
||||
opts = ", ".join(pipeline.keys())
|
||||
raise ValueError(Errors.E956.format(name=pipe_name, opts=opts))
|
||||
|
|
|
@ -2,12 +2,6 @@ from typing import Optional, Callable, List, Dict
|
|||
|
||||
from .lookups import Lookups
|
||||
from .parts_of_speech import NAMES as UPOS_NAMES
|
||||
from .util import registry
|
||||
|
||||
|
||||
@registry.lemmatizers("spacy.Lemmatizer.v1")
|
||||
def create_lemmatizer(data: Dict[str, str] = {}) -> "Lemmatizer":
|
||||
return Lemmatizer(data=data)
|
||||
|
||||
|
||||
class Lemmatizer:
|
||||
|
@ -21,7 +15,6 @@ class Lemmatizer:
|
|||
def __init__(
|
||||
self,
|
||||
lookups: Optional[Lookups] = None,
|
||||
data: Dict[str, dict] = {},
|
||||
is_base_form: Optional[Callable] = None,
|
||||
) -> None:
|
||||
"""Initialize a Lemmatizer.
|
||||
|
@ -31,9 +24,6 @@ class Lemmatizer:
|
|||
RETURNS (Lemmatizer): The newly constructed object.
|
||||
"""
|
||||
self.lookups = lookups if lookups is not None else Lookups()
|
||||
for name, table in data.items():
|
||||
if table is not None:
|
||||
self.lookups.add_table(name, table)
|
||||
self.is_base_form = is_base_form
|
||||
|
||||
def __call__(
|
||||
|
|
|
@ -13,7 +13,9 @@ UNSET = object()
|
|||
|
||||
|
||||
@registry.language_data("spacy-lookups-data")
|
||||
def get_lookups(lang: str, tables: List[str]) -> Optional[Dict[str, Any]]:
|
||||
def load_lookups(
|
||||
lang: str, tables: List[str], strict: bool = True
|
||||
) -> Optional[Dict[str, Any]]:
|
||||
"""Load the data from the spacy-lookups-data package for a given language,
|
||||
if available. Returns an empty dict if there's no data or if the package
|
||||
is not installed.
|
||||
|
@ -24,15 +26,19 @@ def get_lookups(lang: str, tables: List[str]) -> Optional[Dict[str, Any]]:
|
|||
RETURNS (Dict[str, Any]): The lookups, keyed by table name.
|
||||
"""
|
||||
# TODO: import spacy_lookups_data instead of going via entry points here?
|
||||
lookups = Lookups()
|
||||
if lang not in registry.lookups:
|
||||
return {}
|
||||
return lookups
|
||||
data = registry.lookups.get(lang)
|
||||
result = {}
|
||||
for table in tables:
|
||||
if table not in data:
|
||||
raise ValueError("TODO: unknown table")
|
||||
result[table] = load_language_data(data[table])
|
||||
return result
|
||||
if strict:
|
||||
raise ValueError("TODO: unknown table")
|
||||
language_data = {}
|
||||
else:
|
||||
language_data = load_language_data(data[table])
|
||||
lookups.add_table(table, language_data)
|
||||
return lookups
|
||||
|
||||
|
||||
class Lookups:
|
||||
|
|
|
@ -239,11 +239,7 @@ class ConfigSchemaNlp(BaseModel):
|
|||
pipeline: List[StrictStr] = Field(..., title="The pipeline component names in order")
|
||||
tokenizer: Callable = Field(..., title="The tokenizer to use")
|
||||
lemmatizer: Callable = Field(..., title="The lemmatizer to use")
|
||||
writing_system: ConfigSchemaNlpWritingSystem = Field(..., title="The language's writing system")
|
||||
stop_words: Sequence[StrictStr] = Field(..., title="Stop words to mark via Token/Lexeme.is_stop")
|
||||
lex_attr_getters: Dict[StrictStr, Callable] = Field(..., title="Custom getter functions for lexical attributes (e.g. like_num)")
|
||||
vocab_data: Dict[StrictStr, Dict[StrictStr, Any]] = Field(..., title="Vocabulary data, e.g. lexeme normalization tables")
|
||||
get_noun_chunks: Optional[Callable] = Field(..., title="Function to extract noun phrases from a Doc")
|
||||
# fmt: on
|
||||
|
||||
class Config:
|
||||
|
|
|
@ -257,7 +257,7 @@ def zh_tokenizer_char():
|
|||
def zh_tokenizer_jieba():
|
||||
pytest.importorskip("jieba")
|
||||
config = {
|
||||
"@tokenizers": "spacy.ChineseTokenizer.v1",
|
||||
"@tokenizers": "spacy.zh.ChineseTokenizer",
|
||||
"segmenter": "jieba",
|
||||
}
|
||||
nlp = get_lang_class("zh").from_config({"nlp": {"tokenizer": config}})
|
||||
|
@ -268,7 +268,7 @@ def zh_tokenizer_jieba():
|
|||
def zh_tokenizer_pkuseg():
|
||||
pytest.importorskip("pkuseg")
|
||||
config = {
|
||||
"@tokenizers": "spacy.ChineseTokenizer.v1",
|
||||
"@tokenizers": "spacy.zh.ChineseTokenizer",
|
||||
"segmenter": "pkuseg",
|
||||
"pkuseg_model": "default",
|
||||
}
|
||||
|
|
|
@ -26,37 +26,6 @@ from .attrs import intify_attrs
|
|||
from .symbols import ORTH
|
||||
|
||||
|
||||
@registry.tokenizers("spacy.Tokenizer.v1")
|
||||
def create_tokenizer(
|
||||
# exceptions: Dict[str, List[dict]],
|
||||
# prefixes: Optional[List[Union[str, Pattern]]],
|
||||
# suffixes: Optional[List[Union[str, Pattern]]],
|
||||
# infixes: Optional[List[Union[str, Pattern]]],
|
||||
# We currently can't validate against Pattern because that will cause
|
||||
# Pydantic to parse value *as* pattern
|
||||
token_match: Optional[Any] = None,
|
||||
url_match: Optional[Any] = None,
|
||||
) -> "Tokenizer":
|
||||
def tokenizer_factory(nlp):
|
||||
exceptions = nlp.Defaults.tokenizer_exceptions
|
||||
prefixes = nlp.Defaults.prefixes
|
||||
suffixes = nlp.Defaults.suffixes
|
||||
infixes = nlp.Defaults.infixes
|
||||
prefix_search = util.compile_prefix_regex(prefixes).search if prefixes else None
|
||||
suffix_search = util.compile_suffix_regex(suffixes).search if suffixes else None
|
||||
infix_finditer = util.compile_infix_regex(infixes).finditer if infixes else None
|
||||
return Tokenizer(
|
||||
nlp.vocab,
|
||||
rules=exceptions,
|
||||
prefix_search=prefix_search,
|
||||
suffix_search=suffix_search,
|
||||
infix_finditer=infix_finditer,
|
||||
token_match=token_match,
|
||||
url_match=url_match,
|
||||
)
|
||||
return tokenizer_factory
|
||||
|
||||
|
||||
cdef class Tokenizer:
|
||||
"""Segment text, and create Doc objects with the discovered segment
|
||||
boundaries.
|
||||
|
|
|
@ -23,6 +23,33 @@ from .lang.norm_exceptions import BASE_NORMS
|
|||
from .lang.lex_attrs import LEX_ATTRS, is_stop, get_lang
|
||||
|
||||
|
||||
def create_vocab(lang, defaults, lemmatizer=None, vocab_data={}, vectors_name=None):
|
||||
lex_attrs = {**LEX_ATTRS, **defaults.lex_attr_getters}
|
||||
# This is messy, but it's the minimal working fix to Issue #639.
|
||||
lex_attrs[IS_STOP] = functools.partial(is_stop, stops=defaults.stop_words)
|
||||
# Ensure that getter can be pickled
|
||||
lex_attrs[LANG] = functools.partial(get_lang, lang=lang)
|
||||
lex_attrs[NORM] = util.add_lookups(
|
||||
lex_attrs.get(NORM, LEX_ATTRS[NORM]),
|
||||
BASE_NORMS,
|
||||
vocab_data.get("lexeme_norm", {}),
|
||||
)
|
||||
lookups = Lookups()
|
||||
for name, data in vocab_data.items():
|
||||
if name not in lookups:
|
||||
data = data if data is not None else {}
|
||||
lookups.add_table(name, data)
|
||||
return Vocab(
|
||||
lex_attr_getters=lex_attrs,
|
||||
lemmatizer=lemmatizer,
|
||||
lookups=lookups,
|
||||
writing_system=defaults.writing_system,
|
||||
get_noun_chunks=defaults.syntax_iterators.get("noun_chunks"),
|
||||
vectors_name=vectors_name,
|
||||
)
|
||||
|
||||
|
||||
|
||||
cdef class Vocab:
|
||||
"""A look-up table that allows you to access `Lexeme` objects. The `Vocab`
|
||||
instance also provides access to the `StringStore`, and owns underlying
|
||||
|
@ -31,7 +58,7 @@ cdef class Vocab:
|
|||
DOCS: https://spacy.io/api/vocab
|
||||
"""
|
||||
def __init__(self, lex_attr_getters=None, lemmatizer=None,
|
||||
strings=tuple(), lookups=None, tag_map={}, vocab_data={},
|
||||
strings=tuple(), lookups=None, tag_map={},
|
||||
oov_prob=-20., vectors_name=None, writing_system={},
|
||||
get_noun_chunks=None, **deprecated_kwargs):
|
||||
"""Create the vocabulary.
|
||||
|
@ -51,10 +78,6 @@ cdef class Vocab:
|
|||
lex_attr_getters = lex_attr_getters if lex_attr_getters is not None else {}
|
||||
if lookups in (None, True, False):
|
||||
lookups = Lookups()
|
||||
for name, data in vocab_data.items():
|
||||
if name not in lookups:
|
||||
data = data if data is not None else {}
|
||||
lookups.add_table(name, data)
|
||||
if lemmatizer in (None, True, False):
|
||||
lemmatizer = Lemmatizer(lookups)
|
||||
self.cfg = {'oov_prob': oov_prob}
|
||||
|
@ -416,66 +439,6 @@ cdef class Vocab:
|
|||
orth = self.strings.add(orth)
|
||||
return orth in self.vectors
|
||||
|
||||
@classmethod
|
||||
def from_config(
|
||||
cls,
|
||||
config,
|
||||
lemmatizer=None,
|
||||
lex_attr_getters=None,
|
||||
stop_words=None,
|
||||
vocab_data=None,
|
||||
get_noun_chunks=None,
|
||||
vectors_name=None,
|
||||
):
|
||||
"""Create a Vocab from a config and (currently) language defaults, i.e.
|
||||
nlp.Defaults.
|
||||
|
||||
config (Dict[str, Any]): The full config.
|
||||
lemmatizer (Callable): Optional lemmatizer.
|
||||
vectors_name (str): Optional vectors name.
|
||||
RETURNS (Vocab): The vocab.
|
||||
"""
|
||||
# TODO: make this less messy – move lemmatizer out into its own pipeline
|
||||
# component, move language defaults to config
|
||||
lang = config["nlp"]["lang"]
|
||||
writing_system = config["nlp"]["writing_system"]
|
||||
if not lemmatizer:
|
||||
lemma_cfg = {"lemmatizer": config["nlp"]["lemmatizer"]}
|
||||
lemmatizer = registry.make_from_config(lemma_cfg)["lemmatizer"]
|
||||
if stop_words is None:
|
||||
stop_words_cfg = {"stop_words": config["nlp"]["stop_words"]}
|
||||
stop_words = registry.make_from_config(stop_words_cfg)["stop_words"]
|
||||
if vocab_data is None:
|
||||
vocab_data_cfg = {"vocab_data": config["nlp"]["vocab_data"]}
|
||||
vocab_data = registry.make_from_config(vocab_data_cfg)["vocab_data"]
|
||||
if get_noun_chunks is None:
|
||||
noun_chunks_cfg = {"get_noun_chunks": config["nlp"]["get_noun_chunks"]}
|
||||
get_noun_chunks = registry.make_from_config(noun_chunks_cfg)["get_noun_chunks"]
|
||||
if lex_attr_getters is None:
|
||||
lex_attrs_cfg = {"lex_attr_getters": config["nlp"]["lex_attr_getters"]}
|
||||
lex_attr_getters = registry.make_from_config(lex_attrs_cfg)["lex_attr_getters"]
|
||||
lex_attrs = dict(LEX_ATTRS)
|
||||
lex_attrs.update(lex_attr_getters)
|
||||
# This is messy, but it's the minimal working fix to Issue #639.
|
||||
lex_attrs[IS_STOP] = functools.partial(is_stop, stops=stop_words)
|
||||
# Ensure that getter can be pickled
|
||||
lex_attrs[LANG] = functools.partial(get_lang, lang=lang)
|
||||
lex_attrs[NORM] = util.add_lookups(
|
||||
lex_attrs.get(NORM, LEX_ATTRS[NORM]),
|
||||
BASE_NORMS,
|
||||
vocab_data.get("lexeme_norm", {}),
|
||||
)
|
||||
vocab = cls(
|
||||
lex_attr_getters=lex_attrs,
|
||||
vocab_data=vocab_data,
|
||||
lemmatizer=lemmatizer,
|
||||
writing_system=writing_system,
|
||||
get_noun_chunks=get_noun_chunks
|
||||
)
|
||||
if vocab.vectors.name is None and vectors_name:
|
||||
vocab.vectors.name = vectors_name
|
||||
return vocab
|
||||
|
||||
def to_disk(self, path, exclude=tuple()):
|
||||
"""Save the current state to a directory.
|
||||
|
||||
|
|