Mirror of https://github.com/explosion/spaCy.git

commit 38f6ea7a78 (parent 87737a5a60)
Simplify language data and revert detailed configs
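For orientation, the hunks below all apply the same change: the per-language DEFAULT_CONFIG strings and "@language_data" registry functions are removed, and the same data moves onto class attributes of a Language.Defaults subclass. The following is a minimal sketch of the new layout, modelled on the Afrikaans hunk below; the absolute import path is an assumption written for a standalone snippet (it presumes the usual spacy.lang.af module layout at this commit), not a literal quote from the diff.

# Sketch of the simplified per-language module (pattern taken from the diff below).
# The absolute import path is assumed for illustration; inside spaCy the module
# uses relative imports.
from spacy.lang.af.stop_words import STOP_WORDS
from spacy.language import Language


class AfrikaansDefaults(Language.Defaults):
    # Language data is now a plain class attribute instead of a registered
    # "@language_data" function referenced from a config string.
    stop_words = STOP_WORDS


class Afrikaans(Language):
    lang = "af"
    Defaults = AfrikaansDefaults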
@@ -1,24 +1,13 @@
 [nlp]
 lang = null
-stop_words = []
-lex_attr_getters = {}
 vocab_data = {}
-get_noun_chunks = null
 pipeline = []
 
 [nlp.tokenizer]
 @tokenizers = "spacy.Tokenizer.v1"
-token_match = null
-url_match = {"@language_data": "spacy.xx.url_match"}
 
 [nlp.lemmatizer]
 @lemmatizers = "spacy.Lemmatizer.v1"
-data = {}
-
-[nlp.writing_system]
-direction = "ltr"
-has_case = true
-has_letters = true
 
 [components]
 
@@ -1,26 +1,14 @@
-from typing import Set
-from thinc.api import Config
-
 from .stop_words import STOP_WORDS
 from ...language import Language
-from ...util import registry
 
 
-DEFAULT_CONFIG = """
-[nlp]
-lang = "af"
-stop_words = {"@language_data": "spacy.af.stop_words"}
-"""
-
-
-@registry.language_data("spacy.af.stop_words")
-def stop_words() -> Set[str]:
-    return STOP_WORDS
+class AfrikaansDefaults(Language.Defaults):
+    stop_words = STOP_WORDS
 
 
 class Afrikaans(Language):
     lang = "af"
-    default_config = Config().from_str(DEFAULT_CONFIG)
+    Defaults = AfrikaansDefaults
 
 
 __all__ = ["Afrikaans"]
@@ -1,46 +1,21 @@
-from typing import Set, Dict, Callable, Any
-from thinc.api import Config
-
 from .stop_words import STOP_WORDS
 from .lex_attrs import LEX_ATTRS
 from .punctuation import TOKENIZER_SUFFIXES
 from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
 from ...language import Language
-from ...util import registry
-
-
-DEFAULT_CONFIG = """
-[nlp]
-lang = "ar"
-stop_words = {"@language_data": "spacy.ar.stop_words"}
-lex_attr_getters = {"@language_data": "spacy.ar.lex_attr_getters"}
-
-[nlp.writing_system]
-direction = "rtl"
-has_case = false
-has_letters = true
-"""
-
-
-@registry.language_data("spacy.ar.stop_words")
-def stop_words() -> Set[str]:
-    return STOP_WORDS
-
-
-@registry.language_data("spacy.ar.lex_attr_getters")
-def lex_attr_getters() -> Dict[int, Callable[[str], Any]]:
-    return LEX_ATTRS
 
 
 class ArabicDefaults(Language.Defaults):
     tokenizer_exceptions = TOKENIZER_EXCEPTIONS
     suffixes = TOKENIZER_SUFFIXES
+    stop_words = STOP_WORDS
+    lex_attr_getters = LEX_ATTRS
+    writing_system = {"direction": "rtl", "has_case": False, "has_letters": True}
 
 
 class Arabic(Language):
-    lang = "ar"
     Defaults = ArabicDefaults
-    default_config = Config().from_str(DEFAULT_CONFIG)
+    lang = "ar"
 
 
 __all__ = ["Arabic"]
@@ -1,26 +1,14 @@
-from typing import Set
-from thinc.api import Config
-
 from .stop_words import STOP_WORDS
 from ...language import Language
-from ...util import registry
 
 
-DEFAULT_CONFIG = """
-[nlp]
-lang = "bg"
-stop_words = {"@language_data": "spacy.bg.stop_words"}
-"""
-
-
-@registry.language_data("spacy.bg.stop_words")
-def stop_words() -> Set[str]:
-    return STOP_WORDS
+class BulgarianDefaults(Language.Defaults):
+    stop_words = STOP_WORDS
 
 
 class Bulgarian(Language):
     lang = "bg"
-    default_config = Config().from_str(DEFAULT_CONFIG)
+    Defaults = BulgarianDefaults
 
 
 __all__ = ["Bulgarian"]
@@ -1,31 +1,7 @@
-from typing import Set
-from thinc.api import Config
-
 from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
 from .punctuation import TOKENIZER_PREFIXES, TOKENIZER_SUFFIXES, TOKENIZER_INFIXES
 from .stop_words import STOP_WORDS
 from ...language import Language
-from ...util import registry
-
-
-DEFAULT_CONFIG = """
-[nlp]
-lang = "bn"
-stop_words = {"@language_data": "spacy.bn.stop_words"}
-
-[nlp.lemmatizer]
-@lemmatizers = "spacy.Lemmatizer.v1"
-
-[nlp.lemmatizer.data]
-@language_data = "spacy-lookups-data"
-lang = ${nlp:lang}
-tables = ["lemma_rules"]
-"""
-
-
-@registry.language_data("spacy.bn.stop_words")
-def stop_words() -> Set[str]:
-    return STOP_WORDS
 
 
 class BengaliDefaults(Language.Defaults):
@@ -33,12 +9,12 @@ class BengaliDefaults(Language.Defaults):
     prefixes = TOKENIZER_PREFIXES
     suffixes = TOKENIZER_SUFFIXES
     infixes = TOKENIZER_INFIXES
+    stop_words = STOP_WORDS
 
 
 class Bengali(Language):
     lang = "bn"
     Defaults = BengaliDefaults
-    default_config = Config().from_str(DEFAULT_CONFIG)
 
 
 __all__ = ["Bengali"]
@@ -1,49 +1,20 @@
-from typing import Set, Dict, Callable, Any
-from thinc.api import Config
-
 from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
+from .punctuation import TOKENIZER_INFIXES
 from .stop_words import STOP_WORDS
 from .lex_attrs import LEX_ATTRS
 from ...language import Language
-from ...util import registry
-from .punctuation import TOKENIZER_INFIXES
-
-
-DEFAULT_CONFIG = """
-[nlp]
-lang = "ca"
-stop_words = {"@language_data": "spacy.ca.stop_words"}
-lex_attr_getters = {"@language_data": "spacy.ca.lex_attr_getters"}
-
-[nlp.lemmatizer]
-@lemmatizers = "spacy.Lemmatizer.v1"
-
-[nlp.lemmatizer.data]
-@language_data = "spacy-lookups-data"
-lang = ${nlp:lang}
-tables = ["lemma_lookup"]
-"""
-
-
-@registry.language_data("spacy.ca.stop_words")
-def stop_words() -> Set[str]:
-    return STOP_WORDS
-
-
-@registry.language_data("spacy.ca.lex_attr_getters")
-def lex_attr_getters() -> Dict[int, Callable[[str], Any]]:
-    return LEX_ATTRS
 
 
 class CatalanDefaults(Language.Defaults):
     tokenizer_exceptions = TOKENIZER_EXCEPTIONS
     infixes = TOKENIZER_INFIXES
+    stop_words = STOP_WORDS
+    lex_attr_getters = LEX_ATTRS
 
 
 class Catalan(Language):
     lang = "ca"
     Defaults = CatalanDefaults
-    default_config = Config().from_str(DEFAULT_CONFIG)
 
 
 __all__ = ["Catalan"]
@@ -1,26 +1,14 @@
-from typing import Set
-from thinc.api import Config
-
 from .stop_words import STOP_WORDS
 from ...language import Language
-from ...util import registry
 
 
-DEFAULT_CONFIG = """
-[nlp]
-lang = "cs"
-stop_words = {"@language_data": "spacy.cs.stop_words"}
-"""
-
-
-@registry.language_data("spacy.cs.stop_words")
-def stop_words() -> Set[str]:
-    return STOP_WORDS
+class CzechDefaults(Language.Defaults):
+    stop_words = STOP_WORDS
 
 
 class Czech(Language):
     lang = "cs"
-    default_config = Config().from_str(DEFAULT_CONFIG)
+    Defaults = CzechDefaults
 
 
 __all__ = ["Czech"]
@@ -1,55 +1,21 @@
-from typing import Set, Dict, Callable, Any
-from thinc.api import Config
-
 from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
 from .punctuation import TOKENIZER_INFIXES, TOKENIZER_SUFFIXES
 from .stop_words import STOP_WORDS
 from .lex_attrs import LEX_ATTRS
 from ...language import Language
-from ...util import registry
-
-
-DEFAULT_CONFIG = """
-[nlp]
-lang = "da"
-stop_words = {"@language_data": "spacy.da.stop_words"}
-lex_attr_getters = {"@language_data": "spacy.da.lex_attr_getters"}
-
-[nlp.lemmatizer]
-@lemmatizers = "spacy.Lemmatizer.v1"
-
-[nlp.lemmatizer.data]
-@language_data = "spacy-lookups-data"
-lang = ${nlp:lang}
-tables = ["lemma_lookup"]
-
-[nlp.vocab_data]
-@language_data = "spacy-lookups-data"
-lang = ${nlp:lang}
-tables = ["lexeme_norm"]
-"""
-
-
-@registry.language_data("spacy.da.stop_words")
-def stop_words() -> Set[str]:
-    return STOP_WORDS
-
-
-@registry.language_data("spacy.da.lex_attr_getters")
-def lex_attr_getters() -> Dict[int, Callable[[str], Any]]:
-    return LEX_ATTRS
 
 
 class DanishDefaults(Language.Defaults):
     tokenizer_exceptions = TOKENIZER_EXCEPTIONS
     infixes = TOKENIZER_INFIXES
     suffixes = TOKENIZER_SUFFIXES
+    lex_attr_getters = LEX_ATTRS
+    stop_words = STOP_WORDS
 
 
 class Danish(Language):
     lang = "da"
     Defaults = DanishDefaults
-    default_config = Config().from_str(DEFAULT_CONFIG)
 
 
 __all__ = ["Danish"]
@@ -1,44 +1,8 @@
-from typing import Set, Callable
-from thinc.api import Config
-
 from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
-from .punctuation import TOKENIZER_PREFIXES, TOKENIZER_SUFFIXES
-from .punctuation import TOKENIZER_INFIXES
+from .punctuation import TOKENIZER_PREFIXES, TOKENIZER_SUFFIXES, TOKENIZER_INFIXES
 from .stop_words import STOP_WORDS
-from .syntax_iterators import noun_chunks
+from .syntax_iterators import SYNTAX_ITERATORS
 from ...language import Language
-from ...util import registry
-
-
-DEFAULT_CONFIG = """
-[nlp]
-lang = "de"
-stop_words = {"@language_data": "spacy.de.stop_words"}
-get_noun_chunks = {"@language_data": "spacy.de.get_noun_chunks"}
-
-[nlp.lemmatizer]
-@lemmatizers = "spacy.Lemmatizer.v1"
-
-[nlp.lemmatizer.data]
-@language_data = "spacy-lookups-data"
-lang = ${nlp:lang}
-tables = ["lemma_lookup"]
-
-[nlp.vocab_data]
-@language_data = "spacy-lookups-data"
-lang = ${nlp:lang}
-tables = ["lexeme_norm", "lexeme_cluster", "lexeme_prob", "lexeme_settings", "orth_variants"]
-"""
-
-
-@registry.language_data("spacy.de.stop_words")
-def stop_words() -> Set[str]:
-    return STOP_WORDS
-
-
-@registry.language_data("spacy.de.get_noun_chunks")
-def get_noun_chunks() -> Callable:
-    return noun_chunks
 
 
 class GermanDefaults(Language.Defaults):
@@ -46,12 +10,13 @@ class GermanDefaults(Language.Defaults):
     prefixes = TOKENIZER_PREFIXES
     suffixes = TOKENIZER_SUFFIXES
     infixes = TOKENIZER_INFIXES
+    syntax_iterators = SYNTAX_ITERATORS
+    stop_words = STOP_WORDS
 
 
 class German(Language):
     lang = "de"
     Defaults = GermanDefaults
-    default_config = Config().from_str(DEFAULT_CONFIG)
 
 
 __all__ = ["German"]
@@ -1,9 +0,0 @@
-from typing import Pattern
-
-from .tokenizer_exceptions import URL_MATCH
-from ..util import registry
-
-
-@registry.language_data("spacy.xx.url_match")
-def url_match() -> Pattern:
-    return URL_MATCH
@@ -1,69 +1,50 @@
-from typing import Set, Dict, Callable, Any
+from typing import Callable
 from thinc.api import Config
 
 from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
 from .stop_words import STOP_WORDS
 from .lex_attrs import LEX_ATTRS
 from .lemmatizer import GreekLemmatizer
-from .syntax_iterators import noun_chunks
+from .syntax_iterators import SYNTAX_ITERATORS
 from .punctuation import TOKENIZER_PREFIXES, TOKENIZER_SUFFIXES, TOKENIZER_INFIXES
+from ...lookups import load_lookups
 from ...language import Language
 from ...util import registry
 
 
 DEFAULT_CONFIG = """
 [nlp]
-lang = "el"
-stop_words = {"@language_data": "spacy.el.stop_words"}
-lex_attr_getters = {"@language_data": "spacy.el.lex_attr_getters"}
-get_noun_chunks = {"@language_data": "spacy.el.get_noun_chunks"}
 
 [nlp.lemmatizer]
-@lemmatizers = "spacy.GreekLemmatizer.v1"
-
-[nlp.lemmatizer.data]
-@language_data = "spacy-lookups-data"
-lang = ${nlp:lang}
-tables = ["lemma_index", "lemma_exc", "lemma_rules"]
-
-[nlp.vocab_data]
-@language_data = "spacy-lookups-data"
-lang = ${nlp:lang}
-tables = ["lexeme_norm", "lexeme_prob", "lexeme_settings"]
+@lemmatizers = "spacy.el.GreekLemmatizer"
 """
 
 
-@registry.lemmatizers("spacy.GreekLemmatizer.v1")
-def create_greek_lemmatizer(data: Dict[str, dict] = {}) -> GreekLemmatizer:
-    return GreekLemmatizer(data=data)
-
-
-@registry.language_data("spacy.el.get_noun_chunks")
-def get_noun_chunks() -> Callable:
-    return noun_chunks
-
-
-@registry.language_data("spacy.el.stop_words")
-def stop_words() -> Set[str]:
-    return STOP_WORDS
-
-
-@registry.language_data("spacy.el.lex_attr_getters")
-def lex_attr_getters() -> Dict[int, Callable[[str], Any]]:
-    return LEX_ATTRS
+@registry.lemmatizers("spacy.el.GreekLemmatizer")
+def create_lemmatizer() -> Callable[[Language], GreekLemmatizer]:
+    tables = ["lemma_index", "lemma_exc", "lemma_rules"]
+
+    def lemmatizer_factory(nlp: Language) -> GreekLemmatizer:
+        lookups = load_lookups(lang=nlp.lang, tables=tables)
+        return GreekLemmatizer(lookups=lookups)
+
+    return lemmatizer_factory
 
 
 class GreekDefaults(Language.Defaults):
+    config = Config().from_str(DEFAULT_CONFIG)
     tokenizer_exceptions = TOKENIZER_EXCEPTIONS
     prefixes = TOKENIZER_PREFIXES
     suffixes = TOKENIZER_SUFFIXES
     infixes = TOKENIZER_INFIXES
+    lex_attr_getters = LEX_ATTRS
+    stop_words = STOP_WORDS
+    syntax_iterators = SYNTAX_ITERATORS
 
 
 class Greek(Language):
     lang = "el"
     Defaults = GreekDefaults
-    default_config = Config().from_str(DEFAULT_CONFIG)
 
 
 __all__ = ["Greek"]
@@ -1,68 +1,49 @@
-from typing import Set, Dict, Callable, Any
+from typing import Callable
 from thinc.api import Config
 
 from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
 from .stop_words import STOP_WORDS
 from .lex_attrs import LEX_ATTRS
-from .syntax_iterators import noun_chunks
+from .syntax_iterators import SYNTAX_ITERATORS
 from .lemmatizer import is_base_form
 from .punctuation import TOKENIZER_INFIXES
 from ...language import Language
 from ...lemmatizer import Lemmatizer
+from ...lookups import load_lookups
 from ...util import registry
 
 
 DEFAULT_CONFIG = """
 [nlp]
-lang = "en"
-stop_words = {"@language_data": "spacy.en.stop_words"}
-lex_attr_getters = {"@language_data": "spacy.en.lex_attr_getters"}
-get_noun_chunks = {"@language_data": "spacy.en.get_noun_chunks"}
 
 [nlp.lemmatizer]
-@lemmatizers = "spacy.EnglishLemmatizer.v1"
-
-[nlp.lemmatizer.data]
-@language_data = "spacy-lookups-data"
-lang = ${nlp:lang}
-tables = ["lemma_lookup", "lemma_rules", "lemma_exc", "lemma_index"]
-
-[nlp.vocab_data]
-@language_data = "spacy-lookups-data"
-lang = ${nlp:lang}
-tables = ["lexeme_norm", "lexeme_cluster", "lexeme_prob", "lexeme_settings", "orth_variants"]
+@lemmatizers = "spacy.en.EnglishLemmatizer"
 """
 
 
-@registry.language_data("spacy.en.stop_words")
-def stop_words() -> Set[str]:
-    return STOP_WORDS
-
-
-@registry.language_data("spacy.en.lex_attr_getters")
-def lex_attr_getters() -> Dict[int, Callable[[str], Any]]:
-    return LEX_ATTRS
-
-
-@registry.lemmatizers("spacy.EnglishLemmatizer.v1")
-def create_lemmatizer(data: Dict[str, dict] = {}) -> "Lemmatizer":
-    return Lemmatizer(data=data, is_base_form=is_base_form)
-
-
-@registry.language_data("spacy.en.get_noun_chunks")
-def get_noun_chunks() -> Callable:
-    return noun_chunks
+@registry.lemmatizers("spacy.en.EnglishLemmatizer")
+def create_lemmatizer() -> Callable[[Language], Lemmatizer]:
+    tables = ["lemma_lookup", "lemma_rules", "lemma_exc", "lemma_index"]
+
+    def lemmatizer_factory(nlp: Language) -> Lemmatizer:
+        lookups = load_lookups(lang=nlp.lang, tables=tables)
+        return Lemmatizer(lookups=lookups, is_base_form=is_base_form)
+
+    return lemmatizer_factory
 
 
 class EnglishDefaults(Language.Defaults):
+    config = Config().from_str(DEFAULT_CONFIG)
     tokenizer_exceptions = TOKENIZER_EXCEPTIONS
     infixes = TOKENIZER_INFIXES
+    lex_attr_getters = LEX_ATTRS
+    syntax_iterators = SYNTAX_ITERATORS
+    stop_words = STOP_WORDS
 
 
 class English(Language):
     lang = "en"
     Defaults = EnglishDefaults
-    default_config = Config().from_str(DEFAULT_CONFIG)
 
 
 __all__ = ["English"]
@@ -1,62 +1,23 @@
-from typing import Set, Dict, Callable, Any
-from thinc.config import Config
-
 from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
 from .stop_words import STOP_WORDS
 from .lex_attrs import LEX_ATTRS
-from .syntax_iterators import noun_chunks
+from .syntax_iterators import SYNTAX_ITERATORS
 from .punctuation import TOKENIZER_INFIXES, TOKENIZER_SUFFIXES
 from ...language import Language
-from ...util import registry
-
-
-DEFAULT_CONFIG = """
-[nlp]
-lang = "es"
-stop_words = {"@language_data": "spacy.es.stop_words"}
-lex_attr_getters = {"@language_data": "spacy.es.lex_attr_getters"}
-get_noun_chunks = {"@language_data": "spacy.es.get_noun_chunks"}
-
-[nlp.lemmatizer]
-@lemmatizers = "spacy.Lemmatizer.v1"
-
-[nlp.lemmatizer.data]
-@language_data = "spacy-lookups-data"
-lang = ${nlp:lang}
-tables = ["lemma_lookup"]
-
-[nlp.vocab_data]
-@language_data = "spacy-lookups-data"
-lang = ${nlp:lang}
-tables = ["lexeme_cluster", "lexeme_prob", "lexeme_settings"]
-"""
-
-
-@registry.language_data("spacy.es.get_noun_chunks")
-def get_noun_chunks() -> Callable:
-    return noun_chunks
-
-
-@registry.language_data("spacy.es.stop_words")
-def stop_words() -> Set[str]:
-    return STOP_WORDS
-
-
-@registry.language_data("spacy.es.lex_attr_getters")
-def lex_attr_getters() -> Dict[int, Callable[[str], Any]]:
-    return LEX_ATTRS
 
 
 class SpanishDefaults(Language.Defaults):
     tokenizer_exceptions = TOKENIZER_EXCEPTIONS
     infixes = TOKENIZER_INFIXES
     suffixes = TOKENIZER_SUFFIXES
+    lex_attr_getters = LEX_ATTRS
+    syntax_iterators = SYNTAX_ITERATORS
+    stop_words = STOP_WORDS
 
 
 class Spanish(Language):
     lang = "es"
     Defaults = SpanishDefaults
-    default_config = Config().from_str(DEFAULT_CONFIG)
 
 
 __all__ = ["Spanish"]
@@ -1,26 +1,14 @@
-from typing import Set
-from thinc.api import Config
-
 from .stop_words import STOP_WORDS
 from ...language import Language
-from ...util import registry
 
 
-DEFAULT_CONFIG = """
-[nlp]
-lang = "et"
-stop_words = {"@language_data": "spacy.et.stop_words"}
-"""
-
-
-@registry.language_data("spacy.et.stop_words")
-def stop_words() -> Set[str]:
-    return STOP_WORDS
+class EstonianDefaults(Language.Defaults):
+    stop_words = STOP_WORDS
 
 
 class Estonian(Language):
     lang = "et"
-    default_config = Config().from_str(DEFAULT_CONFIG)
+    Defaults = EstonianDefaults
 
 
 __all__ = ["Estonian"]
@@ -1,41 +1,18 @@
-from typing import Set, Dict, Callable, Any
-from thinc.api import Config
-
 from .stop_words import STOP_WORDS
 from .lex_attrs import LEX_ATTRS
 from .punctuation import TOKENIZER_SUFFIXES
-from ..tokenizer_exceptions import BASE_EXCEPTIONS
 from ...language import Language
-from ...util import registry
-
-
-DEFAULT_CONFIG = """
-[nlp]
-lang = "eu"
-stop_words = {"@language_data": "spacy.eu.stop_words"}
-lex_attr_getters = {"@language_data": "spacy.eu.lex_attr_getters"}
-"""
-
-
-@registry.language_data("spacy.eu.stop_words")
-def stop_words() -> Set[str]:
-    return STOP_WORDS
-
-
-@registry.language_data("spacy.eu.lex_attr_getters")
-def lex_attr_getters() -> Dict[int, Callable[[str], Any]]:
-    return LEX_ATTRS
 
 
 class BasqueDefaults(Language.Defaults):
-    tokenizer_exceptions = BASE_EXCEPTIONS
     suffixes = TOKENIZER_SUFFIXES
+    stop_words = STOP_WORDS
+    lex_attr_getters = LEX_ATTRS
 
 
 class Basque(Language):
     lang = "eu"
     Defaults = BasqueDefaults
-    default_config = Config().from_str(DEFAULT_CONFIG)
 
 
 __all__ = ["Basque"]
@@ -1,61 +1,23 @@
-from typing import Set, Dict, Callable, Any
-from thinc.api import Config
-
-from ...language import Language
-from ...util import registry
 from .stop_words import STOP_WORDS
 from .lex_attrs import LEX_ATTRS
 from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
 from .punctuation import TOKENIZER_SUFFIXES
-from .syntax_iterators import noun_chunks
-
-DEFAULT_CONFIG = """
-[nlp]
-lang = "fa"
-stop_words = {"@language_data": "spacy.fa.stop_words"}
-lex_attr_getters = {"@language_data": "spacy.fa.lex_attr_getters"}
-get_noun_chunks = {"@language_data": "spacy.de.get_noun_chunks"}
-
-[nlp.writing_system]
-direction = "rtl"
-has_case = false
-has_letters = true
-
-[nlp.lemmatizer]
-@lemmatizers = "spacy.Lemmatizer.v1"
-
-[nlp.lemmatizer.data]
-@language_data = "spacy-lookups-data"
-lang = ${nlp:lang}
-tables = ["lemma_rules", "lemma_index", "lemma_exc"]
-"""
-
-
-@registry.language_data("spacy.fa.stop_words")
-def stop_words() -> Set[str]:
-    return STOP_WORDS
-
-
-@registry.language_data("spacy.fa.lex_attr_getters")
-def lex_attr_getters() -> Dict[int, Callable[[str], Any]]:
-    return LEX_ATTRS
-
-
-@registry.language_data("spacy.fa.get_noun_chunks")
-def get_noun_chunks() -> Callable:
-    return noun_chunks
+from .syntax_iterators import SYNTAX_ITERATORS
+from ...language import Language
 
 
 class PersianDefaults(Language.Defaults):
     tokenizer_exceptions = TOKENIZER_EXCEPTIONS
     suffixes = TOKENIZER_SUFFIXES
+    lex_attr_getters = LEX_ATTRS
+    syntax_iterators = SYNTAX_ITERATORS
+    stop_words = STOP_WORDS
+    writing_system = {"direction": "rtl", "has_case": False, "has_letters": True}
 
 
 class Persian(Language):
     lang = "fa"
     Defaults = PersianDefaults
-    default_config = Config().from_str(DEFAULT_CONFIG)
 
 
 __all__ = ["Persian"]
@@ -1,42 +1,21 @@
-from typing import Set, Dict, Callable, Any
-from thinc.api import Config
-
 from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
 from .stop_words import STOP_WORDS
 from .lex_attrs import LEX_ATTRS
 from .punctuation import TOKENIZER_INFIXES, TOKENIZER_SUFFIXES
 from ...language import Language
-from ...util import registry
-
-
-DEFAULT_CONFIG = """
-[nlp]
-lang = "fi"
-stop_words = {"@language_data": "spacy.fi.stop_words"}
-lex_attr_getters = {"@language_data": "spacy.fi.lex_attr_getters"}
-"""
-
-
-@registry.language_data("spacy.fi.stop_words")
-def stop_words() -> Set[str]:
-    return STOP_WORDS
-
-
-@registry.language_data("spacy.fi.lex_attr_getters")
-def lex_attr_getters() -> Dict[int, Callable[[str], Any]]:
-    return LEX_ATTRS
 
 
 class FinnishDefaults(Language.Defaults):
     infixes = TOKENIZER_INFIXES
     suffixes = TOKENIZER_SUFFIXES
     tokenizer_exceptions = TOKENIZER_EXCEPTIONS
+    lex_attr_getters = LEX_ATTRS
+    stop_words = STOP_WORDS
 
 
 class Finnish(Language):
     lang = "fi"
     Defaults = FinnishDefaults
-    default_config = Config().from_str(DEFAULT_CONFIG)
 
 
 __all__ = ["Finnish"]
@@ -1,4 +1,4 @@
-from typing import Set, Dict, Callable, Any, Pattern
+from typing import Callable
 from thinc.api import Config
 
 from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS, TOKEN_MATCH
@@ -6,69 +6,47 @@ from .punctuation import TOKENIZER_PREFIXES, TOKENIZER_INFIXES
 from .punctuation import TOKENIZER_SUFFIXES
 from .stop_words import STOP_WORDS
 from .lex_attrs import LEX_ATTRS
+from .syntax_iterators import SYNTAX_ITERATORS
 from .lemmatizer import FrenchLemmatizer, is_base_form
-from .syntax_iterators import noun_chunks
+from ...lookups import load_lookups
 from ...language import Language
 from ...util import registry
 
 
 DEFAULT_CONFIG = """
 [nlp]
-lang = "fr"
-stop_words = {"@language_data": "spacy.fr.stop_words"}
-lex_attr_getters = {"@language_data": "spacy.fr.lex_attr_getters"}
-get_noun_chunks = {"@language_data": "spacy.fr.get_noun_chunks"}
-
-[nlp.tokenizer]
-@tokenizers = "spacy.Tokenizer.v1"
-token_match = {"@language_data": "spacy.fr.token_match"}
 
 [nlp.lemmatizer]
-@lemmatizers = "spacy.FrenchLemmatizer.v1"
-
-[nlp.lemmatizer.data]
-@language_data = "spacy-lookups-data"
-lang = ${nlp:lang}
-tables = ["lemma_rules", "lemma_index", "lemma_exc", "lemma_lookup"]
+@lemmatizers = "spacy.fr.FrenchLemmatizer"
 """
 
 
-@registry.lemmatizers("spacy.FrenchLemmatizer.v1")
-def create_french_lemmatizer(data: Dict[str, dict] = {}) -> FrenchLemmatizer:
-    return FrenchLemmatizer(data=data, is_base_form=is_base_form)
-
-
-@registry.language_data("spacy.fr.token_match")
-def token_match() -> Pattern:
-    return TOKEN_MATCH
-
-
-@registry.language_data("spacy.fr.stop_words")
-def stop_words() -> Set[str]:
-    return STOP_WORDS
-
-
-@registry.language_data("spacy.fr.lex_attr_getters")
-def lex_attr_getters() -> Dict[int, Callable[[str], Any]]:
-    return LEX_ATTRS
-
-
-@registry.language_data("spacy.fr.get_noun_chunks")
-def get_noun_chunks() -> Callable:
-    return noun_chunks
+@registry.lemmatizers("spacy.fr.FrenchLemmatizer")
+def create_lemmatizer() -> Callable[[Language], FrenchLemmatizer]:
+    tables = ["lemma_rules", "lemma_index", "lemma_exc", "lemma_lookup"]
+
+    def lemmatizer_factory(nlp: Language) -> FrenchLemmatizer:
+        lookups = load_lookups(lang=nlp.lang, tables=tables)
+        return FrenchLemmatizer(lookups=lookups, is_base_form=is_base_form)
+
+    return lemmatizer_factory
 
 
 class FrenchDefaults(Language.Defaults):
+    config = Config().from_str(DEFAULT_CONFIG)
     tokenizer_exceptions = TOKENIZER_EXCEPTIONS
     prefixes = TOKENIZER_PREFIXES
     infixes = TOKENIZER_INFIXES
     suffixes = TOKENIZER_SUFFIXES
+    token_match = TOKEN_MATCH
+    lex_attr_getters = LEX_ATTRS
+    syntax_iterators = SYNTAX_ITERATORS
+    stop_words = STOP_WORDS
 
 
 class French(Language):
     lang = "fr"
     Defaults = FrenchDefaults
-    default_config = Config().from_str(DEFAULT_CONFIG)
 
 
 __all__ = ["French"]
@@ -1,32 +1,16 @@
-from typing import Set
-from thinc.api import Config
-
 from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
 from .stop_words import STOP_WORDS
 from ...language import Language
-from ...util import registry
-
-
-DEFAULT_CONFIG = """
-[nlp]
-lang = "ga"
-stop_words = {"@language_data": "spacy.ga.stop_words"}
-"""
-
-
-@registry.language_data("spacy.ga.stop_words")
-def stop_words() -> Set[str]:
-    return STOP_WORDS
 
 
 class IrishDefaults(Language.Defaults):
     tokenizer_exceptions = TOKENIZER_EXCEPTIONS
+    stop_words = STOP_WORDS
 
 
 class Irish(Language):
     lang = "ga"
     Defaults = IrishDefaults
-    default_config = Config().from_str(DEFAULT_CONFIG)
 
 
 __all__ = ["Irish"]
@@ -1,26 +1,14 @@
-from typing import Set
-from thinc.api import Config
-
 from .stop_words import STOP_WORDS
 from ...language import Language
-from ...util import registry
 
 
-DEFAULT_CONFIG = """
-[nlp]
-lang = "gu"
-stop_words = {"@language_data": "spacy.gu.stop_words"}
-"""
-
-
-@registry.language_data("spacy.gu.stop_words")
-def stop_words() -> Set[str]:
-    return STOP_WORDS
+class GujaratiDefaults(Language.Defaults):
+    stop_words = STOP_WORDS
 
 
 class Gujarati(Language):
     lang = "gu"
-    default_config = Config().from_str(DEFAULT_CONFIG)
+    Defaults = GujaratiDefaults
 
 
 __all__ = ["Gujarati"]
@@ -1,37 +1,15 @@
-from typing import Set
-from thinc.api import Config
-
 from .stop_words import STOP_WORDS
-from ..tokenizer_exceptions import BASE_EXCEPTIONS
 from ...language import Language
-from ...util import registry
-
-
-DEFAULT_CONFIG = """
-[nlp]
-lang = "he"
-stop_words = {"@language_data": "spacy.he.stop_words"}
-
-[nlp.writing_system]
-direction = "rtl"
-has_case = false
-has_letters = true
-"""
-
-
-@registry.language_data("spacy.he.stop_words")
-def stop_words() -> Set[str]:
-    return STOP_WORDS
 
 
 class HebrewDefaults(Language.Defaults):
-    tokenizer_exceptions = BASE_EXCEPTIONS
+    stop_words = STOP_WORDS
+    writing_system = {"direction": "rtl", "has_case": False, "has_letters": True}
 
 
 class Hebrew(Language):
     lang = "he"
     Defaults = HebrewDefaults
-    default_config = Config().from_str(DEFAULT_CONFIG)
 
 
 __all__ = ["Hebrew"]
@@ -1,33 +1,16 @@
-from typing import Set, Dict, Callable, Any
-from thinc.api import Config
-
 from .stop_words import STOP_WORDS
 from .lex_attrs import LEX_ATTRS
 from ...language import Language
-from ...util import registry
 
 
-DEFAULT_CONFIG = """
-[nlp]
-lang = "hi"
-stop_words = {"@language_data": "spacy.hi.stop_words"}
-lex_attr_getters = {"@language_data": "spacy.hi.lex_attr_getters"}
-"""
-
-
-@registry.language_data("spacy.hi.stop_words")
-def stop_words() -> Set[str]:
-    return STOP_WORDS
-
-
-@registry.language_data("spacy.hi.lex_attr_getters")
-def lex_attr_getters() -> Dict[int, Callable[[str], Any]]:
-    return LEX_ATTRS
+class HindiDefaults(Language.Defaults):
+    stop_words = STOP_WORDS
+    lex_attr_getters = LEX_ATTRS
 
 
 class Hindi(Language):
     lang = "hi"
-    default_config = Config().from_str(DEFAULT_CONFIG)
+    Defaults = HindiDefaults
 
 
 __all__ = ["Hindi"]
@@ -1,40 +1,14 @@
-from typing import Set
-from thinc.api import Config
-
 from .stop_words import STOP_WORDS
-from ..tokenizer_exceptions import BASE_EXCEPTIONS
 from ...language import Language
-from ...util import registry
-
-
-DEFAULT_CONFIG = """
-[nlp]
-lang = "hr"
-stop_words = {"@language_data": "spacy.hr.stop_words"}
-
-[nlp.lemmatizer]
-@lemmatizers = "spacy.Lemmatizer.v1"
-
-[nlp.lemmatizer.data]
-@language_data = "spacy-lookups-data"
-lang = ${nlp:lang}
-tables = ["lemma_lookup"]
-"""
-
-
-@registry.language_data("spacy.hr.stop_words")
-def stop_words() -> Set[str]:
-    return STOP_WORDS
 
 
 class CroatianDefaults(Language.Defaults):
-    tokenizer_exceptions = BASE_EXCEPTIONS
+    stop_words = STOP_WORDS
 
 
 class Croatian(Language):
     lang = "hr"
     Defaults = CroatianDefaults
-    default_config = Config().from_str(DEFAULT_CONFIG)
 
 
 __all__ = ["Croatian"]
@@ -1,40 +1,7 @@
-from typing import Set, Pattern
-from thinc.api import Config
-
 from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS, TOKEN_MATCH
 from .punctuation import TOKENIZER_PREFIXES, TOKENIZER_SUFFIXES, TOKENIZER_INFIXES
 from .stop_words import STOP_WORDS
 from ...language import Language
-from ...util import registry
-
-
-DEFAULT_CONFIG = """
-[nlp]
-lang = "hu"
-stop_words = {"@language_data": "spacy.hu.stop_words"}
-
-[nlp.tokenizer]
-@tokenizers = "spacy.Tokenizer.v1"
-token_match = {"@language_data": "spacy.hu.token_match"}
-
-[nlp.lemmatizer]
-@lemmatizers = "spacy.Lemmatizer.v1"
-
-[nlp.lemmatizer.data]
-@language_data = "spacy-lookups-data"
-lang = ${nlp:lang}
-tables = ["lemma_lookup"]
-"""
-
-
-@registry.language_data("spacy.hu.stop_words")
-def stop_words() -> Set[str]:
-    return STOP_WORDS
-
-
-@registry.language_data("spacy.hu.token_match")
-def token_match() -> Pattern:
-    return TOKEN_MATCH
 
 
 class HungarianDefaults(Language.Defaults):
@@ -42,12 +9,13 @@ class HungarianDefaults(Language.Defaults):
     prefixes = TOKENIZER_PREFIXES
     suffixes = TOKENIZER_SUFFIXES
     infixes = TOKENIZER_INFIXES
+    token_match = TOKEN_MATCH
+    stop_words = STOP_WORDS
 
 
 class Hungarian(Language):
     lang = "hu"
     Defaults = HungarianDefaults
-    default_config = Config().from_str(DEFAULT_CONFIG)
 
 
 __all__ = ["Hungarian"]
@@ -1,33 +1,16 @@
-from typing import Set, Dict, Callable, Any
-from thinc.api import Config
-
 from .stop_words import STOP_WORDS
 from .lex_attrs import LEX_ATTRS
 from ...language import Language
-from ...util import registry
 
 
-DEFAULT_CONFIG = """
-[nlp]
-lang = "hy"
-stop_words = {"@language_data": "spacy.hy.stop_words"}
-lex_attr_getters = {"@language_data": "spacy.hy.lex_attr_getters"}
-"""
-
-
-@registry.language_data("spacy.hy.stop_words")
-def stop_words() -> Set[str]:
-    return STOP_WORDS
-
-
-@registry.language_data("spacy.hy.lex_attr_getters")
-def lex_attr_getters() -> Dict[int, Callable[[str], Any]]:
-    return LEX_ATTRS
+class ArmenianDefaults(Language.Defaults):
+    lex_attr_getters = LEX_ATTRS
+    stop_words = STOP_WORDS
 
 
 class Armenian(Language):
     lang = "hy"
-    default_config = Config().from_str(DEFAULT_CONFIG)
+    Defaults = ArmenianDefaults
 
 
 __all__ = ["Armenian"]
@@ -1,50 +1,9 @@
-from typing import Set, Dict, Callable, Any
-from thinc.config import Config
-
 from .stop_words import STOP_WORDS
 from .punctuation import TOKENIZER_SUFFIXES, TOKENIZER_PREFIXES, TOKENIZER_INFIXES
 from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
 from .lex_attrs import LEX_ATTRS
-from .syntax_iterators import noun_chunks
+from .syntax_iterators import SYNTAX_ITERATORS
 from ...language import Language
-from ...util import registry
-
-
-DEFAULT_CONFIG = """
-[nlp]
-lang = "id"
-stop_words = {"@language_data": "spacy.id.stop_words"}
-lex_attr_getters = {"@language_data": "spacy.id.lex_attr_getters"}
-get_noun_chunks = {"@language_data": "spacy.id.get_noun_chunks"}
-
-[nlp.lemmatizer]
-@lemmatizers = "spacy.Lemmatizer.v1"
-
-[nlp.lemmatizer.data]
-@language_data = "spacy-lookups-data"
-lang = ${nlp:lang}
-tables = ["lemma_lookup"]
-
-[nlp.vocab_data]
-@language_data = "spacy-lookups-data"
-lang = ${nlp:lang}
-tables = ["lexeme_norm"]
-"""
-
-
-@registry.language_data("spacy.id.stop_words")
-def stop_words() -> Set[str]:
-    return STOP_WORDS
-
-
-@registry.language_data("spacy.id.lex_attr_getters")
-def lex_attr_getters() -> Dict[int, Callable[[str], Any]]:
-    return LEX_ATTRS
-
-
-@registry.language_data("spacy.id.get_noun_chunks")
-def get_noun_chunks() -> Callable:
-    return noun_chunks
 
 
 class IndonesianDefaults(Language.Defaults):
@@ -52,12 +11,14 @@ class IndonesianDefaults(Language.Defaults):
     prefixes = TOKENIZER_PREFIXES
     suffixes = TOKENIZER_SUFFIXES
     infixes = TOKENIZER_INFIXES
+    syntax_iterators = SYNTAX_ITERATORS
+    lex_attr_getters = LEX_ATTRS
+    stop_words = STOP_WORDS
 
 
 class Indonesian(Language):
     lang = "id"
     Defaults = IndonesianDefaults
-    default_config = Config().from_str(DEFAULT_CONFIG)
 
 
 __all__ = ["Indonesian"]
@@ -1,26 +1,14 @@
-from typing import Set
-from thinc.api import Config
-
 from .stop_words import STOP_WORDS
 from ...language import Language
-from ...util import registry
 
 
-DEFAULT_CONFIG = """
-[nlp]
-lang = "is"
-stop_words = {"@language_data": "spacy.is.stop_words"}
-"""
-
-
-@registry.language_data("spacy.is.stop_words")
-def stop_words() -> Set[str]:
-    return STOP_WORDS
+class IcelandicDefaults(Language.Defaults):
+    stop_words = STOP_WORDS
 
 
 class Icelandic(Language):
     lang = "is"
-    default_config = Config().from_str(DEFAULT_CONFIG)
+    Defaults = IcelandicDefaults
 
 
 __all__ = ["Icelandic"]
@@ -1,31 +1,7 @@
-from typing import Set
-from thinc.api import Config
-
 from .stop_words import STOP_WORDS
 from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
 from .punctuation import TOKENIZER_PREFIXES, TOKENIZER_INFIXES
 from ...language import Language
-from ...util import registry
-
-
-DEFAULT_CONFIG = """
-[nlp]
-lang = "it"
-stop_words = {"@language_data": "spacy.it.stop_words"}
-
-[nlp.lemmatizer]
-@lemmatizers = "spacy.Lemmatizer.v1"
-
-[nlp.lemmatizer.data]
-@language_data = "spacy-lookups-data"
-lang = ${nlp:lang}
-tables = ["lemma_lookup"]
-"""
-
-
-@registry.language_data("spacy.it.stop_words")
-def stop_words() -> Set[str]:
-    return STOP_WORDS
 
 
 class ItalianDefaults(Language.Defaults):
@@ -38,7 +14,6 @@ class ItalianDefaults(Language.Defaults):
 class Italian(Language):
     lang = "it"
     Defaults = ItalianDefaults
-    default_config = Config().from_str(DEFAULT_CONFIG)
 
 
 __all__ = ["Italian"]
@@ -1,11 +1,11 @@
-from typing import Optional, Union, Dict, Any, Set, Callable
+from typing import Optional, Union, Dict, Any
 from pathlib import Path
 import srsly
 from collections import namedtuple
 from thinc.api import Config
 
 from .stop_words import STOP_WORDS
-from .syntax_iterators import noun_chunks
+from .syntax_iterators import SYNTAX_ITERATORS
 from .tag_map import TAG_MAP
 from .tag_orth_map import TAG_ORTH_MAP
 from .tag_bigram_map import TAG_BIGRAM_MAP
@@ -20,33 +20,15 @@ from ... import util
 
 DEFAULT_CONFIG = """
 [nlp]
-lang = "ja"
-stop_words = {"@language_data": "spacy.ja.stop_words"}
-get_noun_chunks = {"@language_data": "spacy.ja.get_noun_chunks"}
 
 [nlp.tokenizer]
-@tokenizers = "spacy.JapaneseTokenizer.v1"
+@tokenizers = "spacy.ja.JapaneseTokenizer"
 split_mode = null
-
-[nlp.writing_system]
-direction = "ltr"
-has_case = false
-has_letters = false
 """
 
 
-@registry.language_data("spacy.ja.stop_words")
-def stop_words() -> Set[str]:
-    return STOP_WORDS
-
-
-@registry.language_data("spacy.ja.get_noun_chunks")
-def get_noun_chunks() -> Callable:
-    return noun_chunks
-
-
-@registry.tokenizers("spacy.JapaneseTokenizer.v1")
-def create_japanese_tokenizer(split_mode: Optional[str] = None):
+@registry.tokenizers("spacy.ja.JapaneseTokenizer")
+def create_tokenizer(split_mode: Optional[str] = None):
     def japanese_tokenizer_factory(nlp):
         return JapaneseTokenizer(nlp, split_mode=split_mode)
@@ -179,9 +161,16 @@ class JapaneseTokenizer(DummyTokenizer):
         return self
 
 
+class JapaneseDefaults(Language.Defaults):
+    config = Config().from_str(DEFAULT_CONFIG)
+    stop_words = STOP_WORDS
+    syntax_iterators = SYNTAX_ITERATORS
+    writing_system = {"direction": "ltr", "has_case": False, "has_letters": False}
+
+
 class Japanese(Language):
     lang = "ja"
-    default_config = Config().from_str(DEFAULT_CONFIG)
+    Defaults = JapaneseDefaults
 
 
 # Hold the attributes we need with convenient names
@@ -1,26 +1,14 @@
-from typing import Set
-from thinc.api import Config
-
 from .stop_words import STOP_WORDS
 from ...language import Language
-from ...util import registry


-DEFAULT_CONFIG = """
-[nlp]
-lang = "kn"
-stop_words = {"@language_data": "spacy.kn.stop_words"}
-"""
-
-
-@registry.language_data("spacy.kn.stop_words")
-def stop_words() -> Set[str]:
-    return STOP_WORDS
+class KannadaDefaults(Language.Defaults):
+    stop_words = STOP_WORDS


 class Kannada(Language):
     lang = "kn"
-    default_config = Config().from_str(DEFAULT_CONFIG)
+    Defaults = KannadaDefaults


 __all__ = ["Kannada"]

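Note: from here on, most of the simple language modules follow the same shape as this hunk: the per-language `DEFAULT_CONFIG` string and registry functions are dropped, and the static data moves onto a `Language.Defaults` subclass. A minimal sketch of a module in the new style (the language code and the stop-word set are invented for illustration; normally the stop words would be imported from a sibling stop_words.py):

from spacy.language import Language

STOP_WORDS = {"a", "an", "the"}  # illustrative placeholder


class ExampleDefaults(Language.Defaults):
    stop_words = STOP_WORDS


class Example(Language):
    lang = "zz"  # hypothetical ISO code
    Defaults = ExampleDefaults


__all__ = ["Example"]
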
@@ -1,4 +1,4 @@
-from typing import Set, Optional, Any, Dict
+from typing import Optional, Any, Dict
 from thinc.api import Config

 from .stop_words import STOP_WORDS
@@ -11,26 +11,14 @@ from ...util import DummyTokenizer, registry


 DEFAULT_CONFIG = """
 [nlp]
-lang = "ko"
-stop_words = {"@language_data": "spacy.ko.stop_words"}

 [nlp.tokenizer]
-@tokenizers = "spacy.KoreanTokenizer.v1"
-
-[nlp.writing_system]
-direction = "ltr"
-has_case = false
-has_letters = false
+@tokenizers = "spacy.ko.KoreanTokenizer"
 """


-@registry.language_data("spacy.ko.stop_words")
-def stop_words() -> Set[str]:
-    return STOP_WORDS
-
-
-@registry.tokenizers("spacy.KoreanTokenizer.v1")
-def create_korean_tokenizer():
+@registry.tokenizers("spacy.ko.KoreanTokenizer")
+def create_tokenizer():
     def korean_tokenizer_factory(nlp):
         return KoreanTokenizer(nlp)
@@ -74,9 +62,15 @@ class KoreanTokenizer(DummyTokenizer):
        yield {"surface": surface, "lemma": lemma, "tag": tag}


+class KoreanDefaults(Language.Defaults):
+    config = Config().from_str(DEFAULT_CONFIG)
+    stop_words = STOP_WORDS
+    writing_system = {"direction": "ltr", "has_case": False, "has_letters": False}
+
+
 class Korean(Language):
     lang = "ko"
-    default_config = Config().from_str(DEFAULT_CONFIG)
+    Defaults = KoreanDefaults


 def try_mecab_import() -> None:

@@ -1,54 +1,20 @@
-from typing import Set, Dict, Callable, Any
-from thinc.api import Config
-
 from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
 from .punctuation import TOKENIZER_INFIXES
 from .lex_attrs import LEX_ATTRS
 from .stop_words import STOP_WORDS
 from ...language import Language
-from ...util import registry
-
-
-DEFAULT_CONFIG = """
-[nlp]
-lang = "lb"
-stop_words = {"@language_data": "spacy.lb.stop_words"}
-lex_attr_getters = {"@language_data": "spacy.lb.lex_attr_getters"}
-
-[nlp.lemmatizer]
-@lemmatizers = "spacy.Lemmatizer.v1"
-
-[nlp.lemmatizer.data]
-@language_data = "spacy-lookups-data"
-lang = ${nlp:lang}
-tables = ["lemma_lookup"]
-
-[nlp.vocab_data]
-@language_data = "spacy-lookups-data"
-lang = ${nlp:lang}
-tables = ["lexeme_norm"]
-"""
-
-
-@registry.language_data("spacy.lb.stop_words")
-def stop_words() -> Set[str]:
-    return STOP_WORDS
-
-
-@registry.language_data("spacy.lb.lex_attr_getters")
-def lex_attr_getters() -> Dict[int, Callable[[str], Any]]:
-    return LEX_ATTRS


 class LuxembourgishDefaults(Language.Defaults):
     tokenizer_exceptions = TOKENIZER_EXCEPTIONS
     infixes = TOKENIZER_INFIXES
+    lex_attr_getters = LEX_ATTRS
+    stop_words = STOP_WORDS


 class Luxembourgish(Language):
     lang = "lb"
     Defaults = LuxembourgishDefaults
-    default_config = Config().from_str(DEFAULT_CONFIG)


 __all__ = ["Luxembourgish"]

@@ -1,34 +1,18 @@
-from typing import Set
-from thinc.api import Config
-
 from .stop_words import STOP_WORDS
 from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
 from .punctuation import TOKENIZER_INFIXES
 from ...language import Language
-from ...util import registry
-
-
-DEFAULT_CONFIG = """
-[nlp]
-lang = "lij"
-stop_words = {"@language_data": "spacy.lij.stop_words"}
-"""
-
-
-@registry.language_data("spacy.lij.stop_words")
-def stop_words() -> Set[str]:
-    return STOP_WORDS


 class LigurianDefaults(Language.Defaults):
     tokenizer_exceptions = TOKENIZER_EXCEPTIONS
     infixes = TOKENIZER_INFIXES
+    stop_words = STOP_WORDS


 class Ligurian(Language):
     lang = "lij"
     Defaults = LigurianDefaults
-    default_config = Config().from_str(DEFAULT_CONFIG)


 __all__ = ["Ligurian"]

@@ -1,50 +1,21 @@
-from typing import Set, Dict, Callable, Any
-from thinc.api import Config
-
 from .punctuation import TOKENIZER_INFIXES, TOKENIZER_SUFFIXES
 from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
 from .stop_words import STOP_WORDS
 from .lex_attrs import LEX_ATTRS
 from ...language import Language
-from ...util import registry
-
-
-DEFAULT_CONFIG = """
-[nlp]
-lang = "lt"
-stop_words = {"@language_data": "spacy.lt.stop_words"}
-lex_attr_getters = {"@language_data": "spacy.lt.lex_attr_getters"}
-
-[nlp.lemmatizer]
-@lemmatizers = "spacy.Lemmatizer.v1"
-
-[nlp.lemmatizer.data]
-@language_data = "spacy-lookups-data"
-lang = ${nlp:lang}
-tables = ["lemma_lookup"]
-"""
-
-
-@registry.language_data("spacy.lt.stop_words")
-def stop_words() -> Set[str]:
-    return STOP_WORDS
-
-
-@registry.language_data("spacy.lt.lex_attr_getters")
-def lex_attr_getters() -> Dict[int, Callable[[str], Any]]:
-    return LEX_ATTRS


 class LithuanianDefaults(Language.Defaults):
     infixes = TOKENIZER_INFIXES
     suffixes = TOKENIZER_SUFFIXES
     tokenizer_exceptions = TOKENIZER_EXCEPTIONS
+    stop_words = STOP_WORDS
+    lex_attr_getters = LEX_ATTRS


 class Lithuanian(Language):
     lang = "lt"
     Defaults = LithuanianDefaults
-    default_config = Config().from_str(DEFAULT_CONFIG)


 __all__ = ["Lithuanian"]

@@ -1,26 +1,14 @@
-from typing import Set
-from thinc.api import Config
-
 from .stop_words import STOP_WORDS
 from ...language import Language
-from ...util import registry


-DEFAULT_CONFIG = """
-[nlp]
-lang = "lv"
-stop_words = {"@language_data": "spacy.lv.stop_words"}
-"""
-
-
-@registry.language_data("spacy.lv.stop_words")
-def stop_words() -> Set[str]:
-    return STOP_WORDS
+class LatvianDefaults(Language.Defaults):
+    stop_words = STOP_WORDS


 class Latvian(Language):
     lang = "lv"
-    default_config = Config().from_str(DEFAULT_CONFIG)
+    Defaults = LatvianDefaults


 __all__ = ["Latvian"]

@@ -1,26 +1,14 @@
-from typing import Set
-from thinc.api import Config
-
 from .stop_words import STOP_WORDS
 from ...language import Language
-from ...util import registry


-DEFAULT_CONFIG = """
-[nlp]
-lang = "ml"
-stop_words = {"@language_data": "spacy.ml.stop_words"}
-"""
-
-
-@registry.language_data("spacy.ml.stop_words")
-def stop_words() -> Set[str]:
-    return STOP_WORDS
+class MalayalamDefaults(Language.Defaults):
+    stop_words = STOP_WORDS


 class Malayalam(Language):
     lang = "ml"
-    default_config = Config().from_str(DEFAULT_CONFIG)
+    Defaults = MalayalamDefaults


 __all__ = ["Malayalam"]

@@ -1,26 +1,14 @@
-from typing import Set
-from thinc.api import Config
-
 from .stop_words import STOP_WORDS
 from ...language import Language
-from ...util import registry


-DEFAULT_CONFIG = """
-[nlp]
-lang = "af"
-stop_words = {"@language_data": "spacy.mr.stop_words"}
-"""
-
-
-@registry.language_data("spacy.mr.stop_words")
-def stop_words() -> Set[str]:
-    return STOP_WORDS
+class MarathiDefaults(Language.Defaults):
+    stop_words = STOP_WORDS


 class Marathi(Language):
     lang = "mr"
-    default_config = Config().from_str(DEFAULT_CONFIG)
+    Defaults = MarathiDefaults


 __all__ = ["Marathi"]

@@ -1,39 +1,9 @@
-from typing import Set, Callable
-from thinc.api import Config
-
 from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
 from .punctuation import TOKENIZER_PREFIXES, TOKENIZER_INFIXES
 from .punctuation import TOKENIZER_SUFFIXES
 from .stop_words import STOP_WORDS
-from .syntax_iterators import noun_chunks
+from .syntax_iterators import SYNTAX_ITERATORS
 from ...language import Language
-from ...util import registry
-
-
-DEFAULT_CONFIG = """
-[nlp]
-lang = "nb"
-stop_words = {"@language_data": "spacy.nb.stop_words"}
-get_noun_chunks = {"@language_data": "spacy.nb.get_noun_chunks"}
-
-[nlp.lemmatizer]
-@lemmatizers = "spacy.Lemmatizer.v1"
-
-[nlp.lemmatizer.data]
-@language_data = "spacy-lookups-data"
-lang = ${nlp:lang}
-tables = ["lemma_lookup", "lemma_rules", "lemma_exc"]
-"""
-
-
-@registry.language_data("spacy.nb.stop_words")
-def stop_words() -> Set[str]:
-    return STOP_WORDS
-
-
-@registry.language_data("spacy.nb.get_noun_chunks")
-def get_noun_chunks() -> Callable:
-    return noun_chunks


 class NorwegianDefaults(Language.Defaults):
@@ -41,12 +11,13 @@ class NorwegianDefaults(Language.Defaults):
     prefixes = TOKENIZER_PREFIXES
     infixes = TOKENIZER_INFIXES
     suffixes = TOKENIZER_SUFFIXES
+    syntax_iterators = SYNTAX_ITERATORS
+    stop_words = STOP_WORDS


 class Norwegian(Language):
     lang = "nb"
     Defaults = NorwegianDefaults
-    default_config = Config().from_str(DEFAULT_CONFIG)


 __all__ = ["Norwegian"]

@@ -1,33 +1,16 @@
-from typing import Set, Dict, Callable, Any
-from thinc.api import Config
-
 from .stop_words import STOP_WORDS
 from .lex_attrs import LEX_ATTRS
 from ...language import Language
-from ...util import registry


-DEFAULT_CONFIG = """
-[nlp]
-lang = "ne"
-stop_words = {"@language_data": "spacy.ne.stop_words"}
-lex_attr_getters = {"@language_data": "spacy.ne.lex_attr_getters"}
-"""
-
-
-@registry.language_data("spacy.ne.stop_words")
-def stop_words() -> Set[str]:
-    return STOP_WORDS
-
-
-@registry.language_data("spacy.ne.lex_attr_getters")
-def lex_attr_getters() -> Dict[int, Callable[[str], Any]]:
-    return LEX_ATTRS
+class NepaliDefaults(Language.Defaults):
+    stop_words = STOP_WORDS
+    lex_attr_getters = LEX_ATTRS


 class Nepali(Language):
     lang = "ne"
-    default_config = Config().from_str(DEFAULT_CONFIG)
+    Defaults = NepaliDefaults


 __all__ = ["Nepali"]

@@ -1,4 +1,4 @@
-from typing import Set, Dict, Callable, Any
+from typing import Callable
 from thinc.api import Config

 from .stop_words import STOP_WORDS
@@ -7,52 +7,43 @@ from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
 from .punctuation import TOKENIZER_PREFIXES, TOKENIZER_INFIXES
 from .punctuation import TOKENIZER_SUFFIXES
 from .lemmatizer import DutchLemmatizer
+from ...lookups import load_lookups
 from ...language import Language
 from ...util import registry


 DEFAULT_CONFIG = """
 [nlp]
-lang = "nl"
-stop_words = {"@language_data": "spacy.nl.stop_words"}
-lex_attr_getters = {"@language_data": "spacy.nl.lex_attr_getters"}

 [nlp.lemmatizer]
-@lemmatizers = "spacy.DutchLemmatizer.v1"
-
-[nlp.lemmatizer.data]
-@language_data = "spacy-lookups-data"
-lang = ${nlp:lang}
-tables = ["lemma_rules", "lemma_index", "lemma_exc", "lemma_lookup"]
+@lemmatizers = "spacy.nl.DutchLemmatizer"
 """


-@registry.language_data("spacy.nl.stop_words")
-def stop_words() -> Set[str]:
-    return STOP_WORDS
-
-
-@registry.language_data("spacy.nl.lex_attr_getters")
-def lex_attr_getters() -> Dict[int, Callable[[str], Any]]:
-    return LEX_ATTRS
-
-
-@registry.lemmatizers("spacy.DutchLemmatizer.v1")
-def create_dutch_lemmatizer(data: Dict[str, dict] = {}) -> DutchLemmatizer:
-    return DutchLemmatizer(data=data)
+@registry.lemmatizers("spacy.nl.DutchLemmatizer")
+def create_lemmatizer() -> Callable[[Language], DutchLemmatizer]:
+    tables = ["lemma_rules", "lemma_index", "lemma_exc", "lemma_lookup"]
+
+    def lemmatizer_factory(nlp: Language) -> DutchLemmatizer:
+        lookups = load_lookups(lang=nlp.lang, tables=tables)
+        return DutchLemmatizer(lookups=lookups)
+
+    return lemmatizer_factory


 class DutchDefaults(Language.Defaults):
+    config = Config().from_str(DEFAULT_CONFIG)
     tokenizer_exceptions = TOKENIZER_EXCEPTIONS
     prefixes = TOKENIZER_PREFIXES
     infixes = TOKENIZER_INFIXES
     suffixes = TOKENIZER_SUFFIXES
+    lex_attr_getters = LEX_ATTRS
+    stop_words = STOP_WORDS


 class Dutch(Language):
     lang = "nl"
     Defaults = DutchDefaults
-    default_config = Config().from_str(DEFAULT_CONFIG)


 __all__ = ["Dutch"]

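Note: for languages with lookup-based lemmatizers, the table list moves out of the `[nlp.lemmatizer.data]` config block and into the registered factory, which now loads the lookups itself when the pipeline is created. A condensed sketch of that pattern with an invented lemmatizer class follows; only `load_lookups`, the registry decorator, and the nested-factory shape are taken from the hunk above, while the registry name, the table name, and the class are placeholders (and the sketch assumes the requested tables exist in spacy-lookups-data).

from typing import Callable

from spacy.language import Language
from spacy.lookups import Lookups, load_lookups
from spacy.util import registry


class ExampleLemmatizer:
    # Placeholder for a language-specific lemmatizer such as DutchLemmatizer.
    def __init__(self, lookups: Lookups):
        self.lookups = lookups

    def __call__(self, string: str) -> str:
        table = self.lookups.get_table("lemma_lookup")
        return table.get(string, string)


@registry.lemmatizers("spacy.zz.ExampleLemmatizer")  # hypothetical registry name
def create_lemmatizer() -> Callable[[Language], ExampleLemmatizer]:
    tables = ["lemma_lookup"]  # assumed to be available for nlp.lang

    def lemmatizer_factory(nlp: Language) -> ExampleLemmatizer:
        # The lookups are only loaded once the nlp object (and its lang) exists.
        lookups = load_lookups(lang=nlp.lang, tables=tables)
        return ExampleLemmatizer(lookups=lookups)

    return lemmatizer_factory
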
@@ -1,4 +1,4 @@
-from typing import Set, Dict, Callable, Any
+from typing import Callable
 from thinc.api import Config

 from .punctuation import TOKENIZER_PREFIXES, TOKENIZER_INFIXES
@@ -7,55 +7,53 @@ from .stop_words import STOP_WORDS
 from .lex_attrs import LEX_ATTRS
 from .lemmatizer import PolishLemmatizer
 from ..tokenizer_exceptions import BASE_EXCEPTIONS
+from ...lookups import load_lookups
 from ...language import Language
 from ...util import registry


 DEFAULT_CONFIG = """
 [nlp]
-lang = "pl"
-stop_words = {"@language_data": "spacy.pl.stop_words"}
-lex_attr_getters = {"@language_data": "spacy.pl.lex_attr_getters"}

 [nlp.lemmatizer]
-@lemmatizers = "spacy.PolishLemmatizer.v1"
-
-[nlp.lemmatizer.data]
-@language_data = "spacy-lookups-data"
-lang = ${nlp:lang}
-tables = ["lemma_lookup_adj", "lemma_lookup_adp", "lemma_lookup_adv", "lemma_lookup_aux", "lemma_lookup_noun", "lemma_lookup_num", "lemma_lookup_part", "lemma_lookup_pron", "lemma_lookup_verb"]
+@lemmatizers = "spacy.pl.PolishLemmatizer"
 """


-@registry.language_data("spacy.pl.stop_words")
-def stop_words() -> Set[str]:
-    return STOP_WORDS
-
-
-@registry.language_data("spacy.pl.lex_attr_getters")
-def lex_attr_getters() -> Dict[int, Callable[[str], Any]]:
-    return LEX_ATTRS
-
-
-@registry.lemmatizers("spacy.PolishLemmatizer.v1")
-def create_polish_lemmatizer(data: Dict[str, dict] = {}) -> PolishLemmatizer:
-    return PolishLemmatizer(data=data)
+TOKENIZER_EXCEPTIONS = {
+    exc: val for exc, val in BASE_EXCEPTIONS.items() if not exc.endswith(".")
+}
+
+
+@registry.lemmatizers("spacy.pl.PolishLemmatizer")
+def create_lemmatizer() -> Callable[[Language], PolishLemmatizer]:
+    # fmt: off
+    tables = [
+        "lemma_lookup_adj", "lemma_lookup_adp", "lemma_lookup_adv",
+        "lemma_lookup_aux", "lemma_lookup_noun", "lemma_lookup_num",
+        "lemma_lookup_part", "lemma_lookup_pron", "lemma_lookup_verb"
+    ]
+    # fmt: on
+
+    def lemmatizer_factory(nlp: Language) -> PolishLemmatizer:
+        lookups = load_lookups(lang=nlp.lang, tables=tables)
+        return PolishLemmatizer(lookups=lookups)
+
+    return lemmatizer_factory


 class PolishDefaults(Language.Defaults):
-    mod_base_exceptions = {
-        exc: val for exc, val in BASE_EXCEPTIONS.items() if not exc.endswith(".")
-    }
-    tokenizer_exceptions = mod_base_exceptions
+    config = Config().from_str(DEFAULT_CONFIG)
+    tokenizer_exceptions = TOKENIZER_EXCEPTIONS
     prefixes = TOKENIZER_PREFIXES
     infixes = TOKENIZER_INFIXES
     suffixes = TOKENIZER_SUFFIXES
+    lex_attr_getters = LEX_ATTRS
+    stop_words = STOP_WORDS


 class Polish(Language):
     lang = "pl"
     Defaults = PolishDefaults
-    default_config = Config().from_str(DEFAULT_CONFIG)


 __all__ = ["Polish"]

@@ -1,50 +1,21 @@
-from typing import Set, Dict, Callable, Any
-from thinc.api import Config
-
 from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
 from .stop_words import STOP_WORDS
 from .lex_attrs import LEX_ATTRS
 from .punctuation import TOKENIZER_INFIXES, TOKENIZER_PREFIXES
 from ...language import Language
-from ...util import registry
-
-
-DEFAULT_CONFIG = """
-[nlp]
-lang = "pt"
-stop_words = {"@language_data": "spacy.pt.stop_words"}
-lex_attr_getters = {"@language_data": "spacy.pt.lex_attr_getters"}
-
-[nlp.lemmatizer]
-@lemmatizers = "spacy.Lemmatizer.v1"
-
-[nlp.lemmatizer.data]
-@language_data = "spacy-lookups-data"
-lang = ${nlp:lang}
-tables = ["lemma_lookup"]
-"""
-
-
-@registry.language_data("spacy.pt.stop_words")
-def stop_words() -> Set[str]:
-    return STOP_WORDS
-
-
-@registry.language_data("spacy.pt.lex_attr_getters")
-def lex_attr_getters() -> Dict[int, Callable[[str], Any]]:
-    return LEX_ATTRS


 class PortugueseDefaults(Language.Defaults):
     tokenizer_exceptions = TOKENIZER_EXCEPTIONS
     infixes = TOKENIZER_INFIXES
     prefixes = TOKENIZER_PREFIXES
+    lex_attr_getters = LEX_ATTRS
+    stop_words = STOP_WORDS


 class Portuguese(Language):
     lang = "pt"
     Defaults = PortugueseDefaults
-    default_config = Config().from_str(DEFAULT_CONFIG)


 __all__ = ["Portuguese"]

@@ -3,7 +3,7 @@ from .char_classes import LIST_ICONS, HYPHENS, CURRENCY, UNITS
 from .char_classes import CONCAT_QUOTES, ALPHA_LOWER, ALPHA_UPPER, ALPHA, PUNCT


-_prefixes = (
+TOKENIZER_PREFIXES = (
     ["§", "%", "=", "—", "–", r"\+(?![0-9])"]
     + LIST_PUNCT
     + LIST_ELLIPSES
@@ -13,7 +13,7 @@ _prefixes = (
 )


-_suffixes = (
+TOKENIZER_SUFFIXES = (
     LIST_PUNCT
     + LIST_ELLIPSES
     + LIST_QUOTES
@@ -31,7 +31,7 @@ _suffixes = (
     ]
 )

-_infixes = (
+TOKENIZER_INFIXES = (
     LIST_ELLIPSES
     + LIST_ICONS
     + [
@@ -44,7 +44,3 @@ _infixes = (
         r"(?<=[{a}0-9])[:<>=/](?=[{a}])".format(a=ALPHA),
     ]
 )
-
-TOKENIZER_PREFIXES = _prefixes
-TOKENIZER_SUFFIXES = _suffixes
-TOKENIZER_INFIXES = _infixes

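Note: this hunk only renames the module-level lists (`_prefixes` becomes `TOKENIZER_PREFIXES`, and so on), so the aliasing assignments at the bottom of the file are no longer needed. As a rough illustration of what consumes lists in this shape, here is a sketch using spaCy's regex helpers; the sample patterns below are made up, only the helper functions and the list-of-patterns shape are spaCy's.

from spacy.util import compile_infix_regex, compile_prefix_regex, compile_suffix_regex

# Illustrative pattern lists in the same shape as TOKENIZER_PREFIXES/SUFFIXES/INFIXES.
TOKENIZER_PREFIXES = ["§", "%", "=", r"\+(?![0-9])"]
TOKENIZER_SUFFIXES = ["%", r"(?<=[0-9])\+"]
TOKENIZER_INFIXES = [r"(?<=[0-9])[+\-\*^](?=[0-9-])"]

# The tokenizer compiles each list into a single regex and uses the bound
# methods to find split points.
prefix_search = compile_prefix_regex(TOKENIZER_PREFIXES).search
suffix_search = compile_suffix_regex(TOKENIZER_SUFFIXES).search
infix_finditer = compile_infix_regex(TOKENIZER_INFIXES).finditer
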
@@ -1,49 +1,25 @@
-from typing import Set
-from thinc.api import Config
-
 from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
 from .stop_words import STOP_WORDS
 from .punctuation import TOKENIZER_PREFIXES, TOKENIZER_INFIXES
 from .punctuation import TOKENIZER_SUFFIXES
 from ...language import Language
-from ...util import registry

 # Lemma data note:
 # Original pairs downloaded from http://www.lexiconista.com/datasets/lemmatization/
 # Replaced characters using cedillas with the correct ones (ș and ț)


-DEFAULT_CONFIG = """
-[nlp]
-lang = "ro"
-stop_words = {"@language_data": "spacy.ro.stop_words"}
-
-[nlp.lemmatizer]
-@lemmatizers = "spacy.Lemmatizer.v1"
-
-[nlp.lemmatizer.data]
-@language_data = "spacy-lookups-data"
-lang = ${nlp:lang}
-tables = ["lemma_lookup"]
-"""
-
-
-@registry.language_data("spacy.ro.stop_words")
-def stop_words() -> Set[str]:
-    return STOP_WORDS


 class RomanianDefaults(Language.Defaults):
     tokenizer_exceptions = TOKENIZER_EXCEPTIONS
     prefixes = TOKENIZER_PREFIXES
     suffixes = TOKENIZER_SUFFIXES
     infixes = TOKENIZER_INFIXES
+    stop_words = STOP_WORDS


 class Romanian(Language):
     lang = "ro"
     Defaults = RomanianDefaults
-    default_config = Config().from_str(DEFAULT_CONFIG)


 __all__ = ["Romanian"]

@@ -1,4 +1,4 @@
-from typing import Set, Dict, Callable, Any
+from typing import Callable
 from thinc.api import Config

 from .stop_words import STOP_WORDS
@@ -11,43 +11,30 @@ from ...language import Language


 DEFAULT_CONFIG = """
 [nlp]
-lang = "ru"
-stop_words = {"@language_data": "spacy.ru.stop_words"}
-lex_attr_getters = {"@language_data": "spacy.ru.lex_attr_getters"}

 [nlp.lemmatizer]
-@lemmatizers = "spacy.RussianLemmatizer.v1"
-
-[nlp.vocab_data]
-@language_data = "spacy-lookups-data"
-lang = ${nlp:lang}
-tables = ["lexeme_norm"]
+@lemmatizers = "spacy.ru.RussianLemmatizer"
 """


-@registry.language_data("spacy.ru.stop_words")
-def stop_words() -> Set[str]:
-    return STOP_WORDS
-
-
-@registry.language_data("spacy.ru.lex_attr_getters")
-def lex_attr_getters() -> Dict[int, Callable[[str], Any]]:
-    return LEX_ATTRS
-
-
-@registry.lemmatizers("spacy.RussianLemmatizer.v1")
-def create_russian_lemmatizer() -> RussianLemmatizer:
-    return RussianLemmatizer()
+@registry.lemmatizers("spacy.ru.RussianLemmatizer")
+def create_lemmatizer() -> Callable[[Language], RussianLemmatizer]:
+    def lemmatizer_factory(nlp: Language) -> RussianLemmatizer:
+        return RussianLemmatizer()
+
+    return lemmatizer_factory


 class RussianDefaults(Language.Defaults):
+    config = Config().from_str(DEFAULT_CONFIG)
     tokenizer_exceptions = TOKENIZER_EXCEPTIONS
+    lex_attr_getters = LEX_ATTRS
+    stop_words = STOP_WORDS


 class Russian(Language):
     lang = "ru"
     Defaults = RussianDefaults
-    default_config = Config().from_str(DEFAULT_CONFIG)


 __all__ = ["Russian"]

@@ -1,33 +1,16 @@
-from typing import Set, Dict, Callable, Any
-from thinc.api import Config
-
 from .stop_words import STOP_WORDS
 from .lex_attrs import LEX_ATTRS
 from ...language import Language
-from ...util import registry


-DEFAULT_CONFIG = """
-[nlp]
-lang = "si"
-stop_words = {"@language_data": "spacy.si.stop_words"}
-lex_attr_getters = {"@language_data": "spacy.si.lex_attr_getters"}
-"""
-
-
-@registry.language_data("spacy.si.stop_words")
-def stop_words() -> Set[str]:
-    return STOP_WORDS
-
-
-@registry.language_data("spacy.si.lex_attr_getters")
-def lex_attr_getters() -> Dict[int, Callable[[str], Any]]:
-    return LEX_ATTRS
+class SinhalaDefaults(Language.Defaults):
+    lex_attr_getters = LEX_ATTRS
+    stop_words = STOP_WORDS


 class Sinhala(Language):
     lang = "si"
-    default_config = Config().from_str(DEFAULT_CONFIG)
+    Defaults = SinhalaDefaults


 __all__ = ["Sinhala"]

@@ -1,33 +1,16 @@
-from typing import Set, Dict, Callable, Any
-from thinc.api import Config
-
 from .stop_words import STOP_WORDS
 from .lex_attrs import LEX_ATTRS
 from ...language import Language
-from ...util import registry


-DEFAULT_CONFIG = """
-[nlp]
-lang = "sk"
-stop_words = {"@language_data": "spacy.sk.stop_words"}
-lex_attr_getters = {"@language_data": "spacy.sk.lex_attr_getters"}
-"""
-
-
-@registry.language_data("spacy.sk.stop_words")
-def stop_words() -> Set[str]:
-    return STOP_WORDS
-
-
-@registry.language_data("spacy.sk.lex_attr_getters")
-def lex_attr_getters() -> Dict[int, Callable[[str], Any]]:
-    return LEX_ATTRS
+class SlovakDefaults(Language.Defaults):
+    lex_attr_getters = LEX_ATTRS
+    stop_words = STOP_WORDS


 class Slovak(Language):
     lang = "sk"
-    default_config = Config().from_str(DEFAULT_CONFIG)
+    Defaults = SlovakDefaults


 __all__ = ["Slovak"]

@@ -1,26 +1,14 @@
-from typing import Set
-from thinc.api import Config
-
 from .stop_words import STOP_WORDS
 from ...language import Language
-from ...util import registry


-DEFAULT_CONFIG = """
-[nlp]
-lang = "sl"
-stop_words = {"@language_data": "spacy.sl.stop_words"}
-"""
-
-
-@registry.language_data("spacy.sl.stop_words")
-def stop_words() -> Set[str]:
-    return STOP_WORDS
+class SlovenianDefaults(Language.Defaults):
+    stop_words = STOP_WORDS


 class Slovenian(Language):
     lang = "sl"
-    default_config = Config().from_str(DEFAULT_CONFIG)
+    Defaults = SlovenianDefaults


 __all__ = ["Slovenian"]

@@ -1,26 +1,14 @@
-from typing import Set
-from thinc.api import Config
-
 from .stop_words import STOP_WORDS
 from ...language import Language
-from ...util import registry


-DEFAULT_CONFIG = """
-[nlp]
-lang = "sq"
-stop_words = {"@language_data": "spacy.sq.stop_words"}
-"""
-
-
-@registry.language_data("spacy.sq.stop_words")
-def stop_words() -> Set[str]:
-    return STOP_WORDS
+class AlbanianDefaults(Language.Defaults):
+    stop_words = STOP_WORDS


 class Albanian(Language):
     lang = "sq"
-    default_config = Config().from_str(DEFAULT_CONFIG)
+    Defaults = AlbanianDefaults


 __all__ = ["Albanian"]

@@ -1,52 +1,18 @@
-from typing import Set, Dict, Callable, Any
-from thinc.api import Config
-
 from .stop_words import STOP_WORDS
 from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
 from .lex_attrs import LEX_ATTRS
 from ...language import Language
-from ...util import registry
-
-
-DEFAULT_CONFIG = """
-[nlp]
-lang = "sr"
-stop_words = {"@language_data": "spacy.sr.stop_words"}
-lex_attr_getters = {"@language_data": "spacy.sr.lex_attr_getters"}
-
-[nlp.lemmatizer]
-@lemmatizers = "spacy.Lemmatizer.v1"
-
-[nlp.lemmatizer.data]
-@language_data = "spacy-lookups-data"
-lang = ${nlp:lang}
-tables = ["lemma_lookup"]
-
-[nlp.vocab_data]
-@language_data = "spacy-lookups-data"
-lang = ${nlp:lang}
-tables = ["lexeme_norm"]
-"""
-
-
-@registry.language_data("spacy.sr.stop_words")
-def stop_words() -> Set[str]:
-    return STOP_WORDS
-
-
-@registry.language_data("spacy.sr.lex_attr_getters")
-def lex_attr_getters() -> Dict[int, Callable[[str], Any]]:
-    return LEX_ATTRS


 class SerbianDefaults(Language.Defaults):
     tokenizer_exceptions = TOKENIZER_EXCEPTIONS
+    lex_attr_getters = LEX_ATTRS
+    stop_words = STOP_WORDS


 class Serbian(Language):
     lang = "sr"
     Defaults = SerbianDefaults
-    default_config = Config().from_str(DEFAULT_CONFIG)


 __all__ = ["Serbian"]

@@ -1,59 +1,25 @@
-from typing import Set, Dict, Callable, Any
-from thinc.api import Config
-
 from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
 from .stop_words import STOP_WORDS
 from .lex_attrs import LEX_ATTRS
+from .syntax_iterators import SYNTAX_ITERATORS
 from ...language import Language
-from ...util import registry
-from .syntax_iterators import noun_chunks

 # Punctuation stolen from Danish
 from ..da.punctuation import TOKENIZER_INFIXES, TOKENIZER_SUFFIXES


-DEFAULT_CONFIG = """
-[nlp]
-lang = "sv"
-stop_words = {"@language_data": "spacy.sv.stop_words"}
-lex_attr_getters = {"@language_data": "spacy.sv.lex_attr_getters"}
-get_noun_chunks = {"@language_data": "spacy.sv.get_noun_chunks"}
-
-[nlp.lemmatizer]
-@lemmatizers = "spacy.Lemmatizer.v1"
-
-[nlp.lemmatizer.data]
-@language_data = "spacy-lookups-data"
-lang = ${nlp:lang}
-tables = ["lemma_lookup", "lemma_rules"]
-"""
-
-
-@registry.language_data("spacy.sv.stop_words")
-def stop_words() -> Set[str]:
-    return STOP_WORDS
-
-
-@registry.language_data("spacy.sv.lex_attr_getters")
-def lex_attr_getters() -> Dict[int, Callable[[str], Any]]:
-    return LEX_ATTRS
-
-
-@registry.language_data("spacy.sv.get_noun_chunks")
-def get_noun_chunks() -> Callable:
-    return noun_chunks


 class SwedishDefaults(Language.Defaults):
     tokenizer_exceptions = TOKENIZER_EXCEPTIONS
     infixes = TOKENIZER_INFIXES
     suffixes = TOKENIZER_SUFFIXES
+    lex_attr_getters = LEX_ATTRS
+    syntax_iterators = SYNTAX_ITERATORS
+    stop_words = STOP_WORDS


 class Swedish(Language):
     lang = "sv"
     Defaults = SwedishDefaults
-    default_config = Config().from_str(DEFAULT_CONFIG)


 __all__ = ["Swedish"]

@@ -1,38 +1,16 @@
-from typing import Set, Dict, Callable, Any
-from thinc.api import Config
-
 from .stop_words import STOP_WORDS
 from .lex_attrs import LEX_ATTRS
 from ...language import Language
-from ...util import registry


-DEFAULT_CONFIG = """
-[nlp]
-lang = "ta"
-stop_words = {"@language_data": "spacy.ta.stop_words"}
-lex_attr_getters = {"@language_data": "spacy.ta.lex_attr_getters"}
-
-[nlp.vocab_data]
-@language_data = "spacy-lookups-data"
-lang = ${nlp:lang}
-tables = ["lexeme_norm"]
-"""
-
-
-@registry.language_data("spacy.ta.stop_words")
-def stop_words() -> Set[str]:
-    return STOP_WORDS
-
-
-@registry.language_data("spacy.ta.lex_attr_getters")
-def lex_attr_getters() -> Dict[int, Callable[[str], Any]]:
-    return LEX_ATTRS
+class TamilDefaults(Language.Defaults):
+    lex_attr_getters = LEX_ATTRS
+    stop_words = STOP_WORDS


 class Tamil(Language):
     lang = "ta"
-    default_config = Config().from_str(DEFAULT_CONFIG)
+    Defaults = TamilDefaults


 __all__ = ["Tamil"]

@@ -1,33 +1,16 @@
-from typing import Set, Dict, Callable, Any
-from thinc.api import Config
-
 from .stop_words import STOP_WORDS
 from .lex_attrs import LEX_ATTRS
 from ...language import Language
-from ...util import registry


-DEFAULT_CONFIG = """
-[nlp]
-lang = "te"
-stop_words = {"@language_data": "spacy.te.stop_words"}
-lex_attr_getters = {"@language_data": "spacy.te.lex_attr_getters"}
-"""
-
-
-@registry.language_data("spacy.te.stop_words")
-def stop_words() -> Set[str]:
-    return STOP_WORDS
-
-
-@registry.language_data("spacy.te.lex_attr_getters")
-def lex_attr_getters() -> Dict[int, Callable[[str], Any]]:
-    return LEX_ATTRS
+class TeluguDefaults(Language.Defaults):
+    lex_attr_getters = LEX_ATTRS
+    stop_words = STOP_WORDS


 class Telugu(Language):
     lang = "te"
-    default_config = Config().from_str(DEFAULT_CONFIG)
+    Defaults = TeluguDefaults


 __all__ = ["Telugu"]

@@ -1,4 +1,3 @@
-from typing import Set, Dict, Callable, Any
 from thinc.api import Config

 from .stop_words import STOP_WORDS
@@ -10,31 +9,13 @@ from ...util import DummyTokenizer, registry


 DEFAULT_CONFIG = """
 [nlp]
-lang = "th"
-stop_words = {"@language_data": "spacy.th.stop_words"}
-lex_attr_getters = {"@language_data": "spacy.th.lex_attr_getters"}

 [nlp.tokenizer]
-@tokenizers = "spacy.ThaiTokenizer.v1"
-
-[nlp.vocab_data]
-@language_data = "spacy-lookups-data"
-lang = ${nlp:lang}
-tables = ["lexeme_norm"]
+@tokenizers = "spacy.th.ThaiTokenizer"
 """


-@registry.language_data("spacy.th.stop_words")
-def stop_words() -> Set[str]:
-    return STOP_WORDS
-
-
-@registry.language_data("spacy.th.lex_attr_getters")
-def lex_attr_getters() -> Dict[int, Callable[[str], Any]]:
-    return LEX_ATTRS
-
-
-@registry.tokenizers("spacy.ThaiTokenizer.v1")
+@registry.tokenizers("spacy.th.ThaiTokenizer")
 def create_thai_tokenizer():
     def thai_tokenizer_factory(nlp):
         return ThaiTokenizer(nlp)
@@ -60,9 +41,15 @@ class ThaiTokenizer(DummyTokenizer):
         return Doc(self.vocab, words=words, spaces=spaces)


+class ThaiDefaults(Language.Defaults):
+    config = Config().from_str(DEFAULT_CONFIG)
+    lex_attr_getters = LEX_ATTRS
+    stop_words = STOP_WORDS
+
+
 class Thai(Language):
     lang = "th"
-    default_config = Config().from_str(DEFAULT_CONFIG)
+    Defaults = ThaiDefaults


 __all__ = ["Thai"]

@@ -1,47 +1,18 @@
-from typing import Set, Dict, Callable, Any
-from thinc.api import Config
-
 from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
 from .stop_words import STOP_WORDS
 from .lex_attrs import LEX_ATTRS
 from ...language import Language
-from ...util import registry
-
-
-DEFAULT_CONFIG = """
-[nlp]
-lang = "tl"
-stop_words = {"@language_data": "spacy.tl.stop_words"}
-lex_attr_getters = {"@language_data": "spacy.tl.lex_attr_getters"}
-
-[nlp.lemmatizer]
-@lemmatizers = "spacy.Lemmatizer.v1"
-
-[nlp.lemmatizer.data]
-@language_data = "spacy-lookups-data"
-lang = ${nlp:lang}
-tables = ["lemma_lookup"]
-"""
-
-
-@registry.language_data("spacy.tl.stop_words")
-def stop_words() -> Set[str]:
-    return STOP_WORDS
-
-
-@registry.language_data("spacy.tl.lex_attr_getters")
-def lex_attr_getters() -> Dict[int, Callable[[str], Any]]:
-    return LEX_ATTRS


 class TagalogDefaults(Language.Defaults):
     tokenizer_exceptions = TOKENIZER_EXCEPTIONS
+    lex_attr_getters = LEX_ATTRS
+    stop_words = STOP_WORDS


 class Tagalog(Language):
     lang = "tl"
     Defaults = TagalogDefaults
-    default_config = Config().from_str(DEFAULT_CONFIG)


 __all__ = ["Tagalog"]

@@ -1,40 +1,16 @@
-from typing import Set
-from thinc.api import Config
-
 from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
 from .stop_words import STOP_WORDS
 from ...language import Language
-from ...util import registry
-
-
-DEFAULT_CONFIG = """
-[nlp]
-lang = "tr"
-stop_words = {"@language_data": "spacy.tr.stop_words"}
-
-[nlp.lemmatizer]
-@lemmatizers = "spacy.Lemmatizer.v1"
-
-[nlp.lemmatizer.data]
-@language_data = "spacy-lookups-data"
-lang = ${nlp:lang}
-tables = ["lemma_lookup"]
-"""
-
-
-@registry.language_data("spacy.tr.stop_words")
-def stop_words() -> Set[str]:
-    return STOP_WORDS


 class TurkishDefaults(Language.Defaults):
     tokenizer_exceptions = TOKENIZER_EXCEPTIONS
+    stop_words = STOP_WORDS


 class Turkish(Language):
     lang = "tr"
     Defaults = TurkishDefaults
-    default_config = Config().from_str(DEFAULT_CONFIG)


 __all__ = ["Turkish"]

@@ -1,41 +1,20 @@
-from typing import Set, Dict, Callable, Any
-from thinc.api import Config
-
 from .lex_attrs import LEX_ATTRS
 from .punctuation import TOKENIZER_INFIXES
 from .stop_words import STOP_WORDS
 from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
 from ...language import Language
-from ...util import registry
-
-
-DEFAULT_CONFIG = """
-[nlp]
-lang = "tt"
-stop_words = {"@language_data": "spacy.tt.stop_words"}
-lex_attr_getters = {"@language_data": "spacy.tt.lex_attr_getters"}
-"""
-
-
-@registry.language_data("spacy.tt.stop_words")
-def stop_words() -> Set[str]:
-    return STOP_WORDS
-
-
-@registry.language_data("spacy.tt.lex_attr_getters")
-def lex_attr_getters() -> Dict[int, Callable[[str], Any]]:
-    return LEX_ATTRS


 class TatarDefaults(Language.Defaults):
     tokenizer_exceptions = TOKENIZER_EXCEPTIONS
-    infixes = tuple(TOKENIZER_INFIXES)
+    infixes = TOKENIZER_INFIXES
+    lex_attr_getters = LEX_ATTRS
+    stop_words = STOP_WORDS


 class Tatar(Language):
     lang = "tt"
     Defaults = TatarDefaults
-    default_config = Config().from_str(DEFAULT_CONFIG)


 __all__ = ["Tatar"]

@@ -1,4 +1,4 @@
-from typing import Set, Dict, Callable, Any
+from typing import Callable
 from thinc.api import Config

 from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
@@ -11,38 +11,30 @@ from .lemmatizer import UkrainianLemmatizer


 DEFAULT_CONFIG = """
 [nlp]
-lang = "uk"
-stop_words = {"@language_data": "spacy.uk.stop_words"}
-lex_attr_getters = {"@language_data": "spacy.uk.lex_attr_getters"}

 [nlp.lemmatizer]
-@lemmatizers = "spacy.UkrainianLemmatizer.v1"
+@lemmatizers = "spacy.uk.UkrainianLemmatizer"
 """


-@registry.language_data("spacy.uk.stop_words")
-def stop_words() -> Set[str]:
-    return STOP_WORDS
-
-
-@registry.language_data("spacy.uk.lex_attr_getters")
-def lex_attr_getters() -> Dict[int, Callable[[str], Any]]:
-    return LEX_ATTRS
-
-
-@registry.lemmatizers("spacy.UkrainianLemmatizer.v1")
-def create_ukrainian_lemmatizer() -> UkrainianLemmatizer:
-    return UkrainianLemmatizer()
+@registry.lemmatizers("spacy.uk.UkrainianLemmatizer")
+def create_ukrainian_lemmatizer() -> Callable[[Language], UkrainianLemmatizer]:
+    def lemmatizer_factory(nlp: Language) -> UkrainianLemmatizer:
+        return UkrainianLemmatizer()
+
+    return lemmatizer_factory


 class UkrainianDefaults(Language.Defaults):
+    config = Config().from_str(DEFAULT_CONFIG)
     tokenizer_exceptions = TOKENIZER_EXCEPTIONS
+    lex_attr_getters = LEX_ATTRS
+    stop_words = STOP_WORDS


 class Ukrainian(Language):
     lang = "uk"
     Defaults = UkrainianDefaults
-    default_config = Config().from_str(DEFAULT_CONFIG)


 __all__ = ["Ukrainian"]

@@ -1,54 +1,19 @@
-from typing import Set, Dict, Callable, Any
-from thinc.api import Config
-
 from .stop_words import STOP_WORDS
 from .lex_attrs import LEX_ATTRS
 from .punctuation import TOKENIZER_SUFFIXES
-from ..tokenizer_exceptions import BASE_EXCEPTIONS
 from ...language import Language
-from ...util import registry
-
-
-DEFAULT_CONFIG = """
-[nlp]
-lang = "ur"
-stop_words = {"@language_data": "spacy.ur.stop_words"}
-lex_attr_getters = {"@language_data": "spacy.ur.lex_attr_getters"}
-
-[nlp.writing_system]
-direction = "rtl"
-has_case = false
-has_letters = true
-
-[nlp.lemmatizer]
-@lemmatizers = "spacy.Lemmatizer.v1"
-
-[nlp.lemmatizer.data]
-@language_data = "spacy-lookups-data"
-lang = ${nlp:lang}
-tables = ["lemma_lookup"]
-"""
-
-
-@registry.language_data("spacy.ur.stop_words")
-def stop_words() -> Set[str]:
-    return STOP_WORDS
-
-
-@registry.language_data("spacy.ur.lex_attr_getters")
-def lex_attr_getters() -> Dict[int, Callable[[str], Any]]:
-    return LEX_ATTRS


 class UrduDefaults(Language.Defaults):
-    tokenizer_exceptions = BASE_EXCEPTIONS
     suffixes = TOKENIZER_SUFFIXES
+    lex_attr_getters = LEX_ATTRS
+    stop_words = STOP_WORDS
+    writing_system = {"direction": "rtl", "has_case": False, "has_letters": True}


 class Urdu(Language):
     lang = "ur"
     Defaults = UrduDefaults
-    default_config = Config().from_str(DEFAULT_CONFIG)


 __all__ = ["Urdu"]

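Note: the writing-system metadata follows the same move as the other language data: the `[nlp.writing_system]` config block disappears and becomes a plain attribute on the Defaults class, as UrduDefaults shows above. A minimal sketch for an invented right-to-left language (the language code is hypothetical; the attribute name and dict shape come from the diff):

from spacy.language import Language


class ExampleDefaults(Language.Defaults):
    # Replaces the old [nlp.writing_system] config block.
    writing_system = {"direction": "rtl", "has_case": False, "has_letters": True}


class Example(Language):
    lang = "zz"  # hypothetical
    Defaults = ExampleDefaults
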
@@ -1,4 +1,3 @@
-from typing import Set, Dict, Callable, Any
 from thinc.api import Config
 
 from ...language import Language
@@ -10,27 +9,14 @@ from .lex_attrs import LEX_ATTRS
 
 DEFAULT_CONFIG = """
 [nlp]
-lang = "vi"
-stop_words = {"@language_data": "spacy.vi.stop_words"}
-lex_attr_getters = {"@language_data": "spacy.vi.lex_attr_getters"}
 
 [nlp.tokenizer]
-@tokenizers = "spacy.VietnameseTokenizer.v1"
+@tokenizers = "spacy.vi.VietnameseTokenizer"
 use_pyvi = true
 """
 
 
-@registry.language_data("spacy.vi.stop_words")
-def stop_words() -> Set[str]:
-    return STOP_WORDS
-
-
-@registry.language_data("spacy.vi.lex_attr_getters")
-def lex_attr_getters() -> Dict[int, Callable[[str], Any]]:
-    return LEX_ATTRS
-
-
-@registry.tokenizers("spacy.VietnameseTokenizer.v1")
+@registry.tokenizers("spacy.vi.VietnameseTokenizer")
 def create_vietnamese_tokenizer(use_pyvi: bool = True,):
     def vietnamese_tokenizer_factory(nlp):
         return VietnameseTokenizer(nlp, use_pyvi=use_pyvi)
@@ -68,9 +54,15 @@ class VietnameseTokenizer(DummyTokenizer):
         return Doc(self.vocab, words=words, spaces=spaces)
 
 
+class VietnameseDefaults(Language.Defaults):
+    config = Config().from_str(DEFAULT_CONFIG)
+    lex_attr_getters = LEX_ATTRS
+    stop_words = STOP_WORDS
+
+
 class Vietnamese(Language):
     lang = "vi"
-    default_config = Config().from_str(DEFAULT_CONFIG)
+    Defaults = VietnameseDefaults
 
 
 __all__ = ["Vietnamese"]
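Note: for Vietnamese only the registry name changes ("spacy.VietnameseTokenizer.v1" becomes "spacy.vi.VietnameseTokenizer"); the factory and its use_pyvi setting are unchanged. An illustrative sketch (not part of the diff) of overriding that setting through the config, following the same from_config pattern the test fixtures further down use; setting use_pyvi to True would additionally require the pyvi package:

from spacy.util import get_lang_class

tok_config = {
    "@tokenizers": "spacy.vi.VietnameseTokenizer",
    "use_pyvi": False,  # True requires the pyvi package
}
nlp = get_lang_class("vi").from_config({"nlp": {"tokenizer": tok_config}})
doc = nlp("Xin chào thế giới")
print([token.text for token in doc])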
@@ -1,27 +1,12 @@
-from thinc.api import Config
-
-from ..tokenizer_exceptions import BASE_EXCEPTIONS
 from ...language import Language
 
 
-DEFAULT_CONFIG = """
-[nlp]
-lang = "xx"
-"""
-
-
-class MultiLanguageDefaults(Language.Defaults):
-    tokenizer_exceptions = BASE_EXCEPTIONS
-
-
 class MultiLanguage(Language):
     """Language class to be used for models that support multiple languages.
     This module allows models to specify their language ID as 'xx'.
     """
 
     lang = "xx"
-    Defaults = MultiLanguageDefaults
-    default_config = Config().from_str(DEFAULT_CONFIG)
 
 
 __all__ = ["MultiLanguage"]
@@ -1,39 +1,16 @@
-from typing import Set, Dict, Callable, Any
-from thinc.api import Config
-
 from .stop_words import STOP_WORDS
 from .lex_attrs import LEX_ATTRS
-from ..tokenizer_exceptions import BASE_EXCEPTIONS
 from ...language import Language
-from ...util import registry
-
-
-DEFAULT_CONFIG = """
-[nlp]
-lang = "si"
-stop_words = {"@language_data": "spacy.yo.stop_words"}
-lex_attr_getters = {"@language_data": "spacy.yo.lex_attr_getters"}
-"""
-
-
-@registry.language_data("spacy.yo.stop_words")
-def stop_words() -> Set[str]:
-    return STOP_WORDS
-
-
-@registry.language_data("spacy.yo.lex_attr_getters")
-def lex_attr_getters() -> Dict[int, Callable[[str], Any]]:
-    return LEX_ATTRS
 
 
 class YorubaDefaults(Language.Defaults):
-    tokenizer_exceptions = BASE_EXCEPTIONS
+    lex_attr_getters = LEX_ATTRS
+    stop_words = STOP_WORDS
 
 
 class Yoruba(Language):
     lang = "yo"
     Defaults = YorubaDefaults
-    default_config = Config().from_str(DEFAULT_CONFIG)
 
 
 __all__ = ["Yoruba"]
@@ -1,4 +1,4 @@
-from typing import Optional, List, Set, Dict, Callable, Any
+from typing import Optional, List, Dict, Any
 from enum import Enum
 import tempfile
 import srsly
@@ -10,7 +10,6 @@ from ...errors import Warnings, Errors
 from ...language import Language
 from ...tokens import Doc
 from ...util import DummyTokenizer, registry
-from ..tokenizer_exceptions import BASE_EXCEPTIONS
 from .lex_attrs import LEX_ATTRS
 from .stop_words import STOP_WORDS
 from ... import util
@@ -20,20 +19,12 @@ _PKUSEG_INSTALL_MSG = "install it with `pip install pkuseg==0.0.25` or from http
 
 DEFAULT_CONFIG = """
 [nlp]
-lang = "zh"
-stop_words = {"@language_data": "spacy.zh.stop_words"}
-lex_attr_getters = {"@language_data": "spacy.zh.lex_attr_getters"}
 
 [nlp.tokenizer]
-@tokenizers = "spacy.ChineseTokenizer.v1"
+@tokenizers = "spacy.zh.ChineseTokenizer"
 segmenter = "char"
 pkuseg_model = null
 pkuseg_user_dict = "default"
-
-[nlp.writing_system]
-direction = "ltr"
-has_case = false
-has_letters = false
 """
 
 
@@ -47,17 +38,7 @@ class Segmenter(str, Enum):
         return list(cls.__members__.keys())
 
 
-@registry.language_data("spacy.zh.stop_words")
-def stop_words() -> Set[str]:
-    return STOP_WORDS
-
-
-@registry.language_data("spacy.zh.lex_attr_getters")
-def lex_attr_getters() -> Dict[int, Callable[[str], Any]]:
-    return LEX_ATTRS
-
-
-@registry.tokenizers("spacy.ChineseTokenizer.v1")
+@registry.tokenizers("spacy.zh.ChineseTokenizer")
 def create_chinese_tokenizer(
     segmenter: Segmenter = Segmenter.char,
     pkuseg_model: Optional[str] = None,
@@ -155,6 +136,18 @@ class ChineseTokenizer(DummyTokenizer):
                 warn_msg = Warnings.W104.format(target="pkuseg", current=self.segmenter)
                 warnings.warn(warn_msg)
 
+    def _get_config(self) -> Dict[str, Any]:
+        return {
+            "segmenter": self.segmenter,
+            "pkuseg_model": self.pkuseg_model,
+            "pkuseg_user_dict": self.pkuseg_user_dict,
+        }
+
+    def _set_config(self, config: Dict[str, Any] = {}) -> None:
+        self.segmenter = config.get("segmenter", Segmenter.char)
+        self.pkuseg_model = config.get("pkuseg_model", None)
+        self.pkuseg_user_dict = config.get("pkuseg_user_dict", "default")
+
     def to_bytes(self, **kwargs):
         pkuseg_features_b = b""
         pkuseg_weights_b = b""
@@ -175,6 +168,7 @@ class ChineseTokenizer(DummyTokenizer):
                 sorted(list(self.pkuseg_seg.postprocesser.other_words)),
             )
         serializers = {
+            "cfg": lambda: srsly.json_dumps(self._get_config()),
            "pkuseg_features": lambda: pkuseg_features_b,
            "pkuseg_weights": lambda: pkuseg_weights_b,
            "pkuseg_processors": lambda: srsly.msgpack_dumps(pkuseg_processors_data),
@@ -194,6 +188,7 @@ class ChineseTokenizer(DummyTokenizer):
             pkuseg_data["processors_data"] = srsly.msgpack_loads(b)
 
         deserializers = {
+            "cfg": lambda b: self._set_config(srsly.json_loads(b)),
            "pkuseg_features": deserialize_pkuseg_features,
            "pkuseg_weights": deserialize_pkuseg_weights,
            "pkuseg_processors": deserialize_pkuseg_processors,
@@ -246,6 +241,7 @@ class ChineseTokenizer(DummyTokenizer):
             srsly.write_msgpack(path, data)
 
         serializers = {
+            "cfg": lambda p: srsly.write_json(p, self._get_config()),
            "pkuseg_model": lambda p: save_pkuseg_model(p),
            "pkuseg_processors": lambda p: save_pkuseg_processors(p),
         }
@@ -281,6 +277,7 @@ class ChineseTokenizer(DummyTokenizer):
                 self.pkuseg_seg.postprocesser.other_words = set(other_words)
 
         serializers = {
+            "cfg": lambda p: self._set_config(srsly.read_json(p)),
            "pkuseg_model": lambda p: load_pkuseg_model(p),
            "pkuseg_processors": lambda p: load_pkuseg_processors(p),
         }
@@ -288,13 +285,15 @@ class ChineseTokenizer(DummyTokenizer):
 
 
 class ChineseDefaults(Language.Defaults):
-    tokenizer_exceptions = BASE_EXCEPTIONS
+    config = Config().from_str(DEFAULT_CONFIG)
+    lex_attr_getters = LEX_ATTRS
+    stop_words = STOP_WORDS
+    writing_system = {"direction": "ltr", "has_case": False, "has_letters": False}
 
 
 class Chinese(Language):
     lang = "zh"
     Defaults = ChineseDefaults
-    default_config = Config().from_str(DEFAULT_CONFIG)
 
 
 def try_jieba_import(segmenter: str) -> None:
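Note: besides the renamed registry entry ("spacy.zh.ChineseTokenizer"), the new "cfg" serializers mean the segmenter settings now survive to_bytes/from_bytes and to_disk/from_disk. An illustrative sketch (not part of the diff) with the dependency-free "char" segmenter; "jieba" or "pkuseg" would need those packages installed, and this assumes the refactored pipeline construction works as shown in the hunks above:

from spacy.util import get_lang_class

tok_config = {"@tokenizers": "spacy.zh.ChineseTokenizer", "segmenter": "char"}
nlp = get_lang_class("zh").from_config({"nlp": {"tokenizer": tok_config}})

# The "cfg" entry added above round-trips the segmenter choice.
tok_bytes = nlp.tokenizer.to_bytes()
nlp.tokenizer.from_bytes(tok_bytes)
print(nlp.tokenizer.segmenter)  # expected: "char"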
@@ -16,27 +16,25 @@ import multiprocessing as mp
 from itertools import chain, cycle
 
 from .tokens.underscore import Underscore
-from .vocab import Vocab
+from .vocab import Vocab, create_vocab
 from .pipe_analysis import analyze_pipes, analyze_all_pipes, validate_attrs
 from .gold import Example
 from .scorer import Scorer
 from .util import link_vectors_to_models, create_default_optimizer, registry
 from .util import SimpleFrozenDict
+from .lang.tokenizer_exceptions import URL_MATCH, BASE_EXCEPTIONS
 from .lang.punctuation import TOKENIZER_PREFIXES, TOKENIZER_SUFFIXES
 from .lang.punctuation import TOKENIZER_INFIXES
 from .tokens import Doc
+from .lookups import load_lookups
+from .tokenizer import Tokenizer
+from .lemmatizer import Lemmatizer
 from .errors import Errors, Warnings
 from .schemas import ConfigSchema
 from .git_info import GIT_VERSION
 from . import util
 from . import about
 
-# We also need to import these to make sure the functions are registered
-from .tokenizer import Tokenizer  # noqa: F401
-from .lemmatizer import Lemmatizer  # noqa: F401
-from .lookups import Lookups  # noqa: F401
-from .lang import defaults  # noqa: F401
-
 
 ENABLE_PIPELINE_ANALYSIS = False
 # This is the base config will all settings (training etc.)
@@ -45,10 +43,50 @@ DEFAULT_CONFIG = Config().from_disk(DEFAULT_CONFIG_PATH)
 
 
 class BaseDefaults:
-    prefixes: Tuple[Pattern, ...] = tuple(TOKENIZER_PREFIXES)
-    suffixes: Tuple[Pattern, ...] = tuple(TOKENIZER_SUFFIXES)
-    infixes: Tuple[Pattern, ...] = tuple(TOKENIZER_INFIXES)
-    tokenizer_exceptions: Dict[str, List[dict]] = {}
+    config: Config = Config()
+    tokenizer_exceptions: Dict[str, List[dict]] = BASE_EXCEPTIONS
+    prefixes: Optional[List[Union[str, Pattern]]] = TOKENIZER_PREFIXES
+    suffixes: Optional[List[Union[str, Pattern]]] = TOKENIZER_SUFFIXES
+    infixes: Optional[List[Union[str, Pattern]]] = TOKENIZER_INFIXES
+    token_match: Optional[Pattern] = None
+    url_match: Optional[Pattern] = URL_MATCH
+    syntax_iterators: Dict[str, Callable] = {}
+    lex_attr_getters: Dict[int, Callable[[str], Any]] = {}
+    stop_words = set()
+    writing_system = {"direction": "ltr", "has_case": True, "has_letters": True}
+
+
+@registry.tokenizers("spacy.Tokenizer.v1")
+def create_tokenizer() -> Callable[["Language"], Tokenizer]:
+    def tokenizer_factory(nlp: "Language") -> Tokenizer:
+        prefixes = nlp.Defaults.prefixes
+        suffixes = nlp.Defaults.suffixes
+        infixes = nlp.Defaults.infixes
+        prefix_search = util.compile_prefix_regex(prefixes).search if prefixes else None
+        suffix_search = util.compile_suffix_regex(suffixes).search if suffixes else None
+        infix_finditer = util.compile_infix_regex(infixes).finditer if infixes else None
+        return Tokenizer(
+            nlp.vocab,
+            rules=nlp.Defaults.tokenizer_exceptions,
+            prefix_search=prefix_search,
+            suffix_search=suffix_search,
+            infix_finditer=infix_finditer,
+            token_match=nlp.Defaults.token_match,
+            url_match=nlp.Defaults.url_match,
+        )
+
+    return tokenizer_factory
+
+
+@registry.lemmatizers("spacy.Lemmatizer.v1")
+def create_lemmatizer() -> Callable[["Language"], "Lemmatizer"]:
+    tables = ["lemma_lookup", "lemma_rules", "lemma_exc", "lemma_index"]
+
+    def lemmatizer_factory(nlp: "Language") -> "Lemmatizer":
+        lookups = load_lookups(lang=nlp.lang, tables=tables, strict=False)
+        return Lemmatizer(lookups=lookups)
+
+    return lemmatizer_factory
 
 
 class Language:
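Note: the registered "spacy.Tokenizer.v1" factory above takes no settings of its own; it reads everything from nlp.Defaults when the pipeline is constructed and compiles the prefix/suffix/infix lists with the util helpers. A small illustration (not part of the diff) of that compilation step in isolation; the extra "%" suffix is made up:

from spacy.util import compile_suffix_regex
from spacy.lang.punctuation import TOKENIZER_SUFFIXES

# This mirrors what tokenizer_factory() does with nlp.Defaults.suffixes.
suffixes = list(TOKENIZER_SUFFIXES) + ["%"]
suffix_regex = compile_suffix_regex(suffixes)
print(bool(suffix_regex.search("100%")))  # True: a suffix pattern matches at the end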
@@ -65,8 +103,8 @@ class Language:
     Defaults = BaseDefaults
     lang: str = None
     default_config = DEFAULT_CONFIG
-    factories = SimpleFrozenDict(error=Errors.E957)
 
+    factories = SimpleFrozenDict(error=Errors.E957)
     _factory_meta: Dict[str, "FactoryMeta"] = {}  # meta by factory
 
     def __init__(
@@ -75,6 +113,7 @@ class Language:
         max_length: int = 10 ** 6,
         meta: Dict[str, Any] = {},
         create_tokenizer: Optional[Callable[["Language"], Callable[[str], Doc]]] = None,
+        create_lemmatizer: Optional[Callable[["Language"], Callable]] = None,
         **kwargs,
     ):
         """Initialise a Language object.
@@ -108,7 +147,16 @@ class Language:
 
         if vocab is True:
             vectors_name = meta.get("vectors", {}).get("name")
-            vocab = Vocab.from_config(self._config, vectors_name=vectors_name)
+            if not create_lemmatizer:
+                lemma_cfg = {"lemmatizer": self._config["nlp"]["lemmatizer"]}
+                create_lemmatizer = registry.make_from_config(lemma_cfg)["lemmatizer"]
+            # TODO: where does the vocab data come in?
+            vocab = create_vocab(
+                self.lang,
+                self.Defaults,
+                lemmatizer=create_lemmatizer(self),
+                vectors_name=vectors_name,
+            )
         else:
             if (self.lang and vocab.lang) and (self.lang != vocab.lang):
                 raise ValueError(Errors.E150.format(nlp=self.lang, vocab=vocab.lang))
@@ -126,7 +174,10 @@ class Language:
 
     def __init_subclass__(cls, **kwargs):
         super().__init_subclass__(**kwargs)
-        cls.default_config = util.deep_merge_configs(cls.default_config, DEFAULT_CONFIG)
+        cls.default_config = util.deep_merge_configs(
+            cls.Defaults.config, DEFAULT_CONFIG
+        )
+        cls.default_config["nlp"]["lang"] = cls.lang
 
     @property
     def path(self):
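Note: with this change, __init_subclass__ merges the per-language Defaults.config into the shared DEFAULT_CONFIG and stamps the language code, so subclasses no longer assign default_config themselves. An illustrative sketch (not part of the diff; the "zz" subclass is hypothetical and this assumes deep_merge_configs behaves as used above):

from thinc.api import Config
from spacy.language import Language

CUSTOM_CONFIG = """
[nlp]
pipeline = []
"""


class DummyDefaults(Language.Defaults):
    config = Config().from_str(CUSTOM_CONFIG)


class Dummy(Language):
    lang = "zz"
    Defaults = DummyDefaults


# The subclass hook filled in the language code automatically.
print(Dummy.default_config["nlp"]["lang"])  # expected: "zz"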
@@ -1226,17 +1277,16 @@ class Language:
         config = util.deep_merge_configs(config, cls.default_config)
         if "nlp" not in config:
             raise ValueError(Errors.E985.format(config=config))
-        nlp_config = config["nlp"]
-        config_lang = nlp_config["lang"]
+        config_lang = config["nlp"]["lang"]
         if cls.lang is not None and config_lang is not None and config_lang != cls.lang:
             raise ValueError(
                 Errors.E958.format(
-                    bad_lang_code=nlp_config["lang"],
+                    bad_lang_code=config["nlp"]["lang"],
                     lang_code=cls.lang,
                     lang=util.get_object_name(cls),
                 )
             )
-        nlp_config["lang"] = cls.lang
+        config["nlp"]["lang"] = cls.lang
         # This isn't very elegant, but we remove the [components] block here to prevent
         # it from getting resolved (causes problems because we expect to pass in
         # the nlp and name args for each component). If we're auto-filling, we're
@@ -1251,22 +1301,12 @@ class Language:
         filled["components"] = orig_pipeline
         config["components"] = orig_pipeline
         create_tokenizer = resolved["nlp"]["tokenizer"]
-        lemmatizer = resolved["nlp"]["lemmatizer"]
-        lex_attr_getters = resolved["nlp"]["lex_attr_getters"]
-        stop_words = resolved["nlp"]["stop_words"]
-        vocab_data = resolved["nlp"]["vocab_data"]
-        get_noun_chunks = resolved["nlp"]["get_noun_chunks"]
-        vocab = Vocab.from_config(
-            filled,
-            lemmatizer=lemmatizer,
-            lex_attr_getters=lex_attr_getters,
-            stop_words=stop_words,
-            vocab_data=vocab_data,
-            get_noun_chunks=get_noun_chunks,
+        create_lemmatizer = resolved["nlp"]["lemmatizer"]
+        nlp = cls(
+            create_tokenizer=create_tokenizer, create_lemmatizer=create_lemmatizer,
         )
-        nlp = cls(vocab, create_tokenizer=create_tokenizer)
         pipeline = config.get("components", {})
-        for pipe_name in nlp_config["pipeline"]:
+        for pipe_name in config["nlp"]["pipeline"]:
             if pipe_name not in pipeline:
                 opts = ", ".join(pipeline.keys())
                 raise ValueError(Errors.E956.format(name=pipe_name, opts=opts))
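Note: from_config no longer builds a Vocab from resolved config blocks; it passes the resolved tokenizer and lemmatizer callables to cls(), which creates the vocab from the class's Defaults. Callers keep the same entry point. A sketch (not part of the diff) mirroring the test fixtures changed further down:

from spacy.util import get_lang_class

# Only the tokenizer block is overridden; everything else is auto-filled from
# the class's default_config.
tok_config = {"@tokenizers": "spacy.Tokenizer.v1"}
nlp = get_lang_class("en").from_config({"nlp": {"tokenizer": tok_config}})
print(nlp.lang)  # expected: "en"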
@@ -2,12 +2,6 @@ from typing import Optional, Callable, List, Dict
 
 from .lookups import Lookups
 from .parts_of_speech import NAMES as UPOS_NAMES
-from .util import registry
-
-
-@registry.lemmatizers("spacy.Lemmatizer.v1")
-def create_lemmatizer(data: Dict[str, str] = {}) -> "Lemmatizer":
-    return Lemmatizer(data=data)
 
 
 class Lemmatizer:
@@ -21,7 +15,6 @@ class Lemmatizer:
     def __init__(
         self,
         lookups: Optional[Lookups] = None,
-        data: Dict[str, dict] = {},
         is_base_form: Optional[Callable] = None,
     ) -> None:
         """Initialize a Lemmatizer.
@@ -31,9 +24,6 @@ class Lemmatizer:
         RETURNS (Lemmatizer): The newly constructed object.
         """
         self.lookups = lookups if lookups is not None else Lookups()
-        for name, table in data.items():
-            if table is not None:
-                self.lookups.add_table(name, table)
         self.is_base_form = is_base_form
 
     def __call__(
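Note: with the data argument gone, lookup tables are added to a Lookups object up front (as the new "spacy.Lemmatizer.v1" factory in language.py now does) and passed in. An illustrative sketch (not part of the diff); the table contents are made up:

from spacy.lookups import Lookups
from spacy.lemmatizer import Lemmatizer

lookups = Lookups()
lookups.add_table("lemma_lookup", {"dogs": "dog"})
lemmatizer = Lemmatizer(lookups=lookups)
print(lemmatizer.lookup("dogs"))  # expected: "dog"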
@@ -13,7 +13,9 @@ UNSET = object()
 
 
 @registry.language_data("spacy-lookups-data")
-def get_lookups(lang: str, tables: List[str]) -> Optional[Dict[str, Any]]:
+def load_lookups(
+    lang: str, tables: List[str], strict: bool = True
+) -> Optional[Dict[str, Any]]:
     """Load the data from the spacy-lookups-data package for a given language,
     if available. Returns an empty dict if there's no data or if the package
     is not installed.
@@ -24,15 +26,19 @@ def get_lookups(lang: str, tables: List[str]) -> Optional[Dict[str, Any]]:
     RETURNS (Dict[str, Any]): The lookups, keyed by table name.
     """
     # TODO: import spacy_lookups_data instead of going via entry points here?
+    lookups = Lookups()
     if lang not in registry.lookups:
-        return {}
+        return lookups
     data = registry.lookups.get(lang)
-    result = {}
     for table in tables:
         if table not in data:
-            raise ValueError("TODO: unknown table")
-        result[table] = load_language_data(data[table])
-    return result
+            if strict:
+                raise ValueError("TODO: unknown table")
+            language_data = {}
+        else:
+            language_data = load_language_data(data[table])
+        lookups.add_table(table, language_data)
+    return lookups
 
 
 class Lookups:
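Note: the renamed load_lookups now returns a Lookups object rather than a plain dict, and strict=False makes missing data non-fatal (an empty Lookups, or an empty table, instead of a ValueError). Illustrative usage (not part of the diff); whether the table has entries depends on spacy-lookups-data being installed:

from spacy.lookups import load_lookups

lookups = load_lookups(lang="en", tables=["lemma_lookup"], strict=False)
print(lookups.has_table("lemma_lookup"), len(lookups))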
@@ -239,11 +239,7 @@ class ConfigSchemaNlp(BaseModel):
     pipeline: List[StrictStr] = Field(..., title="The pipeline component names in order")
     tokenizer: Callable = Field(..., title="The tokenizer to use")
     lemmatizer: Callable = Field(..., title="The lemmatizer to use")
-    writing_system: ConfigSchemaNlpWritingSystem = Field(..., title="The language's writing system")
-    stop_words: Sequence[StrictStr] = Field(..., title="Stop words to mark via Token/Lexeme.is_stop")
-    lex_attr_getters: Dict[StrictStr, Callable] = Field(..., title="Custom getter functions for lexical attributes (e.g. like_num)")
     vocab_data: Dict[StrictStr, Dict[StrictStr, Any]] = Field(..., title="Vocabulary data, e.g. lexeme normalization tables")
-    get_noun_chunks: Optional[Callable] = Field(..., title="Function to extract noun phrases from a Doc")
     # fmt: on
 
     class Config:
@@ -257,7 +257,7 @@ def zh_tokenizer_char():
 def zh_tokenizer_jieba():
     pytest.importorskip("jieba")
     config = {
-        "@tokenizers": "spacy.ChineseTokenizer.v1",
+        "@tokenizers": "spacy.zh.ChineseTokenizer",
         "segmenter": "jieba",
     }
     nlp = get_lang_class("zh").from_config({"nlp": {"tokenizer": config}})
@@ -268,7 +268,7 @@ def zh_tokenizer_jieba():
 def zh_tokenizer_pkuseg():
     pytest.importorskip("pkuseg")
     config = {
-        "@tokenizers": "spacy.ChineseTokenizer.v1",
+        "@tokenizers": "spacy.zh.ChineseTokenizer",
         "segmenter": "pkuseg",
         "pkuseg_model": "default",
     }
@@ -26,37 +26,6 @@ from .attrs import intify_attrs
 from .symbols import ORTH
 
 
-@registry.tokenizers("spacy.Tokenizer.v1")
-def create_tokenizer(
-    # exceptions: Dict[str, List[dict]],
-    # prefixes: Optional[List[Union[str, Pattern]]],
-    # suffixes: Optional[List[Union[str, Pattern]]],
-    # infixes: Optional[List[Union[str, Pattern]]],
-    # We currently can't validate against Pattern because that will cause
-    # Pydantic to parse value *as* pattern
-    token_match: Optional[Any] = None,
-    url_match: Optional[Any] = None,
-) -> "Tokenizer":
-    def tokenizer_factory(nlp):
-        exceptions = nlp.Defaults.tokenizer_exceptions
-        prefixes = nlp.Defaults.prefixes
-        suffixes = nlp.Defaults.suffixes
-        infixes = nlp.Defaults.infixes
-        prefix_search = util.compile_prefix_regex(prefixes).search if prefixes else None
-        suffix_search = util.compile_suffix_regex(suffixes).search if suffixes else None
-        infix_finditer = util.compile_infix_regex(infixes).finditer if infixes else None
-        return Tokenizer(
-            nlp.vocab,
-            rules=exceptions,
-            prefix_search=prefix_search,
-            suffix_search=suffix_search,
-            infix_finditer=infix_finditer,
-            token_match=token_match,
-            url_match=url_match,
-        )
-    return tokenizer_factory
-
-
 cdef class Tokenizer:
     """Segment text, and create Doc objects with the discovered segment
     boundaries.
@@ -23,6 +23,33 @@ from .lang.norm_exceptions import BASE_NORMS
 from .lang.lex_attrs import LEX_ATTRS, is_stop, get_lang
 
 
+def create_vocab(lang, defaults, lemmatizer=None, vocab_data={}, vectors_name=None):
+    lex_attrs = {**LEX_ATTRS, **defaults.lex_attr_getters}
+    # This is messy, but it's the minimal working fix to Issue #639.
+    lex_attrs[IS_STOP] = functools.partial(is_stop, stops=defaults.stop_words)
+    # Ensure that getter can be pickled
+    lex_attrs[LANG] = functools.partial(get_lang, lang=lang)
+    lex_attrs[NORM] = util.add_lookups(
+        lex_attrs.get(NORM, LEX_ATTRS[NORM]),
+        BASE_NORMS,
+        vocab_data.get("lexeme_norm", {}),
+    )
+    lookups = Lookups()
+    for name, data in vocab_data.items():
+        if name not in lookups:
+            data = data if data is not None else {}
+            lookups.add_table(name, data)
+    return Vocab(
+        lex_attr_getters=lex_attrs,
+        lemmatizer=lemmatizer,
+        lookups=lookups,
+        writing_system=defaults.writing_system,
+        get_noun_chunks=defaults.syntax_iterators.get("noun_chunks"),
+        vectors_name=vectors_name,
+    )
+
+
 cdef class Vocab:
     """A look-up table that allows you to access `Lexeme` objects. The `Vocab`
     instance also provides access to the `StringStore`, and owns underlying
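Note: create_vocab is the replacement for the removed Vocab.from_config below; Language.__init__ now calls it with the class's Defaults. Illustrative direct usage (not part of the diff; uses the English defaults, which is what the Language constructor would pass):

from spacy.vocab import create_vocab
from spacy.lang.en import English

vocab = create_vocab("en", English.Defaults)
print(vocab.lang)  # expected: "en"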
@@ -31,7 +58,7 @@ cdef class Vocab:
     DOCS: https://spacy.io/api/vocab
     """
     def __init__(self, lex_attr_getters=None, lemmatizer=None,
-                 strings=tuple(), lookups=None, tag_map={}, vocab_data={},
+                 strings=tuple(), lookups=None, tag_map={},
                  oov_prob=-20., vectors_name=None, writing_system={},
                  get_noun_chunks=None, **deprecated_kwargs):
         """Create the vocabulary.
@@ -51,10 +78,6 @@ cdef class Vocab:
         lex_attr_getters = lex_attr_getters if lex_attr_getters is not None else {}
         if lookups in (None, True, False):
             lookups = Lookups()
-        for name, data in vocab_data.items():
-            if name not in lookups:
-                data = data if data is not None else {}
-                lookups.add_table(name, data)
         if lemmatizer in (None, True, False):
             lemmatizer = Lemmatizer(lookups)
         self.cfg = {'oov_prob': oov_prob}
@@ -416,66 +439,6 @@ cdef class Vocab:
             orth = self.strings.add(orth)
         return orth in self.vectors
 
-    @classmethod
-    def from_config(
-        cls,
-        config,
-        lemmatizer=None,
-        lex_attr_getters=None,
-        stop_words=None,
-        vocab_data=None,
-        get_noun_chunks=None,
-        vectors_name=None,
-    ):
-        """Create a Vocab from a config and (currently) language defaults, i.e.
-        nlp.Defaults.
-
-        config (Dict[str, Any]): The full config.
-        lemmatizer (Callable): Optional lemmatizer.
-        vectors_name (str): Optional vectors name.
-        RETURNS (Vocab): The vocab.
-        """
-        # TODO: make this less messy – move lemmatizer out into its own pipeline
-        # component, move language defaults to config
-        lang = config["nlp"]["lang"]
-        writing_system = config["nlp"]["writing_system"]
-        if not lemmatizer:
-            lemma_cfg = {"lemmatizer": config["nlp"]["lemmatizer"]}
-            lemmatizer = registry.make_from_config(lemma_cfg)["lemmatizer"]
-        if stop_words is None:
-            stop_words_cfg = {"stop_words": config["nlp"]["stop_words"]}
-            stop_words = registry.make_from_config(stop_words_cfg)["stop_words"]
-        if vocab_data is None:
-            vocab_data_cfg = {"vocab_data": config["nlp"]["vocab_data"]}
-            vocab_data = registry.make_from_config(vocab_data_cfg)["vocab_data"]
-        if get_noun_chunks is None:
-            noun_chunks_cfg = {"get_noun_chunks": config["nlp"]["get_noun_chunks"]}
-            get_noun_chunks = registry.make_from_config(noun_chunks_cfg)["get_noun_chunks"]
-        if lex_attr_getters is None:
-            lex_attrs_cfg = {"lex_attr_getters": config["nlp"]["lex_attr_getters"]}
-            lex_attr_getters = registry.make_from_config(lex_attrs_cfg)["lex_attr_getters"]
-        lex_attrs = dict(LEX_ATTRS)
-        lex_attrs.update(lex_attr_getters)
-        # This is messy, but it's the minimal working fix to Issue #639.
-        lex_attrs[IS_STOP] = functools.partial(is_stop, stops=stop_words)
-        # Ensure that getter can be pickled
-        lex_attrs[LANG] = functools.partial(get_lang, lang=lang)
-        lex_attrs[NORM] = util.add_lookups(
-            lex_attrs.get(NORM, LEX_ATTRS[NORM]),
-            BASE_NORMS,
-            vocab_data.get("lexeme_norm", {}),
-        )
-        vocab = cls(
-            lex_attr_getters=lex_attrs,
-            vocab_data=vocab_data,
-            lemmatizer=lemmatizer,
-            writing_system=writing_system,
-            get_noun_chunks=get_noun_chunks
-        )
-        if vocab.vectors.name is None and vectors_name:
-            vocab.vectors.name = vectors_name
-        return vocab
-
     def to_disk(self, path, exclude=tuple()):
         """Save the current state to a directory.