WIP: move more language data to config

Ines Montani 2020-07-22 15:59:37 +02:00
parent b84fd70cc3
commit 945f795a3e
41 changed files with 174 additions and 134 deletions
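The recurring change across the language files below: the lemmatizer's [nlp.lemmatizer.data_paths] block becomes [nlp.lemmatizer.data] with an explicit tables list, and per-language lexeme data moves into a new [nlp.vocab_data] block. A minimal before/after sketch of the pattern (table names vary per language; see the individual diffs):

    # before
    [nlp.lemmatizer.data_paths]
    @language_data = "spacy-lookups-data"
    lang = ${nlp:lang}

    # after
    [nlp.lemmatizer.data]
    @language_data = "spacy-lookups-data"
    lang = ${nlp:lang}
    tables = ["lemma_lookup"]

    [nlp.vocab_data]
    @language_data = "spacy-lookups-data"
    lang = ${nlp:lang}
    tables = ["lexeme_norm"]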

View File

@@ -112,10 +112,9 @@ def init_model(
     # Create empty extra lexeme tables so the data from spacy-lookups-data
     # isn't loaded if these features are accessed
     if omit_extra_lookups:
-        nlp.vocab.lookups_extra = Lookups()
-        nlp.vocab.lookups_extra.add_table("lexeme_cluster")
-        nlp.vocab.lookups_extra.add_table("lexeme_prob")
-        nlp.vocab.lookups_extra.add_table("lexeme_settings")
+        nlp.vocab.lookups.remove_table("lexeme_cluster")
+        nlp.vocab.lookups.remove_table("lexeme_prob")
+        nlp.vocab.lookups.remove_table("lexeme_settings")
     msg.good("Successfully created model")
     if vectors_loc is not None:

View File

@@ -123,10 +123,9 @@ def train(
     # Create empty extra lexeme tables so the data from spacy-lookups-data
     # isn't loaded if these features are accessed
     if config["training"]["omit_extra_lookups"]:
-        nlp.vocab.lookups_extra = Lookups()
-        nlp.vocab.lookups_extra.add_table("lexeme_cluster")
-        nlp.vocab.lookups_extra.add_table("lexeme_prob")
-        nlp.vocab.lookups_extra.add_table("lexeme_settings")
+        nlp.vocab.lookups.remove_table("lexeme_cluster")
+        nlp.vocab.lookups.remove_table("lexeme_prob")
+        nlp.vocab.lookups.remove_table("lexeme_settings")
     # Load a pretrained tok2vec model - cf. CLI command 'pretrain'
     if weights_data is not None:
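Since the extra lexeme tables now live in the regular vocab lookups, omitting them becomes a plain table removal in both CLI paths above. A small sketch of the Lookups calls involved (table contents hypothetical):

    from spacy.lookups import Lookups

    lookups = Lookups()
    lookups.add_table("lexeme_cluster", {"the": 11})  # hypothetical data
    if "lexeme_cluster" in lookups:  # guard: remove_table raises on a missing table
        lookups.remove_table("lexeme_cluster")
    assert not lookups.has_table("lexeme_cluster")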

View File

@@ -2,6 +2,7 @@
 lang = null
 stop_words = []
 lex_attr_getters = {}
+vocab_data = {}
 pipeline = []

 [nlp.tokenizer]
@@ -9,6 +10,7 @@ pipeline = []
 [nlp.lemmatizer]
 @lemmatizers = "spacy.Lemmatizer.v1"
+data = {}

 [nlp.writing_system]
 direction = "ltr"

View File

@@ -434,9 +434,6 @@ class Errors:
     E170 = ("Cannot apply transition {name}: invalid for the current state.")
     E171 = ("Matcher.add received invalid on_match callback argument: expected "
             "callable or None, but got: {arg_type}")
-    E172 = ("The Lemmatizer.load classmethod is deprecated. To create a "
-            "Lemmatizer, initialize the class directly. See the docs for "
-            "details: https://spacy.io/api/lemmatizer")
     E175 = ("Can't remove rule for unknown match pattern ID: {key}")
     E176 = ("Alias '{alias}' is not defined in the Knowledge Base.")
     E177 = ("Ill-formed IOB input detected: {tag}")
@@ -601,7 +598,7 @@ class Errors:
              "the same `Vocab`.")
     E1000 = ("No pkuseg model available. Provide a pkuseg model when "
              "initializing the pipeline:\n"
-             'cfg = {"tokenizer": {"segmenter": "pkuseg", "pkuseg_model": name_or_path}}\m'
+             'cfg = {"tokenizer": {"segmenter": "pkuseg", "pkuseg_model": name_or_path}}\n'
              'nlp = Chinese(config=cfg)')

View File

@@ -25,8 +25,9 @@ def make_orth_variants(nlp, raw_text, orig_token_dict, orth_variant_level=0.0):
             lower = True
             if raw is not None:
                 raw = raw.lower()
-    ndsv = nlp.Defaults.single_orth_variants
-    ndpv = nlp.Defaults.paired_orth_variants
+    orth_variants = nlp.vocab.lookups.get_table("orth_variants", {})
+    ndsv = orth_variants.get("single", [])
+    ndpv = orth_variants.get("paired", [])
     words = token_dict.get("words", [])
     tags = token_dict.get("tags", [])
     # keep unmodified if words or tags are not defined
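The single/paired orth variants previously hard-coded on Language.Defaults (removed from the German and English files below) are now read from an "orth_variants" lookups table. Judging from the removed English defaults, the expected table shape is roughly this (a sketch, not the shipped data):

    orth_variants = {
        "single": [{"tags": ["NFP"], "variants": ["…", "..."]}],
        "paired": [{"tags": ["``", "''"], "variants": [("'", "'"), ("‘", "’")]}],
    }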

View File

@ -17,9 +17,10 @@ stop_words = {"@language_data": "spacy.bn.stop_words"}
[nlp.lemmatizer] [nlp.lemmatizer]
@lemmatizers = "spacy.Lemmatizer.v1" @lemmatizers = "spacy.Lemmatizer.v1"
[nlp.lemmatizer.data_paths] [nlp.lemmatizer.data]
@language_data = "spacy-lookups-data" @language_data = "spacy-lookups-data"
lang = ${nlp:lang} lang = ${nlp:lang}
tables = ["lemma_rules"]
""" """

View File

@@ -19,9 +19,10 @@ lex_attr_getters = {"@language_data": "spacy.ca.lex_attr_getters"}
 [nlp.lemmatizer]
 @lemmatizers = "spacy.Lemmatizer.v1"

-[nlp.lemmatizer.data_paths]
+[nlp.lemmatizer.data]
 @language_data = "spacy-lookups-data"
 lang = ${nlp:lang}
+tables = ["lemma_lookup"]
 """

View File

@@ -19,9 +19,15 @@ lex_attr_getters = {"@language_data": "spacy.da.lex_attr_getters"}
 [nlp.lemmatizer]
 @lemmatizers = "spacy.Lemmatizer.v1"

-[nlp.lemmatizer.data_paths]
+[nlp.lemmatizer.data]
 @language_data = "spacy-lookups-data"
 lang = ${nlp:lang}
+tables = ["lemma_lookup"]
+
+[nlp.vocab_data]
+@language_data = "spacy-lookups-data"
+lang = ${nlp:lang}
+tables = ["lexeme_norm"]
 """

View File

@@ -19,9 +19,15 @@ stop_words = {"@language_data": "spacy.de.stop_words"}
 [nlp.lemmatizer]
 @lemmatizers = "spacy.Lemmatizer.v1"

-[nlp.lemmatizer.data_paths]
+[nlp.lemmatizer.data]
 @language_data = "spacy-lookups-data"
 lang = ${nlp:lang}
+tables = ["lemma_lookup"]
+
+[nlp.vocab_data]
+@language_data = "spacy-lookups-data"
+lang = ${nlp:lang}
+tables = ["lexeme_norm", "lexeme_cluster", "lexeme_prob", "lexeme_settings", "orth_variants"]
 """
@@ -36,20 +42,6 @@ class GermanDefaults(Language.Defaults):
     suffixes = TOKENIZER_SUFFIXES
     infixes = TOKENIZER_INFIXES
     syntax_iterators = SYNTAX_ITERATORS
-    single_orth_variants = [
-        {"tags": ["$("], "variants": ["…", "..."]},
-        {"tags": ["$("], "variants": ["-", "–", "—", "--", "---", "——"]},
-    ]
-    paired_orth_variants = [
-        {
-            "tags": ["$("],
-            "variants": [("'", "'"), (",", "'"), ("‚", "‘"), ("›", "‹"), ("‹", "›")],
-        },
-        {
-            "tags": ["$("],
-            "variants": [("``", "''"), ('"', '"'), ("„", "“"), ("»", "«"), ("«", "»")],
-        },
-    ]


 class German(Language):

View File

@@ -21,15 +21,21 @@ lex_attr_getters = {"@language_data": "spacy.el.lex_attr_getters"}
 [nlp.lemmatizer]
 @lemmatizers = "spacy.GreekLemmatizer.v1"

-[nlp.lemmatizer.data_paths]
+[nlp.lemmatizer.data]
 @language_data = "spacy-lookups-data"
 lang = ${nlp:lang}
+tables = ["lemma_index", "lemma_exc", "lemma_rules"]
+
+[nlp.vocab_data]
+@language_data = "spacy-lookups-data"
+lang = ${nlp:lang}
+tables = ["lexeme_norm", "lexeme_prob", "lexeme_settings"]
 """


 @registry.lemmatizers("spacy.GreekLemmatizer.v1")
-def create_greek_lemmatizer(data_paths: dict = {}) -> GreekLemmatizer:
-    return GreekLemmatizer(data_paths=data_paths)
+def create_greek_lemmatizer(data: Dict[str, dict] = {}) -> GreekLemmatizer:
+    return GreekLemmatizer(data=data)


 @registry.language_data("spacy.el.stop_words")

View File

@@ -22,9 +22,15 @@ lex_attr_getters = {"@language_data": "spacy.en.lex_attr_getters"}
 [nlp.lemmatizer]
 @lemmatizers = "spacy.EnglishLemmatizer.v1"

-[nlp.lemmatizer.data_paths]
+[nlp.lemmatizer.data]
 @language_data = "spacy-lookups-data"
 lang = ${nlp:lang}
+tables = ["lemma_lookup", "lemma_rules", "lemma_exc", "lemma_index"]
+
+[nlp.vocab_data]
+@language_data = "spacy-lookups-data"
+lang = ${nlp:lang}
+tables = ["lexeme_norm", "lexeme_cluster", "lexeme_prob", "lexeme_settings", "orth_variants"]
 """
@@ -39,22 +45,14 @@ def lex_attr_getters() -> Dict[int, Callable[[str], Any]]:

 @registry.lemmatizers("spacy.EnglishLemmatizer.v1")
-def create_lemmatizer(data_paths: dict = {}) -> "Lemmatizer":
-    return Lemmatizer(data_paths=data_paths, is_base_form=is_base_form)
+def create_lemmatizer(data: Dict[str, dict] = {}) -> "Lemmatizer":
+    return Lemmatizer(data=data, is_base_form=is_base_form)


 class EnglishDefaults(Language.Defaults):
     tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS)
     syntax_iterators = SYNTAX_ITERATORS
     infixes = TOKENIZER_INFIXES
-    single_orth_variants = [
-        {"tags": ["NFP"], "variants": ["…", "..."]},
-        {"tags": [":"], "variants": ["-", "–", "—", "--", "---", "——"]},
-    ]
-    paired_orth_variants = [
-        {"tags": ["``", "''"], "variants": [("'", "'"), ("‘", "’")]},
-        {"tags": ["``", "''"], "variants": [('"', '"'), ("“", "”")]},
-    ]


 class English(Language):

View File

@@ -20,9 +20,15 @@ lex_attr_getters = {"@language_data": "spacy.es.lex_attr_getters"}
 [nlp.lemmatizer]
 @lemmatizers = "spacy.Lemmatizer.v1"

-[nlp.lemmatizer.data_paths]
+[nlp.lemmatizer.data]
 @language_data = "spacy-lookups-data"
 lang = ${nlp:lang}
+tables = ["lemma_lookup"]
+
+[nlp.vocab_data]
+@language_data = "spacy-lookups-data"
+lang = ${nlp:lang}
+tables = ["lexeme_cluster", "lexeme_prob", "lexeme_settings"]
 """

View File

@@ -24,9 +24,10 @@ has_letters = true
 [nlp.lemmatizer]
 @lemmatizers = "spacy.Lemmatizer.v1"

-[nlp.lemmatizer.data_paths]
+[nlp.lemmatizer.data]
 @language_data = "spacy-lookups-data"
 lang = ${nlp:lang}
+tables = ["lemma_rules", "lemma_index", "lemma_exc"]
 """

View File

@@ -22,15 +22,16 @@ lex_attr_getters = {"@language_data": "spacy.fr.lex_attr_getters"}
 [nlp.lemmatizer]
 @lemmatizers = "spacy.FrenchLemmatizer.v1"

-[nlp.lemmatizer.data_paths]
+[nlp.lemmatizer.data]
 @language_data = "spacy-lookups-data"
 lang = ${nlp:lang}
+tables = ["lemma_rules", "lemma_index", "lemma_exc", "lemma_lookup"]
 """


 @registry.lemmatizers("spacy.FrenchLemmatizer.v1")
-def create_french_lemmatizer(data_paths: dict = {}) -> FrenchLemmatizer:
-    return FrenchLemmatizer(data_paths=data_paths, is_base_form=is_base_form)
+def create_french_lemmatizer(data: Dict[str, dict] = {}) -> FrenchLemmatizer:
+    return FrenchLemmatizer(data=data, is_base_form=is_base_form)


 @registry.language_data("spacy.fr.stop_words")

View File

@@ -15,9 +15,10 @@ stop_words = {"@language_data": "spacy.hr.stop_words"}
 [nlp.lemmatizer]
 @lemmatizers = "spacy.Lemmatizer.v1"

-[nlp.lemmatizer.data_paths]
+[nlp.lemmatizer.data]
 @language_data = "spacy-lookups-data"
 lang = ${nlp:lang}
+tables = ["lemma_lookup"]
 """

View File

@ -17,9 +17,10 @@ stop_words = {"@language_data": "spacy.hu.stop_words"}
[nlp.lemmatizer] [nlp.lemmatizer]
@lemmatizers = "spacy.Lemmatizer.v1" @lemmatizers = "spacy.Lemmatizer.v1"
[nlp.lemmatizer.data_paths] [nlp.lemmatizer.data]
@language_data = "spacy-lookups-data" @language_data = "spacy-lookups-data"
lang = ${nlp:lang} lang = ${nlp:lang}
tables = ["lemma_lookup"]
""" """

View File

@@ -20,9 +20,15 @@ lex_attr_getters = {"@language_data": "spacy.id.lex_attr_getters"}
 [nlp.lemmatizer]
 @lemmatizers = "spacy.Lemmatizer.v1"

-[nlp.lemmatizer.data_paths]
+[nlp.lemmatizer.data]
 @language_data = "spacy-lookups-data"
 lang = ${nlp:lang}
+tables = ["lemma_lookup"]
+
+[nlp.vocab_data]
+@language_data = "spacy-lookups-data"
+lang = ${nlp:lang}
+tables = ["lexeme_norm"]
 """

View File

@ -17,9 +17,10 @@ stop_words = {"@language_data": "spacy.it.stop_words"}
[nlp.lemmatizer] [nlp.lemmatizer]
@lemmatizers = "spacy.Lemmatizer.v1" @lemmatizers = "spacy.Lemmatizer.v1"
[nlp.lemmatizer.data_paths] [nlp.lemmatizer.data]
@language_data = "spacy-lookups-data" @language_data = "spacy-lookups-data"
lang = ${nlp:lang} lang = ${nlp:lang}
tables = ["lemma_lookup"]
""" """

View File

@ -19,9 +19,15 @@ lex_attr_getters = {"@language_data": "spacy.lb.lex_attr_getters"}
[nlp.lemmatizer] [nlp.lemmatizer]
@lemmatizers = "spacy.Lemmatizer.v1" @lemmatizers = "spacy.Lemmatizer.v1"
[nlp.lemmatizer.data_paths] [nlp.lemmatizer.data]
@language_data = "spacy-lookups-data" @language_data = "spacy-lookups-data"
lang = ${nlp:lang} lang = ${nlp:lang}
tables = ["lemma_lookup"]
[nlp.vocab_data]
@language_data = "spacy-lookups-data"
lang = ${nlp:lang}
tables = ["lexeme_norm"]
""" """

View File

@ -19,9 +19,10 @@ lex_attr_getters = {"@language_data": "spacy.lt.lex_attr_getters"}
[nlp.lemmatizer] [nlp.lemmatizer]
@lemmatizers = "spacy.Lemmatizer.v1" @lemmatizers = "spacy.Lemmatizer.v1"
[nlp.lemmatizer.data_paths] [nlp.lemmatizer.data]
@language_data = "spacy-lookups-data" @language_data = "spacy-lookups-data"
lang = ${nlp:lang} lang = ${nlp:lang}
tables = ["lemma_lookup"]
""" """

View File

@@ -19,9 +19,10 @@ stop_words = {"@language_data": "spacy.nb.stop_words"}
 [nlp.lemmatizer]
 @lemmatizers = "spacy.Lemmatizer.v1"

-[nlp.lemmatizer.data_paths]
+[nlp.lemmatizer.data]
 @language_data = "spacy-lookups-data"
 lang = ${nlp:lang}
+tables = ["lemma_lookup", "lemma_rules", "lemma_exc"]
 """

View File

@@ -21,9 +21,10 @@ lex_attr_getters = {"@language_data": "spacy.nl.lex_attr_getters"}
 [nlp.lemmatizer]
 @lemmatizers = "spacy.DutchLemmatizer.v1"

-[nlp.lemmatizer.data_paths]
+[nlp.lemmatizer.data]
 @language_data = "spacy-lookups-data"
 lang = ${nlp:lang}
+tables = ["lemma_rules", "lemma_index", "lemma_exc", "lemma_lookup"]
 """
@@ -38,8 +39,8 @@ def lex_attr_getters() -> Dict[int, Callable[[str], Any]]:

 @registry.lemmatizers("spacy.DutchLemmatizer.v1")
-def create_dutch_lemmatizer(data_paths: dict = {}) -> DutchLemmatizer:
-    return DutchLemmatizer(data_paths=data_paths)
+def create_dutch_lemmatizer(data: Dict[str, dict] = {}) -> DutchLemmatizer:
+    return DutchLemmatizer(data=data)


 class DutchDefaults(Language.Defaults):

View File

@@ -20,9 +20,10 @@ lex_attr_getters = {"@language_data": "spacy.pl.lex_attr_getters"}
 [nlp.lemmatizer]
 @lemmatizers = "spacy.PolishLemmatizer.v1"

-[nlp.lemmatizer.data_paths]
+[nlp.lemmatizer.data]
 @language_data = "spacy-lookups-data"
 lang = ${nlp:lang}
+tables = ["lemma_lookup_adj", "lemma_lookup_adp", "lemma_lookup_adv", "lemma_lookup_aux", "lemma_lookup_noun", "lemma_lookup_num", "lemma_lookup_part", "lemma_lookup_pron", "lemma_lookup_verb"]
 """
@@ -37,8 +38,8 @@ def lex_attr_getters() -> Dict[int, Callable[[str], Any]]:

 @registry.lemmatizers("spacy.PolishLemmatizer.v1")
-def create_polish_lemmatizer(data_paths: dict = {}) -> PolishLemmatizer:
-    return PolishLemmatizer(data_paths=data_paths)
+def create_polish_lemmatizer(data: Dict[str, dict] = {}) -> PolishLemmatizer:
+    return PolishLemmatizer(data=data)


 class PolishDefaults(Language.Defaults):

View File

@ -19,9 +19,10 @@ lex_attr_getters = {"@language_data": "spacy.pt.lex_attr_getters"}
[nlp.lemmatizer] [nlp.lemmatizer]
@lemmatizers = "spacy.Lemmatizer.v1" @lemmatizers = "spacy.Lemmatizer.v1"
[nlp.lemmatizer.data_paths] [nlp.lemmatizer.data]
@language_data = "spacy-lookups-data" @language_data = "spacy-lookups-data"
lang = ${nlp:lang} lang = ${nlp:lang}
tables = ["lemma_lookup"]
""" """

View File

@@ -22,9 +22,10 @@ stop_words = {"@language_data": "spacy.ro.stop_words"}
 [nlp.lemmatizer]
 @lemmatizers = "spacy.Lemmatizer.v1"

-[nlp.lemmatizer.data_paths]
+[nlp.lemmatizer.data]
 @language_data = "spacy-lookups-data"
 lang = ${nlp:lang}
+tables = ["lemma_lookup"]
 """

View File

@@ -18,6 +18,11 @@ lex_attr_getters = {"@language_data": "spacy.ru.lex_attr_getters"}
 [nlp.lemmatizer]
 @lemmatizers = "spacy.RussianLemmatizer.v1"
+
+[nlp.vocab_data]
+@language_data = "spacy-lookups-data"
+lang = ${nlp:lang}
+tables = ["lexeme_norm"]
 """

View File

@@ -18,9 +18,15 @@ lex_attr_getters = {"@language_data": "spacy.sr.lex_attr_getters"}
 [nlp.lemmatizer]
 @lemmatizers = "spacy.Lemmatizer.v1"

-[nlp.lemmatizer.data_paths]
+[nlp.lemmatizer.data]
 @language_data = "spacy-lookups-data"
 lang = ${nlp:lang}
+tables = ["lemma_lookup"]
+
+[nlp.vocab_data]
+@language_data = "spacy-lookups-data"
+lang = ${nlp:lang}
+tables = ["lexeme_norm"]
 """

View File

@@ -22,9 +22,10 @@ lex_attr_getters = {"@language_data": "spacy.sv.lex_attr_getters"}
 [nlp.lemmatizer]
 @lemmatizers = "spacy.Lemmatizer.v1"

-[nlp.lemmatizer.data_paths]
+[nlp.lemmatizer.data]
 @language_data = "spacy-lookups-data"
 lang = ${nlp:lang}
+tables = ["lemma_lookup", "lemma_rules"]
 """

View File

@@ -12,6 +12,11 @@ DEFAULT_CONFIG = """
 lang = "ta"
 stop_words = {"@language_data": "spacy.ta.stop_words"}
 lex_attr_getters = {"@language_data": "spacy.ta.lex_attr_getters"}
+
+[nlp.vocab_data]
+@language_data = "spacy-lookups-data"
+lang = ${nlp:lang}
+tables = ["lexeme_norm"]
 """

View File

@@ -16,6 +16,11 @@ lex_attr_getters = {"@language_data": "spacy.th.lex_attr_getters"}
 [nlp.tokenizer]
 @tokenizers = "spacy.ThaiTokenizer.v1"
+
+[nlp.vocab_data]
+@language_data = "spacy-lookups-data"
+lang = ${nlp:lang}
+tables = ["lexeme_norm"]
 """

View File

@@ -18,9 +18,10 @@ lex_attr_getters = {"@language_data": "spacy.tl.lex_attr_getters"}
 [nlp.lemmatizer]
 @lemmatizers = "spacy.Lemmatizer.v1"

-[nlp.lemmatizer.data_paths]
+[nlp.lemmatizer.data]
 @language_data = "spacy-lookups-data"
 lang = ${nlp:lang}
+tables = ["lemma_lookup"]
 """

View File

@@ -16,9 +16,10 @@ stop_words = {"@language_data": "spacy.tr.stop_words"}
 [nlp.lemmatizer]
 @lemmatizers = "spacy.Lemmatizer.v1"

-[nlp.lemmatizer.data_paths]
+[nlp.lemmatizer.data]
 @language_data = "spacy-lookups-data"
 lang = ${nlp:lang}
+tables = ["lemma_lookup"]
 """

View File

@@ -23,9 +23,10 @@ has_letters = true
 [nlp.lemmatizer]
 @lemmatizers = "spacy.Lemmatizer.v1"

-[nlp.lemmatizer.data_paths]
+[nlp.lemmatizer.data]
 @language_data = "spacy-lookups-data"
 lang = ${nlp:lang}
+tables = ["lemma_lookup"]
 """

View File

@@ -55,8 +55,6 @@ class BaseDefaults:
     tokenizer_exceptions: Dict[str, List[dict]] = {}
     morph_rules: Dict[str, Dict[str, dict]] = {}
     syntax_iterators: Dict[str, Callable[[Union[Doc, Span]], Iterator]] = {}
-    single_orth_variants: List[Dict[str, List[str]]] = []
-    paired_orth_variants: List[Dict[str, Union[List[str], List[Tuple[str, str]]]]] = []


 class Language:
@@ -1268,11 +1266,13 @@ class Language:
         lemmatizer = resolved["nlp"]["lemmatizer"]
         lex_attr_getters = resolved["nlp"]["lex_attr_getters"]
         stop_words = resolved["nlp"]["stop_words"]
+        vocab_data = resolved["nlp"]["vocab_data"]
         vocab = Vocab.from_config(
             filled,
             lemmatizer=lemmatizer,
             lex_attr_getters=lex_attr_getters,
             stop_words=stop_words,
+            vocab_data=vocab_data,
             # TODO: what should we do with these?
             tag_map=cls.Defaults.tag_map,
             morph_rules=cls.Defaults.morph_rules,

View File

@@ -1,14 +1,13 @@
 from typing import Optional, Callable, List, Dict

 from .lookups import Lookups
-from .errors import Errors
 from .parts_of_speech import NAMES as UPOS_NAMES
-from .util import registry, load_language_data, SimpleFrozenDict
+from .util import registry


 @registry.lemmatizers("spacy.Lemmatizer.v1")
-def create_lemmatizer(data_paths: dict = {}) -> "Lemmatizer":
-    return Lemmatizer(data_paths=data_paths)
+def create_lemmatizer(data: Dict[str, str] = {}) -> "Lemmatizer":
+    return Lemmatizer(data=data)


 class Lemmatizer:
@@ -19,14 +18,10 @@ class Lemmatizer:
     DOCS: https://spacy.io/api/lemmatizer
     """

-    @classmethod
-    def load(cls, *args, **kwargs):
-        raise NotImplementedError(Errors.E172)
-
     def __init__(
         self,
         lookups: Optional[Lookups] = None,
-        data_paths: dict = SimpleFrozenDict(),
+        data: Dict[str, dict] = {},
         is_base_form: Optional[Callable] = None,
     ) -> None:
         """Initialize a Lemmatizer.
@@ -36,9 +31,9 @@ class Lemmatizer:
         RETURNS (Lemmatizer): The newly constructed object.
         """
         self.lookups = lookups if lookups is not None else Lookups()
-        for name, filename in data_paths.items():
-            data = load_language_data(filename)
-            self.lookups.add_table(name, data)
+        for name, table in data.items():
+            if table is not None:
+                self.lookups.add_table(name, table)
         self.is_base_form = is_base_form

     def __call__(
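A hedged usage sketch of the reworked constructor above: tables now arrive as already-loaded dicts rather than file paths, and None entries are skipped (table contents hypothetical):

    from spacy.lemmatizer import Lemmatizer

    data = {"lemma_lookup": {"dogs": "dog"}, "lemma_rules": None}
    lemmatizer = Lemmatizer(data=data)
    assert lemmatizer.lookups.has_table("lemma_lookup")
    assert not lemmatizer.lookups.has_table("lemma_rules")  # None was skipped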

View File

@@ -251,11 +251,11 @@ cdef class Lexeme:
     property cluster:
         """RETURNS (int): Brown cluster ID."""
         def __get__(self):
-            cluster_table = self.vocab.load_extra_lookups("lexeme_cluster")
+            cluster_table = self.vocab.lookups.get_table("lexeme_cluster", {})
             return cluster_table.get(self.c.orth, 0)

         def __set__(self, int x):
-            cluster_table = self.vocab.load_extra_lookups("lexeme_cluster")
+            cluster_table = self.vocab.lookups.get_table("lexeme_cluster", {})
             cluster_table[self.c.orth] = x

     property lang:
@@ -270,13 +270,13 @@ cdef class Lexeme:
         """RETURNS (float): Smoothed log probability estimate of the lexeme's
             type."""
         def __get__(self):
-            prob_table = self.vocab.load_extra_lookups("lexeme_prob")
-            settings_table = self.vocab.load_extra_lookups("lexeme_settings")
+            prob_table = self.vocab.lookups.get_table("lexeme_prob", {})
+            settings_table = self.vocab.lookups.get_table("lexeme_settings", {})
             default_oov_prob = settings_table.get("oov_prob", -20.0)
             return prob_table.get(self.c.orth, default_oov_prob)

         def __set__(self, float x):
-            prob_table = self.vocab.load_extra_lookups("lexeme_prob")
+            prob_table = self.vocab.lookups.get_table("lexeme_prob", {})
             prob_table[self.c.orth] = x

     property lower_:

View File

@@ -5,7 +5,7 @@ from preshed.bloom import BloomFilter
 from collections import OrderedDict

 from .errors import Errors
-from .util import SimpleFrozenDict, ensure_path, registry
+from .util import SimpleFrozenDict, ensure_path, registry, load_language_data
 from .strings import get_string_id
@@ -13,18 +13,26 @@ UNSET = object()

 @registry.language_data("spacy-lookups-data")
-def get_lookups(lang: str) -> Dict[str, Any]:
+def get_lookups(lang: str, tables: List[str]) -> Optional[Dict[str, Any]]:
     """Load the data from the spacy-lookups-data package for a given language,
     if available. Returns an empty dict if there's no data or if the package
     is not installed.

     lang (str): The language code (corresponds to entry point exposed by
         the spacy-lookups-data package).
+    tables (List[str]): Name of tables to load, e.g. ["lemma_lookup", "lemma_exc"]
     RETURNS (Dict[str, Any]): The lookups, keyed by table name.
     """
-    if lang in registry.lookups:
-        return registry.lookups.get(lang)
-    return {}
+    # TODO: import spacy_lookups_data instead of going via entry points here?
+    if lang not in registry.lookups:
+        return {}
+    data = registry.lookups.get(lang)
+    result = {}
+    for table in tables:
+        if table not in data:
+            raise ValueError("TODO: unknown table")
+        result[table] = load_language_data(data[table])
+    return result


 class Lookups:
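For illustration, the updated registered function could be resolved from the registry and called directly like this (a sketch: it assumes spacy-lookups-data is installed and exposes the language entry point):

    from spacy.util import registry

    get_lookups = registry.language_data.get("spacy-lookups-data")
    tables = get_lookups(lang="en", tables=["lemma_lookup"])
    # tables == {"lemma_lookup": {...}} with the file contents already loaded;
    # an unknown table name raises the (still-TODO) ValueError above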

View File

@@ -243,6 +243,7 @@ class ConfigSchemaNlp(BaseModel):
     writing_system: ConfigSchemaNlpWritingSystem = Field(..., title="The language's writing system")
     stop_words: Sequence[StrictStr] = Field(..., title="Stop words to mark via Token/Lexeme.is_stop")
     lex_attr_getters: Dict[StrictStr, Callable] = Field(..., title="Custom getter functions for lexical attributes (e.g. like_num)")
+    vocab_data: Dict[StrictStr, Dict[StrictStr, Any]] = Field(..., title="Vocabulary data, e.g. lexeme normalization tables")
     # fmt: on

     class Config:

View File

@@ -5,6 +5,7 @@ from spacy.lookups import Lookups
 from spacy.lemmatizer import Lemmatizer


+@pytest.mark.skip(reason="We probably don't want to support this anymore in v3?")
 def test_lemmatizer_reflects_lookups_changes():
     """Test for an issue that'd cause lookups available in a model loaded from
     disk to not be reflected in the lemmatizer."""
@@ -56,4 +57,8 @@ def test_lemmatizer_without_is_base_form_implementation():
     lookups.add_table("lemma_exc", {"noun": {"formuesskatten": ["formuesskatt"]}})
     lemmatizer = Lemmatizer(lookups, is_base_form=None)
-    assert lemmatizer("Formuesskatten", "noun", {'Definite': 'def', 'Gender': 'masc', 'Number': 'sing'}) == ["formuesskatt"]
+    assert lemmatizer(
+        "Formuesskatten",
+        "noun",
+        {"Definite": "def", "Gender": "masc", "Number": "sing"},
+    ) == ["formuesskatt"]

View File

@@ -29,7 +29,6 @@ cdef class Vocab:
     cpdef public Morphology morphology
     cpdef public object vectors
     cpdef public object lookups
-    cpdef public object lookups_extra
     cpdef public object writing_system
     cdef readonly int length
     cdef public object data_dir

View File

@@ -31,7 +31,7 @@ cdef class Vocab:
     DOCS: https://spacy.io/api/vocab
     """
     def __init__(self, lex_attr_getters=None, tag_map=None, lemmatizer=None,
-                 strings=tuple(), lookups=None, lookups_extra=None,
+                 strings=tuple(), lookups=None, vocab_data={},
                  oov_prob=-20., vectors_name=None, writing_system={},
                  **deprecated_kwargs):
         """Create the vocabulary.
@@ -44,7 +44,6 @@ cdef class Vocab:
         strings (StringStore): StringStore that maps strings to integers, and
             vice versa.
         lookups (Lookups): Container for large lookup tables and dictionaries.
-        lookups_extra (Lookups): Container for optional lookup tables and dictionaries.
         oov_prob (float): Default OOV probability.
         vectors_name (unicode): Optional name to identify the vectors table.
         RETURNS (Vocab): The newly constructed object.
@@ -53,12 +52,12 @@ cdef class Vocab:
         tag_map = tag_map if tag_map is not None else {}
         if lookups in (None, True, False):
             lookups = Lookups()
-        if "lexeme_norm" not in lookups:
-            lookups.add_table("lexeme_norm")
+        for name, data in vocab_data.items():
+            if name not in lookups:
+                data = data if data is not None else {}
+                lookups.add_table(name, data)
         if lemmatizer in (None, True, False):
             lemmatizer = Lemmatizer(lookups)
-        if lookups_extra in (None, True, False):
-            lookups_extra = Lookups()
         self.cfg = {'oov_prob': oov_prob}
         self.mem = Pool()
         self._by_orth = PreshMap()
@@ -71,7 +70,6 @@ cdef class Vocab:
         self.morphology = Morphology(self.strings, tag_map, lemmatizer)
        self.vectors = Vectors(name=vectors_name)
         self.lookups = lookups
-        self.lookups_extra = lookups_extra
         self.writing_system = writing_system

     @property
@@ -425,6 +423,7 @@ cdef class Vocab:
         lemmatizer=None,
         lex_attr_getters=None,
         stop_words=None,
+        vocab_data=None,
         vectors_name=None,
         tag_map=None,
         morph_rules=None
@@ -444,12 +443,12 @@ cdef class Vocab:
         if not lemmatizer:
             lemma_cfg = {"lemmatizer": config["nlp"]["lemmatizer"]}
             lemmatizer = registry.make_from_config(lemma_cfg)["lemmatizer"]
-        lookups = lemmatizer.lookups
-        if "lexeme_norm" not in lookups:
-            lookups.add_table("lexeme_norm")
         if stop_words is None:
             stop_words_cfg = {"stop_words": config["nlp"]["stop_words"]}
             stop_words = registry.make_from_config(stop_words_cfg)["stop_words"]
+        if vocab_data is None:
+            vocab_data_cfg = {"vocab_data": config["nlp"]["vocab_data"]}
+            vocab_data = registry.make_from_config(vocab_data_cfg)["vocab_data"]
         if lex_attr_getters is None:
             lex_attrs_cfg = {"lex_attr_getters": config["nlp"]["lex_attr_getters"]}
             lex_attr_getters = registry.make_from_config(lex_attrs_cfg)["lex_attr_getters"]
@@ -462,14 +461,12 @@ cdef class Vocab:
         lex_attrs[NORM] = util.add_lookups(
             lex_attrs.get(NORM, LEX_ATTRS[NORM]),
             BASE_NORMS,
-            # TODO: we need to move the lexeme norms to their own entry
-            # points so we can specify them separately from the lemma lookups
-            lookups.get_table("lexeme_norm"),
+            vocab_data.get("lexeme_norm", {}),
         )
         vocab = cls(
             lex_attr_getters=lex_attrs,
+            vocab_data=vocab_data,
             lemmatizer=lemmatizer,
-            lookups=lookups,
             writing_system=writing_system,
             tag_map=tag_map,
         )
@@ -498,8 +495,6 @@ cdef class Vocab:
             self.vectors.to_disk(path)
         if "lookups" not in "exclude" and self.lookups is not None:
             self.lookups.to_disk(path)
-        if "lookups_extra" not in "exclude" and self.lookups_extra is not None:
-            self.lookups_extra.to_disk(path, filename="lookups_extra.bin")

     def from_disk(self, path, exclude=tuple()):
         """Loads state from a directory. Modifies the object in place and
@@ -522,8 +517,6 @@ cdef class Vocab:
             link_vectors_to_models(self)
         if "lookups" not in exclude:
             self.lookups.from_disk(path)
-        if "lookups_extra" not in exclude:
-            self.lookups_extra.from_disk(path, filename="lookups_extra.bin")
         if "lexeme_norm" in self.lookups:
             self.lex_attr_getters[NORM] = util.add_lookups(
                 self.lex_attr_getters.get(NORM, LEX_ATTRS[NORM]), self.lookups.get_table("lexeme_norm")
@@ -550,7 +543,6 @@ cdef class Vocab:
             "strings": lambda: self.strings.to_bytes(),
             "vectors": deserialize_vectors,
             "lookups": lambda: self.lookups.to_bytes(),
-            "lookups_extra": lambda: self.lookups_extra.to_bytes()
         }
         return util.to_bytes(getters, exclude)
@@ -574,7 +566,6 @@ cdef class Vocab:
             "lexemes": lambda b: self.lexemes_from_bytes(b),
             "vectors": lambda b: serialize_vectors(b),
             "lookups": lambda b: self.lookups.from_bytes(b),
-            "lookups_extra": lambda b: self.lookups_extra.from_bytes(b)
         }
         util.from_bytes(bytes_data, setters, exclude)
         if "lexeme_norm" in self.lookups:
@@ -592,19 +583,6 @@ cdef class Vocab:
         raise NotImplementedError

-    def load_extra_lookups(self, table_name):
-        if table_name not in self.lookups_extra:
-            if self.lang + "_extra" in util.registry.lookups:
-                tables = util.registry.lookups.get(self.lang + "_extra")
-                for name, filename in tables.items():
-                    if table_name == name:
-                        data = util.load_language_data(filename)
-                        self.lookups_extra.add_table(name, data)
-            if table_name not in self.lookups_extra:
-                self.lookups_extra.add_table(table_name)
-        return self.lookups_extra.get_table(table_name)
-

 def pickle_vocab(vocab):
     sstore = vocab.strings
     vectors = vocab.vectors
@@ -612,13 +590,12 @@ def pickle_vocab(vocab):
     data_dir = vocab.data_dir
     lex_attr_getters = srsly.pickle_dumps(vocab.lex_attr_getters)
     lookups = vocab.lookups
-    lookups_extra = vocab.lookups_extra
     return (unpickle_vocab,
-        (sstore, vectors, morph, data_dir, lex_attr_getters, lookups, lookups_extra))
+        (sstore, vectors, morph, data_dir, lex_attr_getters, lookups))


 def unpickle_vocab(sstore, vectors, morphology, data_dir,
-                   lex_attr_getters, lookups, lookups_extra):
+                   lex_attr_getters, lookups):
     cdef Vocab vocab = Vocab()
     vocab.vectors = vectors
     vocab.strings = sstore
@@ -626,7 +603,6 @@ def unpickle_vocab(sstore, vectors, morphology, data_dir,
     vocab.data_dir = data_dir
     vocab.lex_attr_getters = srsly.pickle_loads(lex_attr_getters)
     vocab.lookups = lookups
-    vocab.lookups_extra = lookups_extra
     return vocab
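Finally, a sketch of the reworked Vocab from this diff: tables passed via vocab_data are merged into vocab.lookups, which is also where Lexeme.cluster and Lexeme.prob now read from (data hypothetical):

    from spacy.vocab import Vocab

    vocab = Vocab(vocab_data={"lexeme_norm": {"can't": "cannot"}})
    assert "lexeme_norm" in vocab.lookups
    lex = vocab["dog"]
    assert lex.cluster == 0    # falls back to 0 without a "lexeme_cluster" table
    assert lex.prob == -20.0   # falls back to the default oov_prob setting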