mirror of
https://github.com/explosion/spaCy.git
synced 2024-12-24 17:06:29 +03:00
WIP: move more language data to config
This commit is contained in:
parent
b84fd70cc3
commit
945f795a3e
|
@ -112,10 +112,9 @@ def init_model(
|
||||||
# Create empty extra lexeme tables so the data from spacy-lookups-data
|
# Remove the extra lexeme tables so the data from spacy-lookups-data
|
||||||
# isn't loaded if these features are accessed
|
# isn't loaded if these features are accessed
|
||||||
if omit_extra_lookups:
|
if omit_extra_lookups:
|
||||||
nlp.vocab.lookups_extra = Lookups()
|
nlp.vocab.lookups.remove_table("lexeme_cluster")
|
||||||
nlp.vocab.lookups_extra.add_table("lexeme_cluster")
|
nlp.vocab.lookups.remove_table("lexeme_prob")
|
||||||
nlp.vocab.lookups_extra.add_table("lexeme_prob")
|
nlp.vocab.lookups.remove_table("lexeme_settings")
|
||||||
nlp.vocab.lookups_extra.add_table("lexeme_settings")
|
|
||||||
|
|
||||||
msg.good("Successfully created model")
|
msg.good("Successfully created model")
|
||||||
if vectors_loc is not None:
|
if vectors_loc is not None:
|
||||||
|
|
|
@ -123,10 +123,9 @@ def train(
|
||||||
# Create empty extra lexeme tables so the data from spacy-lookups-data
|
# Remove the extra lexeme tables so the data from spacy-lookups-data
|
||||||
# isn't loaded if these features are accessed
|
# isn't loaded if these features are accessed
|
||||||
if config["training"]["omit_extra_lookups"]:
|
if config["training"]["omit_extra_lookups"]:
|
||||||
nlp.vocab.lookups_extra = Lookups()
|
nlp.vocab.lookups.remove_table("lexeme_cluster")
|
||||||
nlp.vocab.lookups_extra.add_table("lexeme_cluster")
|
nlp.vocab.lookups.remove_table("lexeme_prob")
|
||||||
nlp.vocab.lookups_extra.add_table("lexeme_prob")
|
nlp.vocab.lookups.remove_table("lexeme_settings")
|
||||||
nlp.vocab.lookups_extra.add_table("lexeme_settings")
|
|
||||||
|
|
||||||
# Load a pretrained tok2vec model - cf. CLI command 'pretrain'
|
# Load a pretrained tok2vec model - cf. CLI command 'pretrain'
|
||||||
if weights_data is not None:
|
if weights_data is not None:
|
||||||
|
|
|
@ -2,6 +2,7 @@
|
||||||
lang = null
|
lang = null
|
||||||
stop_words = []
|
stop_words = []
|
||||||
lex_attr_getters = {}
|
lex_attr_getters = {}
|
||||||
|
vocab_data = {}
|
||||||
pipeline = []
|
pipeline = []
|
||||||
|
|
||||||
[nlp.tokenizer]
|
[nlp.tokenizer]
|
||||||
|
@ -9,6 +10,7 @@ pipeline = []
|
||||||
|
|
||||||
[nlp.lemmatizer]
|
[nlp.lemmatizer]
|
||||||
@lemmatizers = "spacy.Lemmatizer.v1"
|
@lemmatizers = "spacy.Lemmatizer.v1"
|
||||||
|
data = {}
|
||||||
|
|
||||||
[nlp.writing_system]
|
[nlp.writing_system]
|
||||||
direction = "ltr"
|
direction = "ltr"
|
||||||
|
|
|
@ -434,9 +434,6 @@ class Errors:
|
||||||
E170 = ("Cannot apply transition {name}: invalid for the current state.")
|
E170 = ("Cannot apply transition {name}: invalid for the current state.")
|
||||||
E171 = ("Matcher.add received invalid on_match callback argument: expected "
|
E171 = ("Matcher.add received invalid on_match callback argument: expected "
|
||||||
"callable or None, but got: {arg_type}")
|
"callable or None, but got: {arg_type}")
|
||||||
E172 = ("The Lemmatizer.load classmethod is deprecated. To create a "
|
|
||||||
"Lemmatizer, initialize the class directly. See the docs for "
|
|
||||||
"details: https://spacy.io/api/lemmatizer")
|
|
||||||
E175 = ("Can't remove rule for unknown match pattern ID: {key}")
|
E175 = ("Can't remove rule for unknown match pattern ID: {key}")
|
||||||
E176 = ("Alias '{alias}' is not defined in the Knowledge Base.")
|
E176 = ("Alias '{alias}' is not defined in the Knowledge Base.")
|
||||||
E177 = ("Ill-formed IOB input detected: {tag}")
|
E177 = ("Ill-formed IOB input detected: {tag}")
|
||||||
|
@ -601,7 +598,7 @@ class Errors:
|
||||||
"the same `Vocab`.")
|
"the same `Vocab`.")
|
||||||
E1000 = ("No pkuseg model available. Provide a pkuseg model when "
|
E1000 = ("No pkuseg model available. Provide a pkuseg model when "
|
||||||
"initializing the pipeline:\n"
|
"initializing the pipeline:\n"
|
||||||
'cfg = {"tokenizer": {"segmenter": "pkuseg", "pkuseg_model": name_or_path}}\m'
|
'cfg = {"tokenizer": {"segmenter": "pkuseg", "pkuseg_model": name_or_path}}\n'
|
||||||
'nlp = Chinese(config=cfg)')
|
'nlp = Chinese(config=cfg)')
|
||||||
|
|
||||||
|
|
||||||
|
|
|
@ -25,8 +25,9 @@ def make_orth_variants(nlp, raw_text, orig_token_dict, orth_variant_level=0.0):
|
||||||
lower = True
|
lower = True
|
||||||
if raw is not None:
|
if raw is not None:
|
||||||
raw = raw.lower()
|
raw = raw.lower()
|
||||||
ndsv = nlp.Defaults.single_orth_variants
|
orth_variants = nlp.vocab.lookups.get_table("orth_variants", {})
|
||||||
ndpv = nlp.Defaults.paired_orth_variants
|
ndsv = orth_variants.get("single", [])
|
||||||
|
ndpv = orth_variants.get("paired", [])
|
||||||
words = token_dict.get("words", [])
|
words = token_dict.get("words", [])
|
||||||
tags = token_dict.get("tags", [])
|
tags = token_dict.get("tags", [])
|
||||||
# keep unmodified if words or tags are not defined
|
# keep unmodified if words or tags are not defined
|
||||||
|
|
|
@ -17,9 +17,10 @@ stop_words = {"@language_data": "spacy.bn.stop_words"}
|
||||||
[nlp.lemmatizer]
|
[nlp.lemmatizer]
|
||||||
@lemmatizers = "spacy.Lemmatizer.v1"
|
@lemmatizers = "spacy.Lemmatizer.v1"
|
||||||
|
|
||||||
[nlp.lemmatizer.data_paths]
|
[nlp.lemmatizer.data]
|
||||||
@language_data = "spacy-lookups-data"
|
@language_data = "spacy-lookups-data"
|
||||||
lang = ${nlp:lang}
|
lang = ${nlp:lang}
|
||||||
|
tables = ["lemma_rules"]
|
||||||
"""
|
"""
|
||||||
|
|
||||||
|
|
||||||
|
|
|
@ -19,9 +19,10 @@ lex_attr_getters = {"@language_data": "spacy.ca.lex_attr_getters"}
|
||||||
[nlp.lemmatizer]
|
[nlp.lemmatizer]
|
||||||
@lemmatizers = "spacy.Lemmatizer.v1"
|
@lemmatizers = "spacy.Lemmatizer.v1"
|
||||||
|
|
||||||
[nlp.lemmatizer.data_paths]
|
[nlp.lemmatizer.data]
|
||||||
@language_data = "spacy-lookups-data"
|
@language_data = "spacy-lookups-data"
|
||||||
lang = ${nlp:lang}
|
lang = ${nlp:lang}
|
||||||
|
tables = ["lemma_lookup"]
|
||||||
"""
|
"""
|
||||||
|
|
||||||
|
|
||||||
|
|
|
@ -19,9 +19,15 @@ lex_attr_getters = {"@language_data": "spacy.da.lex_attr_getters"}
|
||||||
[nlp.lemmatizer]
|
[nlp.lemmatizer]
|
||||||
@lemmatizers = "spacy.Lemmatizer.v1"
|
@lemmatizers = "spacy.Lemmatizer.v1"
|
||||||
|
|
||||||
[nlp.lemmatizer.data_paths]
|
[nlp.lemmatizer.data]
|
||||||
@language_data = "spacy-lookups-data"
|
@language_data = "spacy-lookups-data"
|
||||||
lang = ${nlp:lang}
|
lang = ${nlp:lang}
|
||||||
|
tables = ["lemma_lookup"]
|
||||||
|
|
||||||
|
[nlp.vocab_data]
|
||||||
|
@language_data = "spacy-lookups-data"
|
||||||
|
lang = ${nlp:lang}
|
||||||
|
tables = ["lexeme_norm"]
|
||||||
"""
|
"""
|
||||||
|
|
||||||
|
|
||||||
|
|
|
@ -19,9 +19,15 @@ stop_words = {"@language_data": "spacy.de.stop_words"}
|
||||||
[nlp.lemmatizer]
|
[nlp.lemmatizer]
|
||||||
@lemmatizers = "spacy.Lemmatizer.v1"
|
@lemmatizers = "spacy.Lemmatizer.v1"
|
||||||
|
|
||||||
[nlp.lemmatizer.data_paths]
|
[nlp.lemmatizer.data]
|
||||||
@language_data = "spacy-lookups-data"
|
@language_data = "spacy-lookups-data"
|
||||||
lang = ${nlp:lang}
|
lang = ${nlp:lang}
|
||||||
|
tables = ["lemma_lookup"]
|
||||||
|
|
||||||
|
[nlp.vocab_data]
|
||||||
|
@language_data = "spacy-lookups-data"
|
||||||
|
lang = ${nlp:lang}
|
||||||
|
tables = ["lexeme_norm", "lexeme_cluster", "lexeme_prob", "lexeme_settings", "orth_variants"]
|
||||||
"""
|
"""
|
||||||
|
|
||||||
|
|
||||||
|
@ -36,20 +42,6 @@ class GermanDefaults(Language.Defaults):
|
||||||
suffixes = TOKENIZER_SUFFIXES
|
suffixes = TOKENIZER_SUFFIXES
|
||||||
infixes = TOKENIZER_INFIXES
|
infixes = TOKENIZER_INFIXES
|
||||||
syntax_iterators = SYNTAX_ITERATORS
|
syntax_iterators = SYNTAX_ITERATORS
|
||||||
single_orth_variants = [
|
|
||||||
{"tags": ["$("], "variants": ["…", "..."]},
|
|
||||||
{"tags": ["$("], "variants": ["-", "—", "–", "--", "---", "——"]},
|
|
||||||
]
|
|
||||||
paired_orth_variants = [
|
|
||||||
{
|
|
||||||
"tags": ["$("],
|
|
||||||
"variants": [("'", "'"), (",", "'"), ("‚", "‘"), ("›", "‹"), ("‹", "›")],
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"tags": ["$("],
|
|
||||||
"variants": [("``", "''"), ('"', '"'), ("„", "“"), ("»", "«"), ("«", "»")],
|
|
||||||
},
|
|
||||||
]
|
|
||||||
|
|
||||||
|
|
||||||
class German(Language):
|
class German(Language):
|
||||||
|
|
|
@ -21,15 +21,21 @@ lex_attr_getters = {"@language_data": "spacy.el.lex_attr_getters"}
|
||||||
[nlp.lemmatizer]
|
[nlp.lemmatizer]
|
||||||
@lemmatizers = "spacy.GreekLemmatizer.v1"
|
@lemmatizers = "spacy.GreekLemmatizer.v1"
|
||||||
|
|
||||||
[nlp.lemmatizer.data_paths]
|
[nlp.lemmatizer.data]
|
||||||
@language_data = "spacy-lookups-data"
|
@language_data = "spacy-lookups-data"
|
||||||
lang = ${nlp:lang}
|
lang = ${nlp:lang}
|
||||||
|
tables = ["lemma_index", "lemma_exc", "lemma_rules"]
|
||||||
|
|
||||||
|
[nlp.vocab_data]
|
||||||
|
@language_data = "spacy-lookups-data"
|
||||||
|
lang = ${nlp:lang}
|
||||||
|
tables = ["lexeme_norm", "lexeme_prob", "lexeme_settings"]
|
||||||
"""
|
"""
|
||||||
|
|
||||||
|
|
||||||
@registry.lemmatizers("spacy.GreekLemmatizer.v1")
|
@registry.lemmatizers("spacy.GreekLemmatizer.v1")
|
||||||
def create_greek_lemmatizer(data_paths: dict = {}) -> GreekLemmatizer:
|
def create_greek_lemmatizer(data: Dict[str, dict] = {}) -> GreekLemmatizer:
|
||||||
return GreekLemmatizer(data_paths=data_paths)
|
return GreekLemmatizer(data=data)
|
||||||
|
|
||||||
|
|
||||||
@registry.language_data("spacy.el.stop_words")
|
@registry.language_data("spacy.el.stop_words")
|
||||||
|
|
|
@ -22,9 +22,15 @@ lex_attr_getters = {"@language_data": "spacy.en.lex_attr_getters"}
|
||||||
[nlp.lemmatizer]
|
[nlp.lemmatizer]
|
||||||
@lemmatizers = "spacy.EnglishLemmatizer.v1"
|
@lemmatizers = "spacy.EnglishLemmatizer.v1"
|
||||||
|
|
||||||
[nlp.lemmatizer.data_paths]
|
[nlp.lemmatizer.data]
|
||||||
@language_data = "spacy-lookups-data"
|
@language_data = "spacy-lookups-data"
|
||||||
lang = ${nlp:lang}
|
lang = ${nlp:lang}
|
||||||
|
tables = ["lemma_lookup", "lemma_rules", "lemma_exc", "lemma_index"]
|
||||||
|
|
||||||
|
[nlp.vocab_data]
|
||||||
|
@language_data = "spacy-lookups-data"
|
||||||
|
lang = ${nlp:lang}
|
||||||
|
tables = ["lexeme_norm", "lexeme_cluster", "lexeme_prob", "lexeme_settings", "orth_variants"]
|
||||||
"""
|
"""
|
||||||
|
|
||||||
|
|
||||||
|
@ -39,22 +45,14 @@ def lex_attr_getters() -> Dict[int, Callable[[str], Any]]:
|
||||||
|
|
||||||
|
|
||||||
@registry.lemmatizers("spacy.EnglishLemmatizer.v1")
|
@registry.lemmatizers("spacy.EnglishLemmatizer.v1")
|
||||||
def create_lemmatizer(data_paths: dict = {}) -> "Lemmatizer":
|
def create_lemmatizer(data: Dict[str, dict] = {}) -> "Lemmatizer":
|
||||||
return Lemmatizer(data_paths=data_paths, is_base_form=is_base_form)
|
return Lemmatizer(data=data, is_base_form=is_base_form)
|
||||||
|
|
||||||
|
|
||||||
class EnglishDefaults(Language.Defaults):
|
class EnglishDefaults(Language.Defaults):
|
||||||
tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS)
|
tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS)
|
||||||
syntax_iterators = SYNTAX_ITERATORS
|
syntax_iterators = SYNTAX_ITERATORS
|
||||||
infixes = TOKENIZER_INFIXES
|
infixes = TOKENIZER_INFIXES
|
||||||
single_orth_variants = [
|
|
||||||
{"tags": ["NFP"], "variants": ["…", "..."]},
|
|
||||||
{"tags": [":"], "variants": ["-", "—", "–", "--", "---", "——"]},
|
|
||||||
]
|
|
||||||
paired_orth_variants = [
|
|
||||||
{"tags": ["``", "''"], "variants": [("'", "'"), ("‘", "’")]},
|
|
||||||
{"tags": ["``", "''"], "variants": [('"', '"'), ("“", "”")]},
|
|
||||||
]
|
|
||||||
|
|
||||||
|
|
||||||
class English(Language):
|
class English(Language):
|
||||||
|
|
|
@ -20,9 +20,15 @@ lex_attr_getters = {"@language_data": "spacy.es.lex_attr_getters"}
|
||||||
[nlp.lemmatizer]
|
[nlp.lemmatizer]
|
||||||
@lemmatizers = "spacy.Lemmatizer.v1"
|
@lemmatizers = "spacy.Lemmatizer.v1"
|
||||||
|
|
||||||
[nlp.lemmatizer.data_paths]
|
[nlp.lemmatizer.data]
|
||||||
@language_data = "spacy-lookups-data"
|
@language_data = "spacy-lookups-data"
|
||||||
lang = ${nlp:lang}
|
lang = ${nlp:lang}
|
||||||
|
tables = ["lemma_lookup"]
|
||||||
|
|
||||||
|
[nlp.vocab_data]
|
||||||
|
@language_data = "spacy-lookups-data"
|
||||||
|
lang = ${nlp:lang}
|
||||||
|
tables = ["lexeme_cluster", "lexeme_prob", "lexeme_settings"]
|
||||||
"""
|
"""
|
||||||
|
|
||||||
|
|
||||||
|
|
|
@ -24,9 +24,10 @@ has_letters = true
|
||||||
[nlp.lemmatizer]
|
[nlp.lemmatizer]
|
||||||
@lemmatizers = "spacy.Lemmatizer.v1"
|
@lemmatizers = "spacy.Lemmatizer.v1"
|
||||||
|
|
||||||
[nlp.lemmatizer.data_paths]
|
[nlp.lemmatizer.data]
|
||||||
@language_data = "spacy-lookups-data"
|
@language_data = "spacy-lookups-data"
|
||||||
lang = ${nlp:lang}
|
lang = ${nlp:lang}
|
||||||
|
tables = ["lemma_rules", "lemma_index", "lemma_exc"]
|
||||||
"""
|
"""
|
||||||
|
|
||||||
|
|
||||||
|
|
|
@ -22,15 +22,16 @@ lex_attr_getters = {"@language_data": "spacy.fr.lex_attr_getters"}
|
||||||
[nlp.lemmatizer]
|
[nlp.lemmatizer]
|
||||||
@lemmatizers = "spacy.FrenchLemmatizer.v1"
|
@lemmatizers = "spacy.FrenchLemmatizer.v1"
|
||||||
|
|
||||||
[nlp.lemmatizer.data_paths]
|
[nlp.lemmatizer.data]
|
||||||
@language_data = "spacy-lookups-data"
|
@language_data = "spacy-lookups-data"
|
||||||
lang = ${nlp:lang}
|
lang = ${nlp:lang}
|
||||||
|
tables = ["lemma_rules", "lemma_index", "lemma_exc", "lemma_lookup"]
|
||||||
"""
|
"""
|
||||||
|
|
||||||
|
|
||||||
@registry.lemmatizers("spacy.FrenchLemmatizer.v1")
|
@registry.lemmatizers("spacy.FrenchLemmatizer.v1")
|
||||||
def create_french_lemmatizer(data_paths: dict = {}) -> FrenchLemmatizer:
|
def create_french_lemmatizer(data: Dict[str, dict] = {}) -> FrenchLemmatizer:
|
||||||
return FrenchLemmatizer(data_paths=data_paths, is_base_form=is_base_form)
|
return FrenchLemmatizer(data=data, is_base_form=is_base_form)
|
||||||
|
|
||||||
|
|
||||||
@registry.language_data("spacy.fr.stop_words")
|
@registry.language_data("spacy.fr.stop_words")
|
||||||
|
|
|
@ -15,9 +15,10 @@ stop_words = {"@language_data": "spacy.hr.stop_words"}
|
||||||
[nlp.lemmatizer]
|
[nlp.lemmatizer]
|
||||||
@lemmatizers = "spacy.Lemmatizer.v1"
|
@lemmatizers = "spacy.Lemmatizer.v1"
|
||||||
|
|
||||||
[nlp.lemmatizer.data_paths]
|
[nlp.lemmatizer.data]
|
||||||
@language_data = "spacy-lookups-data"
|
@language_data = "spacy-lookups-data"
|
||||||
lang = ${nlp:lang}
|
lang = ${nlp:lang}
|
||||||
|
tables = ["lemma_lookup"]
|
||||||
"""
|
"""
|
||||||
|
|
||||||
|
|
||||||
|
|
|
@ -17,9 +17,10 @@ stop_words = {"@language_data": "spacy.hu.stop_words"}
|
||||||
[nlp.lemmatizer]
|
[nlp.lemmatizer]
|
||||||
@lemmatizers = "spacy.Lemmatizer.v1"
|
@lemmatizers = "spacy.Lemmatizer.v1"
|
||||||
|
|
||||||
[nlp.lemmatizer.data_paths]
|
[nlp.lemmatizer.data]
|
||||||
@language_data = "spacy-lookups-data"
|
@language_data = "spacy-lookups-data"
|
||||||
lang = ${nlp:lang}
|
lang = ${nlp:lang}
|
||||||
|
tables = ["lemma_lookup"]
|
||||||
"""
|
"""
|
||||||
|
|
||||||
|
|
||||||
|
|
|
@ -20,9 +20,15 @@ lex_attr_getters = {"@language_data": "spacy.id.lex_attr_getters"}
|
||||||
[nlp.lemmatizer]
|
[nlp.lemmatizer]
|
||||||
@lemmatizers = "spacy.Lemmatizer.v1"
|
@lemmatizers = "spacy.Lemmatizer.v1"
|
||||||
|
|
||||||
[nlp.lemmatizer.data_paths]
|
[nlp.lemmatizer.data]
|
||||||
@language_data = "spacy-lookups-data"
|
@language_data = "spacy-lookups-data"
|
||||||
lang = ${nlp:lang}
|
lang = ${nlp:lang}
|
||||||
|
tables = ["lemma_lookup"]
|
||||||
|
|
||||||
|
[nlp.vocab_data]
|
||||||
|
@language_data = "spacy-lookups-data"
|
||||||
|
lang = ${nlp:lang}
|
||||||
|
tables = ["lexeme_norm"]
|
||||||
"""
|
"""
|
||||||
|
|
||||||
|
|
||||||
|
|
|
@ -17,9 +17,10 @@ stop_words = {"@language_data": "spacy.it.stop_words"}
|
||||||
[nlp.lemmatizer]
|
[nlp.lemmatizer]
|
||||||
@lemmatizers = "spacy.Lemmatizer.v1"
|
@lemmatizers = "spacy.Lemmatizer.v1"
|
||||||
|
|
||||||
[nlp.lemmatizer.data_paths]
|
[nlp.lemmatizer.data]
|
||||||
@language_data = "spacy-lookups-data"
|
@language_data = "spacy-lookups-data"
|
||||||
lang = ${nlp:lang}
|
lang = ${nlp:lang}
|
||||||
|
tables = ["lemma_lookup"]
|
||||||
"""
|
"""
|
||||||
|
|
||||||
|
|
||||||
|
|
|
@ -19,9 +19,15 @@ lex_attr_getters = {"@language_data": "spacy.lb.lex_attr_getters"}
|
||||||
[nlp.lemmatizer]
|
[nlp.lemmatizer]
|
||||||
@lemmatizers = "spacy.Lemmatizer.v1"
|
@lemmatizers = "spacy.Lemmatizer.v1"
|
||||||
|
|
||||||
[nlp.lemmatizer.data_paths]
|
[nlp.lemmatizer.data]
|
||||||
@language_data = "spacy-lookups-data"
|
@language_data = "spacy-lookups-data"
|
||||||
lang = ${nlp:lang}
|
lang = ${nlp:lang}
|
||||||
|
tables = ["lemma_lookup"]
|
||||||
|
|
||||||
|
[nlp.vocab_data]
|
||||||
|
@language_data = "spacy-lookups-data"
|
||||||
|
lang = ${nlp:lang}
|
||||||
|
tables = ["lexeme_norm"]
|
||||||
"""
|
"""
|
||||||
|
|
||||||
|
|
||||||
|
|
|
@ -19,9 +19,10 @@ lex_attr_getters = {"@language_data": "spacy.lt.lex_attr_getters"}
|
||||||
[nlp.lemmatizer]
|
[nlp.lemmatizer]
|
||||||
@lemmatizers = "spacy.Lemmatizer.v1"
|
@lemmatizers = "spacy.Lemmatizer.v1"
|
||||||
|
|
||||||
[nlp.lemmatizer.data_paths]
|
[nlp.lemmatizer.data]
|
||||||
@language_data = "spacy-lookups-data"
|
@language_data = "spacy-lookups-data"
|
||||||
lang = ${nlp:lang}
|
lang = ${nlp:lang}
|
||||||
|
tables = ["lemma_lookup"]
|
||||||
"""
|
"""
|
||||||
|
|
||||||
|
|
||||||
|
|
|
@ -19,9 +19,10 @@ stop_words = {"@language_data": "spacy.nb.stop_words"}
|
||||||
[nlp.lemmatizer]
|
[nlp.lemmatizer]
|
||||||
@lemmatizers = "spacy.Lemmatizer.v1"
|
@lemmatizers = "spacy.Lemmatizer.v1"
|
||||||
|
|
||||||
[nlp.lemmatizer.data_paths]
|
[nlp.lemmatizer.data]
|
||||||
@language_data = "spacy-lookups-data"
|
@language_data = "spacy-lookups-data"
|
||||||
lang = ${nlp:lang}
|
lang = ${nlp:lang}
|
||||||
|
tables = ["lemma_lookup", "lemma_rules", "lemma_exc"]
|
||||||
"""
|
"""
|
||||||
|
|
||||||
|
|
||||||
|
|
|
@ -21,9 +21,10 @@ lex_attr_getters = {"@language_data": "spacy.nl.lex_attr_getters"}
|
||||||
[nlp.lemmatizer]
|
[nlp.lemmatizer]
|
||||||
@lemmatizers = "spacy.DutchLemmatizer.v1"
|
@lemmatizers = "spacy.DutchLemmatizer.v1"
|
||||||
|
|
||||||
[nlp.lemmatizer.data_paths]
|
[nlp.lemmatizer.data]
|
||||||
@language_data = "spacy-lookups-data"
|
@language_data = "spacy-lookups-data"
|
||||||
lang = ${nlp:lang}
|
lang = ${nlp:lang}
|
||||||
|
tables = ["lemma_rules", "lemma_index", "lemma_exc", "lemma_lookup"]
|
||||||
"""
|
"""
|
||||||
|
|
||||||
|
|
||||||
|
@ -38,8 +39,8 @@ def lex_attr_getters() -> Dict[int, Callable[[str], Any]]:
|
||||||
|
|
||||||
|
|
||||||
@registry.lemmatizers("spacy.DutchLemmatizer.v1")
|
@registry.lemmatizers("spacy.DutchLemmatizer.v1")
|
||||||
def create_dutch_lemmatizer(data_paths: dict = {}) -> DutchLemmatizer:
|
def create_dutch_lemmatizer(data: Dict[str, dict] = {}) -> DutchLemmatizer:
|
||||||
return DutchLemmatizer(data_paths=data_paths)
|
return DutchLemmatizer(data=data)
|
||||||
|
|
||||||
|
|
||||||
class DutchDefaults(Language.Defaults):
|
class DutchDefaults(Language.Defaults):
|
||||||
|
|
|
@ -20,9 +20,10 @@ lex_attr_getters = {"@language_data": "spacy.pl.lex_attr_getters"}
|
||||||
[nlp.lemmatizer]
|
[nlp.lemmatizer]
|
||||||
@lemmatizers = "spacy.PolishLemmatizer.v1"
|
@lemmatizers = "spacy.PolishLemmatizer.v1"
|
||||||
|
|
||||||
[nlp.lemmatizer.data_paths]
|
[nlp.lemmatizer.data]
|
||||||
@language_data = "spacy-lookups-data"
|
@language_data = "spacy-lookups-data"
|
||||||
lang = ${nlp:lang}
|
lang = ${nlp:lang}
|
||||||
|
tables = ["lemma_lookup_adj", "lemma_lookup_adp", "lemma_lookup_adv", "lemma_lookup_aux", "lemma_lookup_noun", "lemma_lookup_num", "lemma_lookup_part", "lemma_lookup_pron", "lemma_lookup_verb"]
|
||||||
"""
|
"""
|
||||||
|
|
||||||
|
|
||||||
|
@ -37,8 +38,8 @@ def lex_attr_getters() -> Dict[int, Callable[[str], Any]]:
|
||||||
|
|
||||||
|
|
||||||
@registry.lemmatizers("spacy.PolishLemmatizer.v1")
|
@registry.lemmatizers("spacy.PolishLemmatizer.v1")
|
||||||
def create_polish_lemmatizer(data_paths: dict = {}) -> PolishLemmatizer:
|
def create_polish_lemmatizer(data: Dict[str, dict] = {}) -> PolishLemmatizer:
|
||||||
return PolishLemmatizer(data_paths=data_paths)
|
return PolishLemmatizer(data=data)
|
||||||
|
|
||||||
|
|
||||||
class PolishDefaults(Language.Defaults):
|
class PolishDefaults(Language.Defaults):
|
||||||
|
|
|
@ -19,9 +19,10 @@ lex_attr_getters = {"@language_data": "spacy.pt.lex_attr_getters"}
|
||||||
[nlp.lemmatizer]
|
[nlp.lemmatizer]
|
||||||
@lemmatizers = "spacy.Lemmatizer.v1"
|
@lemmatizers = "spacy.Lemmatizer.v1"
|
||||||
|
|
||||||
[nlp.lemmatizer.data_paths]
|
[nlp.lemmatizer.data]
|
||||||
@language_data = "spacy-lookups-data"
|
@language_data = "spacy-lookups-data"
|
||||||
lang = ${nlp:lang}
|
lang = ${nlp:lang}
|
||||||
|
tables = ["lemma_lookup"]
|
||||||
"""
|
"""
|
||||||
|
|
||||||
|
|
||||||
|
|
|
@ -22,9 +22,10 @@ stop_words = {"@language_data": "spacy.ro.stop_words"}
|
||||||
[nlp.lemmatizer]
|
[nlp.lemmatizer]
|
||||||
@lemmatizers = "spacy.Lemmatizer.v1"
|
@lemmatizers = "spacy.Lemmatizer.v1"
|
||||||
|
|
||||||
[nlp.lemmatizer.data_paths]
|
[nlp.lemmatizer.data]
|
||||||
@language_data = "spacy-lookups-data"
|
@language_data = "spacy-lookups-data"
|
||||||
lang = ${nlp:lang}
|
lang = ${nlp:lang}
|
||||||
|
tables = ["lemma_lookup"]
|
||||||
"""
|
"""
|
||||||
|
|
||||||
|
|
||||||
|
|
|
@ -18,6 +18,11 @@ lex_attr_getters = {"@language_data": "spacy.ru.lex_attr_getters"}
|
||||||
|
|
||||||
[nlp.lemmatizer]
|
[nlp.lemmatizer]
|
||||||
@lemmatizers = "spacy.RussianLemmatizer.v1"
|
@lemmatizers = "spacy.RussianLemmatizer.v1"
|
||||||
|
|
||||||
|
[nlp.vocab_data]
|
||||||
|
@language_data = "spacy-lookups-data"
|
||||||
|
lang = ${nlp:lang}
|
||||||
|
tables = ["lexeme_norm"]
|
||||||
"""
|
"""
|
||||||
|
|
||||||
|
|
||||||
|
|
|
@ -18,9 +18,15 @@ lex_attr_getters = {"@language_data": "spacy.sr.lex_attr_getters"}
|
||||||
[nlp.lemmatizer]
|
[nlp.lemmatizer]
|
||||||
@lemmatizers = "spacy.Lemmatizer.v1"
|
@lemmatizers = "spacy.Lemmatizer.v1"
|
||||||
|
|
||||||
[nlp.lemmatizer.data_paths]
|
[nlp.lemmatizer.data]
|
||||||
@language_data = "spacy-lookups-data"
|
@language_data = "spacy-lookups-data"
|
||||||
lang = ${nlp:lang}
|
lang = ${nlp:lang}
|
||||||
|
tables = ["lemma_lookup"]
|
||||||
|
|
||||||
|
[nlp.vocab_data]
|
||||||
|
@language_data = "spacy-lookups-data"
|
||||||
|
lang = ${nlp:lang}
|
||||||
|
tables = ["lexeme_norm"]
|
||||||
"""
|
"""
|
||||||
|
|
||||||
|
|
||||||
|
|
|
@ -22,9 +22,10 @@ lex_attr_getters = {"@language_data": "spacy.sv.lex_attr_getters"}
|
||||||
[nlp.lemmatizer]
|
[nlp.lemmatizer]
|
||||||
@lemmatizers = "spacy.Lemmatizer.v1"
|
@lemmatizers = "spacy.Lemmatizer.v1"
|
||||||
|
|
||||||
[nlp.lemmatizer.data_paths]
|
[nlp.lemmatizer.data]
|
||||||
@language_data = "spacy-lookups-data"
|
@language_data = "spacy-lookups-data"
|
||||||
lang = ${nlp:lang}
|
lang = ${nlp:lang}
|
||||||
|
tables = ["lemma_lookup", "lemma_rules"]
|
||||||
"""
|
"""
|
||||||
|
|
||||||
|
|
||||||
|
|
|
@ -12,6 +12,11 @@ DEFAULT_CONFIG = """
|
||||||
lang = "ta"
|
lang = "ta"
|
||||||
stop_words = {"@language_data": "spacy.ta.stop_words"}
|
stop_words = {"@language_data": "spacy.ta.stop_words"}
|
||||||
lex_attr_getters = {"@language_data": "spacy.ta.lex_attr_getters"}
|
lex_attr_getters = {"@language_data": "spacy.ta.lex_attr_getters"}
|
||||||
|
|
||||||
|
[nlp.vocab_data]
|
||||||
|
@language_data = "spacy-lookups-data"
|
||||||
|
lang = ${nlp:lang}
|
||||||
|
tables = ["lexeme_norm"]
|
||||||
"""
|
"""
|
||||||
|
|
||||||
|
|
||||||
|
|
|
@ -16,6 +16,11 @@ lex_attr_getters = {"@language_data": "spacy.th.lex_attr_getters"}
|
||||||
|
|
||||||
[nlp.tokenizer]
|
[nlp.tokenizer]
|
||||||
@tokenizers = "spacy.ThaiTokenizer.v1"
|
@tokenizers = "spacy.ThaiTokenizer.v1"
|
||||||
|
|
||||||
|
[nlp.vocab_data]
|
||||||
|
@language_data = "spacy-lookups-data"
|
||||||
|
lang = ${nlp:lang}
|
||||||
|
tables = ["lexeme_norm"]
|
||||||
"""
|
"""
|
||||||
|
|
||||||
|
|
||||||
|
|
|
@ -18,9 +18,10 @@ lex_attr_getters = {"@language_data": "spacy.tl.lex_attr_getters"}
|
||||||
[nlp.lemmatizer]
|
[nlp.lemmatizer]
|
||||||
@lemmatizers = "spacy.Lemmatizer.v1"
|
@lemmatizers = "spacy.Lemmatizer.v1"
|
||||||
|
|
||||||
[nlp.lemmatizer.data_paths]
|
[nlp.lemmatizer.data]
|
||||||
@language_data = "spacy-lookups-data"
|
@language_data = "spacy-lookups-data"
|
||||||
lang = ${nlp:lang}
|
lang = ${nlp:lang}
|
||||||
|
tables = ["lemma_lookup"]
|
||||||
"""
|
"""
|
||||||
|
|
||||||
|
|
||||||
|
|
|
@ -16,9 +16,10 @@ stop_words = {"@language_data": "spacy.tr.stop_words"}
|
||||||
[nlp.lemmatizer]
|
[nlp.lemmatizer]
|
||||||
@lemmatizers = "spacy.Lemmatizer.v1"
|
@lemmatizers = "spacy.Lemmatizer.v1"
|
||||||
|
|
||||||
[nlp.lemmatizer.data_paths]
|
[nlp.lemmatizer.data]
|
||||||
@language_data = "spacy-lookups-data"
|
@language_data = "spacy-lookups-data"
|
||||||
lang = ${nlp:lang}
|
lang = ${nlp:lang}
|
||||||
|
tables = ["lemma_lookup"]
|
||||||
"""
|
"""
|
||||||
|
|
||||||
|
|
||||||
|
|
|
@ -23,9 +23,10 @@ has_letters = true
|
||||||
[nlp.lemmatizer]
|
[nlp.lemmatizer]
|
||||||
@lemmatizers = "spacy.Lemmatizer.v1"
|
@lemmatizers = "spacy.Lemmatizer.v1"
|
||||||
|
|
||||||
[nlp.lemmatizer.data_paths]
|
[nlp.lemmatizer.data]
|
||||||
@language_data = "spacy-lookups-data"
|
@language_data = "spacy-lookups-data"
|
||||||
lang = ${nlp:lang}
|
lang = ${nlp:lang}
|
||||||
|
tables = ["lemma_lookup"]
|
||||||
"""
|
"""
|
||||||
|
|
||||||
|
|
||||||
|
|
|
@ -55,8 +55,6 @@ class BaseDefaults:
|
||||||
tokenizer_exceptions: Dict[str, List[dict]] = {}
|
tokenizer_exceptions: Dict[str, List[dict]] = {}
|
||||||
morph_rules: Dict[str, Dict[str, dict]] = {}
|
morph_rules: Dict[str, Dict[str, dict]] = {}
|
||||||
syntax_iterators: Dict[str, Callable[[Union[Doc, Span]], Iterator]] = {}
|
syntax_iterators: Dict[str, Callable[[Union[Doc, Span]], Iterator]] = {}
|
||||||
single_orth_variants: List[Dict[str, List[str]]] = []
|
|
||||||
paired_orth_variants: List[Dict[str, Union[List[str], List[Tuple[str, str]]]]] = []
|
|
||||||
|
|
||||||
|
|
||||||
class Language:
|
class Language:
|
||||||
|
@ -1268,11 +1266,13 @@ class Language:
|
||||||
lemmatizer = resolved["nlp"]["lemmatizer"]
|
lemmatizer = resolved["nlp"]["lemmatizer"]
|
||||||
lex_attr_getters = resolved["nlp"]["lex_attr_getters"]
|
lex_attr_getters = resolved["nlp"]["lex_attr_getters"]
|
||||||
stop_words = resolved["nlp"]["stop_words"]
|
stop_words = resolved["nlp"]["stop_words"]
|
||||||
|
vocab_data = resolved["nlp"]["vocab_data"]
|
||||||
vocab = Vocab.from_config(
|
vocab = Vocab.from_config(
|
||||||
filled,
|
filled,
|
||||||
lemmatizer=lemmatizer,
|
lemmatizer=lemmatizer,
|
||||||
lex_attr_getters=lex_attr_getters,
|
lex_attr_getters=lex_attr_getters,
|
||||||
stop_words=stop_words,
|
stop_words=stop_words,
|
||||||
|
vocab_data=vocab_data,
|
||||||
# TODO: what should we do with these?
|
# TODO: what should we do with these?
|
||||||
tag_map=cls.Defaults.tag_map,
|
tag_map=cls.Defaults.tag_map,
|
||||||
morph_rules=cls.Defaults.morph_rules,
|
morph_rules=cls.Defaults.morph_rules,
|
||||||
|
|
|
@ -1,14 +1,13 @@
|
||||||
from typing import Optional, Callable, List, Dict
|
from typing import Optional, Callable, List, Dict
|
||||||
|
|
||||||
from .lookups import Lookups
|
from .lookups import Lookups
|
||||||
from .errors import Errors
|
|
||||||
from .parts_of_speech import NAMES as UPOS_NAMES
|
from .parts_of_speech import NAMES as UPOS_NAMES
|
||||||
from .util import registry, load_language_data, SimpleFrozenDict
|
from .util import registry
|
||||||
|
|
||||||
|
|
||||||
@registry.lemmatizers("spacy.Lemmatizer.v1")
|
@registry.lemmatizers("spacy.Lemmatizer.v1")
|
||||||
def create_lemmatizer(data_paths: dict = {}) -> "Lemmatizer":
|
def create_lemmatizer(data: Dict[str, str] = {}) -> "Lemmatizer":
|
||||||
return Lemmatizer(data_paths=data_paths)
|
return Lemmatizer(data=data)
|
||||||
|
|
||||||
|
|
||||||
class Lemmatizer:
|
class Lemmatizer:
|
||||||
|
@ -19,14 +18,10 @@ class Lemmatizer:
|
||||||
DOCS: https://spacy.io/api/lemmatizer
|
DOCS: https://spacy.io/api/lemmatizer
|
||||||
"""
|
"""
|
||||||
|
|
||||||
@classmethod
|
|
||||||
def load(cls, *args, **kwargs):
|
|
||||||
raise NotImplementedError(Errors.E172)
|
|
||||||
|
|
||||||
def __init__(
|
def __init__(
|
||||||
self,
|
self,
|
||||||
lookups: Optional[Lookups] = None,
|
lookups: Optional[Lookups] = None,
|
||||||
data_paths: dict = SimpleFrozenDict(),
|
data: Dict[str, dict] = {},
|
||||||
is_base_form: Optional[Callable] = None,
|
is_base_form: Optional[Callable] = None,
|
||||||
) -> None:
|
) -> None:
|
||||||
"""Initialize a Lemmatizer.
|
"""Initialize a Lemmatizer.
|
||||||
|
@ -36,9 +31,9 @@ class Lemmatizer:
|
||||||
RETURNS (Lemmatizer): The newly constructed object.
|
RETURNS (Lemmatizer): The newly constructed object.
|
||||||
"""
|
"""
|
||||||
self.lookups = lookups if lookups is not None else Lookups()
|
self.lookups = lookups if lookups is not None else Lookups()
|
||||||
for name, filename in data_paths.items():
|
for name, table in data.items():
|
||||||
data = load_language_data(filename)
|
if table is not None:
|
||||||
self.lookups.add_table(name, data)
|
self.lookups.add_table(name, table)
|
||||||
self.is_base_form = is_base_form
|
self.is_base_form = is_base_form
|
||||||
|
|
||||||
def __call__(
|
def __call__(
|
||||||
|
|
|
@ -251,11 +251,11 @@ cdef class Lexeme:
|
||||||
property cluster:
|
property cluster:
|
||||||
"""RETURNS (int): Brown cluster ID."""
|
"""RETURNS (int): Brown cluster ID."""
|
||||||
def __get__(self):
|
def __get__(self):
|
||||||
cluster_table = self.vocab.load_extra_lookups("lexeme_cluster")
|
cluster_table = self.vocab.lookups.get_table("lexeme_cluster", {})
|
||||||
return cluster_table.get(self.c.orth, 0)
|
return cluster_table.get(self.c.orth, 0)
|
||||||
|
|
||||||
def __set__(self, int x):
|
def __set__(self, int x):
|
||||||
cluster_table = self.vocab.load_extra_lookups("lexeme_cluster")
|
cluster_table = self.vocab.lookups.get_table("lexeme_cluster", {})
|
||||||
cluster_table[self.c.orth] = x
|
cluster_table[self.c.orth] = x
|
||||||
|
|
||||||
property lang:
|
property lang:
|
||||||
|
@ -270,13 +270,13 @@ cdef class Lexeme:
|
||||||
"""RETURNS (float): Smoothed log probability estimate of the lexeme's
|
"""RETURNS (float): Smoothed log probability estimate of the lexeme's
|
||||||
type."""
|
type."""
|
||||||
def __get__(self):
|
def __get__(self):
|
||||||
prob_table = self.vocab.load_extra_lookups("lexeme_prob")
|
prob_table = self.vocab.lookups.get_table("lexeme_prob", {})
|
||||||
settings_table = self.vocab.load_extra_lookups("lexeme_settings")
|
settings_table = self.vocab.lookups.get_table("lexeme_settings", {})
|
||||||
default_oov_prob = settings_table.get("oov_prob", -20.0)
|
default_oov_prob = settings_table.get("oov_prob", -20.0)
|
||||||
return prob_table.get(self.c.orth, default_oov_prob)
|
return prob_table.get(self.c.orth, default_oov_prob)
|
||||||
|
|
||||||
def __set__(self, float x):
|
def __set__(self, float x):
|
||||||
prob_table = self.vocab.load_extra_lookups("lexeme_prob")
|
prob_table = self.vocab.lookups.get_table("lexeme_prob", {})
|
||||||
prob_table[self.c.orth] = x
|
prob_table[self.c.orth] = x
|
||||||
|
|
||||||
property lower_:
|
property lower_:
|
||||||
|
|
|
@ -5,7 +5,7 @@ from preshed.bloom import BloomFilter
|
||||||
from collections import OrderedDict
|
from collections import OrderedDict
|
||||||
|
|
||||||
from .errors import Errors
|
from .errors import Errors
|
||||||
from .util import SimpleFrozenDict, ensure_path, registry
|
from .util import SimpleFrozenDict, ensure_path, registry, load_language_data
|
||||||
from .strings import get_string_id
|
from .strings import get_string_id
|
||||||
|
|
||||||
|
|
||||||
|
@ -13,18 +13,26 @@ UNSET = object()
|
||||||
|
|
||||||
|
|
||||||
@registry.language_data("spacy-lookups-data")
|
@registry.language_data("spacy-lookups-data")
|
||||||
def get_lookups(lang: str) -> Dict[str, Any]:
|
def get_lookups(lang: str, tables: List[str]) -> Optional[Dict[str, Any]]:
|
||||||
"""Load the data from the spacy-lookups-data package for a given language,
|
"""Load the data from the spacy-lookups-data package for a given language,
|
||||||
if available. Returns an empty dict if there's no data or if the package
|
if available. Returns an empty dict if there's no data or if the package
|
||||||
is not installed.
|
is not installed.
|
||||||
|
|
||||||
lang (str): The language code (corresponds to entry point exposed by
|
lang (str): The language code (corresponds to entry point exposed by
|
||||||
the spacy-lookups-data package).
|
the spacy-lookups-data package).
|
||||||
|
tables (List[str]): Name of tables to load, e.g. ["lemma_lookup", "lemma_exc"]
|
||||||
RETURNS (Dict[str, Any]): The lookups, keyed by table name.
|
RETURNS (Dict[str, Any]): The lookups, keyed by table name.
|
||||||
"""
|
"""
|
||||||
if lang in registry.lookups:
|
# TODO: import spacy_lookups_data instead of going via entry points here?
|
||||||
return registry.lookups.get(lang)
|
if lang not in registry.lookups:
|
||||||
return {}
|
return {}
|
||||||
|
data = registry.lookups.get(lang)
|
||||||
|
result = {}
|
||||||
|
for table in tables:
|
||||||
|
if table not in data:
|
||||||
|
raise ValueError("TODO: unknown table")
|
||||||
|
result[table] = load_language_data(data[table])
|
||||||
|
return result
|
||||||
|
|
||||||
|
|
||||||
class Lookups:
|
class Lookups:
|
||||||
|
|
|
@ -243,6 +243,7 @@ class ConfigSchemaNlp(BaseModel):
|
||||||
writing_system: ConfigSchemaNlpWritingSystem = Field(..., title="The language's writing system")
|
writing_system: ConfigSchemaNlpWritingSystem = Field(..., title="The language's writing system")
|
||||||
stop_words: Sequence[StrictStr] = Field(..., title="Stop words to mark via Token/Lexeme.is_stop")
|
stop_words: Sequence[StrictStr] = Field(..., title="Stop words to mark via Token/Lexeme.is_stop")
|
||||||
lex_attr_getters: Dict[StrictStr, Callable] = Field(..., title="Custom getter functions for lexical attributes (e.g. like_num)")
|
lex_attr_getters: Dict[StrictStr, Callable] = Field(..., title="Custom getter functions for lexical attributes (e.g. like_num)")
|
||||||
|
vocab_data: Dict[StrictStr, Dict[StrictStr, Any]] = Field(..., title="Vocabulary data, e.g. lexeme normalization tables")
|
||||||
# fmt: on
|
# fmt: on
|
||||||
|
|
||||||
class Config:
|
class Config:
|
||||||
|
|
|
@ -5,6 +5,7 @@ from spacy.lookups import Lookups
|
||||||
from spacy.lemmatizer import Lemmatizer
|
from spacy.lemmatizer import Lemmatizer
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.mark.skip(reason="We probably don't want to support this anymore in v3?")
|
||||||
def test_lemmatizer_reflects_lookups_changes():
|
def test_lemmatizer_reflects_lookups_changes():
|
||||||
"""Test for an issue that'd cause lookups available in a model loaded from
|
"""Test for an issue that'd cause lookups available in a model loaded from
|
||||||
disk to not be reflected in the lemmatizer."""
|
disk to not be reflected in the lemmatizer."""
|
||||||
|
@ -56,4 +57,8 @@ def test_lemmatizer_without_is_base_form_implementation():
|
||||||
lookups.add_table("lemma_exc", {"noun": {"formuesskatten": ["formuesskatt"]}})
|
lookups.add_table("lemma_exc", {"noun": {"formuesskatten": ["formuesskatt"]}})
|
||||||
|
|
||||||
lemmatizer = Lemmatizer(lookups, is_base_form=None)
|
lemmatizer = Lemmatizer(lookups, is_base_form=None)
|
||||||
assert lemmatizer("Formuesskatten", "noun", {'Definite': 'def', 'Gender': 'masc', 'Number': 'sing'}) == ["formuesskatt"]
|
assert lemmatizer(
|
||||||
|
"Formuesskatten",
|
||||||
|
"noun",
|
||||||
|
{"Definite": "def", "Gender": "masc", "Number": "sing"},
|
||||||
|
) == ["formuesskatt"]
|
||||||
|
|
|
@ -29,7 +29,6 @@ cdef class Vocab:
|
||||||
cpdef public Morphology morphology
|
cpdef public Morphology morphology
|
||||||
cpdef public object vectors
|
cpdef public object vectors
|
||||||
cpdef public object lookups
|
cpdef public object lookups
|
||||||
cpdef public object lookups_extra
|
|
||||||
cpdef public object writing_system
|
cpdef public object writing_system
|
||||||
cdef readonly int length
|
cdef readonly int length
|
||||||
cdef public object data_dir
|
cdef public object data_dir
|
||||||
|
|
|
@ -31,7 +31,7 @@ cdef class Vocab:
|
||||||
DOCS: https://spacy.io/api/vocab
|
DOCS: https://spacy.io/api/vocab
|
||||||
"""
|
"""
|
||||||
def __init__(self, lex_attr_getters=None, tag_map=None, lemmatizer=None,
|
def __init__(self, lex_attr_getters=None, tag_map=None, lemmatizer=None,
|
||||||
strings=tuple(), lookups=None, lookups_extra=None,
|
strings=tuple(), lookups=None, vocab_data={},
|
||||||
oov_prob=-20., vectors_name=None, writing_system={},
|
oov_prob=-20., vectors_name=None, writing_system={},
|
||||||
**deprecated_kwargs):
|
**deprecated_kwargs):
|
||||||
"""Create the vocabulary.
|
"""Create the vocabulary.
|
||||||
|
@ -44,7 +44,6 @@ cdef class Vocab:
|
||||||
strings (StringStore): StringStore that maps strings to integers, and
|
strings (StringStore): StringStore that maps strings to integers, and
|
||||||
vice versa.
|
vice versa.
|
||||||
lookups (Lookups): Container for large lookup tables and dictionaries.
|
lookups (Lookups): Container for large lookup tables and dictionaries.
|
||||||
lookups_extra (Lookups): Container for optional lookup tables and dictionaries.
|
|
||||||
oov_prob (float): Default OOV probability.
|
oov_prob (float): Default OOV probability.
|
||||||
vectors_name (unicode): Optional name to identify the vectors table.
|
vectors_name (unicode): Optional name to identify the vectors table.
|
||||||
RETURNS (Vocab): The newly constructed object.
|
RETURNS (Vocab): The newly constructed object.
|
||||||
|
@ -53,12 +52,12 @@ cdef class Vocab:
|
||||||
tag_map = tag_map if tag_map is not None else {}
|
tag_map = tag_map if tag_map is not None else {}
|
||||||
if lookups in (None, True, False):
|
if lookups in (None, True, False):
|
||||||
lookups = Lookups()
|
lookups = Lookups()
|
||||||
if "lexeme_norm" not in lookups:
|
for name, data in vocab_data.items():
|
||||||
lookups.add_table("lexeme_norm")
|
if name not in lookups:
|
||||||
|
data = data if data is not None else {}
|
||||||
|
lookups.add_table(name, data)
|
||||||
if lemmatizer in (None, True, False):
|
if lemmatizer in (None, True, False):
|
||||||
lemmatizer = Lemmatizer(lookups)
|
lemmatizer = Lemmatizer(lookups)
|
||||||
if lookups_extra in (None, True, False):
|
|
||||||
lookups_extra = Lookups()
|
|
||||||
self.cfg = {'oov_prob': oov_prob}
|
self.cfg = {'oov_prob': oov_prob}
|
||||||
self.mem = Pool()
|
self.mem = Pool()
|
||||||
self._by_orth = PreshMap()
|
self._by_orth = PreshMap()
|
||||||
|
@ -71,7 +70,6 @@ cdef class Vocab:
|
||||||
self.morphology = Morphology(self.strings, tag_map, lemmatizer)
|
self.morphology = Morphology(self.strings, tag_map, lemmatizer)
|
||||||
self.vectors = Vectors(name=vectors_name)
|
self.vectors = Vectors(name=vectors_name)
|
||||||
self.lookups = lookups
|
self.lookups = lookups
|
||||||
self.lookups_extra = lookups_extra
|
|
||||||
self.writing_system = writing_system
|
self.writing_system = writing_system
|
||||||
|
|
||||||
@property
|
@property
|
||||||
|
@ -425,6 +423,7 @@ cdef class Vocab:
|
||||||
lemmatizer=None,
|
lemmatizer=None,
|
||||||
lex_attr_getters=None,
|
lex_attr_getters=None,
|
||||||
stop_words=None,
|
stop_words=None,
|
||||||
|
vocab_data=None,
|
||||||
vectors_name=None,
|
vectors_name=None,
|
||||||
tag_map=None,
|
tag_map=None,
|
||||||
morph_rules=None
|
morph_rules=None
|
||||||
|
@ -444,12 +443,12 @@ cdef class Vocab:
|
||||||
if not lemmatizer:
|
if not lemmatizer:
|
||||||
lemma_cfg = {"lemmatizer": config["nlp"]["lemmatizer"]}
|
lemma_cfg = {"lemmatizer": config["nlp"]["lemmatizer"]}
|
||||||
lemmatizer = registry.make_from_config(lemma_cfg)["lemmatizer"]
|
lemmatizer = registry.make_from_config(lemma_cfg)["lemmatizer"]
|
||||||
lookups = lemmatizer.lookups
|
|
||||||
if "lexeme_norm" not in lookups:
|
|
||||||
lookups.add_table("lexeme_norm")
|
|
||||||
if stop_words is None:
|
if stop_words is None:
|
||||||
stop_words_cfg = {"stop_words": config["nlp"]["stop_words"]}
|
stop_words_cfg = {"stop_words": config["nlp"]["stop_words"]}
|
||||||
stop_words = registry.make_from_config(stop_words_cfg)["stop_words"]
|
stop_words = registry.make_from_config(stop_words_cfg)["stop_words"]
|
||||||
|
if vocab_data is None:
|
||||||
|
vocab_data_cfg = {"vocab_data": config["nlp"]["vocab_data"]}
|
||||||
|
vocab_data = registry.make_from_config(vocab_data_cfg)["vocab_data"]
|
||||||
if lex_attr_getters is None:
|
if lex_attr_getters is None:
|
||||||
lex_attrs_cfg = {"lex_attr_getters": config["nlp"]["lex_attr_getters"]}
|
lex_attrs_cfg = {"lex_attr_getters": config["nlp"]["lex_attr_getters"]}
|
||||||
lex_attr_getters = registry.make_from_config(lex_attrs_cfg)["lex_attr_getters"]
|
lex_attr_getters = registry.make_from_config(lex_attrs_cfg)["lex_attr_getters"]
|
||||||
|
@ -462,14 +461,12 @@ cdef class Vocab:
|
||||||
lex_attrs[NORM] = util.add_lookups(
|
lex_attrs[NORM] = util.add_lookups(
|
||||||
lex_attrs.get(NORM, LEX_ATTRS[NORM]),
|
lex_attrs.get(NORM, LEX_ATTRS[NORM]),
|
||||||
BASE_NORMS,
|
BASE_NORMS,
|
||||||
# TODO: we need to move the lexeme norms to their own entry
|
vocab_data.get("lexeme_norm", {}),
|
||||||
# points so we can specify them separately from the lemma lookups
|
|
||||||
lookups.get_table("lexeme_norm"),
|
|
||||||
)
|
)
|
||||||
vocab = cls(
|
vocab = cls(
|
||||||
lex_attr_getters=lex_attrs,
|
lex_attr_getters=lex_attrs,
|
||||||
|
vocab_data=vocab_data,
|
||||||
lemmatizer=lemmatizer,
|
lemmatizer=lemmatizer,
|
||||||
lookups=lookups,
|
|
||||||
writing_system=writing_system,
|
writing_system=writing_system,
|
||||||
tag_map=tag_map,
|
tag_map=tag_map,
|
||||||
)
|
)
|
||||||
|
@ -498,8 +495,6 @@ cdef class Vocab:
|
||||||
self.vectors.to_disk(path)
|
self.vectors.to_disk(path)
|
||||||
if "lookups" not in "exclude" and self.lookups is not None:
|
if "lookups" not in "exclude" and self.lookups is not None:
|
||||||
self.lookups.to_disk(path)
|
self.lookups.to_disk(path)
|
||||||
if "lookups_extra" not in "exclude" and self.lookups_extra is not None:
|
|
||||||
self.lookups_extra.to_disk(path, filename="lookups_extra.bin")
|
|
||||||
|
|
||||||
def from_disk(self, path, exclude=tuple()):
|
def from_disk(self, path, exclude=tuple()):
|
||||||
"""Loads state from a directory. Modifies the object in place and
|
"""Loads state from a directory. Modifies the object in place and
|
||||||
|
@ -522,8 +517,6 @@ cdef class Vocab:
|
||||||
link_vectors_to_models(self)
|
link_vectors_to_models(self)
|
||||||
if "lookups" not in exclude:
|
if "lookups" not in exclude:
|
||||||
self.lookups.from_disk(path)
|
self.lookups.from_disk(path)
|
||||||
if "lookups_extra" not in exclude:
|
|
||||||
self.lookups_extra.from_disk(path, filename="lookups_extra.bin")
|
|
||||||
if "lexeme_norm" in self.lookups:
|
if "lexeme_norm" in self.lookups:
|
||||||
self.lex_attr_getters[NORM] = util.add_lookups(
|
self.lex_attr_getters[NORM] = util.add_lookups(
|
||||||
self.lex_attr_getters.get(NORM, LEX_ATTRS[NORM]), self.lookups.get_table("lexeme_norm")
|
self.lex_attr_getters.get(NORM, LEX_ATTRS[NORM]), self.lookups.get_table("lexeme_norm")
|
||||||
|
@ -550,7 +543,6 @@ cdef class Vocab:
|
||||||
"strings": lambda: self.strings.to_bytes(),
|
"strings": lambda: self.strings.to_bytes(),
|
||||||
"vectors": deserialize_vectors,
|
"vectors": deserialize_vectors,
|
||||||
"lookups": lambda: self.lookups.to_bytes(),
|
"lookups": lambda: self.lookups.to_bytes(),
|
||||||
"lookups_extra": lambda: self.lookups_extra.to_bytes()
|
|
||||||
}
|
}
|
||||||
return util.to_bytes(getters, exclude)
|
return util.to_bytes(getters, exclude)
|
||||||
|
|
||||||
|
@ -574,7 +566,6 @@ cdef class Vocab:
|
||||||
"lexemes": lambda b: self.lexemes_from_bytes(b),
|
"lexemes": lambda b: self.lexemes_from_bytes(b),
|
||||||
"vectors": lambda b: serialize_vectors(b),
|
"vectors": lambda b: serialize_vectors(b),
|
||||||
"lookups": lambda b: self.lookups.from_bytes(b),
|
"lookups": lambda b: self.lookups.from_bytes(b),
|
||||||
"lookups_extra": lambda b: self.lookups_extra.from_bytes(b)
|
|
||||||
}
|
}
|
||||||
util.from_bytes(bytes_data, setters, exclude)
|
util.from_bytes(bytes_data, setters, exclude)
|
||||||
if "lexeme_norm" in self.lookups:
|
if "lexeme_norm" in self.lookups:
|
||||||
|
@ -592,19 +583,6 @@ cdef class Vocab:
|
||||||
raise NotImplementedError
|
raise NotImplementedError
|
||||||
|
|
||||||
|
|
||||||
def load_extra_lookups(self, table_name):
|
|
||||||
if table_name not in self.lookups_extra:
|
|
||||||
if self.lang + "_extra" in util.registry.lookups:
|
|
||||||
tables = util.registry.lookups.get(self.lang + "_extra")
|
|
||||||
for name, filename in tables.items():
|
|
||||||
if table_name == name:
|
|
||||||
data = util.load_language_data(filename)
|
|
||||||
self.lookups_extra.add_table(name, data)
|
|
||||||
if table_name not in self.lookups_extra:
|
|
||||||
self.lookups_extra.add_table(table_name)
|
|
||||||
return self.lookups_extra.get_table(table_name)
|
|
||||||
|
|
||||||
|
|
||||||
def pickle_vocab(vocab):
|
def pickle_vocab(vocab):
|
||||||
sstore = vocab.strings
|
sstore = vocab.strings
|
||||||
vectors = vocab.vectors
|
vectors = vocab.vectors
|
||||||
|
@ -612,13 +590,12 @@ def pickle_vocab(vocab):
|
||||||
data_dir = vocab.data_dir
|
data_dir = vocab.data_dir
|
||||||
lex_attr_getters = srsly.pickle_dumps(vocab.lex_attr_getters)
|
lex_attr_getters = srsly.pickle_dumps(vocab.lex_attr_getters)
|
||||||
lookups = vocab.lookups
|
lookups = vocab.lookups
|
||||||
lookups_extra = vocab.lookups_extra
|
|
||||||
return (unpickle_vocab,
|
return (unpickle_vocab,
|
||||||
(sstore, vectors, morph, data_dir, lex_attr_getters, lookups, lookups_extra))
|
(sstore, vectors, morph, data_dir, lex_attr_getters, lookups))
|
||||||
|
|
||||||
|
|
||||||
def unpickle_vocab(sstore, vectors, morphology, data_dir,
|
def unpickle_vocab(sstore, vectors, morphology, data_dir,
|
||||||
lex_attr_getters, lookups, lookups_extra):
|
lex_attr_getters, lookups):
|
||||||
cdef Vocab vocab = Vocab()
|
cdef Vocab vocab = Vocab()
|
||||||
vocab.vectors = vectors
|
vocab.vectors = vectors
|
||||||
vocab.strings = sstore
|
vocab.strings = sstore
|
||||||
|
@ -626,7 +603,6 @@ def unpickle_vocab(sstore, vectors, morphology, data_dir,
|
||||||
vocab.data_dir = data_dir
|
vocab.data_dir = data_dir
|
||||||
vocab.lex_attr_getters = srsly.pickle_loads(lex_attr_getters)
|
vocab.lex_attr_getters = srsly.pickle_loads(lex_attr_getters)
|
||||||
vocab.lookups = lookups
|
vocab.lookups = lookups
|
||||||
vocab.lookups_extra = lookups_extra
|
|
||||||
return vocab
|
return vocab
|
||||||
|
|
||||||
|
|
||||||
|
|
Loading…
Reference in New Issue
Block a user