From 945f795a3e4ebb0bab6e4c0420ec1dc590437422 Mon Sep 17 00:00:00 2001
From: Ines Montani
Date: Wed, 22 Jul 2020 15:59:37 +0200
Subject: [PATCH] WIP: move more language data to config

---
 spacy/cli/init_model.py        |  7 ++---
 spacy/cli/train.py             |  7 ++---
 spacy/default_config.cfg       |  2 ++
 spacy/errors.py                |  5 +---
 spacy/gold/augment.py          |  5 ++--
 spacy/lang/bn/__init__.py      |  3 +-
 spacy/lang/ca/__init__.py      |  3 +-
 spacy/lang/da/__init__.py      |  8 +++++-
 spacy/lang/de/__init__.py      | 22 +++++----------
 spacy/lang/el/__init__.py      | 12 ++++++--
 spacy/lang/en/__init__.py      | 20 ++++++--------
 spacy/lang/es/__init__.py      |  8 +++++-
 spacy/lang/fa/__init__.py      |  3 +-
 spacy/lang/fr/__init__.py      |  7 +++--
 spacy/lang/hr/__init__.py      |  3 +-
 spacy/lang/hu/__init__.py      |  3 +-
 spacy/lang/id/__init__.py      |  8 +++++-
 spacy/lang/it/__init__.py      |  3 +-
 spacy/lang/lb/__init__.py      |  8 +++++-
 spacy/lang/lt/__init__.py      |  3 +-
 spacy/lang/nb/__init__.py      |  3 +-
 spacy/lang/nl/__init__.py      |  7 +++--
 spacy/lang/pl/__init__.py      |  7 +++--
 spacy/lang/pt/__init__.py      |  3 +-
 spacy/lang/ro/__init__.py      |  3 +-
 spacy/lang/ru/__init__.py      |  5 ++++
 spacy/lang/sr/__init__.py      |  8 +++++-
 spacy/lang/sv/__init__.py      |  3 +-
 spacy/lang/ta/__init__.py      |  5 ++++
 spacy/lang/th/__init__.py      |  5 ++++
 spacy/lang/tl/__init__.py      |  3 +-
 spacy/lang/tr/__init__.py      |  3 +-
 spacy/lang/ur/__init__.py      |  3 +-
 spacy/language.py              |  4 +--
 spacy/lemmatizer.py            | 19 +++++--------
 spacy/lexeme.pyx               | 10 +++----
 spacy/lookups.py               | 18 ++++++++----
 spacy/schemas.py               |  1 +
 spacy/tests/test_lemmatizer.py |  7 ++++-
 spacy/vocab.pxd                |  1 -
 spacy/vocab.pyx                | 50 +++++++++-------------------
 41 files changed, 174 insertions(+), 134 deletions(-)

diff --git a/spacy/cli/init_model.py b/spacy/cli/init_model.py
index 9fb346006..f0c80bb8c 100644
--- a/spacy/cli/init_model.py
+++ b/spacy/cli/init_model.py
@@ -112,10 +112,9 @@ def init_model(
-    # Create empty extra lexeme tables so the data from spacy-lookups-data
-    # isn't loaded if these features are accessed
+    # Remove the extra lexeme tables loaded from spacy-lookups-data so
+    # their data isn't saved with the model
     if omit_extra_lookups:
-        nlp.vocab.lookups_extra = Lookups()
-        nlp.vocab.lookups_extra.add_table("lexeme_cluster")
-        nlp.vocab.lookups_extra.add_table("lexeme_prob")
-        nlp.vocab.lookups_extra.add_table("lexeme_settings")
+        nlp.vocab.lookups.remove_table("lexeme_cluster")
+        nlp.vocab.lookups.remove_table("lexeme_prob")
+        nlp.vocab.lookups.remove_table("lexeme_settings")

     msg.good("Successfully created model")
     if vectors_loc is not None:
diff --git a/spacy/cli/train.py b/spacy/cli/train.py
index 6ff665368..310580dbb 100644
--- a/spacy/cli/train.py
+++ b/spacy/cli/train.py
@@ -123,10 +123,9 @@ def train(
-    # Create empty extra lexeme tables so the data from spacy-lookups-data
-    # isn't loaded if these features are accessed
+    # Remove the extra lexeme tables loaded from spacy-lookups-data so
+    # their data isn't saved with the model
     if config["training"]["omit_extra_lookups"]:
-        nlp.vocab.lookups_extra = Lookups()
-        nlp.vocab.lookups_extra.add_table("lexeme_cluster")
-        nlp.vocab.lookups_extra.add_table("lexeme_prob")
-        nlp.vocab.lookups_extra.add_table("lexeme_settings")
+        nlp.vocab.lookups.remove_table("lexeme_cluster")
+        nlp.vocab.lookups.remove_table("lexeme_prob")
+        nlp.vocab.lookups.remove_table("lexeme_settings")

     # Load a pretrained tok2vec model - cf. CLI command 'pretrain'
     if weights_data is not None:
diff --git a/spacy/default_config.cfg b/spacy/default_config.cfg
index 7e6c7a6ec..747194cb4 100644
--- a/spacy/default_config.cfg
+++ b/spacy/default_config.cfg
@@ -2,6 +2,7 @@
 lang = null
 stop_words = []
 lex_attr_getters = {}
+vocab_data = {}
 pipeline = []

 [nlp.tokenizer]
@@ -9,6 +10,7 @@ pipeline = []

 [nlp.lemmatizer]
 @lemmatizers = "spacy.Lemmatizer.v1"
+data = {}

 [nlp.writing_system]
 direction = "ltr"
diff --git a/spacy/errors.py b/spacy/errors.py
index f6c7a569f..719e0204b 100644
--- a/spacy/errors.py
+++ b/spacy/errors.py
@@ -434,9 +434,6 @@ class Errors:
    E170 = ("Cannot apply transition {name}: invalid for the current state.")
    E171 = ("Matcher.add received invalid on_match callback argument: expected "
            "callable or None, but got: {arg_type}")
-   E172 = ("The Lemmatizer.load classmethod is deprecated. To create a "
-           "Lemmatizer, initialize the class directly. See the docs for "
-           "details: https://spacy.io/api/lemmatizer")
    E175 = ("Can't remove rule for unknown match pattern ID: {key}")
    E176 = ("Alias '{alias}' is not defined in the Knowledge Base.")
    E177 = ("Ill-formed IOB input detected: {tag}")
@@ -601,7 +598,7 @@ class Errors:
             "the same `Vocab`.")
    E1000 = ("No pkuseg model available. Provide a pkuseg model when "
             "initializing the pipeline:\n"
-            'cfg = {"tokenizer": {"segmenter": "pkuseg", "pkuseg_model": name_or_path}}\m'
+            'cfg = {"tokenizer": {"segmenter": "pkuseg", "pkuseg_model": name_or_path}}\n'
             'nlp = Chinese(config=cfg)')
diff --git a/spacy/gold/augment.py b/spacy/gold/augment.py
index 45cfc0abe..790762617 100644
--- a/spacy/gold/augment.py
+++ b/spacy/gold/augment.py
@@ -25,8 +25,9 @@ def make_orth_variants(nlp, raw_text, orig_token_dict, orth_variant_level=0.0):
             lower = True
             if raw is not None:
                 raw = raw.lower()
-    ndsv = nlp.Defaults.single_orth_variants
-    ndpv = nlp.Defaults.paired_orth_variants
+    orth_variants = nlp.vocab.lookups.get_table("orth_variants", {})
+    ndsv = orth_variants.get("single", [])
+    ndpv = orth_variants.get("paired", [])
     words = token_dict.get("words", [])
     tags = token_dict.get("tags", [])
     # keep unmodified if words or tags are not defined
diff --git a/spacy/lang/bn/__init__.py b/spacy/lang/bn/__init__.py
index 2ac771537..4b80e0c41 100644
--- a/spacy/lang/bn/__init__.py
+++ b/spacy/lang/bn/__init__.py
@@ -17,9 +17,10 @@ stop_words = {"@language_data": "spacy.bn.stop_words"}

 [nlp.lemmatizer]
 @lemmatizers = "spacy.Lemmatizer.v1"

-[nlp.lemmatizer.data_paths]
+[nlp.lemmatizer.data]
 @language_data = "spacy-lookups-data"
 lang = ${nlp:lang}
+tables = ["lemma_rules"]
 """
diff --git a/spacy/lang/ca/__init__.py b/spacy/lang/ca/__init__.py
index d2924e902..cab47555d 100644
--- a/spacy/lang/ca/__init__.py
+++ b/spacy/lang/ca/__init__.py
@@ -19,9 +19,10 @@ lex_attr_getters = {"@language_data": "spacy.ca.lex_attr_getters"}

 [nlp.lemmatizer]
 @lemmatizers = "spacy.Lemmatizer.v1"

-[nlp.lemmatizer.data_paths]
+[nlp.lemmatizer.data]
 @language_data = "spacy-lookups-data"
 lang = ${nlp:lang}
+tables = ["lemma_lookup"]
 """
diff --git a/spacy/lang/da/__init__.py b/spacy/lang/da/__init__.py
index 82ed5ed34..4f3802b21 100644
--- a/spacy/lang/da/__init__.py
+++ b/spacy/lang/da/__init__.py
@@ -19,9 +19,15 @@ lex_attr_getters = {"@language_data": "spacy.da.lex_attr_getters"}

 [nlp.lemmatizer]
 @lemmatizers = "spacy.Lemmatizer.v1"

-[nlp.lemmatizer.data_paths]
+[nlp.lemmatizer.data]
 @language_data = "spacy-lookups-data"
 lang = ${nlp:lang}
+tables = ["lemma_lookup"]
+
+[nlp.vocab_data]
+@language_data = "spacy-lookups-data"
+lang = ${nlp:lang}
+tables = ["lexeme_norm"]
 """
diff --git a/spacy/lang/de/__init__.py b/spacy/lang/de/__init__.py
index a5c38bd39..d620ded58 100644
--- a/spacy/lang/de/__init__.py
+++ b/spacy/lang/de/__init__.py
@@ -19,9 +19,15 @@ stop_words = {"@language_data": "spacy.de.stop_words"}

 [nlp.lemmatizer]
 @lemmatizers = "spacy.Lemmatizer.v1"

-[nlp.lemmatizer.data_paths]
+[nlp.lemmatizer.data]
 @language_data = "spacy-lookups-data"
 lang = ${nlp:lang}
+tables = ["lemma_lookup"]
+
+[nlp.vocab_data]
+@language_data = "spacy-lookups-data"
+lang = ${nlp:lang}
+tables = ["lexeme_norm", "lexeme_cluster", "lexeme_prob", "lexeme_settings", "orth_variants"]
 """
@@ -36,20 +42,6 @@ class GermanDefaults(Language.Defaults):
     suffixes = TOKENIZER_SUFFIXES
     infixes = TOKENIZER_INFIXES
     syntax_iterators = SYNTAX_ITERATORS
-    single_orth_variants = [
-        {"tags": ["$("], "variants": ["…", "..."]},
-        {"tags": ["$("], "variants": ["-", "—", "–", "--", "---", "——"]},
-    ]
-    paired_orth_variants = [
-        {
-            "tags": ["$("],
-            "variants": [("'", "'"), (",", "'"), ("‚", "‘"), ("›", "‹"), ("‹", "›")],
-        },
-        {
-            "tags": ["$("],
-            "variants": [("``", "''"), ('"', '"'), ("„", "“"), ("»", "«"), ("«", "»")],
-        },
-    ]


 class German(Language):
diff --git a/spacy/lang/el/__init__.py b/spacy/lang/el/__init__.py
index 2fd8647fb..65c634340 100644
--- a/spacy/lang/el/__init__.py
+++ b/spacy/lang/el/__init__.py
@@ -21,15 +21,21 @@ lex_attr_getters = {"@language_data": "spacy.el.lex_attr_getters"}

 [nlp.lemmatizer]
 @lemmatizers = "spacy.GreekLemmatizer.v1"

-[nlp.lemmatizer.data_paths]
+[nlp.lemmatizer.data]
 @language_data = "spacy-lookups-data"
 lang = ${nlp:lang}
+tables = ["lemma_index", "lemma_exc", "lemma_rules"]
+
+[nlp.vocab_data]
+@language_data = "spacy-lookups-data"
+lang = ${nlp:lang}
+tables = ["lexeme_norm", "lexeme_prob", "lexeme_settings"]
 """


 @registry.lemmatizers("spacy.GreekLemmatizer.v1")
-def create_greek_lemmatizer(data_paths: dict = {}) -> GreekLemmatizer:
-    return GreekLemmatizer(data_paths=data_paths)
+def create_greek_lemmatizer(data: Dict[str, dict] = {}) -> GreekLemmatizer:
+    return GreekLemmatizer(data=data)


 @registry.language_data("spacy.el.stop_words")
diff --git a/spacy/lang/en/__init__.py b/spacy/lang/en/__init__.py
index 4a69b2a41..3e21cf21b 100644
--- a/spacy/lang/en/__init__.py
+++ b/spacy/lang/en/__init__.py
@@ -22,9 +22,15 @@ lex_attr_getters = {"@language_data": "spacy.en.lex_attr_getters"}

 [nlp.lemmatizer]
 @lemmatizers = "spacy.EnglishLemmatizer.v1"

-[nlp.lemmatizer.data_paths]
+[nlp.lemmatizer.data]
 @language_data = "spacy-lookups-data"
 lang = ${nlp:lang}
+tables = ["lemma_lookup", "lemma_rules", "lemma_exc", "lemma_index"]
+
+[nlp.vocab_data]
+@language_data = "spacy-lookups-data"
+lang = ${nlp:lang}
+tables = ["lexeme_norm", "lexeme_cluster", "lexeme_prob", "lexeme_settings", "orth_variants"]
 """
@@ -39,22 +45,14 @@ def lex_attr_getters() -> Dict[int, Callable[[str], Any]]:

 @registry.lemmatizers("spacy.EnglishLemmatizer.v1")
-def create_lemmatizer(data_paths: dict = {}) -> "Lemmatizer":
-    return Lemmatizer(data_paths=data_paths, is_base_form=is_base_form)
+def create_lemmatizer(data: Dict[str, dict] = {}) -> "Lemmatizer":
+    return Lemmatizer(data=data, is_base_form=is_base_form)


 class EnglishDefaults(Language.Defaults):
     tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS)
     syntax_iterators = SYNTAX_ITERATORS
     infixes = TOKENIZER_INFIXES
-    single_orth_variants = [
-        {"tags": ["NFP"], "variants": ["…", "..."]},
-        {"tags": [":"], "variants": ["-", "—", "–", "--", "---", "——"]},
-    ]
-    paired_orth_variants = [
-        {"tags": ["``", "''"], "variants": [("'", "'"), ("‘", "’")]},
-        {"tags": ["``", "''"], "variants": [('"', '"'), ("“", "”")]},
-    ]


 class English(Language):
diff --git a/spacy/lang/es/__init__.py b/spacy/lang/es/__init__.py
index 4425bfc01..52aef4521 100644
--- a/spacy/lang/es/__init__.py
+++ b/spacy/lang/es/__init__.py
@@ -20,9 +20,15 @@ lex_attr_getters = {"@language_data": "spacy.es.lex_attr_getters"}

 [nlp.lemmatizer]
 @lemmatizers = "spacy.Lemmatizer.v1"

-[nlp.lemmatizer.data_paths]
+[nlp.lemmatizer.data]
 @language_data = "spacy-lookups-data"
 lang = ${nlp:lang}
+tables = ["lemma_lookup"]
+
+[nlp.vocab_data]
+@language_data = "spacy-lookups-data"
+lang = ${nlp:lang}
+tables = ["lexeme_cluster", "lexeme_prob", "lexeme_settings"]
 """
diff --git a/spacy/lang/fa/__init__.py b/spacy/lang/fa/__init__.py
index 085f400a4..41e40ca30 100644
--- a/spacy/lang/fa/__init__.py
+++ b/spacy/lang/fa/__init__.py
@@ -24,9 +24,10 @@ has_letters = true

 [nlp.lemmatizer]
 @lemmatizers = "spacy.Lemmatizer.v1"

-[nlp.lemmatizer.data_paths]
+[nlp.lemmatizer.data]
 @language_data = "spacy-lookups-data"
 lang = ${nlp:lang}
+tables = ["lemma_rules", "lemma_index", "lemma_exc"]
 """
diff --git a/spacy/lang/fr/__init__.py b/spacy/lang/fr/__init__.py
index 8140a21b6..4ec30cbd9 100644
--- a/spacy/lang/fr/__init__.py
+++ b/spacy/lang/fr/__init__.py
@@ -22,15 +22,16 @@ lex_attr_getters = {"@language_data": "spacy.fr.lex_attr_getters"}

 [nlp.lemmatizer]
 @lemmatizers = "spacy.FrenchLemmatizer.v1"

-[nlp.lemmatizer.data_paths]
+[nlp.lemmatizer.data]
 @language_data = "spacy-lookups-data"
 lang = ${nlp:lang}
+tables = ["lemma_rules", "lemma_index", "lemma_exc", "lemma_lookup"]
 """


 @registry.lemmatizers("spacy.FrenchLemmatizer.v1")
-def create_french_lemmatizer(data_paths: dict = {}) -> FrenchLemmatizer:
-    return FrenchLemmatizer(data_paths=data_paths, is_base_form=is_base_form)
+def create_french_lemmatizer(data: Dict[str, dict] = {}) -> FrenchLemmatizer:
+    return FrenchLemmatizer(data=data, is_base_form=is_base_form)


 @registry.language_data("spacy.fr.stop_words")
diff --git a/spacy/lang/hr/__init__.py b/spacy/lang/hr/__init__.py
index 648186093..e841ee24d 100644
--- a/spacy/lang/hr/__init__.py
+++ b/spacy/lang/hr/__init__.py
@@ -15,9 +15,10 @@ stop_words = {"@language_data": "spacy.hr.stop_words"}

 [nlp.lemmatizer]
 @lemmatizers = "spacy.Lemmatizer.v1"

-[nlp.lemmatizer.data_paths]
+[nlp.lemmatizer.data]
 @language_data = "spacy-lookups-data"
 lang = ${nlp:lang}
+tables = ["lemma_lookup"]
 """
diff --git a/spacy/lang/hu/__init__.py b/spacy/lang/hu/__init__.py
index 3e83e971a..2cfd61dfa 100644
--- a/spacy/lang/hu/__init__.py
+++ b/spacy/lang/hu/__init__.py
@@ -17,9 +17,10 @@ stop_words = {"@language_data": "spacy.hu.stop_words"}

 [nlp.lemmatizer]
 @lemmatizers = "spacy.Lemmatizer.v1"

-[nlp.lemmatizer.data_paths]
+[nlp.lemmatizer.data]
 @language_data = "spacy-lookups-data"
 lang = ${nlp:lang}
+tables = ["lemma_lookup"]
 """
diff --git a/spacy/lang/id/__init__.py b/spacy/lang/id/__init__.py
index b8b34aa26..8998addb4 100644
--- a/spacy/lang/id/__init__.py
+++ b/spacy/lang/id/__init__.py
@@ -20,9 +20,15 @@ lex_attr_getters = {"@language_data": "spacy.id.lex_attr_getters"}

 [nlp.lemmatizer]
 @lemmatizers = "spacy.Lemmatizer.v1"

-[nlp.lemmatizer.data_paths]
+[nlp.lemmatizer.data]
 @language_data = "spacy-lookups-data"
 lang = ${nlp:lang}
+tables = ["lemma_lookup"]
+
+[nlp.vocab_data]
+@language_data = "spacy-lookups-data"
+lang = ${nlp:lang}
+tables = ["lexeme_norm"]
 """
diff --git a/spacy/lang/it/__init__.py b/spacy/lang/it/__init__.py
index 1b0a15348..f6b6afa59 100644
--- a/spacy/lang/it/__init__.py
+++ b/spacy/lang/it/__init__.py
@@ -17,9 +17,10 @@ stop_words = {"@language_data": "spacy.it.stop_words"}

 [nlp.lemmatizer]
 @lemmatizers = "spacy.Lemmatizer.v1"

-[nlp.lemmatizer.data_paths]
+[nlp.lemmatizer.data]
 @language_data = "spacy-lookups-data"
 lang = ${nlp:lang}
+tables = ["lemma_lookup"]
 """
diff --git a/spacy/lang/lb/__init__.py b/spacy/lang/lb/__init__.py
index 54e4e82c0..d381bb2e7 100644
--- a/spacy/lang/lb/__init__.py
+++ b/spacy/lang/lb/__init__.py
@@ -19,9 +19,15 @@ lex_attr_getters = {"@language_data": "spacy.lb.lex_attr_getters"}

 [nlp.lemmatizer]
 @lemmatizers = "spacy.Lemmatizer.v1"

-[nlp.lemmatizer.data_paths]
+[nlp.lemmatizer.data]
 @language_data = "spacy-lookups-data"
 lang = ${nlp:lang}
+tables = ["lemma_lookup"]
+
+[nlp.vocab_data]
+@language_data = "spacy-lookups-data"
+lang = ${nlp:lang}
+tables = ["lexeme_norm"]
 """
diff --git a/spacy/lang/lt/__init__.py b/spacy/lang/lt/__init__.py
index 656df79c9..23c11f3a1 100644
--- a/spacy/lang/lt/__init__.py
+++ b/spacy/lang/lt/__init__.py
@@ -19,9 +19,10 @@ lex_attr_getters = {"@language_data": "spacy.lt.lex_attr_getters"}

 [nlp.lemmatizer]
 @lemmatizers = "spacy.Lemmatizer.v1"

-[nlp.lemmatizer.data_paths]
+[nlp.lemmatizer.data]
 @language_data = "spacy-lookups-data"
 lang = ${nlp:lang}
+tables = ["lemma_lookup"]
 """
diff --git a/spacy/lang/nb/__init__.py b/spacy/lang/nb/__init__.py
index e472b0c60..3b386344b 100644
--- a/spacy/lang/nb/__init__.py
+++ b/spacy/lang/nb/__init__.py
@@ -19,9 +19,10 @@ stop_words = {"@language_data": "spacy.nb.stop_words"}

 [nlp.lemmatizer]
 @lemmatizers = "spacy.Lemmatizer.v1"

-[nlp.lemmatizer.data_paths]
+[nlp.lemmatizer.data]
 @language_data = "spacy-lookups-data"
 lang = ${nlp:lang}
+tables = ["lemma_lookup", "lemma_rules", "lemma_exc"]
 """
diff --git a/spacy/lang/nl/__init__.py b/spacy/lang/nl/__init__.py
index 7e9806bc3..ab2cf3a94 100644
--- a/spacy/lang/nl/__init__.py
+++ b/spacy/lang/nl/__init__.py
@@ -21,9 +21,10 @@ lex_attr_getters = {"@language_data": "spacy.nl.lex_attr_getters"}

 [nlp.lemmatizer]
 @lemmatizers = "spacy.DutchLemmatizer.v1"

-[nlp.lemmatizer.data_paths]
+[nlp.lemmatizer.data]
 @language_data = "spacy-lookups-data"
 lang = ${nlp:lang}
+tables = ["lemma_rules", "lemma_index", "lemma_exc", "lemma_lookup"]
 """
@@ -38,8 +39,8 @@ def lex_attr_getters() -> Dict[int, Callable[[str], Any]]:

 @registry.lemmatizers("spacy.DutchLemmatizer.v1")
-def create_dutch_lemmatizer(data_paths: dict = {}) -> DutchLemmatizer:
-    return DutchLemmatizer(data_paths=data_paths)
+def create_dutch_lemmatizer(data: Dict[str, dict] = {}) -> DutchLemmatizer:
+    return DutchLemmatizer(data=data)


 class DutchDefaults(Language.Defaults):
diff --git a/spacy/lang/pl/__init__.py b/spacy/lang/pl/__init__.py
index 87a174ec8..82957dc7a 100644
--- a/spacy/lang/pl/__init__.py
+++ b/spacy/lang/pl/__init__.py
@@ -20,9 +20,10 @@ lex_attr_getters = {"@language_data": "spacy.pl.lex_attr_getters"}

 [nlp.lemmatizer]
 @lemmatizers = "spacy.PolishLemmatizer.v1"

-[nlp.lemmatizer.data_paths]
+[nlp.lemmatizer.data]
 @language_data = "spacy-lookups-data"
 lang = ${nlp:lang}
+tables = ["lemma_lookup_adj", "lemma_lookup_adp", "lemma_lookup_adv", "lemma_lookup_aux", "lemma_lookup_noun", "lemma_lookup_num", "lemma_lookup_part", "lemma_lookup_pron", "lemma_lookup_verb"]
 """
@@ -37,8 +38,8 @@ def lex_attr_getters() -> Dict[int, Callable[[str], Any]]:

 @registry.lemmatizers("spacy.PolishLemmatizer.v1")
-def create_polish_lemmatizer(data_paths: dict = {}) -> PolishLemmatizer:
-    return PolishLemmatizer(data_paths=data_paths)
+def create_polish_lemmatizer(data: Dict[str, dict] = {}) -> PolishLemmatizer:
+    return PolishLemmatizer(data=data)


 class PolishDefaults(Language.Defaults):
diff --git a/spacy/lang/pt/__init__.py b/spacy/lang/pt/__init__.py
index 6dc22ed61..045bd3bc1 100644
--- a/spacy/lang/pt/__init__.py
+++ b/spacy/lang/pt/__init__.py
@@ -19,9 +19,10 @@ lex_attr_getters = {"@language_data": "spacy.pt.lex_attr_getters"}

 [nlp.lemmatizer]
 @lemmatizers = "spacy.Lemmatizer.v1"

-[nlp.lemmatizer.data_paths]
+[nlp.lemmatizer.data]
 @language_data = "spacy-lookups-data"
 lang = ${nlp:lang}
+tables = ["lemma_lookup"]
 """
diff --git a/spacy/lang/ro/__init__.py b/spacy/lang/ro/__init__.py
index b66b7767c..740bd7911 100644
--- a/spacy/lang/ro/__init__.py
+++ b/spacy/lang/ro/__init__.py
@@ -22,9 +22,10 @@ stop_words = {"@language_data": "spacy.ro.stop_words"}

 [nlp.lemmatizer]
 @lemmatizers = "spacy.Lemmatizer.v1"

-[nlp.lemmatizer.data_paths]
+[nlp.lemmatizer.data]
 @language_data = "spacy-lookups-data"
 lang = ${nlp:lang}
+tables = ["lemma_lookup"]
 """
diff --git a/spacy/lang/ru/__init__.py b/spacy/lang/ru/__init__.py
index 004a8d83a..e9e28dfb5 100644
--- a/spacy/lang/ru/__init__.py
+++ b/spacy/lang/ru/__init__.py
@@ -18,6 +18,11 @@ lex_attr_getters = {"@language_data": "spacy.ru.lex_attr_getters"}

 [nlp.lemmatizer]
 @lemmatizers = "spacy.RussianLemmatizer.v1"
+
+[nlp.vocab_data]
+@language_data = "spacy-lookups-data"
+lang = ${nlp:lang}
+tables = ["lexeme_norm"]
 """
diff --git a/spacy/lang/sr/__init__.py b/spacy/lang/sr/__init__.py
index fd53d3826..f69ad3a89 100644
--- a/spacy/lang/sr/__init__.py
+++ b/spacy/lang/sr/__init__.py
@@ -18,9 +18,15 @@ lex_attr_getters = {"@language_data": "spacy.sr.lex_attr_getters"}

 [nlp.lemmatizer]
 @lemmatizers = "spacy.Lemmatizer.v1"

-[nlp.lemmatizer.data_paths]
+[nlp.lemmatizer.data]
 @language_data = "spacy-lookups-data"
 lang = ${nlp:lang}
+tables = ["lemma_lookup"]
+
+[nlp.vocab_data]
+@language_data = "spacy-lookups-data"
+lang = ${nlp:lang}
+tables = ["lexeme_norm"]
 """
diff --git a/spacy/lang/sv/__init__.py b/spacy/lang/sv/__init__.py
index 5c376fd51..c18ad775d 100644
--- a/spacy/lang/sv/__init__.py
+++ b/spacy/lang/sv/__init__.py
@@ -22,9 +22,10 @@ lex_attr_getters = {"@language_data": "spacy.sv.lex_attr_getters"}

 [nlp.lemmatizer]
 @lemmatizers = "spacy.Lemmatizer.v1"

-[nlp.lemmatizer.data_paths]
+[nlp.lemmatizer.data]
 @language_data = "spacy-lookups-data"
 lang = ${nlp:lang}
+tables = ["lemma_lookup", "lemma_rules"]
 """
diff --git a/spacy/lang/ta/__init__.py b/spacy/lang/ta/__init__.py
index 983bd5de4..c429127c9 100644
--- a/spacy/lang/ta/__init__.py
+++ b/spacy/lang/ta/__init__.py
@@ -12,6 +12,11 @@ DEFAULT_CONFIG = """
 lang = "ta"
 stop_words = {"@language_data": "spacy.ta.stop_words"}
 lex_attr_getters = {"@language_data": "spacy.ta.lex_attr_getters"}
+
+[nlp.vocab_data]
+@language_data = "spacy-lookups-data"
+lang = ${nlp:lang}
+tables = ["lexeme_norm"]
 """
diff --git a/spacy/lang/th/__init__.py b/spacy/lang/th/__init__.py
index 116355342..1fdf4311e 100644
--- a/spacy/lang/th/__init__.py
+++ b/spacy/lang/th/__init__.py
@@ -16,6 +16,11 @@ lex_attr_getters = {"@language_data": "spacy.th.lex_attr_getters"}

 [nlp.tokenizer]
 @tokenizers = "spacy.ThaiTokenizer.v1"
+
+[nlp.vocab_data]
+@language_data = "spacy-lookups-data"
+lang = ${nlp:lang}
+tables = ["lexeme_norm"]
 """
diff --git a/spacy/lang/tl/__init__.py b/spacy/lang/tl/__init__.py
index c52adb046..a7158e6f6 100644
--- a/spacy/lang/tl/__init__.py
+++ b/spacy/lang/tl/__init__.py
@@ -18,9 +18,10 @@ lex_attr_getters = {"@language_data": "spacy.tl.lex_attr_getters"}

 [nlp.lemmatizer]
 @lemmatizers = "spacy.Lemmatizer.v1"

-[nlp.lemmatizer.data_paths]
+[nlp.lemmatizer.data]
 @language_data = "spacy-lookups-data"
 lang = ${nlp:lang}
+tables = ["lemma_lookup"]
 """
diff --git a/spacy/lang/tr/__init__.py b/spacy/lang/tr/__init__.py
index f6782b419..dff56e945 100644
--- a/spacy/lang/tr/__init__.py
+++ b/spacy/lang/tr/__init__.py
@@ -16,9 +16,10 @@ stop_words = {"@language_data": "spacy.tr.stop_words"}

 [nlp.lemmatizer]
 @lemmatizers = "spacy.Lemmatizer.v1"

-[nlp.lemmatizer.data_paths]
+[nlp.lemmatizer.data]
 @language_data = "spacy-lookups-data"
 lang = ${nlp:lang}
+tables = ["lemma_lookup"]
 """
diff --git a/spacy/lang/ur/__init__.py b/spacy/lang/ur/__init__.py
index c7977d6b8..db714c296 100644
--- a/spacy/lang/ur/__init__.py
+++ b/spacy/lang/ur/__init__.py
@@ -23,9 +23,10 @@ has_letters = true

 [nlp.lemmatizer]
 @lemmatizers = "spacy.Lemmatizer.v1"

-[nlp.lemmatizer.data_paths]
+[nlp.lemmatizer.data]
 @language_data = "spacy-lookups-data"
 lang = ${nlp:lang}
+tables = ["lemma_lookup"]
 """
diff --git a/spacy/language.py b/spacy/language.py
index 97c8f31b7..77d0b4b0e 100644
--- a/spacy/language.py
+++ b/spacy/language.py
@@ -55,8 +55,6 @@ class BaseDefaults:
     tokenizer_exceptions: Dict[str, List[dict]] = {}
     morph_rules: Dict[str, Dict[str, dict]] = {}
     syntax_iterators: Dict[str, Callable[[Union[Doc, Span]], Iterator]] = {}
-    single_orth_variants: List[Dict[str, List[str]]] = []
-    paired_orth_variants: List[Dict[str, Union[List[str], List[Tuple[str, str]]]]] = []


 class Language:
@@ -1268,11 +1266,13 @@ class Language:
         lemmatizer = resolved["nlp"]["lemmatizer"]
         lex_attr_getters = resolved["nlp"]["lex_attr_getters"]
         stop_words = resolved["nlp"]["stop_words"]
+        vocab_data = resolved["nlp"]["vocab_data"]
         vocab = Vocab.from_config(
             filled,
             lemmatizer=lemmatizer,
             lex_attr_getters=lex_attr_getters,
             stop_words=stop_words,
+            vocab_data=vocab_data,
             # TODO: what should we do with these?
             tag_map=cls.Defaults.tag_map,
             morph_rules=cls.Defaults.morph_rules,
diff --git a/spacy/lemmatizer.py b/spacy/lemmatizer.py
index 81dbf4ea3..8255b4b36 100644
--- a/spacy/lemmatizer.py
+++ b/spacy/lemmatizer.py
@@ -1,14 +1,13 @@
 from typing import Optional, Callable, List, Dict

 from .lookups import Lookups
-from .errors import Errors
 from .parts_of_speech import NAMES as UPOS_NAMES
-from .util import registry, load_language_data, SimpleFrozenDict
+from .util import registry


 @registry.lemmatizers("spacy.Lemmatizer.v1")
-def create_lemmatizer(data_paths: dict = {}) -> "Lemmatizer":
-    return Lemmatizer(data_paths=data_paths)
+def create_lemmatizer(data: Dict[str, dict] = {}) -> "Lemmatizer":
+    return Lemmatizer(data=data)


 class Lemmatizer:
@@ -19,14 +18,10 @@ class Lemmatizer:

     DOCS: https://spacy.io/api/lemmatizer
     """

-    @classmethod
-    def load(cls, *args, **kwargs):
-        raise NotImplementedError(Errors.E172)
-
     def __init__(
         self,
         lookups: Optional[Lookups] = None,
-        data_paths: dict = SimpleFrozenDict(),
+        data: Dict[str, dict] = {},
         is_base_form: Optional[Callable] = None,
     ) -> None:
         """Initialize a Lemmatizer.
@@ -36,9 +31,9 @@ def __init__(
         RETURNS (Lemmatizer): The newly constructed object.
""" self.lookups = lookups if lookups is not None else Lookups() - for name, filename in data_paths.items(): - data = load_language_data(filename) - self.lookups.add_table(name, data) + for name, table in data.items(): + if table is not None: + self.lookups.add_table(name, table) self.is_base_form = is_base_form def __call__( diff --git a/spacy/lexeme.pyx b/spacy/lexeme.pyx index edaf874a3..25461b4b7 100644 --- a/spacy/lexeme.pyx +++ b/spacy/lexeme.pyx @@ -251,11 +251,11 @@ cdef class Lexeme: property cluster: """RETURNS (int): Brown cluster ID.""" def __get__(self): - cluster_table = self.vocab.load_extra_lookups("lexeme_cluster") + cluster_table = self.vocab.lookups.get_table("lexeme_cluster", {}) return cluster_table.get(self.c.orth, 0) def __set__(self, int x): - cluster_table = self.vocab.load_extra_lookups("lexeme_cluster") + cluster_table = self.vocab.lookups.get_table("lexeme_cluster", {}) cluster_table[self.c.orth] = x property lang: @@ -270,13 +270,13 @@ cdef class Lexeme: """RETURNS (float): Smoothed log probability estimate of the lexeme's type.""" def __get__(self): - prob_table = self.vocab.load_extra_lookups("lexeme_prob") - settings_table = self.vocab.load_extra_lookups("lexeme_settings") + prob_table = self.vocab.lookups.get_table("lexeme_prob", {}) + settings_table = self.vocab.lookups.get_table("lexeme_settings", {}) default_oov_prob = settings_table.get("oov_prob", -20.0) return prob_table.get(self.c.orth, default_oov_prob) def __set__(self, float x): - prob_table = self.vocab.load_extra_lookups("lexeme_prob") + prob_table = self.vocab.lookups.get_table("lexeme_prob", {}) prob_table[self.c.orth] = x property lower_: diff --git a/spacy/lookups.py b/spacy/lookups.py index b03a326b6..d5def882e 100644 --- a/spacy/lookups.py +++ b/spacy/lookups.py @@ -5,7 +5,7 @@ from preshed.bloom import BloomFilter from collections import OrderedDict from .errors import Errors -from .util import SimpleFrozenDict, ensure_path, registry +from .util import SimpleFrozenDict, ensure_path, registry, load_language_data from .strings import get_string_id @@ -13,18 +13,26 @@ UNSET = object() @registry.language_data("spacy-lookups-data") -def get_lookups(lang: str) -> Dict[str, Any]: +def get_lookups(lang: str, tables: List[str]) -> Optional[Dict[str, Any]]: """Load the data from the spacy-lookups-data package for a given language, if available. Returns an empty dict if there's no data or if the package is not installed. lang (str): The language code (corresponds to entry point exposed by the spacy-lookups-data package). + tables (List[str]): Name of tables to load, e.g. ["lemma_lookup", "lemma_exc"] RETURNS (Dict[str, Any]): The lookups, keyed by table name. """ - if lang in registry.lookups: - return registry.lookups.get(lang) - return {} + # TODO: import spacy_lookups_data instead of going via entry points here? 
+    if lang not in registry.lookups:
+        return {}
+    data = registry.lookups.get(lang)
+    result = {}
+    for table in tables:
+        if table not in data:
+            raise ValueError("TODO: unknown table")
+        result[table] = load_language_data(data[table])
+    return result


 class Lookups:
diff --git a/spacy/schemas.py b/spacy/schemas.py
index bd4939392..ba5e812ee 100644
--- a/spacy/schemas.py
+++ b/spacy/schemas.py
@@ -243,6 +243,7 @@ class ConfigSchemaNlp(BaseModel):
     writing_system: ConfigSchemaNlpWritingSystem = Field(..., title="The language's writing system")
     stop_words: Sequence[StrictStr] = Field(..., title="Stop words to mark via Token/Lexeme.is_stop")
     lex_attr_getters: Dict[StrictStr, Callable] = Field(..., title="Custom getter functions for lexical attributes (e.g. like_num)")
+    vocab_data: Dict[StrictStr, Dict[StrictStr, Any]] = Field(..., title="Vocabulary data, e.g. lexeme normalization tables")
     # fmt: on

     class Config:
diff --git a/spacy/tests/test_lemmatizer.py b/spacy/tests/test_lemmatizer.py
index 44f540132..3c904cb01 100644
--- a/spacy/tests/test_lemmatizer.py
+++ b/spacy/tests/test_lemmatizer.py
@@ -5,6 +5,7 @@ from spacy.lookups import Lookups
 from spacy.lemmatizer import Lemmatizer


+@pytest.mark.skip(reason="We probably don't want to support this anymore in v3?")
 def test_lemmatizer_reflects_lookups_changes():
     """Test for an issue that'd cause lookups available in a model loaded
     from disk to not be reflected in the lemmatizer."""
@@ -56,4 +57,8 @@ def test_lemmatizer_without_is_base_form_implementation():
     lookups.add_table("lemma_exc", {"noun": {"formuesskatten": ["formuesskatt"]}})
     lemmatizer = Lemmatizer(lookups, is_base_form=None)
-    assert lemmatizer("Formuesskatten", "noun", {'Definite': 'def', 'Gender': 'masc', 'Number': 'sing'}) == ["formuesskatt"]
+    assert lemmatizer(
+        "Formuesskatten",
+        "noun",
+        {"Definite": "def", "Gender": "masc", "Number": "sing"},
+    ) == ["formuesskatt"]
diff --git a/spacy/vocab.pxd b/spacy/vocab.pxd
index f93b6cffe..a31c984ad 100644
--- a/spacy/vocab.pxd
+++ b/spacy/vocab.pxd
@@ -29,7 +29,6 @@ cdef class Vocab:
     cpdef public Morphology morphology
     cpdef public object vectors
     cpdef public object lookups
-    cpdef public object lookups_extra
     cpdef public object writing_system
     cdef readonly int length
     cdef public object data_dir
diff --git a/spacy/vocab.pyx b/spacy/vocab.pyx
index 3ab90dd2f..1afee4f69 100644
--- a/spacy/vocab.pyx
+++ b/spacy/vocab.pyx
@@ -31,7 +31,7 @@ cdef class Vocab:
     DOCS: https://spacy.io/api/vocab
     """
     def __init__(self, lex_attr_getters=None, tag_map=None, lemmatizer=None,
-                 strings=tuple(), lookups=None, lookups_extra=None,
+                 strings=tuple(), lookups=None, vocab_data={},
                  oov_prob=-20., vectors_name=None, writing_system={},
                  **deprecated_kwargs):
         """Create the vocabulary.
@@ -44,7 +44,6 @@ cdef class Vocab:
         strings (StringStore): StringStore that maps strings to integers, and
             vice versa.
         lookups (Lookups): Container for large lookup tables and dictionaries.
-        lookups_extra (Lookups): Container for optional lookup tables and dictionaries.
         oov_prob (float): Default OOV probability.
         vectors_name (unicode): Optional name to identify the vectors table.
         RETURNS (Vocab): The newly constructed object.
@@ -53,12 +52,12 @@ cdef class Vocab:
         tag_map = tag_map if tag_map is not None else {}
         if lookups in (None, True, False):
             lookups = Lookups()
-        if "lexeme_norm" not in lookups:
-            lookups.add_table("lexeme_norm")
+        for name, data in vocab_data.items():
+            if name not in lookups:
+                data = data if data is not None else {}
+                lookups.add_table(name, data)
         if lemmatizer in (None, True, False):
             lemmatizer = Lemmatizer(lookups)
-        if lookups_extra in (None, True, False):
-            lookups_extra = Lookups()
         self.cfg = {'oov_prob': oov_prob}
         self.mem = Pool()
         self._by_orth = PreshMap()
@@ -71,7 +70,6 @@ cdef class Vocab:
         self.morphology = Morphology(self.strings, tag_map, lemmatizer)
         self.vectors = Vectors(name=vectors_name)
         self.lookups = lookups
-        self.lookups_extra = lookups_extra
         self.writing_system = writing_system

     @property
@@ -425,6 +423,7 @@ cdef class Vocab:
         lemmatizer=None,
         lex_attr_getters=None,
         stop_words=None,
+        vocab_data=None,
         vectors_name=None,
         tag_map=None,
         morph_rules=None
@@ -444,12 +443,12 @@ cdef class Vocab:
         if not lemmatizer:
             lemma_cfg = {"lemmatizer": config["nlp"]["lemmatizer"]}
             lemmatizer = registry.make_from_config(lemma_cfg)["lemmatizer"]
-        lookups = lemmatizer.lookups
-        if "lexeme_norm" not in lookups:
-            lookups.add_table("lexeme_norm")
         if stop_words is None:
             stop_words_cfg = {"stop_words": config["nlp"]["stop_words"]}
             stop_words = registry.make_from_config(stop_words_cfg)["stop_words"]
+        if vocab_data is None:
+            vocab_data_cfg = {"vocab_data": config["nlp"]["vocab_data"]}
+            vocab_data = registry.make_from_config(vocab_data_cfg)["vocab_data"]
         if lex_attr_getters is None:
             lex_attrs_cfg = {"lex_attr_getters": config["nlp"]["lex_attr_getters"]}
             lex_attr_getters = registry.make_from_config(lex_attrs_cfg)["lex_attr_getters"]
@@ -462,14 +461,12 @@ cdef class Vocab:
         lex_attrs[NORM] = util.add_lookups(
             lex_attrs.get(NORM, LEX_ATTRS[NORM]),
             BASE_NORMS,
-            # TODO: we need to move the lexeme norms to their own entry
-            # points so we can specify them separately from the lemma lookups
-            lookups.get_table("lexeme_norm"),
+            vocab_data.get("lexeme_norm", {}),
         )
         vocab = cls(
             lex_attr_getters=lex_attrs,
+            vocab_data=vocab_data,
             lemmatizer=lemmatizer,
-            lookups=lookups,
             writing_system=writing_system,
             tag_map=tag_map,
         )
@@ -498,8 +495,6 @@ cdef class Vocab:
             self.vectors.to_disk(path)
-        if "lookups" not in "exclude" and self.lookups is not None:
+        if "lookups" not in exclude and self.lookups is not None:
             self.lookups.to_disk(path)
-        if "lookups_extra" not in "exclude" and self.lookups_extra is not None:
-            self.lookups_extra.to_disk(path, filename="lookups_extra.bin")

     def from_disk(self, path, exclude=tuple()):
         """Loads state from a directory. Modifies the object in place and
@@ -522,8 +517,6 @@ cdef class Vocab:
                 link_vectors_to_models(self)
         if "lookups" not in exclude:
             self.lookups.from_disk(path)
-        if "lookups_extra" not in exclude:
-            self.lookups_extra.from_disk(path, filename="lookups_extra.bin")
         if "lexeme_norm" in self.lookups:
             self.lex_attr_getters[NORM] = util.add_lookups(
                 self.lex_attr_getters.get(NORM, LEX_ATTRS[NORM]), self.lookups.get_table("lexeme_norm")
@@ -550,7 +543,6 @@ cdef class Vocab:
             "strings": lambda: self.strings.to_bytes(),
             "vectors": deserialize_vectors,
             "lookups": lambda: self.lookups.to_bytes(),
-            "lookups_extra": lambda: self.lookups_extra.to_bytes()
         }
         return util.to_bytes(getters, exclude)
@@ -574,7 +566,6 @@ cdef class Vocab:
             "lexemes": lambda b: self.lexemes_from_bytes(b),
             "vectors": lambda b: serialize_vectors(b),
             "lookups": lambda b: self.lookups.from_bytes(b),
-            "lookups_extra": lambda b: self.lookups_extra.from_bytes(b)
         }
         util.from_bytes(bytes_data, setters, exclude)
         if "lexeme_norm" in self.lookups:
@@ -592,19 +583,6 @@ cdef class Vocab:
         raise NotImplementedError


-    def load_extra_lookups(self, table_name):
-        if table_name not in self.lookups_extra:
-            if self.lang + "_extra" in util.registry.lookups:
-                tables = util.registry.lookups.get(self.lang + "_extra")
-                for name, filename in tables.items():
-                    if table_name == name:
-                        data = util.load_language_data(filename)
-                        self.lookups_extra.add_table(name, data)
-            if table_name not in self.lookups_extra:
-                self.lookups_extra.add_table(table_name)
-        return self.lookups_extra.get_table(table_name)
-
-
 def pickle_vocab(vocab):
     sstore = vocab.strings
     vectors = vocab.vectors
@@ -612,13 +590,12 @@ def pickle_vocab(vocab):
     morph = vocab.morphology
     data_dir = vocab.data_dir
     lex_attr_getters = srsly.pickle_dumps(vocab.lex_attr_getters)
     lookups = vocab.lookups
-    lookups_extra = vocab.lookups_extra
     return (unpickle_vocab,
-            (sstore, vectors, morph, data_dir, lex_attr_getters, lookups, lookups_extra))
+            (sstore, vectors, morph, data_dir, lex_attr_getters, lookups))


 def unpickle_vocab(sstore, vectors, morphology, data_dir,
-                   lex_attr_getters, lookups, lookups_extra):
+                   lex_attr_getters, lookups):
     cdef Vocab vocab = Vocab()
     vocab.vectors = vectors
     vocab.strings = sstore
@@ -626,7 +603,6 @@ def unpickle_vocab(sstore, vectors, morphology, data_dir,
     vocab.data_dir = data_dir
    vocab.lex_attr_getters = srsly.pickle_loads(lex_attr_getters)
     vocab.lookups = lookups
-    vocab.lookups_extra = lookups_extra
     return vocab
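
---
Note for reviewers: a minimal sketch of the data flow this WIP introduces. The
table contents and words below are invented for illustration only; in a real
pipeline the [nlp.lemmatizer.data] and [nlp.vocab_data] config blocks resolve
"spacy-lookups-data" tables into dicts of exactly this shape.

    from spacy.lemmatizer import Lemmatizer
    from spacy.vocab import Vocab

    # Tables are now passed in directly as dicts instead of data_paths;
    # the new Lemmatizer.__init__ adds each one to its Lookups container.
    data = {"lemma_lookup": {"dogs": "dog", "ran": "run"}}
    lemmatizer = Lemmatizer(data=data)
    assert lemmatizer.lookups.get_table("lemma_lookup")["dogs"] == "dog"

    # Vocab receives its tables the same way via the new vocab_data argument,
    # replacing the previous lookups_extra container.
    vocab = Vocab(vocab_data={"lexeme_norm": {"Dogs": "dogs"}})
    assert "lexeme_norm" in vocab.lookups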