From 945f795a3e4ebb0bab6e4c0420ec1dc590437422 Mon Sep 17 00:00:00 2001 From: Ines Montani Date: Wed, 22 Jul 2020 15:59:37 +0200 Subject: [PATCH 01/14] WIP: move more language data to config --- spacy/cli/init_model.py | 7 ++--- spacy/cli/train.py | 7 ++--- spacy/default_config.cfg | 2 ++ spacy/errors.py | 5 +--- spacy/gold/augment.py | 5 ++-- spacy/lang/bn/__init__.py | 3 +- spacy/lang/ca/__init__.py | 3 +- spacy/lang/da/__init__.py | 8 +++++- spacy/lang/de/__init__.py | 22 +++++---------- spacy/lang/el/__init__.py | 12 ++++++-- spacy/lang/en/__init__.py | 20 ++++++-------- spacy/lang/es/__init__.py | 8 +++++- spacy/lang/fa/__init__.py | 3 +- spacy/lang/fr/__init__.py | 7 +++-- spacy/lang/hr/__init__.py | 3 +- spacy/lang/hu/__init__.py | 3 +- spacy/lang/id/__init__.py | 8 +++++- spacy/lang/it/__init__.py | 3 +- spacy/lang/lb/__init__.py | 8 +++++- spacy/lang/lt/__init__.py | 3 +- spacy/lang/nb/__init__.py | 3 +- spacy/lang/nl/__init__.py | 7 +++-- spacy/lang/pl/__init__.py | 7 +++-- spacy/lang/pt/__init__.py | 3 +- spacy/lang/ro/__init__.py | 3 +- spacy/lang/ru/__init__.py | 5 ++++ spacy/lang/sr/__init__.py | 8 +++++- spacy/lang/sv/__init__.py | 3 +- spacy/lang/ta/__init__.py | 5 ++++ spacy/lang/th/__init__.py | 5 ++++ spacy/lang/tl/__init__.py | 3 +- spacy/lang/tr/__init__.py | 3 +- spacy/lang/ur/__init__.py | 3 +- spacy/language.py | 4 +-- spacy/lemmatizer.py | 19 +++++-------- spacy/lexeme.pyx | 10 +++---- spacy/lookups.py | 18 ++++++++---- spacy/schemas.py | 1 + spacy/tests/test_lemmatizer.py | 7 ++++- spacy/vocab.pxd | 1 - spacy/vocab.pyx | 50 +++++++++------------------------- 41 files changed, 174 insertions(+), 134 deletions(-) diff --git a/spacy/cli/init_model.py b/spacy/cli/init_model.py index 9fb346006..f0c80bb8c 100644 --- a/spacy/cli/init_model.py +++ b/spacy/cli/init_model.py @@ -112,10 +112,9 @@ def init_model( # Create empty extra lexeme tables so the data from spacy-lookups-data # isn't loaded if these features are accessed if omit_extra_lookups: - nlp.vocab.lookups_extra = Lookups() - nlp.vocab.lookups_extra.add_table("lexeme_cluster") - nlp.vocab.lookups_extra.add_table("lexeme_prob") - nlp.vocab.lookups_extra.add_table("lexeme_settings") + nlp.vocab.lookups.remove_table("lexeme_cluster") + nlp.vocab.lookups.remove_table("lexeme_prob") + nlp.vocab.lookups.remove_table("lexeme_settings") msg.good("Successfully created model") if vectors_loc is not None: diff --git a/spacy/cli/train.py b/spacy/cli/train.py index 6ff665368..310580dbb 100644 --- a/spacy/cli/train.py +++ b/spacy/cli/train.py @@ -123,10 +123,9 @@ def train( # Create empty extra lexeme tables so the data from spacy-lookups-data # isn't loaded if these features are accessed if config["training"]["omit_extra_lookups"]: - nlp.vocab.lookups_extra = Lookups() - nlp.vocab.lookups_extra.add_table("lexeme_cluster") - nlp.vocab.lookups_extra.add_table("lexeme_prob") - nlp.vocab.lookups_extra.add_table("lexeme_settings") + nlp.vocab.lookups.remove_table("lexeme_cluster") + nlp.vocab.lookups.remove_table("lexeme_prob") + nlp.vocab.lookups.remove_table("lexeme_settings") # Load a pretrained tok2vec model - cf. 
CLI command 'pretrain' if weights_data is not None: diff --git a/spacy/default_config.cfg b/spacy/default_config.cfg index 7e6c7a6ec..747194cb4 100644 --- a/spacy/default_config.cfg +++ b/spacy/default_config.cfg @@ -2,6 +2,7 @@ lang = null stop_words = [] lex_attr_getters = {} +vocab_data = {} pipeline = [] [nlp.tokenizer] @@ -9,6 +10,7 @@ pipeline = [] [nlp.lemmatizer] @lemmatizers = "spacy.Lemmatizer.v1" +data = {} [nlp.writing_system] direction = "ltr" diff --git a/spacy/errors.py b/spacy/errors.py index f6c7a569f..719e0204b 100644 --- a/spacy/errors.py +++ b/spacy/errors.py @@ -434,9 +434,6 @@ class Errors: E170 = ("Cannot apply transition {name}: invalid for the current state.") E171 = ("Matcher.add received invalid on_match callback argument: expected " "callable or None, but got: {arg_type}") - E172 = ("The Lemmatizer.load classmethod is deprecated. To create a " - "Lemmatizer, initialize the class directly. See the docs for " - "details: https://spacy.io/api/lemmatizer") E175 = ("Can't remove rule for unknown match pattern ID: {key}") E176 = ("Alias '{alias}' is not defined in the Knowledge Base.") E177 = ("Ill-formed IOB input detected: {tag}") @@ -601,7 +598,7 @@ class Errors: "the same `Vocab`.") E1000 = ("No pkuseg model available. Provide a pkuseg model when " "initializing the pipeline:\n" - 'cfg = {"tokenizer": {"segmenter": "pkuseg", "pkuseg_model": name_or_path}}\m' + 'cfg = {"tokenizer": {"segmenter": "pkuseg", "pkuseg_model": name_or_path}}\n' 'nlp = Chinese(config=cfg)') diff --git a/spacy/gold/augment.py b/spacy/gold/augment.py index 45cfc0abe..790762617 100644 --- a/spacy/gold/augment.py +++ b/spacy/gold/augment.py @@ -25,8 +25,9 @@ def make_orth_variants(nlp, raw_text, orig_token_dict, orth_variant_level=0.0): lower = True if raw is not None: raw = raw.lower() - ndsv = nlp.Defaults.single_orth_variants - ndpv = nlp.Defaults.paired_orth_variants + orth_variants = nlp.vocab.lookups.get_table("orth_variants", {}) + ndsv = orth_variants.get("single", []) + ndpv = orth_variants.get("pairsed", []) words = token_dict.get("words", []) tags = token_dict.get("tags", []) # keep unmodified if words or tags are not defined diff --git a/spacy/lang/bn/__init__.py b/spacy/lang/bn/__init__.py index 2ac771537..4b80e0c41 100644 --- a/spacy/lang/bn/__init__.py +++ b/spacy/lang/bn/__init__.py @@ -17,9 +17,10 @@ stop_words = {"@language_data": "spacy.bn.stop_words"} [nlp.lemmatizer] @lemmatizers = "spacy.Lemmatizer.v1" -[nlp.lemmatizer.data_paths] +[nlp.lemmatizer.data] @language_data = "spacy-lookups-data" lang = ${nlp:lang} +tables = ["lemma_rules"] """ diff --git a/spacy/lang/ca/__init__.py b/spacy/lang/ca/__init__.py index d2924e902..cab47555d 100644 --- a/spacy/lang/ca/__init__.py +++ b/spacy/lang/ca/__init__.py @@ -19,9 +19,10 @@ lex_attr_getters = {"@language_data": "spacy.ca.lex_attr_getters"} [nlp.lemmatizer] @lemmatizers = "spacy.Lemmatizer.v1" -[nlp.lemmatizer.data_paths] +[nlp.lemmatizer.data] @language_data = "spacy-lookups-data" lang = ${nlp:lang} +tables = ["lemma_lookup"] """ diff --git a/spacy/lang/da/__init__.py b/spacy/lang/da/__init__.py index 82ed5ed34..4f3802b21 100644 --- a/spacy/lang/da/__init__.py +++ b/spacy/lang/da/__init__.py @@ -19,9 +19,15 @@ lex_attr_getters = {"@language_data": "spacy.da.lex_attr_getters"} [nlp.lemmatizer] @lemmatizers = "spacy.Lemmatizer.v1" -[nlp.lemmatizer.data_paths] +[nlp.lemmatizer.data] @language_data = "spacy-lookups-data" lang = ${nlp:lang} +tables = ["lemma_lookup"] + +[nlp.vocab_data] +@language_data = "spacy-lookups-data" +lang 
= ${nlp:lang} +tables = ["lexeme_norm"] """ diff --git a/spacy/lang/de/__init__.py b/spacy/lang/de/__init__.py index a5c38bd39..d620ded58 100644 --- a/spacy/lang/de/__init__.py +++ b/spacy/lang/de/__init__.py @@ -19,9 +19,15 @@ stop_words = {"@language_data": "spacy.de.stop_words"} [nlp.lemmatizer] @lemmatizers = "spacy.Lemmatizer.v1" -[nlp.lemmatizer.data_paths] +[nlp.lemmatizer.data] @language_data = "spacy-lookups-data" lang = ${nlp:lang} +tables = ["lemma_lookup"] + +[nlp.vocab_data] +@language_data = "spacy-lookups-data" +lang = ${nlp:lang} +tables = ["lexeme_norm", "lexeme_cluster", "lexeme_prob", "lexeme_settings", "orth_variants"] """ @@ -36,20 +42,6 @@ class GermanDefaults(Language.Defaults): suffixes = TOKENIZER_SUFFIXES infixes = TOKENIZER_INFIXES syntax_iterators = SYNTAX_ITERATORS - single_orth_variants = [ - {"tags": ["$("], "variants": ["…", "..."]}, - {"tags": ["$("], "variants": ["-", "—", "–", "--", "---", "——"]}, - ] - paired_orth_variants = [ - { - "tags": ["$("], - "variants": [("'", "'"), (",", "'"), ("‚", "‘"), ("›", "‹"), ("‹", "›")], - }, - { - "tags": ["$("], - "variants": [("``", "''"), ('"', '"'), ("„", "“"), ("»", "«"), ("«", "»")], - }, - ] class German(Language): diff --git a/spacy/lang/el/__init__.py b/spacy/lang/el/__init__.py index 2fd8647fb..65c634340 100644 --- a/spacy/lang/el/__init__.py +++ b/spacy/lang/el/__init__.py @@ -21,15 +21,21 @@ lex_attr_getters = {"@language_data": "spacy.el.lex_attr_getters"} [nlp.lemmatizer] @lemmatizers = "spacy.GreekLemmatizer.v1" -[nlp.lemmatizer.data_paths] +[nlp.lemmatizer.data] @language_data = "spacy-lookups-data" lang = ${nlp:lang} +tables = ["lemma_index", "lemma_exc", "lemma_rules"] + +[nlp.vocab_data] +@language_data = "spacy-lookups-data" +lang = ${nlp:lang} +tables = ["lexeme_norm", "lexeme_prob", "lexeme_settings"] """ @registry.lemmatizers("spacy.GreekLemmatizer.v1") -def create_greek_lemmatizer(data_paths: dict = {}) -> GreekLemmatizer: - return GreekLemmatizer(data_paths=data_paths) +def create_greek_lemmatizer(data: Dict[str, dict] = {}) -> GreekLemmatizer: + return GreekLemmatizer(data=data) @registry.language_data("spacy.el.stop_words") diff --git a/spacy/lang/en/__init__.py b/spacy/lang/en/__init__.py index 4a69b2a41..3e21cf21b 100644 --- a/spacy/lang/en/__init__.py +++ b/spacy/lang/en/__init__.py @@ -22,9 +22,15 @@ lex_attr_getters = {"@language_data": "spacy.en.lex_attr_getters"} [nlp.lemmatizer] @lemmatizers = "spacy.EnglishLemmatizer.v1" -[nlp.lemmatizer.data_paths] +[nlp.lemmatizer.data] @language_data = "spacy-lookups-data" lang = ${nlp:lang} +tables = ["lemma_lookup", "lemma_rules", "lemma_exc", "lemma_index"] + +[nlp.vocab_data] +@language_data = "spacy-lookups-data" +lang = ${nlp:lang} +tables = ["lexeme_norm", "lexeme_cluster", "lexeme_prob", "lexeme_settings", "orth_variants"] """ @@ -39,22 +45,14 @@ def lex_attr_getters() -> Dict[int, Callable[[str], Any]]: @registry.lemmatizers("spacy.EnglishLemmatizer.v1") -def create_lemmatizer(data_paths: dict = {}) -> "Lemmatizer": - return Lemmatizer(data_paths=data_paths, is_base_form=is_base_form) +def create_lemmatizer(data: Dict[str, dict] = {}) -> "Lemmatizer": + return Lemmatizer(data=data, is_base_form=is_base_form) class EnglishDefaults(Language.Defaults): tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS) syntax_iterators = SYNTAX_ITERATORS infixes = TOKENIZER_INFIXES - single_orth_variants = [ - {"tags": ["NFP"], "variants": ["…", "..."]}, - {"tags": [":"], "variants": ["-", "—", "–", "--", "---", "——"]}, - ] - 
paired_orth_variants = [ - {"tags": ["``", "''"], "variants": [("'", "'"), ("‘", "’")]}, - {"tags": ["``", "''"], "variants": [('"', '"'), ("“", "”")]}, - ] class English(Language): diff --git a/spacy/lang/es/__init__.py b/spacy/lang/es/__init__.py index 4425bfc01..52aef4521 100644 --- a/spacy/lang/es/__init__.py +++ b/spacy/lang/es/__init__.py @@ -20,9 +20,15 @@ lex_attr_getters = {"@language_data": "spacy.es.lex_attr_getters"} [nlp.lemmatizer] @lemmatizers = "spacy.Lemmatizer.v1" -[nlp.lemmatizer.data_paths] +[nlp.lemmatizer.data] @language_data = "spacy-lookups-data" lang = ${nlp:lang} +tables = ["lemma_lookup"] + +[nlp.vocab_data] +@language_data = "spacy-lookups-data" +lang = ${nlp:lang} +tables = ["lexeme_cluster", "lexeme_prob", "lexeme_settings"] """ diff --git a/spacy/lang/fa/__init__.py b/spacy/lang/fa/__init__.py index 085f400a4..41e40ca30 100644 --- a/spacy/lang/fa/__init__.py +++ b/spacy/lang/fa/__init__.py @@ -24,9 +24,10 @@ has_letters = true [nlp.lemmatizer] @lemmatizers = "spacy.Lemmatizer.v1" -[nlp.lemmatizer.data_paths] +[nlp.lemmatizer.data] @language_data = "spacy-lookups-data" lang = ${nlp:lang} +tables = ["lemma_rules", "lemma_index", "lemma_exc"] """ diff --git a/spacy/lang/fr/__init__.py b/spacy/lang/fr/__init__.py index 8140a21b6..4ec30cbd9 100644 --- a/spacy/lang/fr/__init__.py +++ b/spacy/lang/fr/__init__.py @@ -22,15 +22,16 @@ lex_attr_getters = {"@language_data": "spacy.fr.lex_attr_getters"} [nlp.lemmatizer] @lemmatizers = "spacy.FrenchLemmatizer.v1" -[nlp.lemmatizer.data_paths] +[nlp.lemmatizer.data] @language_data = "spacy-lookups-data" lang = ${nlp:lang} +tables = ["lemma_rules", "lemma_index", "lemma_exc", "lemma_lookup"] """ @registry.lemmatizers("spacy.FrenchLemmatizer.v1") -def create_french_lemmatizer(data_paths: dict = {}) -> FrenchLemmatizer: - return FrenchLemmatizer(data_paths=data_paths, is_base_form=is_base_form) +def create_french_lemmatizer(data: Dict[str, dict] = {}) -> FrenchLemmatizer: + return FrenchLemmatizer(data=data, is_base_form=is_base_form) @registry.language_data("spacy.fr.stop_words") diff --git a/spacy/lang/hr/__init__.py b/spacy/lang/hr/__init__.py index 648186093..e841ee24d 100644 --- a/spacy/lang/hr/__init__.py +++ b/spacy/lang/hr/__init__.py @@ -15,9 +15,10 @@ stop_words = {"@language_data": "spacy.hr.stop_words"} [nlp.lemmatizer] @lemmatizers = "spacy.Lemmatizer.v1" -[nlp.lemmatizer.data_paths] +[nlp.lemmatizer.data] @language_data = "spacy-lookups-data" lang = ${nlp:lang} +tables = ["lemma_lookup"] """ diff --git a/spacy/lang/hu/__init__.py b/spacy/lang/hu/__init__.py index 3e83e971a..2cfd61dfa 100644 --- a/spacy/lang/hu/__init__.py +++ b/spacy/lang/hu/__init__.py @@ -17,9 +17,10 @@ stop_words = {"@language_data": "spacy.hu.stop_words"} [nlp.lemmatizer] @lemmatizers = "spacy.Lemmatizer.v1" -[nlp.lemmatizer.data_paths] +[nlp.lemmatizer.data] @language_data = "spacy-lookups-data" lang = ${nlp:lang} +tables = ["lemma_lookup"] """ diff --git a/spacy/lang/id/__init__.py b/spacy/lang/id/__init__.py index b8b34aa26..8998addb4 100644 --- a/spacy/lang/id/__init__.py +++ b/spacy/lang/id/__init__.py @@ -20,9 +20,15 @@ lex_attr_getters = {"@language_data": "spacy.id.lex_attr_getters"} [nlp.lemmatizer] @lemmatizers = "spacy.Lemmatizer.v1" -[nlp.lemmatizer.data_paths] +[nlp.lemmatizer.data] @language_data = "spacy-lookups-data" lang = ${nlp:lang} +tables = ["lemma_lookup"] + +[nlp.vocab_data] +@language_data = "spacy-lookups-data" +lang = ${nlp:lang} +tables = ["lexeme_norm"] """ diff --git a/spacy/lang/it/__init__.py 
b/spacy/lang/it/__init__.py index 1b0a15348..f6b6afa59 100644 --- a/spacy/lang/it/__init__.py +++ b/spacy/lang/it/__init__.py @@ -17,9 +17,10 @@ stop_words = {"@language_data": "spacy.it.stop_words"} [nlp.lemmatizer] @lemmatizers = "spacy.Lemmatizer.v1" -[nlp.lemmatizer.data_paths] +[nlp.lemmatizer.data] @language_data = "spacy-lookups-data" lang = ${nlp:lang} +tables = ["lemma_lookup"] """ diff --git a/spacy/lang/lb/__init__.py b/spacy/lang/lb/__init__.py index 54e4e82c0..d381bb2e7 100644 --- a/spacy/lang/lb/__init__.py +++ b/spacy/lang/lb/__init__.py @@ -19,9 +19,15 @@ lex_attr_getters = {"@language_data": "spacy.lb.lex_attr_getters"} [nlp.lemmatizer] @lemmatizers = "spacy.Lemmatizer.v1" -[nlp.lemmatizer.data_paths] +[nlp.lemmatizer.data] @language_data = "spacy-lookups-data" lang = ${nlp:lang} +tables = ["lemma_lookup"] + +[nlp.vocab_data] +@language_data = "spacy-lookups-data" +lang = ${nlp:lang} +tables = ["lexeme_norm"] """ diff --git a/spacy/lang/lt/__init__.py b/spacy/lang/lt/__init__.py index 656df79c9..23c11f3a1 100644 --- a/spacy/lang/lt/__init__.py +++ b/spacy/lang/lt/__init__.py @@ -19,9 +19,10 @@ lex_attr_getters = {"@language_data": "spacy.lt.lex_attr_getters"} [nlp.lemmatizer] @lemmatizers = "spacy.Lemmatizer.v1" -[nlp.lemmatizer.data_paths] +[nlp.lemmatizer.data] @language_data = "spacy-lookups-data" lang = ${nlp:lang} +tables = ["lemma_lookup"] """ diff --git a/spacy/lang/nb/__init__.py b/spacy/lang/nb/__init__.py index e472b0c60..3b386344b 100644 --- a/spacy/lang/nb/__init__.py +++ b/spacy/lang/nb/__init__.py @@ -19,9 +19,10 @@ stop_words = {"@language_data": "spacy.nb.stop_words"} [nlp.lemmatizer] @lemmatizers = "spacy.Lemmatizer.v1" -[nlp.lemmatizer.data_paths] +[nlp.lemmatizer.data] @language_data = "spacy-lookups-data" lang = ${nlp:lang} +tables = ["lemma_lookup", "lemma_rules", "lemma_exc"] """ diff --git a/spacy/lang/nl/__init__.py b/spacy/lang/nl/__init__.py index 7e9806bc3..ab2cf3a94 100644 --- a/spacy/lang/nl/__init__.py +++ b/spacy/lang/nl/__init__.py @@ -21,9 +21,10 @@ lex_attr_getters = {"@language_data": "spacy.nl.lex_attr_getters"} [nlp.lemmatizer] @lemmatizers = "spacy.DutchLemmatizer.v1" -[nlp.lemmatizer.data_paths] +[nlp.lemmatizer.data] @language_data = "spacy-lookups-data" lang = ${nlp:lang} +tables = ["lemma_rules", "lemma_index", "lemma_exc", "lemma_lookup"] """ @@ -38,8 +39,8 @@ def lex_attr_getters() -> Dict[int, Callable[[str], Any]]: @registry.lemmatizers("spacy.DutchLemmatizer.v1") -def create_dutch_lemmatizer(data_paths: dict = {}) -> DutchLemmatizer: - return DutchLemmatizer(data_paths=data_paths) +def create_dutch_lemmatizer(data: Dict[str, dict] = {}) -> DutchLemmatizer: + return DutchLemmatizer(data=data) class DutchDefaults(Language.Defaults): diff --git a/spacy/lang/pl/__init__.py b/spacy/lang/pl/__init__.py index 87a174ec8..82957dc7a 100644 --- a/spacy/lang/pl/__init__.py +++ b/spacy/lang/pl/__init__.py @@ -20,9 +20,10 @@ lex_attr_getters = {"@language_data": "spacy.pl.lex_attr_getters"} [nlp.lemmatizer] @lemmatizers = "spacy.PolishLemmatizer.v1" -[nlp.lemmatizer.data_paths] +[nlp.lemmatizer.data] @language_data = "spacy-lookups-data" lang = ${nlp:lang} +tables = ["lemma_lookup_adj", "lemma_lookup_adp", "lemma_lookup_adv", "lemma_lookup_aux", "lemma_lookup_noun", "lemma_lookup_num", "lemma_lookup_part", "lemma_lookup_pron", "lemma_lookup_verb"] """ @@ -37,8 +38,8 @@ def lex_attr_getters() -> Dict[int, Callable[[str], Any]]: @registry.lemmatizers("spacy.PolishLemmatizer.v1") -def create_polish_lemmatizer(data_paths: dict = {}) -> 
PolishLemmatizer: - return PolishLemmatizer(data_paths=data_paths) +def create_polish_lemmatizer(data: Dict[str, dict] = {}) -> PolishLemmatizer: + return PolishLemmatizer(data=data) class PolishDefaults(Language.Defaults): diff --git a/spacy/lang/pt/__init__.py b/spacy/lang/pt/__init__.py index 6dc22ed61..045bd3bc1 100644 --- a/spacy/lang/pt/__init__.py +++ b/spacy/lang/pt/__init__.py @@ -19,9 +19,10 @@ lex_attr_getters = {"@language_data": "spacy.pt.lex_attr_getters"} [nlp.lemmatizer] @lemmatizers = "spacy.Lemmatizer.v1" -[nlp.lemmatizer.data_paths] +[nlp.lemmatizer.data] @language_data = "spacy-lookups-data" lang = ${nlp:lang} +tables = ["lemma_lookup"] """ diff --git a/spacy/lang/ro/__init__.py b/spacy/lang/ro/__init__.py index b66b7767c..740bd7911 100644 --- a/spacy/lang/ro/__init__.py +++ b/spacy/lang/ro/__init__.py @@ -22,9 +22,10 @@ stop_words = {"@language_data": "spacy.ro.stop_words"} [nlp.lemmatizer] @lemmatizers = "spacy.Lemmatizer.v1" -[nlp.lemmatizer.data_paths] +[nlp.lemmatizer.data] @language_data = "spacy-lookups-data" lang = ${nlp:lang} +tables = ["lemma_lookup"] """ diff --git a/spacy/lang/ru/__init__.py b/spacy/lang/ru/__init__.py index 004a8d83a..e9e28dfb5 100644 --- a/spacy/lang/ru/__init__.py +++ b/spacy/lang/ru/__init__.py @@ -18,6 +18,11 @@ lex_attr_getters = {"@language_data": "spacy.ru.lex_attr_getters"} [nlp.lemmatizer] @lemmatizers = "spacy.RussianLemmatizer.v1" + +[nlp.vocab_data] +@language_data = "spacy-lookups-data" +lang = ${nlp:lang} +tables = ["lexeme_norm"] """ diff --git a/spacy/lang/sr/__init__.py b/spacy/lang/sr/__init__.py index fd53d3826..f69ad3a89 100644 --- a/spacy/lang/sr/__init__.py +++ b/spacy/lang/sr/__init__.py @@ -18,9 +18,15 @@ lex_attr_getters = {"@language_data": "spacy.sr.lex_attr_getters"} [nlp.lemmatizer] @lemmatizers = "spacy.Lemmatizer.v1" -[nlp.lemmatizer.data_paths] +[nlp.lemmatizer.data] @language_data = "spacy-lookups-data" lang = ${nlp:lang} +tables = ["lemma_lookup"] + +[nlp.vocab_data] +@language_data = "spacy-lookups-data" +lang = ${nlp:lang} +tables = ["lexeme_norm"] """ diff --git a/spacy/lang/sv/__init__.py b/spacy/lang/sv/__init__.py index 5c376fd51..c18ad775d 100644 --- a/spacy/lang/sv/__init__.py +++ b/spacy/lang/sv/__init__.py @@ -22,9 +22,10 @@ lex_attr_getters = {"@language_data": "spacy.sv.lex_attr_getters"} [nlp.lemmatizer] @lemmatizers = "spacy.Lemmatizer.v1" -[nlp.lemmatizer.data_paths] +[nlp.lemmatizer.data] @language_data = "spacy-lookups-data" lang = ${nlp:lang} +tables = ["lemma_lookup", "lemma_rules"] """ diff --git a/spacy/lang/ta/__init__.py b/spacy/lang/ta/__init__.py index 983bd5de4..c429127c9 100644 --- a/spacy/lang/ta/__init__.py +++ b/spacy/lang/ta/__init__.py @@ -12,6 +12,11 @@ DEFAULT_CONFIG = """ lang = "ta" stop_words = {"@language_data": "spacy.ta.stop_words"} lex_attr_getters = {"@language_data": "spacy.ta.lex_attr_getters"} + +[nlp.vocab_data] +@language_data = "spacy-lookups-data" +lang = ${nlp:lang} +tables = ["lexeme_norm"] """ diff --git a/spacy/lang/th/__init__.py b/spacy/lang/th/__init__.py index 116355342..1fdf4311e 100644 --- a/spacy/lang/th/__init__.py +++ b/spacy/lang/th/__init__.py @@ -16,6 +16,11 @@ lex_attr_getters = {"@language_data": "spacy.th.lex_attr_getters"} [nlp.tokenizer] @tokenizers = "spacy.ThaiTokenizer.v1" + +[nlp.vocab_data] +@language_data = "spacy-lookups-data" +lang = ${nlp:lang} +tables = ["lexeme_norm"] """ diff --git a/spacy/lang/tl/__init__.py b/spacy/lang/tl/__init__.py index c52adb046..a7158e6f6 100644 --- a/spacy/lang/tl/__init__.py +++ 
b/spacy/lang/tl/__init__.py @@ -18,9 +18,10 @@ lex_attr_getters = {"@language_data": "spacy.tl.lex_attr_getters"} [nlp.lemmatizer] @lemmatizers = "spacy.Lemmatizer.v1" -[nlp.lemmatizer.data_paths] +[nlp.lemmatizer.data] @language_data = "spacy-lookups-data" lang = ${nlp:lang} +tables = ["lemma_lookup"] """ diff --git a/spacy/lang/tr/__init__.py b/spacy/lang/tr/__init__.py index f6782b419..dff56e945 100644 --- a/spacy/lang/tr/__init__.py +++ b/spacy/lang/tr/__init__.py @@ -16,9 +16,10 @@ stop_words = {"@language_data": "spacy.tr.stop_words"} [nlp.lemmatizer] @lemmatizers = "spacy.Lemmatizer.v1" -[nlp.lemmatizer.data_paths] +[nlp.lemmatizer.data] @language_data = "spacy-lookups-data" lang = ${nlp:lang} +tables = ["lemma_lookup"] """ diff --git a/spacy/lang/ur/__init__.py b/spacy/lang/ur/__init__.py index c7977d6b8..db714c296 100644 --- a/spacy/lang/ur/__init__.py +++ b/spacy/lang/ur/__init__.py @@ -23,9 +23,10 @@ has_letters = true [nlp.lemmatizer] @lemmatizers = "spacy.Lemmatizer.v1" -[nlp.lemmatizer.data_paths] +[nlp.lemmatizer.data] @language_data = "spacy-lookups-data" lang = ${nlp:lang} +tables = ["lemma_lookup"] """ diff --git a/spacy/language.py b/spacy/language.py index 97c8f31b7..77d0b4b0e 100644 --- a/spacy/language.py +++ b/spacy/language.py @@ -55,8 +55,6 @@ class BaseDefaults: tokenizer_exceptions: Dict[str, List[dict]] = {} morph_rules: Dict[str, Dict[str, dict]] = {} syntax_iterators: Dict[str, Callable[[Union[Doc, Span]], Iterator]] = {} - single_orth_variants: List[Dict[str, List[str]]] = [] - paired_orth_variants: List[Dict[str, Union[List[str], List[Tuple[str, str]]]]] = [] class Language: @@ -1268,11 +1266,13 @@ class Language: lemmatizer = resolved["nlp"]["lemmatizer"] lex_attr_getters = resolved["nlp"]["lex_attr_getters"] stop_words = resolved["nlp"]["stop_words"] + vocab_data = resolved["nlp"]["vocab_data"] vocab = Vocab.from_config( filled, lemmatizer=lemmatizer, lex_attr_getters=lex_attr_getters, stop_words=stop_words, + vocab_data=vocab_data, # TODO: what should we do with these? tag_map=cls.Defaults.tag_map, morph_rules=cls.Defaults.morph_rules, diff --git a/spacy/lemmatizer.py b/spacy/lemmatizer.py index 81dbf4ea3..8255b4b36 100644 --- a/spacy/lemmatizer.py +++ b/spacy/lemmatizer.py @@ -1,14 +1,13 @@ from typing import Optional, Callable, List, Dict from .lookups import Lookups -from .errors import Errors from .parts_of_speech import NAMES as UPOS_NAMES -from .util import registry, load_language_data, SimpleFrozenDict +from .util import registry @registry.lemmatizers("spacy.Lemmatizer.v1") -def create_lemmatizer(data_paths: dict = {}) -> "Lemmatizer": - return Lemmatizer(data_paths=data_paths) +def create_lemmatizer(data: Dict[str, str] = {}) -> "Lemmatizer": + return Lemmatizer(data=data) class Lemmatizer: @@ -19,14 +18,10 @@ class Lemmatizer: DOCS: https://spacy.io/api/lemmatizer """ - @classmethod - def load(cls, *args, **kwargs): - raise NotImplementedError(Errors.E172) - def __init__( self, lookups: Optional[Lookups] = None, - data_paths: dict = SimpleFrozenDict(), + data: Dict[str, dict] = {}, is_base_form: Optional[Callable] = None, ) -> None: """Initialize a Lemmatizer. @@ -36,9 +31,9 @@ class Lemmatizer: RETURNS (Lemmatizer): The newly constructed object. 
""" self.lookups = lookups if lookups is not None else Lookups() - for name, filename in data_paths.items(): - data = load_language_data(filename) - self.lookups.add_table(name, data) + for name, table in data.items(): + if table is not None: + self.lookups.add_table(name, table) self.is_base_form = is_base_form def __call__( diff --git a/spacy/lexeme.pyx b/spacy/lexeme.pyx index edaf874a3..25461b4b7 100644 --- a/spacy/lexeme.pyx +++ b/spacy/lexeme.pyx @@ -251,11 +251,11 @@ cdef class Lexeme: property cluster: """RETURNS (int): Brown cluster ID.""" def __get__(self): - cluster_table = self.vocab.load_extra_lookups("lexeme_cluster") + cluster_table = self.vocab.lookups.get_table("lexeme_cluster", {}) return cluster_table.get(self.c.orth, 0) def __set__(self, int x): - cluster_table = self.vocab.load_extra_lookups("lexeme_cluster") + cluster_table = self.vocab.lookups.get_table("lexeme_cluster", {}) cluster_table[self.c.orth] = x property lang: @@ -270,13 +270,13 @@ cdef class Lexeme: """RETURNS (float): Smoothed log probability estimate of the lexeme's type.""" def __get__(self): - prob_table = self.vocab.load_extra_lookups("lexeme_prob") - settings_table = self.vocab.load_extra_lookups("lexeme_settings") + prob_table = self.vocab.lookups.get_table("lexeme_prob", {}) + settings_table = self.vocab.lookups.get_table("lexeme_settings", {}) default_oov_prob = settings_table.get("oov_prob", -20.0) return prob_table.get(self.c.orth, default_oov_prob) def __set__(self, float x): - prob_table = self.vocab.load_extra_lookups("lexeme_prob") + prob_table = self.vocab.lookups.get_table("lexeme_prob", {}) prob_table[self.c.orth] = x property lower_: diff --git a/spacy/lookups.py b/spacy/lookups.py index b03a326b6..d5def882e 100644 --- a/spacy/lookups.py +++ b/spacy/lookups.py @@ -5,7 +5,7 @@ from preshed.bloom import BloomFilter from collections import OrderedDict from .errors import Errors -from .util import SimpleFrozenDict, ensure_path, registry +from .util import SimpleFrozenDict, ensure_path, registry, load_language_data from .strings import get_string_id @@ -13,18 +13,26 @@ UNSET = object() @registry.language_data("spacy-lookups-data") -def get_lookups(lang: str) -> Dict[str, Any]: +def get_lookups(lang: str, tables: List[str]) -> Optional[Dict[str, Any]]: """Load the data from the spacy-lookups-data package for a given language, if available. Returns an empty dict if there's no data or if the package is not installed. lang (str): The language code (corresponds to entry point exposed by the spacy-lookups-data package). + tables (List[str]): Name of tables to load, e.g. ["lemma_lookup", "lemma_exc"] RETURNS (Dict[str, Any]): The lookups, keyed by table name. """ - if lang in registry.lookups: - return registry.lookups.get(lang) - return {} + # TODO: import spacy_lookups_data instead of going via entry points here? 
+ if lang not in registry.lookups: + return {} + data = registry.lookups.get(lang) + result = {} + for table in tables: + if table not in data: + raise ValueError("TODO: unknown table") + result[table] = load_language_data(data[table]) + return result class Lookups: diff --git a/spacy/schemas.py b/spacy/schemas.py index bd4939392..ba5e812ee 100644 --- a/spacy/schemas.py +++ b/spacy/schemas.py @@ -243,6 +243,7 @@ class ConfigSchemaNlp(BaseModel): writing_system: ConfigSchemaNlpWritingSystem = Field(..., title="The language's writing system") stop_words: Sequence[StrictStr] = Field(..., title="Stop words to mark via Token/Lexeme.is_stop") lex_attr_getters: Dict[StrictStr, Callable] = Field(..., title="Custom getter functions for lexical attributes (e.g. like_num)") + vocab_data: Dict[StrictStr, Dict[StrictStr, Any]] = Field(..., title="Vocabulary data, e.g. lexeme normalization tables") # fmt: on class Config: diff --git a/spacy/tests/test_lemmatizer.py b/spacy/tests/test_lemmatizer.py index 44f540132..3c904cb01 100644 --- a/spacy/tests/test_lemmatizer.py +++ b/spacy/tests/test_lemmatizer.py @@ -5,6 +5,7 @@ from spacy.lookups import Lookups from spacy.lemmatizer import Lemmatizer +@pytest.mark.skip(reason="We probably don't want to support this anymore in v3?") def test_lemmatizer_reflects_lookups_changes(): """Test for an issue that'd cause lookups available in a model loaded from disk to not be reflected in the lemmatizer.""" @@ -56,4 +57,8 @@ def test_lemmatizer_without_is_base_form_implementation(): lookups.add_table("lemma_exc", {"noun": {"formuesskatten": ["formuesskatt"]}}) lemmatizer = Lemmatizer(lookups, is_base_form=None) - assert lemmatizer("Formuesskatten", "noun", {'Definite': 'def', 'Gender': 'masc', 'Number': 'sing'}) == ["formuesskatt"] + assert lemmatizer( + "Formuesskatten", + "noun", + {"Definite": "def", "Gender": "masc", "Number": "sing"}, + ) == ["formuesskatt"] diff --git a/spacy/vocab.pxd b/spacy/vocab.pxd index f93b6cffe..a31c984ad 100644 --- a/spacy/vocab.pxd +++ b/spacy/vocab.pxd @@ -29,7 +29,6 @@ cdef class Vocab: cpdef public Morphology morphology cpdef public object vectors cpdef public object lookups - cpdef public object lookups_extra cpdef public object writing_system cdef readonly int length cdef public object data_dir diff --git a/spacy/vocab.pyx b/spacy/vocab.pyx index 3ab90dd2f..1afee4f69 100644 --- a/spacy/vocab.pyx +++ b/spacy/vocab.pyx @@ -31,7 +31,7 @@ cdef class Vocab: DOCS: https://spacy.io/api/vocab """ def __init__(self, lex_attr_getters=None, tag_map=None, lemmatizer=None, - strings=tuple(), lookups=None, lookups_extra=None, + strings=tuple(), lookups=None, vocab_data={}, oov_prob=-20., vectors_name=None, writing_system={}, **deprecated_kwargs): """Create the vocabulary. @@ -44,7 +44,6 @@ cdef class Vocab: strings (StringStore): StringStore that maps strings to integers, and vice versa. lookups (Lookups): Container for large lookup tables and dictionaries. - lookups_extra (Lookups): Container for optional lookup tables and dictionaries. oov_prob (float): Default OOV probability. vectors_name (unicode): Optional name to identify the vectors table. RETURNS (Vocab): The newly constructed object. 
@@ -53,12 +52,12 @@ cdef class Vocab: tag_map = tag_map if tag_map is not None else {} if lookups in (None, True, False): lookups = Lookups() - if "lexeme_norm" not in lookups: - lookups.add_table("lexeme_norm") + for name, data in vocab_data.items(): + if name not in lookups: + data = data if data is not None else {} + lookups.add_table(name, data) if lemmatizer in (None, True, False): lemmatizer = Lemmatizer(lookups) - if lookups_extra in (None, True, False): - lookups_extra = Lookups() self.cfg = {'oov_prob': oov_prob} self.mem = Pool() self._by_orth = PreshMap() @@ -71,7 +70,6 @@ cdef class Vocab: self.morphology = Morphology(self.strings, tag_map, lemmatizer) self.vectors = Vectors(name=vectors_name) self.lookups = lookups - self.lookups_extra = lookups_extra self.writing_system = writing_system @property @@ -425,6 +423,7 @@ cdef class Vocab: lemmatizer=None, lex_attr_getters=None, stop_words=None, + vocab_data=None, vectors_name=None, tag_map=None, morph_rules=None @@ -444,12 +443,12 @@ cdef class Vocab: if not lemmatizer: lemma_cfg = {"lemmatizer": config["nlp"]["lemmatizer"]} lemmatizer = registry.make_from_config(lemma_cfg)["lemmatizer"] - lookups = lemmatizer.lookups - if "lexeme_norm" not in lookups: - lookups.add_table("lexeme_norm") if stop_words is None: stop_words_cfg = {"stop_words": config["nlp"]["stop_words"]} stop_words = registry.make_from_config(stop_words_cfg)["stop_words"] + if vocab_data is None: + vocab_data_cfg = {"vocab_data": config["nlp"]["vocab_data"]} + vocab_data = registry.make_from_config(vocab_data_cfg)["vocab_data"] if lex_attr_getters is None: lex_attrs_cfg = {"lex_attr_getters": config["nlp"]["lex_attr_getters"]} lex_attr_getters = registry.make_from_config(lex_attrs_cfg)["lex_attr_getters"] @@ -462,14 +461,12 @@ cdef class Vocab: lex_attrs[NORM] = util.add_lookups( lex_attrs.get(NORM, LEX_ATTRS[NORM]), BASE_NORMS, - # TODO: we need to move the lexeme norms to their own entry - # points so we can specify them separately from the lemma lookups - lookups.get_table("lexeme_norm"), + vocab_data.get("lexeme_norm", {}), ) vocab = cls( lex_attr_getters=lex_attrs, + vocab_data=vocab_data, lemmatizer=lemmatizer, - lookups=lookups, writing_system=writing_system, tag_map=tag_map, ) @@ -498,8 +495,6 @@ cdef class Vocab: self.vectors.to_disk(path) if "lookups" not in "exclude" and self.lookups is not None: self.lookups.to_disk(path) - if "lookups_extra" not in "exclude" and self.lookups_extra is not None: - self.lookups_extra.to_disk(path, filename="lookups_extra.bin") def from_disk(self, path, exclude=tuple()): """Loads state from a directory. 
Modifies the object in place and @@ -522,8 +517,6 @@ cdef class Vocab: link_vectors_to_models(self) if "lookups" not in exclude: self.lookups.from_disk(path) - if "lookups_extra" not in exclude: - self.lookups_extra.from_disk(path, filename="lookups_extra.bin") if "lexeme_norm" in self.lookups: self.lex_attr_getters[NORM] = util.add_lookups( self.lex_attr_getters.get(NORM, LEX_ATTRS[NORM]), self.lookups.get_table("lexeme_norm") @@ -550,7 +543,6 @@ cdef class Vocab: "strings": lambda: self.strings.to_bytes(), "vectors": deserialize_vectors, "lookups": lambda: self.lookups.to_bytes(), - "lookups_extra": lambda: self.lookups_extra.to_bytes() } return util.to_bytes(getters, exclude) @@ -574,7 +566,6 @@ cdef class Vocab: "lexemes": lambda b: self.lexemes_from_bytes(b), "vectors": lambda b: serialize_vectors(b), "lookups": lambda b: self.lookups.from_bytes(b), - "lookups_extra": lambda b: self.lookups_extra.from_bytes(b) } util.from_bytes(bytes_data, setters, exclude) if "lexeme_norm" in self.lookups: @@ -592,19 +583,6 @@ cdef class Vocab: raise NotImplementedError - def load_extra_lookups(self, table_name): - if table_name not in self.lookups_extra: - if self.lang + "_extra" in util.registry.lookups: - tables = util.registry.lookups.get(self.lang + "_extra") - for name, filename in tables.items(): - if table_name == name: - data = util.load_language_data(filename) - self.lookups_extra.add_table(name, data) - if table_name not in self.lookups_extra: - self.lookups_extra.add_table(table_name) - return self.lookups_extra.get_table(table_name) - - def pickle_vocab(vocab): sstore = vocab.strings vectors = vocab.vectors @@ -612,13 +590,12 @@ def pickle_vocab(vocab): data_dir = vocab.data_dir lex_attr_getters = srsly.pickle_dumps(vocab.lex_attr_getters) lookups = vocab.lookups - lookups_extra = vocab.lookups_extra return (unpickle_vocab, - (sstore, vectors, morph, data_dir, lex_attr_getters, lookups, lookups_extra)) + (sstore, vectors, morph, data_dir, lex_attr_getters, lookups)) def unpickle_vocab(sstore, vectors, morphology, data_dir, - lex_attr_getters, lookups, lookups_extra): + lex_attr_getters, lookups): cdef Vocab vocab = Vocab() vocab.vectors = vectors vocab.strings = sstore @@ -626,7 +603,6 @@ def unpickle_vocab(sstore, vectors, morphology, data_dir, vocab.data_dir = data_dir vocab.lex_attr_getters = srsly.pickle_loads(lex_attr_getters) vocab.lookups = lookups - vocab.lookups_extra = lookups_extra return vocab From 0fcd352179fb6fe0a5cad0b00238d215b38c7ba6 Mon Sep 17 00:00:00 2001 From: Ines Montani Date: Wed, 22 Jul 2020 16:01:17 +0200 Subject: [PATCH 02/14] Remove omit_extra_lookups --- spacy/cli/init_model.py | 10 ---------- spacy/cli/train.py | 7 ------- spacy/default_config.cfg | 1 - spacy/schemas.py | 1 - 4 files changed, 19 deletions(-) diff --git a/spacy/cli/init_model.py b/spacy/cli/init_model.py index f0c80bb8c..e1dca2395 100644 --- a/spacy/cli/init_model.py +++ b/spacy/cli/init_model.py @@ -41,7 +41,6 @@ def init_model_cli( truncate_vectors: int = Opt(0, "--truncate-vectors", "-t", help="Optional number of vectors to truncate to when reading in vectors file"), vectors_name: Optional[str] = Opt(None, "--vectors-name", "-vn", help="Optional name for the word vectors, e.g. 
en_core_web_lg.vectors"), model_name: Optional[str] = Opt(None, "--model-name", "-mn", help="Optional name for the model meta"), - omit_extra_lookups: bool = Opt(False, "--omit-extra-lookups", "-OEL", help="Don't include extra lookups in model"), base_model: Optional[str] = Opt(None, "--base-model", "-b", help="Base model (for languages with custom tokenizers)") # fmt: on ): @@ -60,7 +59,6 @@ def init_model_cli( truncate_vectors=truncate_vectors, vectors_name=vectors_name, model_name=model_name, - omit_extra_lookups=omit_extra_lookups, base_model=base_model, silent=False, ) @@ -77,7 +75,6 @@ def init_model( truncate_vectors: int = 0, vectors_name: Optional[str] = None, model_name: Optional[str] = None, - omit_extra_lookups: bool = False, base_model: Optional[str] = None, silent: bool = True, ) -> Language: @@ -109,13 +106,6 @@ def init_model( with msg.loading("Creating model..."): nlp = create_model(lang, lex_attrs, name=model_name, base_model=base_model) - # Create empty extra lexeme tables so the data from spacy-lookups-data - # isn't loaded if these features are accessed - if omit_extra_lookups: - nlp.vocab.lookups.remove_table("lexeme_cluster") - nlp.vocab.lookups.remove_table("lexeme_prob") - nlp.vocab.lookups.remove_table("lexeme_settings") - msg.good("Successfully created model") if vectors_loc is not None: add_vectors( diff --git a/spacy/cli/train.py b/spacy/cli/train.py index 310580dbb..e86fb58c9 100644 --- a/spacy/cli/train.py +++ b/spacy/cli/train.py @@ -120,13 +120,6 @@ def train( # Load morph rules nlp.vocab.morphology.load_morph_exceptions(morph_rules) - # Create empty extra lexeme tables so the data from spacy-lookups-data - # isn't loaded if these features are accessed - if config["training"]["omit_extra_lookups"]: - nlp.vocab.lookups.remove_table("lexeme_cluster") - nlp.vocab.lookups.remove_table("lexeme_prob") - nlp.vocab.lookups.remove_table("lexeme_settings") - # Load a pretrained tok2vec model - cf. CLI command 'pretrain' if weights_data is not None: tok2vec_path = config.get("pretraining", {}).get("tok2vec_model", None) diff --git a/spacy/default_config.cfg b/spacy/default_config.cfg index 747194cb4..125273c43 100644 --- a/spacy/default_config.cfg +++ b/spacy/default_config.cfg @@ -47,7 +47,6 @@ score_weights = {"tags_acc": 0.2, "las": 0.4, "ents_f": 0.4} # These settings are invalid for the transformer models. 
init_tok2vec = null discard_oversize = false -omit_extra_lookups = false batch_by = "sequences" raw_text = null tag_map = null diff --git a/spacy/schemas.py b/spacy/schemas.py index ba5e812ee..590032559 100644 --- a/spacy/schemas.py +++ b/spacy/schemas.py @@ -211,7 +211,6 @@ class ConfigSchemaTraining(BaseModel): score_weights: Dict[StrictStr, Union[StrictFloat, StrictInt]] = Field(..., title="Weights of each score type for selecting final model") init_tok2vec: Optional[StrictStr] = Field(..., title="Path to pretrained tok2vec weights") discard_oversize: StrictBool = Field(..., title="Whether to skip examples longer than batch size") - omit_extra_lookups: StrictBool = Field(..., title="Don't include extra lookups in model") batch_by: StrictStr = Field(..., title="Batch examples by type") raw_text: Optional[StrictStr] = Field(..., title="Raw text") tag_map: Optional[StrictStr] = Field(..., title="Path to JSON-formatted tag map") From 7fc4dadd22b7535401bdd698806c8216fb15f2e4 Mon Sep 17 00:00:00 2001 From: Ines Montani Date: Wed, 22 Jul 2020 20:27:22 +0200 Subject: [PATCH 03/14] Fix typo --- spacy/gold/augment.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/spacy/gold/augment.py b/spacy/gold/augment.py index 790762617..4a01c8589 100644 --- a/spacy/gold/augment.py +++ b/spacy/gold/augment.py @@ -27,7 +27,7 @@ def make_orth_variants(nlp, raw_text, orig_token_dict, orth_variant_level=0.0): raw = raw.lower() orth_variants = nlp.vocab.lookups.get_table("orth_variants", {}) ndsv = orth_variants.get("single", []) - ndpv = orth_variants.get("pairsed", []) + ndpv = orth_variants.get("paired", []) words = token_dict.get("words", []) tags = token_dict.get("tags", []) # keep unmodified if words or tags are not defined From b507f616292cbda3b293f1265a791b8221445a69 Mon Sep 17 00:00:00 2001 From: Ines Montani Date: Wed, 22 Jul 2020 22:18:46 +0200 Subject: [PATCH 04/14] Tidy up and move noun_chunks, token_match, url_match --- spacy/default_config.cfg | 3 + spacy/lang/ar/__init__.py | 6 +- spacy/lang/ar/tokenizer_exceptions.py | 4 +- spacy/lang/bn/__init__.py | 5 +- spacy/lang/bn/tokenizer_exceptions.py | 4 +- spacy/lang/ca/__init__.py | 5 +- spacy/lang/ca/tokenizer_exceptions.py | 4 +- spacy/lang/da/__init__.py | 5 +- spacy/lang/da/tokenizer_exceptions.py | 4 +- spacy/lang/de/__init__.py | 16 +- spacy/lang/de/syntax_iterators.py | 29 +- spacy/lang/de/tokenizer_exceptions.py | 4 +- spacy/lang/defaults.py | 9 + spacy/lang/el/__init__.py | 14 +- spacy/lang/el/syntax_iterators.py | 11 +- spacy/lang/el/tag_map_fine.py | 4265 ----------------------- spacy/lang/el/tokenizer_exceptions.py | 5 +- spacy/lang/en/__init__.py | 14 +- spacy/lang/en/syntax_iterators.py | 25 +- spacy/lang/en/tokenizer_exceptions.py | 4 +- spacy/lang/es/__init__.py | 14 +- spacy/lang/es/syntax_iterators.py | 28 +- spacy/lang/es/tokenizer_exceptions.py | 4 +- spacy/lang/fa/__init__.py | 13 +- spacy/lang/fi/__init__.py | 5 +- spacy/lang/fi/tokenizer_exceptions.py | 4 +- spacy/lang/fr/__init__.py | 26 +- spacy/lang/fr/syntax_iterators.py | 24 +- spacy/lang/fr/tokenizer_exceptions.py | 5 +- spacy/lang/ga/__init__.py | 5 +- spacy/lang/ga/tokenizer_exceptions.py | 4 +- spacy/lang/he/__init__.py | 4 +- spacy/lang/hr/__init__.py | 4 +- spacy/lang/hu/__init__.py | 17 +- spacy/lang/hu/tokenizer_exceptions.py | 4 +- spacy/lang/id/__init__.py | 14 +- spacy/lang/id/syntax_iterators.py | 20 +- spacy/lang/id/tokenizer_exceptions.py | 5 +- spacy/lang/it/__init__.py | 5 +- spacy/lang/it/tokenizer_exceptions.py | 5 +- 
spacy/lang/ja/__init__.py | 18 +- spacy/lang/ja/syntax_iterators.py | 36 +- spacy/lang/ko/__init__.py | 7 +- spacy/lang/lb/__init__.py | 5 +- spacy/lang/lb/tokenizer_exceptions.py | 5 +- spacy/lang/lij/__init__.py | 5 +- spacy/lang/lij/tokenizer_exceptions.py | 5 +- spacy/lang/lt/__init__.py | 9 +- spacy/lang/lt/tokenizer_exceptions.py | 270 +- spacy/lang/nb/__init__.py | 16 +- spacy/lang/nb/syntax_iterators.py | 24 +- spacy/lang/nb/tokenizer_exceptions.py | 4 +- spacy/lang/nl/__init__.py | 5 +- spacy/lang/nl/tokenizer_exceptions.py | 5 +- spacy/lang/pt/__init__.py | 5 +- spacy/lang/pt/tokenizer_exceptions.py | 4 +- spacy/lang/ro/__init__.py | 5 +- spacy/lang/ro/tokenizer_exceptions.py | 4 +- spacy/lang/ru/__init__.py | 5 +- spacy/lang/ru/tokenizer_exceptions.py | 4 +- spacy/lang/sr/__init__.py | 5 +- spacy/lang/sr/tokenizer_exceptions.py | 4 +- spacy/lang/sv/__init__.py | 14 +- spacy/lang/sv/syntax_iterators.py | 25 +- spacy/lang/sv/tokenizer_exceptions.py | 4 +- spacy/lang/tag_map.py | 25 - spacy/lang/tl/__init__.py | 5 +- spacy/lang/tl/tokenizer_exceptions.py | 4 +- spacy/lang/tokenizer_exceptions.py | 1 - spacy/lang/tr/__init__.py | 5 +- spacy/lang/tr/tokenizer_exceptions.py | 5 +- spacy/lang/tt/__init__.py | 5 +- spacy/lang/tt/tokenizer_exceptions.py | 5 +- spacy/lang/uk/__init__.py | 5 +- spacy/lang/uk/tokenizer_exceptions.py | 4 +- spacy/language.py | 23 +- spacy/schemas.py | 1 + spacy/tests/lang/en/test_noun_chunks.py | 4 +- spacy/tokenizer.pyx | 10 +- spacy/tokens/doc.pyx | 12 +- spacy/vocab.pxd | 1 + spacy/vocab.pyx | 18 +- 82 files changed, 373 insertions(+), 4899 deletions(-) create mode 100644 spacy/lang/defaults.py delete mode 100644 spacy/lang/el/tag_map_fine.py delete mode 100644 spacy/lang/tag_map.py diff --git a/spacy/default_config.cfg b/spacy/default_config.cfg index 125273c43..21dbf1798 100644 --- a/spacy/default_config.cfg +++ b/spacy/default_config.cfg @@ -3,10 +3,13 @@ lang = null stop_words = [] lex_attr_getters = {} vocab_data = {} +get_noun_chunks = null pipeline = [] [nlp.tokenizer] @tokenizers = "spacy.Tokenizer.v1" +token_match = null +url_match = {"@language_data": "spacy.xx.url_match"} [nlp.lemmatizer] @lemmatizers = "spacy.Lemmatizer.v1" diff --git a/spacy/lang/ar/__init__.py b/spacy/lang/ar/__init__.py index d46b18b6f..f387d0310 100644 --- a/spacy/lang/ar/__init__.py +++ b/spacy/lang/ar/__init__.py @@ -4,11 +4,9 @@ from thinc.api import Config from .stop_words import STOP_WORDS from .lex_attrs import LEX_ATTRS from .punctuation import TOKENIZER_SUFFIXES - from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS -from ..tokenizer_exceptions import BASE_EXCEPTIONS from ...language import Language -from ...util import update_exc, registry +from ...util import registry DEFAULT_CONFIG = """ @@ -35,7 +33,7 @@ def lex_attr_getters() -> Dict[int, Callable[[str], Any]]: class ArabicDefaults(Language.Defaults): - tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS) + tokenizer_exceptions = TOKENIZER_EXCEPTIONS suffixes = TOKENIZER_SUFFIXES diff --git a/spacy/lang/ar/tokenizer_exceptions.py b/spacy/lang/ar/tokenizer_exceptions.py index a11f3b43a..ce0f91ef7 100644 --- a/spacy/lang/ar/tokenizer_exceptions.py +++ b/spacy/lang/ar/tokenizer_exceptions.py @@ -1,4 +1,6 @@ +from ..tokenizer_exceptions import BASE_EXCEPTIONS from ...symbols import ORTH, LEMMA +from ...util import update_exc _exc = {} @@ -43,4 +45,4 @@ for exc_data in [ for exc_data in [{LEMMA: "تلفون", ORTH: "ت."}, {LEMMA: "صندوق بريد", ORTH: "ص.ب"}]: _exc[exc_data[ORTH]] = [exc_data] 
-TOKENIZER_EXCEPTIONS = _exc +TOKENIZER_EXCEPTIONS = update_exc(BASE_EXCEPTIONS, _exc) diff --git a/spacy/lang/bn/__init__.py b/spacy/lang/bn/__init__.py index 4b80e0c41..da2ca0c8d 100644 --- a/spacy/lang/bn/__init__.py +++ b/spacy/lang/bn/__init__.py @@ -4,9 +4,8 @@ from thinc.api import Config from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS from .punctuation import TOKENIZER_PREFIXES, TOKENIZER_SUFFIXES, TOKENIZER_INFIXES from .stop_words import STOP_WORDS -from ..tokenizer_exceptions import BASE_EXCEPTIONS from ...language import Language -from ...util import update_exc, registry +from ...util import registry DEFAULT_CONFIG = """ @@ -30,7 +29,7 @@ def stop_words() -> Set[str]: class BengaliDefaults(Language.Defaults): - tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS) + tokenizer_exceptions = TOKENIZER_EXCEPTIONS prefixes = TOKENIZER_PREFIXES suffixes = TOKENIZER_SUFFIXES infixes = TOKENIZER_INFIXES diff --git a/spacy/lang/bn/tokenizer_exceptions.py b/spacy/lang/bn/tokenizer_exceptions.py index 18e313a25..d896b4914 100644 --- a/spacy/lang/bn/tokenizer_exceptions.py +++ b/spacy/lang/bn/tokenizer_exceptions.py @@ -1,4 +1,6 @@ +from ..tokenizer_exceptions import BASE_EXCEPTIONS from ...symbols import ORTH, LEMMA +from ...util import update_exc _exc = {} @@ -21,4 +23,4 @@ for exc_data in [ _exc[exc_data[ORTH]] = [exc_data] -TOKENIZER_EXCEPTIONS = _exc +TOKENIZER_EXCEPTIONS = update_exc(BASE_EXCEPTIONS, _exc) diff --git a/spacy/lang/ca/__init__.py b/spacy/lang/ca/__init__.py index cab47555d..1fe7516ad 100644 --- a/spacy/lang/ca/__init__.py +++ b/spacy/lang/ca/__init__.py @@ -4,9 +4,8 @@ from thinc.api import Config from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS from .stop_words import STOP_WORDS from .lex_attrs import LEX_ATTRS -from ..tokenizer_exceptions import BASE_EXCEPTIONS from ...language import Language -from ...util import update_exc, registry +from ...util import registry from .punctuation import TOKENIZER_INFIXES @@ -37,7 +36,7 @@ def lex_attr_getters() -> Dict[int, Callable[[str], Any]]: class CatalanDefaults(Language.Defaults): - tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS) + tokenizer_exceptions = TOKENIZER_EXCEPTIONS infixes = TOKENIZER_INFIXES diff --git a/spacy/lang/ca/tokenizer_exceptions.py b/spacy/lang/ca/tokenizer_exceptions.py index b4ae61a2d..6928de46b 100644 --- a/spacy/lang/ca/tokenizer_exceptions.py +++ b/spacy/lang/ca/tokenizer_exceptions.py @@ -1,4 +1,6 @@ +from ..tokenizer_exceptions import BASE_EXCEPTIONS from ...symbols import ORTH, LEMMA +from ...util import update_exc _exc = {} @@ -35,4 +37,4 @@ for h in range(1, 12 + 1): _exc[f"{h}{period}"] = [{ORTH: f"{h}"}, {ORTH: period, LEMMA: "p.m."}] -TOKENIZER_EXCEPTIONS = _exc +TOKENIZER_EXCEPTIONS = update_exc(BASE_EXCEPTIONS, _exc) diff --git a/spacy/lang/da/__init__.py b/spacy/lang/da/__init__.py index 4f3802b21..4e6ee9383 100644 --- a/spacy/lang/da/__init__.py +++ b/spacy/lang/da/__init__.py @@ -5,9 +5,8 @@ from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS from .punctuation import TOKENIZER_INFIXES, TOKENIZER_SUFFIXES from .stop_words import STOP_WORDS from .lex_attrs import LEX_ATTRS -from ..tokenizer_exceptions import BASE_EXCEPTIONS from ...language import Language -from ...util import update_exc, registry +from ...util import registry DEFAULT_CONFIG = """ @@ -42,7 +41,7 @@ def lex_attr_getters() -> Dict[int, Callable[[str], Any]]: class DanishDefaults(Language.Defaults): - tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, 
TOKENIZER_EXCEPTIONS) + tokenizer_exceptions = TOKENIZER_EXCEPTIONS infixes = TOKENIZER_INFIXES suffixes = TOKENIZER_SUFFIXES diff --git a/spacy/lang/da/tokenizer_exceptions.py b/spacy/lang/da/tokenizer_exceptions.py index 36d03bde3..826a6077b 100644 --- a/spacy/lang/da/tokenizer_exceptions.py +++ b/spacy/lang/da/tokenizer_exceptions.py @@ -2,7 +2,9 @@ Tokenizer Exceptions. Source: https://forkortelse.dk/ and various others. """ +from ..tokenizer_exceptions import BASE_EXCEPTIONS from ...symbols import ORTH, LEMMA, NORM +from ...util import update_exc _exc = {} @@ -576,4 +578,4 @@ for h in range(1, 31 + 1): _custom_base_exc = {"i.": [{ORTH: "i", LEMMA: "i", NORM: "i"}, {ORTH: "."}]} _exc.update(_custom_base_exc) -TOKENIZER_EXCEPTIONS = _exc +TOKENIZER_EXCEPTIONS = update_exc(BASE_EXCEPTIONS, _exc) diff --git a/spacy/lang/de/__init__.py b/spacy/lang/de/__init__.py index d620ded58..58ee71247 100644 --- a/spacy/lang/de/__init__.py +++ b/spacy/lang/de/__init__.py @@ -1,20 +1,20 @@ -from typing import Set +from typing import Set, Callable from thinc.api import Config from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS from .punctuation import TOKENIZER_PREFIXES, TOKENIZER_SUFFIXES from .punctuation import TOKENIZER_INFIXES from .stop_words import STOP_WORDS -from .syntax_iterators import SYNTAX_ITERATORS -from ..tokenizer_exceptions import BASE_EXCEPTIONS +from .syntax_iterators import noun_chunks from ...language import Language -from ...util import update_exc, registry +from ...util import registry DEFAULT_CONFIG = """ [nlp] lang = "de" stop_words = {"@language_data": "spacy.de.stop_words"} +get_noun_chunks = {"@language_data": "spacy.de.get_noun_chunks"} [nlp.lemmatizer] @lemmatizers = "spacy.Lemmatizer.v1" @@ -36,12 +36,16 @@ def stop_words() -> Set[str]: return STOP_WORDS +@registry.language_data("spacy.de.get_noun_chunks") +def get_noun_chunks() -> Callable: + return noun_chunks + + class GermanDefaults(Language.Defaults): - tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS) + tokenizer_exceptions = TOKENIZER_EXCEPTIONS prefixes = TOKENIZER_PREFIXES suffixes = TOKENIZER_SUFFIXES infixes = TOKENIZER_INFIXES - syntax_iterators = SYNTAX_ITERATORS class German(Language): diff --git a/spacy/lang/de/syntax_iterators.py b/spacy/lang/de/syntax_iterators.py index e322e1add..bd495f792 100644 --- a/spacy/lang/de/syntax_iterators.py +++ b/spacy/lang/de/syntax_iterators.py @@ -1,39 +1,26 @@ +from typing import Union, Iterator + from ...symbols import NOUN, PROPN, PRON from ...errors import Errors +from ...tokens import Doc, Span -def noun_chunks(doclike): - """ - Detect base noun phrases from a dependency parse. Works on both Doc and Span. - """ +def noun_chunks(doclike: Union[Doc, Span]) -> Iterator[Span]: + """Detect base noun phrases from a dependency parse. Works on Doc and Span.""" # this iterator extracts spans headed by NOUNs starting from the left-most # syntactic dependent until the NOUN itself for close apposition and # measurement construction, the span is sometimes extended to the right of # the NOUN. Example: "eine Tasse Tee" (a cup (of) tea) returns "eine Tasse Tee" # and not just "eine Tasse", same for "das Thema Familie". - labels = [ - "sb", - "oa", - "da", - "nk", - "mo", - "ag", - "ROOT", - "root", - "cj", - "pd", - "og", - "app", - ] + # fmt: off + labels = ["sb", "oa", "da", "nk", "mo", "ag", "ROOT", "root", "cj", "pd", "og", "app"] + # fmt: on doc = doclike.doc # Ensure works on both Doc and Span. 
- if not doc.is_parsed: raise ValueError(Errors.E029) - np_label = doc.vocab.strings.add("NP") np_deps = set(doc.vocab.strings.add(label) for label in labels) close_app = doc.vocab.strings.add("nk") - rbracket = 0 for i, word in enumerate(doclike): if i < rbracket: diff --git a/spacy/lang/de/tokenizer_exceptions.py b/spacy/lang/de/tokenizer_exceptions.py index 3c2f02c7a..d7860ace6 100644 --- a/spacy/lang/de/tokenizer_exceptions.py +++ b/spacy/lang/de/tokenizer_exceptions.py @@ -1,4 +1,6 @@ +from ..tokenizer_exceptions import BASE_EXCEPTIONS from ...symbols import ORTH, LEMMA, TAG, NORM, PRON_LEMMA +from ...util import update_exc _exc = { @@ -254,4 +256,4 @@ for orth in [ _exc[orth] = [{ORTH: orth}] -TOKENIZER_EXCEPTIONS = _exc +TOKENIZER_EXCEPTIONS = update_exc(BASE_EXCEPTIONS, _exc) diff --git a/spacy/lang/defaults.py b/spacy/lang/defaults.py new file mode 100644 index 000000000..6d692d6a5 --- /dev/null +++ b/spacy/lang/defaults.py @@ -0,0 +1,9 @@ +from typing import Pattern + +from .tokenizer_exceptions import URL_MATCH +from ..util import registry + + +@registry.language_data("spacy.xx.url_match") +def url_match() -> Pattern: + return URL_MATCH diff --git a/spacy/lang/el/__init__.py b/spacy/lang/el/__init__.py index 65c634340..defe53891 100644 --- a/spacy/lang/el/__init__.py +++ b/spacy/lang/el/__init__.py @@ -5,11 +5,10 @@ from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS from .stop_words import STOP_WORDS from .lex_attrs import LEX_ATTRS from .lemmatizer import GreekLemmatizer -from .syntax_iterators import SYNTAX_ITERATORS +from .syntax_iterators import noun_chunks from .punctuation import TOKENIZER_PREFIXES, TOKENIZER_SUFFIXES, TOKENIZER_INFIXES -from ..tokenizer_exceptions import BASE_EXCEPTIONS from ...language import Language -from ...util import update_exc, registry +from ...util import registry DEFAULT_CONFIG = """ @@ -17,6 +16,7 @@ DEFAULT_CONFIG = """ lang = "el" stop_words = {"@language_data": "spacy.el.stop_words"} lex_attr_getters = {"@language_data": "spacy.el.lex_attr_getters"} +get_noun_chunks = {"@language_data": "spacy.el.get_noun_chunks"} [nlp.lemmatizer] @lemmatizers = "spacy.GreekLemmatizer.v1" @@ -38,6 +38,11 @@ def create_greek_lemmatizer(data: Dict[str, dict] = {}) -> GreekLemmatizer: return GreekLemmatizer(data=data) +@registry.language_data("spacy.el.get_noun_chunks") +def get_noun_chunks() -> Callable: + return noun_chunks + + @registry.language_data("spacy.el.stop_words") def stop_words() -> Set[str]: return STOP_WORDS @@ -49,11 +54,10 @@ def lex_attr_getters() -> Dict[int, Callable[[str], Any]]: class GreekDefaults(Language.Defaults): - tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS) + tokenizer_exceptions = TOKENIZER_EXCEPTIONS prefixes = TOKENIZER_PREFIXES suffixes = TOKENIZER_SUFFIXES infixes = TOKENIZER_INFIXES - syntax_iterators = SYNTAX_ITERATORS class Greek(Language): diff --git a/spacy/lang/el/syntax_iterators.py b/spacy/lang/el/syntax_iterators.py index 1bb21d24d..0a13edcc0 100644 --- a/spacy/lang/el/syntax_iterators.py +++ b/spacy/lang/el/syntax_iterators.py @@ -1,21 +1,20 @@ +from typing import Union, Iterator + from ...symbols import NOUN, PROPN, PRON from ...errors import Errors +from ...tokens import Doc, Span -def noun_chunks(doclike): - """ - Detect base noun phrases. Works on both Doc and Span. - """ +def noun_chunks(doclike: Union[Doc, Span]) -> Iterator[Span]: + """Detect base noun phrases from a dependency parse. 
Works on Doc and Span.""" # It follows the logic of the noun chunks finder of English language, # adjusted to some Greek language special characteristics. # obj tag corrects some DEP tagger mistakes. # Further improvement of the models will eliminate the need for this tag. labels = ["nsubj", "obj", "iobj", "appos", "ROOT", "obl"] doc = doclike.doc # Ensure works on both Doc and Span. - if not doc.is_parsed: raise ValueError(Errors.E029) - np_deps = [doc.vocab.strings.add(label) for label in labels] conj = doc.vocab.strings.add("conj") nmod = doc.vocab.strings.add("nmod") diff --git a/spacy/lang/el/tag_map_fine.py b/spacy/lang/el/tag_map_fine.py deleted file mode 100644 index f37f84c57..000000000 --- a/spacy/lang/el/tag_map_fine.py +++ /dev/null @@ -1,4265 +0,0 @@ -from ...symbols import POS, PUNCT, SYM, ADJ, CCONJ, SCONJ, NUM, DET, ADV, ADP, X, VERB -from ...symbols import NOUN, PROPN, PART, INTJ, PRON, AUX - - -TAG_MAP = { - "ABBR": {POS: NOUN, "Abbr": "Yes"}, - "AdXxBa": {POS: ADV, "Degree": ""}, - "AdXxCp": {POS: ADV, "Degree": "Cmp"}, - "AdXxSu": {POS: ADV, "Degree": "Sup"}, - "AjBaFePlAc": { - POS: ADJ, - "Degree": "", - "Gender": "Fem", - "Number": "Plur", - "Case": "Acc", - }, - "AjBaFePlDa": { - POS: ADJ, - "Degree": "", - "Gender": "Fem", - "Number": "Plur", - "Case": "Dat", - }, - "AjBaFePlGe": { - POS: ADJ, - "Degree": "", - "Gender": "Fem", - "Number": "Plur", - "Case": "Gen", - }, - "AjBaFePlNm": { - POS: ADJ, - "Degree": "", - "Gender": "Fem", - "Number": "Plur", - "Case": "Nom", - }, - "AjBaFePlVo": { - POS: ADJ, - "Degree": "", - "Gender": "Fem", - "Number": "Plur", - "Case": "Voc", - }, - "AjBaFeSgAc": { - POS: ADJ, - "Degree": "", - "Gender": "Fem", - "Number": "Sing", - "Case": "Acc", - }, - "AjBaFeSgDa": { - POS: ADJ, - "Degree": "", - "Gender": "Fem", - "Number": "Sing", - "Case": "Dat", - }, - "AjBaFeSgGe": { - POS: ADJ, - "Degree": "", - "Gender": "Fem", - "Number": "Sing", - "Case": "Gen", - }, - "AjBaFeSgNm": { - POS: ADJ, - "Degree": "", - "Gender": "Fem", - "Number": "Sing", - "Case": "Nom", - }, - "AjBaFeSgVo": { - POS: ADJ, - "Degree": "", - "Gender": "Fem", - "Number": "Sing", - "Case": "Voc", - }, - "AjBaMaPlAc": { - POS: ADJ, - "Degree": "", - "Gender": "Masc", - "Number": "Plur", - "Case": "Acc", - }, - "AjBaMaPlDa": { - POS: ADJ, - "Degree": "", - "Gender": "Masc", - "Number": "Plur", - "Case": "Dat", - }, - "AjBaMaPlGe": { - POS: ADJ, - "Degree": "", - "Gender": "Masc", - "Number": "Plur", - "Case": "Gen", - }, - "AjBaMaPlNm": { - POS: ADJ, - "Degree": "", - "Gender": "Masc", - "Number": "Plur", - "Case": "Nom", - }, - "AjBaMaPlVo": { - POS: ADJ, - "Degree": "", - "Gender": "Masc", - "Number": "Plur", - "Case": "Voc", - }, - "AjBaMaSgAc": { - POS: ADJ, - "Degree": "", - "Gender": "Masc", - "Number": "Sing", - "Case": "Acc", - }, - "AjBaMaSgDa": { - POS: ADJ, - "Degree": "", - "Gender": "Masc", - "Number": "Sing", - "Case": "Dat", - }, - "AjBaMaSgGe": { - POS: ADJ, - "Degree": "", - "Gender": "Masc", - "Number": "Sing", - "Case": "Gen", - }, - "AjBaMaSgNm": { - POS: ADJ, - "Degree": "", - "Gender": "Masc", - "Number": "Sing", - "Case": "Nom", - }, - "AjBaMaSgVo": { - POS: ADJ, - "Degree": "", - "Gender": "Masc", - "Number": "Sing", - "Case": "Voc", - }, - "AjBaNePlAc": { - POS: ADJ, - "Degree": "", - "Gender": "Neut", - "Number": "Plur", - "Case": "Acc", - }, - "AjBaNePlDa": { - POS: ADJ, - "Degree": "", - "Gender": "Neut", - "Number": "Plur", - "Case": "Dat", - }, - "AjBaNePlGe": { - POS: ADJ, - "Degree": "", - "Gender": "Neut", - "Number": "Plur", - 
"Case": "Gen", - }, - "AjBaNePlNm": { - POS: ADJ, - "Degree": "", - "Gender": "Neut", - "Number": "Plur", - "Case": "Nom", - }, - "AjBaNePlVo": { - POS: ADJ, - "Degree": "", - "Gender": "Neut", - "Number": "Plur", - "Case": "Voc", - }, - "AjBaNeSgAc": { - POS: ADJ, - "Degree": "", - "Gender": "Neut", - "Number": "Sing", - "Case": "Acc", - }, - "AjBaNeSgDa": { - POS: ADJ, - "Degree": "", - "Gender": "Neut", - "Number": "Sing", - "Case": "Dat", - }, - "AjBaNeSgGe": { - POS: ADJ, - "Degree": "", - "Gender": "Neut", - "Number": "Sing", - "Case": "Gen", - }, - "AjBaNeSgNm": { - POS: ADJ, - "Degree": "", - "Gender": "Neut", - "Number": "Sing", - "Case": "Nom", - }, - "AjBaNeSgVo": { - POS: ADJ, - "Degree": "", - "Gender": "Neut", - "Number": "Sing", - "Case": "Voc", - }, - "AjCpFePlAc": { - POS: ADJ, - "Degree": "Cmp", - "Gender": "Fem", - "Number": "Plur", - "Case": "Acc", - }, - "AjCpFePlDa": { - POS: ADJ, - "Degree": "Cmp", - "Gender": "Fem", - "Number": "Plur", - "Case": "Dat", - }, - "AjCpFePlGe": { - POS: ADJ, - "Degree": "Cmp", - "Gender": "Fem", - "Number": "Plur", - "Case": "Gen", - }, - "AjCpFePlNm": { - POS: ADJ, - "Degree": "Cmp", - "Gender": "Fem", - "Number": "Plur", - "Case": "Nom", - }, - "AjCpFePlVo": { - POS: ADJ, - "Degree": "Cmp", - "Gender": "Fem", - "Number": "Plur", - "Case": "Voc", - }, - "AjCpFeSgAc": { - POS: ADJ, - "Degree": "Cmp", - "Gender": "Fem", - "Number": "Sing", - "Case": "Acc", - }, - "AjCpFeSgDa": { - POS: ADJ, - "Degree": "Cmp", - "Gender": "Fem", - "Number": "Sing", - "Case": "Dat", - }, - "AjCpFeSgGe": { - POS: ADJ, - "Degree": "Cmp", - "Gender": "Fem", - "Number": "Sing", - "Case": "Gen", - }, - "AjCpFeSgNm": { - POS: ADJ, - "Degree": "Cmp", - "Gender": "Fem", - "Number": "Sing", - "Case": "Nom", - }, - "AjCpFeSgVo": { - POS: ADJ, - "Degree": "Cmp", - "Gender": "Fem", - "Number": "Sing", - "Case": "Voc", - }, - "AjCpMaPlAc": { - POS: ADJ, - "Degree": "Cmp", - "Gender": "Masc", - "Number": "Plur", - "Case": "Acc", - }, - "AjCpMaPlDa": { - POS: ADJ, - "Degree": "Cmp", - "Gender": "Masc", - "Number": "Plur", - "Case": "Dat", - }, - "AjCpMaPlGe": { - POS: ADJ, - "Degree": "Cmp", - "Gender": "Masc", - "Number": "Plur", - "Case": "Gen", - }, - "AjCpMaPlNm": { - POS: ADJ, - "Degree": "Cmp", - "Gender": "Masc", - "Number": "Plur", - "Case": "Nom", - }, - "AjCpMaPlVo": { - POS: ADJ, - "Degree": "Cmp", - "Gender": "Masc", - "Number": "Plur", - "Case": "Voc", - }, - "AjCpMaSgAc": { - POS: ADJ, - "Degree": "Cmp", - "Gender": "Masc", - "Number": "Sing", - "Case": "Acc", - }, - "AjCpMaSgDa": { - POS: ADJ, - "Degree": "Cmp", - "Gender": "Masc", - "Number": "Sing", - "Case": "Dat", - }, - "AjCpMaSgGe": { - POS: ADJ, - "Degree": "Cmp", - "Gender": "Masc", - "Number": "Sing", - "Case": "Gen", - }, - "AjCpMaSgNm": { - POS: ADJ, - "Degree": "Cmp", - "Gender": "Masc", - "Number": "Sing", - "Case": "Nom", - }, - "AjCpMaSgVo": { - POS: ADJ, - "Degree": "Cmp", - "Gender": "Masc", - "Number": "Sing", - "Case": "Voc", - }, - "AjCpNePlAc": { - POS: ADJ, - "Degree": "Cmp", - "Gender": "Neut", - "Number": "Plur", - "Case": "Acc", - }, - "AjCpNePlDa": { - POS: ADJ, - "Degree": "Cmp", - "Gender": "Neut", - "Number": "Plur", - "Case": "Dat", - }, - "AjCpNePlGe": { - POS: ADJ, - "Degree": "Cmp", - "Gender": "Neut", - "Number": "Plur", - "Case": "Gen", - }, - "AjCpNePlNm": { - POS: ADJ, - "Degree": "Cmp", - "Gender": "Neut", - "Number": "Plur", - "Case": "Nom", - }, - "AjCpNePlVo": { - POS: ADJ, - "Degree": "Cmp", - "Gender": "Neut", - "Number": "Plur", - "Case": "Voc", - }, - 
"AjCpNeSgAc": { - POS: ADJ, - "Degree": "Cmp", - "Gender": "Neut", - "Number": "Sing", - "Case": "Acc", - }, - "AjCpNeSgDa": { - POS: ADJ, - "Degree": "Cmp", - "Gender": "Neut", - "Number": "Sing", - "Case": "Dat", - }, - "AjCpNeSgGe": { - POS: ADJ, - "Degree": "Cmp", - "Gender": "Neut", - "Number": "Sing", - "Case": "Gen", - }, - "AjCpNeSgNm": { - POS: ADJ, - "Degree": "Cmp", - "Gender": "Neut", - "Number": "Sing", - "Case": "Nom", - }, - "AjCpNeSgVo": { - POS: ADJ, - "Degree": "Cmp", - "Gender": "Neut", - "Number": "Sing", - "Case": "Voc", - }, - "AjSuFePlAc": { - POS: ADJ, - "Degree": "Sup", - "Gender": "Fem", - "Number": "Plur", - "Case": "Acc", - }, - "AjSuFePlDa": { - POS: ADJ, - "Degree": "Sup", - "Gender": "Fem", - "Number": "Plur", - "Case": "Dat", - }, - "AjSuFePlGe": { - POS: ADJ, - "Degree": "Sup", - "Gender": "Fem", - "Number": "Plur", - "Case": "Gen", - }, - "AjSuFePlNm": { - POS: ADJ, - "Degree": "Sup", - "Gender": "Fem", - "Number": "Plur", - "Case": "Nom", - }, - "AjSuFePlVo": { - POS: ADJ, - "Degree": "Sup", - "Gender": "Fem", - "Number": "Plur", - "Case": "Voc", - }, - "AjSuFeSgAc": { - POS: ADJ, - "Degree": "Sup", - "Gender": "Fem", - "Number": "Sing", - "Case": "Acc", - }, - "AjSuFeSgDa": { - POS: ADJ, - "Degree": "Sup", - "Gender": "Fem", - "Number": "Sing", - "Case": "Dat", - }, - "AjSuFeSgGe": { - POS: ADJ, - "Degree": "Sup", - "Gender": "Fem", - "Number": "Sing", - "Case": "Gen", - }, - "AjSuFeSgNm": { - POS: ADJ, - "Degree": "Sup", - "Gender": "Fem", - "Number": "Sing", - "Case": "Nom", - }, - "AjSuFeSgVo": { - POS: ADJ, - "Degree": "Sup", - "Gender": "Fem", - "Number": "Sing", - "Case": "Voc", - }, - "AjSuMaPlAc": { - POS: ADJ, - "Degree": "Sup", - "Gender": "Masc", - "Number": "Plur", - "Case": "Acc", - }, - "AjSuMaPlDa": { - POS: ADJ, - "Degree": "Sup", - "Gender": "Masc", - "Number": "Plur", - "Case": "Dat", - }, - "AjSuMaPlGe": { - POS: ADJ, - "Degree": "Sup", - "Gender": "Masc", - "Number": "Plur", - "Case": "Gen", - }, - "AjSuMaPlNm": { - POS: ADJ, - "Degree": "Sup", - "Gender": "Masc", - "Number": "Plur", - "Case": "Nom", - }, - "AjSuMaPlVo": { - POS: ADJ, - "Degree": "Sup", - "Gender": "Masc", - "Number": "Plur", - "Case": "Voc", - }, - "AjSuMaSgAc": { - POS: ADJ, - "Degree": "Sup", - "Gender": "Masc", - "Number": "Sing", - "Case": "Acc", - }, - "AjSuMaSgDa": { - POS: ADJ, - "Degree": "Sup", - "Gender": "Masc", - "Number": "Sing", - "Case": "Dat", - }, - "AjSuMaSgGe": { - POS: ADJ, - "Degree": "Sup", - "Gender": "Masc", - "Number": "Sing", - "Case": "Gen", - }, - "AjSuMaSgNm": { - POS: ADJ, - "Degree": "Sup", - "Gender": "Masc", - "Number": "Sing", - "Case": "Nom", - }, - "AjSuMaSgVo": { - POS: ADJ, - "Degree": "Sup", - "Gender": "Masc", - "Number": "Sing", - "Case": "Voc", - }, - "AjSuNePlAc": { - POS: ADJ, - "Degree": "Sup", - "Gender": "Neut", - "Number": "Plur", - "Case": "Acc", - }, - "AjSuNePlDa": { - POS: ADJ, - "Degree": "Sup", - "Gender": "Neut", - "Number": "Plur", - "Case": "Dat", - }, - "AjSuNePlGe": { - POS: ADJ, - "Degree": "Sup", - "Gender": "Neut", - "Number": "Plur", - "Case": "Gen", - }, - "AjSuNePlNm": { - POS: ADJ, - "Degree": "Sup", - "Gender": "Neut", - "Number": "Plur", - "Case": "Nom", - }, - "AjSuNePlVo": { - POS: ADJ, - "Degree": "Sup", - "Gender": "Neut", - "Number": "Plur", - "Case": "Voc", - }, - "AjSuNeSgAc": { - POS: ADJ, - "Degree": "Sup", - "Gender": "Neut", - "Number": "Sing", - "Case": "Acc", - }, - "AjSuNeSgDa": { - POS: ADJ, - "Degree": "Sup", - "Gender": "Neut", - "Number": "Sing", - "Case": "Dat", - }, - 
"AjSuNeSgGe": { - POS: ADJ, - "Degree": "Sup", - "Gender": "Neut", - "Number": "Sing", - "Case": "Gen", - }, - "AjSuNeSgNm": { - POS: ADJ, - "Degree": "Sup", - "Gender": "Neut", - "Number": "Sing", - "Case": "Nom", - }, - "AjSuNeSgVo": { - POS: ADJ, - "Degree": "Sup", - "Gender": "Neut", - "Number": "Sing", - "Case": "Voc", - }, - "AsPpPaFePlAc": {POS: ADP, "Gender": "Fem", "Number": "Plur", "Case": "Acc"}, - "AsPpPaFePlGe": {POS: ADP, "Gender": "Fem", "Number": "Plur", "Case": "Gen"}, - "AsPpPaFeSgAc": {POS: ADP, "Gender": "Fem", "Number": "Sing", "Case": "Acc"}, - "AsPpPaFeSgGe": {POS: ADP, "Gender": "Fem", "Number": "Sing", "Case": "Gen"}, - "AsPpPaMaPlAc": {POS: ADP, "Gender": "Masc", "Number": "Plur", "Case": "Acc"}, - "AsPpPaMaPlGe": {POS: ADP, "Gender": "Masc", "Number": "Plur", "Case": "Gen"}, - "AsPpPaMaSgAc": {POS: ADP, "Gender": "Masc", "Number": "Sing", "Case": "Acc"}, - "AsPpPaMaSgGe": {POS: ADP, "Gender": "Masc", "Number": "Sing", "Case": "Gen"}, - "AsPpPaNePlAc": {POS: ADP, "Gender": "Neut", "Number": "Plur", "Case": "Acc"}, - "AsPpPaNePlGe": {POS: ADP, "Gender": "Neut", "Number": "Plur", "Case": "Gen"}, - "AsPpPaNeSgAc": {POS: ADP, "Gender": "Neut", "Number": "Sing", "Case": "Acc"}, - "AsPpPaNeSgGe": {POS: ADP, "Gender": "Neut", "Number": "Sing", "Case": "Gen"}, - "AsPpSp": {POS: ADP}, - "AtDfFePlAc": { - POS: DET, - "PronType": "Art", - "Gender": "Fem", - "Number": "Plur", - "Case": "Acc", - "Definite": "Def", - }, - "AtDfFePlGe": { - POS: DET, - "PronType": "Art", - "Gender": "Fem", - "Number": "Plur", - "Case": "Gen", - "Definite": "Def", - }, - "AtDfFePlNm": { - POS: DET, - "PronType": "Art", - "Gender": "Fem", - "Number": "Plur", - "Case": "Nom", - "Definite": "Def", - }, - "AtDfFeSgAc": { - POS: DET, - "PronType": "Art", - "Gender": "Fem", - "Number": "Sing", - "Case": "Acc", - "Definite": "Def", - }, - "AtDfFeSgDa": { - POS: DET, - "PronType": "Art", - "Gender": "Fem", - "Number": "Sing", - "Case": "Dat", - "Definite": "Def", - }, - "AtDfFeSgGe": { - POS: DET, - "PronType": "Art", - "Gender": "Fem", - "Number": "Sing", - "Case": "Gen", - "Definite": "Def", - }, - "AtDfFeSgNm": { - POS: DET, - "PronType": "Art", - "Gender": "Fem", - "Number": "Sing", - "Case": "Nom", - "Definite": "Def", - }, - "AtDfMaPlAc": { - POS: DET, - "PronType": "Art", - "Gender": "Masc", - "Number": "Plur", - "Case": "Acc", - "Definite": "Def", - }, - "AtDfMaPlGe": { - POS: DET, - "PronType": "Art", - "Gender": "Masc", - "Number": "Plur", - "Case": "Gen", - "Definite": "Def", - }, - "AtDfMaPlNm": { - POS: DET, - "PronType": "Art", - "Gender": "Masc", - "Number": "Plur", - "Case": "Nom", - "Definite": "Def", - }, - "AtDfMaSgAc": { - POS: DET, - "PronType": "Art", - "Gender": "Masc", - "Number": "Sing", - "Case": "Acc", - "Definite": "Def", - }, - "AtDfMaSgDa": { - POS: DET, - "PronType": "Art", - "Gender": "Masc", - "Number": "Sing", - "Case": "Dat", - "Definite": "Def", - }, - "AtDfMaSgGe": { - POS: DET, - "PronType": "Art", - "Gender": "Masc", - "Number": "Sing", - "Case": "Gen", - "Definite": "Def", - }, - "AtDfMaSgNm": { - POS: DET, - "PronType": "Art", - "Gender": "Masc", - "Number": "Sing", - "Case": "Nom", - "Definite": "Def", - }, - "AtDfNePlAc": { - POS: DET, - "PronType": "Art", - "Gender": "Neut", - "Number": "Plur", - "Case": "Acc", - "Definite": "Def", - }, - "AtDfNePlDa": { - POS: DET, - "PronType": "Art", - "Gender": "Neut", - "Number": "Plur", - "Case": "Dat", - "Definite": "Def", - }, - "AtDfNePlGe": { - POS: DET, - "PronType": "Art", - "Gender": "Neut", - "Number": "Plur", - 
"Case": "Gen", - "Definite": "Def", - }, - "AtDfNePlNm": { - POS: DET, - "PronType": "Art", - "Gender": "Neut", - "Number": "Plur", - "Case": "Nom", - "Definite": "Def", - }, - "AtDfNeSgAc": { - POS: DET, - "PronType": "Art", - "Gender": "Neut", - "Number": "Sing", - "Case": "Acc", - "Definite": "Def", - }, - "AtDfNeSgDa": { - POS: DET, - "PronType": "Art", - "Gender": "Neut", - "Number": "Sing", - "Case": "Dat", - "Definite": "Def", - }, - "AtDfNeSgGe": { - POS: DET, - "PronType": "Art", - "Gender": "Neut", - "Number": "Sing", - "Case": "Gen", - "Definite": "Def", - }, - "AtDfNeSgNm": { - POS: DET, - "PronType": "Art", - "Gender": "Neut", - "Number": "Sing", - "Case": "Nom", - "Definite": "Def", - }, - "AtIdFeSgAc": { - POS: DET, - "PronType": "Art", - "Gender": "Fem", - "Number": "Sing", - "Case": "Acc", - "Definite": "Ind", - }, - "AtIdFeSgDa": { - POS: DET, - "PronType": "Art", - "Gender": "Fem", - "Number": "Sing", - "Case": "Dat", - "Definite": "Ind", - }, - "AtIdFeSgGe": { - POS: DET, - "PronType": "Art", - "Gender": "Fem", - "Number": "Sing", - "Case": "Gen", - "Definite": "Ind", - }, - "AtIdFeSgNm": { - POS: DET, - "PronType": "Art", - "Gender": "Fem", - "Number": "Sing", - "Case": "Nom", - "Definite": "Ind", - }, - "AtIdMaSgAc": { - POS: DET, - "PronType": "Art", - "Gender": "Masc", - "Number": "Sing", - "Case": "Acc", - "Definite": "Ind", - }, - "AtIdMaSgGe": { - POS: DET, - "PronType": "Art", - "Gender": "Masc", - "Number": "Sing", - "Case": "Gen", - "Definite": "Ind", - }, - "AtIdMaSgNm": { - POS: DET, - "PronType": "Art", - "Gender": "Masc", - "Number": "Sing", - "Case": "Nom", - "Definite": "Ind", - }, - "AtIdNeSgAc": { - POS: DET, - "PronType": "Art", - "Gender": "Neut", - "Number": "Sing", - "Case": "Acc", - "Definite": "Ind", - }, - "AtIdNeSgGe": { - POS: DET, - "PronType": "Art", - "Gender": "Neut", - "Number": "Sing", - "Case": "Gen", - "Definite": "Ind", - }, - "AtIdNeSgNm": { - POS: DET, - "PronType": "Art", - "Gender": "Neut", - "Number": "Sing", - "Case": "Nom", - "Definite": "Ind", - }, - "CjCo": {POS: CCONJ}, - "CjSb": {POS: SCONJ}, - "CPUNCT": {POS: PUNCT}, - "DATE": {POS: NUM}, - "DIG": {POS: NUM}, - "ENUM": {POS: NUM}, - "Ij": {POS: INTJ}, - "INIT": {POS: SYM}, - "NBABBR": {POS: NOUN, "Abbr": "Yes"}, - "NmAnFePlAcAj": { - POS: NUM, - "NumType": "Mult", - "Gender": "Fem", - "Number": "Plur", - "Case": "Acc", - }, - "NmAnFePlGeAj": { - POS: NUM, - "NumType": "Mult", - "Gender": "Fem", - "Number": "Plur", - "Case": "Gen", - }, - "NmAnFePlNmAj": { - POS: NUM, - "NumType": "Mult", - "Gender": "Fem", - "Number": "Plur", - "Case": "Nom", - }, - "NmAnFePlVoAj": { - POS: NUM, - "NumType": "Mult", - "Gender": "Fem", - "Number": "Plur", - "Case": "Voc", - }, - "NmAnFeSgAcAj": { - POS: NUM, - "NumType": "Mult", - "Gender": "Fem", - "Number": "Sing", - "Case": "Acc", - }, - "NmAnFeSgGeAj": { - POS: NUM, - "NumType": "Mult", - "Gender": "Fem", - "Number": "Sing", - "Case": "Gen", - }, - "NmAnFeSgNmAj": { - POS: NUM, - "NumType": "Mult", - "Gender": "Fem", - "Number": "Sing", - "Case": "Nom", - }, - "NmAnFeSgVoAj": { - POS: NUM, - "NumType": "Mult", - "Gender": "Fem", - "Number": "Sing", - "Case": "Voc", - }, - "NmAnMaPlAcAj": { - POS: NUM, - "NumType": "Mult", - "Gender": "Masc", - "Number": "Plur", - "Case": "Acc", - }, - "NmAnMaPlGeAj": { - POS: NUM, - "NumType": "Mult", - "Gender": "Masc", - "Number": "Plur", - "Case": "Gen", - }, - "NmAnMaPlNmAj": { - POS: NUM, - "NumType": "Mult", - "Gender": "Masc", - "Number": "Plur", - "Case": "Nom", - }, - "NmAnMaPlVoAj": { - POS: 
NUM, - "NumType": "Mult", - "Gender": "Masc", - "Number": "Plur", - "Case": "Voc", - }, - "NmAnMaSgAcAj": { - POS: NUM, - "NumType": "Mult", - "Gender": "Masc", - "Number": "Sing", - "Case": "Acc", - }, - "NmAnMaSgGeAj": { - POS: NUM, - "NumType": "Mult", - "Gender": "Masc", - "Number": "Sing", - "Case": "Gen", - }, - "NmAnMaSgNmAj": { - POS: NUM, - "NumType": "Mult", - "Gender": "Masc", - "Number": "Sing", - "Case": "Nom", - }, - "NmAnMaSgVoAj": { - POS: NUM, - "NumType": "Mult", - "Gender": "Masc", - "Number": "Sing", - "Case": "Voc", - }, - "NmAnNePlAcAj": { - POS: NUM, - "NumType": "Mult", - "Gender": "Neut", - "Number": "Plur", - "Case": "Acc", - }, - "NmAnNePlGeAj": { - POS: NUM, - "NumType": "Mult", - "Gender": "Neut", - "Number": "Plur", - "Case": "Gen", - }, - "NmAnNePlNmAj": { - POS: NUM, - "NumType": "Mult", - "Gender": "Neut", - "Number": "Plur", - "Case": "Nom", - }, - "NmAnNePlVoAj": { - POS: NUM, - "NumType": "Mult", - "Gender": "Neut", - "Number": "Plur", - "Case": "Voc", - }, - "NmAnNeSgAcAj": { - POS: NUM, - "NumType": "Mult", - "Gender": "Neut", - "Number": "Sing", - "Case": "Acc", - }, - "NmAnNeSgGeAj": { - POS: NUM, - "NumType": "Mult", - "Gender": "Neut", - "Number": "Sing", - "Case": "Gen", - }, - "NmAnNeSgNmAj": { - POS: NUM, - "NumType": "Mult", - "Gender": "Neut", - "Number": "Sing", - "Case": "Nom", - }, - "NmAnNeSgVoAj": { - POS: NUM, - "NumType": "Mult", - "Gender": "Neut", - "Number": "Sing", - "Case": "Voc", - }, - "NmAnXxXxXxAd": { - POS: NUM, - "NumType": "Mult", - "Gender": "Masc|Fem|Neut", - "Number": "Sing|Plur", - "Case": "Acc|Gen|Nom|Voc", - }, - "NmCdFePlAcAj": { - POS: NUM, - "NumType": "Card", - "Gender": "Fem", - "Number": "Plur", - "Case": "Acc", - }, - "NmCdFePlGeAj": { - POS: NUM, - "NumType": "Card", - "Gender": "Fem", - "Number": "Plur", - "Case": "Gen", - }, - "NmCdFePlNmAj": { - POS: NUM, - "NumType": "Card", - "Gender": "Fem", - "Number": "Plur", - "Case": "Nom", - }, - "NmCdFePlVoAj": { - POS: NUM, - "NumType": "Card", - "Gender": "Fem", - "Number": "Plur", - "Case": "Voc", - }, - "NmCdFeSgAcAj": { - POS: NUM, - "NumType": "Card", - "Gender": "Fem", - "Number": "Sing", - "Case": "Acc", - }, - "NmCdFeSgDaAj": { - POS: NUM, - "NumType": "Card", - "Gender": "Fem", - "Number": "Sing", - "Case": "Dat", - }, - "NmCdFeSgGeAj": { - POS: NUM, - "NumType": "Card", - "Gender": "Fem", - "Number": "Sing", - "Case": "Gen", - }, - "NmCdFeSgNmAj": { - POS: NUM, - "NumType": "Card", - "Gender": "Fem", - "Number": "Sing", - "Case": "Nom", - }, - "NmCdMaPlAcAj": { - POS: NUM, - "NumType": "Card", - "Gender": "Masc", - "Number": "Plur", - "Case": "Acc", - }, - "NmCdMaPlGeAj": { - POS: NUM, - "NumType": "Card", - "Gender": "Masc", - "Number": "Plur", - "Case": "Gen", - }, - "NmCdMaPlNmAj": { - POS: NUM, - "NumType": "Card", - "Gender": "Masc", - "Number": "Plur", - "Case": "Nom", - }, - "NmCdMaPlVoAj": { - POS: NUM, - "NumType": "Card", - "Gender": "Masc", - "Number": "Plur", - "Case": "Voc", - }, - "NmCdMaSgAcAj": { - POS: NUM, - "NumType": "Card", - "Gender": "Masc", - "Number": "Sing", - "Case": "Acc", - }, - "NmCdMaSgGeAj": { - POS: NUM, - "NumType": "Card", - "Gender": "Masc", - "Number": "Sing", - "Case": "Gen", - }, - "NmCdMaSgNmAj": { - POS: NUM, - "NumType": "Card", - "Gender": "Masc", - "Number": "Sing", - "Case": "Nom", - }, - "NmCdNePlAcAj": { - POS: NUM, - "NumType": "Card", - "Gender": "Neut", - "Number": "Plur", - "Case": "Acc", - }, - "NmCdNePlDaAj": { - POS: NUM, - "NumType": "Card", - "Gender": "Neut", - "Number": "Plur", - "Case": "Dat", - 
}, - "NmCdNePlGeAj": { - POS: NUM, - "NumType": "Card", - "Gender": "Neut", - "Number": "Plur", - "Case": "Gen", - }, - "NmCdNePlNmAj": { - POS: NUM, - "NumType": "Card", - "Gender": "Neut", - "Number": "Plur", - "Case": "Nom", - }, - "NmCdNePlVoAj": { - POS: NUM, - "NumType": "Card", - "Gender": "Neut", - "Number": "Plur", - "Case": "Voc", - }, - "NmCdNeSgAcAj": { - POS: NUM, - "NumType": "Card", - "Gender": "Neut", - "Number": "Sing", - "Case": "Acc", - }, - "NmCdNeSgGeAj": { - POS: NUM, - "NumType": "Card", - "Gender": "Neut", - "Number": "Sing", - "Case": "Gen", - }, - "NmCdNeSgNmAj": { - POS: NUM, - "NumType": "Card", - "Gender": "Neut", - "Number": "Sing", - "Case": "Nom", - }, - "NmCtFePlAcNo": { - POS: NUM, - "NumType": "Sets", - "Gender": "Fem", - "Number": "Plur", - "Case": "Acc", - }, - "NmCtFePlGeNo": { - POS: NUM, - "NumType": "Sets", - "Gender": "Fem", - "Number": "Plur", - "Case": "Gen", - }, - "NmCtFePlNmNo": { - POS: NUM, - "NumType": "Sets", - "Gender": "Fem", - "Number": "Plur", - "Case": "Nom", - }, - "NmCtFePlVoNo": { - POS: NUM, - "NumType": "Sets", - "Gender": "Fem", - "Number": "Plur", - "Case": "Voc", - }, - "NmCtFeSgAcNo": { - POS: NUM, - "NumType": "Sets", - "Gender": "Fem", - "Number": "Sing", - "Case": "Acc", - }, - "NmCtFeSgGeNo": { - POS: NUM, - "NumType": "Sets", - "Gender": "Fem", - "Number": "Sing", - "Case": "Gen", - }, - "NmCtFeSgNmNo": { - POS: NUM, - "NumType": "Sets", - "Gender": "Fem", - "Number": "Sing", - "Case": "Nom", - }, - "NmCtFeSgVoNo": { - POS: NUM, - "NumType": "Sets", - "Gender": "Fem", - "Number": "Sing", - "Case": "Voc", - }, - "NmMlFePlAcAj": { - POS: NUM, - "NumType": "Mult", - "Gender": "Fem", - "Number": "Plur", - "Case": "Acc", - }, - "NmMlFePlGeAj": { - POS: NUM, - "NumType": "Mult", - "Gender": "Fem", - "Number": "Plur", - "Case": "Gen", - }, - "NmMlFePlNmAj": { - POS: NUM, - "NumType": "Mult", - "Gender": "Fem", - "Number": "Plur", - "Case": "Nom", - }, - "NmMlFePlVoAj": { - POS: NUM, - "NumType": "Mult", - "Gender": "Fem", - "Number": "Plur", - "Case": "Voc", - }, - "NmMlFeSgAcAj": { - POS: NUM, - "NumType": "Mult", - "Gender": "Fem", - "Number": "Sing", - "Case": "Acc", - }, - "NmMlFeSgGeAj": { - POS: NUM, - "NumType": "Mult", - "Gender": "Fem", - "Number": "Sing", - "Case": "Gen", - }, - "NmMlFeSgNmAj": { - POS: NUM, - "NumType": "Mult", - "Gender": "Fem", - "Number": "Sing", - "Case": "Nom", - }, - "NmMlFeSgVoAj": { - POS: NUM, - "NumType": "Mult", - "Gender": "Fem", - "Number": "Sing", - "Case": "Voc", - }, - "NmMlMaPlAcAj": { - POS: NUM, - "NumType": "Mult", - "Gender": "Masc", - "Number": "Plur", - "Case": "Acc", - }, - "NmMlMaPlGeAj": { - POS: NUM, - "NumType": "Mult", - "Gender": "Masc", - "Number": "Plur", - "Case": "Gen", - }, - "NmMlMaPlNmAj": { - POS: NUM, - "NumType": "Mult", - "Gender": "Masc", - "Number": "Plur", - "Case": "Nom", - }, - "NmMlMaPlVoAj": { - POS: NUM, - "NumType": "Mult", - "Gender": "Masc", - "Number": "Plur", - "Case": "Voc", - }, - "NmMlMaSgAcAj": { - POS: NUM, - "NumType": "Mult", - "Gender": "Masc", - "Number": "Sing", - "Case": "Acc", - }, - "NmMlMaSgGeAj": { - POS: NUM, - "NumType": "Mult", - "Gender": "Masc", - "Number": "Sing", - "Case": "Gen", - }, - "NmMlMaSgNmAj": { - POS: NUM, - "NumType": "Mult", - "Gender": "Masc", - "Number": "Sing", - "Case": "Nom", - }, - "NmMlMaSgVoAj": { - POS: NUM, - "NumType": "Mult", - "Gender": "Masc", - "Number": "Sing", - "Case": "Voc", - }, - "NmMlNePlAcAj": { - POS: NUM, - "NumType": "Mult", - "Gender": "Neut", - "Number": "Plur", - "Case": "Acc", - }, - 
"NmMlNePlGeAj": { - POS: NUM, - "NumType": "Mult", - "Gender": "Neut", - "Number": "Plur", - "Case": "Gen", - }, - "NmMlNePlNmAj": { - POS: NUM, - "NumType": "Mult", - "Gender": "Neut", - "Number": "Plur", - "Case": "Nom", - }, - "NmMlNePlVoAj": { - POS: NUM, - "NumType": "Mult", - "Gender": "Neut", - "Number": "Plur", - "Case": "Voc", - }, - "NmMlNeSgAcAj": { - POS: NUM, - "NumType": "Mult", - "Gender": "Neut", - "Number": "Sing", - "Case": "Acc", - }, - "NmMlNeSgGeAj": { - POS: NUM, - "NumType": "Mult", - "Gender": "Neut", - "Number": "Sing", - "Case": "Gen", - }, - "NmMlNeSgNmAj": { - POS: NUM, - "NumType": "Mult", - "Gender": "Neut", - "Number": "Sing", - "Case": "Nom", - }, - "NmMlNeSgVoAj": { - POS: NUM, - "NumType": "Mult", - "Gender": "Neut", - "Number": "Sing", - "Case": "Voc", - }, - "NmMlXxXxXxAd": { - POS: NUM, - "NumType": "Mult", - "Gender": "Masc|Fem|Neut", - "Number": "Sing|Plur", - "Case": "Acc|Gen|Nom|Voc", - }, - "NmOdFePlAcAj": { - POS: NUM, - "NumType": "Mult", - "Gender": "Fem", - "Number": "Plur", - "Case": "Acc", - }, - "NmOdFePlGeAj": { - POS: NUM, - "NumType": "Mult", - "Gender": "Fem", - "Number": "Plur", - "Case": "Gen", - }, - "NmOdFePlNmAj": { - POS: NUM, - "NumType": "Mult", - "Gender": "Fem", - "Number": "Plur", - "Case": "Nom", - }, - "NmOdFePlVoAj": { - POS: NUM, - "NumType": "Mult", - "Gender": "Fem", - "Number": "Plur", - "Case": "Voc", - }, - "NmOdFeSgAcAj": { - POS: NUM, - "NumType": "Ord", - "Gender": "Fem", - "Number": "Sing", - "Case": "Acc", - }, - "NmOdFeSgGeAj": { - POS: NUM, - "NumType": "Ord", - "Gender": "Fem", - "Number": "Sing", - "Case": "Gen", - }, - "NmOdFeSgNmAj": { - POS: NUM, - "NumType": "Ord", - "Gender": "Fem", - "Number": "Sing", - "Case": "Nom", - }, - "NmOdFeSgVoAj": { - POS: NUM, - "NumType": "Ord", - "Gender": "Fem", - "Number": "Sing", - "Case": "Voc", - }, - "NmOdMaPlAcAj": { - POS: NUM, - "NumType": "Ord", - "Gender": "Masc", - "Number": "Plur", - "Case": "Acc", - }, - "NmOdMaPlGeAj": { - POS: NUM, - "NumType": "Ord", - "Gender": "Masc", - "Number": "Plur", - "Case": "Gen", - }, - "NmOdMaPlNmAj": { - POS: NUM, - "NumType": "Ord", - "Gender": "Masc", - "Number": "Plur", - "Case": "Nom", - }, - "NmOdMaPlVoAj": { - POS: NUM, - "NumType": "Ord", - "Gender": "Masc", - "Number": "Plur", - "Case": "Voc", - }, - "NmOdMaSgAcAj": { - POS: NUM, - "NumType": "Ord", - "Gender": "Masc", - "Number": "Sing", - "Case": "Acc", - }, - "NmOdMaSgGeAj": { - POS: NUM, - "NumType": "Ord", - "Gender": "Masc", - "Number": "Sing", - "Case": "Gen", - }, - "NmOdMaSgNmAj": { - POS: NUM, - "NumType": "Ord", - "Gender": "Masc", - "Number": "Sing", - "Case": "Nom", - }, - "NmOdMaSgVoAj": { - POS: NUM, - "NumType": "Ord", - "Gender": "Masc", - "Number": "Sing", - "Case": "Voc", - }, - "NmOdNePlAcAj": { - POS: NUM, - "NumType": "Ord", - "Gender": "Neut", - "Number": "Plur", - "Case": "Acc", - }, - "NmOdNePlGeAj": { - POS: NUM, - "NumType": "Ord", - "Gender": "Neut", - "Number": "Plur", - "Case": "Gen", - }, - "NmOdNePlNmAj": { - POS: NUM, - "NumType": "Ord", - "Gender": "Neut", - "Number": "Plur", - "Case": "Nom", - }, - "NmOdNePlVoAj": { - POS: NUM, - "NumType": "Ord", - "Gender": "Neut", - "Number": "Plur", - "Case": "Voc", - }, - "NmOdNeSgAcAj": { - POS: NUM, - "NumType": "Ord", - "Gender": "Neut", - "Number": "Sing", - "Case": "Acc", - }, - "NmOdNeSgGeAj": { - POS: NUM, - "NumType": "Ord", - "Gender": "Neut", - "Number": "Sing", - "Case": "Gen", - }, - "NmOdNeSgNmAj": { - POS: NUM, - "NumType": "Ord", - "Gender": "Neut", - "Number": "Sing", - "Case": 
"Nom", - }, - "NmOdNeSgVoAj": { - POS: NUM, - "NumType": "Ord", - "Gender": "Neut", - "Number": "Sing", - "Case": "Voc", - }, - "NoCmFePlAc": {POS: NOUN, "Gender": "Fem", "Number": "Plur", "Case": "Acc"}, - "NoCmFePlDa": {POS: NOUN, "Gender": "Fem", "Number": "Plur", "Case": "Dat"}, - "NoCmFePlGe": {POS: NOUN, "Gender": "Fem", "Number": "Plur", "Case": "Gen"}, - "NoCmFePlNm": {POS: NOUN, "Gender": "Fem", "Number": "Plur", "Case": "Nom"}, - "NoCmFePlVo": {POS: NOUN, "Gender": "Fem", "Number": "Plur", "Case": "Voc"}, - "NoCmFeSgAc": {POS: NOUN, "Gender": "Fem", "Number": "Sing", "Case": "Acc"}, - "NoCmFeSgDa": {POS: NOUN, "Gender": "Fem", "Number": "Sing", "Case": "Dat"}, - "NoCmFeSgGe": {POS: NOUN, "Gender": "Fem", "Number": "Sing", "Case": "Gen"}, - "NoCmFeSgNm": {POS: NOUN, "Gender": "Fem", "Number": "Sing", "Case": "Nom"}, - "NoCmFeSgVo": {POS: NOUN, "Gender": "Fem", "Number": "Sing", "Case": "Voc"}, - "NoCmMaPlAc": {POS: NOUN, "Gender": "Masc", "Number": "Plur", "Case": "Acc"}, - "NoCmMaPlDa": {POS: NOUN, "Gender": "Masc", "Number": "Plur", "Case": "Dat"}, - "NoCmMaPlGe": {POS: NOUN, "Gender": "Masc", "Number": "Plur", "Case": "Gen"}, - "NoCmMaPlNm": {POS: NOUN, "Gender": "Masc", "Number": "Plur", "Case": "Nom"}, - "NoCmMaPlVo": {POS: NOUN, "Gender": "Masc", "Number": "Plur", "Case": "Voc"}, - "NoCmMaSgAc": {POS: NOUN, "Gender": "Masc", "Number": "Sing", "Case": "Acc"}, - "NoCmMaSgDa": {POS: NOUN, "Gender": "Masc", "Number": "Sing", "Case": "Dat"}, - "NoCmMaSgGe": {POS: NOUN, "Gender": "Masc", "Number": "Sing", "Case": "Gen"}, - "NoCmMaSgNm": {POS: NOUN, "Gender": "Masc", "Number": "Sing", "Case": "Nom"}, - "NoCmMaSgVo": {POS: NOUN, "Gender": "Masc", "Number": "Sing", "Case": "Voc"}, - "NoCmNePlAc": {POS: NOUN, "Gender": "Neut", "Number": "Plur", "Case": "Acc"}, - "NoCmNePlDa": {POS: NOUN, "Gender": "Neut", "Number": "Plur", "Case": "Dat"}, - "NoCmNePlGe": {POS: NOUN, "Gender": "Neut", "Number": "Plur", "Case": "Gen"}, - "NoCmNePlNm": {POS: NOUN, "Gender": "Neut", "Number": "Plur", "Case": "Nom"}, - "NoCmNePlVo": {POS: NOUN, "Gender": "Neut", "Number": "Plur", "Case": "Voc"}, - "NoCmNeSgAc": {POS: NOUN, "Gender": "Neut", "Number": "Sing", "Case": "Acc"}, - "NoCmNeSgDa": {POS: NOUN, "Gender": "Neut", "Number": "Sing", "Case": "Dat"}, - "NoCmNeSgGe": {POS: NOUN, "Gender": "Neut", "Number": "Sing", "Case": "Gen"}, - "NoCmNeSgNm": {POS: NOUN, "Gender": "Neut", "Number": "Sing", "Case": "Nom"}, - "NoCmNeSgVo": {POS: NOUN, "Gender": "Neut", "Number": "Sing", "Case": "Voc"}, - "NoPrFePlAc": {POS: PROPN, "Gender": "Fem", "Number": "Plur", "Case": "Acc"}, - "NoPrFePlDa": {POS: PROPN, "Gender": "Fem", "Number": "Plur", "Case": "Dat"}, - "NoPrFePlGe": {POS: PROPN, "Gender": "Fem", "Number": "Plur", "Case": "Gen"}, - "NoPrFePlNm": {POS: PROPN, "Gender": "Fem", "Number": "Plur", "Case": "Nom"}, - "NoPrFePlVo": {POS: PROPN, "Gender": "Fem", "Number": "Plur", "Case": "Voc"}, - "NoPrFeSgAc": {POS: PROPN, "Gender": "Fem", "Number": "Sing", "Case": "Acc"}, - "NoPrFeSgDa": {POS: PROPN, "Gender": "Fem", "Number": "Sing", "Case": "Dat"}, - "NoPrFeSgGe": {POS: PROPN, "Gender": "Fem", "Number": "Sing", "Case": "Gen"}, - "NoPrFeSgNm": {POS: PROPN, "Gender": "Fem", "Number": "Sing", "Case": "Nom"}, - "NoPrFeSgVo": {POS: PROPN, "Gender": "Fem", "Number": "Sing", "Case": "Voc"}, - "NoPrMaPlAc": {POS: PROPN, "Gender": "Masc", "Number": "Plur", "Case": "Acc"}, - "NoPrMaPlGe": {POS: PROPN, "Gender": "Masc", "Number": "Plur", "Case": "Gen"}, - "NoPrMaPlNm": {POS: PROPN, "Gender": "Masc", "Number": "Plur", "Case": 
"Nom"}, - "NoPrMaPlVo": {POS: PROPN, "Gender": "Masc", "Number": "Plur", "Case": "Voc"}, - "NoPrMaSgAc": {POS: PROPN, "Gender": "Masc", "Number": "Sing", "Case": "Acc"}, - "NoPrMaSgDa": {POS: PROPN, "Gender": "Masc", "Number": "Sing", "Case": "Dat"}, - "NoPrMaSgGe": {POS: PROPN, "Gender": "Masc", "Number": "Sing", "Case": "Gen"}, - "NoPrMaSgNm": {POS: PROPN, "Gender": "Masc", "Number": "Sing", "Case": "Nom"}, - "NoPrMaSgVo": {POS: PROPN, "Gender": "Masc", "Number": "Sing", "Case": "Voc"}, - "NoPrNePlAc": {POS: PROPN, "Gender": "Neut", "Number": "Plur", "Case": "Acc"}, - "NoPrNePlGe": {POS: PROPN, "Gender": "Neut", "Number": "Plur", "Case": "Gen"}, - "NoPrNePlNm": {POS: PROPN, "Gender": "Neut", "Number": "Plur", "Case": "Nom"}, - "NoPrNeSgAc": {POS: PROPN, "Gender": "Neut", "Number": "Sing", "Case": "Acc"}, - "NoPrNeSgGe": {POS: PROPN, "Gender": "Neut", "Number": "Sing", "Case": "Gen"}, - "NoPrNeSgNm": {POS: PROPN, "Gender": "Neut", "Number": "Sing", "Case": "Nom"}, - "OPUNCT": {POS: PUNCT}, - "PnDfFe03PlAcXx": { - POS: PRON, - "PronType": "", - "Gender": "Fem", - "Person": "3", - "Number": "Plur", - "Case": "Acc", - }, - "PnDfFe03SgAcXx": { - POS: PRON, - "PronType": "", - "Gender": "Fem", - "Person": "3", - "Number": "Sing", - "Case": "Acc", - }, - "PnDfMa03PlGeXx": { - POS: PRON, - "PronType": "", - "Gender": "Masc", - "Person": "3", - "Number": "Plur", - "Case": "Gen", - }, - "PnDmFe03PlAcXx": { - POS: PRON, - "PronType": "Dem", - "Gender": "Fem", - "Person": "3", - "Number": "Plur", - "Case": "Acc", - }, - "PnDmFe03PlGeXx": { - POS: PRON, - "PronType": "Dem", - "Gender": "Fem", - "Person": "3", - "Number": "Plur", - "Case": "Gen", - }, - "PnDmFe03PlNmXx": { - POS: PRON, - "PronType": "Dem", - "Gender": "Fem", - "Person": "3", - "Number": "Plur", - "Case": "Nom", - }, - "PnDmFe03SgAcXx": { - POS: PRON, - "PronType": "Dem", - "Gender": "Fem", - "Person": "3", - "Number": "Sing", - "Case": "Acc", - }, - "PnDmFe03SgDaXx": { - POS: PRON, - "PronType": "Dem", - "Gender": "Fem", - "Person": "3", - "Number": "Sing", - "Case": "Dat", - }, - "PnDmFe03SgGeXx": { - POS: PRON, - "PronType": "Dem", - "Gender": "Fem", - "Person": "3", - "Number": "Sing", - "Case": "Gen", - }, - "PnDmFe03SgNmXx": { - POS: PRON, - "PronType": "Dem", - "Gender": "Fem", - "Person": "3", - "Number": "Sing", - "Case": "Nom", - }, - "PnDmMa03PlAcXx": { - POS: PRON, - "PronType": "Dem", - "Gender": "Masc", - "Person": "3", - "Number": "Plur", - "Case": "Acc", - }, - "PnDmMa03PlDaXx": { - POS: PRON, - "PronType": "Dem", - "Gender": "Masc", - "Person": "3", - "Number": "Plur", - "Case": "Dat", - }, - "PnDmMa03PlGeXx": { - POS: PRON, - "PronType": "Dem", - "Gender": "Masc", - "Person": "3", - "Number": "Plur", - "Case": "Gen", - }, - "PnDmMa03PlNmXx": { - POS: PRON, - "PronType": "Dem", - "Gender": "Masc", - "Person": "3", - "Number": "Plur", - "Case": "Nom", - }, - "PnDmMa03SgAcXx": { - POS: PRON, - "PronType": "Dem", - "Gender": "Masc", - "Person": "3", - "Number": "Sing", - "Case": "Acc", - }, - "PnDmMa03SgGeXx": { - POS: PRON, - "PronType": "Dem", - "Gender": "Masc", - "Person": "3", - "Number": "Sing", - "Case": "Gen", - }, - "PnDmMa03SgNmXx": { - POS: PRON, - "PronType": "Dem", - "Gender": "Masc", - "Person": "3", - "Number": "Sing", - "Case": "Nom", - }, - "PnDmNe03PlAcXx": { - POS: PRON, - "PronType": "Dem", - "Gender": "Neut", - "Person": "3", - "Number": "Plur", - "Case": "Acc", - }, - "PnDmNe03PlDaXx": { - POS: PRON, - "PronType": "Dem", - "Gender": "Neut", - "Person": "3", - "Number": "Plur", - "Case": "Dat", - }, - 
"PnDmNe03PlGeXx": { - POS: PRON, - "PronType": "Dem", - "Gender": "Neut", - "Person": "3", - "Number": "Plur", - "Case": "Gen", - }, - "PnDmNe03PlNmXx": { - POS: PRON, - "PronType": "Dem", - "Gender": "Neut", - "Person": "3", - "Number": "Plur", - "Case": "Nom", - }, - "PnDmNe03SgAcXx": { - POS: PRON, - "PronType": "Dem", - "Gender": "Neut", - "Person": "3", - "Number": "Sing", - "Case": "Acc", - }, - "PnDmNe03SgDaXx": { - POS: PRON, - "PronType": "Dem", - "Gender": "Neut", - "Person": "3", - "Number": "Sing", - "Case": "Dat", - }, - "PnDmNe03SgGeXx": { - POS: PRON, - "PronType": "Dem", - "Gender": "Neut", - "Person": "3", - "Number": "Sing", - "Case": "Gen", - }, - "PnDmNe03SgNmXx": { - POS: PRON, - "PronType": "Dem", - "Gender": "Neut", - "Person": "3", - "Number": "Sing", - "Case": "Nom", - }, - "PnIdFe03PlAcXx": { - POS: PRON, - "PronType": "Ind", - "Gender": "Fem", - "Person": "3", - "Number": "Plur", - "Case": "Acc", - }, - "PnIdFe03PlGeXx": { - POS: PRON, - "PronType": "Ind", - "Gender": "Fem", - "Person": "3", - "Number": "Plur", - "Case": "Gen", - }, - "PnIdFe03PlNmXx": { - POS: PRON, - "PronType": "Ind", - "Gender": "Fem", - "Person": "3", - "Number": "Plur", - "Case": "Nom", - }, - "PnIdFe03SgAcXx": { - POS: PRON, - "PronType": "Ind", - "Gender": "Fem", - "Person": "3", - "Number": "Sing", - "Case": "Acc", - }, - "PnIdFe03SgGeXx": { - POS: PRON, - "PronType": "Ind", - "Gender": "Fem", - "Person": "3", - "Number": "Sing", - "Case": "Gen", - }, - "PnIdFe03SgNmXx": { - POS: PRON, - "PronType": "Ind", - "Gender": "Fem", - "Person": "3", - "Number": "Sing", - "Case": "Nom", - }, - "PnIdMa03PlAcXx": { - POS: PRON, - "PronType": "Ind", - "Gender": "Masc", - "Person": "3", - "Number": "Plur", - "Case": "Acc", - }, - "PnIdMa03PlGeXx": { - POS: PRON, - "PronType": "Ind", - "Gender": "Masc", - "Person": "3", - "Number": "Plur", - "Case": "Gen", - }, - "PnIdMa03PlNmXx": { - POS: PRON, - "PronType": "Ind", - "Gender": "Masc", - "Person": "3", - "Number": "Plur", - "Case": "Nom", - }, - "PnIdMa03SgAcXx": { - POS: PRON, - "PronType": "Ind", - "Gender": "Masc", - "Person": "3", - "Number": "Sing", - "Case": "Acc", - }, - "PnIdMa03SgGeXx": { - POS: PRON, - "PronType": "Ind", - "Gender": "Masc", - "Person": "3", - "Number": "Sing", - "Case": "Gen", - }, - "PnIdMa03SgNmXx": { - POS: PRON, - "PronType": "Ind", - "Gender": "Masc", - "Person": "3", - "Number": "Sing", - "Case": "Nom", - }, - "PnIdNe03PlAcXx": { - POS: PRON, - "PronType": "Ind", - "Gender": "Neut", - "Person": "3", - "Number": "Plur", - "Case": "Acc", - }, - "PnIdNe03PlGeXx": { - POS: PRON, - "PronType": "Ind", - "Gender": "Neut", - "Person": "3", - "Number": "Plur", - "Case": "Gen", - }, - "PnIdNe03PlNmXx": { - POS: PRON, - "PronType": "Ind", - "Gender": "Neut", - "Person": "3", - "Number": "Plur", - "Case": "Nom", - }, - "PnIdNe03SgAcXx": { - POS: PRON, - "PronType": "Ind", - "Gender": "Neut", - "Person": "3", - "Number": "Sing", - "Case": "Acc", - }, - "PnIdNe03SgDaXx": { - POS: PRON, - "PronType": "Ind", - "Gender": "Neut", - "Person": "3", - "Number": "Sing", - "Case": "Dat", - }, - "PnIdNe03SgGeXx": { - POS: PRON, - "PronType": "Ind", - "Gender": "Neut", - "Person": "3", - "Number": "Sing", - "Case": "Gen", - }, - "PnIdNe03SgNmXx": { - POS: PRON, - "PronType": "Ind", - "Gender": "Neut", - "Person": "3", - "Number": "Sing", - "Case": "Nom", - }, - "PnIrFe03PlAcXx": { - POS: PRON, - "PronType": "Int", - "Gender": "Fem", - "Person": "3", - "Number": "Plur", - "Case": "Acc", - }, - "PnIrFe03PlGeXx": { - POS: PRON, - "PronType": 
"Int", - "Gender": "Fem", - "Person": "3", - "Number": "Plur", - "Case": "Gen", - }, - "PnIrFe03PlNmXx": { - POS: PRON, - "PronType": "Int", - "Gender": "Fem", - "Person": "3", - "Number": "Plur", - "Case": "Nom", - }, - "PnIrFe03SgAcXx": { - POS: PRON, - "PronType": "Int", - "Gender": "Fem", - "Person": "3", - "Number": "Sing", - "Case": "Acc", - }, - "PnIrFe03SgGeXx": { - POS: PRON, - "PronType": "Int", - "Gender": "Fem", - "Person": "3", - "Number": "Sing", - "Case": "Gen", - }, - "PnIrFe03SgNmXx": { - POS: PRON, - "PronType": "Int", - "Gender": "Fem", - "Person": "3", - "Number": "Sing", - "Case": "Nom", - }, - "PnIrMa03PlAcXx": { - POS: PRON, - "PronType": "Int", - "Gender": "Masc", - "Person": "3", - "Number": "Plur", - "Case": "Acc", - }, - "PnIrMa03PlGeXx": { - POS: PRON, - "PronType": "Int", - "Gender": "Masc", - "Person": "3", - "Number": "Plur", - "Case": "Gen", - }, - "PnIrMa03PlNmXx": { - POS: PRON, - "PronType": "Int", - "Gender": "Masc", - "Person": "3", - "Number": "Plur", - "Case": "Nom", - }, - "PnIrMa03SgAcXx": { - POS: PRON, - "PronType": "Int", - "Gender": "Masc", - "Person": "3", - "Number": "Sing", - "Case": "Acc", - }, - "PnIrMa03SgGeXx": { - POS: PRON, - "PronType": "Int", - "Gender": "Masc", - "Person": "3", - "Number": "Sing", - "Case": "Gen", - }, - "PnIrMa03SgNmXx": { - POS: PRON, - "PronType": "Int", - "Gender": "Masc", - "Person": "3", - "Number": "Sing", - "Case": "Nom", - }, - "PnIrNe03PlAcXx": { - POS: PRON, - "PronType": "Int", - "Gender": "Neut", - "Person": "3", - "Number": "Plur", - "Case": "Acc", - }, - "PnIrNe03PlGeXx": { - POS: PRON, - "PronType": "Int", - "Gender": "Neut", - "Person": "3", - "Number": "Plur", - "Case": "Gen", - }, - "PnIrNe03PlNmXx": { - POS: PRON, - "PronType": "Int", - "Gender": "Neut", - "Person": "3", - "Number": "Plur", - "Case": "Nom", - }, - "PnIrNe03SgAcXx": { - POS: PRON, - "PronType": "Int", - "Gender": "Neut", - "Person": "3", - "Number": "Sing", - "Case": "Acc", - }, - "PnIrNe03SgGeXx": { - POS: PRON, - "PronType": "Int", - "Gender": "Neut", - "Person": "3", - "Number": "Sing", - "Case": "Gen", - }, - "PnIrNe03SgNmXx": { - POS: PRON, - "PronType": "Int", - "Gender": "Neut", - "Person": "3", - "Number": "Sing", - "Case": "Nom", - }, - "PnPeFe01PlAcSt": { - POS: PRON, - "PronType": "Prs", - "Gender": "Fem", - "Person": "1", - "Number": "Plur", - "Case": "Acc", - }, - "PnPeFe01PlAcWe": { - POS: PRON, - "PronType": "Prs", - "Gender": "Fem", - "Person": "1", - "Number": "Plur", - "Case": "Acc", - }, - "PnPeFe01PlGeWe": { - POS: PRON, - "PronType": "Prs", - "Gender": "Fem", - "Person": "1", - "Number": "Plur", - "Case": "Gen", - }, - "PnPeFe01PlNmSt": { - POS: PRON, - "PronType": "Prs", - "Gender": "Fem", - "Person": "1", - "Number": "Plur", - "Case": "Nom", - }, - "PnPeFe01SgAcSt": { - POS: PRON, - "PronType": "Prs", - "Gender": "Fem", - "Person": "1", - "Number": "Sing", - "Case": "Acc", - }, - "PnPeFe01SgAcWe": { - POS: PRON, - "PronType": "Prs", - "Gender": "Fem", - "Person": "1", - "Number": "Sing", - "Case": "Acc", - }, - "PnPeFe01SgGeSt": { - POS: PRON, - "PronType": "Prs", - "Gender": "Fem", - "Person": "1", - "Number": "Sing", - "Case": "Gen", - }, - "PnPeFe01SgGeWe": { - POS: PRON, - "PronType": "Prs", - "Gender": "Fem", - "Person": "1", - "Number": "Sing", - "Case": "Gen", - }, - "PnPeFe01SgNmSt": { - POS: PRON, - "PronType": "Prs", - "Gender": "Fem", - "Person": "1", - "Number": "Sing", - "Case": "Nom", - }, - "PnPeFe02PlAcSt": { - POS: PRON, - "PronType": "Prs", - "Gender": "Fem", - "Person": "2", - "Number": 
"Plur", - "Case": "Acc", - }, - "PnPeFe02PlAcWe": { - POS: PRON, - "PronType": "Prs", - "Gender": "Fem", - "Person": "2", - "Number": "Plur", - "Case": "Acc", - }, - "PnPeFe02PlGeSt": { - POS: PRON, - "PronType": "Prs", - "Gender": "Fem", - "Person": "2", - "Number": "Plur", - "Case": "Gen", - }, - "PnPeFe02PlGeWe": { - POS: PRON, - "PronType": "Prs", - "Gender": "Fem", - "Person": "2", - "Number": "Plur", - "Case": "Gen", - }, - "PnPeFe02PlNmSt": { - POS: PRON, - "PronType": "Prs", - "Gender": "Fem", - "Person": "2", - "Number": "Plur", - "Case": "Nom", - }, - "PnPeFe02SgAcSt": { - POS: PRON, - "PronType": "Prs", - "Gender": "Fem", - "Person": "2", - "Number": "Sing", - "Case": "Acc", - }, - "PnPeFe02SgAcWe": { - POS: PRON, - "PronType": "Prs", - "Gender": "Fem", - "Person": "2", - "Number": "Sing", - "Case": "Acc", - }, - "PnPeFe02SgGeWe": { - POS: PRON, - "PronType": "Prs", - "Gender": "Fem", - "Person": "2", - "Number": "Sing", - "Case": "Gen", - }, - "PnPeFe02SgNmSt": { - POS: PRON, - "PronType": "Prs", - "Gender": "Fem", - "Person": "2", - "Number": "Sing", - "Case": "Nom", - }, - "PnPeFe03PlAcSt": { - POS: PRON, - "PronType": "Prs", - "Gender": "Fem", - "Person": "3", - "Number": "Plur", - "Case": "Acc", - }, - "PnPeFe03PlAcWe": { - POS: PRON, - "PronType": "Prs", - "Gender": "Fem", - "Person": "3", - "Number": "Plur", - "Case": "Acc", - }, - "PnPeFe03PlGeSt": { - POS: PRON, - "PronType": "Prs", - "Gender": "Fem", - "Person": "3", - "Number": "Plur", - "Case": "Gen", - }, - "PnPeFe03PlGeWe": { - POS: PRON, - "PronType": "Prs", - "Gender": "Fem", - "Person": "3", - "Number": "Plur", - "Case": "Gen", - }, - "PnPeFe03PlNmSt": { - POS: PRON, - "PronType": "Prs", - "Gender": "Fem", - "Person": "3", - "Number": "Plur", - "Case": "Nom", - }, - "PnPeFe03SgAcSt": { - POS: PRON, - "PronType": "Prs", - "Gender": "Fem", - "Person": "3", - "Number": "Sing", - "Case": "Acc", - }, - "PnPeFe03SgAcWe": { - POS: PRON, - "PronType": "Prs", - "Gender": "Fem", - "Person": "3", - "Number": "Sing", - "Case": "Acc", - }, - "PnPeFe03SgGeSt": { - POS: PRON, - "PronType": "Prs", - "Gender": "Fem", - "Person": "3", - "Number": "Sing", - "Case": "Gen", - }, - "PnPeFe03SgGeWe": { - POS: PRON, - "PronType": "Prs", - "Gender": "Fem", - "Person": "3", - "Number": "Sing", - "Case": "Gen", - }, - "PnPeMa01PlAcSt": { - POS: PRON, - "PronType": "Prs", - "Gender": "Masc", - "Person": "1", - "Number": "Plur", - "Case": "Acc", - }, - "PnPeMa01PlAcWe": { - POS: PRON, - "PronType": "Prs", - "Gender": "Masc", - "Person": "1", - "Number": "Plur", - "Case": "Acc", - }, - "PnPeMa01PlDaSt": { - POS: PRON, - "PronType": "Prs", - "Gender": "Masc", - "Person": "1", - "Number": "Plur", - "Case": "Dat", - }, - "PnPeMa01PlGeSt": { - POS: PRON, - "PronType": "Prs", - "Gender": "Masc", - "Person": "1", - "Number": "Plur", - "Case": "Gen", - }, - "PnPeMa01PlGeWe": { - POS: PRON, - "PronType": "Prs", - "Gender": "Masc", - "Person": "1", - "Number": "Plur", - "Case": "Gen", - }, - "PnPeMa01PlNmSt": { - POS: PRON, - "PronType": "Prs", - "Gender": "Masc", - "Person": "1", - "Number": "Plur", - "Case": "Nom", - }, - "PnPeMa01SgAcSt": { - POS: PRON, - "PronType": "Prs", - "Gender": "Masc", - "Person": "1", - "Number": "Sing", - "Case": "Acc", - }, - "PnPeMa01SgAcWe": { - POS: PRON, - "PronType": "Prs", - "Gender": "Masc", - "Person": "1", - "Number": "Sing", - "Case": "Acc", - }, - "PnPeMa01SgGeSt": { - POS: PRON, - "PronType": "Prs", - "Gender": "Masc", - "Person": "1", - "Number": "Sing", - "Case": "Gen", - }, - "PnPeMa01SgGeWe": { - POS: 
PRON, - "PronType": "Prs", - "Gender": "Masc", - "Person": "1", - "Number": "Sing", - "Case": "Gen", - }, - "PnPeMa01SgNmSt": { - POS: PRON, - "PronType": "Prs", - "Gender": "Masc", - "Person": "1", - "Number": "Sing", - "Case": "Nom", - }, - "PnPeMa02PlAcSt": { - POS: PRON, - "PronType": "Prs", - "Gender": "Masc", - "Person": "2", - "Number": "Plur", - "Case": "Acc", - }, - "PnPeMa02PlAcWe": { - POS: PRON, - "PronType": "Prs", - "Gender": "Masc", - "Person": "2", - "Number": "Plur", - "Case": "Acc", - }, - "PnPeMa02PlGeWe": { - POS: PRON, - "PronType": "Prs", - "Gender": "Masc", - "Person": "2", - "Number": "Plur", - "Case": "Gen", - }, - "PnPeMa02PlNmSt": { - POS: PRON, - "PronType": "Prs", - "Gender": "Masc", - "Person": "2", - "Number": "Plur", - "Case": "Nom", - }, - "PnPeMa02PlVoSt": { - POS: PRON, - "PronType": "Prs", - "Gender": "Masc", - "Person": "2", - "Number": "Plur", - "Case": "Voc", - }, - "PnPeMa02SgAcSt": { - POS: PRON, - "PronType": "Prs", - "Gender": "Masc", - "Person": "2", - "Number": "Sing", - "Case": "Acc", - }, - "PnPeMa02SgAcWe": { - POS: PRON, - "PronType": "Prs", - "Gender": "Masc", - "Person": "2", - "Number": "Sing", - "Case": "Acc", - }, - "PnPeMa02SgGeWe": { - POS: PRON, - "PronType": "Prs", - "Gender": "Masc", - "Person": "2", - "Number": "Sing", - "Case": "Gen", - }, - "PnPeMa02SgNmSt": { - POS: PRON, - "PronType": "Prs", - "Gender": "Masc", - "Person": "2", - "Number": "Sing", - "Case": "Nom", - }, - "PnPeMa03PlAcWe": { - POS: PRON, - "PronType": "Prs", - "Gender": "Masc", - "Person": "3", - "Number": "Plur", - "Case": "Acc", - }, - "PnPeMa03PlGeSt": { - POS: PRON, - "PronType": "Prs", - "Gender": "Masc", - "Person": "3", - "Number": "Plur", - "Case": "Gen", - }, - "PnPeMa03PlGeWe": { - POS: PRON, - "PronType": "Prs", - "Gender": "Masc", - "Person": "3", - "Number": "Plur", - "Case": "Gen", - }, - "PnPeMa03PlNmSt": { - POS: PRON, - "PronType": "Prs", - "Gender": "Masc", - "Person": "3", - "Number": "Plur", - "Case": "Nom", - }, - "PnPeMa03SgAcSt": { - POS: PRON, - "PronType": "Prs", - "Gender": "Masc", - "Person": "3", - "Number": "Sing", - "Case": "Acc", - }, - "PnPeMa03SgAcWe": { - POS: PRON, - "PronType": "Prs", - "Gender": "Masc", - "Person": "3", - "Number": "Sing", - "Case": "Acc", - }, - "PnPeMa03SgGeSt": { - POS: PRON, - "PronType": "Prs", - "Gender": "Masc", - "Person": "3", - "Number": "Sing", - "Case": "Gen", - }, - "PnPeMa03SgGeWe": { - POS: PRON, - "PronType": "Prs", - "Gender": "Masc", - "Person": "3", - "Number": "Sing", - "Case": "Gen", - }, - "PnPeMa03SgNmWe": { - POS: PRON, - "PronType": "Prs", - "Gender": "Masc", - "Person": "3", - "Number": "Sing", - "Case": "Nom", - }, - "PnPeNe03PlAcWe": { - POS: PRON, - "PronType": "Prs", - "Gender": "Neut", - "Person": "3", - "Number": "Plur", - "Case": "Acc", - }, - "PnPeNe03PlGeSt": { - POS: PRON, - "PronType": "Prs", - "Gender": "Neut", - "Person": "3", - "Number": "Plur", - "Case": "Gen", - }, - "PnPeNe03PlGeWe": { - POS: PRON, - "PronType": "Prs", - "Gender": "Neut", - "Person": "3", - "Number": "Plur", - "Case": "Gen", - }, - "PnPeNe03SgAcSt": { - POS: PRON, - "PronType": "Prs", - "Gender": "Neut", - "Person": "3", - "Number": "Sing", - "Case": "Acc", - }, - "PnPeNe03SgAcWe": { - POS: PRON, - "PronType": "Prs", - "Gender": "Neut", - "Person": "3", - "Number": "Sing", - "Case": "Acc", - }, - "PnPeNe03SgGeSt": { - POS: PRON, - "PronType": "Prs", - "Gender": "Neut", - "Person": "3", - "Number": "Sing", - "Case": "Gen", - }, - "PnPeNe03SgGeWe": { - POS: PRON, - "PronType": "Prs", - "Gender": 
"Neut", - "Person": "3", - "Number": "Sing", - "Case": "Gen", - }, - "PnPoFe01PlGeXx": { - POS: PRON, - "Poss": "Yes", - "Gender": "Fem", - "Person": "1", - "Number": "Plur", - "Case": "Gen", - }, - "PnPoFe01SgGeXx": { - POS: PRON, - "Poss": "Yes", - "Gender": "Fem", - "Person": "1", - "Number": "Sing", - "Case": "Gen", - }, - "PnPoFe02PlGeXx": { - POS: PRON, - "Poss": "Yes", - "Gender": "Fem", - "Person": "2", - "Number": "Plur", - "Case": "Gen", - }, - "PnPoFe02SgGeXx": { - POS: PRON, - "Poss": "Yes", - "Gender": "Fem", - "Person": "2", - "Number": "Sing", - "Case": "Gen", - }, - "PnPoFe03PlGeXx": { - POS: PRON, - "Poss": "Yes", - "Gender": "Fem", - "Person": "3", - "Number": "Plur", - "Case": "Gen", - }, - "PnPoFe03SgGeXx": { - POS: PRON, - "Poss": "Yes", - "Gender": "Fem", - "Person": "3", - "Number": "Sing", - "Case": "Gen", - }, - "PnPoMa01PlGeXx": { - POS: PRON, - "Poss": "Yes", - "Gender": "Masc", - "Person": "1", - "Number": "Plur", - "Case": "Gen", - }, - "PnPoMa01SgGeXx": { - POS: PRON, - "Poss": "Yes", - "Gender": "Masc", - "Person": "1", - "Number": "Sing", - "Case": "Gen", - }, - "PnPoMa02PlGeXx": { - POS: PRON, - "Poss": "Yes", - "Gender": "Masc", - "Person": "2", - "Number": "Plur", - "Case": "Gen", - }, - "PnPoMa02SgGeXx": { - POS: PRON, - "Poss": "Yes", - "Gender": "Masc", - "Person": "2", - "Number": "Sing", - "Case": "Gen", - }, - "PnPoMa03PlGeXx": { - POS: PRON, - "Poss": "Yes", - "Gender": "Masc", - "Person": "3", - "Number": "Plur", - "Case": "Gen", - }, - "PnPoMa03SgGeXx": { - POS: PRON, - "Poss": "Yes", - "Gender": "Masc", - "Person": "3", - "Number": "Sing", - "Case": "Gen", - }, - "PnPoNe03PlGeXx": { - POS: PRON, - "Poss": "Yes", - "Gender": "Neut", - "Person": "3", - "Number": "Plur", - "Case": "Gen", - }, - "PnPoNe03SgGeXx": { - POS: PRON, - "Poss": "Yes", - "Gender": "Neut", - "Person": "3", - "Number": "Sing", - "Case": "Gen", - }, - "PnReFe03PlAcXx": { - POS: PRON, - "PronType": "Rel", - "Gender": "Fem", - "Person": "3", - "Number": "Plur", - "Case": "Acc", - }, - "PnReFe03PlGeXx": { - POS: PRON, - "PronType": "Rel", - "Gender": "Fem", - "Person": "3", - "Number": "Plur", - "Case": "Gen", - }, - "PnReFe03PlNmXx": { - POS: PRON, - "PronType": "Rel", - "Gender": "Fem", - "Person": "3", - "Number": "Plur", - "Case": "Nom", - }, - "PnReFe03SgAcXx": { - POS: PRON, - "PronType": "Rel", - "Gender": "Fem", - "Person": "3", - "Number": "Sing", - "Case": "Acc", - }, - "PnReFe03SgGeXx": { - POS: PRON, - "PronType": "Rel", - "Gender": "Fem", - "Person": "3", - "Number": "Sing", - "Case": "Gen", - }, - "PnReFe03SgNmXx": { - POS: PRON, - "PronType": "Rel", - "Gender": "Fem", - "Person": "3", - "Number": "Sing", - "Case": "Nom", - }, - "PnReMa03PlAcXx": { - POS: PRON, - "PronType": "Rel", - "Gender": "Masc", - "Person": "3", - "Number": "Plur", - "Case": "Acc", - }, - "PnReMa03PlGeXx": { - POS: PRON, - "PronType": "Rel", - "Gender": "Masc", - "Person": "3", - "Number": "Plur", - "Case": "Gen", - }, - "PnReMa03PlNmXx": { - POS: PRON, - "PronType": "Rel", - "Gender": "Masc", - "Person": "3", - "Number": "Plur", - "Case": "Nom", - }, - "PnReMa03SgAcXx": { - POS: PRON, - "PronType": "Rel", - "Gender": "Masc", - "Person": "3", - "Number": "Sing", - "Case": "Acc", - }, - "PnReMa03SgGeXx": { - POS: PRON, - "PronType": "Rel", - "Gender": "Masc", - "Person": "3", - "Number": "Sing", - "Case": "Gen", - }, - "PnReMa03SgNmXx": { - POS: PRON, - "PronType": "Rel", - "Gender": "Masc", - "Person": "3", - "Number": "Sing", - "Case": "Nom", - }, - "PnReNe03PlAcXx": { - POS: PRON, - 
"PronType": "Rel", - "Gender": "Neut", - "Person": "3", - "Number": "Plur", - "Case": "Acc", - }, - "PnReNe03PlGeXx": { - POS: PRON, - "PronType": "Rel", - "Gender": "Neut", - "Person": "3", - "Number": "Plur", - "Case": "Gen", - }, - "PnReNe03PlNmXx": { - POS: PRON, - "PronType": "Rel", - "Gender": "Neut", - "Person": "3", - "Number": "Plur", - "Case": "Nom", - }, - "PnReNe03SgAcXx": { - POS: PRON, - "PronType": "Rel", - "Gender": "Neut", - "Person": "3", - "Number": "Sing", - "Case": "Acc", - }, - "PnReNe03SgGeXx": { - POS: PRON, - "PronType": "Rel", - "Gender": "Neut", - "Person": "3", - "Number": "Sing", - "Case": "Gen", - }, - "PnReNe03SgNmXx": { - POS: PRON, - "PronType": "Rel", - "Gender": "Neut", - "Person": "3", - "Number": "Sing", - "Case": "Nom", - }, - "PnRiFe03PlAcXx": { - POS: PRON, - "PronType": "Ind,Rel", - "Gender": "Fem", - "Person": "3", - "Number": "Plur", - "Case": "Acc", - }, - "PnRiFe03PlGeXx": { - POS: PRON, - "PronType": "Ind,Rel", - "Gender": "Fem", - "Person": "3", - "Number": "Plur", - "Case": "Gen", - }, - "PnRiFe03PlNmXx": { - POS: PRON, - "PronType": "Ind,Rel", - "Gender": "Fem", - "Person": "3", - "Number": "Plur", - "Case": "Nom", - }, - "PnRiFe03SgAcXx": { - POS: PRON, - "PronType": "Ind,Rel", - "Gender": "Fem", - "Person": "3", - "Number": "Sing", - "Case": "Acc", - }, - "PnRiFe03SgGeXx": { - POS: PRON, - "PronType": "Ind,Rel", - "Gender": "Fem", - "Person": "3", - "Number": "Sing", - "Case": "Gen", - }, - "PnRiFe03SgNmXx": { - POS: PRON, - "PronType": "Ind,Rel", - "Gender": "Fem", - "Person": "3", - "Number": "Sing", - "Case": "Nom", - }, - "PnRiMa03PlAcXx": { - POS: PRON, - "PronType": "Ind,Rel", - "Gender": "Masc", - "Person": "3", - "Number": "Plur", - "Case": "Acc", - }, - "PnRiMa03PlGeXx": { - POS: PRON, - "PronType": "Ind,Rel", - "Gender": "Masc", - "Person": "3", - "Number": "Plur", - "Case": "Gen", - }, - "PnRiMa03PlNmXx": { - POS: PRON, - "PronType": "Ind,Rel", - "Gender": "Masc", - "Person": "3", - "Number": "Plur", - "Case": "Nom", - }, - "PnRiMa03SgAcXx": { - POS: PRON, - "PronType": "Ind,Rel", - "Gender": "Masc", - "Person": "3", - "Number": "Sing", - "Case": "Acc", - }, - "PnRiMa03SgGeXx": { - POS: PRON, - "PronType": "Ind,Rel", - "Gender": "Masc", - "Person": "3", - "Number": "Sing", - "Case": "Gen", - }, - "PnRiMa03SgNmXx": { - POS: PRON, - "PronType": "Ind,Rel", - "Gender": "Masc", - "Person": "3", - "Number": "Sing", - "Case": "Nom", - }, - "PnRiNe03PlAcXx": { - POS: PRON, - "PronType": "Ind,Rel", - "Gender": "Neut", - "Person": "3", - "Number": "Plur", - "Case": "Acc", - }, - "PnRiNe03PlGeXx": { - POS: PRON, - "PronType": "Ind,Rel", - "Gender": "Neut", - "Person": "3", - "Number": "Plur", - "Case": "Gen", - }, - "PnRiNe03PlNmXx": { - POS: PRON, - "PronType": "Ind,Rel", - "Gender": "Neut", - "Person": "3", - "Number": "Plur", - "Case": "Nom", - }, - "PnRiNe03SgAcXx": { - POS: PRON, - "PronType": "Ind,Rel", - "Gender": "Neut", - "Person": "3", - "Number": "Sing", - "Case": "Acc", - }, - "PnRiNe03SgGeXx": { - POS: PRON, - "PronType": "Ind,Rel", - "Gender": "Neut", - "Person": "3", - "Number": "Sing", - "Case": "Gen", - }, - "PnRiNe03SgNmXx": { - POS: PRON, - "PronType": "Ind,Rel", - "Gender": "Neut", - "Person": "3", - "Number": "Sing", - "Case": "Nom", - }, - "PTERM_P": {POS: PUNCT}, - "PtFu": {POS: PART}, - "PtNg": {POS: PART}, - "PtOt": {POS: PART}, - "PtSj": {POS: PART}, - "Pu": {POS: SYM}, - "PUNCT": {POS: PUNCT}, - "RgAbXx": {POS: X}, - "RgAnXx": {POS: X}, - "RgFwOr": {POS: X, "Foreign": "Yes"}, - "RgFwTr": {POS: X, "Foreign": 
"Yes"}, - "RgSyXx": {POS: SYM}, - "VbIsIdPa03SgXxIpAvXx": { - POS: VERB, - "VerbForm": "Fin", - "Mood": "Ind", - "Tense": "Past", - "Person": "3", - "Number": "Sing", - "Gender": "Masc|Fem|Neut", - "Aspect": "Imp", - "Voice": "Act", - "Case": "Nom|Gen|Dat|Acc|Voc", - }, - "VbIsIdPa03SgXxIpPvXx": { - POS: VERB, - "VerbForm": "Fin", - "Mood": "Ind", - "Tense": "Past", - "Person": "3", - "Number": "Sing", - "Gender": "Masc|Fem|Neut", - "Aspect": "Imp", - "Voice": "Pass", - "Case": "Nom|Gen|Dat|Acc|Voc", - }, - "VbIsIdPa03SgXxPeAvXx": { - POS: VERB, - "VerbForm": "Fin", - "Mood": "Ind", - "Tense": "Past", - "Person": "3", - "Number": "Sing", - "Gender": "Masc|Fem|Neut", - "Aspect": "Perf", - "Voice": "Act", - "Case": "Nom|Gen|Dat|Acc|Voc", - }, - "VbIsIdPa03SgXxPePvXx": { - POS: VERB, - "VerbForm": "Fin", - "Mood": "Ind", - "Tense": "Past", - "Person": "3", - "Number": "Sing", - "Gender": "Masc|Fem|Neut", - "Aspect": "Perf", - "Voice": "Pass", - "Case": "Nom|Gen|Dat|Acc|Voc", - }, - "VbIsIdPr03SgXxIpAvXx": { - POS: VERB, - "VerbForm": "Fin", - "Mood": "Ind", - "Tense": "Pres", - "Person": "3", - "Number": "Sing", - "Gender": "Masc|Fem|Neut", - "Aspect": "Imp", - "Voice": "Act", - "Case": "Nom|Gen|Dat|Acc|Voc", - }, - "VbIsIdPr03SgXxIpPvXx": { - POS: VERB, - "VerbForm": "Fin", - "Mood": "Ind", - "Tense": "Pres", - "Person": "3", - "Number": "Sing", - "Gender": "Masc|Fem|Neut", - "Aspect": "Imp", - "Voice": "Pass", - "Case": "Nom|Gen|Dat|Acc|Voc", - }, - "VbIsIdXx03SgXxPeAvXx": { - POS: VERB, - "VerbForm": "Fin", - "Mood": "Ind", - "Tense": "Pres|Past", - "Person": "3", - "Number": "Sing", - "Gender": "Masc|Fem|Neut", - "Aspect": "Perf", - "Voice": "Act", - "Case": "Nom|Gen|Dat|Acc|Voc", - }, - "VbIsIdXx03SgXxPePvXx": { - POS: VERB, - "VerbForm": "Fin", - "Mood": "Ind", - "Tense": "Pres|Past", - "Person": "3", - "Number": "Sing", - "Gender": "Masc|Fem|Neut", - "Aspect": "Perf", - "Voice": "Pass", - "Case": "Nom|Gen|Dat|Acc|Voc", - }, - "VbIsNfXxXxXxXxPeAvXx": { - POS: VERB, - "VerbForm": "Inf", - "Mood": "", - "Tense": "Pres|Past", - "Person": "1|2|3", - "Number": "Sing|Plur", - "Gender": "Masc|Fem|Neut", - "Aspect": "Perf", - "Voice": "Pass", - "Case": "Nom|Gen|Dat|Acc|Voc", - }, - "VbMnIdPa01PlXxIpAvXx": { - POS: VERB, - "VerbForm": "Fin", - "Mood": "Ind", - "Tense": "Past", - "Person": "1", - "Number": "Plur", - "Gender": "Masc|Fem|Neut", - "Aspect": "Imp", - "Voice": "Act", - "Case": "Nom|Gen|Dat|Acc|Voc", - }, - "VbMnIdPa01PlXxIpPvXx": { - POS: VERB, - "VerbForm": "Fin", - "Mood": "Ind", - "Tense": "Past", - "Person": "1", - "Number": "Plur", - "Gender": "Masc|Fem|Neut", - "Aspect": "Imp", - "Voice": "Pass", - "Case": "Nom|Gen|Dat|Acc|Voc", - }, - "VbMnIdPa01PlXxPeAvXx": { - POS: VERB, - "VerbForm": "Fin", - "Mood": "Ind", - "Tense": "Past", - "Person": "1", - "Number": "Plur", - "Gender": "Masc|Fem|Neut", - "Aspect": "Perf", - "Voice": "Act", - "Case": "Nom|Gen|Dat|Acc|Voc", - }, - "VbMnIdPa01PlXxPePvXx": { - POS: VERB, - "VerbForm": "Fin", - "Mood": "Ind", - "Tense": "Past", - "Person": "1", - "Number": "Plur", - "Gender": "Masc|Fem|Neut", - "Aspect": "Perf", - "Voice": "Pass", - "Case": "Nom|Gen|Dat|Acc|Voc", - }, - "VbMnIdPa01SgXxIpAvXx": { - POS: VERB, - "VerbForm": "Fin", - "Mood": "Ind", - "Tense": "Past", - "Person": "1", - "Number": "Sing", - "Gender": "Masc|Fem|Neut", - "Aspect": "Imp", - "Voice": "Act", - "Case": "Nom|Gen|Dat|Acc|Voc", - }, - "VbMnIdPa01SgXxIpPvXx": { - POS: VERB, - "VerbForm": "Fin", - "Mood": "Ind", - "Tense": "Past", - "Person": "1", - "Number": "Sing", - 
"Gender": "Masc|Fem|Neut", - "Aspect": "Imp", - "Voice": "Pass", - "Case": "Nom|Gen|Dat|Acc|Voc", - }, - "VbMnIdPa01SgXxPeAvXx": { - POS: VERB, - "VerbForm": "Fin", - "Mood": "Ind", - "Tense": "Past", - "Person": "1|2|3", - "Number": "Sing", - "Gender": "Masc|Fem|Neut", - "Aspect": "Perf", - "Voice": "Act", - "Case": "Nom|Gen|Dat|Acc|Voc", - }, - "VbMnIdPa01SgXxPePvXx": { - POS: VERB, - "VerbForm": "Fin", - "Mood": "Ind", - "Tense": "Past", - "Person": "1", - "Number": "Sing", - "Gender": "Masc|Fem|Neut", - "Aspect": "Perf", - "Voice": "Pass", - "Case": "Nom|Gen|Dat|Acc|Voc", - }, - "VbMnIdPa02PlXxIpAvXx": { - POS: VERB, - "VerbForm": "Fin", - "Mood": "Ind", - "Tense": "Past", - "Person": "2", - "Number": "Plur", - "Gender": "Masc|Fem|Neut", - "Aspect": "Imp", - "Voice": "Act", - "Case": "Nom|Gen|Dat|Acc|Voc", - }, - "VbMnIdPa02PlXxIpPvXx": { - POS: VERB, - "VerbForm": "Fin", - "Mood": "Ind", - "Tense": "Past", - "Person": "2", - "Number": "Plur", - "Gender": "Masc|Fem|Neut", - "Aspect": "Imp", - "Voice": "Pass", - "Case": "Nom|Gen|Dat|Acc|Voc", - }, - "VbMnIdPa02PlXxPeAvXx": { - POS: VERB, - "VerbForm": "Fin", - "Mood": "Ind", - "Tense": "Past", - "Person": "2", - "Number": "Plur", - "Gender": "Masc|Fem|Neut", - "Aspect": "Perf", - "Voice": "Act", - "Case": "Nom|Gen|Dat|Acc|Voc", - }, - "VbMnIdPa02PlXxPePvXx": { - POS: VERB, - "VerbForm": "Fin", - "Mood": "Ind", - "Tense": "Past", - "Person": "2", - "Number": "Plur", - "Gender": "Masc|Fem|Neut", - "Aspect": "Perf", - "Voice": "Pass", - "Case": "Nom|Gen|Dat|Acc|Voc", - }, - "VbMnIdPa02SgXxIpAvXx": { - POS: VERB, - "VerbForm": "Fin", - "Mood": "Ind", - "Tense": "Past", - "Person": "2", - "Number": "Sing", - "Gender": "Masc|Fem|Neut", - "Aspect": "Imp", - "Voice": "Act", - "Case": "Nom|Gen|Dat|Acc|Voc", - }, - "VbMnIdPa02SgXxIpPvXx": { - POS: VERB, - "VerbForm": "Fin", - "Mood": "Ind", - "Tense": "Past", - "Person": "2", - "Number": "Sing", - "Gender": "Masc|Fem|Neut", - "Aspect": "Imp", - "Voice": "Pass", - "Case": "Nom|Gen|Dat|Acc|Voc", - }, - "VbMnIdPa02SgXxPeAvXx": { - POS: VERB, - "VerbForm": "Fin", - "Mood": "Ind", - "Tense": "Past", - "Person": "2", - "Number": "Sing", - "Gender": "Masc|Fem|Neut", - "Aspect": "Perf", - "Voice": "Act", - "Case": "Nom|Gen|Dat|Acc|Voc", - }, - "VbMnIdPa02SgXxPePvXx": { - POS: VERB, - "VerbForm": "Fin", - "Mood": "Ind", - "Tense": "Past", - "Person": "2", - "Number": "Sing", - "Gender": "Masc|Fem|Neut", - "Aspect": "Perf", - "Voice": "Pass", - "Case": "Nom|Gen|Dat|Acc|Voc", - }, - "VbMnIdPa03PlXxIpAvXx": { - POS: VERB, - "VerbForm": "Fin", - "Mood": "Ind", - "Tense": "Past", - "Person": "3", - "Number": "Plur", - "Gender": "Masc|Fem|Neut", - "Aspect": "Imp", - "Voice": "Act", - "Case": "Nom|Gen|Dat|Acc|Voc", - }, - "VbMnIdPa03PlXxIpPvXx": { - POS: VERB, - "VerbForm": "Fin", - "Mood": "Ind", - "Tense": "Past", - "Person": "3", - "Number": "Plur", - "Gender": "Masc|Fem|Neut", - "Aspect": "Imp", - "Voice": "Pass", - "Case": "Nom|Gen|Dat|Acc|Voc", - }, - "VbMnIdPa03PlXxPeAvXx": { - POS: VERB, - "VerbForm": "Fin", - "Mood": "Ind", - "Tense": "Past", - "Person": "3", - "Number": "Plur", - "Gender": "Masc|Fem|Neut", - "Aspect": "Perf", - "Voice": "Act", - "Case": "Nom|Gen|Dat|Acc|Voc", - }, - "VbMnIdPa03PlXxPePvXx": { - POS: VERB, - "VerbForm": "Fin", - "Mood": "Ind", - "Tense": "Past", - "Person": "3", - "Number": "Plur", - "Gender": "Masc|Fem|Neut", - "Aspect": "Perf", - "Voice": "Pass", - "Case": "Nom|Gen|Dat|Acc|Voc", - }, - "VbMnIdPa03SgXxIpAvXx": { - POS: VERB, - "VerbForm": "Fin", - "Mood": "Ind", - 
"Tense": "Past", - "Person": "3", - "Number": "Sing", - "Gender": "Masc|Fem|Neut", - "Aspect": "Imp", - "Voice": "Act", - "Case": "Nom|Gen|Dat|Acc|Voc", - }, - "VbMnIdPa03SgXxIpPvXx": { - POS: VERB, - "VerbForm": "Fin", - "Mood": "Ind", - "Tense": "Past", - "Person": "3", - "Number": "Sing", - "Gender": "Masc|Fem|Neut", - "Aspect": "Imp", - "Voice": "Pass", - "Case": "Nom|Gen|Dat|Acc|Voc", - }, - "VbMnIdPa03SgXxPeAvXx": { - POS: VERB, - "VerbForm": "Fin", - "Mood": "Ind", - "Tense": "Past", - "Person": "3", - "Number": "Sing", - "Gender": "Masc|Fem|Neut", - "Aspect": "Perf", - "Voice": "Act", - "Case": "Nom|Gen|Dat|Acc|Voc", - }, - "VbMnIdPa03SgXxPePvXx": { - POS: VERB, - "VerbForm": "Fin", - "Mood": "Ind", - "Tense": "Past", - "Person": "3", - "Number": "Sing", - "Gender": "Masc|Fem|Neut", - "Aspect": "Perf", - "Voice": "Pass", - "Case": "Nom|Gen|Dat|Acc|Voc", - }, - "VbMnIdPr01PlXxIpAvXx": { - POS: VERB, - "VerbForm": "Fin", - "Mood": "Ind", - "Tense": "Pres", - "Person": "1", - "Number": "Plur", - "Gender": "Masc|Fem|Neut", - "Aspect": "Imp", - "Voice": "Act", - "Case": "Nom|Gen|Dat|Acc|Voc", - }, - "VbMnIdPr01PlXxIpPvXx": { - POS: VERB, - "VerbForm": "Fin", - "Mood": "Ind", - "Tense": "Pres", - "Person": "1", - "Number": "Plur", - "Gender": "Masc|Fem|Neut", - "Aspect": "Imp", - "Voice": "Pass", - "Case": "Nom|Gen|Dat|Acc|Voc", - }, - "VbMnIdPr01SgXxIpAvXx": { - POS: VERB, - "VerbForm": "Fin", - "Mood": "Ind", - "Tense": "Pres", - "Person": "1", - "Number": "Sing", - "Gender": "Masc|Fem|Neut", - "Aspect": "Imp", - "Voice": "Act", - "Case": "Nom|Gen|Dat|Acc|Voc", - }, - "VbMnIdPr01SgXxIpPvXx": { - POS: VERB, - "VerbForm": "Fin", - "Mood": "Ind", - "Tense": "Pres", - "Person": "1", - "Number": "Sing", - "Gender": "Masc|Fem|Neut", - "Aspect": "Imp", - "Voice": "Pass", - "Case": "Nom|Gen|Dat|Acc|Voc", - }, - "VbMnIdPr02PlXxIpAvXx": { - POS: VERB, - "VerbForm": "Fin", - "Mood": "Ind", - "Tense": "Pres", - "Person": "2", - "Number": "Plur", - "Gender": "Masc|Fem|Neut", - "Aspect": "Imp", - "Voice": "Act", - "Case": "Nom|Gen|Dat|Acc|Voc", - }, - "VbMnIdPr02PlXxIpPvXx": { - POS: VERB, - "VerbForm": "Fin", - "Mood": "Ind", - "Tense": "Pres", - "Person": "2", - "Number": "Plur", - "Gender": "Masc|Fem|Neut", - "Aspect": "Imp", - "Voice": "Pass", - "Case": "Nom|Gen|Dat|Acc|Voc", - }, - "VbMnIdPr02SgXxIpAvXx": { - POS: VERB, - "VerbForm": "Fin", - "Mood": "Ind", - "Tense": "Pres", - "Person": "2", - "Number": "Sing", - "Gender": "Masc|Fem|Neut", - "Aspect": "Imp", - "Voice": "Act", - "Case": "Nom|Gen|Dat|Acc|Voc", - }, - "VbMnIdPr02SgXxIpPvXx": { - POS: VERB, - "VerbForm": "Fin", - "Mood": "Ind", - "Tense": "Pres", - "Person": "2", - "Number": "Sing", - "Gender": "Masc|Fem|Neut", - "Aspect": "Imp", - "Voice": "Pass", - "Case": "Nom|Gen|Dat|Acc|Voc", - }, - "VbMnIdPr03PlXxIpAvXx": { - POS: VERB, - "VerbForm": "Fin", - "Mood": "Ind", - "Tense": "Pres", - "Person": "3", - "Number": "Plur", - "Gender": "Masc|Fem|Neut", - "Aspect": "Imp", - "Voice": "Act", - "Case": "Nom|Gen|Dat|Acc|Voc", - }, - "VbMnIdPr03PlXxIpPvXx": { - POS: VERB, - "VerbForm": "Fin", - "Mood": "Ind", - "Tense": "Pres", - "Person": "3", - "Number": "Plur", - "Gender": "Masc|Fem|Neut", - "Aspect": "Imp", - "Voice": "Pass", - "Case": "Nom|Gen|Dat|Acc|Voc", - }, - "VbMnIdPr03SgXxIpAvXx": { - POS: VERB, - "VerbForm": "Fin", - "Mood": "Ind", - "Tense": "Pres", - "Person": "3", - "Number": "Sing", - "Gender": "Masc|Fem|Neut", - "Aspect": "Imp", - "Voice": "Act", - "Case": "Nom|Gen|Dat|Acc|Voc", - }, - "VbMnIdPr03SgXxIpPvXx": { - POS: VERB, 
- "VerbForm": "Fin", - "Mood": "Ind", - "Tense": "Pres", - "Person": "3", - "Number": "Sing", - "Gender": "Masc|Fem|Neut", - "Aspect": "Imp", - "Voice": "Pass", - "Case": "Nom|Gen|Dat|Acc|Voc", - }, - "VbMnIdXx01PlXxPeAvXx": { - POS: VERB, - "VerbForm": "Fin", - "Mood": "Ind", - "Tense": "Pres|Past", - "Person": "1", - "Number": "Plur", - "Gender": "Masc|Fem|Neut", - "Aspect": "Perf", - "Voice": "Act", - "Case": "Nom|Gen|Dat|Acc|Voc", - }, - "VbMnIdXx01PlXxPePvXx": { - POS: VERB, - "VerbForm": "Fin", - "Mood": "Ind", - "Tense": "Pres|Past", - "Person": "1", - "Number": "Plur", - "Gender": "Masc|Fem|Neut", - "Aspect": "Perf", - "Voice": "Pass", - "Case": "Nom|Gen|Dat|Acc|Voc", - }, - "VbMnIdXx01SgXxPeAvXx": { - POS: VERB, - "VerbForm": "Fin", - "Mood": "Ind", - "Tense": "Pres|Past", - "Person": "1", - "Number": "Sing", - "Gender": "Masc|Fem|Neut", - "Aspect": "Perf", - "Voice": "Act", - "Case": "Nom|Gen|Dat|Acc|Voc", - }, - "VbMnIdXx01SgXxPePvXx": { - POS: VERB, - "VerbForm": "Fin", - "Mood": "Ind", - "Tense": "Pres|Past", - "Person": "1", - "Number": "Sing", - "Gender": "Masc|Fem|Neut", - "Aspect": "Perf", - "Voice": "Pass", - "Case": "Nom|Gen|Dat|Acc|Voc", - }, - "VbMnIdXx02PlXxPeAvXx": { - POS: VERB, - "VerbForm": "Fin", - "Mood": "Ind", - "Tense": "Pres|Past", - "Person": "2", - "Number": "Plur", - "Gender": "Masc|Fem|Neut", - "Aspect": "Perf", - "Voice": "Act", - "Case": "Nom|Gen|Dat|Acc|Voc", - }, - "VbMnIdXx02PlXxPePvXx": { - POS: VERB, - "VerbForm": "Fin", - "Mood": "Ind", - "Tense": "Pres|Past", - "Person": "2", - "Number": "Plur", - "Gender": "Masc|Fem|Neut", - "Aspect": "Perf", - "Voice": "Pass", - "Case": "Nom|Gen|Dat|Acc|Voc", - }, - "VbMnIdXx02SgXxPeAvXx": { - POS: VERB, - "VerbForm": "Fin", - "Mood": "Ind", - "Tense": "Pres|Past", - "Person": "2", - "Number": "Sing", - "Gender": "Masc|Fem|Neut", - "Aspect": "Perf", - "Voice": "Act", - "Case": "Nom|Gen|Dat|Acc|Voc", - }, - "VbMnIdXx02SgXxPePvXx": { - POS: VERB, - "VerbForm": "Fin", - "Mood": "Ind", - "Tense": "Pres|Past", - "Person": "2", - "Number": "Sing", - "Gender": "Masc|Fem|Neut", - "Aspect": "Perf", - "Voice": "Pass", - "Case": "Nom|Gen|Dat|Acc|Voc", - }, - "VbMnIdXx03PlXxPeAvXx": { - POS: VERB, - "VerbForm": "Fin", - "Mood": "Ind", - "Tense": "Pres|Past", - "Person": "3", - "Number": "Plur", - "Gender": "Masc|Fem|Neut", - "Aspect": "Perf", - "Voice": "Act", - "Case": "Nom|Gen|Dat|Acc|Voc", - }, - "VbMnIdXx03PlXxPePvXx": { - POS: VERB, - "VerbForm": "Fin", - "Mood": "Ind", - "Tense": "Pres|Past", - "Person": "3", - "Number": "Plur", - "Gender": "Masc|Fem|Neut", - "Aspect": "Perf", - "Voice": "Pass", - "Case": "Nom|Gen|Dat|Acc|Voc", - }, - "VbMnIdXx03SgXxPeAvXx": { - POS: VERB, - "VerbForm": "Fin", - "Mood": "Ind", - "Tense": "Pres|Past", - "Person": "3", - "Number": "Sing", - "Gender": "Masc|Fem|Neut", - "Aspect": "Perf", - "Voice": "Act", - "Case": "Nom|Gen|Dat|Acc|Voc", - }, - "VbMnIdXx03SgXxPePvXx": { - POS: VERB, - "VerbForm": "Fin", - "Mood": "Ind", - "Tense": "Pres|Past", - "Person": "3", - "Number": "Sing", - "Gender": "Masc|Fem|Neut", - "Aspect": "Perf", - "Voice": "Pass", - "Case": "Nom|Gen|Dat|Acc|Voc", - }, - "VbMnMpXx02PlXxIpAvXx": { - POS: VERB, - "VerbForm": "", - "Mood": "Imp", - "Tense": "Pres|Past", - "Person": "2", - "Number": "Plur", - "Gender": "Masc|Fem|Neut", - "Aspect": "Imp", - "Voice": "Act", - "Case": "Nom|Gen|Dat|Acc|Voc", - }, - "VbMnMpXx02PlXxIpPvXx": { - POS: VERB, - "VerbForm": "", - "Mood": "Imp", - "Tense": "Pres|Past", - "Person": "2", - "Number": "Plur", - "Gender": "Masc|Fem|Neut", - 
"Aspect": "Imp", - "Voice": "Pass", - "Case": "Nom|Gen|Dat|Acc|Voc", - }, - "VbMnMpXx02PlXxPeAvXx": { - POS: VERB, - "VerbForm": "", - "Mood": "Imp", - "Tense": "Pres|Past", - "Person": "2", - "Number": "Plur", - "Gender": "Masc|Fem|Neut", - "Aspect": "Perf", - "Voice": "Act", - "Case": "Nom|Gen|Dat|Acc|Voc", - }, - "VbMnMpXx02PlXxPePvXx": { - POS: VERB, - "VerbForm": "", - "Mood": "Imp", - "Tense": "Pres|Past", - "Person": "2", - "Number": "Plur", - "Gender": "Masc|Fem|Neut", - "Aspect": "Perf", - "Voice": "Pass", - "Case": "Nom|Gen|Dat|Acc|Voc", - }, - "VbMnMpXx02SgXxIpAvXx": { - POS: VERB, - "VerbForm": "", - "Mood": "Imp", - "Tense": "Pres|Past", - "Person": "2", - "Number": "Sing", - "Gender": "Masc|Fem|Neut", - "Aspect": "Imp", - "Voice": "Act", - "Case": "Nom|Gen|Dat|Acc|Voc", - }, - "VbMnMpXx02SgXxIpPvXx": { - POS: VERB, - "VerbForm": "", - "Mood": "Imp", - "Tense": "Pres|Past", - "Person": "2", - "Number": "Sing", - "Gender": "Masc|Fem|Neut", - "Aspect": "Imp", - "Voice": "Pass", - "Case": "Nom|Gen|Dat|Acc|Voc", - }, - "VbMnMpXx02SgXxPeAvXx": { - POS: VERB, - "VerbForm": "", - "Mood": "Imp", - "Tense": "Pres|Past", - "Person": "2", - "Number": "Sing", - "Gender": "Masc|Fem|Neut", - "Aspect": "Perf", - "Voice": "Act", - "Case": "Nom|Gen|Dat|Acc|Voc", - }, - "VbMnMpXx02SgXxPePvXx": { - POS: VERB, - "VerbForm": "", - "Mood": "Imp", - "Tense": "Pres|Past", - "Person": "2", - "Number": "Sing", - "Gender": "Masc|Fem|Neut", - "Aspect": "Perf", - "Voice": "Pass", - "Case": "Nom|Gen|Dat|Acc|Voc", - }, - "VbMnMpXx03SgXxIpPvXx": { - POS: VERB, - "VerbForm": "", - "Mood": "Imp", - "Tense": "Pres|Past", - "Person": "3", - "Number": "Sing", - "Gender": "Masc|Fem|Neut", - "Aspect": "Imp", - "Voice": "Pass", - "Case": "Nom|Gen|Dat|Acc|Voc", - }, - "VbMnNfXxXxXxXxPeAvXx": { - POS: VERB, - "VerbForm": "Inf", - "Mood": "", - "Tense": "Pres|Past", - "Person": "1|2|3", - "Number": "Sing|Plur", - "Gender": "Masc|Fem|Neut", - "Aspect": "Perf", - "Voice": "Act", - "Case": "Nom|Gen|Dat|Acc|Voc", - }, - "VbMnNfXxXxXxXxPePvXx": { - POS: VERB, - "VerbForm": "Inf", - "Mood": "", - "Tense": "Pres|Past", - "Person": "1|2|3", - "Number": "Sing|Plur", - "Gender": "Masc|Fem|Neut", - "Aspect": "Perf", - "Voice": "Pass", - "Case": "Nom|Gen|Dat|Acc|Voc", - }, - "VbMnPpPrXxXxXxIpAvXx": { - POS: VERB, - "VerbForm": "Conv", - "Mood": "", - "Tense": "Pres", - "Person": "1|2|3", - "Number": "Sing|Plur", - "Gender": "Masc|Fem|Neut", - "Aspect": "Imp", - "Voice": "Act", - "Case": "Nom|Gen|Dat|Acc|Voc", - }, - "VbMnPpXxXxPlFePePvAc": { - POS: VERB, - "VerbForm": "Part", - "Mood": "", - "Tense": "Pres|Past", - "Person": "1|2|3", - "Number": "Plur", - "Gender": "Fem", - "Aspect": "Perf", - "Voice": "Pass", - "Case": "Acc", - }, - "VbMnPpXxXxPlFePePvGe": { - POS: VERB, - "VerbForm": "Part", - "Mood": "", - "Tense": "Pres|Past", - "Person": "1|2|3", - "Number": "Plur", - "Gender": "Fem", - "Aspect": "Perf", - "Voice": "Pass", - "Case": "Gen", - }, - "VbMnPpXxXxPlFePePvNm": { - POS: VERB, - "VerbForm": "Part", - "Mood": "", - "Tense": "Pres|Past", - "Person": "1|2|3", - "Number": "Plur", - "Gender": "Fem", - "Aspect": "Perf", - "Voice": "Pass", - "Case": "Nom", - }, - "VbMnPpXxXxPlFePePvVo": { - POS: VERB, - "VerbForm": "Part", - "Mood": "", - "Tense": "Pres|Past", - "Person": "1|2|3", - "Number": "Plur", - "Gender": "Fem", - "Aspect": "Perf", - "Voice": "Pass", - "Case": "Voc", - }, - "VbMnPpXxXxPlMaPePvAc": { - POS: VERB, - "VerbForm": "Part", - "Mood": "", - "Tense": "Pres|Past", - "Person": "1|2|3", - "Number": "Plur", - 
"Gender": "Masc", - "Aspect": "Perf", - "Voice": "Pass", - "Case": "Acc", - }, - "VbMnPpXxXxPlMaPePvGe": { - POS: VERB, - "VerbForm": "Part", - "Mood": "", - "Tense": "Pres|Past", - "Person": "1|2|3", - "Number": "Plur", - "Gender": "Masc", - "Aspect": "Perf", - "Voice": "Pass", - "Case": "Gen", - }, - "VbMnPpXxXxPlMaPePvNm": { - POS: VERB, - "VerbForm": "Part", - "Mood": "", - "Tense": "Pres|Past", - "Person": "1|2|3", - "Number": "Plur", - "Gender": "Masc", - "Aspect": "Perf", - "Voice": "Pass", - "Case": "Nom", - }, - "VbMnPpXxXxPlMaPePvVo": { - POS: VERB, - "VerbForm": "Part", - "Mood": "", - "Tense": "Pres|Past", - "Person": "1|2|3", - "Number": "Plur", - "Gender": "Masc", - "Aspect": "Perf", - "Voice": "Pass", - "Case": "Voc", - }, - "VbMnPpXxXxPlNePePvAc": { - POS: VERB, - "VerbForm": "Part", - "Mood": "", - "Tense": "Pres|Past", - "Person": "1|2|3", - "Number": "Plur", - "Gender": "Neut", - "Aspect": "Perf", - "Voice": "Pass", - "Case": "Acc", - }, - "VbMnPpXxXxPlNePePvGe": { - POS: VERB, - "VerbForm": "Part", - "Mood": "", - "Tense": "Pres|Past", - "Person": "1|2|3", - "Number": "Plur", - "Gender": "Neut", - "Aspect": "Perf", - "Voice": "Pass", - "Case": "Gen", - }, - "VbMnPpXxXxPlNePePvNm": { - POS: VERB, - "VerbForm": "Part", - "Mood": "", - "Tense": "Pres|Past", - "Person": "1|2|3", - "Number": "Plur", - "Gender": "Neut", - "Aspect": "Perf", - "Voice": "Pass", - "Case": "Nom", - }, - "VbMnPpXxXxPlNePePvVo": { - POS: VERB, - "VerbForm": "Part", - "Mood": "", - "Tense": "Pres|Past", - "Person": "1|2|3", - "Number": "Plur", - "Gender": "Neut", - "Aspect": "Perf", - "Voice": "Pass", - "Case": "Voc", - }, - "VbMnPpXxXxSgFePePvAc": { - POS: VERB, - "VerbForm": "Part", - "Mood": "", - "Tense": "Pres|Past", - "Person": "1|2|3", - "Number": "Sing", - "Gender": "Fem", - "Aspect": "Perf", - "Voice": "Pass", - "Case": "Acc", - }, - "VbMnPpXxXxSgFePePvGe": { - POS: VERB, - "VerbForm": "Part", - "Mood": "", - "Tense": "Pres|Past", - "Person": "1|2|3", - "Number": "Sing", - "Gender": "Fem", - "Aspect": "Perf", - "Voice": "Pass", - "Case": "Gen", - }, - "VbMnPpXxXxSgFePePvNm": { - POS: VERB, - "VerbForm": "Part", - "Mood": "", - "Tense": "Pres|Past", - "Person": "1|2|3", - "Number": "Sing", - "Gender": "Fem", - "Aspect": "Perf", - "Voice": "Pass", - "Case": "Nom", - }, - "VbMnPpXxXxSgFePePvVo": { - POS: VERB, - "VerbForm": "Part", - "Mood": "", - "Tense": "Pres|Past", - "Person": "1|2|3", - "Number": "Sing", - "Gender": "Fem", - "Aspect": "Perf", - "Voice": "Pass", - "Case": "Voc", - }, - "VbMnPpXxXxSgMaPePvAc": { - POS: VERB, - "VerbForm": "Part", - "Mood": "", - "Tense": "Pres|Past", - "Person": "1|2|3", - "Number": "Sing", - "Gender": "Masc", - "Aspect": "Perf", - "Voice": "Pass", - "Case": "Acc", - }, - "VbMnPpXxXxSgMaPePvGe": { - POS: VERB, - "VerbForm": "Part", - "Mood": "", - "Tense": "Pres|Past", - "Person": "1|2|3", - "Number": "Sing", - "Gender": "Masc", - "Aspect": "Perf", - "Voice": "Pass", - "Case": "Gen", - }, - "VbMnPpXxXxSgMaPePvNm": { - POS: VERB, - "VerbForm": "Part", - "Mood": "", - "Tense": "Pres|Past", - "Person": "1|2|3", - "Number": "Sing", - "Gender": "Masc", - "Aspect": "Perf", - "Voice": "Pass", - "Case": "Nom", - }, - "VbMnPpXxXxSgMaPePvVo": { - POS: VERB, - "VerbForm": "Part", - "Mood": "", - "Tense": "Pres|Past", - "Person": "1|2|3", - "Number": "Sing", - "Gender": "Masc", - "Aspect": "Perf", - "Voice": "Pass", - "Case": "Voc", - }, - "VbMnPpXxXxSgNePePvAc": { - POS: VERB, - "VerbForm": "Part", - "Mood": "", - "Tense": "Pres|Past", - "Person": "1|2|3", - "Number": 
"Sing", - "Gender": "Neut", - "Aspect": "Perf", - "Voice": "Pass", - "Case": "Acc", - }, - "VbMnPpXxXxSgNePePvGe": { - POS: VERB, - "VerbForm": "Part", - "Mood": "", - "Tense": "Pres|Past", - "Person": "1|2|3", - "Number": "Sing", - "Gender": "Neut", - "Aspect": "Perf", - "Voice": "Pass", - "Case": "Gen", - }, - "VbMnPpXxXxSgNePePvNm": { - POS: VERB, - "VerbForm": "Part", - "Mood": "", - "Tense": "Pres|Past", - "Person": "1|2|3", - "Number": "Sing", - "Gender": "Neut", - "Aspect": "Perf", - "Voice": "Pass", - "Case": "Nom", - }, - "VbMnPpXxXxSgNePePvVo": { - POS: VERB, - "VerbForm": "Part", - "Mood": "", - "Tense": "Pres|Past", - "Person": "1|2|3", - "Number": "Sing", - "Gender": "Neut", - "Aspect": "Perf", - "Voice": "Pass", - "Case": "Voc", - }, - "VbMnPpXxXxXxXxIpAvXx": { - POS: VERB, - "VerbForm": "Conv", - "Mood": "", - "Tense": "Pres|Past", - "Person": "1|2|3", - "Number": "Sing|Plur", - "Gender": "Masc|Fem|Neut", - "Aspect": "Imp", - "Voice": "Act", - "Case": "Nom|Gen|Dat|Acc|Voc", - }, - "ADJ": {POS: ADJ}, - "ADP": {POS: ADP}, - "ADV": {POS: ADV}, - "AtDf": {POS: DET}, - "AUX": {POS: AUX}, - "CCONJ": {POS: CCONJ}, - "DET": {POS: DET}, - "NOUN": {POS: NOUN}, - "NUM": {POS: NUM}, - "PART": {POS: PART}, - "PRON": {POS: PRON}, - "PROPN": {POS: PROPN}, - "SCONJ": {POS: SCONJ}, - "SYM": {POS: SYM}, - "VERB": {POS: VERB}, - "X": {POS: X}, -} diff --git a/spacy/lang/el/tokenizer_exceptions.py b/spacy/lang/el/tokenizer_exceptions.py index 112fd991b..f9810828b 100644 --- a/spacy/lang/el/tokenizer_exceptions.py +++ b/spacy/lang/el/tokenizer_exceptions.py @@ -1,5 +1,6 @@ +from ..tokenizer_exceptions import BASE_EXCEPTIONS from ...symbols import ORTH, LEMMA, NORM - +from ...util import update_exc _exc = {} @@ -392,4 +393,4 @@ for orth in [ ]: _exc[orth] = [{ORTH: orth}] -TOKENIZER_EXCEPTIONS = _exc +TOKENIZER_EXCEPTIONS = update_exc(BASE_EXCEPTIONS, _exc) diff --git a/spacy/lang/en/__init__.py b/spacy/lang/en/__init__.py index 3e21cf21b..ebe2d1d53 100644 --- a/spacy/lang/en/__init__.py +++ b/spacy/lang/en/__init__.py @@ -4,13 +4,12 @@ from thinc.api import Config from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS from .stop_words import STOP_WORDS from .lex_attrs import LEX_ATTRS -from .syntax_iterators import SYNTAX_ITERATORS +from .syntax_iterators import noun_chunks from .lemmatizer import is_base_form from .punctuation import TOKENIZER_INFIXES -from ..tokenizer_exceptions import BASE_EXCEPTIONS from ...language import Language from ...lemmatizer import Lemmatizer -from ...util import update_exc, registry +from ...util import registry DEFAULT_CONFIG = """ @@ -18,6 +17,7 @@ DEFAULT_CONFIG = """ lang = "en" stop_words = {"@language_data": "spacy.en.stop_words"} lex_attr_getters = {"@language_data": "spacy.en.lex_attr_getters"} +get_noun_chunks = {"@language_data": "spacy.en.get_noun_chunks"} [nlp.lemmatizer] @lemmatizers = "spacy.EnglishLemmatizer.v1" @@ -49,9 +49,13 @@ def create_lemmatizer(data: Dict[str, dict] = {}) -> "Lemmatizer": return Lemmatizer(data=data, is_base_form=is_base_form) +@registry.language_data("spacy.en.get_noun_chunks") +def get_noun_chunks() -> Callable: + return noun_chunks + + class EnglishDefaults(Language.Defaults): - tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS) - syntax_iterators = SYNTAX_ITERATORS + tokenizer_exceptions = TOKENIZER_EXCEPTIONS infixes = TOKENIZER_INFIXES diff --git a/spacy/lang/en/syntax_iterators.py b/spacy/lang/en/syntax_iterators.py index b63db3539..59ae733bd 100644 --- a/spacy/lang/en/syntax_iterators.py +++ 
b/spacy/lang/en/syntax_iterators.py @@ -1,27 +1,18 @@ +from typing import Union, Iterator + from ...symbols import NOUN, PROPN, PRON from ...errors import Errors +from ...tokens import Doc, Span -def noun_chunks(doclike): - """ - Detect base noun phrases from a dependency parse. Works on both Doc and Span. - """ - labels = [ - "nsubj", - "dobj", - "nsubjpass", - "pcomp", - "pobj", - "dative", - "appos", - "attr", - "ROOT", - ] +def noun_chunks(doclike: Union[Doc, Span]) -> Iterator[Span]: + """Detect base noun phrases from a dependency parse. Works on Doc and Span.""" + # fmt: off + labels = ["nsubj", "dobj", "nsubjpass", "pcomp", "pobj", "dative", "appos", "attr", "ROOT"] + # fmt: on doc = doclike.doc # Ensure works on both Doc and Span. - if not doc.is_parsed: raise ValueError(Errors.E029) - np_deps = [doc.vocab.strings.add(label) for label in labels] conj = doc.vocab.strings.add("conj") np_label = doc.vocab.strings.add("NP") diff --git a/spacy/lang/en/tokenizer_exceptions.py b/spacy/lang/en/tokenizer_exceptions.py index dc8a5c04d..226678430 100644 --- a/spacy/lang/en/tokenizer_exceptions.py +++ b/spacy/lang/en/tokenizer_exceptions.py @@ -1,4 +1,6 @@ +from ..tokenizer_exceptions import BASE_EXCEPTIONS from ...symbols import ORTH, LEMMA, TAG, NORM, PRON_LEMMA +from ...util import update_exc _exc = {} @@ -555,4 +557,4 @@ for string in _exclude: _exc.pop(string) -TOKENIZER_EXCEPTIONS = _exc +TOKENIZER_EXCEPTIONS = update_exc(BASE_EXCEPTIONS, _exc) diff --git a/spacy/lang/es/__init__.py b/spacy/lang/es/__init__.py index 52aef4521..bc378f3db 100644 --- a/spacy/lang/es/__init__.py +++ b/spacy/lang/es/__init__.py @@ -4,11 +4,10 @@ from thinc.config import Config from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS from .stop_words import STOP_WORDS from .lex_attrs import LEX_ATTRS -from .syntax_iterators import SYNTAX_ITERATORS +from .syntax_iterators import noun_chunks from .punctuation import TOKENIZER_INFIXES, TOKENIZER_SUFFIXES -from ..tokenizer_exceptions import BASE_EXCEPTIONS from ...language import Language -from ...util import update_exc, registry +from ...util import registry DEFAULT_CONFIG = """ @@ -16,6 +15,7 @@ DEFAULT_CONFIG = """ lang = "es" stop_words = {"@language_data": "spacy.es.stop_words"} lex_attr_getters = {"@language_data": "spacy.es.lex_attr_getters"} +get_noun_chunks = {"@language_data": "spacy.es.get_noun_chunks"} [nlp.lemmatizer] @lemmatizers = "spacy.Lemmatizer.v1" @@ -32,6 +32,11 @@ tables = ["lexeme_cluster", "lexeme_prob", "lexeme_settings"] """ +@registry.language_data("spacy.es.get_noun_chunks") +def get_noun_chunks() -> Callable: + return noun_chunks + + @registry.language_data("spacy.es.stop_words") def stop_words() -> Set[str]: return STOP_WORDS @@ -43,10 +48,9 @@ def lex_attr_getters() -> Dict[int, Callable[[str], Any]]: class SpanishDefaults(Language.Defaults): - tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS) + tokenizer_exceptions = TOKENIZER_EXCEPTIONS infixes = TOKENIZER_INFIXES suffixes = TOKENIZER_SUFFIXES - syntax_iterators = SYNTAX_ITERATORS class Spanish(Language): diff --git a/spacy/lang/es/syntax_iterators.py b/spacy/lang/es/syntax_iterators.py index 3c65bd441..c33412693 100644 --- a/spacy/lang/es/syntax_iterators.py +++ b/spacy/lang/es/syntax_iterators.py @@ -1,13 +1,15 @@ +from typing import Union, Iterator, Optional, List, Tuple + from ...symbols import NOUN, PROPN, PRON, VERB, AUX from ...errors import Errors +from ...tokens import Doc, Span, Token -def noun_chunks(doclike): +def noun_chunks(doclike: Union[Doc, 
Span]) -> Iterator[Span]: + """Detect base noun phrases from a dependency parse. Works on Doc and Span.""" doc = doclike.doc - if not doc.is_parsed: raise ValueError(Errors.E029) - if not len(doc): return np_label = doc.vocab.strings.add("NP") @@ -28,18 +30,24 @@ def noun_chunks(doclike): token = next_token(token) -def is_verb_token(token): +def is_verb_token(token: Token) -> bool: return token.pos in [VERB, AUX] -def next_token(token): +def next_token(token: Token) -> Optional[Token]: try: return token.nbor() except IndexError: return None -def noun_bounds(doc, root, np_left_deps, np_right_deps, stop_deps): +def noun_bounds( + doc: Doc, + root: Token, + np_left_deps: List[str], + np_right_deps: List[str], + stop_deps: List[str], +) -> Tuple[Token, Token]: left_bound = root for token in reversed(list(root.lefts)): if token.dep in np_left_deps: @@ -50,12 +58,8 @@ def noun_bounds(doc, root, np_left_deps, np_right_deps, stop_deps): left, right = noun_bounds( doc, token, np_left_deps, np_right_deps, stop_deps ) - if list( - filter( - lambda t: is_verb_token(t) or t.dep in stop_deps, - doc[left_bound.i : right.i], - ) - ): + filter_func = lambda t: is_verb_token(t) or t.dep in stop_deps + if list(filter(filter_func, doc[left_bound.i : right.i],)): break else: right_bound = right diff --git a/spacy/lang/es/tokenizer_exceptions.py b/spacy/lang/es/tokenizer_exceptions.py index 50f2988b1..63124578e 100644 --- a/spacy/lang/es/tokenizer_exceptions.py +++ b/spacy/lang/es/tokenizer_exceptions.py @@ -1,4 +1,6 @@ +from ..tokenizer_exceptions import BASE_EXCEPTIONS from ...symbols import ORTH, LEMMA, NORM, PRON_LEMMA +from ...util import update_exc _exc = { @@ -73,4 +75,4 @@ for orth in [ _exc[orth] = [{ORTH: orth}] -TOKENIZER_EXCEPTIONS = _exc +TOKENIZER_EXCEPTIONS = update_exc(BASE_EXCEPTIONS, _exc) diff --git a/spacy/lang/fa/__init__.py b/spacy/lang/fa/__init__.py index 41e40ca30..a1ab0712f 100644 --- a/spacy/lang/fa/__init__.py +++ b/spacy/lang/fa/__init__.py @@ -2,12 +2,12 @@ from typing import Set, Dict, Callable, Any from thinc.api import Config from ...language import Language -from ...util import update_exc, registry +from ...util import registry from .stop_words import STOP_WORDS from .lex_attrs import LEX_ATTRS from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS from .punctuation import TOKENIZER_SUFFIXES -from .syntax_iterators import SYNTAX_ITERATORS +from .syntax_iterators import noun_chunks DEFAULT_CONFIG = """ @@ -15,6 +15,7 @@ DEFAULT_CONFIG = """ lang = "fa" stop_words = {"@language_data": "spacy.fa.stop_words"} lex_attr_getters = {"@language_data": "spacy.fa.lex_attr_getters"} +get_noun_chunks = {"@language_data": "spacy.fa.get_noun_chunks"} [nlp.writing_system] direction = "rtl" @@ -41,10 +42,14 @@ def lex_attr_getters() -> Dict[int, Callable[[str], Any]]: return LEX_ATTRS +@registry.language_data("spacy.fa.get_noun_chunks") +def get_noun_chunks() -> Callable: + return noun_chunks + + class PersianDefaults(Language.Defaults): - tokenizer_exceptions = update_exc(TOKENIZER_EXCEPTIONS) + tokenizer_exceptions = TOKENIZER_EXCEPTIONS suffixes = TOKENIZER_SUFFIXES - syntax_iterators = SYNTAX_ITERATORS class Persian(Language): diff --git a/spacy/lang/fi/__init__.py b/spacy/lang/fi/__init__.py index 69a6412f0..33313aeb6 100644 --- a/spacy/lang/fi/__init__.py +++ b/spacy/lang/fi/__init__.py @@ -5,9 +5,8 @@ from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS from .stop_words import STOP_WORDS from .lex_attrs import LEX_ATTRS from .punctuation import TOKENIZER_INFIXES, TOKENIZER_SUFFIXES
-from ..tokenizer_exceptions import BASE_EXCEPTIONS from ...language import Language -from ...util import update_exc, registry +from ...util import registry DEFAULT_CONFIG = """ @@ -31,7 +30,7 @@ def lex_attr_getters() -> Dict[int, Callable[[str], Any]]: class FinnishDefaults(Language.Defaults): infixes = TOKENIZER_INFIXES suffixes = TOKENIZER_SUFFIXES - tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS) + tokenizer_exceptions = TOKENIZER_EXCEPTIONS class Finnish(Language): diff --git a/spacy/lang/fi/tokenizer_exceptions.py b/spacy/lang/fi/tokenizer_exceptions.py index 51ca45d63..faaf609f9 100644 --- a/spacy/lang/fi/tokenizer_exceptions.py +++ b/spacy/lang/fi/tokenizer_exceptions.py @@ -1,4 +1,6 @@ +from ..tokenizer_exceptions import BASE_EXCEPTIONS from ...symbols import ORTH, LEMMA +from ...util import update_exc _exc = {} @@ -78,4 +80,4 @@ for exc_data in [ _exc[exc_data[ORTH]] = [exc_data] -TOKENIZER_EXCEPTIONS = _exc +TOKENIZER_EXCEPTIONS = update_exc(BASE_EXCEPTIONS, _exc) diff --git a/spacy/lang/fr/__init__.py b/spacy/lang/fr/__init__.py index 4ec30cbd9..41014aa34 100644 --- a/spacy/lang/fr/__init__.py +++ b/spacy/lang/fr/__init__.py @@ -1,4 +1,4 @@ -from typing import Set, Dict, Callable, Any +from typing import Set, Dict, Callable, Any, Pattern from thinc.api import Config from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS, TOKEN_MATCH @@ -7,10 +7,9 @@ from .punctuation import TOKENIZER_SUFFIXES from .stop_words import STOP_WORDS from .lex_attrs import LEX_ATTRS from .lemmatizer import FrenchLemmatizer, is_base_form -from .syntax_iterators import SYNTAX_ITERATORS -from ..tokenizer_exceptions import BASE_EXCEPTIONS +from .syntax_iterators import noun_chunks from ...language import Language -from ...util import update_exc, registry +from ...util import registry DEFAULT_CONFIG = """ @@ -18,6 +17,11 @@ DEFAULT_CONFIG = """ lang = "fr" stop_words = {"@language_data": "spacy.fr.stop_words"} lex_attr_getters = {"@language_data": "spacy.fr.lex_attr_getters"} +get_noun_chunks = {"@language_data": "spacy.fr.get_noun_chunks"} + +[nlp.tokenizer] +@tokenizers = "spacy.Tokenizer.v1" +token_match = {"@language_data": "spacy.fr.token_match"} [nlp.lemmatizer] @lemmatizers = "spacy.FrenchLemmatizer.v1" @@ -34,6 +38,11 @@ def create_french_lemmatizer(data: Dict[str, dict] = {}) -> FrenchLemmatizer: return FrenchLemmatizer(data=data, is_base_form=is_base_form) +@registry.language_data("spacy.fr.token_match") +def token_match() -> Pattern: + return TOKEN_MATCH + + @registry.language_data("spacy.fr.stop_words") def stop_words() -> Set[str]: return STOP_WORDS @@ -44,13 +53,16 @@ def lex_attr_getters() -> Dict[int, Callable[[str], Any]]: return LEX_ATTRS +@registry.language_data("spacy.fr.get_noun_chunks") +def get_noun_chunks() -> Callable: + return noun_chunks + + class FrenchDefaults(Language.Defaults): - tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS) + tokenizer_exceptions = TOKENIZER_EXCEPTIONS prefixes = TOKENIZER_PREFIXES infixes = TOKENIZER_INFIXES suffixes = TOKENIZER_SUFFIXES - token_match = TOKEN_MATCH - syntax_iterators = SYNTAX_ITERATORS class French(Language): diff --git a/spacy/lang/fr/syntax_iterators.py b/spacy/lang/fr/syntax_iterators.py index ca711593f..d297203e3 100644 --- a/spacy/lang/fr/syntax_iterators.py +++ b/spacy/lang/fr/syntax_iterators.py @@ -1,26 +1,18 @@ +from typing import Union, Iterator + from ...symbols import NOUN, PROPN, PRON from ...errors import Errors +from ...tokens import Doc, Span -def noun_chunks(doclike): - 
""" - Detect base noun phrases from a dependency parse. Works on both Doc and Span. - """ - labels = [ - "nsubj", - "nsubj:pass", - "obj", - "iobj", - "ROOT", - "appos", - "nmod", - "nmod:poss", - ] +def noun_chunks(doclike: Union[Doc, Span]) -> Iterator[Span]: + """Detect base noun phrases from a dependency parse. Works on Doc and Span.""" + # fmt: off + labels = ["nsubj", "nsubj:pass", "obj", "iobj", "ROOT", "appos", "nmod", "nmod:poss"] + # fmt: on doc = doclike.doc # Ensure works on both Doc and Span. - if not doc.is_parsed: raise ValueError(Errors.E029) - np_deps = [doc.vocab.strings[label] for label in labels] conj = doc.vocab.strings.add("conj") np_label = doc.vocab.strings.add("NP") diff --git a/spacy/lang/fr/tokenizer_exceptions.py b/spacy/lang/fr/tokenizer_exceptions.py index 6806ea4fe..a1ad7bcbb 100644 --- a/spacy/lang/fr/tokenizer_exceptions.py +++ b/spacy/lang/fr/tokenizer_exceptions.py @@ -1,8 +1,11 @@ import re +from ..tokenizer_exceptions import BASE_EXCEPTIONS from .punctuation import ELISION, HYPHENS from ..char_classes import ALPHA_LOWER, ALPHA from ...symbols import ORTH, LEMMA +from ...util import update_exc + # not using the large _tokenizer_exceptions_list by default as it slows down the tokenizer # from ._tokenizer_exceptions_list import FR_BASE_EXCEPTIONS @@ -452,7 +455,7 @@ _regular_exp += [ ] -TOKENIZER_EXCEPTIONS = _exc +TOKENIZER_EXCEPTIONS = update_exc(BASE_EXCEPTIONS, _exc) TOKEN_MATCH = re.compile( "(?iu)" + "|".join("(?:{})".format(m) for m in _regular_exp) ).match diff --git a/spacy/lang/ga/__init__.py b/spacy/lang/ga/__init__.py index d88051a65..3c13f56fb 100644 --- a/spacy/lang/ga/__init__.py +++ b/spacy/lang/ga/__init__.py @@ -3,9 +3,8 @@ from thinc.api import Config from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS from .stop_words import STOP_WORDS -from ..tokenizer_exceptions import BASE_EXCEPTIONS from ...language import Language -from ...util import update_exc, registry +from ...util import registry DEFAULT_CONFIG = """ @@ -21,7 +20,7 @@ def stop_words() -> Set[str]: class IrishDefaults(Language.Defaults): - tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS) + tokenizer_exceptions = TOKENIZER_EXCEPTIONS class Irish(Language): diff --git a/spacy/lang/ga/tokenizer_exceptions.py b/spacy/lang/ga/tokenizer_exceptions.py index 0c587c67e..fbd6fa0f5 100644 --- a/spacy/lang/ga/tokenizer_exceptions.py +++ b/spacy/lang/ga/tokenizer_exceptions.py @@ -1,5 +1,7 @@ +from ..tokenizer_exceptions import BASE_EXCEPTIONS from ...symbols import POS, DET, ADP, CCONJ, ADV, NOUN, X, AUX from ...symbols import ORTH, LEMMA, NORM +from ...util import update_exc _exc = { @@ -81,4 +83,4 @@ for orth in ["d'", "D'"]: _exc[orth] = [{ORTH: orth}] -TOKENIZER_EXCEPTIONS = _exc +TOKENIZER_EXCEPTIONS = update_exc(BASE_EXCEPTIONS, _exc) diff --git a/spacy/lang/he/__init__.py b/spacy/lang/he/__init__.py index f979879a1..cd07d405e 100644 --- a/spacy/lang/he/__init__.py +++ b/spacy/lang/he/__init__.py @@ -4,7 +4,7 @@ from thinc.api import Config from .stop_words import STOP_WORDS from ..tokenizer_exceptions import BASE_EXCEPTIONS from ...language import Language -from ...util import update_exc, registry +from ...util import registry DEFAULT_CONFIG = """ @@ -25,7 +25,7 @@ def stop_words() -> Set[str]: class HebrewDefaults(Language.Defaults): - tokenizer_exceptions = update_exc(BASE_EXCEPTIONS) + tokenizer_exceptions = BASE_EXCEPTIONS class Hebrew(Language): diff --git a/spacy/lang/hr/__init__.py b/spacy/lang/hr/__init__.py index e841ee24d..54c1a8f1f 100644 --- 
a/spacy/lang/hr/__init__.py +++ b/spacy/lang/hr/__init__.py @@ -4,7 +4,7 @@ from thinc.api import Config from .stop_words import STOP_WORDS from ..tokenizer_exceptions import BASE_EXCEPTIONS from ...language import Language -from ...util import update_exc, registry +from ...util import registry DEFAULT_CONFIG = """ @@ -28,7 +28,7 @@ def stop_words() -> Set[str]: class CroatianDefaults(Language.Defaults): - tokenizer_exceptions = update_exc(BASE_EXCEPTIONS) + tokenizer_exceptions = BASE_EXCEPTIONS class Croatian(Language): diff --git a/spacy/lang/hu/__init__.py b/spacy/lang/hu/__init__.py index 2cfd61dfa..b9f5a5c34 100644 --- a/spacy/lang/hu/__init__.py +++ b/spacy/lang/hu/__init__.py @@ -1,12 +1,11 @@ -from typing import Set +from typing import Set, Pattern from thinc.api import Config from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS, TOKEN_MATCH from .punctuation import TOKENIZER_PREFIXES, TOKENIZER_SUFFIXES, TOKENIZER_INFIXES from .stop_words import STOP_WORDS -from ..tokenizer_exceptions import BASE_EXCEPTIONS from ...language import Language -from ...util import update_exc, registry +from ...util import registry DEFAULT_CONFIG = """ @@ -14,6 +13,10 @@ DEFAULT_CONFIG = """ lang = "hu" stop_words = {"@language_data": "spacy.hu.stop_words"} +[nlp.tokenizer] +@tokenizers = "spacy.Tokenizer.v1" +token_match = {"@language_data": "spacy.hu.token_match"} + [nlp.lemmatizer] @lemmatizers = "spacy.Lemmatizer.v1" @@ -29,12 +32,16 @@ def stop_words() -> Set[str]: return STOP_WORDS +@registry.language_data("spacy.hu.token_match") +def token_match() -> Pattern: + return TOKEN_MATCH + + class HungarianDefaults(Language.Defaults): - tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS) + tokenizer_exceptions = TOKENIZER_EXCEPTIONS prefixes = TOKENIZER_PREFIXES suffixes = TOKENIZER_SUFFIXES infixes = TOKENIZER_INFIXES - token_match = TOKEN_MATCH class Hungarian(Language): diff --git a/spacy/lang/hu/tokenizer_exceptions.py b/spacy/lang/hu/tokenizer_exceptions.py index ffb4d1472..4a64a1d2c 100644 --- a/spacy/lang/hu/tokenizer_exceptions.py +++ b/spacy/lang/hu/tokenizer_exceptions.py @@ -1,7 +1,9 @@ import re +from ..tokenizer_exceptions import BASE_EXCEPTIONS from ..punctuation import ALPHA_LOWER, CURRENCY from ...symbols import ORTH +from ...util import update_exc _exc = {} @@ -644,5 +646,5 @@ _nums = r"(({ne})|({t})|({on})|({c}))({s})?".format( ) -TOKENIZER_EXCEPTIONS = _exc +TOKENIZER_EXCEPTIONS = update_exc(BASE_EXCEPTIONS, _exc) TOKEN_MATCH = re.compile(r"^{n}$".format(n=_nums)).match diff --git a/spacy/lang/id/__init__.py b/spacy/lang/id/__init__.py index 8998addb4..ecefd0a66 100644 --- a/spacy/lang/id/__init__.py +++ b/spacy/lang/id/__init__.py @@ -5,10 +5,9 @@ from .stop_words import STOP_WORDS from .punctuation import TOKENIZER_SUFFIXES, TOKENIZER_PREFIXES, TOKENIZER_INFIXES from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS from .lex_attrs import LEX_ATTRS -from .syntax_iterators import SYNTAX_ITERATORS -from ..tokenizer_exceptions import BASE_EXCEPTIONS +from .syntax_iterators import noun_chunks from ...language import Language -from ...util import update_exc, registry +from ...util import registry DEFAULT_CONFIG = """ @@ -16,6 +15,7 @@ DEFAULT_CONFIG = """ lang = "id" stop_words = {"@language_data": "spacy.id.stop_words"} lex_attr_getters = {"@language_data": "spacy.id.lex_attr_getters"} +get_noun_chunks = {"@language_data": "spacy.id.get_noun_chunks"} [nlp.lemmatizer] @lemmatizers = "spacy.Lemmatizer.v1" @@ -42,12 +42,16 @@ def lex_attr_getters() -> Dict[int, 
Callable[[str], Any]]: return LEX_ATTRS +@registry.language_data("spacy.id.get_noun_chunks") +def get_noun_chunks() -> Callable: + return noun_chunks + + class IndonesianDefaults(Language.Defaults): - tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS) + tokenizer_exceptions = TOKENIZER_EXCEPTIONS prefixes = TOKENIZER_PREFIXES suffixes = TOKENIZER_SUFFIXES infixes = TOKENIZER_INFIXES - syntax_iterators = SYNTAX_ITERATORS class Indonesian(Language): diff --git a/spacy/lang/id/syntax_iterators.py b/spacy/lang/id/syntax_iterators.py index ca711593f..f6d261643 100644 --- a/spacy/lang/id/syntax_iterators.py +++ b/spacy/lang/id/syntax_iterators.py @@ -1,26 +1,20 @@ +from typing import Union, Iterator + from ...symbols import NOUN, PROPN, PRON from ...errors import Errors +from ...tokens import Doc, Span -def noun_chunks(doclike): +def noun_chunks(doclike: Union[Doc, Span]) -> Iterator[Span]: """ Detect base noun phrases from a dependency parse. Works on both Doc and Span. """ - labels = [ - "nsubj", - "nsubj:pass", - "obj", - "iobj", - "ROOT", - "appos", - "nmod", - "nmod:poss", - ] + # fmt: off + labels = ["nsubj", "nsubj:pass", "obj", "iobj", "ROOT", "appos", "nmod", "nmod:poss"] + # fmt: on doc = doclike.doc # Ensure works on both Doc and Span. - if not doc.is_parsed: raise ValueError(Errors.E029) - np_deps = [doc.vocab.strings[label] for label in labels] conj = doc.vocab.strings.add("conj") np_label = doc.vocab.strings.add("NP") diff --git a/spacy/lang/id/tokenizer_exceptions.py b/spacy/lang/id/tokenizer_exceptions.py index 5259bddf8..50ccfa33a 100644 --- a/spacy/lang/id/tokenizer_exceptions.py +++ b/spacy/lang/id/tokenizer_exceptions.py @@ -1,5 +1,8 @@ +from ..tokenizer_exceptions import BASE_EXCEPTIONS from ._tokenizer_exceptions_list import ID_BASE_EXCEPTIONS from ...symbols import ORTH, LEMMA, NORM +from ...util import update_exc + # Daftar singkatan dan Akronim dari: # https://id.wiktionary.org/wiki/Wiktionary:Daftar_singkatan_dan_akronim_bahasa_Indonesia#A @@ -221,4 +224,4 @@ for orth in [ ]: _exc[orth] = [{ORTH: orth}] -TOKENIZER_EXCEPTIONS = _exc +TOKENIZER_EXCEPTIONS = update_exc(BASE_EXCEPTIONS, _exc) diff --git a/spacy/lang/it/__init__.py b/spacy/lang/it/__init__.py index f6b6afa59..107018392 100644 --- a/spacy/lang/it/__init__.py +++ b/spacy/lang/it/__init__.py @@ -4,9 +4,8 @@ from thinc.api import Config from .stop_words import STOP_WORDS from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS from .punctuation import TOKENIZER_PREFIXES, TOKENIZER_INFIXES -from ..tokenizer_exceptions import BASE_EXCEPTIONS from ...language import Language -from ...util import update_exc, registry +from ...util import registry DEFAULT_CONFIG = """ @@ -30,7 +29,7 @@ def stop_words() -> Set[str]: class ItalianDefaults(Language.Defaults): - tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS) + tokenizer_exceptions = TOKENIZER_EXCEPTIONS stop_words = STOP_WORDS prefixes = TOKENIZER_PREFIXES infixes = TOKENIZER_INFIXES diff --git a/spacy/lang/it/tokenizer_exceptions.py b/spacy/lang/it/tokenizer_exceptions.py index 7237443b5..c9c729d63 100644 --- a/spacy/lang/it/tokenizer_exceptions.py +++ b/spacy/lang/it/tokenizer_exceptions.py @@ -1,4 +1,7 @@ +from ..tokenizer_exceptions import BASE_EXCEPTIONS from ...symbols import ORTH, LEMMA +from ...util import update_exc + _exc = { "all'art.": [{ORTH: "all'"}, {ORTH: "art."}], @@ -52,4 +55,4 @@ for orth in [ ]: _exc[orth] = [{ORTH: orth}] -TOKENIZER_EXCEPTIONS = _exc +TOKENIZER_EXCEPTIONS = update_exc(BASE_EXCEPTIONS, 
_exc) diff --git a/spacy/lang/ja/__init__.py b/spacy/lang/ja/__init__.py index 2e56c08d8..642b59a4b 100644 --- a/spacy/lang/ja/__init__.py +++ b/spacy/lang/ja/__init__.py @@ -1,11 +1,11 @@ -from typing import Optional, Union, Dict, Any, Set +from typing import Optional, Union, Dict, Any, Set, Callable from pathlib import Path import srsly from collections import namedtuple from thinc.api import Config from .stop_words import STOP_WORDS -from .syntax_iterators import SYNTAX_ITERATORS +from .syntax_iterators import noun_chunks from .tag_map import TAG_MAP from .tag_orth_map import TAG_ORTH_MAP from .tag_bigram_map import TAG_BIGRAM_MAP @@ -22,6 +22,7 @@ DEFAULT_CONFIG = """ [nlp] lang = "ja" stop_words = {"@language_data": "spacy.ja.stop_words"} +get_noun_chunks = {"@language_data": "spacy.ja.get_noun_chunks"} [nlp.tokenizer] @tokenizers = "spacy.JapaneseTokenizer.v1" @@ -39,6 +40,11 @@ def stop_words() -> Set[str]: return STOP_WORDS +@registry.language_data("spacy.ja.get_noun_chunks") +def get_noun_chunks() -> Callable: + return noun_chunks + + @registry.tokenizers("spacy.JapaneseTokenizer.v1") def create_japanese_tokenizer(split_mode: Optional[str] = None): def japanese_tokenizer_factory(nlp): @@ -50,6 +56,8 @@ def create_japanese_tokenizer(split_mode: Optional[str] = None): class JapaneseTokenizer(DummyTokenizer): def __init__(self, nlp: Language, split_mode: Optional[str] = None) -> None: self.vocab = nlp.vocab + # TODO: is this the right way to do it? + self.vocab.morphology.load_tag_map(TAG_MAP) self.split_mode = split_mode self.tokenizer = try_sudachi_import(self.split_mode) @@ -171,14 +179,8 @@ class JapaneseTokenizer(DummyTokenizer): return self -class JapaneseDefaults(Language.Defaults): - tag_map = TAG_MAP - syntax_iterators = SYNTAX_ITERATORS - - class Japanese(Language): lang = "ja" - Defaults = JapaneseDefaults default_config = Config().from_str(DEFAULT_CONFIG) diff --git a/spacy/lang/ja/syntax_iterators.py b/spacy/lang/ja/syntax_iterators.py index bb0554cf9..cca4902ab 100644 --- a/spacy/lang/ja/syntax_iterators.py +++ b/spacy/lang/ja/syntax_iterators.py @@ -1,33 +1,23 @@ +from typing import Union, Iterator + from ...symbols import NOUN, PROPN, PRON, VERB - -# XXX this can probably be pruned a bit -labels = [ - "nsubj", - "nmod", - "dobj", - "nsubjpass", - "pcomp", - "pobj", - "obj", - "obl", - "dative", - "appos", - "attr", - "ROOT", -] +from ...tokens import Doc, Span -def noun_chunks(obj): - """ - Detect base noun phrases from a dependency parse. Works on both Doc and Span. - """ +# TODO: this can probably be pruned a bit +# fmt: off +labels = ["nsubj", "nmod", "dobj", "nsubjpass", "pcomp", "pobj", "obj", "obl", "dative", "appos", "attr", "ROOT"] +# fmt: on - doc = obj.doc # Ensure works on both Doc and Span. + +def noun_chunks(doclike: Union[Doc, Span]) -> Iterator[Span]: + """Detect base noun phrases from a dependency parse. Works on Doc and Span.""" + doc = doclike.doc # Ensure works on both Doc and Span. np_deps = [doc.vocab.strings.add(label) for label in labels] doc.vocab.strings.add("conj") np_label = doc.vocab.strings.add("NP") seen = set() - for i, word in enumerate(obj): + for i, word in enumerate(doclike): if word.pos not in (NOUN, PROPN, PRON): continue # Prevent nested chunks from being produced @@ -37,12 +27,10 @@ def noun_chunks(obj): unseen = [w.i for w in word.subtree if w.i not in seen] if not unseen: continue - # this takes care of particles etc.
seen.update(j.i for j in word.subtree) # This avoids duplicating embedded clauses seen.update(range(word.i + 1)) - # if the head of this is a verb, mark that and rights seen # Don't do the subtree as that can hide other phrases if word.head.pos == VERB: diff --git a/spacy/lang/ko/__init__.py b/spacy/lang/ko/__init__.py index f55660745..83cd44ded 100644 --- a/spacy/lang/ko/__init__.py +++ b/spacy/lang/ko/__init__.py @@ -40,6 +40,8 @@ def create_korean_tokenizer(): class KoreanTokenizer(DummyTokenizer): def __init__(self, nlp: Optional[Language] = None): self.vocab = nlp.vocab + # TODO: is this the right way to do it? + self.vocab.morphology.load_tag_map(TAG_MAP) MeCab = try_mecab_import() self.mecab_tokenizer = MeCab("-F%f[0],%f[7]") @@ -72,13 +74,8 @@ class KoreanTokenizer(DummyTokenizer): yield {"surface": surface, "lemma": lemma, "tag": tag} -class KoreanDefaults(Language.Defaults): - tag_map = TAG_MAP - - class Korean(Language): lang = "ko" - Defaults = KoreanDefaults default_config = Config().from_str(DEFAULT_CONFIG) diff --git a/spacy/lang/lb/__init__.py b/spacy/lang/lb/__init__.py index d381bb2e7..56b09208f 100644 --- a/spacy/lang/lb/__init__.py +++ b/spacy/lang/lb/__init__.py @@ -5,9 +5,8 @@ from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS from .punctuation import TOKENIZER_INFIXES from .lex_attrs import LEX_ATTRS from .stop_words import STOP_WORDS -from ..tokenizer_exceptions import BASE_EXCEPTIONS from ...language import Language -from ...util import update_exc, registry +from ...util import registry DEFAULT_CONFIG = """ @@ -42,7 +41,7 @@ def lex_attr_getters() -> Dict[int, Callable[[str], Any]]: class LuxembourgishDefaults(Language.Defaults): - tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS) + tokenizer_exceptions = TOKENIZER_EXCEPTIONS infixes = TOKENIZER_INFIXES diff --git a/spacy/lang/lb/tokenizer_exceptions.py b/spacy/lang/lb/tokenizer_exceptions.py index 070bb0fd7..f6cdc7f34 100644 --- a/spacy/lang/lb/tokenizer_exceptions.py +++ b/spacy/lang/lb/tokenizer_exceptions.py @@ -1,4 +1,7 @@ +from ..tokenizer_exceptions import BASE_EXCEPTIONS from ...symbols import ORTH, LEMMA, NORM +from ...util import update_exc + # TODO # treat other apostrophes within words as part of the word: [op d'mannst], [fir d'éischt] (= exceptions) @@ -47,4 +50,4 @@ for orth in [ ]: _exc[orth] = [{ORTH: orth}] -TOKENIZER_EXCEPTIONS = _exc +TOKENIZER_EXCEPTIONS = update_exc(BASE_EXCEPTIONS, _exc) diff --git a/spacy/lang/lij/__init__.py b/spacy/lang/lij/__init__.py index c7a17bee9..02f9a72b6 100644 --- a/spacy/lang/lij/__init__.py +++ b/spacy/lang/lij/__init__.py @@ -4,9 +4,8 @@ from thinc.api import Config from .stop_words import STOP_WORDS from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS from .punctuation import TOKENIZER_INFIXES -from ..tokenizer_exceptions import BASE_EXCEPTIONS from ...language import Language -from ...util import update_exc, registry +from ...util import registry DEFAULT_CONFIG = """ @@ -22,7 +21,7 @@ def stop_words() -> Set[str]: class LigurianDefaults(Language.Defaults): - tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS) + tokenizer_exceptions = TOKENIZER_EXCEPTIONS infixes = TOKENIZER_INFIXES diff --git a/spacy/lang/lij/tokenizer_exceptions.py b/spacy/lang/lij/tokenizer_exceptions.py index 2befabca3..61fa0df52 100644 --- a/spacy/lang/lij/tokenizer_exceptions.py +++ b/spacy/lang/lij/tokenizer_exceptions.py @@ -1,4 +1,7 @@ +from ..tokenizer_exceptions import BASE_EXCEPTIONS from ...symbols import ORTH, LEMMA +from ...util 
import update_exc + _exc = {} @@ -47,4 +50,4 @@ for prep, prep_lemma in [ {ORTH: prep, LEMMA: prep_lemma}, ] -TOKENIZER_EXCEPTIONS = _exc +TOKENIZER_EXCEPTIONS = update_exc(BASE_EXCEPTIONS, _exc) diff --git a/spacy/lang/lt/__init__.py b/spacy/lang/lt/__init__.py index 23c11f3a1..e82c4c4e0 100644 --- a/spacy/lang/lt/__init__.py +++ b/spacy/lang/lt/__init__.py @@ -5,9 +5,8 @@ from .punctuation import TOKENIZER_INFIXES, TOKENIZER_SUFFIXES from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS from .stop_words import STOP_WORDS from .lex_attrs import LEX_ATTRS -from ..tokenizer_exceptions import BASE_EXCEPTIONS from ...language import Language -from ...util import update_exc, registry +from ...util import registry DEFAULT_CONFIG = """ @@ -39,11 +38,7 @@ def lex_attr_getters() -> Dict[int, Callable[[str], Any]]: class LithuanianDefaults(Language.Defaults): infixes = TOKENIZER_INFIXES suffixes = TOKENIZER_SUFFIXES - mod_base_exceptions = { - exc: val for exc, val in BASE_EXCEPTIONS.items() if not exc.endswith(".") - } - del mod_base_exceptions["8)"] - tokenizer_exceptions = update_exc(mod_base_exceptions, TOKENIZER_EXCEPTIONS) + tokenizer_exceptions = TOKENIZER_EXCEPTIONS class Lithuanian(Language): diff --git a/spacy/lang/lt/tokenizer_exceptions.py b/spacy/lang/lt/tokenizer_exceptions.py index 012dfbd20..118fb2190 100644 --- a/spacy/lang/lt/tokenizer_exceptions.py +++ b/spacy/lang/lt/tokenizer_exceptions.py @@ -1,267 +1,15 @@ +from ..tokenizer_exceptions import BASE_EXCEPTIONS from ...symbols import ORTH +from ...util import update_exc + _exc = {} -for orth in [ - "n-tosios", - "?!", - # "G.", - # "J. E.", - # "J. Em.", - # "J.E.", - # "J.Em.", - # "K.", - # "N.", - # "V.", - # "Vt.", - # "a.", - # "a.k.", - # "a.s.", - # "adv.", - # "akad.", - # "aklg.", - # "akt.", - # "al.", - # "ang.", - # "angl.", - # "aps.", - # "apskr.", - # "apyg.", - # "arbat.", - # "asist.", - # "asm.", - # "asm.k.", - # "asmv.", - # "atk.", - # "atsak.", - # "atsisk.", - # "atsisk.sąsk.", - # "atv.", - # "aut.", - # "avd.", - # "b.k.", - # "baud.", - # "biol.", - # "bkl.", - # "bot.", - # "bt.", - # "buv.", - # "ch.", - # "chem.", - # "corp.", - # "d.", - # "dab.", - # "dail.", - # "dek.", - # "deš.", - # "dir.", - # "dirig.", - # "doc.", - # "dol.", - # "dr.", - # "drp.", - # "dvit.", - # "dėst.", - # "dš.", - # "dž.", - # "e.b.", - # "e.bankas", - # "e.p.", - # "e.parašas", - # "e.paštas", - # "e.v.", - # "e.valdžia", - # "egz.", - # "eil.", - # "ekon.", - # "el.", - # "el.bankas", - # "el.p.", - # "el.parašas", - # "el.paštas", - # "el.valdžia", - # "etc.", - # "ež.", - # "fak.", - # "faks.", - # "feat.", - # "filol.", - # "filos.", - # "g.", - # "gen.", - # "geol.", - # "gerb.", - # "gim.", - # "gr.", - # "gv.", - # "gyd.", - # "gyv.", - # "habil.", - # "inc.", - # "insp.", - # "inž.", - # "ir pan.", - # "ir t. t.", - # "isp.", - # "istor.", - # "it.", - # "just.", - # "k.", - # "k. a.", - # "k.a.", - # "kab.", - # "kand.", - # "kart.", - # "kat.", - # "ketv.", - # "kh.", - # "kl.", - # "kln.", - # "km.", - # "kn.", - # "koresp.", - # "kpt.", - # "kr.", - # "kt.", - # "kub.", - # "kun.", - # "kv.", - # "kyš.", - # "l. e. 
p.", - # "l.e.p.", - # "lenk.", - # "liet.", - # "lot.", - # "lt.", - # "ltd.", - # "ltn.", - # "m.", - # "m.e..", - # "m.m.", - # "mat.", - # "med.", - # "mgnt.", - # "mgr.", - # "min.", - # "mjr.", - # "ml.", - # "mln.", - # "mlrd.", - # "mob.", - # "mok.", - # "moksl.", - # "mokyt.", - # "mot.", - # "mr.", - # "mst.", - # "mstl.", - # "mėn.", - # "nkt.", - # "no.", - # "nr.", - # "ntk.", - # "nuotr.", - # "op.", - # "org.", - # "orig.", - # "p.", - # "p.d.", - # "p.m.e.", - # "p.s.", - # "pab.", - # "pan.", - # "past.", - # "pav.", - # "pavad.", - # "per.", - # "perd.", - # "pirm.", - # "pl.", - # "plg.", - # "plk.", - # "pr.", - # "pr.Kr.", - # "pranc.", - # "proc.", - # "prof.", - # "prom.", - # "prot.", - # "psl.", - # "pss.", - # "pvz.", - # "pšt.", - # "r.", - # "raj.", - # "red.", - # "rez.", - # "rež.", - # "rus.", - # "rš.", - # "s.", - # "sav.", - # "saviv.", - # "sek.", - # "sekr.", - # "sen.", - # "sh.", - # "sk.", - # "skg.", - # "skv.", - # "skyr.", - # "sp.", - # "spec.", - # "sr.", - # "st.", - # "str.", - # "stud.", - # "sąs.", - # "t.", - # "t. p.", - # "t. y.", - # "t.p.", - # "t.t.", - # "t.y.", - # "techn.", - # "tel.", - # "teol.", - # "th.", - # "tir.", - # "trit.", - # "trln.", - # "tšk.", - # "tūks.", - # "tūkst.", - # "up.", - # "upl.", - # "v.s.", - # "vad.", - # "val.", - # "valg.", - # "ved.", - # "vert.", - # "vet.", - # "vid.", - # "virš.", - # "vlsč.", - # "vnt.", - # "vok.", - # "vs.", - # "vtv.", - # "vv.", - # "vyr.", - # "vyresn.", - # "zool.", - # "Įn", - # "įl.", - # "š.m.", - # "šnek.", - # "šv.", - # "švč.", - # "ž.ū.", - # "žin.", - # "žml.", - # "žr.", -]: +for orth in ["n-tosios", "?!"]: _exc[orth] = [{ORTH: orth}] -TOKENIZER_EXCEPTIONS = _exc +mod_base_exceptions = { + exc: val for exc, val in BASE_EXCEPTIONS.items() if not exc.endswith(".") +} +del mod_base_exceptions["8)"] +TOKENIZER_EXCEPTIONS = update_exc(mod_base_exceptions, _exc) diff --git a/spacy/lang/nb/__init__.py b/spacy/lang/nb/__init__.py index 3b386344b..f26c68e91 100644 --- a/spacy/lang/nb/__init__.py +++ b/spacy/lang/nb/__init__.py @@ -1,20 +1,20 @@ -from typing import Set +from typing import Set, Callable from thinc.api import Config from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS from .punctuation import TOKENIZER_PREFIXES, TOKENIZER_INFIXES from .punctuation import TOKENIZER_SUFFIXES from .stop_words import STOP_WORDS -from .syntax_iterators import SYNTAX_ITERATORS -from ..tokenizer_exceptions import BASE_EXCEPTIONS +from .syntax_iterators import noun_chunks from ...language import Language -from ...util import update_exc, registry +from ...util import registry DEFAULT_CONFIG = """ [nlp] lang = "nb" stop_words = {"@language_data": "spacy.nb.stop_words"} +get_noun_chunks = {"@language_data": "spacy.nb.get_noun_chunks"} [nlp.lemmatizer] @lemmatizers = "spacy.Lemmatizer.v1" @@ -31,12 +31,16 @@ def stop_words() -> Set[str]: return STOP_WORDS +@registry.language_data("spacy.nb.get_noun_chunks") +def get_noun_chunks() -> Callable: + return noun_chunks + + class NorwegianDefaults(Language.Defaults): - tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS) + tokenizer_exceptions = TOKENIZER_EXCEPTIONS prefixes = TOKENIZER_PREFIXES infixes = TOKENIZER_INFIXES suffixes = TOKENIZER_SUFFIXES - syntax_iterators = SYNTAX_ITERATORS class Norwegian(Language): diff --git a/spacy/lang/nb/syntax_iterators.py b/spacy/lang/nb/syntax_iterators.py index ca711593f..d297203e3 100644 --- a/spacy/lang/nb/syntax_iterators.py +++ b/spacy/lang/nb/syntax_iterators.py @@ -1,26 +1,18 
@@ +from typing import Union, Iterator + from ...symbols import NOUN, PROPN, PRON from ...errors import Errors +from ...tokens import Doc, Span -def noun_chunks(doclike): - """ - Detect base noun phrases from a dependency parse. Works on both Doc and Span. - """ - labels = [ - "nsubj", - "nsubj:pass", - "obj", - "iobj", - "ROOT", - "appos", - "nmod", - "nmod:poss", - ] +def noun_chunks(doclike: Union[Doc, Span]) -> Iterator[Span]: + """Detect base noun phrases from a dependency parse. Works on Doc and Span.""" + # fmt: off + labels = ["nsubj", "nsubj:pass", "obj", "iobj", "ROOT", "appos", "nmod", "nmod:poss"] + # fmt: on doc = doclike.doc # Ensure works on both Doc and Span. - if not doc.is_parsed: raise ValueError(Errors.E029) - np_deps = [doc.vocab.strings[label] for label in labels] conj = doc.vocab.strings.add("conj") np_label = doc.vocab.strings.add("NP") diff --git a/spacy/lang/nb/tokenizer_exceptions.py b/spacy/lang/nb/tokenizer_exceptions.py index eb67e4c89..9a604cedc 100644 --- a/spacy/lang/nb/tokenizer_exceptions.py +++ b/spacy/lang/nb/tokenizer_exceptions.py @@ -1,4 +1,6 @@ +from ..tokenizer_exceptions import BASE_EXCEPTIONS from ...symbols import ORTH, LEMMA +from ...util import update_exc _exc = {} @@ -218,4 +220,4 @@ for orth in [ _exc[orth] = [{ORTH: orth}] -TOKENIZER_EXCEPTIONS = _exc +TOKENIZER_EXCEPTIONS = update_exc(BASE_EXCEPTIONS, _exc) diff --git a/spacy/lang/nl/__init__.py b/spacy/lang/nl/__init__.py index ab2cf3a94..9bf58fddd 100644 --- a/spacy/lang/nl/__init__.py +++ b/spacy/lang/nl/__init__.py @@ -7,9 +7,8 @@ from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS from .punctuation import TOKENIZER_PREFIXES, TOKENIZER_INFIXES from .punctuation import TOKENIZER_SUFFIXES from .lemmatizer import DutchLemmatizer -from ..tokenizer_exceptions import BASE_EXCEPTIONS from ...language import Language -from ...util import update_exc, registry +from ...util import registry DEFAULT_CONFIG = """ @@ -44,7 +43,7 @@ def create_dutch_lemmatizer(data: Dict[str, dict] = {}) -> DutchLemmatizer: class DutchDefaults(Language.Defaults): - tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS) + tokenizer_exceptions = TOKENIZER_EXCEPTIONS prefixes = TOKENIZER_PREFIXES infixes = TOKENIZER_INFIXES suffixes = TOKENIZER_SUFFIXES diff --git a/spacy/lang/nl/tokenizer_exceptions.py b/spacy/lang/nl/tokenizer_exceptions.py index df69c7a8a..489d10d71 100644 --- a/spacy/lang/nl/tokenizer_exceptions.py +++ b/spacy/lang/nl/tokenizer_exceptions.py @@ -1,4 +1,7 @@ +from ..tokenizer_exceptions import BASE_EXCEPTIONS from ...symbols import ORTH +from ...util import update_exc + # Extensive list of both common and uncommon dutch abbreviations copied from # github.com/diasks2/pragmatic_segmenter, a Ruby library for rule-based @@ -1602,4 +1605,4 @@ for orth in abbrevs: _exc[i] = [{ORTH: i}] -TOKENIZER_EXCEPTIONS = _exc +TOKENIZER_EXCEPTIONS = update_exc(BASE_EXCEPTIONS, _exc) diff --git a/spacy/lang/pt/__init__.py b/spacy/lang/pt/__init__.py index 045bd3bc1..fce12393d 100644 --- a/spacy/lang/pt/__init__.py +++ b/spacy/lang/pt/__init__.py @@ -4,10 +4,9 @@ from thinc.api import Config from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS from .stop_words import STOP_WORDS from .lex_attrs import LEX_ATTRS -from ..tokenizer_exceptions import BASE_EXCEPTIONS from .punctuation import TOKENIZER_INFIXES, TOKENIZER_PREFIXES from ...language import Language -from ...util import update_exc, registry +from ...util import registry DEFAULT_CONFIG = """ @@ -37,7 +36,7 @@ def lex_attr_getters() -> 
Dict[int, Callable[[str], Any]]: class PortugueseDefaults(Language.Defaults): - tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS) + tokenizer_exceptions = TOKENIZER_EXCEPTIONS infixes = TOKENIZER_INFIXES prefixes = TOKENIZER_PREFIXES diff --git a/spacy/lang/pt/tokenizer_exceptions.py b/spacy/lang/pt/tokenizer_exceptions.py index c5c5d49e8..187fc65ea 100644 --- a/spacy/lang/pt/tokenizer_exceptions.py +++ b/spacy/lang/pt/tokenizer_exceptions.py @@ -1,4 +1,6 @@ +from ..tokenizer_exceptions import BASE_EXCEPTIONS from ...symbols import ORTH +from ...util import update_exc _exc = {} @@ -50,4 +52,4 @@ for orth in [ _exc[orth] = [{ORTH: orth}] -TOKENIZER_EXCEPTIONS = _exc +TOKENIZER_EXCEPTIONS = update_exc(BASE_EXCEPTIONS, _exc) diff --git a/spacy/lang/ro/__init__.py b/spacy/lang/ro/__init__.py index 740bd7911..881188b21 100644 --- a/spacy/lang/ro/__init__.py +++ b/spacy/lang/ro/__init__.py @@ -5,9 +5,8 @@ from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS from .stop_words import STOP_WORDS from .punctuation import TOKENIZER_PREFIXES, TOKENIZER_INFIXES from .punctuation import TOKENIZER_SUFFIXES -from ..tokenizer_exceptions import BASE_EXCEPTIONS from ...language import Language -from ...util import update_exc, registry +from ...util import registry # Lemma data note: # Original pairs downloaded from http://www.lexiconista.com/datasets/lemmatization/ @@ -35,7 +34,7 @@ def stop_words() -> Set[str]: class RomanianDefaults(Language.Defaults): - tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS) + tokenizer_exceptions = TOKENIZER_EXCEPTIONS prefixes = TOKENIZER_PREFIXES suffixes = TOKENIZER_SUFFIXES infixes = TOKENIZER_INFIXES diff --git a/spacy/lang/ro/tokenizer_exceptions.py b/spacy/lang/ro/tokenizer_exceptions.py index eb5f95dfb..b8af0b1d6 100644 --- a/spacy/lang/ro/tokenizer_exceptions.py +++ b/spacy/lang/ro/tokenizer_exceptions.py @@ -1,4 +1,6 @@ +from ..tokenizer_exceptions import BASE_EXCEPTIONS from ...symbols import ORTH +from ...util import update_exc from .punctuation import _make_ro_variants @@ -91,4 +93,4 @@ for orth in [ _exc[variant] = [{ORTH: variant}] -TOKENIZER_EXCEPTIONS = _exc +TOKENIZER_EXCEPTIONS = update_exc(BASE_EXCEPTIONS, _exc) diff --git a/spacy/lang/ru/__init__.py b/spacy/lang/ru/__init__.py index e9e28dfb5..b37ac6226 100644 --- a/spacy/lang/ru/__init__.py +++ b/spacy/lang/ru/__init__.py @@ -5,8 +5,7 @@ from .stop_words import STOP_WORDS from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS from .lex_attrs import LEX_ATTRS from .lemmatizer import RussianLemmatizer -from ..tokenizer_exceptions import BASE_EXCEPTIONS -from ...util import update_exc, registry +from ...util import registry from ...language import Language @@ -42,7 +41,7 @@ def create_russian_lemmatizer() -> RussianLemmatizer: class RussianDefaults(Language.Defaults): - tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS) + tokenizer_exceptions = TOKENIZER_EXCEPTIONS class Russian(Language): diff --git a/spacy/lang/ru/tokenizer_exceptions.py b/spacy/lang/ru/tokenizer_exceptions.py index df3169baf..e4fbd2d75 100644 --- a/spacy/lang/ru/tokenizer_exceptions.py +++ b/spacy/lang/ru/tokenizer_exceptions.py @@ -1,4 +1,6 @@ +from ..tokenizer_exceptions import BASE_EXCEPTIONS from ...symbols import ORTH, LEMMA, NORM +from ...util import update_exc _exc = {} @@ -63,4 +65,4 @@ for slang_desc in _slang_exc: _exc[slang_desc[ORTH]] = [slang_desc] -TOKENIZER_EXCEPTIONS = _exc +TOKENIZER_EXCEPTIONS = update_exc(BASE_EXCEPTIONS, _exc) diff --git 
a/spacy/lang/sr/__init__.py b/spacy/lang/sr/__init__.py index f69ad3a89..36703aa5f 100644 --- a/spacy/lang/sr/__init__.py +++ b/spacy/lang/sr/__init__.py @@ -4,9 +4,8 @@ from thinc.api import Config from .stop_words import STOP_WORDS from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS from .lex_attrs import LEX_ATTRS -from ..tokenizer_exceptions import BASE_EXCEPTIONS from ...language import Language -from ...util import update_exc, registry +from ...util import registry DEFAULT_CONFIG = """ @@ -41,7 +40,7 @@ def lex_attr_getters() -> Dict[int, Callable[[str], Any]]: class SerbianDefaults(Language.Defaults): - tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS) + tokenizer_exceptions = TOKENIZER_EXCEPTIONS class Serbian(Language): diff --git a/spacy/lang/sr/tokenizer_exceptions.py b/spacy/lang/sr/tokenizer_exceptions.py index 82df15186..a41fe7e4e 100755 --- a/spacy/lang/sr/tokenizer_exceptions.py +++ b/spacy/lang/sr/tokenizer_exceptions.py @@ -1,4 +1,6 @@ +from ..tokenizer_exceptions import BASE_EXCEPTIONS from ...symbols import ORTH, LEMMA, NORM +from ...util import update_exc _exc = {} @@ -90,4 +92,4 @@ for slang_desc in _slang_exc: _exc[slang_desc[ORTH]] = [slang_desc] -TOKENIZER_EXCEPTIONS = _exc +TOKENIZER_EXCEPTIONS = update_exc(BASE_EXCEPTIONS, _exc) diff --git a/spacy/lang/sv/__init__.py b/spacy/lang/sv/__init__.py index c18ad775d..dc9f71ac6 100644 --- a/spacy/lang/sv/__init__.py +++ b/spacy/lang/sv/__init__.py @@ -4,10 +4,9 @@ from thinc.api import Config from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS from .stop_words import STOP_WORDS from .lex_attrs import LEX_ATTRS -from ..tokenizer_exceptions import BASE_EXCEPTIONS from ...language import Language -from ...util import update_exc, registry -from .syntax_iterators import SYNTAX_ITERATORS +from ...util import registry +from .syntax_iterators import noun_chunks # Punctuation stolen from Danish from ..da.punctuation import TOKENIZER_INFIXES, TOKENIZER_SUFFIXES @@ -18,6 +17,7 @@ DEFAULT_CONFIG = """ lang = "sv" stop_words = {"@language_data": "spacy.sv.stop_words"} lex_attr_getters = {"@language_data": "spacy.sv.lex_attr_getters"} +get_noun_chunks = {"@language_data": "spacy.sv.get_noun_chunks"} [nlp.lemmatizer] @lemmatizers = "spacy.Lemmatizer.v1" @@ -39,11 +39,15 @@ def lex_attr_getters() -> Dict[int, Callable[[str], Any]]: return LEX_ATTRS +@registry.language_data("spacy.sv.get_noun_chunks") +def get_noun_chunks() -> Callable: + return noun_chunks + + class SwedishDefaults(Language.Defaults): - tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS) + tokenizer_exceptions = TOKENIZER_EXCEPTIONS infixes = TOKENIZER_INFIXES suffixes = TOKENIZER_SUFFIXES - syntax_iterators = SYNTAX_ITERATORS class Swedish(Language): diff --git a/spacy/lang/sv/syntax_iterators.py b/spacy/lang/sv/syntax_iterators.py index c3de21921..662b508ed 100644 --- a/spacy/lang/sv/syntax_iterators.py +++ b/spacy/lang/sv/syntax_iterators.py @@ -1,27 +1,18 @@ +from typing import Union, Iterator + from ...symbols import NOUN, PROPN, PRON from ...errors import Errors +from ...tokens import Doc, Span -def noun_chunks(doclike): - """ - Detect base noun phrases from a dependency parse. Works on both Doc and Span. - """ - labels = [ - "nsubj", - "nsubj:pass", - "dobj", - "obj", - "iobj", - "ROOT", - "appos", - "nmod", - "nmod:poss", - ] +def noun_chunks(doclike: Union[Doc, Span]) -> Iterator[Span]: + """Detect base noun phrases from a dependency parse. 
Works on Doc and Span.""" + # fmt: off + labels = ["nsubj", "nsubj:pass", "dobj", "obj", "iobj", "ROOT", "appos", "nmod", "nmod:poss"] + # fmt: on doc = doclike.doc # Ensure works on both Doc and Span. - if not doc.is_parsed: raise ValueError(Errors.E029) - np_deps = [doc.vocab.strings[label] for label in labels] conj = doc.vocab.strings.add("conj") np_label = doc.vocab.strings.add("NP") diff --git a/spacy/lang/sv/tokenizer_exceptions.py b/spacy/lang/sv/tokenizer_exceptions.py index a78a51f31..f1b914bff 100644 --- a/spacy/lang/sv/tokenizer_exceptions.py +++ b/spacy/lang/sv/tokenizer_exceptions.py @@ -1,4 +1,6 @@ +from ..tokenizer_exceptions import BASE_EXCEPTIONS from ...symbols import LEMMA, NORM, ORTH, PRON_LEMMA +from ...util import update_exc _exc = {} @@ -154,4 +156,4 @@ for orth in ABBREVIATIONS: for orth in ["i", "m"]: _exc[orth + "."] = [{ORTH: orth, LEMMA: orth, NORM: orth}, {ORTH: "."}] -TOKENIZER_EXCEPTIONS = _exc +TOKENIZER_EXCEPTIONS = update_exc(BASE_EXCEPTIONS, _exc) diff --git a/spacy/lang/tag_map.py b/spacy/lang/tag_map.py deleted file mode 100644 index 5bff905bd..000000000 --- a/spacy/lang/tag_map.py +++ /dev/null @@ -1,25 +0,0 @@ -from ..symbols import POS, ADV, NOUN, ADP, PRON, SCONJ, PROPN, DET, SYM, INTJ -from ..symbols import PUNCT, NUM, AUX, X, CONJ, ADJ, VERB, PART, SPACE, CCONJ - - -TAG_MAP = { - "ADV": {POS: ADV}, - "NOUN": {POS: NOUN}, - "ADP": {POS: ADP}, - "PRON": {POS: PRON}, - "SCONJ": {POS: SCONJ}, - "PROPN": {POS: PROPN}, - "DET": {POS: DET}, - "SYM": {POS: SYM}, - "INTJ": {POS: INTJ}, - "PUNCT": {POS: PUNCT}, - "NUM": {POS: NUM}, - "AUX": {POS: AUX}, - "X": {POS: X}, - "CONJ": {POS: CONJ}, - "CCONJ": {POS: CCONJ}, - "ADJ": {POS: ADJ}, - "VERB": {POS: VERB}, - "PART": {POS: PART}, - "_SP": {POS: SPACE}, -} diff --git a/spacy/lang/tl/__init__.py b/spacy/lang/tl/__init__.py index a7158e6f6..7176e07d4 100644 --- a/spacy/lang/tl/__init__.py +++ b/spacy/lang/tl/__init__.py @@ -4,9 +4,8 @@ from thinc.api import Config from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS from .stop_words import STOP_WORDS from .lex_attrs import LEX_ATTRS -from ..tokenizer_exceptions import BASE_EXCEPTIONS from ...language import Language -from ...util import update_exc, registry +from ...util import registry DEFAULT_CONFIG = """ @@ -36,7 +35,7 @@ def lex_attr_getters() -> Dict[int, Callable[[str], Any]]: class TagalogDefaults(Language.Defaults): - tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS) + tokenizer_exceptions = TOKENIZER_EXCEPTIONS class Tagalog(Language): diff --git a/spacy/lang/tl/tokenizer_exceptions.py b/spacy/lang/tl/tokenizer_exceptions.py index ea14746c4..f81d35f20 100644 --- a/spacy/lang/tl/tokenizer_exceptions.py +++ b/spacy/lang/tl/tokenizer_exceptions.py @@ -1,4 +1,6 @@ +from ..tokenizer_exceptions import BASE_EXCEPTIONS from ...symbols import ORTH, LEMMA +from ...util import update_exc _exc = { @@ -14,4 +16,4 @@ _exc = { } -TOKENIZER_EXCEPTIONS = _exc +TOKENIZER_EXCEPTIONS = update_exc(BASE_EXCEPTIONS, _exc) diff --git a/spacy/lang/tokenizer_exceptions.py b/spacy/lang/tokenizer_exceptions.py index f732a9097..45391332e 100644 --- a/spacy/lang/tokenizer_exceptions.py +++ b/spacy/lang/tokenizer_exceptions.py @@ -55,7 +55,6 @@ URL_PATTERN = ( # fmt: on ).strip() -TOKEN_MATCH = None URL_MATCH = re.compile("(?u)" + URL_PATTERN).match diff --git a/spacy/lang/tr/__init__.py b/spacy/lang/tr/__init__.py index dff56e945..3bb1e0d06 100644 --- a/spacy/lang/tr/__init__.py +++ b/spacy/lang/tr/__init__.py @@ -3,9 +3,8 @@ from thinc.api import Config 
from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS from .stop_words import STOP_WORDS -from ..tokenizer_exceptions import BASE_EXCEPTIONS from ...language import Language -from ...util import update_exc, registry +from ...util import registry DEFAULT_CONFIG = """ @@ -29,7 +28,7 @@ def stop_words() -> Set[str]: class TurkishDefaults(Language.Defaults): - tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS) + tokenizer_exceptions = TOKENIZER_EXCEPTIONS class Turkish(Language): diff --git a/spacy/lang/tr/tokenizer_exceptions.py b/spacy/lang/tr/tokenizer_exceptions.py index 97f524a87..b84ef89a2 100644 --- a/spacy/lang/tr/tokenizer_exceptions.py +++ b/spacy/lang/tr/tokenizer_exceptions.py @@ -1,4 +1,7 @@ +from ..tokenizer_exceptions import BASE_EXCEPTIONS from ...symbols import ORTH, NORM +from ...util import update_exc + _exc = {"sağol": [{ORTH: "sağ"}, {ORTH: "ol", NORM: "olun"}]} @@ -113,4 +116,4 @@ for orth in ["Dr.", "yy."]: _exc[orth] = [{ORTH: orth}] -TOKENIZER_EXCEPTIONS = _exc +TOKENIZER_EXCEPTIONS = update_exc(BASE_EXCEPTIONS, _exc) diff --git a/spacy/lang/tt/__init__.py b/spacy/lang/tt/__init__.py index 45f9a24b0..d4828d96c 100644 --- a/spacy/lang/tt/__init__.py +++ b/spacy/lang/tt/__init__.py @@ -5,9 +5,8 @@ from .lex_attrs import LEX_ATTRS from .punctuation import TOKENIZER_INFIXES from .stop_words import STOP_WORDS from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS -from ..tokenizer_exceptions import BASE_EXCEPTIONS from ...language import Language -from ...util import update_exc, registry +from ...util import registry DEFAULT_CONFIG = """ @@ -29,7 +28,7 @@ def lex_attr_getters() -> Dict[int, Callable[[str], Any]]: class TatarDefaults(Language.Defaults): - tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS) + tokenizer_exceptions = TOKENIZER_EXCEPTIONS infixes = tuple(TOKENIZER_INFIXES) diff --git a/spacy/lang/tt/tokenizer_exceptions.py b/spacy/lang/tt/tokenizer_exceptions.py index efe9e1fc0..4ee6b6cd2 100644 --- a/spacy/lang/tt/tokenizer_exceptions.py +++ b/spacy/lang/tt/tokenizer_exceptions.py @@ -1,4 +1,7 @@ +from ..tokenizer_exceptions import BASE_EXCEPTIONS from ...symbols import ORTH, LEMMA, NORM +from ...util import update_exc + _exc = {} @@ -43,4 +46,4 @@ for exc_data in [ # "etc." 
abbreviations exc_data[LEMMA] = exc_data[NORM] _exc[exc_data[ORTH]] = [exc_data] -TOKENIZER_EXCEPTIONS = _exc +TOKENIZER_EXCEPTIONS = update_exc(BASE_EXCEPTIONS, _exc) diff --git a/spacy/lang/uk/__init__.py b/spacy/lang/uk/__init__.py index 72b70caa9..24a859951 100644 --- a/spacy/lang/uk/__init__.py +++ b/spacy/lang/uk/__init__.py @@ -4,8 +4,7 @@ from thinc.api import Config from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS from .stop_words import STOP_WORDS from .lex_attrs import LEX_ATTRS -from ..tokenizer_exceptions import BASE_EXCEPTIONS -from ...util import update_exc, registry +from ...util import registry from ...language import Language from .lemmatizer import UkrainianLemmatizer @@ -37,7 +36,7 @@ def create_ukrainian_lemmatizer() -> UkrainianLemmatizer: class UkrainianDefaults(Language.Defaults): - tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS) + tokenizer_exceptions = TOKENIZER_EXCEPTIONS class Ukrainian(Language): diff --git a/spacy/lang/uk/tokenizer_exceptions.py b/spacy/lang/uk/tokenizer_exceptions.py index 36f0b2e72..8ae82a48c 100644 --- a/spacy/lang/uk/tokenizer_exceptions.py +++ b/spacy/lang/uk/tokenizer_exceptions.py @@ -1,4 +1,6 @@ +from ..tokenizer_exceptions import BASE_EXCEPTIONS from ...symbols import ORTH, LEMMA, POS, NORM, NOUN +from ...util import update_exc _exc = {} @@ -21,4 +23,4 @@ for exc_data in [ _exc[exc_data[ORTH]] = [exc_data] -TOKENIZER_EXCEPTIONS = _exc +TOKENIZER_EXCEPTIONS = update_exc(BASE_EXCEPTIONS, _exc) diff --git a/spacy/language.py b/spacy/language.py index 77d0b4b0e..53fc286b3 100644 --- a/spacy/language.py +++ b/spacy/language.py @@ -24,9 +24,7 @@ from .util import link_vectors_to_models, create_default_optimizer, registry from .util import SimpleFrozenDict from .lang.punctuation import TOKENIZER_PREFIXES, TOKENIZER_SUFFIXES from .lang.punctuation import TOKENIZER_INFIXES -from .lang.tokenizer_exceptions import TOKEN_MATCH, URL_MATCH -from .lang.tag_map import TAG_MAP -from .tokens import Doc, Span +from .tokens import Doc from .errors import Errors, Warnings from .schemas import ConfigSchema from .git_info import GIT_VERSION @@ -37,6 +35,7 @@ from . import about from .tokenizer import Tokenizer # noqa: F401 from .lemmatizer import Lemmatizer # noqa: F401 from .lookups import Lookups # noqa: F401 +from .lang import defaults # noqa: F401 ENABLE_PIPELINE_ANALYSIS = False @@ -46,15 +45,10 @@ DEFAULT_CONFIG = Config().from_disk(DEFAULT_CONFIG_PATH) class BaseDefaults: - token_match: Optional[Pattern] = TOKEN_MATCH - url_match: Pattern = URL_MATCH prefixes: Tuple[Pattern, ...] = tuple(TOKENIZER_PREFIXES) suffixes: Tuple[Pattern, ...] = tuple(TOKENIZER_SUFFIXES) infixes: Tuple[Pattern, ...] = tuple(TOKENIZER_INFIXES) - tag_map: Dict[str, dict] = dict(TAG_MAP) tokenizer_exceptions: Dict[str, List[dict]] = {} - morph_rules: Dict[str, Dict[str, dict]] = {} - syntax_iterators: Dict[str, Callable[[Union[Doc, Span]], Iterator]] = {} class Language: @@ -114,13 +108,7 @@ class Language: if vocab is True: vectors_name = meta.get("vectors", {}).get("name") - vocab = Vocab.from_config( - self._config, - vectors_name=vectors_name, - # TODO: what should we do with these? 
- tag_map=self.Defaults.tag_map, - morph_rules=self.Defaults.morph_rules, - ) + vocab = Vocab.from_config(self._config, vectors_name=vectors_name) else: if (self.lang and vocab.lang) and (self.lang != vocab.lang): raise ValueError(Errors.E150.format(nlp=self.lang, vocab=vocab.lang)) @@ -1267,15 +1255,14 @@ class Language: lex_attr_getters = resolved["nlp"]["lex_attr_getters"] stop_words = resolved["nlp"]["stop_words"] vocab_data = resolved["nlp"]["vocab_data"] + get_noun_chunks = resolved["nlp"]["get_noun_chunks"] vocab = Vocab.from_config( filled, lemmatizer=lemmatizer, lex_attr_getters=lex_attr_getters, stop_words=stop_words, vocab_data=vocab_data, - # TODO: what should we do with these? - tag_map=cls.Defaults.tag_map, - morph_rules=cls.Defaults.morph_rules, + get_noun_chunks=get_noun_chunks, ) nlp = cls(vocab, create_tokenizer=create_tokenizer) pipeline = config.get("components", {}) diff --git a/spacy/schemas.py b/spacy/schemas.py index 590032559..8b6e3ebab 100644 --- a/spacy/schemas.py +++ b/spacy/schemas.py @@ -243,6 +243,7 @@ class ConfigSchemaNlp(BaseModel): stop_words: Sequence[StrictStr] = Field(..., title="Stop words to mark via Token/Lexeme.is_stop") lex_attr_getters: Dict[StrictStr, Callable] = Field(..., title="Custom getter functions for lexical attributes (e.g. like_num)") vocab_data: Dict[StrictStr, Dict[StrictStr, Any]] = Field(..., title="Vocabulary data, e.g. lexeme normalization tables") + get_noun_chunks: Optional[Callable] = Field(..., title="Function to extract noun phrases from a Doc") # fmt: on class Config: diff --git a/spacy/tests/lang/en/test_noun_chunks.py b/spacy/tests/lang/en/test_noun_chunks.py index 2d3362317..5395dbabe 100644 --- a/spacy/tests/lang/en/test_noun_chunks.py +++ b/spacy/tests/lang/en/test_noun_chunks.py @@ -1,7 +1,7 @@ import numpy from spacy.attrs import HEAD, DEP from spacy.symbols import nsubj, dobj, amod, nmod, conj, cc, root -from spacy.lang.en.syntax_iterators import SYNTAX_ITERATORS +from spacy.lang.en.syntax_iterators import noun_chunks import pytest @@ -41,7 +41,7 @@ def test_en_noun_chunks_not_nested(en_vocab): dtype="uint64", ), ) - doc.noun_chunks_iterator = SYNTAX_ITERATORS["noun_chunks"] + doc.noun_chunks_iterator = noun_chunks word_occurred = {} for chunk in doc.noun_chunks: for word in chunk: diff --git a/spacy/tokenizer.pyx b/spacy/tokenizer.pyx index 114d227c8..5fffa4503 100644 --- a/spacy/tokenizer.pyx +++ b/spacy/tokenizer.pyx @@ -9,7 +9,7 @@ from cymem.cymem cimport Pool from preshed.maps cimport PreshMap cimport cython -from typing import Dict, List, Union, Pattern, Optional +from typing import Dict, List, Union, Pattern, Optional, Any import re import warnings @@ -32,16 +32,16 @@ def create_tokenizer( # prefixes: Optional[List[Union[str, Pattern]]], # suffixes: Optional[List[Union[str, Pattern]]], # infixes: Optional[List[Union[str, Pattern]]], - # token_match: Optional[Pattern], - # url_match: Optional[Pattern], + # We currently can't validate against Pattern because that will cause + # Pydantic to parse value *as* pattern + token_match: Optional[Any] = None, + url_match: Optional[Any] = None, ) -> "Tokenizer": def tokenizer_factory(nlp): exceptions = nlp.Defaults.tokenizer_exceptions prefixes = nlp.Defaults.prefixes suffixes = nlp.Defaults.suffixes infixes = nlp.Defaults.infixes - url_match = nlp.Defaults.url_match - token_match = nlp.Defaults.token_match prefix_search = util.compile_prefix_regex(prefixes).search if prefixes else None suffix_search = util.compile_suffix_regex(suffixes).search if suffixes else None 
infix_finditer = util.compile_infix_regex(infixes).finditer if infixes else None diff --git a/spacy/tokens/doc.pyx b/spacy/tokens/doc.pyx index ce8053796..adc7059e5 100644 --- a/spacy/tokens/doc.pyx +++ b/spacy/tokens/doc.pyx @@ -89,16 +89,6 @@ cdef attr_t get_token_attr_for_matcher(const TokenC* token, attr_id_t feat_name) return get_token_attr(token, feat_name) -def _get_chunker(lang): - try: - cls = util.get_lang_class(lang) - except ImportError: - return None - except KeyError: - return None - return cls.Defaults.syntax_iterators.get("noun_chunks") - - cdef class Doc: """A sequence of Token objects. Access sentences and named entities, export annotations to numpy arrays, losslessly serialize to compressed binary @@ -212,7 +202,7 @@ cdef class Doc: self.tensor = numpy.zeros((0,), dtype="float32") self.user_data = {} if user_data is None else user_data self._vector = None - self.noun_chunks_iterator = _get_chunker(self.vocab.lang) + self.noun_chunks_iterator = self.vocab.get_noun_chunks cdef bint has_space if words is None and spaces is not None: raise ValueError("words must be set if spaces is set") diff --git a/spacy/vocab.pxd b/spacy/vocab.pxd index a31c984ad..69cec7d3d 100644 --- a/spacy/vocab.pxd +++ b/spacy/vocab.pxd @@ -30,6 +30,7 @@ cdef class Vocab: cpdef public object vectors cpdef public object lookups cpdef public object writing_system + cpdef public object get_noun_chunks cdef readonly int length cdef public object data_dir cdef public object lex_attr_getters diff --git a/spacy/vocab.pyx b/spacy/vocab.pyx index 1afee4f69..1a4959833 100644 --- a/spacy/vocab.pyx +++ b/spacy/vocab.pyx @@ -30,10 +30,10 @@ cdef class Vocab: DOCS: https://spacy.io/api/vocab """ - def __init__(self, lex_attr_getters=None, tag_map=None, lemmatizer=None, - strings=tuple(), lookups=None, vocab_data={}, + def __init__(self, lex_attr_getters=None, lemmatizer=None, + strings=tuple(), lookups=None, tag_map={}, vocab_data={}, oov_prob=-20., vectors_name=None, writing_system={}, - **deprecated_kwargs): + get_noun_chunks=None, **deprecated_kwargs): """Create the vocabulary. lex_attr_getters (dict): A dictionary mapping attribute IDs to @@ -49,7 +49,6 @@ cdef class Vocab: RETURNS (Vocab): The newly constructed object. """ lex_attr_getters = lex_attr_getters if lex_attr_getters is not None else {} - tag_map = tag_map if tag_map is not None else {} if lookups in (None, True, False): lookups = Lookups() for name, data in vocab_data.items(): @@ -71,6 +70,7 @@ cdef class Vocab: self.vectors = Vectors(name=vectors_name) self.lookups = lookups self.writing_system = writing_system + self.get_noun_chunks = get_noun_chunks @property def lang(self): @@ -424,9 +424,8 @@ cdef class Vocab: lex_attr_getters=None, stop_words=None, vocab_data=None, + get_noun_chunks=None, vectors_name=None, - tag_map=None, - morph_rules=None ): """Create a Vocab from a config and (currently) language defaults, i.e. nlp.Defaults. 
@@ -449,6 +448,9 @@ cdef class Vocab: if vocab_data is None: vocab_data_cfg = {"vocab_data": config["nlp"]["vocab_data"]} vocab_data = registry.make_from_config(vocab_data_cfg)["vocab_data"] + if get_noun_chunks is None: + noun_chunks_cfg = {"get_noun_chunks": config["nlp"]["get_noun_chunks"]} + get_noun_chunks = registry.make_from_config(noun_chunks_cfg)["get_noun_chunks"] if lex_attr_getters is None: lex_attrs_cfg = {"lex_attr_getters": config["nlp"]["lex_attr_getters"]} lex_attr_getters = registry.make_from_config(lex_attrs_cfg)["lex_attr_getters"] @@ -468,10 +470,8 @@ cdef class Vocab: vocab_data=vocab_data, lemmatizer=lemmatizer, writing_system=writing_system, - tag_map=tag_map, + get_noun_chunks=get_noun_chunks ) - if morph_rules is not None: - vocab.morphology.load_morph_exceptions(morph_rules) if vocab.vectors.name is None and vectors_name: vocab.vectors.name = vectors_name return vocab From a624ae06756cb883c4a6b870f58e4bb3f39d9e45 Mon Sep 17 00:00:00 2001 From: Ines Montani Date: Wed, 22 Jul 2020 23:09:01 +0200 Subject: [PATCH 05/14] Remove POS, TAG and LEMMA from tokenizer exceptions --- spacy/lang/ar/tokenizer_exceptions.py | 46 +- spacy/lang/bn/tokenizer_exceptions.py | 26 +- spacy/lang/ca/tokenizer_exceptions.py | 44 +- spacy/lang/da/tokenizer_exceptions.py | 112 +- spacy/lang/de/tokenizer_exceptions.py | 270 +- spacy/lang/el/tokenizer_exceptions.py | 178 +- spacy/lang/en/tokenizer_exceptions.py | 475 ++- spacy/lang/es/tokenizer_exceptions.py | 36 +- spacy/lang/fa/tokenizer_exceptions.py | 3494 ++++------------- spacy/lang/fi/tokenizer_exceptions.py | 138 +- spacy/lang/fr/tokenizer_exceptions.py | 92 +- spacy/lang/ga/tokenizer_exceptions.py | 124 +- spacy/lang/id/tokenizer_exceptions.py | 52 +- spacy/lang/it/tokenizer_exceptions.py | 4 +- spacy/lang/lb/tokenizer_exceptions.py | 28 +- spacy/lang/lij/tokenizer_exceptions.py | 71 +- spacy/lang/nb/tokenizer_exceptions.py | 24 +- spacy/lang/ru/tokenizer_exceptions.py | 84 +- spacy/lang/sr/tokenizer_exceptions.py | 130 +- spacy/lang/sv/tokenizer_exceptions.py | 87 +- spacy/lang/th/tokenizer_exceptions.py | 865 ++-- spacy/lang/tl/tokenizer_exceptions.py | 20 +- spacy/lang/tokenizer_exceptions.py | 14 +- spacy/lang/tt/tokenizer_exceptions.py | 49 +- spacy/lang/uk/tokenizer_exceptions.py | 28 +- spacy/tests/lang/ar/test_exceptions.py | 1 - spacy/tests/lang/ca/test_exception.py | 2 - spacy/tests/lang/de/test_exceptions.py | 1 - spacy/tests/lang/en/test_exceptions.py | 2 - spacy/tests/lang/es/test_exception.py | 2 - spacy/tests/lang/fr/test_exceptions.py | 19 +- spacy/tests/lang/lb/test_exceptions.py | 1 - spacy/tests/regression/test_issue1501-2000.py | 2 - spacy/tests/regression/test_issue2501-3000.py | 1 - 34 files changed, 2173 insertions(+), 4349 deletions(-) diff --git a/spacy/lang/ar/tokenizer_exceptions.py b/spacy/lang/ar/tokenizer_exceptions.py index ce0f91ef7..7c385bef8 100644 --- a/spacy/lang/ar/tokenizer_exceptions.py +++ b/spacy/lang/ar/tokenizer_exceptions.py @@ -1,5 +1,5 @@ from ..tokenizer_exceptions import BASE_EXCEPTIONS -from ...symbols import ORTH, LEMMA +from ...symbols import ORTH, NORM from ...util import update_exc @@ -8,41 +8,41 @@ _exc = {} # Time for exc_data in [ - {LEMMA: "قبل الميلاد", ORTH: "ق.م"}, - {LEMMA: "بعد الميلاد", ORTH: "ب. م"}, - {LEMMA: "ميلادي", ORTH: ".م"}, - {LEMMA: "هجري", ORTH: ".هـ"}, - {LEMMA: "توفي", ORTH: ".ت"}, + {NORM: "قبل الميلاد", ORTH: "ق.م"}, + {NORM: "بعد الميلاد", ORTH: "ب. 
م"}, + {NORM: "ميلادي", ORTH: ".م"}, + {NORM: "هجري", ORTH: ".هـ"}, + {NORM: "توفي", ORTH: ".ت"}, ]: _exc[exc_data[ORTH]] = [exc_data] # Scientific abv. for exc_data in [ - {LEMMA: "صلى الله عليه وسلم", ORTH: "صلعم"}, - {LEMMA: "الشارح", ORTH: "الشـ"}, - {LEMMA: "الظاهر", ORTH: "الظـ"}, - {LEMMA: "أيضًا", ORTH: "أيضـ"}, - {LEMMA: "إلى آخره", ORTH: "إلخ"}, - {LEMMA: "انتهى", ORTH: "اهـ"}, - {LEMMA: "حدّثنا", ORTH: "ثنا"}, - {LEMMA: "حدثني", ORTH: "ثنى"}, - {LEMMA: "أنبأنا", ORTH: "أنا"}, - {LEMMA: "أخبرنا", ORTH: "نا"}, - {LEMMA: "مصدر سابق", ORTH: "م. س"}, - {LEMMA: "مصدر نفسه", ORTH: "م. ن"}, + {NORM: "صلى الله عليه وسلم", ORTH: "صلعم"}, + {NORM: "الشارح", ORTH: "الشـ"}, + {NORM: "الظاهر", ORTH: "الظـ"}, + {NORM: "أيضًا", ORTH: "أيضـ"}, + {NORM: "إلى آخره", ORTH: "إلخ"}, + {NORM: "انتهى", ORTH: "اهـ"}, + {NORM: "حدّثنا", ORTH: "ثنا"}, + {NORM: "حدثني", ORTH: "ثنى"}, + {NORM: "أنبأنا", ORTH: "أنا"}, + {NORM: "أخبرنا", ORTH: "نا"}, + {NORM: "مصدر سابق", ORTH: "م. س"}, + {NORM: "مصدر نفسه", ORTH: "م. ن"}, ]: _exc[exc_data[ORTH]] = [exc_data] # Other abv. for exc_data in [ - {LEMMA: "دكتور", ORTH: "د."}, - {LEMMA: "أستاذ دكتور", ORTH: "أ.د"}, - {LEMMA: "أستاذ", ORTH: "أ."}, - {LEMMA: "بروفيسور", ORTH: "ب."}, + {NORM: "دكتور", ORTH: "د."}, + {NORM: "أستاذ دكتور", ORTH: "أ.د"}, + {NORM: "أستاذ", ORTH: "أ."}, + {NORM: "بروفيسور", ORTH: "ب."}, ]: _exc[exc_data[ORTH]] = [exc_data] -for exc_data in [{LEMMA: "تلفون", ORTH: "ت."}, {LEMMA: "صندوق بريد", ORTH: "ص.ب"}]: +for exc_data in [{NORM: "تلفون", ORTH: "ت."}, {NORM: "صندوق بريد", ORTH: "ص.ب"}]: _exc[exc_data[ORTH]] = [exc_data] TOKENIZER_EXCEPTIONS = update_exc(BASE_EXCEPTIONS, _exc) diff --git a/spacy/lang/bn/tokenizer_exceptions.py b/spacy/lang/bn/tokenizer_exceptions.py index d896b4914..e666522b8 100644 --- a/spacy/lang/bn/tokenizer_exceptions.py +++ b/spacy/lang/bn/tokenizer_exceptions.py @@ -1,5 +1,5 @@ from ..tokenizer_exceptions import BASE_EXCEPTIONS -from ...symbols import ORTH, LEMMA +from ...symbols import ORTH, NORM from ...util import update_exc @@ -7,18 +7,18 @@ _exc = {} for exc_data in [ - {ORTH: "ডঃ", LEMMA: "ডক্টর"}, - {ORTH: "ডাঃ", LEMMA: "ডাক্তার"}, - {ORTH: "ড.", LEMMA: "ডক্টর"}, - {ORTH: "ডা.", LEMMA: "ডাক্তার"}, - {ORTH: "মোঃ", LEMMA: "মোহাম্মদ"}, - {ORTH: "মো.", LEMMA: "মোহাম্মদ"}, - {ORTH: "সে.", LEMMA: "সেলসিয়াস"}, - {ORTH: "কি.মি.", LEMMA: "কিলোমিটার"}, - {ORTH: "কি.মি", LEMMA: "কিলোমিটার"}, - {ORTH: "সে.মি.", LEMMA: "সেন্টিমিটার"}, - {ORTH: "সে.মি", LEMMA: "সেন্টিমিটার"}, - {ORTH: "মি.লি.", LEMMA: "মিলিলিটার"}, + {ORTH: "ডঃ", NORM: "ডক্টর"}, + {ORTH: "ডাঃ", NORM: "ডাক্তার"}, + {ORTH: "ড.", NORM: "ডক্টর"}, + {ORTH: "ডা.", NORM: "ডাক্তার"}, + {ORTH: "মোঃ", NORM: "মোহাম্মদ"}, + {ORTH: "মো.", NORM: "মোহাম্মদ"}, + {ORTH: "সে.", NORM: "সেলসিয়াস"}, + {ORTH: "কি.মি.", NORM: "কিলোমিটার"}, + {ORTH: "কি.মি", NORM: "কিলোমিটার"}, + {ORTH: "সে.মি.", NORM: "সেন্টিমিটার"}, + {ORTH: "সে.মি", NORM: "সেন্টিমিটার"}, + {ORTH: "মি.লি.", NORM: "মিলিলিটার"}, ]: _exc[exc_data[ORTH]] = [exc_data] diff --git a/spacy/lang/ca/tokenizer_exceptions.py b/spacy/lang/ca/tokenizer_exceptions.py index 6928de46b..b465e97ba 100644 --- a/spacy/lang/ca/tokenizer_exceptions.py +++ b/spacy/lang/ca/tokenizer_exceptions.py @@ -1,40 +1,40 @@ from ..tokenizer_exceptions import BASE_EXCEPTIONS -from ...symbols import ORTH, LEMMA +from ...symbols import ORTH, NORM from ...util import update_exc _exc = {} for exc_data in [ - {ORTH: "aprox.", LEMMA: "aproximadament"}, - {ORTH: "pàg.", LEMMA: "pàgina"}, - {ORTH: "p.ex.", LEMMA: "per exemple"}, - {ORTH: "gen.", LEMMA: 
"gener"}, - {ORTH: "feb.", LEMMA: "febrer"}, - {ORTH: "abr.", LEMMA: "abril"}, - {ORTH: "jul.", LEMMA: "juliol"}, - {ORTH: "set.", LEMMA: "setembre"}, - {ORTH: "oct.", LEMMA: "octubre"}, - {ORTH: "nov.", LEMMA: "novembre"}, - {ORTH: "dec.", LEMMA: "desembre"}, - {ORTH: "Dr.", LEMMA: "doctor"}, - {ORTH: "Sr.", LEMMA: "senyor"}, - {ORTH: "Sra.", LEMMA: "senyora"}, - {ORTH: "Srta.", LEMMA: "senyoreta"}, - {ORTH: "núm", LEMMA: "número"}, - {ORTH: "St.", LEMMA: "sant"}, - {ORTH: "Sta.", LEMMA: "santa"}, + {ORTH: "aprox.", NORM: "aproximadament"}, + {ORTH: "pàg.", NORM: "pàgina"}, + {ORTH: "p.ex.", NORM: "per exemple"}, + {ORTH: "gen.", NORM: "gener"}, + {ORTH: "feb.", NORM: "febrer"}, + {ORTH: "abr.", NORM: "abril"}, + {ORTH: "jul.", NORM: "juliol"}, + {ORTH: "set.", NORM: "setembre"}, + {ORTH: "oct.", NORM: "octubre"}, + {ORTH: "nov.", NORM: "novembre"}, + {ORTH: "dec.", NORM: "desembre"}, + {ORTH: "Dr.", NORM: "doctor"}, + {ORTH: "Sr.", NORM: "senyor"}, + {ORTH: "Sra.", NORM: "senyora"}, + {ORTH: "Srta.", NORM: "senyoreta"}, + {ORTH: "núm", NORM: "número"}, + {ORTH: "St.", NORM: "sant"}, + {ORTH: "Sta.", NORM: "santa"}, ]: _exc[exc_data[ORTH]] = [exc_data] # Times -_exc["12m."] = [{ORTH: "12"}, {ORTH: "m.", LEMMA: "p.m."}] +_exc["12m."] = [{ORTH: "12"}, {ORTH: "m.", NORM: "p.m."}] for h in range(1, 12 + 1): for period in ["a.m.", "am"]: - _exc[f"{h}{period}"] = [{ORTH: f"{h}"}, {ORTH: period, LEMMA: "a.m."}] + _exc[f"{h}{period}"] = [{ORTH: f"{h}"}, {ORTH: period, NORM: "a.m."}] for period in ["p.m.", "pm"]: - _exc[f"{h}{period}"] = [{ORTH: f"{h}"}, {ORTH: period, LEMMA: "p.m."}] + _exc[f"{h}{period}"] = [{ORTH: f"{h}"}, {ORTH: period, NORM: "p.m."}] TOKENIZER_EXCEPTIONS = update_exc(BASE_EXCEPTIONS, _exc) diff --git a/spacy/lang/da/tokenizer_exceptions.py b/spacy/lang/da/tokenizer_exceptions.py index 826a6077b..ce25c546b 100644 --- a/spacy/lang/da/tokenizer_exceptions.py +++ b/spacy/lang/da/tokenizer_exceptions.py @@ -3,7 +3,7 @@ Tokenizer Exceptions. Source: https://forkortelse.dk/ and various others. """ from ..tokenizer_exceptions import BASE_EXCEPTIONS -from ...symbols import ORTH, LEMMA, NORM +from ...symbols import ORTH, NORM from ...util import update_exc @@ -13,44 +13,44 @@ _exc = {} # (for "torsdag") are left out because they are ambiguous. The same is the case # for abbreviations "jul." and "Jul." ("juli"). 
for exc_data in [ - {ORTH: "Kbh.", LEMMA: "København", NORM: "København"}, - {ORTH: "jan.", LEMMA: "januar"}, - {ORTH: "febr.", LEMMA: "februar"}, - {ORTH: "feb.", LEMMA: "februar"}, - {ORTH: "mar.", LEMMA: "marts"}, - {ORTH: "apr.", LEMMA: "april"}, - {ORTH: "jun.", LEMMA: "juni"}, - {ORTH: "aug.", LEMMA: "august"}, - {ORTH: "sept.", LEMMA: "september"}, - {ORTH: "sep.", LEMMA: "september"}, - {ORTH: "okt.", LEMMA: "oktober"}, - {ORTH: "nov.", LEMMA: "november"}, - {ORTH: "dec.", LEMMA: "december"}, - {ORTH: "man.", LEMMA: "mandag"}, - {ORTH: "tirs.", LEMMA: "tirsdag"}, - {ORTH: "ons.", LEMMA: "onsdag"}, - {ORTH: "tor.", LEMMA: "torsdag"}, - {ORTH: "tors.", LEMMA: "torsdag"}, - {ORTH: "fre.", LEMMA: "fredag"}, - {ORTH: "lør.", LEMMA: "lørdag"}, - {ORTH: "Jan.", LEMMA: "januar"}, - {ORTH: "Febr.", LEMMA: "februar"}, - {ORTH: "Feb.", LEMMA: "februar"}, - {ORTH: "Mar.", LEMMA: "marts"}, - {ORTH: "Apr.", LEMMA: "april"}, - {ORTH: "Jun.", LEMMA: "juni"}, - {ORTH: "Aug.", LEMMA: "august"}, - {ORTH: "Sept.", LEMMA: "september"}, - {ORTH: "Sep.", LEMMA: "september"}, - {ORTH: "Okt.", LEMMA: "oktober"}, - {ORTH: "Nov.", LEMMA: "november"}, - {ORTH: "Dec.", LEMMA: "december"}, - {ORTH: "Man.", LEMMA: "mandag"}, - {ORTH: "Tirs.", LEMMA: "tirsdag"}, - {ORTH: "Ons.", LEMMA: "onsdag"}, - {ORTH: "Fre.", LEMMA: "fredag"}, - {ORTH: "Lør.", LEMMA: "lørdag"}, - {ORTH: "og/eller", LEMMA: "og/eller", NORM: "og/eller"}, + {ORTH: "Kbh.", NORM: "København"}, + {ORTH: "jan.", NORM: "januar"}, + {ORTH: "febr.", NORM: "februar"}, + {ORTH: "feb.", NORM: "februar"}, + {ORTH: "mar.", NORM: "marts"}, + {ORTH: "apr.", NORM: "april"}, + {ORTH: "jun.", NORM: "juni"}, + {ORTH: "aug.", NORM: "august"}, + {ORTH: "sept.", NORM: "september"}, + {ORTH: "sep.", NORM: "september"}, + {ORTH: "okt.", NORM: "oktober"}, + {ORTH: "nov.", NORM: "november"}, + {ORTH: "dec.", NORM: "december"}, + {ORTH: "man.", NORM: "mandag"}, + {ORTH: "tirs.", NORM: "tirsdag"}, + {ORTH: "ons.", NORM: "onsdag"}, + {ORTH: "tor.", NORM: "torsdag"}, + {ORTH: "tors.", NORM: "torsdag"}, + {ORTH: "fre.", NORM: "fredag"}, + {ORTH: "lør.", NORM: "lørdag"}, + {ORTH: "Jan.", NORM: "januar"}, + {ORTH: "Febr.", NORM: "februar"}, + {ORTH: "Feb.", NORM: "februar"}, + {ORTH: "Mar.", NORM: "marts"}, + {ORTH: "Apr.", NORM: "april"}, + {ORTH: "Jun.", NORM: "juni"}, + {ORTH: "Aug.", NORM: "august"}, + {ORTH: "Sept.", NORM: "september"}, + {ORTH: "Sep.", NORM: "september"}, + {ORTH: "Okt.", NORM: "oktober"}, + {ORTH: "Nov.", NORM: "november"}, + {ORTH: "Dec.", NORM: "december"}, + {ORTH: "Man.", NORM: "mandag"}, + {ORTH: "Tirs.", NORM: "tirsdag"}, + {ORTH: "Ons.", NORM: "onsdag"}, + {ORTH: "Fre.", NORM: "fredag"}, + {ORTH: "Lør.", NORM: "lørdag"}, + {ORTH: "og/eller", NORM: "og/eller"}, ]: _exc[exc_data[ORTH]] = [exc_data] @@ -550,22 +550,22 @@ for orth in [ _exc[capitalized] = [{ORTH: capitalized}] for exc_data in [ - {ORTH: "s'gu", LEMMA: "s'gu", NORM: "s'gu"}, - {ORTH: "S'gu", LEMMA: "s'gu", NORM: "s'gu"}, - {ORTH: "sgu'", LEMMA: "s'gu", NORM: "s'gu"}, - {ORTH: "Sgu'", LEMMA: "s'gu", NORM: "s'gu"}, - {ORTH: "sku'", LEMMA: "skal", NORM: "skulle"}, - {ORTH: "ku'", LEMMA: "kan", NORM: "kunne"}, - {ORTH: "Ku'", LEMMA: "kan", NORM: "kunne"}, - {ORTH: "ka'", LEMMA: "kan", NORM: "kan"}, - {ORTH: "Ka'", LEMMA: "kan", NORM: "kan"}, - {ORTH: "gi'", LEMMA: "give", NORM: "giv"}, - {ORTH: "Gi'", LEMMA: "give", NORM: "giv"}, - {ORTH: "li'", LEMMA: "lide", NORM: "lide"}, - {ORTH: "ha'", LEMMA: "have", NORM: "have"}, - {ORTH: "Ha'", LEMMA: "have", NORM: "have"}, - {ORTH: "ik'", LEMMA: 
"ikke", NORM: "ikke"}, - {ORTH: "Ik'", LEMMA: "ikke", NORM: "ikke"}, + {ORTH: "s'gu", NORM: "s'gu"}, + {ORTH: "S'gu", NORM: "s'gu"}, + {ORTH: "sgu'", NORM: "s'gu"}, + {ORTH: "Sgu'", NORM: "s'gu"}, + {ORTH: "sku'", NORM: "skulle"}, + {ORTH: "ku'", NORM: "kunne"}, + {ORTH: "Ku'", NORM: "kunne"}, + {ORTH: "ka'", NORM: "kan"}, + {ORTH: "Ka'", NORM: "kan"}, + {ORTH: "gi'", NORM: "giv"}, + {ORTH: "Gi'", NORM: "giv"}, + {ORTH: "li'", NORM: "lide"}, + {ORTH: "ha'", NORM: "have"}, + {ORTH: "Ha'", NORM: "have"}, + {ORTH: "ik'", NORM: "ikke"}, + {ORTH: "Ik'", NORM: "ikke"}, ]: _exc[exc_data[ORTH]] = [exc_data] @@ -575,7 +575,7 @@ for h in range(1, 31 + 1): for period in ["."]: _exc[f"{h}{period}"] = [{ORTH: f"{h}."}] -_custom_base_exc = {"i.": [{ORTH: "i", LEMMA: "i", NORM: "i"}, {ORTH: "."}]} +_custom_base_exc = {"i.": [{ORTH: "i", NORM: "i"}, {ORTH: "."}]} _exc.update(_custom_base_exc) TOKENIZER_EXCEPTIONS = update_exc(BASE_EXCEPTIONS, _exc) diff --git a/spacy/lang/de/tokenizer_exceptions.py b/spacy/lang/de/tokenizer_exceptions.py index d7860ace6..21d99cffe 100644 --- a/spacy/lang/de/tokenizer_exceptions.py +++ b/spacy/lang/de/tokenizer_exceptions.py @@ -1,159 +1,135 @@ from ..tokenizer_exceptions import BASE_EXCEPTIONS -from ...symbols import ORTH, LEMMA, TAG, NORM, PRON_LEMMA +from ...symbols import ORTH, NORM from ...util import update_exc _exc = { - "auf'm": [{ORTH: "auf", LEMMA: "auf"}, {ORTH: "'m", LEMMA: "der", NORM: "dem"}], - "du's": [ - {ORTH: "du", LEMMA: PRON_LEMMA, TAG: "PPER"}, - {ORTH: "'s", LEMMA: PRON_LEMMA, TAG: "PPER", NORM: "es"}, - ], - "er's": [ - {ORTH: "er", LEMMA: PRON_LEMMA, TAG: "PPER"}, - {ORTH: "'s", LEMMA: PRON_LEMMA, TAG: "PPER", NORM: "es"}, - ], - "hinter'm": [ - {ORTH: "hinter", LEMMA: "hinter"}, - {ORTH: "'m", LEMMA: "der", NORM: "dem"}, - ], - "ich's": [ - {ORTH: "ich", LEMMA: PRON_LEMMA, TAG: "PPER"}, - {ORTH: "'s", LEMMA: PRON_LEMMA, TAG: "PPER", NORM: "es"}, - ], - "ihr's": [ - {ORTH: "ihr", LEMMA: PRON_LEMMA, TAG: "PPER"}, - {ORTH: "'s", LEMMA: PRON_LEMMA, TAG: "PPER", NORM: "es"}, - ], - "sie's": [ - {ORTH: "sie", LEMMA: PRON_LEMMA, TAG: "PPER"}, - {ORTH: "'s", LEMMA: PRON_LEMMA, TAG: "PPER", NORM: "es"}, - ], - "unter'm": [ - {ORTH: "unter", LEMMA: "unter"}, - {ORTH: "'m", LEMMA: "der", NORM: "dem"}, - ], - "vor'm": [{ORTH: "vor", LEMMA: "vor"}, {ORTH: "'m", LEMMA: "der", NORM: "dem"}], - "wir's": [ - {ORTH: "wir", LEMMA: PRON_LEMMA, TAG: "PPER"}, - {ORTH: "'s", LEMMA: PRON_LEMMA, TAG: "PPER", NORM: "es"}, - ], - "über'm": [{ORTH: "über", LEMMA: "über"}, {ORTH: "'m", LEMMA: "der", NORM: "dem"}], + "auf'm": [{ORTH: "auf"}, {ORTH: "'m", NORM: "dem"}], + "du's": [{ORTH: "du"}, {ORTH: "'s", NORM: "es"}], + "er's": [{ORTH: "er"}, {ORTH: "'s", NORM: "es"}], + "hinter'm": [{ORTH: "hinter"}, {ORTH: "'m", NORM: "dem"}], + "ich's": [{ORTH: "ich"}, {ORTH: "'s", NORM: "es"}], + "ihr's": [{ORTH: "ihr"}, {ORTH: "'s", NORM: "es"}], + "sie's": [{ORTH: "sie"}, {ORTH: "'s", NORM: "es"}], + "unter'm": [{ORTH: "unter"}, {ORTH: "'m", NORM: "dem"}], + "vor'm": [{ORTH: "vor"}, {ORTH: "'m", NORM: "dem"}], + "wir's": [{ORTH: "wir"}, {ORTH: "'s", NORM: "es"}], + "über'm": [{ORTH: "über"}, {ORTH: "'m", NORM: "dem"}], } for exc_data in [ - {ORTH: "'S", LEMMA: PRON_LEMMA, NORM: "'s", TAG: "PPER"}, - {ORTH: "'s", LEMMA: PRON_LEMMA, NORM: "'s", TAG: "PPER"}, - {ORTH: "S'", LEMMA: PRON_LEMMA, NORM: "'s", TAG: "PPER"}, - {ORTH: "s'", LEMMA: PRON_LEMMA, NORM: "'s", TAG: "PPER"}, - {ORTH: "'n", LEMMA: "ein", NORM: "ein"}, - {ORTH: "'ne", LEMMA: "eine", NORM: "eine"}, - {ORTH: "'nen", LEMMA: 
"ein", NORM: "einen"}, - {ORTH: "'nem", LEMMA: "ein", NORM: "einem"}, - {ORTH: "Abb.", LEMMA: "Abbildung", NORM: "Abbildung"}, - {ORTH: "Abk.", LEMMA: "Abkürzung", NORM: "Abkürzung"}, - {ORTH: "Abt.", LEMMA: "Abteilung", NORM: "Abteilung"}, - {ORTH: "Apr.", LEMMA: "April", NORM: "April"}, - {ORTH: "Aug.", LEMMA: "August", NORM: "August"}, - {ORTH: "Bd.", LEMMA: "Band", NORM: "Band"}, - {ORTH: "Betr.", LEMMA: "Betreff", NORM: "Betreff"}, - {ORTH: "Bf.", LEMMA: "Bahnhof", NORM: "Bahnhof"}, - {ORTH: "Bhf.", LEMMA: "Bahnhof", NORM: "Bahnhof"}, - {ORTH: "Bsp.", LEMMA: "Beispiel", NORM: "Beispiel"}, - {ORTH: "Dez.", LEMMA: "Dezember", NORM: "Dezember"}, - {ORTH: "Di.", LEMMA: "Dienstag", NORM: "Dienstag"}, - {ORTH: "Do.", LEMMA: "Donnerstag", NORM: "Donnerstag"}, - {ORTH: "Fa.", LEMMA: "Firma", NORM: "Firma"}, - {ORTH: "Fam.", LEMMA: "Familie", NORM: "Familie"}, - {ORTH: "Feb.", LEMMA: "Februar", NORM: "Februar"}, - {ORTH: "Fr.", LEMMA: "Frau", NORM: "Frau"}, - {ORTH: "Frl.", LEMMA: "Fräulein", NORM: "Fräulein"}, - {ORTH: "Hbf.", LEMMA: "Hauptbahnhof", NORM: "Hauptbahnhof"}, - {ORTH: "Hr.", LEMMA: "Herr", NORM: "Herr"}, - {ORTH: "Hrn.", LEMMA: "Herr", NORM: "Herrn"}, - {ORTH: "Jan.", LEMMA: "Januar", NORM: "Januar"}, - {ORTH: "Jh.", LEMMA: "Jahrhundert", NORM: "Jahrhundert"}, - {ORTH: "Jhd.", LEMMA: "Jahrhundert", NORM: "Jahrhundert"}, - {ORTH: "Jul.", LEMMA: "Juli", NORM: "Juli"}, - {ORTH: "Jun.", LEMMA: "Juni", NORM: "Juni"}, - {ORTH: "Mi.", LEMMA: "Mittwoch", NORM: "Mittwoch"}, - {ORTH: "Mio.", LEMMA: "Million", NORM: "Million"}, - {ORTH: "Mo.", LEMMA: "Montag", NORM: "Montag"}, - {ORTH: "Mrd.", LEMMA: "Milliarde", NORM: "Milliarde"}, - {ORTH: "Mrz.", LEMMA: "März", NORM: "März"}, - {ORTH: "MwSt.", LEMMA: "Mehrwertsteuer", NORM: "Mehrwertsteuer"}, - {ORTH: "Mär.", LEMMA: "März", NORM: "März"}, - {ORTH: "Nov.", LEMMA: "November", NORM: "November"}, - {ORTH: "Nr.", LEMMA: "Nummer", NORM: "Nummer"}, - {ORTH: "Okt.", LEMMA: "Oktober", NORM: "Oktober"}, - {ORTH: "Orig.", LEMMA: "Original", NORM: "Original"}, - {ORTH: "Pkt.", LEMMA: "Punkt", NORM: "Punkt"}, - {ORTH: "Prof.", LEMMA: "Professor", NORM: "Professor"}, - {ORTH: "Red.", LEMMA: "Redaktion", NORM: "Redaktion"}, - {ORTH: "Sa.", LEMMA: "Samstag", NORM: "Samstag"}, - {ORTH: "Sep.", LEMMA: "September", NORM: "September"}, - {ORTH: "Sept.", LEMMA: "September", NORM: "September"}, - {ORTH: "So.", LEMMA: "Sonntag", NORM: "Sonntag"}, - {ORTH: "Std.", LEMMA: "Stunde", NORM: "Stunde"}, - {ORTH: "Str.", LEMMA: "Straße", NORM: "Straße"}, - {ORTH: "Tel.", LEMMA: "Telefon", NORM: "Telefon"}, - {ORTH: "Tsd.", LEMMA: "Tausend", NORM: "Tausend"}, - {ORTH: "Univ.", LEMMA: "Universität", NORM: "Universität"}, - {ORTH: "abzgl.", LEMMA: "abzüglich", NORM: "abzüglich"}, - {ORTH: "allg.", LEMMA: "allgemein", NORM: "allgemein"}, - {ORTH: "bspw.", LEMMA: "beispielsweise", NORM: "beispielsweise"}, - {ORTH: "bzgl.", LEMMA: "bezüglich", NORM: "bezüglich"}, - {ORTH: "bzw.", LEMMA: "beziehungsweise", NORM: "beziehungsweise"}, - {ORTH: "d.h.", LEMMA: "das heißt"}, - {ORTH: "dgl.", LEMMA: "dergleichen", NORM: "dergleichen"}, - {ORTH: "ebd.", LEMMA: "ebenda", NORM: "ebenda"}, - {ORTH: "eigtl.", LEMMA: "eigentlich", NORM: "eigentlich"}, - {ORTH: "engl.", LEMMA: "englisch", NORM: "englisch"}, - {ORTH: "evtl.", LEMMA: "eventuell", NORM: "eventuell"}, - {ORTH: "frz.", LEMMA: "französisch", NORM: "französisch"}, - {ORTH: "gegr.", LEMMA: "gegründet", NORM: "gegründet"}, - {ORTH: "ggf.", LEMMA: "gegebenenfalls", NORM: "gegebenenfalls"}, - {ORTH: "ggfs.", LEMMA: "gegebenenfalls", 
NORM: "gegebenenfalls"}, - {ORTH: "ggü.", LEMMA: "gegenüber", NORM: "gegenüber"}, - {ORTH: "i.O.", LEMMA: "in Ordnung"}, - {ORTH: "i.d.R.", LEMMA: "in der Regel"}, - {ORTH: "incl.", LEMMA: "inklusive", NORM: "inklusive"}, - {ORTH: "inkl.", LEMMA: "inklusive", NORM: "inklusive"}, - {ORTH: "insb.", LEMMA: "insbesondere", NORM: "insbesondere"}, - {ORTH: "kath.", LEMMA: "katholisch", NORM: "katholisch"}, - {ORTH: "lt.", LEMMA: "laut", NORM: "laut"}, - {ORTH: "max.", LEMMA: "maximal", NORM: "maximal"}, - {ORTH: "min.", LEMMA: "minimal", NORM: "minimal"}, - {ORTH: "mind.", LEMMA: "mindestens", NORM: "mindestens"}, - {ORTH: "mtl.", LEMMA: "monatlich", NORM: "monatlich"}, - {ORTH: "n.Chr.", LEMMA: "nach Christus"}, - {ORTH: "orig.", LEMMA: "original", NORM: "original"}, - {ORTH: "röm.", LEMMA: "römisch", NORM: "römisch"}, - {ORTH: "s.o.", LEMMA: "siehe oben"}, - {ORTH: "sog.", LEMMA: "so genannt"}, - {ORTH: "stellv.", LEMMA: "stellvertretend"}, - {ORTH: "tägl.", LEMMA: "täglich", NORM: "täglich"}, - {ORTH: "u.U.", LEMMA: "unter Umständen"}, - {ORTH: "u.s.w.", LEMMA: "und so weiter"}, - {ORTH: "u.v.m.", LEMMA: "und vieles mehr"}, - {ORTH: "usf.", LEMMA: "und so fort"}, - {ORTH: "usw.", LEMMA: "und so weiter"}, - {ORTH: "uvm.", LEMMA: "und vieles mehr"}, - {ORTH: "v.Chr.", LEMMA: "vor Christus"}, - {ORTH: "v.a.", LEMMA: "vor allem"}, - {ORTH: "v.l.n.r.", LEMMA: "von links nach rechts"}, - {ORTH: "vgl.", LEMMA: "vergleiche", NORM: "vergleiche"}, - {ORTH: "vllt.", LEMMA: "vielleicht", NORM: "vielleicht"}, - {ORTH: "vlt.", LEMMA: "vielleicht", NORM: "vielleicht"}, - {ORTH: "z.B.", LEMMA: "zum Beispiel"}, - {ORTH: "z.Bsp.", LEMMA: "zum Beispiel"}, - {ORTH: "z.T.", LEMMA: "zum Teil"}, - {ORTH: "z.Z.", LEMMA: "zur Zeit"}, - {ORTH: "z.Zt.", LEMMA: "zur Zeit"}, - {ORTH: "z.b.", LEMMA: "zum Beispiel"}, - {ORTH: "zzgl.", LEMMA: "zuzüglich"}, - {ORTH: "österr.", LEMMA: "österreichisch", NORM: "österreichisch"}, + {ORTH: "'S", NORM: "'s"}, + {ORTH: "'s", NORM: "'s"}, + {ORTH: "S'", NORM: "'s"}, + {ORTH: "s'", NORM: "'s"}, + {ORTH: "'n", NORM: "ein"}, + {ORTH: "'ne", NORM: "eine"}, + {ORTH: "'nen", NORM: "einen"}, + {ORTH: "'nem", NORM: "einem"}, + {ORTH: "Abb.", NORM: "Abbildung"}, + {ORTH: "Abk.", NORM: "Abkürzung"}, + {ORTH: "Abt.", NORM: "Abteilung"}, + {ORTH: "Apr.", NORM: "April"}, + {ORTH: "Aug.", NORM: "August"}, + {ORTH: "Bd.", NORM: "Band"}, + {ORTH: "Betr.", NORM: "Betreff"}, + {ORTH: "Bf.", NORM: "Bahnhof"}, + {ORTH: "Bhf.", NORM: "Bahnhof"}, + {ORTH: "Bsp.", NORM: "Beispiel"}, + {ORTH: "Dez.", NORM: "Dezember"}, + {ORTH: "Di.", NORM: "Dienstag"}, + {ORTH: "Do.", NORM: "Donnerstag"}, + {ORTH: "Fa.", NORM: "Firma"}, + {ORTH: "Fam.", NORM: "Familie"}, + {ORTH: "Feb.", NORM: "Februar"}, + {ORTH: "Fr.", NORM: "Frau"}, + {ORTH: "Frl.", NORM: "Fräulein"}, + {ORTH: "Hbf.", NORM: "Hauptbahnhof"}, + {ORTH: "Hr.", NORM: "Herr"}, + {ORTH: "Hrn.", NORM: "Herrn"}, + {ORTH: "Jan.", NORM: "Januar"}, + {ORTH: "Jh.", NORM: "Jahrhundert"}, + {ORTH: "Jhd.", NORM: "Jahrhundert"}, + {ORTH: "Jul.", NORM: "Juli"}, + {ORTH: "Jun.", NORM: "Juni"}, + {ORTH: "Mi.", NORM: "Mittwoch"}, + {ORTH: "Mio.", NORM: "Million"}, + {ORTH: "Mo.", NORM: "Montag"}, + {ORTH: "Mrd.", NORM: "Milliarde"}, + {ORTH: "Mrz.", NORM: "März"}, + {ORTH: "MwSt.", NORM: "Mehrwertsteuer"}, + {ORTH: "Mär.", NORM: "März"}, + {ORTH: "Nov.", NORM: "November"}, + {ORTH: "Nr.", NORM: "Nummer"}, + {ORTH: "Okt.", NORM: "Oktober"}, + {ORTH: "Orig.", NORM: "Original"}, + {ORTH: "Pkt.", NORM: "Punkt"}, + {ORTH: "Prof.", NORM: "Professor"}, + {ORTH: "Red.", NORM: 
"Redaktion"}, + {ORTH: "Sa.", NORM: "Samstag"}, + {ORTH: "Sep.", NORM: "September"}, + {ORTH: "Sept.", NORM: "September"}, + {ORTH: "So.", NORM: "Sonntag"}, + {ORTH: "Std.", NORM: "Stunde"}, + {ORTH: "Str.", NORM: "Straße"}, + {ORTH: "Tel.", NORM: "Telefon"}, + {ORTH: "Tsd.", NORM: "Tausend"}, + {ORTH: "Univ.", NORM: "Universität"}, + {ORTH: "abzgl.", NORM: "abzüglich"}, + {ORTH: "allg.", NORM: "allgemein"}, + {ORTH: "bspw.", NORM: "beispielsweise"}, + {ORTH: "bzgl.", NORM: "bezüglich"}, + {ORTH: "bzw.", NORM: "beziehungsweise"}, + {ORTH: "d.h."}, + {ORTH: "dgl.", NORM: "dergleichen"}, + {ORTH: "ebd.", NORM: "ebenda"}, + {ORTH: "eigtl.", NORM: "eigentlich"}, + {ORTH: "engl.", NORM: "englisch"}, + {ORTH: "evtl.", NORM: "eventuell"}, + {ORTH: "frz.", NORM: "französisch"}, + {ORTH: "gegr.", NORM: "gegründet"}, + {ORTH: "ggf.", NORM: "gegebenenfalls"}, + {ORTH: "ggfs.", NORM: "gegebenenfalls"}, + {ORTH: "ggü.", NORM: "gegenüber"}, + {ORTH: "i.O."}, + {ORTH: "i.d.R."}, + {ORTH: "incl.", NORM: "inklusive"}, + {ORTH: "inkl.", NORM: "inklusive"}, + {ORTH: "insb.", NORM: "insbesondere"}, + {ORTH: "kath.", NORM: "katholisch"}, + {ORTH: "lt.", NORM: "laut"}, + {ORTH: "max.", NORM: "maximal"}, + {ORTH: "min.", NORM: "minimal"}, + {ORTH: "mind.", NORM: "mindestens"}, + {ORTH: "mtl.", NORM: "monatlich"}, + {ORTH: "n.Chr."}, + {ORTH: "orig.", NORM: "original"}, + {ORTH: "röm.", NORM: "römisch"}, + {ORTH: "s.o."}, + {ORTH: "sog."}, + {ORTH: "stellv."}, + {ORTH: "tägl.", NORM: "täglich"}, + {ORTH: "u.U."}, + {ORTH: "u.s.w."}, + {ORTH: "u.v.m."}, + {ORTH: "usf."}, + {ORTH: "usw."}, + {ORTH: "uvm."}, + {ORTH: "v.Chr."}, + {ORTH: "v.a."}, + {ORTH: "v.l.n.r."}, + {ORTH: "vgl.", NORM: "vergleiche"}, + {ORTH: "vllt.", NORM: "vielleicht"}, + {ORTH: "vlt.", NORM: "vielleicht"}, + {ORTH: "z.B."}, + {ORTH: "z.Bsp."}, + {ORTH: "z.T."}, + {ORTH: "z.Z."}, + {ORTH: "z.Zt."}, + {ORTH: "z.b."}, + {ORTH: "zzgl."}, + {ORTH: "österr.", NORM: "österreichisch"}, ]: _exc[exc_data[ORTH]] = [exc_data] diff --git a/spacy/lang/el/tokenizer_exceptions.py b/spacy/lang/el/tokenizer_exceptions.py index f9810828b..0a36d5d2b 100644 --- a/spacy/lang/el/tokenizer_exceptions.py +++ b/spacy/lang/el/tokenizer_exceptions.py @@ -1,130 +1,128 @@ from ..tokenizer_exceptions import BASE_EXCEPTIONS -from ...symbols import ORTH, LEMMA, NORM +from ...symbols import ORTH, NORM from ...util import update_exc _exc = {} for token in ["Απ'", "ΑΠ'", "αφ'", "Αφ'"]: - _exc[token] = [{ORTH: token, LEMMA: "από", NORM: "από"}] + _exc[token] = [{ORTH: token, NORM: "από"}] for token in ["Αλλ'", "αλλ'"]: - _exc[token] = [{ORTH: token, LEMMA: "αλλά", NORM: "αλλά"}] + _exc[token] = [{ORTH: token, NORM: "αλλά"}] for token in ["παρ'", "Παρ'", "ΠΑΡ'"]: - _exc[token] = [{ORTH: token, LEMMA: "παρά", NORM: "παρά"}] + _exc[token] = [{ORTH: token, NORM: "παρά"}] for token in ["καθ'", "Καθ'"]: - _exc[token] = [{ORTH: token, LEMMA: "κάθε", NORM: "κάθε"}] + _exc[token] = [{ORTH: token, NORM: "κάθε"}] for token in ["κατ'", "Κατ'"]: - _exc[token] = [{ORTH: token, LEMMA: "κατά", NORM: "κατά"}] + _exc[token] = [{ORTH: token, NORM: "κατά"}] for token in ["'ΣΟΥΝ", "'ναι", "'ταν", "'τανε", "'μαστε", "'μουνα", "'μουν"]: - _exc[token] = [{ORTH: token, LEMMA: "είμαι", NORM: "είμαι"}] + _exc[token] = [{ORTH: token, NORM: "είμαι"}] for token in ["Επ'", "επ'", "εφ'", "Εφ'"]: - _exc[token] = [{ORTH: token, LEMMA: "επί", NORM: "επί"}] + _exc[token] = [{ORTH: token, NORM: "επί"}] for token in ["Δι'", "δι'"]: - _exc[token] = [{ORTH: token, LEMMA: "δια", NORM: "δια"}] + _exc[token] = [{ORTH: 
token, NORM: "δια"}] for token in ["'χουν", "'χουμε", "'χαμε", "'χα", "'χε", "'χεις", "'χει"]: - _exc[token] = [{ORTH: token, LEMMA: "έχω", NORM: "έχω"}] + _exc[token] = [{ORTH: token, NORM: "έχω"}] for token in ["υπ'", "Υπ'"]: - _exc[token] = [{ORTH: token, LEMMA: "υπό", NORM: "υπό"}] + _exc[token] = [{ORTH: token, NORM: "υπό"}] for token in ["Μετ'", "ΜΕΤ'", "'μετ"]: - _exc[token] = [{ORTH: token, LEMMA: "μετά", NORM: "μετά"}] + _exc[token] = [{ORTH: token, NORM: "μετά"}] for token in ["Μ'", "μ'"]: - _exc[token] = [{ORTH: token, LEMMA: "με", NORM: "με"}] + _exc[token] = [{ORTH: token, NORM: "με"}] for token in ["Γι'", "ΓΙ'", "γι'"]: - _exc[token] = [{ORTH: token, LEMMA: "για", NORM: "για"}] + _exc[token] = [{ORTH: token, NORM: "για"}] for token in ["Σ'", "σ'"]: - _exc[token] = [{ORTH: token, LEMMA: "σε", NORM: "σε"}] + _exc[token] = [{ORTH: token, NORM: "σε"}] for token in ["Θ'", "θ'"]: - _exc[token] = [{ORTH: token, LEMMA: "θα", NORM: "θα"}] + _exc[token] = [{ORTH: token, NORM: "θα"}] for token in ["Ν'", "ν'"]: - _exc[token] = [{ORTH: token, LEMMA: "να", NORM: "να"}] + _exc[token] = [{ORTH: token, NORM: "να"}] for token in ["Τ'", "τ'"]: - _exc[token] = [{ORTH: token, LEMMA: "να", NORM: "να"}] + _exc[token] = [{ORTH: token, NORM: "να"}] for token in ["'γω", "'σένα", "'μεις"]: - _exc[token] = [{ORTH: token, LEMMA: "εγώ", NORM: "εγώ"}] + _exc[token] = [{ORTH: token, NORM: "εγώ"}] for token in ["Τ'", "τ'"]: - _exc[token] = [{ORTH: token, LEMMA: "το", NORM: "το"}] + _exc[token] = [{ORTH: token, NORM: "το"}] for token in ["Φέρ'", "Φερ'", "φέρ'", "φερ'"]: - _exc[token] = [{ORTH: token, LEMMA: "φέρνω", NORM: "φέρνω"}] + _exc[token] = [{ORTH: token, NORM: "φέρνω"}] for token in ["'ρθούνε", "'ρθουν", "'ρθει", "'ρθεί", "'ρθε", "'ρχεται"]: - _exc[token] = [{ORTH: token, LEMMA: "έρχομαι", NORM: "έρχομαι"}] + _exc[token] = [{ORTH: token, NORM: "έρχομαι"}] for token in ["'πανε", "'λεγε", "'λεγαν", "'πε", "'λεγα"]: - _exc[token] = [{ORTH: token, LEMMA: "λέγω", NORM: "λέγω"}] + _exc[token] = [{ORTH: token, NORM: "λέγω"}] for token in ["Πάρ'", "πάρ'"]: - _exc[token] = [{ORTH: token, LEMMA: "παίρνω", NORM: "παίρνω"}] + _exc[token] = [{ORTH: token, NORM: "παίρνω"}] for token in ["μέσ'", "Μέσ'", "μεσ'"]: - _exc[token] = [{ORTH: token, LEMMA: "μέσα", NORM: "μέσα"}] + _exc[token] = [{ORTH: token, NORM: "μέσα"}] for token in ["Δέσ'", "Δεσ'", "δεσ'"]: - _exc[token] = [{ORTH: token, LEMMA: "δένω", NORM: "δένω"}] + _exc[token] = [{ORTH: token, NORM: "δένω"}] for token in ["'κανε", "Κάν'"]: - _exc[token] = [{ORTH: token, LEMMA: "κάνω", NORM: "κάνω"}] + _exc[token] = [{ORTH: token, NORM: "κάνω"}] _other_exc = { - "κι": [{ORTH: "κι", LEMMA: "και", NORM: "και"}], - "Παίξ'": [{ORTH: "Παίξ'", LEMMA: "παίζω", NORM: "παίζω"}], - "Αντ'": [{ORTH: "Αντ'", LEMMA: "αντί", NORM: "αντί"}], - "ολ'": [{ORTH: "ολ'", LEMMA: "όλος", NORM: "όλος"}], - "ύστερ'": [{ORTH: "ύστερ'", LEMMA: "ύστερα", NORM: "ύστερα"}], - "'πρεπε": [{ORTH: "'πρεπε", LEMMA: "πρέπει", NORM: "πρέπει"}], - "Δύσκολ'": [{ORTH: "Δύσκολ'", LEMMA: "δύσκολος", NORM: "δύσκολος"}], - "'θελα": [{ORTH: "'θελα", LEMMA: "θέλω", NORM: "θέλω"}], - "'γραφα": [{ORTH: "'γραφα", LEMMA: "γράφω", NORM: "γράφω"}], - "'παιρνα": [{ORTH: "'παιρνα", LEMMA: "παίρνω", NORM: "παίρνω"}], - "'δειξε": [{ORTH: "'δειξε", LEMMA: "δείχνω", NORM: "δείχνω"}], - "όμουρφ'": [{ORTH: "όμουρφ'", LEMMA: "όμορφος", NORM: "όμορφος"}], - "κ'τσή": [{ORTH: "κ'τσή", LEMMA: "κουτσός", NORM: "κουτσός"}], - "μηδ'": [{ORTH: "μηδ'", LEMMA: "μήδε", NORM: "μήδε"}], - "'ξομολογήθηκε": [ - {ORTH: "'ξομολογήθηκε", 
LEMMA: "εξομολογούμαι", NORM: "εξομολογούμαι"} - ], - "'μας": [{ORTH: "'μας", LEMMA: "εμάς", NORM: "εμάς"}], - "'ξερες": [{ORTH: "'ξερες", LEMMA: "ξέρω", NORM: "ξέρω"}], - "έφθασ'": [{ORTH: "έφθασ'", LEMMA: "φθάνω", NORM: "φθάνω"}], - "εξ'": [{ORTH: "εξ'", LEMMA: "εκ", NORM: "εκ"}], - "δώσ'": [{ORTH: "δώσ'", LEMMA: "δίνω", NORM: "δίνω"}], - "τίποτ'": [{ORTH: "τίποτ'", LEMMA: "τίποτα", NORM: "τίποτα"}], - "Λήξ'": [{ORTH: "Λήξ'", LEMMA: "λήγω", NORM: "λήγω"}], - "άσ'": [{ORTH: "άσ'", LEMMA: "αφήνω", NORM: "αφήνω"}], - "Στ'": [{ORTH: "Στ'", LEMMA: "στο", NORM: "στο"}], - "Δωσ'": [{ORTH: "Δωσ'", LEMMA: "δίνω", NORM: "δίνω"}], - "Βάψ'": [{ORTH: "Βάψ'", LEMMA: "βάφω", NORM: "βάφω"}], - "Αλλ'": [{ORTH: "Αλλ'", LEMMA: "αλλά", NORM: "αλλά"}], - "Αμ'": [{ORTH: "Αμ'", LEMMA: "άμα", NORM: "άμα"}], - "Αγόρασ'": [{ORTH: "Αγόρασ'", LEMMA: "αγοράζω", NORM: "αγοράζω"}], - "'φύγε": [{ORTH: "'φύγε", LEMMA: "φεύγω", NORM: "φεύγω"}], - "'φερε": [{ORTH: "'φερε", LEMMA: "φέρνω", NORM: "φέρνω"}], - "'φαγε": [{ORTH: "'φαγε", LEMMA: "τρώω", NORM: "τρώω"}], - "'σπαγαν": [{ORTH: "'σπαγαν", LEMMA: "σπάω", NORM: "σπάω"}], - "'σκασε": [{ORTH: "'σκασε", LEMMA: "σκάω", NORM: "σκάω"}], - "'σβηνε": [{ORTH: "'σβηνε", LEMMA: "σβήνω", NORM: "σβήνω"}], - "'ριξε": [{ORTH: "'ριξε", LEMMA: "ρίχνω", NORM: "ρίχνω"}], - "'κλεβε": [{ORTH: "'κλεβε", LEMMA: "κλέβω", NORM: "κλέβω"}], - "'κει": [{ORTH: "'κει", LEMMA: "εκεί", NORM: "εκεί"}], - "'βλεπε": [{ORTH: "'βλεπε", LEMMA: "βλέπω", NORM: "βλέπω"}], - "'βγαινε": [{ORTH: "'βγαινε", LEMMA: "βγαίνω", NORM: "βγαίνω"}], + "κι": [{ORTH: "κι", NORM: "και"}], + "Παίξ'": [{ORTH: "Παίξ'", NORM: "παίζω"}], + "Αντ'": [{ORTH: "Αντ'", NORM: "αντί"}], + "ολ'": [{ORTH: "ολ'", NORM: "όλος"}], + "ύστερ'": [{ORTH: "ύστερ'", NORM: "ύστερα"}], + "'πρεπε": [{ORTH: "'πρεπε", NORM: "πρέπει"}], + "Δύσκολ'": [{ORTH: "Δύσκολ'", NORM: "δύσκολος"}], + "'θελα": [{ORTH: "'θελα", NORM: "θέλω"}], + "'γραφα": [{ORTH: "'γραφα", NORM: "γράφω"}], + "'παιρνα": [{ORTH: "'παιρνα", NORM: "παίρνω"}], + "'δειξε": [{ORTH: "'δειξε", NORM: "δείχνω"}], + "όμουρφ'": [{ORTH: "όμουρφ'", NORM: "όμορφος"}], + "κ'τσή": [{ORTH: "κ'τσή", NORM: "κουτσός"}], + "μηδ'": [{ORTH: "μηδ'", NORM: "μήδε"}], + "'ξομολογήθηκε": [{ORTH: "'ξομολογήθηκε", NORM: "εξομολογούμαι"}], + "'μας": [{ORTH: "'μας", NORM: "εμάς"}], + "'ξερες": [{ORTH: "'ξερες", NORM: "ξέρω"}], + "έφθασ'": [{ORTH: "έφθασ'", NORM: "φθάνω"}], + "εξ'": [{ORTH: "εξ'", NORM: "εκ"}], + "δώσ'": [{ORTH: "δώσ'", NORM: "δίνω"}], + "τίποτ'": [{ORTH: "τίποτ'", NORM: "τίποτα"}], + "Λήξ'": [{ORTH: "Λήξ'", NORM: "λήγω"}], + "άσ'": [{ORTH: "άσ'", NORM: "αφήνω"}], + "Στ'": [{ORTH: "Στ'", NORM: "στο"}], + "Δωσ'": [{ORTH: "Δωσ'", NORM: "δίνω"}], + "Βάψ'": [{ORTH: "Βάψ'", NORM: "βάφω"}], + "Αλλ'": [{ORTH: "Αλλ'", NORM: "αλλά"}], + "Αμ'": [{ORTH: "Αμ'", NORM: "άμα"}], + "Αγόρασ'": [{ORTH: "Αγόρασ'", NORM: "αγοράζω"}], + "'φύγε": [{ORTH: "'φύγε", NORM: "φεύγω"}], + "'φερε": [{ORTH: "'φερε", NORM: "φέρνω"}], + "'φαγε": [{ORTH: "'φαγε", NORM: "τρώω"}], + "'σπαγαν": [{ORTH: "'σπαγαν", NORM: "σπάω"}], + "'σκασε": [{ORTH: "'σκασε", NORM: "σκάω"}], + "'σβηνε": [{ORTH: "'σβηνε", NORM: "σβήνω"}], + "'ριξε": [{ORTH: "'ριξε", NORM: "ρίχνω"}], + "'κλεβε": [{ORTH: "'κλεβε", NORM: "κλέβω"}], + "'κει": [{ORTH: "'κει", NORM: "εκεί"}], + "'βλεπε": [{ORTH: "'βλεπε", NORM: "βλέπω"}], + "'βγαινε": [{ORTH: "'βγαινε", NORM: "βγαίνω"}], } _exc.update(_other_exc) @@ -134,35 +132,35 @@ for h in range(1, 12 + 1): for period in ["π.μ.", "πμ"]: _exc[f"{h}{period}"] = [ {ORTH: f"{h}"}, - {ORTH: period, LEMMA: "π.μ.", NORM: "π.μ."}, 
+ {ORTH: period, NORM: "π.μ."}, ] for period in ["μ.μ.", "μμ"]: _exc[f"{h}{period}"] = [ {ORTH: f"{h}"}, - {ORTH: period, LEMMA: "μ.μ.", NORM: "μ.μ."}, + {ORTH: period, NORM: "μ.μ."}, ] for exc_data in [ - {ORTH: "ΑΓΡ.", LEMMA: "Αγροτικός", NORM: "Αγροτικός"}, - {ORTH: "Αγ. Γρ.", LEMMA: "Αγία Γραφή", NORM: "Αγία Γραφή"}, - {ORTH: "Αθ.", LEMMA: "Αθανάσιος", NORM: "Αθανάσιος"}, - {ORTH: "Αλεξ.", LEMMA: "Αλέξανδρος", NORM: "Αλέξανδρος"}, - {ORTH: "Απρ.", LEMMA: "Απρίλιος", NORM: "Απρίλιος"}, - {ORTH: "Αύγ.", LEMMA: "Αύγουστος", NORM: "Αύγουστος"}, - {ORTH: "Δεκ.", LEMMA: "Δεκέμβριος", NORM: "Δεκέμβριος"}, - {ORTH: "Δημ.", LEMMA: "Δήμος", NORM: "Δήμος"}, - {ORTH: "Ιαν.", LEMMA: "Ιανουάριος", NORM: "Ιανουάριος"}, - {ORTH: "Ιούλ.", LEMMA: "Ιούλιος", NORM: "Ιούλιος"}, - {ORTH: "Ιούν.", LEMMA: "Ιούνιος", NORM: "Ιούνιος"}, - {ORTH: "Ιωαν.", LEMMA: "Ιωάννης", NORM: "Ιωάννης"}, - {ORTH: "Μ. Ασία", LEMMA: "Μικρά Ασία", NORM: "Μικρά Ασία"}, - {ORTH: "Μάρτ.", LEMMA: "Μάρτιος", NORM: "Μάρτιος"}, - {ORTH: "Μάρτ'", LEMMA: "Μάρτιος", NORM: "Μάρτιος"}, - {ORTH: "Νοέμβρ.", LEMMA: "Νοέμβριος", NORM: "Νοέμβριος"}, - {ORTH: "Οκτ.", LEMMA: "Οκτώβριος", NORM: "Οκτώβριος"}, - {ORTH: "Σεπτ.", LEMMA: "Σεπτέμβριος", NORM: "Σεπτέμβριος"}, - {ORTH: "Φεβρ.", LEMMA: "Φεβρουάριος", NORM: "Φεβρουάριος"}, + {ORTH: "ΑΓΡ.", NORM: "Αγροτικός"}, + {ORTH: "Αγ. Γρ.", NORM: "Αγία Γραφή"}, + {ORTH: "Αθ.", NORM: "Αθανάσιος"}, + {ORTH: "Αλεξ.", NORM: "Αλέξανδρος"}, + {ORTH: "Απρ.", NORM: "Απρίλιος"}, + {ORTH: "Αύγ.", NORM: "Αύγουστος"}, + {ORTH: "Δεκ.", NORM: "Δεκέμβριος"}, + {ORTH: "Δημ.", NORM: "Δήμος"}, + {ORTH: "Ιαν.", NORM: "Ιανουάριος"}, + {ORTH: "Ιούλ.", NORM: "Ιούλιος"}, + {ORTH: "Ιούν.", NORM: "Ιούνιος"}, + {ORTH: "Ιωαν.", NORM: "Ιωάννης"}, + {ORTH: "Μ. Ασία", NORM: "Μικρά Ασία"}, + {ORTH: "Μάρτ.", NORM: "Μάρτιος"}, + {ORTH: "Μάρτ'", NORM: "Μάρτιος"}, + {ORTH: "Νοέμβρ.", NORM: "Νοέμβριος"}, + {ORTH: "Οκτ.", NORM: "Οκτώβριος"}, + {ORTH: "Σεπτ.", NORM: "Σεπτέμβριος"}, + {ORTH: "Φεβρ.", NORM: "Φεβρουάριος"}, ]: _exc[exc_data[ORTH]] = [exc_data] diff --git a/spacy/lang/en/tokenizer_exceptions.py b/spacy/lang/en/tokenizer_exceptions.py index 226678430..c210e1a19 100644 --- a/spacy/lang/en/tokenizer_exceptions.py +++ b/spacy/lang/en/tokenizer_exceptions.py @@ -1,5 +1,5 @@ from ..tokenizer_exceptions import BASE_EXCEPTIONS -from ...symbols import ORTH, LEMMA, TAG, NORM, PRON_LEMMA +from ...symbols import ORTH, NORM from ...util import update_exc @@ -28,110 +28,110 @@ _exclude = [ for pron in ["i"]: for orth in [pron, pron.title()]: _exc[orth + "'m"] = [ - {ORTH: orth, LEMMA: PRON_LEMMA, NORM: pron, TAG: "PRP"}, - {ORTH: "'m", LEMMA: "be", NORM: "am", TAG: "VBP"}, + {ORTH: orth, NORM: pron}, + {ORTH: "'m", NORM: "am"}, ] _exc[orth + "m"] = [ - {ORTH: orth, LEMMA: PRON_LEMMA, NORM: pron, TAG: "PRP"}, - {ORTH: "m", LEMMA: "be", TAG: "VBP", "tenspect": 1, "number": 1}, + {ORTH: orth, NORM: pron}, + {ORTH: "m", "tenspect": 1, "number": 1}, ] _exc[orth + "'ma"] = [ - {ORTH: orth, LEMMA: PRON_LEMMA, NORM: pron, TAG: "PRP"}, - {ORTH: "'m", LEMMA: "be", NORM: "am"}, - {ORTH: "a", LEMMA: "going to", NORM: "gonna"}, + {ORTH: orth, NORM: pron}, + {ORTH: "'m", NORM: "am"}, + {ORTH: "a", NORM: "gonna"}, ] _exc[orth + "ma"] = [ - {ORTH: orth, LEMMA: PRON_LEMMA, NORM: pron, TAG: "PRP"}, - {ORTH: "m", LEMMA: "be", NORM: "am"}, - {ORTH: "a", LEMMA: "going to", NORM: "gonna"}, + {ORTH: orth, NORM: pron}, + {ORTH: "m", NORM: "am"}, + {ORTH: "a", NORM: "gonna"}, ] for pron in ["i", "you", "he", "she", "it", "we", "they"]: for orth in [pron, 
pron.title()]: _exc[orth + "'ll"] = [ - {ORTH: orth, LEMMA: PRON_LEMMA, NORM: pron, TAG: "PRP"}, - {ORTH: "'ll", LEMMA: "will", NORM: "will", TAG: "MD"}, + {ORTH: orth, NORM: pron}, + {ORTH: "'ll", NORM: "will"}, ] _exc[orth + "ll"] = [ - {ORTH: orth, LEMMA: PRON_LEMMA, NORM: pron, TAG: "PRP"}, - {ORTH: "ll", LEMMA: "will", NORM: "will", TAG: "MD"}, + {ORTH: orth, NORM: pron}, + {ORTH: "ll", NORM: "will"}, ] _exc[orth + "'ll've"] = [ - {ORTH: orth, LEMMA: PRON_LEMMA, NORM: pron, TAG: "PRP"}, - {ORTH: "'ll", LEMMA: "will", NORM: "will", TAG: "MD"}, - {ORTH: "'ve", LEMMA: "have", NORM: "have", TAG: "VB"}, + {ORTH: orth, NORM: pron}, + {ORTH: "'ll", NORM: "will"}, + {ORTH: "'ve", NORM: "have"}, ] _exc[orth + "llve"] = [ - {ORTH: orth, LEMMA: PRON_LEMMA, NORM: pron, TAG: "PRP"}, - {ORTH: "ll", LEMMA: "will", NORM: "will", TAG: "MD"}, - {ORTH: "ve", LEMMA: "have", NORM: "have", TAG: "VB"}, + {ORTH: orth, NORM: pron}, + {ORTH: "ll", NORM: "will"}, + {ORTH: "ve", NORM: "have"}, ] _exc[orth + "'d"] = [ - {ORTH: orth, LEMMA: PRON_LEMMA, NORM: pron, TAG: "PRP"}, + {ORTH: orth, NORM: pron}, {ORTH: "'d", NORM: "'d"}, ] _exc[orth + "d"] = [ - {ORTH: orth, LEMMA: PRON_LEMMA, NORM: pron, TAG: "PRP"}, + {ORTH: orth, NORM: pron}, {ORTH: "d", NORM: "'d"}, ] _exc[orth + "'d've"] = [ - {ORTH: orth, LEMMA: PRON_LEMMA, NORM: pron, TAG: "PRP"}, - {ORTH: "'d", LEMMA: "would", NORM: "would", TAG: "MD"}, - {ORTH: "'ve", LEMMA: "have", NORM: "have", TAG: "VB"}, + {ORTH: orth, NORM: pron}, + {ORTH: "'d", NORM: "would"}, + {ORTH: "'ve", NORM: "have"}, ] _exc[orth + "dve"] = [ - {ORTH: orth, LEMMA: PRON_LEMMA, NORM: pron, TAG: "PRP"}, - {ORTH: "d", LEMMA: "would", NORM: "would", TAG: "MD"}, - {ORTH: "ve", LEMMA: "have", NORM: "have", TAG: "VB"}, + {ORTH: orth, NORM: pron}, + {ORTH: "d", NORM: "would"}, + {ORTH: "ve", NORM: "have"}, ] for pron in ["i", "you", "we", "they"]: for orth in [pron, pron.title()]: _exc[orth + "'ve"] = [ - {ORTH: orth, LEMMA: PRON_LEMMA, NORM: pron, TAG: "PRP"}, - {ORTH: "'ve", LEMMA: "have", NORM: "have", TAG: "VB"}, + {ORTH: orth, NORM: pron}, + {ORTH: "'ve", NORM: "have"}, ] _exc[orth + "ve"] = [ - {ORTH: orth, LEMMA: PRON_LEMMA, NORM: pron, TAG: "PRP"}, - {ORTH: "ve", LEMMA: "have", NORM: "have", TAG: "VB"}, + {ORTH: orth, NORM: pron}, + {ORTH: "ve", NORM: "have"}, ] for pron in ["you", "we", "they"]: for orth in [pron, pron.title()]: _exc[orth + "'re"] = [ - {ORTH: orth, LEMMA: PRON_LEMMA, NORM: pron, TAG: "PRP"}, - {ORTH: "'re", LEMMA: "be", NORM: "are"}, + {ORTH: orth, NORM: pron}, + {ORTH: "'re", NORM: "are"}, ] _exc[orth + "re"] = [ - {ORTH: orth, LEMMA: PRON_LEMMA, NORM: pron, TAG: "PRP"}, - {ORTH: "re", LEMMA: "be", NORM: "are", TAG: "VBZ"}, + {ORTH: orth, NORM: pron}, + {ORTH: "re", NORM: "are"}, ] for pron in ["he", "she", "it"]: for orth in [pron, pron.title()]: _exc[orth + "'s"] = [ - {ORTH: orth, LEMMA: PRON_LEMMA, NORM: pron, TAG: "PRP"}, + {ORTH: orth, NORM: pron}, {ORTH: "'s", NORM: "'s"}, ] _exc[orth + "s"] = [ - {ORTH: orth, LEMMA: PRON_LEMMA, NORM: pron, TAG: "PRP"}, + {ORTH: orth, NORM: pron}, {ORTH: "s"}, ] @@ -153,145 +153,145 @@ for word in [ ]: for orth in [word, word.title()]: _exc[orth + "'s"] = [ - {ORTH: orth, LEMMA: word, NORM: word}, + {ORTH: orth, NORM: word}, {ORTH: "'s", NORM: "'s"}, ] - _exc[orth + "s"] = [{ORTH: orth, LEMMA: word, NORM: word}, {ORTH: "s"}] + _exc[orth + "s"] = [{ORTH: orth, NORM: word}, {ORTH: "s"}] _exc[orth + "'ll"] = [ - {ORTH: orth, LEMMA: word, NORM: word}, - {ORTH: "'ll", LEMMA: "will", NORM: "will", TAG: "MD"}, + {ORTH: orth, NORM: 
word}, + {ORTH: "'ll", NORM: "will"}, ] _exc[orth + "ll"] = [ - {ORTH: orth, LEMMA: word, NORM: word}, - {ORTH: "ll", LEMMA: "will", NORM: "will", TAG: "MD"}, + {ORTH: orth, NORM: word}, + {ORTH: "ll", NORM: "will"}, ] _exc[orth + "'ll've"] = [ - {ORTH: orth, LEMMA: word, NORM: word}, - {ORTH: "'ll", LEMMA: "will", NORM: "will", TAG: "MD"}, - {ORTH: "'ve", LEMMA: "have", NORM: "have", TAG: "VB"}, + {ORTH: orth, NORM: word}, + {ORTH: "'ll", NORM: "will"}, + {ORTH: "'ve", NORM: "have"}, ] _exc[orth + "llve"] = [ - {ORTH: orth, LEMMA: word, NORM: word}, - {ORTH: "ll", LEMMA: "will", NORM: "will", TAG: "MD"}, - {ORTH: "ve", LEMMA: "have", NORM: "have", TAG: "VB"}, + {ORTH: orth, NORM: word}, + {ORTH: "ll", NORM: "will"}, + {ORTH: "ve", NORM: "have"}, ] _exc[orth + "'re"] = [ - {ORTH: orth, LEMMA: word, NORM: word}, - {ORTH: "'re", LEMMA: "be", NORM: "are"}, + {ORTH: orth, NORM: word}, + {ORTH: "'re", NORM: "are"}, ] _exc[orth + "re"] = [ - {ORTH: orth, LEMMA: word, NORM: word}, - {ORTH: "re", LEMMA: "be", NORM: "are"}, + {ORTH: orth, NORM: word}, + {ORTH: "re", NORM: "are"}, ] _exc[orth + "'ve"] = [ - {ORTH: orth, LEMMA: word, NORM: word}, - {ORTH: "'ve", LEMMA: "have", TAG: "VB"}, + {ORTH: orth, NORM: word}, + {ORTH: "'ve"}, ] _exc[orth + "ve"] = [ - {ORTH: orth, LEMMA: word}, - {ORTH: "ve", LEMMA: "have", NORM: "have", TAG: "VB"}, + {ORTH: orth}, + {ORTH: "ve", NORM: "have"}, ] _exc[orth + "'d"] = [ - {ORTH: orth, LEMMA: word, NORM: word}, + {ORTH: orth, NORM: word}, {ORTH: "'d", NORM: "'d"}, ] _exc[orth + "d"] = [ - {ORTH: orth, LEMMA: word, NORM: word}, + {ORTH: orth, NORM: word}, {ORTH: "d", NORM: "'d"}, ] _exc[orth + "'d've"] = [ - {ORTH: orth, LEMMA: word, NORM: word}, - {ORTH: "'d", LEMMA: "would", NORM: "would", TAG: "MD"}, - {ORTH: "'ve", LEMMA: "have", NORM: "have", TAG: "VB"}, + {ORTH: orth, NORM: word}, + {ORTH: "'d", NORM: "would"}, + {ORTH: "'ve", NORM: "have"}, ] _exc[orth + "dve"] = [ - {ORTH: orth, LEMMA: word, NORM: word}, - {ORTH: "d", LEMMA: "would", NORM: "would", TAG: "MD"}, - {ORTH: "ve", LEMMA: "have", NORM: "have", TAG: "VB"}, + {ORTH: orth, NORM: word}, + {ORTH: "d", NORM: "would"}, + {ORTH: "ve", NORM: "have"}, ] # Verbs for verb_data in [ - {ORTH: "ca", LEMMA: "can", NORM: "can", TAG: "MD"}, - {ORTH: "could", NORM: "could", TAG: "MD"}, - {ORTH: "do", LEMMA: "do", NORM: "do"}, - {ORTH: "does", LEMMA: "do", NORM: "does"}, - {ORTH: "did", LEMMA: "do", NORM: "do", TAG: "VBD"}, - {ORTH: "had", LEMMA: "have", NORM: "have", TAG: "VBD"}, - {ORTH: "may", NORM: "may", TAG: "MD"}, - {ORTH: "might", NORM: "might", TAG: "MD"}, - {ORTH: "must", NORM: "must", TAG: "MD"}, + {ORTH: "ca", NORM: "can"}, + {ORTH: "could", NORM: "could"}, + {ORTH: "do", NORM: "do"}, + {ORTH: "does", NORM: "does"}, + {ORTH: "did", NORM: "do"}, + {ORTH: "had", NORM: "have"}, + {ORTH: "may", NORM: "may"}, + {ORTH: "might", NORM: "might"}, + {ORTH: "must", NORM: "must"}, {ORTH: "need", NORM: "need"}, - {ORTH: "ought", NORM: "ought", TAG: "MD"}, - {ORTH: "sha", LEMMA: "shall", NORM: "shall", TAG: "MD"}, - {ORTH: "should", NORM: "should", TAG: "MD"}, - {ORTH: "wo", LEMMA: "will", NORM: "will", TAG: "MD"}, - {ORTH: "would", NORM: "would", TAG: "MD"}, + {ORTH: "ought", NORM: "ought"}, + {ORTH: "sha", NORM: "shall"}, + {ORTH: "should", NORM: "should"}, + {ORTH: "wo", NORM: "will"}, + {ORTH: "would", NORM: "would"}, ]: verb_data_tc = dict(verb_data) verb_data_tc[ORTH] = verb_data_tc[ORTH].title() for data in [verb_data, verb_data_tc]: _exc[data[ORTH] + "n't"] = [ dict(data), - {ORTH: "n't", LEMMA: "not", NORM: 
"not", TAG: "RB"}, + {ORTH: "n't", NORM: "not"}, ] _exc[data[ORTH] + "nt"] = [ dict(data), - {ORTH: "nt", LEMMA: "not", NORM: "not", TAG: "RB"}, + {ORTH: "nt", NORM: "not"}, ] _exc[data[ORTH] + "n't've"] = [ dict(data), - {ORTH: "n't", LEMMA: "not", NORM: "not", TAG: "RB"}, - {ORTH: "'ve", LEMMA: "have", NORM: "have", TAG: "VB"}, + {ORTH: "n't", NORM: "not"}, + {ORTH: "'ve", NORM: "have"}, ] _exc[data[ORTH] + "ntve"] = [ dict(data), - {ORTH: "nt", LEMMA: "not", NORM: "not", TAG: "RB"}, - {ORTH: "ve", LEMMA: "have", NORM: "have", TAG: "VB"}, + {ORTH: "nt", NORM: "not"}, + {ORTH: "ve", NORM: "have"}, ] for verb_data in [ - {ORTH: "could", NORM: "could", TAG: "MD"}, - {ORTH: "might", NORM: "might", TAG: "MD"}, - {ORTH: "must", NORM: "must", TAG: "MD"}, - {ORTH: "should", NORM: "should", TAG: "MD"}, - {ORTH: "would", NORM: "would", TAG: "MD"}, + {ORTH: "could", NORM: "could"}, + {ORTH: "might", NORM: "might"}, + {ORTH: "must", NORM: "must"}, + {ORTH: "should", NORM: "should"}, + {ORTH: "would", NORM: "would"}, ]: verb_data_tc = dict(verb_data) verb_data_tc[ORTH] = verb_data_tc[ORTH].title() for data in [verb_data, verb_data_tc]: - _exc[data[ORTH] + "'ve"] = [dict(data), {ORTH: "'ve", LEMMA: "have", TAG: "VB"}] + _exc[data[ORTH] + "'ve"] = [dict(data), {ORTH: "'ve"}] - _exc[data[ORTH] + "ve"] = [dict(data), {ORTH: "ve", LEMMA: "have", TAG: "VB"}] + _exc[data[ORTH] + "ve"] = [dict(data), {ORTH: "ve"}] for verb_data in [ - {ORTH: "ai", LEMMA: "be", TAG: "VBP", "number": 2}, - {ORTH: "are", LEMMA: "be", NORM: "are", TAG: "VBP", "number": 2}, - {ORTH: "is", LEMMA: "be", NORM: "is", TAG: "VBZ"}, - {ORTH: "was", LEMMA: "be", NORM: "was"}, - {ORTH: "were", LEMMA: "be", NORM: "were"}, + {ORTH: "ai", "number": 2}, + {ORTH: "are", NORM: "are", "number": 2}, + {ORTH: "is", NORM: "is"}, + {ORTH: "was", NORM: "was"}, + {ORTH: "were", NORM: "were"}, {ORTH: "have", NORM: "have"}, - {ORTH: "has", LEMMA: "have", NORM: "has"}, + {ORTH: "has", NORM: "has"}, {ORTH: "dare", NORM: "dare"}, ]: verb_data_tc = dict(verb_data) @@ -299,24 +299,24 @@ for verb_data in [ for data in [verb_data, verb_data_tc]: _exc[data[ORTH] + "n't"] = [ dict(data), - {ORTH: "n't", LEMMA: "not", NORM: "not", TAG: "RB"}, + {ORTH: "n't", NORM: "not"}, ] _exc[data[ORTH] + "nt"] = [ dict(data), - {ORTH: "nt", LEMMA: "not", NORM: "not", TAG: "RB"}, + {ORTH: "nt", NORM: "not"}, ] # Other contractions with trailing apostrophe for exc_data in [ - {ORTH: "doin", LEMMA: "do", NORM: "doing"}, - {ORTH: "goin", LEMMA: "go", NORM: "going"}, - {ORTH: "nothin", LEMMA: "nothing", NORM: "nothing"}, - {ORTH: "nuthin", LEMMA: "nothing", NORM: "nothing"}, - {ORTH: "ol", LEMMA: "old", NORM: "old"}, - {ORTH: "somethin", LEMMA: "something", NORM: "something"}, + {ORTH: "doin", NORM: "doing"}, + {ORTH: "goin", NORM: "going"}, + {ORTH: "nothin", NORM: "nothing"}, + {ORTH: "nuthin", NORM: "nothing"}, + {ORTH: "ol", NORM: "old"}, + {ORTH: "somethin", NORM: "something"}, ]: exc_data_tc = dict(exc_data) exc_data_tc[ORTH] = exc_data_tc[ORTH].title() @@ -331,9 +331,9 @@ for exc_data in [ for exc_data in [ {ORTH: "cause", NORM: "because"}, - {ORTH: "em", LEMMA: PRON_LEMMA, NORM: "them"}, - {ORTH: "ll", LEMMA: "will", NORM: "will"}, - {ORTH: "nuff", LEMMA: "enough", NORM: "enough"}, + {ORTH: "em", NORM: "them"}, + {ORTH: "ll", NORM: "will"}, + {ORTH: "nuff", NORM: "enough"}, ]: exc_data_apos = dict(exc_data) exc_data_apos[ORTH] = "'" + exc_data_apos[ORTH] @@ -347,166 +347,131 @@ for h in range(1, 12 + 1): for period in ["a.m.", "am"]: _exc[f"{h}{period}"] = [ {ORTH: 
f"{h}"}, - {ORTH: period, LEMMA: "a.m.", NORM: "a.m."}, + {ORTH: period, NORM: "a.m."}, ] for period in ["p.m.", "pm"]: _exc[f"{h}{period}"] = [ {ORTH: f"{h}"}, - {ORTH: period, LEMMA: "p.m.", NORM: "p.m."}, + {ORTH: period, NORM: "p.m."}, ] # Rest _other_exc = { - "y'all": [{ORTH: "y'", LEMMA: PRON_LEMMA, NORM: "you"}, {ORTH: "all"}], - "yall": [{ORTH: "y", LEMMA: PRON_LEMMA, NORM: "you"}, {ORTH: "all"}], - "how'd'y": [ - {ORTH: "how", LEMMA: "how"}, - {ORTH: "'d", LEMMA: "do"}, - {ORTH: "'y", LEMMA: PRON_LEMMA, NORM: "you"}, - ], - "How'd'y": [ - {ORTH: "How", LEMMA: "how", NORM: "how"}, - {ORTH: "'d", LEMMA: "do"}, - {ORTH: "'y", LEMMA: PRON_LEMMA, NORM: "you"}, - ], - "not've": [ - {ORTH: "not", LEMMA: "not", TAG: "RB"}, - {ORTH: "'ve", LEMMA: "have", NORM: "have", TAG: "VB"}, - ], - "notve": [ - {ORTH: "not", LEMMA: "not", TAG: "RB"}, - {ORTH: "ve", LEMMA: "have", NORM: "have", TAG: "VB"}, - ], - "Not've": [ - {ORTH: "Not", LEMMA: "not", NORM: "not", TAG: "RB"}, - {ORTH: "'ve", LEMMA: "have", NORM: "have", TAG: "VB"}, - ], - "Notve": [ - {ORTH: "Not", LEMMA: "not", NORM: "not", TAG: "RB"}, - {ORTH: "ve", LEMMA: "have", NORM: "have", TAG: "VB"}, - ], - "cannot": [ - {ORTH: "can", LEMMA: "can", TAG: "MD"}, - {ORTH: "not", LEMMA: "not", TAG: "RB"}, - ], - "Cannot": [ - {ORTH: "Can", LEMMA: "can", NORM: "can", TAG: "MD"}, - {ORTH: "not", LEMMA: "not", TAG: "RB"}, - ], - "gonna": [ - {ORTH: "gon", LEMMA: "go", NORM: "going"}, - {ORTH: "na", LEMMA: "to", NORM: "to"}, - ], - "Gonna": [ - {ORTH: "Gon", LEMMA: "go", NORM: "going"}, - {ORTH: "na", LEMMA: "to", NORM: "to"}, - ], - "gotta": [{ORTH: "got"}, {ORTH: "ta", LEMMA: "to", NORM: "to"}], - "Gotta": [{ORTH: "Got", NORM: "got"}, {ORTH: "ta", LEMMA: "to", NORM: "to"}], - "let's": [{ORTH: "let"}, {ORTH: "'s", LEMMA: PRON_LEMMA, NORM: "us"}], - "Let's": [ - {ORTH: "Let", LEMMA: "let", NORM: "let"}, - {ORTH: "'s", LEMMA: PRON_LEMMA, NORM: "us"}, - ], - "c'mon": [{ORTH: "c'm", NORM: "come", LEMMA: "come"}, {ORTH: "on"}], - "C'mon": [{ORTH: "C'm", NORM: "come", LEMMA: "come"}, {ORTH: "on"}], + "y'all": [{ORTH: "y'", NORM: "you"}, {ORTH: "all"}], + "yall": [{ORTH: "y", NORM: "you"}, {ORTH: "all"}], + "how'd'y": [{ORTH: "how"}, {ORTH: "'d"}, {ORTH: "'y", NORM: "you"}], + "How'd'y": [{ORTH: "How", NORM: "how"}, {ORTH: "'d"}, {ORTH: "'y", NORM: "you"}], + "not've": [{ORTH: "not"}, {ORTH: "'ve", NORM: "have"}], + "notve": [{ORTH: "not"}, {ORTH: "ve", NORM: "have"}], + "Not've": [{ORTH: "Not", NORM: "not"}, {ORTH: "'ve", NORM: "have"}], + "Notve": [{ORTH: "Not", NORM: "not"}, {ORTH: "ve", NORM: "have"}], + "cannot": [{ORTH: "can"}, {ORTH: "not"}], + "Cannot": [{ORTH: "Can", NORM: "can"}, {ORTH: "not"}], + "gonna": [{ORTH: "gon", NORM: "going"}, {ORTH: "na", NORM: "to"}], + "Gonna": [{ORTH: "Gon", NORM: "going"}, {ORTH: "na", NORM: "to"}], + "gotta": [{ORTH: "got"}, {ORTH: "ta", NORM: "to"}], + "Gotta": [{ORTH: "Got", NORM: "got"}, {ORTH: "ta", NORM: "to"}], + "let's": [{ORTH: "let"}, {ORTH: "'s", NORM: "us"}], + "Let's": [{ORTH: "Let", NORM: "let"}, {ORTH: "'s", NORM: "us"}], + "c'mon": [{ORTH: "c'm", NORM: "come"}, {ORTH: "on"}], + "C'mon": [{ORTH: "C'm", NORM: "come"}, {ORTH: "on"}], } _exc.update(_other_exc) for exc_data in [ - {ORTH: "'S", LEMMA: "'s", NORM: "'s"}, - {ORTH: "'s", LEMMA: "'s", NORM: "'s"}, - {ORTH: "\u2018S", LEMMA: "'s", NORM: "'s"}, - {ORTH: "\u2018s", LEMMA: "'s", NORM: "'s"}, - {ORTH: "and/or", LEMMA: "and/or", NORM: "and/or", TAG: "CC"}, - {ORTH: "w/o", LEMMA: "without", NORM: "without"}, - {ORTH: "'re", LEMMA: "be", NORM: 
"are"}, - {ORTH: "'Cause", LEMMA: "because", NORM: "because"}, - {ORTH: "'cause", LEMMA: "because", NORM: "because"}, - {ORTH: "'cos", LEMMA: "because", NORM: "because"}, - {ORTH: "'Cos", LEMMA: "because", NORM: "because"}, - {ORTH: "'coz", LEMMA: "because", NORM: "because"}, - {ORTH: "'Coz", LEMMA: "because", NORM: "because"}, - {ORTH: "'cuz", LEMMA: "because", NORM: "because"}, - {ORTH: "'Cuz", LEMMA: "because", NORM: "because"}, - {ORTH: "'bout", LEMMA: "about", NORM: "about"}, - {ORTH: "ma'am", LEMMA: "madam", NORM: "madam"}, - {ORTH: "Ma'am", LEMMA: "madam", NORM: "madam"}, - {ORTH: "o'clock", LEMMA: "o'clock", NORM: "o'clock"}, - {ORTH: "O'clock", LEMMA: "o'clock", NORM: "o'clock"}, - {ORTH: "lovin'", LEMMA: "love", NORM: "loving"}, - {ORTH: "Lovin'", LEMMA: "love", NORM: "loving"}, - {ORTH: "lovin", LEMMA: "love", NORM: "loving"}, - {ORTH: "Lovin", LEMMA: "love", NORM: "loving"}, - {ORTH: "havin'", LEMMA: "have", NORM: "having"}, - {ORTH: "Havin'", LEMMA: "have", NORM: "having"}, - {ORTH: "havin", LEMMA: "have", NORM: "having"}, - {ORTH: "Havin", LEMMA: "have", NORM: "having"}, - {ORTH: "doin'", LEMMA: "do", NORM: "doing"}, - {ORTH: "Doin'", LEMMA: "do", NORM: "doing"}, - {ORTH: "doin", LEMMA: "do", NORM: "doing"}, - {ORTH: "Doin", LEMMA: "do", NORM: "doing"}, - {ORTH: "goin'", LEMMA: "go", NORM: "going"}, - {ORTH: "Goin'", LEMMA: "go", NORM: "going"}, - {ORTH: "goin", LEMMA: "go", NORM: "going"}, - {ORTH: "Goin", LEMMA: "go", NORM: "going"}, - {ORTH: "Mt.", LEMMA: "Mount", NORM: "Mount"}, - {ORTH: "Ak.", LEMMA: "Alaska", NORM: "Alaska"}, - {ORTH: "Ala.", LEMMA: "Alabama", NORM: "Alabama"}, - {ORTH: "Apr.", LEMMA: "April", NORM: "April"}, - {ORTH: "Ariz.", LEMMA: "Arizona", NORM: "Arizona"}, - {ORTH: "Ark.", LEMMA: "Arkansas", NORM: "Arkansas"}, - {ORTH: "Aug.", LEMMA: "August", NORM: "August"}, - {ORTH: "Calif.", LEMMA: "California", NORM: "California"}, - {ORTH: "Colo.", LEMMA: "Colorado", NORM: "Colorado"}, - {ORTH: "Conn.", LEMMA: "Connecticut", NORM: "Connecticut"}, - {ORTH: "Dec.", LEMMA: "December", NORM: "December"}, - {ORTH: "Del.", LEMMA: "Delaware", NORM: "Delaware"}, - {ORTH: "Feb.", LEMMA: "February", NORM: "February"}, - {ORTH: "Fla.", LEMMA: "Florida", NORM: "Florida"}, - {ORTH: "Ga.", LEMMA: "Georgia", NORM: "Georgia"}, - {ORTH: "Ia.", LEMMA: "Iowa", NORM: "Iowa"}, - {ORTH: "Id.", LEMMA: "Idaho", NORM: "Idaho"}, - {ORTH: "Ill.", LEMMA: "Illinois", NORM: "Illinois"}, - {ORTH: "Ind.", LEMMA: "Indiana", NORM: "Indiana"}, - {ORTH: "Jan.", LEMMA: "January", NORM: "January"}, - {ORTH: "Jul.", LEMMA: "July", NORM: "July"}, - {ORTH: "Jun.", LEMMA: "June", NORM: "June"}, - {ORTH: "Kan.", LEMMA: "Kansas", NORM: "Kansas"}, - {ORTH: "Kans.", LEMMA: "Kansas", NORM: "Kansas"}, - {ORTH: "Ky.", LEMMA: "Kentucky", NORM: "Kentucky"}, - {ORTH: "La.", LEMMA: "Louisiana", NORM: "Louisiana"}, - {ORTH: "Mar.", LEMMA: "March", NORM: "March"}, - {ORTH: "Mass.", LEMMA: "Massachusetts", NORM: "Massachusetts"}, - {ORTH: "May.", LEMMA: "May", NORM: "May"}, - {ORTH: "Mich.", LEMMA: "Michigan", NORM: "Michigan"}, - {ORTH: "Minn.", LEMMA: "Minnesota", NORM: "Minnesota"}, - {ORTH: "Miss.", LEMMA: "Mississippi", NORM: "Mississippi"}, - {ORTH: "N.C.", LEMMA: "North Carolina", NORM: "North Carolina"}, - {ORTH: "N.D.", LEMMA: "North Dakota", NORM: "North Dakota"}, - {ORTH: "N.H.", LEMMA: "New Hampshire", NORM: "New Hampshire"}, - {ORTH: "N.J.", LEMMA: "New Jersey", NORM: "New Jersey"}, - {ORTH: "N.M.", LEMMA: "New Mexico", NORM: "New Mexico"}, - {ORTH: "N.Y.", LEMMA: "New York", NORM: "New York"}, - 
{ORTH: "Neb.", LEMMA: "Nebraska", NORM: "Nebraska"}, - {ORTH: "Nebr.", LEMMA: "Nebraska", NORM: "Nebraska"}, - {ORTH: "Nev.", LEMMA: "Nevada", NORM: "Nevada"}, - {ORTH: "Nov.", LEMMA: "November", NORM: "November"}, - {ORTH: "Oct.", LEMMA: "October", NORM: "October"}, - {ORTH: "Okla.", LEMMA: "Oklahoma", NORM: "Oklahoma"}, - {ORTH: "Ore.", LEMMA: "Oregon", NORM: "Oregon"}, - {ORTH: "Pa.", LEMMA: "Pennsylvania", NORM: "Pennsylvania"}, - {ORTH: "S.C.", LEMMA: "South Carolina", NORM: "South Carolina"}, - {ORTH: "Sep.", LEMMA: "September", NORM: "September"}, - {ORTH: "Sept.", LEMMA: "September", NORM: "September"}, - {ORTH: "Tenn.", LEMMA: "Tennessee", NORM: "Tennessee"}, - {ORTH: "Va.", LEMMA: "Virginia", NORM: "Virginia"}, - {ORTH: "Wash.", LEMMA: "Washington", NORM: "Washington"}, - {ORTH: "Wis.", LEMMA: "Wisconsin", NORM: "Wisconsin"}, + {ORTH: "'S", NORM: "'s"}, + {ORTH: "'s", NORM: "'s"}, + {ORTH: "\u2018S", NORM: "'s"}, + {ORTH: "\u2018s", NORM: "'s"}, + {ORTH: "and/or", NORM: "and/or"}, + {ORTH: "w/o", NORM: "without"}, + {ORTH: "'re", NORM: "are"}, + {ORTH: "'Cause", NORM: "because"}, + {ORTH: "'cause", NORM: "because"}, + {ORTH: "'cos", NORM: "because"}, + {ORTH: "'Cos", NORM: "because"}, + {ORTH: "'coz", NORM: "because"}, + {ORTH: "'Coz", NORM: "because"}, + {ORTH: "'cuz", NORM: "because"}, + {ORTH: "'Cuz", NORM: "because"}, + {ORTH: "'bout", NORM: "about"}, + {ORTH: "ma'am", NORM: "madam"}, + {ORTH: "Ma'am", NORM: "madam"}, + {ORTH: "o'clock", NORM: "o'clock"}, + {ORTH: "O'clock", NORM: "o'clock"}, + {ORTH: "lovin'", NORM: "loving"}, + {ORTH: "Lovin'", NORM: "loving"}, + {ORTH: "lovin", NORM: "loving"}, + {ORTH: "Lovin", NORM: "loving"}, + {ORTH: "havin'", NORM: "having"}, + {ORTH: "Havin'", NORM: "having"}, + {ORTH: "havin", NORM: "having"}, + {ORTH: "Havin", NORM: "having"}, + {ORTH: "doin'", NORM: "doing"}, + {ORTH: "Doin'", NORM: "doing"}, + {ORTH: "doin", NORM: "doing"}, + {ORTH: "Doin", NORM: "doing"}, + {ORTH: "goin'", NORM: "going"}, + {ORTH: "Goin'", NORM: "going"}, + {ORTH: "goin", NORM: "going"}, + {ORTH: "Goin", NORM: "going"}, + {ORTH: "Mt.", NORM: "Mount"}, + {ORTH: "Ak.", NORM: "Alaska"}, + {ORTH: "Ala.", NORM: "Alabama"}, + {ORTH: "Apr.", NORM: "April"}, + {ORTH: "Ariz.", NORM: "Arizona"}, + {ORTH: "Ark.", NORM: "Arkansas"}, + {ORTH: "Aug.", NORM: "August"}, + {ORTH: "Calif.", NORM: "California"}, + {ORTH: "Colo.", NORM: "Colorado"}, + {ORTH: "Conn.", NORM: "Connecticut"}, + {ORTH: "Dec.", NORM: "December"}, + {ORTH: "Del.", NORM: "Delaware"}, + {ORTH: "Feb.", NORM: "February"}, + {ORTH: "Fla.", NORM: "Florida"}, + {ORTH: "Ga.", NORM: "Georgia"}, + {ORTH: "Ia.", NORM: "Iowa"}, + {ORTH: "Id.", NORM: "Idaho"}, + {ORTH: "Ill.", NORM: "Illinois"}, + {ORTH: "Ind.", NORM: "Indiana"}, + {ORTH: "Jan.", NORM: "January"}, + {ORTH: "Jul.", NORM: "July"}, + {ORTH: "Jun.", NORM: "June"}, + {ORTH: "Kan.", NORM: "Kansas"}, + {ORTH: "Kans.", NORM: "Kansas"}, + {ORTH: "Ky.", NORM: "Kentucky"}, + {ORTH: "La.", NORM: "Louisiana"}, + {ORTH: "Mar.", NORM: "March"}, + {ORTH: "Mass.", NORM: "Massachusetts"}, + {ORTH: "May.", NORM: "May"}, + {ORTH: "Mich.", NORM: "Michigan"}, + {ORTH: "Minn.", NORM: "Minnesota"}, + {ORTH: "Miss.", NORM: "Mississippi"}, + {ORTH: "N.C.", NORM: "North Carolina"}, + {ORTH: "N.D.", NORM: "North Dakota"}, + {ORTH: "N.H.", NORM: "New Hampshire"}, + {ORTH: "N.J.", NORM: "New Jersey"}, + {ORTH: "N.M.", NORM: "New Mexico"}, + {ORTH: "N.Y.", NORM: "New York"}, + {ORTH: "Neb.", NORM: "Nebraska"}, + {ORTH: "Nebr.", NORM: "Nebraska"}, + {ORTH: "Nev.", NORM: "Nevada"}, + 
{ORTH: "Nov.", NORM: "November"}, + {ORTH: "Oct.", NORM: "October"}, + {ORTH: "Okla.", NORM: "Oklahoma"}, + {ORTH: "Ore.", NORM: "Oregon"}, + {ORTH: "Pa.", NORM: "Pennsylvania"}, + {ORTH: "S.C.", NORM: "South Carolina"}, + {ORTH: "Sep.", NORM: "September"}, + {ORTH: "Sept.", NORM: "September"}, + {ORTH: "Tenn.", NORM: "Tennessee"}, + {ORTH: "Va.", NORM: "Virginia"}, + {ORTH: "Wash.", NORM: "Washington"}, + {ORTH: "Wis.", NORM: "Wisconsin"}, ]: _exc[exc_data[ORTH]] = [exc_data] diff --git a/spacy/lang/es/tokenizer_exceptions.py b/spacy/lang/es/tokenizer_exceptions.py index 63124578e..fbfe75545 100644 --- a/spacy/lang/es/tokenizer_exceptions.py +++ b/spacy/lang/es/tokenizer_exceptions.py @@ -1,27 +1,27 @@ from ..tokenizer_exceptions import BASE_EXCEPTIONS -from ...symbols import ORTH, LEMMA, NORM, PRON_LEMMA +from ...symbols import ORTH, NORM from ...util import update_exc _exc = { - "pal": [{ORTH: "pa", LEMMA: "para"}, {ORTH: "l", LEMMA: "el", NORM: "el"}], + "pal": [{ORTH: "pa"}, {ORTH: "l", NORM: "el"}], } for exc_data in [ - {ORTH: "n°", LEMMA: "número"}, - {ORTH: "°C", LEMMA: "grados Celcius"}, - {ORTH: "aprox.", LEMMA: "aproximadamente"}, - {ORTH: "dna.", LEMMA: "docena"}, - {ORTH: "dpto.", LEMMA: "departamento"}, - {ORTH: "ej.", LEMMA: "ejemplo"}, - {ORTH: "esq.", LEMMA: "esquina"}, - {ORTH: "pág.", LEMMA: "página"}, - {ORTH: "p.ej.", LEMMA: "por ejemplo"}, - {ORTH: "Ud.", LEMMA: PRON_LEMMA, NORM: "usted"}, - {ORTH: "Vd.", LEMMA: PRON_LEMMA, NORM: "usted"}, - {ORTH: "Uds.", LEMMA: PRON_LEMMA, NORM: "ustedes"}, - {ORTH: "Vds.", LEMMA: PRON_LEMMA, NORM: "ustedes"}, + {ORTH: "n°"}, + {ORTH: "°C"}, + {ORTH: "aprox."}, + {ORTH: "dna."}, + {ORTH: "dpto."}, + {ORTH: "ej."}, + {ORTH: "esq."}, + {ORTH: "pág."}, + {ORTH: "p.ej."}, + {ORTH: "Ud.", NORM: "usted"}, + {ORTH: "Vd.", NORM: "usted"}, + {ORTH: "Uds.", NORM: "ustedes"}, + {ORTH: "Vds.", NORM: "ustedes"}, {ORTH: "vol.", NORM: "volúmen"}, ]: _exc[exc_data[ORTH]] = [exc_data] @@ -29,14 +29,14 @@ for exc_data in [ # Times -_exc["12m."] = [{ORTH: "12"}, {ORTH: "m.", LEMMA: "p.m."}] +_exc["12m."] = [{ORTH: "12"}, {ORTH: "m."}] for h in range(1, 12 + 1): for period in ["a.m.", "am"]: - _exc[f"{h}{period}"] = [{ORTH: f"{h}"}, {ORTH: period, LEMMA: "a.m."}] + _exc[f"{h}{period}"] = [{ORTH: f"{h}"}, {ORTH: period}] for period in ["p.m.", "pm"]: - _exc[f"{h}{period}"] = [{ORTH: f"{h}"}, {ORTH: period, LEMMA: "p.m."}] + _exc[f"{h}{period}"] = [{ORTH: f"{h}"}, {ORTH: period}] for orth in [ diff --git a/spacy/lang/fa/tokenizer_exceptions.py b/spacy/lang/fa/tokenizer_exceptions.py index db9e3f6fc..30df798ab 100644 --- a/spacy/lang/fa/tokenizer_exceptions.py +++ b/spacy/lang/fa/tokenizer_exceptions.py @@ -1,2753 +1,747 @@ -from ...symbols import ORTH, LEMMA, TAG, NORM +from ...symbols import ORTH, NORM -_exc = { - ".ق ": [{LEMMA: "قمری", ORTH: ".ق "}], - ".م": [{LEMMA: "میلادی", ORTH: ".م"}], - ".هـ": [{LEMMA: "هجری", ORTH: ".هـ"}], - "ب.م": [{LEMMA: "بعد از میلاد", ORTH: "ب.م"}], - "ق.م": [{LEMMA: "قبل از میلاد", ORTH: "ق.م"}], +TOKENIZER_EXCEPTIONS = { + ".ق ": [{ORTH: ".ق "}], + ".م": [{ORTH: ".م"}], + ".هـ": [{ORTH: ".هـ"}], + "ب.م": [{ORTH: "ب.م"}], + "ق.م": [{ORTH: "ق.م"}], + "آبرویت": [{ORTH: "آبروی", NORM: "آبروی"}, {ORTH: "ت", NORM: "ت"}], + "آب‌نباتش": [{ORTH: "آب‌نبات", NORM: "آب‌نبات"}, {ORTH: "ش", NORM: "ش"}], + "آثارش": [{ORTH: "آثار", NORM: "آثار"}, {ORTH: "ش", NORM: "ش"}], + "آخرش": [{ORTH: "آخر", NORM: "آخر"}, {ORTH: "ش", NORM: "ش"}], + "آدمهاست": [{ORTH: "آدمها", NORM: "آدمها"}, {ORTH: "ست", NORM: "ست"}], + "آرزومندیم": [{ORTH: 
"آرزومند", NORM: "آرزومند"}, {ORTH: "یم", NORM: "یم"}], + "آزادند": [{ORTH: "آزاد", NORM: "آزاد"}, {ORTH: "ند", NORM: "ند"}], + "آسیب‌پذیرند": [{ORTH: "آسیب‌پذیر", NORM: "آسیب‌پذیر"}, {ORTH: "ند", NORM: "ند"}], + "آفریده‌اند": [{ORTH: "آفریده‌", NORM: "آفریده‌"}, {ORTH: "اند", NORM: "اند"}], + "آمدنش": [{ORTH: "آمدن", NORM: "آمدن"}, {ORTH: "ش", NORM: "ش"}], + "آمریکاست": [{ORTH: "آمریکا", NORM: "آمریکا"}, {ORTH: "ست", NORM: "ست"}], + "آنجاست": [{ORTH: "آنجا", NORM: "آنجا"}, {ORTH: "ست", NORM: "ست"}], + "آنست": [{ORTH: "آن", NORM: "آن"}, {ORTH: "ست", NORM: "ست"}], + "آنند": [{ORTH: "آن", NORM: "آن"}, {ORTH: "ند", NORM: "ند"}], + "آن‌هاست": [{ORTH: "آن‌ها", NORM: "آن‌ها"}, {ORTH: "ست", NORM: "ست"}], + "آپاداناست": [{ORTH: "آپادانا", NORM: "آپادانا"}, {ORTH: "ست", NORM: "ست"}], + "اجتماعی‌مان": [{ORTH: "اجتماعی‌", NORM: "اجتماعی‌"}, {ORTH: "مان", NORM: "مان"}], + "اجدادت": [{ORTH: "اجداد", NORM: "اجداد"}, {ORTH: "ت", NORM: "ت"}], + "اجدادش": [{ORTH: "اجداد", NORM: "اجداد"}, {ORTH: "ش", NORM: "ش"}], + "اجدادی‌شان": [{ORTH: "اجدادی‌", NORM: "اجدادی‌"}, {ORTH: "شان", NORM: "شان"}], + "اجراست": [{ORTH: "اجرا", NORM: "اجرا"}, {ORTH: "ست", NORM: "ست"}], + "اختیارش": [{ORTH: "اختیار", NORM: "اختیار"}, {ORTH: "ش", NORM: "ش"}], + "اخلاقشان": [{ORTH: "اخلاق", NORM: "اخلاق"}, {ORTH: "شان", NORM: "شان"}], + "ادعایمان": [{ORTH: "ادعای", NORM: "ادعای"}, {ORTH: "مان", NORM: "مان"}], + "اذیتش": [{ORTH: "اذیت", NORM: "اذیت"}, {ORTH: "ش", NORM: "ش"}], + "اراده‌اش": [{ORTH: "اراده‌", NORM: "اراده‌"}, {ORTH: "اش", NORM: "اش"}], + "ارتباطش": [{ORTH: "ارتباط", NORM: "ارتباط"}, {ORTH: "ش", NORM: "ش"}], + "ارتباطمان": [{ORTH: "ارتباط", NORM: "ارتباط"}, {ORTH: "مان", NORM: "مان"}], + "ارزشهاست": [{ORTH: "ارزشها", NORM: "ارزشها"}, {ORTH: "ست", NORM: "ست"}], + "ارزی‌اش": [{ORTH: "ارزی‌", NORM: "ارزی‌"}, {ORTH: "اش", NORM: "اش"}], + "اره‌اش": [{ORTH: "اره‌", NORM: "اره‌"}, {ORTH: "اش", NORM: "اش"}], + "ازش": [{ORTH: "از", NORM: "از"}, {ORTH: "ش", NORM: "ش"}], + "ازین": [{ORTH: "از", NORM: "از"}, {ORTH: "ین", NORM: "ین"}], + "ازین‌هاست": [ + {ORTH: "از", NORM: "از"}, + {ORTH: "ین‌ها", NORM: "ین‌ها"}, + {ORTH: "ست", NORM: "ست"}, + ], + "استخوانند": [{ORTH: "استخوان", NORM: "استخوان"}, {ORTH: "ند", NORM: "ند"}], + "اسلامند": [{ORTH: "اسلام", NORM: "اسلام"}, {ORTH: "ند", NORM: "ند"}], + "اسلامی‌اند": [{ORTH: "اسلامی‌", NORM: "اسلامی‌"}, {ORTH: "اند", NORM: "اند"}], + "اسلحه‌هایشان": [ + {ORTH: "اسلحه‌های", NORM: "اسلحه‌های"}, + {ORTH: "شان", NORM: "شان"}, + ], + "اسمت": [{ORTH: "اسم", NORM: "اسم"}, {ORTH: "ت", NORM: "ت"}], + "اسمش": [{ORTH: "اسم", NORM: "اسم"}, {ORTH: "ش", NORM: "ش"}], + "اشتباهند": [{ORTH: "اشتباه", NORM: "اشتباه"}, {ORTH: "ند", NORM: "ند"}], + "اصلش": [{ORTH: "اصل", NORM: "اصل"}, {ORTH: "ش", NORM: "ش"}], + "اطاقش": [{ORTH: "اطاق", NORM: "اطاق"}, {ORTH: "ش", NORM: "ش"}], + "اعتقادند": [{ORTH: "اعتقاد", NORM: "اعتقاد"}, {ORTH: "ند", NORM: "ند"}], + "اعلایش": [{ORTH: "اعلای", NORM: "اعلای"}, {ORTH: "ش", NORM: "ش"}], + "افتراست": [{ORTH: "افترا", NORM: "افترا"}, {ORTH: "ست", NORM: "ست"}], + "افطارت": [{ORTH: "افطار", NORM: "افطار"}, {ORTH: "ت", NORM: "ت"}], + "اقوامش": [{ORTH: "اقوام", NORM: "اقوام"}, {ORTH: "ش", NORM: "ش"}], + "امروزیش": [{ORTH: "امروزی", NORM: "امروزی"}, {ORTH: "ش", NORM: "ش"}], + "اموالش": [{ORTH: "اموال", NORM: "اموال"}, {ORTH: "ش", NORM: "ش"}], + "امیدوارند": [{ORTH: "امیدوار", NORM: "امیدوار"}, {ORTH: "ند", NORM: "ند"}], + "امیدواریم": [{ORTH: "امیدوار", NORM: "امیدوار"}, {ORTH: "یم", NORM: "یم"}], + "انتخابهایم": [{ORTH: "انتخابها", NORM: "انتخابها"}, {ORTH: "یم", NORM: "یم"}], + 
"انتظارم": [{ORTH: "انتظار", NORM: "انتظار"}, {ORTH: "م", NORM: "م"}], + "انجمنم": [{ORTH: "انجمن", NORM: "انجمن"}, {ORTH: "م", NORM: "م"}], + "اندرش": [{ORTH: "اندر", NORM: "اندر"}, {ORTH: "ش", NORM: "ش"}], + "انشایش": [{ORTH: "انشای", NORM: "انشای"}, {ORTH: "ش", NORM: "ش"}], + "انگشتشان": [{ORTH: "انگشت", NORM: "انگشت"}, {ORTH: "شان", NORM: "شان"}], + "انگشتهایش": [{ORTH: "انگشتهای", NORM: "انگشتهای"}, {ORTH: "ش", NORM: "ش"}], + "اهمیتشان": [{ORTH: "اهمیت", NORM: "اهمیت"}, {ORTH: "شان", NORM: "شان"}], + "اهمیتند": [{ORTH: "اهمیت", NORM: "اهمیت"}, {ORTH: "ند", NORM: "ند"}], + "اوایلش": [{ORTH: "اوایل", NORM: "اوایل"}, {ORTH: "ش", NORM: "ش"}], + "اوست": [{ORTH: "او", NORM: "او"}, {ORTH: "ست", NORM: "ست"}], + "اولش": [{ORTH: "اول", NORM: "اول"}, {ORTH: "ش", NORM: "ش"}], + "اولشان": [{ORTH: "اول", NORM: "اول"}, {ORTH: "شان", NORM: "شان"}], + "اولم": [{ORTH: "اول", NORM: "اول"}, {ORTH: "م", NORM: "م"}], + "اکثرشان": [{ORTH: "اکثر", NORM: "اکثر"}, {ORTH: "شان", NORM: "شان"}], + "ایتالیاست": [{ORTH: "ایتالیا", NORM: "ایتالیا"}, {ORTH: "ست", NORM: "ست"}], + "ایرانی‌اش": [{ORTH: "ایرانی‌", NORM: "ایرانی‌"}, {ORTH: "اش", NORM: "اش"}], + "اینجاست": [{ORTH: "اینجا", NORM: "اینجا"}, {ORTH: "ست", NORM: "ست"}], + "این‌هاست": [{ORTH: "این‌ها", NORM: "این‌ها"}, {ORTH: "ست", NORM: "ست"}], + "بابات": [{ORTH: "بابا", NORM: "بابا"}, {ORTH: "ت", NORM: "ت"}], + "بارش": [{ORTH: "بار", NORM: "بار"}, {ORTH: "ش", NORM: "ش"}], + "بازیگرانش": [{ORTH: "بازیگران", NORM: "بازیگران"}, {ORTH: "ش", NORM: "ش"}], + "بازیگرمان": [{ORTH: "بازیگر", NORM: "بازیگر"}, {ORTH: "مان", NORM: "مان"}], + "بازیگرهایم": [{ORTH: "بازیگرها", NORM: "بازیگرها"}, {ORTH: "یم", NORM: "یم"}], + "بازی‌اش": [{ORTH: "بازی‌", NORM: "بازی‌"}, {ORTH: "اش", NORM: "اش"}], + "بالاست": [{ORTH: "بالا", NORM: "بالا"}, {ORTH: "ست", NORM: "ست"}], + "باورند": [{ORTH: "باور", NORM: "باور"}, {ORTH: "ند", NORM: "ند"}], + "بجاست": [{ORTH: "بجا", NORM: "بجا"}, {ORTH: "ست", NORM: "ست"}], + "بدان": [{ORTH: "ب", NORM: "ب"}, {ORTH: "دان", NORM: "دان"}], + "بدش": [{ORTH: "بد", NORM: "بد"}, {ORTH: "ش", NORM: "ش"}], + "بدشان": [{ORTH: "بد", NORM: "بد"}, {ORTH: "شان", NORM: "شان"}], + "بدنم": [{ORTH: "بدن", NORM: "بدن"}, {ORTH: "م", NORM: "م"}], + "بدهی‌ات": [{ORTH: "بدهی‌", NORM: "بدهی‌"}, {ORTH: "ات", NORM: "ات"}], + "بدین": [{ORTH: "ب", NORM: "ب"}, {ORTH: "دین", NORM: "دین"}], + "برابرش": [{ORTH: "برابر", NORM: "برابر"}, {ORTH: "ش", NORM: "ش"}], + "برادرت": [{ORTH: "برادر", NORM: "برادر"}, {ORTH: "ت", NORM: "ت"}], + "برادرش": [{ORTH: "برادر", NORM: "برادر"}, {ORTH: "ش", NORM: "ش"}], + "برایت": [{ORTH: "برای", NORM: "برای"}, {ORTH: "ت", NORM: "ت"}], + "برایتان": [{ORTH: "برای", NORM: "برای"}, {ORTH: "تان", NORM: "تان"}], + "برایش": [{ORTH: "برای", NORM: "برای"}, {ORTH: "ش", NORM: "ش"}], + "برایشان": [{ORTH: "برای", NORM: "برای"}, {ORTH: "شان", NORM: "شان"}], + "برایم": [{ORTH: "برای", NORM: "برای"}, {ORTH: "م", NORM: "م"}], + "برایمان": [{ORTH: "برای", NORM: "برای"}, {ORTH: "مان", NORM: "مان"}], + "برخوردارند": [{ORTH: "برخوردار", NORM: "برخوردار"}, {ORTH: "ند", NORM: "ند"}], + "برنامه‌سازهاست": [ + {ORTH: "برنامه‌سازها", NORM: "برنامه‌سازها"}, + {ORTH: "ست", NORM: "ست"}, + ], + "برهمش": [{ORTH: "برهم", NORM: "برهم"}, {ORTH: "ش", NORM: "ش"}], + "برهنه‌اش": [{ORTH: "برهنه‌", NORM: "برهنه‌"}, {ORTH: "اش", NORM: "اش"}], + "برگهایش": [{ORTH: "برگها", NORM: "برگها"}, {ORTH: "یش", NORM: "یش"}], + "برین": [{ORTH: "بر", NORM: "بر"}, {ORTH: "ین", NORM: "ین"}], + "بزرگش": [{ORTH: "بزرگ", NORM: "بزرگ"}, {ORTH: "ش", NORM: "ش"}], + "بزرگ‌تری": [{ORTH: "بزرگ‌تر", NORM: "بزرگ‌تر"}, 
{ORTH: "ی", NORM: "ی"}], + "بساطش": [{ORTH: "بساط", NORM: "بساط"}, {ORTH: "ش", NORM: "ش"}], + "بعدش": [{ORTH: "بعد", NORM: "بعد"}, {ORTH: "ش", NORM: "ش"}], + "بعضیهایشان": [{ORTH: "بعضیهای", NORM: "بعضیهای"}, {ORTH: "شان", NORM: "شان"}], + "بعضی‌شان": [{ORTH: "بعضی", NORM: "بعضی"}, {ORTH: "‌شان", NORM: "شان"}], + "بقیه‌اش": [{ORTH: "بقیه‌", NORM: "بقیه‌"}, {ORTH: "اش", NORM: "اش"}], + "بلندش": [{ORTH: "بلند", NORM: "بلند"}, {ORTH: "ش", NORM: "ش"}], + "بناگوشش": [{ORTH: "بناگوش", NORM: "بناگوش"}, {ORTH: "ش", NORM: "ش"}], + "بنظرم": [ + {ORTH: "ب", NORM: "ب"}, + {ORTH: "نظر", NORM: "نظر"}, + {ORTH: "م", NORM: "م"}, + ], + "بهت": [{ORTH: "به", NORM: "به"}, {ORTH: "ت", NORM: "ت"}], + "بهترش": [{ORTH: "بهتر", NORM: "بهتر"}, {ORTH: "ش", NORM: "ش"}], + "بهترم": [{ORTH: "بهتر", NORM: "بهتر"}, {ORTH: "م", NORM: "م"}], + "بهتری": [{ORTH: "بهتر", NORM: "بهتر"}, {ORTH: "ی", NORM: "ی"}], + "بهش": [{ORTH: "به", NORM: "به"}, {ORTH: "ش", NORM: "ش"}], + "به‌شان": [{ORTH: "به‌", NORM: "به‌"}, {ORTH: "شان", NORM: "شان"}], + "بودمش": [{ORTH: "بودم", NORM: "بودم"}, {ORTH: "ش", NORM: "ش"}], + "بودنش": [{ORTH: "بودن", NORM: "بودن"}, {ORTH: "ش", NORM: "ش"}], + "بودن‌شان": [{ORTH: "بودن‌", NORM: "بودن‌"}, {ORTH: "شان", NORM: "شان"}], + "بوستانش": [{ORTH: "بوستان", NORM: "بوستان"}, {ORTH: "ش", NORM: "ش"}], + "بویش": [{ORTH: "بو", NORM: "بو"}, {ORTH: "یش", NORM: "یش"}], + "بچه‌اش": [{ORTH: "بچه‌", NORM: "بچه‌"}, {ORTH: "اش", NORM: "اش"}], + "بچه‌م": [{ORTH: "بچه‌", NORM: "بچه‌"}, {ORTH: "م", NORM: "م"}], + "بچه‌هایش": [{ORTH: "بچه‌های", NORM: "بچه‌های"}, {ORTH: "ش", NORM: "ش"}], + "بیانیه‌شان": [{ORTH: "بیانیه‌", NORM: "بیانیه‌"}, {ORTH: "شان", NORM: "شان"}], + "بیدارم": [{ORTH: "بیدار", NORM: "بیدار"}, {ORTH: "م", NORM: "م"}], + "بیناتری": [{ORTH: "بیناتر", NORM: "بیناتر"}, {ORTH: "ی", NORM: "ی"}], + "بی‌اطلاعند": [{ORTH: "بی‌اطلاع", NORM: "بی‌اطلاع"}, {ORTH: "ند", NORM: "ند"}], + "بی‌اطلاعید": [{ORTH: "بی‌اطلاع", NORM: "بی‌اطلاع"}, {ORTH: "ید", NORM: "ید"}], + "بی‌بهره‌اند": [{ORTH: "بی‌بهره‌", NORM: "بی‌بهره‌"}, {ORTH: "اند", NORM: "اند"}], + "بی‌تفاوتند": [{ORTH: "بی‌تفاوت", NORM: "بی‌تفاوت"}, {ORTH: "ند", NORM: "ند"}], + "بی‌حسابش": [{ORTH: "بی‌حساب", NORM: "بی‌حساب"}, {ORTH: "ش", NORM: "ش"}], + "بی‌نیش": [{ORTH: "بی‌نی", NORM: "بی‌نی"}, {ORTH: "ش", NORM: "ش"}], + "تجربه‌هایم": [{ORTH: "تجربه‌ها", NORM: "تجربه‌ها"}, {ORTH: "یم", NORM: "یم"}], + "تحریم‌هاست": [{ORTH: "تحریم‌ها", NORM: "تحریم‌ها"}, {ORTH: "ست", NORM: "ست"}], + "تحولند": [{ORTH: "تحول", NORM: "تحول"}, {ORTH: "ند", NORM: "ند"}], + "تخیلی‌اش": [{ORTH: "تخیلی‌", NORM: "تخیلی‌"}, {ORTH: "اش", NORM: "اش"}], + "ترا": [{ORTH: "ت", NORM: "ت"}, {ORTH: "را", NORM: "را"}], + "ترسشان": [{ORTH: "ترس", NORM: "ترس"}, {ORTH: "شان", NORM: "شان"}], + "ترکش": [{ORTH: "ترک", NORM: "ترک"}, {ORTH: "ش", NORM: "ش"}], + "تشنه‌ت": [{ORTH: "تشنه‌", NORM: "تشنه‌"}, {ORTH: "ت", NORM: "ت"}], + "تشکیلاتی‌اش": [{ORTH: "تشکیلاتی‌", NORM: "تشکیلاتی‌"}, {ORTH: "اش", NORM: "اش"}], + "تعلقش": [{ORTH: "تعلق", NORM: "تعلق"}, {ORTH: "ش", NORM: "ش"}], + "تلاششان": [{ORTH: "تلاش", NORM: "تلاش"}, {ORTH: "شان", NORM: "شان"}], + "تلاشمان": [{ORTH: "تلاش", NORM: "تلاش"}, {ORTH: "مان", NORM: "مان"}], + "تماشاگرش": [{ORTH: "تماشاگر", NORM: "تماشاگر"}, {ORTH: "ش", NORM: "ش"}], + "تمامشان": [{ORTH: "تمام", NORM: "تمام"}, {ORTH: "شان", NORM: "شان"}], + "تنش": [{ORTH: "تن", NORM: "تن"}, {ORTH: "ش", NORM: "ش"}], + "تنمان": [{ORTH: "تن", NORM: "تن"}, {ORTH: "مان", NORM: "مان"}], + "تنهایی‌اش": [{ORTH: "تنهایی‌", NORM: "تنهایی‌"}, {ORTH: "اش", NORM: "اش"}], + "توانایی‌اش": [{ORTH: "توانایی‌", NORM: 
"توانایی‌"}, {ORTH: "اش", NORM: "اش"}], + "توجهش": [{ORTH: "توجه", NORM: "توجه"}, {ORTH: "ش", NORM: "ش"}], + "توست": [{ORTH: "تو", NORM: "تو"}, {ORTH: "ست", NORM: "ست"}], + "توصیه‌اش": [{ORTH: "توصیه‌", NORM: "توصیه‌"}, {ORTH: "اش", NORM: "اش"}], + "تیغه‌اش": [{ORTH: "تیغه‌", NORM: "تیغه‌"}, {ORTH: "اش", NORM: "اش"}], + "جاست": [{ORTH: "جا", NORM: "جا"}, {ORTH: "ست", NORM: "ست"}], + "جامعه‌اند": [{ORTH: "جامعه‌", NORM: "جامعه‌"}, {ORTH: "اند", NORM: "اند"}], + "جانم": [{ORTH: "جان", NORM: "جان"}, {ORTH: "م", NORM: "م"}], + "جایش": [{ORTH: "جای", NORM: "جای"}, {ORTH: "ش", NORM: "ش"}], + "جایشان": [{ORTH: "جای", NORM: "جای"}, {ORTH: "شان", NORM: "شان"}], + "جدیدش": [{ORTH: "جدید", NORM: "جدید"}, {ORTH: "ش", NORM: "ش"}], + "جرمزاست": [{ORTH: "جرمزا", NORM: "جرمزا"}, {ORTH: "ست", NORM: "ست"}], + "جلوست": [{ORTH: "جلو", NORM: "جلو"}, {ORTH: "ست", NORM: "ست"}], + "جلویش": [{ORTH: "جلوی", NORM: "جلوی"}, {ORTH: "ش", NORM: "ش"}], + "جمهوریست": [{ORTH: "جمهوری", NORM: "جمهوری"}, {ORTH: "ست", NORM: "ست"}], + "جنسش": [{ORTH: "جنس", NORM: "جنس"}, {ORTH: "ش", NORM: "ش"}], + "جنس‌اند": [{ORTH: "جنس‌", NORM: "جنس‌"}, {ORTH: "اند", NORM: "اند"}], + "جوانانش": [{ORTH: "جوانان", NORM: "جوانان"}, {ORTH: "ش", NORM: "ش"}], + "جویش": [{ORTH: "جوی", NORM: "جوی"}, {ORTH: "ش", NORM: "ش"}], + "جگرش": [{ORTH: "جگر", NORM: "جگر"}, {ORTH: "ش", NORM: "ش"}], + "حاضرم": [{ORTH: "حاضر", NORM: "حاضر"}, {ORTH: "م", NORM: "م"}], + "حالتهایشان": [{ORTH: "حالتهای", NORM: "حالتهای"}, {ORTH: "شان", NORM: "شان"}], + "حالیست": [{ORTH: "حالی", NORM: "حالی"}, {ORTH: "ست", NORM: "ست"}], + "حالی‌مان": [{ORTH: "حالی‌", NORM: "حالی‌"}, {ORTH: "مان", NORM: "مان"}], + "حاکیست": [{ORTH: "حاکی", NORM: "حاکی"}, {ORTH: "ست", NORM: "ست"}], + "حرامزادگی‌اش": [ + {ORTH: "حرامزادگی‌", NORM: "حرامزادگی‌"}, + {ORTH: "اش", NORM: "اش"}, + ], + "حرفتان": [{ORTH: "حرف", NORM: "حرف"}, {ORTH: "تان", NORM: "تان"}], + "حرفش": [{ORTH: "حرف", NORM: "حرف"}, {ORTH: "ش", NORM: "ش"}], + "حرفشان": [{ORTH: "حرف", NORM: "حرف"}, {ORTH: "شان", NORM: "شان"}], + "حرفم": [{ORTH: "حرف", NORM: "حرف"}, {ORTH: "م", NORM: "م"}], + "حرف‌های‌شان": [{ORTH: "حرف‌های‌", NORM: "حرف‌های‌"}, {ORTH: "شان", NORM: "شان"}], + "حرکتمان": [{ORTH: "حرکت", NORM: "حرکت"}, {ORTH: "مان", NORM: "مان"}], + "حریفانشان": [{ORTH: "حریفان", NORM: "حریفان"}, {ORTH: "شان", NORM: "شان"}], + "حضورشان": [{ORTH: "حضور", NORM: "حضور"}, {ORTH: "شان", NORM: "شان"}], + "حمایتش": [{ORTH: "حمایت", NORM: "حمایت"}, {ORTH: "ش", NORM: "ش"}], + "حواسش": [{ORTH: "حواس", NORM: "حواس"}, {ORTH: "ش", NORM: "ش"}], + "حواسشان": [{ORTH: "حواس", NORM: "حواس"}, {ORTH: "شان", NORM: "شان"}], + "حوصله‌مان": [{ORTH: "حوصله‌", NORM: "حوصله‌"}, {ORTH: "مان", NORM: "مان"}], + "حکومتش": [{ORTH: "حکومت", NORM: "حکومت"}, {ORTH: "ش", NORM: "ش"}], + "حکومتشان": [{ORTH: "حکومت", NORM: "حکومت"}, {ORTH: "شان", NORM: "شان"}], + "حیفم": [{ORTH: "حیف", NORM: "حیف"}, {ORTH: "م", NORM: "م"}], + "خاندانش": [{ORTH: "خاندان", NORM: "خاندان"}, {ORTH: "ش", NORM: "ش"}], + "خانه‌اش": [{ORTH: "خانه‌", NORM: "خانه‌"}, {ORTH: "اش", NORM: "اش"}], + "خانه‌شان": [{ORTH: "خانه‌", NORM: "خانه‌"}, {ORTH: "شان", NORM: "شان"}], + "خانه‌مان": [{ORTH: "خانه‌", NORM: "خانه‌"}, {ORTH: "مان", NORM: "مان"}], + "خانه‌هایشان": [{ORTH: "خانه‌های", NORM: "خانه‌های"}, {ORTH: "شان", NORM: "شان"}], + "خانواده‌ات": [{ORTH: "خانواده", NORM: "خانواده"}, {ORTH: "‌ات", NORM: "ات"}], + "خانواده‌اش": [{ORTH: "خانواده‌", NORM: "خانواده‌"}, {ORTH: "اش", NORM: "اش"}], + "خانواده‌ام": [{ORTH: "خانواده‌", NORM: "خانواده‌"}, {ORTH: "ام", NORM: "ام"}], + "خانواده‌شان": [{ORTH: "خانواده‌", 
NORM: "خانواده‌"}, {ORTH: "شان", NORM: "شان"}], + "خداست": [{ORTH: "خدا", NORM: "خدا"}, {ORTH: "ست", NORM: "ست"}], + "خدایش": [{ORTH: "خدا", NORM: "خدا"}, {ORTH: "یش", NORM: "یش"}], + "خدایشان": [{ORTH: "خدای", NORM: "خدای"}, {ORTH: "شان", NORM: "شان"}], + "خردسالش": [{ORTH: "خردسال", NORM: "خردسال"}, {ORTH: "ش", NORM: "ش"}], + "خروپفشان": [{ORTH: "خروپف", NORM: "خروپف"}, {ORTH: "شان", NORM: "شان"}], + "خسته‌ای": [{ORTH: "خسته‌", NORM: "خسته‌"}, {ORTH: "ای", NORM: "ای"}], + "خطت": [{ORTH: "خط", NORM: "خط"}, {ORTH: "ت", NORM: "ت"}], + "خوابمان": [{ORTH: "خواب", NORM: "خواب"}, {ORTH: "مان", NORM: "مان"}], + "خواندنش": [{ORTH: "خواندن", NORM: "خواندن"}, {ORTH: "ش", NORM: "ش"}], + "خواهرش": [{ORTH: "خواهر", NORM: "خواهر"}, {ORTH: "ش", NORM: "ش"}], + "خوبش": [{ORTH: "خوب", NORM: "خوب"}, {ORTH: "ش", NORM: "ش"}], + "خودت": [{ORTH: "خود", NORM: "خود"}, {ORTH: "ت", NORM: "ت"}], + "خودتان": [{ORTH: "خود", NORM: "خود"}, {ORTH: "تان", NORM: "تان"}], + "خودش": [{ORTH: "خود", NORM: "خود"}, {ORTH: "ش", NORM: "ش"}], + "خودشان": [{ORTH: "خود", NORM: "خود"}, {ORTH: "شان", NORM: "شان"}], + "خودمان": [{ORTH: "خود", NORM: "خود"}, {ORTH: "مان", NORM: "مان"}], + "خوردمان": [{ORTH: "خورد", NORM: "خورد"}, {ORTH: "مان", NORM: "مان"}], + "خوردنشان": [{ORTH: "خوردن", NORM: "خوردن"}, {ORTH: "شان", NORM: "شان"}], + "خوشش": [{ORTH: "خوش", NORM: "خوش"}, {ORTH: "ش", NORM: "ش"}], + "خوشوقتم": [{ORTH: "خوشوقت", NORM: "خوشوقت"}, {ORTH: "م", NORM: "م"}], + "خونشان": [{ORTH: "خون", NORM: "خون"}, {ORTH: "شان", NORM: "شان"}], + "خویش": [{ORTH: "خوی", NORM: "خوی"}, {ORTH: "ش", NORM: "ش"}], + "خویشتنم": [{ORTH: "خویشتن", NORM: "خویشتن"}, {ORTH: "م", NORM: "م"}], + "خیالش": [{ORTH: "خیال", NORM: "خیال"}, {ORTH: "ش", NORM: "ش"}], + "خیسش": [{ORTH: "خیس", NORM: "خیس"}, {ORTH: "ش", NORM: "ش"}], + "داراست": [{ORTH: "دارا", NORM: "دارا"}, {ORTH: "ست", NORM: "ست"}], + "داستانهایش": [{ORTH: "داستانهای", NORM: "داستانهای"}, {ORTH: "ش", NORM: "ش"}], + "دخترمان": [{ORTH: "دختر", NORM: "دختر"}, {ORTH: "مان", NORM: "مان"}], + "دخیلند": [{ORTH: "دخیل", NORM: "دخیل"}, {ORTH: "ند", NORM: "ند"}], + "درباره‌ات": [{ORTH: "درباره", NORM: "درباره"}, {ORTH: "‌ات", NORM: "ات"}], + "درباره‌اش": [{ORTH: "درباره‌", NORM: "درباره‌"}, {ORTH: "اش", NORM: "اش"}], + "دردش": [{ORTH: "درد", NORM: "درد"}, {ORTH: "ش", NORM: "ش"}], + "دردشان": [{ORTH: "درد", NORM: "درد"}, {ORTH: "شان", NORM: "شان"}], + "درسته": [{ORTH: "درست", NORM: "درست"}, {ORTH: "ه", NORM: "ه"}], + "درش": [{ORTH: "در", NORM: "در"}, {ORTH: "ش", NORM: "ش"}], + "درون‌شان": [{ORTH: "درون‌", NORM: "درون‌"}, {ORTH: "شان", NORM: "شان"}], + "درین": [{ORTH: "در", NORM: "در"}, {ORTH: "ین", NORM: "ین"}], + "دریچه‌هایش": [{ORTH: "دریچه‌های", NORM: "دریچه‌های"}, {ORTH: "ش", NORM: "ش"}], + "دزدانش": [{ORTH: "دزدان", NORM: "دزدان"}, {ORTH: "ش", NORM: "ش"}], + "دستت": [{ORTH: "دست", NORM: "دست"}, {ORTH: "ت", NORM: "ت"}], + "دستش": [{ORTH: "دست", NORM: "دست"}, {ORTH: "ش", NORM: "ش"}], + "دستمان": [{ORTH: "دست", NORM: "دست"}, {ORTH: "مان", NORM: "مان"}], + "دستهایشان": [{ORTH: "دستهای", NORM: "دستهای"}, {ORTH: "شان", NORM: "شان"}], + "دست‌یافتنی‌ست": [ + {ORTH: "دست‌یافتنی‌", NORM: "دست‌یافتنی‌"}, + {ORTH: "ست", NORM: "ست"}, + ], + "دشمنند": [{ORTH: "دشمن", NORM: "دشمن"}, {ORTH: "ند", NORM: "ند"}], + "دشمنیشان": [{ORTH: "دشمنی", NORM: "دشمنی"}, {ORTH: "شان", NORM: "شان"}], + "دشمنیم": [{ORTH: "دشمن", NORM: "دشمن"}, {ORTH: "یم", NORM: "یم"}], + "دفترش": [{ORTH: "دفتر", NORM: "دفتر"}, {ORTH: "ش", NORM: "ش"}], + "دفنشان": [{ORTH: "دفن", NORM: "دفن"}, {ORTH: "شان", NORM: "شان"}], + "دلت": [{ORTH: "دل", NORM: "دل"}, 
{ORTH: "ت", NORM: "ت"}], + "دلش": [{ORTH: "دل", NORM: "دل"}, {ORTH: "ش", NORM: "ش"}], + "دلشان": [{ORTH: "دل", NORM: "دل"}, {ORTH: "شان", NORM: "شان"}], + "دلم": [{ORTH: "دل", NORM: "دل"}, {ORTH: "م", NORM: "م"}], + "دلیلش": [{ORTH: "دلیل", NORM: "دلیل"}, {ORTH: "ش", NORM: "ش"}], + "دنبالش": [{ORTH: "دنبال", NORM: "دنبال"}, {ORTH: "ش", NORM: "ش"}], + "دنباله‌اش": [{ORTH: "دنباله‌", NORM: "دنباله‌"}, {ORTH: "اش", NORM: "اش"}], + "دهاتی‌هایش": [{ORTH: "دهاتی‌های", NORM: "دهاتی‌های"}, {ORTH: "ش", NORM: "ش"}], + "دهانت": [{ORTH: "دهان", NORM: "دهان"}, {ORTH: "ت", NORM: "ت"}], + "دهنش": [{ORTH: "دهن", NORM: "دهن"}, {ORTH: "ش", NORM: "ش"}], + "دورش": [{ORTH: "دور", NORM: "دور"}, {ORTH: "ش", NORM: "ش"}], + "دوروبریهاشان": [ + {ORTH: "دوروبریها", NORM: "دوروبریها"}, + {ORTH: "شان", NORM: "شان"}, + ], + "دوستانش": [{ORTH: "دوستان", NORM: "دوستان"}, {ORTH: "ش", NORM: "ش"}], + "دوستانشان": [{ORTH: "دوستان", NORM: "دوستان"}, {ORTH: "شان", NORM: "شان"}], + "دوستت": [{ORTH: "دوست", NORM: "دوست"}, {ORTH: "ت", NORM: "ت"}], + "دوستش": [{ORTH: "دوست", NORM: "دوست"}, {ORTH: "ش", NORM: "ش"}], + "دومش": [{ORTH: "دوم", NORM: "دوم"}, {ORTH: "ش", NORM: "ش"}], + "دویدنش": [{ORTH: "دویدن", NORM: "دویدن"}, {ORTH: "ش", NORM: "ش"}], + "دکورهایمان": [{ORTH: "دکورهای", NORM: "دکورهای"}, {ORTH: "مان", NORM: "مان"}], + "دیدگاهش": [{ORTH: "دیدگاه", NORM: "دیدگاه"}, {ORTH: "ش", NORM: "ش"}], + "دیرت": [{ORTH: "دیر", NORM: "دیر"}, {ORTH: "ت", NORM: "ت"}], + "دیرم": [{ORTH: "دیر", NORM: "دیر"}, {ORTH: "م", NORM: "م"}], + "دینت": [{ORTH: "دین", NORM: "دین"}, {ORTH: "ت", NORM: "ت"}], + "دینش": [{ORTH: "دین", NORM: "دین"}, {ORTH: "ش", NORM: "ش"}], + "دین‌شان": [{ORTH: "دین‌", NORM: "دین‌"}, {ORTH: "شان", NORM: "شان"}], + "دیواره‌هایش": [{ORTH: "دیواره‌های", NORM: "دیواره‌های"}, {ORTH: "ش", NORM: "ش"}], + "دیوانه‌ای": [{ORTH: "دیوانه‌", NORM: "دیوانه‌"}, {ORTH: "ای", NORM: "ای"}], + "دیوی": [{ORTH: "دیو", NORM: "دیو"}, {ORTH: "ی", NORM: "ی"}], + "دیگرم": [{ORTH: "دیگر", NORM: "دیگر"}, {ORTH: "م", NORM: "م"}], + "دیگرمان": [{ORTH: "دیگر", NORM: "دیگر"}, {ORTH: "مان", NORM: "مان"}], + "ذهنش": [{ORTH: "ذهن", NORM: "ذهن"}, {ORTH: "ش", NORM: "ش"}], + "ذهنشان": [{ORTH: "ذهن", NORM: "ذهن"}, {ORTH: "شان", NORM: "شان"}], + "ذهنم": [{ORTH: "ذهن", NORM: "ذهن"}, {ORTH: "م", NORM: "م"}], + "رئوسش": [{ORTH: "رئوس", NORM: "رئوس"}, {ORTH: "ش", NORM: "ش"}], + "راهشان": [{ORTH: "راه", NORM: "راه"}, {ORTH: "شان", NORM: "شان"}], + "راهگشاست": [{ORTH: "راهگشا", NORM: "راهگشا"}, {ORTH: "ست", NORM: "ست"}], + "رایانه‌هایشان": [ + {ORTH: "رایانه‌های", NORM: "رایانه‌های"}, + {ORTH: "شان", NORM: "شان"}, + ], + "رعایتشان": [{ORTH: "رعایت", NORM: "رعایت"}, {ORTH: "شان", NORM: "شان"}], + "رفتارش": [{ORTH: "رفتار", NORM: "رفتار"}, {ORTH: "ش", NORM: "ش"}], + "رفتارشان": [{ORTH: "رفتار", NORM: "رفتار"}, {ORTH: "شان", NORM: "شان"}], + "رفتارمان": [{ORTH: "رفتار", NORM: "رفتار"}, {ORTH: "مان", NORM: "مان"}], + "رفتارهاست": [{ORTH: "رفتارها", NORM: "رفتارها"}, {ORTH: "ست", NORM: "ست"}], + "رفتارهایشان": [{ORTH: "رفتارهای", NORM: "رفتارهای"}, {ORTH: "شان", NORM: "شان"}], + "رفقایم": [{ORTH: "رفقا", NORM: "رفقا"}, {ORTH: "یم", NORM: "یم"}], + "رقیق‌ترش": [{ORTH: "رقیق‌تر", NORM: "رقیق‌تر"}, {ORTH: "ش", NORM: "ش"}], + "رنجند": [{ORTH: "رنج", NORM: "رنج"}, {ORTH: "ند", NORM: "ند"}], + "رهگشاست": [{ORTH: "رهگشا", NORM: "رهگشا"}, {ORTH: "ست", NORM: "ست"}], + "رواست": [{ORTH: "روا", NORM: "روا"}, {ORTH: "ست", NORM: "ست"}], + "روبروست": [{ORTH: "روبرو", NORM: "روبرو"}, {ORTH: "ست", NORM: "ست"}], + "روحی‌اش": [{ORTH: "روحی‌", NORM: "روحی‌"}, {ORTH: "اش", NORM: "اش"}], + 
"روزنامه‌اش": [{ORTH: "روزنامه‌", NORM: "روزنامه‌"}, {ORTH: "اش", NORM: "اش"}], + "روزه‌ست": [{ORTH: "روزه‌", NORM: "روزه‌"}, {ORTH: "ست", NORM: "ست"}], + "روسری‌اش": [{ORTH: "روسری‌", NORM: "روسری‌"}, {ORTH: "اش", NORM: "اش"}], + "روشتان": [{ORTH: "روش", NORM: "روش"}, {ORTH: "تان", NORM: "تان"}], + "رویش": [{ORTH: "روی", NORM: "روی"}, {ORTH: "ش", NORM: "ش"}], + "زبانش": [{ORTH: "زبان", NORM: "زبان"}, {ORTH: "ش", NORM: "ش"}], + "زحماتشان": [{ORTH: "زحمات", NORM: "زحمات"}, {ORTH: "شان", NORM: "شان"}], + "زدنهایشان": [{ORTH: "زدنهای", NORM: "زدنهای"}, {ORTH: "شان", NORM: "شان"}], + "زرنگشان": [{ORTH: "زرنگ", NORM: "زرنگ"}, {ORTH: "شان", NORM: "شان"}], + "زشتش": [{ORTH: "زشت", NORM: "زشت"}, {ORTH: "ش", NORM: "ش"}], + "زشتکارانند": [{ORTH: "زشتکاران", NORM: "زشتکاران"}, {ORTH: "ند", NORM: "ند"}], + "زلفش": [{ORTH: "زلف", NORM: "زلف"}, {ORTH: "ش", NORM: "ش"}], + "زمن": [{ORTH: "ز", NORM: "ز"}, {ORTH: "من", NORM: "من"}], + "زنبوری‌اش": [{ORTH: "زنبوری‌", NORM: "زنبوری‌"}, {ORTH: "اش", NORM: "اش"}], + "زندانم": [{ORTH: "زندان", NORM: "زندان"}, {ORTH: "م", NORM: "م"}], + "زنده‌ام": [{ORTH: "زنده‌", NORM: "زنده‌"}, {ORTH: "ام", NORM: "ام"}], + "زندگانی‌اش": [{ORTH: "زندگانی‌", NORM: "زندگانی‌"}, {ORTH: "اش", NORM: "اش"}], + "زندگی‌اش": [{ORTH: "زندگی‌", NORM: "زندگی‌"}, {ORTH: "اش", NORM: "اش"}], + "زندگی‌ام": [{ORTH: "زندگی‌", NORM: "زندگی‌"}, {ORTH: "ام", NORM: "ام"}], + "زندگی‌شان": [{ORTH: "زندگی‌", NORM: "زندگی‌"}, {ORTH: "شان", NORM: "شان"}], + "زنش": [{ORTH: "زن", NORM: "زن"}, {ORTH: "ش", NORM: "ش"}], + "زنند": [{ORTH: "زن", NORM: "زن"}, {ORTH: "ند", NORM: "ند"}], + "زو": [{ORTH: "ز", NORM: "ز"}, {ORTH: "و", NORM: "و"}], + "زیاده": [{ORTH: "زیاد", NORM: "زیاد"}, {ORTH: "ه", NORM: "ه"}], + "زیباست": [{ORTH: "زیبا", NORM: "زیبا"}, {ORTH: "ست", NORM: "ست"}], + "زیبایش": [{ORTH: "زیبای", NORM: "زیبای"}, {ORTH: "ش", NORM: "ش"}], + "زیبایی": [{ORTH: "زیبای", NORM: "زیبای"}, {ORTH: "ی", NORM: "ی"}], + "زیربناست": [{ORTH: "زیربنا", NORM: "زیربنا"}, {ORTH: "ست", NORM: "ست"}], + "زیرک‌اند": [{ORTH: "زیرک‌", NORM: "زیرک‌"}, {ORTH: "اند", NORM: "اند"}], + "سؤالتان": [{ORTH: "سؤال", NORM: "سؤال"}, {ORTH: "تان", NORM: "تان"}], + "سؤالم": [{ORTH: "سؤال", NORM: "سؤال"}, {ORTH: "م", NORM: "م"}], + "سابقه‌اش": [{ORTH: "سابقه‌", NORM: "سابقه‌"}, {ORTH: "اش", NORM: "اش"}], + "ساختنم": [{ORTH: "ساختن", NORM: "ساختن"}, {ORTH: "م", NORM: "م"}], + "ساده‌اش": [{ORTH: "ساده‌", NORM: "ساده‌"}, {ORTH: "اش", NORM: "اش"}], + "ساده‌اند": [{ORTH: "ساده‌", NORM: "ساده‌"}, {ORTH: "اند", NORM: "اند"}], + "سازمانش": [{ORTH: "سازمان", NORM: "سازمان"}, {ORTH: "ش", NORM: "ش"}], + "ساعتم": [{ORTH: "ساعت", NORM: "ساعت"}, {ORTH: "م", NORM: "م"}], + "سالته": [ + {ORTH: "سال", NORM: "سال"}, + {ORTH: "ت", NORM: "ت"}, + {ORTH: "ه", NORM: "ه"}, + ], + "سالش": [{ORTH: "سال", NORM: "سال"}, {ORTH: "ش", NORM: "ش"}], + "سالهاست": [{ORTH: "سالها", NORM: "سالها"}, {ORTH: "ست", NORM: "ست"}], + "ساله‌اش": [{ORTH: "ساله‌", NORM: "ساله‌"}, {ORTH: "اش", NORM: "اش"}], + "ساکتند": [{ORTH: "ساکت", NORM: "ساکت"}, {ORTH: "ند", NORM: "ند"}], + "ساکنند": [{ORTH: "ساکن", NORM: "ساکن"}, {ORTH: "ند", NORM: "ند"}], + "سبزشان": [{ORTH: "سبز", NORM: "سبز"}, {ORTH: "شان", NORM: "شان"}], + "سبیل‌مان": [{ORTH: "سبیل‌", NORM: "سبیل‌"}, {ORTH: "مان", NORM: "مان"}], + "ستم‌هایش": [{ORTH: "ستم‌های", NORM: "ستم‌های"}, {ORTH: "ش", NORM: "ش"}], + "سخنانش": [{ORTH: "سخنان", NORM: "سخنان"}, {ORTH: "ش", NORM: "ش"}], + "سخنانشان": [{ORTH: "سخنان", NORM: "سخنان"}, {ORTH: "شان", NORM: "شان"}], + "سخنتان": [{ORTH: "سخن", NORM: "سخن"}, {ORTH: "تان", NORM: "تان"}], + "سخنش": 
[{ORTH: "سخن", NORM: "سخن"}, {ORTH: "ش", NORM: "ش"}], + "سخنم": [{ORTH: "سخن", NORM: "سخن"}, {ORTH: "م", NORM: "م"}], + "سردش": [{ORTH: "سرد", NORM: "سرد"}, {ORTH: "ش", NORM: "ش"}], + "سرزمینشان": [{ORTH: "سرزمین", NORM: "سرزمین"}, {ORTH: "شان", NORM: "شان"}], + "سرش": [{ORTH: "سر", NORM: "سر"}, {ORTH: "ش", NORM: "ش"}], + "سرمایه‌دارهاست": [ + {ORTH: "سرمایه‌دارها", NORM: "سرمایه‌دارها"}, + {ORTH: "ست", NORM: "ست"}, + ], + "سرنوشتش": [{ORTH: "سرنوشت", NORM: "سرنوشت"}, {ORTH: "ش", NORM: "ش"}], + "سرنوشتشان": [{ORTH: "سرنوشت", NORM: "سرنوشت"}, {ORTH: "شان", NORM: "شان"}], + "سروتهش": [{ORTH: "سروته", NORM: "سروته"}, {ORTH: "ش", NORM: "ش"}], + "سرچشمه‌اش": [{ORTH: "سرچشمه‌", NORM: "سرچشمه‌"}, {ORTH: "اش", NORM: "اش"}], + "سقمش": [{ORTH: "سقم", NORM: "سقم"}, {ORTH: "ش", NORM: "ش"}], + "سنش": [{ORTH: "سن", NORM: "سن"}, {ORTH: "ش", NORM: "ش"}], + "سپاهش": [{ORTH: "سپاه", NORM: "سپاه"}, {ORTH: "ش", NORM: "ش"}], + "سیاسیشان": [{ORTH: "سیاسی", NORM: "سیاسی"}, {ORTH: "شان", NORM: "شان"}], + "سیاه‌چاله‌هاست": [ + {ORTH: "سیاه‌چاله‌ها", NORM: "سیاه‌چاله‌ها"}, + {ORTH: "ست", NORM: "ست"}, + ], + "شاخه‌هایشان": [{ORTH: "شاخه‌های", NORM: "شاخه‌های"}, {ORTH: "شان", NORM: "شان"}], + "شالوده‌اش": [{ORTH: "شالوده‌", NORM: "شالوده‌"}, {ORTH: "اش", NORM: "اش"}], + "شانه‌هایش": [{ORTH: "شانه‌های", NORM: "شانه‌های"}, {ORTH: "ش", NORM: "ش"}], + "شاهدیم": [{ORTH: "شاهد", NORM: "شاهد"}, {ORTH: "یم", NORM: "یم"}], + "شاهکارهایش": [{ORTH: "شاهکارهای", NORM: "شاهکارهای"}, {ORTH: "ش", NORM: "ش"}], + "شخصیتش": [{ORTH: "شخصیت", NORM: "شخصیت"}, {ORTH: "ش", NORM: "ش"}], + "شدنشان": [{ORTH: "شدن", NORM: "شدن"}, {ORTH: "شان", NORM: "شان"}], + "شرکتیست": [{ORTH: "شرکتی", NORM: "شرکتی"}, {ORTH: "ست", NORM: "ست"}], + "شعارهاشان": [{ORTH: "شعارها", NORM: "شعارها"}, {ORTH: "شان", NORM: "شان"}], + "شعورش": [{ORTH: "شعور", NORM: "شعور"}, {ORTH: "ش", NORM: "ش"}], + "شغلش": [{ORTH: "شغل", NORM: "شغل"}, {ORTH: "ش", NORM: "ش"}], + "شماست": [{ORTH: "شما", NORM: "شما"}, {ORTH: "ست", NORM: "ست"}], + "شمشیرش": [{ORTH: "شمشیر", NORM: "شمشیر"}, {ORTH: "ش", NORM: "ش"}], + "شنیدنش": [{ORTH: "شنیدن", NORM: "شنیدن"}, {ORTH: "ش", NORM: "ش"}], + "شوراست": [{ORTH: "شورا", NORM: "شورا"}, {ORTH: "ست", NORM: "ست"}], + "شومت": [{ORTH: "شوم", NORM: "شوم"}, {ORTH: "ت", NORM: "ت"}], + "شیرینترش": [{ORTH: "شیرینتر", NORM: "شیرینتر"}, {ORTH: "ش", NORM: "ش"}], + "شیطان‌اند": [{ORTH: "شیطان‌", NORM: "شیطان‌"}, {ORTH: "اند", NORM: "اند"}], + "شیوه‌هاست": [{ORTH: "شیوه‌ها", NORM: "شیوه‌ها"}, {ORTH: "ست", NORM: "ست"}], + "صاحبش": [{ORTH: "صاحب", NORM: "صاحب"}, {ORTH: "ش", NORM: "ش"}], + "صحنه‌اش": [{ORTH: "صحنه‌", NORM: "صحنه‌"}, {ORTH: "اش", NORM: "اش"}], + "صدایش": [{ORTH: "صدای", NORM: "صدای"}, {ORTH: "ش", NORM: "ش"}], + "صددند": [{ORTH: "صدد", NORM: "صدد"}, {ORTH: "ند", NORM: "ند"}], + "صندوق‌هاست": [{ORTH: "صندوق‌ها", NORM: "صندوق‌ها"}, {ORTH: "ست", NORM: "ست"}], + "صندوق‌هایش": [{ORTH: "صندوق‌های", NORM: "صندوق‌های"}, {ORTH: "ش", NORM: "ش"}], + "صورتش": [{ORTH: "صورت", NORM: "صورت"}, {ORTH: "ش", NORM: "ش"}], + "ضروری‌اند": [{ORTH: "ضروری‌", NORM: "ضروری‌"}, {ORTH: "اند", NORM: "اند"}], + "ضمیرش": [{ORTH: "ضمیر", NORM: "ضمیر"}, {ORTH: "ش", NORM: "ش"}], + "طرفش": [{ORTH: "طرف", NORM: "طرف"}, {ORTH: "ش", NORM: "ش"}], + "طلسمش": [{ORTH: "طلسم", NORM: "طلسم"}, {ORTH: "ش", NORM: "ش"}], + "طوره": [{ORTH: "طور", NORM: "طور"}, {ORTH: "ه", NORM: "ه"}], + "عاشوراست": [{ORTH: "عاشورا", NORM: "عاشورا"}, {ORTH: "ست", NORM: "ست"}], + "عبارتند": [{ORTH: "عبارت", NORM: "عبارت"}, {ORTH: "ند", NORM: "ند"}], + "عزیزانتان": [{ORTH: "عزیزان", NORM: "عزیزان"}, {ORTH: "تان", NORM: 
"تان"}], + "عزیزانش": [{ORTH: "عزیزان", NORM: "عزیزان"}, {ORTH: "ش", NORM: "ش"}], + "عزیزش": [{ORTH: "عزیز", NORM: "عزیز"}, {ORTH: "ش", NORM: "ش"}], + "عشرت‌طلبی‌اش": [ + {ORTH: "عشرت‌طلبی‌", NORM: "عشرت‌طلبی‌"}, + {ORTH: "اش", NORM: "اش"}, + ], + "عقبیم": [{ORTH: "عقب", NORM: "عقب"}, {ORTH: "یم", NORM: "یم"}], + "علاقه‌اش": [{ORTH: "علاقه‌", NORM: "علاقه‌"}, {ORTH: "اش", NORM: "اش"}], + "علمیمان": [{ORTH: "علمی", NORM: "علمی"}, {ORTH: "مان", NORM: "مان"}], + "عمرش": [{ORTH: "عمر", NORM: "عمر"}, {ORTH: "ش", NORM: "ش"}], + "عمرشان": [{ORTH: "عمر", NORM: "عمر"}, {ORTH: "شان", NORM: "شان"}], + "عملش": [{ORTH: "عمل", NORM: "عمل"}, {ORTH: "ش", NORM: "ش"}], + "عملی‌اند": [{ORTH: "عملی‌", NORM: "عملی‌"}, {ORTH: "اند", NORM: "اند"}], + "عمویت": [{ORTH: "عموی", NORM: "عموی"}, {ORTH: "ت", NORM: "ت"}], + "عمویش": [{ORTH: "عموی", NORM: "عموی"}, {ORTH: "ش", NORM: "ش"}], + "عمیقش": [{ORTH: "عمیق", NORM: "عمیق"}, {ORTH: "ش", NORM: "ش"}], + "عواملش": [{ORTH: "عوامل", NORM: "عوامل"}, {ORTH: "ش", NORM: "ش"}], + "عوضشان": [{ORTH: "عوض", NORM: "عوض"}, {ORTH: "شان", NORM: "شان"}], + "غذایی‌شان": [{ORTH: "غذایی‌", NORM: "غذایی‌"}, {ORTH: "شان", NORM: "شان"}], + "غریبه‌اند": [{ORTH: "غریبه‌", NORM: "غریبه‌"}, {ORTH: "اند", NORM: "اند"}], + "غلامانش": [{ORTH: "غلامان", NORM: "غلامان"}, {ORTH: "ش", NORM: "ش"}], + "غلطهاست": [{ORTH: "غلطها", NORM: "غلطها"}, {ORTH: "ست", NORM: "ست"}], + "فراموشتان": [{ORTH: "فراموش", NORM: "فراموش"}, {ORTH: "تان", NORM: "تان"}], + "فردی‌اند": [{ORTH: "فردی‌", NORM: "فردی‌"}, {ORTH: "اند", NORM: "اند"}], + "فرزندانش": [{ORTH: "فرزندان", NORM: "فرزندان"}, {ORTH: "ش", NORM: "ش"}], + "فرزندش": [{ORTH: "فرزند", NORM: "فرزند"}, {ORTH: "ش", NORM: "ش"}], + "فرم‌هایش": [{ORTH: "فرم‌های", NORM: "فرم‌های"}, {ORTH: "ش", NORM: "ش"}], + "فرهنگی‌مان": [{ORTH: "فرهنگی‌", NORM: "فرهنگی‌"}, {ORTH: "مان", NORM: "مان"}], + "فریادشان": [{ORTH: "فریاد", NORM: "فریاد"}, {ORTH: "شان", NORM: "شان"}], + "فضایی‌شان": [{ORTH: "فضایی‌", NORM: "فضایی‌"}, {ORTH: "شان", NORM: "شان"}], + "فقیرشان": [{ORTH: "فقیر", NORM: "فقیر"}, {ORTH: "شان", NORM: "شان"}], + "فوری‌شان": [{ORTH: "فوری‌", NORM: "فوری‌"}, {ORTH: "شان", NORM: "شان"}], + "قائلند": [{ORTH: "قائل", NORM: "قائل"}, {ORTH: "ند", NORM: "ند"}], + "قائلیم": [{ORTH: "قائل", NORM: "قائل"}, {ORTH: "یم", NORM: "یم"}], + "قادرند": [{ORTH: "قادر", NORM: "قادر"}, {ORTH: "ند", NORM: "ند"}], + "قانونمندش": [{ORTH: "قانونمند", NORM: "قانونمند"}, {ORTH: "ش", NORM: "ش"}], + "قبلند": [{ORTH: "قبل", NORM: "قبل"}, {ORTH: "ند", NORM: "ند"}], + "قبلی‌اش": [{ORTH: "قبلی‌", NORM: "قبلی‌"}, {ORTH: "اش", NORM: "اش"}], + "قبلی‌مان": [{ORTH: "قبلی‌", NORM: "قبلی‌"}, {ORTH: "مان", NORM: "مان"}], + "قدریست": [{ORTH: "قدری", NORM: "قدری"}, {ORTH: "ست", NORM: "ست"}], + "قدمش": [{ORTH: "قدم", NORM: "قدم"}, {ORTH: "ش", NORM: "ش"}], + "قسمتش": [{ORTH: "قسمت", NORM: "قسمت"}, {ORTH: "ش", NORM: "ش"}], + "قضایاست": [{ORTH: "قضایا", NORM: "قضایا"}, {ORTH: "ست", NORM: "ست"}], + "قضیه‌شان": [{ORTH: "قضیه‌", NORM: "قضیه‌"}, {ORTH: "شان", NORM: "شان"}], + "قهرمانهایشان": [ + {ORTH: "قهرمانهای", NORM: "قهرمانهای"}, + {ORTH: "شان", NORM: "شان"}, + ], + "قهرمانیش": [{ORTH: "قهرمانی", NORM: "قهرمانی"}, {ORTH: "ش", NORM: "ش"}], + "قومت": [{ORTH: "قوم", NORM: "قوم"}, {ORTH: "ت", NORM: "ت"}], + "لازمه‌اش": [{ORTH: "لازمه‌", NORM: "لازمه‌"}, {ORTH: "اش", NORM: "اش"}], + "مأموریتش": [{ORTH: "مأموریت", NORM: "مأموریت"}, {ORTH: "ش", NORM: "ش"}], + "مأموریتم": [{ORTH: "مأموریت", NORM: "مأموریت"}, {ORTH: "م", NORM: "م"}], + "مأموریت‌اند": [{ORTH: "مأموریت‌", NORM: "مأموریت‌"}, {ORTH: "اند", NORM: "اند"}], + 
"مادرانشان": [{ORTH: "مادران", NORM: "مادران"}, {ORTH: "شان", NORM: "شان"}], + "مادرت": [{ORTH: "مادر", NORM: "مادر"}, {ORTH: "ت", NORM: "ت"}], + "مادرش": [{ORTH: "مادر", NORM: "مادر"}, {ORTH: "ش", NORM: "ش"}], + "مادرم": [{ORTH: "مادر", NORM: "مادر"}, {ORTH: "م", NORM: "م"}], + "ماست": [{ORTH: "ما", NORM: "ما"}, {ORTH: "ست", NORM: "ست"}], + "مالی‌اش": [{ORTH: "مالی‌", NORM: "مالی‌"}, {ORTH: "اش", NORM: "اش"}], + "ماهیتش": [{ORTH: "ماهیت", NORM: "ماهیت"}, {ORTH: "ش", NORM: "ش"}], + "مایی": [{ORTH: "ما", NORM: "ما"}, {ORTH: "یی", NORM: "یی"}], + "مجازاتش": [{ORTH: "مجازات", NORM: "مجازات"}, {ORTH: "ش", NORM: "ش"}], + "مجبورند": [{ORTH: "مجبور", NORM: "مجبور"}, {ORTH: "ند", NORM: "ند"}], + "محتاجند": [{ORTH: "محتاج", NORM: "محتاج"}, {ORTH: "ند", NORM: "ند"}], + "محرمم": [{ORTH: "محرم", NORM: "محرم"}, {ORTH: "م", NORM: "م"}], + "محلش": [{ORTH: "محل", NORM: "محل"}, {ORTH: "ش", NORM: "ش"}], + "مخالفند": [{ORTH: "مخالف", NORM: "مخالف"}, {ORTH: "ند", NORM: "ند"}], + "مخدرش": [{ORTH: "مخدر", NORM: "مخدر"}, {ORTH: "ش", NORM: "ش"}], + "مدتهاست": [{ORTH: "مدتها", NORM: "مدتها"}, {ORTH: "ست", NORM: "ست"}], + "مدرسه‌ات": [{ORTH: "مدرسه", NORM: "مدرسه"}, {ORTH: "‌ات", NORM: "ات"}], + "مدرکم": [{ORTH: "مدرک", NORM: "مدرک"}, {ORTH: "م", NORM: "م"}], + "مدیرانش": [{ORTH: "مدیران", NORM: "مدیران"}, {ORTH: "ش", NORM: "ش"}], + "مدیونم": [{ORTH: "مدیون", NORM: "مدیون"}, {ORTH: "م", NORM: "م"}], + "مذهبی‌اند": [{ORTH: "مذهبی‌", NORM: "مذهبی‌"}, {ORTH: "اند", NORM: "اند"}], + "مرا": [{ORTH: "م", NORM: "م"}, {ORTH: "را", NORM: "را"}], + "مرادت": [{ORTH: "مراد", NORM: "مراد"}, {ORTH: "ت", NORM: "ت"}], + "مردمشان": [{ORTH: "مردم", NORM: "مردم"}, {ORTH: "شان", NORM: "شان"}], + "مردمند": [{ORTH: "مردم", NORM: "مردم"}, {ORTH: "ند", NORM: "ند"}], + "مردم‌اند": [{ORTH: "مردم‌", NORM: "مردم‌"}, {ORTH: "اند", NORM: "اند"}], + "مرزشان": [{ORTH: "مرز", NORM: "مرز"}, {ORTH: "شان", NORM: "شان"}], + "مرزهاشان": [{ORTH: "مرزها", NORM: "مرزها"}, {ORTH: "شان", NORM: "شان"}], + "مزدورش": [{ORTH: "مزدور", NORM: "مزدور"}, {ORTH: "ش", NORM: "ش"}], + "مسئولیتش": [{ORTH: "مسئولیت", NORM: "مسئولیت"}, {ORTH: "ش", NORM: "ش"}], + "مسائلش": [{ORTH: "مسائل", NORM: "مسائل"}, {ORTH: "ش", NORM: "ش"}], + "مستحضرید": [{ORTH: "مستحضر", NORM: "مستحضر"}, {ORTH: "ید", NORM: "ید"}], + "مسلمانم": [{ORTH: "مسلمان", NORM: "مسلمان"}, {ORTH: "م", NORM: "م"}], + "مسلمانند": [{ORTH: "مسلمان", NORM: "مسلمان"}, {ORTH: "ند", NORM: "ند"}], + "مشتریانش": [{ORTH: "مشتریان", NORM: "مشتریان"}, {ORTH: "ش", NORM: "ش"}], + "مشتهایمان": [{ORTH: "مشتهای", NORM: "مشتهای"}, {ORTH: "مان", NORM: "مان"}], + "مشخصند": [{ORTH: "مشخص", NORM: "مشخص"}, {ORTH: "ند", NORM: "ند"}], + "مشغولند": [{ORTH: "مشغول", NORM: "مشغول"}, {ORTH: "ند", NORM: "ند"}], + "مشغولیم": [{ORTH: "مشغول", NORM: "مشغول"}, {ORTH: "یم", NORM: "یم"}], + "مشهورش": [{ORTH: "مشهور", NORM: "مشهور"}, {ORTH: "ش", NORM: "ش"}], + "مشکلاتشان": [{ORTH: "مشکلات", NORM: "مشکلات"}, {ORTH: "شان", NORM: "شان"}], + "مشکلم": [{ORTH: "مشکل", NORM: "مشکل"}, {ORTH: "م", NORM: "م"}], + "مطمئنم": [{ORTH: "مطمئن", NORM: "مطمئن"}, {ORTH: "م", NORM: "م"}], + "معامله‌مان": [{ORTH: "معامله‌", NORM: "معامله‌"}, {ORTH: "مان", NORM: "مان"}], + "معتقدم": [{ORTH: "معتقد", NORM: "معتقد"}, {ORTH: "م", NORM: "م"}], + "معتقدند": [{ORTH: "معتقد", NORM: "معتقد"}, {ORTH: "ند", NORM: "ند"}], + "معتقدیم": [{ORTH: "معتقد", NORM: "معتقد"}, {ORTH: "یم", NORM: "یم"}], + "معرفی‌اش": [{ORTH: "معرفی‌", NORM: "معرفی‌"}, {ORTH: "اش", NORM: "اش"}], + "معروفش": [{ORTH: "معروف", NORM: "معروف"}, {ORTH: "ش", NORM: "ش"}], + "معضلاتمان": [{ORTH: "معضلات", NORM: 
"معضلات"}, {ORTH: "مان", NORM: "مان"}], + "معلمش": [{ORTH: "معلم", NORM: "معلم"}, {ORTH: "ش", NORM: "ش"}], + "معنایش": [{ORTH: "معنای", NORM: "معنای"}, {ORTH: "ش", NORM: "ش"}], + "مغزشان": [{ORTH: "مغز", NORM: "مغز"}, {ORTH: "شان", NORM: "شان"}], + "مفیدند": [{ORTH: "مفید", NORM: "مفید"}, {ORTH: "ند", NORM: "ند"}], + "مقابلش": [{ORTH: "مقابل", NORM: "مقابل"}, {ORTH: "ش", NORM: "ش"}], + "مقاله‌اش": [{ORTH: "مقاله‌", NORM: "مقاله‌"}, {ORTH: "اش", NORM: "اش"}], + "مقدمش": [{ORTH: "مقدم", NORM: "مقدم"}, {ORTH: "ش", NORM: "ش"}], + "مقرش": [{ORTH: "مقر", NORM: "مقر"}, {ORTH: "ش", NORM: "ش"}], + "مقصدشان": [{ORTH: "مقصد", NORM: "مقصد"}, {ORTH: "شان", NORM: "شان"}], + "مقصرند": [{ORTH: "مقصر", NORM: "مقصر"}, {ORTH: "ند", NORM: "ند"}], + "مقصودتان": [{ORTH: "مقصود", NORM: "مقصود"}, {ORTH: "تان", NORM: "تان"}], + "ملاقاتهایش": [{ORTH: "ملاقاتهای", NORM: "ملاقاتهای"}, {ORTH: "ش", NORM: "ش"}], + "ممکنشان": [{ORTH: "ممکن", NORM: "ممکن"}, {ORTH: "شان", NORM: "شان"}], + "ممیزیهاست": [{ORTH: "ممیزیها", NORM: "ممیزیها"}, {ORTH: "ست", NORM: "ست"}], + "منظورم": [{ORTH: "منظور", NORM: "منظور"}, {ORTH: "م", NORM: "م"}], + "منی": [{ORTH: "من", NORM: "من"}, {ORTH: "ی", NORM: "ی"}], + "منید": [{ORTH: "من", NORM: "من"}, {ORTH: "ید", NORM: "ید"}], + "مهربانش": [{ORTH: "مهربان", NORM: "مهربان"}, {ORTH: "ش", NORM: "ش"}], + "مهم‌اند": [{ORTH: "مهم‌", NORM: "مهم‌"}, {ORTH: "اند", NORM: "اند"}], + "مواجهند": [{ORTH: "مواجه", NORM: "مواجه"}, {ORTH: "ند", NORM: "ند"}], + "مواجه‌اند": [{ORTH: "مواجه‌", NORM: "مواجه‌"}, {ORTH: "اند", NORM: "اند"}], + "مواخذه‌ات": [{ORTH: "مواخذه", NORM: "مواخذه"}, {ORTH: "‌ات", NORM: "ات"}], + "مواضعشان": [{ORTH: "مواضع", NORM: "مواضع"}, {ORTH: "شان", NORM: "شان"}], + "مواضعمان": [{ORTH: "مواضع", NORM: "مواضع"}, {ORTH: "مان", NORM: "مان"}], + "موافقند": [{ORTH: "موافق", NORM: "موافق"}, {ORTH: "ند", NORM: "ند"}], + "موجوداتش": [{ORTH: "موجودات", NORM: "موجودات"}, {ORTH: "ش", NORM: "ش"}], + "موجودند": [{ORTH: "موجود", NORM: "موجود"}, {ORTH: "ند", NORM: "ند"}], + "موردش": [{ORTH: "مورد", NORM: "مورد"}, {ORTH: "ش", NORM: "ش"}], + "موضعشان": [{ORTH: "موضع", NORM: "موضع"}, {ORTH: "شان", NORM: "شان"}], + "موظفند": [{ORTH: "موظف", NORM: "موظف"}, {ORTH: "ند", NORM: "ند"}], + "موهایش": [{ORTH: "موهای", NORM: "موهای"}, {ORTH: "ش", NORM: "ش"}], + "موهایمان": [{ORTH: "موهای", NORM: "موهای"}, {ORTH: "مان", NORM: "مان"}], + "مویم": [{ORTH: "مو", NORM: "مو"}, {ORTH: "یم", NORM: "یم"}], + "ناخرسندند": [{ORTH: "ناخرسند", NORM: "ناخرسند"}, {ORTH: "ند", NORM: "ند"}], + "ناراحتیش": [{ORTH: "ناراحتی", NORM: "ناراحتی"}, {ORTH: "ش", NORM: "ش"}], + "ناراضی‌اند": [{ORTH: "ناراضی‌", NORM: "ناراضی‌"}, {ORTH: "اند", NORM: "اند"}], + "نارواست": [{ORTH: "ناروا", NORM: "ناروا"}, {ORTH: "ست", NORM: "ست"}], + "نازش": [{ORTH: "ناز", NORM: "ناز"}, {ORTH: "ش", NORM: "ش"}], + "نامش": [{ORTH: "نام", NORM: "نام"}, {ORTH: "ش", NORM: "ش"}], + "نامشان": [{ORTH: "نام", NORM: "نام"}, {ORTH: "شان", NORM: "شان"}], + "نامم": [{ORTH: "نام", NORM: "نام"}, {ORTH: "م", NORM: "م"}], + "نامه‌ات": [{ORTH: "نامه", NORM: "نامه"}, {ORTH: "‌ات", NORM: "ات"}], + "نامه‌ام": [{ORTH: "نامه‌", NORM: "نامه‌"}, {ORTH: "ام", NORM: "ام"}], + "ناچارم": [{ORTH: "ناچار", NORM: "ناچار"}, {ORTH: "م", NORM: "م"}], + "نخست‌وزیری‌اش": [ + {ORTH: "نخست‌وزیری‌", NORM: "نخست‌وزیری‌"}, + {ORTH: "اش", NORM: "اش"}, + ], + "نزدش": [{ORTH: "نزد", NORM: "نزد"}, {ORTH: "ش", NORM: "ش"}], + "نشانم": [{ORTH: "نشان", NORM: "نشان"}, {ORTH: "م", NORM: "م"}], + "نظرات‌شان": [{ORTH: "نظرات‌", NORM: "نظرات‌"}, {ORTH: "شان", NORM: "شان"}], + "نظرتان": [{ORTH: "نظر", NORM: "نظر"}, 
{ORTH: "تان", NORM: "تان"}], + "نظرش": [{ORTH: "نظر", NORM: "نظر"}, {ORTH: "ش", NORM: "ش"}], + "نظرشان": [{ORTH: "نظر", NORM: "نظر"}, {ORTH: "شان", NORM: "شان"}], + "نظرم": [{ORTH: "نظر", NORM: "نظر"}, {ORTH: "م", NORM: "م"}], + "نظرهایشان": [{ORTH: "نظرهای", NORM: "نظرهای"}, {ORTH: "شان", NORM: "شان"}], + "نفاقش": [{ORTH: "نفاق", NORM: "نفاق"}, {ORTH: "ش", NORM: "ش"}], + "نفرند": [{ORTH: "نفر", NORM: "نفر"}, {ORTH: "ند", NORM: "ند"}], + "نفوذیند": [{ORTH: "نفوذی", NORM: "نفوذی"}, {ORTH: "ند", NORM: "ند"}], + "نقطه‌نظراتتان": [ + {ORTH: "نقطه‌نظرات", NORM: "نقطه‌نظرات"}, + {ORTH: "تان", NORM: "تان"}, + ], + "نمایشی‌مان": [{ORTH: "نمایشی‌", NORM: "نمایشی‌"}, {ORTH: "مان", NORM: "مان"}], + "نمایندگی‌شان": [ + {ORTH: "نمایندگی‌", NORM: "نمایندگی‌"}, + {ORTH: "شان", NORM: "شان"}, + ], + "نمونه‌اش": [{ORTH: "نمونه‌", NORM: "نمونه‌"}, {ORTH: "اش", NORM: "اش"}], + "نمی‌پذیرندش": [{ORTH: "نمی‌پذیرند", NORM: "نمی‌پذیرند"}, {ORTH: "ش", NORM: "ش"}], + "نوآوری‌اش": [{ORTH: "نوآوری‌", NORM: "نوآوری‌"}, {ORTH: "اش", NORM: "اش"}], + "نوشته‌هایشان": [ + {ORTH: "نوشته‌های", NORM: "نوشته‌های"}, + {ORTH: "شان", NORM: "شان"}, + ], + "نوشته‌هایم": [{ORTH: "نوشته‌ها", NORM: "نوشته‌ها"}, {ORTH: "یم", NORM: "یم"}], + "نکردنشان": [{ORTH: "نکردن", NORM: "نکردن"}, {ORTH: "شان", NORM: "شان"}], + "نگاهداری‌شان": [ + {ORTH: "نگاهداری‌", NORM: "نگاهداری‌"}, + {ORTH: "شان", NORM: "شان"}, + ], + "نگاهش": [{ORTH: "نگاه", NORM: "نگاه"}, {ORTH: "ش", NORM: "ش"}], + "نگرانم": [{ORTH: "نگران", NORM: "نگران"}, {ORTH: "م", NORM: "م"}], + "نگرشهایشان": [{ORTH: "نگرشهای", NORM: "نگرشهای"}, {ORTH: "شان", NORM: "شان"}], + "نیازمندند": [{ORTH: "نیازمند", NORM: "نیازمند"}, {ORTH: "ند", NORM: "ند"}], + "هدفش": [{ORTH: "هدف", NORM: "هدف"}, {ORTH: "ش", NORM: "ش"}], + "همانست": [{ORTH: "همان", NORM: "همان"}, {ORTH: "ست", NORM: "ست"}], + "همراهش": [{ORTH: "همراه", NORM: "همراه"}, {ORTH: "ش", NORM: "ش"}], + "همسرتان": [{ORTH: "همسر", NORM: "همسر"}, {ORTH: "تان", NORM: "تان"}], + "همسرش": [{ORTH: "همسر", NORM: "همسر"}, {ORTH: "ش", NORM: "ش"}], + "همسرم": [{ORTH: "همسر", NORM: "همسر"}, {ORTH: "م", NORM: "م"}], + "همفکرانش": [{ORTH: "همفکران", NORM: "همفکران"}, {ORTH: "ش", NORM: "ش"}], + "همه‌اش": [{ORTH: "همه‌", NORM: "همه‌"}, {ORTH: "اش", NORM: "اش"}], + "همه‌شان": [{ORTH: "همه‌", NORM: "همه‌"}, {ORTH: "شان", NORM: "شان"}], + "همکارانش": [{ORTH: "همکاران", NORM: "همکاران"}, {ORTH: "ش", NORM: "ش"}], + "هم‌نظریم": [{ORTH: "هم‌نظر", NORM: "هم‌نظر"}, {ORTH: "یم", NORM: "یم"}], + "هنرش": [{ORTH: "هنر", NORM: "هنر"}, {ORTH: "ش", NORM: "ش"}], + "هواست": [{ORTH: "هوا", NORM: "هوا"}, {ORTH: "ست", NORM: "ست"}], + "هویتش": [{ORTH: "هویت", NORM: "هویت"}, {ORTH: "ش", NORM: "ش"}], + "وابسته‌اند": [{ORTH: "وابسته‌", NORM: "وابسته‌"}, {ORTH: "اند", NORM: "اند"}], + "واقفند": [{ORTH: "واقف", NORM: "واقف"}, {ORTH: "ند", NORM: "ند"}], + "والدینشان": [{ORTH: "والدین", NORM: "والدین"}, {ORTH: "شان", NORM: "شان"}], + "وجدان‌تان": [{ORTH: "وجدان‌", NORM: "وجدان‌"}, {ORTH: "تان", NORM: "تان"}], + "وجودشان": [{ORTH: "وجود", NORM: "وجود"}, {ORTH: "شان", NORM: "شان"}], + "وطنم": [{ORTH: "وطن", NORM: "وطن"}, {ORTH: "م", NORM: "م"}], + "وعده‌اش": [{ORTH: "وعده‌", NORM: "وعده‌"}, {ORTH: "اش", NORM: "اش"}], + "وقتمان": [{ORTH: "وقت", NORM: "وقت"}, {ORTH: "مان", NORM: "مان"}], + "ولادتش": [{ORTH: "ولادت", NORM: "ولادت"}, {ORTH: "ش", NORM: "ش"}], + "پایانش": [{ORTH: "پایان", NORM: "پایان"}, {ORTH: "ش", NORM: "ش"}], + "پایش": [{ORTH: "پای", NORM: "پای"}, {ORTH: "ش", NORM: "ش"}], + "پایین‌ترند": [{ORTH: "پایین‌تر", NORM: "پایین‌تر"}, {ORTH: "ند", NORM: "ند"}], + "پدرت": 
[{ORTH: "پدر", NORM: "پدر"}, {ORTH: "ت", NORM: "ت"}], + "پدرش": [{ORTH: "پدر", NORM: "پدر"}, {ORTH: "ش", NORM: "ش"}], + "پدرشان": [{ORTH: "پدر", NORM: "پدر"}, {ORTH: "شان", NORM: "شان"}], + "پدرم": [{ORTH: "پدر", NORM: "پدر"}, {ORTH: "م", NORM: "م"}], + "پربارش": [{ORTH: "پربار", NORM: "پربار"}, {ORTH: "ش", NORM: "ش"}], + "پروردگارت": [{ORTH: "پروردگار", NORM: "پروردگار"}, {ORTH: "ت", NORM: "ت"}], + "پسرتان": [{ORTH: "پسر", NORM: "پسر"}, {ORTH: "تان", NORM: "تان"}], + "پسرش": [{ORTH: "پسر", NORM: "پسر"}, {ORTH: "ش", NORM: "ش"}], + "پسرعمویش": [{ORTH: "پسرعموی", NORM: "پسرعموی"}, {ORTH: "ش", NORM: "ش"}], + "پسر‌عمویت": [{ORTH: "پسر‌عموی", NORM: "پسر‌عموی"}, {ORTH: "ت", NORM: "ت"}], + "پشتش": [{ORTH: "پشت", NORM: "پشت"}, {ORTH: "ش", NORM: "ش"}], + "پشیمونی": [{ORTH: "پشیمون", NORM: "پشیمون"}, {ORTH: "ی", NORM: "ی"}], + "پولش": [{ORTH: "پول", NORM: "پول"}, {ORTH: "ش", NORM: "ش"}], + "پژوهش‌هایش": [{ORTH: "پژوهش‌های", NORM: "پژوهش‌های"}, {ORTH: "ش", NORM: "ش"}], + "پیامبرش": [{ORTH: "پیامبر", NORM: "پیامبر"}, {ORTH: "ش", NORM: "ش"}], + "پیامبری": [{ORTH: "پیامبر", NORM: "پیامبر"}, {ORTH: "ی", NORM: "ی"}], + "پیامش": [{ORTH: "پیام", NORM: "پیام"}, {ORTH: "ش", NORM: "ش"}], + "پیداست": [{ORTH: "پیدا", NORM: "پیدا"}, {ORTH: "ست", NORM: "ست"}], + "پیراهنش": [{ORTH: "پیراهن", NORM: "پیراهن"}, {ORTH: "ش", NORM: "ش"}], + "پیروانش": [{ORTH: "پیروان", NORM: "پیروان"}, {ORTH: "ش", NORM: "ش"}], + "پیشانی‌اش": [{ORTH: "پیشانی‌", NORM: "پیشانی‌"}, {ORTH: "اش", NORM: "اش"}], + "پیمانت": [{ORTH: "پیمان", NORM: "پیمان"}, {ORTH: "ت", NORM: "ت"}], + "پیوندشان": [{ORTH: "پیوند", NORM: "پیوند"}, {ORTH: "شان", NORM: "شان"}], + "چاپش": [{ORTH: "چاپ", NORM: "چاپ"}, {ORTH: "ش", NORM: "ش"}], + "چت": [{ORTH: "چ", NORM: "چ"}, {ORTH: "ت", NORM: "ت"}], + "چته": [{ORTH: "چ", NORM: "چ"}, {ORTH: "ت", NORM: "ت"}, {ORTH: "ه", NORM: "ه"}], + "چرخ‌هایش": [{ORTH: "چرخ‌های", NORM: "چرخ‌های"}, {ORTH: "ش", NORM: "ش"}], + "چشمم": [{ORTH: "چشم", NORM: "چشم"}, {ORTH: "م", NORM: "م"}], + "چشمهایش": [{ORTH: "چشمهای", NORM: "چشمهای"}, {ORTH: "ش", NORM: "ش"}], + "چشمهایشان": [{ORTH: "چشمهای", NORM: "چشمهای"}, {ORTH: "شان", NORM: "شان"}], + "چمنم": [{ORTH: "چمن", NORM: "چمن"}, {ORTH: "م", NORM: "م"}], + "چهره‌اش": [{ORTH: "چهره‌", NORM: "چهره‌"}, {ORTH: "اش", NORM: "اش"}], + "چکاره‌اند": [{ORTH: "چکاره‌", NORM: "چکاره‌"}, {ORTH: "اند", NORM: "اند"}], + "چیزهاست": [{ORTH: "چیزها", NORM: "چیزها"}, {ORTH: "ست", NORM: "ست"}], + "چیزهایش": [{ORTH: "چیزهای", NORM: "چیزهای"}, {ORTH: "ش", NORM: "ش"}], + "چیزیست": [{ORTH: "چیزی", NORM: "چیزی"}, {ORTH: "ست", NORM: "ست"}], + "چیست": [{ORTH: "چی", NORM: "چی"}, {ORTH: "ست", NORM: "ست"}], + "کارش": [{ORTH: "کار", NORM: "کار"}, {ORTH: "ش", NORM: "ش"}], + "کارشان": [{ORTH: "کار", NORM: "کار"}, {ORTH: "شان", NORM: "شان"}], + "کارم": [{ORTH: "کار", NORM: "کار"}, {ORTH: "م", NORM: "م"}], + "کارند": [{ORTH: "کار", NORM: "کار"}, {ORTH: "ند", NORM: "ند"}], + "کارهایم": [{ORTH: "کارها", NORM: "کارها"}, {ORTH: "یم", NORM: "یم"}], + "کافیست": [{ORTH: "کافی", NORM: "کافی"}, {ORTH: "ست", NORM: "ست"}], + "کتابخانه‌اش": [{ORTH: "کتابخانه‌", NORM: "کتابخانه‌"}, {ORTH: "اش", NORM: "اش"}], + "کتابش": [{ORTH: "کتاب", NORM: "کتاب"}, {ORTH: "ش", NORM: "ش"}], + "کتابهاشان": [{ORTH: "کتابها", NORM: "کتابها"}, {ORTH: "شان", NORM: "شان"}], + "کجاست": [{ORTH: "کجا", NORM: "کجا"}, {ORTH: "ست", NORM: "ست"}], + "کدورتهایشان": [{ORTH: "کدورتهای", NORM: "کدورتهای"}, {ORTH: "شان", NORM: "شان"}], + "کردنش": [{ORTH: "کردن", NORM: "کردن"}, {ORTH: "ش", NORM: "ش"}], + "کرم‌خورده‌اش": [ + {ORTH: "کرم‌خورده‌", NORM: "کرم‌خورده‌"}, + {ORTH: 
"اش", NORM: "اش"}, + ], + "کشش": [{ORTH: "کش", NORM: "کش"}, {ORTH: "ش", NORM: "ش"}], + "کشورش": [{ORTH: "کشور", NORM: "کشور"}, {ORTH: "ش", NORM: "ش"}], + "کشورشان": [{ORTH: "کشور", NORM: "کشور"}, {ORTH: "شان", NORM: "شان"}], + "کشورمان": [{ORTH: "کشور", NORM: "کشور"}, {ORTH: "مان", NORM: "مان"}], + "کشورهاست": [{ORTH: "کشورها", NORM: "کشورها"}, {ORTH: "ست", NORM: "ست"}], + "کلیشه‌هاست": [{ORTH: "کلیشه‌ها", NORM: "کلیشه‌ها"}, {ORTH: "ست", NORM: "ست"}], + "کمبودهاست": [{ORTH: "کمبودها", NORM: "کمبودها"}, {ORTH: "ست", NORM: "ست"}], + "کمتره": [{ORTH: "کمتر", NORM: "کمتر"}, {ORTH: "ه", NORM: "ه"}], + "کمکم": [{ORTH: "کمک", NORM: "کمک"}, {ORTH: "م", NORM: "م"}], + "کنارش": [{ORTH: "کنار", NORM: "کنار"}, {ORTH: "ش", NORM: "ش"}], + "کودکانشان": [{ORTH: "کودکان", NORM: "کودکان"}, {ORTH: "شان", NORM: "شان"}], + "کوچکش": [{ORTH: "کوچک", NORM: "کوچک"}, {ORTH: "ش", NORM: "ش"}], + "کیست": [{ORTH: "کی", NORM: "کی"}, {ORTH: "ست", NORM: "ست"}], + "کیفش": [{ORTH: "کیف", NORM: "کیف"}, {ORTH: "ش", NORM: "ش"}], + "گذشته‌اند": [{ORTH: "گذشته‌", NORM: "گذشته‌"}, {ORTH: "اند", NORM: "اند"}], + "گرانقدرش": [{ORTH: "گرانقدر", NORM: "گرانقدر"}, {ORTH: "ش", NORM: "ش"}], + "گرانقدرشان": [{ORTH: "گرانقدر", NORM: "گرانقدر"}, {ORTH: "شان", NORM: "شان"}], + "گردنتان": [{ORTH: "گردن", NORM: "گردن"}, {ORTH: "تان", NORM: "تان"}], + "گردنش": [{ORTH: "گردن", NORM: "گردن"}, {ORTH: "ش", NORM: "ش"}], + "گرفتارند": [{ORTH: "گرفتار", NORM: "گرفتار"}, {ORTH: "ند", NORM: "ند"}], + "گرفتنت": [{ORTH: "گرفتن", NORM: "گرفتن"}, {ORTH: "ت", NORM: "ت"}], + "گروهند": [{ORTH: "گروه", NORM: "گروه"}, {ORTH: "ند", NORM: "ند"}], + "گروگانهایش": [{ORTH: "گروگانهای", NORM: "گروگانهای"}, {ORTH: "ش", NORM: "ش"}], + "گریمش": [{ORTH: "گریم", NORM: "گریم"}, {ORTH: "ش", NORM: "ش"}], + "گفتارمان": [{ORTH: "گفتار", NORM: "گفتار"}, {ORTH: "مان", NORM: "مان"}], + "گلهایش": [{ORTH: "گلهای", NORM: "گلهای"}, {ORTH: "ش", NORM: "ش"}], + "گلویش": [{ORTH: "گلوی", NORM: "گلوی"}, {ORTH: "ش", NORM: "ش"}], + "گناهت": [{ORTH: "گناه", NORM: "گناه"}, {ORTH: "ت", NORM: "ت"}], + "گوشش": [{ORTH: "گوش", NORM: "گوش"}, {ORTH: "ش", NORM: "ش"}], + "گوشم": [{ORTH: "گوش", NORM: "گوش"}, {ORTH: "م", NORM: "م"}], + "گولش": [{ORTH: "گول", NORM: "گول"}, {ORTH: "ش", NORM: "ش"}], + "یادتان": [{ORTH: "یاد", NORM: "یاد"}, {ORTH: "تان", NORM: "تان"}], + "یادم": [{ORTH: "یاد", NORM: "یاد"}, {ORTH: "م", NORM: "م"}], + "یادمان": [{ORTH: "یاد", NORM: "یاد"}, {ORTH: "مان", NORM: "مان"}], + "یارانش": [{ORTH: "یاران", NORM: "یاران"}, {ORTH: "ش", NORM: "ش"}], } - -_exc.update( - { - "آبرویت": [ - {ORTH: "آبروی", LEMMA: "آبروی", NORM: "آبروی", TAG: "NOUN"}, - {ORTH: "ت", LEMMA: "ت", NORM: "ت", TAG: "NOUN"}, - ], - "آب‌نباتش": [ - {ORTH: "آب‌نبات", LEMMA: "آب‌نبات", NORM: "آب‌نبات", TAG: "NOUN"}, - {ORTH: "ش", LEMMA: "ش", NORM: "ش", TAG: "NOUN"}, - ], - "آثارش": [ - {ORTH: "آثار", LEMMA: "آثار", NORM: "آثار", TAG: "NOUN"}, - {ORTH: "ش", LEMMA: "ش", NORM: "ش", TAG: "NOUN"}, - ], - "آخرش": [ - {ORTH: "آخر", LEMMA: "آخر", NORM: "آخر", TAG: "ADV"}, - {ORTH: "ش", LEMMA: "ش", NORM: "ش", TAG: "NOUN"}, - ], - "آدمهاست": [ - {ORTH: "آدمها", LEMMA: "آدمها", NORM: "آدمها", TAG: "NOUN"}, - {ORTH: "ست", LEMMA: "ست", NORM: "ست", TAG: "VERB"}, - ], - "آرزومندیم": [ - {ORTH: "آرزومند", LEMMA: "آرزومند", NORM: "آرزومند", TAG: "ADJ"}, - {ORTH: "یم", LEMMA: "یم", NORM: "یم", TAG: "VERB"}, - ], - "آزادند": [ - {ORTH: "آزاد", LEMMA: "آزاد", NORM: "آزاد", TAG: "ADJ"}, - {ORTH: "ند", LEMMA: "ند", NORM: "ند", TAG: "VERB"}, - ], - "آسیب‌پذیرند": [ - {ORTH: "آسیب‌پذیر", LEMMA: "آسیب‌پذیر", NORM: "آسیب‌پذیر", TAG: "ADJ"}, - 
{ORTH: "ند", LEMMA: "ند", NORM: "ند", TAG: "VERB"}, - ], - "آفریده‌اند": [ - {ORTH: "آفریده‌", LEMMA: "آفریده‌", NORM: "آفریده‌", TAG: "NOUN"}, - {ORTH: "اند", LEMMA: "اند", NORM: "اند", TAG: "VERB"}, - ], - "آمدنش": [ - {ORTH: "آمدن", LEMMA: "آمدن", NORM: "آمدن", TAG: "NOUN"}, - {ORTH: "ش", LEMMA: "ش", NORM: "ش", TAG: "NOUN"}, - ], - "آمریکاست": [ - {ORTH: "آمریکا", LEMMA: "آمریکا", NORM: "آمریکا", TAG: "NOUN"}, - {ORTH: "ست", LEMMA: "ست", NORM: "ست", TAG: "VERB"}, - ], - "آنجاست": [ - {ORTH: "آنجا", LEMMA: "آنجا", NORM: "آنجا", TAG: "ADV"}, - {ORTH: "ست", LEMMA: "ست", NORM: "ست", TAG: "VERB"}, - ], - "آنست": [ - {ORTH: "آن", LEMMA: "آن", NORM: "آن", TAG: "NOUN"}, - {ORTH: "ست", LEMMA: "ست", NORM: "ست", TAG: "VERB"}, - ], - "آنند": [ - {ORTH: "آن", LEMMA: "آن", NORM: "آن", TAG: "NOUN"}, - {ORTH: "ند", LEMMA: "ند", NORM: "ند", TAG: "VERB"}, - ], - "آن‌هاست": [ - {ORTH: "آن‌ها", LEMMA: "آن‌ها", NORM: "آن‌ها", TAG: "NOUN"}, - {ORTH: "ست", LEMMA: "ست", NORM: "ست", TAG: "VERB"}, - ], - "آپاداناست": [ - {ORTH: "آپادانا", LEMMA: "آپادانا", NORM: "آپادانا", TAG: "NOUN"}, - {ORTH: "ست", LEMMA: "ست", NORM: "ست", TAG: "VERB"}, - ], - "اجتماعی‌مان": [ - {ORTH: "اجتماعی‌", LEMMA: "اجتماعی‌", NORM: "اجتماعی‌", TAG: "ADJ"}, - {ORTH: "مان", LEMMA: "مان", NORM: "مان", TAG: "NOUN"}, - ], - "اجدادت": [ - {ORTH: "اجداد", LEMMA: "اجداد", NORM: "اجداد", TAG: "NOUN"}, - {ORTH: "ت", LEMMA: "ت", NORM: "ت", TAG: "NOUN"}, - ], - "اجدادش": [ - {ORTH: "اجداد", LEMMA: "اجداد", NORM: "اجداد", TAG: "NOUN"}, - {ORTH: "ش", LEMMA: "ش", NORM: "ش", TAG: "NOUN"}, - ], - "اجدادی‌شان": [ - {ORTH: "اجدادی‌", LEMMA: "اجدادی‌", NORM: "اجدادی‌", TAG: "ADJ"}, - {ORTH: "شان", LEMMA: "شان", NORM: "شان", TAG: "NOUN"}, - ], - "اجراست": [ - {ORTH: "اجرا", LEMMA: "اجرا", NORM: "اجرا", TAG: "NOUN"}, - {ORTH: "ست", LEMMA: "ست", NORM: "ست", TAG: "VERB"}, - ], - "اختیارش": [ - {ORTH: "اختیار", LEMMA: "اختیار", NORM: "اختیار", TAG: "NOUN"}, - {ORTH: "ش", LEMMA: "ش", NORM: "ش", TAG: "NOUN"}, - ], - "اخلاقشان": [ - {ORTH: "اخلاق", LEMMA: "اخلاق", NORM: "اخلاق", TAG: "NOUN"}, - {ORTH: "شان", LEMMA: "شان", NORM: "شان", TAG: "NOUN"}, - ], - "ادعایمان": [ - {ORTH: "ادعای", LEMMA: "ادعای", NORM: "ادعای", TAG: "NOUN"}, - {ORTH: "مان", LEMMA: "مان", NORM: "مان", TAG: "NOUN"}, - ], - "اذیتش": [ - {ORTH: "اذیت", LEMMA: "اذیت", NORM: "اذیت", TAG: "NOUN"}, - {ORTH: "ش", LEMMA: "ش", NORM: "ش", TAG: "NOUN"}, - ], - "اراده‌اش": [ - {ORTH: "اراده‌", LEMMA: "اراده‌", NORM: "اراده‌", TAG: "NOUN"}, - {ORTH: "اش", LEMMA: "اش", NORM: "اش", TAG: "NOUN"}, - ], - "ارتباطش": [ - {ORTH: "ارتباط", LEMMA: "ارتباط", NORM: "ارتباط", TAG: "NOUN"}, - {ORTH: "ش", LEMMA: "ش", NORM: "ش", TAG: "NOUN"}, - ], - "ارتباطمان": [ - {ORTH: "ارتباط", LEMMA: "ارتباط", NORM: "ارتباط", TAG: "NOUN"}, - {ORTH: "مان", LEMMA: "مان", NORM: "مان", TAG: "NOUN"}, - ], - "ارزشهاست": [ - {ORTH: "ارزشها", LEMMA: "ارزشها", NORM: "ارزشها", TAG: "NOUN"}, - {ORTH: "ست", LEMMA: "ست", NORM: "ست", TAG: "VERB"}, - ], - "ارزی‌اش": [ - {ORTH: "ارزی‌", LEMMA: "ارزی‌", NORM: "ارزی‌", TAG: "ADJ"}, - {ORTH: "اش", LEMMA: "اش", NORM: "اش", TAG: "NOUN"}, - ], - "اره‌اش": [ - {ORTH: "اره‌", LEMMA: "اره‌", NORM: "اره‌", TAG: "NOUN"}, - {ORTH: "اش", LEMMA: "اش", NORM: "اش", TAG: "NOUN"}, - ], - "ازش": [ - {ORTH: "از", LEMMA: "از", NORM: "از", TAG: "ADP"}, - {ORTH: "ش", LEMMA: "ش", NORM: "ش", TAG: "NOUN"}, - ], - "ازین": [ - {ORTH: "از", LEMMA: "از", NORM: "از", TAG: "ADP"}, - {ORTH: "ین", LEMMA: "ین", NORM: "ین", TAG: "NOUN"}, - ], - "ازین‌هاست": [ - {ORTH: "از", LEMMA: "از", NORM: "از", TAG: "ADP"}, - {ORTH: "ین‌ها", 
LEMMA: "ین‌ها", NORM: "ین‌ها", TAG: "NOUN"}, - {ORTH: "ست", LEMMA: "ست", NORM: "ست", TAG: "VERB"}, - ], - "استخوانند": [ - {ORTH: "استخوان", LEMMA: "استخوان", NORM: "استخوان", TAG: "NOUN"}, - {ORTH: "ند", LEMMA: "ند", NORM: "ند", TAG: "VERB"}, - ], - "اسلامند": [ - {ORTH: "اسلام", LEMMA: "اسلام", NORM: "اسلام", TAG: "NOUN"}, - {ORTH: "ند", LEMMA: "ند", NORM: "ند", TAG: "VERB"}, - ], - "اسلامی‌اند": [ - {ORTH: "اسلامی‌", LEMMA: "اسلامی‌", NORM: "اسلامی‌", TAG: "ADJ"}, - {ORTH: "اند", LEMMA: "اند", NORM: "اند", TAG: "VERB"}, - ], - "اسلحه‌هایشان": [ - {ORTH: "اسلحه‌های", LEMMA: "اسلحه‌های", NORM: "اسلحه‌های", TAG: "NOUN"}, - {ORTH: "شان", LEMMA: "شان", NORM: "شان", TAG: "NOUN"}, - ], - "اسمت": [ - {ORTH: "اسم", LEMMA: "اسم", NORM: "اسم", TAG: "NOUN"}, - {ORTH: "ت", LEMMA: "ت", NORM: "ت", TAG: "NOUN"}, - ], - "اسمش": [ - {ORTH: "اسم", LEMMA: "اسم", NORM: "اسم", TAG: "NOUN"}, - {ORTH: "ش", LEMMA: "ش", NORM: "ش", TAG: "NOUN"}, - ], - "اشتباهند": [ - {ORTH: "اشتباه", LEMMA: "اشتباه", NORM: "اشتباه", TAG: "NOUN"}, - {ORTH: "ند", LEMMA: "ند", NORM: "ند", TAG: "VERB"}, - ], - "اصلش": [ - {ORTH: "اصل", LEMMA: "اصل", NORM: "اصل", TAG: "NOUN"}, - {ORTH: "ش", LEMMA: "ش", NORM: "ش", TAG: "NOUN"}, - ], - "اطاقش": [ - {ORTH: "اطاق", LEMMA: "اطاق", NORM: "اطاق", TAG: "NOUN"}, - {ORTH: "ش", LEMMA: "ش", NORM: "ش", TAG: "NOUN"}, - ], - "اعتقادند": [ - {ORTH: "اعتقاد", LEMMA: "اعتقاد", NORM: "اعتقاد", TAG: "NOUN"}, - {ORTH: "ند", LEMMA: "ند", NORM: "ند", TAG: "VERB"}, - ], - "اعلایش": [ - {ORTH: "اعلای", LEMMA: "اعلای", NORM: "اعلای", TAG: "ADJ"}, - {ORTH: "ش", LEMMA: "ش", NORM: "ش", TAG: "NOUN"}, - ], - "افتراست": [ - {ORTH: "افترا", LEMMA: "افترا", NORM: "افترا", TAG: "NOUN"}, - {ORTH: "ست", LEMMA: "ست", NORM: "ست", TAG: "VERB"}, - ], - "افطارت": [ - {ORTH: "افطار", LEMMA: "افطار", NORM: "افطار", TAG: "NOUN"}, - {ORTH: "ت", LEMMA: "ت", NORM: "ت", TAG: "NOUN"}, - ], - "اقوامش": [ - {ORTH: "اقوام", LEMMA: "اقوام", NORM: "اقوام", TAG: "NOUN"}, - {ORTH: "ش", LEMMA: "ش", NORM: "ش", TAG: "NOUN"}, - ], - "امروزیش": [ - {ORTH: "امروزی", LEMMA: "امروزی", NORM: "امروزی", TAG: "ADJ"}, - {ORTH: "ش", LEMMA: "ش", NORM: "ش", TAG: "NOUN"}, - ], - "اموالش": [ - {ORTH: "اموال", LEMMA: "اموال", NORM: "اموال", TAG: "NOUN"}, - {ORTH: "ش", LEMMA: "ش", NORM: "ش", TAG: "NOUN"}, - ], - "امیدوارند": [ - {ORTH: "امیدوار", LEMMA: "امیدوار", NORM: "امیدوار", TAG: "ADJ"}, - {ORTH: "ند", LEMMA: "ند", NORM: "ند", TAG: "VERB"}, - ], - "امیدواریم": [ - {ORTH: "امیدوار", LEMMA: "امیدوار", NORM: "امیدوار", TAG: "ADJ"}, - {ORTH: "یم", LEMMA: "یم", NORM: "یم", TAG: "VERB"}, - ], - "انتخابهایم": [ - {ORTH: "انتخابها", LEMMA: "انتخابها", NORM: "انتخابها", TAG: "NOUN"}, - {ORTH: "یم", LEMMA: "یم", NORM: "یم", TAG: "NOUN"}, - ], - "انتظارم": [ - {ORTH: "انتظار", LEMMA: "انتظار", NORM: "انتظار", TAG: "NOUN"}, - {ORTH: "م", LEMMA: "م", NORM: "م", TAG: "NOUN"}, - ], - "انجمنم": [ - {ORTH: "انجمن", LEMMA: "انجمن", NORM: "انجمن", TAG: "NOUN"}, - {ORTH: "م", LEMMA: "م", NORM: "م", TAG: "NOUN"}, - ], - "اندرش": [ - {ORTH: "اندر", LEMMA: "اندر", NORM: "اندر", TAG: "ADP"}, - {ORTH: "ش", LEMMA: "ش", NORM: "ش", TAG: "NOUN"}, - ], - "انشایش": [ - {ORTH: "انشای", LEMMA: "انشای", NORM: "انشای", TAG: "NOUN"}, - {ORTH: "ش", LEMMA: "ش", NORM: "ش", TAG: "NOUN"}, - ], - "انگشتشان": [ - {ORTH: "انگشت", LEMMA: "انگشت", NORM: "انگشت", TAG: "NOUN"}, - {ORTH: "شان", LEMMA: "شان", NORM: "شان", TAG: "NOUN"}, - ], - "انگشتهایش": [ - {ORTH: "انگشتهای", LEMMA: "انگشتهای", NORM: "انگشتهای", TAG: "NOUN"}, - {ORTH: "ش", LEMMA: "ش", NORM: "ش", TAG: "NOUN"}, - ], - "اهمیتشان": [ - 
{ORTH: "اهمیت", LEMMA: "اهمیت", NORM: "اهمیت", TAG: "NOUN"}, - {ORTH: "شان", LEMMA: "شان", NORM: "شان", TAG: "NOUN"}, - ], - "اهمیتند": [ - {ORTH: "اهمیت", LEMMA: "اهمیت", NORM: "اهمیت", TAG: "NOUN"}, - {ORTH: "ند", LEMMA: "ند", NORM: "ند", TAG: "VERB"}, - ], - "اوایلش": [ - {ORTH: "اوایل", LEMMA: "اوایل", NORM: "اوایل", TAG: "ADV"}, - {ORTH: "ش", LEMMA: "ش", NORM: "ش", TAG: "NOUN"}, - ], - "اوست": [ - {ORTH: "او", LEMMA: "او", NORM: "او", TAG: "NOUN"}, - {ORTH: "ست", LEMMA: "ست", NORM: "ست", TAG: "VERB"}, - ], - "اولش": [ - {ORTH: "اول", LEMMA: "اول", NORM: "اول", TAG: "ADV"}, - {ORTH: "ش", LEMMA: "ش", NORM: "ش", TAG: "NOUN"}, - ], - "اولشان": [ - {ORTH: "اول", LEMMA: "اول", NORM: "اول", TAG: "NOUN"}, - {ORTH: "شان", LEMMA: "شان", NORM: "شان", TAG: "NOUN"}, - ], - "اولم": [ - {ORTH: "اول", LEMMA: "اول", NORM: "اول", TAG: "ADJ"}, - {ORTH: "م", LEMMA: "م", NORM: "م", TAG: "NOUN"}, - ], - "اکثرشان": [ - {ORTH: "اکثر", LEMMA: "اکثر", NORM: "اکثر", TAG: "NOUN"}, - {ORTH: "شان", LEMMA: "شان", NORM: "شان", TAG: "NOUN"}, - ], - "ایتالیاست": [ - {ORTH: "ایتالیا", LEMMA: "ایتالیا", NORM: "ایتالیا", TAG: "NOUN"}, - {ORTH: "ست", LEMMA: "ست", NORM: "ست", TAG: "VERB"}, - ], - "ایرانی‌اش": [ - {ORTH: "ایرانی‌", LEMMA: "ایرانی‌", NORM: "ایرانی‌", TAG: "ADJ"}, - {ORTH: "اش", LEMMA: "اش", NORM: "اش", TAG: "NOUN"}, - ], - "اینجاست": [ - {ORTH: "اینجا", LEMMA: "اینجا", NORM: "اینجا", TAG: "ADV"}, - {ORTH: "ست", LEMMA: "ست", NORM: "ست", TAG: "VERB"}, - ], - "این‌هاست": [ - {ORTH: "این‌ها", LEMMA: "این‌ها", NORM: "این‌ها", TAG: "NOUN"}, - {ORTH: "ست", LEMMA: "ست", NORM: "ست", TAG: "VERB"}, - ], - "بابات": [ - {ORTH: "بابا", LEMMA: "بابا", NORM: "بابا", TAG: "NOUN"}, - {ORTH: "ت", LEMMA: "ت", NORM: "ت", TAG: "NOUN"}, - ], - "بارش": [ - {ORTH: "بار", LEMMA: "بار", NORM: "بار", TAG: "NOUN"}, - {ORTH: "ش", LEMMA: "ش", NORM: "ش", TAG: "NOUN"}, - ], - "بازیگرانش": [ - {ORTH: "بازیگران", LEMMA: "بازیگران", NORM: "بازیگران", TAG: "NOUN"}, - {ORTH: "ش", LEMMA: "ش", NORM: "ش", TAG: "NOUN"}, - ], - "بازیگرمان": [ - {ORTH: "بازیگر", LEMMA: "بازیگر", NORM: "بازیگر", TAG: "NOUN"}, - {ORTH: "مان", LEMMA: "مان", NORM: "مان", TAG: "NOUN"}, - ], - "بازیگرهایم": [ - {ORTH: "بازیگرها", LEMMA: "بازیگرها", NORM: "بازیگرها", TAG: "NOUN"}, - {ORTH: "یم", LEMMA: "یم", NORM: "یم", TAG: "NOUN"}, - ], - "بازی‌اش": [ - {ORTH: "بازی‌", LEMMA: "بازی‌", NORM: "بازی‌", TAG: "NOUN"}, - {ORTH: "اش", LEMMA: "اش", NORM: "اش", TAG: "NOUN"}, - ], - "بالاست": [ - {ORTH: "بالا", LEMMA: "بالا", NORM: "بالا", TAG: "ADV"}, - {ORTH: "ست", LEMMA: "ست", NORM: "ست", TAG: "VERB"}, - ], - "باورند": [ - {ORTH: "باور", LEMMA: "باور", NORM: "باور", TAG: "NOUN"}, - {ORTH: "ند", LEMMA: "ند", NORM: "ند", TAG: "VERB"}, - ], - "بجاست": [ - {ORTH: "بجا", LEMMA: "بجا", NORM: "بجا", TAG: "ADJ"}, - {ORTH: "ست", LEMMA: "ست", NORM: "ست", TAG: "VERB"}, - ], - "بدان": [ - {ORTH: "ب", LEMMA: "ب", NORM: "ب", TAG: "ADP"}, - {ORTH: "دان", LEMMA: "دان", NORM: "دان", TAG: "NOUN"}, - ], - "بدش": [ - {ORTH: "بد", LEMMA: "بد", NORM: "بد", TAG: "ADJ"}, - {ORTH: "ش", LEMMA: "ش", NORM: "ش", TAG: "NOUN"}, - ], - "بدشان": [ - {ORTH: "بد", LEMMA: "بد", NORM: "بد", TAG: "ADJ"}, - {ORTH: "شان", LEMMA: "شان", NORM: "شان", TAG: "NOUN"}, - ], - "بدنم": [ - {ORTH: "بدن", LEMMA: "بدن", NORM: "بدن", TAG: "NOUN"}, - {ORTH: "م", LEMMA: "م", NORM: "م", TAG: "NOUN"}, - ], - "بدهی‌ات": [ - {ORTH: "بدهی‌", LEMMA: "بدهی‌", NORM: "بدهی‌", TAG: "NOUN"}, - {ORTH: "ات", LEMMA: "ات", NORM: "ات", TAG: "NOUN"}, - ], - "بدین": [ - {ORTH: "ب", LEMMA: "ب", NORM: "ب", TAG: "ADP"}, - {ORTH: "دین", LEMMA: "دین", 
NORM: "دین", TAG: "NOUN"}, - ], - "برابرش": [ - {ORTH: "برابر", LEMMA: "برابر", NORM: "برابر", TAG: "ADP"}, - {ORTH: "ش", LEMMA: "ش", NORM: "ش", TAG: "NOUN"}, - ], - "برادرت": [ - {ORTH: "برادر", LEMMA: "برادر", NORM: "برادر", TAG: "NOUN"}, - {ORTH: "ت", LEMMA: "ت", NORM: "ت", TAG: "NOUN"}, - ], - "برادرش": [ - {ORTH: "برادر", LEMMA: "برادر", NORM: "برادر", TAG: "NOUN"}, - {ORTH: "ش", LEMMA: "ش", NORM: "ش", TAG: "NOUN"}, - ], - "برایت": [ - {ORTH: "برای", LEMMA: "برای", NORM: "برای", TAG: "ADP"}, - {ORTH: "ت", LEMMA: "ت", NORM: "ت", TAG: "NOUN"}, - ], - "برایتان": [ - {ORTH: "برای", LEMMA: "برای", NORM: "برای", TAG: "ADP"}, - {ORTH: "تان", LEMMA: "تان", NORM: "تان", TAG: "NOUN"}, - ], - "برایش": [ - {ORTH: "برای", LEMMA: "برای", NORM: "برای", TAG: "ADP"}, - {ORTH: "ش", LEMMA: "ش", NORM: "ش", TAG: "NOUN"}, - ], - "برایشان": [ - {ORTH: "برای", LEMMA: "برای", NORM: "برای", TAG: "ADP"}, - {ORTH: "شان", LEMMA: "شان", NORM: "شان", TAG: "NOUN"}, - ], - "برایم": [ - {ORTH: "برای", LEMMA: "برای", NORM: "برای", TAG: "ADP"}, - {ORTH: "م", LEMMA: "م", NORM: "م", TAG: "NOUN"}, - ], - "برایمان": [ - {ORTH: "برای", LEMMA: "برای", NORM: "برای", TAG: "ADP"}, - {ORTH: "مان", LEMMA: "مان", NORM: "مان", TAG: "NOUN"}, - ], - "برخوردارند": [ - {ORTH: "برخوردار", LEMMA: "برخوردار", NORM: "برخوردار", TAG: "ADJ"}, - {ORTH: "ند", LEMMA: "ند", NORM: "ند", TAG: "VERB"}, - ], - "برنامه‌سازهاست": [ - { - ORTH: "برنامه‌سازها", - LEMMA: "برنامه‌سازها", - NORM: "برنامه‌سازها", - TAG: "NOUN", - }, - {ORTH: "ست", LEMMA: "ست", NORM: "ست", TAG: "VERB"}, - ], - "برهمش": [ - {ORTH: "برهم", LEMMA: "برهم", NORM: "برهم", TAG: "ADJ"}, - {ORTH: "ش", LEMMA: "ش", NORM: "ش", TAG: "NOUN"}, - ], - "برهنه‌اش": [ - {ORTH: "برهنه‌", LEMMA: "برهنه‌", NORM: "برهنه‌", TAG: "ADJ"}, - {ORTH: "اش", LEMMA: "اش", NORM: "اش", TAG: "NOUN"}, - ], - "برگهایش": [ - {ORTH: "برگها", LEMMA: "برگها", NORM: "برگها", TAG: "NOUN"}, - {ORTH: "یش", LEMMA: "یش", NORM: "یش", TAG: "NOUN"}, - ], - "برین": [ - {ORTH: "بر", LEMMA: "بر", NORM: "بر", TAG: "ADP"}, - {ORTH: "ین", LEMMA: "ین", NORM: "ین", TAG: "NOUN"}, - ], - "بزرگش": [ - {ORTH: "بزرگ", LEMMA: "بزرگ", NORM: "بزرگ", TAG: "ADJ"}, - {ORTH: "ش", LEMMA: "ش", NORM: "ش", TAG: "NOUN"}, - ], - "بزرگ‌تری": [ - {ORTH: "بزرگ‌تر", LEMMA: "بزرگ‌تر", NORM: "بزرگ‌تر", TAG: "ADJ"}, - {ORTH: "ی", LEMMA: "ی", NORM: "ی", TAG: "VERB"}, - ], - "بساطش": [ - {ORTH: "بساط", LEMMA: "بساط", NORM: "بساط", TAG: "NOUN"}, - {ORTH: "ش", LEMMA: "ش", NORM: "ش", TAG: "NOUN"}, - ], - "بعدش": [ - {ORTH: "بعد", LEMMA: "بعد", NORM: "بعد", TAG: "ADJ"}, - {ORTH: "ش", LEMMA: "ش", NORM: "ش", TAG: "NOUN"}, - ], - "بعضیهایشان": [ - {ORTH: "بعضیهای", LEMMA: "بعضیهای", NORM: "بعضیهای", TAG: "NOUN"}, - {ORTH: "شان", LEMMA: "شان", NORM: "شان", TAG: "NOUN"}, - ], - "بعضی‌شان": [ - {ORTH: "بعضی", LEMMA: "بعضی", NORM: "بعضی", TAG: "NOUN"}, - {ORTH: "‌شان", LEMMA: "شان", NORM: "شان", TAG: "NOUN"}, - ], - "بقیه‌اش": [ - {ORTH: "بقیه‌", LEMMA: "بقیه‌", NORM: "بقیه‌", TAG: "NOUN"}, - {ORTH: "اش", LEMMA: "اش", NORM: "اش", TAG: "NOUN"}, - ], - "بلندش": [ - {ORTH: "بلند", LEMMA: "بلند", NORM: "بلند", TAG: "ADJ"}, - {ORTH: "ش", LEMMA: "ش", NORM: "ش", TAG: "NOUN"}, - ], - "بناگوشش": [ - {ORTH: "بناگوش", LEMMA: "بناگوش", NORM: "بناگوش", TAG: "NOUN"}, - {ORTH: "ش", LEMMA: "ش", NORM: "ش", TAG: "NOUN"}, - ], - "بنظرم": [ - {ORTH: "ب", LEMMA: "ب", NORM: "ب", TAG: "ADP"}, - {ORTH: "نظر", LEMMA: "نظر", NORM: "نظر", TAG: "NOUN"}, - {ORTH: "م", LEMMA: "م", NORM: "م", TAG: "NOUN"}, - ], - "بهت": [ - {ORTH: "به", LEMMA: "به", NORM: "به", TAG: "ADP"}, - {ORTH: "ت", LEMMA: "ت", 
NORM: "ت", TAG: "NOUN"}, - ], - "بهترش": [ - {ORTH: "بهتر", LEMMA: "بهتر", NORM: "بهتر", TAG: "ADJ"}, - {ORTH: "ش", LEMMA: "ش", NORM: "ش", TAG: "NOUN"}, - ], - "بهترم": [ - {ORTH: "بهتر", LEMMA: "بهتر", NORM: "بهتر", TAG: "ADJ"}, - {ORTH: "م", LEMMA: "م", NORM: "م", TAG: "VERB"}, - ], - "بهتری": [ - {ORTH: "بهتر", LEMMA: "بهتر", NORM: "بهتر", TAG: "ADJ"}, - {ORTH: "ی", LEMMA: "ی", NORM: "ی", TAG: "VERB"}, - ], - "بهش": [ - {ORTH: "به", LEMMA: "به", NORM: "به", TAG: "ADP"}, - {ORTH: "ش", LEMMA: "ش", NORM: "ش", TAG: "NOUN"}, - ], - "به‌شان": [ - {ORTH: "به‌", LEMMA: "به‌", NORM: "به‌", TAG: "ADP"}, - {ORTH: "شان", LEMMA: "شان", NORM: "شان", TAG: "NOUN"}, - ], - "بودمش": [ - {ORTH: "بودم", LEMMA: "بودم", NORM: "بودم", TAG: "VERB"}, - {ORTH: "ش", LEMMA: "ش", NORM: "ش", TAG: "NOUN"}, - ], - "بودنش": [ - {ORTH: "بودن", LEMMA: "بودن", NORM: "بودن", TAG: "NOUN"}, - {ORTH: "ش", LEMMA: "ش", NORM: "ش", TAG: "NOUN"}, - ], - "بودن‌شان": [ - {ORTH: "بودن‌", LEMMA: "بودن‌", NORM: "بودن‌", TAG: "NOUN"}, - {ORTH: "شان", LEMMA: "شان", NORM: "شان", TAG: "NOUN"}, - ], - "بوستانش": [ - {ORTH: "بوستان", LEMMA: "بوستان", NORM: "بوستان", TAG: "NOUN"}, - {ORTH: "ش", LEMMA: "ش", NORM: "ش", TAG: "NOUN"}, - ], - "بویش": [ - {ORTH: "بو", LEMMA: "بو", NORM: "بو", TAG: "NOUN"}, - {ORTH: "یش", LEMMA: "یش", NORM: "یش", TAG: "NOUN"}, - ], - "بچه‌اش": [ - {ORTH: "بچه‌", LEMMA: "بچه‌", NORM: "بچه‌", TAG: "NOUN"}, - {ORTH: "اش", LEMMA: "اش", NORM: "اش", TAG: "NOUN"}, - ], - "بچه‌م": [ - {ORTH: "بچه‌", LEMMA: "بچه‌", NORM: "بچه‌", TAG: "NOUN"}, - {ORTH: "م", LEMMA: "م", NORM: "م", TAG: "NOUN"}, - ], - "بچه‌هایش": [ - {ORTH: "بچه‌های", LEMMA: "بچه‌های", NORM: "بچه‌های", TAG: "NOUN"}, - {ORTH: "ش", LEMMA: "ش", NORM: "ش", TAG: "NOUN"}, - ], - "بیانیه‌شان": [ - {ORTH: "بیانیه‌", LEMMA: "بیانیه‌", NORM: "بیانیه‌", TAG: "NOUN"}, - {ORTH: "شان", LEMMA: "شان", NORM: "شان", TAG: "NOUN"}, - ], - "بیدارم": [ - {ORTH: "بیدار", LEMMA: "بیدار", NORM: "بیدار", TAG: "ADJ"}, - {ORTH: "م", LEMMA: "م", NORM: "م", TAG: "NOUN"}, - ], - "بیناتری": [ - {ORTH: "بیناتر", LEMMA: "بیناتر", NORM: "بیناتر", TAG: "ADJ"}, - {ORTH: "ی", LEMMA: "ی", NORM: "ی", TAG: "VERB"}, - ], - "بی‌اطلاعند": [ - {ORTH: "بی‌اطلاع", LEMMA: "بی‌اطلاع", NORM: "بی‌اطلاع", TAG: "ADJ"}, - {ORTH: "ند", LEMMA: "ند", NORM: "ند", TAG: "VERB"}, - ], - "بی‌اطلاعید": [ - {ORTH: "بی‌اطلاع", LEMMA: "بی‌اطلاع", NORM: "بی‌اطلاع", TAG: "ADJ"}, - {ORTH: "ید", LEMMA: "ید", NORM: "ید", TAG: "VERB"}, - ], - "بی‌بهره‌اند": [ - {ORTH: "بی‌بهره‌", LEMMA: "بی‌بهره‌", NORM: "بی‌بهره‌", TAG: "ADJ"}, - {ORTH: "اند", LEMMA: "اند", NORM: "اند", TAG: "VERB"}, - ], - "بی‌تفاوتند": [ - {ORTH: "بی‌تفاوت", LEMMA: "بی‌تفاوت", NORM: "بی‌تفاوت", TAG: "ADJ"}, - {ORTH: "ند", LEMMA: "ند", NORM: "ند", TAG: "VERB"}, - ], - "بی‌حسابش": [ - {ORTH: "بی‌حساب", LEMMA: "بی‌حساب", NORM: "بی‌حساب", TAG: "ADJ"}, - {ORTH: "ش", LEMMA: "ش", NORM: "ش", TAG: "NOUN"}, - ], - "بی‌نیش": [ - {ORTH: "بی‌نی", LEMMA: "بی‌نی", NORM: "بی‌نی", TAG: "ADJ"}, - {ORTH: "ش", LEMMA: "ش", NORM: "ش", TAG: "NOUN"}, - ], - "تجربه‌هایم": [ - {ORTH: "تجربه‌ها", LEMMA: "تجربه‌ها", NORM: "تجربه‌ها", TAG: "NOUN"}, - {ORTH: "یم", LEMMA: "یم", NORM: "یم", TAG: "NOUN"}, - ], - "تحریم‌هاست": [ - {ORTH: "تحریم‌ها", LEMMA: "تحریم‌ها", NORM: "تحریم‌ها", TAG: "NOUN"}, - {ORTH: "ست", LEMMA: "ست", NORM: "ست", TAG: "VERB"}, - ], - "تحولند": [ - {ORTH: "تحول", LEMMA: "تحول", NORM: "تحول", TAG: "NOUN"}, - {ORTH: "ند", LEMMA: "ند", NORM: "ند", TAG: "VERB"}, - ], - "تخیلی‌اش": [ - {ORTH: "تخیلی‌", LEMMA: "تخیلی‌", NORM: "تخیلی‌", TAG: "ADJ"}, - {ORTH: "اش", LEMMA: "اش", 
NORM: "اش", TAG: "NOUN"}, - ], - "ترا": [ - {ORTH: "ت", LEMMA: "ت", NORM: "ت", TAG: "NOUN"}, - {ORTH: "را", LEMMA: "را", NORM: "را", TAG: "PART"}, - ], - "ترسشان": [ - {ORTH: "ترس", LEMMA: "ترس", NORM: "ترس", TAG: "NOUN"}, - {ORTH: "شان", LEMMA: "شان", NORM: "شان", TAG: "NOUN"}, - ], - "ترکش": [ - {ORTH: "ترک", LEMMA: "ترک", NORM: "ترک", TAG: "NOUN"}, - {ORTH: "ش", LEMMA: "ش", NORM: "ش", TAG: "NOUN"}, - ], - "تشنه‌ت": [ - {ORTH: "تشنه‌", LEMMA: "تشنه‌", NORM: "تشنه‌", TAG: "NOUN"}, - {ORTH: "ت", LEMMA: "ت", NORM: "ت", TAG: "NOUN"}, - ], - "تشکیلاتی‌اش": [ - {ORTH: "تشکیلاتی‌", LEMMA: "تشکیلاتی‌", NORM: "تشکیلاتی‌", TAG: "ADJ"}, - {ORTH: "اش", LEMMA: "اش", NORM: "اش", TAG: "NOUN"}, - ], - "تعلقش": [ - {ORTH: "تعلق", LEMMA: "تعلق", NORM: "تعلق", TAG: "NOUN"}, - {ORTH: "ش", LEMMA: "ش", NORM: "ش", TAG: "NOUN"}, - ], - "تلاششان": [ - {ORTH: "تلاش", LEMMA: "تلاش", NORM: "تلاش", TAG: "NOUN"}, - {ORTH: "شان", LEMMA: "شان", NORM: "شان", TAG: "NOUN"}, - ], - "تلاشمان": [ - {ORTH: "تلاش", LEMMA: "تلاش", NORM: "تلاش", TAG: "NOUN"}, - {ORTH: "مان", LEMMA: "مان", NORM: "مان", TAG: "NOUN"}, - ], - "تماشاگرش": [ - {ORTH: "تماشاگر", LEMMA: "تماشاگر", NORM: "تماشاگر", TAG: "NOUN"}, - {ORTH: "ش", LEMMA: "ش", NORM: "ش", TAG: "NOUN"}, - ], - "تمامشان": [ - {ORTH: "تمام", LEMMA: "تمام", NORM: "تمام", TAG: "NOUN"}, - {ORTH: "شان", LEMMA: "شان", NORM: "شان", TAG: "NOUN"}, - ], - "تنش": [ - {ORTH: "تن", LEMMA: "تن", NORM: "تن", TAG: "NOUN"}, - {ORTH: "ش", LEMMA: "ش", NORM: "ش", TAG: "NOUN"}, - ], - "تنمان": [ - {ORTH: "تن", LEMMA: "تن", NORM: "تن", TAG: "NOUN"}, - {ORTH: "مان", LEMMA: "مان", NORM: "مان", TAG: "NOUN"}, - ], - "تنهایی‌اش": [ - {ORTH: "تنهایی‌", LEMMA: "تنهایی‌", NORM: "تنهایی‌", TAG: "NOUN"}, - {ORTH: "اش", LEMMA: "اش", NORM: "اش", TAG: "NOUN"}, - ], - "توانایی‌اش": [ - {ORTH: "توانایی‌", LEMMA: "توانایی‌", NORM: "توانایی‌", TAG: "NOUN"}, - {ORTH: "اش", LEMMA: "اش", NORM: "اش", TAG: "NOUN"}, - ], - "توجهش": [ - {ORTH: "توجه", LEMMA: "توجه", NORM: "توجه", TAG: "NOUN"}, - {ORTH: "ش", LEMMA: "ش", NORM: "ش", TAG: "NOUN"}, - ], - "توست": [ - {ORTH: "تو", LEMMA: "تو", NORM: "تو", TAG: "NOUN"}, - {ORTH: "ست", LEMMA: "ست", NORM: "ست", TAG: "VERB"}, - ], - "توصیه‌اش": [ - {ORTH: "توصیه‌", LEMMA: "توصیه‌", NORM: "توصیه‌", TAG: "NOUN"}, - {ORTH: "اش", LEMMA: "اش", NORM: "اش", TAG: "NOUN"}, - ], - "تیغه‌اش": [ - {ORTH: "تیغه‌", LEMMA: "تیغه‌", NORM: "تیغه‌", TAG: "NOUN"}, - {ORTH: "اش", LEMMA: "اش", NORM: "اش", TAG: "NOUN"}, - ], - "جاست": [ - {ORTH: "جا", LEMMA: "جا", NORM: "جا", TAG: "NOUN"}, - {ORTH: "ست", LEMMA: "ست", NORM: "ست", TAG: "VERB"}, - ], - "جامعه‌اند": [ - {ORTH: "جامعه‌", LEMMA: "جامعه‌", NORM: "جامعه‌", TAG: "NOUN"}, - {ORTH: "اند", LEMMA: "اند", NORM: "اند", TAG: "VERB"}, - ], - "جانم": [ - {ORTH: "جان", LEMMA: "جان", NORM: "جان", TAG: "NOUN"}, - {ORTH: "م", LEMMA: "م", NORM: "م", TAG: "NOUN"}, - ], - "جایش": [ - {ORTH: "جای", LEMMA: "جای", NORM: "جای", TAG: "NOUN"}, - {ORTH: "ش", LEMMA: "ش", NORM: "ش", TAG: "NOUN"}, - ], - "جایشان": [ - {ORTH: "جای", LEMMA: "جای", NORM: "جای", TAG: "NOUN"}, - {ORTH: "شان", LEMMA: "شان", NORM: "شان", TAG: "NOUN"}, - ], - "جدیدش": [ - {ORTH: "جدید", LEMMA: "جدید", NORM: "جدید", TAG: "ADJ"}, - {ORTH: "ش", LEMMA: "ش", NORM: "ش", TAG: "NOUN"}, - ], - "جرمزاست": [ - {ORTH: "جرمزا", LEMMA: "جرمزا", NORM: "جرمزا", TAG: "ADJ"}, - {ORTH: "ست", LEMMA: "ست", NORM: "ست", TAG: "VERB"}, - ], - "جلوست": [ - {ORTH: "جلو", LEMMA: "جلو", NORM: "جلو", TAG: "NOUN"}, - {ORTH: "ست", LEMMA: "ست", NORM: "ست", TAG: "VERB"}, - ], - "جلویش": [ - {ORTH: "جلوی", LEMMA: "جلوی", NORM: 
"جلوی", TAG: "NOUN"}, - {ORTH: "ش", LEMMA: "ش", NORM: "ش", TAG: "NOUN"}, - ], - "جمهوریست": [ - {ORTH: "جمهوری", LEMMA: "جمهوری", NORM: "جمهوری", TAG: "NOUN"}, - {ORTH: "ست", LEMMA: "ست", NORM: "ست", TAG: "VERB"}, - ], - "جنسش": [ - {ORTH: "جنس", LEMMA: "جنس", NORM: "جنس", TAG: "NOUN"}, - {ORTH: "ش", LEMMA: "ش", NORM: "ش", TAG: "NOUN"}, - ], - "جنس‌اند": [ - {ORTH: "جنس‌", LEMMA: "جنس‌", NORM: "جنس‌", TAG: "NOUN"}, - {ORTH: "اند", LEMMA: "اند", NORM: "اند", TAG: "VERB"}, - ], - "جوانانش": [ - {ORTH: "جوانان", LEMMA: "جوانان", NORM: "جوانان", TAG: "NOUN"}, - {ORTH: "ش", LEMMA: "ش", NORM: "ش", TAG: "NOUN"}, - ], - "جویش": [ - {ORTH: "جوی", LEMMA: "جوی", NORM: "جوی", TAG: "NOUN"}, - {ORTH: "ش", LEMMA: "ش", NORM: "ش", TAG: "NOUN"}, - ], - "جگرش": [ - {ORTH: "جگر", LEMMA: "جگر", NORM: "جگر", TAG: "NOUN"}, - {ORTH: "ش", LEMMA: "ش", NORM: "ش", TAG: "NOUN"}, - ], - "حاضرم": [ - {ORTH: "حاضر", LEMMA: "حاضر", NORM: "حاضر", TAG: "ADJ"}, - {ORTH: "م", LEMMA: "م", NORM: "م", TAG: "VERB"}, - ], - "حالتهایشان": [ - {ORTH: "حالتهای", LEMMA: "حالتهای", NORM: "حالتهای", TAG: "NOUN"}, - {ORTH: "شان", LEMMA: "شان", NORM: "شان", TAG: "NOUN"}, - ], - "حالیست": [ - {ORTH: "حالی", LEMMA: "حالی", NORM: "حالی", TAG: "NOUN"}, - {ORTH: "ست", LEMMA: "ست", NORM: "ست", TAG: "VERB"}, - ], - "حالی‌مان": [ - {ORTH: "حالی‌", LEMMA: "حالی‌", NORM: "حالی‌", TAG: "NOUN"}, - {ORTH: "مان", LEMMA: "مان", NORM: "مان", TAG: "NOUN"}, - ], - "حاکیست": [ - {ORTH: "حاکی", LEMMA: "حاکی", NORM: "حاکی", TAG: "ADJ"}, - {ORTH: "ست", LEMMA: "ست", NORM: "ست", TAG: "VERB"}, - ], - "حرامزادگی‌اش": [ - {ORTH: "حرامزادگی‌", LEMMA: "حرامزادگی‌", NORM: "حرامزادگی‌", TAG: "NOUN"}, - {ORTH: "اش", LEMMA: "اش", NORM: "اش", TAG: "NOUN"}, - ], - "حرفتان": [ - {ORTH: "حرف", LEMMA: "حرف", NORM: "حرف", TAG: "NOUN"}, - {ORTH: "تان", LEMMA: "تان", NORM: "تان", TAG: "NOUN"}, - ], - "حرفش": [ - {ORTH: "حرف", LEMMA: "حرف", NORM: "حرف", TAG: "NOUN"}, - {ORTH: "ش", LEMMA: "ش", NORM: "ش", TAG: "NOUN"}, - ], - "حرفشان": [ - {ORTH: "حرف", LEMMA: "حرف", NORM: "حرف", TAG: "NOUN"}, - {ORTH: "شان", LEMMA: "شان", NORM: "شان", TAG: "NOUN"}, - ], - "حرفم": [ - {ORTH: "حرف", LEMMA: "حرف", NORM: "حرف", TAG: "NOUN"}, - {ORTH: "م", LEMMA: "م", NORM: "م", TAG: "NOUN"}, - ], - "حرف‌های‌شان": [ - {ORTH: "حرف‌های‌", LEMMA: "حرف‌های‌", NORM: "حرف‌های‌", TAG: "NOUN"}, - {ORTH: "شان", LEMMA: "شان", NORM: "شان", TAG: "NOUN"}, - ], - "حرکتمان": [ - {ORTH: "حرکت", LEMMA: "حرکت", NORM: "حرکت", TAG: "NOUN"}, - {ORTH: "مان", LEMMA: "مان", NORM: "مان", TAG: "NOUN"}, - ], - "حریفانشان": [ - {ORTH: "حریفان", LEMMA: "حریفان", NORM: "حریفان", TAG: "NOUN"}, - {ORTH: "شان", LEMMA: "شان", NORM: "شان", TAG: "NOUN"}, - ], - "حضورشان": [ - {ORTH: "حضور", LEMMA: "حضور", NORM: "حضور", TAG: "NOUN"}, - {ORTH: "شان", LEMMA: "شان", NORM: "شان", TAG: "NOUN"}, - ], - "حمایتش": [ - {ORTH: "حمایت", LEMMA: "حمایت", NORM: "حمایت", TAG: "NOUN"}, - {ORTH: "ش", LEMMA: "ش", NORM: "ش", TAG: "NOUN"}, - ], - "حواسش": [ - {ORTH: "حواس", LEMMA: "حواس", NORM: "حواس", TAG: "NOUN"}, - {ORTH: "ش", LEMMA: "ش", NORM: "ش", TAG: "NOUN"}, - ], - "حواسشان": [ - {ORTH: "حواس", LEMMA: "حواس", NORM: "حواس", TAG: "NOUN"}, - {ORTH: "شان", LEMMA: "شان", NORM: "شان", TAG: "NOUN"}, - ], - "حوصله‌مان": [ - {ORTH: "حوصله‌", LEMMA: "حوصله‌", NORM: "حوصله‌", TAG: "NOUN"}, - {ORTH: "مان", LEMMA: "مان", NORM: "مان", TAG: "NOUN"}, - ], - "حکومتش": [ - {ORTH: "حکومت", LEMMA: "حکومت", NORM: "حکومت", TAG: "NOUN"}, - {ORTH: "ش", LEMMA: "ش", NORM: "ش", TAG: "NOUN"}, - ], - "حکومتشان": [ - {ORTH: "حکومت", LEMMA: "حکومت", NORM: "حکومت", TAG: "NOUN"}, - 
{ORTH: "شان", LEMMA: "شان", NORM: "شان", TAG: "NOUN"}, - ], - "حیفم": [ - {ORTH: "حیف", LEMMA: "حیف", NORM: "حیف", TAG: "NOUN"}, - {ORTH: "م", LEMMA: "م", NORM: "م", TAG: "NOUN"}, - ], - "خاندانش": [ - {ORTH: "خاندان", LEMMA: "خاندان", NORM: "خاندان", TAG: "NOUN"}, - {ORTH: "ش", LEMMA: "ش", NORM: "ش", TAG: "NOUN"}, - ], - "خانه‌اش": [ - {ORTH: "خانه‌", LEMMA: "خانه‌", NORM: "خانه‌", TAG: "NOUN"}, - {ORTH: "اش", LEMMA: "اش", NORM: "اش", TAG: "NOUN"}, - ], - "خانه‌شان": [ - {ORTH: "خانه‌", LEMMA: "خانه‌", NORM: "خانه‌", TAG: "NOUN"}, - {ORTH: "شان", LEMMA: "شان", NORM: "شان", TAG: "NOUN"}, - ], - "خانه‌مان": [ - {ORTH: "خانه‌", LEMMA: "خانه‌", NORM: "خانه‌", TAG: "NOUN"}, - {ORTH: "مان", LEMMA: "مان", NORM: "مان", TAG: "NOUN"}, - ], - "خانه‌هایشان": [ - {ORTH: "خانه‌های", LEMMA: "خانه‌های", NORM: "خانه‌های", TAG: "NOUN"}, - {ORTH: "شان", LEMMA: "شان", NORM: "شان", TAG: "NOUN"}, - ], - "خانواده‌ات": [ - {ORTH: "خانواده", LEMMA: "خانواده", NORM: "خانواده", TAG: "NOUN"}, - {ORTH: "‌ات", LEMMA: "ات", NORM: "ات", TAG: "NOUN"}, - ], - "خانواده‌اش": [ - {ORTH: "خانواده‌", LEMMA: "خانواده‌", NORM: "خانواده‌", TAG: "NOUN"}, - {ORTH: "اش", LEMMA: "اش", NORM: "اش", TAG: "NOUN"}, - ], - "خانواده‌ام": [ - {ORTH: "خانواده‌", LEMMA: "خانواده‌", NORM: "خانواده‌", TAG: "NOUN"}, - {ORTH: "ام", LEMMA: "ام", NORM: "ام", TAG: "NOUN"}, - ], - "خانواده‌شان": [ - {ORTH: "خانواده‌", LEMMA: "خانواده‌", NORM: "خانواده‌", TAG: "NOUN"}, - {ORTH: "شان", LEMMA: "شان", NORM: "شان", TAG: "NOUN"}, - ], - "خداست": [ - {ORTH: "خدا", LEMMA: "خدا", NORM: "خدا", TAG: "NOUN"}, - {ORTH: "ست", LEMMA: "ست", NORM: "ست", TAG: "VERB"}, - ], - "خدایش": [ - {ORTH: "خدا", LEMMA: "خدا", NORM: "خدا", TAG: "NOUN"}, - {ORTH: "یش", LEMMA: "یش", NORM: "یش", TAG: "NOUN"}, - ], - "خدایشان": [ - {ORTH: "خدای", LEMMA: "خدای", NORM: "خدای", TAG: "NOUN"}, - {ORTH: "شان", LEMMA: "شان", NORM: "شان", TAG: "NOUN"}, - ], - "خردسالش": [ - {ORTH: "خردسال", LEMMA: "خردسال", NORM: "خردسال", TAG: "ADJ"}, - {ORTH: "ش", LEMMA: "ش", NORM: "ش", TAG: "NOUN"}, - ], - "خروپفشان": [ - {ORTH: "خروپف", LEMMA: "خروپف", NORM: "خروپف", TAG: "NOUN"}, - {ORTH: "شان", LEMMA: "شان", NORM: "شان", TAG: "NOUN"}, - ], - "خسته‌ای": [ - {ORTH: "خسته‌", LEMMA: "خسته‌", NORM: "خسته‌", TAG: "ADJ"}, - {ORTH: "ای", LEMMA: "ای", NORM: "ای", TAG: "VERB"}, - ], - "خطت": [ - {ORTH: "خط", LEMMA: "خط", NORM: "خط", TAG: "NOUN"}, - {ORTH: "ت", LEMMA: "ت", NORM: "ت", TAG: "NOUN"}, - ], - "خوابمان": [ - {ORTH: "خواب", LEMMA: "خواب", NORM: "خواب", TAG: "NOUN"}, - {ORTH: "مان", LEMMA: "مان", NORM: "مان", TAG: "NOUN"}, - ], - "خواندنش": [ - {ORTH: "خواندن", LEMMA: "خواندن", NORM: "خواندن", TAG: "NOUN"}, - {ORTH: "ش", LEMMA: "ش", NORM: "ش", TAG: "NOUN"}, - ], - "خواهرش": [ - {ORTH: "خواهر", LEMMA: "خواهر", NORM: "خواهر", TAG: "NOUN"}, - {ORTH: "ش", LEMMA: "ش", NORM: "ش", TAG: "NOUN"}, - ], - "خوبش": [ - {ORTH: "خوب", LEMMA: "خوب", NORM: "خوب", TAG: "ADJ"}, - {ORTH: "ش", LEMMA: "ش", NORM: "ش", TAG: "NOUN"}, - ], - "خودت": [ - {ORTH: "خود", LEMMA: "خود", NORM: "خود", TAG: "NOUN"}, - {ORTH: "ت", LEMMA: "ت", NORM: "ت", TAG: "NOUN"}, - ], - "خودتان": [ - {ORTH: "خود", LEMMA: "خود", NORM: "خود", TAG: "NOUN"}, - {ORTH: "تان", LEMMA: "تان", NORM: "تان", TAG: "NOUN"}, - ], - "خودش": [ - {ORTH: "خود", LEMMA: "خود", NORM: "خود", TAG: "NOUN"}, - {ORTH: "ش", LEMMA: "ش", NORM: "ش", TAG: "NOUN"}, - ], - "خودشان": [ - {ORTH: "خود", LEMMA: "خود", NORM: "خود", TAG: "NOUN"}, - {ORTH: "شان", LEMMA: "شان", NORM: "شان", TAG: "NOUN"}, - ], - "خودمان": [ - {ORTH: "خود", LEMMA: "خود", NORM: "خود", TAG: "NOUN"}, - {ORTH: "مان", 
LEMMA: "مان", NORM: "مان", TAG: "NOUN"}, - ], - "خوردمان": [ - {ORTH: "خورد", LEMMA: "خورد", NORM: "خورد", TAG: "NOUN"}, - {ORTH: "مان", LEMMA: "مان", NORM: "مان", TAG: "NOUN"}, - ], - "خوردنشان": [ - {ORTH: "خوردن", LEMMA: "خوردن", NORM: "خوردن", TAG: "NOUN"}, - {ORTH: "شان", LEMMA: "شان", NORM: "شان", TAG: "NOUN"}, - ], - "خوشش": [ - {ORTH: "خوش", LEMMA: "خوش", NORM: "خوش", TAG: "ADJ"}, - {ORTH: "ش", LEMMA: "ش", NORM: "ش", TAG: "NOUN"}, - ], - "خوشوقتم": [ - {ORTH: "خوشوقت", LEMMA: "خوشوقت", NORM: "خوشوقت", TAG: "ADJ"}, - {ORTH: "م", LEMMA: "م", NORM: "م", TAG: "VERB"}, - ], - "خونشان": [ - {ORTH: "خون", LEMMA: "خون", NORM: "خون", TAG: "NOUN"}, - {ORTH: "شان", LEMMA: "شان", NORM: "شان", TAG: "NOUN"}, - ], - "خویش": [ - {ORTH: "خوی", LEMMA: "خوی", NORM: "خوی", TAG: "NOUN"}, - {ORTH: "ش", LEMMA: "ش", NORM: "ش", TAG: "NOUN"}, - ], - "خویشتنم": [ - {ORTH: "خویشتن", LEMMA: "خویشتن", NORM: "خویشتن", TAG: "VERB"}, - {ORTH: "م", LEMMA: "م", NORM: "م", TAG: "NOUN"}, - ], - "خیالش": [ - {ORTH: "خیال", LEMMA: "خیال", NORM: "خیال", TAG: "NOUN"}, - {ORTH: "ش", LEMMA: "ش", NORM: "ش", TAG: "NOUN"}, - ], - "خیسش": [ - {ORTH: "خیس", LEMMA: "خیس", NORM: "خیس", TAG: "ADJ"}, - {ORTH: "ش", LEMMA: "ش", NORM: "ش", TAG: "NOUN"}, - ], - "داراست": [ - {ORTH: "دارا", LEMMA: "دارا", NORM: "دارا", TAG: "ADJ"}, - {ORTH: "ست", LEMMA: "ست", NORM: "ست", TAG: "VERB"}, - ], - "داستانهایش": [ - {ORTH: "داستانهای", LEMMA: "داستانهای", NORM: "داستانهای", TAG: "NOUN"}, - {ORTH: "ش", LEMMA: "ش", NORM: "ش", TAG: "NOUN"}, - ], - "دخترمان": [ - {ORTH: "دختر", LEMMA: "دختر", NORM: "دختر", TAG: "NOUN"}, - {ORTH: "مان", LEMMA: "مان", NORM: "مان", TAG: "NOUN"}, - ], - "دخیلند": [ - {ORTH: "دخیل", LEMMA: "دخیل", NORM: "دخیل", TAG: "ADJ"}, - {ORTH: "ند", LEMMA: "ند", NORM: "ند", TAG: "VERB"}, - ], - "درباره‌ات": [ - {ORTH: "درباره", LEMMA: "درباره", NORM: "درباره", TAG: "ADP"}, - {ORTH: "‌ات", LEMMA: "ات", NORM: "ات", TAG: "NOUN"}, - ], - "درباره‌اش": [ - {ORTH: "درباره‌", LEMMA: "درباره‌", NORM: "درباره‌", TAG: "ADP"}, - {ORTH: "اش", LEMMA: "اش", NORM: "اش", TAG: "NOUN"}, - ], - "دردش": [ - {ORTH: "درد", LEMMA: "درد", NORM: "درد", TAG: "NOUN"}, - {ORTH: "ش", LEMMA: "ش", NORM: "ش", TAG: "NOUN"}, - ], - "دردشان": [ - {ORTH: "درد", LEMMA: "درد", NORM: "درد", TAG: "NOUN"}, - {ORTH: "شان", LEMMA: "شان", NORM: "شان", TAG: "NOUN"}, - ], - "درسته": [ - {ORTH: "درست", LEMMA: "درست", NORM: "درست", TAG: "ADJ"}, - {ORTH: "ه", LEMMA: "ه", NORM: "ه", TAG: "VERB"}, - ], - "درش": [ - {ORTH: "در", LEMMA: "در", NORM: "در", TAG: "NOUN"}, - {ORTH: "ش", LEMMA: "ش", NORM: "ش", TAG: "NOUN"}, - ], - "درون‌شان": [ - {ORTH: "درون‌", LEMMA: "درون‌", NORM: "درون‌", TAG: "NOUN"}, - {ORTH: "شان", LEMMA: "شان", NORM: "شان", TAG: "NOUN"}, - ], - "درین": [ - {ORTH: "در", LEMMA: "در", NORM: "در", TAG: "ADP"}, - {ORTH: "ین", LEMMA: "ین", NORM: "ین", TAG: "NOUN"}, - ], - "دریچه‌هایش": [ - {ORTH: "دریچه‌های", LEMMA: "دریچه‌های", NORM: "دریچه‌های", TAG: "NOUN"}, - {ORTH: "ش", LEMMA: "ش", NORM: "ش", TAG: "NOUN"}, - ], - "دزدانش": [ - {ORTH: "دزدان", LEMMA: "دزدان", NORM: "دزدان", TAG: "NOUN"}, - {ORTH: "ش", LEMMA: "ش", NORM: "ش", TAG: "NOUN"}, - ], - "دستت": [ - {ORTH: "دست", LEMMA: "دست", NORM: "دست", TAG: "NOUN"}, - {ORTH: "ت", LEMMA: "ت", NORM: "ت", TAG: "NOUN"}, - ], - "دستش": [ - {ORTH: "دست", LEMMA: "دست", NORM: "دست", TAG: "NOUN"}, - {ORTH: "ش", LEMMA: "ش", NORM: "ش", TAG: "NOUN"}, - ], - "دستمان": [ - {ORTH: "دست", LEMMA: "دست", NORM: "دست", TAG: "NOUN"}, - {ORTH: "مان", LEMMA: "مان", NORM: "مان", TAG: "NOUN"}, - ], - "دستهایشان": [ - {ORTH: "دستهای", LEMMA: 
"دستهای", NORM: "دستهای", TAG: "NOUN"}, - {ORTH: "شان", LEMMA: "شان", NORM: "شان", TAG: "NOUN"}, - ], - "دست‌یافتنی‌ست": [ - { - ORTH: "دست‌یافتنی‌", - LEMMA: "دست‌یافتنی‌", - NORM: "دست‌یافتنی‌", - TAG: "ADJ", - }, - {ORTH: "ست", LEMMA: "ست", NORM: "ست", TAG: "VERB"}, - ], - "دشمنند": [ - {ORTH: "دشمن", LEMMA: "دشمن", NORM: "دشمن", TAG: "NOUN"}, - {ORTH: "ند", LEMMA: "ند", NORM: "ند", TAG: "VERB"}, - ], - "دشمنیشان": [ - {ORTH: "دشمنی", LEMMA: "دشمنی", NORM: "دشمنی", TAG: "NOUN"}, - {ORTH: "شان", LEMMA: "شان", NORM: "شان", TAG: "NOUN"}, - ], - "دشمنیم": [ - {ORTH: "دشمن", LEMMA: "دشمن", NORM: "دشمن", TAG: "NOUN"}, - {ORTH: "یم", LEMMA: "یم", NORM: "یم", TAG: "VERB"}, - ], - "دفترش": [ - {ORTH: "دفتر", LEMMA: "دفتر", NORM: "دفتر", TAG: "NOUN"}, - {ORTH: "ش", LEMMA: "ش", NORM: "ش", TAG: "NOUN"}, - ], - "دفنشان": [ - {ORTH: "دفن", LEMMA: "دفن", NORM: "دفن", TAG: "NOUN"}, - {ORTH: "شان", LEMMA: "شان", NORM: "شان", TAG: "NOUN"}, - ], - "دلت": [ - {ORTH: "دل", LEMMA: "دل", NORM: "دل", TAG: "NOUN"}, - {ORTH: "ت", LEMMA: "ت", NORM: "ت", TAG: "NOUN"}, - ], - "دلش": [ - {ORTH: "دل", LEMMA: "دل", NORM: "دل", TAG: "NOUN"}, - {ORTH: "ش", LEMMA: "ش", NORM: "ش", TAG: "NOUN"}, - ], - "دلشان": [ - {ORTH: "دل", LEMMA: "دل", NORM: "دل", TAG: "NOUN"}, - {ORTH: "شان", LEMMA: "شان", NORM: "شان", TAG: "NOUN"}, - ], - "دلم": [ - {ORTH: "دل", LEMMA: "دل", NORM: "دل", TAG: "NOUN"}, - {ORTH: "م", LEMMA: "م", NORM: "م", TAG: "NOUN"}, - ], - "دلیلش": [ - {ORTH: "دلیل", LEMMA: "دلیل", NORM: "دلیل", TAG: "NOUN"}, - {ORTH: "ش", LEMMA: "ش", NORM: "ش", TAG: "NOUN"}, - ], - "دنبالش": [ - {ORTH: "دنبال", LEMMA: "دنبال", NORM: "دنبال", TAG: "NOUN"}, - {ORTH: "ش", LEMMA: "ش", NORM: "ش", TAG: "NOUN"}, - ], - "دنباله‌اش": [ - {ORTH: "دنباله‌", LEMMA: "دنباله‌", NORM: "دنباله‌", TAG: "NOUN"}, - {ORTH: "اش", LEMMA: "اش", NORM: "اش", TAG: "NOUN"}, - ], - "دهاتی‌هایش": [ - {ORTH: "دهاتی‌های", LEMMA: "دهاتی‌های", NORM: "دهاتی‌های", TAG: "NOUN"}, - {ORTH: "ش", LEMMA: "ش", NORM: "ش", TAG: "NOUN"}, - ], - "دهانت": [ - {ORTH: "دهان", LEMMA: "دهان", NORM: "دهان", TAG: "NOUN"}, - {ORTH: "ت", LEMMA: "ت", NORM: "ت", TAG: "NOUN"}, - ], - "دهنش": [ - {ORTH: "دهن", LEMMA: "دهن", NORM: "دهن", TAG: "NOUN"}, - {ORTH: "ش", LEMMA: "ش", NORM: "ش", TAG: "NOUN"}, - ], - "دورش": [ - {ORTH: "دور", LEMMA: "دور", NORM: "دور", TAG: "ADV"}, - {ORTH: "ش", LEMMA: "ش", NORM: "ش", TAG: "NOUN"}, - ], - "دوروبریهاشان": [ - {ORTH: "دوروبریها", LEMMA: "دوروبریها", NORM: "دوروبریها", TAG: "NOUN"}, - {ORTH: "شان", LEMMA: "شان", NORM: "شان", TAG: "NOUN"}, - ], - "دوستانش": [ - {ORTH: "دوستان", LEMMA: "دوستان", NORM: "دوستان", TAG: "NOUN"}, - {ORTH: "ش", LEMMA: "ش", NORM: "ش", TAG: "NOUN"}, - ], - "دوستانشان": [ - {ORTH: "دوستان", LEMMA: "دوستان", NORM: "دوستان", TAG: "NOUN"}, - {ORTH: "شان", LEMMA: "شان", NORM: "شان", TAG: "NOUN"}, - ], - "دوستت": [ - {ORTH: "دوست", LEMMA: "دوست", NORM: "دوست", TAG: "NOUN"}, - {ORTH: "ت", LEMMA: "ت", NORM: "ت", TAG: "NOUN"}, - ], - "دوستش": [ - {ORTH: "دوست", LEMMA: "دوست", NORM: "دوست", TAG: "NOUN"}, - {ORTH: "ش", LEMMA: "ش", NORM: "ش", TAG: "NOUN"}, - ], - "دومش": [ - {ORTH: "دوم", LEMMA: "دوم", NORM: "دوم", TAG: "ADJ"}, - {ORTH: "ش", LEMMA: "ش", NORM: "ش", TAG: "NOUN"}, - ], - "دویدنش": [ - {ORTH: "دویدن", LEMMA: "دویدن", NORM: "دویدن", TAG: "NOUN"}, - {ORTH: "ش", LEMMA: "ش", NORM: "ش", TAG: "NOUN"}, - ], - "دکورهایمان": [ - {ORTH: "دکورهای", LEMMA: "دکورهای", NORM: "دکورهای", TAG: "NOUN"}, - {ORTH: "مان", LEMMA: "مان", NORM: "مان", TAG: "NOUN"}, - ], - "دیدگاهش": [ - {ORTH: "دیدگاه", LEMMA: "دیدگاه", NORM: "دیدگاه", TAG: "NOUN"}, 
- {ORTH: "ش", LEMMA: "ش", NORM: "ش", TAG: "NOUN"}, - ], - "دیرت": [ - {ORTH: "دیر", LEMMA: "دیر", NORM: "دیر", TAG: "ADV"}, - {ORTH: "ت", LEMMA: "ت", NORM: "ت", TAG: "NOUN"}, - ], - "دیرم": [ - {ORTH: "دیر", LEMMA: "دیر", NORM: "دیر", TAG: "ADV"}, - {ORTH: "م", LEMMA: "م", NORM: "م", TAG: "NOUN"}, - ], - "دینت": [ - {ORTH: "دین", LEMMA: "دین", NORM: "دین", TAG: "NOUN"}, - {ORTH: "ت", LEMMA: "ت", NORM: "ت", TAG: "NOUN"}, - ], - "دینش": [ - {ORTH: "دین", LEMMA: "دین", NORM: "دین", TAG: "NOUN"}, - {ORTH: "ش", LEMMA: "ش", NORM: "ش", TAG: "NOUN"}, - ], - "دین‌شان": [ - {ORTH: "دین‌", LEMMA: "دین‌", NORM: "دین‌", TAG: "NOUN"}, - {ORTH: "شان", LEMMA: "شان", NORM: "شان", TAG: "NOUN"}, - ], - "دیواره‌هایش": [ - {ORTH: "دیواره‌های", LEMMA: "دیواره‌های", NORM: "دیواره‌های", TAG: "NOUN"}, - {ORTH: "ش", LEMMA: "ش", NORM: "ش", TAG: "NOUN"}, - ], - "دیوانه‌ای": [ - {ORTH: "دیوانه‌", LEMMA: "دیوانه‌", NORM: "دیوانه‌", TAG: "ADJ"}, - {ORTH: "ای", LEMMA: "ای", NORM: "ای", TAG: "VERB"}, - ], - "دیوی": [ - {ORTH: "دیو", LEMMA: "دیو", NORM: "دیو", TAG: "NOUN"}, - {ORTH: "ی", LEMMA: "ی", NORM: "ی", TAG: "VERB"}, - ], - "دیگرم": [ - {ORTH: "دیگر", LEMMA: "دیگر", NORM: "دیگر", TAG: "ADJ"}, - {ORTH: "م", LEMMA: "م", NORM: "م", TAG: "NOUN"}, - ], - "دیگرمان": [ - {ORTH: "دیگر", LEMMA: "دیگر", NORM: "دیگر", TAG: "ADJ"}, - {ORTH: "مان", LEMMA: "مان", NORM: "مان", TAG: "NOUN"}, - ], - "ذهنش": [ - {ORTH: "ذهن", LEMMA: "ذهن", NORM: "ذهن", TAG: "NOUN"}, - {ORTH: "ش", LEMMA: "ش", NORM: "ش", TAG: "NOUN"}, - ], - "ذهنشان": [ - {ORTH: "ذهن", LEMMA: "ذهن", NORM: "ذهن", TAG: "NOUN"}, - {ORTH: "شان", LEMMA: "شان", NORM: "شان", TAG: "NOUN"}, - ], - "ذهنم": [ - {ORTH: "ذهن", LEMMA: "ذهن", NORM: "ذهن", TAG: "NOUN"}, - {ORTH: "م", LEMMA: "م", NORM: "م", TAG: "NOUN"}, - ], - "رئوسش": [ - {ORTH: "رئوس", LEMMA: "رئوس", NORM: "رئوس", TAG: "NOUN"}, - {ORTH: "ش", LEMMA: "ش", NORM: "ش", TAG: "NOUN"}, - ], - "راهشان": [ - {ORTH: "راه", LEMMA: "راه", NORM: "راه", TAG: "NOUN"}, - {ORTH: "شان", LEMMA: "شان", NORM: "شان", TAG: "NOUN"}, - ], - "راهگشاست": [ - {ORTH: "راهگشا", LEMMA: "راهگشا", NORM: "راهگشا", TAG: "NOUN"}, - {ORTH: "ست", LEMMA: "ست", NORM: "ست", TAG: "VERB"}, - ], - "رایانه‌هایشان": [ - {ORTH: "رایانه‌های", LEMMA: "رایانه‌های", NORM: "رایانه‌های", TAG: "NOUN"}, - {ORTH: "شان", LEMMA: "شان", NORM: "شان", TAG: "NOUN"}, - ], - "رعایتشان": [ - {ORTH: "رعایت", LEMMA: "رعایت", NORM: "رعایت", TAG: "NOUN"}, - {ORTH: "شان", LEMMA: "شان", NORM: "شان", TAG: "NOUN"}, - ], - "رفتارش": [ - {ORTH: "رفتار", LEMMA: "رفتار", NORM: "رفتار", TAG: "NOUN"}, - {ORTH: "ش", LEMMA: "ش", NORM: "ش", TAG: "NOUN"}, - ], - "رفتارشان": [ - {ORTH: "رفتار", LEMMA: "رفتار", NORM: "رفتار", TAG: "NOUN"}, - {ORTH: "شان", LEMMA: "شان", NORM: "شان", TAG: "NOUN"}, - ], - "رفتارمان": [ - {ORTH: "رفتار", LEMMA: "رفتار", NORM: "رفتار", TAG: "NOUN"}, - {ORTH: "مان", LEMMA: "مان", NORM: "مان", TAG: "NOUN"}, - ], - "رفتارهاست": [ - {ORTH: "رفتارها", LEMMA: "رفتارها", NORM: "رفتارها", TAG: "NOUN"}, - {ORTH: "ست", LEMMA: "ست", NORM: "ست", TAG: "VERB"}, - ], - "رفتارهایشان": [ - {ORTH: "رفتارهای", LEMMA: "رفتارهای", NORM: "رفتارهای", TAG: "NOUN"}, - {ORTH: "شان", LEMMA: "شان", NORM: "شان", TAG: "NOUN"}, - ], - "رفقایم": [ - {ORTH: "رفقا", LEMMA: "رفقا", NORM: "رفقا", TAG: "NOUN"}, - {ORTH: "یم", LEMMA: "یم", NORM: "یم", TAG: "NOUN"}, - ], - "رقیق‌ترش": [ - {ORTH: "رقیق‌تر", LEMMA: "رقیق‌تر", NORM: "رقیق‌تر", TAG: "ADJ"}, - {ORTH: "ش", LEMMA: "ش", NORM: "ش", TAG: "NOUN"}, - ], - "رنجند": [ - {ORTH: "رنج", LEMMA: "رنج", NORM: "رنج", TAG: "NOUN"}, - {ORTH: "ند", LEMMA: "ند", NORM: 
"ند", TAG: "VERB"}, - ], - "رهگشاست": [ - {ORTH: "رهگشا", LEMMA: "رهگشا", NORM: "رهگشا", TAG: "ADJ"}, - {ORTH: "ست", LEMMA: "ست", NORM: "ست", TAG: "VERB"}, - ], - "رواست": [ - {ORTH: "روا", LEMMA: "روا", NORM: "روا", TAG: "ADJ"}, - {ORTH: "ست", LEMMA: "ست", NORM: "ست", TAG: "VERB"}, - ], - "روبروست": [ - {ORTH: "روبرو", LEMMA: "روبرو", NORM: "روبرو", TAG: "ADJ"}, - {ORTH: "ست", LEMMA: "ست", NORM: "ست", TAG: "VERB"}, - ], - "روحی‌اش": [ - {ORTH: "روحی‌", LEMMA: "روحی‌", NORM: "روحی‌", TAG: "ADJ"}, - {ORTH: "اش", LEMMA: "اش", NORM: "اش", TAG: "NOUN"}, - ], - "روزنامه‌اش": [ - {ORTH: "روزنامه‌", LEMMA: "روزنامه‌", NORM: "روزنامه‌", TAG: "NOUN"}, - {ORTH: "اش", LEMMA: "اش", NORM: "اش", TAG: "NOUN"}, - ], - "روزه‌ست": [ - {ORTH: "روزه‌", LEMMA: "روزه‌", NORM: "روزه‌", TAG: "NOUN"}, - {ORTH: "ست", LEMMA: "ست", NORM: "ست", TAG: "VERB"}, - ], - "روسری‌اش": [ - {ORTH: "روسری‌", LEMMA: "روسری‌", NORM: "روسری‌", TAG: "NOUN"}, - {ORTH: "اش", LEMMA: "اش", NORM: "اش", TAG: "NOUN"}, - ], - "روشتان": [ - {ORTH: "روش", LEMMA: "روش", NORM: "روش", TAG: "NOUN"}, - {ORTH: "تان", LEMMA: "تان", NORM: "تان", TAG: "NOUN"}, - ], - "رویش": [ - {ORTH: "روی", LEMMA: "روی", NORM: "روی", TAG: "ADP"}, - {ORTH: "ش", LEMMA: "ش", NORM: "ش", TAG: "NOUN"}, - ], - "زبانش": [ - {ORTH: "زبان", LEMMA: "زبان", NORM: "زبان", TAG: "NOUN"}, - {ORTH: "ش", LEMMA: "ش", NORM: "ش", TAG: "NOUN"}, - ], - "زحماتشان": [ - {ORTH: "زحمات", LEMMA: "زحمات", NORM: "زحمات", TAG: "NOUN"}, - {ORTH: "شان", LEMMA: "شان", NORM: "شان", TAG: "NOUN"}, - ], - "زدنهایشان": [ - {ORTH: "زدنهای", LEMMA: "زدنهای", NORM: "زدنهای", TAG: "NOUN"}, - {ORTH: "شان", LEMMA: "شان", NORM: "شان", TAG: "NOUN"}, - ], - "زرنگشان": [ - {ORTH: "زرنگ", LEMMA: "زرنگ", NORM: "زرنگ", TAG: "ADJ"}, - {ORTH: "شان", LEMMA: "شان", NORM: "شان", TAG: "NOUN"}, - ], - "زشتش": [ - {ORTH: "زشت", LEMMA: "زشت", NORM: "زشت", TAG: "ADJ"}, - {ORTH: "ش", LEMMA: "ش", NORM: "ش", TAG: "NOUN"}, - ], - "زشتکارانند": [ - {ORTH: "زشتکاران", LEMMA: "زشتکاران", NORM: "زشتکاران", TAG: "NOUN"}, - {ORTH: "ند", LEMMA: "ند", NORM: "ند", TAG: "VERB"}, - ], - "زلفش": [ - {ORTH: "زلف", LEMMA: "زلف", NORM: "زلف", TAG: "NOUN"}, - {ORTH: "ش", LEMMA: "ش", NORM: "ش", TAG: "NOUN"}, - ], - "زمن": [ - {ORTH: "ز", LEMMA: "ز", NORM: "ز", TAG: "ADP"}, - {ORTH: "من", LEMMA: "من", NORM: "من", TAG: "NOUN"}, - ], - "زنبوری‌اش": [ - {ORTH: "زنبوری‌", LEMMA: "زنبوری‌", NORM: "زنبوری‌", TAG: "ADJ"}, - {ORTH: "اش", LEMMA: "اش", NORM: "اش", TAG: "NOUN"}, - ], - "زندانم": [ - {ORTH: "زندان", LEMMA: "زندان", NORM: "زندان", TAG: "NOUN"}, - {ORTH: "م", LEMMA: "م", NORM: "م", TAG: "NOUN"}, - ], - "زنده‌ام": [ - {ORTH: "زنده‌", LEMMA: "زنده‌", NORM: "زنده‌", TAG: "ADJ"}, - {ORTH: "ام", LEMMA: "ام", NORM: "ام", TAG: "VERB"}, - ], - "زندگانی‌اش": [ - {ORTH: "زندگانی‌", LEMMA: "زندگانی‌", NORM: "زندگانی‌", TAG: "NOUN"}, - {ORTH: "اش", LEMMA: "اش", NORM: "اش", TAG: "NOUN"}, - ], - "زندگی‌اش": [ - {ORTH: "زندگی‌", LEMMA: "زندگی‌", NORM: "زندگی‌", TAG: "NOUN"}, - {ORTH: "اش", LEMMA: "اش", NORM: "اش", TAG: "NOUN"}, - ], - "زندگی‌ام": [ - {ORTH: "زندگی‌", LEMMA: "زندگی‌", NORM: "زندگی‌", TAG: "NOUN"}, - {ORTH: "ام", LEMMA: "ام", NORM: "ام", TAG: "NOUN"}, - ], - "زندگی‌شان": [ - {ORTH: "زندگی‌", LEMMA: "زندگی‌", NORM: "زندگی‌", TAG: "NOUN"}, - {ORTH: "شان", LEMMA: "شان", NORM: "شان", TAG: "NOUN"}, - ], - "زنش": [ - {ORTH: "زن", LEMMA: "زن", NORM: "زن", TAG: "NOUN"}, - {ORTH: "ش", LEMMA: "ش", NORM: "ش", TAG: "NOUN"}, - ], - "زنند": [ - {ORTH: "زن", LEMMA: "زن", NORM: "زن", TAG: "NOUN"}, - {ORTH: "ند", LEMMA: "ند", NORM: "ند", TAG: "VERB"}, - ], - 
"زو": [ - {ORTH: "ز", LEMMA: "ز", NORM: "ز", TAG: "ADP"}, - {ORTH: "و", LEMMA: "و", NORM: "و", TAG: "NOUN"}, - ], - "زیاده": [ - {ORTH: "زیاد", LEMMA: "زیاد", NORM: "زیاد", TAG: "ADJ"}, - {ORTH: "ه", LEMMA: "ه", NORM: "ه", TAG: "VERB"}, - ], - "زیباست": [ - {ORTH: "زیبا", LEMMA: "زیبا", NORM: "زیبا", TAG: "ADJ"}, - {ORTH: "ست", LEMMA: "ست", NORM: "ست", TAG: "VERB"}, - ], - "زیبایش": [ - {ORTH: "زیبای", LEMMA: "زیبای", NORM: "زیبای", TAG: "ADJ"}, - {ORTH: "ش", LEMMA: "ش", NORM: "ش", TAG: "NOUN"}, - ], - "زیبایی": [ - {ORTH: "زیبای", LEMMA: "زیبای", NORM: "زیبای", TAG: "ADJ"}, - {ORTH: "ی", LEMMA: "ی", NORM: "ی", TAG: "VERB"}, - ], - "زیربناست": [ - {ORTH: "زیربنا", LEMMA: "زیربنا", NORM: "زیربنا", TAG: "NOUN"}, - {ORTH: "ست", LEMMA: "ست", NORM: "ست", TAG: "VERB"}, - ], - "زیرک‌اند": [ - {ORTH: "زیرک‌", LEMMA: "زیرک‌", NORM: "زیرک‌", TAG: "ADJ"}, - {ORTH: "اند", LEMMA: "اند", NORM: "اند", TAG: "VERB"}, - ], - "سؤالتان": [ - {ORTH: "سؤال", LEMMA: "سؤال", NORM: "سؤال", TAG: "NOUN"}, - {ORTH: "تان", LEMMA: "تان", NORM: "تان", TAG: "NOUN"}, - ], - "سؤالم": [ - {ORTH: "سؤال", LEMMA: "سؤال", NORM: "سؤال", TAG: "NOUN"}, - {ORTH: "م", LEMMA: "م", NORM: "م", TAG: "NOUN"}, - ], - "سابقه‌اش": [ - {ORTH: "سابقه‌", LEMMA: "سابقه‌", NORM: "سابقه‌", TAG: "NOUN"}, - {ORTH: "اش", LEMMA: "اش", NORM: "اش", TAG: "NOUN"}, - ], - "ساختنم": [ - {ORTH: "ساختن", LEMMA: "ساختن", NORM: "ساختن", TAG: "NOUN"}, - {ORTH: "م", LEMMA: "م", NORM: "م", TAG: "NOUN"}, - ], - "ساده‌اش": [ - {ORTH: "ساده‌", LEMMA: "ساده‌", NORM: "ساده‌", TAG: "ADJ"}, - {ORTH: "اش", LEMMA: "اش", NORM: "اش", TAG: "NOUN"}, - ], - "ساده‌اند": [ - {ORTH: "ساده‌", LEMMA: "ساده‌", NORM: "ساده‌", TAG: "ADJ"}, - {ORTH: "اند", LEMMA: "اند", NORM: "اند", TAG: "VERB"}, - ], - "سازمانش": [ - {ORTH: "سازمان", LEMMA: "سازمان", NORM: "سازمان", TAG: "NOUN"}, - {ORTH: "ش", LEMMA: "ش", NORM: "ش", TAG: "NOUN"}, - ], - "ساعتم": [ - {ORTH: "ساعت", LEMMA: "ساعت", NORM: "ساعت", TAG: "NOUN"}, - {ORTH: "م", LEMMA: "م", NORM: "م", TAG: "NOUN"}, - ], - "سالته": [ - {ORTH: "سال", LEMMA: "سال", NORM: "سال", TAG: "NOUN"}, - {ORTH: "ت", LEMMA: "ت", NORM: "ت", TAG: "NOUN"}, - {ORTH: "ه", LEMMA: "ه", NORM: "ه", TAG: "VERB"}, - ], - "سالش": [ - {ORTH: "سال", LEMMA: "سال", NORM: "سال", TAG: "NOUN"}, - {ORTH: "ش", LEMMA: "ش", NORM: "ش", TAG: "NOUN"}, - ], - "سالهاست": [ - {ORTH: "سالها", LEMMA: "سالها", NORM: "سالها", TAG: "NOUN"}, - {ORTH: "ست", LEMMA: "ست", NORM: "ست", TAG: "VERB"}, - ], - "ساله‌اش": [ - {ORTH: "ساله‌", LEMMA: "ساله‌", NORM: "ساله‌", TAG: "ADJ"}, - {ORTH: "اش", LEMMA: "اش", NORM: "اش", TAG: "NOUN"}, - ], - "ساکتند": [ - {ORTH: "ساکت", LEMMA: "ساکت", NORM: "ساکت", TAG: "ADJ"}, - {ORTH: "ند", LEMMA: "ند", NORM: "ند", TAG: "VERB"}, - ], - "ساکنند": [ - {ORTH: "ساکن", LEMMA: "ساکن", NORM: "ساکن", TAG: "ADJ"}, - {ORTH: "ند", LEMMA: "ند", NORM: "ند", TAG: "VERB"}, - ], - "سبزشان": [ - {ORTH: "سبز", LEMMA: "سبز", NORM: "سبز", TAG: "ADJ"}, - {ORTH: "شان", LEMMA: "شان", NORM: "شان", TAG: "NOUN"}, - ], - "سبیل‌مان": [ - {ORTH: "سبیل‌", LEMMA: "سبیل‌", NORM: "سبیل‌", TAG: "NOUN"}, - {ORTH: "مان", LEMMA: "مان", NORM: "مان", TAG: "NOUN"}, - ], - "ستم‌هایش": [ - {ORTH: "ستم‌های", LEMMA: "ستم‌های", NORM: "ستم‌های", TAG: "NOUN"}, - {ORTH: "ش", LEMMA: "ش", NORM: "ش", TAG: "NOUN"}, - ], - "سخنانش": [ - {ORTH: "سخنان", LEMMA: "سخنان", NORM: "سخنان", TAG: "NOUN"}, - {ORTH: "ش", LEMMA: "ش", NORM: "ش", TAG: "NOUN"}, - ], - "سخنانشان": [ - {ORTH: "سخنان", LEMMA: "سخنان", NORM: "سخنان", TAG: "NOUN"}, - {ORTH: "شان", LEMMA: "شان", NORM: "شان", TAG: "NOUN"}, - ], - "سخنتان": [ - {ORTH: 
"سخن", LEMMA: "سخن", NORM: "سخن", TAG: "NOUN"}, - {ORTH: "تان", LEMMA: "تان", NORM: "تان", TAG: "NOUN"}, - ], - "سخنش": [ - {ORTH: "سخن", LEMMA: "سخن", NORM: "سخن", TAG: "NOUN"}, - {ORTH: "ش", LEMMA: "ش", NORM: "ش", TAG: "NOUN"}, - ], - "سخنم": [ - {ORTH: "سخن", LEMMA: "سخن", NORM: "سخن", TAG: "NOUN"}, - {ORTH: "م", LEMMA: "م", NORM: "م", TAG: "NOUN"}, - ], - "سردش": [ - {ORTH: "سرد", LEMMA: "سرد", NORM: "سرد", TAG: "NOUN"}, - {ORTH: "ش", LEMMA: "ش", NORM: "ش", TAG: "NOUN"}, - ], - "سرزمینشان": [ - {ORTH: "سرزمین", LEMMA: "سرزمین", NORM: "سرزمین", TAG: "NOUN"}, - {ORTH: "شان", LEMMA: "شان", NORM: "شان", TAG: "NOUN"}, - ], - "سرش": [ - {ORTH: "سر", LEMMA: "سر", NORM: "سر", TAG: "NOUN"}, - {ORTH: "ش", LEMMA: "ش", NORM: "ش", TAG: "NOUN"}, - ], - "سرمایه‌دارهاست": [ - { - ORTH: "سرمایه‌دارها", - LEMMA: "سرمایه‌دارها", - NORM: "سرمایه‌دارها", - TAG: "NOUN", - }, - {ORTH: "ست", LEMMA: "ست", NORM: "ست", TAG: "VERB"}, - ], - "سرنوشتش": [ - {ORTH: "سرنوشت", LEMMA: "سرنوشت", NORM: "سرنوشت", TAG: "NOUN"}, - {ORTH: "ش", LEMMA: "ش", NORM: "ش", TAG: "NOUN"}, - ], - "سرنوشتشان": [ - {ORTH: "سرنوشت", LEMMA: "سرنوشت", NORM: "سرنوشت", TAG: "NOUN"}, - {ORTH: "شان", LEMMA: "شان", NORM: "شان", TAG: "NOUN"}, - ], - "سروتهش": [ - {ORTH: "سروته", LEMMA: "سروته", NORM: "سروته", TAG: "NOUN"}, - {ORTH: "ش", LEMMA: "ش", NORM: "ش", TAG: "NOUN"}, - ], - "سرچشمه‌اش": [ - {ORTH: "سرچشمه‌", LEMMA: "سرچشمه‌", NORM: "سرچشمه‌", TAG: "NOUN"}, - {ORTH: "اش", LEMMA: "اش", NORM: "اش", TAG: "NOUN"}, - ], - "سقمش": [ - {ORTH: "سقم", LEMMA: "سقم", NORM: "سقم", TAG: "NOUN"}, - {ORTH: "ش", LEMMA: "ش", NORM: "ش", TAG: "NOUN"}, - ], - "سنش": [ - {ORTH: "سن", LEMMA: "سن", NORM: "سن", TAG: "NOUN"}, - {ORTH: "ش", LEMMA: "ش", NORM: "ش", TAG: "NOUN"}, - ], - "سپاهش": [ - {ORTH: "سپاه", LEMMA: "سپاه", NORM: "سپاه", TAG: "NOUN"}, - {ORTH: "ش", LEMMA: "ش", NORM: "ش", TAG: "NOUN"}, - ], - "سیاسیشان": [ - {ORTH: "سیاسی", LEMMA: "سیاسی", NORM: "سیاسی", TAG: "ADJ"}, - {ORTH: "شان", LEMMA: "شان", NORM: "شان", TAG: "NOUN"}, - ], - "سیاه‌چاله‌هاست": [ - { - ORTH: "سیاه‌چاله‌ها", - LEMMA: "سیاه‌چاله‌ها", - NORM: "سیاه‌چاله‌ها", - TAG: "NOUN", - }, - {ORTH: "ست", LEMMA: "ست", NORM: "ست", TAG: "VERB"}, - ], - "شاخه‌هایشان": [ - {ORTH: "شاخه‌های", LEMMA: "شاخه‌های", NORM: "شاخه‌های", TAG: "NOUN"}, - {ORTH: "شان", LEMMA: "شان", NORM: "شان", TAG: "NOUN"}, - ], - "شالوده‌اش": [ - {ORTH: "شالوده‌", LEMMA: "شالوده‌", NORM: "شالوده‌", TAG: "NOUN"}, - {ORTH: "اش", LEMMA: "اش", NORM: "اش", TAG: "NOUN"}, - ], - "شانه‌هایش": [ - {ORTH: "شانه‌های", LEMMA: "شانه‌های", NORM: "شانه‌های", TAG: "NOUN"}, - {ORTH: "ش", LEMMA: "ش", NORM: "ش", TAG: "NOUN"}, - ], - "شاهدیم": [ - {ORTH: "شاهد", LEMMA: "شاهد", NORM: "شاهد", TAG: "NOUN"}, - {ORTH: "یم", LEMMA: "یم", NORM: "یم", TAG: "VERB"}, - ], - "شاهکارهایش": [ - {ORTH: "شاهکارهای", LEMMA: "شاهکارهای", NORM: "شاهکارهای", TAG: "NOUN"}, - {ORTH: "ش", LEMMA: "ش", NORM: "ش", TAG: "NOUN"}, - ], - "شخصیتش": [ - {ORTH: "شخصیت", LEMMA: "شخصیت", NORM: "شخصیت", TAG: "NOUN"}, - {ORTH: "ش", LEMMA: "ش", NORM: "ش", TAG: "NOUN"}, - ], - "شدنشان": [ - {ORTH: "شدن", LEMMA: "شدن", NORM: "شدن", TAG: "NOUN"}, - {ORTH: "شان", LEMMA: "شان", NORM: "شان", TAG: "NOUN"}, - ], - "شرکتیست": [ - {ORTH: "شرکتی", LEMMA: "شرکتی", NORM: "شرکتی", TAG: "NOUN"}, - {ORTH: "ست", LEMMA: "ست", NORM: "ست", TAG: "VERB"}, - ], - "شعارهاشان": [ - {ORTH: "شعارها", LEMMA: "شعارها", NORM: "شعارها", TAG: "NOUN"}, - {ORTH: "شان", LEMMA: "شان", NORM: "شان", TAG: "NOUN"}, - ], - "شعورش": [ - {ORTH: "شعور", LEMMA: "شعور", NORM: "شعور", TAG: "NOUN"}, - {ORTH: "ش", LEMMA: "ش", 
NORM: "ش", TAG: "NOUN"}, - ], - "شغلش": [ - {ORTH: "شغل", LEMMA: "شغل", NORM: "شغل", TAG: "NOUN"}, - {ORTH: "ش", LEMMA: "ش", NORM: "ش", TAG: "NOUN"}, - ], - "شماست": [ - {ORTH: "شما", LEMMA: "شما", NORM: "شما", TAG: "NOUN"}, - {ORTH: "ست", LEMMA: "ست", NORM: "ست", TAG: "VERB"}, - ], - "شمشیرش": [ - {ORTH: "شمشیر", LEMMA: "شمشیر", NORM: "شمشیر", TAG: "NOUN"}, - {ORTH: "ش", LEMMA: "ش", NORM: "ش", TAG: "NOUN"}, - ], - "شنیدنش": [ - {ORTH: "شنیدن", LEMMA: "شنیدن", NORM: "شنیدن", TAG: "NOUN"}, - {ORTH: "ش", LEMMA: "ش", NORM: "ش", TAG: "NOUN"}, - ], - "شوراست": [ - {ORTH: "شورا", LEMMA: "شورا", NORM: "شورا", TAG: "NOUN"}, - {ORTH: "ست", LEMMA: "ست", NORM: "ست", TAG: "VERB"}, - ], - "شومت": [ - {ORTH: "شوم", LEMMA: "شوم", NORM: "شوم", TAG: "ADJ"}, - {ORTH: "ت", LEMMA: "ت", NORM: "ت", TAG: "NOUN"}, - ], - "شیرینترش": [ - {ORTH: "شیرینتر", LEMMA: "شیرینتر", NORM: "شیرینتر", TAG: "ADJ"}, - {ORTH: "ش", LEMMA: "ش", NORM: "ش", TAG: "NOUN"}, - ], - "شیطان‌اند": [ - {ORTH: "شیطان‌", LEMMA: "شیطان‌", NORM: "شیطان‌", TAG: "NOUN"}, - {ORTH: "اند", LEMMA: "اند", NORM: "اند", TAG: "VERB"}, - ], - "شیوه‌هاست": [ - {ORTH: "شیوه‌ها", LEMMA: "شیوه‌ها", NORM: "شیوه‌ها", TAG: "NOUN"}, - {ORTH: "ست", LEMMA: "ست", NORM: "ست", TAG: "VERB"}, - ], - "صاحبش": [ - {ORTH: "صاحب", LEMMA: "صاحب", NORM: "صاحب", TAG: "NOUN"}, - {ORTH: "ش", LEMMA: "ش", NORM: "ش", TAG: "NOUN"}, - ], - "صحنه‌اش": [ - {ORTH: "صحنه‌", LEMMA: "صحنه‌", NORM: "صحنه‌", TAG: "NOUN"}, - {ORTH: "اش", LEMMA: "اش", NORM: "اش", TAG: "NOUN"}, - ], - "صدایش": [ - {ORTH: "صدای", LEMMA: "صدای", NORM: "صدای", TAG: "NOUN"}, - {ORTH: "ش", LEMMA: "ش", NORM: "ش", TAG: "NOUN"}, - ], - "صددند": [ - {ORTH: "صدد", LEMMA: "صدد", NORM: "صدد", TAG: "NOUN"}, - {ORTH: "ند", LEMMA: "ند", NORM: "ند", TAG: "VERB"}, - ], - "صندوق‌هاست": [ - {ORTH: "صندوق‌ها", LEMMA: "صندوق‌ها", NORM: "صندوق‌ها", TAG: "NOUN"}, - {ORTH: "ست", LEMMA: "ست", NORM: "ست", TAG: "VERB"}, - ], - "صندوق‌هایش": [ - {ORTH: "صندوق‌های", LEMMA: "صندوق‌های", NORM: "صندوق‌های", TAG: "NOUN"}, - {ORTH: "ش", LEMMA: "ش", NORM: "ش", TAG: "NOUN"}, - ], - "صورتش": [ - {ORTH: "صورت", LEMMA: "صورت", NORM: "صورت", TAG: "NOUN"}, - {ORTH: "ش", LEMMA: "ش", NORM: "ش", TAG: "NOUN"}, - ], - "ضروری‌اند": [ - {ORTH: "ضروری‌", LEMMA: "ضروری‌", NORM: "ضروری‌", TAG: "ADJ"}, - {ORTH: "اند", LEMMA: "اند", NORM: "اند", TAG: "VERB"}, - ], - "ضمیرش": [ - {ORTH: "ضمیر", LEMMA: "ضمیر", NORM: "ضمیر", TAG: "NOUN"}, - {ORTH: "ش", LEMMA: "ش", NORM: "ش", TAG: "NOUN"}, - ], - "طرفش": [ - {ORTH: "طرف", LEMMA: "طرف", NORM: "طرف", TAG: "NOUN"}, - {ORTH: "ش", LEMMA: "ش", NORM: "ش", TAG: "NOUN"}, - ], - "طلسمش": [ - {ORTH: "طلسم", LEMMA: "طلسم", NORM: "طلسم", TAG: "NOUN"}, - {ORTH: "ش", LEMMA: "ش", NORM: "ش", TAG: "NOUN"}, - ], - "طوره": [ - {ORTH: "طور", LEMMA: "طور", NORM: "طور", TAG: "NOUN"}, - {ORTH: "ه", LEMMA: "ه", NORM: "ه", TAG: "VERB"}, - ], - "عاشوراست": [ - {ORTH: "عاشورا", LEMMA: "عاشورا", NORM: "عاشورا", TAG: "NOUN"}, - {ORTH: "ست", LEMMA: "ست", NORM: "ست", TAG: "VERB"}, - ], - "عبارتند": [ - {ORTH: "عبارت", LEMMA: "عبارت", NORM: "عبارت", TAG: "NOUN"}, - {ORTH: "ند", LEMMA: "ند", NORM: "ند", TAG: "VERB"}, - ], - "عزیزانتان": [ - {ORTH: "عزیزان", LEMMA: "عزیزان", NORM: "عزیزان", TAG: "NOUN"}, - {ORTH: "تان", LEMMA: "تان", NORM: "تان", TAG: "NOUN"}, - ], - "عزیزانش": [ - {ORTH: "عزیزان", LEMMA: "عزیزان", NORM: "عزیزان", TAG: "NOUN"}, - {ORTH: "ش", LEMMA: "ش", NORM: "ش", TAG: "NOUN"}, - ], - "عزیزش": [ - {ORTH: "عزیز", LEMMA: "عزیز", NORM: "عزیز", TAG: "ADJ"}, - {ORTH: "ش", LEMMA: "ش", NORM: "ش", TAG: "NOUN"}, - ], - "عشرت‌طلبی‌اش": [ - 
{ORTH: "عشرت‌طلبی‌", LEMMA: "عشرت‌طلبی‌", NORM: "عشرت‌طلبی‌", TAG: "NOUN"}, - {ORTH: "اش", LEMMA: "اش", NORM: "اش", TAG: "NOUN"}, - ], - "عقبیم": [ - {ORTH: "عقب", LEMMA: "عقب", NORM: "عقب", TAG: "NOUN"}, - {ORTH: "یم", LEMMA: "یم", NORM: "یم", TAG: "VERB"}, - ], - "علاقه‌اش": [ - {ORTH: "علاقه‌", LEMMA: "علاقه‌", NORM: "علاقه‌", TAG: "NOUN"}, - {ORTH: "اش", LEMMA: "اش", NORM: "اش", TAG: "NOUN"}, - ], - "علمیمان": [ - {ORTH: "علمی", LEMMA: "علمی", NORM: "علمی", TAG: "ADJ"}, - {ORTH: "مان", LEMMA: "مان", NORM: "مان", TAG: "NOUN"}, - ], - "عمرش": [ - {ORTH: "عمر", LEMMA: "عمر", NORM: "عمر", TAG: "NOUN"}, - {ORTH: "ش", LEMMA: "ش", NORM: "ش", TAG: "NOUN"}, - ], - "عمرشان": [ - {ORTH: "عمر", LEMMA: "عمر", NORM: "عمر", TAG: "NOUN"}, - {ORTH: "شان", LEMMA: "شان", NORM: "شان", TAG: "NOUN"}, - ], - "عملش": [ - {ORTH: "عمل", LEMMA: "عمل", NORM: "عمل", TAG: "NOUN"}, - {ORTH: "ش", LEMMA: "ش", NORM: "ش", TAG: "NOUN"}, - ], - "عملی‌اند": [ - {ORTH: "عملی‌", LEMMA: "عملی‌", NORM: "عملی‌", TAG: "ADJ"}, - {ORTH: "اند", LEMMA: "اند", NORM: "اند", TAG: "VERB"}, - ], - "عمویت": [ - {ORTH: "عموی", LEMMA: "عموی", NORM: "عموی", TAG: "NOUN"}, - {ORTH: "ت", LEMMA: "ت", NORM: "ت", TAG: "NOUN"}, - ], - "عمویش": [ - {ORTH: "عموی", LEMMA: "عموی", NORM: "عموی", TAG: "NOUN"}, - {ORTH: "ش", LEMMA: "ش", NORM: "ش", TAG: "NOUN"}, - ], - "عمیقش": [ - {ORTH: "عمیق", LEMMA: "عمیق", NORM: "عمیق", TAG: "ADJ"}, - {ORTH: "ش", LEMMA: "ش", NORM: "ش", TAG: "NOUN"}, - ], - "عواملش": [ - {ORTH: "عوامل", LEMMA: "عوامل", NORM: "عوامل", TAG: "NOUN"}, - {ORTH: "ش", LEMMA: "ش", NORM: "ش", TAG: "NOUN"}, - ], - "عوضشان": [ - {ORTH: "عوض", LEMMA: "عوض", NORM: "عوض", TAG: "NOUN"}, - {ORTH: "شان", LEMMA: "شان", NORM: "شان", TAG: "NOUN"}, - ], - "غذایی‌شان": [ - {ORTH: "غذایی‌", LEMMA: "غذایی‌", NORM: "غذایی‌", TAG: "ADJ"}, - {ORTH: "شان", LEMMA: "شان", NORM: "شان", TAG: "NOUN"}, - ], - "غریبه‌اند": [ - {ORTH: "غریبه‌", LEMMA: "غریبه‌", NORM: "غریبه‌", TAG: "NOUN"}, - {ORTH: "اند", LEMMA: "اند", NORM: "اند", TAG: "VERB"}, - ], - "غلامانش": [ - {ORTH: "غلامان", LEMMA: "غلامان", NORM: "غلامان", TAG: "NOUN"}, - {ORTH: "ش", LEMMA: "ش", NORM: "ش", TAG: "NOUN"}, - ], - "غلطهاست": [ - {ORTH: "غلطها", LEMMA: "غلطها", NORM: "غلطها", TAG: "NOUN"}, - {ORTH: "ست", LEMMA: "ست", NORM: "ست", TAG: "VERB"}, - ], - "فراموشتان": [ - {ORTH: "فراموش", LEMMA: "فراموش", NORM: "فراموش", TAG: "ADJ"}, - {ORTH: "تان", LEMMA: "تان", NORM: "تان", TAG: "NOUN"}, - ], - "فردی‌اند": [ - {ORTH: "فردی‌", LEMMA: "فردی‌", NORM: "فردی‌", TAG: "ADJ"}, - {ORTH: "اند", LEMMA: "اند", NORM: "اند", TAG: "VERB"}, - ], - "فرزندانش": [ - {ORTH: "فرزندان", LEMMA: "فرزندان", NORM: "فرزندان", TAG: "NOUN"}, - {ORTH: "ش", LEMMA: "ش", NORM: "ش", TAG: "NOUN"}, - ], - "فرزندش": [ - {ORTH: "فرزند", LEMMA: "فرزند", NORM: "فرزند", TAG: "NOUN"}, - {ORTH: "ش", LEMMA: "ش", NORM: "ش", TAG: "NOUN"}, - ], - "فرم‌هایش": [ - {ORTH: "فرم‌های", LEMMA: "فرم‌های", NORM: "فرم‌های", TAG: "NOUN"}, - {ORTH: "ش", LEMMA: "ش", NORM: "ش", TAG: "NOUN"}, - ], - "فرهنگی‌مان": [ - {ORTH: "فرهنگی‌", LEMMA: "فرهنگی‌", NORM: "فرهنگی‌", TAG: "ADJ"}, - {ORTH: "مان", LEMMA: "مان", NORM: "مان", TAG: "NOUN"}, - ], - "فریادشان": [ - {ORTH: "فریاد", LEMMA: "فریاد", NORM: "فریاد", TAG: "NOUN"}, - {ORTH: "شان", LEMMA: "شان", NORM: "شان", TAG: "NOUN"}, - ], - "فضایی‌شان": [ - {ORTH: "فضایی‌", LEMMA: "فضایی‌", NORM: "فضایی‌", TAG: "ADJ"}, - {ORTH: "شان", LEMMA: "شان", NORM: "شان", TAG: "NOUN"}, - ], - "فقیرشان": [ - {ORTH: "فقیر", LEMMA: "فقیر", NORM: "فقیر", TAG: "ADJ"}, - {ORTH: "شان", LEMMA: "شان", NORM: "شان", TAG: "NOUN"}, - ], - 
"فوری‌شان": [ - {ORTH: "فوری‌", LEMMA: "فوری‌", NORM: "فوری‌", TAG: "ADJ"}, - {ORTH: "شان", LEMMA: "شان", NORM: "شان", TAG: "NOUN"}, - ], - "قائلند": [ - {ORTH: "قائل", LEMMA: "قائل", NORM: "قائل", TAG: "ADJ"}, - {ORTH: "ند", LEMMA: "ند", NORM: "ند", TAG: "VERB"}, - ], - "قائلیم": [ - {ORTH: "قائل", LEMMA: "قائل", NORM: "قائل", TAG: "ADJ"}, - {ORTH: "یم", LEMMA: "یم", NORM: "یم", TAG: "VERB"}, - ], - "قادرند": [ - {ORTH: "قادر", LEMMA: "قادر", NORM: "قادر", TAG: "ADJ"}, - {ORTH: "ند", LEMMA: "ند", NORM: "ند", TAG: "VERB"}, - ], - "قانونمندش": [ - {ORTH: "قانونمند", LEMMA: "قانونمند", NORM: "قانونمند", TAG: "ADJ"}, - {ORTH: "ش", LEMMA: "ش", NORM: "ش", TAG: "NOUN"}, - ], - "قبلند": [ - {ORTH: "قبل", LEMMA: "قبل", NORM: "قبل", TAG: "ADJ"}, - {ORTH: "ند", LEMMA: "ند", NORM: "ند", TAG: "VERB"}, - ], - "قبلی‌اش": [ - {ORTH: "قبلی‌", LEMMA: "قبلی‌", NORM: "قبلی‌", TAG: "ADJ"}, - {ORTH: "اش", LEMMA: "اش", NORM: "اش", TAG: "NOUN"}, - ], - "قبلی‌مان": [ - {ORTH: "قبلی‌", LEMMA: "قبلی‌", NORM: "قبلی‌", TAG: "ADJ"}, - {ORTH: "مان", LEMMA: "مان", NORM: "مان", TAG: "NOUN"}, - ], - "قدریست": [ - {ORTH: "قدری", LEMMA: "قدری", NORM: "قدری", TAG: "NOUN"}, - {ORTH: "ست", LEMMA: "ست", NORM: "ست", TAG: "VERB"}, - ], - "قدمش": [ - {ORTH: "قدم", LEMMA: "قدم", NORM: "قدم", TAG: "NOUN"}, - {ORTH: "ش", LEMMA: "ش", NORM: "ش", TAG: "NOUN"}, - ], - "قسمتش": [ - {ORTH: "قسمت", LEMMA: "قسمت", NORM: "قسمت", TAG: "NOUN"}, - {ORTH: "ش", LEMMA: "ش", NORM: "ش", TAG: "NOUN"}, - ], - "قضایاست": [ - {ORTH: "قضایا", LEMMA: "قضایا", NORM: "قضایا", TAG: "NOUN"}, - {ORTH: "ست", LEMMA: "ست", NORM: "ست", TAG: "VERB"}, - ], - "قضیه‌شان": [ - {ORTH: "قضیه‌", LEMMA: "قضیه‌", NORM: "قضیه‌", TAG: "NOUN"}, - {ORTH: "شان", LEMMA: "شان", NORM: "شان", TAG: "NOUN"}, - ], - "قهرمانهایشان": [ - {ORTH: "قهرمانهای", LEMMA: "قهرمانهای", NORM: "قهرمانهای", TAG: "NOUN"}, - {ORTH: "شان", LEMMA: "شان", NORM: "شان", TAG: "NOUN"}, - ], - "قهرمانیش": [ - {ORTH: "قهرمانی", LEMMA: "قهرمانی", NORM: "قهرمانی", TAG: "NOUN"}, - {ORTH: "ش", LEMMA: "ش", NORM: "ش", TAG: "NOUN"}, - ], - "قومت": [ - {ORTH: "قوم", LEMMA: "قوم", NORM: "قوم", TAG: "NOUN"}, - {ORTH: "ت", LEMMA: "ت", NORM: "ت", TAG: "NOUN"}, - ], - "لازمه‌اش": [ - {ORTH: "لازمه‌", LEMMA: "لازمه‌", NORM: "لازمه‌", TAG: "NOUN"}, - {ORTH: "اش", LEMMA: "اش", NORM: "اش", TAG: "NOUN"}, - ], - "مأموریتش": [ - {ORTH: "مأموریت", LEMMA: "مأموریت", NORM: "مأموریت", TAG: "NOUN"}, - {ORTH: "ش", LEMMA: "ش", NORM: "ش", TAG: "NOUN"}, - ], - "مأموریتم": [ - {ORTH: "مأموریت", LEMMA: "مأموریت", NORM: "مأموریت", TAG: "NOUN"}, - {ORTH: "م", LEMMA: "م", NORM: "م", TAG: "NOUN"}, - ], - "مأموریت‌اند": [ - {ORTH: "مأموریت‌", LEMMA: "مأموریت‌", NORM: "مأموریت‌", TAG: "NOUN"}, - {ORTH: "اند", LEMMA: "اند", NORM: "اند", TAG: "VERB"}, - ], - "مادرانشان": [ - {ORTH: "مادران", LEMMA: "مادران", NORM: "مادران", TAG: "NOUN"}, - {ORTH: "شان", LEMMA: "شان", NORM: "شان", TAG: "NOUN"}, - ], - "مادرت": [ - {ORTH: "مادر", LEMMA: "مادر", NORM: "مادر", TAG: "NOUN"}, - {ORTH: "ت", LEMMA: "ت", NORM: "ت", TAG: "NOUN"}, - ], - "مادرش": [ - {ORTH: "مادر", LEMMA: "مادر", NORM: "مادر", TAG: "NOUN"}, - {ORTH: "ش", LEMMA: "ش", NORM: "ش", TAG: "NOUN"}, - ], - "مادرم": [ - {ORTH: "مادر", LEMMA: "مادر", NORM: "مادر", TAG: "NOUN"}, - {ORTH: "م", LEMMA: "م", NORM: "م", TAG: "NOUN"}, - ], - "ماست": [ - {ORTH: "ما", LEMMA: "ما", NORM: "ما", TAG: "NOUN"}, - {ORTH: "ست", LEMMA: "ست", NORM: "ست", TAG: "VERB"}, - ], - "مالی‌اش": [ - {ORTH: "مالی‌", LEMMA: "مالی‌", NORM: "مالی‌", TAG: "ADJ"}, - {ORTH: "اش", LEMMA: "اش", NORM: "اش", TAG: "NOUN"}, - ], - "ماهیتش": [ - 
{ORTH: "ماهیت", LEMMA: "ماهیت", NORM: "ماهیت", TAG: "NOUN"}, - {ORTH: "ش", LEMMA: "ش", NORM: "ش", TAG: "NOUN"}, - ], - "مایی": [ - {ORTH: "ما", LEMMA: "ما", NORM: "ما", TAG: "NOUN"}, - {ORTH: "یی", LEMMA: "یی", NORM: "یی", TAG: "VERB"}, - ], - "مجازاتش": [ - {ORTH: "مجازات", LEMMA: "مجازات", NORM: "مجازات", TAG: "NOUN"}, - {ORTH: "ش", LEMMA: "ش", NORM: "ش", TAG: "NOUN"}, - ], - "مجبورند": [ - {ORTH: "مجبور", LEMMA: "مجبور", NORM: "مجبور", TAG: "ADJ"}, - {ORTH: "ند", LEMMA: "ند", NORM: "ند", TAG: "VERB"}, - ], - "محتاجند": [ - {ORTH: "محتاج", LEMMA: "محتاج", NORM: "محتاج", TAG: "ADJ"}, - {ORTH: "ند", LEMMA: "ند", NORM: "ند", TAG: "VERB"}, - ], - "محرمم": [ - {ORTH: "محرم", LEMMA: "محرم", NORM: "محرم", TAG: "NOUN"}, - {ORTH: "م", LEMMA: "م", NORM: "م", TAG: "SCONJ"}, - ], - "محلش": [ - {ORTH: "محل", LEMMA: "محل", NORM: "محل", TAG: "NOUN"}, - {ORTH: "ش", LEMMA: "ش", NORM: "ش", TAG: "NOUN"}, - ], - "مخالفند": [ - {ORTH: "مخالف", LEMMA: "مخالف", NORM: "مخالف", TAG: "ADJ"}, - {ORTH: "ند", LEMMA: "ند", NORM: "ند", TAG: "VERB"}, - ], - "مخدرش": [ - {ORTH: "مخدر", LEMMA: "مخدر", NORM: "مخدر", TAG: "ADJ"}, - {ORTH: "ش", LEMMA: "ش", NORM: "ش", TAG: "NOUN"}, - ], - "مدتهاست": [ - {ORTH: "مدتها", LEMMA: "مدتها", NORM: "مدتها", TAG: "NOUN"}, - {ORTH: "ست", LEMMA: "ست", NORM: "ست", TAG: "VERB"}, - ], - "مدرسه‌ات": [ - {ORTH: "مدرسه", LEMMA: "مدرسه", NORM: "مدرسه", TAG: "NOUN"}, - {ORTH: "‌ات", LEMMA: "ات", NORM: "ات", TAG: "NOUN"}, - ], - "مدرکم": [ - {ORTH: "مدرک", LEMMA: "مدرک", NORM: "مدرک", TAG: "NOUN"}, - {ORTH: "م", LEMMA: "م", NORM: "م", TAG: "NOUN"}, - ], - "مدیرانش": [ - {ORTH: "مدیران", LEMMA: "مدیران", NORM: "مدیران", TAG: "NOUN"}, - {ORTH: "ش", LEMMA: "ش", NORM: "ش", TAG: "NOUN"}, - ], - "مدیونم": [ - {ORTH: "مدیون", LEMMA: "مدیون", NORM: "مدیون", TAG: "ADJ"}, - {ORTH: "م", LEMMA: "م", NORM: "م", TAG: "VERB"}, - ], - "مذهبی‌اند": [ - {ORTH: "مذهبی‌", LEMMA: "مذهبی‌", NORM: "مذهبی‌", TAG: "ADJ"}, - {ORTH: "اند", LEMMA: "اند", NORM: "اند", TAG: "VERB"}, - ], - "مرا": [ - {ORTH: "م", LEMMA: "م", NORM: "م", TAG: "NOUN"}, - {ORTH: "را", LEMMA: "را", NORM: "را", TAG: "PART"}, - ], - "مرادت": [ - {ORTH: "مراد", LEMMA: "مراد", NORM: "مراد", TAG: "NOUN"}, - {ORTH: "ت", LEMMA: "ت", NORM: "ت", TAG: "NOUN"}, - ], - "مردمشان": [ - {ORTH: "مردم", LEMMA: "مردم", NORM: "مردم", TAG: "NOUN"}, - {ORTH: "شان", LEMMA: "شان", NORM: "شان", TAG: "NOUN"}, - ], - "مردمند": [ - {ORTH: "مردم", LEMMA: "مردم", NORM: "مردم", TAG: "NOUN"}, - {ORTH: "ند", LEMMA: "ند", NORM: "ند", TAG: "VERB"}, - ], - "مردم‌اند": [ - {ORTH: "مردم‌", LEMMA: "مردم‌", NORM: "مردم‌", TAG: "NOUN"}, - {ORTH: "اند", LEMMA: "اند", NORM: "اند", TAG: "VERB"}, - ], - "مرزشان": [ - {ORTH: "مرز", LEMMA: "مرز", NORM: "مرز", TAG: "NOUN"}, - {ORTH: "شان", LEMMA: "شان", NORM: "شان", TAG: "NOUN"}, - ], - "مرزهاشان": [ - {ORTH: "مرزها", LEMMA: "مرزها", NORM: "مرزها", TAG: "NOUN"}, - {ORTH: "شان", LEMMA: "شان", NORM: "شان", TAG: "NOUN"}, - ], - "مزدورش": [ - {ORTH: "مزدور", LEMMA: "مزدور", NORM: "مزدور", TAG: "ADJ"}, - {ORTH: "ش", LEMMA: "ش", NORM: "ش", TAG: "NOUN"}, - ], - "مسئولیتش": [ - {ORTH: "مسئولیت", LEMMA: "مسئولیت", NORM: "مسئولیت", TAG: "NOUN"}, - {ORTH: "ش", LEMMA: "ش", NORM: "ش", TAG: "NOUN"}, - ], - "مسائلش": [ - {ORTH: "مسائل", LEMMA: "مسائل", NORM: "مسائل", TAG: "NOUN"}, - {ORTH: "ش", LEMMA: "ش", NORM: "ش", TAG: "NOUN"}, - ], - "مستحضرید": [ - {ORTH: "مستحضر", LEMMA: "مستحضر", NORM: "مستحضر", TAG: "ADJ"}, - {ORTH: "ید", LEMMA: "ید", NORM: "ید", TAG: "VERB"}, - ], - "مسلمانم": [ - {ORTH: "مسلمان", LEMMA: "مسلمان", NORM: "مسلمان", TAG: "NOUN"}, - 
{ORTH: "م", LEMMA: "م", NORM: "م", TAG: "VERB"}, - ], - "مسلمانند": [ - {ORTH: "مسلمان", LEMMA: "مسلمان", NORM: "مسلمان", TAG: "NOUN"}, - {ORTH: "ند", LEMMA: "ند", NORM: "ند", TAG: "VERB"}, - ], - "مشتریانش": [ - {ORTH: "مشتریان", LEMMA: "مشتریان", NORM: "مشتریان", TAG: "NOUN"}, - {ORTH: "ش", LEMMA: "ش", NORM: "ش", TAG: "NOUN"}, - ], - "مشتهایمان": [ - {ORTH: "مشتهای", LEMMA: "مشتهای", NORM: "مشتهای", TAG: "NOUN"}, - {ORTH: "مان", LEMMA: "مان", NORM: "مان", TAG: "NOUN"}, - ], - "مشخصند": [ - {ORTH: "مشخص", LEMMA: "مشخص", NORM: "مشخص", TAG: "ADJ"}, - {ORTH: "ند", LEMMA: "ند", NORM: "ند", TAG: "VERB"}, - ], - "مشغولند": [ - {ORTH: "مشغول", LEMMA: "مشغول", NORM: "مشغول", TAG: "ADJ"}, - {ORTH: "ند", LEMMA: "ند", NORM: "ند", TAG: "VERB"}, - ], - "مشغولیم": [ - {ORTH: "مشغول", LEMMA: "مشغول", NORM: "مشغول", TAG: "ADJ"}, - {ORTH: "یم", LEMMA: "یم", NORM: "یم", TAG: "VERB"}, - ], - "مشهورش": [ - {ORTH: "مشهور", LEMMA: "مشهور", NORM: "مشهور", TAG: "ADJ"}, - {ORTH: "ش", LEMMA: "ش", NORM: "ش", TAG: "NOUN"}, - ], - "مشکلاتشان": [ - {ORTH: "مشکلات", LEMMA: "مشکلات", NORM: "مشکلات", TAG: "NOUN"}, - {ORTH: "شان", LEMMA: "شان", NORM: "شان", TAG: "NOUN"}, - ], - "مشکلم": [ - {ORTH: "مشکل", LEMMA: "مشکل", NORM: "مشکل", TAG: "NOUN"}, - {ORTH: "م", LEMMA: "م", NORM: "م", TAG: "NOUN"}, - ], - "مطمئنم": [ - {ORTH: "مطمئن", LEMMA: "مطمئن", NORM: "مطمئن", TAG: "ADJ"}, - {ORTH: "م", LEMMA: "م", NORM: "م", TAG: "VERB"}, - ], - "معامله‌مان": [ - {ORTH: "معامله‌", LEMMA: "معامله‌", NORM: "معامله‌", TAG: "NOUN"}, - {ORTH: "مان", LEMMA: "مان", NORM: "مان", TAG: "NOUN"}, - ], - "معتقدم": [ - {ORTH: "معتقد", LEMMA: "معتقد", NORM: "معتقد", TAG: "ADJ"}, - {ORTH: "م", LEMMA: "م", NORM: "م", TAG: "VERB"}, - ], - "معتقدند": [ - {ORTH: "معتقد", LEMMA: "معتقد", NORM: "معتقد", TAG: "ADJ"}, - {ORTH: "ند", LEMMA: "ند", NORM: "ند", TAG: "VERB"}, - ], - "معتقدیم": [ - {ORTH: "معتقد", LEMMA: "معتقد", NORM: "معتقد", TAG: "ADJ"}, - {ORTH: "یم", LEMMA: "یم", NORM: "یم", TAG: "VERB"}, - ], - "معرفی‌اش": [ - {ORTH: "معرفی‌", LEMMA: "معرفی‌", NORM: "معرفی‌", TAG: "NOUN"}, - {ORTH: "اش", LEMMA: "اش", NORM: "اش", TAG: "NOUN"}, - ], - "معروفش": [ - {ORTH: "معروف", LEMMA: "معروف", NORM: "معروف", TAG: "ADJ"}, - {ORTH: "ش", LEMMA: "ش", NORM: "ش", TAG: "NOUN"}, - ], - "معضلاتمان": [ - {ORTH: "معضلات", LEMMA: "معضلات", NORM: "معضلات", TAG: "NOUN"}, - {ORTH: "مان", LEMMA: "مان", NORM: "مان", TAG: "NOUN"}, - ], - "معلمش": [ - {ORTH: "معلم", LEMMA: "معلم", NORM: "معلم", TAG: "NOUN"}, - {ORTH: "ش", LEMMA: "ش", NORM: "ش", TAG: "NOUN"}, - ], - "معنایش": [ - {ORTH: "معنای", LEMMA: "معنای", NORM: "معنای", TAG: "NOUN"}, - {ORTH: "ش", LEMMA: "ش", NORM: "ش", TAG: "NOUN"}, - ], - "مغزشان": [ - {ORTH: "مغز", LEMMA: "مغز", NORM: "مغز", TAG: "NOUN"}, - {ORTH: "شان", LEMMA: "شان", NORM: "شان", TAG: "NOUN"}, - ], - "مفیدند": [ - {ORTH: "مفید", LEMMA: "مفید", NORM: "مفید", TAG: "ADJ"}, - {ORTH: "ند", LEMMA: "ند", NORM: "ند", TAG: "VERB"}, - ], - "مقابلش": [ - {ORTH: "مقابل", LEMMA: "مقابل", NORM: "مقابل", TAG: "NOUN"}, - {ORTH: "ش", LEMMA: "ش", NORM: "ش", TAG: "NOUN"}, - ], - "مقاله‌اش": [ - {ORTH: "مقاله‌", LEMMA: "مقاله‌", NORM: "مقاله‌", TAG: "NOUN"}, - {ORTH: "اش", LEMMA: "اش", NORM: "اش", TAG: "NOUN"}, - ], - "مقدمش": [ - {ORTH: "مقدم", LEMMA: "مقدم", NORM: "مقدم", TAG: "NOUN"}, - {ORTH: "ش", LEMMA: "ش", NORM: "ش", TAG: "NOUN"}, - ], - "مقرش": [ - {ORTH: "مقر", LEMMA: "مقر", NORM: "مقر", TAG: "NOUN"}, - {ORTH: "ش", LEMMA: "ش", NORM: "ش", TAG: "NOUN"}, - ], - "مقصدشان": [ - {ORTH: "مقصد", LEMMA: "مقصد", NORM: "مقصد", TAG: "NOUN"}, - {ORTH: "شان", LEMMA: "شان", 
NORM: "شان", TAG: "NOUN"}, - ], - "مقصرند": [ - {ORTH: "مقصر", LEMMA: "مقصر", NORM: "مقصر", TAG: "ADJ"}, - {ORTH: "ند", LEMMA: "ند", NORM: "ند", TAG: "VERB"}, - ], - "مقصودتان": [ - {ORTH: "مقصود", LEMMA: "مقصود", NORM: "مقصود", TAG: "NOUN"}, - {ORTH: "تان", LEMMA: "تان", NORM: "تان", TAG: "NOUN"}, - ], - "ملاقاتهایش": [ - {ORTH: "ملاقاتهای", LEMMA: "ملاقاتهای", NORM: "ملاقاتهای", TAG: "NOUN"}, - {ORTH: "ش", LEMMA: "ش", NORM: "ش", TAG: "NOUN"}, - ], - "ممکنشان": [ - {ORTH: "ممکن", LEMMA: "ممکن", NORM: "ممکن", TAG: "ADJ"}, - {ORTH: "شان", LEMMA: "شان", NORM: "شان", TAG: "NOUN"}, - ], - "ممیزیهاست": [ - {ORTH: "ممیزیها", LEMMA: "ممیزیها", NORM: "ممیزیها", TAG: "NOUN"}, - {ORTH: "ست", LEMMA: "ست", NORM: "ست", TAG: "VERB"}, - ], - "منظورم": [ - {ORTH: "منظور", LEMMA: "منظور", NORM: "منظور", TAG: "NOUN"}, - {ORTH: "م", LEMMA: "م", NORM: "م", TAG: "NOUN"}, - ], - "منی": [ - {ORTH: "من", LEMMA: "من", NORM: "من", TAG: "NOUN"}, - {ORTH: "ی", LEMMA: "ی", NORM: "ی", TAG: "VERB"}, - ], - "منید": [ - {ORTH: "من", LEMMA: "من", NORM: "من", TAG: "NOUN"}, - {ORTH: "ید", LEMMA: "ید", NORM: "ید", TAG: "VERB"}, - ], - "مهربانش": [ - {ORTH: "مهربان", LEMMA: "مهربان", NORM: "مهربان", TAG: "ADJ"}, - {ORTH: "ش", LEMMA: "ش", NORM: "ش", TAG: "NOUN"}, - ], - "مهم‌اند": [ - {ORTH: "مهم‌", LEMMA: "مهم‌", NORM: "مهم‌", TAG: "ADJ"}, - {ORTH: "اند", LEMMA: "اند", NORM: "اند", TAG: "VERB"}, - ], - "مواجهند": [ - {ORTH: "مواجه", LEMMA: "مواجه", NORM: "مواجه", TAG: "ADJ"}, - {ORTH: "ند", LEMMA: "ند", NORM: "ند", TAG: "VERB"}, - ], - "مواجه‌اند": [ - {ORTH: "مواجه‌", LEMMA: "مواجه‌", NORM: "مواجه‌", TAG: "ADJ"}, - {ORTH: "اند", LEMMA: "اند", NORM: "اند", TAG: "VERB"}, - ], - "مواخذه‌ات": [ - {ORTH: "مواخذه", LEMMA: "مواخذه", NORM: "مواخذه", TAG: "NOUN"}, - {ORTH: "‌ات", LEMMA: "ات", NORM: "ات", TAG: "NOUN"}, - ], - "مواضعشان": [ - {ORTH: "مواضع", LEMMA: "مواضع", NORM: "مواضع", TAG: "NOUN"}, - {ORTH: "شان", LEMMA: "شان", NORM: "شان", TAG: "NOUN"}, - ], - "مواضعمان": [ - {ORTH: "مواضع", LEMMA: "مواضع", NORM: "مواضع", TAG: "NOUN"}, - {ORTH: "مان", LEMMA: "مان", NORM: "مان", TAG: "NOUN"}, - ], - "موافقند": [ - {ORTH: "موافق", LEMMA: "موافق", NORM: "موافق", TAG: "ADJ"}, - {ORTH: "ند", LEMMA: "ند", NORM: "ند", TAG: "VERB"}, - ], - "موجوداتش": [ - {ORTH: "موجودات", LEMMA: "موجودات", NORM: "موجودات", TAG: "NOUN"}, - {ORTH: "ش", LEMMA: "ش", NORM: "ش", TAG: "NOUN"}, - ], - "موجودند": [ - {ORTH: "موجود", LEMMA: "موجود", NORM: "موجود", TAG: "ADJ"}, - {ORTH: "ند", LEMMA: "ند", NORM: "ند", TAG: "VERB"}, - ], - "موردش": [ - {ORTH: "مورد", LEMMA: "مورد", NORM: "مورد", TAG: "NOUN"}, - {ORTH: "ش", LEMMA: "ش", NORM: "ش", TAG: "NOUN"}, - ], - "موضعشان": [ - {ORTH: "موضع", LEMMA: "موضع", NORM: "موضع", TAG: "NOUN"}, - {ORTH: "شان", LEMMA: "شان", NORM: "شان", TAG: "NOUN"}, - ], - "موظفند": [ - {ORTH: "موظف", LEMMA: "موظف", NORM: "موظف", TAG: "ADJ"}, - {ORTH: "ند", LEMMA: "ند", NORM: "ند", TAG: "VERB"}, - ], - "موهایش": [ - {ORTH: "موهای", LEMMA: "موهای", NORM: "موهای", TAG: "NOUN"}, - {ORTH: "ش", LEMMA: "ش", NORM: "ش", TAG: "NOUN"}, - ], - "موهایمان": [ - {ORTH: "موهای", LEMMA: "موهای", NORM: "موهای", TAG: "NOUN"}, - {ORTH: "مان", LEMMA: "مان", NORM: "مان", TAG: "NOUN"}, - ], - "مویم": [ - {ORTH: "مو", LEMMA: "مو", NORM: "مو", TAG: "NOUN"}, - {ORTH: "یم", LEMMA: "یم", NORM: "یم", TAG: "NOUN"}, - ], - "ناخرسندند": [ - {ORTH: "ناخرسند", LEMMA: "ناخرسند", NORM: "ناخرسند", TAG: "ADJ"}, - {ORTH: "ند", LEMMA: "ند", NORM: "ند", TAG: "VERB"}, - ], - "ناراحتیش": [ - {ORTH: "ناراحتی", LEMMA: "ناراحتی", NORM: "ناراحتی", TAG: "NOUN"}, - {ORTH: "ش", LEMMA: 
"ش", NORM: "ش", TAG: "NOUN"}, - ], - "ناراضی‌اند": [ - {ORTH: "ناراضی‌", LEMMA: "ناراضی‌", NORM: "ناراضی‌", TAG: "ADJ"}, - {ORTH: "اند", LEMMA: "اند", NORM: "اند", TAG: "VERB"}, - ], - "نارواست": [ - {ORTH: "ناروا", LEMMA: "ناروا", NORM: "ناروا", TAG: "ADJ"}, - {ORTH: "ست", LEMMA: "ست", NORM: "ست", TAG: "VERB"}, - ], - "نازش": [ - {ORTH: "ناز", LEMMA: "ناز", NORM: "ناز", TAG: "ADJ"}, - {ORTH: "ش", LEMMA: "ش", NORM: "ش", TAG: "NOUN"}, - ], - "نامش": [ - {ORTH: "نام", LEMMA: "نام", NORM: "نام", TAG: "NOUN"}, - {ORTH: "ش", LEMMA: "ش", NORM: "ش", TAG: "NOUN"}, - ], - "نامشان": [ - {ORTH: "نام", LEMMA: "نام", NORM: "نام", TAG: "NOUN"}, - {ORTH: "شان", LEMMA: "شان", NORM: "شان", TAG: "NOUN"}, - ], - "نامم": [ - {ORTH: "نام", LEMMA: "نام", NORM: "نام", TAG: "NOUN"}, - {ORTH: "م", LEMMA: "م", NORM: "م", TAG: "NOUN"}, - ], - "نامه‌ات": [ - {ORTH: "نامه", LEMMA: "نامه", NORM: "نامه", TAG: "NOUN"}, - {ORTH: "‌ات", LEMMA: "ات", NORM: "ات", TAG: "NOUN"}, - ], - "نامه‌ام": [ - {ORTH: "نامه‌", LEMMA: "نامه‌", NORM: "نامه‌", TAG: "NOUN"}, - {ORTH: "ام", LEMMA: "ام", NORM: "ام", TAG: "NOUN"}, - ], - "ناچارم": [ - {ORTH: "ناچار", LEMMA: "ناچار", NORM: "ناچار", TAG: "ADJ"}, - {ORTH: "م", LEMMA: "م", NORM: "م", TAG: "VERB"}, - ], - "نخست‌وزیری‌اش": [ - { - ORTH: "نخست‌وزیری‌", - LEMMA: "نخست‌وزیری‌", - NORM: "نخست‌وزیری‌", - TAG: "NOUN", - }, - {ORTH: "اش", LEMMA: "اش", NORM: "اش", TAG: "NOUN"}, - ], - "نزدش": [ - {ORTH: "نزد", LEMMA: "نزد", NORM: "نزد", TAG: "ADP"}, - {ORTH: "ش", LEMMA: "ش", NORM: "ش", TAG: "NOUN"}, - ], - "نشانم": [ - {ORTH: "نشان", LEMMA: "نشان", NORM: "نشان", TAG: "NOUN"}, - {ORTH: "م", LEMMA: "م", NORM: "م", TAG: "NOUN"}, - ], - "نظرات‌شان": [ - {ORTH: "نظرات‌", LEMMA: "نظرات‌", NORM: "نظرات‌", TAG: "NOUN"}, - {ORTH: "شان", LEMMA: "شان", NORM: "شان", TAG: "NOUN"}, - ], - "نظرتان": [ - {ORTH: "نظر", LEMMA: "نظر", NORM: "نظر", TAG: "NOUN"}, - {ORTH: "تان", LEMMA: "تان", NORM: "تان", TAG: "NOUN"}, - ], - "نظرش": [ - {ORTH: "نظر", LEMMA: "نظر", NORM: "نظر", TAG: "NOUN"}, - {ORTH: "ش", LEMMA: "ش", NORM: "ش", TAG: "NOUN"}, - ], - "نظرشان": [ - {ORTH: "نظر", LEMMA: "نظر", NORM: "نظر", TAG: "NOUN"}, - {ORTH: "شان", LEMMA: "شان", NORM: "شان", TAG: "NOUN"}, - ], - "نظرم": [ - {ORTH: "نظر", LEMMA: "نظر", NORM: "نظر", TAG: "NOUN"}, - {ORTH: "م", LEMMA: "م", NORM: "م", TAG: "NOUN"}, - ], - "نظرهایشان": [ - {ORTH: "نظرهای", LEMMA: "نظرهای", NORM: "نظرهای", TAG: "NOUN"}, - {ORTH: "شان", LEMMA: "شان", NORM: "شان", TAG: "NOUN"}, - ], - "نفاقش": [ - {ORTH: "نفاق", LEMMA: "نفاق", NORM: "نفاق", TAG: "NOUN"}, - {ORTH: "ش", LEMMA: "ش", NORM: "ش", TAG: "NOUN"}, - ], - "نفرند": [ - {ORTH: "نفر", LEMMA: "نفر", NORM: "نفر", TAG: "NOUN"}, - {ORTH: "ند", LEMMA: "ند", NORM: "ند", TAG: "VERB"}, - ], - "نفوذیند": [ - {ORTH: "نفوذی", LEMMA: "نفوذی", NORM: "نفوذی", TAG: "ADJ"}, - {ORTH: "ند", LEMMA: "ند", NORM: "ند", TAG: "VERB"}, - ], - "نقطه‌نظراتتان": [ - {ORTH: "نقطه‌نظرات", LEMMA: "نقطه‌نظرات", NORM: "نقطه‌نظرات", TAG: "NOUN"}, - {ORTH: "تان", LEMMA: "تان", NORM: "تان", TAG: "NOUN"}, - ], - "نمایشی‌مان": [ - {ORTH: "نمایشی‌", LEMMA: "نمایشی‌", NORM: "نمایشی‌", TAG: "ADJ"}, - {ORTH: "مان", LEMMA: "مان", NORM: "مان", TAG: "NOUN"}, - ], - "نمایندگی‌شان": [ - {ORTH: "نمایندگی‌", LEMMA: "نمایندگی‌", NORM: "نمایندگی‌", TAG: "NOUN"}, - {ORTH: "شان", LEMMA: "شان", NORM: "شان", TAG: "NOUN"}, - ], - "نمونه‌اش": [ - {ORTH: "نمونه‌", LEMMA: "نمونه‌", NORM: "نمونه‌", TAG: "NOUN"}, - {ORTH: "اش", LEMMA: "اش", NORM: "اش", TAG: "NOUN"}, - ], - "نمی‌پذیرندش": [ - {ORTH: "نمی‌پذیرند", LEMMA: "نمی‌پذیرند", NORM: "نمی‌پذیرند", TAG: 
"VERB"}, - {ORTH: "ش", LEMMA: "ش", NORM: "ش", TAG: "NOUN"}, - ], - "نوآوری‌اش": [ - {ORTH: "نوآوری‌", LEMMA: "نوآوری‌", NORM: "نوآوری‌", TAG: "NOUN"}, - {ORTH: "اش", LEMMA: "اش", NORM: "اش", TAG: "NOUN"}, - ], - "نوشته‌هایشان": [ - {ORTH: "نوشته‌های", LEMMA: "نوشته‌های", NORM: "نوشته‌های", TAG: "NOUN"}, - {ORTH: "شان", LEMMA: "شان", NORM: "شان", TAG: "NOUN"}, - ], - "نوشته‌هایم": [ - {ORTH: "نوشته‌ها", LEMMA: "نوشته‌ها", NORM: "نوشته‌ها", TAG: "NOUN"}, - {ORTH: "یم", LEMMA: "یم", NORM: "یم", TAG: "NOUN"}, - ], - "نکردنشان": [ - {ORTH: "نکردن", LEMMA: "نکردن", NORM: "نکردن", TAG: "NOUN"}, - {ORTH: "شان", LEMMA: "شان", NORM: "شان", TAG: "NOUN"}, - ], - "نگاهداری‌شان": [ - {ORTH: "نگاهداری‌", LEMMA: "نگاهداری‌", NORM: "نگاهداری‌", TAG: "NOUN"}, - {ORTH: "شان", LEMMA: "شان", NORM: "شان", TAG: "NOUN"}, - ], - "نگاهش": [ - {ORTH: "نگاه", LEMMA: "نگاه", NORM: "نگاه", TAG: "NOUN"}, - {ORTH: "ش", LEMMA: "ش", NORM: "ش", TAG: "NOUN"}, - ], - "نگرانم": [ - {ORTH: "نگران", LEMMA: "نگران", NORM: "نگران", TAG: "ADJ"}, - {ORTH: "م", LEMMA: "م", NORM: "م", TAG: "VERB"}, - ], - "نگرشهایشان": [ - {ORTH: "نگرشهای", LEMMA: "نگرشهای", NORM: "نگرشهای", TAG: "NOUN"}, - {ORTH: "شان", LEMMA: "شان", NORM: "شان", TAG: "NOUN"}, - ], - "نیازمندند": [ - {ORTH: "نیازمند", LEMMA: "نیازمند", NORM: "نیازمند", TAG: "ADJ"}, - {ORTH: "ند", LEMMA: "ند", NORM: "ند", TAG: "VERB"}, - ], - "هدفش": [ - {ORTH: "هدف", LEMMA: "هدف", NORM: "هدف", TAG: "NOUN"}, - {ORTH: "ش", LEMMA: "ش", NORM: "ش", TAG: "NOUN"}, - ], - "همانست": [ - {ORTH: "همان", LEMMA: "همان", NORM: "همان", TAG: "NOUN"}, - {ORTH: "ست", LEMMA: "ست", NORM: "ست", TAG: "VERB"}, - ], - "همراهش": [ - {ORTH: "همراه", LEMMA: "همراه", NORM: "همراه", TAG: "NOUN"}, - {ORTH: "ش", LEMMA: "ش", NORM: "ش", TAG: "NOUN"}, - ], - "همسرتان": [ - {ORTH: "همسر", LEMMA: "همسر", NORM: "همسر", TAG: "NOUN"}, - {ORTH: "تان", LEMMA: "تان", NORM: "تان", TAG: "NOUN"}, - ], - "همسرش": [ - {ORTH: "همسر", LEMMA: "همسر", NORM: "همسر", TAG: "NOUN"}, - {ORTH: "ش", LEMMA: "ش", NORM: "ش", TAG: "NOUN"}, - ], - "همسرم": [ - {ORTH: "همسر", LEMMA: "همسر", NORM: "همسر", TAG: "NOUN"}, - {ORTH: "م", LEMMA: "م", NORM: "م", TAG: "NOUN"}, - ], - "همفکرانش": [ - {ORTH: "همفکران", LEMMA: "همفکران", NORM: "همفکران", TAG: "NOUN"}, - {ORTH: "ش", LEMMA: "ش", NORM: "ش", TAG: "NOUN"}, - ], - "همه‌اش": [ - {ORTH: "همه‌", LEMMA: "همه‌", NORM: "همه‌", TAG: "NOUN"}, - {ORTH: "اش", LEMMA: "اش", NORM: "اش", TAG: "NOUN"}, - ], - "همه‌شان": [ - {ORTH: "همه‌", LEMMA: "همه‌", NORM: "همه‌", TAG: "NOUN"}, - {ORTH: "شان", LEMMA: "شان", NORM: "شان", TAG: "NOUN"}, - ], - "همکارانش": [ - {ORTH: "همکاران", LEMMA: "همکاران", NORM: "همکاران", TAG: "NOUN"}, - {ORTH: "ش", LEMMA: "ش", NORM: "ش", TAG: "NOUN"}, - ], - "هم‌نظریم": [ - {ORTH: "هم‌نظر", LEMMA: "هم‌نظر", NORM: "هم‌نظر", TAG: "ADJ"}, - {ORTH: "یم", LEMMA: "یم", NORM: "یم", TAG: "VERB"}, - ], - "هنرش": [ - {ORTH: "هنر", LEMMA: "هنر", NORM: "هنر", TAG: "NOUN"}, - {ORTH: "ش", LEMMA: "ش", NORM: "ش", TAG: "NOUN"}, - ], - "هواست": [ - {ORTH: "هوا", LEMMA: "هوا", NORM: "هوا", TAG: "NOUN"}, - {ORTH: "ست", LEMMA: "ست", NORM: "ست", TAG: "VERB"}, - ], - "هویتش": [ - {ORTH: "هویت", LEMMA: "هویت", NORM: "هویت", TAG: "NOUN"}, - {ORTH: "ش", LEMMA: "ش", NORM: "ش", TAG: "NOUN"}, - ], - "وابسته‌اند": [ - {ORTH: "وابسته‌", LEMMA: "وابسته‌", NORM: "وابسته‌", TAG: "ADJ"}, - {ORTH: "اند", LEMMA: "اند", NORM: "اند", TAG: "VERB"}, - ], - "واقفند": [ - {ORTH: "واقف", LEMMA: "واقف", NORM: "واقف", TAG: "ADJ"}, - {ORTH: "ند", LEMMA: "ند", NORM: "ند", TAG: "VERB"}, - ], - "والدینشان": [ - {ORTH: "والدین", LEMMA: 
"والدین", NORM: "والدین", TAG: "NOUN"}, - {ORTH: "شان", LEMMA: "شان", NORM: "شان", TAG: "NOUN"}, - ], - "وجدان‌تان": [ - {ORTH: "وجدان‌", LEMMA: "وجدان‌", NORM: "وجدان‌", TAG: "NOUN"}, - {ORTH: "تان", LEMMA: "تان", NORM: "تان", TAG: "NOUN"}, - ], - "وجودشان": [ - {ORTH: "وجود", LEMMA: "وجود", NORM: "وجود", TAG: "NOUN"}, - {ORTH: "شان", LEMMA: "شان", NORM: "شان", TAG: "NOUN"}, - ], - "وطنم": [ - {ORTH: "وطن", LEMMA: "وطن", NORM: "وطن", TAG: "NOUN"}, - {ORTH: "م", LEMMA: "م", NORM: "م", TAG: "NOUN"}, - ], - "وعده‌اش": [ - {ORTH: "وعده‌", LEMMA: "وعده‌", NORM: "وعده‌", TAG: "NOUN"}, - {ORTH: "اش", LEMMA: "اش", NORM: "اش", TAG: "NOUN"}, - ], - "وقتمان": [ - {ORTH: "وقت", LEMMA: "وقت", NORM: "وقت", TAG: "NOUN"}, - {ORTH: "مان", LEMMA: "مان", NORM: "مان", TAG: "NOUN"}, - ], - "ولادتش": [ - {ORTH: "ولادت", LEMMA: "ولادت", NORM: "ولادت", TAG: "NOUN"}, - {ORTH: "ش", LEMMA: "ش", NORM: "ش", TAG: "NOUN"}, - ], - "پایانش": [ - {ORTH: "پایان", LEMMA: "پایان", NORM: "پایان", TAG: "NOUN"}, - {ORTH: "ش", LEMMA: "ش", NORM: "ش", TAG: "NOUN"}, - ], - "پایش": [ - {ORTH: "پای", LEMMA: "پای", NORM: "پای", TAG: "NOUN"}, - {ORTH: "ش", LEMMA: "ش", NORM: "ش", TAG: "NOUN"}, - ], - "پایین‌ترند": [ - {ORTH: "پایین‌تر", LEMMA: "پایین‌تر", NORM: "پایین‌تر", TAG: "ADJ"}, - {ORTH: "ند", LEMMA: "ند", NORM: "ند", TAG: "VERB"}, - ], - "پدرت": [ - {ORTH: "پدر", LEMMA: "پدر", NORM: "پدر", TAG: "NOUN"}, - {ORTH: "ت", LEMMA: "ت", NORM: "ت", TAG: "NOUN"}, - ], - "پدرش": [ - {ORTH: "پدر", LEMMA: "پدر", NORM: "پدر", TAG: "NOUN"}, - {ORTH: "ش", LEMMA: "ش", NORM: "ش", TAG: "NOUN"}, - ], - "پدرشان": [ - {ORTH: "پدر", LEMMA: "پدر", NORM: "پدر", TAG: "NOUN"}, - {ORTH: "شان", LEMMA: "شان", NORM: "شان", TAG: "NOUN"}, - ], - "پدرم": [ - {ORTH: "پدر", LEMMA: "پدر", NORM: "پدر", TAG: "NOUN"}, - {ORTH: "م", LEMMA: "م", NORM: "م", TAG: "NOUN"}, - ], - "پربارش": [ - {ORTH: "پربار", LEMMA: "پربار", NORM: "پربار", TAG: "ADJ"}, - {ORTH: "ش", LEMMA: "ش", NORM: "ش", TAG: "NOUN"}, - ], - "پروردگارت": [ - {ORTH: "پروردگار", LEMMA: "پروردگار", NORM: "پروردگار", TAG: "NOUN"}, - {ORTH: "ت", LEMMA: "ت", NORM: "ت", TAG: "NOUN"}, - ], - "پسرتان": [ - {ORTH: "پسر", LEMMA: "پسر", NORM: "پسر", TAG: "NOUN"}, - {ORTH: "تان", LEMMA: "تان", NORM: "تان", TAG: "NOUN"}, - ], - "پسرش": [ - {ORTH: "پسر", LEMMA: "پسر", NORM: "پسر", TAG: "NOUN"}, - {ORTH: "ش", LEMMA: "ش", NORM: "ش", TAG: "NOUN"}, - ], - "پسرعمویش": [ - {ORTH: "پسرعموی", LEMMA: "پسرعموی", NORM: "پسرعموی", TAG: "NOUN"}, - {ORTH: "ش", LEMMA: "ش", NORM: "ش", TAG: "NOUN"}, - ], - "پسر‌عمویت": [ - {ORTH: "پسر‌عموی", LEMMA: "پسر‌عموی", NORM: "پسر‌عموی", TAG: "NOUN"}, - {ORTH: "ت", LEMMA: "ت", NORM: "ت", TAG: "NOUN"}, - ], - "پشتش": [ - {ORTH: "پشت", LEMMA: "پشت", NORM: "پشت", TAG: "NOUN"}, - {ORTH: "ش", LEMMA: "ش", NORM: "ش", TAG: "NOUN"}, - ], - "پشیمونی": [ - {ORTH: "پشیمون", LEMMA: "پشیمون", NORM: "پشیمون", TAG: "ADJ"}, - {ORTH: "ی", LEMMA: "ی", NORM: "ی", TAG: "VERB"}, - ], - "پولش": [ - {ORTH: "پول", LEMMA: "پول", NORM: "پول", TAG: "NOUN"}, - {ORTH: "ش", LEMMA: "ش", NORM: "ش", TAG: "NOUN"}, - ], - "پژوهش‌هایش": [ - {ORTH: "پژوهش‌های", LEMMA: "پژوهش‌های", NORM: "پژوهش‌های", TAG: "NOUN"}, - {ORTH: "ش", LEMMA: "ش", NORM: "ش", TAG: "NOUN"}, - ], - "پیامبرش": [ - {ORTH: "پیامبر", LEMMA: "پیامبر", NORM: "پیامبر", TAG: "NOUN"}, - {ORTH: "ش", LEMMA: "ش", NORM: "ش", TAG: "NOUN"}, - ], - "پیامبری": [ - {ORTH: "پیامبر", LEMMA: "پیامبر", NORM: "پیامبر", TAG: "NOUN"}, - {ORTH: "ی", LEMMA: "ی", NORM: "ی", TAG: "VERB"}, - ], - "پیامش": [ - {ORTH: "پیام", LEMMA: "پیام", NORM: "پیام", TAG: "NOUN"}, - {ORTH: "ش", LEMMA: "ش", 
NORM: "ش", TAG: "NOUN"}, - ], - "پیداست": [ - {ORTH: "پیدا", LEMMA: "پیدا", NORM: "پیدا", TAG: "ADJ"}, - {ORTH: "ست", LEMMA: "ست", NORM: "ست", TAG: "VERB"}, - ], - "پیراهنش": [ - {ORTH: "پیراهن", LEMMA: "پیراهن", NORM: "پیراهن", TAG: "NOUN"}, - {ORTH: "ش", LEMMA: "ش", NORM: "ش", TAG: "NOUN"}, - ], - "پیروانش": [ - {ORTH: "پیروان", LEMMA: "پیروان", NORM: "پیروان", TAG: "NOUN"}, - {ORTH: "ش", LEMMA: "ش", NORM: "ش", TAG: "NOUN"}, - ], - "پیشانی‌اش": [ - {ORTH: "پیشانی‌", LEMMA: "پیشانی‌", NORM: "پیشانی‌", TAG: "NOUN"}, - {ORTH: "اش", LEMMA: "اش", NORM: "اش", TAG: "NOUN"}, - ], - "پیمانت": [ - {ORTH: "پیمان", LEMMA: "پیمان", NORM: "پیمان", TAG: "NOUN"}, - {ORTH: "ت", LEMMA: "ت", NORM: "ت", TAG: "NOUN"}, - ], - "پیوندشان": [ - {ORTH: "پیوند", LEMMA: "پیوند", NORM: "پیوند", TAG: "NOUN"}, - {ORTH: "شان", LEMMA: "شان", NORM: "شان", TAG: "NOUN"}, - ], - "چاپش": [ - {ORTH: "چاپ", LEMMA: "چاپ", NORM: "چاپ", TAG: "NOUN"}, - {ORTH: "ش", LEMMA: "ش", NORM: "ش", TAG: "NOUN"}, - ], - "چت": [ - {ORTH: "چ", LEMMA: "چ", NORM: "چ", TAG: "ADV"}, - {ORTH: "ت", LEMMA: "ت", NORM: "ت", TAG: "NOUN"}, - ], - "چته": [ - {ORTH: "چ", LEMMA: "چ", NORM: "چ", TAG: "ADV"}, - {ORTH: "ت", LEMMA: "ت", NORM: "ت", TAG: "NOUN"}, - {ORTH: "ه", LEMMA: "ه", NORM: "ه", TAG: "VERB"}, - ], - "چرخ‌هایش": [ - {ORTH: "چرخ‌های", LEMMA: "چرخ‌های", NORM: "چرخ‌های", TAG: "NOUN"}, - {ORTH: "ش", LEMMA: "ش", NORM: "ش", TAG: "NOUN"}, - ], - "چشمم": [ - {ORTH: "چشم", LEMMA: "چشم", NORM: "چشم", TAG: "NOUN"}, - {ORTH: "م", LEMMA: "م", NORM: "م", TAG: "NOUN"}, - ], - "چشمهایش": [ - {ORTH: "چشمهای", LEMMA: "چشمهای", NORM: "چشمهای", TAG: "NOUN"}, - {ORTH: "ش", LEMMA: "ش", NORM: "ش", TAG: "NOUN"}, - ], - "چشمهایشان": [ - {ORTH: "چشمهای", LEMMA: "چشمهای", NORM: "چشمهای", TAG: "NOUN"}, - {ORTH: "شان", LEMMA: "شان", NORM: "شان", TAG: "NOUN"}, - ], - "چمنم": [ - {ORTH: "چمن", LEMMA: "چمن", NORM: "چمن", TAG: "NOUN"}, - {ORTH: "م", LEMMA: "م", NORM: "م", TAG: "NOUN"}, - ], - "چهره‌اش": [ - {ORTH: "چهره‌", LEMMA: "چهره‌", NORM: "چهره‌", TAG: "NOUN"}, - {ORTH: "اش", LEMMA: "اش", NORM: "اش", TAG: "NOUN"}, - ], - "چکاره‌اند": [ - {ORTH: "چکاره‌", LEMMA: "چکاره‌", NORM: "چکاره‌", TAG: "ADV"}, - {ORTH: "اند", LEMMA: "اند", NORM: "اند", TAG: "VERB"}, - ], - "چیزهاست": [ - {ORTH: "چیزها", LEMMA: "چیزها", NORM: "چیزها", TAG: "NOUN"}, - {ORTH: "ست", LEMMA: "ست", NORM: "ست", TAG: "VERB"}, - ], - "چیزهایش": [ - {ORTH: "چیزهای", LEMMA: "چیزهای", NORM: "چیزهای", TAG: "NOUN"}, - {ORTH: "ش", LEMMA: "ش", NORM: "ش", TAG: "NOUN"}, - ], - "چیزیست": [ - {ORTH: "چیزی", LEMMA: "چیزی", NORM: "چیزی", TAG: "NOUN"}, - {ORTH: "ست", LEMMA: "ست", NORM: "ست", TAG: "VERB"}, - ], - "چیست": [ - {ORTH: "چی", LEMMA: "چی", NORM: "چی", TAG: "ADV"}, - {ORTH: "ست", LEMMA: "ست", NORM: "ست", TAG: "VERB"}, - ], - "کارش": [ - {ORTH: "کار", LEMMA: "کار", NORM: "کار", TAG: "NOUN"}, - {ORTH: "ش", LEMMA: "ش", NORM: "ش", TAG: "NOUN"}, - ], - "کارشان": [ - {ORTH: "کار", LEMMA: "کار", NORM: "کار", TAG: "NOUN"}, - {ORTH: "شان", LEMMA: "شان", NORM: "شان", TAG: "NOUN"}, - ], - "کارم": [ - {ORTH: "کار", LEMMA: "کار", NORM: "کار", TAG: "NOUN"}, - {ORTH: "م", LEMMA: "م", NORM: "م", TAG: "NOUN"}, - ], - "کارند": [ - {ORTH: "کار", LEMMA: "کار", NORM: "کار", TAG: "NOUN"}, - {ORTH: "ند", LEMMA: "ند", NORM: "ند", TAG: "VERB"}, - ], - "کارهایم": [ - {ORTH: "کارها", LEMMA: "کارها", NORM: "کارها", TAG: "NOUN"}, - {ORTH: "یم", LEMMA: "یم", NORM: "یم", TAG: "NOUN"}, - ], - "کافیست": [ - {ORTH: "کافی", LEMMA: "کافی", NORM: "کافی", TAG: "ADJ"}, - {ORTH: "ست", LEMMA: "ست", NORM: "ست", TAG: "VERB"}, - ], - "کتابخانه‌اش": [ - 
{ORTH: "کتابخانه‌", LEMMA: "کتابخانه‌", NORM: "کتابخانه‌", TAG: "NOUN"}, - {ORTH: "اش", LEMMA: "اش", NORM: "اش", TAG: "NOUN"}, - ], - "کتابش": [ - {ORTH: "کتاب", LEMMA: "کتاب", NORM: "کتاب", TAG: "NOUN"}, - {ORTH: "ش", LEMMA: "ش", NORM: "ش", TAG: "NOUN"}, - ], - "کتابهاشان": [ - {ORTH: "کتابها", LEMMA: "کتابها", NORM: "کتابها", TAG: "NOUN"}, - {ORTH: "شان", LEMMA: "شان", NORM: "شان", TAG: "NOUN"}, - ], - "کجاست": [ - {ORTH: "کجا", LEMMA: "کجا", NORM: "کجا", TAG: "ADV"}, - {ORTH: "ست", LEMMA: "ست", NORM: "ست", TAG: "VERB"}, - ], - "کدورتهایشان": [ - {ORTH: "کدورتهای", LEMMA: "کدورتهای", NORM: "کدورتهای", TAG: "NOUN"}, - {ORTH: "شان", LEMMA: "شان", NORM: "شان", TAG: "NOUN"}, - ], - "کردنش": [ - {ORTH: "کردن", LEMMA: "کردن", NORM: "کردن", TAG: "NOUN"}, - {ORTH: "ش", LEMMA: "ش", NORM: "ش", TAG: "NOUN"}, - ], - "کرم‌خورده‌اش": [ - {ORTH: "کرم‌خورده‌", LEMMA: "کرم‌خورده‌", NORM: "کرم‌خورده‌", TAG: "ADJ"}, - {ORTH: "اش", LEMMA: "اش", NORM: "اش", TAG: "NOUN"}, - ], - "کشش": [ - {ORTH: "کش", LEMMA: "کش", NORM: "کش", TAG: "NOUN"}, - {ORTH: "ش", LEMMA: "ش", NORM: "ش", TAG: "NOUN"}, - ], - "کشورش": [ - {ORTH: "کشور", LEMMA: "کشور", NORM: "کشور", TAG: "NOUN"}, - {ORTH: "ش", LEMMA: "ش", NORM: "ش", TAG: "NOUN"}, - ], - "کشورشان": [ - {ORTH: "کشور", LEMMA: "کشور", NORM: "کشور", TAG: "NOUN"}, - {ORTH: "شان", LEMMA: "شان", NORM: "شان", TAG: "NOUN"}, - ], - "کشورمان": [ - {ORTH: "کشور", LEMMA: "کشور", NORM: "کشور", TAG: "NOUN"}, - {ORTH: "مان", LEMMA: "مان", NORM: "مان", TAG: "NOUN"}, - ], - "کشورهاست": [ - {ORTH: "کشورها", LEMMA: "کشورها", NORM: "کشورها", TAG: "NOUN"}, - {ORTH: "ست", LEMMA: "ست", NORM: "ست", TAG: "VERB"}, - ], - "کلیشه‌هاست": [ - {ORTH: "کلیشه‌ها", LEMMA: "کلیشه‌ها", NORM: "کلیشه‌ها", TAG: "NOUN"}, - {ORTH: "ست", LEMMA: "ست", NORM: "ست", TAG: "VERB"}, - ], - "کمبودهاست": [ - {ORTH: "کمبودها", LEMMA: "کمبودها", NORM: "کمبودها", TAG: "NOUN"}, - {ORTH: "ست", LEMMA: "ست", NORM: "ست", TAG: "VERB"}, - ], - "کمتره": [ - {ORTH: "کمتر", LEMMA: "کمتر", NORM: "کمتر", TAG: "ADJ"}, - {ORTH: "ه", LEMMA: "ه", NORM: "ه", TAG: "VERB"}, - ], - "کمکم": [ - {ORTH: "کمک", LEMMA: "کمک", NORM: "کمک", TAG: "NOUN"}, - {ORTH: "م", LEMMA: "م", NORM: "م", TAG: "NOUN"}, - ], - "کنارش": [ - {ORTH: "کنار", LEMMA: "کنار", NORM: "کنار", TAG: "ADP"}, - {ORTH: "ش", LEMMA: "ش", NORM: "ش", TAG: "NOUN"}, - ], - "کودکانشان": [ - {ORTH: "کودکان", LEMMA: "کودکان", NORM: "کودکان", TAG: "NOUN"}, - {ORTH: "شان", LEMMA: "شان", NORM: "شان", TAG: "NOUN"}, - ], - "کوچکش": [ - {ORTH: "کوچک", LEMMA: "کوچک", NORM: "کوچک", TAG: "ADJ"}, - {ORTH: "ش", LEMMA: "ش", NORM: "ش", TAG: "NOUN"}, - ], - "کیست": [ - {ORTH: "کی", LEMMA: "کی", NORM: "کی", TAG: "NOUN"}, - {ORTH: "ست", LEMMA: "ست", NORM: "ست", TAG: "VERB"}, - ], - "کیفش": [ - {ORTH: "کیف", LEMMA: "کیف", NORM: "کیف", TAG: "NOUN"}, - {ORTH: "ش", LEMMA: "ش", NORM: "ش", TAG: "NOUN"}, - ], - "گذشته‌اند": [ - {ORTH: "گذشته‌", LEMMA: "گذشته‌", NORM: "گذشته‌", TAG: "ADJ"}, - {ORTH: "اند", LEMMA: "اند", NORM: "اند", TAG: "VERB"}, - ], - "گرانقدرش": [ - {ORTH: "گرانقدر", LEMMA: "گرانقدر", NORM: "گرانقدر", TAG: "ADJ"}, - {ORTH: "ش", LEMMA: "ش", NORM: "ش", TAG: "NOUN"}, - ], - "گرانقدرشان": [ - {ORTH: "گرانقدر", LEMMA: "گرانقدر", NORM: "گرانقدر", TAG: "ADJ"}, - {ORTH: "شان", LEMMA: "شان", NORM: "شان", TAG: "NOUN"}, - ], - "گردنتان": [ - {ORTH: "گردن", LEMMA: "گردن", NORM: "گردن", TAG: "NOUN"}, - {ORTH: "تان", LEMMA: "تان", NORM: "تان", TAG: "NOUN"}, - ], - "گردنش": [ - {ORTH: "گردن", LEMMA: "گردن", NORM: "گردن", TAG: "NOUN"}, - {ORTH: "ش", LEMMA: "ش", NORM: "ش", TAG: "NOUN"}, - ], - "گرفتارند": [ - 
{ORTH: "گرفتار", LEMMA: "گرفتار", NORM: "گرفتار", TAG: "ADJ"}, - {ORTH: "ند", LEMMA: "ند", NORM: "ند", TAG: "VERB"}, - ], - "گرفتنت": [ - {ORTH: "گرفتن", LEMMA: "گرفتن", NORM: "گرفتن", TAG: "NOUN"}, - {ORTH: "ت", LEMMA: "ت", NORM: "ت", TAG: "NOUN"}, - ], - "گروهند": [ - {ORTH: "گروه", LEMMA: "گروه", NORM: "گروه", TAG: "NOUN"}, - {ORTH: "ند", LEMMA: "ند", NORM: "ند", TAG: "VERB"}, - ], - "گروگانهایش": [ - {ORTH: "گروگانهای", LEMMA: "گروگانهای", NORM: "گروگانهای", TAG: "NOUN"}, - {ORTH: "ش", LEMMA: "ش", NORM: "ش", TAG: "NOUN"}, - ], - "گریمش": [ - {ORTH: "گریم", LEMMA: "گریم", NORM: "گریم", TAG: "NOUN"}, - {ORTH: "ش", LEMMA: "ش", NORM: "ش", TAG: "NOUN"}, - ], - "گفتارمان": [ - {ORTH: "گفتار", LEMMA: "گفتار", NORM: "گفتار", TAG: "NOUN"}, - {ORTH: "مان", LEMMA: "مان", NORM: "مان", TAG: "NOUN"}, - ], - "گلهایش": [ - {ORTH: "گلهای", LEMMA: "گلهای", NORM: "گلهای", TAG: "NOUN"}, - {ORTH: "ش", LEMMA: "ش", NORM: "ش", TAG: "NOUN"}, - ], - "گلویش": [ - {ORTH: "گلوی", LEMMA: "گلوی", NORM: "گلوی", TAG: "NOUN"}, - {ORTH: "ش", LEMMA: "ش", NORM: "ش", TAG: "NOUN"}, - ], - "گناهت": [ - {ORTH: "گناه", LEMMA: "گناه", NORM: "گناه", TAG: "NOUN"}, - {ORTH: "ت", LEMMA: "ت", NORM: "ت", TAG: "NOUN"}, - ], - "گوشش": [ - {ORTH: "گوش", LEMMA: "گوش", NORM: "گوش", TAG: "NOUN"}, - {ORTH: "ش", LEMMA: "ش", NORM: "ش", TAG: "NOUN"}, - ], - "گوشم": [ - {ORTH: "گوش", LEMMA: "گوش", NORM: "گوش", TAG: "NOUN"}, - {ORTH: "م", LEMMA: "م", NORM: "م", TAG: "NOUN"}, - ], - "گولش": [ - {ORTH: "گول", LEMMA: "گول", NORM: "گول", TAG: "NOUN"}, - {ORTH: "ش", LEMMA: "ش", NORM: "ش", TAG: "NOUN"}, - ], - "یادتان": [ - {ORTH: "یاد", LEMMA: "یاد", NORM: "یاد", TAG: "NOUN"}, - {ORTH: "تان", LEMMA: "تان", NORM: "تان", TAG: "NOUN"}, - ], - "یادم": [ - {ORTH: "یاد", LEMMA: "یاد", NORM: "یاد", TAG: "NOUN"}, - {ORTH: "م", LEMMA: "م", NORM: "م", TAG: "NOUN"}, - ], - "یادمان": [ - {ORTH: "یاد", LEMMA: "یاد", NORM: "یاد", TAG: "NOUN"}, - {ORTH: "مان", LEMMA: "مان", NORM: "مان", TAG: "NOUN"}, - ], - "یارانش": [ - {ORTH: "یاران", LEMMA: "یاران", NORM: "یاران", TAG: "NOUN"}, - {ORTH: "ش", LEMMA: "ش", NORM: "ش", TAG: "NOUN"}, - ], - } -) -TOKENIZER_EXCEPTIONS = _exc diff --git a/spacy/lang/fi/tokenizer_exceptions.py b/spacy/lang/fi/tokenizer_exceptions.py index faaf609f9..22d710cb0 100644 --- a/spacy/lang/fi/tokenizer_exceptions.py +++ b/spacy/lang/fi/tokenizer_exceptions.py @@ -1,5 +1,5 @@ from ..tokenizer_exceptions import BASE_EXCEPTIONS -from ...symbols import ORTH, LEMMA +from ...symbols import ORTH from ...util import update_exc @@ -8,74 +8,74 @@ _exc = {} # Source https://www.cs.tut.fi/~jkorpela/kielenopas/5.5.html for exc_data in [ - {ORTH: "aik.", LEMMA: "aikaisempi"}, - {ORTH: "alk.", LEMMA: "alkaen"}, - {ORTH: "alv.", LEMMA: "arvonlisävero"}, - {ORTH: "ark.", LEMMA: "arkisin"}, - {ORTH: "as.", LEMMA: "asunto"}, - {ORTH: "eaa.", LEMMA: "ennen ajanlaskun alkua"}, - {ORTH: "ed.", LEMMA: "edellinen"}, - {ORTH: "esim.", LEMMA: "esimerkki"}, - {ORTH: "huom.", LEMMA: "huomautus"}, - {ORTH: "jne.", LEMMA: "ja niin edelleen"}, - {ORTH: "joht.", LEMMA: "johtaja"}, - {ORTH: "k.", LEMMA: "kuollut"}, - {ORTH: "ks.", LEMMA: "katso"}, - {ORTH: "lk.", LEMMA: "luokka"}, - {ORTH: "lkm.", LEMMA: "lukumäärä"}, - {ORTH: "lyh.", LEMMA: "lyhenne"}, - {ORTH: "läh.", LEMMA: "lähettäjä"}, - {ORTH: "miel.", LEMMA: "mieluummin"}, - {ORTH: "milj.", LEMMA: "miljoona"}, - {ORTH: "Mm.", LEMMA: "muun muassa"}, - {ORTH: "mm.", LEMMA: "muun muassa"}, - {ORTH: "myöh.", LEMMA: "myöhempi"}, - {ORTH: "n.", LEMMA: "noin"}, - {ORTH: "nimim.", LEMMA: "nimimerkki"}, - {ORTH: "n:o", LEMMA: 
"numero"}, - {ORTH: "N:o", LEMMA: "numero"}, - {ORTH: "nro", LEMMA: "numero"}, - {ORTH: "ns.", LEMMA: "niin sanottu"}, - {ORTH: "nyk.", LEMMA: "nykyinen"}, - {ORTH: "oik.", LEMMA: "oikealla"}, - {ORTH: "os.", LEMMA: "osoite"}, - {ORTH: "p.", LEMMA: "päivä"}, - {ORTH: "par.", LEMMA: "paremmin"}, - {ORTH: "per.", LEMMA: "perustettu"}, - {ORTH: "pj.", LEMMA: "puheenjohtaja"}, - {ORTH: "puh.joht.", LEMMA: "puheenjohtaja"}, - {ORTH: "prof.", LEMMA: "professori"}, - {ORTH: "puh.", LEMMA: "puhelin"}, - {ORTH: "pvm.", LEMMA: "päivämäärä"}, - {ORTH: "rak.", LEMMA: "rakennettu"}, - {ORTH: "ry.", LEMMA: "rekisteröity yhdistys"}, - {ORTH: "s.", LEMMA: "sivu"}, - {ORTH: "siht.", LEMMA: "sihteeri"}, - {ORTH: "synt.", LEMMA: "syntynyt"}, - {ORTH: "t.", LEMMA: "toivoo"}, - {ORTH: "tark.", LEMMA: "tarkastanut"}, - {ORTH: "til.", LEMMA: "tilattu"}, - {ORTH: "tms.", LEMMA: "tai muuta sellaista"}, - {ORTH: "toim.", LEMMA: "toimittanut"}, - {ORTH: "v.", LEMMA: "vuosi"}, - {ORTH: "vas.", LEMMA: "vasen"}, - {ORTH: "vast.", LEMMA: "vastaus"}, - {ORTH: "vrt.", LEMMA: "vertaa"}, - {ORTH: "yht.", LEMMA: "yhteensä"}, - {ORTH: "yl.", LEMMA: "yleinen"}, - {ORTH: "ym.", LEMMA: "ynnä muuta"}, - {ORTH: "yms.", LEMMA: "ynnä muuta sellaista"}, - {ORTH: "yo.", LEMMA: "ylioppilas"}, - {ORTH: "yliopp.", LEMMA: "ylioppilas"}, - {ORTH: "ao.", LEMMA: "asianomainen"}, - {ORTH: "em.", LEMMA: "edellä mainittu"}, - {ORTH: "ko.", LEMMA: "kyseessä oleva"}, - {ORTH: "ml.", LEMMA: "mukaan luettuna"}, - {ORTH: "po.", LEMMA: "puheena oleva"}, - {ORTH: "so.", LEMMA: "se on"}, - {ORTH: "ts.", LEMMA: "toisin sanoen"}, - {ORTH: "vm.", LEMMA: "viimeksi mainittu"}, - {ORTH: "srk.", LEMMA: "seurakunta"}, + {ORTH: "aik."}, + {ORTH: "alk."}, + {ORTH: "alv."}, + {ORTH: "ark."}, + {ORTH: "as."}, + {ORTH: "eaa."}, + {ORTH: "ed."}, + {ORTH: "esim."}, + {ORTH: "huom."}, + {ORTH: "jne."}, + {ORTH: "joht."}, + {ORTH: "k."}, + {ORTH: "ks."}, + {ORTH: "lk."}, + {ORTH: "lkm."}, + {ORTH: "lyh."}, + {ORTH: "läh."}, + {ORTH: "miel."}, + {ORTH: "milj."}, + {ORTH: "Mm."}, + {ORTH: "mm."}, + {ORTH: "myöh."}, + {ORTH: "n."}, + {ORTH: "nimim."}, + {ORTH: "n:o"}, + {ORTH: "N:o"}, + {ORTH: "nro"}, + {ORTH: "ns."}, + {ORTH: "nyk."}, + {ORTH: "oik."}, + {ORTH: "os."}, + {ORTH: "p."}, + {ORTH: "par."}, + {ORTH: "per."}, + {ORTH: "pj."}, + {ORTH: "puh.joht."}, + {ORTH: "prof."}, + {ORTH: "puh."}, + {ORTH: "pvm."}, + {ORTH: "rak."}, + {ORTH: "ry."}, + {ORTH: "s."}, + {ORTH: "siht."}, + {ORTH: "synt."}, + {ORTH: "t."}, + {ORTH: "tark."}, + {ORTH: "til."}, + {ORTH: "tms."}, + {ORTH: "toim."}, + {ORTH: "v."}, + {ORTH: "vas."}, + {ORTH: "vast."}, + {ORTH: "vrt."}, + {ORTH: "yht."}, + {ORTH: "yl."}, + {ORTH: "ym."}, + {ORTH: "yms."}, + {ORTH: "yo."}, + {ORTH: "yliopp."}, + {ORTH: "ao."}, + {ORTH: "em."}, + {ORTH: "ko."}, + {ORTH: "ml."}, + {ORTH: "po."}, + {ORTH: "so."}, + {ORTH: "ts."}, + {ORTH: "vm."}, + {ORTH: "srk."}, ]: _exc[exc_data[ORTH]] = [exc_data] diff --git a/spacy/lang/fr/tokenizer_exceptions.py b/spacy/lang/fr/tokenizer_exceptions.py index a1ad7bcbb..6f429eecc 100644 --- a/spacy/lang/fr/tokenizer_exceptions.py +++ b/spacy/lang/fr/tokenizer_exceptions.py @@ -3,7 +3,7 @@ import re from ..tokenizer_exceptions import BASE_EXCEPTIONS from .punctuation import ELISION, HYPHENS from ..char_classes import ALPHA_LOWER, ALPHA -from ...symbols import ORTH, LEMMA +from ...symbols import ORTH from ...util import update_exc @@ -28,29 +28,29 @@ def lower_first_letter(text): return text[0].lower() + text[1:] -_exc = {"J.-C.": [{LEMMA: "Jésus", ORTH: "J."}, {LEMMA: "Christ", ORTH: 
"-C."}]} +_exc = {"J.-C.": [{ORTH: "J."}, {ORTH: "-C."}]} for exc_data in [ - {LEMMA: "avant", ORTH: "av."}, - {LEMMA: "janvier", ORTH: "janv."}, - {LEMMA: "février", ORTH: "févr."}, - {LEMMA: "avril", ORTH: "avr."}, - {LEMMA: "juillet", ORTH: "juill."}, - {LEMMA: "septembre", ORTH: "sept."}, - {LEMMA: "octobre", ORTH: "oct."}, - {LEMMA: "novembre", ORTH: "nov."}, - {LEMMA: "décembre", ORTH: "déc."}, - {LEMMA: "après", ORTH: "apr."}, - {LEMMA: "docteur", ORTH: "Dr."}, - {LEMMA: "monsieur", ORTH: "M."}, - {LEMMA: "monsieur", ORTH: "Mr."}, - {LEMMA: "madame", ORTH: "Mme."}, - {LEMMA: "mademoiselle", ORTH: "Mlle."}, - {LEMMA: "numéro", ORTH: "n°"}, - {LEMMA: "degrés", ORTH: "d°"}, - {LEMMA: "saint", ORTH: "St."}, - {LEMMA: "sainte", ORTH: "Ste."}, + {ORTH: "av."}, + {ORTH: "janv."}, + {ORTH: "févr."}, + {ORTH: "avr."}, + {ORTH: "juill."}, + {ORTH: "sept."}, + {ORTH: "oct."}, + {ORTH: "nov."}, + {ORTH: "déc."}, + {ORTH: "apr."}, + {ORTH: "Dr."}, + {ORTH: "M."}, + {ORTH: "Mr."}, + {ORTH: "Mme."}, + {ORTH: "Mlle."}, + {ORTH: "n°"}, + {ORTH: "d°"}, + {ORTH: "St."}, + {ORTH: "Ste."}, ]: _exc[exc_data[ORTH]] = [exc_data] @@ -80,55 +80,37 @@ for orth in [ _exc[orth] = [{ORTH: orth}] -for verb, verb_lemma in [ - ("a", "avoir"), - ("est", "être"), - ("semble", "sembler"), - ("indique", "indiquer"), - ("moque", "moquer"), - ("passe", "passer"), +for verb in [ + "a", + "est" "semble", + "indique", + "moque", + "passe", ]: for orth in [verb, verb.title()]: for pronoun in ["elle", "il", "on"]: token = f"{orth}-t-{pronoun}" - _exc[token] = [ - {LEMMA: verb_lemma, ORTH: orth}, # , TAG: "VERB"}, - {LEMMA: "t", ORTH: "-t"}, - {LEMMA: pronoun, ORTH: "-" + pronoun}, - ] + _exc[token] = [{ORTH: orth}, {ORTH: "-t"}, {ORTH: "-" + pronoun}] -for verb, verb_lemma in [("est", "être")]: +for verb in ["est"]: for orth in [verb, verb.title()]: - token = f"{orth}-ce" - _exc[token] = [ - {LEMMA: verb_lemma, ORTH: orth}, # , TAG: "VERB"}, - {LEMMA: "ce", ORTH: "-ce"}, - ] + _exc[f"{orth}-ce"] = [{ORTH: orth}, {ORTH: "-ce"}] -for pre, pre_lemma in [("qu'", "que"), ("n'", "ne")]: +for pre in ["qu'", "n'"]: for orth in [pre, pre.title()]: - _exc[f"{orth}est-ce"] = [ - {LEMMA: pre_lemma, ORTH: orth}, - {LEMMA: "être", ORTH: "est"}, - {LEMMA: "ce", ORTH: "-ce"}, - ] + _exc[f"{orth}est-ce"] = [{ORTH: orth}, {ORTH: "est"}, {ORTH: "-ce"}] for verb, pronoun in [("est", "il"), ("EST", "IL")]: - token = "{}-{}".format(verb, pronoun) - _exc[token] = [ - {LEMMA: "être", ORTH: verb}, - {LEMMA: pronoun, ORTH: "-" + pronoun}, - ] + _exc[f"{verb}-{pronoun}"] = [{ORTH: verb}, {ORTH: "-" + pronoun}] for s, verb, pronoun in [("s", "est", "il"), ("S", "EST", "IL")]: - token = "{}'{}-{}".format(s, verb, pronoun) - _exc[token] = [ - {LEMMA: "se", ORTH: s + "'"}, - {LEMMA: "être", ORTH: verb}, - {LEMMA: pronoun, ORTH: "-" + pronoun}, + _exc[f"{s}'{verb}-{pronoun}"] = [ + {ORTH: s + "'"}, + {ORTH: verb}, + {ORTH: "-" + pronoun}, ] diff --git a/spacy/lang/ga/tokenizer_exceptions.py b/spacy/lang/ga/tokenizer_exceptions.py index fbd6fa0f5..abf49c511 100644 --- a/spacy/lang/ga/tokenizer_exceptions.py +++ b/spacy/lang/ga/tokenizer_exceptions.py @@ -1,81 +1,65 @@ from ..tokenizer_exceptions import BASE_EXCEPTIONS -from ...symbols import POS, DET, ADP, CCONJ, ADV, NOUN, X, AUX -from ...symbols import ORTH, LEMMA, NORM +from ...symbols import ORTH, NORM from ...util import update_exc _exc = { - "'acha'n": [ - {ORTH: "'ach", LEMMA: "gach", NORM: "gach", POS: DET}, - {ORTH: "a'n", LEMMA: "aon", NORM: "aon", POS: DET}, - ], - "dem'": [ - {ORTH: "de", LEMMA: 
"de", NORM: "de", POS: ADP}, - {ORTH: "m'", LEMMA: "mo", NORM: "mo", POS: DET}, - ], - "ded'": [ - {ORTH: "de", LEMMA: "de", NORM: "de", POS: ADP}, - {ORTH: "d'", LEMMA: "do", NORM: "do", POS: DET}, - ], - "lem'": [ - {ORTH: "le", LEMMA: "le", NORM: "le", POS: ADP}, - {ORTH: "m'", LEMMA: "mo", NORM: "mo", POS: DET}, - ], - "led'": [ - {ORTH: "le", LEMMA: "le", NORM: "le", POS: ADP}, - {ORTH: "d'", LEMMA: "mo", NORM: "do", POS: DET}, - ], + "'acha'n": [{ORTH: "'ach", NORM: "gach"}, {ORTH: "a'n", NORM: "aon"}], + "dem'": [{ORTH: "de", NORM: "de"}, {ORTH: "m'", NORM: "mo"}], + "ded'": [{ORTH: "de", NORM: "de"}, {ORTH: "d'", NORM: "do"}], + "lem'": [{ORTH: "le", NORM: "le"}, {ORTH: "m'", NORM: "mo"}], + "led'": [{ORTH: "le", NORM: "le"}, {ORTH: "d'", NORM: "do"}], } for exc_data in [ - {ORTH: "'gus", LEMMA: "agus", NORM: "agus", POS: CCONJ}, - {ORTH: "'ach", LEMMA: "gach", NORM: "gach", POS: DET}, - {ORTH: "ao'", LEMMA: "aon", NORM: "aon"}, - {ORTH: "'niar", LEMMA: "aniar", NORM: "aniar", POS: ADV}, - {ORTH: "'níos", LEMMA: "aníos", NORM: "aníos", POS: ADV}, - {ORTH: "'ndiu", LEMMA: "inniu", NORM: "inniu", POS: ADV}, - {ORTH: "'nocht", LEMMA: "anocht", NORM: "anocht", POS: ADV}, - {ORTH: "m'", LEMMA: "mo", POS: DET}, - {ORTH: "Aib.", LEMMA: "Aibreán", POS: NOUN}, - {ORTH: "Ath.", LEMMA: "athair", POS: NOUN}, - {ORTH: "Beal.", LEMMA: "Bealtaine", POS: NOUN}, - {ORTH: "a.C.n.", LEMMA: "ante Christum natum", POS: X}, - {ORTH: "m.sh.", LEMMA: "mar shampla", POS: ADV}, - {ORTH: "M.F.", LEMMA: "Meán Fómhair", POS: NOUN}, - {ORTH: "M.Fómh.", LEMMA: "Meán Fómhair", POS: NOUN}, - {ORTH: "D.F.", LEMMA: "Deireadh Fómhair", POS: NOUN}, - {ORTH: "D.Fómh.", LEMMA: "Deireadh Fómhair", POS: NOUN}, - {ORTH: "r.C.", LEMMA: "roimh Chríost", POS: ADV}, - {ORTH: "R.C.", LEMMA: "roimh Chríost", POS: ADV}, - {ORTH: "r.Ch.", LEMMA: "roimh Chríost", POS: ADV}, - {ORTH: "r.Chr.", LEMMA: "roimh Chríost", POS: ADV}, - {ORTH: "R.Ch.", LEMMA: "roimh Chríost", POS: ADV}, - {ORTH: "R.Chr.", LEMMA: "roimh Chríost", POS: ADV}, - {ORTH: "⁊rl.", LEMMA: "agus araile", POS: ADV}, - {ORTH: "srl.", LEMMA: "agus araile", POS: ADV}, - {ORTH: "Co.", LEMMA: "contae", POS: NOUN}, - {ORTH: "Ean.", LEMMA: "Eanáir", POS: NOUN}, - {ORTH: "Feab.", LEMMA: "Feabhra", POS: NOUN}, - {ORTH: "gCo.", LEMMA: "contae", POS: NOUN}, - {ORTH: ".i.", LEMMA: "eadhon", POS: ADV}, - {ORTH: "B'", LEMMA: "ba", POS: AUX}, - {ORTH: "b'", LEMMA: "ba", POS: AUX}, - {ORTH: "lch.", LEMMA: "leathanach", POS: NOUN}, - {ORTH: "Lch.", LEMMA: "leathanach", POS: NOUN}, - {ORTH: "lgh.", LEMMA: "leathanach", POS: NOUN}, - {ORTH: "Lgh.", LEMMA: "leathanach", POS: NOUN}, - {ORTH: "Lún.", LEMMA: "Lúnasa", POS: NOUN}, - {ORTH: "Már.", LEMMA: "Márta", POS: NOUN}, - {ORTH: "Meith.", LEMMA: "Meitheamh", POS: NOUN}, - {ORTH: "Noll.", LEMMA: "Nollaig", POS: NOUN}, - {ORTH: "Samh.", LEMMA: "Samhain", POS: NOUN}, - {ORTH: "tAth.", LEMMA: "athair", POS: NOUN}, - {ORTH: "tUas.", LEMMA: "Uasal", POS: NOUN}, - {ORTH: "teo.", LEMMA: "teoranta", POS: NOUN}, - {ORTH: "Teo.", LEMMA: "teoranta", POS: NOUN}, - {ORTH: "Uas.", LEMMA: "Uasal", POS: NOUN}, - {ORTH: "uimh.", LEMMA: "uimhir", POS: NOUN}, - {ORTH: "Uimh.", LEMMA: "uimhir", POS: NOUN}, + {ORTH: "'gus", NORM: "agus"}, + {ORTH: "'ach", NORM: "gach"}, + {ORTH: "ao'", NORM: "aon"}, + {ORTH: "'niar", NORM: "aniar"}, + {ORTH: "'níos", NORM: "aníos"}, + {ORTH: "'ndiu", NORM: "inniu"}, + {ORTH: "'nocht", NORM: "anocht"}, + {ORTH: "m'"}, + {ORTH: "Aib."}, + {ORTH: "Ath."}, + {ORTH: "Beal."}, + {ORTH: "a.C.n."}, + {ORTH: "m.sh."}, + {ORTH: 
"M.F."}, + {ORTH: "M.Fómh."}, + {ORTH: "D.F."}, + {ORTH: "D.Fómh."}, + {ORTH: "r.C."}, + {ORTH: "R.C."}, + {ORTH: "r.Ch."}, + {ORTH: "r.Chr."}, + {ORTH: "R.Ch."}, + {ORTH: "R.Chr."}, + {ORTH: "⁊rl."}, + {ORTH: "srl."}, + {ORTH: "Co."}, + {ORTH: "Ean."}, + {ORTH: "Feab."}, + {ORTH: "gCo."}, + {ORTH: ".i."}, + {ORTH: "B'"}, + {ORTH: "b'"}, + {ORTH: "lch."}, + {ORTH: "Lch."}, + {ORTH: "lgh."}, + {ORTH: "Lgh."}, + {ORTH: "Lún."}, + {ORTH: "Már."}, + {ORTH: "Meith."}, + {ORTH: "Noll."}, + {ORTH: "Samh."}, + {ORTH: "tAth."}, + {ORTH: "tUas."}, + {ORTH: "teo."}, + {ORTH: "Teo."}, + {ORTH: "Uas."}, + {ORTH: "uimh."}, + {ORTH: "Uimh."}, ]: _exc[exc_data[ORTH]] = [exc_data] diff --git a/spacy/lang/id/tokenizer_exceptions.py b/spacy/lang/id/tokenizer_exceptions.py index 50ccfa33a..ff77ede9f 100644 --- a/spacy/lang/id/tokenizer_exceptions.py +++ b/spacy/lang/id/tokenizer_exceptions.py @@ -1,6 +1,6 @@ from ..tokenizer_exceptions import BASE_EXCEPTIONS from ._tokenizer_exceptions_list import ID_BASE_EXCEPTIONS -from ...symbols import ORTH, LEMMA, NORM +from ...symbols import ORTH, NORM from ...util import update_exc @@ -11,53 +11,47 @@ _exc = {} for orth in ID_BASE_EXCEPTIONS: _exc[orth] = [{ORTH: orth}] - orth_title = orth.title() _exc[orth_title] = [{ORTH: orth_title}] - orth_caps = orth.upper() _exc[orth_caps] = [{ORTH: orth_caps}] - orth_lower = orth.lower() _exc[orth_lower] = [{ORTH: orth_lower}] - orth_first_upper = orth[0].upper() + orth[1:] _exc[orth_first_upper] = [{ORTH: orth_first_upper}] - if "-" in orth: orth_title = "-".join([part.title() for part in orth.split("-")]) _exc[orth_title] = [{ORTH: orth_title}] - orth_caps = "-".join([part.upper() for part in orth.split("-")]) _exc[orth_caps] = [{ORTH: orth_caps}] for exc_data in [ - {ORTH: "Jan.", LEMMA: "Januari", NORM: "Januari"}, - {ORTH: "Feb.", LEMMA: "Februari", NORM: "Februari"}, - {ORTH: "Mar.", LEMMA: "Maret", NORM: "Maret"}, - {ORTH: "Apr.", LEMMA: "April", NORM: "April"}, - {ORTH: "Jun.", LEMMA: "Juni", NORM: "Juni"}, - {ORTH: "Jul.", LEMMA: "Juli", NORM: "Juli"}, - {ORTH: "Agu.", LEMMA: "Agustus", NORM: "Agustus"}, - {ORTH: "Ags.", LEMMA: "Agustus", NORM: "Agustus"}, - {ORTH: "Sep.", LEMMA: "September", NORM: "September"}, - {ORTH: "Okt.", LEMMA: "Oktober", NORM: "Oktober"}, - {ORTH: "Nov.", LEMMA: "November", NORM: "November"}, - {ORTH: "Des.", LEMMA: "Desember", NORM: "Desember"}, + {ORTH: "Jan.", NORM: "Januari"}, + {ORTH: "Feb.", NORM: "Februari"}, + {ORTH: "Mar.", NORM: "Maret"}, + {ORTH: "Apr.", NORM: "April"}, + {ORTH: "Jun.", NORM: "Juni"}, + {ORTH: "Jul.", NORM: "Juli"}, + {ORTH: "Agu.", NORM: "Agustus"}, + {ORTH: "Ags.", NORM: "Agustus"}, + {ORTH: "Sep.", NORM: "September"}, + {ORTH: "Okt.", NORM: "Oktober"}, + {ORTH: "Nov.", NORM: "November"}, + {ORTH: "Des.", NORM: "Desember"}, ]: _exc[exc_data[ORTH]] = [exc_data] _other_exc = { - "do'a": [{ORTH: "do'a", LEMMA: "doa", NORM: "doa"}], - "jum'at": [{ORTH: "jum'at", LEMMA: "Jumat", NORM: "Jumat"}], - "Jum'at": [{ORTH: "Jum'at", LEMMA: "Jumat", NORM: "Jumat"}], - "la'nat": [{ORTH: "la'nat", LEMMA: "laknat", NORM: "laknat"}], - "ma'af": [{ORTH: "ma'af", LEMMA: "maaf", NORM: "maaf"}], - "mu'jizat": [{ORTH: "mu'jizat", LEMMA: "mukjizat", NORM: "mukjizat"}], - "Mu'jizat": [{ORTH: "Mu'jizat", LEMMA: "mukjizat", NORM: "mukjizat"}], - "ni'mat": [{ORTH: "ni'mat", LEMMA: "nikmat", NORM: "nikmat"}], - "raka'at": [{ORTH: "raka'at", LEMMA: "rakaat", NORM: "rakaat"}], - "ta'at": [{ORTH: "ta'at", LEMMA: "taat", NORM: "taat"}], + "do'a": [{ORTH: "do'a", NORM: "doa"}], + "jum'at": [{ORTH: 
"jum'at", NORM: "Jumat"}], + "Jum'at": [{ORTH: "Jum'at", NORM: "Jumat"}], + "la'nat": [{ORTH: "la'nat", NORM: "laknat"}], + "ma'af": [{ORTH: "ma'af", NORM: "maaf"}], + "mu'jizat": [{ORTH: "mu'jizat", NORM: "mukjizat"}], + "Mu'jizat": [{ORTH: "Mu'jizat", NORM: "mukjizat"}], + "ni'mat": [{ORTH: "ni'mat", NORM: "nikmat"}], + "raka'at": [{ORTH: "raka'at", NORM: "rakaat"}], + "ta'at": [{ORTH: "ta'at", NORM: "taat"}], } _exc.update(_other_exc) diff --git a/spacy/lang/it/tokenizer_exceptions.py b/spacy/lang/it/tokenizer_exceptions.py index c9c729d63..0c9968bc6 100644 --- a/spacy/lang/it/tokenizer_exceptions.py +++ b/spacy/lang/it/tokenizer_exceptions.py @@ -1,5 +1,5 @@ from ..tokenizer_exceptions import BASE_EXCEPTIONS -from ...symbols import ORTH, LEMMA +from ...symbols import ORTH from ...util import update_exc @@ -10,7 +10,7 @@ _exc = { "L'art.": [{ORTH: "L'"}, {ORTH: "art."}], "l'art.": [{ORTH: "l'"}, {ORTH: "art."}], "nell'art.": [{ORTH: "nell'"}, {ORTH: "art."}], - "po'": [{ORTH: "po'", LEMMA: "poco"}], + "po'": [{ORTH: "po'"}], "sett..": [{ORTH: "sett."}, {ORTH: "."}], } diff --git a/spacy/lang/lb/tokenizer_exceptions.py b/spacy/lang/lb/tokenizer_exceptions.py index f6cdc7f34..d00dc9610 100644 --- a/spacy/lang/lb/tokenizer_exceptions.py +++ b/spacy/lang/lb/tokenizer_exceptions.py @@ -1,5 +1,5 @@ from ..tokenizer_exceptions import BASE_EXCEPTIONS -from ...symbols import ORTH, LEMMA, NORM +from ...symbols import ORTH, NORM from ...util import update_exc @@ -10,19 +10,19 @@ _exc = {} # translate / delete what is not necessary for exc_data in [ - {ORTH: "’t", LEMMA: "et", NORM: "et"}, - {ORTH: "’T", LEMMA: "et", NORM: "et"}, - {ORTH: "'t", LEMMA: "et", NORM: "et"}, - {ORTH: "'T", LEMMA: "et", NORM: "et"}, - {ORTH: "wgl.", LEMMA: "wannechgelift", NORM: "wannechgelift"}, - {ORTH: "M.", LEMMA: "Monsieur", NORM: "Monsieur"}, - {ORTH: "Mme.", LEMMA: "Madame", NORM: "Madame"}, - {ORTH: "Dr.", LEMMA: "Dokter", NORM: "Dokter"}, - {ORTH: "Tel.", LEMMA: "Telefon", NORM: "Telefon"}, - {ORTH: "asw.", LEMMA: "an sou weider", NORM: "an sou weider"}, - {ORTH: "etc.", LEMMA: "et cetera", NORM: "et cetera"}, - {ORTH: "bzw.", LEMMA: "bezéiungsweis", NORM: "bezéiungsweis"}, - {ORTH: "Jan.", LEMMA: "Januar", NORM: "Januar"}, + {ORTH: "’t", NORM: "et"}, + {ORTH: "’T", NORM: "et"}, + {ORTH: "'t", NORM: "et"}, + {ORTH: "'T", NORM: "et"}, + {ORTH: "wgl.", NORM: "wannechgelift"}, + {ORTH: "M.", NORM: "Monsieur"}, + {ORTH: "Mme.", NORM: "Madame"}, + {ORTH: "Dr.", NORM: "Dokter"}, + {ORTH: "Tel.", NORM: "Telefon"}, + {ORTH: "asw.", NORM: "an sou weider"}, + {ORTH: "etc.", NORM: "et cetera"}, + {ORTH: "bzw.", NORM: "bezéiungsweis"}, + {ORTH: "Jan.", NORM: "Januar"}, ]: _exc[exc_data[ORTH]] = [exc_data] diff --git a/spacy/lang/lij/tokenizer_exceptions.py b/spacy/lang/lij/tokenizer_exceptions.py index 61fa0df52..52eae2c89 100644 --- a/spacy/lang/lij/tokenizer_exceptions.py +++ b/spacy/lang/lij/tokenizer_exceptions.py @@ -1,53 +1,50 @@ from ..tokenizer_exceptions import BASE_EXCEPTIONS -from ...symbols import ORTH, LEMMA +from ...symbols import ORTH from ...util import update_exc _exc = {} -for raw, lemma in [ - ("a-a", "a-o"), - ("a-e", "a-o"), - ("a-o", "a-o"), - ("a-i", "a-o"), - ("co-a", "co-o"), - ("co-e", "co-o"), - ("co-i", "co-o"), - ("co-o", "co-o"), - ("da-a", "da-o"), - ("da-e", "da-o"), - ("da-i", "da-o"), - ("da-o", "da-o"), - ("pe-a", "pe-o"), - ("pe-e", "pe-o"), - ("pe-i", "pe-o"), - ("pe-o", "pe-o"), +for raw in [ + "a-e", + "a-o", + "a-i", + "a-a", + "co-a", + "co-e", + "co-i", + "co-o", + "da-a", + "da-e", 
+ "da-i", + "da-o", + "pe-a", + "pe-e", + "pe-i", + "pe-o", ]: for orth in [raw, raw.capitalize()]: - _exc[orth] = [{ORTH: orth, LEMMA: lemma}] + _exc[orth] = [{ORTH: orth}] # Prefix + prepositions with à (e.g. "sott'a-o") -for prep, prep_lemma in [ - ("a-a", "a-o"), - ("a-e", "a-o"), - ("a-o", "a-o"), - ("a-i", "a-o"), +for prep in [ + "a-a", + "a-e", + "a-o", + "a-i", ]: - for prefix, prefix_lemma in [ - ("sott'", "sotta"), - ("sott’", "sotta"), - ("contr'", "contra"), - ("contr’", "contra"), - ("ch'", "che"), - ("ch’", "che"), - ("s'", "se"), - ("s’", "se"), + for prefix in [ + "sott'", + "sott’", + "contr'", + "contr’", + "ch'", + "ch’", + "s'", + "s’", ]: for prefix_orth in [prefix, prefix.capitalize()]: - _exc[prefix_orth + prep] = [ - {ORTH: prefix_orth, LEMMA: prefix_lemma}, - {ORTH: prep, LEMMA: prep_lemma}, - ] + _exc[prefix_orth + prep] = [{ORTH: prefix_orth}, {ORTH: prep}] TOKENIZER_EXCEPTIONS = update_exc(BASE_EXCEPTIONS, _exc) diff --git a/spacy/lang/nb/tokenizer_exceptions.py b/spacy/lang/nb/tokenizer_exceptions.py index 9a604cedc..0be436ae4 100644 --- a/spacy/lang/nb/tokenizer_exceptions.py +++ b/spacy/lang/nb/tokenizer_exceptions.py @@ -1,5 +1,5 @@ from ..tokenizer_exceptions import BASE_EXCEPTIONS -from ...symbols import ORTH, LEMMA +from ...symbols import ORTH, NORM from ...util import update_exc @@ -7,17 +7,17 @@ _exc = {} for exc_data in [ - {ORTH: "jan.", LEMMA: "januar"}, - {ORTH: "feb.", LEMMA: "februar"}, - {ORTH: "mar.", LEMMA: "mars"}, - {ORTH: "apr.", LEMMA: "april"}, - {ORTH: "jun.", LEMMA: "juni"}, - {ORTH: "jul.", LEMMA: "juli"}, - {ORTH: "aug.", LEMMA: "august"}, - {ORTH: "sep.", LEMMA: "september"}, - {ORTH: "okt.", LEMMA: "oktober"}, - {ORTH: "nov.", LEMMA: "november"}, - {ORTH: "des.", LEMMA: "desember"}, + {ORTH: "jan.", NORM: "januar"}, + {ORTH: "feb.", NORM: "februar"}, + {ORTH: "mar.", NORM: "mars"}, + {ORTH: "apr.", NORM: "april"}, + {ORTH: "jun.", NORM: "juni"}, + {ORTH: "jul.", NORM: "juli"}, + {ORTH: "aug.", NORM: "august"}, + {ORTH: "sep.", NORM: "september"}, + {ORTH: "okt.", NORM: "oktober"}, + {ORTH: "nov.", NORM: "november"}, + {ORTH: "des.", NORM: "desember"}, ]: _exc[exc_data[ORTH]] = [exc_data] diff --git a/spacy/lang/ru/tokenizer_exceptions.py b/spacy/lang/ru/tokenizer_exceptions.py index e4fbd2d75..1dc363fae 100644 --- a/spacy/lang/ru/tokenizer_exceptions.py +++ b/spacy/lang/ru/tokenizer_exceptions.py @@ -1,5 +1,5 @@ from ..tokenizer_exceptions import BASE_EXCEPTIONS -from ...symbols import ORTH, LEMMA, NORM +from ...symbols import ORTH, NORM from ...util import update_exc @@ -7,58 +7,56 @@ _exc = {} _abbrev_exc = [ # Weekdays abbreviations - {ORTH: "пн", LEMMA: "понедельник", NORM: "понедельник"}, - {ORTH: "вт", LEMMA: "вторник", NORM: "вторник"}, - {ORTH: "ср", LEMMA: "среда", NORM: "среда"}, - {ORTH: "чт", LEMMA: "четверг", NORM: "четверг"}, - {ORTH: "чтв", LEMMA: "четверг", NORM: "четверг"}, - {ORTH: "пт", LEMMA: "пятница", NORM: "пятница"}, - {ORTH: "сб", LEMMA: "суббота", NORM: "суббота"}, - {ORTH: "сбт", LEMMA: "суббота", NORM: "суббота"}, - {ORTH: "вс", LEMMA: "воскресенье", NORM: "воскресенье"}, - {ORTH: "вскр", LEMMA: "воскресенье", NORM: "воскресенье"}, - {ORTH: "воскр", LEMMA: "воскресенье", NORM: "воскресенье"}, + {ORTH: "пн", NORM: "понедельник"}, + {ORTH: "вт", NORM: "вторник"}, + {ORTH: "ср", NORM: "среда"}, + {ORTH: "чт", NORM: "четверг"}, + {ORTH: "чтв", NORM: "четверг"}, + {ORTH: "пт", NORM: "пятница"}, + {ORTH: "сб", NORM: "суббота"}, + {ORTH: "сбт", NORM: "суббота"}, + {ORTH: "вс", NORM: "воскресенье"}, + {ORTH: 
"вскр", NORM: "воскресенье"}, + {ORTH: "воскр", NORM: "воскресенье"}, # Months abbreviations - {ORTH: "янв", LEMMA: "январь", NORM: "январь"}, - {ORTH: "фев", LEMMA: "февраль", NORM: "февраль"}, - {ORTH: "февр", LEMMA: "февраль", NORM: "февраль"}, - {ORTH: "мар", LEMMA: "март", NORM: "март"}, - # {ORTH: "март", LEMMA: "март", NORM: "март"}, - {ORTH: "мрт", LEMMA: "март", NORM: "март"}, - {ORTH: "апр", LEMMA: "апрель", NORM: "апрель"}, - # {ORTH: "май", LEMMA: "май", NORM: "май"}, - {ORTH: "июн", LEMMA: "июнь", NORM: "июнь"}, - # {ORTH: "июнь", LEMMA: "июнь", NORM: "июнь"}, - {ORTH: "июл", LEMMA: "июль", NORM: "июль"}, - # {ORTH: "июль", LEMMA: "июль", NORM: "июль"}, - {ORTH: "авг", LEMMA: "август", NORM: "август"}, - {ORTH: "сен", LEMMA: "сентябрь", NORM: "сентябрь"}, - {ORTH: "сент", LEMMA: "сентябрь", NORM: "сентябрь"}, - {ORTH: "окт", LEMMA: "октябрь", NORM: "октябрь"}, - {ORTH: "октб", LEMMA: "октябрь", NORM: "октябрь"}, - {ORTH: "ноя", LEMMA: "ноябрь", NORM: "ноябрь"}, - {ORTH: "нояб", LEMMA: "ноябрь", NORM: "ноябрь"}, - {ORTH: "нбр", LEMMA: "ноябрь", NORM: "ноябрь"}, - {ORTH: "дек", LEMMA: "декабрь", NORM: "декабрь"}, + {ORTH: "янв", NORM: "январь"}, + {ORTH: "фев", NORM: "февраль"}, + {ORTH: "февр", NORM: "февраль"}, + {ORTH: "мар", NORM: "март"}, + # {ORTH: "март", NORM: "март"}, + {ORTH: "мрт", NORM: "март"}, + {ORTH: "апр", NORM: "апрель"}, + # {ORTH: "май", NORM: "май"}, + {ORTH: "июн", NORM: "июнь"}, + # {ORTH: "июнь", NORM: "июнь"}, + {ORTH: "июл", NORM: "июль"}, + # {ORTH: "июль", NORM: "июль"}, + {ORTH: "авг", NORM: "август"}, + {ORTH: "сен", NORM: "сентябрь"}, + {ORTH: "сент", NORM: "сентябрь"}, + {ORTH: "окт", NORM: "октябрь"}, + {ORTH: "октб", NORM: "октябрь"}, + {ORTH: "ноя", NORM: "ноябрь"}, + {ORTH: "нояб", NORM: "ноябрь"}, + {ORTH: "нбр", NORM: "ноябрь"}, + {ORTH: "дек", NORM: "декабрь"}, ] for abbrev_desc in _abbrev_exc: abbrev = abbrev_desc[ORTH] for orth in (abbrev, abbrev.capitalize(), abbrev.upper()): - _exc[orth] = [{ORTH: orth, LEMMA: abbrev_desc[LEMMA], NORM: abbrev_desc[NORM]}] - _exc[orth + "."] = [ - {ORTH: orth + ".", LEMMA: abbrev_desc[LEMMA], NORM: abbrev_desc[NORM]} - ] + _exc[orth] = [{ORTH: orth, NORM: abbrev_desc[NORM]}] + _exc[orth + "."] = [{ORTH: orth + ".", NORM: abbrev_desc[NORM]}] _slang_exc = [ - {ORTH: "2к15", LEMMA: "2015", NORM: "2015"}, - {ORTH: "2к16", LEMMA: "2016", NORM: "2016"}, - {ORTH: "2к17", LEMMA: "2017", NORM: "2017"}, - {ORTH: "2к18", LEMMA: "2018", NORM: "2018"}, - {ORTH: "2к19", LEMMA: "2019", NORM: "2019"}, - {ORTH: "2к20", LEMMA: "2020", NORM: "2020"}, + {ORTH: "2к15", NORM: "2015"}, + {ORTH: "2к16", NORM: "2016"}, + {ORTH: "2к17", NORM: "2017"}, + {ORTH: "2к18", NORM: "2018"}, + {ORTH: "2к19", NORM: "2019"}, + {ORTH: "2к20", NORM: "2020"}, ] for slang_desc in _slang_exc: diff --git a/spacy/lang/sr/tokenizer_exceptions.py b/spacy/lang/sr/tokenizer_exceptions.py index a41fe7e4e..dcaa3e239 100755 --- a/spacy/lang/sr/tokenizer_exceptions.py +++ b/spacy/lang/sr/tokenizer_exceptions.py @@ -1,5 +1,5 @@ from ..tokenizer_exceptions import BASE_EXCEPTIONS -from ...symbols import ORTH, LEMMA, NORM +from ...symbols import ORTH, NORM from ...util import update_exc @@ -7,85 +7,83 @@ _exc = {} _abbrev_exc = [ # Weekdays abbreviations - {ORTH: "пoн", LEMMA: "понедељак", NORM: "понедељак"}, - {ORTH: "уто", LEMMA: "уторак", NORM: "уторак"}, - {ORTH: "сре", LEMMA: "среда", NORM: "среда"}, - {ORTH: "чет", LEMMA: "четвртак", NORM: "четвртак"}, - {ORTH: "пет", LEMMA: "петак", NORM: "петак"}, - {ORTH: "суб", LEMMA: "субота", NORM: "субота"}, - 
{ORTH: "нед", LEMMA: "недеља", NORM: "недеља"}, + {ORTH: "пoн", NORM: "понедељак"}, + {ORTH: "уто", NORM: "уторак"}, + {ORTH: "сре", NORM: "среда"}, + {ORTH: "чет", NORM: "четвртак"}, + {ORTH: "пет", NORM: "петак"}, + {ORTH: "суб", NORM: "субота"}, + {ORTH: "нед", NORM: "недеља"}, # Months abbreviations - {ORTH: "јан", LEMMA: "јануар", NORM: "јануар"}, - {ORTH: "феб", LEMMA: "фебруар", NORM: "фебруар"}, - {ORTH: "мар", LEMMA: "март", NORM: "март"}, - {ORTH: "апр", LEMMA: "април", NORM: "април"}, - {ORTH: "јуни", LEMMA: "јун", NORM: "јун"}, - {ORTH: "јули", LEMMA: "јул", NORM: "јул"}, - {ORTH: "авг", LEMMA: "август", NORM: "август"}, - {ORTH: "сеп", LEMMA: "септембар", NORM: "септембар"}, - {ORTH: "септ", LEMMA: "септембар", NORM: "септембар"}, - {ORTH: "окт", LEMMA: "октобар", NORM: "октобар"}, - {ORTH: "нов", LEMMA: "новембар", NORM: "новембар"}, - {ORTH: "дец", LEMMA: "децембар", NORM: "децембар"}, + {ORTH: "јан", NORM: "јануар"}, + {ORTH: "феб", NORM: "фебруар"}, + {ORTH: "мар", NORM: "март"}, + {ORTH: "апр", NORM: "април"}, + {ORTH: "јуни", NORM: "јун"}, + {ORTH: "јули", NORM: "јул"}, + {ORTH: "авг", NORM: "август"}, + {ORTH: "сеп", NORM: "септембар"}, + {ORTH: "септ", NORM: "септембар"}, + {ORTH: "окт", NORM: "октобар"}, + {ORTH: "нов", NORM: "новембар"}, + {ORTH: "дец", NORM: "децембар"}, ] for abbrev_desc in _abbrev_exc: abbrev = abbrev_desc[ORTH] for orth in (abbrev, abbrev.capitalize(), abbrev.upper()): - _exc[orth] = [{ORTH: orth, LEMMA: abbrev_desc[LEMMA], NORM: abbrev_desc[NORM]}] - _exc[orth + "."] = [ - {ORTH: orth + ".", LEMMA: abbrev_desc[LEMMA], NORM: abbrev_desc[NORM]} - ] + _exc[orth] = [{ORTH: orth, NORM: abbrev_desc[NORM]}] + _exc[orth + "."] = [{ORTH: orth + ".", NORM: abbrev_desc[NORM]}] # common abbreviations _slang_exc = [ # without dot - {ORTH: "др", LEMMA: "доктор", NORM: "доктор"}, - {ORTH: "гдин", LEMMA: "господин", NORM: "господин"}, - {ORTH: "гђа", LEMMA: "госпођа", NORM: "госпођа"}, - {ORTH: "гђица", LEMMA: "госпођица", NORM: "госпођица"}, - {ORTH: "мр", LEMMA: "магистар", NORM: "магистар"}, - {ORTH: "Бгд", LEMMA: "Београд", NORM: "београд"}, - {ORTH: "цм", LEMMA: "центиметар", NORM: "центиметар"}, - {ORTH: "м", LEMMA: "метар", NORM: "метар"}, - {ORTH: "км", LEMMA: "километар", NORM: "километар"}, - {ORTH: "мг", LEMMA: "милиграм", NORM: "милиграм"}, - {ORTH: "кг", LEMMA: "килограм", NORM: "килограм"}, - {ORTH: "дл", LEMMA: "децилитар", NORM: "децилитар"}, - {ORTH: "хл", LEMMA: "хектолитар", NORM: "хектолитар"}, + {ORTH: "др", NORM: "доктор"}, + {ORTH: "гдин", NORM: "господин"}, + {ORTH: "гђа", NORM: "госпођа"}, + {ORTH: "гђица", NORM: "госпођица"}, + {ORTH: "мр", NORM: "магистар"}, + {ORTH: "Бгд", NORM: "београд"}, + {ORTH: "цм", NORM: "центиметар"}, + {ORTH: "м", NORM: "метар"}, + {ORTH: "км", NORM: "километар"}, + {ORTH: "мг", NORM: "милиграм"}, + {ORTH: "кг", NORM: "килограм"}, + {ORTH: "дл", NORM: "децилитар"}, + {ORTH: "хл", NORM: "хектолитар"}, # with dot - {ORTH: "ул.", LEMMA: "улица", NORM: "улица"}, - {ORTH: "бр.", LEMMA: "број", NORM: "број"}, - {ORTH: "нпр.", LEMMA: "на пример", NORM: "на пример"}, - {ORTH: "тзв.", LEMMA: "такозван", NORM: "такозван"}, - {ORTH: "проф.", LEMMA: "професор", NORM: "професор"}, - {ORTH: "стр.", LEMMA: "страна", NORM: "страна"}, - {ORTH: "једн.", LEMMA: "једнина", NORM: "једнина"}, - {ORTH: "мн.", LEMMA: "множина", NORM: "множина"}, - {ORTH: "уч.", LEMMA: "ученик", NORM: "ученик"}, - {ORTH: "разр.", LEMMA: "разред", NORM: "разред"}, - {ORTH: "инж.", LEMMA: "инжењер", NORM: "инжењер"}, - {ORTH: "гимн.", LEMMA: 
"гимназија", NORM: "гимназија"}, - {ORTH: "год.", LEMMA: "година", NORM: "година"}, - {ORTH: "мед.", LEMMA: "медицина", NORM: "медицина"}, - {ORTH: "гимн.", LEMMA: "гимназија", NORM: "гимназија"}, - {ORTH: "акад.", LEMMA: "академик", NORM: "академик"}, - {ORTH: "доц.", LEMMA: "доцент", NORM: "доцент"}, - {ORTH: "итд.", LEMMA: "и тако даље", NORM: "и тако даље"}, - {ORTH: "и сл.", LEMMA: "и слично", NORM: "и слично"}, - {ORTH: "н.е.", LEMMA: "нова ера", NORM: "нове ере"}, - {ORTH: "о.г.", LEMMA: "ова година", NORM: "ове године"}, - {ORTH: "л.к.", LEMMA: "лична карта", NORM: "лична карта"}, - {ORTH: "в.д.", LEMMA: "вршилац дужности", NORM: "вршилац дужности"}, - {ORTH: "стр.", LEMMA: "страна", NORM: "страна"}, + {ORTH: "ул.", NORM: "улица"}, + {ORTH: "бр.", NORM: "број"}, + {ORTH: "нпр.", NORM: "на пример"}, + {ORTH: "тзв.", NORM: "такозван"}, + {ORTH: "проф.", NORM: "професор"}, + {ORTH: "стр.", NORM: "страна"}, + {ORTH: "једн.", NORM: "једнина"}, + {ORTH: "мн.", NORM: "множина"}, + {ORTH: "уч.", NORM: "ученик"}, + {ORTH: "разр.", NORM: "разред"}, + {ORTH: "инж.", NORM: "инжењер"}, + {ORTH: "гимн.", NORM: "гимназија"}, + {ORTH: "год.", NORM: "година"}, + {ORTH: "мед.", NORM: "медицина"}, + {ORTH: "гимн.", NORM: "гимназија"}, + {ORTH: "акад.", NORM: "академик"}, + {ORTH: "доц.", NORM: "доцент"}, + {ORTH: "итд.", NORM: "и тако даље"}, + {ORTH: "и сл.", NORM: "и слично"}, + {ORTH: "н.е.", NORM: "нове ере"}, + {ORTH: "о.г.", NORM: "ове године"}, + {ORTH: "л.к.", NORM: "лична карта"}, + {ORTH: "в.д.", NORM: "вршилац дужности"}, + {ORTH: "стр.", NORM: "страна"}, # with qoute - {ORTH: "ал'", LEMMA: "али", NORM: "али"}, - {ORTH: "ил'", LEMMA: "или", NORM: "или"}, - {ORTH: "је л'", LEMMA: "је ли", NORM: "је ли"}, - {ORTH: "да л'", LEMMA: "да ли", NORM: "да ли"}, - {ORTH: "држ'те", LEMMA: "држати", NORM: "држите"}, + {ORTH: "ал'", NORM: "али"}, + {ORTH: "ил'", NORM: "или"}, + {ORTH: "је л'", NORM: "је ли"}, + {ORTH: "да л'", NORM: "да ли"}, + {ORTH: "држ'те", NORM: "држите"}, ] for slang_desc in _slang_exc: diff --git a/spacy/lang/sv/tokenizer_exceptions.py b/spacy/lang/sv/tokenizer_exceptions.py index f1b914bff..64206f2f2 100644 --- a/spacy/lang/sv/tokenizer_exceptions.py +++ b/spacy/lang/sv/tokenizer_exceptions.py @@ -1,5 +1,5 @@ from ..tokenizer_exceptions import BASE_EXCEPTIONS -from ...symbols import LEMMA, NORM, ORTH, PRON_LEMMA +from ...symbols import NORM, ORTH from ...util import update_exc _exc = {} @@ -10,61 +10,58 @@ _exc = {} for verb_data in [ {ORTH: "driver"}, {ORTH: "kör"}, - {ORTH: "hörr", LEMMA: "hör"}, + {ORTH: "hörr"}, {ORTH: "fattar"}, - {ORTH: "hajar", LEMMA: "förstår"}, + {ORTH: "hajar"}, {ORTH: "lever"}, - {ORTH: "serr", LEMMA: "ser"}, + {ORTH: "serr"}, {ORTH: "fixar"}, ]: verb_data_tc = dict(verb_data) verb_data_tc[ORTH] = verb_data_tc[ORTH].title() for data in [verb_data, verb_data_tc]: - _exc[data[ORTH] + "u"] = [ - dict(data), - {ORTH: "u", LEMMA: PRON_LEMMA, NORM: "du"}, - ] + _exc[data[ORTH] + "u"] = [data, {ORTH: "u", NORM: "du"}] # Abbreviations for weekdays "sön." (for "söndag" / "söner") # are left out because they are ambiguous. The same is the case # for abbreviations "jul." and "Jul." ("juli" / "jul"). 
for exc_data in [ - {ORTH: "jan.", LEMMA: "januari"}, - {ORTH: "febr.", LEMMA: "februari"}, - {ORTH: "feb.", LEMMA: "februari"}, - {ORTH: "apr.", LEMMA: "april"}, - {ORTH: "jun.", LEMMA: "juni"}, - {ORTH: "aug.", LEMMA: "augusti"}, - {ORTH: "sept.", LEMMA: "september"}, - {ORTH: "sep.", LEMMA: "september"}, - {ORTH: "okt.", LEMMA: "oktober"}, - {ORTH: "nov.", LEMMA: "november"}, - {ORTH: "dec.", LEMMA: "december"}, - {ORTH: "mån.", LEMMA: "måndag"}, - {ORTH: "tis.", LEMMA: "tisdag"}, - {ORTH: "ons.", LEMMA: "onsdag"}, - {ORTH: "tors.", LEMMA: "torsdag"}, - {ORTH: "fre.", LEMMA: "fredag"}, - {ORTH: "lör.", LEMMA: "lördag"}, - {ORTH: "Jan.", LEMMA: "Januari"}, - {ORTH: "Febr.", LEMMA: "Februari"}, - {ORTH: "Feb.", LEMMA: "Februari"}, - {ORTH: "Apr.", LEMMA: "April"}, - {ORTH: "Jun.", LEMMA: "Juni"}, - {ORTH: "Aug.", LEMMA: "Augusti"}, - {ORTH: "Sept.", LEMMA: "September"}, - {ORTH: "Sep.", LEMMA: "September"}, - {ORTH: "Okt.", LEMMA: "Oktober"}, - {ORTH: "Nov.", LEMMA: "November"}, - {ORTH: "Dec.", LEMMA: "December"}, - {ORTH: "Mån.", LEMMA: "Måndag"}, - {ORTH: "Tis.", LEMMA: "Tisdag"}, - {ORTH: "Ons.", LEMMA: "Onsdag"}, - {ORTH: "Tors.", LEMMA: "Torsdag"}, - {ORTH: "Fre.", LEMMA: "Fredag"}, - {ORTH: "Lör.", LEMMA: "Lördag"}, - {ORTH: "sthlm", LEMMA: "Stockholm"}, - {ORTH: "gbg", LEMMA: "Göteborg"}, + {ORTH: "jan.", NORM: "januari"}, + {ORTH: "febr.", NORM: "februari"}, + {ORTH: "feb.", NORM: "februari"}, + {ORTH: "apr.", NORM: "april"}, + {ORTH: "jun.", NORM: "juni"}, + {ORTH: "aug.", NORM: "augusti"}, + {ORTH: "sept.", NORM: "september"}, + {ORTH: "sep.", NORM: "september"}, + {ORTH: "okt.", NORM: "oktober"}, + {ORTH: "nov.", NORM: "november"}, + {ORTH: "dec.", NORM: "december"}, + {ORTH: "mån.", NORM: "måndag"}, + {ORTH: "tis.", NORM: "tisdag"}, + {ORTH: "ons.", NORM: "onsdag"}, + {ORTH: "tors.", NORM: "torsdag"}, + {ORTH: "fre.", NORM: "fredag"}, + {ORTH: "lör.", NORM: "lördag"}, + {ORTH: "Jan.", NORM: "Januari"}, + {ORTH: "Febr.", NORM: "Februari"}, + {ORTH: "Feb.", NORM: "Februari"}, + {ORTH: "Apr.", NORM: "April"}, + {ORTH: "Jun.", NORM: "Juni"}, + {ORTH: "Aug.", NORM: "Augusti"}, + {ORTH: "Sept.", NORM: "September"}, + {ORTH: "Sep.", NORM: "September"}, + {ORTH: "Okt.", NORM: "Oktober"}, + {ORTH: "Nov.", NORM: "November"}, + {ORTH: "Dec.", NORM: "December"}, + {ORTH: "Mån.", NORM: "Måndag"}, + {ORTH: "Tis.", NORM: "Tisdag"}, + {ORTH: "Ons.", NORM: "Onsdag"}, + {ORTH: "Tors.", NORM: "Torsdag"}, + {ORTH: "Fre.", NORM: "Fredag"}, + {ORTH: "Lör.", NORM: "Lördag"}, + {ORTH: "sthlm", NORM: "Stockholm"}, + {ORTH: "gbg", NORM: "Göteborg"}, ]: _exc[exc_data[ORTH]] = [exc_data] @@ -154,6 +151,6 @@ for orth in ABBREVIATIONS: # Sentences ending in "i." (as in "... peka i."), "m." (as in "...än 2000 m."), # should be tokenized as two separate tokens. 
for orth in ["i", "m"]:
- _exc[orth + "."] = [{ORTH: orth, LEMMA: orth, NORM: orth}, {ORTH: "."}]
+ _exc[orth + "."] = [{ORTH: orth, NORM: orth}, {ORTH: "."}]

 TOKENIZER_EXCEPTIONS = update_exc(BASE_EXCEPTIONS, _exc)
diff --git a/spacy/lang/th/tokenizer_exceptions.py b/spacy/lang/th/tokenizer_exceptions.py
index 0529b3a99..92116d474 100644
--- a/spacy/lang/th/tokenizer_exceptions.py
+++ b/spacy/lang/th/tokenizer_exceptions.py
@@ -1,469 +1,438 @@
-from ...symbols import ORTH, LEMMA
+from ...symbols import ORTH

 _exc = {
 # หน่วยงานรัฐ / government agency
- "กกต.": [{ORTH: "กกต.", LEMMA: "คณะกรรมการการเลือกตั้ง"}],
- "กทท.": [{ORTH: "กทท.", LEMMA: "การท่าเรือแห่งประเทศไทย"}],
- "กทพ.": [{ORTH: "กทพ.", LEMMA: "การทางพิเศษแห่งประเทศไทย"}],
- "กบข.": [{ORTH: "กบข.", LEMMA: "กองทุนบำเหน็จบำนาญข้าราชการพลเรือน"}],
- "กบว.": [{ORTH: "กบว.", LEMMA: "คณะกรรมการบริหารวิทยุกระจายเสียงและวิทยุโทรทัศน์"}],
- "กปน.": [{ORTH: "กปน.", LEMMA: "การประปานครหลวง"}],
- "กปภ.": [{ORTH: "กปภ.", LEMMA: "การประปาส่วนภูมิภาค"}],
- "กปส.": [{ORTH: "กปส.", LEMMA: "กรมประชาสัมพันธ์"}],
- "กผม.": [{ORTH: "กผม.", LEMMA: "กองผังเมือง"}],
- "กฟน.": [{ORTH: "กฟน.", LEMMA: "การไฟฟ้านครหลวง"}],
- "กฟผ.": [{ORTH: "กฟผ.", LEMMA: "การไฟฟ้าฝ่ายผลิตแห่งประเทศไทย"}],
- "กฟภ.": [{ORTH: "กฟภ.", LEMMA: "การไฟฟ้าส่วนภูมิภาค"}],
- "ก.ช.น.": [{ORTH: "ก.ช.น.", LEMMA: "คณะกรรมการช่วยเหลือชาวนาชาวไร่"}],
- "กยศ.": [{ORTH: "กยศ.", LEMMA: "กองทุนเงินให้กู้ยืมเพื่อการศึกษา"}],
- "ก.ล.ต.": [{ORTH: "ก.ล.ต.", LEMMA: "คณะกรรมการกำกับหลักทรัพย์และตลาดหลักทรัพย์"}],
- "กศ.บ.": [{ORTH: "กศ.บ.", LEMMA: "การศึกษาบัณฑิต"}],
- "กศน.": [{ORTH: "กศน.", LEMMA: "กรมการศึกษานอกโรงเรียน"}],
- "กสท.": [{ORTH: "กสท.", LEMMA: "การสื่อสารแห่งประเทศไทย"}],
- "กอ.รมน.": [{ORTH: "กอ.รมน.", LEMMA: "กองอำนวยการรักษาความมั่นคงภายใน"}],
- "กร.": [{ORTH: "กร.", LEMMA: "กองเรือยุทธการ"}],
- "ขสมก.": [{ORTH: "ขสมก.", LEMMA: "องค์การขนส่งมวลชนกรุงเทพ"}],
- "คตง.": [{ORTH: "คตง.", LEMMA: "คณะกรรมการตรวจเงินแผ่นดิน"}],
- "ครม.": [{ORTH: "ครม.", LEMMA: "คณะรัฐมนตรี"}],
- "คมช.": [{ORTH: "คมช.", LEMMA: "คณะมนตรีความมั่นคงแห่งชาติ"}],
- "ตชด.": [{ORTH: "ตชด.", LEMMA: "ตำรวจตะเวนชายเดน"}],
- "ตม.": [{ORTH: "ตม.", LEMMA: "กองตรวจคนเข้าเมือง"}],
- "ตร.": [{ORTH: "ตร.", LEMMA: "ตำรวจ"}],
- "ททท.": [{ORTH: "ททท.", LEMMA: "การท่องเที่ยวแห่งประเทศไทย"}],
- "ททบ.": [{ORTH: "ททบ.", LEMMA: "สถานีวิทยุโทรทัศน์กองทัพบก"}],
- "ทบ.": [{ORTH: "ทบ.", LEMMA: "กองทัพบก"}],
- "ทร.": [{ORTH: "ทร.", LEMMA: "กองทัพเรือ"}],
- "ทอ.": [{ORTH: "ทอ.", LEMMA: "กองทัพอากาศ"}],
- "ทอท.": [{ORTH: "ทอท.", LEMMA: "การท่าอากาศยานแห่งประเทศไทย"}],
- "ธ.ก.ส.": [{ORTH: "ธ.ก.ส.", LEMMA: "ธนาคารเพื่อการเกษตรและสหกรณ์การเกษตร"}],
- "ธปท.": [{ORTH: "ธปท.", LEMMA: "ธนาคารแห่งประเทศไทย"}],
- "ธอส.": [{ORTH: "ธอส.", LEMMA: "ธนาคารอาคารสงเคราะห์"}],
- "นย.": [{ORTH: "นย.", LEMMA: "นาวิกโยธิน"}],
- "ปตท.": [{ORTH: "ปตท.", LEMMA: "การปิโตรเลียมแห่งประเทศไทย"}],
- "ป.ป.ช.": [
- {
- ORTH: "ป.ป.ช.",
- LEMMA: "คณะกรรมการป้องกันและปราบปรามการทุจริตและประพฤติมิชอบในวงราชการ",
- }
- ],
- "ป.ป.ส.": [{ORTH: "ป.ป.ส.", LEMMA: "คณะกรรมการป้องกันและปราบปรามยาเสพติด"}],
- "บพร.": [{ORTH: "บพร.", LEMMA: "กรมการบินพลเรือน"}],
- "บย.": [{ORTH: "บย.", LEMMA: "กองบินยุทธการ"}],
- "พสวท.": [
- {
- ORTH: "พสวท.",
- LEMMA: "โครงการพัฒนาและส่งเสริมผู้มีความรู้ความสามารถพิเศษทางวิทยาศาสตร์และเทคโนโลยี",
- }
- ],
- "มอก.": [{ORTH: "มอก.", LEMMA: "สำนักงานมาตรฐานผลิตภัณฑ์อุตสาหกรรม"}],
- "ยธ.": [{ORTH: "ยธ.", LEMMA: "กรมโยธาธิการ"}],
- "รพช.": [{ORTH: "รพช.", LEMMA: "สำนักงานเร่งรัดพัฒนาชนบท"}],
- "รฟท.": [{ORTH: "รฟท.", LEMMA: 
"การรถไฟแห่งประเทศไทย"}], - "รฟม.": [{ORTH: "รฟม.", LEMMA: "การรถไฟฟ้าขนส่งมวลชนแห่งประเทศไทย"}], - "ศธ.": [{ORTH: "ศธ.", LEMMA: "กระทรวงศึกษาธิการ"}], - "ศนธ.": [{ORTH: "ศนธ.", LEMMA: "ศูนย์กลางนิสิตนักศึกษาแห่งประเทศไทย"}], - "สกจ.": [{ORTH: "สกจ.", LEMMA: "สหกรณ์จังหวัด"}], - "สกท.": [{ORTH: "สกท.", LEMMA: "สำนักงานคณะกรรมการส่งเสริมการลงทุน"}], - "สกว.": [{ORTH: "สกว.", LEMMA: "สำนักงานกองทุนสนับสนุนการวิจัย"}], - "สคบ.": [{ORTH: "สคบ.", LEMMA: "สำนักงานคณะกรรมการคุ้มครองผู้บริโภค"}], - "สจร.": [{ORTH: "สจร.", LEMMA: "สำนักงานคณะกรรมการจัดระบบการจราจรทางบก"}], - "สตง.": [{ORTH: "สตง.", LEMMA: "สำนักงานตรวจเงินแผ่นดิน"}], - "สทท.": [{ORTH: "สทท.", LEMMA: "สถานีวิทยุโทรทัศน์แห่งประเทศไทย"}], - "สทร.": [{ORTH: "สทร.", LEMMA: "สำนักงานกลางทะเบียนราษฎร์"}], - "สธ": [{ORTH: "สธ", LEMMA: "กระทรวงสาธารณสุข"}], - "สนช.": [{ORTH: "สนช.", LEMMA: "สภานิติบัญญัติแห่งชาติ,สำนักงานนวัตกรรมแห่งชาติ"}], - "สนนท.": [{ORTH: "สนนท.", LEMMA: "สหพันธ์นิสิตนักศึกษาแห่งประเทศไทย"}], - "สปก.": [{ORTH: "สปก.", LEMMA: "สำนักงานการปฏิรูปที่ดินเพื่อเกษตรกรรม"}], - "สปช.": [{ORTH: "สปช.", LEMMA: "สำนักงานคณะกรรมการการประถมศึกษาแห่งชาติ"}], - "สปอ.": [{ORTH: "สปอ.", LEMMA: "สำนักงานการประถมศึกษาอำเภอ"}], - "สพช.": [{ORTH: "สพช.", LEMMA: "สำนักงานคณะกรรมการนโยบายพลังงานแห่งชาติ"}], - "สยช.": [ - {ORTH: "สยช.", LEMMA: "สำนักงานคณะกรรมการส่งเสริมและประสานงานเยาวชนแห่งชาติ"} - ], - "สวช.": [{ORTH: "สวช.", LEMMA: "สำนักงานคณะกรรมการวัฒนธรรมแห่งชาติ"}], - "สวท.": [{ORTH: "สวท.", LEMMA: "สถานีวิทยุกระจายเสียงแห่งประเทศไทย"}], - "สวทช.": [{ORTH: "สวทช.", LEMMA: "สำนักงานพัฒนาวิทยาศาสตร์และเทคโนโลยีแห่งชาติ"}], - "สคช.": [ - {ORTH: "สคช.", LEMMA: "สำนักงานคณะกรรมการพัฒนาการเศรษฐกิจและสังคมแห่งชาติ"} - ], - "สสว.": [{ORTH: "สสว.", LEMMA: "สำนักงานส่งเสริมวิสาหกิจขนาดกลางและขนาดย่อม"}], - "สสส.": [{ORTH: "สสส.", LEMMA: "สำนักงานกองทุนสนับสนุนการสร้างเสริมสุขภาพ"}], - "สสวท.": [{ORTH: "สสวท.", LEMMA: "สถาบันส่งเสริมการสอนวิทยาศาสตร์และเทคโนโลยี"}], - "อตก.": [{ORTH: "อตก.", LEMMA: "องค์การตลาดเพื่อเกษตรกร"}], - "อบจ.": [{ORTH: "อบจ.", LEMMA: "องค์การบริหารส่วนจังหวัด"}], - "อบต.": [{ORTH: "อบต.", LEMMA: "องค์การบริหารส่วนตำบล"}], - "อปพร.": [{ORTH: "อปพร.", LEMMA: "อาสาสมัครป้องกันภัยฝ่ายพลเรือน"}], - "อย.": [{ORTH: "อย.", LEMMA: "สำนักงานคณะกรรมการอาหารและยา"}], - "อ.ส.ม.ท.": [{ORTH: "อ.ส.ม.ท.", LEMMA: "องค์การสื่อสารมวลชนแห่งประเทศไทย"}], + "กกต.": [{ORTH: "กกต."}], + "กทท.": [{ORTH: "กทท."}], + "กทพ.": [{ORTH: "กทพ."}], + "กบข.": [{ORTH: "กบข."}], + "กบว.": [{ORTH: "กบว."}], + "กปน.": [{ORTH: "กปน."}], + "กปภ.": [{ORTH: "กปภ."}], + "กปส.": [{ORTH: "กปส."}], + "กผม.": [{ORTH: "กผม."}], + "กฟน.": [{ORTH: "กฟน."}], + "กฟผ.": [{ORTH: "กฟผ."}], + "กฟภ.": [{ORTH: "กฟภ."}], + "ก.ช.น.": [{ORTH: "ก.ช.น."}], + "กยศ.": [{ORTH: "กยศ."}], + "ก.ล.ต.": [{ORTH: "ก.ล.ต."}], + "กศ.บ.": [{ORTH: "กศ.บ."}], + "กศน.": [{ORTH: "กศน."}], + "กสท.": [{ORTH: "กสท."}], + "กอ.รมน.": [{ORTH: "กอ.รมน."}], + "กร.": [{ORTH: "กร."}], + "ขสมก.": [{ORTH: "ขสมก."}], + "คตง.": [{ORTH: "คตง."}], + "ครม.": [{ORTH: "ครม."}], + "คมช.": [{ORTH: "คมช."}], + "ตชด.": [{ORTH: "ตชด."}], + "ตม.": [{ORTH: "ตม."}], + "ตร.": [{ORTH: "ตร."}], + "ททท.": [{ORTH: "ททท."}], + "ททบ.": [{ORTH: "ททบ."}], + "ทบ.": [{ORTH: "ทบ."}], + "ทร.": [{ORTH: "ทร."}], + "ทอ.": [{ORTH: "ทอ."}], + "ทอท.": [{ORTH: "ทอท."}], + "ธ.ก.ส.": [{ORTH: "ธ.ก.ส."}], + "ธปท.": [{ORTH: "ธปท."}], + "ธอส.": [{ORTH: "ธอส."}], + "นย.": [{ORTH: "นย."}], + "ปตท.": [{ORTH: "ปตท."}], + "ป.ป.ช.": [{ORTH: "ป.ป.ช."}], + "ป.ป.ส.": [{ORTH: "ป.ป.ส."}], + "บพร.": [{ORTH: "บพร."}], + "บย.": [{ORTH: "บย."}], + "พสวท.": [{ORTH: 
"พสวท."}], + "มอก.": [{ORTH: "มอก."}], + "ยธ.": [{ORTH: "ยธ."}], + "รพช.": [{ORTH: "รพช."}], + "รฟท.": [{ORTH: "รฟท."}], + "รฟม.": [{ORTH: "รฟม."}], + "ศธ.": [{ORTH: "ศธ."}], + "ศนธ.": [{ORTH: "ศนธ."}], + "สกจ.": [{ORTH: "สกจ."}], + "สกท.": [{ORTH: "สกท."}], + "สกว.": [{ORTH: "สกว."}], + "สคบ.": [{ORTH: "สคบ."}], + "สจร.": [{ORTH: "สจร."}], + "สตง.": [{ORTH: "สตง."}], + "สทท.": [{ORTH: "สทท."}], + "สทร.": [{ORTH: "สทร."}], + "สธ": [{ORTH: "สธ"}], + "สนช.": [{ORTH: "สนช."}], + "สนนท.": [{ORTH: "สนนท."}], + "สปก.": [{ORTH: "สปก."}], + "สปช.": [{ORTH: "สปช."}], + "สปอ.": [{ORTH: "สปอ."}], + "สพช.": [{ORTH: "สพช."}], + "สยช.": [{ORTH: "สยช."}], + "สวช.": [{ORTH: "สวช."}], + "สวท.": [{ORTH: "สวท."}], + "สวทช.": [{ORTH: "สวทช."}], + "สคช.": [{ORTH: "สคช."}], + "สสว.": [{ORTH: "สสว."}], + "สสส.": [{ORTH: "สสส."}], + "สสวท.": [{ORTH: "สสวท."}], + "อตก.": [{ORTH: "อตก."}], + "อบจ.": [{ORTH: "อบจ."}], + "อบต.": [{ORTH: "อบต."}], + "อปพร.": [{ORTH: "อปพร."}], + "อย.": [{ORTH: "อย."}], + "อ.ส.ม.ท.": [{ORTH: "อ.ส.ม.ท."}], # มหาวิทยาลัย / สถานศึกษา / university / college - "มทส.": [{ORTH: "มทส.", LEMMA: "มหาวิทยาลัยเทคโนโลยีสุรนารี"}], - "มธ.": [{ORTH: "มธ.", LEMMA: "มหาวิทยาลัยธรรมศาสตร์"}], - "ม.อ.": [{ORTH: "ม.อ.", LEMMA: "มหาวิทยาลัยสงขลานครินทร์"}], - "มทร.": [{ORTH: "มทร.", LEMMA: "มหาวิทยาลัยเทคโนโลยีราชมงคล"}], - "มมส.": [{ORTH: "มมส.", LEMMA: "มหาวิทยาลัยมหาสารคาม"}], - "วท.": [{ORTH: "วท.", LEMMA: "วิทยาลัยเทคนิค"}], - "สตม.": [{ORTH: "สตม.", LEMMA: "สำนักงานตรวจคนเข้าเมือง (ตำรวจ)"}], + "มทส.": [{ORTH: "มทส."}], + "มธ.": [{ORTH: "มธ."}], + "ม.อ.": [{ORTH: "ม.อ."}], + "มทร.": [{ORTH: "มทร."}], + "มมส.": [{ORTH: "มมส."}], + "วท.": [{ORTH: "วท."}], + "สตม.": [{ORTH: "สตม."}], # ยศ / rank - "ดร.": [{ORTH: "ดร.", LEMMA: "ดอกเตอร์"}], - "ด.ต.": [{ORTH: "ด.ต.", LEMMA: "ดาบตำรวจ"}], - "จ.ต.": [{ORTH: "จ.ต.", LEMMA: "จ่าตรี"}], - "จ.ท.": [{ORTH: "จ.ท.", LEMMA: "จ่าโท"}], - "จ.ส.ต.": [{ORTH: "จ.ส.ต.", LEMMA: "จ่าสิบตรี (ทหารบก)"}], - "จสต.": [{ORTH: "จสต.", LEMMA: "จ่าสิบตำรวจ"}], - "จ.ส.ท.": [{ORTH: "จ.ส.ท.", LEMMA: "จ่าสิบโท"}], - "จ.ส.อ.": [{ORTH: "จ.ส.อ.", LEMMA: "จ่าสิบเอก"}], - "จ.อ.": [{ORTH: "จ.อ.", LEMMA: "จ่าเอก"}], - "ทพญ.": [{ORTH: "ทพญ.", LEMMA: "ทันตแพทย์หญิง"}], - "ทนพ.": [{ORTH: "ทนพ.", LEMMA: "เทคนิคการแพทย์"}], - "นจอ.": [{ORTH: "นจอ.", LEMMA: "นักเรียนจ่าอากาศ"}], - "น.ช.": [{ORTH: "น.ช.", LEMMA: "นักโทษชาย"}], - "น.ญ.": [{ORTH: "น.ญ.", LEMMA: "นักโทษหญิง"}], - "น.ต.": [{ORTH: "น.ต.", LEMMA: "นาวาตรี"}], - "น.ท.": [{ORTH: "น.ท.", LEMMA: "นาวาโท"}], - "นตท.": [{ORTH: "นตท.", LEMMA: "นักเรียนเตรียมทหาร"}], - "นนส.": [{ORTH: "นนส.", LEMMA: "นักเรียนนายสิบทหารบก"}], - "นนร.": [{ORTH: "นนร.", LEMMA: "นักเรียนนายร้อย"}], - "นนอ.": [{ORTH: "นนอ.", LEMMA: "นักเรียนนายเรืออากาศ"}], - "นพ.": [{ORTH: "นพ.", LEMMA: "นายแพทย์"}], - "นพท.": [{ORTH: "นพท.", LEMMA: "นายแพทย์ทหาร"}], - "นรจ.": [{ORTH: "นรจ.", LEMMA: "นักเรียนจ่าทหารเรือ"}], - "นรต.": [{ORTH: "นรต.", LEMMA: "นักเรียนนายร้อยตำรวจ"}], - "นศพ.": [{ORTH: "นศพ.", LEMMA: "นักศึกษาแพทย์"}], - "นศท.": [{ORTH: "นศท.", LEMMA: "นักศึกษาวิชาทหาร"}], - "น.สพ.": [{ORTH: "น.สพ.", LEMMA: "นายสัตวแพทย์ (พ.ร.บ.วิชาชีพการสัตวแพทย์)"}], - "น.อ.": [{ORTH: "น.อ.", LEMMA: "นาวาเอก"}], - "บช.ก.": [{ORTH: "บช.ก.", LEMMA: "กองบัญชาการตำรวจสอบสวนกลาง"}], - "บช.น.": [{ORTH: "บช.น.", LEMMA: "กองบัญชาการตำรวจนครบาล"}], - "ผกก.": [{ORTH: "ผกก.", LEMMA: "ผู้กำกับการ"}], - "ผกก.ภ.": [{ORTH: "ผกก.ภ.", LEMMA: "ผู้กำกับการตำรวจภูธร"}], - "ผจก.": [{ORTH: "ผจก.", LEMMA: "ผู้จัดการ"}], - "ผช.": [{ORTH: "ผช.", LEMMA: "ผู้ช่วย"}], - "ผชก.": [{ORTH: "ผชก.", LEMMA: 
"ผู้ชำนาญการ"}], - "ผช.ผอ.": [{ORTH: "ผช.ผอ.", LEMMA: "ผู้ช่วยผู้อำนวยการ"}], - "ผญบ.": [{ORTH: "ผญบ.", LEMMA: "ผู้ใหญ่บ้าน"}], - "ผบ.": [{ORTH: "ผบ.", LEMMA: "ผู้บังคับบัญชา"}], - "ผบก.": [{ORTH: "ผบก.", LEMMA: "ผู้บังคับบัญชาการ (ตำรวจ)"}], - "ผบก.น.": [{ORTH: "ผบก.น.", LEMMA: "ผู้บังคับการตำรวจนครบาล"}], - "ผบก.ป.": [{ORTH: "ผบก.ป.", LEMMA: "ผู้บังคับการตำรวจกองปราบปราม"}], - "ผบก.ปค.": [ - { - ORTH: "ผบก.ปค.", - LEMMA: "ผู้บังคับการ กองบังคับการปกครอง (โรงเรียนนายร้อยตำรวจ)", - } - ], - "ผบก.ปม.": [{ORTH: "ผบก.ปม.", LEMMA: "ผู้บังคับการตำรวจป่าไม้"}], - "ผบก.ภ.": [{ORTH: "ผบก.ภ.", LEMMA: "ผู้บังคับการตำรวจภูธร"}], - "ผบช.": [{ORTH: "ผบช.", LEMMA: "ผู้บัญชาการ (ตำรวจ)"}], - "ผบช.ก.": [{ORTH: "ผบช.ก.", LEMMA: "ผู้บัญชาการตำรวจสอบสวนกลาง"}], - "ผบช.ตชด.": [{ORTH: "ผบช.ตชด.", LEMMA: "ผู้บัญชาการตำรวจตระเวนชายแดน"}], - "ผบช.น.": [{ORTH: "ผบช.น.", LEMMA: "ผู้บัญชาการตำรวจนครบาล"}], - "ผบช.ภ.": [{ORTH: "ผบช.ภ.", LEMMA: "ผู้บัญชาการตำรวจภูธร"}], - "ผบ.ทบ.": [{ORTH: "ผบ.ทบ.", LEMMA: "ผู้บัญชาการทหารบก"}], - "ผบ.ตร.": [{ORTH: "ผบ.ตร.", LEMMA: "ผู้บัญชาการตำรวจแห่งชาติ"}], - "ผบ.ทร.": [{ORTH: "ผบ.ทร.", LEMMA: "ผู้บัญชาการทหารเรือ"}], - "ผบ.ทอ.": [{ORTH: "ผบ.ทอ.", LEMMA: "ผู้บัญชาการทหารอากาศ"}], - "ผบ.ทสส.": [{ORTH: "ผบ.ทสส.", LEMMA: "ผู้บัญชาการทหารสูงสุด"}], - "ผวจ.": [{ORTH: "ผวจ.", LEMMA: "ผู้ว่าราชการจังหวัด"}], - "ผู้ว่าฯ": [{ORTH: "ผู้ว่าฯ", LEMMA: "ผู้ว่าราชการจังหวัด"}], - "พ.จ.ต.": [{ORTH: "พ.จ.ต.", LEMMA: "พันจ่าตรี"}], - "พ.จ.ท.": [{ORTH: "พ.จ.ท.", LEMMA: "พันจ่าโท"}], - "พ.จ.อ.": [{ORTH: "พ.จ.อ.", LEMMA: "พันจ่าเอก"}], - "พญ.": [{ORTH: "พญ.", LEMMA: "แพทย์หญิง"}], - "ฯพณฯ": [{ORTH: "ฯพณฯ", LEMMA: "พณท่าน"}], - "พ.ต.": [{ORTH: "พ.ต.", LEMMA: "พันตรี"}], - "พ.ท.": [{ORTH: "พ.ท.", LEMMA: "พันโท"}], - "พ.อ.": [{ORTH: "พ.อ.", LEMMA: "พันเอก"}], - "พ.ต.อ.พิเศษ": [{ORTH: "พ.ต.อ.พิเศษ", LEMMA: "พันตำรวจเอกพิเศษ"}], - "พลฯ": [{ORTH: "พลฯ", LEMMA: "พลทหาร"}], - "พล.๑ รอ.": [{ORTH: "พล.๑ รอ.", LEMMA: "กองพลที่ ๑ รักษาพระองค์ กองทัพบก"}], - "พล.ต.": [{ORTH: "พล.ต.", LEMMA: "พลตรี"}], - "พล.ต.ต.": [{ORTH: "พล.ต.ต.", LEMMA: "พลตำรวจตรี"}], - "พล.ต.ท.": [{ORTH: "พล.ต.ท.", LEMMA: "พลตำรวจโท"}], - "พล.ต.อ.": [{ORTH: "พล.ต.อ.", LEMMA: "พลตำรวจเอก"}], - "พล.ท.": [{ORTH: "พล.ท.", LEMMA: "พลโท"}], - "พล.ปตอ.": [{ORTH: "พล.ปตอ.", LEMMA: "กองพลทหารปืนใหญ่ต่อสู่อากาศยาน"}], - "พล.ม.": [{ORTH: "พล.ม.", LEMMA: "กองพลทหารม้า"}], - "พล.ม.๒": [{ORTH: "พล.ม.๒", LEMMA: "กองพลทหารม้าที่ ๒"}], - "พล.ร.ต.": [{ORTH: "พล.ร.ต.", LEMMA: "พลเรือตรี"}], - "พล.ร.ท.": [{ORTH: "พล.ร.ท.", LEMMA: "พลเรือโท"}], - "พล.ร.อ.": [{ORTH: "พล.ร.อ.", LEMMA: "พลเรือเอก"}], - "พล.อ.": [{ORTH: "พล.อ.", LEMMA: "พลเอก"}], - "พล.อ.ต.": [{ORTH: "พล.อ.ต.", LEMMA: "พลอากาศตรี"}], - "พล.อ.ท.": [{ORTH: "พล.อ.ท.", LEMMA: "พลอากาศโท"}], - "พล.อ.อ.": [{ORTH: "พล.อ.อ.", LEMMA: "พลอากาศเอก"}], - "พ.อ.พิเศษ": [{ORTH: "พ.อ.พิเศษ", LEMMA: "พันเอกพิเศษ"}], - "พ.อ.ต.": [{ORTH: "พ.อ.ต.", LEMMA: "พันจ่าอากาศตรี"}], - "พ.อ.ท.": [{ORTH: "พ.อ.ท.", LEMMA: "พันจ่าอากาศโท"}], - "พ.อ.อ.": [{ORTH: "พ.อ.อ.", LEMMA: "พันจ่าอากาศเอก"}], - "ภกญ.": [{ORTH: "ภกญ.", LEMMA: "เภสัชกรหญิง"}], - "ม.จ.": [{ORTH: "ม.จ.", LEMMA: "หม่อมเจ้า"}], - "มท1": [{ORTH: "มท1", LEMMA: "รัฐมนตรีว่าการกระทรวงมหาดไทย"}], - "ม.ร.ว.": [{ORTH: "ม.ร.ว.", LEMMA: "หม่อมราชวงศ์"}], - "มล.": [{ORTH: "มล.", LEMMA: "หม่อมหลวง"}], - "ร.ต.": [{ORTH: "ร.ต.", LEMMA: "ร้อยตรี,เรือตรี,เรืออากาศตรี"}], - "ร.ต.ต.": [{ORTH: "ร.ต.ต.", LEMMA: "ร้อยตำรวจตรี"}], - "ร.ต.ท.": [{ORTH: "ร.ต.ท.", LEMMA: "ร้อยตำรวจโท"}], - "ร.ต.อ.": [{ORTH: "ร.ต.อ.", LEMMA: "ร้อยตำรวจเอก"}], - "ร.ท.": [{ORTH: "ร.ท.", LEMMA: 
"ร้อยโท,เรือโท,เรืออากาศโท"}], - "รมช.": [{ORTH: "รมช.", LEMMA: "รัฐมนตรีช่วยว่าการกระทรวง"}], - "รมต.": [{ORTH: "รมต.", LEMMA: "รัฐมนตรี"}], - "รมว.": [{ORTH: "รมว.", LEMMA: "รัฐมนตรีว่าการกระทรวง"}], - "รศ.": [{ORTH: "รศ.", LEMMA: "รองศาสตราจารย์"}], - "ร.อ.": [{ORTH: "ร.อ.", LEMMA: "ร้อยเอก,เรือเอก,เรืออากาศเอก"}], - "ศ.": [{ORTH: "ศ.", LEMMA: "ศาสตราจารย์"}], - "ส.ต.": [{ORTH: "ส.ต.", LEMMA: "สิบตรี"}], - "ส.ต.ต.": [{ORTH: "ส.ต.ต.", LEMMA: "สิบตำรวจตรี"}], - "ส.ต.ท.": [{ORTH: "ส.ต.ท.", LEMMA: "สิบตำรวจโท"}], - "ส.ต.อ.": [{ORTH: "ส.ต.อ.", LEMMA: "สิบตำรวจเอก"}], - "ส.ท.": [{ORTH: "ส.ท.", LEMMA: "สิบโท"}], - "สพ.": [{ORTH: "สพ.", LEMMA: "สัตวแพทย์"}], - "สพ.ญ.": [{ORTH: "สพ.ญ.", LEMMA: "สัตวแพทย์หญิง"}], - "สพ.ช.": [{ORTH: "สพ.ช.", LEMMA: "สัตวแพทย์ชาย"}], - "ส.อ.": [{ORTH: "ส.อ.", LEMMA: "สิบเอก"}], - "อจ.": [{ORTH: "อจ.", LEMMA: "อาจารย์"}], - "อจญ.": [{ORTH: "อจญ.", LEMMA: "อาจารย์ใหญ่"}], + "ดร.": [{ORTH: "ดร."}], + "ด.ต.": [{ORTH: "ด.ต."}], + "จ.ต.": [{ORTH: "จ.ต."}], + "จ.ท.": [{ORTH: "จ.ท."}], + "จ.ส.ต.": [{ORTH: "จ.ส.ต."}], + "จสต.": [{ORTH: "จสต."}], + "จ.ส.ท.": [{ORTH: "จ.ส.ท."}], + "จ.ส.อ.": [{ORTH: "จ.ส.อ."}], + "จ.อ.": [{ORTH: "จ.อ."}], + "ทพญ.": [{ORTH: "ทพญ."}], + "ทนพ.": [{ORTH: "ทนพ."}], + "นจอ.": [{ORTH: "นจอ."}], + "น.ช.": [{ORTH: "น.ช."}], + "น.ญ.": [{ORTH: "น.ญ."}], + "น.ต.": [{ORTH: "น.ต."}], + "น.ท.": [{ORTH: "น.ท."}], + "นตท.": [{ORTH: "นตท."}], + "นนส.": [{ORTH: "นนส."}], + "นนร.": [{ORTH: "นนร."}], + "นนอ.": [{ORTH: "นนอ."}], + "นพ.": [{ORTH: "นพ."}], + "นพท.": [{ORTH: "นพท."}], + "นรจ.": [{ORTH: "นรจ."}], + "นรต.": [{ORTH: "นรต."}], + "นศพ.": [{ORTH: "นศพ."}], + "นศท.": [{ORTH: "นศท."}], + "น.สพ.": [{ORTH: "น.สพ."}], + "น.อ.": [{ORTH: "น.อ."}], + "บช.ก.": [{ORTH: "บช.ก."}], + "บช.น.": [{ORTH: "บช.น."}], + "ผกก.": [{ORTH: "ผกก."}], + "ผกก.ภ.": [{ORTH: "ผกก.ภ."}], + "ผจก.": [{ORTH: "ผจก."}], + "ผช.": [{ORTH: "ผช."}], + "ผชก.": [{ORTH: "ผชก."}], + "ผช.ผอ.": [{ORTH: "ผช.ผอ."}], + "ผญบ.": [{ORTH: "ผญบ."}], + "ผบ.": [{ORTH: "ผบ."}], + "ผบก.": [{ORTH: "ผบก."}], + "ผบก.น.": [{ORTH: "ผบก.น."}], + "ผบก.ป.": [{ORTH: "ผบก.ป."}], + "ผบก.ปค.": [{ORTH: "ผบก.ปค."}], + "ผบก.ปม.": [{ORTH: "ผบก.ปม."}], + "ผบก.ภ.": [{ORTH: "ผบก.ภ."}], + "ผบช.": [{ORTH: "ผบช."}], + "ผบช.ก.": [{ORTH: "ผบช.ก."}], + "ผบช.ตชด.": [{ORTH: "ผบช.ตชด."}], + "ผบช.น.": [{ORTH: "ผบช.น."}], + "ผบช.ภ.": [{ORTH: "ผบช.ภ."}], + "ผบ.ทบ.": [{ORTH: "ผบ.ทบ."}], + "ผบ.ตร.": [{ORTH: "ผบ.ตร."}], + "ผบ.ทร.": [{ORTH: "ผบ.ทร."}], + "ผบ.ทอ.": [{ORTH: "ผบ.ทอ."}], + "ผบ.ทสส.": [{ORTH: "ผบ.ทสส."}], + "ผวจ.": [{ORTH: "ผวจ."}], + "ผู้ว่าฯ": [{ORTH: "ผู้ว่าฯ"}], + "พ.จ.ต.": [{ORTH: "พ.จ.ต."}], + "พ.จ.ท.": [{ORTH: "พ.จ.ท."}], + "พ.จ.อ.": [{ORTH: "พ.จ.อ."}], + "พญ.": [{ORTH: "พญ."}], + "ฯพณฯ": [{ORTH: "ฯพณฯ"}], + "พ.ต.": [{ORTH: "พ.ต."}], + "พ.ท.": [{ORTH: "พ.ท."}], + "พ.อ.": [{ORTH: "พ.อ."}], + "พ.ต.อ.พิเศษ": [{ORTH: "พ.ต.อ.พิเศษ"}], + "พลฯ": [{ORTH: "พลฯ"}], + "พล.๑ รอ.": [{ORTH: "พล.๑ รอ."}], + "พล.ต.": [{ORTH: "พล.ต."}], + "พล.ต.ต.": [{ORTH: "พล.ต.ต."}], + "พล.ต.ท.": [{ORTH: "พล.ต.ท."}], + "พล.ต.อ.": [{ORTH: "พล.ต.อ."}], + "พล.ท.": [{ORTH: "พล.ท."}], + "พล.ปตอ.": [{ORTH: "พล.ปตอ."}], + "พล.ม.": [{ORTH: "พล.ม."}], + "พล.ม.๒": [{ORTH: "พล.ม.๒"}], + "พล.ร.ต.": [{ORTH: "พล.ร.ต."}], + "พล.ร.ท.": [{ORTH: "พล.ร.ท."}], + "พล.ร.อ.": [{ORTH: "พล.ร.อ."}], + "พล.อ.": [{ORTH: "พล.อ."}], + "พล.อ.ต.": [{ORTH: "พล.อ.ต."}], + "พล.อ.ท.": [{ORTH: "พล.อ.ท."}], + "พล.อ.อ.": [{ORTH: "พล.อ.อ."}], + "พ.อ.พิเศษ": [{ORTH: "พ.อ.พิเศษ"}], + "พ.อ.ต.": [{ORTH: "พ.อ.ต."}], + "พ.อ.ท.": [{ORTH: "พ.อ.ท."}], + "พ.อ.อ.": [{ORTH: "พ.อ.อ."}], + "ภกญ.": [{ORTH: 
"ภกญ."}], + "ม.จ.": [{ORTH: "ม.จ."}], + "มท1": [{ORTH: "มท1"}], + "ม.ร.ว.": [{ORTH: "ม.ร.ว."}], + "มล.": [{ORTH: "มล."}], + "ร.ต.": [{ORTH: "ร.ต."}], + "ร.ต.ต.": [{ORTH: "ร.ต.ต."}], + "ร.ต.ท.": [{ORTH: "ร.ต.ท."}], + "ร.ต.อ.": [{ORTH: "ร.ต.อ."}], + "ร.ท.": [{ORTH: "ร.ท."}], + "รมช.": [{ORTH: "รมช."}], + "รมต.": [{ORTH: "รมต."}], + "รมว.": [{ORTH: "รมว."}], + "รศ.": [{ORTH: "รศ."}], + "ร.อ.": [{ORTH: "ร.อ."}], + "ศ.": [{ORTH: "ศ."}], + "ส.ต.": [{ORTH: "ส.ต."}], + "ส.ต.ต.": [{ORTH: "ส.ต.ต."}], + "ส.ต.ท.": [{ORTH: "ส.ต.ท."}], + "ส.ต.อ.": [{ORTH: "ส.ต.อ."}], + "ส.ท.": [{ORTH: "ส.ท."}], + "สพ.": [{ORTH: "สพ."}], + "สพ.ญ.": [{ORTH: "สพ.ญ."}], + "สพ.ช.": [{ORTH: "สพ.ช."}], + "ส.อ.": [{ORTH: "ส.อ."}], + "อจ.": [{ORTH: "อจ."}], + "อจญ.": [{ORTH: "อจญ."}], # วุฒิ / bachelor degree - "ป.": [{ORTH: "ป.", LEMMA: "ประถมศึกษา"}], - "ป.กศ.": [{ORTH: "ป.กศ.", LEMMA: "ประกาศนียบัตรวิชาการศึกษา"}], - "ป.กศ.สูง": [{ORTH: "ป.กศ.สูง", LEMMA: "ประกาศนียบัตรวิชาการศึกษาชั้นสูง"}], - "ปวช.": [{ORTH: "ปวช.", LEMMA: "ประกาศนียบัตรวิชาชีพ"}], - "ปวท.": [{ORTH: "ปวท.", LEMMA: "ประกาศนียบัตรวิชาชีพเทคนิค"}], - "ปวส.": [{ORTH: "ปวส.", LEMMA: "ประกาศนียบัตรวิชาชีพชั้นสูง"}], - "ปทส.": [{ORTH: "ปทส.", LEMMA: "ประกาศนียบัตรครูเทคนิคชั้นสูง"}], - "กษ.บ.": [{ORTH: "กษ.บ.", LEMMA: "เกษตรศาสตรบัณฑิต"}], - "กษ.ม.": [{ORTH: "กษ.ม.", LEMMA: "เกษตรศาสตรมหาบัณฑิต"}], - "กษ.ด.": [{ORTH: "กษ.ด.", LEMMA: "เกษตรศาสตรดุษฎีบัณฑิต"}], - "ค.บ.": [{ORTH: "ค.บ.", LEMMA: "ครุศาสตรบัณฑิต"}], - "คศ.บ.": [{ORTH: "คศ.บ.", LEMMA: "คหกรรมศาสตรบัณฑิต"}], - "คศ.ม.": [{ORTH: "คศ.ม.", LEMMA: "คหกรรมศาสตรมหาบัณฑิต"}], - "คศ.ด.": [{ORTH: "คศ.ด.", LEMMA: "คหกรรมศาสตรดุษฎีบัณฑิต"}], - "ค.อ.บ.": [{ORTH: "ค.อ.บ.", LEMMA: "ครุศาสตรอุตสาหกรรมบัณฑิต"}], - "ค.อ.ม.": [{ORTH: "ค.อ.ม.", LEMMA: "ครุศาสตรอุตสาหกรรมมหาบัณฑิต"}], - "ค.อ.ด.": [{ORTH: "ค.อ.ด.", LEMMA: "ครุศาสตรอุตสาหกรรมดุษฎีบัณฑิต"}], - "ทก.บ.": [{ORTH: "ทก.บ.", LEMMA: "เทคโนโลยีการเกษตรบัณฑิต"}], - "ทก.ม.": [{ORTH: "ทก.ม.", LEMMA: "เทคโนโลยีการเกษตรมหาบัณฑิต"}], - "ทก.ด.": [{ORTH: "ทก.ด.", LEMMA: "เทคโนโลยีการเกษตรดุษฎีบัณฑิต"}], - "ท.บ.": [{ORTH: "ท.บ.", LEMMA: "ทันตแพทยศาสตรบัณฑิต"}], - "ท.ม.": [{ORTH: "ท.ม.", LEMMA: "ทันตแพทยศาสตรมหาบัณฑิต"}], - "ท.ด.": [{ORTH: "ท.ด.", LEMMA: "ทันตแพทยศาสตรดุษฎีบัณฑิต"}], - "น.บ.": [{ORTH: "น.บ.", LEMMA: "นิติศาสตรบัณฑิต"}], - "น.ม.": [{ORTH: "น.ม.", LEMMA: "นิติศาสตรมหาบัณฑิต"}], - "น.ด.": [{ORTH: "น.ด.", LEMMA: "นิติศาสตรดุษฎีบัณฑิต"}], - "นศ.บ.": [{ORTH: "นศ.บ.", LEMMA: "นิเทศศาสตรบัณฑิต"}], - "นศ.ม.": [{ORTH: "นศ.ม.", LEMMA: "นิเทศศาสตรมหาบัณฑิต"}], - "นศ.ด.": [{ORTH: "นศ.ด.", LEMMA: "นิเทศศาสตรดุษฎีบัณฑิต"}], - "บช.บ.": [{ORTH: "บช.บ.", LEMMA: "บัญชีบัณฑิต"}], - "บช.ม.": [{ORTH: "บช.ม.", LEMMA: "บัญชีมหาบัณฑิต"}], - "บช.ด.": [{ORTH: "บช.ด.", LEMMA: "บัญชีดุษฎีบัณฑิต"}], - "บธ.บ.": [{ORTH: "บธ.บ.", LEMMA: "บริหารธุรกิจบัณฑิต"}], - "บธ.ม.": [{ORTH: "บธ.ม.", LEMMA: "บริหารธุรกิจมหาบัณฑิต"}], - "บธ.ด.": [{ORTH: "บธ.ด.", LEMMA: "บริหารธุรกิจดุษฎีบัณฑิต"}], - "พณ.บ.": [{ORTH: "พณ.บ.", LEMMA: "พาณิชยศาสตรบัณฑิต"}], - "พณ.ม.": [{ORTH: "พณ.ม.", LEMMA: "พาณิชยศาสตรมหาบัณฑิต"}], - "พณ.ด.": [{ORTH: "พณ.ด.", LEMMA: "พาณิชยศาสตรดุษฎีบัณฑิต"}], - "พ.บ.": [{ORTH: "พ.บ.", LEMMA: "แพทยศาสตรบัณฑิต"}], - "พ.ม.": [{ORTH: "พ.ม.", LEMMA: "แพทยศาสตรมหาบัณฑิต"}], - "พ.ด.": [{ORTH: "พ.ด.", LEMMA: "แพทยศาสตรดุษฎีบัณฑิต"}], - "พธ.บ.": [{ORTH: "พธ.บ.", LEMMA: "พุทธศาสตรบัณฑิต"}], - "พธ.ม.": [{ORTH: "พธ.ม.", LEMMA: "พุทธศาสตรมหาบัณฑิต"}], - "พธ.ด.": [{ORTH: "พธ.ด.", LEMMA: "พุทธศาสตรดุษฎีบัณฑิต"}], - "พบ.บ.": [{ORTH: "พบ.บ.", LEMMA: "พัฒนบริหารศาสตรบัณฑิต"}], - "พบ.ม.": [{ORTH: "พบ.ม.", 
LEMMA: "พัฒนบริหารศาสตรมหาบัณฑิต"}], - "พบ.ด.": [{ORTH: "พบ.ด.", LEMMA: "พัฒนบริหารศาสตรดุษฎีบัณฑิต"}], - "พย.บ.": [{ORTH: "พย.บ.", LEMMA: "พยาบาลศาสตรดุษฎีบัณฑิต"}], - "พย.ม.": [{ORTH: "พย.ม.", LEMMA: "พยาบาลศาสตรมหาบัณฑิต"}], - "พย.ด.": [{ORTH: "พย.ด.", LEMMA: "พยาบาลศาสตรดุษฎีบัณฑิต"}], - "พศ.บ.": [{ORTH: "พศ.บ.", LEMMA: "พาณิชยศาสตรบัณฑิต"}], - "พศ.ม.": [{ORTH: "พศ.ม.", LEMMA: "พาณิชยศาสตรมหาบัณฑิต"}], - "พศ.ด.": [{ORTH: "พศ.ด.", LEMMA: "พาณิชยศาสตรดุษฎีบัณฑิต"}], - "ภ.บ.": [{ORTH: "ภ.บ.", LEMMA: "เภสัชศาสตรบัณฑิต"}], - "ภ.ม.": [{ORTH: "ภ.ม.", LEMMA: "เภสัชศาสตรมหาบัณฑิต"}], - "ภ.ด.": [{ORTH: "ภ.ด.", LEMMA: "เภสัชศาสตรดุษฎีบัณฑิต"}], - "ภ.สถ.บ.": [{ORTH: "ภ.สถ.บ.", LEMMA: "ภูมิสถาปัตยกรรมศาสตรบัณฑิต"}], - "รป.บ.": [{ORTH: "รป.บ.", LEMMA: "รัฐประศาสนศาสตร์บัณฑิต"}], - "รป.ม.": [{ORTH: "รป.ม.", LEMMA: "รัฐประศาสนศาสตร์มหาบัณฑิต"}], - "วท.บ.": [{ORTH: "วท.บ.", LEMMA: "วิทยาศาสตรบัณฑิต"}], - "วท.ม.": [{ORTH: "วท.ม.", LEMMA: "วิทยาศาสตรมหาบัณฑิต"}], - "วท.ด.": [{ORTH: "วท.ด.", LEMMA: "วิทยาศาสตรดุษฎีบัณฑิต"}], - "ศ.บ.": [{ORTH: "ศ.บ.", LEMMA: "ศิลปบัณฑิต"}], - "ศศ.บ.": [{ORTH: "ศศ.บ.", LEMMA: "ศิลปศาสตรบัณฑิต"}], - "ศษ.บ.": [{ORTH: "ศษ.บ.", LEMMA: "ศึกษาศาสตรบัณฑิต"}], - "ศส.บ.": [{ORTH: "ศส.บ.", LEMMA: "เศรษฐศาสตรบัณฑิต"}], - "สถ.บ.": [{ORTH: "สถ.บ.", LEMMA: "สถาปัตยกรรมศาสตรบัณฑิต"}], - "สถ.ม.": [{ORTH: "สถ.ม.", LEMMA: "สถาปัตยกรรมศาสตรมหาบัณฑิต"}], - "สถ.ด.": [{ORTH: "สถ.ด.", LEMMA: "สถาปัตยกรรมศาสตรดุษฎีบัณฑิต"}], - "สพ.บ.": [{ORTH: "สพ.บ.", LEMMA: "สัตวแพทยศาสตรบัณฑิต"}], - "อ.บ.": [{ORTH: "อ.บ.", LEMMA: "อักษรศาสตรบัณฑิต"}], - "อ.ม.": [{ORTH: "อ.ม.", LEMMA: "อักษรศาสตรมหาบัณฑิต"}], - "อ.ด.": [{ORTH: "อ.ด.", LEMMA: "อักษรศาสตรดุษฎีบัณฑิต"}], + "ป.": [{ORTH: "ป."}], + "ป.กศ.": [{ORTH: "ป.กศ."}], + "ป.กศ.สูง": [{ORTH: "ป.กศ.สูง"}], + "ปวช.": [{ORTH: "ปวช."}], + "ปวท.": [{ORTH: "ปวท."}], + "ปวส.": [{ORTH: "ปวส."}], + "ปทส.": [{ORTH: "ปทส."}], + "กษ.บ.": [{ORTH: "กษ.บ."}], + "กษ.ม.": [{ORTH: "กษ.ม."}], + "กษ.ด.": [{ORTH: "กษ.ด."}], + "ค.บ.": [{ORTH: "ค.บ."}], + "คศ.บ.": [{ORTH: "คศ.บ."}], + "คศ.ม.": [{ORTH: "คศ.ม."}], + "คศ.ด.": [{ORTH: "คศ.ด."}], + "ค.อ.บ.": [{ORTH: "ค.อ.บ."}], + "ค.อ.ม.": [{ORTH: "ค.อ.ม."}], + "ค.อ.ด.": [{ORTH: "ค.อ.ด."}], + "ทก.บ.": [{ORTH: "ทก.บ."}], + "ทก.ม.": [{ORTH: "ทก.ม."}], + "ทก.ด.": [{ORTH: "ทก.ด."}], + "ท.บ.": [{ORTH: "ท.บ."}], + "ท.ม.": [{ORTH: "ท.ม."}], + "ท.ด.": [{ORTH: "ท.ด."}], + "น.บ.": [{ORTH: "น.บ."}], + "น.ม.": [{ORTH: "น.ม."}], + "น.ด.": [{ORTH: "น.ด."}], + "นศ.บ.": [{ORTH: "นศ.บ."}], + "นศ.ม.": [{ORTH: "นศ.ม."}], + "นศ.ด.": [{ORTH: "นศ.ด."}], + "บช.บ.": [{ORTH: "บช.บ."}], + "บช.ม.": [{ORTH: "บช.ม."}], + "บช.ด.": [{ORTH: "บช.ด."}], + "บธ.บ.": [{ORTH: "บธ.บ."}], + "บธ.ม.": [{ORTH: "บธ.ม."}], + "บธ.ด.": [{ORTH: "บธ.ด."}], + "พณ.บ.": [{ORTH: "พณ.บ."}], + "พณ.ม.": [{ORTH: "พณ.ม."}], + "พณ.ด.": [{ORTH: "พณ.ด."}], + "พ.บ.": [{ORTH: "พ.บ."}], + "พ.ม.": [{ORTH: "พ.ม."}], + "พ.ด.": [{ORTH: "พ.ด."}], + "พธ.บ.": [{ORTH: "พธ.บ."}], + "พธ.ม.": [{ORTH: "พธ.ม."}], + "พธ.ด.": [{ORTH: "พธ.ด."}], + "พบ.บ.": [{ORTH: "พบ.บ."}], + "พบ.ม.": [{ORTH: "พบ.ม."}], + "พบ.ด.": [{ORTH: "พบ.ด."}], + "พย.บ.": [{ORTH: "พย.บ."}], + "พย.ม.": [{ORTH: "พย.ม."}], + "พย.ด.": [{ORTH: "พย.ด."}], + "พศ.บ.": [{ORTH: "พศ.บ."}], + "พศ.ม.": [{ORTH: "พศ.ม."}], + "พศ.ด.": [{ORTH: "พศ.ด."}], + "ภ.บ.": [{ORTH: "ภ.บ."}], + "ภ.ม.": [{ORTH: "ภ.ม."}], + "ภ.ด.": [{ORTH: "ภ.ด."}], + "ภ.สถ.บ.": [{ORTH: "ภ.สถ.บ."}], + "รป.บ.": [{ORTH: "รป.บ."}], + "รป.ม.": [{ORTH: "รป.ม."}], + "วท.บ.": [{ORTH: "วท.บ."}], + "วท.ม.": [{ORTH: "วท.ม."}], + "วท.ด.": [{ORTH: "วท.ด."}], + "ศ.บ.": [{ORTH: "ศ.บ."}], + "ศศ.บ.": 
[{ORTH: "ศศ.บ."}], + "ศษ.บ.": [{ORTH: "ศษ.บ."}], + "ศส.บ.": [{ORTH: "ศส.บ."}], + "สถ.บ.": [{ORTH: "สถ.บ."}], + "สถ.ม.": [{ORTH: "สถ.ม."}], + "สถ.ด.": [{ORTH: "สถ.ด."}], + "สพ.บ.": [{ORTH: "สพ.บ."}], + "อ.บ.": [{ORTH: "อ.บ."}], + "อ.ม.": [{ORTH: "อ.ม."}], + "อ.ด.": [{ORTH: "อ.ด."}], # ปี / เวลา / year / time - "ชม.": [{ORTH: "ชม.", LEMMA: "ชั่วโมง"}], - "จ.ศ.": [{ORTH: "จ.ศ.", LEMMA: "จุลศักราช"}], - "ค.ศ.": [{ORTH: "ค.ศ.", LEMMA: "คริสต์ศักราช"}], - "ฮ.ศ.": [{ORTH: "ฮ.ศ.", LEMMA: "ฮิจเราะห์ศักราช"}], - "ว.ด.ป.": [{ORTH: "ว.ด.ป.", LEMMA: "วัน เดือน ปี"}], + "ชม.": [{ORTH: "ชม."}], + "จ.ศ.": [{ORTH: "จ.ศ."}], + "ค.ศ.": [{ORTH: "ค.ศ."}], + "ฮ.ศ.": [{ORTH: "ฮ.ศ."}], + "ว.ด.ป.": [{ORTH: "ว.ด.ป."}], # ระยะทาง / distance - "ฮม.": [{ORTH: "ฮม.", LEMMA: "เฮกโตเมตร"}], - "ดคม.": [{ORTH: "ดคม.", LEMMA: "เดคาเมตร"}], - "ดม.": [{ORTH: "ดม.", LEMMA: "เดซิเมตร"}], - "มม.": [{ORTH: "มม.", LEMMA: "มิลลิเมตร"}], - "ซม.": [{ORTH: "ซม.", LEMMA: "เซนติเมตร"}], - "กม.": [{ORTH: "กม.", LEMMA: "กิโลเมตร"}], + "ฮม.": [{ORTH: "ฮม."}], + "ดคม.": [{ORTH: "ดคม."}], + "ดม.": [{ORTH: "ดม."}], + "มม.": [{ORTH: "มม."}], + "ซม.": [{ORTH: "ซม."}], + "กม.": [{ORTH: "กม."}], # น้ำหนัก / weight - "น.น.": [{ORTH: "น.น.", LEMMA: "น้ำหนัก"}], - "ฮก.": [{ORTH: "ฮก.", LEMMA: "เฮกโตกรัม"}], - "ดคก.": [{ORTH: "ดคก.", LEMMA: "เดคากรัม"}], - "ดก.": [{ORTH: "ดก.", LEMMA: "เดซิกรัม"}], - "ซก.": [{ORTH: "ซก.", LEMMA: "เซนติกรัม"}], - "มก.": [{ORTH: "มก.", LEMMA: "มิลลิกรัม"}], - "ก.": [{ORTH: "ก.", LEMMA: "กรัม"}], - "กก.": [{ORTH: "กก.", LEMMA: "กิโลกรัม"}], + "น.น.": [{ORTH: "น.น."}], + "ฮก.": [{ORTH: "ฮก."}], + "ดคก.": [{ORTH: "ดคก."}], + "ดก.": [{ORTH: "ดก."}], + "ซก.": [{ORTH: "ซก."}], + "มก.": [{ORTH: "มก."}], + "ก.": [{ORTH: "ก."}], + "กก.": [{ORTH: "กก."}], # ปริมาตร / volume - "ฮล.": [{ORTH: "ฮล.", LEMMA: "เฮกโตลิตร"}], - "ดคล.": [{ORTH: "ดคล.", LEMMA: "เดคาลิตร"}], - "ดล.": [{ORTH: "ดล.", LEMMA: "เดซิลิตร"}], - "ซล.": [{ORTH: "ซล.", LEMMA: "เซนติลิตร"}], - "ล.": [{ORTH: "ล.", LEMMA: "ลิตร"}], - "กล.": [{ORTH: "กล.", LEMMA: "กิโลลิตร"}], - "ลบ.": [{ORTH: "ลบ.", LEMMA: "ลูกบาศก์"}], + "ฮล.": [{ORTH: "ฮล."}], + "ดคล.": [{ORTH: "ดคล."}], + "ดล.": [{ORTH: "ดล."}], + "ซล.": [{ORTH: "ซล."}], + "ล.": [{ORTH: "ล."}], + "กล.": [{ORTH: "กล."}], + "ลบ.": [{ORTH: "ลบ."}], # พื้นที่ / area - "ตร.ซม.": [{ORTH: "ตร.ซม.", LEMMA: "ตารางเซนติเมตร"}], - "ตร.ม.": [{ORTH: "ตร.ม.", LEMMA: "ตารางเมตร"}], - "ตร.ว.": [{ORTH: "ตร.ว.", LEMMA: "ตารางวา"}], - "ตร.กม.": [{ORTH: "ตร.กม.", LEMMA: "ตารางกิโลเมตร"}], + "ตร.ซม.": [{ORTH: "ตร.ซม."}], + "ตร.ม.": [{ORTH: "ตร.ม."}], + "ตร.ว.": [{ORTH: "ตร.ว."}], + "ตร.กม.": [{ORTH: "ตร.กม."}], # เดือน / month - "ม.ค.": [{ORTH: "ม.ค.", LEMMA: "มกราคม"}], - "ก.พ.": [{ORTH: "ก.พ.", LEMMA: "กุมภาพันธ์"}], - "มี.ค.": [{ORTH: "มี.ค.", LEMMA: "มีนาคม"}], - "เม.ย.": [{ORTH: "เม.ย.", LEMMA: "เมษายน"}], - "พ.ค.": [{ORTH: "พ.ค.", LEMMA: "พฤษภาคม"}], - "มิ.ย.": [{ORTH: "มิ.ย.", LEMMA: "มิถุนายน"}], - "ก.ค.": [{ORTH: "ก.ค.", LEMMA: "กรกฎาคม"}], - "ส.ค.": [{ORTH: "ส.ค.", LEMMA: "สิงหาคม"}], - "ก.ย.": [{ORTH: "ก.ย.", LEMMA: "กันยายน"}], - "ต.ค.": [{ORTH: "ต.ค.", LEMMA: "ตุลาคม"}], - "พ.ย.": [{ORTH: "พ.ย.", LEMMA: "พฤศจิกายน"}], - "ธ.ค.": [{ORTH: "ธ.ค.", LEMMA: "ธันวาคม"}], + "ม.ค.": [{ORTH: "ม.ค."}], + "ก.พ.": [{ORTH: "ก.พ."}], + "มี.ค.": [{ORTH: "มี.ค."}], + "เม.ย.": [{ORTH: "เม.ย."}], + "พ.ค.": [{ORTH: "พ.ค."}], + "มิ.ย.": [{ORTH: "มิ.ย."}], + "ก.ค.": [{ORTH: "ก.ค."}], + "ส.ค.": [{ORTH: "ส.ค."}], + "ก.ย.": [{ORTH: "ก.ย."}], + "ต.ค.": [{ORTH: "ต.ค."}], + "พ.ย.": [{ORTH: "พ.ย."}], + "ธ.ค.": [{ORTH: "ธ.ค."}], # เพศ / gender - "ช.": 
[{ORTH: "ช.", LEMMA: "ชาย"}], - "ญ.": [{ORTH: "ญ.", LEMMA: "หญิง"}], - "ด.ช.": [{ORTH: "ด.ช.", LEMMA: "เด็กชาย"}], - "ด.ญ.": [{ORTH: "ด.ญ.", LEMMA: "เด็กหญิง"}], + "ช.": [{ORTH: "ช."}], + "ญ.": [{ORTH: "ญ."}], + "ด.ช.": [{ORTH: "ด.ช."}], + "ด.ญ.": [{ORTH: "ด.ญ."}], # ที่อยู่ / address - "ถ.": [{ORTH: "ถ.", LEMMA: "ถนน"}], - "ต.": [{ORTH: "ต.", LEMMA: "ตำบล"}], - "อ.": [{ORTH: "อ.", LEMMA: "อำเภอ"}], - "จ.": [{ORTH: "จ.", LEMMA: "จังหวัด"}], + "ถ.": [{ORTH: "ถ."}], + "ต.": [{ORTH: "ต."}], + "อ.": [{ORTH: "อ."}], + "จ.": [{ORTH: "จ."}], # สรรพนาม / pronoun - "ข้าฯ": [{ORTH: "ข้าฯ", LEMMA: "ข้าพระพุทธเจ้า"}], - "ทูลเกล้าฯ": [{ORTH: "ทูลเกล้าฯ", LEMMA: "ทูลเกล้าทูลกระหม่อม"}], - "น้อมเกล้าฯ": [{ORTH: "น้อมเกล้าฯ", LEMMA: "น้อมเกล้าน้อมกระหม่อม"}], - "โปรดเกล้าฯ": [{ORTH: "โปรดเกล้าฯ", LEMMA: "โปรดเกล้าโปรดกระหม่อม"}], + "ข้าฯ": [{ORTH: "ข้าฯ"}], + "ทูลเกล้าฯ": [{ORTH: "ทูลเกล้าฯ"}], + "น้อมเกล้าฯ": [{ORTH: "น้อมเกล้าฯ"}], + "โปรดเกล้าฯ": [{ORTH: "โปรดเกล้าฯ"}], # การเมือง / politic - "ขจก.": [{ORTH: "ขจก.", LEMMA: "ขบวนการโจรก่อการร้าย"}], - "ขบด.": [{ORTH: "ขบด.", LEMMA: "ขบวนการแบ่งแยกดินแดน"}], - "นปช.": [{ORTH: "นปช.", LEMMA: "แนวร่วมประชาธิปไตยขับไล่เผด็จการ"}], - "ปชป.": [{ORTH: "ปชป.", LEMMA: "พรรคประชาธิปัตย์"}], - "ผกค.": [{ORTH: "ผกค.", LEMMA: "ผู้ก่อการร้ายคอมมิวนิสต์"}], - "พท.": [{ORTH: "พท.", LEMMA: "พรรคเพื่อไทย"}], - "พ.ร.ก.": [{ORTH: "พ.ร.ก.", LEMMA: "พระราชกำหนด"}], - "พ.ร.ฎ.": [{ORTH: "พ.ร.ฎ.", LEMMA: "พระราชกฤษฎีกา"}], - "พ.ร.บ.": [{ORTH: "พ.ร.บ.", LEMMA: "พระราชบัญญัติ"}], - "รธน.": [{ORTH: "รธน.", LEMMA: "รัฐธรรมนูญ"}], - "รบ.": [{ORTH: "รบ.", LEMMA: "รัฐบาล"}], - "รสช.": [{ORTH: "รสช.", LEMMA: "คณะรักษาความสงบเรียบร้อยแห่งชาติ"}], - "ส.ก.": [{ORTH: "ส.ก.", LEMMA: "สมาชิกสภากรุงเทพมหานคร"}], - "สจ.": [{ORTH: "สจ.", LEMMA: "สมาชิกสภาจังหวัด"}], - "สว.": [{ORTH: "สว.", LEMMA: "สมาชิกวุฒิสภา"}], - "ส.ส.": [{ORTH: "ส.ส.", LEMMA: "สมาชิกสภาผู้แทนราษฎร"}], + "ขจก.": [{ORTH: "ขจก."}], + "ขบด.": [{ORTH: "ขบด."}], + "นปช.": [{ORTH: "นปช."}], + "ปชป.": [{ORTH: "ปชป."}], + "ผกค.": [{ORTH: "ผกค."}], + "พท.": [{ORTH: "พท."}], + "พ.ร.ก.": [{ORTH: "พ.ร.ก."}], + "พ.ร.ฎ.": [{ORTH: "พ.ร.ฎ."}], + "พ.ร.บ.": [{ORTH: "พ.ร.บ."}], + "รธน.": [{ORTH: "รธน."}], + "รบ.": [{ORTH: "รบ."}], + "รสช.": [{ORTH: "รสช."}], + "ส.ก.": [{ORTH: "ส.ก."}], + "สจ.": [{ORTH: "สจ."}], + "สว.": [{ORTH: "สว."}], + "ส.ส.": [{ORTH: "ส.ส."}], # ทั่วไป / general - "ก.ข.ค.": [{ORTH: "ก.ข.ค.", LEMMA: "ก้างขวางคอ"}], - "กทม.": [{ORTH: "กทม.", LEMMA: "กรุงเทพมหานคร"}], - "กรุงเทพฯ": [{ORTH: "กรุงเทพฯ", LEMMA: "กรุงเทพมหานคร"}], - "ขรก.": [{ORTH: "ขรก.", LEMMA: "ข้าราชการ"}], - "ขส": [{ORTH: "ขส.", LEMMA: "ขนส่ง"}], - "ค.ร.น.": [{ORTH: "ค.ร.น.", LEMMA: "คูณร่วมน้อย"}], - "ค.ร.ม.": [{ORTH: "ค.ร.ม.", LEMMA: "คูณร่วมมาก"}], - "ง.ด.": [{ORTH: "ง.ด.", LEMMA: "เงินเดือน"}], - "งป.": [{ORTH: "งป.", LEMMA: "งบประมาณ"}], - "จก.": [{ORTH: "จก.", LEMMA: "จำกัด"}], - "จขกท.": [{ORTH: "จขกท.", LEMMA: "เจ้าของกระทู้"}], - "จนท.": [{ORTH: "จนท.", LEMMA: "เจ้าหน้าที่"}], - "จ.ป.ร.": [ - { - ORTH: "จ.ป.ร.", - LEMMA: "มหาจุฬาลงกรณ ปรมราชาธิราช (พระปรมาภิไธยในพระบาทสมเด็จพระจุลจอมเกล้าเจ้าอยู่หัว)", - } - ], - "จ.ม.": [{ORTH: "จ.ม.", LEMMA: "จดหมาย"}], - "จย.": [{ORTH: "จย.", LEMMA: "จักรยาน"}], - "จยย.": [{ORTH: "จยย.", LEMMA: "จักรยานยนต์"}], - "ตจว.": [{ORTH: "ตจว.", LEMMA: "ต่างจังหวัด"}], - "โทร.": [{ORTH: "โทร.", LEMMA: "โทรศัพท์"}], - "ธ.": [{ORTH: "ธ.", LEMMA: "ธนาคาร"}], - "น.ร.": [{ORTH: "น.ร.", LEMMA: "นักเรียน"}], - "น.ศ.": [{ORTH: "น.ศ.", LEMMA: "นักศึกษา"}], - "น.ส.": [{ORTH: "น.ส.", LEMMA: "นางสาว"}], - "น.ส.๓": [{ORTH: "น.ส.๓", 
LEMMA: "หนังสือรับรองการทำประโยชน์ในที่ดิน"}], - "น.ส.๓ ก.": [ - {ORTH: "น.ส.๓ ก", LEMMA: "หนังสือแสดงกรรมสิทธิ์ในที่ดิน (มีระวางกำหนด)"} - ], - "นสพ.": [{ORTH: "นสพ.", LEMMA: "หนังสือพิมพ์"}], - "บ.ก.": [{ORTH: "บ.ก.", LEMMA: "บรรณาธิการ"}], - "บจก.": [{ORTH: "บจก.", LEMMA: "บริษัทจำกัด"}], - "บงล.": [{ORTH: "บงล.", LEMMA: "บริษัทเงินทุนและหลักทรัพย์จำกัด"}], - "บบส.": [{ORTH: "บบส.", LEMMA: "บรรษัทบริหารสินทรัพย์สถาบันการเงิน"}], - "บมจ.": [{ORTH: "บมจ.", LEMMA: "บริษัทมหาชนจำกัด"}], - "บลจ.": [{ORTH: "บลจ.", LEMMA: "บริษัทหลักทรัพย์จัดการกองทุนรวมจำกัด"}], - "บ/ช": [{ORTH: "บ/ช", LEMMA: "บัญชี"}], - "บร.": [{ORTH: "บร.", LEMMA: "บรรณารักษ์"}], - "ปชช.": [{ORTH: "ปชช.", LEMMA: "ประชาชน"}], - "ปณ.": [{ORTH: "ปณ.", LEMMA: "ที่ทำการไปรษณีย์"}], - "ปณก.": [{ORTH: "ปณก.", LEMMA: "ที่ทำการไปรษณีย์กลาง"}], - "ปณส.": [{ORTH: "ปณส.", LEMMA: "ที่ทำการไปรษณีย์สาขา"}], - "ปธ.": [{ORTH: "ปธ.", LEMMA: "ประธาน"}], - "ปธน.": [{ORTH: "ปธน.", LEMMA: "ประธานาธิบดี"}], - "ปอ.": [{ORTH: "ปอ.", LEMMA: "รถยนต์โดยสารประจำทางปรับอากาศ"}], - "ปอ.พ.": [{ORTH: "ปอ.พ.", LEMMA: "รถยนต์โดยสารประจำทางปรับอากาศพิเศษ"}], - "พ.ก.ง.": [{ORTH: "พ.ก.ง.", LEMMA: "พัสดุเก็บเงินปลายทาง"}], - "พ.ก.ส.": [{ORTH: "พ.ก.ส.", LEMMA: "พนักงานเก็บค่าโดยสาร"}], - "พขร.": [{ORTH: "พขร.", LEMMA: "พนักงานขับรถ"}], - "ภ.ง.ด.": [{ORTH: "ภ.ง.ด.", LEMMA: "ภาษีเงินได้"}], - "ภ.ง.ด.๙": [{ORTH: "ภ.ง.ด.๙", LEMMA: "แบบแสดงรายการเสียภาษีเงินได้ของกรมสรรพากร"}], - "ภ.ป.ร.": [ - { - ORTH: "ภ.ป.ร.", - LEMMA: "ภูมิพลอดุยเดช ปรมราชาธิราช (พระปรมาภิไธยในพระบาทสมเด็จพระปรมินทรมหาภูมิพลอดุลยเดช)", - } - ], - "ภ.พ.": [{ORTH: "ภ.พ.", LEMMA: "ภาษีมูลค่าเพิ่ม"}], - "ร.": [{ORTH: "ร.", LEMMA: "รัชกาล"}], - "ร.ง.": [{ORTH: "ร.ง.", LEMMA: "โรงงาน"}], - "ร.ด.": [{ORTH: "ร.ด.", LEMMA: "รักษาดินแดน"}], - "รปภ.": [{ORTH: "รปภ.", LEMMA: "รักษาความปลอดภัย"}], - "รพ.": [{ORTH: "รพ.", LEMMA: "โรงพยาบาล"}], - "ร.พ.": [{ORTH: "ร.พ.", LEMMA: "โรงพิมพ์"}], - "รร.": [{ORTH: "รร.", LEMMA: "โรงเรียน,โรงแรม"}], - "รสก.": [{ORTH: "รสก.", LEMMA: "รัฐวิสาหกิจ"}], - "ส.ค.ส.": [{ORTH: "ส.ค.ส.", LEMMA: "ส่งความสุขปีใหม่"}], - "สต.": [{ORTH: "สต.", LEMMA: "สตางค์"}], - "สน.": [{ORTH: "สน.", LEMMA: "สถานีตำรวจ"}], - "สนข.": [{ORTH: "สนข.", LEMMA: "สำนักงานเขต"}], - "สนง.": [{ORTH: "สนง.", LEMMA: "สำนักงาน"}], - "สนญ.": [{ORTH: "สนญ.", LEMMA: "สำนักงานใหญ่"}], - "ส.ป.ช.": [{ORTH: "ส.ป.ช.", LEMMA: "สร้างเสริมประสบการณ์ชีวิต"}], - "สภ.": [{ORTH: "สภ.", LEMMA: "สถานีตำรวจภูธร"}], - "ส.ล.น.": [{ORTH: "ส.ล.น.", LEMMA: "สร้างเสริมลักษณะนิสัย"}], - "สวญ.": [{ORTH: "สวญ.", LEMMA: "สารวัตรใหญ่"}], - "สวป.": [{ORTH: "สวป.", LEMMA: "สารวัตรป้องกันปราบปราม"}], - "สว.สส.": [{ORTH: "สว.สส.", LEMMA: "สารวัตรสืบสวน"}], - "ส.ห.": [{ORTH: "ส.ห.", LEMMA: "สารวัตรทหาร"}], - "สอ.": [{ORTH: "สอ.", LEMMA: "สถานีอนามัย"}], - "สอท.": [{ORTH: "สอท.", LEMMA: "สถานเอกอัครราชทูต"}], - "เสธ.": [{ORTH: "เสธ.", LEMMA: "เสนาธิการ"}], - "หจก.": [{ORTH: "หจก.", LEMMA: "ห้างหุ้นส่วนจำกัด"}], - "ห.ร.ม.": [{ORTH: "ห.ร.ม.", LEMMA: "ตัวหารร่วมมาก"}], + "ก.ข.ค.": [{ORTH: "ก.ข.ค."}], + "กทม.": [{ORTH: "กทม."}], + "กรุงเทพฯ": [{ORTH: "กรุงเทพฯ"}], + "ขรก.": [{ORTH: "ขรก."}], + "ขส": [{ORTH: "ขส."}], + "ค.ร.น.": [{ORTH: "ค.ร.น."}], + "ค.ร.ม.": [{ORTH: "ค.ร.ม."}], + "ง.ด.": [{ORTH: "ง.ด."}], + "งป.": [{ORTH: "งป."}], + "จก.": [{ORTH: "จก."}], + "จขกท.": [{ORTH: "จขกท."}], + "จนท.": [{ORTH: "จนท."}], + "จ.ป.ร.": [{ORTH: "จ.ป.ร."}], + "จ.ม.": [{ORTH: "จ.ม."}], + "จย.": [{ORTH: "จย."}], + "จยย.": [{ORTH: "จยย."}], + "ตจว.": [{ORTH: "ตจว."}], + "โทร.": [{ORTH: "โทร."}], + "ธ.": [{ORTH: "ธ."}], + "น.ร.": [{ORTH: "น.ร."}], + "น.ศ.": 
[{ORTH: "น.ศ."}], + "น.ส.": [{ORTH: "น.ส."}], + "น.ส.๓": [{ORTH: "น.ส.๓"}], + "น.ส.๓ ก.": [{ORTH: "น.ส.๓ ก"}], + "นสพ.": [{ORTH: "นสพ."}], + "บ.ก.": [{ORTH: "บ.ก."}], + "บจก.": [{ORTH: "บจก."}], + "บงล.": [{ORTH: "บงล."}], + "บบส.": [{ORTH: "บบส."}], + "บมจ.": [{ORTH: "บมจ."}], + "บลจ.": [{ORTH: "บลจ."}], + "บ/ช": [{ORTH: "บ/ช"}], + "บร.": [{ORTH: "บร."}], + "ปชช.": [{ORTH: "ปชช."}], + "ปณ.": [{ORTH: "ปณ."}], + "ปณก.": [{ORTH: "ปณก."}], + "ปณส.": [{ORTH: "ปณส."}], + "ปธ.": [{ORTH: "ปธ."}], + "ปธน.": [{ORTH: "ปธน."}], + "ปอ.": [{ORTH: "ปอ."}], + "ปอ.พ.": [{ORTH: "ปอ.พ."}], + "พ.ก.ง.": [{ORTH: "พ.ก.ง."}], + "พ.ก.ส.": [{ORTH: "พ.ก.ส."}], + "พขร.": [{ORTH: "พขร."}], + "ภ.ง.ด.": [{ORTH: "ภ.ง.ด."}], + "ภ.ง.ด.๙": [{ORTH: "ภ.ง.ด.๙"}], + "ภ.ป.ร.": [{ORTH: "ภ.ป.ร."}], + "ภ.พ.": [{ORTH: "ภ.พ."}], + "ร.": [{ORTH: "ร."}], + "ร.ง.": [{ORTH: "ร.ง."}], + "ร.ด.": [{ORTH: "ร.ด."}], + "รปภ.": [{ORTH: "รปภ."}], + "รพ.": [{ORTH: "รพ."}], + "ร.พ.": [{ORTH: "ร.พ."}], + "รร.": [{ORTH: "รร."}], + "รสก.": [{ORTH: "รสก."}], + "ส.ค.ส.": [{ORTH: "ส.ค.ส."}], + "สต.": [{ORTH: "สต."}], + "สน.": [{ORTH: "สน."}], + "สนข.": [{ORTH: "สนข."}], + "สนง.": [{ORTH: "สนง."}], + "สนญ.": [{ORTH: "สนญ."}], + "ส.ป.ช.": [{ORTH: "ส.ป.ช."}], + "สภ.": [{ORTH: "สภ."}], + "ส.ล.น.": [{ORTH: "ส.ล.น."}], + "สวญ.": [{ORTH: "สวญ."}], + "สวป.": [{ORTH: "สวป."}], + "สว.สส.": [{ORTH: "สว.สส."}], + "ส.ห.": [{ORTH: "ส.ห."}], + "สอ.": [{ORTH: "สอ."}], + "สอท.": [{ORTH: "สอท."}], + "เสธ.": [{ORTH: "เสธ."}], + "หจก.": [{ORTH: "หจก."}], + "ห.ร.ม.": [{ORTH: "ห.ร.ม."}], } diff --git a/spacy/lang/tl/tokenizer_exceptions.py b/spacy/lang/tl/tokenizer_exceptions.py index f81d35f20..51ad12d9f 100644 --- a/spacy/lang/tl/tokenizer_exceptions.py +++ b/spacy/lang/tl/tokenizer_exceptions.py @@ -1,18 +1,18 @@ from ..tokenizer_exceptions import BASE_EXCEPTIONS -from ...symbols import ORTH, LEMMA +from ...symbols import ORTH, NORM from ...util import update_exc _exc = { - "tayo'y": [{ORTH: "tayo", LEMMA: "tayo"}, {ORTH: "'y", LEMMA: "ay"}], - "isa'y": [{ORTH: "isa", LEMMA: "isa"}, {ORTH: "'y", LEMMA: "ay"}], - "baya'y": [{ORTH: "baya", LEMMA: "bayan"}, {ORTH: "'y", LEMMA: "ay"}], - "sa'yo": [{ORTH: "sa", LEMMA: "sa"}, {ORTH: "'yo", LEMMA: "iyo"}], - "ano'ng": [{ORTH: "ano", LEMMA: "ano"}, {ORTH: "'ng", LEMMA: "ang"}], - "siya'y": [{ORTH: "siya", LEMMA: "siya"}, {ORTH: "'y", LEMMA: "ay"}], - "nawa'y": [{ORTH: "nawa", LEMMA: "nawa"}, {ORTH: "'y", LEMMA: "ay"}], - "papa'no": [{ORTH: "papa'no", LEMMA: "papaano"}], - "'di": [{ORTH: "'di", LEMMA: "hindi"}], + "tayo'y": [{ORTH: "tayo"}, {ORTH: "'y", NORM: "ay"}], + "isa'y": [{ORTH: "isa"}, {ORTH: "'y", NORM: "ay"}], + "baya'y": [{ORTH: "baya"}, {ORTH: "'y", NORM: "ay"}], + "sa'yo": [{ORTH: "sa"}, {ORTH: "'yo", NORM: "iyo"}], + "ano'ng": [{ORTH: "ano"}, {ORTH: "'ng", NORM: "ang"}], + "siya'y": [{ORTH: "siya"}, {ORTH: "'y", NORM: "ay"}], + "nawa'y": [{ORTH: "nawa"}, {ORTH: "'y", NORM: "ay"}], + "papa'no": [{ORTH: "papa'no", NORM: "papaano"}], + "'di": [{ORTH: "'di", NORM: "hindi"}], } diff --git a/spacy/lang/tokenizer_exceptions.py b/spacy/lang/tokenizer_exceptions.py index 45391332e..2532ae104 100644 --- a/spacy/lang/tokenizer_exceptions.py +++ b/spacy/lang/tokenizer_exceptions.py @@ -1,7 +1,7 @@ import re from .char_classes import ALPHA_LOWER -from ..symbols import ORTH, POS, TAG, LEMMA, SPACE +from ..symbols import ORTH, NORM # URL validation regex courtesy of: https://mathiasbynens.be/demo/url-regex @@ -62,13 +62,13 @@ BASE_EXCEPTIONS = {} for exc_data in [ - {ORTH: " ", POS: SPACE, TAG: "_SP"}, - {ORTH: "\t", POS: SPACE, 
TAG: "_SP"}, - {ORTH: "\\t", POS: SPACE, TAG: "_SP"}, - {ORTH: "\n", POS: SPACE, TAG: "_SP"}, - {ORTH: "\\n", POS: SPACE, TAG: "_SP"}, + {ORTH: " "}, + {ORTH: "\t"}, + {ORTH: "\\t"}, + {ORTH: "\n"}, + {ORTH: "\\n"}, {ORTH: "\u2014"}, - {ORTH: "\u00a0", POS: SPACE, LEMMA: " ", TAG: "_SP"}, + {ORTH: "\u00a0", NORM: " "}, ]: BASE_EXCEPTIONS[exc_data[ORTH]] = [exc_data] diff --git a/spacy/lang/tt/tokenizer_exceptions.py b/spacy/lang/tt/tokenizer_exceptions.py index 4ee6b6cd2..3b8cc86b5 100644 --- a/spacy/lang/tt/tokenizer_exceptions.py +++ b/spacy/lang/tt/tokenizer_exceptions.py @@ -1,5 +1,5 @@ from ..tokenizer_exceptions import BASE_EXCEPTIONS -from ...symbols import ORTH, LEMMA, NORM +from ...symbols import ORTH, NORM from ...util import update_exc @@ -7,35 +7,35 @@ _exc = {} _abbrev_exc = [ # Weekdays abbreviations - {ORTH: "дш", LEMMA: "дүшәмбе"}, - {ORTH: "сш", LEMMA: "сишәмбе"}, - {ORTH: "чш", LEMMA: "чәршәмбе"}, - {ORTH: "пш", LEMMA: "пәнҗешәмбе"}, - {ORTH: "җм", LEMMA: "җомга"}, - {ORTH: "шб", LEMMA: "шимбә"}, - {ORTH: "яш", LEMMA: "якшәмбе"}, + {ORTH: "дш", NORM: "дүшәмбе"}, + {ORTH: "сш", NORM: "сишәмбе"}, + {ORTH: "чш", NORM: "чәршәмбе"}, + {ORTH: "пш", NORM: "пәнҗешәмбе"}, + {ORTH: "җм", NORM: "җомга"}, + {ORTH: "шб", NORM: "шимбә"}, + {ORTH: "яш", NORM: "якшәмбе"}, # Months abbreviations - {ORTH: "гый", LEMMA: "гыйнвар"}, - {ORTH: "фев", LEMMA: "февраль"}, - {ORTH: "мар", LEMMA: "март"}, - {ORTH: "мар", LEMMA: "март"}, - {ORTH: "апр", LEMMA: "апрель"}, - {ORTH: "июн", LEMMA: "июнь"}, - {ORTH: "июл", LEMMA: "июль"}, - {ORTH: "авг", LEMMA: "август"}, - {ORTH: "сен", LEMMA: "сентябрь"}, - {ORTH: "окт", LEMMA: "октябрь"}, - {ORTH: "ноя", LEMMA: "ноябрь"}, - {ORTH: "дек", LEMMA: "декабрь"}, + {ORTH: "гый", NORM: "гыйнвар"}, + {ORTH: "фев", NORM: "февраль"}, + {ORTH: "мар", NORM: "март"}, + {ORTH: "мар", NORM: "март"}, + {ORTH: "апр", NORM: "апрель"}, + {ORTH: "июн", NORM: "июнь"}, + {ORTH: "июл", NORM: "июль"}, + {ORTH: "авг", NORM: "август"}, + {ORTH: "сен", NORM: "сентябрь"}, + {ORTH: "окт", NORM: "октябрь"}, + {ORTH: "ноя", NORM: "ноябрь"}, + {ORTH: "дек", NORM: "декабрь"}, # Number abbreviations - {ORTH: "млрд", LEMMA: "миллиард"}, - {ORTH: "млн", LEMMA: "миллион"}, + {ORTH: "млрд", NORM: "миллиард"}, + {ORTH: "млн", NORM: "миллион"}, ] for abbr in _abbrev_exc: for orth in (abbr[ORTH], abbr[ORTH].capitalize(), abbr[ORTH].upper()): - _exc[orth] = [{ORTH: orth, LEMMA: abbr[LEMMA], NORM: abbr[LEMMA]}] - _exc[orth + "."] = [{ORTH: orth + ".", LEMMA: abbr[LEMMA], NORM: abbr[LEMMA]}] + _exc[orth] = [{ORTH: orth, NORM: abbr[NORM]}] + _exc[orth + "."] = [{ORTH: orth + ".", NORM: abbr[NORM]}] for exc_data in [ # "etc." abbreviations {ORTH: "һ.б.ш.", NORM: "һәм башка шундыйлар"}, @@ -43,7 +43,6 @@ for exc_data in [ # "etc." 
abbreviations {ORTH: "б.э.к.", NORM: "безнең эрага кадәр"}, {ORTH: "б.э.", NORM: "безнең эра"}, ]: - exc_data[LEMMA] = exc_data[NORM] _exc[exc_data[ORTH]] = [exc_data] TOKENIZER_EXCEPTIONS = update_exc(BASE_EXCEPTIONS, _exc) diff --git a/spacy/lang/uk/tokenizer_exceptions.py b/spacy/lang/uk/tokenizer_exceptions.py index 8ae82a48c..94016fd52 100644 --- a/spacy/lang/uk/tokenizer_exceptions.py +++ b/spacy/lang/uk/tokenizer_exceptions.py @@ -1,24 +1,24 @@ from ..tokenizer_exceptions import BASE_EXCEPTIONS -from ...symbols import ORTH, LEMMA, POS, NORM, NOUN +from ...symbols import ORTH, NORM from ...util import update_exc _exc = {} for exc_data in [ - {ORTH: "вул.", LEMMA: "вулиця", NORM: "вулиця", POS: NOUN}, - {ORTH: "ім.", LEMMA: "ім'я", NORM: "імені", POS: NOUN}, - {ORTH: "просп.", LEMMA: "проспект", NORM: "проспект", POS: NOUN}, - {ORTH: "бул.", LEMMA: "бульвар", NORM: "бульвар", POS: NOUN}, - {ORTH: "пров.", LEMMA: "провулок", NORM: "провулок", POS: NOUN}, - {ORTH: "пл.", LEMMA: "площа", NORM: "площа", POS: NOUN}, - {ORTH: "г.", LEMMA: "гора", NORM: "гора", POS: NOUN}, - {ORTH: "п.", LEMMA: "пан", NORM: "пан", POS: NOUN}, - {ORTH: "м.", LEMMA: "місто", NORM: "місто", POS: NOUN}, - {ORTH: "проф.", LEMMA: "професор", NORM: "професор", POS: NOUN}, - {ORTH: "акад.", LEMMA: "академік", NORM: "академік", POS: NOUN}, - {ORTH: "доц.", LEMMA: "доцент", NORM: "доцент", POS: NOUN}, - {ORTH: "оз.", LEMMA: "озеро", NORM: "озеро", POS: NOUN}, + {ORTH: "вул.", NORM: "вулиця"}, + {ORTH: "ім.", NORM: "імені"}, + {ORTH: "просп.", NORM: "проспект"}, + {ORTH: "бул.", NORM: "бульвар"}, + {ORTH: "пров.", NORM: "провулок"}, + {ORTH: "пл.", NORM: "площа"}, + {ORTH: "г.", NORM: "гора"}, + {ORTH: "п.", NORM: "пан"}, + {ORTH: "м.", NORM: "місто"}, + {ORTH: "проф.", NORM: "професор"}, + {ORTH: "акад.", NORM: "академік"}, + {ORTH: "доц.", NORM: "доцент"}, + {ORTH: "оз.", NORM: "озеро"}, ]: _exc[exc_data[ORTH]] = [exc_data] diff --git a/spacy/tests/lang/ar/test_exceptions.py b/spacy/tests/lang/ar/test_exceptions.py index 125220caf..0129c3a19 100644 --- a/spacy/tests/lang/ar/test_exceptions.py +++ b/spacy/tests/lang/ar/test_exceptions.py @@ -12,7 +12,6 @@ def test_ar_tokenizer_handles_exc_in_text(ar_tokenizer): tokens = ar_tokenizer(text) assert len(tokens) == 7 assert tokens[6].text == "ق.م" - assert tokens[6].lemma_ == "قبل الميلاد" def test_ar_tokenizer_handles_exc_in_text_2(ar_tokenizer): diff --git a/spacy/tests/lang/ca/test_exception.py b/spacy/tests/lang/ca/test_exception.py index 71098f094..cfb574b63 100644 --- a/spacy/tests/lang/ca/test_exception.py +++ b/spacy/tests/lang/ca/test_exception.py @@ -8,7 +8,6 @@ import pytest def test_ca_tokenizer_handles_abbr(ca_tokenizer, text, lemma): tokens = ca_tokenizer(text) assert len(tokens) == 1 - assert tokens[0].lemma_ == lemma def test_ca_tokenizer_handles_exc_in_text(ca_tokenizer): @@ -16,4 +15,3 @@ def test_ca_tokenizer_handles_exc_in_text(ca_tokenizer): tokens = ca_tokenizer(text) assert len(tokens) == 15 assert tokens[7].text == "aprox." - assert tokens[7].lemma_ == "aproximadament" diff --git a/spacy/tests/lang/de/test_exceptions.py b/spacy/tests/lang/de/test_exceptions.py index a1bbaf58b..d51c33992 100644 --- a/spacy/tests/lang/de/test_exceptions.py +++ b/spacy/tests/lang/de/test_exceptions.py @@ -18,4 +18,3 @@ def test_de_tokenizer_handles_exc_in_text(de_tokenizer): tokens = de_tokenizer(text) assert len(tokens) == 6 assert tokens[2].text == "z.Zt." 
- assert tokens[2].lemma_ == "zur Zeit" diff --git a/spacy/tests/lang/en/test_exceptions.py b/spacy/tests/lang/en/test_exceptions.py index f72dfbf25..1b56a3b0f 100644 --- a/spacy/tests/lang/en/test_exceptions.py +++ b/spacy/tests/lang/en/test_exceptions.py @@ -49,7 +49,6 @@ def test_en_tokenizer_handles_ll_contraction(en_tokenizer, text): assert len(tokens) == 2 assert tokens[0].text == text.split("'")[0] assert tokens[1].text == "'ll" - assert tokens[1].lemma_ == "will" @pytest.mark.parametrize( @@ -104,7 +103,6 @@ def test_en_tokenizer_handles_exc_in_text(en_tokenizer): def test_en_tokenizer_handles_times(en_tokenizer, text): tokens = en_tokenizer(text) assert len(tokens) == 2 - assert tokens[1].lemma_ in ["a.m.", "p.m."] @pytest.mark.parametrize( diff --git a/spacy/tests/lang/es/test_exception.py b/spacy/tests/lang/es/test_exception.py index 90d897a4c..07df5d69e 100644 --- a/spacy/tests/lang/es/test_exception.py +++ b/spacy/tests/lang/es/test_exception.py @@ -13,7 +13,6 @@ import pytest def test_es_tokenizer_handles_abbr(es_tokenizer, text, lemma): tokens = es_tokenizer(text) assert len(tokens) == 1 - assert tokens[0].lemma_ == lemma def test_es_tokenizer_handles_exc_in_text(es_tokenizer): @@ -21,4 +20,3 @@ def test_es_tokenizer_handles_exc_in_text(es_tokenizer): tokens = es_tokenizer(text) assert len(tokens) == 7 assert tokens[4].text == "aprox." - assert tokens[4].lemma_ == "aproximadamente" diff --git a/spacy/tests/lang/fr/test_exceptions.py b/spacy/tests/lang/fr/test_exceptions.py index 4b7ccad65..77e72a76b 100644 --- a/spacy/tests/lang/fr/test_exceptions.py +++ b/spacy/tests/lang/fr/test_exceptions.py @@ -37,19 +37,11 @@ def test_fr_tokenizer_infix_exceptions(fr_tokenizer, text): @pytest.mark.parametrize( - "text,lemma", - [ - ("janv.", "janvier"), - ("juill.", "juillet"), - ("Dr.", "docteur"), - ("av.", "avant"), - ("sept.", "septembre"), - ], + "text", ["janv.", "juill.", "Dr.", "av.", "sept."], ) -def test_fr_tokenizer_handles_abbr(fr_tokenizer, text, lemma): +def test_fr_tokenizer_handles_abbr(fr_tokenizer, text): tokens = fr_tokenizer(text) assert len(tokens) == 1 - assert tokens[0].lemma_ == lemma def test_fr_tokenizer_handles_exc_in_text(fr_tokenizer): @@ -57,7 +49,6 @@ def test_fr_tokenizer_handles_exc_in_text(fr_tokenizer): tokens = fr_tokenizer(text) assert len(tokens) == 10 assert tokens[6].text == "janv." 
- assert tokens[6].lemma_ == "janvier" assert tokens[8].text == "prud’hommes" @@ -74,11 +65,8 @@ def test_fr_tokenizer_handles_title(fr_tokenizer): tokens = fr_tokenizer(text) assert len(tokens) == 6 assert tokens[0].text == "N'" - assert tokens[0].lemma_ == "ne" assert tokens[1].text == "est" - assert tokens[1].lemma_ == "être" assert tokens[2].text == "-ce" - assert tokens[2].lemma_ == "ce" def test_fr_tokenizer_handles_title_2(fr_tokenizer): @@ -86,9 +74,7 @@ def test_fr_tokenizer_handles_title_2(fr_tokenizer): tokens = fr_tokenizer(text) assert len(tokens) == 5 assert tokens[0].text == "Est" - assert tokens[0].lemma_ == "être" assert tokens[1].text == "-ce" - assert tokens[1].lemma_ == "ce" def test_fr_tokenizer_handles_title_3(fr_tokenizer): @@ -96,4 +82,3 @@ def test_fr_tokenizer_handles_title_3(fr_tokenizer): tokens = fr_tokenizer(text) assert len(tokens) == 7 assert tokens[0].text == "Qu'" - assert tokens[0].lemma_ == "que" diff --git a/spacy/tests/lang/lb/test_exceptions.py b/spacy/tests/lang/lb/test_exceptions.py index d941a854b..fc4b4fa7b 100644 --- a/spacy/tests/lang/lb/test_exceptions.py +++ b/spacy/tests/lang/lb/test_exceptions.py @@ -18,4 +18,3 @@ def test_lb_tokenizer_handles_exc_in_text(lb_tokenizer): tokens = lb_tokenizer(text) assert len(tokens) == 9 assert tokens[1].text == "'t" - assert tokens[1].lemma_ == "et" diff --git a/spacy/tests/regression/test_issue1501-2000.py b/spacy/tests/regression/test_issue1501-2000.py index 9d2ef999b..b5d586ec6 100644 --- a/spacy/tests/regression/test_issue1501-2000.py +++ b/spacy/tests/regression/test_issue1501-2000.py @@ -157,8 +157,6 @@ def test_issue1758(en_tokenizer): """Test that "would've" is handled by the English tokenizer exceptions.""" tokens = en_tokenizer("would've") assert len(tokens) == 2 - assert tokens[0].tag_ == "MD" - assert tokens[1].lemma_ == "have" def test_issue1773(en_tokenizer): diff --git a/spacy/tests/regression/test_issue2501-3000.py b/spacy/tests/regression/test_issue2501-3000.py index 7917157aa..ac0867189 100644 --- a/spacy/tests/regression/test_issue2501-3000.py +++ b/spacy/tests/regression/test_issue2501-3000.py @@ -166,7 +166,6 @@ def test_issue2822(it_tokenizer): assert doc[0].text == "Vuoi" assert doc[1].text == "un" assert doc[2].text == "po'" - assert doc[2].lemma_ == "poco" assert doc[3].text == "di" assert doc[4].text == "zucchero" assert doc[5].text == "?" From 87737a5a60e7419929dc2ebb22f7c759d6eb0108 Mon Sep 17 00:00:00 2001 From: Ines Montani Date: Thu, 23 Jul 2020 00:16:23 +0200 Subject: [PATCH 06/14] Tidy up --- spacy/lang/ne/examples.py | 4 ---- spacy/lang/ne/lex_attrs.py | 3 --- 2 files changed, 7 deletions(-) diff --git a/spacy/lang/ne/examples.py b/spacy/lang/ne/examples.py index b3c4f9e73..a29b77c2f 100644 --- a/spacy/lang/ne/examples.py +++ b/spacy/lang/ne/examples.py @@ -1,7 +1,3 @@ -# coding: utf8 -from __future__ import unicode_literals - - """ Example sentences to test spaCy and its language models. 
diff --git a/spacy/lang/ne/lex_attrs.py b/spacy/lang/ne/lex_attrs.py index 652307577..7cb01c515 100644 --- a/spacy/lang/ne/lex_attrs.py +++ b/spacy/lang/ne/lex_attrs.py @@ -1,6 +1,3 @@ -# coding: utf8 -from __future__ import unicode_literals - from ..norm_exceptions import BASE_NORMS from ...attrs import NORM, LIKE_NUM From 38f6ea7a78c5d2d11af31b7bc9a53f6d8bf9d25b Mon Sep 17 00:00:00 2001 From: Ines Montani Date: Fri, 24 Jul 2020 14:50:26 +0200 Subject: [PATCH 07/14] Simplify language data and revert detailed configs --- spacy/default_config.cfg | 11 ---- spacy/lang/af/__init__.py | 18 ++----- spacy/lang/ar/__init__.py | 33 ++---------- spacy/lang/bg/__init__.py | 18 ++----- spacy/lang/bn/__init__.py | 26 +--------- spacy/lang/ca/__init__.py | 35 ++----------- spacy/lang/cs/__init__.py | 18 ++----- spacy/lang/da/__init__.py | 38 +------------- spacy/lang/de/__init__.py | 43 ++------------- spacy/lang/defaults.py | 9 ---- spacy/lang/el/__init__.py | 49 ++++++----------- spacy/lang/en/__init__.py | 49 ++++++----------- spacy/lang/es/__init__.py | 47 ++--------------- spacy/lang/et/__init__.py | 18 ++----- spacy/lang/eu/__init__.py | 27 +--------- spacy/lang/fa/__init__.py | 50 +++--------------- spacy/lang/fi/__init__.py | 25 +-------- spacy/lang/fr/__init__.py | 54 ++++++------------- spacy/lang/ga/__init__.py | 18 +------ spacy/lang/gu/__init__.py | 18 ++----- spacy/lang/he/__init__.py | 26 +--------- spacy/lang/hi/__init__.py | 25 ++------- spacy/lang/hr/__init__.py | 28 +--------- spacy/lang/hu/__init__.py | 36 +------------ spacy/lang/hy/__init__.py | 25 ++------- spacy/lang/id/__init__.py | 47 ++--------------- spacy/lang/is/__init__.py | 18 ++----- spacy/lang/it/__init__.py | 25 --------- spacy/lang/ja/__init__.py | 37 +++++-------- spacy/lang/kn/__init__.py | 18 ++----- spacy/lang/ko/__init__.py | 28 ++++------ spacy/lang/lb/__init__.py | 38 +------------- spacy/lang/lij/__init__.py | 18 +------ spacy/lang/lt/__init__.py | 33 +----------- spacy/lang/lv/__init__.py | 18 ++----- spacy/lang/ml/__init__.py | 18 ++----- spacy/lang/mr/__init__.py | 18 ++----- spacy/lang/nb/__init__.py | 35 ++----------- spacy/lang/ne/__init__.py | 25 ++------- spacy/lang/nl/__init__.py | 35 +++++-------- spacy/lang/pl/__init__.py | 48 ++++++++--------- spacy/lang/pt/__init__.py | 33 +----------- spacy/lang/punctuation.py | 10 ++-- spacy/lang/ro/__init__.py | 26 +--------- spacy/lang/ru/__init__.py | 33 ++++-------- spacy/lang/si/__init__.py | 25 ++------- spacy/lang/sk/__init__.py | 25 ++------- spacy/lang/sl/__init__.py | 18 ++----- spacy/lang/sq/__init__.py | 18 ++----- spacy/lang/sr/__init__.py | 38 +------------- spacy/lang/sv/__init__.py | 42 ++------------- spacy/lang/ta/__init__.py | 30 ++--------- spacy/lang/te/__init__.py | 25 ++------- spacy/lang/th/__init__.py | 31 ++++------- spacy/lang/tl/__init__.py | 33 +----------- spacy/lang/tr/__init__.py | 26 +--------- spacy/lang/tt/__init__.py | 27 ++-------- spacy/lang/uk/__init__.py | 28 ++++------ spacy/lang/ur/__init__.py | 41 ++------------- spacy/lang/vi/__init__.py | 26 ++++------ spacy/lang/xx/__init__.py | 15 ------ spacy/lang/yo/__init__.py | 27 +--------- spacy/lang/zh/__init__.py | 47 ++++++++--------- spacy/language.py | 104 +++++++++++++++++++++++++------------ spacy/lemmatizer.py | 10 ---- spacy/lookups.py | 18 ++++--- spacy/schemas.py | 4 -- spacy/tests/conftest.py | 4 +- spacy/tokenizer.pyx | 31 ----------- spacy/vocab.pyx | 93 ++++++++++----------------------- 70 files changed, 414 insertions(+), 1677 deletions(-) delete mode 100644 
spacy/lang/defaults.py diff --git a/spacy/default_config.cfg b/spacy/default_config.cfg index 21dbf1798..7ba008fb6 100644 --- a/spacy/default_config.cfg +++ b/spacy/default_config.cfg @@ -1,24 +1,13 @@ [nlp] lang = null -stop_words = [] -lex_attr_getters = {} vocab_data = {} -get_noun_chunks = null pipeline = [] [nlp.tokenizer] @tokenizers = "spacy.Tokenizer.v1" -token_match = null -url_match = {"@language_data": "spacy.xx.url_match"} [nlp.lemmatizer] @lemmatizers = "spacy.Lemmatizer.v1" -data = {} - -[nlp.writing_system] -direction = "ltr" -has_case = true -has_letters = true [components] diff --git a/spacy/lang/af/__init__.py b/spacy/lang/af/__init__.py index ee187ae5a..91917daee 100644 --- a/spacy/lang/af/__init__.py +++ b/spacy/lang/af/__init__.py @@ -1,26 +1,14 @@ -from typing import Set -from thinc.api import Config - from .stop_words import STOP_WORDS from ...language import Language -from ...util import registry -DEFAULT_CONFIG = """ -[nlp] -lang = "af" -stop_words = {"@language_data": "spacy.af.stop_words"} -""" - - -@registry.language_data("spacy.af.stop_words") -def stop_words() -> Set[str]: - return STOP_WORDS +class AfrikaansDefaults(Language.Defaults): + stop_words = STOP_WORDS class Afrikaans(Language): lang = "af" - default_config = Config().from_str(DEFAULT_CONFIG) + Defaults = AfrikaansDefaults __all__ = ["Afrikaans"] diff --git a/spacy/lang/ar/__init__.py b/spacy/lang/ar/__init__.py index f387d0310..6abb65efb 100644 --- a/spacy/lang/ar/__init__.py +++ b/spacy/lang/ar/__init__.py @@ -1,46 +1,21 @@ -from typing import Set, Dict, Callable, Any -from thinc.api import Config - from .stop_words import STOP_WORDS from .lex_attrs import LEX_ATTRS from .punctuation import TOKENIZER_SUFFIXES from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS from ...language import Language -from ...util import registry - - -DEFAULT_CONFIG = """ -[nlp] -lang = "ar" -stop_words = {"@language_data": "spacy.ar.stop_words"} -lex_attr_getters = {"@language_data": "spacy.ar.lex_attr_getters"} - -[nlp.writing_system] -direction = "rtl" -has_case = false -has_letters = true -""" - - -@registry.language_data("spacy.ar.stop_words") -def stop_words() -> Set[str]: - return STOP_WORDS - - -@registry.language_data("spacy.ar.lex_attr_getters") -def lex_attr_getters() -> Dict[int, Callable[[str], Any]]: - return LEX_ATTRS class ArabicDefaults(Language.Defaults): tokenizer_exceptions = TOKENIZER_EXCEPTIONS suffixes = TOKENIZER_SUFFIXES + stop_words = STOP_WORDS + lex_attr_getters = LEX_ATTRS + writing_system = {"direction": "rtl", "has_case": False, "has_letters": True} class Arabic(Language): - lang = "ar" Defaults = ArabicDefaults - default_config = Config().from_str(DEFAULT_CONFIG) + lang = "ar" __all__ = ["Arabic"] diff --git a/spacy/lang/bg/__init__.py b/spacy/lang/bg/__init__.py index 4a31a3653..a30f49ce7 100644 --- a/spacy/lang/bg/__init__.py +++ b/spacy/lang/bg/__init__.py @@ -1,26 +1,14 @@ -from typing import Set -from thinc.api import Config - from .stop_words import STOP_WORDS from ...language import Language -from ...util import registry -DEFAULT_CONFIG = """ -[nlp] -lang = "bg" -stop_words = {"@language_data": "spacy.bg.stop_words"} -""" - - -@registry.language_data("spacy.bg.stop_words") -def stop_words() -> Set[str]: - return STOP_WORDS +class BulgarianDefaults(Language.Defaults): + stop_words = STOP_WORDS class Bulgarian(Language): lang = "bg" - default_config = Config().from_str(DEFAULT_CONFIG) + Defaults = BulgarianDefaults __all__ = ["Bulgarian"] diff --git a/spacy/lang/bn/__init__.py 
b/spacy/lang/bn/__init__.py index da2ca0c8d..6c1d66cba 100644 --- a/spacy/lang/bn/__init__.py +++ b/spacy/lang/bn/__init__.py @@ -1,31 +1,7 @@ -from typing import Set -from thinc.api import Config - from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS from .punctuation import TOKENIZER_PREFIXES, TOKENIZER_SUFFIXES, TOKENIZER_INFIXES from .stop_words import STOP_WORDS from ...language import Language -from ...util import registry - - -DEFAULT_CONFIG = """ -[nlp] -lang = "bn" -stop_words = {"@language_data": "spacy.bn.stop_words"} - -[nlp.lemmatizer] -@lemmatizers = "spacy.Lemmatizer.v1" - -[nlp.lemmatizer.data] -@language_data = "spacy-lookups-data" -lang = ${nlp:lang} -tables = ["lemma_rules"] -""" - - -@registry.language_data("spacy.bn.stop_words") -def stop_words() -> Set[str]: - return STOP_WORDS class BengaliDefaults(Language.Defaults): @@ -33,12 +9,12 @@ class BengaliDefaults(Language.Defaults): prefixes = TOKENIZER_PREFIXES suffixes = TOKENIZER_SUFFIXES infixes = TOKENIZER_INFIXES + stop_words = STOP_WORDS class Bengali(Language): lang = "bn" Defaults = BengaliDefaults - default_config = Config().from_str(DEFAULT_CONFIG) __all__ = ["Bengali"] diff --git a/spacy/lang/ca/__init__.py b/spacy/lang/ca/__init__.py index 1fe7516ad..970b23c1e 100644 --- a/spacy/lang/ca/__init__.py +++ b/spacy/lang/ca/__init__.py @@ -1,49 +1,20 @@ -from typing import Set, Dict, Callable, Any -from thinc.api import Config - from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS +from .punctuation import TOKENIZER_INFIXES from .stop_words import STOP_WORDS from .lex_attrs import LEX_ATTRS from ...language import Language -from ...util import registry -from .punctuation import TOKENIZER_INFIXES - - -DEFAULT_CONFIG = """ -[nlp] -lang = "ca" -stop_words = {"@language_data": "spacy.ca.stop_words"} -lex_attr_getters = {"@language_data": "spacy.ca.lex_attr_getters"} - -[nlp.lemmatizer] -@lemmatizers = "spacy.Lemmatizer.v1" - -[nlp.lemmatizer.data] -@language_data = "spacy-lookups-data" -lang = ${nlp:lang} -tables = ["lemma_lookup"] -""" - - -@registry.language_data("spacy.ca.stop_words") -def stop_words() -> Set[str]: - return STOP_WORDS - - -@registry.language_data("spacy.ca.lex_attr_getters") -def lex_attr_getters() -> Dict[int, Callable[[str], Any]]: - return LEX_ATTRS class CatalanDefaults(Language.Defaults): tokenizer_exceptions = TOKENIZER_EXCEPTIONS infixes = TOKENIZER_INFIXES + stop_words = STOP_WORDS + lex_attr_getters = LEX_ATTRS class Catalan(Language): lang = "ca" Defaults = CatalanDefaults - default_config = Config().from_str(DEFAULT_CONFIG) __all__ = ["Catalan"] diff --git a/spacy/lang/cs/__init__.py b/spacy/lang/cs/__init__.py index f424c83fa..a4b546b13 100644 --- a/spacy/lang/cs/__init__.py +++ b/spacy/lang/cs/__init__.py @@ -1,26 +1,14 @@ -from typing import Set -from thinc.api import Config - from .stop_words import STOP_WORDS from ...language import Language -from ...util import registry -DEFAULT_CONFIG = """ -[nlp] -lang = "cs" -stop_words = {"@language_data": "spacy.cs.stop_words"} -""" - - -@registry.language_data("spacy.cs.stop_words") -def stop_words() -> Set[str]: - return STOP_WORDS +class CzechDefaults(Language.Defaults): + stop_words = STOP_WORDS class Czech(Language): lang = "cs" - default_config = Config().from_str(DEFAULT_CONFIG) + Defaults = CzechDefaults __all__ = ["Czech"] diff --git a/spacy/lang/da/__init__.py b/spacy/lang/da/__init__.py index 4e6ee9383..8cac30b26 100644 --- a/spacy/lang/da/__init__.py +++ b/spacy/lang/da/__init__.py @@ -1,55 +1,21 @@ -from typing import Set, 
Dict, Callable, Any -from thinc.api import Config - from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS from .punctuation import TOKENIZER_INFIXES, TOKENIZER_SUFFIXES from .stop_words import STOP_WORDS from .lex_attrs import LEX_ATTRS from ...language import Language -from ...util import registry - - -DEFAULT_CONFIG = """ -[nlp] -lang = "da" -stop_words = {"@language_data": "spacy.da.stop_words"} -lex_attr_getters = {"@language_data": "spacy.da.lex_attr_getters"} - -[nlp.lemmatizer] -@lemmatizers = "spacy.Lemmatizer.v1" - -[nlp.lemmatizer.data] -@language_data = "spacy-lookups-data" -lang = ${nlp:lang} -tables = ["lemma_lookup"] - -[nlp.vocab_data] -@language_data = "spacy-lookups-data" -lang = ${nlp:lang} -tables = ["lexeme_norm"] -""" - - -@registry.language_data("spacy.da.stop_words") -def stop_words() -> Set[str]: - return STOP_WORDS - - -@registry.language_data("spacy.da.lex_attr_getters") -def lex_attr_getters() -> Dict[int, Callable[[str], Any]]: - return LEX_ATTRS class DanishDefaults(Language.Defaults): tokenizer_exceptions = TOKENIZER_EXCEPTIONS infixes = TOKENIZER_INFIXES suffixes = TOKENIZER_SUFFIXES + lex_attr_getters = LEX_ATTRS + stop_words = STOP_WORDS class Danish(Language): lang = "da" Defaults = DanishDefaults - default_config = Config().from_str(DEFAULT_CONFIG) __all__ = ["Danish"] diff --git a/spacy/lang/de/__init__.py b/spacy/lang/de/__init__.py index 58ee71247..b645d3480 100644 --- a/spacy/lang/de/__init__.py +++ b/spacy/lang/de/__init__.py @@ -1,44 +1,8 @@ -from typing import Set, Callable -from thinc.api import Config - from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS -from .punctuation import TOKENIZER_PREFIXES, TOKENIZER_SUFFIXES -from .punctuation import TOKENIZER_INFIXES +from .punctuation import TOKENIZER_PREFIXES, TOKENIZER_SUFFIXES, TOKENIZER_INFIXES from .stop_words import STOP_WORDS -from .syntax_iterators import noun_chunks +from .syntax_iterators import SYNTAX_ITERATORS from ...language import Language -from ...util import registry - - -DEFAULT_CONFIG = """ -[nlp] -lang = "de" -stop_words = {"@language_data": "spacy.de.stop_words"} -get_noun_chunks = {"@language_data": "spacy.de.get_noun_chunks"} - -[nlp.lemmatizer] -@lemmatizers = "spacy.Lemmatizer.v1" - -[nlp.lemmatizer.data] -@language_data = "spacy-lookups-data" -lang = ${nlp:lang} -tables = ["lemma_lookup"] - -[nlp.vocab_data] -@language_data = "spacy-lookups-data" -lang = ${nlp:lang} -tables = ["lexeme_norm", "lexeme_cluster", "lexeme_prob", "lexeme_settings", "orth_variants"] -""" - - -@registry.language_data("spacy.de.stop_words") -def stop_words() -> Set[str]: - return STOP_WORDS - - -@registry.language_data("spacy.de.get_noun_chunks") -def get_noun_chunks() -> Callable: - return noun_chunks class GermanDefaults(Language.Defaults): @@ -46,12 +10,13 @@ class GermanDefaults(Language.Defaults): prefixes = TOKENIZER_PREFIXES suffixes = TOKENIZER_SUFFIXES infixes = TOKENIZER_INFIXES + syntax_iterators = SYNTAX_ITERATORS + stop_words = STOP_WORDS class German(Language): lang = "de" Defaults = GermanDefaults - default_config = Config().from_str(DEFAULT_CONFIG) __all__ = ["German"] diff --git a/spacy/lang/defaults.py b/spacy/lang/defaults.py deleted file mode 100644 index 6d692d6a5..000000000 --- a/spacy/lang/defaults.py +++ /dev/null @@ -1,9 +0,0 @@ -from typing import Pattern - -from .tokenizer_exceptions import URL_MATCH -from ..util import registry - - -@registry.language_data("spacy.xx.url_match") -def url_match() -> Pattern: - return URL_MATCH diff --git a/spacy/lang/el/__init__.py 
b/spacy/lang/el/__init__.py index defe53891..c766c375e 100644 --- a/spacy/lang/el/__init__.py +++ b/spacy/lang/el/__init__.py @@ -1,69 +1,50 @@ -from typing import Set, Dict, Callable, Any +from typing import Callable from thinc.api import Config from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS from .stop_words import STOP_WORDS from .lex_attrs import LEX_ATTRS from .lemmatizer import GreekLemmatizer -from .syntax_iterators import noun_chunks +from .syntax_iterators import SYNTAX_ITERATORS from .punctuation import TOKENIZER_PREFIXES, TOKENIZER_SUFFIXES, TOKENIZER_INFIXES +from ...lookups import load_lookups from ...language import Language from ...util import registry DEFAULT_CONFIG = """ [nlp] -lang = "el" -stop_words = {"@language_data": "spacy.el.stop_words"} -lex_attr_getters = {"@language_data": "spacy.el.lex_attr_getters"} -get_noun_chunks = {"@language_data": "spacy.el.get_noun_chunks"} [nlp.lemmatizer] -@lemmatizers = "spacy.GreekLemmatizer.v1" - -[nlp.lemmatizer.data] -@language_data = "spacy-lookups-data" -lang = ${nlp:lang} -tables = ["lemma_index", "lemma_exc", "lemma_rules"] - -[nlp.vocab_data] -@language_data = "spacy-lookups-data" -lang = ${nlp:lang} -tables = ["lexeme_norm", "lexeme_prob", "lexeme_settings"] +@lemmatizers = "spacy.el.GreekLemmatizer" """ -@registry.lemmatizers("spacy.GreekLemmatizer.v1") -def create_greek_lemmatizer(data: Dict[str, dict] = {}) -> GreekLemmatizer: - return GreekLemmatizer(data=data) +@registry.lemmatizers("spacy.el.GreekLemmatizer") +def create_lemmatizer() -> Callable[[Language], GreekLemmatizer]: + tables = ["lemma_index", "lemma_exc", "lemma_rules"] + def lemmatizer_factory(nlp: Language) -> GreekLemmatizer: + lookups = load_lookups(lang=nlp.lang, tables=tables) + return GreekLemmatizer(lookups=lookups) -@registry.language_data("spacy.el.get_noun_chunks") -def get_noun_chunks() -> Callable: - return noun_chunks - - -@registry.language_data("spacy.el.stop_words") -def stop_words() -> Set[str]: - return STOP_WORDS - - -@registry.language_data("spacy.el.lex_attr_getters") -def lex_attr_getters() -> Dict[int, Callable[[str], Any]]: - return LEX_ATTRS + return lemmatizer_factory class GreekDefaults(Language.Defaults): + config = Config().from_str(DEFAULT_CONFIG) tokenizer_exceptions = TOKENIZER_EXCEPTIONS prefixes = TOKENIZER_PREFIXES suffixes = TOKENIZER_SUFFIXES infixes = TOKENIZER_INFIXES + lex_attr_getters = LEX_ATTRS + stop_words = STOP_WORDS + syntax_iterators = SYNTAX_ITERATORS class Greek(Language): lang = "el" Defaults = GreekDefaults - default_config = Config().from_str(DEFAULT_CONFIG) __all__ = ["Greek"] diff --git a/spacy/lang/en/__init__.py b/spacy/lang/en/__init__.py index ebe2d1d53..81200da27 100644 --- a/spacy/lang/en/__init__.py +++ b/spacy/lang/en/__init__.py @@ -1,68 +1,49 @@ -from typing import Set, Dict, Callable, Any +from typing import Callable from thinc.api import Config from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS from .stop_words import STOP_WORDS from .lex_attrs import LEX_ATTRS -from .syntax_iterators import noun_chunks +from .syntax_iterators import SYNTAX_ITERATORS from .lemmatizer import is_base_form from .punctuation import TOKENIZER_INFIXES from ...language import Language from ...lemmatizer import Lemmatizer +from ...lookups import load_lookups from ...util import registry DEFAULT_CONFIG = """ [nlp] -lang = "en" -stop_words = {"@language_data": "spacy.en.stop_words"} -lex_attr_getters = {"@language_data": "spacy.en.lex_attr_getters"} -get_noun_chunks = {"@language_data": 
"spacy.en.get_noun_chunks"} [nlp.lemmatizer] -@lemmatizers = "spacy.EnglishLemmatizer.v1" - -[nlp.lemmatizer.data] -@language_data = "spacy-lookups-data" -lang = ${nlp:lang} -tables = ["lemma_lookup", "lemma_rules", "lemma_exc", "lemma_index"] - -[nlp.vocab_data] -@language_data = "spacy-lookups-data" -lang = ${nlp:lang} -tables = ["lexeme_norm", "lexeme_cluster", "lexeme_prob", "lexeme_settings", "orth_variants"] +@lemmatizers = "spacy.en.EnglishLemmatizer" """ -@registry.language_data("spacy.en.stop_words") -def stop_words() -> Set[str]: - return STOP_WORDS +@registry.lemmatizers("spacy.en.EnglishLemmatizer") +def create_lemmatizer() -> Callable[[Language], Lemmatizer]: + tables = ["lemma_lookup", "lemma_rules", "lemma_exc", "lemma_index"] + def lemmatizer_factory(nlp: Language) -> Lemmatizer: + lookups = load_lookups(lang=nlp.lang, tables=tables) + return Lemmatizer(lookups=lookups, is_base_form=is_base_form) -@registry.language_data("spacy.en.lex_attr_getters") -def lex_attr_getters() -> Dict[int, Callable[[str], Any]]: - return LEX_ATTRS - - -@registry.lemmatizers("spacy.EnglishLemmatizer.v1") -def create_lemmatizer(data: Dict[str, dict] = {}) -> "Lemmatizer": - return Lemmatizer(data=data, is_base_form=is_base_form) - - -@registry.language_data("spacy.en.get_noun_chunks") -def get_noun_chunks() -> Callable: - return noun_chunks + return lemmatizer_factory class EnglishDefaults(Language.Defaults): + config = Config().from_str(DEFAULT_CONFIG) tokenizer_exceptions = TOKENIZER_EXCEPTIONS infixes = TOKENIZER_INFIXES + lex_attr_getters = LEX_ATTRS + syntax_iterators = SYNTAX_ITERATORS + stop_words = STOP_WORDS class English(Language): lang = "en" Defaults = EnglishDefaults - default_config = Config().from_str(DEFAULT_CONFIG) __all__ = ["English"] diff --git a/spacy/lang/es/__init__.py b/spacy/lang/es/__init__.py index bc378f3db..9a47855b1 100644 --- a/spacy/lang/es/__init__.py +++ b/spacy/lang/es/__init__.py @@ -1,62 +1,23 @@ -from typing import Set, Dict, Callable, Any -from thinc.config import Config - from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS from .stop_words import STOP_WORDS from .lex_attrs import LEX_ATTRS -from .syntax_iterators import noun_chunks +from .syntax_iterators import SYNTAX_ITERATORS from .punctuation import TOKENIZER_INFIXES, TOKENIZER_SUFFIXES from ...language import Language -from ...util import registry - - -DEFAULT_CONFIG = """ -[nlp] -lang = "es" -stop_words = {"@language_data": "spacy.es.stop_words"} -lex_attr_getters = {"@language_data": "spacy.es.lex_attr_getters"} -get_noun_chunks = {"@language_data": "spacy.es.get_noun_chunks"} - -[nlp.lemmatizer] -@lemmatizers = "spacy.Lemmatizer.v1" - -[nlp.lemmatizer.data] -@language_data = "spacy-lookups-data" -lang = ${nlp:lang} -tables = ["lemma_lookup"] - -[nlp.vocab_data] -@language_data = "spacy-lookups-data" -lang = ${nlp:lang} -tables = ["lexeme_cluster", "lexeme_prob", "lexeme_settings"] -""" - - -@registry.language_data("spacy.es.get_noun_chunks") -def get_noun_chunks() -> Callable: - return noun_chunks - - -@registry.language_data("spacy.es.stop_words") -def stop_words() -> Set[str]: - return STOP_WORDS - - -@registry.language_data("spacy.es.lex_attr_getters") -def lex_attr_getters() -> Dict[int, Callable[[str], Any]]: - return LEX_ATTRS class SpanishDefaults(Language.Defaults): tokenizer_exceptions = TOKENIZER_EXCEPTIONS infixes = TOKENIZER_INFIXES suffixes = TOKENIZER_SUFFIXES + lex_attr_getters = LEX_ATTRS + syntax_iterators = SYNTAX_ITERATORS + stop_words = STOP_WORDS class Spanish(Language): lang 
= "es" Defaults = SpanishDefaults - default_config = Config().from_str(DEFAULT_CONFIG) __all__ = ["Spanish"] diff --git a/spacy/lang/et/__init__.py b/spacy/lang/et/__init__.py index 38da9ab1e..9f71882d2 100644 --- a/spacy/lang/et/__init__.py +++ b/spacy/lang/et/__init__.py @@ -1,26 +1,14 @@ -from typing import Set -from thinc.api import Config - from .stop_words import STOP_WORDS from ...language import Language -from ...util import registry -DEFAULT_CONFIG = """ -[nlp] -lang = "et" -stop_words = {"@language_data": "spacy.et.stop_words"} -""" - - -@registry.language_data("spacy.et.stop_words") -def stop_words() -> Set[str]: - return STOP_WORDS +class EstonianDefaults(Language.Defaults): + stop_words = STOP_WORDS class Estonian(Language): lang = "et" - default_config = Config().from_str(DEFAULT_CONFIG) + Defaults = EstonianDefaults __all__ = ["Estonian"] diff --git a/spacy/lang/eu/__init__.py b/spacy/lang/eu/__init__.py index 4df50bca5..89550be96 100644 --- a/spacy/lang/eu/__init__.py +++ b/spacy/lang/eu/__init__.py @@ -1,41 +1,18 @@ -from typing import Set, Dict, Callable, Any -from thinc.api import Config - from .stop_words import STOP_WORDS from .lex_attrs import LEX_ATTRS from .punctuation import TOKENIZER_SUFFIXES -from ..tokenizer_exceptions import BASE_EXCEPTIONS from ...language import Language -from ...util import registry - - -DEFAULT_CONFIG = """ -[nlp] -lang = "eu" -stop_words = {"@language_data": "spacy.eu.stop_words"} -lex_attr_getters = {"@language_data": "spacy.eu.lex_attr_getters"} -""" - - -@registry.language_data("spacy.eu.stop_words") -def stop_words() -> Set[str]: - return STOP_WORDS - - -@registry.language_data("spacy.eu.lex_attr_getters") -def lex_attr_getters() -> Dict[int, Callable[[str], Any]]: - return LEX_ATTRS class BasqueDefaults(Language.Defaults): - tokenizer_exceptions = BASE_EXCEPTIONS suffixes = TOKENIZER_SUFFIXES + stop_words = STOP_WORDS + lex_attr_getters = LEX_ATTRS class Basque(Language): lang = "eu" Defaults = BasqueDefaults - default_config = Config().from_str(DEFAULT_CONFIG) __all__ = ["Basque"] diff --git a/spacy/lang/fa/__init__.py b/spacy/lang/fa/__init__.py index a1ab0712f..7fdb9d065 100644 --- a/spacy/lang/fa/__init__.py +++ b/spacy/lang/fa/__init__.py @@ -1,61 +1,23 @@ -from typing import Set, Dict, Callable, Any -from thinc.api import Config - -from ...language import Language -from ...util import registry from .stop_words import STOP_WORDS from .lex_attrs import LEX_ATTRS from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS from .punctuation import TOKENIZER_SUFFIXES -from .syntax_iterators import noun_chunks - - -DEFAULT_CONFIG = """ -[nlp] -lang = "fa" -stop_words = {"@language_data": "spacy.fa.stop_words"} -lex_attr_getters = {"@language_data": "spacy.fa.lex_attr_getters"} -get_noun_chunks = {"@language_data": "spacy.de.get_noun_chunks"} - -[nlp.writing_system] -direction = "rtl" -has_case = false -has_letters = true - -[nlp.lemmatizer] -@lemmatizers = "spacy.Lemmatizer.v1" - -[nlp.lemmatizer.data] -@language_data = "spacy-lookups-data" -lang = ${nlp:lang} -tables = ["lemma_rules", "lemma_index", "lemma_exc"] -""" - - -@registry.language_data("spacy.fa.stop_words") -def stop_words() -> Set[str]: - return STOP_WORDS - - -@registry.language_data("spacy.fa.lex_attr_getters") -def lex_attr_getters() -> Dict[int, Callable[[str], Any]]: - return LEX_ATTRS - - -@registry.language_data("spacy.fa.get_noun_chunks") -def get_noun_chunks() -> Callable: - return noun_chunks +from .syntax_iterators import SYNTAX_ITERATORS +from ...language import 
Language class PersianDefaults(Language.Defaults): tokenizer_exceptions = TOKENIZER_EXCEPTIONS suffixes = TOKENIZER_SUFFIXES + lex_attr_getters = LEX_ATTRS + syntax_iterators = SYNTAX_ITERATORS + stop_words = STOP_WORDS + writing_system = {"direction": "rtl", "has_case": False, "has_letters": True} class Persian(Language): lang = "fa" Defaults = PersianDefaults - default_config = Config().from_str(DEFAULT_CONFIG) __all__ = ["Persian"] diff --git a/spacy/lang/fi/__init__.py b/spacy/lang/fi/__init__.py index 33313aeb6..9233c6547 100644 --- a/spacy/lang/fi/__init__.py +++ b/spacy/lang/fi/__init__.py @@ -1,42 +1,21 @@ -from typing import Set, Dict, Callable, Any -from thinc.api import Config - from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS from .stop_words import STOP_WORDS from .lex_attrs import LEX_ATTRS from .punctuation import TOKENIZER_INFIXES, TOKENIZER_SUFFIXES from ...language import Language -from ...util import registry - - -DEFAULT_CONFIG = """ -[nlp] -lang = "fi" -stop_words = {"@language_data": "spacy.fi.stop_words"} -lex_attr_getters = {"@language_data": "spacy.fi.lex_attr_getters"} -""" - - -@registry.language_data("spacy.fi.stop_words") -def stop_words() -> Set[str]: - return STOP_WORDS - - -@registry.language_data("spacy.fi.lex_attr_getters") -def lex_attr_getters() -> Dict[int, Callable[[str], Any]]: - return LEX_ATTRS class FinnishDefaults(Language.Defaults): infixes = TOKENIZER_INFIXES suffixes = TOKENIZER_SUFFIXES tokenizer_exceptions = TOKENIZER_EXCEPTIONS + lex_attr_getters = LEX_ATTRS + stop_words = STOP_WORDS class Finnish(Language): lang = "fi" Defaults = FinnishDefaults - default_config = Config().from_str(DEFAULT_CONFIG) __all__ = ["Finnish"] diff --git a/spacy/lang/fr/__init__.py b/spacy/lang/fr/__init__.py index 41014aa34..a5350d422 100644 --- a/spacy/lang/fr/__init__.py +++ b/spacy/lang/fr/__init__.py @@ -1,4 +1,4 @@ -from typing import Set, Dict, Callable, Any, Pattern +from typing import Callable from thinc.api import Config from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS, TOKEN_MATCH @@ -6,69 +6,47 @@ from .punctuation import TOKENIZER_PREFIXES, TOKENIZER_INFIXES from .punctuation import TOKENIZER_SUFFIXES from .stop_words import STOP_WORDS from .lex_attrs import LEX_ATTRS +from .syntax_iterators import SYNTAX_ITERATORS from .lemmatizer import FrenchLemmatizer, is_base_form -from .syntax_iterators import noun_chunks +from ...lookups import load_lookups from ...language import Language from ...util import registry DEFAULT_CONFIG = """ [nlp] -lang = "fr" -stop_words = {"@language_data": "spacy.fr.stop_words"} -lex_attr_getters = {"@language_data": "spacy.fr.lex_attr_getters"} -get_noun_chunks = {"@language_data": "spacy.fr.get_noun_chunks"} - -[nlp.tokenizer] -@tokenizers = "spacy.Tokenizer.v1" -token_match = {"@language_data": "spacy.fr.token_match"} [nlp.lemmatizer] -@lemmatizers = "spacy.FrenchLemmatizer.v1" - -[nlp.lemmatizer.data] -@language_data = "spacy-lookups-data" -lang = ${nlp:lang} -tables = ["lemma_rules", "lemma_index", "lemma_exc", "lemma_lookup"] +@lemmatizers = "spacy.fr.FrenchLemmatizer" """ -@registry.lemmatizers("spacy.FrenchLemmatizer.v1") -def create_french_lemmatizer(data: Dict[str, dict] = {}) -> FrenchLemmatizer: - return FrenchLemmatizer(data=data, is_base_form=is_base_form) +@registry.lemmatizers("spacy.fr.FrenchLemmatizer") +def create_lemmatizer() -> Callable[[Language], FrenchLemmatizer]: + tables = ["lemma_rules", "lemma_index", "lemma_exc", "lemma_lookup"] + def lemmatizer_factory(nlp: Language) -> FrenchLemmatizer: 
+ lookups = load_lookups(lang=nlp.lang, tables=tables) + return FrenchLemmatizer(lookups=lookups, is_base_form=is_base_form) -@registry.language_data("spacy.fr.token_match") -def token_match() -> Pattern: - return TOKEN_MATCH - - -@registry.language_data("spacy.fr.stop_words") -def stop_words() -> Set[str]: - return STOP_WORDS - - -@registry.language_data("spacy.fr.lex_attr_getters") -def lex_attr_getters() -> Dict[int, Callable[[str], Any]]: - return LEX_ATTRS - - -@registry.language_data("spacy.fr.get_noun_chunks") -def get_noun_chunks() -> Callable: - return noun_chunks + return lemmatizer_factory class FrenchDefaults(Language.Defaults): + config = Config().from_str(DEFAULT_CONFIG) tokenizer_exceptions = TOKENIZER_EXCEPTIONS prefixes = TOKENIZER_PREFIXES infixes = TOKENIZER_INFIXES suffixes = TOKENIZER_SUFFIXES + token_match = TOKEN_MATCH + lex_attr_getters = LEX_ATTRS + syntax_iterators = SYNTAX_ITERATORS + stop_words = STOP_WORDS class French(Language): lang = "fr" Defaults = FrenchDefaults - default_config = Config().from_str(DEFAULT_CONFIG) __all__ = ["French"] diff --git a/spacy/lang/ga/__init__.py b/spacy/lang/ga/__init__.py index 3c13f56fb..80131368b 100644 --- a/spacy/lang/ga/__init__.py +++ b/spacy/lang/ga/__init__.py @@ -1,32 +1,16 @@ -from typing import Set -from thinc.api import Config - from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS from .stop_words import STOP_WORDS from ...language import Language -from ...util import registry - - -DEFAULT_CONFIG = """ -[nlp] -lang = "ga" -stop_words = {"@language_data": "spacy.ga.stop_words"} -""" - - -@registry.language_data("spacy.ga.stop_words") -def stop_words() -> Set[str]: - return STOP_WORDS class IrishDefaults(Language.Defaults): tokenizer_exceptions = TOKENIZER_EXCEPTIONS + stop_words = STOP_WORDS class Irish(Language): lang = "ga" Defaults = IrishDefaults - default_config = Config().from_str(DEFAULT_CONFIG) __all__ = ["Irish"] diff --git a/spacy/lang/gu/__init__.py b/spacy/lang/gu/__init__.py index 3ca8bbd4c..67228ac40 100644 --- a/spacy/lang/gu/__init__.py +++ b/spacy/lang/gu/__init__.py @@ -1,26 +1,14 @@ -from typing import Set -from thinc.api import Config - from .stop_words import STOP_WORDS from ...language import Language -from ...util import registry -DEFAULT_CONFIG = """ -[nlp] -lang = "gu" -stop_words = {"@language_data": "spacy.gu.stop_words"} -""" - - -@registry.language_data("spacy.gu.stop_words") -def stop_words() -> Set[str]: - return STOP_WORDS +class GujaratiDefaults(Language.Defaults): + stop_words = STOP_WORDS class Gujarati(Language): lang = "gu" - default_config = Config().from_str(DEFAULT_CONFIG) + Defaults = GujaratiDefaults __all__ = ["Gujarati"] diff --git a/spacy/lang/he/__init__.py b/spacy/lang/he/__init__.py index cd07d405e..70bd9cf45 100644 --- a/spacy/lang/he/__init__.py +++ b/spacy/lang/he/__init__.py @@ -1,37 +1,15 @@ -from typing import Set -from thinc.api import Config - from .stop_words import STOP_WORDS -from ..tokenizer_exceptions import BASE_EXCEPTIONS from ...language import Language -from ...util import registry - - -DEFAULT_CONFIG = """ -[nlp] -lang = "he" -stop_words = {"@language_data": "spacy.he.stop_words"} - -[nlp.writing_system] -direction = "rtl" -has_case = false -has_letters = true -""" - - -@registry.language_data("spacy.he.stop_words") -def stop_words() -> Set[str]: - return STOP_WORDS class HebrewDefaults(Language.Defaults): - tokenizer_exceptions = BASE_EXCEPTIONS + stop_words = STOP_WORDS + writing_system = {"direction": "rtl", "has_case": False, "has_letters": True} 
class Hebrew(Language): lang = "he" Defaults = HebrewDefaults - default_config = Config().from_str(DEFAULT_CONFIG) __all__ = ["Hebrew"] diff --git a/spacy/lang/hi/__init__.py b/spacy/lang/hi/__init__.py index 48890c4f9..384f040c8 100644 --- a/spacy/lang/hi/__init__.py +++ b/spacy/lang/hi/__init__.py @@ -1,33 +1,16 @@ -from typing import Set, Dict, Callable, Any -from thinc.api import Config - from .stop_words import STOP_WORDS from .lex_attrs import LEX_ATTRS from ...language import Language -from ...util import registry -DEFAULT_CONFIG = """ -[nlp] -lang = "hi" -stop_words = {"@language_data": "spacy.hi.stop_words"} -lex_attr_getters = {"@language_data": "spacy.hi.lex_attr_getters"} -""" - - -@registry.language_data("spacy.hi.stop_words") -def stop_words() -> Set[str]: - return STOP_WORDS - - -@registry.language_data("spacy.hi.lex_attr_getters") -def lex_attr_getters() -> Dict[int, Callable[[str], Any]]: - return LEX_ATTRS +class HindiDefaults(Language.Defaults): + stop_words = STOP_WORDS + lex_attr_getters = LEX_ATTRS class Hindi(Language): lang = "hi" - default_config = Config().from_str(DEFAULT_CONFIG) + Defaults = HindiDefaults __all__ = ["Hindi"] diff --git a/spacy/lang/hr/__init__.py b/spacy/lang/hr/__init__.py index 54c1a8f1f..118e0946a 100644 --- a/spacy/lang/hr/__init__.py +++ b/spacy/lang/hr/__init__.py @@ -1,40 +1,14 @@ -from typing import Set -from thinc.api import Config - from .stop_words import STOP_WORDS -from ..tokenizer_exceptions import BASE_EXCEPTIONS from ...language import Language -from ...util import registry - - -DEFAULT_CONFIG = """ -[nlp] -lang = "hr" -stop_words = {"@language_data": "spacy.hr.stop_words"} - -[nlp.lemmatizer] -@lemmatizers = "spacy.Lemmatizer.v1" - -[nlp.lemmatizer.data] -@language_data = "spacy-lookups-data" -lang = ${nlp:lang} -tables = ["lemma_lookup"] -""" - - -@registry.language_data("spacy.hr.stop_words") -def stop_words() -> Set[str]: - return STOP_WORDS class CroatianDefaults(Language.Defaults): - tokenizer_exceptions = BASE_EXCEPTIONS + stop_words = STOP_WORDS class Croatian(Language): lang = "hr" Defaults = CroatianDefaults - default_config = Config().from_str(DEFAULT_CONFIG) __all__ = ["Croatian"] diff --git a/spacy/lang/hu/__init__.py b/spacy/lang/hu/__init__.py index b9f5a5c34..8962603a6 100644 --- a/spacy/lang/hu/__init__.py +++ b/spacy/lang/hu/__init__.py @@ -1,40 +1,7 @@ -from typing import Set, Pattern -from thinc.api import Config - from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS, TOKEN_MATCH from .punctuation import TOKENIZER_PREFIXES, TOKENIZER_SUFFIXES, TOKENIZER_INFIXES from .stop_words import STOP_WORDS from ...language import Language -from ...util import registry - - -DEFAULT_CONFIG = """ -[nlp] -lang = "hu" -stop_words = {"@language_data": "spacy.hu.stop_words"} - -[nlp.tokenizer] -@tokenizers = "spacy.Tokenizer.v1" -token_match = {"@language_data": "spacy.hu.token_match"} - -[nlp.lemmatizer] -@lemmatizers = "spacy.Lemmatizer.v1" - -[nlp.lemmatizer.data] -@language_data = "spacy-lookups-data" -lang = ${nlp:lang} -tables = ["lemma_lookup"] -""" - - -@registry.language_data("spacy.hu.stop_words") -def stop_words() -> Set[str]: - return STOP_WORDS - - -@registry.language_data("spacy.hu.token_match") -def token_match() -> Pattern: - return TOKEN_MATCH class HungarianDefaults(Language.Defaults): @@ -42,12 +9,13 @@ class HungarianDefaults(Language.Defaults): prefixes = TOKENIZER_PREFIXES suffixes = TOKENIZER_SUFFIXES infixes = TOKENIZER_INFIXES + token_match = TOKEN_MATCH + stop_words = STOP_WORDS class 
Hungarian(Language): lang = "hu" Defaults = HungarianDefaults - default_config = Config().from_str(DEFAULT_CONFIG) __all__ = ["Hungarian"] diff --git a/spacy/lang/hy/__init__.py b/spacy/lang/hy/__init__.py index 33bb8d08a..4577ab641 100644 --- a/spacy/lang/hy/__init__.py +++ b/spacy/lang/hy/__init__.py @@ -1,33 +1,16 @@ -from typing import Set, Dict, Callable, Any -from thinc.api import Config - from .stop_words import STOP_WORDS from .lex_attrs import LEX_ATTRS from ...language import Language -from ...util import registry -DEFAULT_CONFIG = """ -[nlp] -lang = "hy" -stop_words = {"@language_data": "spacy.hy.stop_words"} -lex_attr_getters = {"@language_data": "spacy.hy.lex_attr_getters"} -""" - - -@registry.language_data("spacy.hy.stop_words") -def stop_words() -> Set[str]: - return STOP_WORDS - - -@registry.language_data("spacy.hy.lex_attr_getters") -def lex_attr_getters() -> Dict[int, Callable[[str], Any]]: - return LEX_ATTRS +class ArmenianDefaults(Language.Defaults): + lex_attr_getters = LEX_ATTRS + stop_words = STOP_WORDS class Armenian(Language): lang = "hy" - default_config = Config().from_str(DEFAULT_CONFIG) + Defaults = ArmenianDefaults __all__ = ["Armenian"] diff --git a/spacy/lang/id/__init__.py b/spacy/lang/id/__init__.py index ecefd0a66..87373551c 100644 --- a/spacy/lang/id/__init__.py +++ b/spacy/lang/id/__init__.py @@ -1,50 +1,9 @@ -from typing import Set, Dict, Callable, Any -from thinc.config import Config - from .stop_words import STOP_WORDS from .punctuation import TOKENIZER_SUFFIXES, TOKENIZER_PREFIXES, TOKENIZER_INFIXES from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS from .lex_attrs import LEX_ATTRS -from .syntax_iterators import noun_chunks +from .syntax_iterators import SYNTAX_ITERATORS from ...language import Language -from ...util import registry - - -DEFAULT_CONFIG = """ -[nlp] -lang = "id" -stop_words = {"@language_data": "spacy.id.stop_words"} -lex_attr_getters = {"@language_data": "spacy.id.lex_attr_getters"} -get_noun_chunks = {"@language_data": "spacy.id.get_noun_chunks"} - -[nlp.lemmatizer] -@lemmatizers = "spacy.Lemmatizer.v1" - -[nlp.lemmatizer.data] -@language_data = "spacy-lookups-data" -lang = ${nlp:lang} -tables = ["lemma_lookup"] - -[nlp.vocab_data] -@language_data = "spacy-lookups-data" -lang = ${nlp:lang} -tables = ["lexeme_norm"] -""" - - -@registry.language_data("spacy.id.stop_words") -def stop_words() -> Set[str]: - return STOP_WORDS - - -@registry.language_data("spacy.id.lex_attr_getters") -def lex_attr_getters() -> Dict[int, Callable[[str], Any]]: - return LEX_ATTRS - - -@registry.language_data("spacy.id.get_noun_chunks") -def get_noun_chunks() -> Callable: - return noun_chunks class IndonesianDefaults(Language.Defaults): @@ -52,12 +11,14 @@ class IndonesianDefaults(Language.Defaults): prefixes = TOKENIZER_PREFIXES suffixes = TOKENIZER_SUFFIXES infixes = TOKENIZER_INFIXES + syntax_iterators = SYNTAX_ITERATORS + lex_attr_getters = LEX_ATTRS + stop_words = STOP_WORDS class Indonesian(Language): lang = "id" Defaults = IndonesianDefaults - default_config = Config().from_str(DEFAULT_CONFIG) __all__ = ["Indonesian"] diff --git a/spacy/lang/is/__init__.py b/spacy/lang/is/__init__.py index 82fc7e0c2..be5de5981 100644 --- a/spacy/lang/is/__init__.py +++ b/spacy/lang/is/__init__.py @@ -1,26 +1,14 @@ -from typing import Set -from thinc.api import Config - from .stop_words import STOP_WORDS from ...language import Language -from ...util import registry -DEFAULT_CONFIG = """ -[nlp] -lang = "is" -stop_words = {"@language_data": "spacy.is.stop_words"} 
-""" - - -@registry.language_data("spacy.is.stop_words") -def stop_words() -> Set[str]: - return STOP_WORDS +class IcelandicDefaults(Language.Defaults): + stop_words = STOP_WORDS class Icelandic(Language): lang = "is" - default_config = Config().from_str(DEFAULT_CONFIG) + Defaults = IcelandicDefaults __all__ = ["Icelandic"] diff --git a/spacy/lang/it/__init__.py b/spacy/lang/it/__init__.py index 107018392..25cbaa651 100644 --- a/spacy/lang/it/__init__.py +++ b/spacy/lang/it/__init__.py @@ -1,31 +1,7 @@ -from typing import Set -from thinc.api import Config - from .stop_words import STOP_WORDS from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS from .punctuation import TOKENIZER_PREFIXES, TOKENIZER_INFIXES from ...language import Language -from ...util import registry - - -DEFAULT_CONFIG = """ -[nlp] -lang = "it" -stop_words = {"@language_data": "spacy.it.stop_words"} - -[nlp.lemmatizer] -@lemmatizers = "spacy.Lemmatizer.v1" - -[nlp.lemmatizer.data] -@language_data = "spacy-lookups-data" -lang = ${nlp:lang} -tables = ["lemma_lookup"] -""" - - -@registry.language_data("spacy.it.stop_words") -def stop_words() -> Set[str]: - return STOP_WORDS class ItalianDefaults(Language.Defaults): @@ -38,7 +14,6 @@ class ItalianDefaults(Language.Defaults): class Italian(Language): lang = "it" Defaults = ItalianDefaults - default_config = Config().from_str(DEFAULT_CONFIG) __all__ = ["Italian"] diff --git a/spacy/lang/ja/__init__.py b/spacy/lang/ja/__init__.py index 642b59a4b..d435afe12 100644 --- a/spacy/lang/ja/__init__.py +++ b/spacy/lang/ja/__init__.py @@ -1,11 +1,11 @@ -from typing import Optional, Union, Dict, Any, Set, Callable +from typing import Optional, Union, Dict, Any from pathlib import Path import srsly from collections import namedtuple from thinc.api import Config from .stop_words import STOP_WORDS -from .syntax_iterators import noun_chunks +from .syntax_iterators import SYNTAX_ITERATORS from .tag_map import TAG_MAP from .tag_orth_map import TAG_ORTH_MAP from .tag_bigram_map import TAG_BIGRAM_MAP @@ -20,33 +20,15 @@ from ... 
import util DEFAULT_CONFIG = """ [nlp] -lang = "ja" -stop_words = {"@language_data": "spacy.ja.stop_words"} -get_noun_chunks = {"@language_data": "spacy.ja.get_noun_chunks"} [nlp.tokenizer] -@tokenizers = "spacy.JapaneseTokenizer.v1" +@tokenizers = "spacy.ja.JapaneseTokenizer" split_mode = null - -[nlp.writing_system] -direction = "ltr" -has_case = false -has_letters = false """ -@registry.language_data("spacy.ja.stop_words") -def stop_words() -> Set[str]: - return STOP_WORDS - - -@registry.language_data("spacy.ja.get_noun_chunks") -def get_noun_chunks() -> Callable: - return noun_chunks - - -@registry.tokenizers("spacy.JapaneseTokenizer.v1") -def create_japanese_tokenizer(split_mode: Optional[str] = None): +@registry.tokenizers("spacy.ja.JapaneseTokenizer") +def create_tokenizer(split_mode: Optional[str] = None): def japanese_tokenizer_factory(nlp): return JapaneseTokenizer(nlp, split_mode=split_mode) @@ -179,9 +161,16 @@ class JapaneseTokenizer(DummyTokenizer): return self +class JapaneseDefaults(Language.Defaults): + config = Config().from_str(DEFAULT_CONFIG) + stop_words = STOP_WORDS + syntax_iterators = SYNTAX_ITERATORS + writing_system = {"direction": "ltr", "has_case": False, "has_letters": False} + + class Japanese(Language): lang = "ja" - default_config = Config().from_str(DEFAULT_CONFIG) + Defaults = JapaneseDefaults # Hold the attributes we need with convenient names diff --git a/spacy/lang/kn/__init__.py b/spacy/lang/kn/__init__.py index c323ca5c7..8e53989e6 100644 --- a/spacy/lang/kn/__init__.py +++ b/spacy/lang/kn/__init__.py @@ -1,26 +1,14 @@ -from typing import Set -from thinc.api import Config - from .stop_words import STOP_WORDS from ...language import Language -from ...util import registry -DEFAULT_CONFIG = """ -[nlp] -lang = "kn" -stop_words = {"@language_data": "spacy.kn.stop_words"} -""" - - -@registry.language_data("spacy.kn.stop_words") -def stop_words() -> Set[str]: - return STOP_WORDS +class KannadaDefaults(Language.Defaults): + stop_words = STOP_WORDS class Kannada(Language): lang = "kn" - default_config = Config().from_str(DEFAULT_CONFIG) + Defaults = KannadaDefaults __all__ = ["Kannada"] diff --git a/spacy/lang/ko/__init__.py b/spacy/lang/ko/__init__.py index 83cd44ded..d2af9c4b1 100644 --- a/spacy/lang/ko/__init__.py +++ b/spacy/lang/ko/__init__.py @@ -1,4 +1,4 @@ -from typing import Set, Optional, Any, Dict +from typing import Optional, Any, Dict from thinc.api import Config from .stop_words import STOP_WORDS @@ -11,26 +11,14 @@ from ...util import DummyTokenizer, registry DEFAULT_CONFIG = """ [nlp] -lang = "ko" -stop_words = {"@language_data": "spacy.ko.stop_words"} [nlp.tokenizer] -@tokenizers = "spacy.KoreanTokenizer.v1" - -[nlp.writing_system] -direction = "ltr" -has_case = false -has_letters = false +@tokenizers = "spacy.ko.KoreanTokenizer" """ -@registry.language_data("spacy.ko.stop_words") -def stop_words() -> Set[str]: - return STOP_WORDS - - -@registry.tokenizers("spacy.KoreanTokenizer.v1") -def create_korean_tokenizer(): +@registry.tokenizers("spacy.ko.KoreanTokenizer") +def create_tokenizer(): def korean_tokenizer_factory(nlp): return KoreanTokenizer(nlp) @@ -74,9 +62,15 @@ class KoreanTokenizer(DummyTokenizer): yield {"surface": surface, "lemma": lemma, "tag": tag} +class KoreanDefaults(Language.Defaults): + config = Config().from_str(DEFAULT_CONFIG) + stop_words = STOP_WORDS + writing_system = {"direction": "ltr", "has_case": False, "has_letters": False} + + class Korean(Language): lang = "ko" - default_config = Config().from_str(DEFAULT_CONFIG) 
+ Defaults = KoreanDefaults def try_mecab_import() -> None: diff --git a/spacy/lang/lb/__init__.py b/spacy/lang/lb/__init__.py index 56b09208f..da6fe55d7 100644 --- a/spacy/lang/lb/__init__.py +++ b/spacy/lang/lb/__init__.py @@ -1,54 +1,20 @@ -from typing import Set, Dict, Callable, Any -from thinc.api import Config - from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS from .punctuation import TOKENIZER_INFIXES from .lex_attrs import LEX_ATTRS from .stop_words import STOP_WORDS from ...language import Language -from ...util import registry - - -DEFAULT_CONFIG = """ -[nlp] -lang = "lb" -stop_words = {"@language_data": "spacy.lb.stop_words"} -lex_attr_getters = {"@language_data": "spacy.lb.lex_attr_getters"} - -[nlp.lemmatizer] -@lemmatizers = "spacy.Lemmatizer.v1" - -[nlp.lemmatizer.data] -@language_data = "spacy-lookups-data" -lang = ${nlp:lang} -tables = ["lemma_lookup"] - -[nlp.vocab_data] -@language_data = "spacy-lookups-data" -lang = ${nlp:lang} -tables = ["lexeme_norm"] -""" - - -@registry.language_data("spacy.lb.stop_words") -def stop_words() -> Set[str]: - return STOP_WORDS - - -@registry.language_data("spacy.lb.lex_attr_getters") -def lex_attr_getters() -> Dict[int, Callable[[str], Any]]: - return LEX_ATTRS class LuxembourgishDefaults(Language.Defaults): tokenizer_exceptions = TOKENIZER_EXCEPTIONS infixes = TOKENIZER_INFIXES + lex_attr_getters = LEX_ATTRS + stop_words = STOP_WORDS class Luxembourgish(Language): lang = "lb" Defaults = LuxembourgishDefaults - default_config = Config().from_str(DEFAULT_CONFIG) __all__ = ["Luxembourgish"] diff --git a/spacy/lang/lij/__init__.py b/spacy/lang/lij/__init__.py index 02f9a72b6..5ae280324 100644 --- a/spacy/lang/lij/__init__.py +++ b/spacy/lang/lij/__init__.py @@ -1,34 +1,18 @@ -from typing import Set -from thinc.api import Config - from .stop_words import STOP_WORDS from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS from .punctuation import TOKENIZER_INFIXES from ...language import Language -from ...util import registry - - -DEFAULT_CONFIG = """ -[nlp] -lang = "lij" -stop_words = {"@language_data": "spacy.lij.stop_words"} -""" - - -@registry.language_data("spacy.lij.stop_words") -def stop_words() -> Set[str]: - return STOP_WORDS class LigurianDefaults(Language.Defaults): tokenizer_exceptions = TOKENIZER_EXCEPTIONS infixes = TOKENIZER_INFIXES + stop_words = STOP_WORDS class Ligurian(Language): lang = "lij" Defaults = LigurianDefaults - default_config = Config().from_str(DEFAULT_CONFIG) __all__ = ["Ligurian"] diff --git a/spacy/lang/lt/__init__.py b/spacy/lang/lt/__init__.py index e82c4c4e0..e395a8f62 100644 --- a/spacy/lang/lt/__init__.py +++ b/spacy/lang/lt/__init__.py @@ -1,50 +1,21 @@ -from typing import Set, Dict, Callable, Any -from thinc.api import Config - from .punctuation import TOKENIZER_INFIXES, TOKENIZER_SUFFIXES from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS from .stop_words import STOP_WORDS from .lex_attrs import LEX_ATTRS from ...language import Language -from ...util import registry - - -DEFAULT_CONFIG = """ -[nlp] -lang = "lt" -stop_words = {"@language_data": "spacy.lt.stop_words"} -lex_attr_getters = {"@language_data": "spacy.lt.lex_attr_getters"} - -[nlp.lemmatizer] -@lemmatizers = "spacy.Lemmatizer.v1" - -[nlp.lemmatizer.data] -@language_data = "spacy-lookups-data" -lang = ${nlp:lang} -tables = ["lemma_lookup"] -""" - - -@registry.language_data("spacy.lt.stop_words") -def stop_words() -> Set[str]: - return STOP_WORDS - - -@registry.language_data("spacy.lt.lex_attr_getters") -def lex_attr_getters() -> 
Dict[int, Callable[[str], Any]]: - return LEX_ATTRS class LithuanianDefaults(Language.Defaults): infixes = TOKENIZER_INFIXES suffixes = TOKENIZER_SUFFIXES tokenizer_exceptions = TOKENIZER_EXCEPTIONS + stop_words = STOP_WORDS + lex_attr_getters = LEX_ATTRS class Lithuanian(Language): lang = "lt" Defaults = LithuanianDefaults - default_config = Config().from_str(DEFAULT_CONFIG) __all__ = ["Lithuanian"] diff --git a/spacy/lang/lv/__init__.py b/spacy/lang/lv/__init__.py index e37b44b0d..142bc706e 100644 --- a/spacy/lang/lv/__init__.py +++ b/spacy/lang/lv/__init__.py @@ -1,26 +1,14 @@ -from typing import Set -from thinc.api import Config - from .stop_words import STOP_WORDS from ...language import Language -from ...util import registry -DEFAULT_CONFIG = """ -[nlp] -lang = "lv" -stop_words = {"@language_data": "spacy.lv.stop_words"} -""" - - -@registry.language_data("spacy.lv.stop_words") -def stop_words() -> Set[str]: - return STOP_WORDS +class LatvianDefaults(Language.Defaults): + stop_words = STOP_WORDS class Latvian(Language): lang = "lv" - default_config = Config().from_str(DEFAULT_CONFIG) + Defaults = LatvianDefaults __all__ = ["Latvian"] diff --git a/spacy/lang/ml/__init__.py b/spacy/lang/ml/__init__.py index e2ac0a641..166d0e061 100644 --- a/spacy/lang/ml/__init__.py +++ b/spacy/lang/ml/__init__.py @@ -1,26 +1,14 @@ -from typing import Set -from thinc.api import Config - from .stop_words import STOP_WORDS from ...language import Language -from ...util import registry -DEFAULT_CONFIG = """ -[nlp] -lang = "ml" -stop_words = {"@language_data": "spacy.ml.stop_words"} -""" - - -@registry.language_data("spacy.ml.stop_words") -def stop_words() -> Set[str]: - return STOP_WORDS +class MalayalamDefaults(Language.Defaults): + stop_words = STOP_WORDS class Malayalam(Language): lang = "ml" - default_config = Config().from_str(DEFAULT_CONFIG) + Defaults = MalayalamDefaults __all__ = ["Malayalam"] diff --git a/spacy/lang/mr/__init__.py b/spacy/lang/mr/__init__.py index 3d7c621cb..af0c49878 100644 --- a/spacy/lang/mr/__init__.py +++ b/spacy/lang/mr/__init__.py @@ -1,26 +1,14 @@ -from typing import Set -from thinc.api import Config - from .stop_words import STOP_WORDS from ...language import Language -from ...util import registry -DEFAULT_CONFIG = """ -[nlp] -lang = "af" -stop_words = {"@language_data": "spacy.mr.stop_words"} -""" - - -@registry.language_data("spacy.mr.stop_words") -def stop_words() -> Set[str]: - return STOP_WORDS +class MarathiDefaults(Language.Defaults): + stop_words = STOP_WORDS class Marathi(Language): lang = "mr" - default_config = Config().from_str(DEFAULT_CONFIG) + Defaults = MarathiDefaults __all__ = ["Marathi"] diff --git a/spacy/lang/nb/__init__.py b/spacy/lang/nb/__init__.py index f26c68e91..d2bb92072 100644 --- a/spacy/lang/nb/__init__.py +++ b/spacy/lang/nb/__init__.py @@ -1,39 +1,9 @@ -from typing import Set, Callable -from thinc.api import Config - from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS from .punctuation import TOKENIZER_PREFIXES, TOKENIZER_INFIXES from .punctuation import TOKENIZER_SUFFIXES from .stop_words import STOP_WORDS -from .syntax_iterators import noun_chunks +from .syntax_iterators import SYNTAX_ITERATORS from ...language import Language -from ...util import registry - - -DEFAULT_CONFIG = """ -[nlp] -lang = "nb" -stop_words = {"@language_data": "spacy.nb.stop_words"} -get_noun_chunks = {"@language_data": "spacy.nb.get_noun_chunks"} - -[nlp.lemmatizer] -@lemmatizers = "spacy.Lemmatizer.v1" - -[nlp.lemmatizer.data] -@language_data = 
"spacy-lookups-data" -lang = ${nlp:lang} -tables = ["lemma_lookup", "lemma_rules", "lemma_exc"] -""" - - -@registry.language_data("spacy.nb.stop_words") -def stop_words() -> Set[str]: - return STOP_WORDS - - -@registry.language_data("spacy.nb.get_noun_chunks") -def get_noun_chunks() -> Callable: - return noun_chunks class NorwegianDefaults(Language.Defaults): @@ -41,12 +11,13 @@ class NorwegianDefaults(Language.Defaults): prefixes = TOKENIZER_PREFIXES infixes = TOKENIZER_INFIXES suffixes = TOKENIZER_SUFFIXES + syntax_iterators = SYNTAX_ITERATORS + stop_words = STOP_WORDS class Norwegian(Language): lang = "nb" Defaults = NorwegianDefaults - default_config = Config().from_str(DEFAULT_CONFIG) __all__ = ["Norwegian"] diff --git a/spacy/lang/ne/__init__.py b/spacy/lang/ne/__init__.py index b72af86e4..68632e9ad 100644 --- a/spacy/lang/ne/__init__.py +++ b/spacy/lang/ne/__init__.py @@ -1,33 +1,16 @@ -from typing import Set, Dict, Callable, Any -from thinc.api import Config - from .stop_words import STOP_WORDS from .lex_attrs import LEX_ATTRS from ...language import Language -from ...util import registry -DEFAULT_CONFIG = """ -[nlp] -lang = "ne" -stop_words = {"@language_data": "spacy.ne.stop_words"} -lex_attr_getters = {"@language_data": "spacy.ne.lex_attr_getters"} -""" - - -@registry.language_data("spacy.ne.stop_words") -def stop_words() -> Set[str]: - return STOP_WORDS - - -@registry.language_data("spacy.ne.lex_attr_getters") -def lex_attr_getters() -> Dict[int, Callable[[str], Any]]: - return LEX_ATTRS +class NepaliDefaults(Language.Defaults): + stop_words = STOP_WORDS + lex_attr_getters = LEX_ATTRS class Nepali(Language): lang = "ne" - default_config = Config().from_str(DEFAULT_CONFIG) + Defaults = NepaliDefaults __all__ = ["Nepali"] diff --git a/spacy/lang/nl/__init__.py b/spacy/lang/nl/__init__.py index 9bf58fddd..d874ef7a1 100644 --- a/spacy/lang/nl/__init__.py +++ b/spacy/lang/nl/__init__.py @@ -1,4 +1,4 @@ -from typing import Set, Dict, Callable, Any +from typing import Callable from thinc.api import Config from .stop_words import STOP_WORDS @@ -7,52 +7,43 @@ from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS from .punctuation import TOKENIZER_PREFIXES, TOKENIZER_INFIXES from .punctuation import TOKENIZER_SUFFIXES from .lemmatizer import DutchLemmatizer +from ...lookups import load_lookups from ...language import Language from ...util import registry DEFAULT_CONFIG = """ [nlp] -lang = "nl" -stop_words = {"@language_data": "spacy.nl.stop_words"} -lex_attr_getters = {"@language_data": "spacy.nl.lex_attr_getters"} [nlp.lemmatizer] -@lemmatizers = "spacy.DutchLemmatizer.v1" - -[nlp.lemmatizer.data] -@language_data = "spacy-lookups-data" -lang = ${nlp:lang} -tables = ["lemma_rules", "lemma_index", "lemma_exc", "lemma_lookup"] +@lemmatizers = "spacy.nl.DutchLemmatizer" """ -@registry.language_data("spacy.nl.stop_words") -def stop_words() -> Set[str]: - return STOP_WORDS +@registry.lemmatizers("spacy.nl.DutchLemmatizer") +def create_lemmatizer() -> Callable[[Language], DutchLemmatizer]: + tables = ["lemma_rules", "lemma_index", "lemma_exc", "lemma_lookup"] + def lemmatizer_factory(nlp: Language) -> DutchLemmatizer: + lookups = load_lookups(lang=nlp.lang, tables=tables) + return DutchLemmatizer(lookups=lookups) -@registry.language_data("spacy.nl.lex_attr_getters") -def lex_attr_getters() -> Dict[int, Callable[[str], Any]]: - return LEX_ATTRS - - -@registry.lemmatizers("spacy.DutchLemmatizer.v1") -def create_dutch_lemmatizer(data: Dict[str, dict] = {}) -> DutchLemmatizer: - return 
DutchLemmatizer(data=data) + return lemmatizer_factory class DutchDefaults(Language.Defaults): + config = Config().from_str(DEFAULT_CONFIG) tokenizer_exceptions = TOKENIZER_EXCEPTIONS prefixes = TOKENIZER_PREFIXES infixes = TOKENIZER_INFIXES suffixes = TOKENIZER_SUFFIXES + lex_attr_getters = LEX_ATTRS + stop_words = STOP_WORDS class Dutch(Language): lang = "nl" Defaults = DutchDefaults - default_config = Config().from_str(DEFAULT_CONFIG) __all__ = ["Dutch"] diff --git a/spacy/lang/pl/__init__.py b/spacy/lang/pl/__init__.py index 82957dc7a..2393f1aea 100644 --- a/spacy/lang/pl/__init__.py +++ b/spacy/lang/pl/__init__.py @@ -1,4 +1,4 @@ -from typing import Set, Dict, Callable, Any +from typing import Callable from thinc.api import Config from .punctuation import TOKENIZER_PREFIXES, TOKENIZER_INFIXES @@ -7,55 +7,53 @@ from .stop_words import STOP_WORDS from .lex_attrs import LEX_ATTRS from .lemmatizer import PolishLemmatizer from ..tokenizer_exceptions import BASE_EXCEPTIONS +from ...lookups import load_lookups from ...language import Language from ...util import registry DEFAULT_CONFIG = """ [nlp] -lang = "pl" -stop_words = {"@language_data": "spacy.pl.stop_words"} -lex_attr_getters = {"@language_data": "spacy.pl.lex_attr_getters"} [nlp.lemmatizer] -@lemmatizers = "spacy.PolishLemmatizer.v1" - -[nlp.lemmatizer.data] -@language_data = "spacy-lookups-data" -lang = ${nlp:lang} -tables = ["lemma_lookup_adj", "lemma_lookup_adp", "lemma_lookup_adv", "lemma_lookup_aux", "lemma_lookup_noun", "lemma_lookup_num", "lemma_lookup_part", "lemma_lookup_pron", "lemma_lookup_verb"] +@lemmatizers = "spacy.pl.PolishLemmatizer" """ - -@registry.language_data("spacy.pl.stop_words") -def stop_words() -> Set[str]: - return STOP_WORDS +TOKENIZER_EXCEPTIONS = { + exc: val for exc, val in BASE_EXCEPTIONS.items() if not exc.endswith(".") +} -@registry.language_data("spacy.pl.lex_attr_getters") -def lex_attr_getters() -> Dict[int, Callable[[str], Any]]: - return LEX_ATTRS +@registry.lemmatizers("spacy.pl.PolishLemmatizer") +def create_lemmatizer() -> Callable[[Language], PolishLemmatizer]: + # fmt: off + tables = [ + "lemma_lookup_adj", "lemma_lookup_adp", "lemma_lookup_adv", + "lemma_lookup_aux", "lemma_lookup_noun", "lemma_lookup_num", + "lemma_lookup_part", "lemma_lookup_pron", "lemma_lookup_verb" + ] + # fmt: on + def lemmatizer_factory(nlp: Language) -> PolishLemmatizer: + lookups = load_lookups(lang=nlp.lang, tables=tables) + return PolishLemmatizer(lookups=lookups) -@registry.lemmatizers("spacy.PolishLemmatizer.v1") -def create_polish_lemmatizer(data: Dict[str, dict] = {}) -> PolishLemmatizer: - return PolishLemmatizer(data=data) + return lemmatizer_factory class PolishDefaults(Language.Defaults): - mod_base_exceptions = { - exc: val for exc, val in BASE_EXCEPTIONS.items() if not exc.endswith(".") - } - tokenizer_exceptions = mod_base_exceptions + config = Config().from_str(DEFAULT_CONFIG) + tokenizer_exceptions = TOKENIZER_EXCEPTIONS prefixes = TOKENIZER_PREFIXES infixes = TOKENIZER_INFIXES suffixes = TOKENIZER_SUFFIXES + lex_attr_getters = LEX_ATTRS + stop_words = STOP_WORDS class Polish(Language): lang = "pl" Defaults = PolishDefaults - default_config = Config().from_str(DEFAULT_CONFIG) __all__ = ["Polish"] diff --git a/spacy/lang/pt/__init__.py b/spacy/lang/pt/__init__.py index fce12393d..0447099f0 100644 --- a/spacy/lang/pt/__init__.py +++ b/spacy/lang/pt/__init__.py @@ -1,50 +1,21 @@ -from typing import Set, Dict, Callable, Any -from thinc.api import Config - from .tokenizer_exceptions import 
TOKENIZER_EXCEPTIONS from .stop_words import STOP_WORDS from .lex_attrs import LEX_ATTRS from .punctuation import TOKENIZER_INFIXES, TOKENIZER_PREFIXES from ...language import Language -from ...util import registry - - -DEFAULT_CONFIG = """ -[nlp] -lang = "pt" -stop_words = {"@language_data": "spacy.pt.stop_words"} -lex_attr_getters = {"@language_data": "spacy.pt.lex_attr_getters"} - -[nlp.lemmatizer] -@lemmatizers = "spacy.Lemmatizer.v1" - -[nlp.lemmatizer.data] -@language_data = "spacy-lookups-data" -lang = ${nlp:lang} -tables = ["lemma_lookup"] -""" - - -@registry.language_data("spacy.pt.stop_words") -def stop_words() -> Set[str]: - return STOP_WORDS - - -@registry.language_data("spacy.pt.lex_attr_getters") -def lex_attr_getters() -> Dict[int, Callable[[str], Any]]: - return LEX_ATTRS class PortugueseDefaults(Language.Defaults): tokenizer_exceptions = TOKENIZER_EXCEPTIONS infixes = TOKENIZER_INFIXES prefixes = TOKENIZER_PREFIXES + lex_attr_getters = LEX_ATTRS + stop_words = STOP_WORDS class Portuguese(Language): lang = "pt" Defaults = PortugueseDefaults - default_config = Config().from_str(DEFAULT_CONFIG) __all__ = ["Portuguese"] diff --git a/spacy/lang/punctuation.py b/spacy/lang/punctuation.py index bf7357e48..e712e71d6 100644 --- a/spacy/lang/punctuation.py +++ b/spacy/lang/punctuation.py @@ -3,7 +3,7 @@ from .char_classes import LIST_ICONS, HYPHENS, CURRENCY, UNITS from .char_classes import CONCAT_QUOTES, ALPHA_LOWER, ALPHA_UPPER, ALPHA, PUNCT -_prefixes = ( +TOKENIZER_PREFIXES = ( ["§", "%", "=", "—", "–", r"\+(?![0-9])"] + LIST_PUNCT + LIST_ELLIPSES @@ -13,7 +13,7 @@ _prefixes = ( ) -_suffixes = ( +TOKENIZER_SUFFIXES = ( LIST_PUNCT + LIST_ELLIPSES + LIST_QUOTES @@ -31,7 +31,7 @@ _suffixes = ( ] ) -_infixes = ( +TOKENIZER_INFIXES = ( LIST_ELLIPSES + LIST_ICONS + [ @@ -44,7 +44,3 @@ _infixes = ( r"(?<=[{a}0-9])[:<>=/](?=[{a}])".format(a=ALPHA), ] ) - -TOKENIZER_PREFIXES = _prefixes -TOKENIZER_SUFFIXES = _suffixes -TOKENIZER_INFIXES = _infixes diff --git a/spacy/lang/ro/__init__.py b/spacy/lang/ro/__init__.py index 881188b21..74016d3e9 100644 --- a/spacy/lang/ro/__init__.py +++ b/spacy/lang/ro/__init__.py @@ -1,49 +1,25 @@ -from typing import Set -from thinc.api import Config - from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS from .stop_words import STOP_WORDS from .punctuation import TOKENIZER_PREFIXES, TOKENIZER_INFIXES from .punctuation import TOKENIZER_SUFFIXES from ...language import Language -from ...util import registry # Lemma data note: # Original pairs downloaded from http://www.lexiconista.com/datasets/lemmatization/ # Replaced characters using cedillas with the correct ones (ș and ț) -DEFAULT_CONFIG = """ -[nlp] -lang = "ro" -stop_words = {"@language_data": "spacy.ro.stop_words"} - -[nlp.lemmatizer] -@lemmatizers = "spacy.Lemmatizer.v1" - -[nlp.lemmatizer.data] -@language_data = "spacy-lookups-data" -lang = ${nlp:lang} -tables = ["lemma_lookup"] -""" - - -@registry.language_data("spacy.ro.stop_words") -def stop_words() -> Set[str]: - return STOP_WORDS - - class RomanianDefaults(Language.Defaults): tokenizer_exceptions = TOKENIZER_EXCEPTIONS prefixes = TOKENIZER_PREFIXES suffixes = TOKENIZER_SUFFIXES infixes = TOKENIZER_INFIXES + stop_words = STOP_WORDS class Romanian(Language): lang = "ro" Defaults = RomanianDefaults - default_config = Config().from_str(DEFAULT_CONFIG) __all__ = ["Romanian"] diff --git a/spacy/lang/ru/__init__.py b/spacy/lang/ru/__init__.py index b37ac6226..5d2333edf 100644 --- a/spacy/lang/ru/__init__.py +++ b/spacy/lang/ru/__init__.py @@ -1,4 +1,4 
@@ -from typing import Set, Dict, Callable, Any +from typing import Callable from thinc.api import Config from .stop_words import STOP_WORDS @@ -11,43 +11,30 @@ from ...language import Language DEFAULT_CONFIG = """ [nlp] -lang = "ru" -stop_words = {"@language_data": "spacy.ru.stop_words"} -lex_attr_getters = {"@language_data": "spacy.ru.lex_attr_getters"} [nlp.lemmatizer] -@lemmatizers = "spacy.RussianLemmatizer.v1" - -[nlp.vocab_data] -@language_data = "spacy-lookups-data" -lang = ${nlp:lang} -tables = ["lexeme_norm"] +@lemmatizers = "spacy.ru.RussianLemmatizer" """ -@registry.language_data("spacy.ru.stop_words") -def stop_words() -> Set[str]: - return STOP_WORDS +@registry.lemmatizers("spacy.ru.RussianLemmatizer") +def create_lemmatizer() -> Callable[[Language], RussianLemmatizer]: + def lemmatizer_factory(nlp: Language) -> RussianLemmatizer: + return RussianLemmatizer() - -@registry.language_data("spacy.ru.lex_attr_getters") -def lex_attr_getters() -> Dict[int, Callable[[str], Any]]: - return LEX_ATTRS - - -@registry.lemmatizers("spacy.RussianLemmatizer.v1") -def create_russian_lemmatizer() -> RussianLemmatizer: - return RussianLemmatizer() + return lemmatizer_factory class RussianDefaults(Language.Defaults): + config = Config().from_str(DEFAULT_CONFIG) tokenizer_exceptions = TOKENIZER_EXCEPTIONS + lex_attr_getters = LEX_ATTRS + stop_words = STOP_WORDS class Russian(Language): lang = "ru" Defaults = RussianDefaults - default_config = Config().from_str(DEFAULT_CONFIG) __all__ = ["Russian"] diff --git a/spacy/lang/si/__init__.py b/spacy/lang/si/__init__.py index 69c4718c0..d77e3bb8b 100644 --- a/spacy/lang/si/__init__.py +++ b/spacy/lang/si/__init__.py @@ -1,33 +1,16 @@ -from typing import Set, Dict, Callable, Any -from thinc.api import Config - from .stop_words import STOP_WORDS from .lex_attrs import LEX_ATTRS from ...language import Language -from ...util import registry -DEFAULT_CONFIG = """ -[nlp] -lang = "si" -stop_words = {"@language_data": "spacy.si.stop_words"} -lex_attr_getters = {"@language_data": "spacy.si.lex_attr_getters"} -""" - - -@registry.language_data("spacy.si.stop_words") -def stop_words() -> Set[str]: - return STOP_WORDS - - -@registry.language_data("spacy.si.lex_attr_getters") -def lex_attr_getters() -> Dict[int, Callable[[str], Any]]: - return LEX_ATTRS +class SinhalaDefaults(Language.Defaults): + lex_attr_getters = LEX_ATTRS + stop_words = STOP_WORDS class Sinhala(Language): lang = "si" - default_config = Config().from_str(DEFAULT_CONFIG) + Defaults = SinhalaDefaults __all__ = ["Sinhala"] diff --git a/spacy/lang/sk/__init__.py b/spacy/lang/sk/__init__.py index c9493e829..4003c7340 100644 --- a/spacy/lang/sk/__init__.py +++ b/spacy/lang/sk/__init__.py @@ -1,33 +1,16 @@ -from typing import Set, Dict, Callable, Any -from thinc.api import Config - from .stop_words import STOP_WORDS from .lex_attrs import LEX_ATTRS from ...language import Language -from ...util import registry -DEFAULT_CONFIG = """ -[nlp] -lang = "sk" -stop_words = {"@language_data": "spacy.sk.stop_words"} -lex_attr_getters = {"@language_data": "spacy.sk.lex_attr_getters"} -""" - - -@registry.language_data("spacy.sk.stop_words") -def stop_words() -> Set[str]: - return STOP_WORDS - - -@registry.language_data("spacy.sk.lex_attr_getters") -def lex_attr_getters() -> Dict[int, Callable[[str], Any]]: - return LEX_ATTRS +class SlovakDefaults(Language.Defaults): + lex_attr_getters = LEX_ATTRS + stop_words = STOP_WORDS class Slovak(Language): lang = "sk" - default_config = Config().from_str(DEFAULT_CONFIG) + 
Defaults = SlovakDefaults __all__ = ["Slovak"] diff --git a/spacy/lang/sl/__init__.py b/spacy/lang/sl/__init__.py index 4f1954669..0330cc4d0 100644 --- a/spacy/lang/sl/__init__.py +++ b/spacy/lang/sl/__init__.py @@ -1,26 +1,14 @@ -from typing import Set -from thinc.api import Config - from .stop_words import STOP_WORDS from ...language import Language -from ...util import registry -DEFAULT_CONFIG = """ -[nlp] -lang = "sl" -stop_words = {"@language_data": "spacy.sl.stop_words"} -""" - - -@registry.language_data("spacy.sl.stop_words") -def stop_words() -> Set[str]: - return STOP_WORDS +class SlovenianDefaults(Language.Defaults): + stop_words = STOP_WORDS class Slovenian(Language): lang = "sl" - default_config = Config().from_str(DEFAULT_CONFIG) + Defaults = SlovenianDefaults __all__ = ["Slovenian"] diff --git a/spacy/lang/sq/__init__.py b/spacy/lang/sq/__init__.py index a3da6b354..a4bacfa49 100644 --- a/spacy/lang/sq/__init__.py +++ b/spacy/lang/sq/__init__.py @@ -1,26 +1,14 @@ -from typing import Set -from thinc.api import Config - from .stop_words import STOP_WORDS from ...language import Language -from ...util import registry -DEFAULT_CONFIG = """ -[nlp] -lang = "sq" -stop_words = {"@language_data": "spacy.sq.stop_words"} -""" - - -@registry.language_data("spacy.sq.stop_words") -def stop_words() -> Set[str]: - return STOP_WORDS +class AlbanianDefaults(Language.Defaults): + stop_words = STOP_WORDS class Albanian(Language): lang = "sq" - default_config = Config().from_str(DEFAULT_CONFIG) + Defaults = AlbanianDefaults __all__ = ["Albanian"] diff --git a/spacy/lang/sr/__init__.py b/spacy/lang/sr/__init__.py index 36703aa5f..165e54975 100644 --- a/spacy/lang/sr/__init__.py +++ b/spacy/lang/sr/__init__.py @@ -1,52 +1,18 @@ -from typing import Set, Dict, Callable, Any -from thinc.api import Config - from .stop_words import STOP_WORDS from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS from .lex_attrs import LEX_ATTRS from ...language import Language -from ...util import registry - - -DEFAULT_CONFIG = """ -[nlp] -lang = "sr" -stop_words = {"@language_data": "spacy.sr.stop_words"} -lex_attr_getters = {"@language_data": "spacy.sr.lex_attr_getters"} - -[nlp.lemmatizer] -@lemmatizers = "spacy.Lemmatizer.v1" - -[nlp.lemmatizer.data] -@language_data = "spacy-lookups-data" -lang = ${nlp:lang} -tables = ["lemma_lookup"] - -[nlp.vocab_data] -@language_data = "spacy-lookups-data" -lang = ${nlp:lang} -tables = ["lexeme_norm"] -""" - - -@registry.language_data("spacy.sr.stop_words") -def stop_words() -> Set[str]: - return STOP_WORDS - - -@registry.language_data("spacy.sr.lex_attr_getters") -def lex_attr_getters() -> Dict[int, Callable[[str], Any]]: - return LEX_ATTRS class SerbianDefaults(Language.Defaults): tokenizer_exceptions = TOKENIZER_EXCEPTIONS + lex_attr_getters = LEX_ATTRS + stop_words = STOP_WORDS class Serbian(Language): lang = "sr" Defaults = SerbianDefaults - default_config = Config().from_str(DEFAULT_CONFIG) __all__ = ["Serbian"] diff --git a/spacy/lang/sv/__init__.py b/spacy/lang/sv/__init__.py index dc9f71ac6..0c6a1b9f4 100644 --- a/spacy/lang/sv/__init__.py +++ b/spacy/lang/sv/__init__.py @@ -1,59 +1,25 @@ -from typing import Set, Dict, Callable, Any -from thinc.api import Config - from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS from .stop_words import STOP_WORDS from .lex_attrs import LEX_ATTRS +from .syntax_iterators import SYNTAX_ITERATORS from ...language import Language -from ...util import registry -from .syntax_iterators import noun_chunks # Punctuation stolen from Danish 
from ..da.punctuation import TOKENIZER_INFIXES, TOKENIZER_SUFFIXES -DEFAULT_CONFIG = """ -[nlp] -lang = "sv" -stop_words = {"@language_data": "spacy.sv.stop_words"} -lex_attr_getters = {"@language_data": "spacy.sv.lex_attr_getters"} -get_noun_chunks = {"@language_data": "spacy.sv.get_noun_chunks"} - -[nlp.lemmatizer] -@lemmatizers = "spacy.Lemmatizer.v1" - -[nlp.lemmatizer.data] -@language_data = "spacy-lookups-data" -lang = ${nlp:lang} -tables = ["lemma_lookup", "lemma_rules"] -""" - - -@registry.language_data("spacy.sv.stop_words") -def stop_words() -> Set[str]: - return STOP_WORDS - - -@registry.language_data("spacy.sv.lex_attr_getters") -def lex_attr_getters() -> Dict[int, Callable[[str], Any]]: - return LEX_ATTRS - - -@registry.language_data("spacy.sv.get_noun_chunks") -def get_noun_chunks() -> Callable: - return noun_chunks - - class SwedishDefaults(Language.Defaults): tokenizer_exceptions = TOKENIZER_EXCEPTIONS infixes = TOKENIZER_INFIXES suffixes = TOKENIZER_SUFFIXES + lex_attr_getters = LEX_ATTRS + syntax_iterators = SYNTAX_ITERATORS + stop_words = STOP_WORDS class Swedish(Language): lang = "sv" Defaults = SwedishDefaults - default_config = Config().from_str(DEFAULT_CONFIG) __all__ = ["Swedish"] diff --git a/spacy/lang/ta/__init__.py b/spacy/lang/ta/__init__.py index c429127c9..ac5fc7124 100644 --- a/spacy/lang/ta/__init__.py +++ b/spacy/lang/ta/__init__.py @@ -1,38 +1,16 @@ -from typing import Set, Dict, Callable, Any -from thinc.api import Config - from .stop_words import STOP_WORDS from .lex_attrs import LEX_ATTRS from ...language import Language -from ...util import registry -DEFAULT_CONFIG = """ -[nlp] -lang = "ta" -stop_words = {"@language_data": "spacy.ta.stop_words"} -lex_attr_getters = {"@language_data": "spacy.ta.lex_attr_getters"} - -[nlp.vocab_data] -@language_data = "spacy-lookups-data" -lang = ${nlp:lang} -tables = ["lexeme_norm"] -""" - - -@registry.language_data("spacy.ta.stop_words") -def stop_words() -> Set[str]: - return STOP_WORDS - - -@registry.language_data("spacy.ta.lex_attr_getters") -def lex_attr_getters() -> Dict[int, Callable[[str], Any]]: - return LEX_ATTRS +class TamilDefaults(Language.Defaults): + lex_attr_getters = LEX_ATTRS + stop_words = STOP_WORDS class Tamil(Language): lang = "ta" - default_config = Config().from_str(DEFAULT_CONFIG) + Defaults = TamilDefaults __all__ = ["Tamil"] diff --git a/spacy/lang/te/__init__.py b/spacy/lang/te/__init__.py index d012d418d..e6dc80e28 100644 --- a/spacy/lang/te/__init__.py +++ b/spacy/lang/te/__init__.py @@ -1,33 +1,16 @@ -from typing import Set, Dict, Callable, Any -from thinc.api import Config - from .stop_words import STOP_WORDS from .lex_attrs import LEX_ATTRS from ...language import Language -from ...util import registry -DEFAULT_CONFIG = """ -[nlp] -lang = "te" -stop_words = {"@language_data": "spacy.te.stop_words"} -lex_attr_getters = {"@language_data": "spacy.te.lex_attr_getters"} -""" - - -@registry.language_data("spacy.te.stop_words") -def stop_words() -> Set[str]: - return STOP_WORDS - - -@registry.language_data("spacy.te.lex_attr_getters") -def lex_attr_getters() -> Dict[int, Callable[[str], Any]]: - return LEX_ATTRS +class TeluguDefaults(Language.Defaults): + lex_attr_getters = LEX_ATTRS + stop_words = STOP_WORDS class Telugu(Language): lang = "te" - default_config = Config().from_str(DEFAULT_CONFIG) + Defaults = TeluguDefaults __all__ = ["Telugu"] diff --git a/spacy/lang/th/__init__.py b/spacy/lang/th/__init__.py index 1fdf4311e..989c22a42 100644 --- a/spacy/lang/th/__init__.py +++ 
b/spacy/lang/th/__init__.py @@ -1,4 +1,3 @@ -from typing import Set, Dict, Callable, Any from thinc.api import Config from .stop_words import STOP_WORDS @@ -10,31 +9,13 @@ from ...util import DummyTokenizer, registry DEFAULT_CONFIG = """ [nlp] -lang = "th" -stop_words = {"@language_data": "spacy.th.stop_words"} -lex_attr_getters = {"@language_data": "spacy.th.lex_attr_getters"} [nlp.tokenizer] -@tokenizers = "spacy.ThaiTokenizer.v1" - -[nlp.vocab_data] -@language_data = "spacy-lookups-data" -lang = ${nlp:lang} -tables = ["lexeme_norm"] +@tokenizers = "spacy.th.ThaiTokenizer" """ -@registry.language_data("spacy.th.stop_words") -def stop_words() -> Set[str]: - return STOP_WORDS - - -@registry.language_data("spacy.th.lex_attr_getters") -def lex_attr_getters() -> Dict[int, Callable[[str], Any]]: - return LEX_ATTRS - - -@registry.tokenizers("spacy.ThaiTokenizer.v1") +@registry.tokenizers("spacy.th.ThaiTokenizer") def create_thai_tokenizer(): def thai_tokenizer_factory(nlp): return ThaiTokenizer(nlp) @@ -60,9 +41,15 @@ class ThaiTokenizer(DummyTokenizer): return Doc(self.vocab, words=words, spaces=spaces) +class ThaiDefaults(Language.Defaults): + config = Config().from_str(DEFAULT_CONFIG) + lex_attr_getters = LEX_ATTRS + stop_words = STOP_WORDS + + class Thai(Language): lang = "th" - default_config = Config().from_str(DEFAULT_CONFIG) + Defaults = ThaiDefaults __all__ = ["Thai"] diff --git a/spacy/lang/tl/__init__.py b/spacy/lang/tl/__init__.py index 7176e07d4..61530dc30 100644 --- a/spacy/lang/tl/__init__.py +++ b/spacy/lang/tl/__init__.py @@ -1,47 +1,18 @@ -from typing import Set, Dict, Callable, Any -from thinc.api import Config - from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS from .stop_words import STOP_WORDS from .lex_attrs import LEX_ATTRS from ...language import Language -from ...util import registry - - -DEFAULT_CONFIG = """ -[nlp] -lang = "tl" -stop_words = {"@language_data": "spacy.tl.stop_words"} -lex_attr_getters = {"@language_data": "spacy.tl.lex_attr_getters"} - -[nlp.lemmatizer] -@lemmatizers = "spacy.Lemmatizer.v1" - -[nlp.lemmatizer.data] -@language_data = "spacy-lookups-data" -lang = ${nlp:lang} -tables = ["lemma_lookup"] -""" - - -@registry.language_data("spacy.tl.stop_words") -def stop_words() -> Set[str]: - return STOP_WORDS - - -@registry.language_data("spacy.tl.lex_attr_getters") -def lex_attr_getters() -> Dict[int, Callable[[str], Any]]: - return LEX_ATTRS class TagalogDefaults(Language.Defaults): tokenizer_exceptions = TOKENIZER_EXCEPTIONS + lex_attr_getters = LEX_ATTRS + stop_words = STOP_WORDS class Tagalog(Language): lang = "tl" Defaults = TagalogDefaults - default_config = Config().from_str(DEFAULT_CONFIG) __all__ = ["Tagalog"] diff --git a/spacy/lang/tr/__init__.py b/spacy/lang/tr/__init__.py index 3bb1e0d06..70b277487 100644 --- a/spacy/lang/tr/__init__.py +++ b/spacy/lang/tr/__init__.py @@ -1,40 +1,16 @@ -from typing import Set -from thinc.api import Config - from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS from .stop_words import STOP_WORDS from ...language import Language -from ...util import registry - - -DEFAULT_CONFIG = """ -[nlp] -lang = "tr" -stop_words = {"@language_data": "spacy.tr.stop_words"} - -[nlp.lemmatizer] -@lemmatizers = "spacy.Lemmatizer.v1" - -[nlp.lemmatizer.data] -@language_data = "spacy-lookups-data" -lang = ${nlp:lang} -tables = ["lemma_lookup"] -""" - - -@registry.language_data("spacy.tr.stop_words") -def stop_words() -> Set[str]: - return STOP_WORDS class TurkishDefaults(Language.Defaults): tokenizer_exceptions = 
TOKENIZER_EXCEPTIONS + stop_words = STOP_WORDS class Turkish(Language): lang = "tr" Defaults = TurkishDefaults - default_config = Config().from_str(DEFAULT_CONFIG) __all__ = ["Turkish"] diff --git a/spacy/lang/tt/__init__.py b/spacy/lang/tt/__init__.py index d4828d96c..c8e293f29 100644 --- a/spacy/lang/tt/__init__.py +++ b/spacy/lang/tt/__init__.py @@ -1,41 +1,20 @@ -from typing import Set, Dict, Callable, Any -from thinc.api import Config - from .lex_attrs import LEX_ATTRS from .punctuation import TOKENIZER_INFIXES from .stop_words import STOP_WORDS from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS from ...language import Language -from ...util import registry - - -DEFAULT_CONFIG = """ -[nlp] -lang = "tt" -stop_words = {"@language_data": "spacy.tt.stop_words"} -lex_attr_getters = {"@language_data": "spacy.tt.lex_attr_getters"} -""" - - -@registry.language_data("spacy.tt.stop_words") -def stop_words() -> Set[str]: - return STOP_WORDS - - -@registry.language_data("spacy.tt.lex_attr_getters") -def lex_attr_getters() -> Dict[int, Callable[[str], Any]]: - return LEX_ATTRS class TatarDefaults(Language.Defaults): tokenizer_exceptions = TOKENIZER_EXCEPTIONS - infixes = tuple(TOKENIZER_INFIXES) + infixes = TOKENIZER_INFIXES + lex_attr_getters = LEX_ATTRS + stop_words = STOP_WORDS class Tatar(Language): lang = "tt" Defaults = TatarDefaults - default_config = Config().from_str(DEFAULT_CONFIG) __all__ = ["Tatar"] diff --git a/spacy/lang/uk/__init__.py b/spacy/lang/uk/__init__.py index 24a859951..6b44a7144 100644 --- a/spacy/lang/uk/__init__.py +++ b/spacy/lang/uk/__init__.py @@ -1,4 +1,4 @@ -from typing import Set, Dict, Callable, Any +from typing import Callable from thinc.api import Config from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS @@ -11,38 +11,30 @@ from .lemmatizer import UkrainianLemmatizer DEFAULT_CONFIG = """ [nlp] -lang = "uk" -stop_words = {"@language_data": "spacy.uk.stop_words"} -lex_attr_getters = {"@language_data": "spacy.uk.lex_attr_getters"} [nlp.lemmatizer] -@lemmatizers = "spacy.UkrainianLemmatizer.v1" +@lemmatizers = "spacy.uk.UkrainianLemmatizer" """ -@registry.language_data("spacy.uk.stop_words") -def stop_words() -> Set[str]: - return STOP_WORDS +@registry.lemmatizers("spacy.uk.UkrainianLemmatizer") +def create_ukrainian_lemmatizer() -> Callable[[Language], UkrainianLemmatizer]: + def lemmatizer_factory(nlp: Language) -> UkrainianLemmatizer: + return UkrainianLemmatizer() - -@registry.language_data("spacy.uk.lex_attr_getters") -def lex_attr_getters() -> Dict[int, Callable[[str], Any]]: - return LEX_ATTRS - - -@registry.lemmatizers("spacy.UkrainianLemmatizer.v1") -def create_ukrainian_lemmatizer() -> UkrainianLemmatizer: - return UkrainianLemmatizer() + return lemmatizer_factory class UkrainianDefaults(Language.Defaults): + config = Config().from_str(DEFAULT_CONFIG) tokenizer_exceptions = TOKENIZER_EXCEPTIONS + lex_attr_getters = LEX_ATTRS + stop_words = STOP_WORDS class Ukrainian(Language): lang = "uk" Defaults = UkrainianDefaults - default_config = Config().from_str(DEFAULT_CONFIG) __all__ = ["Ukrainian"] diff --git a/spacy/lang/ur/__init__.py b/spacy/lang/ur/__init__.py index db714c296..e3dee5805 100644 --- a/spacy/lang/ur/__init__.py +++ b/spacy/lang/ur/__init__.py @@ -1,54 +1,19 @@ -from typing import Set, Dict, Callable, Any -from thinc.api import Config - from .stop_words import STOP_WORDS from .lex_attrs import LEX_ATTRS from .punctuation import TOKENIZER_SUFFIXES -from ..tokenizer_exceptions import BASE_EXCEPTIONS from ...language import Language -from 
...util import registry - - -DEFAULT_CONFIG = """ -[nlp] -lang = "ur" -stop_words = {"@language_data": "spacy.ur.stop_words"} -lex_attr_getters = {"@language_data": "spacy.ur.lex_attr_getters"} - -[nlp.writing_system] -direction = "rtl" -has_case = false -has_letters = true - -[nlp.lemmatizer] -@lemmatizers = "spacy.Lemmatizer.v1" - -[nlp.lemmatizer.data] -@language_data = "spacy-lookups-data" -lang = ${nlp:lang} -tables = ["lemma_lookup"] -""" - - -@registry.language_data("spacy.ur.stop_words") -def stop_words() -> Set[str]: - return STOP_WORDS - - -@registry.language_data("spacy.ur.lex_attr_getters") -def lex_attr_getters() -> Dict[int, Callable[[str], Any]]: - return LEX_ATTRS class UrduDefaults(Language.Defaults): - tokenizer_exceptions = BASE_EXCEPTIONS suffixes = TOKENIZER_SUFFIXES + lex_attr_getters = LEX_ATTRS + stop_words = STOP_WORDS + writing_system = {"direction": "rtl", "has_case": False, "has_letters": True} class Urdu(Language): lang = "ur" Defaults = UrduDefaults - default_config = Config().from_str(DEFAULT_CONFIG) __all__ = ["Urdu"] diff --git a/spacy/lang/vi/__init__.py b/spacy/lang/vi/__init__.py index 2003e904b..2b06d33f7 100644 --- a/spacy/lang/vi/__init__.py +++ b/spacy/lang/vi/__init__.py @@ -1,4 +1,3 @@ -from typing import Set, Dict, Callable, Any from thinc.api import Config from ...language import Language @@ -10,27 +9,14 @@ from .lex_attrs import LEX_ATTRS DEFAULT_CONFIG = """ [nlp] -lang = "vi" -stop_words = {"@language_data": "spacy.vi.stop_words"} -lex_attr_getters = {"@language_data": "spacy.vi.lex_attr_getters"} [nlp.tokenizer] -@tokenizers = "spacy.VietnameseTokenizer.v1" +@tokenizers = "spacy.vi.VietnameseTokenizer" use_pyvi = true """ -@registry.language_data("spacy.vi.stop_words") -def stop_words() -> Set[str]: - return STOP_WORDS - - -@registry.language_data("spacy.vi.lex_attr_getters") -def lex_attr_getters() -> Dict[int, Callable[[str], Any]]: - return LEX_ATTRS - - -@registry.tokenizers("spacy.VietnameseTokenizer.v1") +@registry.tokenizers("spacy.vi.VietnameseTokenizer") def create_vietnamese_tokenizer(use_pyvi: bool = True,): def vietnamese_tokenizer_factory(nlp): return VietnameseTokenizer(nlp, use_pyvi=use_pyvi) @@ -68,9 +54,15 @@ class VietnameseTokenizer(DummyTokenizer): return Doc(self.vocab, words=words, spaces=spaces) +class VietnameseDefaults(Language.Defaults): + config = Config().from_str(DEFAULT_CONFIG) + lex_attr_getters = LEX_ATTRS + stop_words = STOP_WORDS + + class Vietnamese(Language): lang = "vi" - default_config = Config().from_str(DEFAULT_CONFIG) + Defaults = VietnameseDefaults __all__ = ["Vietnamese"] diff --git a/spacy/lang/xx/__init__.py b/spacy/lang/xx/__init__.py index 2167d9a5e..aff8403ff 100644 --- a/spacy/lang/xx/__init__.py +++ b/spacy/lang/xx/__init__.py @@ -1,27 +1,12 @@ -from thinc.api import Config - -from ..tokenizer_exceptions import BASE_EXCEPTIONS from ...language import Language -DEFAULT_CONFIG = """ -[nlp] -lang = "xx" -""" - - -class MultiLanguageDefaults(Language.Defaults): - tokenizer_exceptions = BASE_EXCEPTIONS - - class MultiLanguage(Language): """Language class to be used for models that support multiple languages. This module allows models to specify their language ID as 'xx'. 
""" lang = "xx" - Defaults = MultiLanguageDefaults - default_config = Config().from_str(DEFAULT_CONFIG) __all__ = ["MultiLanguage"] diff --git a/spacy/lang/yo/__init__.py b/spacy/lang/yo/__init__.py index b739ffbd7..df6bb7d4a 100644 --- a/spacy/lang/yo/__init__.py +++ b/spacy/lang/yo/__init__.py @@ -1,39 +1,16 @@ -from typing import Set, Dict, Callable, Any -from thinc.api import Config - from .stop_words import STOP_WORDS from .lex_attrs import LEX_ATTRS -from ..tokenizer_exceptions import BASE_EXCEPTIONS from ...language import Language -from ...util import registry - - -DEFAULT_CONFIG = """ -[nlp] -lang = "si" -stop_words = {"@language_data": "spacy.yo.stop_words"} -lex_attr_getters = {"@language_data": "spacy.yo.lex_attr_getters"} -""" - - -@registry.language_data("spacy.yo.stop_words") -def stop_words() -> Set[str]: - return STOP_WORDS - - -@registry.language_data("spacy.yo.lex_attr_getters") -def lex_attr_getters() -> Dict[int, Callable[[str], Any]]: - return LEX_ATTRS class YorubaDefaults(Language.Defaults): - tokenizer_exceptions = BASE_EXCEPTIONS + lex_attr_getters = LEX_ATTRS + stop_words = STOP_WORDS class Yoruba(Language): lang = "yo" Defaults = YorubaDefaults - default_config = Config().from_str(DEFAULT_CONFIG) __all__ = ["Yoruba"] diff --git a/spacy/lang/zh/__init__.py b/spacy/lang/zh/__init__.py index ba5489dfd..fe0613c80 100644 --- a/spacy/lang/zh/__init__.py +++ b/spacy/lang/zh/__init__.py @@ -1,4 +1,4 @@ -from typing import Optional, List, Set, Dict, Callable, Any +from typing import Optional, List, Dict, Any from enum import Enum import tempfile import srsly @@ -10,7 +10,6 @@ from ...errors import Warnings, Errors from ...language import Language from ...tokens import Doc from ...util import DummyTokenizer, registry -from ..tokenizer_exceptions import BASE_EXCEPTIONS from .lex_attrs import LEX_ATTRS from .stop_words import STOP_WORDS from ... 
import util @@ -20,20 +19,12 @@ _PKUSEG_INSTALL_MSG = "install it with `pip install pkuseg==0.0.25` or from http DEFAULT_CONFIG = """ [nlp] -lang = "zh" -stop_words = {"@language_data": "spacy.zh.stop_words"} -lex_attr_getters = {"@language_data": "spacy.zh.lex_attr_getters"} [nlp.tokenizer] -@tokenizers = "spacy.ChineseTokenizer.v1" +@tokenizers = "spacy.zh.ChineseTokenizer" segmenter = "char" pkuseg_model = null pkuseg_user_dict = "default" - -[nlp.writing_system] -direction = "ltr" -has_case = false -has_letters = false """ @@ -47,17 +38,7 @@ class Segmenter(str, Enum): return list(cls.__members__.keys()) -@registry.language_data("spacy.zh.stop_words") -def stop_words() -> Set[str]: - return STOP_WORDS - - -@registry.language_data("spacy.zh.lex_attr_getters") -def lex_attr_getters() -> Dict[int, Callable[[str], Any]]: - return LEX_ATTRS - - -@registry.tokenizers("spacy.ChineseTokenizer.v1") +@registry.tokenizers("spacy.zh.ChineseTokenizer") def create_chinese_tokenizer( segmenter: Segmenter = Segmenter.char, pkuseg_model: Optional[str] = None, @@ -155,6 +136,18 @@ class ChineseTokenizer(DummyTokenizer): warn_msg = Warnings.W104.format(target="pkuseg", current=self.segmenter) warnings.warn(warn_msg) + def _get_config(self) -> Dict[str, Any]: + return { + "segmenter": self.segmenter, + "pkuseg_model": self.pkuseg_model, + "pkuseg_user_dict": self.pkuseg_user_dict, + } + + def _set_config(self, config: Dict[str, Any] = {}) -> None: + self.segmenter = config.get("segmenter", Segmenter.char) + self.pkuseg_model = config.get("pkuseg_model", None) + self.pkuseg_user_dict = config.get("pkuseg_user_dict", "default") + def to_bytes(self, **kwargs): pkuseg_features_b = b"" pkuseg_weights_b = b"" @@ -175,6 +168,7 @@ class ChineseTokenizer(DummyTokenizer): sorted(list(self.pkuseg_seg.postprocesser.other_words)), ) serializers = { + "cfg": lambda: srsly.json_dumps(self._get_config()), "pkuseg_features": lambda: pkuseg_features_b, "pkuseg_weights": lambda: pkuseg_weights_b, "pkuseg_processors": lambda: srsly.msgpack_dumps(pkuseg_processors_data), @@ -194,6 +188,7 @@ class ChineseTokenizer(DummyTokenizer): pkuseg_data["processors_data"] = srsly.msgpack_loads(b) deserializers = { + "cfg": lambda b: self._set_config(srsly.json_loads(b)), "pkuseg_features": deserialize_pkuseg_features, "pkuseg_weights": deserialize_pkuseg_weights, "pkuseg_processors": deserialize_pkuseg_processors, @@ -246,6 +241,7 @@ class ChineseTokenizer(DummyTokenizer): srsly.write_msgpack(path, data) serializers = { + "cfg": lambda p: srsly.write_json(p, self._get_config()), "pkuseg_model": lambda p: save_pkuseg_model(p), "pkuseg_processors": lambda p: save_pkuseg_processors(p), } @@ -281,6 +277,7 @@ class ChineseTokenizer(DummyTokenizer): self.pkuseg_seg.postprocesser.other_words = set(other_words) serializers = { + "cfg": lambda p: self._set_config(srsly.read_json(p)), "pkuseg_model": lambda p: load_pkuseg_model(p), "pkuseg_processors": lambda p: load_pkuseg_processors(p), } @@ -288,13 +285,15 @@ class ChineseTokenizer(DummyTokenizer): class ChineseDefaults(Language.Defaults): - tokenizer_exceptions = BASE_EXCEPTIONS + config = Config().from_str(DEFAULT_CONFIG) + lex_attr_getters = LEX_ATTRS + stop_words = STOP_WORDS + writing_system = {"direction": "ltr", "has_case": False, "has_letters": False} class Chinese(Language): lang = "zh" Defaults = ChineseDefaults - default_config = Config().from_str(DEFAULT_CONFIG) def try_jieba_import(segmenter: str) -> None: diff --git a/spacy/language.py b/spacy/language.py index 
99fe98a66..6d2ae3dbe 100644 --- a/spacy/language.py +++ b/spacy/language.py @@ -16,27 +16,25 @@ import multiprocessing as mp from itertools import chain, cycle from .tokens.underscore import Underscore -from .vocab import Vocab +from .vocab import Vocab, create_vocab from .pipe_analysis import analyze_pipes, analyze_all_pipes, validate_attrs from .gold import Example from .scorer import Scorer from .util import link_vectors_to_models, create_default_optimizer, registry from .util import SimpleFrozenDict +from .lang.tokenizer_exceptions import URL_MATCH, BASE_EXCEPTIONS from .lang.punctuation import TOKENIZER_PREFIXES, TOKENIZER_SUFFIXES from .lang.punctuation import TOKENIZER_INFIXES from .tokens import Doc +from .lookups import load_lookups +from .tokenizer import Tokenizer +from .lemmatizer import Lemmatizer from .errors import Errors, Warnings from .schemas import ConfigSchema from .git_info import GIT_VERSION from . import util from . import about -# We also need to import these to make sure the functions are registered -from .tokenizer import Tokenizer # noqa: F401 -from .lemmatizer import Lemmatizer # noqa: F401 -from .lookups import Lookups # noqa: F401 -from .lang import defaults # noqa: F401 - ENABLE_PIPELINE_ANALYSIS = False # This is the base config will all settings (training etc.) @@ -45,10 +43,50 @@ DEFAULT_CONFIG = Config().from_disk(DEFAULT_CONFIG_PATH) class BaseDefaults: - prefixes: Tuple[Pattern, ...] = tuple(TOKENIZER_PREFIXES) - suffixes: Tuple[Pattern, ...] = tuple(TOKENIZER_SUFFIXES) - infixes: Tuple[Pattern, ...] = tuple(TOKENIZER_INFIXES) - tokenizer_exceptions: Dict[str, List[dict]] = {} + config: Config = Config() + tokenizer_exceptions: Dict[str, List[dict]] = BASE_EXCEPTIONS + prefixes: Optional[List[Union[str, Pattern]]] = TOKENIZER_PREFIXES + suffixes: Optional[List[Union[str, Pattern]]] = TOKENIZER_SUFFIXES + infixes: Optional[List[Union[str, Pattern]]] = TOKENIZER_INFIXES + token_match: Optional[Pattern] = None + url_match: Optional[Pattern] = URL_MATCH + syntax_iterators: Dict[str, Callable] = {} + lex_attr_getters: Dict[int, Callable[[str], Any]] = {} + stop_words = set() + writing_system = {"direction": "ltr", "has_case": True, "has_letters": True} + + +@registry.tokenizers("spacy.Tokenizer.v1") +def create_tokenizer() -> Callable[["Language"], Tokenizer]: + def tokenizer_factory(nlp: "Language") -> Tokenizer: + prefixes = nlp.Defaults.prefixes + suffixes = nlp.Defaults.suffixes + infixes = nlp.Defaults.infixes + prefix_search = util.compile_prefix_regex(prefixes).search if prefixes else None + suffix_search = util.compile_suffix_regex(suffixes).search if suffixes else None + infix_finditer = util.compile_infix_regex(infixes).finditer if infixes else None + return Tokenizer( + nlp.vocab, + rules=nlp.Defaults.tokenizer_exceptions, + prefix_search=prefix_search, + suffix_search=suffix_search, + infix_finditer=infix_finditer, + token_match=nlp.Defaults.token_match, + url_match=nlp.Defaults.url_match, + ) + + return tokenizer_factory + + +@registry.lemmatizers("spacy.Lemmatizer.v1") +def create_lemmatizer() -> Callable[["Language"], "Lemmatizer"]: + tables = ["lemma_lookup", "lemma_rules", "lemma_exc", "lemma_index"] + + def lemmatizer_factory(nlp: "Language") -> "Lemmatizer": + lookups = load_lookups(lang=nlp.lang, tables=tables, strict=False) + return Lemmatizer(lookups=lookups) + + return lemmatizer_factory class Language: @@ -65,8 +103,8 @@ class Language: Defaults = BaseDefaults lang: str = None default_config = DEFAULT_CONFIG - factories = 
SimpleFrozenDict(error=Errors.E957) + factories = SimpleFrozenDict(error=Errors.E957) _factory_meta: Dict[str, "FactoryMeta"] = {} # meta by factory def __init__( @@ -75,6 +113,7 @@ class Language: max_length: int = 10 ** 6, meta: Dict[str, Any] = {}, create_tokenizer: Optional[Callable[["Language"], Callable[[str], Doc]]] = None, + create_lemmatizer: Optional[Callable[["Language"], Callable]] = None, **kwargs, ): """Initialise a Language object. @@ -108,7 +147,16 @@ class Language: if vocab is True: vectors_name = meta.get("vectors", {}).get("name") - vocab = Vocab.from_config(self._config, vectors_name=vectors_name) + if not create_lemmatizer: + lemma_cfg = {"lemmatizer": self._config["nlp"]["lemmatizer"]} + create_lemmatizer = registry.make_from_config(lemma_cfg)["lemmatizer"] + # TODO: where does the vocab data come in? + vocab = create_vocab( + self.lang, + self.Defaults, + lemmatizer=create_lemmatizer(self), + vectors_name=vectors_name, + ) else: if (self.lang and vocab.lang) and (self.lang != vocab.lang): raise ValueError(Errors.E150.format(nlp=self.lang, vocab=vocab.lang)) @@ -126,7 +174,10 @@ class Language: def __init_subclass__(cls, **kwargs): super().__init_subclass__(**kwargs) - cls.default_config = util.deep_merge_configs(cls.default_config, DEFAULT_CONFIG) + cls.default_config = util.deep_merge_configs( + cls.Defaults.config, DEFAULT_CONFIG + ) + cls.default_config["nlp"]["lang"] = cls.lang @property def path(self): @@ -1226,17 +1277,16 @@ class Language: config = util.deep_merge_configs(config, cls.default_config) if "nlp" not in config: raise ValueError(Errors.E985.format(config=config)) - nlp_config = config["nlp"] - config_lang = nlp_config["lang"] + config_lang = config["nlp"]["lang"] if cls.lang is not None and config_lang is not None and config_lang != cls.lang: raise ValueError( Errors.E958.format( - bad_lang_code=nlp_config["lang"], + bad_lang_code=config["nlp"]["lang"], lang_code=cls.lang, lang=util.get_object_name(cls), ) ) - nlp_config["lang"] = cls.lang + config["nlp"]["lang"] = cls.lang # This isn't very elegant, but we remove the [components] block here to prevent # it from getting resolved (causes problems because we expect to pass in # the nlp and name args for each component). 
If we're auto-filling, we're @@ -1251,22 +1301,12 @@ class Language: filled["components"] = orig_pipeline config["components"] = orig_pipeline create_tokenizer = resolved["nlp"]["tokenizer"] - lemmatizer = resolved["nlp"]["lemmatizer"] - lex_attr_getters = resolved["nlp"]["lex_attr_getters"] - stop_words = resolved["nlp"]["stop_words"] - vocab_data = resolved["nlp"]["vocab_data"] - get_noun_chunks = resolved["nlp"]["get_noun_chunks"] - vocab = Vocab.from_config( - filled, - lemmatizer=lemmatizer, - lex_attr_getters=lex_attr_getters, - stop_words=stop_words, - vocab_data=vocab_data, - get_noun_chunks=get_noun_chunks, + create_lemmatizer = resolved["nlp"]["lemmatizer"] + nlp = cls( + create_tokenizer=create_tokenizer, create_lemmatizer=create_lemmatizer, ) - nlp = cls(vocab, create_tokenizer=create_tokenizer) pipeline = config.get("components", {}) - for pipe_name in nlp_config["pipeline"]: + for pipe_name in config["nlp"]["pipeline"]: if pipe_name not in pipeline: opts = ", ".join(pipeline.keys()) raise ValueError(Errors.E956.format(name=pipe_name, opts=opts)) diff --git a/spacy/lemmatizer.py b/spacy/lemmatizer.py index 8255b4b36..1cfb681f4 100644 --- a/spacy/lemmatizer.py +++ b/spacy/lemmatizer.py @@ -2,12 +2,6 @@ from typing import Optional, Callable, List, Dict from .lookups import Lookups from .parts_of_speech import NAMES as UPOS_NAMES -from .util import registry - - -@registry.lemmatizers("spacy.Lemmatizer.v1") -def create_lemmatizer(data: Dict[str, str] = {}) -> "Lemmatizer": - return Lemmatizer(data=data) class Lemmatizer: @@ -21,7 +15,6 @@ class Lemmatizer: def __init__( self, lookups: Optional[Lookups] = None, - data: Dict[str, dict] = {}, is_base_form: Optional[Callable] = None, ) -> None: """Initialize a Lemmatizer. @@ -31,9 +24,6 @@ class Lemmatizer: RETURNS (Lemmatizer): The newly constructed object. """ self.lookups = lookups if lookups is not None else Lookups() - for name, table in data.items(): - if table is not None: - self.lookups.add_table(name, table) self.is_base_form = is_base_form def __call__( diff --git a/spacy/lookups.py b/spacy/lookups.py index d5def882e..e5a4a0b40 100644 --- a/spacy/lookups.py +++ b/spacy/lookups.py @@ -13,7 +13,9 @@ UNSET = object() @registry.language_data("spacy-lookups-data") -def get_lookups(lang: str, tables: List[str]) -> Optional[Dict[str, Any]]: +def load_lookups( + lang: str, tables: List[str], strict: bool = True +) -> Optional[Dict[str, Any]]: """Load the data from the spacy-lookups-data package for a given language, if available. Returns an empty dict if there's no data or if the package is not installed. @@ -24,15 +26,19 @@ def get_lookups(lang: str, tables: List[str]) -> Optional[Dict[str, Any]]: RETURNS (Dict[str, Any]): The lookups, keyed by table name. """ # TODO: import spacy_lookups_data instead of going via entry points here? 
+ lookups = Lookups() if lang not in registry.lookups: - return {} + return lookups data = registry.lookups.get(lang) - result = {} for table in tables: if table not in data: - raise ValueError("TODO: unknown table") - result[table] = load_language_data(data[table]) - return result + if strict: + raise ValueError("TODO: unknown table") + language_data = {} + else: + language_data = load_language_data(data[table]) + lookups.add_table(table, language_data) + return lookups class Lookups: diff --git a/spacy/schemas.py b/spacy/schemas.py index 8b6e3ebab..ad16f3233 100644 --- a/spacy/schemas.py +++ b/spacy/schemas.py @@ -239,11 +239,7 @@ class ConfigSchemaNlp(BaseModel): pipeline: List[StrictStr] = Field(..., title="The pipeline component names in order") tokenizer: Callable = Field(..., title="The tokenizer to use") lemmatizer: Callable = Field(..., title="The lemmatizer to use") - writing_system: ConfigSchemaNlpWritingSystem = Field(..., title="The language's writing system") - stop_words: Sequence[StrictStr] = Field(..., title="Stop words to mark via Token/Lexeme.is_stop") - lex_attr_getters: Dict[StrictStr, Callable] = Field(..., title="Custom getter functions for lexical attributes (e.g. like_num)") vocab_data: Dict[StrictStr, Dict[StrictStr, Any]] = Field(..., title="Vocabulary data, e.g. lexeme normalization tables") - get_noun_chunks: Optional[Callable] = Field(..., title="Function to extract noun phrases from a Doc") # fmt: on class Config: diff --git a/spacy/tests/conftest.py b/spacy/tests/conftest.py index a2e319e12..cfdb8e4ff 100644 --- a/spacy/tests/conftest.py +++ b/spacy/tests/conftest.py @@ -257,7 +257,7 @@ def zh_tokenizer_char(): def zh_tokenizer_jieba(): pytest.importorskip("jieba") config = { - "@tokenizers": "spacy.ChineseTokenizer.v1", + "@tokenizers": "spacy.zh.ChineseTokenizer", "segmenter": "jieba", } nlp = get_lang_class("zh").from_config({"nlp": {"tokenizer": config}}) @@ -268,7 +268,7 @@ def zh_tokenizer_jieba(): def zh_tokenizer_pkuseg(): pytest.importorskip("pkuseg") config = { - "@tokenizers": "spacy.ChineseTokenizer.v1", + "@tokenizers": "spacy.zh.ChineseTokenizer", "segmenter": "pkuseg", "pkuseg_model": "default", } diff --git a/spacy/tokenizer.pyx b/spacy/tokenizer.pyx index 5fffa4503..6268a77ae 100644 --- a/spacy/tokenizer.pyx +++ b/spacy/tokenizer.pyx @@ -26,37 +26,6 @@ from .attrs import intify_attrs from .symbols import ORTH -@registry.tokenizers("spacy.Tokenizer.v1") -def create_tokenizer( - # exceptions: Dict[str, List[dict]], - # prefixes: Optional[List[Union[str, Pattern]]], - # suffixes: Optional[List[Union[str, Pattern]]], - # infixes: Optional[List[Union[str, Pattern]]], - # We currently can't validate against Pattern because that will cause - # Pydantic to parse value *as* pattern - token_match: Optional[Any] = None, - url_match: Optional[Any] = None, -) -> "Tokenizer": - def tokenizer_factory(nlp): - exceptions = nlp.Defaults.tokenizer_exceptions - prefixes = nlp.Defaults.prefixes - suffixes = nlp.Defaults.suffixes - infixes = nlp.Defaults.infixes - prefix_search = util.compile_prefix_regex(prefixes).search if prefixes else None - suffix_search = util.compile_suffix_regex(suffixes).search if suffixes else None - infix_finditer = util.compile_infix_regex(infixes).finditer if infixes else None - return Tokenizer( - nlp.vocab, - rules=exceptions, - prefix_search=prefix_search, - suffix_search=suffix_search, - infix_finditer=infix_finditer, - token_match=token_match, - url_match=url_match, - ) - return tokenizer_factory - - cdef class Tokenizer: 
"""Segment text, and create Doc objects with the discovered segment boundaries. diff --git a/spacy/vocab.pyx b/spacy/vocab.pyx index 1a4959833..0f99a45f5 100644 --- a/spacy/vocab.pyx +++ b/spacy/vocab.pyx @@ -23,6 +23,33 @@ from .lang.norm_exceptions import BASE_NORMS from .lang.lex_attrs import LEX_ATTRS, is_stop, get_lang +def create_vocab(lang, defaults, lemmatizer=None, vocab_data={}, vectors_name=None): + lex_attrs = {**LEX_ATTRS, **defaults.lex_attr_getters} + # This is messy, but it's the minimal working fix to Issue #639. + lex_attrs[IS_STOP] = functools.partial(is_stop, stops=defaults.stop_words) + # Ensure that getter can be pickled + lex_attrs[LANG] = functools.partial(get_lang, lang=lang) + lex_attrs[NORM] = util.add_lookups( + lex_attrs.get(NORM, LEX_ATTRS[NORM]), + BASE_NORMS, + vocab_data.get("lexeme_norm", {}), + ) + lookups = Lookups() + for name, data in vocab_data.items(): + if name not in lookups: + data = data if data is not None else {} + lookups.add_table(name, data) + return Vocab( + lex_attr_getters=lex_attrs, + lemmatizer=lemmatizer, + lookups=lookups, + writing_system=defaults.writing_system, + get_noun_chunks=defaults.syntax_iterators.get("noun_chunks"), + vectors_name=vectors_name, + ) + + + cdef class Vocab: """A look-up table that allows you to access `Lexeme` objects. The `Vocab` instance also provides access to the `StringStore`, and owns underlying @@ -31,7 +58,7 @@ cdef class Vocab: DOCS: https://spacy.io/api/vocab """ def __init__(self, lex_attr_getters=None, lemmatizer=None, - strings=tuple(), lookups=None, tag_map={}, vocab_data={}, + strings=tuple(), lookups=None, tag_map={}, oov_prob=-20., vectors_name=None, writing_system={}, get_noun_chunks=None, **deprecated_kwargs): """Create the vocabulary. @@ -51,10 +78,6 @@ cdef class Vocab: lex_attr_getters = lex_attr_getters if lex_attr_getters is not None else {} if lookups in (None, True, False): lookups = Lookups() - for name, data in vocab_data.items(): - if name not in lookups: - data = data if data is not None else {} - lookups.add_table(name, data) if lemmatizer in (None, True, False): lemmatizer = Lemmatizer(lookups) self.cfg = {'oov_prob': oov_prob} @@ -416,66 +439,6 @@ cdef class Vocab: orth = self.strings.add(orth) return orth in self.vectors - @classmethod - def from_config( - cls, - config, - lemmatizer=None, - lex_attr_getters=None, - stop_words=None, - vocab_data=None, - get_noun_chunks=None, - vectors_name=None, - ): - """Create a Vocab from a config and (currently) language defaults, i.e. - nlp.Defaults. - - config (Dict[str, Any]): The full config. - lemmatizer (Callable): Optional lemmatizer. - vectors_name (str): Optional vectors name. - RETURNS (Vocab): The vocab. 
- """ - # TODO: make this less messy – move lemmatizer out into its own pipeline - # component, move language defaults to config - lang = config["nlp"]["lang"] - writing_system = config["nlp"]["writing_system"] - if not lemmatizer: - lemma_cfg = {"lemmatizer": config["nlp"]["lemmatizer"]} - lemmatizer = registry.make_from_config(lemma_cfg)["lemmatizer"] - if stop_words is None: - stop_words_cfg = {"stop_words": config["nlp"]["stop_words"]} - stop_words = registry.make_from_config(stop_words_cfg)["stop_words"] - if vocab_data is None: - vocab_data_cfg = {"vocab_data": config["nlp"]["vocab_data"]} - vocab_data = registry.make_from_config(vocab_data_cfg)["vocab_data"] - if get_noun_chunks is None: - noun_chunks_cfg = {"get_noun_chunks": config["nlp"]["get_noun_chunks"]} - get_noun_chunks = registry.make_from_config(noun_chunks_cfg)["get_noun_chunks"] - if lex_attr_getters is None: - lex_attrs_cfg = {"lex_attr_getters": config["nlp"]["lex_attr_getters"]} - lex_attr_getters = registry.make_from_config(lex_attrs_cfg)["lex_attr_getters"] - lex_attrs = dict(LEX_ATTRS) - lex_attrs.update(lex_attr_getters) - # This is messy, but it's the minimal working fix to Issue #639. - lex_attrs[IS_STOP] = functools.partial(is_stop, stops=stop_words) - # Ensure that getter can be pickled - lex_attrs[LANG] = functools.partial(get_lang, lang=lang) - lex_attrs[NORM] = util.add_lookups( - lex_attrs.get(NORM, LEX_ATTRS[NORM]), - BASE_NORMS, - vocab_data.get("lexeme_norm", {}), - ) - vocab = cls( - lex_attr_getters=lex_attrs, - vocab_data=vocab_data, - lemmatizer=lemmatizer, - writing_system=writing_system, - get_noun_chunks=get_noun_chunks - ) - if vocab.vectors.name is None and vectors_name: - vocab.vectors.name = vectors_name - return vocab - def to_disk(self, path, exclude=tuple()): """Save the current state to a directory. From b9aaa4e457f58257ff8318fcb9418e75c0b00797 Mon Sep 17 00:00:00 2001 From: Ines Montani Date: Sat, 25 Jul 2020 11:51:30 +0200 Subject: [PATCH 08/14] Improve vocab data integration and warning --- spacy/default_config.cfg | 1 - spacy/errors.py | 2 +- spacy/pipeline/tagger.pyx | 6 ++++-- spacy/schemas.py | 1 - spacy/syntax/nn_parser.pyx | 6 ++++-- spacy/tests/parser/test_ner.py | 3 ++- spacy/util.py | 1 + spacy/vocab.pyx | 19 ++++++++++--------- 8 files changed, 22 insertions(+), 17 deletions(-) diff --git a/spacy/default_config.cfg b/spacy/default_config.cfg index 7ba008fb6..f1786e04b 100644 --- a/spacy/default_config.cfg +++ b/spacy/default_config.cfg @@ -1,6 +1,5 @@ [nlp] lang = null -vocab_data = {} pipeline = [] [nlp.tokenizer] diff --git a/spacy/errors.py b/spacy/errors.py index 04d831c41..07c3df686 100644 --- a/spacy/errors.py +++ b/spacy/errors.py @@ -83,7 +83,7 @@ class Warnings: "doesn't have a normalization table, please ignore this warning. " "If this is surprising, make sure you have the spacy-lookups-data " "package installed. 
The languages with lexeme normalization tables " - "are currently: da, de, el, en, id, lb, pt, ru, sr, ta, th.") + "are currently: {langs}") # TODO: fix numbering after merging develop into master W091 = ("Could not clean/remove the temp directory at {dir}: {msg}.") diff --git a/spacy/pipeline/tagger.pyx b/spacy/pipeline/tagger.pyx index e4250b932..dfbb943f8 100644 --- a/spacy/pipeline/tagger.pyx +++ b/spacy/pipeline/tagger.pyx @@ -184,8 +184,10 @@ class Tagger(Pipe): lemma_tables = ["lemma_rules", "lemma_index", "lemma_exc", "lemma_lookup"] if not any(table in self.vocab.lookups for table in lemma_tables): warnings.warn(Warnings.W022) - if len(self.vocab.lookups.get_table("lexeme_norm", {})) == 0: - warnings.warn(Warnings.W033.format(model="part-of-speech tagger")) + lexeme_norms = self.vocab.lookups.get_table("lexeme_norm", {}) + if len(lexeme_norms) == 0 and self.vocab.lang in util.LEXEME_NORM_LANGS: + langs = ", ".join(util.LEXEME_NORM_LANGS) + warnings.warn(Warnings.W033.format(model="part-of-speech tagger", langs=langs)) orig_tag_map = dict(self.vocab.morphology.tag_map) new_tag_map = {} for example in get_examples(): diff --git a/spacy/schemas.py b/spacy/schemas.py index ad16f3233..e55123e14 100644 --- a/spacy/schemas.py +++ b/spacy/schemas.py @@ -239,7 +239,6 @@ class ConfigSchemaNlp(BaseModel): pipeline: List[StrictStr] = Field(..., title="The pipeline component names in order") tokenizer: Callable = Field(..., title="The tokenizer to use") lemmatizer: Callable = Field(..., title="The lemmatizer to use") - vocab_data: Dict[StrictStr, Dict[StrictStr, Any]] = Field(..., title="Vocabulary data, e.g. lexeme normalization tables") # fmt: on class Config: diff --git a/spacy/syntax/nn_parser.pyx b/spacy/syntax/nn_parser.pyx index 09616ee75..f640e2e8d 100644 --- a/spacy/syntax/nn_parser.pyx +++ b/spacy/syntax/nn_parser.pyx @@ -431,8 +431,10 @@ cdef class Parser: def begin_training(self, get_examples, pipeline=None, sgd=None, **kwargs): self.cfg.update(kwargs) - if len(self.vocab.lookups.get_table("lexeme_norm", {})) == 0: - warnings.warn(Warnings.W033.format(model="parser or NER")) + lexeme_norms = self.vocab.lookups.get_table("lexeme_norm", {}) + if len(lexeme_norms) == 0 and self.vocab.lang in util.LEXEME_NORM_LANGS: + langs = ", ".join(util.LEXEME_NORM_LANGS) + warnings.warn(Warnings.W033.format(model="parser or NER", langs=langs)) if not hasattr(get_examples, '__call__'): gold_tuples = get_examples get_examples = lambda: gold_tuples diff --git a/spacy/tests/parser/test_ner.py b/spacy/tests/parser/test_ner.py index 71539fe60..4a6bf73a5 100644 --- a/spacy/tests/parser/test_ner.py +++ b/spacy/tests/parser/test_ner.py @@ -342,7 +342,8 @@ def test_overfitting_IO(): def test_ner_warns_no_lookups(): - nlp = Language() + nlp = English() + assert nlp.lang in util.LEXEME_NORM_LANGS nlp.vocab.lookups = Lookups() assert not len(nlp.vocab.lookups) nlp.add_pipe("ner") diff --git a/spacy/util.py b/spacy/util.py index 0d732034f..18ce7e474 100644 --- a/spacy/util.py +++ b/spacy/util.py @@ -53,6 +53,7 @@ if TYPE_CHECKING: _PRINT_ENV = False OOV_RANK = numpy.iinfo(numpy.uint64).max +LEXEME_NORM_LANGS = ["da", "de", "el", "en", "id", "lb", "pt", "ru", "sr", "ta", "th"] class registry(thinc.registry): diff --git a/spacy/vocab.pyx b/spacy/vocab.pyx index 0f99a45f5..56e62834a 100644 --- a/spacy/vocab.pyx +++ b/spacy/vocab.pyx @@ -17,13 +17,20 @@ from .lemmatizer import Lemmatizer from .attrs import intify_attrs, NORM, IS_STOP from .vectors import Vectors from .util import link_vectors_to_models, registry 
-from .lookups import Lookups
+from .lookups import Lookups, load_lookups
 from . import util
 from .lang.norm_exceptions import BASE_NORMS
 from .lang.lex_attrs import LEX_ATTRS, is_stop, get_lang
 
 
-def create_vocab(lang, defaults, lemmatizer=None, vocab_data={}, vectors_name=None):
+def create_vocab(lang, defaults, lemmatizer=None, vectors_name=None, load_lookups_data=True):
+    # If the spacy-lookups-data package is installed, we pre-populate the lookups
+    # with lexeme data, if available
+    if load_lookups_data:
+        tables = ["lexeme_norm", "lexeme_prob", "lexeme_cluster", "lexeme_settings"]
+        lookups = load_lookups(lang, tables=tables, strict=False)
+    else:
+        lookups = Lookups()
     lex_attrs = {**LEX_ATTRS, **defaults.lex_attr_getters}
     # This is messy, but it's the minimal working fix to Issue #639.
     lex_attrs[IS_STOP] = functools.partial(is_stop, stops=defaults.stop_words)
@@ -32,13 +39,8 @@ def create_vocab(lang, defaults, lemmatizer=None, vectors_name=No
     lex_attrs[NORM] = util.add_lookups(
         lex_attrs.get(NORM, LEX_ATTRS[NORM]),
         BASE_NORMS,
-        vocab_data.get("lexeme_norm", {}),
+        lookups.get_table("lexeme_norm", {}),
     )
-    lookups = Lookups()
-    for name, data in vocab_data.items():
-        if name not in lookups:
-            data = data if data is not None else {}
-            lookups.add_table(name, data)
     return Vocab(
         lex_attr_getters=lex_attrs,
         lemmatizer=lemmatizer,
@@ -49,7 +51,6 @@ def create_vocab(lang, defaults, lemmatizer=None, vectors_name=No
     )
 
 
-
 cdef class Vocab:
     """A look-up table that allows you to access `Lexeme` objects. The
     `Vocab` instance also provides access to the `StringStore`, and owns underlying

From 8d9d28eb8b1fb8484631ba4c690a9a1aacadb08d Mon Sep 17 00:00:00 2001
From: Ines Montani
Date: Sat, 25 Jul 2020 12:14:28 +0200
Subject: [PATCH 09/14] Re-add setting for vocab data and tidy up

---
 spacy/default_config.cfg |  1 +
 spacy/language.py        | 21 ++++++++++++---------
 spacy/schemas.py         | 10 +---------
 spacy/util.py            |  6 ++++--
 spacy/vocab.pyx          |  4 ++--
 5 files changed, 20 insertions(+), 22 deletions(-)

diff --git a/spacy/default_config.cfg b/spacy/default_config.cfg
index f1786e04b..258b8634a 100644
--- a/spacy/default_config.cfg
+++ b/spacy/default_config.cfg
@@ -1,6 +1,7 @@
 [nlp]
 lang = null
 pipeline = []
+load_vocab_data = true
 
 [nlp.tokenizer]
 @tokenizers = "spacy.Tokenizer.v1"
diff --git a/spacy/language.py b/spacy/language.py
index 6d2ae3dbe..dc6167ef2 100644
--- a/spacy/language.py
+++ b/spacy/language.py
@@ -121,15 +121,18 @@ class Language:
         vocab (Vocab): A `Vocab` object. If `True`, a vocab is created.
         meta (dict): Custom meta data for the Language class. Is written to by
             models to add model meta data.
-        max_length (int) :
-            Maximum number of characters in a single text. The current models
-            may run out memory on extremely long texts, due to large internal
-            allocations. You should segment these texts into meaningful units,
-            e.g. paragraphs, subsections etc, before passing them to spaCy.
-            Default maximum length is 1,000,000 characters (1mb). As a rule of
-            thumb, if all pipeline components are enabled, spaCy's default
-            models currently requires roughly 1GB of temporary memory per
+        max_length (int): Maximum number of characters in a single text. The
+            current models may run out memory on extremely long texts, due to
+            large internal allocations. You should segment these texts into
+            meaningful units, e.g. paragraphs, subsections etc, before passing
+            them to spaCy. Default maximum length is 1,000,000 characters (1mb).
As + a rule of thumb, if all pipeline components are enabled, spaCy's + default models currently requires roughly 1GB of temporary memory per 100,000 characters in one text. + create_tokenizer (Callable): Function that takes the nlp object and + returns a tokenizer. + create_lemmatizer (Callable): Function that takes the nlp object and + returns a lemmatizer. RETURNS (Language): The newly constructed object. """ # We're only calling this to import all factories provided via entry @@ -150,12 +153,12 @@ class Language: if not create_lemmatizer: lemma_cfg = {"lemmatizer": self._config["nlp"]["lemmatizer"]} create_lemmatizer = registry.make_from_config(lemma_cfg)["lemmatizer"] - # TODO: where does the vocab data come in? vocab = create_vocab( self.lang, self.Defaults, lemmatizer=create_lemmatizer(self), vectors_name=vectors_name, + load_data=self._config["nlp"]["load_vocab_data"], ) else: if (self.lang and vocab.lang) and (self.lang != vocab.lang): diff --git a/spacy/schemas.py b/spacy/schemas.py index e55123e14..c6bdd6e9c 100644 --- a/spacy/schemas.py +++ b/spacy/schemas.py @@ -224,21 +224,13 @@ class ConfigSchemaTraining(BaseModel): arbitrary_types_allowed = True -class ConfigSchemaNlpWritingSystem(BaseModel): - direction: StrictStr = Field(..., title="The writing direction, e.g. 'rtl'") - has_case: StrictBool = Field(..., title="Whether the language has case") - has_letters: StrictBool = Field(..., title="Whether the language has letters") - - class Config: - extra = "allow" - - class ConfigSchemaNlp(BaseModel): # fmt: off lang: StrictStr = Field(..., title="The base language to use") pipeline: List[StrictStr] = Field(..., title="The pipeline component names in order") tokenizer: Callable = Field(..., title="The tokenizer to use") lemmatizer: Callable = Field(..., title="The lemmatizer to use") + load_vocab_data: StrictBool = Field(..., title="Whether to load additional vocab data from spacy-lookups-data") # fmt: on class Config: diff --git a/spacy/util.py b/spacy/util.py index 18ce7e474..3b6ba0f25 100644 --- a/spacy/util.py +++ b/spacy/util.py @@ -188,8 +188,10 @@ def load_model( """Load a model from a package or data path. name (str): Package name or model path. - **overrides: Specific overrides, like pipeline components to disable. - RETURNS (Language): `Language` class with the loaded model. + disable (Iterable[str]): Names of pipeline components to disable. + component_cfg (Dict[str, dict]): Config overrides for pipeline components, + keyed by component names. + RETURNS (Language): The loaded nlp object. 
""" cfg = component_cfg if isinstance(name, str): # name or string path diff --git a/spacy/vocab.pyx b/spacy/vocab.pyx index 56e62834a..2115789e6 100644 --- a/spacy/vocab.pyx +++ b/spacy/vocab.pyx @@ -23,10 +23,10 @@ from .lang.norm_exceptions import BASE_NORMS from .lang.lex_attrs import LEX_ATTRS, is_stop, get_lang -def create_vocab(lang, defaults, lemmatizer=None, vectors_name=None, load_lookups_data=True): +def create_vocab(lang, defaults, lemmatizer=None, vectors_name=None, load_data=True): # If the spacy-lookups-data package is installed, we pre-populate the lookups # with lexeme data, if available - if load_lookups_data: + if load_data: tables = ["lexeme_norm", "lexeme_prob", "lexeme_cluster", "lexeme_settings"] lookups = load_lookups(lang, tables=tables, strict=False) else: From a063a82c40aae7850f476490600c452c6312a164 Mon Sep 17 00:00:00 2001 From: Ines Montani Date: Sat, 25 Jul 2020 12:14:37 +0200 Subject: [PATCH 10/14] Tidy up __init__.py --- setup.cfg | 1 - spacy/__init__.py | 50 ++++++++++++++++++++++++++++++++--------------- 2 files changed, 34 insertions(+), 17 deletions(-) diff --git a/setup.cfg b/setup.cfg index 2abb1dcb8..dbe2c25fd 100644 --- a/setup.cfg +++ b/setup.cfg @@ -104,7 +104,6 @@ exclude = .git, __pycache__, _tokenizer_exceptions_list.py, - spacy/__init__.py [tool:pytest] markers = diff --git a/spacy/__init__.py b/spacy/__init__.py index e9783b161..da2b23a20 100644 --- a/spacy/__init__.py +++ b/spacy/__init__.py @@ -1,32 +1,50 @@ +from typing import Union, Iterable, Dict, Any +from pathlib import Path import warnings import sys -warnings.filterwarnings("ignore", message="numpy.dtype size changed") -warnings.filterwarnings("ignore", message="numpy.ufunc size changed") +warnings.filterwarnings("ignore", message="numpy.dtype size changed") # noqa +warnings.filterwarnings("ignore", message="numpy.ufunc size changed") # noqa # These are imported as part of the API -from thinc.api import prefer_gpu, require_gpu +from thinc.api import prefer_gpu, require_gpu # noqa: F401 -from . import pipeline -from .cli.info import info -from .glossary import explain -from .about import __version__ -from .errors import Errors, Warnings +from . import pipeline # noqa: F401 +from .cli.info import info # noqa: F401 +from .glossary import explain # noqa: F401 +from .about import __version__ # noqa: F401 +from .util import registry # noqa: F401 + +from .errors import Errors +from .language import Language from . import util -from .util import registry - if sys.maxunicode == 65535: raise SystemError(Errors.E130) -config = registry +def load( + name: Union[str, Path], + disable: Iterable[str] = tuple(), + component_cfg: Dict[str, Dict[str, Any]] = util.SimpleFrozenDict(), +) -> Language: + """Load a spaCy model from an installed package or a local path. + + name (str): Package name or model path. + disable (Iterable[str]): Names of pipeline components to disable. + component_cfg (Dict[str, dict]): Config overrides for pipeline components, + keyed by component names. + RETURNS (Language): The loaded nlp object. + """ + return util.load_model(name, disable=disable, component_cfg=component_cfg) -def load(name, **overrides): - return util.load_model(name, **overrides) +def blank(name: str, **overrides) -> Language: + """Create a blank nlp object for a given language code. - -def blank(name, **kwargs): + name (str): The language code, e.g. "en". + **overrides: Keyword arguments passed to language subclass on init. + RETURNS (Language): The nlp object. 
+ """ LangClass = util.get_lang_class(name) - return LangClass(**kwargs) + return LangClass(**overrides) From c003d26b9446de4f67f9efc21a7905184db308a9 Mon Sep 17 00:00:00 2001 From: Ines Montani Date: Sat, 25 Jul 2020 12:21:37 +0200 Subject: [PATCH 11/14] Tidy up --- spacy/errors.py | 1 + spacy/lookups.py | 4 ++-- spacy/util.py | 1 - 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/spacy/errors.py b/spacy/errors.py index 07c3df686..fb50f913d 100644 --- a/spacy/errors.py +++ b/spacy/errors.py @@ -483,6 +483,7 @@ class Errors: E199 = ("Unable to merge 0-length span at doc[{start}:{end}].") # TODO: fix numbering after merging develop into master + E955 = ("Can't find table '{table}' for language '{lang}' in spacy-lookups-data.") E956 = ("Can't find component '{name}' in [components] block in the config. " "Available components: {opts}") E957 = ("Writing directly to Language.factories isn't needed anymore in " diff --git a/spacy/lookups.py b/spacy/lookups.py index e5a4a0b40..bf71ba877 100644 --- a/spacy/lookups.py +++ b/spacy/lookups.py @@ -12,7 +12,6 @@ from .strings import get_string_id UNSET = object() -@registry.language_data("spacy-lookups-data") def load_lookups( lang: str, tables: List[str], strict: bool = True ) -> Optional[Dict[str, Any]]: @@ -23,6 +22,7 @@ def load_lookups( lang (str): The language code (corresponds to entry point exposed by the spacy-lookups-data package). tables (List[str]): Name of tables to load, e.g. ["lemma_lookup", "lemma_exc"] + strict (bool): Whether to raise an error if a table doesn't exist. RETURNS (Dict[str, Any]): The lookups, keyed by table name. """ # TODO: import spacy_lookups_data instead of going via entry points here? @@ -33,7 +33,7 @@ def load_lookups( for table in tables: if table not in data: if strict: - raise ValueError("TODO: unknown table") + raise ValueError(Errors.E955.format(table=table, lang=lang)) language_data = {} else: language_data = load_language_data(data[table]) diff --git a/spacy/util.py b/spacy/util.py index 3b6ba0f25..682d45bc9 100644 --- a/spacy/util.py +++ b/spacy/util.py @@ -62,7 +62,6 @@ class registry(thinc.registry): tokenizers = catalogue.create("spacy", "tokenizers", entry_points=True) lemmatizers = catalogue.create("spacy", "lemmatizers", entry_points=True) lookups = catalogue.create("spacy", "lookups", entry_points=True) - language_data = catalogue.create("spacy", "language_data", entry_points=True) displacy_colors = catalogue.create("spacy", "displacy_colors", entry_points=True) assets = catalogue.create("spacy", "assets", entry_points=True) # These are factories registered via third-party packages and the From 4a0a692875ffcdc15ca94512a8c2c9774505d361 Mon Sep 17 00:00:00 2001 From: Ines Montani Date: Sat, 25 Jul 2020 12:55:18 +0200 Subject: [PATCH 12/14] Add missing lex_attr_getters (resolves #5806 ) --- spacy/lang/ko/__init__.py | 2 ++ spacy/lang/ml/__init__.py | 2 ++ spacy/lang/ro/__init__.py | 2 ++ spacy/lang/tr/__init__.py | 2 ++ 4 files changed, 8 insertions(+) diff --git a/spacy/lang/ko/__init__.py b/spacy/lang/ko/__init__.py index d2af9c4b1..6197ab927 100644 --- a/spacy/lang/ko/__init__.py +++ b/spacy/lang/ko/__init__.py @@ -3,6 +3,7 @@ from thinc.api import Config from .stop_words import STOP_WORDS from .tag_map import TAG_MAP +from .lex_attrs import LEX_ATTRS from ...language import Language from ...tokens import Doc from ...compat import copy_reg @@ -64,6 +65,7 @@ class KoreanTokenizer(DummyTokenizer): class KoreanDefaults(Language.Defaults): config = Config().from_str(DEFAULT_CONFIG) + 
lex_attr_getters = LEX_ATTRS stop_words = STOP_WORDS writing_system = {"direction": "ltr", "has_case": False, "has_letters": False} diff --git a/spacy/lang/ml/__init__.py b/spacy/lang/ml/__init__.py index 166d0e061..cfad52261 100644 --- a/spacy/lang/ml/__init__.py +++ b/spacy/lang/ml/__init__.py @@ -1,8 +1,10 @@ from .stop_words import STOP_WORDS +from .lex_attrs import LEX_ATTRS from ...language import Language class MalayalamDefaults(Language.Defaults): + lex_attr_getters = LEX_ATTRS stop_words = STOP_WORDS diff --git a/spacy/lang/ro/__init__.py b/spacy/lang/ro/__init__.py index 74016d3e9..f0d8d8d31 100644 --- a/spacy/lang/ro/__init__.py +++ b/spacy/lang/ro/__init__.py @@ -2,6 +2,7 @@ from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS from .stop_words import STOP_WORDS from .punctuation import TOKENIZER_PREFIXES, TOKENIZER_INFIXES from .punctuation import TOKENIZER_SUFFIXES +from .lex_attrs import LEX_ATTRS from ...language import Language # Lemma data note: @@ -14,6 +15,7 @@ class RomanianDefaults(Language.Defaults): prefixes = TOKENIZER_PREFIXES suffixes = TOKENIZER_SUFFIXES infixes = TOKENIZER_INFIXES + lex_attr_getters = LEX_ATTRS stop_words = STOP_WORDS diff --git a/spacy/lang/tr/__init__.py b/spacy/lang/tr/__init__.py index 70b277487..8bd0b93df 100644 --- a/spacy/lang/tr/__init__.py +++ b/spacy/lang/tr/__init__.py @@ -1,10 +1,12 @@ from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS from .stop_words import STOP_WORDS +from .lex_attrs import LEX_ATTRS from ...language import Language class TurkishDefaults(Language.Defaults): tokenizer_exceptions = TOKENIZER_EXCEPTIONS + lex_attr_getters = LEX_ATTRS stop_words = STOP_WORDS From 49f27a2a7b7c05ed9fad374c7e65d19cba0c94b7 Mon Sep 17 00:00:00 2001 From: Ines Montani Date: Sat, 25 Jul 2020 13:00:49 +0200 Subject: [PATCH 13/14] Tidy up [ci skip] --- spacy/lang/en/punctuation.py | 2 +- spacy/lang/sv/tokenizer_exceptions.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/spacy/lang/en/punctuation.py b/spacy/lang/en/punctuation.py index 67e3e80e5..5d3eb792e 100644 --- a/spacy/lang/en/punctuation.py +++ b/spacy/lang/en/punctuation.py @@ -1,5 +1,5 @@ from ..char_classes import LIST_ELLIPSES, LIST_ICONS, HYPHENS -from ..char_classes import CONCAT_QUOTES, ALPHA_LOWER, ALPHA_UPPER, ALPHA, PUNCT +from ..char_classes import CONCAT_QUOTES, ALPHA_LOWER, ALPHA_UPPER, ALPHA _infixes = ( LIST_ELLIPSES diff --git a/spacy/lang/sv/tokenizer_exceptions.py b/spacy/lang/sv/tokenizer_exceptions.py index 64206f2f2..ce7db895a 100644 --- a/spacy/lang/sv/tokenizer_exceptions.py +++ b/spacy/lang/sv/tokenizer_exceptions.py @@ -151,6 +151,6 @@ for orth in ABBREVIATIONS: # Sentences ending in "i." (as in "... peka i."), "m." (as in "...än 2000 m."), # should be tokenized as two separate tokens. 
for orth in ["i", "m"]: - _exc[orth + "."] = [{ORTH: orth, NORM: orth, NORM: orth}, {ORTH: "."}] + _exc[orth + "."] = [{ORTH: orth, NORM: orth}, {ORTH: "."}] TOKENIZER_EXCEPTIONS = update_exc(BASE_EXCEPTIONS, _exc) From 17f39eebdcfaf2b66de9e7a1be6636ab7ad2356d Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Sat, 25 Jul 2020 13:33:40 +0200 Subject: [PATCH 14/14] Update PTB config --- .../ptb-joint-pos-dep/defaults.cfg | 45 ++++++++++++------- 1 file changed, 30 insertions(+), 15 deletions(-) diff --git a/examples/experiments/ptb-joint-pos-dep/defaults.cfg b/examples/experiments/ptb-joint-pos-dep/defaults.cfg index 1c946ac60..0df2e501d 100644 --- a/examples/experiments/ptb-joint-pos-dep/defaults.cfg +++ b/examples/experiments/ptb-joint-pos-dep/defaults.cfg @@ -1,4 +1,5 @@ [training] +max_steps = 0 patience = 10000 eval_frequency = 200 dropout = 0.2 @@ -8,13 +9,20 @@ max_epochs = 100 orth_variant_level = 0.0 gold_preproc = true max_length = 0 -use_gpu = -1 scores = ["tags_acc", "uas", "las"] score_weights = {"las": 0.8, "tags_acc": 0.2} limit = 0 seed = 0 accumulate_gradient = 2 discard_oversize = false +raw_text = null +tag_map = null +morph_rules = null +base_model = null + +eval_batch_size = 128 +use_pytorch_for_gpu_memory = false +batch_by = "padded" [training.batch_size] @schedules = "compounding.v1" @@ -30,41 +38,48 @@ beta2 = 0.999 [nlp] lang = "en" -vectors = ${training:vectors} +pipeline = ["tok2vec", "tagger", "parser"] +load_vocab_data = false -[nlp.pipeline.tok2vec] +[nlp.tokenizer] +@tokenizers = "spacy.Tokenizer.v1" + +[nlp.lemmatizer] +@lemmatizers = "spacy.Lemmatizer.v1" + +[components] + +[components.tok2vec] factory = "tok2vec" -[nlp.pipeline.tagger] +[components.tagger] factory = "tagger" -[nlp.pipeline.parser] +[components.parser] factory = "parser" learn_tokens = false min_action_freq = 1 -beam_width = 1 -beam_update_prob = 1.0 -[nlp.pipeline.tagger.model] +[components.tagger.model] @architectures = "spacy.Tagger.v1" -[nlp.pipeline.tagger.model.tok2vec] +[components.tagger.model.tok2vec] @architectures = "spacy.Tok2VecTensors.v1" -width = ${nlp.pipeline.tok2vec.model:width} +width = ${components.tok2vec.model:width} -[nlp.pipeline.parser.model] +[components.parser.model] @architectures = "spacy.TransitionBasedParser.v1" nr_feature_tokens = 8 hidden_width = 64 maxout_pieces = 3 -[nlp.pipeline.parser.model.tok2vec] +[components.parser.model.tok2vec] @architectures = "spacy.Tok2VecTensors.v1" -width = ${nlp.pipeline.tok2vec.model:width} +width = ${components.tok2vec.model:width} -[nlp.pipeline.tok2vec.model] +[components.tok2vec.model] @architectures = "spacy.HashEmbedCNN.v1" -pretrained_vectors = ${nlp:vectors} +pretrained_vectors = ${training:vectors} width = 96 depth = 4 window_size = 1
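For orientation, a minimal usage sketch (not part of the patches above) of how the reworked vocab data loading is meant to behave on this branch: spacy.blank builds the pipeline from the default config, create_vocab pre-populates nlp.vocab.lookups from spacy-lookups-data when the new [nlp] setting load_vocab_data = true and that package is installed, and the W033 warning is limited to the languages in util.LEXEME_NORM_LANGS. The table names and the strict=False behaviour follow the definitions in the patches; this assumes a build of spaCy from this branch and is only a sketch.

import spacy
from spacy.lookups import load_lookups
from spacy.util import LEXEME_NORM_LANGS

# Build a blank English pipeline from the default config. With
# load_vocab_data = true (the default), create_vocab tries to pre-populate
# the vocab's lookups from spacy-lookups-data, if that package is installed.
nlp = spacy.blank("en")
print(nlp.vocab.lookups.tables)  # e.g. ['lexeme_norm', 'lexeme_prob', ...] or []

# W033 (missing lexeme normalization table) is now only emitted for
# languages that actually ship such a table:
print(LEXEME_NORM_LANGS)

# Tables can also be loaded directly; with strict=False a missing table is
# returned empty instead of raising E955.
lookups = load_lookups("en", tables=["lexeme_norm"], strict=False)
print(len(lookups.get_table("lexeme_norm", {})))

With load_vocab_data = false, as in the PTB config above, the lookups stay empty and training does not depend on the external data package.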