From a5cd2032843b26fbff9d6e0b53637e9477af3f7f Mon Sep 17 00:00:00 2001 From: adrianeboyd Date: Tue, 19 May 2020 15:59:14 +0200 Subject: [PATCH] Reduce stored lexemes data, move feats to lookups (#5238) * Reduce stored lexemes data, move feats to lookups * Move non-derivable lexemes features (`norm / cluster / prob`) to `spacy-lookups-data` as lookups * Get/set `norm` in both lookups and `LexemeC`, serialize in lookups * Remove `cluster` and `prob` from `LexemeC`, get/set/serialize in lookups only * Remove serialization of lexemes data as `vocab/lexemes.bin` * Remove `SerializedLexemeC` * Remove `Lexeme.to_bytes/from_bytes` * Modify normalization exception loading: * Always create `Vocab.lookups` table `lexeme_norm` for normalization exceptions * Load base exceptions from `lang.norm_exceptions`, but load language-specific exceptions from lookups * Set `lex_attr_getters[NORM]` including new lookups table in `BaseDefaults.create_vocab()` and when deserializing `Vocab` * Remove all cached lexemes when deserializing vocab to override existing normalizations with the new normalizations (as a replacement for the previous step that replaced all lexemes data with the deserialized data) * Skip English normalization test Skip English normalization test because the data is now in `spacy-lookups-data`. * Remove norm exceptions Moved to spacy-lookups-data. * Move norm exceptions test to spacy-lookups-data * Load extra lookups from spacy-lookups-data lazily Load extra lookups (currently for cluster and prob) lazily from the entry point `lg_extra` as `Vocab.lookups_extra`. * Skip creating lexeme cache on load To improve model loading times, do not create the full lexeme cache when loading. The lexemes will be created on demand when processing. * Identify numeric values in Lexeme.set_attrs() With the removal of a special case for `PROB`, also identify `float` to avoid trying to convert it with the `StringStore`. 
* Skip lexeme cache init in from_bytes * Unskip and update lookups tests for python3.6+ * Update vocab pickle to include lookups_extra * Update vocab serialization tests Check strings rather than lexemes since lexemes aren't initialized automatically, account for addition of "_SP". * Re-skip lookups test because of python3.5 * Skip PROB/float values in Lexeme.set_attrs * Convert is_oov from lexeme flag to lex in vectors Instead of storing `is_oov` as a lexeme flag, `is_oov` reports whether the lexeme has a vector. Co-authored-by: Matthew Honnibal --- spacy/attrs.pxd | 2 +- spacy/attrs.pyx | 2 +- spacy/cli/init_model.py | 7 +- spacy/cli/train.py | 10 - spacy/lang/da/__init__.py | 8 +- spacy/lang/da/norm_exceptions.py | 527 ---- spacy/lang/de/__init__.py | 9 +- spacy/lang/de/norm_exceptions.py | 16 - spacy/lang/el/__init__.py | 9 +- spacy/lang/el/norm_exceptions.py | 2642 ----------------- spacy/lang/en/__init__.py | 9 +- spacy/lang/en/norm_exceptions.py | 1768 ----------- spacy/lang/id/__init__.py | 9 +- spacy/lang/id/norm_exceptions.py | 532 ---- spacy/lang/lb/__init__.py | 9 +- spacy/lang/lb/norm_exceptions.py | 16 - spacy/lang/lex_attrs.py | 15 - spacy/lang/pt/__init__.py | 9 +- spacy/lang/pt/norm_exceptions.py | 23 - spacy/lang/ru/__init__.py | 9 +- spacy/lang/ru/norm_exceptions.py | 36 - spacy/lang/sr/__init__.py | 9 +- spacy/lang/sr/norm_exceptions.py | 26 - spacy/lang/ta/norm_exceptions.py | 139 - spacy/lang/th/__init__.py | 9 +- spacy/lang/th/norm_exceptions.py | 113 - spacy/language.py | 6 +- spacy/lexeme.pxd | 24 +- spacy/lexeme.pyx | 84 +- spacy/lookups.py | 8 +- spacy/structs.pxd | 23 - spacy/symbols.pxd | 2 +- spacy/symbols.pyx | 2 +- spacy/tests/lang/da/test_exceptions.py | 8 - spacy/tests/lang/de/test_exceptions.py | 14 - spacy/tests/lang/en/test_exceptions.py | 1 + spacy/tests/lang/lb/test_exceptions.py | 6 - .../serialize/test_serialize_vocab_strings.py | 24 +- spacy/tests/test_lemmatizer.py | 2 +- spacy/tests/vocab_vectors/test_lexeme.py | 13 - 
spacy/tests/vocab_vectors/test_lookups.py | 6 +- spacy/tests/vocab_vectors/test_vectors.py | 12 + spacy/tokens/token.pyx | 10 +- spacy/vocab.pxd | 1 + spacy/vocab.pyx | 134 +- 45 files changed, 161 insertions(+), 6182 deletions(-) delete mode 100644 spacy/lang/da/norm_exceptions.py delete mode 100644 spacy/lang/de/norm_exceptions.py delete mode 100644 spacy/lang/el/norm_exceptions.py delete mode 100644 spacy/lang/en/norm_exceptions.py delete mode 100644 spacy/lang/id/norm_exceptions.py delete mode 100644 spacy/lang/lb/norm_exceptions.py delete mode 100644 spacy/lang/pt/norm_exceptions.py delete mode 100644 spacy/lang/ru/norm_exceptions.py delete mode 100644 spacy/lang/sr/norm_exceptions.py delete mode 100644 spacy/lang/ta/norm_exceptions.py delete mode 100644 spacy/lang/th/norm_exceptions.py diff --git a/spacy/attrs.pxd b/spacy/attrs.pxd index 8f583b3a3..805dc2950 100644 --- a/spacy/attrs.pxd +++ b/spacy/attrs.pxd @@ -15,7 +15,7 @@ cdef enum attr_id_t: LIKE_NUM LIKE_EMAIL IS_STOP - IS_OOV + IS_OOV_DEPRECATED IS_BRACKET IS_QUOTE IS_LEFT_PUNCT diff --git a/spacy/attrs.pyx b/spacy/attrs.pyx index 2187f3c65..fe9895d06 100644 --- a/spacy/attrs.pyx +++ b/spacy/attrs.pyx @@ -16,7 +16,7 @@ IDS = { "LIKE_NUM": LIKE_NUM, "LIKE_EMAIL": LIKE_EMAIL, "IS_STOP": IS_STOP, - "IS_OOV": IS_OOV, + "IS_OOV_DEPRECATED": IS_OOV_DEPRECATED, "IS_BRACKET": IS_BRACKET, "IS_QUOTE": IS_QUOTE, "IS_LEFT_PUNCT": IS_LEFT_PUNCT, diff --git a/spacy/cli/init_model.py b/spacy/cli/init_model.py index 618266633..3311a5120 100644 --- a/spacy/cli/init_model.py +++ b/spacy/cli/init_model.py @@ -157,15 +157,11 @@ def create_model(lang, lex_attrs, name=None): nlp = lang_class() for lexeme in nlp.vocab: lexeme.rank = OOV_RANK - lex_added = 0 for attrs in lex_attrs: if "settings" in attrs: continue lexeme = nlp.vocab[attrs["orth"]] lexeme.set_attrs(**attrs) - lexeme.is_oov = False - lex_added += 1 - lex_added += 1 if len(nlp.vocab): oov_prob = min(lex.prob for lex in nlp.vocab) - 1 else: @@ -193,8 +189,7 @@ 
def add_vectors(nlp, vectors_loc, truncate_vectors, prune_vectors, name=None): if vector_keys is not None: for word in vector_keys: if word not in nlp.vocab: - lexeme = nlp.vocab[word] - lexeme.is_oov = False + nlp.vocab[word] if vectors_data is not None: nlp.vocab.vectors = Vectors(data=vectors_data, keys=vector_keys) if name is None: diff --git a/spacy/cli/train.py b/spacy/cli/train.py index 6e6423131..7cb2d9745 100644 --- a/spacy/cli/train.py +++ b/spacy/cli/train.py @@ -15,7 +15,6 @@ import random from .._ml import create_default_optimizer from ..util import use_gpu as set_gpu -from ..attrs import PROB, IS_OOV, CLUSTER, LANG from ..gold import GoldCorpus from ..compat import path2str from .. import util @@ -630,15 +629,6 @@ def _create_progress_bar(total): def _load_vectors(nlp, vectors): util.load_model(vectors, vocab=nlp.vocab) - for lex in nlp.vocab: - values = {} - for attr, func in nlp.vocab.lex_attr_getters.items(): - # These attrs are expected to be set by data. Others should - # be set by calling the language functions. 
- if attr not in (CLUSTER, PROB, IS_OOV, LANG): - values[lex.vocab.strings[attr]] = func(lex.orth_) - lex.set_attrs(**values) - lex.is_oov = False def _load_pretrained_tok2vec(nlp, loc): diff --git a/spacy/lang/da/__init__.py b/spacy/lang/da/__init__.py index ac8c04954..92eec44b2 100644 --- a/spacy/lang/da/__init__.py +++ b/spacy/lang/da/__init__.py @@ -2,7 +2,6 @@ from __future__ import unicode_literals from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS -from .norm_exceptions import NORM_EXCEPTIONS from .punctuation import TOKENIZER_INFIXES, TOKENIZER_SUFFIXES from .stop_words import STOP_WORDS from .lex_attrs import LEX_ATTRS @@ -12,17 +11,14 @@ from ..tag_map import TAG_MAP from ..tokenizer_exceptions import BASE_EXCEPTIONS from ..norm_exceptions import BASE_NORMS from ...language import Language -from ...attrs import LANG, NORM -from ...util import update_exc, add_lookups +from ...attrs import LANG +from ...util import update_exc class DanishDefaults(Language.Defaults): lex_attr_getters = dict(Language.Defaults.lex_attr_getters) lex_attr_getters.update(LEX_ATTRS) lex_attr_getters[LANG] = lambda text: "da" - lex_attr_getters[NORM] = add_lookups( - Language.Defaults.lex_attr_getters[NORM], BASE_NORMS, NORM_EXCEPTIONS - ) tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS) morph_rules = MORPH_RULES infixes = TOKENIZER_INFIXES diff --git a/spacy/lang/da/norm_exceptions.py b/spacy/lang/da/norm_exceptions.py deleted file mode 100644 index dbffdb88b..000000000 --- a/spacy/lang/da/norm_exceptions.py +++ /dev/null @@ -1,527 +0,0 @@ -# coding: utf8 -""" -Special-case rules for normalizing tokens to improve the model's predictions. -For example 'mysterium' vs 'mysterie' and similar. 
-""" -from __future__ import unicode_literals - - -# Sources: -# 1: https://dsn.dk/retskrivning/om-retskrivningsordbogen/mere-om-retskrivningsordbogen-2012/endrede-stave-og-ordformer/ -# 2: http://www.tjerry-korrektur.dk/ord-med-flere-stavemaader/ - -_exc = { - # Alternative spelling - "a-kraft-værk": "a-kraftværk", # 1 - "ålborg": "aalborg", # 2 - "århus": "aarhus", - "accessoirer": "accessoires", # 1 - "affektert": "affekteret", # 1 - "afrikander": "afrikaaner", # 1 - "aftabuere": "aftabuisere", # 1 - "aftabuering": "aftabuisering", # 1 - "akvarium": "akvarie", # 1 - "alenefader": "alenefar", # 1 - "alenemoder": "alenemor", # 1 - "alkoholambulatorium": "alkoholambulatorie", # 1 - "ambulatorium": "ambulatorie", # 1 - "ananassene": "ananasserne", # 2 - "anførelsestegn": "anførselstegn", # 1 - "anseelig": "anselig", # 2 - "antioxydant": "antioxidant", # 1 - "artrig": "artsrig", # 1 - "auditorium": "auditorie", # 1 - "avocado": "avokado", # 2 - "bagerst": "bagest", # 2 - "bagstræv": "bagstræb", # 1 - "bagstræver": "bagstræber", # 1 - "bagstræverisk": "bagstræberisk", # 1 - "balde": "balle", # 2 - "barselorlov": "barselsorlov", # 1 - "barselvikar": "barselsvikar", # 1 - "baskien": "baskerlandet", # 1 - "bayrisk": "bayersk", # 1 - "bedstefader": "bedstefar", # 1 - "bedstemoder": "bedstemor", # 1 - "behefte": "behæfte", # 1 - "beheftelse": "behæftelse", # 1 - "bidragydende": "bidragsydende", # 1 - "bidragyder": "bidragsyder", # 1 - "billiondel": "billiontedel", # 1 - "blaseret": "blasert", # 1 - "bleskifte": "bleskift", # 1 - "blodbroder": "blodsbroder", # 2 - "blyantspidser": "blyantsspidser", # 2 - "boligministerium": "boligministerie", # 1 - "borhul": "borehul", # 1 - "broder": "bror", # 2 - "buldog": "bulldog", # 2 - "bådhus": "bådehus", # 1 - "børnepleje": "barnepleje", # 1 - "børneseng": "barneseng", # 1 - "børnestol": "barnestol", # 1 - "cairo": "kairo", # 1 - "cambodia": "cambodja", # 1 - "cambodianer": "cambodjaner", # 1 - "cambodiansk": "cambodjansk", # 1 - 
"camouflage": "kamuflage", # 2 - "campylobacter": "kampylobakter", # 1 - "centeret": "centret", # 2 - "chefskahyt": "chefkahyt", # 1 - "chefspost": "chefpost", # 1 - "chefssekretær": "chefsekretær", # 1 - "chefsstol": "chefstol", # 1 - "cirkulærskrivelse": "cirkulæreskrivelse", # 1 - "cognacsglas": "cognacglas", # 1 - "columnist": "kolumnist", # 1 - "cricket": "kricket", # 2 - "dagplejemoder": "dagplejemor", # 1 - "damaskesdug": "damaskdug", # 1 - "damp-barn": "dampbarn", # 1 - "delfinarium": "delfinarie", # 1 - "dentallaboratorium": "dentallaboratorie", # 1 - "diaramme": "diasramme", # 1 - "diaré": "diarré", # 1 - "dioxyd": "dioxid", # 1 - "dommedagsprædiken": "dommedagspræken", # 1 - "donut": "doughnut", # 2 - "driftmæssig": "driftsmæssig", # 1 - "driftsikker": "driftssikker", # 1 - "driftsikring": "driftssikring", # 1 - "drikkejogurt": "drikkeyoghurt", # 1 - "drivein": "drive-in", # 1 - "driveinbiograf": "drive-in-biograf", # 1 - "drøvel": "drøbel", # 1 - "dødskriterium": "dødskriterie", # 1 - "e-mail-adresse": "e-mailadresse", # 1 - "e-post-adresse": "e-postadresse", # 1 - "egypten": "ægypten", # 2 - "ekskommunicere": "ekskommunikere", # 1 - "eksperimentarium": "eksperimentarie", # 1 - "elsass": "Alsace", # 1 - "elsasser": "alsacer", # 1 - "elsassisk": "alsacisk", # 1 - "elvetal": "ellevetal", # 1 - "elvetiden": "ellevetiden", # 1 - "elveårig": "elleveårig", # 1 - "elveårs": "elleveårs", # 1 - "elveårsbarn": "elleveårsbarn", # 1 - "elvte": "ellevte", # 1 - "elvtedel": "ellevtedel", # 1 - "energiministerium": "energiministerie", # 1 - "erhvervsministerium": "erhvervsministerie", # 1 - "espaliere": "spaliere", # 2 - "evangelium": "evangelie", # 1 - "fagministerium": "fagministerie", # 1 - "fakse": "faxe", # 1 - "fangstkvota": "fangstkvote", # 1 - "fader": "far", # 2 - "farbroder": "farbror", # 1 - "farfader": "farfar", # 1 - "farmoder": "farmor", # 1 - "federal": "føderal", # 1 - "federalisering": "føderalisering", # 1 - "federalisme": "føderalisme", # 1 - 
"federalist": "føderalist", # 1 - "federalistisk": "føderalistisk", # 1 - "federation": "føderation", # 1 - "federativ": "føderativ", # 1 - "fejlbeheftet": "fejlbehæftet", # 1 - "femetagers": "femetages", # 2 - "femhundredekroneseddel": "femhundredkroneseddel", # 2 - "filmpremiere": "filmpræmiere", # 2 - "finansimperium": "finansimperie", # 1 - "finansministerium": "finansministerie", # 1 - "firehjulstræk": "firhjulstræk", # 2 - "fjernstudium": "fjernstudie", # 1 - "formalier": "formalia", # 1 - "formandsskift": "formandsskifte", # 1 - "fornemst": "fornemmest", # 2 - "fornuftparti": "fornuftsparti", # 1 - "fornuftstridig": "fornuftsstridig", # 1 - "fornuftvæsen": "fornuftsvæsen", # 1 - "fornuftægteskab": "fornuftsægteskab", # 1 - "forretningsministerium": "forretningsministerie", # 1 - "forskningsministerium": "forskningsministerie", # 1 - "forstudium": "forstudie", # 1 - "forsvarsministerium": "forsvarsministerie", # 1 - "frilægge": "fritlægge", # 1 - "frilæggelse": "fritlæggelse", # 1 - "frilægning": "fritlægning", # 1 - "fristille": "fritstille", # 1 - "fristilling": "fritstilling", # 1 - "fuldttegnet": "fuldtegnet", # 1 - "fødestedskriterium": "fødestedskriterie", # 1 - "fødevareministerium": "fødevareministerie", # 1 - "følesløs": "følelsesløs", # 1 - "følgeligt": "følgelig", # 1 - "førne": "førn", # 1 - "gearskift": "gearskifte", # 2 - "gladeligt": "gladelig", # 1 - "glosehefte": "glosehæfte", # 1 - "glædeløs": "glædesløs", # 1 - "gonoré": "gonorré", # 1 - "grangiveligt": "grangivelig", # 1 - "grundliggende": "grundlæggende", # 2 - "grønsag": "grøntsag", # 2 - "gudbenådet": "gudsbenådet", # 1 - "gudfader": "gudfar", # 1 - "gudmoder": "gudmor", # 1 - "gulvmop": "gulvmoppe", # 1 - "gymnasium": "gymnasie", # 1 - "hackning": "hacking", # 1 - "halvbroder": "halvbror", # 1 - "halvelvetiden": "halvellevetiden", # 1 - "handelsgymnasium": "handelsgymnasie", # 1 - "hefte": "hæfte", # 1 - "hefteklamme": "hæfteklamme", # 1 - "heftelse": "hæftelse", # 1 - "heftemaskine": 
"hæftemaskine", # 1 - "heftepistol": "hæftepistol", # 1 - "hefteplaster": "hæfteplaster", # 1 - "heftestraf": "hæftestraf", # 1 - "heftning": "hæftning", # 1 - "helbroder": "helbror", # 1 - "hjemmeklasse": "hjemklasse", # 1 - "hjulspin": "hjulspind", # 1 - "huggevåben": "hugvåben", # 1 - "hulmurisolering": "hulmursisolering", # 1 - "hurtiggående": "hurtigtgående", # 2 - "hurtigttørrende": "hurtigtørrende", # 2 - "husmoder": "husmor", # 1 - "hydroxyd": "hydroxid", # 1 - "håndmikser": "håndmixer", # 1 - "højtaler": "højttaler", # 2 - "hønemoder": "hønemor", # 1 - "ide": "idé", # 2 - "imperium": "imperie", # 1 - "imponerthed": "imponerethed", # 1 - "inbox": "indboks", # 2 - "indenrigsministerium": "indenrigsministerie", # 1 - "indhefte": "indhæfte", # 1 - "indheftning": "indhæftning", # 1 - "indicium": "indicie", # 1 - "indkassere": "inkassere", # 2 - "iota": "jota", # 1 - "jobskift": "jobskifte", # 1 - "jogurt": "yoghurt", # 1 - "jukeboks": "jukebox", # 1 - "justitsministerium": "justitsministerie", # 1 - "kalorifere": "kalorifer", # 1 - "kandidatstipendium": "kandidatstipendie", # 1 - "kannevas": "kanvas", # 1 - "kaperssauce": "kaperssovs", # 1 - "kigge": "kikke", # 2 - "kirkeministerium": "kirkeministerie", # 1 - "klapmydse": "klapmyds", # 1 - "klimakterium": "klimakterie", # 1 - "klogeligt": "klogelig", # 1 - "knivblad": "knivsblad", # 1 - "kollegaer": "kolleger", # 2 - "kollegium": "kollegie", # 1 - "kollegiehefte": "kollegiehæfte", # 1 - "kollokviumx": "kollokvium", # 1 - "kommissorium": "kommissorie", # 1 - "kompendium": "kompendie", # 1 - "komplicerthed": "komplicerethed", # 1 - "konfederation": "konføderation", # 1 - "konfedereret": "konfødereret", # 1 - "konferensstudium": "konferensstudie", # 1 - "konservatorium": "konservatorie", # 1 - "konsulere": "konsultere", # 1 - "kradsbørstig": "krasbørstig", # 2 - "kravsspecifikation": "kravspecifikation", # 1 - "krematorium": "krematorie", # 1 - "krep": "crepe", # 1 - "krepnylon": "crepenylon", # 1 - "kreppapir": 
"crepepapir", # 1 - "kricket": "cricket", # 2 - "kriterium": "kriterie", # 1 - "kroat": "kroater", # 2 - "kroki": "croquis", # 1 - "kronprinsepar": "kronprinspar", # 2 - "kropdoven": "kropsdoven", # 1 - "kroplus": "kropslus", # 1 - "krøllefedt": "krølfedt", # 1 - "kulturministerium": "kulturministerie", # 1 - "kuponhefte": "kuponhæfte", # 1 - "kvota": "kvote", # 1 - "kvotaordning": "kvoteordning", # 1 - "laboratorium": "laboratorie", # 1 - "laksfarve": "laksefarve", # 1 - "laksfarvet": "laksefarvet", # 1 - "laksrød": "lakserød", # 1 - "laksyngel": "lakseyngel", # 1 - "laksørred": "lakseørred", # 1 - "landbrugsministerium": "landbrugsministerie", # 1 - "landskampstemning": "landskampsstemning", # 1 - "langust": "languster", # 1 - "lappegrejer": "lappegrej", # 1 - "lavløn": "lavtløn", # 1 - "lillebroder": "lillebror", # 1 - "linear": "lineær", # 1 - "loftlampe": "loftslampe", # 2 - "log-in": "login", # 1 - "login": "log-in", # 2 - "lovmedholdig": "lovmedholdelig", # 1 - "ludder": "luder", # 2 - "lysholder": "lyseholder", # 1 - "lægeskifte": "lægeskift", # 1 - "lærvillig": "lærevillig", # 1 - "løgsauce": "løgsovs", # 1 - "madmoder": "madmor", # 1 - "majonæse": "mayonnaise", # 1 - "mareridtagtig": "mareridtsagtig", # 1 - "margen": "margin", # 2 - "martyrium": "martyrie", # 1 - "mellemstatlig": "mellemstatslig", # 1 - "menneskene": "menneskerne", # 2 - "metropolis": "metropol", # 1 - "miks": "mix", # 1 - "mikse": "mixe", # 1 - "miksepult": "mixerpult", # 1 - "mikser": "mixer", # 1 - "mikserpult": "mixerpult", # 1 - "mikslån": "mixlån", # 1 - "miksning": "mixning", # 1 - "miljøministerium": "miljøministerie", # 1 - "milliarddel": "milliardtedel", # 1 - "milliondel": "milliontedel", # 1 - "ministerium": "ministerie", # 1 - "mop": "moppe", # 1 - "moder": "mor", # 2 - "moratorium": "moratorie", # 1 - "morbroder": "morbror", # 1 - "morfader": "morfar", # 1 - "mormoder": "mormor", # 1 - "musikkonservatorium": "musikkonservatorie", # 1 - "muslingskal": "muslingeskal", # 1 - 
"mysterium": "mysterie", # 1 - "naturalieydelse": "naturalydelse", # 1 - "naturalieøkonomi": "naturaløkonomi", # 1 - "navnebroder": "navnebror", # 1 - "nerium": "nerie", # 1 - "nådeløs": "nådesløs", # 1 - "nærforestående": "nærtforestående", # 1 - "nærstående": "nærtstående", # 1 - "observatorium": "observatorie", # 1 - "oldefader": "oldefar", # 1 - "oldemoder": "oldemor", # 1 - "opgraduere": "opgradere", # 1 - "opgraduering": "opgradering", # 1 - "oratorium": "oratorie", # 1 - "overbookning": "overbooking", # 1 - "overpræsidium": "overpræsidie", # 1 - "overstatlig": "overstatslig", # 1 - "oxyd": "oxid", # 1 - "oxydere": "oxidere", # 1 - "oxydering": "oxidering", # 1 - "pakkenellike": "pakkenelliker", # 1 - "papirtynd": "papirstynd", # 1 - "pastoralseminarium": "pastoralseminarie", # 1 - "peanutsene": "peanuttene", # 2 - "penalhus": "pennalhus", # 2 - "pensakrav": "pensumkrav", # 1 - "pepperoni": "peperoni", # 1 - "peruaner": "peruvianer", # 1 - "petrole": "petrol", # 1 - "piltast": "piletast", # 1 - "piltaste": "piletast", # 1 - "planetarium": "planetarie", # 1 - "plasteret": "plastret", # 2 - "plastic": "plastik", # 2 - "play-off-kamp": "playoffkamp", # 1 - "plejefader": "plejefar", # 1 - "plejemoder": "plejemor", # 1 - "podium": "podie", # 2 - "praha": "prag", # 2 - "preciøs": "pretiøs", # 2 - "privilegium": "privilegie", # 1 - "progredere": "progrediere", # 1 - "præsidium": "præsidie", # 1 - "psykodelisk": "psykedelisk", # 1 - "pudsegrejer": "pudsegrej", # 1 - "referensgruppe": "referencegruppe", # 1 - "referensramme": "referenceramme", # 1 - "refugium": "refugie", # 1 - "registeret": "registret", # 2 - "remedium": "remedie", # 1 - "remiks": "remix", # 1 - "reservert": "reserveret", # 1 - "ressortministerium": "ressortministerie", # 1 - "ressource": "resurse", # 2 - "resætte": "resette", # 1 - "rettelig": "retteligt", # 1 - "rettetaste": "rettetast", # 1 - "returtaste": "returtast", # 1 - "risici": "risikoer", # 2 - "roll-on": "rollon", # 1 - "rollehefte": 
"rollehæfte", # 1 - "rostbøf": "roastbeef", # 1 - "rygsæksturist": "rygsækturist", # 1 - "rødstjært": "rødstjert", # 1 - "saddel": "sadel", # 2 - "samaritan": "samaritaner", # 2 - "sanatorium": "sanatorie", # 1 - "sauce": "sovs", # 1 - "scanning": "skanning", # 2 - "sceneskifte": "sceneskift", # 1 - "scilla": "skilla", # 1 - "sejflydende": "sejtflydende", # 1 - "selvstudium": "selvstudie", # 1 - "seminarium": "seminarie", # 1 - "sennepssauce": "sennepssovs ", # 1 - "servitutbeheftet": "servitutbehæftet", # 1 - "sit-in": "sitin", # 1 - "skatteministerium": "skatteministerie", # 1 - "skifer": "skiffer", # 2 - "skyldsfølelse": "skyldfølelse", # 1 - "skysauce": "skysovs", # 1 - "sladdertaske": "sladretaske", # 2 - "sladdervorn": "sladrevorn", # 2 - "slagsbroder": "slagsbror", # 1 - "slettetaste": "slettetast", # 1 - "smørsauce": "smørsovs", # 1 - "snitsel": "schnitzel", # 1 - "snobbeeffekt": "snobeffekt", # 2 - "socialministerium": "socialministerie", # 1 - "solarium": "solarie", # 1 - "soldebroder": "soldebror", # 1 - "spagetti": "spaghetti", # 1 - "spagettistrop": "spaghettistrop", # 1 - "spagettiwestern": "spaghettiwestern", # 1 - "spin-off": "spinoff", # 1 - "spinnefiskeri": "spindefiskeri", # 1 - "spolorm": "spoleorm", # 1 - "sproglaboratorium": "sproglaboratorie", # 1 - "spækbræt": "spækkebræt", # 2 - "stand-in": "standin", # 1 - "stand-up-comedy": "standupcomedy", # 1 - "stand-up-komiker": "standupkomiker", # 1 - "statsministerium": "statsministerie", # 1 - "stedbroder": "stedbror", # 1 - "stedfader": "stedfar", # 1 - "stedmoder": "stedmor", # 1 - "stilehefte": "stilehæfte", # 1 - "stipendium": "stipendie", # 1 - "stjært": "stjert", # 1 - "stjærthage": "stjerthage", # 1 - "storebroder": "storebror", # 1 - "stortå": "storetå", # 1 - "strabads": "strabadser", # 1 - "strømlinjet": "strømlinet", # 1 - "studium": "studie", # 1 - "stænkelap": "stænklap", # 1 - "sundhedsministerium": "sundhedsministerie", # 1 - "suppositorium": "suppositorie", # 1 - "svejts": 
"schweiz", # 1 - "svejtser": "schweizer", # 1 - "svejtserfranc": "schweizerfranc", # 1 - "svejtserost": "schweizerost", # 1 - "svejtsisk": "schweizisk", # 1 - "svigerfader": "svigerfar", # 1 - "svigermoder": "svigermor", # 1 - "svirebroder": "svirebror", # 1 - "symposium": "symposie", # 1 - "sælarium": "sælarie", # 1 - "søreme": "sørme", # 2 - "søterritorium": "søterritorie", # 1 - "t-bone-steak": "t-bonesteak", # 1 - "tabgivende": "tabsgivende", # 1 - "tabuere": "tabuisere", # 1 - "tabuering": "tabuisering", # 1 - "tackle": "takle", # 2 - "tackling": "takling", # 2 - "taifun": "tyfon", # 1 - "take-off": "takeoff", # 1 - "taknemlig": "taknemmelig", # 2 - "talehørelærer": "tale-høre-lærer", # 1 - "talehøreundervisning": "tale-høre-undervisning", # 1 - "tandstik": "tandstikker", # 1 - "tao": "dao", # 1 - "taoisme": "daoisme", # 1 - "taoist": "daoist", # 1 - "taoistisk": "daoistisk", # 1 - "taverne": "taverna", # 1 - "teateret": "teatret", # 2 - "tekno": "techno", # 1 - "temposkifte": "temposkift", # 1 - "terrarium": "terrarie", # 1 - "territorium": "territorie", # 1 - "tesis": "tese", # 1 - "tidsstudium": "tidsstudie", # 1 - "tipoldefader": "tipoldefar", # 1 - "tipoldemoder": "tipoldemor", # 1 - "tomatsauce": "tomatsovs", # 1 - "tonart": "toneart", # 1 - "trafikministerium": "trafikministerie", # 1 - "tredve": "tredive", # 1 - "tredver": "trediver", # 1 - "tredveårig": "trediveårig", # 1 - "tredveårs": "trediveårs", # 1 - "tredveårsfødselsdag": "trediveårsfødselsdag", # 1 - "tredvte": "tredivte", # 1 - "tredvtedel": "tredivtedel", # 1 - "troldunge": "troldeunge", # 1 - "trommestikke": "trommestik", # 1 - "trubadur": "troubadour", # 2 - "trøstepræmie": "trøstpræmie", # 2 - "tummerum": "trummerum", # 1 - "tumultuarisk": "tumultarisk", # 1 - "tunghørighed": "tunghørhed", # 1 - "tus": "tusch", # 2 - "tusind": "tusinde", # 2 - "tvillingbroder": "tvillingebror", # 1 - "tvillingbror": "tvillingebror", # 1 - "tvillingebroder": "tvillingebror", # 1 - "ubeheftet": "ubehæftet", 
# 1 - "udenrigsministerium": "udenrigsministerie", # 1 - "udhulning": "udhuling", # 1 - "udslaggivende": "udslagsgivende", # 1 - "udspekulert": "udspekuleret", # 1 - "udviklingsministerium": "udviklingsministerie", # 1 - "uforpligtigende": "uforpligtende", # 1 - "uheldvarslende": "uheldsvarslende", # 1 - "uimponerthed": "uimponerethed", # 1 - "undervisningsministerium": "undervisningsministerie", # 1 - "unægtelig": "unægteligt", # 1 - "urinale": "urinal", # 1 - "uvederheftig": "uvederhæftig", # 1 - "vabel": "vable", # 2 - "vadi": "wadi", # 1 - "vaklevorn": "vakkelvorn", # 1 - "vanadin": "vanadium", # 1 - "vaselin": "vaseline", # 1 - "vederheftig": "vederhæftig", # 1 - "vedhefte": "vedhæfte", # 1 - "velar": "velær", # 1 - "videndeling": "vidensdeling", # 2 - "vinkelanførelsestegn": "vinkelanførselstegn", # 1 - "vipstjært": "vipstjert", # 1 - "vismut": "bismut", # 1 - "visvas": "vissevasse", # 1 - "voksværk": "vokseværk", # 1 - "værtdyr": "værtsdyr", # 1 - "værtplante": "værtsplante", # 1 - "wienersnitsel": "wienerschnitzel", # 1 - "yderliggående": "yderligtgående", # 2 - "zombi": "zombie", # 1 - "ægbakke": "æggebakke", # 1 - "ægformet": "æggeformet", # 1 - "ægleder": "æggeleder", # 1 - "ækvilibrist": "ekvilibrist", # 2 - "æselsøre": "æseløre", # 1 - "øjehule": "øjenhule", # 1 - "øjelåg": "øjenlåg", # 1 - "øjeåbner": "øjenåbner", # 1 - "økonomiministerium": "økonomiministerie", # 1 - "ørenring": "ørering", # 2 - "øvehefte": "øvehæfte", # 1 -} - - -NORM_EXCEPTIONS = {} - -for string, norm in _exc.items(): - NORM_EXCEPTIONS[string] = norm - NORM_EXCEPTIONS[string.title()] = norm diff --git a/spacy/lang/de/__init__.py b/spacy/lang/de/__init__.py index dee1841c8..ca01428ba 100644 --- a/spacy/lang/de/__init__.py +++ b/spacy/lang/de/__init__.py @@ -2,7 +2,6 @@ from __future__ import unicode_literals from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS -from .norm_exceptions import NORM_EXCEPTIONS from .punctuation import TOKENIZER_PREFIXES, TOKENIZER_SUFFIXES from 
.punctuation import TOKENIZER_INFIXES from .tag_map import TAG_MAP @@ -10,18 +9,14 @@ from .stop_words import STOP_WORDS from .syntax_iterators import SYNTAX_ITERATORS from ..tokenizer_exceptions import BASE_EXCEPTIONS -from ..norm_exceptions import BASE_NORMS from ...language import Language -from ...attrs import LANG, NORM -from ...util import update_exc, add_lookups +from ...attrs import LANG +from ...util import update_exc class GermanDefaults(Language.Defaults): lex_attr_getters = dict(Language.Defaults.lex_attr_getters) lex_attr_getters[LANG] = lambda text: "de" - lex_attr_getters[NORM] = add_lookups( - Language.Defaults.lex_attr_getters[NORM], NORM_EXCEPTIONS, BASE_NORMS - ) tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS) prefixes = TOKENIZER_PREFIXES suffixes = TOKENIZER_SUFFIXES diff --git a/spacy/lang/de/norm_exceptions.py b/spacy/lang/de/norm_exceptions.py deleted file mode 100644 index 3dbd4c7e3..000000000 --- a/spacy/lang/de/norm_exceptions.py +++ /dev/null @@ -1,16 +0,0 @@ -# coding: utf8 -from __future__ import unicode_literals - -# Here we only want to include the absolute most common words. Otherwise, -# this list would get impossibly long for German – especially considering the -# old vs. new spelling rules, and all possible cases. 
- - -_exc = {"daß": "dass"} - - -NORM_EXCEPTIONS = {} - -for string, norm in _exc.items(): - NORM_EXCEPTIONS[string] = norm - NORM_EXCEPTIONS[string.title()] = norm diff --git a/spacy/lang/el/__init__.py b/spacy/lang/el/__init__.py index 6d551cc4e..d03a42da9 100644 --- a/spacy/lang/el/__init__.py +++ b/spacy/lang/el/__init__.py @@ -10,21 +10,16 @@ from .lemmatizer import GreekLemmatizer from .syntax_iterators import SYNTAX_ITERATORS from .punctuation import TOKENIZER_PREFIXES, TOKENIZER_SUFFIXES, TOKENIZER_INFIXES from ..tokenizer_exceptions import BASE_EXCEPTIONS -from .norm_exceptions import NORM_EXCEPTIONS -from ..norm_exceptions import BASE_NORMS from ...language import Language from ...lookups import Lookups -from ...attrs import LANG, NORM -from ...util import update_exc, add_lookups +from ...attrs import LANG +from ...util import update_exc class GreekDefaults(Language.Defaults): lex_attr_getters = dict(Language.Defaults.lex_attr_getters) lex_attr_getters.update(LEX_ATTRS) lex_attr_getters[LANG] = lambda text: "el" - lex_attr_getters[NORM] = add_lookups( - Language.Defaults.lex_attr_getters[NORM], BASE_NORMS, NORM_EXCEPTIONS - ) tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS) stop_words = STOP_WORDS tag_map = TAG_MAP diff --git a/spacy/lang/el/norm_exceptions.py b/spacy/lang/el/norm_exceptions.py deleted file mode 100644 index d4384ff3c..000000000 --- a/spacy/lang/el/norm_exceptions.py +++ /dev/null @@ -1,2642 +0,0 @@ -# coding: utf8 -from __future__ import unicode_literals - - -# These exceptions are used to add NORM values based on a token's ORTH value. -# Norms are only set if no alternative is provided in the tokenizer exceptions. 
- -_exc = { - "αγιορίτης": "αγιορείτης", - "αγόρι": "αγώρι", - "έωλος": "αίολος", - "αλλοίθωρος": "αλλήθωρος", - "αλλοιώς": "αλλιώς", - "αλλοιώτικος": "αλλκότικος", - "αναµιγνύω": "αναµειγνύω", - "ανάµιξη": "ανάµειξη", - "ανανδρεία": "ανανδρία", - "αναφιλυτό": "αναφιλητό", - "ανελλειπώς": "ανελλιπώς", - "ανεξιθρησκεία": "ανεξιθρησκία", - "αντικρυνός": "αντικρινός", - "απάγκιο": "απάγκεω", - "αρµατωλός": "αρµατολός", - "αρρώστεια": "αρρώστια", - "ατόφιος": "ατόφυος", - "αφίνω": "αφήνω", - "χιβάδα": "χηβάδα", - "αχρηστεία": "αχρηστία", - "βαρυγκωµώ": "βαρυγγωµώ", - "βεβαρυµένος": "βεβαρηµένος", - "βερύκοκκο": "βερίκοκο", - "βλήτο": "βλίτο", - "βογκώ": "βογγώ", - "βραδυά": "βραδιά", - "βραδυάζει": "βραδίάζει", - "Βρεταννία": "Βρετανία", - "Βρεττανία": "Βρετανία", - "βολοδέρνω": "βωλοδέρνω", - "γέλοιο": "γέλιο", - "γκάµα": "γκάµµα", - "γλύφω": "γλείφω", - "γλήνα": "γλίνα", - "διαφήµηση": "διαφήµιση", - "δικλείδα": "δικλίδα", - "διοξείδιο": "διοξίδιο", - "διορία": "διωρία", - "δυόροφος": "διώροφος", - "δυόµισυ": "δυόµισι", - "διόσµος": "δυόσμος", - "δυσφήμιση": "δυσφήµηση", - "δοσίλογος": "δωσίλογος", - "εγχείριση": "εγχείρηση", - "ειδωλολατρεία": "ειδωλολατρία", - "εληά": "ελιά", - "ελιξίριο": "ελιξήριο", - "έλκυθρο": "έλκηθρο", - "ελλειπής": "ελλίπής", - "ενάµισυς": "ενάµισης", - "ενάµισυ": "ενάµισι", - "ενανθρώπιση": "ενανθρώπηση", - "έξη": "έξι", - "επί τούτο": "επί τούτω", - "εταιρία": "εταιρεία", - "εφορεία": "εφορία", - "ζηλειάρης": "ζηλιάρης", - "Θεοφάνεια": "Θεοφάνια", - "καυγάς": "καβγάς", - "καθίκι": "καθοίκι", - "καινούριος": "καινούργιος", - "κακάβι": "κακκάβι", - "κακαβιά": "κακκαβιά", - "καµµία": "καµία", - "κανέλα": "Καννέλα", - "κανονιοφόρος": "κανονιοφόρος", - "καντίλι": "καντήλι", - "κατεβοδώνω": "κατευοδώνω", - "κοίτοµαι": "κείτοµαι", - "κελαϊδώ": "κελαηδώ", - "κυάλια": "κιάλια", - "κλύδωνας": "κλήδονας", - "κλωτσώ": "κλοτσώ", - "κολλιτσίδα": "κολλητσίδα", - "κουκί": "κουκκί", - "κουλός": "κουλλός", - "κρεββάτι": "κρεβάτι", - "κροκόδειλος": 
"κροκόδιλος", - "κοβιός": "κωβιός", - "λάκισα": "λάκησα", - "λιµέρι": "ληµέρι", - "λώξυγγας": "λόξυγγας", - "µαγγούρα": "µαγκούρα", - "µαζή": "μαζί", - "µακρυά": "µακριά", - "µαµή": "µαµµή", - "µαµόθρεφτος": "µαµµόθρεφτος", - "µίγµα": "µείγµα", - "µίξη": "µείξη", - "µετώπη": "µετόπη", - "µυρολόι": "µοιρολόι", - "µοτοσικλέτα": "µοτοσυκλέτα", - "µπαλωµατής": "µπαλλωµατής", - "µιζίθρα": "µυζήθρα", - "νεοτερίζω": "νεωτερίζω", - "νεοτερισµός": "νεωτερισμός", - "νεοτεριστής": "νεωτεριστής", - "νινί": "νηνί", - "νοιώθω": "νιώθω", - "νονός": "νοννός", - "ξενιτιά": "ξενιτειά", - "ξαίρω": "ξέρω", - "ξίγκι": "ξίγγι", - "ξείδι": "ξίδι", - "ξώβεργα": "ξόβεργα", - "ξιπάζω": "ξυπάζω", - "ξιπασµένος": "ξυπασµένος", - "ξυπόλητος": "ξυπόλυτος", - "ξωκλήσι": "ξωκκλήσι", - "οξυά": "οξιά", - "ορθοπεδικός": "ορθοπαιδικός", - "ωχ": "οχ", - "παπάς": "παππάς", - "παραγιός": "παραγυιός", - "περηφάνεια": "περηφάνια", - "πιλάλα": "πηλάλα", - "πίννα": "πίνα", - "πηρούνι": "πιρούνι", - "πιτσιλώ": "πιτσυλώ", - "πιτσιλίζω": "πιτσυλίζω", - "πλατυάζω": "πλατειάζω", - "πληµµυρίδα": "πληµυρίδα", - "πληγούρι": "πλιγούρι", - "πωπώ": "ποπό", - "πουγγί": "πουγκί", - "πρίγκηπας": "πρίγκιπας", - "προάστειο": "προάστιο", - "προεδρεία": "προεδρία", - "πρίµα": "πράµα", - "πρωτήτερα": "πρωτύτερα", - "προτύτερα": "πρωτύτερα", - "πόρωση": "πώρωση", - "ρεβύθι": "ρεβίθι", - "ρέγγα": "ρέΥκα", - "ρηγώνω": "ριγώνω", - "ρωµανικός": "ροµανικός", - "ρίζι": "ρύζι", - "Ρώσσος": "Ρώσος", - "σακκούλα": "σακούλα", - "συνάφι": "σινάφι", - "σειρίτι": "σιρίτι", - "σιφόνι": "σιφώνι", - "συχαίνοµαι": "σιχαίνοµαι", - "σκιρόδεµα": "σκυρόδεµα", - "σπάγγος": "σπάγκος", - "στυλιάρι": "στειλιάρι", - "στοιβάδα": "στιβάδα", - "στίβα": "στοίβα", - "στριµώνω": "στρυµώνω", - "στριμώχνω": "στρυμώχνω", - "συγχύζω": "συγχίζω", - "σηκώτι": "συκώτι", - "σιναγρίδα": "συναγρίδα", - "συνοδεία": "συνοδία", - "σίφιλη": "σύφιλη", - "τανιέµαι": "τανυέµαι", - "τανίζω": "τανύζω", - "τέσσερις": "τέσσερεις", - "τζιτζιφιά": "τζιτζυφιά", - "τόνος": "τόννος", 
- "τοπείο": "τοπίο", - "τρέλλα": "τρέλα", - "τσαγγάρης": "τσαγκάρης", - "τσανάκα": "τσαννάκα", - "τσανακογλείφτης": "τσαννακογλείφτης", - "τσιτώνω": "τσητώνω", - "τσιγκλώ": "τσυγκλώ", - "τσίµα": "τσύµα", - "υννί": "υνί", - "υπερηφάνια": "υπερηφάνεια", - "υπόχρεως": "υπόχρεος", - "φάκελλος": "φάκελος", - "φείδι": "φίδι", - "φιλονεικώ": "φιλονικώ", - "φιλονεικία": "φιλονικία", - "φυρί-φυρί": "φιρί-φιρί", - "φτιάνω": "φτειάχνω", - "φτιάχνω": "φτειάχνω", - "φτώχεια": "φτώχια", - "φυσαλίδα": "φυσαλλίδα", - "χάνος": "χάννος", - "χυνόπωρο": "χινόπωρο", - "χεινόπωρο": "χινόπωρο", - "χιµίζω": "χυµίζω", - "χιμίζω": "χυμιζώ", - "γκωλ": "γκολ", - "αιρκοντίσιον": "ερκοντίσιον", - "καρµπυρατέρ": "καρµπφατέρ", - "κυλόττα": "κιλότα", - "κλή ρινγκ": "κλίρινγκ", - "κωλγκέρλ": "κολγκέρλ", - "κοµπιναιζόν": "κοµπινεζόν", - "κοπυράιτ": "κοπιράιτ", - "µυλαίδη": "µιλέδη", - "µποϋκοτάζ": "µποϊκοτάζ", - "πέναλτυ": "πέναλτι", - "πορτραίτο": "πορτρέτο", - "ρεστωράν": "ρεστοράν", - "ροσµπήφ": "ροσµπίφ", - "σαντιγύ": "σαντιγί", - "στριπτήζ": "στριπτίζ", - "ταµπλώ": "ταµπλό", - "τζόκεϋ": "τζόκεϊ", - "φουτµπώλ": "φουτµπόλ", - "τρόλλεϋ": "τρόλεϊ", - "χίππυ": "χίπι", - "φέρρυ-µπωτ": "φεριµπότ", - "χειρούργος": "χειρουργός", - "αβαείο": "αββαείο", - "αβάς": "αββάς", - "αβάσκαµα": "βάσκαµα", - "αβασκανία": "βασκανία", - "αβάφτιστος": "αβάπτιστος", - "αβάφτιστη": "αβάπτιστη", - "αβάφτιστο": "αβάπτιστο", - "αβγίλα": "αβγουλίλα", - "αυτί": "αφτί", - "αβδέλλα": "βδέλλα", - "Αβράµ": "'Αβραάµ", - "αγγινάρα": "αγκινάρα", - "αγγόνα": "εγγονή", - "αγγόνι": "εγγόνι", - "αγγονός": "εγγονός", - "άγειρτος": "άγερτος", - "άγειρτη": "άγερτη", - "άγειρτο": "άγερτο", - "αγέρας": "αέρας", - "αγκλέουρας": "αγλέορας", - "αγκλίτοα": "γκλίτσα", - "Αγκόλα": "Ανγκόλα", - "αγκορά": "ανγκορά", - "αγκοστοίιρα": "ανγκοστούρα", - "άγνεστος": "άγνεθος", - "άγνεστη": "άγνεθη", - "άγνεστο": "άγνεθο", - "αγώρι": "αγόρι", - "αγωρίστικος": "αγορίστικος", - "αγωρίστικη": "αγορίστικη", - "αγωρίστικο": "αγορίστικο", - "αγωροκόριτσο": 
"αγοροκόριστο", - "αγουρόλαδο": "αγουρέλαιο", - "αγροικώ": "γροικώ", - "αδάµαντας": "αδάµας", - "αδερφή": "αδελφή", - "αδέρφι": "αδέλφι", - "αδερφικός": "αδελφικός", - "αδερφική": "αδελφική", - "αδερφικό": "αδελφικό", - "αδερφοποιτός": "αδελφοποιτός", - "αδερφός": "αδελφός", - "αδερφοσύνη": "αδελφοσύνη", - "αέρι": "αγέρι", - "αερόµπικ": "αεροβική", - "αεροστρόβιλος": "αεριοστρόβιλος", - "αητός": "αετός", - "αιµατοποσία": "αιµοποσία", - "άιντε": "άντε", - "αισθηµατισµός": "συναισθηµατισµός", - "αιτιακός": "αιτιώδης", - "αιτιακή": "αιτιώδης", - "αιτιακό": "αιτιώδες", - "ακατανόµαστος": "ακατονόµαστος", - "ακατανόμαστη": "ακατονόμαστη", - "ακατονόμαστο": "ακατανόμαστο", - "ακέραιος": "ακέριος", - "ακέραια": "ακέρια", - "ακέραιο": "ακέριο", - "άκρον": "άκρο", - "ακτύπητος": "αχτύπητος", - "ακτύπητη": "αχτύπητη", - "ακτύπητο": "αχτύπητο", - "ακυριολεκτώ": "ακυρολεκτώ", - "ακυριολεξία": "ακυρολεξία", - "αλάτι": "άλας", - "αλατένιος": "αλάτινος", - "αλατένια": "αλάτινη", - "αλατένιο": "αλάτινο", - "αλαφραίνω": "ελαφρώνω", - "αλαφριός": "ελαφρύς", - "αλαφριό": "ελαφρύ", - "αλαφρόµυαλος": "ελαφρόµυαλος", - "αλαφρόμυαλη": "ελαφρόμυαλη", - "αλαφρόμυαλο": "ελαφρόμυαλο", - "αλείβω": "αλείφω", - "άλευρο": "αλεύρι", - "αλησµονησιά": "λησµονιά", - "αλκολίκι": "αλκοολίκι", - "αλλέως": "αλλιώς", - "αλληλοεπίδραση": "αλληλεπίδραση", - "αλλήθωρος": "αλλοίθωρος", - "αλλήθωρη": "αλλοίθωρη", - "αλλήθωρο": "αλλοίθωρο", - "αλλοίµονο": "αλίµονο", - "αµνηστεία": "αµνηστία", - "αµπαρόριζα": "αρµπαρόριζα", - "αµπέχωνο": "αµπέχονο", - "αµυγδαλάτος": "αµυγδαλωτός", - "αμυγδαλάτη": "αμυγδαλωτή", - "αμυγδαλάτο": "αμυγδαλωτό", - "αµυγδαλόλαδο": "αµυγδαλέλαιο", - "αµφίλογος": "αµφιλεγόµενος", - "αμφίλογη": "αμφιλεγόμενη", - "αμφίλογο": "αμφιλεγόμενο", - "αναβατός": "ανεβατός", - "αναβατή": "ανεβατή", - "αναβατό": "ανεβατό", - "αναδεχτός": "αναδεκτός", - "αναθρέφω": "ανατρέφω", - "ανακατώνω": "ανακατεύω", - "ανακάτωση": "ανακάτεµα", - "αναλίσκω": "αναλώνω", - "αναμειγνύω": "αναμιγνύω", - 
"αναμείκτης": "αναμίκτης", - "ανάµεικτος": "ανάµικτος", - "ανάμεικτη": "ανάμικτη", - "ανάμεικτο": "ανάμικτο", - "αναπαµός": "ανάπαυση", - "αναπαρασταίνω": "αναπαριστάνω", - "ανάπρωρος": "ανάπλωρος", - "ανάπρωρη": "ανάπλωρη", - "ανάπρωρο": "ανάπλωρο", - "αναπτυγµένος": "ανεπτυγμένος", - "αναπτυγµένη": "ανεπτυγμένη", - "αναπτυγµένο": "ανεπτυγμένο", - "άναστρος": "ανάστερος", - "αναστυλώνω": "αναστηλώνω", - "αναστύλωση": "αναστήλωση", - "ανεγνωρισµένος": "αναγνωρισµένος", - "αναγνωρισμένη": "αναγνωρισµένη", - "αναγνωρισμένο": "αναγνωρισµένο", - "ανέµυαλος": "άμυαλος", - "ανέμυαλη": "άμυαλη", - "ανέμυαλο": "άμυαλο", - "ανεπάντεχος": "αναπάντεχος", - "ανεπάντεχη": "αναπάντεχη", - "ανεπάντεχο": "αναπάντεχο", - "ανεψιά": "ανιψιά", - "ανεψιός": "ανιψιός", - "ανήρ": "άνδρας", - "ανηφόρι": "ανήφορος", - "ανηψιά": "ανιψιά", - "ανηψιός": "ανιψιός", - "άνθιση": "άνθηση", - "ανταλλάζω": "ανταλλάσσω", - "ανταπεξέρχοµαι": "αντεπεξέρχοµαι", - "αντζούγια": "αντσούγια", - "αντιεισαγγελέας": "αντεισαγγελέας", - "αντικατασταίνω": "αντικαθιστώ", - "αντικρύζω": "αντικρίζω", - "αντιµολία": "αντιµωλία", - "αντιπροσωπεία": "αντιπροσωπία", - "αντισταµινικό": "αντιισταµινικός", - "αντίχτυπος": "αντίκτυπος", - "άντρας": "άνδρας", - "αντρόγυνο": "ανδρόγυνο", - "αντρώνω": "ανδρώνω", - "άξια": "άξιος", - "απακούµπι": "αποκούµπι", - "απαλάµη": "παλάµη", - "Απαλάχια": "Αππαλάχια", - "απάνω": "επάνω", - "απέδρασα": "αποδιδράσκω", - "απλούς": "απλός", - "απλούν": "απλό", - "απόγαιο": "απόγειο", - "αποδείχνω": "αποδεικνύω", - "αποθαµός": "πεθαµός", - "αποθανατίζω": "απαθανατίζω", - "αποκεντροποίηση": "αποκέντρωση", - "απολαυή": "απολαβή", - "αποξεραίνω": "αποξηραίνω", - "απόξυοη": "απόξεση", - "απόξω": "απέξω", - "απόσχω": "απέχω", - "αποτίω": "αποτίνω", - "αποτυχαίνω": "αποτυγχάνω", - "αποχαιρετίζω": "αποχαιρετώ", - "απόχτηµα": "απόκτηµα", - "απόχτηση": "απόκτηση", - "αποχτώ": "αποκτώ", - "Απρίλης": "Απρίλιος", - "αρκαντάσης": "καρντάσης", - "αρµάρι": "ερµάριο", - "άρµη": "άλµη", - "αρµοστεία": 
"αρµοστία", - "άρµπουρο": "άλµπουρο", - "αρµύρα": "αλµύρα", - "αρµυρίκι": "αλµυρίκι", - "άρρην": "άρρεν", - "αρσανάς": "ταρσανάς", - "αρτύνω": "αρταίνω", - "αρχινίζω": "αρχίζω", - "αρχινώ": "αρχίζω", - "αρχίτερα": "αρχύτερα", - "ασκηµάδα": "ασχήµια", - "ασκηµαίνω": "ασχηµαίνω", - "ασκήµια": "ασχήµια", - "ασκηµίζω": "ασχηµίζω", - "άσσος": "άσος", - "αστράπτω": "αστράφτω", - "αστράπτω": "αστράφτω", - "αταχτώ": "ατακτώ", - "ατσάλινος": "ατσαλένιος", - "ατσάλινη": "ατσαλένια", - "ατσάλινο": "ατσαλένιο", - "Ατσιγγάνος": "Τσιγγάνος", - "Ατσίγγανος": "Τσιγγάνος", - "αυγαταίνω": "αβγατίζω", - "αυγατίζω": "αβγατίζω", - "αυγό": "αβγό", - "αυγοειδής": "αυγοειδής", - "αυγοειδές": "αβγοειδές", - "αυγοθήκη": "αβγοθήκη", - "αυγοκόβω": "αβγοκόβω", - "αυγοτάραχο": "αβγοτάραχο", - "αύλακας": "αυλάκι", - "αυτί": "αφτί", - "αυτιάζοµαι": "αφτιάζοµαι", - "αφορεσµός": "αφορισµός", - "άφρονας": "άφρων", - "αχείλι": "χείλι", - "άχερο": "άχυρο", - "αχερώνας": "αχυρώνας", - "αχιβάδα": "αχηβάδα", - "αχτίδα": "ακτίνα", - "βαβουίνος": "µπαµπουίνος", - "Βαγγέλης": "Ευάγγελος", - "βαγγέλιο": "ευαγγέλιο", - "Βάγια": "Βάί'α", - "βαζιβουζούκος": "βασιβουζούκος", - "βαθύνω": "βαθαίνω", - "βάιο": "βάγιο", - "βακαλάος": "µπακαλιάρος", - "βαλάντιο": "βαλλάντιο", - "βαλαντώνω": "βαλλαντώνω", - "βάνω": "βάζω", - "βαρειά": "βαριά", - "βαριεστίζω": "βαργεστώ", - "βαριεστώ": "βαργεστώ", - "βαρώ": "βαράω", - "βαρώνος": "βαρόνος", - "βασιλέας": "βασιλιάς", - "βασµούλος": "γασµούλος", - "Βαυαρία": "Βαβαρία", - "Βαυαροκρατία": "Βαβαροκρατία", - "βαφτίζω": "βαπτίζω", - "βάφτιση": "βάπτιση", - "βάφτισµα": "βάπτισµα", - "βαφτιστής": "βαπτιστής", - "βαφτιστικός": "βαπτιστικός", - "βαφτιστική": "βαπτιστική", - "βαφτιστικιά": "βαπτιστική", - "βαφτιστικό": "βαπτιστικό", - "βδοµάδα": "εβδοµάδα", - "βεγόνια": "µπιγκόνια", - "βελανίδι": "βαλανίδι", - "βελανιδιά": "βαλανιδιά", - "βενζίνα": "βενζίνη", - "βεράτιο": "µπεράτι", - "βερόκοκο": "βερίκοκο", - "βιγόνια": "µπιγκόνια", - "βλάφτω": "βλάπτω", - "βλογιά": "ευλογιά", - 
"βλογάω": "ευλογώ", - "βογγίζω": "βογγώ", - "βόγγος": "βογγητό", - "βογκητό": "βογγητό", - "βοδάµαξα": "βοϊδάµαξα", - "βόλλεϋ": "βόλεϊ", - "βολοκοπώ": "βωλοκοπώ", - "βόλος": "βώλος", - "βουβάλι": "βούβαλος", - "βουή": "βοή", - "βούλα": "βούλλα", - "βούλωµα": "βούλλωµα", - "βουλώνω": "βουλλώνω", - "βουρβόλακας": "βρικόλακας", - "βουρκόλακας": "βρικόλακας", - "βους": "βόδι", - "βραδι": "βράδυ", - "βρυκόλακας": "βρικόλακας", - "βρώµα": "βρόµα", - "βρώµη": "βρόµη", - "βρωµιά": "βροµιά", - "βρωµίζω": "βροµίζω", - "βρώµιο": "βρόµιο", - "βρωµώ": "βροµώ", - "βωξίτης": "βοξίτης", - "γάβρος": "γαύρος", - "γαϊδάρα": "γαϊδούρα", - "γαίµα": "αίµα", - "γαλακτόπιτα": "γαλατόπιτα", - "γάµα": "γάµµα", - "γαµβρός": "γαµπρός", - "γαρίφαλο": "γαρύφαλλο", - "γαρούφαλλο": "γαρύφαλλο", - "γαυγίζω": "γαβγίζω", - "γελάδα": "αγελάδα", - "γελέκο": "γιλέκο", - "γένοµαι": "γίνοµαι", - "γενότυπος": "γονότυπος", - "Γένουα": "Γένοβα", - "γεράζω": "γερνώ", - "γέρακας": "γεράκι", - "γερατειά": "γηρατειά", - "γεροκοµείο": "γηροκοµείο", - "γεροκοµώ": "γηροκοµώ", - "Γεσθηµανή": "Γεθσηµανή", - "γεώδης": "γαιώδης", - "γαιώδες": "γαιώδες", - "γηρασµός": "γήρανση", - "Γιάννενα": "Ιωάννινα", - "Γιάννινα": "Ιωάννινα", - "γιάνω": "γιαίνω", - "γιαουρτλού": "γιογουρτλού", - "Γιαπωνέζος": "Ιαπωνέζος", - "γιγαντεύω": "γιγαντώνω", - "γιεγιές": "γεγές", - "Γιεν": "γεν", - "γιέσµαν": "γέσµαν", - "γιόκας": "γυιόκας", - "γιορτασµός": "εορτασµός", - "γιος": "γυιος", - "Γιούλης": "Ιούλιος", - "Γιούνης": "Ιούνιος", - "γιοφύρι": "γεφύρι", - "Γιώργος": "Γεώργιος", - "γιωτ": "γιοτ", - "γιωτακισµός": "ιωτακισµός", - "γκάγκστερ": "γκάνγκστερ", - "γκαγκστερισµός": "γκανγκστερισµός", - "γκαµήλα": "καµήλα", - "γκεµπελίσκος": "γκαιµπελίσκος", - "γκιουβέτσι": "γιουβέτσι", - "γκιώνης": "γκιόνης", - "γκλοµπ": "κλοµπ", - "γκογκ": "γκονγκ", - "Γκιόνα": "Γκιώνα", - "γκόρφι": "γκόλφι", - "γκρα": "γκρας", - "Γκράβαρα": "Κράβαρα", - "γκυ": "γκι", - "γλαϋξ": "γλαύκα", - "γλιτώνω": "γλυτώνω", - "γλύκισµα": "γλύκυσµα", - "γλυστρώ": 
"γλιστρώ", - "γλωσσίδα": "γλωττίδα", - "γνέφαλλο": "γνάφαλλο", - "γνοιάζοµαι": "νοιάζοµαι", - "γόµα": "γόµµα", - "γόνα": "γόνατο", - "γονιός": "γονέας", - "γόπα": "γώπα", - "γούµενος": "ηγούµενος", - "γουµένισσα": "ηγουµένη", - "γουώκµαν": "γουόκµαν", - "γραία": "γριά", - "Γράµος": "Γράµµος", - "γρασίδι": "γρασσίδι", - "γρεγολεβάντες": "γραιγολεβάντες", - "γρέγος": "γραίγος", - "γρικώ": "γροικώ", - "Γροιλανδία": "Γροιλανδία", - "γρίνια": "γκρίνια", - "γροθοκοπώ": "γρονθοκοπώ", - "γρούµπος": "γρόµπος", - "γυαλοπωλείο": "υαλοπωλείο", - "γυρνώ": "γυρίζω", - "γόρωθε": "γύροθε", - "γωβιός": "κωβιός", - "δάγκάµα": "δάγκωµα", - "δαγκαµατιά": "δαγκωµατιά", - "δαγκανιά": "δαγκωνιά", - "δαιµονοπληξία": "δαιµονιόπληκτος", - "δαίµων": "δαίµονας", - "δακτυλήθρα": "δαχτυλήθρα", - "δακτυλίδι": "δαχτυλίδι", - "∆αυίδ": "∆αβίδ", - "δαχτυλογραφία": "δακτυλογραφία", - "δαχτυλογράφος": "δακτυλογράφος", - "δεικνύω": "δείχνω", - "δείλι": "δειλινό", - "δείχτης": "δείκτης", - "δελής": "ντελής", - "δενδρογαλή": "δεντρογαλιά", - "δεντρολίβανο": "δενδρολίβανο", - "δεντροστοιχία": "δενδροστοιχία", - "δεντροφυτεία": "δενδροφυτεία", - "δεντροφυτεύω": "δενδροφυτεύω", - "δεντρόφυτος": "δενδρόφυτος", - "δεξής": "δεξιό", - "δερµατώδης": "δερµατοειδής", - "δερματώδες": "δερµατοειδές", - "δέσποτας": "δεσπότης", - "δεφτέρι": "τεφτέρι", - "διαβατάρης": "διαβάτης", - "διάβηκα": "διαβαίνω", - "διαβιβρώσκω": "διαβρώνω", - "διαθρέψω": "διατρέφω", - "διακόνεµα": "διακονιά", - "διάολος": "διάβολος", - "∆ιαµαντής": "Αδαµάντιος", - "διαολιά": "διαβολιά", - "διαολογυναίκα": "διαβολογυναίκα", - "διαολοθήλυκο": "διαβολοθήλυκο", - "διαολόκαιρος": "διαβολόκαιρος", - "διαολοκόριτσο": "διαβολοκόριτσο", - "διαολόπαιδο": "διαβολόπαιδο", - "διάολος": "διάβολος", - "διασκελιά": "δρασκελιά", - "διαχύνω": "διαχέω", - "δίδω": "δίνω", - "δίκηο": "δίκιο", - "δοβλέτι": "ντοβλέτι", - "δοσίλογος": "δωσίλογος", - "δράχνω": "αδράχνω", - "δρέπανο": "δρεπάνι", - "δρόσος": "δροσιά", - "δώνω": "δίνω", - "εγγίζω": "αγγίζω", - "εδώθε": 
"δώθε", - "εδωνά": "εδωδά", - "εικοσάρι": "εικοσάρικο", - "εικών": "εικόνα", - "εισαγάγω": "εισάγω", - "εισήγαγα": "εισάγω", - "εισήχθην": "εισάγω", - "έκαμα": "έκανα", - "εκατόν": "εκατό", - "εκατοστάρης": "κατοστάρης", - "εκατοστάρι": "κατοστάρι", - "εκατοστάρικο": "κατοστάρικο", - "εκλαίρ": "εκλέρ", - "Ελδοράδο": "Ελντοράντο", - "ελευθεροτεκτονισµός": "τεκτονισµός", - "ελευτεριά": "ελευθερία", - "Ελεφαντοστού Ακτή": "Ακτή Ελεφαντοστού", - "ελληνικάδικο": "ελληνάδικο", - "Ελπίδα": "Ελπίς", - "εµορφιά": "οµορφιά", - "εµορφάδα": "οµορφιά", - "έµπορας": "έµπορος", - "εµώ": "εξεµώ", - "ένδεκα": "έντεκα", - "ενενήκοντα": "ενενήντα", - "ενωρίς": "νωρίς", - "εξανέστην": "εξανίσταµαι", - "εξήκοντα": "εξήντα", - "έξις": "έξη", - "εξωκκλήσι": "ξωκκλήσι", - "εξωµερίτης": "ξωµερίτης", - "επανωφόρι": "πανωφόρι", - "επιµειξία": "επιµιξία", - "επίστοµα": "απίστοµα", - "επτάζυµο": "εφτάζυµο", - "επταήµερος": "εφταηµερος", - "επταθέσιος": "εφταθέσιος", - "επταµελής": "εφταµελης", - "επταµηνία": "εφταµηνία", - "επταµηνίτικος": "εφταµηνίτικος", - "επταπλασιάζω": "εφταπλασιάζω", - "επταπλάσιος": "εφταπλάσιος", - "επτασύλλαβος": "εφτασύλλαβος", - "επτατάξιος": "εφτατάξιος", - "επτάτοµος": "εφτάτοµος", - "επτάφυλλος": "εφτάφυλλος", - "επτάχρονα": "εφτάχρονα", - "επτάχρονος": "εφτάχρονος", - "επταψήφιος": "εφταψήφιος", - "επτάωρος": "εφτάωρος", - "επταώροφος": "εφταώροφος", - "έργον": "έργο", - "ευκή": "ευχή", - "ευρό": "ευρώ", - "ευσπλαχνίζοµαι": "σπλαχνίζοµαι", - "εφεντης": "αφέντης", - "εφηµεριακός": "εφηµέριος", - "εφημεριακή": "εφηµέρια", - "εφημεριακό": "εφηµέριο", - "εφτά": "επτά", - "εφταετία": "επταετία", - "εφτακόσια": "επτακόσια", - "εφτακόσιοι": "επτακόσιοι", - "εφτακοσιοστός": "επτακοσιοστός", - "εχθές": "χθες", - "ζάπι": "ζάφτι", - "ζαχαριάζω": "ζαχαρώνω", - "ζαχαροµύκητας": "σακχαροµύκητας", - "ζεµανφού": "ζαµανφού", - "ζεµανφουτισµός": "ζαµανφουτισµός", - "ζέστα": "ζέστη", - "ζεύλα": "ζεύγλα", - "Ζηλανδία": "Νέα Ζηλανδία", - "ζήλεια": "ζήλια", - "ζιµπούλι": "ζουµπούλι", 
- "ζο": "ζώο", - "ζουρλαµάρα": "ζούρλα", - "ζωοφόρος": "ζωφόρος", - "ηλεκτροκόλληση": "ηλεκτροσυγκόλληση", - "ηλεκτροοπτική": "ηλεκτροπτική", - "ήλιο": "ήλιον", - "ηµιόροφος": "ηµιώροφος", - "θαλάµι": "θαλάµη", - "θάµα": "θαύµα", - "θαµπώνω": "θαµβώνω", - "θάµπος": "θάµβος", - "θάφτω": "θάβω", - "θεοψία": "θεοπτία", - "θέσει": "θέση", - "θηλειά": "θηλιά", - "Θόδωρος": "Θεόδωρος", - "θρύβω": "θρύπτω", - "θυµούµαι": "θυµάµαι", - "Ιαµάϊκή": "Τζαµάικα", - "ιατρεύω": "γιατρεύω", - "ιατρός": "γιατρός", - "ιατροσόφιο": "γιατροσόφι", - "I.Q.": "αϊ-κιού", - "ινατι": "γινάτι", - "ιονίζω": "ιοντίζω", - "ιονιστής": "ιοντιστής", - "ιονόσφαιρα": "ιοντόσφαιρα", - "Ιούλης": "Ιούλιος", - "ίσασµα": "ίσιωµα", - "ισιάζω": "ισιώνω", - "ίσκιος": "ήσκιος", - "ισκιώνω": "ησκιώνω", - "ίσωµα": "ίσιωµα", - "ισώνω": "ισιώνω", - "ιχθύαση": "ιχθύωση", - "ιώτα": "γιώτα", - "καββαλισµός": "καβαλισµός", - "κάβουρος": "κάβουρας", - "καδής": "κατής", - "καδρίλια": "καντρίλια", - "Καζακστάν": "Καζαχστάν", - "καθέκλα": "καρέκλα", - "κάθησα": "κάθισα", - "[1766]. 
καθίκι": "καθοίκι", - "καΐλα": "καήλα", - "καϊξής": "καϊκτσής", - "καλδέρα": "καλντέρα", - "καλεντάρι": "καλαντάρι", - "καλήν εσπέρα": "καλησπέρα", - "καλιά": "καλειά", - "καλιακούδα": "καλοιακούδα", - "κάλλια": "κάλλιο", - "καλλιά": "κάλλιο", - "καλόγηρος": "καλόγερος", - "καλόρχεται": "καλοέρχεται", - "καλσόν": "καλτσόν", - "καλυµµαύκι": "καµιλαύκι", - "καλύµπρα": "καλίµπρα", - "καλωσύνη": "καλοσύνη", - "καµαρωτός": "καµαρότος", - "καµηλαύκι": "καµιλαύκι", - "καµτσίκι": "καµουτσίκι", - "καναβάτσο": "κανναβάτσο", - "κανακίζω": "κανακεύω", - "κανάτα": "καννάτα", - "κανατάς": "καννατάς", - "κανάτι": "καννάτι", - "κανελής": "καννελής", - "κανελιά": "καννελή", - "κανελί": "καννελή", - "κανελονι": "καννελόνι", - "κανελλόνι": "καννελόνι", - "κανένας": "κανείς", - "κάνη": "κάννη", - "κανί": "καννί", - "κάνναβης": "κάνναβις", - "καννιβαλισµός": "κανιβαλισµός", - "καννίβαλος": "κανίβαλος", - "κανοκιάλι": "καννοκιάλι", - "κανόνι": "καννόνι", - "κανονιά": "καννονιά", - "κανονίδι": "καννονίδι", - "κανονιέρης": "καννονιέρης", - "κανονιοβολητής": "καννονιοβολητής", - "κανονιοβολισµός": "καννονιοβολισµός", - "κανονιοβολώ": "καννονιοβολώ", - "κανονιοστάσιο": "καννονιοστάσιο", - "κανονιοστοιχία": "καννονιοστοιχία", - "κανονοθυρίδα": "καννονοθυρίδα", - "κάνουλα": "κάννουλα", - "κανών": "κανόνας", - "κάπα": "κάππα", - "κάπαρη": "κάππαρη", - "καπαρντίνα": "καµπαρντίνα", - "καραβόσκοινο": "καραβόσχοινο", - "καρένα": "καρίνα", - "κάρκάδο": "κάκαδο", - "καροτίνη": "καρωτίνη", - "καρότο": "καρώτο", - "καροτόζουµο": "καρωτόζουµο", - "καροτοσαλάτα": "καρωτοσαλάτα", - "καρπούµαι": "καρπώνοµαι", - "καρρώ": "καρό", - "κάρυ": "κάρι", - "καρυοφύλλι": "καριοφίλι", - "καταΐφι": "κανταΐφι", - "κατακάθηµαι": "κατακάθοµαι", - "κατάντια": "κατάντηµα", - "κατασκοπεία": "κατασκοπία", - "καταφτάνω": "καταφθάνω", - "καταχράσθηκα": "καταχράστηκα", - "κατάχτηση": "κατάκτηση", - "καταχτητής": "κατακτητής", - "καταχτώ": "κατακτώ", - "καταχωρώ": "καταχωρίζω", - "κατέβαλα": "καταβάλλω", - "Κατερίνα": 
"Αικατερίνη", - "κατοστίζω": "εκατοστίζω", - "κάτου": "κάτω", - "κατρουλιό": "κατουρλιό", - "καυναδίζω": "καβγαδίζω", - "καϋµός": "καηµός", - "'κεί": "εκεί", - "κείθε": "εκείθε", - "καψόνι": "καψώνι", - "καψύλλιο": "καψούλι", - "κελάρης": "κελλάρης", - "κελί": "κελλί", - "κεντήτρια": "κεντήστρα", - "κεσέµι": "γκεσέµι", - "κέσιο": "καίσιο", - "κηπάριο": "κήπος", - "κινάρα": "αγκινάρα", - "κιοφτές": "κεφτές", - "κλαίγω": "κλαίω", - "κλαπάτσα": "χλαπάτσα", - "κλασσικίζω": "κλασικίζω", - "κλασσικιστής": "κλασικιστής", - "κλέπτης": "κλέφτης", - "κληθρα": "σκλήθρα", - "κλήρινγκ": "κλίρινγκ", - "κλιπ": "βιντεοκλίπ", - "κλωσά": "κλώσσα", - "κλωτσιά": "κλοτσιά", - "κογκλάβιο": "κονκλάβιο", - "κογκρέσο": "κονγκρέσο", - "κοιµίσης": "κοίµησης", - "κοιµούµαι": "κοιµάµαι", - "κοιτώ": "κοιτάζω", - "κοιτάω": "κοιτάζω", - "κόκαλο": "κόκκαλο", - "κοκίτης": "κοκκύτης", - "κοκκίαση": "κοκκίωση", - "κοκκοφοίνικας": "κοκοφοίνικας", - "κολάζ": "κολλάζ", - "κολαντρίζω": "κουλαντρίζω", - "κολαρίζω": "κολλαρίζω", - "κολεχτίβα": "κολεκτίβα", - "κολεχτιβισµός": "κολεκτιβισµός", - "κολιγιά": "κολληγιά", - "κολίγος": "κολλήγας", - "κολίγας": "κολλήγας", - "κολικόπονος": "κωλικόπονος", - "κολιός": "κολοιός", - "κολιτσίνα": "κολτσίνα", - "κολυµπήθρα": "κολυµβήθρα", - "κολώνα": "κολόνα", - "κολώνια": "κολόνια", - "κοµβόι": "κονβόι", - "κόµις": "κόµης", - "κόµισσα": "κόµης", - "κόµιτας": "κόµης", - "κοµιτεία": "κοµητεία", - "κόµµατα": "κοµµάτι", - "κοµµούνα": "κοµούνα", - "κοµµουναλισµός": "κοµουναλισµός", - "κοµµούνι": "κοµούνι", - "κοµµουνίζω": "κοµουνίζω", - "κοµµουνισµός": "κοµουνισµός", - "κοµµουνιστής": "κοµουνιστής", - "κονδυλοειδής": "κονδυλώδης", - "κονδυλοειδές": "κονδυλώδες", - "κονσέρτο": "κοντσέρτο", - "κόντραµπαντιέρης": "κοντραµπατζής", - "κοντσίνα": "κολτσίνα", - "κονφορµισµός": "κοµφορµισµός", - "κονφορµιστής": "κομφορμιστής", - "κοπελιά": "κοπέλα", - "κοπλιµέντο": "κοµπλιµέντο", - "κόπτω": "κόβω", - "κόπυραιτ": "κοπιράιτ", - "Κοριτσα": "Κορυτσά", - "κοριτσόπουλο": "κορίτσι", - 
"κορνέτο": "κορνέτα", - "κορνιζώνω": "κορνιζάρω", - "κορόιδεµα": "κοροϊδία", - "κορόνα": "κορώνα", - "κορφή": "κορυφή", - "κοσάρι": "εικοσάρικο", - "κοσάρικο": "εικοσάρικο", - "κοσµετολογία": "κοσµητολογία", - "κοτάω": "κοτώ", - "κουβαρνταλίκι": "χουβαρνταλίκι", - "κουβαρντάς": "χουβαρντάς", - "κουβερνάντα": "γκουβερνάντα", - "κούκος": "κούκκος", - "κουλλουρτζής": "κουλλουράς", - "κουλούρας": "κουλλουράς", - "κουλούρι": "κουλλούρι", - "κουλουριάζω": "κουλλουριάζω", - "κουλουρτζής": "κουλλουράς", - "κουρδιστής": "χορδιστής", - "κουρντιστής": "χορδιστής", - "κουρντίζω": "κουρδίζω", - "κουρντιστήρι": "κουρδιστήρι", - "κουστούµι": "κοστούµι", - "κουτεπιέ": "κουντεπιέ", - "κόφτης": "κόπτης", - "κόχη": "κόγχη", - "κοψοχείλης": "κοψαχείλης", - "κρεµάζω": "κρεµώ", - "κροντήρι": "κρωντήρι", - "κροµµύδι": "κρεµµύδι", - "κροµµυδίλα": "κρεµµυδίλα", - "κρουσταλλιάζω": "κρυσταλλιάζω", - "κτένα": "χτένα", - "κτενάκι": "χτενάκι", - "κτένι": "χτένι", - "κτενίζω": "χτενίζω", - "κτένισµα": "χτένισµα", - "κτίριο": "κτήριο", - "κυλίω": "κυλώ", - "κυττάζω": "κοιτάζω", - "κωλ-γκέρλ": "κολ-γκέρλ", - "κωλοµπαράς": "κολοµπαράς", - "κωσταντινάτο": "κωνσταντινάτο", - "Κώστας": "Κωνσταντίνος", - "κώχη": "κόγχη", - "λάβδα": "λάµβδα", - "λαγούτο": "λαούτο", - "λαγύνι": "λαγήνι", - "λαίδη": "λέδη", - "λαϊκάντζα": "λαϊκούρα", - "λαιµά": "λαιµός", - "λαΐνι": "λαγήνι", - "λαµπράδα": "λαµπρότητα", - "λάρος": "γλάρος", - "λατόµι": "λατοµείο", - "λαύδανο": "λάβδανο", - "λαυράκι": "λαβράκι", - "λαφίνα": "ελαφίνα", - "λαφόπουλο": "ελαφόπουλο", - "λειβάδι": "λιβάδι", - "Λειβαδιά": "Λιβάδια", - "λεϊµόνι": "λεµόνι", - "λεϊµονιά": "λεµονιά", - "Λειψία": "Λιψία", - "λέοντας": "λέων", - "λεπτά": "λεφτά", - "λεπτύνω": "λεπταίνω", - "λευκαστής": "λευκαντής", - "Λευτέρης": "Ελευθέριος", - "λευτερώνω": "ελευθερώνω", - "λέω": "λέγω", - "λιανεµπόριο": "λειανεµπόριο", - "λιανίζω": "λειανίζω", - "λιανοτούφεκο": "λειανοτούφεκο", - "λιανοντούφεκο": "λειανοντούφεκο", - "λιανοπούληµα": "λειανοπούληµα", - "λιανοπωλητής": 
"λειανοπωλητής", - "λιανοτράγουδο": "λειανοτράγουδο", - "λιγοψυχία": "ολιγοψυχία", - "λιθρίνι": "λυθρίνι", - "λιµένας": "λιµάνι", - "λίµπρα": "λίβρα", - "λιοβολιά": "ηλιοβολία", - "λιόδεντρο": "ελαιόδεντρο", - "λιόλαδο": "ελαιόλαδο", - "λιόσπορος": "ηλιόσπορος", - "λιοτρίβειο": "ελαιοτριβείο", - "λιοτρόπι": "ηλιοτρόπιο", - "λιόφως": "ηλιόφως", - "λιχουδιά": "λειχουδιά", - "λιώνω": "λειώνω", - "λογιωτατίζω": "λογιοτατίζω", - "λογιώτατος": "λογιότατος", - "λόγκος": "λόγγος", - "λόξιγκας": "λόξυγγας", - "λοτόµος": "υλοτόµος", - "Λουµπλιάνα": "Λιουµπλιάνα", - "λούω": "λούζω", - "λύγξ": "λύγκας", - "λυµφατισµός": "λεµφατισµός", - "λυντσάρω": "λιντσάρω", - "λυσσιακό": "λυσσακό", - "λυώνω": "λειώνω", - "Λωξάντρα": "Λοξάντρα", - "λωρένσιο": "λορένσιο", - "λωρίδα": "λουρίδα", - "µαγγάνιο": "µαγκάνιο", - "µαγγιώρος": "µαγκιόρος", - "µαγειριά": "µαγεριά", - "µάγειρος": "µάγειρας", - "µόγερας": "µάγειρας", - "µαγιώ": "µαγιό", - "µαγκανοπήγαδο": "µαγγανοπήγαδο", - "µαγκώνω": "µαγγώνω", - "µαγνόλια": "µανόλια", - "Μαγυάρος": "Μαγιάρος", - "µαζύ": "µαζί", - "µαζώνω": "µαζεύω", - "µαιζονέτα": "µεζονέτα", - "µαιτρ": "µετρ", - "µαιτρέσα": "µετρέσα", - "µακριός": "µακρύς", - "μακριά": "µακρυά", - "μακριό": "µακρύ", - "µαλάσσω": "µαλάζω", - "µαµά": "µαµµά", - "µαµouδι": "µαµούνι", - "µάνα": "µάννα", - "µανδαρινέα": "µανταρινιά", - "µανδήλι": "µαντήλι", - "µάνδρα": "µάντρα", - "µανές": "αµανές", - "Μανόλης": "Εµµανουήλ", - "µαντζούνι": "µατζούνι", - "µαντζουράνα": "µατζουράνα", - "µαντίλα": "µαντήλα", - "µαντίλι": "µαντήλι", - "µαντµαζέλ": "µαµαζέλ", - "µαντρίζω": "µαντρώνω", - "µαντώ": "µαντό", - "Μανώλης": "Εµµανουήλ", - "µάρτυς": "µάρτυρας", - "µασκάλη": "µασχάλη", - "µατοκυλίζω": "αιµατοκυλίζω", - "µατοκύλισµα": "αιµατοκυλίζω", - "µατσέτα": "µασέτα", - "µαυράδα": "µαυρίλα", - "μεγαλόπολη": "µεγαλούπολη", - "µεγαλοσπληνία": "σπληνοµεγαλία", - "µέγγενη": "µέγκενη", - "μείκτης": "µίκτης", - "µελίγγι": "µηλίγγι", - "µεντελισµός": "µενδελισµός", - "µενχίρ": "µενίρ", - "µέρα": "ηµέρα", - 
"µεράδι": "µοιράδι", - "µερεύω": "ηµερεύω", - "µέρµηγκας": "µυρµήγκι", - "µερµήγκι": "µυρµήγκι", - "µερσίνα": "µυρσίνη", - "µερσίνη": "µυρσίνη", - "µέρωµα": "ηµερώνω", - "µερώνω": "ηµερώνω", - "µέσον": "µέσο", - "µεσοούρανα": "µεσούρανα", - "µεταλίκι": "µεταλλίκι", - "µεταπούληση": "µεταπώληση", - "µεταπουλω": "µεταπωλώ", - "µετοχιάριος": "µετοχάρης", - "µητάτο": "µιτάτο", - "µητριά": "µητρυιά", - "µητριός": "µητρυιός", - "Μιανµάρ": "Μυανµάρ", - "Μίκι Μάους": "Μίκυ Μάους", - "µικρύνω": "µικραίνω", - "µινουέτο": "µενουέτο", - "µιξοπαρθένα": "µειξοπαρθένα", - "µισοφόρι": "µεσοφόρι", - "µίτζα": "µίζα", - "µολογώ": "οµολογώ", - "μολογάω": "οµολογώ", - "µοµία": "µούµια", - "µοµιοποίηση": "µουµιοποίηση", - "µονάρχιδος": "µόνορχις", - "µονιάζω": "µονοιάζω", - "µορφιά": "οµορφιά", - "µορφονιός": "οµορφονιός", - "µοσκάρι": "µοσχάρι", - "µοσκοβολιά": "µοσκοβολιά", - "µοσκοβολώ": "µοσχοβολώ", - "µοσκοκαρυδιά": "µοσχοκαρυδιά", - "µοσκοκάρυδο": "µοσχοκάρυδο", - "µοσκοκάρφι": "µοσχοκάρφι", - "µοσκολίβανο": "µοσχολίβανο", - "µοσκοµπίζελο": "µοσχοµπίζελο", - "µοσκοµυρίζω": "µοσχοµυρίζω", - "µοσκοπουλώ": "µοσχοπουλώ", - "µόσκος": "µόσχος", - "µοσκοσάπουνο": "µοσχοσάπουνο", - "µοσκοστάφυλο": "µοσχοστάφυλο", - "µόσχειος": "µοσχαρήσιος", - "μόσχειο": "µοσχαρήσιο", - "µουλώνω": "µουλαρώνω", - "µουρταδέλα": "µορταδέλα", - "µουσικάντης": "µουζικάντης", - "µουσσώνας": "µουσώνας", - "µουστάκα": "µουστάκι", - "µουστακοφόρος": "µυστακοφόρος", - "µπαγάζια": "µπαγκάζια", - "πάγκα": "µπάνκα", - "µπαγκαδορος": "µπανκαδόρος", - "µπογκέρης": "µπανκέρης", - "µπάγκος": "πάγκος", - "µπαιν-µαρί": "µπεν-µαρί", - "µπαλάντα": "µπαλλάντα", - "µπαλαντέζα": "µπαλλαντέζα", - "µπαλαντέρ": "µπαλλαντέρ", - "µπαλάντζα": "παλάντζα", - "µπαλένα": "µπαλαίνα", - "µπαλέτο": "µπαλλέτο", - "µπάλος": "µπάλλος", - "µπάλσαµο": "βάλσαµο", - "µπαλσάµωµα": "βαλσάµωµα", - "µπαλσαµώνω": "βαλσαµώνω", - "µπάλωµα": "µπάλλωµα", - "µπαλώνω": "µπαλλώνω", - "µπαµπάκι": "βαµβάκι", - "µπαµπακόσπορος": "βαµβακόσπορος", - "Μπάµπης": 
"Χαραλάµπης", - "µπάµπω": "βάβω", - "µπανέλα": "µπαναίλα", - "µπαρµπρίζ": "παρµπρίζ", - "µπατίστα": "βατίστα", - "µπαχτσές": "µπαξές", - "µπαχτσίσι": "µπαξίσι", - "µπεζεβέγκης": "πεζεβέγκης", - "µπελτές": "πελτές", - "µπεντόνι": "µπιντόνι", - "µπερδουκλώνω": "µπουρδουκλώνω", - "µπερκέτι": "µπερεκέτι", - "µπετόνι": "µπιτόνι", - "µπεχαβιορισµός": "µπιχεβιορισµός", - "µπεχλιβάνης": "πεχλιβάνης", - "µπιγκουτί": "µπικουτί", - "µπιµπίλα": "µπιρµπίλα", - "µπιµπλό": "µπιµπελό", - "µπιρσίµι": "µπρισίµι", - "µπις": "µπιζ", - "µπιστόλα": "πιστόλα", - "µπιστόλι": "πιστόλι", - "µπιστολιά": "πιστολιά", - "µπιτόνι": "µπιντόνι", - "µπογιάρος": "βογιάρος", - "µπονάτσα": "µπουνάτσα", - "µπονατσάρει": "µπουνατσάρει", - "µπουά": "µποά", - "µπουκαµβίλια": "βουκαµβίλια", - "µποϋκοταζ": "µποϊκοτάζ", - "µποϋκοτάρω": "µποϊκοτάρω", - "µπουλβάρ": "βουλεβάρτο", - "µπουρδέλο": "µπορντέλο", - "µπουρµπουάρ": "πουρµπουάρ", - "µπρίζα": "πρίζα", - "µπριτζόλα": "µπριζόλα", - "µπρος": "εµπρός", - "µπύρα": "µπίρα", - "µπυραρία": "µπιραρία", - "µπυροποσία": "µπιροποσία", - "µυγδαλιά": "αµυγδαλιά", - "µύγδαλο": "αµύγδαλο", - "µυλόρδος": "µιλόρδος", - "μυρουδιά": "µυρωδιά", - "µυτζήθρα": "µυζήθρα", - "µύωψ": "µύωπας", - "µώλος": "µόλος", - "νέθω": "γνέθω", - "νι": "νυ", - "νίκελ": "νικέλιο", - "νοµεύς": "νοµέας", - "νοστιµίζω": "νοστιµεύω", - "νουννός": "νοννός", - "νταβάνι": "ταβάνι", - "ντάβανος": "τάβανος", - "νταβανόσκουπα": "ταβανόσκουπα", - "νταβούλι": "νταούλι", - "νταλαβέρι": "νταραβέρι", - "νταµπλάς": "ταµπλάς", - "ντελαπάρω": "ντεραπάρω", - "ντενεκές": "τενεκές", - "ντερβεναγος": "δερβέναγας", - "ντερβένι": "δερβένι", - "ντερβίσης": "δερβίσης", - "ντερβισόπαιδο": "δερβισόπαιδο", - "ντοκυµανταίρ": "ντοκιµαντέρ", - "ντουνρού": "ντογρού", - "ντουζ": "ντους", - "ντουζιέρα": "ντουσιέρα", - "Ντούµα": "∆ούµα", - "ντούπλεξ": "ντούµπλεξ", - "ντουφέκι": "τουφέκι", - "ντουφεκίδι": "τουφεκίδι", - "ντουφεκίζω": "τουφεκίζω", - "ντουφεξής": "τουφεξής", - "νύκτα": "νύχτα", - "νυκτωδία": "νυχτωδία", - 
"νωµατάρχης": "ενωµοτάρχης", - "ξανεµίζω": "εξανεµίζω", - "ξεγνοιάζω": "ξενοιάζω", - "ξεγνοιασιά": "ξενοιασιά", - "ξελαφρώνω": "ξαλαφρώνω", - "ξεπίτηδες": "επίτηδες", - "ξεπιτούτου": "εξεπιτούτου", - "ξεσκάζω": "ξεσκάω", - "ξεσπάζω": "ξεσπώ", - "ξεσχίζω": "ξεσκίζω", - "ξέσχισµα": "ξεσκίζω", - "ξευτελίζω": "εξευτελίζω", - "ξεφτίζω": "ξεφτύζω", - "ξεφτίλα": "ξευτίλα", - "ξεφτίλας": "ξευτίλας", - "ξεφτιλίζω": "ξευτιλίζω", - "ξεχάνω": "ξεχνώ", - "ξηγώ": "εξηγώ", - "ξηροφαγία": "ξεροφαγία", - "ξηροφαγιά": "ξεροφαγία", - "ξι": "ξει", - "ξιπασιά": "ξυπασιά", - "ξίπασµα": "ξύπασµα", - "ξιπολησιά": "ξυπολυσιά", - "ξιπολιέµαι": "ξυπολιέµαι", - "εξοµολόγηση": "ξομολόγηση", - "ξοµολογητής": "εξοµολογητής", - "ξοµολόγος": "εξοµολόγος", - "ξοµολογώ": "εξοµολογώ", - "ξουράφι": "ξυράφι", - "ξουράφια": "ξυραφιά", - "ξόφληση": "εξόφληση", - "ξύγγι": "ξίγγι", - "ξύγκι": "ξίγγι", - "ξύδι": "ξίδι", - "ξυλοσκίστης": "ξυλοσχίστης", - "ξυλώνω": "ξηλώνω", - "ξυνωρίδα": "συνωρίδα", - "ξώθυρα": "εξώθυρα", - "ξώπορτα": "εξώπορτα", - "ξώφυλλο": "εξώφυλλο", - "οδοντογιατρός": "οδοντίατρος", - "οδοντόπονος": "πονόδοντος", - "οικογενειακά": "οικογενειακώς", - "οικοκυρά": "νοικοκυρά", - "οκτάς": "οκτάδα", - "οκταετής": "οχταετής", - "οκταετές": "οχταετές", - "οκταετία": "οχταετία", - "οµοιάζω": "µοιάζω", - "οµοιώνω": "εξοµοιώνω", - "οµόµετρο": "ωµόµετρο", - "οµορφάδα": "οµορφιά", - "οµπρός": "εµπρός", - "ονείρεµα": "όνειρο", - "οξείδιο": "οξίδιο", - "οξειδοαναγωγή": "οξιδοαναγωγή", - "οξειδώνω": "οξιδώνω", - "οξείδωση": "οξίδωση", - "οξειδωτής": "οξιδωτής", - "οξιζενέ": "οξυζενέ", - "οπίσω": "πίσω", - "οργιά": "οργυιά", - "όρνεο": "όρνιο", - "όρνις": "όρνιθα", - "ορρός": "ορός", - "όσµωση": "ώσµωση", - "οστεΐτιδα": "οστίτιδα", - "οστεογονία": "οστεογένεση", - "οφίτσιο": "οφίκιο", - "οφφίκιο": "οφίκιο", - "οχτάβα": "οκτάβα", - "οχτάδα": "οκτάδα", - "οχταετία": "οκταετία", - "οχτακόσια": "οκτακόσια", - "οχτακόσιοι": "οκτακόσιοι", - "οχτακόσιες": "οκτακόσιες", - "οχτακόσια": "οκτακόσια", - "όχτρητα": 
"έχθρητα", - "οχτώ": "οκτώ", - "Οχτώβρης": "Οκτώβριος", - "οψιανός": "οψιδιανός", - "παγαίνω": "πηγαίνω", - "παγόνι": "παγώνι", - "παιγνίδι": "παιχνίδι", - "παίδαρος": "παίδαρος", - "παίχτης": "παίκτης", - "παλικαράς": "παλληκαράς", - "παλικάρι": "παλληκάρι", - "παλικαριά": "παλληκαριά", - "παλικαροσύνη": "παλληκαροσύνη", - "παλληκαρίστίκος": "παλληκαρήσιος", - "παλληκαρίστικη": "παλληκαρήσια", - "παλληκαρίστικο": "παλληκαρήσιο", - "παλληκαροσύνη": "παλληκαριά", - "πανταλόνι": "παντελόνι", - "παντατίφ": "πανταντίφ", - "πανταχούσα": "απανταχούσα", - "Πάντοβα": "Πάδοβα", - "παντούφλα": "παντόφλα", - "παντοχή": "απαντοχή", - "πανψυχισµός": "παµψυχισµός", - "πάνω": "επάνω", - "παπαδάκι": "παππαδάκι", - "παπαδαρειό": "παππαδαρειό", - "παπαδιά": "παππαδιά", - "παπαδοκόρη": "παππαδοκόρη", - "παπαδοκρατούµαι": "παππαδοκρατούµαι", - "παπαδολόι": "παππαδολόι", - "παπαδοπαίδι": "παππαδοπαίδι", - "παπαδοπούλα": "παππαδοπούλα", - "Παπαδόπουλο": "παππαδόπουλο", - "παπατζής": "παππατζής", - "παπατρέχας": "παππατρέχας", - "παραγιάς": "παραγυιός", - "παρανυχίδα": "παρωνυχίδα", - "παρεισφρύω": "παρεισφρέω", - "παρεννοώ": "παρανοώ", - "παρ' ολίγο": "παραλίγο", - "πασαβιόλα": "µπασαβιόλα", - "πασάλειµµα": "πασσάλειµµα", - "πασαλείφω": "πασσαλείφω", - "πασκίζω": "πασχίζω", - "παστρουµάς": "παστουρµάς", - "πατερµά": "πατερηµά", - "πατήρ": "πατέρας", - "πατούνα": "πατούσα", - "πατριός": "πατρυιός", - "πάτρονας": "πάτρωνας", - "πάψη": "παύση", - "πεθυµώ": "επιθυµώ", - "πείρος": "πίρος", - "πελέκι": "πέλεκυς", - "πελεκίζω": "πελεκώ", - "πελλόγρα": "πελάγρα", - "πεντήκοντα": "πενήντα", - "πεντόβολα": "πεντόβωλα", - "πεντόδραχµο": "πεντάδραχµο", - "περβολάρης": "περιβολάρης", - "περβόλι": "περιβόλι", - "περδικλώνω": "πεδικλώνω", - "περηφανεύοµαι": "υπερηφανεύοµαι", - "περηφάνια": "υπερηφάνεια", - "περικόβω": "περικόπτω", - "περιπατώ": "περπατώ", - "περιστεριώνας": "περιστερώνας", - "περιτάµω": "περιτέµνω", - "περιφάνεια": "περηφάνια", - "περιφράζω": "περιφράσσω", - "περιχαράζω": 
"περιχαράσσω", - "περιχέω": "περιχύνω", - "περντάχι": "µπερντάχι", - "πέρπυρο": "υπέρπυρο", - "πέρσι": "πέρυσι", - "πετούγια": "µπετούγια", - "πευκιάς": "πευκώνας", - "πηγεµός": "πηγαιµός", - "πηγούνι": "πιγούνι", - "πήτα": "πίτα", - "πήχυς": "πήχης", - "πι": "πει", - "πιζάµα": "πιτζάµα", - "πιθαµή": "σπιθαµή", - "πιθώνω": "απιθώνω", - "πίκρισµα": "πικρίζω", - "πιλαλώ": "πηλαλώ", - "Πιλάτος": "Πόντιος Πιλάτος", - "πιοτό": "ποτό", - "πιπίζω": "πιππίζω", - "πιρέξ": "πυρέξ", - "πίστοµα": "απίστοµα", - "πιτσιλάδα": "πιτσυλάδα", - "πιτσιλιά": "πιτσυλιά", - "πίττα": "πίτα", - "πίτυρον": "πίτουρο", - "πλάγι": "πλάι", - "πλανάρω": "πλανίζω", - "πλάσσω": "πλάθω", - "πλειονοψηφία": "πλειοψηφία", - "πλείονοψηφώ": "πλειοψηφώ", - "πλεξίδα": "πλεξούδα", - "πλερωµή": "πληρωµή", - "πλερώνω": "πληρώνω", - "πλέυ µπόυ": "πλεϊµπόι", - "πλέχτης": "πλέκτης", - "πληµµύρα": "πληµύρα", - "πνιγµός": "πνίξιµο", - "πνευµονόκοκκος": "πνευµονιόκοκκος", - "ποιµήν": "ποιµένας", - "πόλις": "πόλη", - "πόλιτσµαν": "πόλισµαν", - "πολιτσµάνος": "πόλισµαν", - "πολύµπριζο": "πολύπριζο", - "πολυπάω": "πολυπηγαίνω", - "πολύπους": "πολύποδας", - "Πόρτο Ρίκο": "Πουέρτο Ρίκο", - "ποταπαγόρευση": "ποτοαπαγόρευση", - "πούντρα": "πούδρα", - "πράµα": "πράγµα", - "πρεβάζι": "περβάζι", - "πρέπον": "πρέπων", - "προαγάγω": "προάγω", - "προδίνω": "προδίδω", - "προιξ": "προίκα", - "προποτζής": "προπατζής", - "προσαγάγω": "προσάγω", - "πρόσµιξη": "πρόσµειξη", - "προσφύγω": "προσφεύγω", - "προφθάνω": "προφταίνω", - "προφυλάω": "προφυλάσσω", - "προψές": "προχθές", - "πρύµη": "πρύµνη", - "πταρνίζοµαι": "φταρνίζοµαι", - "πτελέα": "φτελιά", - "πτέρνα": "φτέρνα", - "πτερυγίζω": "φτερουγίζω", - "πτιφούρ": "πετιφούρ", - "πτι-φούρ": "πετιφούρ", - "πτωχαίνω": "φτωχαίνω", - "πτώχεια": "φτώχια", - "πυκνά": "πυκνός", - "πυλωτή": "πιλοτή", - "πύο": "πύον", - "πυρογενής": "πυριγενής", - "πυρογενές": "πυριγενές", - "πυτζάµα": "πιτζάµα", - "ραγκλόν": "ρεγκλάν", - "ραγού": "ραγκού", - "ραΐζω": "ραγίζω", - "ραίντνκεν": "ρέντγκεν", - 
"ράντζο": "ράντσο", - "ράπτω": "ράβω", - "ρεβανί": "ραβανί", - "ρέγγε": "ρέγκε", - "Ρεγγίνα": "Ρεγκίνα", - "ρεµούλκα": "ρυµούλκα", - "ασκέρι": "ασκέρι", - "ρεοβάση": "ρευµατοβάση", - "ρεπανάκι": "ραπανάκι", - "ρεπάνι": "ραπάνι", - "ρεύω": "ρέβω", - "ρήγα": "ρίγα", - "ρηµοκκλήσι": "ερηµοκκλήσι", - "ριγκ": "ρινγκ", - "ριζότο": "ρυζότο", - "ροβίθι": "ρεβίθι", - "ροβιθιά": "ρεβιθιά", - "ροδακινιά": "ρωδακινιά", - "ροδάκινο": "ρωδάκινο", - "ρόιδι": "ρόδι", - "ροϊδιά": "ροδιά", - "ρόιδο": "ρόδι", - "ροοστάτης": "ρεοστάτης", - "ροφώ": "ρουφώ", - "ρωδιός": "ερωδιός", - "ρωθωνίζω": "ρουθουνίζω", - "ρωµαντισµός": "ροµαντισµός", - "Ρωσσία": "Ρωσία", - "ρωτώ": "ερωτώ", - "σάζω": "σιάζω", - "σαιζλόνγκ": "σεζλόνγκ", - "σαιζόν": "σεζόν", - "σαγολαίφα": "σακολαίβα", - "σάκκα": "σάκα", - "σακκάκι": "σακάκι", - "σακκάς": "σακάς", - "σακκί": "σακί", - "σακκίδιο": "σακίδιο", - "σακκοβελόνα": "σακοβελόνα", - "σακκογκόλιθος": "σακογκόλιθος", - "σακκοειδής": "σακοειδής", - "σακκοειδές": "σακοειδες", - "σακκοράφα": "σακοράφα", - "σάκκος": "σάκος", - "σακκουλα": "σακούλα", - "σακκουλάκι": "σακούλι", - "σακκουλεύοµαι": "σακουλεύοµαι", - "σακκούλι": "σακούλι", - "σακκουλιάζω": "σακουλιάζω", - "σακχαροδιαβήτης": "ζαχαροδιαβήτης", - "σάκχαροκαλάµο": "ζαχαροκάλαµο", - "σακχαροποιία": "ζαχαροποιία", - "σακχαρότευτλον": "ζαχαρότευτλο", - "σαλιαρίστρα": "σαλιάρα", - "σαλπιστής": "σαλπιγκτής", - "σαντακρούτα": "σατακρούτα", - "σαντάλι": "σανδάλι", - "σάνταλο": "σανδάλι", - "σάρρα": "σάρα", - "σαφρίδι": "σαυρίδι", - "σαχάνι": "σαγάνι", - "σβολιάζω": "σβωλιάζω", - "σβώλιασμα": "σβόλιασµα", - "σβόλος": "σβώλος", - "σβύνω": "σβήνω", - "σγουρώνω": "σγουραίνω", - "σενκόντο": "σεκόντο", - "σεγκούνα": "σιγκούνα", - "σεγόντο": "σεκόντο", - "Σειληνός": "Σιληνός", - "σείρακας": "σείρικας", - "σειρήτι": "σιρίτι", - "σεκονταρω": "σιγοντάρω", - "σεγκοντάρω": "σιγοντάρω", - "σελιλόιντ": "σελουλόιντ", - "σέλλα": "σέλα", - "σεξπιριστής": "σαιξπηριστής", - "Σεράγεβο": "Σαράγεβο", - "σεστέτο": "σεξτέτο", - "σετέτο": 
"σεπτέτο", - "σέχτα": "σέκτα", - "σεχταρισµός": "σεκταρισµός", - "σηµαφόρος": "σηµατοφόρος", - "σήριαλ": "σίριαλ", - "σηψίνη": "σηπτίνη", - "σιγάρο": "τσιγάρο", - "σιγαροθήκη": "τσιγαροθήκη", - "σίγλος": "σίκλος", - "σιγόντο": "σεκόντο", - "Σίδνεϊ": "Σύδνεϋ", - "σίελος": "σίαλος", - "σινθεσάιζερ": "συνθεσάιζερ", - "σιντέφι": "σεντέφι", - "σιορ": "σινιόρ", - "σιρυΐάνι": "σεργιάνι", - "σιρµαγιά": "σερµαγιά", - "σίτα": "σήτα", - "σταρέµπορος": "σιτέµπορος", - "σκανδαλιά": "σκανταλιά", - "σκάνταλο": "σκάνδαλο", - "σκάπτω": "σκάβω", - "σκάρα": "σχάρα", - "σκαρµός": "σκαλµός", - "σκάφτω": "σκάβω", - "σκεβρώνω": "σκευρώνω", - "σκερπάνι": "σκεπάρνι", - "σκίζα": "σχίζα", - "σκίζω": "σχίζω", - "σκίνος": "σχίνος", - "σκίσιµο": "σχίσιµο", - "σκισµάδα": "σχισµάδα", - "σκισµή": "σχισµή", - "σκλήρωση": "σκλήρυνση", - "σκοινάκι": "σχοινάκι", - "σκονί": "σχοινί", - "σκοινί": "σχοινί", - "σκοίνος": "σχοίνος", - "σκολάω": "σχολώ", - "σκολειαρόπαιδο": "σχολειαρόπαιδο", - "σκολειαρούδι": "σχολειαρούδι", - "σκολειό": "σχολείο", - "σκόλη": "σχόλη", - "σκολιαρόπαιδο": "σχολειαρόπαιδο", - "σκολιαρούδι": "σχολειαρούδι", - "σκολιό": "σχολειό", - "σκολνώ": "σχολώ", - "σκολώ": "σχολώ", - "Σκοτία": "Σκωτία", - "σκότισµα": "σκοτισµός", - "Σκοτσέζος": "Σκωτσέζος", - "σκουντούφληµα": "σκουντούφλα", - "σκώληξ": "σκουλήκι", - "σκώτι": "συκώτι", - "σοβαντεπί": "σοβατεπί", - "σοβατίζω": "σοβαντίζω", - "σοροκολεβάντες": "σιροκολεβάντες", - "σορόκος": "σιρόκος", - "σοροπιάζω": "σιροπιάζω", - "σουβατίζω": "σοβαντίζω", - "σουβαντίζω": "σοβαντίζω", - "σουβάς": "σοβάς", - "σουβατεπί": "σοβαντεπί", - "σοβατεπί": "σοβαντεπί", - "σουµιέ": "σοµιέ", - "σούρσιµο": "σύρσιµο", - "σουσπασιόν": "σισπανσιόν", - "σοφεράρω": "σοφάρω", - "σπαής": "σπαχής", - "σπαράσσω": "σπαράζω", - "σπερµατσετο": "σπαρµατσέτο", - "σπερµίνη": "σπερµατίνη", - "σπερµοβλάστη": "σπερµατοβλάστη", - "σπερµογονία": "σπερµατογονία", - "σπερµοδότης": "σπερµατοδότης", - "σπερµοδόχος": "σπερµατοδόχος", - "σπερμοδόχο": "σπερµατοδόχο", - 
"σπερµοθήκη": "σπερµατοθήκη", - "σπερµοκτόνος": "σπερµατοκτόνος", - "σπερμοκτόνο": "σπερµατοκτόνο", - "σπερµοτοξίνη": "σπερµατοτοξίνη", - "σπερµοφάγος": "σπερµατοφάγος", - "σπερμοφάγο": "σπερµατοφάγο", - "σπερµοφόρος": "σπερµατοφόρος", - "σπερμοφόρο": "σπερµατοφόρο", - "σπινάρω": "σπινιάρω", - "σπιράλ": "σπειράλ", - "σπλάχνο": "σπλάγχνο", - "σπογγίζω": "σφουγγίζω", - "σπω": "σπάζω", - "Στάθης": "Ευστάθιος", - "στάλαµα": "στάλαγµα", - "σταλαµατιά": "σταλαγµατιά", - "σταλαξιά": "σταλαγµατιά", - "σταλίτσα": "σταλιά", - "σταρήθρα": "σιταρήθρα", - "στάρι": "σιτάρι", - "σταρότοπος": "σιταρότοπος", - "σταχολογώ": "σταχυολογώ", - "στειρεύω": "στερεύω", - "στειροποιώ": "στειρώνω", - "Στέλιος": "Στυλιανός", - "Στέλλα": "Στυλιανή", - "στεναχώρια": "στενοχώρια", - "στεναχωρώ": "στενοχωρώ", - "στένω": "στήνω", - "στέριωµα": "στερέωµα", - "στεριώνω": "στερεώνω", - "στέρξιµο": "στέργω", - "στιλ": "στυλ", - "στιλάκι": "στυλάκι", - "στιλιζάρω": "στυλιζάρω", - "στιλίστας": "στυλίστας", - "στιλό": "στυλό", - "στιφάδο": "στυφάδο", - "στορίζω": "ιστορώ", - "στόρισµα": "ιστόρηση", - "στραβοµάρα": "στραβωµάρα", - "στραγγουλίζω": "στραγγαλίζω", - "Στρατής": "Ευστράτιος", - "στρατί": "στράτα", - "στρατοποίηση": "στρατιωτικοποίηση", - "Στράτος": "Ευστράτιος", - "στρένω": "στέργω", - "στριµόκωλα": "στρυµόκωλα", - "στριµωξίδι": "στρυµωξίδι", - "στριµώχνω": "στρυµώχνω", - "στύβω": "στείβω", - "στυπώνω": "στουπώνω", - "σύγνεφο": "σύννεφο", - "συγνώµη": "συγγνώµη", - "συδαυλίζω": "συνδαυλίζω", - "συµπαρασέρνω": "συµπαρασύρω", - "συµπεθεριά": "συµπεθεριό", - "δεκαέξι": "δεκάξι", - "συνήθιο": "συνήθειο", - "συντάµω": "συντέµνω", - "συντριβάνι": "σιντριβάνι", - "συνυφάδα": "συννυφάδα", - "συφορά": "συµφορά", - "συχώρεση": "συγχώρηση", - "συχωρώ": "συγχωρώ", - "συχωροχάρτι": "συγχωροχάρτι", - "σφαλνώ": "σφαλίζω", - "σφεντάµι": "σφένδαµνος", - "σφερδούκλι": "σπερδούκλι", - "σφόνδυλος": "σπόνδυλος", - "σωβινισµός": "σοβινισµός", - "σωβινιστής": "σοβινιστής", - "σώνω": "σώζω", - "σωρείτης": "σωρίτης", 
- "σωτάρω": "σοτάρω", - "σωτέ": "σοτέ", - "Σωτήρης": "Σωτήριος", - "σωφέρ": "σοφέρ", - "ταβατούρι": "νταβαντούρι", - "ταβερνούλα": "ταβέρνα", - "ταβλάς": "ταµπλάς", - "ταγιαδόρος": "ταλιαδόρος", - "ταγίζω": "ταΐζω", - "τάγισµα": "τάισµα", - "ταγκό": "τανγκό", - "ταή": "ταγή", - "τάλαρο": "τάλιρο", - "τάλληρο": "τάλιρο", - "ταµίευση": "αποταµίευση", - "ταµιεύω": "αποταµιεύω", - "ταµώ": "τέµνω", - "ταξείδι": "ταξίδι", - "ταπεραµέντο": "ταµπεραµέντο", - "ταράσσω": "ταράζω", - "ταχτοποίηση": "τακτοποίηση", - "ταχτοποιώ": "τακτοποιώ", - "τελάλης": "ντελάλης", - "τελολογία": "τελεολογία", - "τεριρέµ": "τερερέµ", - "τερραίν": "τερέν", - "τέσσαρα": "τέσσερα", - "τετράς": "τετράδα", - "τζέντζερης": "τέντζερης", - "τζετζερέδια": "τεντζερέδια", - "τζιριτζάντζουλα": "τζυριτζάτζουλα", - "τζίρος": "τζύρος", - "τζιτζιµπίρα": "τσιτσιµπίρα", - "τηκ": "τικ", - "τηλοµοιοτύπηµα": "τηλεοµοιοτύπηµα", - "τηλοµοιοτυπία": "τηλεοµοιοτυπία", - "τηλοµοιοτυπώ": "τηλεοµοιοτυπώ", - "τιτιβίζω": "τιττυβίζω", - "τµήθηκα": "τέµνω", - "τµήσω": "τέµνω", - "Τόκιο": "Τόκυο", - "τοµάτα": "ντοµάτα", - "τοµατιά": "ντοµατιά", - "τοµατοπολτός": "ντοµατοπολτός", - "τοµατοσαλάτα": "ντοµατοσαλάτα", - "τονθορύζω": "υποτονθορύζω", - "τορβάς": "ντορβάς", - "τορνάρω": "τορνεύω", - "τορπίλα": "τορπίλη", - "τούνδρα": "τούντρα", - "Τουρκάλα": "Τούρκος", - "τράβαλα": "ντράβαλα", - "τραΐ": "τραγί", - "τραινάρισµα": "τρενάρισµα", - "τραινάρω": "τρενάρω", - "τραίνο": "τρένο", - "τρακόσοι": "τριακόσιοι", - "τραπεζάκι": "τραπέζι", - "τρέµουλο": "τρεµούλα", - "τρέψω": "τρέπω", - "τριάµισι": "τρεισήµισι", - "τρικλίζω": "τρεκλίζω", - "τρίκλισµα": "τρέκλισµα", - "τρίπλα": "ντρίπλα", - "τριπλαδόρος": "ντριπλαδόρος", - "τριπλάρω": "ντριπλάρω", - "τρίπους": "τρίποδας", - "τρόπις": "τρόπιδα", - "τρυκ": "τρικ", - "τσαγγαράδικο": "τσαγκαράδικο", - "τσογγάρης": "τσαγκάρης", - "τσαγγάρικο": "τσαγκάρικο", - "τσαγγαροδευτέρα": "τσαγκαροδευτέρα", - "τσάµπα": "τζάµπα", - "τσαµπατζής": "τζαµπατζής", - "τσαντίζω": "τσατίζω", - "τσαντίλα": 
"τσατίλα", - "τσαντίλας": "τσατίλας", - "τσάντισµα": "τσάτισµα", - "τσίβα": "τζίβα", - "τσίκλα": "τσίχλα", - "τσιµεντώνω": "τσιµεντάρω", - "τσιπούρα": "τσιππούρα", - "τσιρίζω": "τσυρίζω", - "τσιριτσάντζουλα": "τζιριτζάντζουλα", - "τσιρότο": "τσηρώτο", - "τσίτα": "τσήτα", - "τσιτσιρίζω": "τσυτσυρίζω", - "τσιτσίρισµα": "τσυτσυρίζω", - "τσίτωµα": "τσήτωµα", - "τσοµπάνος": "τσοµπάνης", - "τσοπάνης": "τσοµπάνης", - "τσοπανόπουλο": "τσοµπανόπουλο", - "τσοπάνος": "τσοµπάνης", - "τσύνορο": "τσίνορο", - "τυράγνισµα": "τυράννισµα", - "τυραγνω": "τυραννώ", - "τυφεκίζω": "τουφεκίζω", - "τυφεκισµός": "τουφεκισµός", - "υαλόχαρτον": "γυαλόχαρτο", - "υαλόχαρτο": "γυαλόχαρτο", - "υάρδα": "γιάρδα", - "ύβρη": "ύβρις", - "υδατοσκοπια": "υδροσκοπία", - "υδραέριο": "υδαταέριο", - "ύελος": "ύαλος", - "Υόρκη Νέα": "Νέα Υόρκη", - "υποδείχνω": "υποδεικνύω", - "υπόδεσις": "υπόδηση", - "υποκάµισο": "πουκάµισο", - "φαγκρί": "φαγγρί", - "φαγοκύτωση": "φαγοκυττάρωση", - "ψόγουσα": "φαγέδαινα", - "φαγωµός": "φαγωµάρα", - "φάδι": "υφάδι", - "φαινοµεναλισµός": "φαινοµενοκρατία", - "φαινοµενισµός": "φαινοµενοκρατία", - "φαίνω": "υφαίνω", - "φαλακρώνω": "φαλακραίνω", - "φαµίλια": "φαµελιά", - "φαµφάρα": "φανφάρα", - "φαµφαρονισµος": "φανφαρονισµός", - "φαµφαρόνος": "φανφαρόνος", - "φαράκλα": "φαλάκρα", - "φαρµασόνος": "φραµασόνος", - "φαρµπαλάς": "φραµπαλάς", - "φασουλάδα": "φασολάδα", - "φασουλάκια": "φασολάκια", - "φασουλιά": "φασολιά", - "φασούλι": "φασόλι", - "φελόνι": "φαιλόνιο", - "φελώ": "ωφελώ", - "φεουδαλισµός": "φεουδαρχισµός", - "φερµάνι": "φιρµάνι", - "φέτος": "εφέτος", - "φθήνια": "φτήνια", - "Φιλανδία": "Φινλανδία", - "φιλενάδα": "φιλαινάδα", - "φιλιστρίνι": "φινιστρίνι", - "φιλόφρονας": "φιλόφρων", - "φιντάνι": "φυντάνι", - "φιορντ": "φιόρδ", - "φίσκα": "φύσκα", - "φκειάνω": "φτειάχνω", - "φκιάνω": "φτειάχνω", - "φκειασιδι": "φτειασίδι", - "φκειασίδωµα": "φτειασίδωµα", - "φκειασιδώνω": "φτειασιδώνω", - "φκιασιδι": "φτειασίδι", - "φκιασίδωµα": "φτειασίδωµα", - "φκιασιδώνω": 
"φτειασιδώνω", - "φκυάρι": "φτυάρι", - "Φλάνδρα": "Φλαµανδία", - "φλισκούνι": "φλησκούνι", - "φλοίδα": "φλούδα", - "φλοµιάζω": "φλοµώνω", - "φλορίνι": "φιορίνι", - "φλυτζάνι": "φλιτζάνι", - "φοβούµαι": "φοβάµαι", - "φονεύς": "φονιάς", - "φόντα": "φόντο", - "φουσέκι": "φισέκι", - "φούχτα": "χούφτα", - "φουχτώνω": "χουφτώνω", - "Φραγκφούρτη": "Φρανκφούρτη", - "φράσσω": "φράζω", - "Φρίντα": "Φρειδερίκη", - "Φροσύνη": "Ευφροσύνη", - "Φρόσω": "Ευφροσύνη", - "φροϋδισµος": "φροϊδισµός", - "φρουµάζω": "φριµάζω", - "φρούµασµα": "φρίµασµα", - "φτάνω": "φθάνω", - "φταρνίζοµαι": "φτερνίζοµαι", - "φτειάνω": "φτειάχνω", - "φτηνά": "φθηνά", - "φτηναίνω": "φθηναίνω", - "φτιασίδι": "φτειασίδι", - "φτιασιδώνοµαι": "φτειασιδώνοµαι", - "φτωχοκοµείο": "πτωχοκοµείο", - "φυγάδας": "φυγάς", - "φύγω": "φεύγω", - "φυλάγω": "φυλάσσω", - "φυλλαράκι": "φύλλο", - "φυλλόδεντρο": "φιλόδεντρο", - "φυλώ": "φυλάσσω", - "φυσέκι": "φισέκι", - "φυσεκλίκι": "φισεκλίκι", - "φυσιοθεραπεία": "φυσικοθεραπεία", - "φυστίκι": "φιστίκι", - "φυστικιά": "φιστικιά", - "φύω": "φύοµαι", - "φχαριστώ": "ευχαριστώ", - "φωβισµός": "φοβισµός", - "φωβιστής": "φοβισµός", - "Φώτης": "Φώτιος", - "φωτογραφώ": "φωτογραφίζω", - "φωτοβολή": ", φωτοβολία", - "χάβω": "χάφτω", - "χαΐδεµα": "χαϊδεύω", - "χάιδι": "χάδι", - "χαλνώ": "χαλώ", - "χαλυβώνω": "χαλυβδώνω", - "χάµου": "χάµω", - "χαµψίνι": "χαµσίνι", - "χάνδρα": "χάντρα", - "χαντζής": "χανιτζής", - "χαραµατιά": "χαραγµατιά", - "χάραξ": "χάρακας", - "χάροντας": "χάρος", - "χατζάρα": "χαντζάρα", - "χατζάρι": "χαντζάρι", - "χεγκελιανισµός": "εγελιανισµός", - "χειρόβολο": "χερόβολο", - "χειροµάχηµα": "χεροµαχώ", - "χειροµάχισσα": "χεροµάχος", - "χειροµάχος": "χεροµάχος", - "χειροµαχώ": "χεροµαχώ", - "χέρα": "χέρι", - "χερόµυλος": "χειρόµυλος", - "χεροπόδαρα": "χειροπόδαρα", - "χηνάρι": "χήνα", - "χι": "χει", - "χιµώ": "χυµώ", - "χιών": "χιόνι", - "χλεµπάνια": "πλεµπάγια", - "χλοΐζω": "χλοάζω", - "χλόισµα": "χλόασµα", - "χνώτο": "χνότο", - "χορδίζω": "κουρδίζω", - "χόρδισµα": 
"κούρδισμα", - "χοχλάζω": "κοχλάζω", - "χοχλακιάζω": "κοχλάζω", - "χοχλακίζω": "κοχλάζω", - "χοχλακώ": "κοχλάζω", - "χρεογραφο": "χρεώγραφο", - "χρεοκοπία": "χρεωκοπία", - "χρεοκοπώ": "χρεωκοπώ", - "χρεολυσία": "χρεωλυσία", - "χρεολύσιο": "χρεωλύσιο", - "χρεόλυτρο": "χρεώλυτρο", - "χρεοπιστώνω": "πιστοχρεώνω", - "χρεοπίστωση": "πιστοχρεώνω", - "χρεοστάσιο": "χρεωστάσιο", - "χρεοφειλέτης": "χρεωφειλέτης", - "Χρήστος": "Χρίστος", - "χρωµατόσωµα": "χρωµόσωµα", - "χρωµογόνος": "χρωµατογόνος", - "χρωµογόνο": "χρωµατογόνο", - "χρωµοφόρος": "χρωµατοφόρος", - "χρωµοφόρο": "χρωµατοφόρο", - "χτες": "χθες", - "χτήµα": "κτήµα", - "χτίζω": "κτίζω", - "χτίσιµο": "κτίσιµο", - "χτίσµα": "κτίσµα", - "χτίστης": "κτίστης", - "χτύπηµα": "κτύπηµα", - "χτύπος": "κτύπος", - "χτυπώ": "κτυπώ", - "χυµίζω": "χυµώ", - "χωλ": "χολ", - "χώνεψη": "χώνευση", - "χωριατοσύνη": "χωριατιά", - "ψένω": "ψήνω", - "ψηλαφώ": "ψηλαφίζω", - "ψηφιδοθέτης": "ψηφοθέτης", - "ψιττακίαση": "ψιττάκωση", - "ψίχαλο": "ψίχουλο", - "ψυχεδελισµός": "ψυχεδέλεια", - "ψυχογιός": "ψυχογυιός", - "ψώριασµα": "ψωριάζω", - "ωγκρατέν": "ογκρατέν", - "ωράριο": "οράριο", - "ώς": "έως", - "ωτασπίδα": "ωτοασπίδα", - "ωτοστόπ": "οτοστόπ", - "ωφελιµοκρατία": "ωφελιµισµός", - "ωχαδερφισµός": "οχαδερφισµός", - "ώχου": "όχου", - "άγυρτος": "άγειρτος", - "άγυρτη": "άγειρτη", - "άγυρτο": "άγειρτο", - "ανηµέρευτος": "ανηµέρωτος", - "ανηµέρευτη": "ανηµέρωτη", - "ανηµέρευτο": "ανηµέρωτο", - "ανοικτός": "ανοιχτός", - "ανοικτή": "ανοιχτή", - "ανοικτό": "ανοιχτό", - "αντιελληνικός": "ανθελληνικός", - "αντιελληνική": "ανθελληνική", - "αντιελληνικό": "ανθελληνικό", - "αντιεπιστηµονικος": "αντεπιστηµονικός", - "αντιεπιστηµονικη": "αντεπιστηµονική", - "αντιεπιστηµονικο": "αντεπιστηµονικό", - "αξόφλητος": "ανεξόφλητος", - "αξόφλητη": "ανεξόφλητη", - "αξόφλητο": "ανεξόφλητο", - "άπαιχτος": "άπαικτος", - "άπαιχτη": "άπαικτη", - "άπαιχτο": "άπαικτο", - "απηρχαιωµένος": "απαρχαιωµένος", - "απηρχαιωµένη": "απαρχαιωµένη", - "απηρχαιωµένο": "απαρχαιωµένο", 
- "άπιωτος": "άπιοτος", - "άπιωτη": "άπιοτη", - "άπιωτο": "άπιοτο", - "άπραχτος": "άπρακτος", - "άπραχτη": "άπρακτη", - "άπραχτο": "άπρακτο", - "άραχλος": "άραχνος", - "άραχλη": "άραχνη", - "άραχλο": "άραχνο", - "αρήγωτος": "αρίγωτος", - "αρήγωτη": "αρίγωτη", - "αρήγωτο": "αρίγωτο", - "αρµενικός": "αρµενιακός", - "αρµενική": "αρµενιακή", - "αρµενικό": "αρµενιακό", - "αρµυρός": "αλµυρός", - "αρµυρή": "αλµυρή", - "αρµυρό": "αλµυρό", - "άσβεστος": "άσβηστος", - "άσβεστη": "άσβηστη", - "άσβεστο": "άσβηστο", - "άσκηµος": "άσχηµος", - "άσκηµη": "άσχηµη", - "άσκηµο": "άσχηµο", - "άστυφτος": "άστειφτος", - "άστυφτη": "άστειφτη", - "άστυφτο": "άστειφτο", - "ασυχώρετος": "ασυγχώρητος", - "ασυχώρετη": "ασυγχώρητη", - "ασυχώρετο": "ασυγχώρητο", - "άταχτος": "άτακτος", - "άταχτη": "άτακτη", - "άταχτο": "άτακτο", - "άφκιαστος": "άφτειαχτος", - "άφκιαστη": "άφτειαχτη", - "άφκιαστο": "άφτειαχτο", - "άφκειαστος": "άφτειαχτος", - "άφκειαστη": "άφτειαχτη", - "άφκειαστο": "άφτειαχτο", - "άφταστος": "άφθαστος", - "άφταστη": "άφθαστη", - "άφταστο": "άφθαστο", - "άφτερος": "άπτερος", - "άφτερη": "άπτερη", - "άφτερο": "άπτερο", - "αχτιδωτος": "ακτινωτός", - "αχτιδωτη": "ακτινωτή", - "αχτιδωτο": "ακτινωτό", - "άχτιστος": "άκτιστος", - "άχτιστη": "άκτιστη", - "άχτιστο": "άκτιστο", - "βιωτικός": "βιοτικός", - "βιωτική": "βιοτική", - "βιωτικό": "βιοτικό", - "βλάστηµος": "βλάσφηµος", - "βλάστηµη": "βλάσφηµη", - "βλάστηµο": "βλάσφηµο", - "βλογηµένος": "ευλογηµένος", - "βλογηµένη": "ευλογηµένη", - "βλογηµένο": "ευλογηµένο", - "βοϊδινός": "βοδινός", - "βοϊδινή": "βοδινή", - "βοϊδινό": "βοδινό", - "βορινός": "βορεινός", - "βορινή": "βορεινή", - "βορινό": "βορεινό", - "βρωµερός": "βροµερός", - "βρωµερή": "βροµερή", - "βρωµερό": "βροµερό", - "βρώµικος": "βρόµικος", - "βρώµικη": "βρόµικη", - "βρώµικο": "βρόµικο", - "γαλατερός": "γαλακτερός", - "γαλατερή": "γαλακτερή", - "γαλατερό": "γαλακτερό", - "γδυµνός": "γυµνός", - "γδυµνή": "γυµνή", - "γδυµνό": "γυµνό", - "γελαδινός": "αγελαδινός", - "γελαδινή": 
"αγελαδινή", - "γελαδινό": "αγελαδινό", - "γερτός": "γειρτός", - "γερτή": "γειρτή", - "γερτό": "γειρτό", - "γιοµάτος": "γεµάτος", - "γιοµάτη": "γεµάτη", - "γιοµάτο": "γεµάτο", - "γκεµπελικός": "γκαιµπελικός", - "γκεµπελική": "γκαιµπελική", - "γκεµπελικό": "γκαιµπελικό", - "γλήγορος": "γρήγορος", - "γλήγορη": "γρήγορη", - "γλήγορο": "γρήγορο", - "γρανίτινος": "γρανιτένιος", - "γρανίτινη": "γρανιτένιη", - "γρανίτινο": "γρανιτένιο", - "γραφτός": "γραπτός", - "γραφτή": "γραπτή", - "γραφτό": "γραπτό", - "γυρτός": "γειρτός", - "γυρτή": "γειρτή", - "γυρτό": "γειρτό", - "δαιµονόπληκτος": "δαιµονιόπληκτος", - "δαιµονόπληκτη": "δαιµονιόπληκτη", - "δαιµονόπληκτο": "δαιµονιόπληκτο", - "δερµικός": "δερµατικός", - "δερµική": "δερµατική", - "δερµικό": "δερµατικό", - "δεχτός": "δεκτός", - "δεχτή": "δεκτή", - "δεχτό": "δεκτό", - "διαλεκτός": "διαλεχτός", - "διαλεκτή": "διαλεχτή", - "διαλεκτό": "διαλεχτό", - "διαολεµένος": "διαβολεµένος", - "διαολεµένη": "διαβολεµένη", - "διαολεµένο": "διαβολεµένο", - "δυσέλεγκτος": "δυσεξέλεγκτος", - "δυσέλεγκτη": "δυσεξέλεγκτη", - "δυσέλεγκτο": "δυσεξέλεγκτο", - "δυσλεκτικός": "δυσλεξικός", - "δυσλεκτική": "δυσλεξική", - "δυσλεκτικό": "δυσλεξικό", - "εκδοµένος": "εκδεδοµένος", - "εκδοµένη": "εκδεδοµένη", - "εκδοµένο": "εκδεδοµένο", - "ελεύτερος": "ελεύθερος", - "ελεύτερη": "ελεύθερη", - "ελεύτερο": "ελεύθερο", - "εξώφθαλµος": "εξόφθαλµος", - "εξώφθαλµη": "εξόφθαλµη", - "εξώφθαλµο": "εξόφθαλµο", - "επανωτός": "απανωτός", - "επανωτή": "απανωτή", - "επανωτό": "απανωτό", - "επεξηγητικος": "επεξηγηµατικός", - "επεξηγητικη": "επεξηγηµατική", - "επεξηγητικο": "επεξηγηµατικό", - "έρµος": "έρηµος", - "έρµη": "έρηµη", - "έρµο": "έρηµο", - "ετερόκλητος": "ετερόκλιτος", - "ετερόκλητη": "ετερόκλιτη", - "ετερόκλητο": "ετερόκλιτο", - "ετούτος": "τούτος", - "ετούτη": "τούτη", - "ετούτο": "τούτο", - "εφετεινός": "εφετινός", - "εφετεινή": "εφετινή", - "εφετεινό": "εφετινό", - "εφταήµερος": "επταήµερος", - "εφταήµερη": "επταήµερη", - "εφταήµερο": "επταήµερο", - 
"ζάµπλουτος": "ζάπλουτος", - "ζάµπλουτη": "ζάπλουτη", - "ζάµπλουτο": "ζάπλουτο", - "ζαχαράτος": "ζαχαρωτός", - "ζαχαράτη": "ζαχαρωτή", - "ζαχαράτο": "ζαχαρωτό", - "θαµβός": "θαµπός", - "θαµβή": "θαµπή", - "θαµβό": "θαµπό", - "θραψερός": "θρεψερός", - "θραψερή": "θρεψερή", - "θραψερό": "θρεψερό", - "ιονικός": "ιοντικός", - "ιονική": "ιοντική", - "ιονικό": "ιοντικό", - "καββαλιστικός": "καβαλιστικός", - "καββαλιστική": "καβαλιστική", - "καββαλιστικό": "καβαλιστικό", - "καλλίτερος": "καλύτερος", - "καλλίτερη": "καλύτερη", - "καλλίτερο": "καλύτερο", - "καταχτητικός": "κατακτητικός", - "καταχτητική": "κατακτητική", - "καταχτητικό": "κατακτητικό", - "καταψυγµένος": "κατεψυγµένος", - "καταψυγµένη": "κατεψυγµένη", - "καταψυγµένο": "κατεψυγµένο", - "καυδιανός": "καβδιανός", - "καυδιανή": "καβδιανή", - "καυδιανό": "καβδιανό", - "καϋµένος": "καηµένος", - "καϋµένη": "καηµένη", - "καϋµένο": "καηµένο", - "κέδρινος": "κέδρος", - "κέδρινη": "κέδρη", - "κέδρινο": "κέδρο", - "κεραµεικος": "κεραµικός", - "κεραµεικη": "κεραµική", - "κεραµεικο": "κεραµικό", - "κλασσικός": "κλασικός", - "κλασσική": "κλασική", - "κλασσικό": "κλασικό", - "κόλαριστός": "κολλαριστός", - "κόλαριστή": "κολλαριστή", - "κόλαριστό": "κολλαριστό", - "κοµµουνιστικός": "κοµουνιστικός", - "κοµµουνιστική": "κοµουνιστική", - "κοµµουνιστικό": "κοµουνιστικό", - "κοράλλινος": "κοραλλένιος", - "κοράλλινη": "κοραλλένιη", - "κοράλλινο": "κοραλλένιο", - "κτυπητός": "χτυπητός", - "κτυπητή": "χτυπητή", - "κτυπητό": "χτυπητό", - "κωφός": "κουφός", - "κωφή": "κουφή", - "κωφό": "κουφό", - "λειπανάβατος": "λειψανάβατος", - "λειπανάβατη": "λειψανάβατη", - "λειπανάβατο": "λειψανάβατο", - "λιανικός": "λειανικός", - "λιανική": "λειανική", - "λιανικό": "λειανικό", - "λιανός": "λειανός", - "λιανή": "λειανή", - "λιανό": "λειανό", - "λιγοήµερος": "ολιγοήµερος", - "λιγοήµερη": "ολιγοήµερη", - "λιγοήµερο": "ολιγοήµερο", - "λιγόκαρδος": "ολιγόκαρδος", - "λιγόκαρδη": "ολιγόκαρδη", - "λιγόκαρδο": "ολιγόκαρδο", - "λιγόλογος": "ολιγόλογος", - 
"λιγόλογη": "ολιγόλογη", - "λιγόλογο": "ολιγόλογο", - "λιγόπιστος": "ολιγόπιστος", - "λιγόπιστη": "ολιγόπιστη", - "λιγόπιστο": "ολιγόπιστο", - "λιγόψυχος": "ολιγοψυχία", - "λιγόψυχοςή": "ολιγοψυχίαη", - "λιγόψυχοςό": "ολιγοψυχίαο", - "λιόλουστος": "ηλιόλουστος", - "λιόλουστη": "ηλιόλουστη", - "λιόλουστο": "ηλιόλουστο", - "λιόµορφος": "ηλιόµορφος", - "λιόµορφη": "ηλιόµορφη", - "λιόµορφο": "ηλιόµορφο", - "λιόχαρος": "ηλιόχαρος", - "λιόχαρη": "ηλιόχαρη", - "λιόχαρο": "ηλιόχαρο", - "λιπανάβατος": "λειψανάβατος", - "λιπανάβατη": "λειψανάβατη", - "λιπανάβατο": "λειψανάβατο", - "λυµφατικός": "λεµφατικός", - "λυµφατική": "λεµφατική", - "λυµφατικό": "λεµφατικό", - "µαυριδερός": "µαυρειδερός", - "µαυριδερή": "µαυρειδερή", - "µαυριδερό": "µαυρειδερό", - "µεικτός": "µικτός", - "µεικτή": "µικτή", - "µεικτό": "µικτό", - "µελαψός": "µελαµψός", - "µελαψή": "µελαµψή", - "µελαψό": "µελαµψό", - "µετάξινος": "µεταξένιος", - "µετάξινη": "µεταξένιη", - "µετάξινο": "µεταξένιο", - "µιξοβάρβαρος": "µειξοβάρβαρος", - "µιξοβάρβαρη": "µειξοβάρβαρη", - "µιξοβάρβαρο": "µειξοβάρβαρο", - "µοσκαναθρεµµένος": "µοσχαναθρεµµένος", - "µοσκαναθρεµµένη": "µοσχαναθρεµµένη", - "µοσκαναθρεµµένο": "µοσχαναθρεµµένο", - "µουλωχτός": "µουλλωχτός", - "µουλωχτή": "µουλλωχτή", - "µουλωχτό": "µουλλωχτό", - "µπαµπακερός": "βαµβακερός", - "µπαµπακερή": "βαµβακερή", - "µπαµπακερό": "βαµβακερό", - "νεόχτιστος": "νεόκτιστος", - "νεόχτιστη": "νεόκτιστη", - "νεόχτιστο": "νεόκτιστο", - "νηστίσιµος": "νηστήσιµος", - "νηστίσιµη": "νηστήσιµη", - "νηστίσιµο": "νηστήσιµο", - "νιογέννητος": "νεογέννητος", - "νιογέννητη": "νεογέννητη", - "νιογέννητο": "νεογέννητο", - "νυκτερινός": "νυχτερινός", - "νυκτερινή": "νυχτερινή", - "νυκτερινό": "νυχτερινό", - "ξιπόλητος": "ξυπόλυτος", - "ξιπόλητη": "ξυπόλυτη", - "ξιπόλητο": "ξυπόλυτο", - "ξυνός": "ξινός", - "ξυνή": "ξινή", - "ξυνό": "ξινό", - "ξωτικός": "εξωτικός", - "ξωτική": "εξωτική", - "ξωτικό": "εξωτικό", - "οικονοµίστικος": "οικονοµικίστικος", - "οικονοµίστικη": "οικονοµικίστικη", 
- "οικονοµίστικο": "οικονοµικίστικο", - "οκταγωνικός": "οχταγωνικός", - "οκταγωνική": "οχταγωνική", - "οκταγωνικό": "οχταγωνικό", - "οκτάγωνος": "οχτάγωνος", - "οκτάγωνη": "οχτάγωνη", - "οκτάγωνο": "οχτάγωνο", - "οκτάεδρος": "οχτάεδρος", - "οκτάεδρη": "οχτάεδρη", - "οκτάεδρο": "οχτάεδρο", - "οκτάκιλος": "οχτάκιλος", - "οκτάκιλη": "οχτάκιλη", - "οκτάκιλο": "οχτάκιλο", - "οξειδώσιµος": "οξιδώσιµος", - "οξειδώσιµη": "οξιδώσιµη", - "οξειδώσιµο": "οξιδώσιµο", - "ορεχτικός": "ορεκτικός", - "ορεχτική": "ορεκτική", - "ορεχτικό": "ορεκτικό", - "οχταγωνικός": "οκταγωνικός", - "οχταγωνική": "οκταγωνική", - "οχταγωνικό": "οκταγωνικό", - "οχτάγωνος": "οκτάγωνος", - "οχτάγωνη": "οκτάγωνη", - "οχτάγωνο": "οκτάγωνο", - "οχτάεδρος": "οκτάεδρος", - "οχτάεδρη": "οκτάεδρη", - "οχτάεδρο": "οκτάεδρο", - "οχτακοσιοστός": "οκτακοσιοστός", - "οχτακοσιοστή": "οκτακοσιοστή", - "οχτακοσιοστό": "οκτακοσιοστό", - "οχτάπλευρος": "οκτάπλευρος", - "οχτάπλευρη": "οκτάπλευρη", - "οχτάπλευρο": "οκτάπλευρο", - "οχτάστηλος": "οκτάστηλος", - "οχτάστηλη": "οκτάστηλη", - "οχτάστηλο": "οκτάστηλο", - "οχτάστιχος": "οκτάστιχος", - "οχτάστιχη": "οκτάστιχη", - "οχτάστιχο": "οκτάστιχο", - "οχτάωρος": "οκτάωρος", - "οχτάωρη": "οκτάωρη", - "οχτάωρο": "οκτάωρο", - "οχτωβριανός": "οκτωβριανός", - "οχτωβριανή": "οκτωβριανή", - "οχτωβριανό": "οκτωβριανό", - "παιδιακίστικος": "παιδιάστικος", - "παιδιακίστικη": "παιδιάστικη", - "παιδιακίστικο": "παιδιάστικο", - "πανέρµος": "πανέρηµος", - "πανέρµη": "πανέρηµη", - "πανέρµο": "πανέρηµο", - "παπαδικός": "παππαδικός", - "παπαδική": "παππαδική", - "παπαδικό": "παππαδικό", - "παπαδίστικος": "παππαδίστικος", - "παπαδίστικη": "παππαδίστικη", - "παπαδίστικο": "παππαδίστικο", - "παραεκκλησιαστικός": "παρεκκλησιαστικός", - "παραεκκλησιαστική": "παρεκκλησιαστική", - "παραεκκλησιαστικό": "παρεκκλησιαστικό", - "πειρακτικός": "πειραχτικός", - "πειρακτική": "πειραχτική", - "πειρακτικό": "πειραχτικό", - "περήφανος": "υπερήφανος", - "περήφανη": "υπερήφανη", - "περήφανο": "υπερήφανο", - 
"περσότερος": "περισσότερος", - "περσότερη": "περισσότερη", - "περσότερο": "περισσότερο", - "πεταγµένος": "πεταµένος", - "πεταγµένη": "πεταµένη", - "πεταγµένο": "πεταµένο", - "πηκτός": "πηχτός", - "πηκτή": "πηχτή", - "πηκτό": "πηχτό", - "πιτσιλιστός": "πιτσυλιστός", - "πιτσιλιστή": "πιτσυλιστή", - "πιτσιλιστό": "πιτσυλιστό", - "πλεχτικός": "πλεκτικός", - "πλεχτική": "πλεκτική", - "πλεχτικό": "πλεκτικό", - "πλεχτός": "πλεκτός", - "πλεχτή": "πλεκτή", - "πλεχτό": "πλεκτό", - "προσεχτικός": "προσεκτικός", - "προσεχτική": "προσεκτική", - "προσεχτικό": "προσεκτικό", - "προψεσινός": "προχθεσινός", - "προψεσινή": "προχθεσινή", - "προψεσινό": "προχθεσινό", - "πτερωτός": "φτερωτός", - "πτερωτή": "φτερωτή", - "πτερωτό": "φτερωτό", - "πτωχικός": "φτωχικός", - "πτωχική": "φτωχική", - "πτωχικό": "φτωχικό", - "ραφτικός": "ραπτικός", - "ραφτική": "ραπτική", - "ραφτικό": "ραπτικό", - "ραφτός": "ραπτός", - "ραφτή": "ραπτή", - "ραφτό": "ραπτό", - "ρούσικος": "ρωσικός", - "ρούσικη": "ρωσική", - "ρούσικο": "ρωσικό", - "ρωµαντικός": "ροµαντικός", - "ρωµαντική": "ροµαντική", - "ρωµαντικό": "ροµαντικό", - "σειληνικός": "σιληνικός", - "σειληνική": "σιληνική", - "σειληνικό": "σιληνικό", - "σειριακός": "σειραϊκός", - "σειριακή": "σειραϊκή", - "σειριακό": "σειραϊκό", - "σεξπιρικός": "σαιξπηρικός", - "σεξπιρική": "σαιξπηρική", - "σεξπιρικό": "σαιξπηρικό", - "σιδηρόφρακτος": "σιδερόφραχτος", - "σιδηρόφρακτη": "σιδερόφραχτη", - "σιδηρόφρακτο": "σιδερόφραχτο", - "σκεβρός": "σκευρός", - "σκεβρή": "σκευρή", - "σκεβρό": "σκευρό", - "σκεφτικός": "σκεπτικός", - "σκεφτική": "σκεπτική", - "σκεφτικό": "σκεπτικό", - "σκιστός": "σχιστός", - "σκιστή": "σχιστή", - "σκιστό": "σχιστό", - "σκολιανός": "σχολιανός", - "σκολιανή": "σχολιανή", - "σκολιανό": "σχολιανό", - "σκοτσέζικος": "σκοτσέζικος", - "σκοτσέζικη": "σκοτσέζικη", - "σκοτσέζικο": "σκοτσέζικο", - "σµυρνιώτικος": "σµυρναίικος", - "σµυρνιώτικη": "σµυρναίικη", - "σµυρνιώτικο": "σµυρναίικο", - "σοροπιαστός": "σιροπιαστός", - "σοροπιαστή": "σιροπιαστή", - 
"σοροπιαστό": "σιροπιαστό", - "σπερνός": "εσπερινός", - "σπερνή": "εσπερινή", - "σπερνό": "εσπερινό", - "σταρόχρωµος": "σιταρόχρωµος", - "σταρόχρωµη": "σιταρόχρωµη", - "σταρόχρωµο": "σιταρόχρωµο", - "στενάχωρος": "στενόχωρος", - "στενάχωρη": "στενόχωρη", - "στενάχωρο": "στενόχωρο", - "στιλιστικός": "στυλιστικός", - "στιλιστική": "στυλιστική", - "στιλιστικό": "στυλιστικό", - "στριµόκωλος": "στρυµόκωλος", - "στριµόκωλη": "στρυµόκωλη", - "στριµόκωλο": "στρυµόκωλο", - "στριµωχτός": "στρυµωχτός", - "στριµωχτή": "στρυµωχτή", - "στριµωχτό": "στρυµωχτό", - "στριφνός": "στρυφνός", - "στριφνή": "στρυφνή", - "στριφνό": "στρυφνό", - "σύµµεικτος": "σύµµικτος", - "σύµµεικτη": "σύµµικτη", - "σύµµεικτο": "σύµµικτο", - "σύµψυχος": "σύψυχος", - "σύµψυχη": "σύψυχη", - "σύµψυχο": "σύψυχο", - "συντεθειµένος": "συνθέτω", - "συντεθειµένοςή": "συνθέτωη", - "συντεθειµένοςό": "συνθέτωο", - "συφοριασµένος": "συμφοριασμένος", - "συφοριασµένη": "συμφοριασμένη", - "συφοριασµένο": "συμφοριασμένο", - "συχωριανός": "συγχωριανός", - "συχωριανή": "συγχωριανή", - "συχωριανό": "συγχωριανό", - "ταγκός": "ταγγός", - "ταγκή": "ταγγή", - "ταµιευτικός": "αποταµιευτικός", - "ταµιευτική": "αποταµιευτική", - "ταµιευτικό": "αποταµιευτικό", - "ταχτικός": "τακτικός", - "ταχτική": "τακτική", - "ταχτικό": "τακτικό", - "τελολογικός": "τελεολογικός", - "τελολογική": "τελεολογική", - "τελολογικό": "τελεολογικό", - "τραγικοκωµικός": "κωµικοτραγικός", - "τραγικοκωµική": "κωµικοτραγική", - "τραγικοκωµικό": "κωµικοτραγικό", - "τρελλός": "τρελός", - "τρελλή": "τρελή", - "τρελλό": "τρελό", - "τσεβδός": "τσευδός", - "τσεβδή": "τσευδή", - "τσεβδό": "τσευδό", - "τσιριχτός": "τσυριχτός", - "τσιριχτή": "τσυριχτή", - "τσιριχτό": "τσυριχτό", - "τσιτωτός": "τσητωτός", - "τσιτωτή": "τσητωτή", - "τσιτωτό": "τσητωτό", - "υποµονητικός": "υποµονετικός", - "υποµονητική": "υποµονετική", - "υποµονητικό": "υποµονετικό", - "φαµφαρονικός": "φανφαρονίστικος", - "φαµφαρονική": "φανφαρονίστικη", - "φαµφαρονικό": "φανφαρονίστικο", - 
"φαµφαρονίστικος": "φανφαρονίστικος", - "φαµφαρονίστικη": "φανφαρονίστικη", - "φαµφαρονίστικο": "φανφαρονίστικο", - "φαντός": "υφαντός", - "φαντή": "υφαντή", - "φαντό": "υφαντό", - "φανφαρονικός": "φανφαρονιστικός", - "φανφαρονική": "φανφαρονιστική", - "φανφαρονικό": "φανφαρονιστικό", - "φαρακλός": "φαλακρός", - "φαρακλή": "φαλακρή", - "φαρακλό": "φαλακρό", - "φεγγαροφώτιστος": "φεγγαρόφωτος", - "φεγγαροφώτιστη": "φεγγαρόφωτη", - "φεγγαροφώτιστο": "φεγγαρόφωτο", - "φεουδαλικός": "φεουδαρχικός", - "φεουδαλική": "φεουδαρχική", - "φεουδαλικό": "φεουδαρχικό", - "φλοκάτος": "φλοκωτός", - "φλοκάτη": "φλοκωτή", - "φλοκάτο": "φλοκωτό", - "φριχτός": "φρικτός", - "φριχτή": "φρικτή", - "φριχτό": "φρικτό", - "φροϋδικός": "φροϊδικός", - "φροϋδική": "φροϊδική", - "φροϋδικό": "φροϊδικό", - "φτειαστός": "φτειαχτός", - "φτειαστή": "φτειαχτή", - "φτειαστό": "φτειαχτό", - "φτηνός": "φθηνός", - "φτηνή": "φθηνή", - "φτηνό": "φθηνό", - "φυσιοθεραπευτικός": "φυσιοθεραπευτικός", - "φυσιοθεραπευτική": "φυσιοθεραπευτική", - "φυσιοθεραπευτικό": "φυσιοθεραπευτικό", - "φωβιστικός": "φοβιστικός", - "φωβιστική": "φοβιστική", - "φωβιστικό": "φοβιστικό", - "χαδεµένος": "χαϊδεµένος", - "χαδεµένη": "χαϊδεµένη", - "χαδεµένο": "χαϊδεµένο", - "χειλόφωνος": "χειλεόφωνος", - "χειλόφωνη": "χειλεόφωνη", - "χειλόφωνο": "χειλεόφωνο", - "χειροδύναµος": "χεροδύναµος", - "χειροδύναµη": "χεροδύναµη", - "χειροδύναµο": "χεροδύναµο", - "χηράµενος": "χηρευάµενος", - "χηράµενη": "χηρευάµενη", - "χηράµενο": "χηρευάµενο", - "χλωµός": "χλοµός", - "χλωµή": "χλοµή", - "χλωµό": "χλοµό", - "χνουδάτος": "χνουδωτός", - "χνουδάτη": "χνουδωτή", - "χνουδάτο": "χνουδωτό", - "χονδρός": "χοντρός", - "χονδρή": "χοντρή", - "χονδρό": "χοντρό", - "χουβαρντάδικος": "χουβαρντάς", - "χουβαρντάδικοςή": "χουβαρντάςη", - "χουβαρντάδικοςό": "χουβαρντάςο", - "χρεολυτικός": "χρεωλυτικός", - "χρεολυτική": "χρεωλυτική", - "χρεολυτικό": "χρεωλυτικό", - "χρησµοδοτικός": "χρησµοδοσία", - "χρησµοδοτική": "χρησµοδοσίαη", - "χρησµοδοτικό": 
"χρησµοδοσίαο", - "χρυσόπλεχτος": "χρυσόπλεκτος", - "χρυσόπλεχτη": "χρυσόπλεκτη", - "χρυσόπλεχτο": "χρυσόπλεκτο", - "χτεσινός": "χθεσινός", - "χτεσινή": "χθεσινή", - "χτεσινό": "χθεσινό", - "χτιστός": "κτιστός", - "χτιστή": "κτιστή", - "χτιστό": "κτιστό", - "αντρείος": "ανδρείος", - "αντρεία": "ανδρεία", - "αντρείο": "ανδρείο", - "αποποµπαίος": "αποδιοποµπαίος", - "αποποµπαία": "αποδιοποµπαία", - "αποποµπαίο": "αποδιοποµπαίο", - "γεραλεος": "γηραλέος", - "γεραλεα": "γηραλέα", - "γεραλεο": "γηραλέο", - "εντόπιος": "ντόπιος", - "εντόπια": "ντόπια", - "εντόπιο": "ντόπιο", - "εφταπλάσιος": "επταπλάσιος", - "εφταπλάσια": "επταπλάσια", - "εφταπλάσιο": "επταπλάσιο", - "ζούφιος": "τζούφιος", - "ζούφια": "τζούφια", - "ζούφιο": "τζούφιο", - "καθάριος": "καθάρειος", - "καθάρια": "καθάρεια", - "καθάριο": "καθάρειο", - "λαφήσιος": "ελαφήσιος", - "λαφήσια": "ελαφήσια", - "λαφήσιο": "ελαφήσιο", - "οκταθέσιος": "οχταθέσιος", - "οκταθέσια": "οχταθέσια", - "οκταθέσιο": "οχταθέσιο", - "ονυχαίος": "ονυχιαίος", - "ονυχαία": "ονυχιαία", - "ονυχαίο": "ονυχιαίο", - "οχταπλάσιος": "οκταπλάσιος", - "οχταπλάσια": "οκταπλάσια", - "οχταπλάσιο": "οκταπλάσιο", - "βοϊδήσιος": "βοδινός", - "βοϊδήσια": "βοδινή", - "βοϊδήσιο": "βοδινό", - "καλαµποκίσιος": "καλαµποκήσιος", - "καλαµποκίσια": "καλαµποκήσια", - "καλαµποκίσιο": "καλαµποκήσιο", - "κεφαλίσιος": "κεφαλήσιος", - "κεφαλίσια": "κεφαλήσια", - "κεφαλίσιο": "κεφαλήσιο", - "κρουσταλλένιος": "κρυσταλλένιος", - "κρουσταλλένια": "κρυσταλλένια", - "κρουσταλλένιο": "κρυσταλλένιο", - "µοσκαρήσιος": "µοσχαρήσιος", - "µοσκαρήσια": "µοσχαρήσια", - "µοσκαρήσιο": "µοσχαρήσιο", - "παλικαρήσιος": "παλληκαρήσιος", - "παλικαρήσια": "παλληκαρήσια", - "παλικαρήσιο": "παλληκαρήσιο", - "πετρένιος": "πέτρινος", - "πετρένια": "πέτρινη", - "πετρένιο": "πέτρινο", - "σιταρένιος": "σταρένιος", - "σιταρένια": "σταρένια", - "σιταρένιο": "σταρένιο", - "σκυλίσιος": "σκυλήσιος", - "σκυλίσια": "σκυλήσια", - "σκυλίσιο": "σκυλήσιο", - "χελίσιος": "χελήσιος", - "χελίσια": 
"χελήσια", - "χελίσιο": "χελήσιο", - "χελωνίσιος": "χελωνήσιος", - "χελωνίσια": "χελωνήσια", - "χελωνίσιο": "χελωνήσιο", - "γουρσούζης": "γρουσούζης", - "γουρσούζα": "γρουσούζα", - "γουρσούζικο": "γρουσούζικο", - "γρινιάρης": "γκρινιάρης", - "γρινιάρα": "γκρινιάρα", - "γρινιάρικο": "γκρινιάρικο", - "λιχούδης": "λειχούδης", - "λιχούδα": "λειχούδα", - "λιχούδικο": "λειχούδικο", - "µαργιόλής": "µαριόλης", - "µαργιόλήςα": "µαριόλα", - "µαργιόλήςικο": "µαριόλικο", - "ξεκουτιάρης": "ξεκούτης", - "ξεκουτιάρα": "ξεκούτα", - "ξεκουτιάρικο": "ξεκούτικο", - "σκανδαλιάρης": "σκανταλιάρης", - "σκανδαλιάρα": "σκανταλιάρα", - "σκανδαλιάρικο": "σκανταλιάρικο", - "τσιγκούνης": "τσιγγούνης", - "τσιγκούνα": "τσιγγούνα", - "τσιγκούνικο": "τσιγγούνικο", -} - -NORM_EXCEPTIONS = {} - -for string, norm in _exc.items(): - NORM_EXCEPTIONS[string] = norm diff --git a/spacy/lang/en/__init__.py b/spacy/lang/en/__init__.py index fca4e01e7..4304b3c6a 100644 --- a/spacy/lang/en/__init__.py +++ b/spacy/lang/en/__init__.py @@ -2,7 +2,6 @@ from __future__ import unicode_literals from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS -from .norm_exceptions import NORM_EXCEPTIONS from .tag_map import TAG_MAP from .stop_words import STOP_WORDS from .lex_attrs import LEX_ATTRS @@ -10,10 +9,9 @@ from .morph_rules import MORPH_RULES from .syntax_iterators import SYNTAX_ITERATORS from ..tokenizer_exceptions import BASE_EXCEPTIONS -from ..norm_exceptions import BASE_NORMS from ...language import Language -from ...attrs import LANG, NORM -from ...util import update_exc, add_lookups +from ...attrs import LANG +from ...util import update_exc def _return_en(_): @@ -24,9 +22,6 @@ class EnglishDefaults(Language.Defaults): lex_attr_getters = dict(Language.Defaults.lex_attr_getters) lex_attr_getters.update(LEX_ATTRS) lex_attr_getters[LANG] = _return_en - lex_attr_getters[NORM] = add_lookups( - Language.Defaults.lex_attr_getters[NORM], BASE_NORMS, NORM_EXCEPTIONS - ) tokenizer_exceptions = 
update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS) tag_map = TAG_MAP stop_words = STOP_WORDS diff --git a/spacy/lang/en/norm_exceptions.py b/spacy/lang/en/norm_exceptions.py deleted file mode 100644 index a2cf58b8a..000000000 --- a/spacy/lang/en/norm_exceptions.py +++ /dev/null @@ -1,1768 +0,0 @@ -# coding: utf8 -from __future__ import unicode_literals - - -_exc = { - # Slang and abbreviations - "cos": "because", - "cuz": "because", - "fav": "favorite", - "fave": "favorite", - "misc": "miscellaneous", - "plz": "please", - "pls": "please", - "thx": "thanks", - # US vs. UK spelling - "accessorise": "accessorize", - "accessorised": "accessorized", - "accessorises": "accessorizes", - "accessorising": "accessorizing", - "acclimatisation": "acclimatization", - "acclimatise": "acclimatize", - "acclimatised": "acclimatized", - "acclimatises": "acclimatizes", - "acclimatising": "acclimatizing", - "accoutrements": "accouterments", - "aeon": "eon", - "aeons": "eons", - "aerogramme": "aerogram", - "aerogrammes": "aerograms", - "aeroplane": "airplane", - "aeroplanes ": "airplanes ", - "aesthete": "esthete", - "aesthetes": "esthetes", - "aesthetic": "esthetic", - "aesthetically": "esthetically", - "aesthetics": "esthetics", - "aetiology": "etiology", - "ageing": "aging", - "aggrandisement": "aggrandizement", - "agonise": "agonize", - "agonised": "agonized", - "agonises": "agonizes", - "agonising": "agonizing", - "agonisingly": "agonizingly", - "almanack": "almanac", - "almanacks": "almanacs", - "aluminium": "aluminum", - "amortisable": "amortizable", - "amortisation": "amortization", - "amortisations": "amortizations", - "amortise": "amortize", - "amortised": "amortized", - "amortises": "amortizes", - "amortising": "amortizing", - "amphitheatre": "amphitheater", - "amphitheatres": "amphitheaters", - "anaemia": "anemia", - "anaemic": "anemic", - "anaesthesia": "anesthesia", - "anaesthetic": "anesthetic", - "anaesthetics": "anesthetics", - "anaesthetise": "anesthetize", - 
"anaesthetised": "anesthetized", - "anaesthetises": "anesthetizes", - "anaesthetising": "anesthetizing", - "anaesthetist": "anesthetist", - "anaesthetists": "anesthetists", - "anaesthetize": "anesthetize", - "anaesthetized": "anesthetized", - "anaesthetizes": "anesthetizes", - "anaesthetizing": "anesthetizing", - "analogue": "analog", - "analogues": "analogs", - "analyse": "analyze", - "analysed": "analyzed", - "analyses": "analyzes", - "analysing": "analyzing", - "anglicise": "anglicize", - "anglicised": "anglicized", - "anglicises": "anglicizes", - "anglicising": "anglicizing", - "annualised": "annualized", - "antagonise": "antagonize", - "antagonised": "antagonized", - "antagonises": "antagonizes", - "antagonising": "antagonizing", - "apologise": "apologize", - "apologised": "apologized", - "apologises": "apologizes", - "apologising": "apologizing", - "appal": "appall", - "appals": "appalls", - "appetiser": "appetizer", - "appetisers": "appetizers", - "appetising": "appetizing", - "appetisingly": "appetizingly", - "arbour": "arbor", - "arbours": "arbors", - "archaeological": "archeological", - "archaeologically": "archeologically", - "archaeologist": "archeologist", - "archaeologists": "archeologists", - "archaeology": "archeology", - "ardour": "ardor", - "armour": "armor", - "armoured": "armored", - "armourer": "armorer", - "armourers": "armorers", - "armouries": "armories", - "armoury": "armory", - "artefact": "artifact", - "artefacts": "artifacts", - "authorise": "authorize", - "authorised": "authorized", - "authorises": "authorizes", - "authorising": "authorizing", - "axe": "ax", - "backpedalled": "backpedaled", - "backpedalling": "backpedaling", - "bannister": "banister", - "bannisters": "banisters", - "baptise": "baptize", - "baptised": "baptized", - "baptises": "baptizes", - "baptising": "baptizing", - "bastardise": "bastardize", - "bastardised": "bastardized", - "bastardises": "bastardizes", - "bastardising": "bastardizing", - "battleaxe": "battleax", - 
"baulk": "balk", - "baulked": "balked", - "baulking": "balking", - "baulks": "balks", - "bedevilled": "bedeviled", - "bedevilling": "bedeviling", - "behaviour": "behavior", - "behavioural": "behavioral", - "behaviourism": "behaviorism", - "behaviourist": "behaviorist", - "behaviourists": "behaviorists", - "behaviours": "behaviors", - "behove": "behoove", - "behoved": "behooved", - "behoves": "behooves", - "bejewelled": "bejeweled", - "belabour": "belabor", - "belaboured": "belabored", - "belabouring": "belaboring", - "belabours": "belabors", - "bevelled": "beveled", - "bevvies": "bevies", - "bevvy": "bevy", - "biassed": "biased", - "biassing": "biasing", - "bingeing": "binging", - "bougainvillaea": "bougainvillea", - "bougainvillaeas": "bougainvilleas", - "bowdlerise": "bowdlerize", - "bowdlerised": "bowdlerized", - "bowdlerises": "bowdlerizes", - "bowdlerising": "bowdlerizing", - "breathalyse": "breathalyze", - "breathalysed": "breathalyzed", - "breathalyser": "breathalyzer", - "breathalysers": "breathalyzers", - "breathalyses": "breathalyzes", - "breathalysing": "breathalyzing", - "brutalise": "brutalize", - "brutalised": "brutalized", - "brutalises": "brutalizes", - "brutalising": "brutalizing", - "buses": "busses", - "busing": "bussing", - "caesarean": "cesarean", - "caesareans": "cesareans", - "calibre": "caliber", - "calibres": "calibers", - "calliper": "caliper", - "callipers": "calipers", - "callisthenics": "calisthenics", - "canalise": "canalize", - "canalised": "canalized", - "canalises": "canalizes", - "canalising": "canalizing", - "cancellation": "cancelation", - "cancellations": "cancelations", - "cancelled": "canceled", - "cancelling": "canceling", - "candour": "candor", - "cannibalise": "cannibalize", - "cannibalised": "cannibalized", - "cannibalises": "cannibalizes", - "cannibalising": "cannibalizing", - "canonise": "canonize", - "canonised": "canonized", - "canonises": "canonizes", - "canonising": "canonizing", - "capitalise": "capitalize", - 
"capitalised": "capitalized", - "capitalises": "capitalizes", - "capitalising": "capitalizing", - "caramelise": "caramelize", - "caramelised": "caramelized", - "caramelises": "caramelizes", - "caramelising": "caramelizing", - "carbonise": "carbonize", - "carbonised": "carbonized", - "carbonises": "carbonizes", - "carbonising": "carbonizing", - "carolled": "caroled", - "carolling": "caroling", - "catalogue": "catalog", - "catalogued": "cataloged", - "catalogues": "catalogs", - "cataloguing": "cataloging", - "catalyse": "catalyze", - "catalysed": "catalyzed", - "catalyses": "catalyzes", - "catalysing": "catalyzing", - "categorise": "categorize", - "categorised": "categorized", - "categorises": "categorizes", - "categorising": "categorizing", - "cauterise": "cauterize", - "cauterised": "cauterized", - "cauterises": "cauterizes", - "cauterising": "cauterizing", - "cavilled": "caviled", - "cavilling": "caviling", - "centigramme": "centigram", - "centigrammes": "centigrams", - "centilitre": "centiliter", - "centilitres": "centiliters", - "centimetre": "centimeter", - "centimetres": "centimeters", - "centralise": "centralize", - "centralised": "centralized", - "centralises": "centralizes", - "centralising": "centralizing", - "centre": "center", - "centred": "centered", - "centrefold": "centerfold", - "centrefolds": "centerfolds", - "centrepiece": "centerpiece", - "centrepieces": "centerpieces", - "centres": "centers", - "channelled": "channeled", - "channelling": "channeling", - "characterise": "characterize", - "characterised": "characterized", - "characterises": "characterizes", - "characterising": "characterizing", - "cheque": "check", - "chequebook": "checkbook", - "chequebooks": "checkbooks", - "chequered": "checkered", - "cheques": "checks", - "chilli": "chili", - "chimaera": "chimera", - "chimaeras": "chimeras", - "chiselled": "chiseled", - "chiselling": "chiseling", - "circularise": "circularize", - "circularised": "circularized", - "circularises": "circularizes", 
- "circularising": "circularizing", - "civilise": "civilize", - "civilised": "civilized", - "civilises": "civilizes", - "civilising": "civilizing", - "clamour": "clamor", - "clamoured": "clamored", - "clamouring": "clamoring", - "clamours": "clamors", - "clangour": "clangor", - "clarinettist": "clarinetist", - "clarinettists": "clarinetists", - "collectivise": "collectivize", - "collectivised": "collectivized", - "collectivises": "collectivizes", - "collectivising": "collectivizing", - "colonisation": "colonization", - "colonise": "colonize", - "colonised": "colonized", - "coloniser": "colonizer", - "colonisers": "colonizers", - "colonises": "colonizes", - "colonising": "colonizing", - "colour": "color", - "colourant": "colorant", - "colourants": "colorants", - "coloured": "colored", - "coloureds": "coloreds", - "colourful": "colorful", - "colourfully": "colorfully", - "colouring": "coloring", - "colourize": "colorize", - "colourized": "colorized", - "colourizes": "colorizes", - "colourizing": "colorizing", - "colourless": "colorless", - "colours": "colors", - "commercialise": "commercialize", - "commercialised": "commercialized", - "commercialises": "commercializes", - "commercialising": "commercializing", - "compartmentalise": "compartmentalize", - "compartmentalised": "compartmentalized", - "compartmentalises": "compartmentalizes", - "compartmentalising": "compartmentalizing", - "computerise": "computerize", - "computerised": "computerized", - "computerises": "computerizes", - "computerising": "computerizing", - "conceptualise": "conceptualize", - "conceptualised": "conceptualized", - "conceptualises": "conceptualizes", - "conceptualising": "conceptualizing", - "connexion": "connection", - "connexions": "connections", - "contextualise": "contextualize", - "contextualised": "contextualized", - "contextualises": "contextualizes", - "contextualising": "contextualizing", - "cosier": "cozier", - "cosies": "cozies", - "cosiest": "coziest", - "cosily": "cozily", - 
"cosiness": "coziness", - "cosy": "cozy", - "councillor": "councilor", - "councillors": "councilors", - "counselled": "counseled", - "counselling": "counseling", - "counsellor": "counselor", - "counsellors": "counselors", - "crenellated": "crenelated", - "criminalise": "criminalize", - "criminalised": "criminalized", - "criminalises": "criminalizes", - "criminalising": "criminalizing", - "criticise": "criticize", - "criticised": "criticized", - "criticises": "criticizes", - "criticising": "criticizing", - "crueller": "crueler", - "cruellest": "cruelest", - "crystallisation": "crystallization", - "crystallise": "crystallize", - "crystallised": "crystallized", - "crystallises": "crystallizes", - "crystallising": "crystallizing", - "cudgelled": "cudgeled", - "cudgelling": "cudgeling", - "customise": "customize", - "customised": "customized", - "customises": "customizes", - "customising": "customizing", - "cypher": "cipher", - "cyphers": "ciphers", - "decentralisation": "decentralization", - "decentralise": "decentralize", - "decentralised": "decentralized", - "decentralises": "decentralizes", - "decentralising": "decentralizing", - "decriminalisation": "decriminalization", - "decriminalise": "decriminalize", - "decriminalised": "decriminalized", - "decriminalises": "decriminalizes", - "decriminalising": "decriminalizing", - "defence": "defense", - "defenceless": "defenseless", - "defences": "defenses", - "dehumanisation": "dehumanization", - "dehumanise": "dehumanize", - "dehumanised": "dehumanized", - "dehumanises": "dehumanizes", - "dehumanising": "dehumanizing", - "demeanour": "demeanor", - "demilitarisation": "demilitarization", - "demilitarise": "demilitarize", - "demilitarised": "demilitarized", - "demilitarises": "demilitarizes", - "demilitarising": "demilitarizing", - "demobilisation": "demobilization", - "demobilise": "demobilize", - "demobilised": "demobilized", - "demobilises": "demobilizes", - "demobilising": "demobilizing", - "democratisation": 
"democratization", - "democratise": "democratize", - "democratised": "democratized", - "democratises": "democratizes", - "democratising": "democratizing", - "demonise": "demonize", - "demonised": "demonized", - "demonises": "demonizes", - "demonising": "demonizing", - "demoralisation": "demoralization", - "demoralise": "demoralize", - "demoralised": "demoralized", - "demoralises": "demoralizes", - "demoralising": "demoralizing", - "denationalisation": "denationalization", - "denationalise": "denationalize", - "denationalised": "denationalized", - "denationalises": "denationalizes", - "denationalising": "denationalizing", - "deodorise": "deodorize", - "deodorised": "deodorized", - "deodorises": "deodorizes", - "deodorising": "deodorizing", - "depersonalise": "depersonalize", - "depersonalised": "depersonalized", - "depersonalises": "depersonalizes", - "depersonalising": "depersonalizing", - "deputise": "deputize", - "deputised": "deputized", - "deputises": "deputizes", - "deputising": "deputizing", - "desensitisation": "desensitization", - "desensitise": "desensitize", - "desensitised": "desensitized", - "desensitises": "desensitizes", - "desensitising": "desensitizing", - "destabilisation": "destabilization", - "destabilise": "destabilize", - "destabilised": "destabilized", - "destabilises": "destabilizes", - "destabilising": "destabilizing", - "dialled": "dialed", - "dialling": "dialing", - "dialogue": "dialog", - "dialogues": "dialogs", - "diarrhoea": "diarrhea", - "digitise": "digitize", - "digitised": "digitized", - "digitises": "digitizes", - "digitising": "digitizing", - "disc": "disk", - "discolour": "discolor", - "discoloured": "discolored", - "discolouring": "discoloring", - "discolours": "discolors", - "discs": "disks", - "disembowelled": "disemboweled", - "disembowelling": "disemboweling", - "disfavour": "disfavor", - "dishevelled": "disheveled", - "dishonour": "dishonor", - "dishonourable": "dishonorable", - "dishonourably": "dishonorably", - 
"dishonoured": "dishonored", - "dishonouring": "dishonoring", - "dishonours": "dishonors", - "disorganisation": "disorganization", - "disorganised": "disorganized", - "distil": "distill", - "distils": "distills", - "doin": "doing", - "doin'": "doing", - "dramatisation": "dramatization", - "dramatisations": "dramatizations", - "dramatise": "dramatize", - "dramatised": "dramatized", - "dramatises": "dramatizes", - "dramatising": "dramatizing", - "draught": "draft", - "draughtboard": "draftboard", - "draughtboards": "draftboards", - "draughtier": "draftier", - "draughtiest": "draftiest", - "draughts": "drafts", - "draughtsman": "draftsman", - "draughtsmanship": "draftsmanship", - "draughtsmen": "draftsmen", - "draughtswoman": "draftswoman", - "draughtswomen": "draftswomen", - "draughty": "drafty", - "drivelled": "driveled", - "drivelling": "driveling", - "duelled": "dueled", - "duelling": "dueling", - "economise": "economize", - "economised": "economized", - "economises": "economizes", - "economising": "economizing", - "edoema": "edema ", - "editorialise": "editorialize", - "editorialised": "editorialized", - "editorialises": "editorializes", - "editorialising": "editorializing", - "empathise": "empathize", - "empathised": "empathized", - "empathises": "empathizes", - "empathising": "empathizing", - "emphasise": "emphasize", - "emphasised": "emphasized", - "emphasises": "emphasizes", - "emphasising": "emphasizing", - "enamelled": "enameled", - "enamelling": "enameling", - "enamoured": "enamored", - "encyclopaedia": "encyclopedia", - "encyclopaedias": "encyclopedias", - "encyclopaedic": "encyclopedic", - "endeavour": "endeavor", - "endeavoured": "endeavored", - "endeavouring": "endeavoring", - "endeavours": "endeavors", - "energise": "energize", - "energised": "energized", - "energises": "energizes", - "energising": "energizing", - "enrol": "enroll", - "enrols": "enrolls", - "enthral": "enthrall", - "enthrals": "enthralls", - "epaulette": "epaulet", - "epaulettes": 
"epaulets", - "epicentre": "epicenter", - "epicentres": "epicenters", - "epilogue": "epilog", - "epilogues": "epilogs", - "epitomise": "epitomize", - "epitomised": "epitomized", - "epitomises": "epitomizes", - "epitomising": "epitomizing", - "equalisation": "equalization", - "equalise": "equalize", - "equalised": "equalized", - "equaliser": "equalizer", - "equalisers": "equalizers", - "equalises": "equalizes", - "equalising": "equalizing", - "eulogise": "eulogize", - "eulogised": "eulogized", - "eulogises": "eulogizes", - "eulogising": "eulogizing", - "evangelise": "evangelize", - "evangelised": "evangelized", - "evangelises": "evangelizes", - "evangelising": "evangelizing", - "exorcise": "exorcize", - "exorcised": "exorcized", - "exorcises": "exorcizes", - "exorcising": "exorcizing", - "extemporisation": "extemporization", - "extemporise": "extemporize", - "extemporised": "extemporized", - "extemporises": "extemporizes", - "extemporising": "extemporizing", - "externalisation": "externalization", - "externalisations": "externalizations", - "externalise": "externalize", - "externalised": "externalized", - "externalises": "externalizes", - "externalising": "externalizing", - "factorise": "factorize", - "factorised": "factorized", - "factorises": "factorizes", - "factorising": "factorizing", - "faecal": "fecal", - "faeces": "feces", - "familiarisation": "familiarization", - "familiarise": "familiarize", - "familiarised": "familiarized", - "familiarises": "familiarizes", - "familiarising": "familiarizing", - "fantasise": "fantasize", - "fantasised": "fantasized", - "fantasises": "fantasizes", - "fantasising": "fantasizing", - "favour": "favor", - "favourable": "favorable", - "favourably": "favorably", - "favoured": "favored", - "favouring": "favoring", - "favourite": "favorite", - "favourites": "favorites", - "favouritism": "favoritism", - "favours": "favors", - "feminise": "feminize", - "feminised": "feminized", - "feminises": "feminizes", - "feminising": 
"feminizing", - "fertilisation": "fertilization", - "fertilise": "fertilize", - "fertilised": "fertilized", - "fertiliser": "fertilizer", - "fertilisers": "fertilizers", - "fertilises": "fertilizes", - "fertilising": "fertilizing", - "fervour": "fervor", - "fibre": "fiber", - "fibreglass": "fiberglass", - "fibres": "fibers", - "fictionalisation": "fictionalization", - "fictionalisations": "fictionalizations", - "fictionalise": "fictionalize", - "fictionalised": "fictionalized", - "fictionalises": "fictionalizes", - "fictionalising": "fictionalizing", - "fillet": "filet", - "filleted ": "fileted ", - "filleting": "fileting", - "fillets ": "filets ", - "finalisation": "finalization", - "finalise": "finalize", - "finalised": "finalized", - "finalises": "finalizes", - "finalising": "finalizing", - "flautist": "flutist", - "flautists": "flutists", - "flavour": "flavor", - "flavoured": "flavored", - "flavouring": "flavoring", - "flavourings": "flavorings", - "flavourless": "flavorless", - "flavours": "flavors", - "flavoursome": "flavorsome", - "flyer / flier ": "flier / flyer ", - "foetal": "fetal", - "foetid": "fetid", - "foetus": "fetus", - "foetuses": "fetuses", - "formalisation": "formalization", - "formalise": "formalize", - "formalised": "formalized", - "formalises": "formalizes", - "formalising": "formalizing", - "fossilisation": "fossilization", - "fossilise": "fossilize", - "fossilised": "fossilized", - "fossilises": "fossilizes", - "fossilising": "fossilizing", - "fraternisation": "fraternization", - "fraternise": "fraternize", - "fraternised": "fraternized", - "fraternises": "fraternizes", - "fraternising": "fraternizing", - "fulfil": "fulfill", - "fulfilment": "fulfillment", - "fulfils": "fulfills", - "funnelled": "funneled", - "funnelling": "funneling", - "galvanise": "galvanize", - "galvanised": "galvanized", - "galvanises": "galvanizes", - "galvanising": "galvanizing", - "gambolled": "gamboled", - "gambolling": "gamboling", - "gaol": "jail", - "gaolbird": 
"jailbird", - "gaolbirds": "jailbirds", - "gaolbreak": "jailbreak", - "gaolbreaks": "jailbreaks", - "gaoled": "jailed", - "gaoler": "jailer", - "gaolers": "jailers", - "gaoling": "jailing", - "gaols": "jails", - "gases": "gasses", - "gauge": "gage", - "gauged": "gaged", - "gauges": "gages", - "gauging": "gaging", - "generalisation": "generalization", - "generalisations": "generalizations", - "generalise": "generalize", - "generalised": "generalized", - "generalises": "generalizes", - "generalising": "generalizing", - "ghettoise": "ghettoize", - "ghettoised": "ghettoized", - "ghettoises": "ghettoizes", - "ghettoising": "ghettoizing", - "gipsies": "gypsies", - "glamorise": "glamorize", - "glamorised": "glamorized", - "glamorises": "glamorizes", - "glamorising": "glamorizing", - "glamour": "glamor", - "globalisation": "globalization", - "globalise": "globalize", - "globalised": "globalized", - "globalises": "globalizes", - "globalising": "globalizing", - "glueing ": "gluing ", - "goin": "going", - "goin'": "going", - "goitre": "goiter", - "goitres": "goiters", - "gonorrhoea": "gonorrhea", - "gramme": "gram", - "grammes": "grams", - "gravelled": "graveled", - "grey": "gray", - "greyed": "grayed", - "greying": "graying", - "greyish": "grayish", - "greyness": "grayness", - "greys": "grays", - "grovelled": "groveled", - "grovelling": "groveling", - "groyne": "groin", - "groynes ": "groins", - "gruelling": "grueling", - "gruellingly": "gruelingly", - "gryphon": "griffin", - "gryphons": "griffins", - "gynaecological": "gynecological", - "gynaecologist": "gynecologist", - "gynaecologists": "gynecologists", - "gynaecology": "gynecology", - "haematological": "hematological", - "haematologist": "hematologist", - "haematologists": "hematologists", - "haematology": "hematology", - "haemoglobin": "hemoglobin", - "haemophilia": "hemophilia", - "haemophiliac": "hemophiliac", - "haemophiliacs": "hemophiliacs", - "haemorrhage": "hemorrhage", - "haemorrhaged": "hemorrhaged", - 
"haemorrhages": "hemorrhages", - "haemorrhaging": "hemorrhaging", - "haemorrhoids": "hemorrhoids", - "harbour": "harbor", - "harboured": "harbored", - "harbouring": "harboring", - "harbours": "harbors", - "harmonisation": "harmonization", - "harmonise": "harmonize", - "harmonised": "harmonized", - "harmonises": "harmonizes", - "harmonising": "harmonizing", - "havin": "having", - "havin'": "having", - "homoeopath": "homeopath", - "homoeopathic": "homeopathic", - "homoeopaths": "homeopaths", - "homoeopathy": "homeopathy", - "homogenise": "homogenize", - "homogenised": "homogenized", - "homogenises": "homogenizes", - "homogenising": "homogenizing", - "honour": "honor", - "honourable": "honorable", - "honourably": "honorably", - "honoured": "honored", - "honouring": "honoring", - "honours": "honors", - "hospitalisation": "hospitalization", - "hospitalise": "hospitalize", - "hospitalised": "hospitalized", - "hospitalises": "hospitalizes", - "hospitalising": "hospitalizing", - "humanise": "humanize", - "humanised": "humanized", - "humanises": "humanizes", - "humanising": "humanizing", - "humour": "humor", - "humoured": "humored", - "humouring": "humoring", - "humourless": "humorless", - "humours": "humors", - "hybridise": "hybridize", - "hybridised": "hybridized", - "hybridises": "hybridizes", - "hybridising": "hybridizing", - "hypnotise": "hypnotize", - "hypnotised": "hypnotized", - "hypnotises": "hypnotizes", - "hypnotising": "hypnotizing", - "hypothesise": "hypothesize", - "hypothesised": "hypothesized", - "hypothesises": "hypothesizes", - "hypothesising": "hypothesizing", - "idealisation": "idealization", - "idealise": "idealize", - "idealised": "idealized", - "idealises": "idealizes", - "idealising": "idealizing", - "idolise": "idolize", - "idolised": "idolized", - "idolises": "idolizes", - "idolising": "idolizing", - "immobilisation": "immobilization", - "immobilise": "immobilize", - "immobilised": "immobilized", - "immobiliser": "immobilizer", - "immobilisers": 
"immobilizers", - "immobilises": "immobilizes", - "immobilising": "immobilizing", - "immortalise": "immortalize", - "immortalised": "immortalized", - "immortalises": "immortalizes", - "immortalising": "immortalizing", - "immunisation": "immunization", - "immunise": "immunize", - "immunised": "immunized", - "immunises": "immunizes", - "immunising": "immunizing", - "impanelled": "impaneled", - "impanelling": "impaneling", - "imperilled": "imperiled", - "imperilling": "imperiling", - "individualise": "individualize", - "individualised": "individualized", - "individualises": "individualizes", - "individualising": "individualizing", - "industrialise": "industrialize", - "industrialised": "industrialized", - "industrialises": "industrializes", - "industrialising": "industrializing", - "inflexion": "inflection", - "inflexions": "inflections", - "initialise": "initialize", - "initialised": "initialized", - "initialises": "initializes", - "initialising": "initializing", - "initialled": "initialed", - "initialling": "initialing", - "instal": "install", - "instalment": "installment", - "instalments": "installments", - "instals": "installs", - "instil": "instill", - "instils": "instills", - "institutionalisation": "institutionalization", - "institutionalise": "institutionalize", - "institutionalised": "institutionalized", - "institutionalises": "institutionalizes", - "institutionalising": "institutionalizing", - "intellectualise": "intellectualize", - "intellectualised": "intellectualized", - "intellectualises": "intellectualizes", - "intellectualising": "intellectualizing", - "internalisation": "internalization", - "internalise": "internalize", - "internalised": "internalized", - "internalises": "internalizes", - "internalising": "internalizing", - "internationalisation": "internationalization", - "internationalise": "internationalize", - "internationalised": "internationalized", - "internationalises": "internationalizes", - "internationalising": "internationalizing", - 
"ionisation": "ionization", - "ionise": "ionize", - "ionised": "ionized", - "ioniser": "ionizer", - "ionisers": "ionizers", - "ionises": "ionizes", - "ionising": "ionizing", - "italicise": "italicize", - "italicised": "italicized", - "italicises": "italicizes", - "italicising": "italicizing", - "itemise": "itemize", - "itemised": "itemized", - "itemises": "itemizes", - "itemising": "itemizing", - "jeopardise": "jeopardize", - "jeopardised": "jeopardized", - "jeopardises": "jeopardizes", - "jeopardising": "jeopardizing", - "jewelled": "jeweled", - "jeweller": "jeweler", - "jewellers": "jewelers", - "jewellery": "jewelry", - "judgement ": "judgment", - "kilogramme": "kilogram", - "kilogrammes": "kilograms", - "kilometre": "kilometer", - "kilometres": "kilometers", - "labelled": "labeled", - "labelling": "labeling", - "labour": "labor", - "laboured": "labored", - "labourer": "laborer", - "labourers": "laborers", - "labouring": "laboring", - "labours": "labors", - "lacklustre": "lackluster", - "legalisation": "legalization", - "legalise": "legalize", - "legalised": "legalized", - "legalises": "legalizes", - "legalising": "legalizing", - "legitimise": "legitimize", - "legitimised": "legitimized", - "legitimises": "legitimizes", - "legitimising": "legitimizing", - "leukaemia": "leukemia", - "levelled": "leveled", - "leveller": "leveler", - "levellers": "levelers", - "levelling": "leveling", - "libelled": "libeled", - "libelling": "libeling", - "libellous": "libelous", - "liberalisation": "liberalization", - "liberalise": "liberalize", - "liberalised": "liberalized", - "liberalises": "liberalizes", - "liberalising": "liberalizing", - "licence": "license", - "licenced": "licensed", - "licences": "licenses", - "licencing": "licensing", - "likeable": "likable ", - "lionisation": "lionization", - "lionise": "lionize", - "lionised": "lionized", - "lionises": "lionizes", - "lionising": "lionizing", - "liquidise": "liquidize", - "liquidised": "liquidized", - "liquidiser": 
"liquidizer", - "liquidisers": "liquidizers", - "liquidises": "liquidizes", - "liquidising": "liquidizing", - "litre": "liter", - "litres": "liters", - "localise": "localize", - "localised": "localized", - "localises": "localizes", - "localising": "localizing", - "lovin": "loving", - "lovin'": "loving", - "louvre": "louver", - "louvred": "louvered", - "louvres": "louvers ", - "lustre": "luster", - "magnetise": "magnetize", - "magnetised": "magnetized", - "magnetises": "magnetizes", - "magnetising": "magnetizing", - "manoeuvrability": "maneuverability", - "manoeuvrable": "maneuverable", - "manoeuvre": "maneuver", - "manoeuvred": "maneuvered", - "manoeuvres": "maneuvers", - "manoeuvring": "maneuvering", - "manoeuvrings": "maneuverings", - "marginalisation": "marginalization", - "marginalise": "marginalize", - "marginalised": "marginalized", - "marginalises": "marginalizes", - "marginalising": "marginalizing", - "marshalled": "marshaled", - "marshalling": "marshaling", - "marvelled": "marveled", - "marvelling": "marveling", - "marvellous": "marvelous", - "marvellously": "marvelously", - "materialisation": "materialization", - "materialise": "materialize", - "materialised": "materialized", - "materialises": "materializes", - "materialising": "materializing", - "maximisation": "maximization", - "maximise": "maximize", - "maximised": "maximized", - "maximises": "maximizes", - "maximising": "maximizing", - "meagre": "meager", - "mechanisation": "mechanization", - "mechanise": "mechanize", - "mechanised": "mechanized", - "mechanises": "mechanizes", - "mechanising": "mechanizing", - "mediaeval": "medieval", - "memorialise": "memorialize", - "memorialised": "memorialized", - "memorialises": "memorializes", - "memorialising": "memorializing", - "memorise": "memorize", - "memorised": "memorized", - "memorises": "memorizes", - "memorising": "memorizing", - "mesmerise": "mesmerize", - "mesmerised": "mesmerized", - "mesmerises": "mesmerizes", - "mesmerising": "mesmerizing", - 
"metabolise": "metabolize", - "metabolised": "metabolized", - "metabolises": "metabolizes", - "metabolising": "metabolizing", - "metre": "meter", - "metres": "meters", - "micrometre": "micrometer", - "micrometres": "micrometers", - "militarise": "militarize", - "militarised": "militarized", - "militarises": "militarizes", - "militarising": "militarizing", - "milligramme": "milligram", - "milligrammes": "milligrams", - "millilitre": "milliliter", - "millilitres": "milliliters", - "millimetre": "millimeter", - "millimetres": "millimeters", - "miniaturisation": "miniaturization", - "miniaturise": "miniaturize", - "miniaturised": "miniaturized", - "miniaturises": "miniaturizes", - "miniaturising": "miniaturizing", - "minibuses": "minibusses ", - "minimise": "minimize", - "minimised": "minimized", - "minimises": "minimizes", - "minimising": "minimizing", - "misbehaviour": "misbehavior", - "misdemeanour": "misdemeanor", - "misdemeanours": "misdemeanors", - "misspelt": "misspelled ", - "mitre": "miter", - "mitres": "miters", - "mobilisation": "mobilization", - "mobilise": "mobilize", - "mobilised": "mobilized", - "mobilises": "mobilizes", - "mobilising": "mobilizing", - "modelled": "modeled", - "modeller": "modeler", - "modellers": "modelers", - "modelling": "modeling", - "modernise": "modernize", - "modernised": "modernized", - "modernises": "modernizes", - "modernising": "modernizing", - "moisturise": "moisturize", - "moisturised": "moisturized", - "moisturiser": "moisturizer", - "moisturisers": "moisturizers", - "moisturises": "moisturizes", - "moisturising": "moisturizing", - "monologue": "monolog", - "monologues": "monologs", - "monopolisation": "monopolization", - "monopolise": "monopolize", - "monopolised": "monopolized", - "monopolises": "monopolizes", - "monopolising": "monopolizing", - "moralise": "moralize", - "moralised": "moralized", - "moralises": "moralizes", - "moralising": "moralizing", - "motorised": "motorized", - "mould": "mold", - "moulded": "molded", 
- "moulder": "molder", - "mouldered": "moldered", - "mouldering": "moldering", - "moulders": "molders", - "mouldier": "moldier", - "mouldiest": "moldiest", - "moulding": "molding", - "mouldings": "moldings", - "moulds": "molds", - "mouldy": "moldy", - "moult": "molt", - "moulted": "molted", - "moulting": "molting", - "moults": "molts", - "moustache": "mustache", - "moustached": "mustached", - "moustaches": "mustaches", - "moustachioed": "mustachioed", - "multicoloured": "multicolored", - "nationalisation": "nationalization", - "nationalisations": "nationalizations", - "nationalise": "nationalize", - "nationalised": "nationalized", - "nationalises": "nationalizes", - "nationalising": "nationalizing", - "naturalisation": "naturalization", - "naturalise": "naturalize", - "naturalised": "naturalized", - "naturalises": "naturalizes", - "naturalising": "naturalizing", - "neighbour": "neighbor", - "neighbourhood": "neighborhood", - "neighbourhoods": "neighborhoods", - "neighbouring": "neighboring", - "neighbourliness": "neighborliness", - "neighbourly": "neighborly", - "neighbours": "neighbors", - "neutralisation": "neutralization", - "neutralise": "neutralize", - "neutralised": "neutralized", - "neutralises": "neutralizes", - "neutralising": "neutralizing", - "normalisation": "normalization", - "normalise": "normalize", - "normalised": "normalized", - "normalises": "normalizes", - "normalising": "normalizing", - "odour": "odor", - "odourless": "odorless", - "odours": "odors", - "oesophagus": "esophagus", - "oesophaguses": "esophaguses", - "oestrogen": "estrogen", - "offence": "offense", - "offences": "offenses", - "omelette": "omelet", - "omelettes": "omelets", - "optimise": "optimize", - "optimised": "optimized", - "optimises": "optimizes", - "optimising": "optimizing", - "organisation": "organization", - "organisational": "organizational", - "organisations": "organizations", - "organise": "organize", - "organised": "organized", - "organiser": "organizer", - 
"organisers": "organizers", - "organises": "organizes", - "organising": "organizing", - "orthopaedic": "orthopedic", - "orthopaedics": "orthopedics", - "ostracise": "ostracize", - "ostracised": "ostracized", - "ostracises": "ostracizes", - "ostracising": "ostracizing", - "outmanoeuvre": "outmaneuver", - "outmanoeuvred": "outmaneuvered", - "outmanoeuvres": "outmaneuvers", - "outmanoeuvring": "outmaneuvering", - "overemphasise": "overemphasize", - "overemphasised": "overemphasized", - "overemphasises": "overemphasizes", - "overemphasising": "overemphasizing", - "oxidisation": "oxidization", - "oxidise": "oxidize", - "oxidised": "oxidized", - "oxidises": "oxidizes", - "oxidising": "oxidizing", - "paederast": "pederast", - "paederasts": "pederasts", - "paediatric": "pediatric", - "paediatrician": "pediatrician", - "paediatricians": "pediatricians", - "paediatrics": "pediatrics", - "paedophile": "pedophile", - "paedophiles": "pedophiles", - "paedophilia": "pedophilia", - "palaeolithic": "paleolithic", - "palaeontologist": "paleontologist", - "palaeontologists": "paleontologists", - "palaeontology": "paleontology", - "panelled": "paneled", - "panelling": "paneling", - "panellist": "panelist", - "panellists": "panelists", - "paralyse": "paralyze", - "paralysed": "paralyzed", - "paralyses": "paralyzes", - "paralysing": "paralyzing", - "parcelled": "parceled", - "parcelling": "parceling", - "parlour": "parlor", - "parlours": "parlors", - "particularise": "particularize", - "particularised": "particularized", - "particularises": "particularizes", - "particularising": "particularizing", - "passivisation": "passivization", - "passivise": "passivize", - "passivised": "passivized", - "passivises": "passivizes", - "passivising": "passivizing", - "pasteurisation": "pasteurization", - "pasteurise": "pasteurize", - "pasteurised": "pasteurized", - "pasteurises": "pasteurizes", - "pasteurising": "pasteurizing", - "patronise": "patronize", - "patronised": "patronized", - "patronises": 
"patronizes", - "patronising": "patronizing", - "patronisingly": "patronizingly", - "pedalled": "pedaled", - "pedalling": "pedaling", - "pedestrianisation": "pedestrianization", - "pedestrianise": "pedestrianize", - "pedestrianised": "pedestrianized", - "pedestrianises": "pedestrianizes", - "pedestrianising": "pedestrianizing", - "penalise": "penalize", - "penalised": "penalized", - "penalises": "penalizes", - "penalising": "penalizing", - "pencilled": "penciled", - "pencilling": "penciling", - "personalise": "personalize", - "personalised": "personalized", - "personalises": "personalizes", - "personalising": "personalizing", - "pharmacopoeia": "pharmacopeia", - "pharmacopoeias": "pharmacopeias", - "philosophise": "philosophize", - "philosophised": "philosophized", - "philosophises": "philosophizes", - "philosophising": "philosophizing", - "philtre": "filter", - "philtres": "filters", - "phoney ": "phony ", - "plagiarise": "plagiarize", - "plagiarised": "plagiarized", - "plagiarises": "plagiarizes", - "plagiarising": "plagiarizing", - "plough": "plow", - "ploughed": "plowed", - "ploughing": "plowing", - "ploughman": "plowman", - "ploughmen": "plowmen", - "ploughs": "plows", - "ploughshare": "plowshare", - "ploughshares": "plowshares", - "polarisation": "polarization", - "polarise": "polarize", - "polarised": "polarized", - "polarises": "polarizes", - "polarising": "polarizing", - "politicisation": "politicization", - "politicise": "politicize", - "politicised": "politicized", - "politicises": "politicizes", - "politicising": "politicizing", - "popularisation": "popularization", - "popularise": "popularize", - "popularised": "popularized", - "popularises": "popularizes", - "popularising": "popularizing", - "pouffe": "pouf", - "pouffes": "poufs", - "practise": "practice", - "practised": "practiced", - "practises": "practices", - "practising ": "practicing ", - "praesidium": "presidium", - "praesidiums ": "presidiums ", - "pressurisation": "pressurization", - 
"pressurise": "pressurize", - "pressurised": "pressurized", - "pressurises": "pressurizes", - "pressurising": "pressurizing", - "pretence": "pretense", - "pretences": "pretenses", - "primaeval": "primeval", - "prioritisation": "prioritization", - "prioritise": "prioritize", - "prioritised": "prioritized", - "prioritises": "prioritizes", - "prioritising": "prioritizing", - "privatisation": "privatization", - "privatisations": "privatizations", - "privatise": "privatize", - "privatised": "privatized", - "privatises": "privatizes", - "privatising": "privatizing", - "professionalisation": "professionalization", - "professionalise": "professionalize", - "professionalised": "professionalized", - "professionalises": "professionalizes", - "professionalising": "professionalizing", - "programme": "program", - "programmes": "programs", - "prologue": "prolog", - "prologues": "prologs", - "propagandise": "propagandize", - "propagandised": "propagandized", - "propagandises": "propagandizes", - "propagandising": "propagandizing", - "proselytise": "proselytize", - "proselytised": "proselytized", - "proselytiser": "proselytizer", - "proselytisers": "proselytizers", - "proselytises": "proselytizes", - "proselytising": "proselytizing", - "psychoanalyse": "psychoanalyze", - "psychoanalysed": "psychoanalyzed", - "psychoanalyses": "psychoanalyzes", - "psychoanalysing": "psychoanalyzing", - "publicise": "publicize", - "publicised": "publicized", - "publicises": "publicizes", - "publicising": "publicizing", - "pulverisation": "pulverization", - "pulverise": "pulverize", - "pulverised": "pulverized", - "pulverises": "pulverizes", - "pulverising": "pulverizing", - "pummelled": "pummel", - "pummelling": "pummeled", - "pyjama": "pajama", - "pyjamas": "pajamas", - "pzazz": "pizzazz", - "quarrelled": "quarreled", - "quarrelling": "quarreling", - "radicalise": "radicalize", - "radicalised": "radicalized", - "radicalises": "radicalizes", - "radicalising": "radicalizing", - "rancour": "rancor", - 
"randomise": "randomize", - "randomised": "randomized", - "randomises": "randomizes", - "randomising": "randomizing", - "rationalisation": "rationalization", - "rationalisations": "rationalizations", - "rationalise": "rationalize", - "rationalised": "rationalized", - "rationalises": "rationalizes", - "rationalising": "rationalizing", - "ravelled": "raveled", - "ravelling": "raveling", - "realisable": "realizable", - "realisation": "realization", - "realisations": "realizations", - "realise": "realize", - "realised": "realized", - "realises": "realizes", - "realising": "realizing", - "recognisable": "recognizable", - "recognisably": "recognizably", - "recognisance": "recognizance", - "recognise": "recognize", - "recognised": "recognized", - "recognises": "recognizes", - "recognising": "recognizing", - "reconnoitre": "reconnoiter", - "reconnoitred": "reconnoitered", - "reconnoitres": "reconnoiters", - "reconnoitring": "reconnoitering", - "refuelled": "refueled", - "refuelling": "refueling", - "regularisation": "regularization", - "regularise": "regularize", - "regularised": "regularized", - "regularises": "regularizes", - "regularising": "regularizing", - "remodelled": "remodeled", - "remodelling": "remodeling", - "remould": "remold", - "remoulded": "remolded", - "remoulding": "remolding", - "remoulds": "remolds", - "reorganisation": "reorganization", - "reorganisations": "reorganizations", - "reorganise": "reorganize", - "reorganised": "reorganized", - "reorganises": "reorganizes", - "reorganising": "reorganizing", - "revelled": "reveled", - "reveller": "reveler", - "revellers": "revelers", - "revelling": "reveling", - "revitalise": "revitalize", - "revitalised": "revitalized", - "revitalises": "revitalizes", - "revitalising": "revitalizing", - "revolutionise": "revolutionize", - "revolutionised": "revolutionized", - "revolutionises": "revolutionizes", - "revolutionising": "revolutionizing", - "rhapsodise": "rhapsodize", - "rhapsodised": "rhapsodized", - 
"rhapsodises": "rhapsodizes", - "rhapsodising": "rhapsodizing", - "rigour": "rigor", - "rigours": "rigors", - "ritualised": "ritualized", - "rivalled": "rivaled", - "rivalling": "rivaling", - "romanticise": "romanticize", - "romanticised": "romanticized", - "romanticises": "romanticizes", - "romanticising": "romanticizing", - "rumour": "rumor", - "rumoured": "rumored", - "rumours": "rumors", - "sabre": "saber", - "sabres": "sabers", - "saltpetre": "saltpeter", - "sanitise": "sanitize", - "sanitised": "sanitized", - "sanitises": "sanitizes", - "sanitising": "sanitizing", - "satirise": "satirize", - "satirised": "satirized", - "satirises": "satirizes", - "satirising": "satirizing", - "saviour": "savior", - "saviours": "saviors", - "savour": "savor", - "savoured": "savored", - "savouries": "savories", - "savouring": "savoring", - "savours": "savors", - "savoury": "savory", - "scandalise": "scandalize", - "scandalised": "scandalized", - "scandalises": "scandalizes", - "scandalising": "scandalizing", - "sceptic": "skeptic", - "sceptical": "skeptical", - "sceptically": "skeptically", - "scepticism": "skepticism", - "sceptics": "skeptics", - "sceptre": "scepter", - "sceptres": "scepters", - "scrutinise": "scrutinize", - "scrutinised": "scrutinized", - "scrutinises": "scrutinizes", - "scrutinising": "scrutinizing", - "secularisation": "secularization", - "secularise": "secularize", - "secularised": "secularized", - "secularises": "secularizes", - "secularising": "secularizing", - "sensationalise": "sensationalize", - "sensationalised": "sensationalized", - "sensationalises": "sensationalizes", - "sensationalising": "sensationalizing", - "sensitise": "sensitize", - "sensitised": "sensitized", - "sensitises": "sensitizes", - "sensitising": "sensitizing", - "sentimentalise": "sentimentalize", - "sentimentalised": "sentimentalized", - "sentimentalises": "sentimentalizes", - "sentimentalising": "sentimentalizing", - "sepulchre": "sepulcher", - "sepulchres": "sepulchers ", - 
"serialisation": "serialization", - "serialisations": "serializations", - "serialise": "serialize", - "serialised": "serialized", - "serialises": "serializes", - "serialising": "serializing", - "sermonise": "sermonize", - "sermonised": "sermonized", - "sermonises": "sermonizes", - "sermonising": "sermonizing", - "sheikh ": "sheik ", - "shovelled": "shoveled", - "shovelling": "shoveling", - "shrivelled": "shriveled", - "shrivelling": "shriveling", - "signalise": "signalize", - "signalised": "signalized", - "signalises": "signalizes", - "signalising": "signalizing", - "signalled": "signaled", - "signalling": "signaling", - "smoulder": "smolder", - "smouldered": "smoldered", - "smouldering": "smoldering", - "smoulders": "smolders", - "snivelled": "sniveled", - "snivelling": "sniveling", - "snorkelled": "snorkeled", - "snorkelling": "snorkeling", - "snowplough": "snowplow", - "snowploughs": "snowplow", - "socialisation": "socialization", - "socialise": "socialize", - "socialised": "socialized", - "socialises": "socializes", - "socialising": "socializing", - "sodomise": "sodomize", - "sodomised": "sodomized", - "sodomises": "sodomizes", - "sodomising": "sodomizing", - "solemnise": "solemnize", - "solemnised": "solemnized", - "solemnises": "solemnizes", - "solemnising": "solemnizing", - "sombre": "somber", - "specialisation": "specialization", - "specialisations": "specializations", - "specialise": "specialize", - "specialised": "specialized", - "specialises": "specializes", - "specialising": "specializing", - "spectre": "specter", - "spectres": "specters", - "spiralled": "spiraled", - "spiralling": "spiraling", - "splendour": "splendor", - "splendours": "splendors", - "squirrelled": "squirreled", - "squirrelling": "squirreling", - "stabilisation": "stabilization", - "stabilise": "stabilize", - "stabilised": "stabilized", - "stabiliser": "stabilizer", - "stabilisers": "stabilizers", - "stabilises": "stabilizes", - "stabilising": "stabilizing", - "standardisation": 
"standardization", - "standardise": "standardize", - "standardised": "standardized", - "standardises": "standardizes", - "standardising": "standardizing", - "stencilled": "stenciled", - "stencilling": "stenciling", - "sterilisation": "sterilization", - "sterilisations": "sterilizations", - "sterilise": "sterilize", - "sterilised": "sterilized", - "steriliser": "sterilizer", - "sterilisers": "sterilizers", - "sterilises": "sterilizes", - "sterilising": "sterilizing", - "stigmatisation": "stigmatization", - "stigmatise": "stigmatize", - "stigmatised": "stigmatized", - "stigmatises": "stigmatizes", - "stigmatising": "stigmatizing", - "storey": "story", - "storeys": "stories", - "subsidisation": "subsidization", - "subsidise": "subsidize", - "subsidised": "subsidized", - "subsidiser": "subsidizer", - "subsidisers": "subsidizers", - "subsidises": "subsidizes", - "subsidising": "subsidizing", - "succour": "succor", - "succoured": "succored", - "succouring": "succoring", - "succours": "succors", - "sulphate": "sulfate", - "sulphates": "sulfates", - "sulphide": "sulfide", - "sulphides": "sulfides", - "sulphur": "sulfur", - "sulphurous": "sulfurous", - "summarise": "summarize", - "summarised": "summarized", - "summarises": "summarizes", - "summarising": "summarizing", - "swivelled": "swiveled", - "swivelling": "swiveling", - "symbolise": "symbolize", - "symbolised": "symbolized", - "symbolises": "symbolizes", - "symbolising": "symbolizing", - "sympathise": "sympathize", - "sympathised": "sympathized", - "sympathiser": "sympathizer", - "sympathisers": "sympathizers", - "sympathises": "sympathizes", - "sympathising": "sympathizing", - "synchronisation": "synchronization", - "synchronise": "synchronize", - "synchronised": "synchronized", - "synchronises": "synchronizes", - "synchronising": "synchronizing", - "synthesise": "synthesize", - "synthesised": "synthesized", - "synthesiser": "synthesizer", - "synthesisers": "synthesizers", - "synthesises": "synthesizes", - 
"synthesising": "synthesizing", - "syphon": "siphon", - "syphoned": "siphoned", - "syphoning": "siphoning", - "syphons": "siphons", - "systematisation": "systematization", - "systematise": "systematize", - "systematised": "systematized", - "systematises": "systematizes", - "systematising": "systematizing", - "tantalise": "tantalize", - "tantalised": "tantalized", - "tantalises": "tantalizes", - "tantalising": "tantalizing", - "tantalisingly": "tantalizingly", - "tasselled": "tasseled", - "technicolour": "technicolor", - "temporise": "temporize", - "temporised": "temporized", - "temporises": "temporizes", - "temporising": "temporizing", - "tenderise": "tenderize", - "tenderised": "tenderized", - "tenderises": "tenderizes", - "tenderising": "tenderizing", - "terrorise": "terrorize", - "terrorised": "terrorized", - "terrorises": "terrorizes", - "terrorising": "terrorizing", - "theatre": "theater", - "theatregoer": "theatergoer", - "theatregoers": "theatergoers", - "theatres": "theaters", - "theorise": "theorize", - "theorised": "theorized", - "theorises": "theorizes", - "theorising": "theorizing", - "tonne": "ton", - "tonnes": "tons", - "towelled": "toweled", - "towelling": "toweling", - "toxaemia": "toxemia", - "tranquillise": "tranquilize", - "tranquillised": "tranquilized", - "tranquilliser": "tranquilizer", - "tranquillisers": "tranquilizers", - "tranquillises": "tranquilizes", - "tranquillising": "tranquilizing", - "tranquillity": "tranquility", - "tranquillize": "tranquilize", - "tranquillized": "tranquilized", - "tranquillizer": "tranquilizer", - "tranquillizers": "tranquilizers", - "tranquillizes": "tranquilizes", - "tranquillizing": "tranquilizing", - "tranquilly": "tranquility", - "transistorised": "transistorized", - "traumatise": "traumatize", - "traumatised": "traumatized", - "traumatises": "traumatizes", - "traumatising": "traumatizing", - "travelled": "traveled", - "traveller": "traveler", - "travellers": "travelers", - "travelling": "traveling", - 
"travelogue": "travelog", - "travelogues ": "travelogs ", - "trialled": "trialed", - "trialling": "trialing", - "tricolour": "tricolor", - "tricolours": "tricolors", - "trivialise": "trivialize", - "trivialised": "trivialized", - "trivialises": "trivializes", - "trivialising": "trivializing", - "tumour": "tumor", - "tumours": "tumors", - "tunnelled": "tunneled", - "tunnelling": "tunneling", - "tyrannise": "tyrannize", - "tyrannised": "tyrannized", - "tyrannises": "tyrannizes", - "tyrannising": "tyrannizing", - "tyre": "tire", - "tyres": "tires", - "unauthorised": "unauthorized", - "uncivilised": "uncivilized", - "underutilised": "underutilized", - "unequalled": "unequaled", - "unfavourable": "unfavorable", - "unfavourably": "unfavorably", - "unionisation": "unionization", - "unionise": "unionize", - "unionised": "unionized", - "unionises": "unionizes", - "unionising": "unionizing", - "unorganised": "unorganized", - "unravelled": "unraveled", - "unravelling": "unraveling", - "unrecognisable": "unrecognizable", - "unrecognised": "unrecognized", - "unrivalled": "unrivaled", - "unsavoury": "unsavory", - "untrammelled": "untrammeled", - "urbanisation": "urbanization", - "urbanise": "urbanize", - "urbanised": "urbanized", - "urbanises": "urbanizes", - "urbanising": "urbanizing", - "utilisable": "utilizable", - "utilisation": "utilization", - "utilise": "utilize", - "utilised": "utilized", - "utilises": "utilizes", - "utilising": "utilizing", - "valour": "valor", - "vandalise": "vandalize", - "vandalised": "vandalized", - "vandalises": "vandalizes", - "vandalising": "vandalizing", - "vaporisation": "vaporization", - "vaporise": "vaporize", - "vaporised": "vaporized", - "vaporises": "vaporizes", - "vaporising": "vaporizing", - "vapour": "vapor", - "vapours": "vapors", - "verbalise": "verbalize", - "verbalised": "verbalized", - "verbalises": "verbalizes", - "verbalising": "verbalizing", - "victimisation": "victimization", - "victimise": "victimize", - "victimised": 
"victimized", - "victimises": "victimizes", - "victimising": "victimizing", - "videodisc": "videodisk", - "videodiscs": "videodisks", - "vigour": "vigor", - "visualisation": "visualization", - "visualisations": "visualizations", - "visualise": "visualize", - "visualised": "visualized", - "visualises": "visualizes", - "visualising": "visualizing", - "vocalisation": "vocalization", - "vocalisations": "vocalizations", - "vocalise": "vocalize", - "vocalised": "vocalized", - "vocalises": "vocalizes", - "vocalising": "vocalizing", - "vulcanised": "vulcanized", - "vulgarisation": "vulgarization", - "vulgarise": "vulgarize", - "vulgarised": "vulgarized", - "vulgarises": "vulgarizes", - "vulgarising": "vulgarizing", - "waggon": "wagon", - "waggons": "wagons", - "watercolour": "watercolor", - "watercolours": "watercolors", - "weaselled": "weaseled", - "weaselling": "weaseling", - "westernisation": "westernization", - "westernise": "westernize", - "westernised": "westernized", - "westernises": "westernizes", - "westernising": "westernizing", - "womanise": "womanize", - "womanised": "womanized", - "womaniser": "womanizer", - "womanisers": "womanizers", - "womanises": "womanizes", - "womanising": "womanizing", - "woollen": "woolen", - "woollens": "woolens", - "woollies": "woolies", - "woolly": "wooly", - "worshipped ": "worshiped", - "worshipping ": "worshiping ", - "worshipper": "worshiper", - "yodelled": "yodeled", - "yodelling": "yodeling", - "yoghourt": "yogurt", - "yoghourts": "yogurts", - "yoghurt": "yogurt", - "yoghurts": "yogurts", -} - - -NORM_EXCEPTIONS = {} - -for string, norm in _exc.items(): - NORM_EXCEPTIONS[string] = norm - NORM_EXCEPTIONS[string.title()] = norm diff --git a/spacy/lang/id/__init__.py b/spacy/lang/id/__init__.py index ea8e355ac..8e2266a40 100644 --- a/spacy/lang/id/__init__.py +++ b/spacy/lang/id/__init__.py @@ -4,25 +4,20 @@ from __future__ import unicode_literals from .stop_words import STOP_WORDS from .punctuation import TOKENIZER_SUFFIXES, 
TOKENIZER_PREFIXES, TOKENIZER_INFIXES from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS -from .norm_exceptions import NORM_EXCEPTIONS from .lex_attrs import LEX_ATTRS from .syntax_iterators import SYNTAX_ITERATORS from .tag_map import TAG_MAP from ..tokenizer_exceptions import BASE_EXCEPTIONS -from ..norm_exceptions import BASE_NORMS from ...language import Language -from ...attrs import LANG, NORM -from ...util import update_exc, add_lookups +from ...attrs import LANG +from ...util import update_exc class IndonesianDefaults(Language.Defaults): lex_attr_getters = dict(Language.Defaults.lex_attr_getters) lex_attr_getters[LANG] = lambda text: "id" lex_attr_getters.update(LEX_ATTRS) - lex_attr_getters[NORM] = add_lookups( - Language.Defaults.lex_attr_getters[NORM], BASE_NORMS, NORM_EXCEPTIONS - ) tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS) stop_words = STOP_WORDS prefixes = TOKENIZER_PREFIXES diff --git a/spacy/lang/id/norm_exceptions.py b/spacy/lang/id/norm_exceptions.py deleted file mode 100644 index 09ac6a6d3..000000000 --- a/spacy/lang/id/norm_exceptions.py +++ /dev/null @@ -1,532 +0,0 @@ -# coding: utf8 -from __future__ import unicode_literals - -# Daftar kosakata yang sering salah dieja -# https://id.wikipedia.org/wiki/Wikipedia:Daftar_kosakata_bahasa_Indonesia_yang_sering_salah_dieja -_exc = { - # Slang and abbreviations - "silahkan": "silakan", - "yg": "yang", - "kalo": "kalau", - "cawu": "caturwulan", - "ok": "oke", - "gak": "tidak", - "enggak": "tidak", - "nggak": "tidak", - "ndak": "tidak", - "ngga": "tidak", - "dgn": "dengan", - "tdk": "tidak", - "jg": "juga", - "klo": "kalau", - "denger": "dengar", - "pinter": "pintar", - "krn": "karena", - "nemuin": "menemukan", - "jgn": "jangan", - "udah": "sudah", - "sy": "saya", - "udh": "sudah", - "dapetin": "mendapatkan", - "ngelakuin": "melakukan", - "ngebuat": "membuat", - "membikin": "membuat", - "bikin": "buat", - # Daftar kosakata yang sering salah dieja - "malpraktik": 
"malapraktik", - "malfungsi": "malafungsi", - "malserap": "malaserap", - "maladaptasi": "malaadaptasi", - "malsuai": "malasuai", - "maldistribusi": "maladistribusi", - "malgizi": "malagizi", - "malsikap": "malasikap", - "memperhatikan": "memerhatikan", - "akte": "akta", - "cemilan": "camilan", - "esei": "esai", - "frase": "frasa", - "kafeteria": "kafetaria", - "ketapel": "katapel", - "kenderaan": "kendaraan", - "menejemen": "manajemen", - "menejer": "manajer", - "mesjid": "masjid", - "rebo": "rabu", - "seksama": "saksama", - "senggama": "sanggama", - "sekedar": "sekadar", - "seprei": "seprai", - "semedi": "semadi", - "samadi": "semadi", - "amandemen": "amendemen", - "algoritma": "algoritme", - "aritmatika": "aritmetika", - "metoda": "metode", - "materai": "meterai", - "meterei": "meterai", - "kalendar": "kalender", - "kadaluwarsa": "kedaluwarsa", - "katagori": "kategori", - "parlamen": "parlemen", - "sekular": "sekuler", - "selular": "seluler", - "sirkular": "sirkuler", - "survai": "survei", - "survey": "survei", - "aktuil": "aktual", - "formil": "formal", - "trotoir": "trotoar", - "komersiil": "komersial", - "komersil": "komersial", - "tradisionil": "tradisionial", - "orisinil": "orisinal", - "orijinil": "orisinal", - "afdol": "afdal", - "antri": "antre", - "apotik": "apotek", - "atlit": "atlet", - "atmosfir": "atmosfer", - "cidera": "cedera", - "cendikiawan": "cendekiawan", - "cepet": "cepat", - "cinderamata": "cenderamata", - "debet": "debit", - "difinisi": "definisi", - "dekrit": "dekret", - "disain": "desain", - "diskripsi": "deskripsi", - "diskotik": "diskotek", - "eksim": "eksem", - "exim": "eksem", - "faidah": "faedah", - "ekstrim": "ekstrem", - "ekstrimis": "ekstremis", - "komplit": "komplet", - "konkrit": "konkret", - "kongkrit": "konkret", - "kongkret": "konkret", - "kridit": "kredit", - "musium": "museum", - "pinalti": "penalti", - "piranti": "peranti", - "pinsil": "pensil", - "personil": "personel", - "sistim": "sistem", - "teoritis": "teoretis", - 
"vidio": "video", - "cengkeh": "cengkih", - "desertasi": "disertasi", - "hakekat": "hakikat", - "intelejen": "intelijen", - "kaedah": "kaidah", - "kempes": "kempis", - "kementrian": "kementerian", - "ledeng": "leding", - "nasehat": "nasihat", - "penasehat": "penasihat", - "praktek": "praktik", - "praktekum": "praktikum", - "resiko": "risiko", - "retsleting": "ritsleting", - "senen": "senin", - "amuba": "ameba", - "punggawa": "penggawa", - "surban": "serban", - "nomer": "nomor", - "sorban": "serban", - "bis": "bus", - "agribisnis": "agrobisnis", - "kantung": "kantong", - "khutbah": "khotbah", - "mandur": "mandor", - "rubuh": "roboh", - "pastur": "pastor", - "supir": "sopir", - "goncang": "guncang", - "goa": "gua", - "kaos": "kaus", - "kokoh": "kukuh", - "komulatif": "kumulatif", - "kolomnis": "kolumnis", - "korma": "kurma", - "lobang": "lubang", - "limo": "limusin", - "limosin": "limusin", - "mangkok": "mangkuk", - "saos": "saus", - "sop": "sup", - "sorga": "surga", - "tegor": "tegur", - "telor": "telur", - "obrak-abrik": "ubrak-abrik", - "ekwivalen": "ekuivalen", - "frekwensi": "frekuensi", - "konsekwensi": "konsekuensi", - "kwadran": "kuadran", - "kwadrat": "kuadrat", - "kwalifikasi": "kualifikasi", - "kwalitas": "kualitas", - "kwalitet": "kualitas", - "kwalitatif": "kualitatif", - "kwantitas": "kuantitas", - "kwantitatif": "kuantitatif", - "kwantum": "kuantum", - "kwartal": "kuartal", - "kwintal": "kuintal", - "kwitansi": "kuitansi", - "kwatir": "khawatir", - "kuatir": "khawatir", - "jadual": "jadwal", - "hirarki": "hierarki", - "karir": "karier", - "aktip": "aktif", - "daptar": "daftar", - "efektip": "efektif", - "epektif": "efektif", - "epektip": "efektif", - "Pebruari": "Februari", - "pisik": "fisik", - "pondasi": "fondasi", - "photo": "foto", - "photokopi": "fotokopi", - "hapal": "hafal", - "insap": "insaf", - "insyaf": "insaf", - "konperensi": "konferensi", - "kreatip": "kreatif", - "kreativ": "kreatif", - "maap": "maaf", - "napsu": "nafsu", - "negatip": 
"negatif", - "negativ": "negatif", - "objektip": "objektif", - "obyektip": "objektif", - "obyektif": "objektif", - "pasip": "pasif", - "pasiv": "pasif", - "positip": "positif", - "positiv": "positif", - "produktip": "produktif", - "produktiv": "produktif", - "sarap": "saraf", - "sertipikat": "sertifikat", - "subjektip": "subjektif", - "subyektip": "subjektif", - "subyektif": "subjektif", - "tarip": "tarif", - "transitip": "transitif", - "transitiv": "transitif", - "faham": "paham", - "fikir": "pikir", - "berfikir": "berpikir", - "telefon": "telepon", - "telfon": "telepon", - "telpon": "telepon", - "tilpon": "telepon", - "nafas": "napas", - "bernafas": "bernapas", - "pernafasan": "pernapasan", - "vermak": "permak", - "vulpen": "pulpen", - "aktifis": "aktivis", - "konfeksi": "konveksi", - "motifasi": "motivasi", - "Nopember": "November", - "propinsi": "provinsi", - "babtis": "baptis", - "jerembab": "jerembap", - "lembab": "lembap", - "sembab": "sembap", - "saptu": "sabtu", - "tekat": "tekad", - "bejad": "bejat", - "nekad": "nekat", - "otoped": "otopet", - "skuad": "skuat", - "jenius": "genius", - "marjin": "margin", - "marjinal": "marginal", - "obyek": "objek", - "subyek": "subjek", - "projek": "proyek", - "azas": "asas", - "ijasah": "ijazah", - "jenasah": "jenazah", - "plasa": "plaza", - "bathin": "batin", - "Katholik": "Katolik", - "orthografi": "ortografi", - "pathogen": "patogen", - "theologi": "teologi", - "ijin": "izin", - "rejeki": "rezeki", - "rejim": "rezim", - "jaman": "zaman", - "jamrud": "zamrud", - "jinah": "zina", - "perjinahan": "perzinaan", - "anugrah": "anugerah", - "cendrawasih": "cenderawasih", - "jendral": "jenderal", - "kripik": "keripik", - "krupuk": "kerupuk", - "ksatria": "kesatria", - "mentri": "menteri", - "negri": "negeri", - "Prancis": "Perancis", - "sebrang": "seberang", - "menyebrang": "menyeberang", - "Sumatra": "Sumatera", - "trampil": "terampil", - "isteri": "istri", - "justeru": "justru", - "perajurit": "prajurit", - "putera": 
"putra", - "puteri": "putri", - "samudera": "samudra", - "sastera": "sastra", - "sutera": "sutra", - "terompet": "trompet", - "iklas": "ikhlas", - "iktisar": "ikhtisar", - "kafilah": "khafilah", - "kawatir": "khawatir", - "kotbah": "khotbah", - "kusyuk": "khusyuk", - "makluk": "makhluk", - "mahluk": "makhluk", - "mahkluk": "makhluk", - "nahkoda": "nakhoda", - "nakoda": "nakhoda", - "tahta": "takhta", - "takhyul": "takhayul", - "tahyul": "takhayul", - "tahayul": "takhayul", - "akhli": "ahli", - "anarkhi": "anarki", - "kharisma": "karisma", - "kharismatik": "karismatik", - "mahsud": "maksud", - "makhsud": "maksud", - "rakhmat": "rahmat", - "tekhnik": "teknik", - "tehnik": "teknik", - "tehnologi": "teknologi", - "ikhwal": "ihwal", - "expor": "ekspor", - "extra": "ekstra", - "komplex": "komplek", - "sex": "seks", - "taxi": "taksi", - "extasi": "ekstasi", - "syaraf": "saraf", - "syurga": "surga", - "mashur": "masyhur", - "masyur": "masyhur", - "mahsyur": "masyhur", - "mashyur": "masyhur", - "muadzin": "muazin", - "adzan": "azan", - "ustadz": "ustaz", - "ustad": "ustaz", - "ustadzah": "ustaz", - "dzikir": "zikir", - "dzuhur": "zuhur", - "dhuhur": "zuhur", - "zhuhur": "zuhur", - "analisa": "analisis", - "diagnosa": "diagnosis", - "hipotesa": "hipotesis", - "sintesa": "sintesis", - "aktiviti": "aktivitas", - "aktifitas": "aktivitas", - "efektifitas": "efektivitas", - "komuniti": "komunitas", - "kreatifitas": "kreativitas", - "produktifitas": "produktivitas", - "realiti": "realitas", - "realita": "realitas", - "selebriti": "selebritas", - "spotifitas": "sportivitas", - "universiti": "universitas", - "utiliti": "utilitas", - "validiti": "validitas", - "dilokalisir": "dilokalisasi", - "didramatisir": "didramatisasi", - "dipolitisir": "dipolitisasi", - "dinetralisir": "dinetralisasi", - "dikonfrontir": "dikonfrontasi", - "mendominir": "mendominasi", - "koordinir": "koordinasi", - "proklamir": "proklamasi", - "terorganisir": "terorganisasi", - "terealisir": "terealisasi", - 
"robah": "ubah", - "dirubah": "diubah", - "merubah": "mengubah", - "terlanjur": "telanjur", - "terlantar": "telantar", - "penglepasan": "pelepasan", - "pelihatan": "penglihatan", - "pemukiman": "permukiman", - "pengrumahan": "perumahan", - "penyewaan": "persewaan", - "menyintai": "mencintai", - "menyolok": "mencolok", - "contek": "sontek", - "mencontek": "menyontek", - "pungkir": "mungkir", - "dipungkiri": "dimungkiri", - "kupungkiri": "kumungkiri", - "kaupungkiri": "kaumungkiri", - "nampak": "tampak", - "nampaknya": "tampaknya", - "nongkrong": "tongkrong", - "berternak": "beternak", - "berterbangan": "beterbangan", - "berserta": "beserta", - "berperkara": "beperkara", - "berpergian": "bepergian", - "berkerja": "bekerja", - "berberapa": "beberapa", - "terbersit": "tebersit", - "terpercaya": "tepercaya", - "terperdaya": "teperdaya", - "terpercik": "tepercik", - "terpergok": "tepergok", - "aksesoris": "aksesori", - "handal": "andal", - "hantar": "antar", - "panutan": "anutan", - "atsiri": "asiri", - "bhakti": "bakti", - "china": "cina", - "dharma": "darma", - "diktaktor": "diktator", - "eksport": "ekspor", - "hembus": "embus", - "hadits": "hadis", - "hadist": "hadits", - "harafiah": "harfiah", - "himbau": "imbau", - "import": "impor", - "inget": "ingat", - "hisap": "isap", - "interprestasi": "interpretasi", - "kangker": "kanker", - "konggres": "kongres", - "lansekap": "lanskap", - "maghrib": "magrib", - "emak": "mak", - "moderen": "modern", - "pasport": "paspor", - "perduli": "peduli", - "ramadhan": "ramadan", - "rapih": "rapi", - "Sansekerta": "Sanskerta", - "shalat": "salat", - "sholat": "salat", - "silahkan": "silakan", - "standard": "standar", - "hutang": "utang", - "zinah": "zina", - "ambulan": "ambulans", - "antartika": "sntarktika", - "arteri": "arteria", - "asik": "asyik", - "australi": "australia", - "denga": "dengan", - "depo": "depot", - "detil": "detail", - "ensiklopedi": "ensiklopedia", - "elit": "elite", - "frustasi": "frustrasi", - "gladi": "geladi", - 
"greget": "gereget", - "itali": "italia", - "karna": "karena", - "klenteng": "kelenteng", - "erling": "kerling", - "kontruksi": "konstruksi", - "masal": "massal", - "merk": "merek", - "respon": "respons", - "diresponi": "direspons", - "skak": "sekak", - "stir": "setir", - "singapur": "singapura", - "standarisasi": "standardisasi", - "varitas": "varietas", - "amphibi": "amfibi", - "anjlog": "anjlok", - "alpukat": "avokad", - "alpokat": "avokad", - "bolpen": "pulpen", - "cabe": "cabai", - "cabay": "cabai", - "ceret": "cerek", - "differensial": "diferensial", - "duren": "durian", - "faksimili": "faksimile", - "faksimil": "faksimile", - "graha": "gerha", - "goblog": "goblok", - "gombrong": "gombroh", - "horden": "gorden", - "korden": "gorden", - "gubug": "gubuk", - "imaginasi": "imajinasi", - "jerigen": "jeriken", - "jirigen": "jeriken", - "carut-marut": "karut-marut", - "kwota": "kuota", - "mahzab": "mazhab", - "mempesona": "memesona", - "milyar": "miliar", - "missi": "misi", - "nenas": "nanas", - "negoisasi": "negosiasi", - "automotif": "otomotif", - "pararel": "paralel", - "paska": "pasca", - "prosen": "persen", - "pete": "petai", - "petay": "petai", - "proffesor": "profesor", - "rame": "ramai", - "rapot": "rapor", - "rileks": "relaks", - "rileksasi": "relaksasi", - "renumerasi": "remunerasi", - "seketaris": "sekretaris", - "sekertaris": "sekretaris", - "sensorik": "sensoris", - "sentausa": "sentosa", - "strawberi": "stroberi", - "strawbery": "stroberi", - "taqwa": "takwa", - "tauco": "taoco", - "tauge": "taoge", - "toge": "taoge", - "tauladan": "teladan", - "taubat": "tobat", - "trilyun": "triliun", - "vissi": "visi", - "coklat": "cokelat", - "narkotika": "narkotik", - "oase": "oasis", - "politisi": "politikus", - "terong": "terung", - "wool": "wol", - "himpit": "impit", - "mujizat": "mukjizat", - "mujijat": "mukjizat", - "yag": "yang", -} - -NORM_EXCEPTIONS = {} - -for string, norm in _exc.items(): - NORM_EXCEPTIONS[string] = norm - NORM_EXCEPTIONS[string.title()] 
= norm diff --git a/spacy/lang/lb/__init__.py b/spacy/lang/lb/__init__.py index 4fcfaddb4..8d85b8fc7 100644 --- a/spacy/lang/lb/__init__.py +++ b/spacy/lang/lb/__init__.py @@ -2,26 +2,21 @@ from __future__ import unicode_literals from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS -from .norm_exceptions import NORM_EXCEPTIONS from .punctuation import TOKENIZER_INFIXES from .lex_attrs import LEX_ATTRS from .tag_map import TAG_MAP from .stop_words import STOP_WORDS from ..tokenizer_exceptions import BASE_EXCEPTIONS -from ..norm_exceptions import BASE_NORMS from ...language import Language -from ...attrs import LANG, NORM -from ...util import update_exc, add_lookups +from ...attrs import LANG +from ...util import update_exc class LuxembourgishDefaults(Language.Defaults): lex_attr_getters = dict(Language.Defaults.lex_attr_getters) lex_attr_getters.update(LEX_ATTRS) lex_attr_getters[LANG] = lambda text: "lb" - lex_attr_getters[NORM] = add_lookups( - Language.Defaults.lex_attr_getters[NORM], NORM_EXCEPTIONS, BASE_NORMS - ) tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS) stop_words = STOP_WORDS tag_map = TAG_MAP diff --git a/spacy/lang/lb/norm_exceptions.py b/spacy/lang/lb/norm_exceptions.py deleted file mode 100644 index 7063e6863..000000000 --- a/spacy/lang/lb/norm_exceptions.py +++ /dev/null @@ -1,16 +0,0 @@ -# coding: utf8 -from __future__ import unicode_literals - -# TODO -# norm execptions: find a possibility to deal with the zillions of spelling -# variants (vläicht = vlaicht, vleicht, viläicht, viläischt, etc. etc.) 
-# here one could include the most common spelling mistakes - -_exc = {"dass": "datt", "viläicht": "vläicht"} - - -NORM_EXCEPTIONS = {} - -for string, norm in _exc.items(): - NORM_EXCEPTIONS[string] = norm - NORM_EXCEPTIONS[string.title()] = norm diff --git a/spacy/lang/lex_attrs.py b/spacy/lang/lex_attrs.py index 7c0ed8a04..c9cd82d7b 100644 --- a/spacy/lang/lex_attrs.py +++ b/spacy/lang/lex_attrs.py @@ -186,10 +186,6 @@ def suffix(string): return string[-3:] -def cluster(string): - return 0 - - def is_alpha(string): return string.isalpha() @@ -218,20 +214,11 @@ def is_stop(string, stops=set()): return string.lower() in stops -def is_oov(string): - return True - - -def get_prob(string): - return -20.0 - - LEX_ATTRS = { attrs.LOWER: lower, attrs.NORM: lower, attrs.PREFIX: prefix, attrs.SUFFIX: suffix, - attrs.CLUSTER: cluster, attrs.IS_ALPHA: is_alpha, attrs.IS_DIGIT: is_digit, attrs.IS_LOWER: is_lower, @@ -239,8 +226,6 @@ LEX_ATTRS = { attrs.IS_TITLE: is_title, attrs.IS_UPPER: is_upper, attrs.IS_STOP: is_stop, - attrs.IS_OOV: is_oov, - attrs.PROB: get_prob, attrs.LIKE_EMAIL: like_email, attrs.LIKE_NUM: like_num, attrs.IS_PUNCT: is_punct, diff --git a/spacy/lang/pt/__init__.py b/spacy/lang/pt/__init__.py index f786d6542..c09996126 100644 --- a/spacy/lang/pt/__init__.py +++ b/spacy/lang/pt/__init__.py @@ -5,22 +5,17 @@ from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS from .stop_words import STOP_WORDS from .lex_attrs import LEX_ATTRS from .tag_map import TAG_MAP -from .norm_exceptions import NORM_EXCEPTIONS from ..tokenizer_exceptions import BASE_EXCEPTIONS from .punctuation import TOKENIZER_INFIXES, TOKENIZER_PREFIXES -from ..norm_exceptions import BASE_NORMS from ...language import Language -from ...attrs import LANG, NORM -from ...util import update_exc, add_lookups +from ...attrs import LANG +from ...util import update_exc class PortugueseDefaults(Language.Defaults): lex_attr_getters = dict(Language.Defaults.lex_attr_getters) lex_attr_getters[LANG] = lambda 
text: "pt" - lex_attr_getters[NORM] = add_lookups( - Language.Defaults.lex_attr_getters[NORM], BASE_NORMS, NORM_EXCEPTIONS - ) lex_attr_getters.update(LEX_ATTRS) tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS) stop_words = STOP_WORDS diff --git a/spacy/lang/pt/norm_exceptions.py b/spacy/lang/pt/norm_exceptions.py deleted file mode 100644 index ea650cb31..000000000 --- a/spacy/lang/pt/norm_exceptions.py +++ /dev/null @@ -1,23 +0,0 @@ -# coding: utf8 -from __future__ import unicode_literals - -# These exceptions are used to add NORM values based on a token's ORTH value. -# Individual languages can also add their own exceptions and overwrite them - -# for example, British vs. American spelling in English. - -# Norms are only set if no alternative is provided in the tokenizer exceptions. -# Note that this does not change any other token attributes. Its main purpose -# is to normalise the word representations so that equivalent tokens receive -# similar representations. For example: $ and € are very different, but they're -# both currency symbols. By normalising currency symbols to $, all symbols are -# seen as similar, no matter how common they are in the training data. 
- - -NORM_EXCEPTIONS = { - "R$": "$", # Real - "r$": "$", # Real - "Cz$": "$", # Cruzado - "cz$": "$", # Cruzado - "NCz$": "$", # Cruzado Novo - "ncz$": "$", # Cruzado Novo -} diff --git a/spacy/lang/ru/__init__.py b/spacy/lang/ru/__init__.py index f34fc5435..f0e77d811 100644 --- a/spacy/lang/ru/__init__.py +++ b/spacy/lang/ru/__init__.py @@ -3,26 +3,21 @@ from __future__ import unicode_literals, print_function from .stop_words import STOP_WORDS from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS -from .norm_exceptions import NORM_EXCEPTIONS from .lex_attrs import LEX_ATTRS from .tag_map import TAG_MAP from .lemmatizer import RussianLemmatizer from ..tokenizer_exceptions import BASE_EXCEPTIONS -from ..norm_exceptions import BASE_NORMS -from ...util import update_exc, add_lookups +from ...util import update_exc from ...language import Language from ...lookups import Lookups -from ...attrs import LANG, NORM +from ...attrs import LANG class RussianDefaults(Language.Defaults): lex_attr_getters = dict(Language.Defaults.lex_attr_getters) lex_attr_getters.update(LEX_ATTRS) lex_attr_getters[LANG] = lambda text: "ru" - lex_attr_getters[NORM] = add_lookups( - Language.Defaults.lex_attr_getters[NORM], BASE_NORMS, NORM_EXCEPTIONS - ) tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS) stop_words = STOP_WORDS tag_map = TAG_MAP diff --git a/spacy/lang/ru/norm_exceptions.py b/spacy/lang/ru/norm_exceptions.py deleted file mode 100644 index 43e08948c..000000000 --- a/spacy/lang/ru/norm_exceptions.py +++ /dev/null @@ -1,36 +0,0 @@ -# coding: utf8 -from __future__ import unicode_literals - - -_exc = { - # Slang - "прив": "привет", - "дарова": "привет", - "дак": "так", - "дык": "так", - "здарова": "привет", - "пакедава": "пока", - "пакедаво": "пока", - "ща": "сейчас", - "спс": "спасибо", - "пжлст": "пожалуйста", - "плиз": "пожалуйста", - "ладненько": "ладно", - "лады": "ладно", - "лан": "ладно", - "ясн": "ясно", - "всм": "всмысле", - "хош": "хочешь", - 
"хаюшки": "привет", - "оч": "очень", - "че": "что", - "чо": "что", - "шо": "что", -} - - -NORM_EXCEPTIONS = {} - -for string, norm in _exc.items(): - NORM_EXCEPTIONS[string] = norm - NORM_EXCEPTIONS[string.title()] = norm diff --git a/spacy/lang/sr/__init__.py b/spacy/lang/sr/__init__.py index f27b87102..286d6693b 100644 --- a/spacy/lang/sr/__init__.py +++ b/spacy/lang/sr/__init__.py @@ -3,22 +3,17 @@ from __future__ import unicode_literals from .stop_words import STOP_WORDS from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS -from .norm_exceptions import NORM_EXCEPTIONS from .lex_attrs import LEX_ATTRS from ..tokenizer_exceptions import BASE_EXCEPTIONS -from ..norm_exceptions import BASE_NORMS from ...language import Language -from ...attrs import LANG, NORM -from ...util import update_exc, add_lookups +from ...attrs import LANG +from ...util import update_exc class SerbianDefaults(Language.Defaults): lex_attr_getters = dict(Language.Defaults.lex_attr_getters) lex_attr_getters.update(LEX_ATTRS) lex_attr_getters[LANG] = lambda text: "sr" - lex_attr_getters[NORM] = add_lookups( - Language.Defaults.lex_attr_getters[NORM], BASE_NORMS, NORM_EXCEPTIONS - ) tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS) stop_words = STOP_WORDS diff --git a/spacy/lang/sr/norm_exceptions.py b/spacy/lang/sr/norm_exceptions.py deleted file mode 100644 index 69f2c3173..000000000 --- a/spacy/lang/sr/norm_exceptions.py +++ /dev/null @@ -1,26 +0,0 @@ -# coding: utf8 -from __future__ import unicode_literals - - -_exc = { - # Slang - "ћале": "отац", - "кева": "мајка", - "смор": "досада", - "кец": "јединица", - "тебра": "брат", - "штребер": "ученик", - "факс": "факултет", - "профа": "професор", - "бус": "аутобус", - "пискарало": "службеник", - "бакутанер": "бака", - "џибер": "простак", -} - - -NORM_EXCEPTIONS = {} - -for string, norm in _exc.items(): - NORM_EXCEPTIONS[string] = norm - NORM_EXCEPTIONS[string.title()] = norm diff --git a/spacy/lang/ta/norm_exceptions.py 
b/spacy/lang/ta/norm_exceptions.py deleted file mode 100644 index fbdceb98c..000000000 --- a/spacy/lang/ta/norm_exceptions.py +++ /dev/null @@ -1,139 +0,0 @@ -# coding: utf8 -from __future__ import unicode_literals - -_exc = { - # Regional words normal - # Sri Lanka - wikipeadia - "இங்க": "இங்கே", - "வாங்க": "வாருங்கள்", - "ஒண்டு": "ஒன்று", - "கண்டு": "கன்று", - "கொண்டு": "கொன்று", - "பண்டி": "பன்றி", - "பச்ச": "பச்சை", - "அம்பது": "ஐம்பது", - "வெச்ச": "வைத்து", - "வச்ச": "வைத்து", - "வச்சி": "வைத்து", - "வாளைப்பழம்": "வாழைப்பழம்", - "மண்ணு": "மண்", - "பொன்னு": "பொன்", - "சாவல்": "சேவல்", - "அங்கால": "அங்கு ", - "அசுப்பு": "நடமாட்டம்", - "எழுவான் கரை": "எழுவான்கரை", - "ஓய்யாரம்": "எழில் ", - "ஒளும்பு": "எழும்பு", - "ஓர்மை": "துணிவு", - "கச்சை": "கோவணம்", - "கடப்பு": "தெருவாசல்", - "சுள்ளி": "காய்ந்த குச்சி", - "திறாவுதல்": "தடவுதல்", - "நாசமறுப்பு": "தொல்லை", - "பரிசாரி": "வைத்தியன்", - "பறவாதி": "பேராசைக்காரன்", - "பிசினி": "உலோபி ", - "விசர்": "பைத்தியம்", - "ஏனம்": "பாத்திரம்", - "ஏலா": "இயலாது", - "ஒசில்": "அழகு", - "ஒள்ளுப்பம்": "கொஞ்சம்", - # Srilankan and indian - "குத்துமதிப்பு": "", - "நூனாயம்": "நூல்நயம்", - "பைய": "மெதுவாக", - "மண்டை": "தலை", - "வெள்ளனே": "சீக்கிரம்", - "உசுப்பு": "எழுப்பு", - "ஆணம்": "குழம்பு", - "உறக்கம்": "தூக்கம்", - "பஸ்": "பேருந்து", - "களவு": "திருட்டு ", - # relationship - "புருசன்": "கணவன்", - "பொஞ்சாதி": "மனைவி", - "புள்ள": "பிள்ளை", - "பிள்ள": "பிள்ளை", - "ஆம்பிளப்புள்ள": "ஆண் பிள்ளை", - "பொம்பிளப்புள்ள": "பெண் பிள்ளை", - "அண்ணாச்சி": "அண்ணா", - "அக்காச்சி": "அக்கா", - "தங்கச்சி": "தங்கை", - # difference words - "பொடியன்": "சிறுவன்", - "பொட்டை": "சிறுமி", - "பிறகு": "பின்பு", - "டக்கென்டு": "விரைவாக", - "கெதியா": "விரைவாக", - "கிறுகி": "திரும்பி", - "போயித்து வாறன்": "போய் வருகிறேன்", - "வருவாங்களா": "வருவார்களா", - # regular spokens - "சொல்லு": "சொல்", - "கேளு": "கேள்", - "சொல்லுங்க": "சொல்லுங்கள்", - "கேளுங்க": "கேளுங்கள்", - "நீங்கள்": "நீ", - "உன்": "உன்னுடைய", - # Portugeese formal words - "அலவாங்கு": "கடப்பாரை", - 
"ஆசுப்பத்திரி": "மருத்துவமனை", - "உரோதை": "சில்லு", - "கடுதாசி": "கடிதம்", - "கதிரை": "நாற்காலி", - "குசினி": "அடுக்களை", - "கோப்பை": "கிண்ணம்", - "சப்பாத்து": "காலணி", - "தாச்சி": "இரும்புச் சட்டி", - "துவாய்": "துவாலை", - "தவறணை": "மதுக்கடை", - "பீப்பா": "மரத்தாழி", - "யன்னல்": "சாளரம்", - "வாங்கு": "மரஇருக்கை", - # Dutch formal words - "இறாக்கை": "பற்சட்டம்", - "இலாட்சி": "இழுப்பறை", - "கந்தோர்": "பணிமனை", - "நொத்தாரிசு": "ஆவண எழுத்துபதிவாளர்", - # English formal words - "இஞ்சினியர்": "பொறியியலாளர்", - "சூப்பு": "ரசம்", - "செக்": "காசோலை", - "சேட்டு": "மேற்ச்சட்டை", - "மார்க்கட்டு": "சந்தை", - "விண்ணன்": "கெட்டிக்காரன்", - # Arabic formal words - "ஈமான்": "நம்பிக்கை", - "சுன்னத்து": "விருத்தசேதனம்", - "செய்த்தான்": "பிசாசு", - "மவுத்து": "இறப்பு", - "ஹலால்": "அங்கீகரிக்கப்பட்டது", - "கறாம்": "நிராகரிக்கப்பட்டது", - # Persian, Hindustanian and hindi formal words - "சுமார்": "கிட்டத்தட்ட", - "சிப்பாய்": "போர்வீரன்", - "சிபார்சு": "சிபாரிசு", - "ஜமீன்": "பணக்காரா்", - "அசல்": "மெய்யான", - "அந்தஸ்து": "கௌரவம்", - "ஆஜர்": "சமா்ப்பித்தல்", - "உசார்": "எச்சரிக்கை", - "அச்சா": "நல்ல", - # English words used in text conversations - "bcoz": "ஏனெனில்", - "bcuz": "ஏனெனில்", - "fav": "விருப்பமான", - "morning": "காலை வணக்கம்", - "gdeveng": "மாலை வணக்கம்", - "gdnyt": "இரவு வணக்கம்", - "gdnit": "இரவு வணக்கம்", - "plz": "தயவு செய்து", - "pls": "தயவு செய்து", - "thx": "நன்றி", - "thanx": "நன்றி", -} - -NORM_EXCEPTIONS = {} - -for string, norm in _exc.items(): - NORM_EXCEPTIONS[string] = norm diff --git a/spacy/lang/th/__init__.py b/spacy/lang/th/__init__.py index 06970fbd7..512be0c59 100644 --- a/spacy/lang/th/__init__.py +++ b/spacy/lang/th/__init__.py @@ -4,14 +4,12 @@ from __future__ import unicode_literals from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS from .tag_map import TAG_MAP from .stop_words import STOP_WORDS -from .norm_exceptions import NORM_EXCEPTIONS from .lex_attrs import LEX_ATTRS -from ..norm_exceptions import BASE_NORMS -from ...attrs import LANG, NORM 
+from ...attrs import LANG from ...language import Language from ...tokens import Doc -from ...util import DummyTokenizer, add_lookups +from ...util import DummyTokenizer class ThaiTokenizer(DummyTokenizer): @@ -37,9 +35,6 @@ class ThaiDefaults(Language.Defaults): lex_attr_getters = dict(Language.Defaults.lex_attr_getters) lex_attr_getters.update(LEX_ATTRS) lex_attr_getters[LANG] = lambda _text: "th" - lex_attr_getters[NORM] = add_lookups( - Language.Defaults.lex_attr_getters[NORM], BASE_NORMS, NORM_EXCEPTIONS - ) tokenizer_exceptions = dict(TOKENIZER_EXCEPTIONS) tag_map = TAG_MAP stop_words = STOP_WORDS diff --git a/spacy/lang/th/norm_exceptions.py b/spacy/lang/th/norm_exceptions.py deleted file mode 100644 index ed1b3e760..000000000 --- a/spacy/lang/th/norm_exceptions.py +++ /dev/null @@ -1,113 +0,0 @@ -# coding: utf8 -from __future__ import unicode_literals - - -_exc = { - # Conjugation and Diversion invalid to Tonal form (ผันอักษรและเสียงไม่ตรงกับรูปวรรณยุกต์) - "สนุ๊กเกอร์": "สนุกเกอร์", - "โน้ต": "โน้ต", - # Misspelled because of being lazy or hustle (สะกดผิดเพราะขี้เกียจพิมพ์ หรือเร่งรีบ) - "โทสับ": "โทรศัพท์", - "พุ่งนี้": "พรุ่งนี้", - # Strange (ให้ดูแปลกตา) - "ชะมะ": "ใช่ไหม", - "ชิมิ": "ใช่ไหม", - "ชะ": "ใช่ไหม", - "ช่ายมะ": "ใช่ไหม", - "ป่าว": "เปล่า", - "ป่ะ": "เปล่า", - "ปล่าว": "เปล่า", - "คัย": "ใคร", - "ไค": "ใคร", - "คราย": "ใคร", - "เตง": "ตัวเอง", - "ตะเอง": "ตัวเอง", - "รึ": "หรือ", - "เหรอ": "หรือ", - "หรา": "หรือ", - "หรอ": "หรือ", - "ชั้น": "ฉัน", - "ชั้ล": "ฉัน", - "ช้าน": "ฉัน", - "เทอ": "เธอ", - "เทอร์": "เธอ", - "เทอว์": "เธอ", - "แกร": "แก", - "ป๋ม": "ผม", - "บ่องตง": "บอกตรงๆ", - "ถ่ามตง": "ถามตรงๆ", - "ต่อมตง": "ตอบตรงๆ", - "เพิ่ล": "เพื่อน", - "จอบอ": "จอบอ", - "ดั้ย": "ได้", - "ขอบคุง": "ขอบคุณ", - "ยังงัย": "ยังไง", - "Inw": "เทพ", - "uou": "นอน", - "Lกรีeu": "เกรียน", - # Misspelled to express emotions (คำที่สะกดผิดเพื่อแสดงอารมณ์) - "เปงราย": "เป็นอะไร", - "เปนรัย": "เป็นอะไร", - "เปงรัย": "เป็นอะไร", - "เป็นอัลไล": "เป็นอะไร", 
- "ทามมาย": "ทำไม", - "ทามมัย": "ทำไม", - "จังรุย": "จังเลย", - "จังเยย": "จังเลย", - "จุงเบย": "จังเลย", - "ไม่รู้": "มะรุ", - "เฮ่ย": "เฮ้ย", - "เห้ย": "เฮ้ย", - "น่าร็อค": "น่ารัก", - "น่าร๊าก": "น่ารัก", - "ตั้ลล๊าก": "น่ารัก", - "คือร๊ะ": "คืออะไร", - "โอป่ะ": "โอเคหรือเปล่า", - "น่ามคาน": "น่ารำคาญ", - "น่ามสาร": "น่าสงสาร", - "วงวาร": "สงสาร", - "บับว่า": "แบบว่า", - "อัลไล": "อะไร", - "อิจ": "อิจฉา", - # Reduce rough words or Avoid to software filter (คำที่สะกดผิดเพื่อลดความหยาบของคำ หรืออาจใช้หลีกเลี่ยงการกรองคำหยาบของซอฟต์แวร์) - "กรู": "กู", - "กุ": "กู", - "กรุ": "กู", - "ตู": "กู", - "ตรู": "กู", - "มรึง": "มึง", - "เมิง": "มึง", - "มืง": "มึง", - "มุง": "มึง", - "สาด": "สัตว์", - "สัส": "สัตว์", - "สัก": "สัตว์", - "แสรด": "สัตว์", - "โคโตะ": "โคตร", - "โคด": "โคตร", - "โครต": "โคตร", - "โคตะระ": "โคตร", - "พ่อง": "พ่อมึง", - "แม่เมิง": "แม่มึง", - "เชี่ย": "เหี้ย", - # Imitate words (คำเลียนเสียง โดยส่วนใหญ่จะเพิ่มทัณฑฆาต หรือซ้ำตัวอักษร) - "แอร๊ยย": "อ๊าย", - "อร๊ายยย": "อ๊าย", - "มันส์": "มัน", - "วู๊วววววววว์": "วู้", - # Acronym (แบบคำย่อ) - "หมาลัย": "มหาวิทยาลัย", - "วิดวะ": "วิศวะ", - "สินสาด ": "ศิลปศาสตร์", - "สินกำ ": "ศิลปกรรมศาสตร์", - "เสารีย์ ": "อนุเสาวรีย์ชัยสมรภูมิ", - "เมกา ": "อเมริกา", - "มอไซค์ ": "มอเตอร์ไซค์", -} - - -NORM_EXCEPTIONS = {} - -for string, norm in _exc.items(): - NORM_EXCEPTIONS[string] = norm - NORM_EXCEPTIONS[string.title()] = norm diff --git a/spacy/language.py b/spacy/language.py index f23776def..703806627 100644 --- a/spacy/language.py +++ b/spacy/language.py @@ -28,10 +28,11 @@ from .compat import izip, basestring_, is_python2, class_types from .gold import GoldParse from .scorer import Scorer from ._ml import link_vectors_to_models, create_default_optimizer -from .attrs import IS_STOP, LANG +from .attrs import IS_STOP, LANG, NORM from .lang.punctuation import TOKENIZER_PREFIXES, TOKENIZER_SUFFIXES from .lang.punctuation import TOKENIZER_INFIXES from .lang.tokenizer_exceptions import TOKEN_MATCH +from 
.lang.norm_exceptions import BASE_NORMS from .lang.tag_map import TAG_MAP from .tokens import Doc from .lang.lex_attrs import LEX_ATTRS, is_stop @@ -77,6 +78,9 @@ class BaseDefaults(object): lemmatizer=lemmatizer, lookups=lookups, ) + vocab.lex_attr_getters[NORM] = util.add_lookups( + vocab.lex_attr_getters.get(NORM, LEX_ATTRS[NORM]), BASE_NORMS, vocab.lookups.get_table("lexeme_norm") + ) for tag_str, exc in cls.morph_rules.items(): for orth_str, attrs in exc.items(): vocab.morphology.add_special_case(tag_str, orth_str, attrs) diff --git a/spacy/lexeme.pxd b/spacy/lexeme.pxd index f31733374..167f57462 100644 --- a/spacy/lexeme.pxd +++ b/spacy/lexeme.pxd @@ -1,8 +1,8 @@ from .typedefs cimport attr_t, hash_t, flags_t, len_t, tag_t from .attrs cimport attr_id_t -from .attrs cimport ID, ORTH, LOWER, NORM, SHAPE, PREFIX, SUFFIX, LENGTH, CLUSTER, LANG +from .attrs cimport ID, ORTH, LOWER, NORM, SHAPE, PREFIX, SUFFIX, LENGTH, LANG -from .structs cimport LexemeC, SerializedLexemeC +from .structs cimport LexemeC from .strings cimport StringStore from .vocab cimport Vocab @@ -24,22 +24,6 @@ cdef class Lexeme: self.vocab = vocab self.orth = lex.orth - @staticmethod - cdef inline SerializedLexemeC c_to_bytes(const LexemeC* lex) nogil: - cdef SerializedLexemeC lex_data - buff = &lex.flags - end = &lex.sentiment + sizeof(lex.sentiment) - for i in range(sizeof(lex_data.data)): - lex_data.data[i] = buff[i] - return lex_data - - @staticmethod - cdef inline void c_from_bytes(LexemeC* lex, SerializedLexemeC lex_data) nogil: - buff = &lex.flags - end = &lex.sentiment + sizeof(lex.sentiment) - for i in range(sizeof(lex_data.data)): - buff[i] = lex_data.data[i] - @staticmethod cdef inline void set_struct_attr(LexemeC* lex, attr_id_t name, attr_t value) nogil: if name < (sizeof(flags_t) * 8): @@ -56,8 +40,6 @@ cdef class Lexeme: lex.prefix = value elif name == SUFFIX: lex.suffix = value - elif name == CLUSTER: - lex.cluster = value elif name == LANG: lex.lang = value @@ -84,8 +66,6 @@ 
cdef class Lexeme: return lex.suffix elif feat_name == LENGTH: return lex.length - elif feat_name == CLUSTER: - return lex.cluster elif feat_name == LANG: return lex.lang else: diff --git a/spacy/lexeme.pyx b/spacy/lexeme.pyx index a081ffe42..dec2993fa 100644 --- a/spacy/lexeme.pyx +++ b/spacy/lexeme.pyx @@ -17,7 +17,7 @@ from .typedefs cimport attr_t, flags_t from .attrs cimport IS_ALPHA, IS_ASCII, IS_DIGIT, IS_LOWER, IS_PUNCT, IS_SPACE from .attrs cimport IS_TITLE, IS_UPPER, LIKE_URL, LIKE_NUM, LIKE_EMAIL, IS_STOP from .attrs cimport IS_BRACKET, IS_QUOTE, IS_LEFT_PUNCT, IS_RIGHT_PUNCT -from .attrs cimport IS_CURRENCY, IS_OOV, PROB +from .attrs cimport IS_CURRENCY from .attrs import intify_attrs from .errors import Errors, Warnings @@ -89,12 +89,11 @@ cdef class Lexeme: cdef attr_id_t attr attrs = intify_attrs(attrs) for attr, value in attrs.items(): - if attr == PROB: - self.c.prob = value - elif attr == CLUSTER: - self.c.cluster = int(value) - elif isinstance(value, int) or isinstance(value, long): - Lexeme.set_struct_attr(self.c, attr, value) + # skip PROB, e.g. 
from lexemes.jsonl + if isinstance(value, float): + continue + elif isinstance(value, (int, long)): + Lexeme.set_struct_attr(self.c, attr, value) else: Lexeme.set_struct_attr(self.c, attr, self.vocab.strings.add(value)) @@ -137,34 +136,6 @@ cdef class Lexeme: xp = get_array_module(vector) return (xp.dot(vector, other.vector) / (self.vector_norm * other.vector_norm)) - def to_bytes(self): - lex_data = Lexeme.c_to_bytes(self.c) - start = &self.c.flags - end = &self.c.sentiment + sizeof(self.c.sentiment) - if (end-start) != sizeof(lex_data.data): - raise ValueError(Errors.E072.format(length=end-start, - bad_length=sizeof(lex_data.data))) - byte_string = b"\0" * sizeof(lex_data.data) - byte_chars = byte_string - for i in range(sizeof(lex_data.data)): - byte_chars[i] = lex_data.data[i] - if len(byte_string) != sizeof(lex_data.data): - raise ValueError(Errors.E072.format(length=len(byte_string), - bad_length=sizeof(lex_data.data))) - return byte_string - - def from_bytes(self, bytes byte_string): - # This method doesn't really have a use-case --- wrote it for testing. - # Possibly delete? It puts the Lexeme out of synch with the vocab. - cdef SerializedLexemeC lex_data - if len(byte_string) != sizeof(lex_data.data): - raise ValueError(Errors.E072.format(length=len(byte_string), - bad_length=sizeof(lex_data.data))) - for i in range(len(byte_string)): - lex_data.data[i] = byte_string[i] - Lexeme.c_from_bytes(self.c, lex_data) - self.orth = self.c.orth - @property def has_vector(self): """RETURNS (bool): Whether a word vector is associated with the object. 
@@ -208,10 +179,14 @@ cdef class Lexeme: """RETURNS (float): A scalar value indicating the positivity or negativity of the lexeme.""" def __get__(self): - return self.c.sentiment + sentiment_table = self.vocab.lookups.get_table("lexeme_sentiment", {}) + return sentiment_table.get(self.c.orth, 0.0) - def __set__(self, float sentiment): - self.c.sentiment = sentiment + def __set__(self, float x): + if "lexeme_sentiment" not in self.vocab.lookups: + self.vocab.lookups.add_table("lexeme_sentiment") + sentiment_table = self.vocab.lookups.get_table("lexeme_sentiment") + sentiment_table[self.c.orth] = x @property def orth_(self): @@ -238,9 +213,13 @@ cdef class Lexeme: lexeme text. """ def __get__(self): - return self.c.norm + return self.c.norm def __set__(self, attr_t x): + if "lexeme_norm" not in self.vocab.lookups: + self.vocab.lookups.add_table("lexeme_norm") + norm_table = self.vocab.lookups.get_table("lexeme_norm") + norm_table[self.c.orth] = self.vocab.strings[x] self.c.norm = x property shape: @@ -276,10 +255,12 @@ cdef class Lexeme: property cluster: """RETURNS (int): Brown cluster ID.""" def __get__(self): - return self.c.cluster + cluster_table = self.vocab.load_extra_lookups("lexeme_cluster") + return cluster_table.get(self.c.orth, 0) - def __set__(self, attr_t x): - self.c.cluster = x + def __set__(self, int x): + cluster_table = self.vocab.load_extra_lookups("lexeme_cluster") + cluster_table[self.c.orth] = x property lang: """RETURNS (uint64): Language of the parent vocabulary.""" @@ -293,10 +274,14 @@ cdef class Lexeme: """RETURNS (float): Smoothed log probability estimate of the lexeme's type.""" def __get__(self): - return self.c.prob + prob_table = self.vocab.load_extra_lookups("lexeme_prob") + settings_table = self.vocab.load_extra_lookups("lexeme_settings") + default_oov_prob = settings_table.get("oov_prob", -20.0) + return prob_table.get(self.c.orth, default_oov_prob) def __set__(self, float x): - self.c.prob = x + prob_table = 
self.vocab.load_extra_lookups("lexeme_prob") + prob_table[self.c.orth] = x property lower_: """RETURNS (unicode): Lowercase form of the word.""" @@ -314,7 +299,7 @@ cdef class Lexeme: return self.vocab.strings[self.c.norm] def __set__(self, unicode x): - self.c.norm = self.vocab.strings.add(x) + self.norm = self.vocab.strings.add(x) property shape_: """RETURNS (unicode): Transform of the word's string, to show @@ -362,13 +347,10 @@ cdef class Lexeme: def __set__(self, flags_t x): self.c.flags = x - property is_oov: + @property + def is_oov(self): """RETURNS (bool): Whether the lexeme is out-of-vocabulary.""" - def __get__(self): - return Lexeme.c_check_flag(self.c, IS_OOV) - - def __set__(self, attr_t x): - Lexeme.c_set_flag(self.c, IS_OOV, x) + return self.orth in self.vocab.vectors property is_stop: """RETURNS (bool): Whether the lexeme is a stop word.""" diff --git a/spacy/lookups.py b/spacy/lookups.py index bf250b4b4..1fa29bdfe 100644 --- a/spacy/lookups.py +++ b/spacy/lookups.py @@ -124,7 +124,7 @@ class Lookups(object): self._tables[key].update(value) return self - def to_disk(self, path, **kwargs): + def to_disk(self, path, filename="lookups.bin", **kwargs): """Save the lookups to a directory as lookups.bin. Expects a path to a directory, which will be created if it doesn't exist. @@ -136,11 +136,11 @@ class Lookups(object): path = ensure_path(path) if not path.exists(): path.mkdir() - filepath = path / "lookups.bin" + filepath = path / filename with filepath.open("wb") as file_: file_.write(self.to_bytes()) - def from_disk(self, path, **kwargs): + def from_disk(self, path, filename="lookups.bin", **kwargs): """Load lookups from a directory containing a lookups.bin. Will skip loading if the file doesn't exist. 
@@ -150,7 +150,7 @@ class Lookups(object): DOCS: https://spacy.io/api/lookups#from_disk """ path = ensure_path(path) - filepath = path / "lookups.bin" + filepath = path / filename if filepath.exists(): with filepath.open("rb") as file_: data = file_.read() diff --git a/spacy/structs.pxd b/spacy/structs.pxd index b8e63a725..1f5f32675 100644 --- a/spacy/structs.pxd +++ b/spacy/structs.pxd @@ -23,29 +23,6 @@ cdef struct LexemeC: attr_t prefix attr_t suffix - attr_t cluster - - float prob - float sentiment - - -cdef struct SerializedLexemeC: - unsigned char[8 + 8*10 + 4 + 4] data - # sizeof(flags_t) # flags - # + sizeof(attr_t) # lang - # + sizeof(attr_t) # id - # + sizeof(attr_t) # length - # + sizeof(attr_t) # orth - # + sizeof(attr_t) # lower - # + sizeof(attr_t) # norm - # + sizeof(attr_t) # shape - # + sizeof(attr_t) # prefix - # + sizeof(attr_t) # suffix - # + sizeof(attr_t) # cluster - # + sizeof(float) # prob - # + sizeof(float) # cluster - # + sizeof(float) # l2_norm - cdef struct SpanC: hash_t id diff --git a/spacy/symbols.pxd b/spacy/symbols.pxd index 9229c9970..ebb87c8d2 100644 --- a/spacy/symbols.pxd +++ b/spacy/symbols.pxd @@ -12,7 +12,7 @@ cdef enum symbol_t: LIKE_NUM LIKE_EMAIL IS_STOP - IS_OOV + IS_OOV_DEPRECATED IS_BRACKET IS_QUOTE IS_LEFT_PUNCT diff --git a/spacy/symbols.pyx b/spacy/symbols.pyx index e438caba5..83a9d0482 100644 --- a/spacy/symbols.pyx +++ b/spacy/symbols.pyx @@ -17,7 +17,7 @@ IDS = { "LIKE_NUM": LIKE_NUM, "LIKE_EMAIL": LIKE_EMAIL, "IS_STOP": IS_STOP, - "IS_OOV": IS_OOV, + "IS_OOV_DEPRECATED": IS_OOV_DEPRECATED, "IS_BRACKET": IS_BRACKET, "IS_QUOTE": IS_QUOTE, "IS_LEFT_PUNCT": IS_LEFT_PUNCT, diff --git a/spacy/tests/lang/da/test_exceptions.py b/spacy/tests/lang/da/test_exceptions.py index 837ceb323..503399ee4 100644 --- a/spacy/tests/lang/da/test_exceptions.py +++ b/spacy/tests/lang/da/test_exceptions.py @@ -37,14 +37,6 @@ def test_da_tokenizer_handles_custom_base_exc(da_tokenizer): assert tokens[7].text == "." 
-@pytest.mark.parametrize( - "text,norm", [("akvarium", "akvarie"), ("bedstemoder", "bedstemor")] -) -def test_da_tokenizer_norm_exceptions(da_tokenizer, text, norm): - tokens = da_tokenizer(text) - assert tokens[0].norm_ == norm - - @pytest.mark.parametrize( "text,n_tokens", [ diff --git a/spacy/tests/lang/de/test_exceptions.py b/spacy/tests/lang/de/test_exceptions.py index 2e065870e..3b464e1ae 100644 --- a/spacy/tests/lang/de/test_exceptions.py +++ b/spacy/tests/lang/de/test_exceptions.py @@ -22,17 +22,3 @@ def test_de_tokenizer_handles_exc_in_text(de_tokenizer): assert len(tokens) == 6 assert tokens[2].text == "z.Zt." assert tokens[2].lemma_ == "zur Zeit" - - -@pytest.mark.parametrize( - "text,norms", [("vor'm", ["vor", "dem"]), ("du's", ["du", "es"])] -) -def test_de_tokenizer_norm_exceptions(de_tokenizer, text, norms): - tokens = de_tokenizer(text) - assert [token.norm_ for token in tokens] == norms - - -@pytest.mark.parametrize("text,norm", [("daß", "dass")]) -def test_de_lex_attrs_norm_exceptions(de_tokenizer, text, norm): - tokens = de_tokenizer(text) - assert tokens[0].norm_ == norm diff --git a/spacy/tests/lang/en/test_exceptions.py b/spacy/tests/lang/en/test_exceptions.py index 6285a9408..a78e1815f 100644 --- a/spacy/tests/lang/en/test_exceptions.py +++ b/spacy/tests/lang/en/test_exceptions.py @@ -118,6 +118,7 @@ def test_en_tokenizer_norm_exceptions(en_tokenizer, text, norms): assert [token.norm_ for token in tokens] == norms +@pytest.mark.skip @pytest.mark.parametrize( "text,norm", [("radicalised", "radicalized"), ("cuz", "because")] ) diff --git a/spacy/tests/lang/lb/test_exceptions.py b/spacy/tests/lang/lb/test_exceptions.py index 7ca2394b7..ebfab75cf 100644 --- a/spacy/tests/lang/lb/test_exceptions.py +++ b/spacy/tests/lang/lb/test_exceptions.py @@ -22,9 +22,3 @@ def test_lb_tokenizer_handles_exc_in_text(lb_tokenizer): assert len(tokens) == 9 assert tokens[1].text == "'t" assert tokens[1].lemma_ == "et" - - -@pytest.mark.parametrize("text,norm", 
[("dass", "datt"), ("viläicht", "vläicht")]) -def test_lb_norm_exceptions(lb_tokenizer, text, norm): - tokens = lb_tokenizer(text) - assert tokens[0].norm_ == norm diff --git a/spacy/tests/serialize/test_serialize_vocab_strings.py b/spacy/tests/serialize/test_serialize_vocab_strings.py index 1671845ee..63faf44fc 100644 --- a/spacy/tests/serialize/test_serialize_vocab_strings.py +++ b/spacy/tests/serialize/test_serialize_vocab_strings.py @@ -2,6 +2,7 @@ from __future__ import unicode_literals import pytest +import pickle from spacy.vocab import Vocab from spacy.strings import StringStore @@ -36,8 +37,8 @@ def test_serialize_vocab_roundtrip_bytes(strings1, strings2): assert vocab1.to_bytes() == vocab1_b new_vocab1 = Vocab().from_bytes(vocab1_b) assert new_vocab1.to_bytes() == vocab1_b - assert len(new_vocab1) == len(strings1) - assert sorted([lex.text for lex in new_vocab1]) == sorted(strings1) + assert len(new_vocab1.strings) == len(strings1) + 1 # adds _SP + assert sorted([s for s in new_vocab1.strings]) == sorted(strings1 + ["_SP"]) @pytest.mark.parametrize("strings1,strings2", test_strings) @@ -51,12 +52,13 @@ def test_serialize_vocab_roundtrip_disk(strings1, strings2): vocab2.to_disk(file_path2) vocab1_d = Vocab().from_disk(file_path1) vocab2_d = Vocab().from_disk(file_path2) - assert list(vocab1_d) == list(vocab1) - assert list(vocab2_d) == list(vocab2) + # check strings rather than lexemes, which are only reloaded on demand + assert strings1 == [s for s in vocab1_d.strings if s != "_SP"] + assert strings2 == [s for s in vocab2_d.strings if s != "_SP"] if strings1 == strings2: - assert list(vocab1_d) == list(vocab2_d) + assert [s for s in vocab1_d.strings if s != "_SP"] == [s for s in vocab2_d.strings if s != "_SP"] else: - assert list(vocab1_d) != list(vocab2_d) + assert [s for s in vocab1_d.strings if s != "_SP"] != [s for s in vocab2_d.strings if s != "_SP"] @pytest.mark.parametrize("strings,lex_attr", test_strings_attrs) @@ -76,7 +78,7 @@ def 
test_deserialize_vocab_seen_entries(strings, lex_attr): vocab = Vocab(strings=strings) length = len(vocab) vocab.from_bytes(vocab.to_bytes()) - assert len(vocab) == length + assert len(vocab.strings) == len(strings) + 1 # adds _SP @pytest.mark.parametrize("strings,lex_attr", test_strings_attrs) @@ -127,3 +129,11 @@ def test_serialize_stringstore_roundtrip_disk(strings1, strings2): assert list(sstore1_d) == list(sstore2_d) else: assert list(sstore1_d) != list(sstore2_d) + +@pytest.mark.parametrize("strings,lex_attr", test_strings_attrs) +def test_pickle_vocab(strings, lex_attr): + vocab = Vocab(strings=strings) + vocab[strings[0]].norm_ = lex_attr + vocab_pickled = pickle.dumps(vocab) + vocab_unpickled = pickle.loads(vocab_pickled) + assert vocab.to_bytes() == vocab_unpickled.to_bytes() diff --git a/spacy/tests/test_lemmatizer.py b/spacy/tests/test_lemmatizer.py index 701222afc..bcda2999a 100644 --- a/spacy/tests/test_lemmatizer.py +++ b/spacy/tests/test_lemmatizer.py @@ -26,7 +26,7 @@ def test_lemmatizer_reflects_lookups_changes(): nlp_bytes = nlp.to_bytes() new_nlp.from_bytes(nlp_bytes) # Make sure we have the previously saved lookup table - assert len(new_nlp.vocab.lookups) == 1 + assert "lemma_lookup" in new_nlp.vocab.lookups assert len(new_nlp.vocab.lookups.get_table("lemma_lookup")) == 2 assert new_nlp.vocab.lookups.get_table("lemma_lookup")["hello"] == "world" assert Doc(new_nlp.vocab, words=["foo"])[0].lemma_ == "bar" diff --git a/spacy/tests/vocab_vectors/test_lexeme.py b/spacy/tests/vocab_vectors/test_lexeme.py index b57c6705a..af73a79bf 100644 --- a/spacy/tests/vocab_vectors/test_lexeme.py +++ b/spacy/tests/vocab_vectors/test_lexeme.py @@ -60,19 +60,6 @@ def test_vocab_lexeme_add_flag_provided_id(en_vocab): assert en_vocab["dogs"].check_flag(is_len4) is True -def test_lexeme_bytes_roundtrip(en_vocab): - one = en_vocab["one"] - alpha = en_vocab["alpha"] - assert one.orth != alpha.orth - assert one.lower != alpha.lower - alpha.from_bytes(one.to_bytes()) - - 
assert one.orth_ == alpha.orth_ - assert one.orth == alpha.orth - assert one.lower == alpha.lower - assert one.lower_ == alpha.lower_ - - def test_vocab_lexeme_oov_rank(en_vocab): """Test that default rank is OOV_RANK.""" lex = en_vocab["word"] diff --git a/spacy/tests/vocab_vectors/test_lookups.py b/spacy/tests/vocab_vectors/test_lookups.py index f78dd33c4..af15e9e91 100644 --- a/spacy/tests/vocab_vectors/test_lookups.py +++ b/spacy/tests/vocab_vectors/test_lookups.py @@ -119,12 +119,11 @@ def test_lookups_to_from_bytes_via_vocab(): table_name = "test" vocab = Vocab() vocab.lookups.add_table(table_name, {"foo": "bar", "hello": "world"}) - assert len(vocab.lookups) == 1 assert table_name in vocab.lookups vocab_bytes = vocab.to_bytes() new_vocab = Vocab() new_vocab.from_bytes(vocab_bytes) - assert len(new_vocab.lookups) == 1 + assert len(new_vocab.lookups) == len(vocab.lookups) assert table_name in new_vocab.lookups table = new_vocab.lookups.get_table(table_name) assert len(table) == 2 @@ -137,13 +136,12 @@ def test_lookups_to_from_disk_via_vocab(): table_name = "test" vocab = Vocab() vocab.lookups.add_table(table_name, {"foo": "bar", "hello": "world"}) - assert len(vocab.lookups) == 1 assert table_name in vocab.lookups with make_tempdir() as tmpdir: vocab.to_disk(tmpdir) new_vocab = Vocab() new_vocab.from_disk(tmpdir) - assert len(new_vocab.lookups) == 1 + assert len(new_vocab.lookups) == len(vocab.lookups) assert table_name in new_vocab.lookups table = new_vocab.lookups.get_table(table_name) assert len(table) == 2 diff --git a/spacy/tests/vocab_vectors/test_vectors.py b/spacy/tests/vocab_vectors/test_vectors.py index 322ef462a..16d9801ab 100644 --- a/spacy/tests/vocab_vectors/test_vectors.py +++ b/spacy/tests/vocab_vectors/test_vectors.py @@ -329,3 +329,15 @@ def test_vocab_prune_vectors(): neighbour, similarity = list(remap.values())[0] assert neighbour == "cat", remap assert_allclose(similarity, cosine(data[0], data[2]), atol=1e-4, rtol=1e-3) + + +def 
test_vector_is_oov(): + vocab = Vocab(vectors_name="test_vocab_is_oov") + data = numpy.ndarray((5, 3), dtype="f") + data[0] = 1.0 + data[1] = 2.0 + vocab.set_vector("cat", data[0]) + vocab.set_vector("dog", data[1]) + assert vocab["cat"].is_oov is True + assert vocab["dog"].is_oov is True + assert vocab["hamster"].is_oov is False diff --git a/spacy/tokens/token.pyx b/spacy/tokens/token.pyx index b79d2d805..45deebc93 100644 --- a/spacy/tokens/token.pyx +++ b/spacy/tokens/token.pyx @@ -17,7 +17,7 @@ from ..typedefs cimport hash_t from ..lexeme cimport Lexeme from ..attrs cimport IS_ALPHA, IS_ASCII, IS_DIGIT, IS_LOWER, IS_PUNCT, IS_SPACE from ..attrs cimport IS_BRACKET, IS_QUOTE, IS_LEFT_PUNCT, IS_RIGHT_PUNCT -from ..attrs cimport IS_OOV, IS_TITLE, IS_UPPER, IS_CURRENCY, LIKE_URL, LIKE_NUM, LIKE_EMAIL +from ..attrs cimport IS_TITLE, IS_UPPER, IS_CURRENCY, LIKE_URL, LIKE_NUM, LIKE_EMAIL from ..attrs cimport IS_STOP, ID, ORTH, NORM, LOWER, SHAPE, PREFIX, SUFFIX from ..attrs cimport LENGTH, CLUSTER, LEMMA, POS, TAG, DEP from ..symbols cimport conj @@ -259,7 +259,7 @@ cdef class Token: @property def prob(self): """RETURNS (float): Smoothed log probability estimate of token type.""" - return self.c.lex.prob + return self.vocab[self.c.lex.orth].prob @property def sentiment(self): @@ -267,7 +267,7 @@ cdef class Token: negativity of the token.""" if "sentiment" in self.doc.user_token_hooks: return self.doc.user_token_hooks["sentiment"](self) - return self.c.lex.sentiment + return self.vocab[self.c.lex.orth].sentiment @property def lang(self): @@ -286,7 +286,7 @@ cdef class Token: @property def cluster(self): """RETURNS (int): Brown cluster ID.""" - return self.c.lex.cluster + return self.vocab[self.c.lex.orth].cluster @property def orth(self): @@ -923,7 +923,7 @@ cdef class Token: @property def is_oov(self): """RETURNS (bool): Whether the token is out-of-vocabulary.""" - return Lexeme.c_check_flag(self.c.lex, IS_OOV) + return self.c.lex.orth in self.vocab.vectors @property 
def is_stop(self): diff --git a/spacy/vocab.pxd b/spacy/vocab.pxd index d989d6c40..73754eb02 100644 --- a/spacy/vocab.pxd +++ b/spacy/vocab.pxd @@ -30,6 +30,7 @@ cdef class Vocab: cpdef public Morphology morphology cpdef public object vectors cpdef public object lookups + cpdef public object lookups_extra cdef readonly int length cdef public object data_dir cdef public object lex_attr_getters diff --git a/spacy/vocab.pyx b/spacy/vocab.pyx index ef2e86bcc..68f0ac0db 100644 --- a/spacy/vocab.pyx +++ b/spacy/vocab.pyx @@ -11,8 +11,7 @@ from .lexeme cimport EMPTY_LEXEME, OOV_RANK from .lexeme cimport Lexeme from .typedefs cimport attr_t from .tokens.token cimport Token -from .attrs cimport PROB, LANG, ORTH, TAG, POS -from .structs cimport SerializedLexemeC +from .attrs cimport LANG, ORTH, TAG, POS from .compat import copy_reg, basestring_ from .errors import Errors @@ -22,6 +21,8 @@ from .vectors import Vectors from ._ml import link_vectors_to_models from .lookups import Lookups from . import util +from .lang.norm_exceptions import BASE_NORMS +from .lang.lex_attrs import LEX_ATTRS cdef class Vocab: @@ -32,8 +33,8 @@ cdef class Vocab: DOCS: https://spacy.io/api/vocab """ def __init__(self, lex_attr_getters=None, tag_map=None, lemmatizer=None, - strings=tuple(), lookups=None, oov_prob=-20., vectors_name=None, - **deprecated_kwargs): + strings=tuple(), lookups=None, lookups_extra=None, + oov_prob=-20., vectors_name=None, **deprecated_kwargs): """Create the vocabulary. lex_attr_getters (dict): A dictionary mapping attribute IDs to @@ -44,6 +45,7 @@ cdef class Vocab: strings (StringStore): StringStore that maps strings to integers, and vice versa. lookups (Lookups): Container for large lookup tables and dictionaries. + lookups_extra (Lookups): Container for optional lookup tables and dictionaries. name (unicode): Optional name to identify the vectors table. RETURNS (Vocab): The newly constructed object. 
""" @@ -51,8 +53,12 @@ cdef class Vocab: tag_map = tag_map if tag_map is not None else {} if lookups in (None, True, False): lookups = Lookups() + if "lexeme_norm" not in lookups: + lookups.add_table("lexeme_norm") if lemmatizer in (None, True, False): lemmatizer = Lemmatizer(lookups) + if lookups_extra in (None, True, False): + lookups_extra = Lookups() self.cfg = {'oov_prob': oov_prob} self.mem = Pool() self._by_orth = PreshMap() @@ -65,6 +71,7 @@ cdef class Vocab: self.morphology = Morphology(self.strings, tag_map, lemmatizer) self.vectors = Vectors(name=vectors_name) self.lookups = lookups + self.lookups_extra = lookups_extra @property def lang(self): @@ -173,9 +180,7 @@ cdef class Vocab: value = func(string) if isinstance(value, unicode): value = self.strings.add(value) - if attr == PROB: - lex.prob = value - elif value is not None: + if value is not None: Lexeme.set_struct_attr(lex, attr, value) if not is_oov: self._add_lex_to_vocab(lex.orth, lex) @@ -435,17 +440,16 @@ cdef class Vocab: path = util.ensure_path(path) if not path.exists(): path.mkdir() - setters = ["strings", "lexemes", "vectors"] + setters = ["strings", "vectors"] exclude = util.get_serialization_exclude(setters, exclude, kwargs) if "strings" not in exclude: self.strings.to_disk(path / "strings.json") - if "lexemes" not in exclude: - with (path / "lexemes.bin").open("wb") as file_: - file_.write(self.lexemes_to_bytes()) if "vectors" not in "exclude" and self.vectors is not None: self.vectors.to_disk(path) if "lookups" not in "exclude" and self.lookups is not None: self.lookups.to_disk(path) + if "lookups_extra" not in "exclude" and self.lookups_extra is not None: + self.lookups_extra.to_disk(path, filename="lookups_extra.bin") def from_disk(self, path, exclude=tuple(), **kwargs): """Loads state from a directory. 
Modifies the object in place and @@ -458,13 +462,10 @@ cdef class Vocab: DOCS: https://spacy.io/api/vocab#to_disk """ path = util.ensure_path(path) - getters = ["strings", "lexemes", "vectors"] + getters = ["strings", "vectors"] exclude = util.get_serialization_exclude(getters, exclude, kwargs) if "strings" not in exclude: self.strings.from_disk(path / "strings.json") # TODO: add exclude? - if "lexemes" not in exclude: - with (path / "lexemes.bin").open("rb") as file_: - self.lexemes_from_bytes(file_.read()) if "vectors" not in exclude: if self.vectors is not None: self.vectors.from_disk(path, exclude=["strings"]) @@ -472,6 +473,14 @@ cdef class Vocab: link_vectors_to_models(self) if "lookups" not in exclude: self.lookups.from_disk(path) + if "lookups_extra" not in exclude: + self.lookups_extra.from_disk(path, filename="lookups_extra.bin") + if "lexeme_norm" in self.lookups: + self.lex_attr_getters[NORM] = util.add_lookups( + self.lex_attr_getters.get(NORM, LEX_ATTRS[NORM]), self.lookups.get_table("lexeme_norm") + ) + self.length = 0 + self._by_orth = PreshMap() return self def to_bytes(self, exclude=tuple(), **kwargs): @@ -490,9 +499,9 @@ cdef class Vocab: getters = OrderedDict(( ("strings", lambda: self.strings.to_bytes()), - ("lexemes", lambda: self.lexemes_to_bytes()), ("vectors", deserialize_vectors), - ("lookups", lambda: self.lookups.to_bytes()) + ("lookups", lambda: self.lookups.to_bytes()), + ("lookups_extra", lambda: self.lookups_extra.to_bytes()) )) exclude = util.get_serialization_exclude(getters, exclude, kwargs) return util.to_bytes(getters, exclude) @@ -514,99 +523,62 @@ cdef class Vocab: setters = OrderedDict(( ("strings", lambda b: self.strings.from_bytes(b)), - ("lexemes", lambda b: self.lexemes_from_bytes(b)), ("vectors", lambda b: serialize_vectors(b)), - ("lookups", lambda b: self.lookups.from_bytes(b)) + ("lookups", lambda b: self.lookups.from_bytes(b)), + ("lookups_extra", lambda b: self.lookups_extra.from_bytes(b)) )) exclude = 
util.get_serialization_exclude(setters, exclude, kwargs) util.from_bytes(bytes_data, setters, exclude) + if "lexeme_norm" in self.lookups: + self.lex_attr_getters[NORM] = util.add_lookups( + self.lex_attr_getters.get(NORM, LEX_ATTRS[NORM]), self.lookups.get_table("lexeme_norm") + ) + self.length = 0 + self._by_orth = PreshMap() if self.vectors.name is not None: link_vectors_to_models(self) return self - def lexemes_to_bytes(self): - cdef hash_t key - cdef size_t addr - cdef LexemeC* lexeme = NULL - cdef SerializedLexemeC lex_data - cdef int size = 0 - for key, addr in self._by_orth.items(): - if addr == 0: - continue - size += sizeof(lex_data.data) - byte_string = b"\0" * size - byte_ptr = byte_string - cdef int j - cdef int i = 0 - for key, addr in self._by_orth.items(): - if addr == 0: - continue - lexeme = addr - lex_data = Lexeme.c_to_bytes(lexeme) - for j in range(sizeof(lex_data.data)): - byte_ptr[i] = lex_data.data[j] - i += 1 - return byte_string - - def lexemes_from_bytes(self, bytes bytes_data): - """Load the binary vocabulary data from the given string.""" - cdef LexemeC* lexeme - cdef hash_t key - cdef unicode py_str - cdef int i = 0 - cdef int j = 0 - cdef SerializedLexemeC lex_data - chunk_size = sizeof(lex_data.data) - cdef void* ptr - cdef unsigned char* bytes_ptr = bytes_data - for i in range(0, len(bytes_data), chunk_size): - lexeme = self.mem.alloc(1, sizeof(LexemeC)) - for j in range(sizeof(lex_data.data)): - lex_data.data[j] = bytes_ptr[i+j] - Lexeme.c_from_bytes(lexeme, lex_data) - prev_entry = self._by_orth.get(lexeme.orth) - if prev_entry != NULL: - memcpy(prev_entry, lexeme, sizeof(LexemeC)) - continue - ptr = self.strings._map.get(lexeme.orth) - if ptr == NULL: - continue - py_str = self.strings[lexeme.orth] - if self.strings[py_str] != lexeme.orth: - raise ValueError(Errors.E086.format(string=py_str, - orth_id=lexeme.orth, - hash_id=self.strings[py_str])) - self._by_orth.set(lexeme.orth, lexeme) - self.length += 1 - def _reset_cache(self, 
keys, strings): # I'm not sure this made sense. Disable it for now. raise NotImplementedError + def load_extra_lookups(self, table_name): + if table_name not in self.lookups_extra: + if self.lang + "_extra" in util.registry.lookups: + tables = util.registry.lookups.get(self.lang + "_extra") + for name, filename in tables.items(): + if table_name == name: + data = util.load_language_data(filename) + self.lookups_extra.add_table(name, data) + if table_name not in self.lookups_extra: + self.lookups_extra.add_table(table_name) + return self.lookups_extra.get_table(table_name) + + def pickle_vocab(vocab): sstore = vocab.strings vectors = vocab.vectors morph = vocab.morphology - length = vocab.length data_dir = vocab.data_dir lex_attr_getters = srsly.pickle_dumps(vocab.lex_attr_getters) - lexemes_data = vocab.lexemes_to_bytes() + lookups = vocab.lookups + lookups_extra = vocab.lookups_extra return (unpickle_vocab, - (sstore, vectors, morph, data_dir, lex_attr_getters, lexemes_data, length)) + (sstore, vectors, morph, data_dir, lex_attr_getters, lookups, lookups_extra)) def unpickle_vocab(sstore, vectors, morphology, data_dir, - lex_attr_getters, bytes lexemes_data, int length): + lex_attr_getters, lookups, lookups_extra): cdef Vocab vocab = Vocab() - vocab.length = length vocab.vectors = vectors vocab.strings = sstore vocab.morphology = morphology vocab.data_dir = data_dir vocab.lex_attr_getters = srsly.pickle_loads(lex_attr_getters) - vocab.lexemes_from_bytes(lexemes_data) - vocab.length = length + vocab.lookups = lookups + vocab.lookups_extra = lookups_extra return vocab