Reorganise English language data

ines 2017-05-08 15:47:25 +02:00
parent 1bf9d5ec8b
commit c7c21b980f
8 changed files with 242 additions and 582 deletions

View File

@@ -1,14 +1,16 @@
# coding: utf8
from __future__ import unicode_literals
from ..language import Language
from ..lemmatizer import Lemmatizer
from ..vocab import Vocab
from ..tokenizer import Tokenizer
from ..attrs import LANG
from ..deprecated import fix_glove_vectors_loading
from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
from .tag_map import TAG_MAP
from .stop_words import STOP_WORDS
from .morph_rules import MORPH_RULES
from .lemmatizer import LEMMA_RULES, LEMMA_INDEX, LEMMA_EXC
from .language_data import *
from ..language_data import BASE_EXCEPTIONS
from ..language import Language
from ..attrs import LANG
from ..util import update_exc
class English(Language):
@@ -18,20 +20,13 @@ class English(Language):
lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
lex_attr_getters[LANG] = lambda text: 'en'
tokenizer_exceptions = TOKENIZER_EXCEPTIONS
tag_map = TAG_MAP
stop_words = STOP_WORDS
tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS)
tag_map = dict(TAG_MAP)
stop_words = set(STOP_WORDS)
morph_rules = dict(MORPH_RULES)
lemma_rules = dict(LEMMA_RULES)
lemma_index = dict(LEMMA_INDEX)
lemma_exc = dict(LEMMA_EXC)
def __init__(self, **overrides):
# Special-case hack for loading the GloVe vectors, to support <1.0
overrides = fix_glove_vectors_loading(overrides)
Language.__init__(self, **overrides)
EXPORT = English
__all__ = ['English']
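
Note: the explicit merge via update_exc replaces the old star import, so the English defaults now state exactly where their exception data comes from. As a rough sketch (assuming, not quoting, the helper imported from ..util), update_exc behaves like:

def update_exc(base_exceptions, *addition_dicts):
    # Start from the shared base table, then layer the language-specific
    # additions on top; later dicts win on key collisions.
    exc = dict(base_exceptions)
    for additions in addition_dicts:
        exc.update(additions)
    return exc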

View File

@@ -1,3 +1,5 @@
from .lookup import LOOKUP
from ._adjectives import ADJECTIVES
from ._adjectives_irreg import ADJECTIVES_IRREG
from ._adverbs import ADVERBS
@@ -9,25 +11,10 @@ from ._verbs_irreg import VERBS_IRREG
from ._lemma_rules import ADJECTIVE_RULES, NOUN_RULES, VERB_RULES, PUNCT_RULES
INDEX = {
"adj": ADJECTIVES,
"adv": ADVERBS,
"noun": NOUNS,
"verb": VERBS
}
LEMMA_INDEX = {'adj': ADJECTIVES, 'adv': ADVERBS, 'noun': NOUNS, 'verb': VERBS}
LEMMA_EXC = {'adj': ADJECTIVES_IRREG, 'adv': ADVERBS_IRREG, 'noun': NOUNS_IRREG,
'verb': VERBS_IRREG}
EXC = {
"adj": ADJECTIVES_IRREG,
"adv": ADVERBS_IRREG,
"noun": NOUNS_IRREG,
"verb": VERBS_IRREG
}
RULES = {
"adj": ADJECTIVE_RULES,
"noun": NOUN_RULES,
"verb": VERB_RULES,
"punct": PUNCT_RULES
}
LEMMA_RULES = {'adj': ADJECTIVE_RULES, 'noun': NOUN_RULES, 'verb': VERB_RULES,
'punct': PUNCT_RULES}
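
Note: keying the index, exceptions and rules by coarse POS lets them be passed straight to the lemmatizer. A hedged usage sketch (the Lemmatizer constructor and call signature are assumed from the imports in spacy/en/__init__.py, not shown in this diff):

from spacy.lemmatizer import Lemmatizer
from spacy.en.lemmatizer import LEMMA_INDEX, LEMMA_EXC, LEMMA_RULES

lemmatizer = Lemmatizer(LEMMA_INDEX, LEMMA_EXC, LEMMA_RULES)
lemmatizer('eating', 'verb')   # expected to yield set(['eat'])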

View File

@@ -1,7 +1,7 @@
# coding: utf8
from __future__ import unicode_literals
LOOK_UP = {
LOOKUP = {
" furtherst": "further",
" skilled": "skill",
"'cause": "because",

spacy/en/lex_attrs.py Normal file
View File

@@ -0,0 +1,23 @@
# coding: utf8
from __future__ import unicode_literals
# Number words
NUM_WORDS = set("""
zero one two three four five six seven eight nine ten eleven twelve thirteen
fourteen fifteen sixteen seventeen eighteen nineteen twenty thirty forty fifty
sixty seventy eighty ninety hundred thousand million billion trillion
quadrillion gajillion bazillion
""".split())
# Ordinal words
ORDINAL_WORDS = set("""
first second third fourth fifth sixth seventh eighth ninth tenth eleventh twelfth
thirteenth fourteenth fifteenth sixteenth seventeenth eighteenth nineteenth
twentieth thirtieth fortieth fiftieth sixtieth seventieth eightieth ninetieth
hundredth thousandth millionth billionth trillionth quadrillionth gajillionth
bazillionth
""".split())

View File

@@ -1,8 +1,8 @@
# coding: utf8
from __future__ import unicode_literals
from ..symbols import *
from ..language_data import PRON_LEMMA
from ..symbols import LEMMA
from ..deprecated import PRON_LEMMA
MORPH_RULES = {
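
Note: MORPH_RULES (truncated in this hunk) pairs a fine-grained tag with per-form attribute overrides; an illustrative entry, not quoted from the file, looks roughly like:

MORPH_RULES = {
    "PRP": {
        # Assumed shape: surface form mapped to lemma plus morphology.
        "I": {LEMMA: PRON_LEMMA, "PronType": "Prs", "Person": "One",
              "Number": "Sing", "Case": "Nom"}
    }
}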

View File

@@ -67,24 +67,3 @@ whither who whoever whole whom whose why will with within without would
yet you your yours yourself yourselves
""".split())
# Number words
NUM_WORDS = set("""
zero one two three four five six seven eight nine ten eleven twelve thirteen
fourteen fifteen sixteen seventeen eighteen nineteen twenty thirty forty fifty
sixty seventy eighty ninety hundred thousand million billion trillion
quadrillion gajillion bazillion
""".split())
# Ordinal words
ORDINAL_WORDS = set("""
first second third fourth fifth sixth seventh eighth ninth tenth eleventh twelfth
thirteenth fourteenth fifteenth sixteenth seventeenth eighteenth nineteenth
twentieth thirtieth fortieth fiftieth sixtieth seventieth eightieth ninetieth
hundredth thousandth millionth billionth trillionth quadrillionth gajillionth
bazillionth
""".split())

View File

@@ -1,7 +1,8 @@
# coding: utf8
from __future__ import unicode_literals
from ..symbols import *
from ..symbols import POS, PUNCT, SYM, ADJ, CCONJ, NUM, DET, ADV, ADP, X, VERB
from ..symbols import NOUN, PROPN, PART, INTJ, SPACE, PRON
TAG_MAP = {
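
Note: TAG_MAP (truncated in this hunk) maps fine-grained Penn Treebank tags to the coarse symbols now imported explicitly, along the lines of these illustrative entries:

TAG_MAP = {
    "NN": {POS: NOUN, "Number": "sing"},
    "VBZ": {POS: VERB, "VerbForm": "fin", "Tense": "pres"},
    ".": {POS: PUNCT, "PunctType": "peri"}
}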

View File

@@ -1,13 +1,12 @@
# coding: utf8
from __future__ import unicode_literals
from ..symbols import *
from ..language_data import PRON_LEMMA
from ..symbols import ORTH, LEMMA, TAG, NORM
from ..deprecated import PRON_LEMMA
EXC = {}
EXCLUDE_EXC = ["Ill", "ill", "Its", "its", "Hell", "hell", "Shell", "shell",
_exc = {}
_exclude = ["Ill", "ill", "Its", "its", "Hell", "hell", "Shell", "shell",
"Shed", "shed", "were", "Were", "Well", "well", "Whore", "whore"]
@@ -15,193 +14,160 @@ EXCLUDE_EXC = ["Ill", "ill", "Its", "its", "Hell", "hell", "Shell", "shell",
for pron in ["i"]:
for orth in [pron, pron.title()]:
EXC[orth + "'m"] = [
_exc[orth + "'m"] = [
{ORTH: orth, LEMMA: PRON_LEMMA, TAG: "PRP"},
{ORTH: "'m", LEMMA: "be", TAG: "VBP", "tenspect": 1, "number": 1}
]
{ORTH: "'m", LEMMA: "be", TAG: "VBP", "tenspect": 1, "number": 1}]
EXC[orth + "m"] = [
_exc[orth + "m"] = [
{ORTH: orth, LEMMA: PRON_LEMMA, TAG: "PRP"},
{ORTH: "m", LEMMA: "be", TAG: "VBP", "tenspect": 1, "number": 1 }
]
{ORTH: "m", LEMMA: "be", TAG: "VBP", "tenspect": 1, "number": 1 }]
EXC[orth + "'ma"] = [
_exc[orth + "'ma"] = [
{ORTH: orth, LEMMA: PRON_LEMMA, TAG: "PRP"},
{ORTH: "'m", LEMMA: "be", NORM: "am"},
{ORTH: "a", LEMMA: "going to", NORM: "gonna"}
]
{ORTH: "a", LEMMA: "going to", NORM: "gonna"}]
EXC[orth + "ma"] = [
_exc[orth + "ma"] = [
{ORTH: orth, LEMMA: PRON_LEMMA, TAG: "PRP"},
{ORTH: "m", LEMMA: "be", NORM: "am"},
{ORTH: "a", LEMMA: "going to", NORM: "gonna"}
]
{ORTH: "a", LEMMA: "going to", NORM: "gonna"}]
for pron in ["i", "you", "he", "she", "it", "we", "they"]:
for orth in [pron, pron.title()]:
EXC[orth + "'ll"] = [
_exc[orth + "'ll"] = [
{ORTH: orth, LEMMA: PRON_LEMMA, TAG: "PRP"},
{ORTH: "'ll", LEMMA: "will", TAG: "MD"}
]
{ORTH: "'ll", LEMMA: "will", TAG: "MD"}]
EXC[orth + "ll"] = [
_exc[orth + "ll"] = [
{ORTH: orth, LEMMA: PRON_LEMMA, TAG: "PRP"},
{ORTH: "ll", LEMMA: "will", TAG: "MD"}
]
{ORTH: "ll", LEMMA: "will", TAG: "MD"}]
EXC[orth + "'ll've"] = [
_exc[orth + "'ll've"] = [
{ORTH: orth, LEMMA: PRON_LEMMA, TAG: "PRP"},
{ORTH: "'ll", LEMMA: "will", TAG: "MD"},
{ORTH: "'ve", LEMMA: "have", TAG: "VB"}
]
{ORTH: "'ve", LEMMA: "have", TAG: "VB"}]
EXC[orth + "llve"] = [
_exc[orth + "llve"] = [
{ORTH: orth, LEMMA: PRON_LEMMA, TAG: "PRP"},
{ORTH: "ll", LEMMA: "will", TAG: "MD"},
{ORTH: "ve", LEMMA: "have", TAG: "VB"}
]
{ORTH: "ve", LEMMA: "have", TAG: "VB"}]
EXC[orth + "'d"] = [
_exc[orth + "'d"] = [
{ORTH: orth, LEMMA: PRON_LEMMA, TAG: "PRP"},
{ORTH: "'d", LEMMA: "would", TAG: "MD"}
]
{ORTH: "'d", LEMMA: "would", TAG: "MD"}]
EXC[orth + "d"] = [
_exc[orth + "d"] = [
{ORTH: orth, LEMMA: PRON_LEMMA, TAG: "PRP"},
{ORTH: "d", LEMMA: "would", TAG: "MD"}
]
{ORTH: "d", LEMMA: "would", TAG: "MD"}]
EXC[orth + "'d've"] = [
_exc[orth + "'d've"] = [
{ORTH: orth, LEMMA: PRON_LEMMA, TAG: "PRP"},
{ORTH: "'d", LEMMA: "would", TAG: "MD"},
{ORTH: "'ve", LEMMA: "have", TAG: "VB"}
]
{ORTH: "'ve", LEMMA: "have", TAG: "VB"}]
EXC[orth + "dve"] = [
_exc[orth + "dve"] = [
{ORTH: orth, LEMMA: PRON_LEMMA, TAG: "PRP"},
{ORTH: "d", LEMMA: "would", TAG: "MD"},
{ORTH: "ve", LEMMA: "have", TAG: "VB"}
]
{ORTH: "ve", LEMMA: "have", TAG: "VB"}]
for pron in ["i", "you", "we", "they"]:
for orth in [pron, pron.title()]:
EXC[orth + "'ve"] = [
_exc[orth + "'ve"] = [
{ORTH: orth, LEMMA: PRON_LEMMA, TAG: "PRP"},
{ORTH: "'ve", LEMMA: "have", TAG: "VB"}
]
{ORTH: "'ve", LEMMA: "have", TAG: "VB"}]
EXC[orth + "ve"] = [
_exc[orth + "ve"] = [
{ORTH: orth, LEMMA: PRON_LEMMA, TAG: "PRP"},
{ORTH: "ve", LEMMA: "have", TAG: "VB"}
]
{ORTH: "ve", LEMMA: "have", TAG: "VB"}]
for pron in ["you", "we", "they"]:
for orth in [pron, pron.title()]:
EXC[orth + "'re"] = [
_exc[orth + "'re"] = [
{ORTH: orth, LEMMA: PRON_LEMMA, TAG: "PRP"},
{ORTH: "'re", LEMMA: "be", NORM: "are"}
]
{ORTH: "'re", LEMMA: "be", NORM: "are"}]
EXC[orth + "re"] = [
_exc[orth + "re"] = [
{ORTH: orth, LEMMA: PRON_LEMMA, TAG: "PRP"},
{ORTH: "re", LEMMA: "be", NORM: "are", TAG: "VBZ"}
]
{ORTH: "re", LEMMA: "be", NORM: "are", TAG: "VBZ"}]
for pron in ["he", "she", "it"]:
for orth in [pron, pron.title()]:
EXC[orth + "'s"] = [
_exc[orth + "'s"] = [
{ORTH: orth, LEMMA: PRON_LEMMA, TAG: "PRP"},
{ORTH: "'s"}
]
{ORTH: "'s"}]
EXC[orth + "s"] = [
_exc[orth + "s"] = [
{ORTH: orth, LEMMA: PRON_LEMMA, TAG: "PRP"},
{ORTH: "s"}
]
{ORTH: "s"}]
# W-words, relative pronouns, prepositions etc.
for word in ["who", "what", "when", "where", "why", "how", "there", "that"]:
for orth in [word, word.title()]:
EXC[orth + "'s"] = [
_exc[orth + "'s"] = [
{ORTH: orth, LEMMA: word},
{ORTH: "'s"}
]
{ORTH: "'s"}]
EXC[orth + "s"] = [
_exc[orth + "s"] = [
{ORTH: orth, LEMMA: word},
{ORTH: "s"}
]
{ORTH: "s"}]
EXC[orth + "'ll"] = [
_exc[orth + "'ll"] = [
{ORTH: orth, LEMMA: word},
{ORTH: "'ll", LEMMA: "will", TAG: "MD"}
]
{ORTH: "'ll", LEMMA: "will", TAG: "MD"}]
EXC[orth + "ll"] = [
_exc[orth + "ll"] = [
{ORTH: orth, LEMMA: word},
{ORTH: "ll", LEMMA: "will", TAG: "MD"}
]
{ORTH: "ll", LEMMA: "will", TAG: "MD"}]
EXC[orth + "'ll've"] = [
_exc[orth + "'ll've"] = [
{ORTH: orth, LEMMA: word},
{ORTH: "'ll", LEMMA: "will", TAG: "MD"},
{ORTH: "'ve", LEMMA: "have", TAG: "VB"}
]
{ORTH: "'ve", LEMMA: "have", TAG: "VB"}]
EXC[orth + "llve"] = [
_exc[orth + "llve"] = [
{ORTH: orth, LEMMA: word},
{ORTH: "ll", LEMMA: "will", TAG: "MD"},
{ORTH: "ve", LEMMA: "have", TAG: "VB"}
]
{ORTH: "ve", LEMMA: "have", TAG: "VB"}]
EXC[orth + "'re"] = [
_exc[orth + "'re"] = [
{ORTH: orth, LEMMA: word},
{ORTH: "'re", LEMMA: "be", NORM: "are"}
]
{ORTH: "'re", LEMMA: "be", NORM: "are"}]
EXC[orth + "re"] = [
_exc[orth + "re"] = [
{ORTH: orth, LEMMA: word},
{ORTH: "re", LEMMA: "be", NORM: "are"}
]
{ORTH: "re", LEMMA: "be", NORM: "are"}]
EXC[orth + "'ve"] = [
_exc[orth + "'ve"] = [
{ORTH: orth},
{ORTH: "'ve", LEMMA: "have", TAG: "VB"}
]
{ORTH: "'ve", LEMMA: "have", TAG: "VB"}]
EXC[orth + "ve"] = [
_exc[orth + "ve"] = [
{ORTH: orth, LEMMA: word},
{ORTH: "'ve", LEMMA: "have", TAG: "VB"}
]
{ORTH: "ve", LEMMA: "have", TAG: "VB"}]
EXC[orth + "'d"] = [
_exc[orth + "'d"] = [
{ORTH: orth, LEMMA: word},
{ORTH: "'d"}
]
{ORTH: "'d"}]
EXC[orth + "d"] = [
_exc[orth + "d"] = [
{ORTH: orth, LEMMA: word},
{ORTH: "d"}
]
{ORTH: "d"}]
EXC[orth + "'d've"] = [
_exc[orth + "'d've"] = [
{ORTH: orth, LEMMA: word},
{ORTH: "'d", LEMMA: "would", TAG: "MD"},
{ORTH: "'ve", LEMMA: "have", TAG: "VB"}
]
{ORTH: "'ve", LEMMA: "have", TAG: "VB"}]
EXC[orth + "dve"] = [
_exc[orth + "dve"] = [
{ORTH: orth, LEMMA: word},
{ORTH: "d", LEMMA: "would", TAG: "MD"},
{ORTH: "ve", LEMMA: "have", TAG: "VB"}
]
{ORTH: "ve", LEMMA: "have", TAG: "VB"}]
# Verbs
@@ -221,54 +187,44 @@ for verb_data in [
{ORTH: "sha", LEMMA: "shall", TAG: "MD"},
{ORTH: "should", TAG: "MD"},
{ORTH: "wo", LEMMA: "will", TAG: "MD"},
{ORTH: "would", TAG: "MD"}
]:
{ORTH: "would", TAG: "MD"}]:
verb_data_tc = dict(verb_data)
verb_data_tc[ORTH] = verb_data_tc[ORTH].title()
for data in [verb_data, verb_data_tc]:
EXC[data[ORTH] + "n't"] = [
_exc[data[ORTH] + "n't"] = [
dict(data),
{ORTH: "n't", LEMMA: "not", TAG: "RB"}
]
{ORTH: "n't", LEMMA: "not", TAG: "RB"}]
EXC[data[ORTH] + "nt"] = [
_exc[data[ORTH] + "nt"] = [
dict(data),
{ORTH: "nt", LEMMA: "not", TAG: "RB"}
]
{ORTH: "nt", LEMMA: "not", TAG: "RB"}]
EXC[data[ORTH] + "n't've"] = [
_exc[data[ORTH] + "n't've"] = [
dict(data),
{ORTH: "n't", LEMMA: "not", TAG: "RB"},
{ORTH: "'ve", LEMMA: "have", TAG: "VB"}
]
{ORTH: "'ve", LEMMA: "have", TAG: "VB"}]
EXC[data[ORTH] + "ntve"] = [
_exc[data[ORTH] + "ntve"] = [
dict(data),
{ORTH: "nt", LEMMA: "not", TAG: "RB"},
{ORTH: "ve", LEMMA: "have", TAG: "VB"}
]
{ORTH: "ve", LEMMA: "have", TAG: "VB"}]
for verb_data in [
{ORTH: "could", TAG: "MD"},
{ORTH: "might"},
{ORTH: "must"},
{ORTH: "should"}
]:
{ORTH: "should"}]:
verb_data_tc = dict(verb_data)
verb_data_tc[ORTH] = verb_data_tc[ORTH].title()
for data in [verb_data, verb_data_tc]:
EXC[data[ORTH] + "'ve"] = [
_exc[data[ORTH] + "'ve"] = [
dict(data),
{ORTH: "'ve", LEMMA: "have", TAG: "VB"}
]
{ORTH: "'ve", LEMMA: "have", TAG: "VB"}]
EXC[data[ORTH] + "ve"] = [
_exc[data[ORTH] + "ve"] = [
dict(data),
{ORTH: "ve", LEMMA: "have", TAG: "VB"}
]
{ORTH: "ve", LEMMA: "have", TAG: "VB"}]
for verb_data in [
@@ -276,22 +232,17 @@ for verb_data in [
{ORTH: "are", LEMMA: "be", TAG: "VBP", "number": 2},
{ORTH: "is", LEMMA: "be", TAG: "VBZ"},
{ORTH: "was", LEMMA: "be"},
{ORTH: "were", LEMMA: "be"}
]:
{ORTH: "were", LEMMA: "be"}]:
verb_data_tc = dict(verb_data)
verb_data_tc[ORTH] = verb_data_tc[ORTH].title()
for data in [verb_data, verb_data_tc]:
EXC[data[ORTH] + "n't"] = [
_exc[data[ORTH] + "n't"] = [
dict(data),
{ORTH: "n't", LEMMA: "not", TAG: "RB"}
]
{ORTH: "n't", LEMMA: "not", TAG: "RB"}]
EXC[data[ORTH] + "nt"] = [
_exc[data[ORTH] + "nt"] = [
dict(data),
{ORTH: "nt", LEMMA: "not", TAG: "RB"}
]
{ORTH: "nt", LEMMA: "not", TAG: "RB"}]
# Other contractions with trailing apostrophe
@@ -302,22 +253,14 @@ for exc_data in [
{ORTH: "nothin", LEMMA: "nothing"},
{ORTH: "nuthin", LEMMA: "nothing"},
{ORTH: "ol", LEMMA: "old"},
{ORTH: "somethin", LEMMA: "something"}
]:
{ORTH: "somethin", LEMMA: "something"}]:
exc_data_tc = dict(exc_data)
exc_data_tc[ORTH] = exc_data_tc[ORTH].title()
for data in [exc_data, exc_data_tc]:
data_apos = dict(data)
data_apos[ORTH] = data_apos[ORTH] + "'"
EXC[data[ORTH]] = [
dict(data)
]
EXC[data_apos[ORTH]] = [
dict(data_apos)
]
_exc[data[ORTH]] = [dict(data)]
_exc[data_apos[ORTH]] = [dict(data_apos)]
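
Note: both the bare and the apostrophised variant are stored, e.g.:

_exc["ol"] == [{ORTH: "ol", LEMMA: "old"}]
_exc["ol'"] == [{ORTH: "ol'", LEMMA: "old"}]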
# Other contractions with leading apostrophe
@@ -326,449 +269,181 @@ for exc_data in [
{ORTH: "cause", LEMMA: "because"},
{ORTH: "em", LEMMA: PRON_LEMMA, NORM: "them"},
{ORTH: "ll", LEMMA: "will"},
{ORTH: "nuff", LEMMA: "enough"}
]:
{ORTH: "nuff", LEMMA: "enough"}]:
exc_data_apos = dict(exc_data)
exc_data_apos[ORTH] = "'" + exc_data_apos[ORTH]
for data in [exc_data, exc_data_apos]:
EXC[data[ORTH]] = [
dict(data)
]
_exc[data[ORTH]] = [dict(data)]
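
Note: as above, each entry is stored with and without its leading apostrophe, e.g.:

_exc["em"] == [{ORTH: "em", LEMMA: PRON_LEMMA, NORM: "them"}]
_exc["'em"] == [{ORTH: "'em", LEMMA: PRON_LEMMA, NORM: "them"}]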
# Times
for h in range(1, 12 + 1):
hour = str(h)
for period in ["a.m.", "am"]:
EXC[hour + period] = [
_exc[hour+period] = [
{ORTH: hour},
{ORTH: period, LEMMA: "a.m."}
]
{ORTH: period, LEMMA: "a.m."}]
for period in ["p.m.", "pm"]:
EXC[hour + period] = [
_exc[hour+period] = [
{ORTH: hour},
{ORTH: period, LEMMA: "p.m."}
]
{ORTH: period, LEMMA: "p.m."}]
# Rest
OTHER = {
" ": [
{ORTH: " ", TAG: "SP"}
],
"\u00a0": [
{ORTH: "\u00a0", TAG: "SP", LEMMA: " "}
],
"'S": [
{ORTH: "'S", LEMMA: "'s"}
],
"'s": [
{ORTH: "'s", LEMMA: "'s"}
],
"'re": [
{ORTH: "'re", LEMMA: "be", NORM: "are"}
],
"\u2018S": [
{ORTH: "\u2018S", LEMMA: "'s"}
],
"\u2018s": [
{ORTH: "\u2018s", LEMMA: "'s"}
],
"and/or": [
{ORTH: "and/or", LEMMA: "and/or", TAG: "CC"}
],
"'Cause": [
{ORTH: "'Cause", LEMMA: "because"}
],
_other_exc = {
"y'all": [
{ORTH: "y'", LEMMA: PRON_LEMMA, NORM: "you"},
{ORTH: "all"}
],
{ORTH: "all"}],
"yall": [
{ORTH: "y", LEMMA: PRON_LEMMA, NORM: "you"},
{ORTH: "all"}
],
"ma'am": [
{ORTH: "ma'am", LEMMA: "madam"}
],
"Ma'am": [
{ORTH: "Ma'am", LEMMA: "madam"}
],
"o'clock": [
{ORTH: "o'clock", LEMMA: "o'clock"}
],
"O'clock": [
{ORTH: "O'clock", LEMMA: "o'clock"}
],
{ORTH: "all"}],
"how'd'y": [
{ORTH: "how", LEMMA: "how"},
{ORTH: "'d", LEMMA: "do"},
{ORTH: "'y", LEMMA: PRON_LEMMA, NORM: "you"}
],
{ORTH: "'y", LEMMA: PRON_LEMMA, NORM: "you"}],
"How'd'y": [
{ORTH: "How", LEMMA: "how"},
{ORTH: "'d", LEMMA: "do"},
{ORTH: "'y", LEMMA: PRON_LEMMA, NORM: "you"}
],
{ORTH: "'y", LEMMA: PRON_LEMMA, NORM: "you"}],
"not've": [
{ORTH: "not", LEMMA: "not", TAG: "RB"},
{ORTH: "'ve", LEMMA: "have", TAG: "VB"}
],
{ORTH: "'ve", LEMMA: "have", TAG: "VB"}],
"notve": [
{ORTH: "not", LEMMA: "not", TAG: "RB"},
{ORTH: "ve", LEMMA: "have", TAG: "VB"}
],
{ORTH: "ve", LEMMA: "have", TAG: "VB"}],
"Not've": [
{ORTH: "Not", LEMMA: "not", TAG: "RB"},
{ORTH: "'ve", LEMMA: "have", TAG: "VB"}
],
{ORTH: "'ve", LEMMA: "have", TAG: "VB"}],
"Notve": [
{ORTH: "Not", LEMMA: "not", TAG: "RB"},
{ORTH: "ve", LEMMA: "have", TAG: "VB"}
],
{ORTH: "ve", LEMMA: "have", TAG: "VB"}],
"cannot": [
{ORTH: "can", LEMMA: "can", TAG: "MD"},
{ORTH: "not", LEMMA: "not", TAG: "RB"}
],
{ORTH: "not", LEMMA: "not", TAG: "RB"}],
"Cannot": [
{ORTH: "Can", LEMMA: "can", TAG: "MD"},
{ORTH: "not", LEMMA: "not", TAG: "RB"}
],
{ORTH: "not", LEMMA: "not", TAG: "RB"}],
"gonna": [
{ORTH: "gon", LEMMA: "go", NORM: "going"},
{ORTH: "na", LEMMA: "to"}
],
{ORTH: "na", LEMMA: "to"}],
"Gonna": [
{ORTH: "Gon", LEMMA: "go", NORM: "going"},
{ORTH: "na", LEMMA: "to"}
],
{ORTH: "na", LEMMA: "to"}],
"gotta": [
{ORTH: "got"},
{ORTH: "ta", LEMMA: "to"}
],
{ORTH: "ta", LEMMA: "to"}],
"Gotta": [
{ORTH: "Got"},
{ORTH: "ta", LEMMA: "to"}
],
{ORTH: "ta", LEMMA: "to"}],
"let's": [
{ORTH: "let"},
{ORTH: "'s", LEMMA: PRON_LEMMA, NORM: "us"}
],
{ORTH: "'s", LEMMA: PRON_LEMMA, NORM: "us"}],
"Let's": [
{ORTH: "Let", LEMMA: "let"},
{ORTH: "'s", LEMMA: PRON_LEMMA, NORM: "us"}
],
"\u2014": [
{ORTH: "\u2014", TAG: ":", LEMMA: "--"}
],
"\n": [
{ORTH: "\n", TAG: "SP"}
],
"\t": [
{ORTH: "\t", TAG: "SP"}
]
{ORTH: "'s", LEMMA: PRON_LEMMA, NORM: "us"}]
}
# Abbreviations
ABBREVIATIONS = {
"Mt.": [
{ORTH: "Mt.", LEMMA: "Mount"}
],
"Ak.": [
{ORTH: "Ak.", LEMMA: "Alaska"}
],
"Ala.": [
{ORTH: "Ala.", LEMMA: "Alabama"}
],
"Apr.": [
{ORTH: "Apr.", LEMMA: "April"}
],
"Ariz.": [
{ORTH: "Ariz.", LEMMA: "Arizona"}
],
"Ark.": [
{ORTH: "Ark.", LEMMA: "Arkansas"}
],
"Aug.": [
{ORTH: "Aug.", LEMMA: "August"}
],
"Calif.": [
{ORTH: "Calif.", LEMMA: "California"}
],
"Colo.": [
{ORTH: "Colo.", LEMMA: "Colorado"}
],
"Conn.": [
{ORTH: "Conn.", LEMMA: "Connecticut"}
],
"Dec.": [
{ORTH: "Dec.", LEMMA: "December"}
],
"Del.": [
{ORTH: "Del.", LEMMA: "Delaware"}
],
"Feb.": [
{ORTH: "Feb.", LEMMA: "February"}
],
"Fla.": [
{ORTH: "Fla.", LEMMA: "Florida"}
],
"Ga.": [
{ORTH: "Ga.", LEMMA: "Georgia"}
],
"Ia.": [
{ORTH: "Ia.", LEMMA: "Iowa"}
],
"Id.": [
{ORTH: "Id.", LEMMA: "Idaho"}
],
"Ill.": [
{ORTH: "Ill.", LEMMA: "Illinois"}
],
"Ind.": [
{ORTH: "Ind.", LEMMA: "Indiana"}
],
"Jan.": [
{ORTH: "Jan.", LEMMA: "January"}
],
"Jul.": [
{ORTH: "Jul.", LEMMA: "July"}
],
"Jun.": [
{ORTH: "Jun.", LEMMA: "June"}
],
"Kan.": [
{ORTH: "Kan.", LEMMA: "Kansas"}
],
"Kans.": [
{ORTH: "Kans.", LEMMA: "Kansas"}
],
"Ky.": [
{ORTH: "Ky.", LEMMA: "Kentucky"}
],
"La.": [
{ORTH: "La.", LEMMA: "Louisiana"}
],
"Mar.": [
{ORTH: "Mar.", LEMMA: "March"}
],
"Mass.": [
{ORTH: "Mass.", LEMMA: "Massachusetts"}
],
"May.": [
{ORTH: "May.", LEMMA: "May"}
],
"Mich.": [
{ORTH: "Mich.", LEMMA: "Michigan"}
],
"Minn.": [
{ORTH: "Minn.", LEMMA: "Minnesota"}
],
"Miss.": [
{ORTH: "Miss.", LEMMA: "Mississippi"}
],
"N.C.": [
{ORTH: "N.C.", LEMMA: "North Carolina"}
],
"N.D.": [
{ORTH: "N.D.", LEMMA: "North Dakota"}
],
"N.H.": [
{ORTH: "N.H.", LEMMA: "New Hampshire"}
],
"N.J.": [
{ORTH: "N.J.", LEMMA: "New Jersey"}
],
"N.M.": [
{ORTH: "N.M.", LEMMA: "New Mexico"}
],
"N.Y.": [
{ORTH: "N.Y.", LEMMA: "New York"}
],
"Neb.": [
{ORTH: "Neb.", LEMMA: "Nebraska"}
],
"Nebr.": [
{ORTH: "Nebr.", LEMMA: "Nebraska"}
],
"Nev.": [
{ORTH: "Nev.", LEMMA: "Nevada"}
],
"Nov.": [
{ORTH: "Nov.", LEMMA: "November"}
],
"Oct.": [
{ORTH: "Oct.", LEMMA: "October"}
],
"Okla.": [
{ORTH: "Okla.", LEMMA: "Oklahoma"}
],
"Ore.": [
{ORTH: "Ore.", LEMMA: "Oregon"}
],
"Pa.": [
{ORTH: "Pa.", LEMMA: "Pennsylvania"}
],
"S.C.": [
{ORTH: "S.C.", LEMMA: "South Carolina"}
],
"Sep.": [
{ORTH: "Sep.", LEMMA: "September"}
],
"Sept.": [
{ORTH: "Sept.", LEMMA: "September"}
],
"Tenn.": [
{ORTH: "Tenn.", LEMMA: "Tennessee"}
],
"Va.": [
{ORTH: "Va.", LEMMA: "Virginia"}
],
"Wash.": [
{ORTH: "Wash.", LEMMA: "Washington"}
],
"Wis.": [
{ORTH: "Wis.", LEMMA: "Wisconsin"}
]
}
_exc.update(_other_exc)
TOKENIZER_EXCEPTIONS = dict(EXC)
TOKENIZER_EXCEPTIONS.update(OTHER)
TOKENIZER_EXCEPTIONS.update(ABBREVIATIONS)
for exc_data in [
{ORTH: "'S", LEMMA: "'s"},
{ORTH: "'s", LEMMA: "'s"},
{ORTH: "\u2018S", LEMMA: "'s"},
{ORTH: "\u2018s", LEMMA: "'s"},
{ORTH: "and/or", LEMMA: "and/or", TAG: "CC"},
{ORTH: "'re", LEMMA: "be", NORM: "are"},
{ORTH: "'Cause", LEMMA: "because"},
{ORTH: "'cause", LEMMA: "because"},
{ORTH: "ma'am", LEMMA: "madam"},
{ORTH: "Ma'am", LEMMA: "madam"},
{ORTH: "o'clock", LEMMA: "o'clock"},
{ORTH: "O'clock", LEMMA: "o'clock"},
{ORTH: "Mt.", LEMMA: "Mount"},
{ORTH: "Ak.", LEMMA: "Alaska"},
{ORTH: "Ala.", LEMMA: "Alabama"},
{ORTH: "Apr.", LEMMA: "April"},
{ORTH: "Ariz.", LEMMA: "Arizona"},
{ORTH: "Ark.", LEMMA: "Arkansas"},
{ORTH: "Aug.", LEMMA: "August"},
{ORTH: "Calif.", LEMMA: "California"},
{ORTH: "Colo.", LEMMA: "Colorado"},
{ORTH: "Conn.", LEMMA: "Connecticut"},
{ORTH: "Dec.", LEMMA: "December"},
{ORTH: "Del.", LEMMA: "Delaware"},
{ORTH: "Feb.", LEMMA: "February"},
{ORTH: "Fla.", LEMMA: "Florida"},
{ORTH: "Ga.", LEMMA: "Georgia"},
{ORTH: "Ia.", LEMMA: "Iowa"},
{ORTH: "Id.", LEMMA: "Idaho"},
{ORTH: "Ill.", LEMMA: "Illinois"},
{ORTH: "Ind.", LEMMA: "Indiana"},
{ORTH: "Jan.", LEMMA: "January"},
{ORTH: "Jul.", LEMMA: "July"},
{ORTH: "Jun.", LEMMA: "June"},
{ORTH: "Kan.", LEMMA: "Kansas"},
{ORTH: "Kans.", LEMMA: "Kansas"},
{ORTH: "Ky.", LEMMA: "Kentucky"},
{ORTH: "La.", LEMMA: "Louisiana"},
{ORTH: "Mar.", LEMMA: "March"},
{ORTH: "Mass.", LEMMA: "Massachusetts"},
{ORTH: "May.", LEMMA: "May"},
{ORTH: "Mich.", LEMMA: "Michigan"},
{ORTH: "Minn.", LEMMA: "Minnesota"},
{ORTH: "Miss.", LEMMA: "Mississippi"},
{ORTH: "N.C.", LEMMA: "North Carolina"},
{ORTH: "N.D.", LEMMA: "North Dakota"},
{ORTH: "N.H.", LEMMA: "New Hampshire"},
{ORTH: "N.J.", LEMMA: "New Jersey"},
{ORTH: "N.M.", LEMMA: "New Mexico"},
{ORTH: "N.Y.", LEMMA: "New York"},
{ORTH: "Neb.", LEMMA: "Nebraska"},
{ORTH: "Nebr.", LEMMA: "Nebraska"},
{ORTH: "Nev.", LEMMA: "Nevada"},
{ORTH: "Nov.", LEMMA: "November"},
{ORTH: "Oct.", LEMMA: "October"},
{ORTH: "Okla.", LEMMA: "Oklahoma"},
{ORTH: "Ore.", LEMMA: "Oregon"},
{ORTH: "Pa.", LEMMA: "Pennsylvania"},
{ORTH: "S.C.", LEMMA: "South Carolina"},
{ORTH: "Sep.", LEMMA: "September"},
{ORTH: "Sept.", LEMMA: "September"},
{ORTH: "Tenn.", LEMMA: "Tennessee"},
{ORTH: "Va.", LEMMA: "Virginia"},
{ORTH: "Wash.", LEMMA: "Washington"},
{ORTH: "Wis.", LEMMA: "Wisconsin"}]:
_exc[exc_data[ORTH]] = [dict(exc_data)]
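
Note: this flattens each one-off exception into a single-token analysis, e.g.:

_exc["Calif."] == [{ORTH: "Calif.", LEMMA: "California"}]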
# Remove EXCLUDE_EXC if in exceptions
for string in EXCLUDE_EXC:
if string in TOKENIZER_EXCEPTIONS:
TOKENIZER_EXCEPTIONS.pop(string)
for orth in [
"'d", "a.m.", "Adm.", "Bros.", "co.", "Co.", "Corp.", "D.C.", "Dr.", "e.g.",
"E.g.", "E.G.", "Gen.", "Gov.", "i.e.", "I.e.", "I.E.", "Inc.", "Jr.",
"Ltd.", "Md.", "Messrs.", "Mo.", "Mont.", "Mr.", "Mrs.", "Ms.", "p.m.",
"Ph.D.", "Rep.", "Rev.", "Sen.", "St.", "vs."]:
_exc[orth] = [{ORTH: orth}]
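
Note: these orth-only exceptions keep the string together as one token without asserting a lemma:

_exc["e.g."] == [{ORTH: "e.g."}]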
# Abbreviations with only one ORTH token
for string in _exclude:
if string in _exc:
_exc.pop(string)
ORTH_ONLY = [
"'d",
"a.m.",
"Adm.",
"Bros.",
"co.",
"Co.",
"Corp.",
"D.C.",
"Dr.",
"e.g.",
"E.g.",
"E.G.",
"Gen.",
"Gov.",
"i.e.",
"I.e.",
"I.E.",
"Inc.",
"Jr.",
"Ltd.",
"Md.",
"Messrs.",
"Mo.",
"Mont.",
"Mr.",
"Mrs.",
"Ms.",
"p.m.",
"Ph.D.",
"Rep.",
"Rev.",
"Sen.",
"St.",
"vs.",
]
TOKENIZER_EXCEPTIONS = dict(_exc)
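
Note: TOKENIZER_EXCEPTIONS is the dict that spacy/en/__init__.py above merges with BASE_EXCEPTIONS via update_exc. A quick sanity check (a sketch, assuming the module paths shown in this commit):

from spacy.en.tokenizer_exceptions import TOKENIZER_EXCEPTIONS
assert "Ill" not in TOKENIZER_EXCEPTIONS   # real words stay excluded
assert "won't" in TOKENIZER_EXCEPTIONS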