Reorganise English language data

ines 2017-05-08 15:47:25 +02:00
parent 1bf9d5ec8b
commit c7c21b980f
8 changed files with 242 additions and 582 deletions

View File

@@ -1,14 +1,16 @@
 # coding: utf8
 from __future__ import unicode_literals

-from ..language import Language
-from ..lemmatizer import Lemmatizer
-from ..vocab import Vocab
-from ..tokenizer import Tokenizer
-from ..attrs import LANG
-from ..deprecated import fix_glove_vectors_loading
-from .language_data import *
+from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
+from .tag_map import TAG_MAP
+from .stop_words import STOP_WORDS
+from .morph_rules import MORPH_RULES
+from .lemmatizer import LEMMA_RULES, LEMMA_INDEX, LEMMA_EXC
+from ..language_data import BASE_EXCEPTIONS
+from ..language import Language
+from ..attrs import LANG
+from ..util import update_exc


 class English(Language):
@@ -18,20 +20,13 @@ class English(Language):
     lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
     lex_attr_getters[LANG] = lambda text: 'en'

-    tokenizer_exceptions = TOKENIZER_EXCEPTIONS
-    tag_map = TAG_MAP
-    stop_words = STOP_WORDS
+    tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS)
+    tag_map = dict(TAG_MAP)
+    stop_words = set(STOP_WORDS)
     morph_rules = dict(MORPH_RULES)
     lemma_rules = dict(LEMMA_RULES)
     lemma_index = dict(LEMMA_INDEX)
     lemma_exc = dict(LEMMA_EXC)

-    def __init__(self, **overrides):
-        # Special-case hack for loading the GloVe vectors, to support <1.0
-        overrides = fix_glove_vectors_loading(overrides)
-        Language.__init__(self, **overrides)
-
-EXPORT = English
+__all__ = ['English']
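Note: the `update_exc` helper used above merges the shared `BASE_EXCEPTIONS` with the English-specific tokenizer exceptions, with the language-specific entries taking precedence. A minimal sketch of that merge, assuming the helper behaves like a plain dict update (the real function imported from `..util` also validates the exception format, so this is illustrative only):

    # Simplified sketch of update_exc, not the actual implementation.
    def update_exc(base_exceptions, *addition_dicts):
        exc = dict(base_exceptions)
        for additions in addition_dicts:
            # Later dicts win, so language-specific rules override the base set.
            exc.update(additions)
        return exc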

View File

@@ -1,3 +1,5 @@
+from .lookup import LOOKUP
 from ._adjectives import ADJECTIVES
 from ._adjectives_irreg import ADJECTIVES_IRREG
 from ._adverbs import ADVERBS
@@ -9,25 +11,10 @@ from ._verbs_irreg import VERBS_IRREG
 from ._lemma_rules import ADJECTIVE_RULES, NOUN_RULES, VERB_RULES, PUNCT_RULES

-INDEX = {
-    "adj": ADJECTIVES,
-    "adv": ADVERBS,
-    "noun": NOUNS,
-    "verb": VERBS
-}
+LEMMA_INDEX = {'adj': ADJECTIVES, 'adv': ADVERBS, 'noun': NOUNS, 'verb': VERBS}

-EXC = {
-    "adj": ADJECTIVES_IRREG,
-    "adv": ADVERBS_IRREG,
-    "noun": NOUNS_IRREG,
-    "verb": VERBS_IRREG
-}
+LEMMA_EXC = {'adj': ADJECTIVES_IRREG, 'adv': ADVERBS_IRREG, 'noun': NOUNS_IRREG,
+             'verb': VERBS_IRREG}

-RULES = {
-    "adj": ADJECTIVE_RULES,
-    "noun": NOUN_RULES,
-    "verb": VERB_RULES,
-    "punct": PUNCT_RULES
-}
+LEMMA_RULES = {'adj': ADJECTIVE_RULES, 'noun': NOUN_RULES, 'verb': VERB_RULES,
+               'punct': PUNCT_RULES}
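For reference, `LEMMA_RULES` maps a coarse part-of-speech key to a list of suffix rewrite pairs from `_lemma_rules.py`. A rough sketch of how such rules could be applied (illustrative only; the real Lemmatizer class also consults `LEMMA_INDEX` and `LEMMA_EXC` to pick a valid form):

    # Illustrative only: apply the suffix rules for one POS and collect candidates.
    def apply_lemma_rules(string, pos, lemma_rules):
        forms = []
        for old_suffix, new_suffix in lemma_rules.get(pos, []):
            if string.endswith(old_suffix):
                forms.append(string[:len(string) - len(old_suffix)] + new_suffix)
        return forms or [string]

    # e.g. apply_lemma_rules('happier', 'adj', LEMMA_RULES) yields candidate
    # forms; the lemmatizer would then filter them against LEMMA_INDEX.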

View File

@ -1,7 +1,7 @@
# coding: utf8 # coding: utf8
from __future__ import unicode_literals from __future__ import unicode_literals
LOOK_UP = { LOOKUP = {
" furtherst": "further", " furtherst": "further",
" skilled": "skill", " skilled": "skill",
"'cause": "because", "'cause": "because",

spacy/en/lex_attrs.py (new file, +23)
View File

@@ -0,0 +1,23 @@
+# coding: utf8
+from __future__ import unicode_literals
+
+
+# Number words
+
+NUM_WORDS = set("""
+zero one two three four five six seven eight nine ten eleven twelve thirteen
+fourteen fifteen sixteen seventeen eighteen nineteen twenty thirty forty fifty
+sixty seventy eighty ninety hundred thousand million billion trillion
+quadrillion gajillion bazillion
+""".split())
+
+
+# Ordinal words
+
+ORDINAL_WORDS = set("""
+first second third fourth fifth sixth seventh eigth ninth tenth eleventh twelveth
+thirteenth fourteenth fifteenth sixteenth sventeenth eighteenth nineteenth
+twentieth thirtieth fortieth fiftieth sixtieth seventieth eightieth ninetieth
+hundreth thousandth millionth billionth trillionth quadrillionth gajillionth
+bazillionth
+""".split())

View File

@ -1,8 +1,8 @@
# coding: utf8 # coding: utf8
from __future__ import unicode_literals from __future__ import unicode_literals
from ..symbols import * from ..symbols import LEMMA
from ..language_data import PRON_LEMMA from ..deprecated import PRON_LEMMA
MORPH_RULES = { MORPH_RULES = {

View File

@@ -67,24 +67,3 @@ whither who whoever whole whom whose why will with within without would
 yet you your yours yourself yourselves
 """.split())
-
-
-# Number words
-
-NUM_WORDS = set("""
-zero one two three four five six seven eight nine ten eleven twelve thirteen
-fourteen fifteen sixteen seventeen eighteen nineteen twenty thirty forty fifty
-sixty seventy eighty ninety hundred thousand million billion trillion
-quadrillion gajillion bazillion
-""".split())
-
-
-# Ordinal words
-
-ORDINAL_WORDS = set("""
-first second third fourth fifth sixth seventh eigth ninth tenth eleventh twelveth
-thirteenth fourteenth fifteenth sixteenth sventeenth eighteenth nineteenth
-twentieth thirtieth fortieth fiftieth sixtieth seventieth eightieth ninetieth
-hundreth thousandth millionth billionth trillionth quadrillionth gajillionth
-bazillionth
-""".split())

View File

@ -1,7 +1,8 @@
# coding: utf8 # coding: utf8
from __future__ import unicode_literals from __future__ import unicode_literals
from ..symbols import * from ..symbols import POS, PUNCT, SYM, ADJ, CCONJ, NUM, DET, ADV, ADP, X, VERB
from ..symbols import NOUN, PROPN, PART, INTJ, SPACE, PRON
TAG_MAP = { TAG_MAP = {

View File

@@ -1,13 +1,12 @@
 # coding: utf8
 from __future__ import unicode_literals

-from ..symbols import *
-from ..language_data import PRON_LEMMA
+from ..symbols import ORTH, LEMMA, TAG, NORM
+from ..deprecated import PRON_LEMMA


-EXC = {}
+_exc = {}

-EXCLUDE_EXC = ["Ill", "ill", "Its", "its", "Hell", "hell", "Shell", "shell",
+_exclude = ["Ill", "ill", "Its", "its", "Hell", "hell", "Shell", "shell",
             "Shed", "shed", "were", "Were", "Well", "well", "Whore", "whore"]
@@ -15,193 +14,160 @@ EXCLUDE_EXC = ["Ill", "ill", "Its", "its", "Hell", "hell", "Shell", "shell",
 for pron in ["i"]:
     for orth in [pron, pron.title()]:
-        EXC[orth + "'m"] = [
+        _exc[orth + "'m"] = [
             {ORTH: orth, LEMMA: PRON_LEMMA, TAG: "PRP"},
-            {ORTH: "'m", LEMMA: "be", TAG: "VBP", "tenspect": 1, "number": 1}
-        ]
+            {ORTH: "'m", LEMMA: "be", TAG: "VBP", "tenspect": 1, "number": 1}]

-        EXC[orth + "m"] = [
+        _exc[orth + "m"] = [
             {ORTH: orth, LEMMA: PRON_LEMMA, TAG: "PRP"},
-            {ORTH: "m", LEMMA: "be", TAG: "VBP", "tenspect": 1, "number": 1 }
-        ]
+            {ORTH: "m", LEMMA: "be", TAG: "VBP", "tenspect": 1, "number": 1 }]

-        EXC[orth + "'ma"] = [
+        _exc[orth + "'ma"] = [
             {ORTH: orth, LEMMA: PRON_LEMMA, TAG: "PRP"},
             {ORTH: "'m", LEMMA: "be", NORM: "am"},
-            {ORTH: "a", LEMMA: "going to", NORM: "gonna"}
-        ]
+            {ORTH: "a", LEMMA: "going to", NORM: "gonna"}]

-        EXC[orth + "ma"] = [
+        _exc[orth + "ma"] = [
             {ORTH: orth, LEMMA: PRON_LEMMA, TAG: "PRP"},
             {ORTH: "m", LEMMA: "be", NORM: "am"},
-            {ORTH: "a", LEMMA: "going to", NORM: "gonna"}
-        ]
+            {ORTH: "a", LEMMA: "going to", NORM: "gonna"}]


 for pron in ["i", "you", "he", "she", "it", "we", "they"]:
     for orth in [pron, pron.title()]:
-        EXC[orth + "'ll"] = [
+        _exc[orth + "'ll"] = [
             {ORTH: orth, LEMMA: PRON_LEMMA, TAG: "PRP"},
-            {ORTH: "'ll", LEMMA: "will", TAG: "MD"}
-        ]
+            {ORTH: "'ll", LEMMA: "will", TAG: "MD"}]

-        EXC[orth + "ll"] = [
+        _exc[orth + "ll"] = [
             {ORTH: orth, LEMMA: PRON_LEMMA, TAG: "PRP"},
-            {ORTH: "ll", LEMMA: "will", TAG: "MD"}
-        ]
+            {ORTH: "ll", LEMMA: "will", TAG: "MD"}]

-        EXC[orth + "'ll've"] = [
+        _exc[orth + "'ll've"] = [
             {ORTH: orth, LEMMA: PRON_LEMMA, TAG: "PRP"},
             {ORTH: "'ll", LEMMA: "will", TAG: "MD"},
-            {ORTH: "'ve", LEMMA: "have", TAG: "VB"}
-        ]
+            {ORTH: "'ve", LEMMA: "have", TAG: "VB"}]

-        EXC[orth + "llve"] = [
+        _exc[orth + "llve"] = [
             {ORTH: orth, LEMMA: PRON_LEMMA, TAG: "PRP"},
             {ORTH: "ll", LEMMA: "will", TAG: "MD"},
-            {ORTH: "ve", LEMMA: "have", TAG: "VB"}
-        ]
+            {ORTH: "ve", LEMMA: "have", TAG: "VB"}]

-        EXC[orth + "'d"] = [
+        _exc[orth + "'d"] = [
             {ORTH: orth, LEMMA: PRON_LEMMA, TAG: "PRP"},
-            {ORTH: "'d", LEMMA: "would", TAG: "MD"}
-        ]
+            {ORTH: "'d", LEMMA: "would", TAG: "MD"}]

-        EXC[orth + "d"] = [
+        _exc[orth + "d"] = [
             {ORTH: orth, LEMMA: PRON_LEMMA, TAG: "PRP"},
-            {ORTH: "d", LEMMA: "would", TAG: "MD"}
-        ]
+            {ORTH: "d", LEMMA: "would", TAG: "MD"}]

-        EXC[orth + "'d've"] = [
+        _exc[orth + "'d've"] = [
             {ORTH: orth, LEMMA: PRON_LEMMA, TAG: "PRP"},
             {ORTH: "'d", LEMMA: "would", TAG: "MD"},
-            {ORTH: "'ve", LEMMA: "have", TAG: "VB"}
-        ]
+            {ORTH: "'ve", LEMMA: "have", TAG: "VB"}]

-        EXC[orth + "dve"] = [
+        _exc[orth + "dve"] = [
             {ORTH: orth, LEMMA: PRON_LEMMA, TAG: "PRP"},
             {ORTH: "d", LEMMA: "would", TAG: "MD"},
-            {ORTH: "ve", LEMMA: "have", TAG: "VB"}
-        ]
+            {ORTH: "ve", LEMMA: "have", TAG: "VB"}]


 for pron in ["i", "you", "we", "they"]:
     for orth in [pron, pron.title()]:
-        EXC[orth + "'ve"] = [
+        _exc[orth + "'ve"] = [
             {ORTH: orth, LEMMA: PRON_LEMMA, TAG: "PRP"},
-            {ORTH: "'ve", LEMMA: "have", TAG: "VB"}
-        ]
+            {ORTH: "'ve", LEMMA: "have", TAG: "VB"}]

-        EXC[orth + "ve"] = [
+        _exc[orth + "ve"] = [
             {ORTH: orth, LEMMA: PRON_LEMMA, TAG: "PRP"},
-            {ORTH: "ve", LEMMA: "have", TAG: "VB"}
-        ]
+            {ORTH: "ve", LEMMA: "have", TAG: "VB"}]


 for pron in ["you", "we", "they"]:
     for orth in [pron, pron.title()]:
-        EXC[orth + "'re"] = [
+        _exc[orth + "'re"] = [
             {ORTH: orth, LEMMA: PRON_LEMMA, TAG: "PRP"},
-            {ORTH: "'re", LEMMA: "be", NORM: "are"}
-        ]
+            {ORTH: "'re", LEMMA: "be", NORM: "are"}]

-        EXC[orth + "re"] = [
+        _exc[orth + "re"] = [
             {ORTH: orth, LEMMA: PRON_LEMMA, TAG: "PRP"},
-            {ORTH: "re", LEMMA: "be", NORM: "are", TAG: "VBZ"}
-        ]
+            {ORTH: "re", LEMMA: "be", NORM: "are", TAG: "VBZ"}]


 for pron in ["he", "she", "it"]:
     for orth in [pron, pron.title()]:
-        EXC[orth + "'s"] = [
+        _exc[orth + "'s"] = [
             {ORTH: orth, LEMMA: PRON_LEMMA, TAG: "PRP"},
-            {ORTH: "'s"}
-        ]
+            {ORTH: "'s"}]

-        EXC[orth + "s"] = [
+        _exc[orth + "s"] = [
             {ORTH: orth, LEMMA: PRON_LEMMA, TAG: "PRP"},
-            {ORTH: "s"}
-        ]
+            {ORTH: "s"}]


 # W-words, relative pronouns, prepositions etc.

 for word in ["who", "what", "when", "where", "why", "how", "there", "that"]:
     for orth in [word, word.title()]:
-        EXC[orth + "'s"] = [
+        _exc[orth + "'s"] = [
             {ORTH: orth, LEMMA: word},
-            {ORTH: "'s"}
-        ]
+            {ORTH: "'s"}]

-        EXC[orth + "s"] = [
+        _exc[orth + "s"] = [
             {ORTH: orth, LEMMA: word},
-            {ORTH: "s"}
-        ]
+            {ORTH: "s"}]

-        EXC[orth + "'ll"] = [
+        _exc[orth + "'ll"] = [
             {ORTH: orth, LEMMA: word},
-            {ORTH: "'ll", LEMMA: "will", TAG: "MD"}
-        ]
+            {ORTH: "'ll", LEMMA: "will", TAG: "MD"}]

-        EXC[orth + "ll"] = [
+        _exc[orth + "ll"] = [
             {ORTH: orth, LEMMA: word},
-            {ORTH: "ll", LEMMA: "will", TAG: "MD"}
-        ]
+            {ORTH: "ll", LEMMA: "will", TAG: "MD"}]

-        EXC[orth + "'ll've"] = [
+        _exc[orth + "'ll've"] = [
             {ORTH: orth, LEMMA: word},
             {ORTH: "'ll", LEMMA: "will", TAG: "MD"},
-            {ORTH: "'ve", LEMMA: "have", TAG: "VB"}
-        ]
+            {ORTH: "'ve", LEMMA: "have", TAG: "VB"}]

-        EXC[orth + "llve"] = [
+        _exc[orth + "llve"] = [
             {ORTH: orth, LEMMA: word},
             {ORTH: "ll", LEMMA: "will", TAG: "MD"},
-            {ORTH: "ve", LEMMA: "have", TAG: "VB"}
-        ]
+            {ORTH: "ve", LEMMA: "have", TAG: "VB"}]

-        EXC[orth + "'re"] = [
+        _exc[orth + "'re"] = [
             {ORTH: orth, LEMMA: word},
-            {ORTH: "'re", LEMMA: "be", NORM: "are"}
-        ]
+            {ORTH: "'re", LEMMA: "be", NORM: "are"}]

-        EXC[orth + "re"] = [
+        _exc[orth + "re"] = [
             {ORTH: orth, LEMMA: word},
-            {ORTH: "re", LEMMA: "be", NORM: "are"}
-        ]
+            {ORTH: "re", LEMMA: "be", NORM: "are"}]

-        EXC[orth + "'ve"] = [
+        _exc[orth + "'ve"] = [
             {ORTH: orth},
-            {ORTH: "'ve", LEMMA: "have", TAG: "VB"}
-        ]
+            {ORTH: "'ve", LEMMA: "have", TAG: "VB"}]

-        EXC[orth + "ve"] = [
+        _exc[orth + "ve"] = [
             {ORTH: orth, LEMMA: word},
-            {ORTH: "'ve", LEMMA: "have", TAG: "VB"}
-        ]
+            {ORTH: "ve", LEMMA: "have", TAG: "VB"}]

-        EXC[orth + "'d"] = [
+        _exc[orth + "'d"] = [
             {ORTH: orth, LEMMA: word},
-            {ORTH: "'d"}
-        ]
+            {ORTH: "'d"}]

-        EXC[orth + "d"] = [
+        _exc[orth + "d"] = [
             {ORTH: orth, LEMMA: word},
-            {ORTH: "d"}
-        ]
+            {ORTH: "d"}]

-        EXC[orth + "'d've"] = [
+        _exc[orth + "'d've"] = [
             {ORTH: orth, LEMMA: word},
             {ORTH: "'d", LEMMA: "would", TAG: "MD"},
-            {ORTH: "'ve", LEMMA: "have", TAG: "VB"}
-        ]
+            {ORTH: "'ve", LEMMA: "have", TAG: "VB"}]

-        EXC[orth + "dve"] = [
+        _exc[orth + "dve"] = [
             {ORTH: orth, LEMMA: word},
             {ORTH: "d", LEMMA: "would", TAG: "MD"},
-            {ORTH: "ve", LEMMA: "have", TAG: "VB"}
-        ]
+            {ORTH: "ve", LEMMA: "have", TAG: "VB"}]


 # Verbs
@@ -221,54 +187,44 @@ for verb_data in [
     {ORTH: "sha", LEMMA: "shall", TAG: "MD"},
     {ORTH: "should", TAG: "MD"},
     {ORTH: "wo", LEMMA: "will", TAG: "MD"},
-    {ORTH: "would", TAG: "MD"}
-]:
+    {ORTH: "would", TAG: "MD"}]:
     verb_data_tc = dict(verb_data)
     verb_data_tc[ORTH] = verb_data_tc[ORTH].title()
     for data in [verb_data, verb_data_tc]:
-        EXC[data[ORTH] + "n't"] = [
+        _exc[data[ORTH] + "n't"] = [
             dict(data),
-            {ORTH: "n't", LEMMA: "not", TAG: "RB"}
-        ]
+            {ORTH: "n't", LEMMA: "not", TAG: "RB"}]

-        EXC[data[ORTH] + "nt"] = [
+        _exc[data[ORTH] + "nt"] = [
             dict(data),
-            {ORTH: "nt", LEMMA: "not", TAG: "RB"}
-        ]
+            {ORTH: "nt", LEMMA: "not", TAG: "RB"}]

-        EXC[data[ORTH] + "n't've"] = [
+        _exc[data[ORTH] + "n't've"] = [
             dict(data),
             {ORTH: "n't", LEMMA: "not", TAG: "RB"},
-            {ORTH: "'ve", LEMMA: "have", TAG: "VB"}
-        ]
+            {ORTH: "'ve", LEMMA: "have", TAG: "VB"}]

-        EXC[data[ORTH] + "ntve"] = [
+        _exc[data[ORTH] + "ntve"] = [
             dict(data),
             {ORTH: "nt", LEMMA: "not", TAG: "RB"},
-            {ORTH: "ve", LEMMA: "have", TAG: "VB"}
-        ]
+            {ORTH: "ve", LEMMA: "have", TAG: "VB"}]


 for verb_data in [
     {ORTH: "could", TAG: "MD"},
     {ORTH: "might"},
     {ORTH: "must"},
-    {ORTH: "should"}
-]:
+    {ORTH: "should"}]:
     verb_data_tc = dict(verb_data)
     verb_data_tc[ORTH] = verb_data_tc[ORTH].title()
     for data in [verb_data, verb_data_tc]:
-        EXC[data[ORTH] + "'ve"] = [
+        _exc[data[ORTH] + "'ve"] = [
             dict(data),
-            {ORTH: "'ve", LEMMA: "have", TAG: "VB"}
-        ]
+            {ORTH: "'ve", LEMMA: "have", TAG: "VB"}]

-        EXC[data[ORTH] + "ve"] = [
+        _exc[data[ORTH] + "ve"] = [
             dict(data),
-            {ORTH: "ve", LEMMA: "have", TAG: "VB"}
-        ]
+            {ORTH: "ve", LEMMA: "have", TAG: "VB"}]


 for verb_data in [
@@ -276,22 +232,17 @@ for verb_data in [
     {ORTH: "are", LEMMA: "be", TAG: "VBP", "number": 2},
     {ORTH: "is", LEMMA: "be", TAG: "VBZ"},
     {ORTH: "was", LEMMA: "be"},
-    {ORTH: "were", LEMMA: "be"}
-]:
+    {ORTH: "were", LEMMA: "be"}]:
     verb_data_tc = dict(verb_data)
     verb_data_tc[ORTH] = verb_data_tc[ORTH].title()
     for data in [verb_data, verb_data_tc]:
-        EXC[data[ORTH] + "n't"] = [
+        _exc[data[ORTH] + "n't"] = [
             dict(data),
-            {ORTH: "n't", LEMMA: "not", TAG: "RB"}
-        ]
+            {ORTH: "n't", LEMMA: "not", TAG: "RB"}]

-        EXC[data[ORTH] + "nt"] = [
+        _exc[data[ORTH] + "nt"] = [
             dict(data),
-            {ORTH: "nt", LEMMA: "not", TAG: "RB"}
-        ]
+            {ORTH: "nt", LEMMA: "not", TAG: "RB"}]


 # Other contractions with trailing apostrophe
@@ -302,22 +253,14 @@ for exc_data in [
     {ORTH: "nothin", LEMMA: "nothing"},
     {ORTH: "nuthin", LEMMA: "nothing"},
     {ORTH: "ol", LEMMA: "old"},
-    {ORTH: "somethin", LEMMA: "something"}
-]:
+    {ORTH: "somethin", LEMMA: "something"}]:
     exc_data_tc = dict(exc_data)
     exc_data_tc[ORTH] = exc_data_tc[ORTH].title()
     for data in [exc_data, exc_data_tc]:
         data_apos = dict(data)
         data_apos[ORTH] = data_apos[ORTH] + "'"
-        EXC[data[ORTH]] = [
-            dict(data)
-        ]
-        EXC[data_apos[ORTH]] = [
-            dict(data_apos)
-        ]
+        _exc[data[ORTH]] = [dict(data)]
+        _exc[data_apos[ORTH]] = [dict(data_apos)]


 # Other contractions with leading apostrophe
@@ -326,449 +269,181 @@ for exc_data in [
     {ORTH: "cause", LEMMA: "because"},
     {ORTH: "em", LEMMA: PRON_LEMMA, NORM: "them"},
     {ORTH: "ll", LEMMA: "will"},
-    {ORTH: "nuff", LEMMA: "enough"}
-]:
+    {ORTH: "nuff", LEMMA: "enough"}]:
     exc_data_apos = dict(exc_data)
     exc_data_apos[ORTH] = "'" + exc_data_apos[ORTH]
     for data in [exc_data, exc_data_apos]:
-        EXC[data[ORTH]] = [
-            dict(data)
-        ]
+        _exc[data[ORTH]] = [dict(data)]


 # Times

 for h in range(1, 12 + 1):
     hour = str(h)
     for period in ["a.m.", "am"]:
-        EXC[hour + period] = [
+        _exc[hour+period] = [
             {ORTH: hour},
-            {ORTH: period, LEMMA: "a.m."}
-        ]
+            {ORTH: period, LEMMA: "a.m."}]
     for period in ["p.m.", "pm"]:
-        EXC[hour + period] = [
+        _exc[hour+period] = [
             {ORTH: hour},
-            {ORTH: period, LEMMA: "p.m."}
-        ]
+            {ORTH: period, LEMMA: "p.m."}]

 # Rest

-OTHER = {
+_other_exc = {
-    " ": [
-        {ORTH: " ", TAG: "SP"}
-    ],
-    "\u00a0": [
-        {ORTH: "\u00a0", TAG: "SP", LEMMA: " "}
-    ],
-    "'S": [
-        {ORTH: "'S", LEMMA: "'s"}
-    ],
-    "'s": [
-        {ORTH: "'s", LEMMA: "'s"}
-    ],
-    "'re": [
-        {ORTH: "'re", LEMMA: "be", NORM: "are"}
-    ],
-    "\u2018S": [
-        {ORTH: "\u2018S", LEMMA: "'s"}
-    ],
-    "\u2018s": [
-        {ORTH: "\u2018s", LEMMA: "'s"}
-    ],
-    "and/or": [
-        {ORTH: "and/or", LEMMA: "and/or", TAG: "CC"}
-    ],
-    "'Cause": [
-        {ORTH: "'Cause", LEMMA: "because"}
-    ],
     "y'all": [
         {ORTH: "y'", LEMMA: PRON_LEMMA, NORM: "you"},
-        {ORTH: "all"}
-    ],
+        {ORTH: "all"}],
     "yall": [
         {ORTH: "y", LEMMA: PRON_LEMMA, NORM: "you"},
-        {ORTH: "all"}
-    ],
+        {ORTH: "all"}],
-    "ma'am": [
-        {ORTH: "ma'am", LEMMA: "madam"}
-    ],
-    "Ma'am": [
-        {ORTH: "Ma'am", LEMMA: "madam"}
-    ],
-    "o'clock": [
-        {ORTH: "o'clock", LEMMA: "o'clock"}
-    ],
-    "O'clock": [
-        {ORTH: "O'clock", LEMMA: "o'clock"}
-    ],
     "how'd'y": [
         {ORTH: "how", LEMMA: "how"},
         {ORTH: "'d", LEMMA: "do"},
-        {ORTH: "'y", LEMMA: PRON_LEMMA, NORM: "you"}
-    ],
+        {ORTH: "'y", LEMMA: PRON_LEMMA, NORM: "you"}],
     "How'd'y": [
         {ORTH: "How", LEMMA: "how"},
         {ORTH: "'d", LEMMA: "do"},
-        {ORTH: "'y", LEMMA: PRON_LEMMA, NORM: "you"}
-    ],
+        {ORTH: "'y", LEMMA: PRON_LEMMA, NORM: "you"}],
     "not've": [
         {ORTH: "not", LEMMA: "not", TAG: "RB"},
-        {ORTH: "'ve", LEMMA: "have", TAG: "VB"}
-    ],
+        {ORTH: "'ve", LEMMA: "have", TAG: "VB"}],
     "notve": [
         {ORTH: "not", LEMMA: "not", TAG: "RB"},
-        {ORTH: "ve", LEMMA: "have", TAG: "VB"}
-    ],
+        {ORTH: "ve", LEMMA: "have", TAG: "VB"}],
     "Not've": [
         {ORTH: "Not", LEMMA: "not", TAG: "RB"},
-        {ORTH: "'ve", LEMMA: "have", TAG: "VB"}
-    ],
+        {ORTH: "'ve", LEMMA: "have", TAG: "VB"}],
     "Notve": [
         {ORTH: "Not", LEMMA: "not", TAG: "RB"},
-        {ORTH: "ve", LEMMA: "have", TAG: "VB"}
-    ],
+        {ORTH: "ve", LEMMA: "have", TAG: "VB"}],
     "cannot": [
         {ORTH: "can", LEMMA: "can", TAG: "MD"},
-        {ORTH: "not", LEMMA: "not", TAG: "RB"}
-    ],
+        {ORTH: "not", LEMMA: "not", TAG: "RB"}],
     "Cannot": [
         {ORTH: "Can", LEMMA: "can", TAG: "MD"},
-        {ORTH: "not", LEMMA: "not", TAG: "RB"}
-    ],
+        {ORTH: "not", LEMMA: "not", TAG: "RB"}],
     "gonna": [
         {ORTH: "gon", LEMMA: "go", NORM: "going"},
-        {ORTH: "na", LEMMA: "to"}
-    ],
+        {ORTH: "na", LEMMA: "to"}],
     "Gonna": [
         {ORTH: "Gon", LEMMA: "go", NORM: "going"},
-        {ORTH: "na", LEMMA: "to"}
-    ],
+        {ORTH: "na", LEMMA: "to"}],
     "gotta": [
         {ORTH: "got"},
-        {ORTH: "ta", LEMMA: "to"}
-    ],
+        {ORTH: "ta", LEMMA: "to"}],
     "Gotta": [
         {ORTH: "Got"},
-        {ORTH: "ta", LEMMA: "to"}
-    ],
+        {ORTH: "ta", LEMMA: "to"}],
     "let's": [
         {ORTH: "let"},
-        {ORTH: "'s", LEMMA: PRON_LEMMA, NORM: "us"}
-    ],
+        {ORTH: "'s", LEMMA: PRON_LEMMA, NORM: "us"}],
     "Let's": [
         {ORTH: "Let", LEMMA: "let"},
-        {ORTH: "'s", LEMMA: PRON_LEMMA, NORM: "us"}
-    ],
+        {ORTH: "'s", LEMMA: PRON_LEMMA, NORM: "us"}]
-    "\u2014": [
-        {ORTH: "\u2014", TAG: ":", LEMMA: "--"}
-    ],
-    "\n": [
-        {ORTH: "\n", TAG: "SP"}
-    ],
-    "\t": [
-        {ORTH: "\t", TAG: "SP"}
-    ]
 }
+
+_exc.update(_other_exc)
-# Abbreviations
-
-ABBREVIATIONS = {
-    "Mt.": [
-        {ORTH: "Mt.", LEMMA: "Mount"}
-    ],
-    "Ak.": [
-        {ORTH: "Ak.", LEMMA: "Alaska"}
-    ],
-    "Ala.": [
-        {ORTH: "Ala.", LEMMA: "Alabama"}
-    ],
-    "Apr.": [
-        {ORTH: "Apr.", LEMMA: "April"}
-    ],
-    "Ariz.": [
-        {ORTH: "Ariz.", LEMMA: "Arizona"}
-    ],
-    "Ark.": [
-        {ORTH: "Ark.", LEMMA: "Arkansas"}
-    ],
-    "Aug.": [
-        {ORTH: "Aug.", LEMMA: "August"}
-    ],
-    "Calif.": [
-        {ORTH: "Calif.", LEMMA: "California"}
-    ],
-    "Colo.": [
-        {ORTH: "Colo.", LEMMA: "Colorado"}
-    ],
-    "Conn.": [
-        {ORTH: "Conn.", LEMMA: "Connecticut"}
-    ],
-    "Dec.": [
-        {ORTH: "Dec.", LEMMA: "December"}
-    ],
-    "Del.": [
-        {ORTH: "Del.", LEMMA: "Delaware"}
-    ],
-    "Feb.": [
-        {ORTH: "Feb.", LEMMA: "February"}
-    ],
-    "Fla.": [
-        {ORTH: "Fla.", LEMMA: "Florida"}
-    ],
-    "Ga.": [
-        {ORTH: "Ga.", LEMMA: "Georgia"}
-    ],
-    "Ia.": [
-        {ORTH: "Ia.", LEMMA: "Iowa"}
-    ],
-    "Id.": [
-        {ORTH: "Id.", LEMMA: "Idaho"}
-    ],
-    "Ill.": [
-        {ORTH: "Ill.", LEMMA: "Illinois"}
-    ],
-    "Ind.": [
-        {ORTH: "Ind.", LEMMA: "Indiana"}
-    ],
-    "Jan.": [
-        {ORTH: "Jan.", LEMMA: "January"}
-    ],
-    "Jul.": [
-        {ORTH: "Jul.", LEMMA: "July"}
-    ],
-    "Jun.": [
-        {ORTH: "Jun.", LEMMA: "June"}
-    ],
-    "Kan.": [
-        {ORTH: "Kan.", LEMMA: "Kansas"}
-    ],
-    "Kans.": [
-        {ORTH: "Kans.", LEMMA: "Kansas"}
-    ],
-    "Ky.": [
-        {ORTH: "Ky.", LEMMA: "Kentucky"}
-    ],
-    "La.": [
-        {ORTH: "La.", LEMMA: "Louisiana"}
-    ],
-    "Mar.": [
-        {ORTH: "Mar.", LEMMA: "March"}
-    ],
-    "Mass.": [
-        {ORTH: "Mass.", LEMMA: "Massachusetts"}
-    ],
-    "May.": [
-        {ORTH: "May.", LEMMA: "May"}
-    ],
-    "Mich.": [
-        {ORTH: "Mich.", LEMMA: "Michigan"}
-    ],
-    "Minn.": [
-        {ORTH: "Minn.", LEMMA: "Minnesota"}
-    ],
-    "Miss.": [
-        {ORTH: "Miss.", LEMMA: "Mississippi"}
-    ],
-    "N.C.": [
-        {ORTH: "N.C.", LEMMA: "North Carolina"}
-    ],
-    "N.D.": [
-        {ORTH: "N.D.", LEMMA: "North Dakota"}
-    ],
-    "N.H.": [
-        {ORTH: "N.H.", LEMMA: "New Hampshire"}
-    ],
-    "N.J.": [
-        {ORTH: "N.J.", LEMMA: "New Jersey"}
-    ],
-    "N.M.": [
-        {ORTH: "N.M.", LEMMA: "New Mexico"}
-    ],
-    "N.Y.": [
-        {ORTH: "N.Y.", LEMMA: "New York"}
-    ],
-    "Neb.": [
-        {ORTH: "Neb.", LEMMA: "Nebraska"}
-    ],
-    "Nebr.": [
-        {ORTH: "Nebr.", LEMMA: "Nebraska"}
-    ],
-    "Nev.": [
-        {ORTH: "Nev.", LEMMA: "Nevada"}
-    ],
-    "Nov.": [
-        {ORTH: "Nov.", LEMMA: "November"}
-    ],
-    "Oct.": [
-        {ORTH: "Oct.", LEMMA: "October"}
-    ],
-    "Okla.": [
-        {ORTH: "Okla.", LEMMA: "Oklahoma"}
-    ],
-    "Ore.": [
-        {ORTH: "Ore.", LEMMA: "Oregon"}
-    ],
-    "Pa.": [
-        {ORTH: "Pa.", LEMMA: "Pennsylvania"}
-    ],
-    "S.C.": [
-        {ORTH: "S.C.", LEMMA: "South Carolina"}
-    ],
-    "Sep.": [
-        {ORTH: "Sep.", LEMMA: "September"}
-    ],
-    "Sept.": [
-        {ORTH: "Sept.", LEMMA: "September"}
-    ],
-    "Tenn.": [
-        {ORTH: "Tenn.", LEMMA: "Tennessee"}
-    ],
-    "Va.": [
-        {ORTH: "Va.", LEMMA: "Virginia"}
-    ],
-    "Wash.": [
-        {ORTH: "Wash.", LEMMA: "Washington"}
-    ],
-    "Wis.": [
-        {ORTH: "Wis.", LEMMA: "Wisconsin"}
-    ]
-}
-TOKENIZER_EXCEPTIONS = dict(EXC)
-TOKENIZER_EXCEPTIONS.update(OTHER)
-TOKENIZER_EXCEPTIONS.update(ABBREVIATIONS)
-
-# Remove EXCLUDE_EXC if in exceptions
-
-for string in EXCLUDE_EXC:
-    if string in TOKENIZER_EXCEPTIONS:
-        TOKENIZER_EXCEPTIONS.pop(string)
-
-# Abbreviations with only one ORTH token
-
-ORTH_ONLY = [
-    "'d",
-    "a.m.",
-    "Adm.",
-    "Bros.",
-    "co.",
-    "Co.",
-    "Corp.",
-    "D.C.",
-    "Dr.",
-    "e.g.",
-    "E.g.",
-    "E.G.",
-    "Gen.",
-    "Gov.",
-    "i.e.",
-    "I.e.",
-    "I.E.",
-    "Inc.",
-    "Jr.",
-    "Ltd.",
-    "Md.",
-    "Messrs.",
-    "Mo.",
-    "Mont.",
-    "Mr.",
-    "Mrs.",
-    "Ms.",
-    "p.m.",
-    "Ph.D.",
-    "Rep.",
-    "Rev.",
-    "Sen.",
-    "St.",
-    "vs.",
-]
+for exc_data in [
+    {ORTH: "'S", LEMMA: "'s"},
+    {ORTH: "'s", LEMMA: "'s"},
+    {ORTH: "\u2018S", LEMMA: "'s"},
+    {ORTH: "\u2018s", LEMMA: "'s"},
+    {ORTH: "and/or", LEMMA: "and/or", TAG: "CC"},
+    {ORTH: "'re", LEMMA: "be", NORM: "are"},
+    {ORTH: "'Cause", LEMMA: "because"},
+    {ORTH: "'cause", LEMMA: "because"},
+    {ORTH: "ma'am", LEMMA: "madam"},
+    {ORTH: "Ma'am", LEMMA: "madam"},
+    {ORTH: "o'clock", LEMMA: "o'clock"},
+    {ORTH: "O'clock", LEMMA: "o'clock"},
+    {ORTH: "Mt.", LEMMA: "Mount"},
+    {ORTH: "Ak.", LEMMA: "Alaska"},
+    {ORTH: "Ala.", LEMMA: "Alabama"},
+    {ORTH: "Apr.", LEMMA: "April"},
+    {ORTH: "Ariz.", LEMMA: "Arizona"},
+    {ORTH: "Ark.", LEMMA: "Arkansas"},
+    {ORTH: "Aug.", LEMMA: "August"},
+    {ORTH: "Calif.", LEMMA: "California"},
+    {ORTH: "Colo.", LEMMA: "Colorado"},
+    {ORTH: "Conn.", LEMMA: "Connecticut"},
+    {ORTH: "Dec.", LEMMA: "December"},
+    {ORTH: "Del.", LEMMA: "Delaware"},
+    {ORTH: "Feb.", LEMMA: "February"},
+    {ORTH: "Fla.", LEMMA: "Florida"},
+    {ORTH: "Ga.", LEMMA: "Georgia"},
+    {ORTH: "Ia.", LEMMA: "Iowa"},
+    {ORTH: "Id.", LEMMA: "Idaho"},
+    {ORTH: "Ill.", LEMMA: "Illinois"},
+    {ORTH: "Ind.", LEMMA: "Indiana"},
+    {ORTH: "Jan.", LEMMA: "January"},
+    {ORTH: "Jul.", LEMMA: "July"},
+    {ORTH: "Jun.", LEMMA: "June"},
+    {ORTH: "Kan.", LEMMA: "Kansas"},
+    {ORTH: "Kans.", LEMMA: "Kansas"},
+    {ORTH: "Ky.", LEMMA: "Kentucky"},
+    {ORTH: "La.", LEMMA: "Louisiana"},
+    {ORTH: "Mar.", LEMMA: "March"},
+    {ORTH: "Mass.", LEMMA: "Massachusetts"},
+    {ORTH: "May.", LEMMA: "May"},
+    {ORTH: "Mich.", LEMMA: "Michigan"},
+    {ORTH: "Minn.", LEMMA: "Minnesota"},
+    {ORTH: "Miss.", LEMMA: "Mississippi"},
+    {ORTH: "N.C.", LEMMA: "North Carolina"},
+    {ORTH: "N.D.", LEMMA: "North Dakota"},
+    {ORTH: "N.H.", LEMMA: "New Hampshire"},
+    {ORTH: "N.J.", LEMMA: "New Jersey"},
+    {ORTH: "N.M.", LEMMA: "New Mexico"},
+    {ORTH: "N.Y.", LEMMA: "New York"},
+    {ORTH: "Neb.", LEMMA: "Nebraska"},
+    {ORTH: "Nebr.", LEMMA: "Nebraska"},
+    {ORTH: "Nev.", LEMMA: "Nevada"},
+    {ORTH: "Nov.", LEMMA: "November"},
+    {ORTH: "Oct.", LEMMA: "October"},
+    {ORTH: "Okla.", LEMMA: "Oklahoma"},
+    {ORTH: "Ore.", LEMMA: "Oregon"},
+    {ORTH: "Pa.", LEMMA: "Pennsylvania"},
+    {ORTH: "S.C.", LEMMA: "South Carolina"},
+    {ORTH: "Sep.", LEMMA: "September"},
+    {ORTH: "Sept.", LEMMA: "September"},
+    {ORTH: "Tenn.", LEMMA: "Tennessee"},
+    {ORTH: "Va.", LEMMA: "Virginia"},
+    {ORTH: "Wash.", LEMMA: "Washington"},
+    {ORTH: "Wis.", LEMMA: "Wisconsin"}]:
+    _exc[exc_data[ORTH]] = [dict(exc_data)]
+
+for orth in [
+    "'d", "a.m.", "Adm.", "Bros.", "co.", "Co.", "Corp.", "D.C.", "Dr.", "e.g.",
+    "E.g.", "E.G.", "Gen.", "Gov.", "i.e.", "I.e.", "I.E.", "Inc.", "Jr.",
+    "Ltd.", "Md.", "Messrs.", "Mo.", "Mont.", "Mr.", "Mrs.", "Ms.", "p.m.",
+    "Ph.D.", "Rep.", "Rev.", "Sen.", "St.", "vs."]:
+    _exc[orth] = [{ORTH: orth}]
+
+for string in _exclude:
+    if string in _exc:
+        _exc.pop(string)
+
+TOKENIZER_EXCEPTIONS = dict(_exc)
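As a quick sanity check on the exception format: each key is a surface string and each value a list of token dicts whose ORTH values concatenate back to that key. A small illustrative helper, not part of this commit (the `update_exc` helper presumably performs a similar validation when merging):

    # Illustrative only: verify that each exception's ORTH pieces rebuild its key.
    from spacy.en.tokenizer_exceptions import TOKENIZER_EXCEPTIONS
    from spacy.symbols import ORTH

    def orths_match(exceptions):
        return all("".join(t[ORTH] for t in tokens) == key
                   for key, tokens in exceptions.items())

    assert orths_match(TOKENIZER_EXCEPTIONS)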