from typing import Dict, List from ..tokenizer_exceptions import BASE_EXCEPTIONS from ...symbols import ORTH, NORM from ...util import update_exc _exc: Dict[str, List[Dict]] = {} _exclude = [ "Ill", "ill", "Its", "its", "Hell", "hell", "Shell", "shell", "Shed", "shed", "were", "Were", "Well", "well", "Whore", "whore", ] # Pronouns for pron in ["i"]: for orth in [pron, pron.title()]: _exc[orth + "'m"] = [ {ORTH: orth, NORM: pron}, {ORTH: "'m", NORM: "am"}, ] _exc[orth + "m"] = [ {ORTH: orth, NORM: pron}, {ORTH: "m"}, ] _exc[orth + "'ma"] = [ {ORTH: orth, NORM: pron}, {ORTH: "'m", NORM: "am"}, {ORTH: "a", NORM: "gonna"}, ] _exc[orth + "ma"] = [ {ORTH: orth, NORM: pron}, {ORTH: "m", NORM: "am"}, {ORTH: "a", NORM: "gonna"}, ] for pron in ["i", "you", "he", "she", "it", "we", "they"]: for orth in [pron, pron.title()]: _exc[orth + "'ll"] = [ {ORTH: orth, NORM: pron}, {ORTH: "'ll", NORM: "will"}, ] _exc[orth + "ll"] = [ {ORTH: orth, NORM: pron}, {ORTH: "ll", NORM: "will"}, ] _exc[orth + "'ll've"] = [ {ORTH: orth, NORM: pron}, {ORTH: "'ll", NORM: "will"}, {ORTH: "'ve", NORM: "have"}, ] _exc[orth + "llve"] = [ {ORTH: orth, NORM: pron}, {ORTH: "ll", NORM: "will"}, {ORTH: "ve", NORM: "have"}, ] _exc[orth + "'d"] = [ {ORTH: orth, NORM: pron}, {ORTH: "'d", NORM: "'d"}, ] _exc[orth + "d"] = [ {ORTH: orth, NORM: pron}, {ORTH: "d", NORM: "'d"}, ] _exc[orth + "'d've"] = [ {ORTH: orth, NORM: pron}, {ORTH: "'d", NORM: "would"}, {ORTH: "'ve", NORM: "have"}, ] _exc[orth + "dve"] = [ {ORTH: orth, NORM: pron}, {ORTH: "d", NORM: "would"}, {ORTH: "ve", NORM: "have"}, ] for pron in ["i", "you", "we", "they"]: for orth in [pron, pron.title()]: _exc[orth + "'ve"] = [ {ORTH: orth, NORM: pron}, {ORTH: "'ve", NORM: "have"}, ] _exc[orth + "ve"] = [ {ORTH: orth, NORM: pron}, {ORTH: "ve", NORM: "have"}, ] for pron in ["you", "we", "they"]: for orth in [pron, pron.title()]: _exc[orth + "'re"] = [ {ORTH: orth, NORM: pron}, {ORTH: "'re", NORM: "are"}, ] _exc[orth + "re"] = [ {ORTH: orth, NORM: pron}, {ORTH: "re", NORM: "are"}, ] for pron in ["he", "she", "it"]: for orth in [pron, pron.title()]: _exc[orth + "'s"] = [ {ORTH: orth, NORM: pron}, {ORTH: "'s", NORM: "'s"}, ] _exc[orth + "s"] = [ {ORTH: orth, NORM: pron}, {ORTH: "s"}, ] # W-words, relative pronouns, prepositions etc. for word, morph in [ ("who", None), ("what", None), ("when", None), ("where", None), ("why", None), ("how", None), ("there", None), ("that", "Number=Sing|Person=3"), ("this", "Number=Sing|Person=3"), ("these", "Number=Plur|Person=3"), ("those", "Number=Plur|Person=3"), ]: for orth in [word, word.title()]: if morph != "Number=Plur|Person=3": _exc[orth + "'s"] = [ {ORTH: orth, NORM: word}, {ORTH: "'s", NORM: "'s"}, ] _exc[orth + "s"] = [{ORTH: orth, NORM: word}, {ORTH: "s"}] _exc[orth + "'ll"] = [ {ORTH: orth, NORM: word}, {ORTH: "'ll", NORM: "will"}, ] _exc[orth + "ll"] = [ {ORTH: orth, NORM: word}, {ORTH: "ll", NORM: "will"}, ] _exc[orth + "'ll've"] = [ {ORTH: orth, NORM: word}, {ORTH: "'ll", NORM: "will"}, {ORTH: "'ve", NORM: "have"}, ] _exc[orth + "llve"] = [ {ORTH: orth, NORM: word}, {ORTH: "ll", NORM: "will"}, {ORTH: "ve", NORM: "have"}, ] if morph != "Number=Sing|Person=3": _exc[orth + "'re"] = [ {ORTH: orth, NORM: word}, {ORTH: "'re", NORM: "are"}, ] _exc[orth + "re"] = [ {ORTH: orth, NORM: word}, {ORTH: "re", NORM: "are"}, ] _exc[orth + "'ve"] = [ {ORTH: orth, NORM: word}, {ORTH: "'ve"}, ] _exc[orth + "ve"] = [ {ORTH: orth}, {ORTH: "ve", NORM: "have"}, ] _exc[orth + "'d"] = [ {ORTH: orth, NORM: word}, {ORTH: "'d", NORM: "'d"}, ] _exc[orth + "d"] = [ {ORTH: orth, NORM: word}, {ORTH: "d", NORM: "'d"}, ] _exc[orth + "'d've"] = [ {ORTH: orth, NORM: word}, {ORTH: "'d", NORM: "would"}, {ORTH: "'ve", NORM: "have"}, ] _exc[orth + "dve"] = [ {ORTH: orth, NORM: word}, {ORTH: "d", NORM: "would"}, {ORTH: "ve", NORM: "have"}, ] # Verbs for verb_data in [ {ORTH: "ca", NORM: "can"}, {ORTH: "could", NORM: "could"}, {ORTH: "do", NORM: "do"}, {ORTH: "does", NORM: "does"}, {ORTH: "did", NORM: "do"}, {ORTH: "had", NORM: "have"}, {ORTH: "may", NORM: "may"}, {ORTH: "might", NORM: "might"}, {ORTH: "must", NORM: "must"}, {ORTH: "need", NORM: "need"}, {ORTH: "ought", NORM: "ought"}, {ORTH: "sha", NORM: "shall"}, {ORTH: "should", NORM: "should"}, {ORTH: "wo", NORM: "will"}, {ORTH: "would", NORM: "would"}, ]: verb_data_tc = dict(verb_data) verb_data_tc[ORTH] = verb_data_tc[ORTH].title() for data in [verb_data, verb_data_tc]: _exc[data[ORTH] + "n't"] = [ dict(data), {ORTH: "n't", NORM: "not"}, ] _exc[data[ORTH] + "nt"] = [ dict(data), {ORTH: "nt", NORM: "not"}, ] _exc[data[ORTH] + "n't've"] = [ dict(data), {ORTH: "n't", NORM: "not"}, {ORTH: "'ve", NORM: "have"}, ] _exc[data[ORTH] + "ntve"] = [ dict(data), {ORTH: "nt", NORM: "not"}, {ORTH: "ve", NORM: "have"}, ] for verb_data in [ {ORTH: "could", NORM: "could"}, {ORTH: "might", NORM: "might"}, {ORTH: "must", NORM: "must"}, {ORTH: "should", NORM: "should"}, {ORTH: "would", NORM: "would"}, ]: verb_data_tc = dict(verb_data) verb_data_tc[ORTH] = verb_data_tc[ORTH].title() for data in [verb_data, verb_data_tc]: _exc[data[ORTH] + "'ve"] = [dict(data), {ORTH: "'ve"}] _exc[data[ORTH] + "ve"] = [dict(data), {ORTH: "ve"}] for verb_data in [ {ORTH: "ai", "number": 2}, {ORTH: "are", NORM: "are", "number": 2}, {ORTH: "is", NORM: "is"}, {ORTH: "was", NORM: "was"}, {ORTH: "were", NORM: "were"}, {ORTH: "have", NORM: "have"}, {ORTH: "has", NORM: "has"}, {ORTH: "dare", NORM: "dare"}, ]: verb_data_tc = dict(verb_data) # type: ignore[call-overload] verb_data_tc[ORTH] = verb_data_tc[ORTH].title() for data in [verb_data, verb_data_tc]: # type: ignore[assignment] _exc[data[ORTH] + "n't"] = [ dict(data), {ORTH: "n't", NORM: "not"}, ] _exc[data[ORTH] + "nt"] = [ dict(data), {ORTH: "nt", NORM: "not"}, ] # Other contractions with trailing apostrophe for exc_data in [ {ORTH: "doin", NORM: "doing"}, {ORTH: "goin", NORM: "going"}, {ORTH: "nothin", NORM: "nothing"}, {ORTH: "nuthin", NORM: "nothing"}, {ORTH: "ol", NORM: "old"}, {ORTH: "somethin", NORM: "something"}, ]: exc_data_tc = dict(exc_data) exc_data_tc[ORTH] = exc_data_tc[ORTH].title() for data in [exc_data, exc_data_tc]: data_apos = dict(data) data_apos[ORTH] = data_apos[ORTH] + "'" _exc[data[ORTH]] = [dict(data)] _exc[data_apos[ORTH]] = [dict(data_apos)] # Other contractions with leading apostrophe for exc_data in [ {ORTH: "em", NORM: "them"}, {ORTH: "ll", NORM: "will"}, {ORTH: "nuff", NORM: "enough"}, ]: exc_data_apos = dict(exc_data) exc_data_apos[ORTH] = "'" + exc_data_apos[ORTH] for data in [exc_data, exc_data_apos]: _exc[data[ORTH]] = [data] # Times for h in range(1, 12 + 1): for period in ["a.m.", "am"]: _exc[f"{h}{period}"] = [ {ORTH: f"{h}"}, {ORTH: period, NORM: "a.m."}, ] for period in ["p.m.", "pm"]: _exc[f"{h}{period}"] = [ {ORTH: f"{h}"}, {ORTH: period, NORM: "p.m."}, ] # Rest _other_exc = { "y'all": [{ORTH: "y'", NORM: "you"}, {ORTH: "all"}], "yall": [{ORTH: "y", NORM: "you"}, {ORTH: "all"}], "how'd'y": [{ORTH: "how"}, {ORTH: "'d"}, {ORTH: "'y", NORM: "you"}], "How'd'y": [{ORTH: "How", NORM: "how"}, {ORTH: "'d"}, {ORTH: "'y", NORM: "you"}], "not've": [{ORTH: "not"}, {ORTH: "'ve", NORM: "have"}], "notve": [{ORTH: "not"}, {ORTH: "ve", NORM: "have"}], "Not've": [{ORTH: "Not", NORM: "not"}, {ORTH: "'ve", NORM: "have"}], "Notve": [{ORTH: "Not", NORM: "not"}, {ORTH: "ve", NORM: "have"}], "cannot": [{ORTH: "can"}, {ORTH: "not"}], "Cannot": [{ORTH: "Can", NORM: "can"}, {ORTH: "not"}], "gonna": [{ORTH: "gon", NORM: "going"}, {ORTH: "na", NORM: "to"}], "Gonna": [{ORTH: "Gon", NORM: "going"}, {ORTH: "na", NORM: "to"}], "gotta": [{ORTH: "got"}, {ORTH: "ta", NORM: "to"}], "Gotta": [{ORTH: "Got", NORM: "got"}, {ORTH: "ta", NORM: "to"}], "let's": [{ORTH: "let"}, {ORTH: "'s", NORM: "us"}], "Let's": [{ORTH: "Let", NORM: "let"}, {ORTH: "'s", NORM: "us"}], "c'mon": [{ORTH: "c'm", NORM: "come"}, {ORTH: "on"}], "C'mon": [{ORTH: "C'm", NORM: "come"}, {ORTH: "on"}], } _exc.update(_other_exc) for exc_data in [ {ORTH: "'S", NORM: "'s"}, {ORTH: "'s", NORM: "'s"}, {ORTH: "\u2018S", NORM: "'s"}, {ORTH: "\u2018s", NORM: "'s"}, {ORTH: "and/or", NORM: "and/or"}, {ORTH: "w/o", NORM: "without"}, {ORTH: "'re", NORM: "are"}, {ORTH: "'Cause", NORM: "because"}, {ORTH: "'cause", NORM: "because"}, {ORTH: "'cos", NORM: "because"}, {ORTH: "'Cos", NORM: "because"}, {ORTH: "'coz", NORM: "because"}, {ORTH: "'Coz", NORM: "because"}, {ORTH: "'cuz", NORM: "because"}, {ORTH: "'Cuz", NORM: "because"}, {ORTH: "'bout", NORM: "about"}, {ORTH: "ma'am", NORM: "madam"}, {ORTH: "Ma'am", NORM: "madam"}, {ORTH: "o'clock", NORM: "o'clock"}, {ORTH: "O'clock", NORM: "o'clock"}, {ORTH: "lovin'", NORM: "loving"}, {ORTH: "Lovin'", NORM: "loving"}, {ORTH: "lovin", NORM: "loving"}, {ORTH: "Lovin", NORM: "loving"}, {ORTH: "havin'", NORM: "having"}, {ORTH: "Havin'", NORM: "having"}, {ORTH: "havin", NORM: "having"}, {ORTH: "Havin", NORM: "having"}, {ORTH: "doin'", NORM: "doing"}, {ORTH: "Doin'", NORM: "doing"}, {ORTH: "doin", NORM: "doing"}, {ORTH: "Doin", NORM: "doing"}, {ORTH: "goin'", NORM: "going"}, {ORTH: "Goin'", NORM: "going"}, {ORTH: "goin", NORM: "going"}, {ORTH: "Goin", NORM: "going"}, {ORTH: "Mt.", NORM: "Mount"}, {ORTH: "Ak.", NORM: "Alaska"}, {ORTH: "Ala.", NORM: "Alabama"}, {ORTH: "Apr.", NORM: "April"}, {ORTH: "Ariz.", NORM: "Arizona"}, {ORTH: "Ark.", NORM: "Arkansas"}, {ORTH: "Aug.", NORM: "August"}, {ORTH: "Calif.", NORM: "California"}, {ORTH: "Colo.", NORM: "Colorado"}, {ORTH: "Conn.", NORM: "Connecticut"}, {ORTH: "Dec.", NORM: "December"}, {ORTH: "Del.", NORM: "Delaware"}, {ORTH: "Feb.", NORM: "February"}, {ORTH: "Fla.", NORM: "Florida"}, {ORTH: "Ga.", NORM: "Georgia"}, {ORTH: "Ia.", NORM: "Iowa"}, {ORTH: "Id.", NORM: "Idaho"}, {ORTH: "Ill.", NORM: "Illinois"}, {ORTH: "Ind.", NORM: "Indiana"}, {ORTH: "Jan.", NORM: "January"}, {ORTH: "Jul.", NORM: "July"}, {ORTH: "Jun.", NORM: "June"}, {ORTH: "Kan.", NORM: "Kansas"}, {ORTH: "Kans.", NORM: "Kansas"}, {ORTH: "Ky.", NORM: "Kentucky"}, {ORTH: "La.", NORM: "Louisiana"}, {ORTH: "Mar.", NORM: "March"}, {ORTH: "Mass.", NORM: "Massachusetts"}, {ORTH: "Mich.", NORM: "Michigan"}, {ORTH: "Minn.", NORM: "Minnesota"}, {ORTH: "Miss.", NORM: "Mississippi"}, {ORTH: "N.C.", NORM: "North Carolina"}, {ORTH: "N.D.", NORM: "North Dakota"}, {ORTH: "N.H.", NORM: "New Hampshire"}, {ORTH: "N.J.", NORM: "New Jersey"}, {ORTH: "N.M.", NORM: "New Mexico"}, {ORTH: "N.Y.", NORM: "New York"}, {ORTH: "Neb.", NORM: "Nebraska"}, {ORTH: "Nebr.", NORM: "Nebraska"}, {ORTH: "Nev.", NORM: "Nevada"}, {ORTH: "Nov.", NORM: "November"}, {ORTH: "Oct.", NORM: "October"}, {ORTH: "Okla.", NORM: "Oklahoma"}, {ORTH: "Ore.", NORM: "Oregon"}, {ORTH: "Pa.", NORM: "Pennsylvania"}, {ORTH: "S.C.", NORM: "South Carolina"}, {ORTH: "Sep.", NORM: "September"}, {ORTH: "Sept.", NORM: "September"}, {ORTH: "Tenn.", NORM: "Tennessee"}, {ORTH: "Va.", NORM: "Virginia"}, {ORTH: "Wash.", NORM: "Washington"}, {ORTH: "Wis.", NORM: "Wisconsin"}, ]: _exc[exc_data[ORTH]] = [exc_data] for orth in [ "'d", "a.m.", "Adm.", "Bros.", "co.", "Co.", "Corp.", "D.C.", "Dr.", "e.g.", "E.g.", "E.G.", "Gen.", "Gov.", "i.e.", "I.e.", "I.E.", "Inc.", "Jr.", "Ltd.", "Md.", "Messrs.", "Mo.", "Mont.", "Mr.", "Mrs.", "Ms.", "p.m.", "Ph.D.", "Prof.", "Rep.", "Rev.", "Sen.", "St.", "vs.", "v.s.", ]: _exc[orth] = [{ORTH: orth}] for string in _exclude: if string in _exc: _exc.pop(string) TOKENIZER_EXCEPTIONS = update_exc(BASE_EXCEPTIONS, _exc)