From c7c21b980fb57ad0235833f70289ce91af7963f7 Mon Sep 17 00:00:00 2001
From: ines
Date: Mon, 8 May 2017 15:47:25 +0200
Subject: [PATCH] Reorganise English language data

---
 spacy/en/__init__.py                     |  31 +-
 spacy/en/lemmatizer/__init__.py          |  27 +-
 .../lookup.py}                           |   4 +-
 spacy/en/lex_attrs.py                    |  23 +
 spacy/en/morph_rules.py                  |   4 +-
 spacy/en/{word_sets.py => stop_words.py} |  21 -
 spacy/en/tag_map.py                      |   3 +-
 spacy/en/tokenizer_exceptions.py         | 711 +++++-------
 8 files changed, 242 insertions(+), 582 deletions(-)
 rename spacy/en/{lemmatization.py => lemmatizer/lookup.py} (99%)
 create mode 100644 spacy/en/lex_attrs.py
 rename spacy/en/{word_sets.py => stop_words.py} (74%)

diff --git a/spacy/en/__init__.py b/spacy/en/__init__.py
index 3298a6822..d6858c799 100644
--- a/spacy/en/__init__.py
+++ b/spacy/en/__init__.py
@@ -1,14 +1,16 @@
 # coding: utf8
 from __future__ import unicode_literals

-from ..language import Language
-from ..lemmatizer import Lemmatizer
-from ..vocab import Vocab
-from ..tokenizer import Tokenizer
-from ..attrs import LANG
-from ..deprecated import fix_glove_vectors_loading
+from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
+from .tag_map import TAG_MAP
+from .stop_words import STOP_WORDS
+from .morph_rules import MORPH_RULES
+from .lemmatizer import LEMMA_RULES, LEMMA_INDEX, LEMMA_EXC

-from .language_data import *
+from ..language_data import BASE_EXCEPTIONS
+from ..language import Language
+from ..attrs import LANG
+from ..util import update_exc


 class English(Language):
@@ -18,20 +20,13 @@ class English(Language):
     lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
     lex_attr_getters[LANG] = lambda text: 'en'

-    tokenizer_exceptions = TOKENIZER_EXCEPTIONS
-    tag_map = TAG_MAP
-    stop_words = STOP_WORDS
-
+    tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS)
+    tag_map = dict(TAG_MAP)
+    stop_words = set(STOP_WORDS)
     morph_rules = dict(MORPH_RULES)
     lemma_rules = dict(LEMMA_RULES)
     lemma_index = dict(LEMMA_INDEX)
     lemma_exc = dict(LEMMA_EXC)

-    def __init__(self, **overrides):
-        # Special-case hack for loading the GloVe vectors, to support <1.0
-        overrides = fix_glove_vectors_loading(overrides)
-        Language.__init__(self, **overrides)
-
-
-EXPORT = English
\ No newline at end of file
+__all__ = ['English']
diff --git a/spacy/en/lemmatizer/__init__.py b/spacy/en/lemmatizer/__init__.py
index 263dcf70a..0b77319d9 100644
--- a/spacy/en/lemmatizer/__init__.py
+++ b/spacy/en/lemmatizer/__init__.py
@@ -1,3 +1,5 @@
+
+from .lookup import LOOKUP
 from ._adjectives import ADJECTIVES
 from ._adjectives_irreg import ADJECTIVES_IRREG
 from ._adverbs import ADVERBS
@@ -9,25 +11,10 @@
 from ._verbs_irreg import VERBS_IRREG
 from ._lemma_rules import ADJECTIVE_RULES, NOUN_RULES, VERB_RULES, PUNCT_RULES


-INDEX = {
-    "adj": ADJECTIVES,
-    "adv": ADVERBS,
-    "noun": NOUNS,
-    "verb": VERBS
-}
+LEMMA_INDEX = {'adj': ADJECTIVES, 'adv': ADVERBS, 'noun': NOUNS, 'verb': VERBS}
+LEMMA_EXC = {'adj': ADJECTIVES_IRREG, 'adv': ADVERBS_IRREG, 'noun': NOUNS_IRREG,
+             'verb': VERBS_IRREG}

-EXC = {
-    "adj": ADJECTIVES_IRREG,
-    "adv": ADVERBS_IRREG,
-    "noun": NOUNS_IRREG,
-    "verb": VERBS_IRREG
-}
-
-
-RULES = {
-    "adj": ADJECTIVE_RULES,
-    "noun": NOUN_RULES,
-    "verb": VERB_RULES,
-    "punct": PUNCT_RULES
-}
+LEMMA_RULES = {'adj': ADJECTIVE_RULES, 'noun': NOUN_RULES, 'verb': VERB_RULES,
+               'punct': PUNCT_RULES}
diff --git a/spacy/en/lemmatization.py b/spacy/en/lemmatizer/lookup.py
similarity index 99%
rename from spacy/en/lemmatization.py
rename to spacy/en/lemmatizer/lookup.py
index 646f60673..86c1a89d3 100644
--- a/spacy/en/lemmatization.py
+++ b/spacy/en/lemmatizer/lookup.py
@@ -1,7 +1,7 @@
 # coding: utf8
 from __future__ import unicode_literals

-LOOK_UP = {
+LOOKUP = {
     " furtherst": "further",
     " skilled": "skill",
     "'cause": "because",
@@ -41585,4 +41585,4 @@ LOOK_UP = {
     "zoospores": "zoospore",
     "zucchinis": "zucchini",
     "zygotes": "zygote"
-}
\ No newline at end of file
+}
diff --git a/spacy/en/lex_attrs.py b/spacy/en/lex_attrs.py
new file mode 100644
index 000000000..450c04f34
--- /dev/null
+++ b/spacy/en/lex_attrs.py
@@ -0,0 +1,23 @@
+# coding: utf8
+from __future__ import unicode_literals
+
+
+# Number words
+
+NUM_WORDS = set("""
+zero one two three four five six seven eight nine ten eleven twelve thirteen
+fourteen fifteen sixteen seventeen eighteen nineteen twenty thirty forty fifty
+sixty seventy eighty ninety hundred thousand million billion trillion
+quadrillion gajillion bazillion
+""".split())
+
+
+# Ordinal words
+
+ORDINAL_WORDS = set("""
+first second third fourth fifth sixth seventh eighth ninth tenth eleventh twelfth
+thirteenth fourteenth fifteenth sixteenth seventeenth eighteenth nineteenth
+twentieth thirtieth fortieth fiftieth sixtieth seventieth eightieth ninetieth
+hundredth thousandth millionth billionth trillionth quadrillionth gajillionth
+bazillionth
+""".split())
diff --git a/spacy/en/morph_rules.py b/spacy/en/morph_rules.py
index 51a50736e..8e1c9e082 100644
--- a/spacy/en/morph_rules.py
+++ b/spacy/en/morph_rules.py
@@ -1,8 +1,8 @@
 # coding: utf8
 from __future__ import unicode_literals

-from ..symbols import *
-from ..language_data import PRON_LEMMA
+from ..symbols import LEMMA
+from ..deprecated import PRON_LEMMA


 MORPH_RULES = {
diff --git a/spacy/en/word_sets.py b/spacy/en/stop_words.py
similarity index 74%
rename from spacy/en/word_sets.py
rename to spacy/en/stop_words.py
index deb5dc44b..640940fea 100644
--- a/spacy/en/word_sets.py
+++ b/spacy/en/stop_words.py
@@ -67,24 +67,3 @@
 whither who whoever whole whom whose why will with within without would
 yet you your yours yourself yourselves
 """.split())
-
-
-# Number words
-
-NUM_WORDS = set("""
-zero one two three four five six seven eight nine ten eleven twelve thirteen
-fourteen fifteen sixteen seventeen eighteen nineteen twenty thirty forty fifty
-sixty seventy eighty ninety hundred thousand million billion trillion
-quadrillion gajillion bazillion
-""".split())
-
-
-# Ordinal words
-
-ORDINAL_WORDS = set("""
-first second third fourth fifth sixth seventh eigth ninth tenth eleventh twelveth
-thirteenth fourteenth fifteenth sixteenth sventeenth eighteenth nineteenth
-twentieth thirtieth fortieth fiftieth sixtieth seventieth eightieth ninetieth
-hundreth thousandth millionth billionth trillionth quadrillionth gajillionth
-bazillionth
-""".split())
diff --git a/spacy/en/tag_map.py b/spacy/en/tag_map.py
index ea14f7d4d..92c171904 100644
--- a/spacy/en/tag_map.py
+++ b/spacy/en/tag_map.py
@@ -1,7 +1,8 @@
 # coding: utf8
 from __future__ import unicode_literals

-from ..symbols import *
+from ..symbols import POS, PUNCT, SYM, ADJ, CCONJ, NUM, DET, ADV, ADP, X, VERB
+from ..symbols import NOUN, PROPN, PART, INTJ, SPACE, PRON


 TAG_MAP = {
diff --git a/spacy/en/tokenizer_exceptions.py b/spacy/en/tokenizer_exceptions.py
index 3d009241b..6c62fc752 100644
--- a/spacy/en/tokenizer_exceptions.py
+++ b/spacy/en/tokenizer_exceptions.py
@@ -1,13 +1,12 @@
 # coding: utf8
 from __future__ import unicode_literals

-from ..symbols import *
-from ..language_data import PRON_LEMMA
+from ..symbols import ORTH, LEMMA, TAG, NORM
+from ..deprecated import PRON_LEMMA


-EXC = {}
-
-EXCLUDE_EXC = ["Ill", "ill", "Its", "its", "Hell", "hell", "Shell", "shell",
+_exc = {}
+_exclude = ["Ill", "ill", "Its", "its", "Hell", "hell", "Shell", "shell",
             "Shed", "shed", "were", "Were", "Well", "well", "Whore", "whore"]
@@ -15,193 +14,160 @@ EXCLUDE_EXC = ["Ill", "ill", "Its", "its", "Hell", "hell", "Shell", "shell",

 for pron in ["i"]:
     for orth in [pron, pron.title()]:
-        EXC[orth + "'m"] = [
+        _exc[orth + "'m"] = [
             {ORTH: orth, LEMMA: PRON_LEMMA, TAG: "PRP"},
-            {ORTH: "'m", LEMMA: "be", TAG: "VBP", "tenspect": 1, "number": 1}
-        ]
+            {ORTH: "'m", LEMMA: "be", TAG: "VBP", "tenspect": 1, "number": 1}]

-        EXC[orth + "m"] = [
+        _exc[orth + "m"] = [
             {ORTH: orth, LEMMA: PRON_LEMMA, TAG: "PRP"},
-            {ORTH: "m", LEMMA: "be", TAG: "VBP", "tenspect": 1, "number": 1 }
-        ]
+            {ORTH: "m", LEMMA: "be", TAG: "VBP", "tenspect": 1, "number": 1}]

-        EXC[orth + "'ma"] = [
+        _exc[orth + "'ma"] = [
             {ORTH: orth, LEMMA: PRON_LEMMA, TAG: "PRP"},
             {ORTH: "'m", LEMMA: "be", NORM: "am"},
-            {ORTH: "a", LEMMA: "going to", NORM: "gonna"}
-        ]
+            {ORTH: "a", LEMMA: "going to", NORM: "gonna"}]

-        EXC[orth + "ma"] = [
+        _exc[orth + "ma"] = [
             {ORTH: orth, LEMMA: PRON_LEMMA, TAG: "PRP"},
             {ORTH: "m", LEMMA: "be", NORM: "am"},
-            {ORTH: "a", LEMMA: "going to", NORM: "gonna"}
-        ]
+            {ORTH: "a", LEMMA: "going to", NORM: "gonna"}]


 for pron in ["i", "you", "he", "she", "it", "we", "they"]:
     for orth in [pron, pron.title()]:
-        EXC[orth + "'ll"] = [
+        _exc[orth + "'ll"] = [
             {ORTH: orth, LEMMA: PRON_LEMMA, TAG: "PRP"},
-            {ORTH: "'ll", LEMMA: "will", TAG: "MD"}
-        ]
+            {ORTH: "'ll", LEMMA: "will", TAG: "MD"}]

-        EXC[orth + "ll"] = [
+        _exc[orth + "ll"] = [
             {ORTH: orth, LEMMA: PRON_LEMMA, TAG: "PRP"},
-            {ORTH: "ll", LEMMA: "will", TAG: "MD"}
-        ]
+            {ORTH: "ll", LEMMA: "will", TAG: "MD"}]

-        EXC[orth + "'ll've"] = [
+        _exc[orth + "'ll've"] = [
             {ORTH: orth, LEMMA: PRON_LEMMA, TAG: "PRP"},
             {ORTH: "'ll", LEMMA: "will", TAG: "MD"},
-            {ORTH: "'ve", LEMMA: "have", TAG: "VB"}
-        ]
+            {ORTH: "'ve", LEMMA: "have", TAG: "VB"}]

-        EXC[orth + "llve"] = [
+        _exc[orth + "llve"] = [
             {ORTH: orth, LEMMA: PRON_LEMMA, TAG: "PRP"},
             {ORTH: "ll", LEMMA: "will", TAG: "MD"},
-            {ORTH: "ve", LEMMA: "have", TAG: "VB"}
-        ]
+            {ORTH: "ve", LEMMA: "have", TAG: "VB"}]

-        EXC[orth + "'d"] = [
+        _exc[orth + "'d"] = [
             {ORTH: orth, LEMMA: PRON_LEMMA, TAG: "PRP"},
-            {ORTH: "'d", LEMMA: "would", TAG: "MD"}
-        ]
+            {ORTH: "'d", LEMMA: "would", TAG: "MD"}]

-        EXC[orth + "d"] = [
+        _exc[orth + "d"] = [
             {ORTH: orth, LEMMA: PRON_LEMMA, TAG: "PRP"},
-            {ORTH: "d", LEMMA: "would", TAG: "MD"}
-        ]
+            {ORTH: "d", LEMMA: "would", TAG: "MD"}]

-        EXC[orth + "'d've"] = [
+        _exc[orth + "'d've"] = [
             {ORTH: orth, LEMMA: PRON_LEMMA, TAG: "PRP"},
             {ORTH: "'d", LEMMA: "would", TAG: "MD"},
-            {ORTH: "'ve", LEMMA: "have", TAG: "VB"}
-        ]
+            {ORTH: "'ve", LEMMA: "have", TAG: "VB"}]

-        EXC[orth + "dve"] = [
+        _exc[orth + "dve"] = [
             {ORTH: orth, LEMMA: PRON_LEMMA, TAG: "PRP"},
             {ORTH: "d", LEMMA: "would", TAG: "MD"},
-            {ORTH: "ve", LEMMA: "have", TAG: "VB"}
-        ]
+            {ORTH: "ve", LEMMA: "have", TAG: "VB"}]


 for pron in ["i", "you", "we", "they"]:
     for orth in [pron, pron.title()]:
-        EXC[orth + "'ve"] = [
+        _exc[orth + "'ve"] = [
             {ORTH: orth, LEMMA: PRON_LEMMA, TAG: "PRP"},
-            {ORTH: "'ve", LEMMA: "have", TAG: "VB"}
-        ]
+            {ORTH: "'ve", LEMMA: "have", TAG: "VB"}]

-        EXC[orth + "ve"] = [
+        _exc[orth + "ve"] = [
             {ORTH: orth, LEMMA: PRON_LEMMA, TAG: "PRP"},
-            {ORTH: "ve", LEMMA: "have", TAG: "VB"}
-        ]
+            {ORTH: "ve", LEMMA: "have", TAG: "VB"}]


 for pron in ["you", "we", "they"]:
     for orth in [pron, pron.title()]:
-        EXC[orth + "'re"] = [
+        _exc[orth + "'re"] = [
             {ORTH: orth, LEMMA: PRON_LEMMA, TAG: "PRP"},
-            {ORTH: "'re", LEMMA: "be", NORM: "are"}
-        ]
+            {ORTH: "'re", LEMMA: "be", NORM: "are"}]

-        EXC[orth + "re"] = [
+        _exc[orth + "re"] = [
             {ORTH: orth, LEMMA: PRON_LEMMA, TAG: "PRP"},
-            {ORTH: "re", LEMMA: "be", NORM: "are", TAG: "VBZ"}
-        ]
+            {ORTH: "re", LEMMA: "be", NORM: "are", TAG: "VBZ"}]


 for pron in ["he", "she", "it"]:
     for orth in [pron, pron.title()]:
-        EXC[orth + "'s"] = [
+        _exc[orth + "'s"] = [
             {ORTH: orth, LEMMA: PRON_LEMMA, TAG: "PRP"},
-            {ORTH: "'s"}
-        ]
+            {ORTH: "'s"}]

-        EXC[orth + "s"] = [
+        _exc[orth + "s"] = [
             {ORTH: orth, LEMMA: PRON_LEMMA, TAG: "PRP"},
-            {ORTH: "s"}
-        ]
-
+            {ORTH: "s"}]

 # W-words, relative pronouns, prepositions etc.

 for word in ["who", "what", "when", "where", "why", "how", "there", "that"]:
     for orth in [word, word.title()]:
-        EXC[orth + "'s"] = [
+        _exc[orth + "'s"] = [
             {ORTH: orth, LEMMA: word},
-            {ORTH: "'s"}
-        ]
+            {ORTH: "'s"}]

-        EXC[orth + "s"] = [
+        _exc[orth + "s"] = [
             {ORTH: orth, LEMMA: word},
-            {ORTH: "s"}
-        ]
+            {ORTH: "s"}]

-        EXC[orth + "'ll"] = [
+        _exc[orth + "'ll"] = [
             {ORTH: orth, LEMMA: word},
-            {ORTH: "'ll", LEMMA: "will", TAG: "MD"}
-        ]
+            {ORTH: "'ll", LEMMA: "will", TAG: "MD"}]

-        EXC[orth + "ll"] = [
+        _exc[orth + "ll"] = [
             {ORTH: orth, LEMMA: word},
-            {ORTH: "ll", LEMMA: "will", TAG: "MD"}
-        ]
+            {ORTH: "ll", LEMMA: "will", TAG: "MD"}]

-        EXC[orth + "'ll've"] = [
+        _exc[orth + "'ll've"] = [
             {ORTH: orth, LEMMA: word},
             {ORTH: "'ll", LEMMA: "will", TAG: "MD"},
-            {ORTH: "'ve", LEMMA: "have", TAG: "VB"}
-        ]
+            {ORTH: "'ve", LEMMA: "have", TAG: "VB"}]

-        EXC[orth + "llve"] = [
+        _exc[orth + "llve"] = [
             {ORTH: orth, LEMMA: word},
             {ORTH: "ll", LEMMA: "will", TAG: "MD"},
-            {ORTH: "ve", LEMMA: "have", TAG: "VB"}
-        ]
+            {ORTH: "ve", LEMMA: "have", TAG: "VB"}]

-        EXC[orth + "'re"] = [
+        _exc[orth + "'re"] = [
             {ORTH: orth, LEMMA: word},
-            {ORTH: "'re", LEMMA: "be", NORM: "are"}
-        ]
+            {ORTH: "'re", LEMMA: "be", NORM: "are"}]

-        EXC[orth + "re"] = [
+        _exc[orth + "re"] = [
             {ORTH: orth, LEMMA: word},
-            {ORTH: "re", LEMMA: "be", NORM: "are"}
-        ]
+            {ORTH: "re", LEMMA: "be", NORM: "are"}]

-        EXC[orth + "'ve"] = [
+        _exc[orth + "'ve"] = [
             {ORTH: orth},
-            {ORTH: "'ve", LEMMA: "have", TAG: "VB"}
-        ]
+            {ORTH: "'ve", LEMMA: "have", TAG: "VB"}]

-        EXC[orth + "ve"] = [
+        _exc[orth + "ve"] = [
             {ORTH: orth, LEMMA: word},
-            {ORTH: "'ve", LEMMA: "have", TAG: "VB"}
-        ]
+            {ORTH: "ve", LEMMA: "have", TAG: "VB"}]

-        EXC[orth + "'d"] = [
+        _exc[orth + "'d"] = [
             {ORTH: orth, LEMMA: word},
-            {ORTH: "'d"}
-        ]
+            {ORTH: "'d"}]

-        EXC[orth + "d"] = [
+        _exc[orth + "d"] = [
             {ORTH: orth, LEMMA: word},
-            {ORTH: "d"}
-        ]
+            {ORTH: "d"}]

-        EXC[orth + "'d've"] = [
+        _exc[orth + "'d've"] = [
             {ORTH: orth, LEMMA: word},
             {ORTH: "'d", LEMMA: "would", TAG: "MD"},
-            {ORTH: "'ve", LEMMA: "have", TAG: "VB"}
-        ]
+            {ORTH: "'ve", LEMMA: "have", TAG: "VB"}]

-        EXC[orth + "dve"] = [
+        _exc[orth + "dve"] = [
             {ORTH: orth, LEMMA: word},
             {ORTH: "d", LEMMA: "would", TAG: "MD"},
-            {ORTH: "ve", LEMMA: "have", TAG: "VB"}
-        ]
+            {ORTH: "ve", LEMMA: "have", TAG: "VB"}]


 # Verbs
@@ -221,54 +187,44 @@ for verb_data in [
     {ORTH: "sha", LEMMA: "shall", TAG: "MD"},
     {ORTH: "should", TAG: "MD"},
     {ORTH: "wo", LEMMA: "will", TAG: "MD"},
-    {ORTH: "would", TAG: "MD"}
-]:
+    {ORTH: "would", TAG: "MD"}]:
     verb_data_tc = dict(verb_data)
     verb_data_tc[ORTH] = verb_data_tc[ORTH].title()
-
     for data in [verb_data, verb_data_tc]:
-        EXC[data[ORTH] + "n't"] = [
+        _exc[data[ORTH] + "n't"] = [
             dict(data),
-            {ORTH: "n't", LEMMA: "not", TAG: "RB"}
-        ]
+            {ORTH: "n't", LEMMA: "not", TAG: "RB"}]

-        EXC[data[ORTH] + "nt"] = [
+        _exc[data[ORTH] + "nt"] = [
             dict(data),
-            {ORTH: "nt", LEMMA: "not", TAG: "RB"}
-        ]
+            {ORTH: "nt", LEMMA: "not", TAG: "RB"}]

-        EXC[data[ORTH] + "n't've"] = [
+        _exc[data[ORTH] + "n't've"] = [
             dict(data),
             {ORTH: "n't", LEMMA: "not", TAG: "RB"},
-            {ORTH: "'ve", LEMMA: "have", TAG: "VB"}
-        ]
+            {ORTH: "'ve", LEMMA: "have", TAG: "VB"}]

-        EXC[data[ORTH] + "ntve"] = [
+        _exc[data[ORTH] + "ntve"] = [
             dict(data),
             {ORTH: "nt", LEMMA: "not", TAG: "RB"},
-            {ORTH: "ve", LEMMA: "have", TAG: "VB"}
-        ]
+            {ORTH: "ve", LEMMA: "have", TAG: "VB"}]


 for verb_data in [
     {ORTH: "could", TAG: "MD"},
     {ORTH: "might"},
     {ORTH: "must"},
-    {ORTH: "should"}
-]:
+    {ORTH: "should"}]:
     verb_data_tc = dict(verb_data)
     verb_data_tc[ORTH] = verb_data_tc[ORTH].title()
-
     for data in [verb_data, verb_data_tc]:
-        EXC[data[ORTH] + "'ve"] = [
+        _exc[data[ORTH] + "'ve"] = [
             dict(data),
-            {ORTH: "'ve", LEMMA: "have", TAG: "VB"}
-        ]
+            {ORTH: "'ve", LEMMA: "have", TAG: "VB"}]

-        EXC[data[ORTH] + "ve"] = [
+        _exc[data[ORTH] + "ve"] = [
             dict(data),
-            {ORTH: "ve", LEMMA: "have", TAG: "VB"}
-        ]
+            {ORTH: "ve", LEMMA: "have", TAG: "VB"}]


 for verb_data in [
@@ -276,22 +232,17 @@
     {ORTH: "are", LEMMA: "be", TAG: "VBP", "number": 2},
     {ORTH: "is", LEMMA: "be", TAG: "VBZ"},
     {ORTH: "was", LEMMA: "be"},
-    {ORTH: "were", LEMMA: "be"}
-]:
+    {ORTH: "were", LEMMA: "be"}]:
     verb_data_tc = dict(verb_data)
     verb_data_tc[ORTH] = verb_data_tc[ORTH].title()
-
     for data in [verb_data, verb_data_tc]:
-        EXC[data[ORTH] + "n't"] = [
+        _exc[data[ORTH] + "n't"] = [
             dict(data),
-            {ORTH: "n't", LEMMA: "not", TAG: "RB"}
-        ]
+            {ORTH: "n't", LEMMA: "not", TAG: "RB"}]

-        EXC[data[ORTH] + "nt"] = [
+        _exc[data[ORTH] + "nt"] = [
             dict(data),
-            {ORTH: "nt", LEMMA: "not", TAG: "RB"}
-        ]
-
+            {ORTH: "nt", LEMMA: "not", TAG: "RB"}]


 # Other contractions with trailing apostrophe
@@ -302,22 +253,14 @@
 for exc_data in [
     {ORTH: "doin", LEMMA: "do", NORM: "doing"},
     {ORTH: "goin", LEMMA: "go", NORM: "going"},
     {ORTH: "nothin", LEMMA: "nothing"},
     {ORTH: "nuthin", LEMMA: "nothing"},
     {ORTH: "ol", LEMMA: "old"},
-    {ORTH: "somethin", LEMMA: "something"}
-]:
+    {ORTH: "somethin", LEMMA: "something"}]:
     exc_data_tc = dict(exc_data)
     exc_data_tc[ORTH] = exc_data_tc[ORTH].title()
-
     for data in [exc_data, exc_data_tc]:
         data_apos = dict(data)
         data_apos[ORTH] = data_apos[ORTH] + "'"
-
-        EXC[data[ORTH]] = [
-            dict(data)
-        ]
-
-        EXC[data_apos[ORTH]] = [
-            dict(data_apos)
-        ]
+        _exc[data[ORTH]] = [dict(data)]
+        _exc[data_apos[ORTH]] = [dict(data_apos)]


 # Other contractions with leading apostrophe
@@ -326,449 +269,181 @@
 for exc_data in [
     {ORTH: "cause", LEMMA: "because"},
     {ORTH: "em", LEMMA: PRON_LEMMA, NORM: "them"},
     {ORTH: "ll", LEMMA: "will"},
-    {ORTH: "nuff", LEMMA: "enough"}
-]:
+    {ORTH: "nuff", LEMMA: "enough"}]:
     exc_data_apos = dict(exc_data)
     exc_data_apos[ORTH] = "'" + exc_data_apos[ORTH]
-
     for data in [exc_data, exc_data_apos]:
-        EXC[data[ORTH]] = [
-            dict(data)
-        ]
+        _exc[data[ORTH]] = [dict(data)]


 # Times

 for h in range(1, 12 + 1):
     hour = str(h)
-
     for period in ["a.m.", "am"]:
-        EXC[hour + period] = [
+        _exc[hour+period] = [
             {ORTH: hour},
-            {ORTH: period, LEMMA: "a.m."}
-        ]
+            {ORTH: period, LEMMA: "a.m."}]

     for period in ["p.m.", "pm"]:
-        EXC[hour + period] = [
+        _exc[hour+period] = [
             {ORTH: hour},
-            {ORTH: period, LEMMA: "p.m."}
-        ]
+            {ORTH: period, LEMMA: "p.m."}]


 # Rest

-OTHER = {
-    " ": [
-        {ORTH: " ", TAG: "SP"}
-    ],
-
-    "\u00a0": [
-        {ORTH: "\u00a0", TAG: "SP", LEMMA: " "}
-    ],
-
-    "'S": [
-        {ORTH: "'S", LEMMA: "'s"}
-    ],
-
-    "'s": [
-        {ORTH: "'s", LEMMA: "'s"}
-    ],
-
-    "'re": [
-        {ORTH: "'re", LEMMA: "be", NORM: "are"}
-    ],
-
-    "\u2018S": [
-        {ORTH: "\u2018S", LEMMA: "'s"}
-    ],
-
-    "\u2018s": [
-        {ORTH: "\u2018s", LEMMA: "'s"}
-    ],
-
-    "and/or": [
-        {ORTH: "and/or", LEMMA: "and/or", TAG: "CC"}
-    ],
-
-    "'Cause": [
-        {ORTH: "'Cause", LEMMA: "because"}
-    ],
-
+_other_exc = {
     "y'all": [
         {ORTH: "y'", LEMMA: PRON_LEMMA, NORM: "you"},
-        {ORTH: "all"}
-    ],
+        {ORTH: "all"}],

     "yall": [
         {ORTH: "y", LEMMA: PRON_LEMMA, NORM: "you"},
-        {ORTH: "all"}
-    ],
-
-    "ma'am": [
-        {ORTH: "ma'am", LEMMA: "madam"}
-    ],
-
-    "Ma'am": [
-        {ORTH: "Ma'am", LEMMA: "madam"}
-    ],
-
-    "o'clock": [
-        {ORTH: "o'clock", LEMMA: "o'clock"}
-    ],
-
-    "O'clock": [
-        {ORTH: "O'clock", LEMMA: "o'clock"}
-    ],
+        {ORTH: "all"}],

     "how'd'y": [
         {ORTH: "how", LEMMA: "how"},
         {ORTH: "'d", LEMMA: "do"},
-        {ORTH: "'y", LEMMA: PRON_LEMMA, NORM: "you"}
-    ],
+        {ORTH: "'y", LEMMA: PRON_LEMMA, NORM: "you"}],

     "How'd'y": [
         {ORTH: "How", LEMMA: "how"},
         {ORTH: "'d", LEMMA: "do"},
-        {ORTH: "'y", LEMMA: PRON_LEMMA, NORM: "you"}
-    ],
+        {ORTH: "'y", LEMMA: PRON_LEMMA, NORM: "you"}],

     "not've": [
         {ORTH: "not", LEMMA: "not", TAG: "RB"},
-        {ORTH: "'ve", LEMMA: "have", TAG: "VB"}
-    ],
+        {ORTH: "'ve", LEMMA: "have", TAG: "VB"}],

     "notve": [
         {ORTH: "not", LEMMA: "not", TAG: "RB"},
-        {ORTH: "ve", LEMMA: "have", TAG: "VB"}
-    ],
+        {ORTH: "ve", LEMMA: "have", TAG: "VB"}],

     "Not've": [
         {ORTH: "Not", LEMMA: "not", TAG: "RB"},
-        {ORTH: "'ve", LEMMA: "have", TAG: "VB"}
-    ],
+        {ORTH: "'ve", LEMMA: "have", TAG: "VB"}],

     "Notve": [
         {ORTH: "Not", LEMMA: "not", TAG: "RB"},
-        {ORTH: "ve", LEMMA: "have", TAG: "VB"}
-    ],
+        {ORTH: "ve", LEMMA: "have", TAG: "VB"}],

     "cannot": [
         {ORTH: "can", LEMMA: "can", TAG: "MD"},
-        {ORTH: "not", LEMMA: "not", TAG: "RB"}
-    ],
+        {ORTH: "not", LEMMA: "not", TAG: "RB"}],

     "Cannot": [
         {ORTH: "Can", LEMMA: "can", TAG: "MD"},
-        {ORTH: "not", LEMMA: "not", TAG: "RB"}
-    ],
+        {ORTH: "not", LEMMA: "not", TAG: "RB"}],

     "gonna": [
         {ORTH: "gon", LEMMA: "go", NORM: "going"},
-        {ORTH: "na", LEMMA: "to"}
-    ],
+        {ORTH: "na", LEMMA: "to"}],

     "Gonna": [
         {ORTH: "Gon", LEMMA: "go", NORM: "going"},
-        {ORTH: "na", LEMMA: "to"}
-    ],
+        {ORTH: "na", LEMMA: "to"}],

     "gotta": [
         {ORTH: "got"},
-        {ORTH: "ta", LEMMA: "to"}
-    ],
+        {ORTH: "ta", LEMMA: "to"}],

     "Gotta": [
         {ORTH: "Got"},
-        {ORTH: "ta", LEMMA: "to"}
-    ],
+        {ORTH: "ta", LEMMA: "to"}],

     "let's": [
         {ORTH: "let"},
-        {ORTH: "'s", LEMMA: PRON_LEMMA, NORM: "us"}
-    ],
+        {ORTH: "'s", LEMMA: PRON_LEMMA, NORM: "us"}],

     "Let's": [
         {ORTH: "Let", LEMMA: "let"},
-        {ORTH: "'s", LEMMA: PRON_LEMMA, NORM: "us"}
-    ],
-
-    "\u2014": [
-        {ORTH: "\u2014", TAG: ":", LEMMA: "--"}
-    ],
-
-    "\n": [
-        {ORTH: "\n", TAG: "SP"}
-    ],
-
-    "\t": [
-        {ORTH: "\t", TAG: "SP"}
-    ]
+        {ORTH: "'s", LEMMA: PRON_LEMMA, NORM: "us"}]
 }

-
-# Abbreviations
-
-ABBREVIATIONS = {
-    "Mt.": [
-        {ORTH: "Mt.", LEMMA: "Mount"}
-    ],
-
-    "Ak.": [
-        {ORTH: "Ak.", LEMMA: "Alaska"}
-    ],
-
-    "Ala.": [
-        {ORTH: "Ala.", LEMMA: "Alabama"}
-    ],
-
-    "Apr.": [
-        {ORTH: "Apr.", LEMMA: "April"}
-    ],
-
-    "Ariz.": [
-        {ORTH: "Ariz.", LEMMA: "Arizona"}
-    ],
-
-    "Ark.": [
-        {ORTH: "Ark.", LEMMA: "Arkansas"}
-    ],
-
-    "Aug.": [
-        {ORTH: "Aug.", LEMMA: "August"}
-    ],
-
-    "Calif.": [
-        {ORTH: "Calif.", LEMMA: "California"}
-    ],
-
-    "Colo.": [
-        {ORTH: "Colo.", LEMMA: "Colorado"}
-    ],
-
-    "Conn.": [
-        {ORTH: "Conn.", LEMMA: "Connecticut"}
-    ],
-
-    "Dec.": [
-        {ORTH: "Dec.", LEMMA: "December"}
-    ],
-
-    "Del.": [
-        {ORTH: "Del.", LEMMA: "Delaware"}
-    ],
-
-    "Feb.": [
-        {ORTH: "Feb.", LEMMA: "February"}
-    ],
-
-    "Fla.": [
-        {ORTH: "Fla.", LEMMA: "Florida"}
-    ],
-
-    "Ga.": [
-        {ORTH: "Ga.", LEMMA: "Georgia"}
-    ],
-
-    "Ia.": [
-        {ORTH: "Ia.", LEMMA: "Iowa"}
-    ],
-
-    "Id.": [
-        {ORTH: "Id.", LEMMA: "Idaho"}
-    ],
-
-    "Ill.": [
-        {ORTH: "Ill.", LEMMA: "Illinois"}
-    ],
-
-    "Ind.": [
-        {ORTH: "Ind.", LEMMA: "Indiana"}
-    ],
-
-    "Jan.": [
-        {ORTH: "Jan.", LEMMA: "January"}
-    ],
-
-    "Jul.": [
-        {ORTH: "Jul.", LEMMA: "July"}
-    ],
-
-    "Jun.": [
-        {ORTH: "Jun.", LEMMA: "June"}
-    ],
-
-    "Kan.": [
-        {ORTH: "Kan.", LEMMA: "Kansas"}
-    ],
-
-    "Kans.": [
-        {ORTH: "Kans.", LEMMA: "Kansas"}
-    ],
-
-    "Ky.": [
-        {ORTH: "Ky.", LEMMA: "Kentucky"}
-    ],
-
-    "La.": [
-        {ORTH: "La.", LEMMA: "Louisiana"}
-    ],
-
-    "Mar.": [
-        {ORTH: "Mar.", LEMMA: "March"}
-    ],
-
-    "Mass.": [
-        {ORTH: "Mass.", LEMMA: "Massachusetts"}
-    ],
-
-    "May.": [
-        {ORTH: "May.", LEMMA: "May"}
-    ],
-
-    "Mich.": [
-        {ORTH: "Mich.", LEMMA: "Michigan"}
-    ],
-
-    "Minn.": [
-        {ORTH: "Minn.", LEMMA: "Minnesota"}
-    ],
-
-    "Miss.": [
-        {ORTH: "Miss.", LEMMA: "Mississippi"}
-    ],
-
-    "N.C.": [
-        {ORTH: "N.C.", LEMMA: "North Carolina"}
-    ],
-
-    "N.D.": [
-        {ORTH: "N.D.", LEMMA: "North Dakota"}
-    ],
-
-    "N.H.": [
-        {ORTH: "N.H.", LEMMA: "New Hampshire"}
-    ],
-
-    "N.J.": [
-        {ORTH: "N.J.", LEMMA: "New Jersey"}
-    ],
-
-    "N.M.": [
-        {ORTH: "N.M.", LEMMA: "New Mexico"}
-    ],
-
-    "N.Y.": [
-        {ORTH: "N.Y.", LEMMA: "New York"}
-    ],
-
-    "Neb.": [
-        {ORTH: "Neb.", LEMMA: "Nebraska"}
-    ],
-
-    "Nebr.": [
-        {ORTH: "Nebr.", LEMMA: "Nebraska"}
-    ],
-
-    "Nev.": [
-        {ORTH: "Nev.", LEMMA: "Nevada"}
-    ],
-
-    "Nov.": [
-        {ORTH: "Nov.", LEMMA: "November"}
-    ],
-
-    "Oct.": [
-        {ORTH: "Oct.", LEMMA: "October"}
-    ],
-
-    "Okla.": [
-        {ORTH: "Okla.", LEMMA: "Oklahoma"}
-    ],
-
-    "Ore.": [
-        {ORTH: "Ore.", LEMMA: "Oregon"}
-    ],
-
-    "Pa.": [
-        {ORTH: "Pa.", LEMMA: "Pennsylvania"}
-    ],
-
-    "S.C.": [
-        {ORTH: "S.C.", LEMMA: "South Carolina"}
-    ],
-
-    "Sep.": [
-        {ORTH: "Sep.", LEMMA: "September"}
-    ],
-
-    "Sept.": [
-        {ORTH: "Sept.", LEMMA: "September"}
-    ],
-
-    "Tenn.": [
-        {ORTH: "Tenn.", LEMMA: "Tennessee"}
-    ],
-
-    "Va.": [
-        {ORTH: "Va.", LEMMA: "Virginia"}
-    ],
-
-    "Wash.": [
-        {ORTH: "Wash.", LEMMA: "Washington"}
-    ],
-
-    "Wis.": [
-        {ORTH: "Wis.", LEMMA: "Wisconsin"}
-    ]
-}
+_exc.update(_other_exc)
+
+
+for exc_data in [
+    {ORTH: "'S", LEMMA: "'s"},
+    {ORTH: "'s", LEMMA: "'s"},
+    {ORTH: "\u2018S", LEMMA: "'s"},
+    {ORTH: "\u2018s", LEMMA: "'s"},
+    {ORTH: "and/or", LEMMA: "and/or", TAG: "CC"},
+    {ORTH: "'re", LEMMA: "be", NORM: "are"},
+    {ORTH: "'Cause", LEMMA: "because"},
+    {ORTH: "'cause", LEMMA: "because"},
+    {ORTH: "ma'am", LEMMA: "madam"},
+    {ORTH: "Ma'am", LEMMA: "madam"},
+    {ORTH: "o'clock", LEMMA: "o'clock"},
+    {ORTH: "O'clock", LEMMA: "o'clock"},
+
+    {ORTH: "Mt.", LEMMA: "Mount"},
+    {ORTH: "Ak.", LEMMA: "Alaska"},
+    {ORTH: "Ala.", LEMMA: "Alabama"},
+    {ORTH: "Apr.", LEMMA: "April"},
+    {ORTH: "Ariz.", LEMMA: "Arizona"},
+    {ORTH: "Ark.", LEMMA: "Arkansas"},
+    {ORTH: "Aug.", LEMMA: "August"},
+    {ORTH: "Calif.", LEMMA: "California"},
+    {ORTH: "Colo.", LEMMA: "Colorado"},
+    {ORTH: "Conn.", LEMMA: "Connecticut"},
+    {ORTH: "Dec.", LEMMA: "December"},
+    {ORTH: "Del.", LEMMA: "Delaware"},
+    {ORTH: "Feb.", LEMMA: "February"},
+    {ORTH: "Fla.", LEMMA: "Florida"},
+    {ORTH: "Ga.", LEMMA: "Georgia"},
+    {ORTH: "Ia.", LEMMA: "Iowa"},
+    {ORTH: "Id.", LEMMA: "Idaho"},
+    {ORTH: "Ill.", LEMMA: "Illinois"},
+    {ORTH: "Ind.", LEMMA: "Indiana"},
+    {ORTH: "Jan.", LEMMA: "January"},
+    {ORTH: "Jul.", LEMMA: "July"},
+    {ORTH: "Jun.", LEMMA: "June"},
+    {ORTH: "Kan.", LEMMA: "Kansas"},
+    {ORTH: "Kans.", LEMMA: "Kansas"},
+    {ORTH: "Ky.", LEMMA: "Kentucky"},
+    {ORTH: "La.", LEMMA: "Louisiana"},
+    {ORTH: "Mar.", LEMMA: "March"},
+    {ORTH: "Mass.", LEMMA: "Massachusetts"},
+    {ORTH: "May.", LEMMA: "May"},
+    {ORTH: "Mich.", LEMMA: "Michigan"},
+    {ORTH: "Minn.", LEMMA: "Minnesota"},
+    {ORTH: "Miss.", LEMMA: "Mississippi"},
+    {ORTH: "N.C.", LEMMA: "North Carolina"},
+    {ORTH: "N.D.", LEMMA: "North Dakota"},
+    {ORTH: "N.H.", LEMMA: "New Hampshire"},
+    {ORTH: "N.J.", LEMMA: "New Jersey"},
+    {ORTH: "N.M.", LEMMA: "New Mexico"},
+    {ORTH: "N.Y.", LEMMA: "New York"},
+    {ORTH: "Neb.", LEMMA: "Nebraska"},
+    {ORTH: "Nebr.", LEMMA: "Nebraska"},
+    {ORTH: "Nev.", LEMMA: "Nevada"},
+    {ORTH: "Nov.", LEMMA: "November"},
+    {ORTH: "Oct.", LEMMA: "October"},
+    {ORTH: "Okla.", LEMMA: "Oklahoma"},
+    {ORTH: "Ore.", LEMMA: "Oregon"},
+    {ORTH: "Pa.", LEMMA: "Pennsylvania"},
+    {ORTH: "S.C.", LEMMA: "South Carolina"},
+    {ORTH: "Sep.", LEMMA: "September"},
+    {ORTH: "Sept.", LEMMA: "September"},
+    {ORTH: "Tenn.", LEMMA: "Tennessee"},
+    {ORTH: "Va.", LEMMA: "Virginia"},
+    {ORTH: "Wash.", LEMMA: "Washington"},
+    {ORTH: "Wis.", LEMMA: "Wisconsin"}]:
+    _exc[exc_data[ORTH]] = [dict(exc_data)]


-TOKENIZER_EXCEPTIONS = dict(EXC)
-TOKENIZER_EXCEPTIONS.update(OTHER)
-TOKENIZER_EXCEPTIONS.update(ABBREVIATIONS)
+for orth in [
+    "'d", "a.m.", "Adm.", "Bros.", "co.", "Co.", "Corp.", "D.C.", "Dr.", "e.g.",
+    "E.g.", "E.G.", "Gen.", "Gov.", "i.e.", "I.e.", "I.E.", "Inc.", "Jr.",
+    "Ltd.", "Md.", "Messrs.", "Mo.", "Mont.", "Mr.", "Mrs.", "Ms.", "p.m.",
+    "Ph.D.", "Rep.", "Rev.", "Sen.", "St.", "vs."]:
+    _exc[orth] = [{ORTH: orth}]


-# Remove EXCLUDE_EXC if in exceptions
-
-for string in EXCLUDE_EXC:
-    if string in TOKENIZER_EXCEPTIONS:
-        TOKENIZER_EXCEPTIONS.pop(string)
+for string in _exclude:
+    if string in _exc:
+        _exc.pop(string)


-# Abbreviations with only one ORTH token
-
-ORTH_ONLY = [
-    "'d",
-    "a.m.",
-    "Adm.",
-    "Bros.",
-    "co.",
-    "Co.",
-    "Corp.",
-    "D.C.",
-    "Dr.",
-    "e.g.",
-    "E.g.",
-    "E.G.",
-    "Gen.",
-    "Gov.",
-    "i.e.",
-    "I.e.",
-    "I.E.",
-    "Inc.",
-    "Jr.",
-    "Ltd.",
-    "Md.",
-    "Messrs.",
-    "Mo.",
-    "Mont.",
-    "Mr.",
-    "Mrs.",
-    "Ms.",
-    "p.m.",
-    "Ph.D.",
-    "Rep.",
-    "Rev.",
-    "Sen.",
-    "St.",
-    "vs.",
-]
+
+TOKENIZER_EXCEPTIONS = dict(_exc)