Reorganise English language data

ines 2017-05-08 15:47:25 +02:00
parent 1bf9d5ec8b
commit c7c21b980f
8 changed files with 242 additions and 582 deletions

View File

@@ -1,14 +1,16 @@
# coding: utf8
from __future__ import unicode_literals
from ..language import Language
from ..lemmatizer import Lemmatizer
from ..vocab import Vocab
from ..tokenizer import Tokenizer
from ..attrs import LANG
from ..deprecated import fix_glove_vectors_loading
from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
from .tag_map import TAG_MAP
from .stop_words import STOP_WORDS
from .morph_rules import MORPH_RULES
from .lemmatizer import LEMMA_RULES, LEMMA_INDEX, LEMMA_EXC
from .language_data import *
from ..language_data import BASE_EXCEPTIONS
from ..language import Language
from ..attrs import LANG
from ..util import update_exc
class English(Language):
@@ -18,20 +20,13 @@ class English(Language):
lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
lex_attr_getters[LANG] = lambda text: 'en'
tokenizer_exceptions = TOKENIZER_EXCEPTIONS
tag_map = TAG_MAP
stop_words = STOP_WORDS
tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS)
tag_map = dict(TAG_MAP)
stop_words = set(STOP_WORDS)
morph_rules = dict(MORPH_RULES)
lemma_rules = dict(LEMMA_RULES)
lemma_index = dict(LEMMA_INDEX)
lemma_exc = dict(LEMMA_EXC)
def __init__(self, **overrides):
# Special-case hack for loading the GloVe vectors, to support <1.0
overrides = fix_glove_vectors_loading(overrides)
Language.__init__(self, **overrides)
EXPORT = English
__all__ = ['English']
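
Note: the explicit merge via update_exc replaces the old star import, so the English defaults now state exactly where their exception data comes from. As a rough sketch (assuming, not quoting, the helper imported from ..util), update_exc behaves like:

def update_exc(base_exceptions, *addition_dicts):
    # Start from the shared base table, then layer the language-specific
    # additions on top; later dicts win on key collisions.
    exc = dict(base_exceptions)
    for additions in addition_dicts:
        exc.update(additions)
    return exc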

View File

@@ -1,3 +1,5 @@
from .lookup import LOOKUP
from ._adjectives import ADJECTIVES
from ._adjectives_irreg import ADJECTIVES_IRREG
from ._adverbs import ADVERBS
@@ -9,25 +11,10 @@ from ._verbs_irreg import VERBS_IRREG
from ._lemma_rules import ADJECTIVE_RULES, NOUN_RULES, VERB_RULES, PUNCT_RULES
INDEX = {
"adj": ADJECTIVES,
"adv": ADVERBS,
"noun": NOUNS,
"verb": VERBS
}
LEMMA_INDEX = {'adj': ADJECTIVES, 'adv': ADVERBS, 'noun': NOUNS, 'verb': VERBS}
LEMMA_EXC = {'adj': ADJECTIVES_IRREG, 'adv': ADVERBS_IRREG, 'noun': NOUNS_IRREG,
'verb': VERBS_IRREG}
EXC = {
"adj": ADJECTIVES_IRREG,
"adv": ADVERBS_IRREG,
"noun": NOUNS_IRREG,
"verb": VERBS_IRREG
}
RULES = {
"adj": ADJECTIVE_RULES,
"noun": NOUN_RULES,
"verb": VERB_RULES,
"punct": PUNCT_RULES
}
LEMMA_RULES = {'adj': ADJECTIVE_RULES, 'noun': NOUN_RULES, 'verb': VERB_RULES,
'punct': PUNCT_RULES}
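
Note: keying the index, exceptions and rules by coarse POS lets them be passed straight to the lemmatizer. A hedged usage sketch (the Lemmatizer constructor and call signature are assumed from the imports in spacy/en/__init__.py, not shown in this diff):

from spacy.lemmatizer import Lemmatizer
from spacy.en.lemmatizer import LEMMA_INDEX, LEMMA_EXC, LEMMA_RULES

lemmatizer = Lemmatizer(LEMMA_INDEX, LEMMA_EXC, LEMMA_RULES)
lemmatizer('eating', 'verb')   # expected to yield set(['eat'])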

View File

@@ -1,7 +1,7 @@
# coding: utf8
from __future__ import unicode_literals
LOOK_UP = {
LOOKUP = {
" furtherst": "further",
" skilled": "skill",
"'cause": "because",

spacy/en/lex_attrs.py Normal file
View File

@@ -0,0 +1,23 @@
# coding: utf8
from __future__ import unicode_literals
# Number words
NUM_WORDS = set("""
zero one two three four five six seven eight nine ten eleven twelve thirteen
fourteen fifteen sixteen seventeen eighteen nineteen twenty thirty forty fifty
sixty seventy eighty ninety hundred thousand million billion trillion
quadrillion gajillion bazillion
""".split())
# Ordinal words
ORDINAL_WORDS = set("""
first second third fourth fifth sixth seventh eighth ninth tenth eleventh twelfth
thirteenth fourteenth fifteenth sixteenth seventeenth eighteenth nineteenth
twentieth thirtieth fortieth fiftieth sixtieth seventieth eightieth ninetieth
hundredth thousandth millionth billionth trillionth quadrillionth gajillionth
bazillionth
""".split())

View File

@@ -1,8 +1,8 @@
# coding: utf8
from __future__ import unicode_literals
from ..symbols import *
from ..language_data import PRON_LEMMA
from ..symbols import LEMMA
from ..deprecated import PRON_LEMMA
MORPH_RULES = {
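
Note: MORPH_RULES (truncated in this hunk) pairs a fine-grained tag with per-form attribute overrides; an illustrative entry, not quoted from the file, looks roughly like:

MORPH_RULES = {
    "PRP": {
        # Assumed shape: surface form mapped to lemma plus morphology.
        "I": {LEMMA: PRON_LEMMA, "PronType": "Prs", "Person": "One",
              "Number": "Sing", "Case": "Nom"}
    }
}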

View File

@@ -67,24 +67,3 @@ whither who whoever whole whom whose why will with within without would
yet you your yours yourself yourselves
""".split())
# Number words
NUM_WORDS = set("""
zero one two three four five six seven eight nine ten eleven twelve thirteen
fourteen fifteen sixteen seventeen eighteen nineteen twenty thirty forty fifty
sixty seventy eighty ninety hundred thousand million billion trillion
quadrillion gajillion bazillion
""".split())
# Ordinal words
ORDINAL_WORDS = set("""
first second third fourth fifth sixth seventh eighth ninth tenth eleventh twelfth
thirteenth fourteenth fifteenth sixteenth seventeenth eighteenth nineteenth
twentieth thirtieth fortieth fiftieth sixtieth seventieth eightieth ninetieth
hundredth thousandth millionth billionth trillionth quadrillionth gajillionth
bazillionth
""".split())

View File

@@ -1,7 +1,8 @@
# coding: utf8
from __future__ import unicode_literals
from ..symbols import *
from ..symbols import POS, PUNCT, SYM, ADJ, CCONJ, NUM, DET, ADV, ADP, X, VERB
from ..symbols import NOUN, PROPN, PART, INTJ, SPACE, PRON
TAG_MAP = {
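
Note: TAG_MAP (truncated in this hunk) maps fine-grained Penn Treebank tags to the coarse symbols now imported explicitly, along the lines of these illustrative entries:

TAG_MAP = {
    "NN": {POS: NOUN, "Number": "sing"},
    "VBZ": {POS: VERB, "VerbForm": "fin", "Tense": "pres"},
    ".": {POS: PUNCT, "PunctType": "peri"}
}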

View File

@@ -1,13 +1,12 @@
# coding: utf8
from __future__ import unicode_literals
from ..symbols import *
from ..language_data import PRON_LEMMA
from ..symbols import ORTH, LEMMA, TAG, NORM
from ..deprecated import PRON_LEMMA
EXC = {}
EXCLUDE_EXC = ["Ill", "ill", "Its", "its", "Hell", "hell", "Shell", "shell",
_exc = {}
_exclude = ["Ill", "ill", "Its", "its", "Hell", "hell", "Shell", "shell",
"Shed", "shed", "were", "Were", "Well", "well", "Whore", "whore"]
@@ -15,193 +14,160 @@ EXCLUDE_EXC = ["Ill", "ill", "Its", "its", "Hell", "hell", "Shell", "shell",
for pron in ["i"]:
for orth in [pron, pron.title()]:
EXC[orth + "'m"] = [
_exc[orth + "'m"] = [
{ORTH: orth, LEMMA: PRON_LEMMA, TAG: "PRP"},
{ORTH: "'m", LEMMA: "be", TAG: "VBP", "tenspect": 1, "number": 1}
]
{ORTH: "'m", LEMMA: "be", TAG: "VBP", "tenspect": 1, "number": 1}]
EXC[orth + "m"] = [
_exc[orth + "m"] = [
{ORTH: orth, LEMMA: PRON_LEMMA, TAG: "PRP"},
{ORTH: "m", LEMMA: "be", TAG: "VBP", "tenspect": 1, "number": 1 }
]
{ORTH: "m", LEMMA: "be", TAG: "VBP", "tenspect": 1, "number": 1 }]
EXC[orth + "'ma"] = [
_exc[orth + "'ma"] = [
{ORTH: orth, LEMMA: PRON_LEMMA, TAG: "PRP"},
{ORTH: "'m", LEMMA: "be", NORM: "am"},
{ORTH: "a", LEMMA: "going to", NORM: "gonna"}
]
{ORTH: "a", LEMMA: "going to", NORM: "gonna"}]
EXC[orth + "ma"] = [
_exc[orth + "ma"] = [
{ORTH: orth, LEMMA: PRON_LEMMA, TAG: "PRP"},
{ORTH: "m", LEMMA: "be", NORM: "am"},
{ORTH: "a", LEMMA: "going to", NORM: "gonna"}
]
{ORTH: "a", LEMMA: "going to", NORM: "gonna"}]
for pron in ["i", "you", "he", "she", "it", "we", "they"]:
for orth in [pron, pron.title()]:
EXC[orth + "'ll"] = [
_exc[orth + "'ll"] = [
{ORTH: orth, LEMMA: PRON_LEMMA, TAG: "PRP"},
{ORTH: "'ll", LEMMA: "will", TAG: "MD"}
]
{ORTH: "'ll", LEMMA: "will", TAG: "MD"}]
EXC[orth + "ll"] = [
_exc[orth + "ll"] = [
{ORTH: orth, LEMMA: PRON_LEMMA, TAG: "PRP"},
{ORTH: "ll", LEMMA: "will", TAG: "MD"}
]
{ORTH: "ll", LEMMA: "will", TAG: "MD"}]
EXC[orth + "'ll've"] = [
_exc[orth + "'ll've"] = [
{ORTH: orth, LEMMA: PRON_LEMMA, TAG: "PRP"},
{ORTH: "'ll", LEMMA: "will", TAG: "MD"},
{ORTH: "'ve", LEMMA: "have", TAG: "VB"}
]
{ORTH: "'ve", LEMMA: "have", TAG: "VB"}]
EXC[orth + "llve"] = [
_exc[orth + "llve"] = [
{ORTH: orth, LEMMA: PRON_LEMMA, TAG: "PRP"},
{ORTH: "ll", LEMMA: "will", TAG: "MD"},
{ORTH: "ve", LEMMA: "have", TAG: "VB"}
]
{ORTH: "ve", LEMMA: "have", TAG: "VB"}]
EXC[orth + "'d"] = [
_exc[orth + "'d"] = [
{ORTH: orth, LEMMA: PRON_LEMMA, TAG: "PRP"},
{ORTH: "'d", LEMMA: "would", TAG: "MD"}
]
{ORTH: "'d", LEMMA: "would", TAG: "MD"}]
EXC[orth + "d"] = [
_exc[orth + "d"] = [
{ORTH: orth, LEMMA: PRON_LEMMA, TAG: "PRP"},
{ORTH: "d", LEMMA: "would", TAG: "MD"}
]
{ORTH: "d", LEMMA: "would", TAG: "MD"}]
EXC[orth + "'d've"] = [
_exc[orth + "'d've"] = [
{ORTH: orth, LEMMA: PRON_LEMMA, TAG: "PRP"},
{ORTH: "'d", LEMMA: "would", TAG: "MD"},
{ORTH: "'ve", LEMMA: "have", TAG: "VB"}
]
{ORTH: "'ve", LEMMA: "have", TAG: "VB"}]
EXC[orth + "dve"] = [
_exc[orth + "dve"] = [
{ORTH: orth, LEMMA: PRON_LEMMA, TAG: "PRP"},
{ORTH: "d", LEMMA: "would", TAG: "MD"},
{ORTH: "ve", LEMMA: "have", TAG: "VB"}
]
{ORTH: "ve", LEMMA: "have", TAG: "VB"}]
for pron in ["i", "you", "we", "they"]:
for orth in [pron, pron.title()]:
EXC[orth + "'ve"] = [
_exc[orth + "'ve"] = [
{ORTH: orth, LEMMA: PRON_LEMMA, TAG: "PRP"},
{ORTH: "'ve", LEMMA: "have", TAG: "VB"}
]
{ORTH: "'ve", LEMMA: "have", TAG: "VB"}]
EXC[orth + "ve"] = [
_exc[orth + "ve"] = [
{ORTH: orth, LEMMA: PRON_LEMMA, TAG: "PRP"},
{ORTH: "ve", LEMMA: "have", TAG: "VB"}
]
{ORTH: "ve", LEMMA: "have", TAG: "VB"}]
for pron in ["you", "we", "they"]:
for orth in [pron, pron.title()]:
EXC[orth + "'re"] = [
_exc[orth + "'re"] = [
{ORTH: orth, LEMMA: PRON_LEMMA, TAG: "PRP"},
{ORTH: "'re", LEMMA: "be", NORM: "are"}
]
{ORTH: "'re", LEMMA: "be", NORM: "are"}]
EXC[orth + "re"] = [
_exc[orth + "re"] = [
{ORTH: orth, LEMMA: PRON_LEMMA, TAG: "PRP"},
{ORTH: "re", LEMMA: "be", NORM: "are", TAG: "VBZ"}
]
{ORTH: "re", LEMMA: "be", NORM: "are", TAG: "VBZ"}]
for pron in ["he", "she", "it"]:
for orth in [pron, pron.title()]:
EXC[orth + "'s"] = [
_exc[orth + "'s"] = [
{ORTH: orth, LEMMA: PRON_LEMMA, TAG: "PRP"},
{ORTH: "'s"}
]
{ORTH: "'s"}]
EXC[orth + "s"] = [
_exc[orth + "s"] = [
{ORTH: orth, LEMMA: PRON_LEMMA, TAG: "PRP"},
{ORTH: "s"}
]
{ORTH: "s"}]
# W-words, relative pronouns, prepositions etc.
for word in ["who", "what", "when", "where", "why", "how", "there", "that"]:
for orth in [word, word.title()]:
EXC[orth + "'s"] = [
_exc[orth + "'s"] = [
{ORTH: orth, LEMMA: word},
{ORTH: "'s"}
]
{ORTH: "'s"}]
EXC[orth + "s"] = [
_exc[orth + "s"] = [
{ORTH: orth, LEMMA: word},
{ORTH: "s"}
]
{ORTH: "s"}]
EXC[orth + "'ll"] = [
_exc[orth + "'ll"] = [
{ORTH: orth, LEMMA: word},
{ORTH: "'ll", LEMMA: "will", TAG: "MD"}
]
{ORTH: "'ll", LEMMA: "will", TAG: "MD"}]
EXC[orth + "ll"] = [
_exc[orth + "ll"] = [
{ORTH: orth, LEMMA: word},
{ORTH: "ll", LEMMA: "will", TAG: "MD"}
]
{ORTH: "ll", LEMMA: "will", TAG: "MD"}]
EXC[orth + "'ll've"] = [
_exc[orth + "'ll've"] = [
{ORTH: orth, LEMMA: word},
{ORTH: "'ll", LEMMA: "will", TAG: "MD"},
{ORTH: "'ve", LEMMA: "have", TAG: "VB"}
]
{ORTH: "'ve", LEMMA: "have", TAG: "VB"}]
EXC[orth + "llve"] = [
_exc[orth + "llve"] = [
{ORTH: orth, LEMMA: word},
{ORTH: "ll", LEMMA: "will", TAG: "MD"},
{ORTH: "ve", LEMMA: "have", TAG: "VB"}
]
{ORTH: "ve", LEMMA: "have", TAG: "VB"}]
EXC[orth + "'re"] = [
_exc[orth + "'re"] = [
{ORTH: orth, LEMMA: word},
{ORTH: "'re", LEMMA: "be", NORM: "are"}
]
{ORTH: "'re", LEMMA: "be", NORM: "are"}]
EXC[orth + "re"] = [
_exc[orth + "re"] = [
{ORTH: orth, LEMMA: word},
{ORTH: "re", LEMMA: "be", NORM: "are"}
]
{ORTH: "re", LEMMA: "be", NORM: "are"}]
EXC[orth + "'ve"] = [
_exc[orth + "'ve"] = [
{ORTH: orth},
{ORTH: "'ve", LEMMA: "have", TAG: "VB"}
]
{ORTH: "'ve", LEMMA: "have", TAG: "VB"}]
EXC[orth + "ve"] = [
_exc[orth + "ve"] = [
{ORTH: orth, LEMMA: word},
{ORTH: "'ve", LEMMA: "have", TAG: "VB"}
]
{ORTH: "ve", LEMMA: "have", TAG: "VB"}]
EXC[orth + "'d"] = [
_exc[orth + "'d"] = [
{ORTH: orth, LEMMA: word},
{ORTH: "'d"}
]
{ORTH: "'d"}]
EXC[orth + "d"] = [
_exc[orth + "d"] = [
{ORTH: orth, LEMMA: word},
{ORTH: "d"}
]
{ORTH: "d"}]
EXC[orth + "'d've"] = [
_exc[orth + "'d've"] = [
{ORTH: orth, LEMMA: word},
{ORTH: "'d", LEMMA: "would", TAG: "MD"},
{ORTH: "'ve", LEMMA: "have", TAG: "VB"}
]
{ORTH: "'ve", LEMMA: "have", TAG: "VB"}]
EXC[orth + "dve"] = [
_exc[orth + "dve"] = [
{ORTH: orth, LEMMA: word},
{ORTH: "d", LEMMA: "would", TAG: "MD"},
{ORTH: "ve", LEMMA: "have", TAG: "VB"}
]
{ORTH: "ve", LEMMA: "have", TAG: "VB"}]
# Verbs
@@ -221,54 +187,44 @@ for verb_data in [
{ORTH: "sha", LEMMA: "shall", TAG: "MD"},
{ORTH: "should", TAG: "MD"},
{ORTH: "wo", LEMMA: "will", TAG: "MD"},
{ORTH: "would", TAG: "MD"}
]:
{ORTH: "would", TAG: "MD"}]:
verb_data_tc = dict(verb_data)
verb_data_tc[ORTH] = verb_data_tc[ORTH].title()
for data in [verb_data, verb_data_tc]:
EXC[data[ORTH] + "n't"] = [
_exc[data[ORTH] + "n't"] = [
dict(data),
{ORTH: "n't", LEMMA: "not", TAG: "RB"}
]
{ORTH: "n't", LEMMA: "not", TAG: "RB"}]
EXC[data[ORTH] + "nt"] = [
_exc[data[ORTH] + "nt"] = [
dict(data),
{ORTH: "nt", LEMMA: "not", TAG: "RB"}
]
{ORTH: "nt", LEMMA: "not", TAG: "RB"}]
EXC[data[ORTH] + "n't've"] = [
_exc[data[ORTH] + "n't've"] = [
dict(data),
{ORTH: "n't", LEMMA: "not", TAG: "RB"},
{ORTH: "'ve", LEMMA: "have", TAG: "VB"}
]
{ORTH: "'ve", LEMMA: "have", TAG: "VB"}]
EXC[data[ORTH] + "ntve"] = [
_exc[data[ORTH] + "ntve"] = [
dict(data),
{ORTH: "nt", LEMMA: "not", TAG: "RB"},
{ORTH: "ve", LEMMA: "have", TAG: "VB"}
]
{ORTH: "ve", LEMMA: "have", TAG: "VB"}]
for verb_data in [
{ORTH: "could", TAG: "MD"},
{ORTH: "might"},
{ORTH: "must"},
{ORTH: "should"}
]:
{ORTH: "should"}]:
verb_data_tc = dict(verb_data)
verb_data_tc[ORTH] = verb_data_tc[ORTH].title()
for data in [verb_data, verb_data_tc]:
EXC[data[ORTH] + "'ve"] = [
_exc[data[ORTH] + "'ve"] = [
dict(data),
{ORTH: "'ve", LEMMA: "have", TAG: "VB"}
]
{ORTH: "'ve", LEMMA: "have", TAG: "VB"}]
EXC[data[ORTH] + "ve"] = [
_exc[data[ORTH] + "ve"] = [
dict(data),
{ORTH: "ve", LEMMA: "have", TAG: "VB"}
]
{ORTH: "ve", LEMMA: "have", TAG: "VB"}]
for verb_data in [
@@ -276,22 +232,17 @@ for verb_data in [
{ORTH: "are", LEMMA: "be", TAG: "VBP", "number": 2},
{ORTH: "is", LEMMA: "be", TAG: "VBZ"},
{ORTH: "was", LEMMA: "be"},
{ORTH: "were", LEMMA: "be"}
]:
{ORTH: "were", LEMMA: "be"}]:
verb_data_tc = dict(verb_data)
verb_data_tc[ORTH] = verb_data_tc[ORTH].title()
for data in [verb_data, verb_data_tc]:
EXC[data[ORTH] + "n't"] = [
_exc[data[ORTH] + "n't"] = [
dict(data),
{ORTH: "n't", LEMMA: "not", TAG: "RB"}
]
{ORTH: "n't", LEMMA: "not", TAG: "RB"}]
EXC[data[ORTH] + "nt"] = [
_exc[data[ORTH] + "nt"] = [
dict(data),
{ORTH: "nt", LEMMA: "not", TAG: "RB"}
]
{ORTH: "nt", LEMMA: "not", TAG: "RB"}]
# Other contractions with trailing apostrophe
@@ -302,22 +253,14 @@ for exc_data in [
{ORTH: "nothin", LEMMA: "nothing"},
{ORTH: "nuthin", LEMMA: "nothing"},
{ORTH: "ol", LEMMA: "old"},
{ORTH: "somethin", LEMMA: "something"}
]:
{ORTH: "somethin", LEMMA: "something"}]:
exc_data_tc = dict(exc_data)
exc_data_tc[ORTH] = exc_data_tc[ORTH].title()
for data in [exc_data, exc_data_tc]:
data_apos = dict(data)
data_apos[ORTH] = data_apos[ORTH] + "'"
EXC[data[ORTH]] = [
dict(data)
]
EXC[data_apos[ORTH]] = [
dict(data_apos)
]
_exc[data[ORTH]] = [dict(data)]
_exc[data_apos[ORTH]] = [dict(data_apos)]
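
Note: both the bare and the apostrophised variant are stored, e.g.:

_exc["ol"] == [{ORTH: "ol", LEMMA: "old"}]
_exc["ol'"] == [{ORTH: "ol'", LEMMA: "old"}]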
# Other contractions with leading apostrophe
@@ -326,449 +269,181 @@ for exc_data in [
{ORTH: "cause", LEMMA: "because"},
{ORTH: "em", LEMMA: PRON_LEMMA, NORM: "them"},
{ORTH: "ll", LEMMA: "will"},
{ORTH: "nuff", LEMMA: "enough"}
]:
{ORTH: "nuff", LEMMA: "enough"}]:
exc_data_apos = dict(exc_data)
exc_data_apos[ORTH] = "'" + exc_data_apos[ORTH]
for data in [exc_data, exc_data_apos]:
EXC[data[ORTH]] = [
dict(data)
]
_exc[data[ORTH]] = [dict(data)]
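
Note: as above, each entry is stored with and without its leading apostrophe, e.g.:

_exc["em"] == [{ORTH: "em", LEMMA: PRON_LEMMA, NORM: "them"}]
_exc["'em"] == [{ORTH: "'em", LEMMA: PRON_LEMMA, NORM: "them"}]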
# Times
for h in range(1, 12 + 1):
hour = str(h)
for period in ["a.m.", "am"]:
EXC[hour + period] = [
_exc[hour+period] = [
{ORTH: hour},
{ORTH: period, LEMMA: "a.m."}
]
{ORTH: period, LEMMA: "a.m."}]
for period in ["p.m.", "pm"]:
EXC[hour + period] = [
_exc[hour+period] = [
{ORTH: hour},
{ORTH: period, LEMMA: "p.m."}
]
{ORTH: period, LEMMA: "p.m."}]
# Rest
OTHER = {
" ": [
{ORTH: " ", TAG: "SP"}
],
"\u00a0": [
{ORTH: "\u00a0", TAG: "SP", LEMMA: " "}
],
"'S": [
{ORTH: "'S", LEMMA: "'s"}
],
"'s": [
{ORTH: "'s", LEMMA: "'s"}
],
"'re": [
{ORTH: "'re", LEMMA: "be", NORM: "are"}
],
"\u2018S": [
{ORTH: "\u2018S", LEMMA: "'s"}
],
"\u2018s": [
{ORTH: "\u2018s", LEMMA: "'s"}
],
"and/or": [
{ORTH: "and/or", LEMMA: "and/or", TAG: "CC"}
],
"'Cause": [
{ORTH: "'Cause", LEMMA: "because"}
],
_other_exc = {
"y'all": [
{ORTH: "y'", LEMMA: PRON_LEMMA, NORM: "you"},
{ORTH: "all"}
],
{ORTH: "all"}],
"yall": [
{ORTH: "y", LEMMA: PRON_LEMMA, NORM: "you"},
{ORTH: "all"}
],
"ma'am": [
{ORTH: "ma'am", LEMMA: "madam"}
],
"Ma'am": [
{ORTH: "Ma'am", LEMMA: "madam"}
],
"o'clock": [
{ORTH: "o'clock", LEMMA: "o'clock"}
],
"O'clock": [
{ORTH: "O'clock", LEMMA: "o'clock"}
],
{ORTH: "all"}],
"how'd'y": [
{ORTH: "how", LEMMA: "how"},
{ORTH: "'d", LEMMA: "do"},
{ORTH: "'y", LEMMA: PRON_LEMMA, NORM: "you"}
],
{ORTH: "'y", LEMMA: PRON_LEMMA, NORM: "you"}],
"How'd'y": [
{ORTH: "How", LEMMA: "how"},
{ORTH: "'d", LEMMA: "do"},
{ORTH: "'y", LEMMA: PRON_LEMMA, NORM: "you"}
],
{ORTH: "'y", LEMMA: PRON_LEMMA, NORM: "you"}],
"not've": [
{ORTH: "not", LEMMA: "not", TAG: "RB"},
{ORTH: "'ve", LEMMA: "have", TAG: "VB"}
],
{ORTH: "'ve", LEMMA: "have", TAG: "VB"}],
"notve": [
{ORTH: "not", LEMMA: "not", TAG: "RB"},
{ORTH: "ve", LEMMA: "have", TAG: "VB"}
],
{ORTH: "ve", LEMMA: "have", TAG: "VB"}],
"Not've": [
{ORTH: "Not", LEMMA: "not", TAG: "RB"},
{ORTH: "'ve", LEMMA: "have", TAG: "VB"}
],
{ORTH: "'ve", LEMMA: "have", TAG: "VB"}],
"Notve": [
{ORTH: "Not", LEMMA: "not", TAG: "RB"},
{ORTH: "ve", LEMMA: "have", TAG: "VB"}
],
{ORTH: "ve", LEMMA: "have", TAG: "VB"}],
"cannot": [
{ORTH: "can", LEMMA: "can", TAG: "MD"},
{ORTH: "not", LEMMA: "not", TAG: "RB"}
],
{ORTH: "not", LEMMA: "not", TAG: "RB"}],
"Cannot": [
{ORTH: "Can", LEMMA: "can", TAG: "MD"},
{ORTH: "not", LEMMA: "not", TAG: "RB"}
],
{ORTH: "not", LEMMA: "not", TAG: "RB"}],
"gonna": [
{ORTH: "gon", LEMMA: "go", NORM: "going"},
{ORTH: "na", LEMMA: "to"}
],
{ORTH: "na", LEMMA: "to"}],
"Gonna": [
{ORTH: "Gon", LEMMA: "go", NORM: "going"},
{ORTH: "na", LEMMA: "to"}
],
{ORTH: "na", LEMMA: "to"}],
"gotta": [
{ORTH: "got"},
{ORTH: "ta", LEMMA: "to"}
],
{ORTH: "ta", LEMMA: "to"}],
"Gotta": [
{ORTH: "Got"},
{ORTH: "ta", LEMMA: "to"}
],
{ORTH: "ta", LEMMA: "to"}],
"let's": [
{ORTH: "let"},
{ORTH: "'s", LEMMA: PRON_LEMMA, NORM: "us"}
],
{ORTH: "'s", LEMMA: PRON_LEMMA, NORM: "us"}],
"Let's": [
{ORTH: "Let", LEMMA: "let"},
{ORTH: "'s", LEMMA: PRON_LEMMA, NORM: "us"}
],
"\u2014": [
{ORTH: "\u2014", TAG: ":", LEMMA: "--"}
],
"\n": [
{ORTH: "\n", TAG: "SP"}
],
"\t": [
{ORTH: "\t", TAG: "SP"}
]
{ORTH: "'s", LEMMA: PRON_LEMMA, NORM: "us"}]
}
# Abbreviations
ABBREVIATIONS = {
"Mt.": [
{ORTH: "Mt.", LEMMA: "Mount"}
],
"Ak.": [
{ORTH: "Ak.", LEMMA: "Alaska"}
],
"Ala.": [
{ORTH: "Ala.", LEMMA: "Alabama"}
],
"Apr.": [
{ORTH: "Apr.", LEMMA: "April"}
],
"Ariz.": [
{ORTH: "Ariz.", LEMMA: "Arizona"}
],
"Ark.": [
{ORTH: "Ark.", LEMMA: "Arkansas"}
],
"Aug.": [
{ORTH: "Aug.", LEMMA: "August"}
],
"Calif.": [
{ORTH: "Calif.", LEMMA: "California"}
],
"Colo.": [
{ORTH: "Colo.", LEMMA: "Colorado"}
],
"Conn.": [
{ORTH: "Conn.", LEMMA: "Connecticut"}
],
"Dec.": [
{ORTH: "Dec.", LEMMA: "December"}
],
"Del.": [
{ORTH: "Del.", LEMMA: "Delaware"}
],
"Feb.": [
{ORTH: "Feb.", LEMMA: "February"}
],
"Fla.": [
{ORTH: "Fla.", LEMMA: "Florida"}
],
"Ga.": [
{ORTH: "Ga.", LEMMA: "Georgia"}
],
"Ia.": [
{ORTH: "Ia.", LEMMA: "Iowa"}
],
"Id.": [
{ORTH: "Id.", LEMMA: "Idaho"}
],
"Ill.": [
{ORTH: "Ill.", LEMMA: "Illinois"}
],
"Ind.": [
{ORTH: "Ind.", LEMMA: "Indiana"}
],
"Jan.": [
{ORTH: "Jan.", LEMMA: "January"}
],
"Jul.": [
{ORTH: "Jul.", LEMMA: "July"}
],
"Jun.": [
{ORTH: "Jun.", LEMMA: "June"}
],
"Kan.": [
{ORTH: "Kan.", LEMMA: "Kansas"}
],
"Kans.": [
{ORTH: "Kans.", LEMMA: "Kansas"}
],
"Ky.": [
{ORTH: "Ky.", LEMMA: "Kentucky"}
],
"La.": [
{ORTH: "La.", LEMMA: "Louisiana"}
],
"Mar.": [
{ORTH: "Mar.", LEMMA: "March"}
],
"Mass.": [
{ORTH: "Mass.", LEMMA: "Massachusetts"}
],
"May.": [
{ORTH: "May.", LEMMA: "May"}
],
"Mich.": [
{ORTH: "Mich.", LEMMA: "Michigan"}
],
"Minn.": [
{ORTH: "Minn.", LEMMA: "Minnesota"}
],
"Miss.": [
{ORTH: "Miss.", LEMMA: "Mississippi"}
],
"N.C.": [
{ORTH: "N.C.", LEMMA: "North Carolina"}
],
"N.D.": [
{ORTH: "N.D.", LEMMA: "North Dakota"}
],
"N.H.": [
{ORTH: "N.H.", LEMMA: "New Hampshire"}
],
"N.J.": [
{ORTH: "N.J.", LEMMA: "New Jersey"}
],
"N.M.": [
{ORTH: "N.M.", LEMMA: "New Mexico"}
],
"N.Y.": [
{ORTH: "N.Y.", LEMMA: "New York"}
],
"Neb.": [
{ORTH: "Neb.", LEMMA: "Nebraska"}
],
"Nebr.": [
{ORTH: "Nebr.", LEMMA: "Nebraska"}
],
"Nev.": [
{ORTH: "Nev.", LEMMA: "Nevada"}
],
"Nov.": [
{ORTH: "Nov.", LEMMA: "November"}
],
"Oct.": [
{ORTH: "Oct.", LEMMA: "October"}
],
"Okla.": [
{ORTH: "Okla.", LEMMA: "Oklahoma"}
],
"Ore.": [
{ORTH: "Ore.", LEMMA: "Oregon"}
],
"Pa.": [
{ORTH: "Pa.", LEMMA: "Pennsylvania"}
],
"S.C.": [
{ORTH: "S.C.", LEMMA: "South Carolina"}
],
"Sep.": [
{ORTH: "Sep.", LEMMA: "September"}
],
"Sept.": [
{ORTH: "Sept.", LEMMA: "September"}
],
"Tenn.": [
{ORTH: "Tenn.", LEMMA: "Tennessee"}
],
"Va.": [
{ORTH: "Va.", LEMMA: "Virginia"}
],
"Wash.": [
{ORTH: "Wash.", LEMMA: "Washington"}
],
"Wis.": [
{ORTH: "Wis.", LEMMA: "Wisconsin"}
]
}
_exc.update(_other_exc)
TOKENIZER_EXCEPTIONS = dict(EXC)
TOKENIZER_EXCEPTIONS.update(OTHER)
TOKENIZER_EXCEPTIONS.update(ABBREVIATIONS)
for exc_data in [
{ORTH: "'S", LEMMA: "'s"},
{ORTH: "'s", LEMMA: "'s"},
{ORTH: "\u2018S", LEMMA: "'s"},
{ORTH: "\u2018s", LEMMA: "'s"},
{ORTH: "and/or", LEMMA: "and/or", TAG: "CC"},
{ORTH: "'re", LEMMA: "be", NORM: "are"},
{ORTH: "'Cause", LEMMA: "because"},
{ORTH: "'cause", LEMMA: "because"},
{ORTH: "ma'am", LEMMA: "madam"},
{ORTH: "Ma'am", LEMMA: "madam"},
{ORTH: "o'clock", LEMMA: "o'clock"},
{ORTH: "O'clock", LEMMA: "o'clock"},
{ORTH: "Mt.", LEMMA: "Mount"},
{ORTH: "Ak.", LEMMA: "Alaska"},
{ORTH: "Ala.", LEMMA: "Alabama"},
{ORTH: "Apr.", LEMMA: "April"},
{ORTH: "Ariz.", LEMMA: "Arizona"},
{ORTH: "Ark.", LEMMA: "Arkansas"},
{ORTH: "Aug.", LEMMA: "August"},
{ORTH: "Calif.", LEMMA: "California"},
{ORTH: "Colo.", LEMMA: "Colorado"},
{ORTH: "Conn.", LEMMA: "Connecticut"},
{ORTH: "Dec.", LEMMA: "December"},
{ORTH: "Del.", LEMMA: "Delaware"},
{ORTH: "Feb.", LEMMA: "February"},
{ORTH: "Fla.", LEMMA: "Florida"},
{ORTH: "Ga.", LEMMA: "Georgia"},
{ORTH: "Ia.", LEMMA: "Iowa"},
{ORTH: "Id.", LEMMA: "Idaho"},
{ORTH: "Ill.", LEMMA: "Illinois"},
{ORTH: "Ind.", LEMMA: "Indiana"},
{ORTH: "Jan.", LEMMA: "January"},
{ORTH: "Jul.", LEMMA: "July"},
{ORTH: "Jun.", LEMMA: "June"},
{ORTH: "Kan.", LEMMA: "Kansas"},
{ORTH: "Kans.", LEMMA: "Kansas"},
{ORTH: "Ky.", LEMMA: "Kentucky"},
{ORTH: "La.", LEMMA: "Louisiana"},
{ORTH: "Mar.", LEMMA: "March"},
{ORTH: "Mass.", LEMMA: "Massachusetts"},
{ORTH: "May.", LEMMA: "May"},
{ORTH: "Mich.", LEMMA: "Michigan"},
{ORTH: "Minn.", LEMMA: "Minnesota"},
{ORTH: "Miss.", LEMMA: "Mississippi"},
{ORTH: "N.C.", LEMMA: "North Carolina"},
{ORTH: "N.D.", LEMMA: "North Dakota"},
{ORTH: "N.H.", LEMMA: "New Hampshire"},
{ORTH: "N.J.", LEMMA: "New Jersey"},
{ORTH: "N.M.", LEMMA: "New Mexico"},
{ORTH: "N.Y.", LEMMA: "New York"},
{ORTH: "Neb.", LEMMA: "Nebraska"},
{ORTH: "Nebr.", LEMMA: "Nebraska"},
{ORTH: "Nev.", LEMMA: "Nevada"},
{ORTH: "Nov.", LEMMA: "November"},
{ORTH: "Oct.", LEMMA: "October"},
{ORTH: "Okla.", LEMMA: "Oklahoma"},
{ORTH: "Ore.", LEMMA: "Oregon"},
{ORTH: "Pa.", LEMMA: "Pennsylvania"},
{ORTH: "S.C.", LEMMA: "South Carolina"},
{ORTH: "Sep.", LEMMA: "September"},
{ORTH: "Sept.", LEMMA: "September"},
{ORTH: "Tenn.", LEMMA: "Tennessee"},
{ORTH: "Va.", LEMMA: "Virginia"},
{ORTH: "Wash.", LEMMA: "Washington"},
{ORTH: "Wis.", LEMMA: "Wisconsin"}]:
_exc[exc_data[ORTH]] = [dict(exc_data)]
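
Note: this flattens each one-off exception into a single-token analysis, e.g.:

_exc["Calif."] == [{ORTH: "Calif.", LEMMA: "California"}]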
# Remove EXCLUDE_EXC if in exceptions
for string in EXCLUDE_EXC:
if string in TOKENIZER_EXCEPTIONS:
TOKENIZER_EXCEPTIONS.pop(string)
for orth in [
"'d", "a.m.", "Adm.", "Bros.", "co.", "Co.", "Corp.", "D.C.", "Dr.", "e.g.",
"E.g.", "E.G.", "Gen.", "Gov.", "i.e.", "I.e.", "I.E.", "Inc.", "Jr.",
"Ltd.", "Md.", "Messrs.", "Mo.", "Mont.", "Mr.", "Mrs.", "Ms.", "p.m.",
"Ph.D.", "Rep.", "Rev.", "Sen.", "St.", "vs."]:
_exc[orth] = [{ORTH: orth}]
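
Note: these orth-only exceptions keep the string together as one token without asserting a lemma:

_exc["e.g."] == [{ORTH: "e.g."}]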
# Abbreviations with only one ORTH token
for string in _exclude:
if string in _exc:
_exc.pop(string)
ORTH_ONLY = [
"'d",
"a.m.",
"Adm.",
"Bros.",
"co.",
"Co.",
"Corp.",
"D.C.",
"Dr.",
"e.g.",
"E.g.",
"E.G.",
"Gen.",
"Gov.",
"i.e.",
"I.e.",
"I.E.",
"Inc.",
"Jr.",
"Ltd.",
"Md.",
"Messrs.",
"Mo.",
"Mont.",
"Mr.",
"Mrs.",
"Ms.",
"p.m.",
"Ph.D.",
"Rep.",
"Rev.",
"Sen.",
"St.",
"vs.",
]
TOKENIZER_EXCEPTIONS = dict(_exc)
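
Note: TOKENIZER_EXCEPTIONS is the dict that spacy/en/__init__.py above merges with BASE_EXCEPTIONS via update_exc. A quick sanity check (a sketch, assuming the module paths shown in this commit):

from spacy.en.tokenizer_exceptions import TOKENIZER_EXCEPTIONS
assert "Ill" not in TOKENIZER_EXCEPTIONS   # real words stay excluded
assert "won't" in TOKENIZER_EXCEPTIONS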