mirror of
https://github.com/explosion/spaCy.git
synced 2024-12-25 17:36:30 +03:00
Reorganise English language data
This commit is contained in:
parent
1bf9d5ec8b
commit
c7c21b980f
|
@ -1,14 +1,16 @@
|
||||||
# coding: utf8
|
# coding: utf8
|
||||||
from __future__ import unicode_literals
|
from __future__ import unicode_literals
|
||||||
|
|
||||||
from ..language import Language
|
from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
|
||||||
from ..lemmatizer import Lemmatizer
|
from .tag_map import TAG_MAP
|
||||||
from ..vocab import Vocab
|
from .stop_words import STOP_WORDS
|
||||||
from ..tokenizer import Tokenizer
|
from .morph_rules import MORPH_RULES
|
||||||
from ..attrs import LANG
|
from .lemmatizer import LEMMA_RULES, LEMMA_INDEX, LEMMA_EXC
|
||||||
from ..deprecated import fix_glove_vectors_loading
|
|
||||||
|
|
||||||
from .language_data import *
|
from ..language_data import BASE_EXCEPTIONS
|
||||||
|
from ..language import Language
|
||||||
|
from ..attrs import LANG
|
||||||
|
from ..util import update_exc
|
||||||
|
|
||||||
|
|
||||||
class English(Language):
|
class English(Language):
|
||||||
|
@ -18,20 +20,13 @@ class English(Language):
|
||||||
lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
|
lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
|
||||||
lex_attr_getters[LANG] = lambda text: 'en'
|
lex_attr_getters[LANG] = lambda text: 'en'
|
||||||
|
|
||||||
tokenizer_exceptions = TOKENIZER_EXCEPTIONS
|
tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS)
|
||||||
tag_map = TAG_MAP
|
tag_map = dict(TAG_MAP)
|
||||||
stop_words = STOP_WORDS
|
stop_words = set(STOP_WORDS)
|
||||||
|
|
||||||
morph_rules = dict(MORPH_RULES)
|
morph_rules = dict(MORPH_RULES)
|
||||||
lemma_rules = dict(LEMMA_RULES)
|
lemma_rules = dict(LEMMA_RULES)
|
||||||
lemma_index = dict(LEMMA_INDEX)
|
lemma_index = dict(LEMMA_INDEX)
|
||||||
lemma_exc = dict(LEMMA_EXC)
|
lemma_exc = dict(LEMMA_EXC)
|
||||||
|
|
||||||
|
|
||||||
def __init__(self, **overrides):
|
__all__ = ['English']
|
||||||
# Special-case hack for loading the GloVe vectors, to support <1.0
|
|
||||||
overrides = fix_glove_vectors_loading(overrides)
|
|
||||||
Language.__init__(self, **overrides)
|
|
||||||
|
|
||||||
|
|
||||||
EXPORT = English
|
|
||||||
|
|
|
@ -1,3 +1,5 @@
|
||||||
|
|
||||||
|
from .lookup import LOOKUP
|
||||||
from ._adjectives import ADJECTIVES
|
from ._adjectives import ADJECTIVES
|
||||||
from ._adjectives_irreg import ADJECTIVES_IRREG
|
from ._adjectives_irreg import ADJECTIVES_IRREG
|
||||||
from ._adverbs import ADVERBS
|
from ._adverbs import ADVERBS
|
||||||
|
@ -9,25 +11,10 @@ from ._verbs_irreg import VERBS_IRREG
|
||||||
from ._lemma_rules import ADJECTIVE_RULES, NOUN_RULES, VERB_RULES, PUNCT_RULES
|
from ._lemma_rules import ADJECTIVE_RULES, NOUN_RULES, VERB_RULES, PUNCT_RULES
|
||||||
|
|
||||||
|
|
||||||
INDEX = {
|
LEMMA_INDEX = {'adj': ADJECTIVES, 'adv': ADVERBS, 'noun': NOUNS, 'verb': VERBS}
|
||||||
"adj": ADJECTIVES,
|
|
||||||
"adv": ADVERBS,
|
|
||||||
"noun": NOUNS,
|
|
||||||
"verb": VERBS
|
|
||||||
}
|
|
||||||
|
|
||||||
|
LEMMA_EXC = {'adj': ADJECTIVES_IRREG, 'adv': ADVERBS_IRREG, 'noun': NOUNS_IRREG,
|
||||||
|
'verb': VERBS_IRREG}
|
||||||
|
|
||||||
EXC = {
|
LEMMA_RULES = {'adj': ADJECTIVE_RULES, 'noun': NOUN_RULES, 'verb': VERB_RULES,
|
||||||
"adj": ADJECTIVES_IRREG,
|
'punct': PUNCT_RULES}
|
||||||
"adv": ADVERBS_IRREG,
|
|
||||||
"noun": NOUNS_IRREG,
|
|
||||||
"verb": VERBS_IRREG
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
RULES = {
|
|
||||||
"adj": ADJECTIVE_RULES,
|
|
||||||
"noun": NOUN_RULES,
|
|
||||||
"verb": VERB_RULES,
|
|
||||||
"punct": PUNCT_RULES
|
|
||||||
}
|
|
||||||
|
|
|
@ -1,7 +1,7 @@
|
||||||
# coding: utf8
|
# coding: utf8
|
||||||
from __future__ import unicode_literals
|
from __future__ import unicode_literals
|
||||||
|
|
||||||
LOOK_UP = {
|
LOOKUP = {
|
||||||
" furtherst": "further",
|
" furtherst": "further",
|
||||||
" skilled": "skill",
|
" skilled": "skill",
|
||||||
"'cause": "because",
|
"'cause": "because",
|
||||||
|
@ -41585,4 +41585,4 @@ LOOK_UP = {
|
||||||
"zoospores": "zoospore",
|
"zoospores": "zoospore",
|
||||||
"zucchinis": "zucchini",
|
"zucchinis": "zucchini",
|
||||||
"zygotes": "zygote"
|
"zygotes": "zygote"
|
||||||
}
|
}
|
23
spacy/en/lex_attrs.py
Normal file
23
spacy/en/lex_attrs.py
Normal file
|
@ -0,0 +1,23 @@
|
||||||
|
# coding: utf8
|
||||||
|
from __future__ import unicode_literals
|
||||||
|
|
||||||
|
|
||||||
|
# Number words
|
||||||
|
|
||||||
|
# Cardinal number words, lower-cased. The set is built by whitespace-splitting
# the literal below, so every token between the triple quotes becomes a
# member: the literal must contain words only (no table/diff separator
# characters), otherwise those tokens would be reported as number words too.
NUM_WORDS = set("""
zero one two three four five six seven eight nine ten eleven twelve thirteen
fourteen fifteen sixteen seventeen eighteen nineteen twenty thirty forty fifty
sixty seventy eighty ninety hundred thousand million billion trillion
quadrillion gajillion bazillion
""".split())
|
||||||
|
|
||||||
|
|
||||||
|
# Ordinal words
|
||||||
|
|
||||||
|
# Ordinal number words, lower-cased. The set is built by whitespace-splitting
# the literal below, so every token between the triple quotes becomes a
# member: the literal must contain correctly spelled words only. Spellings
# are "eighth", "twelfth", "seventeenth" and "hundredth" -- a misspelled
# entry means the real word is never matched by a membership test.
ORDINAL_WORDS = set("""
first second third fourth fifth sixth seventh eighth ninth tenth eleventh twelfth
thirteenth fourteenth fifteenth sixteenth seventeenth eighteenth nineteenth
twentieth thirtieth fortieth fiftieth sixtieth seventieth eightieth ninetieth
hundredth thousandth millionth billionth trillionth quadrillionth gajillionth
bazillionth
""".split())
|
|
@ -1,8 +1,8 @@
|
||||||
# coding: utf8
|
# coding: utf8
|
||||||
from __future__ import unicode_literals
|
from __future__ import unicode_literals
|
||||||
|
|
||||||
from ..symbols import *
|
from ..symbols import LEMMA
|
||||||
from ..language_data import PRON_LEMMA
|
from ..deprecated import PRON_LEMMA
|
||||||
|
|
||||||
|
|
||||||
MORPH_RULES = {
|
MORPH_RULES = {
|
||||||
|
|
|
@ -67,24 +67,3 @@ whither who whoever whole whom whose why will with within without would
|
||||||
|
|
||||||
yet you your yours yourself yourselves
|
yet you your yours yourself yourselves
|
||||||
""".split())
|
""".split())
|
||||||
|
|
||||||
|
|
||||||
# Number words
|
|
||||||
|
|
||||||
# Cardinal number words, lower-cased. The set is built by whitespace-splitting
# the literal below, so every token between the triple quotes becomes a
# member: the literal must contain words only (no table/diff separator
# characters), otherwise those tokens would be reported as number words too.
NUM_WORDS = set("""
zero one two three four five six seven eight nine ten eleven twelve thirteen
fourteen fifteen sixteen seventeen eighteen nineteen twenty thirty forty fifty
sixty seventy eighty ninety hundred thousand million billion trillion
quadrillion gajillion bazillion
""".split())
|
|
||||||
|
|
||||||
|
|
||||||
# Ordinal words
|
|
||||||
|
|
||||||
# Ordinal number words, lower-cased. The set is built by whitespace-splitting
# the literal below, so every token between the triple quotes becomes a
# member: the literal must contain correctly spelled words only. Spellings
# are "eighth", "twelfth", "seventeenth" and "hundredth" -- a misspelled
# entry means the real word is never matched by a membership test.
ORDINAL_WORDS = set("""
first second third fourth fifth sixth seventh eighth ninth tenth eleventh twelfth
thirteenth fourteenth fifteenth sixteenth seventeenth eighteenth nineteenth
twentieth thirtieth fortieth fiftieth sixtieth seventieth eightieth ninetieth
hundredth thousandth millionth billionth trillionth quadrillionth gajillionth
bazillionth
""".split())
|
|
|
@ -1,7 +1,8 @@
|
||||||
# coding: utf8
|
# coding: utf8
|
||||||
from __future__ import unicode_literals
|
from __future__ import unicode_literals
|
||||||
|
|
||||||
from ..symbols import *
|
from ..symbols import POS, PUNCT, SYM, ADJ, CCONJ, NUM, DET, ADV, ADP, X, VERB
|
||||||
|
from ..symbols import NOUN, PROPN, PART, INTJ, SPACE, PRON
|
||||||
|
|
||||||
|
|
||||||
TAG_MAP = {
|
TAG_MAP = {
|
||||||
|
|
|
@ -1,13 +1,12 @@
|
||||||
# coding: utf8
|
# coding: utf8
|
||||||
from __future__ import unicode_literals
|
from __future__ import unicode_literals
|
||||||
|
|
||||||
from ..symbols import *
|
from ..symbols import ORTH, LEMMA, TAG, NORM
|
||||||
from ..language_data import PRON_LEMMA
|
from ..deprecated import PRON_LEMMA
|
||||||
|
|
||||||
|
|
||||||
EXC = {}
|
_exc = {}
|
||||||
|
_exclude = ["Ill", "ill", "Its", "its", "Hell", "hell", "Shell", "shell",
|
||||||
EXCLUDE_EXC = ["Ill", "ill", "Its", "its", "Hell", "hell", "Shell", "shell",
|
|
||||||
"Shed", "shed", "were", "Were", "Well", "well", "Whore", "whore"]
|
"Shed", "shed", "were", "Were", "Well", "well", "Whore", "whore"]
|
||||||
|
|
||||||
|
|
||||||
|
@ -15,193 +14,160 @@ EXCLUDE_EXC = ["Ill", "ill", "Its", "its", "Hell", "hell", "Shell", "shell",
|
||||||
|
|
||||||
for pron in ["i"]:
|
for pron in ["i"]:
|
||||||
for orth in [pron, pron.title()]:
|
for orth in [pron, pron.title()]:
|
||||||
EXC[orth + "'m"] = [
|
_exc[orth + "'m"] = [
|
||||||
{ORTH: orth, LEMMA: PRON_LEMMA, TAG: "PRP"},
|
{ORTH: orth, LEMMA: PRON_LEMMA, TAG: "PRP"},
|
||||||
{ORTH: "'m", LEMMA: "be", TAG: "VBP", "tenspect": 1, "number": 1}
|
{ORTH: "'m", LEMMA: "be", TAG: "VBP", "tenspect": 1, "number": 1}]
|
||||||
]
|
|
||||||
|
|
||||||
EXC[orth + "m"] = [
|
_exc[orth + "m"] = [
|
||||||
{ORTH: orth, LEMMA: PRON_LEMMA, TAG: "PRP"},
|
{ORTH: orth, LEMMA: PRON_LEMMA, TAG: "PRP"},
|
||||||
{ORTH: "m", LEMMA: "be", TAG: "VBP", "tenspect": 1, "number": 1 }
|
{ORTH: "m", LEMMA: "be", TAG: "VBP", "tenspect": 1, "number": 1 }]
|
||||||
]
|
|
||||||
|
|
||||||
EXC[orth + "'ma"] = [
|
_exc[orth + "'ma"] = [
|
||||||
{ORTH: orth, LEMMA: PRON_LEMMA, TAG: "PRP"},
|
{ORTH: orth, LEMMA: PRON_LEMMA, TAG: "PRP"},
|
||||||
{ORTH: "'m", LEMMA: "be", NORM: "am"},
|
{ORTH: "'m", LEMMA: "be", NORM: "am"},
|
||||||
{ORTH: "a", LEMMA: "going to", NORM: "gonna"}
|
{ORTH: "a", LEMMA: "going to", NORM: "gonna"}]
|
||||||
]
|
|
||||||
|
|
||||||
EXC[orth + "ma"] = [
|
_exc[orth + "ma"] = [
|
||||||
{ORTH: orth, LEMMA: PRON_LEMMA, TAG: "PRP"},
|
{ORTH: orth, LEMMA: PRON_LEMMA, TAG: "PRP"},
|
||||||
{ORTH: "m", LEMMA: "be", NORM: "am"},
|
{ORTH: "m", LEMMA: "be", NORM: "am"},
|
||||||
{ORTH: "a", LEMMA: "going to", NORM: "gonna"}
|
{ORTH: "a", LEMMA: "going to", NORM: "gonna"}]
|
||||||
]
|
|
||||||
|
|
||||||
|
|
||||||
for pron in ["i", "you", "he", "she", "it", "we", "they"]:
|
for pron in ["i", "you", "he", "she", "it", "we", "they"]:
|
||||||
for orth in [pron, pron.title()]:
|
for orth in [pron, pron.title()]:
|
||||||
EXC[orth + "'ll"] = [
|
_exc[orth + "'ll"] = [
|
||||||
{ORTH: orth, LEMMA: PRON_LEMMA, TAG: "PRP"},
|
{ORTH: orth, LEMMA: PRON_LEMMA, TAG: "PRP"},
|
||||||
{ORTH: "'ll", LEMMA: "will", TAG: "MD"}
|
{ORTH: "'ll", LEMMA: "will", TAG: "MD"}]
|
||||||
]
|
|
||||||
|
|
||||||
EXC[orth + "ll"] = [
|
_exc[orth + "ll"] = [
|
||||||
{ORTH: orth, LEMMA: PRON_LEMMA, TAG: "PRP"},
|
{ORTH: orth, LEMMA: PRON_LEMMA, TAG: "PRP"},
|
||||||
{ORTH: "ll", LEMMA: "will", TAG: "MD"}
|
{ORTH: "ll", LEMMA: "will", TAG: "MD"}]
|
||||||
]
|
|
||||||
|
|
||||||
EXC[orth + "'ll've"] = [
|
_exc[orth + "'ll've"] = [
|
||||||
{ORTH: orth, LEMMA: PRON_LEMMA, TAG: "PRP"},
|
{ORTH: orth, LEMMA: PRON_LEMMA, TAG: "PRP"},
|
||||||
{ORTH: "'ll", LEMMA: "will", TAG: "MD"},
|
{ORTH: "'ll", LEMMA: "will", TAG: "MD"},
|
||||||
{ORTH: "'ve", LEMMA: "have", TAG: "VB"}
|
{ORTH: "'ve", LEMMA: "have", TAG: "VB"}]
|
||||||
]
|
|
||||||
|
|
||||||
EXC[orth + "llve"] = [
|
_exc[orth + "llve"] = [
|
||||||
{ORTH: orth, LEMMA: PRON_LEMMA, TAG: "PRP"},
|
{ORTH: orth, LEMMA: PRON_LEMMA, TAG: "PRP"},
|
||||||
{ORTH: "ll", LEMMA: "will", TAG: "MD"},
|
{ORTH: "ll", LEMMA: "will", TAG: "MD"},
|
||||||
{ORTH: "ve", LEMMA: "have", TAG: "VB"}
|
{ORTH: "ve", LEMMA: "have", TAG: "VB"}]
|
||||||
]
|
|
||||||
|
|
||||||
EXC[orth + "'d"] = [
|
_exc[orth + "'d"] = [
|
||||||
{ORTH: orth, LEMMA: PRON_LEMMA, TAG: "PRP"},
|
{ORTH: orth, LEMMA: PRON_LEMMA, TAG: "PRP"},
|
||||||
{ORTH: "'d", LEMMA: "would", TAG: "MD"}
|
{ORTH: "'d", LEMMA: "would", TAG: "MD"}]
|
||||||
]
|
|
||||||
|
|
||||||
EXC[orth + "d"] = [
|
_exc[orth + "d"] = [
|
||||||
{ORTH: orth, LEMMA: PRON_LEMMA, TAG: "PRP"},
|
{ORTH: orth, LEMMA: PRON_LEMMA, TAG: "PRP"},
|
||||||
{ORTH: "d", LEMMA: "would", TAG: "MD"}
|
{ORTH: "d", LEMMA: "would", TAG: "MD"}]
|
||||||
]
|
|
||||||
|
|
||||||
EXC[orth + "'d've"] = [
|
_exc[orth + "'d've"] = [
|
||||||
{ORTH: orth, LEMMA: PRON_LEMMA, TAG: "PRP"},
|
{ORTH: orth, LEMMA: PRON_LEMMA, TAG: "PRP"},
|
||||||
{ORTH: "'d", LEMMA: "would", TAG: "MD"},
|
{ORTH: "'d", LEMMA: "would", TAG: "MD"},
|
||||||
{ORTH: "'ve", LEMMA: "have", TAG: "VB"}
|
{ORTH: "'ve", LEMMA: "have", TAG: "VB"}]
|
||||||
]
|
|
||||||
|
|
||||||
EXC[orth + "dve"] = [
|
_exc[orth + "dve"] = [
|
||||||
{ORTH: orth, LEMMA: PRON_LEMMA, TAG: "PRP"},
|
{ORTH: orth, LEMMA: PRON_LEMMA, TAG: "PRP"},
|
||||||
{ORTH: "d", LEMMA: "would", TAG: "MD"},
|
{ORTH: "d", LEMMA: "would", TAG: "MD"},
|
||||||
{ORTH: "ve", LEMMA: "have", TAG: "VB"}
|
{ORTH: "ve", LEMMA: "have", TAG: "VB"}]
|
||||||
]
|
|
||||||
|
|
||||||
|
|
||||||
for pron in ["i", "you", "we", "they"]:
|
for pron in ["i", "you", "we", "they"]:
|
||||||
for orth in [pron, pron.title()]:
|
for orth in [pron, pron.title()]:
|
||||||
EXC[orth + "'ve"] = [
|
_exc[orth + "'ve"] = [
|
||||||
{ORTH: orth, LEMMA: PRON_LEMMA, TAG: "PRP"},
|
{ORTH: orth, LEMMA: PRON_LEMMA, TAG: "PRP"},
|
||||||
{ORTH: "'ve", LEMMA: "have", TAG: "VB"}
|
{ORTH: "'ve", LEMMA: "have", TAG: "VB"}]
|
||||||
]
|
|
||||||
|
|
||||||
EXC[orth + "ve"] = [
|
_exc[orth + "ve"] = [
|
||||||
{ORTH: orth, LEMMA: PRON_LEMMA, TAG: "PRP"},
|
{ORTH: orth, LEMMA: PRON_LEMMA, TAG: "PRP"},
|
||||||
{ORTH: "ve", LEMMA: "have", TAG: "VB"}
|
{ORTH: "ve", LEMMA: "have", TAG: "VB"}]
|
||||||
]
|
|
||||||
|
|
||||||
|
|
||||||
for pron in ["you", "we", "they"]:
|
for pron in ["you", "we", "they"]:
|
||||||
for orth in [pron, pron.title()]:
|
for orth in [pron, pron.title()]:
|
||||||
EXC[orth + "'re"] = [
|
_exc[orth + "'re"] = [
|
||||||
{ORTH: orth, LEMMA: PRON_LEMMA, TAG: "PRP"},
|
{ORTH: orth, LEMMA: PRON_LEMMA, TAG: "PRP"},
|
||||||
{ORTH: "'re", LEMMA: "be", NORM: "are"}
|
{ORTH: "'re", LEMMA: "be", NORM: "are"}]
|
||||||
]
|
|
||||||
|
|
||||||
EXC[orth + "re"] = [
|
_exc[orth + "re"] = [
|
||||||
{ORTH: orth, LEMMA: PRON_LEMMA, TAG: "PRP"},
|
{ORTH: orth, LEMMA: PRON_LEMMA, TAG: "PRP"},
|
||||||
{ORTH: "re", LEMMA: "be", NORM: "are", TAG: "VBZ"}
|
{ORTH: "re", LEMMA: "be", NORM: "are", TAG: "VBZ"}]
|
||||||
]
|
|
||||||
|
|
||||||
|
|
||||||
for pron in ["he", "she", "it"]:
|
for pron in ["he", "she", "it"]:
|
||||||
for orth in [pron, pron.title()]:
|
for orth in [pron, pron.title()]:
|
||||||
EXC[orth + "'s"] = [
|
_exc[orth + "'s"] = [
|
||||||
{ORTH: orth, LEMMA: PRON_LEMMA, TAG: "PRP"},
|
{ORTH: orth, LEMMA: PRON_LEMMA, TAG: "PRP"},
|
||||||
{ORTH: "'s"}
|
{ORTH: "'s"}]
|
||||||
]
|
|
||||||
|
|
||||||
EXC[orth + "s"] = [
|
_exc[orth + "s"] = [
|
||||||
{ORTH: orth, LEMMA: PRON_LEMMA, TAG: "PRP"},
|
{ORTH: orth, LEMMA: PRON_LEMMA, TAG: "PRP"},
|
||||||
{ORTH: "s"}
|
{ORTH: "s"}]
|
||||||
]
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
# W-words, relative pronouns, prepositions etc.
|
# W-words, relative pronouns, prepositions etc.
|
||||||
|
|
||||||
for word in ["who", "what", "when", "where", "why", "how", "there", "that"]:
|
for word in ["who", "what", "when", "where", "why", "how", "there", "that"]:
|
||||||
for orth in [word, word.title()]:
|
for orth in [word, word.title()]:
|
||||||
EXC[orth + "'s"] = [
|
_exc[orth + "'s"] = [
|
||||||
{ORTH: orth, LEMMA: word},
|
{ORTH: orth, LEMMA: word},
|
||||||
{ORTH: "'s"}
|
{ORTH: "'s"}]
|
||||||
]
|
|
||||||
|
|
||||||
EXC[orth + "s"] = [
|
_exc[orth + "s"] = [
|
||||||
{ORTH: orth, LEMMA: word},
|
{ORTH: orth, LEMMA: word},
|
||||||
{ORTH: "s"}
|
{ORTH: "s"}]
|
||||||
]
|
|
||||||
|
|
||||||
EXC[orth + "'ll"] = [
|
_exc[orth + "'ll"] = [
|
||||||
{ORTH: orth, LEMMA: word},
|
{ORTH: orth, LEMMA: word},
|
||||||
{ORTH: "'ll", LEMMA: "will", TAG: "MD"}
|
{ORTH: "'ll", LEMMA: "will", TAG: "MD"}]
|
||||||
]
|
|
||||||
|
|
||||||
EXC[orth + "ll"] = [
|
_exc[orth + "ll"] = [
|
||||||
{ORTH: orth, LEMMA: word},
|
{ORTH: orth, LEMMA: word},
|
||||||
{ORTH: "ll", LEMMA: "will", TAG: "MD"}
|
{ORTH: "ll", LEMMA: "will", TAG: "MD"}]
|
||||||
]
|
|
||||||
|
|
||||||
EXC[orth + "'ll've"] = [
|
_exc[orth + "'ll've"] = [
|
||||||
{ORTH: orth, LEMMA: word},
|
{ORTH: orth, LEMMA: word},
|
||||||
{ORTH: "'ll", LEMMA: "will", TAG: "MD"},
|
{ORTH: "'ll", LEMMA: "will", TAG: "MD"},
|
||||||
{ORTH: "'ve", LEMMA: "have", TAG: "VB"}
|
{ORTH: "'ve", LEMMA: "have", TAG: "VB"}]
|
||||||
]
|
|
||||||
|
|
||||||
EXC[orth + "llve"] = [
|
_exc[orth + "llve"] = [
|
||||||
{ORTH: orth, LEMMA: word},
|
{ORTH: orth, LEMMA: word},
|
||||||
{ORTH: "ll", LEMMA: "will", TAG: "MD"},
|
{ORTH: "ll", LEMMA: "will", TAG: "MD"},
|
||||||
{ORTH: "ve", LEMMA: "have", TAG: "VB"}
|
{ORTH: "ve", LEMMA: "have", TAG: "VB"}]
|
||||||
]
|
|
||||||
|
|
||||||
EXC[orth + "'re"] = [
|
_exc[orth + "'re"] = [
|
||||||
{ORTH: orth, LEMMA: word},
|
{ORTH: orth, LEMMA: word},
|
||||||
{ORTH: "'re", LEMMA: "be", NORM: "are"}
|
{ORTH: "'re", LEMMA: "be", NORM: "are"}]
|
||||||
]
|
|
||||||
|
|
||||||
EXC[orth + "re"] = [
|
_exc[orth + "re"] = [
|
||||||
{ORTH: orth, LEMMA: word},
|
{ORTH: orth, LEMMA: word},
|
||||||
{ORTH: "re", LEMMA: "be", NORM: "are"}
|
{ORTH: "re", LEMMA: "be", NORM: "are"}]
|
||||||
]
|
|
||||||
|
|
||||||
EXC[orth + "'ve"] = [
|
_exc[orth + "'ve"] = [
|
||||||
{ORTH: orth},
|
{ORTH: orth},
|
||||||
{ORTH: "'ve", LEMMA: "have", TAG: "VB"}
|
{ORTH: "'ve", LEMMA: "have", TAG: "VB"}]
|
||||||
]
|
|
||||||
|
|
||||||
EXC[orth + "ve"] = [
|
_exc[orth + "ve"] = [
|
||||||
{ORTH: orth, LEMMA: word},
|
{ORTH: orth, LEMMA: word},
|
||||||
{ORTH: "'ve", LEMMA: "have", TAG: "VB"}
|
{ORTH: "ve", LEMMA: "have", TAG: "VB"}]
|
||||||
]
|
|
||||||
|
|
||||||
EXC[orth + "'d"] = [
|
_exc[orth + "'d"] = [
|
||||||
{ORTH: orth, LEMMA: word},
|
{ORTH: orth, LEMMA: word},
|
||||||
{ORTH: "'d"}
|
{ORTH: "'d"}]
|
||||||
]
|
|
||||||
|
|
||||||
EXC[orth + "d"] = [
|
_exc[orth + "d"] = [
|
||||||
{ORTH: orth, LEMMA: word},
|
{ORTH: orth, LEMMA: word},
|
||||||
{ORTH: "d"}
|
{ORTH: "d"}]
|
||||||
]
|
|
||||||
|
|
||||||
EXC[orth + "'d've"] = [
|
_exc[orth + "'d've"] = [
|
||||||
{ORTH: orth, LEMMA: word},
|
{ORTH: orth, LEMMA: word},
|
||||||
{ORTH: "'d", LEMMA: "would", TAG: "MD"},
|
{ORTH: "'d", LEMMA: "would", TAG: "MD"},
|
||||||
{ORTH: "'ve", LEMMA: "have", TAG: "VB"}
|
{ORTH: "'ve", LEMMA: "have", TAG: "VB"}]
|
||||||
]
|
|
||||||
|
|
||||||
EXC[orth + "dve"] = [
|
_exc[orth + "dve"] = [
|
||||||
{ORTH: orth, LEMMA: word},
|
{ORTH: orth, LEMMA: word},
|
||||||
{ORTH: "d", LEMMA: "would", TAG: "MD"},
|
{ORTH: "d", LEMMA: "would", TAG: "MD"},
|
||||||
{ORTH: "ve", LEMMA: "have", TAG: "VB"}
|
{ORTH: "ve", LEMMA: "have", TAG: "VB"}]
|
||||||
]
|
|
||||||
|
|
||||||
|
|
||||||
# Verbs
|
# Verbs
|
||||||
|
@ -221,54 +187,44 @@ for verb_data in [
|
||||||
{ORTH: "sha", LEMMA: "shall", TAG: "MD"},
|
{ORTH: "sha", LEMMA: "shall", TAG: "MD"},
|
||||||
{ORTH: "should", TAG: "MD"},
|
{ORTH: "should", TAG: "MD"},
|
||||||
{ORTH: "wo", LEMMA: "will", TAG: "MD"},
|
{ORTH: "wo", LEMMA: "will", TAG: "MD"},
|
||||||
{ORTH: "would", TAG: "MD"}
|
{ORTH: "would", TAG: "MD"}]:
|
||||||
]:
|
|
||||||
verb_data_tc = dict(verb_data)
|
verb_data_tc = dict(verb_data)
|
||||||
verb_data_tc[ORTH] = verb_data_tc[ORTH].title()
|
verb_data_tc[ORTH] = verb_data_tc[ORTH].title()
|
||||||
|
|
||||||
for data in [verb_data, verb_data_tc]:
|
for data in [verb_data, verb_data_tc]:
|
||||||
EXC[data[ORTH] + "n't"] = [
|
_exc[data[ORTH] + "n't"] = [
|
||||||
dict(data),
|
dict(data),
|
||||||
{ORTH: "n't", LEMMA: "not", TAG: "RB"}
|
{ORTH: "n't", LEMMA: "not", TAG: "RB"}]
|
||||||
]
|
|
||||||
|
|
||||||
EXC[data[ORTH] + "nt"] = [
|
_exc[data[ORTH] + "nt"] = [
|
||||||
dict(data),
|
dict(data),
|
||||||
{ORTH: "nt", LEMMA: "not", TAG: "RB"}
|
{ORTH: "nt", LEMMA: "not", TAG: "RB"}]
|
||||||
]
|
|
||||||
|
|
||||||
EXC[data[ORTH] + "n't've"] = [
|
_exc[data[ORTH] + "n't've"] = [
|
||||||
dict(data),
|
dict(data),
|
||||||
{ORTH: "n't", LEMMA: "not", TAG: "RB"},
|
{ORTH: "n't", LEMMA: "not", TAG: "RB"},
|
||||||
{ORTH: "'ve", LEMMA: "have", TAG: "VB"}
|
{ORTH: "'ve", LEMMA: "have", TAG: "VB"}]
|
||||||
]
|
|
||||||
|
|
||||||
EXC[data[ORTH] + "ntve"] = [
|
_exc[data[ORTH] + "ntve"] = [
|
||||||
dict(data),
|
dict(data),
|
||||||
{ORTH: "nt", LEMMA: "not", TAG: "RB"},
|
{ORTH: "nt", LEMMA: "not", TAG: "RB"},
|
||||||
{ORTH: "ve", LEMMA: "have", TAG: "VB"}
|
{ORTH: "ve", LEMMA: "have", TAG: "VB"}]
|
||||||
]
|
|
||||||
|
|
||||||
|
|
||||||
for verb_data in [
|
for verb_data in [
|
||||||
{ORTH: "could", TAG: "MD"},
|
{ORTH: "could", TAG: "MD"},
|
||||||
{ORTH: "might"},
|
{ORTH: "might"},
|
||||||
{ORTH: "must"},
|
{ORTH: "must"},
|
||||||
{ORTH: "should"}
|
{ORTH: "should"}]:
|
||||||
]:
|
|
||||||
verb_data_tc = dict(verb_data)
|
verb_data_tc = dict(verb_data)
|
||||||
verb_data_tc[ORTH] = verb_data_tc[ORTH].title()
|
verb_data_tc[ORTH] = verb_data_tc[ORTH].title()
|
||||||
|
|
||||||
for data in [verb_data, verb_data_tc]:
|
for data in [verb_data, verb_data_tc]:
|
||||||
EXC[data[ORTH] + "'ve"] = [
|
_exc[data[ORTH] + "'ve"] = [
|
||||||
dict(data),
|
dict(data),
|
||||||
{ORTH: "'ve", LEMMA: "have", TAG: "VB"}
|
{ORTH: "'ve", LEMMA: "have", TAG: "VB"}]
|
||||||
]
|
|
||||||
|
|
||||||
EXC[data[ORTH] + "ve"] = [
|
_exc[data[ORTH] + "ve"] = [
|
||||||
dict(data),
|
dict(data),
|
||||||
{ORTH: "ve", LEMMA: "have", TAG: "VB"}
|
{ORTH: "ve", LEMMA: "have", TAG: "VB"}]
|
||||||
]
|
|
||||||
|
|
||||||
|
|
||||||
for verb_data in [
|
for verb_data in [
|
||||||
|
@ -276,22 +232,17 @@ for verb_data in [
|
||||||
{ORTH: "are", LEMMA: "be", TAG: "VBP", "number": 2},
|
{ORTH: "are", LEMMA: "be", TAG: "VBP", "number": 2},
|
||||||
{ORTH: "is", LEMMA: "be", TAG: "VBZ"},
|
{ORTH: "is", LEMMA: "be", TAG: "VBZ"},
|
||||||
{ORTH: "was", LEMMA: "be"},
|
{ORTH: "was", LEMMA: "be"},
|
||||||
{ORTH: "were", LEMMA: "be"}
|
{ORTH: "were", LEMMA: "be"}]:
|
||||||
]:
|
|
||||||
verb_data_tc = dict(verb_data)
|
verb_data_tc = dict(verb_data)
|
||||||
verb_data_tc[ORTH] = verb_data_tc[ORTH].title()
|
verb_data_tc[ORTH] = verb_data_tc[ORTH].title()
|
||||||
|
|
||||||
for data in [verb_data, verb_data_tc]:
|
for data in [verb_data, verb_data_tc]:
|
||||||
EXC[data[ORTH] + "n't"] = [
|
_exc[data[ORTH] + "n't"] = [
|
||||||
dict(data),
|
dict(data),
|
||||||
{ORTH: "n't", LEMMA: "not", TAG: "RB"}
|
{ORTH: "n't", LEMMA: "not", TAG: "RB"}]
|
||||||
]
|
|
||||||
|
|
||||||
EXC[data[ORTH] + "nt"] = [
|
_exc[data[ORTH] + "nt"] = [
|
||||||
dict(data),
|
dict(data),
|
||||||
{ORTH: "nt", LEMMA: "not", TAG: "RB"}
|
{ORTH: "nt", LEMMA: "not", TAG: "RB"}]
|
||||||
]
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
# Other contractions with trailing apostrophe
|
# Other contractions with trailing apostrophe
|
||||||
|
@ -302,22 +253,14 @@ for exc_data in [
|
||||||
{ORTH: "nothin", LEMMA: "nothing"},
|
{ORTH: "nothin", LEMMA: "nothing"},
|
||||||
{ORTH: "nuthin", LEMMA: "nothing"},
|
{ORTH: "nuthin", LEMMA: "nothing"},
|
||||||
{ORTH: "ol", LEMMA: "old"},
|
{ORTH: "ol", LEMMA: "old"},
|
||||||
{ORTH: "somethin", LEMMA: "something"}
|
{ORTH: "somethin", LEMMA: "something"}]:
|
||||||
]:
|
|
||||||
exc_data_tc = dict(exc_data)
|
exc_data_tc = dict(exc_data)
|
||||||
exc_data_tc[ORTH] = exc_data_tc[ORTH].title()
|
exc_data_tc[ORTH] = exc_data_tc[ORTH].title()
|
||||||
|
|
||||||
for data in [exc_data, exc_data_tc]:
|
for data in [exc_data, exc_data_tc]:
|
||||||
data_apos = dict(data)
|
data_apos = dict(data)
|
||||||
data_apos[ORTH] = data_apos[ORTH] + "'"
|
data_apos[ORTH] = data_apos[ORTH] + "'"
|
||||||
|
_exc[data[ORTH]] = [dict(data)]
|
||||||
EXC[data[ORTH]] = [
|
_exc[data_apos[ORTH]] = [dict(data_apos)]
|
||||||
dict(data)
|
|
||||||
]
|
|
||||||
|
|
||||||
EXC[data_apos[ORTH]] = [
|
|
||||||
dict(data_apos)
|
|
||||||
]
|
|
||||||
|
|
||||||
|
|
||||||
# Other contractions with leading apostrophe
|
# Other contractions with leading apostrophe
|
||||||
|
@ -326,449 +269,181 @@ for exc_data in [
|
||||||
{ORTH: "cause", LEMMA: "because"},
|
{ORTH: "cause", LEMMA: "because"},
|
||||||
{ORTH: "em", LEMMA: PRON_LEMMA, NORM: "them"},
|
{ORTH: "em", LEMMA: PRON_LEMMA, NORM: "them"},
|
||||||
{ORTH: "ll", LEMMA: "will"},
|
{ORTH: "ll", LEMMA: "will"},
|
||||||
{ORTH: "nuff", LEMMA: "enough"}
|
{ORTH: "nuff", LEMMA: "enough"}]:
|
||||||
]:
|
|
||||||
exc_data_apos = dict(exc_data)
|
exc_data_apos = dict(exc_data)
|
||||||
exc_data_apos[ORTH] = "'" + exc_data_apos[ORTH]
|
exc_data_apos[ORTH] = "'" + exc_data_apos[ORTH]
|
||||||
|
|
||||||
for data in [exc_data, exc_data_apos]:
|
for data in [exc_data, exc_data_apos]:
|
||||||
EXC[data[ORTH]] = [
|
_exc[data[ORTH]] = [dict(data)]
|
||||||
dict(data)
|
|
||||||
]
|
|
||||||
|
|
||||||
|
|
||||||
# Times
|
# Times
|
||||||
|
|
||||||
for h in range(1, 12 + 1):
|
for h in range(1, 12 + 1):
|
||||||
hour = str(h)
|
hour = str(h)
|
||||||
|
|
||||||
for period in ["a.m.", "am"]:
|
for period in ["a.m.", "am"]:
|
||||||
EXC[hour + period] = [
|
_exc[hour+period] = [
|
||||||
{ORTH: hour},
|
{ORTH: hour},
|
||||||
{ORTH: period, LEMMA: "a.m."}
|
{ORTH: period, LEMMA: "a.m."}]
|
||||||
]
|
|
||||||
for period in ["p.m.", "pm"]:
|
for period in ["p.m.", "pm"]:
|
||||||
EXC[hour + period] = [
|
_exc[hour+period] = [
|
||||||
{ORTH: hour},
|
{ORTH: hour},
|
||||||
{ORTH: period, LEMMA: "p.m."}
|
{ORTH: period, LEMMA: "p.m."}]
|
||||||
]
|
|
||||||
|
|
||||||
|
|
||||||
# Rest
|
# Rest
|
||||||
|
|
||||||
OTHER = {
|
_other_exc = {
|
||||||
" ": [
|
|
||||||
{ORTH: " ", TAG: "SP"}
|
|
||||||
],
|
|
||||||
|
|
||||||
"\u00a0": [
|
|
||||||
{ORTH: "\u00a0", TAG: "SP", LEMMA: " "}
|
|
||||||
],
|
|
||||||
|
|
||||||
"'S": [
|
|
||||||
{ORTH: "'S", LEMMA: "'s"}
|
|
||||||
],
|
|
||||||
|
|
||||||
"'s": [
|
|
||||||
{ORTH: "'s", LEMMA: "'s"}
|
|
||||||
],
|
|
||||||
|
|
||||||
"'re": [
|
|
||||||
{ORTH: "'re", LEMMA: "be", NORM: "are"}
|
|
||||||
],
|
|
||||||
|
|
||||||
"\u2018S": [
|
|
||||||
{ORTH: "\u2018S", LEMMA: "'s"}
|
|
||||||
],
|
|
||||||
|
|
||||||
"\u2018s": [
|
|
||||||
{ORTH: "\u2018s", LEMMA: "'s"}
|
|
||||||
],
|
|
||||||
|
|
||||||
"and/or": [
|
|
||||||
{ORTH: "and/or", LEMMA: "and/or", TAG: "CC"}
|
|
||||||
],
|
|
||||||
|
|
||||||
"'Cause": [
|
|
||||||
{ORTH: "'Cause", LEMMA: "because"}
|
|
||||||
],
|
|
||||||
|
|
||||||
"y'all": [
|
"y'all": [
|
||||||
{ORTH: "y'", LEMMA: PRON_LEMMA, NORM: "you"},
|
{ORTH: "y'", LEMMA: PRON_LEMMA, NORM: "you"},
|
||||||
{ORTH: "all"}
|
{ORTH: "all"}],
|
||||||
],
|
|
||||||
|
|
||||||
"yall": [
|
"yall": [
|
||||||
{ORTH: "y", LEMMA: PRON_LEMMA, NORM: "you"},
|
{ORTH: "y", LEMMA: PRON_LEMMA, NORM: "you"},
|
||||||
{ORTH: "all"}
|
{ORTH: "all"}],
|
||||||
],
|
|
||||||
|
|
||||||
"ma'am": [
|
|
||||||
{ORTH: "ma'am", LEMMA: "madam"}
|
|
||||||
],
|
|
||||||
|
|
||||||
"Ma'am": [
|
|
||||||
{ORTH: "Ma'am", LEMMA: "madam"}
|
|
||||||
],
|
|
||||||
|
|
||||||
"o'clock": [
|
|
||||||
{ORTH: "o'clock", LEMMA: "o'clock"}
|
|
||||||
],
|
|
||||||
|
|
||||||
"O'clock": [
|
|
||||||
{ORTH: "O'clock", LEMMA: "o'clock"}
|
|
||||||
],
|
|
||||||
|
|
||||||
"how'd'y": [
|
"how'd'y": [
|
||||||
{ORTH: "how", LEMMA: "how"},
|
{ORTH: "how", LEMMA: "how"},
|
||||||
{ORTH: "'d", LEMMA: "do"},
|
{ORTH: "'d", LEMMA: "do"},
|
||||||
{ORTH: "'y", LEMMA: PRON_LEMMA, NORM: "you"}
|
{ORTH: "'y", LEMMA: PRON_LEMMA, NORM: "you"}],
|
||||||
],
|
|
||||||
|
|
||||||
"How'd'y": [
|
"How'd'y": [
|
||||||
{ORTH: "How", LEMMA: "how"},
|
{ORTH: "How", LEMMA: "how"},
|
||||||
{ORTH: "'d", LEMMA: "do"},
|
{ORTH: "'d", LEMMA: "do"},
|
||||||
{ORTH: "'y", LEMMA: PRON_LEMMA, NORM: "you"}
|
{ORTH: "'y", LEMMA: PRON_LEMMA, NORM: "you"}],
|
||||||
],
|
|
||||||
|
|
||||||
"not've": [
|
"not've": [
|
||||||
{ORTH: "not", LEMMA: "not", TAG: "RB"},
|
{ORTH: "not", LEMMA: "not", TAG: "RB"},
|
||||||
{ORTH: "'ve", LEMMA: "have", TAG: "VB"}
|
{ORTH: "'ve", LEMMA: "have", TAG: "VB"}],
|
||||||
],
|
|
||||||
|
|
||||||
"notve": [
|
"notve": [
|
||||||
{ORTH: "not", LEMMA: "not", TAG: "RB"},
|
{ORTH: "not", LEMMA: "not", TAG: "RB"},
|
||||||
{ORTH: "ve", LEMMA: "have", TAG: "VB"}
|
{ORTH: "ve", LEMMA: "have", TAG: "VB"}],
|
||||||
],
|
|
||||||
|
|
||||||
"Not've": [
|
"Not've": [
|
||||||
{ORTH: "Not", LEMMA: "not", TAG: "RB"},
|
{ORTH: "Not", LEMMA: "not", TAG: "RB"},
|
||||||
{ORTH: "'ve", LEMMA: "have", TAG: "VB"}
|
{ORTH: "'ve", LEMMA: "have", TAG: "VB"}],
|
||||||
],
|
|
||||||
|
|
||||||
"Notve": [
|
"Notve": [
|
||||||
{ORTH: "Not", LEMMA: "not", TAG: "RB"},
|
{ORTH: "Not", LEMMA: "not", TAG: "RB"},
|
||||||
{ORTH: "ve", LEMMA: "have", TAG: "VB"}
|
{ORTH: "ve", LEMMA: "have", TAG: "VB"}],
|
||||||
],
|
|
||||||
|
|
||||||
"cannot": [
|
"cannot": [
|
||||||
{ORTH: "can", LEMMA: "can", TAG: "MD"},
|
{ORTH: "can", LEMMA: "can", TAG: "MD"},
|
||||||
{ORTH: "not", LEMMA: "not", TAG: "RB"}
|
{ORTH: "not", LEMMA: "not", TAG: "RB"}],
|
||||||
],
|
|
||||||
|
|
||||||
"Cannot": [
|
"Cannot": [
|
||||||
{ORTH: "Can", LEMMA: "can", TAG: "MD"},
|
{ORTH: "Can", LEMMA: "can", TAG: "MD"},
|
||||||
{ORTH: "not", LEMMA: "not", TAG: "RB"}
|
{ORTH: "not", LEMMA: "not", TAG: "RB"}],
|
||||||
],
|
|
||||||
|
|
||||||
"gonna": [
|
"gonna": [
|
||||||
{ORTH: "gon", LEMMA: "go", NORM: "going"},
|
{ORTH: "gon", LEMMA: "go", NORM: "going"},
|
||||||
{ORTH: "na", LEMMA: "to"}
|
{ORTH: "na", LEMMA: "to"}],
|
||||||
],
|
|
||||||
|
|
||||||
"Gonna": [
|
"Gonna": [
|
||||||
{ORTH: "Gon", LEMMA: "go", NORM: "going"},
|
{ORTH: "Gon", LEMMA: "go", NORM: "going"},
|
||||||
{ORTH: "na", LEMMA: "to"}
|
{ORTH: "na", LEMMA: "to"}],
|
||||||
],
|
|
||||||
|
|
||||||
"gotta": [
|
"gotta": [
|
||||||
{ORTH: "got"},
|
{ORTH: "got"},
|
||||||
{ORTH: "ta", LEMMA: "to"}
|
{ORTH: "ta", LEMMA: "to"}],
|
||||||
],
|
|
||||||
|
|
||||||
"Gotta": [
|
"Gotta": [
|
||||||
{ORTH: "Got"},
|
{ORTH: "Got"},
|
||||||
{ORTH: "ta", LEMMA: "to"}
|
{ORTH: "ta", LEMMA: "to"}],
|
||||||
],
|
|
||||||
|
|
||||||
"let's": [
|
"let's": [
|
||||||
{ORTH: "let"},
|
{ORTH: "let"},
|
||||||
{ORTH: "'s", LEMMA: PRON_LEMMA, NORM: "us"}
|
{ORTH: "'s", LEMMA: PRON_LEMMA, NORM: "us"}],
|
||||||
],
|
|
||||||
|
|
||||||
"Let's": [
|
"Let's": [
|
||||||
{ORTH: "Let", LEMMA: "let"},
|
{ORTH: "Let", LEMMA: "let"},
|
||||||
{ORTH: "'s", LEMMA: PRON_LEMMA, NORM: "us"}
|
{ORTH: "'s", LEMMA: PRON_LEMMA, NORM: "us"}]
|
||||||
],
|
|
||||||
|
|
||||||
"\u2014": [
|
|
||||||
{ORTH: "\u2014", TAG: ":", LEMMA: "--"}
|
|
||||||
],
|
|
||||||
|
|
||||||
"\n": [
|
|
||||||
{ORTH: "\n", TAG: "SP"}
|
|
||||||
],
|
|
||||||
|
|
||||||
"\t": [
|
|
||||||
{ORTH: "\t", TAG: "SP"}
|
|
||||||
]
|
|
||||||
}
|
}
|
||||||
|
|
||||||
|
_exc.update(_other_exc)
|
||||||
# Abbreviations
|
|
||||||
|
|
||||||
ABBREVIATIONS = {
|
|
||||||
"Mt.": [
|
|
||||||
{ORTH: "Mt.", LEMMA: "Mount"}
|
|
||||||
],
|
|
||||||
|
|
||||||
"Ak.": [
|
|
||||||
{ORTH: "Ak.", LEMMA: "Alaska"}
|
|
||||||
],
|
|
||||||
|
|
||||||
"Ala.": [
|
|
||||||
{ORTH: "Ala.", LEMMA: "Alabama"}
|
|
||||||
],
|
|
||||||
|
|
||||||
"Apr.": [
|
|
||||||
{ORTH: "Apr.", LEMMA: "April"}
|
|
||||||
],
|
|
||||||
|
|
||||||
"Ariz.": [
|
|
||||||
{ORTH: "Ariz.", LEMMA: "Arizona"}
|
|
||||||
],
|
|
||||||
|
|
||||||
"Ark.": [
|
|
||||||
{ORTH: "Ark.", LEMMA: "Arkansas"}
|
|
||||||
],
|
|
||||||
|
|
||||||
"Aug.": [
|
|
||||||
{ORTH: "Aug.", LEMMA: "August"}
|
|
||||||
],
|
|
||||||
|
|
||||||
"Calif.": [
|
|
||||||
{ORTH: "Calif.", LEMMA: "California"}
|
|
||||||
],
|
|
||||||
|
|
||||||
"Colo.": [
|
|
||||||
{ORTH: "Colo.", LEMMA: "Colorado"}
|
|
||||||
],
|
|
||||||
|
|
||||||
"Conn.": [
|
|
||||||
{ORTH: "Conn.", LEMMA: "Connecticut"}
|
|
||||||
],
|
|
||||||
|
|
||||||
"Dec.": [
|
|
||||||
{ORTH: "Dec.", LEMMA: "December"}
|
|
||||||
],
|
|
||||||
|
|
||||||
"Del.": [
|
|
||||||
{ORTH: "Del.", LEMMA: "Delaware"}
|
|
||||||
],
|
|
||||||
|
|
||||||
"Feb.": [
|
|
||||||
{ORTH: "Feb.", LEMMA: "February"}
|
|
||||||
],
|
|
||||||
|
|
||||||
"Fla.": [
|
|
||||||
{ORTH: "Fla.", LEMMA: "Florida"}
|
|
||||||
],
|
|
||||||
|
|
||||||
"Ga.": [
|
|
||||||
{ORTH: "Ga.", LEMMA: "Georgia"}
|
|
||||||
],
|
|
||||||
|
|
||||||
"Ia.": [
|
|
||||||
{ORTH: "Ia.", LEMMA: "Iowa"}
|
|
||||||
],
|
|
||||||
|
|
||||||
"Id.": [
|
|
||||||
{ORTH: "Id.", LEMMA: "Idaho"}
|
|
||||||
],
|
|
||||||
|
|
||||||
"Ill.": [
|
|
||||||
{ORTH: "Ill.", LEMMA: "Illinois"}
|
|
||||||
],
|
|
||||||
|
|
||||||
"Ind.": [
|
|
||||||
{ORTH: "Ind.", LEMMA: "Indiana"}
|
|
||||||
],
|
|
||||||
|
|
||||||
"Jan.": [
|
|
||||||
{ORTH: "Jan.", LEMMA: "January"}
|
|
||||||
],
|
|
||||||
|
|
||||||
"Jul.": [
|
|
||||||
{ORTH: "Jul.", LEMMA: "July"}
|
|
||||||
],
|
|
||||||
|
|
||||||
"Jun.": [
|
|
||||||
{ORTH: "Jun.", LEMMA: "June"}
|
|
||||||
],
|
|
||||||
|
|
||||||
"Kan.": [
|
|
||||||
{ORTH: "Kan.", LEMMA: "Kansas"}
|
|
||||||
],
|
|
||||||
|
|
||||||
"Kans.": [
|
|
||||||
{ORTH: "Kans.", LEMMA: "Kansas"}
|
|
||||||
],
|
|
||||||
|
|
||||||
"Ky.": [
|
|
||||||
{ORTH: "Ky.", LEMMA: "Kentucky"}
|
|
||||||
],
|
|
||||||
|
|
||||||
"La.": [
|
|
||||||
{ORTH: "La.", LEMMA: "Louisiana"}
|
|
||||||
],
|
|
||||||
|
|
||||||
"Mar.": [
|
|
||||||
{ORTH: "Mar.", LEMMA: "March"}
|
|
||||||
],
|
|
||||||
|
|
||||||
"Mass.": [
|
|
||||||
{ORTH: "Mass.", LEMMA: "Massachusetts"}
|
|
||||||
],
|
|
||||||
|
|
||||||
"May.": [
|
|
||||||
{ORTH: "May.", LEMMA: "May"}
|
|
||||||
],
|
|
||||||
|
|
||||||
"Mich.": [
|
|
||||||
{ORTH: "Mich.", LEMMA: "Michigan"}
|
|
||||||
],
|
|
||||||
|
|
||||||
"Minn.": [
|
|
||||||
{ORTH: "Minn.", LEMMA: "Minnesota"}
|
|
||||||
],
|
|
||||||
|
|
||||||
"Miss.": [
|
|
||||||
{ORTH: "Miss.", LEMMA: "Mississippi"}
|
|
||||||
],
|
|
||||||
|
|
||||||
"N.C.": [
|
|
||||||
{ORTH: "N.C.", LEMMA: "North Carolina"}
|
|
||||||
],
|
|
||||||
|
|
||||||
"N.D.": [
|
|
||||||
{ORTH: "N.D.", LEMMA: "North Dakota"}
|
|
||||||
],
|
|
||||||
|
|
||||||
"N.H.": [
|
|
||||||
{ORTH: "N.H.", LEMMA: "New Hampshire"}
|
|
||||||
],
|
|
||||||
|
|
||||||
"N.J.": [
|
|
||||||
{ORTH: "N.J.", LEMMA: "New Jersey"}
|
|
||||||
],
|
|
||||||
|
|
||||||
"N.M.": [
|
|
||||||
{ORTH: "N.M.", LEMMA: "New Mexico"}
|
|
||||||
],
|
|
||||||
|
|
||||||
"N.Y.": [
|
|
||||||
{ORTH: "N.Y.", LEMMA: "New York"}
|
|
||||||
],
|
|
||||||
|
|
||||||
"Neb.": [
|
|
||||||
{ORTH: "Neb.", LEMMA: "Nebraska"}
|
|
||||||
],
|
|
||||||
|
|
||||||
"Nebr.": [
|
|
||||||
{ORTH: "Nebr.", LEMMA: "Nebraska"}
|
|
||||||
],
|
|
||||||
|
|
||||||
"Nev.": [
|
|
||||||
{ORTH: "Nev.", LEMMA: "Nevada"}
|
|
||||||
],
|
|
||||||
|
|
||||||
"Nov.": [
|
|
||||||
{ORTH: "Nov.", LEMMA: "November"}
|
|
||||||
],
|
|
||||||
|
|
||||||
"Oct.": [
|
|
||||||
{ORTH: "Oct.", LEMMA: "October"}
|
|
||||||
],
|
|
||||||
|
|
||||||
"Okla.": [
|
|
||||||
{ORTH: "Okla.", LEMMA: "Oklahoma"}
|
|
||||||
],
|
|
||||||
|
|
||||||
"Ore.": [
|
|
||||||
{ORTH: "Ore.", LEMMA: "Oregon"}
|
|
||||||
],
|
|
||||||
|
|
||||||
"Pa.": [
|
|
||||||
{ORTH: "Pa.", LEMMA: "Pennsylvania"}
|
|
||||||
],
|
|
||||||
|
|
||||||
"S.C.": [
|
|
||||||
{ORTH: "S.C.", LEMMA: "South Carolina"}
|
|
||||||
],
|
|
||||||
|
|
||||||
"Sep.": [
|
|
||||||
{ORTH: "Sep.", LEMMA: "September"}
|
|
||||||
],
|
|
||||||
|
|
||||||
"Sept.": [
|
|
||||||
{ORTH: "Sept.", LEMMA: "September"}
|
|
||||||
],
|
|
||||||
|
|
||||||
"Tenn.": [
|
|
||||||
{ORTH: "Tenn.", LEMMA: "Tennessee"}
|
|
||||||
],
|
|
||||||
|
|
||||||
"Va.": [
|
|
||||||
{ORTH: "Va.", LEMMA: "Virginia"}
|
|
||||||
],
|
|
||||||
|
|
||||||
"Wash.": [
|
|
||||||
{ORTH: "Wash.", LEMMA: "Washington"}
|
|
||||||
],
|
|
||||||
|
|
||||||
"Wis.": [
|
|
||||||
{ORTH: "Wis.", LEMMA: "Wisconsin"}
|
|
||||||
]
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
TOKENIZER_EXCEPTIONS = dict(EXC)
|
for exc_data in [
|
||||||
TOKENIZER_EXCEPTIONS.update(OTHER)
|
{ORTH: "'S", LEMMA: "'s"},
|
||||||
TOKENIZER_EXCEPTIONS.update(ABBREVIATIONS)
|
{ORTH: "'s", LEMMA: "'s"},
|
||||||
|
{ORTH: "\u2018S", LEMMA: "'s"},
|
||||||
|
{ORTH: "\u2018s", LEMMA: "'s"},
|
||||||
|
{ORTH: "and/or", LEMMA: "and/or", TAG: "CC"},
|
||||||
|
{ORTH: "'re", LEMMA: "be", NORM: "are"},
|
||||||
|
{ORTH: "'Cause", LEMMA: "because"},
|
||||||
|
{ORTH: "'cause", LEMMA: "because"},
|
||||||
|
{ORTH: "ma'am", LEMMA: "madam"},
|
||||||
|
{ORTH: "Ma'am", LEMMA: "madam"},
|
||||||
|
{ORTH: "o'clock", LEMMA: "o'clock"},
|
||||||
|
{ORTH: "O'clock", LEMMA: "o'clock"},
|
||||||
|
|
||||||
|
{ORTH: "Mt.", LEMMA: "Mount"},
|
||||||
|
{ORTH: "Ak.", LEMMA: "Alaska"},
|
||||||
|
{ORTH: "Ala.", LEMMA: "Alabama"},
|
||||||
|
{ORTH: "Apr.", LEMMA: "April"},
|
||||||
|
{ORTH: "Ariz.", LEMMA: "Arizona"},
|
||||||
|
{ORTH: "Ark.", LEMMA: "Arkansas"},
|
||||||
|
{ORTH: "Aug.", LEMMA: "August"},
|
||||||
|
{ORTH: "Calif.", LEMMA: "California"},
|
||||||
|
{ORTH: "Colo.", LEMMA: "Colorado"},
|
||||||
|
{ORTH: "Conn.", LEMMA: "Connecticut"},
|
||||||
|
{ORTH: "Dec.", LEMMA: "December"},
|
||||||
|
{ORTH: "Del.", LEMMA: "Delaware"},
|
||||||
|
{ORTH: "Feb.", LEMMA: "February"},
|
||||||
|
{ORTH: "Fla.", LEMMA: "Florida"},
|
||||||
|
{ORTH: "Ga.", LEMMA: "Georgia"},
|
||||||
|
{ORTH: "Ia.", LEMMA: "Iowa"},
|
||||||
|
{ORTH: "Id.", LEMMA: "Idaho"},
|
||||||
|
{ORTH: "Ill.", LEMMA: "Illinois"},
|
||||||
|
{ORTH: "Ind.", LEMMA: "Indiana"},
|
||||||
|
{ORTH: "Jan.", LEMMA: "January"},
|
||||||
|
{ORTH: "Jul.", LEMMA: "July"},
|
||||||
|
{ORTH: "Jun.", LEMMA: "June"},
|
||||||
|
{ORTH: "Kan.", LEMMA: "Kansas"},
|
||||||
|
{ORTH: "Kans.", LEMMA: "Kansas"},
|
||||||
|
{ORTH: "Ky.", LEMMA: "Kentucky"},
|
||||||
|
{ORTH: "La.", LEMMA: "Louisiana"},
|
||||||
|
{ORTH: "Mar.", LEMMA: "March"},
|
||||||
|
{ORTH: "Mass.", LEMMA: "Massachusetts"},
|
||||||
|
{ORTH: "May.", LEMMA: "May"},
|
||||||
|
{ORTH: "Mich.", LEMMA: "Michigan"},
|
||||||
|
{ORTH: "Minn.", LEMMA: "Minnesota"},
|
||||||
|
{ORTH: "Miss.", LEMMA: "Mississippi"},
|
||||||
|
{ORTH: "N.C.", LEMMA: "North Carolina"},
|
||||||
|
{ORTH: "N.D.", LEMMA: "North Dakota"},
|
||||||
|
{ORTH: "N.H.", LEMMA: "New Hampshire"},
|
||||||
|
{ORTH: "N.J.", LEMMA: "New Jersey"},
|
||||||
|
{ORTH: "N.M.", LEMMA: "New Mexico"},
|
||||||
|
{ORTH: "N.Y.", LEMMA: "New York"},
|
||||||
|
{ORTH: "Neb.", LEMMA: "Nebraska"},
|
||||||
|
{ORTH: "Nebr.", LEMMA: "Nebraska"},
|
||||||
|
{ORTH: "Nev.", LEMMA: "Nevada"},
|
||||||
|
{ORTH: "Nov.", LEMMA: "November"},
|
||||||
|
{ORTH: "Oct.", LEMMA: "October"},
|
||||||
|
{ORTH: "Okla.", LEMMA: "Oklahoma"},
|
||||||
|
{ORTH: "Ore.", LEMMA: "Oregon"},
|
||||||
|
{ORTH: "Pa.", LEMMA: "Pennsylvania"},
|
||||||
|
{ORTH: "S.C.", LEMMA: "South Carolina"},
|
||||||
|
{ORTH: "Sep.", LEMMA: "September"},
|
||||||
|
{ORTH: "Sept.", LEMMA: "September"},
|
||||||
|
{ORTH: "Tenn.", LEMMA: "Tennessee"},
|
||||||
|
{ORTH: "Va.", LEMMA: "Virginia"},
|
||||||
|
{ORTH: "Wash.", LEMMA: "Washington"},
|
||||||
|
{ORTH: "Wis.", LEMMA: "Wisconsin"}]:
|
||||||
|
_exc[exc_data[ORTH]] = [dict(exc_data)]
|
||||||
|
|
||||||
|
|
||||||
# Remove EXCLUDE_EXC if in exceptions
|
for orth in [
|
||||||
|
"'d", "a.m.", "Adm.", "Bros.", "co.", "Co.", "Corp.", "D.C.", "Dr.", "e.g.",
|
||||||
for string in EXCLUDE_EXC:
|
"E.g.", "E.G.", "Gen.", "Gov.", "i.e.", "I.e.", "I.E.", "Inc.", "Jr.",
|
||||||
if string in TOKENIZER_EXCEPTIONS:
|
"Ltd.", "Md.", "Messrs.", "Mo.", "Mont.", "Mr.", "Mrs.", "Ms.", "p.m.",
|
||||||
TOKENIZER_EXCEPTIONS.pop(string)
|
"Ph.D.", "Rep.", "Rev.", "Sen.", "St.", "vs."]:
|
||||||
|
_exc[orth] = [{ORTH: orth}]
|
||||||
|
|
||||||
|
|
||||||
# Abbreviations with only one ORTH token
|
for string in _exclude:
|
||||||
|
if string in _exc:
|
||||||
|
_exc.pop(string)
|
||||||
|
|
||||||
ORTH_ONLY = [
|
|
||||||
"'d",
|
TOKENIZER_EXCEPTIONS = dict(_exc)
|
||||||
"a.m.",
|
|
||||||
"Adm.",
|
|
||||||
"Bros.",
|
|
||||||
"co.",
|
|
||||||
"Co.",
|
|
||||||
"Corp.",
|
|
||||||
"D.C.",
|
|
||||||
"Dr.",
|
|
||||||
"e.g.",
|
|
||||||
"E.g.",
|
|
||||||
"E.G.",
|
|
||||||
"Gen.",
|
|
||||||
"Gov.",
|
|
||||||
"i.e.",
|
|
||||||
"I.e.",
|
|
||||||
"I.E.",
|
|
||||||
"Inc.",
|
|
||||||
"Jr.",
|
|
||||||
"Ltd.",
|
|
||||||
"Md.",
|
|
||||||
"Messrs.",
|
|
||||||
"Mo.",
|
|
||||||
"Mont.",
|
|
||||||
"Mr.",
|
|
||||||
"Mrs.",
|
|
||||||
"Ms.",
|
|
||||||
"p.m.",
|
|
||||||
"Ph.D.",
|
|
||||||
"Rep.",
|
|
||||||
"Rev.",
|
|
||||||
"Sen.",
|
|
||||||
"St.",
|
|
||||||
"vs.",
|
|
||||||
]
|
|
||||||
|
|
Loading…
Reference in New Issue
Block a user