Reorganise English language data

ines 2017-05-08 15:47:25 +02:00
parent 1bf9d5ec8b
commit c7c21b980f
8 changed files with 242 additions and 582 deletions

spacy/en/__init__.py

@@ -1,14 +1,16 @@
 # coding: utf8
 from __future__ import unicode_literals

-from ..language import Language
-from ..lemmatizer import Lemmatizer
-from ..vocab import Vocab
-from ..tokenizer import Tokenizer
-from ..attrs import LANG
-from ..deprecated import fix_glove_vectors_loading
-from .language_data import *
+from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
+from .tag_map import TAG_MAP
+from .stop_words import STOP_WORDS
+from .morph_rules import MORPH_RULES
+from .lemmatizer import LEMMA_RULES, LEMMA_INDEX, LEMMA_EXC
+from ..language_data import BASE_EXCEPTIONS
+from ..language import Language
+from ..attrs import LANG
+from ..util import update_exc

 class English(Language):
@@ -18,20 +20,13 @@ class English(Language):
     lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
     lex_attr_getters[LANG] = lambda text: 'en'

-    tokenizer_exceptions = TOKENIZER_EXCEPTIONS
-    tag_map = TAG_MAP
-    stop_words = STOP_WORDS
+    tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS)
+    tag_map = dict(TAG_MAP)
+    stop_words = set(STOP_WORDS)
     morph_rules = dict(MORPH_RULES)
     lemma_rules = dict(LEMMA_RULES)
     lemma_index = dict(LEMMA_INDEX)
     lemma_exc = dict(LEMMA_EXC)

-    def __init__(self, **overrides):
-        # Special-case hack for loading the GloVe vectors, to support <1.0
-        overrides = fix_glove_vectors_loading(overrides)
-        Language.__init__(self, **overrides)
-
-EXPORT = English
+
+__all__ = ['English']
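The exception table is now merged at import time via `update_exc`. A minimal sketch of the merge semantics this presumably relies on; the real helper lives in `spacy.util` and may additionally validate entries, and the sample tables below are hypothetical:

    # Sketch, assuming update_exc copies the shared base table and lets the
    # language-specific entries win on key clashes.
    def update_exc(base_exceptions, *addition_dicts):
        exc = dict(base_exceptions)      # copy, keep the shared base intact
        for additions in addition_dicts:
            exc.update(additions)        # later tables override earlier keys
        return exc

    base = {"a.m.": [{"ORTH": "a.m."}]}
    english = {"don't": [{"ORTH": "do"}, {"ORTH": "n't"}]}
    print(sorted(update_exc(base, english)))  # ["a.m.", "don't"]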

spacy/en/lemmatizer/__init__.py

@@ -1,3 +1,5 @@
+from .lookup import LOOKUP
+
 from ._adjectives import ADJECTIVES
 from ._adjectives_irreg import ADJECTIVES_IRREG
 from ._adverbs import ADVERBS
@@ -9,25 +11,10 @@ from ._verbs_irreg import VERBS_IRREG
 from ._lemma_rules import ADJECTIVE_RULES, NOUN_RULES, VERB_RULES, PUNCT_RULES

-INDEX = {
-    "adj": ADJECTIVES,
-    "adv": ADVERBS,
-    "noun": NOUNS,
-    "verb": VERBS
-}
-
-EXC = {
-    "adj": ADJECTIVES_IRREG,
-    "adv": ADVERBS_IRREG,
-    "noun": NOUNS_IRREG,
-    "verb": VERBS_IRREG
-}
-
-RULES = {
-    "adj": ADJECTIVE_RULES,
-    "noun": NOUN_RULES,
-    "verb": VERB_RULES,
-    "punct": PUNCT_RULES
-}
+LEMMA_INDEX = {'adj': ADJECTIVES, 'adv': ADVERBS, 'noun': NOUNS, 'verb': VERBS}
+
+LEMMA_EXC = {'adj': ADJECTIVES_IRREG, 'adv': ADVERBS_IRREG, 'noun': NOUNS_IRREG,
+             'verb': VERBS_IRREG}
+
+LEMMA_RULES = {'adj': ADJECTIVE_RULES, 'noun': NOUN_RULES, 'verb': VERB_RULES,
+               'punct': PUNCT_RULES}
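A rule-based lemmatizer typically consumes these three tables in a fixed order: irregular exceptions first, then suffix rules checked against the known-word index. A hedged sketch with hypothetical miniature tables; the real spaCy Lemmatizer differs in details:

    def lemmatize(string, index, exceptions, rules):
        if string in exceptions:             # irregular forms win outright
            return exceptions[string][0]
        for old, new in rules:               # try suffix rewrites
            if string.endswith(old):
                candidate = string[:len(string) - len(old)] + new
                if candidate in index:       # keep only known lemmas
                    return candidate
        return string                        # fall back to the surface form

    index = {"mouse", "house"}               # stand-in for NOUNS
    exceptions = {"mice": ["mouse"]}         # stand-in for NOUNS_IRREG
    rules = [("ses", "s"), ("s", "")]        # stand-in for NOUN_RULES
    print(lemmatize("mice", index, exceptions, rules))    # mouse
    print(lemmatize("houses", index, exceptions, rules))  # house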

spacy/en/lemmatizer/lookup.py

@@ -1,7 +1,7 @@
 # coding: utf8
 from __future__ import unicode_literals

-LOOK_UP = {
+LOOKUP = {
     " furtherst": "further",
     " skilled": "skill",
     "'cause": "because",
@@ -41585,4 +41585,4 @@ LOOK_UP = {
     "zoospores": "zoospore",
     "zucchinis": "zucchini",
     "zygotes": "zygote"
 }
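The renamed LOOKUP table is a plain dict from surface form to lemma, so a lookup-based lemmatizer can stay as simple as this sketch (excerpt entries only):

    def lookup_lemmatize(string, lookup):
        return lookup.get(string, string)       # fall back to the form itself

    LOOKUP = {"zygotes": "zygote"}              # excerpt from the real table
    print(lookup_lemmatize("zygotes", LOOKUP))  # zygote
    print(lookup_lemmatize("spacy", LOOKUP))    # spacy (no entry, unchanged)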

spacy/en/lex_attrs.py (new file)

@@ -0,0 +1,23 @@
+# coding: utf8
+from __future__ import unicode_literals
+
+
+# Number words
+
+NUM_WORDS = set("""
+zero one two three four five six seven eight nine ten eleven twelve thirteen
+fourteen fifteen sixteen seventeen eighteen nineteen twenty thirty forty fifty
+sixty seventy eighty ninety hundred thousand million billion trillion
+quadrillion gajillion bazillion
+""".split())
+
+
+# Ordinal words
+
+ORDINAL_WORDS = set("""
+first second third fourth fifth sixth seventh eighth ninth tenth eleventh twelfth
+thirteenth fourteenth fifteenth sixteenth seventeenth eighteenth nineteenth
+twentieth thirtieth fortieth fiftieth sixtieth seventieth eightieth ninetieth
+hundredth thousandth millionth billionth trillionth quadrillionth gajillionth
+bazillionth
+""".split())

spacy/en/morph_rules.py

@@ -1,8 +1,8 @@
 # coding: utf8
 from __future__ import unicode_literals

-from ..symbols import *
-from ..language_data import PRON_LEMMA
+from ..symbols import LEMMA
+from ..deprecated import PRON_LEMMA

 MORPH_RULES = {
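Only the imports change here; the rules themselves are untouched. For orientation, MORPH_RULES maps a fine-grained tag to per-form feature dicts, roughly in this shape (illustrative entry with stand-ins for the imported constants; feature spellings may differ by version):

    LEMMA = "lemma"           # stand-in for the spacy.symbols constant
    PRON_LEMMA = "-PRON-"     # stand-in for the shared pronoun lemma marker

    MORPH_RULES = {
        "PRP": {
            "I": {LEMMA: PRON_LEMMA, "PronType": "prs", "Person": "one",
                  "Number": "sing"}
        }
    }
    print(MORPH_RULES["PRP"]["I"][LEMMA])  # -PRON-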

spacy/en/stop_words.py

@@ -67,24 +67,3 @@ whither who whoever whole whom whose why will with within without would
 yet you your yours yourself yourselves
 """.split())
-
-
-# Number words
-NUM_WORDS = set("""
-zero one two three four five six seven eight nine ten eleven twelve thirteen
-fourteen fifteen sixteen seventeen eighteen nineteen twenty thirty forty fifty
-sixty seventy eighty ninety hundred thousand million billion trillion
-quadrillion gajillion bazillion
-""".split())
-
-
-# Ordinal words
-ORDINAL_WORDS = set("""
-first second third fourth fifth sixth seventh eighth ninth tenth eleventh twelfth
-thirteenth fourteenth fifteenth sixteenth seventeenth eighteenth nineteenth
-twentieth thirtieth fortieth fiftieth sixtieth seventieth eightieth ninetieth
-hundredth thousandth millionth billionth trillionth quadrillionth gajillionth
-bazillionth
-""".split())

spacy/en/tag_map.py

@@ -1,7 +1,8 @@
 # coding: utf8
 from __future__ import unicode_literals

-from ..symbols import *
+from ..symbols import POS, PUNCT, SYM, ADJ, CCONJ, NUM, DET, ADV, ADP, X, VERB
+from ..symbols import NOUN, PROPN, PART, INTJ, SPACE, PRON

 TAG_MAP = {
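Again only the star import is replaced with the explicit symbols the tag map actually needs. For orientation, entries map a fine-grained tag to a dict with at least the coarse POS plus optional morphological features, roughly like this (illustrative entries with stand-ins for the imported constants):

    POS, NOUN, VERB = "pos", "NOUN", "VERB"   # stand-ins for spacy.symbols

    TAG_MAP = {
        "NN": {POS: NOUN, "Number": "sing"},
        "VBZ": {POS: VERB, "VerbForm": "fin", "Tense": "pres", "Person": 3},
    }
    print(TAG_MAP["NN"][POS])  # NOUN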

spacy/en/tokenizer_exceptions.py

@@ -1,13 +1,12 @@
 # coding: utf8
 from __future__ import unicode_literals

-from ..symbols import *
-from ..language_data import PRON_LEMMA
+from ..symbols import ORTH, LEMMA, TAG, NORM
+from ..deprecated import PRON_LEMMA

-EXC = {}
+_exc = {}

-EXCLUDE_EXC = ["Ill", "ill", "Its", "its", "Hell", "hell", "Shell", "shell",
+_exclude = ["Ill", "ill", "Its", "its", "Hell", "hell", "Shell", "shell",
             "Shed", "shed", "were", "Were", "Well", "well", "Whore", "whore"]
@@ -15,193 +14,160 @@ EXCLUDE_EXC = ["Ill", "ill", "Its", "its", "Hell", "hell", "Shell", "shell",

 for pron in ["i"]:
     for orth in [pron, pron.title()]:
-        EXC[orth + "'m"] = [
+        _exc[orth + "'m"] = [
             {ORTH: orth, LEMMA: PRON_LEMMA, TAG: "PRP"},
-            {ORTH: "'m", LEMMA: "be", TAG: "VBP", "tenspect": 1, "number": 1}
-        ]
+            {ORTH: "'m", LEMMA: "be", TAG: "VBP", "tenspect": 1, "number": 1}]
-        EXC[orth + "m"] = [
+        _exc[orth + "m"] = [
             {ORTH: orth, LEMMA: PRON_LEMMA, TAG: "PRP"},
-            {ORTH: "m", LEMMA: "be", TAG: "VBP", "tenspect": 1, "number": 1 }
-        ]
+            {ORTH: "m", LEMMA: "be", TAG: "VBP", "tenspect": 1, "number": 1 }]
-        EXC[orth + "'ma"] = [
+        _exc[orth + "'ma"] = [
             {ORTH: orth, LEMMA: PRON_LEMMA, TAG: "PRP"},
             {ORTH: "'m", LEMMA: "be", NORM: "am"},
-            {ORTH: "a", LEMMA: "going to", NORM: "gonna"}
-        ]
+            {ORTH: "a", LEMMA: "going to", NORM: "gonna"}]
-        EXC[orth + "ma"] = [
+        _exc[orth + "ma"] = [
             {ORTH: orth, LEMMA: PRON_LEMMA, TAG: "PRP"},
             {ORTH: "m", LEMMA: "be", NORM: "am"},
-            {ORTH: "a", LEMMA: "going to", NORM: "gonna"}
-        ]
+            {ORTH: "a", LEMMA: "going to", NORM: "gonna"}]

 for pron in ["i", "you", "he", "she", "it", "we", "they"]:
     for orth in [pron, pron.title()]:
-        EXC[orth + "'ll"] = [
+        _exc[orth + "'ll"] = [
             {ORTH: orth, LEMMA: PRON_LEMMA, TAG: "PRP"},
-            {ORTH: "'ll", LEMMA: "will", TAG: "MD"}
-        ]
+            {ORTH: "'ll", LEMMA: "will", TAG: "MD"}]
-        EXC[orth + "ll"] = [
+        _exc[orth + "ll"] = [
             {ORTH: orth, LEMMA: PRON_LEMMA, TAG: "PRP"},
-            {ORTH: "ll", LEMMA: "will", TAG: "MD"}
-        ]
+            {ORTH: "ll", LEMMA: "will", TAG: "MD"}]
-        EXC[orth + "'ll've"] = [
+        _exc[orth + "'ll've"] = [
             {ORTH: orth, LEMMA: PRON_LEMMA, TAG: "PRP"},
             {ORTH: "'ll", LEMMA: "will", TAG: "MD"},
-            {ORTH: "'ve", LEMMA: "have", TAG: "VB"}
-        ]
+            {ORTH: "'ve", LEMMA: "have", TAG: "VB"}]
-        EXC[orth + "llve"] = [
+        _exc[orth + "llve"] = [
             {ORTH: orth, LEMMA: PRON_LEMMA, TAG: "PRP"},
             {ORTH: "ll", LEMMA: "will", TAG: "MD"},
-            {ORTH: "ve", LEMMA: "have", TAG: "VB"}
-        ]
+            {ORTH: "ve", LEMMA: "have", TAG: "VB"}]
-        EXC[orth + "'d"] = [
+        _exc[orth + "'d"] = [
             {ORTH: orth, LEMMA: PRON_LEMMA, TAG: "PRP"},
-            {ORTH: "'d", LEMMA: "would", TAG: "MD"}
-        ]
+            {ORTH: "'d", LEMMA: "would", TAG: "MD"}]
-        EXC[orth + "d"] = [
+        _exc[orth + "d"] = [
             {ORTH: orth, LEMMA: PRON_LEMMA, TAG: "PRP"},
-            {ORTH: "d", LEMMA: "would", TAG: "MD"}
-        ]
+            {ORTH: "d", LEMMA: "would", TAG: "MD"}]
-        EXC[orth + "'d've"] = [
+        _exc[orth + "'d've"] = [
             {ORTH: orth, LEMMA: PRON_LEMMA, TAG: "PRP"},
             {ORTH: "'d", LEMMA: "would", TAG: "MD"},
-            {ORTH: "'ve", LEMMA: "have", TAG: "VB"}
-        ]
+            {ORTH: "'ve", LEMMA: "have", TAG: "VB"}]
-        EXC[orth + "dve"] = [
+        _exc[orth + "dve"] = [
             {ORTH: orth, LEMMA: PRON_LEMMA, TAG: "PRP"},
             {ORTH: "d", LEMMA: "would", TAG: "MD"},
-            {ORTH: "ve", LEMMA: "have", TAG: "VB"}
-        ]
+            {ORTH: "ve", LEMMA: "have", TAG: "VB"}]

 for pron in ["i", "you", "we", "they"]:
     for orth in [pron, pron.title()]:
-        EXC[orth + "'ve"] = [
+        _exc[orth + "'ve"] = [
             {ORTH: orth, LEMMA: PRON_LEMMA, TAG: "PRP"},
-            {ORTH: "'ve", LEMMA: "have", TAG: "VB"}
-        ]
+            {ORTH: "'ve", LEMMA: "have", TAG: "VB"}]
-        EXC[orth + "ve"] = [
+        _exc[orth + "ve"] = [
             {ORTH: orth, LEMMA: PRON_LEMMA, TAG: "PRP"},
-            {ORTH: "ve", LEMMA: "have", TAG: "VB"}
-        ]
+            {ORTH: "ve", LEMMA: "have", TAG: "VB"}]

 for pron in ["you", "we", "they"]:
     for orth in [pron, pron.title()]:
-        EXC[orth + "'re"] = [
+        _exc[orth + "'re"] = [
             {ORTH: orth, LEMMA: PRON_LEMMA, TAG: "PRP"},
-            {ORTH: "'re", LEMMA: "be", NORM: "are"}
-        ]
+            {ORTH: "'re", LEMMA: "be", NORM: "are"}]
-        EXC[orth + "re"] = [
+        _exc[orth + "re"] = [
             {ORTH: orth, LEMMA: PRON_LEMMA, TAG: "PRP"},
-            {ORTH: "re", LEMMA: "be", NORM: "are", TAG: "VBZ"}
-        ]
+            {ORTH: "re", LEMMA: "be", NORM: "are", TAG: "VBZ"}]

 for pron in ["he", "she", "it"]:
     for orth in [pron, pron.title()]:
-        EXC[orth + "'s"] = [
+        _exc[orth + "'s"] = [
             {ORTH: orth, LEMMA: PRON_LEMMA, TAG: "PRP"},
-            {ORTH: "'s"}
-        ]
+            {ORTH: "'s"}]
-        EXC[orth + "s"] = [
+        _exc[orth + "s"] = [
             {ORTH: orth, LEMMA: PRON_LEMMA, TAG: "PRP"},
-            {ORTH: "s"}
-        ]
+            {ORTH: "s"}]

 # W-words, relative pronouns, prepositions etc.
 for word in ["who", "what", "when", "where", "why", "how", "there", "that"]:
     for orth in [word, word.title()]:
-        EXC[orth + "'s"] = [
+        _exc[orth + "'s"] = [
             {ORTH: orth, LEMMA: word},
-            {ORTH: "'s"}
-        ]
+            {ORTH: "'s"}]
-        EXC[orth + "s"] = [
+        _exc[orth + "s"] = [
             {ORTH: orth, LEMMA: word},
-            {ORTH: "s"}
-        ]
+            {ORTH: "s"}]
-        EXC[orth + "'ll"] = [
+        _exc[orth + "'ll"] = [
             {ORTH: orth, LEMMA: word},
-            {ORTH: "'ll", LEMMA: "will", TAG: "MD"}
-        ]
+            {ORTH: "'ll", LEMMA: "will", TAG: "MD"}]
-        EXC[orth + "ll"] = [
+        _exc[orth + "ll"] = [
             {ORTH: orth, LEMMA: word},
-            {ORTH: "ll", LEMMA: "will", TAG: "MD"}
-        ]
+            {ORTH: "ll", LEMMA: "will", TAG: "MD"}]
-        EXC[orth + "'ll've"] = [
+        _exc[orth + "'ll've"] = [
             {ORTH: orth, LEMMA: word},
             {ORTH: "'ll", LEMMA: "will", TAG: "MD"},
-            {ORTH: "'ve", LEMMA: "have", TAG: "VB"}
-        ]
+            {ORTH: "'ve", LEMMA: "have", TAG: "VB"}]
-        EXC[orth + "llve"] = [
+        _exc[orth + "llve"] = [
             {ORTH: orth, LEMMA: word},
             {ORTH: "ll", LEMMA: "will", TAG: "MD"},
-            {ORTH: "ve", LEMMA: "have", TAG: "VB"}
-        ]
+            {ORTH: "ve", LEMMA: "have", TAG: "VB"}]
-        EXC[orth + "'re"] = [
+        _exc[orth + "'re"] = [
             {ORTH: orth, LEMMA: word},
-            {ORTH: "'re", LEMMA: "be", NORM: "are"}
-        ]
+            {ORTH: "'re", LEMMA: "be", NORM: "are"}]
-        EXC[orth + "re"] = [
+        _exc[orth + "re"] = [
             {ORTH: orth, LEMMA: word},
-            {ORTH: "re", LEMMA: "be", NORM: "are"}
-        ]
+            {ORTH: "re", LEMMA: "be", NORM: "are"}]
-        EXC[orth + "'ve"] = [
+        _exc[orth + "'ve"] = [
             {ORTH: orth},
-            {ORTH: "'ve", LEMMA: "have", TAG: "VB"}
-        ]
+            {ORTH: "'ve", LEMMA: "have", TAG: "VB"}]
-        EXC[orth + "ve"] = [
+        _exc[orth + "ve"] = [
             {ORTH: orth, LEMMA: word},
-            {ORTH: "'ve", LEMMA: "have", TAG: "VB"}
-        ]
+            {ORTH: "ve", LEMMA: "have", TAG: "VB"}]
-        EXC[orth + "'d"] = [
+        _exc[orth + "'d"] = [
             {ORTH: orth, LEMMA: word},
-            {ORTH: "'d"}
-        ]
+            {ORTH: "'d"}]
-        EXC[orth + "d"] = [
+        _exc[orth + "d"] = [
             {ORTH: orth, LEMMA: word},
-            {ORTH: "d"}
-        ]
+            {ORTH: "d"}]
-        EXC[orth + "'d've"] = [
+        _exc[orth + "'d've"] = [
             {ORTH: orth, LEMMA: word},
             {ORTH: "'d", LEMMA: "would", TAG: "MD"},
-            {ORTH: "'ve", LEMMA: "have", TAG: "VB"}
-        ]
+            {ORTH: "'ve", LEMMA: "have", TAG: "VB"}]
-        EXC[orth + "dve"] = [
+        _exc[orth + "dve"] = [
             {ORTH: orth, LEMMA: word},
             {ORTH: "d", LEMMA: "would", TAG: "MD"},
-            {ORTH: "ve", LEMMA: "have", TAG: "VB"}
-        ]
+            {ORTH: "ve", LEMMA: "have", TAG: "VB"}]

 # Verbs
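What these generated entries buy the tokenizer: a contraction like "I'm" is split into two tokens that carry their own lemmas and tags. A minimal sketch of applying one entry, with plain strings standing in for the spacy.symbols constants (the real tokenizer consults the table during segmentation):

    ORTH, LEMMA, TAG = "orth", "lemma", "tag"   # stand-ins for spacy.symbols
    PRON_LEMMA = "-PRON-"

    _exc = {
        "I'm": [
            {ORTH: "I", LEMMA: PRON_LEMMA, TAG: "PRP"},
            {ORTH: "'m", LEMMA: "be", TAG: "VBP"},
        ]
    }
    print([t[ORTH] for t in _exc["I'm"]])   # ['I', "'m"]
    print([t[LEMMA] for t in _exc["I'm"]])  # ['-PRON-', 'be']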
@@ -221,54 +187,44 @@ for verb_data in [
     {ORTH: "sha", LEMMA: "shall", TAG: "MD"},
     {ORTH: "should", TAG: "MD"},
     {ORTH: "wo", LEMMA: "will", TAG: "MD"},
-    {ORTH: "would", TAG: "MD"}
-]:
+    {ORTH: "would", TAG: "MD"}]:
     verb_data_tc = dict(verb_data)
     verb_data_tc[ORTH] = verb_data_tc[ORTH].title()
     for data in [verb_data, verb_data_tc]:
-        EXC[data[ORTH] + "n't"] = [
+        _exc[data[ORTH] + "n't"] = [
             dict(data),
-            {ORTH: "n't", LEMMA: "not", TAG: "RB"}
-        ]
+            {ORTH: "n't", LEMMA: "not", TAG: "RB"}]
-        EXC[data[ORTH] + "nt"] = [
+        _exc[data[ORTH] + "nt"] = [
             dict(data),
-            {ORTH: "nt", LEMMA: "not", TAG: "RB"}
-        ]
+            {ORTH: "nt", LEMMA: "not", TAG: "RB"}]
-        EXC[data[ORTH] + "n't've"] = [
+        _exc[data[ORTH] + "n't've"] = [
             dict(data),
             {ORTH: "n't", LEMMA: "not", TAG: "RB"},
-            {ORTH: "'ve", LEMMA: "have", TAG: "VB"}
-        ]
+            {ORTH: "'ve", LEMMA: "have", TAG: "VB"}]
-        EXC[data[ORTH] + "ntve"] = [
+        _exc[data[ORTH] + "ntve"] = [
             dict(data),
             {ORTH: "nt", LEMMA: "not", TAG: "RB"},
-            {ORTH: "ve", LEMMA: "have", TAG: "VB"}
-        ]
+            {ORTH: "ve", LEMMA: "have", TAG: "VB"}]

 for verb_data in [
     {ORTH: "could", TAG: "MD"},
     {ORTH: "might"},
     {ORTH: "must"},
-    {ORTH: "should"}
-]:
+    {ORTH: "should"}]:
     verb_data_tc = dict(verb_data)
     verb_data_tc[ORTH] = verb_data_tc[ORTH].title()
     for data in [verb_data, verb_data_tc]:
-        EXC[data[ORTH] + "'ve"] = [
+        _exc[data[ORTH] + "'ve"] = [
             dict(data),
-            {ORTH: "'ve", LEMMA: "have", TAG: "VB"}
-        ]
+            {ORTH: "'ve", LEMMA: "have", TAG: "VB"}]
-        EXC[data[ORTH] + "ve"] = [
+        _exc[data[ORTH] + "ve"] = [
             dict(data),
-            {ORTH: "ve", LEMMA: "have", TAG: "VB"}
-        ]
+            {ORTH: "ve", LEMMA: "have", TAG: "VB"}]

 for verb_data in [
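The `verb_data_tc` lines in these loops register every contraction twice, once lower-case and once title-case, by copying the seed dict and title-casing its ORTH so the original is not mutated. A standalone sketch of just that trick:

    ORTH = "orth"                               # stand-in for spacy.symbols
    verb_data = {ORTH: "could", "tag": "MD"}
    verb_data_tc = dict(verb_data)              # copy so the original survives
    verb_data_tc[ORTH] = verb_data_tc[ORTH].title()
    print(verb_data[ORTH], verb_data_tc[ORTH])  # could Could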
@@ -276,22 +232,17 @@ for verb_data in [
     {ORTH: "are", LEMMA: "be", TAG: "VBP", "number": 2},
     {ORTH: "is", LEMMA: "be", TAG: "VBZ"},
     {ORTH: "was", LEMMA: "be"},
-    {ORTH: "were", LEMMA: "be"}
-]:
+    {ORTH: "were", LEMMA: "be"}]:
     verb_data_tc = dict(verb_data)
     verb_data_tc[ORTH] = verb_data_tc[ORTH].title()
     for data in [verb_data, verb_data_tc]:
-        EXC[data[ORTH] + "n't"] = [
+        _exc[data[ORTH] + "n't"] = [
             dict(data),
-            {ORTH: "n't", LEMMA: "not", TAG: "RB"}
-        ]
+            {ORTH: "n't", LEMMA: "not", TAG: "RB"}]
-        EXC[data[ORTH] + "nt"] = [
+        _exc[data[ORTH] + "nt"] = [
             dict(data),
-            {ORTH: "nt", LEMMA: "not", TAG: "RB"}
-        ]
+            {ORTH: "nt", LEMMA: "not", TAG: "RB"}]

 # Other contractions with trailing apostrophe
@@ -302,22 +253,14 @@ for exc_data in [
     {ORTH: "nothin", LEMMA: "nothing"},
     {ORTH: "nuthin", LEMMA: "nothing"},
     {ORTH: "ol", LEMMA: "old"},
-    {ORTH: "somethin", LEMMA: "something"}
-]:
+    {ORTH: "somethin", LEMMA: "something"}]:
     exc_data_tc = dict(exc_data)
     exc_data_tc[ORTH] = exc_data_tc[ORTH].title()
     for data in [exc_data, exc_data_tc]:
         data_apos = dict(data)
         data_apos[ORTH] = data_apos[ORTH] + "'"
-
-        EXC[data[ORTH]] = [
-            dict(data)
-        ]
-
-        EXC[data_apos[ORTH]] = [
-            dict(data_apos)
-        ]
+        _exc[data[ORTH]] = [dict(data)]
+        _exc[data_apos[ORTH]] = [dict(data_apos)]

 # Other contractions with leading apostrophe
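Each of the trailing-apostrophe entries above is registered plain and with an appended apostrophe ("ol" and "ol'"), again via a dict copy. A simplified sketch of that pattern, skipping the title-case copies the real loop also makes:

    ORTH, LEMMA = "orth", "lemma"   # stand-ins for spacy.symbols
    _exc = {}
    for exc_data in [{ORTH: "ol", LEMMA: "old"},
                     {ORTH: "somethin", LEMMA: "something"}]:
        data_apos = dict(exc_data)
        data_apos[ORTH] = data_apos[ORTH] + "'"
        _exc[exc_data[ORTH]] = [dict(exc_data)]
        _exc[data_apos[ORTH]] = [dict(data_apos)]
    print(sorted(_exc))  # ['ol', "ol'", 'somethin', "somethin'"]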
@@ -326,449 +269,181 @@ for exc_data in [
     {ORTH: "cause", LEMMA: "because"},
     {ORTH: "em", LEMMA: PRON_LEMMA, NORM: "them"},
     {ORTH: "ll", LEMMA: "will"},
-    {ORTH: "nuff", LEMMA: "enough"}
-]:
+    {ORTH: "nuff", LEMMA: "enough"}]:
     exc_data_apos = dict(exc_data)
     exc_data_apos[ORTH] = "'" + exc_data_apos[ORTH]
     for data in [exc_data, exc_data_apos]:
-        EXC[data[ORTH]] = [
-            dict(data)
-        ]
+        _exc[data[ORTH]] = [dict(data)]

 # Times
 for h in range(1, 12 + 1):
     hour = str(h)
     for period in ["a.m.", "am"]:
-        EXC[hour + period] = [
+        _exc[hour+period] = [
             {ORTH: hour},
-            {ORTH: period, LEMMA: "a.m."}
-        ]
+            {ORTH: period, LEMMA: "a.m."}]
     for period in ["p.m.", "pm"]:
-        EXC[hour + period] = [
+        _exc[hour+period] = [
             {ORTH: hour},
-            {ORTH: period, LEMMA: "p.m."}
-        ]
+            {ORTH: period, LEMMA: "p.m."}]
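The time loop registers "1am" through "12pm" (and the dotted forms) so each splits into an hour token plus a period token normalised to "a.m."/"p.m.". A runnable sketch with stand-ins for the symbol constants:

    ORTH, LEMMA = "orth", "lemma"   # stand-ins for spacy.symbols
    _exc = {}
    for h in range(1, 12 + 1):
        hour = str(h)
        for period in ["a.m.", "am"]:
            _exc[hour + period] = [{ORTH: hour}, {ORTH: period, LEMMA: "a.m."}]
    print([t[ORTH] for t in _exc["11am"]])  # ['11', 'am']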
 # Rest

-OTHER = {
-    " ": [
-        {ORTH: " ", TAG: "SP"}
-    ],
-    "\u00a0": [
-        {ORTH: "\u00a0", TAG: "SP", LEMMA: " "}
-    ],
-    "'S": [
-        {ORTH: "'S", LEMMA: "'s"}
-    ],
-    "'s": [
-        {ORTH: "'s", LEMMA: "'s"}
-    ],
-    "'re": [
-        {ORTH: "'re", LEMMA: "be", NORM: "are"}
-    ],
-    "\u2018S": [
-        {ORTH: "\u2018S", LEMMA: "'s"}
-    ],
-    "\u2018s": [
-        {ORTH: "\u2018s", LEMMA: "'s"}
-    ],
-    "and/or": [
-        {ORTH: "and/or", LEMMA: "and/or", TAG: "CC"}
-    ],
-    "'Cause": [
-        {ORTH: "'Cause", LEMMA: "because"}
-    ],
+_other_exc = {
     "y'all": [
         {ORTH: "y'", LEMMA: PRON_LEMMA, NORM: "you"},
-        {ORTH: "all"}
-    ],
+        {ORTH: "all"}],
     "yall": [
         {ORTH: "y", LEMMA: PRON_LEMMA, NORM: "you"},
-        {ORTH: "all"}
-    ],
+        {ORTH: "all"}],
-    "ma'am": [
-        {ORTH: "ma'am", LEMMA: "madam"}
-    ],
-    "Ma'am": [
-        {ORTH: "Ma'am", LEMMA: "madam"}
-    ],
-    "o'clock": [
-        {ORTH: "o'clock", LEMMA: "o'clock"}
-    ],
-    "O'clock": [
-        {ORTH: "O'clock", LEMMA: "o'clock"}
-    ],
     "how'd'y": [
         {ORTH: "how", LEMMA: "how"},
         {ORTH: "'d", LEMMA: "do"},
-        {ORTH: "'y", LEMMA: PRON_LEMMA, NORM: "you"}
-    ],
+        {ORTH: "'y", LEMMA: PRON_LEMMA, NORM: "you"}],
     "How'd'y": [
         {ORTH: "How", LEMMA: "how"},
         {ORTH: "'d", LEMMA: "do"},
-        {ORTH: "'y", LEMMA: PRON_LEMMA, NORM: "you"}
-    ],
+        {ORTH: "'y", LEMMA: PRON_LEMMA, NORM: "you"}],
     "not've": [
         {ORTH: "not", LEMMA: "not", TAG: "RB"},
-        {ORTH: "'ve", LEMMA: "have", TAG: "VB"}
-    ],
+        {ORTH: "'ve", LEMMA: "have", TAG: "VB"}],
     "notve": [
         {ORTH: "not", LEMMA: "not", TAG: "RB"},
-        {ORTH: "ve", LEMMA: "have", TAG: "VB"}
-    ],
+        {ORTH: "ve", LEMMA: "have", TAG: "VB"}],
     "Not've": [
         {ORTH: "Not", LEMMA: "not", TAG: "RB"},
-        {ORTH: "'ve", LEMMA: "have", TAG: "VB"}
-    ],
+        {ORTH: "'ve", LEMMA: "have", TAG: "VB"}],
     "Notve": [
         {ORTH: "Not", LEMMA: "not", TAG: "RB"},
-        {ORTH: "ve", LEMMA: "have", TAG: "VB"}
-    ],
+        {ORTH: "ve", LEMMA: "have", TAG: "VB"}],
     "cannot": [
         {ORTH: "can", LEMMA: "can", TAG: "MD"},
-        {ORTH: "not", LEMMA: "not", TAG: "RB"}
-    ],
+        {ORTH: "not", LEMMA: "not", TAG: "RB"}],
     "Cannot": [
         {ORTH: "Can", LEMMA: "can", TAG: "MD"},
-        {ORTH: "not", LEMMA: "not", TAG: "RB"}
-    ],
+        {ORTH: "not", LEMMA: "not", TAG: "RB"}],
     "gonna": [
         {ORTH: "gon", LEMMA: "go", NORM: "going"},
-        {ORTH: "na", LEMMA: "to"}
-    ],
+        {ORTH: "na", LEMMA: "to"}],
     "Gonna": [
         {ORTH: "Gon", LEMMA: "go", NORM: "going"},
-        {ORTH: "na", LEMMA: "to"}
-    ],
+        {ORTH: "na", LEMMA: "to"}],
     "gotta": [
         {ORTH: "got"},
-        {ORTH: "ta", LEMMA: "to"}
-    ],
+        {ORTH: "ta", LEMMA: "to"}],
     "Gotta": [
         {ORTH: "Got"},
-        {ORTH: "ta", LEMMA: "to"}
-    ],
+        {ORTH: "ta", LEMMA: "to"}],
     "let's": [
         {ORTH: "let"},
-        {ORTH: "'s", LEMMA: PRON_LEMMA, NORM: "us"}
-    ],
+        {ORTH: "'s", LEMMA: PRON_LEMMA, NORM: "us"}],
     "Let's": [
         {ORTH: "Let", LEMMA: "let"},
-        {ORTH: "'s", LEMMA: PRON_LEMMA, NORM: "us"}
-    ],
+        {ORTH: "'s", LEMMA: PRON_LEMMA, NORM: "us"}]
-    "\u2014": [
-        {ORTH: "\u2014", TAG: ":", LEMMA: "--"}
-    ],
-    "\n": [
-        {ORTH: "\n", TAG: "SP"}
-    ],
-    "\t": [
-        {ORTH: "\t", TAG: "SP"}
-    ]
 }
+
+_exc.update(_other_exc)
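After the update, irregular cases like "gonna" sit in the same table as the generated contractions, with NORM recording the expanded spelling of each piece. A sketch of what one entry encodes, using string stand-ins for the symbol constants:

    ORTH, LEMMA, NORM = "orth", "lemma", "norm"   # stand-ins for spacy.symbols
    _exc = {"gonna": [{ORTH: "gon", LEMMA: "go", NORM: "going"},
                      {ORTH: "na", LEMMA: "to"}]}
    print([t[ORTH] for t in _exc["gonna"]])               # ['gon', 'na']
    print([t.get(NORM, t[ORTH]) for t in _exc["gonna"]])  # ['going', 'na']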
-# Abbreviations
-ABBREVIATIONS = {
-    "Mt.": [
-        {ORTH: "Mt.", LEMMA: "Mount"}
-    ],
-    "Ak.": [
-        {ORTH: "Ak.", LEMMA: "Alaska"}
-    ],
-    "Ala.": [
-        {ORTH: "Ala.", LEMMA: "Alabama"}
-    ],
-    "Apr.": [
-        {ORTH: "Apr.", LEMMA: "April"}
-    ],
-    "Ariz.": [
-        {ORTH: "Ariz.", LEMMA: "Arizona"}
-    ],
-    "Ark.": [
-        {ORTH: "Ark.", LEMMA: "Arkansas"}
-    ],
-    "Aug.": [
-        {ORTH: "Aug.", LEMMA: "August"}
-    ],
-    "Calif.": [
-        {ORTH: "Calif.", LEMMA: "California"}
-    ],
-    "Colo.": [
-        {ORTH: "Colo.", LEMMA: "Colorado"}
-    ],
-    "Conn.": [
-        {ORTH: "Conn.", LEMMA: "Connecticut"}
-    ],
-    "Dec.": [
-        {ORTH: "Dec.", LEMMA: "December"}
-    ],
-    "Del.": [
-        {ORTH: "Del.", LEMMA: "Delaware"}
-    ],
-    "Feb.": [
-        {ORTH: "Feb.", LEMMA: "February"}
-    ],
-    "Fla.": [
-        {ORTH: "Fla.", LEMMA: "Florida"}
-    ],
-    "Ga.": [
-        {ORTH: "Ga.", LEMMA: "Georgia"}
-    ],
-    "Ia.": [
-        {ORTH: "Ia.", LEMMA: "Iowa"}
-    ],
-    "Id.": [
-        {ORTH: "Id.", LEMMA: "Idaho"}
-    ],
-    "Ill.": [
-        {ORTH: "Ill.", LEMMA: "Illinois"}
-    ],
-    "Ind.": [
-        {ORTH: "Ind.", LEMMA: "Indiana"}
-    ],
-    "Jan.": [
-        {ORTH: "Jan.", LEMMA: "January"}
-    ],
-    "Jul.": [
-        {ORTH: "Jul.", LEMMA: "July"}
-    ],
-    "Jun.": [
-        {ORTH: "Jun.", LEMMA: "June"}
-    ],
-    "Kan.": [
-        {ORTH: "Kan.", LEMMA: "Kansas"}
-    ],
-    "Kans.": [
-        {ORTH: "Kans.", LEMMA: "Kansas"}
-    ],
-    "Ky.": [
-        {ORTH: "Ky.", LEMMA: "Kentucky"}
-    ],
-    "La.": [
-        {ORTH: "La.", LEMMA: "Louisiana"}
-    ],
-    "Mar.": [
-        {ORTH: "Mar.", LEMMA: "March"}
-    ],
-    "Mass.": [
-        {ORTH: "Mass.", LEMMA: "Massachusetts"}
-    ],
-    "May.": [
-        {ORTH: "May.", LEMMA: "May"}
-    ],
-    "Mich.": [
-        {ORTH: "Mich.", LEMMA: "Michigan"}
-    ],
-    "Minn.": [
-        {ORTH: "Minn.", LEMMA: "Minnesota"}
-    ],
-    "Miss.": [
-        {ORTH: "Miss.", LEMMA: "Mississippi"}
-    ],
-    "N.C.": [
-        {ORTH: "N.C.", LEMMA: "North Carolina"}
-    ],
-    "N.D.": [
-        {ORTH: "N.D.", LEMMA: "North Dakota"}
-    ],
-    "N.H.": [
-        {ORTH: "N.H.", LEMMA: "New Hampshire"}
-    ],
-    "N.J.": [
-        {ORTH: "N.J.", LEMMA: "New Jersey"}
-    ],
-    "N.M.": [
-        {ORTH: "N.M.", LEMMA: "New Mexico"}
-    ],
-    "N.Y.": [
-        {ORTH: "N.Y.", LEMMA: "New York"}
-    ],
-    "Neb.": [
-        {ORTH: "Neb.", LEMMA: "Nebraska"}
-    ],
-    "Nebr.": [
-        {ORTH: "Nebr.", LEMMA: "Nebraska"}
-    ],
-    "Nev.": [
-        {ORTH: "Nev.", LEMMA: "Nevada"}
-    ],
-    "Nov.": [
-        {ORTH: "Nov.", LEMMA: "November"}
-    ],
-    "Oct.": [
-        {ORTH: "Oct.", LEMMA: "October"}
-    ],
-    "Okla.": [
-        {ORTH: "Okla.", LEMMA: "Oklahoma"}
-    ],
-    "Ore.": [
-        {ORTH: "Ore.", LEMMA: "Oregon"}
-    ],
-    "Pa.": [
-        {ORTH: "Pa.", LEMMA: "Pennsylvania"}
-    ],
-    "S.C.": [
-        {ORTH: "S.C.", LEMMA: "South Carolina"}
-    ],
-    "Sep.": [
-        {ORTH: "Sep.", LEMMA: "September"}
-    ],
-    "Sept.": [
-        {ORTH: "Sept.", LEMMA: "September"}
-    ],
-    "Tenn.": [
-        {ORTH: "Tenn.", LEMMA: "Tennessee"}
-    ],
-    "Va.": [
-        {ORTH: "Va.", LEMMA: "Virginia"}
-    ],
-    "Wash.": [
-        {ORTH: "Wash.", LEMMA: "Washington"}
-    ],
-    "Wis.": [
-        {ORTH: "Wis.", LEMMA: "Wisconsin"}
-    ]
-}
-TOKENIZER_EXCEPTIONS = dict(EXC)
-TOKENIZER_EXCEPTIONS.update(OTHER)
-TOKENIZER_EXCEPTIONS.update(ABBREVIATIONS)
+for exc_data in [
+    {ORTH: "'S", LEMMA: "'s"},
+    {ORTH: "'s", LEMMA: "'s"},
+    {ORTH: "\u2018S", LEMMA: "'s"},
+    {ORTH: "\u2018s", LEMMA: "'s"},
+    {ORTH: "and/or", LEMMA: "and/or", TAG: "CC"},
+    {ORTH: "'re", LEMMA: "be", NORM: "are"},
+    {ORTH: "'Cause", LEMMA: "because"},
+    {ORTH: "'cause", LEMMA: "because"},
+    {ORTH: "ma'am", LEMMA: "madam"},
+    {ORTH: "Ma'am", LEMMA: "madam"},
+    {ORTH: "o'clock", LEMMA: "o'clock"},
+    {ORTH: "O'clock", LEMMA: "o'clock"},
+    {ORTH: "Mt.", LEMMA: "Mount"},
+    {ORTH: "Ak.", LEMMA: "Alaska"},
+    {ORTH: "Ala.", LEMMA: "Alabama"},
+    {ORTH: "Apr.", LEMMA: "April"},
+    {ORTH: "Ariz.", LEMMA: "Arizona"},
+    {ORTH: "Ark.", LEMMA: "Arkansas"},
+    {ORTH: "Aug.", LEMMA: "August"},
+    {ORTH: "Calif.", LEMMA: "California"},
+    {ORTH: "Colo.", LEMMA: "Colorado"},
+    {ORTH: "Conn.", LEMMA: "Connecticut"},
+    {ORTH: "Dec.", LEMMA: "December"},
+    {ORTH: "Del.", LEMMA: "Delaware"},
+    {ORTH: "Feb.", LEMMA: "February"},
+    {ORTH: "Fla.", LEMMA: "Florida"},
+    {ORTH: "Ga.", LEMMA: "Georgia"},
+    {ORTH: "Ia.", LEMMA: "Iowa"},
+    {ORTH: "Id.", LEMMA: "Idaho"},
+    {ORTH: "Ill.", LEMMA: "Illinois"},
+    {ORTH: "Ind.", LEMMA: "Indiana"},
+    {ORTH: "Jan.", LEMMA: "January"},
+    {ORTH: "Jul.", LEMMA: "July"},
+    {ORTH: "Jun.", LEMMA: "June"},
+    {ORTH: "Kan.", LEMMA: "Kansas"},
+    {ORTH: "Kans.", LEMMA: "Kansas"},
+    {ORTH: "Ky.", LEMMA: "Kentucky"},
+    {ORTH: "La.", LEMMA: "Louisiana"},
+    {ORTH: "Mar.", LEMMA: "March"},
+    {ORTH: "Mass.", LEMMA: "Massachusetts"},
+    {ORTH: "May.", LEMMA: "May"},
+    {ORTH: "Mich.", LEMMA: "Michigan"},
+    {ORTH: "Minn.", LEMMA: "Minnesota"},
+    {ORTH: "Miss.", LEMMA: "Mississippi"},
+    {ORTH: "N.C.", LEMMA: "North Carolina"},
+    {ORTH: "N.D.", LEMMA: "North Dakota"},
+    {ORTH: "N.H.", LEMMA: "New Hampshire"},
+    {ORTH: "N.J.", LEMMA: "New Jersey"},
+    {ORTH: "N.M.", LEMMA: "New Mexico"},
+    {ORTH: "N.Y.", LEMMA: "New York"},
+    {ORTH: "Neb.", LEMMA: "Nebraska"},
+    {ORTH: "Nebr.", LEMMA: "Nebraska"},
+    {ORTH: "Nev.", LEMMA: "Nevada"},
+    {ORTH: "Nov.", LEMMA: "November"},
+    {ORTH: "Oct.", LEMMA: "October"},
+    {ORTH: "Okla.", LEMMA: "Oklahoma"},
+    {ORTH: "Ore.", LEMMA: "Oregon"},
+    {ORTH: "Pa.", LEMMA: "Pennsylvania"},
+    {ORTH: "S.C.", LEMMA: "South Carolina"},
+    {ORTH: "Sep.", LEMMA: "September"},
+    {ORTH: "Sept.", LEMMA: "September"},
+    {ORTH: "Tenn.", LEMMA: "Tennessee"},
+    {ORTH: "Va.", LEMMA: "Virginia"},
+    {ORTH: "Wash.", LEMMA: "Washington"},
+    {ORTH: "Wis.", LEMMA: "Wisconsin"}]:
+    _exc[exc_data[ORTH]] = [dict(exc_data)]
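This loop collapses the old three-line dict entries into single-token exceptions that keep their lemma. A runnable sketch of the same move, with string stand-ins for the symbol constants:

    ORTH, LEMMA = "orth", "lemma"   # stand-ins for spacy.symbols
    _exc = {}
    for exc_data in [{ORTH: "N.Y.", LEMMA: "New York"},
                     {ORTH: "Wis.", LEMMA: "Wisconsin"}]:
        _exc[exc_data[ORTH]] = [dict(exc_data)]
    print(_exc["N.Y."][0][LEMMA])   # New York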
-# Remove EXCLUDE_EXC if in exceptions
-for string in EXCLUDE_EXC:
-    if string in TOKENIZER_EXCEPTIONS:
-        TOKENIZER_EXCEPTIONS.pop(string)
-
-# Abbreviations with only one ORTH token
-ORTH_ONLY = [
-    "'d",
-    "a.m.",
-    "Adm.",
-    "Bros.",
-    "co.",
-    "Co.",
-    "Corp.",
-    "D.C.",
-    "Dr.",
-    "e.g.",
-    "E.g.",
-    "E.G.",
-    "Gen.",
-    "Gov.",
-    "i.e.",
-    "I.e.",
-    "I.E.",
-    "Inc.",
-    "Jr.",
-    "Ltd.",
-    "Md.",
-    "Messrs.",
-    "Mo.",
-    "Mont.",
-    "Mr.",
-    "Mrs.",
-    "Ms.",
-    "p.m.",
-    "Ph.D.",
-    "Rep.",
-    "Rev.",
-    "Sen.",
-    "St.",
-    "vs.",
-]
+for orth in [
+    "'d", "a.m.", "Adm.", "Bros.", "co.", "Co.", "Corp.", "D.C.", "Dr.", "e.g.",
+    "E.g.", "E.G.", "Gen.", "Gov.", "i.e.", "I.e.", "I.E.", "Inc.", "Jr.",
+    "Ltd.", "Md.", "Messrs.", "Mo.", "Mont.", "Mr.", "Mrs.", "Ms.", "p.m.",
+    "Ph.D.", "Rep.", "Rev.", "Sen.", "St.", "vs."]:
+    _exc[orth] = [{ORTH: orth}]
+
+for string in _exclude:
+    if string in _exc:
+        _exc.pop(string)
+
+TOKENIZER_EXCEPTIONS = dict(_exc)
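Finally, ambiguous forms listed in _exclude are dropped again before the table is exported. This is why, for example, "Ill" (which the pronoun loops would otherwise generate from "I" + "ll") stays an ordinary word, while the unrelated abbreviation "Ill." survives. A sketch with hypothetical entries and string stand-ins for the symbol keys:

    _exc = {"Ill": [{"orth": "I"}, {"orth": "ll"}],   # generated contraction
            "Ill.": [{"orth": "Ill."}]}               # Illinois abbreviation
    _exclude = ["Ill", "ill", "Well", "well"]
    for string in _exclude:
        if string in _exc:
            _exc.pop(string)
    TOKENIZER_EXCEPTIONS = dict(_exc)
    print(sorted(TOKENIZER_EXCEPTIONS))  # ['Ill.'] (the abbreviation survives)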