Update tokenizer exceptions for English

This commit is contained in:
Ines Montani 2016-12-21 18:06:34 +01:00
parent 702d1eed93
commit 78e63dc7d0

View File

@ -11,7 +11,7 @@ TOKENIZER_EXCEPTIONS = {
], ],
"Theydve": [ "Theydve": [
{ORTH: "They", LEMMA: PRON_LEMMA}, {ORTH: "They", LEMMA: PRON_LEMMA, TAG: "PRP"},
{ORTH: "d", LEMMA: "would", TAG: "MD"}, {ORTH: "d", LEMMA: "would", TAG: "MD"},
{ORTH: "ve", LEMMA: "have", TAG: "VB"} {ORTH: "ve", LEMMA: "have", TAG: "VB"}
], ],
@ -68,7 +68,7 @@ TOKENIZER_EXCEPTIONS = {
], ],
"itll": [ "itll": [
{ORTH: "it", LEMMA: PRON_LEMMA}, {ORTH: "it", LEMMA: PRON_LEMMA, TAG: "PRP"},
{ORTH: "ll", LEMMA: "will", TAG: "MD"} {ORTH: "ll", LEMMA: "will", TAG: "MD"}
], ],
@ -113,7 +113,7 @@ TOKENIZER_EXCEPTIONS = {
], ],
"Idve": [ "Idve": [
{ORTH: "I", LEMMA: PRON_LEMMA}, {ORTH: "I", LEMMA: PRON_LEMMA, TAG: "PRP"},
{ORTH: "d", LEMMA: "would", TAG: "MD"}, {ORTH: "d", LEMMA: "would", TAG: "MD"},
{ORTH: "ve", LEMMA: "have", TAG: "VB"} {ORTH: "ve", LEMMA: "have", TAG: "VB"}
], ],
@ -124,23 +124,23 @@ TOKENIZER_EXCEPTIONS = {
], ],
"Ive": [ "Ive": [
{ORTH: "I", LEMMA: PRON_LEMMA}, {ORTH: "I", LEMMA: PRON_LEMMA, TAG: "PRP"},
{ORTH: "ve", LEMMA: "have", TAG: "VB"} {ORTH: "ve", LEMMA: "have", TAG: "VB"}
], ],
"they'd": [ "they'd": [
{ORTH: "they", LEMMA: PRON_LEMMA}, {ORTH: "they", LEMMA: PRON_LEMMA, TAG: "PRP"},
{ORTH: "'d", LEMMA: "would", TAG: "MD"} {ORTH: "'d", LEMMA: "would", TAG: "MD"}
], ],
"Youdve": [ "Youdve": [
{ORTH: "You", LEMMA: PRON_LEMMA}, {ORTH: "You", LEMMA: PRON_LEMMA, TAG: "PRP"},
{ORTH: "d", LEMMA: "would", TAG: "MD"}, {ORTH: "d", LEMMA: "would", TAG: "MD"},
{ORTH: "ve", LEMMA: "have", TAG: "VB"} {ORTH: "ve", LEMMA: "have", TAG: "VB"}
], ],
"theyve": [ "theyve": [
{ORTH: "they", LEMMA: PRON_LEMMA}, {ORTH: "they", LEMMA: PRON_LEMMA, TAG: "PRP"},
{ORTH: "ve", LEMMA: "have", TAG: "VB"} {ORTH: "ve", LEMMA: "have", TAG: "VB"}
], ],
@ -160,12 +160,12 @@ TOKENIZER_EXCEPTIONS = {
], ],
"I'm": [ "I'm": [
{ORTH: "I", LEMMA: PRON_LEMMA}, {ORTH: "I", LEMMA: PRON_LEMMA, TAG: "PRP"},
{ORTH: "'m", TAG: "VBP", "tenspect": 1, "number": 1, LEMMA: "be"} {ORTH: "'m", TAG: "VBP", "tenspect": 1, "number": 1, LEMMA: "be"}
], ],
"She'd've": [ "She'd've": [
{ORTH: "She", LEMMA: PRON_LEMMA}, {ORTH: "She", LEMMA: PRON_LEMMA, TAG: "PRP"},
{ORTH: "'d", LEMMA: "would", TAG: "MD"}, {ORTH: "'d", LEMMA: "would", TAG: "MD"},
{ORTH: "'ve", LEMMA: "have", TAG: "VB"} {ORTH: "'ve", LEMMA: "have", TAG: "VB"}
], ],
@ -191,7 +191,7 @@ TOKENIZER_EXCEPTIONS = {
], ],
"they've": [ "they've": [
{ORTH: "they", LEMMA: PRON_LEMMA}, {ORTH: "they", LEMMA: PRON_LEMMA, TAG: "PRP"},
{ORTH: "'ve", LEMMA: "have", TAG: "VB"} {ORTH: "'ve", LEMMA: "have", TAG: "VB"}
], ],
@ -226,12 +226,12 @@ TOKENIZER_EXCEPTIONS = {
], ],
"i'll": [ "i'll": [
{ORTH: "i", LEMMA: PRON_LEMMA}, {ORTH: "i", LEMMA: PRON_LEMMA, TAG: "PRP"},
{ORTH: "'ll", LEMMA: "will", TAG: "MD"} {ORTH: "'ll", LEMMA: "will", TAG: "MD"}
], ],
"you'd": [ "you'd": [
{ORTH: "you", LEMMA: PRON_LEMMA}, {ORTH: "you", LEMMA: PRON_LEMMA, TAG: "PRP"},
{ORTH: "'d", LEMMA: "would", TAG: "MD"} {ORTH: "'d", LEMMA: "would", TAG: "MD"}
], ],
@ -287,7 +287,7 @@ TOKENIZER_EXCEPTIONS = {
], ],
"youll": [ "youll": [
{ORTH: "you", LEMMA: PRON_LEMMA}, {ORTH: "you", LEMMA: PRON_LEMMA, TAG: "PRP"},
{ORTH: "ll", LEMMA: "will", TAG: "MD"} {ORTH: "ll", LEMMA: "will", TAG: "MD"}
], ],
@ -307,7 +307,7 @@ TOKENIZER_EXCEPTIONS = {
], ],
"Youre": [ "Youre": [
{ORTH: "You", LEMMA: PRON_LEMMA}, {ORTH: "You", LEMMA: PRON_LEMMA, TAG: "PRP"},
{ORTH: "re", LEMMA: "be"} {ORTH: "re", LEMMA: "be"}
], ],
@ -369,7 +369,7 @@ TOKENIZER_EXCEPTIONS = {
], ],
"You'll": [ "You'll": [
{ORTH: "You", LEMMA: PRON_LEMMA}, {ORTH: "You", LEMMA: PRON_LEMMA, TAG: "PRP"},
{ORTH: "'ll", LEMMA: "will", TAG: "MD"} {ORTH: "'ll", LEMMA: "will", TAG: "MD"}
], ],
@ -379,7 +379,7 @@ TOKENIZER_EXCEPTIONS = {
], ],
"i'd": [ "i'd": [
{ORTH: "i", LEMMA: PRON_LEMMA}, {ORTH: "i", LEMMA: PRON_LEMMA, TAG: "PRP"},
{ORTH: "'d", LEMMA: "would", TAG: "MD"} {ORTH: "'d", LEMMA: "would", TAG: "MD"}
], ],
@ -394,7 +394,7 @@ TOKENIZER_EXCEPTIONS = {
], ],
"i'm": [ "i'm": [
{ORTH: "i", LEMMA: PRON_LEMMA}, {ORTH: "i", LEMMA: PRON_LEMMA, TAG: "PRP"},
{ORTH: "'m", TAG: "VBP", "tenspect": 1, "number": 1, LEMMA: "be"} {ORTH: "'m", TAG: "VBP", "tenspect": 1, "number": 1, LEMMA: "be"}
], ],
@ -425,7 +425,7 @@ TOKENIZER_EXCEPTIONS = {
], ],
"Hes": [ "Hes": [
{ORTH: "He", LEMMA: PRON_LEMMA}, {ORTH: "He", LEMMA: PRON_LEMMA, TAG: "PRP"},
{ORTH: "s"} {ORTH: "s"}
], ],
@ -435,7 +435,7 @@ TOKENIZER_EXCEPTIONS = {
], ],
"It's": [ "It's": [
{ORTH: "It", LEMMA: PRON_LEMMA}, {ORTH: "It", LEMMA: PRON_LEMMA, TAG: "PRP"},
{ORTH: "'s"} {ORTH: "'s"}
], ],
@ -445,7 +445,7 @@ TOKENIZER_EXCEPTIONS = {
], ],
"Hed": [ "Hed": [
{ORTH: "He", LEMMA: PRON_LEMMA}, {ORTH: "He", LEMMA: PRON_LEMMA, TAG: "PRP"},
{ORTH: "d", LEMMA: "would", TAG: "MD"} {ORTH: "d", LEMMA: "would", TAG: "MD"}
], ],
@ -464,12 +464,12 @@ TOKENIZER_EXCEPTIONS = {
], ],
"It'd": [ "It'd": [
{ORTH: "It", LEMMA: PRON_LEMMA}, {ORTH: "It", LEMMA: PRON_LEMMA, TAG: "PRP"},
{ORTH: "'d", LEMMA: "would", TAG: "MD"} {ORTH: "'d", LEMMA: "would", TAG: "MD"}
], ],
"theydve": [ "theydve": [
{ORTH: "they", LEMMA: PRON_LEMMA}, {ORTH: "they", LEMMA: PRON_LEMMA, TAG: "PRP"},
{ORTH: "d", LEMMA: "would", TAG: "MD"}, {ORTH: "d", LEMMA: "would", TAG: "MD"},
{ORTH: "ve", LEMMA: "have", TAG: "VB"} {ORTH: "ve", LEMMA: "have", TAG: "VB"}
], ],
@ -489,7 +489,7 @@ TOKENIZER_EXCEPTIONS = {
], ],
"I've": [ "I've": [
{ORTH: "I", LEMMA: PRON_LEMMA}, {ORTH: "I", LEMMA: PRON_LEMMA, TAG: "PRP"},
{ORTH: "'ve", LEMMA: "have", TAG: "VB"} {ORTH: "'ve", LEMMA: "have", TAG: "VB"}
], ],
@ -499,13 +499,13 @@ TOKENIZER_EXCEPTIONS = {
], ],
"Itdve": [ "Itdve": [
{ORTH: "It", LEMMA: PRON_LEMMA}, {ORTH: "It", LEMMA: PRON_LEMMA, TAG: "PRP"},
{ORTH: "d", LEMMA: "would", TAG: "MD"}, {ORTH: "d", LEMMA: "would", TAG: "MD"},
{ORTH: "ve", LEMMA: "have", TAG: "VB"} {ORTH: "ve", LEMMA: "have", TAG: "VB"}
], ],
"I'ma": [ "I'ma": [
{ORTH: "I", LEMMA: PRON_LEMMA}, {ORTH: "I", LEMMA: PRON_LEMMA, TAG: "PRP"},
{ORTH: "'ma"} {ORTH: "'ma"}
], ],
@ -515,7 +515,7 @@ TOKENIZER_EXCEPTIONS = {
], ],
"They'd": [ "They'd": [
{ORTH: "They", LEMMA: PRON_LEMMA}, {ORTH: "They", LEMMA: PRON_LEMMA, TAG: "PRP"},
{ORTH: "'d", LEMMA: "would", TAG: "MD"} {ORTH: "'d", LEMMA: "would", TAG: "MD"}
], ],
@ -525,7 +525,7 @@ TOKENIZER_EXCEPTIONS = {
], ],
"You've": [ "You've": [
{ORTH: "You", LEMMA: PRON_LEMMA}, {ORTH: "You", LEMMA: PRON_LEMMA, TAG: "PRP"},
{ORTH: "'ve", LEMMA: "have", TAG: "VB"} {ORTH: "'ve", LEMMA: "have", TAG: "VB"}
], ],
@ -546,7 +546,7 @@ TOKENIZER_EXCEPTIONS = {
], ],
"I'd've": [ "I'd've": [
{ORTH: "I", LEMMA: PRON_LEMMA}, {ORTH: "I", LEMMA: PRON_LEMMA, TAG: "PRP"},
{ORTH: "'d", LEMMA: "would", TAG: "MD"}, {ORTH: "'d", LEMMA: "would", TAG: "MD"},
{ORTH: "'ve", LEMMA: "have", TAG: "VB"} {ORTH: "'ve", LEMMA: "have", TAG: "VB"}
], ],
@ -557,13 +557,13 @@ TOKENIZER_EXCEPTIONS = {
], ],
"it'd": [ "it'd": [
{ORTH: "it", LEMMA: PRON_LEMMA}, {ORTH: "it", LEMMA: PRON_LEMMA, TAG: "PRP"},
{ORTH: "'d", LEMMA: "would", TAG: "MD"} {ORTH: "'d", LEMMA: "would", TAG: "MD"}
], ],
"what're": [ "what're": [
{ORTH: "what"}, {ORTH: "what"},
{ORTH: "'re", LEMMA: "be"} {ORTH: "'re", LEMMA: "be", NORM: "are"}
], ],
"Wasn't": [ "Wasn't": [
@ -577,18 +577,18 @@ TOKENIZER_EXCEPTIONS = {
], ],
"he'd've": [ "he'd've": [
{ORTH: "he", LEMMA: PRON_LEMMA}, {ORTH: "he", LEMMA: PRON_LEMMA, TAG: "PRP"},
{ORTH: "'d", LEMMA: "would", TAG: "MD"}, {ORTH: "'d", LEMMA: "would", TAG: "MD"},
{ORTH: "'ve", LEMMA: "have", TAG: "VB"} {ORTH: "'ve", LEMMA: "have", TAG: "VB"}
], ],
"She'd": [ "She'd": [
{ORTH: "She", LEMMA: PRON_LEMMA}, {ORTH: "She", LEMMA: PRON_LEMMA, TAG: "PRP"},
{ORTH: "'d", LEMMA: "would", TAG: "MD"} {ORTH: "'d", LEMMA: "would", TAG: "MD"}
], ],
"shedve": [ "shedve": [
{ORTH: "she", LEMMA: PRON_LEMMA}, {ORTH: "she", LEMMA: PRON_LEMMA, TAG: "PRP"},
{ORTH: "d", LEMMA: "would", TAG: "MD"}, {ORTH: "d", LEMMA: "would", TAG: "MD"},
{ORTH: "ve", LEMMA: "have", TAG: "VB"} {ORTH: "ve", LEMMA: "have", TAG: "VB"}
], ],
@ -599,12 +599,12 @@ TOKENIZER_EXCEPTIONS = {
], ],
"She's": [ "She's": [
{ORTH: "i", LEMMA: PRON_LEMMA}, {ORTH: "She", LEMMA: PRON_LEMMA, TAG: "PRP"},
{ORTH: "'s"} {ORTH: "'s"}
], ],
"i'd've": [ "i'd've": [
{ORTH: "i", LEMMA: PRON_LEMMA}, {ORTH: "i", LEMMA: PRON_LEMMA, TAG: "PRP"},
{ORTH: "'d", LEMMA: "would", TAG: "MD"}, {ORTH: "'d", LEMMA: "would", TAG: "MD"},
{ORTH: "'ve", LEMMA: "have", TAG: "VB"} {ORTH: "'ve", LEMMA: "have", TAG: "VB"}
], ],
@ -631,7 +631,7 @@ TOKENIZER_EXCEPTIONS = {
], ],
"you'd've": [ "you'd've": [
{ORTH: "you", LEMMA: PRON_LEMMA}, {ORTH: "you", LEMMA: PRON_LEMMA, TAG: "PRP"},
{ORTH: "'d", LEMMA: "would", TAG: "MD"}, {ORTH: "'d", LEMMA: "would", TAG: "MD"},
{ORTH: "'ve", LEMMA: "have", TAG: "VB"} {ORTH: "'ve", LEMMA: "have", TAG: "VB"}
], ],
@ -647,7 +647,7 @@ TOKENIZER_EXCEPTIONS = {
], ],
"Youd": [ "Youd": [
{ORTH: "You", LEMMA: PRON_LEMMA}, {ORTH: "You", LEMMA: PRON_LEMMA, TAG: "PRP"},
{ORTH: "d", LEMMA: "would", TAG: "MD"} {ORTH: "d", LEMMA: "would", TAG: "MD"}
], ],
@ -678,12 +678,12 @@ TOKENIZER_EXCEPTIONS = {
], ],
"ive": [ "ive": [
{ORTH: "i", LEMMA: PRON_LEMMA}, {ORTH: "i", LEMMA: PRON_LEMMA, TAG: "PRP"},
{ORTH: "ve", LEMMA: "have", TAG: "VB"} {ORTH: "ve", LEMMA: "have", TAG: "VB"}
], ],
"It'd've": [ "It'd've": [
{ORTH: "It", LEMMA: PRON_LEMMA}, {ORTH: "It", LEMMA: PRON_LEMMA, TAG: "PRP"},
{ORTH: "'d", LEMMA: "would", TAG: "MD"}, {ORTH: "'d", LEMMA: "would", TAG: "MD"},
{ORTH: "'ve", LEMMA: "have", TAG: "VB"} {ORTH: "'ve", LEMMA: "have", TAG: "VB"}
], ],
@ -693,7 +693,7 @@ TOKENIZER_EXCEPTIONS = {
], ],
"Itll": [ "Itll": [
{ORTH: "It", LEMMA: PRON_LEMMA}, {ORTH: "It", LEMMA: PRON_LEMMA, TAG: "PRP"},
{ORTH: "ll", LEMMA: "will", TAG: "MD"} {ORTH: "ll", LEMMA: "will", TAG: "MD"}
], ],
@ -708,12 +708,12 @@ TOKENIZER_EXCEPTIONS = {
], ],
"im": [ "im": [
{ORTH: "i", LEMMA: PRON_LEMMA}, {ORTH: "i", LEMMA: PRON_LEMMA, TAG: "PRP"},
{ORTH: "m", TAG: "VBP", "tenspect": 1, "number": 1, LEMMA: "be"} {ORTH: "m", TAG: "VBP", "tenspect": 1, "number": 1, LEMMA: "be"}
], ],
"they'd've": [ "they'd've": [
{ORTH: "they", LEMMA: PRON_LEMMA}, {ORTH: "they", LEMMA: PRON_LEMMA, TAG: "PRP"},
{ORTH: "'d", LEMMA: "would", TAG: "MD"}, {ORTH: "'d", LEMMA: "would", TAG: "MD"},
{ORTH: "'ve", LEMMA: "have", TAG: "VB"} {ORTH: "'ve", LEMMA: "have", TAG: "VB"}
], ],
@ -735,19 +735,19 @@ TOKENIZER_EXCEPTIONS = {
], ],
"youdve": [ "youdve": [
{ORTH: "you", LEMMA: PRON_LEMMA}, {ORTH: "you", LEMMA: PRON_LEMMA, TAG: "PRP"},
{ORTH: "d", LEMMA: "would", TAG: "MD"}, {ORTH: "d", LEMMA: "would", TAG: "MD"},
{ORTH: "ve", LEMMA: "have", TAG: "VB"} {ORTH: "ve", LEMMA: "have", TAG: "VB"}
], ],
"Shedve": [ "Shedve": [
{ORTH: "i", LEMMA: PRON_LEMMA}, {ORTH: "She", LEMMA: PRON_LEMMA, TAG: "PRP"},
{ORTH: "d", LEMMA: "would", TAG: "MD"}, {ORTH: "d", LEMMA: "would", TAG: "MD"},
{ORTH: "ve", LEMMA: "have", TAG: "VB"} {ORTH: "ve", LEMMA: "have", TAG: "VB"}
], ],
"theyd": [ "theyd": [
{ORTH: "they", LEMMA: PRON_LEMMA}, {ORTH: "they", LEMMA: PRON_LEMMA, TAG: "PRP"},
{ORTH: "d", LEMMA: "would", TAG: "MD"} {ORTH: "d", LEMMA: "would", TAG: "MD"}
], ],
@ -763,11 +763,11 @@ TOKENIZER_EXCEPTIONS = {
"What're": [ "What're": [
{ORTH: "What"}, {ORTH: "What"},
{ORTH: "'re", LEMMA: "be"} {ORTH: "'re", LEMMA: "be", NORM: "are"}
], ],
"He'll": [ "He'll": [
{ORTH: "He", LEMMA: PRON_LEMMA}, {ORTH: "He", LEMMA: PRON_LEMMA, TAG: "PRP"},
{ORTH: "'ll", LEMMA: "will", TAG: "MD"} {ORTH: "'ll", LEMMA: "will", TAG: "MD"}
], ],
@ -777,8 +777,8 @@ TOKENIZER_EXCEPTIONS = {
], ],
"They're": [ "They're": [
{ORTH: "They", LEMMA: PRON_LEMMA}, {ORTH: "They", LEMMA: PRON_LEMMA, TAG: "PRP"},
{ORTH: "'re", LEMMA: "be"} {ORTH: "'re", LEMMA: "be", NORM: "are"}
], ],
"shouldnt": [ "shouldnt": [
@ -796,7 +796,7 @@ TOKENIZER_EXCEPTIONS = {
], ],
"youve": [ "youve": [
{ORTH: "you", LEMMA: PRON_LEMMA}, {ORTH: "you", LEMMA: PRON_LEMMA, TAG: "PRP"},
{ORTH: "ve", LEMMA: "have", TAG: "VB"} {ORTH: "ve", LEMMA: "have", TAG: "VB"}
], ],
@ -816,7 +816,7 @@ TOKENIZER_EXCEPTIONS = {
], ],
"Youve": [ "Youve": [
{ORTH: "You", LEMMA: PRON_LEMMA}, {ORTH: "You", LEMMA: PRON_LEMMA, TAG: "PRP"},
{ORTH: "ve", LEMMA: "have", TAG: "VB"} {ORTH: "ve", LEMMA: "have", TAG: "VB"}
], ],
@ -841,12 +841,12 @@ TOKENIZER_EXCEPTIONS = {
], ],
"they're": [ "they're": [
{ORTH: "they", LEMMA: PRON_LEMMA}, {ORTH: "they", LEMMA: PRON_LEMMA, TAG: "PRP"},
{ORTH: "'re", LEMMA: "be"} {ORTH: "'re", LEMMA: "be", NORM: "are"}
], ],
"idve": [ "idve": [
{ORTH: "i", LEMMA: PRON_LEMMA}, {ORTH: "i", LEMMA: PRON_LEMMA, TAG: "PRP"},
{ORTH: "d", LEMMA: "would", TAG: "MD"}, {ORTH: "d", LEMMA: "would", TAG: "MD"},
{ORTH: "ve", LEMMA: "have", TAG: "VB"} {ORTH: "ve", LEMMA: "have", TAG: "VB"}
], ],
@ -857,8 +857,8 @@ TOKENIZER_EXCEPTIONS = {
], ],
"youre": [ "youre": [
{ORTH: "you", LEMMA: PRON_LEMMA}, {ORTH: "you", LEMMA: PRON_LEMMA, TAG: "PRP"},
{ORTH: "re"} {ORTH: "re", LEMMA: "be", NORM: "are"}
], ],
"Didn't": [ "Didn't": [
@ -877,8 +877,8 @@ TOKENIZER_EXCEPTIONS = {
], ],
"Im": [ "Im": [
{ORTH: "I", LEMMA: PRON_LEMMA}, {ORTH: "I", LEMMA: PRON_LEMMA, TAG: "PRP"},
{ORTH: "m", TAG: "VBP", "tenspect": 1, "number": 1, LEMMA: "be"} {ORTH: "m", TAG: "VBP", "tenspect": 1, "number": 1, LEMMA: "be", NORM: "am"}
], ],
"howd": [ "howd": [
@ -887,22 +887,22 @@ TOKENIZER_EXCEPTIONS = {
], ],
"you've": [ "you've": [
{ORTH: "you", LEMMA: PRON_LEMMA}, {ORTH: "you", LEMMA: PRON_LEMMA, TAG: "PRP"},
{ORTH: "'ve", LEMMA: "have", TAG: "VB"} {ORTH: "'ve", LEMMA: "have", TAG: "VB"}
], ],
"You're": [ "You're": [
{ORTH: "You", LEMMA: PRON_LEMMA}, {ORTH: "You", LEMMA: PRON_LEMMA, TAG: "PRP"},
{ORTH: "'re", LEMMA: "be"} {ORTH: "'re", LEMMA: "be", NORM: "are"}
], ],
"she'll": [ "she'll": [
{ORTH: "she", LEMMA: PRON_LEMMA}, {ORTH: "she", LEMMA: PRON_LEMMA, TAG: "PRP"},
{ORTH: "'ll", LEMMA: "will", TAG: "MD"} {ORTH: "'ll", LEMMA: "will", TAG: "MD"}
], ],
"Theyll": [ "Theyll": [
{ORTH: "They", LEMMA: PRON_LEMMA}, {ORTH: "They", LEMMA: PRON_LEMMA, TAG: "PRP"},
{ORTH: "ll", LEMMA: "will", TAG: "MD"} {ORTH: "ll", LEMMA: "will", TAG: "MD"}
], ],
@ -912,12 +912,12 @@ TOKENIZER_EXCEPTIONS = {
], ],
"itd": [ "itd": [
{ORTH: "it", LEMMA: PRON_LEMMA}, {ORTH: "it", LEMMA: PRON_LEMMA, TAG: "PRP"},
{ORTH: "d", LEMMA: "would", TAG: "MD"} {ORTH: "d", LEMMA: "would", TAG: "MD"}
], ],
"Hedve": [ "Hedve": [
{ORTH: "He", LEMMA: PRON_LEMMA}, {ORTH: "He", LEMMA: PRON_LEMMA, TAG: "PRP"},
{ORTH: "d", LEMMA: "would", TAG: "MD"}, {ORTH: "d", LEMMA: "would", TAG: "MD"},
{ORTH: "ve", LEMMA: "have", TAG: "VB"} {ORTH: "ve", LEMMA: "have", TAG: "VB"}
], ],
@ -933,8 +933,8 @@ TOKENIZER_EXCEPTIONS = {
], ],
"We're": [ "We're": [
{ORTH: "We", LEMMA: PRON_LEMMA}, {ORTH: "We", LEMMA: PRON_LEMMA, TAG: "PRP"},
{ORTH: "'re", LEMMA: "be"} {ORTH: "'re", LEMMA: "be", NORM: "are"}
], ],
"\u2018S": [ "\u2018S": [
@ -951,7 +951,7 @@ TOKENIZER_EXCEPTIONS = {
], ],
"ima": [ "ima": [
{ORTH: "i", LEMMA: PRON_LEMMA}, {ORTH: "i", LEMMA: PRON_LEMMA, TAG: "PRP"},
{ORTH: "ma"} {ORTH: "ma"}
], ],
@ -961,7 +961,7 @@ TOKENIZER_EXCEPTIONS = {
], ],
"he's": [ "he's": [
{ORTH: "he", LEMMA: PRON_LEMMA}, {ORTH: "he", LEMMA: PRON_LEMMA, TAG: "PRP"},
{ORTH: "'s"} {ORTH: "'s"}
], ],
@ -981,13 +981,13 @@ TOKENIZER_EXCEPTIONS = {
], ],
"hedve": [ "hedve": [
{ORTH: "he", LEMMA: PRON_LEMMA}, {ORTH: "he", LEMMA: PRON_LEMMA, TAG: "PRP"},
{ORTH: "d", LEMMA: "would", TAG: "MD"}, {ORTH: "d", LEMMA: "would", TAG: "MD"},
{ORTH: "ve", LEMMA: "have", TAG: "VB"} {ORTH: "ve", LEMMA: "have", TAG: "VB"}
], ],
"he'd": [ "he'd": [
{ORTH: "he", LEMMA: PRON_LEMMA}, {ORTH: "he", LEMMA: PRON_LEMMA, TAG: "PRP"},
{ORTH: "'d", LEMMA: "would", TAG: "MD"} {ORTH: "'d", LEMMA: "would", TAG: "MD"}
], ],
@ -1029,7 +1029,7 @@ TOKENIZER_EXCEPTIONS = {
], ],
"You'd've": [ "You'd've": [
{ORTH: "You", LEMMA: PRON_LEMMA}, {ORTH: "You", LEMMA: PRON_LEMMA, TAG: "PRP"},
{ORTH: "'d", LEMMA: "would", TAG: "MD"}, {ORTH: "'d", LEMMA: "would", TAG: "MD"},
{ORTH: "'ve", LEMMA: "have", TAG: "VB"} {ORTH: "'ve", LEMMA: "have", TAG: "VB"}
], ],
@ -1072,12 +1072,12 @@ TOKENIZER_EXCEPTIONS = {
], ],
"wont": [ "wont": [
{ORTH: "wo"}, {ORTH: "wo", LEMMA: "will"},
{ORTH: "nt", LEMMA: "not", TAG: "RB"} {ORTH: "nt", LEMMA: "not", TAG: "RB"}
], ],
"she'd've": [ "she'd've": [
{ORTH: "she", LEMMA: PRON_LEMMA}, {ORTH: "she", LEMMA: PRON_LEMMA, TAG: "PRP"},
{ORTH: "'d", LEMMA: "would", TAG: "MD"}, {ORTH: "'d", LEMMA: "would", TAG: "MD"},
{ORTH: "'ve", LEMMA: "have", TAG: "VB"} {ORTH: "'ve", LEMMA: "have", TAG: "VB"}
], ],
@ -1088,7 +1088,7 @@ TOKENIZER_EXCEPTIONS = {
], ],
"theyre": [ "theyre": [
{ORTH: "they", LEMMA: PRON_LEMMA}, {ORTH: "they", LEMMA: PRON_LEMMA, TAG: "PRP"},
{ORTH: "re"} {ORTH: "re"}
], ],
@ -1129,7 +1129,7 @@ TOKENIZER_EXCEPTIONS = {
], ],
"They'll": [ "They'll": [
{ORTH: "They", LEMMA: PRON_LEMMA}, {ORTH: "They", LEMMA: PRON_LEMMA, TAG: "PRP"},
{ORTH: "'ll", LEMMA: "will", TAG: "MD"} {ORTH: "'ll", LEMMA: "will", TAG: "MD"}
], ],
@ -1139,7 +1139,7 @@ TOKENIZER_EXCEPTIONS = {
], ],
"Wedve": [ "Wedve": [
{ORTH: "We"}, {ORTH: "We", LEMMA: PRON_LEMMA, TAG: "PRP"},
{ORTH: "d", LEMMA: "would", TAG: "MD"}, {ORTH: "d", LEMMA: "would", TAG: "MD"},
{ORTH: "ve", LEMMA: "have", TAG: "VB"} {ORTH: "ve", LEMMA: "have", TAG: "VB"}
], ],
@ -1156,7 +1156,7 @@ TOKENIZER_EXCEPTIONS = {
], ],
"we'd": [ "we'd": [
{ORTH: "we"}, {ORTH: "we", LEMMA: PRON_LEMMA, TAG: "PRP"},
{ORTH: "'d", LEMMA: "would", TAG: "MD"} {ORTH: "'d", LEMMA: "would", TAG: "MD"}
], ],
@ -1193,7 +1193,7 @@ TOKENIZER_EXCEPTIONS = {
"why're": [ "why're": [
{ORTH: "why"}, {ORTH: "why"},
{ORTH: "'re", LEMMA: "be"} {ORTH: "'re", LEMMA: "be", NORM: "are"}
], ],
"Doesnt": [ "Doesnt": [
@ -1207,12 +1207,12 @@ TOKENIZER_EXCEPTIONS = {
], ],
"they'll": [ "they'll": [
{ORTH: "they", LEMMA: PRON_LEMMA}, {ORTH: "they", LEMMA: PRON_LEMMA, TAG: "PRP"},
{ORTH: "'ll", LEMMA: "will", TAG: "MD"} {ORTH: "'ll", LEMMA: "will", TAG: "MD"}
], ],
"I'd": [ "I'd": [
{ORTH: "I", LEMMA: PRON_LEMMA}, {ORTH: "I", LEMMA: PRON_LEMMA, TAG: "PRP"},
{ORTH: "'d", LEMMA: "would", TAG: "MD"} {ORTH: "'d", LEMMA: "would", TAG: "MD"}
], ],
@ -1237,12 +1237,12 @@ TOKENIZER_EXCEPTIONS = {
], ],
"you're": [ "you're": [
{ORTH: "you", LEMMA: PRON_LEMMA}, {ORTH: "you", LEMMA: PRON_LEMMA, TAG: "PRP"},
{ORTH: "'re", LEMMA: "be"} {ORTH: "'re", LEMMA: "be", NORM: "are"}
], ],
"They've": [ "They've": [
{ORTH: "They", LEMMA: PRON_LEMMA}, {ORTH: "They", LEMMA: PRON_LEMMA, TAG: "PRP"},
{ORTH: "'ve", LEMMA: "have", TAG: "VB"} {ORTH: "'ve", LEMMA: "have", TAG: "VB"}
], ],
@ -1272,12 +1272,12 @@ TOKENIZER_EXCEPTIONS = {
], ],
"She'll": [ "She'll": [
{ORTH: "i", LEMMA: PRON_LEMMA}, {ORTH: "She", LEMMA: PRON_LEMMA, TAG: "PRP"},
{ORTH: "'ll", LEMMA: "will", TAG: "MD"} {ORTH: "'ll", LEMMA: "will", TAG: "MD"}
], ],
"You'd": [ "You'd": [
{ORTH: "You", LEMMA: PRON_LEMMA}, {ORTH: "You", LEMMA: PRON_LEMMA, TAG: "PRP"},
{ORTH: "'d", LEMMA: "would", TAG: "MD"} {ORTH: "'d", LEMMA: "would", TAG: "MD"}
], ],
@ -1297,8 +1297,8 @@ TOKENIZER_EXCEPTIONS = {
], ],
"Theyre": [ "Theyre": [
{ORTH: "They", LEMMA: PRON_LEMMA}, {ORTH: "They", LEMMA: PRON_LEMMA, TAG: "PRP"},
{ORTH: "re"} {ORTH: "re", LEMMA: "be", NORM: "are"}
], ],
"Won't": [ "Won't": [
@ -1312,33 +1312,33 @@ TOKENIZER_EXCEPTIONS = {
], ],
"it's": [ "it's": [
{ORTH: "it", LEMMA: PRON_LEMMA}, {ORTH: "it", LEMMA: PRON_LEMMA, TAG: "PRP"},
{ORTH: "'s"} {ORTH: "'s"}
], ],
"it'll": [ "it'll": [
{ORTH: "it", LEMMA: PRON_LEMMA}, {ORTH: "it", LEMMA: PRON_LEMMA, TAG: "PRP"},
{ORTH: "'ll", LEMMA: "will", TAG: "MD"} {ORTH: "'ll", LEMMA: "will", TAG: "MD"}
], ],
"They'd've": [ "They'd've": [
{ORTH: "They", LEMMA: PRON_LEMMA}, {ORTH: "They", LEMMA: PRON_LEMMA, TAG: "PRP"},
{ORTH: "'d", LEMMA: "would", TAG: "MD"}, {ORTH: "'d", LEMMA: "would", TAG: "MD"},
{ORTH: "'ve", LEMMA: "have", TAG: "VB"} {ORTH: "'ve", LEMMA: "have", TAG: "VB"}
], ],
"Ima": [ "Ima": [
{ORTH: "I", LEMMA: PRON_LEMMA}, {ORTH: "I", LEMMA: PRON_LEMMA, TAG: "PRP"},
{ORTH: "ma"} {ORTH: "ma"}
], ],
"gonna": [ "gonna": [
{ORTH: "gon", LEMMA: "go"}, {ORTH: "gon", LEMMA: "go", NORM: "going"},
{ORTH: "na", LEMMA: "to"} {ORTH: "na", LEMMA: "to"}
], ],
"Gonna": [ "Gonna": [
{ORTH: "Gon", LEMMA: "go"}, {ORTH: "Gon", LEMMA: "go", NORM: "going"},
{ORTH: "na", LEMMA: "to"} {ORTH: "na", LEMMA: "to"}
], ],
@ -1359,7 +1359,7 @@ TOKENIZER_EXCEPTIONS = {
], ],
"youd": [ "youd": [
{ORTH: "you", LEMMA: PRON_LEMMA}, {ORTH: "you", LEMMA: PRON_LEMMA, TAG: "PRP"},
{ORTH: "d", LEMMA: "would", TAG: "MD"} {ORTH: "d", LEMMA: "would", TAG: "MD"}
], ],
@ -1390,7 +1390,7 @@ TOKENIZER_EXCEPTIONS = {
], ],
"He'd've": [ "He'd've": [
{ORTH: "He", LEMMA: PRON_LEMMA}, {ORTH: "He", LEMMA: PRON_LEMMA, TAG: "PRP"},
{ORTH: "'d", LEMMA: "would", TAG: "MD"}, {ORTH: "'d", LEMMA: "would", TAG: "MD"},
{ORTH: "'ve", LEMMA: "have", TAG: "VB"} {ORTH: "'ve", LEMMA: "have", TAG: "VB"}
], ],
@ -1427,17 +1427,17 @@ TOKENIZER_EXCEPTIONS = {
], ],
"hes": [ "hes": [
{ORTH: "he", LEMMA: PRON_LEMMA}, {ORTH: "he", LEMMA: PRON_LEMMA, TAG: "PRP"},
{ORTH: "s"} {ORTH: "s"}
], ],
"he'll": [ "he'll": [
{ORTH: "he", LEMMA: PRON_LEMMA}, {ORTH: "he", LEMMA: PRON_LEMMA, TAG: "PRP"},
{ORTH: "'ll", LEMMA: "will", TAG: "MD"} {ORTH: "'ll", LEMMA: "will", TAG: "MD"}
], ],
"hed": [ "hed": [
{ORTH: "he", LEMMA: PRON_LEMMA}, {ORTH: "he", LEMMA: PRON_LEMMA, TAG: "PRP"},
{ORTH: "d", LEMMA: "would", TAG: "MD"} {ORTH: "d", LEMMA: "would", TAG: "MD"}
], ],
@ -1447,8 +1447,8 @@ TOKENIZER_EXCEPTIONS = {
], ],
"we're": [ "we're": [
{ORTH: "we", LEMMA: PRON_LEMMA}, {ORTH: "we", LEMMA: PRON_LEMMA, TAG: "PRP"},
{ORTH: "'re", LEMMA: "be"} {ORTH: "'re", LEMMA: "be", NORM: "are"}
], ],
"Hadnt": [ "Hadnt": [
@ -1457,12 +1457,12 @@ TOKENIZER_EXCEPTIONS = {
], ],
"Shant": [ "Shant": [
{ORTH: "Sha"}, {ORTH: "Sha", LEMMA: "shall"},
{ORTH: "nt", LEMMA: "not", TAG: "RB"} {ORTH: "nt", LEMMA: "not", TAG: "RB"}
], ],
"Theyve": [ "Theyve": [
{ORTH: "They", LEMMA: PRON_LEMMA}, {ORTH: "They", LEMMA: PRON_LEMMA, TAG: "PRP"},
{ORTH: "ve", LEMMA: "have", TAG: "VB"} {ORTH: "ve", LEMMA: "have", TAG: "VB"}
], ],
@ -1477,7 +1477,7 @@ TOKENIZER_EXCEPTIONS = {
], ],
"i've": [ "i've": [
{ORTH: "i", LEMMA: PRON_LEMMA}, {ORTH: "i", LEMMA: PRON_LEMMA, TAG: "PRP"},
{ORTH: "'ve", LEMMA: "have", TAG: "VB"} {ORTH: "'ve", LEMMA: "have", TAG: "VB"}
], ],
@ -1487,7 +1487,7 @@ TOKENIZER_EXCEPTIONS = {
], ],
"i'ma": [ "i'ma": [
{ORTH: "i", LEMMA: PRON_LEMMA}, {ORTH: "i", LEMMA: PRON_LEMMA, TAG: "PRP"},
{ORTH: "'ma"} {ORTH: "'ma"}
], ],
@ -1502,7 +1502,7 @@ TOKENIZER_EXCEPTIONS = {
], ],
"shant": [ "shant": [
{ORTH: "sha"}, {ORTH: "sha", LEMMA: "shall"},
{ORTH: "nt", LEMMA: "not", TAG: "RB"} {ORTH: "nt", LEMMA: "not", TAG: "RB"}
], ],
@ -1513,7 +1513,7 @@ TOKENIZER_EXCEPTIONS = {
], ],
"I'll": [ "I'll": [
{ORTH: "I", LEMMA: PRON_LEMMA}, {ORTH: "I", LEMMA: PRON_LEMMA, TAG: "PRP"},
{ORTH: "'ll", LEMMA: "will", TAG: "MD"} {ORTH: "'ll", LEMMA: "will", TAG: "MD"}
], ],
@ -1571,7 +1571,7 @@ TOKENIZER_EXCEPTIONS = {
], ],
"shes": [ "shes": [
{ORTH: "she", LEMMA: PRON_LEMMA}, {ORTH: "she", LEMMA: PRON_LEMMA, TAG: "PRP"},
{ORTH: "s"} {ORTH: "s"}
], ],
@ -1586,12 +1586,12 @@ TOKENIZER_EXCEPTIONS = {
], ],
"Hasnt": [ "Hasnt": [
{ORTH: "Has"}, {ORTH: "Has", LEMMA: "have"},
{ORTH: "nt", LEMMA: "not", TAG: "RB"} {ORTH: "nt", LEMMA: "not", TAG: "RB"}
], ],
"He's": [ "He's": [
{ORTH: "He", LEMMA: PRON_LEMMA}, {ORTH: "He", LEMMA: PRON_LEMMA, TAG: "PRP"},
{ORTH: "'s"} {ORTH: "'s"}
], ],
@ -1611,12 +1611,12 @@ TOKENIZER_EXCEPTIONS = {
], ],
"He'd": [ "He'd": [
{ORTH: "He", LEMMA: PRON_LEMMA}, {ORTH: "He", LEMMA: PRON_LEMMA, TAG: "PRP"},
{ORTH: "'d", LEMMA: "would", TAG: "MD"} {ORTH: "'d", LEMMA: "would", TAG: "MD"}
], ],
"Shes": [ "Shes": [
{ORTH: "i", LEMMA: PRON_LEMMA}, {ORTH: "She", LEMMA: PRON_LEMMA, TAG: "PRP"},
{ORTH: "s"} {ORTH: "s"}
], ],
@ -1626,7 +1626,7 @@ TOKENIZER_EXCEPTIONS = {
], ],
"Youll": [ "Youll": [
{ORTH: "You", LEMMA: PRON_LEMMA}, {ORTH: "You", LEMMA: PRON_LEMMA, TAG: "PRP"},
{ORTH: "ll", LEMMA: "will", TAG: "MD"} {ORTH: "ll", LEMMA: "will", TAG: "MD"}
], ],
@ -1636,18 +1636,18 @@ TOKENIZER_EXCEPTIONS = {
], ],
"theyll": [ "theyll": [
{ORTH: "they", LEMMA: PRON_LEMMA}, {ORTH: "they", LEMMA: PRON_LEMMA, TAG: "PRP"},
{ORTH: "ll", LEMMA: "will", TAG: "MD"} {ORTH: "ll", LEMMA: "will", TAG: "MD"}
], ],
"it'd've": [ "it'd've": [
{ORTH: "it", LEMMA: PRON_LEMMA}, {ORTH: "it", LEMMA: PRON_LEMMA, TAG: "PRP"},
{ORTH: "'d", LEMMA: "would", TAG: "MD"}, {ORTH: "'d", LEMMA: "would", TAG: "MD"},
{ORTH: "'ve", LEMMA: "have", TAG: "VB"} {ORTH: "'ve", LEMMA: "have", TAG: "VB"}
], ],
"itdve": [ "itdve": [
{ORTH: "it", LEMMA: PRON_LEMMA}, {ORTH: "it", LEMMA: PRON_LEMMA, TAG: "PRP"},
{ORTH: "d", LEMMA: "would", TAG: "MD"}, {ORTH: "d", LEMMA: "would", TAG: "MD"},
{ORTH: "ve", LEMMA: "have", TAG: "VB"} {ORTH: "ve", LEMMA: "have", TAG: "VB"}
], ],
@ -1674,7 +1674,7 @@ TOKENIZER_EXCEPTIONS = {
], ],
"Wont": [ "Wont": [
{ORTH: "Wo"}, {ORTH: "Wo", LEMMA: "will"},
{ORTH: "nt", LEMMA: "not", TAG: "RB"} {ORTH: "nt", LEMMA: "not", TAG: "RB"}
], ],
@ -1691,7 +1691,7 @@ TOKENIZER_EXCEPTIONS = {
"Whatre": [ "Whatre": [
{ORTH: "What"}, {ORTH: "What"},
{ORTH: "re"} {ORTH: "re", LEMMA: "be", NORM: "are"}
], ],
"'s": [ "'s": [
@ -1719,12 +1719,12 @@ TOKENIZER_EXCEPTIONS = {
], ],
"It'll": [ "It'll": [
{ORTH: "It", LEMMA: PRON_LEMMA}, {ORTH: "It", LEMMA: PRON_LEMMA, TAG: "PRP"},
{ORTH: "'ll", LEMMA: "will", TAG: "MD"} {ORTH: "'ll", LEMMA: "will", TAG: "MD"}
], ],
"We'd": [ "We'd": [
{ORTH: "We"}, {ORTH: "We", LEMMA: PRON_LEMMA, TAG: "PRP"},
{ORTH: "'d", LEMMA: "would", TAG: "MD"} {ORTH: "'d", LEMMA: "would", TAG: "MD"}
], ],
@ -1738,12 +1738,12 @@ TOKENIZER_EXCEPTIONS = {
], ],
"Itd": [ "Itd": [
{ORTH: "It", LEMMA: PRON_LEMMA}, {ORTH: "It", LEMMA: PRON_LEMMA, TAG: "PRP"},
{ORTH: "d", LEMMA: "would", TAG: "MD"} {ORTH: "d", LEMMA: "would", TAG: "MD"}
], ],
"she'd": [ "she'd": [
{ORTH: "she", LEMMA: PRON_LEMMA}, {ORTH: "she", LEMMA: PRON_LEMMA, TAG: "PRP"},
{ORTH: "'d", LEMMA: "would", TAG: "MD"} {ORTH: "'d", LEMMA: "would", TAG: "MD"}
], ],
@ -1758,17 +1758,17 @@ TOKENIZER_EXCEPTIONS = {
], ],
"you'll": [ "you'll": [
{ORTH: "you", LEMMA: PRON_LEMMA}, {ORTH: "you", LEMMA: PRON_LEMMA, TAG: "PRP"},
{ORTH: "'ll", LEMMA: "will", TAG: "MD"} {ORTH: "'ll", LEMMA: "will", TAG: "MD"}
], ],
"Theyd": [ "Theyd": [
{ORTH: "They", LEMMA: PRON_LEMMA}, {ORTH: "They", LEMMA: PRON_LEMMA, TAG: "PRP"},
{ORTH: "d", LEMMA: "would", TAG: "MD"} {ORTH: "d", LEMMA: "would", TAG: "MD"}
], ],
"she's": [ "she's": [
{ORTH: "she", LEMMA: PRON_LEMMA}, {ORTH: "she", LEMMA: PRON_LEMMA, TAG: "PRP"},
{ORTH: "'s"} {ORTH: "'s"}
], ],
@ -1783,7 +1783,7 @@ TOKENIZER_EXCEPTIONS = {
], ],
"'em": [ "'em": [
{ORTH: "'em", LEMMA: PRON_LEMMA} {ORTH: "'em", LEMMA: PRON_LEMMA, NORM: "them"}
], ],
"ol'": [ "ol'": [