mirror of
				https://github.com/explosion/spaCy.git
				synced 2025-10-31 07:57:35 +03:00 
			
		
		
		
	
		
			
				
	
	
		
			449 lines
		
	
	
		
			14 KiB
		
	
	
	
		
			Python
		
	
	
	
	
	
			
		
		
	
	
			449 lines
		
	
	
		
			14 KiB
		
	
	
	
		
			Python
		
	
	
	
	
	
| # coding: utf8
 | |
| from __future__ import unicode_literals
 | |
| 
 | |
| from ...symbols import ORTH, LEMMA, TAG, NORM
 | |
| from ...deprecated import PRON_LEMMA
 | |
| 
 | |
| 
 | |
| _exc = {}
 | |
| _exclude = ["Ill", "ill", "Its", "its", "Hell", "hell", "Shell", "shell",
 | |
|                "Shed", "shed", "were", "Were", "Well", "well", "Whore", "whore"]
 | |
| 
 | |
| 
 | |
| # Pronouns
 | |
| 
 | |
| for pron in ["i"]:
 | |
|     for orth in [pron, pron.title()]:
 | |
|         _exc[orth + "'m"] = [
 | |
|             {ORTH: orth, LEMMA: PRON_LEMMA, TAG: "PRP"},
 | |
|             {ORTH: "'m", LEMMA: "be", TAG: "VBP", "tenspect": 1, "number": 1}]
 | |
| 
 | |
|         _exc[orth + "m"] = [
 | |
|             {ORTH: orth, LEMMA: PRON_LEMMA, TAG: "PRP"},
 | |
|             {ORTH: "m", LEMMA: "be", TAG: "VBP", "tenspect": 1, "number": 1 }]
 | |
| 
 | |
|         _exc[orth + "'ma"] = [
 | |
|             {ORTH: orth, LEMMA: PRON_LEMMA, TAG: "PRP"},
 | |
|             {ORTH: "'m", LEMMA: "be", NORM: "am"},
 | |
|             {ORTH: "a", LEMMA: "going to", NORM: "gonna"}]
 | |
| 
 | |
|         _exc[orth + "ma"] = [
 | |
|             {ORTH: orth, LEMMA: PRON_LEMMA, TAG: "PRP"},
 | |
|             {ORTH: "m", LEMMA: "be", NORM: "am"},
 | |
|             {ORTH: "a", LEMMA: "going to", NORM: "gonna"}]
 | |
| 
 | |
| 
 | |
| for pron in ["i", "you", "he", "she", "it", "we", "they"]:
 | |
|     for orth in [pron, pron.title()]:
 | |
|         _exc[orth + "'ll"] = [
 | |
|             {ORTH: orth, LEMMA: PRON_LEMMA, TAG: "PRP"},
 | |
|             {ORTH: "'ll", LEMMA: "will", TAG: "MD"}]
 | |
| 
 | |
|         _exc[orth + "ll"] = [
 | |
|             {ORTH: orth, LEMMA: PRON_LEMMA, TAG: "PRP"},
 | |
|             {ORTH: "ll", LEMMA: "will", TAG: "MD"}]
 | |
| 
 | |
|         _exc[orth + "'ll've"] = [
 | |
|             {ORTH: orth, LEMMA: PRON_LEMMA, TAG: "PRP"},
 | |
|             {ORTH: "'ll", LEMMA: "will", TAG: "MD"},
 | |
|             {ORTH: "'ve", LEMMA: "have", TAG: "VB"}]
 | |
| 
 | |
|         _exc[orth + "llve"] = [
 | |
|             {ORTH: orth, LEMMA: PRON_LEMMA, TAG: "PRP"},
 | |
|             {ORTH: "ll", LEMMA: "will", TAG: "MD"},
 | |
|             {ORTH: "ve", LEMMA: "have", TAG: "VB"}]
 | |
| 
 | |
|         _exc[orth + "'d"] = [
 | |
|             {ORTH: orth, LEMMA: PRON_LEMMA, TAG: "PRP"},
 | |
|             {ORTH: "'d", LEMMA: "would", TAG: "MD"}]
 | |
| 
 | |
|         _exc[orth + "d"] = [
 | |
|             {ORTH: orth, LEMMA: PRON_LEMMA, TAG: "PRP"},
 | |
|             {ORTH: "d", LEMMA: "would", TAG: "MD"}]
 | |
| 
 | |
|         _exc[orth + "'d've"] = [
 | |
|             {ORTH: orth, LEMMA: PRON_LEMMA, TAG: "PRP"},
 | |
|             {ORTH: "'d", LEMMA: "would", TAG: "MD"},
 | |
|             {ORTH: "'ve", LEMMA: "have", TAG: "VB"}]
 | |
| 
 | |
|         _exc[orth + "dve"] = [
 | |
|             {ORTH: orth, LEMMA: PRON_LEMMA, TAG: "PRP"},
 | |
|             {ORTH: "d", LEMMA: "would", TAG: "MD"},
 | |
|             {ORTH: "ve", LEMMA: "have", TAG: "VB"}]
 | |
| 
 | |
| 
 | |
| for pron in ["i", "you", "we", "they"]:
 | |
|     for orth in [pron, pron.title()]:
 | |
|         _exc[orth + "'ve"] = [
 | |
|             {ORTH: orth, LEMMA: PRON_LEMMA, TAG: "PRP"},
 | |
|             {ORTH: "'ve", LEMMA: "have", TAG: "VB"}]
 | |
| 
 | |
|         _exc[orth + "ve"] = [
 | |
|             {ORTH: orth, LEMMA: PRON_LEMMA, TAG: "PRP"},
 | |
|             {ORTH: "ve", LEMMA: "have", TAG: "VB"}]
 | |
| 
 | |
| 
 | |
| for pron in ["you", "we", "they"]:
 | |
|     for orth in [pron, pron.title()]:
 | |
|         _exc[orth + "'re"] = [
 | |
|             {ORTH: orth, LEMMA: PRON_LEMMA, TAG: "PRP"},
 | |
|             {ORTH: "'re", LEMMA: "be", NORM: "are"}]
 | |
| 
 | |
|         _exc[orth + "re"] = [
 | |
|             {ORTH: orth, LEMMA: PRON_LEMMA, TAG: "PRP"},
 | |
|             {ORTH: "re", LEMMA: "be", NORM: "are", TAG: "VBZ"}]
 | |
| 
 | |
| 
 | |
| for pron in ["he", "she", "it"]:
 | |
|     for orth in [pron, pron.title()]:
 | |
|         _exc[orth + "'s"] = [
 | |
|             {ORTH: orth, LEMMA: PRON_LEMMA, TAG: "PRP"},
 | |
|             {ORTH: "'s"}]
 | |
| 
 | |
|         _exc[orth + "s"] = [
 | |
|             {ORTH: orth, LEMMA: PRON_LEMMA, TAG: "PRP"},
 | |
|             {ORTH: "s"}]
 | |
| 
 | |
| 
 | |
| # W-words, relative pronouns, prepositions etc.
 | |
| 
 | |
| for word in ["who", "what", "when", "where", "why", "how", "there", "that"]:
 | |
|     for orth in [word, word.title()]:
 | |
|         _exc[orth + "'s"] = [
 | |
|             {ORTH: orth, LEMMA: word},
 | |
|             {ORTH: "'s"}]
 | |
| 
 | |
|         _exc[orth + "s"] = [
 | |
|             {ORTH: orth, LEMMA: word},
 | |
|             {ORTH: "s"}]
 | |
| 
 | |
|         _exc[orth + "'ll"] = [
 | |
|             {ORTH: orth, LEMMA: word},
 | |
|             {ORTH: "'ll", LEMMA: "will", TAG: "MD"}]
 | |
| 
 | |
|         _exc[orth + "ll"] = [
 | |
|             {ORTH: orth, LEMMA: word},
 | |
|             {ORTH: "ll", LEMMA: "will", TAG: "MD"}]
 | |
| 
 | |
|         _exc[orth + "'ll've"] = [
 | |
|             {ORTH: orth, LEMMA: word},
 | |
|             {ORTH: "'ll", LEMMA: "will", TAG: "MD"},
 | |
|             {ORTH: "'ve", LEMMA: "have", TAG: "VB"}]
 | |
| 
 | |
|         _exc[orth + "llve"] = [
 | |
|             {ORTH: orth, LEMMA: word},
 | |
|             {ORTH: "ll", LEMMA: "will", TAG: "MD"},
 | |
|             {ORTH: "ve", LEMMA: "have", TAG: "VB"}]
 | |
| 
 | |
|         _exc[orth + "'re"] = [
 | |
|             {ORTH: orth, LEMMA: word},
 | |
|             {ORTH: "'re", LEMMA: "be", NORM: "are"}]
 | |
| 
 | |
|         _exc[orth + "re"] = [
 | |
|             {ORTH: orth, LEMMA: word},
 | |
|             {ORTH: "re", LEMMA: "be", NORM: "are"}]
 | |
| 
 | |
|         _exc[orth + "'ve"] = [
 | |
|             {ORTH: orth},
 | |
|             {ORTH: "'ve", LEMMA: "have", TAG: "VB"}]
 | |
| 
 | |
|         _exc[orth + "ve"] = [
 | |
|             {ORTH: orth, LEMMA: word},
 | |
|             {ORTH: "ve", LEMMA: "have", TAG: "VB"}]
 | |
| 
 | |
|         _exc[orth + "'d"] = [
 | |
|             {ORTH: orth, LEMMA: word},
 | |
|             {ORTH: "'d"}]
 | |
| 
 | |
|         _exc[orth + "d"] = [
 | |
|             {ORTH: orth, LEMMA: word},
 | |
|             {ORTH: "d"}]
 | |
| 
 | |
|         _exc[orth + "'d've"] = [
 | |
|             {ORTH: orth, LEMMA: word},
 | |
|             {ORTH: "'d", LEMMA: "would", TAG: "MD"},
 | |
|             {ORTH: "'ve", LEMMA: "have", TAG: "VB"}]
 | |
| 
 | |
|         _exc[orth + "dve"] = [
 | |
|             {ORTH: orth, LEMMA: word},
 | |
|             {ORTH: "d", LEMMA: "would", TAG: "MD"},
 | |
|             {ORTH: "ve", LEMMA: "have", TAG: "VB"}]
 | |
| 
 | |
| 
 | |
| # Verbs
 | |
| 
 | |
| for verb_data in [
 | |
|     {ORTH: "ca", LEMMA: "can", TAG: "MD"},
 | |
|     {ORTH: "could", TAG: "MD"},
 | |
|     {ORTH: "do", LEMMA: "do"},
 | |
|     {ORTH: "does", LEMMA: "do"},
 | |
|     {ORTH: "did", LEMMA: "do", TAG: "VBD"},
 | |
|     {ORTH: "had", LEMMA: "have", TAG: "VBD"},
 | |
|     {ORTH: "may", TAG: "MD"},
 | |
|     {ORTH: "might", TAG: "MD"},
 | |
|     {ORTH: "must", TAG: "MD"},
 | |
|     {ORTH: "need"},
 | |
|     {ORTH: "ought"},
 | |
|     {ORTH: "sha", LEMMA: "shall", TAG: "MD"},
 | |
|     {ORTH: "should", TAG: "MD"},
 | |
|     {ORTH: "wo", LEMMA: "will", TAG: "MD"},
 | |
|     {ORTH: "would", TAG: "MD"}]:
 | |
|     verb_data_tc = dict(verb_data)
 | |
|     verb_data_tc[ORTH] = verb_data_tc[ORTH].title()
 | |
|     for data in [verb_data, verb_data_tc]:
 | |
|         _exc[data[ORTH] + "n't"] = [
 | |
|             dict(data),
 | |
|             {ORTH: "n't", LEMMA: "not", TAG: "RB"}]
 | |
| 
 | |
|         _exc[data[ORTH] + "nt"] = [
 | |
|             dict(data),
 | |
|             {ORTH: "nt", LEMMA: "not", TAG: "RB"}]
 | |
| 
 | |
|         _exc[data[ORTH] + "n't've"] = [
 | |
|             dict(data),
 | |
|             {ORTH: "n't", LEMMA: "not", TAG: "RB"},
 | |
|             {ORTH: "'ve", LEMMA: "have", TAG: "VB"}]
 | |
| 
 | |
|         _exc[data[ORTH] + "ntve"] = [
 | |
|             dict(data),
 | |
|             {ORTH: "nt", LEMMA: "not", TAG: "RB"},
 | |
|             {ORTH: "ve", LEMMA: "have", TAG: "VB"}]
 | |
| 
 | |
| 
 | |
| for verb_data in [
 | |
|     {ORTH: "could", TAG: "MD"},
 | |
|     {ORTH: "might"},
 | |
|     {ORTH: "must"},
 | |
|     {ORTH: "should"}]:
 | |
|     verb_data_tc = dict(verb_data)
 | |
|     verb_data_tc[ORTH] = verb_data_tc[ORTH].title()
 | |
|     for data in [verb_data, verb_data_tc]:
 | |
|         _exc[data[ORTH] + "'ve"] = [
 | |
|             dict(data),
 | |
|             {ORTH: "'ve", LEMMA: "have", TAG: "VB"}]
 | |
| 
 | |
|         _exc[data[ORTH] + "ve"] = [
 | |
|             dict(data),
 | |
|             {ORTH: "ve", LEMMA: "have", TAG: "VB"}]
 | |
| 
 | |
| 
 | |
| for verb_data in [
 | |
|     {ORTH: "ai", TAG: "VBP", "number": 2, LEMMA: "be"},
 | |
|     {ORTH: "are", LEMMA: "be", TAG: "VBP", "number": 2},
 | |
|     {ORTH: "is", LEMMA: "be", TAG: "VBZ"},
 | |
|     {ORTH: "was", LEMMA: "be"},
 | |
|     {ORTH: "were", LEMMA: "be"}]:
 | |
|     verb_data_tc = dict(verb_data)
 | |
|     verb_data_tc[ORTH] = verb_data_tc[ORTH].title()
 | |
|     for data in [verb_data, verb_data_tc]:
 | |
|         _exc[data[ORTH] + "n't"] = [
 | |
|             dict(data),
 | |
|             {ORTH: "n't", LEMMA: "not", TAG: "RB"}]
 | |
| 
 | |
|         _exc[data[ORTH] + "nt"] = [
 | |
|             dict(data),
 | |
|             {ORTH: "nt", LEMMA: "not", TAG: "RB"}]
 | |
| 
 | |
| 
 | |
| # Other contractions with trailing apostrophe
 | |
| 
 | |
| for exc_data in [
 | |
|     {ORTH: "doin", LEMMA: "do", NORM: "doing"},
 | |
|     {ORTH: "goin", LEMMA: "go", NORM: "going"},
 | |
|     {ORTH: "nothin", LEMMA: "nothing"},
 | |
|     {ORTH: "nuthin", LEMMA: "nothing"},
 | |
|     {ORTH: "ol", LEMMA: "old"},
 | |
|     {ORTH: "somethin", LEMMA: "something"}]:
 | |
|     exc_data_tc = dict(exc_data)
 | |
|     exc_data_tc[ORTH] = exc_data_tc[ORTH].title()
 | |
|     for data in [exc_data, exc_data_tc]:
 | |
|         data_apos = dict(data)
 | |
|         data_apos[ORTH] = data_apos[ORTH] + "'"
 | |
|         _exc[data[ORTH]] = [dict(data)]
 | |
|         _exc[data_apos[ORTH]] = [dict(data_apos)]
 | |
| 
 | |
| 
 | |
| # Other contractions with leading apostrophe
 | |
| 
 | |
| for exc_data in [
 | |
|     {ORTH: "cause", LEMMA: "because"},
 | |
|     {ORTH: "em", LEMMA: PRON_LEMMA, NORM: "them"},
 | |
|     {ORTH: "ll", LEMMA: "will"},
 | |
|     {ORTH: "nuff", LEMMA: "enough"}]:
 | |
|     exc_data_apos = dict(exc_data)
 | |
|     exc_data_apos[ORTH] = "'" + exc_data_apos[ORTH]
 | |
|     for data in [exc_data, exc_data_apos]:
 | |
|         _exc[data[ORTH]] = [dict(data)]
 | |
| 
 | |
| 
 | |
| # Times
 | |
| 
 | |
| for h in range(1, 12 + 1):
 | |
|     for period in ["a.m.", "am"]:
 | |
|         _exc["%d%s" % (h, period)] = [
 | |
|             {ORTH: "%d" % h},
 | |
|             {ORTH: period, LEMMA: "a.m."}]
 | |
|     for period in ["p.m.", "pm"]:
 | |
|         _exc["%d%s" % (h, period)] = [
 | |
|             {ORTH: "%d" % h},
 | |
|             {ORTH: period, LEMMA: "p.m."}]
 | |
| 
 | |
| 
 | |
| # Rest
 | |
| 
 | |
| _other_exc = {
 | |
|     "y'all": [
 | |
|         {ORTH: "y'", LEMMA: PRON_LEMMA, NORM: "you"},
 | |
|         {ORTH: "all"}],
 | |
| 
 | |
|     "yall": [
 | |
|         {ORTH: "y", LEMMA: PRON_LEMMA, NORM: "you"},
 | |
|         {ORTH: "all"}],
 | |
| 
 | |
|     "how'd'y": [
 | |
|         {ORTH: "how", LEMMA: "how"},
 | |
|         {ORTH: "'d", LEMMA: "do"},
 | |
|         {ORTH: "'y", LEMMA: PRON_LEMMA, NORM: "you"}],
 | |
| 
 | |
|     "How'd'y": [
 | |
|         {ORTH: "How", LEMMA: "how"},
 | |
|         {ORTH: "'d", LEMMA: "do"},
 | |
|         {ORTH: "'y", LEMMA: PRON_LEMMA, NORM: "you"}],
 | |
| 
 | |
|     "not've": [
 | |
|         {ORTH: "not", LEMMA: "not", TAG: "RB"},
 | |
|         {ORTH: "'ve", LEMMA: "have", TAG: "VB"}],
 | |
| 
 | |
|     "notve": [
 | |
|         {ORTH: "not", LEMMA: "not", TAG: "RB"},
 | |
|         {ORTH: "ve", LEMMA: "have", TAG: "VB"}],
 | |
| 
 | |
|     "Not've": [
 | |
|         {ORTH: "Not", LEMMA: "not", TAG: "RB"},
 | |
|         {ORTH: "'ve", LEMMA: "have", TAG: "VB"}],
 | |
| 
 | |
|     "Notve": [
 | |
|         {ORTH: "Not", LEMMA: "not", TAG: "RB"},
 | |
|         {ORTH: "ve", LEMMA: "have", TAG: "VB"}],
 | |
| 
 | |
|     "cannot": [
 | |
|         {ORTH: "can", LEMMA: "can", TAG: "MD"},
 | |
|         {ORTH: "not", LEMMA: "not", TAG: "RB"}],
 | |
| 
 | |
|     "Cannot": [
 | |
|         {ORTH: "Can", LEMMA: "can", TAG: "MD"},
 | |
|         {ORTH: "not", LEMMA: "not", TAG: "RB"}],
 | |
| 
 | |
|     "gonna": [
 | |
|         {ORTH: "gon", LEMMA: "go", NORM: "going"},
 | |
|         {ORTH: "na", LEMMA: "to"}],
 | |
| 
 | |
|     "Gonna": [
 | |
|         {ORTH: "Gon", LEMMA: "go", NORM: "going"},
 | |
|         {ORTH: "na", LEMMA: "to"}],
 | |
| 
 | |
|     "gotta": [
 | |
|         {ORTH: "got"},
 | |
|         {ORTH: "ta", LEMMA: "to"}],
 | |
| 
 | |
|     "Gotta": [
 | |
|         {ORTH: "Got"},
 | |
|         {ORTH: "ta", LEMMA: "to"}],
 | |
| 
 | |
|     "let's": [
 | |
|         {ORTH: "let"},
 | |
|         {ORTH: "'s", LEMMA: PRON_LEMMA, NORM: "us"}],
 | |
| 
 | |
|     "Let's": [
 | |
|         {ORTH: "Let", LEMMA: "let"},
 | |
|         {ORTH: "'s", LEMMA: PRON_LEMMA, NORM: "us"}]
 | |
| }
 | |
| 
 | |
| _exc.update(_other_exc)
 | |
| 
 | |
| 
 | |
| for exc_data in [
 | |
|     {ORTH: "'S", LEMMA: "'s"},
 | |
|     {ORTH: "'s", LEMMA: "'s"},
 | |
|     {ORTH: "\u2018S", LEMMA: "'s"},
 | |
|     {ORTH: "\u2018s", LEMMA: "'s"},
 | |
|     {ORTH: "and/or", LEMMA: "and/or", TAG: "CC"},
 | |
|     {ORTH: "'re", LEMMA: "be", NORM: "are"},
 | |
|     {ORTH: "'Cause", LEMMA: "because"},
 | |
|     {ORTH: "'cause", LEMMA: "because"},
 | |
|     {ORTH: "ma'am", LEMMA: "madam"},
 | |
|     {ORTH: "Ma'am", LEMMA: "madam"},
 | |
|     {ORTH: "o'clock", LEMMA: "o'clock"},
 | |
|     {ORTH: "O'clock", LEMMA: "o'clock"},
 | |
| 
 | |
|     {ORTH: "Mt.", LEMMA: "Mount"},
 | |
|     {ORTH: "Ak.", LEMMA: "Alaska"},
 | |
|     {ORTH: "Ala.", LEMMA: "Alabama"},
 | |
|     {ORTH: "Apr.", LEMMA: "April"},
 | |
|     {ORTH: "Ariz.", LEMMA: "Arizona"},
 | |
|     {ORTH: "Ark.", LEMMA: "Arkansas"},
 | |
|     {ORTH: "Aug.", LEMMA: "August"},
 | |
|     {ORTH: "Calif.", LEMMA: "California"},
 | |
|     {ORTH: "Colo.", LEMMA: "Colorado"},
 | |
|     {ORTH: "Conn.", LEMMA: "Connecticut"},
 | |
|     {ORTH: "Dec.", LEMMA: "December"},
 | |
|     {ORTH: "Del.", LEMMA: "Delaware"},
 | |
|     {ORTH: "Feb.", LEMMA: "February"},
 | |
|     {ORTH: "Fla.", LEMMA: "Florida"},
 | |
|     {ORTH: "Ga.", LEMMA: "Georgia"},
 | |
|     {ORTH: "Ia.", LEMMA: "Iowa"},
 | |
|     {ORTH: "Id.", LEMMA: "Idaho"},
 | |
|     {ORTH: "Ill.", LEMMA: "Illinois"},
 | |
|     {ORTH: "Ind.", LEMMA: "Indiana"},
 | |
|     {ORTH: "Jan.", LEMMA: "January"},
 | |
|     {ORTH: "Jul.", LEMMA: "July"},
 | |
|     {ORTH: "Jun.", LEMMA: "June"},
 | |
|     {ORTH: "Kan.", LEMMA: "Kansas"},
 | |
|     {ORTH: "Kans.", LEMMA: "Kansas"},
 | |
|     {ORTH: "Ky.", LEMMA: "Kentucky"},
 | |
|     {ORTH: "La.", LEMMA: "Louisiana"},
 | |
|     {ORTH: "Mar.", LEMMA: "March"},
 | |
|     {ORTH: "Mass.", LEMMA: "Massachusetts"},
 | |
|     {ORTH: "May.", LEMMA: "May"},
 | |
|     {ORTH: "Mich.", LEMMA: "Michigan"},
 | |
|     {ORTH: "Minn.", LEMMA: "Minnesota"},
 | |
|     {ORTH: "Miss.", LEMMA: "Mississippi"},
 | |
|     {ORTH: "N.C.", LEMMA: "North Carolina"},
 | |
|     {ORTH: "N.D.", LEMMA: "North Dakota"},
 | |
|     {ORTH: "N.H.", LEMMA: "New Hampshire"},
 | |
|     {ORTH: "N.J.", LEMMA: "New Jersey"},
 | |
|     {ORTH: "N.M.", LEMMA: "New Mexico"},
 | |
|     {ORTH: "N.Y.", LEMMA: "New York"},
 | |
|     {ORTH: "Neb.", LEMMA: "Nebraska"},
 | |
|     {ORTH: "Nebr.", LEMMA: "Nebraska"},
 | |
|     {ORTH: "Nev.", LEMMA: "Nevada"},
 | |
|     {ORTH: "Nov.", LEMMA: "November"},
 | |
|     {ORTH: "Oct.", LEMMA: "October"},
 | |
|     {ORTH: "Okla.", LEMMA: "Oklahoma"},
 | |
|     {ORTH: "Ore.", LEMMA: "Oregon"},
 | |
|     {ORTH: "Pa.", LEMMA: "Pennsylvania"},
 | |
|     {ORTH: "S.C.", LEMMA: "South Carolina"},
 | |
|     {ORTH: "Sep.", LEMMA: "September"},
 | |
|     {ORTH: "Sept.", LEMMA: "September"},
 | |
|     {ORTH: "Tenn.", LEMMA: "Tennessee"},
 | |
|     {ORTH: "Va.", LEMMA: "Virginia"},
 | |
|     {ORTH: "Wash.", LEMMA: "Washington"},
 | |
|     {ORTH: "Wis.", LEMMA: "Wisconsin"}]:
 | |
|     _exc[exc_data[ORTH]] = [dict(exc_data)]
 | |
| 
 | |
| 
 | |
| for orth in [
 | |
|     "'d", "a.m.", "Adm.", "Bros.", "co.", "Co.", "Corp.", "D.C.", "Dr.", "e.g.",
 | |
|     "E.g.", "E.G.", "Gen.", "Gov.", "i.e.", "I.e.", "I.E.", "Inc.", "Jr.",
 | |
|     "Ltd.", "Md.", "Messrs.", "Mo.", "Mont.", "Mr.", "Mrs.", "Ms.", "p.m.",
 | |
|     "Ph.D.", "Rep.", "Rev.", "Sen.", "St.", "vs."]:
 | |
|     _exc[orth] = [{ORTH: orth}]
 | |
| 
 | |
| 
 | |
| for string in _exclude:
 | |
|     if string in _exc:
 | |
|         _exc.pop(string)
 | |
| 
 | |
| 
 | |
| TOKENIZER_EXCEPTIONS = dict(_exc)
 |