	Fix Issue #736: Times were being tokenized with incorrect string values.
parent a6790b6694
commit fba67fa342
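The gist of the fix, as a minimal sketch: hour keys are now built with "%s" from string hours, so every exception key concatenates exactly from its tokens' ORTH strings. In the real code ORTH and LEMMA are spaCy symbol constants; plain strings stand in for them here, and the hour strings match the updated call sites in the diff below.

# Minimal sketch of the corrected get_time_exc, assuming string hours as in
# the updated call sites below. ORTH/LEMMA are spaCy symbols in the real code;
# plain strings stand in for them here.
ORTH, LEMMA = "orth", "lemma"

def get_time_exc(hours):
    exc = {}
    for hour in hours:
        # "%s" keeps the key consistent with the string ORTH value;
        # the old "%d" formatting assumed integer hours.
        exc["%sa.m." % hour] = [{ORTH: hour}, {ORTH: "a.m."}]
        exc["%spm" % hour] = [{ORTH: hour}, {ORTH: "pm", LEMMA: "p.m."}]
    return exc

exc = get_time_exc(['%d' % hour for hour in range(1, 12 + 1)])
assert exc["3a.m."][0][ORTH] == "3"   # ORTH is a string now, not the int 3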
					
@@ -15,25 +15,26 @@ from .morph_rules import MORPH_RULES
 def get_time_exc(hours):
     exc = {}
     for hour in hours:
-        exc["%da.m." % hour] = [
+        exc["%sa.m." % hour] = [
             {ORTH: hour},
             {ORTH: "a.m."}
         ]
 
-        exc["%dp.m." % hour] = [
+        exc["%sp.m." % hour] = [
             {ORTH: hour},
             {ORTH: "p.m."}
         ]
 
-        exc["%dam" % hour] = [
+        exc["%sam" % hour] = [
             {ORTH: hour},
             {ORTH: "am", LEMMA: "a.m."}
         ]
 
-        exc["%dpm" % hour] = [
+        exc["%spm" % hour] = [
             {ORTH: hour},
             {ORTH: "pm", LEMMA: "p.m."}
         ]
+    print(exc)
     return exc
 
 
@@ -43,7 +44,8 @@ STOP_WORDS = set(STOP_WORDS)
 
 TOKENIZER_EXCEPTIONS = dict(TOKENIZER_EXCEPTIONS)
 update_exc(TOKENIZER_EXCEPTIONS, strings_to_exc(ORTH_ONLY))
-update_exc(TOKENIZER_EXCEPTIONS, get_time_exc(range(1, 12 + 1)))
+update_exc(TOKENIZER_EXCEPTIONS, get_time_exc(
+    ['%d' % hour for hour in range(1, 12 + 1)]))
 update_exc(TOKENIZER_EXCEPTIONS, expand_exc(TOKENIZER_EXCEPTIONS, "'", "’"))
 update_exc(TOKENIZER_EXCEPTIONS, strings_to_exc(base.EMOTICONS))
 update_exc(TOKENIZER_EXCEPTIONS, strings_to_exc(base.ABBREVIATIONS))

@@ -20,18 +20,18 @@ for pron in ["i"]:
         ]
 
         EXC[orth + "m"] = [
-            {ORTH: pron, LEMMA: PRON_LEMMA, TAG: "PRP"},
+            {ORTH: orth, LEMMA: PRON_LEMMA, TAG: "PRP"},
             {ORTH: "m", LEMMA: "be", TAG: "VBP", "tenspect": 1, "number": 1 }
         ]
 
         EXC[orth + "'ma"] = [
-            {ORTH: pron, LEMMA: PRON_LEMMA, TAG: "PRP"},
+            {ORTH: orth, LEMMA: PRON_LEMMA, TAG: "PRP"},
             {ORTH: "'m", LEMMA: "be", NORM: "am"},
             {ORTH: "a", LEMMA: "going to", NORM: "gonna"}
         ]
 
         EXC[orth + "ma"] = [
-            {ORTH: pron, LEMMA: PRON_LEMMA, TAG: "PRP"},
+            {ORTH: orth, LEMMA: PRON_LEMMA, TAG: "PRP"},
             {ORTH: "m", LEMMA: "be", NORM: "am"},
             {ORTH: "a", LEMMA: "going to", NORM: "gonna"}
         ]
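The {ORTH: pron} to {ORTH: orth} change matters because the dictionary key is built from orth, so the first token's ORTH has to be that same surface form. A rough sketch, assuming the surrounding loop derives capitalisation variants of pron (the variant loop is not shown in the hunk):

# Rough sketch; the variant loop below is an assumption, the hunk only shows
# `for pron in ["i"]:` and the use of `orth` inside it.
ORTH = "orth"
EXC = {}
for pron in ["i"]:
    for orth in [pron, pron.title()]:   # assumed: lowercase and capitalised forms
        EXC[orth + "m"] = [{ORTH: orth}, {ORTH: "m"}]

# The key "Im" now concatenates from its ORTH values; with {ORTH: pron}
# the first token of "Im" would have been "i" and the new check would reject it.
assert "".join(t[ORTH] for t in EXC["Im"]) == "Im"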
@@ -150,8 +150,8 @@ for word in ["who", "what", "when", "where", "why", "how", "there", "that"]:
 
         EXC[orth + "'ll've"] = [
             {ORTH: orth, LEMMA: word},
-            {ORTH: "ll", LEMMA: "will", TAG: "MD"},
-            {ORTH: "ve", LEMMA: "have", TAG: "VB"}
+            {ORTH: "'ll", LEMMA: "will", TAG: "MD"},
+            {ORTH: "'ve", LEMMA: "have", TAG: "VB"}
         ]
 
         EXC[orth + "llve"] = [
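Same idea for the wh-contractions: the key "who'll've" contains the apostrophes, so the split tokens need them in their ORTH values as well. A small check under that assumption (ORTH again a plain-string stand-in):

ORTH = "orth"
tokens = [{ORTH: "who"}, {ORTH: "'ll"}, {ORTH: "'ve"}]
# The ORTH values must concatenate back to the exception key.
assert "".join(t[ORTH] for t in tokens) == "who'll've"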
@@ -237,11 +237,13 @@ for verb_data in [
         ]
 
         EXC[data[ORTH] + "n't've"] = [
+            dict(data),
             {ORTH: "n't", LEMMA: "not", TAG: "RB"},
             {ORTH: "'ve", LEMMA: "have", TAG: "VB"}
         ]
 
         EXC[data[ORTH] + "ntve"] = [
+            dict(data),
             {ORTH: "nt", LEMMA: "not", TAG: "RB"},
             {ORTH: "ve", LEMMA: "have", TAG: "VB"}
         ]
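Here the n't've and ntve exceptions were missing their first token entirely; dict(data) prepends a copy of the verb's attribute dict so each entry owns its own first token and the pieces join back up to the key. A sketch, assuming a hypothetical verb entry shaped like {ORTH: "could", ...}:

# Sketch with an assumed verb entry; dict(data) copies it so the two
# exceptions don't share (and mutate) the same dict.
ORTH = "orth"
data = {ORTH: "could"}
EXC = {}
EXC[data[ORTH] + "n't've"] = [dict(data), {ORTH: "n't"}, {ORTH: "'ve"}]
EXC[data[ORTH] + "ntve"] = [dict(data), {ORTH: "nt"}, {ORTH: "ve"}]
assert "".join(t[ORTH] for t in EXC["couldn't've"]) == "couldn't've"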
@@ -18,22 +18,22 @@ def get_time_exc(hours):
     }
 
     for hour in hours:
-        exc["%da.m." % hour] = [
+        exc["%sa.m." % hour] = [
             {ORTH: hour},
             {ORTH: "a.m."}
         ]
 
-        exc["%dp.m." % hour] = [
+        exc["%sp.m." % hour] = [
             {ORTH: hour},
             {ORTH: "p.m."}
         ]
 
-        exc["%dam" % hour] = [
+        exc["%sam" % hour] = [
             {ORTH: hour},
             {ORTH: "am", LEMMA: "a.m."}
         ]
 
-        exc["%dpm" % hour] = [
+        exc["%spm" % hour] = [
             {ORTH: hour},
             {ORTH: "pm", LEMMA: "p.m."}
         ]
@@ -45,7 +45,8 @@ STOP_WORDS = set(STOP_WORDS)
 
 TOKENIZER_EXCEPTIONS = dict(TOKENIZER_EXCEPTIONS)
 update_exc(TOKENIZER_EXCEPTIONS, strings_to_exc(ORTH_ONLY))
-update_exc(TOKENIZER_EXCEPTIONS, get_time_exc(range(1, 12 + 1)))
+update_exc(TOKENIZER_EXCEPTIONS, get_time_exc(
+    ['%d' % hour for hour in range(1, 12 + 1)]))
 update_exc(TOKENIZER_EXCEPTIONS, strings_to_exc(base.EMOTICONS))
 update_exc(TOKENIZER_EXCEPTIONS, strings_to_exc(base.ABBREVIATIONS))
 
@@ -3,6 +3,11 @@ from __future__ import unicode_literals
 
 from ..symbols import *
 
+try:
+    unicode
+except:
+    unicode = str
+
 
 PRON_LEMMA = "-PRON-"
 DET_LEMMA = "-DET-"
@@ -10,6 +15,15 @@ ENT_ID = "ent_id"
 
 
 def update_exc(exc, additions):
+    for orth, token_attrs in additions.items():
+        if not all(isinstance(attr[ORTH], unicode) for attr in token_attrs):
+            msg = "Invalid value for ORTH in exception: key='%s', orths='%s'"
+            raise ValueError(msg % (orth, token_attrs))
+        described_orth = ''.join(attr[ORTH] for attr in token_attrs)
+        if orth != described_orth:
+            # TODO: Better error
+            msg = "Invalid tokenizer exception: key='%s', orths='%s'"
+            raise ValueError(msg % (orth, described_orth))
     overlap = set(exc.keys()).intersection(set(additions))
     assert not overlap, overlap
     exc.update(additions)
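These new checks in update_exc are what turn the original bug into a loud error: every ORTH value must be a string, and the exception key must equal the concatenation of its tokens' ORTH values. A standalone sketch of that behaviour (ORTH is a spaCy symbol in the real code; a plain string stands in here):

ORTH = "orth"
unicode = str   # mirrors the try/except alias added above

def update_exc(exc, additions):
    for orth, token_attrs in additions.items():
        if not all(isinstance(attr[ORTH], unicode) for attr in token_attrs):
            raise ValueError("Invalid value for ORTH in exception: key='%s', orths='%s'"
                             % (orth, token_attrs))
        described_orth = ''.join(attr[ORTH] for attr in token_attrs)
        if orth != described_orth:
            raise ValueError("Invalid tokenizer exception: key='%s', orths='%s'"
                             % (orth, described_orth))
    exc.update(additions)

exc = {}
update_exc(exc, {"8a.m.": [{ORTH: "8"}, {ORTH: "a.m."}]})    # passes
try:
    update_exc(exc, {"9a.m.": [{ORTH: 9}, {ORTH: "a.m."}]})  # int ORTH: the old bug
except ValueError as err:
    print(err)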