mirror of
				https://github.com/explosion/spaCy.git
				synced 2025-11-04 01:48:04 +03:00 
			
		
		
		
	Add tokenizer exceptions for a.m. and p.m. in Spanish
This commit is contained in:
		
							parent
							
								
									d1a2846750
								
							
						
					
					
						commit
						3c87c71d43
					
				| 
						 | 
					@ -3,12 +3,45 @@ from __future__ import unicode_literals
 | 
				
			||||||
 | 
					
 | 
				
			||||||
from .. import language_data as base
 | 
					from .. import language_data as base
 | 
				
			||||||
from ..language_data import update_exc, strings_to_exc
 | 
					from ..language_data import update_exc, strings_to_exc
 | 
				
			||||||
 | 
					from ..symbols import ORTH, LEMMA
 | 
				
			||||||
 | 
					
 | 
				
			||||||
from .stop_words import STOP_WORDS
 | 
					from .stop_words import STOP_WORDS
 | 
				
			||||||
from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS, ORTH_ONLY
 | 
					from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS, ORTH_ONLY
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					def get_time_exc(hours):
					    """Build tokenizer exceptions for clock times written without a space.

					    For every hour in `hours`, four exception keys are generated:
					    "<hour>a.m.", "<hour>p.m.", "<hour>am" and "<hour>pm". Each maps to
					    two token descriptions: the hour and the meridiem marker. The
					    undotted markers "am"/"pm" carry the dotted form as their LEMMA.
					    A fixed entry for "12m." (split into "12" and "m.", with LEMMA
					    "p.m.") is always included.

					    hours: iterable of integers (formatted with "%d").
					    RETURNS (dict): exception string -> list of token attribute dicts.
					    """
					    exc = {
					        "12m.": [
					            {ORTH: "12"},
					            {ORTH: "m.", LEMMA: "p.m."}
					        ]
					    }

					    for hour in hours:
					        # ORTH has to be the token's text, i.e. a string whose
					        # concatenation with the marker reproduces the exception key.
					        # The raw integer from range() does not satisfy that, so the
					        # hour is formatted once and reused for both key and ORTH.
					        hour_orth = "%d" % hour

					        exc[hour_orth + "a.m."] = [
					            {ORTH: hour_orth},
					            {ORTH: "a.m."}
					        ]

					        exc[hour_orth + "p.m."] = [
					            {ORTH: hour_orth},
					            {ORTH: "p.m."}
					        ]

					        exc[hour_orth + "am"] = [
					            {ORTH: hour_orth},
					            {ORTH: "am", LEMMA: "a.m."}
					        ]

					        exc[hour_orth + "pm"] = [
					            {ORTH: hour_orth},
					            {ORTH: "pm", LEMMA: "p.m."}
					        ]
					    return exc
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
TOKENIZER_EXCEPTIONS = dict(TOKENIZER_EXCEPTIONS)
 | 
					TOKENIZER_EXCEPTIONS = dict(TOKENIZER_EXCEPTIONS)
 | 
				
			||||||
 | 
					# Generate the a.m./p.m. time exceptions for hours 1 through 12 (the
					# upper bound of range() is exclusive, hence 12 + 1) and hand them to
					# update_exc together with the copied TOKENIZER_EXCEPTIONS dict.
					# NOTE(review): update_exc presumably merges the new entries into the
					# dict in place -- confirm against ..language_data.
					update_exc(TOKENIZER_EXCEPTIONS, get_time_exc(range(1, 12 + 1)))
 | 
				
			||||||
STOP_WORDS = set(STOP_WORDS)
 | 
					STOP_WORDS = set(STOP_WORDS)
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
| 
						 | 
					
 | 
				
			||||||
		Loading…
	
		Reference in New Issue
	
	Block a user