mirror of https://github.com/explosion/spaCy.git

Add English lex_attrs overrides

commit 88adeee548 (parent 8f3fbbb147)
@@ -4,6 +4,7 @@ from __future__ import unicode_literals
 from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
 from .tag_map import TAG_MAP
 from .stop_words import STOP_WORDS
+from .lex_attrs import LEX_ATTRS
 from .morph_rules import MORPH_RULES
 from .lemmatizer import LEMMA_RULES, LEMMA_INDEX, LEMMA_EXC
 
@@ -19,6 +20,7 @@ class English(Language):
     class Defaults(Language.Defaults):
         lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
         lex_attr_getters[LANG] = lambda text: 'en'
+        lex_attr_getters.update(LEX_ATTRS)
 
         tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS)
         tag_map = dict(TAG_MAP)
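The change to the Defaults class above copies the shared lex_attr_getters dict and then layers the English-specific LEX_ATTRS on top, so a language can override individual lexical attribute getters (here LIKE_NUM) while inheriting the rest. A minimal, spaCy-independent sketch of that dict-update override pattern, with illustrative getter names that are not part of spaCy's API:

# Sketch only: base_getters / english_overrides are illustrative names,
# standing in for Language.Defaults.lex_attr_getters and LEX_ATTRS.

def default_like_num(text):
    # Shared fallback: only plain digit strings count as number-like.
    return text.isdigit()

def english_like_num(text):
    # Language-specific override: also accept a few English number words.
    return text.isdigit() or text.lower() in {'one', 'two', 'three'}

base_getters = {'like_num': default_like_num}
english_overrides = {'like_num': english_like_num}

getters = dict(base_getters)        # copy the shared defaults
getters.update(english_overrides)   # layer the English overrides on top

print(getters['like_num']('three'))       # True: the override applies
print(base_getters['like_num']('three'))  # False: the shared default is untouched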
@@ -2,22 +2,31 @@
 from __future__ import unicode_literals
 
 
-# Number words
-
-NUM_WORDS = set("""
-zero one two three four five six seven eight nine ten eleven twelve thirteen
-fourteen fifteen sixteen seventeen eighteen nineteen twenty thirty forty fifty
-sixty seventy eighty ninety hundred thousand million billion trillion
-quadrillion gajillion bazillion
-""".split())
-
-
-# Ordinal words
-
-ORDINAL_WORDS = set("""
-first second third fourth fifth sixth seventh eigth ninth tenth eleventh twelveth
-thirteenth fourteenth fifteenth sixteenth sventeenth eighteenth nineteenth
-twentieth  thirtieth fortieth fiftieth sixtieth seventieth eightieth ninetieth
-hundreth thousandth millionth billionth trillionth quadrillionth gajillionth
-bazillionth
-""".split())
+from ...attrs import LIKE_NUM
+
+
+_num_words = ['zero', 'one', 'two', 'three', 'four', 'five', 'six', 'seven',
+              'eight', 'nine', 'ten', 'eleven', 'twelve', 'thirteen', 'fourteen',
+              'fifteen', 'sixteen', 'seventeen', 'eighteen', 'nineteen', 'twenty',
+              'thirty', 'forty', 'fifty', 'sixty', 'seventy', 'eighty', 'ninety',
+              'hundred', 'thousand', 'million', 'billion', 'trillion', 'quadrillion',
+              'gajillion', 'bazillion']
+
+
+def like_num(text):
+    text = text.replace(',', '')
+    text = text.replace('.', '')
+    if text.isdigit():
+        return True
+    if text.count('/') == 1:
+        num, denom = text.split('/')
+        if num.isdigit() and denom.isdigit():
+            return True
+    if text in _num_words:
+        return True
+    return False
+
+
+LEX_ATTRS = {
+    LIKE_NUM: like_num
+}
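The new like_num getter strips commas and periods, then accepts plain digit strings, single-slash fractions whose numerator and denominator are digits, and the listed English number words. A quick check of that behaviour; the import path spacy.lang.en.lex_attrs is an assumption based on where the English language data lives in released spaCy versions, not something shown in the diff:

# Assumes the like_num function added in this commit is importable from
# spacy.lang.en.lex_attrs (the module path is an assumption).
from spacy.lang.en.lex_attrs import like_num

for token in ['10,000', '3.14', '3/4', 'eleven', 'bazillion', 'dog', '1/2/3']:
    print(token, like_num(token))

# Expected output:
#   10,000 True     (commas stripped, then digits)
#   3.14 True       (periods stripped, then digits)
#   3/4 True        (digit/digit fraction)
#   eleven True     (listed number word)
#   bazillion True  (listed number word)
#   dog False
#   1/2/3 False     (more than one slash)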