Mirror of https://github.com/explosion/spaCy.git
Add German norm exceptions

parent 5bd311c77e
commit 0d6fa8b241
spacy/lang/de/__init__.py

@@ -2,21 +2,25 @@
 from __future__ import unicode_literals
 
 from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
+from .norm_exceptions import NORM_EXCEPTIONS
 from .tag_map import TAG_MAP
 from .stop_words import STOP_WORDS
 from .lemmatizer import LOOKUP
 from .syntax_iterators import SYNTAX_ITERATORS
 
 from ..tokenizer_exceptions import BASE_EXCEPTIONS
+from ..norm_exceptions import BASE_NORMS
 from ...language import Language
 from ...lemmatizerlookup import Lemmatizer
-from ...attrs import LANG
-from ...util import update_exc
+from ...attrs import LANG, NORM
+from ...util import update_exc, add_lookups
 
 
 class GermanDefaults(Language.Defaults):
     lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
     lex_attr_getters[LANG] = lambda text: 'de'
+    lex_attr_getters[NORM] = add_lookups(Language.Defaults.lex_attr_getters[NORM],
+                                         BASE_NORMS, NORM_EXCEPTIONS)
 
     tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS)
     tag_map = dict(TAG_MAP)
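The key change here is the NORM getter: add_lookups wraps the default NORM lexical attribute getter so that the exception tables are consulted first and the original getter only acts as a fallback. A minimal sketch of that chaining idea (illustrative only; the real helper is spacy.util.add_lookups and its exact implementation may differ):

# Illustrative sketch of lookup chaining, not the actual spacy.util.add_lookups
# implementation: check each exception table in order, then fall back to the
# wrapped getter.
def add_lookups(default_getter, *lookup_tables):
    def norm_getter(string):
        for table in lookup_tables:
            if string in table:
                return table[string]
        return default_getter(string)
    return norm_getter

# With the wiring above, a hypothetical getter would behave like:
#   norm = add_lookups(default_norm, BASE_NORMS, NORM_EXCEPTIONS)
#   norm("Daß")  -> "dass"                (found in NORM_EXCEPTIONS)
#   norm("Haus") -> default_norm("Haus")  (no exception entry, fallback)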
							
								
								
									
spacy/lang/de/norm_exceptions.py (new file, 17 lines)

@@ -0,0 +1,17 @@
+# coding: utf8
+from __future__ import unicode_literals
+
+# Here we only want to include the absolute most common words. Otherwise,
+# this list would get impossibly long for German – especially considering the
+# old vs. new spelling rules, and all possible cases.
+
+
+_exc = {
+    "daß": "dass"
+}
+
+
+NORM_EXCEPTIONS = {}
+
+for string, norm in _exc.items():
+    NORM_EXCEPTIONS[string.title()] = norm
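Assuming the v2-era API this branch targets, the new exceptions should surface on tokens through the NORM attribute. A hypothetical smoke test (the sentence and the blank-pipeline setup are illustrative, not part of the commit):

# Hypothetical check, assuming the spaCy v2-era API at the time of this commit;
# loads the German language class wired up in __init__.py above.
from spacy.lang.de import German

nlp = German()
doc = nlp(u"Ich wusste, daß du kommst.")
for token in doc:
    print(token.text, token.norm_)

# Note: as committed, the loop in norm_exceptions.py only registers the
# title-cased variant ("Daß"), so lowercase "daß" would still fall through to
# BASE_NORMS or the default NORM getter.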