mirror of
				https://github.com/explosion/spaCy.git
				synced 2025-10-31 07:57:35 +03:00 
			
		
		
		
	Add German norm exceptions
This commit is contained in:
		
							parent
							
								
									5bd311c77e
								
							
						
					
					
						commit
						0d6fa8b241
					
				|  | @ -2,21 +2,25 @@ | |||
| from __future__ import unicode_literals | ||||
| 
 | ||||
| from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS | ||||
| from .norm_exceptions import NORM_EXCEPTIONS | ||||
| from .tag_map import TAG_MAP | ||||
| from .stop_words import STOP_WORDS | ||||
| from .lemmatizer import LOOKUP | ||||
| from .syntax_iterators import SYNTAX_ITERATORS | ||||
| 
 | ||||
| from ..tokenizer_exceptions import BASE_EXCEPTIONS | ||||
| from ..norm_exceptions import BASE_NORMS | ||||
| from ...language import Language | ||||
| from ...lemmatizerlookup import Lemmatizer | ||||
| from ...attrs import LANG | ||||
| from ...util import update_exc | ||||
| from ...attrs import LANG, NORM | ||||
| from ...util import update_exc, add_lookups | ||||
| 
 | ||||
| 
 | ||||
| class GermanDefaults(Language.Defaults): | ||||
|     lex_attr_getters = dict(Language.Defaults.lex_attr_getters) | ||||
|     lex_attr_getters[LANG] = lambda text: 'de' | ||||
|     lex_attr_getters[NORM] = add_lookups(Language.Defaults.lex_attr_getters[NORM], | ||||
|                                          BASE_NORMS, NORM_EXCEPTIONS) | ||||
| 
 | ||||
|     tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS) | ||||
|     tag_map = dict(TAG_MAP) | ||||
|  |  | |||
							
								
								
									
										17
									
								
								spacy/lang/de/norm_exceptions.py
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										17
									
								
								spacy/lang/de/norm_exceptions.py
									
									
									
									
									
										Normal file
									
								
							|  | @ -0,0 +1,17 @@ | |||
| # coding: utf8 | ||||
| from __future__ import unicode_literals | ||||
| 
 | ||||
| # Here we only want to include the absolute most common words. Otherwise, | ||||
| # this list would get impossibly long for German – especially considering the | ||||
| # old vs. new spelling rules, and all possible cases. | ||||
| 
 | ||||
| 
 | ||||
| _exc = { | ||||
|     "daß": "dass" | ||||
| } | ||||
| 
 | ||||
| 
 | ||||
| NORM_EXCEPTIONS = {} | ||||
| 
 | ||||
| for string, norm in _exc.items(): | ||||
|     NORM_EXCEPTIONS[string.title()] = norm | ||||
		Loading…
	
		Reference in New Issue
	
	Block a user