Mirror of https://github.com/explosion/spaCy.git
	Improve German tokenization
Improve German tokenization with respect to Tiger.
commit d1f703d78d
parent ff184b7a9c
spacy/lang/de/__init__.py
@@ -3,6 +3,7 @@ from __future__ import unicode_literals
 
 from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
 from .norm_exceptions import NORM_EXCEPTIONS
+from .punctuation import TOKENIZER_PREFIXES, TOKENIZER_SUFFIXES
 from .punctuation import TOKENIZER_INFIXES
 from .tag_map import TAG_MAP
 from .stop_words import STOP_WORDS
@@ -22,6 +23,8 @@ class GermanDefaults(Language.Defaults):
         Language.Defaults.lex_attr_getters[NORM], NORM_EXCEPTIONS, BASE_NORMS
     )
     tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS)
+    prefixes = TOKENIZER_PREFIXES
+    suffixes = TOKENIZER_SUFFIXES
     infixes = TOKENIZER_INFIXES
     tag_map = TAG_MAP
     stop_words = STOP_WORDS
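As a quick sanity check of what this wiring buys, here is a minimal sketch (spacy.blank and the Defaults attributes above are real spaCy names; the sample sentence is invented and the exact token split is not asserted):

# Minimal sketch: a blank German pipeline builds its tokenizer from
# GermanDefaults, so the prefix/suffix rules wired in above take effect here.
import spacy

nlp = spacy.blank("de")
doc = nlp("Der Preis liegt bei 100€.")
# The new currency suffix rule should split "€" off "100€"; printing the
# tokens makes the effect visible without asserting a fixed output.
print([t.text for t in doc])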
spacy/lang/de/punctuation.py
@@ -1,10 +1,32 @@
 # coding: utf8
 from __future__ import unicode_literals
 
-from ..char_classes import LIST_ELLIPSES, LIST_ICONS
+from ..char_classes import LIST_ELLIPSES, LIST_ICONS, LIST_PUNCT, LIST_QUOTES
+from ..char_classes import LIST_CURRENCY, CURRENCY, UNITS, PUNCT
 from ..char_classes import CONCAT_QUOTES, ALPHA, ALPHA_LOWER, ALPHA_UPPER
+from ..punctuation import _prefixes, _suffixes
 
 
+_prefixes = ["``",] + list(_prefixes)
+
+_suffixes = (
+    ["''", "/"]
+    + LIST_PUNCT
+    + LIST_ELLIPSES
+    + LIST_QUOTES
+    + LIST_ICONS
+    + [
+        r"(?<=[0-9])\+",
+        r"(?<=°[FfCcKk])\.",
+        r"(?<=[0-9])(?:{c})".format(c=CURRENCY),
+        r"(?<=[0-9])(?:{u})".format(u=UNITS),
+        r"(?<=[{al}{e}{p}(?:{q})])\.".format(
+            al=ALPHA_LOWER, e=r"%²\-\+", q=CONCAT_QUOTES, p=PUNCT
+        ),
+        r"(?<=[{au}][{au}])\.".format(au=ALPHA_UPPER),
+    ]
+)
+
 _quotes = CONCAT_QUOTES.replace("'", "")
 
 _infixes = (
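To see what one of the new suffix rules does in isolation, here is a small self-contained check with plain re (the pattern is copied verbatim from the diff; the sample strings are invented):

# Check one added suffix pattern in isolation: split a final period that
# follows a degree unit such as °C.
import re

degree_period = re.compile(r"(?<=°[FfCcKk])\.")
print(degree_period.search("25°C."))  # matches the trailing "." -> it is split off
print(degree_period.search("25°C"))   # no match, nothing to split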
@@ -15,6 +37,7 @@ _infixes = (
         r"(?<=[{a}])[,!?](?=[{a}])".format(a=ALPHA),
         r"(?<=[{a}])[:<>=](?=[{a}])".format(a=ALPHA),
         r"(?<=[{a}]),(?=[{a}])".format(a=ALPHA),
+        r"(?<=[0-9{a}])\/(?=[0-9{a}])".format(a=ALPHA),
         r"(?<=[{a}])([{q}\)\]\(\[])(?=[{a}])".format(a=ALPHA, q=_quotes),
         r"(?<=[{a}])--(?=[{a}])".format(a=ALPHA),
         r"(?<=[0-9])-(?=[0-9])",
@@ -22,4 +45,6 @@ _infixes = (
 )
 
 
+TOKENIZER_PREFIXES = _prefixes
+TOKENIZER_SUFFIXES = _suffixes
 TOKENIZER_INFIXES = _infixes
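These exported lists are what spaCy compiles into the tokenizer's matchers. A sketch of that step, using the compile helpers spacy.util really provides (the import path matches the module shown above after this commit):

# Sketch: the exported lists become the tokenizer's compiled regexes.
from spacy.util import compile_prefix_regex, compile_suffix_regex, compile_infix_regex
from spacy.lang.de.punctuation import (
    TOKENIZER_PREFIXES,
    TOKENIZER_SUFFIXES,
    TOKENIZER_INFIXES,
)

prefix_re = compile_prefix_regex(TOKENIZER_PREFIXES)
suffix_re = compile_suffix_regex(TOKENIZER_SUFFIXES)
infix_re = compile_infix_regex(TOKENIZER_INFIXES)
print(suffix_re.search("Haus''"))  # the new closing-quote suffix matches at the end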
spacy/lang/de/tokenizer_exceptions.py
@@ -160,6 +160,8 @@ for exc_data in [
 
 
 for orth in [
+    "``",
+    "''",
     "A.C.",
     "a.D.",
     "A.D.",
@@ -175,10 +177,13 @@ for orth in [
     "biol.",
     "Biol.",
     "ca.",
+    "CDU/CSU",
     "Chr.",
     "Cie.",
+    "c/o",
     "co.",
     "Co.",
+    "d'",
     "D.C.",
     "Dipl.-Ing.",
     "Dipl.",
@@ -203,12 +208,18 @@ for orth in [
     "i.G.",
     "i.Tr.",
     "i.V.",
+    "I.",
+    "II.",
+    "III.",
+    "IV.",
     "Inc.",
     "Ing.",
     "jr.",
     "Jr.",
     "jun.",
     "jur.",
+    "K.O.",
+    "L'",
     "L.A.",
     "lat.",
     "M.A.",
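The "for orth in [...]" loop above feeds spaCy's standard one-token exception pattern. A hedged sketch of what the surrounding file does with each string (the _exc name and the [{ORTH: orth}] shape follow spaCy's usual convention for special cases; only a few entries are repeated here):

# Each string maps to a single token the tokenizer must never split.
from spacy.symbols import ORTH

_exc = {}
for orth in ["``", "''", "CDU/CSU", "c/o", "I.", "K.O."]:
    _exc[orth] = [{ORTH: orth}]

# Effect: "CDU/CSU" stays one token even though the new slash infix in
# punctuation.py would otherwise split it; exceptions win over infix rules.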