Mirror of https://github.com/explosion/spaCy.git
	Don't split hyphenated words in German
This way, the tokenizer matches the tokenization in German treebanks
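As a quick illustration (not part of the commit), this is the intended behaviour, assuming a spaCy checkout where the German language class picks up the new infix rules introduced below: hyphens inside words no longer split, while hyphens between digits still do.

from spacy.lang.de import German

nlp = German()

# Hyphenated compounds are kept as single tokens after this change.
print([t.text for t in nlp("Ost-West-Konflikt")])  # expected: ['Ost-West-Konflikt']

# Hyphens between digits remain infixes (see the numeric-range test below).
print([t.text for t in nlp("0.1-13.5")])           # expected: ['0.1', '-', '13.5']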
parent 68f66aebf8
commit ece30c28a8
spacy/lang/de/__init__.py
@@ -3,6 +3,7 @@ from __future__ import unicode_literals

 from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
 from .norm_exceptions import NORM_EXCEPTIONS
+from .punctuation import TOKENIZER_INFIXES
 from .tag_map import TAG_MAP
 from .stop_words import STOP_WORDS
 from .lemmatizer import LOOKUP
@@ -23,6 +24,7 @@ class GermanDefaults(Language.Defaults):
                                          NORM_EXCEPTIONS, BASE_NORMS)

     tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS)
+    infixes = tuple(TOKENIZER_INFIXES)
     tag_map = dict(TAG_MAP)
     stop_words = set(STOP_WORDS)
     syntax_iterators = dict(SYNTAX_ITERATORS)

spacy/lang/de/punctuation.py (new file, 20 lines)
@@ -0,0 +1,20 @@
+# coding: utf8
+from __future__ import unicode_literals
+
+from ..char_classes import LIST_ELLIPSES, LIST_ICONS
+from ..char_classes import QUOTES, ALPHA, ALPHA_LOWER, ALPHA_UPPER
+
+
+_quotes = QUOTES.replace("'", '')
+
+_infixes = (LIST_ELLIPSES + LIST_ICONS +
+            [r'(?<=[{}])\.(?=[{}])'.format(ALPHA_LOWER, ALPHA_UPPER),
+             r'(?<=[{a}])[,!?](?=[{a}])'.format(a=ALPHA),
+             r'(?<=[{a}"])[:<>=](?=[{a}])'.format(a=ALPHA),
+             r'(?<=[{a}]),(?=[{a}])'.format(a=ALPHA),
+             r'(?<=[{a}])([{q}\)\]\(\[])(?=[\{a}])'.format(a=ALPHA, q=_quotes),
+             r'(?<=[{a}])--(?=[{a}])'.format(a=ALPHA),
+             r'(?<=[0-9])-(?=[0-9])'])
+
+
+TOKENIZER_INFIXES = _infixes
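A minimal sketch of what this German-specific infix list changes: the only plain-hyphen rule left is the digit-digit one, so a hyphen between letters (which the deleted test below expected to split "blau-rot" into three tokens) no longer matches any infix pattern. Plain-ASCII stand-ins are used here for spaCy's full ALPHA character class.

import re

ALPHA = 'a-zA-Z'  # simplified stand-in for spaCy's ALPHA class

# A letter-letter hyphen rule like this is what the German list now omits.
letter_hyphen = re.compile(r'(?<=[{a}])-(?=[{a}])'.format(a=ALPHA))
# This digit-digit rule is the only plain-hyphen infix kept above.
digit_hyphen = re.compile(r'(?<=[0-9])-(?=[0-9])')

print(bool(letter_hyphen.search("blau-rot")))   # True  -> would have split; rule is gone
print(bool(digit_hyphen.search("blau-rot")))    # False -> the tokenizer keeps it whole
print(bool(digit_hyphen.search("0.1-13.5")))    # True  -> numeric ranges still split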
@@ -67,12 +67,6 @@ def test_tokenizer_splits_uneven_wrap_interact(de_tokenizer, text):
     assert len(tokens) == 4


-@pytest.mark.parametrize('text', ["blau-rot"])
-def test_tokenizer_splits_hyphens(de_tokenizer, text):
-    tokens = de_tokenizer(text)
-    assert len(tokens) == 3
-
-
 @pytest.mark.parametrize('text', ["0.1-13.5", "0.0-0.1", "103.27-300"])
 def test_tokenizer_splits_numeric_range(de_tokenizer, text):
     tokens = de_tokenizer(text)
@@ -100,17 +94,21 @@ def test_tokenizer_splits_ellipsis_infix(de_tokenizer, text):
     assert len(tokens) == 3


+@pytest.mark.parametrize('text', ['Islam-Konferenz', 'Ost-West-Konflikt'])
+def test_tokenizer_keeps_hyphens(de_tokenizer, text):
+    tokens = de_tokenizer(text)
+    assert len(tokens) == 1
+
+
 def test_tokenizer_splits_double_hyphen_infix(de_tokenizer):
     tokens = de_tokenizer("Viele Regeln--wie die Bindestrich-Regeln--sind kompliziert.")
-    assert len(tokens) == 12
+    assert len(tokens) == 10
     assert tokens[0].text == "Viele"
     assert tokens[1].text == "Regeln"
     assert tokens[2].text == "--"
     assert tokens[3].text == "wie"
     assert tokens[4].text == "die"
-    assert tokens[5].text == "Bindestrich"
-    assert tokens[6].text == "-"
-    assert tokens[7].text == "Regeln"
-    assert tokens[8].text == "--"
-    assert tokens[9].text == "sind"
-    assert tokens[10].text == "kompliziert"
+    assert tokens[5].text == "Bindestrich-Regeln"
+    assert tokens[6].text == "--"
+    assert tokens[7].text == "sind"
+    assert tokens[8].text == "kompliziert"
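To see the updated double-hyphen expectation end to end, here is a hedged sketch (assuming the German language class is importable as below) of what the modified test above now asserts: "--" is still an infix, but the single hyphen inside "Bindestrich-Regeln" is not.

from spacy.lang.de import German

nlp = German()
doc = nlp("Viele Regeln--wie die Bindestrich-Regeln--sind kompliziert.")
print([t.text for t in doc])
# expected, per the updated test: ['Viele', 'Regeln', '--', 'wie', 'die',
#   'Bindestrich-Regeln', '--', 'sind', 'kompliziert', '.']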
@@ -25,15 +25,15 @@ Umfang kläglich dünnen Beine flimmerten ihm hilflos vor den Augen.
     assert len(tokens) == 109


-@pytest.mark.parametrize('text,length', [
-    ("Donaudampfschifffahrtsgesellschaftskapitänsanwärterposten", 1),
-    ("Rindfleischetikettierungsüberwachungsaufgabenübertragungsgesetz", 1),
-    ("Kraftfahrzeug-Haftpflichtversicherung", 3),
-    ("Vakuum-Mittelfrequenz-Induktionsofen", 5)
+@pytest.mark.parametrize('text', [
+    "Donaudampfschifffahrtsgesellschaftskapitänsanwärterposten",
+    "Rindfleischetikettierungsüberwachungsaufgabenübertragungsgesetz",
+    "Kraftfahrzeug-Haftpflichtversicherung",
+    "Vakuum-Mittelfrequenz-Induktionsofen"
     ])
-def test_tokenizer_handles_long_words(de_tokenizer, text, length):
+def test_tokenizer_handles_long_words(de_tokenizer, text):
     tokens = de_tokenizer(text)
-    assert len(tokens) == length
+    assert len(tokens) == 1


 @pytest.mark.parametrize('text,length', [
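The de_tokenizer fixture used throughout these tests is not part of this diff; as a rough, assumed stand-in (not copied from the repository), it can be thought of as a bare tokenizer built from the German language defaults:

import pytest
from spacy.lang.de import German


@pytest.fixture
def de_tokenizer():
    # Assumed fixture: a standalone German tokenizer created from the defaults.
    return German.Defaults.create_tokenizer()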