Mirror of https://github.com/explosion/spaCy.git (synced 2025-10-31 07:57:35 +03:00)
commit ece30c28a8
parent 68f66aebf8

Don't split hyphenated words in German

This way, the tokenizer matches the tokenization in German treebanks.
spacy/lang/de/__init__.py:

@@ -3,6 +3,7 @@ from __future__ import unicode_literals

 from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
 from .norm_exceptions import NORM_EXCEPTIONS
+from .punctuation import TOKENIZER_INFIXES
 from .tag_map import TAG_MAP
 from .stop_words import STOP_WORDS
 from .lemmatizer import LOOKUP
@@ -23,6 +24,7 @@ class GermanDefaults(Language.Defaults):
                                          NORM_EXCEPTIONS, BASE_NORMS)

     tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS)
+    infixes = tuple(TOKENIZER_INFIXES)
     tag_map = dict(TAG_MAP)
     stop_words = set(STOP_WORDS)
     syntax_iterators = dict(SYNTAX_ITERATORS)
spacy/lang/de/punctuation.py (new file, 20 lines):

@@ -0,0 +1,20 @@
+# coding: utf8
+from __future__ import unicode_literals
+
+from ..char_classes import LIST_ELLIPSES, LIST_ICONS
+from ..char_classes import QUOTES, ALPHA, ALPHA_LOWER, ALPHA_UPPER
+
+
+_quotes = QUOTES.replace("'", '')
+
+_infixes = (LIST_ELLIPSES + LIST_ICONS +
+            [r'(?<=[{}])\.(?=[{}])'.format(ALPHA_LOWER, ALPHA_UPPER),
+             r'(?<=[{a}])[,!?](?=[{a}])'.format(a=ALPHA),
+             r'(?<=[{a}"])[:<>=](?=[{a}])'.format(a=ALPHA),
+             r'(?<=[{a}]),(?=[{a}])'.format(a=ALPHA),
+             r'(?<=[{a}])([{q}\)\]\(\[])(?=[\{a}])'.format(a=ALPHA, q=_quotes),
+             r'(?<=[{a}])--(?=[{a}])'.format(a=ALPHA),
+             r'(?<=[0-9])-(?=[0-9])'])
+
+
+TOKENIZER_INFIXES = _infixes
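Of the patterns above, only the final rule treats a single hyphen as an infix, and only between digits. A standalone illustration of just that rule with plain re (not spaCy's full tokenizer, which combines all of the patterns):

    # Isolates the digit-only hyphen rule added above.
    import re

    hyphen_rule = re.compile(r'((?<=[0-9])-(?=[0-9]))')
    print(hyphen_rule.split("0.1-13.5"))  # ['0.1', '-', '13.5'] -> numeric ranges still split
    print(hyphen_rule.split("blau-rot"))  # ['blau-rot'] -> letter-hyphen-letter is left alone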
German tokenizer tests:

@@ -67,12 +67,6 @@ def test_tokenizer_splits_uneven_wrap_interact(de_tokenizer, text):
     assert len(tokens) == 4


-@pytest.mark.parametrize('text', ["blau-rot"])
-def test_tokenizer_splits_hyphens(de_tokenizer, text):
-    tokens = de_tokenizer(text)
-    assert len(tokens) == 3
-
-
 @pytest.mark.parametrize('text', ["0.1-13.5", "0.0-0.1", "103.27-300"])
 def test_tokenizer_splits_numeric_range(de_tokenizer, text):
     tokens = de_tokenizer(text)
@@ -100,17 +94,21 @@ def test_tokenizer_splits_ellipsis_infix(de_tokenizer, text):
     assert len(tokens) == 3


+@pytest.mark.parametrize('text', ['Islam-Konferenz', 'Ost-West-Konflikt'])
+def test_tokenizer_keeps_hyphens(de_tokenizer, text):
+    tokens = de_tokenizer(text)
+    assert len(tokens) == 1
+
+
 def test_tokenizer_splits_double_hyphen_infix(de_tokenizer):
     tokens = de_tokenizer("Viele Regeln--wie die Bindestrich-Regeln--sind kompliziert.")
-    assert len(tokens) == 12
+    assert len(tokens) == 10
     assert tokens[0].text == "Viele"
     assert tokens[1].text == "Regeln"
     assert tokens[2].text == "--"
     assert tokens[3].text == "wie"
     assert tokens[4].text == "die"
-    assert tokens[5].text == "Bindestrich"
-    assert tokens[6].text == "-"
-    assert tokens[7].text == "Regeln"
-    assert tokens[8].text == "--"
-    assert tokens[9].text == "sind"
-    assert tokens[10].text == "kompliziert"
+    assert tokens[5].text == "Bindestrich-Regeln"
+    assert tokens[6].text == "--"
+    assert tokens[7].text == "sind"
+    assert tokens[8].text == "kompliziert"
German tokenizer text tests:

@@ -25,15 +25,15 @@ Umfang kläglich dünnen Beine flimmerten ihm hilflos vor den Augen.
     assert len(tokens) == 109


-@pytest.mark.parametrize('text,length', [
-    ("Donaudampfschifffahrtsgesellschaftskapitänsanwärterposten", 1),
-    ("Rindfleischetikettierungsüberwachungsaufgabenübertragungsgesetz", 1),
-    ("Kraftfahrzeug-Haftpflichtversicherung", 3),
-    ("Vakuum-Mittelfrequenz-Induktionsofen", 5)
+@pytest.mark.parametrize('text', [
+    "Donaudampfschifffahrtsgesellschaftskapitänsanwärterposten",
+    "Rindfleischetikettierungsüberwachungsaufgabenübertragungsgesetz",
+    "Kraftfahrzeug-Haftpflichtversicherung",
+    "Vakuum-Mittelfrequenz-Induktionsofen"
     ])
-def test_tokenizer_handles_long_words(de_tokenizer, text, length):
+def test_tokenizer_handles_long_words(de_tokenizer, text):
     tokens = de_tokenizer(text)
-    assert len(tokens) == length
+    assert len(tokens) == 1


 @pytest.mark.parametrize('text,length', [