Mirror of https://github.com/explosion/spaCy.git
	Bugfix/swedish tokenizer (#12315)
* Add a unit test for explosion#12311
* Create punctuation.py for Swedish
* Remove ":" from the infixes in the Swedish punctuation.py
* Allow ":" as an infix if the succeeding character is uppercase
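The intended behaviour can be checked with a blank Swedish pipeline. This is a minimal sketch (assuming a spaCy install that includes this change), not part of the commit itself:

import spacy

# Blank Swedish pipeline; uses the sv tokenizer defaults, including the new infix rules.
nlp = spacy.blank("sv")

# With the fix, colon forms like these stay single tokens (a lowercase letter follows the colon).
for text in ["99:e", "c:a", "EU:s", "Maj:t"]:
    print(text, [t.text for t in nlp(text)])  # expected: one token per text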
This commit is contained in:
parent 4539fbae17
commit e2de188cf1
spacy/lang/sv/__init__.py
@@ -6,10 +6,7 @@ from .lex_attrs import LEX_ATTRS
 from .syntax_iterators import SYNTAX_ITERATORS
 from ...language import Language, BaseDefaults
 from ...pipeline import Lemmatizer
-
-
-# Punctuation stolen from Danish
-from ..da.punctuation import TOKENIZER_INFIXES, TOKENIZER_SUFFIXES
+from .punctuation import TOKENIZER_INFIXES, TOKENIZER_SUFFIXES
 
 
 class SwedishDefaults(BaseDefaults):
spacy/lang/sv/punctuation.py (new file, 33 lines)
@@ -0,0 +1,33 @@
+from ..char_classes import LIST_ELLIPSES, LIST_ICONS
+from ..char_classes import CONCAT_QUOTES, ALPHA, ALPHA_LOWER, ALPHA_UPPER
+from ..punctuation import TOKENIZER_SUFFIXES
+
+
+_quotes = CONCAT_QUOTES.replace("'", "")
+
+_infixes = (
+    LIST_ELLIPSES
+    + LIST_ICONS
+    + [
+        r"(?<=[{al}])\.(?=[{au}])".format(al=ALPHA_LOWER, au=ALPHA_UPPER),
+        r"(?<=[{a}])[,!?](?=[{a}])".format(a=ALPHA),
+        r"(?<=[{a}])[<>=](?=[{a}])".format(a=ALPHA),
+        r"(?<=[{a}]):(?=[{a}])".format(a=ALPHA_UPPER),
+        r"(?<=[{a}]),(?=[{a}])".format(a=ALPHA),
+        r"(?<=[{a}])([{q}\)\]\(\[])(?=[{a}])".format(a=ALPHA, q=_quotes),
+        r"(?<=[{a}])--(?=[{a}])".format(a=ALPHA),
+        r"(?<=[{a}0-9])[<>=/](?=[{a}])".format(a=ALPHA),
+        r"(?<=[{a}0-9]):(?=[{a}])".format(a=ALPHA_UPPER),
+    ]
+)
+
+_suffixes = [
+    suffix
+    for suffix in TOKENIZER_SUFFIXES
+    if suffix not in ["'s", "'S", "’s", "’S", r"\'"]
+]
+_suffixes += [r"(?<=[^sSxXzZ])\'"]
+
+
+TOKENIZER_INFIXES = _infixes
+TOKENIZER_SUFFIXES = _suffixes
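The core of the fix is the uppercase-gated colon rule: ":" only acts as an infix when it sits between an uppercase letter or digit and an uppercase letter, so genitive and ordinal forms such as "EU:s" or "99:e" are left intact. A standalone sketch of that rule, using a simplified stand-in for spaCy's ALPHA_UPPER character class (an assumption for illustration only):

import re

UPPER = "A-ZÅÄÖ"  # simplified stand-in for spaCy's ALPHA_UPPER (illustration only)

# Same shape as the colon rule added above: uppercase letter or digit before, uppercase letter after.
colon_infix = re.compile(r"(?<=[{u}0-9]):(?=[{u}])".format(u=UPPER))

print(bool(colon_infix.search("EU:s")))   # False -> the tokenizer does not split here
print(bool(colon_infix.search("99:e")))   # False -> stays one token
print(bool(colon_infix.search("AB:CD")))  # True  -> colon is treated as an infix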
Tests for the Swedish tokenizer:
@@ -32,3 +32,10 @@ def test_tokenizer_splits_comma_infix(sv_tokenizer, text):
 def test_tokenizer_splits_ellipsis_infix(sv_tokenizer, text):
     tokens = sv_tokenizer(text)
     assert len(tokens) == 3
+
+
+@pytest.mark.issue(12311)
+@pytest.mark.parametrize("text", ["99:e", "c:a", "EU:s", "Maj:t"])
+def test_sv_tokenizer_handles_colon(sv_tokenizer, text):
+    tokens = sv_tokenizer(text)
+    assert len(tokens) == 1
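To run only the new regression test from a source checkout (assuming spaCy is built locally and pytest is installed), something like:

pytest spacy/tests/lang/sv -k handles_colon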