Mirror of https://github.com/explosion/spaCy.git (synced 2025-11-04 01:48:04 +03:00)

	[finnish] Add initial tests for tokenizer
This commit is contained in:
    parent f9bb25d1cf
    commit 1a1952afa5

@@ -10,6 +10,7 @@ from ..pt import Portuguese
 from ..nl import Dutch
 from ..sv import Swedish
 from ..hu import Hungarian
+from ..fi import Finnish
 from ..tokens import Doc
 from ..strings import StringStore
 from ..lemmatizer import Lemmatizer
@@ -23,7 +24,7 @@ import pytest
 
 
 LANGUAGES = [English, German, Spanish, Italian, French, Portuguese, Dutch,
-             Swedish, Hungarian]
+             Swedish, Hungarian, Finnish]
 
 
 @pytest.fixture(params=LANGUAGES)
@@ -62,6 +63,11 @@ def hu_tokenizer():
     return Hungarian.Defaults.create_tokenizer()
 
 
+@pytest.fixture
+def fi_tokenizer():
+    return Finnish.Defaults.create_tokenizer()
+
+
 @pytest.fixture
 def stringstore():
     return StringStore()
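
The new fi_tokenizer fixture is resolved by name: any test function that lists fi_tokenizer as a parameter receives a tokenizer built via Finnish.Defaults.create_tokenizer(). A minimal sketch of a consuming test (the test name and sample sentence are illustrative, not part of this commit):

    def test_fi_tokenizer_produces_tokens(fi_tokenizer):
        # fi_tokenizer is injected by pytest from the conftest fixture added above
        # (hypothetical test, for illustration only).
        tokens = fi_tokenizer('Hyvää huomenta!')
        # Each entry is a Token exposing .text and .is_space, as used by the tests below.
        assert all(hasattr(token, 'text') for token in tokens)
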
New files:
    spacy/tests/fi/__init__.py        (0 lines)
    spacy/tests/fi/test_tokenizer.py  (18 lines)

spacy/tests/fi/test_tokenizer.py (new file)
@@ -0,0 +1,18 @@
+# encoding: utf8
+from __future__ import unicode_literals
+
+import pytest
+
+ABBREVIATION_TESTS = [
+    ('Hyvää uutta vuotta t. siht. Niemelä!', ['Hyvää', 'uutta', 'vuotta', 't.', 'siht.', 'Niemelä', '!']),
+    ('Paino on n. 2.2 kg', ['Paino', 'on', 'n.', '2.2', 'kg'])
+]
+
+TESTCASES = ABBREVIATION_TESTS
+
+
+@pytest.mark.parametrize('text,expected_tokens', TESTCASES)
+def test_tokenizer_handles_testcases(fi_tokenizer, text, expected_tokens):
+    tokens = fi_tokenizer(text)
+    token_list = [token.text for token in tokens if not token.is_space]
+    assert expected_tokens == token_list
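
For a quick manual check outside pytest, the tokenizer that the fi_tokenizer fixture builds can be constructed directly. A minimal sketch, assuming the Finnish language data from the parent commit is importable as spacy.fi (illustrative, not part of this commit):

    from spacy.fi import Finnish

    # Build the Finnish tokenizer the same way the fi_tokenizer fixture does.
    tokenizer = Finnish.Defaults.create_tokenizer()
    doc = tokenizer('Paino on n. 2.2 kg')
    # Per the ABBREVIATION_TESTS case above, this should print
    # ['Paino', 'on', 'n.', '2.2', 'kg'].
    print([token.text for token in doc])
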