# coding: utf8
from __future__ import unicode_literals

import pytest

# Abbreviations and other dot-containing strings: most should keep the
# trailing period as part of the token, while "dra." is expected to split.
DOT_TESTS = [
    ("tel.", ["tel."]),
    ("np.", ["np."]),
    ("godz. 21:37", ["godz.", "21:37"]),
    ("inż.", ["inż."]),
    ("gosp.-polit.", ["gosp.-polit."]),
    ("ppoż", ["ppoż"]),
    ("płn", ["płn"]),
    ("ul.", ["ul."]),
    ("jw.", ["jw."]),
    ("itd.", ["itd."]),
    ("cdn.", ["cdn."]),
    ("itp.", ["itp."]),
    ("10,- zł", ["10,-", "zł"]),
    ("0 zł 99 gr", ["0", "zł", "99", "gr"]),
    ("0,99 rub.", ["0,99", "rub."]),
    ("dol.", ["dol."]),
    ("1000 m n.p.m.", ["1000", "m", "n.p.m."]),
    ("m.in.", ["m.in."]),
    ("p.n.e.", ["p.n.e."]),
    ("Sz.P.", ["Sz.P."]),
    ("p.o.", ["p.o."]),
    ("k.o.", ["k.o."]),
    ("m.st.", ["m.st."]),
    ("dra.", ["dra", "."]),
    ("pp.", ["pp."]),
    ("oo.", ["oo."]),
]

# Hyphenated compounds (chemical codes, numeric prefixes, "b-drzewa", etc.)
# that should not be split on the hyphen.
HYPHEN_TESTS = [
    ("5-fluoropentylo-3-pirydynyloindol", ["5-fluoropentylo-3-pirydynyloindol"]),
    ("NESS-040C5", ["NESS-040C5"]),
    ("JTE-7-31", ["JTE-7-31"]),
    ("BAY-59-3074", ["BAY-59-3074"]),
    ("BAY-38-7271", ["BAY-38-7271"]),
    ("STS-135", ["STS-135"]),
    ("5F-PB-22", ["5F-PB-22"]),
    ("cztero-", ["cztero-"]),
    ("jedno-", ["jedno-"]),
    ("dwu-", ["dwu-"]),
    ("trzy-", ["trzy-"]),
    ("b-adoratorzy", ["b-adoratorzy"]),
    ("2-3-4 drzewa", ["2-3-4", "drzewa"]),
    ("b-drzewa", ["b-drzewa"]),
]


TESTCASES = DOT_TESTS + HYPHEN_TESTS


@pytest.mark.parametrize("text,expected_tokens", TESTCASES)
def test_tokenizer_handles_testcases(pl_tokenizer, text, expected_tokens):
    # Tokenize with the Polish tokenizer fixture and compare the
    # non-whitespace token texts against the expected segmentation.
    tokens = pl_tokenizer(text)
    token_list = [token.text for token in tokens if not token.is_space]
    assert expected_tokens == token_list
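

# The ``pl_tokenizer`` argument above is a pytest fixture supplied by the test
# suite's conftest.py (not shown here). As a rough, illustrative sketch only --
# assuming a spaCy 2.x-era API with ``spacy.util.get_lang_class`` and
# ``Defaults.create_tokenizer`` -- such a fixture could look like this:
#
#     from spacy.util import get_lang_class
#
#     @pytest.fixture(scope="session")
#     def pl_tokenizer():
#         # Build a bare Polish tokenizer without loading a full pipeline.
#         return get_lang_class("pl").Defaults.create_tokenizer()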