Mirror of https://github.com/explosion/spaCy.git, synced 2025-10-25 05:01:02 +03:00.
			
		
		
		
	* Remove unicode declarations * Remove Python 3.5 and 2.7 from CI * Don't require pathlib * Replace compat helpers * Remove OrderedDict * Use f-strings * Set Cython compiler language level * Fix typo * Re-add OrderedDict for Table * Update setup.cfg * Revert CONTRIBUTING.md * Revert lookups.md * Revert top-level.md * Small adjustments and docs [ci skip]
		
			
				
	
	
		
			38 lines
		
	
	
		
			3.5 KiB
		
	
	
	
		
			Python
		
	
	
	
	
	
			
		
		
	
	
			38 lines
		
	
	
		
			3.5 KiB
		
	
	
	
		
			Python
		
	
	
	
	
	
| import pytest
 | |
| 
 | |
| 
 | |
# fmt: off
# (text, expected_tokens) pairs for the Bengali tokenizer.
# NOTE: the scraped copy of this file had these literals mojibake-encoded
# (UTF-8 read as ISO-8859-4) with raw newlines inside the strings; the
# original UTF-8 Bengali text is restored here.
TESTCASES = [
    # Punctuation tests
    ("আমি বাংলায় গান গাই!", ["আমি", "বাংলায়", "গান", "গাই", "!"]),
    ("আমি বাংলায় কথা কই।", ["আমি", "বাংলায়", "কথা", "কই", "।"]),
    ("বসুন্ধরা জনসম্মুখে দোষ স্বীকার করল না?", ["বসুন্ধরা", "জনসম্মুখে", "দোষ", "স্বীকার", "করল", "না", "?"]),
    ("টাকা থাকলে কি না হয়!", ["টাকা", "থাকলে", "কি", "না", "হয়", "!"]),
    ("সরকারি বিশ্ববিদ্যালয়-এর ছাত্র নই বলেই কি এমন আচরণ?", ["সরকারি", "বিশ্ববিদ্যালয়", "-", "এর", "ছাত্র", "নই", "বলেই", "কি", "এমন", "আচরণ", "?"]),
    ('তারা বলে, "ওরা খামারের মুরগি।"', ["তারা", "বলে", ",", '"', "ওরা", "খামারের", "মুরগি", "।", '"']),
    ("৩*৩=৬?", ["৩", "*", "৩", "=", "৬", "?"]),
    ("কাঁঠাল-এর গন্ধই অন্যরকম", ["কাঁঠাল", "-", "এর", "গন্ধই", "অন্যরকম"]),
    # Abbreviations
    ("ডঃ খালেদ বললেন ঢাকায় ৩৫ ডিগ্রি সে.।", ["ডঃ", "খালেদ", "বললেন", "ঢাকায়", "৩৫", "ডিগ্রি", "সে.", "।"]),
]
# fmt: on

 | |
@pytest.mark.parametrize("text,expected_tokens", TESTCASES)
def test_bn_tokenizer_handles_testcases(bn_tokenizer, text, expected_tokens):
    """Each parametrized sentence must tokenize into the expected pieces.

    Whitespace tokens are dropped before comparison, so only the visible
    tokens are checked against the gold segmentation.
    """
    doc = bn_tokenizer(text)
    visible = [tok.text for tok in doc if not tok.is_space]
    assert visible == expected_tokens

 | |
| def test_bn_tokenizer_handles_long_text(bn_tokenizer):
 | |
|     text = """āύāϰā§āĻĨ āϏāĻžāĻāĻĨ āĻŦāĻŋāĻļā§āĻŦāĻŦāĻŋāĻĻā§āϝāĻžāϞāϝāĻŧā§ āϏāĻžāϰāĻžāĻŦāĻāϰ āĻā§āύ āύāĻž āĻā§āύ āĻŦāĻŋāώāϝāĻŧā§ āĻāĻŦā§āώāĻŖāĻž āĻāϞāϤā§āĻ āĻĨāĻžāĻā§āĨ¤ \
 | |
| āĻ
āĻāĻŋāĻā§āĻ āĻĢā§āϝāĻžāĻāĻžāϞā§āĻāĻŋ āĻŽā§āĻŽā§āĻŦāĻžāϰāĻāĻŖ āĻĒā§āϰāĻžāϝāĻŧāĻ āĻļāĻŋāĻā§āώāĻžāϰā§āĻĨā§āĻĻā§āϰ āύāĻŋāϝāĻŧā§ āĻŦāĻŋāĻāĻŋāύā§āύ āĻāĻŦā§āώāĻŖāĻž āĻĒā§āϰāĻāϞā§āĻĒā§ āĻāĻžāĻ āĻāϰā§āύ, \
 | |
| āϝāĻžāϰ āĻŽāϧā§āϝ⧠āϰāϝāĻŧā§āĻā§ āϰā§āĻŦāĻ āĻĨā§āĻā§ āĻŽā§āĻļāĻŋāύ āϞāĻžāϰā§āύāĻŋāĻ āϏāĻŋāϏā§āĻā§āĻŽ āĻ āĻāϰā§āĻāĻŋāĻĢāĻŋāĻļāĻŋāϝāĻŧāĻžāϞ āĻāύā§āĻā§āϞāĻŋāĻā§āύā§āϏāĨ¤ \
 | |
| āĻāϏāĻāϞ āĻĒā§āϰāĻāϞā§āĻĒā§ āĻāĻžāĻ āĻāϰāĻžāϰ āĻŽāĻžāϧā§āϝāĻŽā§ āϏāĻāĻļā§āϞāĻŋāώā§āĻ āĻā§āώā§āϤā§āϰ⧠āϝāĻĨā§āώā§āĻ  āĻĒāϰāĻŋāĻŽāĻžāĻŖ āϏā§āĻĒā§āĻļāĻžāϞāĻžāĻāĻāĻĄ āĻšāĻāϝāĻŧāĻž āϏāĻŽā§āĻāĻŦāĨ¤ \
 | |
| āĻāϰ āĻāĻŦā§āώāĻŖāĻžāϰ āĻāĻžāĻ āϤā§āĻŽāĻžāϰ āĻā§āϝāĻžāϰāĻŋāϝāĻŧāĻžāϰāĻā§ āĻ ā§āϞ⧠āύāĻŋāϝāĻŧā§ āϝāĻžāĻŦā§ āĻ
āύā§āĻāĻāĻžāύāĻŋ! \
 | |
| āĻāύā§āĻā§āϏā§āĻ āĻĒā§āϰā§āĻā§āϰāĻžāĻŽāĻžāϰ āĻšāĻ, āĻāĻŦā§āώāĻ āĻāĻŋāĻāĻŦāĻž āĻĄā§āĻā§āϞāĻĒāĻžāϰ - āύāϰā§āĻĨ āϏāĻžāĻāĻĨ āĻāĻāύāĻŋāĻāĻžāϰā§āϏāĻŋāĻāĻŋāϤ⧠āϤā§āĻŽāĻžāϰ āĻĒā§āϰāϤāĻŋāĻāĻž āĻŦāĻŋāĻāĻžāĻļā§āϰ āϏā§āϝā§āĻ āϰāϝāĻŧā§āĻā§āĻāĨ¤ \
 | |
| āύāϰā§āĻĨ āϏāĻžāĻāĻĨā§āϰ āĻ
āϏāĻžāϧāĻžāϰāĻŖ āĻāĻŽāĻŋāĻāύāĻŋāĻāĻŋāϤ⧠āϤā§āĻŽāĻžāĻā§ āϏāĻžāĻĻāϰ āĻāĻŽāύā§āϤā§āϰāĻŖāĨ¤"""
 | |
|     tokens = bn_tokenizer(text)
 | |
|     assert len(tokens) == 84
 |