mirror of
				https://github.com/explosion/spaCy.git
				synced 2025-10-26 21:51:24 +03:00 
			
		
		
		
	
		
			
				
	
	
		
			37 lines
		
	
	
		
			1.4 KiB
		
	
	
	
		
			Python
		
	
	
	
	
	
			
		
		
	
	
			37 lines
		
	
	
		
			1.4 KiB
		
	
	
	
		
			Python
		
	
	
	
	
	
| # coding: utf-8
 | |
| """Find the min-cost alignment between two tokenizations"""
 | |
| 
 | |
| from __future__ import unicode_literals
 | |
| 
 | |
| from ...gold import _min_edit_path as min_edit_path
 | |
| from ...gold import align
 | |
| 
 | |
| import pytest
 | |
| 
 | |
| 
 | |
@pytest.mark.parametrize(
    "cand,gold,path",
    [
        # Each case: candidate tokens, gold tokens, expected edit path.
        (["U.S", ".", "policy"], ["U.S.", "policy"], (0, "MDM")),
        (["U.N", ".", "policy"], ["U.S.", "policy"], (1, "SDM")),
        (["The", "cat", "sat", "down"], ["The", "cat", "sat", "down"], (0, "MMMM")),
        (["cat", "sat", "down"], ["The", "cat", "sat", "down"], (1, "IMMM")),
        (["The", "cat", "down"], ["The", "cat", "sat", "down"], (1, "MMIM")),
        (["The", "cat", "sag", "down"], ["The", "cat", "sat", "down"], (1, "MMSM")),
    ]
)
def test_gold_lev_align_edit_path(cand, gold, path):
    """Check that ``min_edit_path`` returns the expected pair for each case.

    ``path`` is a two-element tuple of an integer and a string of
    per-step operation codes.
    NOTE(review): the integer looks like the edit cost and the letters
    like M=match, S=substitute, I=insert, D=delete -- confirm against
    the implementation in the ``gold`` module.
    """
    computed = min_edit_path(cand, gold)
    assert computed == path
 | |
| 
 | |
| 
 | |
def test_gold_lev_align_edit_path2():
    """Check a case where either of two edit paths is an acceptable result.

    The candidate has one token ("your") where the gold has two
    ("you", "r"); the test accepts both ``(2, 'ISM')`` and ``(2, 'SIM')``.
    """
    # Both orderings of the first two operations are accepted.
    acceptable = [(2, "ISM"), (2, "SIM")]
    outcome = min_edit_path(["your", "stuff"], ["you", "r", "stuff"])
    assert outcome in acceptable
 | |
| 
 | |
| 
 | |
@pytest.mark.parametrize(
    "cand,gold,result",
    [
        # Each case: candidate tokens, gold tokens, expected alignment list.
        (
            ["U.S", ".", "policy"],
            ["U.S.", "policy"],
            [0, None, 1],
        ),
        (
            ["your", "stuff"],
            ["you", "r", "stuff"],
            [None, 2],
        ),
        (
            ["i", "like", "2", "guys", "   ", "well", "id", "just", "come", "straight", "out"],
            ["i", "like", "2", "guys", "well", "i", "d", "just", "come", "straight", "out"],
            [0, 1, 2, 3, None, 4, None, 7, 8, 9, 10],
        ),
    ]
)
def test_gold_lev_align(cand, gold, result):
    """Check that ``align`` returns the expected list for each case.

    The expected list has one entry per candidate token: an integer or
    ``None``.
    NOTE(review): the integers look like indices into ``gold`` and
    ``None`` like "no one-to-one counterpart" -- confirm against the
    implementation in the ``gold`` module.
    """
    alignment = align(cand, gold)
    assert alignment == result
 |