mirror of
				https://github.com/explosion/spaCy.git
				synced 2025-11-04 09:57:26 +03:00 
			
		
		
		
	
		
			
				
	
	
		
			43 lines
		
	
	
		
			1.5 KiB
		
	
	
	
		
			Python
		
	
	
	
	
	
			
		
		
	
	
			43 lines
		
	
	
		
			1.5 KiB
		
	
	
	
		
			Python
		
	
	
	
	
	
"""Find the min-cost alignment between two tokenizations"""
 | 
						|
from spacy.gold import _min_edit_path as min_edit_path
 | 
						|
from spacy.gold import align
 | 
						|
 | 
						|
 | 
						|
def test_edit_path():
    """Check the ``(cost, edit_string)`` pairs returned by ``min_edit_path``.

    Each case passes a candidate tokenization and a gold tokenization and
    compares the result against the expected tuple. The edit string holds
    one letter per step.
    NOTE(review): the letters presumably stand for Match, Substitute,
    Insert and Delete -- confirm against ``spacy.gold._min_edit_path``.
    """
    # (candidate tokens, gold tokens, expected result) -- exact matches only.
    exact_cases = [
        # Gold token "U.S." is split into two candidate tokens.
        (["U.S", ".", "policy"], ["U.S.", "policy"], (0, 'MDM')),
        # Same split, but the first candidate token differs from gold.
        (["U.N", ".", "policy"], ["U.S.", "policy"], (1, 'SDM')),
        # Identical tokenizations.
        (["The", "cat", "sat", "down"], ["The", "cat", "sat", "down"],
         (0, 'MMMM')),
        # Candidate lacks the first gold token.
        (["cat", "sat", "down"], ["The", "cat", "sat", "down"], (1, 'IMMM')),
        # Candidate lacks a gold token in the middle.
        (["The", "cat", "down"], ["The", "cat", "sat", "down"], (1, 'MMIM')),
        # One candidate token differs from its gold counterpart.
        (["The", "cat", "sag", "down"], ["The", "cat", "sat", "down"],
         (1, 'MMSM')),
    ]
    for cand, gold, expected in exact_cases:
        assert min_edit_path(cand, gold) == expected

    # Either of two paths with the same cost is accepted here.
    cand = ["your", "stuff"]
    gold = ["you", "r", "stuff"]
    assert min_edit_path(cand, gold) in [(2, 'ISM'), (2, 'SIM')]
 | 
						|
 | 
						|
def test_align():
    """Check the per-candidate-token alignment returned by ``align``.

    In every case the expected list has one entry per candidate token:
    either an integer or ``None``.
    NOTE(review): the integers look like indices into ``gold``, with
    ``None`` marking a candidate token that has no gold counterpart --
    confirm against ``spacy.gold.align``.
    """
    # Gold "U.S." is split across two candidate tokens.
    cand = ["U.S", ".", "policy"]
    gold = ["U.S.", "policy"]
    assert align(cand, gold) == [0, None, 1]

    # Candidate "your" corresponds to two gold tokens, "you" and "r".
    cand = ["your", "stuff"]
    gold = ["you", "r", "stuff"]
    assert align(cand, gold) == [None, 2]

    # Candidate has an extra whitespace token and a merged "id"
    # where gold has "i", "d".
    cand = [u'i', u'like', u'2', u'guys', u'   ', u'well', u'id', u'just',
            u'come', u'straight', u'out']
    gold = [u'i', u'like', u'2', u'guys', u'well', u'i', u'd', u'just', u'come',
            u'straight', u'out']
    assert align(cand, gold) == [0, 1, 2, 3, None, 4, None, 7, 8, 9, 10]
 |