mirror of
				https://github.com/explosion/spaCy.git
				synced 2025-11-04 01:48:04 +03:00 
			
		
		
		
	Make alignment work with strings
This commit is contained in:
		
							parent
							
								
									8180c84a98
								
							
						
					
					
						commit
						c0734ba526
					
				| 
						 | 
				
			
			@ -84,24 +84,42 @@ i.e. D[i,j+1] + 1
 | 
			
		|||
'''
 | 
			
		||||
import numpy
 | 
			
		||||
cimport numpy as np
 | 
			
		||||
from .compat import unicode_
 | 
			
		||||
from murmurhash.mrmr cimport hash32
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
def align(bytes S, bytes T):
 | 
			
		||||
def align(S, T):
 | 
			
		||||
    cdef int m = len(S)
 | 
			
		||||
    cdef int n = len(T)
 | 
			
		||||
    cdef np.ndarray matrix = numpy.zeros((m+1, n+1), dtype='int32')
 | 
			
		||||
    cdef np.ndarray i2j = numpy.zeros((m,), dtype='i')
 | 
			
		||||
    cdef np.ndarray j2i = numpy.zeros((n,), dtype='i')
 | 
			
		||||
 | 
			
		||||
    cdef np.ndarray S_arr = _convert_sequence(S)
 | 
			
		||||
    cdef np.ndarray T_arr = _convert_sequence(T)
 | 
			
		||||
 | 
			
		||||
    fill_matrix(<int*>matrix.data,
 | 
			
		||||
        S, m, T, n)
 | 
			
		||||
        <const int*>S_arr.data, m, <const int*>T_arr.data, n)
 | 
			
		||||
    fill_i2j(i2j, matrix)
 | 
			
		||||
    fill_j2i(j2i, matrix)
 | 
			
		||||
    return matrix[-1,-1], i2j, j2i, matrix
 | 
			
		||||
 | 
			
		||||
def _convert_sequence(seq):
    """Turn a sequence into a 1-d array of C ints for the alignment code.

    A numpy array is returned via ``numpy.ascontiguousarray`` with dtype
    'i'. For any other sequence, each item is reduced to one integer:
    unicode items are UTF-8 encoded, other items are used as bytes as-is,
    and the resulting byte string is passed to ``hash32`` with seed 0.
    """
    cdef np.ndarray hashes
    cdef bytes encoded
    if isinstance(seq, numpy.ndarray):
        return numpy.ascontiguousarray(seq, dtype='i')
    hashes = numpy.zeros((len(seq),), dtype='i')
    for idx, element in enumerate(seq):
        if not isinstance(element, unicode):
            # Stored in a variable declared ``bytes``, so the item is
            # expected to be a byte string already.
            encoded = element
        else:
            encoded = element.encode('utf8')
        # Items are compared by hash value rather than by content.
        hashes[idx] = hash32(<void*><char*>encoded, len(encoded), 0)
    return hashes
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
cdef void fill_matrix(int* D, 
 | 
			
		||||
        const char* S, int m, const char* T, int n) nogil:
 | 
			
		||||
        const int* S, int m, const int* T, int n) nogil:
 | 
			
		||||
    m1 = m+1
 | 
			
		||||
    n1 = n+1
 | 
			
		||||
    for i in range(m1*n1):
 | 
			
		||||
| 
						 | 
				
			
			
 | 
			
		|||
| 
						 | 
				
			
			@ -3,11 +3,11 @@ from .._align import align
 | 
			
		|||
 | 
			
		||||
 | 
			
		||||
@pytest.mark.parametrize('string1,string2,cost', [
 | 
			
		||||
    (b'hello', b'hell', 1),
 | 
			
		||||
    (b'rat', b'cat', 1),
 | 
			
		||||
    (b'rat', b'rat', 0),
 | 
			
		||||
    (b'rat', b'catsie', 4),
 | 
			
		||||
    (b't', b'catsie', 5),
 | 
			
		||||
    ('hello', 'hell', 1),
 | 
			
		||||
    ('rat', 'cat', 1),
 | 
			
		||||
    ('rat', 'rat', 0),
 | 
			
		||||
    ('rat', 'catsie', 4),
 | 
			
		||||
    ('t', 'catsie', 5),
 | 
			
		||||
])
 | 
			
		||||
def test_align_costs(string1, string2, cost):
 | 
			
		||||
    output_cost, i2j, j2i, matrix = align(string1, string2)
 | 
			
		||||
| 
						 | 
				
			
			@ -15,11 +15,11 @@ def test_align_costs(string1, string2, cost):
 | 
			
		|||
 | 
			
		||||
 | 
			
		||||
@pytest.mark.parametrize('string1,string2,i2j', [
 | 
			
		||||
    (b'hello', b'hell', [0,1,2,3,-1]),
 | 
			
		||||
    (b'rat', b'cat', [0,1,2]),
 | 
			
		||||
    (b'rat', b'rat', [0,1,2]),
 | 
			
		||||
    (b'rat', b'catsie', [0,1,2]),
 | 
			
		||||
    (b't', b'catsie', [2]),
 | 
			
		||||
    ('hello', 'hell', [0,1,2,3,-1]),
 | 
			
		||||
    ('rat', 'cat', [0,1,2]),
 | 
			
		||||
    ('rat', 'rat', [0,1,2]),
 | 
			
		||||
    ('rat', 'catsie', [0,1,2]),
 | 
			
		||||
    ('t', 'catsie', [2]),
 | 
			
		||||
])
 | 
			
		||||
def test_align_i2j(string1, string2, i2j):
 | 
			
		||||
    output_cost, output_i2j, j2i, matrix = align(string1, string2)
 | 
			
		||||
| 
						 | 
				
			
			@ -27,12 +27,20 @@ def test_align_i2j(string1, string2, i2j):
 | 
			
		|||
 | 
			
		||||
 | 
			
		||||
@pytest.mark.parametrize('string1,string2,j2i', [
 | 
			
		||||
    (b'hello', b'hell', [0,1,2,3]),
 | 
			
		||||
    (b'rat', b'cat', [0,1,2]),
 | 
			
		||||
    (b'rat', b'rat', [0,1,2]),
 | 
			
		||||
    (b'rat', b'catsie', [0,1,2, -1, -1, -1]),
 | 
			
		||||
    (b't', b'catsie', [-1, -1, 0, -1, -1, -1]),
 | 
			
		||||
    ('hello', 'hell', [0,1,2,3]),
 | 
			
		||||
    ('rat', 'cat', [0,1,2]),
 | 
			
		||||
    ('rat', 'rat', [0,1,2]),
 | 
			
		||||
    ('rat', 'catsie', [0,1,2, -1, -1, -1]),
 | 
			
		||||
    ('t', 'catsie', [-1, -1, 0, -1, -1, -1]),
 | 
			
		||||
])
 | 
			
		||||
def test_align_j2i(string1, string2, j2i):
    """Check the ``j2i`` array returned by ``align`` for each string pair.

    This test was previously named ``test_align_i2j``, the same name as an
    earlier test in this module. The second definition replaced the first
    at import time, so pytest only ever collected this one and the i2j
    cases were never run. Giving it its own name lets both tests run.
    """
    _, _, output_j2i, _ = align(string1, string2)
    assert list(output_j2i) == j2i
 | 
			
		||||
 | 
			
		||||
def test_align_strings():
    """Align two lists of word strings and check cost and both mappings."""
    source_words = ['hello', 'this', 'is', 'test!']
    target_words = ['hellothis', 'is', 'test', '!']
    # ``align`` returns (cost, i2j, j2i, matrix); the matrix is not checked.
    cost, i2j, j2i, _ = align(source_words, target_words)
    expected_i2j = [0, -1, 1, 2]
    expected_j2i = [0, 2, 3, -1]
    assert cost == 4
    assert list(i2j) == expected_i2j
    assert list(j2i) == expected_j2i
 | 
			
		||||
| 
						 | 
				
			
			
 | 
			
		|||
		Loading…
	
		Reference in New Issue
	
	Block a user