mirror of
				https://github.com/explosion/spaCy.git
				synced 2025-11-04 01:48:04 +03:00 
			
		
		
		
	Switch to new gold.align method (#5334)
* Switch from original `_align` to new simpler alignment algorithm from #4526 * Remove alignment normalizations beyond whitespace and lowercasing
This commit is contained in:
		
							parent
							
								
									bf5c13d170
								
							
						
					
					
						commit
						521f361052
					
				
							
								
								
									
										1
									
								
								setup.py
									
									
									
									
									
								
							
							
						
						
									
										1
									
								
								setup.py
									
									
									
									
									
								
							| 
						 | 
				
			
			@ -31,7 +31,6 @@ PACKAGES = find_packages()
 | 
			
		|||
 | 
			
		||||
 | 
			
		||||
MOD_NAMES = [
 | 
			
		||||
    "spacy._align",
 | 
			
		||||
    "spacy.parts_of_speech",
 | 
			
		||||
    "spacy.strings",
 | 
			
		||||
    "spacy.lexeme",
 | 
			
		||||
| 
						 | 
				
			
			
 | 
			
		|||
							
								
								
									
										255
									
								
								spacy/_align.pyx
									
									
									
									
									
								
							
							
						
						
									
										255
									
								
								spacy/_align.pyx
									
									
									
									
									
								
							| 
						 | 
				
			
			@ -1,255 +0,0 @@
 | 
			
		|||
# cython: infer_types=True
 | 
			
		||||
'''Do Levenshtein alignment, for evaluation of tokenized input.
 | 
			
		||||
 | 
			
		||||
Random notes:
 | 
			
		||||
 | 
			
		||||
  r i n g
 | 
			
		||||
  0 1 2 3 4
 | 
			
		||||
r 1 0 1 2 3
 | 
			
		||||
a 2 1 1 2 3
 | 
			
		||||
n 3 2 2 1 2
 | 
			
		||||
g 4 3 3 2 1
 | 
			
		||||
 | 
			
		||||
0,0: (1,1)=min(0+0,1+1,1+1)=0 S
 | 
			
		||||
1,0: (2,1)=min(1+1,0+1,2+1)=1 D
 | 
			
		||||
2,0: (3,1)=min(2+1,3+1,1+1)=2 D
 | 
			
		||||
3,0: (4,1)=min(3+1,4+1,2+1)=3 D
 | 
			
		||||
0,1: (1,2)=min(1+1,2+1,0+1)=1 D
 | 
			
		||||
1,1: (2,2)=min(0+1,1+1,1+1)=1 S
 | 
			
		||||
2,1: (3,2)=min(1+1,1+1,2+1)=2 S or I
 | 
			
		||||
3,1: (4,2)=min(2+1,2+1,3+1)=3 S or I
 | 
			
		||||
0,2: (1,3)=min(2+1,3+1,1+1)=2 I
 | 
			
		||||
1,2: (2,3)=min(1+1,2+1,1+1)=2 S or I
 | 
			
		||||
2,2: (3,3)
 | 
			
		||||
3,2: (4,3)
 | 
			
		||||
At state (i, j) we're asking "How do I transform S[:i+1] to T[:j+1]?"
 | 
			
		||||
 | 
			
		||||
We know the costs to transition:
 | 
			
		||||
 | 
			
		||||
S[:i]   -> T[:j]   (at D[i,j])
 | 
			
		||||
S[:i+1] -> T[:j]   (at D[i+1,j])
 | 
			
		||||
S[:i]   -> T[:j+1] (at D[i,j+1])
 | 
			
		||||
    
 | 
			
		||||
Further, now we can transform:
 | 
			
		||||
S[:i+1] -> S[:i] (DEL) for 1,
 | 
			
		||||
T[:j+1] -> T[:j] (INS) for 1.
 | 
			
		||||
S[i+1]  -> T[j+1] (SUB) for 0 or 1
 | 
			
		||||
 | 
			
		||||
Therefore we have the costs:
 | 
			
		||||
SUB: Cost(S[:i]->T[:j])   + Cost(S[i]->T[j])
 | 
			
		||||
i.e. D[i, j] + S[i+1] != T[j+1]
 | 
			
		||||
INS: Cost(S[:i+1]->T[:j]) + Cost(T[:j+1]->T[:j])
 | 
			
		||||
i.e. D[i+1,j] + 1
 | 
			
		||||
DEL: Cost(S[:i]->T[:j+1]) + Cost(S[:i+1]->S[:i]) 
 | 
			
		||||
i.e. D[i,j+1] + 1
 | 
			
		||||
 | 
			
		||||
    Source string S has length m, with index i
 | 
			
		||||
    Target string T has length n, with index j
 | 
			
		||||
 | 
			
		||||
    Output two alignment vectors: i2j (length m) and j2i (length n)
 | 
			
		||||
    # function LevenshteinDistance(char s[1..m], char t[1..n]):
 | 
			
		||||
    # for all i and j, d[i,j] will hold the Levenshtein distance between
 | 
			
		||||
    # the first i characters of s and the first j characters of t
 | 
			
		||||
    # note that d has (m+1)*(n+1) values
 | 
			
		||||
    # set each element in d to zero
 | 
			
		||||
    ring rang
 | 
			
		||||
      - r i n g
 | 
			
		||||
    - 0 0 0 0 0
 | 
			
		||||
    r 0 0 0 0 0
 | 
			
		||||
    a 0 0 0 0 0
 | 
			
		||||
    n 0 0 0 0 0
 | 
			
		||||
    g 0 0 0 0 0
 | 
			
		||||
 | 
			
		||||
    # source prefixes can be transformed into empty string by
 | 
			
		||||
    # dropping all characters
 | 
			
		||||
    # d[i, 0] := i
 | 
			
		||||
    ring rang
 | 
			
		||||
      - r i n g
 | 
			
		||||
    - 0 0 0 0 0
 | 
			
		||||
    r 1 0 0 0 0
 | 
			
		||||
    a 2 0 0 0 0
 | 
			
		||||
    n 3 0 0 0 0
 | 
			
		||||
    g 4 0 0 0 0
 | 
			
		||||
 | 
			
		||||
    # target prefixes can be reached from empty source prefix
 | 
			
		||||
    # by inserting every character
 | 
			
		||||
    # d[0, j] := j
 | 
			
		||||
      - r i n g
 | 
			
		||||
    - 0 1 2 3 4
 | 
			
		||||
    r 1 0 0 0 0
 | 
			
		||||
    a 2 0 0 0 0
 | 
			
		||||
    n 3 0 0 0 0
 | 
			
		||||
    g 4 0 0 0 0
 | 
			
		||||
 | 
			
		||||
'''
 | 
			
		||||
from __future__ import unicode_literals
 | 
			
		||||
from libc.stdint cimport uint32_t
 | 
			
		||||
import numpy
 | 
			
		||||
cimport numpy as np
 | 
			
		||||
from .compat import unicode_
 | 
			
		||||
from murmurhash.mrmr cimport hash32
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
def align(S, T):
    '''Compute a Levenshtein alignment between the sequences S and T.

    Both sequences are converted to uint32 arrays by `_convert_sequence`,
    the (len(S)+1) x (len(T)+1) cost matrix is filled by `fill_matrix`, and
    the two alignment vectors are read back out of it by `fill_i2j` and
    `fill_j2i`.

    After the back-trace, any pair whose two items have different `len()`
    is reset to -1 in the corresponding vector.

    RETURNS (tuple): `(cost, i2j, j2i, matrix)` where `cost` is the
        bottom-right cell of `matrix`, `i2j` has one entry per item of S
        and `j2i` one entry per item of T.
    '''
    cdef int m = len(S)
    cdef int n = len(T)
    cdef np.ndarray S_arr = _convert_sequence(S)
    cdef np.ndarray T_arr = _convert_sequence(T)
    cdef np.ndarray matrix = numpy.zeros((m + 1, n + 1), dtype='int32')
    cdef np.ndarray i2j = numpy.zeros((m,), dtype='i')
    cdef np.ndarray j2i = numpy.zeros((n,), dtype='i')

    fill_matrix(
        <int*>matrix.data,
        <const int*>S_arr.data, m,
        <const int*>T_arr.data, n,
    )
    fill_i2j(i2j, matrix)
    fill_j2i(j2i, matrix)

    # Drop pairs whose members differ in length.
    for src in range(i2j.shape[0]):
        mapped_j = i2j[src]
        if mapped_j >= 0 and len(S[src]) != len(T[mapped_j]):
            i2j[src] = -1
    for tgt in range(j2i.shape[0]):
        mapped_i = j2i[tgt]
        if mapped_i >= 0 and len(T[tgt]) != len(S[mapped_i]):
            j2i[tgt] = -1
    return matrix[-1, -1], i2j, j2i, matrix
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
def multi_align(np.ndarray i2j, np.ndarray j2i, i_lengths, j_lengths):
    '''Build many-to-one mappings for tokens left unaligned (negative) in
    `i2j` / `j2i`.

    Unaligned runs on each side are grouped by `_get_regions`, keyed by the
    character offset at which the run starts; runs that start at the same
    offset on both sides are then matched up by `_get_mapping`.

    Example:

    Guess: [aa bb cc dd]
    Truth: [aa bbcc dd]
    i2j: [0, -1, -1, 2]
    j2i: [0, -1, 3]

    gives:

    i2j_multi: {1: 1, 2: 1}
    j2i_multi: {1: 2}

    RETURNS (tuple): `(i2j_multi, j2i_multi)`, two dicts of int -> int.
    '''
    unaligned_i = _get_regions(i2j, i_lengths)
    unaligned_j = _get_regions(j2i, j_lengths)
    return _get_mapping(unaligned_i, unaligned_j, i_lengths, j_lengths)
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
def _get_regions(alignment, lengths):
 | 
			
		||||
    regions = {}
 | 
			
		||||
    start = None
 | 
			
		||||
    offset = 0
 | 
			
		||||
    for i in range(len(alignment)):
 | 
			
		||||
        if alignment[i] < 0:
 | 
			
		||||
            if start is None:
 | 
			
		||||
                start = offset
 | 
			
		||||
                regions.setdefault(start, [])
 | 
			
		||||
            regions[start].append(i)
 | 
			
		||||
        else:
 | 
			
		||||
            start = None
 | 
			
		||||
        offset += lengths[i]
 | 
			
		||||
    return regions
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
def _get_mapping(miss1, miss2, lengths1, lengths2):
 | 
			
		||||
    i2j = {}
 | 
			
		||||
    j2i = {}
 | 
			
		||||
    for start, region1 in miss1.items():
 | 
			
		||||
        if not region1 or start not in miss2:
 | 
			
		||||
            continue
 | 
			
		||||
        region2 = miss2[start]
 | 
			
		||||
        if sum(lengths1[i] for i in region1) == sum(lengths2[i] for i in region2):
 | 
			
		||||
            j = region2.pop(0)
 | 
			
		||||
            buff = []
 | 
			
		||||
            # Consume tokens from region 1, until we meet the length of the
 | 
			
		||||
            # first token in region2. If we do, align the tokens. If
 | 
			
		||||
            # we exceed the length, break.
 | 
			
		||||
            while region1:
 | 
			
		||||
                buff.append(region1.pop(0))
 | 
			
		||||
                if sum(lengths1[i] for i in buff) == lengths2[j]:
 | 
			
		||||
                    for i in buff:
 | 
			
		||||
                        i2j[i] = j
 | 
			
		||||
                    j2i[j] = buff[-1]
 | 
			
		||||
                    j += 1
 | 
			
		||||
                    buff = []
 | 
			
		||||
                elif sum(lengths1[i] for i in buff) > lengths2[j]:
 | 
			
		||||
                    break
 | 
			
		||||
            else:
 | 
			
		||||
                if buff and sum(lengths1[i] for i in buff) == lengths2[j]:
 | 
			
		||||
                    for i in buff:
 | 
			
		||||
                        i2j[i] = j
 | 
			
		||||
                    j2i[j] = buff[-1]
 | 
			
		||||
    return i2j, j2i
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
def _convert_sequence(seq):
    '''Convert a sequence into a 1-d uint32 array suitable for `fill_matrix`.

    A numpy array is returned as a contiguous uint32 array. For any other
    sequence, each item is hashed to 32 bits with murmurhash (`hash32`,
    seed 0); unicode items are UTF-8 encoded first, and the quote tokens
    "``" and "''" are both replaced by '"' before hashing so they compare
    equal.
    '''
    if isinstance(seq, numpy.ndarray):
        # 'uint32' is the NumPy dtype name; the C typedef name 'uint32_t'
        # is not understood by numpy and raised TypeError here.
        return numpy.ascontiguousarray(seq, dtype='uint32')
    cdef np.ndarray output = numpy.zeros((len(seq),), dtype='uint32')
    cdef bytes item_bytes
    for i, item in enumerate(seq):
        if item == "``" or item == "''":
            item = '"'
        if isinstance(item, unicode):
            item_bytes = item.encode('utf8')
        else:
            item_bytes = item
        output[i] = hash32(<void*><char*>item_bytes, len(item_bytes), 0)
    return output
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
cdef void fill_matrix(int* D,
        const int* S, int m, const int* T, int n) nogil:
    # Fill the flat, row-major (m+1) x (n+1) edit-distance table D for the
    # sequences S (length m) and T (length n). Cell (i, j) lives at
    # D[i * (n+1) + j].
    cdef int rows = m + 1
    cdef int cols = n + 1
    cdef int row, col, k
    cdef int sub_cost, ins_cost, del_cost, best

    for k in range(rows * cols):
        D[k] = 0

    # First column: D[i, 0] = i.
    for row in range(rows):
        D[row * cols] = row

    # First row: D[0, j] = j.
    for col in range(cols):
        D[col] = col

    for col in range(n):
        for row in range(m):
            # Diagonal neighbour, plus 1 when the two items differ.
            sub_cost = D[row * cols + col]
            if S[row] != T[col]:
                sub_cost += 1
            del_cost = D[row * cols + col + 1] + 1
            ins_cost = D[(row + 1) * cols + col] + 1
            best = sub_cost
            if ins_cost < best:
                best = ins_cost
            if del_cost < best:
                best = del_cost
            D[(row + 1) * cols + col + 1] = best
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
cdef void fill_i2j(np.ndarray i2j, np.ndarray D) except *:
    # Read the source->target vector out of the filled cost matrix D,
    # walking from the last source row back to the first.
    col = D.shape[1] - 2
    cdef int row = D.shape[0] - 2
    while row >= 0:
        # Step left while the left-hand neighbour in this row is cheaper.
        while D[row + 1, col] < D[row + 1, col + 1]:
            col -= 1
        if D[row, col + 1] >= D[row + 1, col + 1]:
            i2j[row] = col
            col -= 1
        else:
            # The cell above is cheaper: this row gets no partner.
            i2j[row] = -1
        row -= 1
 | 
			
		||||
 | 
			
		||||
cdef void fill_j2i(np.ndarray j2i, np.ndarray D) except *:
    # Read the target->source vector out of the filled cost matrix D,
    # walking from the last target column back to the first.
    row = D.shape[0] - 2
    cdef int col = D.shape[1] - 2
    while col >= 0:
        # Step up while the neighbour above in this column is cheaper.
        while D[row, col + 1] < D[row + 1, col + 1]:
            row -= 1
        if D[row + 1, col] >= D[row + 1, col + 1]:
            j2i[col] = row
            row -= 1
        else:
            # The cell to the left is cheaper: this column gets no partner.
            j2i[col] = -1
        col -= 1
 | 
			
		||||
| 
						 | 
				
			
			@ -21,7 +21,6 @@ from .util import minibatch, itershuffle
 | 
			
		|||
from libc.stdio cimport FILE, fopen, fclose, fread, fwrite, feof, fseek
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
USE_NEW_ALIGN = False
 | 
			
		||||
punct_re = re.compile(r"\W")
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
| 
						 | 
				
			
			@ -73,57 +72,8 @@ def merge_sents(sents):
 | 
			
		|||
    return [(m_deps, (m_cats, m_brackets))]
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
_ALIGNMENT_NORM_MAP = [("``", "'"), ("''", "'"), ('"', "'"), ("`", "'")]
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
def _normalize_for_alignment(tokens):
 | 
			
		||||
    tokens = [w.replace(" ", "").lower() for w in tokens]
 | 
			
		||||
    output = []
 | 
			
		||||
    for token in tokens:
 | 
			
		||||
        token = token.replace(" ", "").lower()
 | 
			
		||||
        for before, after in _ALIGNMENT_NORM_MAP:
 | 
			
		||||
            token = token.replace(before, after)
 | 
			
		||||
        output.append(token)
 | 
			
		||||
    return output
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
def _align_before_v2_2_2(tokens_a, tokens_b):
    """Align two tokenizations with the Levenshtein-based `_align` module.
    Spaces are stripped and tokens lowercased before alignment, so the
    comparison is case-insensitive.

    tokens_a (List[str]): The candidate tokenization.
    tokens_b (List[str]): The reference tokenization.
    RETURNS: (tuple): A 5-tuple consisting of the following information:
      * cost (int): The cost reported by `_align.align` (0 if the two
        tokenizations are equal).
      * a2b (List[int]): Mapping of indices in `tokens_a` to indices in
        `tokens_b`; -1 where there is no one-to-one alignment.
      * b2a (List[int]): The same as `a2b`, but mapping the other direction.
      * a2b_multi (Dict[int, int]): Indices in `tokens_a` mapped to indices
        in `tokens_b`, where multiple tokens of `tokens_a` align to the
        same token of `tokens_b`.
      * b2a_multi (Dict[int, int]): As with `a2b_multi`, but mapping the
        other direction.
    """
    from . import _align

    if tokens_a == tokens_b:
        identity = numpy.arange(len(tokens_a))
        return 0, identity, identity, {}, {}

    tokens_a = [text.replace(" ", "").lower() for text in tokens_a]
    tokens_b = [text.replace(" ", "").lower() for text in tokens_b]
    cost, i2j, j2i, matrix = _align.align(tokens_a, tokens_b)
    lengths_a = [len(text) for text in tokens_a]
    lengths_b = [len(text) for text in tokens_b]
    i2j_multi, j2i_multi = _align.multi_align(i2j, j2i, lengths_a, lengths_b)

    # A "multi" entry whose neighbours do not share its target is really a
    # one-to-one pair: move it into the plain alignment vector.
    for i, j in list(i2j_multi.items()):
        shares_target = i2j_multi.get(i + 1) == j or i2j_multi.get(i - 1) == j
        if not shares_target:
            i2j[i] = j
            del i2j_multi[i]
    for j, i in list(j2i_multi.items()):
        shares_target = j2i_multi.get(j + 1) == i or j2i_multi.get(j - 1) == i
        if not shares_target:
            j2i[j] = i
            del j2i_multi[j]
    return cost, i2j, j2i, i2j_multi, j2i_multi
 | 
			
		||||
    return [w.replace(" ", "").lower() for w in tokens]
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
def align(tokens_a, tokens_b):
 | 
			
		||||
| 
						 | 
				
			
			@ -144,8 +94,6 @@ def align(tokens_a, tokens_b):
 | 
			
		|||
      * b2a_multi (Dict[int, int]): As with `a2b_multi`, but mapping the other
 | 
			
		||||
            direction.
 | 
			
		||||
    """
 | 
			
		||||
    if not USE_NEW_ALIGN:
 | 
			
		||||
        return _align_before_v2_2_2(tokens_a, tokens_b)
 | 
			
		||||
    tokens_a = _normalize_for_alignment(tokens_a)
 | 
			
		||||
    tokens_b = _normalize_for_alignment(tokens_b)
 | 
			
		||||
    cost = 0
 | 
			
		||||
| 
						 | 
				
			
			
 | 
			
		|||
| 
						 | 
				
			
			@ -1,79 +0,0 @@
 | 
			
		|||
# coding: utf-8
 | 
			
		||||
from __future__ import unicode_literals
 | 
			
		||||
 | 
			
		||||
import pytest
 | 
			
		||||
from spacy._align import align, multi_align
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
@pytest.mark.parametrize(
    "string1,string2,cost",
    [
        ("hello", "hell", 1),
        ("rat", "cat", 1),
        ("rat", "rat", 0),
        ("rat", "catsie", 4),
        ("t", "catsie", 5),
    ],
)
def test_align_costs(string1, string2, cost):
    """The first value returned by `align` is the expected edit cost."""
    actual_cost, _i2j, _j2i, _matrix = align(string1, string2)
    assert actual_cost == cost
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
@pytest.mark.parametrize(
    "string1,string2,i2j",
    [
        ("hello", "hell", [0, 1, 2, 3, -1]),
        ("rat", "cat", [0, 1, 2]),
        ("rat", "rat", [0, 1, 2]),
        ("rat", "catsie", [0, 1, 2]),
        ("t", "catsie", [2]),
    ],
)
def test_align_i2j(string1, string2, i2j):
    """The second value returned by `align` maps string1 onto string2."""
    _cost, actual_i2j, _j2i, _matrix = align(string1, string2)
    assert list(actual_i2j) == i2j
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
@pytest.mark.parametrize(
    "string1,string2,j2i",
    [
        ("hello", "hell", [0, 1, 2, 3]),
        ("rat", "cat", [0, 1, 2]),
        ("rat", "rat", [0, 1, 2]),
        ("rat", "catsie", [0, 1, 2, -1, -1, -1]),
        ("t", "catsie", [-1, -1, 0, -1, -1, -1]),
    ],
)
def test_align_i2j_2(string1, string2, j2i):
    """Despite its name, this checks the j2i vector (string2 onto string1)."""
    _cost, _i2j, actual_j2i, _matrix = align(string1, string2)
    assert list(actual_j2i) == j2i
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
def test_align_strings():
    """Only the token "is" lines up one-to-one between the two tokenizations."""
    candidate = ["hello", "this", "is", "test!"]
    reference = ["hellothis", "is", "test", "!"]
    cost, i2j, j2i, _matrix = align(candidate, reference)
    assert cost == 4
    assert list(i2j) == [-1, -1, 1, -1]
    assert list(j2i) == [-1, 2, -1, -1]
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
def test_align_many_to_one():
    """Several single-character tokens are merged onto two-character tokens."""
    words1 = ["a", "b", "c", "d", "e", "f", "g", "h"]
    words2 = ["ab", "bc", "e", "fg", "h"]
    cost, i2j, j2i, matrix = align(words1, words2)
    assert list(i2j) == [-1, -1, -1, -1, 2, -1, -1, 4]
    lengths1 = [len(w) for w in words1]
    lengths2 = [len(w) for w in words2]
    i2j_multi, j2i_multi = multi_align(i2j, j2i, lengths1, lengths2)
    # Tokens 4 and 7 are aligned one-to-one above, so they must not appear
    # in the many-to-one mapping.
    assert i2j_multi == {0: 0, 1: 0, 2: 1, 3: 1, 5: 3, 6: 3}
    # Each merged target points at the last source token folded into it.
    assert j2i_multi == {0: 1, 1: 3, 3: 6}
 | 
			
		||||
| 
						 | 
				
			
			@ -177,13 +177,12 @@ def test_roundtrip_docs_to_json():
 | 
			
		|||
    assert cats["BAKING"] == goldparse.cats["BAKING"]
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
@pytest.mark.skip(reason="skip while we have backwards-compatible alignment")
 | 
			
		||||
@pytest.mark.parametrize(
 | 
			
		||||
    "tokens_a,tokens_b,expected",
 | 
			
		||||
    [
 | 
			
		||||
        (["a", "b", "c"], ["ab", "c"], (3, [-1, -1, 1], [-1, 2], {0: 0, 1: 0}, {})),
 | 
			
		||||
        (
 | 
			
		||||
            ["a", "b", "``", "c"],
 | 
			
		||||
            ["a", "b", '"', "c"],
 | 
			
		||||
            ['ab"', "c"],
 | 
			
		||||
            (4, [-1, -1, -1, 1], [-1, 3], {0: 0, 1: 0, 2: 0}, {}),
 | 
			
		||||
        ),
 | 
			
		||||
| 
						 | 
				
			
			
 | 
			
		|||
		Loading…
	
		Reference in New Issue
	
	Block a user