Replace old gold alignment with new gold alignment (#4710)

Replace old gold alignment that allowed for some noise in the alignment between raw and orth with the new simpler alignment that requires that the raw and orth strings are identical except for whitespace and capitalization. * Replace old alignment with new alignment, removing `_align.pyx` and its tests * Remove all quote normalizations * Enable test for new align * Modify test case for quote normalization
2025-07-22 05:59:56 +03:00 · 2019-11-25 23:13:26 +01:00 · 2019-11-25 23:13:26 +01:00 · 0c9640ced3
commit 0c9640ced3
parent 392c4880d9
5 changed files with 1 additions and 401 deletions
--- a/setup.py
+++ b/setup.py
@ -31,7 +31,6 @@ PACKAGES = find_packages()
 MOD_NAMES = [
    "spacy._align",
    "spacy.parts_of_speech",
    "spacy.strings",
    "spacy.lexeme",
--- a/spacy/_align.pyx
+++ b/spacy/_align.pyx
@ -1,255 +0,0 @@
 # cython: infer_types=True
 '''Do Levenshtein alignment, for evaluation of tokenized input.
 Random notes:
  r i n g
  0 1 2 3 4
 r 1 0 1 2 3
 a 2 1 1 2 3
 n 3 2 2 1 2
 g 4 3 3 2 1
 0,0: (1,1)=min(0+0,1+1,1+1)=0 S
 1,0: (2,1)=min(1+1,0+1,2+1)=1 D
 2,0: (3,1)=min(2+1,3+1,1+1)=2 D
 3,0: (4,1)=min(3+1,4+1,2+1)=3 D
 0,1: (1,2)=min(1+1,2+1,0+1)=1 D
 1,1: (2,2)=min(0+1,1+1,1+1)=1 S
 2,1: (3,2)=min(1+1,1+1,2+1)=2 S or I
 3,1: (4,2)=min(2+1,2+1,3+1)=3 S or I
 0,2: (1,3)=min(2+1,3+1,1+1)=2 I
 1,2: (2,3)=min(1+1,2+1,1+1)=2 S or I
 2,2: (3,3)
 3,2: (4,3)
 At state (i, j) we're asking "How do I transform S[:i+1] to T[:j+1]?"
 We know the costs to transition:
 S[:i]   -> T[:j]   (at D[i,j])
 S[:i+1] -> T[:j]   (at D[i+1,j])
 S[:i]   -> T[:j+1] (at D[i,j+1])
 Further, we know we can transform:
 S[:i+1] -> S[:i] (DEL) for 1,
 T[:j+1] -> T[:j] (INS) for 1.
 S[i+1]  -> T[j+1] (SUB) for 0 or 1
 Therefore we have the costs:
 SUB: Cost(S[:i]->T[:j])   + Cost(S[i]->T[j])
 i.e. D[i, j] + S[i+1] != T[j+1]
 INS: Cost(S[:i+1]->T[:j]) + Cost(T[:j+1]->T[:j])
 i.e. D[i+1,j] + 1
 DEL: Cost(S[:i]->T[:j+1]) + Cost(S[:i+1]->S[:i]) 
 i.e. D[i,j+1] + 1
    Source string S has length m, with index i
    Target string T has length n, with index j
    Output two alignment vectors: i2j (length m) and j2i (length n)
    # function LevenshteinDistance(char s[1..m], char t[1..n]):
    # for all i and j, d[i,j] will hold the Levenshtein distance between
    # the first i characters of s and the first j characters of t
    # note that d has (m+1)*(n+1) values
    # set each element in d to zero
    ring rang
      - r i n g
    - 0 0 0 0 0
    r 0 0 0 0 0
    a 0 0 0 0 0
    n 0 0 0 0 0
    g 0 0 0 0 0
    # source prefixes can be transformed into empty string by
    # dropping all characters
    # d[i, 0] := i
    ring rang
      - r i n g
    - 0 0 0 0 0
    r 1 0 0 0 0
    a 2 0 0 0 0
    n 3 0 0 0 0
    g 4 0 0 0 0
    # target prefixes can be reached from empty source prefix
    # by inserting every character
    # d[0, j] := j
      - r i n g
    - 0 1 2 3 4
    r 1 0 0 0 0
    a 2 0 0 0 0
    n 3 0 0 0 0
    g 4 0 0 0 0
 '''
 from __future__ import unicode_literals
 from libc.stdint cimport uint32_t
 import numpy
 cimport numpy as np
 from .compat import unicode_
 from murmurhash.mrmr cimport hash32
 def align(S, T):
    '''Align two sequences using a Levenshtein edit-distance matrix.

    S: The source sequence (length m).
    T: The target sequence (length n).
    RETURNS (tuple): `(cost, i2j, j2i, matrix)`. `cost` is the bottom-right
        cell of the (m+1, n+1) matrix. `i2j` has one entry per item of S and
        `j2i` one entry per item of T; an entry of -1 means the item has no
        one-to-one counterpart.
    '''
    cdef int m = len(S)
    cdef int n = len(T)
    cdef np.ndarray matrix = numpy.zeros((m+1, n+1), dtype='int32')
    cdef np.ndarray i2j = numpy.zeros((m,), dtype='i')
    cdef np.ndarray j2i = numpy.zeros((n,), dtype='i')
    # Turn both sequences into integer arrays so the matrix can be filled
    # from raw C buffers.
    cdef np.ndarray S_arr = _convert_sequence(S)
    cdef np.ndarray T_arr = _convert_sequence(T)
    fill_matrix(<int*>matrix.data,
        <const int*>S_arr.data, m, <const int*>T_arr.data, n)
    # Read each direction's alignment back out of the filled matrix.
    fill_i2j(i2j, matrix)
    fill_j2i(j2i, matrix)
    # Drop any pairing whose two items differ in length.
    for i in range(i2j.shape[0]):
        if i2j[i] >= 0 and len(S[i]) != len(T[i2j[i]]):
            i2j[i] = -1
    for j in range(j2i.shape[0]):
        if j2i[j] >= 0 and len(T[j]) != len(S[j2i[j]]):
            j2i[j] = -1
    return matrix[-1,-1], i2j, j2i, matrix
 def multi_align(np.ndarray i2j, np.ndarray j2i, i_lengths, j_lengths):
    '''Find many-to-one alignments among the tokens left unaligned.

    i2j / j2i: One-to-one alignment arrays; negative entries are unaligned.
    i_lengths / j_lengths: The length of every token on each side.
    RETURNS (tuple): `(i2j_multi, j2i_multi)` dictionaries.

    Example:
        Guess: [aa bb cc dd]    i2j: [0, -1, -1, 2]
        Truth: [aa bbcc dd]     j2i: [0, -1, 3]
    gives
        i2j_multi: {1: 1, 2: 1}
        j2i_multi: {1: 2}
    '''
    unaligned_i = _get_regions(i2j, i_lengths)
    unaligned_j = _get_regions(j2i, j_lengths)
    return _get_mapping(unaligned_i, unaligned_j, i_lengths, j_lengths)
 def _get_regions(alignment, lengths):
    regions = {}
    start = None
    offset = 0
    for i in range(len(alignment)):
        if alignment[i] < 0:
            if start is None:
                start = offset
                regions.setdefault(start, [])
            regions[start].append(i)
        else:
            start = None
        offset += lengths[i]
    return regions
 def _get_mapping(miss1, miss2, lengths1, lengths2):
    i2j = {}
    j2i = {}
    for start, region1 in miss1.items():
        if not region1 or start not in miss2:
            continue
        region2 = miss2[start]
        if sum(lengths1[i] for i in region1) == sum(lengths2[i] for i in region2):
            j = region2.pop(0)
            buff = []
            # Consume tokens from region 1, until we meet the length of the
            # first token in region2. If we do, align the tokens. If
            # we exceed the length, break.
            while region1:
                buff.append(region1.pop(0))
                if sum(lengths1[i] for i in buff) == lengths2[j]:
                    for i in buff:
                        i2j[i] = j
                    j2i[j] = buff[-1]
                    j += 1
                    buff = []
                elif sum(lengths1[i] for i in buff) > lengths2[j]:
                    break
            else:
                if buff and sum(lengths1[i] for i in buff) == lengths2[j]:
                    for i in buff:
                        i2j[i] = j
                    j2i[j] = buff[-1]
    return i2j, j2i
 def _convert_sequence(seq):
    '''Convert a sequence into a uint32 array for the edit-distance matrix.

    seq: Either a numpy array, which is returned as a contiguous uint32
        array, or a sequence of strings / bytes, where each item is replaced
        by a 32-bit hash of its UTF-8 bytes.
    RETURNS (numpy.ndarray): One uint32 value per item of `seq`.
    '''
    if isinstance(seq, numpy.ndarray):
        # 'uint32_t' is the C typedef name and is rejected by numpy as a
        # dtype string; the numpy spelling is 'uint32'.
        return numpy.ascontiguousarray(seq, dtype='uint32')
    cdef np.ndarray output = numpy.zeros((len(seq),), dtype='uint32')
    cdef bytes item_bytes
    for i, item in enumerate(seq):
        # Both styles of paired quote marks hash like a plain double quote.
        if item == "``":
            item = '"'
        elif item == "''":
            item = '"'
        if isinstance(item, unicode):
            item_bytes = item.encode('utf8')
        else:
            item_bytes = item
        output[i] = hash32(<void*><char*>item_bytes, len(item_bytes), 0)
    return output
 cdef void fill_matrix(int* D, 
        const int* S, int m, const int* T, int n) nogil:
    # Fill the edit-distance matrix D for the integer sequences S (length m)
    # and T (length n). D is a flat buffer of (m+1)*(n+1) cells; the cell for
    # row i, column j lives at D[i*(n+1) + j].
    m1 = m+1
    n1 = n+1
    # Start with every cell zeroed.
    for i in range(m1*n1):
        D[i] = 0
    # First column: row i holds i.
    for i in range(m1):
        D[i*n1] = i
    # First row: column j holds j.
    for j in range(n1):
        D[j] = j
    cdef int sub_cost, ins_cost, del_cost
    for j in range(n):
        for i in range(m):
            # Flat offsets of cells (i, j), (i+1, j+1), (i+1, j) and (i, j+1).
            i_j = i*n1 + j
            i1_j1 = (i+1)*n1 + j+1
            i1_j = (i+1)*n1 + j
            i_j1 = i*n1 + j+1
            # Substituting is free when the two items are equal, else costs 1.
            if S[i] != T[j]:
                sub_cost = D[i_j] + 1
            else:
                sub_cost = D[i_j]
            del_cost = D[i_j1] + 1
            ins_cost = D[i1_j] + 1
            # Each cell keeps the cheapest of the three ways to reach it.
            best = min(min(sub_cost, ins_cost), del_cost)
            D[i1_j1] = best
 cdef void fill_i2j(np.ndarray i2j, np.ndarray D) except *:
    # Trace back through the filled cost matrix D, starting from its last row
    # and column, and write into i2j[i] either the column index paired with
    # row index i, or -1 when no column is paired with it.
    # NOTE(review): j is never checked against 0 in this walk -- confirm that
    # the matrix layout guarantees it cannot run past the first column.
    j = D.shape[1]-2
    cdef int i = D.shape[0]-2
    while i >= 0:
        # Step left for as long as the cell to the left is cheaper.
        while D[i+1, j] < D[i+1, j+1]:
            j -= 1
        # A cheaper cell directly above means row i stays unpaired.
        if D[i, j+1] < D[i+1, j+1]:
            i2j[i] = -1
        else:
            i2j[i] = j
            j -= 1
        i -= 1
 cdef void fill_j2i(np.ndarray j2i, np.ndarray D) except *:
    # Trace back through the filled cost matrix D, starting from its last row
    # and column, and write into j2i[j] either the row index paired with
    # column index j, or -1 when no row is paired with it.
    # NOTE(review): i is never checked against 0 in this walk -- confirm that
    # the matrix layout guarantees it cannot run past the first row.
    i = D.shape[0]-2
    cdef int j = D.shape[1]-2
    while j >= 0:
        # Step up for as long as the cell above is cheaper.
        while D[i, j+1] < D[i+1, j+1]:
            i -= 1
        # A cheaper cell directly to the left means column j stays unpaired.
        if D[i+1, j] < D[i+1, j+1]:
            j2i[j] = -1
        else:
            j2i[j] = i
            i -= 1
        j -= 1
--- a/spacy/gold.pyx
+++ b/spacy/gold.pyx
@ -18,7 +18,6 @@ from .compat import path2str, basestring_
 from . import util
 USE_NEW_ALIGN = False
 punct_re = re.compile(r"\W")
@ -51,59 +50,15 @@ def tags_to_entities(tags):
    return entities
 _ALIGNMENT_NORM_MAP = [("``", "'"), ("''", "'"), ('"', "'"), ("`", "'")]
 def _normalize_for_alignment(tokens):
    tokens = [w.replace(" ", "").lower() for w in tokens]
    output = []
    for token in tokens:
        token = token.replace(" ", "").lower()
        for before, after in _ALIGNMENT_NORM_MAP:
            token = token.replace(before, after)
        output.append(token)
    return output
 def _align_before_v2_2_2(tokens_a, tokens_b):
    """Calculate alignment tables between two tokenizations, using the Levenshtein
    algorithm. The alignment is case-insensitive.
    tokens_a (List[str]): The candidate tokenization.
    tokens_b (List[str]): The reference tokenization.
    RETURNS: (tuple): A 5-tuple consisting of the following information:
      * cost (int): The number of misaligned tokens.
      * a2b (List[int]): Mapping of indices in `tokens_a` to indices in `tokens_b`.
        For instance, if `a2b[4] == 6`, that means that `tokens_a[4]` aligns
        to `tokens_b[6]`. If there's no one-to-one alignment for a token,
        it has the value -1.
      * b2a (List[int]): The same as `a2b`, but mapping the other direction.
      * a2b_multi (Dict[int, int]): A dictionary mapping indices in `tokens_a`
        to indices in `tokens_b`, where multiple tokens of `tokens_a` align to
        the same token of `tokens_b`.
      * b2a_multi (Dict[int, int]): As with `a2b_multi`, but mapping the other
            direction.
    """
    from . import _align
    if tokens_a == tokens_b:
        alignment = numpy.arange(len(tokens_a))
        return 0, alignment, alignment, {}, {}
    tokens_a = [w.replace(" ", "").lower() for w in tokens_a]
    tokens_b = [w.replace(" ", "").lower() for w in tokens_b]
    cost, i2j, j2i, matrix = _align.align(tokens_a, tokens_b)
    i2j_multi, j2i_multi = _align.multi_align(i2j, j2i, [len(w) for w in tokens_a],
                                                        [len(w) for w in tokens_b])
    for i, j in list(i2j_multi.items()):
        if i2j_multi.get(i+1) != j and i2j_multi.get(i-1) != j:
            i2j[i] = j
            i2j_multi.pop(i)
    for j, i in list(j2i_multi.items()):
        if j2i_multi.get(j+1) != i and j2i_multi.get(j-1) != i:
            j2i[j] = i
            j2i_multi.pop(j)
    return cost, i2j, j2i, i2j_multi, j2i_multi
 def align(tokens_a, tokens_b):
    """Calculate alignment tables between two tokenizations.
@ -122,8 +77,6 @@ def align(tokens_a, tokens_b):
      * b2a_multi (Dict[int, int]): As with `a2b_multi`, but mapping the other
            direction.
    """
    if not USE_NEW_ALIGN:
        return _align_before_v2_2_2(tokens_a, tokens_b)
    tokens_a = _normalize_for_alignment(tokens_a)
    tokens_b = _normalize_for_alignment(tokens_b)
    cost = 0
--- a/spacy/tests/test_align.py
+++ b/spacy/tests/test_align.py
@ -1,79 +0,0 @@
 # coding: utf-8
 from __future__ import unicode_literals
 import pytest
 from spacy._align import align, multi_align
@pytest.mark.parametrize(
    "string1,string2,cost",
    [
        ("hello", "hell", 1),
        ("rat", "cat", 1),
        ("rat", "rat", 0),
        ("rat", "catsie", 4),
        ("t", "catsie", 5),
    ],
)
def test_align_costs(string1, string2, cost):
    """The first value returned by `align` is the expected edit cost."""
    measured, _, _, _ = align(string1, string2)
    assert measured == cost
@pytest.mark.parametrize(
    "string1,string2,i2j",
    [
        ("hello", "hell", [0, 1, 2, 3, -1]),
        ("rat", "cat", [0, 1, 2]),
        ("rat", "rat", [0, 1, 2]),
        ("rat", "catsie", [0, 1, 2]),
        ("t", "catsie", [2]),
    ],
)
def test_align_i2j(string1, string2, i2j):
    """The second value returned by `align` maps string1 onto string2."""
    _, source_to_target, _, _ = align(string1, string2)
    assert list(source_to_target) == i2j
@pytest.mark.parametrize(
    "string1,string2,j2i",
    [
        ("hello", "hell", [0, 1, 2, 3]),
        ("rat", "cat", [0, 1, 2]),
        ("rat", "rat", [0, 1, 2]),
        ("rat", "catsie", [0, 1, 2, -1, -1, -1]),
        ("t", "catsie", [-1, -1, 0, -1, -1, -1]),
    ],
)
def test_align_i2j_2(string1, string2, j2i):
    """The third value returned by `align` maps string2 back onto string1."""
    _, _, target_to_source, _ = align(string1, string2)
    assert list(target_to_source) == j2i
 def test_align_strings():
    """Align two word lists whose tokens are split at different places."""
    candidate = ["hello", "this", "is", "test!"]
    reference = ["hellothis", "is", "test", "!"]
    cost, i2j, j2i, _ = align(candidate, reference)
    assert cost == 4
    # Only tokens of equal length on both sides keep a one-to-one pairing.
    assert list(i2j) == [-1, -1, 1, -1]
    assert list(j2i) == [-1, 2, -1, -1]
 def test_align_many_to_one():
    """Several single-letter tokens should be merged onto two-letter tokens."""
    words1 = ["a", "b", "c", "d", "e", "f", "g", "h"]
    words2 = ["ab", "bc", "e", "fg", "h"]
    cost, i2j, j2i, matrix = align(words1, words2)
    assert list(i2j) == [-1, -1, -1, -1, 2, -1, -1, 4]
    i2j_multi, j2i_multi = multi_align(
        i2j, j2i, [len(w) for w in words1], [len(w) for w in words2]
    )
    # (index in words1, index in words2 it is merged into)
    expected_merges = [(0, 0), (1, 0), (2, 1), (3, 1), (5, 3), (6, 3)]
    for source, target in expected_merges:
        assert i2j_multi[source] == target
    # Each merged target points back at the last token of its group.
    assert j2i_multi[0] == 1
    assert j2i_multi[1] == 3
--- a/spacy/tests/test_gold.py
+++ b/spacy/tests/test_gold.py
@ -241,20 +241,6 @@ def test_ignore_misaligned(doc):
    deps = [t.dep_ for t in doc]
    heads = [t.head.i for t in doc]
    saved_use_new_align = spacy.gold.USE_NEW_ALIGN
    spacy.gold.USE_NEW_ALIGN = False
    with make_tempdir() as tmpdir:
        jsonl_file = tmpdir / "test.jsonl"
        data = [docs_to_json(doc)]
        data[0]["paragraphs"][0]["raw"] = text.replace("Sarah", "Jane")
        # write to JSONL train dicts
        srsly.write_jsonl(jsonl_file, data)
        goldcorpus = GoldCorpus(str(jsonl_file), str(jsonl_file))
    train_reloaded_example = next(goldcorpus.train_dataset(nlp))
    spacy.gold.USE_NEW_ALIGN = True
    with make_tempdir() as tmpdir:
        jsonl_file = tmpdir / "test.jsonl"
        data = [docs_to_json(doc)]
@ -280,8 +266,6 @@ def test_ignore_misaligned(doc):
                                  ignore_misaligned=True))
    assert len(train_reloaded_example) == 0
    spacy.gold.USE_NEW_ALIGN = saved_use_new_align
 def test_make_orth_variants(doc):
    nlp = English()
@ -301,14 +285,12 @@ def test_make_orth_variants(doc):
    train_goldparse = train_reloaded_example.gold
 # xfail while we have backwards-compatible alignment
@pytest.mark.xfail
@pytest.mark.parametrize(
    "tokens_a,tokens_b,expected",
    [
        (["a", "b", "c"], ["ab", "c"], (3, [-1, -1, 1], [-1, 2], {0: 0, 1: 0}, {})),
        (
-            ["a", "b", "``", "c"],
+            ["a", "b", '"', "c"],
            ['ab"', "c"],
            (4, [-1, -1, -1, 1], [-1, 3], {0: 0, 1: 0, 2: 0}, {}),
        ),