From 521f3610527998e3ccbd7591f1df95e66ed56350 Mon Sep 17 00:00:00 2001 From: adrianeboyd Date: Tue, 21 Apr 2020 19:31:03 +0200 Subject: [PATCH] Switch to new gold.align method (#5334) * Switch from original `_align` to new simpler alignment algorithm from #4526 * Remove alignment normalizations beyond whitespace and lowercasing --- setup.py | 1 - spacy/_align.pyx | 255 -------------------------------------- spacy/gold.pyx | 54 +------- spacy/tests/test_align.py | 79 ------------ spacy/tests/test_gold.py | 3 +- 5 files changed, 2 insertions(+), 390 deletions(-) delete mode 100644 spacy/_align.pyx delete mode 100644 spacy/tests/test_align.py diff --git a/setup.py b/setup.py index 1156e7cde..62a09aa73 100755 --- a/setup.py +++ b/setup.py @@ -31,7 +31,6 @@ PACKAGES = find_packages() MOD_NAMES = [ - "spacy._align", "spacy.parts_of_speech", "spacy.strings", "spacy.lexeme", diff --git a/spacy/_align.pyx b/spacy/_align.pyx deleted file mode 100644 index 6786ec7ba..000000000 --- a/spacy/_align.pyx +++ /dev/null @@ -1,255 +0,0 @@ -# cython: infer_types=True -'''Do Levenshtein alignment, for evaluation of tokenized input. - -Random notes: - - r i n g - 0 1 2 3 4 -r 1 0 1 2 3 -a 2 1 1 2 3 -n 3 2 2 1 2 -g 4 3 3 2 1 - -0,0: (1,1)=min(0+0,1+1,1+1)=0 S -1,0: (2,1)=min(1+1,0+1,2+1)=1 D -2,0: (3,1)=min(2+1,3+1,1+1)=2 D -3,0: (4,1)=min(3+1,4+1,2+1)=3 D -0,1: (1,2)=min(1+1,2+1,0+1)=1 D -1,1: (2,2)=min(0+1,1+1,1+1)=1 S -2,1: (3,2)=min(1+1,1+1,2+1)=2 S or I -3,1: (4,2)=min(2+1,2+1,3+1)=3 S or I -0,2: (1,3)=min(2+1,3+1,1+1)=2 I -1,2: (2,3)=min(1+1,2+1,1+1)=2 S or I -2,2: (3,3) -3,2: (4,3) -At state (i, j) we're asking "How do I transform S[:i+1] to T[:j+1]?" - -We know the costs to transition: - -S[:i] -> T[:j] (at D[i,j]) -S[:i+1] -> T[:j] (at D[i+1,j]) -S[:i] -> T[:j+1] (at D[i,j+1]) - -Further, now we can transform: -S[:i+1] -> S[:i] (DEL) for 1, -T[:j+1] -> T[:j] (INS) for 1. -S[i+1] -> T[j+1] (SUB) for 0 or 1 - -Therefore we have the costs: -SUB: Cost(S[:i]->T[:j]) + Cost(S[i]->S[j]) -i.e. D[i, j] + S[i+1] != T[j+1] -INS: Cost(S[:i+1]->T[:j]) + Cost(T[:j+1]->T[:j]) -i.e. D[i+1,j] + 1 -DEL: Cost(S[:i]->T[:j+1]) + Cost(S[:i+1]->S[:i]) -i.e. D[i,j+1] + 1 - - Source string S has length m, with index i - Target string T has length n, with index j - - Output two alignment vectors: i2j (length m) and j2i (length n) - # function LevenshteinDistance(char s[1..m], char t[1..n]): - # for all i and j, d[i,j] will hold the Levenshtein distance between - # the first i characters of s and the first j characters of t - # note that d has (m+1)*(n+1) values - # set each element in d to zero - ring rang - - r i n g - - 0 0 0 0 0 - r 0 0 0 0 0 - a 0 0 0 0 0 - n 0 0 0 0 0 - g 0 0 0 0 0 - - # source prefixes can be transformed into empty string by - # dropping all characters - # d[i, 0] := i - ring rang - - r i n g - - 0 0 0 0 0 - r 1 0 0 0 0 - a 2 0 0 0 0 - n 3 0 0 0 0 - g 4 0 0 0 0 - - # target prefixes can be reached from empty source prefix - # by inserting every character - # d[0, j] := j - - r i n g - - 0 1 2 3 4 - r 1 0 0 0 0 - a 2 0 0 0 0 - n 3 0 0 0 0 - g 4 0 0 0 0 - -''' -from __future__ import unicode_literals -from libc.stdint cimport uint32_t -import numpy -cimport numpy as np -from .compat import unicode_ -from murmurhash.mrmr cimport hash32 - - -def align(S, T): - cdef int m = len(S) - cdef int n = len(T) - cdef np.ndarray matrix = numpy.zeros((m+1, n+1), dtype='int32') - cdef np.ndarray i2j = numpy.zeros((m,), dtype='i') - cdef np.ndarray j2i = numpy.zeros((n,), dtype='i') - - cdef np.ndarray S_arr = _convert_sequence(S) - cdef np.ndarray T_arr = _convert_sequence(T) - - fill_matrix(matrix.data, - S_arr.data, m, T_arr.data, n) - fill_i2j(i2j, matrix) - fill_j2i(j2i, matrix) - for i in range(i2j.shape[0]): - if i2j[i] >= 0 and len(S[i]) != len(T[i2j[i]]): - i2j[i] = -1 - for j in range(j2i.shape[0]): - if j2i[j] >= 0 and len(T[j]) != len(S[j2i[j]]): - j2i[j] = -1 - return matrix[-1,-1], i2j, j2i, matrix - - -def multi_align(np.ndarray i2j, np.ndarray j2i, i_lengths, j_lengths): - '''Let's say we had: - - Guess: [aa bb cc dd] - Truth: [aa bbcc dd] - i2j: [0, None, -2, 2] - j2i: [0, -2, 3] - - We want: - - i2j_multi: {1: 1, 2: 1} - j2i_multi: {} - ''' - i2j_miss = _get_regions(i2j, i_lengths) - j2i_miss = _get_regions(j2i, j_lengths) - - i2j_multi, j2i_multi = _get_mapping(i2j_miss, j2i_miss, i_lengths, j_lengths) - return i2j_multi, j2i_multi - - -def _get_regions(alignment, lengths): - regions = {} - start = None - offset = 0 - for i in range(len(alignment)): - if alignment[i] < 0: - if start is None: - start = offset - regions.setdefault(start, []) - regions[start].append(i) - else: - start = None - offset += lengths[i] - return regions - - -def _get_mapping(miss1, miss2, lengths1, lengths2): - i2j = {} - j2i = {} - for start, region1 in miss1.items(): - if not region1 or start not in miss2: - continue - region2 = miss2[start] - if sum(lengths1[i] for i in region1) == sum(lengths2[i] for i in region2): - j = region2.pop(0) - buff = [] - # Consume tokens from region 1, until we meet the length of the - # first token in region2. If we do, align the tokens. If - # we exceed the length, break. - while region1: - buff.append(region1.pop(0)) - if sum(lengths1[i] for i in buff) == lengths2[j]: - for i in buff: - i2j[i] = j - j2i[j] = buff[-1] - j += 1 - buff = [] - elif sum(lengths1[i] for i in buff) > lengths2[j]: - break - else: - if buff and sum(lengths1[i] for i in buff) == lengths2[j]: - for i in buff: - i2j[i] = j - j2i[j] = buff[-1] - return i2j, j2i - - -def _convert_sequence(seq): - if isinstance(seq, numpy.ndarray): - return numpy.ascontiguousarray(seq, dtype='uint32_t') - cdef np.ndarray output = numpy.zeros((len(seq),), dtype='uint32') - cdef bytes item_bytes - for i, item in enumerate(seq): - if item == "``": - item = '"' - elif item == "''": - item = '"' - if isinstance(item, unicode): - item_bytes = item.encode('utf8') - else: - item_bytes = item - output[i] = hash32(item_bytes, len(item_bytes), 0) - return output - - -cdef void fill_matrix(int* D, - const int* S, int m, const int* T, int n) nogil: - m1 = m+1 - n1 = n+1 - for i in range(m1*n1): - D[i] = 0 - - for i in range(m1): - D[i*n1] = i - - for j in range(n1): - D[j] = j - - cdef int sub_cost, ins_cost, del_cost - for j in range(n): - for i in range(m): - i_j = i*n1 + j - i1_j1 = (i+1)*n1 + j+1 - i1_j = (i+1)*n1 + j - i_j1 = i*n1 + j+1 - if S[i] != T[j]: - sub_cost = D[i_j] + 1 - else: - sub_cost = D[i_j] - del_cost = D[i_j1] + 1 - ins_cost = D[i1_j] + 1 - best = min(min(sub_cost, ins_cost), del_cost) - D[i1_j1] = best - - -cdef void fill_i2j(np.ndarray i2j, np.ndarray D) except *: - j = D.shape[1]-2 - cdef int i = D.shape[0]-2 - while i >= 0: - while D[i+1, j] < D[i+1, j+1]: - j -= 1 - if D[i, j+1] < D[i+1, j+1]: - i2j[i] = -1 - else: - i2j[i] = j - j -= 1 - i -= 1 - -cdef void fill_j2i(np.ndarray j2i, np.ndarray D) except *: - i = D.shape[0]-2 - cdef int j = D.shape[1]-2 - while j >= 0: - while D[i, j+1] < D[i+1, j+1]: - i -= 1 - if D[i+1, j] < D[i+1, j+1]: - j2i[j] = -1 - else: - j2i[j] = i - i -= 1 - j -= 1 diff --git a/spacy/gold.pyx b/spacy/gold.pyx index 07fd3bdd0..a41f06898 100644 --- a/spacy/gold.pyx +++ b/spacy/gold.pyx @@ -21,7 +21,6 @@ from .util import minibatch, itershuffle from libc.stdio cimport FILE, fopen, fclose, fread, fwrite, feof, fseek -USE_NEW_ALIGN = False punct_re = re.compile(r"\W") @@ -73,57 +72,8 @@ def merge_sents(sents): return [(m_deps, (m_cats, m_brackets))] -_ALIGNMENT_NORM_MAP = [("``", "'"), ("''", "'"), ('"', "'"), ("`", "'")] - - def _normalize_for_alignment(tokens): - tokens = [w.replace(" ", "").lower() for w in tokens] - output = [] - for token in tokens: - token = token.replace(" ", "").lower() - for before, after in _ALIGNMENT_NORM_MAP: - token = token.replace(before, after) - output.append(token) - return output - - -def _align_before_v2_2_2(tokens_a, tokens_b): - """Calculate alignment tables between two tokenizations, using the Levenshtein - algorithm. The alignment is case-insensitive. - - tokens_a (List[str]): The candidate tokenization. - tokens_b (List[str]): The reference tokenization. - RETURNS: (tuple): A 5-tuple consisting of the following information: - * cost (int): The number of misaligned tokens. - * a2b (List[int]): Mapping of indices in `tokens_a` to indices in `tokens_b`. - For instance, if `a2b[4] == 6`, that means that `tokens_a[4]` aligns - to `tokens_b[6]`. If there's no one-to-one alignment for a token, - it has the value -1. - * b2a (List[int]): The same as `a2b`, but mapping the other direction. - * a2b_multi (Dict[int, int]): A dictionary mapping indices in `tokens_a` - to indices in `tokens_b`, where multiple tokens of `tokens_a` align to - the same token of `tokens_b`. - * b2a_multi (Dict[int, int]): As with `a2b_multi`, but mapping the other - direction. - """ - from . import _align - if tokens_a == tokens_b: - alignment = numpy.arange(len(tokens_a)) - return 0, alignment, alignment, {}, {} - tokens_a = [w.replace(" ", "").lower() for w in tokens_a] - tokens_b = [w.replace(" ", "").lower() for w in tokens_b] - cost, i2j, j2i, matrix = _align.align(tokens_a, tokens_b) - i2j_multi, j2i_multi = _align.multi_align(i2j, j2i, [len(w) for w in tokens_a], - [len(w) for w in tokens_b]) - for i, j in list(i2j_multi.items()): - if i2j_multi.get(i+1) != j and i2j_multi.get(i-1) != j: - i2j[i] = j - i2j_multi.pop(i) - for j, i in list(j2i_multi.items()): - if j2i_multi.get(j+1) != i and j2i_multi.get(j-1) != i: - j2i[j] = i - j2i_multi.pop(j) - return cost, i2j, j2i, i2j_multi, j2i_multi + return [w.replace(" ", "").lower() for w in tokens] def align(tokens_a, tokens_b): @@ -144,8 +94,6 @@ def align(tokens_a, tokens_b): * b2a_multi (Dict[int, int]): As with `a2b_multi`, but mapping the other direction. """ - if not USE_NEW_ALIGN: - return _align_before_v2_2_2(tokens_a, tokens_b) tokens_a = _normalize_for_alignment(tokens_a) tokens_b = _normalize_for_alignment(tokens_b) cost = 0 diff --git a/spacy/tests/test_align.py b/spacy/tests/test_align.py deleted file mode 100644 index d6bbab04e..000000000 --- a/spacy/tests/test_align.py +++ /dev/null @@ -1,79 +0,0 @@ -# coding: utf-8 -from __future__ import unicode_literals - -import pytest -from spacy._align import align, multi_align - - -@pytest.mark.parametrize( - "string1,string2,cost", - [ - ("hello", "hell", 1), - ("rat", "cat", 1), - ("rat", "rat", 0), - ("rat", "catsie", 4), - ("t", "catsie", 5), - ], -) -def test_align_costs(string1, string2, cost): - output_cost, i2j, j2i, matrix = align(string1, string2) - assert output_cost == cost - - -@pytest.mark.parametrize( - "string1,string2,i2j", - [ - ("hello", "hell", [0, 1, 2, 3, -1]), - ("rat", "cat", [0, 1, 2]), - ("rat", "rat", [0, 1, 2]), - ("rat", "catsie", [0, 1, 2]), - ("t", "catsie", [2]), - ], -) -def test_align_i2j(string1, string2, i2j): - output_cost, output_i2j, j2i, matrix = align(string1, string2) - assert list(output_i2j) == i2j - - -@pytest.mark.parametrize( - "string1,string2,j2i", - [ - ("hello", "hell", [0, 1, 2, 3]), - ("rat", "cat", [0, 1, 2]), - ("rat", "rat", [0, 1, 2]), - ("rat", "catsie", [0, 1, 2, -1, -1, -1]), - ("t", "catsie", [-1, -1, 0, -1, -1, -1]), - ], -) -def test_align_i2j_2(string1, string2, j2i): - output_cost, output_i2j, output_j2i, matrix = align(string1, string2) - assert list(output_j2i) == j2i - - -def test_align_strings(): - words1 = ["hello", "this", "is", "test!"] - words2 = ["hellothis", "is", "test", "!"] - cost, i2j, j2i, matrix = align(words1, words2) - assert cost == 4 - assert list(i2j) == [-1, -1, 1, -1] - assert list(j2i) == [-1, 2, -1, -1] - - -def test_align_many_to_one(): - words1 = ["a", "b", "c", "d", "e", "f", "g", "h"] - words2 = ["ab", "bc", "e", "fg", "h"] - cost, i2j, j2i, matrix = align(words1, words2) - assert list(i2j) == [-1, -1, -1, -1, 2, -1, -1, 4] - lengths1 = [len(w) for w in words1] - lengths2 = [len(w) for w in words2] - i2j_multi, j2i_multi = multi_align(i2j, j2i, lengths1, lengths2) - assert i2j_multi[0] == 0 - assert i2j_multi[1] == 0 - assert i2j_multi[2] == 1 - assert i2j_multi[3] == 1 - assert i2j_multi[3] == 1 - assert i2j_multi[5] == 3 - assert i2j_multi[6] == 3 - - assert j2i_multi[0] == 1 - assert j2i_multi[1] == 3 diff --git a/spacy/tests/test_gold.py b/spacy/tests/test_gold.py index fbdb3155b..b546e079b 100644 --- a/spacy/tests/test_gold.py +++ b/spacy/tests/test_gold.py @@ -177,13 +177,12 @@ def test_roundtrip_docs_to_json(): assert cats["BAKING"] == goldparse.cats["BAKING"] -@pytest.mark.skip(reason="skip while we have backwards-compatible alignment") @pytest.mark.parametrize( "tokens_a,tokens_b,expected", [ (["a", "b", "c"], ["ab", "c"], (3, [-1, -1, 1], [-1, 2], {0: 0, 1: 0}, {})), ( - ["a", "b", "``", "c"], + ["a", "b", '"', "c"], ['ab"', "c"], (4, [-1, -1, -1, 1], [-1, 3], {0: 0, 1: 0, 2: 0}, {}), ),