From 7865746574b6860e384a1ebcaee9234c84e37107 Mon Sep 17 00:00:00 2001
From: Matthew Honnibal
Date: Sat, 24 Feb 2018 02:09:53 +0100
Subject: [PATCH] Support many-to-one alignment

---
Note: line breaks were restored from a whitespace-collapsed copy of this
patch. One added line differs from that copy: _get_mapping now requires the
summed lengths of the merged tokens to equal the length of the single target
token (the copy only tested that the sum was non-zero).

 spacy/_align.pyx          | 67 ++++++++++++++++++++++++++++++++++++++-
 spacy/tests/test_align.py | 13 ++++++--
 2 files changed, 77 insertions(+), 3 deletions(-)

diff --git a/spacy/_align.pyx b/spacy/_align.pyx
index daab20420..83e633e77 100644
--- a/spacy/_align.pyx
+++ b/spacy/_align.pyx
@@ -90,7 +90,7 @@ from .compat import unicode_
 from murmurhash.mrmr cimport hash32
 
 
-def align(S, T):
+def align(S, T, many_to_one=False, one_to_many=False):
     cdef int m = len(S)
     cdef int n = len(T)
     cdef np.ndarray matrix = numpy.zeros((m+1, n+1), dtype='int32')
@@ -104,8 +104,73 @@ def align(S, T):
         S_arr.data, m, T_arr.data, n)
     fill_i2j(i2j, matrix)
     fill_j2i(j2i, matrix)
+    for i in range(i2j.shape[0]):
+        if i2j[i] >= 0 and len(S[i]) != len(T[i2j[i]]):
+            i2j[i] = -1
+    for j in range(j2i.shape[0]):
+        if j2i[j] >= 0 and len(T[j]) != len(S[j2i[j]]):
+            j2i[j] = -1
+
+    if many_to_one or one_to_many:
+        i2j_multi, j2i_multi = multi_align(i2j, j2i,
+            [len(s) for s in S], [len(t) for t in T])
+    if many_to_one:
+        for i, j in i2j_multi.items():
+            i2j[i] = j
+    if one_to_many:
+        for j, i in j2i_multi.items():
+            j2i[j] = i
     return matrix[-1,-1], i2j, j2i, matrix
 
+
+def multi_align(np.ndarray i2j, np.ndarray j2i, i_lengths, j_lengths):
+    '''Let's say we had:
+
+    Guess: [aa bb cc dd]
+    Truth: [aa bbcc dd]
+    i2j: [0, None, -2, 2]
+    j2i: [0, -2, 3]
+
+    We want:
+
+    i2j_multi: {1: 1, 2: 1}
+    j2i_multi: {}
+    '''
+    i_starts = numpy.cumsum([0] + i_lengths[:-1])
+    j_starts = numpy.cumsum([0] + j_lengths[:-1])
+    i2j_miss = _get_regions(i2j, i_starts)
+    j2i_miss = _get_regions(j2i, j_starts)
+
+    i2j_multi = _get_mapping(i2j_miss, j2i_miss, i_lengths, j_lengths)
+    j2i_multi = _get_mapping(j2i_miss, i2j_miss, j_lengths, i_lengths)
+    return i2j_multi, j2i_multi
+
+
+def _get_regions(alignment, starts):
+    regions = {}
+    start = None
+    for i in range(len(alignment)):
+        if alignment[i] < 0:
+            if start is None:
+                start = starts[i]
+                regions.setdefault(start, [])
+            regions[start].append(i)
+        else:
+            start = None
+    return regions
+
+
+def _get_mapping(miss1, miss2, lengths1, lengths2):
+    output = {}
+    for start, region1 in miss1.items():
+        region2 = miss2.get(start, [])
+        if len(region2) == 1:
+            if sum(lengths1[i] for i in region1) == lengths2[region2[0]]:
+                for i in region1:
+                    output[i] = region2[0]
+    return output
+
+
 def _convert_sequence(seq):
     if isinstance(seq, numpy.ndarray):
         return numpy.ascontiguousarray(seq, dtype='uint32_t')
diff --git a/spacy/tests/test_align.py b/spacy/tests/test_align.py
index d1fc53c56..4f66f6669 100644
--- a/spacy/tests/test_align.py
+++ b/spacy/tests/test_align.py
@@ -43,5 +43,14 @@ def test_align_strings():
     words2 = ['hellothis', 'is', 'test', '!']
     cost, i2j, j2i, matrix = align(words1, words2)
     assert cost == 4
-    assert list(i2j) == [0, -1, 1, 2]
-    assert list(j2i) == [0, 2, 3, -1]
+    assert list(i2j) == [-1, -1, 1, -1]
+    assert list(j2i) == [-1, 2, -1, -1]
+
+def test_align_many_to_one():
+    words1 = ['hello', 'this', 'is', 'test!']
+    words2 = ['hellothis', 'is', 'test', '!']
+    cost, i2j, j2i, matrix = align(words1, words2, many_to_one=True)
+    assert list(i2j) == [0, 0, 1, -1]
+    assert list(j2i) == [-1, 2, -1, -1]
+
+