From 7865746574b6860e384a1ebcaee9234c84e37107 Mon Sep 17 00:00:00 2001
From: Matthew Honnibal <honnibal+gh@gmail.com>
Date: Sat, 24 Feb 2018 02:09:53 +0100
Subject: [PATCH] Support many-to-one alignment

---
 spacy/_align.pyx          | 67 ++++++++++++++++++++++++++++++++++++++-
 spacy/tests/test_align.py | 13 ++++++--
 2 files changed, 77 insertions(+), 3 deletions(-)
diff --git a/spacy/_align.pyx b/spacy/_align.pyx
index daab20420..83e633e77 100644
--- a/spacy/_align.pyx
+++ b/spacy/_align.pyx
@@ -90,7 +90,7 @@ from .compat import unicode_
 from murmurhash.mrmr cimport hash32
 
 
-def align(S, T):
+def align(S, T, many_to_one=False, one_to_many=False):
     cdef int m = len(S)
     cdef int n = len(T)
     cdef np.ndarray matrix = numpy.zeros((m+1, n+1), dtype='int32')
@@ -104,8 +104,73 @@ def align(S, T):
         <const int*>S_arr.data, m, <const int*>T_arr.data, n)
     fill_i2j(i2j, matrix)
     fill_j2i(j2i, matrix)
+    for i in range(i2j.shape[0]):
+        if i2j[i] >= 0 and len(S[i]) != len(T[i2j[i]]):
+            i2j[i] = -1
+    for j in range(j2i.shape[0]):
+        if j2i[j] >= 0 and len(T[j]) != len(S[j2i[j]]):
+            j2i[j] = -1
+
+    if many_to_one or one_to_many:
+        i2j_multi, j2i_multi = multi_align(i2j, j2i,
+                                [len(s) for s in S], [len(t) for t in T])
+        if many_to_one:
+            for i, j in i2j_multi.items():
+                i2j[i] = j
+        if one_to_many:
+            for j, i in j2i_multi.items():
+                j2i[j] = i
     return matrix[-1,-1], i2j, j2i, matrix
 
+
+def multi_align(np.ndarray i2j, np.ndarray j2i, i_lengths, j_lengths):
+    '''Map each run of unaligned tokens onto the single unaligned token that
+    starts at the same character offset on the other side. Let's say we had:
+
+    Guess: [aa bb cc dd]
+    Truth: [aa bbcc dd]
+    i2j: [0, -1, -1, 2]
+    j2i: [0, -1, 3]
+
+    We want:
+    i2j_multi: {1: 1, 2: 1}
+    j2i_multi: {}
+    '''
+    # list() keeps `+` a concatenation even if lengths arrive as tuple/ndarray.
+    i_starts = numpy.cumsum([0] + list(i_lengths)[:-1])
+    j_starts = numpy.cumsum([0] + list(j_lengths)[:-1])
+    i2j_miss = _get_regions(i2j, i_starts)
+    j2i_miss = _get_regions(j2i, j_starts)
+    i2j_multi = _get_mapping(i2j_miss, j2i_miss, i_lengths, j_lengths)
+    j2i_multi = _get_mapping(j2i_miss, i2j_miss, j_lengths, i_lengths)
+    return i2j_multi, j2i_multi
+
+
+def _get_regions(alignment, starts):
+    '''Group runs of consecutive negative entries, keyed by the run's start.'''
+    regions = {}
+    run = None  # list collecting the run of misses currently open, if any
+    for idx, target in enumerate(alignment):
+        if target >= 0:
+            run = None
+            continue
+        if run is None:
+            run = regions.setdefault(starts[idx], [])
+        run.append(idx)
+    return regions
+
+
+def _get_mapping(miss1, miss2, lengths1, lengths2):
+    '''Map each miss1 run onto the lone same-offset miss2 token of equal length.'''
+    output = {}
+    for start, region1 in miss1.items():
+        region2 = miss2.get(start, [])
+        span = sum(lengths1[i] for i in region1)  # characters covered by the run
+        if len(region2) == 1 and span == lengths2[region2[0]]:
+            output.update((i, region2[0]) for i in region1)
+    return output
+
+
 def _convert_sequence(seq):
     if isinstance(seq, numpy.ndarray):
         return numpy.ascontiguousarray(seq, dtype='uint32_t')
diff --git a/spacy/tests/test_align.py b/spacy/tests/test_align.py
index d1fc53c56..4f66f6669 100644
--- a/spacy/tests/test_align.py
+++ b/spacy/tests/test_align.py
@@ -43,5 +43,14 @@ def test_align_strings():
     words2 = ['hellothis', 'is', 'test', '!']
     cost, i2j, j2i, matrix = align(words1, words2)
     assert cost == 4
-    assert list(i2j) == [0, -1, 1, 2]
-    assert list(j2i) == [0, 2, 3, -1]
+    assert list(i2j) == [-1, -1, 1, -1]
+    assert list(j2i) == [-1, 2, -1, -1]
+
+def test_align_many_to_one():
+    '''With many_to_one=True, 'hello' and 'this' both map to 'hellothis'.'''
+    guess = ['hello', 'this', 'is', 'test!']
+    truth = ['hellothis', 'is', 'test', '!']
+    _, i2j, j2i, _ = align(guess, truth, many_to_one=True)
+    assert (list(i2j), list(j2i)) == ([0, 0, 1, -1], [-1, 2, -1, -1])
+
+