* Add functions for Levenshtein distance alignment

2025-08-03 11:50:19 +03:00 · 2015-05-24 21:50:48 +02:00 · 2015-05-24 21:50:48 +02:00 · 3593babd35
commit 3593babd35
parent 744f06abf5
1 changed files with 82 additions and 1 deletions
--- a/spacy/gold.pyx
+++ b/spacy/gold.pyx
@ -2,11 +2,92 @@ import numpy
 import codecs
 import json
 import random
-from .munge.alignment import align
+import re

 from libc.string cimport memset


+def align(cand_words, gold_words):
+    cost, edit_path = _min_edit_path(cand_words, gold_words)
+    alignment = []
+    i_of_gold = 0
+    for move in edit_path:
+        if move == 'M':
+            alignment.append(i_of_gold)
+            i_of_gold += 1
+        elif move == 'S':
+            alignment.append(None)
+            i_of_gold += 1
+        elif move == 'D':
+            alignment.append(None)
+        elif move == 'I':
+            i_of_gold += 1
+        else:
+            raise Exception(move)
+    return alignment
+
+
+punct_re = re.compile(r'\W')
+def _min_edit_path(cand_words, gold_words):
+    cdef:
+        Pool mem
+        int i, j, n_cand, n_gold
+        int* curr_costs
+        int* prev_costs
+
+    # TODO: Fix this --- just do it properly, make the full edit matrix and
+    # then walk back over it...
+    mem = Pool()
+    # Preprocess inputs
+    cand_words = [punct_re.sub('', w) for w in cand_words] 
+    gold_words = [punct_re.sub('', w) for w in gold_words] 
+
+    n_cand = len(cand_words)
+    n_gold = len(gold_words)
+    # Levenshtein distance, except we need the history, and we may want different
+    # costs.
+    # Mark operations with a string, and score the history using _edit_cost.
+    previous_row = []
+    prev_costs = <int*>mem.alloc(n_gold + 1, sizeof(int))
+    curr_costs = <int*>mem.alloc(n_gold + 1, sizeof(int))
+    for i in range(n_gold + 1):
+        cell = ''
+        for j in range(i):
+            cell += 'I'
+        previous_row.append('I' * i)
+        prev_costs[i] = i
+    for i, cand in enumerate(cand_words):
+        current_row = ['D' * (i + 1)]
+        curr_costs[0] = i+1
+        for j, gold in enumerate(gold_words):
+            if gold.lower() == cand.lower():
+                s_cost = prev_costs[j]
+                i_cost = curr_costs[j] + 1
+                d_cost = prev_costs[j + 1] + 1
+            else:
+                s_cost = prev_costs[j] + 1
+                i_cost = curr_costs[j] + 1
+                d_cost = prev_costs[j + 1] + (1 if cand else 0)
+
+            if s_cost <= i_cost and s_cost <= d_cost:
+                best_cost = s_cost
+                best_hist = previous_row[j] + ('M' if gold == cand else 'S')
+            elif i_cost <= s_cost and i_cost <= d_cost:
+                best_cost = i_cost
+                best_hist = current_row[j] + 'I'
+            else:
+                best_cost = d_cost
+                best_hist = previous_row[j + 1] + 'D'
+            
+            current_row.append(best_hist)
+            curr_costs[j+1] = best_cost
+        previous_row = current_row
+        for j in range(len(gold_words) + 1):
+            prev_costs[j] = curr_costs[j]
+            curr_costs[j] = 0
+
+    return prev_costs[n_gold], previous_row[-1]
+
 def read_json_file(loc):
    paragraphs = []
    for doc in json.load(open(loc)):