mirror of
https://github.com/explosion/spaCy.git
synced 2024-11-10 19:57:17 +03:00
* Add functions for Levenshtein distance alignment
This commit is contained in:
parent
744f06abf5
commit
3593babd35
|
@ -2,11 +2,92 @@ import numpy
|
|||
import codecs
|
||||
import json
|
||||
import random
|
||||
from .munge.alignment import align
|
||||
import re
|
||||
|
||||
from libc.string cimport memset
|
||||
|
||||
|
||||
def align(cand_words, gold_words):
|
||||
cost, edit_path = _min_edit_path(cand_words, gold_words)
|
||||
alignment = []
|
||||
i_of_gold = 0
|
||||
for move in edit_path:
|
||||
if move == 'M':
|
||||
alignment.append(i_of_gold)
|
||||
i_of_gold += 1
|
||||
elif move == 'S':
|
||||
alignment.append(None)
|
||||
i_of_gold += 1
|
||||
elif move == 'D':
|
||||
alignment.append(None)
|
||||
elif move == 'I':
|
||||
i_of_gold += 1
|
||||
else:
|
||||
raise Exception(move)
|
||||
return alignment
|
||||
|
||||
|
||||
punct_re = re.compile(r'\W')
|
||||
def _min_edit_path(cand_words, gold_words):
|
||||
cdef:
|
||||
Pool mem
|
||||
int i, j, n_cand, n_gold
|
||||
int* curr_costs
|
||||
int* prev_costs
|
||||
|
||||
# TODO: Fix this --- just do it properly, make the full edit matrix and
|
||||
# then walk back over it...
|
||||
mem = Pool()
|
||||
# Preprocess inputs
|
||||
cand_words = [punct_re.sub('', w) for w in cand_words]
|
||||
gold_words = [punct_re.sub('', w) for w in gold_words]
|
||||
|
||||
n_cand = len(cand_words)
|
||||
n_gold = len(gold_words)
|
||||
# Levenshtein distance, except we need the history, and we may want different
|
||||
# costs.
|
||||
# Mark operations with a string, and score the history using _edit_cost.
|
||||
previous_row = []
|
||||
prev_costs = <int*>mem.alloc(n_gold + 1, sizeof(int))
|
||||
curr_costs = <int*>mem.alloc(n_gold + 1, sizeof(int))
|
||||
for i in range(n_gold + 1):
|
||||
cell = ''
|
||||
for j in range(i):
|
||||
cell += 'I'
|
||||
previous_row.append('I' * i)
|
||||
prev_costs[i] = i
|
||||
for i, cand in enumerate(cand_words):
|
||||
current_row = ['D' * (i + 1)]
|
||||
curr_costs[0] = i+1
|
||||
for j, gold in enumerate(gold_words):
|
||||
if gold.lower() == cand.lower():
|
||||
s_cost = prev_costs[j]
|
||||
i_cost = curr_costs[j] + 1
|
||||
d_cost = prev_costs[j + 1] + 1
|
||||
else:
|
||||
s_cost = prev_costs[j] + 1
|
||||
i_cost = curr_costs[j] + 1
|
||||
d_cost = prev_costs[j + 1] + (1 if cand else 0)
|
||||
|
||||
if s_cost <= i_cost and s_cost <= d_cost:
|
||||
best_cost = s_cost
|
||||
best_hist = previous_row[j] + ('M' if gold == cand else 'S')
|
||||
elif i_cost <= s_cost and i_cost <= d_cost:
|
||||
best_cost = i_cost
|
||||
best_hist = current_row[j] + 'I'
|
||||
else:
|
||||
best_cost = d_cost
|
||||
best_hist = previous_row[j + 1] + 'D'
|
||||
|
||||
current_row.append(best_hist)
|
||||
curr_costs[j+1] = best_cost
|
||||
previous_row = current_row
|
||||
for j in range(len(gold_words) + 1):
|
||||
prev_costs[j] = curr_costs[j]
|
||||
curr_costs[j] = 0
|
||||
|
||||
return prev_costs[n_gold], previous_row[-1]
|
||||
|
||||
def read_json_file(loc):
|
||||
paragraphs = []
|
||||
for doc in json.load(open(loc)):
|
||||
|
|
Loading…
Reference in New Issue
Block a user