Replace old gold alignment with new gold alignment (#4710)

Replace old gold alignment that allowed for some noise in the alignment between raw and orth with the new simpler alignment that requires that the raw and orth strings are identical except for whitespace and capitalization.

* Replace old alignment with new alignment, removing `_align.pyx` and
its tests
* Remove all quote normalizations
* Enable test for new align
  * Modify test case for quote normalization
This commit is contained in:
adrianeboyd 2019-11-25 23:13:26 +01:00 committed by Matthew Honnibal
parent 392c4880d9
commit 0c9640ced3
5 changed files with 1 additions and 401 deletions

View File

@ -31,7 +31,6 @@ PACKAGES = find_packages()
MOD_NAMES = [
"spacy._align",
"spacy.parts_of_speech",
"spacy.strings",
"spacy.lexeme",

View File

@ -1,255 +0,0 @@
# cython: infer_types=True
'''Do Levenshtein alignment, for evaluation of tokenized input.
Random notes:
r i n g
0 1 2 3 4
r 1 0 1 2 3
a 2 1 1 2 3
n 3 2 2 1 2
g 4 3 3 2 1
0,0: (1,1)=min(0+0,1+1,1+1)=0 S
1,0: (2,1)=min(1+1,0+1,2+1)=1 D
2,0: (3,1)=min(2+1,3+1,1+1)=2 D
3,0: (4,1)=min(3+1,4+1,2+1)=3 D
0,1: (1,2)=min(1+1,2+1,0+1)=1 D
1,1: (2,2)=min(0+1,1+1,1+1)=1 S
2,1: (3,2)=min(1+1,1+1,2+1)=2 S or I
3,1: (4,2)=min(2+1,2+1,3+1)=3 S or I
0,2: (1,3)=min(2+1,3+1,1+1)=2 I
1,2: (2,3)=min(1+1,2+1,1+1)=2 S or I
2,2: (3,3)
3,2: (4,3)
At state (i, j) we're asking "How do I transform S[:i+1] to T[:j+1]?"
We know the costs to transition:
S[:i] -> T[:j] (at D[i,j])
S[:i+1] -> T[:j] (at D[i+1,j])
S[:i] -> T[:j+1] (at D[i,j+1])
Further, we know we can transform:
S[:i+1] -> S[:i] (DEL) for 1,
T[:j+1] -> T[:j] (INS) for 1.
S[i+1] -> T[j+1] (SUB) for 0 or 1
Therefore we have the costs:
SUB: Cost(S[:i]->T[:j]) + Cost(S[i+1]->T[j+1])
i.e. D[i, j] + S[i+1] != T[j+1]
INS: Cost(S[:i+1]->T[:j]) + Cost(T[:j+1]->T[:j])
i.e. D[i+1,j] + 1
DEL: Cost(S[:i]->T[:j+1]) + Cost(S[:i+1]->S[:i])
i.e. D[i,j+1] + 1
Source string S has length m, with index i
Target string T has length n, with index j
Output two alignment vectors: i2j (length m) and j2i (length n)
# function LevenshteinDistance(char s[1..m], char t[1..n]):
# for all i and j, d[i,j] will hold the Levenshtein distance between
# the first i characters of s and the first j characters of t
# note that d has (m+1)*(n+1) values
# set each element in d to zero
ring rang
- r i n g
- 0 0 0 0 0
r 0 0 0 0 0
a 0 0 0 0 0
n 0 0 0 0 0
g 0 0 0 0 0
# source prefixes can be transformed into empty string by
# dropping all characters
# d[i, 0] := i
ring rang
- r i n g
- 0 0 0 0 0
r 1 0 0 0 0
a 2 0 0 0 0
n 3 0 0 0 0
g 4 0 0 0 0
# target prefixes can be reached from empty source prefix
# by inserting every character
# d[0, j] := j
- r i n g
- 0 1 2 3 4
r 1 0 0 0 0
a 2 0 0 0 0
n 3 0 0 0 0
g 4 0 0 0 0
'''
from __future__ import unicode_literals
from libc.stdint cimport uint32_t
import numpy
cimport numpy as np
from .compat import unicode_
from murmurhash.mrmr cimport hash32
def align(S, T):
    '''Align two sequences with a Levenshtein edit-distance table.

    S: The source sequence (length m).
    T: The target sequence (length n).
    RETURNS (tuple): ``(cost, i2j, j2i, matrix)`` where ``cost`` is the
        bottom-right cell of the table, ``i2j`` has one entry per item of S,
        ``j2i`` has one entry per item of T, and ``matrix`` is the full
        (m+1) x (n+1) table. An entry of -1 marks an item without a
        one-to-one partner.
    '''
    cdef int m = len(S)
    cdef int n = len(T)
    # Items are compared through their uint32 representations, not directly.
    cdef np.ndarray S_arr = _convert_sequence(S)
    cdef np.ndarray T_arr = _convert_sequence(T)
    cdef np.ndarray matrix = numpy.zeros((m+1, n+1), dtype='int32')
    cdef np.ndarray i2j = numpy.zeros((m,), dtype='i')
    cdef np.ndarray j2i = numpy.zeros((n,), dtype='i')
    fill_matrix(
        <int*>matrix.data,
        <const int*>S_arr.data, m,
        <const int*>T_arr.data, n,
    )
    fill_i2j(i2j, matrix)
    fill_j2i(j2i, matrix)
    # Drop any pairing whose two items do not have the same length.
    for src in range(i2j.shape[0]):
        if i2j[src] < 0:
            continue
        if len(S[src]) != len(T[i2j[src]]):
            i2j[src] = -1
    for tgt in range(j2i.shape[0]):
        if j2i[tgt] < 0:
            continue
        if len(T[tgt]) != len(S[j2i[tgt]]):
            j2i[tgt] = -1
    cost = matrix[-1, -1]
    return cost, i2j, j2i, matrix
def multi_align(np.ndarray i2j, np.ndarray j2i, i_lengths, j_lengths):
    '''Recover many-to-one alignments inside the unaligned stretches.

    Runs of negative entries in ``i2j`` and ``j2i`` are collected per side
    and keyed by the cumulative length that precedes them. Runs from both
    sides that share a key and a total length are then matched up.

    i2j (ndarray): One-to-one alignment from the first to the second
        sequence; negative entries are treated as unaligned.
    j2i (ndarray): The same, in the other direction.
    i_lengths: Length of each item of the first sequence.
    j_lengths: Length of each item of the second sequence.
    RETURNS (tuple): ``(i2j_multi, j2i_multi)``, two dicts. ``i2j_multi``
        maps an index of the first sequence to the index of the item of the
        second sequence it is part of; ``j2i_multi`` maps that item back to
        the last index of the group that forms it.
    '''
    unaligned_i = _get_regions(i2j, i_lengths)
    unaligned_j = _get_regions(j2i, j_lengths)
    return _get_mapping(unaligned_i, unaligned_j, i_lengths, j_lengths)
def _get_regions(alignment, lengths):
regions = {}
start = None
offset = 0
for i in range(len(alignment)):
if alignment[i] < 0:
if start is None:
start = offset
regions.setdefault(start, [])
regions[start].append(i)
else:
start = None
offset += lengths[i]
return regions
def _get_mapping(miss1, miss2, lengths1, lengths2):
i2j = {}
j2i = {}
for start, region1 in miss1.items():
if not region1 or start not in miss2:
continue
region2 = miss2[start]
if sum(lengths1[i] for i in region1) == sum(lengths2[i] for i in region2):
j = region2.pop(0)
buff = []
# Consume tokens from region 1, until we meet the length of the
# first token in region2. If we do, align the tokens. If
# we exceed the length, break.
while region1:
buff.append(region1.pop(0))
if sum(lengths1[i] for i in buff) == lengths2[j]:
for i in buff:
i2j[i] = j
j2i[j] = buff[-1]
j += 1
buff = []
elif sum(lengths1[i] for i in buff) > lengths2[j]:
break
else:
if buff and sum(lengths1[i] for i in buff) == lengths2[j]:
for i in buff:
i2j[i] = j
j2i[j] = buff[-1]
return i2j, j2i
def _convert_sequence(seq):
    '''Convert a sequence into a uint32 array that can be compared cheaply.

    seq: Either a numpy array, which is returned as a contiguous uint32
        array, or a sequence of unicode/bytes items, each of which is
        replaced by its 32-bit murmurhash.
    RETURNS (ndarray): A uint32 array with one entry per item of ``seq``.
    '''
    if isinstance(seq, numpy.ndarray):
        # 'uint32_t' is a C type name, not a numpy dtype string; numpy
        # rejects it with a TypeError, so use the numpy spelling.
        return numpy.ascontiguousarray(seq, dtype='uint32')
    cdef np.ndarray output = numpy.zeros((len(seq),), dtype='uint32')
    cdef bytes item_bytes
    for i, item in enumerate(seq):
        # Treat both PTB-style quote tokens as a plain double quote, so that
        # they hash identically to '"'.
        if item == "``" or item == "''":
            item = '"'
        if isinstance(item, unicode):
            item_bytes = item.encode('utf8')
        else:
            item_bytes = item
        output[i] = hash32(<void*><char*>item_bytes, len(item_bytes), 0)
    return output
# Fill the edit-distance table D in place for source S (length m) and
# target T (length n).
#
# D is a flat buffer of (m+1)*(n+1) ints; the cell for row r and column c
# is stored at D[r*n1 + c], with n1 = n+1. Items of S and T are only ever
# compared for equality.
cdef void fill_matrix(int* D,
const int* S, int m, const int* T, int n) nogil:
m1 = m+1
n1 = n+1
# Reset every cell before filling.
for i in range(m1*n1):
D[i] = 0
# First column: row i holds i.
for i in range(m1):
D[i*n1] = i
# First row: column j holds j.
for j in range(n1):
D[j] = j
cdef int sub_cost, ins_cost, del_cost
for j in range(n):
for i in range(m):
# Flat offsets of the four cells involved in this step:
# (i, j), (i+1, j+1), (i+1, j) and (i, j+1).
i_j = i*n1 + j
i1_j1 = (i+1)*n1 + j+1
i1_j = (i+1)*n1 + j
i_j1 = i*n1 + j+1
# Substitution costs 1 only if the two items differ.
if S[i] != T[j]:
sub_cost = D[i_j] + 1
else:
sub_cost = D[i_j]
del_cost = D[i_j1] + 1
ins_cost = D[i1_j] + 1
# Cell (i+1, j+1) takes the cheapest of the three options.
best = min(min(sub_cost, ins_cost), del_cost)
D[i1_j1] = best
# Populate i2j from the filled table D by walking back from the
# bottom-right cell. One entry is written per i, from D.shape[0]-2 down
# to 0: either a column index j or -1.
# NOTE(review): nothing here stops j from dropping below 0, in which case
# it would be used as a negative index into D — confirm the intended
# behaviour when the columns run out before the rows do.
cdef void fill_i2j(np.ndarray i2j, np.ndarray D) except *:
j = D.shape[1]-2
cdef int i = D.shape[0]-2
while i >= 0:
# Step left for as long as the left neighbour is cheaper than this cell.
while D[i+1, j] < D[i+1, j+1]:
j -= 1
# A cheaper cell directly above means item i gets no partner.
if D[i, j+1] < D[i+1, j+1]:
i2j[i] = -1
else:
# Otherwise pair i with j and consume that column.
i2j[i] = j
j -= 1
i -= 1
# Populate j2i from the filled table D by walking back from the
# bottom-right cell. One entry is written per j, from D.shape[1]-2 down
# to 0: either a row index i or -1. This is fill_i2j with the roles of
# rows and columns swapped.
# NOTE(review): nothing here stops i from dropping below 0, in which case
# it would be used as a negative index into D — confirm the intended
# behaviour when the rows run out before the columns do.
cdef void fill_j2i(np.ndarray j2i, np.ndarray D) except *:
i = D.shape[0]-2
cdef int j = D.shape[1]-2
while j >= 0:
# Step up for as long as the cell above is cheaper than this cell.
while D[i, j+1] < D[i+1, j+1]:
i -= 1
# A cheaper cell directly to the left means item j gets no partner.
if D[i+1, j] < D[i+1, j+1]:
j2i[j] = -1
else:
# Otherwise pair j with i and consume that row.
j2i[j] = i
i -= 1
j -= 1

View File

@ -18,7 +18,6 @@ from .compat import path2str, basestring_
from . import util
USE_NEW_ALIGN = False
punct_re = re.compile(r"\W")
@ -51,59 +50,15 @@ def tags_to_entities(tags):
return entities
_ALIGNMENT_NORM_MAP = [("``", "'"), ("''", "'"), ('"', "'"), ("`", "'")]
def _normalize_for_alignment(tokens):
tokens = [w.replace(" ", "").lower() for w in tokens]
output = []
for token in tokens:
token = token.replace(" ", "").lower()
for before, after in _ALIGNMENT_NORM_MAP:
token = token.replace(before, after)
output.append(token)
return output
def _align_before_v2_2_2(tokens_a, tokens_b):
"""Calculate alignment tables between two tokenizations, using the Levenshtein
algorithm. The alignment is case-insensitive.
tokens_a (List[str]): The candidate tokenization.
tokens_b (List[str]): The reference tokenization.
RETURNS: (tuple): A 5-tuple consisting of the following information:
* cost (int): The number of misaligned tokens.
* a2b (List[int]): Mapping of indices in `tokens_a` to indices in `tokens_b`.
For instance, if `a2b[4] == 6`, that means that `tokens_a[4]` aligns
to `tokens_b[6]`. If there's no one-to-one alignment for a token,
it has the value -1.
* b2a (List[int]): The same as `a2b`, but mapping the other direction.
* a2b_multi (Dict[int, int]): A dictionary mapping indices in `tokens_a`
to indices in `tokens_b`, where multiple tokens of `tokens_a` align to
the same token of `tokens_b`.
* b2a_multi (Dict[int, int]): As with `a2b_multi`, but mapping the other
direction.
"""
from . import _align
if tokens_a == tokens_b:
alignment = numpy.arange(len(tokens_a))
return 0, alignment, alignment, {}, {}
tokens_a = [w.replace(" ", "").lower() for w in tokens_a]
tokens_b = [w.replace(" ", "").lower() for w in tokens_b]
cost, i2j, j2i, matrix = _align.align(tokens_a, tokens_b)
i2j_multi, j2i_multi = _align.multi_align(i2j, j2i, [len(w) for w in tokens_a],
[len(w) for w in tokens_b])
for i, j in list(i2j_multi.items()):
if i2j_multi.get(i+1) != j and i2j_multi.get(i-1) != j:
i2j[i] = j
i2j_multi.pop(i)
for j, i in list(j2i_multi.items()):
if j2i_multi.get(j+1) != i and j2i_multi.get(j-1) != i:
j2i[j] = i
j2i_multi.pop(j)
return cost, i2j, j2i, i2j_multi, j2i_multi
def align(tokens_a, tokens_b):
"""Calculate alignment tables between two tokenizations.
@ -122,8 +77,6 @@ def align(tokens_a, tokens_b):
* b2a_multi (Dict[int, int]): As with `a2b_multi`, but mapping the other
direction.
"""
if not USE_NEW_ALIGN:
return _align_before_v2_2_2(tokens_a, tokens_b)
tokens_a = _normalize_for_alignment(tokens_a)
tokens_b = _normalize_for_alignment(tokens_b)
cost = 0

View File

@ -1,79 +0,0 @@
# coding: utf-8
from __future__ import unicode_literals
import pytest
from spacy._align import align, multi_align
@pytest.mark.parametrize(
    "string1,string2,cost",
    [
        ("hello", "hell", 1),
        ("rat", "cat", 1),
        ("rat", "rat", 0),
        ("rat", "catsie", 4),
        ("t", "catsie", 5),
    ],
)
def test_align_costs(string1, string2, cost):
    """The first value returned by `align` is the expected edit cost."""
    actual_cost, _i2j, _j2i, _matrix = align(string1, string2)
    assert actual_cost == cost
@pytest.mark.parametrize(
    "string1,string2,i2j",
    [
        ("hello", "hell", [0, 1, 2, 3, -1]),
        ("rat", "cat", [0, 1, 2]),
        ("rat", "rat", [0, 1, 2]),
        ("rat", "catsie", [0, 1, 2]),
        ("t", "catsie", [2]),
    ],
)
def test_align_i2j(string1, string2, i2j):
    """The second value returned by `align` maps string1 onto string2."""
    _cost, actual_i2j, _j2i, _matrix = align(string1, string2)
    assert list(actual_i2j) == i2j
@pytest.mark.parametrize(
    "string1,string2,j2i",
    [
        ("hello", "hell", [0, 1, 2, 3]),
        ("rat", "cat", [0, 1, 2]),
        ("rat", "rat", [0, 1, 2]),
        ("rat", "catsie", [0, 1, 2, -1, -1, -1]),
        ("t", "catsie", [-1, -1, 0, -1, -1, -1]),
    ],
)
def test_align_i2j_2(string1, string2, j2i):
    """The third value returned by `align` maps string2 back onto string1.

    Despite its name, this test checks the j2i direction.
    """
    _cost, _i2j, actual_j2i, _matrix = align(string1, string2)
    assert list(actual_j2i) == j2i
def test_align_strings():
    """`align` accepts lists of words and only pairs identical tokens."""
    candidate = ["hello", "this", "is", "test!"]
    reference = ["hellothis", "is", "test", "!"]
    cost, i2j, j2i, _matrix = align(candidate, reference)
    assert cost == 4
    assert list(i2j) == [-1, -1, 1, -1]
    assert list(j2i) == [-1, 2, -1, -1]
def test_align_many_to_one():
    """`multi_align` groups several short tokens under one longer token."""
    words1 = ["a", "b", "c", "d", "e", "f", "g", "h"]
    words2 = ["ab", "bc", "e", "fg", "h"]
    _cost, i2j, j2i, _matrix = align(words1, words2)
    assert list(i2j) == [-1, -1, -1, -1, 2, -1, -1, 4]
    i2j_multi, j2i_multi = multi_align(
        i2j, j2i, [len(w) for w in words1], [len(w) for w in words2]
    )
    # (index in words1, index in words2 it is merged into)
    for source, target in [(0, 0), (1, 0), (2, 1), (3, 1), (5, 3), (6, 3)]:
        assert i2j_multi[source] == target
    # (index in words2, last index in words1 of the group forming it)
    for target, source in [(0, 1), (1, 3)]:
        assert j2i_multi[target] == source

View File

@ -241,20 +241,6 @@ def test_ignore_misaligned(doc):
deps = [t.dep_ for t in doc]
heads = [t.head.i for t in doc]
saved_use_new_align = spacy.gold.USE_NEW_ALIGN
spacy.gold.USE_NEW_ALIGN = False
with make_tempdir() as tmpdir:
jsonl_file = tmpdir / "test.jsonl"
data = [docs_to_json(doc)]
data[0]["paragraphs"][0]["raw"] = text.replace("Sarah", "Jane")
# write to JSONL train dicts
srsly.write_jsonl(jsonl_file, data)
goldcorpus = GoldCorpus(str(jsonl_file), str(jsonl_file))
train_reloaded_example = next(goldcorpus.train_dataset(nlp))
spacy.gold.USE_NEW_ALIGN = True
with make_tempdir() as tmpdir:
jsonl_file = tmpdir / "test.jsonl"
data = [docs_to_json(doc)]
@ -280,8 +266,6 @@ def test_ignore_misaligned(doc):
ignore_misaligned=True))
assert len(train_reloaded_example) == 0
spacy.gold.USE_NEW_ALIGN = saved_use_new_align
def test_make_orth_variants(doc):
nlp = English()
@ -301,14 +285,12 @@ def test_make_orth_variants(doc):
train_goldparse = train_reloaded_example.gold
# xfail while we have backwards-compatible alignment
@pytest.mark.xfail
@pytest.mark.parametrize(
"tokens_a,tokens_b,expected",
[
(["a", "b", "c"], ["ab", "c"], (3, [-1, -1, 1], [-1, 2], {0: 0, 1: 0}, {})),
(
["a", "b", "``", "c"],
["a", "b", '"', "c"],
['ab"', "c"],
(4, [-1, -1, -1, 1], [-1, 3], {0: 0, 1: 0, 2: 0}, {}),
),