mirror of
https://github.com/explosion/spaCy.git
synced 2025-01-12 18:26:30 +03:00
Replace old gold alignment with new gold alignment (#4710)
Replace old gold alignment that allowed for some noise in the alignment between raw and orth with the new simpler alignment that requires that the raw and orth strings are identical except for whitespace and capitalization. * Replace old alignment with new alignment, removing `_align.pyx` and its tests * Remove all quote normalizations * Enable test for new align * Modify test case for quote normalization
This commit is contained in:
parent
392c4880d9
commit
0c9640ced3
1
setup.py
1
setup.py
|
@ -31,7 +31,6 @@ PACKAGES = find_packages()
|
|||
|
||||
|
||||
MOD_NAMES = [
|
||||
"spacy._align",
|
||||
"spacy.parts_of_speech",
|
||||
"spacy.strings",
|
||||
"spacy.lexeme",
|
||||
|
|
255
spacy/_align.pyx
255
spacy/_align.pyx
|
@ -1,255 +0,0 @@
|
|||
# cython: infer_types=True
|
||||
'''Do Levenshtein alignment, for evaluation of tokenized input.
|
||||
|
||||
Random notes:
|
||||
|
||||
r i n g
|
||||
0 1 2 3 4
|
||||
r 1 0 1 2 3
|
||||
a 2 1 1 2 3
|
||||
n 3 2 2 1 2
|
||||
g 4 3 3 2 1
|
||||
|
||||
0,0: (1,1)=min(0+0,1+1,1+1)=0 S
|
||||
1,0: (2,1)=min(1+1,0+1,2+1)=1 D
|
||||
2,0: (3,1)=min(2+1,3+1,1+1)=2 D
|
||||
3,0: (4,1)=min(3+1,4+1,2+1)=3 D
|
||||
0,1: (1,2)=min(1+1,2+1,0+1)=1 D
|
||||
1,1: (2,2)=min(0+1,1+1,1+1)=1 S
|
||||
2,1: (3,2)=min(1+1,1+1,2+1)=2 S or I
|
||||
3,1: (4,2)=min(2+1,2+1,3+1)=3 S or I
|
||||
0,2: (1,3)=min(2+1,3+1,1+1)=2 I
|
||||
1,2: (2,3)=min(1+1,2+1,1+1)=2 S or I
|
||||
2,2: (3,3)
|
||||
3,2: (4,3)
|
||||
At state (i, j) we're asking "How do I transform S[:i+1] to T[:j+1]?"
|
||||
|
||||
We know the costs to transition:
|
||||
|
||||
S[:i] -> T[:j] (at D[i,j])
|
||||
S[:i+1] -> T[:j] (at D[i+1,j])
|
||||
S[:i] -> T[:j+1] (at D[i,j+1])
|
||||
|
||||
Further, we know we can transform:
|
||||
S[:i+1] -> S[:i] (DEL) for 1,
|
||||
T[:j+1] -> T[:j] (INS) for 1.
|
||||
S[i+1] -> T[j+1] (SUB) for 0 or 1
|
||||
|
||||
Therefore we have the costs:
|
||||
SUB: Cost(S[:i]->T[:j]) + Cost(S[i+1]->T[j+1])
|
||||
i.e. D[i, j] + S[i+1] != T[j+1]
|
||||
INS: Cost(S[:i+1]->T[:j]) + Cost(T[:j+1]->T[:j])
|
||||
i.e. D[i+1,j] + 1
|
||||
DEL: Cost(S[:i]->T[:j+1]) + Cost(S[:i+1]->S[:i])
|
||||
i.e. D[i,j+1] + 1
|
||||
|
||||
Source string S has length m, with index i
|
||||
Target string T has length n, with index j
|
||||
|
||||
Output two alignment vectors: i2j (length m) and j2i (length n)
|
||||
# function LevenshteinDistance(char s[1..m], char t[1..n]):
|
||||
# for all i and j, d[i,j] will hold the Levenshtein distance between
|
||||
# the first i characters of s and the first j characters of t
|
||||
# note that d has (m+1)*(n+1) values
|
||||
# set each element in d to zero
|
||||
ring rang
|
||||
- r i n g
|
||||
- 0 0 0 0 0
|
||||
r 0 0 0 0 0
|
||||
a 0 0 0 0 0
|
||||
n 0 0 0 0 0
|
||||
g 0 0 0 0 0
|
||||
|
||||
# source prefixes can be transformed into empty string by
|
||||
# dropping all characters
|
||||
# d[i, 0] := i
|
||||
ring rang
|
||||
- r i n g
|
||||
- 0 0 0 0 0
|
||||
r 1 0 0 0 0
|
||||
a 2 0 0 0 0
|
||||
n 3 0 0 0 0
|
||||
g 4 0 0 0 0
|
||||
|
||||
# target prefixes can be reached from empty source prefix
|
||||
# by inserting every character
|
||||
# d[0, j] := j
|
||||
- r i n g
|
||||
- 0 1 2 3 4
|
||||
r 1 0 0 0 0
|
||||
a 2 0 0 0 0
|
||||
n 3 0 0 0 0
|
||||
g 4 0 0 0 0
|
||||
|
||||
'''
|
||||
from __future__ import unicode_literals
|
||||
from libc.stdint cimport uint32_t
|
||||
import numpy
|
||||
cimport numpy as np
|
||||
from .compat import unicode_
|
||||
from murmurhash.mrmr cimport hash32
|
||||
|
||||
|
||||
def align(S, T):
|
||||
cdef int m = len(S)
|
||||
cdef int n = len(T)
|
||||
cdef np.ndarray matrix = numpy.zeros((m+1, n+1), dtype='int32')
|
||||
cdef np.ndarray i2j = numpy.zeros((m,), dtype='i')
|
||||
cdef np.ndarray j2i = numpy.zeros((n,), dtype='i')
|
||||
|
||||
cdef np.ndarray S_arr = _convert_sequence(S)
|
||||
cdef np.ndarray T_arr = _convert_sequence(T)
|
||||
|
||||
fill_matrix(<int*>matrix.data,
|
||||
<const int*>S_arr.data, m, <const int*>T_arr.data, n)
|
||||
fill_i2j(i2j, matrix)
|
||||
fill_j2i(j2i, matrix)
|
||||
for i in range(i2j.shape[0]):
|
||||
if i2j[i] >= 0 and len(S[i]) != len(T[i2j[i]]):
|
||||
i2j[i] = -1
|
||||
for j in range(j2i.shape[0]):
|
||||
if j2i[j] >= 0 and len(T[j]) != len(S[j2i[j]]):
|
||||
j2i[j] = -1
|
||||
return matrix[-1,-1], i2j, j2i, matrix
|
||||
|
||||
|
||||
def multi_align(np.ndarray i2j, np.ndarray j2i, i_lengths, j_lengths):
|
||||
'''Let's say we had:
|
||||
|
||||
Guess: [aa bb cc dd]
|
||||
Truth: [aa bbcc dd]
|
||||
i2j: [0, None, -2, 2]
|
||||
j2i: [0, -2, 3]
|
||||
|
||||
We want:
|
||||
|
||||
i2j_multi: {1: 1, 2: 1}
|
||||
j2i_multi: {}
|
||||
'''
|
||||
i2j_miss = _get_regions(i2j, i_lengths)
|
||||
j2i_miss = _get_regions(j2i, j_lengths)
|
||||
|
||||
i2j_multi, j2i_multi = _get_mapping(i2j_miss, j2i_miss, i_lengths, j_lengths)
|
||||
return i2j_multi, j2i_multi
|
||||
|
||||
|
||||
def _get_regions(alignment, lengths):
|
||||
regions = {}
|
||||
start = None
|
||||
offset = 0
|
||||
for i in range(len(alignment)):
|
||||
if alignment[i] < 0:
|
||||
if start is None:
|
||||
start = offset
|
||||
regions.setdefault(start, [])
|
||||
regions[start].append(i)
|
||||
else:
|
||||
start = None
|
||||
offset += lengths[i]
|
||||
return regions
|
||||
|
||||
|
||||
def _get_mapping(miss1, miss2, lengths1, lengths2):
|
||||
i2j = {}
|
||||
j2i = {}
|
||||
for start, region1 in miss1.items():
|
||||
if not region1 or start not in miss2:
|
||||
continue
|
||||
region2 = miss2[start]
|
||||
if sum(lengths1[i] for i in region1) == sum(lengths2[i] for i in region2):
|
||||
j = region2.pop(0)
|
||||
buff = []
|
||||
# Consume tokens from region 1, until we meet the length of the
|
||||
# first token in region2. If we do, align the tokens. If
|
||||
# we exceed the length, break.
|
||||
while region1:
|
||||
buff.append(region1.pop(0))
|
||||
if sum(lengths1[i] for i in buff) == lengths2[j]:
|
||||
for i in buff:
|
||||
i2j[i] = j
|
||||
j2i[j] = buff[-1]
|
||||
j += 1
|
||||
buff = []
|
||||
elif sum(lengths1[i] for i in buff) > lengths2[j]:
|
||||
break
|
||||
else:
|
||||
if buff and sum(lengths1[i] for i in buff) == lengths2[j]:
|
||||
for i in buff:
|
||||
i2j[i] = j
|
||||
j2i[j] = buff[-1]
|
||||
return i2j, j2i
|
||||
|
||||
|
||||
def _convert_sequence(seq):
    """Convert a sequence into a uint32 numpy array usable for alignment.

    seq: Either a numpy array, which is returned as a contiguous uint32
        array, or a sequence of unicode/bytes items, each of which is
        replaced by its 32-bit murmurhash (seed 0).
    RETURNS (numpy.ndarray): One uint32 value per item of `seq`.

    The tokens "``" and "''" are hashed as '"', so the three spellings
    compare equal after conversion.
    """
    if isinstance(seq, numpy.ndarray):
        # 'uint32_t' is the C typedef name, not a dtype string numpy accepts;
        # use the same 'uint32' dtype as the hashed path below.
        return numpy.ascontiguousarray(seq, dtype='uint32')
    cdef np.ndarray output = numpy.zeros((len(seq),), dtype='uint32')
    cdef bytes item_bytes
    for i, item in enumerate(seq):
        if item == "``" or item == "''":
            item = '"'
        if isinstance(item, unicode):
            item_bytes = item.encode('utf8')
        else:
            item_bytes = item
        output[i] = hash32(<void*><char*>item_bytes, len(item_bytes), 0)
    return output
|
||||
|
||||
|
||||
cdef void fill_matrix(int* D,
|
||||
const int* S, int m, const int* T, int n) nogil:
|
||||
m1 = m+1
|
||||
n1 = n+1
|
||||
for i in range(m1*n1):
|
||||
D[i] = 0
|
||||
|
||||
for i in range(m1):
|
||||
D[i*n1] = i
|
||||
|
||||
for j in range(n1):
|
||||
D[j] = j
|
||||
|
||||
cdef int sub_cost, ins_cost, del_cost
|
||||
for j in range(n):
|
||||
for i in range(m):
|
||||
i_j = i*n1 + j
|
||||
i1_j1 = (i+1)*n1 + j+1
|
||||
i1_j = (i+1)*n1 + j
|
||||
i_j1 = i*n1 + j+1
|
||||
if S[i] != T[j]:
|
||||
sub_cost = D[i_j] + 1
|
||||
else:
|
||||
sub_cost = D[i_j]
|
||||
del_cost = D[i_j1] + 1
|
||||
ins_cost = D[i1_j] + 1
|
||||
best = min(min(sub_cost, ins_cost), del_cost)
|
||||
D[i1_j1] = best
|
||||
|
||||
|
||||
cdef void fill_i2j(np.ndarray i2j, np.ndarray D) except *:
|
||||
j = D.shape[1]-2
|
||||
cdef int i = D.shape[0]-2
|
||||
while i >= 0:
|
||||
while D[i+1, j] < D[i+1, j+1]:
|
||||
j -= 1
|
||||
if D[i, j+1] < D[i+1, j+1]:
|
||||
i2j[i] = -1
|
||||
else:
|
||||
i2j[i] = j
|
||||
j -= 1
|
||||
i -= 1
|
||||
|
||||
cdef void fill_j2i(np.ndarray j2i, np.ndarray D) except *:
|
||||
i = D.shape[0]-2
|
||||
cdef int j = D.shape[1]-2
|
||||
while j >= 0:
|
||||
while D[i, j+1] < D[i+1, j+1]:
|
||||
i -= 1
|
||||
if D[i+1, j] < D[i+1, j+1]:
|
||||
j2i[j] = -1
|
||||
else:
|
||||
j2i[j] = i
|
||||
i -= 1
|
||||
j -= 1
|
|
@ -18,7 +18,6 @@ from .compat import path2str, basestring_
|
|||
from . import util
|
||||
|
||||
|
||||
USE_NEW_ALIGN = False
|
||||
punct_re = re.compile(r"\W")
|
||||
|
||||
|
||||
|
@ -51,59 +50,15 @@ def tags_to_entities(tags):
|
|||
return entities
|
||||
|
||||
|
||||
_ALIGNMENT_NORM_MAP = [("``", "'"), ("''", "'"), ('"', "'"), ("`", "'")]
|
||||
|
||||
|
||||
def _normalize_for_alignment(tokens):
|
||||
tokens = [w.replace(" ", "").lower() for w in tokens]
|
||||
output = []
|
||||
for token in tokens:
|
||||
token = token.replace(" ", "").lower()
|
||||
for before, after in _ALIGNMENT_NORM_MAP:
|
||||
token = token.replace(before, after)
|
||||
output.append(token)
|
||||
return output
|
||||
|
||||
|
||||
def _align_before_v2_2_2(tokens_a, tokens_b):
|
||||
"""Calculate alignment tables between two tokenizations, using the Levenshtein
|
||||
algorithm. The alignment is case-insensitive.
|
||||
|
||||
tokens_a (List[str]): The candidate tokenization.
|
||||
tokens_b (List[str]): The reference tokenization.
|
||||
RETURNS: (tuple): A 5-tuple consisting of the following information:
|
||||
* cost (int): The number of misaligned tokens.
|
||||
* a2b (List[int]): Mapping of indices in `tokens_a` to indices in `tokens_b`.
|
||||
For instance, if `a2b[4] == 6`, that means that `tokens_a[4]` aligns
|
||||
to `tokens_b[6]`. If there's no one-to-one alignment for a token,
|
||||
it has the value -1.
|
||||
* b2a (List[int]): The same as `a2b`, but mapping the other direction.
|
||||
* a2b_multi (Dict[int, int]): A dictionary mapping indices in `tokens_a`
|
||||
to indices in `tokens_b`, where multiple tokens of `tokens_a` align to
|
||||
the same token of `tokens_b`.
|
||||
* b2a_multi (Dict[int, int]): As with `a2b_multi`, but mapping the other
|
||||
direction.
|
||||
"""
|
||||
from . import _align
|
||||
if tokens_a == tokens_b:
|
||||
alignment = numpy.arange(len(tokens_a))
|
||||
return 0, alignment, alignment, {}, {}
|
||||
tokens_a = [w.replace(" ", "").lower() for w in tokens_a]
|
||||
tokens_b = [w.replace(" ", "").lower() for w in tokens_b]
|
||||
cost, i2j, j2i, matrix = _align.align(tokens_a, tokens_b)
|
||||
i2j_multi, j2i_multi = _align.multi_align(i2j, j2i, [len(w) for w in tokens_a],
|
||||
[len(w) for w in tokens_b])
|
||||
for i, j in list(i2j_multi.items()):
|
||||
if i2j_multi.get(i+1) != j and i2j_multi.get(i-1) != j:
|
||||
i2j[i] = j
|
||||
i2j_multi.pop(i)
|
||||
for j, i in list(j2i_multi.items()):
|
||||
if j2i_multi.get(j+1) != i and j2i_multi.get(j-1) != i:
|
||||
j2i[j] = i
|
||||
j2i_multi.pop(j)
|
||||
return cost, i2j, j2i, i2j_multi, j2i_multi
|
||||
|
||||
|
||||
def align(tokens_a, tokens_b):
|
||||
"""Calculate alignment tables between two tokenizations.
|
||||
|
||||
|
@ -122,8 +77,6 @@ def align(tokens_a, tokens_b):
|
|||
* b2a_multi (Dict[int, int]): As with `a2b_multi`, but mapping the other
|
||||
direction.
|
||||
"""
|
||||
if not USE_NEW_ALIGN:
|
||||
return _align_before_v2_2_2(tokens_a, tokens_b)
|
||||
tokens_a = _normalize_for_alignment(tokens_a)
|
||||
tokens_b = _normalize_for_alignment(tokens_b)
|
||||
cost = 0
|
||||
|
|
|
@ -1,79 +0,0 @@
|
|||
# coding: utf-8
|
||||
from __future__ import unicode_literals
|
||||
|
||||
import pytest
|
||||
from spacy._align import align, multi_align
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
|
||||
"string1,string2,cost",
|
||||
[
|
||||
("hello", "hell", 1),
|
||||
("rat", "cat", 1),
|
||||
("rat", "rat", 0),
|
||||
("rat", "catsie", 4),
|
||||
("t", "catsie", 5),
|
||||
],
|
||||
)
|
||||
def test_align_costs(string1, string2, cost):
|
||||
output_cost, i2j, j2i, matrix = align(string1, string2)
|
||||
assert output_cost == cost
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
|
||||
"string1,string2,i2j",
|
||||
[
|
||||
("hello", "hell", [0, 1, 2, 3, -1]),
|
||||
("rat", "cat", [0, 1, 2]),
|
||||
("rat", "rat", [0, 1, 2]),
|
||||
("rat", "catsie", [0, 1, 2]),
|
||||
("t", "catsie", [2]),
|
||||
],
|
||||
)
|
||||
def test_align_i2j(string1, string2, i2j):
|
||||
output_cost, output_i2j, j2i, matrix = align(string1, string2)
|
||||
assert list(output_i2j) == i2j
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
|
||||
"string1,string2,j2i",
|
||||
[
|
||||
("hello", "hell", [0, 1, 2, 3]),
|
||||
("rat", "cat", [0, 1, 2]),
|
||||
("rat", "rat", [0, 1, 2]),
|
||||
("rat", "catsie", [0, 1, 2, -1, -1, -1]),
|
||||
("t", "catsie", [-1, -1, 0, -1, -1, -1]),
|
||||
],
|
||||
)
|
||||
def test_align_i2j_2(string1, string2, j2i):
|
||||
output_cost, output_i2j, output_j2i, matrix = align(string1, string2)
|
||||
assert list(output_j2i) == j2i
|
||||
|
||||
|
||||
def test_align_strings():
|
||||
words1 = ["hello", "this", "is", "test!"]
|
||||
words2 = ["hellothis", "is", "test", "!"]
|
||||
cost, i2j, j2i, matrix = align(words1, words2)
|
||||
assert cost == 4
|
||||
assert list(i2j) == [-1, -1, 1, -1]
|
||||
assert list(j2i) == [-1, 2, -1, -1]
|
||||
|
||||
|
||||
def test_align_many_to_one():
|
||||
words1 = ["a", "b", "c", "d", "e", "f", "g", "h"]
|
||||
words2 = ["ab", "bc", "e", "fg", "h"]
|
||||
cost, i2j, j2i, matrix = align(words1, words2)
|
||||
assert list(i2j) == [-1, -1, -1, -1, 2, -1, -1, 4]
|
||||
lengths1 = [len(w) for w in words1]
|
||||
lengths2 = [len(w) for w in words2]
|
||||
i2j_multi, j2i_multi = multi_align(i2j, j2i, lengths1, lengths2)
|
||||
assert i2j_multi[0] == 0
|
||||
assert i2j_multi[1] == 0
|
||||
assert i2j_multi[2] == 1
|
||||
assert i2j_multi[3] == 1
|
||||
assert i2j_multi[3] == 1
|
||||
assert i2j_multi[5] == 3
|
||||
assert i2j_multi[6] == 3
|
||||
|
||||
assert j2i_multi[0] == 1
|
||||
assert j2i_multi[1] == 3
|
|
@ -241,20 +241,6 @@ def test_ignore_misaligned(doc):
|
|||
deps = [t.dep_ for t in doc]
|
||||
heads = [t.head.i for t in doc]
|
||||
|
||||
saved_use_new_align = spacy.gold.USE_NEW_ALIGN
|
||||
|
||||
spacy.gold.USE_NEW_ALIGN = False
|
||||
with make_tempdir() as tmpdir:
|
||||
jsonl_file = tmpdir / "test.jsonl"
|
||||
data = [docs_to_json(doc)]
|
||||
data[0]["paragraphs"][0]["raw"] = text.replace("Sarah", "Jane")
|
||||
# write to JSONL train dicts
|
||||
srsly.write_jsonl(jsonl_file, data)
|
||||
goldcorpus = GoldCorpus(str(jsonl_file), str(jsonl_file))
|
||||
|
||||
train_reloaded_example = next(goldcorpus.train_dataset(nlp))
|
||||
|
||||
spacy.gold.USE_NEW_ALIGN = True
|
||||
with make_tempdir() as tmpdir:
|
||||
jsonl_file = tmpdir / "test.jsonl"
|
||||
data = [docs_to_json(doc)]
|
||||
|
@ -280,8 +266,6 @@ def test_ignore_misaligned(doc):
|
|||
ignore_misaligned=True))
|
||||
assert len(train_reloaded_example) == 0
|
||||
|
||||
spacy.gold.USE_NEW_ALIGN = saved_use_new_align
|
||||
|
||||
|
||||
def test_make_orth_variants(doc):
|
||||
nlp = English()
|
||||
|
@ -301,14 +285,12 @@ def test_make_orth_variants(doc):
|
|||
train_goldparse = train_reloaded_example.gold
|
||||
|
||||
|
||||
# xfail while we have backwards-compatible alignment
|
||||
@pytest.mark.xfail
|
||||
@pytest.mark.parametrize(
|
||||
"tokens_a,tokens_b,expected",
|
||||
[
|
||||
(["a", "b", "c"], ["ab", "c"], (3, [-1, -1, 1], [-1, 2], {0: 0, 1: 0}, {})),
|
||||
(
|
||||
["a", "b", "``", "c"],
|
||||
["a", "b", '"', "c"],
|
||||
['ab"', "c"],
|
||||
(4, [-1, -1, -1, 1], [-1, 3], {0: 0, 1: 0, 2: 0}, {}),
|
||||
),
|
||||
|
|
Loading…
Reference in New Issue
Block a user