From 0c9640ced3c58bca6a6838c0b2e07c3e8b115e99 Mon Sep 17 00:00:00 2001 From: adrianeboyd Date: Mon, 25 Nov 2019 23:13:26 +0100 Subject: [PATCH] Replace old gold alignment with new gold alignment (#4710) Replace old gold alignment that allowed for some noise in the alignment between raw and orth with the new simpler alignment that requires that the raw and orth strings are identical except for whitespace and capitalization. * Replace old alignment with new alignment, removing `_align.pyx` and its tests * Remove all quote normalizations * Enable test for new align * Modify test case for quote normalization --- setup.py | 1 - spacy/_align.pyx | 255 -------------------------------------- spacy/gold.pyx | 47 ------- spacy/tests/test_align.py | 79 ------------ spacy/tests/test_gold.py | 20 +-- 5 files changed, 1 insertion(+), 401 deletions(-) delete mode 100644 spacy/_align.pyx delete mode 100644 spacy/tests/test_align.py diff --git a/setup.py b/setup.py index 1156e7cde..62a09aa73 100755 --- a/setup.py +++ b/setup.py @@ -31,7 +31,6 @@ PACKAGES = find_packages() MOD_NAMES = [ - "spacy._align", "spacy.parts_of_speech", "spacy.strings", "spacy.lexeme", diff --git a/spacy/_align.pyx b/spacy/_align.pyx deleted file mode 100644 index 8ae7cdf4e..000000000 --- a/spacy/_align.pyx +++ /dev/null @@ -1,255 +0,0 @@ -# cython: infer_types=True -'''Do Levenshtein alignment, for evaluation of tokenized input. - -Random notes: - - r i n g - 0 1 2 3 4 -r 1 0 1 2 3 -a 2 1 1 2 3 -n 3 2 2 1 2 -g 4 3 3 2 1 - -0,0: (1,1)=min(0+0,1+1,1+1)=0 S -1,0: (2,1)=min(1+1,0+1,2+1)=1 D -2,0: (3,1)=min(2+1,3+1,1+1)=2 D -3,0: (4,1)=min(3+1,4+1,2+1)=3 D -0,1: (1,2)=min(1+1,2+1,0+1)=1 D -1,1: (2,2)=min(0+1,1+1,1+1)=1 S -2,1: (3,2)=min(1+1,1+1,2+1)=2 S or I -3,1: (4,2)=min(2+1,2+1,3+1)=3 S or I -0,2: (1,3)=min(2+1,3+1,1+1)=2 I -1,2: (2,3)=min(1+1,2+1,1+1)=2 S or I -2,2: (3,3) -3,2: (4,3) -At state (i, j) we're asking "How do I transform S[:i+1] to T[:j+1]?" - -We know the costs to transition: - -S[:i] -> T[:j] (at D[i,j]) -S[:i+1] -> T[:j] (at D[i+1,j]) -S[:i] -> T[:j+1] (at D[i,j+1]) - -Further, we now we can tranform: -S[:i+1] -> S[:i] (DEL) for 1, -T[:j+1] -> T[:j] (INS) for 1. -S[i+1] -> T[j+1] (SUB) for 0 or 1 - -Therefore we have the costs: -SUB: Cost(S[:i]->T[:j]) + Cost(S[i]->S[j]) -i.e. D[i, j] + S[i+1] != T[j+1] -INS: Cost(S[:i+1]->T[:j]) + Cost(T[:j+1]->T[:j]) -i.e. D[i+1,j] + 1 -DEL: Cost(S[:i]->T[:j+1]) + Cost(S[:i+1]->S[:i]) -i.e. D[i,j+1] + 1 - - Source string S has length m, with index i - Target string T has length n, with index j - - Output two alignment vectors: i2j (length m) and j2i (length n) - # function LevenshteinDistance(char s[1..m], char t[1..n]): - # for all i and j, d[i,j] will hold the Levenshtein distance between - # the first i characters of s and the first j characters of t - # note that d has (m+1)*(n+1) values - # set each element in d to zero - ring rang - - r i n g - - 0 0 0 0 0 - r 0 0 0 0 0 - a 0 0 0 0 0 - n 0 0 0 0 0 - g 0 0 0 0 0 - - # source prefixes can be transformed into empty string by - # dropping all characters - # d[i, 0] := i - ring rang - - r i n g - - 0 0 0 0 0 - r 1 0 0 0 0 - a 2 0 0 0 0 - n 3 0 0 0 0 - g 4 0 0 0 0 - - # target prefixes can be reached from empty source prefix - # by inserting every character - # d[0, j] := j - - r i n g - - 0 1 2 3 4 - r 1 0 0 0 0 - a 2 0 0 0 0 - n 3 0 0 0 0 - g 4 0 0 0 0 - -''' -from __future__ import unicode_literals -from libc.stdint cimport uint32_t -import numpy -cimport numpy as np -from .compat import unicode_ -from murmurhash.mrmr cimport hash32 - - -def align(S, T): - cdef int m = len(S) - cdef int n = len(T) - cdef np.ndarray matrix = numpy.zeros((m+1, n+1), dtype='int32') - cdef np.ndarray i2j = numpy.zeros((m,), dtype='i') - cdef np.ndarray j2i = numpy.zeros((n,), dtype='i') - - cdef np.ndarray S_arr = _convert_sequence(S) - cdef np.ndarray T_arr = _convert_sequence(T) - - fill_matrix(matrix.data, - S_arr.data, m, T_arr.data, n) - fill_i2j(i2j, matrix) - fill_j2i(j2i, matrix) - for i in range(i2j.shape[0]): - if i2j[i] >= 0 and len(S[i]) != len(T[i2j[i]]): - i2j[i] = -1 - for j in range(j2i.shape[0]): - if j2i[j] >= 0 and len(T[j]) != len(S[j2i[j]]): - j2i[j] = -1 - return matrix[-1,-1], i2j, j2i, matrix - - -def multi_align(np.ndarray i2j, np.ndarray j2i, i_lengths, j_lengths): - '''Let's say we had: - - Guess: [aa bb cc dd] - Truth: [aa bbcc dd] - i2j: [0, None, -2, 2] - j2i: [0, -2, 3] - - We want: - - i2j_multi: {1: 1, 2: 1} - j2i_multi: {} - ''' - i2j_miss = _get_regions(i2j, i_lengths) - j2i_miss = _get_regions(j2i, j_lengths) - - i2j_multi, j2i_multi = _get_mapping(i2j_miss, j2i_miss, i_lengths, j_lengths) - return i2j_multi, j2i_multi - - -def _get_regions(alignment, lengths): - regions = {} - start = None - offset = 0 - for i in range(len(alignment)): - if alignment[i] < 0: - if start is None: - start = offset - regions.setdefault(start, []) - regions[start].append(i) - else: - start = None - offset += lengths[i] - return regions - - -def _get_mapping(miss1, miss2, lengths1, lengths2): - i2j = {} - j2i = {} - for start, region1 in miss1.items(): - if not region1 or start not in miss2: - continue - region2 = miss2[start] - if sum(lengths1[i] for i in region1) == sum(lengths2[i] for i in region2): - j = region2.pop(0) - buff = [] - # Consume tokens from region 1, until we meet the length of the - # first token in region2. If we do, align the tokens. If - # we exceed the length, break. - while region1: - buff.append(region1.pop(0)) - if sum(lengths1[i] for i in buff) == lengths2[j]: - for i in buff: - i2j[i] = j - j2i[j] = buff[-1] - j += 1 - buff = [] - elif sum(lengths1[i] for i in buff) > lengths2[j]: - break - else: - if buff and sum(lengths1[i] for i in buff) == lengths2[j]: - for i in buff: - i2j[i] = j - j2i[j] = buff[-1] - return i2j, j2i - - -def _convert_sequence(seq): - if isinstance(seq, numpy.ndarray): - return numpy.ascontiguousarray(seq, dtype='uint32_t') - cdef np.ndarray output = numpy.zeros((len(seq),), dtype='uint32') - cdef bytes item_bytes - for i, item in enumerate(seq): - if item == "``": - item = '"' - elif item == "''": - item = '"' - if isinstance(item, unicode): - item_bytes = item.encode('utf8') - else: - item_bytes = item - output[i] = hash32(item_bytes, len(item_bytes), 0) - return output - - -cdef void fill_matrix(int* D, - const int* S, int m, const int* T, int n) nogil: - m1 = m+1 - n1 = n+1 - for i in range(m1*n1): - D[i] = 0 - - for i in range(m1): - D[i*n1] = i - - for j in range(n1): - D[j] = j - - cdef int sub_cost, ins_cost, del_cost - for j in range(n): - for i in range(m): - i_j = i*n1 + j - i1_j1 = (i+1)*n1 + j+1 - i1_j = (i+1)*n1 + j - i_j1 = i*n1 + j+1 - if S[i] != T[j]: - sub_cost = D[i_j] + 1 - else: - sub_cost = D[i_j] - del_cost = D[i_j1] + 1 - ins_cost = D[i1_j] + 1 - best = min(min(sub_cost, ins_cost), del_cost) - D[i1_j1] = best - - -cdef void fill_i2j(np.ndarray i2j, np.ndarray D) except *: - j = D.shape[1]-2 - cdef int i = D.shape[0]-2 - while i >= 0: - while D[i+1, j] < D[i+1, j+1]: - j -= 1 - if D[i, j+1] < D[i+1, j+1]: - i2j[i] = -1 - else: - i2j[i] = j - j -= 1 - i -= 1 - -cdef void fill_j2i(np.ndarray j2i, np.ndarray D) except *: - i = D.shape[0]-2 - cdef int j = D.shape[1]-2 - while j >= 0: - while D[i, j+1] < D[i+1, j+1]: - i -= 1 - if D[i+1, j] < D[i+1, j+1]: - j2i[j] = -1 - else: - j2i[j] = i - i -= 1 - j -= 1 diff --git a/spacy/gold.pyx b/spacy/gold.pyx index 0659ddd02..f2f127438 100644 --- a/spacy/gold.pyx +++ b/spacy/gold.pyx @@ -18,7 +18,6 @@ from .compat import path2str, basestring_ from . import util -USE_NEW_ALIGN = False punct_re = re.compile(r"\W") @@ -51,59 +50,15 @@ def tags_to_entities(tags): return entities -_ALIGNMENT_NORM_MAP = [("``", "'"), ("''", "'"), ('"', "'"), ("`", "'")] - - def _normalize_for_alignment(tokens): tokens = [w.replace(" ", "").lower() for w in tokens] output = [] for token in tokens: token = token.replace(" ", "").lower() - for before, after in _ALIGNMENT_NORM_MAP: - token = token.replace(before, after) output.append(token) return output -def _align_before_v2_2_2(tokens_a, tokens_b): - """Calculate alignment tables between two tokenizations, using the Levenshtein - algorithm. The alignment is case-insensitive. - - tokens_a (List[str]): The candidate tokenization. - tokens_b (List[str]): The reference tokenization. - RETURNS: (tuple): A 5-tuple consisting of the following information: - * cost (int): The number of misaligned tokens. - * a2b (List[int]): Mapping of indices in `tokens_a` to indices in `tokens_b`. - For instance, if `a2b[4] == 6`, that means that `tokens_a[4]` aligns - to `tokens_b[6]`. If there's no one-to-one alignment for a token, - it has the value -1. - * b2a (List[int]): The same as `a2b`, but mapping the other direction. - * a2b_multi (Dict[int, int]): A dictionary mapping indices in `tokens_a` - to indices in `tokens_b`, where multiple tokens of `tokens_a` align to - the same token of `tokens_b`. - * b2a_multi (Dict[int, int]): As with `a2b_multi`, but mapping the other - direction. - """ - from . import _align - if tokens_a == tokens_b: - alignment = numpy.arange(len(tokens_a)) - return 0, alignment, alignment, {}, {} - tokens_a = [w.replace(" ", "").lower() for w in tokens_a] - tokens_b = [w.replace(" ", "").lower() for w in tokens_b] - cost, i2j, j2i, matrix = _align.align(tokens_a, tokens_b) - i2j_multi, j2i_multi = _align.multi_align(i2j, j2i, [len(w) for w in tokens_a], - [len(w) for w in tokens_b]) - for i, j in list(i2j_multi.items()): - if i2j_multi.get(i+1) != j and i2j_multi.get(i-1) != j: - i2j[i] = j - i2j_multi.pop(i) - for j, i in list(j2i_multi.items()): - if j2i_multi.get(j+1) != i and j2i_multi.get(j-1) != i: - j2i[j] = i - j2i_multi.pop(j) - return cost, i2j, j2i, i2j_multi, j2i_multi - - def align(tokens_a, tokens_b): """Calculate alignment tables between two tokenizations. @@ -122,8 +77,6 @@ def align(tokens_a, tokens_b): * b2a_multi (Dict[int, int]): As with `a2b_multi`, but mapping the other direction. """ - if not USE_NEW_ALIGN: - return _align_before_v2_2_2(tokens_a, tokens_b) tokens_a = _normalize_for_alignment(tokens_a) tokens_b = _normalize_for_alignment(tokens_b) cost = 0 diff --git a/spacy/tests/test_align.py b/spacy/tests/test_align.py deleted file mode 100644 index d6bbab04e..000000000 --- a/spacy/tests/test_align.py +++ /dev/null @@ -1,79 +0,0 @@ -# coding: utf-8 -from __future__ import unicode_literals - -import pytest -from spacy._align import align, multi_align - - -@pytest.mark.parametrize( - "string1,string2,cost", - [ - ("hello", "hell", 1), - ("rat", "cat", 1), - ("rat", "rat", 0), - ("rat", "catsie", 4), - ("t", "catsie", 5), - ], -) -def test_align_costs(string1, string2, cost): - output_cost, i2j, j2i, matrix = align(string1, string2) - assert output_cost == cost - - -@pytest.mark.parametrize( - "string1,string2,i2j", - [ - ("hello", "hell", [0, 1, 2, 3, -1]), - ("rat", "cat", [0, 1, 2]), - ("rat", "rat", [0, 1, 2]), - ("rat", "catsie", [0, 1, 2]), - ("t", "catsie", [2]), - ], -) -def test_align_i2j(string1, string2, i2j): - output_cost, output_i2j, j2i, matrix = align(string1, string2) - assert list(output_i2j) == i2j - - -@pytest.mark.parametrize( - "string1,string2,j2i", - [ - ("hello", "hell", [0, 1, 2, 3]), - ("rat", "cat", [0, 1, 2]), - ("rat", "rat", [0, 1, 2]), - ("rat", "catsie", [0, 1, 2, -1, -1, -1]), - ("t", "catsie", [-1, -1, 0, -1, -1, -1]), - ], -) -def test_align_i2j_2(string1, string2, j2i): - output_cost, output_i2j, output_j2i, matrix = align(string1, string2) - assert list(output_j2i) == j2i - - -def test_align_strings(): - words1 = ["hello", "this", "is", "test!"] - words2 = ["hellothis", "is", "test", "!"] - cost, i2j, j2i, matrix = align(words1, words2) - assert cost == 4 - assert list(i2j) == [-1, -1, 1, -1] - assert list(j2i) == [-1, 2, -1, -1] - - -def test_align_many_to_one(): - words1 = ["a", "b", "c", "d", "e", "f", "g", "h"] - words2 = ["ab", "bc", "e", "fg", "h"] - cost, i2j, j2i, matrix = align(words1, words2) - assert list(i2j) == [-1, -1, -1, -1, 2, -1, -1, 4] - lengths1 = [len(w) for w in words1] - lengths2 = [len(w) for w in words2] - i2j_multi, j2i_multi = multi_align(i2j, j2i, lengths1, lengths2) - assert i2j_multi[0] == 0 - assert i2j_multi[1] == 0 - assert i2j_multi[2] == 1 - assert i2j_multi[3] == 1 - assert i2j_multi[3] == 1 - assert i2j_multi[5] == 3 - assert i2j_multi[6] == 3 - - assert j2i_multi[0] == 1 - assert j2i_multi[1] == 3 diff --git a/spacy/tests/test_gold.py b/spacy/tests/test_gold.py index d1255c176..639d98859 100644 --- a/spacy/tests/test_gold.py +++ b/spacy/tests/test_gold.py @@ -241,20 +241,6 @@ def test_ignore_misaligned(doc): deps = [t.dep_ for t in doc] heads = [t.head.i for t in doc] - saved_use_new_align = spacy.gold.USE_NEW_ALIGN - - spacy.gold.USE_NEW_ALIGN = False - with make_tempdir() as tmpdir: - jsonl_file = tmpdir / "test.jsonl" - data = [docs_to_json(doc)] - data[0]["paragraphs"][0]["raw"] = text.replace("Sarah", "Jane") - # write to JSONL train dicts - srsly.write_jsonl(jsonl_file, data) - goldcorpus = GoldCorpus(str(jsonl_file), str(jsonl_file)) - - train_reloaded_example = next(goldcorpus.train_dataset(nlp)) - - spacy.gold.USE_NEW_ALIGN = True with make_tempdir() as tmpdir: jsonl_file = tmpdir / "test.jsonl" data = [docs_to_json(doc)] @@ -280,8 +266,6 @@ def test_ignore_misaligned(doc): ignore_misaligned=True)) assert len(train_reloaded_example) == 0 - spacy.gold.USE_NEW_ALIGN = saved_use_new_align - def test_make_orth_variants(doc): nlp = English() @@ -301,14 +285,12 @@ def test_make_orth_variants(doc): train_goldparse = train_reloaded_example.gold -# xfail while we have backwards-compatible alignment -@pytest.mark.xfail @pytest.mark.parametrize( "tokens_a,tokens_b,expected", [ (["a", "b", "c"], ["ab", "c"], (3, [-1, -1, 1], [-1, 2], {0: 0, 1: 0}, {})), ( - ["a", "b", "``", "c"], + ["a", "b", '"', "c"], ['ab"', "c"], (4, [-1, -1, -1, 1], [-1, 3], {0: 0, 1: 0, 2: 0}, {}), ),