From fcb4f7a6db10b94a5ae2f2b961009c67382295ef Mon Sep 17 00:00:00 2001
From: Matthew Honnibal
Date: Sat, 6 Jun 2020 14:15:12 +0200
Subject: [PATCH] Start breaking down gold.pyx

---
 spacy/_gold/align.py | 81 ++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 81 insertions(+)
 create mode 100644 spacy/_gold/align.py

diff --git a/spacy/_gold/align.py b/spacy/_gold/align.py
new file mode 100644
index 000000000..7703232b2
--- /dev/null
+++ b/spacy/_gold/align.py
@@ -0,0 +1,81 @@
+import numpy
+from .errors import Errors, AlignmentError
+
+
+def align(tokens_a, tokens_b):
+    """Calculate alignment tables between two tokenizations.
+
+    tokens_a (List[str]): The candidate tokenization.
+    tokens_b (List[str]): The reference tokenization.
+    RETURNS: (tuple): A 5-tuple consisting of the following information:
+        * cost (int): The number of misaligned tokens.
+        * a2b (List[int]): Mapping of indices in `tokens_a` to indices in `tokens_b`.
+            For instance, if `a2b[4] == 6`, that means that `tokens_a[4]` aligns
+            to `tokens_b[6]`. If there's no one-to-one alignment for a token,
+            it has the value -1.
+        * b2a (List[int]): The same as `a2b`, but mapping the other direction.
+        * a2b_multi (Dict[int, int]): A dictionary mapping indices in `tokens_a`
+            to indices in `tokens_b`, where multiple tokens of `tokens_a` align to
+            the same token of `tokens_b`.
+        * b2a_multi (Dict[int, int]): As with `a2b_multi`, but mapping the other
+            direction.
+    """
+    tokens_a = _normalize_for_alignment(tokens_a)
+    tokens_b = _normalize_for_alignment(tokens_b)
+    cost = 0
+    a2b = numpy.empty(len(tokens_a), dtype="i")
+    b2a = numpy.empty(len(tokens_b), dtype="i")
+    a2b.fill(-1)
+    b2a.fill(-1)
+    a2b_multi = {}
+    b2a_multi = {}
+    i = 0
+    j = 0
+    offset_a = 0
+    offset_b = 0
+    while i < len(tokens_a) and j < len(tokens_b):
+        a = tokens_a[i][offset_a:]
+        b = tokens_b[j][offset_b:]
+        if a == b:
+            if offset_a == offset_b == 0:
+                a2b[i] = j
+                b2a[j] = i
+            elif offset_a == 0:
+                cost += 2
+                a2b_multi[i] = j
+            elif offset_b == 0:
+                cost += 2
+                b2a_multi[j] = i
+            offset_a = offset_b = 0
+            i += 1
+            j += 1
+        elif a == "":
+            assert offset_a == 0
+            cost += 1
+            i += 1
+        elif b == "":
+            assert offset_b == 0
+            cost += 1
+            j += 1
+        elif b.startswith(a):
+            cost += 1
+            if offset_a == 0:
+                a2b_multi[i] = j
+            i += 1
+            offset_a = 0
+            offset_b += len(a)
+        elif a.startswith(b):
+            cost += 1
+            if offset_b == 0:
+                b2a_multi[j] = i
+            j += 1
+            offset_b = 0
+            offset_a += len(b)
+        else:
+            assert "".join(tokens_a) != "".join(tokens_b)
+            raise AlignmentError(Errors.E186.format(tok_a=tokens_a, tok_b=tokens_b))
+    return cost, a2b, b2a, a2b_multi, b2a_multi
+
+
+def _normalize_for_alignment(tokens):
+    return [w.replace(" ", "").lower() for w in tokens]
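
Usage note (illustrative only, not part of the patch): the sketch below shows how the
5-tuple returned by `align` can be interpreted. It assumes the module is importable at
the path created by this patch (`spacy._gold.align`); the final public import path may
change as the gold.pyx refactor continues, and the example tokens are made up.

# Minimal sketch -- assumes `from spacy._gold.align import align` resolves to the
# function added in this patch.
from spacy._gold.align import align

# Candidate tokenization vs. reference tokenization of the same text.
tokens_a = ["her", "cousin", "'s", "house"]
tokens_b = ["her", "cousin's", "house"]

cost, a2b, b2a, a2b_multi, b2a_multi = align(tokens_a, tokens_b)

# a2b and b2a come back as numpy int arrays, hence .tolist() for display.
print(cost)          # 3 -- three tokens lack a one-to-one alignment
print(a2b.tolist())  # [0, -1, -1, 2] -- "cousin" and "'s" have no one-to-one match
print(b2a.tolist())  # [0, -1, 3]
print(a2b_multi)     # {1: 1, 2: 1} -- tokens_a[1] and tokens_a[2] both fall inside tokens_b[1]
print(b2a_multi)     # {}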