spaCy/spacy/_gold/align.py

import numpy
from .errors import Errors, AlignmentError


def align(tokens_a, tokens_b):
    """Calculate alignment tables between two tokenizations.

    Both token lists are first passed through `_normalize_for_alignment` and
    are then walked in parallel, comparing the not-yet-consumed remainder of
    the current token on each side.

    tokens_a (List[str]): The candidate tokenization.
    tokens_b (List[str]): The reference tokenization.
    RETURNS: (tuple): A 5-tuple consisting of the following information:
      * cost (int): The number of misaligned tokens. Empty tokens that have
        no counterpart (including trailing ones) count as misaligned.
      * a2b (numpy.ndarray): Integer array mapping indices in `tokens_a` to
        indices in `tokens_b`. For instance, if `a2b[4] == 6`, that means
        that `tokens_a[4]` aligns to `tokens_b[6]`. If there's no one-to-one
        alignment for a token, it has the value -1.
      * b2a (numpy.ndarray): The same as `a2b`, but mapping the other
        direction.
      * a2b_multi (Dict[int, int]): A dictionary mapping indices in `tokens_a`
        to indices in `tokens_b`, where multiple tokens of `tokens_a` align to
        the same token of `tokens_b`.
      * b2a_multi (Dict[int, int]): As with `a2b_multi`, but mapping the other
        direction.
    RAISES (AlignmentError): If the normalized texts of the two tokenizations
        differ, either in the middle or because one side has text left over
        after the other side is exhausted.
    """
    tokens_a = _normalize_for_alignment(tokens_a)
    tokens_b = _normalize_for_alignment(tokens_b)
    n_a = len(tokens_a)
    n_b = len(tokens_b)
    cost = 0
    # -1 marks "no one-to-one alignment"; entries are overwritten on a match.
    a2b = numpy.full(n_a, -1, dtype="i")
    b2a = numpy.full(n_b, -1, dtype="i")
    a2b_multi = {}
    b2a_multi = {}
    i = 0
    j = 0
    # Number of characters of the current token already matched on each side.
    # At most one of the two offsets is non-zero at any time.
    offset_a = 0
    offset_b = 0
    while i < n_a and j < n_b:
        a = tokens_a[i][offset_a:]
        b = tokens_b[j][offset_b:]
        if a == b:
            if offset_a == offset_b == 0:
                # Two whole tokens match: one-to-one alignment.
                a2b[i] = j
                b2a[j] = i
            elif offset_a == 0:
                # tokens_a[i] is the last piece of tokens_b[j]; the cost
                # covers both this piece and the multi-piece token itself.
                cost += 2
                a2b_multi[i] = j
            elif offset_b == 0:
                # tokens_b[j] is the last piece of tokens_a[i].
                cost += 2
                b2a_multi[j] = i
            offset_a = offset_b = 0
            i += 1
            j += 1
        elif a == "":
            # Empty token on the `a` side with no counterpart: skip it.
            # Internal invariant: a partially consumed token is never empty.
            assert offset_a == 0
            cost += 1
            i += 1
        elif b == "":
            # Empty token on the `b` side with no counterpart: skip it.
            assert offset_b == 0
            cost += 1
            j += 1
        elif b.startswith(a):
            # tokens_a[i] covers the next part of tokens_b[j].
            cost += 1
            if offset_a == 0:
                a2b_multi[i] = j
            i += 1
            offset_a = 0
            offset_b += len(a)
        elif a.startswith(b):
            # tokens_b[j] covers the next part of tokens_a[i].
            cost += 1
            if offset_b == 0:
                b2a_multi[j] = i
            j += 1
            offset_b = 0
            offset_a += len(b)
        else:
            # Internal invariant: diverging remainders imply different texts.
            assert "".join(tokens_a) != "".join(tokens_b)
            raise AlignmentError(Errors.E186.format(tok_a=tokens_a, tok_b=tokens_b))
    # The loop stops as soon as one side is exhausted, so the other side may
    # still hold tokens that have not been looked at. A non-zero offset here
    # implies the current token is non-empty, so it is caught by `any` too.
    rest_a = tokens_a[i:]
    rest_b = tokens_b[j:]
    if any(rest_a) or any(rest_b):
        # Left-over text on one side means the two texts differ.
        raise AlignmentError(Errors.E186.format(tok_a=tokens_a, tok_b=tokens_b))
    # Only empty tokens remain: each one is unaligned, exactly like an empty
    # token skipped inside the loop.
    cost += len(rest_a) + len(rest_b)
    return cost, a2b, b2a, a2b_multi, b2a_multi


def _normalize_for_alignment(tokens):
    return [w.replace(" ", "").lower() for w in tokens]
Start breaking down gold.pyx 2020-06-06 15:15:12 +03:00			`import numpy`
			`from .errors import Errors, AlignmentError`


			`def align(tokens_a, tokens_b):`
			`"""Calculate alignment tables between two tokenizations.`

			`tokens_a (List[str]): The candidate tokenization.`
			`tokens_b (List[str]): The reference tokenization.`
			`RETURNS: (tuple): A 5-tuple consisting of the following information:`
			`* cost (int): The number of misaligned tokens.`
			* a2b (List[int]): Mapping of indices in `tokens_a` to indices in `tokens_b`.
			For instance, if `a2b[4] == 6`, that means that `tokens_a[4]` aligns
			to `tokens_b[6]`. If there's no one-to-one alignment for a token,
			`it has the value -1.`
			* b2a (List[int]): The same as `a2b`, but mapping the other direction.
			* a2b_multi (Dict[int, int]): A dictionary mapping indices in `tokens_a`
			to indices in `tokens_b`, where multiple tokens of `tokens_a` align to
			the same token of `tokens_b`.
			* b2a_multi (Dict[int, int]): As with `a2b_multi`, but mapping the other
			`direction.`
			`"""`
			`tokens_a = _normalize_for_alignment(tokens_a)`
			`tokens_b = _normalize_for_alignment(tokens_b)`
			`cost = 0`
			`a2b = numpy.empty(len(tokens_a), dtype="i")`
			`b2a = numpy.empty(len(tokens_b), dtype="i")`
			`a2b.fill(-1)`
			`b2a.fill(-1)`
			`a2b_multi = {}`
			`b2a_multi = {}`
			`i = 0`
			`j = 0`
			`offset_a = 0`
			`offset_b = 0`
			`while i < len(tokens_a) and j < len(tokens_b):`
			`a = tokens_a[i][offset_a:]`
			`b = tokens_b[j][offset_b:]`
			`if a == b:`
			`if offset_a == offset_b == 0:`
			`a2b[i] = j`
			`b2a[j] = i`
			`elif offset_a == 0:`
			`cost += 2`
			`a2b_multi[i] = j`
			`elif offset_b == 0:`
			`cost += 2`
			`b2a_multi[j] = i`
			`offset_a = offset_b = 0`
			`i += 1`
			`j += 1`
			`elif a == "":`
			`assert offset_a == 0`
			`cost += 1`
			`i += 1`
			`elif b == "":`
			`assert offset_b == 0`
			`cost += 1`
			`j += 1`
			`elif b.startswith(a):`
			`cost += 1`
			`if offset_a == 0:`
			`a2b_multi[i] = j`
			`i += 1`
			`offset_a = 0`
			`offset_b += len(a)`
			`elif a.startswith(b):`
			`cost += 1`
			`if offset_b == 0:`
			`b2a_multi[j] = i`
			`j += 1`
			`offset_b = 0`
			`offset_a += len(b)`
			`else:`
			`assert "".join(tokens_a) != "".join(tokens_b)`
			`raise AlignmentError(Errors.E186.format(tok_a=tokens_a, tok_b=tokens_b))`
			`return cost, a2b, b2a, a2b_multi, b2a_multi`


			`def _normalize_for_alignment(tokens):`
			`return [w.replace(" ", "").lower() for w in tokens]`