Rename function arguments

This commit is contained in:
Ines Montani 2019-07-17 14:29:52 +02:00
parent 394e4d8058
commit 73565c6d9d

View File

@@ -70,32 +70,33 @@ def merge_sents(sents):
return [(m_deps, m_brackets)] return [(m_deps, m_brackets)]
def align(cand_words, gold_words): def align(tokens_a, tokens_b):
"""Calculate alignment tables between two tokenizations, using the Levenshtein """Calculate alignment tables between two tokenizations, using the Levenshtein
algorithm. The alignment is case-insensitive. algorithm. The alignment is case-insensitive.
cand_words (List[str]): The candidate tokenization. tokens_a (List[str]): The candidate tokenization.
gold_words (List[str]): The reference tokenization. tokens_b (List[str]): The reference tokenization.
RETURNS: (tuple): A 5-tuple consisting of the following information: RETURNS: (tuple): A 5-tuple consisting of the following information:
* cost (int): The number of misaligned tokens. * cost (int): The number of misaligned tokens.
* a2b (List[int]): Mapping of indices in `cand_words` to indices in `gold_words`. * a2b (List[int]): Mapping of indices in `tokens_a` to indices in `tokens_b`.
For instance, if `a2b[4] == 6`, that means that `cand_words[4]` aligns For instance, if `a2b[4] == 6`, that means that `tokens_a[4]` aligns
to `gold_words[6]`. If there's no one-to-one alignment for a token, to `tokens_b[6]`. If there's no one-to-one alignment for a token,
it has the value -1. it has the value -1.
* b2a (List[int]): The same as `a2b`, but mapping the other direction. * b2a (List[int]): The same as `a2b`, but mapping the other direction.
* a2b_multi (Dict[int, int]): A dictionary mapping indices in `a` to indices * a2b_multi (Dict[int, int]): A dictionary mapping indices in `tokens_a`
in `b`, where multiple tokens of `a` align to the same token of `b`. to indices in `tokens_b`, where multiple tokens of `tokens_a` align to
the same token of `tokens_b`.
* b2a_multi (Dict[int, int]): As with `a2b_multi`, but mapping the other * b2a_multi (Dict[int, int]): As with `a2b_multi`, but mapping the other
direction. direction.
""" """
if cand_words == gold_words: if tokens_a == tokens_b:
alignment = numpy.arange(len(cand_words)) alignment = numpy.arange(len(tokens_a))
return 0, alignment, alignment, {}, {} return 0, alignment, alignment, {}, {}
cand_words = [w.replace(" ", "").lower() for w in cand_words] tokens_a = [w.replace(" ", "").lower() for w in tokens_a]
gold_words = [w.replace(" ", "").lower() for w in gold_words] tokens_b = [w.replace(" ", "").lower() for w in tokens_b]
cost, i2j, j2i, matrix = _align.align(cand_words, gold_words) cost, i2j, j2i, matrix = _align.align(tokens_a, tokens_b)
i2j_multi, j2i_multi = _align.multi_align(i2j, j2i, [len(w) for w in cand_words], i2j_multi, j2i_multi = _align.multi_align(i2j, j2i, [len(w) for w in tokens_a],
[len(w) for w in gold_words]) [len(w) for w in tokens_b])
for i, j in list(i2j_multi.items()): for i, j in list(i2j_multi.items()):
if i2j_multi.get(i+1) != j and i2j_multi.get(i-1) != j: if i2j_multi.get(i+1) != j and i2j_multi.get(i-1) != j:
i2j[i] = j i2j[i] = j