Fix range in Span.get_lca_matrix (#8115)

Fix the adjusted token index / lca matrix index ranges for
`_get_lca_matrix` for spans.

* The range for `k` should correspond to the adjusted indices in
`lca_matrix` with the `start` indexed at `0`
This commit is contained in:
Adriane Boyd 2021-05-17 16:54:10 +02:00 committed by GitHub
parent 6ce9f0469f
commit 5e7e7cda94
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
2 changed files with 11 additions and 1 deletion

View File

@ -2,6 +2,8 @@
from __future__ import unicode_literals
import pytest
import numpy
from numpy.testing import assert_array_equal
from spacy.attrs import ORTH, LENGTH
from spacy.tokens import Doc, Span
from spacy.vocab import Vocab
@ -118,6 +120,14 @@ def test_spans_lca_matrix(en_tokenizer):
assert lca[1, 0] == 1  # slept & dog -> slept
assert lca[1, 1] == 1  # slept & slept -> slept
# example from Span API docs
tokens = en_tokenizer("I like New York in Autumn")
doc = get_doc(
tokens.vocab, words=[t.text for t in tokens], heads=[1, 0, 1, -2, -1, -1]
)
lca = doc[1:4].get_lca_matrix()
assert_array_equal(lca, numpy.asarray([[0, 0, 0], [0, 1, 2], [0, 2, 2]]))
def test_span_similarity_match():
    doc = Doc(Vocab(), words=["a", "b", "a", "b"])

View File

@ -1351,7 +1351,7 @@ cdef int [:,:] _get_lca_matrix(Doc doc, int start, int end):
j_idx_in_sent = start + j - sent_start
n_missing_tokens_in_sent = len(sent) - j_idx_in_sent
# make sure we do not go past `end`, in cases where `end` < sent.end
max_range = min(j + n_missing_tokens_in_sent, end - start)
for k in range(j + 1, max_range):
    lca = _get_tokens_lca(token_j, doc[start + k])
    # if lca is outside of span, we set it to -1