From 5e7e7cda94fff546a38e1388d22253ea5946692b Mon Sep 17 00:00:00 2001 From: Adriane Boyd Date: Mon, 17 May 2021 16:54:10 +0200 Subject: [PATCH] Fix range in Span.get_lca_matrix (#8115) Fix the adjusted token index / lca matrix index ranges for `_get_lca_matrix` for spans. * The range for `k` should correspond to the adjusted indices in `lca_matrix` with the `start` indexed at `0` --- spacy/tests/doc/test_span.py | 10 ++++++++++ spacy/tokens/doc.pyx | 2 +- 2 files changed, 11 insertions(+), 1 deletion(-) diff --git a/spacy/tests/doc/test_span.py b/spacy/tests/doc/test_span.py index a5da50fbd..b17060d8d 100644 --- a/spacy/tests/doc/test_span.py +++ b/spacy/tests/doc/test_span.py @@ -2,6 +2,8 @@ from __future__ import unicode_literals import pytest +import numpy +from numpy.testing import assert_array_equal from spacy.attrs import ORTH, LENGTH from spacy.tokens import Doc, Span from spacy.vocab import Vocab @@ -118,6 +120,14 @@ def test_spans_lca_matrix(en_tokenizer): assert lca[1, 0] == 1 # slept & dog -> slept assert lca[1, 1] == 1 # slept & slept -> slept + # example from Span API docs + tokens = en_tokenizer("I like New York in Autumn") + doc = get_doc( + tokens.vocab, words=[t.text for t in tokens], heads=[1, 0, 1, -2, -1, -1] + ) + lca = doc[1:4].get_lca_matrix() + assert_array_equal(lca, numpy.asarray([[0, 0, 0], [0, 1, 2], [0, 2, 2]])) + def test_span_similarity_match(): doc = Doc(Vocab(), words=["a", "b", "a", "b"]) diff --git a/spacy/tokens/doc.pyx b/spacy/tokens/doc.pyx index 584f9d483..07d44d01c 100644 --- a/spacy/tokens/doc.pyx +++ b/spacy/tokens/doc.pyx @@ -1351,7 +1351,7 @@ cdef int [:,:] _get_lca_matrix(Doc doc, int start, int end): j_idx_in_sent = start + j - sent_start n_missing_tokens_in_sent = len(sent) - j_idx_in_sent # make sure we do not go past `end`, in cases where `end` < sent.end - max_range = min(j + n_missing_tokens_in_sent, end) + max_range = min(j + n_missing_tokens_in_sent, end - start) for k in range(j + 1, max_range): lca = _get_tokens_lca(token_j, doc[start + k]) # if lca is outside of span, we set it to -1