Bugfix/get lca matrix (#3110)

This PR adds a test for an untested case of `Span.get_lca_matrix`, and fixes a bug for that scenario, which I introduced in [this PR](https://github.com/explosion/spaCy/pull/3089) (sorry!).

## Description

The previous implementation of `get_lca_matrix` was failing for the case `doc[j:k].get_lca_matrix()` where `j > 0`. A test has been added for this case and the bug has been fixed.

### Types of change

Bug fix

## Checklist

- [x] I have submitted the spaCy Contributor Agreement.
- [x] I ran the tests, and all new and existing tests passed.
- [x] My changes don't require a change to the documentation, or if they do, I've added all required information.
2025-12-23 10:03:15 +03:00 · 2019-01-06 19:07:50 +01:00 · 2019-01-06 19:07:50 +01:00 · e03e1eee92
commit e03e1eee92
parent 9972716e01
4 changed files with 62 additions and 18 deletions
--- a/spacy/tests/doc/test_doc_api.py
+++ b/spacy/tests/doc/test_doc_api.py
@ -278,10 +278,30 @@ def test_doc_api_similarity_match():
    assert doc.similarity(doc2) == 0.0


-def test_lowest_common_ancestor(en_tokenizer):
-    tokens = en_tokenizer('the lazy dog slept')
-    doc = get_doc(tokens.vocab, [t.text for t in tokens], heads=[2, 1, 1, 0])
+@pytest.mark.parametrize('sentence,heads,lca_matrix', [
+    ('the lazy dog slept',
+     [2, 1, 1, 0],
+     numpy.array([[0, 2, 2, 3],
+                  [2, 1, 2, 3],
+                  [2, 2, 2, 3],
+                  [3, 3, 3, 3]])),
+    ('The lazy dog slept. The quick fox jumped',
+     [2, 1, 1, 0, -1, 2, 1, 1, 0],
+     numpy.array([[0, 2, 2, 3, 3, -1, -1, -1, -1],
+                  [2, 1, 2, 3, 3, -1, -1, -1, -1],
+                  [2, 2, 2, 3, 3, -1, -1, -1, -1],
+                  [3, 3, 3, 3, 3, -1, -1, -1, -1],
+                  [3, 3, 3, 3, 4, -1, -1, -1, -1],
+                  [-1, -1, -1, -1, -1, 5, 7, 7, 8],
+                  [-1, -1, -1, -1, -1, 7, 6, 7, 8],
+                  [-1, -1, -1, -1, -1, 7, 7, 7, 8],
+                  [-1, -1, -1, -1, -1, 8, 8, 8, 8]]))
+])
+def test_lowest_common_ancestor(en_tokenizer, sentence, heads, lca_matrix):
+    tokens = en_tokenizer(sentence)
+    doc = get_doc(tokens.vocab, [t.text for t in tokens], heads=heads)
    lca = doc.get_lca_matrix()
+    assert (lca == lca_matrix).all()
    assert(lca[1, 1] == 1)
    assert(lca[0, 1] == 2)
    assert(lca[1, 2] == 2)
--- a/spacy/tests/doc/test_span.py
+++ b/spacy/tests/doc/test_span.py
@ -62,10 +62,24 @@ def test_spans_lca_matrix(en_tokenizer):
    tokens = en_tokenizer('the lazy dog slept')
    doc = get_doc(tokens.vocab, [t.text for t in tokens], heads=[2, 1, 1, 0])
    lca = doc[:2].get_lca_matrix()
-    assert(lca[0, 0] == 0)
-    assert(lca[0, 1] == -1)
-    assert(lca[1, 0] == -1)
-    assert(lca[1, 1] == 1)
+    assert lca.shape == (2, 2)
+    assert lca[0, 0] == 0  # the & the -> the
+    assert lca[0, 1] == -1 # the & lazy -> dog (out of span)
+    assert lca[1, 0] == -1 # lazy & the -> dog (out of span)
+    assert lca[1, 1] == 1  # lazy & lazy -> lazy
+
+    lca = doc[1:].get_lca_matrix()
+    assert lca.shape == (3, 3)
+    assert lca[0, 0] == 0 # lazy & lazy -> lazy
+    assert lca[0, 1] == 1 # lazy & dog -> dog
+    assert lca[0, 2] == 2 # lazy & slept -> slept
+
+    lca = doc[2:].get_lca_matrix()
+    assert lca.shape == (2, 2)
+    assert lca[0, 0] == 0 # dog & dog -> dog
+    assert lca[0, 1] == 1 # dog & slept -> slept
+    assert lca[1, 0] == 1 # slept & dog -> slept
+    assert lca[1, 1] == 1 # slept & slept -> slept


 def test_span_similarity_match():
--- a/spacy/tests/regression/test_issue2396.py
+++ b/spacy/tests/regression/test_issue2396.py
@ -6,9 +6,10 @@ from ..util import get_doc
 import pytest
 import numpy

-@pytest.mark.parametrize('sentence,matrix', [
+@pytest.mark.parametrize('sentence,heads,matrix', [
    (
        'She created a test for spacy',
+        [1, 0, 1, -2, -1, -1],
        numpy.array([
            [0, 1, 1, 1, 1, 1],
            [1, 1, 1, 1, 1, 1],
@ -18,8 +19,9 @@ import numpy
            [1, 1, 3, 3, 4, 5]], dtype=numpy.int32)
    )
    ])
-def test_issue2396(EN, sentence, matrix):
-    doc = EN(sentence)
+def test_issue2396(en_tokenizer, sentence, heads, matrix):
+    tokens = en_tokenizer(sentence)
+    doc = get_doc(tokens.vocab, [t.text for t in tokens], heads=heads)
    span = doc[:]
    assert (doc.get_lca_matrix() == matrix).all()
    assert (span.get_lca_matrix() == matrix).all()
--- a/spacy/tokens/doc.pyx
+++ b/spacy/tokens/doc.pyx
@ -1,4 +1,3 @@
-
 # coding: utf8
 # cython: infer_types=True
 # cython: bounds_check=False
@ -1051,21 +1050,30 @@ cdef int [:,:] _get_lca_matrix(Doc doc, int start, int end):
    cdef int [:,:] lca_matrix

    n_tokens= end - start
-    lca_matrix = numpy.empty((n_tokens, n_tokens), dtype=numpy.int32)
+    lca_mat = numpy.empty((n_tokens, n_tokens), dtype=numpy.int32)
+    lca_mat.fill(-1)
+    lca_matrix = lca_mat

-    for j in range(start, end):
-        token_j = doc[j]
+    for j in range(n_tokens):
+        token_j = doc[start + j]
        # the common ancestor of token and itself is itself:
        lca_matrix[j, j] = j
-        for k in range(j + 1, end):
-            lca = _get_tokens_lca(token_j, doc[k])
+        # we will only iterate through tokens in the same sentence
+        sent = token_j.sent
+        sent_start = sent.start
+        j_idx_in_sent = start + j - sent_start
+        n_missing_tokens_in_sent = len(sent) - j_idx_in_sent
+        # make sure we do not go past `end`, in cases where `end` < sent.end
+        max_range = min(j + n_missing_tokens_in_sent, end)
+        for k in range(j + 1, max_range):
+            lca = _get_tokens_lca(token_j, doc[start + k])
            # if lca is outside of span, we set it to -1
            if not start <= lca < end:
                lca_matrix[j, k] = -1
                lca_matrix[k, j] = -1
            else:
-                lca_matrix[j, k] = lca
-                lca_matrix[k, j] = lca
+                lca_matrix[j, k] = lca - start
+                lca_matrix[k, j] = lca - start

    return lca_matrix