Bugfix/get lca matrix (#3110)

This PR adds a test for an untested case of `Span.get_lca_matrix`, and fixes a bug for that scenario, which I introduced in [this PR](https://github.com/explosion/spaCy/pull/3089) (sorry!).

## Description
The previous implementation of `Span.get_lca_matrix` was failing for the case `doc[j:k].get_lca_matrix()` where `j > 0`. A test has been added for this case, and the bug has been fixed.

### Types of change
Bug fix

## Checklist

- [x] I have submitted the spaCy Contributor Agreement.
- [x] I ran the tests, and all new and existing tests passed.
- [x] My changes don't require a change to the documentation, or if they do, I've added all required information.
This commit is contained in:
Álvaro Abella Bascarán 2019-01-06 19:07:50 +01:00 committed by Ines Montani
parent 9972716e01
commit e03e1eee92
4 changed files with 62 additions and 18 deletions

View File

@ -278,10 +278,30 @@ def test_doc_api_similarity_match():
assert doc.similarity(doc2) == 0.0
@pytest.mark.parametrize('sentence,heads,lca_matrix', [
    ('the lazy dog slept',
     [2, 1, 1, 0],
     numpy.array([[0, 2, 2, 3],
                  [2, 1, 2, 3],
                  [2, 2, 2, 3],
                  [3, 3, 3, 3]])),
    ('The lazy dog slept. The quick fox jumped',
     [2, 1, 1, 0, -1, 2, 1, 1, 0],
     numpy.array([[0, 2, 2, 3, 3, -1, -1, -1, -1],
                  [2, 1, 2, 3, 3, -1, -1, -1, -1],
                  [2, 2, 2, 3, 3, -1, -1, -1, -1],
                  [3, 3, 3, 3, 3, -1, -1, -1, -1],
                  [3, 3, 3, 3, 4, -1, -1, -1, -1],
                  [-1, -1, -1, -1, -1, 5, 7, 7, 8],
                  [-1, -1, -1, -1, -1, 7, 6, 7, 8],
                  [-1, -1, -1, -1, -1, 7, 7, 7, 8],
                  [-1, -1, -1, -1, -1, 8, 8, 8, 8]]))
])
def test_lowest_common_ancestor(en_tokenizer, sentence, heads, lca_matrix):
    """Doc.get_lca_matrix: entry [i, j] is the index of the lowest common
    ancestor of tokens i and j in the dependency tree; tokens in different
    sentences have no common ancestor, marked -1 (second parametrization).
    """
    tokens = en_tokenizer(sentence)
    doc = get_doc(tokens.vocab, [t.text for t in tokens], heads=heads)
    lca = doc.get_lca_matrix()
    assert (lca == lca_matrix).all()
    # Spot-check entries that hold for both parametrized sentences:
    # the LCA of a token with itself is the token itself.
    assert lca[1, 1] == 1
    assert lca[0, 1] == 2
    assert lca[1, 2] == 2

View File

@ -62,10 +62,24 @@ def test_spans_lca_matrix(en_tokenizer):
def test_spans_lca_matrix(en_tokenizer):
    """Span.get_lca_matrix: indices in the result are span-relative, and an
    LCA that falls outside the span is reported as -1. Spans that do not
    start at token 0 are covered explicitly (regression for #3110).
    """
    tokens = en_tokenizer('the lazy dog slept')
    doc = get_doc(tokens.vocab, [t.text for t in tokens], heads=[2, 1, 1, 0])
    lca = doc[:2].get_lca_matrix()
    assert lca.shape == (2, 2)
    assert lca[0, 0] == 0  # the & the -> the
    assert lca[0, 1] == -1  # the & lazy -> dog (out of span)
    assert lca[1, 0] == -1  # lazy & the -> dog (out of span)
    assert lca[1, 1] == 1  # lazy & lazy -> lazy
    lca = doc[1:].get_lca_matrix()
    assert lca.shape == (3, 3)
    assert lca[0, 0] == 0  # lazy & lazy -> lazy
    assert lca[0, 1] == 1  # lazy & dog -> dog
    assert lca[0, 2] == 2  # lazy & slept -> slept
    lca = doc[2:].get_lca_matrix()
    assert lca.shape == (2, 2)
    assert lca[0, 0] == 0  # dog & dog -> dog
    assert lca[0, 1] == 1  # dog & slept -> slept
    assert lca[1, 0] == 1  # slept & dog -> slept
    assert lca[1, 1] == 1  # slept & slept -> slept
def test_span_similarity_match():

View File

@ -6,9 +6,10 @@ from ..util import get_doc
import pytest
import numpy
@pytest.mark.parametrize('sentence,heads,matrix', [
    (
        'She created a test for spacy',
        [1, 0, 1, -2, -1, -1],
        # NOTE(review): the three middle rows were elided in the diff view;
        # they are reconstructed from heads=[1, 0, 1, -2, -1, -1]
        # (token heads: 0->1, 1=root, 2->3, 3->1, 4->3, 5->4) — verify
        # against the upstream file.
        numpy.array([
            [0, 1, 1, 1, 1, 1],
            [1, 1, 1, 1, 1, 1],
            [1, 1, 2, 3, 3, 3],
            [1, 1, 3, 3, 3, 3],
            [1, 1, 3, 3, 4, 4],
            [1, 1, 3, 3, 4, 5]], dtype=numpy.int32)
    )
])
def test_issue2396(en_tokenizer, sentence, heads, matrix):
    """Regression test for #2396: get_lca_matrix computed on a Doc and on a
    whole-document Span must both equal the expected LCA matrix.
    """
    tokens = en_tokenizer(sentence)
    doc = get_doc(tokens.vocab, [t.text for t in tokens], heads=heads)
    span = doc[:]
    assert (doc.get_lca_matrix() == matrix).all()
    assert (span.get_lca_matrix() == matrix).all()

View File

@ -1,4 +1,3 @@
# coding: utf8
# cython: infer_types=True
# cython: bounds_check=False
cdef int [:,:] _get_lca_matrix(Doc doc, int start, int end):
    """Compute the (end - start) x (end - start) lowest-common-ancestor
    matrix for the tokens doc[start:end].

    Entry [j, k] is the span-relative index of the LCA of doc[start + j]
    and doc[start + k]; it is -1 when the LCA lies outside [start, end) or
    when the two tokens belong to different sentences.
    """
    cdef int [:,:] lca_matrix
    n_tokens = end - start
    # Pre-fill with -1: token pairs in different sentences are never
    # visited by the inner loop and keep this "no common ancestor" marker.
    lca_mat = numpy.empty((n_tokens, n_tokens), dtype=numpy.int32)
    lca_mat.fill(-1)
    lca_matrix = lca_mat
    for j in range(n_tokens):
        token_j = doc[start + j]
        # the common ancestor of a token and itself is itself
        lca_matrix[j, j] = j
        # we only iterate through tokens in the same sentence
        sent = token_j.sent
        sent_start = sent.start
        j_idx_in_sent = start + j - sent_start
        n_missing_tokens_in_sent = len(sent) - j_idx_in_sent
        # Cap the inner loop at the end of the span. BUG FIX: `k` is a
        # span-relative index, so the cap must be span-relative as well
        # (n_tokens == end - start), not the doc-absolute `end`. With the
        # absolute bound, any span with start > 0 and end < sent.end let
        # `k` run past the matrix edge — an out-of-bounds write, silent
        # because this module compiles with bounds checking disabled.
        max_range = min(j + n_missing_tokens_in_sent, n_tokens)
        for k in range(j + 1, max_range):
            lca = _get_tokens_lca(token_j, doc[start + k])
            # if the LCA is outside of the span, we set it to -1
            if not start <= lca < end:
                lca_matrix[j, k] = -1
                lca_matrix[k, j] = -1
            else:
                # store span-relative indices in the symmetric matrix
                lca_matrix[j, k] = lca - start
                lca_matrix[k, j] = lca - start
    return lca_matrix