diff --git a/spacy/tests/doc/test_doc_api.py b/spacy/tests/doc/test_doc_api.py
index f00668d83..ee99897d6 100644
--- a/spacy/tests/doc/test_doc_api.py
+++ b/spacy/tests/doc/test_doc_api.py
@@ -278,10 +278,30 @@ def test_doc_api_similarity_match():
     assert doc.similarity(doc2) == 0.0


-def test_lowest_common_ancestor(en_tokenizer):
-    tokens = en_tokenizer('the lazy dog slept')
-    doc = get_doc(tokens.vocab, [t.text for t in tokens], heads=[2, 1, 1, 0])
+@pytest.mark.parametrize('sentence,heads,lca_matrix', [
+    ('the lazy dog slept',
+     [2, 1, 1, 0],
+     numpy.array([[0, 2, 2, 3],
+                  [2, 1, 2, 3],
+                  [2, 2, 2, 3],
+                  [3, 3, 3, 3]])),
+    ('The lazy dog slept. The quick fox jumped',
+     [2, 1, 1, 0, -1, 2, 1, 1, 0],
+     numpy.array([[0, 2, 2, 3, 3, -1, -1, -1, -1],
+                  [2, 1, 2, 3, 3, -1, -1, -1, -1],
+                  [2, 2, 2, 3, 3, -1, -1, -1, -1],
+                  [3, 3, 3, 3, 3, -1, -1, -1, -1],
+                  [3, 3, 3, 3, 4, -1, -1, -1, -1],
+                  [-1, -1, -1, -1, -1, 5, 7, 7, 8],
+                  [-1, -1, -1, -1, -1, 7, 6, 7, 8],
+                  [-1, -1, -1, -1, -1, 7, 7, 7, 8],
+                  [-1, -1, -1, -1, -1, 8, 8, 8, 8]]))
+])
+def test_lowest_common_ancestor(en_tokenizer, sentence, heads, lca_matrix):
+    tokens = en_tokenizer(sentence)
+    doc = get_doc(tokens.vocab, [t.text for t in tokens], heads=heads)
     lca = doc.get_lca_matrix()
+    assert (lca == lca_matrix).all()
     assert(lca[1, 1] == 1)
     assert(lca[0, 1] == 2)
     assert(lca[1, 2] == 2)
diff --git a/spacy/tests/doc/test_span.py b/spacy/tests/doc/test_span.py
index 926624633..413bd0611 100644
--- a/spacy/tests/doc/test_span.py
+++ b/spacy/tests/doc/test_span.py
@@ -62,10 +62,24 @@ def test_spans_lca_matrix(en_tokenizer):
     tokens = en_tokenizer('the lazy dog slept')
     doc = get_doc(tokens.vocab, [t.text for t in tokens], heads=[2, 1, 1, 0])
     lca = doc[:2].get_lca_matrix()
-    assert(lca[0, 0] == 0)
-    assert(lca[0, 1] == -1)
-    assert(lca[1, 0] == -1)
-    assert(lca[1, 1] == 1)
+    assert lca.shape == (2, 2)
+    assert lca[0, 0] == 0  # the & the -> the
+    assert lca[0, 1] == -1  # the & lazy -> dog (out of span)
+    assert lca[1, 0] == -1  # lazy & the -> dog (out of span)
+    assert lca[1, 1] == 1  # lazy & lazy -> lazy
+
+    lca = doc[1:].get_lca_matrix()
+    assert lca.shape == (3, 3)
+    assert lca[0, 0] == 0  # lazy & lazy -> lazy
+    assert lca[0, 1] == 1  # lazy & dog -> dog
+    assert lca[0, 2] == 2  # lazy & slept -> slept
+
+    lca = doc[2:].get_lca_matrix()
+    assert lca.shape == (2, 2)
+    assert lca[0, 0] == 0  # dog & dog -> dog
+    assert lca[0, 1] == 1  # dog & slept -> slept
+    assert lca[1, 0] == 1  # slept & dog -> slept
+    assert lca[1, 1] == 1  # slept & slept -> slept


 def test_span_similarity_match():
diff --git a/spacy/tests/regression/test_issue2396.py b/spacy/tests/regression/test_issue2396.py
index c3ff04225..424c34ac1 100644
--- a/spacy/tests/regression/test_issue2396.py
+++ b/spacy/tests/regression/test_issue2396.py
@@ -6,9 +6,10 @@ from ..util import get_doc
 import pytest
 import numpy
-@pytest.mark.parametrize('sentence,matrix', [
+@pytest.mark.parametrize('sentence,heads,matrix', [
     (
         'She created a test for spacy',
+        [1, 0, 1, -2, -1, -1],
         numpy.array([
             [0, 1, 1, 1, 1, 1],
             [1, 1, 1, 1, 1, 1],
             [1, 1, 2, 3, 3, 3],
@@ -18,8 +19,9 @@ import numpy
             [1, 1, 3, 3, 4, 5]], dtype=numpy.int32)
     )
 ])
-def test_issue2396(EN, sentence, matrix):
-    doc = EN(sentence)
+def test_issue2396(en_tokenizer, sentence, heads, matrix):
+    tokens = en_tokenizer(sentence)
+    doc = get_doc(tokens.vocab, [t.text for t in tokens], heads=heads)
     span = doc[:]
     assert (doc.get_lca_matrix() == matrix).all()
     assert (span.get_lca_matrix() == matrix).all()
diff --git a/spacy/tokens/doc.pyx b/spacy/tokens/doc.pyx
index ae3b4fed6..ba39ac304 100644
--- a/spacy/tokens/doc.pyx
+++ b/spacy/tokens/doc.pyx
@@ -1,4 +1,3 @@
-
 # coding: utf8
 # cython: infer_types=True
 # cython: bounds_check=False
@@ -1051,21 +1050,30 @@ cdef int [:,:] _get_lca_matrix(Doc doc, int start, int end):
     cdef int [:,:] lca_matrix

     n_tokens= end - start
-    lca_matrix = numpy.empty((n_tokens, n_tokens), dtype=numpy.int32)
+    lca_mat = numpy.empty((n_tokens, n_tokens), dtype=numpy.int32)
+    lca_mat.fill(-1)
+    lca_matrix = lca_mat

-    for j in range(start, end):
-        token_j = doc[j]
+    for j in range(n_tokens):
+        token_j = doc[start + j]
         # the common ancestor of token and itself is itself:
         lca_matrix[j, j] = j
-        for k in range(j + 1, end):
-            lca = _get_tokens_lca(token_j, doc[k])
+        # we will only iterate through tokens in the same sentence
+        sent = token_j.sent
+        sent_start = sent.start
+        j_idx_in_sent = start + j - sent_start
+        n_missing_tokens_in_sent = len(sent) - j_idx_in_sent
+        # make sure we do not go past `end`, in cases where `end` < sent.end
+        max_range = min(j + n_missing_tokens_in_sent, end)
+        for k in range(j + 1, max_range):
+            lca = _get_tokens_lca(token_j, doc[start + k])
             # if lca is outside of span, we set it to -1
             if not start <= lca < end:
                 lca_matrix[j, k] = -1
                 lca_matrix[k, j] = -1
             else:
-                lca_matrix[j, k] = lca - start
-                lca_matrix[k, j] = lca - start
+                lca_matrix[j, k] = lca - start
+                lca_matrix[k, j] = lca - start

     return lca_matrix
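
Note on the _get_lca_matrix change: pre-filling the matrix with -1 (lca_mat.fill(-1)) is what lets the sentence-restricted inner loop skip cross-sentence pairs entirely while still reporting them as -1. As a sanity check of the span-relative indexing, here is a minimal pure-Python sketch. It is not spaCy's actual implementation: the helper names are made up, heads holds absolute head indices (a root points to itself) rather than the relative offsets get_doc takes, and the sentence-bound optimisation is deliberately omitted, since the -1 prefill alone gives the same result.

    import numpy

    def ancestors(i, heads):
        # Walk head pointers up to the root, yielding absolute indices.
        # A root is encoded as a token that is its own head.
        seen = set([i])
        while heads[i] != i:
            i = heads[i]
            if i in seen:  # guard against malformed (cyclic) "trees"
                break
            seen.add(i)
            yield i

    def lca_matrix(heads, start, end):
        n = end - start
        lca = numpy.full((n, n), -1, dtype=numpy.int32)
        for j in range(n):
            # ancestor chain of token j, nearest first, including j itself
            chain_j = [start + j] + list(ancestors(start + j, heads))
            lca[j, j] = j
            for k in range(j + 1, n):
                anc_k = set([start + k]) | set(ancestors(start + k, heads))
                # first common element of chain_j is the lowest common ancestor;
                # -1 if the tokens are in different trees (sentences)
                common = next((a for a in chain_j if a in anc_k), -1)
                if start <= common < end:
                    lca[j, k] = lca[k, j] = common - start
        return lca

    # 'the lazy dog slept': the->dog, lazy->dog, dog->slept, slept = root
    heads = [2, 2, 3, 3]
    print(lca_matrix(heads, 0, 4))  # first parametrized case in test_doc_api.py
    print(lca_matrix(heads, 0, 2))  # doc[:2]: LCA 'dog' is off-span -> -1

Running the sketch reproduces the first parametrized matrix in test_doc_api.py and the doc[:2] assertions in test_span.py, which is the behaviour the patched Cython code commits to: indices relative to the span's start, and -1 wherever the lowest common ancestor falls outside [start, end).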