mirror of
https://github.com/explosion/spaCy.git
synced 2024-12-24 00:46:28 +03:00
Bugfix/get lca matrix (#3110)
This PR adds a test for a previously untested case of `Span.get_lca_matrix` and fixes a bug in that scenario, which I introduced in [this PR](https://github.com/explosion/spaCy/pull/3089) (sorry!).

## Description
The previous implementation of `get_lca_matrix` failed for the case `doc[j:k].get_lca_matrix()` where `j > 0`. A test has been added for this case and the bug has been fixed.

### Types of change
Bug fix

## Checklist
- [x] I have submitted the spaCy Contributor Agreement.
- [x] I ran the tests, and all new and existing tests passed.
- [x] My changes don't require a change to the documentation, or if they do, I've added all required information.
This commit is contained in:
parent
9972716e01
commit
e03e1eee92
|
@ -278,10 +278,30 @@ def test_doc_api_similarity_match():
|
|||
assert doc.similarity(doc2) == 0.0
|
||||
|
||||
|
||||
def test_lowest_common_ancestor(en_tokenizer):
|
||||
tokens = en_tokenizer('the lazy dog slept')
|
||||
doc = get_doc(tokens.vocab, [t.text for t in tokens], heads=[2, 1, 1, 0])
|
||||
@pytest.mark.parametrize('sentence,heads,lca_matrix', [
|
||||
('the lazy dog slept',
|
||||
[2, 1, 1, 0],
|
||||
numpy.array([[0, 2, 2, 3],
|
||||
[2, 1, 2, 3],
|
||||
[2, 2, 2, 3],
|
||||
[3, 3, 3, 3]])),
|
||||
('The lazy dog slept. The quick fox jumped',
|
||||
[2, 1, 1, 0, -1, 2, 1, 1, 0],
|
||||
numpy.array([[0, 2, 2, 3, 3, -1, -1, -1, -1],
|
||||
[2, 1, 2, 3, 3, -1, -1, -1, -1],
|
||||
[2, 2, 2, 3, 3, -1, -1, -1, -1],
|
||||
[3, 3, 3, 3, 3, -1, -1, -1, -1],
|
||||
[3, 3, 3, 3, 4, -1, -1, -1, -1],
|
||||
[-1, -1, -1, -1, -1, 5, 7, 7, 8],
|
||||
[-1, -1, -1, -1, -1, 7, 6, 7, 8],
|
||||
[-1, -1, -1, -1, -1, 7, 7, 7, 8],
|
||||
[-1, -1, -1, -1, -1, 8, 8, 8, 8]]))
|
||||
])
|
||||
def test_lowest_common_ancestor(en_tokenizer, sentence, heads, lca_matrix):
|
||||
tokens = en_tokenizer(sentence)
|
||||
doc = get_doc(tokens.vocab, [t.text for t in tokens], heads=heads)
|
||||
lca = doc.get_lca_matrix()
|
||||
assert (lca == lca_matrix).all()
|
||||
assert(lca[1, 1] == 1)
|
||||
assert(lca[0, 1] == 2)
|
||||
assert(lca[1, 2] == 2)
|
||||
|
|
|
@ -62,10 +62,24 @@ def test_spans_lca_matrix(en_tokenizer):
|
|||
tokens = en_tokenizer('the lazy dog slept')
|
||||
doc = get_doc(tokens.vocab, [t.text for t in tokens], heads=[2, 1, 1, 0])
|
||||
lca = doc[:2].get_lca_matrix()
|
||||
assert(lca[0, 0] == 0)
|
||||
assert(lca[0, 1] == -1)
|
||||
assert(lca[1, 0] == -1)
|
||||
assert(lca[1, 1] == 1)
|
||||
assert lca.shape == (2, 2)
|
||||
assert lca[0, 0] == 0 # the & the -> the
|
||||
assert lca[0, 1] == -1 # the & lazy -> dog (out of span)
|
||||
assert lca[1, 0] == -1 # lazy & the -> dog (out of span)
|
||||
assert lca[1, 1] == 1 # lazy & lazy -> lazy
|
||||
|
||||
lca = doc[1:].get_lca_matrix()
|
||||
assert lca.shape == (3, 3)
|
||||
assert lca[0, 0] == 0 # lazy & lazy -> lazy
|
||||
assert lca[0, 1] == 1 # lazy & dog -> dog
|
||||
assert lca[0, 2] == 2 # lazy & slept -> slept
|
||||
|
||||
lca = doc[2:].get_lca_matrix()
|
||||
assert lca.shape == (2, 2)
|
||||
assert lca[0, 0] == 0 # dog & dog -> dog
|
||||
assert lca[0, 1] == 1 # dog & slept -> slept
|
||||
assert lca[1, 0] == 1 # slept & dog -> slept
|
||||
assert lca[1, 1] == 1 # slept & slept -> slept
|
||||
|
||||
|
||||
def test_span_similarity_match():
|
||||
|
|
|
@ -6,9 +6,10 @@ from ..util import get_doc
|
|||
import pytest
|
||||
import numpy
|
||||
|
||||
@pytest.mark.parametrize('sentence,matrix', [
|
||||
@pytest.mark.parametrize('sentence,heads,matrix', [
|
||||
(
|
||||
'She created a test for spacy',
|
||||
[1, 0, 1, -2, -1, -1],
|
||||
numpy.array([
|
||||
[0, 1, 1, 1, 1, 1],
|
||||
[1, 1, 1, 1, 1, 1],
|
||||
|
@ -18,8 +19,9 @@ import numpy
|
|||
[1, 1, 3, 3, 4, 5]], dtype=numpy.int32)
|
||||
)
|
||||
])
|
||||
def test_issue2396(EN, sentence, matrix):
|
||||
doc = EN(sentence)
|
||||
def test_issue2396(en_tokenizer, sentence, heads, matrix):
|
||||
tokens = en_tokenizer(sentence)
|
||||
doc = get_doc(tokens.vocab, [t.text for t in tokens], heads=heads)
|
||||
span = doc[:]
|
||||
assert (doc.get_lca_matrix() == matrix).all()
|
||||
assert (span.get_lca_matrix() == matrix).all()
|
||||
|
|
|
@ -1,4 +1,3 @@
|
|||
|
||||
# coding: utf8
|
||||
# cython: infer_types=True
|
||||
# cython: bounds_check=False
|
||||
|
@ -1051,21 +1050,30 @@ cdef int [:,:] _get_lca_matrix(Doc doc, int start, int end):
|
|||
cdef int [:,:] lca_matrix
|
||||
|
||||
n_tokens= end - start
|
||||
lca_matrix = numpy.empty((n_tokens, n_tokens), dtype=numpy.int32)
|
||||
lca_mat = numpy.empty((n_tokens, n_tokens), dtype=numpy.int32)
|
||||
lca_mat.fill(-1)
|
||||
lca_matrix = lca_mat
|
||||
|
||||
for j in range(start, end):
|
||||
token_j = doc[j]
|
||||
for j in range(n_tokens):
|
||||
token_j = doc[start + j]
|
||||
# the common ancestor of token and itself is itself:
|
||||
lca_matrix[j, j] = j
|
||||
for k in range(j + 1, end):
|
||||
lca = _get_tokens_lca(token_j, doc[k])
|
||||
# we will only iterate through tokens in the same sentence
|
||||
sent = token_j.sent
|
||||
sent_start = sent.start
|
||||
j_idx_in_sent = start + j - sent_start
|
||||
n_missing_tokens_in_sent = len(sent) - j_idx_in_sent
|
||||
# make sure we do not go past `end`, in cases where `end` < sent.end
|
||||
max_range = min(j + n_missing_tokens_in_sent, end)
|
||||
for k in range(j + 1, max_range):
|
||||
lca = _get_tokens_lca(token_j, doc[start + k])
|
||||
# if lca is outside of span, we set it to -1
|
||||
if not start <= lca < end:
|
||||
lca_matrix[j, k] = -1
|
||||
lca_matrix[k, j] = -1
|
||||
else:
|
||||
lca_matrix[j, k] = lca
|
||||
lca_matrix[k, j] = lca
|
||||
lca_matrix[j, k] = lca - start
|
||||
lca_matrix[k, j] = lca - start
|
||||
|
||||
return lca_matrix
|
||||
|
||||
|
|
Loading…
Reference in New Issue
Block a user