diff --git a/spacy/tests/doc/test_doc_api.py b/spacy/tests/doc/test_doc_api.py
index 1bc534ecd..d1a6316d5 100644
--- a/spacy/tests/doc/test_doc_api.py
+++ b/spacy/tests/doc/test_doc_api.py
@@ -216,6 +216,25 @@ def test_doc_api_has_vector(en_tokenizer, text_file, text, vectors):
     doc = en_tokenizer(text)
     assert doc.has_vector
 
+def test_lowest_common_ancestor(en_tokenizer):
+    """Tests doc.get_lca_matrix() method."""
+    tokens = en_tokenizer('the lazy dog slept')
+    doc = get_doc(tokens.vocab, [t.text for t in tokens], heads=[2, 1, 1, 0])
+    lca = doc.get_lca_matrix()
+    assert lca[1, 1] == 1
+    assert lca[0, 1] == 2
+    assert lca[1, 2] == 2
+
+
+def test_lowest_common_ancestor_non_root(en_tokenizer):
+    """Tests that the LCA may be a non-root token several levels up."""
+    tokens = en_tokenizer('the very lazy dog slept')
+    heads = [3, 1, 1, 1, 0]
+    doc = get_doc(tokens.vocab, [t.text for t in tokens], heads=heads)
+    lca = doc.get_lca_matrix()
+    assert lca[1, 3] == 3
+    assert lca[0, 1] == 3
+    assert lca[1, 4] == 4
 
 def test_parse_tree(en_tokenizer):
     """Tests doc.print_tree() method."""
diff --git a/spacy/tests/spans/test_span.py b/spacy/tests/spans/test_span.py
index d22fa52ae..29aefe5c7 100644
--- a/spacy/tests/spans/test_span.py
+++ b/spacy/tests/spans/test_span.py
@@ -54,6 +54,28 @@ def test_spans_span_sent(doc):
     assert doc[6:7].sent.root.left_edge.text == 'This'
 
 
+def test_spans_lca_matrix(en_tokenizer):
+    """Test span's lca matrix generation"""
+    tokens = en_tokenizer('the lazy dog slept')
+    doc = get_doc(tokens.vocab, [t.text for t in tokens], heads=[2, 1, 1, 0])
+    lca = doc[:2].get_lca_matrix()
+    assert lca[0, 0] == 0
+    assert lca[0, 1] == -1
+    assert lca[1, 0] == -1
+    assert lca[1, 1] == 1
+
+
+def test_spans_lca_matrix_non_root(en_tokenizer):
+    """Test that a span's lca may be a non-root token, relative to the span"""
+    tokens = en_tokenizer('the very lazy dog slept')
+    heads = [3, 1, 1, 1, 0]
+    doc = get_doc(tokens.vocab, [t.text for t in tokens], heads=heads)
+    lca = doc[1:5].get_lca_matrix()
+    assert lca[0, 2] == 2
+    assert lca[0, 3] == 3
+    assert lca[2, 0] == 2
+
+
 def test_spans_default_sentiment(en_tokenizer):
     """Test span.sentiment property's default averaging behaviour"""
     text = "good stuff bad stuff"
diff --git a/spacy/tokens/doc.pyx b/spacy/tokens/doc.pyx
index ca5a3d696..aca35a73f 100644
--- a/spacy/tokens/doc.pyx
+++ b/spacy/tokens/doc.pyx
@@ -614,6 +614,58 @@ cdef class Doc:
         self.is_tagged = bool(TAG in attrs or POS in attrs)
         return self
 
+
+    def get_lca_matrix(self):
+        """Calculate the lowest common ancestor (LCA) matrix of the doc.
+
+        The ancestors of every token are collected by following the head
+        pointers up to the root. The LCA of two tokens is then the first
+        ancestor of one token that is also an ancestor of the other.
+
+        RETURNS (numpy.ndarray): An int32 matrix of shape (n, n), where n is
+            the number of tokens. Entry [j, k] is the index of the lowest
+            common ancestor of tokens j and k, or -1 if they have none
+            (e.g. if the two tokens are attached to different roots).
+        """
+        # Efficiency notes:
+        #
+        # We can easily improve the performance here by iterating in Cython.
+        # To loop over the tokens in Cython, the easiest way is:
+        # for token in doc.c[:doc.c.length]:
+        #     head = token + token.head
+        # Both token and head will be TokenC* here. The token.head attribute
+        # is an integer offset.
+        heads = [token.head.i for token in self]
+        # For each token, the indices of the token itself and all of its
+        # ancestors, ordered from the token up to the root.
+        chains = []
+        for i, head in enumerate(heads):
+            chain = [i]
+            seen = set(chain)
+            # A root is its own head, so the walk stops at the root. It also
+            # stops if a malformed parse makes the heads form a cycle.
+            while head not in seen:
+                chain.append(head)
+                seen.add(head)
+                head = heads[head]
+            chains.append(chain)
+
+        n_tokens = len(heads)
+        lca_matrix = numpy.empty((n_tokens, n_tokens), dtype=numpy.int32)
+        lca_matrix.fill(-1)
+        for j in range(n_tokens):
+            ancestors_j = set(chains[j])
+            for k in range(j, n_tokens):
+                for ancestor in chains[k]:
+                    if ancestor in ancestors_j:
+                        # The matrix is symmetric, so fill both halves.
+                        lca_matrix[j, k] = ancestor
+                        lca_matrix[k, j] = ancestor
+                        break
+
+        return lca_matrix
+
+
     def to_bytes(self):
         """
         Serialize, producing a byte string.
diff --git a/spacy/tokens/span.pyx b/spacy/tokens/span.pyx
index d8890addc..ae28f698a 100644
--- a/spacy/tokens/span.pyx
+++ b/spacy/tokens/span.pyx
@@ -130,6 +130,60 @@ cdef class Span:
             return 0.0
         return numpy.dot(self.vector, other.vector) / (self.vector_norm * other.vector_norm)
 
+    def get_lca_matrix(self):
+        """Calculate the lowest common ancestor (LCA) matrix of the span.
+
+        Only tokens inside the span are considered: a token whose head lies
+        outside the span is treated as a root.
+
+        RETURNS (numpy.ndarray): An int32 matrix of shape (n, n), where n is
+            the number of tokens in the span. Entry [j, k] is the index,
+            relative to the start of the span, of the lowest common ancestor
+            of tokens j and k, or -1 if they have none inside the span
+            (e.g. if the span excludes a necessary ancestor).
+        """
+        start = self.start
+        end = self.end
+        # Span-relative index of each token's head. A token whose head is
+        # outside the span becomes its own head, i.e. a root of the span.
+        heads = []
+        for j in range(len(self)):
+            head_i = self[j].head.i
+            if start <= head_i < end:
+                heads.append(head_i - start)
+            else:
+                heads.append(j)
+        # For each token, the span-relative indices of the token itself and
+        # all of its ancestors, ordered from the token up to the root.
+        chains = []
+        for i, head in enumerate(heads):
+            chain = [i]
+            seen = set(chain)
+            # A root is its own head, so the walk stops at the root. It also
+            # stops if a malformed parse makes the heads form a cycle.
+            while head not in seen:
+                chain.append(head)
+                seen.add(head)
+                head = heads[head]
+            chains.append(chain)
+
+        n_tokens = len(heads)
+        lca_matrix = numpy.empty((n_tokens, n_tokens), dtype=numpy.int32)
+        lca_matrix.fill(-1)
+        for j in range(n_tokens):
+            ancestors_j = set(chains[j])
+            for k in range(j, n_tokens):
+                for ancestor in chains[k]:
+                    if ancestor in ancestors_j:
+                        # The matrix is symmetric, so fill both halves.
+                        lca_matrix[j, k] = ancestor
+                        lca_matrix[k, j] = ancestor
+                        break
+
+        return lca_matrix
+
+
+
     cpdef int _recalculate_indices(self) except -1:
         if self.end > self.doc.length \
         or self.doc.c[self.start].idx != self.start_char \