From d2fe56a5779fbc56b1b8db2b16dc45443d1e076c Mon Sep 17 00:00:00 2001
From: Ramanan Balakrishnan <ramanan90@gmail.com>
Date: Fri, 20 Oct 2017 23:58:00 +0530
Subject: [PATCH] Add LCA matrix for spans and docs

---
 spacy/tests/doc/test_doc_api.py |  7 +++++
 spacy/tests/spans/test_span.py  | 11 ++++++++
 spacy/tokens/doc.pyx            | 48 +++++++++++++++++++++++++++++++
 spacy/tokens/span.pyx           | 50 +++++++++++++++++++++++++++++++++
 4 files changed, 116 insertions(+)

diff --git a/spacy/tests/doc/test_doc_api.py b/spacy/tests/doc/test_doc_api.py
index cbe1bbc66..5e052f771 100644
--- a/spacy/tests/doc/test_doc_api.py
+++ b/spacy/tests/doc/test_doc_api.py
@@ -217,6 +217,13 @@ def test_doc_api_has_vector(en_tokenizer, text_file, text, vectors):
     doc = en_tokenizer(text)
     assert doc.has_vector
 
+def test_lowest_common_ancestor(en_tokenizer):
+    """Test doc's lca matrix generation"""
+    tokens = en_tokenizer('the lazy dog slept')
+    words = [token.text for token in tokens]
+    doc = get_doc(tokens.vocab, words, heads=[2, 1, 1, 0])
+    lca_matrix = doc.get_lca_matrix()
+    assert [lca_matrix[1, 1], lca_matrix[0, 1], lca_matrix[1, 2]] == [1, 2, 2]
 
 def test_parse_tree(en_tokenizer):
     """Tests doc.print_tree() method."""
diff --git a/spacy/tests/spans/test_span.py b/spacy/tests/spans/test_span.py
index 7ed9333b8..5e7c638b6 100644
--- a/spacy/tests/spans/test_span.py
+++ b/spacy/tests/spans/test_span.py
@@ -55,6 +55,17 @@ def test_spans_span_sent(doc):
     assert doc[6:7].sent.root.left_edge.text == 'This'
 
 
+def test_spans_lca_matrix(en_tokenizer):
+    """Check the LCA matrix produced for the span doc[:2]."""
+    tokens = en_tokenizer('the lazy dog slept')
+    words = [token.text for token in tokens]
+    doc = get_doc(tokens.vocab, words, heads=[2, 1, 1, 0])
+    span = doc[:2]
+    lca = span.get_lca_matrix()
+    # Entries are compared in row-major order: [0, 0], [0, 1], [1, 0], [1, 1].
+    assert [lca[0, 0], lca[0, 1], lca[1, 0], lca[1, 1]] == [0, -1, -1, 1]
+
+
 def test_spans_default_sentiment(en_tokenizer):
     """Test span.sentiment property's default averaging behaviour"""
     text = "good stuff bad stuff"
diff --git a/spacy/tokens/doc.pyx b/spacy/tokens/doc.pyx
index 809f178f8..fa5b4ba28 100644
--- a/spacy/tokens/doc.pyx
+++ b/spacy/tokens/doc.pyx
@@ -660,6 +660,54 @@ cdef class Doc:
         self.is_tagged = bool(TAG in attrs or POS in attrs)
         return self
 
+    def get_lca_matrix(self):
+        """Calculate the lowest common ancestor (LCA) matrix of the doc.
+
+        Every token is treated as an ancestor of itself, so the LCA of a
+        token and one of its (direct or indirect) heads is that head.
+
+        RETURNS (numpy.ndarray): Symmetric int32 array of shape
+            (len(self), len(self)). Entry [j, k] holds the index of the
+            lowest common ancestor of tokens j and k, or -1 if the two
+            tokens have no common ancestor (e.g. they belong to different
+            parse trees).
+        """
+        # Efficiency notes:
+        #
+        # We can easily improve the performance here by iterating in Cython.
+        # To loop over the tokens in Cython, the easiest way is:
+        # for token in doc.c[:doc.c.length]:
+        #     head = token + token.head
+        # Both token and head will be TokenC* here. The token.head attribute
+        # is an integer offset.
+        n_tokens = len(self)
+        # head_paths[j] holds the token indices on the path from token j up
+        # to its root, starting with j itself. A root is its own head. The
+        # length guard stops a malformed (cyclic) parse from looping forever.
+        head_paths = []
+        for j in range(n_tokens):
+            token = self[j]
+            path = [token.i]
+            while token.head.i != token.i and len(path) < n_tokens:
+                token = token.head
+                path.append(token.i)
+            head_paths.append(path)
+
+        # -1 marks pairs for which no common ancestor is found below.
+        lca_matrix = numpy.empty((n_tokens, n_tokens), dtype=numpy.int32)
+        lca_matrix.fill(-1)
+        for j in range(n_tokens):
+            ancestors_j = set(head_paths[j])
+            for k in range(j, n_tokens):
+                # Walking up from k, the first index that also lies on j's
+                # path is the deepest node shared by both paths.
+                for index in head_paths[k]:
+                    if index in ancestors_j:
+                        lca_matrix[j, k] = lca_matrix[k, j] = index
+                        break
+
+        return lca_matrix
+
     def to_disk(self, path, **exclude):
         """Save the current state to a directory.
 
diff --git a/spacy/tokens/span.pyx b/spacy/tokens/span.pyx
index 3b31c50c0..b0a170ddf 100644
--- a/spacy/tokens/span.pyx
+++ b/spacy/tokens/span.pyx
@@ -177,6 +177,56 @@ cdef class Span:
             return 0.0
         return numpy.dot(self.vector, other.vector) / (self.vector_norm * other.vector_norm)
 
+    def get_lca_matrix(self):
+        """Calculate the lowest common ancestor (LCA) matrix of the span.
+
+        Only heads that lie inside the span are followed: a token whose head
+        is outside the span is treated as a root. Every token counts as an
+        ancestor of itself.
+
+        RETURNS (numpy.ndarray): Symmetric int32 array of shape
+            (len(self), len(self)). Entry [j, k] holds the index, relative
+            to the start of the span, of the lowest common ancestor of
+            tokens j and k, or -1 if no common ancestor is found (e.g. if
+            the span excludes a necessary ancestor).
+        """
+        n_tokens = len(self)
+        start, end = self.start, self.end
+        # head_paths[j] holds the span-relative indices on the path from
+        # token j up to the highest ancestor reachable without leaving the
+        # span, starting with j itself. The length guard stops a malformed
+        # (cyclic) parse from looping forever.
+        head_paths = []
+        for j in range(n_tokens):
+            token = self[j]
+            path = [token.i - start]
+            while len(path) < n_tokens:
+                head = token.head
+                # Stop at a root (its own head) or when the head is outside
+                # the span. Plain bound comparisons replace constructing
+                # `range(start, end)` for every membership check.
+                if head.i == token.i or not (start <= head.i < end):
+                    break
+                token = head
+                path.append(token.i - start)
+            head_paths.append(path)
+
+        # -1 marks pairs for which no common ancestor is found below.
+        lca_matrix = numpy.empty((n_tokens, n_tokens), dtype=numpy.int32)
+        lca_matrix.fill(-1)
+        for j in range(n_tokens):
+            ancestors_j = set(head_paths[j])
+            # The matrix is symmetric, so only pairs with k >= j are computed.
+            for k in range(j, n_tokens):
+                # Walking up from k, the first index that also lies on j's
+                # path is the deepest in-span node shared by both paths.
+                for index in head_paths[k]:
+                    if index in ancestors_j:
+                        lca_matrix[j, k] = lca_matrix[k, j] = index
+                        break
+
+        return lca_matrix
+
     cpdef np.ndarray to_array(self, object py_attr_ids):
         """Given a list of M attribute IDs, export the tokens to a numpy
         `ndarray` of shape `(N, M)`, where `N` is the length of the document.