mirror of
https://github.com/explosion/spaCy.git
synced 2024-11-11 12:18:04 +03:00
Merge pull request #1298 from ericzhao28/master
Lowest common ancestor matrix for spans and docs
This commit is contained in:
commit
33313c01ad
|
@ -216,6 +216,13 @@ def test_doc_api_has_vector(en_tokenizer, text_file, text, vectors):
|
||||||
doc = en_tokenizer(text)
|
doc = en_tokenizer(text)
|
||||||
assert doc.has_vector
|
assert doc.has_vector
|
||||||
|
|
||||||
|
def test_lowest_common_ancestor(en_tokenizer):
    """Check selected entries of the LCA matrix built for a whole doc."""
    tokenized = en_tokenizer('the lazy dog slept')
    words = [token.text for token in tokenized]
    doc = get_doc(tokenized.vocab, words, heads=[2, 1, 1, 0])
    lca_matrix = doc.get_lca_matrix()
    # A token paired with itself resolves to its own index.
    assert lca_matrix[1, 1] == 1
    # Tokens 0 and 1 are expected to meet at token 2.
    assert lca_matrix[0, 1] == 2
    # Token 2 is expected to be the common ancestor of itself and token 1.
    assert lca_matrix[1, 2] == 2
|
||||||
|
|
||||||
def test_parse_tree(en_tokenizer):
|
def test_parse_tree(en_tokenizer):
|
||||||
"""Tests doc.print_tree() method."""
|
"""Tests doc.print_tree() method."""
|
||||||
|
|
|
@ -54,6 +54,17 @@ def test_spans_span_sent(doc):
|
||||||
assert doc[6:7].sent.root.left_edge.text == 'This'
|
assert doc[6:7].sent.root.left_edge.text == 'This'
|
||||||
|
|
||||||
|
|
||||||
|
def test_spans_lca_matrix(en_tokenizer):
    """Test span's lca matrix generation"""
    tokenized = en_tokenizer('the lazy dog slept')
    words = [token.text for token in tokenized]
    doc = get_doc(tokenized.vocab, words, heads=[2, 1, 1, 0])
    span = doc[:2]
    lca_matrix = span.get_lca_matrix()
    # Diagonal entries: each token resolves to its own span-relative index.
    assert lca_matrix[0, 0] == 0
    assert lca_matrix[1, 1] == 1
    # Off-diagonal entries: no common ancestor is expected inside the span.
    assert lca_matrix[0, 1] == -1
    assert lca_matrix[1, 0] == -1
|
||||||
|
|
||||||
|
|
||||||
def test_spans_default_sentiment(en_tokenizer):
|
def test_spans_default_sentiment(en_tokenizer):
|
||||||
"""Test span.sentiment property's default averaging behaviour"""
|
"""Test span.sentiment property's default averaging behaviour"""
|
||||||
text = "good stuff bad stuff"
|
text = "good stuff bad stuff"
|
||||||
|
|
|
@ -614,6 +614,56 @@ cdef class Doc:
|
||||||
self.is_tagged = bool(TAG in attrs or POS in attrs)
|
self.is_tagged = bool(TAG in attrs or POS in attrs)
|
||||||
return self
|
return self
|
||||||
|
|
||||||
|
|
||||||
|
def get_lca_matrix(self):
    """Calculate the lowest common ancestor (LCA) matrix for this doc.

    Entry [j, k] holds the index of the lowest token that is an ancestor
    of both token j and token k, where every token counts as its own
    ancestor. The entry is -1 if the two tokens have no common ancestor
    (they hang under different roots). The matrix is symmetric.

    RETURNS (numpy.ndarray): int32 array of shape (len(self), len(self)).
    """
    # Efficiency notes:
    #
    # We can easily improve the performance here by iterating in Cython.
    # To loop over the tokens in Cython, the easiest way is:
    # for token in doc.c[:doc.c.length]:
    #     head = token + token.head
    # Both token and head will be TokenC* here. The token.head attribute
    # is an integer offset.
    n_tokens = len(self)
    # heads[i] is the index of token i's head; a root is its own head.
    heads = [self[i].head.i for i in range(n_tokens)]

    # chains[i] lists token i followed by each of its ancestors up to the
    # root, nearest first. Walking the full chain (instead of moving both
    # tokens up to their heads in lockstep) keeps the result correct when
    # one token is a non-root ancestor several levels above the other.
    chains = []
    for i in range(n_tokens):
        chain = [i]
        node = i
        # A well-formed tree never yields more than n_tokens entries; the
        # length bound only stops the walk if the heads contain a cycle.
        while heads[node] != node and len(chain) < n_tokens:
            node = heads[node]
            chain.append(node)
        chains.append(chain)

    lca_matrix = numpy.empty((n_tokens, n_tokens), dtype=numpy.int32)
    # -1 stays in place for every pair that shares no ancestor.
    lca_matrix.fill(-1)
    for j in range(n_tokens):
        ancestors_j = set(chains[j])
        for k in range(j, n_tokens):
            # chains[k] is ordered nearest-first, so the first hit is the
            # lowest common ancestor.
            for candidate in chains[k]:
                if candidate in ancestors_j:
                    lca_matrix[j, k] = candidate
                    lca_matrix[k, j] = candidate
                    break

    return lca_matrix
|
||||||
|
|
||||||
|
|
||||||
def to_bytes(self):
|
def to_bytes(self):
|
||||||
"""
|
"""
|
||||||
Serialize, producing a byte string.
|
Serialize, producing a byte string.
|
||||||
|
|
|
@ -130,6 +130,58 @@ cdef class Span:
|
||||||
return 0.0
|
return 0.0
|
||||||
return numpy.dot(self.vector, other.vector) / (self.vector_norm * other.vector_norm)
|
return numpy.dot(self.vector, other.vector) / (self.vector_norm * other.vector_norm)
|
||||||
|
|
||||||
|
def get_lca_matrix(self):
    """Calculate the lowest common ancestor (LCA) matrix for this span.

    Indices are relative to the start of the span. Entry [j, k] holds the
    span-relative index of the lowest token that is an ancestor of both
    span token j and span token k, where every token counts as its own
    ancestor. A token whose head lies outside the span is treated as a
    root, so the entry is -1 if no common ancestor is found inside the
    span (e.g. if the span excludes a necessary ancestor). The matrix is
    symmetric.

    RETURNS (numpy.ndarray): int32 array of shape (len(self), len(self)).
    """
    start = self.start
    end = self.end
    n_tokens = len(self)

    # local_heads[i] is the span-relative index of token i's head. A head
    # outside [start, end) is replaced by the token itself, which makes
    # the token a root as far as this span is concerned.
    local_heads = []
    for i in range(n_tokens):
        head_index = self[i].head.i
        if start <= head_index < end:
            local_heads.append(head_index - start)
        else:
            local_heads.append(i)

    # chains[i] lists token i followed by each of its in-span ancestors,
    # nearest first. Walking the full chain (instead of moving both tokens
    # up to their heads in lockstep) keeps the result correct when one
    # token is a non-root ancestor several levels above the other.
    chains = []
    for i in range(n_tokens):
        chain = [i]
        node = i
        # A well-formed tree never yields more than n_tokens entries; the
        # length bound only stops the walk if the heads contain a cycle.
        while local_heads[node] != node and len(chain) < n_tokens:
            node = local_heads[node]
            chain.append(node)
        chains.append(chain)

    lca_matrix = numpy.empty((n_tokens, n_tokens), dtype=numpy.int32)
    # -1 stays in place for every pair that shares no in-span ancestor.
    lca_matrix.fill(-1)
    for j in range(n_tokens):
        ancestors_j = set(chains[j])
        for k in range(j, n_tokens):
            # chains[k] is ordered nearest-first, so the first hit is the
            # lowest common ancestor.
            for candidate in chains[k]:
                if candidate in ancestors_j:
                    lca_matrix[j, k] = candidate
                    lca_matrix[k, j] = candidate
                    break

    return lca_matrix
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
cpdef int _recalculate_indices(self) except -1:
|
cpdef int _recalculate_indices(self) except -1:
|
||||||
if self.end > self.doc.length \
|
if self.end > self.doc.length \
|
||||||
or self.doc.c[self.start].idx != self.start_char \
|
or self.doc.c[self.start].idx != self.start_char \
|
||||||
|
|
Loading…
Reference in New Issue
Block a user