Merge pull request #1987 from thomasopsomer/span-sent

Make span.sent work when only manual / custom sbd
This commit is contained in:
Matthew Honnibal 2018-02-18 14:05:37 +01:00 committed by GitHub
commit 1e5aeb4eec
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
2 changed files with 54 additions and 9 deletions

View File

@ -19,6 +19,15 @@ def doc(en_tokenizer):
return get_doc(tokens.vocab, [t.text for t in tokens], heads=heads, deps=deps) return get_doc(tokens.vocab, [t.text for t in tokens], heads=heads, deps=deps)
@pytest.fixture
def doc_not_parsed(en_tokenizer):
    """Three-sentence Doc built without heads/deps and flagged as unparsed."""
    text = "This is a sentence. This is another sentence. And a third."
    tokenized = en_tokenizer(text)
    vocab = tokenized.vocab
    words = [token.text for token in tokenized]
    unparsed = get_doc(vocab, words)
    # Explicitly mark the doc as lacking a dependency parse.
    unparsed.is_parsed = False
    return unparsed
def test_spans_sent_spans(doc): def test_spans_sent_spans(doc):
sents = list(doc.sents) sents = list(doc.sents)
assert sents[0].start == 0 assert sents[0].start == 0
@ -34,6 +43,7 @@ def test_spans_root(doc):
assert span.root.text == 'sentence' assert span.root.text == 'sentence'
assert span.root.head.text == 'is' assert span.root.head.text == 'is'
def test_spans_string_fn(doc): def test_spans_string_fn(doc):
span = doc[0:4] span = doc[0:4]
assert len(span) == 4 assert len(span) == 4
@ -41,6 +51,7 @@ def test_spans_string_fn(doc):
assert span.upper_ == 'THIS IS A SENTENCE' assert span.upper_ == 'THIS IS A SENTENCE'
assert span.lower_ == 'this is a sentence' assert span.lower_ == 'this is a sentence'
def test_spans_root2(en_tokenizer): def test_spans_root2(en_tokenizer):
text = "through North and South Carolina" text = "through North and South Carolina"
heads = [0, 3, -1, -2, -4] heads = [0, 3, -1, -2, -4]
@ -49,12 +60,17 @@ def test_spans_root2(en_tokenizer):
assert doc[-2:].root.text == 'Carolina' assert doc[-2:].root.text == 'Carolina'
def test_spans_span_sent(doc, doc_not_parsed):
    """Test span.sent property"""
    # Parsed doc: the sentence is recovered from the dependency tree.
    assert len(list(doc.sents))
    assert doc[:2].sent.root.text == 'is'
    assert doc[:2].sent.text == 'This is a sentence .'
    assert doc[6:7].sent.root.left_edge.text == 'This'
    # Unparsed doc: test on manual sbd, with boundaries set by hand
    # at tokens 0 and 5.
    for boundary in (0, 5):
        doc_not_parsed[boundary].is_sent_start = True
    inner_span = doc_not_parsed[1:3]
    assert inner_span.sent == doc_not_parsed[0:5]
    trailing_span = doc_not_parsed[10:14]
    assert trailing_span.sent == doc_not_parsed[5:]
def test_spans_lca_matrix(en_tokenizer): def test_spans_lca_matrix(en_tokenizer):

View File

@ -285,16 +285,45 @@ cdef class Span:
def __get__(self):
    """Return the sentence span that contains this span.

    A user hook registered under 'sent' in `doc.user_span_hooks` takes
    precedence. Otherwise the sentence is found from the dependency
    parse when `doc.is_parsed` is set, or from the tokens' `sent_start`
    flags when it is not.

    RAISES (RuntimeError): If walking the heads takes as many steps as
        the doc has tokens without reaching a root.
    RAISES (ValueError): If the doc is not parsed and no token has
        `sent_start == 1`.
    """
    if 'sent' in self.doc.user_span_hooks:
        return self.doc.user_span_hooks['sent'](self)
    # This should raise if we're not parsed
    # or don't have any sbd component :)
    self.doc.sents
    # If the doc is parsed we can use the deps to find the sentence,
    # otherwise we use the `sent_start` token attribute.
    cdef int n = 0
    cdef int i
    if self.doc.is_parsed:
        root = &self.doc.c[self.start]
        # Follow the relative head offsets until a token whose head
        # offset is 0 (the root) is reached.
        while root.head != 0:
            root += root.head
            n += 1
            # Bail out instead of looping forever if no root is found
            # within `doc.length` steps.
            if n >= self.doc.length:
                raise RuntimeError
        return self.doc[root.l_edge:root.r_edge + 1]
    else:
        # Check if the document has sentence boundaries,
        # i.e. at least one token has sent_start == 1.
        for i in range(self.doc.length):
            if self.doc.c[i].sent_start == 1:
                break
        else:
            raise ValueError(
                "Access to sentence requires either the dependency parse "
                "or sentence boundaries to be set by setting " +
                "doc[i].is_sent_start = True")
        # Find the start of the sentence: scan left to the nearest
        # token flagged as a sentence start (or the first token).
        start = self.start
        while self.doc.c[start].sent_start != 1 and start > 0:
            start += -1
        # Find the end of the sentence: scan right to the next sentence
        # start. The bound is checked on `end` itself *before* indexing,
        # so a span in the last sentence stops at `doc.length` instead
        # of reading past the token array.
        end = self.end
        while end < self.doc.length and self.doc.c[end].sent_start != 1:
            end += 1
        return self.doc[start:end]
property has_vector: property has_vector:
"""RETURNS (bool): Whether a word vector is associated with the object. """RETURNS (bool): Whether a word vector is associated with the object.