mirror of
https://github.com/explosion/spaCy.git
synced 2024-12-26 01:46:28 +03:00
Merge pull request #1987 from thomasopsomer/span-sent
Make span.sent work when only manual / custom sbd
This commit is contained in:
commit
1e5aeb4eec
|
@ -19,6 +19,15 @@ def doc(en_tokenizer):
|
||||||
return get_doc(tokens.vocab, [t.text for t in tokens], heads=heads, deps=deps)
|
return get_doc(tokens.vocab, [t.text for t in tokens], heads=heads, deps=deps)
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.fixture
def doc_not_parsed(en_tokenizer):
    """Fixture: a three-sentence doc built without heads/deps and flagged
    as not parsed, for exercising manually-set sentence boundaries."""
    tokenized = en_tokenizer(
        "This is a sentence. This is another sentence. And a third.")
    words = [token.text for token in tokenized]
    unparsed = get_doc(tokenized.vocab, words)
    # Explicitly mark the doc as unparsed so that sentence lookup cannot
    # rely on the dependency tree.
    unparsed.is_parsed = False
    return unparsed
|
||||||
|
|
||||||
|
|
||||||
def test_spans_sent_spans(doc):
|
def test_spans_sent_spans(doc):
|
||||||
sents = list(doc.sents)
|
sents = list(doc.sents)
|
||||||
assert sents[0].start == 0
|
assert sents[0].start == 0
|
||||||
|
@ -34,6 +43,7 @@ def test_spans_root(doc):
|
||||||
assert span.root.text == 'sentence'
|
assert span.root.text == 'sentence'
|
||||||
assert span.root.head.text == 'is'
|
assert span.root.head.text == 'is'
|
||||||
|
|
||||||
|
|
||||||
def test_spans_string_fn(doc):
|
def test_spans_string_fn(doc):
|
||||||
span = doc[0:4]
|
span = doc[0:4]
|
||||||
assert len(span) == 4
|
assert len(span) == 4
|
||||||
|
@ -41,6 +51,7 @@ def test_spans_string_fn(doc):
|
||||||
assert span.upper_ == 'THIS IS A SENTENCE'
|
assert span.upper_ == 'THIS IS A SENTENCE'
|
||||||
assert span.lower_ == 'this is a sentence'
|
assert span.lower_ == 'this is a sentence'
|
||||||
|
|
||||||
|
|
||||||
def test_spans_root2(en_tokenizer):
|
def test_spans_root2(en_tokenizer):
|
||||||
text = "through North and South Carolina"
|
text = "through North and South Carolina"
|
||||||
heads = [0, 3, -1, -2, -4]
|
heads = [0, 3, -1, -2, -4]
|
||||||
|
@ -49,12 +60,17 @@ def test_spans_root2(en_tokenizer):
|
||||||
assert doc[-2:].root.text == 'Carolina'
|
assert doc[-2:].root.text == 'Carolina'
|
||||||
|
|
||||||
|
|
||||||
def test_spans_span_sent(doc, doc_not_parsed):
    """Test span.sent property"""
    # Parsed doc: the sentence is derived from the dependency tree.
    assert len(list(doc.sents))
    first_sent = doc[:2].sent
    assert first_sent.root.text == 'is'
    assert doc[:2].sent.text == 'This is a sentence .'
    second_sent_root = doc[6:7].sent.root
    assert second_sent_root.left_edge.text == 'This'
    # Unparsed doc: sentence boundaries are set by hand on tokens 0 and 5.
    for boundary in (0, 5):
        doc_not_parsed[boundary].is_sent_start = True
    assert doc_not_parsed[1:3].sent == doc_not_parsed[0:5]
    assert doc_not_parsed[10:14].sent == doc_not_parsed[5:]
|
||||||
|
|
||||||
|
|
||||||
def test_spans_lca_matrix(en_tokenizer):
|
def test_spans_lca_matrix(en_tokenizer):
|
||||||
|
|
|
@ -285,16 +285,45 @@ cdef class Span:
|
||||||
def __get__(self):
    """Return the sentence span that this span is a part of.

    If a 'sent' hook is registered in `doc.user_span_hooks`, its result is
    returned. Otherwise, if the doc is parsed, the sentence is found by
    following the heads from the span's first token up to the root and
    taking the root's left/right edges. If the doc is not parsed, the
    sentence is delimited by the surrounding tokens whose `sent_start`
    equals 1.

    RETURNS (Span): The sentence span containing this span.
    RAISES (ValueError): If the doc is not parsed and no token has
        `sent_start == 1`.
    RAISES (RuntimeError): If no root is reached after `doc.length` head
        steps in a parsed doc.
    """
    if 'sent' in self.doc.user_span_hooks:
        return self.doc.user_span_hooks['sent'](self)
    # This should raise if we're not parsed
    # or don't have any sbd component.
    self.doc.sents
    # If the doc is parsed we can use the deps to find the sentence,
    # otherwise we use the `sent_start` token attribute.
    cdef int n = 0
    cdef int i
    if self.doc.is_parsed:
        root = &self.doc.c[self.start]
        while root.head != 0:
            root += root.head
            n += 1
            # Guard against walking forever on a malformed head chain.
            if n >= self.doc.length:
                raise RuntimeError(
                    "Could not find the sentence root: the head chain "
                    "did not terminate")
        return self.doc[root.l_edge:root.r_edge + 1]
    else:
        # Check if the document has sentence boundaries,
        # i.e. at least one token has sent_start == 1.
        for i in range(self.doc.length):
            if self.doc.c[i].sent_start == 1:
                break
        else:
            raise ValueError(
                "Access to sentence requires either the dependency parse "
                "or sentence boundaries to be set by setting "
                "doc[i].is_sent_start = True")
        # Find the start of the sentence: walk left to the nearest
        # token flagged as a sentence start (or the first token).
        start = self.start
        while start > 0 and self.doc.c[start].sent_start != 1:
            start -= 1
        # Find the end of the sentence: walk right to the next sentence
        # start. The bound on `end` stops the scan at the end of the doc
        # when the span lies in the last sentence; the previous check on
        # `n` never fired because `n` is not incremented in this branch,
        # so the loop could index past the last token.
        end = self.end
        while end < self.doc.length and self.doc.c[end].sent_start != 1:
            end += 1
        return self.doc[start:end]
|
||||||
|
|
||||||
|
|
||||||
property has_vector:
|
property has_vector:
|
||||||
"""RETURNS (bool): Whether a word vector is associated with the object.
|
"""RETURNS (bool): Whether a word vector is associated with the object.
|
||||||
|
|
Loading…
Reference in New Issue
Block a user