Merge pull request #1987 from thomasopsomer/span-sent

Make span.sent work when sentence boundaries are set only manually or by a custom sbd component
2025-09-16 00:52:38 +03:00 · 2018-02-18 14:05:37 +01:00 · 2018-02-18 14:05:37 +01:00 · 1e5aeb4eec
commit 1e5aeb4eec
parent 1cf774bdc1 5d24a81c0b
2 changed files with 54 additions and 9 deletions
--- a/spacy/tests/doc/test_span.py
+++ b/spacy/tests/doc/test_span.py
@ -19,6 +19,15 @@ def doc(en_tokenizer):
    return get_doc(tokens.vocab, [t.text for t in tokens], heads=heads, deps=deps)
@pytest.fixture
 def doc_not_parsed(en_tokenizer):
    """Fixture: a Doc over a three-sentence text with `is_parsed` set to False.

    The Doc is built with `get_doc` from the tokenizer's vocab and token
    texts only (no heads or deps are passed), so tests can set sentence
    boundaries on it manually.
    """
    text = "This is a sentence. This is another sentence. And a third."
    tokens = en_tokenizer(text)
    d = get_doc(tokens.vocab, [t.text for t in tokens])
    # Explicitly mark the doc as unparsed so code under test cannot rely on
    # the dependency parse.
    d.is_parsed = False
    return d
 def test_spans_sent_spans(doc):
    sents = list(doc.sents)
    assert sents[0].start == 0
@ -34,6 +43,7 @@ def test_spans_root(doc):
    assert span.root.text == 'sentence'
    assert span.root.head.text == 'is'
 def test_spans_string_fn(doc):
    span = doc[0:4]
    assert len(span) == 4
@ -41,6 +51,7 @@ def test_spans_string_fn(doc):
    assert span.upper_ == 'THIS IS A SENTENCE'
    assert span.lower_ == 'this is a sentence'
 def test_spans_root2(en_tokenizer):
    text = "through North and South Carolina"
    heads = [0, 3, -1, -2, -4]
@ -49,12 +60,17 @@ def test_spans_root2(en_tokenizer):
    assert doc[-2:].root.text == 'Carolina'
-def test_spans_span_sent(doc):
+def test_spans_span_sent(doc, doc_not_parsed):
    """Test the span.sent property on a parsed doc and on an unparsed doc
    whose sentence starts are set manually."""
    # Parsed doc: the sentence is resolved through the dependency tree.
    assert len(list(doc.sents))
    assert doc[:2].sent.root.text == 'is'
    assert doc[:2].sent.text == 'This is a sentence .'
    assert doc[6:7].sent.root.left_edge.text == 'This'
    # Manual sentence boundaries: only tokens 0 and 5 are flagged as sentence
    # starts, so a span inside the first five tokens should resolve to
    # doc_not_parsed[0:5] and any span after token 5 to doc_not_parsed[5:].
    doc_not_parsed[0].is_sent_start = True
    doc_not_parsed[5].is_sent_start = True
    assert doc_not_parsed[1:3].sent == doc_not_parsed[0:5]
    # This span ends at index 14 (presumably the end of the doc — verify
    # against the tokenizer output) with no later sentence start to stop at.
    assert doc_not_parsed[10:14].sent == doc_not_parsed[5:]
 def test_spans_lca_matrix(en_tokenizer):
--- a/spacy/tokens/span.pyx
+++ b/spacy/tokens/span.pyx
@ -285,16 +285,45 @@ cdef class Span:
        def __get__(self):
            # Return the sentence Span that contains this span. A 'sent'
            # entry in doc.user_span_hooks, if present, takes precedence.
            if 'sent' in self.doc.user_span_hooks:
                return self.doc.user_span_hooks['sent'](self)
-            # This should raise if we're not parsed.
+            # This should raise if we're not parsed
            # or don't have any sbd component :)
            self.doc.sents
            # if doc is parsed we can use the deps to find the sentence
            # otherwise we use the `sent_start` token attribute
            cdef int n = 0
-            root = &self.doc.c[self.start]
+            cdef int i
-            while root.head != 0:
+            if self.doc.is_parsed:
-                root += root.head
+                root = &self.doc.c[self.start]
-                n += 1
+                while root.head != 0:
-                if n >= self.doc.length:
+                    root += root.head
-                    raise RuntimeError
+                    n += 1
-            return self.doc[root.l_edge:root.r_edge + 1]
+                    if n >= self.doc.length:
                        raise RuntimeError
                return self.doc[root.l_edge:root.r_edge + 1]
            else:
                # Check if the document has sentence boundaries,
                # i.e at least one tok has the sent_start == 1
                for i in range(self.doc.length):
                    if self.doc.c[i].sent_start == 1:
                        break
                else:
                    raise ValueError(
                        "Access to sentence requires either the dependency parse "
                        "or sentence boundaries to be set by setting " +
                        "doc[i].is_sent_start = True")
                # Find the start of the sentence: walk left from the span's
                # first token to the nearest token flagged as a sentence
                # start, stopping at the beginning of the doc.
                start = self.start
                while start > 0 and self.doc.c[start].sent_start != 1:
                    start -= 1
                # Find the end of the sentence: walk right from the span's
                # end to the next sentence start. The bound is tested before
                # the token is read, so a span that reaches the end of the
                # doc (or has no following sentence start) never indexes
                # past the last token.
                end = self.end
                while end < self.doc.length and self.doc.c[end].sent_start != 1:
                    end += 1
                return self.doc[start:end]
    property has_vector:
        """RETURNS (bool): Whether a word vector is associated with the object.