Merge pull request #1987 from thomasopsomer/span-sent

Make span.sent work when only manual / custom sbd
2025-07-15 10:42:34 +03:00 · 2018-02-18 14:05:37 +01:00 · 2018-02-18 14:05:37 +01:00 · 1e5aeb4eec
commit 1e5aeb4eec
parent 1cf774bdc1 5d24a81c0b
2 changed files with 54 additions and 9 deletions
--- a/spacy/tests/doc/test_span.py
+++ b/spacy/tests/doc/test_span.py
@ -19,6 +19,15 @@ def doc(en_tokenizer):
    return get_doc(tokens.vocab, [t.text for t in tokens], heads=heads, deps=deps)


+@pytest.fixture
+def doc_not_parsed(en_tokenizer):
+    """Three-sentence doc built without heads/deps and flagged as unparsed."""
+    tokenized = en_tokenizer("This is a sentence. This is another sentence. And a third.")
+    unparsed = get_doc(tokenized.vocab, [tok.text for tok in tokenized])
+    unparsed.is_parsed = False
+    return unparsed
+
+
 def test_spans_sent_spans(doc):
    sents = list(doc.sents)
    assert sents[0].start == 0
@ -34,6 +43,7 @@ def test_spans_root(doc):
    assert span.root.text == 'sentence'
    assert span.root.head.text == 'is'

+
 def test_spans_string_fn(doc):
    span = doc[0:4]
    assert len(span) == 4
@ -41,6 +51,7 @@ def test_spans_string_fn(doc):
    assert span.upper_ == 'THIS IS A SENTENCE'
    assert span.lower_ == 'this is a sentence'

+
 def test_spans_root2(en_tokenizer):
    text = "through North and South Carolina"
    heads = [0, 3, -1, -2, -4]
@ -49,12 +60,17 @@ def test_spans_root2(en_tokenizer):
    assert doc[-2:].root.text == 'Carolina'


-def test_spans_span_sent(doc):
+def test_spans_span_sent(doc, doc_not_parsed):
    """Test span.sent property"""
    assert len(list(doc.sents))
    assert doc[:2].sent.root.text == 'is'
    assert doc[:2].sent.text == 'This is a sentence .'
    assert doc[6:7].sent.root.left_edge.text == 'This'
+    # unparsed doc: mark tokens 0 and 5 as sentence starts by hand
+    doc_not_parsed[0].is_sent_start = True
+    doc_not_parsed[5].is_sent_start = True
+    assert doc_not_parsed[1:3].sent == doc_not_parsed[0:5]
+    assert doc_not_parsed[10:14].sent == doc_not_parsed[5:]


 def test_spans_lca_matrix(en_tokenizer):
--- a/spacy/tokens/span.pyx
+++ b/spacy/tokens/span.pyx
@ -285,16 +285,45 @@ cdef class Span:
        def __get__(self):
            if 'sent' in self.doc.user_span_hooks:
                return self.doc.user_span_hooks['sent'](self)
-            # This should raise if we're not parsed.
+            # Accessing doc.sents should raise if the doc is not parsed
+            # and doesn't have any sbd component either.
            self.doc.sents
+            # If the doc is parsed we can use the deps to find the sentence,
+            # otherwise we use the `sent_start` token attribute.
            cdef int n = 0
-            root = &self.doc.c[self.start]
-            while root.head != 0:
-                root += root.head
-                n += 1
-                if n >= self.doc.length:
-                    raise RuntimeError
-            return self.doc[root.l_edge:root.r_edge + 1]
+            cdef int i
+            if self.doc.is_parsed:
+                root = &self.doc.c[self.start]
+                while root.head != 0:
+                    root += root.head
+                    n += 1
+                    if n >= self.doc.length:
+                        raise RuntimeError
+                return self.doc[root.l_edge:root.r_edge + 1]
+            else:
+                # Check that the document has sentence boundaries,
+                # i.e. at least one token has sent_start == 1.
+                for i in range(self.doc.length):
+                    if self.doc.c[i].sent_start == 1:
+                        break
+                else:
+                    raise ValueError(
+                        "Access to sentence requires either the dependency parse "
+                        "or sentence boundaries to be set by setting " +
+                        "doc[i].is_sent_start = True")
+                # Find the sentence start: walk left to a marked token (or 0).
+                start = self.start
+                while self.doc.c[start].sent_start != 1 and start > 0:
+                    start -= 1
+                # Find the sentence end: walk right to the next marked token.
+                # The bound is checked before the token is read, so a span in
+                # the last sentence stops at doc.length instead of reading past it.
+                end = self.end
+                while end < self.doc.length and self.doc.c[end].sent_start != 1:
+                    end += 1
+                # `end` is exclusive: the next sentence start or doc.length.
+                return self.doc[start:end]
+

    property has_vector:
        """RETURNS (bool): Whether a word vector is associated with the object.