Make doc[0].is_sent_start == True (closes #2869) (#3340)

* Make doc[0] have sent_start True. Closes #2869
* Document that doc[0].is_sent_start defaults to True.
2025-07-15 10:42:34 +03:00 · 2019-02-27 11:17:17 +01:00 · 2019-02-27 11:17:17 +01:00 · 4a3371acd5
commit 4a3371acd5
parent 2d3ce89b78
3 changed files with 18 additions and 4 deletions
--- a/spacy/tests/doc/test_token_api.py
+++ b/spacy/tests/doc/test_token_api.py
@ -192,3 +192,9 @@ def test_tokens_sent(doc):
    assert doc[7].sent.text == "This is another sentence ."
    assert doc[1].sent.root.left_edge.text == "This"
    assert doc[7].sent.root.left_edge.text == "This"
 def test_token0_has_sent_start_true():
    """Check that the first token of a newly built Doc is a sentence start.

    Regression test for issue #2869: builds a two-word Doc directly from
    words and verifies the sentence-start flags and `is_sentenced`.
    """
    doc = Doc(Vocab(), words=["hello", "world"])
    # The first token is expected to be flagged as a sentence start.
    assert doc[0].is_sent_start is True
    # The second token has no sentence-boundary value assigned.
    assert doc[1].is_sent_start is None
    # The flag on the first token alone must not make the doc count as
    # sentenced.
    assert not doc.is_sentenced
--- a/spacy/tokens/doc.pyx
+++ b/spacy/tokens/doc.pyx
@ -188,13 +188,18 @@ cdef class Doc:
    @property
    def is_sentenced(self):
-        # Check if the document has sentence boundaries,
+        """Check if the document has sentence boundaries assigned. This is
-        # i.e at least one tok has the sent_start in (-1, 1)
+        defined as having at least one of the following:
        a) An entry "sents" in doc.user_hooks;
        b) doc.is_parsed is set to True;
        c) At least one token other than the first where sent_start is not None.
        """
        if 'sents' in self.user_hooks:
            return True
        if self.is_parsed:
            return True
-        for i in range(self.length):
+        for i in range(1, self.length):
            if self.c[i].sent_start == -1 or self.c[i].sent_start == 1:
                return True
        else:
@ -569,6 +574,9 @@ cdef class Doc:
            raise ValueError(Errors.E031.format(i=self.length))
        t.spacy = has_space
        self.length += 1
        if self.length == 1:
            # Set token.sent_start to 1 for first token. See issue #2869
            self.c[0].sent_start = 1
        return t.idx + t.lex.length + t.spacy
    @cython.boundscheck(False)
--- a/website/docs/api/token.md
+++ b/website/docs/api/token.md
@ -324,7 +324,7 @@ A sequence containing the token and all the token's syntactic descendants.
 ## Token.is_sent_start {#is_sent_start tag="property" new="2"}
 A boolean value indicating whether the token starts a sentence. `None` if
-unknown.
+unknown. Defaults to `True` for the first token in the `doc`.
 > #### Example
 >