diff --git a/spacy/tests/doc/test_token_api.py b/spacy/tests/doc/test_token_api.py index 5713c5c07..22da293a8 100644 --- a/spacy/tests/doc/test_token_api.py +++ b/spacy/tests/doc/test_token_api.py @@ -192,3 +192,9 @@ def test_tokens_sent(doc): assert doc[7].sent.text == "This is another sentence ." assert doc[1].sent.root.left_edge.text == "This" assert doc[7].sent.root.left_edge.text == "This" + +def test_token0_has_sent_start_true(): + doc = Doc(Vocab(), words=["hello", "world"]) + assert doc[0].is_sent_start is True + assert doc[1].is_sent_start is None + assert not doc.is_sentenced diff --git a/spacy/tokens/doc.pyx b/spacy/tokens/doc.pyx index e1c2fc575..97ac10f76 100644 --- a/spacy/tokens/doc.pyx +++ b/spacy/tokens/doc.pyx @@ -188,13 +188,18 @@ cdef class Doc: @property def is_sentenced(self): - # Check if the document has sentence boundaries, - # i.e at least one tok has the sent_start in (-1, 1) + """Check if the document has sentence boundaries assigned. This is + defined as having at least one of the following: + + a) An entry "sents" in doc.user_hooks; + b) doc.is_parsed is set to True; + c) At least one token other than the first where sent_start is not None. + """ if 'sents' in self.user_hooks: return True if self.is_parsed: return True - for i in range(self.length): + for i in range(1, self.length): if self.c[i].sent_start == -1 or self.c[i].sent_start == 1: return True else: @@ -569,6 +574,9 @@ cdef class Doc: raise ValueError(Errors.E031.format(i=self.length)) t.spacy = has_space self.length += 1 + if self.length == 1: + # Set token.sent_start to 1 for first token. See issue #2869 + self.c[0].sent_start = 1 return t.idx + t.lex.length + t.spacy @cython.boundscheck(False) diff --git a/website/docs/api/token.md b/website/docs/api/token.md index 5c22eaae3..1089d2329 100644 --- a/website/docs/api/token.md +++ b/website/docs/api/token.md @@ -324,7 +324,7 @@ A sequence containing the token and all the token's syntactic descendants. 
## Token.is_sent_start {#is_sent_start tag="property" new="2"} A boolean value indicating whether the token starts a sentence. `None` if -unknown. +unknown. Defaults to `True` for the first token in the `doc`. > #### Example >