Make doc[0].is_sent_start == True (closes #2869) (#3340)

* Make doc[0] have sent_start True. Closes #2869

* Document that doc[0].is_sent_start defaults True.
This commit is contained in:
Matthew Honnibal 2019-02-27 11:17:17 +01:00 committed by GitHub
parent 2d3ce89b78
commit 4a3371acd5
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
3 changed files with 18 additions and 4 deletions

View File

@ -192,3 +192,9 @@ def test_tokens_sent(doc):
assert doc[7].sent.text == "This is another sentence ." assert doc[7].sent.text == "This is another sentence ."
assert doc[1].sent.root.left_edge.text == "This" assert doc[1].sent.root.left_edge.text == "This"
assert doc[7].sent.root.left_edge.text == "This" assert doc[7].sent.root.left_edge.text == "This"
def test_token0_has_sent_start_true():
    """Regression test for issue #2869.

    In a freshly built, unparsed two-word Doc the first token reports
    ``is_sent_start`` as True, the second token reports None, and the
    Doc as a whole is not considered sentenced.
    """
    words = ["hello", "world"]
    doc = Doc(Vocab(), words=words)
    first_token = doc[0]
    assert first_token.is_sent_start is True
    second_token = doc[1]
    assert second_token.is_sent_start is None
    # The default on token 0 alone must not make the doc count as sentenced.
    assert not doc.is_sentenced

View File

@ -188,13 +188,18 @@ cdef class Doc:
@property @property
def is_sentenced(self): def is_sentenced(self):
# Check if the document has sentence boundaries, """Check if the document has sentence boundaries assigned. This is
# i.e at least one tok has the sent_start in (-1, 1) defined as having at least one of the following:
a) An entry "sents" in doc.user_hooks;
b) doc.is_parsed is set to True;
c) At least one token other than the first where sent_start is not None.
"""
if 'sents' in self.user_hooks: if 'sents' in self.user_hooks:
return True return True
if self.is_parsed: if self.is_parsed:
return True return True
for i in range(self.length): for i in range(1, self.length):
if self.c[i].sent_start == -1 or self.c[i].sent_start == 1: if self.c[i].sent_start == -1 or self.c[i].sent_start == 1:
return True return True
else: else:
@ -569,6 +574,9 @@ cdef class Doc:
raise ValueError(Errors.E031.format(i=self.length)) raise ValueError(Errors.E031.format(i=self.length))
t.spacy = has_space t.spacy = has_space
self.length += 1 self.length += 1
if self.length == 1:
# Set token.sent_start to 1 for first token. See issue #2869
self.c[0].sent_start = 1
return t.idx + t.lex.length + t.spacy return t.idx + t.lex.length + t.spacy
@cython.boundscheck(False) @cython.boundscheck(False)

View File

@ -324,7 +324,7 @@ A sequence containing the token and all the token's syntactic descendants.
## Token.is_sent_start {#is_sent_start tag="property" new="2"} ## Token.is_sent_start {#is_sent_start tag="property" new="2"}
A boolean value indicating whether the token starts a sentence. `None` if A boolean value indicating whether the token starts a sentence. `None` if
unknown. unknown. Defaults to `True` for the first token in the `doc`.
> #### Example > #### Example
> >