mirror of
https://github.com/explosion/spaCy.git
synced 2024-12-24 17:06:29 +03:00
* Make doc[0] have sent_start True. Closes #2869 * Document that doc[0].is_sent_start defaults True.
This commit is contained in:
parent
2d3ce89b78
commit
4a3371acd5
|
@ -192,3 +192,9 @@ def test_tokens_sent(doc):
|
|||
assert doc[7].sent.text == "This is another sentence ."
|
||||
assert doc[1].sent.root.left_edge.text == "This"
|
||||
assert doc[7].sent.root.left_edge.text == "This"
|
||||
|
||||
def test_token0_has_sent_start_true():
|
||||
doc = Doc(Vocab(), words=["hello", "world"])
|
||||
assert doc[0].is_sent_start is True
|
||||
assert doc[1].is_sent_start is None
|
||||
assert not doc.is_sentenced
|
||||
|
|
|
@ -188,13 +188,18 @@ cdef class Doc:
|
|||
|
||||
@property
|
||||
def is_sentenced(self):
|
||||
# Check if the document has sentence boundaries,
|
||||
# i.e at least one tok has the sent_start in (-1, 1)
|
||||
"""Check if the document has sentence boundaries assigned. This is
|
||||
defined as having at least one of the following:
|
||||
|
||||
a) An entry "sents" in doc.user_hooks;
|
||||
b) doc.is_parsed is set to True;
|
||||
c) At least one token other than the first where sent_start is not None.
|
||||
"""
|
||||
if 'sents' in self.user_hooks:
|
||||
return True
|
||||
if self.is_parsed:
|
||||
return True
|
||||
for i in range(self.length):
|
||||
for i in range(1, self.length):
|
||||
if self.c[i].sent_start == -1 or self.c[i].sent_start == 1:
|
||||
return True
|
||||
else:
|
||||
|
@ -569,6 +574,9 @@ cdef class Doc:
|
|||
raise ValueError(Errors.E031.format(i=self.length))
|
||||
t.spacy = has_space
|
||||
self.length += 1
|
||||
if self.length == 1:
|
||||
# Set token.sent_start to 1 for first token. See issue #2869
|
||||
self.c[0].sent_start = 1
|
||||
return t.idx + t.lex.length + t.spacy
|
||||
|
||||
@cython.boundscheck(False)
|
||||
|
|
|
@ -324,7 +324,7 @@ A sequence containing the token and all the token's syntactic descendants.
|
|||
## Token.is_sent_start {#is_sent_start tag="property" new="2"}
|
||||
|
||||
A boolean value indicating whether the token starts a sentence. `None` if
|
||||
unknown.
|
||||
unknown. Defaults to `True` for the first token in the `doc`.
|
||||
|
||||
> #### Example
|
||||
>
|
||||
|
|
Loading…
Reference in New Issue
Block a user