mirror of
https://github.com/explosion/spaCy.git
synced 2024-12-24 17:06:29 +03:00
* Make doc[0] have sent_start True. Closes #2869 * Document that doc[0].is_sent_start defaults True.
This commit is contained in:
parent
2d3ce89b78
commit
4a3371acd5
|
@ -192,3 +192,9 @@ def test_tokens_sent(doc):
|
||||||
assert doc[7].sent.text == "This is another sentence ."
|
assert doc[7].sent.text == "This is another sentence ."
|
||||||
assert doc[1].sent.root.left_edge.text == "This"
|
assert doc[1].sent.root.left_edge.text == "This"
|
||||||
assert doc[7].sent.root.left_edge.text == "This"
|
assert doc[7].sent.root.left_edge.text == "This"
|
||||||
|
|
||||||
|
def test_token0_has_sent_start_true():
|
||||||
|
doc = Doc(Vocab(), words=["hello", "world"])
|
||||||
|
assert doc[0].is_sent_start is True
|
||||||
|
assert doc[1].is_sent_start is None
|
||||||
|
assert not doc.is_sentenced
|
||||||
|
|
|
@ -188,13 +188,18 @@ cdef class Doc:
|
||||||
|
|
||||||
@property
|
@property
|
||||||
def is_sentenced(self):
|
def is_sentenced(self):
|
||||||
# Check if the document has sentence boundaries,
|
"""Check if the document has sentence boundaries assigned. This is
|
||||||
# i.e at least one tok has the sent_start in (-1, 1)
|
defined as having at least one of the following:
|
||||||
|
|
||||||
|
a) An entry "sents" in doc.user_hooks;
|
||||||
|
b) doc.is_parsed is set to True;
|
||||||
|
c) At least one token other than the first where sent_start is not None.
|
||||||
|
"""
|
||||||
if 'sents' in self.user_hooks:
|
if 'sents' in self.user_hooks:
|
||||||
return True
|
return True
|
||||||
if self.is_parsed:
|
if self.is_parsed:
|
||||||
return True
|
return True
|
||||||
for i in range(self.length):
|
for i in range(1, self.length):
|
||||||
if self.c[i].sent_start == -1 or self.c[i].sent_start == 1:
|
if self.c[i].sent_start == -1 or self.c[i].sent_start == 1:
|
||||||
return True
|
return True
|
||||||
else:
|
else:
|
||||||
|
@ -569,6 +574,9 @@ cdef class Doc:
|
||||||
raise ValueError(Errors.E031.format(i=self.length))
|
raise ValueError(Errors.E031.format(i=self.length))
|
||||||
t.spacy = has_space
|
t.spacy = has_space
|
||||||
self.length += 1
|
self.length += 1
|
||||||
|
if self.length == 1:
|
||||||
|
# Set token.sent_start to 1 for first token. See issue #2869
|
||||||
|
self.c[0].sent_start = 1
|
||||||
return t.idx + t.lex.length + t.spacy
|
return t.idx + t.lex.length + t.spacy
|
||||||
|
|
||||||
@cython.boundscheck(False)
|
@cython.boundscheck(False)
|
||||||
|
|
|
@ -324,7 +324,7 @@ A sequence containing the token and all the token's syntactic descendants.
|
||||||
## Token.is_sent_start {#is_sent_start tag="property" new="2"}
|
## Token.is_sent_start {#is_sent_start tag="property" new="2"}
|
||||||
|
|
||||||
A boolean value indicating whether the token starts a sentence. `None` if
|
A boolean value indicating whether the token starts a sentence. `None` if
|
||||||
unknown.
|
unknown. Defaults to `True` for the first token in the `doc`.
|
||||||
|
|
||||||
> #### Example
|
> #### Example
|
||||||
>
|
>
|
||||||
|
|
Loading…
Reference in New Issue
Block a user