mirror of
https://github.com/explosion/spaCy.git
synced 2025-01-12 10:16:27 +03:00
Use Token.sent_start for Span.sent (#5439)
Use `Token.sent_start` for sentence boundaries in `Span.sent` so that `Doc.sents` and `Span.sent` return the same sentence boundaries.
This commit is contained in:
parent
780b869345
commit
e63880e081
|
@@ -389,19 +389,9 @@ cdef class Span:
|
||||||
return self.doc.user_span_hooks["sent"](self)
|
return self.doc.user_span_hooks["sent"](self)
|
||||||
# This should raise if not parsed / no custom sentence boundaries
|
# This should raise if not parsed / no custom sentence boundaries
|
||||||
self.doc.sents
|
self.doc.sents
|
||||||
# If doc is parsed we can use the deps to find the sentence
|
# Use `sent_start` token attribute to find sentence boundaries
|
||||||
# otherwise we use the `sent_start` token attribute
|
|
||||||
cdef int n = 0
|
cdef int n = 0
|
||||||
cdef int i
|
if self.doc.is_sentenced:
|
||||||
if self.doc.is_parsed:
|
|
||||||
root = &self.doc.c[self.start]
|
|
||||||
while root.head != 0:
|
|
||||||
root += root.head
|
|
||||||
n += 1
|
|
||||||
if n >= self.doc.length:
|
|
||||||
raise RuntimeError(Errors.E038)
|
|
||||||
return self.doc[root.l_edge:root.r_edge + 1]
|
|
||||||
elif self.doc.is_sentenced:
|
|
||||||
# Find start of the sentence
|
# Find start of the sentence
|
||||||
start = self.start
|
start = self.start
|
||||||
while self.doc.c[start].sent_start != 1 and start > 0:
|
while self.doc.c[start].sent_start != 1 and start > 0:
|
||||||
|
|
Loading…
Reference in New Issue
Block a user