Use Token.sent_start for Span.sent (#5439)

Use `Token.sent_start` for sentence boundaries in `Span.sent` so that
`Doc.sents` and `Span.sent` return the same sentence boundaries.
This commit is contained in:
adrianeboyd 2020-05-14 18:22:51 +02:00 committed by GitHub
parent 780b869345
commit e63880e081
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23

View File

@@ -389,19 +389,9 @@ cdef class Span:
return self.doc.user_span_hooks["sent"](self) return self.doc.user_span_hooks["sent"](self)
# This should raise if not parsed / no custom sentence boundaries # This should raise if not parsed / no custom sentence boundaries
self.doc.sents self.doc.sents
# If doc is parsed we can use the deps to find the sentence # Use `sent_start` token attribute to find sentence boundaries
# otherwise we use the `sent_start` token attribute
cdef int n = 0 cdef int n = 0
cdef int i if self.doc.is_sentenced:
if self.doc.is_parsed:
root = &self.doc.c[self.start]
while root.head != 0:
root += root.head
n += 1
if n >= self.doc.length:
raise RuntimeError(Errors.E038)
return self.doc[root.l_edge:root.r_edge + 1]
elif self.doc.is_sentenced:
# Find start of the sentence # Find start of the sentence
start = self.start start = self.start
while self.doc.c[start].sent_start != 1 and start > 0: while self.doc.c[start].sent_start != 1 and start > 0: