Use Token.sent_start for Span.sent (#5439)

Use `Token.sent_start` for sentence boundaries in `Span.sent` so that `Doc.sents` and `Span.sent` return the same sentence boundaries.
2025-10-19 02:04:19 +03:00 · 2020-05-14 18:22:51 +02:00 · 2020-05-14 18:22:51 +02:00 · e63880e081
commit e63880e081
parent 780b869345
1 changed files with 2 additions and 12 deletions
--- a/spacy/tokens/span.pyx
+++ b/spacy/tokens/span.pyx
@@ -389,19 +389,9 @@ cdef class Span:
            return self.doc.user_span_hooks["sent"](self)
        # This should raise if not parsed / no custom sentence boundaries
        self.doc.sents
-        # If doc is parsed we can use the deps to find the sentence
+        # Use `sent_start` token attribute to find sentence boundaries
-        # otherwise we use the `sent_start` token attribute
        cdef int n = 0
-        cdef int i
+        if self.doc.is_sentenced:
-        if self.doc.is_parsed:
-            root = &self.doc.c[self.start]
-            while root.head != 0:
-                root += root.head
-                n += 1
-                if n >= self.doc.length:
-                    raise RuntimeError(Errors.E038)
-            return self.doc[root.l_edge:root.r_edge + 1]
-        elif self.doc.is_sentenced:
            # Find start of the sentence
            start = self.start
            while self.doc.c[start].sent_start != 1 and start > 0: