From e63880e0812b4bf45a8f4a96bc26c3f4a10d9fb7 Mon Sep 17 00:00:00 2001
From: adrianeboyd
Date: Thu, 14 May 2020 18:22:51 +0200
Subject: [PATCH] Use Token.sent_start for Span.sent (#5439)

Use `Token.sent_start` for sentence boundaries in `Span.sent` so that
`Doc.sents` and `Span.sent` return the same sentence boundaries.
---
 spacy/tokens/span.pyx | 14 ++------------
 1 file changed, 2 insertions(+), 12 deletions(-)

diff --git a/spacy/tokens/span.pyx b/spacy/tokens/span.pyx
index 347916a0a..2f1418a5b 100644
--- a/spacy/tokens/span.pyx
+++ b/spacy/tokens/span.pyx
@@ -389,19 +389,9 @@ cdef class Span:
             return self.doc.user_span_hooks["sent"](self)
         # This should raise if not parsed / no custom sentence boundaries
         self.doc.sents
-        # If doc is parsed we can use the deps to find the sentence
-        # otherwise we use the `sent_start` token attribute
+        # Use `sent_start` token attribute to find sentence boundaries
         cdef int n = 0
-        cdef int i
-        if self.doc.is_parsed:
-            root = &self.doc.c[self.start]
-            while root.head != 0:
-                root += root.head
-                n += 1
-                if n >= self.doc.length:
-                    raise RuntimeError(Errors.E038)
-            return self.doc[root.l_edge:root.r_edge + 1]
-        elif self.doc.is_sentenced:
+        if self.doc.is_sentenced:
             # Find start of the sentence
             start = self.start
             while self.doc.c[start].sent_start != 1 and start > 0: