From deab391cbf64f405151818ca9779e65565d99618 Mon Sep 17 00:00:00 2001 From: Thomas Opsomer Date: Thu, 15 Feb 2018 16:58:30 +0100 Subject: [PATCH] correct check on sent_start & raise if no boundaries --- spacy/tokens/span.pyx | 16 ++++++++++++++-- 1 file changed, 14 insertions(+), 2 deletions(-) diff --git a/spacy/tokens/span.pyx b/spacy/tokens/span.pyx index da2bc800f..cc4b0a26a 100644 --- a/spacy/tokens/span.pyx +++ b/spacy/tokens/span.pyx @@ -291,6 +291,7 @@ cdef class Span: # if doc is parsed we can use the deps to find the sentence # otherwise we use the `sent_start` token attribute cdef int n = 0 + cdef int i if self.doc.is_parsed: root = &self.doc.c[self.start] while root.head != 0: @@ -300,19 +301,30 @@ cdef class Span: raise RuntimeError return self.doc[root.l_edge:root.r_edge + 1] else: + # Check if the document has sentence boundaries, + # i.e at least one tok has the sent_start == 1 + for i in range(self.doc.length): + if self.doc.c[i].sent_start == 1: + break + else: + raise ValueError( + "Access to sentence requires either the dependency parse " + "or sentence boundaries to be set by setting " + + "doc[i].is_sent_start = True") # find start of the sentence start = self.start - while not self.doc.c[start].sent_start and start > 0: + while self.doc.c[start].sent_start != 1 and start > 0: start += -1 # find end of the sentence end = self.end - while not self.doc.c[end].sent_start: + while self.doc.c[end].sent_start != 1: end += 1 if n >= self.doc.length: break # return self.doc[start:end] + property has_vector: """RETURNS (bool): Whether a word vector is associated with the object. """