From cdc10e9a1ca68e29c6c97e57325962d60c86e103 Mon Sep 17 00:00:00 2001
From: Matthew Honnibal
Date: Fri, 20 May 2016 10:14:06 +0200
Subject: [PATCH] * Fix Issue #375: noun phrase iteration results in index error if noun phrases are merged during the loop. Fix by accumulating the spans inside the noun_chunks property, allowing the Span index tricks to work.

---
 spacy/tokens/doc.pyx | 9 ++++++++-
 1 file changed, 8 insertions(+), 1 deletion(-)

diff --git a/spacy/tokens/doc.pyx b/spacy/tokens/doc.pyx
index eaec68675..e432c83be 100644
--- a/spacy/tokens/doc.pyx
+++ b/spacy/tokens/doc.pyx
@@ -247,8 +247,15 @@ cdef class Doc:
                 "requires data to be installed. If you haven't done so, run: "
                 "\npython -m spacy.%s.download all\n"
                 "to install the data" % self.vocab.lang)
+        # Accumulate the result before beginning to iterate over it. This prevents
+        # the tokenisation from being changed out from under us during the iteration.
+        # The tricky thing here is that Span accepts its tokenisation changing,
+        # so it's okay once we have the Span objects. See Issue #375
+        spans = []
         for start, end, label in self.noun_chunks_iterator(self):
-            yield Span(self, start, end, label=label)
+            spans.append(Span(self, start, end, label=label))
+        for span in spans:
+            yield span
 
     @property
     def sents(self):