* Fix Issue #375: iterating over noun phrases raises an index error if noun phrases are merged during the loop. The fix accumulates the spans inside the noun_chunks property before yielding any of them, which allows the Span index tricks to work.

2025-08-02 03:10:22 +03:00 · 2016-05-20 10:14:06 +02:00 · 2016-05-20 10:14:06 +02:00 · cdc10e9a1c
commit cdc10e9a1c
parent 13fad36e49
1 changed file with 8 additions and 1 deletion
--- a/spacy/tokens/doc.pyx
+++ b/spacy/tokens/doc.pyx
@@ -247,8 +247,15 @@ cdef class Doc:
                "requires data to be installed. If you haven't done so, run: "
                "\npython -m spacy.%s.download all\n"
                "to install the data" % self.vocab.lang)
+        # Accumulate the result before beginning to iterate over it. This prevents
+        # the tokenisation from being changed out from under us during the iteration.
+        # The tricky thing here is that Span accepts its tokenisation changing,
+        # so it's okay once we have the Span objects. See Issue #375
+        spans = []
        for start, end, label in self.noun_chunks_iterator(self):
-            yield Span(self, start, end, label=label)
+            spans.append(Span(self, start, end, label=label))
+        for span in spans:
+            yield span

    @property
    def sents(self):