raise NotImplementedError when noun_chunks iterator is not implemented (#6711)

* raise NotImplementedError when noun_chunks iterator is not implemented * bring back, fix and document span.noun_chunks * formatting Co-authored-by: Matthew Honnibal <honnibal+gh@gmail.com>
2025-08-08 06:04:57 +03:00 · 2021-01-17 12:56:05 +01:00 · 2021-01-17 12:56:05 +01:00 · fed8f48965
commit fed8f48965
parent bf0cdae8d4
10 changed files with 93 additions and 52 deletions
--- a/spacy/errors.py
+++ b/spacy/errors.py
@ -463,6 +463,8 @@ class Errors:
            "issue tracker: http://github.com/explosion/spaCy/issues")

    # TODO: fix numbering after merging develop into master
+    E894 = ("The 'noun_chunks' syntax iterator is not implemented for language "
+            "'{lang}'.")
    E895 = ("The 'textcat' component received gold-standard annotations with "
            "multiple labels per document. In spaCy 3 you should use the "
            "'textcat_multilabel' component for this instead. "
--- a/spacy/lang/he/lex_attrs.py
+++ b/spacy/lang/he/lex_attrs.py
@ -86,7 +86,7 @@ def like_num(text):
    if text in _num_words:
        return True

-    # CHeck ordinal number
+    # Check ordinal number
    if text in _ordinal_words:
        return True
    return False
--- a/spacy/tests/doc/test_doc_api.py
+++ b/spacy/tests/doc/test_doc_api.py
@ -2,6 +2,8 @@ import pytest
 import numpy
 import logging
 import mock
+
+from spacy.lang.xx import MultiLanguage
 from spacy.tokens import Doc, Span
 from spacy.vocab import Vocab
 from spacy.lexeme import Lexeme
@ -633,6 +635,14 @@ def test_doc_set_ents_invalid_spans(en_tokenizer):
        doc.ents = spans


+def test_doc_noun_chunks_not_implemented():
+    """Test that a language without noun_chunk iterator, throws a NotImplementedError"""
+    text = "Může data vytvářet a spravovat, ale především je dokáže analyzovat, najít v nich nové vztahy a vše přehledně vizualizovat."
+    nlp = MultiLanguage()
+    doc = nlp(text)
+    with pytest.raises(NotImplementedError):
+        chunks = list(doc.noun_chunks)
+
 def test_span_groups(en_tokenizer):
    doc = en_tokenizer("Some text about Colombia and the Czech Republic")
    doc.spans["hi"] = [Span(doc, 3, 4, label="bye")]
--- a/spacy/tests/lang/en/test_noun_chunks.py
+++ b/spacy/tests/lang/en/test_noun_chunks.py
@ -1,11 +1,16 @@
-import numpy
-from spacy.attrs import HEAD, DEP
-from spacy.symbols import nsubj, dobj, amod, nmod, conj, cc, root
-from spacy.lang.en.syntax_iterators import noun_chunks
 from spacy.tokens import Doc
 import pytest


+@pytest.fixture
+def doc(en_vocab):
+    words = ["Peter", "has", "chronic", "command", "and", "control", "issues"]
+    heads = [1, 1, 6, 6, 3, 3, 1]
+    deps = ["nsubj", "ROOT", "amod", "nmod", "cc", "conj", "dobj"]
+    pos = ["PROPN", "VERB", "ADJ", "NOUN", "CCONJ", "NOUN", "NOUN"]
+    return Doc(en_vocab, words=words, heads=heads, deps=deps, pos=pos)
+
+
 def test_noun_chunks_is_parsed(en_tokenizer):
    """Test that noun_chunks raises Value Error for 'en' language if Doc is not parsed."""
    doc = en_tokenizer("This is a sentence")
@ -13,31 +18,27 @@ def test_noun_chunks_is_parsed(en_tokenizer):
        list(doc.noun_chunks)


-def test_en_noun_chunks_not_nested(en_vocab):
-    words = ["Peter", "has", "chronic", "command", "and", "control", "issues"]
-    heads = [1, 1, 6, 6, 3, 3, 1]
-    deps = ["nsubj", "ROOT", "amod", "nmod", "cc", "conj", "dobj"]
-    doc = Doc(en_vocab, words=words, heads=heads, deps=deps)
-    doc.from_array(
-        [HEAD, DEP],
-        numpy.asarray(
-            [
-                [1, nsubj],
-                [0, root],
-                [4, amod],
-                [3, nmod],
-                [-1, cc],
-                [-2, conj],
-                [-5, dobj],
-            ],
-            dtype="uint64",
-        ),
-    )
-    doc.noun_chunks_iterator = noun_chunks
+def test_en_noun_chunks_not_nested(doc, en_vocab):
+    """Test that each token only appears in one noun chunk at most"""
    word_occurred = {}
-    for chunk in doc.noun_chunks:
+    chunks = list(doc.noun_chunks)
+    assert len(chunks) > 1
+    for chunk in chunks:
        for word in chunk:
            word_occurred.setdefault(word.text, 0)
            word_occurred[word.text] += 1
+    assert len(word_occurred) > 0
    for word, freq in word_occurred.items():
        assert freq == 1, (word, [chunk.text for chunk in doc.noun_chunks])
+
+
+def test_noun_chunks_span(doc, en_tokenizer):
+    """Test that the span.noun_chunks property works correctly"""
+    doc_chunks = list(doc.noun_chunks)
+    span = doc[0:3]
+    span_chunks = list(span.noun_chunks)
+    assert 0 < len(span_chunks) < len(doc_chunks)
+    for chunk in span_chunks:
+        assert chunk in doc_chunks
+        assert chunk.start >= 0
+        assert chunk.end <= 3
--- a/spacy/tests/regression/test_issue3001-3500.py
+++ b/spacy/tests/regression/test_issue3001-3500.py
@ -81,7 +81,8 @@ def test_issue3199():
    """
    words = ["This", "is", "a", "sentence"]
    doc = Doc(Vocab(), words=words, heads=[0] * len(words), deps=["dep"] * len(words))
-    assert list(doc[0:3].noun_chunks) == []
+    with pytest.raises(NotImplementedError):
+        list(doc[0:3].noun_chunks)


 def test_issue3209():
--- a/spacy/tokens/doc.pyx
+++ b/spacy/tokens/doc.pyx
@ -816,8 +816,10 @@ cdef class Doc:
    @property
    def noun_chunks(self):
        """Iterate over the base noun phrases in the document. Yields base
-        noun-phrase #[code Span] objects, if the document has been
-        syntactically parsed. A base noun phrase, or "NP chunk", is a noun
+        noun-phrase #[code Span] objects, if the language has a noun chunk iterator.
+        Raises a NotImplementedError otherwise.
+
+        A base noun phrase, or "NP chunk", is a noun
        phrase that does not permit other NPs to be nested within it – so no
        NP-level coordination, no prepositional phrases, and no relative
        clauses.
@ -826,16 +828,17 @@ cdef class Doc:

        DOCS: https://nightly.spacy.io/api/doc#noun_chunks
        """
+        if self.noun_chunks_iterator is None:
+            raise NotImplementedError(Errors.E894.format(lang=self.vocab.lang))

        # Accumulate the result before beginning to iterate over it. This
-        # prevents the tokenisation from being changed out from under us
+        # prevents the tokenization from being changed out from under us
        # during the iteration. The tricky thing here is that Span accepts
-        # its tokenisation changing, so it's okay once we have the Span
+        # its tokenization changing, so it's okay once we have the Span
        # objects. See Issue #375.
        spans = []
-        if self.noun_chunks_iterator is not None:
-            for start, end, label in self.noun_chunks_iterator(self):
-                spans.append(Span(self, start, end, label=label))
+        for start, end, label in self.noun_chunks_iterator(self):
+            spans.append(Span(self, start, end, label=label))
        for span in spans:
            yield span

--- a/spacy/tokens/span.pyx
+++ b/spacy/tokens/span.pyx
@ -487,30 +487,25 @@ cdef class Span:
        """
        return "".join([t.text_with_ws for t in self])

+
    @property
    def noun_chunks(self):
-        """Yields base noun-phrase `Span` objects, if the document has been
-        syntactically parsed. A base noun phrase, or "NP chunk", is a noun
+        """Iterate over the base noun phrases in the span. Yields base
+        noun-phrase #[code Span] objects, if the language has a noun chunk iterator.
+        Raises a NotImplementedError otherwise.
+
+        A base noun phrase, or "NP chunk", is a noun
        phrase that does not permit other NPs to be nested within it – so no
        NP-level coordination, no prepositional phrases, and no relative
        clauses.

-        YIELDS (Span): Base noun-phrase `Span` objects.
+        YIELDS (Span): Noun chunks in the span.

        DOCS: https://nightly.spacy.io/api/span#noun_chunks
        """
-        # Accumulate the result before beginning to iterate over it. This
-        # prevents the tokenisation from being changed out from under us
-        # during the iteration. The tricky thing here is that Span accepts
-        # its tokenisation changing, so it's okay once we have the Span
-        # objects. See Issue #375
-        spans = []
-        cdef attr_t label
-        if self.doc.noun_chunks_iterator is not None:
-            for start, end, label in self.doc.noun_chunks_iterator(self):
-                spans.append(Span(self.doc, start, end, label=label))
-        for span in spans:
-            yield span
+        for span in self.doc.noun_chunks:
+            if span.start >= self.start and span.end <= self.end:
+                yield span

    @property
    def root(self):
--- a/website/docs/api/doc.md
+++ b/website/docs/api/doc.md
@ -616,11 +616,15 @@ phrase, or "NP chunk", is a noun phrase that does not permit other NPs to be
 nested within it – so no NP-level coordination, no prepositional phrases, and no
 relative clauses.

+If the `noun_chunks` [syntax iterator](/usage/adding-languages#language-data) has
+not been implemented for the given language, a `NotImplementedError` is raised.
+
 > #### Example
 >
 > ```python
 > doc = nlp("A phrase with another phrase occurs.")
 > chunks = list(doc.noun_chunks)
+> assert len(chunks) == 2
 > assert chunks[0].text == "A phrase"
 > assert chunks[1].text == "another phrase"
 > ```
--- a/website/docs/api/span.md
+++ b/website/docs/api/span.md
@ -187,7 +187,7 @@ the character indices don't map to a valid span.
 | Name                                 | Description                                                                               |
 | ------------------------------------ | ----------------------------------------------------------------------------------------- |
 | `start`                              | The index of the first character of the span. ~~int~~                                     |
-| `end`                                | The index of the last character after the span. ~~int~~                                    |
+| `end`                                | The index of the last character after the span. ~~int~~                                   |
 | `label`                              | A label to attach to the span, e.g. for named entities. ~~Union[int, str]~~               |
 | `kb_id` <Tag variant="new">2.2</Tag> | An ID from a knowledge base to capture the meaning of a named entity. ~~Union[int, str]~~ |
 | `vector`                             | A meaning representation of the span. ~~numpy.ndarray[ndim=1, dtype=float32]~~            |
@ -274,6 +274,31 @@ if the entity recognizer has been applied.
 | ----------- | ----------------------------------------------------------------- |
 | **RETURNS** | Entities in the span, one `Span` per entity. ~~Tuple[Span, ...]~~ |

+## Span.noun_chunks {#noun_chunks tag="property" model="parser"}
+
+Iterate over the base noun phrases in the span. Yields base noun-phrase `Span`
+objects, if the document has been syntactically parsed. A base noun phrase, or
+"NP chunk", is a noun phrase that does not permit other NPs to be nested within
+it – so no NP-level coordination, no prepositional phrases, and no relative
+clauses.
+
+If the `noun_chunks` [syntax iterator](/usage/adding-languages#language-data) has
+not been implemented for the given language, a `NotImplementedError` is raised.
+
+> #### Example
+>
+> ```python
+> doc = nlp("A phrase with another phrase occurs.")
+> span = doc[3:5]
+> chunks = list(span.noun_chunks)
+> assert len(chunks) == 1
+> assert chunks[0].text == "another phrase"
+> ```
+
+| Name       | Description                       |
+| ---------- | --------------------------------- |
+| **YIELDS** | Noun chunks in the span. ~~Span~~ |
+
 ## Span.as_doc {#as_doc tag="method"}

 Create a new `Doc` object corresponding to the `Span`, with a copy of the data.
--- a/website/docs/usage/linguistic-features.md
+++ b/website/docs/usage/linguistic-features.md
@ -221,7 +221,7 @@ Noun chunks are "base noun phrases" – flat phrases that have a noun as their
 head. You can think of noun chunks as a noun plus the words describing the noun
 – for example, "the lavish green grass" or "the world’s largest tech fund". To
 get the noun chunks in a document, simply iterate over
-[`Doc.noun_chunks`](/api/doc#noun_chunks)
+[`Doc.noun_chunks`](/api/doc#noun_chunks).

 ```python
 ### {executable="true"}