Adding noun_chunks to the DUTCH language model (nl) (#8529)

* ✨ implement noun_chunks for Dutch language * copy/paste FR and SV syntax iterators to accommodate UD tags * added tests with Dutch text * signed contributor agreement * 🐛 fix noun chunks generator * built from scratch * define noun chunk as a single Noun-Phrase * includes some corner cases debugging (incorrect POS tagging) * test with provided annotated sample (POS, DEP) * ✅ fix failing test * CI pipeline did not like the added sample file * add the sample as a pytest fixture * Update spacy/lang/nl/syntax_iterators.py * Update spacy/lang/nl/syntax_iterators.py Code readability Co-authored-by: Sofie Van Landeghem <svlandeg@users.noreply.github.com> * Update spacy/tests/lang/nl/test_noun_chunks.py correct comment Co-authored-by: Sofie Van Landeghem <svlandeg@users.noreply.github.com> * finalize code * change "if next_word" into "if next_word is not None" Co-authored-by: Sofie Van Landeghem <svlandeg@users.noreply.github.com>
2026-02-17 12:40:46 +03:00 · 2021-07-14 14:01:02 +02:00 · 2021-07-14 14:01:02 +02:00 · e117573822
commit e117573822
parent 2a8eeed5da
5 changed files with 295 additions and 3 deletions
--- a/spacy/errors.py
+++ b/spacy/errors.py
@ -864,6 +864,9 @@ class Errors:
    E1018 = ("Knowledge base for component '{name}' is not set. "
             "Make sure either `nel.initialize` or `nel.set_kb` "
             "is called with a `kb_loader` function.")
+    E1019 = ("`noun_chunks` requires the pos tagging, which requires a "
+             "statistical model to be installed and loaded. For more info, see "
+             "the documentation:\nhttps://spacy.io/usage/models")


 # Deprecated model shortcuts, only used in errors and warnings
--- a/spacy/lang/nl/__init__.py
+++ b/spacy/lang/nl/__init__.py
@ -1,12 +1,14 @@
 from typing import Optional
+
 from thinc.api import Model

-from .stop_words import STOP_WORDS
+from .lemmatizer import DutchLemmatizer
 from .lex_attrs import LEX_ATTRS
-from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
 from .punctuation import TOKENIZER_PREFIXES, TOKENIZER_INFIXES
 from .punctuation import TOKENIZER_SUFFIXES
-from .lemmatizer import DutchLemmatizer
+from .stop_words import STOP_WORDS
+from .syntax_iterators import SYNTAX_ITERATORS
+from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
 from ...language import Language


@ -16,6 +18,7 @@ class DutchDefaults(Language.Defaults):
    infixes = TOKENIZER_INFIXES
    suffixes = TOKENIZER_SUFFIXES
    lex_attr_getters = LEX_ATTRS
+    syntax_iterators = SYNTAX_ITERATORS
    stop_words = STOP_WORDS


--- a/spacy/lang/nl/syntax_iterators.py
+++ b/spacy/lang/nl/syntax_iterators.py
@ -0,0 +1,72 @@
+from typing import Union, Iterator
+
+from ...symbols import NOUN, PRON
+from ...errors import Errors
+from ...tokens import Doc, Span
+
+
+def noun_chunks(doclike: Union[Doc, Span]) -> Iterator[Span]:
+    """
+    Detect base noun phrases from a dependency parse. Works on Doc and Span.
+    The definition is inspired by https://www.nltk.org/book/ch07.html
+    Consider: [Noun + determiner / adjective] and also [Pronoun]
+
+    doclike (Union[Doc, Span]): The object whose tokens are scanned.
+    YIELDS: (start, end, label) tuples, where start and end are token
+        indices (`token.i`, end exclusive) and label is the value returned
+        by adding "NP" to the vocab's string store. Note that plain tuples
+        are yielded, not Span objects as the return annotation suggests.
+    RAISES (ValueError): E1019 if the doc has no POS annotation, E029 if it
+        has no DEP annotation.
+    """
+    doc = doclike.doc  # Ensure works on both Doc and Span.
+
+    # Check for dependencies: POS, DEP
+    if not doc.has_annotation("POS"):
+        raise ValueError(Errors.E1019)
+    if not doc.has_annotation("DEP"):
+        raise ValueError(Errors.E029)
+
+    # Every label is resolved once, up front, rather than once per child
+    # token inside the loop.
+    strings = doc.vocab.strings
+
+    # Modifiers that are merged into a noun's chunk.
+    # See UD tags: https://universaldependencies.org/u/dep/index.html
+    # amod = adjectival modifier
+    # nmod:poss = possessive nominal modifier
+    # det = determiner
+    # det:poss = possessive determiner
+    noun_deps = {
+        strings[label] for label in ("amod", "nmod:poss", "det", "det:poss")
+    }
+
+    # nsubj = nominal subject
+    # nsubj:pass = passive nominal subject
+    nsubj = strings["nsubj"]
+    pronoun_deps = {nsubj, strings["nsubj:pass"]}
+
+    # Label NP for the Span to identify it as Noun-Phrase
+    span_label = strings.add("NP")
+
+    # Only NOUNS and PRONOUNS matter
+    for word in doclike:
+        if word.pos == NOUN:
+            children = list(word.children)
+
+            # It happens that VERBS are POS-tagged as NOUNS. A word that
+            # governs an "nsubj" is treated as such a mis-tagged verb and
+            # is skipped.
+            if any(child.dep == nsubj for child in children):
+                continue
+
+            # The chunk runs from the leftmost to the rightmost token among
+            # the noun itself and its accepted modifiers.
+            indices = [child.i for child in children if child.dep in noun_deps]
+            indices.append(word.i)
+            yield min(indices), max(indices) + 1, span_label
+
+        # PRONOUNS only if it is the subject of a verb
+        elif word.pos == PRON and word.dep in pronoun_deps:
+            yield word.i, word.i + 1, span_label
+
+
+# Maps each syntax iterator name to the function that implements it.
+SYNTAX_ITERATORS = {"noun_chunks": noun_chunks}
--- a/spacy/tests/conftest.py
+++ b/spacy/tests/conftest.py
@ -202,6 +202,11 @@ def ne_tokenizer():
    return get_lang_class("ne")().tokenizer


+@pytest.fixture(scope="session")
+def nl_vocab():
+    """Return the vocab of a freshly instantiated "nl" language object."""
+    dutch_cls = get_lang_class("nl")
+    return dutch_cls().vocab
+
+
@pytest.fixture(scope="session")
 def nl_tokenizer():
    return get_lang_class("nl")().tokenizer
--- a/spacy/tests/lang/nl/test_noun_chunks.py
+++ b/spacy/tests/lang/nl/test_noun_chunks.py
@ -0,0 +1,209 @@
+from spacy.tokens import Doc
+import pytest
+
+
+@pytest.fixture
+def nl_sample(nl_vocab):
+    """
+    Build a hand-annotated Doc (words, heads, deps, pos) for the text:
+
+    Haar vriend lacht luid. We kregen alweer ruzie toen we de supermarkt ingingen.
+    Aan het begin van de supermarkt is al het fruit en de groentes. Uiteindelijk hebben we dan ook
+    geen avondeten gekocht.
+    """
+    # One row per token: (text, index of head token, dependency label, POS tag)
+    annotated_tokens = [
+        ("Haar", 1, "nmod:poss", "PRON"),
+        ("vriend", 2, "nsubj", "NOUN"),
+        ("lacht", 2, "ROOT", "VERB"),
+        ("luid", 2, "advmod", "ADJ"),
+        (".", 2, "punct", "PUNCT"),
+        ("We", 6, "nsubj", "PRON"),
+        ("kregen", 6, "ROOT", "VERB"),
+        ("alweer", 6, "advmod", "ADV"),
+        ("ruzie", 6, "obj", "NOUN"),
+        ("toen", 13, "mark", "SCONJ"),
+        ("we", 13, "nsubj", "PRON"),
+        ("de", 12, "det", "DET"),
+        ("supermarkt", 13, "obj", "NOUN"),
+        # Tagged NOUN while governing an "nsubj" ("we")
+        ("ingingen", 6, "advcl", "NOUN"),
+        (".", 6, "punct", "PUNCT"),
+        ("Aan", 17, "case", "ADP"),
+        ("het", 17, "det", "DET"),
+        ("begin", 24, "obl", "NOUN"),
+        ("van", 20, "case", "ADP"),
+        ("de", 20, "det", "DET"),
+        ("supermarkt", 17, "nmod", "NOUN"),
+        ("is", 24, "cop", "AUX"),
+        ("al", 24, "advmod", "ADV"),
+        ("het", 24, "det", "DET"),
+        ("fruit", 24, "ROOT", "NOUN"),
+        ("en", 27, "cc", "CCONJ"),
+        ("de", 27, "det", "DET"),
+        ("groentes", 24, "conj", "NOUN"),
+        (".", 24, "punct", "PUNCT"),
+        ("Uiteindelijk", 36, "advmod", "ADJ"),
+        ("hebben", 36, "aux", "AUX"),
+        ("we", 36, "nsubj", "PRON"),
+        ("dan", 36, "advmod", "ADV"),
+        ("ook", 36, "advmod", "ADV"),
+        ("geen", 35, "det", "DET"),
+        ("avondeten", 36, "obj", "NOUN"),
+        ("gekocht", 36, "ROOT", "VERB"),
+        (".", 36, "punct", "PUNCT"),
+    ]
+    # Split the rows back into the four parallel lists that Doc expects.
+    words, heads, deps, pos = (list(column) for column in zip(*annotated_tokens))
+    return Doc(nl_vocab, words=words, heads=heads, deps=deps, pos=pos)
+
+
+@pytest.fixture
+def nl_reference_chunking():
+    """
+    Return the expected noun phrases, lowercased and in document order.
+    They were obtained with frog: https://github.com/LanguageMachines/frog/
+    """
+    frog_noun_phrases = [
+        "haar vriend",
+        "we",
+        "ruzie",
+        "we",
+        "de supermarkt",
+        "het begin",
+        "de supermarkt",
+        "het fruit",
+        "de groentes",
+        "we",
+        "geen avondeten",
+    ]
+    return frog_noun_phrases
+
+
+def test_need_dep(nl_tokenizer):
+    """
+    Requesting noun chunks from a Doc produced by the 'nl' tokenizer alone
+    (i.e. not parsed) must raise a ValueError.
+    """
+    unparsed_doc = nl_tokenizer("Haar vriend lacht luid.")
+
+    with pytest.raises(ValueError):
+        # noun_chunks is consumed with list() so the iterator actually runs.
+        list(unparsed_doc.noun_chunks)
+
+
+def test_chunking(nl_sample, nl_reference_chunking):
+    """
+    Compare the noun chunks found in the annotated sample Doc with the
+    reference list. The sample simulates a Doc object as would be produced
+    by nl_core_news_md.
+    """
+    found = []
+    for chunk in nl_sample.noun_chunks:
+        # The reference phrases are lowercase, so normalise before comparing.
+        found.append(chunk.text.lower())
+    assert found == nl_reference_chunking