Mirror of https://github.com/explosion/spaCy.git (synced 2025-10-31 07:57:35 +03:00)
	Adding noun_chunks to the DUTCH language model (nl) (#8529)
* ✨ implement noun_chunks for the Dutch language
* copy/paste FR and SV syntax iterators to accommodate UD tags
* added tests with Dutch text
* signed contributor agreement
* 🐛 fix noun chunks generator
* built from scratch
* define noun chunk as a single Noun-Phrase
* includes some corner-case debugging (incorrect POS tagging)
* test with provided annotated sample (POS, DEP)
* ✅ fix failing test
* CI pipeline did not like the added sample file
* add the sample as a pytest fixture
* Update spacy/lang/nl/syntax_iterators.py
* Update spacy/lang/nl/syntax_iterators.py (code readability)
  Co-authored-by: Sofie Van Landeghem <svlandeg@users.noreply.github.com>
* Update spacy/tests/lang/nl/test_noun_chunks.py (correct comment)
  Co-authored-by: Sofie Van Landeghem <svlandeg@users.noreply.github.com>
* finalize code
* change "if next_word" into "if next_word is not None"

Co-authored-by: Sofie Van Landeghem <svlandeg@users.noreply.github.com>
This commit is contained in:
parent 2a8eeed5da
commit e117573822
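For orientation, here is a minimal usage sketch of what this commit enables. It assumes a trained Dutch pipeline such as nl_core_news_sm is installed (the model name is an assumption and the exact chunks depend on the model's predictions; the Doc needs POS and DEP annotations for noun_chunks to work):

import spacy

# Assumes a trained Dutch pipeline, e.g.: python -m spacy download nl_core_news_sm
nlp = spacy.load("nl_core_news_sm")
doc = nlp("We kregen alweer ruzie toen we de supermarkt ingingen.")

# After this change, Dutch Docs expose base noun phrases via doc.noun_chunks.
for chunk in doc.noun_chunks:
    print(chunk.text, chunk.label_, chunk.root.text)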
spacy/errors.py
@@ -864,6 +864,9 @@ class Errors:
     E1018 = ("Knowledge base for component '{name}' is not set. "
              "Make sure either `nel.initialize` or `nel.set_kb` "
              "is called with a `kb_loader` function.")
+    E1019 = ("`noun_chunks` requires the pos tagging, which requires a "
+             "statistical model to be installed and loaded. For more info, see "
+             "the documentation:\nhttps://spacy.io/usage/models")
 
 
 # Deprecated model shortcuts, only used in errors and warnings
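As a quick illustration of when the new E1019 error fires, a hedged sketch (assuming a spaCy build that includes this change): a blank Dutch pipeline tokenizes but sets no POS tags, so iterating doc.noun_chunks raises ValueError with this message, mirroring test_need_dep further down.

import spacy

nlp = spacy.blank("nl")                # tokenizer only: no tagger, no parser
doc = nlp("Haar vriend lacht luid.")

try:
    list(doc.noun_chunks)
except ValueError as err:
    print(err)                         # E1019: `noun_chunks` requires the pos tagging ...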
spacy/lang/nl/__init__.py
@@ -1,12 +1,14 @@
 from typing import Optional
 
 from thinc.api import Model
 
-from .stop_words import STOP_WORDS
+from .lemmatizer import DutchLemmatizer
 from .lex_attrs import LEX_ATTRS
-from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
 from .punctuation import TOKENIZER_PREFIXES, TOKENIZER_INFIXES
 from .punctuation import TOKENIZER_SUFFIXES
-from .lemmatizer import DutchLemmatizer
+from .stop_words import STOP_WORDS
+from .syntax_iterators import SYNTAX_ITERATORS
+from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
 from ...language import Language
 
@@ -16,6 +18,7 @@ class DutchDefaults(Language.Defaults):
     infixes = TOKENIZER_INFIXES
     suffixes = TOKENIZER_SUFFIXES
     lex_attr_getters = LEX_ATTRS
+    syntax_iterators = SYNTAX_ITERATORS
     stop_words = STOP_WORDS
 
 
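A small sketch of what this registration buys you (hedged: it only relies on syntax_iterators being part of Language.Defaults; no trained model is needed just to wire it up):

from spacy.lang.nl import Dutch

# After this change, the iterator ships with the Dutch language defaults.
assert "noun_chunks" in Dutch.Defaults.syntax_iterators

nlp = Dutch()                          # blank Dutch pipeline
doc = nlp("Haar vriend lacht luid.")
# doc.noun_chunks is now wired up, but the Doc still needs POS and DEP
# annotations (from a trained pipeline, or set manually) before it yields
# anything; see the E1019 / E029 checks in syntax_iterators.py below.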
spacy/lang/nl/syntax_iterators.py (new file, +72 lines)
from typing import Union, Iterator

from ...symbols import NOUN, PRON
from ...errors import Errors
from ...tokens import Doc, Span


def noun_chunks(doclike: Union[Doc, Span]) -> Iterator[Span]:
    """
    Detect base noun phrases from a dependency parse. Works on Doc and Span.
    The definition is inspired by https://www.nltk.org/book/ch07.html
    Consider : [Noun + determinant / adjective] and also [Pronoun]
    """
    # fmt: off
    # labels = ["nsubj", "nsubj:pass", "obj", "iobj", "ROOT", "appos", "nmod", "nmod:poss"]
    # fmt: on
    doc = doclike.doc  # Ensure works on both Doc and Span.

    # Check for dependencies: POS, DEP
    if not doc.has_annotation("POS"):
        raise ValueError(Errors.E1019)
    if not doc.has_annotation("DEP"):
        raise ValueError(Errors.E029)

    # See UD tags: https://universaldependencies.org/u/dep/index.html
    # amod = adjectival modifier
    # nmod:poss = possessive nominal modifier
    # nummod = numeric modifier
    # det = determiner
    # det:poss = possessive determiner
    noun_deps = [
        doc.vocab.strings[label] for label in ["amod", "nmod:poss", "det", "det:poss"]
    ]

    # nsubj = nominal subject
    # nsubj:pass = passive nominal subject
    pronoun_deps = [doc.vocab.strings[label] for label in ["nsubj", "nsubj:pass"]]

    # Label NP for the Span to identify it as Noun-Phrase
    span_label = doc.vocab.strings.add("NP")

    # Only NOUNS and PRONOUNS matter
    for i, word in enumerate(filter(lambda x: x.pos in [PRON, NOUN], doclike)):
        # For NOUNS
        # Pick children from syntactic parse (only those with certain dependencies)
        if word.pos == NOUN:
            # Some debugging. It happens that VERBS are POS-TAGGED as NOUNS
            # We check if the word has a "nsubj", if it's the case, we eliminate it
            nsubjs = filter(
                lambda x: x.dep == doc.vocab.strings["nsubj"], word.children
            )
            next_word = next(nsubjs, None)
            if next_word is not None:
                # We found some nsubj, so we skip this word. Otherwise, consider it a normal NOUN
                continue

            children = filter(lambda x: x.dep in noun_deps, word.children)
            children_i = [c.i for c in children] + [word.i]

            start_span = min(children_i)
            end_span = max(children_i) + 1
            yield start_span, end_span, span_label

        # PRONOUNS only if it is the subject of a verb
        elif word.pos == PRON:
            if word.dep in pronoun_deps:
                start_span = word.i
                end_span = word.i + 1
                yield start_span, end_span, span_label


SYNTAX_ITERATORS = {"noun_chunks": noun_chunks}
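A minimal sketch of exercising the iterator on a hand-annotated Doc, in the spirit of the test fixture further down (no trained pipeline needed; the annotations are supplied manually, so treat the expected output as illustrative):

from spacy.lang.nl import Dutch
from spacy.tokens import Doc

nlp = Dutch()
words = ["Haar", "vriend", "lacht", "luid", "."]
heads = [1, 2, 2, 2, 2]
deps = ["nmod:poss", "nsubj", "ROOT", "advmod", "punct"]
pos = ["PRON", "NOUN", "VERB", "ADJ", "PUNCT"]
doc = Doc(nlp.vocab, words=words, heads=heads, deps=deps, pos=pos)

# "Haar vriend" should come out as a single NP chunk: the NOUN "vriend"
# has no nsubj child, and its nmod:poss child "Haar" extends the span.
print([chunk.text for chunk in doc.noun_chunks])   # expected: ['Haar vriend']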
spacy/tests/conftest.py
@@ -202,6 +202,11 @@ def ne_tokenizer():
     return get_lang_class("ne")().tokenizer
 
 
+@pytest.fixture(scope="session")
+def nl_vocab():
+    return get_lang_class("nl")().vocab
+
+
 @pytest.fixture(scope="session")
 def nl_tokenizer():
     return get_lang_class("nl")().tokenizer
spacy/tests/lang/nl/test_noun_chunks.py (new file, +209 lines)
from spacy.tokens import Doc
import pytest


@pytest.fixture
def nl_sample(nl_vocab):
    # TEXT :
    # Haar vriend lacht luid. We kregen alweer ruzie toen we de supermarkt ingingen.
    # Aan het begin van de supermarkt is al het fruit en de groentes. Uiteindelijk hebben we dan ook
    # geen avondeten gekocht.
    words = [
        "Haar",
        "vriend",
        "lacht",
        "luid",
        ".",
        "We",
        "kregen",
        "alweer",
        "ruzie",
        "toen",
        "we",
        "de",
        "supermarkt",
        "ingingen",
        ".",
        "Aan",
        "het",
        "begin",
        "van",
        "de",
        "supermarkt",
        "is",
        "al",
        "het",
        "fruit",
        "en",
        "de",
        "groentes",
        ".",
        "Uiteindelijk",
        "hebben",
        "we",
        "dan",
        "ook",
        "geen",
        "avondeten",
        "gekocht",
        ".",
    ]
    heads = [
        1,
        2,
        2,
        2,
        2,
        6,
        6,
        6,
        6,
        13,
        13,
        12,
        13,
        6,
        6,
        17,
        17,
        24,
        20,
        20,
        17,
        24,
        24,
        24,
        24,
        27,
        27,
        24,
        24,
        36,
        36,
        36,
        36,
        36,
        35,
        36,
        36,
        36,
    ]
    deps = [
        "nmod:poss",
        "nsubj",
        "ROOT",
        "advmod",
        "punct",
        "nsubj",
        "ROOT",
        "advmod",
        "obj",
        "mark",
        "nsubj",
        "det",
        "obj",
        "advcl",
        "punct",
        "case",
        "det",
        "obl",
        "case",
        "det",
        "nmod",
        "cop",
        "advmod",
        "det",
        "ROOT",
        "cc",
        "det",
        "conj",
        "punct",
        "advmod",
        "aux",
        "nsubj",
        "advmod",
        "advmod",
        "det",
        "obj",
        "ROOT",
        "punct",
    ]
    pos = [
        "PRON",
        "NOUN",
        "VERB",
        "ADJ",
        "PUNCT",
        "PRON",
        "VERB",
        "ADV",
        "NOUN",
        "SCONJ",
        "PRON",
        "DET",
        "NOUN",
        "NOUN",
        "PUNCT",
        "ADP",
        "DET",
        "NOUN",
        "ADP",
        "DET",
        "NOUN",
        "AUX",
        "ADV",
        "DET",
        "NOUN",
        "CCONJ",
        "DET",
        "NOUN",
        "PUNCT",
        "ADJ",
        "AUX",
        "PRON",
        "ADV",
        "ADV",
        "DET",
        "NOUN",
        "VERB",
        "PUNCT",
    ]
    return Doc(nl_vocab, words=words, heads=heads, deps=deps, pos=pos)


@pytest.fixture
def nl_reference_chunking():
    # Using frog https://github.com/LanguageMachines/frog/ we obtain the following NOUN-PHRASES:
    return [
        "haar vriend",
        "we",
        "ruzie",
        "we",
        "de supermarkt",
        "het begin",
        "de supermarkt",
        "het fruit",
        "de groentes",
        "we",
        "geen avondeten",
    ]


def test_need_dep(nl_tokenizer):
    """
    Test that noun_chunks raises ValueError for 'nl' language if Doc is not parsed.
    """
    txt = "Haar vriend lacht luid."
    doc = nl_tokenizer(txt)

    with pytest.raises(ValueError):
        list(doc.noun_chunks)


def test_chunking(nl_sample, nl_reference_chunking):
    """
    Test the noun chunks of a sample text. Uses a sample.
    The sample text simulates a Doc object as would be produced by nl_core_news_md.
    """
    chunks = [s.text.lower() for s in nl_sample.noun_chunks]
    assert chunks == nl_reference_chunking
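To run just these new tests locally from a spaCy checkout, a standard pytest invocation should do (a hedged sketch, not part of the commit itself; shown via pytest.main so it stays in Python):

import pytest

# Equivalent to running `pytest spacy/tests/lang/nl/test_noun_chunks.py -v`
# from the repository root.
pytest.main(["spacy/tests/lang/nl/test_noun_chunks.py", "-v"])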