spaCy/spacy/lang/nl/syntax_iterators.py

from typing import Union, Iterator, Tuple

from ...symbols import NOUN, PRON
from ...errors import Errors
from ...tokens import Doc, Span


def noun_chunks(doclike: Union[Doc, Span]) -> Iterator[Tuple[int, int, int]]:
    """
    Detect base noun phrases from a dependency parse. Works on Doc and Span.
    The definition is inspired by https://www.nltk.org/book/ch07.html
    Consider: [Noun + determiner / adjective] and also [Pronoun].

    doclike (Union[Doc, Span]): The object whose tokens are scanned.
    YIELDS (Tuple[int, int, int]): (start, end, label) triples. start and end
        are derived from the tokens' `.i` attribute (end is exclusive) and
        label is the value returned by adding "NP" to the string store.
    RAISES (ValueError): If the underlying doc has no "POS" annotation
        (Errors.E1019) or no "DEP" annotation (Errors.E029).
    """
    doc = doclike.doc  # Ensure works on both Doc and Span.

    # Both the POS tags and the dependency parse are required below.
    if not doc.has_annotation("POS"):
        raise ValueError(Errors.E1019)
    if not doc.has_annotation("DEP"):
        raise ValueError(Errors.E029)

    strings = doc.vocab.strings

    # See UD tags: https://universaldependencies.org/u/dep/index.html
    # amod = adjectival modifier
    # nmod:poss = possessive nominal modifier
    # det = determiner
    # det:poss = possessive determiner
    noun_deps = {
        strings[label] for label in ("amod", "nmod:poss", "det", "det:poss")
    }

    # nsubj = nominal subject
    # nsubj:pass = passive nominal subject
    # Resolved once here instead of once per inspected child token.
    nsubj = strings["nsubj"]
    pronoun_deps = {nsubj, strings["nsubj:pass"]}

    # Label NP for the Span to identify it as Noun-Phrase
    span_label = strings.add("NP")

    # Only NOUNS and PRONOUNS matter; every other token falls through.
    for word in doclike:
        if word.pos == NOUN:
            # It happens that VERBS are POS-tagged as NOUNS. A word that has
            # an "nsubj" child is treated as such a mis-tagged verb and skipped.
            if any(child.dep == nsubj for child in word.children):
                continue

            # The chunk spans from the leftmost to the rightmost of the noun
            # itself and its children with one of the accepted dependencies.
            indices = [child.i for child in word.children if child.dep in noun_deps]
            indices.append(word.i)
            yield min(indices), max(indices) + 1, span_label

        # PRONOUNS only if it is the subject of a verb
        elif word.pos == PRON and word.dep in pronoun_deps:
            yield word.i, word.i + 1, span_label


# Maps each syntax iterator name to the function implementing it in this module.
SYNTAX_ITERATORS = {
    "noun_chunks": noun_chunks,
}
Adding noun_chunks to the DUTCH language model (nl) (#8529) * :sparkles: implement noun_chunks for Dutch language * copy/paste FR and SV syntax iterators to accommodate UD tags * added tests with Dutch text * signed contributor agreement * :bug: fix noun chunks generator * built from scratch * define noun chunk as a single Noun-Phrase * includes some corner cases debugging (incorrect POS tagging) * test with provided annotated sample (POS, DEP) * :white_check_mark: fix failing test * CI pipeline did not like the added sample file * add the sample as a pytest fixture * Update spacy/lang/nl/syntax_iterators.py * Update spacy/lang/nl/syntax_iterators.py Code readability Co-authored-by: Sofie Van Landeghem <svlandeg@users.noreply.github.com> * Update spacy/tests/lang/nl/test_noun_chunks.py correct comment Co-authored-by: Sofie Van Landeghem <svlandeg@users.noreply.github.com> * finalize code * change "if next_word" into "if next_word is not None" Co-authored-by: Sofie Van Landeghem <svlandeg@users.noreply.github.com> 2021-07-14 15:01:02 +03:00			`from typing import Union, Iterator`

			`from ...symbols import NOUN, PRON`
			`from ...errors import Errors`
			`from ...tokens import Doc, Span`


			`def noun_chunks(doclike: Union[Doc, Span]) -> Iterator[Span]:`
			`"""`
			`Detect base noun phrases from a dependency parse. Works on Doc and Span.`
			`The definition is inspired by https://www.nltk.org/book/ch07.html`
			`Consider : [Noun + determinant / adjective] and also [Pronoun]`
			`"""`
			`# fmt: off`
			`# labels = ["nsubj", "nsubj:pass", "obj", "iobj", "ROOT", "appos", "nmod", "nmod:poss"]`
			`# fmt: on`
			`doc = doclike.doc # Ensure works on both Doc and Span.`

			`# Check for dependencies: POS, DEP`
			`if not doc.has_annotation("POS"):`
			`raise ValueError(Errors.E1019)`
			`if not doc.has_annotation("DEP"):`
			`raise ValueError(Errors.E029)`

			`# See UD tags: https://universaldependencies.org/u/dep/index.html`
			`# amod = adjectival modifier`
			`# nmod:poss = possessive nominal modifier`
			`# nummod = numeric modifier`
			`# det = determiner`
			`# det:poss = possessive determiner`
			`noun_deps = [`
			`doc.vocab.strings[label] for label in ["amod", "nmod:poss", "det", "det:poss"]`
			`]`

			`# nsubj = nominal subject`
			`# nsubj:pass = passive nominal subject`
			`pronoun_deps = [doc.vocab.strings[label] for label in ["nsubj", "nsubj:pass"]]`

			`# Label NP for the Span to identify it as Noun-Phrase`
			`span_label = doc.vocab.strings.add("NP")`

			`# Only NOUNS and PRONOUNS matter`
			`for i, word in enumerate(filter(lambda x: x.pos in [PRON, NOUN], doclike)):`
			`# For NOUNS`
			`# Pick children from syntactic parse (only those with certain dependencies)`
			`if word.pos == NOUN:`
			`# Some debugging. It happens that VERBS are POS-TAGGED as NOUNS`
			`# We check if the word has a "nsubj", if it's the case, we eliminate it`
			`nsubjs = filter(`
			`lambda x: x.dep == doc.vocab.strings["nsubj"], word.children`
			`)`
			`next_word = next(nsubjs, None)`
			`if next_word is not None:`
			`# We found some nsubj, so we skip this word. Otherwise, consider it a normal NOUN`
			`continue`

			`children = filter(lambda x: x.dep in noun_deps, word.children)`
			`children_i = [c.i for c in children] + [word.i]`

			`start_span = min(children_i)`
			`end_span = max(children_i) + 1`
			`yield start_span, end_span, span_label`

			`# PRONOUNS only if it is the subject of a verb`
			`elif word.pos == PRON:`
			`if word.dep in pronoun_deps:`
			`start_span = word.i`
			`end_span = word.i + 1`
			`yield start_span, end_span, span_label`


			`SYNTAX_ITERATORS = {"noun_chunks": noun_chunks}`