Adding noun_chunks to the Dutch language model (nl) (#8529)

* implement noun_chunks for the Dutch language

* copy/paste FR and SV syntax iterators to accommodate UD tags
* added tests with Dutch text
* signed contributor agreement

* 🐛 fix noun chunks generator

* built from scratch
* define a noun chunk as a single noun phrase
* debug some corner cases (incorrect POS tagging)
* test with provided annotated sample (POS, DEP)

* fix failing test

* CI pipeline did not like the added sample file
* add the sample as a pytest fixture

* Update spacy/lang/nl/syntax_iterators.py

* Update spacy/lang/nl/syntax_iterators.py

Code readability

Co-authored-by: Sofie Van Landeghem <svlandeg@users.noreply.github.com>

* Update spacy/tests/lang/nl/test_noun_chunks.py

correct comment

Co-authored-by: Sofie Van Landeghem <svlandeg@users.noreply.github.com>

* finalize code

* change "if next_word" into "if next_word is not None"

Co-authored-by: Sofie Van Landeghem <svlandeg@users.noreply.github.com>
Julien Rossi 2021-07-14 14:01:02 +02:00 committed by GitHub
parent 2a8eeed5da
commit e117573822
5 changed files with 295 additions and 3 deletions
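
For reference, a minimal usage sketch of the new feature (assuming a trained Dutch pipeline such as nl_core_news_sm is installed; the pipeline name is illustrative and not part of this commit):

    import spacy

    nlp = spacy.load("nl_core_news_sm")
    doc = nlp("Haar vriend lacht luid.")
    print([chunk.text for chunk in doc.noun_chunks])
    # Expected to include "Haar vriend", matching the reference
    # chunking used in the tests below.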

spacy/errors.py

@@ -864,6 +864,9 @@ class Errors:
     E1018 = ("Knowledge base for component '{name}' is not set. "
              "Make sure either `nel.initialize` or `nel.set_kb` "
              "is called with a `kb_loader` function.")
+    E1019 = ("`noun_chunks` requires part-of-speech tagging, which requires a "
+             "statistical model to be installed and loaded. For more info, see "
+             "the documentation:\nhttps://spacy.io/usage/models")

     # Deprecated model shortcuts, only used in errors and warnings
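
A quick repro sketch of when E1019 is raised (assuming a blank pipeline, i.e. tokenizer only, so no POS tags are set; this mirrors the test_need_dep test below):

    import spacy

    nlp = spacy.blank("nl")  # no tagger, no parser
    doc = nlp("Haar vriend lacht luid.")
    list(doc.noun_chunks)  # raises ValueError with the E1019 message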

spacy/lang/nl/__init__.py

@@ -1,12 +1,14 @@
 from typing import Optional
 from thinc.api import Model
-from .stop_words import STOP_WORDS
-from .lemmatizer import DutchLemmatizer
 from .lex_attrs import LEX_ATTRS
-from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
 from .punctuation import TOKENIZER_PREFIXES, TOKENIZER_INFIXES
 from .punctuation import TOKENIZER_SUFFIXES
+from .lemmatizer import DutchLemmatizer
+from .stop_words import STOP_WORDS
+from .syntax_iterators import SYNTAX_ITERATORS
+from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
 from ...language import Language
@@ -16,6 +18,7 @@ class DutchDefaults(Language.Defaults):
     infixes = TOKENIZER_INFIXES
     suffixes = TOKENIZER_SUFFIXES
     lex_attr_getters = LEX_ATTRS
+    syntax_iterators = SYNTAX_ITERATORS
     stop_words = STOP_WORDS
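
Registering SYNTAX_ITERATORS on DutchDefaults is what exposes the iterator through Doc.noun_chunks; a small sketch of the effect, assuming spaCy v3's wiring of Defaults.syntax_iterators into Vocab.get_noun_chunks:

    import spacy

    nlp = spacy.blank("nl")
    # The "noun_chunks" entry from SYNTAX_ITERATORS is attached to the vocab
    assert nlp.vocab.get_noun_chunks is not None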

spacy/lang/nl/syntax_iterators.py

@@ -0,0 +1,72 @@
from typing import Union, Iterator, Tuple

from ...symbols import NOUN, PRON
from ...errors import Errors
from ...tokens import Doc, Span


def noun_chunks(doclike: Union[Doc, Span]) -> Iterator[Tuple[int, int, int]]:
    """
    Detect base noun phrases from a dependency parse. Works on Doc and Span.
    The definition is inspired by https://www.nltk.org/book/ch07.html
    Considers [noun + determiner / adjective] and also [pronoun].
    """
    # fmt: off
    # labels = ["nsubj", "nsubj:pass", "obj", "iobj", "ROOT", "appos", "nmod", "nmod:poss"]
    # fmt: on
    doc = doclike.doc  # Ensure works on both Doc and Span.
    # Both POS and DEP annotations are required
    if not doc.has_annotation("POS"):
        raise ValueError(Errors.E1019)
    if not doc.has_annotation("DEP"):
        raise ValueError(Errors.E029)
    # See UD tags: https://universaldependencies.org/u/dep/index.html
    # amod = adjectival modifier
    # nmod:poss = possessive nominal modifier
    # nummod = numeric modifier
    # det = determiner
    # det:poss = possessive determiner
    noun_deps = [
        doc.vocab.strings[label] for label in ["amod", "nmod:poss", "det", "det:poss"]
    ]
    # nsubj = nominal subject
    # nsubj:pass = passive nominal subject
    pronoun_deps = [doc.vocab.strings[label] for label in ["nsubj", "nsubj:pass"]]
    # Label the chunks as "NP" to identify them as noun phrases
    span_label = doc.vocab.strings.add("NP")
    # Only nouns and pronouns matter
    for word in filter(lambda x: x.pos in [PRON, NOUN], doclike):
        if word.pos == NOUN:
            # Workaround for incorrect POS tagging: verbs sometimes get tagged
            # as nouns. If the word has an "nsubj" child, treat it as a verb
            # and skip it; otherwise consider it a normal noun.
            nsubjs = filter(
                lambda x: x.dep == doc.vocab.strings["nsubj"], word.children
            )
            next_word = next(nsubjs, None)
            if next_word is not None:
                continue
            # Pick the children with the relevant dependencies from the
            # syntactic parse and span from the leftmost to the rightmost
            children = filter(lambda x: x.dep in noun_deps, word.children)
            children_i = [c.i for c in children] + [word.i]
            start_span = min(children_i)
            end_span = max(children_i) + 1
            yield start_span, end_span, span_label
        # Pronouns only if they are the subject of a verb
        elif word.pos == PRON:
            if word.dep in pronoun_deps:
                start_span = word.i
                end_span = word.i + 1
                yield start_span, end_span, span_label


SYNTAX_ITERATORS = {"noun_chunks": noun_chunks}
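
The iterator yields (start, end, label) offsets, which Doc.noun_chunks wraps into Span objects. A minimal sketch exercising it directly on a hand-built Doc (same construction style as the test fixture below):

    from spacy.lang.nl import Dutch
    from spacy.lang.nl.syntax_iterators import noun_chunks
    from spacy.tokens import Doc

    nlp = Dutch()
    doc = Doc(nlp.vocab, words=["de", "supermarkt"], heads=[1, 1],
              deps=["det", "ROOT"], pos=["DET", "NOUN"])
    for start, end, label in noun_chunks(doc):
        print(doc[start:end].text, doc.vocab.strings[label])  # de supermarkt NP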

spacy/tests/conftest.py

@@ -202,6 +202,11 @@ def ne_tokenizer():
     return get_lang_class("ne")().tokenizer


+@pytest.fixture(scope="session")
+def nl_vocab():
+    return get_lang_class("nl")().vocab
+
+
 @pytest.fixture(scope="session")
 def nl_tokenizer():
     return get_lang_class("nl")().tokenizer

spacy/tests/lang/nl/test_noun_chunks.py

@@ -0,0 +1,209 @@
import pytest

from spacy.tokens import Doc


@pytest.fixture
def nl_sample(nl_vocab):
# TEXT :
# Haar vriend lacht luid. We kregen alweer ruzie toen we de supermarkt ingingen.
# Aan het begin van de supermarkt is al het fruit en de groentes. Uiteindelijk hebben we dan ook
# geen avondeten gekocht.
words = [
"Haar",
"vriend",
"lacht",
"luid",
".",
"We",
"kregen",
"alweer",
"ruzie",
"toen",
"we",
"de",
"supermarkt",
"ingingen",
".",
"Aan",
"het",
"begin",
"van",
"de",
"supermarkt",
"is",
"al",
"het",
"fruit",
"en",
"de",
"groentes",
".",
"Uiteindelijk",
"hebben",
"we",
"dan",
"ook",
"geen",
"avondeten",
"gekocht",
".",
]
heads = [
1,
2,
2,
2,
2,
6,
6,
6,
6,
13,
13,
12,
13,
6,
6,
17,
17,
24,
20,
20,
17,
24,
24,
24,
24,
27,
27,
24,
24,
36,
36,
36,
36,
36,
35,
36,
36,
36,
]
deps = [
"nmod:poss",
"nsubj",
"ROOT",
"advmod",
"punct",
"nsubj",
"ROOT",
"advmod",
"obj",
"mark",
"nsubj",
"det",
"obj",
"advcl",
"punct",
"case",
"det",
"obl",
"case",
"det",
"nmod",
"cop",
"advmod",
"det",
"ROOT",
"cc",
"det",
"conj",
"punct",
"advmod",
"aux",
"nsubj",
"advmod",
"advmod",
"det",
"obj",
"ROOT",
"punct",
]
pos = [
"PRON",
"NOUN",
"VERB",
"ADJ",
"PUNCT",
"PRON",
"VERB",
"ADV",
"NOUN",
"SCONJ",
"PRON",
"DET",
"NOUN",
"NOUN",
"PUNCT",
"ADP",
"DET",
"NOUN",
"ADP",
"DET",
"NOUN",
"AUX",
"ADV",
"DET",
"NOUN",
"CCONJ",
"DET",
"NOUN",
"PUNCT",
"ADJ",
"AUX",
"PRON",
"ADV",
"ADV",
"DET",
"NOUN",
"VERB",
"PUNCT",
]
    return Doc(nl_vocab, words=words, heads=heads, deps=deps, pos=pos)


@pytest.fixture
def nl_reference_chunking():
    # Noun phrases obtained for the sample text with frog,
    # https://github.com/LanguageMachines/frog/
return [
"haar vriend",
"we",
"ruzie",
"we",
"de supermarkt",
"het begin",
"de supermarkt",
"het fruit",
"de groentes",
"we",
"geen avondeten",
    ]


def test_need_dep(nl_tokenizer):
    """
    Test that noun_chunks raises ValueError for 'nl' if the Doc is not parsed.
    """
    txt = "Haar vriend lacht luid."
    doc = nl_tokenizer(txt)
    with pytest.raises(ValueError):
        list(doc.noun_chunks)


def test_chunking(nl_sample, nl_reference_chunking):
    """
    Test the noun chunks of a sample text. The sample simulates a Doc object
    as would be produced by nl_core_news_md.
    """
chunks = [s.text.lower() for s in nl_sample.noun_chunks]
assert chunks == nl_reference_chunking
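
To run just these tests from a spaCy source checkout, something like:

    pytest spacy/tests/lang/nl/test_noun_chunks.py -v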