Adding noun_chunks to the DUTCH language model (nl) (#8529)
* ✨ implement noun_chunks for the Dutch language
* copy/paste the FR and SV syntax iterators to accommodate UD tags
* added tests with Dutch text
* signed contributor agreement
* 🐛 fix noun chunks generator
* built from scratch
* define a noun chunk as a single Noun-Phrase
* includes some corner-case debugging (incorrect POS tagging)
* test with provided annotated sample (POS, DEP)
* ✅ fix failing test: the CI pipeline did not like the added sample file
* add the sample as a pytest fixture
* Update spacy/lang/nl/syntax_iterators.py
* Update spacy/lang/nl/syntax_iterators.py: code readability
  Co-authored-by: Sofie Van Landeghem <svlandeg@users.noreply.github.com>
* Update spacy/tests/lang/nl/test_noun_chunks.py: correct comment
  Co-authored-by: Sofie Van Landeghem <svlandeg@users.noreply.github.com>
* finalize code
* change "if next_word" into "if next_word is not None"

Co-authored-by: Sofie Van Landeghem <svlandeg@users.noreply.github.com>
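For orientation before the diff: after this change, the new iterator is reached through `Doc.noun_chunks`. A minimal usage sketch, not part of this commit; the pipeline name `nl_core_news_md` is an assumption for illustration (any trained Dutch pipeline that sets POS and DEP annotations works):

# Sketch, not part of this commit. Assumes nl_core_news_md is installed;
# any Dutch pipeline providing POS tags and a dependency parse would do.
import spacy

nlp = spacy.load("nl_core_news_md")
doc = nlp("Haar vriend lacht luid.")
# With annotations like the test sample below, this prints ['Haar vriend'].
print([chunk.text for chunk in doc.noun_chunks])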
This commit is contained in:
parent
2a8eeed5da
commit
e117573822
spacy/errors.py
@@ -864,6 +864,9 @@ class Errors:
     E1018 = ("Knowledge base for component '{name}' is not set. "
              "Make sure either `nel.initialize` or `nel.set_kb` "
              "is called with a `kb_loader` function.")
+    E1019 = ("`noun_chunks` requires the pos tagging, which requires a "
+             "statistical model to be installed and loaded. For more info, see "
+             "the documentation:\nhttps://spacy.io/usage/models")

     # Deprecated model shortcuts, only used in errors and warnings
spacy/lang/nl/__init__.py
@@ -1,12 +1,14 @@
 from typing import Optional
 from thinc.api import Model

-from .stop_words import STOP_WORDS
+from .lemmatizer import DutchLemmatizer
 from .lex_attrs import LEX_ATTRS
-from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
 from .punctuation import TOKENIZER_PREFIXES, TOKENIZER_INFIXES
 from .punctuation import TOKENIZER_SUFFIXES
-from .lemmatizer import DutchLemmatizer
+from .stop_words import STOP_WORDS
+from .syntax_iterators import SYNTAX_ITERATORS
+from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
 from ...language import Language
@@ -16,6 +18,7 @@ class DutchDefaults(Language.Defaults):
     infixes = TOKENIZER_INFIXES
     suffixes = TOKENIZER_SUFFIXES
     lex_attr_getters = LEX_ATTRS
+    syntax_iterators = SYNTAX_ITERATORS
     stop_words = STOP_WORDS
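A quick sanity check of what this registration does (a sketch, not in the diff): `Language.Defaults.syntax_iterators` is the hook that `Doc.noun_chunks` resolves against, keyed by "noun_chunks". Assuming spaCy v3 behavior:

# Sketch: verify the Dutch defaults now expose the noun_chunks iterator.
from spacy.lang.nl import Dutch

assert "noun_chunks" in Dutch.Defaults.syntax_iterators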
spacy/lang/nl/syntax_iterators.py (new file, 72 lines)
@@ -0,0 +1,72 @@
from typing import Union, Iterator

from ...symbols import NOUN, PRON
from ...errors import Errors
from ...tokens import Doc, Span


def noun_chunks(doclike: Union[Doc, Span]) -> Iterator[Span]:
    """
    Detect base noun phrases from a dependency parse. Works on Doc and Span.
    The definition is inspired by https://www.nltk.org/book/ch07.html
    Consider: [Noun + determinant / adjective] and also [Pronoun]
    """
    # fmt: off
    # labels = ["nsubj", "nsubj:pass", "obj", "iobj", "ROOT", "appos", "nmod", "nmod:poss"]
    # fmt: on
    doc = doclike.doc  # Ensure works on both Doc and Span.

    # Check for dependencies: POS, DEP
    if not doc.has_annotation("POS"):
        raise ValueError(Errors.E1019)
    if not doc.has_annotation("DEP"):
        raise ValueError(Errors.E029)

    # See UD tags: https://universaldependencies.org/u/dep/index.html
    # amod = adjectival modifier
    # nmod:poss = possessive nominal modifier
    # nummod = numeric modifier
    # det = determiner
    # det:poss = possessive determiner
    noun_deps = [
        doc.vocab.strings[label] for label in ["amod", "nmod:poss", "det", "det:poss"]
    ]

    # nsubj = nominal subject
    # nsubj:pass = passive nominal subject
    pronoun_deps = [doc.vocab.strings[label] for label in ["nsubj", "nsubj:pass"]]

    # Label NP for the Span to identify it as Noun-Phrase
    span_label = doc.vocab.strings.add("NP")

    # Only NOUNS and PRONOUNS matter
    for i, word in enumerate(filter(lambda x: x.pos in [PRON, NOUN], doclike)):
        # For NOUNS
        # Pick children from syntactic parse (only those with certain dependencies)
        if word.pos == NOUN:
            # Some debugging. It happens that VERBS are POS-TAGGED as NOUNS
            # We check if the word has a "nsubj", if it's the case, we eliminate it
            nsubjs = filter(
                lambda x: x.dep == doc.vocab.strings["nsubj"], word.children
            )
            next_word = next(nsubjs, None)
            if next_word is not None:
                # We found some nsubj, so we skip this word. Otherwise, consider it a normal NOUN
                continue

            children = filter(lambda x: x.dep in noun_deps, word.children)
            children_i = [c.i for c in children] + [word.i]

            start_span = min(children_i)
            end_span = max(children_i) + 1
            yield start_span, end_span, span_label

        # PRONOUNS only if it is the subject of a verb
        elif word.pos == PRON:
            if word.dep in pronoun_deps:
                start_span = word.i
                end_span = word.i + 1
                yield start_span, end_span, span_label


SYNTAX_ITERATORS = {"noun_chunks": noun_chunks}
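How the yielded tuples surface to users (a sketch, not in the diff): `Doc.noun_chunks` looks up this iterator and wraps each `(start, end, label)` tuple in a `Span`. Because the Doc below is annotated by hand, no trained pipeline is required, mirroring the tests added further down:

# Sketch with hand-annotated tokens; spacy.blank("nl") wires the Dutch
# syntax iterator onto the vocab, so no statistical model is needed.
import spacy
from spacy.tokens import Doc

nlp = spacy.blank("nl")
doc = Doc(
    nlp.vocab,
    words=["Haar", "vriend", "lacht", "luid", "."],
    heads=[1, 2, 2, 2, 2],
    deps=["nmod:poss", "nsubj", "ROOT", "advmod", "punct"],
    pos=["PRON", "NOUN", "VERB", "ADJ", "PUNCT"],
)
print([chunk.text for chunk in doc.noun_chunks])  # ['Haar vriend']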
spacy/tests/conftest.py
@@ -202,6 +202,11 @@ def ne_tokenizer():
     return get_lang_class("ne")().tokenizer


+@pytest.fixture(scope="session")
+def nl_vocab():
+    return get_lang_class("nl")().vocab
+
+
 @pytest.fixture(scope="session")
 def nl_tokenizer():
     return get_lang_class("nl")().tokenizer
spacy/tests/lang/nl/test_noun_chunks.py (new file, 209 lines)
@@ -0,0 +1,209 @@
from spacy.tokens import Doc
import pytest


@pytest.fixture
def nl_sample(nl_vocab):
    # TEXT:
    # Haar vriend lacht luid. We kregen alweer ruzie toen we de supermarkt ingingen.
    # Aan het begin van de supermarkt is al het fruit en de groentes. Uiteindelijk hebben we dan ook
    # geen avondeten gekocht.
    words = [
        "Haar", "vriend", "lacht", "luid", ".",
        "We", "kregen", "alweer", "ruzie", "toen", "we", "de", "supermarkt", "ingingen", ".",
        "Aan", "het", "begin", "van", "de", "supermarkt", "is", "al", "het", "fruit", "en", "de", "groentes", ".",
        "Uiteindelijk", "hebben", "we", "dan", "ook", "geen", "avondeten", "gekocht", ".",
    ]
    heads = [
        1, 2, 2, 2, 2,
        6, 6, 6, 6, 13, 13, 12, 13, 6, 6,
        17, 17, 24, 20, 20, 17, 24, 24, 24, 24, 27, 27, 24, 24,
        36, 36, 36, 36, 36, 35, 36, 36, 36,
    ]
    deps = [
        "nmod:poss", "nsubj", "ROOT", "advmod", "punct",
        "nsubj", "ROOT", "advmod", "obj", "mark", "nsubj", "det", "obj", "advcl", "punct",
        "case", "det", "obl", "case", "det", "nmod", "cop", "advmod", "det", "ROOT", "cc", "det", "conj", "punct",
        "advmod", "aux", "nsubj", "advmod", "advmod", "det", "obj", "ROOT", "punct",
    ]
    pos = [
        "PRON", "NOUN", "VERB", "ADJ", "PUNCT",
        "PRON", "VERB", "ADV", "NOUN", "SCONJ", "PRON", "DET", "NOUN", "NOUN", "PUNCT",
        "ADP", "DET", "NOUN", "ADP", "DET", "NOUN", "AUX", "ADV", "DET", "NOUN", "CCONJ", "DET", "NOUN", "PUNCT",
        "ADJ", "AUX", "PRON", "ADV", "ADV", "DET", "NOUN", "VERB", "PUNCT",
    ]
    return Doc(nl_vocab, words=words, heads=heads, deps=deps, pos=pos)


@pytest.fixture
def nl_reference_chunking():
    # Using frog https://github.com/LanguageMachines/frog/ we obtain the following NOUN-PHRASES:
    return [
        "haar vriend",
        "we",
        "ruzie",
        "we",
        "de supermarkt",
        "het begin",
        "de supermarkt",
        "het fruit",
        "de groentes",
        "we",
        "geen avondeten",
    ]


def test_need_dep(nl_tokenizer):
    """
    Test that noun_chunks raises ValueError for 'nl' language if Doc is not parsed.
    """
    txt = "Haar vriend lacht luid."
    doc = nl_tokenizer(txt)

    with pytest.raises(ValueError):
        list(doc.noun_chunks)


def test_chunking(nl_sample, nl_reference_chunking):
    """
    Test the noun chunks of a sample text. Uses a sample.
    The sample text simulates a Doc object as would be produced by nl_core_news_md.
    """
    chunks = [s.text.lower() for s in nl_sample.noun_chunks]
    assert chunks == nl_reference_chunking