# File: spaCy/spacy/tests/lang/nl/test_noun_chunks.py
# Web-viewer metadata at capture time: 227 lines, 4.2 KiB, Python.

import pytest
from spacy.tokens import Doc
from spacy.util import filter_spans
@pytest.fixture
def nl_sample(nl_vocab):
    """Return a Dutch ``Doc`` whose annotations are supplied by hand.

    Each token carries its dependency head, dependency label and coarse
    part-of-speech tag. Heads are absolute token indices; a ROOT token
    points at itself.
    """
    # TEXT:
    # Haar vriend lacht luid. We kregen alweer ruzie toen we de supermarkt ingingen.
    # Aan het begin van de supermarkt is al het fruit en de groentes. Uiteindelijk hebben we dan ook
    # geen avondeten gekocht.
    # fmt: off
    annotated_tokens = [
        # (word, head, dep, pos)
        ("Haar", 1, "nmod:poss", "PRON"),
        ("vriend", 2, "nsubj", "NOUN"),
        ("lacht", 2, "ROOT", "VERB"),
        ("luid", 2, "advmod", "ADJ"),
        (".", 2, "punct", "PUNCT"),
        ("We", 6, "nsubj", "PRON"),
        ("kregen", 6, "ROOT", "VERB"),
        ("alweer", 6, "advmod", "ADV"),
        ("ruzie", 6, "obj", "NOUN"),
        ("toen", 13, "mark", "SCONJ"),
        ("we", 13, "nsubj", "PRON"),
        ("de", 12, "det", "DET"),
        ("supermarkt", 13, "obj", "NOUN"),
        ("ingingen", 6, "advcl", "NOUN"),
        (".", 6, "punct", "PUNCT"),
        ("Aan", 17, "case", "ADP"),
        ("het", 17, "det", "DET"),
        ("begin", 24, "obl", "NOUN"),
        ("van", 20, "case", "ADP"),
        ("de", 20, "det", "DET"),
        ("supermarkt", 17, "nmod", "NOUN"),
        ("is", 24, "cop", "AUX"),
        ("al", 24, "advmod", "ADV"),
        ("het", 24, "det", "DET"),
        ("fruit", 24, "ROOT", "NOUN"),
        ("en", 27, "cc", "CCONJ"),
        ("de", 27, "det", "DET"),
        ("groentes", 24, "conj", "NOUN"),
        (".", 24, "punct", "PUNCT"),
        ("Uiteindelijk", 36, "advmod", "ADJ"),
        ("hebben", 36, "aux", "AUX"),
        ("we", 36, "nsubj", "PRON"),
        ("dan", 36, "advmod", "ADV"),
        ("ook", 36, "advmod", "ADV"),
        ("geen", 35, "det", "DET"),
        ("avondeten", 36, "obj", "NOUN"),
        ("gekocht", 36, "ROOT", "VERB"),
        (".", 36, "punct", "PUNCT"),
    ]
    # fmt: on
    # Split the per-token rows back into the four parallel lists that the
    # Doc constructor takes as keyword arguments.
    words, heads, deps, pos = (list(column) for column in zip(*annotated_tokens))
    return Doc(nl_vocab, words=words, heads=heads, deps=deps, pos=pos)
@pytest.fixture
def nl_reference_chunking():
    """Return the expected lower-cased noun phrases, in document order.

    The reference was obtained with frog
    (https://github.com/LanguageMachines/frog/).
    """
    expected_chunks = [
        # "Haar vriend lacht luid."
        "haar vriend",
        # "We kregen alweer ruzie toen we de supermarkt ingingen."
        "we",
        "ruzie",
        "we",
        "de supermarkt",
        # "Aan het begin van de supermarkt is al het fruit en de groentes."
        "het begin",
        "de supermarkt",
        "het fruit",
        "de groentes",
        # "Uiteindelijk hebben we dan ook geen avondeten gekocht."
        "we",
        "geen avondeten",
    ]
    return expected_chunks
def test_need_dep(nl_tokenizer):
    """
    Iterating noun_chunks on a tokenized-only 'nl' Doc (no dependency
    parse) must raise ValueError.
    """
    unparsed_doc = nl_tokenizer("Haar vriend lacht luid.")
    with pytest.raises(ValueError):
        # Force full consumption of noun_chunks inside the raises context.
        list(unparsed_doc.noun_chunks)
def test_chunking(nl_sample, nl_reference_chunking):
    """
    Compare the noun chunks of the hand-annotated sample Doc with the
    reference chunking, ignoring case.
    The sample simulates a Doc object as would be produced by nl_core_news_md.
    """
    chunk_texts = (span.text for span in nl_sample.noun_chunks)
    predicted = [text.lower() for text in chunk_texts]
    assert predicted == nl_reference_chunking
@pytest.mark.issue(10846)
def test_no_overlapping_chunks(nl_vocab):
    """
    Regression test for issue 10846: the noun chunks of this sentence must
    come back unchanged from filter_spans.
    """
    # fmt: off
    words = ["Dit", "programma", "wordt", "beschouwd", "als", "'s", "werelds", "eerste", "computerprogramma"]
    deps = ["det", "nsubj:pass", "aux:pass", "ROOT", "mark", "det", "fixed", "amod", "xcomp"]
    heads = [1, 3, 3, 3, 8, 8, 5, 8, 3]
    pos = ["DET", "NOUN", "AUX", "VERB", "SCONJ", "DET", "NOUN", "ADJ", "NOUN"]
    # fmt: on
    doc = Doc(nl_vocab, words=words, deps=deps, heads=heads, pos=pos)
    chunks = list(doc.noun_chunks)
    # NOTE(review): presumably filter_spans drops overlapping spans, so an
    # unchanged result means the chunks do not overlap — confirm against
    # the spaCy docs.
    assert filter_spans(chunks) == chunks