mirror of
https://github.com/explosion/spaCy.git
synced 2025-01-12 02:06:31 +03:00
Fix Dutch noun chunks to skip overlapping spans (#11275)
* Add test for overlapping noun chunks
* Skip overlapping noun chunks
* Update spacy/tests/lang/nl/test_noun_chunks.py

Co-authored-by: Sofie Van Landeghem <svlandeg@users.noreply.github.com>
Co-authored-by: Sofie Van Landeghem <svlandeg@users.noreply.github.com>
This commit is contained in:
parent
231a17817d
commit
ed4ad309e6
|
@ -40,6 +40,7 @@ def noun_chunks(doclike: Union[Doc, Span]) -> Iterator[Tuple[int, int, int]]:
|
||||||
span_label = doc.vocab.strings.add("NP")
|
span_label = doc.vocab.strings.add("NP")
|
||||||
|
|
||||||
# Only NOUNS and PRONOUNS matter
|
# Only NOUNS and PRONOUNS matter
|
||||||
|
end_span = -1
|
||||||
for i, word in enumerate(filter(lambda x: x.pos in [PRON, NOUN], doclike)):
|
for i, word in enumerate(filter(lambda x: x.pos in [PRON, NOUN], doclike)):
|
||||||
# For NOUNS
|
# For NOUNS
|
||||||
# Pick children from syntactic parse (only those with certain dependencies)
|
# Pick children from syntactic parse (only those with certain dependencies)
|
||||||
|
@ -58,15 +59,17 @@ def noun_chunks(doclike: Union[Doc, Span]) -> Iterator[Tuple[int, int, int]]:
|
||||||
children_i = [c.i for c in children] + [word.i]
|
children_i = [c.i for c in children] + [word.i]
|
||||||
|
|
||||||
start_span = min(children_i)
|
start_span = min(children_i)
|
||||||
end_span = max(children_i) + 1
|
if start_span >= end_span:
|
||||||
yield start_span, end_span, span_label
|
end_span = max(children_i) + 1
|
||||||
|
yield start_span, end_span, span_label
|
||||||
|
|
||||||
# PRONOUNS only if it is the subject of a verb
|
# PRONOUNS only if it is the subject of a verb
|
||||||
elif word.pos == PRON:
|
elif word.pos == PRON:
|
||||||
if word.dep in pronoun_deps:
|
if word.dep in pronoun_deps:
|
||||||
start_span = word.i
|
start_span = word.i
|
||||||
end_span = word.i + 1
|
if start_span >= end_span:
|
||||||
yield start_span, end_span, span_label
|
end_span = word.i + 1
|
||||||
|
yield start_span, end_span, span_label
|
||||||
|
|
||||||
|
|
||||||
SYNTAX_ITERATORS = {"noun_chunks": noun_chunks}
|
SYNTAX_ITERATORS = {"noun_chunks": noun_chunks}
|
||||||
|
|
|
@ -1,5 +1,6 @@
|
||||||
from spacy.tokens import Doc
|
|
||||||
import pytest
|
import pytest
|
||||||
|
from spacy.tokens import Doc
|
||||||
|
from spacy.util import filter_spans
|
||||||
|
|
||||||
|
|
||||||
@pytest.fixture
|
@pytest.fixture
|
||||||
|
@ -207,3 +208,18 @@ def test_chunking(nl_sample, nl_reference_chunking):
|
||||||
"""
|
"""
|
||||||
chunks = [s.text.lower() for s in nl_sample.noun_chunks]
|
chunks = [s.text.lower() for s in nl_sample.noun_chunks]
|
||||||
assert chunks == nl_reference_chunking
|
assert chunks == nl_reference_chunking
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.mark.issue(10846)
|
||||||
|
def test_no_overlapping_chunks(nl_vocab):
|
||||||
|
# fmt: off
|
||||||
|
doc = Doc(
|
||||||
|
nl_vocab,
|
||||||
|
words=["Dit", "programma", "wordt", "beschouwd", "als", "'s", "werelds", "eerste", "computerprogramma"],
|
||||||
|
deps=["det", "nsubj:pass", "aux:pass", "ROOT", "mark", "det", "fixed", "amod", "xcomp"],
|
||||||
|
heads=[1, 3, 3, 3, 8, 8, 5, 8, 3],
|
||||||
|
pos=["DET", "NOUN", "AUX", "VERB", "SCONJ", "DET", "NOUN", "ADJ", "NOUN"],
|
||||||
|
)
|
||||||
|
# fmt: on
|
||||||
|
chunks = list(doc.noun_chunks)
|
||||||
|
assert filter_spans(chunks) == chunks
|
||||||
|
|
Loading…
Reference in New Issue
Block a user