mirror of
https://github.com/explosion/spaCy.git
synced 2025-01-10 17:26:42 +03:00
e117573822
* ✨ implement noun_chunks for Dutch language * copy/paste FR and SV syntax iterators to accommodate UD tags * added tests with Dutch text * signed contributor agreement * 🐛 fix noun chunks generator * built from scratch * define noun chunk as a single Noun-Phrase * includes some corner cases debugging (incorrect POS tagging) * test with provided annotated sample (POS, DEP) * ✅ fix failing test * CI pipeline did not like the added sample file * add the sample as a pytest fixture * Update spacy/lang/nl/syntax_iterators.py * Update spacy/lang/nl/syntax_iterators.py Code readability Co-authored-by: Sofie Van Landeghem <svlandeg@users.noreply.github.com> * Update spacy/tests/lang/nl/test_noun_chunks.py correct comment Co-authored-by: Sofie Van Landeghem <svlandeg@users.noreply.github.com> * finalize code * change "if next_word" into "if next_word is not None" Co-authored-by: Sofie Van Landeghem <svlandeg@users.noreply.github.com>
73 lines
2.7 KiB
Python
73 lines
2.7 KiB
Python
from typing import Iterator, Tuple, Union

from ...symbols import NOUN, PRON
from ...errors import Errors
from ...tokens import Doc, Span
|
|
|
|
|
|
def noun_chunks(doclike: Union[Doc, Span]) -> Iterator[Tuple[int, int, int]]:
    """
    Detect base noun phrases from a dependency parse. Works on Doc and Span.
    The definition is inspired by https://www.nltk.org/book/ch07.html
    Consider : [Noun + determiner / adjective] and also [Pronoun]

    doclike (Union[Doc, Span]): The object whose tokens are scanned.
    YIELDS (Tuple[int, int, int]): One (start, end, label) triple per chunk.
        start and end are taken from the tokens' `i` attribute (end is
        exclusive); label is the string-store ID of "NP".
    RAISES (ValueError): With Errors.E1019 if the doc has no "POS" annotation,
        or Errors.E029 if it has no "DEP" annotation. Because this is a
        generator, the checks run when iteration starts.
    """
    doc = doclike.doc  # Ensure works on both Doc and Span.

    # Both the POS tags and the dependency parse are required.
    if not doc.has_annotation("POS"):
        raise ValueError(Errors.E1019)
    if not doc.has_annotation("DEP"):
        raise ValueError(Errors.E029)

    strings = doc.vocab.strings

    # See UD tags: https://universaldependencies.org/u/dep/index.html
    # amod = adjectival modifier
    # nmod:poss = possessive nominal modifier
    # det = determiner
    # det:poss = possessive determiner
    # NOTE(review): earlier comments also mentioned nummod (numeric modifier),
    # but it has never been part of this set -- confirm that is intentional.
    noun_deps = frozenset(
        strings[label] for label in ("amod", "nmod:poss", "det", "det:poss")
    )

    # nsubj = nominal subject
    # nsubj:pass = passive nominal subject
    # The nsubj ID is resolved once here; it is also used below to detect
    # verbs that were mis-tagged as nouns.
    nsubj = strings["nsubj"]
    pronoun_deps = frozenset((nsubj, strings["nsubj:pass"]))

    # Label NP for the Span to identify it as Noun-Phrase
    span_label = strings.add("NP")

    # Only NOUNS and PRONOUNS matter; every other token is ignored.
    for word in doclike:
        if word.pos == NOUN:
            # It happens that VERBS are POS-tagged as NOUNS. A "noun" that
            # has an nsubj child is treated as such a verb and skipped.
            if any(child.dep == nsubj for child in word.children):
                continue

            # The chunk runs from the leftmost to the rightmost of the noun
            # and its direct children carrying one of the selected labels.
            indices = [child.i for child in word.children if child.dep in noun_deps]
            indices.append(word.i)
            yield min(indices), max(indices) + 1, span_label

        elif word.pos == PRON and word.dep in pronoun_deps:
            # PRONOUNS only if it is the subject of a verb: a one-token chunk.
            yield word.i, word.i + 1, span_label
|
|
|
|
|
|
# Maps each syntax iterator's name to the function implementing it.
SYNTAX_ITERATORS = {
    "noun_chunks": noun_chunks,
}
|