spaCy/spacy/lang/fi/syntax_iterators.py
Antti Ajanki e9c26f2ee9
Add a noun chunker for Finnish (#10214)
with test cases
2022-02-08 08:44:11 +01:00

80 lines
2.3 KiB
Python

from typing import Iterator, Tuple, Union
from ...tokens import Doc, Span
from ...symbols import NOUN, PROPN, PRON
from ...errors import Errors
def noun_chunks(doclike: Union[Doc, Span]) -> Iterator[Tuple[int, int, int]]:
"""Detect base noun phrases from a dependency parse. Works on both Doc and Span."""
labels = [
"appos",
"nsubj",
"nsubj:cop",
"obj",
"obl",
"ROOT",
]
extend_labels = [
"amod",
"compound",
"compound:nn",
"flat:name",
"nmod",
"nmod:gobj",
"nmod:gsubj",
"nmod:poss",
"nummod",
]
def potential_np_head(word):
return word.pos in (NOUN, PROPN) and (
word.dep in np_deps or word.head.pos == PRON
)
doc = doclike.doc # Ensure works on both Doc and Span.
if not doc.has_annotation("DEP"):
raise ValueError(Errors.E029)
np_deps = [doc.vocab.strings[label] for label in labels]
extend_deps = [doc.vocab.strings[label] for label in extend_labels]
np_label = doc.vocab.strings.add("NP")
conj_label = doc.vocab.strings.add("conj")
rbracket = 0
prev_end = -1
for i, word in enumerate(doclike):
if i < rbracket:
continue
# Is this a potential independent NP head or coordinated with
# a NOUN that is itself an independent NP head?
#
# e.g. "Terveyden ja hyvinvoinnin laitos"
if potential_np_head(word) or (
word.dep == conj_label and potential_np_head(word.head)
):
# Try to extend to the left to include adjective/num
# modifiers, compound words etc.
lbracket = word.i
for ldep in word.lefts:
if ldep.dep in extend_deps:
lbracket = ldep.left_edge.i
break
# Prevent nested chunks from being produced
if lbracket <= prev_end:
continue
rbracket = word.i
# Try to extend the span to the right to capture
# appositions and noun modifiers
for rdep in word.rights:
if rdep.dep in extend_deps:
rbracket = rdep.i
prev_end = rbracket
yield lbracket, rbracket + 1, np_label
SYNTAX_ITERATORS = {"noun_chunks": noun_chunks}