spaCy/spacy/lang/fi/syntax_iterators.py

81 lines
2.3 KiB
Python
Raw Permalink Normal View History

from typing import Iterator, Tuple, Union
from ...errors import Errors
from ...symbols import NOUN, PRON, PROPN
from ...tokens import Doc, Span
def noun_chunks(doclike: Union[Doc, Span]) -> Iterator[Tuple[int, int, int]]:
"""Detect base noun phrases from a dependency parse. Works on both Doc and Span."""
labels = [
"appos",
"nsubj",
"nsubj:cop",
"obj",
"obl",
"ROOT",
]
extend_labels = [
"amod",
"compound",
"compound:nn",
"flat:name",
"nmod",
"nmod:gobj",
"nmod:gsubj",
"nmod:poss",
"nummod",
]
def potential_np_head(word):
return word.pos in (NOUN, PROPN) and (
word.dep in np_deps or word.head.pos == PRON
)
doc = doclike.doc # Ensure works on both Doc and Span.
if not doc.has_annotation("DEP"):
raise ValueError(Errors.E029)
np_deps = [doc.vocab.strings[label] for label in labels]
extend_deps = [doc.vocab.strings[label] for label in extend_labels]
np_label = doc.vocab.strings.add("NP")
conj_label = doc.vocab.strings.add("conj")
rbracket = 0
prev_end = -1
for i, word in enumerate(doclike):
if i < rbracket:
continue
# Is this a potential independent NP head or coordinated with
# a NOUN that is itself an independent NP head?
#
# e.g. "Terveyden ja hyvinvoinnin laitos"
if potential_np_head(word) or (
word.dep == conj_label and potential_np_head(word.head)
):
# Try to extend to the left to include adjective/num
# modifiers, compound words etc.
lbracket = word.i
for ldep in word.lefts:
if ldep.dep in extend_deps:
lbracket = ldep.left_edge.i
break
# Prevent nested chunks from being produced
if lbracket <= prev_end:
continue
rbracket = word.i
# Try to extend the span to the right to capture
# appositions and noun modifiers
for rdep in word.rights:
if rdep.dep in extend_deps:
rbracket = rdep.i
prev_end = rbracket
yield lbracket, rbracket + 1, np_label
SYNTAX_ITERATORS = {"noun_chunks": noun_chunks}