spaCy/spacy/lang/ms/syntax_iterators.py

42 lines
1.5 KiB
Python
Raw Normal View History

from typing import Iterator, Tuple, Union
from ...errors import Errors
from ...symbols import NOUN, PRON, PROPN
from ...tokens import Doc, Span
def noun_chunks(doclike: Union[Doc, Span]) -> Iterator[Tuple[int, int, int]]:
"""
Detect base noun phrases from a dependency parse. Works on both Doc and Span.
"""
# fmt: off
labels = ["nsubj", "nsubj:pass", "obj", "iobj", "ROOT", "appos", "nmod", "nmod:poss"]
# fmt: on
doc = doclike.doc # Ensure works on both Doc and Span.
if not doc.has_annotation("DEP"):
raise ValueError(Errors.E029)
np_deps = [doc.vocab.strings[label] for label in labels]
conj = doc.vocab.strings.add("conj")
np_label = doc.vocab.strings.add("NP")
prev_end = -1
for i, word in enumerate(doclike):
if word.pos not in (NOUN, PROPN, PRON):
continue
# Prevent nested chunks from being produced
if word.left_edge.i <= prev_end:
continue
if word.dep in np_deps:
prev_end = word.right_edge.i
yield word.left_edge.i, word.right_edge.i + 1, np_label
elif word.dep == conj:
head = word.head
while head.dep == conj and head.head.i < head.i:
head = head.head
# If the head is an NP, and we're coordinated to it, we're an NP
if head.dep in np_deps:
prev_end = word.right_edge.i
yield word.left_edge.i, word.right_edge.i + 1, np_label
SYNTAX_ITERATORS = {"noun_chunks": noun_chunks}