from typing import Union, Iterator, Tuple from ...tokens import Doc, Span from ...symbols import NOUN, PROPN from ...errors import Errors def noun_chunks(doclike: Union[Doc, Span]) -> Iterator[Tuple[int, int, int]]: """Detect base noun phrases from a dependency parse. Works on Doc and Span.""" # fmt: off labels = ["nsubj", "nsubj:pass", "obj", "obl", "iobj", "ROOT", "appos", "nmod", "nmod:poss"] # fmt: on doc = doclike.doc # Ensure works on both Doc and Span. if not doc.has_annotation("DEP"): raise ValueError(Errors.E029) np_deps = [doc.vocab.strings[label] for label in labels] np_label = doc.vocab.strings.add("NP") prev_end = -1 for i, word in enumerate(doclike): if word.pos not in (NOUN, PROPN): continue # Prevent nested chunks from being produced if word.left_edge.i <= prev_end: continue if word.dep in np_deps: left = word.left_edge.i right = word.right_edge.i + 1 # leave prepositions and punctuation out of the left side of the chunk if word.left_edge.pos_ == "ADP" or word.left_edge.pos_ == "PUNCT": left = word.left_edge.i + 1 prev_end = word.right_edge.i # leave subordinated clauses and appositions out of the chunk a = word.i + 1 while a < word.right_edge.i: paraula = doc[a] if paraula.pos_ == "VERB": right = paraula.left_edge.i prev_end = paraula.left_edge.i - 1 elif paraula.dep_ == "appos": right = paraula.left_edge.i + 1 prev_end = paraula.left_edge.i - 1 a += 1 # leave punctuation out of the right side of the chunk if word.right_edge.pos_ == "PUNCT": right = right - 1 yield left, right, np_label SYNTAX_ITERATORS = {"noun_chunks": noun_chunks}