from typing import Iterator, Tuple, Union from ...tokens import Doc, Span from ...symbols import NOUN, PROPN, PRON from ...errors import Errors def noun_chunks(doclike: Union[Doc, Span]) -> Iterator[Tuple[int, int, int]]: """Detect base noun phrases from a dependency parse. Works on both Doc and Span.""" labels = [ "appos", "nsubj", "nsubj:cop", "obj", "obl", "ROOT", ] extend_labels = [ "amod", "compound", "compound:nn", "flat:name", "nmod", "nmod:gobj", "nmod:gsubj", "nmod:poss", "nummod", ] def potential_np_head(word): return word.pos in (NOUN, PROPN) and ( word.dep in np_deps or word.head.pos == PRON ) doc = doclike.doc # Ensure works on both Doc and Span. if not doc.has_annotation("DEP"): raise ValueError(Errors.E029) np_deps = [doc.vocab.strings[label] for label in labels] extend_deps = [doc.vocab.strings[label] for label in extend_labels] np_label = doc.vocab.strings.add("NP") conj_label = doc.vocab.strings.add("conj") rbracket = 0 prev_end = -1 for i, word in enumerate(doclike): if i < rbracket: continue # Is this a potential independent NP head or coordinated with # a NOUN that is itself an independent NP head? # # e.g. "Terveyden ja hyvinvoinnin laitos" if potential_np_head(word) or ( word.dep == conj_label and potential_np_head(word.head) ): # Try to extend to the left to include adjective/num # modifiers, compound words etc. lbracket = word.i for ldep in word.lefts: if ldep.dep in extend_deps: lbracket = ldep.left_edge.i break # Prevent nested chunks from being produced if lbracket <= prev_end: continue rbracket = word.i # Try to extend the span to the right to capture # appositions and noun modifiers for rdep in word.rights: if rdep.dep in extend_deps: rbracket = rdep.i prev_end = rbracket yield lbracket, rbracket + 1, np_label SYNTAX_ITERATORS = {"noun_chunks": noun_chunks}