from typing import Iterator, Tuple, Union from ...errors import Errors from ...symbols import NOUN, PRON from ...tokens import Doc, Span def noun_chunks(doclike: Union[Doc, Span]) -> Iterator[Tuple[int, int, int]]: """ Detect base noun phrases from a dependency parse. Works on Doc and Span. The definition is inspired by https://www.nltk.org/book/ch07.html Consider : [Noun + determinant / adjective] and also [Pronoun] """ # fmt: off # labels = ["nsubj", "nsubj:pass", "obj", "iobj", "ROOT", "appos", "nmod", "nmod:poss"] # fmt: on doc = doclike.doc # Ensure works on both Doc and Span. # Check for dependencies: POS, DEP if not doc.has_annotation("POS"): raise ValueError(Errors.E1019) if not doc.has_annotation("DEP"): raise ValueError(Errors.E029) # See UD tags: https://universaldependencies.org/u/dep/index.html # amod = adjectival modifier # nmod:poss = possessive nominal modifier # nummod = numeric modifier # det = determiner # det:poss = possessive determiner noun_deps = [ doc.vocab.strings[label] for label in ["amod", "nmod:poss", "det", "det:poss"] ] # nsubj = nominal subject # nsubj:pass = passive nominal subject pronoun_deps = [doc.vocab.strings[label] for label in ["nsubj", "nsubj:pass"]] # Label NP for the Span to identify it as Noun-Phrase span_label = doc.vocab.strings.add("NP") # Only NOUNS and PRONOUNS matter end_span = -1 for i, word in enumerate(filter(lambda x: x.pos in [PRON, NOUN], doclike)): # For NOUNS # Pick children from syntactic parse (only those with certain dependencies) if word.pos == NOUN: # Some debugging. It happens that VERBS are POS-TAGGED as NOUNS # We check if the word has a "nsubj", if it's the case, we eliminate it nsubjs = filter( lambda x: x.dep == doc.vocab.strings["nsubj"], word.children ) next_word = next(nsubjs, None) if next_word is not None: # We found some nsubj, so we skip this word. Otherwise, consider it a normal NOUN continue children = filter(lambda x: x.dep in noun_deps, word.children) children_i = [c.i for c in children] + [word.i] start_span = min(children_i) if start_span >= end_span: end_span = max(children_i) + 1 yield start_span, end_span, span_label # PRONOUNS only if it is the subject of a verb elif word.pos == PRON: if word.dep in pronoun_deps: start_span = word.i if start_span >= end_span: end_span = word.i + 1 yield start_span, end_span, span_label SYNTAX_ITERATORS = {"noun_chunks": noun_chunks}