spaCy/spacy/lang/pt/syntax_iterators.py
Duygu Altinok 6e6650307d
Portuguese noun chunks review (#9559)
* added tests

* added pt vocab

* transferred spanish

* added syntax iters

* fixed parenthesis

* added nmod example

* added relative pron

* fixed rel pron

* added rel subclause

* corrected typo

* added more NP chains

* long sentence

* fixed typo

* fixed typo

* fixed typo

* corrected heads

* added passive subj

* added pass subj

* added passive obj

* refinement to rights

* went back to odl

* fixed test

* fixed typo

* fixed typo

* formatted

* Format

* Format test cases

Co-authored-by: Adriane Boyd <adrianeboyd@gmail.com>
2021-11-04 23:55:49 +01:00

86 lines
3.0 KiB
Python

from typing import Union, Iterator, Tuple
from ...symbols import NOUN, PROPN, PRON
from ...errors import Errors
from ...tokens import Doc, Span
def noun_chunks(doclike: Union[Doc, Span]) -> Iterator[Tuple[int, int, int]]:
"""
Detect base noun phrases from a dependency parse. Works on both Doc and Span.
"""
labels = [
"nsubj",
"nsubj:pass",
"obj",
"obl",
"obl:agent",
"nmod",
"pcomp",
"appos",
"ROOT",
]
post_modifiers = ["flat", "flat:name", "fixed", "compound"]
doc = doclike.doc # Ensure works on both Doc and Span.
if not doc.has_annotation("DEP"):
raise ValueError(Errors.E029)
np_deps = {doc.vocab.strings.add(label) for label in labels}
np_modifs = {doc.vocab.strings.add(modifier) for modifier in post_modifiers}
np_label = doc.vocab.strings.add("NP")
adj_label = doc.vocab.strings.add("amod")
det_label = doc.vocab.strings.add("det")
det_pos = doc.vocab.strings.add("DET")
adp_label = doc.vocab.strings.add("ADP")
conj = doc.vocab.strings.add("conj")
conj_pos = doc.vocab.strings.add("CCONJ")
prev_end = -1
for i, word in enumerate(doclike):
if word.pos not in (NOUN, PROPN, PRON):
continue
# Prevent nested chunks from being produced
if word.left_edge.i <= prev_end:
continue
if word.dep in np_deps:
right_childs = list(word.rights)
right_child = right_childs[0] if right_childs else None
if right_child:
if (
right_child.dep == adj_label
): # allow chain of adjectives by expanding to right
right_end = right_child.right_edge
elif (
right_child.dep == det_label and right_child.pos == det_pos
): # cut relative pronouns here
right_end = right_child
elif right_child.dep in np_modifs: # Check if we can expand to right
right_end = word.right_edge
else:
right_end = word
else:
right_end = word
prev_end = right_end.i
left_index = word.left_edge.i
left_index = (
left_index + 1 if word.left_edge.pos == adp_label else left_index
)
yield left_index, right_end.i + 1, np_label
elif word.dep == conj:
head = word.head
while head.dep == conj and head.head.i < head.i:
head = head.head
# If the head is an NP, and we're coordinated to it, we're an NP
if head.dep in np_deps:
prev_end = word.i
left_index = word.left_edge.i # eliminate left attached conjunction
left_index = (
left_index + 1 if word.left_edge.pos == conj_pos else left_index
)
yield left_index, word.i + 1, np_label
SYNTAX_ITERATORS = {"noun_chunks": noun_chunks}