mirror of
https://github.com/explosion/spaCy.git
synced 2024-12-29 11:26:28 +03:00
f0e8c9fe58
* updated syntax iters * formatted the code * added prepositional objects * code clean up * eliminated left attached adp * added es vocab * added basic tests * fixed typo * fixed typo * list to set * fixed doc name * added code for conj * more tests * differentiated adjectives and flat * fixed typo * added compounds * more compounds * tests for compounds * tests for nominal modifiers * fixed typo * fixed typo * formatted file * reformatted tests * fixed typo * fixed punct typo * formatted after changes * added indirect object * added full sentence examples * added longer full sentence examples * fixed sentence length of test * added passive subj * added test case by Damian
77 lines
2.7 KiB
Python
77 lines
2.7 KiB
Python
from typing import Union, Iterator, Tuple
|
|
|
|
from ...symbols import NOUN, PROPN, PRON
|
|
from ...errors import Errors
|
|
from ...tokens import Doc, Span
|
|
|
|
|
|
def noun_chunks(doclike: Union[Doc, Span]) -> Iterator[Tuple[int, int, int]]:
    """
    Yield base noun phrases derived from the dependency parse of a Doc or Span.

    Each yielded item is a ``(start, end, label)`` triple built from token
    ``.i`` indices, where ``end`` is one past the last token of the chunk and
    ``label`` is the string-store ID of "NP".

    Raises ValueError(Errors.E029) when the underlying Doc has no "DEP"
    annotation.
    """
    doc = doclike.doc  # Works for Doc and Span alike.
    if not doc.has_annotation("DEP"):
        raise ValueError(Errors.E029)

    add_string = doc.vocab.strings.add
    # Dependency relations whose nominal token may head a noun chunk.
    head_dep_ids = {
        add_string(name)
        for name in (
            "nsubj",
            "nsubj:pass",
            "obj",
            "obl",
            "nmod",
            "pcomp",
            "appos",
            "ROOT",
        )
    }
    # Relations of a first right child that let the chunk grow to the right.
    right_modifier_ids = {add_string(name) for name in ("flat", "fixed", "compound")}
    chunk_label = add_string("NP")
    amod_id = add_string("amod")
    adp_id = add_string("ADP")
    conj_id = add_string("conj")
    cconj_id = add_string("CCONJ")

    nominal_pos = (NOUN, PROPN, PRON)
    last_covered = -1  # Index of the last token of the most recent chunk.
    for word in doclike:
        # Skip non-nominal tokens, and tokens whose subtree starts inside the
        # previously emitted chunk (prevents nested chunks).
        if word.pos not in nominal_pos or word.left_edge.i <= last_covered:
            continue

        if word.dep in head_dep_ids:
            rights = list(word.rights)
            first_right = rights[0] if rights else None

            if not first_right:
                last_token = word
            elif first_right.dep == amod_id:
                # A right-attached adjective: extend over its whole subtree.
                last_token = first_right.right_edge
            elif first_right.dep in right_modifier_ids:
                # flat / fixed / compound: extend over the word's whole subtree.
                last_token = word.right_edge
            else:
                last_token = word
            last_covered = last_token.i

            start = word.left_edge.i
            if word.left_edge.pos == adp_id:
                # Leave a left-attached adposition (de, del) out of the chunk.
                start += 1

            yield start, last_token.i + 1, chunk_label
        elif word.dep == conj_id:
            # Walk up the chain of conjuncts that attach to a head on their left.
            anchor = word.head
            while anchor.dep == conj_id and anchor.head.i < anchor.i:
                anchor = anchor.head
            # Only a conjunct coordinated to an NP head is itself an NP.
            if anchor.dep not in head_dep_ids:
                continue
            last_covered = word.i

            start = word.left_edge.i
            if word.left_edge.pos == cconj_id:
                # Leave a left-attached coordinating conjunction out of the chunk.
                start += 1
            yield start, word.i + 1, chunk_label
|
|
|
|
|
|
# Maps each syntax iterator's name to the function implementing it.
SYNTAX_ITERATORS = {
    "noun_chunks": noun_chunks,
}
|