spaCy/spacy/lang/tr/syntax_iterators.py
Duygu Altinok 7e821c2776
Turkish language syntax iterators (#6191)
* added tr_vocab to config

* basic test

* added syntax iterator to Turkish lang class

* first version for Turkish syntax iter, without flat

* added simple tests with nmod, amod, det

* more tests to amod and nmod

* separated noun chunks and parser test

* rearrangement after nchunk parser separation

* added recursive NPs

* tests with complicated recursive NPs

* tests with conjed NPs

* additional tests for conj NP

* small modification for shaving off conj from NP

* added tests with flat

* more tests with flat

* added examples with flats conjed

* added inner func for flat trick

* corrected parse

Co-authored-by: Adriane Boyd <adrianeboyd@gmail.com>
2020-10-07 11:07:52 +02:00

60 lines
1.7 KiB
Python

# coding: utf8
from __future__ import unicode_literals
from ...symbols import NOUN, PROPN, PRON
from ...errors import Errors
def noun_chunks(doclike):
    """
    Yield base noun phrases found in a dependency parse.

    Works on both Doc and Span: only ``doclike.doc`` and iteration over the
    tokens of ``doclike`` are required. This is a generator, so neither the
    parse check nor any other work happens before iteration starts.

    Yields ``(start, end, label)`` tuples. ``start`` and ``end`` are built
    from the tokens' ``.i`` positions, with ``end`` one past the last token
    of the chunk, and ``label`` is the id the string store returns for "NP".

    Raises ValueError(Errors.E029) when ``doc.is_parsed`` is false.
    """
    doc = doclike.doc  # The underlying document, for a Doc and a Span alike.
    if not doc.is_parsed:
        raise ValueError(Errors.E029)

    # Dependency relations whose noun-like token heads a chunk.
    # Please see documentation for Turkish NP structure.
    add_string = doc.vocab.strings.add
    head_dep_ids = {
        add_string(relation)
        for relation in (
            "nsubj",
            "iobj",
            "obj",
            "obl",
            "appos",
            "orphan",
            "dislocated",
            "ROOT",
        )
    }
    conj_id = add_string("conj")
    flat_id = add_string("flat")
    np_label = add_string("NP")

    def flat_extended_end(head):
        # The "flat" trick: push the chunk end rightwards over the leading run
        # of right dependents that are noun-like "flat" children. The first
        # right dependent that does not qualify ends the extension.
        end = head.i + 1
        for child in head.rights:
            if child.dep != flat_id or child.pos not in (NOUN, PROPN):
                break
            end = child.i + 1
        return end

    # Walk the tokens right to left. `covered_from` is the leftmost position of
    # the most recently produced chunk; starting past the end of the document
    # means that nothing is skipped before the first chunk is found.
    covered_from = len(doc) + 1
    for word in reversed(list(doclike)):
        # Only noun-like tokens can head a chunk, and tokens at or after
        # `covered_from` are skipped so that nested chunks are not produced.
        if word.pos not in (NOUN, PROPN, PRON) or word.i >= covered_from:
            continue

        if word.dep in head_dep_ids:
            leftmost = word.left_edge.i
            chunk_start = leftmost
        elif word.dep == conj_id:
            # Shave off cc tokens from the NP: the chunk starts right after
            # the subtree of the conjunct's leftmost token.
            # NOTE(review): if left_edge can be the conjunct itself (no left
            # dependents), this start lands past the token -- confirm against
            # real parses.
            edge_token = word.left_edge
            leftmost = edge_token.i
            chunk_start = edge_token.right_edge.i + 1
        else:
            continue

        covered_from = leftmost
        yield chunk_start, flat_extended_end(word), np_label
# Syntax iterators provided by this module, keyed by iterator name.
SYNTAX_ITERATORS = {
    "noun_chunks": noun_chunks,
}