From 57e8254f63574ef6349d384132043e0bccf32041 Mon Sep 17 00:00:00 2001
From: Tpt
Date: Mon, 12 Jun 2017 15:20:49 +0200
Subject: [PATCH] Adds function to extract french noun chunks

---
 spacy/syntax/iterators.pyx | 46 +++++++++++++++++++++++++++++++++++++++++++++-
 1 file changed, 45 insertions(+), 1 deletion(-)

diff --git a/spacy/syntax/iterators.pyx b/spacy/syntax/iterators.pyx
index c14541d22..557616d18 100644
--- a/spacy/syntax/iterators.pyx
+++ b/spacy/syntax/iterators.pyx
@@ -110,5 +110,49 @@ def es_noun_chunks(obj):
         token = next_token(token)
 
 
+def french_noun_chunks(obj):
+    """Detect base noun phrases in a French dependency parse.
+
+    obj may be a Doc or a Span: it only needs a `.doc` attribute and to
+    iterate over tokens. Yields `(start, end, label)` tuples where `start`
+    is the `.i` of the head's left edge, `end` is one past the `.i` of its
+    right edge, and `label` is what `doc.vocab.strings.add('NP')` returns.
+    """
+    labels = ['nsubj', 'nsubj:pass', 'obj', 'iobj', 'ROOT', 'appos', 'nmod',
+              'nmod:poss']
+    doc = obj.doc # Ensure works on both Doc and Span.
+    # A set, because membership is tested for every candidate token.
+    np_deps = {doc.vocab.strings[label] for label in labels}
+    conj = doc.vocab.strings.add('conj')
+    np_label = doc.vocab.strings.add('NP')
+    seen = set()  # `.i` of every token already covered by a yielded chunk
+    for word in obj:
+        if word.pos not in (NOUN, PROPN, PRON):
+            continue
+        # Prevent nested chunks from being produced
+        if word.i in seen:
+            continue
+        if word.dep in np_deps:
+            is_np = True
+        elif word.dep == conj:
+            # Walk up through heads that are themselves 'conj' and whose
+            # own head precedes them, to reach the start of the coordination.
+            head = word.head
+            while head.dep == conj and head.head.i < head.i:
+                head = head.head
+            # If the head is an NP, and we're coordinated to it, we're an NP
+            is_np = head.dep in np_deps
+        else:
+            is_np = False
+        if not is_np:
+            continue
+        # Skip heads whose subtree overlaps a chunk that was already yielded.
+        if any(w.i in seen for w in word.subtree):
+            continue
+        start = word.left_edge.i
+        end = word.right_edge.i + 1
+        seen.update(range(start, end))
+        yield start, end, np_label
+
+
 CHUNKERS = {'en': english_noun_chunks, 'de': german_noun_chunks,
-            'es': es_noun_chunks}
+            'es': es_noun_chunks, 'fr': french_noun_chunks}