From a06fca9fdf48353aa671a84c090f4d21d53b6ec6 Mon Sep 17 00:00:00 2001
From: Wolfgang Seeker
Date: Tue, 3 May 2016 16:58:59 +0200
Subject: [PATCH] German noun chunk iterator now doesn't return tokens more
 than once

---
 spacy/symbols.pxd               |  1 -
 spacy/symbols.pyx               |  1 -
 spacy/syntax/iterators.pyx      | 10 +++--
 spacy/tests/unit/test_parser.py | 72 +++++++++++++++++++++++++--------
 4 files changed, 62 insertions(+), 22 deletions(-)

diff --git a/spacy/symbols.pxd b/spacy/symbols.pxd
index d577eaf6d..942d8aa9c 100644
--- a/spacy/symbols.pxd
+++ b/spacy/symbols.pxd
@@ -382,7 +382,6 @@ cpdef enum symbol_t:
     cc
     ccomp
     complm
-    compound
     conj
     csubj
     csubjpass
diff --git a/spacy/symbols.pyx b/spacy/symbols.pyx
index 0e8dcda13..712bef9a3 100644
--- a/spacy/symbols.pyx
+++ b/spacy/symbols.pyx
@@ -381,7 +381,6 @@ IDS = {
     "cc": cc,
     "ccomp": ccomp,
     "complm": complm,
-    "compound": compound,
     "conj": conj,
     "csubj": csubj,
     "csubjpass": csubjpass,
diff --git a/spacy/syntax/iterators.pyx b/spacy/syntax/iterators.pyx
index b8b810d36..395f772ce 100644
--- a/spacy/syntax/iterators.pyx
+++ b/spacy/syntax/iterators.pyx
@@ -32,7 +32,9 @@ def german_noun_chunks(doc):
     np_deps = set(doc.vocab.strings[label] for label in labels)
     close_app = doc.vocab.strings['nk']

-    for word in doc:
+    i = 0
+    while i < len(doc):
+        word = doc[i]
         if word.pos == NOUN and word.dep in np_deps:
             rbracket = word.i+1
             # try to extend the span to the right
@@ -40,7 +42,9 @@ def german_noun_chunks(doc):
             for rdep in doc[word.i].rights:
                 if rdep.pos == NOUN and rdep.dep == close_app:
                     rbracket = rdep.i+1
-            yield word.l_edge, rbracket, np_label
-
+            yield word.left_edge.i, rbracket, np_label
+            i = rbracket
+            continue
+        i += 1

 CHUNKERS = {'en': english_noun_chunks, 'de': german_noun_chunks}
diff --git a/spacy/tests/unit/test_parser.py b/spacy/tests/unit/test_parser.py
index ba224b9ec..78bfad293 100644
--- a/spacy/tests/unit/test_parser.py
+++ b/spacy/tests/unit/test_parser.py
@@ -1,10 +1,11 @@
+# -*- coding: utf-8 -*-
+
 from __future__ import unicode_literals

 import pytest
 import numpy

 from spacy.attrs import HEAD, DEP
-from spacy.symbols import root, det, compound, nsubjpass, aux, auxpass, punct, nsubj, cc, amod, conj, advmod, attr, prep, pobj


 @pytest.mark.models
@@ -13,6 +14,7 @@ class TestNounChunks:
     def ex1_en(self, EN):
         example = EN.tokenizer.tokens_from_list('A base phrase should be recognized .'.split(' '))
         EN.tagger.tag_from_strings(example, 'DT NN NN MD VB VBN .'.split(' '))
+        det,compound,nsubjpass,aux,auxpass,root,punct = tuple( EN.vocab.strings[l] for l in ['det','compound','nsubjpass','aux','auxpass','root','punct'] )
         example.from_array([HEAD, DEP],
                            numpy.asarray(
                            [
@@ -30,6 +32,7 @@ class TestNounChunks:
     def ex2_en(self, EN):
         example = EN.tokenizer.tokens_from_list('A base phrase and a good phrase are often the same .'.split(' '))
         EN.tagger.tag_from_strings(example, 'DT NN NN CC DT JJ NN VBP RB DT JJ .'.split(' '))
+        det,compound,nsubj,cc,amod,conj,root,advmod,attr,punct = tuple( EN.vocab.strings[l] for l in ['det','compound','nsubj','cc','amod','conj','root','advmod','attr','punct'] )
         example.from_array([HEAD, DEP],
                            numpy.asarray(
                            [
@@ -52,6 +55,7 @@ class TestNounChunks:
     def ex3_en(self, EN):
         example = EN.tokenizer.tokens_from_list('A phrase with another phrase occurs .'.split(' '))
         EN.tagger.tag_from_strings(example, 'DT NN IN DT NN VBZ .'.split(' '))
+        det,nsubj,prep,pobj,root,punct = tuple( EN.vocab.strings[l] for l in ['det','nsubj','prep','pobj','root','punct'] )
         example.from_array([HEAD, DEP],
                            numpy.asarray(
                            [
@@ -65,22 +69,43 @@ class TestNounChunks:
                            ], dtype='int32'))
         return example

-    # @pytest.fixture(score="class")
-    # def ex1_de(self, DE):
-    #     example = EN.tokenizer.tokens_from_list('Eine Tasse steht auf dem Tisch .'.split(' '))
-    #     EN.tagger.tag_from_strings(example, 'ART NN VVFIN APPR ART NN $.'.split(' '))
-    #     example.from_array([HEAD, DEP],
-    #                        numpy.asarray(
-    #                        [
-    #                            [1, det],
-    #                            [4, nsubj],
-    #                            [-1, prep],
-    #                            [1, det],
-    #                            [-2, pobj],
-    #                            [0, root],
-    #                            [-1, punct]
-    #                        ], dtype='int32'))
-    #     return example
+    @pytest.fixture(scope="class")
+    def ex1_de(self, DE):
+        example = DE.tokenizer.tokens_from_list('Eine Tasse steht auf dem Tisch .'.split(' '))
+        DE.tagger.tag_from_strings(example, 'ART NN VVFIN APPR ART NN $.'.split(' '))
+        nk,sb,root,mo,punct = tuple( DE.vocab.strings[l] for l in ['nk','sb','root','mo','punct'])
+        example.from_array([HEAD, DEP],
+                           numpy.asarray(
+                           [
+                               [1, nk],
+                               [1, sb],
+                               [0, root],
+                               [-1, mo],
+                               [1, nk],
+                               [-2, nk],
+                               [-3, punct]
+                           ], dtype='int32'))
+        return example
+
+    @pytest.fixture(scope="class")
+    def ex2_de(self, DE):
+        example = DE.tokenizer.tokens_from_list('Die Sängerin singt mit einer Tasse Kaffee Arien .'.split(' '))
+        DE.tagger.tag_from_strings(example, 'ART NN VVFIN APPR ART NN NN NN $.'.split(' '))
+        nk,sb,root,mo,punct,oa = tuple( DE.vocab.strings[l] for l in ['nk','sb','root','mo','punct','oa'])
+        example.from_array([HEAD, DEP],
+                           numpy.asarray(
+                           [
+                               [1, nk],
+                               [1, sb],
+                               [0, root],
+                               [-1, mo],
+                               [1, nk],
+                               [-2, nk],
+                               [-1, nk],
+                               [-5, oa],
+                               [-6, punct]
+                           ], dtype='int32'))
+        return example

     def test_en_standard_chunk(self, ex1_en):
         chunks = list(ex1_en.noun_chunks)
@@ -98,3 +123,16 @@ class TestNounChunks:
         assert len(chunks) == 2
         assert chunks[0].string == 'A phrase '
         assert chunks[1].string == 'another phrase '
+
+    def test_de_standard_chunk(self, ex1_de):
+        chunks = list(ex1_de.noun_chunks)
+        assert len(chunks) == 2
+        assert chunks[0].string == 'Eine Tasse '
+        assert chunks[1].string == 'dem Tisch '
+
+    def test_de_extended_chunk(self, ex2_de):
+        chunks = list(ex2_de.noun_chunks)
+        assert len(chunks) == 3
+        assert chunks[0].string == 'Die Sängerin '
+        assert chunks[1].string == 'einer Tasse Kaffee '
+        assert chunks[2].string == 'Arien '
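
Note (not part of the patch): the fix replaces the plain "for word in doc" loop
with an index-driven loop that jumps to the chunk's right bracket after each
yield, so a noun that was already absorbed into an extended chunk (e.g. 'Kaffee'
in 'einer Tasse Kaffee') can no longer seed a second, overlapping chunk. The
snippet below is a minimal, self-contained sketch of that control flow;
non_overlapping_chunks and toy_chunk_end are illustrative stand-ins, not spaCy
API, and the toy chunk rule only mimics how close appositions ('nk') extend a
chunk to the right.

    def non_overlapping_chunks(tokens, chunk_end):
        """Yield (start, end) spans left to right, skipping tokens that are
        already inside a yielded span -- the same control flow as the fixed
        german_noun_chunks loop."""
        i = 0
        while i < len(tokens):
            end = chunk_end(tokens, i)  # 'rbracket': one past the chunk, or None
            if end is not None:
                yield i, end
                i = end                 # resume after the chunk (i = rbracket)
                continue
            i += 1

    # Toy rule: an 'N' token starts a chunk that swallows following 'N's,
    # mirroring how close appositions extend a German noun chunk rightward.
    def toy_chunk_end(tokens, i):
        if tokens[i] != 'N':
            return None
        end = i + 1
        while end < len(tokens) and tokens[end] == 'N':
            end += 1
        return end

    print(list(non_overlapping_chunks(['D', 'N', 'N', 'V'], toy_chunk_end)))
    # -> [(1, 3)]: the second 'N' sits inside the first chunk and is not
    #    yielded again; the old for-loop also produced the overlapping (2, 3).

This is the behaviour test_de_extended_chunk pins down: 'einer Tasse Kaffee'
comes out as one chunk, with no separate chunk for 'Kaffee'.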