German noun chunk iterator no longer returns tokens more than once

Wolfgang Seeker 2016-05-03 16:58:59 +02:00
parent 7825b75548
commit a06fca9fdf
4 changed files with 62 additions and 22 deletions


@@ -382,7 +382,6 @@ cpdef enum symbol_t:
     cc
     ccomp
     complm
-    compound
     conj
     csubj
     csubjpass


@@ -381,7 +381,6 @@ IDS = {
     "cc": cc,
     "ccomp": ccomp,
     "complm": complm,
-    "compound": compound,
     "conj": conj,
     "csubj": csubj,
     "csubjpass": csubjpass,


@@ -32,7 +32,9 @@ def german_noun_chunks(doc):
     np_deps = set(doc.vocab.strings[label] for label in labels)
     close_app = doc.vocab.strings['nk']

-    for word in doc:
+    i = 0
+    while i < len(doc):
+        word = doc[i]
         if word.pos == NOUN and word.dep in np_deps:
             rbracket = word.i+1
             # try to extend the span to the right
@@ -40,7 +42,9 @@ def german_noun_chunks(doc):
             for rdep in doc[word.i].rights:
                 if rdep.pos == NOUN and rdep.dep == close_app:
                     rbracket = rdep.i+1
-            yield word.l_edge, rbracket, np_label
+            yield word.left_edge.i, rbracket, np_label
+            i = rbracket
+            continue
+        i += 1

 CHUNKERS = {'en': english_noun_chunks, 'de': german_noun_chunks}
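
The bug lived in the old loop: "for word in doc" reconsidered every token, so a noun already swallowed by a previous chunk (a close apposition like 'Kaffee' in 'einer Tasse Kaffee', attached with 'nk') started a second, overlapping chunk of its own. The rewrite tracks an explicit index and jumps it to the chunk's right bracket after each yield; it also yields word.left_edge.i, the integer offset of the leftmost token in the noun's subtree, as the chunk's left bracket. A minimal standalone sketch of that skip-ahead pattern, with hypothetical data standing in for parsed tokens:

def chunks(spans):
    # spans[i] is (left_bracket, right_bracket) when token i heads a noun
    # chunk, or None otherwise -- a stand-in for the dependency checks above.
    i = 0
    while i < len(spans):
        span = spans[i]
        if span is not None:
            yield span
            i = span[1]        # resume after the right bracket: no re-yield
            continue
        i += 1

# 'einer Tasse Kaffee Arien': 'Tasse' (index 1) heads [0, 3), which already
# covers the noun 'Kaffee' (index 2); 'Arien' (index 3) heads [3, 4).
spans = [None, (0, 3), (2, 3), (3, 4)]
print(list(chunks(spans)))     # [(0, 3), (3, 4)] -- 'Kaffee' not repeated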


@@ -1,10 +1,11 @@
+# -*- coding: utf-8 -*-
 from __future__ import unicode_literals

 import pytest
 import numpy

 from spacy.attrs import HEAD, DEP
-from spacy.symbols import root, det, compound, nsubjpass, aux, auxpass, punct, nsubj, cc, amod, conj, advmod, attr, prep, pobj

 @pytest.mark.models
@@ -13,6 +14,7 @@ class TestNounChunks:
     def ex1_en(self, EN):
         example = EN.tokenizer.tokens_from_list('A base phrase should be recognized .'.split(' '))
         EN.tagger.tag_from_strings(example, 'DT NN NN MD VB VBN .'.split(' '))
+        det,compound,nsubjpass,aux,auxpass,root,punct = tuple( EN.vocab.strings[l] for l in ['det','compound','nsubjpass','aux','auxpass','root','punct'] )
         example.from_array([HEAD, DEP],
                            numpy.asarray(
                            [
@@ -30,6 +32,7 @@ class TestNounChunks:
     def ex2_en(self, EN):
         example = EN.tokenizer.tokens_from_list('A base phrase and a good phrase are often the same .'.split(' '))
         EN.tagger.tag_from_strings(example, 'DT NN NN CC DT JJ NN VBP RB DT JJ .'.split(' '))
+        det,compound,nsubj,cc,amod,conj,root,advmod,attr,punct = tuple( EN.vocab.strings[l] for l in ['det','compound','nsubj','cc','amod','conj','root','advmod','attr','punct'] )
         example.from_array([HEAD, DEP],
                            numpy.asarray(
                            [
@@ -52,6 +55,7 @@ class TestNounChunks:
     def ex3_en(self, EN):
         example = EN.tokenizer.tokens_from_list('A phrase with another phrase occurs .'.split(' '))
         EN.tagger.tag_from_strings(example, 'DT NN IN DT NN VBZ .'.split(' '))
+        det,nsubj,prep,pobj,root,punct = tuple( EN.vocab.strings[l] for l in ['det','nsubj','prep','pobj','root','punct'] )
         example.from_array([HEAD, DEP],
                            numpy.asarray(
                            [
@@ -65,22 +69,43 @@ class TestNounChunks:
                            ], dtype='int32'))
         return example

-    # @pytest.fixture(score="class")
-    # def ex1_de(self, DE):
-    #     example = EN.tokenizer.tokens_from_list('Eine Tasse steht auf dem Tisch .'.split(' '))
-    #     EN.tagger.tag_from_strings(example, 'ART NN VVFIN APPR ART NN $.'.split(' '))
-    #     example.from_array([HEAD, DEP],
-    #                        numpy.asarray(
-    #                        [
-    #                            [1, det],
-    #                            [4, nsubj],
-    #                            [-1, prep],
-    #                            [1, det],
-    #                            [-2, pobj],
-    #                            [0, root],
-    #                            [-1, punct]
-    #                        ], dtype='int32'))
-    #     return example
+    @pytest.fixture(scope="class")
+    def ex1_de(self, DE):
+        example = DE.tokenizer.tokens_from_list('Eine Tasse steht auf dem Tisch .'.split(' '))
+        DE.tagger.tag_from_strings(example, 'ART NN VVFIN APPR ART NN $.'.split(' '))
+        nk,sb,root,mo,punct = tuple( DE.vocab.strings[l] for l in ['nk','sb','root','mo','punct'])
+        example.from_array([HEAD, DEP],
+                           numpy.asarray(
+                           [
+                               [1, nk],
+                               [1, sb],
+                               [0, root],
+                               [-1, mo],
+                               [1, nk],
+                               [-2, nk],
+                               [-3, punct]
+                           ], dtype='int32'))
+        return example
+
+    @pytest.fixture(scope="class")
+    def ex2_de(self, DE):
+        example = DE.tokenizer.tokens_from_list('Die Sängerin singt mit einer Tasse Kaffee Arien .'.split(' '))
+        DE.tagger.tag_from_strings(example, 'ART NN VVFIN APPR ART NN NN NN $.'.split(' '))
+        nk,sb,root,mo,punct,oa = tuple( DE.vocab.strings[l] for l in ['nk','sb','root','mo','punct','oa'])
+        example.from_array([HEAD, DEP],
+                           numpy.asarray(
+                           [
+                               [1, nk],
+                               [1, sb],
+                               [0, root],
+                               [-1, mo],
+                               [1, nk],
+                               [-2, nk],
+                               [-1, nk],
+                               [-5, oa],
+                               [-6, punct]
+                           ], dtype='int32'))
+        return example

     def test_en_standard_chunk(self, ex1_en):
         chunks = list(ex1_en.noun_chunks)
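
In these fixtures the HEAD column stores each token's head as an offset relative to the token's own position (0 marks the root), and DEP holds the integer label ID. So in ex1_de, [1, nk] on 'Eine' attaches it one token to the right, to 'Tasse', and [-3, punct] hangs the final period on 'steht'. A standalone sketch that decodes those rows into absolute arcs, with labels kept as plain strings for readability:

words = 'Eine Tasse steht auf dem Tisch .'.split(' ')
rows = [(1, 'nk'), (1, 'sb'), (0, 'root'), (-1, 'mo'),
        (1, 'nk'), (-2, 'nk'), (-3, 'punct')]

# The absolute head of token i is i + offset; offset 0 marks the root.
for i, (offset, dep) in enumerate(rows):
    print('%-6s -%s-> %s' % (words[i], dep, words[i + offset]))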
@@ -98,3 +123,16 @@ class TestNounChunks:
         assert len(chunks) == 2
         assert chunks[0].string == 'A phrase '
         assert chunks[1].string == 'another phrase '
+
+    def test_de_standard_chunk(self, ex1_de):
+        chunks = list(ex1_de.noun_chunks)
+        assert len(chunks) == 2
+        assert chunks[0].string == 'Eine Tasse '
+        assert chunks[1].string == 'dem Tisch '
+
+    def test_de_extended_chunk(self, ex2_de):
+        chunks = list(ex2_de.noun_chunks)
+        assert len(chunks) == 3
+        assert chunks[0].string == 'Die Sängerin '
+        assert chunks[1].string == 'einer Tasse Kaffee '
+        assert chunks[2].string == 'Arien '
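
The expected strings keep their trailing space because span.string includes each token's trailing whitespace. End to end, the behaviour under test looks roughly like the sketch below; loading German via spacy.de.German with model data installed is an assumption on my part, mirroring the DE fixture these tests rely on:

from spacy.de import German

nlp = German()   # assumes German model data is installed
doc = nlp(u'Die Sängerin singt mit einer Tasse Kaffee Arien .')

# With the fixed iterator every token belongs to at most one chunk, so
# 'Kaffee' appears inside 'einer Tasse Kaffee' but never as its own chunk.
for chunk in doc.noun_chunks:
    print(chunk.string)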