German noun chunk iterator no longer returns tokens more than once

Wolfgang Seeker 2016-05-03 16:58:59 +02:00
parent 7825b75548
commit a06fca9fdf
4 changed files with 62 additions and 22 deletions


@@ -382,7 +382,6 @@ cpdef enum symbol_t:
     cc
     ccomp
     complm
-    compound
     conj
     csubj
     csubjpass


@@ -381,7 +381,6 @@ IDS = {
     "cc": cc,
     "ccomp": ccomp,
     "complm": complm,
-    "compound": compound,
     "conj": conj,
     "csubj": csubj,
     "csubjpass": csubjpass,


@@ -32,7 +32,9 @@ def german_noun_chunks(doc):
     np_deps = set(doc.vocab.strings[label] for label in labels)
     close_app = doc.vocab.strings['nk']

-    for word in doc:
+    i = 0
+    while i < len(doc):
+        word = doc[i]
         if word.pos == NOUN and word.dep in np_deps:
             rbracket = word.i+1
             # try to extend the span to the right
@@ -40,7 +42,9 @@ def german_noun_chunks(doc):
             for rdep in doc[word.i].rights:
                 if rdep.pos == NOUN and rdep.dep == close_app:
                     rbracket = rdep.i+1
-            yield word.l_edge, rbracket, np_label
+            yield word.left_edge.i, rbracket, np_label
+            i = rbracket
+            continue
+        i += 1

 CHUNKERS = {'en': english_noun_chunks, 'de': german_noun_chunks}
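
The bug lived in the old loop: "for word in doc" reconsidered every token, so a noun already swallowed by a previous chunk (a close apposition like 'Kaffee' in 'einer Tasse Kaffee', attached with 'nk') started a second, overlapping chunk of its own. The rewrite tracks an explicit index and jumps it to the chunk's right bracket after each yield; it also yields word.left_edge.i, the integer offset of the leftmost token in the noun's subtree, as the chunk's left bracket. A minimal standalone sketch of that skip-ahead pattern, with hypothetical data standing in for parsed tokens:

def chunks(spans):
    # spans[i] is (left_bracket, right_bracket) when token i heads a noun
    # chunk, or None otherwise -- a stand-in for the dependency checks above.
    i = 0
    while i < len(spans):
        span = spans[i]
        if span is not None:
            yield span
            i = span[1]        # resume after the right bracket: no re-yield
            continue
        i += 1

# 'einer Tasse Kaffee Arien': 'Tasse' (index 1) heads [0, 3), which already
# covers the noun 'Kaffee' (index 2); 'Arien' (index 3) heads [3, 4).
spans = [None, (0, 3), (2, 3), (3, 4)]
print(list(chunks(spans)))     # [(0, 3), (3, 4)] -- 'Kaffee' not repeated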


@@ -1,10 +1,11 @@
+# -*- coding: utf-8 -*-
 from __future__ import unicode_literals

 import pytest
 import numpy

 from spacy.attrs import HEAD, DEP
-from spacy.symbols import root, det, compound, nsubjpass, aux, auxpass, punct, nsubj, cc, amod, conj, advmod, attr, prep, pobj

 @pytest.mark.models
@@ -13,6 +14,7 @@ class TestNounChunks:
     def ex1_en(self, EN):
         example = EN.tokenizer.tokens_from_list('A base phrase should be recognized .'.split(' '))
         EN.tagger.tag_from_strings(example, 'DT NN NN MD VB VBN .'.split(' '))
+        det,compound,nsubjpass,aux,auxpass,root,punct = tuple( EN.vocab.strings[l] for l in ['det','compound','nsubjpass','aux','auxpass','root','punct'] )
         example.from_array([HEAD, DEP],
                            numpy.asarray(
                            [
@@ -30,6 +32,7 @@ class TestNounChunks:
     def ex2_en(self, EN):
         example = EN.tokenizer.tokens_from_list('A base phrase and a good phrase are often the same .'.split(' '))
         EN.tagger.tag_from_strings(example, 'DT NN NN CC DT JJ NN VBP RB DT JJ .'.split(' '))
+        det,compound,nsubj,cc,amod,conj,root,advmod,attr,punct = tuple( EN.vocab.strings[l] for l in ['det','compound','nsubj','cc','amod','conj','root','advmod','attr','punct'] )
         example.from_array([HEAD, DEP],
                            numpy.asarray(
                            [
@@ -52,6 +55,7 @@ class TestNounChunks:
     def ex3_en(self, EN):
         example = EN.tokenizer.tokens_from_list('A phrase with another phrase occurs .'.split(' '))
         EN.tagger.tag_from_strings(example, 'DT NN IN DT NN VBZ .'.split(' '))
+        det,nsubj,prep,pobj,root,punct = tuple( EN.vocab.strings[l] for l in ['det','nsubj','prep','pobj','root','punct'] )
         example.from_array([HEAD, DEP],
                            numpy.asarray(
                            [
@@ -65,22 +69,43 @@ class TestNounChunks:
                            ], dtype='int32'))
         return example

-    # @pytest.fixture(score="class")
-    # def ex1_de(self, DE):
-    #     example = EN.tokenizer.tokens_from_list('Eine Tasse steht auf dem Tisch .'.split(' '))
-    #     EN.tagger.tag_from_strings(example, 'ART NN VVFIN APPR ART NN $.'.split(' '))
-    #     example.from_array([HEAD, DEP],
-    #                        numpy.asarray(
-    #                        [
-    #                            [1, det],
-    #                            [4, nsubj],
-    #                            [-1, prep],
-    #                            [1, det],
-    #                            [-2, pobj],
-    #                            [0, root],
-    #                            [-1, punct]
-    #                        ], dtype='int32'))
-    #     return example
+    @pytest.fixture(scope="class")
+    def ex1_de(self, DE):
+        example = DE.tokenizer.tokens_from_list('Eine Tasse steht auf dem Tisch .'.split(' '))
+        DE.tagger.tag_from_strings(example, 'ART NN VVFIN APPR ART NN $.'.split(' '))
+        nk,sb,root,mo,punct = tuple( DE.vocab.strings[l] for l in ['nk','sb','root','mo','punct'])
+        example.from_array([HEAD, DEP],
+                           numpy.asarray(
+                           [
+                               [1, nk],
+                               [1, sb],
+                               [0, root],
+                               [-1, mo],
+                               [1, nk],
+                               [-2, nk],
+                               [-3, punct]
+                           ], dtype='int32'))
+        return example
+
+    @pytest.fixture(scope="class")
+    def ex2_de(self, DE):
+        example = DE.tokenizer.tokens_from_list('Die Sängerin singt mit einer Tasse Kaffee Arien .'.split(' '))
+        DE.tagger.tag_from_strings(example, 'ART NN VVFIN APPR ART NN NN NN $.'.split(' '))
+        nk,sb,root,mo,punct,oa = tuple( DE.vocab.strings[l] for l in ['nk','sb','root','mo','punct','oa'])
+        example.from_array([HEAD, DEP],
+                           numpy.asarray(
+                           [
+                               [1, nk],
+                               [1, sb],
+                               [0, root],
+                               [-1, mo],
+                               [1, nk],
+                               [-2, nk],
+                               [-1, nk],
+                               [-5, oa],
+                               [-6, punct]
+                           ], dtype='int32'))
+        return example

     def test_en_standard_chunk(self, ex1_en):
         chunks = list(ex1_en.noun_chunks)
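
In these fixtures the HEAD column stores each token's head as an offset relative to the token's own position (0 marks the root), and DEP holds the integer label ID. So in ex1_de, [1, nk] on 'Eine' attaches it one token to the right, to 'Tasse', and [-3, punct] hangs the final period on 'steht'. A standalone sketch that decodes those rows into absolute arcs, with labels kept as plain strings for readability:

words = 'Eine Tasse steht auf dem Tisch .'.split(' ')
rows = [(1, 'nk'), (1, 'sb'), (0, 'root'), (-1, 'mo'),
        (1, 'nk'), (-2, 'nk'), (-3, 'punct')]

# The absolute head of token i is i + offset; offset 0 marks the root.
for i, (offset, dep) in enumerate(rows):
    print('%-6s -%s-> %s' % (words[i], dep, words[i + offset]))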
@@ -98,3 +123,16 @@ class TestNounChunks:
         assert len(chunks) == 2
         assert chunks[0].string == 'A phrase '
         assert chunks[1].string == 'another phrase '
+
+    def test_de_standard_chunk(self, ex1_de):
+        chunks = list(ex1_de.noun_chunks)
+        assert len(chunks) == 2
+        assert chunks[0].string == 'Eine Tasse '
+        assert chunks[1].string == 'dem Tisch '
+
+    def test_de_extended_chunk(self, ex2_de):
+        chunks = list(ex2_de.noun_chunks)
+        assert len(chunks) == 3
+        assert chunks[0].string == 'Die Sängerin '
+        assert chunks[1].string == 'einer Tasse Kaffee '
+        assert chunks[2].string == 'Arien '
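
The expected strings keep their trailing space because span.string includes each token's trailing whitespace. End to end, the behaviour under test looks roughly like the sketch below; loading German via spacy.de.German with model data installed is an assumption on my part, mirroring the DE fixture these tests rely on:

from spacy.de import German

nlp = German()   # assumes German model data is installed
doc = nlp(u'Die Sängerin singt mit einer Tasse Kaffee Arien .')

# With the fixed iterator every token belongs to at most one chunk, so
# 'Kaffee' appears inside 'einer Tasse Kaffee' but never as its own chunk.
for chunk in doc.noun_chunks:
    print(chunk.string)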