Mirror of https://github.com/explosion/spaCy.git
German noun chunk iterator now doesn't return tokens more than once
commit a06fca9fdf
parent 7825b75548
@@ -382,7 +382,6 @@ cpdef enum symbol_t:
     cc
     ccomp
     complm
-    compound
     conj
     csubj
     csubjpass
@@ -381,7 +381,6 @@ IDS = {
     "cc": cc,
     "ccomp": ccomp,
     "complm": complm,
-    "compound": compound,
     "conj": conj,
     "csubj": csubj,
     "csubjpass": csubjpass,
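
The two hunks above drop the `compound` entry from the symbol enum and from the IDS mapping. The tests further down are adjusted to match: instead of importing the label IDs from spacy.symbols, they resolve them through the vocabulary's string store at run time. A minimal sketch of that lookup pattern; `EN` stands for a loaded English pipeline as in the test fixtures below, so the snippet is illustrative rather than runnable on its own.

    # Resolve dependency-label IDs through the StringStore instead of importing
    # them from spacy.symbols (mirrors the updated tests in this commit).
    # `EN` is assumed to be a loaded English pipeline, as in the fixtures.
    labels = ['det', 'compound', 'nsubjpass', 'aux', 'auxpass', 'root', 'punct']
    det, compound, nsubjpass, aux, auxpass, root, punct = (
        EN.vocab.strings[label] for label in labels)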
@@ -32,7 +32,9 @@ def german_noun_chunks(doc):
     np_deps = set(doc.vocab.strings[label] for label in labels)
     close_app = doc.vocab.strings['nk']
 
-    for word in doc:
+    i = 0
+    while i < len(doc):
+        word = doc[i]
         if word.pos == NOUN and word.dep in np_deps:
             rbracket = word.i+1
             # try to extend the span to the right
@@ -40,7 +42,9 @@ def german_noun_chunks(doc):
             for rdep in doc[word.i].rights:
                 if rdep.pos == NOUN and rdep.dep == close_app:
                     rbracket = rdep.i+1
-            yield word.l_edge, rbracket, np_label
+            yield word.left_edge.i, rbracket, np_label
+            i = rbracket
+            continue
+        i += 1
 
 CHUNKERS = {'en': english_noun_chunks, 'de': german_noun_chunks}
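
Assembled from the two hunks above, the iterator reads roughly as follows after the patch. The `labels` list, the `np_label` lookup, and the NOUN import come from the surrounding module and are assumptions here, not part of this diff; the point of the sketch is that `i = rbracket` makes the loop resume after the span it just yielded, so no token is returned as part of more than one chunk.

    from spacy.parts_of_speech import NOUN  # assumed import from the surrounding module

    def german_noun_chunks(doc):
        # Dependency labels that may head a chunk (assumed; not shown in the hunks)
        labels = ['sb', 'oa', 'da', 'nk', 'mo', 'ag', 'ng', 'app']
        np_label = doc.vocab.strings['NP']  # assumed
        np_deps = set(doc.vocab.strings[label] for label in labels)
        close_app = doc.vocab.strings['nk']

        i = 0
        while i < len(doc):
            word = doc[i]
            if word.pos == NOUN and word.dep in np_deps:
                rbracket = word.i + 1
                # try to extend the span to the right to cover close appositions
                for rdep in doc[word.i].rights:
                    if rdep.pos == NOUN and rdep.dep == close_app:
                        rbracket = rdep.i + 1
                yield word.left_edge.i, rbracket, np_label
                i = rbracket  # resume after the emitted span, so no token is yielded twice
                continue
            i += 1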
@@ -1,10 +1,11 @@
+# -*- coding: utf-8 -*-
+
 from __future__ import unicode_literals
 
 import pytest
 import numpy
 
 from spacy.attrs import HEAD, DEP
-from spacy.symbols import root, det, compound, nsubjpass, aux, auxpass, punct, nsubj, cc, amod, conj, advmod, attr, prep, pobj
 
 
 @pytest.mark.models
@@ -13,6 +14,7 @@ class TestNounChunks:
     def ex1_en(self, EN):
         example = EN.tokenizer.tokens_from_list('A base phrase should be recognized .'.split(' '))
         EN.tagger.tag_from_strings(example, 'DT NN NN MD VB VBN .'.split(' '))
+        det,compound,nsubjpass,aux,auxpass,root,punct = tuple( EN.vocab.strings[l] for l in ['det','compound','nsubjpass','aux','auxpass','root','punct'] )
         example.from_array([HEAD, DEP],
             numpy.asarray(
             [
@@ -30,6 +32,7 @@ class TestNounChunks:
     def ex2_en(self, EN):
         example = EN.tokenizer.tokens_from_list('A base phrase and a good phrase are often the same .'.split(' '))
         EN.tagger.tag_from_strings(example, 'DT NN NN CC DT JJ NN VBP RB DT JJ .'.split(' '))
+        det,compound,nsubj,cc,amod,conj,root,advmod,attr,punct = tuple( EN.vocab.strings[l] for l in ['det','compound','nsubj','cc','amod','conj','root','advmod','attr','punct'] )
         example.from_array([HEAD, DEP],
             numpy.asarray(
             [
@@ -52,6 +55,7 @@ class TestNounChunks:
     def ex3_en(self, EN):
         example = EN.tokenizer.tokens_from_list('A phrase with another phrase occurs .'.split(' '))
         EN.tagger.tag_from_strings(example, 'DT NN IN DT NN VBZ .'.split(' '))
+        det,nsubj,prep,pobj,root,punct = tuple( EN.vocab.strings[l] for l in ['det','nsubj','prep','pobj','root','punct'] )
         example.from_array([HEAD, DEP],
             numpy.asarray(
             [
@@ -65,22 +69,43 @@ class TestNounChunks:
             ], dtype='int32'))
         return example
 
-    # @pytest.fixture(score="class")
-    # def ex1_de(self, DE):
-    #     example = EN.tokenizer.tokens_from_list('Eine Tasse steht auf dem Tisch .'.split(' '))
-    #     EN.tagger.tag_from_strings(example, 'ART NN VVFIN APPR ART NN $.'.split(' '))
-    #     example.from_array([HEAD, DEP],
-    #         numpy.asarray(
-    #         [
-    #             [1, det],
-    #             [4, nsubj],
-    #             [-1, prep],
-    #             [1, det],
-    #             [-2, pobj],
-    #             [0, root],
-    #             [-1, punct]
-    #         ], dtype='int32'))
-    #     return example
+    @pytest.fixture(scope="class")
+    def ex1_de(self, DE):
+        example = DE.tokenizer.tokens_from_list('Eine Tasse steht auf dem Tisch .'.split(' '))
+        DE.tagger.tag_from_strings(example, 'ART NN VVFIN APPR ART NN $.'.split(' '))
+        nk,sb,root,mo,punct = tuple( DE.vocab.strings[l] for l in ['nk','sb','root','mo','punct'])
+        example.from_array([HEAD, DEP],
+            numpy.asarray(
+            [
+                [1, nk],
+                [1, sb],
+                [0, root],
+                [-1, mo],
+                [1, nk],
+                [-2, nk],
+                [-3, punct]
+            ], dtype='int32'))
+        return example
+
+    @pytest.fixture(scope="class")
+    def ex2_de(self, DE):
+        example = DE.tokenizer.tokens_from_list('Die Sängerin singt mit einer Tasse Kaffee Arien .'.split(' '))
+        DE.tagger.tag_from_strings(example, 'ART NN VVFIN APPR ART NN NN NN $.'.split(' '))
+        nk,sb,root,mo,punct,oa = tuple( DE.vocab.strings[l] for l in ['nk','sb','root','mo','punct','oa'])
+        example.from_array([HEAD, DEP],
+            numpy.asarray(
+            [
+                [1, nk],
+                [1, sb],
+                [0, root],
+                [-1, mo],
+                [1, nk],
+                [-2, nk],
+                [-1, nk],
+                [-5, oa],
+                [-6, punct]
+            ], dtype='int32'))
+        return example
 
     def test_en_standard_chunk(self, ex1_en):
         chunks = list(ex1_en.noun_chunks)
@@ -98,3 +123,16 @@ class TestNounChunks:
         assert len(chunks) == 2
         assert chunks[0].string == 'A phrase '
         assert chunks[1].string == 'another phrase '
+
+    def test_de_standard_chunk(self, ex1_de):
+        chunks = list(ex1_de.noun_chunks)
+        assert len(chunks) == 2
+        assert chunks[0].string == 'Eine Tasse '
+        assert chunks[1].string == 'dem Tisch '
+
+    def test_de_extended_chunk(self, ex2_de):
+        chunks = list(ex2_de.noun_chunks)
+        assert len(chunks) == 3
+        assert chunks[0].string == 'Die Sängerin '
+        assert chunks[1].string == 'einer Tasse Kaffee '
+        assert chunks[2].string == 'Arien '
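
The new fixtures and tests exercise the German chunker through the same Doc.noun_chunks property as the English tests, wired up via the CHUNKERS mapping above. A usage sketch, assuming a German model is installed; the loading call is illustrative and the exact loader API depends on the spaCy version.

    import spacy

    # Illustrative loading call; the exact API for obtaining a German pipeline
    # depends on the installed spaCy version and model.
    nlp = spacy.load('de')

    doc = nlp('Die Sängerin singt mit einer Tasse Kaffee Arien .')
    for chunk in doc.noun_chunks:
        # With this fix each token belongs to at most one chunk, e.g.
        # 'Die Sängerin ', 'einer Tasse Kaffee ', 'Arien '.
        print(chunk.string)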