reformulate noun chunk tests for English

This commit is contained in:
Wolfgang Seeker 2016-05-03 14:24:35 +02:00
parent 1786331cd8
commit 7b246c13cb
7 changed files with 111 additions and 21 deletions

View File

@ -382,6 +382,7 @@ cpdef enum symbol_t:

View File

@ -381,6 +381,7 @@ IDS = {
"cc": cc,
"ccomp": ccomp,
"complm": complm,
"compound": compound,
"conj": conj,
"csubj": csubj,
"csubjpass": csubjpass,

View File

@ -225,6 +225,11 @@ cdef class Parser:
def step_through(self, Doc doc):
    """Return a StepwiseState constructed from this parser and ``doc``."""
    stepwise = StepwiseState(self, doc)
    return stepwise
def from_transition_sequence(self, Doc doc, sequence):
with self.step_through(doc) as stepwise:
for transition in sequence:
def add_label(self, label):
    """Add ``label`` to ``self.moves`` once for each of its action types.

    Calls ``self.moves.add_action(action_type, label)`` for every entry of
    ``self.moves.action_types``, in iteration order.
    """
    for action_type in self.moves.action_types:
        self.moves.add_action(action_type, label)

View File

@ -7,7 +7,7 @@ import spacy
def EN():
    """Load and return the spaCy pipeline registered under the name "en"."""
    # NOTE(review): tests request this by argument name, so it is presumably
    # registered as a pytest fixture; the decorator is not visible here.
    english = spacy.load("en")
    return english
def DE():
    """Load and return the spaCy pipeline registered under the name "de"."""
    # NOTE(review): presumably registered as a pytest fixture like ``EN``;
    # the decorator is not visible here.
    german = spacy.load("de")
    return german

View File

@ -2,30 +2,30 @@ from __future__ import unicode_literals
import pytest
def test_nsubj(EN):
    """The sentence must yield exactly one noun chunk, 'A base phrase '."""
    doc = EN(u'A base phrase should be recognized.')
    chunks = list(doc.noun_chunks)
    # Check the count first so a wrong number of chunks fails clearly.
    assert len(chunks) == 1
    assert chunks[0].string == 'A base phrase '
# @pytest.mark.models
# def test_nsubj(EN):
# sent = EN(u'A base phrase should be recognized.')
# base_nps = list(sent.noun_chunks)
# assert len(base_nps) == 1
# assert base_nps[0].string == 'A base phrase '
def test_coord(EN):
    """Two coordinated noun phrases must come back as two separate chunks."""
    doc = EN(u'A base phrase and a good phrase are often the same.')
    chunks = list(doc.noun_chunks)
    # Check the count first so a wrong number of chunks fails clearly.
    assert len(chunks) == 2
    assert chunks[0].string == 'A base phrase '
    assert chunks[1].string == 'a good phrase '
# @pytest.mark.models
# def test_coord(EN):
# sent = EN(u'A base phrase and a good phrase are often the same.')
# base_nps = list(sent.noun_chunks)
# assert len(base_nps) == 2
# assert base_nps[0].string == 'A base phrase '
# assert base_nps[1].string == 'a good phrase '
def test_pp(EN):
    """A noun phrase and the noun phrase inside its PP are separate chunks."""
    doc = EN(u'A phrase with another phrase occurs')
    chunks = list(doc.noun_chunks)
    # Check the count first so a wrong number of chunks fails clearly.
    assert len(chunks) == 2
    assert chunks[0].string == 'A phrase '
    assert chunks[1].string == 'another phrase '
# @pytest.mark.models
# def test_pp(EN):
# sent = EN(u'A phrase with another phrase occurs')
# base_nps = list(sent.noun_chunks)
# assert len(base_nps) == 2
# assert base_nps[0].string == 'A phrase '
# assert base_nps[1].string == 'another phrase '

View File

View File

@ -0,0 +1,83 @@
from __future__ import unicode_literals
import pytest
import numpy
from spacy.attrs import HEAD, DEP
from spacy.symbols import root, det, compound, nsubjpass, aux, auxpass, punct, nsubj, cc, amod, conj, advmod, attr, prep, pobj
class TestNounChunks:
    """Noun-chunk tests for English that run on hand-written annotations.

    Each ``ex*_en`` fixture builds a document from a whitespace-split
    sentence, assigns the listed tags with ``tag_from_strings`` and writes
    one ``[HEAD, DEP]`` row per token with ``from_array``. The tests then
    only inspect ``noun_chunks`` of the resulting document.

    The head column appears to hold offsets relative to the token itself
    (the root row uses 0) -- TODO confirm against ``from_array``.
    """

    @pytest.fixture
    def ex1_en(self, EN):
        """Annotated document for 'A base phrase should be recognized .'."""
        example = EN.tokenizer.tokens_from_list('A base phrase should be recognized .'.split(' '))
        EN.tagger.tag_from_strings(example, 'DT NN NN MD VB VBN .'.split(' '))
        # One [head, dep] row per token, in token order.
        example.from_array([HEAD, DEP], numpy.asarray([
            [2, det],
            [1, compound],
            [3, nsubjpass],
            [2, aux],
            [1, auxpass],
            [0, root],
            [-1, punct],
        ], dtype='int32'))
        return example

    @pytest.fixture
    def ex2_en(self, EN):
        """Annotated document for the coordinated-subject sentence."""
        example = EN.tokenizer.tokens_from_list('A base phrase and a good phrase are often the same .'.split(' '))
        EN.tagger.tag_from_strings(example, 'DT NN NN CC DT JJ NN VBP RB DT JJ .'.split(' '))
        # One [head, dep] row per token, in token order.
        # NOTE(review): the fifth row attaches 'a' to 'good' (offset 1)
        # rather than to 'phrase' (offset 2) -- confirm this is intended.
        example.from_array([HEAD, DEP], numpy.asarray([
            [2, det],
            [1, compound],
            [5, nsubj],
            [-1, cc],
            [1, det],
            [1, amod],
            [-4, conj],
            [0, root],
            [-1, advmod],
            [1, det],
            [-3, attr],
            [-4, punct],
        ], dtype='int32'))
        return example

    @pytest.fixture
    def ex3_en(self, EN):
        """Annotated document for 'A phrase with another phrase occurs .'."""
        example = EN.tokenizer.tokens_from_list('A phrase with another phrase occurs .'.split(' '))
        EN.tagger.tag_from_strings(example, 'DT NN IN DT NN VBZ .'.split(' '))
        # One [head, dep] row per token, in token order.
        example.from_array([HEAD, DEP], numpy.asarray([
            [1, det],
            [4, nsubj],
            [-1, prep],
            [1, det],
            [-2, pobj],
            [0, root],
            [-1, punct],
        ], dtype='int32'))
        return example

    def test_standard_chunk(self, ex1_en):
        """A determiner + compound + noun subject forms a single chunk."""
        chunks = list(ex1_en.noun_chunks)
        assert len(chunks) == 1
        assert chunks[0].string == 'A base phrase '

    def test_coordinated_chunks(self, ex2_en):
        """Both conjuncts of a coordinated subject are returned as chunks."""
        chunks = list(ex2_en.noun_chunks)
        assert len(chunks) == 2
        assert chunks[0].string == 'A base phrase '
        assert chunks[1].string == 'a good phrase '

    def test_pp_chunks(self, ex3_en):
        """The subject and the prepositional object are separate chunks."""
        chunks = list(ex3_en.noun_chunks)
        assert len(chunks) == 2
        assert chunks[0].string == 'A phrase '
        assert chunks[1].string == 'another phrase '