remove old tests for sentence boundary detection

Wolfgang Seeker 2016-05-02 14:36:35 +02:00
parent fa961ea694
commit b11cbb06c6
2 changed files with 53 additions and 169 deletions

File 1 of 2 (filename not shown):

@@ -25,74 +25,3 @@ def apply_transition_sequence(model, doc, sequence):
     with model.parser.step_through(doc) as stepwise:
         for transition in sequence:
             stepwise.transition(transition)
-
-
-@pytest.mark.models
-def test_arc_eager_finalize_state(EN):
-    # right branching
-    example = EN.tokenizer.tokens_from_list(u"a b c d e".split(' '))
-    apply_transition_sequence(EN, example, ['R-nsubj','D','R-nsubj','R-nsubj','D','R-ROOT'])
-
-    assert example[0].n_lefts == 0
-    assert example[0].n_rights == 2
-    assert example[0].left_edge.i == 0
-    assert example[0].right_edge.i == 3
-    assert example[0].head.i == 0
-
-    assert example[1].n_lefts == 0
-    assert example[1].n_rights == 0
-    assert example[1].left_edge.i == 1
-    assert example[1].right_edge.i == 1
-    assert example[1].head.i == 0
-
-    assert example[2].n_lefts == 0
-    assert example[2].n_rights == 1
-    assert example[2].left_edge.i == 2
-    assert example[2].right_edge.i == 3
-    assert example[2].head.i == 0
-
-    assert example[3].n_lefts == 0
-    assert example[3].n_rights == 0
-    assert example[3].left_edge.i == 3
-    assert example[3].right_edge.i == 3
-    assert example[3].head.i == 2
-
-    assert example[4].n_lefts == 0
-    assert example[4].n_rights == 0
-    assert example[4].left_edge.i == 4
-    assert example[4].right_edge.i == 4
-    assert example[4].head.i == 4
-
-    # left branching
-    example = EN.tokenizer.tokens_from_list(u"a b c d e".split(' '))
-    apply_transition_sequence(EN, example, ['S','L-nsubj','L-ROOT','S','L-nsubj','L-nsubj'])
-
-    assert example[0].n_lefts == 0
-    assert example[0].n_rights == 0
-    assert example[0].left_edge.i == 0
-    assert example[0].right_edge.i == 0
-    assert example[0].head.i == 0
-
-    assert example[1].n_lefts == 0
-    assert example[1].n_rights == 0
-    assert example[1].left_edge.i == 1
-    assert example[1].right_edge.i == 1
-    assert example[1].head.i == 2
-
-    assert example[2].n_lefts == 1
-    assert example[2].n_rights == 0
-    assert example[2].left_edge.i == 1
-    assert example[2].right_edge.i == 2
-    assert example[2].head.i == 4
-
-    assert example[3].n_lefts == 0
-    assert example[3].n_rights == 0
-    assert example[3].left_edge.i == 3
-    assert example[3].right_edge.i == 3
-    assert example[3].head.i == 4
-
-    assert example[4].n_lefts == 2
-    assert example[4].n_rights == 0
-    assert example[4].left_edge.i == 1
-    assert example[4].right_edge.i == 4
-    assert example[4].head.i == 4

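A note for reading the transition sequences in these tests: 'S' is Shift, 'D' is Reduce, 'L-<label>' and 'R-<label>' are the arc-eager Left-Arc and Right-Arc moves with the given dependency label, and 'B-<label>' is the Break move that marks a sentence boundary. The sketch below decodes a sequence for reading along; MOVES and describe are hypothetical helpers written for this note, not spaCy API.

    # Decode the arc-eager transition mnemonics used in these tests.
    # MOVES and describe() are illustrative only, not part of spaCy.
    MOVES = {
        'S': 'Shift',      # push the next buffer token onto the stack
        'D': 'Reduce',     # pop the stack top once it has a head
        'L': 'Left-Arc',   # make the stack top a dependent of the buffer front
        'R': 'Right-Arc',  # make the buffer front a dependent of the stack top
        'B': 'Break',      # mark a sentence boundary
    }

    def describe(sequence):
        for transition in sequence:
            move, _, label = transition.partition('-')
            print('%-9s %s' % (MOVES[move], label))

    # For example, the left-branching sequence from the removed test above:
    describe(['S', 'L-nsubj', 'L-ROOT', 'S', 'L-nsubj', 'L-nsubj'])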
File 2 of 2 (filename not shown):

@@ -2,6 +2,7 @@ from __future__ import unicode_literals
 import pytest
 from spacy.tokens import Doc
+from spacy.syntax.nonproj import PseudoProjectivity

 @pytest.mark.models
@@ -41,88 +42,42 @@ def test_single_question(EN):

 @pytest.mark.models
-def test_sentence_breaks_no_space(EN):
+def test_sentence_breaks(EN):
     doc = EN.tokenizer.tokens_from_list(u'This is a sentence . This is another one .'.split(' '))
     EN.tagger(doc)
     with EN.parser.step_through(doc) as stepwise:
-        # stack empty, automatic Shift (This)
         assert EN.parser.moves.is_valid(stepwise.stcls,'B-ROOT')
-        stepwise.transition('L-nsubj') # attach This
-        # stack empty, automatic Shift (is)
+        stepwise.transition('L-nsubj')
         assert EN.parser.moves.is_valid(stepwise.stcls,'B-ROOT')
-        stepwise.transition('S') # shift a
+        stepwise.transition('S')
         assert EN.parser.moves.is_valid(stepwise.stcls,'B-ROOT')
-        stepwise.transition('L-det') # attach a
-        assert not EN.parser.moves.is_valid(stepwise.stcls,'B-ROOT')
-        stepwise.transition('R-attr') # attach sentence
-        stepwise.transition('D') # remove sentence
-        assert not EN.parser.moves.is_valid(stepwise.stcls,'B-ROOT')
-        stepwise.transition('R-punct') # attach .
+        stepwise.transition('L-det')
         assert EN.parser.moves.is_valid(stepwise.stcls,'B-ROOT')
-        stepwise.transition('B-ROOT') # set sentence start on This
-        # automatic reduction of the stack, automatic Shift to start second sentence
+        stepwise.transition('R-attr')
         assert EN.parser.moves.is_valid(stepwise.stcls,'B-ROOT')
-        stepwise.transition('L-nsubj') # attach This
-        # stack empty, automatic Shift (is)
+        stepwise.transition('D')
         assert EN.parser.moves.is_valid(stepwise.stcls,'B-ROOT')
-        stepwise.transition('S') # shift another
+        stepwise.transition('R-punct')
         assert EN.parser.moves.is_valid(stepwise.stcls,'B-ROOT')
-        stepwise.transition('L-attr') # attach another
-        assert not EN.parser.moves.is_valid(stepwise.stcls,'B-ROOT')
-        stepwise.transition('R-attr') # attach one
+        stepwise.transition('B-ROOT')
         assert EN.parser.moves.is_valid(stepwise.stcls,'B-ROOT')
-        stepwise.transition('D') # remove one
-        assert not EN.parser.moves.is_valid(stepwise.stcls,'B-ROOT')
-        stepwise.transition('R-punct') # attach .
-        # buffer empty, automatic cleanup
+        stepwise.transition('L-nsubj')
+        assert EN.parser.moves.is_valid(stepwise.stcls,'B-ROOT')
+        stepwise.transition('S')
+        assert EN.parser.moves.is_valid(stepwise.stcls,'B-ROOT')
+        stepwise.transition('L-attr')
+        assert EN.parser.moves.is_valid(stepwise.stcls,'B-ROOT')
+        stepwise.transition('R-attr')
+        assert EN.parser.moves.is_valid(stepwise.stcls,'B-ROOT')
+        stepwise.transition('D')
+        assert EN.parser.moves.is_valid(stepwise.stcls,'B-ROOT')
+        stepwise.transition('R-punct')
     assert len(list(doc.sents)) == 2
     for tok in doc:
         assert tok.dep != 0 or tok.is_space
     assert [ tok.head.i for tok in doc ] == [1,1,3,1,1,6,6,8,6,6]
-
-
-@pytest.mark.models
-def test_sentence_breaks_with_space(EN):
-    doc = EN.tokenizer.tokens_from_list(u'\t This is \n a sentence \n \n . \n \t \n This is another \t one .'.split(' '))
-    EN.tagger(doc)
-    with EN.parser.step_through(doc) as stepwise:
-        # stack empty, automatic Shift (This)
-        assert EN.parser.moves.is_valid(stepwise.stcls,'B-ROOT')
-        stepwise.transition('L-nsubj') # attach This
-        # stack empty, automatic Shift (is)
-        assert EN.parser.moves.is_valid(stepwise.stcls,'B-ROOT')
-        stepwise.transition('S') # shift a
-        assert EN.parser.moves.is_valid(stepwise.stcls,'B-ROOT')
-        stepwise.transition('L-det') # attach a
-        assert not EN.parser.moves.is_valid(stepwise.stcls,'B-ROOT')
-        stepwise.transition('R-attr') # attach sentence
-        stepwise.transition('D') # remove sentence
-        assert not EN.parser.moves.is_valid(stepwise.stcls,'B-ROOT')
-        stepwise.transition('R-punct') # attach .
-        assert EN.parser.moves.is_valid(stepwise.stcls,'B-ROOT')
-        stepwise.transition('B-ROOT') # set sentence start on This
-        # automatic reduction of the stack, automatic Shift to start second sentence
-        assert EN.parser.moves.is_valid(stepwise.stcls,'B-ROOT')
-        stepwise.transition('L-nsubj') # attach This
-        # stack empty, automatic Shift (is)
-        assert EN.parser.moves.is_valid(stepwise.stcls,'B-ROOT')
-        stepwise.transition('S') # shift another
-        assert EN.parser.moves.is_valid(stepwise.stcls,'B-ROOT')
-        stepwise.transition('L-attr') # attach another
-        assert not EN.parser.moves.is_valid(stepwise.stcls,'B-ROOT')
-        stepwise.transition('R-attr') # attach one
-        assert EN.parser.moves.is_valid(stepwise.stcls,'B-ROOT')
-        stepwise.transition('D') # remove one
-        assert not EN.parser.moves.is_valid(stepwise.stcls,'B-ROOT')
-        stepwise.transition('R-punct') # attach .
-        # buffer empty, automatic cleanup
-    assert len(list(doc.sents)) == 2
-    for tok in doc:
-        assert tok.dep != 0 or tok.is_space
-    assert [ tok.head.i for tok in doc ] == [1,2,2,2,5,2,5,5,2,8,8,8,13,13,16,14,13,13]


 def apply_transition_sequence(model, doc, sequence):
     with model.parser.step_through(doc) as stepwise:
         for transition in sequence:
@@ -130,46 +85,46 @@ def apply_transition_sequence(model, doc, sequence):

 @pytest.mark.models
-def test_sbd_for_root_label_dependents(EN):
+def test_sbd_serialization_projective(EN):
     """
-    make sure that the parser properly introduces a sentence boundary without
-    the break transition by checking for dependents with the root label
-    """
-    example = EN.tokenizer.tokens_from_list(u"I saw a firefly It glowed".split(' '))
-    EN.tagger(example)
-    apply_transition_sequence(EN, example, ['L-nsubj','S','L-det','R-dobj','D','S','L-nsubj','R-ROOT'])
-
-    assert example[1].head.i == 1
-    assert example[5].head.i == 5
-
-    sents = list(example.sents)
-    assert len(sents) == 2
-    assert sents[1][0].orth_ == u'It'
-
-
-@pytest.mark.models
-def test_sbd_serialization(EN):
-    """
-    test that before and after serialization, the sentence boundaries are the same even
-    if the parser predicted two roots for the sentence that were made into two sentences
-    after parsing by arc_eager.finalize().
-    This is actually an interaction between the sentence boundary prediction and doc.from_array.
-    The process is the following: if the parser doesn't predict a sentence boundary but attaches
-    a word with the ROOT label, the second root node is made root of its own sentence after parsing.
-    During serialization, sentence boundary information is lost and reintroduced when the doc
-    is deserialized, by introducing sentence starts at the left edge of every root node.
-    BUG that is tested here: so far, the parser wasn't introducing a sentence start when
-    it introduced the second root node.
+    test that before and after serialization, the sentence boundaries are the same.
     """
     example = EN.tokenizer.tokens_from_list(u"I bought a couch from IKEA. It was n't very comfortable .".split(' '))
     EN.tagger(example)
-    apply_transition_sequence(EN, example, ['L-nsubj','S','L-det','R-dobj','D','R-prep','R-pobj','D','D','S','L-nsubj','R-ROOT','R-neg','D','S','L-advmod','R-acomp','D','R-punct'])
+    apply_transition_sequence(EN, example, ['L-nsubj','S','L-det','R-dobj','D','R-prep','R-pobj','B-ROOT','L-nsubj','R-neg','D','S','L-advmod','R-acomp','D','R-punct'])
     example_serialized = Doc(EN.vocab).from_bytes(example.to_bytes())
     assert example.to_bytes() == example_serialized.to_bytes()
     assert [s.text for s in example.sents] == [s.text for s in example_serialized.sents]
+
+# TODO:
+# @pytest.mark.models
+# def test_sbd_serialization_nonprojective(DE):
+#     """
+#     test that before and after serialization, the sentence boundaries are the same in a non-projective sentence.
+#     """
+#     example = DE.tokenizer.tokens_from_list(u"Den Mann hat Peter nicht gesehen . Er war zu langsam .".split(' '))
+#     DE.tagger(example)
+#     apply_transition_sequence(DE, example, ['L-nk','L-oa||oc','R-sb','D','S','L-ng','B-ROOT','L-nsubj','R-neg','D','S','L-advmod','R-acomp','D','R-punct'])
+#     print [(t.dep_,t.head.i) for t in example]
+#     example_serialized = Doc(DE.vocab).from_bytes(example.to_bytes())
+#     assert example.to_bytes() == example_serialized.to_bytes()
+#     assert [s.text for s in example.sents] == [s.text for s in example_serialized.sents]
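For context on the renamed test_sbd_serialization_projective: the property it pins down is that sentence boundaries survive a byte-level round trip. Below is a minimal standalone sketch of the same check; spacy.load('en') is an assumption standing in for however an English model is obtained in a given spaCy version, while Doc.to_bytes(), Doc.from_bytes(), and doc.sents are the same API the test itself uses.

    # Round-trip sketch: sentence boundaries must come out of
    # deserialization exactly as they went in.
    from __future__ import unicode_literals
    import spacy
    from spacy.tokens import Doc

    nlp = spacy.load('en')  # assumed way of obtaining an English model
    doc = nlp(u"I bought a couch from IKEA. It wasn't very comfortable.")

    # Deserialize into a fresh Doc backed by the same vocab.
    doc2 = Doc(nlp.vocab).from_bytes(doc.to_bytes())

    assert [s.text for s in doc.sents] == [s.text for s in doc2.sents]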