from __future__ import unicode_literals

import pytest

from spacy.tokens import Doc
from spacy.syntax.nonproj import PseudoProjectivity


@pytest.mark.models
def test_single_period(EN):
    string = 'A test sentence.'
    words = EN(string)
    assert len(words) == 4
    assert len(list(words.sents)) == 1
    assert sum(len(sent) for sent in words.sents) == len(words)


@pytest.mark.models
def test_single_no_period(EN):
    string = 'A test sentence'
    words = EN(string)
    assert len(words) == 3
    assert len(list(words.sents)) == 1
    assert sum(len(sent) for sent in words.sents) == len(words)


@pytest.mark.models
def test_single_exclamation(EN):
    string = 'A test sentence!'
    words = EN(string)
    assert len(words) == 4
    assert len(list(words.sents)) == 1
    assert sum(len(sent) for sent in words.sents) == len(words)


@pytest.mark.models
def test_single_question(EN):
    string = 'A test sentence?'
    words = EN(string, tag=False, parse=True)
    assert len(words) == 4
    assert len(list(words.sents)) == 1
    assert sum(len(sent) for sent in words.sents) == len(words)
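

# The first three single-sentence tests above differ only in the trailing
# punctuation; a minimal sketch of how they could be collapsed with
# pytest.mark.parametrize (test_single_question stays separate because it
# calls EN with explicit tag/parse flags):
#
# @pytest.mark.models
# @pytest.mark.parametrize('string,n_tokens', [
#     ('A test sentence.', 4),
#     ('A test sentence', 3),
#     ('A test sentence!', 4),
# ])
# def test_single_sentence(EN, string, n_tokens):
#     words = EN(string)
#     assert len(words) == n_tokens
#     assert len(list(words.sents)) == 1
#     assert sum(len(sent) for sent in words.sents) == len(words)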


@pytest.mark.models
def test_sentence_breaks(EN):
    doc = EN.tokenizer.tokens_from_list(u'This is a sentence . This is another one .'.split(' '))
    EN.tagger(doc)
    # step through a hand-built projective parse; a sentence break ('B-ROOT')
    # must remain a valid move at every point of the derivation
    with EN.parser.step_through(doc) as stepwise:
        assert EN.parser.moves.is_valid(stepwise.stcls, 'B-ROOT')
        # first sentence: 'This is a sentence .'
        stepwise.transition('L-nsubj')
        assert EN.parser.moves.is_valid(stepwise.stcls, 'B-ROOT')
        stepwise.transition('S')
        assert EN.parser.moves.is_valid(stepwise.stcls, 'B-ROOT')
        stepwise.transition('L-det')
        assert EN.parser.moves.is_valid(stepwise.stcls, 'B-ROOT')
        stepwise.transition('R-attr')
        assert EN.parser.moves.is_valid(stepwise.stcls, 'B-ROOT')
        stepwise.transition('D')
        assert EN.parser.moves.is_valid(stepwise.stcls, 'B-ROOT')
        stepwise.transition('R-punct')
        assert EN.parser.moves.is_valid(stepwise.stcls, 'B-ROOT')
        # sentence break, then second sentence: 'This is another one .'
        stepwise.transition('B-ROOT')
        assert EN.parser.moves.is_valid(stepwise.stcls, 'B-ROOT')
        stepwise.transition('L-nsubj')
        assert EN.parser.moves.is_valid(stepwise.stcls, 'B-ROOT')
        stepwise.transition('S')
        assert EN.parser.moves.is_valid(stepwise.stcls, 'B-ROOT')
        stepwise.transition('L-attr')
        assert EN.parser.moves.is_valid(stepwise.stcls, 'B-ROOT')
        stepwise.transition('R-attr')
        assert EN.parser.moves.is_valid(stepwise.stcls, 'B-ROOT')
        stepwise.transition('D')
        assert EN.parser.moves.is_valid(stepwise.stcls, 'B-ROOT')
        stepwise.transition('R-punct')
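    # after the derivation: two sentences, every non-space token attached,
    # and head indices matching the parse built above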
    assert len(list(doc.sents)) == 2
    for tok in doc:
        assert tok.dep != 0 or tok.is_space
    assert [tok.head.i for tok in doc] == [1, 1, 3, 1, 1, 6, 6, 8, 6, 6]


def apply_transition_sequence(model, doc, sequence):
    with model.parser.step_through(doc) as stepwise:
        for transition in sequence:
            stepwise.transition(transition)
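
# Usage sketch for the helper above, reusing the first-sentence derivation
# from test_sentence_breaks (the doc must already be tagged):
#
#     doc = EN.tokenizer.tokens_from_list(u'This is a sentence .'.split(' '))
#     EN.tagger(doc)
#     apply_transition_sequence(EN, doc, ['L-nsubj', 'S', 'L-det', 'R-attr', 'D', 'R-punct'])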


@pytest.mark.models
def test_sbd_serialization_projective(EN):
    """
    Test that the sentence boundaries are the same before and after serialization.
    """
    example = EN.tokenizer.tokens_from_list(u"I bought a couch from IKEA. It was n't very comfortable .".split(' '))
    EN.tagger(example)
    apply_transition_sequence(EN, example, ['L-nsubj', 'S', 'L-det', 'R-dobj', 'D', 'R-prep',
                                            'R-pobj', 'B-ROOT', 'L-nsubj', 'R-neg', 'D', 'S',
                                            'L-advmod', 'R-acomp', 'D', 'R-punct'])
    example_serialized = Doc(EN.vocab).from_bytes(example.to_bytes())
    assert example.to_bytes() == example_serialized.to_bytes()
    assert [s.text for s in example.sents] == [s.text for s in example_serialized.sents]


@pytest.mark.models
def test_sbd_empty_string(EN):
    '''Test Issue #309: SBD fails on empty string'''
    doc = EN(u' ')
    # mark the doc as parsed so that doc.sents can be iterated
    doc.is_parsed = True
    assert len(doc) == 1
    sents = list(doc.sents)
    assert len(sents) == 1


# TODO:
# @pytest.mark.models
# def test_sbd_serialization_nonprojective(DE):
#     """
#     Test that the sentence boundaries are the same before and after
#     serialization in a non-projective sentence.
#     """
#     example = DE.tokenizer.tokens_from_list(u"Den Mann hat Peter nicht gesehen . Er war zu langsam .".split(' '))
#     DE.tagger(example)
#     apply_transition_sequence(DE, example, ['L-nk', 'L-oa||oc', 'R-sb', 'D', 'S', 'L-ng', 'B-ROOT',
#                                             'L-nsubj', 'R-neg', 'D', 'S', 'L-advmod', 'R-acomp', 'D', 'R-punct'])
#     print([(t.dep_, t.head.i) for t in example])
#     example_serialized = Doc(DE.vocab).from_bytes(example.to_bytes())
#     assert example.to_bytes() == example_serialized.to_bytes()
#     assert [s.text for s in example.sents] == [s.text for s in example_serialized.sents]