spaCy/spacy/tests/parser/test_preset_sbd.py

'''Test that the parser respects preset sentence boundaries.'''
from __future__ import unicode_literals
import pytest
from thinc.neural.optimizers import Adam
from thinc.neural.ops import NumpyOps

from ...attrs import NORM
from ...gold import GoldParse
from ...vocab import Vocab
from ...tokens import Doc
from ...pipeline import DependencyParser

@pytest.fixture
def vocab():
    '''Return a Vocab whose NORM lexical attribute getter is the identity.'''
    def keep_string(string):
        # NORM getter: hand the input string back unchanged.
        return string

    return Vocab(lex_attr_getters={NORM: keep_string})

@pytest.fixture
def parser(vocab):
    '''Return a DependencyParser updated ten times on one toy example.

    The example is the four-word doc 'a b c d' with gold heads [1, 1, 3, 3]
    and gold deps ['left', 'ROOT', 'left', 'ROOT'].
    '''
    words = ['a', 'b', 'c', 'd']
    heads = [1, 1, 3, 3]
    deps = ['left', 'ROOT', 'left', 'ROOT']

    model = DependencyParser(vocab)
    model.cfg['token_vector_width'] = 4
    model.cfg['hidden_width'] = 32
    # 'left' is the only label added explicitly.
    model.add_label('left')
    model.begin_training([], **model.cfg)
    optimizer = Adam(NumpyOps(), 0.001)

    for _ in range(10):
        # Every update gets its own Doc, GoldParse and losses dict.
        example = Doc(vocab, words=list(words))
        target = GoldParse(example, heads=list(heads), deps=list(deps))
        model.update([example], [target], sgd=optimizer, losses={})
    return model

def test_no_sentences(parser):
    '''With no sentence starts preset, parsing yields at least one sentence.'''
    words = ['a', 'b', 'c', 'd']
    parsed = parser(Doc(parser.vocab, words=words))
    sentences = list(parsed.sents)
    assert len(sentences) >= 1


def test_sents_1(parser):
    '''Check sentence counts when token 2 is preset as a sentence start.'''
    words = ('a', 'b', 'c', 'd')

    # Only token 2 is preset: expect at least two sentences.
    partial = Doc(parser.vocab, words=list(words))
    partial[2].sent_start = True
    partial = parser(partial)
    assert len(list(partial.sents)) >= 2

    # Tokens 1-3 are all preset (only token 2 starts a sentence):
    # expect exactly two sentences.
    preset = Doc(parser.vocab, words=list(words))
    for index, starts_sentence in ((1, False), (2, True), (3, False)):
        preset[index].sent_start = starts_sentence
    preset = parser(preset)
    assert len(list(preset.sents)) == 2


def test_sents_1_2(parser):
    '''With tokens 1 and 2 preset as sentence starts, expect >= 3 sentences.'''
    words = ['a', 'b', 'c', 'd']
    doc = Doc(parser.vocab, words=words)
    for index in (1, 2):
        doc[index].sent_start = True
    parsed = parser(doc)
    sentence_count = len(list(parsed.sents))
    assert sentence_count >= 3


def test_sents_1_3(parser):
    '''Check sentence counts when tokens 1 and 3 are preset as sentence starts.'''
    words = ('a', 'b', 'c', 'd')

    # Token 2 left unset: expect at least three sentences.
    partial = Doc(parser.vocab, words=list(words))
    for index in (1, 3):
        partial[index].sent_start = True
    partial = parser(partial)
    assert len(list(partial.sents)) >= 3

    # Token 2 explicitly marked as not a sentence start:
    # expect exactly three sentences.
    preset = Doc(parser.vocab, words=list(words))
    for index, starts_sentence in ((1, True), (2, False), (3, True)):
        preset[index].sent_start = starts_sentence
    preset = parser(preset)
    assert len(list(preset.sents)) == 3
Add tests for sentence segmentation presetting 2017-10-09 01:02:23 +03:00			`'''Test that the parser respects preset sentence boundaries.'''`
Fix string-to-unicode problem 2017-10-09 01:59:49 +03:00			`from __future__ import unicode_literals`
Add tests for sentence segmentation presetting 2017-10-09 01:02:23 +03:00			`import pytest`
			`from thinc.neural.optimizers import Adam`
			`from thinc.neural.ops import NumpyOps`

			`from ...attrs import NORM`
			`from ...gold import GoldParse`
			`from ...vocab import Vocab`
			`from ...tokens import Doc`
Fix names of pipeline components NeuralDependencyParser --> DependencyParser NeuralEntityRecognizer --> EntityRecognizer TokenVectorEncoder --> Tensorizer NeuralLabeller --> MultitaskObjective 2017-10-26 13:38:23 +03:00			`from ...pipeline import DependencyParser`
Add tests for sentence segmentation presetting 2017-10-09 01:02:23 +03:00
			`@pytest.fixture`
			`def vocab():`
			`return Vocab(lex_attr_getters={NORM: lambda s: s})`

			`@pytest.fixture`
			`def parser(vocab):`
Fix names of pipeline components NeuralDependencyParser --> DependencyParser NeuralEntityRecognizer --> EntityRecognizer TokenVectorEncoder --> Tensorizer NeuralLabeller --> MultitaskObjective 2017-10-26 13:38:23 +03:00			`parser = DependencyParser(vocab)`
Add tests for sentence segmentation presetting 2017-10-09 01:02:23 +03:00			`parser.cfg['token_vector_width'] = 4`
			`parser.cfg['hidden_width'] = 32`
			`#parser.add_label('right')`
			`parser.add_label('left')`
			`parser.begin_training([], **parser.cfg)`
			`sgd = Adam(NumpyOps(), 0.001)`

			`for i in range(10):`
			`losses = {}`
			`doc = Doc(vocab, words=['a', 'b', 'c', 'd'])`
			`gold = GoldParse(doc, heads=[1, 1, 3, 3],`
			`deps=['left', 'ROOT', 'left', 'ROOT'])`
			`parser.update([doc], [gold], sgd=sgd, losses=losses)`
			`return parser`

			`def test_no_sentences(parser):`
			`doc = Doc(parser.vocab, words=['a', 'b', 'c', 'd'])`
			`doc = parser(doc)`
Fix SBD test 2017-10-12 22:18:22 +03:00			`assert len(list(doc.sents)) >= 1`
Add tests for sentence segmentation presetting 2017-10-09 01:02:23 +03:00

			`def test_sents_1(parser):`
			`doc = Doc(parser.vocab, words=['a', 'b', 'c', 'd'])`
			`doc[2].sent_start = True`
			`doc = parser(doc)`
Fix test 2017-10-09 01:29:37 +03:00			`assert len(list(doc.sents)) >= 2`
Add tests for sentence segmentation presetting 2017-10-09 01:02:23 +03:00			`doc = Doc(parser.vocab, words=['a', 'b', 'c', 'd'])`
			`doc[1].sent_start = False`
			`doc[2].sent_start = True`
			`doc[3].sent_start = False`
			`doc = parser(doc)`
			`assert len(list(doc.sents)) == 2`


			`def test_sents_1_2(parser):`
			`doc = Doc(parser.vocab, words=['a', 'b', 'c', 'd'])`
			`doc[1].sent_start = True`
			`doc[2].sent_start = True`
			`doc = parser(doc)`
Make test less flakey 2017-11-03 16:36:08 +03:00			`assert len(list(doc.sents)) >= 3`
Add tests for sentence segmentation presetting 2017-10-09 01:02:23 +03:00

			`def test_sents_1_3(parser):`
			`doc = Doc(parser.vocab, words=['a', 'b', 'c', 'd'])`
			`doc[1].sent_start = True`
			`doc[3].sent_start = True`
			`doc = parser(doc)`
Fix failing test 2017-10-11 09:38:34 +03:00			`assert len(list(doc.sents)) >= 3`
Add tests for sentence segmentation presetting 2017-10-09 01:02:23 +03:00			`doc = Doc(parser.vocab, words=['a', 'b', 'c', 'd'])`
			`doc[1].sent_start = True`
			`doc[2].sent_start = False`
			`doc[3].sent_start = True`
			`doc = parser(doc)`
			`assert len(list(doc.sents)) == 3`