spaCy/spacy/tests/serialize/test_serialization.py

from __future__ import unicode_literals
import pytest

from spacy.tokens import Doc
import spacy.en
from spacy.serialize.packer import Packer


def equal(doc1, doc2):
    """Assert that two documents carry identical annotations.

    Checks are made in this order: token orth values, pos and tag values,
    head indices and dependency labels (plus sentence spans, but only when
    both documents report ``is_parsed``), and finally entity types, entity
    IOB codes and entity spans.

    Raises AssertionError at the first mismatch; returns None otherwise.
    """
    def per_token(doc, name):
        # One attribute value per token, in document order.
        return [getattr(token, name) for token in doc]

    def head_indices(doc):
        # Position of each token's syntactic head.
        return [token.head.i for token in doc]

    # tokens
    assert per_token(doc1, 'orth') == per_token(doc2, 'orth')

    # tags
    for name in ('pos', 'tag'):
        assert per_token(doc1, name) == per_token(doc2, name)

    # parse
    assert head_indices(doc1) == head_indices(doc2)
    assert per_token(doc1, 'dep') == per_token(doc2, 'dep')
    if doc1.is_parsed and doc2.is_parsed:
        # Sentence boundaries are only compared when both sides are parsed.
        assert list(doc1.sents) == list(doc2.sents)

    # entities
    for name in ('ent_type', 'ent_iob'):
        assert per_token(doc1, name) == per_token(doc2, name)
    assert list(doc1.ents) == list(doc2.ents)


@pytest.mark.models
def test_serialize_tokens(EN):
    """Round-trip a document processed with tag, parse and entity all off."""
    original = EN(u'This is a test sentence.',
                  tag=False, parse=False, entity=False)

    # Deserialize into a fresh Doc sharing the pipeline's vocab.
    fresh = Doc(EN.vocab)
    restored = fresh.from_bytes(original.to_bytes())
    equal(original, restored)


@pytest.mark.models
def test_serialize_tokens_tags(EN):
    """Round-trip a document processed with only tag=True."""
    original = EN(u'This is a test sentence.',
                  tag=True, parse=False, entity=False)
    # Deserialize into a fresh Doc sharing the pipeline's vocab.
    fresh = Doc(EN.vocab)
    restored = fresh.from_bytes(original.to_bytes())
    equal(original, restored)


@pytest.mark.models
def test_serialize_tokens_parse(EN):
    """Round-trip a document processed with only parse=True."""
    original = EN(u'This is a test sentence.',
                  tag=False, parse=True, entity=False)

    # Deserialize into a fresh Doc sharing the pipeline's vocab.
    fresh = Doc(EN.vocab)
    restored = fresh.from_bytes(original.to_bytes())
    equal(original, restored)


@pytest.mark.models
def test_serialize_tokens_ner(EN):
    """Round-trip a document processed with only entity=True."""
    original = EN(u'This is a test sentence.',
                  tag=False, parse=False, entity=True)

    # Deserialize into a fresh Doc sharing the pipeline's vocab.
    fresh = Doc(EN.vocab)
    restored = fresh.from_bytes(original.to_bytes())
    equal(original, restored)


@pytest.mark.models
def test_serialize_tokens_tags_parse(EN):
    """Round-trip a document processed with tag=True and parse=True."""
    original = EN(u'This is a test sentence.',
                  tag=True, parse=True, entity=False)

    # Deserialize into a fresh Doc sharing the pipeline's vocab.
    fresh = Doc(EN.vocab)
    restored = fresh.from_bytes(original.to_bytes())
    equal(original, restored)


@pytest.mark.models
def test_serialize_tokens_tags_ner(EN):
    """Round-trip a document processed with tag=True and entity=True."""
    original = EN(u'This is a test sentence.',
                  tag=True, parse=False, entity=True)

    # Deserialize into a fresh Doc sharing the pipeline's vocab.
    fresh = Doc(EN.vocab)
    restored = fresh.from_bytes(original.to_bytes())
    equal(original, restored)


@pytest.mark.models
def test_serialize_tokens_ner_parse(EN):
    """Round-trip a document processed with parse=True and entity=True."""
    original = EN(u'This is a test sentence.',
                  tag=False, parse=True, entity=True)

    # Deserialize into a fresh Doc sharing the pipeline's vocab.
    fresh = Doc(EN.vocab)
    restored = fresh.from_bytes(original.to_bytes())
    equal(original, restored)


@pytest.mark.models
def test_serialize_tokens_tags_parse_ner(EN):
    """Round-trip a document processed with tag, parse and entity all on."""
    original = EN(u'This is a test sentence.',
                  tag=True, parse=True, entity=True)

    # Deserialize into a fresh Doc sharing the pipeline's vocab.
    fresh = Doc(EN.vocab)
    restored = fresh.from_bytes(original.to_bytes())
    equal(original, restored)


def test_serialize_empty_doc():
    """Check that a document with no tokens packs to b'' and loads back empty."""
    vocab = spacy.en.English.Defaults.create_vocab()
    empty_doc = Doc(vocab)

    # Pack with a Packer built from the vocab and an empty second argument.
    packed = Packer(vocab, {}).pack(empty_doc)
    assert packed == b''

    # Loading the empty byte string must yield a zero-length document.
    restored = Doc(vocab).from_bytes(packed)
    assert len(restored) == 0


def test_serialize_after_adding_entity_manual_doc():
    """Serialize a hand-built Doc after registering a new entity label.

    Re issue #514.

    This test has its own name on purpose: a later function in this module is
    called ``test_serialize_after_adding_entity``. Two module-level functions
    with the same name rebind one attribute, so only the last definition is
    collected by pytest and the earlier one silently never runs. The original
    name is kept as a prefix so ``-k test_serialize_after_adding_entity``
    still selects this test.
    """
    vocab = spacy.en.English.Defaults.create_vocab()
    entity_recognizer = spacy.en.English.Defaults.create_entity()

    # Build the document directly from whitespace-split words.
    doc = Doc(vocab, words=u'This is a sentence about pasta .'.split())
    entity_recognizer.add_label('Food')
    entity_recognizer(doc)

    # Overwrite the entities with a single 'Food' span over token 5.
    label_id = vocab.strings[u'Food']
    doc.ents = [(label_id, 5, 6)]

    assert [(ent.label_, ent.text) for ent in doc.ents] == [(u'Food', u'pasta')]

    # Only checks that serialization does not raise; the bytes are not inspected.
    doc.to_bytes()


@pytest.mark.models
def test_serialize_after_adding_entity(EN):
    """Round-trip a document whose entity uses a label added to the pipeline.

    Adds the 'Food' label to the pipeline's entity component, assigns a
    'Food' span over token 5, then checks that the span survives
    to_bytes/from_bytes.
    """
    EN.entity.add_label(u'Food')
    original = EN(u'This is a sentence about pasta.')

    food_id = EN.vocab.strings[u'Food']
    original.ents = [(food_id, 5, 6)]

    serialized = original.to_bytes()
    restored = Doc(EN.vocab).from_bytes(serialized)

    recovered = [(ent.label_, ent.text) for ent in restored.ents]
    assert recovered == [(u'Food', u'pasta')]
add tests for serialization bug 2016-05-02 12:01:56 +03:00			`from __future__ import unicode_literals`
			`import pytest`

			`from spacy.tokens import Doc`
Test Issue #459: Fail to deserialize empty doc 2016-10-23 17:30:22 +03:00			`import spacy.en`
			`from spacy.serialize.packer import Packer`

add tests for serialization bug 2016-05-02 12:01:56 +03:00
			`def equal(doc1, doc2):`
* Fix formatting on serializer tests 2016-05-02 17:07:21 +03:00			`# tokens`
			`assert [ t.orth for t in doc1 ] == [ t.orth for t in doc2 ]`
add tests for serialization bug 2016-05-02 12:01:56 +03:00
* Fix formatting on serializer tests 2016-05-02 17:07:21 +03:00			`# tags`
			`assert [ t.pos for t in doc1 ] == [ t.pos for t in doc2 ]`
			`assert [ t.tag for t in doc1 ] == [ t.tag for t in doc2 ]`
add tests for serialization bug 2016-05-02 12:01:56 +03:00
* Fix formatting on serializer tests 2016-05-02 17:07:21 +03:00			`# parse`
			`assert [ t.head.i for t in doc1 ] == [ t.head.i for t in doc2 ]`
			`assert [ t.dep for t in doc1 ] == [ t.dep for t in doc2 ]`
			`if doc1.is_parsed and doc2.is_parsed:`
			`assert [ s for s in doc1.sents ] == [ s for s in doc2.sents ]`
add tests for serialization bug 2016-05-02 12:01:56 +03:00
* Fix formatting on serializer tests 2016-05-02 17:07:21 +03:00			`# entities`
			`assert [ t.ent_type for t in doc1 ] == [ t.ent_type for t in doc2 ]`
			`assert [ t.ent_iob for t in doc1 ] == [ t.ent_iob for t in doc2 ]`
			`assert [ ent for ent in doc1.ents ] == [ ent for ent in doc2.ents ]`
add tests for serialization bug 2016-05-02 12:01:56 +03:00

			`@pytest.mark.models`
			`def test_serialize_tokens(EN):`
* Fix formatting on serializer tests 2016-05-02 17:07:21 +03:00			`doc1 = EN(u'This is a test sentence.',tag=False, parse=False, entity=False)`
add tests for serialization bug 2016-05-02 12:01:56 +03:00
* Fix formatting on serializer tests 2016-05-02 17:07:21 +03:00			`doc2 = Doc(EN.vocab).from_bytes(doc1.to_bytes())`
			`equal(doc1, doc2)`
add tests for serialization bug 2016-05-02 12:01:56 +03:00

			`@pytest.mark.models`
			`def test_serialize_tokens_tags(EN):`
* Fix formatting on serializer tests 2016-05-02 17:07:21 +03:00			`doc1 = EN(u'This is a test sentence.',tag=True, parse=False, entity=False)`
			`doc2 = Doc(EN.vocab).from_bytes(doc1.to_bytes())`
			`equal(doc1, doc2)`
add tests for serialization bug 2016-05-02 12:01:56 +03:00

			`@pytest.mark.models`
			`def test_serialize_tokens_parse(EN):`
* Fix formatting on serializer tests 2016-05-02 17:07:21 +03:00			`doc1 = EN(u'This is a test sentence.',tag=False, parse=True, entity=False)`
add tests for serialization bug 2016-05-02 12:01:56 +03:00
* Fix formatting on serializer tests 2016-05-02 17:07:21 +03:00			`doc2 = Doc(EN.vocab).from_bytes(doc1.to_bytes())`
			`equal(doc1, doc2)`
add tests for serialization bug 2016-05-02 12:01:56 +03:00

			`@pytest.mark.models`
			`def test_serialize_tokens_ner(EN):`
* Fix formatting on serializer tests 2016-05-02 17:07:21 +03:00			`doc1 = EN(u'This is a test sentence.', tag=False, parse=False, entity=True)`
add tests for serialization bug 2016-05-02 12:01:56 +03:00
* Fix formatting on serializer tests 2016-05-02 17:07:21 +03:00			`doc2 = Doc(EN.vocab).from_bytes(doc1.to_bytes())`
			`equal(doc1, doc2)`
add tests for serialization bug 2016-05-02 12:01:56 +03:00

			`@pytest.mark.models`
			`def test_serialize_tokens_tags_parse(EN):`
* Fix formatting on serializer tests 2016-05-02 17:07:21 +03:00			`doc1 = EN(u'This is a test sentence.', tag=True, parse=True, entity=False)`
add tests for serialization bug 2016-05-02 12:01:56 +03:00
* Fix formatting on serializer tests 2016-05-02 17:07:21 +03:00			`doc2 = Doc(EN.vocab).from_bytes(doc1.to_bytes())`
			`equal(doc1, doc2)`
add tests for serialization bug 2016-05-02 12:01:56 +03:00

			`@pytest.mark.models`
			`def test_serialize_tokens_tags_ner(EN):`
* Fix formatting on serializer tests 2016-05-02 17:07:21 +03:00			`doc1 = EN(u'This is a test sentence.', tag=True, parse=False, entity=True)`
add tests for serialization bug 2016-05-02 12:01:56 +03:00
* Fix formatting on serializer tests 2016-05-02 17:07:21 +03:00			`doc2 = Doc(EN.vocab).from_bytes(doc1.to_bytes())`
			`equal(doc1, doc2)`
add tests for serialization bug 2016-05-02 12:01:56 +03:00

			`@pytest.mark.models`
			`def test_serialize_tokens_ner_parse(EN):`
* Fix formatting on serializer tests 2016-05-02 17:07:21 +03:00			`doc1 = EN(u'This is a test sentence.', tag=False, parse=True, entity=True)`
add tests for serialization bug 2016-05-02 12:01:56 +03:00
* Fix formatting on serializer tests 2016-05-02 17:07:21 +03:00			`doc2 = Doc(EN.vocab).from_bytes(doc1.to_bytes())`
			`equal(doc1, doc2)`
add tests for serialization bug 2016-05-02 12:01:56 +03:00

			`@pytest.mark.models`
			`def test_serialize_tokens_tags_parse_ner(EN):`
* Fix formatting on serializer tests 2016-05-02 17:07:21 +03:00			`doc1 = EN(u'This is a test sentence.', tag=True, parse=True, entity=True)`
add tests for serialization bug 2016-05-02 12:01:56 +03:00
* Fix formatting on serializer tests 2016-05-02 17:07:21 +03:00			`doc2 = Doc(EN.vocab).from_bytes(doc1.to_bytes())`
			`equal(doc1, doc2)`
Test Issue #459: Fail to deserialize empty doc 2016-10-23 17:30:22 +03:00

			`def test_serialize_empty_doc():`
			`vocab = spacy.en.English.Defaults.create_vocab()`
			`doc = Doc(vocab)`
			`packer = Packer(vocab, {})`
			`b = packer.pack(doc)`
			`assert b == b''`
			`loaded = Doc(vocab).from_bytes(b)`
			`assert len(loaded) == 0`
Test Issue #514: Serialization fails after adding a new entity label. 2016-10-23 17:40:27 +03:00

			`def test_serialize_after_adding_entity():`
			`# Re issue #514`
			`vocab = spacy.en.English.Defaults.create_vocab()`
			`entity_recognizer = spacy.en.English.Defaults.create_entity()`

			`doc = Doc(vocab, words=u'This is a sentence about pasta .'.split())`
			`entity_recognizer.add_label('Food')`
			`entity_recognizer(doc)`

			`label_id = vocab.strings[u'Food']`
			`doc.ents = [(label_id, 5,6)]`

			`assert [(ent.label_, ent.text) for ent in doc.ents] == [(u'Food', u'pasta')]`

			`byte_string = doc.to_bytes()`
Test Issue #514: Serializer fails when new entity type has been added. 2016-10-23 18:41:32 +03:00

			`@pytest.mark.models`
			`def test_serialize_after_adding_entity(EN):`
			`EN.entity.add_label(u'Food')`
			`doc = EN(u'This is a sentence about pasta.')`
			`label_id = EN.vocab.strings[u'Food']`
			`doc.ents = [(label_id, 5,6)]`
			`byte_string = doc.to_bytes()`
			`doc2 = Doc(EN.vocab).from_bytes(byte_string)`
			`assert [(ent.label_, ent.text) for ent in doc2.ents] == [(u'Food', u'pasta')]`