From 6deb1e84b60fa65d10b091838aef059e69e9ae6c Mon Sep 17 00:00:00 2001
From: Matthew Honnibal <honnibal@gmail.com>
Date: Mon, 27 Jul 2015 21:25:48 +0200
Subject: [PATCH] * Upd serialization tests

---
 tests/serialize/test_packer.py | 21 ++++++++++++++++++++-
 1 file changed, 20 insertions(+), 1 deletion(-)

diff --git a/tests/serialize/test_packer.py b/tests/serialize/test_packer.py
index 86e715b52..5770a8938 100644
--- a/tests/serialize/test_packer.py
+++ b/tests/serialize/test_packer.py
@@ -17,7 +17,7 @@ from spacy.serialize.packer import Packer
 from spacy.serialize.bits import BitArray
 
 
-def get_lex_props(string, prob=-22):
+def get_lex_props(string, prob=-22, is_oov=False):
     return {
         'flags': 0,
         'length': len(string),
@@ -120,3 +120,22 @@ def test_packer_annotated(tokenizer):
     assert [t.tag_ for t in result] == ['DT', 'NN', 'VBD']
     assert [t.dep_ for t in result] == ['det', 'nsubj', 'ROOT']
     assert [(t.head.i - t.i) for t in result] == [1, 1, 0]
+
+
+def test_packer_bad_chars(tokenizer):  # Packer pack/unpack round-trip must reproduce the document text
+    string = u'naja gut, is eher bl\xf6d und nicht mit reddit.com/digg.com vergleichbar; vielleicht auf dem weg dahin'
+    packer = Packer(tokenizer.vocab, [])  # created before the text is tokenized; NOTE(review): keep this order unless Packer is confirmed not to snapshot the vocab
+
+    doc = tokenizer(string)
+    bits = packer.pack(doc)
+    result = packer.unpack(bits)
+    assert result.string == doc.string  # only the text is compared; no annotations are checked here
+
+
+@pytest.mark.models
+def test_packer_bad_chars_tags(EN):  # unique name: reusing test_packer_bad_chars would rebind it and hide the Packer round-trip test from pytest
+    string = u'naja gut, is eher bl\xf6d und nicht mit reddit.com/digg.com vergleichbar; vielleicht auf dem weg dahin'
+    doc = EN(string)
+    byte_string = doc.to_bytes()  # serialize the processed document
+    result = Doc(EN.vocab).from_bytes(byte_string)  # restore into a fresh Doc that shares the same vocab
+    assert [t.tag_ for t in result] == [t.tag_ for t in doc]  # per-token tags must survive the to_bytes/from_bytes round-trip