# coding: utf8
from __future__ import unicode_literals

import pytest


@pytest.mark.parametrize('text', ["Datum:2014-06-02\nDokument:76467"])
def test_issue886(en_tokenizer, text):
    """Test that token.idx matches the original text index for texts with newlines.

    For every token produced from ``text`` this checks that:

    * ``token.text`` and ``token.text_with_ws`` have the same length, and
    * the slice of ``text`` starting at ``token.idx`` reproduces
      ``token.text`` in full.

    Args:
        en_tokenizer: Callable applied to ``text`` to produce the tokens
            (presumably the project's English tokenizer fixture -- confirm
            against the test suite's conftest).
        text: Input string containing an embedded newline.
    """
    doc = en_tokenizer(text)
    for token in doc:
        assert len(token.text) == len(token.text_with_ws)
        # Compare the whole token span rather than only its first character:
        # a shifted offset that happens to land on the same character would
        # otherwise go unnoticed.
        start = token.idx
        end = start + len(token.text)
        assert text[start:end] == token.text