# coding: utf-8
from __future__ import unicode_literals

import pytest


@pytest.mark.parametrize(
    'text,lemma',
    [
        ("aprox.", "aproximadamente"),
        ("esq.", "esquina"),
        ("pág.", "página"),
        ("p.ej.", "por ejemplo"),
    ],
)
def test_tokenizer_handles_abbr(es_tokenizer, text, lemma):
    """Check that an abbreviation yields one token with the expected lemma.

    Each parametrized case passes an abbreviation (``text``) through the
    tokenizer supplied by the ``es_tokenizer`` fixture and expects exactly
    one token whose ``lemma_`` equals the expanded form (``lemma``).
    """
    doc = es_tokenizer(text)
    # The trailing period(s) must stay attached to the abbreviation.
    assert len(doc) == 1
    only_token = doc[0]
    assert only_token.lemma_ == lemma


def test_tokenizer_handles_exc_in_text(es_tokenizer):
    """Check that an abbreviation inside a sentence is kept as one token.

    The sentence is expected to split into seven tokens, with "aprox."
    (period included) at index 4 and carrying the lemma "aproximadamente".
    """
    sentence = "Mariano Rajoy ha corrido aprox. medio kilómetro"
    doc = es_tokenizer(sentence)
    assert len(doc) == 7
    abbreviation = doc[4]
    assert abbreviation.text == "aprox."
    assert abbreviation.lemma_ == "aproximadamente"