# coding: utf8
"""Parametrized tokenizer checks for texts containing periods and hyphens.

Each case pairs an input text with the exact list of non-whitespace token
strings the ``pl_tokenizer`` fixture is expected to produce for it.
"""
from __future__ import unicode_literals

import pytest


# Inputs built around periods, plus a few amount/currency strings.
# Most entries must stay a single token; "dra." is the one case where the
# trailing period is expected to be split off.
DOT_TESTS = [
    ("tel.", ["tel."]),
    ("np.", ["np."]),
    ("godz. 21:37", ["godz.", "21:37"]),
    ("inż.", ["inż."]),
    ("gosp.-polit.", ["gosp.-polit."]),
    ("ppoż", ["ppoż"]),
    ("płn", ["płn"]),
    ("ul.", ["ul."]),
    ("jw.", ["jw."]),
    ("itd.", ["itd."]),
    ("cdn.", ["cdn."]),
    ("itp.", ["itp."]),
    ("10,- zł", ["10,-", "zł"]),
    ("0 zł 99 gr", ["0", "zł", "99", "gr"]),
    ("0,99 rub.", ["0,99", "rub."]),
    ("dol.", ["dol."]),
    ("1000 m n.p.m.", ["1000", "m", "n.p.m."]),
    ("m.in.", ["m.in."]),
    ("p.n.e.", ["p.n.e."]),
    ("Sz.P.", ["Sz.P."]),
    ("p.o.", ["p.o."]),
    ("k.o.", ["k.o."]),
    ("m.st.", ["m.st."]),
    ("dra.", ["dra", "."]),
    ("pp.", ["pp."]),
    ("oo.", ["oo."]),
]

# Inputs containing hyphens; every hyphenated chunk is expected to survive
# as one token, with splits happening only at whitespace.
HYPHEN_TESTS = [
    ("5-fluoropentylo-3-pirydynyloindol", ["5-fluoropentylo-3-pirydynyloindol"]),
    ("NESS-040C5", ["NESS-040C5"]),
    ("JTE-7-31", ["JTE-7-31"]),
    ("BAY-59-3074", ["BAY-59-3074"]),
    ("BAY-38-7271", ["BAY-38-7271"]),
    ("STS-135", ["STS-135"]),
    ("5F-PB-22", ["5F-PB-22"]),
    ("cztero-", ["cztero-"]),
    ("jedno-", ["jedno-"]),
    ("dwu-", ["dwu-"]),
    ("trzy-", ["trzy-"]),
    ("b-adoratorzy", ["b-adoratorzy"]),
    ("2-3-4 drzewa", ["2-3-4", "drzewa"]),
    ("b-drzewa", ["b-drzewa"]),
]

# The full parametrization: period cases first, then hyphen cases.
TESTCASES = DOT_TESTS + HYPHEN_TESTS


@pytest.mark.parametrize("text,expected_tokens", TESTCASES)
def test_tokenizer_handles_testcases(pl_tokenizer, text, expected_tokens):
    """Tokenize ``text`` and compare the token strings with ``expected_tokens``.

    ``pl_tokenizer`` is a fixture defined outside this file (presumably in the
    project's conftest). Tokens flagged ``is_space`` are dropped before the
    comparison, so only visible tokens are checked.
    """
    doc = pl_tokenizer(text)
    observed = [tok.text for tok in doc if not tok.is_space]
    assert expected_tokens == observed