2019-02-08 06:27:21 +03:00
|
|
|
import pytest
|
|
|
|
|
2019-02-08 16:14:49 +03:00
|
|
|
# (text, expected token texts) pairs for inputs built around periods:
# abbreviations, multi-part dotted forms, and amounts followed by a unit.
# Each pair is fed to the tokenizer test below via TESTCASES.
DOT_TESTS = [
    ("tel.", ["tel."]),
    ("np.", ["np."]),
    ("godz. 21:37", ["godz.", "21:37"]),
    ("inż.", ["inż."]),
    ("gosp.-polit.", ["gosp.-polit."]),
    # No trailing period in these two: the bare form must stay one token.
    ("ppoż", ["ppoż"]),
    ("płn", ["płn"]),
    ("ul.", ["ul."]),
    ("jw.", ["jw."]),
    ("itd.", ["itd."]),
    ("cdn.", ["cdn."]),
    ("itp.", ["itp."]),
    ("10,- zł", ["10,-", "zł"]),
    ("0 zł 99 gr", ["0", "zł", "99", "gr"]),
    ("0,99 rub.", ["0,99", "rub."]),
    ("dol.", ["dol."]),
    ("1000 m n.p.m.", ["1000", "m", "n.p.m."]),
    ("m.in.", ["m.in."]),
    ("p.n.e.", ["p.n.e."]),
    ("Sz.P.", ["Sz.P."]),
    ("p.o.", ["p.o."]),
    ("k.o.", ["k.o."]),
    ("m.st.", ["m.st."]),
    # The only case here where the final period is expected as its own token.
    ("dra.", ["dra", "."]),
    ("pp.", ["pp."]),
    ("oo.", ["oo."]),
]
|
|
|
|
|
|
|
|
# (text, expected token texts) pairs for inputs containing hyphens.
# In every case the hyphenated word is expected to remain a single token;
# only whitespace separates tokens ("2-3-4 drzewa").
HYPHEN_TESTS = [
    ("5-fluoropentylo-3-pirydynyloindol", ["5-fluoropentylo-3-pirydynyloindol"]),
    ("NESS-040C5", ["NESS-040C5"]),
    ("JTE-7-31", ["JTE-7-31"]),
    ("BAY-59-3074", ["BAY-59-3074"]),
    ("BAY-38-7271", ["BAY-38-7271"]),
    ("STS-135", ["STS-135"]),
    ("5F-PB-22", ["5F-PB-22"]),
    # Trailing hyphen stays attached to the preceding word.
    ("cztero-", ["cztero-"]),
    ("jedno-", ["jedno-"]),
    ("dwu-", ["dwu-"]),
    ("trzy-", ["trzy-"]),
    ("b-adoratorzy", ["b-adoratorzy"]),
    ("2-3-4 drzewa", ["2-3-4", "drzewa"]),
    ("b-drzewa", ["b-drzewa"]),
]
|
|
|
|
|
|
|
|
|
|
|
|
# Full parametrization set: the period cases followed by the hyphen cases.
# Built as a fresh list so neither source list is mutated.
TESTCASES = list(DOT_TESTS)
TESTCASES.extend(HYPHEN_TESTS)
|
|
|
|
|
|
|
|
|
2019-02-08 16:14:49 +03:00
|
|
|
@pytest.mark.parametrize("text,expected_tokens", TESTCASES)
def test_tokenizer_handles_testcases(pl_tokenizer, text, expected_tokens):
    """Check that ``pl_tokenizer`` splits ``text`` into ``expected_tokens``.

    Runs once per ``(text, expected_tokens)`` pair in ``TESTCASES``.
    ``pl_tokenizer`` is a fixture supplied outside this module.
    """
    tokens = pl_tokenizer(text)
    # Whitespace tokens are dropped so only the visible token texts are compared.
    token_list = [token.text for token in tokens if not token.is_space]
    assert expected_tokens == token_list
|