spaCy/spacy/tests/lang/pl/test_tokenizer.py
Stanisław Giziński 1448ad100c Improved Polish tokenizer and stop words. (#2974)
* Improved stop words list

* Removed some wrong stop words from the list

* Improved Polish Tokenizer (#38)

* Add tests for the Polish tokenizer

* Add Polish tokenizer exceptions

* Don't split any words containing hyphens

* Fix test case with a wrong expected answer

* Remove commented-out line of code until a better solution is found

* Add the license of the source SRX file

* Rename exception_list.py to match spaCy conventions

* Add a brief explanation of where the exception list comes from

* Add newline after each exception

* Rename COPYING.txt to LICENSE

* Delete old files

* Add header to the license

* Agreements signed

* Stanisław Giziński agreement

* Krzysztof Kowalczyk - signed agreement

* Mateusz Olko agreement

* Add DoomCoder's contributor agreement

* Improve like_num checking in Polish lang

* like_num tests added

* Added all units from the SI system

* Final license; removed splitting exceptions

* Added Polish stop words to LEX_ATTRS

* Add encoding info to pl tokenizer exceptions
2019-02-08 14:27:21 +11:00


# coding: utf8
from __future__ import unicode_literals

import pytest

# Abbreviations and other dotted forms that the Polish tokenizer must keep
# intact (or split) exactly as listed.
DOT_TESTS = [
    ('tel.', ['tel.']),
    ('np.', ['np.']),
    ('godz. 21:37', ['godz.', '21:37']),
    ('inż.', ['inż.']),
    ('gosp.-polit.', ['gosp.-polit.']),
    ('ppoż', ['ppoż']),
    ('płn', ['płn']),
    ('ul.', ['ul.']),
    ('jw.', ['jw.']),
    ('itd.', ['itd.']),
    ('cdn.', ['cdn.']),
    ('itp.', ['itp.']),
    ('10,- zł', ['10,-', 'zł']),
    ('0 zł 99 gr', ['0', 'zł', '99', 'gr']),
    ('0,99 rub.', ['0,99', 'rub.']),
    ('dol.', ['dol.']),
    ('1000 m n.p.m.', ['1000', 'm', 'n.p.m.']),
    ('m.in.', ['m.in.']),
    ('p.n.e.', ['p.n.e.']),
    ('Sz.P.', ['Sz.P.']),
    ('p.o.', ['p.o.']),
    ('k.o.', ['k.o.']),
    ('m.st.', ['m.st.']),
    ('dra.', ['dra', '.']),
    ('pp.', ['pp.']),
    ('oo.', ['oo.']),
]
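
# The dotted forms above survive as single tokens because they are registered
# as tokenizer exceptions (see pl/tokenizer_exceptions.py). A minimal sketch
# of the mechanism, using spaCy's public add_special_case API on an assumed
# nlp pipeline object, not the actual Polish exception list:
#
#     from spacy.symbols import ORTH
#     nlp.tokenizer.add_special_case('tel.', [{ORTH: 'tel.'}])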

# Hyphenated compounds and abbreviations that must survive as single tokens.
HYPHEN_TESTS = [
    ('5-fluoropentylo-3-pirydynyloindol', ['5-fluoropentylo-3-pirydynyloindol']),
    ('NESS-040C5', ['NESS-040C5']),
    ('JTE-7-31', ['JTE-7-31']),
    ('BAY-59-3074', ['BAY-59-3074']),
    ('BAY-38-7271', ['BAY-38-7271']),
    ('STS-135', ['STS-135']),
    ('5F-PB-22', ['5F-PB-22']),
    ('cztero-', ['cztero-']),
    ('jedno-', ['jedno-']),
    ('dwu-', ['dwu-']),
    ('trzy-', ['trzy-']),
    ('b-adoratorzy', ['b-adoratorzy']),
    ('2-3-4 drzewa', ['2-3-4', 'drzewa']),
    ('b-drzewa', ['b-drzewa']),
]
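
# Hyphenated forms stay whole because the Polish tokenizer does not split on
# intra-word hyphens ("Don't split any words containing hyphens" in the
# commit message). A quick way to check this outside the test suite, assuming
# a blank 'pl' pipeline:
#
#     import spacy
#     nlp = spacy.blank('pl')
#     assert [t.text for t in nlp('BAY-59-3074')] == ['BAY-59-3074']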

TESTCASES = DOT_TESTS + HYPHEN_TESTS


@pytest.mark.parametrize('text,expected_tokens', TESTCASES)
def test_tokenizer_handles_testcases(pl_tokenizer, text, expected_tokens):
    # Compare against the expected split, ignoring whitespace tokens.
    tokens = pl_tokenizer(text)
    token_list = [token.text for token in tokens if not token.is_space]
    assert expected_tokens == token_list
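
# The pl_tokenizer argument is a pytest fixture supplied by the test suite's
# shared conftest.py. A minimal sketch of how such a fixture is typically
# defined, assuming spacy.util.get_lang_class (the real conftest may differ):
#
#     import pytest
#     from spacy.util import get_lang_class
#
#     @pytest.fixture
#     def pl_tokenizer():
#         return get_lang_class('pl').Defaults.create_tokenizer()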