# coding: utf8
from __future__ import unicode_literals

import pytest


# Each case pairs an input text with the token texts the tokenizer is
# expected to produce (whitespace tokens are excluded by the test):
#   * the leading "B'" must be split off as its own token;
#   * the dotted form "m.sh." must stay a single token, while the comma
#     before it is split from the preceding word.
# The name must match the one passed to ``parametrize`` below; otherwise the
# module fails with a NameError at collection time and no case ever runs.
GA_TOKEN_EXCEPTION_TESTS = [
    (
        "B'fhearr fanacht as amharc",
        ["B'", "fhearr", "fanacht", "as", "amharc"],
    ),
    (
        "Daoine a bhfuil Gaeilge acu, m.sh. tusa agus mise",
        [
            "Daoine",
            "a",
            "bhfuil",
            "Gaeilge",
            "acu",
            ",",
            "m.sh.",
            "tusa",
            "agus",
            "mise",
        ],
    ),
]


@pytest.mark.parametrize("text,expected_tokens", GA_TOKEN_EXCEPTION_TESTS)
def test_tokenizer_handles_exception_cases(ga_tokenizer, text, expected_tokens):
    """Check that the tokenizer splits ``text`` into ``expected_tokens``.

    Args:
        ga_tokenizer: Callable fixture that tokenizes a string and returns an
            iterable of tokens exposing ``text`` and ``is_space``
            (presumably supplied by the test suite's conftest; confirm).
        text: Input string to tokenize.
        expected_tokens: Token texts expected after whitespace tokens are
            dropped.
    """
    tokens = ga_tokenizer(text)
    # Whitespace tokens are ignored so only the meaningful splits are compared.
    token_list = [token.text for token in tokens if not token.is_space]
    assert expected_tokens == token_list