spaCy/spacy/tests/lang/sv/test_prefix_suffix_infix.py
Björn Lennartsson b892b446cc Updates to Swedish Language (#3164)
* Added the same punctuation rules as danish language.

* Added abbreviations and also the possibility to have capitalized abbreviations on some. Added a few specific cases too

* Added test for long texts in swedish

* Added morph rules, infixes and suffixes to __init__.py for swedish

* Added some tests for prefixes, infixes and suffixes

* Added tests for lemma

* Renamed files to follow convention

* [sv] Removed ambigious abbreviations

* Added more tests for tokenizer exceptions

* Added test for problem with punctuation in issue #2578

* Contributor agreement

* Removed faulty lemmatization of 'jag' ('I') as it was lemmatized to 'jaga' ('hunt')
2019-01-16 13:45:50 +01:00

38 lines
1.2 KiB
Python

# coding: utf-8
"""Test that tokenizer prefixes, suffixes and infixes are handled correctly."""
from __future__ import unicode_literals
import pytest
@pytest.mark.parametrize('text', ["(under)"])
def test_tokenizer_splits_no_special(sv_tokenizer, text):
tokens = sv_tokenizer(text)
assert len(tokens) == 3
@pytest.mark.parametrize('text', ["gitta'r", "Björn's", "Lars'"])
def test_tokenizer_handles_no_punct(sv_tokenizer, text):
tokens = sv_tokenizer(text)
assert len(tokens) == 1
@pytest.mark.parametrize('text', ["svart.Gul", "Hej.Världen"])
def test_tokenizer_splits_period_infix(sv_tokenizer, text):
tokens = sv_tokenizer(text)
assert len(tokens) == 3
@pytest.mark.parametrize('text', ["Hej,Världen", "en,två"])
def test_tokenizer_splits_comma_infix(sv_tokenizer, text):
tokens = sv_tokenizer(text)
assert len(tokens) == 3
assert tokens[0].text == text.split(",")[0]
assert tokens[1].text == ","
assert tokens[2].text == text.split(",")[1]
@pytest.mark.parametrize('text', ["svart...Gul", "svart...gul"])
def test_tokenizer_splits_ellipsis_infix(sv_tokenizer, text):
tokens = sv_tokenizer(text)
assert len(tokens) == 3