mirror of
https://github.com/explosion/spaCy.git
synced 2025-01-12 10:16:27 +03:00
b892b446cc
* Added the same punctuation rules as the Danish language. * Added abbreviations and also the possibility to have capitalized abbreviations on some. Added a few specific cases too * Added test for long texts in Swedish * Added morph rules, infixes and suffixes to __init__.py for Swedish * Added some tests for prefixes, infixes and suffixes * Added tests for lemma * Renamed files to follow convention * [sv] Removed ambiguous abbreviations * Added more tests for tokenizer exceptions * Added test for problem with punctuation in issue #2578 * Contributor agreement * Removed faulty lemmatization of 'jag' ('I') as it was lemmatized to 'jaga' ('hunt')
38 lines
1.2 KiB
Python
38 lines
1.2 KiB
Python
# coding: utf-8
|
|
"""Test that tokenizer prefixes, suffixes and infixes are handled correctly."""
|
|
from __future__ import unicode_literals
|
|
|
|
import pytest
|
|
|
|
@pytest.mark.parametrize('text', ["(under)"])
def test_tokenizer_splits_no_special(sv_tokenizer, text):
    """Check that a word wrapped in parentheses produces exactly three tokens.

    ``sv_tokenizer`` is a fixture supplied outside this module (presumably the
    Swedish tokenizer -- confirm against the test suite's conftest).
    """
    token_count = len(sv_tokenizer(text))
    assert token_count == 3
|
|
|
|
|
|
@pytest.mark.parametrize('text', ["gitta'r", "Björn's", "Lars'"])
def test_tokenizer_handles_no_punct(sv_tokenizer, text):
    """Check that each apostrophe-containing input stays a single token.

    ``sv_tokenizer`` is a fixture supplied outside this module (presumably the
    Swedish tokenizer -- confirm against the test suite's conftest).
    """
    token_count = len(sv_tokenizer(text))
    assert token_count == 1
|
|
|
|
|
|
@pytest.mark.parametrize('text', ["svart.Gul", "Hej.Världen"])
def test_tokenizer_splits_period_infix(sv_tokenizer, text):
    """Check that two words joined by a single period produce three tokens.

    Only the token count is verified here; the token texts are not inspected.
    """
    token_count = len(sv_tokenizer(text))
    assert token_count == 3
|
|
|
|
|
|
@pytest.mark.parametrize('text', ["Hej,Världen", "en,två"])
def test_tokenizer_splits_comma_infix(sv_tokenizer, text):
    """Check that a comma between two words is split off as its own token.

    The result must contain exactly three tokens: the text before the comma,
    the comma itself, and the text after it.
    """
    doc = sv_tokenizer(text)
    assert len(doc) == 3

    # Split once and compare all three token texts in a single assertion.
    pieces = text.split(",")
    observed = (doc[0].text, doc[1].text, doc[2].text)
    assert observed == (pieces[0], ",", pieces[1])
|
|
|
|
|
|
@pytest.mark.parametrize('text', ["svart...Gul", "svart...gul"])
def test_tokenizer_splits_ellipsis_infix(sv_tokenizer, text):
    """Check that two words joined by ``...`` produce three tokens.

    Covers both a capitalised and a lower-case second word. Only the token
    count is verified; the token texts are not inspected.
    """
    token_count = len(sv_tokenizer(text))
    assert token_count == 3
|