mirror of
https://github.com/explosion/spaCy.git
synced 2025-01-15 20:16:23 +03:00
db55577c45
* Remove unicode declarations
* Remove Python 3.5 and 2.7 from CI
* Don't require pathlib
* Replace compat helpers
* Remove OrderedDict
* Use f-strings
* Set Cython compiler language level
* Fix typo
* Re-add OrderedDict for Table
* Update setup.cfg
* Revert CONTRIBUTING.md
* Revert lookups.md
* Revert top-level.md
* Small adjustments and docs [ci skip]
35 lines
1.0 KiB
Python
35 lines
1.0 KiB
Python
import pytest
|
|
|
|
|
|
@pytest.mark.parametrize("text", ["(under)"])
def test_tokenizer_splits_no_special(sv_tokenizer, text):
    """Check that a parenthesised word yields exactly three tokens.

    Args:
        sv_tokenizer: Tokenizer fixture supplied by pytest; presumably the
            Swedish tokenizer defined in the project's conftest (not
            visible from this file).
        text: The parametrized input string.
    """
    tokens = sv_tokenizer(text)
    # Only the token count is checked, not the individual token texts.
    assert len(tokens) == 3
|
|
|
|
|
|
@pytest.mark.parametrize("text", ["gitta'r", "Björn's", "Lars'"])
def test_tokenizer_handles_no_punct(sv_tokenizer, text):
    """Check that words containing an apostrophe stay a single token.

    Args:
        sv_tokenizer: Tokenizer fixture supplied by pytest; presumably the
            Swedish tokenizer defined in the project's conftest (not
            visible from this file).
        text: The parametrized input string; each case contains an
            apostrophe either inside or at the end of the word.
    """
    tokens = sv_tokenizer(text)
    # The apostrophe must not be split off as its own token.
    assert len(tokens) == 1
|
|
|
|
|
|
@pytest.mark.parametrize("text", ["svart.Gul", "Hej.Världen"])
def test_tokenizer_splits_period_infix(sv_tokenizer, text):
    """Check that a period between two words yields exactly three tokens.

    Args:
        sv_tokenizer: Tokenizer fixture supplied by pytest; presumably the
            Swedish tokenizer defined in the project's conftest (not
            visible from this file).
        text: The parametrized input string; in each case the word after
            the period starts with an uppercase letter.
    """
    tokens = sv_tokenizer(text)
    # Only the token count is checked, not the individual token texts.
    assert len(tokens) == 3
|
|
|
|
|
|
@pytest.mark.parametrize("text", ["Hej,Världen", "en,två"])
def test_tokenizer_splits_comma_infix(sv_tokenizer, text):
    """Check that a comma between two words is split into its own token.

    The input must produce three tokens: the part before the comma, the
    comma itself, and the part after the comma.

    Args:
        sv_tokenizer: Tokenizer fixture supplied by pytest; presumably the
            Swedish tokenizer defined in the project's conftest (not
            visible from this file).
        text: The parametrized input string; each case contains a single
            comma with no surrounding whitespace.
    """
    tokens = sv_tokenizer(text)
    assert len(tokens) == 3

    # Split once and compare each side against the surrounding tokens.
    parts = text.split(",")
    assert tokens[0].text == parts[0]
    assert tokens[1].text == ","
    assert tokens[2].text == parts[1]
|
|
|
|
|
|
@pytest.mark.parametrize("text", ["svart...Gul", "svart...gul"])
def test_tokenizer_splits_ellipsis_infix(sv_tokenizer, text):
    """Check that three periods between two words yield exactly three tokens.

    Args:
        sv_tokenizer: Tokenizer fixture supplied by pytest; presumably the
            Swedish tokenizer defined in the project's conftest (not
            visible from this file).
        text: The parametrized input string; the cases cover both an
            uppercase and a lowercase word after the three periods.
    """
    tokens = sv_tokenizer(text)
    # A count of three means the run of periods did not become three
    # separate tokens; the individual token texts are not checked.
    assert len(tokens) == 3
|