2019-10-14 13:27:50 +03:00
|
|
|
|
# coding: utf-8
|
|
|
|
|
from __future__ import unicode_literals
|
|
|
|
|
|
|
|
|
|
import pytest
|
|
|
|
|
|
2019-12-21 21:04:17 +03:00
|
|
|
|
|
2019-10-14 13:27:50 +03:00
|
|
|
|
@pytest.mark.parametrize(
    "text",
    ["z.B.", "Jan."],
)
def test_lb_tokenizer_handles_abbr(lb_tokenizer, text):
    """Each parametrized abbreviation must come back as exactly one token."""
    doc = lb_tokenizer(text)
    token_count = len(doc)
    # The periods belong to the abbreviation and must not be split off.
    assert token_count == 1
|
2019-12-02 01:08:21 +03:00
|
|
|
|
|
2019-12-21 21:04:17 +03:00
|
|
|
|
|
2019-12-02 01:08:21 +03:00
|
|
|
|
@pytest.mark.parametrize(
    "text",
    ["d'Saach", "d'Kanner", "d’Welt", "d’Suen"],
)
def test_lb_tokenizer_splits_contractions(lb_tokenizer, text):
    """Each parametrized contraction must be split into exactly two tokens.

    The inputs cover both the ASCII apostrophe and the typographic
    (curly) apostrophe after the leading "d".
    """
    doc = lb_tokenizer(text)
    token_count = len(doc)
    assert token_count == 2
|
|
|
|
|
|
2019-12-21 21:04:17 +03:00
|
|
|
|
|
2019-12-02 01:08:21 +03:00
|
|
|
|
def test_lb_tokenizer_handles_exc_in_text(lb_tokenizer):
    """Tokenize a full sentence and check the "'t" token inside it.

    The sentence is expected to produce nine tokens, and the token at
    index 1 must have the text "'t" and the lemma "et".
    """
    sentence = "Mee 't ass net evident, d'Liewen."
    doc = lb_tokenizer(sentence)
    assert len(doc) == 9

    # Index 1 is the "'t" form following the first word.
    second = doc[1]
    assert second.text == "'t"
    assert second.lemma_ == "et"
|