Adding unit tests for tokenization in French (with title)

This commit is contained in:
Gregory Howard 2017-04-27 10:59:38 +02:00
parent ad8129cb45
commit 44cb486849

View File

@ -38,3 +38,26 @@ def test_tokenizer_handles_exc_in_text_2(fr_tokenizer):
assert len(tokens) == 11
assert tokens[1].text == "après-midi"
assert tokens[9].text == "italo-mexicain"
def test_tokenizer_handles_title(fr_tokenizer):
    """Check tokenization of a sentence that starts with a capitalized elision.

    "N'est-ce pas génial?" is expected to produce six tokens, with the
    sentence-initial "N'" lemmatized to "ne" and the clitic "-ce"
    lemmatized to "ce".
    """
    text = "N'est-ce pas génial?"
    tokens = fr_tokenizer(text)
    assert len(tokens) == 6
    # Title-cased elided form at the very start of the sentence.
    assert tokens[0].text == "N'"
    assert tokens[0].lemma_ == "ne"
    # The inverted-subject clitic keeps its leading hyphen as one token.
    assert tokens[2].text == "-ce"
    assert tokens[2].lemma_ == "ce"
def test_tokenizer_handles_title_2(fr_tokenizer):
    """Check tokenization of a sentence that starts with a capitalized verb.

    "Est-ce pas génial?" is expected to produce five tokens, with the
    sentence-initial "Est" lemmatized to "être".
    """
    text = "Est-ce pas génial?"
    tokens = fr_tokenizer(text)
    # Expected split: Est | -ce | pas | génial | ?
    # (one token fewer than "N'est-ce pas génial?", which has the extra "N'").
    assert len(tokens) == 5
    assert tokens[0].text == "Est"
    assert tokens[0].lemma_ == "être"
def test_tokenizer_handles_title_3(fr_tokenizer):
    """Check tokenization of a sentence that starts with capitalized "Qu'".

    "Qu'est-ce que tu fais?" is expected to produce seven tokens, with the
    sentence-initial "Qu'" lemmatized to "que".
    """
    # NOTE: this test needs its own name. Reusing an earlier test's name
    # rebinds the module attribute, so pytest would collect only the last
    # definition and silently drop the earlier one.
    text = "Qu'est-ce que tu fais?"
    tokens = fr_tokenizer(text)
    assert len(tokens) == 7
    assert tokens[0].text == "Qu'"
    assert tokens[0].lemma_ == "que"