spaCy/spacy/tests/lang/fr/test_exceptions.py
Sofie 9a478b6db8 Clean up of char classes, few tokenizer fixes and faster default French tokenizer (#3293)
* splitting up latin unicode interval

* removing hyphen as infix for French

* adding failing test for issue 1235

* test for issue #3002 which now works

* partial fix for issue #2070

* keep the hyphen as infix for French (as it was)

* restore french expressions with hyphen as infix (as it was)

* added succeeding unit test for Issue #2656

* Fix issue #2822 with custom Italian exception

* Fix issue #2926 by allowing numbers right before infix /

* splitting up latin unicode interval

* removing hyphen as infix for French

* adding failing test for issue 1235

* test for issue #3002 which now works

* partial fix for issue #2070

* keep the hyphen as infix for French (as it was)

* restore french expressions with hyphen as infix (as it was)

* added succeeding unit test for Issue #2656

* Fix issue #2822 with custom Italian exception

* Fix issue #2926 by allowing numbers right before infix /

* remove duplicate

* remove xfail for Issue #2179 fixed by Matt

* adjust documentation and remove reference to regex lib
2019-02-20 22:10:13 +01:00

104 lines
2.7 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

# coding: utf-8
from __future__ import unicode_literals
import pytest
@pytest.mark.parametrize(
"text",
[
u"aujourd'hui",
u"Aujourd'hui",
u"prud'hommes",
u"prudhommal",
u"audio-numérique",
u"Audio-numérique",
u"entr'amis",
u"entr'abat",
u"rentr'ouvertes",
u"grand'hamien",
u"Châteauneuf-la-Forêt",
u"Château-Guibert",
u"11-septembre",
u"11-Septembre",
u"refox-trottâmes",
# u"K-POP",
# u"K-Pop",
# u"K-pop",
u"z'yeutes",
u"black-outeront",
u"états-unienne",
u"courtes-pattes",
u"court-pattes",
u"saut-de-ski",
u"Écourt-Saint-Quentin",
u"Bout-de-l'Îlien",
u"pet-en-l'air",
],
)
def test_fr_tokenizer_infix_exceptions(fr_tokenizer, text):
tokens = fr_tokenizer(text)
assert len(tokens) == 1
@pytest.mark.parametrize(
"text,lemma",
[
("janv.", "janvier"),
("juill.", "juillet"),
("Dr.", "docteur"),
("av.", "avant"),
("sept.", "septembre"),
],
)
def test_fr_tokenizer_handles_abbr(fr_tokenizer, text, lemma):
tokens = fr_tokenizer(text)
assert len(tokens) == 1
assert tokens[0].lemma_ == lemma
def test_fr_tokenizer_handles_exc_in_text(fr_tokenizer):
text = "Je suis allé au mois de janv. aux prudhommes."
tokens = fr_tokenizer(text)
assert len(tokens) == 10
assert tokens[6].text == "janv."
assert tokens[6].lemma_ == "janvier"
assert tokens[8].text == "prudhommes"
def test_fr_tokenizer_handles_exc_in_text_2(fr_tokenizer):
text = "Cette après-midi, je suis allé dans un restaurant italo-mexicain."
tokens = fr_tokenizer(text)
assert len(tokens) == 11
assert tokens[1].text == "après-midi"
assert tokens[9].text == "italo-mexicain"
def test_fr_tokenizer_handles_title(fr_tokenizer):
text = "N'est-ce pas génial?"
tokens = fr_tokenizer(text)
assert len(tokens) == 6
assert tokens[0].text == "N'"
assert tokens[0].lemma_ == "ne"
assert tokens[1].text == "est"
assert tokens[1].lemma_ == "être"
assert tokens[2].text == "-ce"
assert tokens[2].lemma_ == "ce"
@pytest.mark.xfail
def test_fr_tokenizer_handles_title_2(fr_tokenizer):
text = "Est-ce pas génial?"
tokens = fr_tokenizer(text)
assert len(tokens) == 6
assert tokens[0].text == "Est"
assert tokens[0].lemma_ == "être"
def test_fr_tokenizer_handles_title_3(fr_tokenizer):
text = "Qu'est-ce que tu fais?"
tokens = fr_tokenizer(text)
assert len(tokens) == 7
assert tokens[0].text == "Qu'"
assert tokens[0].lemma_ == "que"