spaCy/spacy/tests/lang/fr/test_exceptions.py
Sofie Van Landeghem c9da9605f7
Test suite clean up (#5781)
* step_through tests: skip instead of xfail

* test_empty_doc should be fixed with new Thinc version

* remove outdated test (there are other misaligned tests now)

* xfail reason

* fix test according to french exceptions

* clarified some skipped tests

* skip ukranian test instead of xfail

* skip instead of xfail

* skip + reason instead of xfail

* removed obsolete tests referring to removed "set_frozen" functionality

* fix test 999

* remove unused AlignmentError

* remove xfail where possible, skip otherwise

* increment thinc release for empty_doc test
2020-07-20 14:49:54 +02:00

100 lines
2.6 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

import pytest
@pytest.mark.parametrize(
"text",
[
"aujourd'hui",
"Aujourd'hui",
"prud'hommes",
"prudhommal",
"audio-numérique",
"Audio-numérique",
"entr'amis",
"entr'abat",
"rentr'ouvertes",
"grand'hamien",
"Châteauneuf-la-Forêt",
"Château-Guibert",
"refox-trottâmes",
# u"K-POP",
# u"K-Pop",
# u"K-pop",
"z'yeutes",
"black-outeront",
"états-unienne",
"courtes-pattes",
"court-pattes",
"saut-de-ski",
"Écourt-Saint-Quentin",
"Bout-de-l'Îlien",
"pet-en-l'air",
],
)
def test_fr_tokenizer_infix_exceptions(fr_tokenizer, text):
tokens = fr_tokenizer(text)
assert len(tokens) == 1
@pytest.mark.parametrize(
"text,lemma",
[
("janv.", "janvier"),
("juill.", "juillet"),
("Dr.", "docteur"),
("av.", "avant"),
("sept.", "septembre"),
],
)
def test_fr_tokenizer_handles_abbr(fr_tokenizer, text, lemma):
tokens = fr_tokenizer(text)
assert len(tokens) == 1
assert tokens[0].lemma_ == lemma
def test_fr_tokenizer_handles_exc_in_text(fr_tokenizer):
text = "Je suis allé au mois de janv. aux prudhommes."
tokens = fr_tokenizer(text)
assert len(tokens) == 10
assert tokens[6].text == "janv."
assert tokens[6].lemma_ == "janvier"
assert tokens[8].text == "prudhommes"
def test_fr_tokenizer_handles_exc_in_text_2(fr_tokenizer):
text = "Cette après-midi, je suis allé dans un restaurant italo-mexicain."
tokens = fr_tokenizer(text)
assert len(tokens) == 11
assert tokens[1].text == "après-midi"
assert tokens[9].text == "italo-mexicain"
def test_fr_tokenizer_handles_title(fr_tokenizer):
text = "N'est-ce pas génial?"
tokens = fr_tokenizer(text)
assert len(tokens) == 6
assert tokens[0].text == "N'"
assert tokens[0].lemma_ == "ne"
assert tokens[1].text == "est"
assert tokens[1].lemma_ == "être"
assert tokens[2].text == "-ce"
assert tokens[2].lemma_ == "ce"
def test_fr_tokenizer_handles_title_2(fr_tokenizer):
text = "Est-ce pas génial?"
tokens = fr_tokenizer(text)
assert len(tokens) == 5
assert tokens[0].text == "Est"
assert tokens[0].lemma_ == "être"
assert tokens[1].text == "-ce"
assert tokens[1].lemma_ == "ce"
def test_fr_tokenizer_handles_title_3(fr_tokenizer):
text = "Qu'est-ce que tu fais?"
tokens = fr_tokenizer(text)
assert len(tokens) == 7
assert tokens[0].text == "Qu'"
assert tokens[0].lemma_ == "que"