import pytest


@pytest.mark.parametrize(
    "text",
    [
        "aujourd'hui",
        "Aujourd'hui",
        "prud'hommes",
        "prud’hommal",
        "audio-numérique",
        "Audio-numérique",
        "entr'amis",
        "entr'abat",
        "rentr'ouvertes",
        "grand'hamien",
        "Châteauneuf-la-Forêt",
        "Château-Guibert",
        "refox-trottâmes",
        # u"K-POP",
        # u"K-Pop",
        # u"K-pop",
        "z'yeutes",
        "black-outeront",
        "états-unienne",
        "courtes-pattes",
        "court-pattes",
        "saut-de-ski",
        "Écourt-Saint-Quentin",
        "Bout-de-l'Îlien",
        "pet-en-l'air",
    ],
)
def test_fr_tokenizer_infix_exceptions(fr_tokenizer, text):
    """Words with elisions or hyphens listed as exceptions stay one token."""
    assert len(fr_tokenizer(text)) == 1


@pytest.mark.parametrize("text", ["janv.", "juill.", "Dr.", "av.", "sept."])
def test_fr_tokenizer_handles_abbr(fr_tokenizer, text):
    """An abbreviation's trailing period must not be split into its own token."""
    assert len(fr_tokenizer(text)) == 1


def test_fr_tokenizer_handles_exc_in_text(fr_tokenizer):
    """Exceptions ("janv.", "prud’hommes") survive inside a full sentence."""
    doc = fr_tokenizer("Je suis allé au mois de janv. aux prud’hommes.")
    assert len(doc) == 10
    assert doc[6].text == "janv."
    assert doc[8].text == "prud’hommes"


def test_fr_tokenizer_handles_exc_in_text_2(fr_tokenizer):
    """Hyphenated compounds stay single tokens when embedded in a sentence."""
    doc = fr_tokenizer("Cette après-midi, je suis allé dans un restaurant italo-mexicain.")
    assert len(doc) == 11
    assert doc[1].text == "après-midi"
    assert doc[9].text == "italo-mexicain"


def test_fr_tokenizer_handles_title(fr_tokenizer):
    """Elided "N'" and the inverted clitic "-ce" each become separate tokens."""
    doc = fr_tokenizer("N'est-ce pas génial?")
    assert len(doc) == 6
    for i, expected in enumerate(["N'", "est", "-ce"]):
        assert doc[i].text == expected


def test_fr_tokenizer_handles_title_2(fr_tokenizer):
    """Without elision, "Est-ce" still splits into verb plus clitic "-ce"."""
    doc = fr_tokenizer("Est-ce pas génial?")
    assert len(doc) == 5
    assert doc[0].text == "Est"
    assert doc[1].text == "-ce"


def test_fr_tokenizer_handles_title_3(fr_tokenizer):
    # "Qu'est-ce que" combines elision ("Qu'") and the clitic inversion
    # ("-ce"); the elided part must come off as the first token.
    text = "Qu'est-ce que tu fais?"
    tokens = fr_tokenizer(text)
    assert len(tokens) == 7
    assert tokens[0].text == "Qu'"