spaCy/spacy/tests/lang/tt/test_tokenizer.py

# coding: utf8
from __future__ import unicode_literals

import pytest


# Hyphenated compounds are expected to stay a single token; only the
# sentence-final period is split off.
INFIX_HYPHEN_TESTS = [
    ("Явым-төшем күләме.", ["Явым-төшем", "күләме", "."]),
    ("Хатын-кыз киеме.", ["Хатын-кыз", "киеме", "."]),
]

# Punctuation glued to or embedded in words: slashes, parentheses and a double
# quote are expected as separate tokens, while "2,13", "783,9" and "млн." are
# expected to stay whole.
PUNC_INSIDE_WORDS_TESTS = [
    (
        "Пассаҗир саны - 2,13 млн — кеше/көндә (2010), 783,9 млн. кеше/елда.",
        [
            "Пассаҗир", "саны", "-", "2,13", "млн", "—", "кеше", "/", "көндә",
            "(", "2010", ")", ",", "783,9", "млн.", "кеше", "/", "елда", ".",
        ],
    ),
    ('Ту"кай', ["Ту", '"', "кай"]),
]

# A digit+suffix ordinal ("22нче") is expected as one token and the trailing
# ellipsis as one token.
MIXED_ORDINAL_NUMS_TESTS = [
    ("Иртәгә 22нче гыйнвар...", ["Иртәгә", "22нче", "гыйнвар", "..."]),
]

# Dotted abbreviations ("б.э.к.", "һ.б.ш.") are expected to keep their periods
# and remain single tokens, even next to brackets or a sentence-final period.
ABBREV_TESTS = [
    ("«3 елда (б.э.к.) туган", ["«", "3", "елда", "(", "б.э.к.", ")", "туган"]),
    ("тукымадан һ.б.ш. тегелгән.", ["тукымадан", "һ.б.ш.", "тегелгән", "."]),
]

# Initial + name written without a space: per the expectations below, an
# upper-case initial keeps "Ә.Тукай"/"Ә.тукай" together, a lower-case one is
# split into three tokens, and a plain word loses its final period.
NAME_ABBREV_TESTS = [
    ("Ә.Тукай", ["Ә.Тукай"]),
    ("Ә.тукай", ["Ә.тукай"]),
    ("ә.Тукай", ["ә", ".", "Тукай"]),
    ("Миләүшә.", ["Миләүшә", "."]),
]

# Inputs with misplaced or missing spaces around punctuation.
TYPOS_IN_PUNC_TESTS = [
    ("«3 елда , туган", ["«", "3", "елда", ",", "туган"]),
    ("«3 елда,туган", ["«", "3", "елда", ",", "туган"]),
    ("«3 елда,туган.", ["«", "3", "елда", ",", "туган", "."]),
    ("Ул эшли(кайчан?)", ["Ул", "эшли", "(", "кайчан", "?", ")"]),
    # Original author's note: "?)" could come out as "?)" or as "? )";
    # this case expects the single token "?)".
    ("Ул (кайчан?)эшли", ["Ул", "(", "кайчан", "?)", "эшли"]),
]

# One multi-sentence paragraph. The input deliberately contains
# "булмаган.Йөз" with no space after the period; the expected tokens still
# separate the period from both neighbouring words.
LONG_TEXTS_TESTS = [
    (
        "Иң борынгы кешеләр суыклар һәм салкын кышлар булмый торган җылы "
        "якларда яшәгәннәр, шуңа күрә аларга кием кирәк булмаган.Йөз "
        "меңнәрчә еллар үткән, борынгы кешеләр акрынлап Европа һәм Азиянең "
        "салкын илләрендә дә яши башлаганнар. Алар кырыс һәм салкын "
        "кышлардан саклану өчен кием-салым уйлап тапканнар - итәк.",
        [
            "Иң", "борынгы", "кешеләр", "суыклар", "һәм",
            "салкын", "кышлар", "булмый", "торган", "җылы",
            "якларда", "яшәгәннәр", ",", "шуңа", "күрә", "аларга",
            "кием", "кирәк", "булмаган", ".", "Йөз",
            "меңнәрчә", "еллар", "үткән", ",", "борынгы",
            "кешеләр", "акрынлап", "Европа", "һәм", "Азиянең",
            "салкын", "илләрендә", "дә", "яши", "башлаганнар",
            ".", "Алар", "кырыс", "һәм", "салкын",
            "кышлардан", "саклану", "өчен", "кием-салым", "уйлап",
            "тапканнар", "-", "итәк", ".",
        ],
    ),
]

# Every (text, expected_tokens) pair from the groups above, flattened into one
# list in the order the groups are named here.
TESTCASES = [
    case
    for group in (
        INFIX_HYPHEN_TESTS,
        PUNC_INSIDE_WORDS_TESTS,
        MIXED_ORDINAL_NUMS_TESTS,
        ABBREV_TESTS,
        NAME_ABBREV_TESTS,
        LONG_TEXTS_TESTS,
        TYPOS_IN_PUNC_TESTS,
    )
    for case in group
]

# (text, expected norms) pairs: the abbreviation "һ.б.ш." is expected to carry
# its spelled-out form as norm. "|" separates the norms because one of them
# contains spaces.
NORM_TESTCASES = [
    (
        "тукымадан һ.б.ш. тегелгән.",
        "тукымадан|һәм башка шундыйлар|тегелгән|.".split("|"),
    ),
]


@pytest.mark.parametrize("text,expected_tokens", TESTCASES)
def test_tt_tokenizer_handles_testcases(tt_tokenizer, text, expected_tokens):
    """Check that tokenizing ``text`` yields exactly ``expected_tokens``.

    Runs once per entry in ``TESTCASES``. ``tt_tokenizer`` is a pytest fixture
    defined outside this file. Tokens flagged ``is_space`` are dropped before
    comparing.
    """
    doc = tt_tokenizer(text)
    observed = [tok.text for tok in doc if not tok.is_space]
    assert expected_tokens == observed


@pytest.mark.parametrize("text,norms", NORM_TESTCASES)
def test_tt_tokenizer_handles_norm_exceptions(tt_tokenizer, text, norms):
    """Check that the ``norm_`` of each token of ``text`` matches ``norms``.

    Runs once per entry in ``NORM_TESTCASES``. ``tt_tokenizer`` is a pytest
    fixture defined outside this file. Unlike the token-text test, no tokens
    are filtered out before comparing.
    """
    observed_norms = [tok.norm_ for tok in tt_tokenizer(text)]
    assert observed_norms == norms
-												Add Tatar Language Support (#2444)

* add Tatar lang support

* add Tatar letters

* add Tatar tests

* sign contributor agreement

* sign contributor agreement [x]

* remove comments from Language class

* remove all template comments

											
										
										
											2018-06-19 11:17:53 +03:00
+								# coding: utf8
 								from __future__ import unicode_literals
 								import pytest
-												💫 Refactor test suite (#2568)

## Description

Related issues: #2379 (should be fixed by separating model tests)

* **total execution time down from > 300 seconds to under 60 seconds** 🎉
* removed all model-specific tests that could only really be run manually anyway – those will now live in a separate test suite in the [`spacy-models`](https://github.com/explosion/spacy-models) repository and are already integrated into our new model training infrastructure
* changed all relative imports to absolute imports to prepare for moving the test suite from `/spacy/tests` to `/tests` (it'll now always test against the installed version)
* merged old regression tests into collections, e.g. `test_issue1001-1500.py` (about 90% of the regression tests are very short anyway)
* tidied up and rewrote existing tests wherever possible

### Todo

- [ ] move tests to `/tests` and adjust CI commands accordingly
- [x] move model test suite from internal repo to `spacy-models`
- [x] ~~investigate why `pipeline/test_textcat.py` is flaky~~
- [x] review old regression tests (leftover files) and see if they can be merged, simplified or deleted
- [ ] update documentation on how to run tests


### Types of change
enhancement, tests

## Checklist
<!--- Before you submit the PR, go over this checklist and make sure you can
tick off all the boxes. [] -> [x] -->
- [x] I have submitted the spaCy Contributor Agreement.
- [x] I ran the tests, and all new and existing tests passed.
- [ ] My changes don't require a change to the documentation, or if they do, I've added all required information.

											
										
										
											2018-07-25 00:38:44 +03:00
-												Add Tatar Language Support (#2444)

* add Tatar lang support

* add Tatar letters

* add Tatar tests

* sign contributor agreement

* sign contributor agreement [x]

* remove comments from Language class

* remove all template comments

											
										
										
											2018-06-19 11:17:53 +03:00
+								INFIX_HYPHEN_TESTS = [
 								    ("Явым-төшем күләме.", "Явым-төшем күләме .".split()),
-												💫 Tidy up and auto-format tests (#2967)

* Auto-format tests with black

* Add flake8 config

* Tidy up and remove unused imports

* Fix redefinitions of test functions

* Replace orths_and_spaces with words and spaces

* Fix compatibility with pytest 4.0

* xfail test for now

Test was previously overwritten by following test due to naming conflict, so failure wasn't reported

* Unfail passing test

* Only use fixture via arguments

Fixes pytest 4.0 compatibility

											
										
										
											2018-11-27 03:09:36 +03:00
+								    ("Хатын-кыз киеме.", "Хатын-кыз киеме .".split()),
-												Add Tatar Language Support (#2444)

* add Tatar lang support

* add Tatar letters

* add Tatar tests

* sign contributor agreement

* sign contributor agreement [x]

* remove comments from Language class

* remove all template comments

											
										
										
											2018-06-19 11:17:53 +03:00
+								]
 								PUNC_INSIDE_WORDS_TESTS = [
-												💫 Tidy up and auto-format tests (#2967)

* Auto-format tests with black

* Add flake8 config

* Tidy up and remove unused imports

* Fix redefinitions of test functions

* Replace orths_and_spaces with words and spaces

* Fix compatibility with pytest 4.0

* xfail test for now

Test was previously overwritten by following test due to naming conflict, so failure wasn't reported

* Unfail passing test

* Only use fixture via arguments

Fixes pytest 4.0 compatibility

											
										
										
											2018-11-27 03:09:36 +03:00
+								    (
 								        "Пассаҗир саны - 2,13 млн — кеше/көндә (2010), 783,9 млн. кеше/елда.",
 								        "Пассаҗир саны - 2,13 млн — кеше / көндә ( 2010 ) ,"
 								        " 783,9 млн. кеше / елда .".split(),
 								    ),
 								    ('Ту"кай', 'Ту " кай'.split()),
-												Add Tatar Language Support (#2444)

* add Tatar lang support

* add Tatar letters

* add Tatar tests

* sign contributor agreement

* sign contributor agreement [x]

* remove comments from Language class

* remove all template comments

											
										
										
											2018-06-19 11:17:53 +03:00
+								]
 								MIXED_ORDINAL_NUMS_TESTS = [
 								    ("Иртәгә 22нче гыйнвар...", "Иртәгә 22нче гыйнвар ...".split())
 								]
 								ABBREV_TESTS = [
 								    ("«3 елда (б.э.к.) туган", "« 3 елда ( б.э.к. ) туган".split()),
-												💫 Tidy up and auto-format tests (#2967)

* Auto-format tests with black

* Add flake8 config

* Tidy up and remove unused imports

* Fix redefinitions of test functions

* Replace orths_and_spaces with words and spaces

* Fix compatibility with pytest 4.0

* xfail test for now

Test was previously overwritten by following test due to naming conflict, so failure wasn't reported

* Unfail passing test

* Only use fixture via arguments

Fixes pytest 4.0 compatibility

											
										
										
											2018-11-27 03:09:36 +03:00
+								    ("тукымадан һ.б.ш. тегелгән.", "тукымадан һ.б.ш. тегелгән .".split()),
-												Add Tatar Language Support (#2444)

* add Tatar lang support

* add Tatar letters

* add Tatar tests

* sign contributor agreement

* sign contributor agreement [x]

* remove comments from Language class

* remove all template comments

											
										
										
											2018-06-19 11:17:53 +03:00
+								]
 								NAME_ABBREV_TESTS = [
 								    ("Ә.Тукай", "Ә.Тукай".split()),
 								    ("Ә.тукай", "Ә.тукай".split()),
 								    ("ә.Тукай", "ә . Тукай".split()),
-												💫 Tidy up and auto-format tests (#2967)

* Auto-format tests with black

* Add flake8 config

* Tidy up and remove unused imports

* Fix redefinitions of test functions

* Replace orths_and_spaces with words and spaces

* Fix compatibility with pytest 4.0

* xfail test for now

Test was previously overwritten by following test due to naming conflict, so failure wasn't reported

* Unfail passing test

* Only use fixture via arguments

Fixes pytest 4.0 compatibility

											
										
										
											2018-11-27 03:09:36 +03:00
+								    ("Миләүшә.", "Миләүшә .".split()),
-												Add Tatar Language Support (#2444)

* add Tatar lang support

* add Tatar letters

* add Tatar tests

* sign contributor agreement

* sign contributor agreement [x]

* remove comments from Language class

* remove all template comments

											
										
										
											2018-06-19 11:17:53 +03:00
+								]
 								TYPOS_IN_PUNC_TESTS = [
 								    ("«3 елда , туган", "« 3 елда , туган".split()),
 								    ("«3 елда,туган", "« 3 елда , туган".split()),
 								    ("«3 елда,туган.", "« 3 елда , туган .".split()),
 								    ("Ул эшли(кайчан?)", "Ул эшли ( кайчан ? )".split()),
-												💫 Tidy up and auto-format tests (#2967)

* Auto-format tests with black

* Add flake8 config

* Tidy up and remove unused imports

* Fix redefinitions of test functions

* Replace orths_and_spaces with words and spaces

* Fix compatibility with pytest 4.0

* xfail test for now

Test was previously overwritten by following test due to naming conflict, so failure wasn't reported

* Unfail passing test

* Only use fixture via arguments

Fixes pytest 4.0 compatibility

											
										
										
											2018-11-27 03:09:36 +03:00
+								    ("Ул (кайчан?)эшли", "Ул ( кайчан ?) эшли".split()),  # "?)" => "?)" or "? )"
-												Add Tatar Language Support (#2444)

* add Tatar lang support

* add Tatar letters

* add Tatar tests

* sign contributor agreement

* sign contributor agreement [x]

* remove comments from Language class

* remove all template comments

											
										
										
											2018-06-19 11:17:53 +03:00
+								]
 								LONG_TEXTS_TESTS = [
-												💫 Tidy up and auto-format tests (#2967)

* Auto-format tests with black

* Add flake8 config

* Tidy up and remove unused imports

* Fix redefinitions of test functions

* Replace orths_and_spaces with words and spaces

* Fix compatibility with pytest 4.0

* xfail test for now

Test was previously overwritten by following test due to naming conflict, so failure wasn't reported

* Unfail passing test

* Only use fixture via arguments

Fixes pytest 4.0 compatibility

											
										
										
											2018-11-27 03:09:36 +03:00
+								    (
-												Add trailing whitespace to multiline test text (#4877)


											
										
										
											2020-01-06 16:58:59 +03:00
+								        "Иң борынгы кешеләр суыклар һәм салкын кышлар булмый торган җылы "
 								        "якларда яшәгәннәр, шуңа күрә аларга кием кирәк булмаган.Йөз "
 								        "меңнәрчә еллар үткән, борынгы кешеләр акрынлап Европа һәм Азиянең "
 								        "салкын илләрендә дә яши башлаганнар. Алар кырыс һәм салкын "
-												💫 Tidy up and auto-format tests (#2967)

* Auto-format tests with black

* Add flake8 config

* Tidy up and remove unused imports

* Fix redefinitions of test functions

* Replace orths_and_spaces with words and spaces

* Fix compatibility with pytest 4.0

* xfail test for now

Test was previously overwritten by following test due to naming conflict, so failure wasn't reported

* Unfail passing test

* Only use fixture via arguments

Fixes pytest 4.0 compatibility

											
										
										
											2018-11-27 03:09:36 +03:00
+								        "кышлардан саклану өчен кием-салым уйлап тапканнар - итәк.",
-												Add trailing whitespace to multiline test text (#4877)


											
										
										
											2020-01-06 16:58:59 +03:00
+								        "Иң борынгы кешеләр суыклар һәм салкын кышлар булмый торган җылы "
 								        "якларда яшәгәннәр , шуңа күрә аларга кием кирәк булмаган . Йөз "
 								        "меңнәрчә еллар үткән , борынгы кешеләр акрынлап Европа һәм Азиянең "
 								        "салкын илләрендә дә яши башлаганнар . Алар кырыс һәм салкын "
-												💫 Tidy up and auto-format tests (#2967)

* Auto-format tests with black

* Add flake8 config

* Tidy up and remove unused imports

* Fix redefinitions of test functions

* Replace orths_and_spaces with words and spaces

* Fix compatibility with pytest 4.0

* xfail test for now

Test was previously overwritten by following test due to naming conflict, so failure wasn't reported

* Unfail passing test

* Only use fixture via arguments

Fixes pytest 4.0 compatibility

											
										
										
											2018-11-27 03:09:36 +03:00
+								        "кышлардан саклану өчен кием-салым уйлап тапканнар - итәк .".split(),
 								    )
-												Add Tatar Language Support (#2444)

* add Tatar lang support

* add Tatar letters

* add Tatar tests

* sign contributor agreement

* sign contributor agreement [x]

* remove comments from Language class

* remove all template comments

											
										
										
											2018-06-19 11:17:53 +03:00
+								]
-												💫 Tidy up and auto-format tests (#2967)

* Auto-format tests with black

* Add flake8 config

* Tidy up and remove unused imports

* Fix redefinitions of test functions

* Replace orths_and_spaces with words and spaces

* Fix compatibility with pytest 4.0

* xfail test for now

Test was previously overwritten by following test due to naming conflict, so failure wasn't reported

* Unfail passing test

* Only use fixture via arguments

Fixes pytest 4.0 compatibility

											
										
										
											2018-11-27 03:09:36 +03:00
+								TESTCASES = (
 								    INFIX_HYPHEN_TESTS
 								    + PUNC_INSIDE_WORDS_TESTS
 								    + MIXED_ORDINAL_NUMS_TESTS
 								    + ABBREV_TESTS
 								    + NAME_ABBREV_TESTS
 								    + LONG_TEXTS_TESTS
 								    + TYPOS_IN_PUNC_TESTS
 								)
-												Add Tatar Language Support (#2444)

* add Tatar lang support

* add Tatar letters

* add Tatar tests

* sign contributor agreement

* sign contributor agreement [x]

* remove comments from Language class

* remove all template comments

											
										
										
											2018-06-19 11:17:53 +03:00
 								NORM_TESTCASES = [
-												💫 Tidy up and auto-format tests (#2967)

* Auto-format tests with black

* Add flake8 config

* Tidy up and remove unused imports

* Fix redefinitions of test functions

* Replace orths_and_spaces with words and spaces

* Fix compatibility with pytest 4.0

* xfail test for now

Test was previously overwritten by following test due to naming conflict, so failure wasn't reported

* Unfail passing test

* Only use fixture via arguments

Fixes pytest 4.0 compatibility

											
										
										
											2018-11-27 03:09:36 +03:00
+								    (
 								        "тукымадан һ.б.ш. тегелгән.",
 								        ["тукымадан", "һәм башка шундыйлар", "тегелгән", "."],
 								    )
-												Add Tatar Language Support (#2444)

* add Tatar lang support

* add Tatar letters

* add Tatar tests

* sign contributor agreement

* sign contributor agreement [x]

* remove comments from Language class

* remove all template comments

											
										
										
											2018-06-19 11:17:53 +03:00
+								]
 								@pytest.mark.parametrize("text,expected_tokens", TESTCASES)
-												💫 Refactor test suite (#2568)

## Description

Related issues: #2379 (should be fixed by separating model tests)

* **total execution time down from > 300 seconds to under 60 seconds** 🎉
* removed all model-specific tests that could only really be run manually anyway – those will now live in a separate test suite in the [`spacy-models`](https://github.com/explosion/spacy-models) repository and are already integrated into our new model training infrastructure
* changed all relative imports to absolute imports to prepare for moving the test suite from `/spacy/tests` to `/tests` (it'll now always test against the installed version)
* merged old regression tests into collections, e.g. `test_issue1001-1500.py` (about 90% of the regression tests are very short anyway)
* tidied up and rewrote existing tests wherever possible

### Todo

- [ ] move tests to `/tests` and adjust CI commands accordingly
- [x] move model test suite from internal repo to `spacy-models`
- [x] ~~investigate why `pipeline/test_textcat.py` is flaky~~
- [x] review old regression tests (leftover files) and see if they can be merged, simplified or deleted
- [ ] update documentation on how to run tests


### Types of change
enhancement, tests

## Checklist
<!--- Before you submit the PR, go over this checklist and make sure you can
tick off all the boxes. [] -> [x] -->
- [x] I have submitted the spaCy Contributor Agreement.
- [x] I ran the tests, and all new and existing tests passed.
- [ ] My changes don't require a change to the documentation, or if they do, I've added all required information.

											
										
										
											2018-07-25 00:38:44 +03:00
+								def test_tt_tokenizer_handles_testcases(tt_tokenizer, text, expected_tokens):
-												Add Tatar Language Support (#2444)

* add Tatar lang support

* add Tatar letters

* add Tatar tests

* sign contributor agreement

* sign contributor agreement [x]

* remove comments from Language class

* remove all template comments

											
										
										
											2018-06-19 11:17:53 +03:00
+								    tokens = [token.text for token in tt_tokenizer(text) if not token.is_space]
 								    assert expected_tokens == tokens
-												💫 Tidy up and auto-format tests (#2967)

* Auto-format tests with black

* Add flake8 config

* Tidy up and remove unused imports

* Fix redefinitions of test functions

* Replace orths_and_spaces with words and spaces

* Fix compatibility with pytest 4.0

* xfail test for now

Test was previously overwritten by following test due to naming conflict, so failure wasn't reported

* Unfail passing test

* Only use fixture via arguments

Fixes pytest 4.0 compatibility

											
										
										
											2018-11-27 03:09:36 +03:00
+								@pytest.mark.parametrize("text,norms", NORM_TESTCASES)
-												💫 Refactor test suite (#2568)

## Description

Related issues: #2379 (should be fixed by separating model tests)

* **total execution time down from > 300 seconds to under 60 seconds** 🎉
* removed all model-specific tests that could only really be run manually anyway – those will now live in a separate test suite in the [`spacy-models`](https://github.com/explosion/spacy-models) repository and are already integrated into our new model training infrastructure
* changed all relative imports to absolute imports to prepare for moving the test suite from `/spacy/tests` to `/tests` (it'll now always test against the installed version)
* merged old regression tests into collections, e.g. `test_issue1001-1500.py` (about 90% of the regression tests are very short anyway)
* tidied up and rewrote existing tests wherever possible

### Todo

- [ ] move tests to `/tests` and adjust CI commands accordingly
- [x] move model test suite from internal repo to `spacy-models`
- [x] ~~investigate why `pipeline/test_textcat.py` is flaky~~
- [x] review old regression tests (leftover files) and see if they can be merged, simplified or deleted
- [ ] update documentation on how to run tests


### Types of change
enhancement, tests

## Checklist
<!--- Before you submit the PR, go over this checklist and make sure you can
tick off all the boxes. [] -> [x] -->
- [x] I have submitted the spaCy Contributor Agreement.
- [x] I ran the tests, and all new and existing tests passed.
- [ ] My changes don't require a change to the documentation, or if they do, I've added all required information.

											
										
										
											2018-07-25 00:38:44 +03:00
+								def test_tt_tokenizer_handles_norm_exceptions(tt_tokenizer, text, norms):
-												Add Tatar Language Support (#2444)

* add Tatar lang support

* add Tatar letters

* add Tatar tests

* sign contributor agreement

* sign contributor agreement [x]

* remove comments from Language class

* remove all template comments

											
										
										
											2018-06-19 11:17:53 +03:00
+								    tokens = tt_tokenizer(text)
 								    assert [token.norm_ for token in tokens] == norms