spaCy/spacy/tests/lang/en/test_exceptions.py

# coding: utf-8
from __future__ import unicode_literals

import pytest
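
# NOTE: the `en_tokenizer` fixture is provided by the shared test conftest,
# not defined in this file. A minimal sketch of such a fixture (an assumption
# for illustration, not copied verbatim from the repo):
#
#     from spacy.util import get_lang_class
#
#     @pytest.fixture(scope="session")
#     def en_tokenizer():
#         return get_lang_class("en").Defaults.create_tokenizer()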


def test_en_tokenizer_handles_basic_contraction(en_tokenizer):
    text = "don't giggle"
    tokens = en_tokenizer(text)
    assert len(tokens) == 3
    assert tokens[1].text == "n't"
    text = "i said don't!"
    tokens = en_tokenizer(text)
    assert len(tokens) == 5
    assert tokens[4].text == "!"


@pytest.mark.parametrize("text", ["`ain't", '''"isn't''', "can't!"])
def test_en_tokenizer_handles_basic_contraction_punct(en_tokenizer, text):
    tokens = en_tokenizer(text)
    assert len(tokens) == 3


@pytest.mark.parametrize(
    "text_poss,text", [("Robin's", "Robin"), ("Alexis's", "Alexis")]
)
def test_en_tokenizer_handles_poss_contraction(en_tokenizer, text_poss, text):
    tokens = en_tokenizer(text_poss)
    assert len(tokens) == 2
    assert tokens[0].text == text
    assert tokens[1].text == "'s"


@pytest.mark.parametrize("text", ["schools'", "Alexis'"])
def test_en_tokenizer_splits_trailing_apos(en_tokenizer, text):
    tokens = en_tokenizer(text)
    assert len(tokens) == 2
    assert tokens[0].text == text.split("'")[0]
    assert tokens[1].text == "'"
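

# Forms like "'em" and "ol'" are tokenizer exceptions, so the apostrophe
# must not be split off.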
@pytest.mark.parametrize("text", ["'em", "nothin'", "ol'"])
def test_en_tokenizer_doesnt_split_apos_exc(en_tokenizer, text):
    tokens = en_tokenizer(text)
    assert len(tokens) == 1
    assert tokens[0].text == text
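

# "'ll" is split off as its own token and its lemma is expanded to "will"
# via the tokenizer exceptions.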
@pytest.mark.parametrize("text", ["we'll", "You'll", "there'll"])
def test_en_tokenizer_handles_ll_contraction(en_tokenizer, text):
    tokens = en_tokenizer(text)
    assert len(tokens) == 2
    assert tokens[0].text == text.split("'")[0]
    assert tokens[1].text == "'ll"
    assert tokens[1].lemma_ == "will"


@pytest.mark.parametrize(
    "text_lower,text_title", [("can't", "Can't"), ("ain't", "Ain't")]
)
def test_en_tokenizer_handles_capitalization(en_tokenizer, text_lower, text_title):
    tokens_lower = en_tokenizer(text_lower)
    tokens_title = en_tokenizer(text_title)
    assert tokens_title[0].text == tokens_lower[0].text.title()
    assert tokens_lower[0].text == tokens_title[0].text.lower()
    assert tokens_lower[1].text == tokens_title[1].text


@pytest.mark.parametrize("pron", ["I", "You", "He", "She", "It", "We", "They"])
@pytest.mark.parametrize("contraction", ["'ll", "'d"])
def test_en_tokenizer_keeps_title_case(en_tokenizer, pron, contraction):
    tokens = en_tokenizer(pron + contraction)
    assert tokens[0].text == pron
    assert tokens[1].text == contraction
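

# "Ill", "Hell" and "Well" are ambiguous with the contractions "I'll",
# "He'll" and "We'll", so they must stay single tokens.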
@pytest.mark.parametrize("exc", ["Ill", "ill", "Hell", "hell", "Well", "well"])
def test_en_tokenizer_excludes_ambiguous(en_tokenizer, exc):
    tokens = en_tokenizer(exc)
    assert len(tokens) == 1


@pytest.mark.parametrize(
    "wo_punct,w_punct", [("We've", "`We've"), ("couldn't", "couldn't)")]
)
def test_en_tokenizer_splits_defined_punct(en_tokenizer, wo_punct, w_punct):
    tokens = en_tokenizer(wo_punct)
    assert len(tokens) == 2
    tokens = en_tokenizer(w_punct)
    assert len(tokens) == 3


@pytest.mark.parametrize("text", ["e.g.", "p.m.", "Jan.", "Dec.", "Inc."])
def test_en_tokenizer_handles_abbr(en_tokenizer, text):
    tokens = en_tokenizer(text)
    assert len(tokens) == 1
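

# Abbreviation exceptions like "i.e." must also hold in the middle of
# running text, not just in isolation.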
def test_en_tokenizer_handles_exc_in_text(en_tokenizer):
    text = "It's mediocre i.e. bad."
    tokens = en_tokenizer(text)
    assert len(tokens) == 6
    assert tokens[3].text == "i.e."
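

# Times like "1am" split into the number and the meridiem, whose lemma is
# normalized to "a.m." / "p.m.".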
@pytest.mark.parametrize("text", ["1am", "12a.m.", "11p.m.", "4pm"])
def test_en_tokenizer_handles_times(en_tokenizer, text):
    tokens = en_tokenizer(text)
    assert len(tokens) == 2
    assert tokens[1].lemma_ in ["a.m.", "p.m."]
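

# `norm_` exposes the normalized form of each token, so contractions should
# expand to their full words, e.g. "I'm" -> ["i", "am"].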
@pytest.mark.parametrize(
    "text,norms", [("I'm", ["i", "am"]), ("shan't", ["shall", "not"])]
)
def test_en_tokenizer_norm_exceptions(en_tokenizer, text, norms):
    tokens = en_tokenizer(text)
    assert [token.norm_ for token in tokens] == norms
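

# Skipped: language-specific norm exceptions were moved out of the core
# library into the spacy-lookups-data package (#5238).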
@pytest.mark.skip
@pytest.mark.parametrize(
    "text,norm", [("radicalised", "radicalized"), ("cuz", "because")]
)
def test_en_lex_attrs_norm_exceptions(en_tokenizer, text, norm):
    tokens = en_tokenizer(text)
    assert tokens[0].norm_ == norm