spaCy/spacy/tests/lang/en/test_exceptions.py
adrianeboyd a5cd203284
Reduce stored lexemes data, move feats to lookups (#5238)
* Reduce stored lexemes data, move feats to lookups

* Move non-derivable lexemes features (`norm / cluster / prob`) to
`spacy-lookups-data` as lookups
  * Get/set `norm` in both lookups and `LexemeC`, serialize in lookups
  * Remove `cluster` and `prob` from `LexemesC`, get/set/serialize in
    lookups only
* Remove serialization of lexemes data as `vocab/lexemes.bin`
  * Remove `SerializedLexemeC`
  * Remove `Lexeme.to_bytes/from_bytes`
* Modify normalization exception loading:
  * Always create `Vocab.lookups` table `lexeme_norm` for
    normalization exceptions
  * Load base exceptions from `lang.norm_exceptions`, but load
    language-specific exceptions from lookups
  * Set `lex_attr_getter[NORM]` including new lookups table in
    `BaseDefaults.create_vocab()` and when deserializing `Vocab`
* Remove all cached lexemes when deserializing vocab to override
  existing normalizations with the new normalizations (as a replacement
  for the previous step that replaced all lexemes data with the
  deserialized data)

* Skip English normalization test

Skip English normalization test because the data is now in
`spacy-lookups-data`.

* Remove norm exceptions

Moved to spacy-lookups-data.

* Move norm exceptions test to spacy-lookups-data

* Load extra lookups from spacy-lookups-data lazily

Load extra lookups (currently for cluster and prob) lazily from the
entry point `lg_extra` as `Vocab.lookups_extra`.

* Skip creating lexeme cache on load

To improve model loading times, do not create the full lexeme cache when
loading. The lexemes will be created on demand when processing.

* Identify numeric values in Lexeme.set_attrs()

With the removal of a special case for `PROB`, also identify `float` to
avoid trying to convert it with the `StringStore`.

* Skip lexeme cache init in from_bytes

* Unskip and update lookups tests for python3.6+

* Update vocab pickle to include lookups_extra

* Update vocab serialization tests

Check strings rather than lexemes since lexemes aren't initialized
automatically, account for addition of "_SP".

* Re-skip lookups test because of python3.5

* Skip PROB/float values in Lexeme.set_attrs

* Convert is_oov from lexeme flag to lex in vectors

Instead of storing `is_oov` as a lexeme flag, `is_oov` reports whether
the lexeme has a vector.

Co-authored-by: Matthew Honnibal <honnibal+gh@gmail.com>
2020-05-19 15:59:14 +02:00

128 lines
4.0 KiB
Python

# coding: utf-8
from __future__ import unicode_literals
import pytest
def test_en_tokenizer_handles_basic_contraction(en_tokenizer):
text = "don't giggle"
tokens = en_tokenizer(text)
assert len(tokens) == 3
assert tokens[1].text == "n't"
text = "i said don't!"
tokens = en_tokenizer(text)
assert len(tokens) == 5
assert tokens[4].text == "!"
@pytest.mark.parametrize("text", ["`ain't", """"isn't""", "can't!"])
def test_en_tokenizer_handles_basic_contraction_punct(en_tokenizer, text):
tokens = en_tokenizer(text)
assert len(tokens) == 3
@pytest.mark.parametrize(
"text_poss,text", [("Robin's", "Robin"), ("Alexis's", "Alexis")]
)
def test_en_tokenizer_handles_poss_contraction(en_tokenizer, text_poss, text):
tokens = en_tokenizer(text_poss)
assert len(tokens) == 2
assert tokens[0].text == text
assert tokens[1].text == "'s"
@pytest.mark.parametrize("text", ["schools'", "Alexis'"])
def test_en_tokenizer_splits_trailing_apos(en_tokenizer, text):
tokens = en_tokenizer(text)
assert len(tokens) == 2
assert tokens[0].text == text.split("'")[0]
assert tokens[1].text == "'"
@pytest.mark.parametrize("text", ["'em", "nothin'", "ol'"])
def test_en_tokenizer_doesnt_split_apos_exc(en_tokenizer, text):
tokens = en_tokenizer(text)
assert len(tokens) == 1
assert tokens[0].text == text
@pytest.mark.parametrize("text", ["we'll", "You'll", "there'll"])
def test_en_tokenizer_handles_ll_contraction(en_tokenizer, text):
tokens = en_tokenizer(text)
assert len(tokens) == 2
assert tokens[0].text == text.split("'")[0]
assert tokens[1].text == "'ll"
assert tokens[1].lemma_ == "will"
@pytest.mark.parametrize(
"text_lower,text_title", [("can't", "Can't"), ("ain't", "Ain't")]
)
def test_en_tokenizer_handles_capitalization(en_tokenizer, text_lower, text_title):
tokens_lower = en_tokenizer(text_lower)
tokens_title = en_tokenizer(text_title)
assert tokens_title[0].text == tokens_lower[0].text.title()
assert tokens_lower[0].text == tokens_title[0].text.lower()
assert tokens_lower[1].text == tokens_title[1].text
@pytest.mark.parametrize("pron", ["I", "You", "He", "She", "It", "We", "They"])
@pytest.mark.parametrize("contraction", ["'ll", "'d"])
def test_en_tokenizer_keeps_title_case(en_tokenizer, pron, contraction):
tokens = en_tokenizer(pron + contraction)
assert tokens[0].text == pron
assert tokens[1].text == contraction
@pytest.mark.parametrize("exc", ["Ill", "ill", "Hell", "hell", "Well", "well"])
def test_en_tokenizer_excludes_ambiguous(en_tokenizer, exc):
tokens = en_tokenizer(exc)
assert len(tokens) == 1
@pytest.mark.parametrize(
"wo_punct,w_punct", [("We've", "`We've"), ("couldn't", "couldn't)")]
)
def test_en_tokenizer_splits_defined_punct(en_tokenizer, wo_punct, w_punct):
tokens = en_tokenizer(wo_punct)
assert len(tokens) == 2
tokens = en_tokenizer(w_punct)
assert len(tokens) == 3
@pytest.mark.parametrize("text", ["e.g.", "p.m.", "Jan.", "Dec.", "Inc."])
def test_en_tokenizer_handles_abbr(en_tokenizer, text):
tokens = en_tokenizer(text)
assert len(tokens) == 1
def test_en_tokenizer_handles_exc_in_text(en_tokenizer):
text = "It's mediocre i.e. bad."
tokens = en_tokenizer(text)
assert len(tokens) == 6
assert tokens[3].text == "i.e."
@pytest.mark.parametrize("text", ["1am", "12a.m.", "11p.m.", "4pm"])
def test_en_tokenizer_handles_times(en_tokenizer, text):
tokens = en_tokenizer(text)
assert len(tokens) == 2
assert tokens[1].lemma_ in ["a.m.", "p.m."]
@pytest.mark.parametrize(
"text,norms", [("I'm", ["i", "am"]), ("shan't", ["shall", "not"])]
)
def test_en_tokenizer_norm_exceptions(en_tokenizer, text, norms):
tokens = en_tokenizer(text)
assert [token.norm_ for token in tokens] == norms
@pytest.mark.skip
@pytest.mark.parametrize(
"text,norm", [("radicalised", "radicalized"), ("cuz", "because")]
)
def test_en_lex_attrs_norm_exceptions(en_tokenizer, text, norm):
tokens = en_tokenizer(text)
assert tokens[0].norm_ == norm