Test suite clean up (#5781)

* step_through tests: skip instead of xfail

* test_empty_doc should be fixed with new Thinc version

* remove outdated test (other tests now cover misaligned data)

* xfail reason

* fix test according to french exceptions

* clarified some skipped tests

* skip Ukrainian test instead of xfail

* skip instead of xfail

* skip + reason instead of xfail

* removed obsolete tests referring to removed "set_frozen" functionality

* fix test 999

* remove unused AlignmentError

* remove xfail where possible, skip otherwise

* increment thinc release for empty_doc test
Sofie Van Landeghem 2020-07-20 14:49:54 +02:00 committed by GitHub
parent 1b2ec94382
commit c9da9605f7
21 changed files with 63 additions and 119 deletions
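
The pattern applied throughout this commit, sketched here with plain pytest markers (the test names below are illustrative only, not taken from the diff): xfail still collects and runs a test and reports it as an expected failure, while skip never runs it and surfaces the reason in the test report, which is the better choice for functionality that was deliberately removed.

import pytest

# xfail: the test still runs and is reported as an expected failure;
# appropriate when the behaviour is wanted but not implemented yet.
@pytest.mark.xfail(reason="Issue #225 - not yet implemented")
def test_known_limitation():
    ...

# skip: the test is never executed and the reason shows up in the report;
# used in this commit for tests of removed features such as the step_through API.
@pytest.mark.skip(reason="The step_through API was removed (but should be brought back)")
def test_removed_functionality():
    ...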

View File

@@ -6,7 +6,7 @@ requires = [
    "cymem>=2.0.2,<2.1.0",
    "preshed>=3.0.2,<3.1.0",
    "murmurhash>=0.28.0,<1.1.0",
-    "thinc>=8.0.0a17,<8.0.0a20",
+    "thinc>=8.0.0a18,<8.0.0a20",
    "blis>=0.4.0,<0.5.0",
    "pytokenizations"
]

View File

@@ -1,7 +1,7 @@
# Our libraries
cymem>=2.0.2,<2.1.0
preshed>=3.0.2,<3.1.0
-thinc>=8.0.0a17,<8.0.0a20
+thinc>=8.0.0a18,<8.0.0a20
blis>=0.4.0,<0.5.0
ml_datasets>=0.1.1
murmurhash>=0.28.0,<1.1.0

View File

@@ -34,13 +34,13 @@ setup_requires =
    cymem>=2.0.2,<2.1.0
    preshed>=3.0.2,<3.1.0
    murmurhash>=0.28.0,<1.1.0
-    thinc>=8.0.0a17,<8.0.0a20
+    thinc>=8.0.0a18,<8.0.0a20
install_requires =
    # Our libraries
    murmurhash>=0.28.0,<1.1.0
    cymem>=2.0.2,<2.1.0
    preshed>=3.0.2,<3.1.0
-    thinc>=8.0.0a17,<8.0.0a20
+    thinc>=8.0.0a18,<8.0.0a20
    blis>=0.4.0,<0.5.0
    wasabi>=0.7.0,<1.1.0
    srsly>=2.1.0,<3.0.0

View File

@@ -568,7 +568,3 @@ class MatchPatternError(ValueError):
        pattern_errors = "\n".join([f"- {e}" for e in error_msgs])
        msg += f"\nPattern {pattern_idx}:\n{pattern_errors}\n"
        ValueError.__init__(self, msg)
-
-
-class AlignmentError(ValueError):
-    pass

View File

@@ -119,9 +119,8 @@ def test_en_tokenizer_splits_period_abbr(en_tokenizer):
    assert tokens[4].text == "Mr."


-@pytest.mark.xfail
+@pytest.mark.xfail(reason="Issue #225 - not yet implemented")
def test_en_tokenizer_splits_em_dash_infix(en_tokenizer):
-    # Re Issue #225
    tokens = en_tokenizer(
        """Will this road take me to Puddleton?\u2014No, """
        """you'll have to walk there.\u2014Ariel."""

View File

@@ -14,7 +14,7 @@ def test_en_sbd_single_punct(en_tokenizer, text, punct):
    assert sum(len(sent) for sent in doc.sents) == len(doc)


-@pytest.mark.xfail
+@pytest.mark.skip(reason="The step_through API was removed (but should be brought back)")
def test_en_sentence_breaks(en_tokenizer, en_parser):
    # fmt: off
    text = "This is a sentence . This is another one ."

View File

@@ -81,13 +81,14 @@ def test_fr_tokenizer_handles_title(fr_tokenizer):
    assert tokens[2].lemma_ == "ce"


-@pytest.mark.xfail
def test_fr_tokenizer_handles_title_2(fr_tokenizer):
    text = "Est-ce pas génial?"
    tokens = fr_tokenizer(text)
-    assert len(tokens) == 6
+    assert len(tokens) == 5
    assert tokens[0].text == "Est"
    assert tokens[0].lemma_ == "être"
+    assert tokens[1].text == "-ce"
+    assert tokens[1].lemma_ == "ce"


def test_fr_tokenizer_handles_title_3(fr_tokenizer):

View File

@@ -89,7 +89,7 @@ def test_uk_tokenizer_splits_open_appostrophe(uk_tokenizer, text):
    assert tokens[0].text == "'"


-@pytest.mark.xfail(reason="See #3327")
+@pytest.mark.skip(reason="See Issue #3327 and PR #3329")
@pytest.mark.parametrize("text", ["Тест''"])
def test_uk_tokenizer_splits_double_end_quote(uk_tokenizer, text):
    tokens = uk_tokenizer(text)

View File

@@ -83,7 +83,7 @@ def test_get_oracle_moves_negative_entities2(tsys, vocab):
    assert names


-@pytest.mark.xfail(reason="Maybe outdated? Unsure")
+@pytest.mark.skip(reason="Maybe outdated? Unsure")
def test_get_oracle_moves_negative_O(tsys, vocab):
    doc = Doc(vocab, words=["A", "B", "C", "D"])
    entity_annots = ["O", "!O", "O", "!O"]
@@ -95,7 +95,7 @@ def test_get_oracle_moves_negative_O(tsys, vocab):
# We can't easily represent this on a Doc object. Not sure what the best solution
# would be, but I don't think it's an important use case?
-@pytest.mark.xfail(reason="No longer supported")
+@pytest.mark.skip(reason="No longer supported")
def test_oracle_moves_missing_B(en_vocab):
    words = ["B", "52", "Bomber"]
    biluo_tags = [None, None, "L-PRODUCT"]
@@ -121,7 +121,7 @@ def test_oracle_moves_missing_B(en_vocab):
# We can't easily represent this on a Doc object. Not sure what the best solution
# would be, but I don't think it's an important use case?
-@pytest.mark.xfail(reason="No longer supported")
+@pytest.mark.skip(reason="No longer supported")
def test_oracle_moves_whitespace(en_vocab):
    words = ["production", "\n", "of", "Northrop", "\n", "Corp.", "\n", "'s", "radar"]
    biluo_tags = ["O", "O", "O", "B-ORG", None, "I-ORG", "L-ORG", "O", "O"]

View File

@@ -82,13 +82,13 @@ def test_update_doc(parser, model, doc, gold):
    parser.update([example], sgd=optimize)


-@pytest.mark.xfail
+@pytest.mark.skip(reason="No longer supported")
def test_predict_doc_beam(parser, model, doc):
    parser.model = model
    parser(doc, beam_width=32, beam_density=0.001)


-@pytest.mark.xfail
+@pytest.mark.skip(reason="No longer supported")
def test_update_doc_beam(parser, model, doc, gold):
    parser.model = model

View File

@@ -33,8 +33,8 @@ def test_parser_root(en_tokenizer):
        assert t.dep != 0, t.text


-@pytest.mark.xfail
-# @pytest.mark.parametrize("text", ["Hello"])
+@pytest.mark.skip(reason="The step_through API was removed (but should be brought back)")
+@pytest.mark.parametrize("text", ["Hello"])
def test_parser_parse_one_word_sentence(en_tokenizer, en_parser, text):
    tokens = en_tokenizer(text)
    doc = get_doc(
@@ -47,8 +47,7 @@ def test_parser_parse_one_word_sentence(en_tokenizer, en_parser, text):
    assert doc[0].dep != 0


-# We removed the step_through API a while ago. we should bring it back though
-@pytest.mark.xfail(reason="Unsupported")
+@pytest.mark.skip(reason="The step_through API was removed (but should be brought back)")
def test_parser_initial(en_tokenizer, en_parser):
    text = "I ate the pizza with anchovies."
    # heads = [1, 0, 1, -2, -3, -1, -5]
@@ -93,8 +92,7 @@ def test_parser_merge_pp(en_tokenizer):
    assert doc[3].text == "occurs"


-# We removed the step_through API a while ago. we should bring it back though
-@pytest.mark.xfail(reason="Unsupported")
+@pytest.mark.skip(reason="The step_through API was removed (but should be brought back)")
def test_parser_arc_eager_finalize_state(en_tokenizer, en_parser):
    text = "a b c d e"

View File

@@ -28,7 +28,7 @@ def test_parser_sentence_space(en_tokenizer):
    assert len(list(doc.sents)) == 2


-@pytest.mark.xfail
+@pytest.mark.skip(reason="The step_through API was removed (but should be brought back)")
def test_parser_space_attachment_leading(en_tokenizer, en_parser):
    text = "\t \n This is a sentence ."
    heads = [1, 1, 0, 1, -2, -3]
@@ -44,7 +44,7 @@ def test_parser_space_attachment_leading(en_tokenizer, en_parser):
        assert stepwise.stack == set([2])


-@pytest.mark.xfail
+@pytest.mark.skip(reason="The step_through API was removed (but should be brought back)")
def test_parser_space_attachment_intermediate_trailing(en_tokenizer, en_parser):
    text = "This is \t a \t\n \n sentence . \n\n \n"
    heads = [1, 0, -1, 2, -1, -4, -5, -1]
@@ -64,7 +64,7 @@ def test_parser_space_attachment_intermediate_trailing(en_tokenizer, en_parser):
@pytest.mark.parametrize("text,length", [(["\n"], 1), (["\n", "\t", "\n\n", "\t"], 4)])
-@pytest.mark.xfail
+@pytest.mark.skip(reason="The step_through API was removed (but should be brought back)")
def test_parser_space_attachment_space(en_tokenizer, en_parser, text, length):
    doc = Doc(en_parser.vocab, words=text)
    assert len(doc) == length

View File

@@ -1,10 +1,13 @@
import pytest
import random
+from spacy import util
+from spacy.gold import Example
from spacy.matcher import Matcher
from spacy.attrs import IS_PUNCT, ORTH, LOWER
from spacy.symbols import POS, VERB
from spacy.vocab import Vocab
-from spacy.language import Language
+from spacy.lang.en import English
from spacy.lemmatizer import Lemmatizer
from spacy.lookups import Lookups
from spacy.tokens import Doc, Span
@@ -141,14 +144,6 @@ def test_issue588(en_vocab):
    matcher.add("TEST", [[]])


-@pytest.mark.xfail
-def test_issue589():
-    vocab = Vocab()
-    vocab.strings.set_frozen(True)
-    doc = Doc(vocab, words=["whata"])
-    assert doc
-
-
def test_issue590(en_vocab):
    """Test overlapping matches"""
    doc = Doc(en_vocab, words=["n", "=", "1", ";", "a", ":", "5", "%"])
@@ -285,7 +280,7 @@ def test_control_issue792(en_tokenizer, text):
    assert "".join([token.text_with_ws for token in doc]) == text


-@pytest.mark.xfail
+@pytest.mark.skip(reason="Can not be fixed unless with variable-width lookbehinds, cf. PR #3218")
@pytest.mark.parametrize(
    "text,tokens",
    [
@@ -417,8 +412,7 @@ def test_issue957(en_tokenizer):
    assert doc


-@pytest.mark.xfail
-def test_issue999(train_data):
+def test_issue999():
    """Test that adding entities and resuming training works passably OK.
    There are two issues here:
    1) We have to re-add labels. This isn't very nice.
@@ -432,27 +426,27 @@ def test_issue999(train_data):
        ["hello", []],
        ["hi", []],
        ["i'm looking for a place to eat", []],
-        ["i'm looking for a place in the north of town", [[31, 36, "LOCATION"]]],
-        ["show me chinese restaurants", [[8, 15, "CUISINE"]]],
-        ["show me chines restaurants", [[8, 14, "CUISINE"]]],
+        ["i'm looking for a place in the north of town", [(31, 36, "LOCATION")]],
+        ["show me chinese restaurants", [(8, 15, "CUISINE")]],
+        ["show me chines restaurants", [(8, 14, "CUISINE")]],
    ]
-    nlp = Language()
-    ner = nlp.create_pipe("ner")
+    nlp = English()
+    ner = nlp.create_pipe("ner", {"learn_rate": 0.001})  # will need to be {"model": ...} in upcoming PR
    nlp.add_pipe(ner)
    for _, offsets in TRAIN_DATA:
        for start, end, label in offsets:
            ner.add_label(label)
    nlp.begin_training()
-    ner.model.learn_rate = 0.001
-    for itn in range(100):
+    for itn in range(20):
        random.shuffle(TRAIN_DATA)
        for raw_text, entity_offsets in TRAIN_DATA:
-            nlp.update((raw_text, {"entities": entity_offsets}))
+            example = Example.from_dict(nlp.make_doc(raw_text), {"entities": entity_offsets})
+            nlp.update([example])
    with make_tempdir() as model_dir:
        nlp.to_disk(model_dir)
-        nlp2 = Language().from_disk(model_dir)
+        nlp2 = util.load_model_from_path(model_dir)
    for raw_text, entity_offsets in TRAIN_DATA:
        doc = nlp2(raw_text)

View File

@@ -32,8 +32,8 @@ def test_issue1061():
    assert "MATH" not in [w.text for w in doc]


-@pytest.mark.xfail(
-    reason="g is split of as a unit, as the suffix regular expression can not look back further (variable-width)"
+@pytest.mark.skip(
+    reason="Can not be fixed without variable-width look-behind (which we don't want)"
)
def test_issue1235():
    """Test that g is not split of if preceded by a number and a letter"""

View File

@@ -10,7 +10,9 @@ from spacy.lang.en import English
from ..util import add_vecs_to_vocab, get_doc


-@pytest.mark.xfail
+@pytest.mark.skip(
+    reason="Can not be fixed without iterative looping between prefix/suffix and infix"
+)
def test_issue2070():
    """Test that checks that a dot followed by a quote is handled
    appropriately.

View File

@@ -226,7 +226,7 @@ def test_issue3412():
    assert best_rows[0] == 2


-@pytest.mark.xfail(reason="default suffix rules avoid one upper-case letter before dot")
+@pytest.mark.skip(reason="default suffix rules avoid one upper-case letter before dot")
def test_issue3449():
    nlp = English()
    nlp.add_pipe(nlp.create_pipe("sentencizer"))

View File

@@ -178,12 +178,12 @@ def test_issue3549(en_vocab):
    matcher.add("BAD", [[{"X": "Y"}]])


-@pytest.mark.xfail
+@pytest.mark.skip("Matching currently only works on strings and integers")
def test_issue3555(en_vocab):
    """Test that custom extensions with default None don't break matcher."""
    Token.set_extension("issue3555", default=None)
    matcher = Matcher(en_vocab)
-    pattern = [{"LEMMA": "have"}, {"_": {"issue3555": True}}]
+    pattern = [{"ORTH": "have"}, {"_": {"issue3555": True}}]
    matcher.add("TEST", [pattern])
    doc = Doc(en_vocab, words=["have", "apple"])
    matcher(doc)

View File

@@ -11,7 +11,6 @@ test_strings_attrs = [(["rats", "are", "cute"], "Hello")]
default_strings = ("_SP", "POS=SPACE")


-@pytest.mark.xfail
@pytest.mark.parametrize("text", ["rat"])
def test_serialize_vocab(en_vocab, text):
    text_hash = en_vocab.strings.add(text)

View File

@@ -1,5 +1,4 @@
import numpy
-from spacy.errors import AlignmentError
from spacy.gold import biluo_tags_from_offsets, offsets_from_biluo_tags
from spacy.gold import spans_from_biluo_tags, iob_to_biluo
from spacy.gold import Corpus, docs_to_json
@@ -544,42 +543,6 @@ def test_roundtrip_docs_to_docbin(doc):
    assert cats["BAKING"] == reloaded_example.reference.cats["BAKING"]


-# Hm, not sure where misalignment check would be handled? In the components too?
-# I guess that does make sense. A text categorizer doesn't care if it's
-# misaligned...
-@pytest.mark.xfail(reason="Outdated")
-def test_ignore_misaligned(doc):
-    nlp = English()
-    text = doc.text
-    with make_tempdir() as tmpdir:
-        json_file = tmpdir / "test.json"
-        data = [docs_to_json(doc)]
-        data[0]["paragraphs"][0]["raw"] = text.replace("Sarah", "Jane")
-        # write to JSON train dicts
-        srsly.write_json(json_file, data)
-        goldcorpus = Corpus(str(json_file), str(json_file))
-        with pytest.raises(AlignmentError):
-            train_reloaded_example = next(goldcorpus.train_dataset(nlp))
-    with make_tempdir() as tmpdir:
-        json_file = tmpdir / "test.json"
-        data = [docs_to_json(doc)]
-        data[0]["paragraphs"][0]["raw"] = text.replace("Sarah", "Jane")
-        # write to JSON train dicts
-        srsly.write_json(json_file, data)
-        goldcorpus = Corpus(str(json_file), str(json_file))
-        # doesn't raise an AlignmentError, but there is nothing to iterate over
-        # because the only example can't be aligned
-        train_reloaded_example = list(
-            goldcorpus.train_dataset(nlp, ignore_misaligned=True)
-        )
-        assert len(train_reloaded_example) == 0
-
-
-# We probably want the orth variant logic back, but this test won't be quite
-# right -- we need to go from DocBin.
def test_make_orth_variants(doc):
    nlp = English()
    with make_tempdir() as tmpdir:

View File

@@ -7,15 +7,26 @@ from spacy.tokens import Doc
from .util import get_batch


-# This fails in Thinc v7.3.1. Need to push patch
-@pytest.mark.xfail
def test_empty_doc():
    width = 128
    embed_size = 2000
    vocab = Vocab()
    doc = Doc(vocab, words=[])
-    # TODO: fix tok2vec arguments
-    tok2vec = build_Tok2Vec_model(width, embed_size, dropout=None)
+    tok2vec = build_Tok2Vec_model(
+        width,
+        embed_size,
+        pretrained_vectors=None,
+        conv_depth=4,
+        bilstm_depth=0,
+        window_size=1,
+        maxout_pieces=3,
+        subword_features=True,
+        char_embed=False,
+        nM=64,
+        nC=8,
+        dropout=None,
+    )
+    tok2vec.initialize()
    vectors, backprop = tok2vec.begin_update([doc])
    assert len(vectors) == 1
    assert vectors[0].shape == (0, width)

View File

@@ -95,22 +95,3 @@ def test_stringstore_to_bytes(stringstore, text):
    serialized = stringstore.to_bytes()
    new_stringstore = StringStore().from_bytes(serialized)
    assert new_stringstore[store] == text
-
-
-@pytest.mark.xfail
-@pytest.mark.parametrize("text", [["a", "b", "c"]])
-def test_stringstore_freeze_oov(stringstore, text):
-    """Test the possibly temporary workaround of flushing the stringstore of
-    OOV words."""
-    assert stringstore[text[0]] == 1
-    assert stringstore[text[1]] == 2
-    stringstore.set_frozen(True)
-    s = stringstore[text[2]]
-    assert s >= 4
-    s_ = stringstore[s]
-    assert s_ == text[2]
-    stringstore.flush_oov()
-    with pytest.raises(IndexError):
-        s_ = stringstore[s]