Test suite clean up (#5781)

* step_through tests: skip instead of xfail

* test_empty_doc should be fixed with new Thinc version

* remove outdated test (there are other misaligned tests now)

* xfail reason

* fix test according to french exceptions

* clarified some skipped tests

* skip Ukrainian test instead of xfail

* skip instead of xfail

* skip + reason instead of xfail

* removed obsolete tests referring to removed "set_frozen" functionality

* fix test 999

* remove unused AlignmentError

* remove xfail where possible, skip otherwise

* increment thinc release for empty_doc test
Sofie Van Landeghem 2020-07-20 14:49:54 +02:00 committed by GitHub
parent 1b2ec94382
commit c9da9605f7
21 changed files with 63 additions and 119 deletions
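
The recurring change below swaps pytest's xfail marker for skip: an xfail'ed test still runs and is reported as an expected failure, whereas a skipped test is never executed, which better fits functionality that has been removed outright. A minimal sketch of the two markers for context (test names, reasons and assertions here are illustrative, not taken from this commit):

import pytest

# xfail: the body still executes; a failing assertion is reported as
# "expected to fail" instead of breaking the build
@pytest.mark.xfail(reason="known issue, not yet implemented")
def test_known_issue():
    assert "a-b".split("-") == ["a-b"]  # deliberately wrong, reported as xfail

# skip: the body is never executed; only the reason shows up in the report
@pytest.mark.skip(reason="the underlying API was removed (but should be brought back)")
def test_removed_api():
    assert False  # never reached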


@@ -6,7 +6,7 @@ requires = [
"cymem>=2.0.2,<2.1.0",
"preshed>=3.0.2,<3.1.0",
"murmurhash>=0.28.0,<1.1.0",
"thinc>=8.0.0a17,<8.0.0a20",
"thinc>=8.0.0a18,<8.0.0a20",
"blis>=0.4.0,<0.5.0",
"pytokenizations"
]


@@ -1,7 +1,7 @@
# Our libraries
cymem>=2.0.2,<2.1.0
preshed>=3.0.2,<3.1.0
thinc>=8.0.0a17,<8.0.0a20
thinc>=8.0.0a18,<8.0.0a20
blis>=0.4.0,<0.5.0
ml_datasets>=0.1.1
murmurhash>=0.28.0,<1.1.0


@@ -34,13 +34,13 @@ setup_requires =
cymem>=2.0.2,<2.1.0
preshed>=3.0.2,<3.1.0
murmurhash>=0.28.0,<1.1.0
thinc>=8.0.0a17,<8.0.0a20
thinc>=8.0.0a18,<8.0.0a20
install_requires =
# Our libraries
murmurhash>=0.28.0,<1.1.0
cymem>=2.0.2,<2.1.0
preshed>=3.0.2,<3.1.0
thinc>=8.0.0a17,<8.0.0a20
thinc>=8.0.0a18,<8.0.0a20
blis>=0.4.0,<0.5.0
wasabi>=0.7.0,<1.1.0
srsly>=2.1.0,<3.0.0


@@ -568,7 +568,3 @@ class MatchPatternError(ValueError):
pattern_errors = "\n".join([f"- {e}" for e in error_msgs])
msg += f"\nPattern {pattern_idx}:\n{pattern_errors}\n"
ValueError.__init__(self, msg)
class AlignmentError(ValueError):
pass


@@ -119,9 +119,8 @@ def test_en_tokenizer_splits_period_abbr(en_tokenizer):
assert tokens[4].text == "Mr."
@pytest.mark.xfail
@pytest.mark.xfail(reason="Issue #225 - not yet implemented")
def test_en_tokenizer_splits_em_dash_infix(en_tokenizer):
# Re Issue #225
tokens = en_tokenizer(
"""Will this road take me to Puddleton?\u2014No, """
"""you'll have to walk there.\u2014Ariel."""


@@ -14,7 +14,7 @@ def test_en_sbd_single_punct(en_tokenizer, text, punct):
assert sum(len(sent) for sent in doc.sents) == len(doc)
@pytest.mark.xfail
@pytest.mark.skip(reason="The step_through API was removed (but should be brought back)")
def test_en_sentence_breaks(en_tokenizer, en_parser):
# fmt: off
text = "This is a sentence . This is another one ."


@@ -81,13 +81,14 @@ def test_fr_tokenizer_handles_title(fr_tokenizer):
assert tokens[2].lemma_ == "ce"
@pytest.mark.xfail
def test_fr_tokenizer_handles_title_2(fr_tokenizer):
text = "Est-ce pas génial?"
tokens = fr_tokenizer(text)
assert len(tokens) == 6
assert len(tokens) == 5
assert tokens[0].text == "Est"
assert tokens[0].lemma_ == "être"
assert tokens[1].text == "-ce"
assert tokens[1].lemma_ == "ce"
def test_fr_tokenizer_handles_title_3(fr_tokenizer):


@@ -89,7 +89,7 @@ def test_uk_tokenizer_splits_open_appostrophe(uk_tokenizer, text):
assert tokens[0].text == "'"
@pytest.mark.xfail(reason="See #3327")
@pytest.mark.skip(reason="See Issue #3327 and PR #3329")
@pytest.mark.parametrize("text", ["Тест''"])
def test_uk_tokenizer_splits_double_end_quote(uk_tokenizer, text):
tokens = uk_tokenizer(text)


@@ -83,7 +83,7 @@ def test_get_oracle_moves_negative_entities2(tsys, vocab):
assert names
@pytest.mark.xfail(reason="Maybe outdated? Unsure")
@pytest.mark.skip(reason="Maybe outdated? Unsure")
def test_get_oracle_moves_negative_O(tsys, vocab):
doc = Doc(vocab, words=["A", "B", "C", "D"])
entity_annots = ["O", "!O", "O", "!O"]
@@ -95,7 +95,7 @@ def test_get_oracle_moves_negative_O(tsys, vocab):
# We can't easily represent this on a Doc object. Not sure what the best solution
# would be, but I don't think it's an important use case?
@pytest.mark.xfail(reason="No longer supported")
@pytest.mark.skip(reason="No longer supported")
def test_oracle_moves_missing_B(en_vocab):
words = ["B", "52", "Bomber"]
biluo_tags = [None, None, "L-PRODUCT"]
@@ -121,7 +121,7 @@ def test_oracle_moves_missing_B(en_vocab):
# We can't easily represent this on a Doc object. Not sure what the best solution
# would be, but I don't think it's an important use case?
@pytest.mark.xfail(reason="No longer supported")
@pytest.mark.skip(reason="No longer supported")
def test_oracle_moves_whitespace(en_vocab):
words = ["production", "\n", "of", "Northrop", "\n", "Corp.", "\n", "'s", "radar"]
biluo_tags = ["O", "O", "O", "B-ORG", None, "I-ORG", "L-ORG", "O", "O"]


@@ -82,13 +82,13 @@ def test_update_doc(parser, model, doc, gold):
parser.update([example], sgd=optimize)
@pytest.mark.xfail
@pytest.mark.skip(reason="No longer supported")
def test_predict_doc_beam(parser, model, doc):
parser.model = model
parser(doc, beam_width=32, beam_density=0.001)
@pytest.mark.xfail
@pytest.mark.skip(reason="No longer supported")
def test_update_doc_beam(parser, model, doc, gold):
parser.model = model


@@ -33,8 +33,8 @@ def test_parser_root(en_tokenizer):
assert t.dep != 0, t.text
@pytest.mark.xfail
# @pytest.mark.parametrize("text", ["Hello"])
@pytest.mark.skip(reason="The step_through API was removed (but should be brought back)")
@pytest.mark.parametrize("text", ["Hello"])
def test_parser_parse_one_word_sentence(en_tokenizer, en_parser, text):
tokens = en_tokenizer(text)
doc = get_doc(
@@ -47,8 +47,7 @@ def test_parser_parse_one_word_sentence(en_tokenizer, en_parser, text):
assert doc[0].dep != 0
# We removed the step_through API a while ago. we should bring it back though
@pytest.mark.xfail(reason="Unsupported")
@pytest.mark.skip(reason="The step_through API was removed (but should be brought back)")
def test_parser_initial(en_tokenizer, en_parser):
text = "I ate the pizza with anchovies."
# heads = [1, 0, 1, -2, -3, -1, -5]
@@ -93,8 +92,7 @@ def test_parser_merge_pp(en_tokenizer):
assert doc[3].text == "occurs"
# We removed the step_through API a while ago. we should bring it back though
@pytest.mark.xfail(reason="Unsupported")
@pytest.mark.skip(reason="The step_through API was removed (but should be brought back)")
def test_parser_arc_eager_finalize_state(en_tokenizer, en_parser):
text = "a b c d e"


@@ -28,7 +28,7 @@ def test_parser_sentence_space(en_tokenizer):
assert len(list(doc.sents)) == 2
@pytest.mark.xfail
@pytest.mark.skip(reason="The step_through API was removed (but should be brought back)")
def test_parser_space_attachment_leading(en_tokenizer, en_parser):
text = "\t \n This is a sentence ."
heads = [1, 1, 0, 1, -2, -3]
@@ -44,7 +44,7 @@ def test_parser_space_attachment_leading(en_tokenizer, en_parser):
assert stepwise.stack == set([2])
@pytest.mark.xfail
@pytest.mark.skip(reason="The step_through API was removed (but should be brought back)")
def test_parser_space_attachment_intermediate_trailing(en_tokenizer, en_parser):
text = "This is \t a \t\n \n sentence . \n\n \n"
heads = [1, 0, -1, 2, -1, -4, -5, -1]
@@ -64,7 +64,7 @@ def test_parser_space_attachment_intermediate_trailing(en_tokenizer, en_parser):
@pytest.mark.parametrize("text,length", [(["\n"], 1), (["\n", "\t", "\n\n", "\t"], 4)])
@pytest.mark.xfail
@pytest.mark.skip(reason="The step_through API was removed (but should be brought back)")
def test_parser_space_attachment_space(en_tokenizer, en_parser, text, length):
doc = Doc(en_parser.vocab, words=text)
assert len(doc) == length


@@ -1,10 +1,13 @@
import pytest
import random
from spacy import util
from spacy.gold import Example
from spacy.matcher import Matcher
from spacy.attrs import IS_PUNCT, ORTH, LOWER
from spacy.symbols import POS, VERB
from spacy.vocab import Vocab
from spacy.language import Language
from spacy.lang.en import English
from spacy.lemmatizer import Lemmatizer
from spacy.lookups import Lookups
from spacy.tokens import Doc, Span
@@ -141,14 +144,6 @@ def test_issue588(en_vocab):
matcher.add("TEST", [[]])
@pytest.mark.xfail
def test_issue589():
vocab = Vocab()
vocab.strings.set_frozen(True)
doc = Doc(vocab, words=["whata"])
assert doc
def test_issue590(en_vocab):
"""Test overlapping matches"""
doc = Doc(en_vocab, words=["n", "=", "1", ";", "a", ":", "5", "%"])
@@ -285,7 +280,7 @@ def test_control_issue792(en_tokenizer, text):
assert "".join([token.text_with_ws for token in doc]) == text
@pytest.mark.xfail
@pytest.mark.skip(reason="Can not be fixed unless with variable-width lookbehinds, cf. PR #3218")
@pytest.mark.parametrize(
"text,tokens",
[
@@ -417,8 +412,7 @@ def test_issue957(en_tokenizer):
assert doc
@pytest.mark.xfail
def test_issue999(train_data):
def test_issue999():
"""Test that adding entities and resuming training works passably OK.
There are two issues here:
1) We have to re-add labels. This isn't very nice.
@@ -432,27 +426,27 @@ def test_issue999(train_data):
["hello", []],
["hi", []],
["i'm looking for a place to eat", []],
["i'm looking for a place in the north of town", [[31, 36, "LOCATION"]]],
["show me chinese restaurants", [[8, 15, "CUISINE"]]],
["show me chines restaurants", [[8, 14, "CUISINE"]]],
["i'm looking for a place in the north of town", [(31, 36, "LOCATION")]],
["show me chinese restaurants", [(8, 15, "CUISINE")]],
["show me chines restaurants", [(8, 14, "CUISINE")]],
]
nlp = Language()
ner = nlp.create_pipe("ner")
nlp = English()
ner = nlp.create_pipe("ner", {"learn_rate": 0.001}) # will need to be {"model": ...} in upcoming PR
nlp.add_pipe(ner)
for _, offsets in TRAIN_DATA:
for start, end, label in offsets:
ner.add_label(label)
nlp.begin_training()
ner.model.learn_rate = 0.001
for itn in range(100):
for itn in range(20):
random.shuffle(TRAIN_DATA)
for raw_text, entity_offsets in TRAIN_DATA:
nlp.update((raw_text, {"entities": entity_offsets}))
example = Example.from_dict(nlp.make_doc(raw_text), {"entities": entity_offsets})
nlp.update([example])
with make_tempdir() as model_dir:
nlp.to_disk(model_dir)
nlp2 = Language().from_disk(model_dir)
nlp2 = util.load_model_from_path(model_dir)
for raw_text, entity_offsets in TRAIN_DATA:
doc = nlp2(raw_text)
@@ -461,6 +455,6 @@ def test_issue999(train_data):
if (start, end) in ents:
assert ents[(start, end)] == label
break
else:
if entity_offsets:
raise Exception(ents)
else:
if entity_offsets:
raise Exception(ents)


@@ -32,8 +32,8 @@ def test_issue1061():
assert "MATH" not in [w.text for w in doc]
@pytest.mark.xfail(
reason="g is split of as a unit, as the suffix regular expression can not look back further (variable-width)"
@pytest.mark.skip(
reason="Can not be fixed without variable-width look-behind (which we don't want)"
)
def test_issue1235():
"""Test that g is not split of if preceded by a number and a letter"""


@@ -10,7 +10,9 @@ from spacy.lang.en import English
from ..util import add_vecs_to_vocab, get_doc
@pytest.mark.xfail
@pytest.mark.skip(
reason="Can not be fixed without iterative looping between prefix/suffix and infix"
)
def test_issue2070():
"""Test that checks that a dot followed by a quote is handled
appropriately.


@@ -226,7 +226,7 @@ def test_issue3412():
assert best_rows[0] == 2
@pytest.mark.xfail(reason="default suffix rules avoid one upper-case letter before dot")
@pytest.mark.skip(reason="default suffix rules avoid one upper-case letter before dot")
def test_issue3449():
nlp = English()
nlp.add_pipe(nlp.create_pipe("sentencizer"))


@@ -178,12 +178,12 @@ def test_issue3549(en_vocab):
matcher.add("BAD", [[{"X": "Y"}]])
@pytest.mark.xfail
@pytest.mark.skip("Matching currently only works on strings and integers")
def test_issue3555(en_vocab):
"""Test that custom extensions with default None don't break matcher."""
Token.set_extension("issue3555", default=None)
matcher = Matcher(en_vocab)
pattern = [{"LEMMA": "have"}, {"_": {"issue3555": True}}]
pattern = [{"ORTH": "have"}, {"_": {"issue3555": True}}]
matcher.add("TEST", [pattern])
doc = Doc(en_vocab, words=["have", "apple"])
matcher(doc)


@@ -11,7 +11,6 @@ test_strings_attrs = [(["rats", "are", "cute"], "Hello")]
default_strings = ("_SP", "POS=SPACE")
@pytest.mark.xfail
@pytest.mark.parametrize("text", ["rat"])
def test_serialize_vocab(en_vocab, text):
text_hash = en_vocab.strings.add(text)


@@ -1,5 +1,4 @@
import numpy
from spacy.errors import AlignmentError
from spacy.gold import biluo_tags_from_offsets, offsets_from_biluo_tags
from spacy.gold import spans_from_biluo_tags, iob_to_biluo
from spacy.gold import Corpus, docs_to_json
@@ -544,42 +543,6 @@ def test_roundtrip_docs_to_docbin(doc):
assert cats["BAKING"] == reloaded_example.reference.cats["BAKING"]
# Hm, not sure where misalignment check would be handled? In the components too?
# I guess that does make sense. A text categorizer doesn't care if it's
# misaligned...
@pytest.mark.xfail(reason="Outdated")
def test_ignore_misaligned(doc):
nlp = English()
text = doc.text
with make_tempdir() as tmpdir:
json_file = tmpdir / "test.json"
data = [docs_to_json(doc)]
data[0]["paragraphs"][0]["raw"] = text.replace("Sarah", "Jane")
# write to JSON train dicts
srsly.write_json(json_file, data)
goldcorpus = Corpus(str(json_file), str(json_file))
with pytest.raises(AlignmentError):
train_reloaded_example = next(goldcorpus.train_dataset(nlp))
with make_tempdir() as tmpdir:
json_file = tmpdir / "test.json"
data = [docs_to_json(doc)]
data[0]["paragraphs"][0]["raw"] = text.replace("Sarah", "Jane")
# write to JSON train dicts
srsly.write_json(json_file, data)
goldcorpus = Corpus(str(json_file), str(json_file))
# doesn't raise an AlignmentError, but there is nothing to iterate over
# because the only example can't be aligned
train_reloaded_example = list(
goldcorpus.train_dataset(nlp, ignore_misaligned=True)
)
assert len(train_reloaded_example) == 0
# We probably want the orth variant logic back, but this test won't be quite
# right -- we need to go from DocBin.
def test_make_orth_variants(doc):
nlp = English()
with make_tempdir() as tmpdir:


@@ -7,15 +7,26 @@ from spacy.tokens import Doc
from .util import get_batch
# This fails in Thinc v7.3.1. Need to push patch
@pytest.mark.xfail
def test_empty_doc():
width = 128
embed_size = 2000
vocab = Vocab()
doc = Doc(vocab, words=[])
# TODO: fix tok2vec arguments
tok2vec = build_Tok2Vec_model(width, embed_size, dropout=None)
tok2vec = build_Tok2Vec_model(
width,
embed_size,
pretrained_vectors=None,
conv_depth=4,
bilstm_depth=0,
window_size=1,
maxout_pieces=3,
subword_features=True,
char_embed=False,
nM=64,
nC=8,
dropout=None,
)
tok2vec.initialize()
vectors, backprop = tok2vec.begin_update([doc])
assert len(vectors) == 1
assert vectors[0].shape == (0, width)


@@ -95,22 +95,3 @@ def test_stringstore_to_bytes(stringstore, text):
serialized = stringstore.to_bytes()
new_stringstore = StringStore().from_bytes(serialized)
assert new_stringstore[store] == text
@pytest.mark.xfail
@pytest.mark.parametrize("text", [["a", "b", "c"]])
def test_stringstore_freeze_oov(stringstore, text):
"""Test the possibly temporary workaround of flushing the stringstore of
OOV words."""
assert stringstore[text[0]] == 1
assert stringstore[text[1]] == 2
stringstore.set_frozen(True)
s = stringstore[text[2]]
assert s >= 4
s_ = stringstore[s]
assert s_ == text[2]
stringstore.flush_oov()
with pytest.raises(IndexError):
s_ = stringstore[s]