mirror of
https://github.com/explosion/spaCy.git
synced 2025-01-23 15:54:13 +03:00
Test suite clean up (#5781)
* step_through tests: skip instead of xfail * test_empty_doc should be fixed with new Thinc version * remove outdated test (there are other misaligned tests now) * xfail reason * fix test according to French exceptions * clarified some skipped tests * skip Ukrainian test instead of xfail * skip instead of xfail * skip + reason instead of xfail * removed obsolete tests referring to removed "set_frozen" functionality * fix test 999 * remove unused AlignmentError * remove xfail where possible, skip otherwise * increment thinc release for empty_doc test
This commit is contained in:
parent
1b2ec94382
commit
c9da9605f7
|
@ -6,7 +6,7 @@ requires = [
|
|||
"cymem>=2.0.2,<2.1.0",
|
||||
"preshed>=3.0.2,<3.1.0",
|
||||
"murmurhash>=0.28.0,<1.1.0",
|
||||
"thinc>=8.0.0a17,<8.0.0a20",
|
||||
"thinc>=8.0.0a18,<8.0.0a20",
|
||||
"blis>=0.4.0,<0.5.0",
|
||||
"pytokenizations"
|
||||
]
|
||||
|
|
|
@ -1,7 +1,7 @@
|
|||
# Our libraries
|
||||
cymem>=2.0.2,<2.1.0
|
||||
preshed>=3.0.2,<3.1.0
|
||||
thinc>=8.0.0a17,<8.0.0a20
|
||||
thinc>=8.0.0a18,<8.0.0a20
|
||||
blis>=0.4.0,<0.5.0
|
||||
ml_datasets>=0.1.1
|
||||
murmurhash>=0.28.0,<1.1.0
|
||||
|
|
|
@ -34,13 +34,13 @@ setup_requires =
|
|||
cymem>=2.0.2,<2.1.0
|
||||
preshed>=3.0.2,<3.1.0
|
||||
murmurhash>=0.28.0,<1.1.0
|
||||
thinc>=8.0.0a17,<8.0.0a20
|
||||
thinc>=8.0.0a18,<8.0.0a20
|
||||
install_requires =
|
||||
# Our libraries
|
||||
murmurhash>=0.28.0,<1.1.0
|
||||
cymem>=2.0.2,<2.1.0
|
||||
preshed>=3.0.2,<3.1.0
|
||||
thinc>=8.0.0a17,<8.0.0a20
|
||||
thinc>=8.0.0a18,<8.0.0a20
|
||||
blis>=0.4.0,<0.5.0
|
||||
wasabi>=0.7.0,<1.1.0
|
||||
srsly>=2.1.0,<3.0.0
|
||||
|
|
|
@ -568,7 +568,3 @@ class MatchPatternError(ValueError):
|
|||
pattern_errors = "\n".join([f"- {e}" for e in error_msgs])
|
||||
msg += f"\nPattern {pattern_idx}:\n{pattern_errors}\n"
|
||||
ValueError.__init__(self, msg)
|
||||
|
||||
|
||||
class AlignmentError(ValueError):
|
||||
pass
|
||||
|
|
|
@ -119,9 +119,8 @@ def test_en_tokenizer_splits_period_abbr(en_tokenizer):
|
|||
assert tokens[4].text == "Mr."
|
||||
|
||||
|
||||
@pytest.mark.xfail
|
||||
@pytest.mark.xfail(reason="Issue #225 - not yet implemented")
|
||||
def test_en_tokenizer_splits_em_dash_infix(en_tokenizer):
|
||||
# Re Issue #225
|
||||
tokens = en_tokenizer(
|
||||
"""Will this road take me to Puddleton?\u2014No, """
|
||||
"""you'll have to walk there.\u2014Ariel."""
|
||||
|
|
|
@ -14,7 +14,7 @@ def test_en_sbd_single_punct(en_tokenizer, text, punct):
|
|||
assert sum(len(sent) for sent in doc.sents) == len(doc)
|
||||
|
||||
|
||||
@pytest.mark.xfail
|
||||
@pytest.mark.skip(reason="The step_through API was removed (but should be brought back)")
|
||||
def test_en_sentence_breaks(en_tokenizer, en_parser):
|
||||
# fmt: off
|
||||
text = "This is a sentence . This is another one ."
|
||||
|
|
|
@ -81,13 +81,14 @@ def test_fr_tokenizer_handles_title(fr_tokenizer):
|
|||
assert tokens[2].lemma_ == "ce"
|
||||
|
||||
|
||||
@pytest.mark.xfail
|
||||
def test_fr_tokenizer_handles_title_2(fr_tokenizer):
|
||||
text = "Est-ce pas génial?"
|
||||
tokens = fr_tokenizer(text)
|
||||
assert len(tokens) == 6
|
||||
assert len(tokens) == 5
|
||||
assert tokens[0].text == "Est"
|
||||
assert tokens[0].lemma_ == "être"
|
||||
assert tokens[1].text == "-ce"
|
||||
assert tokens[1].lemma_ == "ce"
|
||||
|
||||
|
||||
def test_fr_tokenizer_handles_title_3(fr_tokenizer):
|
||||
|
|
|
@ -89,7 +89,7 @@ def test_uk_tokenizer_splits_open_appostrophe(uk_tokenizer, text):
|
|||
assert tokens[0].text == "'"
|
||||
|
||||
|
||||
@pytest.mark.xfail(reason="See #3327")
|
||||
@pytest.mark.skip(reason="See Issue #3327 and PR #3329")
|
||||
@pytest.mark.parametrize("text", ["Тест''"])
|
||||
def test_uk_tokenizer_splits_double_end_quote(uk_tokenizer, text):
|
||||
tokens = uk_tokenizer(text)
|
||||
|
|
|
@ -83,7 +83,7 @@ def test_get_oracle_moves_negative_entities2(tsys, vocab):
|
|||
assert names
|
||||
|
||||
|
||||
@pytest.mark.xfail(reason="Maybe outdated? Unsure")
|
||||
@pytest.mark.skip(reason="Maybe outdated? Unsure")
|
||||
def test_get_oracle_moves_negative_O(tsys, vocab):
|
||||
doc = Doc(vocab, words=["A", "B", "C", "D"])
|
||||
entity_annots = ["O", "!O", "O", "!O"]
|
||||
|
@ -95,7 +95,7 @@ def test_get_oracle_moves_negative_O(tsys, vocab):
|
|||
|
||||
# We can't easily represent this on a Doc object. Not sure what the best solution
|
||||
# would be, but I don't think it's an important use case?
|
||||
@pytest.mark.xfail(reason="No longer supported")
|
||||
@pytest.mark.skip(reason="No longer supported")
|
||||
def test_oracle_moves_missing_B(en_vocab):
|
||||
words = ["B", "52", "Bomber"]
|
||||
biluo_tags = [None, None, "L-PRODUCT"]
|
||||
|
@ -121,7 +121,7 @@ def test_oracle_moves_missing_B(en_vocab):
|
|||
|
||||
# We can't easily represent this on a Doc object. Not sure what the best solution
|
||||
# would be, but I don't think it's an important use case?
|
||||
@pytest.mark.xfail(reason="No longer supported")
|
||||
@pytest.mark.skip(reason="No longer supported")
|
||||
def test_oracle_moves_whitespace(en_vocab):
|
||||
words = ["production", "\n", "of", "Northrop", "\n", "Corp.", "\n", "'s", "radar"]
|
||||
biluo_tags = ["O", "O", "O", "B-ORG", None, "I-ORG", "L-ORG", "O", "O"]
|
||||
|
|
|
@ -82,13 +82,13 @@ def test_update_doc(parser, model, doc, gold):
|
|||
parser.update([example], sgd=optimize)
|
||||
|
||||
|
||||
@pytest.mark.xfail
|
||||
@pytest.mark.skip(reason="No longer supported")
|
||||
def test_predict_doc_beam(parser, model, doc):
|
||||
parser.model = model
|
||||
parser(doc, beam_width=32, beam_density=0.001)
|
||||
|
||||
|
||||
@pytest.mark.xfail
|
||||
@pytest.mark.skip(reason="No longer supported")
|
||||
def test_update_doc_beam(parser, model, doc, gold):
|
||||
parser.model = model
|
||||
|
||||
|
|
|
@ -33,8 +33,8 @@ def test_parser_root(en_tokenizer):
|
|||
assert t.dep != 0, t.text
|
||||
|
||||
|
||||
@pytest.mark.xfail
|
||||
# @pytest.mark.parametrize("text", ["Hello"])
|
||||
@pytest.mark.skip(reason="The step_through API was removed (but should be brought back)")
|
||||
@pytest.mark.parametrize("text", ["Hello"])
|
||||
def test_parser_parse_one_word_sentence(en_tokenizer, en_parser, text):
|
||||
tokens = en_tokenizer(text)
|
||||
doc = get_doc(
|
||||
|
@ -47,8 +47,7 @@ def test_parser_parse_one_word_sentence(en_tokenizer, en_parser, text):
|
|||
assert doc[0].dep != 0
|
||||
|
||||
|
||||
# We removed the step_through API a while ago. we should bring it back though
|
||||
@pytest.mark.xfail(reason="Unsupported")
|
||||
@pytest.mark.skip(reason="The step_through API was removed (but should be brought back)")
|
||||
def test_parser_initial(en_tokenizer, en_parser):
|
||||
text = "I ate the pizza with anchovies."
|
||||
# heads = [1, 0, 1, -2, -3, -1, -5]
|
||||
|
@ -93,8 +92,7 @@ def test_parser_merge_pp(en_tokenizer):
|
|||
assert doc[3].text == "occurs"
|
||||
|
||||
|
||||
# We removed the step_through API a while ago. we should bring it back though
|
||||
@pytest.mark.xfail(reason="Unsupported")
|
||||
@pytest.mark.skip(reason="The step_through API was removed (but should be brought back)")
|
||||
def test_parser_arc_eager_finalize_state(en_tokenizer, en_parser):
|
||||
text = "a b c d e"
|
||||
|
||||
|
|
|
@ -28,7 +28,7 @@ def test_parser_sentence_space(en_tokenizer):
|
|||
assert len(list(doc.sents)) == 2
|
||||
|
||||
|
||||
@pytest.mark.xfail
|
||||
@pytest.mark.skip(reason="The step_through API was removed (but should be brought back)")
|
||||
def test_parser_space_attachment_leading(en_tokenizer, en_parser):
|
||||
text = "\t \n This is a sentence ."
|
||||
heads = [1, 1, 0, 1, -2, -3]
|
||||
|
@ -44,7 +44,7 @@ def test_parser_space_attachment_leading(en_tokenizer, en_parser):
|
|||
assert stepwise.stack == set([2])
|
||||
|
||||
|
||||
@pytest.mark.xfail
|
||||
@pytest.mark.skip(reason="The step_through API was removed (but should be brought back)")
|
||||
def test_parser_space_attachment_intermediate_trailing(en_tokenizer, en_parser):
|
||||
text = "This is \t a \t\n \n sentence . \n\n \n"
|
||||
heads = [1, 0, -1, 2, -1, -4, -5, -1]
|
||||
|
@ -64,7 +64,7 @@ def test_parser_space_attachment_intermediate_trailing(en_tokenizer, en_parser):
|
|||
|
||||
|
||||
@pytest.mark.parametrize("text,length", [(["\n"], 1), (["\n", "\t", "\n\n", "\t"], 4)])
|
||||
@pytest.mark.xfail
|
||||
@pytest.mark.skip(reason="The step_through API was removed (but should be brought back)")
|
||||
def test_parser_space_attachment_space(en_tokenizer, en_parser, text, length):
|
||||
doc = Doc(en_parser.vocab, words=text)
|
||||
assert len(doc) == length
|
||||
|
|
|
@ -1,10 +1,13 @@
|
|||
import pytest
|
||||
import random
|
||||
|
||||
from spacy import util
|
||||
from spacy.gold import Example
|
||||
from spacy.matcher import Matcher
|
||||
from spacy.attrs import IS_PUNCT, ORTH, LOWER
|
||||
from spacy.symbols import POS, VERB
|
||||
from spacy.vocab import Vocab
|
||||
from spacy.language import Language
|
||||
from spacy.lang.en import English
|
||||
from spacy.lemmatizer import Lemmatizer
|
||||
from spacy.lookups import Lookups
|
||||
from spacy.tokens import Doc, Span
|
||||
|
@ -141,14 +144,6 @@ def test_issue588(en_vocab):
|
|||
matcher.add("TEST", [[]])
|
||||
|
||||
|
||||
@pytest.mark.xfail
|
||||
def test_issue589():
|
||||
vocab = Vocab()
|
||||
vocab.strings.set_frozen(True)
|
||||
doc = Doc(vocab, words=["whata"])
|
||||
assert doc
|
||||
|
||||
|
||||
def test_issue590(en_vocab):
|
||||
"""Test overlapping matches"""
|
||||
doc = Doc(en_vocab, words=["n", "=", "1", ";", "a", ":", "5", "%"])
|
||||
|
@ -285,7 +280,7 @@ def test_control_issue792(en_tokenizer, text):
|
|||
assert "".join([token.text_with_ws for token in doc]) == text
|
||||
|
||||
|
||||
@pytest.mark.xfail
|
||||
@pytest.mark.skip(reason="Can not be fixed unless with variable-width lookbehinds, cf. PR #3218")
|
||||
@pytest.mark.parametrize(
|
||||
"text,tokens",
|
||||
[
|
||||
|
@ -417,8 +412,7 @@ def test_issue957(en_tokenizer):
|
|||
assert doc
|
||||
|
||||
|
||||
@pytest.mark.xfail
|
||||
def test_issue999(train_data):
|
||||
def test_issue999():
|
||||
"""Test that adding entities and resuming training works passably OK.
|
||||
There are two issues here:
|
||||
1) We have to re-add labels. This isn't very nice.
|
||||
|
@ -432,27 +426,27 @@ def test_issue999(train_data):
|
|||
["hello", []],
|
||||
["hi", []],
|
||||
["i'm looking for a place to eat", []],
|
||||
["i'm looking for a place in the north of town", [[31, 36, "LOCATION"]]],
|
||||
["show me chinese restaurants", [[8, 15, "CUISINE"]]],
|
||||
["show me chines restaurants", [[8, 14, "CUISINE"]]],
|
||||
["i'm looking for a place in the north of town", [(31, 36, "LOCATION")]],
|
||||
["show me chinese restaurants", [(8, 15, "CUISINE")]],
|
||||
["show me chines restaurants", [(8, 14, "CUISINE")]],
|
||||
]
|
||||
|
||||
nlp = Language()
|
||||
ner = nlp.create_pipe("ner")
|
||||
nlp = English()
|
||||
ner = nlp.create_pipe("ner", {"learn_rate": 0.001}) # will need to be {"model": ...} in upcoming PR
|
||||
nlp.add_pipe(ner)
|
||||
for _, offsets in TRAIN_DATA:
|
||||
for start, end, label in offsets:
|
||||
ner.add_label(label)
|
||||
nlp.begin_training()
|
||||
ner.model.learn_rate = 0.001
|
||||
for itn in range(100):
|
||||
for itn in range(20):
|
||||
random.shuffle(TRAIN_DATA)
|
||||
for raw_text, entity_offsets in TRAIN_DATA:
|
||||
nlp.update((raw_text, {"entities": entity_offsets}))
|
||||
example = Example.from_dict(nlp.make_doc(raw_text), {"entities": entity_offsets})
|
||||
nlp.update([example])
|
||||
|
||||
with make_tempdir() as model_dir:
|
||||
nlp.to_disk(model_dir)
|
||||
nlp2 = Language().from_disk(model_dir)
|
||||
nlp2 = util.load_model_from_path(model_dir)
|
||||
|
||||
for raw_text, entity_offsets in TRAIN_DATA:
|
||||
doc = nlp2(raw_text)
|
||||
|
@ -461,6 +455,6 @@ def test_issue999(train_data):
|
|||
if (start, end) in ents:
|
||||
assert ents[(start, end)] == label
|
||||
break
|
||||
else:
|
||||
if entity_offsets:
|
||||
raise Exception(ents)
|
||||
else:
|
||||
if entity_offsets:
|
||||
raise Exception(ents)
|
||||
|
|
|
@ -32,8 +32,8 @@ def test_issue1061():
|
|||
assert "MATH" not in [w.text for w in doc]
|
||||
|
||||
|
||||
@pytest.mark.xfail(
|
||||
reason="g is split of as a unit, as the suffix regular expression can not look back further (variable-width)"
|
||||
@pytest.mark.skip(
|
||||
reason="Can not be fixed without variable-width look-behind (which we don't want)"
|
||||
)
|
||||
def test_issue1235():
|
||||
"""Test that g is not split of if preceded by a number and a letter"""
|
||||
|
|
|
@ -10,7 +10,9 @@ from spacy.lang.en import English
|
|||
from ..util import add_vecs_to_vocab, get_doc
|
||||
|
||||
|
||||
@pytest.mark.xfail
|
||||
@pytest.mark.skip(
|
||||
reason="Can not be fixed without iterative looping between prefix/suffix and infix"
|
||||
)
|
||||
def test_issue2070():
|
||||
"""Test that checks that a dot followed by a quote is handled
|
||||
appropriately.
|
||||
|
|
|
@ -226,7 +226,7 @@ def test_issue3412():
|
|||
assert best_rows[0] == 2
|
||||
|
||||
|
||||
@pytest.mark.xfail(reason="default suffix rules avoid one upper-case letter before dot")
|
||||
@pytest.mark.skip(reason="default suffix rules avoid one upper-case letter before dot")
|
||||
def test_issue3449():
|
||||
nlp = English()
|
||||
nlp.add_pipe(nlp.create_pipe("sentencizer"))
|
||||
|
|
|
@ -178,12 +178,12 @@ def test_issue3549(en_vocab):
|
|||
matcher.add("BAD", [[{"X": "Y"}]])
|
||||
|
||||
|
||||
@pytest.mark.xfail
|
||||
@pytest.mark.skip("Matching currently only works on strings and integers")
|
||||
def test_issue3555(en_vocab):
|
||||
"""Test that custom extensions with default None don't break matcher."""
|
||||
Token.set_extension("issue3555", default=None)
|
||||
matcher = Matcher(en_vocab)
|
||||
pattern = [{"LEMMA": "have"}, {"_": {"issue3555": True}}]
|
||||
pattern = [{"ORTH": "have"}, {"_": {"issue3555": True}}]
|
||||
matcher.add("TEST", [pattern])
|
||||
doc = Doc(en_vocab, words=["have", "apple"])
|
||||
matcher(doc)
|
||||
|
|
|
@ -11,7 +11,6 @@ test_strings_attrs = [(["rats", "are", "cute"], "Hello")]
|
|||
default_strings = ("_SP", "POS=SPACE")
|
||||
|
||||
|
||||
@pytest.mark.xfail
|
||||
@pytest.mark.parametrize("text", ["rat"])
|
||||
def test_serialize_vocab(en_vocab, text):
|
||||
text_hash = en_vocab.strings.add(text)
|
||||
|
|
|
@ -1,5 +1,4 @@
|
|||
import numpy
|
||||
from spacy.errors import AlignmentError
|
||||
from spacy.gold import biluo_tags_from_offsets, offsets_from_biluo_tags
|
||||
from spacy.gold import spans_from_biluo_tags, iob_to_biluo
|
||||
from spacy.gold import Corpus, docs_to_json
|
||||
|
@ -544,42 +543,6 @@ def test_roundtrip_docs_to_docbin(doc):
|
|||
assert cats["BAKING"] == reloaded_example.reference.cats["BAKING"]
|
||||
|
||||
|
||||
# Hm, not sure where misalignment check would be handled? In the components too?
|
||||
# I guess that does make sense. A text categorizer doesn't care if it's
|
||||
# misaligned...
|
||||
@pytest.mark.xfail(reason="Outdated")
|
||||
def test_ignore_misaligned(doc):
|
||||
nlp = English()
|
||||
text = doc.text
|
||||
with make_tempdir() as tmpdir:
|
||||
json_file = tmpdir / "test.json"
|
||||
data = [docs_to_json(doc)]
|
||||
data[0]["paragraphs"][0]["raw"] = text.replace("Sarah", "Jane")
|
||||
# write to JSON train dicts
|
||||
srsly.write_json(json_file, data)
|
||||
goldcorpus = Corpus(str(json_file), str(json_file))
|
||||
|
||||
with pytest.raises(AlignmentError):
|
||||
train_reloaded_example = next(goldcorpus.train_dataset(nlp))
|
||||
|
||||
with make_tempdir() as tmpdir:
|
||||
json_file = tmpdir / "test.json"
|
||||
data = [docs_to_json(doc)]
|
||||
data[0]["paragraphs"][0]["raw"] = text.replace("Sarah", "Jane")
|
||||
# write to JSON train dicts
|
||||
srsly.write_json(json_file, data)
|
||||
goldcorpus = Corpus(str(json_file), str(json_file))
|
||||
|
||||
# doesn't raise an AlignmentError, but there is nothing to iterate over
|
||||
# because the only example can't be aligned
|
||||
train_reloaded_example = list(
|
||||
goldcorpus.train_dataset(nlp, ignore_misaligned=True)
|
||||
)
|
||||
assert len(train_reloaded_example) == 0
|
||||
|
||||
|
||||
# We probably want the orth variant logic back, but this test won't be quite
|
||||
# right -- we need to go from DocBin.
|
||||
def test_make_orth_variants(doc):
|
||||
nlp = English()
|
||||
with make_tempdir() as tmpdir:
|
||||
|
|
|
@ -7,15 +7,26 @@ from spacy.tokens import Doc
|
|||
from .util import get_batch
|
||||
|
||||
|
||||
# This fails in Thinc v7.3.1. Need to push patch
|
||||
@pytest.mark.xfail
|
||||
def test_empty_doc():
|
||||
width = 128
|
||||
embed_size = 2000
|
||||
vocab = Vocab()
|
||||
doc = Doc(vocab, words=[])
|
||||
# TODO: fix tok2vec arguments
|
||||
tok2vec = build_Tok2Vec_model(width, embed_size, dropout=None)
|
||||
tok2vec = build_Tok2Vec_model(
|
||||
width,
|
||||
embed_size,
|
||||
pretrained_vectors=None,
|
||||
conv_depth=4,
|
||||
bilstm_depth=0,
|
||||
window_size=1,
|
||||
maxout_pieces=3,
|
||||
subword_features=True,
|
||||
char_embed=False,
|
||||
nM=64,
|
||||
nC=8,
|
||||
dropout=None,
|
||||
)
|
||||
tok2vec.initialize()
|
||||
vectors, backprop = tok2vec.begin_update([doc])
|
||||
assert len(vectors) == 1
|
||||
assert vectors[0].shape == (0, width)
|
||||
|
|
|
@ -95,22 +95,3 @@ def test_stringstore_to_bytes(stringstore, text):
|
|||
serialized = stringstore.to_bytes()
|
||||
new_stringstore = StringStore().from_bytes(serialized)
|
||||
assert new_stringstore[store] == text
|
||||
|
||||
|
||||
@pytest.mark.xfail
|
||||
@pytest.mark.parametrize("text", [["a", "b", "c"]])
|
||||
def test_stringstore_freeze_oov(stringstore, text):
|
||||
"""Test the possibly temporary workaround of flushing the stringstore of
|
||||
OOV words."""
|
||||
assert stringstore[text[0]] == 1
|
||||
assert stringstore[text[1]] == 2
|
||||
|
||||
stringstore.set_frozen(True)
|
||||
s = stringstore[text[2]]
|
||||
assert s >= 4
|
||||
s_ = stringstore[s]
|
||||
assert s_ == text[2]
|
||||
|
||||
stringstore.flush_oov()
|
||||
with pytest.raises(IndexError):
|
||||
s_ = stringstore[s]
|
||||
|
|
Loading…
Reference in New Issue
Block a user