diff --git a/pyproject.toml b/pyproject.toml
index db1031d68..0e66ececf 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -6,7 +6,7 @@ requires = [
     "cymem>=2.0.2,<2.1.0",
     "preshed>=3.0.2,<3.1.0",
     "murmurhash>=0.28.0,<1.1.0",
-    "thinc>=8.0.0a17,<8.0.0a20",
+    "thinc>=8.0.0a18,<8.0.0a20",
     "blis>=0.4.0,<0.5.0",
     "pytokenizations"
 ]
diff --git a/requirements.txt b/requirements.txt
index 760e0c0ea..7b6e0c9e9 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -1,7 +1,7 @@
 # Our libraries
 cymem>=2.0.2,<2.1.0
 preshed>=3.0.2,<3.1.0
-thinc>=8.0.0a17,<8.0.0a20
+thinc>=8.0.0a18,<8.0.0a20
 blis>=0.4.0,<0.5.0
 ml_datasets>=0.1.1
 murmurhash>=0.28.0,<1.1.0
diff --git a/setup.cfg b/setup.cfg
index a1c881d10..c7f5ce7f2 100644
--- a/setup.cfg
+++ b/setup.cfg
@@ -34,13 +34,13 @@ setup_requires =
     cymem>=2.0.2,<2.1.0
     preshed>=3.0.2,<3.1.0
     murmurhash>=0.28.0,<1.1.0
-    thinc>=8.0.0a17,<8.0.0a20
+    thinc>=8.0.0a18,<8.0.0a20
 install_requires =
     # Our libraries
     murmurhash>=0.28.0,<1.1.0
     cymem>=2.0.2,<2.1.0
     preshed>=3.0.2,<3.1.0
-    thinc>=8.0.0a17,<8.0.0a20
+    thinc>=8.0.0a18,<8.0.0a20
     blis>=0.4.0,<0.5.0
     wasabi>=0.7.0,<1.1.0
     srsly>=2.1.0,<3.0.0
diff --git a/spacy/errors.py b/spacy/errors.py
index 4f234a494..4e3ca2a9b 100644
--- a/spacy/errors.py
+++ b/spacy/errors.py
@@ -568,7 +568,3 @@ class MatchPatternError(ValueError):
             pattern_errors = "\n".join([f"- {e}" for e in error_msgs])
             msg += f"\nPattern {pattern_idx}:\n{pattern_errors}\n"
         ValueError.__init__(self, msg)
-
-
-class AlignmentError(ValueError):
-    pass
diff --git a/spacy/tests/lang/en/test_prefix_suffix_infix.py b/spacy/tests/lang/en/test_prefix_suffix_infix.py
index 9efcc1015..9dfb54fd6 100644
--- a/spacy/tests/lang/en/test_prefix_suffix_infix.py
+++ b/spacy/tests/lang/en/test_prefix_suffix_infix.py
@@ -119,9 +119,8 @@ def test_en_tokenizer_splits_period_abbr(en_tokenizer):
     assert tokens[4].text == "Mr."
 
 
-@pytest.mark.xfail
+@pytest.mark.xfail(reason="Issue #225 - not yet implemented")
 def test_en_tokenizer_splits_em_dash_infix(en_tokenizer):
-    # Re Issue #225
     tokens = en_tokenizer(
         """Will this road take me to Puddleton?\u2014No, """
         """you'll have to walk there.\u2014Ariel."""
diff --git a/spacy/tests/lang/en/test_sbd.py b/spacy/tests/lang/en/test_sbd.py
index ba7b2f2cf..7c2e2e0bd 100644
--- a/spacy/tests/lang/en/test_sbd.py
+++ b/spacy/tests/lang/en/test_sbd.py
@@ -14,7 +14,7 @@ def test_en_sbd_single_punct(en_tokenizer, text, punct):
     assert sum(len(sent) for sent in doc.sents) == len(doc)
 
 
-@pytest.mark.xfail
+@pytest.mark.skip(reason="The step_through API was removed (but should be brought back)")
 def test_en_sentence_breaks(en_tokenizer, en_parser):
     # fmt: off
     text = "This is a sentence . This is another one ."
diff --git a/spacy/tests/lang/fr/test_exceptions.py b/spacy/tests/lang/fr/test_exceptions.py
index 91c0a0a4d..4b7ccad65 100644
--- a/spacy/tests/lang/fr/test_exceptions.py
+++ b/spacy/tests/lang/fr/test_exceptions.py
@@ -81,13 +81,14 @@ def test_fr_tokenizer_handles_title(fr_tokenizer):
     assert tokens[2].lemma_ == "ce"
 
 
-@pytest.mark.xfail
 def test_fr_tokenizer_handles_title_2(fr_tokenizer):
     text = "Est-ce pas génial?"
     tokens = fr_tokenizer(text)
-    assert len(tokens) == 6
+    assert len(tokens) == 5
     assert tokens[0].text == "Est"
     assert tokens[0].lemma_ == "être"
+    assert tokens[1].text == "-ce"
+    assert tokens[1].lemma_ == "ce"
 
 
 def test_fr_tokenizer_handles_title_3(fr_tokenizer):
diff --git a/spacy/tests/lang/uk/test_tokenizer.py b/spacy/tests/lang/uk/test_tokenizer.py
index eb647a041..91ae057f8 100644
--- a/spacy/tests/lang/uk/test_tokenizer.py
+++ b/spacy/tests/lang/uk/test_tokenizer.py
@@ -89,7 +89,7 @@ def test_uk_tokenizer_splits_open_appostrophe(uk_tokenizer, text):
     assert tokens[0].text == "'"
 
 
-@pytest.mark.xfail(reason="See #3327")
+@pytest.mark.skip(reason="See Issue #3327 and PR #3329")
 @pytest.mark.parametrize("text", ["Тест''"])
 def test_uk_tokenizer_splits_double_end_quote(uk_tokenizer, text):
     tokens = uk_tokenizer(text)
diff --git a/spacy/tests/parser/test_ner.py b/spacy/tests/parser/test_ner.py
index ad7688344..b71285a34 100644
--- a/spacy/tests/parser/test_ner.py
+++ b/spacy/tests/parser/test_ner.py
@@ -83,7 +83,7 @@ def test_get_oracle_moves_negative_entities2(tsys, vocab):
     assert names
 
 
-@pytest.mark.xfail(reason="Maybe outdated? Unsure")
+@pytest.mark.skip(reason="Maybe outdated? Unsure")
 def test_get_oracle_moves_negative_O(tsys, vocab):
     doc = Doc(vocab, words=["A", "B", "C", "D"])
     entity_annots = ["O", "!O", "O", "!O"]
@@ -95,7 +95,7 @@
 
 # We can't easily represent this on a Doc object. Not sure what the best solution
 # would be, but I don't think it's an important use case?
-@pytest.mark.xfail(reason="No longer supported")
+@pytest.mark.skip(reason="No longer supported")
 def test_oracle_moves_missing_B(en_vocab):
     words = ["B", "52", "Bomber"]
     biluo_tags = [None, None, "L-PRODUCT"]
@@ -121,7 +121,7 @@ def test_oracle_moves_missing_B(en_vocab):
 
 # We can't easily represent this on a Doc object. Not sure what the best solution
 # would be, but I don't think it's an important use case?
-@pytest.mark.xfail(reason="No longer supported")
+@pytest.mark.skip(reason="No longer supported")
 def test_oracle_moves_whitespace(en_vocab):
     words = ["production", "\n", "of", "Northrop", "\n", "Corp.", "\n", "'s", "radar"]
     biluo_tags = ["O", "O", "O", "B-ORG", None, "I-ORG", "L-ORG", "O", "O"]
diff --git a/spacy/tests/parser/test_neural_parser.py b/spacy/tests/parser/test_neural_parser.py
index 93d92e26b..b75d2f9e5 100644
--- a/spacy/tests/parser/test_neural_parser.py
+++ b/spacy/tests/parser/test_neural_parser.py
@@ -82,13 +82,13 @@ def test_update_doc(parser, model, doc, gold):
     parser.update([example], sgd=optimize)
 
 
-@pytest.mark.xfail
+@pytest.mark.skip(reason="No longer supported")
 def test_predict_doc_beam(parser, model, doc):
     parser.model = model
     parser(doc, beam_width=32, beam_density=0.001)
 
 
-@pytest.mark.xfail
+@pytest.mark.skip(reason="No longer supported")
 def test_update_doc_beam(parser, model, doc, gold):
     parser.model = model
diff --git a/spacy/tests/parser/test_parse.py b/spacy/tests/parser/test_parse.py
index 5b9a1cd8e..b1c74e2e9 100644
--- a/spacy/tests/parser/test_parse.py
+++ b/spacy/tests/parser/test_parse.py
@@ -33,8 +33,8 @@ def test_parser_root(en_tokenizer):
         assert t.dep != 0, t.text
 
 
-@pytest.mark.xfail
-# @pytest.mark.parametrize("text", ["Hello"])
+@pytest.mark.skip(reason="The step_through API was removed (but should be brought back)")
+@pytest.mark.parametrize("text", ["Hello"])
 def test_parser_parse_one_word_sentence(en_tokenizer, en_parser, text):
     tokens = en_tokenizer(text)
     doc = get_doc(
@@ -47,8 +47,7 @@ def test_parser_parse_one_word_sentence(en_tokenizer, en_parser, text):
     assert doc[0].dep != 0
 
 
-# We removed the step_through API a while ago. we should bring it back though
-@pytest.mark.xfail(reason="Unsupported")
+@pytest.mark.skip(reason="The step_through API was removed (but should be brought back)")
 def test_parser_initial(en_tokenizer, en_parser):
     text = "I ate the pizza with anchovies."
     # heads = [1, 0, 1, -2, -3, -1, -5]
@@ -93,8 +92,7 @@ def test_parser_merge_pp(en_tokenizer):
     assert doc[3].text == "occurs"
 
 
-# We removed the step_through API a while ago. we should bring it back though
-@pytest.mark.xfail(reason="Unsupported")
+@pytest.mark.skip(reason="The step_through API was removed (but should be brought back)")
 def test_parser_arc_eager_finalize_state(en_tokenizer, en_parser):
     text = "a b c d e"
diff --git a/spacy/tests/parser/test_space_attachment.py b/spacy/tests/parser/test_space_attachment.py
index 59ae4e629..db25a25c0 100644
--- a/spacy/tests/parser/test_space_attachment.py
+++ b/spacy/tests/parser/test_space_attachment.py
@@ -28,7 +28,7 @@ def test_parser_sentence_space(en_tokenizer):
     assert len(list(doc.sents)) == 2
 
 
-@pytest.mark.xfail
+@pytest.mark.skip(reason="The step_through API was removed (but should be brought back)")
 def test_parser_space_attachment_leading(en_tokenizer, en_parser):
     text = "\t \n This is a sentence ."
     heads = [1, 1, 0, 1, -2, -3]
@@ -44,7 +44,7 @@ def test_parser_space_attachment_leading(en_tokenizer, en_parser):
     assert stepwise.stack == set([2])
 
 
-@pytest.mark.xfail
+@pytest.mark.skip(reason="The step_through API was removed (but should be brought back)")
 def test_parser_space_attachment_intermediate_trailing(en_tokenizer, en_parser):
     text = "This is \t a \t\n \n sentence . \n\n \n"
     heads = [1, 0, -1, 2, -1, -4, -5, -1]
@@ -64,7 +64,7 @@ def test_parser_space_attachment_intermediate_trailing(en_tokenizer, en_parser):
 
 
 @pytest.mark.parametrize("text,length", [(["\n"], 1), (["\n", "\t", "\n\n", "\t"], 4)])
-@pytest.mark.xfail
+@pytest.mark.skip(reason="The step_through API was removed (but should be brought back)")
 def test_parser_space_attachment_space(en_tokenizer, en_parser, text, length):
     doc = Doc(en_parser.vocab, words=text)
     assert len(doc) == length
diff --git a/spacy/tests/regression/test_issue1-1000.py b/spacy/tests/regression/test_issue1-1000.py
index bfca72853..abb3ac32b 100644
--- a/spacy/tests/regression/test_issue1-1000.py
+++ b/spacy/tests/regression/test_issue1-1000.py
@@ -1,10 +1,13 @@
 import pytest
 import random
+
+from spacy import util
+from spacy.gold import Example
 from spacy.matcher import Matcher
 from spacy.attrs import IS_PUNCT, ORTH, LOWER
 from spacy.symbols import POS, VERB
 from spacy.vocab import Vocab
-from spacy.language import Language
+from spacy.lang.en import English
 from spacy.lemmatizer import Lemmatizer
 from spacy.lookups import Lookups
 from spacy.tokens import Doc, Span
@@ -141,14 +144,6 @@ def test_issue588(en_vocab):
     matcher.add("TEST", [[]])
 
 
-@pytest.mark.xfail
-def test_issue589():
-    vocab = Vocab()
-    vocab.strings.set_frozen(True)
-    doc = Doc(vocab, words=["whata"])
-    assert doc
-
-
 def test_issue590(en_vocab):
     """Test overlapping matches"""
     doc = Doc(en_vocab, words=["n", "=", "1", ";", "a", ":", "5", "%"])
@@ -285,7 +280,7 @@ def test_control_issue792(en_tokenizer, text):
     assert "".join([token.text_with_ws for token in doc]) == text
 
 
-@pytest.mark.xfail
+@pytest.mark.skip(reason="Can not be fixed unless with variable-width lookbehinds, cf. PR #3218")
 @pytest.mark.parametrize(
     "text,tokens",
     [
@@ -417,8 +412,7 @@ def test_issue957(en_tokenizer):
     assert doc
 
 
-@pytest.mark.xfail
-def test_issue999(train_data):
+def test_issue999():
     """Test that adding entities and resuming training works passably OK.
     There are two issues here:
     1) We have to re-add labels. This isn't very nice.
@@ -432,27 +426,27 @@ def test_issue999(train_data):
         ["hello", []],
         ["hi", []],
         ["i'm looking for a place to eat", []],
-        ["i'm looking for a place in the north of town", [[31, 36, "LOCATION"]]],
-        ["show me chinese restaurants", [[8, 15, "CUISINE"]]],
-        ["show me chines restaurants", [[8, 14, "CUISINE"]]],
+        ["i'm looking for a place in the north of town", [(31, 36, "LOCATION")]],
+        ["show me chinese restaurants", [(8, 15, "CUISINE")]],
+        ["show me chines restaurants", [(8, 14, "CUISINE")]],
     ]
-    nlp = Language()
-    ner = nlp.create_pipe("ner")
+    nlp = English()
+    ner = nlp.create_pipe("ner", {"learn_rate": 0.001})  # will need to be {"model": ...} in upcoming PR
     nlp.add_pipe(ner)
     for _, offsets in TRAIN_DATA:
         for start, end, label in offsets:
             ner.add_label(label)
     nlp.begin_training()
-    ner.model.learn_rate = 0.001
-    for itn in range(100):
+    for itn in range(20):
         random.shuffle(TRAIN_DATA)
         for raw_text, entity_offsets in TRAIN_DATA:
-            nlp.update((raw_text, {"entities": entity_offsets}))
+            example = Example.from_dict(nlp.make_doc(raw_text), {"entities": entity_offsets})
+            nlp.update([example])
     with make_tempdir() as model_dir:
         nlp.to_disk(model_dir)
-        nlp2 = Language().from_disk(model_dir)
+        nlp2 = util.load_model_from_path(model_dir)
 
     for raw_text, entity_offsets in TRAIN_DATA:
         doc = nlp2(raw_text)
@@ -461,6 +455,6 @@
             if (start, end) in ents:
                 assert ents[(start, end)] == label
                 break
-    else:
-        if entity_offsets:
-            raise Exception(ents)
+        else:
+            if entity_offsets:
+                raise Exception(ents)
diff --git a/spacy/tests/regression/test_issue1001-1500.py b/spacy/tests/regression/test_issue1001-1500.py
index aaff951e5..a9b54fc6d 100644
--- a/spacy/tests/regression/test_issue1001-1500.py
+++ b/spacy/tests/regression/test_issue1001-1500.py
@@ -32,8 +32,8 @@ def test_issue1061():
     assert "MATH" not in [w.text for w in doc]
 
 
-@pytest.mark.xfail(
-    reason="g is split of as a unit, as the suffix regular expression can not look back further (variable-width)"
+@pytest.mark.skip(
+    reason="Can not be fixed without variable-width look-behind (which we don't want)"
 )
 def test_issue1235():
     """Test that g is not split of if preceded by a number and a letter"""
diff --git a/spacy/tests/regression/test_issue2001-2500.py b/spacy/tests/regression/test_issue2001-2500.py
index 8b998d216..1965c0f05 100644
--- a/spacy/tests/regression/test_issue2001-2500.py
+++ b/spacy/tests/regression/test_issue2001-2500.py
@@ -10,7 +10,9 @@ from spacy.lang.en import English
 from ..util import add_vecs_to_vocab, get_doc
 
 
-@pytest.mark.xfail
+@pytest.mark.skip(
+    reason="Can not be fixed without iterative looping between prefix/suffix and infix"
+)
 def test_issue2070():
     """Test that checks that a dot followed by a quote is handled appropriately.
diff --git a/spacy/tests/regression/test_issue3001-3500.py b/spacy/tests/regression/test_issue3001-3500.py
index ca4733f0d..599f0900a 100644
--- a/spacy/tests/regression/test_issue3001-3500.py
+++ b/spacy/tests/regression/test_issue3001-3500.py
@@ -226,7 +226,7 @@ def test_issue3412():
     assert best_rows[0] == 2
 
 
-@pytest.mark.xfail(reason="default suffix rules avoid one upper-case letter before dot")
+@pytest.mark.skip(reason="default suffix rules avoid one upper-case letter before dot")
 def test_issue3449():
     nlp = English()
     nlp.add_pipe(nlp.create_pipe("sentencizer"))
diff --git a/spacy/tests/regression/test_issue3501-4000.py b/spacy/tests/regression/test_issue3501-4000.py
index 5e2ee902c..80b32cfd6 100644
--- a/spacy/tests/regression/test_issue3501-4000.py
+++ b/spacy/tests/regression/test_issue3501-4000.py
@@ -178,12 +178,12 @@ def test_issue3549(en_vocab):
     matcher.add("BAD", [[{"X": "Y"}]])
 
 
-@pytest.mark.xfail
+@pytest.mark.skip("Matching currently only works on strings and integers")
 def test_issue3555(en_vocab):
     """Test that custom extensions with default None don't break matcher."""
     Token.set_extension("issue3555", default=None)
     matcher = Matcher(en_vocab)
-    pattern = [{"LEMMA": "have"}, {"_": {"issue3555": True}}]
+    pattern = [{"ORTH": "have"}, {"_": {"issue3555": True}}]
     matcher.add("TEST", [pattern])
     doc = Doc(en_vocab, words=["have", "apple"])
     matcher(doc)
diff --git a/spacy/tests/serialize/test_serialize_vocab_strings.py b/spacy/tests/serialize/test_serialize_vocab_strings.py
index e570b1025..44930247a 100644
--- a/spacy/tests/serialize/test_serialize_vocab_strings.py
+++ b/spacy/tests/serialize/test_serialize_vocab_strings.py
@@ -11,7 +11,6 @@ test_strings_attrs = [(["rats", "are", "cute"], "Hello")]
 default_strings = ("_SP", "POS=SPACE")
 
 
-@pytest.mark.xfail
 @pytest.mark.parametrize("text", ["rat"])
 def test_serialize_vocab(en_vocab, text):
     text_hash = en_vocab.strings.add(text)
diff --git a/spacy/tests/test_gold.py b/spacy/tests/test_gold.py
index efad7f465..7fb7a1000 100644
--- a/spacy/tests/test_gold.py
+++ b/spacy/tests/test_gold.py
@@ -1,5 +1,4 @@
 import numpy
-from spacy.errors import AlignmentError
 from spacy.gold import biluo_tags_from_offsets, offsets_from_biluo_tags
 from spacy.gold import spans_from_biluo_tags, iob_to_biluo
 from spacy.gold import Corpus, docs_to_json
@@ -544,42 +543,6 @@ def test_roundtrip_docs_to_docbin(doc):
     assert cats["BAKING"] == reloaded_example.reference.cats["BAKING"]
 
 
-# Hm, not sure where misalignment check would be handled? In the components too?
-# I guess that does make sense. A text categorizer doesn't care if it's
-# misaligned...
-@pytest.mark.xfail(reason="Outdated")
-def test_ignore_misaligned(doc):
-    nlp = English()
-    text = doc.text
-    with make_tempdir() as tmpdir:
-        json_file = tmpdir / "test.json"
-        data = [docs_to_json(doc)]
-        data[0]["paragraphs"][0]["raw"] = text.replace("Sarah", "Jane")
-        # write to JSON train dicts
-        srsly.write_json(json_file, data)
-        goldcorpus = Corpus(str(json_file), str(json_file))
-
-        with pytest.raises(AlignmentError):
-            train_reloaded_example = next(goldcorpus.train_dataset(nlp))
-
-    with make_tempdir() as tmpdir:
-        json_file = tmpdir / "test.json"
-        data = [docs_to_json(doc)]
-        data[0]["paragraphs"][0]["raw"] = text.replace("Sarah", "Jane")
-        # write to JSON train dicts
-        srsly.write_json(json_file, data)
-        goldcorpus = Corpus(str(json_file), str(json_file))
-
-        # doesn't raise an AlignmentError, but there is nothing to iterate over
-        # because the only example can't be aligned
-        train_reloaded_example = list(
-            goldcorpus.train_dataset(nlp, ignore_misaligned=True)
-        )
-        assert len(train_reloaded_example) == 0
-
-
-# We probably want the orth variant logic back, but this test won't be quite
-# right -- we need to go from DocBin.
 def test_make_orth_variants(doc):
     nlp = English()
     with make_tempdir() as tmpdir:
diff --git a/spacy/tests/test_tok2vec.py b/spacy/tests/test_tok2vec.py
index ee1f9dead..32f4c5774 100644
--- a/spacy/tests/test_tok2vec.py
+++ b/spacy/tests/test_tok2vec.py
@@ -7,15 +7,26 @@ from spacy.tokens import Doc
 from .util import get_batch
 
 
-# This fails in Thinc v7.3.1. Need to push patch
-@pytest.mark.xfail
 def test_empty_doc():
     width = 128
     embed_size = 2000
     vocab = Vocab()
     doc = Doc(vocab, words=[])
-    # TODO: fix tok2vec arguments
-    tok2vec = build_Tok2Vec_model(width, embed_size, dropout=None)
+    tok2vec = build_Tok2Vec_model(
+        width,
+        embed_size,
+        pretrained_vectors=None,
+        conv_depth=4,
+        bilstm_depth=0,
+        window_size=1,
+        maxout_pieces=3,
+        subword_features=True,
+        char_embed=False,
+        nM=64,
+        nC=8,
+        dropout=None,
+    )
+    tok2vec.initialize()
     vectors, backprop = tok2vec.begin_update([doc])
     assert len(vectors) == 1
     assert vectors[0].shape == (0, width)
diff --git a/spacy/tests/vocab_vectors/test_stringstore.py b/spacy/tests/vocab_vectors/test_stringstore.py
index c71d5f3f2..a0f8016af 100644
--- a/spacy/tests/vocab_vectors/test_stringstore.py
+++ b/spacy/tests/vocab_vectors/test_stringstore.py
@@ -95,22 +95,3 @@ def test_stringstore_to_bytes(stringstore, text):
     serialized = stringstore.to_bytes()
     new_stringstore = StringStore().from_bytes(serialized)
     assert new_stringstore[store] == text
-
-
-@pytest.mark.xfail
-@pytest.mark.parametrize("text", [["a", "b", "c"]])
-def test_stringstore_freeze_oov(stringstore, text):
-    """Test the possibly temporary workaround of flushing the stringstore of
-    OOV words."""
-    assert stringstore[text[0]] == 1
-    assert stringstore[text[1]] == 2
-
-    stringstore.set_frozen(True)
-    s = stringstore[text[2]]
-    assert s >= 4
-    s_ = stringstore[s]
-    assert s_ == text[2]
-
-    stringstore.flush_oov()
-    with pytest.raises(IndexError):
-        s_ = stringstore[s]