From 199943deb4da7c68f08f578b404dbc6208cc41ac Mon Sep 17 00:00:00 2001
From: Lj Miranda
Date: Fri, 5 Nov 2021 10:33:53 +0800
Subject: [PATCH 1/5] Add simple script to add pytest marks

---
 spacy/tests/regression/util_add_marker.py | 54 +++++++++++++++++++++++
 1 file changed, 54 insertions(+)
 create mode 100644 spacy/tests/regression/util_add_marker.py

diff --git a/spacy/tests/regression/util_add_marker.py b/spacy/tests/regression/util_add_marker.py
new file mode 100644
index 000000000..94fa415bc
--- /dev/null
+++ b/spacy/tests/regression/util_add_marker.py
@@ -0,0 +1,54 @@
+import re
+from pathlib import Path
+from typing import Optional
+
+import typer
+
+
+def main(
+    filename: Path, out_file: Optional[Path] = typer.Option(None), dry_run: bool = False
+):
+    """Add pytest issue markers on regression tests
+
+    If --out-file is not used, it will overwrite the original file. You can set
+    the --dry-run flag to just see the changeset and not write to disk.
+
+    filename (Path): The test file to scan for regression tests.
+    out_file (Optional[Path]): Where to write the result. Defaults to filename.
+    dry_run (bool): Only report the tests found, don't write anything.
+    """
+    # The test suites contain non-ASCII text, so don't depend on the locale
+    with filename.open(encoding="utf8") as f:
+        lines = f.readlines()
+
+    # Regex pattern for matching common regression formats (e.g. test_issue1234).
+    # It is anchored to the start of the line so that commented-out tests
+    # (e.g. "# def test_issue1537_model():") don't get a marker that would end up
+    # decorating the next test, and it captures the indentation so that the
+    # marker lines up with the "def" it decorates.
+    pattern = r"^([ \t]*)def test_issue(\d+)"
+    regex = re.compile(pattern)
+
+    new_lines = []
+    for line_text in lines:
+        match = regex.match(line_text)
+        if match:  # if match, append marker first
+            issue_num = int(match.group(2))
+            marker = f"{match.group(1)}@pytest.mark.issue({issue_num})\n"
+            # Don't stack a second marker when the script is run again
+            if new_lines[-1:] != [marker]:
+                typer.echo(
+                    f"Found: {line_text.strip()} with issue number: {issue_num}"
+                )
+                new_lines.append(marker)
+        new_lines.append(line_text)
+
+    # Save to file
+    if not dry_run:
+        out = out_file or filename
+        with out.open("w", encoding="utf8") as f:
+            f.writelines(new_lines)
+
+
+if __name__ == "__main__":
+    typer.run(main)

From 91dec2c76e9affbaafb62cc6a95b317db583c569 Mon Sep 17 00:00:00 2001
From: Lj Miranda
Date: Fri, 5 Nov 2021 09:27:08 +0800
Subject: [PATCH 2/5] Decorate non-regression tests

---
 spacy/tests/lang/en/test_prefix_suffix_infix.py  | 1 +
 spacy/tests/lang/fr/test_prefix_suffix_infix.py  | 1 +
 spacy/tests/matcher/test_dependency_matcher.py   | 2 ++
 spacy/tests/matcher/test_matcher_logic.py        | 1 +
 spacy/tests/serialize/test_serialize_pipeline.py | 1 +
 5 files changed, 6 insertions(+)

diff --git a/spacy/tests/lang/en/test_prefix_suffix_infix.py b/spacy/tests/lang/en/test_prefix_suffix_infix.py
index 9dfb54fd6..a903496e8 100644
--- a/spacy/tests/lang/en/test_prefix_suffix_infix.py
+++ b/spacy/tests/lang/en/test_prefix_suffix_infix.py
@@ -119,6 +119,7 @@ def test_en_tokenizer_splits_period_abbr(en_tokenizer):
     assert tokens[4].text == "Mr."
+@pytest.mark.issue(225) @pytest.mark.xfail(reason="Issue #225 - not yet implemented") def test_en_tokenizer_splits_em_dash_infix(en_tokenizer): tokens = en_tokenizer( diff --git a/spacy/tests/lang/fr/test_prefix_suffix_infix.py b/spacy/tests/lang/fr/test_prefix_suffix_infix.py index 7770f807b..272531b63 100644 --- a/spacy/tests/lang/fr/test_prefix_suffix_infix.py +++ b/spacy/tests/lang/fr/test_prefix_suffix_infix.py @@ -4,6 +4,7 @@ from spacy.lang.punctuation import TOKENIZER_INFIXES from spacy.lang.char_classes import ALPHA +@pytest.mark.issue(768) @pytest.mark.parametrize( "text,expected_tokens", [("l'avion", ["l'", "avion"]), ("j'ai", ["j'", "ai"])] ) diff --git a/spacy/tests/matcher/test_dependency_matcher.py b/spacy/tests/matcher/test_dependency_matcher.py index 61ae43c52..1728c82af 100644 --- a/spacy/tests/matcher/test_dependency_matcher.py +++ b/spacy/tests/matcher/test_dependency_matcher.py @@ -370,6 +370,7 @@ def test_dependency_matcher_span_user_data(en_tokenizer): assert doc_t_i == span_t_i + offset +@pytest.mark.issue(9263) def test_dependency_matcher_order_issue(en_tokenizer): # issue from #9263 doc = en_tokenizer("I like text") @@ -415,6 +416,7 @@ def test_dependency_matcher_order_issue(en_tokenizer): assert matches == [] +@pytest.mark.issue(9263) def test_dependency_matcher_remove(en_tokenizer): # issue from #9263 doc = en_tokenizer("The red book") diff --git a/spacy/tests/matcher/test_matcher_logic.py b/spacy/tests/matcher/test_matcher_logic.py index dcbe1ff33..b96bb2032 100644 --- a/spacy/tests/matcher/test_matcher_logic.py +++ b/spacy/tests/matcher/test_matcher_logic.py @@ -152,6 +152,7 @@ def test_operator_combos(en_vocab): assert not matches, (string, pattern_str) +@pytest.mark.issue(1450) def test_matcher_end_zero_plus(en_vocab): """Test matcher works when patterns end with * operator. 
(issue 1450)""" matcher = Matcher(en_vocab) diff --git a/spacy/tests/serialize/test_serialize_pipeline.py b/spacy/tests/serialize/test_serialize_pipeline.py index 05871a524..eebf72638 100644 --- a/spacy/tests/serialize/test_serialize_pipeline.py +++ b/spacy/tests/serialize/test_serialize_pipeline.py @@ -162,6 +162,7 @@ def test_serialize_tagger_strings(en_vocab, de_vocab, taggers): assert label in tagger2.vocab.strings +@pytest.mark.issue(1105) def test_serialize_textcat_empty(en_vocab): # See issue #1105 cfg = {"model": DEFAULT_SINGLE_TEXTCAT_MODEL} From addeb34bc4538cada8f373a16ea89c46dcf63f07 Mon Sep 17 00:00:00 2001 From: Lj Miranda Date: Fri, 5 Nov 2021 09:27:19 +0800 Subject: [PATCH 3/5] Decorate regression tests Even if the issue number is already in the file, I still decorated them just to follow the convention found in test_issue8168.py --- spacy/tests/regression/test_issue1-1000.py | 33 +++++++++++++++++++ spacy/tests/regression/test_issue1001-1500.py | 10 ++++++ spacy/tests/regression/test_issue1501-2000.py | 24 ++++++++++++++ spacy/tests/regression/test_issue2001-2500.py | 10 ++++++ spacy/tests/regression/test_issue2501-3000.py | 15 +++++++++ spacy/tests/regression/test_issue3001-3500.py | 17 ++++++++++ spacy/tests/regression/test_issue3501-4000.py | 20 +++++++++++ spacy/tests/regression/test_issue4001-4500.py | 15 +++++++++ spacy/tests/regression/test_issue4501-5000.py | 11 +++++++ spacy/tests/regression/test_issue5001-5500.py | 6 ++++ spacy/tests/regression/test_issue5501-6000.py | 3 ++ spacy/tests/regression/test_issue6001-6500.py | 2 ++ spacy/tests/regression/test_issue6501-7000.py | 8 +++++ spacy/tests/regression/test_issue7001-8000.py | 6 ++++ spacy/tests/regression/test_issue7716.py | 1 + spacy/tests/regression/test_issue8190.py | 1 + spacy/tests/regression/test_issue8216.py | 1 + 17 files changed, 183 insertions(+) diff --git a/spacy/tests/regression/test_issue1-1000.py b/spacy/tests/regression/test_issue1-1000.py index 6bb71f6f4..4846d2075 
100644 --- a/spacy/tests/regression/test_issue1-1000.py +++ b/spacy/tests/regression/test_issue1-1000.py @@ -12,6 +12,7 @@ from spacy.tokens import Doc, Span from ..util import make_tempdir +@pytest.mark.issue(118) @pytest.mark.parametrize( "patterns", [ @@ -39,6 +40,7 @@ def test_issue118(en_tokenizer, patterns): assert ents[0].end == 11 +@pytest.mark.issue(118) @pytest.mark.parametrize( "patterns", [ @@ -66,6 +68,7 @@ def test_issue118_prefix_reorder(en_tokenizer, patterns): assert ents[0].end == 11 +@pytest.mark.issue(242) def test_issue242(en_tokenizer): """Test overlapping multi-word phrases.""" text = "There are different food safety standards in different countries." @@ -88,6 +91,7 @@ def test_issue242(en_tokenizer): doc.ents += tuple(matches) +@pytest.mark.issue(309) def test_issue309(en_vocab): """Test Issue #309: SBD fails on empty string""" doc = Doc(en_vocab, words=[" "], heads=[0], deps=["ROOT"]) @@ -96,6 +100,7 @@ def test_issue309(en_vocab): assert len(sents) == 1 +@pytest.mark.issue(351) def test_issue351(en_tokenizer): doc = en_tokenizer(" This is a cat.") assert doc[0].idx == 0 @@ -103,12 +108,14 @@ def test_issue351(en_tokenizer): assert doc[1].idx == 3 +@pytest.mark.issue(360) def test_issue360(en_tokenizer): """Test tokenization of big ellipsis""" tokens = en_tokenizer("$45...............Asking") assert len(tokens) > 2 +@pytest.mark.issue(361) @pytest.mark.parametrize("text1,text2", [("cat", "dog")]) def test_issue361(en_vocab, text1, text2): """Test Issue #361: Equality of lexemes""" @@ -116,6 +123,7 @@ def test_issue361(en_vocab, text1, text2): assert en_vocab[text1] != en_vocab[text2] +@pytest.mark.issue(587) def test_issue587(en_tokenizer): """Test that Matcher doesn't segfault on particular input""" doc = en_tokenizer("a b; c") @@ -131,12 +139,14 @@ def test_issue587(en_tokenizer): assert len(matches) == 2 +@pytest.mark.issue(588) def test_issue588(en_vocab): matcher = Matcher(en_vocab) with pytest.raises(ValueError): matcher.add("TEST", 
[[]]) +@pytest.mark.issue(590) def test_issue590(en_vocab): """Test overlapping matches""" doc = Doc(en_vocab, words=["n", "=", "1", ";", "a", ":", "5", "%"]) @@ -149,6 +159,7 @@ def test_issue590(en_vocab): assert len(matches) == 2 +@pytest.mark.issue(595) @pytest.mark.skip(reason="Old vocab-based lemmatization") def test_issue595(): """Test lemmatization of base forms""" @@ -164,6 +175,7 @@ def test_issue595(): assert doc[2].lemma_ == "feed" +@pytest.mark.issue(599) def test_issue599(en_vocab): doc = Doc(en_vocab) doc2 = Doc(doc.vocab) @@ -171,12 +183,14 @@ def test_issue599(en_vocab): assert doc2.has_annotation("DEP") +@pytest.mark.issue(600) def test_issue600(): vocab = Vocab(tag_map={"NN": {"pos": "NOUN"}}) doc = Doc(vocab, words=["hello"]) doc[0].tag_ = "NN" +@pytest.mark.issue(615) def test_issue615(en_tokenizer): def merge_phrases(matcher, doc, i, matches): """Merge a phrase. We have to be careful here because we'll change the @@ -204,6 +218,7 @@ def test_issue615(en_tokenizer): assert entities[0].label != 0 +@pytest.mark.issue(736) @pytest.mark.parametrize("text,number", [("7am", "7"), ("11p.m.", "11")]) def test_issue736(en_tokenizer, text, number): """Test that times like "7am" are tokenized correctly and that numbers are @@ -213,6 +228,7 @@ def test_issue736(en_tokenizer, text, number): assert tokens[0].text == number +@pytest.mark.issue(740) @pytest.mark.parametrize("text", ["3/4/2012", "01/12/1900"]) def test_issue740(en_tokenizer, text): """Test that dates are not split and kept as one token. 
This behaviour is @@ -222,6 +238,7 @@ def test_issue740(en_tokenizer, text): assert len(tokens) == 1 +@pytest.mark.issue(743) def test_issue743(): doc = Doc(Vocab(), ["hello", "world"]) token = doc[0] @@ -230,6 +247,7 @@ def test_issue743(): assert items[0] is token +@pytest.mark.issue(744) @pytest.mark.parametrize("text", ["We were scared", "We Were Scared"]) def test_issue744(en_tokenizer, text): """Test that 'were' and 'Were' are excluded from the contractions @@ -239,6 +257,7 @@ def test_issue744(en_tokenizer, text): assert tokens[1].text.lower() == "were" +@pytest.mark.issue(759) @pytest.mark.parametrize( "text,is_num", [("one", True), ("ten", True), ("teneleven", False)] ) @@ -247,6 +266,7 @@ def test_issue759(en_tokenizer, text, is_num): assert tokens[0].like_num == is_num +@pytest.mark.issue(775) @pytest.mark.parametrize("text", ["Shell", "shell", "Shed", "shed"]) def test_issue775(en_tokenizer, text): """Test that 'Shell' and 'shell' are excluded from the contractions @@ -256,6 +276,7 @@ def test_issue775(en_tokenizer, text): assert tokens[0].text == text +@pytest.mark.issue(792) @pytest.mark.parametrize("text", ["This is a string ", "This is a string\u0020"]) def test_issue792(en_tokenizer, text): """Test for Issue #792: Trailing whitespace is removed after tokenization.""" @@ -263,6 +284,7 @@ def test_issue792(en_tokenizer, text): assert "".join([token.text_with_ws for token in doc]) == text +@pytest.mark.issue(792) @pytest.mark.parametrize("text", ["This is a string", "This is a string\n"]) def test_control_issue792(en_tokenizer, text): """Test base case for Issue #792: Non-trailing whitespace""" @@ -270,6 +292,7 @@ def test_control_issue792(en_tokenizer, text): assert "".join([token.text_with_ws for token in doc]) == text +@pytest.mark.issue(801) @pytest.mark.skip( reason="Can not be fixed unless with variable-width lookbehinds, cf. 
PR #3218" ) @@ -292,6 +315,7 @@ def test_issue801(en_tokenizer, text, tokens): assert [t.text for t in doc] == tokens +@pytest.mark.issue(805) @pytest.mark.parametrize( "text,expected_tokens", [ @@ -311,6 +335,7 @@ def test_issue805(sv_tokenizer, text, expected_tokens): assert expected_tokens == token_list +@pytest.mark.issue(850) def test_issue850(): """The variable-length pattern matches the succeeding token. Check we handle the ambiguity correctly.""" @@ -326,6 +351,7 @@ def test_issue850(): assert end == 4 +@pytest.mark.issue(850) def test_issue850_basic(): """Test Matcher matches with '*' operator and Boolean flag""" vocab = Vocab(lex_attr_getters={LOWER: lambda string: string.lower()}) @@ -340,6 +366,7 @@ def test_issue850_basic(): assert end == 4 +@pytest.mark.issue(852) @pytest.mark.skip( reason="French exception list is not enabled in the default tokenizer anymore" ) @@ -352,6 +379,7 @@ def test_issue852(fr_tokenizer, text): assert len(tokens) == 1 +@pytest.mark.issue(859) @pytest.mark.parametrize( "text", ["aaabbb@ccc.com\nThank you!", "aaabbb@ccc.com \nThank you!"] ) @@ -361,6 +389,7 @@ def test_issue859(en_tokenizer, text): assert doc.text == text +@pytest.mark.issue(886) @pytest.mark.parametrize("text", ["Datum:2014-06-02\nDokument:76467"]) def test_issue886(en_tokenizer, text): """Test that token.idx matches the original text index for texts with newlines.""" @@ -370,6 +399,7 @@ def test_issue886(en_tokenizer, text): assert text[token.idx] == token.text[0] +@pytest.mark.issue(891) @pytest.mark.parametrize("text", ["want/need"]) def test_issue891(en_tokenizer, text): """Test that / infixes are split correctly.""" @@ -378,6 +408,7 @@ def test_issue891(en_tokenizer, text): assert tokens[1].text == "/" +@pytest.mark.issue(912) @pytest.mark.skip(reason="Old vocab-based lemmatization") @pytest.mark.parametrize( "text,tag,lemma", @@ -390,6 +421,7 @@ def test_issue912(en_vocab, text, tag, lemma): assert doc[0].lemma_ == lemma +@pytest.mark.issue(957) 
@pytest.mark.slow def test_issue957(en_tokenizer): """Test that spaCy doesn't hang on many punctuation characters. @@ -405,6 +437,7 @@ def test_issue957(en_tokenizer): assert doc +@pytest.mark.issue(999) def test_issue999(): """Test that adding entities and resuming training works passably OK. There are two issues here: diff --git a/spacy/tests/regression/test_issue1001-1500.py b/spacy/tests/regression/test_issue1001-1500.py index d6a4600e3..0a60e4477 100644 --- a/spacy/tests/regression/test_issue1001-1500.py +++ b/spacy/tests/regression/test_issue1001-1500.py @@ -9,6 +9,7 @@ from spacy.tokenizer import Tokenizer from spacy.symbols import ORTH, LEMMA, POS +@pytest.mark.issue(1061) def test_issue1061(): """Test special-case works after tokenizing. Was caching problem.""" text = "I like _MATH_ even _MATH_ when _MATH_, except when _MATH_ is _MATH_! but not _MATH_." @@ -33,6 +34,7 @@ def test_issue1061(): @pytest.mark.skip( reason="Can not be fixed without variable-width look-behind (which we don't want)" ) +@pytest.mark.issue(1235) def test_issue1235(): """Test that g is not split of if preceded by a number and a letter""" nlp = English() @@ -46,6 +48,7 @@ def test_issue1235(): assert doc[4].text == "g" +@pytest.mark.issue(1242) def test_issue1242(): nlp = English() doc = nlp("") @@ -56,6 +59,7 @@ def test_issue1242(): @pytest.mark.skip(reason="v3 no longer supports LEMMA/POS in tokenizer special cases") +@pytest.mark.issue(1250) def test_issue1250(): """Test cached special cases.""" special_case = [{ORTH: "reimbur", LEMMA: "reimburse", POS: "VERB"}] @@ -67,6 +71,7 @@ def test_issue1250(): assert lemmas == ["reimburse", ",", "reimburse", "..."] +@pytest.mark.issue(1257) def test_issue1257(): """Test that tokens compare correctly.""" doc1 = Doc(Vocab(), words=["a", "b", "c"]) @@ -75,6 +80,7 @@ def test_issue1257(): assert not doc1[0] == doc2[0] +@pytest.mark.issue(1375) def test_issue1375(): """Test that token.nbor() raises IndexError for out-of-bounds access.""" doc = 
Doc(Vocab(), words=["0", "1", "2"]) @@ -86,6 +92,7 @@ def test_issue1375(): assert doc[1].nbor(1).text == "2" +@pytest.mark.issue(1434) def test_issue1434(): """Test matches occur when optional element at end of short doc.""" pattern = [{"ORTH": "Hello"}, {"IS_ALPHA": True, "OP": "?"}] @@ -111,6 +118,7 @@ def test_issue1434(): ("a b b", 0, 3), ], ) +@pytest.mark.issue(1450) def test_issue1450(string, start, end): """Test matcher works when patterns end with * operator.""" pattern = [{"ORTH": "a"}, {"ORTH": "b", "OP": "*"}] @@ -124,6 +132,7 @@ def test_issue1450(string, start, end): assert matches[-1][2] == end +@pytest.mark.issue(1488) def test_issue1488(): prefix_re = re.compile(r"""[\[\("']""") suffix_re = re.compile(r"""[\]\)"']""") @@ -147,6 +156,7 @@ def test_issue1488(): assert token.text +@pytest.mark.issue(1494) def test_issue1494(): infix_re = re.compile(r"""[^a-z]""") test_cases = [ diff --git a/spacy/tests/regression/test_issue1501-2000.py b/spacy/tests/regression/test_issue1501-2000.py index f85ec70e1..07f173843 100644 --- a/spacy/tests/regression/test_issue1501-2000.py +++ b/spacy/tests/regression/test_issue1501-2000.py @@ -17,6 +17,7 @@ from spacy.matcher import Matcher from ..util import make_tempdir +@pytest.mark.issue(1506) def test_issue1506(): def string_generator(): for _ in range(10001): @@ -40,6 +41,7 @@ def test_issue1506(): str(t.lemma_) +@pytest.mark.issue(1518) def test_issue1518(): """Test vectors.resize() works.""" vectors = Vectors(shape=(10, 10)) @@ -47,6 +49,7 @@ def test_issue1518(): vectors.resize((5, 9)) +@pytest.mark.issue(1537) def test_issue1537(): """Test that Span.as_doc() doesn't segfault.""" string = "The sky is blue . The man is pink . The dog is purple ." @@ -65,6 +68,7 @@ def test_issue1537(): # TODO: Currently segfaulting, due to l_edge and r_edge misalignment +@pytest.mark.issue(1537) # def test_issue1537_model(): # nlp = load_spacy('en') # doc = nlp('The sky is blue. The man is pink. 
The dog is purple.') @@ -73,12 +77,14 @@ def test_issue1537(): # print(list(sents[1].noun_chunks)) +@pytest.mark.issue(1539) def test_issue1539(): """Ensure vectors.resize() doesn't try to modify dictionary during iteration.""" v = Vectors(shape=(10, 10), keys=[5, 3, 98, 100]) v.resize((100, 100)) +@pytest.mark.issue(1547) def test_issue1547(): """Test that entity labels still match after merging tokens.""" words = ["\n", "worda", ".", "\n", "wordb", "-", "Biosphere", "2", "-", " \n"] @@ -89,12 +95,14 @@ def test_issue1547(): assert [ent.text for ent in doc.ents] +@pytest.mark.issue(1612) def test_issue1612(en_tokenizer): doc = en_tokenizer("The black cat purrs.") span = doc[1:3] assert span.orth_ == span.text +@pytest.mark.issue(1654) def test_issue1654(): nlp = Language(Vocab()) assert not nlp.pipeline @@ -116,12 +124,14 @@ def test_issue1654(): @pytest.mark.parametrize("text", ["test@example.com", "john.doe@example.co.uk"]) +@pytest.mark.issue(1698) def test_issue1698(en_tokenizer, text): doc = en_tokenizer(text) assert len(doc) == 1 assert not doc[0].like_url +@pytest.mark.issue(1727) def test_issue1727(): """Test that models with no pretrained vectors can be deserialized correctly after vectors are added.""" @@ -138,6 +148,7 @@ def test_issue1727(): assert tagger.cfg.get("pretrained_dims", 0) == 0 +@pytest.mark.issue(1757) def test_issue1757(): """Test comparison against None doesn't cause segfault.""" doc = Doc(Vocab(), words=["a", "b", "c"]) @@ -151,12 +162,14 @@ def test_issue1757(): assert not doc.vocab["a"] < None +@pytest.mark.issue(1758) def test_issue1758(en_tokenizer): """Test that "would've" is handled by the English tokenizer exceptions.""" tokens = en_tokenizer("would've") assert len(tokens) == 2 +@pytest.mark.issue(1773) def test_issue1773(en_tokenizer): """Test that spaces don't receive a POS but no TAG. 
This is the root cause of the serialization issue reported in #1773.""" @@ -165,6 +178,7 @@ def test_issue1773(en_tokenizer): assert doc[0].tag_ != "" +@pytest.mark.issue(1799) def test_issue1799(): """Test sentence boundaries are deserialized correctly, even for non-projective sentences.""" @@ -186,6 +200,7 @@ def test_issue1799(): assert len(list(doc.sents)) == 1 +@pytest.mark.issue(1807) def test_issue1807(): """Test vocab.set_vector also adds the word to the vocab.""" vocab = Vocab(vectors_name="test_issue1807") @@ -194,6 +209,7 @@ def test_issue1807(): assert "hello" in vocab +@pytest.mark.issue(1834) def test_issue1834(): """Test that sentence boundaries & parse/tag flags are not lost during serialization.""" @@ -217,6 +233,7 @@ def test_issue1834(): assert new_doc.has_annotation("TAG") +@pytest.mark.issue(1868) def test_issue1868(): """Test Vocab.__contains__ works with int keys.""" vocab = Vocab() @@ -228,6 +245,7 @@ def test_issue1868(): assert int_id not in vocab +@pytest.mark.issue(1883) def test_issue1883(): matcher = Matcher(Vocab()) matcher.add("pat1", [[{"orth": "hello"}]]) @@ -239,11 +257,13 @@ def test_issue1883(): @pytest.mark.parametrize("word", ["the"]) +@pytest.mark.issue(1889) def test_issue1889(word): assert is_stop(word, STOP_WORDS) == is_stop(word.upper(), STOP_WORDS) @pytest.mark.skip(reason="obsolete with the config refactor of v.3") +@pytest.mark.issue(1915) def test_issue1915(): cfg = {"hidden_depth": 2} # should error out nlp = Language() @@ -253,6 +273,7 @@ def test_issue1915(): nlp.initialize(**cfg) +@pytest.mark.issue(1945) def test_issue1945(): """Test regression in Matcher introduced in v2.0.6.""" matcher = Matcher(Vocab()) @@ -264,6 +285,7 @@ def test_issue1945(): assert matches[1][1:] == (1, 3) +@pytest.mark.issue(1963) def test_issue1963(en_tokenizer): """Test that doc.merge() resizes doc.tensor""" doc = en_tokenizer("a b c d") @@ -275,6 +297,7 @@ def test_issue1963(en_tokenizer): @pytest.mark.parametrize("label", 
["U-JOB-NAME"]) +@pytest.mark.issue(1967) def test_issue1967(label): nlp = Language() config = {} @@ -293,6 +316,7 @@ def test_issue1967(label): assert "JOB-NAME" in ner.moves.get_actions(examples=[example])[1] +@pytest.mark.issue(1971) def test_issue1971(en_vocab): # Possibly related to #2675 and #2671? matcher = Matcher(en_vocab) diff --git a/spacy/tests/regression/test_issue2001-2500.py b/spacy/tests/regression/test_issue2001-2500.py index 09baab4d8..a07360c2c 100644 --- a/spacy/tests/regression/test_issue2001-2500.py +++ b/spacy/tests/regression/test_issue2001-2500.py @@ -13,6 +13,7 @@ from ..util import add_vecs_to_vocab @pytest.mark.skip( reason="Can not be fixed without iterative looping between prefix/suffix and infix" ) +@pytest.mark.issue(2070) def test_issue2070(): """Test that checks that a dot followed by a quote is handled appropriately. @@ -25,6 +26,7 @@ def test_issue2070(): assert len(doc) == 11 +@pytest.mark.issue(2179) def test_issue2179(): """Test that spurious 'extra_labels' aren't created when initializing NER.""" nlp = Italian() @@ -41,6 +43,7 @@ def test_issue2179(): assert nlp2.get_pipe("ner").labels == ("CITIZENSHIP",) +@pytest.mark.issue(2203) def test_issue2203(en_vocab): """Test that lemmas are set correctly in doc.from_array.""" words = ["I", "'ll", "survive"] @@ -61,6 +64,7 @@ def test_issue2203(en_vocab): assert [t.lemma_ for t in new_doc] == lemmas +@pytest.mark.issue(2219) def test_issue2219(en_vocab): vectors = [("a", [1, 2, 3]), ("letter", [4, 5, 6])] add_vecs_to_vocab(en_vocab, vectors) @@ -69,6 +73,7 @@ def test_issue2219(en_vocab): assert doc[0].similarity(doc[1]) == doc[1].similarity(doc[0]) +@pytest.mark.issue(2361) def test_issue2361(de_vocab): chars = ("<", ">", "&", """) words = ["<", ">", "&", '"'] @@ -78,6 +83,7 @@ def test_issue2361(de_vocab): assert char in html +@pytest.mark.issue(2385) def test_issue2385(): """Test that IOB tags are correctly converted to BILUO tags.""" # fix bug in labels with a 'b' character @@ 
-99,11 +105,13 @@ def test_issue2385(): ("U-BRAWLER", "U-BRAWLER"), ], ) +@pytest.mark.issue(2385) def test_issue2385_biluo(tags): """Test that BILUO-compatible tags aren't modified.""" assert iob_to_biluo(tags) == list(tags) +@pytest.mark.issue(2396) def test_issue2396(en_vocab): words = ["She", "created", "a", "test", "for", "spacy"] heads = [1, 1, 3, 1, 3, 4] @@ -125,6 +133,7 @@ def test_issue2396(en_vocab): assert (span.get_lca_matrix() == matrix).all() +@pytest.mark.issue(2464) def test_issue2464(en_vocab): """Test problem with successive ?. This is the same bug, so putting it here.""" matcher = Matcher(en_vocab) @@ -134,6 +143,7 @@ def test_issue2464(en_vocab): assert len(matches) == 3 +@pytest.mark.issue(2482) def test_issue2482(): """Test we can serialize and deserialize a blank NER or parser model.""" nlp = Italian() diff --git a/spacy/tests/regression/test_issue2501-3000.py b/spacy/tests/regression/test_issue2501-3000.py index 4952a545d..cbb7f0621 100644 --- a/spacy/tests/regression/test_issue2501-3000.py +++ b/spacy/tests/regression/test_issue2501-3000.py @@ -13,6 +13,7 @@ import numpy import random +@pytest.mark.issue(2564) def test_issue2564(): """Test the tagger sets has_annotation("TAG") correctly when used via Language.pipe.""" nlp = Language() @@ -26,6 +27,7 @@ def test_issue2564(): assert piped_doc.has_annotation("TAG") +@pytest.mark.issue(2569) def test_issue2569(en_tokenizer): """Test that operator + is greedy.""" doc = en_tokenizer("It is May 15, 1993.") @@ -46,12 +48,14 @@ def test_issue2569(en_tokenizer): "oow.jspsearch.eventoracleopenworldsearch.technologyoraclesolarissearch.technologystoragesearch.technologylinuxsearch.technologyserverssearch.technologyvirtualizationsearch.technologyengineeredsystemspcodewwmkmppscem:", ], ) +@pytest.mark.issue(2626) def test_issue2626_2835(en_tokenizer, text): """Check that sentence doesn't cause an infinite loop in the tokenizer.""" doc = en_tokenizer(text) assert doc +@pytest.mark.issue(2656) def 
test_issue2656(en_tokenizer): """Test that tokenizer correctly splits off punctuation after numbers with decimal points. @@ -71,6 +75,7 @@ def test_issue2656(en_tokenizer): assert doc[10].text == "." +@pytest.mark.issue(2671) def test_issue2671(): """Ensure the correct entity ID is returned for matches with quantifiers. See also #2675 @@ -94,6 +99,7 @@ def test_issue2671(): assert nlp.vocab.strings[match_id] == pattern_id +@pytest.mark.issue(2728) def test_issue2728(en_vocab): """Test that displaCy ENT visualizer escapes HTML correctly.""" doc = Doc(en_vocab, words=["test", "", "test"]) @@ -105,6 +111,7 @@ def test_issue2728(en_vocab): assert "<RELEASE>" in html +@pytest.mark.issue(2754) def test_issue2754(en_tokenizer): """Test that words like 'a' and 'a.m.' don't get exceptional norm values.""" a = en_tokenizer("a") @@ -113,6 +120,7 @@ def test_issue2754(en_tokenizer): assert am[0].norm_ == "am" +@pytest.mark.issue(2772) def test_issue2772(en_vocab): """Test that deprojectivization doesn't mess up sentence boundaries.""" # fmt: off @@ -128,6 +136,7 @@ def test_issue2772(en_vocab): @pytest.mark.parametrize("text", ["-0.23", "+123,456", "±1"]) @pytest.mark.parametrize("lang_cls", [English, MultiLanguage]) +@pytest.mark.issue(2782) def test_issue2782(text, lang_cls): """Check that like_num handles + and - before number.""" nlp = lang_cls() @@ -136,6 +145,7 @@ def test_issue2782(text, lang_cls): assert doc[0].like_num +@pytest.mark.issue(2800) def test_issue2800(): """Test issue that arises when too many labels are added to NER model. Used to cause segfault. @@ -157,6 +167,7 @@ def test_issue2800(): nlp.update([example], sgd=optimizer, losses=losses, drop=0.5) +@pytest.mark.issue(2822) def test_issue2822(it_tokenizer): """Test that the abbreviation of poco is kept as one word.""" doc = it_tokenizer("Vuoi un po' di zucchero?") @@ -169,6 +180,7 @@ def test_issue2822(it_tokenizer): assert doc[5].text == "?" 
+@pytest.mark.issue(2833) def test_issue2833(en_vocab): """Test that a custom error is raised if a token or span is pickled.""" doc = Doc(en_vocab, words=["Hello", "world"]) @@ -178,6 +190,7 @@ def test_issue2833(en_vocab): pickle.dumps(doc[0:2]) +@pytest.mark.issue(2871) def test_issue2871(): """Test that vectors recover the correct key for spaCy reserved words.""" words = ["dog", "cat", "SUFFIX"] @@ -196,6 +209,7 @@ def test_issue2871(): assert vocab.vectors.find(key="SUFFIX") == 2 +@pytest.mark.issue(2901) def test_issue2901(): """Test that `nlp` doesn't fail.""" try: @@ -207,6 +221,7 @@ def test_issue2901(): assert doc +@pytest.mark.issue(2926) def test_issue2926(fr_tokenizer): """Test that the tokenizer correctly splits tokens separated by a slash (/) ending in a digit. diff --git a/spacy/tests/regression/test_issue3001-3500.py b/spacy/tests/regression/test_issue3001-3500.py index e123d2df9..6220003dc 100644 --- a/spacy/tests/regression/test_issue3001-3500.py +++ b/spacy/tests/regression/test_issue3001-3500.py @@ -14,6 +14,7 @@ from spacy.vectors import Vectors import numpy +@pytest.mark.issue(3002) def test_issue3002(): """Test that the tokenizer doesn't hang on a long list of dots""" nlp = German() @@ -23,6 +24,7 @@ def test_issue3002(): assert len(doc) == 5 +@pytest.mark.issue(3009) def test_issue3009(en_vocab): """Test problem with matcher quantifiers""" patterns = [ @@ -53,6 +55,7 @@ def test_issue3009(en_vocab): assert matches +@pytest.mark.issue(3012) def test_issue3012(en_vocab): """Test that the is_tagged attribute doesn't get overwritten when we from_array without tag information.""" @@ -74,6 +77,7 @@ def test_issue3012(en_vocab): assert (doc2[2].text, doc2[2].pos_, doc2[2].tag_, doc2[2].ent_type_) == expected +@pytest.mark.issue(3199) def test_issue3199(): """Test that Span.noun_chunks works correctly if no noun chunks iterator is available. 
To make this test future-proof, we're constructing a Doc @@ -85,6 +89,7 @@ def test_issue3199(): list(doc[0:3].noun_chunks) +@pytest.mark.issue(3209) def test_issue3209(): """Test issue that occurred in spaCy nightly where NER labels were being mapped to classes incorrectly after loading the model, when the labels @@ -104,6 +109,7 @@ def test_issue3209(): assert ner2.move_names == move_names +@pytest.mark.issue(3248) def test_issue3248_1(): """Test that the PhraseMatcher correctly reports its number of rules, not total number of patterns.""" @@ -114,6 +120,7 @@ def test_issue3248_1(): assert len(matcher) == 2 +@pytest.mark.issue(3248) def test_issue3248_2(): """Test that the PhraseMatcher can be pickled correctly.""" nlp = English() @@ -125,6 +132,7 @@ def test_issue3248_2(): assert len(new_matcher) == len(matcher) +@pytest.mark.issue(3277) def test_issue3277(es_tokenizer): """Test that hyphens are split correctly as prefixes.""" doc = es_tokenizer("—Yo me llamo... –murmuró el niño– Emilio Sánchez Pérez.") @@ -134,6 +142,7 @@ def test_issue3277(es_tokenizer): assert doc[9].text == "\u2013" +@pytest.mark.issue(3288) def test_issue3288(en_vocab): """Test that retokenization works correctly via displaCy when punctuation is merged onto the preceeding token and tensor is resized.""" @@ -145,6 +154,7 @@ def test_issue3288(en_vocab): displacy.render(doc) +@pytest.mark.issue(3289) def test_issue3289(): """Test that Language.to_bytes handles serializing a pipeline component with an uninitialized model.""" @@ -156,6 +166,7 @@ def test_issue3289(): new_nlp.from_bytes(bytes_data) +@pytest.mark.issue(3328) def test_issue3328(en_vocab): doc = Doc(en_vocab, words=["Hello", ",", "how", "are", "you", "doing", "?"]) matcher = Matcher(en_vocab) @@ -170,6 +181,7 @@ def test_issue3328(en_vocab): assert matched_texts == ["Hello", "how", "you", "doing"] +@pytest.mark.issue(3331) def test_issue3331(en_vocab): """Test that duplicate patterns for different rules result in multiple matches, 
one per rule. @@ -184,6 +196,7 @@ def test_issue3331(en_vocab): assert sorted(match_ids) == ["A", "B"] +@pytest.mark.issue(3345) def test_issue3345(): """Test case where preset entity crosses sentence boundary.""" nlp = English() @@ -206,6 +219,7 @@ def test_issue3345(): assert ner.moves.is_valid(state, "B-GPE") +@pytest.mark.issue(3412) def test_issue3412(): data = numpy.asarray([[0, 0, 0], [1, 2, 3], [9, 8, 7]], dtype="f") vectors = Vectors(data=data, keys=["A", "B", "C"]) @@ -216,6 +230,7 @@ def test_issue3412(): @pytest.mark.skip(reason="default suffix rules avoid one upper-case letter before dot") +@pytest.mark.issue(3449) def test_issue3449(): nlp = English() nlp.add_pipe("sentencizer") @@ -230,6 +245,7 @@ def test_issue3449(): assert t3[5].text == "I" +@pytest.mark.issue(3456) def test_issue3456(): # this crashed because of a padding error in layer.ops.unflatten in thinc nlp = English() @@ -239,6 +255,7 @@ def test_issue3456(): list(nlp.pipe(["hi", ""])) +@pytest.mark.issue(3468) def test_issue3468(): """Test that sentence boundaries are set correctly so Doc.has_annotation("SENT_START") can be restored after serialization.""" diff --git a/spacy/tests/regression/test_issue3501-4000.py b/spacy/tests/regression/test_issue3501-4000.py index 71c3768dd..5d9bc4e83 100644 --- a/spacy/tests/regression/test_issue3501-4000.py +++ b/spacy/tests/regression/test_issue3501-4000.py @@ -24,6 +24,7 @@ from ..util import make_tempdir @pytest.mark.parametrize("word", ["don't", "don’t", "I'd", "I’d"]) +@pytest.mark.issue(3521) def test_issue3521(en_tokenizer, word): tok = en_tokenizer(word)[1] # 'not' and 'would' should be stopwords, also in their abbreviated forms @@ -108,6 +109,7 @@ def test_issue_3526_4(en_vocab): assert new_ruler.overwrite is True +@pytest.mark.issue(3531) def test_issue3531(): """Test that displaCy renderer doesn't require "settings" key.""" example_dep = { @@ -137,6 +139,7 @@ def test_issue3531(): assert ent_html +@pytest.mark.issue(3540) def 
test_issue3540(en_vocab): words = ["I", "live", "in", "NewYork", "right", "now"] tensor = numpy.asarray( @@ -176,6 +179,7 @@ def test_issue3540(en_vocab): assert vectors_1[5].tolist() == vectors_2[6].tolist() +@pytest.mark.issue(3549) def test_issue3549(en_vocab): """Test that match pattern validation doesn't raise on empty errors.""" matcher = Matcher(en_vocab, validate=True) @@ -186,6 +190,7 @@ def test_issue3549(en_vocab): @pytest.mark.skip("Matching currently only works on strings and integers") +@pytest.mark.issue(3555) def test_issue3555(en_vocab): """Test that custom extensions with default None don't break matcher.""" Token.set_extension("issue3555", default=None) @@ -196,6 +201,7 @@ def test_issue3555(en_vocab): matcher(doc) +@pytest.mark.issue(3611) def test_issue3611(): """Test whether adding n-grams in the textcat works even when n > token length of some docs""" unique_classes = ["offensive", "inoffensive"] @@ -232,6 +238,7 @@ def test_issue3611(): nlp.update(examples=batch, sgd=optimizer, drop=0.1, losses=losses) +@pytest.mark.issue(3625) def test_issue3625(): """Test that default punctuation rules applies to hindi unicode characters""" nlp = Hindi() @@ -240,6 +247,7 @@ def test_issue3625(): assert [token.text for token in doc] == expected +@pytest.mark.issue(3803) def test_issue3803(): """Test that spanish num-like tokens have True for like_num attribute.""" nlp = Spanish() @@ -255,6 +263,7 @@ def _parser_example(parser): return Example.from_dict(doc, gold) +@pytest.mark.issue(3830) def test_issue3830_no_subtok(): """Test that the parser doesn't have subtok label if not learn_tokens""" config = { @@ -268,6 +277,7 @@ def test_issue3830_no_subtok(): assert "subtok" not in parser.labels +@pytest.mark.issue(3830) def test_issue3830_with_subtok(): """Test that the parser does have subtok label if learn_tokens=True.""" config = { @@ -281,6 +291,7 @@ def test_issue3830_with_subtok(): assert "subtok" in parser.labels +@pytest.mark.issue(3839) def 
test_issue3839(en_vocab): """Test that match IDs returned by the matcher are correct, are in the string""" doc = Doc(en_vocab, words=["terrific", "group", "of", "people"]) @@ -307,6 +318,7 @@ def test_issue3839(en_vocab): "It was a missed assignment, but it shouldn't have resulted in a turnover ...", ], ) +@pytest.mark.issue(3869) def test_issue3869(sentence): """Test that the Doc's count_by function works consistently""" nlp = English() @@ -317,6 +329,7 @@ def test_issue3869(sentence): assert count == doc.count_by(IS_ALPHA).get(1, 0) +@pytest.mark.issue(3879) def test_issue3879(en_vocab): doc = Doc(en_vocab, words=["This", "is", "a", "test", "."]) assert len(doc) == 5 @@ -326,6 +339,7 @@ def test_issue3879(en_vocab): assert len(matcher(doc)) == 2 # fails because of a FP match 'is a test' +@pytest.mark.issue(3880) def test_issue3880(): """Test that `nlp.pipe()` works when an empty string ends the batch. @@ -341,6 +355,7 @@ def test_issue3880(): pass +@pytest.mark.issue(3882) def test_issue3882(en_vocab): """Test that displaCy doesn't serialize the doc.user_data when making a copy of the Doc. @@ -350,6 +365,7 @@ def test_issue3882(en_vocab): parse_deps(doc) +@pytest.mark.issue(3951) def test_issue3951(en_vocab): """Test that combinations of optional rules are matched correctly.""" matcher = Matcher(en_vocab) @@ -365,6 +381,7 @@ def test_issue3951(en_vocab): assert len(matches) == 0 +@pytest.mark.issue(3959) def test_issue3959(): """Ensure that a modified pos attribute is serialized correctly.""" nlp = English() @@ -383,6 +400,7 @@ def test_issue3959(): assert doc2[0].pos_ == "NOUN" +@pytest.mark.issue(3962) def test_issue3962(en_vocab): """Ensure that as_doc does not result in out-of-bound access of tokens. 
This is achieved by setting the head to itself if it would lie out of the span otherwise.""" @@ -421,6 +439,7 @@ def test_issue3962(en_vocab): assert len(list(doc3.sents)) == 1 +@pytest.mark.issue(3962) def test_issue3962_long(en_vocab): """Ensure that as_doc does not result in out-of-bound access of tokens. This is achieved by setting the head to itself if it would lie out of the span otherwise.""" @@ -456,6 +475,7 @@ def test_issue3962_long(en_vocab): assert sents[1].text == "They never" +@pytest.mark.issue(3972) def test_issue3972(en_vocab): """Test that the PhraseMatcher returns duplicates for duplicate match IDs.""" matcher = PhraseMatcher(en_vocab) diff --git a/spacy/tests/regression/test_issue4001-4500.py b/spacy/tests/regression/test_issue4001-4500.py index 4410e6236..7b7c304a3 100644 --- a/spacy/tests/regression/test_issue4001-4500.py +++ b/spacy/tests/regression/test_issue4001-4500.py @@ -17,6 +17,7 @@ from thinc.api import compounding from ..util import make_tempdir +@pytest.mark.issue(4002) def test_issue4002(en_vocab): """Test that the PhraseMatcher can match on overwritten NORM attributes.""" matcher = PhraseMatcher(en_vocab, attr="NORM") @@ -37,6 +38,7 @@ def test_issue4002(en_vocab): assert len(matches) == 1 +@pytest.mark.issue(4030) def test_issue4030(): """Test whether textcat works fine with empty doc""" unique_classes = ["offensive", "inoffensive"] @@ -77,6 +79,7 @@ def test_issue4030(): assert doc.cats["inoffensive"] == 0.0 +@pytest.mark.issue(4042) def test_issue4042(): """Test that serialization of an EntityRuler before NER works fine.""" nlp = English() @@ -105,6 +108,7 @@ def test_issue4042(): assert doc2.ents[0].label_ == "MY_ORG" +@pytest.mark.issue(4042) def test_issue4042_bug2(): """ Test that serialization of an NER works fine when new labels were added. 
@@ -139,6 +143,7 @@ def test_issue4042_bug2(): assert len(ner2.labels) == 2 +@pytest.mark.issue(4054) def test_issue4054(en_vocab): """Test that a new blank model can be made with a vocab from file, and that serialization does not drop the language at any point.""" @@ -159,6 +164,7 @@ def test_issue4054(en_vocab): assert nlp3.lang == "en" +@pytest.mark.issue(4120) def test_issue4120(en_vocab): """Test that matches without a final {OP: ?} token are returned.""" matcher = Matcher(en_vocab) @@ -177,6 +183,7 @@ def test_issue4120(en_vocab): assert len(matcher(doc4)) == 3 # fixed +@pytest.mark.issue(4133) def test_issue4133(en_vocab): nlp = English() vocab_bytes = nlp.vocab.to_bytes() @@ -196,6 +203,7 @@ def test_issue4133(en_vocab): assert actual == pos +@pytest.mark.issue(4190) def test_issue4190(): def customize_tokenizer(nlp): prefix_re = compile_prefix_regex(nlp.Defaults.prefixes) @@ -236,6 +244,7 @@ def test_issue4190(): assert result_1b == result_2 +@pytest.mark.issue(4267) def test_issue4267(): """Test that running an entity_ruler after ner gives consistent results""" nlp = English() @@ -262,6 +271,7 @@ def test_issue4267(): @pytest.mark.skip(reason="lemmatizer lookups no longer in vocab") +@pytest.mark.issue(4272) def test_issue4272(): """Test that lookup table can be accessed from Token.lemma if no POS tags are available.""" @@ -287,6 +297,7 @@ def test_multiple_predictions(): dummy_pipe(doc) +@pytest.mark.issue(4313) def test_issue4313(): """This should not crash or exit with some strange error code""" beam_width = 16 @@ -313,6 +324,7 @@ def test_issue4313(): assert "MY_ORG" in ner.labels +@pytest.mark.issue(4348) def test_issue4348(): """Test that training the tagger with empty data, doesn't throw errors""" nlp = English() @@ -328,6 +340,7 @@ def test_issue4348(): nlp.update(batch, sgd=optimizer, losses=losses) +@pytest.mark.issue(4367) def test_issue4367(): """Test that docbin init goes well""" DocBin() @@ -335,6 +348,7 @@ def test_issue4367(): 
DocBin(attrs=["LEMMA", "ENT_IOB", "ENT_TYPE"]) +@pytest.mark.issue(4373) def test_issue4373(): """Test that PhraseMatcher.vocab can be accessed (like Matcher.vocab).""" matcher = Matcher(Vocab()) @@ -343,6 +357,7 @@ def test_issue4373(): assert isinstance(matcher.vocab, Vocab) +@pytest.mark.issue(4402) def test_issue4402(): json_data = { "id": 0, diff --git a/spacy/tests/regression/test_issue4501-5000.py b/spacy/tests/regression/test_issue4501-5000.py index effd67306..07a00d2b7 100644 --- a/spacy/tests/regression/test_issue4501-5000.py +++ b/spacy/tests/regression/test_issue4501-5000.py @@ -14,6 +14,7 @@ from thinc.api import NumpyOps, get_current_ops from ..util import make_tempdir +@pytest.mark.issue(4528) def test_issue4528(en_vocab): """Test that user_data is correctly serialized in DocBin.""" doc = Doc(en_vocab, words=["hello", "world"]) @@ -37,6 +38,7 @@ def test_gold_misaligned(en_tokenizer, text, words): Example.from_dict(doc, {"words": words}) +@pytest.mark.issue(4651) def test_issue4651_with_phrase_matcher_attr(): """Test that the EntityRuler PhraseMatcher is deserialized correctly using the method from_disk when the EntityRuler argument phrase_matcher_attr is @@ -59,6 +61,7 @@ def test_issue4651_with_phrase_matcher_attr(): assert res == res_reloaded +@pytest.mark.issue(4651) def test_issue4651_without_phrase_matcher_attr(): """Test that the EntityRuler PhraseMatcher is deserialized correctly using the method from_disk when the EntityRuler argument phrase_matcher_attr is @@ -81,6 +84,7 @@ def test_issue4651_without_phrase_matcher_attr(): assert res == res_reloaded +@pytest.mark.issue(4665) def test_issue4665(): """ conllu_to_docs should not raise an exception if the HEAD column contains an @@ -109,6 +113,7 @@ def test_issue4665(): conllu_to_docs(input_data) +@pytest.mark.issue(4674) def test_issue4674(): """Test that setting entities with overlapping identifiers does not mess up IO""" nlp = English() @@ -135,6 +140,7 @@ def test_issue4674(): 
@pytest.mark.skip(reason="API change: disable just disables, new exclude arg") +@pytest.mark.issue(4707) def test_issue4707(): """Tests that disabled component names are also excluded from nlp.from_disk by default when loading a model. @@ -151,6 +157,7 @@ def test_issue4707(): assert "entity_ruler" in new_nlp.pipe_names +@pytest.mark.issue(4725) def test_issue4725_1(): """Ensure the pickling of the NER goes well""" vocab = Vocab(vectors_name="test_vocab_add_vector") @@ -169,6 +176,7 @@ def test_issue4725_1(): assert ner2.cfg["update_with_oracle_cut_size"] == 111 +@pytest.mark.issue(4725) def test_issue4725_2(): if isinstance(get_current_ops, NumpyOps): # ensures that this runs correctly and doesn't hang or crash because of the global vectors @@ -188,6 +196,7 @@ def test_issue4725_2(): pass +@pytest.mark.issue(4849) def test_issue4849(): nlp = English() patterns = [ @@ -235,6 +244,7 @@ class CustomPipe: return str(span.end) +@pytest.mark.issue(4903) def test_issue4903(): """Ensure that this runs correctly and doesn't hang or crash on Windows / macOS.""" @@ -249,6 +259,7 @@ def test_issue4903(): assert docs[2].text == "No, I prefer wasabi." 
+@pytest.mark.issue(4924) def test_issue4924(): nlp = Language() example = Example.from_dict(nlp.make_doc(""), {}) diff --git a/spacy/tests/regression/test_issue5001-5500.py b/spacy/tests/regression/test_issue5001-5500.py index bc9bcb982..e1f5231e7 100644 --- a/spacy/tests/regression/test_issue5001-5500.py +++ b/spacy/tests/regression/test_issue5001-5500.py @@ -12,6 +12,7 @@ import pytest from ...util import make_tempdir +@pytest.mark.issue(5048) def test_issue5048(en_vocab): words = ["This", "is", "a", "sentence"] pos_s = ["DET", "VERB", "DET", "NOUN"] @@ -34,6 +35,7 @@ def test_issue5048(en_vocab): assert v1 == v2 +@pytest.mark.issue(5082) def test_issue5082(): # Ensure the 'merge_entities' pipeline does something sensible for the vectors of the merged tokens nlp = English() @@ -68,6 +70,7 @@ def test_issue5082(): numpy.testing.assert_array_equal(ops.to_numpy(parsed_vectors_2[2]), array34) +@pytest.mark.issue(5137) def test_issue5137(): factory_name = "test_issue5137" pipe_name = "my_component" @@ -98,6 +101,7 @@ def test_issue5137(): assert nlp2.get_pipe(pipe_name).categories == "my_categories" +@pytest.mark.issue(5141) def test_issue5141(en_vocab): """Ensure an empty DocBin does not crash on serialization""" doc_bin = DocBin(attrs=["DEP", "HEAD"]) @@ -107,6 +111,7 @@ def test_issue5141(en_vocab): assert list(doc_bin_2.get_docs(en_vocab)) == [] +@pytest.mark.issue(5152) def test_issue5152(): # Test that the comparison between a Span and a Token, goes well # There was a bug when the number of tokens in the span equaled the number of characters in the token (!) 
@@ -125,6 +130,7 @@ def test_issue5152(): assert span_2.similarity(span_3) < 1.0 +@pytest.mark.issue(5458) def test_issue5458(): # Test that the noun chuncker does not generate overlapping spans # fmt: off diff --git a/spacy/tests/regression/test_issue5501-6000.py b/spacy/tests/regression/test_issue5501-6000.py index 355ffffeb..87c40ec2a 100644 --- a/spacy/tests/regression/test_issue5501-6000.py +++ b/spacy/tests/regression/test_issue5501-6000.py @@ -25,6 +25,7 @@ from spacy.training import Example multi_label_cnn_config, ], ) +@pytest.mark.issue(5551) def test_issue5551(textcat_config): """Test that after fixing the random seed, the results of the pipeline are truly identical""" component = "textcat" @@ -53,6 +54,7 @@ def test_issue5551(textcat_config): assert_almost_equal(ops.to_numpy(results[0]), ops.to_numpy(results[2]), decimal=5) +@pytest.mark.issue(5838) def test_issue5838(): # Displacy's EntityRenderer break line # not working after last entity @@ -65,6 +67,7 @@ def test_issue5838(): assert found == 4 +@pytest.mark.issue(5918) def test_issue5918(): # Test edge case when merging entities. 
nlp = English() diff --git a/spacy/tests/regression/test_issue6001-6500.py b/spacy/tests/regression/test_issue6001-6500.py index 470b2f388..cb27d39e4 100644 --- a/spacy/tests/regression/test_issue6001-6500.py +++ b/spacy/tests/regression/test_issue6001-6500.py @@ -4,6 +4,7 @@ from spacy.schemas import TokenPattern, TokenPatternSchema import pytest +@pytest.mark.issue(6207) def test_issue6207(en_tokenizer): doc = en_tokenizer("zero one two three four five six") @@ -18,6 +19,7 @@ def test_issue6207(en_tokenizer): assert s3 in result +@pytest.mark.issue(6258) def test_issue6258(): """Test that the non-empty constraint pattern field is respected""" # These one is valid diff --git a/spacy/tests/regression/test_issue6501-7000.py b/spacy/tests/regression/test_issue6501-7000.py index f57e4085c..84517d79b 100644 --- a/spacy/tests/regression/test_issue6501-7000.py +++ b/spacy/tests/regression/test_issue6501-7000.py @@ -13,6 +13,7 @@ import pickle from ..util import make_tempdir +@pytest.mark.issue(6730) def test_issue6730(en_vocab): """Ensure that the KB does not accept empty strings, but otherwise IO works fine.""" from spacy.kb import KnowledgeBase @@ -34,6 +35,7 @@ def test_issue6730(en_vocab): assert set(kb.get_alias_strings()) == {"x", "y"} +@pytest.mark.issue(6755) def test_issue6755(en_tokenizer): doc = en_tokenizer("This is a magnificent sentence.") span = doc[:0] @@ -45,6 +47,7 @@ def test_issue6755(en_tokenizer): "sentence, start_idx,end_idx,label", [("Welcome to Mumbai, my friend", 11, 17, "GPE")], ) +@pytest.mark.issue(6815) def test_issue6815_1(sentence, start_idx, end_idx, label): nlp = English() doc = nlp(sentence) @@ -55,6 +58,7 @@ def test_issue6815_1(sentence, start_idx, end_idx, label): @pytest.mark.parametrize( "sentence, start_idx,end_idx,kb_id", [("Welcome to Mumbai, my friend", 11, 17, 5)] ) +@pytest.mark.issue(6815) def test_issue6815_2(sentence, start_idx, end_idx, kb_id): nlp = English() doc = nlp(sentence) @@ -66,6 +70,7 @@ def 
test_issue6815_2(sentence, start_idx, end_idx, kb_id): "sentence, start_idx,end_idx,vector", [("Welcome to Mumbai, my friend", 11, 17, np.array([0.1, 0.2, 0.3]))], ) +@pytest.mark.issue(6815) def test_issue6815_3(sentence, start_idx, end_idx, vector): nlp = English() doc = nlp(sentence) @@ -73,6 +78,7 @@ def test_issue6815_3(sentence, start_idx, end_idx, vector): assert (span.vector == vector).all() +@pytest.mark.issue(6839) def test_issue6839(en_vocab): """Ensure that PhraseMatcher accepts Span as input""" # fmt: off @@ -155,6 +161,7 @@ labels = ['label1', 'label2'] "component_name", ["textcat", "textcat_multilabel"], ) +@pytest.mark.issue(6908) def test_issue6908(component_name): """Test intializing textcat with labels in a list""" @@ -219,6 +226,7 @@ upstream = "*" """ +@pytest.mark.issue(6950) def test_issue6950(): """Test that the nlp object with initialized tok2vec with listeners pickles correctly (and doesn't have lambdas). diff --git a/spacy/tests/regression/test_issue7001-8000.py b/spacy/tests/regression/test_issue7001-8000.py index 5bb7cc08e..17b8a6839 100644 --- a/spacy/tests/regression/test_issue7001-8000.py +++ b/spacy/tests/regression/test_issue7001-8000.py @@ -13,6 +13,7 @@ from wasabi import msg from ..util import make_tempdir +@pytest.mark.issue(7019) def test_issue7019(): scores = {"LABEL_A": 0.39829102, "LABEL_B": 0.938298329382, "LABEL_C": None} print_textcats_auc_per_cat(msg, scores) @@ -64,6 +65,7 @@ upstream = "*" """ +@pytest.mark.issue(7029) def test_issue7029(): """Test that an empty document doesn't mess up an entire batch.""" TRAIN_DATA = [ @@ -84,6 +86,7 @@ def test_issue7029(): assert [doc[0].tag_ for doc in docs1[:-1]] == [doc[0].tag_ for doc in docs2[:-1]] +@pytest.mark.issue(7055) def test_issue7055(): """Test that fill-config doesn't turn sourced components into factories.""" source_cfg = { @@ -118,6 +121,7 @@ def test_issue7055(): assert "model" in filled_cfg["components"]["ner"] +@pytest.mark.issue(7056) def test_issue7056(): 
"""Test that the Unshift transition works properly, and doesn't cause sentence segmentation errors.""" @@ -190,6 +194,7 @@ def test_partial_links(): assert "ORG" not in results["nel_f_per_type"] +@pytest.mark.issue(7065) def test_issue7065(): text = "Kathleen Battle sang in Mahler 's Symphony No. 8 at the Cincinnati Symphony Orchestra 's May Festival." nlp = English() @@ -217,6 +222,7 @@ def test_issue7065(): assert sentences.index(ent.sent) == 0 +@pytest.mark.issue(7065) def test_issue7065_b(): # Test that the NEL doesn't crash when an entity crosses a sentence boundary nlp = English() diff --git a/spacy/tests/regression/test_issue7716.py b/spacy/tests/regression/test_issue7716.py index 811952792..d9b3967ff 100644 --- a/spacy/tests/regression/test_issue7716.py +++ b/spacy/tests/regression/test_issue7716.py @@ -43,6 +43,7 @@ def parser(vocab): return parser +@pytest.mark.issue(7716) @pytest.mark.xfail(reason="Not fixed yet") def test_partial_annotation(parser): doc = Doc(parser.vocab, words=["a", "b", "c", "d"]) diff --git a/spacy/tests/regression/test_issue8190.py b/spacy/tests/regression/test_issue8190.py index 6ddbe53e0..1168630b6 100644 --- a/spacy/tests/regression/test_issue8190.py +++ b/spacy/tests/regression/test_issue8190.py @@ -3,6 +3,7 @@ from spacy.lang.en import English from ..util import make_tempdir +@pytest.mark.issue(8190) def test_issue8190(): """Test that config overrides are not lost after load is complete.""" source_cfg = { diff --git a/spacy/tests/regression/test_issue8216.py b/spacy/tests/regression/test_issue8216.py index 00cd6da3b..0370074fe 100644 --- a/spacy/tests/regression/test_issue8216.py +++ b/spacy/tests/regression/test_issue8216.py @@ -22,6 +22,7 @@ def patterns(): ] +@pytest.mark.issue(8216) def test_entity_ruler_fix8216(nlp, patterns): """Test that patterns don't get added excessively.""" ruler = nlp.add_pipe("entity_ruler", config={"validate": True}) From 8e7deaf210988ed87f72144dc7a75f9c27885f41 Mon Sep 17 00:00:00 2001 From: Lj 
Miranda Date: Fri, 5 Nov 2021 10:49:48 +0800 Subject: [PATCH 4/5] Add missing imports in some regression tests - test_issue7001-8000.py - test_issue8190.py --- spacy/tests/regression/test_issue7001-8000.py | 1 + spacy/tests/regression/test_issue8190.py | 2 ++ 2 files changed, 3 insertions(+) diff --git a/spacy/tests/regression/test_issue7001-8000.py b/spacy/tests/regression/test_issue7001-8000.py index 17b8a6839..1164e85b9 100644 --- a/spacy/tests/regression/test_issue7001-8000.py +++ b/spacy/tests/regression/test_issue7001-8000.py @@ -1,3 +1,4 @@ +import pytest from spacy.cli.evaluate import print_textcats_auc_per_cat, print_prf_per_type from spacy.lang.en import English from spacy.training import Example diff --git a/spacy/tests/regression/test_issue8190.py b/spacy/tests/regression/test_issue8190.py index 1168630b6..0b2f2824b 100644 --- a/spacy/tests/regression/test_issue8190.py +++ b/spacy/tests/regression/test_issue8190.py @@ -1,3 +1,5 @@ +import pytest + import spacy from spacy.lang.en import English from ..util import make_tempdir From 909177589dcdbde1cd4770f9f744d4d57d08d7e0 Mon Sep 17 00:00:00 2001 From: Lj Miranda Date: Sat, 6 Nov 2021 06:35:58 +0800 Subject: [PATCH 5/5] Remove utility script --- spacy/tests/regression/util_add_marker.py | 41 ----------------------- 1 file changed, 41 deletions(-) delete mode 100644 spacy/tests/regression/util_add_marker.py diff --git a/spacy/tests/regression/util_add_marker.py b/spacy/tests/regression/util_add_marker.py deleted file mode 100644 index 94fa415bc..000000000 --- a/spacy/tests/regression/util_add_marker.py +++ /dev/null @@ -1,41 +0,0 @@ -import re -from pathlib import Path -from typing import Optional - -import typer - - -def main( - filename: Path, out_file: Optional[Path] = typer.Option(None), dry_run: bool = False -): - """Add pytest issue markers on regression tests - - If --out-file is not used, it will overwrite the original file. 
You can set - the --dry-run flag to just see the changeset and not write to disk. - """ - lines = [] - with filename.open() as f: - lines = f.readlines() - - # Regex pattern for matching common regression formats (e.g. test_issue1234) - pattern = r"def test_issue\d{1,4}" - regex = re.compile(pattern) - - new_lines = [] - for line_text in lines: - if regex.search(line_text): # if match, append marker first - issue_num = int(re.findall(r"\d+", line_text)[0]) # Simple heuristic - typer.echo(f"Found: {line_text} with issue number: {issue_num}") - new_lines.append(f"@pytest.mark.issue({issue_num})\n") - new_lines.append(line_text) - - # Save to file - if not dry_run: - out = out_file or filename - with out.open("w") as f: - for new_line in new_lines: - f.write(new_line) - - -if __name__ == "__main__": - typer.run(main)