Decorate regression tests

Even though the issue number is already in the file, I still
decorated the tests to follow the convention found in test_issue8168.py.
Lj Miranda 2021-11-05 09:27:19 +08:00
parent 91dec2c76e
commit addeb34bc4
17 changed files with 183 additions and 0 deletions
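
The convention applied throughout this diff is an ordinary pytest marker, so no test logic changes. Below is a minimal sketch of how such a marker is typically registered so pytest does not warn about an unknown mark; the conftest.py hook shown here is an assumption for illustration, and the repository's actual registration may live in a setup or config file instead.

# conftest.py -- illustrative sketch only, not necessarily spaCy's actual setup
def pytest_configure(config):
    # Register the custom "issue" marker so that @pytest.mark.issue(1234)
    # does not emit PytestUnknownMarkWarning.
    config.addinivalue_line(
        "markers", "issue(number): mark a regression test with the issue it covers"
    )

Once the marker is registered, all decorated regression tests can be selected together with pytest -m issue, regardless of which file or test name carries the issue number.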

View File

@@ -12,6 +12,7 @@ from spacy.tokens import Doc, Span
from ..util import make_tempdir
@pytest.mark.issue(118)
@pytest.mark.parametrize(
"patterns",
[
@@ -39,6 +40,7 @@ def test_issue118(en_tokenizer, patterns):
assert ents[0].end == 11
@pytest.mark.issue(118)
@pytest.mark.parametrize(
"patterns",
[
@@ -66,6 +68,7 @@ def test_issue118_prefix_reorder(en_tokenizer, patterns):
assert ents[0].end == 11
@pytest.mark.issue(242)
def test_issue242(en_tokenizer):
"""Test overlapping multi-word phrases."""
text = "There are different food safety standards in different countries."
@@ -88,6 +91,7 @@ def test_issue242(en_tokenizer):
doc.ents += tuple(matches)
@pytest.mark.issue(309)
def test_issue309(en_vocab):
"""Test Issue #309: SBD fails on empty string"""
doc = Doc(en_vocab, words=[" "], heads=[0], deps=["ROOT"])
@@ -96,6 +100,7 @@ def test_issue309(en_vocab):
assert len(sents) == 1
@pytest.mark.issue(351)
def test_issue351(en_tokenizer):
doc = en_tokenizer(" This is a cat.")
assert doc[0].idx == 0
@@ -103,12 +108,14 @@ def test_issue351(en_tokenizer):
assert doc[1].idx == 3
@pytest.mark.issue(360)
def test_issue360(en_tokenizer):
"""Test tokenization of big ellipsis"""
tokens = en_tokenizer("$45...............Asking")
assert len(tokens) > 2
@pytest.mark.issue(361)
@pytest.mark.parametrize("text1,text2", [("cat", "dog")])
def test_issue361(en_vocab, text1, text2):
"""Test Issue #361: Equality of lexemes"""
@@ -116,6 +123,7 @@ def test_issue361(en_vocab, text1, text2):
assert en_vocab[text1] != en_vocab[text2]
@pytest.mark.issue(587)
def test_issue587(en_tokenizer):
"""Test that Matcher doesn't segfault on particular input"""
doc = en_tokenizer("a b; c")
@@ -131,12 +139,14 @@ def test_issue587(en_tokenizer):
assert len(matches) == 2
@pytest.mark.issue(588)
def test_issue588(en_vocab):
matcher = Matcher(en_vocab)
with pytest.raises(ValueError):
matcher.add("TEST", [[]])
@pytest.mark.issue(590)
def test_issue590(en_vocab):
"""Test overlapping matches"""
doc = Doc(en_vocab, words=["n", "=", "1", ";", "a", ":", "5", "%"])
@@ -149,6 +159,7 @@ def test_issue590(en_vocab):
assert len(matches) == 2
@pytest.mark.issue(595)
@pytest.mark.skip(reason="Old vocab-based lemmatization")
def test_issue595():
"""Test lemmatization of base forms"""
@@ -164,6 +175,7 @@ def test_issue595():
assert doc[2].lemma_ == "feed"
@pytest.mark.issue(599)
def test_issue599(en_vocab):
doc = Doc(en_vocab)
doc2 = Doc(doc.vocab)
@@ -171,12 +183,14 @@ def test_issue599(en_vocab):
assert doc2.has_annotation("DEP")
@pytest.mark.issue(600)
def test_issue600():
vocab = Vocab(tag_map={"NN": {"pos": "NOUN"}})
doc = Doc(vocab, words=["hello"])
doc[0].tag_ = "NN"
@pytest.mark.issue(615)
def test_issue615(en_tokenizer):
def merge_phrases(matcher, doc, i, matches):
"""Merge a phrase. We have to be careful here because we'll change the
@@ -204,6 +218,7 @@ def test_issue615(en_tokenizer):
assert entities[0].label != 0
@pytest.mark.issue(736)
@pytest.mark.parametrize("text,number", [("7am", "7"), ("11p.m.", "11")])
def test_issue736(en_tokenizer, text, number):
"""Test that times like "7am" are tokenized correctly and that numbers are
@@ -213,6 +228,7 @@ def test_issue736(en_tokenizer, text, number):
assert tokens[0].text == number
@pytest.mark.issue(740)
@pytest.mark.parametrize("text", ["3/4/2012", "01/12/1900"])
def test_issue740(en_tokenizer, text):
"""Test that dates are not split and kept as one token. This behaviour is
@@ -222,6 +238,7 @@ def test_issue740(en_tokenizer, text):
assert len(tokens) == 1
@pytest.mark.issue(743)
def test_issue743():
doc = Doc(Vocab(), ["hello", "world"])
token = doc[0]
@@ -230,6 +247,7 @@ def test_issue743():
assert items[0] is token
@pytest.mark.issue(744)
@pytest.mark.parametrize("text", ["We were scared", "We Were Scared"])
def test_issue744(en_tokenizer, text):
"""Test that 'were' and 'Were' are excluded from the contractions
@@ -239,6 +257,7 @@ def test_issue744(en_tokenizer, text):
assert tokens[1].text.lower() == "were"
@pytest.mark.issue(759)
@pytest.mark.parametrize(
"text,is_num", [("one", True), ("ten", True), ("teneleven", False)]
)
@@ -247,6 +266,7 @@ def test_issue759(en_tokenizer, text, is_num):
assert tokens[0].like_num == is_num
@pytest.mark.issue(775)
@pytest.mark.parametrize("text", ["Shell", "shell", "Shed", "shed"])
def test_issue775(en_tokenizer, text):
"""Test that 'Shell' and 'shell' are excluded from the contractions
@@ -256,6 +276,7 @@ def test_issue775(en_tokenizer, text):
assert tokens[0].text == text
@pytest.mark.issue(792)
@pytest.mark.parametrize("text", ["This is a string ", "This is a string\u0020"])
def test_issue792(en_tokenizer, text):
"""Test for Issue #792: Trailing whitespace is removed after tokenization."""
@@ -263,6 +284,7 @@ def test_issue792(en_tokenizer, text):
assert "".join([token.text_with_ws for token in doc]) == text
@pytest.mark.issue(792)
@pytest.mark.parametrize("text", ["This is a string", "This is a string\n"])
def test_control_issue792(en_tokenizer, text):
"""Test base case for Issue #792: Non-trailing whitespace"""
@@ -270,6 +292,7 @@ def test_control_issue792(en_tokenizer, text):
assert "".join([token.text_with_ws for token in doc]) == text
@pytest.mark.issue(801)
@pytest.mark.skip(
reason="Can not be fixed unless with variable-width lookbehinds, cf. PR #3218"
)
@@ -292,6 +315,7 @@ def test_issue801(en_tokenizer, text, tokens):
assert [t.text for t in doc] == tokens
@pytest.mark.issue(805)
@pytest.mark.parametrize(
"text,expected_tokens",
[
@@ -311,6 +335,7 @@ def test_issue805(sv_tokenizer, text, expected_tokens):
assert expected_tokens == token_list
@pytest.mark.issue(850)
def test_issue850():
"""The variable-length pattern matches the succeeding token. Check we
handle the ambiguity correctly."""
@@ -326,6 +351,7 @@ def test_issue850():
assert end == 4
@pytest.mark.issue(850)
def test_issue850_basic():
"""Test Matcher matches with '*' operator and Boolean flag"""
vocab = Vocab(lex_attr_getters={LOWER: lambda string: string.lower()})
@@ -340,6 +366,7 @@ def test_issue850_basic():
assert end == 4
@pytest.mark.issue(852)
@pytest.mark.skip(
reason="French exception list is not enabled in the default tokenizer anymore"
)
@@ -352,6 +379,7 @@ def test_issue852(fr_tokenizer, text):
assert len(tokens) == 1
@pytest.mark.issue(859)
@pytest.mark.parametrize(
"text", ["aaabbb@ccc.com\nThank you!", "aaabbb@ccc.com \nThank you!"]
)
@@ -361,6 +389,7 @@ def test_issue859(en_tokenizer, text):
assert doc.text == text
@pytest.mark.issue(886)
@pytest.mark.parametrize("text", ["Datum:2014-06-02\nDokument:76467"])
def test_issue886(en_tokenizer, text):
"""Test that token.idx matches the original text index for texts with newlines."""
@@ -370,6 +399,7 @@ def test_issue886(en_tokenizer, text):
assert text[token.idx] == token.text[0]
@pytest.mark.issue(891)
@pytest.mark.parametrize("text", ["want/need"])
def test_issue891(en_tokenizer, text):
"""Test that / infixes are split correctly."""
@@ -378,6 +408,7 @@ def test_issue891(en_tokenizer, text):
assert tokens[1].text == "/"
@pytest.mark.issue(912)
@pytest.mark.skip(reason="Old vocab-based lemmatization")
@pytest.mark.parametrize(
"text,tag,lemma",
@@ -390,6 +421,7 @@ def test_issue912(en_vocab, text, tag, lemma):
assert doc[0].lemma_ == lemma
@pytest.mark.issue(957)
@pytest.mark.slow
def test_issue957(en_tokenizer):
"""Test that spaCy doesn't hang on many punctuation characters.
@@ -405,6 +437,7 @@ def test_issue957(en_tokenizer):
assert doc
@pytest.mark.issue(999)
def test_issue999():
"""Test that adding entities and resuming training works passably OK.
There are two issues here:

View File

@@ -9,6 +9,7 @@ from spacy.tokenizer import Tokenizer
from spacy.symbols import ORTH, LEMMA, POS
@pytest.mark.issue(1061)
def test_issue1061():
"""Test special-case works after tokenizing. Was caching problem."""
text = "I like _MATH_ even _MATH_ when _MATH_, except when _MATH_ is _MATH_! but not _MATH_."
@@ -33,6 +34,7 @@ def test_issue1061():
@pytest.mark.skip(
reason="Can not be fixed without variable-width look-behind (which we don't want)"
)
@pytest.mark.issue(1235)
def test_issue1235():
"""Test that g is not split of if preceded by a number and a letter"""
nlp = English()
@@ -46,6 +48,7 @@ def test_issue1235():
assert doc[4].text == "g"
@pytest.mark.issue(1242)
def test_issue1242():
nlp = English()
doc = nlp("")
@@ -56,6 +59,7 @@ def test_issue1242():
@pytest.mark.skip(reason="v3 no longer supports LEMMA/POS in tokenizer special cases")
@pytest.mark.issue(1250)
def test_issue1250():
"""Test cached special cases."""
special_case = [{ORTH: "reimbur", LEMMA: "reimburse", POS: "VERB"}]
@@ -67,6 +71,7 @@ def test_issue1250():
assert lemmas == ["reimburse", ",", "reimburse", "..."]
@pytest.mark.issue(1257)
def test_issue1257():
"""Test that tokens compare correctly."""
doc1 = Doc(Vocab(), words=["a", "b", "c"])
@@ -75,6 +80,7 @@ def test_issue1257():
assert not doc1[0] == doc2[0]
@pytest.mark.issue(1375)
def test_issue1375():
"""Test that token.nbor() raises IndexError for out-of-bounds access."""
doc = Doc(Vocab(), words=["0", "1", "2"])
@@ -86,6 +92,7 @@ def test_issue1375():
assert doc[1].nbor(1).text == "2"
@pytest.mark.issue(1434)
def test_issue1434():
"""Test matches occur when optional element at end of short doc."""
pattern = [{"ORTH": "Hello"}, {"IS_ALPHA": True, "OP": "?"}]
@@ -111,6 +118,7 @@ def test_issue1434():
("a b b", 0, 3),
],
)
@pytest.mark.issue(1450)
def test_issue1450(string, start, end):
"""Test matcher works when patterns end with * operator."""
pattern = [{"ORTH": "a"}, {"ORTH": "b", "OP": "*"}]
@@ -124,6 +132,7 @@ def test_issue1450(string, start, end):
assert matches[-1][2] == end
@pytest.mark.issue(1488)
def test_issue1488():
prefix_re = re.compile(r"""[\[\("']""")
suffix_re = re.compile(r"""[\]\)"']""")
@@ -147,6 +156,7 @@ def test_issue1488():
assert token.text
@pytest.mark.issue(1494)
def test_issue1494():
infix_re = re.compile(r"""[^a-z]""")
test_cases = [

View File

@@ -17,6 +17,7 @@ from spacy.matcher import Matcher
from ..util import make_tempdir
@pytest.mark.issue(1506)
def test_issue1506():
def string_generator():
for _ in range(10001):
@@ -40,6 +41,7 @@ def test_issue1506():
str(t.lemma_)
@pytest.mark.issue(1518)
def test_issue1518():
"""Test vectors.resize() works."""
vectors = Vectors(shape=(10, 10))
@@ -47,6 +49,7 @@ def test_issue1518():
vectors.resize((5, 9))
@pytest.mark.issue(1537)
def test_issue1537():
"""Test that Span.as_doc() doesn't segfault."""
string = "The sky is blue . The man is pink . The dog is purple ."
@@ -65,6 +68,7 @@ def test_issue1537():
# TODO: Currently segfaulting, due to l_edge and r_edge misalignment
@pytest.mark.issue(1537)
# def test_issue1537_model():
# nlp = load_spacy('en')
# doc = nlp('The sky is blue. The man is pink. The dog is purple.')
@@ -73,12 +77,14 @@ def test_issue1537():
# print(list(sents[1].noun_chunks))
@pytest.mark.issue(1539)
def test_issue1539():
"""Ensure vectors.resize() doesn't try to modify dictionary during iteration."""
v = Vectors(shape=(10, 10), keys=[5, 3, 98, 100])
v.resize((100, 100))
@pytest.mark.issue(1547)
def test_issue1547():
"""Test that entity labels still match after merging tokens."""
words = ["\n", "worda", ".", "\n", "wordb", "-", "Biosphere", "2", "-", " \n"]
@@ -89,12 +95,14 @@ def test_issue1547():
assert [ent.text for ent in doc.ents]
@pytest.mark.issue(1612)
def test_issue1612(en_tokenizer):
doc = en_tokenizer("The black cat purrs.")
span = doc[1:3]
assert span.orth_ == span.text
@pytest.mark.issue(1654)
def test_issue1654():
nlp = Language(Vocab())
assert not nlp.pipeline
@@ -116,12 +124,14 @@ def test_issue1654():
@pytest.mark.parametrize("text", ["test@example.com", "john.doe@example.co.uk"])
@pytest.mark.issue(1698)
def test_issue1698(en_tokenizer, text):
doc = en_tokenizer(text)
assert len(doc) == 1
assert not doc[0].like_url
@pytest.mark.issue(1727)
def test_issue1727():
"""Test that models with no pretrained vectors can be deserialized
correctly after vectors are added."""
@@ -138,6 +148,7 @@ def test_issue1727():
assert tagger.cfg.get("pretrained_dims", 0) == 0
@pytest.mark.issue(1757)
def test_issue1757():
"""Test comparison against None doesn't cause segfault."""
doc = Doc(Vocab(), words=["a", "b", "c"])
@@ -151,12 +162,14 @@ def test_issue1757():
assert not doc.vocab["a"] < None
@pytest.mark.issue(1758)
def test_issue1758(en_tokenizer):
"""Test that "would've" is handled by the English tokenizer exceptions."""
tokens = en_tokenizer("would've")
assert len(tokens) == 2
@pytest.mark.issue(1773)
def test_issue1773(en_tokenizer):
"""Test that spaces don't receive a POS but no TAG. This is the root cause
of the serialization issue reported in #1773."""
@@ -165,6 +178,7 @@ def test_issue1773(en_tokenizer):
assert doc[0].tag_ != ""
@pytest.mark.issue(1799)
def test_issue1799():
"""Test sentence boundaries are deserialized correctly, even for
non-projective sentences."""
@@ -186,6 +200,7 @@ def test_issue1799():
assert len(list(doc.sents)) == 1
@pytest.mark.issue(1807)
def test_issue1807():
"""Test vocab.set_vector also adds the word to the vocab."""
vocab = Vocab(vectors_name="test_issue1807")
@@ -194,6 +209,7 @@ def test_issue1807():
assert "hello" in vocab
@pytest.mark.issue(1834)
def test_issue1834():
"""Test that sentence boundaries & parse/tag flags are not lost
during serialization."""
@@ -217,6 +233,7 @@ def test_issue1834():
assert new_doc.has_annotation("TAG")
@pytest.mark.issue(1868)
def test_issue1868():
"""Test Vocab.__contains__ works with int keys."""
vocab = Vocab()
@@ -228,6 +245,7 @@ def test_issue1868():
assert int_id not in vocab
@pytest.mark.issue(1883)
def test_issue1883():
matcher = Matcher(Vocab())
matcher.add("pat1", [[{"orth": "hello"}]])
@@ -239,11 +257,13 @@ def test_issue1883():
@pytest.mark.parametrize("word", ["the"])
@pytest.mark.issue(1889)
def test_issue1889(word):
assert is_stop(word, STOP_WORDS) == is_stop(word.upper(), STOP_WORDS)
@pytest.mark.skip(reason="obsolete with the config refactor of v.3")
@pytest.mark.issue(1915)
def test_issue1915():
cfg = {"hidden_depth": 2} # should error out
nlp = Language()
@@ -253,6 +273,7 @@ def test_issue1915():
nlp.initialize(**cfg)
@pytest.mark.issue(1945)
def test_issue1945():
"""Test regression in Matcher introduced in v2.0.6."""
matcher = Matcher(Vocab())
@@ -264,6 +285,7 @@ def test_issue1945():
assert matches[1][1:] == (1, 3)
@pytest.mark.issue(1963)
def test_issue1963(en_tokenizer):
"""Test that doc.merge() resizes doc.tensor"""
doc = en_tokenizer("a b c d")
@@ -275,6 +297,7 @@ def test_issue1963(en_tokenizer):
@pytest.mark.parametrize("label", ["U-JOB-NAME"])
@pytest.mark.issue(1967)
def test_issue1967(label):
nlp = Language()
config = {}
@@ -293,6 +316,7 @@ def test_issue1967(label):
assert "JOB-NAME" in ner.moves.get_actions(examples=[example])[1]
@pytest.mark.issue(1971)
def test_issue1971(en_vocab):
# Possibly related to #2675 and #2671?
matcher = Matcher(en_vocab)

View File

@@ -13,6 +13,7 @@ from ..util import add_vecs_to_vocab
@pytest.mark.skip(
reason="Can not be fixed without iterative looping between prefix/suffix and infix"
)
@pytest.mark.issue(2070)
def test_issue2070():
"""Test that checks that a dot followed by a quote is handled
appropriately.
@@ -25,6 +26,7 @@ def test_issue2070():
assert len(doc) == 11
@pytest.mark.issue(2179)
def test_issue2179():
"""Test that spurious 'extra_labels' aren't created when initializing NER."""
nlp = Italian()
@@ -41,6 +43,7 @@ def test_issue2179():
assert nlp2.get_pipe("ner").labels == ("CITIZENSHIP",)
@pytest.mark.issue(2203)
def test_issue2203(en_vocab):
"""Test that lemmas are set correctly in doc.from_array."""
words = ["I", "'ll", "survive"]
@@ -61,6 +64,7 @@ def test_issue2203(en_vocab):
assert [t.lemma_ for t in new_doc] == lemmas
@pytest.mark.issue(2219)
def test_issue2219(en_vocab):
vectors = [("a", [1, 2, 3]), ("letter", [4, 5, 6])]
add_vecs_to_vocab(en_vocab, vectors)
@@ -69,6 +73,7 @@ def test_issue2219(en_vocab):
assert doc[0].similarity(doc[1]) == doc[1].similarity(doc[0])
@pytest.mark.issue(2361)
def test_issue2361(de_vocab):
chars = ("&lt;", "&gt;", "&amp;", "&quot;")
words = ["<", ">", "&", '"']
@@ -78,6 +83,7 @@ def test_issue2361(de_vocab):
assert char in html
@pytest.mark.issue(2385)
def test_issue2385():
"""Test that IOB tags are correctly converted to BILUO tags."""
# fix bug in labels with a 'b' character
@@ -99,11 +105,13 @@ def test_issue2385():
("U-BRAWLER", "U-BRAWLER"),
],
)
@pytest.mark.issue(2385)
def test_issue2385_biluo(tags):
"""Test that BILUO-compatible tags aren't modified."""
assert iob_to_biluo(tags) == list(tags)
@pytest.mark.issue(2396)
def test_issue2396(en_vocab):
words = ["She", "created", "a", "test", "for", "spacy"]
heads = [1, 1, 3, 1, 3, 4]
@@ -125,6 +133,7 @@ def test_issue2396(en_vocab):
assert (span.get_lca_matrix() == matrix).all()
@pytest.mark.issue(2464)
def test_issue2464(en_vocab):
"""Test problem with successive ?. This is the same bug, so putting it here."""
matcher = Matcher(en_vocab)
@@ -134,6 +143,7 @@ def test_issue2464(en_vocab):
assert len(matches) == 3
@pytest.mark.issue(2482)
def test_issue2482():
"""Test we can serialize and deserialize a blank NER or parser model."""
nlp = Italian()

View File

@@ -13,6 +13,7 @@ import numpy
import random
@pytest.mark.issue(2564)
def test_issue2564():
"""Test the tagger sets has_annotation("TAG") correctly when used via Language.pipe."""
nlp = Language()
@@ -26,6 +27,7 @@ def test_issue2564():
assert piped_doc.has_annotation("TAG")
@pytest.mark.issue(2569)
def test_issue2569(en_tokenizer):
"""Test that operator + is greedy."""
doc = en_tokenizer("It is May 15, 1993.")
@@ -46,12 +48,14 @@ def test_issue2569(en_tokenizer):
"oow.jspsearch.eventoracleopenworldsearch.technologyoraclesolarissearch.technologystoragesearch.technologylinuxsearch.technologyserverssearch.technologyvirtualizationsearch.technologyengineeredsystemspcodewwmkmppscem:",
],
)
@pytest.mark.issue(2626)
def test_issue2626_2835(en_tokenizer, text):
"""Check that sentence doesn't cause an infinite loop in the tokenizer."""
doc = en_tokenizer(text)
assert doc
@pytest.mark.issue(2656)
def test_issue2656(en_tokenizer):
"""Test that tokenizer correctly splits off punctuation after numbers with
decimal points.
@@ -71,6 +75,7 @@ def test_issue2656(en_tokenizer):
assert doc[10].text == "."
@pytest.mark.issue(2671)
def test_issue2671():
"""Ensure the correct entity ID is returned for matches with quantifiers.
See also #2675
@@ -94,6 +99,7 @@ def test_issue2671():
assert nlp.vocab.strings[match_id] == pattern_id
@pytest.mark.issue(2728)
def test_issue2728(en_vocab):
"""Test that displaCy ENT visualizer escapes HTML correctly."""
doc = Doc(en_vocab, words=["test", "<RELEASE>", "test"])
@@ -105,6 +111,7 @@ def test_issue2728(en_vocab):
assert "&lt;RELEASE&gt;" in html
@pytest.mark.issue(2754)
def test_issue2754(en_tokenizer):
"""Test that words like 'a' and 'a.m.' don't get exceptional norm values."""
a = en_tokenizer("a")
@@ -113,6 +120,7 @@ def test_issue2754(en_tokenizer):
assert am[0].norm_ == "am"
@pytest.mark.issue(2772)
def test_issue2772(en_vocab):
"""Test that deprojectivization doesn't mess up sentence boundaries."""
# fmt: off
@@ -128,6 +136,7 @@ def test_issue2772(en_vocab):
@pytest.mark.parametrize("text", ["-0.23", "+123,456", "±1"])
@pytest.mark.parametrize("lang_cls", [English, MultiLanguage])
@pytest.mark.issue(2782)
def test_issue2782(text, lang_cls):
"""Check that like_num handles + and - before number."""
nlp = lang_cls()
@@ -136,6 +145,7 @@ def test_issue2782(text, lang_cls):
assert doc[0].like_num
@pytest.mark.issue(2800)
def test_issue2800():
"""Test issue that arises when too many labels are added to NER model.
Used to cause segfault.
@@ -157,6 +167,7 @@ def test_issue2800():
nlp.update([example], sgd=optimizer, losses=losses, drop=0.5)
@pytest.mark.issue(2822)
def test_issue2822(it_tokenizer):
"""Test that the abbreviation of poco is kept as one word."""
doc = it_tokenizer("Vuoi un po' di zucchero?")
@@ -169,6 +180,7 @@ def test_issue2822(it_tokenizer):
assert doc[5].text == "?"
@pytest.mark.issue(2833)
def test_issue2833(en_vocab):
"""Test that a custom error is raised if a token or span is pickled."""
doc = Doc(en_vocab, words=["Hello", "world"])
@@ -178,6 +190,7 @@ def test_issue2833(en_vocab):
pickle.dumps(doc[0:2])
@pytest.mark.issue(2871)
def test_issue2871():
"""Test that vectors recover the correct key for spaCy reserved words."""
words = ["dog", "cat", "SUFFIX"]
@@ -196,6 +209,7 @@ def test_issue2871():
assert vocab.vectors.find(key="SUFFIX") == 2
@pytest.mark.issue(2901)
def test_issue2901():
"""Test that `nlp` doesn't fail."""
try:
@@ -207,6 +221,7 @@ def test_issue2901():
assert doc
@pytest.mark.issue(2926)
def test_issue2926(fr_tokenizer):
"""Test that the tokenizer correctly splits tokens separated by a slash (/)
ending in a digit.

View File

@@ -14,6 +14,7 @@ from spacy.vectors import Vectors
import numpy
@pytest.mark.issue(3002)
def test_issue3002():
"""Test that the tokenizer doesn't hang on a long list of dots"""
nlp = German()
@@ -23,6 +24,7 @@ def test_issue3002():
assert len(doc) == 5
@pytest.mark.issue(3009)
def test_issue3009(en_vocab):
"""Test problem with matcher quantifiers"""
patterns = [
@@ -53,6 +55,7 @@ def test_issue3009(en_vocab):
assert matches
@pytest.mark.issue(3012)
def test_issue3012(en_vocab):
"""Test that the is_tagged attribute doesn't get overwritten when we from_array
without tag information."""
@@ -74,6 +77,7 @@ def test_issue3012(en_vocab):
assert (doc2[2].text, doc2[2].pos_, doc2[2].tag_, doc2[2].ent_type_) == expected
@pytest.mark.issue(3199)
def test_issue3199():
"""Test that Span.noun_chunks works correctly if no noun chunks iterator
is available. To make this test future-proof, we're constructing a Doc
@@ -85,6 +89,7 @@ def test_issue3199():
list(doc[0:3].noun_chunks)
@pytest.mark.issue(3209)
def test_issue3209():
"""Test issue that occurred in spaCy nightly where NER labels were being
mapped to classes incorrectly after loading the model, when the labels
@@ -104,6 +109,7 @@ def test_issue3209():
assert ner2.move_names == move_names
@pytest.mark.issue(3248)
def test_issue3248_1():
"""Test that the PhraseMatcher correctly reports its number of rules, not
total number of patterns."""
@@ -114,6 +120,7 @@ def test_issue3248_1():
assert len(matcher) == 2
@pytest.mark.issue(3248)
def test_issue3248_2():
"""Test that the PhraseMatcher can be pickled correctly."""
nlp = English()
@@ -125,6 +132,7 @@ def test_issue3248_2():
assert len(new_matcher) == len(matcher)
@pytest.mark.issue(3277)
def test_issue3277(es_tokenizer):
"""Test that hyphens are split correctly as prefixes."""
doc = es_tokenizer("—Yo me llamo... murmuró el niño Emilio Sánchez Pérez.")
@@ -134,6 +142,7 @@ def test_issue3277(es_tokenizer):
assert doc[9].text == "\u2013"
@pytest.mark.issue(3288)
def test_issue3288(en_vocab):
"""Test that retokenization works correctly via displaCy when punctuation
is merged onto the preceeding token and tensor is resized."""
@@ -145,6 +154,7 @@ def test_issue3288(en_vocab):
displacy.render(doc)
@pytest.mark.issue(3289)
def test_issue3289():
"""Test that Language.to_bytes handles serializing a pipeline component
with an uninitialized model."""
@@ -156,6 +166,7 @@ def test_issue3289():
new_nlp.from_bytes(bytes_data)
@pytest.mark.issue(3328)
def test_issue3328(en_vocab):
doc = Doc(en_vocab, words=["Hello", ",", "how", "are", "you", "doing", "?"])
matcher = Matcher(en_vocab)
@@ -170,6 +181,7 @@ def test_issue3328(en_vocab):
assert matched_texts == ["Hello", "how", "you", "doing"]
@pytest.mark.issue(3331)
def test_issue3331(en_vocab):
"""Test that duplicate patterns for different rules result in multiple
matches, one per rule.
@@ -184,6 +196,7 @@ def test_issue3331(en_vocab):
assert sorted(match_ids) == ["A", "B"]
@pytest.mark.issue(3345)
def test_issue3345():
"""Test case where preset entity crosses sentence boundary."""
nlp = English()
@@ -206,6 +219,7 @@ def test_issue3345():
assert ner.moves.is_valid(state, "B-GPE")
@pytest.mark.issue(3412)
def test_issue3412():
data = numpy.asarray([[0, 0, 0], [1, 2, 3], [9, 8, 7]], dtype="f")
vectors = Vectors(data=data, keys=["A", "B", "C"])
@@ -216,6 +230,7 @@ def test_issue3412():
@pytest.mark.skip(reason="default suffix rules avoid one upper-case letter before dot")
@pytest.mark.issue(3449)
def test_issue3449():
nlp = English()
nlp.add_pipe("sentencizer")
@@ -230,6 +245,7 @@ def test_issue3449():
assert t3[5].text == "I"
@pytest.mark.issue(3456)
def test_issue3456():
# this crashed because of a padding error in layer.ops.unflatten in thinc
nlp = English()
@@ -239,6 +255,7 @@ def test_issue3456():
list(nlp.pipe(["hi", ""]))
@pytest.mark.issue(3468)
def test_issue3468():
"""Test that sentence boundaries are set correctly so Doc.has_annotation("SENT_START") can
be restored after serialization."""

View File

@@ -24,6 +24,7 @@ from ..util import make_tempdir
@pytest.mark.parametrize("word", ["don't", "dont", "I'd", "Id"])
@pytest.mark.issue(3521)
def test_issue3521(en_tokenizer, word):
tok = en_tokenizer(word)[1]
# 'not' and 'would' should be stopwords, also in their abbreviated forms
@@ -108,6 +109,7 @@ def test_issue_3526_4(en_vocab):
assert new_ruler.overwrite is True
@pytest.mark.issue(3531)
def test_issue3531():
"""Test that displaCy renderer doesn't require "settings" key."""
example_dep = {
@@ -137,6 +139,7 @@ def test_issue3531():
assert ent_html
@pytest.mark.issue(3540)
def test_issue3540(en_vocab):
words = ["I", "live", "in", "NewYork", "right", "now"]
tensor = numpy.asarray(
@@ -176,6 +179,7 @@ def test_issue3540(en_vocab):
assert vectors_1[5].tolist() == vectors_2[6].tolist()
@pytest.mark.issue(3549)
def test_issue3549(en_vocab):
"""Test that match pattern validation doesn't raise on empty errors."""
matcher = Matcher(en_vocab, validate=True)
@@ -186,6 +190,7 @@ def test_issue3549(en_vocab):
@pytest.mark.skip("Matching currently only works on strings and integers")
@pytest.mark.issue(3555)
def test_issue3555(en_vocab):
"""Test that custom extensions with default None don't break matcher."""
Token.set_extension("issue3555", default=None)
@@ -196,6 +201,7 @@ def test_issue3555(en_vocab):
matcher(doc)
@pytest.mark.issue(3611)
def test_issue3611():
"""Test whether adding n-grams in the textcat works even when n > token length of some docs"""
unique_classes = ["offensive", "inoffensive"]
@@ -232,6 +238,7 @@ def test_issue3611():
nlp.update(examples=batch, sgd=optimizer, drop=0.1, losses=losses)
@pytest.mark.issue(3625)
def test_issue3625():
"""Test that default punctuation rules applies to hindi unicode characters"""
nlp = Hindi()
@@ -240,6 +247,7 @@ def test_issue3625():
assert [token.text for token in doc] == expected
@pytest.mark.issue(3803)
def test_issue3803():
"""Test that spanish num-like tokens have True for like_num attribute."""
nlp = Spanish()
@@ -255,6 +263,7 @@ def _parser_example(parser):
return Example.from_dict(doc, gold)
@pytest.mark.issue(3830)
def test_issue3830_no_subtok():
"""Test that the parser doesn't have subtok label if not learn_tokens"""
config = {
@@ -268,6 +277,7 @@ def test_issue3830_no_subtok():
assert "subtok" not in parser.labels
@pytest.mark.issue(3830)
def test_issue3830_with_subtok():
"""Test that the parser does have subtok label if learn_tokens=True."""
config = {
@@ -281,6 +291,7 @@ def test_issue3830_with_subtok():
assert "subtok" in parser.labels
@pytest.mark.issue(3839)
def test_issue3839(en_vocab):
"""Test that match IDs returned by the matcher are correct, are in the string"""
doc = Doc(en_vocab, words=["terrific", "group", "of", "people"])
@@ -307,6 +318,7 @@ def test_issue3839(en_vocab):
"It was a missed assignment, but it shouldn't have resulted in a turnover ...",
],
)
@pytest.mark.issue(3869)
def test_issue3869(sentence):
"""Test that the Doc's count_by function works consistently"""
nlp = English()
@@ -317,6 +329,7 @@ def test_issue3869(sentence):
assert count == doc.count_by(IS_ALPHA).get(1, 0)
@pytest.mark.issue(3879)
def test_issue3879(en_vocab):
doc = Doc(en_vocab, words=["This", "is", "a", "test", "."])
assert len(doc) == 5
@@ -326,6 +339,7 @@ def test_issue3879(en_vocab):
assert len(matcher(doc)) == 2 # fails because of a FP match 'is a test'
@pytest.mark.issue(3880)
def test_issue3880():
"""Test that `nlp.pipe()` works when an empty string ends the batch.
@@ -341,6 +355,7 @@ def test_issue3880():
pass
@pytest.mark.issue(3882)
def test_issue3882(en_vocab):
"""Test that displaCy doesn't serialize the doc.user_data when making a
copy of the Doc.
@@ -350,6 +365,7 @@ def test_issue3882(en_vocab):
parse_deps(doc)
@pytest.mark.issue(3951)
def test_issue3951(en_vocab):
"""Test that combinations of optional rules are matched correctly."""
matcher = Matcher(en_vocab)
@@ -365,6 +381,7 @@ def test_issue3951(en_vocab):
assert len(matches) == 0
@pytest.mark.issue(3959)
def test_issue3959():
"""Ensure that a modified pos attribute is serialized correctly."""
nlp = English()
@@ -383,6 +400,7 @@ def test_issue3959():
assert doc2[0].pos_ == "NOUN"
@pytest.mark.issue(3962)
def test_issue3962(en_vocab):
"""Ensure that as_doc does not result in out-of-bound access of tokens.
This is achieved by setting the head to itself if it would lie out of the span otherwise."""
@@ -421,6 +439,7 @@ def test_issue3962(en_vocab):
assert len(list(doc3.sents)) == 1
@pytest.mark.issue(3962)
def test_issue3962_long(en_vocab):
"""Ensure that as_doc does not result in out-of-bound access of tokens.
This is achieved by setting the head to itself if it would lie out of the span otherwise."""
@@ -456,6 +475,7 @@ def test_issue3962_long(en_vocab):
assert sents[1].text == "They never"
@pytest.mark.issue(3972)
def test_issue3972(en_vocab):
"""Test that the PhraseMatcher returns duplicates for duplicate match IDs."""
matcher = PhraseMatcher(en_vocab)

View File

@@ -17,6 +17,7 @@ from thinc.api import compounding
from ..util import make_tempdir
@pytest.mark.issue(4002)
def test_issue4002(en_vocab):
"""Test that the PhraseMatcher can match on overwritten NORM attributes."""
matcher = PhraseMatcher(en_vocab, attr="NORM")
@@ -37,6 +38,7 @@ def test_issue4002(en_vocab):
assert len(matches) == 1
@pytest.mark.issue(4030)
def test_issue4030():
"""Test whether textcat works fine with empty doc"""
unique_classes = ["offensive", "inoffensive"]
@@ -77,6 +79,7 @@ def test_issue4030():
assert doc.cats["inoffensive"] == 0.0
@pytest.mark.issue(4042)
def test_issue4042():
"""Test that serialization of an EntityRuler before NER works fine."""
nlp = English()
@@ -105,6 +108,7 @@ def test_issue4042():
assert doc2.ents[0].label_ == "MY_ORG"
@pytest.mark.issue(4042)
def test_issue4042_bug2():
"""
Test that serialization of an NER works fine when new labels were added.
@@ -139,6 +143,7 @@ def test_issue4042_bug2():
assert len(ner2.labels) == 2
@pytest.mark.issue(4054)
def test_issue4054(en_vocab):
"""Test that a new blank model can be made with a vocab from file,
and that serialization does not drop the language at any point."""
@@ -159,6 +164,7 @@ def test_issue4054(en_vocab):
assert nlp3.lang == "en"
@pytest.mark.issue(4120)
def test_issue4120(en_vocab):
"""Test that matches without a final {OP: ?} token are returned."""
matcher = Matcher(en_vocab)
@@ -177,6 +183,7 @@ def test_issue4120(en_vocab):
assert len(matcher(doc4)) == 3 # fixed
@pytest.mark.issue(4133)
def test_issue4133(en_vocab):
nlp = English()
vocab_bytes = nlp.vocab.to_bytes()
@@ -196,6 +203,7 @@ def test_issue4133(en_vocab):
assert actual == pos
@pytest.mark.issue(4190)
def test_issue4190():
def customize_tokenizer(nlp):
prefix_re = compile_prefix_regex(nlp.Defaults.prefixes)
@@ -236,6 +244,7 @@ def test_issue4190():
assert result_1b == result_2
@pytest.mark.issue(4267)
def test_issue4267():
"""Test that running an entity_ruler after ner gives consistent results"""
nlp = English()
@@ -262,6 +271,7 @@ def test_issue4267():
@pytest.mark.skip(reason="lemmatizer lookups no longer in vocab")
@pytest.mark.issue(4272)
def test_issue4272():
"""Test that lookup table can be accessed from Token.lemma if no POS tags
are available."""
@@ -287,6 +297,7 @@ def test_multiple_predictions():
dummy_pipe(doc)
@pytest.mark.issue(4313)
def test_issue4313():
"""This should not crash or exit with some strange error code"""
beam_width = 16
@@ -313,6 +324,7 @@ def test_issue4313():
assert "MY_ORG" in ner.labels
@pytest.mark.issue(4348)
def test_issue4348():
"""Test that training the tagger with empty data, doesn't throw errors"""
nlp = English()
@@ -328,6 +340,7 @@ def test_issue4348():
nlp.update(batch, sgd=optimizer, losses=losses)
@pytest.mark.issue(4367)
def test_issue4367():
"""Test that docbin init goes well"""
DocBin()
@@ -335,6 +348,7 @@ def test_issue4367():
DocBin(attrs=["LEMMA", "ENT_IOB", "ENT_TYPE"])
@pytest.mark.issue(4373)
def test_issue4373():
"""Test that PhraseMatcher.vocab can be accessed (like Matcher.vocab)."""
matcher = Matcher(Vocab())
@@ -343,6 +357,7 @@ def test_issue4373():
assert isinstance(matcher.vocab, Vocab)
@pytest.mark.issue(4402)
def test_issue4402():
json_data = {
"id": 0,

View File

@@ -14,6 +14,7 @@ from thinc.api import NumpyOps, get_current_ops
from ..util import make_tempdir
+@pytest.mark.issue(4528)
def test_issue4528(en_vocab):
"""Test that user_data is correctly serialized in DocBin."""
doc = Doc(en_vocab, words=["hello", "world"])
@@ -37,6 +38,7 @@ def test_gold_misaligned(en_tokenizer, text, words):
Example.from_dict(doc, {"words": words})
+@pytest.mark.issue(4651)
def test_issue4651_with_phrase_matcher_attr():
"""Test that the EntityRuler PhraseMatcher is deserialized correctly using
the method from_disk when the EntityRuler argument phrase_matcher_attr is
@@ -59,6 +61,7 @@ def test_issue4651_with_phrase_matcher_attr():
assert res == res_reloaded
+@pytest.mark.issue(4651)
def test_issue4651_without_phrase_matcher_attr():
"""Test that the EntityRuler PhraseMatcher is deserialized correctly using
the method from_disk when the EntityRuler argument phrase_matcher_attr is
@@ -81,6 +84,7 @@ def test_issue4651_without_phrase_matcher_attr():
assert res == res_reloaded
+@pytest.mark.issue(4665)
def test_issue4665():
"""
conllu_to_docs should not raise an exception if the HEAD column contains an
@@ -109,6 +113,7 @@ def test_issue4665():
conllu_to_docs(input_data)
+@pytest.mark.issue(4674)
def test_issue4674():
"""Test that setting entities with overlapping identifiers does not mess up IO"""
nlp = English()
@@ -135,6 +140,7 @@ def test_issue4674():
@pytest.mark.skip(reason="API change: disable just disables, new exclude arg")
+@pytest.mark.issue(4707)
def test_issue4707():
"""Tests that disabled component names are also excluded from nlp.from_disk
by default when loading a model.
@@ -151,6 +157,7 @@ def test_issue4707():
assert "entity_ruler" in new_nlp.pipe_names
+@pytest.mark.issue(4725)
def test_issue4725_1():
"""Ensure the pickling of the NER goes well"""
vocab = Vocab(vectors_name="test_vocab_add_vector")
@@ -169,6 +176,7 @@ def test_issue4725_1():
assert ner2.cfg["update_with_oracle_cut_size"] == 111
+@pytest.mark.issue(4725)
def test_issue4725_2():
if isinstance(get_current_ops, NumpyOps):
# ensures that this runs correctly and doesn't hang or crash because of the global vectors
@@ -188,6 +196,7 @@ def test_issue4725_2():
pass
+@pytest.mark.issue(4849)
def test_issue4849():
nlp = English()
patterns = [
@@ -235,6 +244,7 @@ class CustomPipe:
return str(span.end)
+@pytest.mark.issue(4903)
def test_issue4903():
"""Ensure that this runs correctly and doesn't hang or crash on Windows /
macOS."""
@@ -249,6 +259,7 @@ def test_issue4903():
assert docs[2].text == "No, I prefer wasabi."
+@pytest.mark.issue(4924)
def test_issue4924():
nlp = Language()
example = Example.from_dict(nlp.make_doc(""), {})

View File

@@ -12,6 +12,7 @@ import pytest
from ...util import make_tempdir
+@pytest.mark.issue(5048)
def test_issue5048(en_vocab):
words = ["This", "is", "a", "sentence"]
pos_s = ["DET", "VERB", "DET", "NOUN"]
@@ -34,6 +35,7 @@ def test_issue5048(en_vocab):
assert v1 == v2
+@pytest.mark.issue(5082)
def test_issue5082():
# Ensure the 'merge_entities' pipeline does something sensible for the vectors of the merged tokens
nlp = English()
@@ -68,6 +70,7 @@ def test_issue5082():
numpy.testing.assert_array_equal(ops.to_numpy(parsed_vectors_2[2]), array34)
+@pytest.mark.issue(5137)
def test_issue5137():
factory_name = "test_issue5137"
pipe_name = "my_component"
@@ -98,6 +101,7 @@ def test_issue5137():
assert nlp2.get_pipe(pipe_name).categories == "my_categories"
+@pytest.mark.issue(5141)
def test_issue5141(en_vocab):
"""Ensure an empty DocBin does not crash on serialization"""
doc_bin = DocBin(attrs=["DEP", "HEAD"])
@@ -107,6 +111,7 @@ def test_issue5141(en_vocab):
assert list(doc_bin_2.get_docs(en_vocab)) == []
+@pytest.mark.issue(5152)
def test_issue5152():
# Test that the comparison between a Span and a Token, goes well
# There was a bug when the number of tokens in the span equaled the number of characters in the token (!)
@@ -125,6 +130,7 @@ def test_issue5152():
assert span_2.similarity(span_3) < 1.0
+@pytest.mark.issue(5458)
def test_issue5458():
# Test that the noun chuncker does not generate overlapping spans
# fmt: off

View File

@@ -25,6 +25,7 @@ from spacy.training import Example
multi_label_cnn_config,
],
)
+@pytest.mark.issue(5551)
def test_issue5551(textcat_config):
"""Test that after fixing the random seed, the results of the pipeline are truly identical"""
component = "textcat"
@@ -53,6 +54,7 @@ def test_issue5551(textcat_config):
assert_almost_equal(ops.to_numpy(results[0]), ops.to_numpy(results[2]), decimal=5)
+@pytest.mark.issue(5838)
def test_issue5838():
# Displacy's EntityRenderer break line
# not working after last entity
@@ -65,6 +67,7 @@ def test_issue5838():
assert found == 4
+@pytest.mark.issue(5918)
def test_issue5918():
# Test edge case when merging entities.
nlp = English()

View File

@@ -4,6 +4,7 @@ from spacy.schemas import TokenPattern, TokenPatternSchema
import pytest
+@pytest.mark.issue(6207)
def test_issue6207(en_tokenizer):
doc = en_tokenizer("zero one two three four five six")
@@ -18,6 +19,7 @@ def test_issue6207(en_tokenizer):
assert s3 in result
+@pytest.mark.issue(6258)
def test_issue6258():
"""Test that the non-empty constraint pattern field is respected"""
# These one is valid

View File

@@ -13,6 +13,7 @@ import pickle
from ..util import make_tempdir
+@pytest.mark.issue(6730)
def test_issue6730(en_vocab):
"""Ensure that the KB does not accept empty strings, but otherwise IO works fine."""
from spacy.kb import KnowledgeBase
@@ -34,6 +35,7 @@ def test_issue6730(en_vocab):
assert set(kb.get_alias_strings()) == {"x", "y"}
+@pytest.mark.issue(6755)
def test_issue6755(en_tokenizer):
doc = en_tokenizer("This is a magnificent sentence.")
span = doc[:0]
@@ -45,6 +47,7 @@ def test_issue6755(en_tokenizer):
"sentence, start_idx,end_idx,label",
[("Welcome to Mumbai, my friend", 11, 17, "GPE")],
)
+@pytest.mark.issue(6815)
def test_issue6815_1(sentence, start_idx, end_idx, label):
nlp = English()
doc = nlp(sentence)
@@ -55,6 +58,7 @@ def test_issue6815_1(sentence, start_idx, end_idx, label):
@pytest.mark.parametrize(
"sentence, start_idx,end_idx,kb_id", [("Welcome to Mumbai, my friend", 11, 17, 5)]
)
+@pytest.mark.issue(6815)
def test_issue6815_2(sentence, start_idx, end_idx, kb_id):
nlp = English()
doc = nlp(sentence)
@@ -66,6 +70,7 @@ def test_issue6815_2(sentence, start_idx, end_idx, kb_id):
"sentence, start_idx,end_idx,vector",
[("Welcome to Mumbai, my friend", 11, 17, np.array([0.1, 0.2, 0.3]))],
)
+@pytest.mark.issue(6815)
def test_issue6815_3(sentence, start_idx, end_idx, vector):
nlp = English()
doc = nlp(sentence)
@@ -73,6 +78,7 @@ def test_issue6815_3(sentence, start_idx, end_idx, vector):
assert (span.vector == vector).all()
+@pytest.mark.issue(6839)
def test_issue6839(en_vocab):
"""Ensure that PhraseMatcher accepts Span as input"""
# fmt: off
@@ -155,6 +161,7 @@ labels = ['label1', 'label2']
"component_name",
["textcat", "textcat_multilabel"],
)
+@pytest.mark.issue(6908)
def test_issue6908(component_name):
"""Test intializing textcat with labels in a list"""
@@ -219,6 +226,7 @@ upstream = "*"
"""
+@pytest.mark.issue(6950)
def test_issue6950():
"""Test that the nlp object with initialized tok2vec with listeners pickles
correctly (and doesn't have lambdas).

View File

@@ -13,6 +13,7 @@ from wasabi import msg
from ..util import make_tempdir
+@pytest.mark.issue(7019)
def test_issue7019():
scores = {"LABEL_A": 0.39829102, "LABEL_B": 0.938298329382, "LABEL_C": None}
print_textcats_auc_per_cat(msg, scores)
@@ -64,6 +65,7 @@ upstream = "*"
"""
+@pytest.mark.issue(7029)
def test_issue7029():
"""Test that an empty document doesn't mess up an entire batch."""
TRAIN_DATA = [
@@ -84,6 +86,7 @@ def test_issue7029():
assert [doc[0].tag_ for doc in docs1[:-1]] == [doc[0].tag_ for doc in docs2[:-1]]
+@pytest.mark.issue(7055)
def test_issue7055():
"""Test that fill-config doesn't turn sourced components into factories."""
source_cfg = {
@@ -118,6 +121,7 @@ def test_issue7055():
assert "model" in filled_cfg["components"]["ner"]
+@pytest.mark.issue(7056)
def test_issue7056():
"""Test that the Unshift transition works properly, and doesn't cause
sentence segmentation errors."""
@@ -190,6 +194,7 @@ def test_partial_links():
assert "ORG" not in results["nel_f_per_type"]
+@pytest.mark.issue(7065)
def test_issue7065():
text = "Kathleen Battle sang in Mahler 's Symphony No. 8 at the Cincinnati Symphony Orchestra 's May Festival."
nlp = English()
@@ -217,6 +222,7 @@ def test_issue7065():
assert sentences.index(ent.sent) == 0
+@pytest.mark.issue(7065)
def test_issue7065_b():
# Test that the NEL doesn't crash when an entity crosses a sentence boundary
nlp = English()

View File

@@ -43,6 +43,7 @@ def parser(vocab):
return parser
+@pytest.mark.issue(7716)
@pytest.mark.xfail(reason="Not fixed yet")
def test_partial_annotation(parser):
doc = Doc(parser.vocab, words=["a", "b", "c", "d"])

View File

@@ -3,6 +3,7 @@ from spacy.lang.en import English
from ..util import make_tempdir
+@pytest.mark.issue(8190)
def test_issue8190():
"""Test that config overrides are not lost after load is complete."""
source_cfg = {

View File

@@ -22,6 +22,7 @@ def patterns():
]
+@pytest.mark.issue(8216)
def test_entity_ruler_fix8216(nlp, patterns):
"""Test that patterns don't get added excessively."""
ruler = nlp.add_pipe("entity_ruler", config={"validate": True})
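
As context for the @pytest.mark.issue(...) decorator added throughout these files, here is a minimal conftest.py sketch of how a custom marker like this can be registered (so pytest with --strict-markers accepts it) and how a run could be narrowed to one issue number. It is illustrative only: the --issue option, the registration text, and the hook bodies are assumptions for the example, not part of this commit.

# conftest.py -- illustrative sketch; not taken from this commit.

def pytest_addoption(parser):
    # Hypothetical convenience flag for running a single issue's regression tests.
    parser.addoption(
        "--issue", default=None, help="run only tests marked with this issue number"
    )


def pytest_configure(config):
    # Register the marker so `pytest --strict-markers` accepts @pytest.mark.issue(N).
    config.addinivalue_line(
        "markers", "issue(number): reference the GitHub issue a regression test covers"
    )


def pytest_collection_modifyitems(config, items):
    # If --issue was given, keep only the tests whose issue marker matches it.
    wanted = config.getoption("--issue")
    if wanted is None:
        return
    selected = []
    for item in items:
        marker = item.get_closest_marker("issue")
        if marker is not None and str(wanted) in {str(arg) for arg in marker.args}:
            selected.append(item)
    items[:] = selected

With something like this in place, `pytest -m issue` would select every decorated regression test, and the hypothetical `pytest --issue 4651` would narrow the run to that issue's tests.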