diff --git a/spacy/tests/regression/test_issue2001-2500.py b/spacy/tests/regression/test_issue2001-2500.py
index 82b3a81a9..4292c8d23 100644
--- a/spacy/tests/regression/test_issue2001-2500.py
+++ b/spacy/tests/regression/test_issue2001-2500.py
@@ -4,6 +4,7 @@ from __future__ import unicode_literals
 import pytest
 import numpy
 from spacy.tokens import Doc
+from spacy.matcher import Matcher
 from spacy.displacy import render
 from spacy.gold import iob_to_biluo
 from spacy.lang.it import Italian
@@ -123,6 +124,15 @@ def test_issue2396(en_vocab):
     assert (span.get_lca_matrix() == matrix).all()
 
 
+def test_issue2464(en_vocab):
+    """Test problem with successive ?. This is the same bug as #3009, so putting it here."""
+    matcher = Matcher(en_vocab)
+    doc = Doc(en_vocab, words=["a", "b"])
+    matcher.add("4", None, [{"OP": "?"}, {"OP": "?"}])
+    matches = matcher(doc)
+    assert len(matches) == 3
+
+
 def test_issue2482():
     """Test we can serialize and deserialize a blank NER or parser model."""
     nlp = Italian()
diff --git a/spacy/tests/regression/test_issue3001-3500.py b/spacy/tests/regression/test_issue3001-3500.py
new file mode 100644
index 000000000..3b0c2f1ed
--- /dev/null
+++ b/spacy/tests/regression/test_issue3001-3500.py
@@ -0,0 +1,334 @@
+# coding: utf8
+from __future__ import unicode_literals
+
+import pytest
+from spacy.lang.en import English
+from spacy.lang.de import German
+from spacy.pipeline import EntityRuler, EntityRecognizer
+from spacy.matcher import Matcher, PhraseMatcher
+from spacy.tokens import Doc
+from spacy.vocab import Vocab
+from spacy.attrs import ENT_IOB, ENT_TYPE
+from spacy.compat import pickle, is_python2, unescape_unicode
+from spacy import displacy
+from spacy.util import decaying
+import numpy
+import re
+
+from ..util import get_doc
+
+
+def test_issue3002():
+    """Test that the tokenizer doesn't hang on a long list of dots"""
+    nlp = German()
+    doc = nlp(
+        "880.794.982.218.444.893.023.439.794.626.120.190.780.624.990.275.671 ist eine lange Zahl"
+    )
+    assert len(doc) == 5
+
+
+def test_issue3009(en_vocab):
+    """Test problem with matcher quantifiers"""
+    patterns = [
+        [{"LEMMA": "have"}, {"LOWER": "to"}, {"LOWER": "do"}, {"POS": "ADP"}],
+        [
+            {"LEMMA": "have"},
+            {"IS_ASCII": True, "IS_PUNCT": False, "OP": "*"},
+            {"LOWER": "to"},
+            {"LOWER": "do"},
+            {"POS": "ADP"},
+        ],
+        [
+            {"LEMMA": "have"},
+            {"IS_ASCII": True, "IS_PUNCT": False, "OP": "?"},
+            {"LOWER": "to"},
+            {"LOWER": "do"},
+            {"POS": "ADP"},
+        ],
+    ]
+    words = ["also", "has", "to", "do", "with"]
+    tags = ["RB", "VBZ", "TO", "VB", "IN"]
+    doc = get_doc(en_vocab, words=words, tags=tags)
+    matcher = Matcher(en_vocab)
+    for i, pattern in enumerate(patterns):
+        matcher.add(str(i), None, pattern)
+    matches = matcher(doc)
+    assert matches
+
+
+def test_issue3012(en_vocab):
+    """Test that the is_tagged attribute doesn't get overwritten when we from_array
+    without tag information."""
+    words = ["This", "is", "10", "%", "."]
+    tags = ["DT", "VBZ", "CD", "NN", "."]
+    pos = ["DET", "VERB", "NUM", "NOUN", "PUNCT"]
+    ents = [(2, 4, "PERCENT")]
+    doc = get_doc(en_vocab, words=words, tags=tags, pos=pos, ents=ents)
+    assert doc.is_tagged
+
+    expected = ("10", "NUM", "CD", "PERCENT")
+    assert (doc[2].text, doc[2].pos_, doc[2].tag_, doc[2].ent_type_) == expected
+
+    header = [ENT_IOB, ENT_TYPE]
+    ent_array = doc.to_array(header)
+    doc.from_array(header, ent_array)
+
+    assert (doc[2].text, doc[2].pos_, doc[2].tag_, doc[2].ent_type_) == expected
+
+    # Serializing then deserializing
+    doc_bytes = doc.to_bytes()
+    doc2 = Doc(en_vocab).from_bytes(doc_bytes)
+    assert (doc2[2].text, doc2[2].pos_, doc2[2].tag_, doc2[2].ent_type_) == expected
+
+
+def test_issue3199():
+    """Test that Span.noun_chunks works correctly if no noun chunks iterator
+    is available. To make this test future-proof, we're constructing a Doc
+    with a new Vocab here and setting is_parsed to make sure the noun chunks run.
+    """
+    doc = Doc(Vocab(), words=["This", "is", "a", "sentence"])
+    doc.is_parsed = True
+    assert list(doc[0:3].noun_chunks) == []
+
+
+def test_issue3209():
+    """Test issue that occurred in spaCy nightly where NER labels were being
+    mapped to classes incorrectly after loading the model, when the labels
+    were added using ner.add_label().
+    """
+    nlp = English()
+    ner = nlp.create_pipe("ner")
+    nlp.add_pipe(ner)
+
+    ner.add_label("ANIMAL")
+    nlp.begin_training()
+    move_names = ["O", "B-ANIMAL", "I-ANIMAL", "L-ANIMAL", "U-ANIMAL"]
+    assert ner.move_names == move_names
+    nlp2 = English()
+    nlp2.add_pipe(nlp2.create_pipe("ner"))
+    nlp2.from_bytes(nlp.to_bytes())
+    assert nlp2.get_pipe("ner").move_names == move_names
+
+
+def test_issue3248_1():
+    """Test that the PhraseMatcher correctly reports its number of rules, not
+    total number of patterns."""
+    nlp = English()
+    matcher = PhraseMatcher(nlp.vocab)
+    matcher.add("TEST1", None, nlp("a"), nlp("b"), nlp("c"))
+    matcher.add("TEST2", None, nlp("d"))
+    assert len(matcher) == 2
+
+
+def test_issue3248_2():
+    """Test that the PhraseMatcher can be pickled correctly."""
+    nlp = English()
+    matcher = PhraseMatcher(nlp.vocab)
+    matcher.add("TEST1", None, nlp("a"), nlp("b"), nlp("c"))
+    matcher.add("TEST2", None, nlp("d"))
+    data = pickle.dumps(matcher)
+    new_matcher = pickle.loads(data)
+    assert len(new_matcher) == len(matcher)
+
+
+def test_issue3277(es_tokenizer):
+    """Test that hyphens are split correctly as prefixes."""
+    doc = es_tokenizer("—Yo me llamo... –murmuró el niño– Emilio Sánchez Pérez.")
+    assert len(doc) == 14
+    assert doc[0].text == "\u2014"
+    assert doc[5].text == "\u2013"
+    assert doc[9].text == "\u2013"
+
+
+def test_issue3288(en_vocab):
+    """Test that retokenization works correctly via displaCy when punctuation
+    is merged onto the preceding token and tensor is resized."""
+    words = ["Hello", "World", "!", "When", "is", "this", "breaking", "?"]
+    heads = [1, 0, -1, 1, 0, 1, -2, -3]
+    deps = ["intj", "ROOT", "punct", "advmod", "ROOT", "det", "nsubj", "punct"]
+    doc = get_doc(en_vocab, words=words, heads=heads, deps=deps)
+    doc.tensor = numpy.zeros((len(words), 96), dtype="float32")
+    displacy.render(doc)
+
+
+def test_issue3289():
+    """Test that Language.to_bytes handles serializing a pipeline component
+    with an uninitialized model."""
+    nlp = English()
+    nlp.add_pipe(nlp.create_pipe("textcat"))
+    bytes_data = nlp.to_bytes()
+    new_nlp = English()
+    new_nlp.add_pipe(nlp.create_pipe("textcat"))
+    new_nlp.from_bytes(bytes_data)
+
+
+def test_issue3328(en_vocab):
+    doc = Doc(en_vocab, words=["Hello", ",", "how", "are", "you", "doing", "?"])
+    matcher = Matcher(en_vocab)
+    patterns = [
+        [{"LOWER": {"IN": ["hello", "how"]}}],
+        [{"LOWER": {"IN": ["you", "doing"]}}],
+    ]
+    matcher.add("TEST", None, *patterns)
+    matches = matcher(doc)
+    assert len(matches) == 4
+    matched_texts = [doc[start:end].text for _, start, end in matches]
+    assert matched_texts == ["Hello", "how", "you", "doing"]
+
+
+@pytest.mark.xfail
+def test_issue3331(en_vocab):
+    """Test that duplicate patterns for different rules result in multiple
+    matches, one per rule.
+ """ + matcher = PhraseMatcher(en_vocab) + matcher.add("A", None, Doc(en_vocab, words=["Barack", "Obama"])) + matcher.add("B", None, Doc(en_vocab, words=["Barack", "Obama"])) + doc = Doc(en_vocab, words=["Barack", "Obama", "lifts", "America"]) + matches = matcher(doc) + assert len(matches) == 2 + match_ids = [en_vocab.strings[matches[0][0]], en_vocab.strings[matches[1][0]]] + assert sorted(match_ids) == ["A", "B"] + + +def test_issue3345(): + """Test case where preset entity crosses sentence boundary.""" + nlp = English() + doc = Doc(nlp.vocab, words=["I", "live", "in", "New", "York"]) + doc[4].is_sent_start = True + ruler = EntityRuler(nlp, patterns=[{"label": "GPE", "pattern": "New York"}]) + ner = EntityRecognizer(doc.vocab) + # Add the OUT action. I wouldn't have thought this would be necessary... + ner.moves.add_action(5, "") + ner.add_label("GPE") + doc = ruler(doc) + # Get into the state just before "New" + state = ner.moves.init_batch([doc])[0] + ner.moves.apply_transition(state, "O") + ner.moves.apply_transition(state, "O") + ner.moves.apply_transition(state, "O") + # Check that B-GPE is valid. + assert ner.moves.is_valid(state, "B-GPE") + + +if is_python2: + # If we have this test in Python 3, pytest chokes, as it can't print the + # string above in the xpass message. + prefix_search = ( + b"^\xc2\xa7|^%|^=|^\xe2\x80\x94|^\xe2\x80\x93|^\\+(?![0-9])" + b"|^\xe2\x80\xa6|^\xe2\x80\xa6\xe2\x80\xa6|^,|^:|^;|^\\!|^\\?" + b"|^\xc2\xbf|^\xd8\x9f|^\xc2\xa1|^\\(|^\\)|^\\[|^\\]|^\\{|^\\}" + b"|^<|^>|^_|^#|^\\*|^&|^\xe3\x80\x82|^\xef\xbc\x9f|^\xef\xbc\x81|" + b"^\xef\xbc\x8c|^\xe3\x80\x81|^\xef\xbc\x9b|^\xef\xbc\x9a|" + b"^\xef\xbd\x9e|^\xc2\xb7|^\xe0\xa5\xa4|^\xd8\x8c|^\xd8\x9b|" + b"^\xd9\xaa|^\\.\\.+|^\xe2\x80\xa6|^\\'|^\"|^\xe2\x80\x9d|" + b"^\xe2\x80\x9c|^`|^\xe2\x80\x98|^\xc2\xb4|^\xe2\x80\x99|" + b"^\xe2\x80\x9a|^,|^\xe2\x80\x9e|^\xc2\xbb|^\xc2\xab|^\xe3\x80\x8c|" + b"^\xe3\x80\x8d|^\xe3\x80\x8e|^\xe3\x80\x8f|^\xef\xbc\x88|" + b"^\xef\xbc\x89|^\xe3\x80\x94|^\xe3\x80\x95|^\xe3\x80\x90|" + b"^\xe3\x80\x91|^\xe3\x80\x8a|^\xe3\x80\x8b|^\xe3\x80\x88|" + b"^\xe3\x80\x89|^\\$|^\xc2\xa3|^\xe2\x82\xac|^\xc2\xa5|^\xe0\xb8\xbf|" + b"^US\\$|^C\\$|^A\\$|^\xe2\x82\xbd|^\xef\xb7\xbc|^\xe2\x82\xb4|" + b"^[\\u00A6\\u00A9\\u00AE\\u00B0\\u0482\\u058D\\u058E\\u060E\\u060F" + b"\\u06DE\\u06E9\\u06FD\\u06FE\\u07F6\\u09FA\\u0B70\\u0BF3-\\u0BF8" + b"\\u0BFA\\u0C7F\\u0D4F\\u0D79\\u0F01-\\u0F03\\u0F13\\u0F15-\\u0F17" + b"\\u0F1A-\\u0F1F\\u0F34\\u0F36\\u0F38\\u0FBE-\\u0FC5\\u0FC7-\\u0FCC" + b"\\u0FCE\\u0FCF\\u0FD5-\\u0FD8\\u109E\\u109F\\u1390-\\u1399\\u1940" + b"\\u19DE-\\u19FF\\u1B61-\\u1B6A\\u1B74-\\u1B7C\\u2100\\u2101\\u2103" + b"-\\u2106\\u2108\\u2109\\u2114\\u2116\\u2117\\u211E-\\u2123\\u2125" + b"\\u2127\\u2129\\u212E\\u213A\\u213B\\u214A\\u214C\\u214D\\u214F" + b"\\u218A\\u218B\\u2195-\\u2199\\u219C-\\u219F\\u21A1\\u21A2\\u21A4" + b"\\u21A5\\u21A7-\\u21AD\\u21AF-\\u21CD\\u21D0\\u21D1\\u21D3\\u21D5" + b"-\\u21F3\\u2300-\\u2307\\u230C-\\u231F\\u2322-\\u2328\\u232B" + b"-\\u237B\\u237D-\\u239A\\u23B4-\\u23DB\\u23E2-\\u2426\\u2440" + b"-\\u244A\\u249C-\\u24E9\\u2500-\\u25B6\\u25B8-\\u25C0\\u25C2" + b"-\\u25F7\\u2600-\\u266E\\u2670-\\u2767\\u2794-\\u27BF\\u2800" + b"-\\u28FF\\u2B00-\\u2B2F\\u2B45\\u2B46\\u2B4D-\\u2B73\\u2B76" + b"-\\u2B95\\u2B98-\\u2BC8\\u2BCA-\\u2BFE\\u2CE5-\\u2CEA\\u2E80" + b"-\\u2E99\\u2E9B-\\u2EF3\\u2F00-\\u2FD5\\u2FF0-\\u2FFB\\u3004" + b"\\u3012\\u3013\\u3020\\u3036\\u3037\\u303E\\u303F\\u3190\\u3191" + b"\\u3196-\\u319F\\u31C0-\\u31E3\\u3200-\\u321E\\u322A-\\u3247\\u3250" + 
+        b"\\u3260-\\u327F\\u328A-\\u32B0\\u32C0-\\u32FE\\u3300-\\u33FF\\u4DC0"
+        b"-\\u4DFF\\uA490-\\uA4C6\\uA828-\\uA82B\\uA836\\uA837\\uA839\\uAA77"
+        b"-\\uAA79\\uFDFD\\uFFE4\\uFFE8\\uFFED\\uFFEE\\uFFFC\\uFFFD\\U00010137"
+        b"-\\U0001013F\\U00010179-\\U00010189\\U0001018C-\\U0001018E"
+        b"\\U00010190-\\U0001019B\\U000101A0\\U000101D0-\\U000101FC\\U00010877"
+        b"\\U00010878\\U00010AC8\\U0001173F\\U00016B3C-\\U00016B3F\\U00016B45"
+        b"\\U0001BC9C\\U0001D000-\\U0001D0F5\\U0001D100-\\U0001D126\\U0001D129"
+        b"-\\U0001D164\\U0001D16A-\\U0001D16C\\U0001D183\\U0001D184\\U0001D18C"
+        b"-\\U0001D1A9\\U0001D1AE-\\U0001D1E8\\U0001D200-\\U0001D241\\U0001D245"
+        b"\\U0001D300-\\U0001D356\\U0001D800-\\U0001D9FF\\U0001DA37-\\U0001DA3A"
+        b"\\U0001DA6D-\\U0001DA74\\U0001DA76-\\U0001DA83\\U0001DA85\\U0001DA86"
+        b"\\U0001ECAC\\U0001F000-\\U0001F02B\\U0001F030-\\U0001F093\\U0001F0A0"
+        b"-\\U0001F0AE\\U0001F0B1-\\U0001F0BF\\U0001F0C1-\\U0001F0CF\\U0001F0D1"
+        b"-\\U0001F0F5\\U0001F110-\\U0001F16B\\U0001F170-\\U0001F1AC\\U0001F1E6"
+        b"-\\U0001F202\\U0001F210-\\U0001F23B\\U0001F240-\\U0001F248\\U0001F250"
+        b"\\U0001F251\\U0001F260-\\U0001F265\\U0001F300-\\U0001F3FA\\U0001F400"
+        b"-\\U0001F6D4\\U0001F6E0-\\U0001F6EC\\U0001F6F0-\\U0001F6F9\\U0001F700"
+        b"-\\U0001F773\\U0001F780-\\U0001F7D8\\U0001F800-\\U0001F80B\\U0001F810"
+        b"-\\U0001F847\\U0001F850-\\U0001F859\\U0001F860-\\U0001F887\\U0001F890"
+        b"-\\U0001F8AD\\U0001F900-\\U0001F90B\\U0001F910-\\U0001F93E\\U0001F940"
+        b"-\\U0001F970\\U0001F973-\\U0001F976\\U0001F97A\\U0001F97C-\\U0001F9A2"
+        b"\\U0001F9B0-\\U0001F9B9\\U0001F9C0-\\U0001F9C2\\U0001F9D0-\\U0001F9FF"
+        b"\\U0001FA60-\\U0001FA6D]"
+    )
+
+    def test_issue3356():
+        pattern = re.compile(unescape_unicode(prefix_search.decode("utf8")))
+        assert not pattern.search("hello")
+
+
+def test_issue3410():
+    texts = ["Hello world", "This is a test"]
+    nlp = English()
+    matcher = Matcher(nlp.vocab)
+    phrasematcher = PhraseMatcher(nlp.vocab)
+    with pytest.deprecated_call():
+        docs = list(nlp.pipe(texts, n_threads=4))
+    with pytest.deprecated_call():
+        docs = list(nlp.tokenizer.pipe(texts, n_threads=4))
+    with pytest.deprecated_call():
+        list(matcher.pipe(docs, n_threads=4))
+    with pytest.deprecated_call():
+        list(phrasematcher.pipe(docs, n_threads=4))
+
+
+def test_issue3447():
+    sizes = decaying(10.0, 1.0, 0.5)
+    size = next(sizes)
+    assert size == 10.0
+    size = next(sizes)
+    assert size == 10.0 - 0.5
+    size = next(sizes)
+    assert size == 10.0 - 0.5 - 0.5
+
+
+@pytest.mark.xfail(reason="default suffix rules avoid one upper-case letter before dot")
+def test_issue3449():
+    nlp = English()
+    nlp.add_pipe(nlp.create_pipe("sentencizer"))
+    text1 = "He gave the ball to I. Do you want to go to the movies with I?"
+    text2 = "He gave the ball to I.  Do you want to go to the movies with I?"
+    text3 = "He gave the ball to I.\nDo you want to go to the movies with I?"
+    t1 = nlp(text1)
+    t2 = nlp(text2)
+    t3 = nlp(text3)
+    assert t1[5].text == "I"
+    assert t2[5].text == "I"
+    assert t3[5].text == "I"
+
+
+def test_issue3468():
+    """Test that sentence boundaries are set correctly so Doc.is_sentenced can
+    be restored after serialization."""
+    nlp = English()
+    nlp.add_pipe(nlp.create_pipe("sentencizer"))
+    doc = nlp("Hello world")
+    assert doc[0].is_sent_start
+    assert doc.is_sentenced
+    assert len(list(doc.sents)) == 1
+    doc_bytes = doc.to_bytes()
+    new_doc = Doc(nlp.vocab).from_bytes(doc_bytes)
+    assert new_doc[0].is_sent_start
+    assert new_doc.is_sentenced
+    assert len(list(new_doc.sents)) == 1
diff --git a/spacy/tests/regression/test_issue3002.py b/spacy/tests/regression/test_issue3002.py
deleted file mode 100644
index 54e661d1f..000000000
--- a/spacy/tests/regression/test_issue3002.py
+++ /dev/null
@@ -1,11 +0,0 @@
-# coding: utf8
-from __future__ import unicode_literals
-
-from spacy.lang.de import German
-
-
-def test_issue3002():
-    """Test that the tokenizer doesn't hang on a long list of dots"""
-    nlp = German()
-    doc = nlp('880.794.982.218.444.893.023.439.794.626.120.190.780.624.990.275.671 ist eine lange Zahl')
-    assert len(doc) == 5
diff --git a/spacy/tests/regression/test_issue3009.py b/spacy/tests/regression/test_issue3009.py
deleted file mode 100644
index 25f208903..000000000
--- a/spacy/tests/regression/test_issue3009.py
+++ /dev/null
@@ -1,67 +0,0 @@
-# coding: utf-8
-from __future__ import unicode_literals
-
-import pytest
-from spacy.matcher import Matcher
-from spacy.tokens import Doc
-
-
-PATTERNS = [
-    ("1", [[{"LEMMA": "have"}, {"LOWER": "to"}, {"LOWER": "do"}, {"POS": "ADP"}]]),
-    (
-        "2",
-        [
-            [
-                {"LEMMA": "have"},
-                {"IS_ASCII": True, "IS_PUNCT": False, "OP": "*"},
-                {"LOWER": "to"},
-                {"LOWER": "do"},
-                {"POS": "ADP"},
-            ]
-        ],
-    ),
-    (
-        "3",
-        [
-            [
-                {"LEMMA": "have"},
-                {"IS_ASCII": True, "IS_PUNCT": False, "OP": "?"},
-                {"LOWER": "to"},
-                {"LOWER": "do"},
-                {"POS": "ADP"},
-            ]
-        ],
-    ),
-]
-
-
-@pytest.fixture
-def doc(en_tokenizer):
-    doc = en_tokenizer("also has to do with")
-    doc[0].tag_ = "RB"
-    doc[1].tag_ = "VBZ"
-    doc[2].tag_ = "TO"
-    doc[3].tag_ = "VB"
-    doc[4].tag_ = "IN"
-    return doc
-
-
-@pytest.fixture
-def matcher(en_tokenizer):
-    return Matcher(en_tokenizer.vocab)
-
-
-@pytest.mark.parametrize("pattern", PATTERNS)
-def test_issue3009(doc, matcher, pattern):
-    """Test problem with matcher quantifiers"""
-    matcher.add(pattern[0], None, *pattern[1])
-    matches = matcher(doc)
-    assert matches
-
-
-def test_issue2464(matcher):
-    """Test problem with successive ?. This is the same bug, so putting it here."""
-    doc = Doc(matcher.vocab, words=["a", "b"])
-    matcher.add("4", None, [{"OP": "?"}, {"OP": "?"}])
-    matches = matcher(doc)
-    assert len(matches) == 3
diff --git a/spacy/tests/regression/test_issue3012.py b/spacy/tests/regression/test_issue3012.py
deleted file mode 100644
index 8fdc8b318..000000000
--- a/spacy/tests/regression/test_issue3012.py
+++ /dev/null
@@ -1,31 +0,0 @@
-# coding: utf8
-from __future__ import unicode_literals
-
-from ...attrs import ENT_IOB, ENT_TYPE
-from ...tokens import Doc
-from ..util import get_doc
-
-
-def test_issue3012(en_vocab):
-    """Test that the is_tagged attribute doesn't get overwritten when we from_array
-    without tag information."""
-    words = ["This", "is", "10", "%", "."]
-    tags = ["DT", "VBZ", "CD", "NN", "."]
-    pos = ["DET", "VERB", "NUM", "NOUN", "PUNCT"]
-    ents = [(2, 4, "PERCENT")]
-    doc = get_doc(en_vocab, words=words, tags=tags, pos=pos, ents=ents)
-    assert doc.is_tagged
-
-    expected = ("10", "NUM", "CD", "PERCENT")
-    assert (doc[2].text, doc[2].pos_, doc[2].tag_, doc[2].ent_type_) == expected
-
-    header = [ENT_IOB, ENT_TYPE]
-    ent_array = doc.to_array(header)
-    doc.from_array(header, ent_array)
-
-    assert (doc[2].text, doc[2].pos_, doc[2].tag_, doc[2].ent_type_) == expected
-
-    # serializing then deserializing
-    doc_bytes = doc.to_bytes()
-    doc2 = Doc(en_vocab).from_bytes(doc_bytes)
-    assert (doc2[2].text, doc2[2].pos_, doc2[2].tag_, doc2[2].ent_type_) == expected
diff --git a/spacy/tests/regression/test_issue3199.py b/spacy/tests/regression/test_issue3199.py
deleted file mode 100644
index d80a55330..000000000
--- a/spacy/tests/regression/test_issue3199.py
+++ /dev/null
@@ -1,15 +0,0 @@
-# coding: utf8
-from __future__ import unicode_literals
-
-from spacy.tokens import Doc
-from spacy.vocab import Vocab
-
-
-def test_issue3199():
-    """Test that Span.noun_chunks works correctly if no noun chunks iterator
-    is available. To make this test future-proof, we're constructing a Doc
-    with a new Vocab here and setting is_parsed to make sure the noun chunks run.
-    """
-    doc = Doc(Vocab(), words=["This", "is", "a", "sentence"])
-    doc.is_parsed = True
-    assert list(doc[0:3].noun_chunks) == []
diff --git a/spacy/tests/regression/test_issue3209.py b/spacy/tests/regression/test_issue3209.py
deleted file mode 100644
index 469e38b8c..000000000
--- a/spacy/tests/regression/test_issue3209.py
+++ /dev/null
@@ -1,23 +0,0 @@
-# coding: utf8
-from __future__ import unicode_literals
-
-from spacy.lang.en import English
-
-
-def test_issue3209():
-    """Test issue that occurred in spaCy nightly where NER labels were being
-    mapped to classes incorrectly after loading the model, when the labels
-    were added using ner.add_label().
- """ - nlp = English() - ner = nlp.create_pipe("ner") - nlp.add_pipe(ner) - - ner.add_label("ANIMAL") - nlp.begin_training() - move_names = ["O", "B-ANIMAL", "I-ANIMAL", "L-ANIMAL", "U-ANIMAL"] - assert ner.move_names == move_names - nlp2 = English() - nlp2.add_pipe(nlp2.create_pipe("ner")) - nlp2.from_bytes(nlp.to_bytes()) - assert nlp2.get_pipe("ner").move_names == move_names diff --git a/spacy/tests/regression/test_issue3248.py b/spacy/tests/regression/test_issue3248.py deleted file mode 100644 index c4b592f3c..000000000 --- a/spacy/tests/regression/test_issue3248.py +++ /dev/null @@ -1,27 +0,0 @@ -# coding: utf-8 -from __future__ import unicode_literals - -from spacy.matcher import PhraseMatcher -from spacy.lang.en import English -from spacy.compat import pickle - - -def test_issue3248_1(): - """Test that the PhraseMatcher correctly reports its number of rules, not - total number of patterns.""" - nlp = English() - matcher = PhraseMatcher(nlp.vocab) - matcher.add("TEST1", None, nlp("a"), nlp("b"), nlp("c")) - matcher.add("TEST2", None, nlp("d")) - assert len(matcher) == 2 - - -def test_issue3248_2(): - """Test that the PhraseMatcher can be pickled correctly.""" - nlp = English() - matcher = PhraseMatcher(nlp.vocab) - matcher.add("TEST1", None, nlp("a"), nlp("b"), nlp("c")) - matcher.add("TEST2", None, nlp("d")) - data = pickle.dumps(matcher) - new_matcher = pickle.loads(data) - assert len(new_matcher) == len(matcher) diff --git a/spacy/tests/regression/test_issue3277.py b/spacy/tests/regression/test_issue3277.py deleted file mode 100644 index 88ea67774..000000000 --- a/spacy/tests/regression/test_issue3277.py +++ /dev/null @@ -1,11 +0,0 @@ -# coding: utf-8 -from __future__ import unicode_literals - - -def test_issue3277(es_tokenizer): - """Test that hyphens are split correctly as prefixes.""" - doc = es_tokenizer("—Yo me llamo... 
–murmuró el niño– Emilio Sánchez Pérez.") - assert len(doc) == 14 - assert doc[0].text == "\u2014" - assert doc[5].text == "\u2013" - assert doc[9].text == "\u2013" diff --git a/spacy/tests/regression/test_issue3288.py b/spacy/tests/regression/test_issue3288.py deleted file mode 100644 index 188bf361c..000000000 --- a/spacy/tests/regression/test_issue3288.py +++ /dev/null @@ -1,18 +0,0 @@ -# coding: utf-8 -from __future__ import unicode_literals - -import numpy -from spacy import displacy - -from ..util import get_doc - - -def test_issue3288(en_vocab): - """Test that retokenization works correctly via displaCy when punctuation - is merged onto the preceeding token and tensor is resized.""" - words = ["Hello", "World", "!", "When", "is", "this", "breaking", "?"] - heads = [1, 0, -1, 1, 0, 1, -2, -3] - deps = ["intj", "ROOT", "punct", "advmod", "ROOT", "det", "nsubj", "punct"] - doc = get_doc(en_vocab, words=words, heads=heads, deps=deps) - doc.tensor = numpy.zeros((len(words), 96), dtype="float32") - displacy.render(doc) diff --git a/spacy/tests/regression/test_issue3289.py b/spacy/tests/regression/test_issue3289.py deleted file mode 100644 index 0e64f07ce..000000000 --- a/spacy/tests/regression/test_issue3289.py +++ /dev/null @@ -1,15 +0,0 @@ -# coding: utf-8 -from __future__ import unicode_literals - -from spacy.lang.en import English - - -def test_issue3289(): - """Test that Language.to_bytes handles serializing a pipeline component - with an uninitialized model.""" - nlp = English() - nlp.add_pipe(nlp.create_pipe("textcat")) - bytes_data = nlp.to_bytes() - new_nlp = English() - new_nlp.add_pipe(nlp.create_pipe("textcat")) - new_nlp.from_bytes(bytes_data) diff --git a/spacy/tests/regression/test_issue3328.py b/spacy/tests/regression/test_issue3328.py deleted file mode 100644 index c397feebb..000000000 --- a/spacy/tests/regression/test_issue3328.py +++ /dev/null @@ -1,19 +0,0 @@ -# coding: utf-8 -from __future__ import unicode_literals - -from spacy.matcher import Matcher -from spacy.tokens import Doc - - -def test_issue3328(en_vocab): - doc = Doc(en_vocab, words=["Hello", ",", "how", "are", "you", "doing", "?"]) - matcher = Matcher(en_vocab) - patterns = [ - [{"LOWER": {"IN": ["hello", "how"]}}], - [{"LOWER": {"IN": ["you", "doing"]}}], - ] - matcher.add("TEST", None, *patterns) - matches = matcher(doc) - assert len(matches) == 4 - matched_texts = [doc[start:end].text for _, start, end in matches] - assert matched_texts == ["Hello", "how", "you", "doing"] diff --git a/spacy/tests/regression/test_issue3331.py b/spacy/tests/regression/test_issue3331.py deleted file mode 100644 index c30712f81..000000000 --- a/spacy/tests/regression/test_issue3331.py +++ /dev/null @@ -1,21 +0,0 @@ -# coding: utf-8 -from __future__ import unicode_literals - -import pytest -from spacy.matcher import PhraseMatcher -from spacy.tokens import Doc - - -@pytest.mark.xfail -def test_issue3331(en_vocab): - """Test that duplicate patterns for different rules result in multiple - matches, one per rule. 
- """ - matcher = PhraseMatcher(en_vocab) - matcher.add("A", None, Doc(en_vocab, words=["Barack", "Obama"])) - matcher.add("B", None, Doc(en_vocab, words=["Barack", "Obama"])) - doc = Doc(en_vocab, words=["Barack", "Obama", "lifts", "America"]) - matches = matcher(doc) - assert len(matches) == 2 - match_ids = [en_vocab.strings[matches[0][0]], en_vocab.strings[matches[1][0]]] - assert sorted(match_ids) == ["A", "B"] diff --git a/spacy/tests/regression/test_issue3345.py b/spacy/tests/regression/test_issue3345.py deleted file mode 100644 index c358fd7bc..000000000 --- a/spacy/tests/regression/test_issue3345.py +++ /dev/null @@ -1,26 +0,0 @@ -# coding: utf8 -from __future__ import unicode_literals - -from spacy.lang.en import English -from spacy.tokens import Doc -from spacy.pipeline import EntityRuler, EntityRecognizer - - -def test_issue3345(): - """Test case where preset entity crosses sentence boundary.""" - nlp = English() - doc = Doc(nlp.vocab, words=["I", "live", "in", "New", "York"]) - doc[4].is_sent_start = True - ruler = EntityRuler(nlp, patterns=[{"label": "GPE", "pattern": "New York"}]) - ner = EntityRecognizer(doc.vocab) - # Add the OUT action. I wouldn't have thought this would be necessary... - ner.moves.add_action(5, "") - ner.add_label("GPE") - doc = ruler(doc) - # Get into the state just before "New" - state = ner.moves.init_batch([doc])[0] - ner.moves.apply_transition(state, "O") - ner.moves.apply_transition(state, "O") - ner.moves.apply_transition(state, "O") - # Check that B-GPE is valid. - assert ner.moves.is_valid(state, "B-GPE") diff --git a/spacy/tests/regression/test_issue3356.py b/spacy/tests/regression/test_issue3356.py deleted file mode 100644 index f8d16459c..000000000 --- a/spacy/tests/regression/test_issue3356.py +++ /dev/null @@ -1,72 +0,0 @@ -# coding: utf8 -from __future__ import unicode_literals - -import re -from spacy import compat - -prefix_search = ( - b"^\xc2\xa7|^%|^=|^\xe2\x80\x94|^\xe2\x80\x93|^\\+(?![0-9])" - b"|^\xe2\x80\xa6|^\xe2\x80\xa6\xe2\x80\xa6|^,|^:|^;|^\\!|^\\?" 
-    b"|^\xc2\xbf|^\xd8\x9f|^\xc2\xa1|^\\(|^\\)|^\\[|^\\]|^\\{|^\\}"
-    b"|^<|^>|^_|^#|^\\*|^&|^\xe3\x80\x82|^\xef\xbc\x9f|^\xef\xbc\x81|"
-    b"^\xef\xbc\x8c|^\xe3\x80\x81|^\xef\xbc\x9b|^\xef\xbc\x9a|"
-    b"^\xef\xbd\x9e|^\xc2\xb7|^\xe0\xa5\xa4|^\xd8\x8c|^\xd8\x9b|"
-    b"^\xd9\xaa|^\\.\\.+|^\xe2\x80\xa6|^\\'|^\"|^\xe2\x80\x9d|"
-    b"^\xe2\x80\x9c|^`|^\xe2\x80\x98|^\xc2\xb4|^\xe2\x80\x99|"
-    b"^\xe2\x80\x9a|^,|^\xe2\x80\x9e|^\xc2\xbb|^\xc2\xab|^\xe3\x80\x8c|"
-    b"^\xe3\x80\x8d|^\xe3\x80\x8e|^\xe3\x80\x8f|^\xef\xbc\x88|"
-    b"^\xef\xbc\x89|^\xe3\x80\x94|^\xe3\x80\x95|^\xe3\x80\x90|"
-    b"^\xe3\x80\x91|^\xe3\x80\x8a|^\xe3\x80\x8b|^\xe3\x80\x88|"
-    b"^\xe3\x80\x89|^\\$|^\xc2\xa3|^\xe2\x82\xac|^\xc2\xa5|^\xe0\xb8\xbf|"
-    b"^US\\$|^C\\$|^A\\$|^\xe2\x82\xbd|^\xef\xb7\xbc|^\xe2\x82\xb4|"
-    b"^[\\u00A6\\u00A9\\u00AE\\u00B0\\u0482\\u058D\\u058E\\u060E\\u060F"
-    b"\\u06DE\\u06E9\\u06FD\\u06FE\\u07F6\\u09FA\\u0B70\\u0BF3-\\u0BF8"
-    b"\\u0BFA\\u0C7F\\u0D4F\\u0D79\\u0F01-\\u0F03\\u0F13\\u0F15-\\u0F17"
-    b"\\u0F1A-\\u0F1F\\u0F34\\u0F36\\u0F38\\u0FBE-\\u0FC5\\u0FC7-\\u0FCC"
-    b"\\u0FCE\\u0FCF\\u0FD5-\\u0FD8\\u109E\\u109F\\u1390-\\u1399\\u1940"
-    b"\\u19DE-\\u19FF\\u1B61-\\u1B6A\\u1B74-\\u1B7C\\u2100\\u2101\\u2103"
-    b"-\\u2106\\u2108\\u2109\\u2114\\u2116\\u2117\\u211E-\\u2123\\u2125"
-    b"\\u2127\\u2129\\u212E\\u213A\\u213B\\u214A\\u214C\\u214D\\u214F"
-    b"\\u218A\\u218B\\u2195-\\u2199\\u219C-\\u219F\\u21A1\\u21A2\\u21A4"
-    b"\\u21A5\\u21A7-\\u21AD\\u21AF-\\u21CD\\u21D0\\u21D1\\u21D3\\u21D5"
-    b"-\\u21F3\\u2300-\\u2307\\u230C-\\u231F\\u2322-\\u2328\\u232B"
-    b"-\\u237B\\u237D-\\u239A\\u23B4-\\u23DB\\u23E2-\\u2426\\u2440"
-    b"-\\u244A\\u249C-\\u24E9\\u2500-\\u25B6\\u25B8-\\u25C0\\u25C2"
-    b"-\\u25F7\\u2600-\\u266E\\u2670-\\u2767\\u2794-\\u27BF\\u2800"
-    b"-\\u28FF\\u2B00-\\u2B2F\\u2B45\\u2B46\\u2B4D-\\u2B73\\u2B76"
-    b"-\\u2B95\\u2B98-\\u2BC8\\u2BCA-\\u2BFE\\u2CE5-\\u2CEA\\u2E80"
-    b"-\\u2E99\\u2E9B-\\u2EF3\\u2F00-\\u2FD5\\u2FF0-\\u2FFB\\u3004"
-    b"\\u3012\\u3013\\u3020\\u3036\\u3037\\u303E\\u303F\\u3190\\u3191"
-    b"\\u3196-\\u319F\\u31C0-\\u31E3\\u3200-\\u321E\\u322A-\\u3247\\u3250"
-    b"\\u3260-\\u327F\\u328A-\\u32B0\\u32C0-\\u32FE\\u3300-\\u33FF\\u4DC0"
-    b"-\\u4DFF\\uA490-\\uA4C6\\uA828-\\uA82B\\uA836\\uA837\\uA839\\uAA77"
-    b"-\\uAA79\\uFDFD\\uFFE4\\uFFE8\\uFFED\\uFFEE\\uFFFC\\uFFFD\\U00010137"
-    b"-\\U0001013F\\U00010179-\\U00010189\\U0001018C-\\U0001018E"
-    b"\\U00010190-\\U0001019B\\U000101A0\\U000101D0-\\U000101FC\\U00010877"
-    b"\\U00010878\\U00010AC8\\U0001173F\\U00016B3C-\\U00016B3F\\U00016B45"
-    b"\\U0001BC9C\\U0001D000-\\U0001D0F5\\U0001D100-\\U0001D126\\U0001D129"
-    b"-\\U0001D164\\U0001D16A-\\U0001D16C\\U0001D183\\U0001D184\\U0001D18C"
-    b"-\\U0001D1A9\\U0001D1AE-\\U0001D1E8\\U0001D200-\\U0001D241\\U0001D245"
-    b"\\U0001D300-\\U0001D356\\U0001D800-\\U0001D9FF\\U0001DA37-\\U0001DA3A"
-    b"\\U0001DA6D-\\U0001DA74\\U0001DA76-\\U0001DA83\\U0001DA85\\U0001DA86"
-    b"\\U0001ECAC\\U0001F000-\\U0001F02B\\U0001F030-\\U0001F093\\U0001F0A0"
-    b"-\\U0001F0AE\\U0001F0B1-\\U0001F0BF\\U0001F0C1-\\U0001F0CF\\U0001F0D1"
-    b"-\\U0001F0F5\\U0001F110-\\U0001F16B\\U0001F170-\\U0001F1AC\\U0001F1E6"
-    b"-\\U0001F202\\U0001F210-\\U0001F23B\\U0001F240-\\U0001F248\\U0001F250"
-    b"\\U0001F251\\U0001F260-\\U0001F265\\U0001F300-\\U0001F3FA\\U0001F400"
-    b"-\\U0001F6D4\\U0001F6E0-\\U0001F6EC\\U0001F6F0-\\U0001F6F9\\U0001F700"
-    b"-\\U0001F773\\U0001F780-\\U0001F7D8\\U0001F800-\\U0001F80B\\U0001F810"
-    b"-\\U0001F847\\U0001F850-\\U0001F859\\U0001F860-\\U0001F887\\U0001F890"
-    b"-\\U0001F8AD\\U0001F900-\\U0001F90B\\U0001F910-\\U0001F93E\\U0001F940"
-    b"-\\U0001F970\\U0001F973-\\U0001F976\\U0001F97A\\U0001F97C-\\U0001F9A2"
-    b"\\U0001F9B0-\\U0001F9B9\\U0001F9C0-\\U0001F9C2\\U0001F9D0-\\U0001F9FF"
-    b"\\U0001FA60-\\U0001FA6D]"
-)
-
-
-if compat.is_python2:
-    # If we have this test in Python 3, pytest chokes, as it can't print the
-    # string above in the xpass message.
-    def test_issue3356():
-        pattern = re.compile(compat.unescape_unicode(prefix_search.decode("utf8")))
-        assert not pattern.search("hello")
diff --git a/spacy/tests/regression/test_issue3410.py b/spacy/tests/regression/test_issue3410.py
deleted file mode 100644
index 5d2ac5ba3..000000000
--- a/spacy/tests/regression/test_issue3410.py
+++ /dev/null
@@ -1,21 +0,0 @@
-# coding: utf8
-from __future__ import unicode_literals
-
-import pytest
-from spacy.lang.en import English
-from spacy.matcher import Matcher, PhraseMatcher
-
-
-def test_issue3410():
-    texts = ["Hello world", "This is a test"]
-    nlp = English()
-    matcher = Matcher(nlp.vocab)
-    phrasematcher = PhraseMatcher(nlp.vocab)
-    with pytest.deprecated_call():
-        docs = list(nlp.pipe(texts, n_threads=4))
-    with pytest.deprecated_call():
-        docs = list(nlp.tokenizer.pipe(texts, n_threads=4))
-    with pytest.deprecated_call():
-        list(matcher.pipe(docs, n_threads=4))
-    with pytest.deprecated_call():
-        list(phrasematcher.pipe(docs, n_threads=4))
diff --git a/spacy/tests/regression/test_issue3447.py b/spacy/tests/regression/test_issue3447.py
deleted file mode 100644
index 0ca1f9e67..000000000
--- a/spacy/tests/regression/test_issue3447.py
+++ /dev/null
@@ -1,14 +0,0 @@
-# coding: utf8
-from __future__ import unicode_literals
-
-from spacy.util import decaying
-
-
-def test_issue3447():
-    sizes = decaying(10.0, 1.0, 0.5)
-    size = next(sizes)
-    assert size == 10.0
-    size = next(sizes)
-    assert size == 10.0 - 0.5
-    size = next(sizes)
-    assert size == 10.0 - 0.5 - 0.5
diff --git a/spacy/tests/regression/test_issue3449.py b/spacy/tests/regression/test_issue3449.py
deleted file mode 100644
index deff49fd6..000000000
--- a/spacy/tests/regression/test_issue3449.py
+++ /dev/null
@@ -1,21 +0,0 @@
-# coding: utf8
-from __future__ import unicode_literals
-
-import pytest
-
-from spacy.lang.en import English
-
-
-@pytest.mark.xfail(reason="default suffix rules avoid one upper-case letter before dot")
-def test_issue3449():
-    nlp = English()
-    nlp.add_pipe(nlp.create_pipe("sentencizer"))
-    text1 = "He gave the ball to I. Do you want to go to the movies with I?"
-    text2 = "He gave the ball to I.  Do you want to go to the movies with I?"
-    text3 = "He gave the ball to I.\nDo you want to go to the movies with I?"
-    t1 = nlp(text1)
-    t2 = nlp(text2)
-    t3 = nlp(text3)
-    assert t1[5].text == "I"
-    assert t2[5].text == "I"
-    assert t3[5].text == "I"
diff --git a/spacy/tests/regression/test_issue3468.py b/spacy/tests/regression/test_issue3468.py
deleted file mode 100644
index ebbed2640..000000000
--- a/spacy/tests/regression/test_issue3468.py
+++ /dev/null
@@ -1,21 +0,0 @@
-# coding: utf8
-from __future__ import unicode_literals
-
-from spacy.lang.en import English
-from spacy.tokens import Doc
-
-
-def test_issue3468():
-    """Test that sentence boundaries are set correctly so Doc.is_sentenced can
-    be restored after serialization."""
-    nlp = English()
-    nlp.add_pipe(nlp.create_pipe("sentencizer"))
-    doc = nlp("Hello world")
-    assert doc[0].is_sent_start
-    assert doc.is_sentenced
-    assert len(list(doc.sents)) == 1
-    doc_bytes = doc.to_bytes()
-    new_doc = Doc(nlp.vocab).from_bytes(doc_bytes)
-    assert new_doc[0].is_sent_start
-    assert new_doc.is_sentenced
-    assert len(list(new_doc.sents)) == 1