diff --git a/spacy/tests/regression/test_issue1501-2000.py b/spacy/tests/regression/test_issue1501-2000.py
index 28ebcb0a9..6edef0702 100644
--- a/spacy/tests/regression/test_issue1501-2000.py
+++ b/spacy/tests/regression/test_issue1501-2000.py
@@ -11,7 +11,7 @@ from spacy.lang.lex_attrs import is_stop
 from spacy.vectors import Vectors
 from spacy.vocab import Vocab
 from spacy.language import Language
-from spacy.tokens import Doc, Span
+from spacy.tokens import Doc, Span, Token
 from spacy.pipeline import Tagger, EntityRecognizer
 from spacy.attrs import HEAD, DEP
 from spacy.matcher import Matcher
@@ -272,3 +272,60 @@ def test_issue1967(label):
     entry = ([0], ["word"], ["tag"], [0], ["dep"], [label])
     gold_parses = [(None, [(entry, None)])]
     ner.moves.get_actions(gold_parses=gold_parses)
+
+
+def test_issue1971(en_vocab):
+    # Possibly related to #2675 and #2671?
+    matcher = Matcher(en_vocab)
+    pattern = [
+        {"ORTH": "Doe"},
+        {"ORTH": "!", "OP": "?"},
+        {"_": {"optional": True}, "OP": "?"},
+        {"ORTH": "!", "OP": "?"},
+    ]
+    Token.set_extension("optional", default=False)
+    matcher.add("TEST", None, pattern)
+    doc = Doc(en_vocab, words=["Hello", "John", "Doe", "!"])
+    # We could also assert length 1 here, but this is more conclusive, because
+    # the real problem here is that it returns a duplicate match for a match_id
+    # that's not actually in the vocab!
+    matches = matcher(doc)
+    assert all([match_id in en_vocab.strings for match_id, start, end in matches])
+
+
+def test_issue_1971_2(en_vocab):
+    matcher = Matcher(en_vocab)
+    pattern1 = [{"ORTH": "EUR", "LOWER": {"IN": ["eur"]}}, {"LIKE_NUM": True}]
+    pattern2 = [{"LIKE_NUM": True}, {"ORTH": "EUR"}]  # {"IN": ["EUR"]}}]
+    doc = Doc(en_vocab, words=["EUR", "10", "is", "10", "EUR"])
+    matcher.add("TEST1", None, pattern1, pattern2)
+    matches = matcher(doc)
+    assert len(matches) == 2
+
+
+def test_issue_1971_3(en_vocab):
+    """Test that pattern matches correctly for multiple extension attributes."""
+    Token.set_extension("a", default=1, force=True)
+    Token.set_extension("b", default=2, force=True)
+    doc = Doc(en_vocab, words=["hello", "world"])
+    matcher = Matcher(en_vocab)
+    matcher.add("A", None, [{"_": {"a": 1}}])
+    matcher.add("B", None, [{"_": {"b": 2}}])
+    matches = sorted((en_vocab.strings[m_id], s, e) for m_id, s, e in matcher(doc))
+    assert len(matches) == 4
+    assert matches == sorted([("A", 0, 1), ("A", 1, 2), ("B", 0, 1), ("B", 1, 2)])
+
+
+def test_issue_1971_4(en_vocab):
+    """Test that pattern matches correctly with multiple extension attribute
+    values on a single token.
+    """
+    Token.set_extension("ext_a", default="str_a", force=True)
+    Token.set_extension("ext_b", default="str_b", force=True)
+    matcher = Matcher(en_vocab)
+    doc = Doc(en_vocab, words=["this", "is", "text"])
+    pattern = [{"_": {"ext_a": "str_a", "ext_b": "str_b"}}] * 3
+    matcher.add("TEST", None, pattern)
+    matches = matcher(doc)
+    # Uncommenting this caused a segmentation fault
+    assert len(matches) == 1
diff --git a/spacy/tests/regression/test_issue1971.py b/spacy/tests/regression/test_issue1971.py
deleted file mode 100644
index 858cb393d..000000000
--- a/spacy/tests/regression/test_issue1971.py
+++ /dev/null
@@ -1,62 +0,0 @@
-# coding: utf8
-from __future__ import unicode_literals
-
-from spacy.matcher import Matcher
-from spacy.tokens import Token, Doc
-
-
-def test_issue1971(en_vocab):
-    # Possibly related to #2675 and #2671?
-    matcher = Matcher(en_vocab)
-    pattern = [
-        {"ORTH": "Doe"},
-        {"ORTH": "!", "OP": "?"},
-        {"_": {"optional": True}, "OP": "?"},
-        {"ORTH": "!", "OP": "?"},
-    ]
-    Token.set_extension("optional", default=False)
-    matcher.add("TEST", None, pattern)
-    doc = Doc(en_vocab, words=["Hello", "John", "Doe", "!"])
-    # We could also assert length 1 here, but this is more conclusive, because
-    # the real problem here is that it returns a duplicate match for a match_id
-    # that's not actually in the vocab!
-    matches = matcher(doc)
-    assert all([match_id in en_vocab.strings for match_id, start, end in matches])
-
-
-def test_issue_1971_2(en_vocab):
-    matcher = Matcher(en_vocab)
-    pattern1 = [{"ORTH": "EUR", "LOWER": {"IN": ["eur"]}}, {"LIKE_NUM": True}]
-    pattern2 = [{"LIKE_NUM": True}, {"ORTH": "EUR"}]  # {"IN": ["EUR"]}}]
-    doc = Doc(en_vocab, words=["EUR", "10", "is", "10", "EUR"])
-    matcher.add("TEST1", None, pattern1, pattern2)
-    matches = matcher(doc)
-    assert len(matches) == 2
-
-
-def test_issue_1971_3(en_vocab):
-    """Test that pattern matches correctly for multiple extension attributes."""
-    Token.set_extension("a", default=1, force=True)
-    Token.set_extension("b", default=2, force=True)
-    doc = Doc(en_vocab, words=["hello", "world"])
-    matcher = Matcher(en_vocab)
-    matcher.add("A", None, [{"_": {"a": 1}}])
-    matcher.add("B", None, [{"_": {"b": 2}}])
-    matches = sorted((en_vocab.strings[m_id], s, e) for m_id, s, e in matcher(doc))
-    assert len(matches) == 4
-    assert matches == sorted([("A", 0, 1), ("A", 1, 2), ("B", 0, 1), ("B", 1, 2)])
-
-
-def test_issue_1971_4(en_vocab):
-    """Test that pattern matches correctly with multiple extension attribute
-    values on a single token.
-    """
-    Token.set_extension("ext_a", default="str_a", force=True)
-    Token.set_extension("ext_b", default="str_b", force=True)
-    matcher = Matcher(en_vocab)
-    doc = Doc(en_vocab, words=["this", "is", "text"])
-    pattern = [{"_": {"ext_a": "str_a", "ext_b": "str_b"}}] * 3
-    matcher.add("TEST", None, pattern)
-    matches = matcher(doc)
-    # Uncommenting this caused a segmentation fault
-    assert len(matches) == 1
diff --git a/spacy/tests/regression/test_issue2501-3000.py b/spacy/tests/regression/test_issue2501-3000.py
index a0df71135..8997c8a56 100644
--- a/spacy/tests/regression/test_issue2501-3000.py
+++ b/spacy/tests/regression/test_issue2501-3000.py
@@ -2,13 +2,15 @@
 from __future__ import unicode_literals
 
 import pytest
 
+from spacy import displacy
 from spacy.lang.en import English
 from spacy.lang.ja import Japanese
 from spacy.lang.xx import MultiLanguage
 from spacy.language import Language
 from spacy.matcher import Matcher
-from spacy.tokens import Span
+from spacy.tokens import Doc, Span
 from spacy.vocab import Vocab
+from spacy.compat import pickle
 from spacy._ml import link_vectors_to_models
 import numpy
@@ -54,6 +56,25 @@ def test_issue2626_2835(en_tokenizer, text):
     assert doc
 
 
+def test_issue2656(en_tokenizer):
+    """Test that tokenizer correctly splits of punctuation after numbers with
+    decimal points.
+    """
+    doc = en_tokenizer("I went for 40.3, and got home by 10.0.")
+    assert len(doc) == 11
+    assert doc[0].text == "I"
+    assert doc[1].text == "went"
+    assert doc[2].text == "for"
+    assert doc[3].text == "40.3"
+    assert doc[4].text == ","
+    assert doc[5].text == "and"
+    assert doc[6].text == "got"
+    assert doc[7].text == "home"
+    assert doc[8].text == "by"
+    assert doc[9].text == "10.0"
+    assert doc[10].text == "."
+
+
 def test_issue2671():
     """Ensure the correct entity ID is returned for matches with quantifiers.
     See also #2675
@@ -77,6 +98,17 @@ def test_issue2671():
     assert nlp.vocab.strings[match_id] == pattern_id
 
 
+def test_issue2728(en_vocab):
+    """Test that displaCy ENT visualizer escapes HTML correctly."""
+    doc = Doc(en_vocab, words=["test", "<RELEASE>", "test"])
+    doc.ents = [Span(doc, 0, 1, label="TEST")]
+    html = displacy.render(doc, style="ent")
+    assert "&lt;RELEASE&gt;" in html
+    doc.ents = [Span(doc, 1, 2, label="TEST")]
+    html = displacy.render(doc, style="ent")
+    assert "&lt;RELEASE&gt;" in html
+
+
 def test_issue2754(en_tokenizer):
     """Test that words like 'a' and 'a.m.' don't get exceptional norm values."""
     a = en_tokenizer("a")
@@ -106,6 +138,28 @@ def test_issue2782(text, lang_cls):
     assert doc[0].like_num
 
 
+def test_issue2822(it_tokenizer):
+    """Test that the abbreviation of poco is kept as one word."""
+    doc = it_tokenizer("Vuoi un po' di zucchero?")
+    assert len(doc) == 6
+    assert doc[0].text == "Vuoi"
+    assert doc[1].text == "un"
+    assert doc[2].text == "po'"
+    assert doc[2].lemma_ == "poco"
+    assert doc[3].text == "di"
+    assert doc[4].text == "zucchero"
+    assert doc[5].text == "?"
+
+
+def test_issue2833(en_vocab):
+    """Test that a custom error is raised if a token or span is pickled."""
+    doc = Doc(en_vocab, words=["Hello", "world"])
+    with pytest.raises(NotImplementedError):
+        pickle.dumps(doc[0])
+    with pytest.raises(NotImplementedError):
+        pickle.dumps(doc[0:2])
+
+
 def test_issue2871():
     """Test that vectors recover the correct key for spaCy reserved words."""
     words = ["dog", "cat", "SUFFIX"]
@@ -134,3 +188,19 @@ def test_issue2901():
 
     doc = nlp("pythonが大好きです")
     assert doc
+
+
+def test_issue2926(fr_tokenizer):
+    """Test that the tokenizer correctly splits tokens separated by a slash (/)
+    ending in a digit.
+    """
+    doc = fr_tokenizer("Learn html5/css3/javascript/jquery")
+    assert len(doc) == 8
+    assert doc[0].text == "Learn"
+    assert doc[1].text == "html5"
+    assert doc[2].text == "/"
+    assert doc[3].text == "css3"
+    assert doc[4].text == "/"
+    assert doc[5].text == "javascript"
+    assert doc[6].text == "/"
+    assert doc[7].text == "jquery"
diff --git a/spacy/tests/regression/test_issue2656.py b/spacy/tests/regression/test_issue2656.py
deleted file mode 100644
index ef51a10ce..000000000
--- a/spacy/tests/regression/test_issue2656.py
+++ /dev/null
@@ -1,24 +0,0 @@
-# coding: utf8
-from __future__ import unicode_literals
-from spacy.lang.en import English
-
-
-def test_issue2656():
-    """ Test that tokenizer correctly splits of punctuation after numbers with decimal points """
-    text = "I went for 40.3, and got home by 10.0."
-    nlp = English()
-    doc = nlp(text)
-
-    assert len(doc) == 11
-
-    assert doc[0].text == "I"
-    assert doc[1].text == "went"
-    assert doc[2].text == "for"
-    assert doc[3].text == "40.3"
-    assert doc[4].text == ","
-    assert doc[5].text == "and"
-    assert doc[6].text == "got"
-    assert doc[7].text == "home"
-    assert doc[8].text == "by"
-    assert doc[9].text == "10.0"
-    assert doc[10].text == "."
diff --git a/spacy/tests/regression/test_issue2728.py b/spacy/tests/regression/test_issue2728.py
deleted file mode 100644
index ac3cbc91c..000000000
--- a/spacy/tests/regression/test_issue2728.py
+++ /dev/null
@@ -1,16 +0,0 @@
-# coding: utf8
-from __future__ import unicode_literals
-
-from spacy import displacy
-from spacy.tokens import Doc, Span
-
-
-def test_issue2728(en_vocab):
-    """Test that displaCy ENT visualizer escapes HTML correctly."""
-    doc = Doc(en_vocab, words=["test", "<RELEASE>", "test"])
-    doc.ents = [Span(doc, 0, 1, label="TEST")]
-    html = displacy.render(doc, style="ent")
-    assert "&lt;RELEASE&gt;" in html
-    doc.ents = [Span(doc, 1, 2, label="TEST")]
-    html = displacy.render(doc, style="ent")
-    assert "&lt;RELEASE&gt;" in html
diff --git a/spacy/tests/regression/test_issue2822.py b/spacy/tests/regression/test_issue2822.py
deleted file mode 100644
index f35f903df..000000000
--- a/spacy/tests/regression/test_issue2822.py
+++ /dev/null
@@ -1,21 +0,0 @@
-# coding: utf8
-from __future__ import unicode_literals
-from spacy.lang.it import Italian
-
-
-def test_issue2822():
-    """ Test that the abbreviation of poco is kept as one word """
-    nlp = Italian()
-    text = "Vuoi un po' di zucchero?"
-
-    doc = nlp(text)
-
-    assert len(doc) == 6
-
-    assert doc[0].text == "Vuoi"
-    assert doc[1].text == "un"
-    assert doc[2].text == "po'"
-    assert doc[2].lemma_ == "poco"
-    assert doc[3].text == "di"
-    assert doc[4].text == "zucchero"
-    assert doc[5].text == "?"
diff --git a/spacy/tests/regression/test_issue2833.py b/spacy/tests/regression/test_issue2833.py
deleted file mode 100644
index de71a6524..000000000
--- a/spacy/tests/regression/test_issue2833.py
+++ /dev/null
@@ -1,15 +0,0 @@
-# coding: utf-8
-from __future__ import unicode_literals
-
-import pytest
-from spacy.tokens import Doc
-from spacy.compat import pickle
-
-
-def test_issue2833(en_vocab):
-    """Test that a custom error is raised if a token or span is pickled."""
-    doc = Doc(en_vocab, words=["Hello", "world"])
-    with pytest.raises(NotImplementedError):
-        pickle.dumps(doc[0])
-    with pytest.raises(NotImplementedError):
-        pickle.dumps(doc[0:2])
diff --git a/spacy/tests/regression/test_issue2926.py b/spacy/tests/regression/test_issue2926.py
deleted file mode 100644
index 294b910de..000000000
--- a/spacy/tests/regression/test_issue2926.py
+++ /dev/null
@@ -1,21 +0,0 @@
-# coding: utf8
-from __future__ import unicode_literals
-from spacy.lang.fr import French
-
-
-def test_issue2926():
-    """ Test that the tokenizer correctly splits tokens separated by a slash (/) ending in a digit """
-    nlp = French()
-    text = "Learn html5/css3/javascript/jquery"
-    doc = nlp(text)
-
-    assert len(doc) == 8
-
-    assert doc[0].text == "Learn"
-    assert doc[1].text == "html5"
-    assert doc[2].text == "/"
-    assert doc[3].text == "css3"
-    assert doc[4].text == "/"
-    assert doc[5].text == "javascript"
-    assert doc[6].text == "/"
-    assert doc[7].text == "jquery"