Merge regression tests

Ines Montani 2019-02-24 20:31:38 +01:00
parent 3bc53905cc
commit 328b589deb
8 changed files with 129 additions and 161 deletions

View File

@@ -11,7 +11,7 @@ from spacy.lang.lex_attrs import is_stop
from spacy.vectors import Vectors
from spacy.vocab import Vocab
from spacy.language import Language
-from spacy.tokens import Doc, Span
+from spacy.tokens import Doc, Span, Token
from spacy.pipeline import Tagger, EntityRecognizer
from spacy.attrs import HEAD, DEP
from spacy.matcher import Matcher
@@ -272,3 +272,60 @@ def test_issue1967(label):
    entry = ([0], ["word"], ["tag"], [0], ["dep"], [label])
    gold_parses = [(None, [(entry, None)])]
    ner.moves.get_actions(gold_parses=gold_parses)


def test_issue1971(en_vocab):
    # Possibly related to #2675 and #2671?
    matcher = Matcher(en_vocab)
    pattern = [
        {"ORTH": "Doe"},
        {"ORTH": "!", "OP": "?"},
        {"_": {"optional": True}, "OP": "?"},
        {"ORTH": "!", "OP": "?"},
    ]
    Token.set_extension("optional", default=False)
    matcher.add("TEST", None, pattern)
    doc = Doc(en_vocab, words=["Hello", "John", "Doe", "!"])
    # We could also assert length 1 here, but this is more conclusive, because
    # the real problem here is that it returns a duplicate match for a match_id
    # that's not actually in the vocab!
    matches = matcher(doc)
    assert all([match_id in en_vocab.strings for match_id, start, end in matches])


def test_issue_1971_2(en_vocab):
    matcher = Matcher(en_vocab)
    pattern1 = [{"ORTH": "EUR", "LOWER": {"IN": ["eur"]}}, {"LIKE_NUM": True}]
    pattern2 = [{"LIKE_NUM": True}, {"ORTH": "EUR"}]  # {"IN": ["EUR"]}}]
    doc = Doc(en_vocab, words=["EUR", "10", "is", "10", "EUR"])
    matcher.add("TEST1", None, pattern1, pattern2)
    matches = matcher(doc)
    assert len(matches) == 2


def test_issue_1971_3(en_vocab):
    """Test that pattern matches correctly for multiple extension attributes."""
    Token.set_extension("a", default=1, force=True)
    Token.set_extension("b", default=2, force=True)
    doc = Doc(en_vocab, words=["hello", "world"])
    matcher = Matcher(en_vocab)
    matcher.add("A", None, [{"_": {"a": 1}}])
    matcher.add("B", None, [{"_": {"b": 2}}])
    matches = sorted((en_vocab.strings[m_id], s, e) for m_id, s, e in matcher(doc))
    assert len(matches) == 4
    assert matches == sorted([("A", 0, 1), ("A", 1, 2), ("B", 0, 1), ("B", 1, 2)])


def test_issue_1971_4(en_vocab):
    """Test that pattern matches correctly with multiple extension attribute
    values on a single token.
    """
    Token.set_extension("ext_a", default="str_a", force=True)
    Token.set_extension("ext_b", default="str_b", force=True)
    matcher = Matcher(en_vocab)
    doc = Doc(en_vocab, words=["this", "is", "text"])
    pattern = [{"_": {"ext_a": "str_a", "ext_b": "str_b"}}] * 3
    matcher.add("TEST", None, pattern)
    matches = matcher(doc)
    # Before the underlying bug was fixed, uncommenting this assert caused
    # a segmentation fault
    assert len(matches) == 1

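For reference, the tests above exercise the Matcher's "_" pattern key, which matches against custom extension attributes registered via Token.set_extension. Below is a minimal, self-contained sketch of that technique, assuming spaCy 2.1 or later; the "is_city" attribute is illustrative and not part of this commit.

# Minimal sketch, assuming spaCy 2.1+. The "is_city" extension attribute
# is illustrative, not taken from the commit above.
import spacy
from spacy.matcher import Matcher
from spacy.tokens import Token

nlp = spacy.blank("en")
Token.set_extension("is_city", default=False, force=True)

doc = nlp("I live in Berlin")
doc[3]._.is_city = True  # set the flag by hand for the example

matcher = Matcher(nlp.vocab)
# Extension attributes go under the "_" key of a token pattern
matcher.add("CITY", None, [{"_": {"is_city": True}}])
for match_id, start, end in matcher(doc):
    print(nlp.vocab.strings[match_id], doc[start:end].text)  # CITY Berlin

As in the tests, the extension has to be registered before matching; the "_" value is then treated like any other attribute dict in the pattern.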
View File

@@ -1,62 +0,0 @@
# coding: utf8
from __future__ import unicode_literals

from spacy.matcher import Matcher
from spacy.tokens import Token, Doc


def test_issue1971(en_vocab):
    # Possibly related to #2675 and #2671?
    matcher = Matcher(en_vocab)
    pattern = [
        {"ORTH": "Doe"},
        {"ORTH": "!", "OP": "?"},
        {"_": {"optional": True}, "OP": "?"},
        {"ORTH": "!", "OP": "?"},
    ]
    Token.set_extension("optional", default=False)
    matcher.add("TEST", None, pattern)
    doc = Doc(en_vocab, words=["Hello", "John", "Doe", "!"])
    # We could also assert length 1 here, but this is more conclusive, because
    # the real problem here is that it returns a duplicate match for a match_id
    # that's not actually in the vocab!
    matches = matcher(doc)
    assert all([match_id in en_vocab.strings for match_id, start, end in matches])


def test_issue_1971_2(en_vocab):
    matcher = Matcher(en_vocab)
    pattern1 = [{"ORTH": "EUR", "LOWER": {"IN": ["eur"]}}, {"LIKE_NUM": True}]
    pattern2 = [{"LIKE_NUM": True}, {"ORTH": "EUR"}]  # {"IN": ["EUR"]}}]
    doc = Doc(en_vocab, words=["EUR", "10", "is", "10", "EUR"])
    matcher.add("TEST1", None, pattern1, pattern2)
    matches = matcher(doc)
    assert len(matches) == 2


def test_issue_1971_3(en_vocab):
    """Test that pattern matches correctly for multiple extension attributes."""
    Token.set_extension("a", default=1, force=True)
    Token.set_extension("b", default=2, force=True)
    doc = Doc(en_vocab, words=["hello", "world"])
    matcher = Matcher(en_vocab)
    matcher.add("A", None, [{"_": {"a": 1}}])
    matcher.add("B", None, [{"_": {"b": 2}}])
    matches = sorted((en_vocab.strings[m_id], s, e) for m_id, s, e in matcher(doc))
    assert len(matches) == 4
    assert matches == sorted([("A", 0, 1), ("A", 1, 2), ("B", 0, 1), ("B", 1, 2)])


def test_issue_1971_4(en_vocab):
    """Test that pattern matches correctly with multiple extension attribute
    values on a single token.
    """
    Token.set_extension("ext_a", default="str_a", force=True)
    Token.set_extension("ext_b", default="str_b", force=True)
    matcher = Matcher(en_vocab)
    doc = Doc(en_vocab, words=["this", "is", "text"])
    pattern = [{"_": {"ext_a": "str_a", "ext_b": "str_b"}}] * 3
    matcher.add("TEST", None, pattern)
    matches = matcher(doc)
    # Before the underlying bug was fixed, uncommenting this assert caused
    # a segmentation fault
    assert len(matches) == 1

View File

@@ -2,13 +2,15 @@
from __future__ import unicode_literals
import pytest
from spacy import displacy
from spacy.lang.en import English
from spacy.lang.ja import Japanese
from spacy.lang.xx import MultiLanguage
from spacy.language import Language
from spacy.matcher import Matcher
-from spacy.tokens import Span
+from spacy.tokens import Doc, Span
from spacy.vocab import Vocab
from spacy.compat import pickle
from spacy._ml import link_vectors_to_models
import numpy
@@ -54,6 +56,25 @@ def test_issue2626_2835(en_tokenizer, text):
    assert doc


def test_issue2656(en_tokenizer):
    """Test that the tokenizer correctly splits off punctuation after numbers
    with decimal points.
    """
    doc = en_tokenizer("I went for 40.3, and got home by 10.0.")
    assert len(doc) == 11
    assert doc[0].text == "I"
    assert doc[1].text == "went"
    assert doc[2].text == "for"
    assert doc[3].text == "40.3"
    assert doc[4].text == ","
    assert doc[5].text == "and"
    assert doc[6].text == "got"
    assert doc[7].text == "home"
    assert doc[8].text == "by"
    assert doc[9].text == "10.0"
    assert doc[10].text == "."


def test_issue2671():
    """Ensure the correct entity ID is returned for matches with quantifiers.
    See also #2675
@@ -77,6 +98,17 @@ def test_issue2671():
    assert nlp.vocab.strings[match_id] == pattern_id


def test_issue2728(en_vocab):
    """Test that displaCy ENT visualizer escapes HTML correctly."""
    doc = Doc(en_vocab, words=["test", "<RELEASE>", "test"])
    doc.ents = [Span(doc, 0, 1, label="TEST")]
    html = displacy.render(doc, style="ent")
    assert "&lt;RELEASE&gt;" in html
    doc.ents = [Span(doc, 1, 2, label="TEST")]
    html = displacy.render(doc, style="ent")
    assert "&lt;RELEASE&gt;" in html


def test_issue2754(en_tokenizer):
    """Test that words like 'a' and 'a.m.' don't get exceptional norm values."""
    a = en_tokenizer("a")
@@ -106,6 +138,28 @@ def test_issue2782(text, lang_cls):
    assert doc[0].like_num


def test_issue2822(it_tokenizer):
    """Test that the abbreviation of poco is kept as one word."""
    doc = it_tokenizer("Vuoi un po' di zucchero?")
    assert len(doc) == 6
    assert doc[0].text == "Vuoi"
    assert doc[1].text == "un"
    assert doc[2].text == "po'"
    assert doc[2].lemma_ == "poco"
    assert doc[3].text == "di"
    assert doc[4].text == "zucchero"
    assert doc[5].text == "?"


def test_issue2833(en_vocab):
    """Test that a custom error is raised if a token or span is pickled."""
    doc = Doc(en_vocab, words=["Hello", "world"])
    with pytest.raises(NotImplementedError):
        pickle.dumps(doc[0])
    with pytest.raises(NotImplementedError):
        pickle.dumps(doc[0:2])


def test_issue2871():
    """Test that vectors recover the correct key for spaCy reserved words."""
    words = ["dog", "cat", "SUFFIX"]
@@ -134,3 +188,19 @@ def test_issue2901():
    doc = nlp("pythonが大好きです")
    assert doc


def test_issue2926(fr_tokenizer):
    """Test that the tokenizer correctly splits tokens separated by a slash (/)
    when the token before the slash ends in a digit.
    """
    doc = fr_tokenizer("Learn html5/css3/javascript/jquery")
    assert len(doc) == 8
    assert doc[0].text == "Learn"
    assert doc[1].text == "html5"
    assert doc[2].text == "/"
    assert doc[3].text == "css3"
    assert doc[4].text == "/"
    assert doc[5].text == "javascript"
    assert doc[6].text == "/"
    assert doc[7].text == "jquery"

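A note on test_issue2833 above: Token and Span objects are thin views into their parent Doc, which is why spaCy raises NotImplementedError rather than pickling them. Below is a minimal sketch of the supported route, pickling the Doc itself, assuming spaCy 2.x; this snippet is not part of the commit.

# Minimal sketch, assuming spaCy 2.x: pickle the Doc, not its views.
import pickle

import spacy

nlp = spacy.blank("en")
doc = nlp("Hello world")
data = pickle.dumps(doc)  # supported: the Doc owns the underlying storage
restored = pickle.loads(data)
assert [t.text for t in restored] == ["Hello", "world"]
# pickle.dumps(doc[0]) raises NotImplementedError, as test_issue2833 asserts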
View File

@@ -1,24 +0,0 @@
# coding: utf8
from __future__ import unicode_literals

from spacy.lang.en import English


def test_issue2656():
    """Test that the tokenizer correctly splits off punctuation after numbers
    with decimal points.
    """
    text = "I went for 40.3, and got home by 10.0."
    nlp = English()
    doc = nlp(text)
    assert len(doc) == 11
    assert doc[0].text == "I"
    assert doc[1].text == "went"
    assert doc[2].text == "for"
    assert doc[3].text == "40.3"
    assert doc[4].text == ","
    assert doc[5].text == "and"
    assert doc[6].text == "got"
    assert doc[7].text == "home"
    assert doc[8].text == "by"
    assert doc[9].text == "10.0"
    assert doc[10].text == "."

View File

@@ -1,16 +0,0 @@
# coding: utf8
from __future__ import unicode_literals

from spacy import displacy
from spacy.tokens import Doc, Span


def test_issue2728(en_vocab):
    """Test that displaCy ENT visualizer escapes HTML correctly."""
    doc = Doc(en_vocab, words=["test", "<RELEASE>", "test"])
    doc.ents = [Span(doc, 0, 1, label="TEST")]
    html = displacy.render(doc, style="ent")
    assert "&lt;RELEASE&gt;" in html
    doc.ents = [Span(doc, 1, 2, label="TEST")]
    html = displacy.render(doc, style="ent")
    assert "&lt;RELEASE&gt;" in html

View File

@@ -1,21 +0,0 @@
# coding: utf8
from __future__ import unicode_literals

from spacy.lang.it import Italian


def test_issue2822():
    """Test that the abbreviation of poco is kept as one word."""
    nlp = Italian()
    text = "Vuoi un po' di zucchero?"
    doc = nlp(text)
    assert len(doc) == 6
    assert doc[0].text == "Vuoi"
    assert doc[1].text == "un"
    assert doc[2].text == "po'"
    assert doc[2].lemma_ == "poco"
    assert doc[3].text == "di"
    assert doc[4].text == "zucchero"
    assert doc[5].text == "?"

View File

@@ -1,15 +0,0 @@
# coding: utf-8
from __future__ import unicode_literals

import pytest
from spacy.tokens import Doc
from spacy.compat import pickle


def test_issue2833(en_vocab):
    """Test that a custom error is raised if a token or span is pickled."""
    doc = Doc(en_vocab, words=["Hello", "world"])
    with pytest.raises(NotImplementedError):
        pickle.dumps(doc[0])
    with pytest.raises(NotImplementedError):
        pickle.dumps(doc[0:2])

View File

@@ -1,21 +0,0 @@
# coding: utf8
from __future__ import unicode_literals

from spacy.lang.fr import French


def test_issue2926():
    """Test that the tokenizer correctly splits tokens separated by a slash (/)
    when the token before the slash ends in a digit.
    """
    nlp = French()
    text = "Learn html5/css3/javascript/jquery"
    doc = nlp(text)
    assert len(doc) == 8
    assert doc[0].text == "Learn"
    assert doc[1].text == "html5"
    assert doc[2].text == "/"
    assert doc[3].text == "css3"
    assert doc[4].text == "/"
    assert doc[5].text == "javascript"
    assert doc[6].text == "/"
    assert doc[7].text == "jquery"
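
One pattern worth noting in this merge: the standalone files being deleted built a full pipeline per test (English(), Italian(), French()), while the merged versions take per-language tokenizer fixtures (en_tokenizer, it_tokenizer, fr_tokenizer) instead. Below is a minimal sketch of how such a fixture can be defined in conftest.py, assuming pytest and spaCy 2.x; the exact definition is an assumption, not copied from this commit.

# Minimal sketch of a conftest.py fixture in the spirit of the
# en_tokenizer / it_tokenizer / fr_tokenizer fixtures used above.
# Assumes pytest and spaCy 2.x.
import pytest

from spacy.util import get_lang_class


@pytest.fixture
def fr_tokenizer():
    # Build only the language's tokenizer, not a full pipeline,
    # so the regression tests stay cheap to run.
    return get_lang_class("fr").Defaults.create_tokenizer()

A tokenizer-only fixture skips full Language construction and can be shared across tests, which is presumably why the merged tests prefer it.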