From 586c56fc6cbb4d5a386c4c91f64d6c63fc21920e Mon Sep 17 00:00:00 2001
From: Ines Montani <ines@ines.io>
Date: Fri, 8 Feb 2019 15:51:13 +0100
Subject: [PATCH] Tidy up regression tests

---
 spacy/tests/regression/_test_issue1622.py     |   2 +-
 spacy/tests/regression/test_issue2501-3000.py | 136 ++++++++++++++++++
 spacy/tests/regression/test_issue2564.py      |  17 ---
 spacy/tests/regression/test_issue2569.py      |  17 ---
 spacy/tests/regression/test_issue2626.py      |  11 --
 spacy/tests/regression/test_issue2671.py      |  28 ----
 spacy/tests/regression/test_issue2754.py      |  10 --
 spacy/tests/regression/test_issue2772.py      |  15 --
 spacy/tests/regression/test_issue2782.py      |  16 ---
 spacy/tests/regression/test_issue2835.py      |  11 --
 spacy/tests/regression/test_issue2871.py      |  25 ----
 spacy/tests/regression/test_issue2901.py      |  17 ---
 12 files changed, 137 insertions(+), 168 deletions(-)
 create mode 100644 spacy/tests/regression/test_issue2501-3000.py
 delete mode 100644 spacy/tests/regression/test_issue2564.py
 delete mode 100644 spacy/tests/regression/test_issue2569.py
 delete mode 100644 spacy/tests/regression/test_issue2626.py
 delete mode 100644 spacy/tests/regression/test_issue2671.py
 delete mode 100644 spacy/tests/regression/test_issue2754.py
 delete mode 100644 spacy/tests/regression/test_issue2772.py
 delete mode 100644 spacy/tests/regression/test_issue2782.py
 delete mode 100644 spacy/tests/regression/test_issue2835.py
 delete mode 100644 spacy/tests/regression/test_issue2871.py
 delete mode 100644 spacy/tests/regression/test_issue2901.py

diff --git a/spacy/tests/regression/_test_issue1622.py b/spacy/tests/regression/_test_issue1622.py
index e8348b508..607a75c8e 100644
--- a/spacy/tests/regression/_test_issue1622.py
+++ b/spacy/tests/regression/_test_issue1622.py
@@ -4,7 +4,7 @@ from __future__ import unicode_literals
 import json
 from tempfile import NamedTemporaryFile
 
-from ...cli.train import train
+from spacy.cli.train import train
 
 
 def test_cli_trained_model_can_be_saved(tmpdir):
diff --git a/spacy/tests/regression/test_issue2501-3000.py b/spacy/tests/regression/test_issue2501-3000.py
new file mode 100644
index 000000000..a0df71135
--- /dev/null
+++ b/spacy/tests/regression/test_issue2501-3000.py
@@ -0,0 +1,136 @@
+# coding: utf8
+from __future__ import unicode_literals
+
+import pytest
+from spacy.lang.en import English
+from spacy.lang.ja import Japanese
+from spacy.lang.xx import MultiLanguage
+from spacy.language import Language
+from spacy.matcher import Matcher
+from spacy.tokens import Span
+from spacy.vocab import Vocab
+from spacy._ml import link_vectors_to_models
+import numpy
+
+from ..util import get_doc
+
+
+def test_issue2564():
+    """Test the tagger sets is_tagged on direct calls and via Language.pipe."""
+    nlp = Language()
+    tagger = nlp.create_pipe("tagger")
+    tagger.begin_training()  # initialise weights
+    nlp.add_pipe(tagger)
+    doc = nlp("hello world")
+    assert doc.is_tagged
+    docs = nlp.pipe(["hello", "world"])
+    piped_doc = next(docs)  # only the first piped doc is checked
+    assert piped_doc.is_tagged
+
+
+def test_issue2569(en_tokenizer):
+    """Test that operator + is greedy: the longest match is the full DATE span."""
+    doc = en_tokenizer("It is May 15, 1993.")
+    doc.ents = [Span(doc, 2, 6, label=doc.vocab.strings["DATE"])]
+    matcher = Matcher(doc.vocab)
+    matcher.add("RULE", None, [{"ENT_TYPE": "DATE", "OP": "+"}])
+    matched = [doc[start:end] for _, start, end in matcher(doc)]
+    matched = sorted(matched, key=len, reverse=True)  # longest match first
+    assert len(matched) == 10  # presumably every sub-span of the entity: 4+3+2+1
+    assert len(matched[0]) == 4
+    assert matched[0].text == "May 15, 1993"
+
+
+@pytest.mark.parametrize(
+    "text",
+    [
+        "ABLEItemColumn IAcceptance Limits of ErrorIn-Service Limits of ErrorColumn IIColumn IIIColumn IVColumn VComputed VolumeUnder Registration of\xa0VolumeOver Registration of\xa0VolumeUnder Registration of\xa0VolumeOver Registration of\xa0VolumeCubic FeetCubic FeetCubic FeetCubic FeetCubic Feet1Up to 10.0100.0050.0100.005220.0200.0100.0200.010350.0360.0180.0360.0184100.0500.0250.0500.0255Over 100.5% of computed volume0.25% of computed volume0.5% of computed volume0.25% of computed volume TABLE ItemColumn IAcceptance Limits of ErrorIn-Service Limits of ErrorColumn IIColumn IIIColumn IVColumn VComputed VolumeUnder Registration of\xa0VolumeOver Registration of\xa0VolumeUnder Registration of\xa0VolumeOver Registration of\xa0VolumeCubic FeetCubic FeetCubic FeetCubic FeetCubic Feet1Up to 10.0100.0050.0100.005220.0200.0100.0200.010350.0360.0180.0360.0184100.0500.0250.0500.0255Over 100.5% of computed volume0.25% of computed volume0.5% of computed volume0.25% of computed volume ItemColumn IAcceptance Limits of ErrorIn-Service Limits of ErrorColumn IIColumn IIIColumn IVColumn VComputed VolumeUnder Registration of\xa0VolumeOver Registration of\xa0VolumeUnder Registration of\xa0VolumeOver Registration of\xa0VolumeCubic FeetCubic FeetCubic FeetCubic FeetCubic Feet1Up to 10.0100.0050.0100.005220.0200.0100.0200.010350.0360.0180.0360.0184100.0500.0250.0500.0255Over 100.5% of computed volume0.25% of computed volume0.5% of computed volume0.25% of computed volume",
+        "oow.jspsearch.eventoracleopenworldsearch.technologyoraclesolarissearch.technologystoragesearch.technologylinuxsearch.technologyserverssearch.technologyvirtualizationsearch.technologyengineeredsystemspcodewwmkmppscem:",
+    ],
+)
+def test_issue2626_2835(en_tokenizer, text):
+    """Check that the given text doesn't cause an infinite loop in the tokenizer."""
+    doc = en_tokenizer(text)
+    assert doc
+
+
+def test_issue2671():
+    """Ensure the correct entity ID is returned for matches with quantifiers.
+    See also #2675. NOTE(review): passes vacuously if the matcher finds nothing.
+    """
+    nlp = English()
+    matcher = Matcher(nlp.vocab)
+    pattern_id = "test_pattern"
+    pattern = [
+        {"LOWER": "high"},
+        {"IS_PUNCT": True, "OP": "?"},
+        {"LOWER": "adrenaline"},
+    ]
+    matcher.add(pattern_id, None, pattern)
+    doc1 = nlp("This is a high-adrenaline situation.")
+    doc2 = nlp("This is a high adrenaline situation.")
+    matches1 = matcher(doc1)
+    for match_id, start, end in matches1:
+        assert nlp.vocab.strings[match_id] == pattern_id
+    matches2 = matcher(doc2)
+    for match_id, start, end in matches2:
+        assert nlp.vocab.strings[match_id] == pattern_id
+
+
+def test_issue2754(en_tokenizer):
+    """Test that the words 'a' and 'am' don't get exceptional norm values."""
+    a = en_tokenizer("a")
+    assert a[0].norm_ == "a"
+    am = en_tokenizer("am")
+    assert am[0].norm_ == "am"
+
+
+def test_issue2772(en_vocab):
+    """Test that deprojectivization doesn't mess up sentence boundaries."""
+    words = "When we write or communicate virtually , we can hide our true feelings .".split()
+    # A tree with a non-projective (i.e. crossing) arc. Heads appear to be
+    # offsets relative to each token (TODO confirm): arcs (0, 4), (2, 9) cross.
+    heads = [4, 1, 7, -1, -2, -1, 3, 2, 1, 0, -1, -2, -1]
+    deps = ["dep"] * len(heads)
+    doc = get_doc(en_vocab, words=words, heads=heads, deps=deps)
+    assert doc[1].is_sent_start is None  # None, i.e. neither True nor False
+
+
+@pytest.mark.parametrize("text", ["-0.23", "+123,456", "±1"])
+@pytest.mark.parametrize("lang_cls", [English, MultiLanguage])
+def test_issue2782(text, lang_cls):
+    """Check that like_num handles a leading +, - or ± sign before a number."""
+    nlp = lang_cls()
+    doc = nlp(text)
+    assert len(doc) == 1  # the sign must not be split off into its own token
+    assert doc[0].like_num
+
+
+def test_issue2871():
+    """Test that vectors recover the correct key for spaCy reserved words."""
+    words = ["dog", "cat", "SUFFIX"]
+    vocab = Vocab()
+    vocab.vectors.resize(shape=(3, 10))
+    vector_data = numpy.zeros((3, 10), dtype="f")
+    for word in words:
+        _ = vocab[word]  # noqa: F841
+        vocab.set_vector(word, vector_data[0])  # every word gets the same zero row
+    vocab.vectors.name = "dummy_vectors"
+    link_vectors_to_models(vocab)
+    assert vocab["dog"].rank == 0  # ranks are expected in insertion order
+    assert vocab["cat"].rank == 1
+    assert vocab["SUFFIX"].rank == 2
+    assert vocab.vectors.find(key="dog") == 0
+    assert vocab.vectors.find(key="cat") == 1
+    assert vocab.vectors.find(key="SUFFIX") == 2
+
+
+def test_issue2901():
+    """Test that Japanese `nlp` doesn't fail (skipped if its imports are missing)."""
+    try:
+        nlp = Japanese()
+    except ImportError:
+        pytest.skip("Japanese language dependencies are not installed")
+
+    doc = nlp("pythonが大好きです")
+    assert doc
diff --git a/spacy/tests/regression/test_issue2564.py b/spacy/tests/regression/test_issue2564.py
deleted file mode 100644
index 12b376d1a..000000000
--- a/spacy/tests/regression/test_issue2564.py
+++ /dev/null
@@ -1,17 +0,0 @@
-# coding: utf8
-from __future__ import unicode_literals
-
-from spacy.language import Language
-
-
-def test_issue2564():
-    """Test the tagger sets is_tagged correctly when used via Language.pipe."""
-    nlp = Language()
-    tagger = nlp.create_pipe("tagger")
-    tagger.begin_training()  # initialise weights
-    nlp.add_pipe(tagger)
-    doc = nlp("hello world")
-    assert doc.is_tagged
-    docs = nlp.pipe(["hello", "world"])
-    piped_doc = next(docs)
-    assert piped_doc.is_tagged
diff --git a/spacy/tests/regression/test_issue2569.py b/spacy/tests/regression/test_issue2569.py
deleted file mode 100644
index 6f30948c5..000000000
--- a/spacy/tests/regression/test_issue2569.py
+++ /dev/null
@@ -1,17 +0,0 @@
-# coding: utf8
-from __future__ import unicode_literals
-
-from spacy.matcher import Matcher
-from spacy.tokens import Span
-
-
-def test_issue2569(en_tokenizer):
-    doc = en_tokenizer("It is May 15, 1993.")
-    doc.ents = [Span(doc, 2, 6, label=doc.vocab.strings["DATE"])]
-    matcher = Matcher(doc.vocab)
-    matcher.add("RULE", None, [{"ENT_TYPE": "DATE", "OP": "+"}])
-    matched = [doc[start:end] for _, start, end in matcher(doc)]
-    matched = sorted(matched, key=len, reverse=True)
-    assert len(matched) == 10
-    assert len(matched[0]) == 4
-    assert matched[0].text == "May 15, 1993"
diff --git a/spacy/tests/regression/test_issue2626.py b/spacy/tests/regression/test_issue2626.py
deleted file mode 100644
index 48cee35a0..000000000
--- a/spacy/tests/regression/test_issue2626.py
+++ /dev/null
@@ -1,11 +0,0 @@
-# coding: utf8
-from __future__ import unicode_literals
-
-
-def test_issue2626(en_tokenizer):
-    """Check that sentence doesn't cause an infinite loop in the tokenizer."""
-    text = """
-    ABLEItemColumn IAcceptance Limits of ErrorIn-Service Limits of ErrorColumn IIColumn IIIColumn IVColumn VComputed VolumeUnder Registration of\xa0VolumeOver Registration of\xa0VolumeUnder Registration of\xa0VolumeOver Registration of\xa0VolumeCubic FeetCubic FeetCubic FeetCubic FeetCubic Feet1Up to 10.0100.0050.0100.005220.0200.0100.0200.010350.0360.0180.0360.0184100.0500.0250.0500.0255Over 100.5% of computed volume0.25% of computed volume0.5% of computed volume0.25% of computed volume TABLE ItemColumn IAcceptance Limits of ErrorIn-Service Limits of ErrorColumn IIColumn IIIColumn IVColumn VComputed VolumeUnder Registration of\xa0VolumeOver Registration of\xa0VolumeUnder Registration of\xa0VolumeOver Registration of\xa0VolumeCubic FeetCubic FeetCubic FeetCubic FeetCubic Feet1Up to 10.0100.0050.0100.005220.0200.0100.0200.010350.0360.0180.0360.0184100.0500.0250.0500.0255Over 100.5% of computed volume0.25% of computed volume0.5% of computed volume0.25% of computed volume ItemColumn IAcceptance Limits of ErrorIn-Service Limits of ErrorColumn IIColumn IIIColumn IVColumn VComputed VolumeUnder Registration of\xa0VolumeOver Registration of\xa0VolumeUnder Registration of\xa0VolumeOver Registration of\xa0VolumeCubic FeetCubic FeetCubic FeetCubic FeetCubic Feet1Up to 10.0100.0050.0100.005220.0200.0100.0200.010350.0360.0180.0360.0184100.0500.0250.0500.0255Over 100.5% of computed volume0.25% of computed volume0.5% of computed volume0.25% of computed volume
-    """
-    doc = en_tokenizer(text)
-    assert doc
diff --git a/spacy/tests/regression/test_issue2671.py b/spacy/tests/regression/test_issue2671.py
deleted file mode 100644
index f2595a220..000000000
--- a/spacy/tests/regression/test_issue2671.py
+++ /dev/null
@@ -1,28 +0,0 @@
-# coding: utf-8
-from __future__ import unicode_literals
-
-from spacy.lang.en import English
-from spacy.matcher import Matcher
-
-
-def test_issue2671():
-    """Ensure the correct entity ID is returned for matches with quantifiers.
-    See also #2675
-    """
-    nlp = English()
-    matcher = Matcher(nlp.vocab)
-    pattern_id = "test_pattern"
-    pattern = [
-        {"LOWER": "high"},
-        {"IS_PUNCT": True, "OP": "?"},
-        {"LOWER": "adrenaline"},
-    ]
-    matcher.add(pattern_id, None, pattern)
-    doc1 = nlp("This is a high-adrenaline situation.")
-    doc2 = nlp("This is a high adrenaline situation.")
-    matches1 = matcher(doc1)
-    for match_id, start, end in matches1:
-        assert nlp.vocab.strings[match_id] == pattern_id
-    matches2 = matcher(doc2)
-    for match_id, start, end in matches2:
-        assert nlp.vocab.strings[match_id] == pattern_id
diff --git a/spacy/tests/regression/test_issue2754.py b/spacy/tests/regression/test_issue2754.py
deleted file mode 100644
index c05006517..000000000
--- a/spacy/tests/regression/test_issue2754.py
+++ /dev/null
@@ -1,10 +0,0 @@
-# coding: utf8
-from __future__ import unicode_literals
-
-
-def test_issue2754(en_tokenizer):
-    """Test that words like 'a' and 'a.m.' don't get exceptional norm values."""
-    a = en_tokenizer("a")
-    assert a[0].norm_ == "a"
-    am = en_tokenizer("am")
-    assert am[0].norm_ == "am"
diff --git a/spacy/tests/regression/test_issue2772.py b/spacy/tests/regression/test_issue2772.py
deleted file mode 100644
index 3ae2a7860..000000000
--- a/spacy/tests/regression/test_issue2772.py
+++ /dev/null
@@ -1,15 +0,0 @@
-# coding: utf-8
-from __future__ import unicode_literals
-
-from ..util import get_doc
-
-
-def test_issue2772(en_vocab):
-    """Test that deprojectivization doesn't mess up sentence boundaries."""
-    words = "When we write or communicate virtually , we can hide our true feelings .".split()
-    # A tree with a non-projective (i.e. crossing) arc
-    # The arcs (0, 4) and (2, 9) cross.
-    heads = [4, 1, 7, -1, -2, -1, 3, 2, 1, 0, -1, -2, -1]
-    deps = ["dep"] * len(heads)
-    doc = get_doc(en_vocab, words=words, heads=heads, deps=deps)
-    assert doc[1].is_sent_start is None
diff --git a/spacy/tests/regression/test_issue2782.py b/spacy/tests/regression/test_issue2782.py
deleted file mode 100644
index 86591ab12..000000000
--- a/spacy/tests/regression/test_issue2782.py
+++ /dev/null
@@ -1,16 +0,0 @@
-# coding: utf8
-from __future__ import unicode_literals
-
-from spacy.util import get_lang_class
-import pytest
-
-
-@pytest.mark.parametrize("text", ["-0.23", "+123,456", "±1"])
-@pytest.mark.parametrize("lang", ["en", "xx"])
-def test_issue2782(text, lang):
-    """Check that like_num handles + and - before number."""
-    cls = get_lang_class(lang)
-    nlp = cls()
-    doc = nlp(text)
-    assert len(doc) == 1
-    assert doc[0].like_num
diff --git a/spacy/tests/regression/test_issue2835.py b/spacy/tests/regression/test_issue2835.py
deleted file mode 100644
index e5734b756..000000000
--- a/spacy/tests/regression/test_issue2835.py
+++ /dev/null
@@ -1,11 +0,0 @@
-# coding: utf8
-from __future__ import unicode_literals
-
-
-def test_issue2835(en_tokenizer):
-    """Check that sentence doesn't cause an infinite loop in the tokenizer."""
-    text = """
-    oow.jspsearch.eventoracleopenworldsearch.technologyoraclesolarissearch.technologystoragesearch.technologylinuxsearch.technologyserverssearch.technologyvirtualizationsearch.technologyengineeredsystemspcodewwmkmppscem:
-    """
-    doc = en_tokenizer(text)
-    assert doc
diff --git a/spacy/tests/regression/test_issue2871.py b/spacy/tests/regression/test_issue2871.py
deleted file mode 100644
index b71099ed0..000000000
--- a/spacy/tests/regression/test_issue2871.py
+++ /dev/null
@@ -1,25 +0,0 @@
-# coding: utf8
-from __future__ import unicode_literals
-
-import numpy
-from spacy.vocab import Vocab
-from spacy._ml import link_vectors_to_models
-
-
-def test_issue2871():
-    """Test that vectors recover the correct key for spaCy reserved words."""
-    words = ["dog", "cat", "SUFFIX"]
-    vocab = Vocab()
-    vocab.vectors.resize(shape=(3, 10))
-    vector_data = numpy.zeros((3, 10), dtype="f")
-    for word in words:
-        _ = vocab[word]  # noqa: F841
-        vocab.set_vector(word, vector_data[0])
-    vocab.vectors.name = "dummy_vectors"
-    link_vectors_to_models(vocab)
-    assert vocab["dog"].rank == 0
-    assert vocab["cat"].rank == 1
-    assert vocab["SUFFIX"].rank == 2
-    assert vocab.vectors.find(key="dog") == 0
-    assert vocab.vectors.find(key="cat") == 1
-    assert vocab.vectors.find(key="SUFFIX") == 2
diff --git a/spacy/tests/regression/test_issue2901.py b/spacy/tests/regression/test_issue2901.py
deleted file mode 100644
index 077e33706..000000000
--- a/spacy/tests/regression/test_issue2901.py
+++ /dev/null
@@ -1,17 +0,0 @@
-# coding: utf8
-from __future__ import unicode_literals
-
-import pytest
-
-from ...lang.ja import Japanese
-
-
-def test_issue2901():
-    """Test that `nlp` doesn't fail."""
-    try:
-        nlp = Japanese()
-    except ImportError:
-        pytest.skip()
-
-    doc = nlp("pythonが大好きです")
-    assert doc