From 5b7b2a498d4651196fac837dfd06885e021b3456 Mon Sep 17 00:00:00 2001 From: Ines Montani Date: Mon, 6 Jul 2020 14:05:59 +0200 Subject: [PATCH 01/21] Tidy up and merge regression tests --- spacy/tests/regression/test_issue2001-2500.py | 2 + spacy/tests/regression/test_issue2501-3000.py | 5 +- spacy/tests/regression/test_issue3001-3500.py | 1 + spacy/tests/regression/test_issue3501-4000.py | 472 ++++++++++++++++++ spacy/tests/regression/test_issue3521.py | 8 - spacy/tests/regression/test_issue3526.py | 85 ---- spacy/tests/regression/test_issue3531.py | 30 -- spacy/tests/regression/test_issue3540.py | 44 -- spacy/tests/regression/test_issue3549.py | 12 - spacy/tests/regression/test_issue3555.py | 14 - spacy/tests/regression/test_issue3611.py | 45 -- spacy/tests/regression/test_issue3625.py | 9 - spacy/tests/regression/test_issue3803.py | 10 - spacy/tests/regression/test_issue3830.py | 34 -- spacy/tests/regression/test_issue3839.py | 18 - spacy/tests/regression/test_issue3869.py | 25 - spacy/tests/regression/test_issue3879.py | 11 - spacy/tests/regression/test_issue3880.py | 21 - spacy/tests/regression/test_issue3882.py | 12 - spacy/tests/regression/test_issue3951.py | 17 - spacy/tests/regression/test_issue3959.py | 26 - spacy/tests/regression/test_issue3962.py | 117 ----- spacy/tests/regression/test_issue3972.py | 19 - spacy/tests/regression/test_issue4001-4500.py | 469 +++++++++++++++++ spacy/tests/regression/test_issue4002.py | 23 - spacy/tests/regression/test_issue4030.py | 50 -- spacy/tests/regression/test_issue4042.py | 85 ---- spacy/tests/regression/test_issue4054.py | 30 -- spacy/tests/regression/test_issue4120.py | 23 - spacy/tests/regression/test_issue4133.py | 28 -- spacy/tests/regression/test_issue4190.py | 46 -- spacy/tests/regression/test_issue4267.py | 34 -- spacy/tests/regression/test_issue4272.py | 9 - spacy/tests/regression/test_issue4278.py | 25 - spacy/tests/regression/test_issue4313.py | 47 -- spacy/tests/regression/test_issue4348.py | 24 - 
spacy/tests/regression/test_issue4367.py | 8 - spacy/tests/regression/test_issue4373.py | 10 - spacy/tests/regression/test_issue4402.py | 98 ---- spacy/tests/regression/test_issue4501-5000.py | 288 +++++++++++ spacy/tests/regression/test_issue4528.py | 16 - spacy/tests/regression/test_issue4529.py | 11 - spacy/tests/regression/test_issue4590.py | 35 -- spacy/tests/regression/test_issue4651.py | 62 --- spacy/tests/regression/test_issue4665.py | 35 -- spacy/tests/regression/test_issue4674.py | 36 -- spacy/tests/regression/test_issue4707.py | 20 - spacy/tests/regression/test_issue4725.py | 41 -- spacy/tests/regression/test_issue4849.py | 34 -- spacy/tests/regression/test_issue4903.py | 40 -- spacy/tests/regression/test_issue4924.py | 8 - spacy/tests/regression/test_issue5152.py | 3 +- spacy/tests/regression/test_issue5230.py | 3 +- 53 files changed, 1240 insertions(+), 1438 deletions(-) create mode 100644 spacy/tests/regression/test_issue3501-4000.py delete mode 100644 spacy/tests/regression/test_issue3521.py delete mode 100644 spacy/tests/regression/test_issue3526.py delete mode 100644 spacy/tests/regression/test_issue3531.py delete mode 100644 spacy/tests/regression/test_issue3540.py delete mode 100644 spacy/tests/regression/test_issue3549.py delete mode 100644 spacy/tests/regression/test_issue3555.py delete mode 100644 spacy/tests/regression/test_issue3611.py delete mode 100644 spacy/tests/regression/test_issue3625.py delete mode 100644 spacy/tests/regression/test_issue3803.py delete mode 100644 spacy/tests/regression/test_issue3830.py delete mode 100644 spacy/tests/regression/test_issue3839.py delete mode 100644 spacy/tests/regression/test_issue3869.py delete mode 100644 spacy/tests/regression/test_issue3879.py delete mode 100644 spacy/tests/regression/test_issue3880.py delete mode 100644 spacy/tests/regression/test_issue3882.py delete mode 100644 spacy/tests/regression/test_issue3951.py delete mode 100644 spacy/tests/regression/test_issue3959.py delete mode 
100644 spacy/tests/regression/test_issue3962.py delete mode 100644 spacy/tests/regression/test_issue3972.py create mode 100644 spacy/tests/regression/test_issue4001-4500.py delete mode 100644 spacy/tests/regression/test_issue4002.py delete mode 100644 spacy/tests/regression/test_issue4030.py delete mode 100644 spacy/tests/regression/test_issue4042.py delete mode 100644 spacy/tests/regression/test_issue4054.py delete mode 100644 spacy/tests/regression/test_issue4120.py delete mode 100644 spacy/tests/regression/test_issue4133.py delete mode 100644 spacy/tests/regression/test_issue4190.py delete mode 100644 spacy/tests/regression/test_issue4267.py delete mode 100644 spacy/tests/regression/test_issue4272.py delete mode 100644 spacy/tests/regression/test_issue4278.py delete mode 100644 spacy/tests/regression/test_issue4313.py delete mode 100644 spacy/tests/regression/test_issue4348.py delete mode 100644 spacy/tests/regression/test_issue4367.py delete mode 100644 spacy/tests/regression/test_issue4373.py delete mode 100644 spacy/tests/regression/test_issue4402.py create mode 100644 spacy/tests/regression/test_issue4501-5000.py delete mode 100644 spacy/tests/regression/test_issue4528.py delete mode 100644 spacy/tests/regression/test_issue4529.py delete mode 100644 spacy/tests/regression/test_issue4590.py delete mode 100644 spacy/tests/regression/test_issue4651.py delete mode 100644 spacy/tests/regression/test_issue4665.py delete mode 100644 spacy/tests/regression/test_issue4674.py delete mode 100644 spacy/tests/regression/test_issue4707.py delete mode 100644 spacy/tests/regression/test_issue4725.py delete mode 100644 spacy/tests/regression/test_issue4849.py delete mode 100644 spacy/tests/regression/test_issue4903.py delete mode 100644 spacy/tests/regression/test_issue4924.py diff --git a/spacy/tests/regression/test_issue2001-2500.py b/spacy/tests/regression/test_issue2001-2500.py index 67966f70e..8b998d216 100644 --- a/spacy/tests/regression/test_issue2001-2500.py +++ 
b/spacy/tests/regression/test_issue2001-2500.py @@ -23,6 +23,7 @@ def test_issue2070(): assert len(doc) == 11 +@pytest.mark.filterwarnings("ignore::UserWarning") def test_issue2179(): """Test that spurious 'extra_labels' aren't created when initializing NER.""" nlp = Italian() @@ -134,6 +135,7 @@ def test_issue2464(en_vocab): assert len(matches) == 3 +@pytest.mark.filterwarnings("ignore::UserWarning") def test_issue2482(): """Test we can serialize and deserialize a blank NER or parser model.""" nlp = Italian() diff --git a/spacy/tests/regression/test_issue2501-3000.py b/spacy/tests/regression/test_issue2501-3000.py index 5d504a9c6..768ae33fe 100644 --- a/spacy/tests/regression/test_issue2501-3000.py +++ b/spacy/tests/regression/test_issue2501-3000.py @@ -138,13 +138,16 @@ def test_issue2782(text, lang_cls): assert doc[0].like_num +@pytest.mark.filterwarnings("ignore::UserWarning") def test_issue2800(): """Test issue that arises when too many labels are added to NER model. Used to cause segfault. 
""" nlp = English() train_data = [] - train_data.extend([Example.from_dict(nlp.make_doc("One sentence"), {"entities": []})]) + train_data.extend( + [Example.from_dict(nlp.make_doc("One sentence"), {"entities": []})] + ) entity_types = [str(i) for i in range(1000)] ner = nlp.create_pipe("ner") nlp.add_pipe(ner) diff --git a/spacy/tests/regression/test_issue3001-3500.py b/spacy/tests/regression/test_issue3001-3500.py index 1aceba68f..1d5bfcb92 100644 --- a/spacy/tests/regression/test_issue3001-3500.py +++ b/spacy/tests/regression/test_issue3001-3500.py @@ -88,6 +88,7 @@ def test_issue3199(): assert list(doc[0:3].noun_chunks) == [] +@pytest.mark.filterwarnings("ignore::UserWarning") def test_issue3209(): """Test issue that occurred in spaCy nightly where NER labels were being mapped to classes incorrectly after loading the model, when the labels diff --git a/spacy/tests/regression/test_issue3501-4000.py b/spacy/tests/regression/test_issue3501-4000.py new file mode 100644 index 000000000..5e2ee902c --- /dev/null +++ b/spacy/tests/regression/test_issue3501-4000.py @@ -0,0 +1,472 @@ +import pytest +from spacy.language import Language +from spacy.vocab import Vocab +from spacy.pipeline import EntityRuler, DependencyParser +from spacy.pipeline.defaults import default_parser +from spacy import displacy, load +from spacy.displacy import parse_deps +from spacy.tokens import Doc, Token +from spacy.matcher import Matcher, PhraseMatcher +from spacy.errors import MatchPatternError +from spacy.util import minibatch +from spacy.gold import Example +from spacy.lang.hi import Hindi +from spacy.lang.es import Spanish +from spacy.lang.en import English +from spacy.attrs import IS_ALPHA +from thinc.api import compounding +import spacy +import srsly +import numpy + +from ..util import make_tempdir, get_doc + + +@pytest.mark.parametrize("word", ["don't", "don’t", "I'd", "I’d"]) +def test_issue3521(en_tokenizer, word): + tok = en_tokenizer(word)[1] + # 'not' and 'would' should be 
stopwords, also in their abbreviated forms + assert tok.is_stop + + +def test_issue_3526_1(en_vocab): + patterns = [ + {"label": "HELLO", "pattern": "hello world"}, + {"label": "BYE", "pattern": [{"LOWER": "bye"}, {"LOWER": "bye"}]}, + {"label": "HELLO", "pattern": [{"ORTH": "HELLO"}]}, + {"label": "COMPLEX", "pattern": [{"ORTH": "foo", "OP": "*"}]}, + {"label": "TECH_ORG", "pattern": "Apple", "id": "a1"}, + ] + nlp = Language(vocab=en_vocab) + ruler = EntityRuler(nlp, patterns=patterns, overwrite_ents=True) + ruler_bytes = ruler.to_bytes() + assert len(ruler) == len(patterns) + assert len(ruler.labels) == 4 + assert ruler.overwrite + new_ruler = EntityRuler(nlp) + new_ruler = new_ruler.from_bytes(ruler_bytes) + assert len(new_ruler) == len(ruler) + assert len(new_ruler.labels) == 4 + assert new_ruler.overwrite == ruler.overwrite + assert new_ruler.ent_id_sep == ruler.ent_id_sep + + +def test_issue_3526_2(en_vocab): + patterns = [ + {"label": "HELLO", "pattern": "hello world"}, + {"label": "BYE", "pattern": [{"LOWER": "bye"}, {"LOWER": "bye"}]}, + {"label": "HELLO", "pattern": [{"ORTH": "HELLO"}]}, + {"label": "COMPLEX", "pattern": [{"ORTH": "foo", "OP": "*"}]}, + {"label": "TECH_ORG", "pattern": "Apple", "id": "a1"}, + ] + nlp = Language(vocab=en_vocab) + ruler = EntityRuler(nlp, patterns=patterns, overwrite_ents=True) + bytes_old_style = srsly.msgpack_dumps(ruler.patterns) + new_ruler = EntityRuler(nlp) + new_ruler = new_ruler.from_bytes(bytes_old_style) + assert len(new_ruler) == len(ruler) + for pattern in ruler.patterns: + assert pattern in new_ruler.patterns + assert new_ruler.overwrite is not ruler.overwrite + + +def test_issue_3526_3(en_vocab): + patterns = [ + {"label": "HELLO", "pattern": "hello world"}, + {"label": "BYE", "pattern": [{"LOWER": "bye"}, {"LOWER": "bye"}]}, + {"label": "HELLO", "pattern": [{"ORTH": "HELLO"}]}, + {"label": "COMPLEX", "pattern": [{"ORTH": "foo", "OP": "*"}]}, + {"label": "TECH_ORG", "pattern": "Apple", "id": "a1"}, + ] + nlp 
= Language(vocab=en_vocab) + ruler = EntityRuler(nlp, patterns=patterns, overwrite_ents=True) + with make_tempdir() as tmpdir: + out_file = tmpdir / "entity_ruler" + srsly.write_jsonl(out_file.with_suffix(".jsonl"), ruler.patterns) + new_ruler = EntityRuler(nlp).from_disk(out_file) + for pattern in ruler.patterns: + assert pattern in new_ruler.patterns + assert len(new_ruler) == len(ruler) + assert new_ruler.overwrite is not ruler.overwrite + + +@pytest.mark.filterwarnings("ignore::UserWarning") +def test_issue_3526_4(en_vocab): + nlp = Language(vocab=en_vocab) + ruler = EntityRuler(nlp, overwrite_ents=True) + ruler.add_patterns([{"label": "ORG", "pattern": "Apple"}]) + nlp.add_pipe(ruler) + with make_tempdir() as tmpdir: + nlp.to_disk(tmpdir) + ruler = nlp.get_pipe("entity_ruler") + assert ruler.patterns == [{"label": "ORG", "pattern": "Apple"}] + assert ruler.overwrite is True + nlp2 = load(tmpdir) + new_ruler = nlp2.get_pipe("entity_ruler") + assert new_ruler.patterns == [{"label": "ORG", "pattern": "Apple"}] + assert new_ruler.overwrite is True + + +def test_issue3531(): + """Test that displaCy renderer doesn't require "settings" key.""" + example_dep = { + "words": [ + {"text": "But", "tag": "CCONJ"}, + {"text": "Google", "tag": "PROPN"}, + {"text": "is", "tag": "VERB"}, + {"text": "starting", "tag": "VERB"}, + {"text": "from", "tag": "ADP"}, + {"text": "behind.", "tag": "ADV"}, + ], + "arcs": [ + {"start": 0, "end": 3, "label": "cc", "dir": "left"}, + {"start": 1, "end": 3, "label": "nsubj", "dir": "left"}, + {"start": 2, "end": 3, "label": "aux", "dir": "left"}, + {"start": 3, "end": 4, "label": "prep", "dir": "right"}, + {"start": 4, "end": 5, "label": "pcomp", "dir": "right"}, + ], + } + example_ent = { + "text": "But Google is starting from behind.", + "ents": [{"start": 4, "end": 10, "label": "ORG"}], + } + dep_html = displacy.render(example_dep, style="dep", manual=True) + assert dep_html + ent_html = displacy.render(example_ent, style="ent", 
manual=True) + assert ent_html + + +def test_issue3540(en_vocab): + words = ["I", "live", "in", "NewYork", "right", "now"] + tensor = numpy.asarray( + [[1.0, 1.1], [2.0, 2.1], [3.0, 3.1], [4.0, 4.1], [5.0, 5.1], [6.0, 6.1]], + dtype="f", + ) + doc = Doc(en_vocab, words=words) + doc.tensor = tensor + gold_text = ["I", "live", "in", "NewYork", "right", "now"] + assert [token.text for token in doc] == gold_text + gold_lemma = ["I", "live", "in", "NewYork", "right", "now"] + assert [token.lemma_ for token in doc] == gold_lemma + vectors_1 = [token.vector for token in doc] + assert len(vectors_1) == len(doc) + + with doc.retokenize() as retokenizer: + heads = [(doc[3], 1), doc[2]] + attrs = {"POS": ["PROPN", "PROPN"], "DEP": ["pobj", "compound"]} + retokenizer.split(doc[3], ["New", "York"], heads=heads, attrs=attrs) + + gold_text = ["I", "live", "in", "New", "York", "right", "now"] + assert [token.text for token in doc] == gold_text + gold_lemma = ["I", "live", "in", "New", "York", "right", "now"] + assert [token.lemma_ for token in doc] == gold_lemma + vectors_2 = [token.vector for token in doc] + assert len(vectors_2) == len(doc) + assert vectors_1[0].tolist() == vectors_2[0].tolist() + assert vectors_1[1].tolist() == vectors_2[1].tolist() + assert vectors_1[2].tolist() == vectors_2[2].tolist() + assert vectors_1[4].tolist() == vectors_2[5].tolist() + assert vectors_1[5].tolist() == vectors_2[6].tolist() + + +def test_issue3549(en_vocab): + """Test that match pattern validation doesn't raise on empty errors.""" + matcher = Matcher(en_vocab, validate=True) + pattern = [{"LOWER": "hello"}, {"LOWER": "world"}] + matcher.add("GOOD", [pattern]) + with pytest.raises(MatchPatternError): + matcher.add("BAD", [[{"X": "Y"}]]) + + +@pytest.mark.xfail +def test_issue3555(en_vocab): + """Test that custom extensions with default None don't break matcher.""" + Token.set_extension("issue3555", default=None) + matcher = Matcher(en_vocab) + pattern = [{"LEMMA": "have"}, {"_": 
{"issue3555": True}}] + matcher.add("TEST", [pattern]) + doc = Doc(en_vocab, words=["have", "apple"]) + matcher(doc) + + +def test_issue3611(): + """ Test whether adding n-grams in the textcat works even when n > token length of some docs """ + unique_classes = ["offensive", "inoffensive"] + x_train = [ + "This is an offensive text", + "This is the second offensive text", + "inoff", + ] + y_train = ["offensive", "offensive", "inoffensive"] + nlp = spacy.blank("en") + # preparing the data + train_data = [] + for text, train_instance in zip(x_train, y_train): + cat_dict = {label: label == train_instance for label in unique_classes} + train_data.append(Example.from_dict(nlp.make_doc(text), {"cats": cat_dict})) + # add a text categorizer component + textcat = nlp.create_pipe( + "textcat", + config={"exclusive_classes": True, "architecture": "bow", "ngram_size": 2}, + ) + for label in unique_classes: + textcat.add_label(label) + nlp.add_pipe(textcat, last=True) + # training the network + with nlp.select_pipes(enable="textcat"): + optimizer = nlp.begin_training(X=x_train, Y=y_train) + for i in range(3): + losses = {} + batches = minibatch(train_data, size=compounding(4.0, 32.0, 1.001)) + + for batch in batches: + nlp.update( + examples=batch, sgd=optimizer, drop=0.1, losses=losses, + ) + + +def test_issue3625(): + """Test that default punctuation rules applies to hindi unicode characters""" + nlp = Hindi() + doc = nlp("hi. how हुए. 
होटल, होटल") + expected = ["hi", ".", "how", "हुए", ".", "होटल", ",", "होटल"] + assert [token.text for token in doc] == expected + + +def test_issue3803(): + """Test that spanish num-like tokens have True for like_num attribute.""" + nlp = Spanish() + text = "2 dos 1000 mil 12 doce" + doc = nlp(text) + + assert [t.like_num for t in doc] == [True, True, True, True, True, True] + + +@pytest.mark.filterwarnings("ignore::UserWarning") +def test_issue3830_no_subtok(): + """Test that the parser doesn't have subtok label if not learn_tokens""" + config = { + "learn_tokens": False, + "min_action_freq": 30, + "beam_width": 1, + "beam_update_prob": 1.0, + } + parser = DependencyParser(Vocab(), default_parser(), **config) + parser.add_label("nsubj") + assert "subtok" not in parser.labels + parser.begin_training(lambda: []) + assert "subtok" not in parser.labels + + +@pytest.mark.filterwarnings("ignore::UserWarning") +def test_issue3830_with_subtok(): + """Test that the parser does have subtok label if learn_tokens=True.""" + config = { + "learn_tokens": True, + "min_action_freq": 30, + "beam_width": 1, + "beam_update_prob": 1.0, + } + parser = DependencyParser(Vocab(), default_parser(), **config) + parser.add_label("nsubj") + assert "subtok" not in parser.labels + parser.begin_training(lambda: []) + assert "subtok" in parser.labels + + +def test_issue3839(en_vocab): + """Test that match IDs returned by the matcher are correct, are in the string """ + doc = Doc(en_vocab, words=["terrific", "group", "of", "people"]) + matcher = Matcher(en_vocab) + match_id = "PATTERN" + pattern1 = [{"LOWER": "terrific"}, {"OP": "?"}, {"LOWER": "group"}] + pattern2 = [{"LOWER": "terrific"}, {"OP": "?"}, {"OP": "?"}, {"LOWER": "group"}] + matcher.add(match_id, [pattern1]) + matches = matcher(doc) + assert matches[0][0] == en_vocab.strings[match_id] + matcher = Matcher(en_vocab) + matcher.add(match_id, [pattern2]) + matches = matcher(doc) + assert matches[0][0] == en_vocab.strings[match_id] + + 
+@pytest.mark.parametrize( + "sentence", + [ + "The story was to the effect that a young American student recently called on Professor Christlieb with a letter of introduction.", + "The next month Barry Siddall joined Stoke City on a free transfer, after Chris Pearce had established himself as the Vale's #1.", + "The next month Barry Siddall joined Stoke City on a free transfer, after Chris Pearce had established himself as the Vale's number one", + "Indeed, making the one who remains do all the work has installed him into a position of such insolent tyranny, it will take a month at least to reduce him to his proper proportions.", + "It was a missed assignment, but it shouldn't have resulted in a turnover ...", + ], +) +def test_issue3869(sentence): + """Test that the Doc's count_by function works consistently""" + nlp = English() + doc = nlp(sentence) + count = 0 + for token in doc: + count += token.is_alpha + assert count == doc.count_by(IS_ALPHA).get(1, 0) + + +def test_issue3879(en_vocab): + doc = Doc(en_vocab, words=["This", "is", "a", "test", "."]) + assert len(doc) == 5 + pattern = [{"ORTH": "This", "OP": "?"}, {"OP": "?"}, {"ORTH": "test"}] + matcher = Matcher(en_vocab) + matcher.add("TEST", [pattern]) + assert len(matcher(doc)) == 2 # fails because of a FP match 'is a test' + + +@pytest.mark.filterwarnings("ignore::UserWarning") +def test_issue3880(): + """Test that `nlp.pipe()` works when an empty string ends the batch. + + Fixed in v7.0.5 of Thinc. + """ + texts = ["hello", "world", "", ""] + nlp = English() + nlp.add_pipe(nlp.create_pipe("parser")) + nlp.add_pipe(nlp.create_pipe("ner")) + nlp.add_pipe(nlp.create_pipe("tagger")) + nlp.get_pipe("parser").add_label("dep") + nlp.get_pipe("ner").add_label("PERSON") + nlp.get_pipe("tagger").add_label("NN") + nlp.begin_training() + for doc in nlp.pipe(texts): + pass + + +def test_issue3882(en_vocab): + """Test that displaCy doesn't serialize the doc.user_data when making a + copy of the Doc. 
+ """ + doc = Doc(en_vocab, words=["Hello", "world"]) + doc.is_parsed = True + doc.user_data["test"] = set() + parse_deps(doc) + + +def test_issue3951(en_vocab): + """Test that combinations of optional rules are matched correctly.""" + matcher = Matcher(en_vocab) + pattern = [ + {"LOWER": "hello"}, + {"LOWER": "this", "OP": "?"}, + {"OP": "?"}, + {"LOWER": "world"}, + ] + matcher.add("TEST", [pattern]) + doc = Doc(en_vocab, words=["Hello", "my", "new", "world"]) + matches = matcher(doc) + assert len(matches) == 0 + + +def test_issue3959(): + """ Ensure that a modified pos attribute is serialized correctly.""" + nlp = English() + doc = nlp( + "displaCy uses JavaScript, SVG and CSS to show you how computers understand language" + ) + assert doc[0].pos_ == "" + doc[0].pos_ = "NOUN" + assert doc[0].pos_ == "NOUN" + # usually this is already True when starting from proper models instead of blank English + doc.is_tagged = True + with make_tempdir() as tmp_dir: + file_path = tmp_dir / "my_doc" + doc.to_disk(file_path) + doc2 = nlp("") + doc2.from_disk(file_path) + assert doc2[0].pos_ == "NOUN" + + +def test_issue3962(en_vocab): + """ Ensure that as_doc does not result in out-of-bound access of tokens. 
+ This is achieved by setting the head to itself if it would lie out of the span otherwise.""" + # fmt: off + words = ["He", "jests", "at", "scars", ",", "that", "never", "felt", "a", "wound", "."] + heads = [1, 6, -1, -1, 3, 2, 1, 0, 1, -2, -3] + deps = ["nsubj", "ccomp", "prep", "pobj", "punct", "nsubj", "neg", "ROOT", "det", "dobj", "punct"] + # fmt: on + doc = get_doc(en_vocab, words=words, heads=heads, deps=deps) + span2 = doc[1:5] # "jests at scars ," + doc2 = span2.as_doc() + doc2_json = doc2.to_json() + assert doc2_json + # head set to itself, being the new artificial root + assert doc2[0].head.text == "jests" + assert doc2[0].dep_ == "dep" + assert doc2[1].head.text == "jests" + assert doc2[1].dep_ == "prep" + assert doc2[2].head.text == "at" + assert doc2[2].dep_ == "pobj" + assert doc2[3].head.text == "jests" # head set to the new artificial root + assert doc2[3].dep_ == "dep" + # We should still have 1 sentence + assert len(list(doc2.sents)) == 1 + span3 = doc[6:9] # "never felt a" + doc3 = span3.as_doc() + doc3_json = doc3.to_json() + assert doc3_json + assert doc3[0].head.text == "felt" + assert doc3[0].dep_ == "neg" + assert doc3[1].head.text == "felt" + assert doc3[1].dep_ == "ROOT" + assert doc3[2].head.text == "felt" # head set to ancestor + assert doc3[2].dep_ == "dep" + # We should still have 1 sentence as "a" can be attached to "felt" instead of "wound" + assert len(list(doc3.sents)) == 1 + + +def test_issue3962_long(en_vocab): + """ Ensure that as_doc does not result in out-of-bound access of tokens. 
+ This is achieved by setting the head to itself if it would lie out of the span otherwise.""" + # fmt: off + words = ["He", "jests", "at", "scars", ".", "They", "never", "felt", "a", "wound", "."] + heads = [1, 0, -1, -1, -3, 2, 1, 0, 1, -2, -3] + deps = ["nsubj", "ROOT", "prep", "pobj", "punct", "nsubj", "neg", "ROOT", "det", "dobj", "punct"] + # fmt: on + two_sent_doc = get_doc(en_vocab, words=words, heads=heads, deps=deps) + span2 = two_sent_doc[1:7] # "jests at scars. They never" + doc2 = span2.as_doc() + doc2_json = doc2.to_json() + assert doc2_json + # head set to itself, being the new artificial root (in sentence 1) + assert doc2[0].head.text == "jests" + assert doc2[0].dep_ == "ROOT" + assert doc2[1].head.text == "jests" + assert doc2[1].dep_ == "prep" + assert doc2[2].head.text == "at" + assert doc2[2].dep_ == "pobj" + assert doc2[3].head.text == "jests" + assert doc2[3].dep_ == "punct" + # head set to itself, being the new artificial root (in sentence 2) + assert doc2[4].head.text == "They" + assert doc2[4].dep_ == "dep" + # head set to the new artificial head (in sentence 2) + assert doc2[4].head.text == "They" + assert doc2[4].dep_ == "dep" + # We should still have 2 sentences + sents = list(doc2.sents) + assert len(sents) == 2 + assert sents[0].text == "jests at scars ." + assert sents[1].text == "They never" + + +def test_issue3972(en_vocab): + """Test that the PhraseMatcher returns duplicates for duplicate match IDs. 
+ """ + matcher = PhraseMatcher(en_vocab) + matcher.add("A", [Doc(en_vocab, words=["New", "York"])]) + matcher.add("B", [Doc(en_vocab, words=["New", "York"])]) + doc = Doc(en_vocab, words=["I", "live", "in", "New", "York"]) + matches = matcher(doc) + + assert len(matches) == 2 + + # We should have a match for each of the two rules + found_ids = [en_vocab.strings[ent_id] for (ent_id, _, _) in matches] + assert "A" in found_ids + assert "B" in found_ids diff --git a/spacy/tests/regression/test_issue3521.py b/spacy/tests/regression/test_issue3521.py deleted file mode 100644 index 3d8ee9922..000000000 --- a/spacy/tests/regression/test_issue3521.py +++ /dev/null @@ -1,8 +0,0 @@ -import pytest - - -@pytest.mark.parametrize("word", ["don't", "don’t", "I'd", "I’d"]) -def test_issue3521(en_tokenizer, word): - tok = en_tokenizer(word)[1] - # 'not' and 'would' should be stopwords, also in their abbreviated forms - assert tok.is_stop diff --git a/spacy/tests/regression/test_issue3526.py b/spacy/tests/regression/test_issue3526.py deleted file mode 100644 index aa77028fb..000000000 --- a/spacy/tests/regression/test_issue3526.py +++ /dev/null @@ -1,85 +0,0 @@ -import pytest -from spacy.tokens import Span -from spacy.language import Language -from spacy.pipeline import EntityRuler -from spacy import load -import srsly - -from ..util import make_tempdir - - -@pytest.fixture -def patterns(): - return [ - {"label": "HELLO", "pattern": "hello world"}, - {"label": "BYE", "pattern": [{"LOWER": "bye"}, {"LOWER": "bye"}]}, - {"label": "HELLO", "pattern": [{"ORTH": "HELLO"}]}, - {"label": "COMPLEX", "pattern": [{"ORTH": "foo", "OP": "*"}]}, - {"label": "TECH_ORG", "pattern": "Apple", "id": "a1"}, - ] - - -@pytest.fixture -def add_ent(): - def add_ent_component(doc): - doc.ents = [Span(doc, 0, 3, label=doc.vocab.strings["ORG"])] - return doc - - return add_ent_component - - -def test_entity_ruler_existing_overwrite_serialize_bytes(patterns, en_vocab): - nlp = Language(vocab=en_vocab) - 
ruler = EntityRuler(nlp, patterns=patterns, overwrite_ents=True) - ruler_bytes = ruler.to_bytes() - assert len(ruler) == len(patterns) - assert len(ruler.labels) == 4 - assert ruler.overwrite - new_ruler = EntityRuler(nlp) - new_ruler = new_ruler.from_bytes(ruler_bytes) - assert len(new_ruler) == len(ruler) - assert len(new_ruler.labels) == 4 - assert new_ruler.overwrite == ruler.overwrite - assert new_ruler.ent_id_sep == ruler.ent_id_sep - - -def test_entity_ruler_existing_bytes_old_format_safe(patterns, en_vocab): - nlp = Language(vocab=en_vocab) - ruler = EntityRuler(nlp, patterns=patterns, overwrite_ents=True) - bytes_old_style = srsly.msgpack_dumps(ruler.patterns) - new_ruler = EntityRuler(nlp) - new_ruler = new_ruler.from_bytes(bytes_old_style) - assert len(new_ruler) == len(ruler) - for pattern in ruler.patterns: - assert pattern in new_ruler.patterns - assert new_ruler.overwrite is not ruler.overwrite - - -def test_entity_ruler_from_disk_old_format_safe(patterns, en_vocab): - nlp = Language(vocab=en_vocab) - ruler = EntityRuler(nlp, patterns=patterns, overwrite_ents=True) - with make_tempdir() as tmpdir: - out_file = tmpdir / "entity_ruler" - srsly.write_jsonl(out_file.with_suffix(".jsonl"), ruler.patterns) - new_ruler = EntityRuler(nlp).from_disk(out_file) - for pattern in ruler.patterns: - assert pattern in new_ruler.patterns - assert len(new_ruler) == len(ruler) - assert new_ruler.overwrite is not ruler.overwrite - - -def test_entity_ruler_in_pipeline_from_issue(patterns, en_vocab): - nlp = Language(vocab=en_vocab) - ruler = EntityRuler(nlp, overwrite_ents=True) - - ruler.add_patterns([{"label": "ORG", "pattern": "Apple"}]) - nlp.add_pipe(ruler) - with make_tempdir() as tmpdir: - nlp.to_disk(tmpdir) - ruler = nlp.get_pipe("entity_ruler") - assert ruler.patterns == [{"label": "ORG", "pattern": "Apple"}] - assert ruler.overwrite is True - nlp2 = load(tmpdir) - new_ruler = nlp2.get_pipe("entity_ruler") - assert new_ruler.patterns == [{"label": "ORG", 
"pattern": "Apple"}] - assert new_ruler.overwrite is True diff --git a/spacy/tests/regression/test_issue3531.py b/spacy/tests/regression/test_issue3531.py deleted file mode 100644 index 4c65a5bfe..000000000 --- a/spacy/tests/regression/test_issue3531.py +++ /dev/null @@ -1,30 +0,0 @@ -from spacy import displacy - - -def test_issue3531(): - """Test that displaCy renderer doesn't require "settings" key.""" - example_dep = { - "words": [ - {"text": "But", "tag": "CCONJ"}, - {"text": "Google", "tag": "PROPN"}, - {"text": "is", "tag": "VERB"}, - {"text": "starting", "tag": "VERB"}, - {"text": "from", "tag": "ADP"}, - {"text": "behind.", "tag": "ADV"}, - ], - "arcs": [ - {"start": 0, "end": 3, "label": "cc", "dir": "left"}, - {"start": 1, "end": 3, "label": "nsubj", "dir": "left"}, - {"start": 2, "end": 3, "label": "aux", "dir": "left"}, - {"start": 3, "end": 4, "label": "prep", "dir": "right"}, - {"start": 4, "end": 5, "label": "pcomp", "dir": "right"}, - ], - } - example_ent = { - "text": "But Google is starting from behind.", - "ents": [{"start": 4, "end": 10, "label": "ORG"}], - } - dep_html = displacy.render(example_dep, style="dep", manual=True) - assert dep_html - ent_html = displacy.render(example_ent, style="ent", manual=True) - assert ent_html diff --git a/spacy/tests/regression/test_issue3540.py b/spacy/tests/regression/test_issue3540.py deleted file mode 100644 index be9e04b0b..000000000 --- a/spacy/tests/regression/test_issue3540.py +++ /dev/null @@ -1,44 +0,0 @@ -from spacy.tokens import Doc - -import numpy as np - - -def test_issue3540(en_vocab): - - words = ["I", "live", "in", "NewYork", "right", "now"] - tensor = np.asarray( - [[1.0, 1.1], [2.0, 2.1], [3.0, 3.1], [4.0, 4.1], [5.0, 5.1], [6.0, 6.1]], - dtype="f", - ) - doc = Doc(en_vocab, words=words) - doc.tensor = tensor - - gold_text = ["I", "live", "in", "NewYork", "right", "now"] - assert [token.text for token in doc] == gold_text - - gold_lemma = ["I", "live", "in", "NewYork", "right", "now"] - 
assert [token.lemma_ for token in doc] == gold_lemma - - vectors_1 = [token.vector for token in doc] - assert len(vectors_1) == len(doc) - - with doc.retokenize() as retokenizer: - heads = [(doc[3], 1), doc[2]] - attrs = {"POS": ["PROPN", "PROPN"], "DEP": ["pobj", "compound"]} - retokenizer.split(doc[3], ["New", "York"], heads=heads, attrs=attrs) - - gold_text = ["I", "live", "in", "New", "York", "right", "now"] - assert [token.text for token in doc] == gold_text - - gold_lemma = ["I", "live", "in", "New", "York", "right", "now"] - assert [token.lemma_ for token in doc] == gold_lemma - - vectors_2 = [token.vector for token in doc] - assert len(vectors_2) == len(doc) - - assert vectors_1[0].tolist() == vectors_2[0].tolist() - assert vectors_1[1].tolist() == vectors_2[1].tolist() - assert vectors_1[2].tolist() == vectors_2[2].tolist() - - assert vectors_1[4].tolist() == vectors_2[5].tolist() - assert vectors_1[5].tolist() == vectors_2[6].tolist() diff --git a/spacy/tests/regression/test_issue3549.py b/spacy/tests/regression/test_issue3549.py deleted file mode 100644 index b3af59c2e..000000000 --- a/spacy/tests/regression/test_issue3549.py +++ /dev/null @@ -1,12 +0,0 @@ -import pytest -from spacy.matcher import Matcher -from spacy.errors import MatchPatternError - - -def test_issue3549(en_vocab): - """Test that match pattern validation doesn't raise on empty errors.""" - matcher = Matcher(en_vocab, validate=True) - pattern = [{"LOWER": "hello"}, {"LOWER": "world"}] - matcher.add("GOOD", [pattern]) - with pytest.raises(MatchPatternError): - matcher.add("BAD", [[{"X": "Y"}]]) diff --git a/spacy/tests/regression/test_issue3555.py b/spacy/tests/regression/test_issue3555.py deleted file mode 100644 index de047bcbc..000000000 --- a/spacy/tests/regression/test_issue3555.py +++ /dev/null @@ -1,14 +0,0 @@ -import pytest -from spacy.tokens import Doc, Token -from spacy.matcher import Matcher - - -@pytest.mark.xfail -def test_issue3555(en_vocab): - """Test that custom extensions 
with default None don't break matcher.""" - Token.set_extension("issue3555", default=None) - matcher = Matcher(en_vocab) - pattern = [{"LEMMA": "have"}, {"_": {"issue3555": True}}] - matcher.add("TEST", [pattern]) - doc = Doc(en_vocab, words=["have", "apple"]) - matcher(doc) diff --git a/spacy/tests/regression/test_issue3611.py b/spacy/tests/regression/test_issue3611.py deleted file mode 100644 index ef189c446..000000000 --- a/spacy/tests/regression/test_issue3611.py +++ /dev/null @@ -1,45 +0,0 @@ -import spacy -from spacy.util import minibatch -from thinc.api import compounding -from spacy.gold import Example - - -def test_issue3611(): - """ Test whether adding n-grams in the textcat works even when n > token length of some docs """ - unique_classes = ["offensive", "inoffensive"] - x_train = [ - "This is an offensive text", - "This is the second offensive text", - "inoff", - ] - y_train = ["offensive", "offensive", "inoffensive"] - - nlp = spacy.blank("en") - - # preparing the data - train_data = [] - for text, train_instance in zip(x_train, y_train): - cat_dict = {label: label == train_instance for label in unique_classes} - train_data.append(Example.from_dict(nlp.make_doc(text), {"cats": cat_dict})) - - # add a text categorizer component - textcat = nlp.create_pipe( - "textcat", - config={"exclusive_classes": True, "architecture": "bow", "ngram_size": 2}, - ) - - for label in unique_classes: - textcat.add_label(label) - nlp.add_pipe(textcat, last=True) - - # training the network - with nlp.select_pipes(enable="textcat"): - optimizer = nlp.begin_training(X=x_train, Y=y_train) - for i in range(3): - losses = {} - batches = minibatch(train_data, size=compounding(4.0, 32.0, 1.001)) - - for batch in batches: - nlp.update( - examples=batch, sgd=optimizer, drop=0.1, losses=losses, - ) diff --git a/spacy/tests/regression/test_issue3625.py b/spacy/tests/regression/test_issue3625.py deleted file mode 100644 index 51561b3ac..000000000 --- 
a/spacy/tests/regression/test_issue3625.py +++ /dev/null @@ -1,9 +0,0 @@ -from spacy.lang.hi import Hindi - - -def test_issue3625(): - """Test that default punctuation rules applies to hindi unicode characters""" - nlp = Hindi() - doc = nlp("hi. how हुए. होटल, होटल") - expected = ["hi", ".", "how", "हुए", ".", "होटल", ",", "होटल"] - assert [token.text for token in doc] == expected diff --git a/spacy/tests/regression/test_issue3803.py b/spacy/tests/regression/test_issue3803.py deleted file mode 100644 index ab5250edf..000000000 --- a/spacy/tests/regression/test_issue3803.py +++ /dev/null @@ -1,10 +0,0 @@ -from spacy.lang.es import Spanish - - -def test_issue3803(): - """Test that spanish num-like tokens have True for like_num attribute.""" - nlp = Spanish() - text = "2 dos 1000 mil 12 doce" - doc = nlp(text) - - assert [t.like_num for t in doc] == [True, True, True, True, True, True] diff --git a/spacy/tests/regression/test_issue3830.py b/spacy/tests/regression/test_issue3830.py deleted file mode 100644 index 06b7893a7..000000000 --- a/spacy/tests/regression/test_issue3830.py +++ /dev/null @@ -1,34 +0,0 @@ -from spacy.pipeline.pipes import DependencyParser -from spacy.vocab import Vocab - -from spacy.pipeline.defaults import default_parser - - -def test_issue3830_no_subtok(): - """Test that the parser doesn't have subtok label if not learn_tokens""" - config = { - "learn_tokens": False, - "min_action_freq": 30, - "beam_width": 1, - "beam_update_prob": 1.0, - } - parser = DependencyParser(Vocab(), default_parser(), **config) - parser.add_label("nsubj") - assert "subtok" not in parser.labels - parser.begin_training(lambda: []) - assert "subtok" not in parser.labels - - -def test_issue3830_with_subtok(): - """Test that the parser does have subtok label if learn_tokens=True.""" - config = { - "learn_tokens": True, - "min_action_freq": 30, - "beam_width": 1, - "beam_update_prob": 1.0, - } - parser = DependencyParser(Vocab(), default_parser(), **config) - 
parser.add_label("nsubj") - assert "subtok" not in parser.labels - parser.begin_training(lambda: []) - assert "subtok" in parser.labels diff --git a/spacy/tests/regression/test_issue3839.py b/spacy/tests/regression/test_issue3839.py deleted file mode 100644 index 27b1f5f29..000000000 --- a/spacy/tests/regression/test_issue3839.py +++ /dev/null @@ -1,18 +0,0 @@ -from spacy.matcher import Matcher -from spacy.tokens import Doc - - -def test_issue3839(en_vocab): - """Test that match IDs returned by the matcher are correct, are in the string """ - doc = Doc(en_vocab, words=["terrific", "group", "of", "people"]) - matcher = Matcher(en_vocab) - match_id = "PATTERN" - pattern1 = [{"LOWER": "terrific"}, {"OP": "?"}, {"LOWER": "group"}] - pattern2 = [{"LOWER": "terrific"}, {"OP": "?"}, {"OP": "?"}, {"LOWER": "group"}] - matcher.add(match_id, [pattern1]) - matches = matcher(doc) - assert matches[0][0] == en_vocab.strings[match_id] - matcher = Matcher(en_vocab) - matcher.add(match_id, [pattern2]) - matches = matcher(doc) - assert matches[0][0] == en_vocab.strings[match_id] diff --git a/spacy/tests/regression/test_issue3869.py b/spacy/tests/regression/test_issue3869.py deleted file mode 100644 index 0a851e869..000000000 --- a/spacy/tests/regression/test_issue3869.py +++ /dev/null @@ -1,25 +0,0 @@ -import pytest -from spacy.attrs import IS_ALPHA -from spacy.lang.en import English - - -@pytest.mark.parametrize( - "sentence", - [ - "The story was to the effect that a young American student recently called on Professor Christlieb with a letter of introduction.", - "The next month Barry Siddall joined Stoke City on a free transfer, after Chris Pearce had established himself as the Vale's #1.", - "The next month Barry Siddall joined Stoke City on a free transfer, after Chris Pearce had established himself as the Vale's number one", - "Indeed, making the one who remains do all the work has installed him into a position of such insolent tyranny, it will take a month at least to reduce 
him to his proper proportions.", - "It was a missed assignment, but it shouldn't have resulted in a turnover ...", - ], -) -def test_issue3869(sentence): - """Test that the Doc's count_by function works consistently""" - nlp = English() - doc = nlp(sentence) - - count = 0 - for token in doc: - count += token.is_alpha - - assert count == doc.count_by(IS_ALPHA).get(1, 0) diff --git a/spacy/tests/regression/test_issue3879.py b/spacy/tests/regression/test_issue3879.py deleted file mode 100644 index 8500c09aa..000000000 --- a/spacy/tests/regression/test_issue3879.py +++ /dev/null @@ -1,11 +0,0 @@ -from spacy.matcher import Matcher -from spacy.tokens import Doc - - -def test_issue3879(en_vocab): - doc = Doc(en_vocab, words=["This", "is", "a", "test", "."]) - assert len(doc) == 5 - pattern = [{"ORTH": "This", "OP": "?"}, {"OP": "?"}, {"ORTH": "test"}] - matcher = Matcher(en_vocab) - matcher.add("TEST", [pattern]) - assert len(matcher(doc)) == 2 # fails because of a FP match 'is a test' diff --git a/spacy/tests/regression/test_issue3880.py b/spacy/tests/regression/test_issue3880.py deleted file mode 100644 index 6e8ab6f43..000000000 --- a/spacy/tests/regression/test_issue3880.py +++ /dev/null @@ -1,21 +0,0 @@ -from spacy.lang.en import English -import pytest - - -@pytest.mark.filterwarnings("ignore::UserWarning") -def test_issue3880(): - """Test that `nlp.pipe()` works when an empty string ends the batch. - - Fixed in v7.0.5 of Thinc. 
- """ - texts = ["hello", "world", "", ""] - nlp = English() - nlp.add_pipe(nlp.create_pipe("parser")) - nlp.add_pipe(nlp.create_pipe("ner")) - nlp.add_pipe(nlp.create_pipe("tagger")) - nlp.get_pipe("parser").add_label("dep") - nlp.get_pipe("ner").add_label("PERSON") - nlp.get_pipe("tagger").add_label("NN") - nlp.begin_training() - for doc in nlp.pipe(texts): - pass diff --git a/spacy/tests/regression/test_issue3882.py b/spacy/tests/regression/test_issue3882.py deleted file mode 100644 index fa616db1d..000000000 --- a/spacy/tests/regression/test_issue3882.py +++ /dev/null @@ -1,12 +0,0 @@ -from spacy.displacy import parse_deps -from spacy.tokens import Doc - - -def test_issue3882(en_vocab): - """Test that displaCy doesn't serialize the doc.user_data when making a - copy of the Doc. - """ - doc = Doc(en_vocab, words=["Hello", "world"]) - doc.is_parsed = True - doc.user_data["test"] = set() - parse_deps(doc) diff --git a/spacy/tests/regression/test_issue3951.py b/spacy/tests/regression/test_issue3951.py deleted file mode 100644 index 6e4c9eeaa..000000000 --- a/spacy/tests/regression/test_issue3951.py +++ /dev/null @@ -1,17 +0,0 @@ -from spacy.matcher import Matcher -from spacy.tokens import Doc - - -def test_issue3951(en_vocab): - """Test that combinations of optional rules are matched correctly.""" - matcher = Matcher(en_vocab) - pattern = [ - {"LOWER": "hello"}, - {"LOWER": "this", "OP": "?"}, - {"OP": "?"}, - {"LOWER": "world"}, - ] - matcher.add("TEST", [pattern]) - doc = Doc(en_vocab, words=["Hello", "my", "new", "world"]) - matches = matcher(doc) - assert len(matches) == 0 diff --git a/spacy/tests/regression/test_issue3959.py b/spacy/tests/regression/test_issue3959.py deleted file mode 100644 index 7db28a31f..000000000 --- a/spacy/tests/regression/test_issue3959.py +++ /dev/null @@ -1,26 +0,0 @@ -from spacy.lang.en import English -from ..util import make_tempdir - - -def test_issue3959(): - """ Ensure that a modified pos attribute is serialized correctly.""" - 
nlp = English() - doc = nlp( - "displaCy uses JavaScript, SVG and CSS to show you how computers understand language" - ) - assert doc[0].pos_ == "" - - doc[0].pos_ = "NOUN" - assert doc[0].pos_ == "NOUN" - - # usually this is already True when starting from proper models instead of blank English - doc.is_tagged = True - - with make_tempdir() as tmp_dir: - file_path = tmp_dir / "my_doc" - doc.to_disk(file_path) - - doc2 = nlp("") - doc2.from_disk(file_path) - - assert doc2[0].pos_ == "NOUN" diff --git a/spacy/tests/regression/test_issue3962.py b/spacy/tests/regression/test_issue3962.py deleted file mode 100644 index 971c9b08e..000000000 --- a/spacy/tests/regression/test_issue3962.py +++ /dev/null @@ -1,117 +0,0 @@ -import pytest - -from ..util import get_doc - - -@pytest.fixture -def doc(en_tokenizer): - text = "He jests at scars, that never felt a wound." - heads = [1, 6, -1, -1, 3, 2, 1, 0, 1, -2, -3] - deps = [ - "nsubj", - "ccomp", - "prep", - "pobj", - "punct", - "nsubj", - "neg", - "ROOT", - "det", - "dobj", - "punct", - ] - tokens = en_tokenizer(text) - return get_doc(tokens.vocab, words=[t.text for t in tokens], heads=heads, deps=deps) - - -def test_issue3962(doc): - """ Ensure that as_doc does not result in out-of-bound access of tokens. 
- This is achieved by setting the head to itself if it would lie out of the span otherwise.""" - span2 = doc[1:5] # "jests at scars ," - doc2 = span2.as_doc() - doc2_json = doc2.to_json() - assert doc2_json - - assert ( - doc2[0].head.text == "jests" - ) # head set to itself, being the new artificial root - assert doc2[0].dep_ == "dep" - assert doc2[1].head.text == "jests" - assert doc2[1].dep_ == "prep" - assert doc2[2].head.text == "at" - assert doc2[2].dep_ == "pobj" - assert doc2[3].head.text == "jests" # head set to the new artificial root - assert doc2[3].dep_ == "dep" - - # We should still have 1 sentence - assert len(list(doc2.sents)) == 1 - - span3 = doc[6:9] # "never felt a" - doc3 = span3.as_doc() - doc3_json = doc3.to_json() - assert doc3_json - - assert doc3[0].head.text == "felt" - assert doc3[0].dep_ == "neg" - assert doc3[1].head.text == "felt" - assert doc3[1].dep_ == "ROOT" - assert doc3[2].head.text == "felt" # head set to ancestor - assert doc3[2].dep_ == "dep" - - # We should still have 1 sentence as "a" can be attached to "felt" instead of "wound" - assert len(list(doc3.sents)) == 1 - - -@pytest.fixture -def two_sent_doc(en_tokenizer): - text = "He jests at scars. They never felt a wound." - heads = [1, 0, -1, -1, -3, 2, 1, 0, 1, -2, -3] - deps = [ - "nsubj", - "ROOT", - "prep", - "pobj", - "punct", - "nsubj", - "neg", - "ROOT", - "det", - "dobj", - "punct", - ] - tokens = en_tokenizer(text) - return get_doc(tokens.vocab, words=[t.text for t in tokens], heads=heads, deps=deps) - - -def test_issue3962_long(two_sent_doc): - """ Ensure that as_doc does not result in out-of-bound access of tokens. - This is achieved by setting the head to itself if it would lie out of the span otherwise.""" - span2 = two_sent_doc[1:7] # "jests at scars. 
They never" - doc2 = span2.as_doc() - doc2_json = doc2.to_json() - assert doc2_json - - assert ( - doc2[0].head.text == "jests" - ) # head set to itself, being the new artificial root (in sentence 1) - assert doc2[0].dep_ == "ROOT" - assert doc2[1].head.text == "jests" - assert doc2[1].dep_ == "prep" - assert doc2[2].head.text == "at" - assert doc2[2].dep_ == "pobj" - assert doc2[3].head.text == "jests" - assert doc2[3].dep_ == "punct" - assert ( - doc2[4].head.text == "They" - ) # head set to itself, being the new artificial root (in sentence 2) - assert doc2[4].dep_ == "dep" - assert ( - doc2[4].head.text == "They" - ) # head set to the new artificial head (in sentence 2) - assert doc2[4].dep_ == "dep" - - # We should still have 2 sentences - sents = list(doc2.sents) - assert len(sents) == 2 - assert sents[0].text == "jests at scars ." - assert sents[1].text == "They never" diff --git a/spacy/tests/regression/test_issue3972.py b/spacy/tests/regression/test_issue3972.py deleted file mode 100644 index fe5388950..000000000 --- a/spacy/tests/regression/test_issue3972.py +++ /dev/null @@ -1,19 +0,0 @@ -from spacy.matcher import PhraseMatcher -from spacy.tokens import Doc - - -def test_issue3972(en_vocab): - """Test that the PhraseMatcher returns duplicates for duplicate match IDs. 
- """ - matcher = PhraseMatcher(en_vocab) - matcher.add("A", [Doc(en_vocab, words=["New", "York"])]) - matcher.add("B", [Doc(en_vocab, words=["New", "York"])]) - doc = Doc(en_vocab, words=["I", "live", "in", "New", "York"]) - matches = matcher(doc) - - assert len(matches) == 2 - - # We should have a match for each of the two rules - found_ids = [en_vocab.strings[ent_id] for (ent_id, _, _) in matches] - assert "A" in found_ids - assert "B" in found_ids diff --git a/spacy/tests/regression/test_issue4001-4500.py b/spacy/tests/regression/test_issue4001-4500.py new file mode 100644 index 000000000..2981c6428 --- /dev/null +++ b/spacy/tests/regression/test_issue4001-4500.py @@ -0,0 +1,469 @@ +import pytest +from spacy.pipeline import EntityRuler, EntityRecognizer, Pipe +from spacy.pipeline.defaults import default_ner +from spacy.matcher import PhraseMatcher, Matcher +from spacy.tokens import Doc, Span, DocBin +from spacy.gold import Example, Corpus +from spacy.gold.converters import json2docs +from spacy.vocab import Vocab +from spacy.lang.en import English +from spacy.util import minibatch, ensure_path, load_model +from spacy.util import compile_prefix_regex, compile_suffix_regex, compile_infix_regex +from spacy.tokenizer import Tokenizer +from spacy.lang.el import Greek +from spacy.language import Language +import spacy +from thinc.api import compounding +from collections import defaultdict + +from ..util import make_tempdir + + +def test_issue4002(en_vocab): + """Test that the PhraseMatcher can match on overwritten NORM attributes. 
+ """ + matcher = PhraseMatcher(en_vocab, attr="NORM") + pattern1 = Doc(en_vocab, words=["c", "d"]) + assert [t.norm_ for t in pattern1] == ["c", "d"] + matcher.add("TEST", [pattern1]) + doc = Doc(en_vocab, words=["a", "b", "c", "d"]) + assert [t.norm_ for t in doc] == ["a", "b", "c", "d"] + matches = matcher(doc) + assert len(matches) == 1 + matcher = PhraseMatcher(en_vocab, attr="NORM") + pattern2 = Doc(en_vocab, words=["1", "2"]) + pattern2[0].norm_ = "c" + pattern2[1].norm_ = "d" + assert [t.norm_ for t in pattern2] == ["c", "d"] + matcher.add("TEST", [pattern2]) + matches = matcher(doc) + assert len(matches) == 1 + + +def test_issue4030(): + """ Test whether textcat works fine with empty doc """ + unique_classes = ["offensive", "inoffensive"] + x_train = [ + "This is an offensive text", + "This is the second offensive text", + "inoff", + ] + y_train = ["offensive", "offensive", "inoffensive"] + nlp = spacy.blank("en") + # preparing the data + train_data = [] + for text, train_instance in zip(x_train, y_train): + cat_dict = {label: label == train_instance for label in unique_classes} + train_data.append(Example.from_dict(nlp.make_doc(text), {"cats": cat_dict})) + # add a text categorizer component + textcat = nlp.create_pipe( + "textcat", + config={"exclusive_classes": True, "architecture": "bow", "ngram_size": 2}, + ) + for label in unique_classes: + textcat.add_label(label) + nlp.add_pipe(textcat, last=True) + # training the network + with nlp.select_pipes(enable="textcat"): + optimizer = nlp.begin_training() + for i in range(3): + losses = {} + batches = minibatch(train_data, size=compounding(4.0, 32.0, 1.001)) + + for batch in batches: + nlp.update( + examples=batch, sgd=optimizer, drop=0.1, losses=losses, + ) + # processing of an empty doc should result in 0.0 for all categories + doc = nlp("") + assert doc.cats["offensive"] == 0.0 + assert doc.cats["inoffensive"] == 0.0 + + +@pytest.mark.filterwarnings("ignore::UserWarning") +def test_issue4042(): + 
"""Test that serialization of an EntityRuler before NER works fine.""" + nlp = English() + + # add ner pipe + ner = nlp.create_pipe("ner") + ner.add_label("SOME_LABEL") + nlp.add_pipe(ner) + nlp.begin_training() + + # Add entity ruler + ruler = EntityRuler(nlp) + patterns = [ + {"label": "MY_ORG", "pattern": "Apple"}, + {"label": "MY_GPE", "pattern": [{"lower": "san"}, {"lower": "francisco"}]}, + ] + ruler.add_patterns(patterns) + nlp.add_pipe(ruler, before="ner") # works fine with "after" + doc1 = nlp("What do you think about Apple ?") + assert doc1.ents[0].label_ == "MY_ORG" + + with make_tempdir() as d: + output_dir = ensure_path(d) + if not output_dir.exists(): + output_dir.mkdir() + nlp.to_disk(output_dir) + + nlp2 = load_model(output_dir) + doc2 = nlp2("What do you think about Apple ?") + assert doc2.ents[0].label_ == "MY_ORG" + + +@pytest.mark.filterwarnings("ignore::UserWarning") +def test_issue4042_bug2(): + """ + Test that serialization of an NER works fine when new labels were added. + This is the second bug of two bugs underlying the issue 4042. 
+ """ + nlp1 = English() + vocab = nlp1.vocab + + # add ner pipe + ner1 = nlp1.create_pipe("ner") + ner1.add_label("SOME_LABEL") + nlp1.add_pipe(ner1) + nlp1.begin_training() + + # add a new label to the doc + doc1 = nlp1("What do you think about Apple ?") + assert len(ner1.labels) == 1 + assert "SOME_LABEL" in ner1.labels + apple_ent = Span(doc1, 5, 6, label="MY_ORG") + doc1.ents = list(doc1.ents) + [apple_ent] + + # reapply the NER - at this point it should resize itself + ner1(doc1) + assert len(ner1.labels) == 2 + assert "SOME_LABEL" in ner1.labels + assert "MY_ORG" in ner1.labels + + with make_tempdir() as d: + # assert IO goes fine + output_dir = ensure_path(d) + if not output_dir.exists(): + output_dir.mkdir() + ner1.to_disk(output_dir) + + config = { + "learn_tokens": False, + "min_action_freq": 30, + "beam_width": 1, + "beam_update_prob": 1.0, + } + ner2 = EntityRecognizer(vocab, default_ner(), **config) + ner2.from_disk(output_dir) + assert len(ner2.labels) == 2 + + +def test_issue4054(en_vocab): + """Test that a new blank model can be made with a vocab from file, + and that serialization does not drop the language at any point.""" + nlp1 = English() + vocab1 = nlp1.vocab + with make_tempdir() as d: + vocab_dir = ensure_path(d / "vocab") + if not vocab_dir.exists(): + vocab_dir.mkdir() + vocab1.to_disk(vocab_dir) + vocab2 = Vocab().from_disk(vocab_dir) + print("lang", vocab2.lang) + nlp2 = spacy.blank("en", vocab=vocab2) + nlp_dir = ensure_path(d / "nlp") + if not nlp_dir.exists(): + nlp_dir.mkdir() + nlp2.to_disk(nlp_dir) + nlp3 = load_model(nlp_dir) + assert nlp3.lang == "en" + + +def test_issue4120(en_vocab): + """Test that matches without a final {OP: ?} token are returned.""" + matcher = Matcher(en_vocab) + matcher.add("TEST", [[{"ORTH": "a"}, {"OP": "?"}]]) + doc1 = Doc(en_vocab, words=["a"]) + assert len(matcher(doc1)) == 1 # works + doc2 = Doc(en_vocab, words=["a", "b", "c"]) + assert len(matcher(doc2)) == 2 # fixed + matcher = Matcher(en_vocab) + 
matcher.add("TEST", [[{"ORTH": "a"}, {"OP": "?"}, {"ORTH": "b"}]]) + doc3 = Doc(en_vocab, words=["a", "b", "b", "c"]) + assert len(matcher(doc3)) == 2 # works + matcher = Matcher(en_vocab) + matcher.add("TEST", [[{"ORTH": "a"}, {"OP": "?"}, {"ORTH": "b", "OP": "?"}]]) + doc4 = Doc(en_vocab, words=["a", "b", "b", "c"]) + assert len(matcher(doc4)) == 3 # fixed + + +def test_issue4133(en_vocab): + nlp = English() + vocab_bytes = nlp.vocab.to_bytes() + words = ["Apple", "is", "looking", "at", "buying", "a", "startup"] + pos = ["NOUN", "VERB", "ADP", "VERB", "PROPN", "NOUN", "ADP"] + doc = Doc(en_vocab, words=words) + for i, token in enumerate(doc): + token.pos_ = pos[i] + # usually this is already True when starting from proper models instead of blank English + doc.is_tagged = True + doc_bytes = doc.to_bytes() + vocab = Vocab() + vocab = vocab.from_bytes(vocab_bytes) + doc = Doc(vocab).from_bytes(doc_bytes) + actual = [] + for token in doc: + actual.append(token.pos_) + assert actual == pos + + +def test_issue4190(): + def customize_tokenizer(nlp): + prefix_re = compile_prefix_regex(nlp.Defaults.prefixes) + suffix_re = compile_suffix_regex(nlp.Defaults.suffixes) + infix_re = compile_infix_regex(nlp.Defaults.infixes) + # Remove all exceptions where a single letter is followed by a period (e.g. 'h.') + exceptions = { + k: v + for k, v in dict(nlp.Defaults.tokenizer_exceptions).items() + if not (len(k) == 2 and k[1] == ".") + } + new_tokenizer = Tokenizer( + nlp.vocab, + exceptions, + prefix_search=prefix_re.search, + suffix_search=suffix_re.search, + infix_finditer=infix_re.finditer, + token_match=nlp.tokenizer.token_match, + ) + nlp.tokenizer = new_tokenizer + + test_string = "Test c." 
+ # Load default language + nlp_1 = English() + doc_1a = nlp_1(test_string) + result_1a = [token.text for token in doc_1a] # noqa: F841 + # Modify tokenizer + customize_tokenizer(nlp_1) + doc_1b = nlp_1(test_string) + result_1b = [token.text for token in doc_1b] + # Save and Reload + with make_tempdir() as model_dir: + nlp_1.to_disk(model_dir) + nlp_2 = load_model(model_dir) + # This should be the modified tokenizer + doc_2 = nlp_2(test_string) + result_2 = [token.text for token in doc_2] + assert result_1b == result_2 + + +@pytest.mark.filterwarnings("ignore::UserWarning") +def test_issue4267(): + """ Test that running an entity_ruler after ner gives consistent results""" + nlp = English() + ner = nlp.create_pipe("ner") + ner.add_label("PEOPLE") + nlp.add_pipe(ner) + nlp.begin_training() + assert "ner" in nlp.pipe_names + # assert that we have correct IOB annotations + doc1 = nlp("hi") + assert doc1.is_nered + for token in doc1: + assert token.ent_iob == 2 + # add entity ruler and run again + ruler = EntityRuler(nlp) + patterns = [{"label": "SOFTWARE", "pattern": "spacy"}] + ruler.add_patterns(patterns) + nlp.add_pipe(ruler) + assert "entity_ruler" in nlp.pipe_names + assert "ner" in nlp.pipe_names + # assert that we still have correct IOB annotations + doc2 = nlp("hi") + assert doc2.is_nered + for token in doc2: + assert token.ent_iob == 2 + + +def test_issue4272(): + """Test that lookup table can be accessed from Token.lemma if no POS tags + are available.""" + nlp = Greek() + doc = nlp("Χθες") + assert doc[0].lemma_ + + +def test_multiple_predictions(): + class DummyPipe(Pipe): + def __init__(self): + self.model = "dummy_model" + + def predict(self, docs): + return ([1, 2, 3], [4, 5, 6]) + + def set_annotations(self, docs, scores, tensors=None): + return docs + + nlp = Language() + doc = nlp.make_doc("foo") + dummy_pipe = DummyPipe() + dummy_pipe(doc) + + +@pytest.mark.skip(reason="removed Beam stuff during the Example/GoldParse refactor") +def 
test_issue4313(): + """ This should not crash or exit with some strange error code """ + beam_width = 16 + beam_density = 0.0001 + nlp = English() + config = { + "learn_tokens": False, + "min_action_freq": 30, + "beam_width": 1, + "beam_update_prob": 1.0, + } + ner = EntityRecognizer(nlp.vocab, default_ner(), **config) + ner.add_label("SOME_LABEL") + ner.begin_training([]) + nlp.add_pipe(ner) + + # add a new label to the doc + doc = nlp("What do you think about Apple ?") + assert len(ner.labels) == 1 + assert "SOME_LABEL" in ner.labels + apple_ent = Span(doc, 5, 6, label="MY_ORG") + doc.ents = list(doc.ents) + [apple_ent] + + # ensure the beam_parse still works with the new label + docs = [doc] + beams = nlp.entity.beam_parse( + docs, beam_width=beam_width, beam_density=beam_density + ) + + for doc, beam in zip(docs, beams): + entity_scores = defaultdict(float) + for score, ents in nlp.entity.moves.get_beam_parses(beam): + for start, end, label in ents: + entity_scores[(start, end, label)] += score + + +@pytest.mark.filterwarnings("ignore::UserWarning") +def test_issue4348(): + """Test that training the tagger with empty data, doesn't throw errors""" + nlp = English() + example = Example.from_dict(nlp.make_doc(""), {"tags": []}) + TRAIN_DATA = [example, example] + tagger = nlp.create_pipe("tagger") + nlp.add_pipe(tagger) + optimizer = nlp.begin_training() + for i in range(5): + losses = {} + batches = minibatch(TRAIN_DATA, size=compounding(4.0, 32.0, 1.001)) + for batch in batches: + nlp.update(batch, sgd=optimizer, losses=losses) + + +def test_issue4367(): + """Test that docbin init goes well""" + DocBin() + DocBin(attrs=["LEMMA"]) + DocBin(attrs=["LEMMA", "ENT_IOB", "ENT_TYPE"]) + + +def test_issue4373(): + """Test that PhraseMatcher.vocab can be accessed (like Matcher.vocab).""" + matcher = Matcher(Vocab()) + assert isinstance(matcher.vocab, Vocab) + matcher = PhraseMatcher(Vocab()) + assert isinstance(matcher.vocab, Vocab) + + +def test_issue4402(): + json_data 
= { + "id": 0, + "paragraphs": [ + { + "raw": "How should I cook bacon in an oven?\nI've heard of people cooking bacon in an oven.", + "sentences": [ + { + "tokens": [ + {"id": 0, "orth": "How", "ner": "O"}, + {"id": 1, "orth": "should", "ner": "O"}, + {"id": 2, "orth": "I", "ner": "O"}, + {"id": 3, "orth": "cook", "ner": "O"}, + {"id": 4, "orth": "bacon", "ner": "O"}, + {"id": 5, "orth": "in", "ner": "O"}, + {"id": 6, "orth": "an", "ner": "O"}, + {"id": 7, "orth": "oven", "ner": "O"}, + {"id": 8, "orth": "?", "ner": "O"}, + ], + "brackets": [], + }, + { + "tokens": [ + {"id": 9, "orth": "\n", "ner": "O"}, + {"id": 10, "orth": "I", "ner": "O"}, + {"id": 11, "orth": "'ve", "ner": "O"}, + {"id": 12, "orth": "heard", "ner": "O"}, + {"id": 13, "orth": "of", "ner": "O"}, + {"id": 14, "orth": "people", "ner": "O"}, + {"id": 15, "orth": "cooking", "ner": "O"}, + {"id": 16, "orth": "bacon", "ner": "O"}, + {"id": 17, "orth": "in", "ner": "O"}, + {"id": 18, "orth": "an", "ner": "O"}, + {"id": 19, "orth": "oven", "ner": "O"}, + {"id": 20, "orth": ".", "ner": "O"}, + ], + "brackets": [], + }, + ], + "cats": [ + {"label": "baking", "value": 1.0}, + {"label": "not_baking", "value": 0.0}, + ], + }, + { + "raw": "What is the difference between white and brown eggs?\n", + "sentences": [ + { + "tokens": [ + {"id": 0, "orth": "What", "ner": "O"}, + {"id": 1, "orth": "is", "ner": "O"}, + {"id": 2, "orth": "the", "ner": "O"}, + {"id": 3, "orth": "difference", "ner": "O"}, + {"id": 4, "orth": "between", "ner": "O"}, + {"id": 5, "orth": "white", "ner": "O"}, + {"id": 6, "orth": "and", "ner": "O"}, + {"id": 7, "orth": "brown", "ner": "O"}, + {"id": 8, "orth": "eggs", "ner": "O"}, + {"id": 9, "orth": "?", "ner": "O"}, + ], + "brackets": [], + }, + {"tokens": [{"id": 10, "orth": "\n", "ner": "O"}], "brackets": []}, + ], + "cats": [ + {"label": "baking", "value": 0.0}, + {"label": "not_baking", "value": 1.0}, + ], + }, + ], + } + nlp = English() + attrs = ["ORTH", "SENT_START", "ENT_IOB", 
"ENT_TYPE"] + with make_tempdir() as tmpdir: + output_file = tmpdir / "test4402.spacy" + docs = json2docs([json_data]) + data = DocBin(docs=docs, attrs=attrs).to_bytes() + with output_file.open("wb") as file_: + file_.write(data) + corpus = Corpus(train_loc=str(output_file), dev_loc=str(output_file)) + + train_data = list(corpus.train_dataset(nlp)) + assert len(train_data) == 2 + + split_train_data = [] + for eg in train_data: + split_train_data.extend(eg.split_sents()) + assert len(split_train_data) == 4 diff --git a/spacy/tests/regression/test_issue4002.py b/spacy/tests/regression/test_issue4002.py deleted file mode 100644 index 3ac26d3ab..000000000 --- a/spacy/tests/regression/test_issue4002.py +++ /dev/null @@ -1,23 +0,0 @@ -from spacy.matcher import PhraseMatcher -from spacy.tokens import Doc - - -def test_issue4002(en_vocab): - """Test that the PhraseMatcher can match on overwritten NORM attributes. - """ - matcher = PhraseMatcher(en_vocab, attr="NORM") - pattern1 = Doc(en_vocab, words=["c", "d"]) - assert [t.norm_ for t in pattern1] == ["c", "d"] - matcher.add("TEST", [pattern1]) - doc = Doc(en_vocab, words=["a", "b", "c", "d"]) - assert [t.norm_ for t in doc] == ["a", "b", "c", "d"] - matches = matcher(doc) - assert len(matches) == 1 - matcher = PhraseMatcher(en_vocab, attr="NORM") - pattern2 = Doc(en_vocab, words=["1", "2"]) - pattern2[0].norm_ = "c" - pattern2[1].norm_ = "d" - assert [t.norm_ for t in pattern2] == ["c", "d"] - matcher.add("TEST", [pattern2]) - matches = matcher(doc) - assert len(matches) == 1 diff --git a/spacy/tests/regression/test_issue4030.py b/spacy/tests/regression/test_issue4030.py deleted file mode 100644 index e40565501..000000000 --- a/spacy/tests/regression/test_issue4030.py +++ /dev/null @@ -1,50 +0,0 @@ -import spacy -from spacy.util import minibatch -from thinc.api import compounding -from spacy.gold import Example - - -def test_issue4030(): - """ Test whether textcat works fine with empty doc """ - unique_classes = 
["offensive", "inoffensive"] - x_train = [ - "This is an offensive text", - "This is the second offensive text", - "inoff", - ] - y_train = ["offensive", "offensive", "inoffensive"] - - nlp = spacy.blank("en") - - # preparing the data - train_data = [] - for text, train_instance in zip(x_train, y_train): - cat_dict = {label: label == train_instance for label in unique_classes} - train_data.append(Example.from_dict(nlp.make_doc(text), {"cats": cat_dict})) - - # add a text categorizer component - textcat = nlp.create_pipe( - "textcat", - config={"exclusive_classes": True, "architecture": "bow", "ngram_size": 2}, - ) - - for label in unique_classes: - textcat.add_label(label) - nlp.add_pipe(textcat, last=True) - - # training the network - with nlp.select_pipes(enable="textcat"): - optimizer = nlp.begin_training() - for i in range(3): - losses = {} - batches = minibatch(train_data, size=compounding(4.0, 32.0, 1.001)) - - for batch in batches: - nlp.update( - examples=batch, sgd=optimizer, drop=0.1, losses=losses, - ) - - # processing of an empty doc should result in 0.0 for all categories - doc = nlp("") - assert doc.cats["offensive"] == 0.0 - assert doc.cats["inoffensive"] == 0.0 diff --git a/spacy/tests/regression/test_issue4042.py b/spacy/tests/regression/test_issue4042.py deleted file mode 100644 index f47290b92..000000000 --- a/spacy/tests/regression/test_issue4042.py +++ /dev/null @@ -1,85 +0,0 @@ -import spacy -from spacy.pipeline import EntityRecognizer, EntityRuler -from spacy.lang.en import English -from spacy.tokens import Span -from spacy.util import ensure_path -from spacy.pipeline.defaults import default_ner - -from ..util import make_tempdir - - -def test_issue4042(): - """Test that serialization of an EntityRuler before NER works fine.""" - nlp = English() - - # add ner pipe - ner = nlp.create_pipe("ner") - ner.add_label("SOME_LABEL") - nlp.add_pipe(ner) - nlp.begin_training() - - # Add entity ruler - ruler = EntityRuler(nlp) - patterns = [ - {"label": 
"MY_ORG", "pattern": "Apple"}, - {"label": "MY_GPE", "pattern": [{"lower": "san"}, {"lower": "francisco"}]}, - ] - ruler.add_patterns(patterns) - nlp.add_pipe(ruler, before="ner") # works fine with "after" - doc1 = nlp("What do you think about Apple ?") - assert doc1.ents[0].label_ == "MY_ORG" - - with make_tempdir() as d: - output_dir = ensure_path(d) - if not output_dir.exists(): - output_dir.mkdir() - nlp.to_disk(output_dir) - - nlp2 = spacy.load(output_dir) - doc2 = nlp2("What do you think about Apple ?") - assert doc2.ents[0].label_ == "MY_ORG" - - -def test_issue4042_bug2(): - """ - Test that serialization of an NER works fine when new labels were added. - This is the second bug of two bugs underlying the issue 4042. - """ - nlp1 = English() - vocab = nlp1.vocab - - # add ner pipe - ner1 = nlp1.create_pipe("ner") - ner1.add_label("SOME_LABEL") - nlp1.add_pipe(ner1) - nlp1.begin_training() - - # add a new label to the doc - doc1 = nlp1("What do you think about Apple ?") - assert len(ner1.labels) == 1 - assert "SOME_LABEL" in ner1.labels - apple_ent = Span(doc1, 5, 6, label="MY_ORG") - doc1.ents = list(doc1.ents) + [apple_ent] - - # reapply the NER - at this point it should resize itself - ner1(doc1) - assert len(ner1.labels) == 2 - assert "SOME_LABEL" in ner1.labels - assert "MY_ORG" in ner1.labels - - with make_tempdir() as d: - # assert IO goes fine - output_dir = ensure_path(d) - if not output_dir.exists(): - output_dir.mkdir() - ner1.to_disk(output_dir) - - config = { - "learn_tokens": False, - "min_action_freq": 30, - "beam_width": 1, - "beam_update_prob": 1.0, - } - ner2 = EntityRecognizer(vocab, default_ner(), **config) - ner2.from_disk(output_dir) - assert len(ner2.labels) == 2 diff --git a/spacy/tests/regression/test_issue4054.py b/spacy/tests/regression/test_issue4054.py deleted file mode 100644 index c52ded395..000000000 --- a/spacy/tests/regression/test_issue4054.py +++ /dev/null @@ -1,30 +0,0 @@ -from spacy.vocab import Vocab -import spacy -from 
spacy.lang.en import English -from spacy.util import ensure_path - -from ..util import make_tempdir - - -def test_issue4054(en_vocab): - """Test that a new blank model can be made with a vocab from file, - and that serialization does not drop the language at any point.""" - nlp1 = English() - vocab1 = nlp1.vocab - - with make_tempdir() as d: - vocab_dir = ensure_path(d / "vocab") - if not vocab_dir.exists(): - vocab_dir.mkdir() - vocab1.to_disk(vocab_dir) - - vocab2 = Vocab().from_disk(vocab_dir) - print("lang", vocab2.lang) - nlp2 = spacy.blank("en", vocab=vocab2) - - nlp_dir = ensure_path(d / "nlp") - if not nlp_dir.exists(): - nlp_dir.mkdir() - nlp2.to_disk(nlp_dir) - nlp3 = spacy.load(nlp_dir) - assert nlp3.lang == "en" diff --git a/spacy/tests/regression/test_issue4120.py b/spacy/tests/regression/test_issue4120.py deleted file mode 100644 index 4849aa238..000000000 --- a/spacy/tests/regression/test_issue4120.py +++ /dev/null @@ -1,23 +0,0 @@ -from spacy.matcher import Matcher -from spacy.tokens import Doc - - -def test_issue4120(en_vocab): - """Test that matches without a final {OP: ?} token are returned.""" - matcher = Matcher(en_vocab) - matcher.add("TEST", [[{"ORTH": "a"}, {"OP": "?"}]]) - doc1 = Doc(en_vocab, words=["a"]) - assert len(matcher(doc1)) == 1 # works - - doc2 = Doc(en_vocab, words=["a", "b", "c"]) - assert len(matcher(doc2)) == 2 # fixed - - matcher = Matcher(en_vocab) - matcher.add("TEST", [[{"ORTH": "a"}, {"OP": "?"}, {"ORTH": "b"}]]) - doc3 = Doc(en_vocab, words=["a", "b", "b", "c"]) - assert len(matcher(doc3)) == 2 # works - - matcher = Matcher(en_vocab) - matcher.add("TEST", [[{"ORTH": "a"}, {"OP": "?"}, {"ORTH": "b", "OP": "?"}]]) - doc4 = Doc(en_vocab, words=["a", "b", "b", "c"]) - assert len(matcher(doc4)) == 3 # fixed diff --git a/spacy/tests/regression/test_issue4133.py b/spacy/tests/regression/test_issue4133.py deleted file mode 100644 index a726806d7..000000000 --- a/spacy/tests/regression/test_issue4133.py +++ /dev/null @@ -1,28 
+0,0 @@ -from spacy.lang.en import English -from spacy.tokens import Doc -from spacy.vocab import Vocab - - -def test_issue4133(en_vocab): - nlp = English() - vocab_bytes = nlp.vocab.to_bytes() - words = ["Apple", "is", "looking", "at", "buying", "a", "startup"] - pos = ["NOUN", "VERB", "ADP", "VERB", "PROPN", "NOUN", "ADP"] - doc = Doc(en_vocab, words=words) - for i, token in enumerate(doc): - token.pos_ = pos[i] - - # usually this is already True when starting from proper models instead of blank English - doc.is_tagged = True - - doc_bytes = doc.to_bytes() - - vocab = Vocab() - vocab = vocab.from_bytes(vocab_bytes) - doc = Doc(vocab).from_bytes(doc_bytes) - - actual = [] - for token in doc: - actual.append(token.pos_) - - assert actual == pos diff --git a/spacy/tests/regression/test_issue4190.py b/spacy/tests/regression/test_issue4190.py deleted file mode 100644 index 97d532d2a..000000000 --- a/spacy/tests/regression/test_issue4190.py +++ /dev/null @@ -1,46 +0,0 @@ -from spacy.lang.en import English -from spacy.tokenizer import Tokenizer -from spacy import util - -from ..util import make_tempdir - - -def test_issue4190(): - test_string = "Test c." - # Load default language - nlp_1 = English() - doc_1a = nlp_1(test_string) - result_1a = [token.text for token in doc_1a] # noqa: F841 - # Modify tokenizer - customize_tokenizer(nlp_1) - doc_1b = nlp_1(test_string) - result_1b = [token.text for token in doc_1b] - # Save and Reload - with make_tempdir() as model_dir: - nlp_1.to_disk(model_dir) - nlp_2 = util.load_model(model_dir) - # This should be the modified tokenizer - doc_2 = nlp_2(test_string) - result_2 = [token.text for token in doc_2] - assert result_1b == result_2 - - -def customize_tokenizer(nlp): - prefix_re = util.compile_prefix_regex(nlp.Defaults.prefixes) - suffix_re = util.compile_suffix_regex(nlp.Defaults.suffixes) - infix_re = util.compile_infix_regex(nlp.Defaults.infixes) - # Remove all exceptions where a single letter is followed by a period (e.g. 
'h.') - exceptions = { - k: v - for k, v in dict(nlp.Defaults.tokenizer_exceptions).items() - if not (len(k) == 2 and k[1] == ".") - } - new_tokenizer = Tokenizer( - nlp.vocab, - exceptions, - prefix_search=prefix_re.search, - suffix_search=suffix_re.search, - infix_finditer=infix_re.finditer, - token_match=nlp.tokenizer.token_match, - ) - nlp.tokenizer = new_tokenizer diff --git a/spacy/tests/regression/test_issue4267.py b/spacy/tests/regression/test_issue4267.py deleted file mode 100644 index 891f03b30..000000000 --- a/spacy/tests/regression/test_issue4267.py +++ /dev/null @@ -1,34 +0,0 @@ -from spacy.lang.en import English -from spacy.pipeline import EntityRuler - - -def test_issue4267(): - """ Test that running an entity_ruler after ner gives consistent results""" - nlp = English() - ner = nlp.create_pipe("ner") - ner.add_label("PEOPLE") - nlp.add_pipe(ner) - nlp.begin_training() - - assert "ner" in nlp.pipe_names - - # assert that we have correct IOB annotations - doc1 = nlp("hi") - assert doc1.is_nered - for token in doc1: - assert token.ent_iob == 2 - - # add entity ruler and run again - ruler = EntityRuler(nlp) - patterns = [{"label": "SOFTWARE", "pattern": "spacy"}] - - ruler.add_patterns(patterns) - nlp.add_pipe(ruler) - assert "entity_ruler" in nlp.pipe_names - assert "ner" in nlp.pipe_names - - # assert that we still have correct IOB annotations - doc2 = nlp("hi") - assert doc2.is_nered - for token in doc2: - assert token.ent_iob == 2 diff --git a/spacy/tests/regression/test_issue4272.py b/spacy/tests/regression/test_issue4272.py deleted file mode 100644 index 4bac97a44..000000000 --- a/spacy/tests/regression/test_issue4272.py +++ /dev/null @@ -1,9 +0,0 @@ -from spacy.lang.el import Greek - - -def test_issue4272(): - """Test that lookup table can be accessed from Token.lemma if no POS tags - are available.""" - nlp = Greek() - doc = nlp("Χθες") - assert doc[0].lemma_ diff --git a/spacy/tests/regression/test_issue4278.py 
b/spacy/tests/regression/test_issue4278.py deleted file mode 100644 index ffbc41226..000000000 --- a/spacy/tests/regression/test_issue4278.py +++ /dev/null @@ -1,25 +0,0 @@ -import pytest -from spacy.language import Language -from spacy.pipeline import Pipe - - -class DummyPipe(Pipe): - def __init__(self): - self.model = "dummy_model" - - def predict(self, docs): - return ([1, 2, 3], [4, 5, 6]) - - def set_annotations(self, docs, scores, tensors=None): - return docs - - -@pytest.fixture -def nlp(): - return Language() - - -def test_multiple_predictions(nlp): - doc = nlp.make_doc("foo") - dummy_pipe = DummyPipe() - dummy_pipe(doc) diff --git a/spacy/tests/regression/test_issue4313.py b/spacy/tests/regression/test_issue4313.py deleted file mode 100644 index 3bddc26ca..000000000 --- a/spacy/tests/regression/test_issue4313.py +++ /dev/null @@ -1,47 +0,0 @@ -from collections import defaultdict - -import pytest - -from spacy.pipeline.defaults import default_ner -from spacy.pipeline import EntityRecognizer - -from spacy.lang.en import English -from spacy.tokens import Span - - -# skipped after removing Beam stuff during the Example/GoldParse refactor -@pytest.mark.skip -def test_issue4313(): - """ This should not crash or exit with some strange error code """ - beam_width = 16 - beam_density = 0.0001 - nlp = English() - config = { - "learn_tokens": False, - "min_action_freq": 30, - "beam_width": 1, - "beam_update_prob": 1.0, - } - ner = EntityRecognizer(nlp.vocab, default_ner(), **config) - ner.add_label("SOME_LABEL") - ner.begin_training([]) - nlp.add_pipe(ner) - - # add a new label to the doc - doc = nlp("What do you think about Apple ?") - assert len(ner.labels) == 1 - assert "SOME_LABEL" in ner.labels - apple_ent = Span(doc, 5, 6, label="MY_ORG") - doc.ents = list(doc.ents) + [apple_ent] - - # ensure the beam_parse still works with the new label - docs = [doc] - beams = nlp.entity.beam_parse( - docs, beam_width=beam_width, beam_density=beam_density - ) - - for doc, 
beam in zip(docs, beams): - entity_scores = defaultdict(float) - for score, ents in nlp.entity.moves.get_beam_parses(beam): - for start, end, label in ents: - entity_scores[(start, end, label)] += score diff --git a/spacy/tests/regression/test_issue4348.py b/spacy/tests/regression/test_issue4348.py deleted file mode 100644 index 06b03df24..000000000 --- a/spacy/tests/regression/test_issue4348.py +++ /dev/null @@ -1,24 +0,0 @@ -from spacy.gold import Example -from spacy.lang.en import English -from spacy.util import minibatch -from thinc.api import compounding -import pytest - - -@pytest.mark.filterwarnings("ignore::UserWarning") -def test_issue4348(): - """Test that training the tagger with empty data, doesn't throw errors""" - - nlp = English() - example = Example.from_dict(nlp.make_doc(""), {"tags": []}) - TRAIN_DATA = [example, example] - - tagger = nlp.create_pipe("tagger") - nlp.add_pipe(tagger) - - optimizer = nlp.begin_training() - for i in range(5): - losses = {} - batches = minibatch(TRAIN_DATA, size=compounding(4.0, 32.0, 1.001)) - for batch in batches: - nlp.update(batch, sgd=optimizer, losses=losses) diff --git a/spacy/tests/regression/test_issue4367.py b/spacy/tests/regression/test_issue4367.py deleted file mode 100644 index 917847a05..000000000 --- a/spacy/tests/regression/test_issue4367.py +++ /dev/null @@ -1,8 +0,0 @@ -from spacy.tokens import DocBin - - -def test_issue4367(): - """Test that docbin init goes well""" - DocBin() - DocBin(attrs=["LEMMA"]) - DocBin(attrs=["LEMMA", "ENT_IOB", "ENT_TYPE"]) diff --git a/spacy/tests/regression/test_issue4373.py b/spacy/tests/regression/test_issue4373.py deleted file mode 100644 index dbde1624e..000000000 --- a/spacy/tests/regression/test_issue4373.py +++ /dev/null @@ -1,10 +0,0 @@ -from spacy.matcher import Matcher, PhraseMatcher -from spacy.vocab import Vocab - - -def test_issue4373(): - """Test that PhraseMatcher.vocab can be accessed (like Matcher.vocab).""" - matcher = Matcher(Vocab()) - assert 
isinstance(matcher.vocab, Vocab) - matcher = PhraseMatcher(Vocab()) - assert isinstance(matcher.vocab, Vocab) diff --git a/spacy/tests/regression/test_issue4402.py b/spacy/tests/regression/test_issue4402.py deleted file mode 100644 index 9c596aaf6..000000000 --- a/spacy/tests/regression/test_issue4402.py +++ /dev/null @@ -1,98 +0,0 @@ -from spacy.gold import Corpus -from spacy.lang.en import English - -from ..util import make_tempdir -from ...gold.converters import json2docs -from ...tokens import DocBin - - -def test_issue4402(): - nlp = English() - attrs = ["ORTH", "SENT_START", "ENT_IOB", "ENT_TYPE"] - with make_tempdir() as tmpdir: - output_file = tmpdir / "test4402.spacy" - docs = json2docs([json_data]) - data = DocBin(docs=docs, attrs=attrs).to_bytes() - with output_file.open("wb") as file_: - file_.write(data) - corpus = Corpus(train_loc=str(output_file), dev_loc=str(output_file)) - - train_data = list(corpus.train_dataset(nlp)) - assert len(train_data) == 2 - - split_train_data = [] - for eg in train_data: - split_train_data.extend(eg.split_sents()) - assert len(split_train_data) == 4 - - -json_data = { - "id": 0, - "paragraphs": [ - { - "raw": "How should I cook bacon in an oven?\nI've heard of people cooking bacon in an oven.", - "sentences": [ - { - "tokens": [ - {"id": 0, "orth": "How", "ner": "O"}, - {"id": 1, "orth": "should", "ner": "O"}, - {"id": 2, "orth": "I", "ner": "O"}, - {"id": 3, "orth": "cook", "ner": "O"}, - {"id": 4, "orth": "bacon", "ner": "O"}, - {"id": 5, "orth": "in", "ner": "O"}, - {"id": 6, "orth": "an", "ner": "O"}, - {"id": 7, "orth": "oven", "ner": "O"}, - {"id": 8, "orth": "?", "ner": "O"}, - ], - "brackets": [], - }, - { - "tokens": [ - {"id": 9, "orth": "\n", "ner": "O"}, - {"id": 10, "orth": "I", "ner": "O"}, - {"id": 11, "orth": "'ve", "ner": "O"}, - {"id": 12, "orth": "heard", "ner": "O"}, - {"id": 13, "orth": "of", "ner": "O"}, - {"id": 14, "orth": "people", "ner": "O"}, - {"id": 15, "orth": "cooking", "ner": "O"}, - {"id": 
16, "orth": "bacon", "ner": "O"}, - {"id": 17, "orth": "in", "ner": "O"}, - {"id": 18, "orth": "an", "ner": "O"}, - {"id": 19, "orth": "oven", "ner": "O"}, - {"id": 20, "orth": ".", "ner": "O"}, - ], - "brackets": [], - }, - ], - "cats": [ - {"label": "baking", "value": 1.0}, - {"label": "not_baking", "value": 0.0}, - ], - }, - { - "raw": "What is the difference between white and brown eggs?\n", - "sentences": [ - { - "tokens": [ - {"id": 0, "orth": "What", "ner": "O"}, - {"id": 1, "orth": "is", "ner": "O"}, - {"id": 2, "orth": "the", "ner": "O"}, - {"id": 3, "orth": "difference", "ner": "O"}, - {"id": 4, "orth": "between", "ner": "O"}, - {"id": 5, "orth": "white", "ner": "O"}, - {"id": 6, "orth": "and", "ner": "O"}, - {"id": 7, "orth": "brown", "ner": "O"}, - {"id": 8, "orth": "eggs", "ner": "O"}, - {"id": 9, "orth": "?", "ner": "O"}, - ], - "brackets": [], - }, - {"tokens": [{"id": 10, "orth": "\n", "ner": "O"}], "brackets": []}, - ], - "cats": [ - {"label": "baking", "value": 0.0}, - {"label": "not_baking", "value": 1.0}, - ], - }, - ], -} diff --git a/spacy/tests/regression/test_issue4501-5000.py b/spacy/tests/regression/test_issue4501-5000.py new file mode 100644 index 000000000..9bace8fc7 --- /dev/null +++ b/spacy/tests/regression/test_issue4501-5000.py @@ -0,0 +1,288 @@ +import pytest +from mock import Mock +from spacy.pipeline import EntityRuler +from spacy.matcher import DependencyMatcher +from spacy.tokens import Doc, Span, DocBin +from spacy.gold import Example +from spacy.gold.converters.conllu2docs import conllu2docs +from spacy.lang.en import English +from spacy.kb import KnowledgeBase +from spacy.vocab import Vocab +from spacy.language import Language +from spacy.util import ensure_path, load_model_from_path +import numpy +import pickle + +from ..util import get_doc, make_tempdir + + +def test_issue4528(en_vocab): + """Test that user_data is correctly serialized in DocBin.""" + doc = Doc(en_vocab, words=["hello", "world"]) + doc.user_data["foo"] = 
"bar" + # This is how extension attribute values are stored in the user data + doc.user_data[("._.", "foo", None, None)] = "bar" + doc_bin = DocBin(store_user_data=True) + doc_bin.add(doc) + doc_bin_bytes = doc_bin.to_bytes() + new_doc_bin = DocBin(store_user_data=True).from_bytes(doc_bin_bytes) + new_doc = list(new_doc_bin.get_docs(en_vocab))[0] + assert new_doc.user_data["foo"] == "bar" + assert new_doc.user_data[("._.", "foo", None, None)] == "bar" + + +@pytest.mark.parametrize( + "text,words", [("A'B C", ["A", "'", "B", "C"]), ("A-B", ["A-B"])] +) +def test_gold_misaligned(en_tokenizer, text, words): + doc = en_tokenizer(text) + Example.from_dict(doc, {"words": words}) + + +def test_issue4590(en_vocab): + """Test that matches param in on_match method are the same as matches run with no on_match method""" + pattern = [ + {"SPEC": {"NODE_NAME": "jumped"}, "PATTERN": {"ORTH": "jumped"}}, + { + "SPEC": {"NODE_NAME": "fox", "NBOR_RELOP": ">", "NBOR_NAME": "jumped"}, + "PATTERN": {"ORTH": "fox"}, + }, + { + "SPEC": {"NODE_NAME": "quick", "NBOR_RELOP": ".", "NBOR_NAME": "jumped"}, + "PATTERN": {"ORTH": "fox"}, + }, + ] + + on_match = Mock() + matcher = DependencyMatcher(en_vocab) + matcher.add("pattern", on_match, pattern) + text = "The quick brown fox jumped over the lazy fox" + heads = [3, 2, 1, 1, 0, -1, 2, 1, -3] + deps = ["det", "amod", "amod", "nsubj", "ROOT", "prep", "det", "amod", "pobj"] + doc = get_doc(en_vocab, text.split(), heads=heads, deps=deps) + matches = matcher(doc) + on_match_args = on_match.call_args + assert on_match_args[0][3] == matches + + +def test_issue4651_with_phrase_matcher_attr(): + """Test that the EntityRuler PhraseMatcher is deserialize correctly using + the method from_disk when the EntityRuler argument phrase_matcher_attr is + specified. 
+ """ + text = "Spacy is a python library for nlp" + nlp = English() + ruler = EntityRuler(nlp, phrase_matcher_attr="LOWER") + patterns = [{"label": "PYTHON_LIB", "pattern": "spacy", "id": "spaCy"}] + ruler.add_patterns(patterns) + nlp.add_pipe(ruler) + doc = nlp(text) + res = [(ent.text, ent.label_, ent.ent_id_) for ent in doc.ents] + nlp_reloaded = English() + with make_tempdir() as d: + file_path = d / "entityruler" + ruler.to_disk(file_path) + ruler_reloaded = EntityRuler(nlp_reloaded).from_disk(file_path) + nlp_reloaded.add_pipe(ruler_reloaded) + doc_reloaded = nlp_reloaded(text) + res_reloaded = [(ent.text, ent.label_, ent.ent_id_) for ent in doc_reloaded.ents] + assert res == res_reloaded + + +def test_issue4651_without_phrase_matcher_attr(): + """Test that the EntityRuler PhraseMatcher is deserialize correctly using + the method from_disk when the EntityRuler argument phrase_matcher_attr is + not specified. + """ + text = "Spacy is a python library for nlp" + nlp = English() + ruler = EntityRuler(nlp) + patterns = [{"label": "PYTHON_LIB", "pattern": "spacy", "id": "spaCy"}] + ruler.add_patterns(patterns) + nlp.add_pipe(ruler) + doc = nlp(text) + res = [(ent.text, ent.label_, ent.ent_id_) for ent in doc.ents] + nlp_reloaded = English() + with make_tempdir() as d: + file_path = d / "entityruler" + ruler.to_disk(file_path) + ruler_reloaded = EntityRuler(nlp_reloaded).from_disk(file_path) + nlp_reloaded.add_pipe(ruler_reloaded) + doc_reloaded = nlp_reloaded(text) + res_reloaded = [(ent.text, ent.label_, ent.ent_id_) for ent in doc_reloaded.ents] + assert res == res_reloaded + + +def test_issue4665(): + """ + conllu2json should not raise an exception if the HEAD column contains an + underscore + """ + input_data = """ +1 [ _ PUNCT -LRB- _ _ punct _ _ +2 This _ DET DT _ _ det _ _ +3 killing _ NOUN NN _ _ nsubj _ _ +4 of _ ADP IN _ _ case _ _ +5 a _ DET DT _ _ det _ _ +6 respected _ ADJ JJ _ _ amod _ _ +7 cleric _ NOUN NN _ _ nmod _ _ +8 will _ AUX MD _ _ aux _ _ 
+9 be _ AUX VB _ _ aux _ _ +10 causing _ VERB VBG _ _ root _ _ +11 us _ PRON PRP _ _ iobj _ _ +12 trouble _ NOUN NN _ _ dobj _ _ +13 for _ ADP IN _ _ case _ _ +14 years _ NOUN NNS _ _ nmod _ _ +15 to _ PART TO _ _ mark _ _ +16 come _ VERB VB _ _ acl _ _ +17 . _ PUNCT . _ _ punct _ _ +18 ] _ PUNCT -RRB- _ _ punct _ _ +""" + conllu2docs(input_data) + + +def test_issue4674(): + """Test that setting entities with overlapping identifiers does not mess up IO""" + nlp = English() + kb = KnowledgeBase(nlp.vocab, entity_vector_length=3) + vector1 = [0.9, 1.1, 1.01] + vector2 = [1.8, 2.25, 2.01] + with pytest.warns(UserWarning): + kb.set_entities( + entity_list=["Q1", "Q1"], + freq_list=[32, 111], + vector_list=[vector1, vector2], + ) + assert kb.get_size_entities() == 1 + # dumping to file & loading back in + with make_tempdir() as d: + dir_path = ensure_path(d) + if not dir_path.exists(): + dir_path.mkdir() + file_path = dir_path / "kb" + kb.dump(str(file_path)) + kb2 = KnowledgeBase(vocab=nlp.vocab, entity_vector_length=3) + kb2.load_bulk(str(file_path)) + assert kb2.get_size_entities() == 1 + + +def test_issue4707(): + """Tests that disabled component names are also excluded from nlp.from_disk + by default when loading a model. 
+ """ + nlp = English() + nlp.add_pipe(nlp.create_pipe("sentencizer")) + nlp.add_pipe(nlp.create_pipe("entity_ruler")) + assert nlp.pipe_names == ["sentencizer", "entity_ruler"] + exclude = ["tokenizer", "sentencizer"] + with make_tempdir() as tmpdir: + nlp.to_disk(tmpdir, exclude=exclude) + new_nlp = load_model_from_path(tmpdir, disable=exclude) + assert "sentencizer" not in new_nlp.pipe_names + assert "entity_ruler" in new_nlp.pipe_names + + +@pytest.mark.filterwarnings("ignore::UserWarning") +def test_issue4725_1(): + """ Ensure the pickling of the NER goes well""" + vocab = Vocab(vectors_name="test_vocab_add_vector") + nlp = English(vocab=vocab) + ner = nlp.create_pipe("ner", config={"min_action_freq": 342}) + with make_tempdir() as tmp_path: + with (tmp_path / "ner.pkl").open("wb") as file_: + pickle.dump(ner, file_) + assert ner.cfg["min_action_freq"] == 342 + + with (tmp_path / "ner.pkl").open("rb") as file_: + ner2 = pickle.load(file_) + assert ner2.cfg["min_action_freq"] == 342 + + +@pytest.mark.filterwarnings("ignore::UserWarning") +def test_issue4725_2(): + # ensures that this runs correctly and doesn't hang or crash because of the global vectors + # if it does crash, it's usually because of calling 'spawn' for multiprocessing (e.g. 
on Windows) + vocab = Vocab(vectors_name="test_vocab_add_vector") + data = numpy.ndarray((5, 3), dtype="f") + data[0] = 1.0 + data[1] = 2.0 + vocab.set_vector("cat", data[0]) + vocab.set_vector("dog", data[1]) + nlp = English(vocab=vocab) + ner = nlp.create_pipe("ner") + nlp.add_pipe(ner) + nlp.begin_training() + docs = ["Kurt is in London."] * 10 + for _ in nlp.pipe(docs, batch_size=2, n_process=2): + pass + + +def test_issue4849(): + nlp = English() + ruler = EntityRuler( + nlp, + patterns=[ + {"label": "PERSON", "pattern": "joe biden", "id": "joe-biden"}, + {"label": "PERSON", "pattern": "bernie sanders", "id": "bernie-sanders"}, + ], + phrase_matcher_attr="LOWER", + ) + nlp.add_pipe(ruler) + text = """ + The left is starting to take aim at Democratic front-runner Joe Biden. + Sen. Bernie Sanders joined in her criticism: "There is no 'middle ground' when it comes to climate policy." + """ + # USING 1 PROCESS + count_ents = 0 + for doc in nlp.pipe([text], n_process=1): + count_ents += len([ent for ent in doc.ents if ent.ent_id > 0]) + assert count_ents == 2 + # USING 2 PROCESSES + count_ents = 0 + for doc in nlp.pipe([text], n_process=2): + count_ents += len([ent for ent in doc.ents if ent.ent_id > 0]) + assert count_ents == 2 + + +def test_issue4903(): + """Ensure that this runs correctly and doesn't hang or crash on Windows / + macOS.""" + + class CustomPipe: + name = "my_pipe" + + def __init__(self): + Span.set_extension("my_ext", getter=self._get_my_ext) + Doc.set_extension("my_ext", default=None) + + def __call__(self, doc): + gathered_ext = [] + for sent in doc.sents: + sent_ext = self._get_my_ext(sent) + sent._.set("my_ext", sent_ext) + gathered_ext.append(sent_ext) + + doc._.set("my_ext", "\n".join(gathered_ext)) + + return doc + + @staticmethod + def _get_my_ext(span): + return str(span.end) + + nlp = English() + custom_component = CustomPipe() + nlp.add_pipe(nlp.create_pipe("sentencizer")) + nlp.add_pipe(custom_component, after="sentencizer") + + text = 
["I like bananas.", "Do you like them?", "No, I prefer wasabi."] + docs = list(nlp.pipe(text, n_process=2)) + assert docs[0].text == "I like bananas." + assert docs[1].text == "Do you like them?" + assert docs[2].text == "No, I prefer wasabi." + + +def test_issue4924(): + nlp = Language() + example = Example.from_dict(nlp.make_doc(""), {}) + nlp.evaluate([example]) diff --git a/spacy/tests/regression/test_issue4528.py b/spacy/tests/regression/test_issue4528.py deleted file mode 100644 index 6f96c9f2d..000000000 --- a/spacy/tests/regression/test_issue4528.py +++ /dev/null @@ -1,16 +0,0 @@ -from spacy.tokens import Doc, DocBin - - -def test_issue4528(en_vocab): - """Test that user_data is correctly serialized in DocBin.""" - doc = Doc(en_vocab, words=["hello", "world"]) - doc.user_data["foo"] = "bar" - # This is how extension attribute values are stored in the user data - doc.user_data[("._.", "foo", None, None)] = "bar" - doc_bin = DocBin(store_user_data=True) - doc_bin.add(doc) - doc_bin_bytes = doc_bin.to_bytes() - new_doc_bin = DocBin(store_user_data=True).from_bytes(doc_bin_bytes) - new_doc = list(new_doc_bin.get_docs(en_vocab))[0] - assert new_doc.user_data["foo"] == "bar" - assert new_doc.user_data[("._.", "foo", None, None)] == "bar" diff --git a/spacy/tests/regression/test_issue4529.py b/spacy/tests/regression/test_issue4529.py deleted file mode 100644 index 0708499de..000000000 --- a/spacy/tests/regression/test_issue4529.py +++ /dev/null @@ -1,11 +0,0 @@ -import pytest - -from spacy.gold import Example - - -@pytest.mark.parametrize( - "text,words", [("A'B C", ["A", "'", "B", "C"]), ("A-B", ["A-B"])] -) -def test_gold_misaligned(en_tokenizer, text, words): - doc = en_tokenizer(text) - Example.from_dict(doc, {"words": words}) diff --git a/spacy/tests/regression/test_issue4590.py b/spacy/tests/regression/test_issue4590.py deleted file mode 100644 index fc49c5117..000000000 --- a/spacy/tests/regression/test_issue4590.py +++ /dev/null @@ -1,35 +0,0 @@ -from mock 
import Mock -from spacy.matcher import DependencyMatcher -from ..util import get_doc - - -def test_issue4590(en_vocab): - """Test that matches param in on_match method are the same as matches run with no on_match method""" - pattern = [ - {"SPEC": {"NODE_NAME": "jumped"}, "PATTERN": {"ORTH": "jumped"}}, - { - "SPEC": {"NODE_NAME": "fox", "NBOR_RELOP": ">", "NBOR_NAME": "jumped"}, - "PATTERN": {"ORTH": "fox"}, - }, - { - "SPEC": {"NODE_NAME": "quick", "NBOR_RELOP": ".", "NBOR_NAME": "jumped"}, - "PATTERN": {"ORTH": "fox"}, - }, - ] - - on_match = Mock() - - matcher = DependencyMatcher(en_vocab) - matcher.add("pattern", on_match, pattern) - - text = "The quick brown fox jumped over the lazy fox" - heads = [3, 2, 1, 1, 0, -1, 2, 1, -3] - deps = ["det", "amod", "amod", "nsubj", "ROOT", "prep", "det", "amod", "pobj"] - - doc = get_doc(en_vocab, text.split(), heads=heads, deps=deps) - - matches = matcher(doc) - - on_match_args = on_match.call_args - - assert on_match_args[0][3] == matches diff --git a/spacy/tests/regression/test_issue4651.py b/spacy/tests/regression/test_issue4651.py deleted file mode 100644 index 3f6c1a57c..000000000 --- a/spacy/tests/regression/test_issue4651.py +++ /dev/null @@ -1,62 +0,0 @@ -from spacy.lang.en import English -from spacy.pipeline import EntityRuler - -from ..util import make_tempdir - - -def test_issue4651_with_phrase_matcher_attr(): - """Test that the EntityRuler PhraseMatcher is deserialize correctly using - the method from_disk when the EntityRuler argument phrase_matcher_attr is - specified. 
- """ - text = "Spacy is a python library for nlp" - - nlp = English() - ruler = EntityRuler(nlp, phrase_matcher_attr="LOWER") - patterns = [{"label": "PYTHON_LIB", "pattern": "spacy", "id": "spaCy"}] - ruler.add_patterns(patterns) - nlp.add_pipe(ruler) - - doc = nlp(text) - res = [(ent.text, ent.label_, ent.ent_id_) for ent in doc.ents] - - nlp_reloaded = English() - with make_tempdir() as d: - file_path = d / "entityruler" - ruler.to_disk(file_path) - ruler_reloaded = EntityRuler(nlp_reloaded).from_disk(file_path) - - nlp_reloaded.add_pipe(ruler_reloaded) - doc_reloaded = nlp_reloaded(text) - res_reloaded = [(ent.text, ent.label_, ent.ent_id_) for ent in doc_reloaded.ents] - - assert res == res_reloaded - - -def test_issue4651_without_phrase_matcher_attr(): - """Test that the EntityRuler PhraseMatcher is deserialize correctly using - the method from_disk when the EntityRuler argument phrase_matcher_attr is - not specified. - """ - text = "Spacy is a python library for nlp" - - nlp = English() - ruler = EntityRuler(nlp) - patterns = [{"label": "PYTHON_LIB", "pattern": "spacy", "id": "spaCy"}] - ruler.add_patterns(patterns) - nlp.add_pipe(ruler) - - doc = nlp(text) - res = [(ent.text, ent.label_, ent.ent_id_) for ent in doc.ents] - - nlp_reloaded = English() - with make_tempdir() as d: - file_path = d / "entityruler" - ruler.to_disk(file_path) - ruler_reloaded = EntityRuler(nlp_reloaded).from_disk(file_path) - - nlp_reloaded.add_pipe(ruler_reloaded) - doc_reloaded = nlp_reloaded(text) - res_reloaded = [(ent.text, ent.label_, ent.ent_id_) for ent in doc_reloaded.ents] - - assert res == res_reloaded diff --git a/spacy/tests/regression/test_issue4665.py b/spacy/tests/regression/test_issue4665.py deleted file mode 100644 index e28d0f44a..000000000 --- a/spacy/tests/regression/test_issue4665.py +++ /dev/null @@ -1,35 +0,0 @@ -import pytest - -# TODO -# from spacy.gold.converters.conllu2docs import conllu2docs - -input_data = """ -1 [ _ PUNCT -LRB- _ _ punct _ _ -2 This 
_ DET DT _ _ det _ _ -3 killing _ NOUN NN _ _ nsubj _ _ -4 of _ ADP IN _ _ case _ _ -5 a _ DET DT _ _ det _ _ -6 respected _ ADJ JJ _ _ amod _ _ -7 cleric _ NOUN NN _ _ nmod _ _ -8 will _ AUX MD _ _ aux _ _ -9 be _ AUX VB _ _ aux _ _ -10 causing _ VERB VBG _ _ root _ _ -11 us _ PRON PRP _ _ iobj _ _ -12 trouble _ NOUN NN _ _ dobj _ _ -13 for _ ADP IN _ _ case _ _ -14 years _ NOUN NNS _ _ nmod _ _ -15 to _ PART TO _ _ mark _ _ -16 come _ VERB VB _ _ acl _ _ -17 . _ PUNCT . _ _ punct _ _ -18 ] _ PUNCT -RRB- _ _ punct _ _ -""" - - -@pytest.mark.xfail -def test_issue4665(): - """ - conllu2json should not raise an exception if the HEAD column contains an - underscore - """ - pass - # conllu2json(input_data) diff --git a/spacy/tests/regression/test_issue4674.py b/spacy/tests/regression/test_issue4674.py deleted file mode 100644 index 149e1431b..000000000 --- a/spacy/tests/regression/test_issue4674.py +++ /dev/null @@ -1,36 +0,0 @@ -import pytest -from spacy.kb import KnowledgeBase -from spacy.util import ensure_path -from spacy.lang.en import English - -from ..util import make_tempdir - - -def test_issue4674(): - """Test that setting entities with overlapping identifiers does not mess up IO""" - nlp = English() - kb = KnowledgeBase(nlp.vocab, entity_vector_length=3) - - vector1 = [0.9, 1.1, 1.01] - vector2 = [1.8, 2.25, 2.01] - with pytest.warns(UserWarning): - kb.set_entities( - entity_list=["Q1", "Q1"], - freq_list=[32, 111], - vector_list=[vector1, vector2], - ) - - assert kb.get_size_entities() == 1 - - # dumping to file & loading back in - with make_tempdir() as d: - dir_path = ensure_path(d) - if not dir_path.exists(): - dir_path.mkdir() - file_path = dir_path / "kb" - kb.dump(str(file_path)) - - kb2 = KnowledgeBase(vocab=nlp.vocab, entity_vector_length=3) - kb2.load_bulk(str(file_path)) - - assert kb2.get_size_entities() == 1 diff --git a/spacy/tests/regression/test_issue4707.py b/spacy/tests/regression/test_issue4707.py deleted file mode 100644 index 
d9798ef84..000000000 --- a/spacy/tests/regression/test_issue4707.py +++ /dev/null @@ -1,20 +0,0 @@ -from spacy.util import load_model_from_path -from spacy.lang.en import English - -from ..util import make_tempdir - - -def test_issue4707(): - """Tests that disabled component names are also excluded from nlp.from_disk - by default when loading a model. - """ - nlp = English() - nlp.add_pipe(nlp.create_pipe("sentencizer")) - nlp.add_pipe(nlp.create_pipe("entity_ruler")) - assert nlp.pipe_names == ["sentencizer", "entity_ruler"] - exclude = ["tokenizer", "sentencizer"] - with make_tempdir() as tmpdir: - nlp.to_disk(tmpdir, exclude=exclude) - new_nlp = load_model_from_path(tmpdir, disable=exclude) - assert "sentencizer" not in new_nlp.pipe_names - assert "entity_ruler" in new_nlp.pipe_names diff --git a/spacy/tests/regression/test_issue4725.py b/spacy/tests/regression/test_issue4725.py deleted file mode 100644 index cdc3c09ca..000000000 --- a/spacy/tests/regression/test_issue4725.py +++ /dev/null @@ -1,41 +0,0 @@ -import pickle -import numpy - -from spacy.lang.en import English -from spacy.vocab import Vocab - -from spacy.tests.util import make_tempdir - - -def test_pickle_ner(): - """ Ensure the pickling of the NER goes well""" - vocab = Vocab(vectors_name="test_vocab_add_vector") - nlp = English(vocab=vocab) - ner = nlp.create_pipe("ner", config={"min_action_freq": 342}) - with make_tempdir() as tmp_path: - with (tmp_path / "ner.pkl").open("wb") as file_: - pickle.dump(ner, file_) - assert ner.cfg["min_action_freq"] == 342 - - with (tmp_path / "ner.pkl").open("rb") as file_: - ner2 = pickle.load(file_) - assert ner2.cfg["min_action_freq"] == 342 - - -def test_issue4725(): - # ensures that this runs correctly and doesn't hang or crash because of the global vectors - # if it does crash, it's usually because of calling 'spawn' for multiprocessing (e.g. 
on Windows) - vocab = Vocab(vectors_name="test_vocab_add_vector") - data = numpy.ndarray((5, 3), dtype="f") - data[0] = 1.0 - data[1] = 2.0 - vocab.set_vector("cat", data[0]) - vocab.set_vector("dog", data[1]) - - nlp = English(vocab=vocab) - ner = nlp.create_pipe("ner") - nlp.add_pipe(ner) - nlp.begin_training() - docs = ["Kurt is in London."] * 10 - for _ in nlp.pipe(docs, batch_size=2, n_process=2): - pass diff --git a/spacy/tests/regression/test_issue4849.py b/spacy/tests/regression/test_issue4849.py deleted file mode 100644 index ddbf6f7a0..000000000 --- a/spacy/tests/regression/test_issue4849.py +++ /dev/null @@ -1,34 +0,0 @@ -from spacy.lang.en import English -from spacy.pipeline import EntityRuler - - -def test_issue4849(): - nlp = English() - - ruler = EntityRuler( - nlp, - patterns=[ - {"label": "PERSON", "pattern": "joe biden", "id": "joe-biden"}, - {"label": "PERSON", "pattern": "bernie sanders", "id": "bernie-sanders"}, - ], - phrase_matcher_attr="LOWER", - ) - - nlp.add_pipe(ruler) - - text = """ - The left is starting to take aim at Democratic front-runner Joe Biden. - Sen. Bernie Sanders joined in her criticism: "There is no 'middle ground' when it comes to climate policy." 
- """ - - # USING 1 PROCESS - count_ents = 0 - for doc in nlp.pipe([text], n_process=1): - count_ents += len([ent for ent in doc.ents if ent.ent_id > 0]) - assert count_ents == 2 - - # USING 2 PROCESSES - count_ents = 0 - for doc in nlp.pipe([text], n_process=2): - count_ents += len([ent for ent in doc.ents if ent.ent_id > 0]) - assert count_ents == 2 diff --git a/spacy/tests/regression/test_issue4903.py b/spacy/tests/regression/test_issue4903.py deleted file mode 100644 index a3dff16aa..000000000 --- a/spacy/tests/regression/test_issue4903.py +++ /dev/null @@ -1,40 +0,0 @@ -from spacy.lang.en import English -from spacy.tokens import Span, Doc - - -class CustomPipe: - name = "my_pipe" - - def __init__(self): - Span.set_extension("my_ext", getter=self._get_my_ext) - Doc.set_extension("my_ext", default=None) - - def __call__(self, doc): - gathered_ext = [] - for sent in doc.sents: - sent_ext = self._get_my_ext(sent) - sent._.set("my_ext", sent_ext) - gathered_ext.append(sent_ext) - - doc._.set("my_ext", "\n".join(gathered_ext)) - - return doc - - @staticmethod - def _get_my_ext(span): - return str(span.end) - - -def test_issue4903(): - # ensures that this runs correctly and doesn't hang or crash on Windows / macOS - - nlp = English() - custom_component = CustomPipe() - nlp.add_pipe(nlp.create_pipe("sentencizer")) - nlp.add_pipe(custom_component, after="sentencizer") - - text = ["I like bananas.", "Do you like them?", "No, I prefer wasabi."] - docs = list(nlp.pipe(text, n_process=2)) - assert docs[0].text == "I like bananas." - assert docs[1].text == "Do you like them?" - assert docs[2].text == "No, I prefer wasabi." 
diff --git a/spacy/tests/regression/test_issue4924.py b/spacy/tests/regression/test_issue4924.py deleted file mode 100644 index c3d3c4326..000000000 --- a/spacy/tests/regression/test_issue4924.py +++ /dev/null @@ -1,8 +0,0 @@ -from spacy.gold import Example -from spacy.language import Language - - -def test_issue4924(): - nlp = Language() - example = Example.from_dict(nlp.make_doc(""), {}) - nlp.evaluate([example]) diff --git a/spacy/tests/regression/test_issue5152.py b/spacy/tests/regression/test_issue5152.py index a9a57746d..3c1cee5c3 100644 --- a/spacy/tests/regression/test_issue5152.py +++ b/spacy/tests/regression/test_issue5152.py @@ -1,6 +1,8 @@ +import pytest from spacy.lang.en import English +@pytest.mark.filterwarnings("ignore::UserWarning") def test_issue5152(): # Test that the comparison between a Span and a Token, goes well # There was a bug when the number of tokens in the span equaled the number of characters in the token (!) @@ -8,7 +10,6 @@ def test_issue5152(): text = nlp("Talk about being boring!") text_var = nlp("Talk of being boring!") y = nlp("Let") - span = text[0:3] # Talk about being span_2 = text[0:3] # Talk about being span_3 = text_var[0:3] # Talk of being diff --git a/spacy/tests/regression/test_issue5230.py b/spacy/tests/regression/test_issue5230.py index 9ffa3862c..86020bf17 100644 --- a/spacy/tests/regression/test_issue5230.py +++ b/spacy/tests/regression/test_issue5230.py @@ -63,7 +63,8 @@ def tagger(): # need to add model for two reasons: # 1. no model leads to error in serialization, # 2. 
the affected line is the one for model serialization - tagger.begin_training(pipeline=nlp.pipeline) + with pytest.warns(UserWarning): + tagger.begin_training(pipeline=nlp.pipeline) return tagger From b6deef80f84567d707c368486c68895f7dbb0aa9 Mon Sep 17 00:00:00 2001 From: Ines Montani Date: Mon, 6 Jul 2020 16:43:45 +0200 Subject: [PATCH 02/21] Fix class to pickling works as expected --- spacy/tests/regression/test_issue4501-5000.py | 46 +++++++++---------- 1 file changed, 23 insertions(+), 23 deletions(-) diff --git a/spacy/tests/regression/test_issue4501-5000.py b/spacy/tests/regression/test_issue4501-5000.py index 9bace8fc7..01d7a1dbb 100644 --- a/spacy/tests/regression/test_issue4501-5000.py +++ b/spacy/tests/regression/test_issue4501-5000.py @@ -244,32 +244,32 @@ def test_issue4849(): assert count_ents == 2 +class CustomPipe: + name = "my_pipe" + + def __init__(self): + Span.set_extension("my_ext", getter=self._get_my_ext) + Doc.set_extension("my_ext", default=None) + + def __call__(self, doc): + gathered_ext = [] + for sent in doc.sents: + sent_ext = self._get_my_ext(sent) + sent._.set("my_ext", sent_ext) + gathered_ext.append(sent_ext) + + doc._.set("my_ext", "\n".join(gathered_ext)) + + return doc + + @staticmethod + def _get_my_ext(span): + return str(span.end) + + def test_issue4903(): """Ensure that this runs correctly and doesn't hang or crash on Windows / macOS.""" - - class CustomPipe: - name = "my_pipe" - - def __init__(self): - Span.set_extension("my_ext", getter=self._get_my_ext) - Doc.set_extension("my_ext", default=None) - - def __call__(self, doc): - gathered_ext = [] - for sent in doc.sents: - sent_ext = self._get_my_ext(sent) - sent._.set("my_ext", sent_ext) - gathered_ext.append(sent_ext) - - doc._.set("my_ext", "\n".join(gathered_ext)) - - return doc - - @staticmethod - def _get_my_ext(span): - return str(span.end) - nlp = English() custom_component = CustomPipe() nlp.add_pipe(nlp.create_pipe("sentencizer")) From 
cc477be952e2f07308cd7dfd16ee37ed0e73dd56 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Mon, 6 Jul 2020 17:39:31 +0200 Subject: [PATCH 03/21] Improve gold-standard alignment (#5711) * Remove previous alignment * Implement better alignment, using ragged data structure * Use pytokenizations for alignment * Fixes * Fixes * Fix overlapping entities in alignment * Fix align split_sents * Update test * Commit align.py * Try to appease setuptools * Fix flake8 * use realistic entities for testing * Update tests for better alignment * Improve alignment heuristic Co-authored-by: svlandeg --- pyproject.toml | 3 +- requirements.txt | 1 + setup.cfg | 1 + setup.py | 3 +- spacy/gold/__init__.py | 2 +- spacy/gold/align.pxd | 8 -- spacy/gold/align.py | 30 ++++++++ spacy/gold/align.pyx | 101 ------------------------ spacy/gold/example.pxd | 3 +- spacy/gold/example.pyx | 131 ++++++++++++++++---------------- spacy/scorer.py | 22 +++--- spacy/tests/test_gold.py | 105 ++++++++++++------------- spacy/tests/test_new_example.py | 2 +- 13 files changed, 167 insertions(+), 245 deletions(-) delete mode 100644 spacy/gold/align.pxd create mode 100644 spacy/gold/align.py delete mode 100644 spacy/gold/align.pyx diff --git a/pyproject.toml b/pyproject.toml index 480c3290e..2c020ef66 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -7,6 +7,7 @@ requires = [ "preshed>=3.0.2,<3.1.0", "murmurhash>=0.28.0,<1.1.0", "thinc>=8.0.0a12,<8.0.0a20", - "blis>=0.4.0,<0.5.0" + "blis>=0.4.0,<0.5.0", + "pytokenizations" ] build-backend = "setuptools.build_meta" diff --git a/requirements.txt b/requirements.txt index cd123e341..3e1329de9 100644 --- a/requirements.txt +++ b/requirements.txt @@ -14,6 +14,7 @@ numpy>=1.15.0 requests>=2.13.0,<3.0.0 tqdm>=4.38.0,<5.0.0 pydantic>=1.3.0,<2.0.0 +pytokenizations # Official Python utilities setuptools packaging diff --git a/setup.cfg b/setup.cfg index 43a74d97e..9793bbb08 100644 --- a/setup.cfg +++ b/setup.cfg @@ -51,6 +51,7 @@ install_requires = numpy>=1.15.0 
requests>=2.13.0,<3.0.0 pydantic>=1.3.0,<2.0.0 + pytokenizations # Official Python utilities setuptools packaging diff --git a/setup.py b/setup.py index 731a19cba..3b43ca2d2 100755 --- a/setup.py +++ b/setup.py @@ -1,11 +1,11 @@ #!/usr/bin/env python +from setuptools import Extension, setup, find_packages import sys import platform from distutils.command.build_ext import build_ext from distutils.sysconfig import get_python_inc import distutils.util from distutils import ccompiler, msvccompiler -from setuptools import Extension, setup, find_packages import numpy from pathlib import Path import shutil @@ -23,7 +23,6 @@ Options.docstrings = True PACKAGES = find_packages() MOD_NAMES = [ - "spacy.gold.align", "spacy.gold.example", "spacy.parts_of_speech", "spacy.strings", diff --git a/spacy/gold/__init__.py b/spacy/gold/__init__.py index 9416bdd81..c8b5fc44d 100644 --- a/spacy/gold/__init__.py +++ b/spacy/gold/__init__.py @@ -1,6 +1,6 @@ from .corpus import Corpus from .example import Example -from .align import align +from .align import Alignment from .iob_utils import iob_to_biluo, biluo_to_iob from .iob_utils import biluo_tags_from_offsets, offsets_from_biluo_tags diff --git a/spacy/gold/align.pxd b/spacy/gold/align.pxd deleted file mode 100644 index ea3615863..000000000 --- a/spacy/gold/align.pxd +++ /dev/null @@ -1,8 +0,0 @@ -cdef class Alignment: - cdef public object cost - cdef public object i2j - cdef public object j2i - cdef public object i2j_multi - cdef public object j2i_multi - cdef public object cand_to_gold - cdef public object gold_to_cand diff --git a/spacy/gold/align.py b/spacy/gold/align.py new file mode 100644 index 000000000..0dd48d4cf --- /dev/null +++ b/spacy/gold/align.py @@ -0,0 +1,30 @@ +from typing import List +import numpy +from thinc.types import Ragged +from dataclasses import dataclass +import tokenizations + + +@dataclass +class Alignment: + x2y: Ragged + y2x: Ragged + + @classmethod + def from_indices(cls, x2y: List[List[int]], y2x: 
List[List[int]]) -> "Alignment": + x2y = _make_ragged(x2y) + y2x = _make_ragged(y2x) + return Alignment(x2y=x2y, y2x=y2x) + + @classmethod + def from_strings(cls, A: List[str], B: List[str]) -> "Alignment": + x2y, y2x = tokenizations.get_alignments(A, B) + return Alignment.from_indices(x2y=x2y, y2x=y2x) + + +def _make_ragged(indices): + lengths = numpy.array([len(x) for x in indices], dtype="i") + flat = [] + for x in indices: + flat.extend(x) + return Ragged(numpy.array(flat, dtype="i"), lengths) diff --git a/spacy/gold/align.pyx b/spacy/gold/align.pyx deleted file mode 100644 index 80ba0346a..000000000 --- a/spacy/gold/align.pyx +++ /dev/null @@ -1,101 +0,0 @@ -import numpy -from ..errors import Errors, AlignmentError - - -cdef class Alignment: - def __init__(self, spacy_words, gold_words): - # Do many-to-one alignment for misaligned tokens. - # If we over-segment, we'll have one gold word that covers a sequence - # of predicted words - # If we under-segment, we'll have one predicted word that covers a - # sequence of gold words. - # If we "mis-segment", we'll have a sequence of predicted words covering - # a sequence of gold words. That's many-to-many -- we don't do that - # except for NER spans where the start and end can be aligned. - cost, i2j, j2i, i2j_multi, j2i_multi = align(spacy_words, gold_words) - self.cost = cost - self.i2j = i2j - self.j2i = j2i - self.i2j_multi = i2j_multi - self.j2i_multi = j2i_multi - self.cand_to_gold = [(j if j >= 0 else None) for j in i2j] - self.gold_to_cand = [(i if i >= 0 else None) for i in j2i] - - -def align(tokens_a, tokens_b): - """Calculate alignment tables between two tokenizations. - - tokens_a (List[str]): The candidate tokenization. - tokens_b (List[str]): The reference tokenization. - RETURNS: (tuple): A 5-tuple consisting of the following information: - * cost (int): The number of misaligned tokens. - * a2b (List[int]): Mapping of indices in `tokens_a` to indices in `tokens_b`. 
- For instance, if `a2b[4] == 6`, that means that `tokens_a[4]` aligns - to `tokens_b[6]`. If there's no one-to-one alignment for a token, - it has the value -1. - * b2a (List[int]): The same as `a2b`, but mapping the other direction. - * a2b_multi (Dict[int, int]): A dictionary mapping indices in `tokens_a` - to indices in `tokens_b`, where multiple tokens of `tokens_a` align to - the same token of `tokens_b`. - * b2a_multi (Dict[int, int]): As with `a2b_multi`, but mapping the other - direction. - """ - tokens_a = _normalize_for_alignment(tokens_a) - tokens_b = _normalize_for_alignment(tokens_b) - cost = 0 - a2b = numpy.empty(len(tokens_a), dtype="i") - b2a = numpy.empty(len(tokens_b), dtype="i") - a2b.fill(-1) - b2a.fill(-1) - a2b_multi = {} - b2a_multi = {} - i = 0 - j = 0 - offset_a = 0 - offset_b = 0 - while i < len(tokens_a) and j < len(tokens_b): - a = tokens_a[i][offset_a:] - b = tokens_b[j][offset_b:] - if a == b: - if offset_a == offset_b == 0: - a2b[i] = j - b2a[j] = i - elif offset_a == 0: - cost += 2 - a2b_multi[i] = j - elif offset_b == 0: - cost += 2 - b2a_multi[j] = i - offset_a = offset_b = 0 - i += 1 - j += 1 - elif a == "": - assert offset_a == 0 - cost += 1 - i += 1 - elif b == "": - assert offset_b == 0 - cost += 1 - j += 1 - elif b.startswith(a): - cost += 1 - if offset_a == 0: - a2b_multi[i] = j - i += 1 - offset_a = 0 - offset_b += len(a) - elif a.startswith(b): - cost += 1 - if offset_b == 0: - b2a_multi[j] = i - j += 1 - offset_b = 0 - offset_a += len(b) - else: - assert "".join(tokens_a) != "".join(tokens_b) - raise AlignmentError(Errors.E186.format(tok_a=tokens_a, tok_b=tokens_b)) - return cost, a2b, b2a, a2b_multi, b2a_multi - - -def _normalize_for_alignment(tokens): - return [w.replace(" ", "").lower() for w in tokens] diff --git a/spacy/gold/example.pxd b/spacy/gold/example.pxd index 736969ecd..1f63b12d0 100644 --- a/spacy/gold/example.pxd +++ b/spacy/gold/example.pxd @@ -1,8 +1,7 @@ from ..tokens.doc cimport Doc -from .align cimport 
Alignment cdef class Example: cdef readonly Doc x cdef readonly Doc y - cdef readonly Alignment _alignment + cdef readonly object _alignment diff --git a/spacy/gold/example.pyx b/spacy/gold/example.pyx index 7b629dcd2..ce1a0928b 100644 --- a/spacy/gold/example.pyx +++ b/spacy/gold/example.pyx @@ -6,10 +6,9 @@ from ..tokens.doc cimport Doc from ..tokens.span cimport Span from ..tokens.span import Span from ..attrs import IDS -from .align cimport Alignment +from .align import Alignment from .iob_utils import biluo_to_iob, biluo_tags_from_offsets, biluo_tags_from_doc from .iob_utils import spans_from_biluo_tags -from .align import Alignment from ..errors import Errors, Warnings from ..syntax import nonproj @@ -28,7 +27,7 @@ cpdef Doc annotations2doc(vocab, tok_annot, doc_annot): cdef class Example: - def __init__(self, Doc predicted, Doc reference, *, Alignment alignment=None): + def __init__(self, Doc predicted, Doc reference, *, alignment=None): """ Doc can either be text, or an actual Doc """ if predicted is None: raise TypeError(Errors.E972.format(arg="predicted")) @@ -83,34 +82,38 @@ cdef class Example: gold_words = [token.orth_ for token in self.reference] if gold_words == []: gold_words = spacy_words - self._alignment = Alignment(spacy_words, gold_words) + self._alignment = Alignment.from_strings(spacy_words, gold_words) return self._alignment def get_aligned(self, field, as_string=False): """Return an aligned array for a token attribute.""" - i2j_multi = self.alignment.i2j_multi - cand_to_gold = self.alignment.cand_to_gold + align = self.alignment.x2y vocab = self.reference.vocab gold_values = self.reference.to_array([field]) output = [None] * len(self.predicted) - for i, gold_i in enumerate(cand_to_gold): - if self.predicted[i].text.isspace(): - output[i] = None - if gold_i is None: - if i in i2j_multi: - output[i] = gold_values[i2j_multi[i]] - else: - output[i] = None + for token in self.predicted: + if token.is_space: + output[token.i] = None else: - 
output[i] = gold_values[gold_i] + values = gold_values[align[token.i].dataXd] + values = values.ravel() + if len(values) == 0: + output[token.i] = None + elif len(values) == 1: + output[token.i] = values[0] + elif len(set(list(values))) == 1: + # If all aligned tokens have the same value, use it. + output[token.i] = values[0] + else: + output[token.i] = None if as_string and field not in ["ENT_IOB", "SENT_START"]: output = [vocab.strings[o] if o is not None else o for o in output] return output def get_aligned_parse(self, projectivize=True): - cand_to_gold = self.alignment.cand_to_gold - gold_to_cand = self.alignment.gold_to_cand + cand_to_gold = self.alignment.x2y + gold_to_cand = self.alignment.y2x aligned_heads = [None] * self.x.length aligned_deps = [None] * self.x.length heads = [token.head.i for token in self.y] @@ -118,52 +121,51 @@ cdef class Example: if projectivize: heads, deps = nonproj.projectivize(heads, deps) for cand_i in range(self.x.length): - gold_i = cand_to_gold[cand_i] - if gold_i is not None: # Alignment found - gold_head = gold_to_cand[heads[gold_i]] - if gold_head is not None: - aligned_heads[cand_i] = gold_head + if cand_to_gold.lengths[cand_i] == 1: + gold_i = cand_to_gold[cand_i].dataXd[0, 0] + if gold_to_cand.lengths[heads[gold_i]] == 1: + aligned_heads[cand_i] = int(gold_to_cand[heads[gold_i]].dataXd[0, 0]) aligned_deps[cand_i] = deps[gold_i] return aligned_heads, aligned_deps + def get_aligned_spans_x2y(self, x_spans): + return self._get_aligned_spans(self.y, x_spans, self.alignment.x2y) + + def get_aligned_spans_y2x(self, y_spans): + return self._get_aligned_spans(self.x, y_spans, self.alignment.y2x) + + def _get_aligned_spans(self, doc, spans, align): + seen = set() + output = [] + for span in spans: + indices = align[span.start : span.end].data.ravel() + indices = [idx for idx in indices if idx not in seen] + if len(indices) >= 1: + aligned_span = Span(doc, indices[0], indices[-1] + 1, label=span.label) + target_text = 
span.text.lower().strip().replace(" ", "") + our_text = aligned_span.text.lower().strip().replace(" ", "") + if our_text == target_text: + output.append(aligned_span) + seen.update(indices) + return output + def get_aligned_ner(self): if not self.y.is_nered: return [None] * len(self.x) # should this be 'missing' instead of 'None' ? - x_text = self.x.text - # Get a list of entities, and make spans for non-entity tokens. - # We then work through the spans in order, trying to find them in - # the text and using that to get the offset. Any token that doesn't - # get a tag set this way is tagged None. - # This could maybe be improved? It at least feels easy to reason about. - y_spans = list(self.y.ents) - y_spans.sort() - x_text_offset = 0 - x_spans = [] - for y_span in y_spans: - if x_text.count(y_span.text) >= 1: - start_char = x_text.index(y_span.text) + x_text_offset - end_char = start_char + len(y_span.text) - x_span = self.x.char_span(start_char, end_char, label=y_span.label) - if x_span is not None: - x_spans.append(x_span) - x_text = self.x.text[end_char:] - x_text_offset = end_char + x_ents = self.get_aligned_spans_y2x(self.y.ents) + # Default to 'None' for missing values x_tags = biluo_tags_from_offsets( self.x, - [(e.start_char, e.end_char, e.label_) for e in x_spans], + [(e.start_char, e.end_char, e.label_) for e in x_ents], missing=None ) - gold_to_cand = self.alignment.gold_to_cand - for token in self.y: - if token.ent_iob_ == "O": - cand_i = gold_to_cand[token.i] - if cand_i is not None and x_tags[cand_i] is None: - x_tags[cand_i] = "O" - i2j_multi = self.alignment.i2j_multi - for i, tag in enumerate(x_tags): - if tag is None and i in i2j_multi: - gold_i = i2j_multi[i] - if gold_i is not None and self.y[gold_i].ent_iob_ == "O": + # Now fill the tokens we can align to O. 
+ O = 2 # I=1, O=2, B=3 + for i, ent_iob in enumerate(self.get_aligned("ENT_IOB")): + if x_tags[i] is None: + if ent_iob == O: + x_tags[i] = "O" + elif self.x[i].is_space: x_tags[i] = "O" return x_tags @@ -194,25 +196,22 @@ cdef class Example: links[(ent.start_char, ent.end_char)] = {ent.kb_id_: 1.0} return links - def split_sents(self): """ Split the token annotations into multiple Examples based on sent_starts and return a list of the new Examples""" if not self.reference.is_sentenced: return [self] - - sent_starts = self.get_aligned("SENT_START") - sent_starts.append(1) # appending virtual start of a next sentence to facilitate search - + + align = self.alignment.y2x + seen_indices = set() output = [] - pred_start = 0 - for sent in self.reference.sents: - new_ref = sent.as_doc() - pred_end = sent_starts.index(1, pred_start+1) # find where the next sentence starts - new_pred = self.predicted[pred_start : pred_end].as_doc() - output.append(Example(new_pred, new_ref)) - pred_start = pred_end - + for y_sent in self.reference.sents: + indices = align[y_sent.start : y_sent.end].data.ravel() + indices = [idx for idx in indices if idx not in seen_indices] + if indices: + x_sent = self.predicted[indices[0] : indices[-1] + 1] + output.append(Example(x_sent.as_doc(), y_sent.as_doc())) + seen_indices.update(indices) return output property text: diff --git a/spacy/scorer.py b/spacy/scorer.py index 87033d234..6fc86e412 100644 --- a/spacy/scorer.py +++ b/spacy/scorer.py @@ -326,10 +326,11 @@ class Scorer(object): for token in doc: if token.orth_.isspace(): continue - gold_i = align.cand_to_gold[token.i] - if gold_i is None: + if align.x2y.lengths[token.i] != 1: self.tokens.fp += 1 + gold_i = None else: + gold_i = align.x2y[token.i].dataXd[0, 0] self.tokens.tp += 1 cand_tags.add((gold_i, token.tag_)) cand_pos.add((gold_i, token.pos_)) @@ -345,7 +346,10 @@ class Scorer(object): if token.is_sent_start: cand_sent_starts.add(gold_i) if token.dep_.lower() not in punct_labels and 
token.orth_.strip(): - gold_head = align.cand_to_gold[token.head.i] + if align.x2y.lengths[token.head.i] == 1: + gold_head = align.x2y[token.head.i].dataXd[0, 0] + else: + gold_head = None # None is indistinct, so we can't just add it to the set # Multiple (None, None) deps are possible if gold_i is None or gold_head is None: @@ -381,15 +385,9 @@ class Scorer(object): gold_ents.add(gold_ent) gold_per_ents[ent.label_].add((ent.label_, ent.start, ent.end - 1)) cand_per_ents = {ent_label: set() for ent_label in ent_labels} - for ent in doc.ents: - first = align.cand_to_gold[ent.start] - last = align.cand_to_gold[ent.end - 1] - if first is None or last is None: - self.ner.fp += 1 - self.ner_per_ents[ent.label_].fp += 1 - else: - cand_ents.add((ent.label_, first, last)) - cand_per_ents[ent.label_].add((ent.label_, first, last)) + for ent in example.get_aligned_spans_x2y(doc.ents): + cand_ents.add((ent.label_, ent.start, ent.end - 1)) + cand_per_ents[ent.label_].add((ent.label_, ent.start, ent.end - 1)) # Scores per ent for k, v in self.ner_per_ents.items(): if k in cand_per_ents: diff --git a/spacy/tests/test_gold.py b/spacy/tests/test_gold.py index cd354ff92..24f2bbc13 100644 --- a/spacy/tests/test_gold.py +++ b/spacy/tests/test_gold.py @@ -1,6 +1,6 @@ from spacy.errors import AlignmentError from spacy.gold import biluo_tags_from_offsets, offsets_from_biluo_tags -from spacy.gold import spans_from_biluo_tags, iob_to_biluo, align +from spacy.gold import spans_from_biluo_tags, iob_to_biluo from spacy.gold import Corpus, docs_to_json from spacy.gold.example import Example from spacy.gold.converters import json2docs @@ -271,75 +271,76 @@ def test_split_sentences(en_vocab): assert split_examples[1].text == "had loads of fun " -@pytest.mark.xfail(reason="Alignment should be fixed after example refactor") def test_gold_biluo_one_to_many(en_vocab, en_tokenizer): - words = ["I", "flew to", "San Francisco Valley", "."] - spaces = [True, True, False, False] + words = ["Mr. 
and ", "Mrs. Smith", "flew to", "San Francisco Valley", "."] + spaces = [True, True, True, False, False] doc = Doc(en_vocab, words=words, spaces=spaces) - entities = [(len("I flew to "), len("I flew to San Francisco Valley"), "LOC")] - gold_words = ["I", "flew", "to", "San", "Francisco", "Valley", "."] + prefix = "Mr. and Mrs. Smith flew to " + entities = [(len(prefix), len(prefix + "San Francisco Valley"), "LOC")] + gold_words = ["Mr. and Mrs. Smith", "flew", "to", "San", "Francisco", "Valley", "."] example = Example.from_dict(doc, {"words": gold_words, "entities": entities}) ner_tags = example.get_aligned_ner() - assert ner_tags == ["O", "O", "U-LOC", "O"] + assert ner_tags == ["O", "O", "O", "U-LOC", "O"] entities = [ - (len("I "), len("I flew to"), "ORG"), - (len("I flew to "), len("I flew to San Francisco Valley"), "LOC"), + (len("Mr. and "), len("Mr. and Mrs. Smith"), "PERSON"), # "Mrs. Smith" is a PERSON + (len(prefix), len(prefix + "San Francisco Valley"), "LOC"), ] - gold_words = ["I", "flew", "to", "San", "Francisco", "Valley", "."] + gold_words = ["Mr. and", "Mrs.", "Smith", "flew", "to", "San", "Francisco", "Valley", "."] example = Example.from_dict(doc, {"words": gold_words, "entities": entities}) ner_tags = example.get_aligned_ner() - assert ner_tags == ["O", "U-ORG", "U-LOC", "O"] + assert ner_tags == ["O", "U-PERSON", "O", "U-LOC", "O"] entities = [ - (len("I "), len("I flew"), "ORG"), - (len("I flew to "), len("I flew to San Francisco Valley"), "LOC"), + (len("Mr. and "), len("Mr. and Mrs."), "PERSON"), # "Mrs." is a Person + (len(prefix), len(prefix + "San Francisco Valley"), "LOC"), ] - gold_words = ["I", "flew", "to", "San", "Francisco", "Valley", "."] + gold_words = ["Mr. 
and", "Mrs.", "Smith", "flew", "to", "San", "Francisco", "Valley", "."] example = Example.from_dict(doc, {"words": gold_words, "entities": entities}) ner_tags = example.get_aligned_ner() - assert ner_tags == ["O", None, "U-LOC", "O"] + assert ner_tags == ["O", None, "O", "U-LOC", "O"] def test_gold_biluo_many_to_one(en_vocab, en_tokenizer): - words = ["I", "flew", "to", "San", "Francisco", "Valley", "."] + words = ["Mr. and", "Mrs.", "Smith", "flew", "to", "San", "Francisco", "Valley", "."] + spaces = [True, True, True, True, True, True, True, False, False] + doc = Doc(en_vocab, words=words, spaces=spaces) + prefix = "Mr. and Mrs. Smith flew to " + entities = [(len(prefix), len(prefix + "San Francisco Valley"), "LOC")] + gold_words = ["Mr. and Mrs. Smith", "flew to", "San Francisco Valley", "."] + example = Example.from_dict(doc, {"words": gold_words, "entities": entities}) + ner_tags = example.get_aligned_ner() + assert ner_tags == ["O", "O", "O", "O", "O", "B-LOC", "I-LOC", "L-LOC", "O"] + + entities = [ + (len("Mr. and "), len("Mr. and Mrs. Smith"), "PERSON"), # "Mrs. Smith" is a PERSON + (len(prefix), len(prefix + "San Francisco Valley"), "LOC"), + ] + gold_words = ["Mr. and", "Mrs. Smith", "flew to", "San Francisco Valley", "."] + example = Example.from_dict(doc, {"words": gold_words, "entities": entities}) + ner_tags = example.get_aligned_ner() + assert ner_tags == ["O", "B-PERSON", "L-PERSON", "O", "O", "B-LOC", "I-LOC", "L-LOC", "O"] + + +def test_gold_biluo_misaligned(en_vocab, en_tokenizer): + words = ["Mr. and Mrs.", "Smith", "flew", "to", "San Francisco", "Valley", "."] spaces = [True, True, True, True, True, False, False] doc = Doc(en_vocab, words=words, spaces=spaces) - entities = [(len("I flew to "), len("I flew to San Francisco Valley"), "LOC")] - gold_words = ["I", "flew to", "San Francisco Valley", "."] + prefix = "Mr. and Mrs. 
Smith flew to " + entities = [(len(prefix), len(prefix + "San Francisco Valley"), "LOC")] + gold_words = ["Mr.", "and Mrs. Smith", "flew to", "San", "Francisco Valley", "."] example = Example.from_dict(doc, {"words": gold_words, "entities": entities}) ner_tags = example.get_aligned_ner() - assert ner_tags == ["O", "O", "O", "B-LOC", "I-LOC", "L-LOC", "O"] + assert ner_tags == ["O", "O", "O", "O", "B-LOC", "L-LOC", "O"] entities = [ - (len("I "), len("I flew to"), "ORG"), - (len("I flew to "), len("I flew to San Francisco Valley"), "LOC"), + (len("Mr. and "), len("Mr. and Mrs. Smith"), "PERSON"), # "Mrs. Smith" is a PERSON + (len(prefix), len(prefix + "San Francisco Valley"), "LOC"), ] - gold_words = ["I", "flew to", "San Francisco Valley", "."] + gold_words = ["Mr. and", "Mrs. Smith", "flew to", "San", "Francisco Valley", "."] example = Example.from_dict(doc, {"words": gold_words, "entities": entities}) ner_tags = example.get_aligned_ner() - assert ner_tags == ["O", "B-ORG", "L-ORG", "B-LOC", "I-LOC", "L-LOC", "O"] - - -@pytest.mark.xfail(reason="Alignment should be fixed after example refactor") -def test_gold_biluo_misaligned(en_vocab, en_tokenizer): - words = ["I flew", "to", "San Francisco", "Valley", "."] - spaces = [True, True, True, False, False] - doc = Doc(en_vocab, words=words, spaces=spaces) - entities = [(len("I flew to "), len("I flew to San Francisco Valley"), "LOC")] - gold_words = ["I", "flew to", "San", "Francisco Valley", "."] - example = Example.from_dict(doc, {"words": gold_words, "entities": entities}) - ner_tags = example.get_aligned_ner() - assert ner_tags == ["O", "O", "B-LOC", "L-LOC", "O"] - - entities = [ - (len("I "), len("I flew to"), "ORG"), - (len("I flew to "), len("I flew to San Francisco Valley"), "LOC"), - ] - gold_words = ["I", "flew to", "San", "Francisco Valley", "."] - example = Example.from_dict(doc, {"words": gold_words, "entities": entities}) - ner_tags = example.get_aligned_ner() - assert ner_tags == [None, None, "B-LOC", 
"L-LOC", "O"] + assert ner_tags == [None, None, "O", "O", "B-LOC", "L-LOC", "O"] def test_gold_biluo_additional_whitespace(en_vocab, en_tokenizer): @@ -349,7 +350,8 @@ def test_gold_biluo_additional_whitespace(en_vocab, en_tokenizer): "I flew to San Francisco Valley.", ) doc = Doc(en_vocab, words=words, spaces=spaces) - entities = [(len("I flew to "), len("I flew to San Francisco Valley"), "LOC")] + prefix = "I flew to " + entities = [(len(prefix), len(prefix + "San Francisco Valley"), "LOC")] gold_words = ["I", "flew", " ", "to", "San Francisco Valley", "."] gold_spaces = [True, True, False, True, False, False] example = Example.from_dict( @@ -514,6 +516,7 @@ def test_make_orth_variants(doc): make_orth_variants_example(nlp, train_example, orth_variant_level=0.2) +@pytest.mark.skip("Outdated") @pytest.mark.parametrize( "tokens_a,tokens_b,expected", [ @@ -537,12 +540,12 @@ def test_make_orth_variants(doc): ([" ", "a"], ["a"], (1, [-1, 0], [1], {}, {})), ], ) -def test_align(tokens_a, tokens_b, expected): - cost, a2b, b2a, a2b_multi, b2a_multi = align(tokens_a, tokens_b) - assert (cost, list(a2b), list(b2a), a2b_multi, b2a_multi) == expected +def test_align(tokens_a, tokens_b, expected): # noqa + cost, a2b, b2a, a2b_multi, b2a_multi = align(tokens_a, tokens_b) # noqa + assert (cost, list(a2b), list(b2a), a2b_multi, b2a_multi) == expected # noqa # check symmetry - cost, a2b, b2a, a2b_multi, b2a_multi = align(tokens_b, tokens_a) - assert (cost, list(b2a), list(a2b), b2a_multi, a2b_multi) == expected + cost, a2b, b2a, a2b_multi, b2a_multi = align(tokens_b, tokens_a) # noqa + assert (cost, list(b2a), list(a2b), b2a_multi, a2b_multi) == expected # noqa def test_goldparse_startswith_space(en_tokenizer): @@ -556,7 +559,7 @@ def test_goldparse_startswith_space(en_tokenizer): doc, {"words": gold_words, "entities": entities, "deps": deps, "heads": heads} ) ner_tags = example.get_aligned_ner() - assert ner_tags == [None, "U-DATE"] + assert ner_tags == ["O", "U-DATE"] assert 
example.get_aligned("DEP", as_string=True) == [None, "ROOT"] diff --git a/spacy/tests/test_new_example.py b/spacy/tests/test_new_example.py index 58eab4a54..f858b0759 100644 --- a/spacy/tests/test_new_example.py +++ b/spacy/tests/test_new_example.py @@ -55,7 +55,7 @@ def test_aligned_tags(): predicted = Doc(vocab, words=pred_words) example = Example.from_dict(predicted, annots) aligned_tags = example.get_aligned("tag", as_string=True) - assert aligned_tags == ["VERB", "DET", None, "SCONJ", "PRON", "VERB", "VERB"] + assert aligned_tags == ["VERB", "DET", "NOUN", "SCONJ", "PRON", "VERB", "VERB"] def test_aligned_tags_multi(): From 19d42f42de30ba57e17427798ea2562cdab2c9f8 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Mon, 6 Jul 2020 17:43:12 +0200 Subject: [PATCH 04/21] Set version to v3.0.0a2 --- spacy/about.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/spacy/about.py b/spacy/about.py index 5b2a89c61..057e21c87 100644 --- a/spacy/about.py +++ b/spacy/about.py @@ -1,6 +1,6 @@ # fmt: off __title__ = "spacy-nightly" -__version__ = "3.0.0a1" +__version__ = "3.0.0a2" __release__ = True __download_url__ = "https://github.com/explosion/spacy-models/releases/download" __compatibility__ = "https://raw.githubusercontent.com/explosion/spacy-models/master/compatibility.json" From 709fc5e4ade928a779df3db787056e8e80ed4a57 Mon Sep 17 00:00:00 2001 From: Matthw Honnibal Date: Mon, 6 Jul 2020 17:50:21 +0200 Subject: [PATCH 05/21] Clarify dropout and seed in Tok2Vec --- spacy/ml/models/tok2vec.py | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/spacy/ml/models/tok2vec.py b/spacy/ml/models/tok2vec.py index d2b70c36e..f1a9c7d1f 100644 --- a/spacy/ml/models/tok2vec.py +++ b/spacy/ml/models/tok2vec.py @@ -263,20 +263,20 @@ def build_Tok2Vec_model( cols = [ID, NORM, PREFIX, SUFFIX, SHAPE, ORTH] with Model.define_operators({">>": chain, "|": concatenate, "**": clone}): norm = HashEmbed( - nO=width, nV=embed_size, 
column=cols.index(NORM), dropout=dropout, + nO=width, nV=embed_size, column=cols.index(NORM), dropout=None, seed=0 ) if subword_features: prefix = HashEmbed( - nO=width, nV=embed_size // 2, column=cols.index(PREFIX), dropout=dropout, + nO=width, nV=embed_size // 2, column=cols.index(PREFIX), dropout=None, seed=1 ) suffix = HashEmbed( - nO=width, nV=embed_size // 2, column=cols.index(SUFFIX), dropout=dropout, + nO=width, nV=embed_size // 2, column=cols.index(SUFFIX), dropout=None, seed=2 ) shape = HashEmbed( - nO=width, nV=embed_size // 2, column=cols.index(SHAPE), dropout=dropout, + nO=width, nV=embed_size // 2, column=cols.index(SHAPE), dropout=None, seed=3 ) else: @@ -296,7 +296,7 @@ def build_Tok2Vec_model( >> Maxout( nO=width, nI=width * columns, - nP=maxout_pieces, + nP=3, dropout=0.0, normalize=True, ), @@ -309,7 +309,7 @@ def build_Tok2Vec_model( >> Maxout( nO=width, nI=width * columns, - nP=maxout_pieces, + nP=3, dropout=0.0, normalize=True, ), @@ -322,7 +322,7 @@ def build_Tok2Vec_model( >> Maxout( nO=width, nI=width * columns, - nP=maxout_pieces, + nP=3, dropout=0.0, normalize=True, ), @@ -335,7 +335,7 @@ def build_Tok2Vec_model( reduce_dimensions = Maxout( nO=width, nI=nM * nC + width, - nP=maxout_pieces, + nP=3, dropout=0.0, normalize=True, ) From f25761e513559fc8d72fae1e27fead309491f76e Mon Sep 17 00:00:00 2001 From: Matthw Honnibal Date: Mon, 6 Jul 2020 17:51:25 +0200 Subject: [PATCH 06/21] Dont randomize cuts in parser --- spacy/syntax/nn_parser.pyx | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/spacy/syntax/nn_parser.pyx b/spacy/syntax/nn_parser.pyx index 0295241c6..1732805a9 100644 --- a/spacy/syntax/nn_parser.pyx +++ b/spacy/syntax/nn_parser.pyx @@ -280,11 +280,12 @@ cdef class Parser: [eg.predicted for eg in examples]) if self.cfg["update_with_oracle_cut_size"] >= 1: # Chop sequences into lengths of this many transitions, to make the - # batch uniform length. We randomize this to overfit less. + # batch uniform length. 
+ # We used to randomize this, but it's not clear that actually helps? cut_size = self.cfg["update_with_oracle_cut_size"] states, golds, max_steps = self._init_gold_batch( examples, - max_length=numpy.random.choice(range(5, cut_size)) + max_length=cut_size ) else: states, golds, _ = self.moves.init_gold_batch(examples) From 1eb1654941e8a3dd81d306f621985af2c3ec7ddd Mon Sep 17 00:00:00 2001 From: Matthw Honnibal Date: Mon, 6 Jul 2020 17:51:37 +0200 Subject: [PATCH 07/21] Update configs --- examples/experiments/onto-ner.cfg | 32 +++++++++++++++---------------- 1 file changed, 15 insertions(+), 17 deletions(-) diff --git a/examples/experiments/onto-ner.cfg b/examples/experiments/onto-ner.cfg index 48fe25a67..8970bb3c0 100644 --- a/examples/experiments/onto-ner.cfg +++ b/examples/experiments/onto-ner.cfg @@ -9,12 +9,12 @@ max_length = 5000 limit = 0 # Data augmentation orth_variant_level = 0.0 -dropout = 0.2 +dropout = 0.1 # Controls early-stopping. 0 or -1 mean unlimited. -patience = 1600 +patience = 100000 max_epochs = 0 -max_steps = 20000 -eval_frequency = 500 +max_steps = 100000 +eval_frequency = 2000 # Other settings seed = 0 accumulate_gradient = 1 @@ -30,25 +30,25 @@ omit_extra_lookups = false [training.batch_size] @schedules = "compounding.v1" start = 100 -stop = 1000 +stop = 2000 compound = 1.001 [training.optimizer] @optimizers = "Adam.v1" beta1 = 0.9 beta2 = 0.999 -L2_is_weight_decay = false -L2 = 1e-6 +L2_is_weight_decay = true +L2 = 0.0 grad_clip = 1.0 use_averages = true eps = 1e-8 learn_rate = 0.001 -#[optimizer.learn_rate] +#[training.optimizer.learn_rate] #@schedules = "warmup_linear.v1" -#warmup_steps = 250 -#total_steps = 20000 -#initial_rate = 0.001 +#warmup_steps = 1000 +#total_steps = 50000 +#initial_rate = 0.003 [nlp] lang = "en" @@ -58,23 +58,21 @@ vectors = null factory = "ner" learn_tokens = false min_action_freq = 1 -beam_width = 1 -beam_update_prob = 1.0 [nlp.pipeline.ner.model] @architectures = "spacy.TransitionBasedParser.v1" 
nr_feature_tokens = 3 hidden_width = 64 maxout_pieces = 2 -use_upper = true +use_upper = false [nlp.pipeline.ner.model.tok2vec] @architectures = "spacy.HashEmbedCNN.v1" pretrained_vectors = ${nlp:vectors} -width = 96 +width = 300 depth = 4 window_size = 1 -embed_size = 2000 -maxout_pieces = 3 +embed_size = 7000 +maxout_pieces = 1 subword_features = true dropout = ${training:dropout} From d1fd3438c31a3be94c111cdedd1a3c3a92c66b05 Mon Sep 17 00:00:00 2001 From: Matthw Honnibal Date: Tue, 7 Jul 2020 01:38:15 +0200 Subject: [PATCH 08/21] Add dropout to parser hidden layer --- spacy/ml/_precomputable_affine.py | 3 ++- spacy/ml/tb_framework.py | 2 +- spacy/syntax/_parser_model.pyx | 14 ++++++++++++-- 3 files changed, 15 insertions(+), 4 deletions(-) diff --git a/spacy/ml/_precomputable_affine.py b/spacy/ml/_precomputable_affine.py index 3b5f09e7b..20d5fb3fb 100644 --- a/spacy/ml/_precomputable_affine.py +++ b/spacy/ml/_precomputable_affine.py @@ -1,13 +1,14 @@ from thinc.api import Model, normal_init -def PrecomputableAffine(nO, nI, nF, nP): +def PrecomputableAffine(nO, nI, nF, nP, dropout=0.1): model = Model( "precomputable_affine", forward, init=init, dims={"nO": nO, "nI": nI, "nF": nF, "nP": nP}, params={"W": None, "b": None, "pad": None}, + attrs={"dropout_rate": dropout} ) return model diff --git a/spacy/ml/tb_framework.py b/spacy/ml/tb_framework.py index 88f27f0bf..39d4b0a14 100644 --- a/spacy/ml/tb_framework.py +++ b/spacy/ml/tb_framework.py @@ -2,7 +2,7 @@ from thinc.api import Model, noop, use_ops, Linear from ..syntax._parser_model import ParserStepModel -def TransitionModel(tok2vec, lower, upper, unseen_classes=set()): +def TransitionModel(tok2vec, lower, upper, dropout=0.2, unseen_classes=set()): """Set up a stepwise transition-based model""" if upper is None: has_upper = False diff --git a/spacy/syntax/_parser_model.pyx b/spacy/syntax/_parser_model.pyx index 853facdc6..42baa737b 100644 --- a/spacy/syntax/_parser_model.pyx +++ b/spacy/syntax/_parser_model.pyx 
@@ -219,9 +219,11 @@ cdef int arg_max_if_valid(const weight_t* scores, const int* is_valid, int n) no class ParserStepModel(Model): - def __init__(self, docs, layers, *, has_upper, unseen_classes=None, train=True): + def __init__(self, docs, layers, *, has_upper, unseen_classes=None, train=True, + dropout=0.1): Model.__init__(self, name="parser_step_model", forward=step_forward) self.attrs["has_upper"] = has_upper + self.attrs["dropout_rate"] = dropout self.tokvecs, self.bp_tokvecs = layers[0](docs, is_train=train) if layers[1].get_dim("nP") >= 2: activation = "maxout" @@ -289,11 +291,17 @@ class ParserStepModel(Model): self.bp_tokvecs(d_tokvecs[:-1]) return d_tokvecs +NUMPY_OPS = NumpyOps() def step_forward(model: ParserStepModel, states, is_train): token_ids = model.get_token_ids(states) vector, get_d_tokvecs = model.state2vec(token_ids, is_train) + mask = None if model.attrs["has_upper"]: + dropout_rate = model.attrs["dropout_rate"] + if is_train and dropout_rate > 0: + mask = NUMPY_OPS.get_dropout_mask(vector.shape, 0.1) + vector *= mask scores, get_d_vector = model.vec2scores(vector, is_train) else: scores = NumpyOps().asarray(vector) @@ -305,6 +313,8 @@ def step_forward(model: ParserStepModel, states, is_train): # Zero vectors for unseen classes d_scores *= model._class_mask d_vector = get_d_vector(d_scores) + if mask is not None: + d_vector *= mask if isinstance(model.state2vec.ops, CupyOps) \ and not isinstance(token_ids, model.state2vec.ops.xp.ndarray): # Move token_ids and d_vector to GPU, asynchronously @@ -437,7 +447,7 @@ cdef class precompute_hiddens: sum_state_features(state_vector.data, feat_weights, &ids[0,0], token_ids.shape[0], self.nF, self.nO*self.nP) - state_vector = state_vector + self.bias + state_vector += self.bias state_vector, bp_nonlinearity = self._nonlinearity(state_vector) def backward(d_state_vector_ids): From a4164f67cac6388b16707e6c7dcc9100cd8926e7 Mon Sep 17 00:00:00 2001 From: Matthw Honnibal Date: Tue, 7 Jul 2020 17:21:58 +0200 
Subject: [PATCH 09/21] Don't normalize gradients --- spacy/pipeline/pipes.pyx | 2 +- spacy/syntax/nn_parser.pyx | 17 +++++------------ 2 files changed, 6 insertions(+), 13 deletions(-) diff --git a/spacy/pipeline/pipes.pyx b/spacy/pipeline/pipes.pyx index 61cf155a2..2b147785e 100644 --- a/spacy/pipeline/pipes.pyx +++ b/spacy/pipeline/pipes.pyx @@ -334,7 +334,7 @@ class Tagger(Pipe): losses[self.name] += (gradient**2).sum() def get_loss(self, examples, scores): - loss_func = SequenceCategoricalCrossentropy(names=self.labels) + loss_func = SequenceCategoricalCrossentropy(names=self.labels, normalize=False) truths = [eg.get_aligned("tag", as_string=True) for eg in examples] d_scores, loss = loss_func(scores, truths) if self.model.ops.xp.isnan(loss): diff --git a/spacy/syntax/nn_parser.pyx b/spacy/syntax/nn_parser.pyx index 1732805a9..19d424823 100644 --- a/spacy/syntax/nn_parser.pyx +++ b/spacy/syntax/nn_parser.pyx @@ -65,7 +65,6 @@ cdef class Parser: self.set_output(self.moves.n_moves) self.cfg = dict(cfg) self.cfg.setdefault("update_with_oracle_cut_size", 100) - self.cfg.setdefault("normalize_gradients_with_batch_size", True) self._multitasks = [] for multitask in cfg.get("multitasks", []): self.add_multitask_objective(multitask) @@ -300,17 +299,10 @@ cdef class Parser: states, golds = zip(*states_golds) scores, backprop = model.begin_update(states) d_scores = self.get_batch_loss(states, golds, scores, losses) - if self.cfg["normalize_gradients_with_batch_size"]: - # We have to be very careful how we do this, because of the way we - # cut up the batch. We subdivide long sequences. If we normalize - # naively, we end up normalizing by sequence length, which - # is bad: that would mean that states in long sequences - # consistently get smaller gradients. Imagine if we have two - # sequences, one length 1000, one length 20. If we cut up - # the 1k sequence so that we have a "batch" of 50 subsequences, - # we don't want the gradients to get 50 times smaller! 
- d_scores /= n_examples - + # Note that the gradient isn't normalized by the batch size + # here, because our "samples" are really the states...But we + # can't normalize by the number of states either, as then we'd + # be getting smaller gradients for states in long sequences. backprop(d_scores) # Follow the predicted action self.transition_states(states, scores) @@ -408,6 +400,7 @@ cdef class Parser: cpu_log_loss(c_d_scores, costs, is_valid, &scores[i, 0], d_scores.shape[1]) c_d_scores += d_scores.shape[1] + # Note that we don't normalize this. See comment in update() for why. if losses is not None: losses.setdefault(self.name, 0.) losses[self.name] += (d_scores**2).sum() From 433dc3c9c98de097c6f11debf85bcad47b23f9c6 Mon Sep 17 00:00:00 2001 From: Matthw Honnibal Date: Tue, 7 Jul 2020 17:22:47 +0200 Subject: [PATCH 10/21] Simplify PrecomputableAffine slightly --- spacy/ml/_precomputable_affine.py | 9 +++------ 1 file changed, 3 insertions(+), 6 deletions(-) diff --git a/spacy/ml/_precomputable_affine.py b/spacy/ml/_precomputable_affine.py index 20d5fb3fb..a3e2633e9 100644 --- a/spacy/ml/_precomputable_affine.py +++ b/spacy/ml/_precomputable_affine.py @@ -49,17 +49,14 @@ def forward(model, X, is_train): model.inc_grad("b", dY.sum(axis=0)) dY = dY.reshape((dY.shape[0], nO * nP)) - Wopfi = model.ops.as_contig(W.transpose((1, 2, 0, 3))) + Wopfi = W.transpose((1, 2, 0, 3)) Wopfi = Wopfi.reshape((nO * nP, nF * nI)) dXf = model.ops.gemm(dY.reshape((dY.shape[0], nO * nP)), Wopfi) - # Reuse the buffer - dWopfi = Wopfi - dWopfi.fill(0.0) - model.ops.gemm(dY, Xf, out=dWopfi, trans1=True) + dWopfi = model.ops.gemm(dY, Xf, trans1=True) dWopfi = dWopfi.reshape((nO, nP, nF, nI)) # (o, p, f, i) --> (f, o, p, i) - dWopfi = model.ops.as_contig(dWopfi.transpose((2, 0, 1, 3))) + dWopfi = dWopfi.transpose((2, 0, 1, 3)) model.inc_grad("W", dWopfi) return dXf.reshape((dXf.shape[0], nF, nI)) From a39a110c4e744d677a6fee938615667d7b102b1d Mon Sep 17 00:00:00 2001 From: Sofie Van 
Landeghem Date: Tue, 7 Jul 2020 18:46:00 +0200 Subject: [PATCH 11/21] Few more Example unit tests (#5720) * small fixes in Example, UX * add gold tests for aligned_spans and get_aligned_parse * sentencizer unnecessary --- spacy/errors.py | 5 +- spacy/gold/example.pyx | 15 ++--- spacy/tests/parser/test_nonproj.py | 2 +- spacy/tests/test_gold.py | 88 ++++++++++++++++++++++++------ 4 files changed, 82 insertions(+), 28 deletions(-) diff --git a/spacy/errors.py b/spacy/errors.py index 31533e7e2..5a4e0d0c7 100644 --- a/spacy/errors.py +++ b/spacy/errors.py @@ -477,15 +477,14 @@ class Errors(object): E199 = ("Unable to merge 0-length span at doc[{start}:{end}].") # TODO: fix numbering after merging develop into master + E969 = ("Expected string values for field '{field}', but received {types} instead. ") E970 = ("Can not execute command '{str_command}'. Do you have '{tool}' installed?") E971 = ("Found incompatible lengths in Doc.from_array: {array_length} for the " "array and {doc_length} for the Doc itself.") E972 = ("Example.__init__ got None for '{arg}'. Requires Doc.") E973 = ("Unexpected type for NER data") E974 = ("Unknown {obj} attribute: {key}") - E975 = ("The method 'Example.from_dict' expects a Doc as first argument, " - "but got {type}") - E976 = ("The method 'Example.from_dict' expects a dict as second argument, " + E976 = ("The method 'Example.from_dict' expects a {type} as {n} argument, " "but received None.") E977 = ("Can not compare a MorphAnalysis with a string object. 
" "This is likely a bug in spaCy, so feel free to open an issue.") diff --git a/spacy/gold/example.pyx b/spacy/gold/example.pyx index ce1a0928b..f5b9f0eeb 100644 --- a/spacy/gold/example.pyx +++ b/spacy/gold/example.pyx @@ -28,7 +28,6 @@ cpdef Doc annotations2doc(vocab, tok_annot, doc_annot): cdef class Example: def __init__(self, Doc predicted, Doc reference, *, alignment=None): - """ Doc can either be text, or an actual Doc """ if predicted is None: raise TypeError(Errors.E972.format(arg="predicted")) if reference is None: @@ -59,17 +58,15 @@ cdef class Example: @classmethod def from_dict(cls, Doc predicted, dict example_dict): + if predicted is None: + raise ValueError(Errors.E976.format(n="first", type="Doc")) if example_dict is None: - raise ValueError(Errors.E976) - if not isinstance(predicted, Doc): - raise TypeError(Errors.E975.format(type=type(predicted))) + raise ValueError(Errors.E976.format(n="second", type="dict")) example_dict = _fix_legacy_dict_data(example_dict) tok_dict, doc_dict = _parse_example_dict_data(example_dict) if "ORTH" not in tok_dict: tok_dict["ORTH"] = [tok.text for tok in predicted] tok_dict["SPACY"] = [tok.whitespace_ for tok in predicted] - if not _has_field(tok_dict, "SPACY"): - spaces = _guess_spaces(predicted.text, tok_dict["ORTH"]) return Example( predicted, annotations2doc(predicted.vocab, tok_dict, doc_dict) @@ -257,7 +254,11 @@ def _annot2array(vocab, tok_annot, doc_annot): values.append([vocab.morphology.add(v) for v in value]) else: attrs.append(key) - values.append([vocab.strings.add(v) for v in value]) + try: + values.append([vocab.strings.add(v) for v in value]) + except TypeError: + types= set([type(v) for v in value]) + raise TypeError(Errors.E969.format(field=key, types=types)) array = numpy.asarray(values, dtype="uint64") return attrs, array.T diff --git a/spacy/tests/parser/test_nonproj.py b/spacy/tests/parser/test_nonproj.py index 86d9a0180..496ec7e03 100644 --- a/spacy/tests/parser/test_nonproj.py +++ 
b/spacy/tests/parser/test_nonproj.py @@ -45,7 +45,7 @@ def test_parser_ancestors(tree, cyclic_tree, partial_tree, multirooted_tree): def test_parser_contains_cycle(tree, cyclic_tree, partial_tree, multirooted_tree): assert contains_cycle(tree) is None - assert contains_cycle(cyclic_tree) == set([3, 4, 5]) + assert contains_cycle(cyclic_tree) == {3, 4, 5} assert contains_cycle(partial_tree) is None assert contains_cycle(multirooted_tree) is None diff --git a/spacy/tests/test_gold.py b/spacy/tests/test_gold.py index 24f2bbc13..7d3033560 100644 --- a/spacy/tests/test_gold.py +++ b/spacy/tests/test_gold.py @@ -5,6 +5,7 @@ from spacy.gold import Corpus, docs_to_json from spacy.gold.example import Example from spacy.gold.converters import json2docs from spacy.lang.en import English +from spacy.pipeline import EntityRuler from spacy.tokens import Doc, DocBin from spacy.util import get_words_and_spaces, minibatch from thinc.api import compounding @@ -272,72 +273,72 @@ def test_split_sentences(en_vocab): def test_gold_biluo_one_to_many(en_vocab, en_tokenizer): - words = ["Mr. and ", "Mrs. Smith", "flew to", "San Francisco Valley", "."] + words = ["Mr and ", "Mrs Smith", "flew to", "San Francisco Valley", "."] spaces = [True, True, True, False, False] doc = Doc(en_vocab, words=words, spaces=spaces) - prefix = "Mr. and Mrs. Smith flew to " + prefix = "Mr and Mrs Smith flew to " entities = [(len(prefix), len(prefix + "San Francisco Valley"), "LOC")] - gold_words = ["Mr. and Mrs. Smith", "flew", "to", "San", "Francisco", "Valley", "."] + gold_words = ["Mr and Mrs Smith", "flew", "to", "San", "Francisco", "Valley", "."] example = Example.from_dict(doc, {"words": gold_words, "entities": entities}) ner_tags = example.get_aligned_ner() assert ner_tags == ["O", "O", "O", "U-LOC", "O"] entities = [ - (len("Mr. and "), len("Mr. and Mrs. Smith"), "PERSON"), # "Mrs. 
Smith" is a PERSON + (len("Mr and "), len("Mr and Mrs Smith"), "PERSON"), # "Mrs Smith" is a PERSON (len(prefix), len(prefix + "San Francisco Valley"), "LOC"), ] - gold_words = ["Mr. and", "Mrs.", "Smith", "flew", "to", "San", "Francisco", "Valley", "."] + gold_words = ["Mr and", "Mrs", "Smith", "flew", "to", "San", "Francisco", "Valley", "."] example = Example.from_dict(doc, {"words": gold_words, "entities": entities}) ner_tags = example.get_aligned_ner() assert ner_tags == ["O", "U-PERSON", "O", "U-LOC", "O"] entities = [ - (len("Mr. and "), len("Mr. and Mrs."), "PERSON"), # "Mrs." is a Person + (len("Mr and "), len("Mr and Mrs"), "PERSON"), # "Mrs" is a Person (len(prefix), len(prefix + "San Francisco Valley"), "LOC"), ] - gold_words = ["Mr. and", "Mrs.", "Smith", "flew", "to", "San", "Francisco", "Valley", "."] + gold_words = ["Mr and", "Mrs", "Smith", "flew", "to", "San", "Francisco", "Valley", "."] example = Example.from_dict(doc, {"words": gold_words, "entities": entities}) ner_tags = example.get_aligned_ner() assert ner_tags == ["O", None, "O", "U-LOC", "O"] def test_gold_biluo_many_to_one(en_vocab, en_tokenizer): - words = ["Mr. and", "Mrs.", "Smith", "flew", "to", "San", "Francisco", "Valley", "."] + words = ["Mr and", "Mrs", "Smith", "flew", "to", "San", "Francisco", "Valley", "."] spaces = [True, True, True, True, True, True, True, False, False] doc = Doc(en_vocab, words=words, spaces=spaces) - prefix = "Mr. and Mrs. Smith flew to " + prefix = "Mr and Mrs Smith flew to " entities = [(len(prefix), len(prefix + "San Francisco Valley"), "LOC")] - gold_words = ["Mr. and Mrs. Smith", "flew to", "San Francisco Valley", "."] + gold_words = ["Mr and Mrs Smith", "flew to", "San Francisco Valley", "."] example = Example.from_dict(doc, {"words": gold_words, "entities": entities}) ner_tags = example.get_aligned_ner() assert ner_tags == ["O", "O", "O", "O", "O", "B-LOC", "I-LOC", "L-LOC", "O"] entities = [ - (len("Mr. and "), len("Mr. and Mrs. 
Smith"), "PERSON"), # "Mrs. Smith" is a PERSON + (len("Mr and "), len("Mr and Mrs Smith"), "PERSON"), # "Mrs Smith" is a PERSON (len(prefix), len(prefix + "San Francisco Valley"), "LOC"), ] - gold_words = ["Mr. and", "Mrs. Smith", "flew to", "San Francisco Valley", "."] + gold_words = ["Mr and", "Mrs Smith", "flew to", "San Francisco Valley", "."] example = Example.from_dict(doc, {"words": gold_words, "entities": entities}) ner_tags = example.get_aligned_ner() assert ner_tags == ["O", "B-PERSON", "L-PERSON", "O", "O", "B-LOC", "I-LOC", "L-LOC", "O"] def test_gold_biluo_misaligned(en_vocab, en_tokenizer): - words = ["Mr. and Mrs.", "Smith", "flew", "to", "San Francisco", "Valley", "."] + words = ["Mr and Mrs", "Smith", "flew", "to", "San Francisco", "Valley", "."] spaces = [True, True, True, True, True, False, False] doc = Doc(en_vocab, words=words, spaces=spaces) - prefix = "Mr. and Mrs. Smith flew to " + prefix = "Mr and Mrs Smith flew to " entities = [(len(prefix), len(prefix + "San Francisco Valley"), "LOC")] - gold_words = ["Mr.", "and Mrs. Smith", "flew to", "San", "Francisco Valley", "."] + gold_words = ["Mr", "and Mrs Smith", "flew to", "San", "Francisco Valley", "."] example = Example.from_dict(doc, {"words": gold_words, "entities": entities}) ner_tags = example.get_aligned_ner() assert ner_tags == ["O", "O", "O", "O", "B-LOC", "L-LOC", "O"] entities = [ - (len("Mr. and "), len("Mr. and Mrs. Smith"), "PERSON"), # "Mrs. Smith" is a PERSON + (len("Mr and "), len("Mr and Mrs Smith"), "PERSON"), # "Mrs Smith" is a PERSON (len(prefix), len(prefix + "San Francisco Valley"), "LOC"), ] - gold_words = ["Mr. and", "Mrs. 
Smith", "flew to", "San", "Francisco Valley", "."] + gold_words = ["Mr and", "Mrs Smith", "flew to", "San", "Francisco Valley", "."] example = Example.from_dict(doc, {"words": gold_words, "entities": entities}) ner_tags = example.get_aligned_ner() assert ner_tags == [None, None, "O", "O", "B-LOC", "L-LOC", "O"] @@ -407,6 +408,49 @@ def test_biluo_spans(en_tokenizer): assert spans[1].label_ == "GPE" +def test_aligned_spans_y2x(en_vocab, en_tokenizer): + words = ["Mr and Mrs Smith", "flew", "to", "San Francisco Valley", "."] + spaces = [True, True, True, False, False] + doc = Doc(en_vocab, words=words, spaces=spaces) + prefix = "Mr and Mrs Smith flew to " + entities = [ + (0, len("Mr and Mrs Smith"), "PERSON"), + (len(prefix), len(prefix + "San Francisco Valley"), "LOC"), + ] + tokens_ref = ["Mr", "and", "Mrs", "Smith", "flew", "to", "San", "Francisco", "Valley", "."] + example = Example.from_dict(doc, {"words": tokens_ref, "entities": entities}) + ents_ref = example.reference.ents + assert [(ent.start, ent.end) for ent in ents_ref] == [(0, 4), (6, 9)] + ents_y2x = example.get_aligned_spans_y2x(ents_ref) + assert [(ent.start, ent.end) for ent in ents_y2x] == [(0, 1), (3, 4)] + + +def test_aligned_spans_x2y(en_vocab, en_tokenizer): + text = "Mr and Mrs Smith flew to San Francisco Valley" + nlp = English() + ruler = EntityRuler(nlp) + patterns = [{"label": "PERSON", "pattern": "Mr and Mrs Smith"}, + {"label": "LOC", "pattern": "San Francisco Valley"}] + ruler.add_patterns(patterns) + nlp.add_pipe(ruler) + doc = nlp(text) + assert [(ent.start, ent.end) for ent in doc.ents] == [(0, 4), (6, 9)] + prefix = "Mr and Mrs Smith flew to " + entities = [ + (0, len("Mr and Mrs Smith"), "PERSON"), + (len(prefix), len(prefix + "San Francisco Valley"), "LOC"), + ] + tokens_ref = ["Mr and Mrs", "Smith", "flew", "to", "San Francisco", "Valley"] + example = Example.from_dict(doc, {"words": tokens_ref, "entities": entities}) + assert [(ent.start, ent.end) for ent in 
example.reference.ents] == [(0, 2), (4, 6)] + + # Ensure that 'get_aligned_spans_x2y' has the aligned entities correct + ents_pred = example.predicted.ents + assert [(ent.start, ent.end) for ent in ents_pred] == [(0, 4), (6, 9)] + ents_x2y = example.get_aligned_spans_x2y(ents_pred) + assert [(ent.start, ent.end) for ent in ents_x2y] == [(0, 2), (4, 6)] + + def test_gold_ner_missing_tags(en_tokenizer): doc = en_tokenizer("I flew to Silicon Valley via London.") biluo_tags = [None, "O", "O", "B-LOC", "L-LOC", "O", "U-GPE", "O"] @@ -414,6 +458,16 @@ def test_gold_ner_missing_tags(en_tokenizer): assert example.get_aligned("ENT_IOB") == [0, 2, 2, 3, 1, 2, 3, 2] +def test_projectivize(en_tokenizer): + doc = en_tokenizer("He pretty quickly walks away") + heads = [3, 2, 3, 0, 2] + example = Example.from_dict(doc, {"heads": heads}) + proj_heads, proj_labels = example.get_aligned_parse(projectivize=True) + nonproj_heads, nonproj_labels = example.get_aligned_parse(projectivize=False) + assert proj_heads == [3, 2, 3, 0, 3] + assert nonproj_heads == [3, 2, 3, 0, 2] + + def test_iob_to_biluo(): good_iob = ["O", "O", "B-LOC", "I-LOC", "O", "B-PERSON"] good_biluo = ["O", "O", "B-LOC", "L-LOC", "O", "U-PERSON"] From 8cb7f9ccff5da3a5eaeb3c3ebe99214f6673d084 Mon Sep 17 00:00:00 2001 From: Ines Montani Date: Tue, 7 Jul 2020 20:51:50 +0200 Subject: [PATCH 12/21] Improve assets and DVC handling (#5719) * Improve assets and DVC handling * Remove outdated comment [ci skip] --- spacy/cli/project.py | 305 ++++++++++++++++++++++++++++--------------- spacy/schemas.py | 2 +- 2 files changed, 202 insertions(+), 105 deletions(-) diff --git a/spacy/cli/project.py b/spacy/cli/project.py index 200471127..33a8ff11a 100644 --- a/spacy/cli/project.py +++ b/spacy/cli/project.py @@ -1,4 +1,4 @@ -from typing import List, Dict, Any, Optional, Sequence +from typing import List, Dict, Any, Optional, Sequence, Union import typer import srsly from pathlib import Path @@ -18,7 +18,7 @@ from ..util import 
ensure_path, run_command, make_tempdir, working_dir from ..util import get_hash, get_checksum, split_command -CONFIG_FILE = "project.yml" +PROJECT_FILE = "project.yml" DVC_CONFIG = "dvc.yaml" DVC_DIR = ".dvc" DIRS = [ @@ -38,12 +38,12 @@ CACHES = [ os.environ.get("TORCH_HOME"), Path.home() / ".keras", ] -DVC_CONFIG_COMMENT = """# This file is auto-generated by spaCy based on your project.yml. Do not edit -# it directly and edit the project.yml instead and re-run the project.""" +DVC_CONFIG_COMMENT = f"""# This file is auto-generated by spaCy based on your {PROJECT_FILE}. Do not edit +# it directly and edit the {PROJECT_FILE} instead and re-run the project.""" CLI_HELP = f"""Command-line interface for spaCy projects and working with project templates. You'd typically start by cloning a project template to a local directory and fetching its assets like datasets etc. See the project's -{CONFIG_FILE} for the available commands. Under the hood, spaCy uses DVC (Data +{PROJECT_FILE} for the available commands. Under the hood, spaCy uses DVC (Data Version Control) to manage input and output files and to ensure steps are only re-run if their inputs change. """ @@ -91,7 +91,7 @@ def project_init_cli( # fmt: off path: Path = Arg(Path.cwd(), help="Path to cloned project. Defaults to current working directory.", exists=True, file_okay=False), git: bool = Opt(False, "--git", "-G", help="Initialize project as a Git repo"), - force: bool = Opt(False, "--force", "-F", help="Force initiziation"), + force: bool = Opt(False, "--force", "-F", "-f", help="Force initiziation"), # fmt: on ): """Initialize a project directory with DVC and optionally Git. This should @@ -100,7 +100,7 @@ def project_init_cli( be a Git repo, it should be initialized with Git first, before initializing DVC. This allows DVC to integrate with Git. 
""" - project_init(path, git=git, force=force, silent=True) + project_init(path, git=git, force=force) @project_cli.command("assets") @@ -110,11 +110,11 @@ def project_assets_cli( # fmt: on ): """Use DVC (Data Version Control) to fetch project assets. Assets are - defined in the "assets" section of the project config. If possible, DVC + defined in the "assets" section of the project.yml. If possible, DVC will try to track the files so you can pull changes from upstream. It will also try and store the checksum so the assets are versioned. If the file can't be tracked or checked, it will be downloaded without DVC. If a checksum - is provided in the project config, the file is only downloaded if no local + is provided in the project.yml, the file is only downloaded if no local file with the same checksum exists. """ project_assets(project_dir) @@ -132,7 +132,7 @@ def project_run_all_cli( # fmt: on ): """Run all commands defined in the project. This command will use DVC and - the defined outputs and dependencies in the project config to determine + the defined outputs and dependencies in the project.yml to determine which steps need to be re-run and where to start. This means you're only re-generating data if the inputs have changed. @@ -151,12 +151,12 @@ def project_run_all_cli( def project_run_cli( # fmt: off ctx: typer.Context, - subcommand: str = Arg(None, help="Name of command defined in project config"), + subcommand: str = Arg(None, help=f"Name of command defined in the {PROJECT_FILE}"), project_dir: Path = Arg(Path.cwd(), help="Location of project directory. Defaults to current working directory.", exists=True, file_okay=False), show_help: bool = Opt(False, "--help", help="Show help message and available subcommands") # fmt: on ): - """Run a named script defined in the project config. If the command is + """Run a named script defined in the project.yml. 
If the command is part of the default pipeline defined in the "run" section, DVC is used to determine whether the step should re-run if its inputs have changed, or whether everything is up to date. If the script is not part of the default @@ -175,13 +175,13 @@ def project_run_cli( @project_cli.command("exec", hidden=True) def project_exec_cli( # fmt: off - subcommand: str = Arg(..., help="Name of command defined in project config"), + subcommand: str = Arg(..., help=f"Name of command defined in the {PROJECT_FILE}"), project_dir: Path = Arg(Path.cwd(), help="Location of project directory. Defaults to current working directory.", exists=True, file_okay=False), # fmt: on ): - """Execute a command defined in the project config. This CLI command is + """Execute a command defined in the project.yml. This CLI command is only called internally in auto-generated DVC pipelines, as a shortcut for - multi-step commands in the project config. You typically shouldn't have to + multi-step commands in the project.yml. You typically shouldn't have to call it yourself. To run a command, call "run" or "run-all". """ project_exec(project_dir, subcommand) @@ -196,15 +196,15 @@ def project_update_dvc_cli( # fmt: on ): """Update the auto-generated DVC config file. Uses the steps defined in the - "run" section of the project config. This typically happens automatically + "run" section of the project.yml. This typically happens automatically when running a command, but can also be triggered manually if needed. 
""" config = load_project_config(project_dir) updated = update_dvc_config(project_dir, config, verbose=verbose, force=force) if updated: - msg.good(f"Updated DVC config from {CONFIG_FILE}") + msg.good(f"Updated DVC config from {PROJECT_FILE}") else: - msg.info(f"No changes found in {CONFIG_FILE}, no update needed") + msg.info(f"No changes found in {PROJECT_FILE}, no update needed") app.add_typer(project_cli, name="project") @@ -241,7 +241,7 @@ def project_clone( cmd = f"git clone {repo} {tmp_dir} --no-checkout --depth 1 --config core.sparseCheckout=true" try: run_command(cmd) - except SystemExit: + except DVCError: err = f"Could not clone the repo '{repo}' into the temp dir '{tmp_dir}'." msg.fail(err) with (tmp_dir / ".git" / "info" / "sparse-checkout").open("w") as f: @@ -249,7 +249,7 @@ def project_clone( try: run_command(["git", "-C", str(tmp_dir), "fetch"]) run_command(["git", "-C", str(tmp_dir), "checkout"]) - except SystemExit: + except DVCError: err = f"Could not clone '{name}' in the repo '{repo}'." msg.fail(err) shutil.move(str(tmp_dir / Path(name).name), str(project_dir)) @@ -282,27 +282,29 @@ def project_init( with working_dir(project_dir) as cwd: if git: run_command(["git", "init"]) - init_cmd = ["dvc", "init"] - if silent: - init_cmd.append("--quiet") - if not git: - init_cmd.append("--no-scm") - if force: - init_cmd.append("--force") - run_command(init_cmd) + flags = {"--force": force, "--quiet": silent, "--no-scm": not git} + try: + run_dvc_command(["init"], flags=flags) + except DVCError: + msg.fail( + "Failed to initialize project. This likely means that the " + "project is already initialized and has a .dvc directory. " + "To force-initialize, use the --force flag.", + exits=1, + ) # We don't want to have analytics on by default – our users should # opt-in explicitly. If they want it, they can always enable it. 
if not analytics: - run_command(["dvc", "config", "core.analytics", "false"]) - # Remove unused and confusing plot templates from .dvc directory - # TODO: maybe we shouldn't do this, but it's otherwise super confusing - # once you commit your changes via Git and it creates a bunch of files - # that have no purpose + run_dvc_command(["config", "core.analytics", "false"]) + # Remove unused and confusing plot templates from .dvc directory. + # Otherwise super confusing once you commit your changes via Git and it + # creates a bunch of files that have no purpose. plots_dir = cwd / DVC_DIR / "plots" if plots_dir.exists(): shutil.rmtree(str(plots_dir)) config = load_project_config(cwd) setup_check_dvc(cwd, config) + msg.good("Initialized project") def project_assets(project_dir: Path) -> None: @@ -315,19 +317,33 @@ def project_assets(project_dir: Path) -> None: setup_check_dvc(project_path, config) assets = config.get("assets", {}) if not assets: - msg.warn(f"No assets specified in {CONFIG_FILE}", exits=0) + msg.warn(f"No assets specified in {PROJECT_FILE}", exits=0) msg.info(f"Fetching {len(assets)} asset(s)") variables = config.get("variables", {}) fetched_assets = [] for asset in assets: - url = asset["url"].format(**variables) dest = asset["dest"].format(**variables) - fetched_path = fetch_asset(project_path, url, dest, asset.get("checksum")) + url = asset.get("url") + checksum = asset.get("checksum") + if not url: + # project.yml defines asset without URL that the user has to place + if not Path(dest).exists(): + err = f"No URL provided for asset. 
You need to add this file yourself: {dest}" + msg.warn(err) + else: + if checksum == get_checksum(dest): + msg.good(f"Asset exists with matching checksum: {dest}") + fetched_assets.append((project_path / dest).resolve()) + else: + msg.fail(f"Asset available but with incorrect checksum: {dest}") + continue + url = url.format(**variables) + fetched_path = fetch_asset(project_path, url, dest, checksum) if fetched_path: fetched_assets.append(str(fetched_path)) if fetched_assets: with working_dir(project_path): - run_command(["dvc", "add", *fetched_assets, "--external"]) + run_dvc_command(["add", *fetched_assets, "--external"]) def fetch_asset( @@ -359,19 +375,17 @@ def fetch_asset( # Try with tracking the source first, then just downloading with # DVC, then a regular non-DVC download. try: - dvc_cmd = ["dvc", "import-url", url, str(dest_path)] - print(subprocess.check_output(dvc_cmd, stderr=subprocess.DEVNULL)) - except subprocess.CalledProcessError: - dvc_cmd = ["dvc", "get-url", url, str(dest_path)] - print(subprocess.check_output(dvc_cmd, stderr=subprocess.DEVNULL)) - except subprocess.CalledProcessError: + run_dvc_command(["import-url", url, str(dest_path)]) + except DVCError: + run_dvc_command(["get-url", url, str(dest_path)]) + except DVCError: try: download_file(url, dest_path) except requests.exceptions.HTTPError as e: msg.fail(f"Download failed: {dest}", e) return None if checksum and checksum != get_checksum(dest_path): - msg.warn(f"Checksum doesn't match value defined in {CONFIG_FILE}: {dest}") + msg.fail(f"Checksum doesn't match value defined in {PROJECT_FILE}: {dest}") msg.good(f"Fetched asset {dest}") return dest_path @@ -384,13 +398,17 @@ def project_run_all(project_dir: Path, *dvc_args) -> None: """ config = load_project_config(project_dir) setup_check_dvc(project_dir, config) - dvc_cmd = ["dvc", "repro", *dvc_args] with working_dir(project_dir): - run_command(dvc_cmd) + try: + run_dvc_command(["repro", *dvc_args]) + except DVCError: + # We could raise 
a custom error here, but the output produced by + # DVC is already pretty substantial. + sys.exit(1) def print_run_help(project_dir: Path, subcommand: Optional[str] = None) -> None: - """Simulate a CLI help prompt using the info available in the project config. + """Simulate a CLI help prompt using the info available in the project.yml. project_dir (Path): The project directory. subcommand (Optional[str]): The subcommand or None. If a subcommand is @@ -408,15 +426,15 @@ def print_run_help(project_dir: Path, subcommand: Optional[str] = None) -> None: if help_text: msg.text(f"\n{help_text}\n") else: - print(f"\nAvailable commands in {CONFIG_FILE}") + print(f"\nAvailable commands in {PROJECT_FILE}") print(f"Usage: {COMMAND} project run [COMMAND] {project_dir}") msg.table([(cmd["name"], cmd.get("help", "")) for cmd in config_commands]) - msg.text("Run all commands defined in the 'run' block of the project config:") + msg.text(f"Run all commands defined in the 'run' block of the {PROJECT_FILE}:") print(f"{COMMAND} project run-all {project_dir}") def project_run(project_dir: Path, subcommand: str, *dvc_args) -> None: - """Run a named script defined in the project config. If the script is part + """Run a named script defined in the project.yml. If the script is part of the default pipeline (defined in the "run" section), DVC is used to execute the command, so it can determine whether to rerun it. It then calls into "exec" to execute it. @@ -433,9 +451,13 @@ def project_run(project_dir: Path, subcommand: str, *dvc_args) -> None: validate_subcommand(commands.keys(), subcommand) if subcommand in config.get("run", []): # This is one of the pipeline commands tracked in DVC - dvc_cmd = ["dvc", "repro", subcommand, *dvc_args] with working_dir(project_dir): - run_command(dvc_cmd) + try: + run_dvc_command(["repro", subcommand, *dvc_args]) + except DVCError: + # We could raise a custom error here, but the output produced by + # DVC is already pretty substantial. 
+ sys.exit(1) else: cmd = commands[subcommand] # Deps in non-DVC commands aren't tracked, but if they're defined, @@ -448,8 +470,8 @@ def project_run(project_dir: Path, subcommand: str, *dvc_args) -> None: run_commands(cmd["script"], variables) -def project_exec(project_dir: Path, subcommand: str): - """Execute a command defined in the project config. +def project_exec(project_dir: Path, subcommand: str) -> None: + """Execute a command defined in the project.yml. project_dir (Path): Path to project directory. subcommand (str): Name of command to run. @@ -468,15 +490,15 @@ def project_exec(project_dir: Path, subcommand: str): def load_project_config(path: Path) -> Dict[str, Any]: - """Load the project config file from a directory and validate it. + """Load the project.yml file from a directory and validate it. path (Path): The path to the project directory. - RETURNS (Dict[str, Any]): The loaded project config. + RETURNS (Dict[str, Any]): The loaded project.yml. """ - config_path = path / CONFIG_FILE + config_path = path / PROJECT_FILE if not config_path.exists(): - msg.fail("Can't find project config", config_path, exits=1) - invalid_err = f"Invalid project config in {CONFIG_FILE}" + msg.fail(f"Can't find {PROJECT_FILE}", config_path, exits=1) + invalid_err = f"Invalid {PROJECT_FILE}. Double-check that the YAML is correct." try: config = srsly.read_yaml(config_path) except ValueError as e: @@ -500,7 +522,7 @@ def update_dvc_config( dict, so if any of the config values change, the DVC config is regenerated. path (Path): The path to the project directory. - config (Dict[str, Any]): The loaded project config. + config (Dict[str, Any]): The loaded project.yml. verbose (bool): Whether to print additional info (via DVC). silent (bool): Don't output anything (via DVC). force (bool): Force update, even if hashes match. 
@@ -514,10 +536,10 @@ def update_dvc_config( with dvc_config_path.open("r", encoding="utf8") as f: ref_hash = f.readline().strip().replace("# ", "") if ref_hash == config_hash and not force: - return False # Nothing has changed in project config, don't need to update + return False # Nothing has changed in project.yml, don't need to update dvc_config_path.unlink() variables = config.get("variables", {}) - commands = [] + dvc_commands = [] # We only want to include commands that are part of the main list of "run" # commands in project.yml and should be run in sequence config_commands = {cmd["name"]: cmd for cmd in config.get("commands", [])} @@ -535,15 +557,12 @@ def update_dvc_config( deps_cmd = [c for cl in [["-d", p] for p in deps] for c in cl] outputs_cmd = [c for cl in [["-o", p] for p in outputs] for c in cl] outputs_nc_cmd = [c for cl in [["-O", p] for p in outputs_no_cache] for c in cl] - dvc_cmd = ["dvc", "run", "-n", name, "-w", str(path), "--no-exec"] - if verbose: - dvc_cmd.append("--verbose") - if silent: - dvc_cmd.append("--quiet") + dvc_cmd = ["run", "-n", name, "-w", str(path), "--no-exec"] full_cmd = [*dvc_cmd, *deps_cmd, *outputs_cmd, *outputs_nc_cmd, *project_cmd] - commands.append(" ".join(full_cmd)) + dvc_commands.append(" ".join(full_cmd)) with working_dir(path): - run_commands(commands, variables, silent=True) + dvc_flags = {"--verbose": verbose, "--quiet": silent} + run_dvc_commands(dvc_commands, variables, flags=dvc_flags) with dvc_config_path.open("r+", encoding="utf8") as f: content = f.read() f.seek(0, 0) @@ -571,7 +590,7 @@ def setup_check_dvc(project_dir: Path, config: Dict[str, Any]) -> None: DVC project. project_dir (Path): The path to the project directory. - config (Dict[str, Any]): The loaded project config. + config (Dict[str, Any]): The loaded project.yml. 
""" if not project_dir.exists(): msg.fail(f"Can't find project directory: {project_dir}") @@ -586,38 +605,7 @@ def setup_check_dvc(project_dir: Path, config: Dict[str, Any]) -> None: with msg.loading("Updating DVC config..."): updated = update_dvc_config(project_dir, config, silent=True) if updated: - msg.good(f"Updated DVC config from changed {CONFIG_FILE}") - - -def run_commands( - commands: List[str] = tuple(), variables: Dict[str, str] = {}, silent: bool = False -) -> None: - """Run a sequence of commands in a subprocess, in order. - - commands (List[str]): The string commands. - variables (Dict[str, str]): Dictionary of variable names, mapped to their - values. Will be used to substitute format string variables in the - commands. - silent (bool): Don't print the commands. - """ - for command in commands: - # Substitute variables, e.g. "./{NAME}.json" - command = command.format(**variables) - command = split_command(command) - # Not sure if this is needed or a good idea. Motivation: users may often - # use commands in their config that reference "python" and we want to - # make sure that it's always executing the same Python that spaCy is - # executed with and the pip in the same env, not some other Python/pip. - # Also ensures cross-compatibility if user 1 writes "python3" (because - # that's how it's set up on their system), and user 2 without the - # shortcut tries to re-run the command. - if len(command) and command[0] in ("python", "python3"): - command[0] = sys.executable - elif len(command) and command[0] in ("pip", "pip3"): - command = [sys.executable, "-m", "pip", *command[1:]] - if not silent: - print(f"Running command: {' '.join(command)}") - run_command(command) + msg.good(f"Updated DVC config from changed {PROJECT_FILE}") def convert_asset_url(url: str) -> str: @@ -627,7 +615,7 @@ def convert_asset_url(url: str) -> str: RETURNS (str): The converted URL. 
""" # If the asset URL is a regular GitHub URL it's likely a mistake - if re.match("(http(s?)):\/\/github.com", url): + if re.match(r"(http(s?)):\/\/github.com", url): converted = url.replace("github.com", "raw.githubusercontent.com") converted = re.sub(r"/(tree|blob)/", "/", converted) msg.warn( @@ -679,7 +667,7 @@ def validate_subcommand(commands: Sequence[str], subcommand: str) -> None: """ if subcommand not in commands: msg.fail( - f"Can't find command '{subcommand}' in {CONFIG_FILE}. " + f"Can't find command '{subcommand}' in {PROJECT_FILE}. " f"Available commands: {', '.join(commands)}", exits=1, ) @@ -706,3 +694,112 @@ def download_file(url: str, dest: Path, chunk_size: int = 1024) -> None: for data in response.iter_content(chunk_size=chunk_size): size = f.write(data) bar.update(size) + + +def run_commands( + commands: List[str] = tuple(), variables: Dict[str, str] = {}, silent: bool = False +) -> None: + """Run a sequence of commands in a subprocess, in order. + + commands (List[str]): The string commands. + variables (Dict[str, str]): Dictionary of variable names, mapped to their + values. Will be used to substitute format string variables in the + commands. + silent (bool): Don't print the commands. + """ + for command in commands: + # Substitute variables, e.g. "./{NAME}.json" + command = command.format(**variables) + command = split_command(command) + # Not sure if this is needed or a good idea. Motivation: users may often + # use commands in their config that reference "python" and we want to + # make sure that it's always executing the same Python that spaCy is + # executed with and the pip in the same env, not some other Python/pip. + # Also ensures cross-compatibility if user 1 writes "python3" (because + # that's how it's set up on their system), and user 2 without the + # shortcut tries to re-run the command. 
+ if len(command) and command[0] in ("python", "python3"): + command[0] = sys.executable + elif len(command) and command[0] in ("pip", "pip3"): + command = [sys.executable, "-m", "pip", *command[1:]] + if not silent: + print(f"Running command: {' '.join(command)}") + run_command(command) + + +def run_dvc_commands( + commands: List[str] = tuple(), + variables: Dict[str, str] = {}, + flags: Dict[str, bool] = {}, +) -> None: + """Run a sequence of DVC commands in a subprocess, in order. + + commands (List[str]): The string commands without the leading "dvc". + variables (Dict[str, str]): Dictionary of variable names, mapped to their + values. Will be used to substitute format string variables in the + commands. + flags (Dict[str, bool]): Conditional flags to be added to command. Makes it + easier to pass flags like --quiet that depend on a variable or + command-line setting while avoiding lots of nested conditionals. + """ + for command in commands: + # Substitute variables, e.g. "./{NAME}.json" + command = command.format(**variables) + command = split_command(command) + run_dvc_command(command, flags=flags) + + +def run_dvc_command( + command: Union[str, List[str]], flags: Dict[str, bool] = {}, silent: bool = False +) -> None: + """Run a DVC command in a subprocess. This wrapper gives us a bit more + control over how the output and errors are presented. Raises a DVC error if + the "dvc" command returns a non-zero exit code and uses the error message + logged by DVC. + + command (Union[str, List[str]]): The command, without the leading "dvc". + flags (Dict[str, bool]): Conditional flags to be added to command. Makes it + easier to pass flags like --quiet that depend on a variable or + command-line setting while avoiding lots of nested conditionals. + silent (bool): Don't print any output. 
+ """ + if isinstance(command, str): + command = split_command(command) + dvc_command = ["dvc", *command] + # Add the flags if they are set to True + for flag, is_active in flags.items(): + if is_active: + dvc_command.append(flag) + proc = subprocess.Popen(dvc_command, stderr=subprocess.PIPE, stdout=subprocess.PIPE) + if not silent: + lines = proc.stdout.read().decode("utf8").split("\n\n") + for line in lines: + line = line.strip() + if is_relevant_dvc_output(line): + print(f"{line}\n") + _, err = proc.communicate() # Important: otherwise returncode will be None! + if proc.returncode != 0: + if isinstance(err, bytes): + err = err.decode("utf8") + raise DVCError(err) + + +def is_relevant_dvc_output(line: str) -> bool: + """Check whether the output by DVC is something we want to keep. + + line (str): A line written to stdout,. + RETURNS (bool): Whether to use/print the line. + """ + # Writing them like this for readability but maybe replace with regex? + conditions = [ + not line, + line.startswith("What's next?"), + line.startswith("Having any troubles?"), + ] + return not any(conditions) + + +class DVCError(RuntimeError): + """Custom error type for anything produced by the DVC CLI.""" + + pass diff --git a/spacy/schemas.py b/spacy/schemas.py index 38e08b4cb..ca17fe50b 100644 --- a/spacy/schemas.py +++ b/spacy/schemas.py @@ -222,7 +222,7 @@ class TrainingSchema(BaseModel): class ProjectConfigAsset(BaseModel): # fmt: off dest: StrictStr = Field(..., title="Destination of downloaded asset") - url: StrictStr = Field(..., title="URL of asset") + url: Optional[StrictStr] = Field(None, title="URL of asset") checksum: str = Field(None, title="MD5 hash of file", regex=r"([a-fA-F\d]{32})") # fmt: on From 42e1109defaf95a8d7b497f03f937f5027fa65e4 Mon Sep 17 00:00:00 2001 From: Matthw Honnibal Date: Wed, 8 Jul 2020 11:26:54 +0200 Subject: [PATCH 13/21] Support option to not batch by number of words --- spacy/cli/train.py | 20 ++++++++++++++------ 1 file changed, 14 
insertions(+), 6 deletions(-) diff --git a/spacy/cli/train.py b/spacy/cli/train.py index 3b71cdb9a..398b72952 100644 --- a/spacy/cli/train.py +++ b/spacy/cli/train.py @@ -203,7 +203,8 @@ def train( msg.info(f"Initializing the nlp pipeline: {nlp.pipe_names}") train_examples = list( corpus.train_dataset( - nlp, shuffle=False, gold_preproc=training["gold_preproc"] + nlp, shuffle=False, gold_preproc=training["gold_preproc"], + max_length=training["max_length"] ) ) nlp.begin_training(lambda: train_examples) @@ -306,11 +307,18 @@ def create_train_batches(nlp, corpus, cfg): if len(train_examples) == 0: raise ValueError(Errors.E988) epoch += 1 - batches = util.minibatch_by_words( - train_examples, - size=cfg["batch_size"], - discard_oversize=cfg["discard_oversize"], - ) + if cfg.get("batch_by_words"): + batches = util.minibatch_by_words( + train_examples, + size=cfg["batch_size"], + discard_oversize=cfg["discard_oversize"], + ) + else: + batches = util.minibatch( + train_examples, + size=cfg["batch_size"], + ) + # make sure the minibatch_by_words result is not empty, or we'll have an infinite training loop try: first = next(batches) From ca989f4cc4f3ad5c89c11c3a325b0fc79e4961ce Mon Sep 17 00:00:00 2001 From: Matthw Honnibal Date: Wed, 8 Jul 2020 11:27:54 +0200 Subject: [PATCH 14/21] Improve cutting logic in parser --- spacy/syntax/nn_parser.pyx | 26 ++++++++++++++------------ 1 file changed, 14 insertions(+), 12 deletions(-) diff --git a/spacy/syntax/nn_parser.pyx b/spacy/syntax/nn_parser.pyx index 19d424823..8bac8cd89 100644 --- a/spacy/syntax/nn_parser.pyx +++ b/spacy/syntax/nn_parser.pyx @@ -292,10 +292,8 @@ cdef class Parser: if not states: return losses all_states = list(states) - states_golds = zip(states, golds) - for _ in range(max_steps): - if not states_golds: - break + states_golds = list(zip(states, golds)) + while states_golds: states, golds = zip(*states_golds) scores, backprop = model.begin_update(states) d_scores = self.get_batch_loss(states, golds, scores, 
losses) @@ -519,21 +517,25 @@ cdef class Parser: StateClass state Transition action all_states = self.moves.init_batch([eg.predicted for eg in examples]) + states = [] + golds = [] kept = [] max_length_seen = 0 for state, eg in zip(all_states, examples): if self.moves.has_gold(eg) and not state.is_final(): gold = self.moves.init_gold(state, eg) - oracle_actions = self.moves.get_oracle_sequence_from_state( - state.copy(), gold) - kept.append((eg, state, gold, oracle_actions)) - min_length = min(min_length, len(oracle_actions)) - max_length_seen = max(max_length, len(oracle_actions)) + if len(eg.x) < max_length: + states.append(state) + golds.append(gold) + else: + oracle_actions = self.moves.get_oracle_sequence_from_state( + state.copy(), gold) + kept.append((eg, state, gold, oracle_actions)) + min_length = min(min_length, len(oracle_actions)) + max_length_seen = max(max_length, len(oracle_actions)) if not kept: - return [], [], 0 + return states, golds, 0 max_length = max(min_length, min(max_length, max_length_seen)) - states = [] - golds = [] cdef int clas max_moves = 0 for eg, state, gold, oracle_actions in kept: From c9f0f75778515a2cd00a96681b57358c95b83acf Mon Sep 17 00:00:00 2001 From: Adriane Boyd Date: Wed, 8 Jul 2020 13:59:28 +0200 Subject: [PATCH 15/21] Update get_loss for senter and morphologizer (#5724) * Update get_loss for senter Update `SentenceRecognizer.get_loss` to keep it similar to `Tagger`. * Update get_loss for morphologizer Update `Morphologizer.get_loss` to keep it similar to `Tagger`. 
--- spacy/morphology.pyx | 2 +- spacy/pipeline/morphologizer.pyx | 30 +++++++++--------------------- spacy/pipeline/pipes.pyx | 29 ++++++----------------------- 3 files changed, 16 insertions(+), 45 deletions(-) diff --git a/spacy/morphology.pyx b/spacy/morphology.pyx index 78e8e17c0..a3aa8be22 100644 --- a/spacy/morphology.pyx +++ b/spacy/morphology.pyx @@ -272,7 +272,7 @@ cdef class Morphology: @staticmethod def feats_to_dict(feats): - if not feats: + if not feats or feats == Morphology.EMPTY_MORPH: return {} return {field: Morphology.VALUE_SEP.join(sorted(values.split(Morphology.VALUE_SEP))) for field, values in [feat.split(Morphology.FIELD_SEP) for feat in feats.split(Morphology.FEATURE_SEP)]} diff --git a/spacy/pipeline/morphologizer.pyx b/spacy/pipeline/morphologizer.pyx index f792d57b0..57b778434 100644 --- a/spacy/pipeline/morphologizer.pyx +++ b/spacy/pipeline/morphologizer.pyx @@ -3,7 +3,7 @@ cimport numpy as np import numpy import srsly -from thinc.api import to_categorical +from thinc.api import SequenceCategoricalCrossentropy from ..tokens.doc cimport Doc from ..vocab cimport Vocab @@ -85,13 +85,10 @@ class Morphologizer(Tagger): doc.is_morphed = True def get_loss(self, examples, scores): - scores = self.model.ops.flatten(scores) - tag_index = {tag: i for i, tag in enumerate(self.labels)} - cdef int idx = 0 - correct = numpy.zeros((scores.shape[0],), dtype="i") - guesses = scores.argmax(axis=1) - known_labels = numpy.ones((scores.shape[0], 1), dtype="f") + loss_func = SequenceCategoricalCrossentropy(names=self.labels, normalize=False) + truths = [] for eg in examples: + eg_truths = [] pos_tags = eg.get_aligned("POS", as_string=True) morphs = eg.get_aligned("MORPH", as_string=True) for i in range(len(morphs)): @@ -104,20 +101,11 @@ class Morphologizer(Tagger): morph = self.vocab.strings[self.vocab.morphology.add(feats)] if morph == "": morph = Morphology.EMPTY_MORPH - if morph is None: - correct[idx] = guesses[idx] - elif morph in tag_index: - 
correct[idx] = tag_index[morph] - else: - correct[idx] = 0 - known_labels[idx] = 0. - idx += 1 - correct = self.model.ops.xp.array(correct, dtype="i") - d_scores = scores - to_categorical(correct, n_classes=scores.shape[1]) - d_scores *= self.model.ops.asarray(known_labels) - loss = (d_scores**2).sum() - docs = [eg.predicted for eg in examples] - d_scores = self.model.ops.unflatten(d_scores, [len(d) for d in docs]) + eg_truths.append(morph) + truths.append(eg_truths) + d_scores, loss = loss_func(scores, truths) + if self.model.ops.xp.isnan(loss): + raise ValueError("nan value when computing loss") return float(loss), d_scores def to_bytes(self, exclude=tuple()): diff --git a/spacy/pipeline/pipes.pyx b/spacy/pipeline/pipes.pyx index 2b147785e..cc3c39f03 100644 --- a/spacy/pipeline/pipes.pyx +++ b/spacy/pipeline/pipes.pyx @@ -521,29 +521,12 @@ class SentenceRecognizer(Tagger): doc.c[j].sent_start = -1 def get_loss(self, examples, scores): - scores = self.model.ops.flatten(scores) - tag_index = range(len(self.labels)) - cdef int idx = 0 - correct = numpy.zeros((scores.shape[0],), dtype="i") - guesses = scores.argmax(axis=1) - known_labels = numpy.ones((scores.shape[0], 1), dtype="f") - for eg in examples: - sent_starts = eg.get_aligned("sent_start") - for sent_start in sent_starts: - if sent_start is None: - correct[idx] = guesses[idx] - elif sent_start in tag_index: - correct[idx] = sent_start - else: - correct[idx] = 0 - known_labels[idx] = 0. 
- idx += 1 - correct = self.model.ops.xp.array(correct, dtype="i") - d_scores = scores - to_categorical(correct, n_classes=scores.shape[1]) - d_scores *= self.model.ops.asarray(known_labels) - loss = (d_scores**2).sum() - docs = [eg.predicted for eg in examples] - d_scores = self.model.ops.unflatten(d_scores, [len(d) for d in docs]) + labels = self.labels + loss_func = SequenceCategoricalCrossentropy(names=labels, normalize=False) + truths = [[labels[x] for x in eg.get_aligned("sent_start")] for eg in examples] + d_scores, loss = loss_func(scores, truths) + if self.model.ops.xp.isnan(loss): + raise ValueError("nan value when computing loss") return float(loss), d_scores def begin_training(self, get_examples=lambda: [], pipeline=None, sgd=None, From 0a3d41bb1d0715d43067c7d1cd661255c22666d8 Mon Sep 17 00:00:00 2001 From: Ines Montani Date: Wed, 8 Jul 2020 14:00:07 +0200 Subject: [PATCH 16/21] Deprecat model shortcuts and simplify download (#5722) --- spacy/about.py | 1 - spacy/cli/download.py | 42 ++++++++++++++++++++++++++++++------------ 2 files changed, 30 insertions(+), 13 deletions(-) diff --git a/spacy/about.py b/spacy/about.py index 057e21c87..8f374e2fe 100644 --- a/spacy/about.py +++ b/spacy/about.py @@ -4,5 +4,4 @@ __version__ = "3.0.0a2" __release__ = True __download_url__ = "https://github.com/explosion/spacy-models/releases/download" __compatibility__ = "https://raw.githubusercontent.com/explosion/spacy-models/master/compatibility.json" -__shortcuts__ = "https://raw.githubusercontent.com/explosion/spacy-models/master/shortcuts-v2.json" __projects__ = "https://github.com/explosion/spacy-boilerplates" diff --git a/spacy/cli/download.py b/spacy/cli/download.py index ea5e7a890..f192cb196 100644 --- a/spacy/cli/download.py +++ b/spacy/cli/download.py @@ -1,4 +1,4 @@ -from typing import Optional, Sequence, Union +from typing import Optional, Sequence import requests import sys from wasabi import msg @@ -8,6 +8,23 @@ from ._app import app, Arg, Opt from .. 
import about from ..util import is_package, get_base_version, run_command +# These are the old shortcuts we previously supported in spacy download. As of +# v3, shortcuts are deprecated so we're not expecting to add anything to this +# list. It only exists to show users warnings. +OLD_SHORTCUTS = { + "en": "en_core_web_sm", + "de": "de_core_news_sm", + "es": "es_core_news_sm", + "pt": "pt_core_news_sm", + "fr": "fr_core_news_sm", + "it": "it_core_news_sm", + "nl": "nl_core_news_sm", + "el": "el_core_news_sm", + "nb": "nb_core_news_sm", + "lt": "lt_core_news_sm", + "xx": "xx_ent_wiki_sm", +} + @app.command( "download", @@ -48,8 +65,13 @@ def download(model: str, direct: bool = False, *pip_args) -> None: version = components[-1] download_model(dl_tpl.format(m=model_name, v=version), pip_args) else: - shortcuts = get_json(about.__shortcuts__, "available shortcuts") - model_name = shortcuts.get(model, model) + model_name = model + if model in OLD_SHORTCUTS: + msg.warn( + f"As of spaCy v3.0, shortcuts like '{model}' are deprecated. " + f"Please use the full model name '{OLD_SHORTCUTS[model]}' instead." + ) + model_name = OLD_SHORTCUTS[model] compatibility = get_compatibility() version = get_version(model_name, compatibility) download_model(dl_tpl.format(m=model_name, v=version), pip_args) @@ -59,23 +81,19 @@ def download(model: str, direct: bool = False, *pip_args) -> None: ) -def get_json(url: str, desc: str) -> Union[dict, list]: - r = requests.get(url) +def get_compatibility() -> dict: + version = get_base_version(about.__version__) + r = requests.get(about.__compatibility__) if r.status_code != 200: msg.fail( f"Server error ({r.status_code})", - f"Couldn't fetch {desc}. Please find a model for your spaCy " + f"Couldn't fetch compatibility table. Please find a model for your spaCy " f"installation (v{about.__version__}), and download it manually. 
" f"For more details, see the documentation: " f"https://spacy.io/usage/models", exits=1, ) - return r.json() - - -def get_compatibility() -> dict: - version = get_base_version(about.__version__) - comp_table = get_json(about.__compatibility__, "compatibility table") + comp_table = r.json() comp = comp_table["spacy"] if version not in comp: msg.fail(f"No compatible models found for v{version} of spaCy", exits=1) From 93e50da46a6d9cc847740410a8f9a960aa510825 Mon Sep 17 00:00:00 2001 From: Matthw Honnibal Date: Wed, 8 Jul 2020 21:36:51 +0200 Subject: [PATCH 17/21] Remove auto 'set_annotation' in training to address GPU memory --- spacy/language.py | 8 ++------ 1 file changed, 2 insertions(+), 6 deletions(-) diff --git a/spacy/language.py b/spacy/language.py index da45c058c..a95b6d279 100644 --- a/spacy/language.py +++ b/spacy/language.py @@ -540,19 +540,15 @@ class Language(object): if component_cfg is None: component_cfg = {} - component_deps = count_pipeline_interdependencies(self.pipeline) - # Determine whether component should set annotations. In theory I guess - # we should do this by inspecting the meta? 
Or we could just always - # say "yes" for i, (name, proc) in enumerate(self.pipeline): component_cfg.setdefault(name, {}) component_cfg[name].setdefault("drop", drop) - component_cfg[name]["set_annotations"] = bool(component_deps[i]) + component_cfg[name].setdefault("set_annotations", False) for name, proc in self.pipeline: if not hasattr(proc, "update"): continue proc.update(examples, sgd=None, losses=losses, **component_cfg[name]) - if sgd is not False: + if sgd not in (None, False): for name, proc in self.pipeline: if hasattr(proc, "model"): proc.model.finish_update(sgd) From 1b20ffac3814b111d76f95d1b08c72f4b770ce77 Mon Sep 17 00:00:00 2001 From: Matthw Honnibal Date: Wed, 8 Jul 2020 21:37:06 +0200 Subject: [PATCH 18/21] batch_by_words by default --- spacy/cli/train.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/spacy/cli/train.py b/spacy/cli/train.py index 398b72952..bda3c9ca2 100644 --- a/spacy/cli/train.py +++ b/spacy/cli/train.py @@ -307,7 +307,7 @@ def create_train_batches(nlp, corpus, cfg): if len(train_examples) == 0: raise ValueError(Errors.E988) epoch += 1 - if cfg.get("batch_by_words"): + if cfg.get("batch_by_words", True): batches = util.minibatch_by_words( train_examples, size=cfg["batch_size"], From 9b49787f352a039b883e3fac74f0abf5c5c82f83 Mon Sep 17 00:00:00 2001 From: Matthw Honnibal Date: Wed, 8 Jul 2020 21:38:01 +0200 Subject: [PATCH 19/21] Update NER config. Getting 84.8 --- examples/experiments/onto-ner.cfg | 21 +++++++++++---------- 1 file changed, 11 insertions(+), 10 deletions(-) diff --git a/examples/experiments/onto-ner.cfg b/examples/experiments/onto-ner.cfg index 8970bb3c0..228289128 100644 --- a/examples/experiments/onto-ner.cfg +++ b/examples/experiments/onto-ner.cfg @@ -13,24 +13,25 @@ dropout = 0.1 # Controls early-stopping. 0 or -1 mean unlimited. 
patience = 100000 max_epochs = 0 -max_steps = 100000 -eval_frequency = 2000 +max_steps = 0 +eval_frequency = 1000 # Other settings seed = 0 -accumulate_gradient = 1 +accumulate_gradient = 2 use_pytorch_for_gpu_memory = false # Control how scores are printed and checkpoints are evaluated. scores = ["speed", "ents_p", "ents_r", "ents_f"] score_weights = {"ents_f": 1.0} # These settings are invalid for the transformer models. init_tok2vec = null -discard_oversize = false +discard_oversize = true omit_extra_lookups = false +batch_by_words = true [training.batch_size] @schedules = "compounding.v1" -start = 100 -stop = 2000 +start = 1000 +stop = 1000 compound = 1.001 [training.optimizer] @@ -38,7 +39,7 @@ compound = 1.001 beta1 = 0.9 beta2 = 0.999 L2_is_weight_decay = true -L2 = 0.0 +L2 = 0.01 grad_clip = 1.0 use_averages = true eps = 1e-8 @@ -64,15 +65,15 @@ min_action_freq = 1 nr_feature_tokens = 3 hidden_width = 64 maxout_pieces = 2 -use_upper = false +use_upper = true [nlp.pipeline.ner.model.tok2vec] @architectures = "spacy.HashEmbedCNN.v1" pretrained_vectors = ${nlp:vectors} -width = 300 +width = 96 depth = 4 window_size = 1 -embed_size = 7000 +embed_size = 2000 maxout_pieces = 1 subword_features = true dropout = ${training:dropout} From ad15499b3b2b71892a8c46c9e75237e394654ce1 Mon Sep 17 00:00:00 2001 From: Adriane Boyd Date: Thu, 9 Jul 2020 01:41:58 +0200 Subject: [PATCH 20/21] Fix get_loss for values outside of labels in senter (#5730) * Fix get_loss for None alignments in senter When converting the `sent_start` values back to `SentenceRecognizer` labels, handle `None` alignments. * Handle SENT_START as -1 Handle SENT_START as -1 (or -1 converted to uint64) by treating any values other than 1 the same as 0 in `SentenceRecognizer.get_loss`. 
--- spacy/pipeline/pipes.pyx | 13 ++++++++++++- spacy/tests/pipeline/test_senter.py | 5 +++++ 2 files changed, 17 insertions(+), 1 deletion(-) diff --git a/spacy/pipeline/pipes.pyx b/spacy/pipeline/pipes.pyx index cc3c39f03..86c768e9b 100644 --- a/spacy/pipeline/pipes.pyx +++ b/spacy/pipeline/pipes.pyx @@ -523,7 +523,18 @@ class SentenceRecognizer(Tagger): def get_loss(self, examples, scores): labels = self.labels loss_func = SequenceCategoricalCrossentropy(names=labels, normalize=False) - truths = [[labels[x] for x in eg.get_aligned("sent_start")] for eg in examples] + truths = [] + for eg in examples: + eg_truth = [] + for x in eg.get_aligned("sent_start"): + if x == None: + eg_truth.append(None) + elif x == 1: + eg_truth.append(labels[1]) + else: + # anything other than 1: 0, -1, -1 as uint64 + eg_truth.append(labels[0]) + truths.append(eg_truth) d_scores, loss = loss_func(scores, truths) if self.model.ops.xp.isnan(loss): raise ValueError("nan value when computing loss") diff --git a/spacy/tests/pipeline/test_senter.py b/spacy/tests/pipeline/test_senter.py index bfa1bd65a..82f536076 100644 --- a/spacy/tests/pipeline/test_senter.py +++ b/spacy/tests/pipeline/test_senter.py @@ -38,6 +38,11 @@ def test_overfitting_IO(): train_examples = [] for t in TRAIN_DATA: train_examples.append(Example.from_dict(nlp.make_doc(t[0]), t[1])) + # add some cases where SENT_START == -1 + train_examples[0].reference[10].is_sent_start = False + train_examples[1].reference[1].is_sent_start = False + train_examples[1].reference[11].is_sent_start = False + nlp.add_pipe(senter) optimizer = nlp.begin_training() From 8f9552d9e722d6e14e47304c0fc40ec5b4177677 Mon Sep 17 00:00:00 2001 From: Ines Montani Date: Thu, 9 Jul 2020 01:42:51 +0200 Subject: [PATCH 21/21] Refactor project CLI (#5732) * Make project command a submodule * Update with WIP * Add helper for joining commands * Update docstrins, formatting and types * Update assets and add support for copying local files * Fix type * Update 
success messages --- spacy/cli/__init__.py | 6 +- spacy/cli/_app.py | 7 + spacy/cli/project.py | 805 ---------------------------------- spacy/cli/project/__init__.py | 0 spacy/cli/project/assets.py | 154 +++++++ spacy/cli/project/clone.py | 110 +++++ spacy/cli/project/dvc.py | 206 +++++++++ spacy/cli/project/run.py | 250 +++++++++++ spacy/cli/project/util.py | 57 +++ spacy/schemas.py | 2 +- spacy/tests/test_projects.py | 31 ++ spacy/util.py | 19 + 12 files changed, 839 insertions(+), 808 deletions(-) delete mode 100644 spacy/cli/project.py create mode 100644 spacy/cli/project/__init__.py create mode 100644 spacy/cli/project/assets.py create mode 100644 spacy/cli/project/clone.py create mode 100644 spacy/cli/project/dvc.py create mode 100644 spacy/cli/project/run.py create mode 100644 spacy/cli/project/util.py create mode 100644 spacy/tests/test_projects.py diff --git a/spacy/cli/__init__.py b/spacy/cli/__init__.py index 5dc3070b6..0568b34de 100644 --- a/spacy/cli/__init__.py +++ b/spacy/cli/__init__.py @@ -15,8 +15,10 @@ from .evaluate import evaluate # noqa: F401 from .convert import convert # noqa: F401 from .init_model import init_model # noqa: F401 from .validate import validate # noqa: F401 -from .project import project_clone, project_assets, project_run # noqa: F401 -from .project import project_run_all # noqa: F401 +from .project.clone import project_clone # noqa: F401 +from .project.assets import project_assets # noqa: F401 +from .project.run import project_run # noqa: F401 +from .project.dvc import project_update_dvc # noqa: F401 @app.command("link", no_args_is_help=True, deprecated=True, hidden=True) diff --git a/spacy/cli/_app.py b/spacy/cli/_app.py index 2b3ad9524..e970c4dde 100644 --- a/spacy/cli/_app.py +++ b/spacy/cli/_app.py @@ -8,9 +8,16 @@ HELP = """spaCy Command-line Interface DOCS: https://spacy.io/api/cli """ +PROJECT_HELP = f"""Command-line interface for spaCy projects and working with +project templates. 
You'd typically start by cloning a project template to a local +directory and fetching its assets like datasets etc. See the project's +project.yml for the available commands. +""" app = typer.Typer(name=NAME, help=HELP) +project_cli = typer.Typer(name="project", help=PROJECT_HELP, no_args_is_help=True) +app.add_typer(project_cli) # Wrappers for Typer's annotations. Initially created to set defaults and to # keep the names short, but not needed at the moment. diff --git a/spacy/cli/project.py b/spacy/cli/project.py deleted file mode 100644 index 33a8ff11a..000000000 --- a/spacy/cli/project.py +++ /dev/null @@ -1,805 +0,0 @@ -from typing import List, Dict, Any, Optional, Sequence, Union -import typer -import srsly -from pathlib import Path -from wasabi import msg -import subprocess -import os -import re -import shutil -import sys -import requests -import tqdm - -from ._app import app, Arg, Opt, COMMAND, NAME -from .. import about -from ..schemas import ProjectConfigSchema, validate -from ..util import ensure_path, run_command, make_tempdir, working_dir -from ..util import get_hash, get_checksum, split_command - - -PROJECT_FILE = "project.yml" -DVC_CONFIG = "dvc.yaml" -DVC_DIR = ".dvc" -DIRS = [ - "assets", - "metas", - "configs", - "packages", - "metrics", - "scripts", - "notebooks", - "training", - "corpus", -] -CACHES = [ - Path.home() / ".torch", - Path.home() / ".caches" / "torch", - os.environ.get("TORCH_HOME"), - Path.home() / ".keras", -] -DVC_CONFIG_COMMENT = f"""# This file is auto-generated by spaCy based on your {PROJECT_FILE}. Do not edit -# it directly and edit the {PROJECT_FILE} instead and re-run the project.""" -CLI_HELP = f"""Command-line interface for spaCy projects and working with project -templates. You'd typically start by cloning a project template to a local -directory and fetching its assets like datasets etc. See the project's -{PROJECT_FILE} for the available commands. 
Under the hood, spaCy uses DVC (Data -Version Control) to manage input and output files and to ensure steps are only -re-run if their inputs change. -""" - -project_cli = typer.Typer(help=CLI_HELP, no_args_is_help=True) - - -@project_cli.callback(invoke_without_command=True) -def callback(ctx: typer.Context): - """This runs before every project command and ensures DVC is installed.""" - ensure_dvc() - - -################ -# CLI COMMANDS # -################ - - -@project_cli.command("clone") -def project_clone_cli( - # fmt: off - name: str = Arg(..., help="The name of the template to fetch"), - dest: Path = Arg(Path.cwd(), help="Where to download and work. Defaults to current working directory.", exists=False), - repo: str = Opt(about.__projects__, "--repo", "-r", help="The repository to look in."), - git: bool = Opt(False, "--git", "-G", help="Initialize project as a Git repo"), - no_init: bool = Opt(False, "--no-init", "-NI", help="Don't initialize the project with DVC"), - # fmt: on -): - """Clone a project template from a repository. Calls into "git" and will - only download the files from the given subdirectory. The GitHub repo - defaults to the official spaCy template repo, but can be customized - (including using a private repo). Setting the --git flag will also - initialize the project directory as a Git repo. If the project is intended - to be a Git repo, it should be initialized with Git first, before - initializing DVC (Data Version Control). This allows DVC to integrate with - Git. - """ - if dest == Path.cwd(): - dest = dest / name - project_clone(name, dest, repo=repo, git=git, no_init=no_init) - - -@project_cli.command("init") -def project_init_cli( - # fmt: off - path: Path = Arg(Path.cwd(), help="Path to cloned project. 
Defaults to current working directory.", exists=True, file_okay=False), - git: bool = Opt(False, "--git", "-G", help="Initialize project as a Git repo"), - force: bool = Opt(False, "--force", "-F", "-f", help="Force initiziation"), - # fmt: on -): - """Initialize a project directory with DVC and optionally Git. This should - typically be taken care of automatically when you run the "project clone" - command, but you can also run it separately. If the project is intended to - be a Git repo, it should be initialized with Git first, before initializing - DVC. This allows DVC to integrate with Git. - """ - project_init(path, git=git, force=force) - - -@project_cli.command("assets") -def project_assets_cli( - # fmt: off - project_dir: Path = Arg(Path.cwd(), help="Path to cloned project. Defaults to current working directory.", exists=True, file_okay=False), - # fmt: on -): - """Use DVC (Data Version Control) to fetch project assets. Assets are - defined in the "assets" section of the project.yml. If possible, DVC - will try to track the files so you can pull changes from upstream. It will - also try and store the checksum so the assets are versioned. If the file - can't be tracked or checked, it will be downloaded without DVC. If a checksum - is provided in the project.yml, the file is only downloaded if no local - file with the same checksum exists. - """ - project_assets(project_dir) - - -@project_cli.command( - "run-all", - context_settings={"allow_extra_args": True, "ignore_unknown_options": True}, -) -def project_run_all_cli( - # fmt: off - ctx: typer.Context, - project_dir: Path = Arg(Path.cwd(), help="Location of project directory. Defaults to current working directory.", exists=True, file_okay=False), - show_help: bool = Opt(False, "--help", help="Show help message and available subcommands") - # fmt: on -): - """Run all commands defined in the project. 
This command will use DVC and - the defined outputs and dependencies in the project.yml to determine - which steps need to be re-run and where to start. This means you're only - re-generating data if the inputs have changed. - - This command calls into "dvc repro" and all additional arguments are passed - to the "dvc repro" command: https://dvc.org/doc/command-reference/repro - """ - if show_help: - print_run_help(project_dir) - else: - project_run_all(project_dir, *ctx.args) - - -@project_cli.command( - "run", context_settings={"allow_extra_args": True, "ignore_unknown_options": True}, -) -def project_run_cli( - # fmt: off - ctx: typer.Context, - subcommand: str = Arg(None, help=f"Name of command defined in the {PROJECT_FILE}"), - project_dir: Path = Arg(Path.cwd(), help="Location of project directory. Defaults to current working directory.", exists=True, file_okay=False), - show_help: bool = Opt(False, "--help", help="Show help message and available subcommands") - # fmt: on -): - """Run a named script defined in the project.yml. If the command is - part of the default pipeline defined in the "run" section, DVC is used to - determine whether the step should re-run if its inputs have changed, or - whether everything is up to date. If the script is not part of the default - pipeline, it will be called separately without DVC. - - If DVC is used, the command calls into "dvc repro" and all additional - arguments are passed to the "dvc repro" command: - https://dvc.org/doc/command-reference/repro - """ - if show_help or not subcommand: - print_run_help(project_dir, subcommand) - else: - project_run(project_dir, subcommand, *ctx.args) - - -@project_cli.command("exec", hidden=True) -def project_exec_cli( - # fmt: off - subcommand: str = Arg(..., help=f"Name of command defined in the {PROJECT_FILE}"), - project_dir: Path = Arg(Path.cwd(), help="Location of project directory. 
Defaults to current working directory.", exists=True, file_okay=False), - # fmt: on -): - """Execute a command defined in the project.yml. This CLI command is - only called internally in auto-generated DVC pipelines, as a shortcut for - multi-step commands in the project.yml. You typically shouldn't have to - call it yourself. To run a command, call "run" or "run-all". - """ - project_exec(project_dir, subcommand) - - -@project_cli.command("update-dvc") -def project_update_dvc_cli( - # fmt: off - project_dir: Path = Arg(Path.cwd(), help="Location of project directory. Defaults to current working directory.", exists=True, file_okay=False), - verbose: bool = Opt(False, "--verbose", "-V", help="Print more info"), - force: bool = Opt(False, "--force", "-F", help="Force update DVC config"), - # fmt: on -): - """Update the auto-generated DVC config file. Uses the steps defined in the - "run" section of the project.yml. This typically happens automatically - when running a command, but can also be triggered manually if needed. - """ - config = load_project_config(project_dir) - updated = update_dvc_config(project_dir, config, verbose=verbose, force=force) - if updated: - msg.good(f"Updated DVC config from {PROJECT_FILE}") - else: - msg.info(f"No changes found in {PROJECT_FILE}, no update needed") - - -app.add_typer(project_cli, name="project") - - -################# -# CLI FUNCTIONS # -################# - - -def project_clone( - name: str, - dest: Path, - *, - repo: str = about.__projects__, - git: bool = False, - no_init: bool = False, -) -> None: - """Clone a project template from a repository. - - name (str): Name of subdirectory to clone. - dest (Path): Destination path of cloned project. - repo (str): URL of Git repo containing project templates. - git (bool): Initialize project as Git repo. Should be set to True if project - is intended as a repo, since it will allow DVC to integrate with Git. - no_init (bool): Don't initialize DVC and Git automatically. 
If True, the - "init" command or "git init" and "dvc init" need to be run manually. - """ - dest = ensure_path(dest) - check_clone(name, dest, repo) - project_dir = dest.resolve() - # We're using Git and sparse checkout to only clone the files we need - with make_tempdir() as tmp_dir: - cmd = f"git clone {repo} {tmp_dir} --no-checkout --depth 1 --config core.sparseCheckout=true" - try: - run_command(cmd) - except DVCError: - err = f"Could not clone the repo '{repo}' into the temp dir '{tmp_dir}'." - msg.fail(err) - with (tmp_dir / ".git" / "info" / "sparse-checkout").open("w") as f: - f.write(name) - try: - run_command(["git", "-C", str(tmp_dir), "fetch"]) - run_command(["git", "-C", str(tmp_dir), "checkout"]) - except DVCError: - err = f"Could not clone '{name}' in the repo '{repo}'." - msg.fail(err) - shutil.move(str(tmp_dir / Path(name).name), str(project_dir)) - msg.good(f"Cloned project '{name}' from {repo} into {project_dir}") - for sub_dir in DIRS: - dir_path = project_dir / sub_dir - if not dir_path.exists(): - dir_path.mkdir(parents=True) - if not no_init: - project_init(project_dir, git=git, force=True, silent=True) - msg.good(f"Your project is now ready!", dest) - print(f"To fetch the assets, run:\n{COMMAND} project assets {dest}") - - -def project_init( - project_dir: Path, - *, - git: bool = False, - force: bool = False, - silent: bool = False, - analytics: bool = False, -): - """Initialize a project as a DVC and (optionally) as a Git repo. - - project_dir (Path): Path to project directory. - git (bool): Also call "git init" to initialize directory as a Git repo. - silent (bool): Don't print any output (via DVC). - analytics (bool): Opt-in to DVC analytics (defaults to False). - """ - with working_dir(project_dir) as cwd: - if git: - run_command(["git", "init"]) - flags = {"--force": force, "--quiet": silent, "--no-scm": not git} - try: - run_dvc_command(["init"], flags=flags) - except DVCError: - msg.fail( - "Failed to initialize project. 
This likely means that the " - "project is already initialized and has a .dvc directory. " - "To force-initialize, use the --force flag.", - exits=1, - ) - # We don't want to have analytics on by default – our users should - # opt-in explicitly. If they want it, they can always enable it. - if not analytics: - run_dvc_command(["config", "core.analytics", "false"]) - # Remove unused and confusing plot templates from .dvc directory. - # Otherwise super confusing once you commit your changes via Git and it - # creates a bunch of files that have no purpose. - plots_dir = cwd / DVC_DIR / "plots" - if plots_dir.exists(): - shutil.rmtree(str(plots_dir)) - config = load_project_config(cwd) - setup_check_dvc(cwd, config) - msg.good("Initialized project") - - -def project_assets(project_dir: Path) -> None: - """Fetch assets for a project using DVC if possible. - - project_dir (Path): Path to project directory. - """ - project_path = ensure_path(project_dir) - config = load_project_config(project_path) - setup_check_dvc(project_path, config) - assets = config.get("assets", {}) - if not assets: - msg.warn(f"No assets specified in {PROJECT_FILE}", exits=0) - msg.info(f"Fetching {len(assets)} asset(s)") - variables = config.get("variables", {}) - fetched_assets = [] - for asset in assets: - dest = asset["dest"].format(**variables) - url = asset.get("url") - checksum = asset.get("checksum") - if not url: - # project.yml defines asset without URL that the user has to place - if not Path(dest).exists(): - err = f"No URL provided for asset. 
You need to add this file yourself: {dest}" - msg.warn(err) - else: - if checksum == get_checksum(dest): - msg.good(f"Asset exists with matching checksum: {dest}") - fetched_assets.append((project_path / dest).resolve()) - else: - msg.fail(f"Asset available but with incorrect checksum: {dest}") - continue - url = url.format(**variables) - fetched_path = fetch_asset(project_path, url, dest, checksum) - if fetched_path: - fetched_assets.append(str(fetched_path)) - if fetched_assets: - with working_dir(project_path): - run_dvc_command(["add", *fetched_assets, "--external"]) - - -def fetch_asset( - project_path: Path, url: str, dest: Path, checksum: Optional[str] = None -) -> Optional[Path]: - """Fetch an asset from a given URL or path. Will try to import the file - using DVC's import-url if possible (fully tracked and versioned) and falls - back to get-url (versioned) and a non-DVC download if necessary. If a - checksum is provided and a local file exists, it's only re-downloaded if the - checksum doesn't match. - - project_path (Path): Path to project directory. - url (str): URL or path to asset. - checksum (Optional[str]): Optional expected checksum of local file. - RETURNS (Optional[Path]): The path to the fetched asset or None if fetching - the asset failed. - """ - url = convert_asset_url(url) - dest_path = (project_path / dest).resolve() - if dest_path.exists() and checksum: - # If there's already a file, check for checksum - # TODO: add support for caches (dvc import-url with local path) - if checksum == get_checksum(dest_path): - msg.good(f"Skipping download with matching checksum: {dest}") - return dest_path - with working_dir(project_path): - try: - # If these fail, we don't want to output an error or info message. - # Try with tracking the source first, then just downloading with - # DVC, then a regular non-DVC download. 
- try: - run_dvc_command(["import-url", url, str(dest_path)]) - except DVCError: - run_dvc_command(["get-url", url, str(dest_path)]) - except DVCError: - try: - download_file(url, dest_path) - except requests.exceptions.HTTPError as e: - msg.fail(f"Download failed: {dest}", e) - return None - if checksum and checksum != get_checksum(dest_path): - msg.fail(f"Checksum doesn't match value defined in {PROJECT_FILE}: {dest}") - msg.good(f"Fetched asset {dest}") - return dest_path - - -def project_run_all(project_dir: Path, *dvc_args) -> None: - """Run all commands defined in the project using DVC. - - project_dir (Path): Path to project directory. - *dvc_args: Other arguments passed to "dvc repro". - """ - config = load_project_config(project_dir) - setup_check_dvc(project_dir, config) - with working_dir(project_dir): - try: - run_dvc_command(["repro", *dvc_args]) - except DVCError: - # We could raise a custom error here, but the output produced by - # DVC is already pretty substantial. - sys.exit(1) - - -def print_run_help(project_dir: Path, subcommand: Optional[str] = None) -> None: - """Simulate a CLI help prompt using the info available in the project.yml. - - project_dir (Path): The project directory. - subcommand (Optional[str]): The subcommand or None. If a subcommand is - provided, the subcommand help is shown. Otherwise, the top-level help - and a list of available commands is printed. 
- """ - config = load_project_config(project_dir) - setup_check_dvc(project_dir, config) - config_commands = config.get("commands", []) - commands = {cmd["name"]: cmd for cmd in config_commands} - if subcommand: - validate_subcommand(commands.keys(), subcommand) - print(f"Usage: {COMMAND} project run {subcommand} {project_dir}") - help_text = commands[subcommand].get("help") - if help_text: - msg.text(f"\n{help_text}\n") - else: - print(f"\nAvailable commands in {PROJECT_FILE}") - print(f"Usage: {COMMAND} project run [COMMAND] {project_dir}") - msg.table([(cmd["name"], cmd.get("help", "")) for cmd in config_commands]) - msg.text(f"Run all commands defined in the 'run' block of the {PROJECT_FILE}:") - print(f"{COMMAND} project run-all {project_dir}") - - -def project_run(project_dir: Path, subcommand: str, *dvc_args) -> None: - """Run a named script defined in the project.yml. If the script is part - of the default pipeline (defined in the "run" section), DVC is used to - execute the command, so it can determine whether to rerun it. It then - calls into "exec" to execute it. - - project_dir (Path): Path to project directory. - subcommand (str): Name of command to run. - *dvc_args: Other arguments passed to "dvc repro". - """ - config = load_project_config(project_dir) - setup_check_dvc(project_dir, config) - config_commands = config.get("commands", []) - variables = config.get("variables", {}) - commands = {cmd["name"]: cmd for cmd in config_commands} - validate_subcommand(commands.keys(), subcommand) - if subcommand in config.get("run", []): - # This is one of the pipeline commands tracked in DVC - with working_dir(project_dir): - try: - run_dvc_command(["repro", subcommand, *dvc_args]) - except DVCError: - # We could raise a custom error here, but the output produced by - # DVC is already pretty substantial. 
- sys.exit(1) - else: - cmd = commands[subcommand] - # Deps in non-DVC commands aren't tracked, but if they're defined, - # make sure they exist before running the command - for dep in cmd.get("deps", []): - if not (project_dir / dep).exists(): - err = f"Missing dependency specified by command '{subcommand}': {dep}" - msg.fail(err, exits=1) - with working_dir(project_dir): - run_commands(cmd["script"], variables) - - -def project_exec(project_dir: Path, subcommand: str) -> None: - """Execute a command defined in the project.yml. - - project_dir (Path): Path to project directory. - subcommand (str): Name of command to run. - """ - config = load_project_config(project_dir) - config_commands = config.get("commands", []) - variables = config.get("variables", {}) - commands = {cmd["name"]: cmd for cmd in config_commands} - with working_dir(project_dir): - run_commands(commands[subcommand]["script"], variables) - - -########### -# HELPERS # -########### - - -def load_project_config(path: Path) -> Dict[str, Any]: - """Load the project.yml file from a directory and validate it. - - path (Path): The path to the project directory. - RETURNS (Dict[str, Any]): The loaded project.yml. - """ - config_path = path / PROJECT_FILE - if not config_path.exists(): - msg.fail(f"Can't find {PROJECT_FILE}", config_path, exits=1) - invalid_err = f"Invalid {PROJECT_FILE}. Double-check that the YAML is correct." - try: - config = srsly.read_yaml(config_path) - except ValueError as e: - msg.fail(invalid_err, e, exits=1) - errors = validate(ProjectConfigSchema, config) - if errors: - msg.fail(invalid_err, "\n".join(errors), exits=1) - return config - - -def update_dvc_config( - path: Path, - config: Dict[str, Any], - verbose: bool = False, - silent: bool = False, - force: bool = False, -) -> bool: - """Re-run the DVC commands in dry mode and update dvc.yaml file in the - project directory. The file is auto-generated based on the config. 
The - first line of the auto-generated file specifies the hash of the config - dict, so if any of the config values change, the DVC config is regenerated. - - path (Path): The path to the project directory. - config (Dict[str, Any]): The loaded project.yml. - verbose (bool): Whether to print additional info (via DVC). - silent (bool): Don't output anything (via DVC). - force (bool): Force update, even if hashes match. - RETURNS (bool): Whether the DVC config file was updated. - """ - config_hash = get_hash(config) - path = path.resolve() - dvc_config_path = path / DVC_CONFIG - if dvc_config_path.exists(): - # Check if the file was generated using the current config, if not, redo - with dvc_config_path.open("r", encoding="utf8") as f: - ref_hash = f.readline().strip().replace("# ", "") - if ref_hash == config_hash and not force: - return False # Nothing has changed in project.yml, don't need to update - dvc_config_path.unlink() - variables = config.get("variables", {}) - dvc_commands = [] - # We only want to include commands that are part of the main list of "run" - # commands in project.yml and should be run in sequence - config_commands = {cmd["name"]: cmd for cmd in config.get("commands", [])} - for name in config.get("run", []): - validate_subcommand(config_commands.keys(), name) - command = config_commands[name] - deps = command.get("deps", []) - outputs = command.get("outputs", []) - outputs_no_cache = command.get("outputs_no_cache", []) - if not deps and not outputs and not outputs_no_cache: - continue - # Default to the working dir as the project path since dvc.yaml is auto-generated - # and we don't want arbitrary paths in there - project_cmd = ["python", "-m", NAME, "project", "exec", name] - deps_cmd = [c for cl in [["-d", p] for p in deps] for c in cl] - outputs_cmd = [c for cl in [["-o", p] for p in outputs] for c in cl] - outputs_nc_cmd = [c for cl in [["-O", p] for p in outputs_no_cache] for c in cl] - dvc_cmd = ["run", "-n", name, "-w", str(path), 
"--no-exec"] - full_cmd = [*dvc_cmd, *deps_cmd, *outputs_cmd, *outputs_nc_cmd, *project_cmd] - dvc_commands.append(" ".join(full_cmd)) - with working_dir(path): - dvc_flags = {"--verbose": verbose, "--quiet": silent} - run_dvc_commands(dvc_commands, variables, flags=dvc_flags) - with dvc_config_path.open("r+", encoding="utf8") as f: - content = f.read() - f.seek(0, 0) - f.write(f"# {config_hash}\n{DVC_CONFIG_COMMENT}\n{content}") - return True - - -def ensure_dvc() -> None: - """Ensure that the "dvc" command is available and show an error if not.""" - try: - subprocess.run(["dvc", "--version"], stdout=subprocess.DEVNULL) - except Exception: - msg.fail( - "spaCy projects require DVC (Data Version Control) and the 'dvc' command", - "You can install the Python package from pip (pip install dvc) or " - "conda (conda install -c conda-forge dvc). For more details, see the " - "documentation: https://dvc.org/doc/install", - exits=1, - ) - - -def setup_check_dvc(project_dir: Path, config: Dict[str, Any]) -> None: - """Check that the project is set up correctly with DVC and update its - config if needed. Will raise an error if the project is not an initialized - DVC project. - - project_dir (Path): The path to the project directory. - config (Dict[str, Any]): The loaded project.yml. - """ - if not project_dir.exists(): - msg.fail(f"Can't find project directory: {project_dir}") - if not (project_dir / ".dvc").exists(): - msg.fail( - "Project not initialized as a DVC project.", - f"Make sure that the project template was cloned correctly. To " - f"initialize the project directory manually, you can run: " - f"{COMMAND} project init {project_dir}", - exits=1, - ) - with msg.loading("Updating DVC config..."): - updated = update_dvc_config(project_dir, config, silent=True) - if updated: - msg.good(f"Updated DVC config from changed {PROJECT_FILE}") - - -def convert_asset_url(url: str) -> str: - """Check and convert the asset URL if needed. - - url (str): The asset URL. 
- RETURNS (str): The converted URL. - """ - # If the asset URL is a regular GitHub URL it's likely a mistake - if re.match(r"(http(s?)):\/\/github.com", url): - converted = url.replace("github.com", "raw.githubusercontent.com") - converted = re.sub(r"/(tree|blob)/", "/", converted) - msg.warn( - "Downloading from a regular GitHub URL. This will only download " - "the source of the page, not the actual file. Converting the URL " - "to a raw URL.", - converted, - ) - return converted - return url - - -def check_clone(name: str, dest: Path, repo: str) -> None: - """Check and validate that the destination path can be used to clone. Will - check that Git is available and that the destination path is suitable. - - name (str): Name of the directory to clone from the repo. - dest (Path): Local destination of cloned directory. - repo (str): URL of the repo to clone from. - """ - try: - subprocess.run(["git", "--version"], stdout=subprocess.DEVNULL) - except Exception: - msg.fail( - f"Cloning spaCy project templates requires Git and the 'git' command. ", - f"To clone a project without Git, copy the files from the '{name}' " - f"directory in the {repo} to {dest} manually and then run:", - f"{COMMAND} project init {dest}", - exits=1, - ) - if not dest: - msg.fail(f"Not a valid directory to clone project: {dest}", exits=1) - if dest.exists(): - # Directory already exists (not allowed, clone needs to create it) - msg.fail(f"Can't clone project, directory already exists: {dest}", exits=1) - if not dest.parent.exists(): - # We're not creating parents, parent dir should exist - msg.fail( - f"Can't clone project, parent directory doesn't exist: {dest.parent}", - exits=1, - ) - - -def validate_subcommand(commands: Sequence[str], subcommand: str) -> None: - """Check that a subcommand is valid and defined. Raises an error otherwise. - - commands (Sequence[str]): The available commands. - subcommand (str): The subcommand. 
- """ - if subcommand not in commands: - msg.fail( - f"Can't find command '{subcommand}' in {PROJECT_FILE}. " - f"Available commands: {', '.join(commands)}", - exits=1, - ) - - -def download_file(url: str, dest: Path, chunk_size: int = 1024) -> None: - """Download a file using requests. - - url (str): The URL of the file. - dest (Path): The destination path. - chunk_size (int): The size of chunks to read/write. - """ - response = requests.get(url, stream=True) - response.raise_for_status() - total = int(response.headers.get("content-length", 0)) - progress_settings = { - "total": total, - "unit": "iB", - "unit_scale": True, - "unit_divisor": chunk_size, - "leave": False, - } - with dest.open("wb") as f, tqdm.tqdm(**progress_settings) as bar: - for data in response.iter_content(chunk_size=chunk_size): - size = f.write(data) - bar.update(size) - - -def run_commands( - commands: List[str] = tuple(), variables: Dict[str, str] = {}, silent: bool = False -) -> None: - """Run a sequence of commands in a subprocess, in order. - - commands (List[str]): The string commands. - variables (Dict[str, str]): Dictionary of variable names, mapped to their - values. Will be used to substitute format string variables in the - commands. - silent (bool): Don't print the commands. - """ - for command in commands: - # Substitute variables, e.g. "./{NAME}.json" - command = command.format(**variables) - command = split_command(command) - # Not sure if this is needed or a good idea. Motivation: users may often - # use commands in their config that reference "python" and we want to - # make sure that it's always executing the same Python that spaCy is - # executed with and the pip in the same env, not some other Python/pip. - # Also ensures cross-compatibility if user 1 writes "python3" (because - # that's how it's set up on their system), and user 2 without the - # shortcut tries to re-run the command. 
- if len(command) and command[0] in ("python", "python3"): - command[0] = sys.executable - elif len(command) and command[0] in ("pip", "pip3"): - command = [sys.executable, "-m", "pip", *command[1:]] - if not silent: - print(f"Running command: {' '.join(command)}") - run_command(command) - - -def run_dvc_commands( - commands: List[str] = tuple(), - variables: Dict[str, str] = {}, - flags: Dict[str, bool] = {}, -) -> None: - """Run a sequence of DVC commands in a subprocess, in order. - - commands (List[str]): The string commands without the leading "dvc". - variables (Dict[str, str]): Dictionary of variable names, mapped to their - values. Will be used to substitute format string variables in the - commands. - flags (Dict[str, bool]): Conditional flags to be added to command. Makes it - easier to pass flags like --quiet that depend on a variable or - command-line setting while avoiding lots of nested conditionals. - """ - for command in commands: - # Substitute variables, e.g. "./{NAME}.json" - command = command.format(**variables) - command = split_command(command) - run_dvc_command(command, flags=flags) - - -def run_dvc_command( - command: Union[str, List[str]], flags: Dict[str, bool] = {}, silent: bool = False -) -> None: - """Run a DVC command in a subprocess. This wrapper gives us a bit more - control over how the output and errors are presented. Raises a DVC error if - the "dvc" command returns a non-zero exit code and uses the error message - logged by DVC. - - command (Union[str, List[str]]): The command, without the leading "dvc". - flags (Dict[str, bool]): Conditional flags to be added to command. Makes it - easier to pass flags like --quiet that depend on a variable or - command-line setting while avoiding lots of nested conditionals. - silent (bool): Don't print any output. 
- """ - if isinstance(command, str): - command = split_command(command) - dvc_command = ["dvc", *command] - # Add the flags if they are set to True - for flag, is_active in flags.items(): - if is_active: - dvc_command.append(flag) - proc = subprocess.Popen(dvc_command, stderr=subprocess.PIPE, stdout=subprocess.PIPE) - if not silent: - lines = proc.stdout.read().decode("utf8").split("\n\n") - for line in lines: - line = line.strip() - if is_relevant_dvc_output(line): - print(f"{line}\n") - _, err = proc.communicate() # Important: otherwise returncode will be None! - if proc.returncode != 0: - if isinstance(err, bytes): - err = err.decode("utf8") - raise DVCError(err) - - -def is_relevant_dvc_output(line: str) -> bool: - """Check whether the output by DVC is something we want to keep. - - line (str): A line written to stdout,. - RETURNS (bool): Whether to use/print the line. - """ - # Writing them like this for readability but maybe replace with regex? - conditions = [ - not line, - line.startswith("What's next?"), - line.startswith("Having any troubles?"), - ] - return not any(conditions) - - -class DVCError(RuntimeError): - """Custom error type for anything produced by the DVC CLI.""" - - pass diff --git a/spacy/cli/project/__init__.py b/spacy/cli/project/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/spacy/cli/project/assets.py b/spacy/cli/project/assets.py new file mode 100644 index 000000000..0ef3419f3 --- /dev/null +++ b/spacy/cli/project/assets.py @@ -0,0 +1,154 @@ +from typing import Optional +from pathlib import Path +from wasabi import msg +import requests +import tqdm +import re +import shutil + +from ...util import ensure_path, get_checksum, working_dir +from .._app import project_cli, Arg +from .util import PROJECT_FILE, load_project_config + + +# TODO: find a solution for caches +# CACHES = [ +# Path.home() / ".torch", +# Path.home() / ".caches" / "torch", +# os.environ.get("TORCH_HOME"), +# Path.home() / ".keras", +# ] + + 
+@project_cli.command("assets")
+def project_assets_cli(
+    # fmt: off
+    project_dir: Path = Arg(Path.cwd(), help="Path to cloned project. Defaults to current working directory.", exists=True, file_okay=False),
+    # fmt: on
+):
+    """Fetch project assets like datasets and pretrained weights. Assets are
+    defined in the "assets" section of the project.yml. If a checksum is
+    provided in the project.yml, the file is only downloaded if no local file
+    with the same checksum exists.
+    """
+    project_assets(project_dir)
+
+
+def project_assets(project_dir: Path) -> None:
+    """Fetch assets for a project using DVC if possible.
+
+    project_dir (Path): Path to project directory.
+    """
+    project_path = ensure_path(project_dir)
+    config = load_project_config(project_path)
+    assets = config.get("assets", {})
+    if not assets:
+        msg.warn(f"No assets specified in {PROJECT_FILE}", exits=0)
+    msg.info(f"Fetching {len(assets)} asset(s)")
+    variables = config.get("variables", {})
+    for asset in assets:
+        dest = asset["dest"].format(**variables)
+        url = asset.get("url")
+        checksum = asset.get("checksum")
+        if not url:
+            # project.yml defines asset without URL that the user has to place
+            check_private_asset(dest, checksum)
+            continue
+        url = url.format(**variables)
+        fetch_asset(project_path, url, dest, checksum)
+
+
+def check_private_asset(dest: Path, checksum: Optional[str] = None) -> None:
+    """Check and validate assets without a URL (private assets that the user
+    has to provide themselves) and give feedback about the checksum.
+
+    dest (Path): Destination path of the asset.
+    checksum (Optional[str]): Optional checksum of the expected file.
+    """
+    if not Path(dest).exists():
+        err = f"No URL provided for asset.
You need to add this file yourself: {dest}" + msg.warn(err) + else: + if checksum and checksum == get_checksum(dest): + msg.good(f"Asset exists with matching checksum: {dest}") + else: + msg.fail(f"Asset available but with incorrect checksum: {dest}") + + +def fetch_asset( + project_path: Path, url: str, dest: Path, checksum: Optional[str] = None +) -> None: + """Fetch an asset from a given URL or path. If a checksum is provided and a + local file exists, it's only re-downloaded if the checksum doesn't match. + + project_path (Path): Path to project directory. + url (str): URL or path to asset. + checksum (Optional[str]): Optional expected checksum of local file. + RETURNS (Optional[Path]): The path to the fetched asset or None if fetching + the asset failed. + """ + # TODO: add support for caches + dest_path = (project_path / dest).resolve() + if dest_path.exists() and checksum: + # If there's already a file, check for checksum + if checksum == get_checksum(dest_path): + msg.good(f"Skipping download with matching checksum: {dest}") + return dest_path + with working_dir(project_path): + url = convert_asset_url(url) + try: + download_file(url, dest_path) + msg.good(f"Downloaded asset {dest}") + except requests.exceptions.RequestException as e: + if Path(url).exists() and Path(url).is_file(): + # If it's a local file, copy to destination + shutil.copy(url, str(dest_path)) + msg.good(f"Copied local asset {dest}") + else: + msg.fail(f"Download failed: {dest}", e) + return + if checksum and checksum != get_checksum(dest_path): + msg.fail(f"Checksum doesn't match value defined in {PROJECT_FILE}: {dest}") + + +def convert_asset_url(url: str) -> str: + """Check and convert the asset URL if needed. + + url (str): The asset URL. + RETURNS (str): The converted URL. 
+ """ + # If the asset URL is a regular GitHub URL it's likely a mistake + if re.match(r"(http(s?)):\/\/github.com", url): + converted = url.replace("github.com", "raw.githubusercontent.com") + converted = re.sub(r"/(tree|blob)/", "/", converted) + msg.warn( + "Downloading from a regular GitHub URL. This will only download " + "the source of the page, not the actual file. Converting the URL " + "to a raw URL.", + converted, + ) + return converted + return url + + +def download_file(url: str, dest: Path, chunk_size: int = 1024) -> None: + """Download a file using requests. + + url (str): The URL of the file. + dest (Path): The destination path. + chunk_size (int): The size of chunks to read/write. + """ + response = requests.get(url, stream=True) + response.raise_for_status() + total = int(response.headers.get("content-length", 0)) + progress_settings = { + "total": total, + "unit": "iB", + "unit_scale": True, + "unit_divisor": chunk_size, + "leave": False, + } + with dest.open("wb") as f, tqdm.tqdm(**progress_settings) as bar: + for data in response.iter_content(chunk_size=chunk_size): + size = f.write(data) + bar.update(size) diff --git a/spacy/cli/project/clone.py b/spacy/cli/project/clone.py new file mode 100644 index 000000000..ee1fd790c --- /dev/null +++ b/spacy/cli/project/clone.py @@ -0,0 +1,110 @@ +from pathlib import Path +from wasabi import msg +import subprocess +import shutil + +from ... import about +from ...util import ensure_path, run_command, make_tempdir +from .._app import project_cli, Arg, Opt, COMMAND + + +DIRS = [ + "assets", + "metas", + "configs", + "packages", + "metrics", + "scripts", + "notebooks", + "training", + "corpus", +] + + +@project_cli.command("clone") +def project_clone_cli( + # fmt: off + name: str = Arg(..., help="The name of the template to fetch"), + dest: Path = Arg(Path.cwd(), help="Where to download and work. 
Defaults to current working directory.", exists=False), + repo: str = Opt(about.__projects__, "--repo", "-r", help="The repository to look in."), + # fmt: on +): + """Clone a project template from a repository. Calls into "git" and will + only download the files from the given subdirectory. The GitHub repo + defaults to the official spaCy template repo, but can be customized + (including using a private repo). Setting the --git flag will also + initialize the project directory as a Git repo. If the project is intended + to be a Git repo, it should be initialized with Git first, before + initializing DVC (Data Version Control). This allows DVC to integrate with + Git. + """ + if dest == Path.cwd(): + dest = dest / name + project_clone(name, dest, repo=repo) + + +def project_clone(name: str, dest: Path, *, repo: str = about.__projects__) -> None: + """Clone a project template from a repository. + + name (str): Name of subdirectory to clone. + dest (Path): Destination path of cloned project. + repo (str): URL of Git repo containing project templates. + """ + dest = ensure_path(dest) + check_clone(name, dest, repo) + project_dir = dest.resolve() + # We're using Git and sparse checkout to only clone the files we need + with make_tempdir() as tmp_dir: + cmd = f"git clone {repo} {tmp_dir} --no-checkout --depth 1 --config core.sparseCheckout=true" + try: + run_command(cmd) + except subprocess.CalledProcessError: + err = f"Could not clone the repo '{repo}' into the temp dir '{tmp_dir}'." + msg.fail(err) + with (tmp_dir / ".git" / "info" / "sparse-checkout").open("w") as f: + f.write(name) + try: + run_command(["git", "-C", str(tmp_dir), "fetch"]) + run_command(["git", "-C", str(tmp_dir), "checkout"]) + except subprocess.CalledProcessError: + err = f"Could not clone '{name}' in the repo '{repo}'." 
+        msg.fail(err)
+    shutil.move(str(tmp_dir / Path(name).name), str(project_dir))
+    msg.good(f"Cloned project '{name}' from {repo} into {project_dir}")
+    for sub_dir in DIRS:
+        dir_path = project_dir / sub_dir
+        if not dir_path.exists():
+            dir_path.mkdir(parents=True)
+    msg.good(f"Your project is now ready!", dest)
+    print(f"To fetch the assets, run:\n{COMMAND} project assets {dest}")
+
+
+def check_clone(name: str, dest: Path, repo: str) -> None:
+    """Check and validate that the destination path can be used to clone. Will
+    check that Git is available and that the destination path is suitable.
+
+    name (str): Name of the directory to clone from the repo.
+    dest (Path): Local destination of cloned directory.
+    repo (str): URL of the repo to clone from.
+    """
+    try:
+        subprocess.run(["git", "--version"], stdout=subprocess.DEVNULL)
+    except Exception:
+        msg.fail(
+            f"Cloning spaCy project templates requires Git and the 'git' command. ",
+            f"To clone a project without Git, copy the files from the '{name}' "
+            f"directory in the {repo} to {dest} manually and then run:",
+            f"{COMMAND} project init {dest}",
+            exits=1,
+        )
+    if not dest:
+        msg.fail(f"Not a valid directory to clone project: {dest}", exits=1)
+    if dest.exists():
+        # Directory already exists (not allowed, clone needs to create it)
+        msg.fail(f"Can't clone project, directory already exists: {dest}", exits=1)
+    if not dest.parent.exists():
+        # We're not creating parents, parent dir should exist
+        msg.fail(
+            f"Can't clone project, parent directory doesn't exist: {dest.parent}",
+            exits=1,
+        )
diff --git a/spacy/cli/project/dvc.py b/spacy/cli/project/dvc.py
new file mode 100644
index 000000000..a98cb939a
--- /dev/null
+++ b/spacy/cli/project/dvc.py
@@ -0,0 +1,206 @@
+"""This module contains helpers and subcommands for integrating spaCy projects
+with Data Version Control (DVC).
https://dvc.org"""
+from typing import Dict, Any, List, Optional
+import subprocess
+from pathlib import Path
+from wasabi import msg
+
+from .util import PROJECT_FILE, load_project_config
+from .._app import project_cli, Arg, Opt, NAME, COMMAND
+from ...util import get_hash, working_dir, split_command, join_command, run_command
+
+
+DVC_CONFIG = "dvc.yaml"
+DVC_DIR = ".dvc"
+UPDATE_COMMAND = "dvc"
+DVC_CONFIG_COMMENT = f"""# This file is auto-generated by spaCy based on your {PROJECT_FILE}. If you've
+# edited your {PROJECT_FILE}, you can regenerate this file by running:
+# {COMMAND} project {UPDATE_COMMAND}"""
+
+
+@project_cli.command(UPDATE_COMMAND)
+def project_update_dvc_cli(
+    # fmt: off
+    project_dir: Path = Arg(Path.cwd(), help="Location of project directory. Defaults to current working directory.", exists=True, file_okay=False),
+    workflow: Optional[str] = Arg(None, help=f"Name of workflow defined in {PROJECT_FILE}. Defaults to first workflow if not set."),
+    verbose: bool = Opt(False, "--verbose", "-V", help="Print more info"),
+    force: bool = Opt(False, "--force", "-F", help="Force update DVC config"),
+    # fmt: on
+):
+    """Auto-generate Data Version Control (DVC) config. A DVC
+    project can only define one pipeline, so you need to specify one workflow
+    defined in the project.yml. If no workflow is specified, the first defined
+    workflow is used. The DVC config will only be updated if the checksum changed.
+    """
+    project_update_dvc(project_dir, workflow, verbose=verbose, force=force)
+
+
+def project_update_dvc(
+    project_dir: Path,
+    workflow: Optional[str] = None,
+    *,
+    verbose: bool = False,
+    force: bool = False,
+) -> None:
+    """Update the auto-generated Data Version Control (DVC) config file. A DVC
+    project can only define one pipeline, so you need to specify one workflow
+    defined in the project.yml. Will only update the file if the checksum changed.
+
+    project_dir (Path): The project directory.
+ workflow (Optional[str]): Optional name of workflow defined in project.yml. + If not set, the first workflow will be used. + verbose (bool): Print more info. + force (bool): Force update DVC config. + """ + config = load_project_config(project_dir) + updated = update_dvc_config( + project_dir, config, workflow, verbose=verbose, force=force + ) + help_msg = "To execute the workflow with DVC, run: dvc repro" + if updated: + msg.good(f"Updated DVC config from {PROJECT_FILE}", help_msg) + else: + msg.info(f"No changes found in {PROJECT_FILE}, no update needed", help_msg) + + +def update_dvc_config( + path: Path, + config: Dict[str, Any], + workflow: Optional[str] = None, + verbose: bool = False, + silent: bool = False, + force: bool = False, +) -> bool: + """Re-run the DVC commands in dry mode and update dvc.yaml file in the + project directory. The file is auto-generated based on the config. The + first line of the auto-generated file specifies the hash of the config + dict, so if any of the config values change, the DVC config is regenerated. + + path (Path): The path to the project directory. + config (Dict[str, Any]): The loaded project.yml. + verbose (bool): Whether to print additional info (via DVC). + silent (bool): Don't output anything (via DVC). + force (bool): Force update, even if hashes match. + RETURNS (bool): Whether the DVC config file was updated. 
+ """ + ensure_dvc(path) + workflows = config.get("workflows", {}) + workflow_names = list(workflows.keys()) + check_workflows(workflow_names, workflow) + if not workflow: + workflow = workflow_names[0] + config_hash = get_hash(config) + path = path.resolve() + dvc_config_path = path / DVC_CONFIG + if dvc_config_path.exists(): + # Check if the file was generated using the current config, if not, redo + with dvc_config_path.open("r", encoding="utf8") as f: + ref_hash = f.readline().strip().replace("# ", "") + if ref_hash == config_hash and not force: + return False # Nothing has changed in project.yml, don't need to update + dvc_config_path.unlink() + variables = config.get("variables", {}) + dvc_commands = [] + config_commands = {cmd["name"]: cmd for cmd in config.get("commands", [])} + for name in workflows[workflow]: + command = config_commands[name] + deps = command.get("deps", []) + outputs = command.get("outputs", []) + outputs_no_cache = command.get("outputs_no_cache", []) + if not deps and not outputs and not outputs_no_cache: + continue + # Default to the working dir as the project path since dvc.yaml is auto-generated + # and we don't want arbitrary paths in there + project_cmd = ["python", "-m", NAME, "project", "run", name] + deps_cmd = [c for cl in [["-d", p] for p in deps] for c in cl] + outputs_cmd = [c for cl in [["-o", p] for p in outputs] for c in cl] + outputs_nc_cmd = [c for cl in [["-O", p] for p in outputs_no_cache] for c in cl] + dvc_cmd = ["run", "-n", name, "-w", str(path), "--no-exec"] + full_cmd = [*dvc_cmd, *deps_cmd, *outputs_cmd, *outputs_nc_cmd, *project_cmd] + dvc_commands.append(join_command(full_cmd)) + with working_dir(path): + dvc_flags = {"--verbose": verbose, "--quiet": silent} + run_dvc_commands(dvc_commands, variables, flags=dvc_flags) + with dvc_config_path.open("r+", encoding="utf8") as f: + content = f.read() + f.seek(0, 0) + f.write(f"# {config_hash}\n{DVC_CONFIG_COMMENT}\n{content}") + return True + + +def 
run_dvc_commands( + commands: List[str] = tuple(), + variables: Dict[str, str] = {}, + flags: Dict[str, bool] = {}, +) -> None: + """Run a sequence of DVC commands in a subprocess, in order. + + commands (List[str]): The string commands without the leading "dvc". + variables (Dict[str, str]): Dictionary of variable names, mapped to their + values. Will be used to substitute format string variables in the + commands. + flags (Dict[str, bool]): Conditional flags to be added to command. Makes it + easier to pass flags like --quiet that depend on a variable or + command-line setting while avoiding lots of nested conditionals. + """ + for command in commands: + # Substitute variables, e.g. "./{NAME}.json" + command = command.format(**variables) + command = split_command(command) + dvc_command = ["dvc", *command] + # Add the flags if they are set to True + for flag, is_active in flags.items(): + if is_active: + dvc_command.append(flag) + run_command(dvc_command) + + +def check_workflows(workflows: List[str], workflow: Optional[str] = None) -> None: + """Validate workflows provided in project.yml and check that a given + workflow can be used to generate a DVC config. + + workflows (List[str]): Names of the available workflows. + workflow (Optional[str]): The name of the workflow to convert. + """ + if not workflows: + msg.fail( + f"No workflows defined in {PROJECT_FILE}. To generate a DVC config, " + f"define at least one list of commands.", + exits=1, + ) + if workflow is not None and workflow not in workflows: + msg.fail( + f"Workflow '{workflow}' not defined in {PROJECT_FILE}. " + f"Available workflows: {', '.join(workflows)}", + exits=1, + ) + if not workflow: + msg.warn( + f"No workflow specified for DVC pipeline. Using the first workflow " + f"defined in {PROJECT_FILE}: '{workflows[0]}'" + ) + + +def ensure_dvc(project_dir: Path) -> None: + """Ensure that the "dvc" command is available and that the current project + directory is an initialized DVC project. 
+ """ + try: + subprocess.run(["dvc", "--version"], stdout=subprocess.DEVNULL) + except Exception: + msg.fail( + "To use spaCy projects with DVC (Data Version Control), DVC needs " + "to be installed and the 'dvc' command needs to be available", + "You can install the Python package from pip (pip install dvc) or " + "conda (conda install -c conda-forge dvc). For more details, see the " + "documentation: https://dvc.org/doc/install", + exits=1, + ) + if not (project_dir / ".dvc").exists(): + msg.fail( + "Project not initialized as a DVC project", + "To initialize a DVC project, you can run 'dvc init' in the project " + "directory. For more details, see the documentation: " + "https://dvc.org/doc/command-reference/init", + exits=1, + ) diff --git a/spacy/cli/project/run.py b/spacy/cli/project/run.py new file mode 100644 index 000000000..a4d7dd644 --- /dev/null +++ b/spacy/cli/project/run.py @@ -0,0 +1,250 @@ +from typing import Optional, List, Dict, Sequence, Any +from pathlib import Path +from wasabi import msg +import typer +import sys +import srsly + +from ...util import working_dir, run_command, split_command, is_cwd, get_checksum +from ...util import get_hash, join_command +from .._app import project_cli, Arg, Opt, COMMAND +from .util import PROJECT_FILE, PROJECT_LOCK, load_project_config + + +@project_cli.command( + "run", context_settings={"allow_extra_args": True, "ignore_unknown_options": True}, +) +def project_run_cli( + # fmt: off + ctx: typer.Context, + subcommand: str = Arg(None, help=f"Name of command defined in the {PROJECT_FILE}"), + project_dir: Path = Arg(Path.cwd(), help="Location of project directory. 
Defaults to current working directory.", exists=True, file_okay=False), + force: bool = Opt(False, "--force", "-F", help="Force re-running steps, even if nothing changed"), + dry: bool = Opt(False, "--dry", "-D", help="Perform a dry run and don't execute commands"), + show_help: bool = Opt(False, "--help", help="Show help message and available subcommands") + # fmt: on +): + """Run a named script or workflow defined in the project.yml. If a workflow + name is specified, all commands in the workflow are run, in order. If + commands define inputs and/or outputs, they will only be re-run if state + has changed. + """ + if show_help or not subcommand: + print_run_help(project_dir, subcommand) + else: + project_run(project_dir, subcommand, *ctx.args, force=force, dry=dry) + + +def project_run( + project_dir: Path, subcommand: str, *, force: bool = False, dry: bool = False +) -> None: + """Run a named script defined in the project.yml. If the script is part + of the default pipeline (defined in the "run" section), DVC is used to + execute the command, so it can determine whether to rerun it. It then + calls into "exec" to execute it. + + project_dir (Path): Path to project directory. + subcommand (str): Name of command to run. + force (bool): Force re-running, even if nothing changed. + dry (bool): Perform a dry run and don't execute commands. 
+ """ + config = load_project_config(project_dir) + variables = config.get("variables", {}) + commands = {cmd["name"]: cmd for cmd in config.get("commands", [])} + workflows = config.get("workflows", {}) + validate_subcommand(commands.keys(), workflows.keys(), subcommand) + if subcommand in workflows: + msg.info(f"Running workflow '{subcommand}'") + for cmd in workflows[subcommand]: + project_run(project_dir, cmd, force=force, dry=dry) + else: + cmd = commands[subcommand] + variables = config.get("variables", {}) + for dep in cmd.get("deps", []): + dep = dep.format(**variables) + if not (project_dir / dep).exists(): + err = f"Missing dependency specified by command '{subcommand}': {dep}" + err_kwargs = {"exits": 1} if not dry else {} + msg.fail(err, **err_kwargs) + with working_dir(project_dir) as current_dir: + rerun = check_rerun(current_dir, cmd, variables) + if not rerun and not force: + msg.info(f"Skipping '{cmd['name']}': nothing changed") + else: + msg.divider(subcommand) + run_commands(cmd["script"], variables, dry=dry) + update_lockfile(current_dir, cmd, variables) + + +def print_run_help(project_dir: Path, subcommand: Optional[str] = None) -> None: + """Simulate a CLI help prompt using the info available in the project.yml. + + project_dir (Path): The project directory. + subcommand (Optional[str]): The subcommand or None. If a subcommand is + provided, the subcommand help is shown. Otherwise, the top-level help + and a list of available commands is printed. 
+    """
+    config = load_project_config(project_dir)
+    config_commands = config.get("commands", [])
+    commands = {cmd["name"]: cmd for cmd in config_commands}
+    project_loc = "" if is_cwd(project_dir) else project_dir
+    if subcommand:
+        validate_subcommand(commands.keys(), subcommand)
+        print(f"Usage: {COMMAND} project run {subcommand} {project_loc}")
+        help_text = commands[subcommand].get("help")
+        if help_text:
+            msg.text(f"\n{help_text}\n")
+    else:
+        print(f"\nAvailable commands in {PROJECT_FILE}")
+        print(f"Usage: {COMMAND} project run [COMMAND] {project_loc}")
+        msg.table([(cmd["name"], cmd.get("help", "")) for cmd in config_commands])
+        msg.text(f"Run all commands defined in the 'run' block of the {PROJECT_FILE}:")
+        print(f"{COMMAND} project run {project_loc}")
+
+
+def run_commands(
+    commands: List[str] = tuple(),
+    variables: Dict[str, Any] = {},
+    silent: bool = False,
+    dry: bool = False,
+) -> None:
+    """Run a sequence of commands in a subprocess, in order.
+
+    commands (List[str]): The string commands.
+    variables (Dict[str, Any]): Dictionary of variable names, mapped to their
+        values. Will be used to substitute format string variables in the
+        commands.
+    silent (bool): Don't print the commands.
+    dry (bool): Perform a dry run and don't execute anything.
+    """
+    for command in commands:
+        # Substitute variables, e.g. "./{NAME}.json"
+        command = command.format(**variables)
+        command = split_command(command)
+        # Not sure if this is needed or a good idea. Motivation: users may often
+        # use commands in their config that reference "python" and we want to
+        # make sure that it's always executing the same Python that spaCy is
+        # executed with and the pip in the same env, not some other Python/pip.
+        # Also ensures cross-compatibility if user 1 writes "python3" (because
+        # that's how it's set up on their system), and user 2 without the
+        # shortcut tries to re-run the command.
+ if len(command) and command[0] in ("python", "python3"): + command[0] = sys.executable + elif len(command) and command[0] in ("pip", "pip3"): + command = [sys.executable, "-m", "pip", *command[1:]] + if not silent: + print(f"Running command: {join_command(command)}") + if not dry: + run_command(command) + + +def validate_subcommand( + commands: Sequence[str], workflows: Sequence[str], subcommand: str +) -> None: + """Check that a subcommand is valid and defined. Raises an error otherwise. + + commands (Sequence[str]): The available commands. + subcommand (str): The subcommand. + """ + if not commands and not workflows: + msg.fail(f"No commands or workflows defined in {PROJECT_FILE}", exits=1) + if subcommand not in commands and subcommand not in workflows: + help_msg = [] + if commands: + help_msg.append(f"Available commands: {', '.join(commands)}") + if workflows: + help_msg.append(f"Available workflows: {', '.join(workflows)}") + msg.fail( + f"Can't find command or workflow '{subcommand}' in {PROJECT_FILE}", + ". ".join(help_msg), + exits=1, + ) + + +def check_rerun( + project_dir: Path, command: Dict[str, Any], variables: Dict[str, Any] +) -> bool: + """Check if a command should be rerun because its settings or inputs/outputs + changed. + + project_dir (Path): The current project directory. + command (Dict[str, Any]): The command, as defined in the project.yml. + variables (Dict[str, Any]): The variables defined in the project.yml. + RETURNS (bool): Whether to re-run the command. 
+ """ + lock_path = project_dir / PROJECT_LOCK + if not lock_path.exists(): # We don't have a lockfile, run command + return True + data = srsly.read_yaml(lock_path) + if command["name"] not in data: # We don't have info about this command + return True + entry = data[command["name"]] + # If the entry in the lockfile matches the lockfile entry that would be + # generated from the current command, we don't rerun because it means that + # all inputs/outputs, hashes and scripts are the same and nothing changed + return get_hash(get_lock_entry(project_dir, command, variables)) != get_hash(entry) + + +def update_lockfile( + project_dir: Path, command: Dict[str, Any], variables: Dict[str, Any] +) -> None: + """Update the lockfile after running a command. Will create a lockfile if + it doesn't yet exist and will add an entry for the current command, its + script and dependencies/outputs. + + project_dir (Path): The current project directory. + command (Dict[str, Any]): The command, as defined in the project.yml. + variables (Dict[str, Any]): The variables defined in the project.yml. + """ + lock_path = project_dir / PROJECT_LOCK + if not lock_path.exists(): + srsly.write_yaml(lock_path, {}) + data = {} + else: + data = srsly.read_yaml(lock_path) + data[command["name"]] = get_lock_entry(project_dir, command, variables) + srsly.write_yaml(lock_path, data) + + +def get_lock_entry( + project_dir: Path, command: Dict[str, Any], variables: Dict[str, Any] +) -> Dict[str, Any]: + """Get a lockfile entry for a given command. An entry includes the command, + the script (command steps) and a list of dependencies and outputs with + their paths and file hashes, if available. The format is based on the + dvc.lock files, to keep things consistent. + + project_dir (Path): The current project directory. + command (Dict[str, Any]): The command, as defined in the project.yml. + variables (Dict[str, Any]): The variables defined in the project.yml. 
+ RETURNS (Dict[str, Any]): The lockfile entry. + """ + deps = get_fileinfo(project_dir, command.get("deps", []), variables) + outs = get_fileinfo(project_dir, command.get("outputs", []), variables) + outs_nc = get_fileinfo(project_dir, command.get("outputs_no_cache", []), variables) + return { + "cmd": f"{COMMAND} run {command['name']}", + "script": command["script"], + "deps": deps, + "outs": [*outs, *outs_nc], + } + + +def get_fileinfo( + project_dir: Path, paths: List[str], variables: Dict[str, Any] +) -> List[Dict[str, str]]: + """Generate the file information for a list of paths (dependencies, outputs). + Includes the file path and the file's checksum. + + project_dir (Path): The current project directory. + paths (List[str]): The file paths. + variables (Dict[str, Any]): The variables defined in the project.yml. + RETURNS (List[Dict[str, str]]): The lockfile entry for a file. + """ + data = [] + for path in paths: + path = path.format(**variables) + file_path = project_dir / path + md5 = get_checksum(file_path) if file_path.exists() else None + data.append({"path": path, "md5": md5}) + return data diff --git a/spacy/cli/project/util.py b/spacy/cli/project/util.py new file mode 100644 index 000000000..5f2dc59ee --- /dev/null +++ b/spacy/cli/project/util.py @@ -0,0 +1,57 @@ +from typing import Dict, Any +from pathlib import Path +from wasabi import msg +import srsly + +from ...schemas import ProjectConfigSchema, validate + + +PROJECT_FILE = "project.yml" +PROJECT_LOCK = "project.lock" + + +def load_project_config(path: Path) -> Dict[str, Any]: + """Load the project.yml file from a directory and validate it. + + path (Path): The path to the project directory. + RETURNS (Dict[str, Any]): The loaded project.yml. + """ + config_path = path / PROJECT_FILE + if not config_path.exists(): + msg.fail(f"Can't find {PROJECT_FILE}", config_path, exits=1) + invalid_err = f"Invalid {PROJECT_FILE}. Double-check that the YAML is correct." 
+ try: + config = srsly.read_yaml(config_path) + except ValueError as e: + msg.fail(invalid_err, e, exits=1) + errors = validate(ProjectConfigSchema, config) + if errors: + msg.fail(invalid_err, "\n".join(errors), exits=1) + validate_project_commands(config) + return config + + +def validate_project_commands(config: Dict[str, Any]) -> None: + """Check that project commands and workflows are valid, don't contain + duplicates, don't clash and only refer to commands that exist. + + config (Dict[str, Any]): The loaded config. + """ + command_names = [cmd["name"] for cmd in config.get("commands", [])] + workflows = config.get("workflows", {}) + duplicates = set([cmd for cmd in command_names if command_names.count(cmd) > 1]) + if duplicates: + err = f"Duplicate commands defined in {PROJECT_FILE}: {', '.join(duplicates)}" + msg.fail(err, exits=1) + for workflow_name, workflow_steps in workflows.items(): + if workflow_name in command_names: + err = f"Can't use workflow name '{workflow_name}': name already exists as a command" + msg.fail(err, exits=1) + for step in workflow_steps: + if step not in command_names: + msg.fail( + f"Unknown command specified in workflow '{workflow_name}': {step}", + f"Workflows can only refer to commands defined in the 'commands' " + f"section of the {PROJECT_FILE}.", + exits=1, + ) diff --git a/spacy/schemas.py b/spacy/schemas.py index ca17fe50b..b7307b5b2 100644 --- a/spacy/schemas.py +++ b/spacy/schemas.py @@ -246,7 +246,7 @@ class ProjectConfigSchema(BaseModel): # fmt: off variables: Dict[StrictStr, Union[str, int, float, bool]] = Field({}, title="Optional variables to substitute in commands") assets: List[ProjectConfigAsset] = Field([], title="Data assets") - run: List[StrictStr] = Field([], title="Names of project commands to execute, in order") + workflows: Dict[StrictStr, List[StrictStr]] = Field({}, title="Named workflows, mapped to list of project commands to run in order") commands: List[ProjectConfigCommand] = Field([], 
title="Project command shortucts") # fmt: on diff --git a/spacy/tests/test_projects.py b/spacy/tests/test_projects.py new file mode 100644 index 000000000..c3477f463 --- /dev/null +++ b/spacy/tests/test_projects.py @@ -0,0 +1,31 @@ +import pytest +from spacy.cli.project.util import validate_project_commands +from spacy.schemas import ProjectConfigSchema, validate + + +@pytest.mark.parametrize( + "config", + [ + {"commands": [{"name": "a"}, {"name": "a"}]}, + {"commands": [{"name": "a"}], "workflows": {"a": []}}, + {"commands": [{"name": "a"}], "workflows": {"b": ["c"]}}, + ], +) +def test_project_config_validation1(config): + with pytest.raises(SystemExit): + validate_project_commands(config) + + +@pytest.mark.parametrize( + "config,n_errors", + [ + ({"commands": {"a": []}}, 1), + ({"commands": [{"help": "..."}]}, 1), + ({"commands": [{"name": "a", "extra": "b"}]}, 1), + ({"commands": [{"extra": "b"}]}, 2), + ({"commands": [{"name": "a", "deps": [123]}]}, 1), + ], +) +def test_project_config_validation2(config, n_errors): + errors = validate(ProjectConfigSchema, config) + assert len(errors) == n_errors diff --git a/spacy/util.py b/spacy/util.py index 4a17b7f24..66b88d2d8 100644 --- a/spacy/util.py +++ b/spacy/util.py @@ -449,6 +449,16 @@ def split_command(command: str) -> List[str]: return shlex.split(command, posix=not is_windows) +def join_command(command: List[str]) -> str: + """Join a command using shlex. shlex.join is only available for Python 3.8+, + so we're using a workaround here. + + command (List[str]): The command to join. + RETURNS (str): The joined command + """ + return " ".join(shlex.quote(cmd) for cmd in command) + + def run_command(command: Union[str, List[str]]) -> None: """Run a command on the command line as a subprocess. If the subprocess returns a non-zero exit code, a system exit is performed. 
@@ -520,6 +530,15 @@ def get_checksum(path: Union[Path, str]) -> str: return hashlib.md5(Path(path).read_bytes()).hexdigest() +def is_cwd(path: Union[Path, str]) -> bool: + """Check whether a path is the current working directory. + + path (Union[Path, str]): The directory path. + RETURNS (bool): Whether the path is the current working directory. + """ + return str(Path(path).resolve()).lower() == str(Path.cwd().resolve()).lower() + + def is_in_jupyter(): """Check if user is running spaCy from a Jupyter notebook by detecting the IPython kernel. Mainly used for the displaCy visualizer.