Merge branch 'develop' of https://github.com/explosion/spaCy into develop

Matthw Honnibal 2020-07-07 17:21:10 +02:00
commit 8177f25b6c
53 changed files with 1240 additions and 1438 deletions

View File

@@ -23,6 +23,7 @@ def test_issue2070():
assert len(doc) == 11
@pytest.mark.filterwarnings("ignore::UserWarning")
def test_issue2179():
"""Test that spurious 'extra_labels' aren't created when initializing NER."""
nlp = Italian()
@@ -134,6 +135,7 @@ def test_issue2464(en_vocab):
assert len(matches) == 3
@pytest.mark.filterwarnings("ignore::UserWarning")
def test_issue2482():
"""Test we can serialize and deserialize a blank NER or parser model."""
nlp = Italian()

View File

@@ -138,13 +138,16 @@ def test_issue2782(text, lang_cls):
assert doc[0].like_num
@pytest.mark.filterwarnings("ignore::UserWarning")
def test_issue2800():
"""Test issue that arises when too many labels are added to NER model.
Used to cause segfault.
"""
nlp = English()
train_data = []
train_data.extend([Example.from_dict(nlp.make_doc("One sentence"), {"entities": []})])
train_data.extend(
[Example.from_dict(nlp.make_doc("One sentence"), {"entities": []})]
)
entity_types = [str(i) for i in range(1000)]
ner = nlp.create_pipe("ner")
nlp.add_pipe(ner)

View File

@@ -88,6 +88,7 @@ def test_issue3199():
assert list(doc[0:3].noun_chunks) == []
@pytest.mark.filterwarnings("ignore::UserWarning")
def test_issue3209():
"""Test issue that occurred in spaCy nightly where NER labels were being
mapped to classes incorrectly after loading the model, when the labels

View File

@@ -0,0 +1,472 @@
import pytest
from spacy.language import Language
from spacy.vocab import Vocab
from spacy.pipeline import EntityRuler, DependencyParser
from spacy.pipeline.defaults import default_parser
from spacy import displacy, load
from spacy.displacy import parse_deps
from spacy.tokens import Doc, Token
from spacy.matcher import Matcher, PhraseMatcher
from spacy.errors import MatchPatternError
from spacy.util import minibatch
from spacy.gold import Example
from spacy.lang.hi import Hindi
from spacy.lang.es import Spanish
from spacy.lang.en import English
from spacy.attrs import IS_ALPHA
from thinc.api import compounding
import spacy
import srsly
import numpy
from ..util import make_tempdir, get_doc
@pytest.mark.parametrize("word", ["don't", "dont", "I'd", "Id"])
def test_issue3521(en_tokenizer, word):
tok = en_tokenizer(word)[1]
# 'not' and 'would' should be stopwords, also in their abbreviated forms
assert tok.is_stop
def test_issue_3526_1(en_vocab):
patterns = [
{"label": "HELLO", "pattern": "hello world"},
{"label": "BYE", "pattern": [{"LOWER": "bye"}, {"LOWER": "bye"}]},
{"label": "HELLO", "pattern": [{"ORTH": "HELLO"}]},
{"label": "COMPLEX", "pattern": [{"ORTH": "foo", "OP": "*"}]},
{"label": "TECH_ORG", "pattern": "Apple", "id": "a1"},
]
nlp = Language(vocab=en_vocab)
ruler = EntityRuler(nlp, patterns=patterns, overwrite_ents=True)
ruler_bytes = ruler.to_bytes()
assert len(ruler) == len(patterns)
assert len(ruler.labels) == 4
assert ruler.overwrite
new_ruler = EntityRuler(nlp)
new_ruler = new_ruler.from_bytes(ruler_bytes)
assert len(new_ruler) == len(ruler)
assert len(new_ruler.labels) == 4
assert new_ruler.overwrite == ruler.overwrite
assert new_ruler.ent_id_sep == ruler.ent_id_sep
def test_issue_3526_2(en_vocab):
patterns = [
{"label": "HELLO", "pattern": "hello world"},
{"label": "BYE", "pattern": [{"LOWER": "bye"}, {"LOWER": "bye"}]},
{"label": "HELLO", "pattern": [{"ORTH": "HELLO"}]},
{"label": "COMPLEX", "pattern": [{"ORTH": "foo", "OP": "*"}]},
{"label": "TECH_ORG", "pattern": "Apple", "id": "a1"},
]
nlp = Language(vocab=en_vocab)
ruler = EntityRuler(nlp, patterns=patterns, overwrite_ents=True)
bytes_old_style = srsly.msgpack_dumps(ruler.patterns)
new_ruler = EntityRuler(nlp)
new_ruler = new_ruler.from_bytes(bytes_old_style)
assert len(new_ruler) == len(ruler)
for pattern in ruler.patterns:
assert pattern in new_ruler.patterns
assert new_ruler.overwrite is not ruler.overwrite
def test_issue_3526_3(en_vocab):
patterns = [
{"label": "HELLO", "pattern": "hello world"},
{"label": "BYE", "pattern": [{"LOWER": "bye"}, {"LOWER": "bye"}]},
{"label": "HELLO", "pattern": [{"ORTH": "HELLO"}]},
{"label": "COMPLEX", "pattern": [{"ORTH": "foo", "OP": "*"}]},
{"label": "TECH_ORG", "pattern": "Apple", "id": "a1"},
]
nlp = Language(vocab=en_vocab)
ruler = EntityRuler(nlp, patterns=patterns, overwrite_ents=True)
with make_tempdir() as tmpdir:
out_file = tmpdir / "entity_ruler"
srsly.write_jsonl(out_file.with_suffix(".jsonl"), ruler.patterns)
new_ruler = EntityRuler(nlp).from_disk(out_file)
for pattern in ruler.patterns:
assert pattern in new_ruler.patterns
assert len(new_ruler) == len(ruler)
assert new_ruler.overwrite is not ruler.overwrite
@pytest.mark.filterwarnings("ignore::UserWarning")
def test_issue_3526_4(en_vocab):
nlp = Language(vocab=en_vocab)
ruler = EntityRuler(nlp, overwrite_ents=True)
ruler.add_patterns([{"label": "ORG", "pattern": "Apple"}])
nlp.add_pipe(ruler)
with make_tempdir() as tmpdir:
nlp.to_disk(tmpdir)
ruler = nlp.get_pipe("entity_ruler")
assert ruler.patterns == [{"label": "ORG", "pattern": "Apple"}]
assert ruler.overwrite is True
nlp2 = load(tmpdir)
new_ruler = nlp2.get_pipe("entity_ruler")
assert new_ruler.patterns == [{"label": "ORG", "pattern": "Apple"}]
assert new_ruler.overwrite is True
def test_issue3531():
"""Test that displaCy renderer doesn't require "settings" key."""
example_dep = {
"words": [
{"text": "But", "tag": "CCONJ"},
{"text": "Google", "tag": "PROPN"},
{"text": "is", "tag": "VERB"},
{"text": "starting", "tag": "VERB"},
{"text": "from", "tag": "ADP"},
{"text": "behind.", "tag": "ADV"},
],
"arcs": [
{"start": 0, "end": 3, "label": "cc", "dir": "left"},
{"start": 1, "end": 3, "label": "nsubj", "dir": "left"},
{"start": 2, "end": 3, "label": "aux", "dir": "left"},
{"start": 3, "end": 4, "label": "prep", "dir": "right"},
{"start": 4, "end": 5, "label": "pcomp", "dir": "right"},
],
}
example_ent = {
"text": "But Google is starting from behind.",
"ents": [{"start": 4, "end": 10, "label": "ORG"}],
}
dep_html = displacy.render(example_dep, style="dep", manual=True)
assert dep_html
ent_html = displacy.render(example_ent, style="ent", manual=True)
assert ent_html
def test_issue3540(en_vocab):
words = ["I", "live", "in", "NewYork", "right", "now"]
tensor = numpy.asarray(
[[1.0, 1.1], [2.0, 2.1], [3.0, 3.1], [4.0, 4.1], [5.0, 5.1], [6.0, 6.1]],
dtype="f",
)
doc = Doc(en_vocab, words=words)
doc.tensor = tensor
gold_text = ["I", "live", "in", "NewYork", "right", "now"]
assert [token.text for token in doc] == gold_text
gold_lemma = ["I", "live", "in", "NewYork", "right", "now"]
assert [token.lemma_ for token in doc] == gold_lemma
vectors_1 = [token.vector for token in doc]
assert len(vectors_1) == len(doc)
with doc.retokenize() as retokenizer:
heads = [(doc[3], 1), doc[2]]
attrs = {"POS": ["PROPN", "PROPN"], "DEP": ["pobj", "compound"]}
retokenizer.split(doc[3], ["New", "York"], heads=heads, attrs=attrs)
gold_text = ["I", "live", "in", "New", "York", "right", "now"]
assert [token.text for token in doc] == gold_text
gold_lemma = ["I", "live", "in", "New", "York", "right", "now"]
assert [token.lemma_ for token in doc] == gold_lemma
vectors_2 = [token.vector for token in doc]
assert len(vectors_2) == len(doc)
assert vectors_1[0].tolist() == vectors_2[0].tolist()
assert vectors_1[1].tolist() == vectors_2[1].tolist()
assert vectors_1[2].tolist() == vectors_2[2].tolist()
assert vectors_1[4].tolist() == vectors_2[5].tolist()
assert vectors_1[5].tolist() == vectors_2[6].tolist()
def test_issue3549(en_vocab):
"""Test that match pattern validation doesn't raise on empty errors."""
matcher = Matcher(en_vocab, validate=True)
pattern = [{"LOWER": "hello"}, {"LOWER": "world"}]
matcher.add("GOOD", [pattern])
with pytest.raises(MatchPatternError):
matcher.add("BAD", [[{"X": "Y"}]])
@pytest.mark.xfail
def test_issue3555(en_vocab):
"""Test that custom extensions with default None don't break matcher."""
Token.set_extension("issue3555", default=None)
matcher = Matcher(en_vocab)
pattern = [{"LEMMA": "have"}, {"_": {"issue3555": True}}]
matcher.add("TEST", [pattern])
doc = Doc(en_vocab, words=["have", "apple"])
matcher(doc)
def test_issue3611():
""" Test whether adding n-grams in the textcat works even when n > token length of some docs """
unique_classes = ["offensive", "inoffensive"]
x_train = [
"This is an offensive text",
"This is the second offensive text",
"inoff",
]
y_train = ["offensive", "offensive", "inoffensive"]
nlp = spacy.blank("en")
# preparing the data
train_data = []
for text, train_instance in zip(x_train, y_train):
cat_dict = {label: label == train_instance for label in unique_classes}
train_data.append(Example.from_dict(nlp.make_doc(text), {"cats": cat_dict}))
# add a text categorizer component
textcat = nlp.create_pipe(
"textcat",
config={"exclusive_classes": True, "architecture": "bow", "ngram_size": 2},
)
for label in unique_classes:
textcat.add_label(label)
nlp.add_pipe(textcat, last=True)
# training the network
with nlp.select_pipes(enable="textcat"):
optimizer = nlp.begin_training(X=x_train, Y=y_train)
for i in range(3):
losses = {}
batches = minibatch(train_data, size=compounding(4.0, 32.0, 1.001))
for batch in batches:
nlp.update(
examples=batch, sgd=optimizer, drop=0.1, losses=losses,
)
def test_issue3625():
"""Test that default punctuation rules applies to hindi unicode characters"""
nlp = Hindi()
doc = nlp("hi. how हुए. होटल, होटल")
expected = ["hi", ".", "how", "हुए", ".", "होटल", ",", "होटल"]
assert [token.text for token in doc] == expected
def test_issue3803():
"""Test that spanish num-like tokens have True for like_num attribute."""
nlp = Spanish()
text = "2 dos 1000 mil 12 doce"
doc = nlp(text)
assert [t.like_num for t in doc] == [True, True, True, True, True, True]
@pytest.mark.filterwarnings("ignore::UserWarning")
def test_issue3830_no_subtok():
"""Test that the parser doesn't have subtok label if not learn_tokens"""
config = {
"learn_tokens": False,
"min_action_freq": 30,
"beam_width": 1,
"beam_update_prob": 1.0,
}
parser = DependencyParser(Vocab(), default_parser(), **config)
parser.add_label("nsubj")
assert "subtok" not in parser.labels
parser.begin_training(lambda: [])
assert "subtok" not in parser.labels
@pytest.mark.filterwarnings("ignore::UserWarning")
def test_issue3830_with_subtok():
"""Test that the parser does have subtok label if learn_tokens=True."""
config = {
"learn_tokens": True,
"min_action_freq": 30,
"beam_width": 1,
"beam_update_prob": 1.0,
}
parser = DependencyParser(Vocab(), default_parser(), **config)
parser.add_label("nsubj")
assert "subtok" not in parser.labels
parser.begin_training(lambda: [])
assert "subtok" in parser.labels
def test_issue3839(en_vocab):
"""Test that match IDs returned by the matcher are correct, are in the string """
doc = Doc(en_vocab, words=["terrific", "group", "of", "people"])
matcher = Matcher(en_vocab)
match_id = "PATTERN"
pattern1 = [{"LOWER": "terrific"}, {"OP": "?"}, {"LOWER": "group"}]
pattern2 = [{"LOWER": "terrific"}, {"OP": "?"}, {"OP": "?"}, {"LOWER": "group"}]
matcher.add(match_id, [pattern1])
matches = matcher(doc)
assert matches[0][0] == en_vocab.strings[match_id]
matcher = Matcher(en_vocab)
matcher.add(match_id, [pattern2])
matches = matcher(doc)
assert matches[0][0] == en_vocab.strings[match_id]
@pytest.mark.parametrize(
"sentence",
[
"The story was to the effect that a young American student recently called on Professor Christlieb with a letter of introduction.",
"The next month Barry Siddall joined Stoke City on a free transfer, after Chris Pearce had established himself as the Vale's #1.",
"The next month Barry Siddall joined Stoke City on a free transfer, after Chris Pearce had established himself as the Vale's number one",
"Indeed, making the one who remains do all the work has installed him into a position of such insolent tyranny, it will take a month at least to reduce him to his proper proportions.",
"It was a missed assignment, but it shouldn't have resulted in a turnover ...",
],
)
def test_issue3869(sentence):
"""Test that the Doc's count_by function works consistently"""
nlp = English()
doc = nlp(sentence)
count = 0
for token in doc:
count += token.is_alpha
assert count == doc.count_by(IS_ALPHA).get(1, 0)
def test_issue3879(en_vocab):
doc = Doc(en_vocab, words=["This", "is", "a", "test", "."])
assert len(doc) == 5
pattern = [{"ORTH": "This", "OP": "?"}, {"OP": "?"}, {"ORTH": "test"}]
matcher = Matcher(en_vocab)
matcher.add("TEST", [pattern])
assert len(matcher(doc)) == 2 # fails because of a FP match 'is a test'
@pytest.mark.filterwarnings("ignore::UserWarning")
def test_issue3880():
"""Test that `nlp.pipe()` works when an empty string ends the batch.
Fixed in v7.0.5 of Thinc.
"""
texts = ["hello", "world", "", ""]
nlp = English()
nlp.add_pipe(nlp.create_pipe("parser"))
nlp.add_pipe(nlp.create_pipe("ner"))
nlp.add_pipe(nlp.create_pipe("tagger"))
nlp.get_pipe("parser").add_label("dep")
nlp.get_pipe("ner").add_label("PERSON")
nlp.get_pipe("tagger").add_label("NN")
nlp.begin_training()
for doc in nlp.pipe(texts):
pass
def test_issue3882(en_vocab):
"""Test that displaCy doesn't serialize the doc.user_data when making a
copy of the Doc.
"""
doc = Doc(en_vocab, words=["Hello", "world"])
doc.is_parsed = True
doc.user_data["test"] = set()
parse_deps(doc)
def test_issue3951(en_vocab):
"""Test that combinations of optional rules are matched correctly."""
matcher = Matcher(en_vocab)
pattern = [
{"LOWER": "hello"},
{"LOWER": "this", "OP": "?"},
{"OP": "?"},
{"LOWER": "world"},
]
matcher.add("TEST", [pattern])
doc = Doc(en_vocab, words=["Hello", "my", "new", "world"])
matches = matcher(doc)
assert len(matches) == 0
def test_issue3959():
""" Ensure that a modified pos attribute is serialized correctly."""
nlp = English()
doc = nlp(
"displaCy uses JavaScript, SVG and CSS to show you how computers understand language"
)
assert doc[0].pos_ == ""
doc[0].pos_ = "NOUN"
assert doc[0].pos_ == "NOUN"
# usually this is already True when starting from proper models instead of blank English
doc.is_tagged = True
with make_tempdir() as tmp_dir:
file_path = tmp_dir / "my_doc"
doc.to_disk(file_path)
doc2 = nlp("")
doc2.from_disk(file_path)
assert doc2[0].pos_ == "NOUN"
def test_issue3962(en_vocab):
""" Ensure that as_doc does not result in out-of-bound access of tokens.
This is achieved by setting the head to itself if it would lie out of the span otherwise."""
# fmt: off
words = ["He", "jests", "at", "scars", ",", "that", "never", "felt", "a", "wound", "."]
heads = [1, 6, -1, -1, 3, 2, 1, 0, 1, -2, -3]
deps = ["nsubj", "ccomp", "prep", "pobj", "punct", "nsubj", "neg", "ROOT", "det", "dobj", "punct"]
# fmt: on
doc = get_doc(en_vocab, words=words, heads=heads, deps=deps)
span2 = doc[1:5] # "jests at scars ,"
doc2 = span2.as_doc()
doc2_json = doc2.to_json()
assert doc2_json
# head set to itself, being the new artificial root
assert doc2[0].head.text == "jests"
assert doc2[0].dep_ == "dep"
assert doc2[1].head.text == "jests"
assert doc2[1].dep_ == "prep"
assert doc2[2].head.text == "at"
assert doc2[2].dep_ == "pobj"
assert doc2[3].head.text == "jests" # head set to the new artificial root
assert doc2[3].dep_ == "dep"
# We should still have 1 sentence
assert len(list(doc2.sents)) == 1
span3 = doc[6:9] # "never felt a"
doc3 = span3.as_doc()
doc3_json = doc3.to_json()
assert doc3_json
assert doc3[0].head.text == "felt"
assert doc3[0].dep_ == "neg"
assert doc3[1].head.text == "felt"
assert doc3[1].dep_ == "ROOT"
assert doc3[2].head.text == "felt" # head set to ancestor
assert doc3[2].dep_ == "dep"
# We should still have 1 sentence as "a" can be attached to "felt" instead of "wound"
assert len(list(doc3.sents)) == 1
def test_issue3962_long(en_vocab):
""" Ensure that as_doc does not result in out-of-bound access of tokens.
This is achieved by setting the head to itself if it would lie out of the span otherwise."""
# fmt: off
words = ["He", "jests", "at", "scars", ".", "They", "never", "felt", "a", "wound", "."]
heads = [1, 0, -1, -1, -3, 2, 1, 0, 1, -2, -3]
deps = ["nsubj", "ROOT", "prep", "pobj", "punct", "nsubj", "neg", "ROOT", "det", "dobj", "punct"]
# fmt: on
two_sent_doc = get_doc(en_vocab, words=words, heads=heads, deps=deps)
span2 = two_sent_doc[1:7] # "jests at scars. They never"
doc2 = span2.as_doc()
doc2_json = doc2.to_json()
assert doc2_json
# head set to itself, being the new artificial root (in sentence 1)
assert doc2[0].head.text == "jests"
assert doc2[0].dep_ == "ROOT"
assert doc2[1].head.text == "jests"
assert doc2[1].dep_ == "prep"
assert doc2[2].head.text == "at"
assert doc2[2].dep_ == "pobj"
assert doc2[3].head.text == "jests"
assert doc2[3].dep_ == "punct"
# head set to itself, being the new artificial root (in sentence 2)
assert doc2[4].head.text == "They"
assert doc2[4].dep_ == "dep"
# head set to the new artificial head (in sentence 2)
assert doc2[4].head.text == "They"
assert doc2[4].dep_ == "dep"
# We should still have 2 sentences
sents = list(doc2.sents)
assert len(sents) == 2
assert sents[0].text == "jests at scars ."
assert sents[1].text == "They never"
def test_issue3972(en_vocab):
"""Test that the PhraseMatcher returns duplicates for duplicate match IDs.
"""
matcher = PhraseMatcher(en_vocab)
matcher.add("A", [Doc(en_vocab, words=["New", "York"])])
matcher.add("B", [Doc(en_vocab, words=["New", "York"])])
doc = Doc(en_vocab, words=["I", "live", "in", "New", "York"])
matches = matcher(doc)
assert len(matches) == 2
# We should have a match for each of the two rules
found_ids = [en_vocab.strings[ent_id] for (ent_id, _, _) in matches]
assert "A" in found_ids
assert "B" in found_ids

View File

@@ -1,8 +0,0 @@
import pytest
@pytest.mark.parametrize("word", ["don't", "dont", "I'd", "Id"])
def test_issue3521(en_tokenizer, word):
tok = en_tokenizer(word)[1]
# 'not' and 'would' should be stopwords, also in their abbreviated forms
assert tok.is_stop

View File

@@ -1,85 +0,0 @@
import pytest
from spacy.tokens import Span
from spacy.language import Language
from spacy.pipeline import EntityRuler
from spacy import load
import srsly
from ..util import make_tempdir
@pytest.fixture
def patterns():
return [
{"label": "HELLO", "pattern": "hello world"},
{"label": "BYE", "pattern": [{"LOWER": "bye"}, {"LOWER": "bye"}]},
{"label": "HELLO", "pattern": [{"ORTH": "HELLO"}]},
{"label": "COMPLEX", "pattern": [{"ORTH": "foo", "OP": "*"}]},
{"label": "TECH_ORG", "pattern": "Apple", "id": "a1"},
]
@pytest.fixture
def add_ent():
def add_ent_component(doc):
doc.ents = [Span(doc, 0, 3, label=doc.vocab.strings["ORG"])]
return doc
return add_ent_component
def test_entity_ruler_existing_overwrite_serialize_bytes(patterns, en_vocab):
nlp = Language(vocab=en_vocab)
ruler = EntityRuler(nlp, patterns=patterns, overwrite_ents=True)
ruler_bytes = ruler.to_bytes()
assert len(ruler) == len(patterns)
assert len(ruler.labels) == 4
assert ruler.overwrite
new_ruler = EntityRuler(nlp)
new_ruler = new_ruler.from_bytes(ruler_bytes)
assert len(new_ruler) == len(ruler)
assert len(new_ruler.labels) == 4
assert new_ruler.overwrite == ruler.overwrite
assert new_ruler.ent_id_sep == ruler.ent_id_sep
def test_entity_ruler_existing_bytes_old_format_safe(patterns, en_vocab):
nlp = Language(vocab=en_vocab)
ruler = EntityRuler(nlp, patterns=patterns, overwrite_ents=True)
bytes_old_style = srsly.msgpack_dumps(ruler.patterns)
new_ruler = EntityRuler(nlp)
new_ruler = new_ruler.from_bytes(bytes_old_style)
assert len(new_ruler) == len(ruler)
for pattern in ruler.patterns:
assert pattern in new_ruler.patterns
assert new_ruler.overwrite is not ruler.overwrite
def test_entity_ruler_from_disk_old_format_safe(patterns, en_vocab):
nlp = Language(vocab=en_vocab)
ruler = EntityRuler(nlp, patterns=patterns, overwrite_ents=True)
with make_tempdir() as tmpdir:
out_file = tmpdir / "entity_ruler"
srsly.write_jsonl(out_file.with_suffix(".jsonl"), ruler.patterns)
new_ruler = EntityRuler(nlp).from_disk(out_file)
for pattern in ruler.patterns:
assert pattern in new_ruler.patterns
assert len(new_ruler) == len(ruler)
assert new_ruler.overwrite is not ruler.overwrite
def test_entity_ruler_in_pipeline_from_issue(patterns, en_vocab):
nlp = Language(vocab=en_vocab)
ruler = EntityRuler(nlp, overwrite_ents=True)
ruler.add_patterns([{"label": "ORG", "pattern": "Apple"}])
nlp.add_pipe(ruler)
with make_tempdir() as tmpdir:
nlp.to_disk(tmpdir)
ruler = nlp.get_pipe("entity_ruler")
assert ruler.patterns == [{"label": "ORG", "pattern": "Apple"}]
assert ruler.overwrite is True
nlp2 = load(tmpdir)
new_ruler = nlp2.get_pipe("entity_ruler")
assert new_ruler.patterns == [{"label": "ORG", "pattern": "Apple"}]
assert new_ruler.overwrite is True

View File

@@ -1,30 +0,0 @@
from spacy import displacy
def test_issue3531():
"""Test that displaCy renderer doesn't require "settings" key."""
example_dep = {
"words": [
{"text": "But", "tag": "CCONJ"},
{"text": "Google", "tag": "PROPN"},
{"text": "is", "tag": "VERB"},
{"text": "starting", "tag": "VERB"},
{"text": "from", "tag": "ADP"},
{"text": "behind.", "tag": "ADV"},
],
"arcs": [
{"start": 0, "end": 3, "label": "cc", "dir": "left"},
{"start": 1, "end": 3, "label": "nsubj", "dir": "left"},
{"start": 2, "end": 3, "label": "aux", "dir": "left"},
{"start": 3, "end": 4, "label": "prep", "dir": "right"},
{"start": 4, "end": 5, "label": "pcomp", "dir": "right"},
],
}
example_ent = {
"text": "But Google is starting from behind.",
"ents": [{"start": 4, "end": 10, "label": "ORG"}],
}
dep_html = displacy.render(example_dep, style="dep", manual=True)
assert dep_html
ent_html = displacy.render(example_ent, style="ent", manual=True)
assert ent_html

View File

@@ -1,44 +0,0 @@
from spacy.tokens import Doc
import numpy as np
def test_issue3540(en_vocab):
words = ["I", "live", "in", "NewYork", "right", "now"]
tensor = np.asarray(
[[1.0, 1.1], [2.0, 2.1], [3.0, 3.1], [4.0, 4.1], [5.0, 5.1], [6.0, 6.1]],
dtype="f",
)
doc = Doc(en_vocab, words=words)
doc.tensor = tensor
gold_text = ["I", "live", "in", "NewYork", "right", "now"]
assert [token.text for token in doc] == gold_text
gold_lemma = ["I", "live", "in", "NewYork", "right", "now"]
assert [token.lemma_ for token in doc] == gold_lemma
vectors_1 = [token.vector for token in doc]
assert len(vectors_1) == len(doc)
with doc.retokenize() as retokenizer:
heads = [(doc[3], 1), doc[2]]
attrs = {"POS": ["PROPN", "PROPN"], "DEP": ["pobj", "compound"]}
retokenizer.split(doc[3], ["New", "York"], heads=heads, attrs=attrs)
gold_text = ["I", "live", "in", "New", "York", "right", "now"]
assert [token.text for token in doc] == gold_text
gold_lemma = ["I", "live", "in", "New", "York", "right", "now"]
assert [token.lemma_ for token in doc] == gold_lemma
vectors_2 = [token.vector for token in doc]
assert len(vectors_2) == len(doc)
assert vectors_1[0].tolist() == vectors_2[0].tolist()
assert vectors_1[1].tolist() == vectors_2[1].tolist()
assert vectors_1[2].tolist() == vectors_2[2].tolist()
assert vectors_1[4].tolist() == vectors_2[5].tolist()
assert vectors_1[5].tolist() == vectors_2[6].tolist()

View File

@@ -1,12 +0,0 @@
import pytest
from spacy.matcher import Matcher
from spacy.errors import MatchPatternError
def test_issue3549(en_vocab):
"""Test that match pattern validation doesn't raise on empty errors."""
matcher = Matcher(en_vocab, validate=True)
pattern = [{"LOWER": "hello"}, {"LOWER": "world"}]
matcher.add("GOOD", [pattern])
with pytest.raises(MatchPatternError):
matcher.add("BAD", [[{"X": "Y"}]])

View File

@@ -1,14 +0,0 @@
import pytest
from spacy.tokens import Doc, Token
from spacy.matcher import Matcher
@pytest.mark.xfail
def test_issue3555(en_vocab):
"""Test that custom extensions with default None don't break matcher."""
Token.set_extension("issue3555", default=None)
matcher = Matcher(en_vocab)
pattern = [{"LEMMA": "have"}, {"_": {"issue3555": True}}]
matcher.add("TEST", [pattern])
doc = Doc(en_vocab, words=["have", "apple"])
matcher(doc)

View File

@@ -1,45 +0,0 @@
import spacy
from spacy.util import minibatch
from thinc.api import compounding
from spacy.gold import Example
def test_issue3611():
""" Test whether adding n-grams in the textcat works even when n > token length of some docs """
unique_classes = ["offensive", "inoffensive"]
x_train = [
"This is an offensive text",
"This is the second offensive text",
"inoff",
]
y_train = ["offensive", "offensive", "inoffensive"]
nlp = spacy.blank("en")
# preparing the data
train_data = []
for text, train_instance in zip(x_train, y_train):
cat_dict = {label: label == train_instance for label in unique_classes}
train_data.append(Example.from_dict(nlp.make_doc(text), {"cats": cat_dict}))
# add a text categorizer component
textcat = nlp.create_pipe(
"textcat",
config={"exclusive_classes": True, "architecture": "bow", "ngram_size": 2},
)
for label in unique_classes:
textcat.add_label(label)
nlp.add_pipe(textcat, last=True)
# training the network
with nlp.select_pipes(enable="textcat"):
optimizer = nlp.begin_training(X=x_train, Y=y_train)
for i in range(3):
losses = {}
batches = minibatch(train_data, size=compounding(4.0, 32.0, 1.001))
for batch in batches:
nlp.update(
examples=batch, sgd=optimizer, drop=0.1, losses=losses,
)

View File

@@ -1,9 +0,0 @@
from spacy.lang.hi import Hindi
def test_issue3625():
"""Test that default punctuation rules applies to hindi unicode characters"""
nlp = Hindi()
doc = nlp("hi. how हुए. होटल, होटल")
expected = ["hi", ".", "how", "हुए", ".", "होटल", ",", "होटल"]
assert [token.text for token in doc] == expected

View File

@@ -1,10 +0,0 @@
from spacy.lang.es import Spanish
def test_issue3803():
"""Test that spanish num-like tokens have True for like_num attribute."""
nlp = Spanish()
text = "2 dos 1000 mil 12 doce"
doc = nlp(text)
assert [t.like_num for t in doc] == [True, True, True, True, True, True]

View File

@@ -1,34 +0,0 @@
from spacy.pipeline.pipes import DependencyParser
from spacy.vocab import Vocab
from spacy.pipeline.defaults import default_parser
def test_issue3830_no_subtok():
"""Test that the parser doesn't have subtok label if not learn_tokens"""
config = {
"learn_tokens": False,
"min_action_freq": 30,
"beam_width": 1,
"beam_update_prob": 1.0,
}
parser = DependencyParser(Vocab(), default_parser(), **config)
parser.add_label("nsubj")
assert "subtok" not in parser.labels
parser.begin_training(lambda: [])
assert "subtok" not in parser.labels
def test_issue3830_with_subtok():
"""Test that the parser does have subtok label if learn_tokens=True."""
config = {
"learn_tokens": True,
"min_action_freq": 30,
"beam_width": 1,
"beam_update_prob": 1.0,
}
parser = DependencyParser(Vocab(), default_parser(), **config)
parser.add_label("nsubj")
assert "subtok" not in parser.labels
parser.begin_training(lambda: [])
assert "subtok" in parser.labels

View File

@@ -1,18 +0,0 @@
from spacy.matcher import Matcher
from spacy.tokens import Doc
def test_issue3839(en_vocab):
"""Test that match IDs returned by the matcher are correct, are in the string """
doc = Doc(en_vocab, words=["terrific", "group", "of", "people"])
matcher = Matcher(en_vocab)
match_id = "PATTERN"
pattern1 = [{"LOWER": "terrific"}, {"OP": "?"}, {"LOWER": "group"}]
pattern2 = [{"LOWER": "terrific"}, {"OP": "?"}, {"OP": "?"}, {"LOWER": "group"}]
matcher.add(match_id, [pattern1])
matches = matcher(doc)
assert matches[0][0] == en_vocab.strings[match_id]
matcher = Matcher(en_vocab)
matcher.add(match_id, [pattern2])
matches = matcher(doc)
assert matches[0][0] == en_vocab.strings[match_id]

View File

@@ -1,25 +0,0 @@
import pytest
from spacy.attrs import IS_ALPHA
from spacy.lang.en import English
@pytest.mark.parametrize(
"sentence",
[
"The story was to the effect that a young American student recently called on Professor Christlieb with a letter of introduction.",
"The next month Barry Siddall joined Stoke City on a free transfer, after Chris Pearce had established himself as the Vale's #1.",
"The next month Barry Siddall joined Stoke City on a free transfer, after Chris Pearce had established himself as the Vale's number one",
"Indeed, making the one who remains do all the work has installed him into a position of such insolent tyranny, it will take a month at least to reduce him to his proper proportions.",
"It was a missed assignment, but it shouldn't have resulted in a turnover ...",
],
)
def test_issue3869(sentence):
"""Test that the Doc's count_by function works consistently"""
nlp = English()
doc = nlp(sentence)
count = 0
for token in doc:
count += token.is_alpha
assert count == doc.count_by(IS_ALPHA).get(1, 0)

View File

@@ -1,11 +0,0 @@
from spacy.matcher import Matcher
from spacy.tokens import Doc
def test_issue3879(en_vocab):
doc = Doc(en_vocab, words=["This", "is", "a", "test", "."])
assert len(doc) == 5
pattern = [{"ORTH": "This", "OP": "?"}, {"OP": "?"}, {"ORTH": "test"}]
matcher = Matcher(en_vocab)
matcher.add("TEST", [pattern])
assert len(matcher(doc)) == 2 # fails because of a FP match 'is a test'

View File

@@ -1,21 +0,0 @@
from spacy.lang.en import English
import pytest
@pytest.mark.filterwarnings("ignore::UserWarning")
def test_issue3880():
"""Test that `nlp.pipe()` works when an empty string ends the batch.
Fixed in v7.0.5 of Thinc.
"""
texts = ["hello", "world", "", ""]
nlp = English()
nlp.add_pipe(nlp.create_pipe("parser"))
nlp.add_pipe(nlp.create_pipe("ner"))
nlp.add_pipe(nlp.create_pipe("tagger"))
nlp.get_pipe("parser").add_label("dep")
nlp.get_pipe("ner").add_label("PERSON")
nlp.get_pipe("tagger").add_label("NN")
nlp.begin_training()
for doc in nlp.pipe(texts):
pass

View File

@@ -1,12 +0,0 @@
from spacy.displacy import parse_deps
from spacy.tokens import Doc
def test_issue3882(en_vocab):
"""Test that displaCy doesn't serialize the doc.user_data when making a
copy of the Doc.
"""
doc = Doc(en_vocab, words=["Hello", "world"])
doc.is_parsed = True
doc.user_data["test"] = set()
parse_deps(doc)

View File

@@ -1,17 +0,0 @@
from spacy.matcher import Matcher
from spacy.tokens import Doc
def test_issue3951(en_vocab):
"""Test that combinations of optional rules are matched correctly."""
matcher = Matcher(en_vocab)
pattern = [
{"LOWER": "hello"},
{"LOWER": "this", "OP": "?"},
{"OP": "?"},
{"LOWER": "world"},
]
matcher.add("TEST", [pattern])
doc = Doc(en_vocab, words=["Hello", "my", "new", "world"])
matches = matcher(doc)
assert len(matches) == 0

View File

@@ -1,26 +0,0 @@
from spacy.lang.en import English
from ..util import make_tempdir
def test_issue3959():
""" Ensure that a modified pos attribute is serialized correctly."""
nlp = English()
doc = nlp(
"displaCy uses JavaScript, SVG and CSS to show you how computers understand language"
)
assert doc[0].pos_ == ""
doc[0].pos_ = "NOUN"
assert doc[0].pos_ == "NOUN"
# usually this is already True when starting from proper models instead of blank English
doc.is_tagged = True
with make_tempdir() as tmp_dir:
file_path = tmp_dir / "my_doc"
doc.to_disk(file_path)
doc2 = nlp("")
doc2.from_disk(file_path)
assert doc2[0].pos_ == "NOUN"

View File

@@ -1,117 +0,0 @@
import pytest
from ..util import get_doc
@pytest.fixture
def doc(en_tokenizer):
text = "He jests at scars, that never felt a wound."
heads = [1, 6, -1, -1, 3, 2, 1, 0, 1, -2, -3]
deps = [
"nsubj",
"ccomp",
"prep",
"pobj",
"punct",
"nsubj",
"neg",
"ROOT",
"det",
"dobj",
"punct",
]
tokens = en_tokenizer(text)
return get_doc(tokens.vocab, words=[t.text for t in tokens], heads=heads, deps=deps)
def test_issue3962(doc):
""" Ensure that as_doc does not result in out-of-bound access of tokens.
This is achieved by setting the head to itself if it would lie out of the span otherwise."""
span2 = doc[1:5] # "jests at scars ,"
doc2 = span2.as_doc()
doc2_json = doc2.to_json()
assert doc2_json
assert (
doc2[0].head.text == "jests"
) # head set to itself, being the new artificial root
assert doc2[0].dep_ == "dep"
assert doc2[1].head.text == "jests"
assert doc2[1].dep_ == "prep"
assert doc2[2].head.text == "at"
assert doc2[2].dep_ == "pobj"
assert doc2[3].head.text == "jests" # head set to the new artificial root
assert doc2[3].dep_ == "dep"
# We should still have 1 sentence
assert len(list(doc2.sents)) == 1
span3 = doc[6:9] # "never felt a"
doc3 = span3.as_doc()
doc3_json = doc3.to_json()
assert doc3_json
assert doc3[0].head.text == "felt"
assert doc3[0].dep_ == "neg"
assert doc3[1].head.text == "felt"
assert doc3[1].dep_ == "ROOT"
assert doc3[2].head.text == "felt" # head set to ancestor
assert doc3[2].dep_ == "dep"
# We should still have 1 sentence as "a" can be attached to "felt" instead of "wound"
assert len(list(doc3.sents)) == 1
@pytest.fixture
def two_sent_doc(en_tokenizer):
text = "He jests at scars. They never felt a wound."
heads = [1, 0, -1, -1, -3, 2, 1, 0, 1, -2, -3]
deps = [
"nsubj",
"ROOT",
"prep",
"pobj",
"punct",
"nsubj",
"neg",
"ROOT",
"det",
"dobj",
"punct",
]
tokens = en_tokenizer(text)
return get_doc(tokens.vocab, words=[t.text for t in tokens], heads=heads, deps=deps)
def test_issue3962_long(two_sent_doc):
""" Ensure that as_doc does not result in out-of-bound access of tokens.
This is achieved by setting the head to itself if it would lie out of the span otherwise."""
span2 = two_sent_doc[1:7] # "jests at scars. They never"
doc2 = span2.as_doc()
doc2_json = doc2.to_json()
assert doc2_json
assert (
doc2[0].head.text == "jests"
) # head set to itself, being the new artificial root (in sentence 1)
assert doc2[0].dep_ == "ROOT"
assert doc2[1].head.text == "jests"
assert doc2[1].dep_ == "prep"
assert doc2[2].head.text == "at"
assert doc2[2].dep_ == "pobj"
assert doc2[3].head.text == "jests"
assert doc2[3].dep_ == "punct"
assert (
doc2[4].head.text == "They"
) # head set to itself, being the new artificial root (in sentence 2)
assert doc2[4].dep_ == "dep"
assert (
doc2[4].head.text == "They"
) # head set to the new artificial head (in sentence 2)
assert doc2[4].dep_ == "dep"
# We should still have 2 sentences
sents = list(doc2.sents)
assert len(sents) == 2
assert sents[0].text == "jests at scars ."
assert sents[1].text == "They never"

View File

@@ -1,19 +0,0 @@
from spacy.matcher import PhraseMatcher
from spacy.tokens import Doc
def test_issue3972(en_vocab):
"""Test that the PhraseMatcher returns duplicates for duplicate match IDs.
"""
matcher = PhraseMatcher(en_vocab)
matcher.add("A", [Doc(en_vocab, words=["New", "York"])])
matcher.add("B", [Doc(en_vocab, words=["New", "York"])])
doc = Doc(en_vocab, words=["I", "live", "in", "New", "York"])
matches = matcher(doc)
assert len(matches) == 2
# We should have a match for each of the two rules
found_ids = [en_vocab.strings[ent_id] for (ent_id, _, _) in matches]
assert "A" in found_ids
assert "B" in found_ids

View File

@@ -0,0 +1,469 @@
import pytest
from spacy.pipeline import EntityRuler, EntityRecognizer, Pipe
from spacy.pipeline.defaults import default_ner
from spacy.matcher import PhraseMatcher, Matcher
from spacy.tokens import Doc, Span, DocBin
from spacy.gold import Example, Corpus
from spacy.gold.converters import json2docs
from spacy.vocab import Vocab
from spacy.lang.en import English
from spacy.util import minibatch, ensure_path, load_model
from spacy.util import compile_prefix_regex, compile_suffix_regex, compile_infix_regex
from spacy.tokenizer import Tokenizer
from spacy.lang.el import Greek
from spacy.language import Language
import spacy
from thinc.api import compounding
from collections import defaultdict
from ..util import make_tempdir
def test_issue4002(en_vocab):
"""Test that the PhraseMatcher can match on overwritten NORM attributes.
"""
matcher = PhraseMatcher(en_vocab, attr="NORM")
pattern1 = Doc(en_vocab, words=["c", "d"])
assert [t.norm_ for t in pattern1] == ["c", "d"]
matcher.add("TEST", [pattern1])
doc = Doc(en_vocab, words=["a", "b", "c", "d"])
assert [t.norm_ for t in doc] == ["a", "b", "c", "d"]
matches = matcher(doc)
assert len(matches) == 1
matcher = PhraseMatcher(en_vocab, attr="NORM")
pattern2 = Doc(en_vocab, words=["1", "2"])
pattern2[0].norm_ = "c"
pattern2[1].norm_ = "d"
assert [t.norm_ for t in pattern2] == ["c", "d"]
matcher.add("TEST", [pattern2])
matches = matcher(doc)
assert len(matches) == 1
def test_issue4030():
""" Test whether textcat works fine with empty doc """
unique_classes = ["offensive", "inoffensive"]
x_train = [
"This is an offensive text",
"This is the second offensive text",
"inoff",
]
y_train = ["offensive", "offensive", "inoffensive"]
nlp = spacy.blank("en")
# preparing the data
train_data = []
for text, train_instance in zip(x_train, y_train):
cat_dict = {label: label == train_instance for label in unique_classes}
train_data.append(Example.from_dict(nlp.make_doc(text), {"cats": cat_dict}))
# add a text categorizer component
textcat = nlp.create_pipe(
"textcat",
config={"exclusive_classes": True, "architecture": "bow", "ngram_size": 2},
)
for label in unique_classes:
textcat.add_label(label)
nlp.add_pipe(textcat, last=True)
# training the network
with nlp.select_pipes(enable="textcat"):
optimizer = nlp.begin_training()
for i in range(3):
losses = {}
batches = minibatch(train_data, size=compounding(4.0, 32.0, 1.001))
for batch in batches:
nlp.update(
examples=batch, sgd=optimizer, drop=0.1, losses=losses,
)
# processing of an empty doc should result in 0.0 for all categories
doc = nlp("")
assert doc.cats["offensive"] == 0.0
assert doc.cats["inoffensive"] == 0.0
@pytest.mark.filterwarnings("ignore::UserWarning")
def test_issue4042():
"""Test that serialization of an EntityRuler before NER works fine."""
nlp = English()
# add ner pipe
ner = nlp.create_pipe("ner")
ner.add_label("SOME_LABEL")
nlp.add_pipe(ner)
nlp.begin_training()
# Add entity ruler
ruler = EntityRuler(nlp)
patterns = [
{"label": "MY_ORG", "pattern": "Apple"},
{"label": "MY_GPE", "pattern": [{"lower": "san"}, {"lower": "francisco"}]},
]
ruler.add_patterns(patterns)
nlp.add_pipe(ruler, before="ner") # works fine with "after"
doc1 = nlp("What do you think about Apple ?")
assert doc1.ents[0].label_ == "MY_ORG"
with make_tempdir() as d:
output_dir = ensure_path(d)
if not output_dir.exists():
output_dir.mkdir()
nlp.to_disk(output_dir)
nlp2 = load_model(output_dir)
doc2 = nlp2("What do you think about Apple ?")
assert doc2.ents[0].label_ == "MY_ORG"
@pytest.mark.filterwarnings("ignore::UserWarning")
def test_issue4042_bug2():
"""
Test that serialization of an NER model works fine when new labels have been added.
This is the second of two bugs underlying issue 4042.
"""
nlp1 = English()
vocab = nlp1.vocab
# add ner pipe
ner1 = nlp1.create_pipe("ner")
ner1.add_label("SOME_LABEL")
nlp1.add_pipe(ner1)
nlp1.begin_training()
# add a new label to the doc
doc1 = nlp1("What do you think about Apple ?")
assert len(ner1.labels) == 1
assert "SOME_LABEL" in ner1.labels
apple_ent = Span(doc1, 5, 6, label="MY_ORG")
doc1.ents = list(doc1.ents) + [apple_ent]
# reapply the NER - at this point it should resize itself
ner1(doc1)
assert len(ner1.labels) == 2
assert "SOME_LABEL" in ner1.labels
assert "MY_ORG" in ner1.labels
with make_tempdir() as d:
# assert IO goes fine
output_dir = ensure_path(d)
if not output_dir.exists():
output_dir.mkdir()
ner1.to_disk(output_dir)
config = {
"learn_tokens": False,
"min_action_freq": 30,
"beam_width": 1,
"beam_update_prob": 1.0,
}
ner2 = EntityRecognizer(vocab, default_ner(), **config)
ner2.from_disk(output_dir)
assert len(ner2.labels) == 2
def test_issue4054(en_vocab):
"""Test that a new blank model can be made with a vocab from file,
and that serialization does not drop the language at any point."""
nlp1 = English()
vocab1 = nlp1.vocab
with make_tempdir() as d:
vocab_dir = ensure_path(d / "vocab")
if not vocab_dir.exists():
vocab_dir.mkdir()
vocab1.to_disk(vocab_dir)
vocab2 = Vocab().from_disk(vocab_dir)
print("lang", vocab2.lang)
nlp2 = spacy.blank("en", vocab=vocab2)
nlp_dir = ensure_path(d / "nlp")
if not nlp_dir.exists():
nlp_dir.mkdir()
nlp2.to_disk(nlp_dir)
nlp3 = load_model(nlp_dir)
assert nlp3.lang == "en"
def test_issue4120(en_vocab):
"""Test that matches without a final {OP: ?} token are returned."""
matcher = Matcher(en_vocab)
matcher.add("TEST", [[{"ORTH": "a"}, {"OP": "?"}]])
doc1 = Doc(en_vocab, words=["a"])
assert len(matcher(doc1)) == 1 # works
doc2 = Doc(en_vocab, words=["a", "b", "c"])
assert len(matcher(doc2)) == 2 # fixed
matcher = Matcher(en_vocab)
matcher.add("TEST", [[{"ORTH": "a"}, {"OP": "?"}, {"ORTH": "b"}]])
doc3 = Doc(en_vocab, words=["a", "b", "b", "c"])
assert len(matcher(doc3)) == 2 # works
matcher = Matcher(en_vocab)
matcher.add("TEST", [[{"ORTH": "a"}, {"OP": "?"}, {"ORTH": "b", "OP": "?"}]])
doc4 = Doc(en_vocab, words=["a", "b", "b", "c"])
assert len(matcher(doc4)) == 3 # fixed
def test_issue4133(en_vocab):
nlp = English()
vocab_bytes = nlp.vocab.to_bytes()
words = ["Apple", "is", "looking", "at", "buying", "a", "startup"]
pos = ["NOUN", "VERB", "ADP", "VERB", "PROPN", "NOUN", "ADP"]
doc = Doc(en_vocab, words=words)
for i, token in enumerate(doc):
token.pos_ = pos[i]
# usually this is already True when starting from proper models instead of blank English
doc.is_tagged = True
doc_bytes = doc.to_bytes()
vocab = Vocab()
vocab = vocab.from_bytes(vocab_bytes)
doc = Doc(vocab).from_bytes(doc_bytes)
actual = []
for token in doc:
actual.append(token.pos_)
assert actual == pos
def test_issue4190():
def customize_tokenizer(nlp):
prefix_re = compile_prefix_regex(nlp.Defaults.prefixes)
suffix_re = compile_suffix_regex(nlp.Defaults.suffixes)
infix_re = compile_infix_regex(nlp.Defaults.infixes)
# Remove all exceptions where a single letter is followed by a period (e.g. 'h.')
exceptions = {
k: v
for k, v in dict(nlp.Defaults.tokenizer_exceptions).items()
if not (len(k) == 2 and k[1] == ".")
}
new_tokenizer = Tokenizer(
nlp.vocab,
exceptions,
prefix_search=prefix_re.search,
suffix_search=suffix_re.search,
infix_finditer=infix_re.finditer,
token_match=nlp.tokenizer.token_match,
)
nlp.tokenizer = new_tokenizer
test_string = "Test c."
# Load default language
nlp_1 = English()
doc_1a = nlp_1(test_string)
result_1a = [token.text for token in doc_1a] # noqa: F841
# Modify tokenizer
customize_tokenizer(nlp_1)
doc_1b = nlp_1(test_string)
result_1b = [token.text for token in doc_1b]
# Save and Reload
with make_tempdir() as model_dir:
nlp_1.to_disk(model_dir)
nlp_2 = load_model(model_dir)
# This should be the modified tokenizer
doc_2 = nlp_2(test_string)
result_2 = [token.text for token in doc_2]
assert result_1b == result_2
@pytest.mark.filterwarnings("ignore::UserWarning")
def test_issue4267():
""" Test that running an entity_ruler after ner gives consistent results"""
nlp = English()
ner = nlp.create_pipe("ner")
ner.add_label("PEOPLE")
nlp.add_pipe(ner)
nlp.begin_training()
assert "ner" in nlp.pipe_names
# assert that we have correct IOB annotations
doc1 = nlp("hi")
assert doc1.is_nered
for token in doc1:
assert token.ent_iob == 2
# add entity ruler and run again
ruler = EntityRuler(nlp)
patterns = [{"label": "SOFTWARE", "pattern": "spacy"}]
ruler.add_patterns(patterns)
nlp.add_pipe(ruler)
assert "entity_ruler" in nlp.pipe_names
assert "ner" in nlp.pipe_names
# assert that we still have correct IOB annotations
doc2 = nlp("hi")
assert doc2.is_nered
for token in doc2:
assert token.ent_iob == 2
def test_issue4272():
"""Test that lookup table can be accessed from Token.lemma if no POS tags
are available."""
nlp = Greek()
doc = nlp("Χθες")
assert doc[0].lemma_
def test_multiple_predictions():
class DummyPipe(Pipe):
def __init__(self):
self.model = "dummy_model"
def predict(self, docs):
return ([1, 2, 3], [4, 5, 6])
def set_annotations(self, docs, scores, tensors=None):
return docs
nlp = Language()
doc = nlp.make_doc("foo")
dummy_pipe = DummyPipe()
dummy_pipe(doc)
@pytest.mark.skip(reason="removed Beam stuff during the Example/GoldParse refactor")
def test_issue4313():
""" This should not crash or exit with some strange error code """
beam_width = 16
beam_density = 0.0001
nlp = English()
config = {
"learn_tokens": False,
"min_action_freq": 30,
"beam_width": 1,
"beam_update_prob": 1.0,
}
ner = EntityRecognizer(nlp.vocab, default_ner(), **config)
ner.add_label("SOME_LABEL")
ner.begin_training([])
nlp.add_pipe(ner)
# add a new label to the doc
doc = nlp("What do you think about Apple ?")
assert len(ner.labels) == 1
assert "SOME_LABEL" in ner.labels
apple_ent = Span(doc, 5, 6, label="MY_ORG")
doc.ents = list(doc.ents) + [apple_ent]
# ensure the beam_parse still works with the new label
docs = [doc]
beams = nlp.entity.beam_parse(
docs, beam_width=beam_width, beam_density=beam_density
)
for doc, beam in zip(docs, beams):
entity_scores = defaultdict(float)
for score, ents in nlp.entity.moves.get_beam_parses(beam):
for start, end, label in ents:
entity_scores[(start, end, label)] += score
@pytest.mark.filterwarnings("ignore::UserWarning")
def test_issue4348():
"""Test that training the tagger with empty data, doesn't throw errors"""
nlp = English()
example = Example.from_dict(nlp.make_doc(""), {"tags": []})
TRAIN_DATA = [example, example]
tagger = nlp.create_pipe("tagger")
nlp.add_pipe(tagger)
optimizer = nlp.begin_training()
for i in range(5):
losses = {}
batches = minibatch(TRAIN_DATA, size=compounding(4.0, 32.0, 1.001))
for batch in batches:
nlp.update(batch, sgd=optimizer, losses=losses)
def test_issue4367():
"""Test that docbin init goes well"""
DocBin()
DocBin(attrs=["LEMMA"])
DocBin(attrs=["LEMMA", "ENT_IOB", "ENT_TYPE"])
def test_issue4373():
"""Test that PhraseMatcher.vocab can be accessed (like Matcher.vocab)."""
matcher = Matcher(Vocab())
assert isinstance(matcher.vocab, Vocab)
matcher = PhraseMatcher(Vocab())
assert isinstance(matcher.vocab, Vocab)
def test_issue4402():
json_data = {
"id": 0,
"paragraphs": [
{
"raw": "How should I cook bacon in an oven?\nI've heard of people cooking bacon in an oven.",
"sentences": [
{
"tokens": [
{"id": 0, "orth": "How", "ner": "O"},
{"id": 1, "orth": "should", "ner": "O"},
{"id": 2, "orth": "I", "ner": "O"},
{"id": 3, "orth": "cook", "ner": "O"},
{"id": 4, "orth": "bacon", "ner": "O"},
{"id": 5, "orth": "in", "ner": "O"},
{"id": 6, "orth": "an", "ner": "O"},
{"id": 7, "orth": "oven", "ner": "O"},
{"id": 8, "orth": "?", "ner": "O"},
],
"brackets": [],
},
{
"tokens": [
{"id": 9, "orth": "\n", "ner": "O"},
{"id": 10, "orth": "I", "ner": "O"},
{"id": 11, "orth": "'ve", "ner": "O"},
{"id": 12, "orth": "heard", "ner": "O"},
{"id": 13, "orth": "of", "ner": "O"},
{"id": 14, "orth": "people", "ner": "O"},
{"id": 15, "orth": "cooking", "ner": "O"},
{"id": 16, "orth": "bacon", "ner": "O"},
{"id": 17, "orth": "in", "ner": "O"},
{"id": 18, "orth": "an", "ner": "O"},
{"id": 19, "orth": "oven", "ner": "O"},
{"id": 20, "orth": ".", "ner": "O"},
],
"brackets": [],
},
],
"cats": [
{"label": "baking", "value": 1.0},
{"label": "not_baking", "value": 0.0},
],
},
{
"raw": "What is the difference between white and brown eggs?\n",
"sentences": [
{
"tokens": [
{"id": 0, "orth": "What", "ner": "O"},
{"id": 1, "orth": "is", "ner": "O"},
{"id": 2, "orth": "the", "ner": "O"},
{"id": 3, "orth": "difference", "ner": "O"},
{"id": 4, "orth": "between", "ner": "O"},
{"id": 5, "orth": "white", "ner": "O"},
{"id": 6, "orth": "and", "ner": "O"},
{"id": 7, "orth": "brown", "ner": "O"},
{"id": 8, "orth": "eggs", "ner": "O"},
{"id": 9, "orth": "?", "ner": "O"},
],
"brackets": [],
},
{"tokens": [{"id": 10, "orth": "\n", "ner": "O"}], "brackets": []},
],
"cats": [
{"label": "baking", "value": 0.0},
{"label": "not_baking", "value": 1.0},
],
},
],
}
nlp = English()
attrs = ["ORTH", "SENT_START", "ENT_IOB", "ENT_TYPE"]
with make_tempdir() as tmpdir:
output_file = tmpdir / "test4402.spacy"
docs = json2docs([json_data])
data = DocBin(docs=docs, attrs=attrs).to_bytes()
with output_file.open("wb") as file_:
file_.write(data)
corpus = Corpus(train_loc=str(output_file), dev_loc=str(output_file))
train_data = list(corpus.train_dataset(nlp))
assert len(train_data) == 2
split_train_data = []
for eg in train_data:
split_train_data.extend(eg.split_sents())
assert len(split_train_data) == 4

View File

@@ -1,23 +0,0 @@
from spacy.matcher import PhraseMatcher
from spacy.tokens import Doc
def test_issue4002(en_vocab):
"""Test that the PhraseMatcher can match on overwritten NORM attributes.
"""
matcher = PhraseMatcher(en_vocab, attr="NORM")
pattern1 = Doc(en_vocab, words=["c", "d"])
assert [t.norm_ for t in pattern1] == ["c", "d"]
matcher.add("TEST", [pattern1])
doc = Doc(en_vocab, words=["a", "b", "c", "d"])
assert [t.norm_ for t in doc] == ["a", "b", "c", "d"]
matches = matcher(doc)
assert len(matches) == 1
matcher = PhraseMatcher(en_vocab, attr="NORM")
pattern2 = Doc(en_vocab, words=["1", "2"])
pattern2[0].norm_ = "c"
pattern2[1].norm_ = "d"
assert [t.norm_ for t in pattern2] == ["c", "d"]
matcher.add("TEST", [pattern2])
matches = matcher(doc)
assert len(matches) == 1

View File

@@ -1,50 +0,0 @@
import spacy
from spacy.util import minibatch
from thinc.api import compounding
from spacy.gold import Example
def test_issue4030():
""" Test whether textcat works fine with empty doc """
unique_classes = ["offensive", "inoffensive"]
x_train = [
"This is an offensive text",
"This is the second offensive text",
"inoff",
]
y_train = ["offensive", "offensive", "inoffensive"]
nlp = spacy.blank("en")
# preparing the data
train_data = []
for text, train_instance in zip(x_train, y_train):
cat_dict = {label: label == train_instance for label in unique_classes}
train_data.append(Example.from_dict(nlp.make_doc(text), {"cats": cat_dict}))
# add a text categorizer component
textcat = nlp.create_pipe(
"textcat",
config={"exclusive_classes": True, "architecture": "bow", "ngram_size": 2},
)
for label in unique_classes:
textcat.add_label(label)
nlp.add_pipe(textcat, last=True)
# training the network
with nlp.select_pipes(enable="textcat"):
optimizer = nlp.begin_training()
for i in range(3):
losses = {}
batches = minibatch(train_data, size=compounding(4.0, 32.0, 1.001))
for batch in batches:
nlp.update(
examples=batch, sgd=optimizer, drop=0.1, losses=losses,
)
# processing of an empty doc should result in 0.0 for all categories
doc = nlp("")
assert doc.cats["offensive"] == 0.0
assert doc.cats["inoffensive"] == 0.0

View File

@@ -1,85 +0,0 @@
import spacy
from spacy.pipeline import EntityRecognizer, EntityRuler
from spacy.lang.en import English
from spacy.tokens import Span
from spacy.util import ensure_path
from spacy.pipeline.defaults import default_ner
from ..util import make_tempdir
def test_issue4042():
"""Test that serialization of an EntityRuler before NER works fine."""
nlp = English()
# add ner pipe
ner = nlp.create_pipe("ner")
ner.add_label("SOME_LABEL")
nlp.add_pipe(ner)
nlp.begin_training()
# Add entity ruler
ruler = EntityRuler(nlp)
patterns = [
{"label": "MY_ORG", "pattern": "Apple"},
{"label": "MY_GPE", "pattern": [{"lower": "san"}, {"lower": "francisco"}]},
]
ruler.add_patterns(patterns)
nlp.add_pipe(ruler, before="ner") # works fine with "after"
doc1 = nlp("What do you think about Apple ?")
assert doc1.ents[0].label_ == "MY_ORG"
with make_tempdir() as d:
output_dir = ensure_path(d)
if not output_dir.exists():
output_dir.mkdir()
nlp.to_disk(output_dir)
nlp2 = spacy.load(output_dir)
doc2 = nlp2("What do you think about Apple ?")
assert doc2.ents[0].label_ == "MY_ORG"
def test_issue4042_bug2():
"""
Test that serialization of an NER model works fine when new labels have been added.
This is the second of two bugs underlying issue 4042.
"""
nlp1 = English()
vocab = nlp1.vocab
# add ner pipe
ner1 = nlp1.create_pipe("ner")
ner1.add_label("SOME_LABEL")
nlp1.add_pipe(ner1)
nlp1.begin_training()
# add a new label to the doc
doc1 = nlp1("What do you think about Apple ?")
assert len(ner1.labels) == 1
assert "SOME_LABEL" in ner1.labels
apple_ent = Span(doc1, 5, 6, label="MY_ORG")
doc1.ents = list(doc1.ents) + [apple_ent]
# reapply the NER - at this point it should resize itself
ner1(doc1)
assert len(ner1.labels) == 2
assert "SOME_LABEL" in ner1.labels
assert "MY_ORG" in ner1.labels
with make_tempdir() as d:
# assert IO goes fine
output_dir = ensure_path(d)
if not output_dir.exists():
output_dir.mkdir()
ner1.to_disk(output_dir)
config = {
"learn_tokens": False,
"min_action_freq": 30,
"beam_width": 1,
"beam_update_prob": 1.0,
}
ner2 = EntityRecognizer(vocab, default_ner(), **config)
ner2.from_disk(output_dir)
assert len(ner2.labels) == 2

View File

@@ -1,30 +0,0 @@
from spacy.vocab import Vocab
import spacy
from spacy.lang.en import English
from spacy.util import ensure_path
from ..util import make_tempdir
def test_issue4054(en_vocab):
"""Test that a new blank model can be made with a vocab from file,
and that serialization does not drop the language at any point."""
nlp1 = English()
vocab1 = nlp1.vocab
with make_tempdir() as d:
vocab_dir = ensure_path(d / "vocab")
if not vocab_dir.exists():
vocab_dir.mkdir()
vocab1.to_disk(vocab_dir)
vocab2 = Vocab().from_disk(vocab_dir)
print("lang", vocab2.lang)
nlp2 = spacy.blank("en", vocab=vocab2)
nlp_dir = ensure_path(d / "nlp")
if not nlp_dir.exists():
nlp_dir.mkdir()
nlp2.to_disk(nlp_dir)
nlp3 = spacy.load(nlp_dir)
assert nlp3.lang == "en"

View File

@@ -1,23 +0,0 @@
from spacy.matcher import Matcher
from spacy.tokens import Doc
def test_issue4120(en_vocab):
"""Test that matches without a final {OP: ?} token are returned."""
matcher = Matcher(en_vocab)
matcher.add("TEST", [[{"ORTH": "a"}, {"OP": "?"}]])
doc1 = Doc(en_vocab, words=["a"])
assert len(matcher(doc1)) == 1 # works
doc2 = Doc(en_vocab, words=["a", "b", "c"])
assert len(matcher(doc2)) == 2 # fixed
matcher = Matcher(en_vocab)
matcher.add("TEST", [[{"ORTH": "a"}, {"OP": "?"}, {"ORTH": "b"}]])
doc3 = Doc(en_vocab, words=["a", "b", "b", "c"])
assert len(matcher(doc3)) == 2 # works
matcher = Matcher(en_vocab)
matcher.add("TEST", [[{"ORTH": "a"}, {"OP": "?"}, {"ORTH": "b", "OP": "?"}]])
doc4 = Doc(en_vocab, words=["a", "b", "b", "c"])
assert len(matcher(doc4)) == 3 # fixed

View File

@@ -1,28 +0,0 @@
from spacy.lang.en import English
from spacy.tokens import Doc
from spacy.vocab import Vocab
def test_issue4133(en_vocab):
nlp = English()
vocab_bytes = nlp.vocab.to_bytes()
words = ["Apple", "is", "looking", "at", "buying", "a", "startup"]
pos = ["NOUN", "VERB", "ADP", "VERB", "PROPN", "NOUN", "ADP"]
doc = Doc(en_vocab, words=words)
for i, token in enumerate(doc):
token.pos_ = pos[i]
# usually this is already True when starting from proper models instead of blank English
doc.is_tagged = True
doc_bytes = doc.to_bytes()
vocab = Vocab()
vocab = vocab.from_bytes(vocab_bytes)
doc = Doc(vocab).from_bytes(doc_bytes)
actual = []
for token in doc:
actual.append(token.pos_)
assert actual == pos

View File

@@ -1,46 +0,0 @@
from spacy.lang.en import English
from spacy.tokenizer import Tokenizer
from spacy import util
from ..util import make_tempdir
def test_issue4190():
test_string = "Test c."
# Load default language
nlp_1 = English()
doc_1a = nlp_1(test_string)
result_1a = [token.text for token in doc_1a] # noqa: F841
# Modify tokenizer
customize_tokenizer(nlp_1)
doc_1b = nlp_1(test_string)
result_1b = [token.text for token in doc_1b]
# Save and Reload
with make_tempdir() as model_dir:
nlp_1.to_disk(model_dir)
nlp_2 = util.load_model(model_dir)
# This should be the modified tokenizer
doc_2 = nlp_2(test_string)
result_2 = [token.text for token in doc_2]
assert result_1b == result_2
def customize_tokenizer(nlp):
prefix_re = util.compile_prefix_regex(nlp.Defaults.prefixes)
suffix_re = util.compile_suffix_regex(nlp.Defaults.suffixes)
infix_re = util.compile_infix_regex(nlp.Defaults.infixes)
# Remove all exceptions where a single letter is followed by a period (e.g. 'h.')
exceptions = {
k: v
for k, v in dict(nlp.Defaults.tokenizer_exceptions).items()
if not (len(k) == 2 and k[1] == ".")
}
new_tokenizer = Tokenizer(
nlp.vocab,
exceptions,
prefix_search=prefix_re.search,
suffix_search=suffix_re.search,
infix_finditer=infix_re.finditer,
token_match=nlp.tokenizer.token_match,
)
nlp.tokenizer = new_tokenizer

View File

@@ -1,34 +0,0 @@
from spacy.lang.en import English
from spacy.pipeline import EntityRuler
def test_issue4267():
""" Test that running an entity_ruler after ner gives consistent results"""
nlp = English()
ner = nlp.create_pipe("ner")
ner.add_label("PEOPLE")
nlp.add_pipe(ner)
nlp.begin_training()
assert "ner" in nlp.pipe_names
# assert that we have correct IOB annotations
doc1 = nlp("hi")
assert doc1.is_nered
for token in doc1:
assert token.ent_iob == 2
# add entity ruler and run again
ruler = EntityRuler(nlp)
patterns = [{"label": "SOFTWARE", "pattern": "spacy"}]
ruler.add_patterns(patterns)
nlp.add_pipe(ruler)
assert "entity_ruler" in nlp.pipe_names
assert "ner" in nlp.pipe_names
# assert that we still have correct IOB annotations
doc2 = nlp("hi")
assert doc2.is_nered
for token in doc2:
assert token.ent_iob == 2

View File

@@ -1,9 +0,0 @@
from spacy.lang.el import Greek
def test_issue4272():
"""Test that lookup table can be accessed from Token.lemma if no POS tags
are available."""
nlp = Greek()
doc = nlp("Χθες")
assert doc[0].lemma_

View File

@@ -1,25 +0,0 @@
import pytest
from spacy.language import Language
from spacy.pipeline import Pipe
class DummyPipe(Pipe):
def __init__(self):
self.model = "dummy_model"
def predict(self, docs):
return ([1, 2, 3], [4, 5, 6])
def set_annotations(self, docs, scores, tensors=None):
return docs
@pytest.fixture
def nlp():
return Language()
def test_multiple_predictions(nlp):
doc = nlp.make_doc("foo")
dummy_pipe = DummyPipe()
dummy_pipe(doc)

View File

@ -1,47 +0,0 @@
from collections import defaultdict
import pytest
from spacy.pipeline.defaults import default_ner
from spacy.pipeline import EntityRecognizer
from spacy.lang.en import English
from spacy.tokens import Span
# skipped after removing Beam stuff during the Example/GoldParse refactor
@pytest.mark.skip
def test_issue4313():
""" This should not crash or exit with some strange error code """
beam_width = 16
beam_density = 0.0001
nlp = English()
config = {
"learn_tokens": False,
"min_action_freq": 30,
"beam_width": 1,
"beam_update_prob": 1.0,
}
ner = EntityRecognizer(nlp.vocab, default_ner(), **config)
ner.add_label("SOME_LABEL")
ner.begin_training([])
nlp.add_pipe(ner)
# add a new label to the doc
doc = nlp("What do you think about Apple ?")
assert len(ner.labels) == 1
assert "SOME_LABEL" in ner.labels
apple_ent = Span(doc, 5, 6, label="MY_ORG")
doc.ents = list(doc.ents) + [apple_ent]
# ensure the beam_parse still works with the new label
docs = [doc]
beams = nlp.entity.beam_parse(
docs, beam_width=beam_width, beam_density=beam_density
)
for doc, beam in zip(docs, beams):
entity_scores = defaultdict(float)
for score, ents in nlp.entity.moves.get_beam_parses(beam):
for start, end, label in ents:
entity_scores[(start, end, label)] += score

View File

@ -1,24 +0,0 @@
from spacy.gold import Example
from spacy.lang.en import English
from spacy.util import minibatch
from thinc.api import compounding
import pytest
@pytest.mark.filterwarnings("ignore::UserWarning")
def test_issue4348():
"""Test that training the tagger with empty data, doesn't throw errors"""
nlp = English()
example = Example.from_dict(nlp.make_doc(""), {"tags": []})
TRAIN_DATA = [example, example]
tagger = nlp.create_pipe("tagger")
nlp.add_pipe(tagger)
optimizer = nlp.begin_training()
for i in range(5):
losses = {}
batches = minibatch(TRAIN_DATA, size=compounding(4.0, 32.0, 1.001))
for batch in batches:
nlp.update(batch, sgd=optimizer, losses=losses)

View File

@ -1,8 +0,0 @@
from spacy.tokens import DocBin
def test_issue4367():
"""Test that docbin init goes well"""
DocBin()
DocBin(attrs=["LEMMA"])
DocBin(attrs=["LEMMA", "ENT_IOB", "ENT_TYPE"])

View File

@ -1,10 +0,0 @@
from spacy.matcher import Matcher, PhraseMatcher
from spacy.vocab import Vocab
def test_issue4373():
"""Test that PhraseMatcher.vocab can be accessed (like Matcher.vocab)."""
matcher = Matcher(Vocab())
assert isinstance(matcher.vocab, Vocab)
matcher = PhraseMatcher(Vocab())
assert isinstance(matcher.vocab, Vocab)

View File

@ -1,98 +0,0 @@
from spacy.gold import Corpus
from spacy.lang.en import English
from ..util import make_tempdir
from ...gold.converters import json2docs
from ...tokens import DocBin
def test_issue4402():
nlp = English()
attrs = ["ORTH", "SENT_START", "ENT_IOB", "ENT_TYPE"]
with make_tempdir() as tmpdir:
output_file = tmpdir / "test4402.spacy"
docs = json2docs([json_data])
data = DocBin(docs=docs, attrs=attrs).to_bytes()
with output_file.open("wb") as file_:
file_.write(data)
corpus = Corpus(train_loc=str(output_file), dev_loc=str(output_file))
train_data = list(corpus.train_dataset(nlp))
assert len(train_data) == 2
split_train_data = []
for eg in train_data:
split_train_data.extend(eg.split_sents())
assert len(split_train_data) == 4
json_data = {
"id": 0,
"paragraphs": [
{
"raw": "How should I cook bacon in an oven?\nI've heard of people cooking bacon in an oven.",
"sentences": [
{
"tokens": [
{"id": 0, "orth": "How", "ner": "O"},
{"id": 1, "orth": "should", "ner": "O"},
{"id": 2, "orth": "I", "ner": "O"},
{"id": 3, "orth": "cook", "ner": "O"},
{"id": 4, "orth": "bacon", "ner": "O"},
{"id": 5, "orth": "in", "ner": "O"},
{"id": 6, "orth": "an", "ner": "O"},
{"id": 7, "orth": "oven", "ner": "O"},
{"id": 8, "orth": "?", "ner": "O"},
],
"brackets": [],
},
{
"tokens": [
{"id": 9, "orth": "\n", "ner": "O"},
{"id": 10, "orth": "I", "ner": "O"},
{"id": 11, "orth": "'ve", "ner": "O"},
{"id": 12, "orth": "heard", "ner": "O"},
{"id": 13, "orth": "of", "ner": "O"},
{"id": 14, "orth": "people", "ner": "O"},
{"id": 15, "orth": "cooking", "ner": "O"},
{"id": 16, "orth": "bacon", "ner": "O"},
{"id": 17, "orth": "in", "ner": "O"},
{"id": 18, "orth": "an", "ner": "O"},
{"id": 19, "orth": "oven", "ner": "O"},
{"id": 20, "orth": ".", "ner": "O"},
],
"brackets": [],
},
],
"cats": [
{"label": "baking", "value": 1.0},
{"label": "not_baking", "value": 0.0},
],
},
{
"raw": "What is the difference between white and brown eggs?\n",
"sentences": [
{
"tokens": [
{"id": 0, "orth": "What", "ner": "O"},
{"id": 1, "orth": "is", "ner": "O"},
{"id": 2, "orth": "the", "ner": "O"},
{"id": 3, "orth": "difference", "ner": "O"},
{"id": 4, "orth": "between", "ner": "O"},
{"id": 5, "orth": "white", "ner": "O"},
{"id": 6, "orth": "and", "ner": "O"},
{"id": 7, "orth": "brown", "ner": "O"},
{"id": 8, "orth": "eggs", "ner": "O"},
{"id": 9, "orth": "?", "ner": "O"},
],
"brackets": [],
},
{"tokens": [{"id": 10, "orth": "\n", "ner": "O"}], "brackets": []},
],
"cats": [
{"label": "baking", "value": 0.0},
{"label": "not_baking", "value": 1.0},
],
},
],
}

View File

@ -0,0 +1,288 @@
import pytest
from mock import Mock
from spacy.pipeline import EntityRuler
from spacy.matcher import DependencyMatcher
from spacy.tokens import Doc, Span, DocBin
from spacy.gold import Example
from spacy.gold.converters.conllu2docs import conllu2docs
from spacy.lang.en import English
from spacy.kb import KnowledgeBase
from spacy.vocab import Vocab
from spacy.language import Language
from spacy.util import ensure_path, load_model_from_path
import numpy
import pickle
from ..util import get_doc, make_tempdir
def test_issue4528(en_vocab):
"""Test that user_data is correctly serialized in DocBin."""
doc = Doc(en_vocab, words=["hello", "world"])
doc.user_data["foo"] = "bar"
# This is how extension attribute values are stored in the user data
doc.user_data[("._.", "foo", None, None)] = "bar"
doc_bin = DocBin(store_user_data=True)
doc_bin.add(doc)
doc_bin_bytes = doc_bin.to_bytes()
new_doc_bin = DocBin(store_user_data=True).from_bytes(doc_bin_bytes)
new_doc = list(new_doc_bin.get_docs(en_vocab))[0]
assert new_doc.user_data["foo"] == "bar"
assert new_doc.user_data[("._.", "foo", None, None)] == "bar"
@pytest.mark.parametrize(
"text,words", [("A'B C", ["A", "'", "B", "C"]), ("A-B", ["A-B"])]
)
def test_gold_misaligned(en_tokenizer, text, words):
doc = en_tokenizer(text)
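# creating the Example should not fail even though the reference words don't align with the tokenizer output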
Example.from_dict(doc, {"words": words})
def test_issue4590(en_vocab):
"""Test that matches param in on_match method are the same as matches run with no on_match method"""
pattern = [
{"SPEC": {"NODE_NAME": "jumped"}, "PATTERN": {"ORTH": "jumped"}},
{
"SPEC": {"NODE_NAME": "fox", "NBOR_RELOP": ">", "NBOR_NAME": "jumped"},
"PATTERN": {"ORTH": "fox"},
},
{
"SPEC": {"NODE_NAME": "quick", "NBOR_RELOP": ".", "NBOR_NAME": "jumped"},
"PATTERN": {"ORTH": "fox"},
},
]
on_match = Mock()
matcher = DependencyMatcher(en_vocab)
matcher.add("pattern", on_match, pattern)
text = "The quick brown fox jumped over the lazy fox"
heads = [3, 2, 1, 1, 0, -1, 2, 1, -3]
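# heads are given as offsets relative to each token, the convention expected by the get_doc test helper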
deps = ["det", "amod", "amod", "nsubj", "ROOT", "prep", "det", "amod", "pobj"]
doc = get_doc(en_vocab, text.split(), heads=heads, deps=deps)
matches = matcher(doc)
on_match_args = on_match.call_args
assert on_match_args[0][3] == matches
def test_issue4651_with_phrase_matcher_attr():
"""Test that the EntityRuler PhraseMatcher is deserialize correctly using
the method from_disk when the EntityRuler argument phrase_matcher_attr is
specified.
"""
text = "Spacy is a python library for nlp"
nlp = English()
ruler = EntityRuler(nlp, phrase_matcher_attr="LOWER")
patterns = [{"label": "PYTHON_LIB", "pattern": "spacy", "id": "spaCy"}]
ruler.add_patterns(patterns)
nlp.add_pipe(ruler)
doc = nlp(text)
res = [(ent.text, ent.label_, ent.ent_id_) for ent in doc.ents]
nlp_reloaded = English()
with make_tempdir() as d:
file_path = d / "entityruler"
ruler.to_disk(file_path)
ruler_reloaded = EntityRuler(nlp_reloaded).from_disk(file_path)
nlp_reloaded.add_pipe(ruler_reloaded)
doc_reloaded = nlp_reloaded(text)
res_reloaded = [(ent.text, ent.label_, ent.ent_id_) for ent in doc_reloaded.ents]
assert res == res_reloaded
def test_issue4651_without_phrase_matcher_attr():
"""Test that the EntityRuler PhraseMatcher is deserialize correctly using
the method from_disk when the EntityRuler argument phrase_matcher_attr is
not specified.
"""
text = "Spacy is a python library for nlp"
nlp = English()
ruler = EntityRuler(nlp)
patterns = [{"label": "PYTHON_LIB", "pattern": "spacy", "id": "spaCy"}]
ruler.add_patterns(patterns)
nlp.add_pipe(ruler)
doc = nlp(text)
res = [(ent.text, ent.label_, ent.ent_id_) for ent in doc.ents]
nlp_reloaded = English()
with make_tempdir() as d:
file_path = d / "entityruler"
ruler.to_disk(file_path)
ruler_reloaded = EntityRuler(nlp_reloaded).from_disk(file_path)
nlp_reloaded.add_pipe(ruler_reloaded)
doc_reloaded = nlp_reloaded(text)
res_reloaded = [(ent.text, ent.label_, ent.ent_id_) for ent in doc_reloaded.ents]
assert res == res_reloaded
def test_issue4665():
"""
conllu2docs should not raise an exception if the HEAD column contains an
underscore
"""
input_data = """
1 [ _ PUNCT -LRB- _ _ punct _ _
2 This _ DET DT _ _ det _ _
3 killing _ NOUN NN _ _ nsubj _ _
4 of _ ADP IN _ _ case _ _
5 a _ DET DT _ _ det _ _
6 respected _ ADJ JJ _ _ amod _ _
7 cleric _ NOUN NN _ _ nmod _ _
8 will _ AUX MD _ _ aux _ _
9 be _ AUX VB _ _ aux _ _
10 causing _ VERB VBG _ _ root _ _
11 us _ PRON PRP _ _ iobj _ _
12 trouble _ NOUN NN _ _ dobj _ _
13 for _ ADP IN _ _ case _ _
14 years _ NOUN NNS _ _ nmod _ _
15 to _ PART TO _ _ mark _ _
16 come _ VERB VB _ _ acl _ _
17 . _ PUNCT . _ _ punct _ _
18 ] _ PUNCT -RRB- _ _ punct _ _
"""
conllu2docs(input_data)
def test_issue4674():
"""Test that setting entities with overlapping identifiers does not mess up IO"""
nlp = English()
kb = KnowledgeBase(nlp.vocab, entity_vector_length=3)
vector1 = [0.9, 1.1, 1.01]
vector2 = [1.8, 2.25, 2.01]
with pytest.warns(UserWarning):
kb.set_entities(
entity_list=["Q1", "Q1"],
freq_list=[32, 111],
vector_list=[vector1, vector2],
)
assert kb.get_size_entities() == 1
# dumping to file & loading back in
with make_tempdir() as d:
dir_path = ensure_path(d)
if not dir_path.exists():
dir_path.mkdir()
file_path = dir_path / "kb"
kb.dump(str(file_path))
kb2 = KnowledgeBase(vocab=nlp.vocab, entity_vector_length=3)
kb2.load_bulk(str(file_path))
assert kb2.get_size_entities() == 1
def test_issue4707():
"""Tests that disabled component names are also excluded from nlp.from_disk
by default when loading a model.
"""
nlp = English()
nlp.add_pipe(nlp.create_pipe("sentencizer"))
nlp.add_pipe(nlp.create_pipe("entity_ruler"))
assert nlp.pipe_names == ["sentencizer", "entity_ruler"]
exclude = ["tokenizer", "sentencizer"]
with make_tempdir() as tmpdir:
nlp.to_disk(tmpdir, exclude=exclude)
new_nlp = load_model_from_path(tmpdir, disable=exclude)
assert "sentencizer" not in new_nlp.pipe_names
assert "entity_ruler" in new_nlp.pipe_names
@pytest.mark.filterwarnings("ignore::UserWarning")
def test_issue4725_1():
""" Ensure the pickling of the NER goes well"""
vocab = Vocab(vectors_name="test_vocab_add_vector")
nlp = English(vocab=vocab)
ner = nlp.create_pipe("ner", config={"min_action_freq": 342})
with make_tempdir() as tmp_path:
with (tmp_path / "ner.pkl").open("wb") as file_:
pickle.dump(ner, file_)
assert ner.cfg["min_action_freq"] == 342
with (tmp_path / "ner.pkl").open("rb") as file_:
ner2 = pickle.load(file_)
assert ner2.cfg["min_action_freq"] == 342
@pytest.mark.filterwarnings("ignore::UserWarning")
def test_issue4725_2():
# ensures that this runs correctly and doesn't hang or crash because of the global vectors
# if it does crash, it's usually because of calling 'spawn' for multiprocessing (e.g. on Windows)
vocab = Vocab(vectors_name="test_vocab_add_vector")
data = numpy.ndarray((5, 3), dtype="f")
data[0] = 1.0
data[1] = 2.0
vocab.set_vector("cat", data[0])
vocab.set_vector("dog", data[1])
nlp = English(vocab=vocab)
ner = nlp.create_pipe("ner")
nlp.add_pipe(ner)
nlp.begin_training()
docs = ["Kurt is in London."] * 10
for _ in nlp.pipe(docs, batch_size=2, n_process=2):
pass
def test_issue4849():
nlp = English()
ruler = EntityRuler(
nlp,
patterns=[
{"label": "PERSON", "pattern": "joe biden", "id": "joe-biden"},
{"label": "PERSON", "pattern": "bernie sanders", "id": "bernie-sanders"},
],
phrase_matcher_attr="LOWER",
)
nlp.add_pipe(ruler)
text = """
The left is starting to take aim at Democratic front-runner Joe Biden.
Sen. Bernie Sanders joined in her criticism: "There is no 'middle ground' when it comes to climate policy."
"""
# USING 1 PROCESS
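# ent_id stores the hash of the pattern's string ID, so a nonzero value means the ruler assigned an ID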
count_ents = 0
for doc in nlp.pipe([text], n_process=1):
count_ents += len([ent for ent in doc.ents if ent.ent_id > 0])
assert count_ents == 2
# USING 2 PROCESSES
count_ents = 0
for doc in nlp.pipe([text], n_process=2):
count_ents += len([ent for ent in doc.ents if ent.ent_id > 0])
assert count_ents == 2
class CustomPipe:
name = "my_pipe"
def __init__(self):
Span.set_extension("my_ext", getter=self._get_my_ext)
Doc.set_extension("my_ext", default=None)
def __call__(self, doc):
gathered_ext = []
for sent in doc.sents:
sent_ext = self._get_my_ext(sent)
sent._.set("my_ext", sent_ext)
gathered_ext.append(sent_ext)
doc._.set("my_ext", "\n".join(gathered_ext))
return doc
@staticmethod
def _get_my_ext(span):
return str(span.end)
def test_issue4903():
"""Ensure that this runs correctly and doesn't hang or crash on Windows /
macOS."""
nlp = English()
custom_component = CustomPipe()
nlp.add_pipe(nlp.create_pipe("sentencizer"))
nlp.add_pipe(custom_component, after="sentencizer")
text = ["I like bananas.", "Do you like them?", "No, I prefer wasabi."]
docs = list(nlp.pipe(text, n_process=2))
assert docs[0].text == "I like bananas."
assert docs[1].text == "Do you like them?"
assert docs[2].text == "No, I prefer wasabi."
def test_issue4924():
nlp = Language()
example = Example.from_dict(nlp.make_doc(""), {})
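# evaluating an Example built from an empty doc should not raise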
nlp.evaluate([example])

View File

@ -1,16 +0,0 @@
from spacy.tokens import Doc, DocBin
def test_issue4528(en_vocab):
"""Test that user_data is correctly serialized in DocBin."""
doc = Doc(en_vocab, words=["hello", "world"])
doc.user_data["foo"] = "bar"
# This is how extension attribute values are stored in the user data
doc.user_data[("._.", "foo", None, None)] = "bar"
doc_bin = DocBin(store_user_data=True)
doc_bin.add(doc)
doc_bin_bytes = doc_bin.to_bytes()
new_doc_bin = DocBin(store_user_data=True).from_bytes(doc_bin_bytes)
new_doc = list(new_doc_bin.get_docs(en_vocab))[0]
assert new_doc.user_data["foo"] == "bar"
assert new_doc.user_data[("._.", "foo", None, None)] == "bar"

View File

@ -1,11 +0,0 @@
import pytest
from spacy.gold import Example
@pytest.mark.parametrize(
"text,words", [("A'B C", ["A", "'", "B", "C"]), ("A-B", ["A-B"])]
)
def test_gold_misaligned(en_tokenizer, text, words):
doc = en_tokenizer(text)
Example.from_dict(doc, {"words": words})

View File

@ -1,35 +0,0 @@
from mock import Mock
from spacy.matcher import DependencyMatcher
from ..util import get_doc
def test_issue4590(en_vocab):
"""Test that matches param in on_match method are the same as matches run with no on_match method"""
pattern = [
{"SPEC": {"NODE_NAME": "jumped"}, "PATTERN": {"ORTH": "jumped"}},
{
"SPEC": {"NODE_NAME": "fox", "NBOR_RELOP": ">", "NBOR_NAME": "jumped"},
"PATTERN": {"ORTH": "fox"},
},
{
"SPEC": {"NODE_NAME": "quick", "NBOR_RELOP": ".", "NBOR_NAME": "jumped"},
"PATTERN": {"ORTH": "fox"},
},
]
on_match = Mock()
matcher = DependencyMatcher(en_vocab)
matcher.add("pattern", on_match, pattern)
text = "The quick brown fox jumped over the lazy fox"
heads = [3, 2, 1, 1, 0, -1, 2, 1, -3]
deps = ["det", "amod", "amod", "nsubj", "ROOT", "prep", "det", "amod", "pobj"]
doc = get_doc(en_vocab, text.split(), heads=heads, deps=deps)
matches = matcher(doc)
on_match_args = on_match.call_args
assert on_match_args[0][3] == matches

View File

@ -1,62 +0,0 @@
from spacy.lang.en import English
from spacy.pipeline import EntityRuler
from ..util import make_tempdir
def test_issue4651_with_phrase_matcher_attr():
"""Test that the EntityRuler PhraseMatcher is deserialize correctly using
the method from_disk when the EntityRuler argument phrase_matcher_attr is
specified.
"""
text = "Spacy is a python library for nlp"
nlp = English()
ruler = EntityRuler(nlp, phrase_matcher_attr="LOWER")
patterns = [{"label": "PYTHON_LIB", "pattern": "spacy", "id": "spaCy"}]
ruler.add_patterns(patterns)
nlp.add_pipe(ruler)
doc = nlp(text)
res = [(ent.text, ent.label_, ent.ent_id_) for ent in doc.ents]
nlp_reloaded = English()
with make_tempdir() as d:
file_path = d / "entityruler"
ruler.to_disk(file_path)
ruler_reloaded = EntityRuler(nlp_reloaded).from_disk(file_path)
nlp_reloaded.add_pipe(ruler_reloaded)
doc_reloaded = nlp_reloaded(text)
res_reloaded = [(ent.text, ent.label_, ent.ent_id_) for ent in doc_reloaded.ents]
assert res == res_reloaded
def test_issue4651_without_phrase_matcher_attr():
"""Test that the EntityRuler PhraseMatcher is deserialize correctly using
the method from_disk when the EntityRuler argument phrase_matcher_attr is
not specified.
"""
text = "Spacy is a python library for nlp"
nlp = English()
ruler = EntityRuler(nlp)
patterns = [{"label": "PYTHON_LIB", "pattern": "spacy", "id": "spaCy"}]
ruler.add_patterns(patterns)
nlp.add_pipe(ruler)
doc = nlp(text)
res = [(ent.text, ent.label_, ent.ent_id_) for ent in doc.ents]
nlp_reloaded = English()
with make_tempdir() as d:
file_path = d / "entityruler"
ruler.to_disk(file_path)
ruler_reloaded = EntityRuler(nlp_reloaded).from_disk(file_path)
nlp_reloaded.add_pipe(ruler_reloaded)
doc_reloaded = nlp_reloaded(text)
res_reloaded = [(ent.text, ent.label_, ent.ent_id_) for ent in doc_reloaded.ents]
assert res == res_reloaded

View File

@ -1,35 +0,0 @@
import pytest
# TODO
# from spacy.gold.converters.conllu2docs import conllu2docs
input_data = """
1 [ _ PUNCT -LRB- _ _ punct _ _
2 This _ DET DT _ _ det _ _
3 killing _ NOUN NN _ _ nsubj _ _
4 of _ ADP IN _ _ case _ _
5 a _ DET DT _ _ det _ _
6 respected _ ADJ JJ _ _ amod _ _
7 cleric _ NOUN NN _ _ nmod _ _
8 will _ AUX MD _ _ aux _ _
9 be _ AUX VB _ _ aux _ _
10 causing _ VERB VBG _ _ root _ _
11 us _ PRON PRP _ _ iobj _ _
12 trouble _ NOUN NN _ _ dobj _ _
13 for _ ADP IN _ _ case _ _
14 years _ NOUN NNS _ _ nmod _ _
15 to _ PART TO _ _ mark _ _
16 come _ VERB VB _ _ acl _ _
17 . _ PUNCT . _ _ punct _ _
18 ] _ PUNCT -RRB- _ _ punct _ _
"""
@pytest.mark.xfail
def test_issue4665():
"""
conllu2json should not raise an exception if the HEAD column contains an
underscore
"""
pass
# conllu2json(input_data)

View File

@ -1,36 +0,0 @@
import pytest
from spacy.kb import KnowledgeBase
from spacy.util import ensure_path
from spacy.lang.en import English
from ..util import make_tempdir
def test_issue4674():
"""Test that setting entities with overlapping identifiers does not mess up IO"""
nlp = English()
kb = KnowledgeBase(nlp.vocab, entity_vector_length=3)
vector1 = [0.9, 1.1, 1.01]
vector2 = [1.8, 2.25, 2.01]
with pytest.warns(UserWarning):
kb.set_entities(
entity_list=["Q1", "Q1"],
freq_list=[32, 111],
vector_list=[vector1, vector2],
)
assert kb.get_size_entities() == 1
# dumping to file & loading back in
with make_tempdir() as d:
dir_path = ensure_path(d)
if not dir_path.exists():
dir_path.mkdir()
file_path = dir_path / "kb"
kb.dump(str(file_path))
kb2 = KnowledgeBase(vocab=nlp.vocab, entity_vector_length=3)
kb2.load_bulk(str(file_path))
assert kb2.get_size_entities() == 1

View File

@ -1,20 +0,0 @@
from spacy.util import load_model_from_path
from spacy.lang.en import English
from ..util import make_tempdir
def test_issue4707():
"""Tests that disabled component names are also excluded from nlp.from_disk
by default when loading a model.
"""
nlp = English()
nlp.add_pipe(nlp.create_pipe("sentencizer"))
nlp.add_pipe(nlp.create_pipe("entity_ruler"))
assert nlp.pipe_names == ["sentencizer", "entity_ruler"]
exclude = ["tokenizer", "sentencizer"]
with make_tempdir() as tmpdir:
nlp.to_disk(tmpdir, exclude=exclude)
new_nlp = load_model_from_path(tmpdir, disable=exclude)
assert "sentencizer" not in new_nlp.pipe_names
assert "entity_ruler" in new_nlp.pipe_names

View File

@ -1,41 +0,0 @@
import pickle
import numpy
from spacy.lang.en import English
from spacy.vocab import Vocab
from spacy.tests.util import make_tempdir
def test_pickle_ner():
""" Ensure the pickling of the NER goes well"""
vocab = Vocab(vectors_name="test_vocab_add_vector")
nlp = English(vocab=vocab)
ner = nlp.create_pipe("ner", config={"min_action_freq": 342})
with make_tempdir() as tmp_path:
with (tmp_path / "ner.pkl").open("wb") as file_:
pickle.dump(ner, file_)
assert ner.cfg["min_action_freq"] == 342
with (tmp_path / "ner.pkl").open("rb") as file_:
ner2 = pickle.load(file_)
assert ner2.cfg["min_action_freq"] == 342
def test_issue4725():
# ensures that this runs correctly and doesn't hang or crash because of the global vectors
# if it does crash, it's usually because of calling 'spawn' for multiprocessing (e.g. on Windows)
vocab = Vocab(vectors_name="test_vocab_add_vector")
data = numpy.ndarray((5, 3), dtype="f")
data[0] = 1.0
data[1] = 2.0
vocab.set_vector("cat", data[0])
vocab.set_vector("dog", data[1])
nlp = English(vocab=vocab)
ner = nlp.create_pipe("ner")
nlp.add_pipe(ner)
nlp.begin_training()
docs = ["Kurt is in London."] * 10
for _ in nlp.pipe(docs, batch_size=2, n_process=2):
pass

View File

@ -1,34 +0,0 @@
from spacy.lang.en import English
from spacy.pipeline import EntityRuler
def test_issue4849():
nlp = English()
ruler = EntityRuler(
nlp,
patterns=[
{"label": "PERSON", "pattern": "joe biden", "id": "joe-biden"},
{"label": "PERSON", "pattern": "bernie sanders", "id": "bernie-sanders"},
],
phrase_matcher_attr="LOWER",
)
nlp.add_pipe(ruler)
text = """
The left is starting to take aim at Democratic front-runner Joe Biden.
Sen. Bernie Sanders joined in her criticism: "There is no 'middle ground' when it comes to climate policy."
"""
# USING 1 PROCESS
count_ents = 0
for doc in nlp.pipe([text], n_process=1):
count_ents += len([ent for ent in doc.ents if ent.ent_id > 0])
assert count_ents == 2
# USING 2 PROCESSES
count_ents = 0
for doc in nlp.pipe([text], n_process=2):
count_ents += len([ent for ent in doc.ents if ent.ent_id > 0])
assert count_ents == 2

View File

@ -1,40 +0,0 @@
from spacy.lang.en import English
from spacy.tokens import Span, Doc
class CustomPipe:
name = "my_pipe"
def __init__(self):
Span.set_extension("my_ext", getter=self._get_my_ext)
Doc.set_extension("my_ext", default=None)
def __call__(self, doc):
gathered_ext = []
for sent in doc.sents:
sent_ext = self._get_my_ext(sent)
sent._.set("my_ext", sent_ext)
gathered_ext.append(sent_ext)
doc._.set("my_ext", "\n".join(gathered_ext))
return doc
@staticmethod
def _get_my_ext(span):
return str(span.end)
def test_issue4903():
# ensures that this runs correctly and doesn't hang or crash on Windows / macOS
nlp = English()
custom_component = CustomPipe()
nlp.add_pipe(nlp.create_pipe("sentencizer"))
nlp.add_pipe(custom_component, after="sentencizer")
text = ["I like bananas.", "Do you like them?", "No, I prefer wasabi."]
docs = list(nlp.pipe(text, n_process=2))
assert docs[0].text == "I like bananas."
assert docs[1].text == "Do you like them?"
assert docs[2].text == "No, I prefer wasabi."

View File

@ -1,8 +0,0 @@
from spacy.gold import Example
from spacy.language import Language
def test_issue4924():
nlp = Language()
example = Example.from_dict(nlp.make_doc(""), {})
nlp.evaluate([example])

View File

@ -1,6 +1,8 @@
import pytest
from spacy.lang.en import English
@pytest.mark.filterwarnings("ignore::UserWarning")
def test_issue5152():
# Test that the comparison between a Span and a Token goes well
# There was a bug when the number of tokens in the span equaled the number of characters in the token (!)
@ -8,7 +10,6 @@ def test_issue5152():
text = nlp("Talk about being boring!")
text_var = nlp("Talk of being boring!")
y = nlp("Let")
span = text[0:3] # Talk about being
span_2 = text[0:3] # Talk about being
span_3 = text_var[0:3] # Talk of being

View File

@ -63,7 +63,8 @@ def tagger():
# need to add model for two reasons:
# 1. no model leads to error in serialization,
# 2. the affected line is the one for model serialization
tagger.begin_training(pipeline=nlp.pipeline)
with pytest.warns(UserWarning):
tagger.begin_training(pipeline=nlp.pipeline)
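# the UserWarning comes from calling begin_training without gold examples, which the fixture now asserts explicitly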
return tagger