Mirror of https://github.com/explosion/spaCy.git (synced 2025-01-13 18:56:36 +03:00)

Commit 8177f25b6c: Merge branch 'develop' of https://github.com/explosion/spaCy into develop
@@ -23,6 +23,7 @@ def test_issue2070():
     assert len(doc) == 11


+@pytest.mark.filterwarnings("ignore::UserWarning")
 def test_issue2179():
     """Test that spurious 'extra_labels' aren't created when initializing NER."""
     nlp = Italian()
@@ -134,6 +135,7 @@ def test_issue2464(en_vocab):
     assert len(matches) == 3


+@pytest.mark.filterwarnings("ignore::UserWarning")
 def test_issue2482():
     """Test we can serialize and deserialize a blank NER or parser model."""
     nlp = Italian()
@@ -138,13 +138,16 @@ def test_issue2782(text, lang_cls):
     assert doc[0].like_num


+@pytest.mark.filterwarnings("ignore::UserWarning")
 def test_issue2800():
     """Test issue that arises when too many labels are added to NER model.
     Used to cause segfault.
     """
     nlp = English()
     train_data = []
-    train_data.extend([Example.from_dict(nlp.make_doc("One sentence"), {"entities": []})])
+    train_data.extend(
+        [Example.from_dict(nlp.make_doc("One sentence"), {"entities": []})]
+    )
     entity_types = [str(i) for i in range(1000)]
     ner = nlp.create_pipe("ner")
     nlp.add_pipe(ner)
@@ -88,6 +88,7 @@ def test_issue3199():
     assert list(doc[0:3].noun_chunks) == []


+@pytest.mark.filterwarnings("ignore::UserWarning")
 def test_issue3209():
     """Test issue that occurred in spaCy nightly where NER labels were being
     mapped to classes incorrectly after loading the model, when the labels
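An aside, not part of the commit: each hunk above adds the same pytest marker, @pytest.mark.filterwarnings("ignore::UserWarning"), which silences UserWarning for that single test instead of filtering warnings globally. A minimal, self-contained sketch of the marker's behaviour (the test name here is invented for illustration):

import warnings

import pytest


@pytest.mark.filterwarnings("ignore::UserWarning")
def test_userwarning_is_ignored():
    # The filter applies only inside this test; other tests still see UserWarning.
    warnings.warn("expected but noisy", UserWarning)
    assert True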
spacy/tests/regression/test_issue3501-4000.py (new file, 472 lines)

@@ -0,0 +1,472 @@
import pytest
from spacy.language import Language
from spacy.vocab import Vocab
from spacy.pipeline import EntityRuler, DependencyParser
from spacy.pipeline.defaults import default_parser
from spacy import displacy, load
from spacy.displacy import parse_deps
from spacy.tokens import Doc, Token
from spacy.matcher import Matcher, PhraseMatcher
from spacy.errors import MatchPatternError
from spacy.util import minibatch
from spacy.gold import Example
from spacy.lang.hi import Hindi
from spacy.lang.es import Spanish
from spacy.lang.en import English
from spacy.attrs import IS_ALPHA
from thinc.api import compounding
import spacy
import srsly
import numpy

from ..util import make_tempdir, get_doc


@pytest.mark.parametrize("word", ["don't", "don’t", "I'd", "I’d"])
def test_issue3521(en_tokenizer, word):
    tok = en_tokenizer(word)[1]
    # 'not' and 'would' should be stopwords, also in their abbreviated forms
    assert tok.is_stop


def test_issue_3526_1(en_vocab):
    patterns = [
        {"label": "HELLO", "pattern": "hello world"},
        {"label": "BYE", "pattern": [{"LOWER": "bye"}, {"LOWER": "bye"}]},
        {"label": "HELLO", "pattern": [{"ORTH": "HELLO"}]},
        {"label": "COMPLEX", "pattern": [{"ORTH": "foo", "OP": "*"}]},
        {"label": "TECH_ORG", "pattern": "Apple", "id": "a1"},
    ]
    nlp = Language(vocab=en_vocab)
    ruler = EntityRuler(nlp, patterns=patterns, overwrite_ents=True)
    ruler_bytes = ruler.to_bytes()
    assert len(ruler) == len(patterns)
    assert len(ruler.labels) == 4
    assert ruler.overwrite
    new_ruler = EntityRuler(nlp)
    new_ruler = new_ruler.from_bytes(ruler_bytes)
    assert len(new_ruler) == len(ruler)
    assert len(new_ruler.labels) == 4
    assert new_ruler.overwrite == ruler.overwrite
    assert new_ruler.ent_id_sep == ruler.ent_id_sep


def test_issue_3526_2(en_vocab):
    patterns = [
        {"label": "HELLO", "pattern": "hello world"},
        {"label": "BYE", "pattern": [{"LOWER": "bye"}, {"LOWER": "bye"}]},
        {"label": "HELLO", "pattern": [{"ORTH": "HELLO"}]},
        {"label": "COMPLEX", "pattern": [{"ORTH": "foo", "OP": "*"}]},
        {"label": "TECH_ORG", "pattern": "Apple", "id": "a1"},
    ]
    nlp = Language(vocab=en_vocab)
    ruler = EntityRuler(nlp, patterns=patterns, overwrite_ents=True)
    bytes_old_style = srsly.msgpack_dumps(ruler.patterns)
    new_ruler = EntityRuler(nlp)
    new_ruler = new_ruler.from_bytes(bytes_old_style)
    assert len(new_ruler) == len(ruler)
    for pattern in ruler.patterns:
        assert pattern in new_ruler.patterns
    assert new_ruler.overwrite is not ruler.overwrite


def test_issue_3526_3(en_vocab):
    patterns = [
        {"label": "HELLO", "pattern": "hello world"},
        {"label": "BYE", "pattern": [{"LOWER": "bye"}, {"LOWER": "bye"}]},
        {"label": "HELLO", "pattern": [{"ORTH": "HELLO"}]},
        {"label": "COMPLEX", "pattern": [{"ORTH": "foo", "OP": "*"}]},
        {"label": "TECH_ORG", "pattern": "Apple", "id": "a1"},
    ]
    nlp = Language(vocab=en_vocab)
    ruler = EntityRuler(nlp, patterns=patterns, overwrite_ents=True)
    with make_tempdir() as tmpdir:
        out_file = tmpdir / "entity_ruler"
        srsly.write_jsonl(out_file.with_suffix(".jsonl"), ruler.patterns)
        new_ruler = EntityRuler(nlp).from_disk(out_file)
        for pattern in ruler.patterns:
            assert pattern in new_ruler.patterns
        assert len(new_ruler) == len(ruler)
        assert new_ruler.overwrite is not ruler.overwrite


@pytest.mark.filterwarnings("ignore::UserWarning")
def test_issue_3526_4(en_vocab):
    nlp = Language(vocab=en_vocab)
    ruler = EntityRuler(nlp, overwrite_ents=True)
    ruler.add_patterns([{"label": "ORG", "pattern": "Apple"}])
    nlp.add_pipe(ruler)
    with make_tempdir() as tmpdir:
        nlp.to_disk(tmpdir)
        ruler = nlp.get_pipe("entity_ruler")
        assert ruler.patterns == [{"label": "ORG", "pattern": "Apple"}]
        assert ruler.overwrite is True
        nlp2 = load(tmpdir)
        new_ruler = nlp2.get_pipe("entity_ruler")
        assert new_ruler.patterns == [{"label": "ORG", "pattern": "Apple"}]
        assert new_ruler.overwrite is True
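Not part of the new file: test_issue_3526_3 above leans on the EntityRuler's on-disk pattern format, which is plain JSONL (one pattern dict per line) written with srsly.write_jsonl. A small sketch of that round trip, using a hypothetical temporary path:

import srsly

patterns = [{"label": "TECH_ORG", "pattern": "Apple", "id": "a1"}]
srsly.write_jsonl("/tmp/entity_ruler.jsonl", patterns)  # hypothetical path
assert list(srsly.read_jsonl("/tmp/entity_ruler.jsonl")) == patterns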
def test_issue3531():
    """Test that displaCy renderer doesn't require "settings" key."""
    example_dep = {
        "words": [
            {"text": "But", "tag": "CCONJ"},
            {"text": "Google", "tag": "PROPN"},
            {"text": "is", "tag": "VERB"},
            {"text": "starting", "tag": "VERB"},
            {"text": "from", "tag": "ADP"},
            {"text": "behind.", "tag": "ADV"},
        ],
        "arcs": [
            {"start": 0, "end": 3, "label": "cc", "dir": "left"},
            {"start": 1, "end": 3, "label": "nsubj", "dir": "left"},
            {"start": 2, "end": 3, "label": "aux", "dir": "left"},
            {"start": 3, "end": 4, "label": "prep", "dir": "right"},
            {"start": 4, "end": 5, "label": "pcomp", "dir": "right"},
        ],
    }
    example_ent = {
        "text": "But Google is starting from behind.",
        "ents": [{"start": 4, "end": 10, "label": "ORG"}],
    }
    dep_html = displacy.render(example_dep, style="dep", manual=True)
    assert dep_html
    ent_html = displacy.render(example_ent, style="ent", manual=True)
    assert ent_html


def test_issue3540(en_vocab):
    words = ["I", "live", "in", "NewYork", "right", "now"]
    tensor = numpy.asarray(
        [[1.0, 1.1], [2.0, 2.1], [3.0, 3.1], [4.0, 4.1], [5.0, 5.1], [6.0, 6.1]],
        dtype="f",
    )
    doc = Doc(en_vocab, words=words)
    doc.tensor = tensor
    gold_text = ["I", "live", "in", "NewYork", "right", "now"]
    assert [token.text for token in doc] == gold_text
    gold_lemma = ["I", "live", "in", "NewYork", "right", "now"]
    assert [token.lemma_ for token in doc] == gold_lemma
    vectors_1 = [token.vector for token in doc]
    assert len(vectors_1) == len(doc)

    with doc.retokenize() as retokenizer:
        heads = [(doc[3], 1), doc[2]]
        attrs = {"POS": ["PROPN", "PROPN"], "DEP": ["pobj", "compound"]}
        retokenizer.split(doc[3], ["New", "York"], heads=heads, attrs=attrs)

    gold_text = ["I", "live", "in", "New", "York", "right", "now"]
    assert [token.text for token in doc] == gold_text
    gold_lemma = ["I", "live", "in", "New", "York", "right", "now"]
    assert [token.lemma_ for token in doc] == gold_lemma
    vectors_2 = [token.vector for token in doc]
    assert len(vectors_2) == len(doc)
    assert vectors_1[0].tolist() == vectors_2[0].tolist()
    assert vectors_1[1].tolist() == vectors_2[1].tolist()
    assert vectors_1[2].tolist() == vectors_2[2].tolist()
    assert vectors_1[4].tolist() == vectors_2[5].tolist()
    assert vectors_1[5].tolist() == vectors_2[6].tolist()
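Not part of the new file: the heads passed to retokenizer.split() in test_issue3540 use spaCy's (token, subtoken_index) form, where the integer selects one of the new subtokens as the head. A stripped-down sketch of the same call on a blank English pipeline (assumed setup, not taken from the commit):

from spacy.lang.en import English

nlp = English()
doc = nlp.make_doc("I live in NewYork right now")
with doc.retokenize() as retokenizer:
    # "New" attaches to the second new subtoken ("York"); "York" attaches to "in".
    heads = [(doc[3], 1), doc[2]]
    retokenizer.split(doc[3], ["New", "York"], heads=heads)
assert [t.text for t in doc] == ["I", "live", "in", "New", "York", "right", "now"]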
def test_issue3549(en_vocab):
    """Test that match pattern validation doesn't raise on empty errors."""
    matcher = Matcher(en_vocab, validate=True)
    pattern = [{"LOWER": "hello"}, {"LOWER": "world"}]
    matcher.add("GOOD", [pattern])
    with pytest.raises(MatchPatternError):
        matcher.add("BAD", [[{"X": "Y"}]])


@pytest.mark.xfail
def test_issue3555(en_vocab):
    """Test that custom extensions with default None don't break matcher."""
    Token.set_extension("issue3555", default=None)
    matcher = Matcher(en_vocab)
    pattern = [{"LEMMA": "have"}, {"_": {"issue3555": True}}]
    matcher.add("TEST", [pattern])
    doc = Doc(en_vocab, words=["have", "apple"])
    matcher(doc)


def test_issue3611():
    """ Test whether adding n-grams in the textcat works even when n > token length of some docs """
    unique_classes = ["offensive", "inoffensive"]
    x_train = [
        "This is an offensive text",
        "This is the second offensive text",
        "inoff",
    ]
    y_train = ["offensive", "offensive", "inoffensive"]
    nlp = spacy.blank("en")
    # preparing the data
    train_data = []
    for text, train_instance in zip(x_train, y_train):
        cat_dict = {label: label == train_instance for label in unique_classes}
        train_data.append(Example.from_dict(nlp.make_doc(text), {"cats": cat_dict}))
    # add a text categorizer component
    textcat = nlp.create_pipe(
        "textcat",
        config={"exclusive_classes": True, "architecture": "bow", "ngram_size": 2},
    )
    for label in unique_classes:
        textcat.add_label(label)
    nlp.add_pipe(textcat, last=True)
    # training the network
    with nlp.select_pipes(enable="textcat"):
        optimizer = nlp.begin_training(X=x_train, Y=y_train)
        for i in range(3):
            losses = {}
            batches = minibatch(train_data, size=compounding(4.0, 32.0, 1.001))

            for batch in batches:
                nlp.update(
                    examples=batch, sgd=optimizer, drop=0.1, losses=losses,
                )
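Not part of the new file: the batch sizes in the loop above come from thinc's compounding(4.0, 32.0, 1.001), a schedule that starts at 4.0 and grows by a factor of 1.001 per step, capped at 32.0; minibatch() reads one value per batch. A rough stand-in re-implementation of that behaviour, for illustration only:

def compounding_schedule(start, stop, compound):
    # Illustrative re-implementation of the schedule's semantics, not thinc's code.
    curr = float(start)
    while True:
        yield min(curr, stop)
        curr *= compound


sizes = compounding_schedule(4.0, 32.0, 1.001)
print([round(next(sizes), 3) for _ in range(3)])  # [4.0, 4.004, 4.008]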
def test_issue3625():
    """Test that default punctuation rules applies to hindi unicode characters"""
    nlp = Hindi()
    doc = nlp("hi. how हुए. होटल, होटल")
    expected = ["hi", ".", "how", "हुए", ".", "होटल", ",", "होटल"]
    assert [token.text for token in doc] == expected


def test_issue3803():
    """Test that spanish num-like tokens have True for like_num attribute."""
    nlp = Spanish()
    text = "2 dos 1000 mil 12 doce"
    doc = nlp(text)

    assert [t.like_num for t in doc] == [True, True, True, True, True, True]


@pytest.mark.filterwarnings("ignore::UserWarning")
def test_issue3830_no_subtok():
    """Test that the parser doesn't have subtok label if not learn_tokens"""
    config = {
        "learn_tokens": False,
        "min_action_freq": 30,
        "beam_width": 1,
        "beam_update_prob": 1.0,
    }
    parser = DependencyParser(Vocab(), default_parser(), **config)
    parser.add_label("nsubj")
    assert "subtok" not in parser.labels
    parser.begin_training(lambda: [])
    assert "subtok" not in parser.labels


@pytest.mark.filterwarnings("ignore::UserWarning")
def test_issue3830_with_subtok():
    """Test that the parser does have subtok label if learn_tokens=True."""
    config = {
        "learn_tokens": True,
        "min_action_freq": 30,
        "beam_width": 1,
        "beam_update_prob": 1.0,
    }
    parser = DependencyParser(Vocab(), default_parser(), **config)
    parser.add_label("nsubj")
    assert "subtok" not in parser.labels
    parser.begin_training(lambda: [])
    assert "subtok" in parser.labels


def test_issue3839(en_vocab):
    """Test that match IDs returned by the matcher are correct, are in the string """
    doc = Doc(en_vocab, words=["terrific", "group", "of", "people"])
    matcher = Matcher(en_vocab)
    match_id = "PATTERN"
    pattern1 = [{"LOWER": "terrific"}, {"OP": "?"}, {"LOWER": "group"}]
    pattern2 = [{"LOWER": "terrific"}, {"OP": "?"}, {"OP": "?"}, {"LOWER": "group"}]
    matcher.add(match_id, [pattern1])
    matches = matcher(doc)
    assert matches[0][0] == en_vocab.strings[match_id]
    matcher = Matcher(en_vocab)
    matcher.add(match_id, [pattern2])
    matches = matcher(doc)
    assert matches[0][0] == en_vocab.strings[match_id]


@pytest.mark.parametrize(
    "sentence",
    [
        "The story was to the effect that a young American student recently called on Professor Christlieb with a letter of introduction.",
        "The next month Barry Siddall joined Stoke City on a free transfer, after Chris Pearce had established himself as the Vale's #1.",
        "The next month Barry Siddall joined Stoke City on a free transfer, after Chris Pearce had established himself as the Vale's number one",
        "Indeed, making the one who remains do all the work has installed him into a position of such insolent tyranny, it will take a month at least to reduce him to his proper proportions.",
        "It was a missed assignment, but it shouldn't have resulted in a turnover ...",
    ],
)
def test_issue3869(sentence):
    """Test that the Doc's count_by function works consistently"""
    nlp = English()
    doc = nlp(sentence)
    count = 0
    for token in doc:
        count += token.is_alpha
    assert count == doc.count_by(IS_ALPHA).get(1, 0)
def test_issue3879(en_vocab):
    doc = Doc(en_vocab, words=["This", "is", "a", "test", "."])
    assert len(doc) == 5
    pattern = [{"ORTH": "This", "OP": "?"}, {"OP": "?"}, {"ORTH": "test"}]
    matcher = Matcher(en_vocab)
    matcher.add("TEST", [pattern])
    assert len(matcher(doc)) == 2  # fails because of a FP match 'is a test'


@pytest.mark.filterwarnings("ignore::UserWarning")
def test_issue3880():
    """Test that `nlp.pipe()` works when an empty string ends the batch.

    Fixed in v7.0.5 of Thinc.
    """
    texts = ["hello", "world", "", ""]
    nlp = English()
    nlp.add_pipe(nlp.create_pipe("parser"))
    nlp.add_pipe(nlp.create_pipe("ner"))
    nlp.add_pipe(nlp.create_pipe("tagger"))
    nlp.get_pipe("parser").add_label("dep")
    nlp.get_pipe("ner").add_label("PERSON")
    nlp.get_pipe("tagger").add_label("NN")
    nlp.begin_training()
    for doc in nlp.pipe(texts):
        pass


def test_issue3882(en_vocab):
    """Test that displaCy doesn't serialize the doc.user_data when making a
    copy of the Doc.
    """
    doc = Doc(en_vocab, words=["Hello", "world"])
    doc.is_parsed = True
    doc.user_data["test"] = set()
    parse_deps(doc)


def test_issue3951(en_vocab):
    """Test that combinations of optional rules are matched correctly."""
    matcher = Matcher(en_vocab)
    pattern = [
        {"LOWER": "hello"},
        {"LOWER": "this", "OP": "?"},
        {"OP": "?"},
        {"LOWER": "world"},
    ]
    matcher.add("TEST", [pattern])
    doc = Doc(en_vocab, words=["Hello", "my", "new", "world"])
    matches = matcher(doc)
    assert len(matches) == 0


def test_issue3959():
    """ Ensure that a modified pos attribute is serialized correctly."""
    nlp = English()
    doc = nlp(
        "displaCy uses JavaScript, SVG and CSS to show you how computers understand language"
    )
    assert doc[0].pos_ == ""
    doc[0].pos_ = "NOUN"
    assert doc[0].pos_ == "NOUN"
    # usually this is already True when starting from proper models instead of blank English
    doc.is_tagged = True
    with make_tempdir() as tmp_dir:
        file_path = tmp_dir / "my_doc"
        doc.to_disk(file_path)
        doc2 = nlp("")
        doc2.from_disk(file_path)
        assert doc2[0].pos_ == "NOUN"


def test_issue3962(en_vocab):
    """ Ensure that as_doc does not result in out-of-bound access of tokens.
    This is achieved by setting the head to itself if it would lie out of the span otherwise."""
    # fmt: off
    words = ["He", "jests", "at", "scars", ",", "that", "never", "felt", "a", "wound", "."]
    heads = [1, 6, -1, -1, 3, 2, 1, 0, 1, -2, -3]
    deps = ["nsubj", "ccomp", "prep", "pobj", "punct", "nsubj", "neg", "ROOT", "det", "dobj", "punct"]
    # fmt: on
    doc = get_doc(en_vocab, words=words, heads=heads, deps=deps)
    span2 = doc[1:5]  # "jests at scars ,"
    doc2 = span2.as_doc()
    doc2_json = doc2.to_json()
    assert doc2_json
    # head set to itself, being the new artificial root
    assert doc2[0].head.text == "jests"
    assert doc2[0].dep_ == "dep"
    assert doc2[1].head.text == "jests"
    assert doc2[1].dep_ == "prep"
    assert doc2[2].head.text == "at"
    assert doc2[2].dep_ == "pobj"
    assert doc2[3].head.text == "jests"  # head set to the new artificial root
    assert doc2[3].dep_ == "dep"
    # We should still have 1 sentence
    assert len(list(doc2.sents)) == 1
    span3 = doc[6:9]  # "never felt a"
    doc3 = span3.as_doc()
    doc3_json = doc3.to_json()
    assert doc3_json
    assert doc3[0].head.text == "felt"
    assert doc3[0].dep_ == "neg"
    assert doc3[1].head.text == "felt"
    assert doc3[1].dep_ == "ROOT"
    assert doc3[2].head.text == "felt"  # head set to ancestor
    assert doc3[2].dep_ == "dep"
    # We should still have 1 sentence as "a" can be attached to "felt" instead of "wound"
    assert len(list(doc3.sents)) == 1


def test_issue3962_long(en_vocab):
    """ Ensure that as_doc does not result in out-of-bound access of tokens.
    This is achieved by setting the head to itself if it would lie out of the span otherwise."""
    # fmt: off
    words = ["He", "jests", "at", "scars", ".", "They", "never", "felt", "a", "wound", "."]
    heads = [1, 0, -1, -1, -3, 2, 1, 0, 1, -2, -3]
    deps = ["nsubj", "ROOT", "prep", "pobj", "punct", "nsubj", "neg", "ROOT", "det", "dobj", "punct"]
    # fmt: on
    two_sent_doc = get_doc(en_vocab, words=words, heads=heads, deps=deps)
    span2 = two_sent_doc[1:7]  # "jests at scars. They never"
    doc2 = span2.as_doc()
    doc2_json = doc2.to_json()
    assert doc2_json
    # head set to itself, being the new artificial root (in sentence 1)
    assert doc2[0].head.text == "jests"
    assert doc2[0].dep_ == "ROOT"
    assert doc2[1].head.text == "jests"
    assert doc2[1].dep_ == "prep"
    assert doc2[2].head.text == "at"
    assert doc2[2].dep_ == "pobj"
    assert doc2[3].head.text == "jests"
    assert doc2[3].dep_ == "punct"
    # head set to itself, being the new artificial root (in sentence 2)
    assert doc2[4].head.text == "They"
    assert doc2[4].dep_ == "dep"
    # head set to the new artificial head (in sentence 2)
    assert doc2[4].head.text == "They"
    assert doc2[4].dep_ == "dep"
    # We should still have 2 sentences
    sents = list(doc2.sents)
    assert len(sents) == 2
    assert sents[0].text == "jests at scars ."
    assert sents[1].text == "They never"


def test_issue3972(en_vocab):
    """Test that the PhraseMatcher returns duplicates for duplicate match IDs.
    """
    matcher = PhraseMatcher(en_vocab)
    matcher.add("A", [Doc(en_vocab, words=["New", "York"])])
    matcher.add("B", [Doc(en_vocab, words=["New", "York"])])
    doc = Doc(en_vocab, words=["I", "live", "in", "New", "York"])
    matches = matcher(doc)

    assert len(matches) == 2

    # We should have a match for each of the two rules
    found_ids = [en_vocab.strings[ent_id] for (ent_id, _, _) in matches]
    assert "A" in found_ids
    assert "B" in found_ids
The remaining hunks delete the old per-issue test files whose contents were consolidated into the new ranged files in this commit.

@@ -1,8 +0,0 @@
import pytest


@pytest.mark.parametrize("word", ["don't", "don’t", "I'd", "I’d"])
def test_issue3521(en_tokenizer, word):
    tok = en_tokenizer(word)[1]
    # 'not' and 'would' should be stopwords, also in their abbreviated forms
    assert tok.is_stop

@@ -1,85 +0,0 @@
import pytest
from spacy.tokens import Span
from spacy.language import Language
from spacy.pipeline import EntityRuler
from spacy import load
import srsly

from ..util import make_tempdir


@pytest.fixture
def patterns():
    return [
        {"label": "HELLO", "pattern": "hello world"},
        {"label": "BYE", "pattern": [{"LOWER": "bye"}, {"LOWER": "bye"}]},
        {"label": "HELLO", "pattern": [{"ORTH": "HELLO"}]},
        {"label": "COMPLEX", "pattern": [{"ORTH": "foo", "OP": "*"}]},
        {"label": "TECH_ORG", "pattern": "Apple", "id": "a1"},
    ]


@pytest.fixture
def add_ent():
    def add_ent_component(doc):
        doc.ents = [Span(doc, 0, 3, label=doc.vocab.strings["ORG"])]
        return doc

    return add_ent_component


def test_entity_ruler_existing_overwrite_serialize_bytes(patterns, en_vocab):
    nlp = Language(vocab=en_vocab)
    ruler = EntityRuler(nlp, patterns=patterns, overwrite_ents=True)
    ruler_bytes = ruler.to_bytes()
    assert len(ruler) == len(patterns)
    assert len(ruler.labels) == 4
    assert ruler.overwrite
    new_ruler = EntityRuler(nlp)
    new_ruler = new_ruler.from_bytes(ruler_bytes)
    assert len(new_ruler) == len(ruler)
    assert len(new_ruler.labels) == 4
    assert new_ruler.overwrite == ruler.overwrite
    assert new_ruler.ent_id_sep == ruler.ent_id_sep


def test_entity_ruler_existing_bytes_old_format_safe(patterns, en_vocab):
    nlp = Language(vocab=en_vocab)
    ruler = EntityRuler(nlp, patterns=patterns, overwrite_ents=True)
    bytes_old_style = srsly.msgpack_dumps(ruler.patterns)
    new_ruler = EntityRuler(nlp)
    new_ruler = new_ruler.from_bytes(bytes_old_style)
    assert len(new_ruler) == len(ruler)
    for pattern in ruler.patterns:
        assert pattern in new_ruler.patterns
    assert new_ruler.overwrite is not ruler.overwrite


def test_entity_ruler_from_disk_old_format_safe(patterns, en_vocab):
    nlp = Language(vocab=en_vocab)
    ruler = EntityRuler(nlp, patterns=patterns, overwrite_ents=True)
    with make_tempdir() as tmpdir:
        out_file = tmpdir / "entity_ruler"
        srsly.write_jsonl(out_file.with_suffix(".jsonl"), ruler.patterns)
        new_ruler = EntityRuler(nlp).from_disk(out_file)
        for pattern in ruler.patterns:
            assert pattern in new_ruler.patterns
        assert len(new_ruler) == len(ruler)
        assert new_ruler.overwrite is not ruler.overwrite


def test_entity_ruler_in_pipeline_from_issue(patterns, en_vocab):
    nlp = Language(vocab=en_vocab)
    ruler = EntityRuler(nlp, overwrite_ents=True)

    ruler.add_patterns([{"label": "ORG", "pattern": "Apple"}])
    nlp.add_pipe(ruler)
    with make_tempdir() as tmpdir:
        nlp.to_disk(tmpdir)
        ruler = nlp.get_pipe("entity_ruler")
        assert ruler.patterns == [{"label": "ORG", "pattern": "Apple"}]
        assert ruler.overwrite is True
        nlp2 = load(tmpdir)
        new_ruler = nlp2.get_pipe("entity_ruler")
        assert new_ruler.patterns == [{"label": "ORG", "pattern": "Apple"}]
        assert new_ruler.overwrite is True
@@ -1,30 +0,0 @@
from spacy import displacy


def test_issue3531():
    """Test that displaCy renderer doesn't require "settings" key."""
    example_dep = {
        "words": [
            {"text": "But", "tag": "CCONJ"},
            {"text": "Google", "tag": "PROPN"},
            {"text": "is", "tag": "VERB"},
            {"text": "starting", "tag": "VERB"},
            {"text": "from", "tag": "ADP"},
            {"text": "behind.", "tag": "ADV"},
        ],
        "arcs": [
            {"start": 0, "end": 3, "label": "cc", "dir": "left"},
            {"start": 1, "end": 3, "label": "nsubj", "dir": "left"},
            {"start": 2, "end": 3, "label": "aux", "dir": "left"},
            {"start": 3, "end": 4, "label": "prep", "dir": "right"},
            {"start": 4, "end": 5, "label": "pcomp", "dir": "right"},
        ],
    }
    example_ent = {
        "text": "But Google is starting from behind.",
        "ents": [{"start": 4, "end": 10, "label": "ORG"}],
    }
    dep_html = displacy.render(example_dep, style="dep", manual=True)
    assert dep_html
    ent_html = displacy.render(example_ent, style="ent", manual=True)
    assert ent_html

@@ -1,44 +0,0 @@
from spacy.tokens import Doc

import numpy as np


def test_issue3540(en_vocab):

    words = ["I", "live", "in", "NewYork", "right", "now"]
    tensor = np.asarray(
        [[1.0, 1.1], [2.0, 2.1], [3.0, 3.1], [4.0, 4.1], [5.0, 5.1], [6.0, 6.1]],
        dtype="f",
    )
    doc = Doc(en_vocab, words=words)
    doc.tensor = tensor

    gold_text = ["I", "live", "in", "NewYork", "right", "now"]
    assert [token.text for token in doc] == gold_text

    gold_lemma = ["I", "live", "in", "NewYork", "right", "now"]
    assert [token.lemma_ for token in doc] == gold_lemma

    vectors_1 = [token.vector for token in doc]
    assert len(vectors_1) == len(doc)

    with doc.retokenize() as retokenizer:
        heads = [(doc[3], 1), doc[2]]
        attrs = {"POS": ["PROPN", "PROPN"], "DEP": ["pobj", "compound"]}
        retokenizer.split(doc[3], ["New", "York"], heads=heads, attrs=attrs)

    gold_text = ["I", "live", "in", "New", "York", "right", "now"]
    assert [token.text for token in doc] == gold_text

    gold_lemma = ["I", "live", "in", "New", "York", "right", "now"]
    assert [token.lemma_ for token in doc] == gold_lemma

    vectors_2 = [token.vector for token in doc]
    assert len(vectors_2) == len(doc)

    assert vectors_1[0].tolist() == vectors_2[0].tolist()
    assert vectors_1[1].tolist() == vectors_2[1].tolist()
    assert vectors_1[2].tolist() == vectors_2[2].tolist()

    assert vectors_1[4].tolist() == vectors_2[5].tolist()
    assert vectors_1[5].tolist() == vectors_2[6].tolist()
@@ -1,12 +0,0 @@
import pytest
from spacy.matcher import Matcher
from spacy.errors import MatchPatternError


def test_issue3549(en_vocab):
    """Test that match pattern validation doesn't raise on empty errors."""
    matcher = Matcher(en_vocab, validate=True)
    pattern = [{"LOWER": "hello"}, {"LOWER": "world"}]
    matcher.add("GOOD", [pattern])
    with pytest.raises(MatchPatternError):
        matcher.add("BAD", [[{"X": "Y"}]])

@@ -1,14 +0,0 @@
import pytest
from spacy.tokens import Doc, Token
from spacy.matcher import Matcher


@pytest.mark.xfail
def test_issue3555(en_vocab):
    """Test that custom extensions with default None don't break matcher."""
    Token.set_extension("issue3555", default=None)
    matcher = Matcher(en_vocab)
    pattern = [{"LEMMA": "have"}, {"_": {"issue3555": True}}]
    matcher.add("TEST", [pattern])
    doc = Doc(en_vocab, words=["have", "apple"])
    matcher(doc)

@@ -1,45 +0,0 @@
import spacy
from spacy.util import minibatch
from thinc.api import compounding
from spacy.gold import Example


def test_issue3611():
    """ Test whether adding n-grams in the textcat works even when n > token length of some docs """
    unique_classes = ["offensive", "inoffensive"]
    x_train = [
        "This is an offensive text",
        "This is the second offensive text",
        "inoff",
    ]
    y_train = ["offensive", "offensive", "inoffensive"]

    nlp = spacy.blank("en")

    # preparing the data
    train_data = []
    for text, train_instance in zip(x_train, y_train):
        cat_dict = {label: label == train_instance for label in unique_classes}
        train_data.append(Example.from_dict(nlp.make_doc(text), {"cats": cat_dict}))

    # add a text categorizer component
    textcat = nlp.create_pipe(
        "textcat",
        config={"exclusive_classes": True, "architecture": "bow", "ngram_size": 2},
    )

    for label in unique_classes:
        textcat.add_label(label)
    nlp.add_pipe(textcat, last=True)

    # training the network
    with nlp.select_pipes(enable="textcat"):
        optimizer = nlp.begin_training(X=x_train, Y=y_train)
        for i in range(3):
            losses = {}
            batches = minibatch(train_data, size=compounding(4.0, 32.0, 1.001))

            for batch in batches:
                nlp.update(
                    examples=batch, sgd=optimizer, drop=0.1, losses=losses,
                )
@@ -1,9 +0,0 @@
from spacy.lang.hi import Hindi


def test_issue3625():
    """Test that default punctuation rules applies to hindi unicode characters"""
    nlp = Hindi()
    doc = nlp("hi. how हुए. होटल, होटल")
    expected = ["hi", ".", "how", "हुए", ".", "होटल", ",", "होटल"]
    assert [token.text for token in doc] == expected

@@ -1,10 +0,0 @@
from spacy.lang.es import Spanish


def test_issue3803():
    """Test that spanish num-like tokens have True for like_num attribute."""
    nlp = Spanish()
    text = "2 dos 1000 mil 12 doce"
    doc = nlp(text)

    assert [t.like_num for t in doc] == [True, True, True, True, True, True]

@@ -1,34 +0,0 @@
from spacy.pipeline.pipes import DependencyParser
from spacy.vocab import Vocab

from spacy.pipeline.defaults import default_parser


def test_issue3830_no_subtok():
    """Test that the parser doesn't have subtok label if not learn_tokens"""
    config = {
        "learn_tokens": False,
        "min_action_freq": 30,
        "beam_width": 1,
        "beam_update_prob": 1.0,
    }
    parser = DependencyParser(Vocab(), default_parser(), **config)
    parser.add_label("nsubj")
    assert "subtok" not in parser.labels
    parser.begin_training(lambda: [])
    assert "subtok" not in parser.labels


def test_issue3830_with_subtok():
    """Test that the parser does have subtok label if learn_tokens=True."""
    config = {
        "learn_tokens": True,
        "min_action_freq": 30,
        "beam_width": 1,
        "beam_update_prob": 1.0,
    }
    parser = DependencyParser(Vocab(), default_parser(), **config)
    parser.add_label("nsubj")
    assert "subtok" not in parser.labels
    parser.begin_training(lambda: [])
    assert "subtok" in parser.labels

@@ -1,18 +0,0 @@
from spacy.matcher import Matcher
from spacy.tokens import Doc


def test_issue3839(en_vocab):
    """Test that match IDs returned by the matcher are correct, are in the string """
    doc = Doc(en_vocab, words=["terrific", "group", "of", "people"])
    matcher = Matcher(en_vocab)
    match_id = "PATTERN"
    pattern1 = [{"LOWER": "terrific"}, {"OP": "?"}, {"LOWER": "group"}]
    pattern2 = [{"LOWER": "terrific"}, {"OP": "?"}, {"OP": "?"}, {"LOWER": "group"}]
    matcher.add(match_id, [pattern1])
    matches = matcher(doc)
    assert matches[0][0] == en_vocab.strings[match_id]
    matcher = Matcher(en_vocab)
    matcher.add(match_id, [pattern2])
    matches = matcher(doc)
    assert matches[0][0] == en_vocab.strings[match_id]

@@ -1,25 +0,0 @@
import pytest
from spacy.attrs import IS_ALPHA
from spacy.lang.en import English


@pytest.mark.parametrize(
    "sentence",
    [
        "The story was to the effect that a young American student recently called on Professor Christlieb with a letter of introduction.",
        "The next month Barry Siddall joined Stoke City on a free transfer, after Chris Pearce had established himself as the Vale's #1.",
        "The next month Barry Siddall joined Stoke City on a free transfer, after Chris Pearce had established himself as the Vale's number one",
        "Indeed, making the one who remains do all the work has installed him into a position of such insolent tyranny, it will take a month at least to reduce him to his proper proportions.",
        "It was a missed assignment, but it shouldn't have resulted in a turnover ...",
    ],
)
def test_issue3869(sentence):
    """Test that the Doc's count_by function works consistently"""
    nlp = English()
    doc = nlp(sentence)

    count = 0
    for token in doc:
        count += token.is_alpha

    assert count == doc.count_by(IS_ALPHA).get(1, 0)
@@ -1,11 +0,0 @@
from spacy.matcher import Matcher
from spacy.tokens import Doc


def test_issue3879(en_vocab):
    doc = Doc(en_vocab, words=["This", "is", "a", "test", "."])
    assert len(doc) == 5
    pattern = [{"ORTH": "This", "OP": "?"}, {"OP": "?"}, {"ORTH": "test"}]
    matcher = Matcher(en_vocab)
    matcher.add("TEST", [pattern])
    assert len(matcher(doc)) == 2  # fails because of a FP match 'is a test'

@@ -1,21 +0,0 @@
from spacy.lang.en import English
import pytest


@pytest.mark.filterwarnings("ignore::UserWarning")
def test_issue3880():
    """Test that `nlp.pipe()` works when an empty string ends the batch.

    Fixed in v7.0.5 of Thinc.
    """
    texts = ["hello", "world", "", ""]
    nlp = English()
    nlp.add_pipe(nlp.create_pipe("parser"))
    nlp.add_pipe(nlp.create_pipe("ner"))
    nlp.add_pipe(nlp.create_pipe("tagger"))
    nlp.get_pipe("parser").add_label("dep")
    nlp.get_pipe("ner").add_label("PERSON")
    nlp.get_pipe("tagger").add_label("NN")
    nlp.begin_training()
    for doc in nlp.pipe(texts):
        pass

@@ -1,12 +0,0 @@
from spacy.displacy import parse_deps
from spacy.tokens import Doc


def test_issue3882(en_vocab):
    """Test that displaCy doesn't serialize the doc.user_data when making a
    copy of the Doc.
    """
    doc = Doc(en_vocab, words=["Hello", "world"])
    doc.is_parsed = True
    doc.user_data["test"] = set()
    parse_deps(doc)

@@ -1,17 +0,0 @@
from spacy.matcher import Matcher
from spacy.tokens import Doc


def test_issue3951(en_vocab):
    """Test that combinations of optional rules are matched correctly."""
    matcher = Matcher(en_vocab)
    pattern = [
        {"LOWER": "hello"},
        {"LOWER": "this", "OP": "?"},
        {"OP": "?"},
        {"LOWER": "world"},
    ]
    matcher.add("TEST", [pattern])
    doc = Doc(en_vocab, words=["Hello", "my", "new", "world"])
    matches = matcher(doc)
    assert len(matches) == 0

@@ -1,26 +0,0 @@
from spacy.lang.en import English
from ..util import make_tempdir


def test_issue3959():
    """ Ensure that a modified pos attribute is serialized correctly."""
    nlp = English()
    doc = nlp(
        "displaCy uses JavaScript, SVG and CSS to show you how computers understand language"
    )
    assert doc[0].pos_ == ""

    doc[0].pos_ = "NOUN"
    assert doc[0].pos_ == "NOUN"

    # usually this is already True when starting from proper models instead of blank English
    doc.is_tagged = True

    with make_tempdir() as tmp_dir:
        file_path = tmp_dir / "my_doc"
        doc.to_disk(file_path)

        doc2 = nlp("")
        doc2.from_disk(file_path)

        assert doc2[0].pos_ == "NOUN"
@@ -1,117 +0,0 @@
import pytest

from ..util import get_doc


@pytest.fixture
def doc(en_tokenizer):
    text = "He jests at scars, that never felt a wound."
    heads = [1, 6, -1, -1, 3, 2, 1, 0, 1, -2, -3]
    deps = [
        "nsubj",
        "ccomp",
        "prep",
        "pobj",
        "punct",
        "nsubj",
        "neg",
        "ROOT",
        "det",
        "dobj",
        "punct",
    ]
    tokens = en_tokenizer(text)
    return get_doc(tokens.vocab, words=[t.text for t in tokens], heads=heads, deps=deps)


def test_issue3962(doc):
    """ Ensure that as_doc does not result in out-of-bound access of tokens.
    This is achieved by setting the head to itself if it would lie out of the span otherwise."""
    span2 = doc[1:5]  # "jests at scars ,"
    doc2 = span2.as_doc()
    doc2_json = doc2.to_json()
    assert doc2_json

    assert (
        doc2[0].head.text == "jests"
    )  # head set to itself, being the new artificial root
    assert doc2[0].dep_ == "dep"
    assert doc2[1].head.text == "jests"
    assert doc2[1].dep_ == "prep"
    assert doc2[2].head.text == "at"
    assert doc2[2].dep_ == "pobj"
    assert doc2[3].head.text == "jests"  # head set to the new artificial root
    assert doc2[3].dep_ == "dep"

    # We should still have 1 sentence
    assert len(list(doc2.sents)) == 1

    span3 = doc[6:9]  # "never felt a"
    doc3 = span3.as_doc()
    doc3_json = doc3.to_json()
    assert doc3_json

    assert doc3[0].head.text == "felt"
    assert doc3[0].dep_ == "neg"
    assert doc3[1].head.text == "felt"
    assert doc3[1].dep_ == "ROOT"
    assert doc3[2].head.text == "felt"  # head set to ancestor
    assert doc3[2].dep_ == "dep"

    # We should still have 1 sentence as "a" can be attached to "felt" instead of "wound"
    assert len(list(doc3.sents)) == 1


@pytest.fixture
def two_sent_doc(en_tokenizer):
    text = "He jests at scars. They never felt a wound."
    heads = [1, 0, -1, -1, -3, 2, 1, 0, 1, -2, -3]
    deps = [
        "nsubj",
        "ROOT",
        "prep",
        "pobj",
        "punct",
        "nsubj",
        "neg",
        "ROOT",
        "det",
        "dobj",
        "punct",
    ]
    tokens = en_tokenizer(text)
    return get_doc(tokens.vocab, words=[t.text for t in tokens], heads=heads, deps=deps)


def test_issue3962_long(two_sent_doc):
    """ Ensure that as_doc does not result in out-of-bound access of tokens.
    This is achieved by setting the head to itself if it would lie out of the span otherwise."""
    span2 = two_sent_doc[1:7]  # "jests at scars. They never"
    doc2 = span2.as_doc()
    doc2_json = doc2.to_json()
    assert doc2_json

    assert (
        doc2[0].head.text == "jests"
    )  # head set to itself, being the new artificial root (in sentence 1)
    assert doc2[0].dep_ == "ROOT"
    assert doc2[1].head.text == "jests"
    assert doc2[1].dep_ == "prep"
    assert doc2[2].head.text == "at"
    assert doc2[2].dep_ == "pobj"
    assert doc2[3].head.text == "jests"
    assert doc2[3].dep_ == "punct"
    assert (
        doc2[4].head.text == "They"
    )  # head set to itself, being the new artificial root (in sentence 2)
    assert doc2[4].dep_ == "dep"
    assert (
        doc2[4].head.text == "They"
    )  # head set to the new artificial head (in sentence 2)
    assert doc2[4].dep_ == "dep"

    # We should still have 2 sentences
    sents = list(doc2.sents)
    assert len(sents) == 2
    assert sents[0].text == "jests at scars ."
    assert sents[1].text == "They never"

@@ -1,19 +0,0 @@
from spacy.matcher import PhraseMatcher
from spacy.tokens import Doc


def test_issue3972(en_vocab):
    """Test that the PhraseMatcher returns duplicates for duplicate match IDs.
    """
    matcher = PhraseMatcher(en_vocab)
    matcher.add("A", [Doc(en_vocab, words=["New", "York"])])
    matcher.add("B", [Doc(en_vocab, words=["New", "York"])])
    doc = Doc(en_vocab, words=["I", "live", "in", "New", "York"])
    matches = matcher(doc)

    assert len(matches) == 2

    # We should have a match for each of the two rules
    found_ids = [en_vocab.strings[ent_id] for (ent_id, _, _) in matches]
    assert "A" in found_ids
    assert "B" in found_ids
spacy/tests/regression/test_issue4001-4500.py (new file, 469 lines)

@@ -0,0 +1,469 @@
import pytest
from spacy.pipeline import EntityRuler, EntityRecognizer, Pipe
from spacy.pipeline.defaults import default_ner
from spacy.matcher import PhraseMatcher, Matcher
from spacy.tokens import Doc, Span, DocBin
from spacy.gold import Example, Corpus
from spacy.gold.converters import json2docs
from spacy.vocab import Vocab
from spacy.lang.en import English
from spacy.util import minibatch, ensure_path, load_model
from spacy.util import compile_prefix_regex, compile_suffix_regex, compile_infix_regex
from spacy.tokenizer import Tokenizer
from spacy.lang.el import Greek
from spacy.language import Language
import spacy
from thinc.api import compounding
from collections import defaultdict

from ..util import make_tempdir


def test_issue4002(en_vocab):
    """Test that the PhraseMatcher can match on overwritten NORM attributes.
    """
    matcher = PhraseMatcher(en_vocab, attr="NORM")
    pattern1 = Doc(en_vocab, words=["c", "d"])
    assert [t.norm_ for t in pattern1] == ["c", "d"]
    matcher.add("TEST", [pattern1])
    doc = Doc(en_vocab, words=["a", "b", "c", "d"])
    assert [t.norm_ for t in doc] == ["a", "b", "c", "d"]
    matches = matcher(doc)
    assert len(matches) == 1
    matcher = PhraseMatcher(en_vocab, attr="NORM")
    pattern2 = Doc(en_vocab, words=["1", "2"])
    pattern2[0].norm_ = "c"
    pattern2[1].norm_ = "d"
    assert [t.norm_ for t in pattern2] == ["c", "d"]
    matcher.add("TEST", [pattern2])
    matches = matcher(doc)
    assert len(matches) == 1


def test_issue4030():
    """ Test whether textcat works fine with empty doc """
    unique_classes = ["offensive", "inoffensive"]
    x_train = [
        "This is an offensive text",
        "This is the second offensive text",
        "inoff",
    ]
    y_train = ["offensive", "offensive", "inoffensive"]
    nlp = spacy.blank("en")
    # preparing the data
    train_data = []
    for text, train_instance in zip(x_train, y_train):
        cat_dict = {label: label == train_instance for label in unique_classes}
        train_data.append(Example.from_dict(nlp.make_doc(text), {"cats": cat_dict}))
    # add a text categorizer component
    textcat = nlp.create_pipe(
        "textcat",
        config={"exclusive_classes": True, "architecture": "bow", "ngram_size": 2},
    )
    for label in unique_classes:
        textcat.add_label(label)
    nlp.add_pipe(textcat, last=True)
    # training the network
    with nlp.select_pipes(enable="textcat"):
        optimizer = nlp.begin_training()
        for i in range(3):
            losses = {}
            batches = minibatch(train_data, size=compounding(4.0, 32.0, 1.001))

            for batch in batches:
                nlp.update(
                    examples=batch, sgd=optimizer, drop=0.1, losses=losses,
                )
    # processing of an empty doc should result in 0.0 for all categories
    doc = nlp("")
    assert doc.cats["offensive"] == 0.0
    assert doc.cats["inoffensive"] == 0.0
@pytest.mark.filterwarnings("ignore::UserWarning")
def test_issue4042():
    """Test that serialization of an EntityRuler before NER works fine."""
    nlp = English()

    # add ner pipe
    ner = nlp.create_pipe("ner")
    ner.add_label("SOME_LABEL")
    nlp.add_pipe(ner)
    nlp.begin_training()

    # Add entity ruler
    ruler = EntityRuler(nlp)
    patterns = [
        {"label": "MY_ORG", "pattern": "Apple"},
        {"label": "MY_GPE", "pattern": [{"lower": "san"}, {"lower": "francisco"}]},
    ]
    ruler.add_patterns(patterns)
    nlp.add_pipe(ruler, before="ner")  # works fine with "after"
    doc1 = nlp("What do you think about Apple ?")
    assert doc1.ents[0].label_ == "MY_ORG"

    with make_tempdir() as d:
        output_dir = ensure_path(d)
        if not output_dir.exists():
            output_dir.mkdir()
        nlp.to_disk(output_dir)

        nlp2 = load_model(output_dir)
        doc2 = nlp2("What do you think about Apple ?")
        assert doc2.ents[0].label_ == "MY_ORG"


@pytest.mark.filterwarnings("ignore::UserWarning")
def test_issue4042_bug2():
    """
    Test that serialization of an NER works fine when new labels were added.
    This is the second bug of two bugs underlying the issue 4042.
    """
    nlp1 = English()
    vocab = nlp1.vocab

    # add ner pipe
    ner1 = nlp1.create_pipe("ner")
    ner1.add_label("SOME_LABEL")
    nlp1.add_pipe(ner1)
    nlp1.begin_training()

    # add a new label to the doc
    doc1 = nlp1("What do you think about Apple ?")
    assert len(ner1.labels) == 1
    assert "SOME_LABEL" in ner1.labels
    apple_ent = Span(doc1, 5, 6, label="MY_ORG")
    doc1.ents = list(doc1.ents) + [apple_ent]

    # reapply the NER - at this point it should resize itself
    ner1(doc1)
    assert len(ner1.labels) == 2
    assert "SOME_LABEL" in ner1.labels
    assert "MY_ORG" in ner1.labels

    with make_tempdir() as d:
        # assert IO goes fine
        output_dir = ensure_path(d)
        if not output_dir.exists():
            output_dir.mkdir()
        ner1.to_disk(output_dir)

        config = {
            "learn_tokens": False,
            "min_action_freq": 30,
            "beam_width": 1,
            "beam_update_prob": 1.0,
        }
        ner2 = EntityRecognizer(vocab, default_ner(), **config)
        ner2.from_disk(output_dir)
        assert len(ner2.labels) == 2


def test_issue4054(en_vocab):
    """Test that a new blank model can be made with a vocab from file,
    and that serialization does not drop the language at any point."""
    nlp1 = English()
    vocab1 = nlp1.vocab
    with make_tempdir() as d:
        vocab_dir = ensure_path(d / "vocab")
        if not vocab_dir.exists():
            vocab_dir.mkdir()
        vocab1.to_disk(vocab_dir)
        vocab2 = Vocab().from_disk(vocab_dir)
        print("lang", vocab2.lang)
        nlp2 = spacy.blank("en", vocab=vocab2)
        nlp_dir = ensure_path(d / "nlp")
        if not nlp_dir.exists():
            nlp_dir.mkdir()
        nlp2.to_disk(nlp_dir)
        nlp3 = load_model(nlp_dir)
        assert nlp3.lang == "en"


def test_issue4120(en_vocab):
    """Test that matches without a final {OP: ?} token are returned."""
    matcher = Matcher(en_vocab)
    matcher.add("TEST", [[{"ORTH": "a"}, {"OP": "?"}]])
    doc1 = Doc(en_vocab, words=["a"])
    assert len(matcher(doc1)) == 1  # works
    doc2 = Doc(en_vocab, words=["a", "b", "c"])
    assert len(matcher(doc2)) == 2  # fixed
    matcher = Matcher(en_vocab)
    matcher.add("TEST", [[{"ORTH": "a"}, {"OP": "?"}, {"ORTH": "b"}]])
    doc3 = Doc(en_vocab, words=["a", "b", "b", "c"])
    assert len(matcher(doc3)) == 2  # works
    matcher = Matcher(en_vocab)
    matcher.add("TEST", [[{"ORTH": "a"}, {"OP": "?"}, {"ORTH": "b", "OP": "?"}]])
    doc4 = Doc(en_vocab, words=["a", "b", "b", "c"])
    assert len(matcher(doc4)) == 3  # fixed


def test_issue4133(en_vocab):
    nlp = English()
    vocab_bytes = nlp.vocab.to_bytes()
    words = ["Apple", "is", "looking", "at", "buying", "a", "startup"]
    pos = ["NOUN", "VERB", "ADP", "VERB", "PROPN", "NOUN", "ADP"]
    doc = Doc(en_vocab, words=words)
    for i, token in enumerate(doc):
        token.pos_ = pos[i]
    # usually this is already True when starting from proper models instead of blank English
    doc.is_tagged = True
    doc_bytes = doc.to_bytes()
    vocab = Vocab()
    vocab = vocab.from_bytes(vocab_bytes)
    doc = Doc(vocab).from_bytes(doc_bytes)
    actual = []
    for token in doc:
        actual.append(token.pos_)
    assert actual == pos
def test_issue4190():
    def customize_tokenizer(nlp):
        prefix_re = compile_prefix_regex(nlp.Defaults.prefixes)
        suffix_re = compile_suffix_regex(nlp.Defaults.suffixes)
        infix_re = compile_infix_regex(nlp.Defaults.infixes)
        # Remove all exceptions where a single letter is followed by a period (e.g. 'h.')
        exceptions = {
            k: v
            for k, v in dict(nlp.Defaults.tokenizer_exceptions).items()
            if not (len(k) == 2 and k[1] == ".")
        }
        new_tokenizer = Tokenizer(
            nlp.vocab,
            exceptions,
            prefix_search=prefix_re.search,
            suffix_search=suffix_re.search,
            infix_finditer=infix_re.finditer,
            token_match=nlp.tokenizer.token_match,
        )
        nlp.tokenizer = new_tokenizer

    test_string = "Test c."
    # Load default language
    nlp_1 = English()
    doc_1a = nlp_1(test_string)
    result_1a = [token.text for token in doc_1a]  # noqa: F841
    # Modify tokenizer
    customize_tokenizer(nlp_1)
    doc_1b = nlp_1(test_string)
    result_1b = [token.text for token in doc_1b]
    # Save and Reload
    with make_tempdir() as model_dir:
        nlp_1.to_disk(model_dir)
        nlp_2 = load_model(model_dir)
    # This should be the modified tokenizer
    doc_2 = nlp_2(test_string)
    result_2 = [token.text for token in doc_2]
    assert result_1b == result_2


@pytest.mark.filterwarnings("ignore::UserWarning")
def test_issue4267():
    """ Test that running an entity_ruler after ner gives consistent results"""
    nlp = English()
    ner = nlp.create_pipe("ner")
    ner.add_label("PEOPLE")
    nlp.add_pipe(ner)
    nlp.begin_training()
    assert "ner" in nlp.pipe_names
    # assert that we have correct IOB annotations
    doc1 = nlp("hi")
    assert doc1.is_nered
    for token in doc1:
        assert token.ent_iob == 2
    # add entity ruler and run again
    ruler = EntityRuler(nlp)
    patterns = [{"label": "SOFTWARE", "pattern": "spacy"}]
    ruler.add_patterns(patterns)
    nlp.add_pipe(ruler)
    assert "entity_ruler" in nlp.pipe_names
    assert "ner" in nlp.pipe_names
    # assert that we still have correct IOB annotations
    doc2 = nlp("hi")
    assert doc2.is_nered
    for token in doc2:
        assert token.ent_iob == 2


def test_issue4272():
    """Test that lookup table can be accessed from Token.lemma if no POS tags
    are available."""
    nlp = Greek()
    doc = nlp("Χθες")
    assert doc[0].lemma_


def test_multiple_predictions():
    class DummyPipe(Pipe):
        def __init__(self):
            self.model = "dummy_model"

        def predict(self, docs):
            return ([1, 2, 3], [4, 5, 6])

        def set_annotations(self, docs, scores, tensors=None):
            return docs

    nlp = Language()
    doc = nlp.make_doc("foo")
    dummy_pipe = DummyPipe()
    dummy_pipe(doc)


@pytest.mark.skip(reason="removed Beam stuff during the Example/GoldParse refactor")
def test_issue4313():
    """ This should not crash or exit with some strange error code """
    beam_width = 16
    beam_density = 0.0001
    nlp = English()
    config = {
        "learn_tokens": False,
        "min_action_freq": 30,
        "beam_width": 1,
        "beam_update_prob": 1.0,
    }
    ner = EntityRecognizer(nlp.vocab, default_ner(), **config)
    ner.add_label("SOME_LABEL")
    ner.begin_training([])
    nlp.add_pipe(ner)

    # add a new label to the doc
    doc = nlp("What do you think about Apple ?")
    assert len(ner.labels) == 1
    assert "SOME_LABEL" in ner.labels
    apple_ent = Span(doc, 5, 6, label="MY_ORG")
|
||||||
|
doc.ents = list(doc.ents) + [apple_ent]
|
||||||
|
|
||||||
|
# ensure the beam_parse still works with the new label
|
||||||
|
docs = [doc]
|
||||||
|
beams = nlp.entity.beam_parse(
|
||||||
|
docs, beam_width=beam_width, beam_density=beam_density
|
||||||
|
)
|
||||||
|
|
||||||
|
for doc, beam in zip(docs, beams):
|
||||||
|
entity_scores = defaultdict(float)
|
||||||
|
for score, ents in nlp.entity.moves.get_beam_parses(beam):
|
||||||
|
for start, end, label in ents:
|
||||||
|
entity_scores[(start, end, label)] += score
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.mark.filterwarnings("ignore::UserWarning")
|
||||||
|
def test_issue4348():
|
||||||
|
"""Test that training the tagger with empty data, doesn't throw errors"""
|
||||||
|
nlp = English()
|
||||||
|
example = Example.from_dict(nlp.make_doc(""), {"tags": []})
|
||||||
|
TRAIN_DATA = [example, example]
|
||||||
|
tagger = nlp.create_pipe("tagger")
|
||||||
|
nlp.add_pipe(tagger)
|
||||||
|
optimizer = nlp.begin_training()
|
||||||
|
for i in range(5):
|
||||||
|
losses = {}
|
||||||
|
batches = minibatch(TRAIN_DATA, size=compounding(4.0, 32.0, 1.001))
|
||||||
|
for batch in batches:
|
||||||
|
nlp.update(batch, sgd=optimizer, losses=losses)
|
||||||
|
|
||||||
|
|
||||||
|
def test_issue4367():
|
||||||
|
"""Test that docbin init goes well"""
|
||||||
|
DocBin()
|
||||||
|
DocBin(attrs=["LEMMA"])
|
||||||
|
DocBin(attrs=["LEMMA", "ENT_IOB", "ENT_TYPE"])
|
||||||
|
|
||||||
|
|
||||||
|
def test_issue4373():
|
||||||
|
"""Test that PhraseMatcher.vocab can be accessed (like Matcher.vocab)."""
|
||||||
|
matcher = Matcher(Vocab())
|
||||||
|
assert isinstance(matcher.vocab, Vocab)
|
||||||
|
matcher = PhraseMatcher(Vocab())
|
||||||
|
assert isinstance(matcher.vocab, Vocab)
|
||||||
|
|
||||||
|
|
||||||
|
def test_issue4402():
|
||||||
|
json_data = {
|
||||||
|
"id": 0,
|
||||||
|
"paragraphs": [
|
||||||
|
{
|
||||||
|
"raw": "How should I cook bacon in an oven?\nI've heard of people cooking bacon in an oven.",
|
||||||
|
"sentences": [
|
||||||
|
{
|
||||||
|
"tokens": [
|
||||||
|
{"id": 0, "orth": "How", "ner": "O"},
|
||||||
|
{"id": 1, "orth": "should", "ner": "O"},
|
||||||
|
{"id": 2, "orth": "I", "ner": "O"},
|
||||||
|
{"id": 3, "orth": "cook", "ner": "O"},
|
||||||
|
{"id": 4, "orth": "bacon", "ner": "O"},
|
||||||
|
{"id": 5, "orth": "in", "ner": "O"},
|
||||||
|
{"id": 6, "orth": "an", "ner": "O"},
|
||||||
|
{"id": 7, "orth": "oven", "ner": "O"},
|
||||||
|
{"id": 8, "orth": "?", "ner": "O"},
|
||||||
|
],
|
||||||
|
"brackets": [],
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"tokens": [
|
||||||
|
{"id": 9, "orth": "\n", "ner": "O"},
|
||||||
|
{"id": 10, "orth": "I", "ner": "O"},
|
||||||
|
{"id": 11, "orth": "'ve", "ner": "O"},
|
||||||
|
{"id": 12, "orth": "heard", "ner": "O"},
|
||||||
|
{"id": 13, "orth": "of", "ner": "O"},
|
||||||
|
{"id": 14, "orth": "people", "ner": "O"},
|
||||||
|
{"id": 15, "orth": "cooking", "ner": "O"},
|
||||||
|
{"id": 16, "orth": "bacon", "ner": "O"},
|
||||||
|
{"id": 17, "orth": "in", "ner": "O"},
|
||||||
|
{"id": 18, "orth": "an", "ner": "O"},
|
||||||
|
{"id": 19, "orth": "oven", "ner": "O"},
|
||||||
|
{"id": 20, "orth": ".", "ner": "O"},
|
||||||
|
],
|
||||||
|
"brackets": [],
|
||||||
|
},
|
||||||
|
],
|
||||||
|
"cats": [
|
||||||
|
{"label": "baking", "value": 1.0},
|
||||||
|
{"label": "not_baking", "value": 0.0},
|
||||||
|
],
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"raw": "What is the difference between white and brown eggs?\n",
|
||||||
|
"sentences": [
|
||||||
|
{
|
||||||
|
"tokens": [
|
||||||
|
{"id": 0, "orth": "What", "ner": "O"},
|
||||||
|
{"id": 1, "orth": "is", "ner": "O"},
|
||||||
|
{"id": 2, "orth": "the", "ner": "O"},
|
||||||
|
{"id": 3, "orth": "difference", "ner": "O"},
|
||||||
|
{"id": 4, "orth": "between", "ner": "O"},
|
||||||
|
{"id": 5, "orth": "white", "ner": "O"},
|
||||||
|
{"id": 6, "orth": "and", "ner": "O"},
|
||||||
|
{"id": 7, "orth": "brown", "ner": "O"},
|
||||||
|
{"id": 8, "orth": "eggs", "ner": "O"},
|
||||||
|
{"id": 9, "orth": "?", "ner": "O"},
|
||||||
|
],
|
||||||
|
"brackets": [],
|
||||||
|
},
|
||||||
|
{"tokens": [{"id": 10, "orth": "\n", "ner": "O"}], "brackets": []},
|
||||||
|
],
|
||||||
|
"cats": [
|
||||||
|
{"label": "baking", "value": 0.0},
|
||||||
|
{"label": "not_baking", "value": 1.0},
|
||||||
|
],
|
||||||
|
},
|
||||||
|
],
|
||||||
|
}
|
||||||
|
nlp = English()
|
||||||
|
attrs = ["ORTH", "SENT_START", "ENT_IOB", "ENT_TYPE"]
|
||||||
|
with make_tempdir() as tmpdir:
|
||||||
|
output_file = tmpdir / "test4402.spacy"
|
||||||
|
docs = json2docs([json_data])
|
||||||
|
data = DocBin(docs=docs, attrs=attrs).to_bytes()
|
||||||
|
with output_file.open("wb") as file_:
|
||||||
|
file_.write(data)
|
||||||
|
corpus = Corpus(train_loc=str(output_file), dev_loc=str(output_file))
|
||||||
|
|
||||||
|
train_data = list(corpus.train_dataset(nlp))
|
||||||
|
assert len(train_data) == 2
|
||||||
|
|
||||||
|
split_train_data = []
|
||||||
|
for eg in train_data:
|
||||||
|
split_train_data.extend(eg.split_sents())
|
||||||
|
assert len(split_train_data) == 4
|
|
@@ -1,23 +0,0 @@
from spacy.matcher import PhraseMatcher
from spacy.tokens import Doc


def test_issue4002(en_vocab):
    """Test that the PhraseMatcher can match on overwritten NORM attributes.
    """
    matcher = PhraseMatcher(en_vocab, attr="NORM")
    pattern1 = Doc(en_vocab, words=["c", "d"])
    assert [t.norm_ for t in pattern1] == ["c", "d"]
    matcher.add("TEST", [pattern1])
    doc = Doc(en_vocab, words=["a", "b", "c", "d"])
    assert [t.norm_ for t in doc] == ["a", "b", "c", "d"]
    matches = matcher(doc)
    assert len(matches) == 1
    matcher = PhraseMatcher(en_vocab, attr="NORM")
    pattern2 = Doc(en_vocab, words=["1", "2"])
    pattern2[0].norm_ = "c"
    pattern2[1].norm_ = "d"
    assert [t.norm_ for t in pattern2] == ["c", "d"]
    matcher.add("TEST", [pattern2])
    matches = matcher(doc)
    assert len(matches) == 1
@@ -1,50 +0,0 @@
import spacy
from spacy.util import minibatch
from thinc.api import compounding
from spacy.gold import Example


def test_issue4030():
    """ Test whether textcat works fine with empty doc """
    unique_classes = ["offensive", "inoffensive"]
    x_train = [
        "This is an offensive text",
        "This is the second offensive text",
        "inoff",
    ]
    y_train = ["offensive", "offensive", "inoffensive"]

    nlp = spacy.blank("en")

    # preparing the data
    train_data = []
    for text, train_instance in zip(x_train, y_train):
        cat_dict = {label: label == train_instance for label in unique_classes}
        train_data.append(Example.from_dict(nlp.make_doc(text), {"cats": cat_dict}))

    # add a text categorizer component
    textcat = nlp.create_pipe(
        "textcat",
        config={"exclusive_classes": True, "architecture": "bow", "ngram_size": 2},
    )

    for label in unique_classes:
        textcat.add_label(label)
    nlp.add_pipe(textcat, last=True)

    # training the network
    with nlp.select_pipes(enable="textcat"):
        optimizer = nlp.begin_training()
        for i in range(3):
            losses = {}
            batches = minibatch(train_data, size=compounding(4.0, 32.0, 1.001))

            for batch in batches:
                nlp.update(
                    examples=batch, sgd=optimizer, drop=0.1, losses=losses,
                )

    # processing of an empty doc should result in 0.0 for all categories
    doc = nlp("")
    assert doc.cats["offensive"] == 0.0
    assert doc.cats["inoffensive"] == 0.0
@@ -1,85 +0,0 @@
import spacy
from spacy.pipeline import EntityRecognizer, EntityRuler
from spacy.lang.en import English
from spacy.tokens import Span
from spacy.util import ensure_path
from spacy.pipeline.defaults import default_ner

from ..util import make_tempdir


def test_issue4042():
    """Test that serialization of an EntityRuler before NER works fine."""
    nlp = English()

    # add ner pipe
    ner = nlp.create_pipe("ner")
    ner.add_label("SOME_LABEL")
    nlp.add_pipe(ner)
    nlp.begin_training()

    # Add entity ruler
    ruler = EntityRuler(nlp)
    patterns = [
        {"label": "MY_ORG", "pattern": "Apple"},
        {"label": "MY_GPE", "pattern": [{"lower": "san"}, {"lower": "francisco"}]},
    ]
    ruler.add_patterns(patterns)
    nlp.add_pipe(ruler, before="ner")  # works fine with "after"
    doc1 = nlp("What do you think about Apple ?")
    assert doc1.ents[0].label_ == "MY_ORG"

    with make_tempdir() as d:
        output_dir = ensure_path(d)
        if not output_dir.exists():
            output_dir.mkdir()
        nlp.to_disk(output_dir)

        nlp2 = spacy.load(output_dir)
        doc2 = nlp2("What do you think about Apple ?")
        assert doc2.ents[0].label_ == "MY_ORG"


def test_issue4042_bug2():
    """
    Test that serialization of an NER works fine when new labels were added.
    This is the second bug of two bugs underlying the issue 4042.
    """
    nlp1 = English()
    vocab = nlp1.vocab

    # add ner pipe
    ner1 = nlp1.create_pipe("ner")
    ner1.add_label("SOME_LABEL")
    nlp1.add_pipe(ner1)
    nlp1.begin_training()

    # add a new label to the doc
    doc1 = nlp1("What do you think about Apple ?")
    assert len(ner1.labels) == 1
    assert "SOME_LABEL" in ner1.labels
    apple_ent = Span(doc1, 5, 6, label="MY_ORG")
    doc1.ents = list(doc1.ents) + [apple_ent]

    # reapply the NER - at this point it should resize itself
    ner1(doc1)
    assert len(ner1.labels) == 2
    assert "SOME_LABEL" in ner1.labels
    assert "MY_ORG" in ner1.labels

    with make_tempdir() as d:
        # assert IO goes fine
        output_dir = ensure_path(d)
        if not output_dir.exists():
            output_dir.mkdir()
        ner1.to_disk(output_dir)

        config = {
            "learn_tokens": False,
            "min_action_freq": 30,
            "beam_width": 1,
            "beam_update_prob": 1.0,
        }
        ner2 = EntityRecognizer(vocab, default_ner(), **config)
        ner2.from_disk(output_dir)
        assert len(ner2.labels) == 2
@@ -1,30 +0,0 @@
from spacy.vocab import Vocab
import spacy
from spacy.lang.en import English
from spacy.util import ensure_path

from ..util import make_tempdir


def test_issue4054(en_vocab):
    """Test that a new blank model can be made with a vocab from file,
    and that serialization does not drop the language at any point."""
    nlp1 = English()
    vocab1 = nlp1.vocab

    with make_tempdir() as d:
        vocab_dir = ensure_path(d / "vocab")
        if not vocab_dir.exists():
            vocab_dir.mkdir()
        vocab1.to_disk(vocab_dir)

        vocab2 = Vocab().from_disk(vocab_dir)
        print("lang", vocab2.lang)
        nlp2 = spacy.blank("en", vocab=vocab2)

        nlp_dir = ensure_path(d / "nlp")
        if not nlp_dir.exists():
            nlp_dir.mkdir()
        nlp2.to_disk(nlp_dir)
        nlp3 = spacy.load(nlp_dir)
        assert nlp3.lang == "en"
@@ -1,23 +0,0 @@
from spacy.matcher import Matcher
from spacy.tokens import Doc


def test_issue4120(en_vocab):
    """Test that matches without a final {OP: ?} token are returned."""
    matcher = Matcher(en_vocab)
    matcher.add("TEST", [[{"ORTH": "a"}, {"OP": "?"}]])
    doc1 = Doc(en_vocab, words=["a"])
    assert len(matcher(doc1)) == 1  # works

    doc2 = Doc(en_vocab, words=["a", "b", "c"])
    assert len(matcher(doc2)) == 2  # fixed

    matcher = Matcher(en_vocab)
    matcher.add("TEST", [[{"ORTH": "a"}, {"OP": "?"}, {"ORTH": "b"}]])
    doc3 = Doc(en_vocab, words=["a", "b", "b", "c"])
    assert len(matcher(doc3)) == 2  # works

    matcher = Matcher(en_vocab)
    matcher.add("TEST", [[{"ORTH": "a"}, {"OP": "?"}, {"ORTH": "b", "OP": "?"}]])
    doc4 = Doc(en_vocab, words=["a", "b", "b", "c"])
    assert len(matcher(doc4)) == 3  # fixed
@@ -1,28 +0,0 @@
from spacy.lang.en import English
from spacy.tokens import Doc
from spacy.vocab import Vocab


def test_issue4133(en_vocab):
    nlp = English()
    vocab_bytes = nlp.vocab.to_bytes()
    words = ["Apple", "is", "looking", "at", "buying", "a", "startup"]
    pos = ["NOUN", "VERB", "ADP", "VERB", "PROPN", "NOUN", "ADP"]
    doc = Doc(en_vocab, words=words)
    for i, token in enumerate(doc):
        token.pos_ = pos[i]

    # usually this is already True when starting from proper models instead of blank English
    doc.is_tagged = True

    doc_bytes = doc.to_bytes()

    vocab = Vocab()
    vocab = vocab.from_bytes(vocab_bytes)
    doc = Doc(vocab).from_bytes(doc_bytes)

    actual = []
    for token in doc:
        actual.append(token.pos_)

    assert actual == pos
@@ -1,46 +0,0 @@
from spacy.lang.en import English
from spacy.tokenizer import Tokenizer
from spacy import util

from ..util import make_tempdir


def test_issue4190():
    test_string = "Test c."
    # Load default language
    nlp_1 = English()
    doc_1a = nlp_1(test_string)
    result_1a = [token.text for token in doc_1a]  # noqa: F841
    # Modify tokenizer
    customize_tokenizer(nlp_1)
    doc_1b = nlp_1(test_string)
    result_1b = [token.text for token in doc_1b]
    # Save and Reload
    with make_tempdir() as model_dir:
        nlp_1.to_disk(model_dir)
        nlp_2 = util.load_model(model_dir)
    # This should be the modified tokenizer
    doc_2 = nlp_2(test_string)
    result_2 = [token.text for token in doc_2]
    assert result_1b == result_2


def customize_tokenizer(nlp):
    prefix_re = util.compile_prefix_regex(nlp.Defaults.prefixes)
    suffix_re = util.compile_suffix_regex(nlp.Defaults.suffixes)
    infix_re = util.compile_infix_regex(nlp.Defaults.infixes)
    # Remove all exceptions where a single letter is followed by a period (e.g. 'h.')
    exceptions = {
        k: v
        for k, v in dict(nlp.Defaults.tokenizer_exceptions).items()
        if not (len(k) == 2 and k[1] == ".")
    }
    new_tokenizer = Tokenizer(
        nlp.vocab,
        exceptions,
        prefix_search=prefix_re.search,
        suffix_search=suffix_re.search,
        infix_finditer=infix_re.finditer,
        token_match=nlp.tokenizer.token_match,
    )
    nlp.tokenizer = new_tokenizer
@@ -1,34 +0,0 @@
from spacy.lang.en import English
from spacy.pipeline import EntityRuler


def test_issue4267():
    """ Test that running an entity_ruler after ner gives consistent results"""
    nlp = English()
    ner = nlp.create_pipe("ner")
    ner.add_label("PEOPLE")
    nlp.add_pipe(ner)
    nlp.begin_training()

    assert "ner" in nlp.pipe_names

    # assert that we have correct IOB annotations
    doc1 = nlp("hi")
    assert doc1.is_nered
    for token in doc1:
        assert token.ent_iob == 2

    # add entity ruler and run again
    ruler = EntityRuler(nlp)
    patterns = [{"label": "SOFTWARE", "pattern": "spacy"}]

    ruler.add_patterns(patterns)
    nlp.add_pipe(ruler)
    assert "entity_ruler" in nlp.pipe_names
    assert "ner" in nlp.pipe_names

    # assert that we still have correct IOB annotations
    doc2 = nlp("hi")
    assert doc2.is_nered
    for token in doc2:
        assert token.ent_iob == 2
@@ -1,9 +0,0 @@
from spacy.lang.el import Greek


def test_issue4272():
    """Test that lookup table can be accessed from Token.lemma if no POS tags
    are available."""
    nlp = Greek()
    doc = nlp("Χθες")
    assert doc[0].lemma_
@@ -1,25 +0,0 @@
import pytest
from spacy.language import Language
from spacy.pipeline import Pipe


class DummyPipe(Pipe):
    def __init__(self):
        self.model = "dummy_model"

    def predict(self, docs):
        return ([1, 2, 3], [4, 5, 6])

    def set_annotations(self, docs, scores, tensors=None):
        return docs


@pytest.fixture
def nlp():
    return Language()


def test_multiple_predictions(nlp):
    doc = nlp.make_doc("foo")
    dummy_pipe = DummyPipe()
    dummy_pipe(doc)
@@ -1,47 +0,0 @@
from collections import defaultdict

import pytest

from spacy.pipeline.defaults import default_ner
from spacy.pipeline import EntityRecognizer

from spacy.lang.en import English
from spacy.tokens import Span


# skipped after removing Beam stuff during the Example/GoldParse refactor
@pytest.mark.skip
def test_issue4313():
    """ This should not crash or exit with some strange error code """
    beam_width = 16
    beam_density = 0.0001
    nlp = English()
    config = {
        "learn_tokens": False,
        "min_action_freq": 30,
        "beam_width": 1,
        "beam_update_prob": 1.0,
    }
    ner = EntityRecognizer(nlp.vocab, default_ner(), **config)
    ner.add_label("SOME_LABEL")
    ner.begin_training([])
    nlp.add_pipe(ner)

    # add a new label to the doc
    doc = nlp("What do you think about Apple ?")
    assert len(ner.labels) == 1
    assert "SOME_LABEL" in ner.labels
    apple_ent = Span(doc, 5, 6, label="MY_ORG")
    doc.ents = list(doc.ents) + [apple_ent]

    # ensure the beam_parse still works with the new label
    docs = [doc]
    beams = nlp.entity.beam_parse(
        docs, beam_width=beam_width, beam_density=beam_density
    )

    for doc, beam in zip(docs, beams):
        entity_scores = defaultdict(float)
        for score, ents in nlp.entity.moves.get_beam_parses(beam):
            for start, end, label in ents:
                entity_scores[(start, end, label)] += score
@@ -1,24 +0,0 @@
from spacy.gold import Example
from spacy.lang.en import English
from spacy.util import minibatch
from thinc.api import compounding
import pytest


@pytest.mark.filterwarnings("ignore::UserWarning")
def test_issue4348():
    """Test that training the tagger with empty data, doesn't throw errors"""

    nlp = English()
    example = Example.from_dict(nlp.make_doc(""), {"tags": []})
    TRAIN_DATA = [example, example]

    tagger = nlp.create_pipe("tagger")
    nlp.add_pipe(tagger)

    optimizer = nlp.begin_training()
    for i in range(5):
        losses = {}
        batches = minibatch(TRAIN_DATA, size=compounding(4.0, 32.0, 1.001))
        for batch in batches:
            nlp.update(batch, sgd=optimizer, losses=losses)
@@ -1,8 +0,0 @@
from spacy.tokens import DocBin


def test_issue4367():
    """Test that docbin init goes well"""
    DocBin()
    DocBin(attrs=["LEMMA"])
    DocBin(attrs=["LEMMA", "ENT_IOB", "ENT_TYPE"])
@@ -1,10 +0,0 @@
from spacy.matcher import Matcher, PhraseMatcher
from spacy.vocab import Vocab


def test_issue4373():
    """Test that PhraseMatcher.vocab can be accessed (like Matcher.vocab)."""
    matcher = Matcher(Vocab())
    assert isinstance(matcher.vocab, Vocab)
    matcher = PhraseMatcher(Vocab())
    assert isinstance(matcher.vocab, Vocab)
@@ -1,98 +0,0 @@
from spacy.gold import Corpus
from spacy.lang.en import English

from ..util import make_tempdir
from ...gold.converters import json2docs
from ...tokens import DocBin


def test_issue4402():
    nlp = English()
    attrs = ["ORTH", "SENT_START", "ENT_IOB", "ENT_TYPE"]
    with make_tempdir() as tmpdir:
        output_file = tmpdir / "test4402.spacy"
        docs = json2docs([json_data])
        data = DocBin(docs=docs, attrs=attrs).to_bytes()
        with output_file.open("wb") as file_:
            file_.write(data)
        corpus = Corpus(train_loc=str(output_file), dev_loc=str(output_file))

        train_data = list(corpus.train_dataset(nlp))
        assert len(train_data) == 2

        split_train_data = []
        for eg in train_data:
            split_train_data.extend(eg.split_sents())
        assert len(split_train_data) == 4


json_data = {
    "id": 0,
    "paragraphs": [
        {
            "raw": "How should I cook bacon in an oven?\nI've heard of people cooking bacon in an oven.",
            "sentences": [
                {
                    "tokens": [
                        {"id": 0, "orth": "How", "ner": "O"},
                        {"id": 1, "orth": "should", "ner": "O"},
                        {"id": 2, "orth": "I", "ner": "O"},
                        {"id": 3, "orth": "cook", "ner": "O"},
                        {"id": 4, "orth": "bacon", "ner": "O"},
                        {"id": 5, "orth": "in", "ner": "O"},
                        {"id": 6, "orth": "an", "ner": "O"},
                        {"id": 7, "orth": "oven", "ner": "O"},
                        {"id": 8, "orth": "?", "ner": "O"},
                    ],
                    "brackets": [],
                },
                {
                    "tokens": [
                        {"id": 9, "orth": "\n", "ner": "O"},
                        {"id": 10, "orth": "I", "ner": "O"},
                        {"id": 11, "orth": "'ve", "ner": "O"},
                        {"id": 12, "orth": "heard", "ner": "O"},
                        {"id": 13, "orth": "of", "ner": "O"},
                        {"id": 14, "orth": "people", "ner": "O"},
                        {"id": 15, "orth": "cooking", "ner": "O"},
                        {"id": 16, "orth": "bacon", "ner": "O"},
                        {"id": 17, "orth": "in", "ner": "O"},
                        {"id": 18, "orth": "an", "ner": "O"},
                        {"id": 19, "orth": "oven", "ner": "O"},
                        {"id": 20, "orth": ".", "ner": "O"},
                    ],
                    "brackets": [],
                },
            ],
            "cats": [
                {"label": "baking", "value": 1.0},
                {"label": "not_baking", "value": 0.0},
            ],
        },
        {
            "raw": "What is the difference between white and brown eggs?\n",
            "sentences": [
                {
                    "tokens": [
                        {"id": 0, "orth": "What", "ner": "O"},
                        {"id": 1, "orth": "is", "ner": "O"},
                        {"id": 2, "orth": "the", "ner": "O"},
                        {"id": 3, "orth": "difference", "ner": "O"},
                        {"id": 4, "orth": "between", "ner": "O"},
                        {"id": 5, "orth": "white", "ner": "O"},
                        {"id": 6, "orth": "and", "ner": "O"},
                        {"id": 7, "orth": "brown", "ner": "O"},
                        {"id": 8, "orth": "eggs", "ner": "O"},
                        {"id": 9, "orth": "?", "ner": "O"},
                    ],
                    "brackets": [],
                },
                {"tokens": [{"id": 10, "orth": "\n", "ner": "O"}], "brackets": []},
            ],
            "cats": [
                {"label": "baking", "value": 0.0},
                {"label": "not_baking", "value": 1.0},
            ],
        },
    ],
}
288
spacy/tests/regression/test_issue4501-5000.py
Normal file
@@ -0,0 +1,288 @@
import pytest
from mock import Mock
from spacy.pipeline import EntityRuler
from spacy.matcher import DependencyMatcher
from spacy.tokens import Doc, Span, DocBin
from spacy.gold import Example
from spacy.gold.converters.conllu2docs import conllu2docs
from spacy.lang.en import English
from spacy.kb import KnowledgeBase
from spacy.vocab import Vocab
from spacy.language import Language
from spacy.util import ensure_path, load_model_from_path
import numpy
import pickle

from ..util import get_doc, make_tempdir


def test_issue4528(en_vocab):
    """Test that user_data is correctly serialized in DocBin."""
    doc = Doc(en_vocab, words=["hello", "world"])
    doc.user_data["foo"] = "bar"
    # This is how extension attribute values are stored in the user data
    doc.user_data[("._.", "foo", None, None)] = "bar"
    doc_bin = DocBin(store_user_data=True)
    doc_bin.add(doc)
    doc_bin_bytes = doc_bin.to_bytes()
    new_doc_bin = DocBin(store_user_data=True).from_bytes(doc_bin_bytes)
    new_doc = list(new_doc_bin.get_docs(en_vocab))[0]
    assert new_doc.user_data["foo"] == "bar"
    assert new_doc.user_data[("._.", "foo", None, None)] == "bar"


@pytest.mark.parametrize(
    "text,words", [("A'B C", ["A", "'", "B", "C"]), ("A-B", ["A-B"])]
)
def test_gold_misaligned(en_tokenizer, text, words):
    doc = en_tokenizer(text)
    Example.from_dict(doc, {"words": words})


def test_issue4590(en_vocab):
    """Test that matches param in on_match method are the same as matches run with no on_match method"""
    pattern = [
        {"SPEC": {"NODE_NAME": "jumped"}, "PATTERN": {"ORTH": "jumped"}},
        {
            "SPEC": {"NODE_NAME": "fox", "NBOR_RELOP": ">", "NBOR_NAME": "jumped"},
            "PATTERN": {"ORTH": "fox"},
        },
        {
            "SPEC": {"NODE_NAME": "quick", "NBOR_RELOP": ".", "NBOR_NAME": "jumped"},
            "PATTERN": {"ORTH": "fox"},
        },
    ]

    on_match = Mock()
    matcher = DependencyMatcher(en_vocab)
    matcher.add("pattern", on_match, pattern)
    text = "The quick brown fox jumped over the lazy fox"
    heads = [3, 2, 1, 1, 0, -1, 2, 1, -3]
    deps = ["det", "amod", "amod", "nsubj", "ROOT", "prep", "det", "amod", "pobj"]
    doc = get_doc(en_vocab, text.split(), heads=heads, deps=deps)
    matches = matcher(doc)
    on_match_args = on_match.call_args
    assert on_match_args[0][3] == matches


def test_issue4651_with_phrase_matcher_attr():
    """Test that the EntityRuler PhraseMatcher is deserialize correctly using
    the method from_disk when the EntityRuler argument phrase_matcher_attr is
    specified.
    """
    text = "Spacy is a python library for nlp"
    nlp = English()
    ruler = EntityRuler(nlp, phrase_matcher_attr="LOWER")
    patterns = [{"label": "PYTHON_LIB", "pattern": "spacy", "id": "spaCy"}]
    ruler.add_patterns(patterns)
    nlp.add_pipe(ruler)
    doc = nlp(text)
    res = [(ent.text, ent.label_, ent.ent_id_) for ent in doc.ents]
    nlp_reloaded = English()
    with make_tempdir() as d:
        file_path = d / "entityruler"
        ruler.to_disk(file_path)
        ruler_reloaded = EntityRuler(nlp_reloaded).from_disk(file_path)
    nlp_reloaded.add_pipe(ruler_reloaded)
    doc_reloaded = nlp_reloaded(text)
    res_reloaded = [(ent.text, ent.label_, ent.ent_id_) for ent in doc_reloaded.ents]
    assert res == res_reloaded


def test_issue4651_without_phrase_matcher_attr():
    """Test that the EntityRuler PhraseMatcher is deserialize correctly using
    the method from_disk when the EntityRuler argument phrase_matcher_attr is
    not specified.
    """
    text = "Spacy is a python library for nlp"
    nlp = English()
    ruler = EntityRuler(nlp)
    patterns = [{"label": "PYTHON_LIB", "pattern": "spacy", "id": "spaCy"}]
    ruler.add_patterns(patterns)
    nlp.add_pipe(ruler)
    doc = nlp(text)
    res = [(ent.text, ent.label_, ent.ent_id_) for ent in doc.ents]
    nlp_reloaded = English()
    with make_tempdir() as d:
        file_path = d / "entityruler"
        ruler.to_disk(file_path)
        ruler_reloaded = EntityRuler(nlp_reloaded).from_disk(file_path)
    nlp_reloaded.add_pipe(ruler_reloaded)
    doc_reloaded = nlp_reloaded(text)
    res_reloaded = [(ent.text, ent.label_, ent.ent_id_) for ent in doc_reloaded.ents]
    assert res == res_reloaded


def test_issue4665():
    """
    conllu2json should not raise an exception if the HEAD column contains an
    underscore
    """
    input_data = """
1   [   _   PUNCT   -LRB-   _   _   punct   _   _
2   This    _   DET DT  _   _   det _   _
3   killing _   NOUN    NN  _   _   nsubj   _   _
4   of  _   ADP IN  _   _   case    _   _
5   a   _   DET DT  _   _   det _   _
6   respected   _   ADJ JJ  _   _   amod    _   _
7   cleric  _   NOUN    NN  _   _   nmod    _   _
8   will    _   AUX MD  _   _   aux _   _
9   be  _   AUX VB  _   _   aux _   _
10  causing _   VERB    VBG _   _   root    _   _
11  us  _   PRON    PRP _   _   iobj    _   _
12  trouble _   NOUN    NN  _   _   dobj    _   _
13  for _   ADP IN  _   _   case    _   _
14  years   _   NOUN    NNS _   _   nmod    _   _
15  to  _   PART    TO  _   _   mark    _   _
16  come    _   VERB    VB  _   _   acl _   _
17  .   _   PUNCT   .   _   _   punct   _   _
18  ]   _   PUNCT   -RRB-   _   _   punct   _   _
"""
    conllu2docs(input_data)


def test_issue4674():
    """Test that setting entities with overlapping identifiers does not mess up IO"""
    nlp = English()
    kb = KnowledgeBase(nlp.vocab, entity_vector_length=3)
    vector1 = [0.9, 1.1, 1.01]
    vector2 = [1.8, 2.25, 2.01]
    with pytest.warns(UserWarning):
        kb.set_entities(
            entity_list=["Q1", "Q1"],
            freq_list=[32, 111],
            vector_list=[vector1, vector2],
        )
    assert kb.get_size_entities() == 1
    # dumping to file & loading back in
    with make_tempdir() as d:
        dir_path = ensure_path(d)
        if not dir_path.exists():
            dir_path.mkdir()
        file_path = dir_path / "kb"
        kb.dump(str(file_path))
        kb2 = KnowledgeBase(vocab=nlp.vocab, entity_vector_length=3)
        kb2.load_bulk(str(file_path))
        assert kb2.get_size_entities() == 1


def test_issue4707():
    """Tests that disabled component names are also excluded from nlp.from_disk
    by default when loading a model.
    """
    nlp = English()
    nlp.add_pipe(nlp.create_pipe("sentencizer"))
    nlp.add_pipe(nlp.create_pipe("entity_ruler"))
    assert nlp.pipe_names == ["sentencizer", "entity_ruler"]
    exclude = ["tokenizer", "sentencizer"]
    with make_tempdir() as tmpdir:
        nlp.to_disk(tmpdir, exclude=exclude)
        new_nlp = load_model_from_path(tmpdir, disable=exclude)
    assert "sentencizer" not in new_nlp.pipe_names
    assert "entity_ruler" in new_nlp.pipe_names


@pytest.mark.filterwarnings("ignore::UserWarning")
def test_issue4725_1():
    """ Ensure the pickling of the NER goes well"""
    vocab = Vocab(vectors_name="test_vocab_add_vector")
    nlp = English(vocab=vocab)
    ner = nlp.create_pipe("ner", config={"min_action_freq": 342})
    with make_tempdir() as tmp_path:
        with (tmp_path / "ner.pkl").open("wb") as file_:
            pickle.dump(ner, file_)
            assert ner.cfg["min_action_freq"] == 342

        with (tmp_path / "ner.pkl").open("rb") as file_:
            ner2 = pickle.load(file_)
            assert ner2.cfg["min_action_freq"] == 342


@pytest.mark.filterwarnings("ignore::UserWarning")
def test_issue4725_2():
    # ensures that this runs correctly and doesn't hang or crash because of the global vectors
    # if it does crash, it's usually because of calling 'spawn' for multiprocessing (e.g. on Windows)
    vocab = Vocab(vectors_name="test_vocab_add_vector")
    data = numpy.ndarray((5, 3), dtype="f")
    data[0] = 1.0
    data[1] = 2.0
    vocab.set_vector("cat", data[0])
    vocab.set_vector("dog", data[1])
    nlp = English(vocab=vocab)
    ner = nlp.create_pipe("ner")
    nlp.add_pipe(ner)
    nlp.begin_training()
    docs = ["Kurt is in London."] * 10
    for _ in nlp.pipe(docs, batch_size=2, n_process=2):
        pass


def test_issue4849():
    nlp = English()
    ruler = EntityRuler(
        nlp,
        patterns=[
            {"label": "PERSON", "pattern": "joe biden", "id": "joe-biden"},
            {"label": "PERSON", "pattern": "bernie sanders", "id": "bernie-sanders"},
        ],
        phrase_matcher_attr="LOWER",
    )
    nlp.add_pipe(ruler)
    text = """
    The left is starting to take aim at Democratic front-runner Joe Biden.
    Sen. Bernie Sanders joined in her criticism: "There is no 'middle ground' when it comes to climate policy."
    """
    # USING 1 PROCESS
    count_ents = 0
    for doc in nlp.pipe([text], n_process=1):
        count_ents += len([ent for ent in doc.ents if ent.ent_id > 0])
    assert count_ents == 2
    # USING 2 PROCESSES
    count_ents = 0
    for doc in nlp.pipe([text], n_process=2):
        count_ents += len([ent for ent in doc.ents if ent.ent_id > 0])
    assert count_ents == 2


class CustomPipe:
    name = "my_pipe"

    def __init__(self):
        Span.set_extension("my_ext", getter=self._get_my_ext)
        Doc.set_extension("my_ext", default=None)

    def __call__(self, doc):
        gathered_ext = []
        for sent in doc.sents:
            sent_ext = self._get_my_ext(sent)
            sent._.set("my_ext", sent_ext)
            gathered_ext.append(sent_ext)

        doc._.set("my_ext", "\n".join(gathered_ext))

        return doc

    @staticmethod
    def _get_my_ext(span):
        return str(span.end)


def test_issue4903():
    """Ensure that this runs correctly and doesn't hang or crash on Windows /
    macOS."""
    nlp = English()
    custom_component = CustomPipe()
    nlp.add_pipe(nlp.create_pipe("sentencizer"))
    nlp.add_pipe(custom_component, after="sentencizer")

    text = ["I like bananas.", "Do you like them?", "No, I prefer wasabi."]
    docs = list(nlp.pipe(text, n_process=2))
    assert docs[0].text == "I like bananas."
    assert docs[1].text == "Do you like them?"
    assert docs[2].text == "No, I prefer wasabi."


def test_issue4924():
    nlp = Language()
    example = Example.from_dict(nlp.make_doc(""), {})
    nlp.evaluate([example])
@@ -1,16 +0,0 @@
from spacy.tokens import Doc, DocBin


def test_issue4528(en_vocab):
    """Test that user_data is correctly serialized in DocBin."""
    doc = Doc(en_vocab, words=["hello", "world"])
    doc.user_data["foo"] = "bar"
    # This is how extension attribute values are stored in the user data
    doc.user_data[("._.", "foo", None, None)] = "bar"
    doc_bin = DocBin(store_user_data=True)
    doc_bin.add(doc)
    doc_bin_bytes = doc_bin.to_bytes()
    new_doc_bin = DocBin(store_user_data=True).from_bytes(doc_bin_bytes)
    new_doc = list(new_doc_bin.get_docs(en_vocab))[0]
    assert new_doc.user_data["foo"] == "bar"
    assert new_doc.user_data[("._.", "foo", None, None)] == "bar"
@@ -1,11 +0,0 @@
import pytest

from spacy.gold import Example


@pytest.mark.parametrize(
    "text,words", [("A'B C", ["A", "'", "B", "C"]), ("A-B", ["A-B"])]
)
def test_gold_misaligned(en_tokenizer, text, words):
    doc = en_tokenizer(text)
    Example.from_dict(doc, {"words": words})
@@ -1,35 +0,0 @@
from mock import Mock
from spacy.matcher import DependencyMatcher
from ..util import get_doc


def test_issue4590(en_vocab):
    """Test that matches param in on_match method are the same as matches run with no on_match method"""
    pattern = [
        {"SPEC": {"NODE_NAME": "jumped"}, "PATTERN": {"ORTH": "jumped"}},
        {
            "SPEC": {"NODE_NAME": "fox", "NBOR_RELOP": ">", "NBOR_NAME": "jumped"},
            "PATTERN": {"ORTH": "fox"},
        },
        {
            "SPEC": {"NODE_NAME": "quick", "NBOR_RELOP": ".", "NBOR_NAME": "jumped"},
            "PATTERN": {"ORTH": "fox"},
        },
    ]

    on_match = Mock()

    matcher = DependencyMatcher(en_vocab)
    matcher.add("pattern", on_match, pattern)

    text = "The quick brown fox jumped over the lazy fox"
    heads = [3, 2, 1, 1, 0, -1, 2, 1, -3]
    deps = ["det", "amod", "amod", "nsubj", "ROOT", "prep", "det", "amod", "pobj"]

    doc = get_doc(en_vocab, text.split(), heads=heads, deps=deps)

    matches = matcher(doc)

    on_match_args = on_match.call_args

    assert on_match_args[0][3] == matches
@@ -1,62 +0,0 @@
from spacy.lang.en import English
from spacy.pipeline import EntityRuler

from ..util import make_tempdir


def test_issue4651_with_phrase_matcher_attr():
    """Test that the EntityRuler PhraseMatcher is deserialize correctly using
    the method from_disk when the EntityRuler argument phrase_matcher_attr is
    specified.
    """
    text = "Spacy is a python library for nlp"

    nlp = English()
    ruler = EntityRuler(nlp, phrase_matcher_attr="LOWER")
    patterns = [{"label": "PYTHON_LIB", "pattern": "spacy", "id": "spaCy"}]
    ruler.add_patterns(patterns)
    nlp.add_pipe(ruler)

    doc = nlp(text)
    res = [(ent.text, ent.label_, ent.ent_id_) for ent in doc.ents]

    nlp_reloaded = English()
    with make_tempdir() as d:
        file_path = d / "entityruler"
        ruler.to_disk(file_path)
        ruler_reloaded = EntityRuler(nlp_reloaded).from_disk(file_path)

    nlp_reloaded.add_pipe(ruler_reloaded)
    doc_reloaded = nlp_reloaded(text)
    res_reloaded = [(ent.text, ent.label_, ent.ent_id_) for ent in doc_reloaded.ents]

    assert res == res_reloaded


def test_issue4651_without_phrase_matcher_attr():
    """Test that the EntityRuler PhraseMatcher is deserialize correctly using
    the method from_disk when the EntityRuler argument phrase_matcher_attr is
    not specified.
    """
    text = "Spacy is a python library for nlp"

    nlp = English()
    ruler = EntityRuler(nlp)
    patterns = [{"label": "PYTHON_LIB", "pattern": "spacy", "id": "spaCy"}]
    ruler.add_patterns(patterns)
    nlp.add_pipe(ruler)

    doc = nlp(text)
    res = [(ent.text, ent.label_, ent.ent_id_) for ent in doc.ents]

    nlp_reloaded = English()
    with make_tempdir() as d:
        file_path = d / "entityruler"
        ruler.to_disk(file_path)
        ruler_reloaded = EntityRuler(nlp_reloaded).from_disk(file_path)

    nlp_reloaded.add_pipe(ruler_reloaded)
    doc_reloaded = nlp_reloaded(text)
    res_reloaded = [(ent.text, ent.label_, ent.ent_id_) for ent in doc_reloaded.ents]

    assert res == res_reloaded
@@ -1,35 +0,0 @@
import pytest

# TODO
# from spacy.gold.converters.conllu2docs import conllu2docs

input_data = """
1   [   _   PUNCT   -LRB-   _   _   punct   _   _
2   This    _   DET DT  _   _   det _   _
3   killing _   NOUN    NN  _   _   nsubj   _   _
4   of  _   ADP IN  _   _   case    _   _
5   a   _   DET DT  _   _   det _   _
6   respected   _   ADJ JJ  _   _   amod    _   _
7   cleric  _   NOUN    NN  _   _   nmod    _   _
8   will    _   AUX MD  _   _   aux _   _
9   be  _   AUX VB  _   _   aux _   _
10  causing _   VERB    VBG _   _   root    _   _
11  us  _   PRON    PRP _   _   iobj    _   _
12  trouble _   NOUN    NN  _   _   dobj    _   _
13  for _   ADP IN  _   _   case    _   _
14  years   _   NOUN    NNS _   _   nmod    _   _
15  to  _   PART    TO  _   _   mark    _   _
16  come    _   VERB    VB  _   _   acl _   _
17  .   _   PUNCT   .   _   _   punct   _   _
18  ]   _   PUNCT   -RRB-   _   _   punct   _   _
"""


@pytest.mark.xfail
def test_issue4665():
    """
    conllu2json should not raise an exception if the HEAD column contains an
    underscore
    """
    pass
    # conllu2json(input_data)
@@ -1,36 +0,0 @@
import pytest
from spacy.kb import KnowledgeBase
from spacy.util import ensure_path
from spacy.lang.en import English

from ..util import make_tempdir


def test_issue4674():
    """Test that setting entities with overlapping identifiers does not mess up IO"""
    nlp = English()
    kb = KnowledgeBase(nlp.vocab, entity_vector_length=3)

    vector1 = [0.9, 1.1, 1.01]
    vector2 = [1.8, 2.25, 2.01]
    with pytest.warns(UserWarning):
        kb.set_entities(
            entity_list=["Q1", "Q1"],
            freq_list=[32, 111],
            vector_list=[vector1, vector2],
        )

    assert kb.get_size_entities() == 1

    # dumping to file & loading back in
    with make_tempdir() as d:
        dir_path = ensure_path(d)
        if not dir_path.exists():
            dir_path.mkdir()
        file_path = dir_path / "kb"
        kb.dump(str(file_path))

        kb2 = KnowledgeBase(vocab=nlp.vocab, entity_vector_length=3)
        kb2.load_bulk(str(file_path))

        assert kb2.get_size_entities() == 1
@@ -1,20 +0,0 @@
from spacy.util import load_model_from_path
from spacy.lang.en import English

from ..util import make_tempdir


def test_issue4707():
    """Tests that disabled component names are also excluded from nlp.from_disk
    by default when loading a model.
    """
    nlp = English()
    nlp.add_pipe(nlp.create_pipe("sentencizer"))
    nlp.add_pipe(nlp.create_pipe("entity_ruler"))
    assert nlp.pipe_names == ["sentencizer", "entity_ruler"]
    exclude = ["tokenizer", "sentencizer"]
    with make_tempdir() as tmpdir:
        nlp.to_disk(tmpdir, exclude=exclude)
        new_nlp = load_model_from_path(tmpdir, disable=exclude)
    assert "sentencizer" not in new_nlp.pipe_names
    assert "entity_ruler" in new_nlp.pipe_names
@@ -1,41 +0,0 @@
import pickle
import numpy

from spacy.lang.en import English
from spacy.vocab import Vocab

from spacy.tests.util import make_tempdir


def test_pickle_ner():
    """ Ensure the pickling of the NER goes well"""
    vocab = Vocab(vectors_name="test_vocab_add_vector")
    nlp = English(vocab=vocab)
    ner = nlp.create_pipe("ner", config={"min_action_freq": 342})
    with make_tempdir() as tmp_path:
        with (tmp_path / "ner.pkl").open("wb") as file_:
            pickle.dump(ner, file_)
            assert ner.cfg["min_action_freq"] == 342

        with (tmp_path / "ner.pkl").open("rb") as file_:
            ner2 = pickle.load(file_)
            assert ner2.cfg["min_action_freq"] == 342


def test_issue4725():
    # ensures that this runs correctly and doesn't hang or crash because of the global vectors
    # if it does crash, it's usually because of calling 'spawn' for multiprocessing (e.g. on Windows)
    vocab = Vocab(vectors_name="test_vocab_add_vector")
    data = numpy.ndarray((5, 3), dtype="f")
    data[0] = 1.0
    data[1] = 2.0
    vocab.set_vector("cat", data[0])
    vocab.set_vector("dog", data[1])

    nlp = English(vocab=vocab)
    ner = nlp.create_pipe("ner")
    nlp.add_pipe(ner)
    nlp.begin_training()
    docs = ["Kurt is in London."] * 10
    for _ in nlp.pipe(docs, batch_size=2, n_process=2):
        pass
@@ -1,34 +0,0 @@
from spacy.lang.en import English
from spacy.pipeline import EntityRuler


def test_issue4849():
    nlp = English()

    ruler = EntityRuler(
        nlp,
        patterns=[
            {"label": "PERSON", "pattern": "joe biden", "id": "joe-biden"},
            {"label": "PERSON", "pattern": "bernie sanders", "id": "bernie-sanders"},
        ],
        phrase_matcher_attr="LOWER",
    )

    nlp.add_pipe(ruler)

    text = """
    The left is starting to take aim at Democratic front-runner Joe Biden.
    Sen. Bernie Sanders joined in her criticism: "There is no 'middle ground' when it comes to climate policy."
    """

    # USING 1 PROCESS
    count_ents = 0
    for doc in nlp.pipe([text], n_process=1):
        count_ents += len([ent for ent in doc.ents if ent.ent_id > 0])
    assert count_ents == 2

    # USING 2 PROCESSES
    count_ents = 0
    for doc in nlp.pipe([text], n_process=2):
        count_ents += len([ent for ent in doc.ents if ent.ent_id > 0])
    assert count_ents == 2
@@ -1,40 +0,0 @@
from spacy.lang.en import English
from spacy.tokens import Span, Doc


class CustomPipe:
    name = "my_pipe"

    def __init__(self):
        Span.set_extension("my_ext", getter=self._get_my_ext)
        Doc.set_extension("my_ext", default=None)

    def __call__(self, doc):
        gathered_ext = []
        for sent in doc.sents:
            sent_ext = self._get_my_ext(sent)
            sent._.set("my_ext", sent_ext)
            gathered_ext.append(sent_ext)

        doc._.set("my_ext", "\n".join(gathered_ext))

        return doc

    @staticmethod
    def _get_my_ext(span):
        return str(span.end)


def test_issue4903():
    # ensures that this runs correctly and doesn't hang or crash on Windows / macOS

    nlp = English()
    custom_component = CustomPipe()
    nlp.add_pipe(nlp.create_pipe("sentencizer"))
    nlp.add_pipe(custom_component, after="sentencizer")

    text = ["I like bananas.", "Do you like them?", "No, I prefer wasabi."]
    docs = list(nlp.pipe(text, n_process=2))
    assert docs[0].text == "I like bananas."
    assert docs[1].text == "Do you like them?"
    assert docs[2].text == "No, I prefer wasabi."
@@ -1,8 +0,0 @@
from spacy.gold import Example
from spacy.language import Language


def test_issue4924():
    nlp = Language()
    example = Example.from_dict(nlp.make_doc(""), {})
    nlp.evaluate([example])
@@ -1,6 +1,8 @@
import pytest
from spacy.lang.en import English


@pytest.mark.filterwarnings("ignore::UserWarning")
def test_issue5152():
    # Test that the comparison between a Span and a Token, goes well
    # There was a bug when the number of tokens in the span equaled the number of characters in the token (!)
@@ -8,7 +10,6 @@ def test_issue5152():
    text = nlp("Talk about being boring!")
    text_var = nlp("Talk of being boring!")
    y = nlp("Let")

    span = text[0:3]  # Talk about being
    span_2 = text[0:3]  # Talk about being
    span_3 = text_var[0:3]  # Talk of being
@@ -63,7 +63,8 @@ def tagger():
    # need to add model for two reasons:
    # 1. no model leads to error in serialization,
    # 2. the affected line is the one for model serialization
    tagger.begin_training(pipeline=nlp.pipeline)
    with pytest.warns(UserWarning):
        tagger.begin_training(pipeline=nlp.pipeline)
    return tagger