import pytest
from spacy.lang.en import English
import numpy as np
import spacy
from spacy.tokens import Doc
from spacy.matcher import PhraseMatcher
from spacy.tokens import DocBin
from spacy.util import load_config_from_str
from spacy.training import Example
from spacy.training.initialize import init_nlp
import pickle

from ..util import make_tempdir

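# Regression tests for spaCy issues in the 6730-6950 range.
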
@pytest.mark.issue(6730)
def test_issue6730(en_vocab):
    """Ensure that the KB does not accept empty strings, but otherwise IO works fine."""
    from spacy.kb import KnowledgeBase

    kb = KnowledgeBase(en_vocab, entity_vector_length=3)
    kb.add_entity(entity="1", freq=148, entity_vector=[1, 2, 3])

    with pytest.raises(ValueError):
        kb.add_alias(alias="", entities=["1"], probabilities=[0.4])
    assert kb.contains_alias("") is False

    kb.add_alias(alias="x", entities=["1"], probabilities=[0.2])
    kb.add_alias(alias="y", entities=["1"], probabilities=[0.1])

    with make_tempdir() as tmp_dir:
        kb.to_disk(tmp_dir)
        kb.from_disk(tmp_dir)
    assert kb.get_size_aliases() == 2
    assert set(kb.get_alias_strings()) == {"x", "y"}

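# An empty span (doc[:0]) should have empty text and text_with_ws.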
@pytest.mark.issue(6755)
def test_issue6755(en_tokenizer):
    doc = en_tokenizer("This is a magnificent sentence.")
    span = doc[:0]
    assert span.text_with_ws == ""
    assert span.text == ""

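# Span.char_span should preserve the label passed in.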
@pytest.mark.parametrize(
    "sentence,start_idx,end_idx,label",
    [("Welcome to Mumbai, my friend", 11, 17, "GPE")],
)
@pytest.mark.issue(6815)
def test_issue6815_1(sentence, start_idx, end_idx, label):
    nlp = English()
    doc = nlp(sentence)
    span = doc[:].char_span(start_idx, end_idx, label=label)
    assert span.label_ == label

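# Span.char_span should preserve the kb_id passed in.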
@pytest.mark.parametrize(
    "sentence,start_idx,end_idx,kb_id", [("Welcome to Mumbai, my friend", 11, 17, 5)]
)
@pytest.mark.issue(6815)
def test_issue6815_2(sentence, start_idx, end_idx, kb_id):
    nlp = English()
    doc = nlp(sentence)
    span = doc[:].char_span(start_idx, end_idx, kb_id=kb_id)
    assert span.kb_id == kb_id

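# Span.char_span should preserve the vector passed in.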
@pytest.mark.parametrize(
    "sentence,start_idx,end_idx,vector",
    [("Welcome to Mumbai, my friend", 11, 17, np.array([0.1, 0.2, 0.3]))],
)
@pytest.mark.issue(6815)
def test_issue6815_3(sentence, start_idx, end_idx, vector):
    nlp = English()
    doc = nlp(sentence)
    span = doc[:].char_span(start_idx, end_idx, vector=vector)
    assert (span.vector == vector).all()

@pytest.mark.issue(6839)
def test_issue6839(en_vocab):
    """Ensure that PhraseMatcher accepts Span as input"""
    # fmt: off
    words = ["I", "like", "Spans", "and", "Docs", "in", "my", "input", ",", "and", "nothing", "else", "."]
    # fmt: on
    doc = Doc(en_vocab, words=words)
    span = doc[:8]
    pattern = Doc(en_vocab, words=["Spans", "and", "Docs"])
    matcher = PhraseMatcher(en_vocab)
    matcher.add("SPACY", [pattern])
    matches = matcher(span)
    assert matches

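# Minimal training config used by test_issue6908; TEXTCAT_PLACEHOLDER and
# TRAIN_PLACEHOLDER are substituted with real values inside the test.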
CONFIG_ISSUE_6908 = """
[paths]
train = "TRAIN_PLACEHOLDER"
raw = null
init_tok2vec = null
vectors = null

[system]
seed = 0
gpu_allocator = null

[nlp]
lang = "en"
pipeline = ["textcat"]
tokenizer = {"@tokenizers":"spacy.Tokenizer.v1"}
disabled = []
before_creation = null
after_creation = null
after_pipeline_creation = null
batch_size = 1000

[components]

[components.textcat]
factory = "TEXTCAT_PLACEHOLDER"

[corpora]

[corpora.train]
@readers = "spacy.Corpus.v1"
path = ${paths:train}

[corpora.dev]
@readers = "spacy.Corpus.v1"
path = ${paths:train}

[training]
train_corpus = "corpora.train"
dev_corpus = "corpora.dev"
seed = ${system.seed}
gpu_allocator = ${system.gpu_allocator}
frozen_components = []
before_to_disk = null

[pretraining]

[initialize]
vectors = ${paths.vectors}
init_tok2vec = ${paths.init_tok2vec}
vocab_data = null
lookups = null
before_init = null
after_init = null

[initialize.components]

[initialize.components.textcat]
labels = ['label1', 'label2']

[initialize.tokenizer]
"""

@pytest.mark.parametrize(
    "component_name",
    ["textcat", "textcat_multilabel"],
)
@pytest.mark.issue(6908)
def test_issue6908(component_name):
    """Test initializing textcat with labels in a list"""

    def create_data(out_file):
        nlp = spacy.blank("en")
        doc = nlp.make_doc("Some text")
        doc.cats = {"label1": 0, "label2": 1}
        out_data = DocBin(docs=[doc]).to_bytes()
        with out_file.open("wb") as file_:
            file_.write(out_data)

    with make_tempdir() as tmp_path:
        train_path = tmp_path / "train.spacy"
        create_data(train_path)
        config_str = CONFIG_ISSUE_6908.replace("TEXTCAT_PLACEHOLDER", component_name)
        config_str = config_str.replace("TRAIN_PLACEHOLDER", train_path.as_posix())
        config = load_config_from_str(config_str)
        init_nlp(config)

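# Config with a tok2vec component plus a tagger that listens to it via
# spacy.Tok2VecListener.v1, used by test_issue6950 to check pickling.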
CONFIG_ISSUE_6950 = """
[nlp]
lang = "en"
pipeline = ["tok2vec", "tagger"]

[components]

[components.tok2vec]
factory = "tok2vec"

[components.tok2vec.model]
@architectures = "spacy.Tok2Vec.v1"

[components.tok2vec.model.embed]
@architectures = "spacy.MultiHashEmbed.v1"
width = ${components.tok2vec.model.encode:width}
attrs = ["NORM","PREFIX","SUFFIX","SHAPE"]
rows = [5000,2500,2500,2500]
include_static_vectors = false

[components.tok2vec.model.encode]
@architectures = "spacy.MaxoutWindowEncoder.v1"
width = 96
depth = 4
window_size = 1
maxout_pieces = 3

[components.ner]
factory = "ner"

[components.tagger]
factory = "tagger"

[components.tagger.model]
@architectures = "spacy.Tagger.v1"
nO = null

[components.tagger.model.tok2vec]
@architectures = "spacy.Tok2VecListener.v1"
width = ${components.tok2vec.model.encode:width}
upstream = "*"
"""

@pytest.mark.issue(6950)
def test_issue6950():
    """Test that the nlp object with initialized tok2vec with listeners pickles
    correctly (and doesn't have lambdas).
    """
    nlp = English.from_config(load_config_from_str(CONFIG_ISSUE_6950))
    nlp.initialize(lambda: [Example.from_dict(nlp.make_doc("hello"), {"tags": ["V"]})])
    pickle.dumps(nlp)
    nlp("hello")
    pickle.dumps(nlp)