spaCy/spacy/tests/training/test_readers.py

from typing import Dict, Iterable, Callable
import pytest
from thinc.api import Config
from spacy import Language
from spacy.util import load_model_from_config, registry, resolve_dot_names
from spacy.schemas import ConfigSchemaTraining
from spacy.training import Example


def test_readers():
    """Check that a custom registered reader can drive a minimal training run.

    A reader function is registered under "myreader.v1" and referenced from the
    [corpora] block of the config. The test resolves the train/dev corpora via
    their dot names, runs initialize/update/evaluate with them, and finally
    checks that an additional corpus ("extra") returned by the same reader can
    be resolved from the config as well.
    """
    config_string = """
    [training]

    [corpora]
    @readers = "myreader.v1"

    [nlp]
    lang = "en"
    pipeline = ["tok2vec", "textcat"]

    [components]

    [components.tok2vec]
    factory = "tok2vec"

    [components.textcat]
    factory = "textcat"
    """

    @registry.readers.register("myreader.v1")
    def myreader() -> Dict[str, Callable[[Language], Iterable[Example]]]:
        # Every corpus name maps to the same single-example reader; each
        # reader is called with the nlp object only.
        annots = {"cats": {"POS": 1.0, "NEG": 0.0}}

        def reader(nlp: Language) -> Iterable[Example]:
            doc = nlp.make_doc("This is an example")
            return [Example.from_dict(doc, annots)]

        return {"train": reader, "dev": reader, "extra": reader, "something": reader}

    config = Config().from_str(config_string)
    nlp = load_model_from_config(config, auto_fill=True)
    # Interpolate the whole config first, then pick the section to resolve.
    T = registry.resolve(
        nlp.config.interpolate()["training"], schema=ConfigSchemaTraining
    )
    # The [training] block is empty in the config string above, so these dot
    # names come from the auto-filled defaults.
    dot_names = [T["train_corpus"], T["dev_corpus"]]
    train_corpus, dev_corpus = resolve_dot_names(nlp.config, dot_names)
    assert callable(train_corpus)
    optimizer = T["optimizer"]
    # simulate a training loop
    nlp.initialize(lambda: train_corpus(nlp), sgd=optimizer)
    for example in train_corpus(nlp):
        nlp.update([example], sgd=optimizer)
    scores = nlp.evaluate(list(dev_corpus(nlp)))
    assert scores["cats_score"] == 0.0
    # ensure the pipeline runs
    doc = nlp("Quick test")
    assert doc.cats
    # Corpora other than train/dev must be resolvable from the config too.
    corpora = {"corpora": nlp.config.interpolate()["corpora"]}
    extra_corpus = registry.resolve(corpora)["corpora"]["extra"]
    assert callable(extra_corpus)


@pytest.mark.slow
@pytest.mark.parametrize(
    "reader,additional_config",
    [
        ("ml_datasets.imdb_sentiment.v1", {"train_limit": 10, "dev_limit": 2}),
        ("ml_datasets.dbpedia.v1", {"train_limit": 10, "dev_limit": 2}),
        ("ml_datasets.cmu_movies.v1", {"limit": 10, "freq_cutoff": 200, "split": 0.8}),
    ],
)
def test_cat_readers(reader, additional_config):
    """Run a minimal textcat training loop on each parametrized dataset reader.

    reader: Registered name of the reader to plug into the [corpora] block.
    additional_config: Extra settings merged into the [corpora] block
        alongside the reader name.
    """
    nlp_config_string = """
    [training]

    [corpora]
    @readers = "PLACEHOLDER"

    [nlp]
    lang = "en"
    pipeline = ["tok2vec", "textcat"]

    [components]

    [components.tok2vec]
    factory = "tok2vec"

    [components.textcat]
    factory = "textcat"
    """
    config = Config().from_str(nlp_config_string)
    # Swap the placeholder for the reader under test and add its settings.
    config["corpora"]["@readers"] = reader
    config["corpora"].update(additional_config)
    nlp = load_model_from_config(config, auto_fill=True)
    # Interpolate the full config before selecting the section, as
    # test_readers does. NOTE(review): nlp.config["training"] looks like a
    # plain dict without .interpolate() -- confirm against thinc.
    T = registry.resolve(
        nlp.config.interpolate()["training"], schema=ConfigSchemaTraining
    )
    dot_names = [T["train_corpus"], T["dev_corpus"]]
    train_corpus, dev_corpus = resolve_dot_names(nlp.config, dot_names)
    optimizer = T["optimizer"]
    # simulate a training loop
    nlp.initialize(lambda: train_corpus(nlp), sgd=optimizer)
    for example in train_corpus(nlp):
        assert example.y.cats
        # this shouldn't fail if each training example has at least one positive label
        assert sorted(set(example.y.cats.values())) == [0.0, 1.0]
        nlp.update([example], sgd=optimizer)
    # simulate performance benchmark on dev corpus
    dev_examples = list(dev_corpus(nlp))
    for example in dev_examples:
        # this shouldn't fail if each dev example has at least one positive label
        assert sorted(set(example.y.cats.values())) == [0.0, 1.0]
    scores = nlp.evaluate(dev_examples)
    assert scores["cats_score"]
    # ensure the pipeline runs
    doc = nlp("Quick test")
    assert doc.cats
generalize corpora, dot notation for dev and train corpus 2020-09-17 12:38:59 +03:00			`from typing import Dict, Iterable, Callable`
actual commit with test for custom readers with ml_datasets >= 0.2 2020-09-16 17:41:28 +03:00			`import pytest`
			`from thinc.api import Config`
generalize corpora, dot notation for dev and train corpus 2020-09-17 12:38:59 +03:00			`from spacy import Language`
Refactor CLI 2020-09-28 16:09:59 +03:00			`from spacy.util import load_model_from_config, registry, resolve_dot_names`
			`from spacy.schemas import ConfigSchemaTraining`
generalize corpora, dot notation for dev and train corpus 2020-09-17 12:38:59 +03:00			`from spacy.training import Example`


			`def test_readers():`
			`config_string = """`
			`[training]`
Tidy up and auto-format 2020-09-21 11:59:07 +03:00
generalize corpora, dot notation for dev and train corpus 2020-09-17 12:38:59 +03:00			`[corpora]`
			`@readers = "myreader.v1"`

			`[nlp]`
			`lang = "en"`
			`pipeline = ["tok2vec", "textcat"]`
Tidy up and auto-format 2020-09-21 11:59:07 +03:00
generalize corpora, dot notation for dev and train corpus 2020-09-17 12:38:59 +03:00			`[components]`
Tidy up and auto-format 2020-09-21 11:59:07 +03:00
generalize corpora, dot notation for dev and train corpus 2020-09-17 12:38:59 +03:00			`[components.tok2vec]`
			`factory = "tok2vec"`
Tidy up and auto-format 2020-09-21 11:59:07 +03:00
generalize corpora, dot notation for dev and train corpus 2020-09-17 12:38:59 +03:00			`[components.textcat]`
			`factory = "textcat"`
			`"""`
cleanup and formatting 2020-09-17 12:48:04 +03:00
generalize corpora, dot notation for dev and train corpus 2020-09-17 12:38:59 +03:00			`@registry.readers.register("myreader.v1")`
			`def myreader() -> Dict[str, Callable[[Language, str], Iterable[Example]]]:`
			`annots = {"cats": {"POS": 1.0, "NEG": 0.0}}`
cleanup and formatting 2020-09-17 12:48:04 +03:00
generalize corpora, dot notation for dev and train corpus 2020-09-17 12:38:59 +03:00			`def reader(nlp: Language):`
			`doc = nlp.make_doc(f"This is an example")`
			`return [Example.from_dict(doc, annots)]`
cleanup and formatting 2020-09-17 12:48:04 +03:00
generalize corpora, dot notation for dev and train corpus 2020-09-17 12:38:59 +03:00			`return {"train": reader, "dev": reader, "extra": reader, "something": reader}`

			`config = Config().from_str(config_string)`
Update config resolution to use new Thinc 2020-09-27 23:21:31 +03:00			`nlp = load_model_from_config(config, auto_fill=True)`
Fix config resolution and interpolation TODO: auto-interpolate in Thinc if config is dict (i.e. likely subsection) 2020-09-28 16:34:00 +03:00			`T = registry.resolve(`
			`nlp.config.interpolate()["training"], schema=ConfigSchemaTraining`
			`)`
Fix small issues, resolve_dot_names and debug model 2020-09-29 21:38:35 +03:00			`dot_names = [T["train_corpus"], T["dev_corpus"]]`
			`train_corpus, dev_corpus = resolve_dot_names(nlp.config, dot_names)`
			`assert isinstance(train_corpus, Callable)`
Refactor CLI 2020-09-28 16:09:59 +03:00			`optimizer = T["optimizer"]`
generalize corpora, dot notation for dev and train corpus 2020-09-17 12:38:59 +03:00			`# simulate a training loop`
begin_training -> initialize 2020-09-28 22:35:09 +03:00			`nlp.initialize(lambda: train_corpus(nlp), sgd=optimizer)`
generalize corpora, dot notation for dev and train corpus 2020-09-17 12:38:59 +03:00			`for example in train_corpus(nlp):`
			`nlp.update([example], sgd=optimizer)`
			`scores = nlp.evaluate(list(dev_corpus(nlp)))`
TextCat updates and fixes (#6263) * small fix in example imports * throw error when train_corpus or dev_corpus is not a string * small fix in custom logger example * limit macro_auc to labels with 2 annotations * fix typo * also create parents of output_dir if need be * update documentation of textcat scores * refactor TextCatEnsemble * fix tests for new AUC definition * bump to 3.0.0a42 * update docs * rename to spacy.TextCatEnsemble.v2 * spacy.TextCatEnsemble.v1 in legacy * cleanup * small fix * update to 3.0.0rc2 * fix import that got lost in merge * cursed IDE * fix two typos 2020-10-18 15:50:41 +03:00			`assert scores["cats_score"] == 0.0`
generalize corpora, dot notation for dev and train corpus 2020-09-17 12:38:59 +03:00			`# ensure the pipeline runs`
			`doc = nlp("Quick test")`
			`assert doc.cats`
Fix config resolution and interpolation TODO: auto-interpolate in Thinc if config is dict (i.e. likely subsection) 2020-09-28 16:34:00 +03:00			`corpora = {"corpora": nlp.config.interpolate()["corpora"]}`
			`extra_corpus = registry.resolve(corpora)["corpora"]["extra"]`
generalize corpora, dot notation for dev and train corpus 2020-09-17 12:38:59 +03:00			`assert isinstance(extra_corpus, Callable)`
actual commit with test for custom readers with ml_datasets >= 0.2 2020-09-16 17:41:28 +03:00

			`@pytest.mark.slow`
			`@pytest.mark.parametrize(`
			`"reader,additional_config",`
			`[`
			`("ml_datasets.imdb_sentiment.v1", {"train_limit": 10, "dev_limit": 2}),`
			`("ml_datasets.dbpedia.v1", {"train_limit": 10, "dev_limit": 2}),`
			`("ml_datasets.cmu_movies.v1", {"limit": 10, "freq_cutoff": 200, "split": 0.8}),`
			`],`
			`)`
			`def test_cat_readers(reader, additional_config):`
			`nlp_config_string = """`
			`[training]`
Tidy up and auto-format 2020-09-21 11:59:07 +03:00
generalize corpora, dot notation for dev and train corpus 2020-09-17 12:38:59 +03:00			`[corpora]`
actual commit with test for custom readers with ml_datasets >= 0.2 2020-09-16 17:41:28 +03:00			`@readers = "PLACEHOLDER"`

			`[nlp]`
			`lang = "en"`
			`pipeline = ["tok2vec", "textcat"]`
Tidy up and auto-format 2020-09-21 11:59:07 +03:00
actual commit with test for custom readers with ml_datasets >= 0.2 2020-09-16 17:41:28 +03:00			`[components]`
Tidy up and auto-format 2020-09-21 11:59:07 +03:00
actual commit with test for custom readers with ml_datasets >= 0.2 2020-09-16 17:41:28 +03:00			`[components.tok2vec]`
			`factory = "tok2vec"`
Tidy up and auto-format 2020-09-21 11:59:07 +03:00
actual commit with test for custom readers with ml_datasets >= 0.2 2020-09-16 17:41:28 +03:00			`[components.textcat]`
			`factory = "textcat"`
			`"""`
			`config = Config().from_str(nlp_config_string)`
generalize corpora, dot notation for dev and train corpus 2020-09-17 12:38:59 +03:00			`config["corpora"]["@readers"] = reader`
			`config["corpora"].update(additional_config)`
Update config resolution to use new Thinc 2020-09-27 23:21:31 +03:00			`nlp = load_model_from_config(config, auto_fill=True)`
Fix config resolution and interpolation TODO: auto-interpolate in Thinc if config is dict (i.e. likely subsection) 2020-09-28 16:34:00 +03:00			`T = registry.resolve(`
			`nlp.config["training"].interpolate(), schema=ConfigSchemaTraining`
			`)`
Fix small issues, resolve_dot_names and debug model 2020-09-29 21:38:35 +03:00			`dot_names = [T["train_corpus"], T["dev_corpus"]]`
			`train_corpus, dev_corpus = resolve_dot_names(nlp.config, dot_names)`
Refactor CLI 2020-09-28 16:09:59 +03:00			`optimizer = T["optimizer"]`
actual commit with test for custom readers with ml_datasets >= 0.2 2020-09-16 17:41:28 +03:00			`# simulate a training loop`
begin_training -> initialize 2020-09-28 22:35:09 +03:00			`nlp.initialize(lambda: train_corpus(nlp), sgd=optimizer)`
actual commit with test for custom readers with ml_datasets >= 0.2 2020-09-16 17:41:28 +03:00			`for example in train_corpus(nlp):`
			`assert example.y.cats`
			`# this shouldn't fail if each training example has at least one positive label`
			`assert sorted(list(set(example.y.cats.values()))) == [0.0, 1.0]`
			`nlp.update([example], sgd=optimizer)`
			`# simulate performance benchmark on dev corpus`
			`dev_examples = list(dev_corpus(nlp))`
			`for example in dev_examples:`
			`# this shouldn't fail if each dev example has at least one positive label`
			`assert sorted(list(set(example.y.cats.values()))) == [0.0, 1.0]`
			`scores = nlp.evaluate(dev_examples)`
			`assert scores["cats_score"]`
			`# ensure the pipeline runs`
			`doc = nlp("Quick test")`
			`assert doc.cats`