spaCy/spacy/tests/test_language.py

# coding: utf-8
from __future__ import unicode_literals

import itertools

import pytest
from spacy.compat import is_python2
from spacy.gold import GoldParse
from spacy.language import Language
from spacy.tokens import Doc, Span
from spacy.vocab import Vocab

from .util import add_vecs_to_vocab, assert_docs_equal


@pytest.fixture
def nlp():
    """Build a Language on a fresh Vocab with a two-label "textcat" pipe.

    The text categorizer gets the labels "POSITIVE" and "NEGATIVE" (in that
    order), is added to the pipeline, and begin_training() is called before
    the pipeline is returned.
    """
    pipeline = Language(Vocab())
    classifier = pipeline.create_pipe("textcat")
    classifier.add_label("POSITIVE")
    classifier.add_label("NEGATIVE")
    pipeline.add_pipe(classifier)
    pipeline.begin_training()
    return pipeline


def test_language_update(nlp):
    """Exercise nlp.update with every doc/text x gold/dict input combination.

    Also checks that mismatched list lengths raise IndexError and that an
    annotation dict with an unexpected top-level key raises ValueError.
    """
    text = "hello world"
    annots = {"cats": {"POSITIVE": 1.0, "NEGATIVE": 0.0}}
    wrongkeyannots = {"LABEL": True}
    doc = Doc(nlp.vocab, words=text.split(" "))
    gold = GoldParse(doc, **annots)

    # Accepted input shapes, tried in this order:
    #   doc + gold object, text + dict, doc + dict, text + gold object
    accepted_inputs = [
        (doc, gold),
        (text, annots),
        (doc, annots),
        (text, gold),
    ]
    for example, annotation in accepted_inputs:
        nlp.update([example], [annotation])

    # Update badly: each case must raise the listed exception type
    rejected_inputs = [
        (IndexError, [doc], []),
        (IndexError, [], [gold]),
        (ValueError, [text], [wrongkeyannots]),
    ]
    for expected_error, examples, annotations in rejected_inputs:
        with pytest.raises(expected_error):
            nlp.update(examples, annotations)


def test_language_evaluate(nlp):
    """Exercise nlp.evaluate with every doc/text x gold/dict input combination.

    Also checks that passing a flat [text, gold] list instead of a list of
    (input, annotation) pairs raises an exception.
    """
    text = "hello world"
    annots = {"cats": {"POSITIVE": 1.0, "NEGATIVE": 0.0}}
    doc = Doc(nlp.vocab, words=text.split(" "))
    gold = GoldParse(doc, **annots)

    # Accepted pair shapes, tried in this order:
    #   doc + gold object, text + dict, doc + dict, text + gold object
    accepted_pairs = [
        (doc, gold),
        (text, annots),
        (doc, annots),
        (text, gold),
    ]
    for pair in accepted_pairs:
        nlp.evaluate([pair])

    # Evaluate badly: the items are not wrapped in a tuple
    with pytest.raises(Exception):
        nlp.evaluate([text, gold])


def vector_modification_pipe(doc):
    """Pipeline component that increments doc.vector by 1.

    Returns the same doc object it received.
    """
    doc.vector += 1
    return doc


def userdata_pipe(doc):
    """Pipeline component that stores "bar" under the "foo" key of doc.user_data.

    Returns the same doc object it received.
    """
    user_data = doc.user_data
    user_data["foo"] = "bar"
    return doc


def ner_pipe(doc):
    """Pipeline component that appends a "FIRST"-labelled span to doc.ents.

    The span is constructed as Span(doc, 0, 1, label="FIRST"); the same doc
    object is returned.
    """
    first_span = Span(doc, 0, 1, label="FIRST")
    doc.ents += (first_span,)
    return doc


@pytest.fixture
def sample_vectors():
    """Return (word, vector) pairs used to seed the test vocab.

    Each entry is a tuple of a word string and a three-element list of floats.
    """
    words = ("spacy", "world", "pipe")
    vectors = (
        [-0.1, -0.2, -0.3],
        [-0.2, -0.3, -0.4],
        [0.7, 0.8, 0.9],
    )
    return list(zip(words, vectors))


@pytest.fixture
def nlp2(nlp, sample_vectors):
    """Extend the nlp fixture with sample vectors and three custom components.

    The components are added in this order: vector_modification_pipe,
    ner_pipe, userdata_pipe. The same pipeline object is returned.
    """
    add_vecs_to_vocab(nlp.vocab, sample_vectors)
    extra_components = (vector_modification_pipe, ner_pipe, userdata_pipe)
    for component in extra_components:
        nlp.add_pipe(component)
    return nlp


@pytest.fixture
def texts():
    """Return a fresh list of four short sample sentences."""
    return [
        "Hello world.",
        "This is spacy.",
        "You can use multiprocessing with pipe method.",
        "Please try!",
    ]


@pytest.mark.parametrize("n_process", [1, 2])
def test_language_pipe(nlp2, n_process, texts):
    """Check that nlp.pipe yields the same docs as calling nlp on each text.

    Runs once for each parametrized n_process value (1 and 2), with the sample
    texts repeated ten times and a batch size of 2.
    """
    texts = texts * 10
    expecteds = [nlp2(text) for text in texts]
    # Materialize the results so their count can be verified: zip() stops at
    # the shorter input, so a pipe() that dropped docs (or yielded nothing)
    # would otherwise let the comparison loop pass vacuously.
    docs = list(nlp2.pipe(texts, n_process=n_process, batch_size=2))
    assert len(docs) == len(expecteds)

    for doc, expected_doc in zip(docs, expecteds):
        assert_docs_equal(doc, expected_doc)


@pytest.mark.skipif(
    is_python2, reason="python2 seems to be unable to handle iterator properly"
)
@pytest.mark.parametrize("n_process", [1, 2])
def test_language_pipe_stream(nlp2, n_process, texts):
    """Check that nlp.pipe can consume an endless text stream lazily.

    Runs once for each parametrized n_process value (1 and 2) and compares
    the first n_fetch docs from pipe() against docs produced one at a time.
    """
    # check if nlp.pipe can handle infinite length iterator properly.
    stream_texts = itertools.cycle(texts)
    # tee() gives two independent iterators over the same endless stream:
    # one feeds the reference docs, the other feeds pipe().
    texts0, texts1 = itertools.tee(stream_texts)
    expecteds = (nlp2(text) for text in texts0)
    docs = nlp2.pipe(texts1, n_process=n_process, batch_size=2)

    n_fetch = 20
    n_compared = 0
    for doc, expected_doc in itertools.islice(zip(docs, expecteds), n_fetch):
        assert_docs_equal(doc, expected_doc)
        n_compared += 1
    # islice() and zip() both end silently when an input runs dry, so make
    # sure pipe() really delivered all n_fetch docs.
    assert n_compared == n_fetch
💫 Support simple training format in nlp.evaluate and add tests (#4033) * Support simple training format in nlp.evaluate and add tests * Update docs [ci skip] 2019-07-27 18:30:18 +03:00			`# coding: utf-8`
			`from __future__ import unicode_literals`

multiprocessing pipe (#1303) (#4371) * refactor: separate formatting docs and golds in Language.update * fix return typo * add pipe test * unpickleable object cannot be assigned to p.map * passed test pipe * passed test! * pipe terminate * try pipe * passed test * fix ch * add comments * fix len(texts) * add comment * add comment * fix: multiprocessing of pipe is not supported in 2 * test: use assert_docs_equal * fix: is_python3 -> is_python2 * fix: change _pipe arg to use functools.partial * test: add vector modification test * test: add sample ner_pipe and user_data pipe * add warnings test * test: fix user warnings * test: fix warnings capture * fix: remove islice import * test: remove warnings test * test: add stream test * test: rename * fix: multiproc stream * fix: stream pipe * add comment * mp.Pipe seems to be able to use with relative small data * test: skip stream test in python2 * sort imports * test: add reason to skiptest * fix: use pipe for docs communucation * add comments * add comment 2019-10-08 13:20:55 +03:00			`import itertools`

💫 Support simple training format in nlp.evaluate and add tests (#4033) * Support simple training format in nlp.evaluate and add tests * Update docs [ci skip] 2019-07-27 18:30:18 +03:00			`import pytest`
multiprocessing pipe (#1303) (#4371) * refactor: separate formatting docs and golds in Language.update * fix return typo * add pipe test * unpickleable object cannot be assigned to p.map * passed test pipe * passed test! * pipe terminate * try pipe * passed test * fix ch * add comments * fix len(texts) * add comment * add comment * fix: multiprocessing of pipe is not supported in 2 * test: use assert_docs_equal * fix: is_python3 -> is_python2 * fix: change _pipe arg to use functools.partial * test: add vector modification test * test: add sample ner_pipe and user_data pipe * add warnings test * test: fix user warnings * test: fix warnings capture * fix: remove islice import * test: remove warnings test * test: add stream test * test: rename * fix: multiproc stream * fix: stream pipe * add comment * mp.Pipe seems to be able to use with relative small data * test: skip stream test in python2 * sort imports * test: add reason to skiptest * fix: use pipe for docs communucation * add comments * add comment 2019-10-08 13:20:55 +03:00			`from spacy.compat import is_python2`
💫 Support simple training format in nlp.evaluate and add tests (#4033) * Support simple training format in nlp.evaluate and add tests * Update docs [ci skip] 2019-07-27 18:30:18 +03:00			`from spacy.gold import GoldParse`
multiprocessing pipe (#1303) (#4371) * refactor: separate formatting docs and golds in Language.update * fix return typo * add pipe test * unpickleable object cannot be assigned to p.map * passed test pipe * passed test! * pipe terminate * try pipe * passed test * fix ch * add comments * fix len(texts) * add comment * add comment * fix: multiprocessing of pipe is not supported in 2 * test: use assert_docs_equal * fix: is_python3 -> is_python2 * fix: change _pipe arg to use functools.partial * test: add vector modification test * test: add sample ner_pipe and user_data pipe * add warnings test * test: fix user warnings * test: fix warnings capture * fix: remove islice import * test: remove warnings test * test: add stream test * test: rename * fix: multiproc stream * fix: stream pipe * add comment * mp.Pipe seems to be able to use with relative small data * test: skip stream test in python2 * sort imports * test: add reason to skiptest * fix: use pipe for docs communucation * add comments * add comment 2019-10-08 13:20:55 +03:00			`from spacy.language import Language`
			`from spacy.tokens import Doc, Span`
			`from spacy.vocab import Vocab`

			`from .util import add_vecs_to_vocab, assert_docs_equal`
💫 Support simple training format in nlp.evaluate and add tests (#4033) * Support simple training format in nlp.evaluate and add tests * Update docs [ci skip] 2019-07-27 18:30:18 +03:00

			`@pytest.fixture`
			`def nlp():`
			`nlp = Language(Vocab())`
			`textcat = nlp.create_pipe("textcat")`
			`for label in ("POSITIVE", "NEGATIVE"):`
			`textcat.add_label(label)`
			`nlp.add_pipe(textcat)`
			`nlp.begin_training()`
			`return nlp`


			`def test_language_update(nlp):`
			`text = "hello world"`
			`annots = {"cats": {"POSITIVE": 1.0, "NEGATIVE": 0.0}}`
Raise error if annotation dict in simple training style has unexpected keys #4074 (#4079) * adding enhancement #4074. * modified behavior to strictly require top level dictionary keys - issue #4074 * pass expected keys to error message and add links as expected top level key 2019-08-06 12:01:25 +03:00			`wrongkeyannots = {"LABEL": True}`
💫 Support simple training format in nlp.evaluate and add tests (#4033) * Support simple training format in nlp.evaluate and add tests * Update docs [ci skip] 2019-07-27 18:30:18 +03:00			`doc = Doc(nlp.vocab, words=text.split(" "))`
			`gold = GoldParse(doc, **annots)`
			`# Update with doc and gold objects`
			`nlp.update([doc], [gold])`
			`# Update with text and dict`
			`nlp.update([text], [annots])`
			`# Update with doc object and dict`
			`nlp.update([doc], [annots])`
			`# Update with text and gold object`
			`nlp.update([text], [gold])`
			`# Update badly`
			`with pytest.raises(IndexError):`
			`nlp.update([doc], [])`
			`with pytest.raises(IndexError):`
			`nlp.update([], [gold])`
Raise error if annotation dict in simple training style has unexpected keys #4074 (#4079) * adding enhancement #4074. * modified behavior to strictly require top level dictionary keys - issue #4074 * pass expected keys to error message and add links as expected top level key 2019-08-06 12:01:25 +03:00			`with pytest.raises(ValueError):`
			`nlp.update([text], [wrongkeyannots])`
💫 Support simple training format in nlp.evaluate and add tests (#4033) * Support simple training format in nlp.evaluate and add tests * Update docs [ci skip] 2019-07-27 18:30:18 +03:00

			`def test_language_evaluate(nlp):`
			`text = "hello world"`
			`annots = {"cats": {"POSITIVE": 1.0, "NEGATIVE": 0.0}}`
			`doc = Doc(nlp.vocab, words=text.split(" "))`
			`gold = GoldParse(doc, **annots)`
			`# Evaluate with doc and gold objects`
			`nlp.evaluate([(doc, gold)])`
			`# Evaluate with text and dict`
			`nlp.evaluate([(text, annots)])`
			`# Evaluate with doc object and dict`
			`nlp.evaluate([(doc, annots)])`
			`# Evaluate with text and gold object`
			`nlp.evaluate([(text, gold)])`
			`# Evaluate badly`
			`with pytest.raises(Exception):`
			`nlp.evaluate([text, gold])`
multiprocessing pipe (#1303) (#4371) * refactor: separate formatting docs and golds in Language.update * fix return typo * add pipe test * unpickleable object cannot be assigned to p.map * passed test pipe * passed test! * pipe terminate * try pipe * passed test * fix ch * add comments * fix len(texts) * add comment * add comment * fix: multiprocessing of pipe is not supported in 2 * test: use assert_docs_equal * fix: is_python3 -> is_python2 * fix: change _pipe arg to use functools.partial * test: add vector modification test * test: add sample ner_pipe and user_data pipe * add warnings test * test: fix user warnings * test: fix warnings capture * fix: remove islice import * test: remove warnings test * test: add stream test * test: rename * fix: multiproc stream * fix: stream pipe * add comment * mp.Pipe seems to be able to use with relative small data * test: skip stream test in python2 * sort imports * test: add reason to skiptest * fix: use pipe for docs communucation * add comments * add comment 2019-10-08 13:20:55 +03:00

			`def vector_modification_pipe(doc):`
			`doc.vector += 1`
			`return doc`


			`def userdata_pipe(doc):`
			`doc.user_data["foo"] = "bar"`
			`return doc`


			`def ner_pipe(doc):`
			`span = Span(doc, 0, 1, label="FIRST")`
			`doc.ents += (span,)`
			`return doc`


			`@pytest.fixture`
			`def sample_vectors():`
			`return [`
			`("spacy", [-0.1, -0.2, -0.3]),`
			`("world", [-0.2, -0.3, -0.4]),`
			`("pipe", [0.7, 0.8, 0.9]),`
			`]`


			`@pytest.fixture`
			`def nlp2(nlp, sample_vectors):`
			`add_vecs_to_vocab(nlp.vocab, sample_vectors)`
			`nlp.add_pipe(vector_modification_pipe)`
			`nlp.add_pipe(ner_pipe)`
			`nlp.add_pipe(userdata_pipe)`
			`return nlp`


			`@pytest.fixture`
			`def texts():`
			`data = [`
			`"Hello world.",`
			`"This is spacy.",`
			`"You can use multiprocessing with pipe method.",`
			`"Please try!",`
			`]`
			`return data`


			`@pytest.mark.parametrize("n_process", [1, 2])`
			`def test_language_pipe(nlp2, n_process, texts):`
			`texts = texts * 10`
			`expecteds = [nlp2(text) for text in texts]`
			`docs = nlp2.pipe(texts, n_process=n_process, batch_size=2)`

			`for doc, expected_doc in zip(docs, expecteds):`
			`assert_docs_equal(doc, expected_doc)`


			`@pytest.mark.skipif(`
			`is_python2, reason="python2 seems to be unable to handle iterator properly"`
			`)`
			`@pytest.mark.parametrize("n_process", [1, 2])`
			`def test_language_pipe_stream(nlp2, n_process, texts):`
			`# check if nlp.pipe can handle infinite length iterator properly.`
			`stream_texts = itertools.cycle(texts)`
			`texts0, texts1 = itertools.tee(stream_texts)`
			`expecteds = (nlp2(text) for text in texts0)`
			`docs = nlp2.pipe(texts1, n_process=n_process, batch_size=2)`

			`n_fetch = 20`
			`for doc, expected_doc in itertools.islice(zip(docs, expecteds), n_fetch):`
			`assert_docs_equal(doc, expected_doc)`