spaCy/spacy/tests/pipeline/test_analysis.py

import warnings

import pytest
from mock import Mock, ANY

import spacy.language
from spacy.language import Language, component
from spacy.analysis import print_summary, validate_attrs
from spacy.analysis import get_assigns_for_attr, get_requires_for_attr


def test_component_decorator_function():
    """Check that decorating a plain function with ``component`` sets its
    ``name`` while leaving the docstring and call behaviour untouched."""

    def test_component(doc):
        """docstring"""
        return doc

    # Apply the decorator by hand; this is what the @-syntax expands to.
    test_component = component(name="test")(test_component)

    # The decorated callable must still hand its input straight back.
    passthrough = test_component("foo")
    assert passthrough == "foo"
    assert test_component.__doc__ == "docstring"
    assert test_component.name == "test"


def test_component_decorator_class():
    """Check that decorating a class with ``component`` sets its ``name``
    and preserves class attributes, methods and docstrings, both on the
    class itself and on an instance of it."""

    @component(name="test")
    class TestComponent(object):
        """docstring1"""

        foo = "bar"

        def __call__(self, doc):
            """docstring2"""
            return doc

        def custom(self, x):
            """docstring3"""
            return x

    assert TestComponent.name == "test"
    instance = TestComponent()

    # Attributes, methods and docstrings have to look the same whether they
    # are reached through the class or through an instance.
    for target in (TestComponent, instance):
        assert target.foo == "bar"
        assert hasattr(target, "custom")
        assert target.__doc__ == "docstring1"
        assert target.__call__.__doc__ == "docstring2"
        assert target.custom.__doc__ == "docstring3"

    # Calling the instance and its custom method returns the argument as-is.
    assert instance("foo") == "foo"
    assert instance.custom("bar") == "bar"


def test_component_decorator_assigns():
    """Test the ``assigns``/``requires`` metadata attached by ``component``.

    Registers three function components, adds them to a blank pipeline and
    checks factory registration, pipe names, the factory each pipe came
    from, and the results of the attribute-lookup helpers.

    Pipeline analysis is switched on only for the duration of this test;
    the previous value of the flag is restored afterwards so the setting
    does not leak into tests that run later.
    """
    previous_flag = spacy.language.ENABLE_PIPELINE_ANALYSIS
    spacy.language.ENABLE_PIPELINE_ANALYSIS = True
    try:

        @component("c1", assigns=["token.tag", "doc.tensor"])
        def test_component1(doc):
            return doc

        @component(
            "c2",
            requires=["token.tag", "token.pos"],
            assigns=["token.lemma", "doc.tensor"],
        )
        def test_component2(doc):
            return doc

        @component("c3", requires=["token.lemma"], assigns=["token._.custom_lemma"])
        def test_component3(doc):
            return doc

        # Decorating a component also registers it as a factory by name.
        assert "c1" in Language.factories
        assert "c2" in Language.factories
        assert "c3" in Language.factories

        nlp = Language()
        nlp.add_pipe(test_component1)
        # Adding c2 is expected to warn -- presumably because it requires
        # "token.pos", which no earlier component assigns.
        with pytest.warns(UserWarning):
            nlp.add_pipe(test_component2)
        nlp.add_pipe(test_component3)
        assigns_tensor = get_assigns_for_attr(nlp.pipeline, "doc.tensor")
        assert [name for name, _ in assigns_tensor] == ["c1", "c2"]

        # A pipe created from the "c1" factory but added under another name
        # keeps pointing at its original factory.
        test_component4 = nlp.create_pipe("c1")
        assert test_component4.name == "c1"
        assert test_component4.factory == "c1"
        nlp.add_pipe(test_component4, name="c4")
        assert nlp.pipe_names == ["c1", "c2", "c3", "c4"]
        assert "c4" not in Language.factories
        assert nlp.pipe_factories["c1"] == "c1"
        assert nlp.pipe_factories["c4"] == "c1"

        assigns_tensor = get_assigns_for_attr(nlp.pipeline, "doc.tensor")
        assert [name for name, _ in assigns_tensor] == ["c1", "c2", "c4"]
        requires_pos = get_requires_for_attr(nlp.pipeline, "token.pos")
        assert [name for name, _ in requires_pos] == ["c2"]
        assert print_summary(nlp, no_print=True)
        assert nlp("hello world")
    finally:
        # Always put the module-level flag back, even if an assertion failed.
        spacy.language.ENABLE_PIPELINE_ANALYSIS = previous_flag


def test_component_factories_from_nlp():
    """Test that class components can implement a from_nlp classmethod that
    gives them access to the nlp object and config via the factory."""

    class TestComponent5(object):
        def __call__(self, doc):
            return doc

    # Stand-in for from_nlp: records how it was called and returns a real
    # component instance so the pipeline can still be run.
    from_nlp_mock = Mock(return_value=TestComponent5())
    TestComponent5.from_nlp = classmethod(from_nlp_mock)
    # Decorate only after from_nlp has been attached to the class.
    TestComponent5 = component("c5")(TestComponent5)

    assert "c5" in Language.factories
    nlp = Language()
    created_pipe = nlp.create_pipe("c5", config={"foo": "bar"})
    nlp.add_pipe(created_pipe)
    assert nlp("hello world")
    # First positional argument is the class itself (hence ANY), followed by
    # the nlp object and the model, which the factory initializes to None.
    # The config entries arrive as keyword arguments.
    from_nlp_mock.assert_called_once_with(ANY, nlp, None, foo="bar")


def test_analysis_validate_attrs_valid():
    """Check that well-formed attribute strings pass ``validate_attrs``,
    both as one list and one at a time, and that a single bad entry in an
    otherwise valid list raises ``ValueError``."""
    valid_attrs = ["doc.sents", "doc.ents", "token.tag", "token._.xyz", "span._.xyz"]
    # The whole list at once ...
    assert validate_attrs(valid_attrs)
    # ... and each attribute in a list of its own.
    assert all(validate_attrs([single]) for single in valid_attrs)
    # One unknown attribute is enough to reject the list.
    mixed_attrs = ["doc.sents", "doc.xyz"]
    with pytest.raises(ValueError):
        validate_attrs(mixed_attrs)


@pytest.mark.parametrize(
    "attr",
    [
        "doc",
        "doc_ents",
        "doc.xyz",
        "token.xyz",
        "token.tag_",
        "token.tag.xyz",
        "token._.xyz.abc",
        "span.label",
    ],
)
def test_analysis_validate_attrs_invalid(attr):
    """Check that each parametrized attribute string is rejected by
    ``validate_attrs`` with a ``ValueError``."""
    # validate_attrs takes a list, so wrap the single value under test.
    single_attr_list = [attr]
    with pytest.raises(ValueError):
        validate_attrs(single_attr_list)


def test_analysis_validate_attrs_remove_pipe():
    """Test that attributes are validated correctly on remove.

    Adding ``c2`` is expected to warn; removing it again must not emit any
    warning. Pipeline analysis is enabled only while this test runs, and
    the previous value of the flag is restored afterwards.
    """
    previous_flag = spacy.language.ENABLE_PIPELINE_ANALYSIS
    spacy.language.ENABLE_PIPELINE_ANALYSIS = True
    try:

        @component("c1", assigns=["token.tag"])
        def c1(doc):
            return doc

        @component("c2", requires=["token.pos"])
        def c2(doc):
            return doc

        nlp = Language()
        nlp.add_pipe(c1)
        with pytest.warns(UserWarning):
            nlp.add_pipe(c2)
        # Record every warning raised during removal. This replaces
        # pytest.warns(None), which newer pytest versions no longer accept.
        with warnings.catch_warnings(record=True) as record:
            warnings.simplefilter("always")
            nlp.remove_pipe("c2")
        assert not record
    finally:
        # Always put the module-level flag back, even if an assertion failed.
        spacy.language.ENABLE_PIPELINE_ANALYSIS = previous_flag