spaCy/spacy/tests/pipeline/test_analysis.py

import spacy.language
from spacy.language import Language, component
from spacy.pipe_analysis import print_summary, validate_attrs
from spacy.pipe_analysis import get_assigns_for_attr, get_requires_for_attr
from spacy.pipe_analysis import count_pipeline_interdependencies
from mock import Mock, ANY
import pytest


def test_component_decorator_function():
    """Check that decorating a plain function exposes the component name and
    leaves the function's docstring and call behaviour intact."""

    def test_component(doc):
        """docstring"""
        return doc

    # Apply the decorator by hand; this is what the ``@`` syntax expands to.
    decorated = component(name="test")(test_component)

    expected_attributes = {"name": "test", "__doc__": "docstring"}
    for attribute, expected in expected_attributes.items():
        assert getattr(decorated, attribute) == expected
    assert decorated("foo") == "foo"


def test_component_decorator_class():
    """Check that decorating a class exposes the component name and keeps the
    class attributes, methods and docstrings, both on the class itself and on
    an instance of it."""

    class TestComponent(object):
        """docstring1"""

        foo = "bar"

        def __call__(self, doc):
            """docstring2"""
            return doc

        def custom(self, x):
            """docstring3"""
            return x

    # Apply the decorator by hand; this is what the ``@`` syntax expands to.
    TestComponent = component(name="test")(TestComponent)

    assert TestComponent.name == "test"
    instance = TestComponent()
    # The same expectations hold for the class and for an instance of it.
    for target in (TestComponent, instance):
        assert target.foo == "bar"
        assert hasattr(target, "custom")
        assert target.__doc__ == "docstring1"
        assert target.__call__.__doc__ == "docstring2"
        assert target.custom.__doc__ == "docstring3"
    # Only the instance is callable as a component.
    assert instance("foo") == "foo"
    assert instance.custom("bar") == "bar"


def test_component_decorator_assigns():
    """Test the ``assigns``/``requires`` metadata declared via the decorator.

    Registers three function components, adds them to a blank pipeline and
    checks the factory registry, the pipe names, the pipe-to-factory mapping
    and the results of the attribute lookup helpers. Adding ``c2`` is expected
    to emit a ``UserWarning`` (presumably because ``token.pos`` is not assigned
    by an earlier component -- confirm against the analysis code).

    The pipeline analysis flag is switched on for the duration of the test and
    restored afterwards so that the global setting does not leak into tests
    that run later in the same session.
    """
    previous_flag = spacy.language.ENABLE_PIPELINE_ANALYSIS
    spacy.language.ENABLE_PIPELINE_ANALYSIS = True
    try:

        @component("c1", assigns=["token.tag", "doc.tensor"])
        def test_component1(doc):
            return doc

        @component(
            "c2",
            requires=["token.tag", "token.pos"],
            assigns=["token.lemma", "doc.tensor"],
        )
        def test_component2(doc):
            return doc

        @component("c3", requires=["token.lemma"], assigns=["token._.custom_lemma"])
        def test_component3(doc):
            return doc

        # The decorator registers every component under its name.
        assert "c1" in Language.factories
        assert "c2" in Language.factories
        assert "c3" in Language.factories

        nlp = Language()
        nlp.add_pipe(test_component1)
        with pytest.warns(UserWarning):
            nlp.add_pipe(test_component2)
        nlp.add_pipe(test_component3)
        assigns_tensor = get_assigns_for_attr(nlp.pipeline, "doc.tensor")
        assert [name for name, _ in assigns_tensor] == ["c1", "c2"]

        # Create a second pipe from the "c1" factory and add it under a new name.
        test_component4 = nlp.create_pipe("c1")
        assert test_component4.name == "c1"
        assert test_component4.factory == "c1"
        nlp.add_pipe(test_component4, name="c4")
        assert nlp.pipe_names == ["c1", "c2", "c3", "c4"]
        # Renaming the pipe must not register a new factory.
        assert "c4" not in Language.factories
        assert nlp.pipe_factories["c1"] == "c1"
        assert nlp.pipe_factories["c4"] == "c1"

        assigns_tensor = get_assigns_for_attr(nlp.pipeline, "doc.tensor")
        assert [name for name, _ in assigns_tensor] == ["c1", "c2", "c4"]
        requires_pos = get_requires_for_attr(nlp.pipeline, "token.pos")
        assert [name for name, _ in requires_pos] == ["c2"]
        assert print_summary(nlp, no_print=True)
        assert nlp("hello world")
    finally:
        # Undo the global flag change regardless of the test outcome.
        spacy.language.ENABLE_PIPELINE_ANALYSIS = previous_flag


def test_component_factories_from_nlp():
    """Check that a class component may provide a ``from_nlp`` classmethod and
    that creating the pipe through its factory calls it with the nlp object
    and the config."""

    class TestComponent5(object):
        def __call__(self, doc):
            return doc

    # Stand-in for from_nlp that records its call and hands back a component.
    from_nlp_mock = Mock(return_value=TestComponent5())
    TestComponent5.from_nlp = classmethod(from_nlp_mock)
    TestComponent5 = component("c5")(TestComponent5)

    assert "c5" in Language.factories
    nlp = Language()
    nlp.add_pipe(nlp.create_pipe("c5", config={"foo": "bar"}))
    assert nlp("hello world")
    # A classmethod receives the class as its first argument, so ANY is used
    # for that slot; the model argument is initialized to None by the factory.
    from_nlp_mock.assert_called_once_with(ANY, nlp, None, foo="bar")


def test_analysis_validate_attrs_valid():
    """Valid attribute names pass validation as a group and one at a time,
    while a list containing a single invalid name raises ``ValueError``."""
    valid_names = ["doc.sents", "doc.ents", "token.tag", "token._.xyz", "span._.xyz"]
    assert validate_attrs(valid_names)
    assert all(validate_attrs([name]) for name in valid_names)
    with pytest.raises(ValueError):
        validate_attrs(["doc.sents", "doc.xyz"])


@pytest.mark.parametrize(
    "attr",
    [
        "doc", "doc_ents", "doc.xyz", "token.xyz",
        "token.tag_", "token.tag.xyz", "token._.xyz.abc", "span.label",
    ],
)
def test_analysis_validate_attrs_invalid(attr):
    """Every parametrized attribute name must be rejected with ``ValueError``."""
    single_attr = [attr]
    with pytest.raises(ValueError):
        validate_attrs(single_attr)


def test_analysis_validate_attrs_remove_pipe():
    """Test that attributes are validated correctly on remove.

    Adding ``c2`` is expected to emit a ``UserWarning``; removing it again must
    not emit any warning at all.

    Warnings raised during removal are captured with
    ``warnings.catch_warnings(record=True)`` rather than ``pytest.warns(None)``,
    which is deprecated in pytest 7 and rejected by pytest 8. The pipeline
    analysis flag is restored afterwards so the global setting does not leak
    into tests that run later in the same session.
    """
    import warnings

    previous_flag = spacy.language.ENABLE_PIPELINE_ANALYSIS
    spacy.language.ENABLE_PIPELINE_ANALYSIS = True
    try:

        @component("c1", assigns=["token.tag"])
        def c1(doc):
            return doc

        @component("c2", requires=["token.pos"])
        def c2(doc):
            return doc

        nlp = Language()
        nlp.add_pipe(c1)
        with pytest.warns(UserWarning):
            nlp.add_pipe(c2)
        with warnings.catch_warnings(record=True) as record:
            # Record every warning, including ones already shown once.
            warnings.simplefilter("always")
            nlp.remove_pipe("c2")
        assert not record
    finally:
        # Undo the global flag change regardless of the test outcome.
        spacy.language.ENABLE_PIPELINE_ANALYSIS = previous_flag


def test_pipe_interdependencies():
    """Check the interdependency counts for a two-component pipeline in which
    the second component requires the attribute assigned by the first; the
    expected result is ``[1, 0]``."""

    class Fancifier:
        name = "fancifier"
        assigns = ("doc._.fancy",)
        requires = ()

    class FancyNeeder:
        name = "needer"
        assigns = ()
        requires = ("doc._.fancy",)

    # Build the (name, component) pairs in pipeline order.
    components = (Fancifier(), FancyNeeder())
    pipeline = [(pipe.name, pipe) for pipe in components]
    assert count_pipeline_interdependencies(pipeline) == [1, 0]
Component decorator and component analysis (#4517) * Add work in progress * Update analysis helpers and component decorator * Fix porting of docstrings for Python 2 * Fix docstring stuff on Python 2 * Support meta factories when loading model * Put auto pipeline analysis behind flag for now * Analyse pipes on remove_pipe and replace_pipe * Move analysis to root for now Try to find a better place for it, but it needs to go for now to avoid circular imports * Simplify decorator Don't return a wrapped class and instead just write to the object * Update existing components and factories * Add condition in factory for classes vs. functions * Add missing from_nlp classmethods * Add "retokenizes" to printed overview * Update assigns/requires declarations of builtins * Only return data if no_print is enabled * Use multiline table for overview * Don't support Span * Rewrite errors/warnings and move them to spacy.errors 2019-10-27 15:35:49 +03:00			`import spacy.language`
			`from spacy.language import Language, component`
Rename spacy.analysis to spacy.pipe_analysis 2020-05-22 18:42:06 +03:00			`from spacy.pipe_analysis import print_summary, validate_attrs`
			`from spacy.pipe_analysis import get_assigns_for_attr, get_requires_for_attr`
			`from spacy.pipe_analysis import count_pipeline_interdependencies`
Component decorator and component analysis (#4517) * Add work in progress * Update analysis helpers and component decorator * Fix porting of docstrings for Python 2 * Fix docstring stuff on Python 2 * Support meta factories when loading model * Put auto pipeline analysis behind flag for now * Analyse pipes on remove_pipe and replace_pipe * Move analysis to root for now Try to find a better place for it, but it needs to go for now to avoid circular imports * Simplify decorator Don't return a wrapped class and instead just write to the object * Update existing components and factories * Add condition in factory for classes vs. functions * Add missing from_nlp classmethods * Add "retokenizes" to printed overview * Update assigns/requires declarations of builtins * Only return data if no_print is enabled * Use multiline table for overview * Don't support Span * Rewrite errors/warnings and move them to spacy.errors 2019-10-27 15:35:49 +03:00			`from mock import Mock, ANY`
			`import pytest`


			`def test_component_decorator_function():`
			`@component(name="test")`
			`def test_component(doc):`
			`"""docstring"""`
			`return doc`

			`assert test_component.name == "test"`
Drop Python 2.7 and 3.5 (#4828) * Remove unicode declarations * Remove Python 3.5 and 2.7 from CI * Don't require pathlib * Replace compat helpers * Remove OrderedDict * Use f-strings * Set Cython compiler language level * Fix typo * Re-add OrderedDict for Table * Update setup.cfg * Revert CONTRIBUTING.md * Revert lookups.md * Revert top-level.md * Small adjustments and docs [ci skip] 2019-12-22 03:53:56 +03:00			`assert test_component.__doc__ == "docstring"`
Component decorator and component analysis (#4517) * Add work in progress * Update analysis helpers and component decorator * Fix porting of docstrings for Python 2 * Fix docstring stuff on Python 2 * Support meta factories when loading model * Put auto pipeline analysis behind flag for now * Analyse pipes on remove_pipe and replace_pipe * Move analysis to root for now Try to find a better place for it, but it needs to go for now to avoid circular imports * Simplify decorator Don't return a wrapped class and instead just write to the object * Update existing components and factories * Add condition in factory for classes vs. functions * Add missing from_nlp classmethods * Add "retokenizes" to printed overview * Update assigns/requires declarations of builtins * Only return data if no_print is enabled * Use multiline table for overview * Don't support Span * Rewrite errors/warnings and move them to spacy.errors 2019-10-27 15:35:49 +03:00			`assert test_component("foo") == "foo"`


			`def test_component_decorator_class():`
			`@component(name="test")`
			`class TestComponent(object):`
			`"""docstring1"""`

			`foo = "bar"`

			`def __call__(self, doc):`
			`"""docstring2"""`
			`return doc`

			`def custom(self, x):`
			`"""docstring3"""`
			`return x`

			`assert TestComponent.name == "test"`
			`assert TestComponent.foo == "bar"`
			`assert hasattr(TestComponent, "custom")`
			`test_component = TestComponent()`
			`assert test_component.foo == "bar"`
			`assert test_component("foo") == "foo"`
			`assert hasattr(test_component, "custom")`
			`assert test_component.custom("bar") == "bar"`
Drop Python 2.7 and 3.5 (#4828) * Remove unicode declarations * Remove Python 3.5 and 2.7 from CI * Don't require pathlib * Replace compat helpers * Remove OrderedDict * Use f-strings * Set Cython compiler language level * Fix typo * Re-add OrderedDict for Table * Update setup.cfg * Revert CONTRIBUTING.md * Revert lookups.md * Revert top-level.md * Small adjustments and docs [ci skip] 2019-12-22 03:53:56 +03:00			`assert TestComponent.__doc__ == "docstring1"`
			`assert TestComponent.__call__.__doc__ == "docstring2"`
			`assert TestComponent.custom.__doc__ == "docstring3"`
			`assert test_component.__doc__ == "docstring1"`
			`assert test_component.__call__.__doc__ == "docstring2"`
			`assert test_component.custom.__doc__ == "docstring3"`
Component decorator and component analysis (#4517) * Add work in progress * Update analysis helpers and component decorator * Fix porting of docstrings for Python 2 * Fix docstring stuff on Python 2 * Support meta factories when loading model * Put auto pipeline analysis behind flag for now * Analyse pipes on remove_pipe and replace_pipe * Move analysis to root for now Try to find a better place for it, but it needs to go for now to avoid circular imports * Simplify decorator Don't return a wrapped class and instead just write to the object * Update existing components and factories * Add condition in factory for classes vs. functions * Add missing from_nlp classmethods * Add "retokenizes" to printed overview * Update assigns/requires declarations of builtins * Only return data if no_print is enabled * Use multiline table for overview * Don't support Span * Rewrite errors/warnings and move them to spacy.errors 2019-10-27 15:35:49 +03:00

			`def test_component_decorator_assigns():`
			`spacy.language.ENABLE_PIPELINE_ANALYSIS = True`

			`@component("c1", assigns=["token.tag", "doc.tensor"])`
			`def test_component1(doc):`
			`return doc`

			`@component(`
			`"c2", requires=["token.tag", "token.pos"], assigns=["token.lemma", "doc.tensor"]`
			`)`
			`def test_component2(doc):`
			`return doc`

			`@component("c3", requires=["token.lemma"], assigns=["token._.custom_lemma"])`
			`def test_component3(doc):`
			`return doc`

			`assert "c1" in Language.factories`
			`assert "c2" in Language.factories`
			`assert "c3" in Language.factories`

			`nlp = Language()`
			`nlp.add_pipe(test_component1)`
			`with pytest.warns(UserWarning):`
			`nlp.add_pipe(test_component2)`
			`nlp.add_pipe(test_component3)`
			`assigns_tensor = get_assigns_for_attr(nlp.pipeline, "doc.tensor")`
			`assert [name for name, _ in assigns_tensor] == ["c1", "c2"]`
			`test_component4 = nlp.create_pipe("c1")`
			`assert test_component4.name == "c1"`
			`assert test_component4.factory == "c1"`
			`nlp.add_pipe(test_component4, name="c4")`
			`assert nlp.pipe_names == ["c1", "c2", "c3", "c4"]`
			`assert "c4" not in Language.factories`
			`assert nlp.pipe_factories["c1"] == "c1"`
			`assert nlp.pipe_factories["c4"] == "c1"`
			`assigns_tensor = get_assigns_for_attr(nlp.pipeline, "doc.tensor")`
			`assert [name for name, _ in assigns_tensor] == ["c1", "c2", "c4"]`
			`requires_pos = get_requires_for_attr(nlp.pipeline, "token.pos")`
			`assert [name for name, _ in requires_pos] == ["c2"]`
			`assert print_summary(nlp, no_print=True)`
			`assert nlp("hello world")`


			`def test_component_factories_from_nlp():`
			`"""Test that class components can implement a from_nlp classmethod that`
			`gives them access to the nlp object and config via the factory."""`

			`class TestComponent5(object):`
			`def __call__(self, doc):`
			`return doc`

			`mock = Mock()`
			`mock.return_value = TestComponent5()`
			`TestComponent5.from_nlp = classmethod(mock)`
			`TestComponent5 = component("c5")(TestComponent5)`

			`assert "c5" in Language.factories`
			`nlp = Language()`
			`pipe = nlp.create_pipe("c5", config={"foo": "bar"})`
			`nlp.add_pipe(pipe)`
			`assert nlp("hello world")`
			`# The first argument here is the class itself, so we're accepting any here`
Default settings to configurations (#4995) * fix grad_clip naming * cleaning up pretrained_vectors out of cfg * further refactoring Model init's * move Model building out of pipes * further refactor to require a model config when creating a pipe * small fixes * making cfg in nn_parser more consistent * fixing nr_class for parser * fixing nn_parser's nO * fix printing of loss * architectures in own file per type, consistent naming * convenience methods default_tagger_config and default_tok2vec_config * let create_pipe access default config if available for that component * default_parser_config * move defaults to separate folder * allow reading nlp from package or dir with argument 'name' * architecture spacy.VocabVectors.v1 to read static vectors from file * cleanup * default configs for nel, textcat, morphologizer, tensorizer * fix imports * fixing unit tests * fixes and clean up * fixing defaults, nO, fix unit tests * restore parser IO * fix IO * 'fix' serialization test * add .cfg to manifest fix example configs with additional arguments * replace Morpohologizer with Tagger * add IO bit when testing overfitting of tagger (currently failing) * fix IO - don't initialize when reading from disk * expand overfitting tests to also check IO goes OK * remove dropout from HashEmbed to fix Tagger performance * add defaults for sentrec * update thinc * always pass a Model instance to a Pipe * fix piped_added statement * remove obsolete W029 * remove obsolete errors * restore byte checking tests (work again) * clean up test * further test cleanup * convert from config to Model in create_pipe * bring back error when component is not initialized * cleanup * remove calls for nlp2.begin_training * use thinc.api in imports * allow setting charembed's nM and nC * fix for hardcoded nM/nC + unit test * formatting fixes * trigger build 2020-02-27 20:42:27 +03:00			`# The model will be initialized to None by the factory`
			`mock.assert_called_once_with(ANY, nlp, None, foo="bar")`
Component decorator and component analysis (#4517) * Add work in progress * Update analysis helpers and component decorator * Fix porting of docstrings for Python 2 * Fix docstring stuff on Python 2 * Support meta factories when loading model * Put auto pipeline analysis behind flag for now * Analyse pipes on remove_pipe and replace_pipe * Move analysis to root for now Try to find a better place for it, but it needs to go for now to avoid circular imports * Simplify decorator Don't return a wrapped class and instead just write to the object * Update existing components and factories * Add condition in factory for classes vs. functions * Add missing from_nlp classmethods * Add "retokenizes" to printed overview * Update assigns/requires declarations of builtins * Only return data if no_print is enabled * Use multiline table for overview * Don't support Span * Rewrite errors/warnings and move them to spacy.errors 2019-10-27 15:35:49 +03:00

			`def test_analysis_validate_attrs_valid():`
Support span._. in component decorator attrs (#4555) * Support span._. in component decorator attrs * Adjust error [ci skip] 2019-10-30 19:19:36 +03:00			`attrs = ["doc.sents", "doc.ents", "token.tag", "token._.xyz", "span._.xyz"]`
Component decorator and component analysis (#4517) * Add work in progress * Update analysis helpers and component decorator * Fix porting of docstrings for Python 2 * Fix docstring stuff on Python 2 * Support meta factories when loading model * Put auto pipeline analysis behind flag for now * Analyse pipes on remove_pipe and replace_pipe * Move analysis to root for now Try to find a better place for it, but it needs to go for now to avoid circular imports * Simplify decorator Don't return a wrapped class and instead just write to the object * Update existing components and factories * Add condition in factory for classes vs. functions * Add missing from_nlp classmethods * Add "retokenizes" to printed overview * Update assigns/requires declarations of builtins * Only return data if no_print is enabled * Use multiline table for overview * Don't support Span * Rewrite errors/warnings and move them to spacy.errors 2019-10-27 15:35:49 +03:00			`assert validate_attrs(attrs)`
			`for attr in attrs:`
			`assert validate_attrs([attr])`
			`with pytest.raises(ValueError):`
			`validate_attrs(["doc.sents", "doc.xyz"])`


			`@pytest.mark.parametrize(`
			`"attr",`
			`[`
			`"doc",`
			`"doc_ents",`
			`"doc.xyz",`
			`"token.xyz",`
			`"token.tag_",`
			`"token.tag.xyz",`
			`"token._.xyz.abc",`
Support span._. in component decorator attrs (#4555) * Support span._. in component decorator attrs * Adjust error [ci skip] 2019-10-30 19:19:36 +03:00			`"span.label",`
Component decorator and component analysis (#4517) * Add work in progress * Update analysis helpers and component decorator * Fix porting of docstrings for Python 2 * Fix docstring stuff on Python 2 * Support meta factories when loading model * Put auto pipeline analysis behind flag for now * Analyse pipes on remove_pipe and replace_pipe * Move analysis to root for now Try to find a better place for it, but it needs to go for now to avoid circular imports * Simplify decorator Don't return a wrapped class and instead just write to the object * Update existing components and factories * Add condition in factory for classes vs. functions * Add missing from_nlp classmethods * Add "retokenizes" to printed overview * Update assigns/requires declarations of builtins * Only return data if no_print is enabled * Use multiline table for overview * Don't support Span * Rewrite errors/warnings and move them to spacy.errors 2019-10-27 15:35:49 +03:00			`],`
			`)`
			`def test_analysis_validate_attrs_invalid(attr):`
			`with pytest.raises(ValueError):`
			`validate_attrs([attr])`
Fix pipeline analysis on remove pipe (#4557) Validate after component is removed, not before 2019-10-30 21:04:17 +03:00

			`def test_analysis_validate_attrs_remove_pipe():`
			`"""Test that attributes are validated correctly on remove."""`
			`spacy.language.ENABLE_PIPELINE_ANALYSIS = True`

			`@component("c1", assigns=["token.tag"])`
			`def c1(doc):`
			`return doc`

			`@component("c2", requires=["token.pos"])`
			`def c2(doc):`
			`return doc`

			`nlp = Language()`
			`nlp.add_pipe(c1)`
			`with pytest.warns(UserWarning):`
			`nlp.add_pipe(c2)`
			`with pytest.warns(None) as record:`
			`nlp.remove_pipe("c2")`
			`assert not record.list`
Move to spacy.analysis 2020-05-22 17:43:18 +03:00

			`def test_pipe_interdependencies():`
			`class Fancifier:`
			`name = "fancifier"`
			`assigns = ("doc._.fancy",)`
			`requires = tuple()`
Rename spacy.analysis to spacy.pipe_analysis 2020-05-22 18:42:06 +03:00
Move to spacy.analysis 2020-05-22 17:43:18 +03:00			`class FancyNeeder:`
			`name = "needer"`
			`assigns = tuple()`
			`requires = ("doc._.fancy",)`

			`pipeline = [("fancifier", Fancifier()), ("needer", FancyNeeder())]`
			`counts = count_pipeline_interdependencies(pipeline)`
			`assert counts == [1, 0]`