spaCy/spacy/tests/pipeline/test_analysis.py
Sofie Van Landeghem 06f0a8daa0
Default settings to configurations (#4995)
* fix grad_clip naming

* cleaning up pretrained_vectors out of cfg

* further refactoring Model init's

* move Model building out of pipes

* further refactor to require a model config when creating a pipe

* small fixes

* making cfg in nn_parser more consistent

* fixing nr_class for parser

* fixing nn_parser's nO

* fix printing of loss

* architectures in own file per type, consistent naming

* convenience methods default_tagger_config and default_tok2vec_config

* let create_pipe access default config if available for that component

* default_parser_config

* move defaults to separate folder

* allow reading nlp from package or dir with argument 'name'

* architecture spacy.VocabVectors.v1 to read static vectors from file

* cleanup

* default configs for nel, textcat, morphologizer, tensorizer

* fix imports

* fixing unit tests

* fixes and clean up

* fixing defaults, nO, fix unit tests

* restore parser IO

* fix IO

* 'fix' serialization test

* add *.cfg to manifest

* fix example configs with additional arguments

* replace Morpohologizer with Tagger

* add IO bit when testing overfitting of tagger (currently failing)

* fix IO - don't initialize when reading from disk

* expand overfitting tests to also check IO goes OK

* remove dropout from HashEmbed to fix Tagger performance

* add defaults for sentrec

* update thinc

* always pass a Model instance to a Pipe

* fix piped_added statement

* remove obsolete W029

* remove obsolete errors

* restore byte checking tests (work again)

* clean up test

* further test cleanup

* convert from config to Model in create_pipe

* bring back error when component is not initialized

* cleanup

* remove calls for nlp2.begin_training

* use thinc.api in imports

* allow setting charembed's nM and nC

* fix for hardcoded nM/nC + unit test

* formatting fixes

* trigger build
2020-02-27 18:42:27 +01:00

164 lines
5.0 KiB
Python

import spacy.language
from spacy.language import Language, component
from spacy.analysis import print_summary, validate_attrs
from spacy.analysis import get_assigns_for_attr, get_requires_for_attr
from mock import Mock, ANY
import pytest
def test_component_decorator_function():
@component(name="test")
def test_component(doc):
"""docstring"""
return doc
assert test_component.name == "test"
assert test_component.__doc__ == "docstring"
assert test_component("foo") == "foo"
def test_component_decorator_class():
@component(name="test")
class TestComponent(object):
"""docstring1"""
foo = "bar"
def __call__(self, doc):
"""docstring2"""
return doc
def custom(self, x):
"""docstring3"""
return x
assert TestComponent.name == "test"
assert TestComponent.foo == "bar"
assert hasattr(TestComponent, "custom")
test_component = TestComponent()
assert test_component.foo == "bar"
assert test_component("foo") == "foo"
assert hasattr(test_component, "custom")
assert test_component.custom("bar") == "bar"
assert TestComponent.__doc__ == "docstring1"
assert TestComponent.__call__.__doc__ == "docstring2"
assert TestComponent.custom.__doc__ == "docstring3"
assert test_component.__doc__ == "docstring1"
assert test_component.__call__.__doc__ == "docstring2"
assert test_component.custom.__doc__ == "docstring3"
def test_component_decorator_assigns():
spacy.language.ENABLE_PIPELINE_ANALYSIS = True
@component("c1", assigns=["token.tag", "doc.tensor"])
def test_component1(doc):
return doc
@component(
"c2", requires=["token.tag", "token.pos"], assigns=["token.lemma", "doc.tensor"]
)
def test_component2(doc):
return doc
@component("c3", requires=["token.lemma"], assigns=["token._.custom_lemma"])
def test_component3(doc):
return doc
assert "c1" in Language.factories
assert "c2" in Language.factories
assert "c3" in Language.factories
nlp = Language()
nlp.add_pipe(test_component1)
with pytest.warns(UserWarning):
nlp.add_pipe(test_component2)
nlp.add_pipe(test_component3)
assigns_tensor = get_assigns_for_attr(nlp.pipeline, "doc.tensor")
assert [name for name, _ in assigns_tensor] == ["c1", "c2"]
test_component4 = nlp.create_pipe("c1")
assert test_component4.name == "c1"
assert test_component4.factory == "c1"
nlp.add_pipe(test_component4, name="c4")
assert nlp.pipe_names == ["c1", "c2", "c3", "c4"]
assert "c4" not in Language.factories
assert nlp.pipe_factories["c1"] == "c1"
assert nlp.pipe_factories["c4"] == "c1"
assigns_tensor = get_assigns_for_attr(nlp.pipeline, "doc.tensor")
assert [name for name, _ in assigns_tensor] == ["c1", "c2", "c4"]
requires_pos = get_requires_for_attr(nlp.pipeline, "token.pos")
assert [name for name, _ in requires_pos] == ["c2"]
assert print_summary(nlp, no_print=True)
assert nlp("hello world")
def test_component_factories_from_nlp():
"""Test that class components can implement a from_nlp classmethod that
gives them access to the nlp object and config via the factory."""
class TestComponent5(object):
def __call__(self, doc):
return doc
mock = Mock()
mock.return_value = TestComponent5()
TestComponent5.from_nlp = classmethod(mock)
TestComponent5 = component("c5")(TestComponent5)
assert "c5" in Language.factories
nlp = Language()
pipe = nlp.create_pipe("c5", config={"foo": "bar"})
nlp.add_pipe(pipe)
assert nlp("hello world")
# The first argument here is the class itself, so we're accepting any here
# The model will be initialized to None by the factory
mock.assert_called_once_with(ANY, nlp, None, foo="bar")
def test_analysis_validate_attrs_valid():
attrs = ["doc.sents", "doc.ents", "token.tag", "token._.xyz", "span._.xyz"]
assert validate_attrs(attrs)
for attr in attrs:
assert validate_attrs([attr])
with pytest.raises(ValueError):
validate_attrs(["doc.sents", "doc.xyz"])
@pytest.mark.parametrize(
"attr",
[
"doc",
"doc_ents",
"doc.xyz",
"token.xyz",
"token.tag_",
"token.tag.xyz",
"token._.xyz.abc",
"span.label",
],
)
def test_analysis_validate_attrs_invalid(attr):
with pytest.raises(ValueError):
validate_attrs([attr])
def test_analysis_validate_attrs_remove_pipe():
"""Test that attributes are validated correctly on remove."""
spacy.language.ENABLE_PIPELINE_ANALYSIS = True
@component("c1", assigns=["token.tag"])
def c1(doc):
return doc
@component("c2", requires=["token.pos"])
def c2(doc):
return doc
nlp = Language()
nlp.add_pipe(c1)
with pytest.warns(UserWarning):
nlp.add_pipe(c2)
with pytest.warns(None) as record:
nlp.remove_pipe("c2")
assert not record.list