spaCy/spacy/tests/regression/test_issue5230.py
Ines Montani 43b960c01b
Refactor pipeline components, config and language data (#5759)
* Update with WIP

* Update with WIP

* Update with pipeline serialization

* Update types and pipe factories

* Add deep merge, tidy up and add tests

* Fix pipe creation from config

* Don't validate default configs on load

* Update spacy/language.py

Co-authored-by: Ines Montani <ines@ines.io>

* Adjust factory/component meta error

* Clean up factory args and remove defaults

* Add test for failing empty dict defaults

* Update pipeline handling and methods

* provide KB as registry function instead of as object

* small change in test to make functionality more clear

* update example script for EL configuration

* Fix typo

* Simplify test

* Simplify test

* splitting pipes.pyx into separate files

* moving default configs to each component file

* fix batch_size type

* removing default values from component constructors where possible (TODO: test 4725)

* skip instead of xfail

* Add test for config -> nlp with multiple instances

* pipeline.pipes -> pipeline.pipe

* Tidy up, document, remove kwargs

* small cleanup/generalization for Tok2VecListener

* use DEFAULT_UPSTREAM field

* revert to avoid circular imports

* Fix tests

* Replace deprecated arg

* Make model dirs require config

* fix pickling of keyword-only arguments in constructor

* WIP: clean up and integrate full config

* Add helper to handle function args more reliably

Now also includes keyword-only args

* Fix config composition and serialization

* Improve config debugging and add visual diff

* Remove unused defaults and fix type

* Remove pipeline and factories from meta

* Update spacy/default_config.cfg

Co-authored-by: Sofie Van Landeghem <svlandeg@users.noreply.github.com>

* Update spacy/default_config.cfg

* small UX edits

* avoid printing stack trace for debug CLI commands

* Add support for language-specific factories

* specify the section of the config which holds the model to debug

* WIP: add Language.from_config

* Update with language data refactor WIP

* Auto-format

* Add backwards-compat handling for Language.factories

* Update morphologizer.pyx

* Fix morphologizer

* Update and simplify lemmatizers

* Fix Japanese tests

* Port over tagger changes

* Fix Chinese and tests

* Update to latest Thinc

* WIP: xfail first Russian lemmatizer test

* Fix component-specific overrides

* fix nO for output layers in debug_model

* Fix default value

* Fix tests and don't pass objects in config

* Fix deep merging

* Fix lemma lookup data registry

Only load the lookups if an entry is available in the registry (and if spacy-lookups-data is installed)

* Add types

* Add Vocab.from_config

* Fix typo

* Fix tests

* Make config copying more elegant

* Fix pipe analysis

* Fix lemmatizers and is_base_form

* WIP: move language defaults to config

* Fix morphology type

* Fix vocab

* Remove comment

* Update to latest Thinc

* Add morph rules to config

* Tidy up

* Remove set_morphology option from tagger factory

* Hack use_gpu

* Move [pipeline] to top-level block and make [nlp.pipeline] list

Allows separating component blocks from component order – otherwise, ordering the config would mean a changed component order, which is bad. Also allows initial config to define more components and not use all of them

* Fix use_gpu and resume in CLI

* Auto-format

* Remove resume from config

* Fix formatting and error

* [pipeline] -> [components]

* Fix types

* Fix tagger test: requires set_morphology?

Co-authored-by: Sofie Van Landeghem <svlandeg@users.noreply.github.com>
Co-authored-by: svlandeg <sofie.vanlandeghem@gmail.com>
Co-authored-by: Matthew Honnibal <honnibal+gh@gmail.com>
2020-07-22 13:42:59 +02:00

147 lines
4.2 KiB
Python

import warnings
from unittest import TestCase
import pytest
import srsly
from numpy import zeros
from spacy.kb import KnowledgeBase, Writer
from spacy.vectors import Vectors
from spacy.language import Language
from spacy.pipeline import Pipe
from spacy.util import registry
from ..util import make_tempdir
def nlp():
    """Return a new ``Language`` object constructed with no arguments."""
    blank_language = Language()
    return blank_language
def vectors():
    """Return a ``Vectors`` table with three keys and a zero-filled (3, 1) array."""
    words = ["cat", "dog", "rat"]
    # One single-precision row per key; the values are all zero.
    table = zeros((3, 1), dtype="f")
    return Vectors(data=table, keys=words)
def custom_pipe():
    """Return a ``Pipe`` subclass instance backed by dummy serializable parts.

    The pipe only partially implements the component interface: its ``model``
    and ``vocab`` attributes are stand-in objects that expose the four
    serialization methods. It exists solely so that ``to_disk`` can be called
    on it in the tests.
    """

    class SerializableDummy:
        """Stand-in exposing ``to_bytes``/``from_bytes``/``to_disk``/``from_disk``."""

        def __init__(self, **cfg):
            # Keep the keyword arguments if any were given, otherwise None.
            self.cfg = cfg or None
            super().__init__()

        def to_bytes(self, exclude=tuple(), disable=None, **kwargs):
            # Always the same fixed msgpack payload; the arguments are ignored.
            return srsly.msgpack_dumps({"dummy": srsly.json_dumps(None)})

        def from_bytes(self, bytes_data, exclude=tuple(), **kwargs):
            # ``exclude`` now defaults like in the three sibling methods, so
            # callers may omit it; the data itself is ignored.
            return self

        def to_disk(self, path, exclude=tuple(), **kwargs):
            # Intentionally writes nothing.
            pass

        def from_disk(self, path, exclude=tuple(), **kwargs):
            return self

    class MyPipe(Pipe):
        """Pipe whose ``model`` and ``vocab`` are both ``SerializableDummy``."""

        def __init__(self, vocab, model=True, **cfg):
            # The ``vocab`` and ``model`` arguments are accepted but unused;
            # both attributes are replaced by dummies below.
            self.cfg = cfg or None
            self.model = SerializableDummy()
            self.vocab = SerializableDummy()

    return MyPipe(None)
def tagger():
    """Return a "tagger" component added to a new ``Language`` and initialised.

    ``begin_training`` is called inside ``pytest.warns(UserWarning)``, so this
    helper fails if that call does not emit a ``UserWarning``.
    """
    language = Language()
    component = language.add_pipe("tagger")
    # The component needs a model for two reasons:
    #   1. serializing without a model raises an error,
    #   2. the line under test is the one that serializes the model.
    with pytest.warns(UserWarning):
        component.begin_training(pipeline=language.pipeline)
    return component
def entity_linker():
    """Return an "entity_linker" component with a one-entity knowledge base.

    A knowledge-base factory is registered under the asset name
    ``TestIssue5230KB.v1`` and referenced from the component config. The
    component is then added to a new ``Language`` and initialised via
    ``begin_training``.
    """
    language = Language()

    @registry.assets.register("TestIssue5230KB.v1")
    def dummy_kb() -> KnowledgeBase:
        # Knowledge base on the pipeline's vocab, holding one entity "test".
        knowledge_base = KnowledgeBase(language.vocab, entity_vector_length=1)
        entity_vector = zeros((1, 1), dtype="f")
        knowledge_base.add_entity("test", 0.0, entity_vector)
        return knowledge_base

    linker_config = {"kb": {"@assets": "TestIssue5230KB.v1"}}
    component = language.add_pipe("entity_linker", config=linker_config)
    # The component needs a model for two reasons:
    #   1. serializing without a model raises an error,
    #   2. the line under test is the one that serializes the model.
    component.begin_training(pipeline=language.pipeline)
    return component
# Pair of parallel lists shared by the tests below: index 0 holds the objects
# whose ``to_disk`` is exercised, index 1 holds the matching test ids/labels.
# Each factory function is called once, when this assignment is executed.
objects_to_test = (
[nlp(), vectors(), custom_pipe(), tagger(), entity_linker()],
["nlp", "vectors", "custom_pipe", "tagger", "entity_linker"],
)
def write_obj_and_catch_warnings(obj):
    """Call ``obj.to_disk`` on a temp directory and return the ResourceWarnings.

    Warnings are recorded while ``to_disk`` runs, with ``ResourceWarning``
    set to "always" so that every occurrence is recorded.

    Returns a list of the recorded entries that are ``ResourceWarning``
    instances, as tested by ``isinstance``.
    """
    with make_tempdir() as d, warnings.catch_warnings(record=True) as recorded:
        warnings.filterwarnings("always", category=ResourceWarning)
        obj.to_disk(d)
        # in python3.5 it seems that deprecation warnings are not filtered by
        # filterwarnings, so the recorded entries are filtered here as well
        return [entry for entry in recorded if isinstance(entry, ResourceWarning)]
@pytest.mark.parametrize("obj", objects_to_test[0], ids=objects_to_test[1])
def test_to_disk_resource_warning(obj):
    """Check that writing ``obj`` to disk yields no ResourceWarning entries."""
    resource_warnings = write_obj_and_catch_warnings(obj)
    assert resource_warnings == []
def test_writer_with_path_py35():
    """Check that ``Writer`` can be constructed from a path in a temp directory.

    Any exception raised by the constructor is turned into a test failure.
    The writer is closed afterwards if it was created.
    """
    kb_writer = None
    with make_tempdir() as d:
        target = d / "test"
        try:
            kb_writer = Writer(target)
        except Exception as err:
            pytest.fail(str(err))
        finally:
            # Close only when construction succeeded.
            if kb_writer:
                kb_writer.close()
def test_save_and_load_knowledge_base():
    """Check that a ``KnowledgeBase`` can be dumped and loaded without raising.

    Any exception from ``dump`` or from creating and ``load_bulk``-ing a
    second knowledge base is turned into a test failure.
    """
    language = Language()
    source_kb = KnowledgeBase(language.vocab, entity_vector_length=1)
    with make_tempdir() as d:
        kb_path = d / "kb"

        # Step 1: write the knowledge base to disk.
        try:
            source_kb.dump(kb_path)
        except Exception as err:
            pytest.fail(str(err))

        # Step 2: read it back into a second knowledge base.
        try:
            restored_kb = KnowledgeBase(language.vocab, entity_vector_length=1)
            restored_kb.load_bulk(kb_path)
        except Exception as err:
            pytest.fail(str(err))
class TestToDiskResourceWarningUnittest(TestCase):
    """``unittest`` version of the ``to_disk`` ResourceWarning check."""

    def test_resource_warning(self):
        """Run one sub-test per object/label pair from ``objects_to_test``."""
        for candidate, label in zip(*objects_to_test):
            with self.subTest(msg=label):
                resource_warnings = write_obj_and_catch_warnings(candidate)
                self.assertEqual(len(resource_warnings), 0)