spaCy/spacy/tests/serialize/test_serialize_language.py
Sofie Van Landeghem 06f0a8daa0
Default settings to configurations (#4995)
* fix grad_clip naming

* cleaning up pretrained_vectors out of cfg

* further refactoring Model inits

* move Model building out of pipes

* further refactor to require a model config when creating a pipe

* small fixes

* making cfg in nn_parser more consistent

* fixing nr_class for parser

* fixing nn_parser's nO

* fix printing of loss

* architectures in own file per type, consistent naming

* convenience methods default_tagger_config and default_tok2vec_config

* let create_pipe access default config if available for that component

* default_parser_config

* move defaults to separate folder

* allow reading nlp from package or dir with argument 'name'

* architecture spacy.VocabVectors.v1 to read static vectors from file

* cleanup

* default configs for nel, textcat, morphologizer, tensorizer

* fix imports

* fixing unit tests

* fixes and clean up

* fixing defaults, nO, fix unit tests

* restore parser IO

* fix IO

* 'fix' serialization test

* add *.cfg to manifest

* fix example configs with additional arguments

* replace Morphologizer with Tagger

* add IO bit when testing overfitting of tagger (currently failing)

* fix IO - don't initialize when reading from disk

* expand overfitting tests to also check IO goes OK

* remove dropout from HashEmbed to fix Tagger performance

* add defaults for sentrec

* update thinc

* always pass a Model instance to a Pipe

* fix piped_added statement

* remove obsolete W029

* remove obsolete errors

* restore byte checking tests (work again)

* clean up test

* further test cleanup

* convert from config to Model in create_pipe

* bring back error when component is not initialized

* cleanup

* remove calls for nlp2.begin_training

* use thinc.api in imports

* allow setting charembed's nM and nC

* fix for hardcoded nM/nC + unit test

* formatting fixes

* trigger build
2020-02-27 18:42:27 +01:00
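
The bullets above describe one recurring pattern: a pipe no longer builds its model implicitly; instead create_pipe looks up a per-component default config (e.g. default_tagger_config) and always hands a constructed Model instance to the Pipe. A minimal sketch of that flow, assuming the v2-style create_pipe/add_pipe calls and that the default-config fallback works as the commit message states:

    # Sketch only, based on the commit notes above; the in-development API at
    # this commit may differ in detail.
    from spacy.language import Language

    nlp = Language()
    # With no explicit config, create_pipe is assumed to fall back to the
    # component's default config (e.g. default_tagger_config) and build the
    # thinc Model instance before handing it to the Tagger pipe.
    tagger = nlp.create_pipe("tagger")
    nlp.add_pipe(tagger)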


import pytest
import re
from spacy.language import Language
from spacy.tokenizer import Tokenizer
from ..util import make_tempdir


@pytest.fixture
def meta_data():
    return {
        "name": "name-in-fixture",
        "version": "version-in-fixture",
        "description": "description-in-fixture",
        "author": "author-in-fixture",
        "email": "email-in-fixture",
        "url": "url-in-fixture",
        "license": "license-in-fixture",
        "vectors": {"width": 0, "vectors": 0, "keys": 0, "name": None},
    }


def test_serialize_language_meta_disk(meta_data):
    # Round-trip the pipeline through a temporary directory and check that
    # the meta data survives unchanged.
    language = Language(meta=meta_data)
    with make_tempdir() as d:
        language.to_disk(d)
        new_language = Language().from_disk(d)
    assert new_language.meta == language.meta


def test_serialize_with_custom_tokenizer():
    """Test that serialization with custom tokenizer works without token_match.
    See: https://support.prodi.gy/t/how-to-save-a-custom-tokenizer/661/2
    """
    prefix_re = re.compile(r"""1/|2/|:[0-9][0-9][A-K]:|:[0-9][0-9]:""")
    suffix_re = re.compile(r"""""")
    infix_re = re.compile(r"""[~]""")

    def custom_tokenizer(nlp):
        return Tokenizer(
            nlp.vocab,
            {},
            prefix_search=prefix_re.search,
            suffix_search=suffix_re.search,
            infix_finditer=infix_re.finditer,
        )

    nlp = Language()
    nlp.tokenizer = custom_tokenizer(nlp)
    with make_tempdir() as d:
        nlp.to_disk(d)


def test_serialize_language_exclude(meta_data):
    name = "name-in-fixture"
    nlp = Language(meta=meta_data)
    assert nlp.meta["name"] == name
    # "meta" can be excluded on either side of the round trip.
    new_nlp = Language().from_bytes(nlp.to_bytes())
    assert new_nlp.meta["name"] == name
    new_nlp = Language().from_bytes(nlp.to_bytes(), exclude=["meta"])
    assert not new_nlp.meta["name"] == name
    new_nlp = Language().from_bytes(nlp.to_bytes(exclude=["meta"]))
    assert not new_nlp.meta["name"] == name
    # Legacy-style boolean flags like meta=False should raise; exclude=[...] is
    # the supported way to skip components.
    with pytest.raises(ValueError):
        nlp.to_bytes(meta=False)
    with pytest.raises(ValueError):
        Language().from_bytes(nlp.to_bytes(), meta=False)
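
For reference, the exclude behaviour exercised in test_serialize_language_exclude can be reproduced outside pytest; a minimal standalone sketch using only the calls shown above:

    from spacy.language import Language

    nlp = Language(meta={"name": "demo"})
    data = nlp.to_bytes(exclude=["meta"])   # drop meta on the serialization side
    restored = Language().from_bytes(data)
    print(restored.meta.get("name"))        # falls back to the default meta, not "demo"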