spaCy/spacy/tests/serialize/test_serialize_language.py
Daniël de Kok e5debc68e4
Tagger: use unnormalized probabilities for inference (#10197)
* Tagger: use unnormalized probabilities for inference

Using unnormalized softmax avoids use of the relatively expensive exp function,
which can significantly speed up non-transformer models (e.g. I got a speedup
of 27% on a German tagging + parsing pipeline).

* Add spacy.Tagger.v2 with configurable normalization

Normalization of probabilities is disabled by default to improve
performance.

* Update documentation, models, and tests to spacy.Tagger.v2

* Move Tagger.v1 to spacy-legacy

* docs/architectures: run prettier

* Unnormalized softmax is now a Softmax_v2 option

* Require thinc 8.0.14 and spacy-legacy 3.0.9
2022-03-15 14:15:31 +01:00

136 lines
3.5 KiB
Python

import re
import pickle
import pytest
from spacy.language import Language
from spacy.lang.it import Italian
from spacy.lang.en import English
from spacy.tokenizer import Tokenizer
from spacy.training import Example
from spacy.util import load_config_from_str
from ..util import make_tempdir
@pytest.fixture
def meta_data():
return {
"name": "name-in-fixture",
"version": "version-in-fixture",
"description": "description-in-fixture",
"author": "author-in-fixture",
"email": "email-in-fixture",
"url": "url-in-fixture",
"license": "license-in-fixture",
"vectors": {"width": 0, "vectors": 0, "keys": 0, "name": None},
}
@pytest.mark.issue(2482)
def test_issue2482():
"""Test we can serialize and deserialize a blank NER or parser model."""
nlp = Italian()
nlp.add_pipe("ner")
b = nlp.to_bytes()
Italian().from_bytes(b)
CONFIG_ISSUE_6950 = """
[nlp]
lang = "en"
pipeline = ["tok2vec", "tagger"]
[components]
[components.tok2vec]
factory = "tok2vec"
[components.tok2vec.model]
@architectures = "spacy.Tok2Vec.v1"
[components.tok2vec.model.embed]
@architectures = "spacy.MultiHashEmbed.v1"
width = ${components.tok2vec.model.encode:width}
attrs = ["NORM","PREFIX","SUFFIX","SHAPE"]
rows = [5000,2500,2500,2500]
include_static_vectors = false
[components.tok2vec.model.encode]
@architectures = "spacy.MaxoutWindowEncoder.v1"
width = 96
depth = 4
window_size = 1
maxout_pieces = 3
[components.ner]
factory = "ner"
[components.tagger]
factory = "tagger"
[components.tagger.model]
@architectures = "spacy.Tagger.v2"
nO = null
[components.tagger.model.tok2vec]
@architectures = "spacy.Tok2VecListener.v1"
width = ${components.tok2vec.model.encode:width}
upstream = "*"
"""
@pytest.mark.issue(6950)
def test_issue6950():
"""Test that the nlp object with initialized tok2vec with listeners pickles
correctly (and doesn't have lambdas).
"""
nlp = English.from_config(load_config_from_str(CONFIG_ISSUE_6950))
nlp.initialize(lambda: [Example.from_dict(nlp.make_doc("hello"), {"tags": ["V"]})])
pickle.dumps(nlp)
nlp("hello")
pickle.dumps(nlp)
def test_serialize_language_meta_disk(meta_data):
language = Language(meta=meta_data)
with make_tempdir() as d:
language.to_disk(d)
new_language = Language().from_disk(d)
assert new_language.meta == language.meta
def test_serialize_with_custom_tokenizer():
"""Test that serialization with custom tokenizer works without token_match.
See: https://support.prodi.gy/t/how-to-save-a-custom-tokenizer/661/2
"""
prefix_re = re.compile(r"""1/|2/|:[0-9][0-9][A-K]:|:[0-9][0-9]:""")
suffix_re = re.compile(r"""""")
infix_re = re.compile(r"""[~]""")
def custom_tokenizer(nlp):
return Tokenizer(
nlp.vocab,
{},
prefix_search=prefix_re.search,
suffix_search=suffix_re.search,
infix_finditer=infix_re.finditer,
)
nlp = Language()
nlp.tokenizer = custom_tokenizer(nlp)
with make_tempdir() as d:
nlp.to_disk(d)
def test_serialize_language_exclude(meta_data):
name = "name-in-fixture"
nlp = Language(meta=meta_data)
assert nlp.meta["name"] == name
new_nlp = Language().from_bytes(nlp.to_bytes())
assert new_nlp.meta["name"] == name
new_nlp = Language().from_bytes(nlp.to_bytes(), exclude=["meta"])
assert not new_nlp.meta["name"] == name
new_nlp = Language().from_bytes(nlp.to_bytes(exclude=["meta"]))
assert not new_nlp.meta["name"] == name