Mirror of https://github.com/explosion/spaCy.git
* Tagger: use unnormalized probabilities for inference

  Using unnormalized softmax avoids use of the relatively expensive exp function, which can significantly speed up non-transformer models (e.g. I got a speedup of 27% on a German tagging + parsing pipeline).

* Add spacy.Tagger.v2 with configurable normalization

  Normalization of probabilities is disabled by default to improve performance.

* Update documentation, models, and tests to spacy.Tagger.v2
* Move Tagger.v1 to spacy-legacy
* docs/architectures: run prettier
* Unnormalized softmax is now a Softmax_v2 option
* Require thinc 8.0.14 and spacy-legacy 3.0.9
import re
import pickle

import pytest

from spacy.language import Language
from spacy.lang.it import Italian
from spacy.lang.en import English
from spacy.tokenizer import Tokenizer
from spacy.training import Example
from spacy.util import load_config_from_str

from ..util import make_tempdir


@pytest.fixture
def meta_data():
    return {
        "name": "name-in-fixture",
        "version": "version-in-fixture",
        "description": "description-in-fixture",
        "author": "author-in-fixture",
        "email": "email-in-fixture",
        "url": "url-in-fixture",
        "license": "license-in-fixture",
        "vectors": {"width": 0, "vectors": 0, "keys": 0, "name": None},
    }


@pytest.mark.issue(2482)
def test_issue2482():
    """Test we can serialize and deserialize a blank NER or parser model."""
    nlp = Italian()
    nlp.add_pipe("ner")
    b = nlp.to_bytes()
    Italian().from_bytes(b)


CONFIG_ISSUE_6950 = """
[nlp]
lang = "en"
pipeline = ["tok2vec", "tagger"]

[components]

[components.tok2vec]
factory = "tok2vec"

[components.tok2vec.model]
@architectures = "spacy.Tok2Vec.v1"

[components.tok2vec.model.embed]
@architectures = "spacy.MultiHashEmbed.v1"
width = ${components.tok2vec.model.encode:width}
attrs = ["NORM","PREFIX","SUFFIX","SHAPE"]
rows = [5000,2500,2500,2500]
include_static_vectors = false

[components.tok2vec.model.encode]
@architectures = "spacy.MaxoutWindowEncoder.v1"
width = 96
depth = 4
window_size = 1
maxout_pieces = 3

[components.ner]
factory = "ner"

[components.tagger]
factory = "tagger"

[components.tagger.model]
@architectures = "spacy.Tagger.v2"
nO = null

[components.tagger.model.tok2vec]
@architectures = "spacy.Tok2VecListener.v1"
width = ${components.tok2vec.model.encode:width}
upstream = "*"
"""


@pytest.mark.issue(6950)
def test_issue6950():
    """Test that the nlp object with initialized tok2vec with listeners pickles
    correctly (and doesn't have lambdas).
    """
    nlp = English.from_config(load_config_from_str(CONFIG_ISSUE_6950))
    nlp.initialize(lambda: [Example.from_dict(nlp.make_doc("hello"), {"tags": ["V"]})])
    pickle.dumps(nlp)
    nlp("hello")
    pickle.dumps(nlp)


def test_serialize_language_meta_disk(meta_data):
    language = Language(meta=meta_data)
    with make_tempdir() as d:
        language.to_disk(d)
        new_language = Language().from_disk(d)
    assert new_language.meta == language.meta


def test_serialize_with_custom_tokenizer():
    """Test that serialization with custom tokenizer works without token_match.
    See: https://support.prodi.gy/t/how-to-save-a-custom-tokenizer/661/2
    """
    prefix_re = re.compile(r"""1/|2/|:[0-9][0-9][A-K]:|:[0-9][0-9]:""")
    suffix_re = re.compile(r"""""")
    infix_re = re.compile(r"""[~]""")

    def custom_tokenizer(nlp):
        return Tokenizer(
            nlp.vocab,
            {},
            prefix_search=prefix_re.search,
            suffix_search=suffix_re.search,
            infix_finditer=infix_re.finditer,
        )

    nlp = Language()
    nlp.tokenizer = custom_tokenizer(nlp)
    with make_tempdir() as d:
        nlp.to_disk(d)


def test_serialize_language_exclude(meta_data):
    name = "name-in-fixture"
    nlp = Language(meta=meta_data)
    assert nlp.meta["name"] == name
    new_nlp = Language().from_bytes(nlp.to_bytes())
    assert new_nlp.meta["name"] == name
    new_nlp = Language().from_bytes(nlp.to_bytes(), exclude=["meta"])
    assert not new_nlp.meta["name"] == name
    new_nlp = Language().from_bytes(nlp.to_bytes(exclude=["meta"]))
    assert not new_nlp.meta["name"] == name