mirror of https://github.com/explosion/spaCy.git
synced 2025-10-31 16:07:41 +03:00
Tidy up and auto-format

This commit is contained in:
parent 06f0a8daa0
commit 5da3ad682a
@@ -120,7 +120,7 @@ def pretrain(
             window_size=1,
             char_embed=False,
             nM=64,
-            nC=8
+            nC=8,
         ),
     )
     # Load in pretrained weights
@@ -9,7 +9,7 @@ from wasabi import msg
 import contextlib
 import random
 
-from ..util import create_default_optimizer, registry
+from ..util import create_default_optimizer
 from ..util import use_gpu as set_gpu
 from ..attrs import PROB, IS_OOV, CLUSTER, LANG
 from ..gold import GoldCorpus
@@ -161,7 +161,10 @@ def train(
                 raise ValueError(f"Component {pipe} currently not supported.")
             pipe_cfg = util.load_config(config_loc, create_objects=False)
             if vectors:
-                pretrained_config = {'@architectures': 'spacy.VocabVectors.v1', 'name': vectors}
+                pretrained_config = {
+                    "@architectures": "spacy.VocabVectors.v1",
+                    "name": vectors,
+                }
                 pipe_cfg["model"]["tok2vec"]["pretrained_vectors"] = pretrained_config
 
             if pipe == "parser":
@@ -202,7 +205,7 @@ def train(
         msg.text(f"Starting with blank model '{lang}'")
         lang_cls = util.get_lang_class(lang)
         nlp = lang_cls()
-        
+
         if vectors:
             msg.text(f"Loading vectors from model '{vectors}'")
 
@@ -222,7 +225,10 @@ def train(
                 raise ValueError(f"Component {pipe} currently not supported.")
             pipe_cfg = util.load_config(config_loc, create_objects=False)
             if vectors:
-                pretrained_config = {'@architectures': 'spacy.VocabVectors.v1', 'name': vectors}
+                pretrained_config = {
+                    "@architectures": "spacy.VocabVectors.v1",
+                    "name": vectors,
+                }
                 pipe_cfg["model"]["tok2vec"]["pretrained_vectors"] = pretrained_config
 
             if pipe == "parser":
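Aside from the reformatting, note what `pretrained_config` is: the "@architectures" key is a registry reference that spaCy's config system resolves into a call to the registered function, with the remaining keys passed as keyword arguments. A minimal sketch of that mechanism, using a hypothetical architecture name (`demo.Affine.v1`, `make_affine`, and its arguments are illustrative, not part of spaCy):

from thinc.api import Linear
from spacy.util import registry


@registry.architectures.register("demo.Affine.v1")  # hypothetical name
def make_affine(nO: int, nI: int):
    # Any registered callable can be referenced from a config block
    # via the "@architectures" key.
    return Linear(nO=nO, nI=nI)


cfg = {"model": {"@architectures": "demo.Affine.v1", "nO": 2, "nI": 4}}
# Resolves the reference and calls make_affine(nO=2, nI=4).
model = registry.make_from_config(cfg, validate=True)["model"]

This is the same resolution step that `registry.make_from_config` performs in the `Language` hunk further down.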
@@ -1,10 +1,8 @@
 from typing import Optional, Dict, List, Union, Sequence
-from pydantic import BaseModel, FilePath, StrictInt
-
+from pydantic import BaseModel, FilePath
 import plac
 import tqdm
 from pathlib import Path
-
 from wasabi import msg
 import thinc
 import thinc.schedules
@@ -130,7 +130,13 @@ class Language(object):
     factories = {"tokenizer": lambda nlp: nlp.Defaults.create_tokenizer(nlp)}
 
     def __init__(
-        self, vocab=True, make_doc=True, max_length=10 ** 6, meta={}, config=None, **kwargs
+        self,
+        vocab=True,
+        make_doc=True,
+        max_length=10 ** 6,
+        meta={},
+        config=None,
+        **kwargs,
     ):
         """Initialise a Language object.
 
@@ -176,20 +182,29 @@ class Language(object):
         self.max_length = max_length
         self._optimizer = None
 
-        from .ml.models.defaults import default_tagger_config, default_parser_config, default_ner_config, \
-            default_textcat_config, default_nel_config, default_morphologizer_config, default_sentrec_config, \
-            default_tensorizer_config, default_tok2vec_config
+        from .ml.models.defaults import (
+            default_tagger_config,
+            default_parser_config,
+            default_ner_config,
+            default_textcat_config,
+            default_nel_config,
+            default_morphologizer_config,
+            default_sentrec_config,
+            default_tensorizer_config,
+            default_tok2vec_config,
+        )
 
-        self.defaults = {"tagger": default_tagger_config(),
-                "parser": default_parser_config(),
-                "ner": default_ner_config(),
-                "textcat": default_textcat_config(),
-                "entity_linker": default_nel_config(),
-                "morphologizer": default_morphologizer_config(),
-                "sentrec": default_sentrec_config(),
-                "tensorizer": default_tensorizer_config(),
-                "tok2vec": default_tok2vec_config(),
-                }
+        self.defaults = {
+            "tagger": default_tagger_config(),
+            "parser": default_parser_config(),
+            "ner": default_ner_config(),
+            "textcat": default_textcat_config(),
+            "entity_linker": default_nel_config(),
+            "morphologizer": default_morphologizer_config(),
+            "sentrec": default_sentrec_config(),
+            "tensorizer": default_tensorizer_config(),
+            "tok2vec": default_tok2vec_config(),
+        }
 
     @property
     def path(self):
@@ -329,12 +344,14 @@ class Language(object):
                 model_cfg = None
             del config["model"]
         if model_cfg is None and default_config is not None:
-            user_warning(Warnings.W098)
+            user_warning(Warnings.W098.format(name=name))
             model_cfg = default_config["model"]
         model = None
         if model_cfg is not None:
-            self.config[name] = {"model":  model_cfg}
-            model = registry.make_from_config({"model": model_cfg}, validate=True)["model"]
+            self.config[name] = {"model": model_cfg}
+            model = registry.make_from_config({"model": model_cfg}, validate=True)[
+                "model"
+            ]
         return factory(self, model, **config)
 
     def add_pipe(
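One of these changes is more than formatting: `Warnings.W098.format(name=name)` matters because the W098 template evidently contains a `{name}` placeholder, so the unformatted call would print the raw template instead of naming the component. The warning fires on the default-config fallback path; a hedged sketch of how that path is reached (assuming the v3 development API shown in this diff):

from spacy.lang.en import English

nlp = English()
# No "model" key is supplied, so create_pipe falls back to the default
# "ner" model config from nlp.defaults and emits W098 with the
# component name interpolated.
ner = nlp.create_pipe("ner")
nlp.add_pipe(ner)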
@@ -1,6 +1,6 @@
-from .entity_linker import *
-from .parser import *
-from .tagger import *
-from .tensorizer import *
-from .textcat import *
-from .tok2vec import *
+from .entity_linker import *  # noqa
+from .parser import *  # noqa
+from .tagger import *  # noqa
+from .tensorizer import *  # noqa
+from .textcat import *  # noqa
+from .tok2vec import *  # noqa
@@ -1,9 +1,7 @@
-from pathlib import Path
-
 from thinc.api import chain, clone, list2ragged, reduce_mean, residual
 from thinc.api import Model, Maxout, Linear
 
-from spacy.util import registry
+from ...util import registry
 
 
 @registry.architectures.register("spacy.EntityLinker.v1")
@@ -1,11 +1,10 @@
 from pydantic import StrictInt
 
-from spacy.util import registry
-from spacy.ml._layers import PrecomputableAffine
-from spacy.syntax._parser_model import ParserModel
-
 from thinc.api import Model, chain, list2array, Linear, zero_init, use_ops
 
+from ...util import registry
+from .._layers import PrecomputableAffine
+from ...syntax._parser_model import ParserModel
 
 
 @registry.architectures.register("spacy.TransitionBasedParser.v1")
 def build_tb_parser_model(
@@ -1,6 +1,6 @@
 from thinc.api import zero_init, with_array, Softmax, chain, Model
 
-from spacy.util import registry
+from ...util import registry
 
 
 @registry.architectures.register("spacy.Tagger.v1")
@@ -1,8 +1,9 @@
-from spacy.attrs import ORTH
-from spacy.util import registry
-from spacy.ml.extract_ngrams import extract_ngrams
+from thinc.api import Model, chain, reduce_mean, Linear, list2ragged, Logistic
+from thinc.api import SparseLinear, Softmax
 
-from thinc.api import Model, chain, reduce_mean, Linear, list2ragged, Logistic, SparseLinear, Softmax
+from ...attrs import ORTH
+from ...util import registry
+from ..extract_ngrams import extract_ngrams
 
 
 @registry.architectures.register("spacy.TextCatCNN.v1")
@@ -21,7 +22,9 @@ def build_simple_cnn_text_classifier(tok2vec, exclusive_classes, nO=None):
         else:
             # TODO: experiment with init_w=zero_init
             linear_layer = Linear(nO=nO, nI=tok2vec.get_dim("nO"))
-            model = tok2vec >> list2ragged() >> reduce_mean() >> linear_layer >> Logistic()
+            model = (
+                tok2vec >> list2ragged() >> reduce_mean() >> linear_layer >> Logistic()
+            )
             model.set_ref("output_layer", linear_layer)
     model.set_ref("tok2vec", tok2vec)
     model.set_dim("nO", nO)
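The parenthesized pipeline expression relies on thinc's operator overloading, where `a >> b` composes layers like `chain(a, b)` (enabled through `Model.define_operators` in the surrounding code). For reference, a minimal sketch of the same classifier head spelled out with an explicit `chain`; the `tok2vec` and `nO` arguments stand in for the values used above:

from thinc.api import Model, chain, list2ragged, reduce_mean, Linear, Logistic


def build_head(tok2vec: Model, nO: int) -> Model:
    # Explicit equivalent of:
    #   tok2vec >> list2ragged() >> reduce_mean() >> linear_layer >> Logistic()
    linear_layer = Linear(nO=nO, nI=tok2vec.get_dim("nO"))
    model = chain(tok2vec, list2ragged(), reduce_mean(), linear_layer, Logistic())
    model.set_ref("output_layer", linear_layer)
    return model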
@@ -119,7 +119,7 @@ def hash_embed_bilstm_v1(
 
 
 @registry.architectures.register("spacy.HashCharEmbedBiLSTM.v1")
-def hash_embed_bilstm_v1(
+def hash_char_embed_bilstm_v1(
     pretrained_vectors, width, depth, embed_size, subword_features, nM=0, nC=0
 ):
     # Allows using character embeddings by setting nC, nM and char_embed=True
@@ -10,7 +10,6 @@ from ..util import link_vectors_to_models, minibatch, eg2doc
 
 @component("tok2vec", assigns=["doc.tensor"])
 class Tok2Vec(Pipe):
-
     @classmethod
     def from_nlp(cls, nlp, model, **cfg):
         return cls(nlp.vocab, model, **cfg)
@@ -73,7 +73,8 @@ def test_add_label_deserializes_correctly():
 
 
 @pytest.mark.parametrize(
-    "pipe_cls,n_moves,model", [(DependencyParser, 5, default_parser()), (EntityRecognizer, 4, default_ner())]
+    "pipe_cls,n_moves,model",
+    [(DependencyParser, 5, default_parser()), (EntityRecognizer, 4, default_ner())],
 )
 def test_add_label_get_label(pipe_cls, n_moves, model):
     """Test that added labels are returned correctly. This test was added to
@@ -212,7 +212,8 @@ def test_empty_ner():
     nlp.begin_training()
     doc = nlp("John is watching the news about Croatia's elections")
     # if this goes wrong, the initialization of the parser's upper layer is probably broken
-    assert [token.ent_iob_ for token in doc] == ['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']
+    result = ["O", "O", "O", "O", "O", "O", "O", "O", "O"]
+    assert [token.ent_iob_ for token in doc] == result
 
 
 def test_ruler_before_ner():
@@ -237,7 +237,7 @@ def test_issue1889(word):
     assert is_stop(word, STOP_WORDS) == is_stop(word.upper(), STOP_WORDS)
 
 
-@pytest.mark.skip(reason="This test has become obsolete with the config refactor of v.3")
+@pytest.mark.skip(reason="obsolete with the config refactor of v.3")
 def test_issue1915():
     cfg = {"hidden_depth": 2}  # should error out
     nlp = Language()
@@ -58,10 +58,22 @@ subword_features = false
 
 @registry.architectures.register("my_test_parser")
 def my_parser():
-    tok2vec = build_Tok2Vec_model(width=321, embed_size=5432, pretrained_vectors=None, window_size=3,
-                                  maxout_pieces=4, subword_features=True, char_embed=True, nM=64, nC=8,
-                                  conv_depth=2, bilstm_depth=0)
-    parser = build_tb_parser_model(tok2vec=tok2vec, nr_feature_tokens=7, hidden_width=65, maxout_pieces=5)
+    tok2vec = build_Tok2Vec_model(
+        width=321,
+        embed_size=5432,
+        pretrained_vectors=None,
+        window_size=3,
+        maxout_pieces=4,
+        subword_features=True,
+        char_embed=True,
+        nM=64,
+        nC=8,
+        conv_depth=2,
+        bilstm_depth=0,
+    )
+    parser = build_tb_parser_model(
+        tok2vec=tok2vec, nr_feature_tokens=7, hidden_width=65, maxout_pieces=5
+    )
     return parser
 
 
@@ -88,7 +100,7 @@ def test_serialize_custom_nlp():
     """ Create a custom nlp pipeline and ensure it serializes it correctly"""
     nlp = English()
     parser_cfg = dict()
-    parser_cfg["model"] = {'@architectures': "my_test_parser"}
+    parser_cfg["model"] = {"@architectures": "my_test_parser"}
     parser = nlp.create_pipe("parser", parser_cfg)
     nlp.add_pipe(parser)
     nlp.begin_training()
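Because `parser_cfg["model"]` stores the registered name "my_test_parser" rather than a model object, the pipeline serializes cleanly: on loading, the name is looked up in the registry again, so the registering module only has to be imported first. A hedged sketch of the bytes-based round trip, continuing from the test above (the real test may go through disk instead):

nlp_bytes = nlp.to_bytes()

nlp2 = English()
parser2 = nlp2.create_pipe("parser", {"model": {"@architectures": "my_test_parser"}})
nlp2.add_pipe(parser2)
# Weights are restored into the freshly built "my_test_parser" model.
nlp2.from_bytes(nlp_bytes)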
@@ -1,7 +1,8 @@
 import pytest
 from spacy.pipeline import Tagger, DependencyParser, EntityRecognizer
 from spacy.pipeline import Tensorizer, TextCategorizer, SentenceRecognizer
-from spacy.ml.models.defaults import default_parser, default_tensorizer, default_tagger, default_textcat, default_sentrec
+from spacy.ml.models.defaults import default_parser, default_tensorizer, default_tagger
+from spacy.ml.models.defaults import default_textcat, default_sentrec
 
 from ..util import make_tempdir
 
@@ -114,7 +115,9 @@ def test_serialize_tensorizer_roundtrip_disk(en_vocab):
 
 def test_serialize_textcat_empty(en_vocab):
     # See issue #1105
-    textcat = TextCategorizer(en_vocab, default_textcat(), labels=["ENTITY", "ACTION", "MODIFIER"])
+    textcat = TextCategorizer(
+        en_vocab, default_textcat(), labels=["ENTITY", "ACTION", "MODIFIER"]
+    )
     textcat.to_bytes(exclude=["vocab"])
 
 
@@ -67,8 +67,8 @@ def test_tok2vec_batch_sizes(batch_size, width, embed_size):
         {"width": 8, "embed_size": 100, "char_embed": False, "nM": 64, "nC": 8, "pretrained_vectors": None, "window_size": 1, "conv_depth": 6, "bilstm_depth": 0, "maxout_pieces": 3, "subword_features": True},
         {"width": 8, "embed_size": 100, "char_embed": False, "nM": 64, "nC": 8, "pretrained_vectors": None, "window_size": 1, "conv_depth": 2, "bilstm_depth": 0, "maxout_pieces": 3, "subword_features": False},
         {"width": 8, "embed_size": 100, "char_embed": False, "nM": 64, "nC": 8, "pretrained_vectors": None, "window_size": 3, "conv_depth": 2, "bilstm_depth": 0, "maxout_pieces": 3, "subword_features": False},
-        {"width": 8, "embed_size": 100, "char_embed": True, "nM": 81, "nC": 8, "pretrained_vectors": None,  "window_size": 3, "conv_depth": 2, "bilstm_depth": 0, "maxout_pieces": 3, "subword_features": False},
-        {"width": 8, "embed_size": 100, "char_embed": True, "nM": 81, "nC": 9, "pretrained_vectors": None,  "window_size": 3, "conv_depth": 2, "bilstm_depth": 0, "maxout_pieces": 3, "subword_features": False},
+        {"width": 8, "embed_size": 100, "char_embed": True, "nM": 81, "nC": 8, "pretrained_vectors": None, "window_size": 3, "conv_depth": 2, "bilstm_depth": 0, "maxout_pieces": 3, "subword_features": False},
+        {"width": 8, "embed_size": 100, "char_embed": True, "nM": 81, "nC": 9, "pretrained_vectors": None, "window_size": 3, "conv_depth": 2, "bilstm_depth": 0, "maxout_pieces": 3, "subword_features": False},
     ],
 )
 # fmt: on
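Each dictionary in this grid is one complete keyword set for the tok2vec builder, which the test presumably splats into the call. A minimal sketch with the first configuration (the import path is assumed from this commit's layout):

from spacy.ml.models.tok2vec import build_Tok2Vec_model  # path assumed

cfg = {
    "width": 8,
    "embed_size": 100,
    "char_embed": False,
    "nM": 64,
    "nC": 8,
    "pretrained_vectors": None,
    "window_size": 1,
    "conv_depth": 6,
    "bilstm_depth": 0,
    "maxout_pieces": 3,
    "subword_features": True,
}
tok2vec = build_Tok2Vec_model(**cfg)  # each grid entry builds one variant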