Tidy up and auto-format

Ines Montani 2020-02-28 11:57:41 +01:00
parent 06f0a8daa0
commit 5da3ad682a
17 changed files with 96 additions and 59 deletions

View File

@@ -120,7 +120,7 @@ def pretrain(
             window_size=1,
             char_embed=False,
             nM=64,
-            nC=8
+            nC=8,
         ),
     )
     # Load in pretrained weights
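The only substantive change in this hunk is the trailing comma after the final keyword argument. That is standard Black output: whenever it explodes a call or literal across multiple lines, it appends a comma to the last element, so adding another argument later only touches one line. A minimal runnable sketch of the behavior (a plain dict call stands in for the real one):

    # Black keeps one argument per line and adds a trailing comma to the last:
    cfg = dict(
        nM=64,
        nC=8,  # trailing comma added by the formatter
    )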

View File

@@ -9,7 +9,7 @@ from wasabi import msg
 import contextlib
 import random
-from ..util import create_default_optimizer, registry
+from ..util import create_default_optimizer
 from ..util import use_gpu as set_gpu
 from ..attrs import PROB, IS_OOV, CLUSTER, LANG
 from ..gold import GoldCorpus
@@ -161,7 +161,10 @@ def train(
         raise ValueError(f"Component {pipe} currently not supported.")
     pipe_cfg = util.load_config(config_loc, create_objects=False)
     if vectors:
-        pretrained_config = {'@architectures': 'spacy.VocabVectors.v1', 'name': vectors}
+        pretrained_config = {
+            "@architectures": "spacy.VocabVectors.v1",
+            "name": vectors,
+        }
         pipe_cfg["model"]["tok2vec"]["pretrained_vectors"] = pretrained_config
     if pipe == "parser":
@@ -202,7 +205,7 @@ def train(
         msg.text(f"Starting with blank model '{lang}'")
         lang_cls = util.get_lang_class(lang)
         nlp = lang_cls()
     if vectors:
         msg.text(f"Loading vectors from model '{vectors}'")
@@ -222,7 +225,10 @@ def train(
         raise ValueError(f"Component {pipe} currently not supported.")
     pipe_cfg = util.load_config(config_loc, create_objects=False)
     if vectors:
-        pretrained_config = {'@architectures': 'spacy.VocabVectors.v1', 'name': vectors}
+        pretrained_config = {
+            "@architectures": "spacy.VocabVectors.v1",
+            "name": vectors,
+        }
         pipe_cfg["model"]["tok2vec"]["pretrained_vectors"] = pretrained_config
     if pipe == "parser":
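For context on the `pretrained_config` blocks above: dicts with an "@architectures" key follow the registry config format used throughout this commit. The key names a function registered in `registry.architectures`, and the remaining keys become its keyword arguments when the block is resolved. A minimal self-contained sketch, using a made-up registration name (`demo.Vectors.v1` is not a real spaCy architecture):

    from spacy.util import registry

    @registry.architectures.register("demo.Vectors.v1")
    def demo_vectors(name: str):
        # Stand-in for spacy.VocabVectors.v1: just echo the vectors name
        return {"vectors_name": name}

    cfg = {"model": {"@architectures": "demo.Vectors.v1", "name": "en_vectors"}}
    resolved = registry.make_from_config(cfg, validate=True)["model"]
    assert resolved == {"vectors_name": "en_vectors"}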

View File

@@ -1,10 +1,8 @@
 from typing import Optional, Dict, List, Union, Sequence
-from pydantic import BaseModel, FilePath, StrictInt
+from pydantic import BaseModel, FilePath
 import plac
 import tqdm
 from pathlib import Path
 from wasabi import msg
 import thinc
 import thinc.schedules

View File

@@ -130,7 +130,13 @@ class Language(object):
     factories = {"tokenizer": lambda nlp: nlp.Defaults.create_tokenizer(nlp)}

     def __init__(
-        self, vocab=True, make_doc=True, max_length=10 ** 6, meta={}, config=None, **kwargs
+        self,
+        vocab=True,
+        make_doc=True,
+        max_length=10 ** 6,
+        meta={},
+        config=None,
+        **kwargs,
     ):
         """Initialise a Language object.
@@ -176,20 +182,29 @@ class Language(object):
         self.max_length = max_length
         self._optimizer = None
-        from .ml.models.defaults import default_tagger_config, default_parser_config, default_ner_config, \
-            default_textcat_config, default_nel_config, default_morphologizer_config, default_sentrec_config, \
-            default_tensorizer_config, default_tok2vec_config
+        from .ml.models.defaults import (
+            default_tagger_config,
+            default_parser_config,
+            default_ner_config,
+            default_textcat_config,
+            default_nel_config,
+            default_morphologizer_config,
+            default_sentrec_config,
+            default_tensorizer_config,
+            default_tok2vec_config,
+        )
-        self.defaults = {"tagger": default_tagger_config(),
-                         "parser": default_parser_config(),
-                         "ner": default_ner_config(),
-                         "textcat": default_textcat_config(),
-                         "entity_linker": default_nel_config(),
-                         "morphologizer": default_morphologizer_config(),
-                         "sentrec": default_sentrec_config(),
-                         "tensorizer": default_tensorizer_config(),
-                         "tok2vec": default_tok2vec_config(),
-                         }
+        self.defaults = {
+            "tagger": default_tagger_config(),
+            "parser": default_parser_config(),
+            "ner": default_ner_config(),
+            "textcat": default_textcat_config(),
+            "entity_linker": default_nel_config(),
+            "morphologizer": default_morphologizer_config(),
+            "sentrec": default_sentrec_config(),
+            "tensorizer": default_tensorizer_config(),
+            "tok2vec": default_tok2vec_config(),
+        }

     @property
     def path(self):
@@ -329,12 +344,14 @@ class Language(object):
             model_cfg = None
             del config["model"]
         if model_cfg is None and default_config is not None:
-            user_warning(Warnings.W098)
+            user_warning(Warnings.W098.format(name=name))
             model_cfg = default_config["model"]
         model = None
         if model_cfg is not None:
-            self.config[name] = {"model": model_cfg}
-            model = registry.make_from_config({"model": model_cfg}, validate=True)["model"]
+            self.config[name] = {"model": model_cfg}
+            model = registry.make_from_config({"model": model_cfg}, validate=True)[
+                "model"
+            ]
         return factory(self, model, **config)

     def add_pipe(
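The `user_warning` change above is more than formatting: W098 evidently gained a `{name}` placeholder, and spaCy's warning codes are plain format-string templates, so the template must be filled with the component name before the warning is emitted. A sketch of the pattern (the message text is illustrative, not the real W098 string):

    import warnings

    W098 = "No model defined for component '{name}'."  # illustrative text only
    warnings.warn(W098.format(name="tagger"), UserWarning)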

View File

@@ -1,6 +1,6 @@
-from .entity_linker import *
-from .parser import *
-from .tagger import *
-from .tensorizer import *
-from .textcat import *
-from .tok2vec import *
+from .entity_linker import *  # noqa
+from .parser import *  # noqa
+from .tagger import *  # noqa
+from .tensorizer import *  # noqa
+from .textcat import *  # noqa
+from .tok2vec import *  # noqa

View File

@@ -1,9 +1,7 @@
 from pathlib import Path
 from thinc.api import chain, clone, list2ragged, reduce_mean, residual
 from thinc.api import Model, Maxout, Linear
-from spacy.util import registry
+from ...util import registry


 @registry.architectures.register("spacy.EntityLinker.v1")

View File

@@ -1,11 +1,10 @@
 from pydantic import StrictInt
-from spacy.util import registry
-from spacy.ml._layers import PrecomputableAffine
-from spacy.syntax._parser_model import ParserModel
 from thinc.api import Model, chain, list2array, Linear, zero_init, use_ops
+from ...util import registry
+from .._layers import PrecomputableAffine
+from ...syntax._parser_model import ParserModel


 @registry.architectures.register("spacy.TransitionBasedParser.v1")
 def build_tb_parser_model(

View File

@@ -1,6 +1,6 @@
 from thinc.api import zero_init, with_array, Softmax, chain, Model
-from spacy.util import registry
+from ...util import registry


 @registry.architectures.register("spacy.Tagger.v1")

View File

@@ -1,8 +1,9 @@
-from spacy.attrs import ORTH
-from spacy.util import registry
-from spacy.ml.extract_ngrams import extract_ngrams
-from thinc.api import Model, chain, reduce_mean, Linear, list2ragged, Logistic
-from thinc.api import SparseLinear, Softmax
+from thinc.api import Model, chain, reduce_mean, Linear, list2ragged, Logistic, SparseLinear, Softmax
+from ...attrs import ORTH
+from ...util import registry
+from ..extract_ngrams import extract_ngrams


 @registry.architectures.register("spacy.TextCatCNN.v1")
@@ -21,7 +22,9 @@ def build_simple_cnn_text_classifier(tok2vec, exclusive_classes, nO=None):
     else:
         # TODO: experiment with init_w=zero_init
         linear_layer = Linear(nO=nO, nI=tok2vec.get_dim("nO"))
-        model = tok2vec >> list2ragged() >> reduce_mean() >> linear_layer >> Logistic()
+        model = (
+            tok2vec >> list2ragged() >> reduce_mean() >> linear_layer >> Logistic()
+        )
     model.set_ref("output_layer", linear_layer)
     model.set_ref("tok2vec", tok2vec)
     model.set_dim("nO", nO)
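The parenthesized wrap above is pure formatting, but the expression itself relies on Thinc's combinator operators: inside a `Model.define_operators` block, `>>` is bound to `chain`, so the pipeline is equivalent to an explicit `chain(...)` call. A minimal runnable sketch of that equivalence (the layer sizes are arbitrary):

    from thinc.api import Model, chain, Linear, Logistic

    lower = Linear(nO=8, nI=4)
    upper = Linear(nO=2, nI=8)

    # Explicit combinator:
    explicit = chain(lower, upper, Logistic())

    # Operator sugar; `>>` is only defined inside this block:
    with Model.define_operators({">>": chain}):
        sugared = lower >> upper >> Logistic()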

View File

@@ -119,7 +119,7 @@ def hash_embed_bilstm_v1(
 @registry.architectures.register("spacy.HashCharEmbedBiLSTM.v1")
-def hash_embed_bilstm_v1(
+def hash_char_embed_bilstm_v1(
     pretrained_vectors, width, depth, embed_size, subword_features, nM=0, nC=0
 ):
     # Allows using character embeddings by setting nC, nM and char_embed=True

View File

@@ -10,7 +10,6 @@ from ..util import link_vectors_to_models, minibatch, eg2doc
 @component("tok2vec", assigns=["doc.tensor"])
 class Tok2Vec(Pipe):
-
     @classmethod
     def from_nlp(cls, nlp, model, **cfg):
         return cls(nlp.vocab, model, **cfg)

View File

@@ -73,7 +73,8 @@ def test_add_label_deserializes_correctly():
 @pytest.mark.parametrize(
-    "pipe_cls,n_moves,model", [(DependencyParser, 5, default_parser()), (EntityRecognizer, 4, default_ner())]
+    "pipe_cls,n_moves,model",
+    [(DependencyParser, 5, default_parser()), (EntityRecognizer, 4, default_ner())],
 )
 def test_add_label_get_label(pipe_cls, n_moves, model):
     """Test that added labels are returned correctly. This test was added to

View File

@@ -212,7 +212,8 @@ def test_empty_ner():
     nlp.begin_training()
     doc = nlp("John is watching the news about Croatia's elections")
     # if this goes wrong, the initialization of the parser's upper layer is probably broken
-    assert [token.ent_iob_ for token in doc] == ['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']
+    result = ["O", "O", "O", "O", "O", "O", "O", "O", "O"]
+    assert [token.ent_iob_ for token in doc] == result


 def test_ruler_before_ner():

View File

@@ -237,7 +237,7 @@ def test_issue1889(word):
     assert is_stop(word, STOP_WORDS) == is_stop(word.upper(), STOP_WORDS)


-@pytest.mark.skip(reason="This test has become obsolete with the config refactor of v.3")
+@pytest.mark.skip(reason="obsolete with the config refactor of v.3")
 def test_issue1915():
     cfg = {"hidden_depth": 2}  # should error out
     nlp = Language()

View File

@@ -58,10 +58,22 @@ subword_features = false
 @registry.architectures.register("my_test_parser")
 def my_parser():
-    tok2vec = build_Tok2Vec_model(width=321, embed_size=5432, pretrained_vectors=None, window_size=3,
-                                  maxout_pieces=4, subword_features=True, char_embed=True, nM=64, nC=8,
-                                  conv_depth=2, bilstm_depth=0)
-    parser = build_tb_parser_model(tok2vec=tok2vec, nr_feature_tokens=7, hidden_width=65, maxout_pieces=5)
+    tok2vec = build_Tok2Vec_model(
+        width=321,
+        embed_size=5432,
+        pretrained_vectors=None,
+        window_size=3,
+        maxout_pieces=4,
+        subword_features=True,
+        char_embed=True,
+        nM=64,
+        nC=8,
+        conv_depth=2,
+        bilstm_depth=0,
+    )
+    parser = build_tb_parser_model(
+        tok2vec=tok2vec, nr_feature_tokens=7, hidden_width=65, maxout_pieces=5
+    )
     return parser
@@ -88,7 +100,7 @@ def test_serialize_custom_nlp():
     """ Create a custom nlp pipeline and ensure it serializes it correctly"""
     nlp = English()
     parser_cfg = dict()
-    parser_cfg["model"] = {'@architectures': "my_test_parser"}
+    parser_cfg["model"] = {"@architectures": "my_test_parser"}
     parser = nlp.create_pipe("parser", parser_cfg)
     nlp.add_pipe(parser)
     nlp.begin_training()

View File

@@ -1,7 +1,8 @@
 import pytest
 from spacy.pipeline import Tagger, DependencyParser, EntityRecognizer
 from spacy.pipeline import Tensorizer, TextCategorizer, SentenceRecognizer
-from spacy.ml.models.defaults import default_parser, default_tensorizer, default_tagger, default_textcat, default_sentrec
+from spacy.ml.models.defaults import default_parser, default_tensorizer, default_tagger
+from spacy.ml.models.defaults import default_textcat, default_sentrec

 from ..util import make_tempdir
@@ -114,7 +115,9 @@ def test_serialize_tensorizer_roundtrip_disk(en_vocab):
 def test_serialize_textcat_empty(en_vocab):
     # See issue #1105
-    textcat = TextCategorizer(en_vocab, default_textcat(), labels=["ENTITY", "ACTION", "MODIFIER"])
+    textcat = TextCategorizer(
+        en_vocab, default_textcat(), labels=["ENTITY", "ACTION", "MODIFIER"]
+    )
     textcat.to_bytes(exclude=["vocab"])

View File

@@ -67,8 +67,8 @@ def test_tok2vec_batch_sizes(batch_size, width, embed_size):
         {"width": 8, "embed_size": 100, "char_embed": False, "nM": 64, "nC": 8, "pretrained_vectors": None, "window_size": 1, "conv_depth": 6, "bilstm_depth": 0, "maxout_pieces": 3, "subword_features": True},
         {"width": 8, "embed_size": 100, "char_embed": False, "nM": 64, "nC": 8, "pretrained_vectors": None, "window_size": 1, "conv_depth": 2, "bilstm_depth": 0, "maxout_pieces": 3, "subword_features": False},
         {"width": 8, "embed_size": 100, "char_embed": False, "nM": 64, "nC": 8, "pretrained_vectors": None, "window_size": 3, "conv_depth": 2, "bilstm_depth": 0, "maxout_pieces": 3, "subword_features": False},
-        {"width": 8, "embed_size": 100, "char_embed": True, "nM": 81, "nC": 8, "pretrained_vectors": None, "window_size": 3, "conv_depth": 2, "bilstm_depth": 0, "maxout_pieces": 3, "subword_features": False},
-        {"width": 8, "embed_size": 100, "char_embed": True, "nM": 81, "nC": 9, "pretrained_vectors": None, "window_size": 3, "conv_depth": 2, "bilstm_depth": 0, "maxout_pieces": 3, "subword_features": False},
+        {"width": 8, "embed_size": 100, "char_embed": True, "nM": 81, "nC": 8, "pretrained_vectors": None, "window_size": 3, "conv_depth": 2, "bilstm_depth": 0, "maxout_pieces": 3, "subword_features": False},
+        {"width": 8, "embed_size": 100, "char_embed": True, "nM": 81, "nC": 9, "pretrained_vectors": None, "window_size": 3, "conv_depth": 2, "bilstm_depth": 0, "maxout_pieces": 3, "subword_features": False},
     ],
 )
 # fmt: on
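The `# fmt: on` marker closes a `# fmt: off` region opened before this parametrize list: Black leaves everything between the two comments untouched, which is why these long one-dict-per-line entries survive auto-formatting. A minimal sketch of the pattern:

    # fmt: off
    IDENTITY = [
        [1, 0, 0],
        [0, 1, 0],
        [0, 0, 1],
    ]
    # fmt: on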