Tidy up and auto-format

Ines Montani 2020-02-28 11:57:41 +01:00
parent 06f0a8daa0
commit 5da3ad682a
17 changed files with 96 additions and 59 deletions

View File

@@ -120,7 +120,7 @@ def pretrain(
             window_size=1,
             char_embed=False,
             nM=64,
-            nC=8
+            nC=8,
         ),
     )
     # Load in pretrained weights
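
Note: the only change in this hunk is the trailing comma after nC=8. Black, evidently the formatter behind this tidy-up, adds a "magic trailing comma" to the last element of any call or literal it keeps exploded across multiple lines.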

View File

@@ -9,7 +9,7 @@ from wasabi import msg
 import contextlib
 import random
-from ..util import create_default_optimizer, registry
+from ..util import create_default_optimizer
 from ..util import use_gpu as set_gpu
 from ..attrs import PROB, IS_OOV, CLUSTER, LANG
 from ..gold import GoldCorpus
@@ -161,7 +161,10 @@ def train(
             raise ValueError(f"Component {pipe} currently not supported.")
         pipe_cfg = util.load_config(config_loc, create_objects=False)
         if vectors:
-            pretrained_config = {'@architectures': 'spacy.VocabVectors.v1', 'name': vectors}
+            pretrained_config = {
+                "@architectures": "spacy.VocabVectors.v1",
+                "name": vectors,
+            }
             pipe_cfg["model"]["tok2vec"]["pretrained_vectors"] = pretrained_config
         if pipe == "parser":
@@ -202,7 +205,7 @@ def train(
         msg.text(f"Starting with blank model '{lang}'")
         lang_cls = util.get_lang_class(lang)
         nlp = lang_cls()
     if vectors:
         msg.text(f"Loading vectors from model '{vectors}'")
@@ -222,7 +225,10 @@ def train(
             raise ValueError(f"Component {pipe} currently not supported.")
         pipe_cfg = util.load_config(config_loc, create_objects=False)
         if vectors:
-            pretrained_config = {'@architectures': 'spacy.VocabVectors.v1', 'name': vectors}
+            pretrained_config = {
+                "@architectures": "spacy.VocabVectors.v1",
+                "name": vectors,
+            }
             pipe_cfg["model"]["tok2vec"]["pretrained_vectors"] = pretrained_config
         if pipe == "parser":

View File

@@ -1,10 +1,8 @@
 from typing import Optional, Dict, List, Union, Sequence
-from pydantic import BaseModel, FilePath, StrictInt
+from pydantic import BaseModel, FilePath
 import plac
 import tqdm
 from pathlib import Path
 from wasabi import msg
 import thinc
 import thinc.schedules

View File

@@ -130,7 +130,13 @@ class Language(object):
     factories = {"tokenizer": lambda nlp: nlp.Defaults.create_tokenizer(nlp)}

     def __init__(
-        self, vocab=True, make_doc=True, max_length=10 ** 6, meta={}, config=None, **kwargs
+        self,
+        vocab=True,
+        make_doc=True,
+        max_length=10 ** 6,
+        meta={},
+        config=None,
+        **kwargs,
     ):
         """Initialise a Language object.
@@ -176,20 +182,29 @@ class Language(object):
         self.max_length = max_length
         self._optimizer = None
-        from .ml.models.defaults import default_tagger_config, default_parser_config, default_ner_config, \
-            default_textcat_config, default_nel_config, default_morphologizer_config, default_sentrec_config, \
-            default_tensorizer_config, default_tok2vec_config
-        self.defaults = {"tagger": default_tagger_config(),
-                         "parser": default_parser_config(),
-                         "ner": default_ner_config(),
-                         "textcat": default_textcat_config(),
-                         "entity_linker": default_nel_config(),
-                         "morphologizer": default_morphologizer_config(),
-                         "sentrec": default_sentrec_config(),
-                         "tensorizer": default_tensorizer_config(),
-                         "tok2vec": default_tok2vec_config(),
-                         }
+        from .ml.models.defaults import (
+            default_tagger_config,
+            default_parser_config,
+            default_ner_config,
+            default_textcat_config,
+            default_nel_config,
+            default_morphologizer_config,
+            default_sentrec_config,
+            default_tensorizer_config,
+            default_tok2vec_config,
+        )
+        self.defaults = {
+            "tagger": default_tagger_config(),
+            "parser": default_parser_config(),
+            "ner": default_ner_config(),
+            "textcat": default_textcat_config(),
+            "entity_linker": default_nel_config(),
+            "morphologizer": default_morphologizer_config(),
+            "sentrec": default_sentrec_config(),
+            "tensorizer": default_tensorizer_config(),
+            "tok2vec": default_tok2vec_config(),
+        }

     @property
     def path(self):
@@ -329,12 +344,14 @@ class Language(object):
                 model_cfg = None
             del config["model"]
         if model_cfg is None and default_config is not None:
-            user_warning(Warnings.W098)
+            user_warning(Warnings.W098.format(name=name))
             model_cfg = default_config["model"]
         model = None
         if model_cfg is not None:
             self.config[name] = {"model": model_cfg}
-            model = registry.make_from_config({"model": model_cfg}, validate=True)["model"]
+            model = registry.make_from_config({"model": model_cfg}, validate=True)[
+                "model"
+            ]
         return factory(self, model, **config)

     def add_pipe(
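
Two substantive fixes ride along with the reformat above: Warnings.W098 is a format-string template, so passing name=name makes the warning say which component fell back to its default model config, while the make_from_config call is only re-wrapped to satisfy the line-length limit.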

View File

@@ -1,6 +1,6 @@
-from .entity_linker import *
-from .parser import *
-from .tagger import *
-from .tensorizer import *
-from .textcat import *
-from .tok2vec import *
+from .entity_linker import *  # noqa
+from .parser import *  # noqa
+from .tagger import *  # noqa
+from .tensorizer import *  # noqa
+from .textcat import *  # noqa
+from .tok2vec import *  # noqa
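
The added # noqa markers silence flake8's star-import checks (F401 unused import, F403 star import), the standard way to keep a re-exporting package initializer like this one lint-clean.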

View File

@@ -1,9 +1,7 @@
-from pathlib import Path
 from thinc.api import chain, clone, list2ragged, reduce_mean, residual
 from thinc.api import Model, Maxout, Linear
-from spacy.util import registry
+from ...util import registry


 @registry.architectures.register("spacy.EntityLinker.v1")

View File

@@ -1,11 +1,10 @@
 from pydantic import StrictInt
-from spacy.util import registry
-from spacy.ml._layers import PrecomputableAffine
-from spacy.syntax._parser_model import ParserModel
 from thinc.api import Model, chain, list2array, Linear, zero_init, use_ops
+from ...util import registry
+from .._layers import PrecomputableAffine
+from ...syntax._parser_model import ParserModel


 @registry.architectures.register("spacy.TransitionBasedParser.v1")
 def build_tb_parser_model(
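
This file and the neighboring model files switch from absolute spacy.* imports to relative ones. Within the package itself, relative imports keep modules from importing a separately installed copy of spaCy and reduce the risk of circular imports during package initialization.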

View File

@@ -1,6 +1,6 @@
 from thinc.api import zero_init, with_array, Softmax, chain, Model
-from spacy.util import registry
+from ...util import registry


 @registry.architectures.register("spacy.Tagger.v1")

View File

@@ -1,8 +1,9 @@
-from spacy.attrs import ORTH
-from spacy.util import registry
-from spacy.ml.extract_ngrams import extract_ngrams
-from thinc.api import Model, chain, reduce_mean, Linear, list2ragged, Logistic, SparseLinear, Softmax
+from thinc.api import Model, chain, reduce_mean, Linear, list2ragged, Logistic
+from thinc.api import SparseLinear, Softmax
+
+from ...attrs import ORTH
+from ...util import registry
+from ..extract_ngrams import extract_ngrams


 @registry.architectures.register("spacy.TextCatCNN.v1")
@@ -21,7 +22,9 @@ def build_simple_cnn_text_classifier(tok2vec, exclusive_classes, nO=None):
     else:
         # TODO: experiment with init_w=zero_init
         linear_layer = Linear(nO=nO, nI=tok2vec.get_dim("nO"))
-        model = tok2vec >> list2ragged() >> reduce_mean() >> linear_layer >> Logistic()
+        model = (
+            tok2vec >> list2ragged() >> reduce_mean() >> linear_layer >> Logistic()
+        )
         model.set_ref("output_layer", linear_layer)
     model.set_ref("tok2vec", tok2vec)
     model.set_dim("nO", nO)

View File

@@ -119,7 +119,7 @@ def hash_embed_bilstm_v1(
 @registry.architectures.register("spacy.HashCharEmbedBiLSTM.v1")
-def hash_embed_bilstm_v1(
+def hash_char_embed_bilstm_v1(
     pretrained_vectors, width, depth, embed_size, subword_features, nM=0, nC=0
 ):
     # Allows using character embeddings by setting nC, nM and char_embed=True
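
This rename is a real fix riding along with the formatting: the hunk header shows the file already defines hash_embed_bilstm_v1 just above, so a second def with the same name rebound the module attribute and shadowed the first (both stayed reachable through the registry, since registration happens at decoration time). A minimal illustration of the shadowing pitfall:

    def build_embed():  # first definition
        return "word features"

    def build_embed():  # same name: rebinds the module attribute, shadowing the first
        return "char features"

    # Only the second definition is reachable by name (flake8 reports this as F811)
    assert build_embed() == "char features"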

View File

@@ -10,7 +10,6 @@ from ..util import link_vectors_to_models, minibatch, eg2doc

 @component("tok2vec", assigns=["doc.tensor"])
 class Tok2Vec(Pipe):
     @classmethod
     def from_nlp(cls, nlp, model, **cfg):
         return cls(nlp.vocab, model, **cfg)

View File

@@ -73,7 +73,8 @@ def test_add_label_deserializes_correctly():
 @pytest.mark.parametrize(
-    "pipe_cls,n_moves,model", [(DependencyParser, 5, default_parser()), (EntityRecognizer, 4, default_ner())]
+    "pipe_cls,n_moves,model",
+    [(DependencyParser, 5, default_parser()), (EntityRecognizer, 4, default_ner())],
 )
 def test_add_label_get_label(pipe_cls, n_moves, model):
     """Test that added labels are returned correctly. This test was added to

View File

@@ -212,7 +212,8 @@ def test_empty_ner():
     nlp.begin_training()
     doc = nlp("John is watching the news about Croatia's elections")
     # if this goes wrong, the initialization of the parser's upper layer is probably broken
-    assert [token.ent_iob_ for token in doc] == ['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']
+    result = ["O", "O", "O", "O", "O", "O", "O", "O", "O"]
+    assert [token.ent_iob_ for token in doc] == result


 def test_ruler_before_ner():

View File

@@ -237,7 +237,7 @@ def test_issue1889(word):
     assert is_stop(word, STOP_WORDS) == is_stop(word.upper(), STOP_WORDS)


-@pytest.mark.skip(reason="This test has become obsolete with the config refactor of v.3")
+@pytest.mark.skip(reason="obsolete with the config refactor of v.3")
 def test_issue1915():
     cfg = {"hidden_depth": 2}  # should error out
     nlp = Language()

View File

@@ -58,10 +58,22 @@ subword_features = false
 @registry.architectures.register("my_test_parser")
 def my_parser():
-    tok2vec = build_Tok2Vec_model(width=321, embed_size=5432, pretrained_vectors=None, window_size=3,
-                                  maxout_pieces=4, subword_features=True, char_embed=True, nM=64, nC=8,
-                                  conv_depth=2, bilstm_depth=0)
-    parser = build_tb_parser_model(tok2vec=tok2vec, nr_feature_tokens=7, hidden_width=65, maxout_pieces=5)
+    tok2vec = build_Tok2Vec_model(
+        width=321,
+        embed_size=5432,
+        pretrained_vectors=None,
+        window_size=3,
+        maxout_pieces=4,
+        subword_features=True,
+        char_embed=True,
+        nM=64,
+        nC=8,
+        conv_depth=2,
+        bilstm_depth=0,
+    )
+    parser = build_tb_parser_model(
+        tok2vec=tok2vec, nr_feature_tokens=7, hidden_width=65, maxout_pieces=5
+    )
     return parser
@@ -88,7 +100,7 @@ def test_serialize_custom_nlp():
     """ Create a custom nlp pipeline and ensure it serializes it correctly"""
     nlp = English()
     parser_cfg = dict()
-    parser_cfg["model"] = {'@architectures': "my_test_parser"}
+    parser_cfg["model"] = {"@architectures": "my_test_parser"}
     parser = nlp.create_pipe("parser", parser_cfg)
     nlp.add_pipe(parser)
     nlp.begin_training()
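
Black also normalizes string quotes to double quotes, which is the only change in this hunk.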

View File

@@ -1,7 +1,8 @@
 import pytest
 from spacy.pipeline import Tagger, DependencyParser, EntityRecognizer
 from spacy.pipeline import Tensorizer, TextCategorizer, SentenceRecognizer
-from spacy.ml.models.defaults import default_parser, default_tensorizer, default_tagger, default_textcat, default_sentrec
+from spacy.ml.models.defaults import default_parser, default_tensorizer, default_tagger
+from spacy.ml.models.defaults import default_textcat, default_sentrec

 from ..util import make_tempdir
@@ -114,7 +115,9 @@ def test_serialize_tensorizer_roundtrip_disk(en_vocab):
 def test_serialize_textcat_empty(en_vocab):
     # See issue #1105
-    textcat = TextCategorizer(en_vocab, default_textcat(), labels=["ENTITY", "ACTION", "MODIFIER"])
+    textcat = TextCategorizer(
+        en_vocab, default_textcat(), labels=["ENTITY", "ACTION", "MODIFIER"]
+    )
     textcat.to_bytes(exclude=["vocab"])

View File

@@ -67,8 +67,8 @@ def test_tok2vec_batch_sizes(batch_size, width, embed_size):
         {"width": 8, "embed_size": 100, "char_embed": False, "nM": 64, "nC": 8, "pretrained_vectors": None, "window_size": 1, "conv_depth": 6, "bilstm_depth": 0, "maxout_pieces": 3, "subword_features": True},
         {"width": 8, "embed_size": 100, "char_embed": False, "nM": 64, "nC": 8, "pretrained_vectors": None, "window_size": 1, "conv_depth": 2, "bilstm_depth": 0, "maxout_pieces": 3, "subword_features": False},
         {"width": 8, "embed_size": 100, "char_embed": False, "nM": 64, "nC": 8, "pretrained_vectors": None, "window_size": 3, "conv_depth": 2, "bilstm_depth": 0, "maxout_pieces": 3, "subword_features": False},
         {"width": 8, "embed_size": 100, "char_embed": True, "nM": 81, "nC": 8, "pretrained_vectors": None, "window_size": 3, "conv_depth": 2, "bilstm_depth": 0, "maxout_pieces": 3, "subword_features": False},
         {"width": 8, "embed_size": 100, "char_embed": True, "nM": 81, "nC": 9, "pretrained_vectors": None, "window_size": 3, "conv_depth": 2, "bilstm_depth": 0, "maxout_pieces": 3, "subword_features": False},
     ],
 )
 # fmt: on
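
The closing # fmt: on shows this parametrize table sits in a formatter-exempt region (paired with a # fmt: off above the excerpt), which is why the long one-dict-per-line rows survive the auto-format pass untouched.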