mirror of https://github.com/explosion/spaCy.git
synced 2024-11-10 19:57:17 +03:00

Tidy up and auto-format

parent 06f0a8daa0
commit 5da3ad682a

@@ -120,7 +120,7 @@ def pretrain(
             window_size=1,
             char_embed=False,
             nM=64,
-            nC=8
+            nC=8,
         ),
     )
     # Load in pretrained weights

@@ -9,7 +9,7 @@ from wasabi import msg
 import contextlib
 import random

-from ..util import create_default_optimizer, registry
+from ..util import create_default_optimizer
 from ..util import use_gpu as set_gpu
 from ..attrs import PROB, IS_OOV, CLUSTER, LANG
 from ..gold import GoldCorpus

@@ -161,7 +161,10 @@ def train(
             raise ValueError(f"Component {pipe} currently not supported.")
         pipe_cfg = util.load_config(config_loc, create_objects=False)
         if vectors:
-            pretrained_config = {'@architectures': 'spacy.VocabVectors.v1', 'name': vectors}
+            pretrained_config = {
+                "@architectures": "spacy.VocabVectors.v1",
+                "name": vectors,
+            }
             pipe_cfg["model"]["tok2vec"]["pretrained_vectors"] = pretrained_config

         if pipe == "parser":

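For context — a sketch, not part of the commit — this is the nested shape pipe_cfg takes after the assignment above, assuming a hypothetical vectors value such as "en_vectors_web_lg" (other model settings omitted):

pipe_cfg = {
    "model": {
        "tok2vec": {
            # filled in by the hunk above; resolved later via "@architectures"
            "pretrained_vectors": {
                "@architectures": "spacy.VocabVectors.v1",
                "name": "en_vectors_web_lg",
            },
        },
    },
}
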
@@ -202,7 +205,7 @@ def train(
         msg.text(f"Starting with blank model '{lang}'")
         lang_cls = util.get_lang_class(lang)
         nlp = lang_cls()

     if vectors:
         msg.text(f"Loading vectors from model '{vectors}'")

@@ -222,7 +225,10 @@ def train(
             raise ValueError(f"Component {pipe} currently not supported.")
         pipe_cfg = util.load_config(config_loc, create_objects=False)
         if vectors:
-            pretrained_config = {'@architectures': 'spacy.VocabVectors.v1', 'name': vectors}
+            pretrained_config = {
+                "@architectures": "spacy.VocabVectors.v1",
+                "name": vectors,
+            }
             pipe_cfg["model"]["tok2vec"]["pretrained_vectors"] = pretrained_config

         if pipe == "parser":

@@ -1,10 +1,8 @@
 from typing import Optional, Dict, List, Union, Sequence
-from pydantic import BaseModel, FilePath, StrictInt
-
+from pydantic import BaseModel, FilePath
 import plac
 import tqdm
 from pathlib import Path
-
 from wasabi import msg
 import thinc
 import thinc.schedules

@@ -130,7 +130,13 @@ class Language(object):
     factories = {"tokenizer": lambda nlp: nlp.Defaults.create_tokenizer(nlp)}

     def __init__(
-        self, vocab=True, make_doc=True, max_length=10 ** 6, meta={}, config=None, **kwargs
+        self,
+        vocab=True,
+        make_doc=True,
+        max_length=10 ** 6,
+        meta={},
+        config=None,
+        **kwargs,
     ):
         """Initialise a Language object.

@@ -176,20 +182,29 @@ class Language(object):
         self.max_length = max_length
         self._optimizer = None

-        from .ml.models.defaults import default_tagger_config, default_parser_config, default_ner_config, \
-            default_textcat_config, default_nel_config, default_morphologizer_config, default_sentrec_config, \
-            default_tensorizer_config, default_tok2vec_config
+        from .ml.models.defaults import (
+            default_tagger_config,
+            default_parser_config,
+            default_ner_config,
+            default_textcat_config,
+            default_nel_config,
+            default_morphologizer_config,
+            default_sentrec_config,
+            default_tensorizer_config,
+            default_tok2vec_config,
+        )

-        self.defaults = {"tagger": default_tagger_config(),
-                         "parser": default_parser_config(),
-                         "ner": default_ner_config(),
-                         "textcat": default_textcat_config(),
-                         "entity_linker": default_nel_config(),
-                         "morphologizer": default_morphologizer_config(),
-                         "sentrec": default_sentrec_config(),
-                         "tensorizer": default_tensorizer_config(),
-                         "tok2vec": default_tok2vec_config(),
-                         }
+        self.defaults = {
+            "tagger": default_tagger_config(),
+            "parser": default_parser_config(),
+            "ner": default_ner_config(),
+            "textcat": default_textcat_config(),
+            "entity_linker": default_nel_config(),
+            "morphologizer": default_morphologizer_config(),
+            "sentrec": default_sentrec_config(),
+            "tensorizer": default_tensorizer_config(),
+            "tok2vec": default_tok2vec_config(),
+        }

     @property
     def path(self):

@@ -329,12 +344,14 @@ class Language(object):
                 model_cfg = None
             del config["model"]
         if model_cfg is None and default_config is not None:
-            user_warning(Warnings.W098)
+            user_warning(Warnings.W098.format(name=name))
             model_cfg = default_config["model"]
         model = None
         if model_cfg is not None:
             self.config[name] = {"model": model_cfg}
-            model = registry.make_from_config({"model": model_cfg}, validate=True)["model"]
+            model = registry.make_from_config({"model": model_cfg}, validate=True)[
+                "model"
+            ]
         return factory(self, model, **config)

     def add_pipe(

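As background, a minimal sketch — an assumption-laden illustration, not code from this commit — of the registry pattern the hunk relies on: a factory is registered under a string name, and registry.make_from_config (the same call used above) instantiates it from a plain dict. The factory and the "demo..." name are hypothetical:

from thinc.api import Linear, Model

from spacy.util import registry


@registry.architectures.register("demo.SimpleLinear.v1")  # hypothetical name
def make_simple_linear(nO: int, nI: int) -> Model:
    # Any factory that returns a Thinc Model works here.
    return Linear(nO=nO, nI=nI)


# "@architectures" selects the registered factory; the remaining keys
# become that factory's arguments.
model_cfg = {"@architectures": "demo.SimpleLinear.v1", "nO": 2, "nI": 4}
model = registry.make_from_config({"model": model_cfg}, validate=True)["model"]
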
@@ -1,6 +1,6 @@
-from .entity_linker import *
-from .parser import *
-from .tagger import *
-from .tensorizer import *
-from .textcat import *
-from .tok2vec import *
+from .entity_linker import *  # noqa
+from .parser import *  # noqa
+from .tagger import *  # noqa
+from .tensorizer import *  # noqa
+from .textcat import *  # noqa
+from .tok2vec import *  # noqa

@@ -1,9 +1,7 @@
 from pathlib import Path
-
 from thinc.api import chain, clone, list2ragged, reduce_mean, residual
 from thinc.api import Model, Maxout, Linear
-
-from spacy.util import registry
+from ...util import registry


 @registry.architectures.register("spacy.EntityLinker.v1")

@@ -1,11 +1,10 @@
 from pydantic import StrictInt
-
-from spacy.util import registry
-from spacy.ml._layers import PrecomputableAffine
-from spacy.syntax._parser_model import ParserModel
-
 from thinc.api import Model, chain, list2array, Linear, zero_init, use_ops
+
+from ...util import registry
+from .._layers import PrecomputableAffine
+from ...syntax._parser_model import ParserModel


 @registry.architectures.register("spacy.TransitionBasedParser.v1")
 def build_tb_parser_model(

@@ -1,6 +1,6 @@
 from thinc.api import zero_init, with_array, Softmax, chain, Model

-from spacy.util import registry
+from ...util import registry


 @registry.architectures.register("spacy.Tagger.v1")

@@ -1,8 +1,9 @@
-from spacy.attrs import ORTH
-from spacy.util import registry
-from spacy.ml.extract_ngrams import extract_ngrams
-from thinc.api import Model, chain, reduce_mean, Linear, list2ragged, Logistic
-from thinc.api import SparseLinear, Softmax
+from thinc.api import Model, chain, reduce_mean, Linear, list2ragged, Logistic, SparseLinear, Softmax
+
+from ...attrs import ORTH
+from ...util import registry
+from ..extract_ngrams import extract_ngrams


 @registry.architectures.register("spacy.TextCatCNN.v1")

@@ -21,7 +22,9 @@ def build_simple_cnn_text_classifier(tok2vec, exclusive_classes, nO=None):
     else:
         # TODO: experiment with init_w=zero_init
         linear_layer = Linear(nO=nO, nI=tok2vec.get_dim("nO"))
-        model = tok2vec >> list2ragged() >> reduce_mean() >> linear_layer >> Logistic()
+        model = (
+            tok2vec >> list2ragged() >> reduce_mean() >> linear_layer >> Logistic()
+        )
     model.set_ref("output_layer", linear_layer)
     model.set_ref("tok2vec", tok2vec)
     model.set_dim("nO", nO)

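For readers unfamiliar with the ">>" pipelining in this hunk: in Thinc it is operator sugar for the chain combinator and is only defined inside a Model.define_operators block. A minimal sketch with illustrative layer sizes (not taken from the commit):

from thinc.api import Linear, Logistic, Model, chain

# ">>" composes layers left to right, exactly like chain(...).
with Model.define_operators({">>": chain}):
    model = Linear(nO=2, nI=4) >> Logistic()

# Equivalent without the operator sugar:
model_explicit = chain(Linear(nO=2, nI=4), Logistic())
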
@@ -119,7 +119,7 @@ def hash_embed_bilstm_v1(


 @registry.architectures.register("spacy.HashCharEmbedBiLSTM.v1")
-def hash_embed_bilstm_v1(
+def hash_char_embed_bilstm_v1(
     pretrained_vectors, width, depth, embed_size, subword_features, nM=0, nC=0
 ):
     # Allows using character embeddings by setting nC, nM and char_embed=True

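The rename above is more than style: the module defined two functions named hash_embed_bilstm_v1 (the hunk header shows the earlier one), and a second def with the same name silently rebinds the module-level attribute, even though each factory stays registered under its own string name. A small sketch of the pitfall, using a hypothetical function:

def build_model():  # first definition
    return "tok2vec"


def build_model():  # same name: silently replaces the first binding
    return "char_tok2vec"


assert build_model() == "char_tok2vec"  # the first version is no longer reachable by name
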
@@ -10,7 +10,6 @@ from ..util import link_vectors_to_models, minibatch, eg2doc

 @component("tok2vec", assigns=["doc.tensor"])
 class Tok2Vec(Pipe):
-
     @classmethod
     def from_nlp(cls, nlp, model, **cfg):
         return cls(nlp.vocab, model, **cfg)

@@ -73,7 +73,8 @@ def test_add_label_deserializes_correctly():


 @pytest.mark.parametrize(
-    "pipe_cls,n_moves,model", [(DependencyParser, 5, default_parser()), (EntityRecognizer, 4, default_ner())]
+    "pipe_cls,n_moves,model",
+    [(DependencyParser, 5, default_parser()), (EntityRecognizer, 4, default_ner())],
 )
 def test_add_label_get_label(pipe_cls, n_moves, model):
     """Test that added labels are returned correctly. This test was added to

@@ -212,7 +212,8 @@ def test_empty_ner():
     nlp.begin_training()
     doc = nlp("John is watching the news about Croatia's elections")
     # if this goes wrong, the initialization of the parser's upper layer is probably broken
-    assert [token.ent_iob_ for token in doc] == ['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']
+    result = ["O", "O", "O", "O", "O", "O", "O", "O", "O"]
+    assert [token.ent_iob_ for token in doc] == result


 def test_ruler_before_ner():

@@ -237,7 +237,7 @@ def test_issue1889(word):
     assert is_stop(word, STOP_WORDS) == is_stop(word.upper(), STOP_WORDS)


-@pytest.mark.skip(reason="This test has become obsolete with the config refactor of v.3")
+@pytest.mark.skip(reason="obsolete with the config refactor of v.3")
 def test_issue1915():
     cfg = {"hidden_depth": 2}  # should error out
     nlp = Language()

@@ -58,10 +58,22 @@ subword_features = false

 @registry.architectures.register("my_test_parser")
 def my_parser():
-    tok2vec = build_Tok2Vec_model(width=321, embed_size=5432, pretrained_vectors=None, window_size=3,
-                                  maxout_pieces=4, subword_features=True, char_embed=True, nM=64, nC=8,
-                                  conv_depth=2, bilstm_depth=0)
-    parser = build_tb_parser_model(tok2vec=tok2vec, nr_feature_tokens=7, hidden_width=65, maxout_pieces=5)
+    tok2vec = build_Tok2Vec_model(
+        width=321,
+        embed_size=5432,
+        pretrained_vectors=None,
+        window_size=3,
+        maxout_pieces=4,
+        subword_features=True,
+        char_embed=True,
+        nM=64,
+        nC=8,
+        conv_depth=2,
+        bilstm_depth=0,
+    )
+    parser = build_tb_parser_model(
+        tok2vec=tok2vec, nr_feature_tokens=7, hidden_width=65, maxout_pieces=5
+    )
     return parser


@@ -88,7 +100,7 @@ def test_serialize_custom_nlp():
     """ Create a custom nlp pipeline and ensure it serializes it correctly"""
     nlp = English()
     parser_cfg = dict()
-    parser_cfg["model"] = {'@architectures': "my_test_parser"}
+    parser_cfg["model"] = {"@architectures": "my_test_parser"}
     parser = nlp.create_pipe("parser", parser_cfg)
     nlp.add_pipe(parser)
     nlp.begin_training()

@@ -1,7 +1,8 @@
 import pytest
 from spacy.pipeline import Tagger, DependencyParser, EntityRecognizer
 from spacy.pipeline import Tensorizer, TextCategorizer, SentenceRecognizer
-from spacy.ml.models.defaults import default_parser, default_tensorizer, default_tagger, default_textcat, default_sentrec
+from spacy.ml.models.defaults import default_parser, default_tensorizer, default_tagger
+from spacy.ml.models.defaults import default_textcat, default_sentrec

 from ..util import make_tempdir

@@ -114,7 +115,9 @@ def test_serialize_tensorizer_roundtrip_disk(en_vocab):

 def test_serialize_textcat_empty(en_vocab):
     # See issue #1105
-    textcat = TextCategorizer(en_vocab, default_textcat(), labels=["ENTITY", "ACTION", "MODIFIER"])
+    textcat = TextCategorizer(
+        en_vocab, default_textcat(), labels=["ENTITY", "ACTION", "MODIFIER"]
+    )
     textcat.to_bytes(exclude=["vocab"])

@@ -67,8 +67,8 @@ def test_tok2vec_batch_sizes(batch_size, width, embed_size):
         {"width": 8, "embed_size": 100, "char_embed": False, "nM": 64, "nC": 8, "pretrained_vectors": None, "window_size": 1, "conv_depth": 6, "bilstm_depth": 0, "maxout_pieces": 3, "subword_features": True},
         {"width": 8, "embed_size": 100, "char_embed": False, "nM": 64, "nC": 8, "pretrained_vectors": None, "window_size": 1, "conv_depth": 2, "bilstm_depth": 0, "maxout_pieces": 3, "subword_features": False},
         {"width": 8, "embed_size": 100, "char_embed": False, "nM": 64, "nC": 8, "pretrained_vectors": None, "window_size": 3, "conv_depth": 2, "bilstm_depth": 0, "maxout_pieces": 3, "subword_features": False},
-        {"width": 8, "embed_size": 100, "char_embed": True, "nM": 81, "nC": 8, "pretrained_vectors": None, "window_size": 3, "conv_depth": 2, "bilstm_depth": 0, "maxout_pieces": 3, "subword_features": False},
-        {"width": 8, "embed_size": 100, "char_embed": True, "nM": 81, "nC": 9, "pretrained_vectors": None, "window_size": 3, "conv_depth": 2, "bilstm_depth": 0, "maxout_pieces": 3, "subword_features": False},
+        {"width": 8, "embed_size": 100, "char_embed": True, "nM": 81, "nC": 8, "pretrained_vectors": None, "window_size": 3, "conv_depth": 2, "bilstm_depth": 0, "maxout_pieces": 3, "subword_features": False},
+        {"width": 8, "embed_size": 100, "char_embed": True, "nM": 81, "nC": 9, "pretrained_vectors": None, "window_size": 3, "conv_depth": 2, "bilstm_depth": 0, "maxout_pieces": 3, "subword_features": False},
     ],
 )
 # fmt: on