Mirror of https://github.com/explosion/spaCy.git (synced 2024-11-14 05:37:03 +03:00)

Tidy up and auto-format

This commit is contained in:
parent 06f0a8daa0
commit 5da3ad682a
@@ -120,7 +120,7 @@ def pretrain(
             window_size=1,
             char_embed=False,
             nM=64,
-            nC=8
+            nC=8,
         ),
     )
     # Load in pretrained weights
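The only change in this hunk is the trailing comma after nC=8. Under black's style, a trailing comma on the last argument of a multi-line call means that appending a new keyword later touches only one line of the file. A runnable toy sketch of the idea (the function is a stand-in, not spaCy code):

    def build(nM=64, nC=8):
        return (nM, nC)

    model = build(
        nM=64,
        nC=8,  # trailing comma: adding another kwarg later is a one-line diff
    )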
@@ -9,7 +9,7 @@ from wasabi import msg
 import contextlib
 import random

-from ..util import create_default_optimizer, registry
+from ..util import create_default_optimizer
 from ..util import use_gpu as set_gpu
 from ..attrs import PROB, IS_OOV, CLUSTER, LANG
 from ..gold import GoldCorpus
@@ -161,7 +161,10 @@ def train(
            raise ValueError(f"Component {pipe} currently not supported.")
        pipe_cfg = util.load_config(config_loc, create_objects=False)
        if vectors:
-            pretrained_config = {'@architectures': 'spacy.VocabVectors.v1', 'name': vectors}
+            pretrained_config = {
+                "@architectures": "spacy.VocabVectors.v1",
+                "name": vectors,
+            }
            pipe_cfg["model"]["tok2vec"]["pretrained_vectors"] = pretrained_config

        if pipe == "parser":
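For context: in spaCy's config system, a dict whose "@architectures" key names a registered function gets replaced by that function's return value, with the remaining keys passed as arguments. A minimal sketch of how a block like the one above resolves, using the same registry.make_from_config call that appears in the Language hunk further down (the vector-package name here is an assumption for illustration only):

    from spacy.util import registry

    pretrained_config = {
        "@architectures": "spacy.VocabVectors.v1",
        "name": "en_vectors_web_lg",  # assumed vectors package, for illustration
    }
    # Looks up "spacy.VocabVectors.v1" in the architectures registry and calls
    # it with name="en_vectors_web_lg"; the result replaces the config block.
    filled = registry.make_from_config({"vectors": pretrained_config}, validate=True)
    vectors = filled["vectors"]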
@@ -222,7 +225,10 @@ def train(
            raise ValueError(f"Component {pipe} currently not supported.")
        pipe_cfg = util.load_config(config_loc, create_objects=False)
        if vectors:
-            pretrained_config = {'@architectures': 'spacy.VocabVectors.v1', 'name': vectors}
+            pretrained_config = {
+                "@architectures": "spacy.VocabVectors.v1",
+                "name": vectors,
+            }
            pipe_cfg["model"]["tok2vec"]["pretrained_vectors"] = pretrained_config

        if pipe == "parser":
@@ -1,10 +1,8 @@
 from typing import Optional, Dict, List, Union, Sequence
-from pydantic import BaseModel, FilePath, StrictInt
-
+from pydantic import BaseModel, FilePath
 import plac
 import tqdm
 from pathlib import Path
-
 from wasabi import msg
 import thinc
 import thinc.schedules
@@ -130,7 +130,13 @@ class Language(object):
     factories = {"tokenizer": lambda nlp: nlp.Defaults.create_tokenizer(nlp)}

     def __init__(
-        self, vocab=True, make_doc=True, max_length=10 ** 6, meta={}, config=None, **kwargs
+        self,
+        vocab=True,
+        make_doc=True,
+        max_length=10 ** 6,
+        meta={},
+        config=None,
+        **kwargs,
     ):
         """Initialise a Language object.

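One thing the auto-formatter deliberately does not touch: meta={} remains a mutable default argument, shared across every call that omits meta; the reformat only splits the signature across lines. The usual defensive pattern, shown as a general-Python sketch rather than a proposed spaCy change:

    class Example:
        def __init__(self, meta=None):
            # A fresh dict per instance; a shared `meta={}` default would be
            # mutated in place by every caller that relies on the default.
            self.meta = meta if meta is not None else {}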
@@ -176,11 +182,20 @@ class Language(object):
         self.max_length = max_length
         self._optimizer = None

-        from .ml.models.defaults import default_tagger_config, default_parser_config, default_ner_config, \
-            default_textcat_config, default_nel_config, default_morphologizer_config, default_sentrec_config, \
-            default_tensorizer_config, default_tok2vec_config
+        from .ml.models.defaults import (
+            default_tagger_config,
+            default_parser_config,
+            default_ner_config,
+            default_textcat_config,
+            default_nel_config,
+            default_morphologizer_config,
+            default_sentrec_config,
+            default_tensorizer_config,
+            default_tok2vec_config,
+        )

-        self.defaults = {"tagger": default_tagger_config(),
+        self.defaults = {
+            "tagger": default_tagger_config(),
             "parser": default_parser_config(),
             "ner": default_ner_config(),
             "textcat": default_textcat_config(),
@@ -329,12 +344,14 @@ class Language(object):
             model_cfg = None
             del config["model"]
         if model_cfg is None and default_config is not None:
-            user_warning(Warnings.W098)
+            user_warning(Warnings.W098.format(name=name))
             model_cfg = default_config["model"]
         model = None
         if model_cfg is not None:
             self.config[name] = {"model": model_cfg}
-            model = registry.make_from_config({"model": model_cfg}, validate=True)["model"]
+            model = registry.make_from_config({"model": model_cfg}, validate=True)[
+                "model"
+            ]
         return factory(self, model, **config)

     def add_pipe(
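The user_warning change in this hunk is a real fix riding along with the formatting: spaCy's warning codes are template strings, so passing Warnings.W098 without .format(name=name) would emit the raw template with a literal {name} placeholder. A sketch of the mechanism (the W098 wording below is assumed, not quoted from the source):

    W098 = "No model config found for pipe '{name}', using default."  # assumed text

    # Before the fix: the warning would contain the literal "{name}".
    # After the fix: the component name is interpolated.
    print(W098.format(name="tagger"))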
@@ -1,6 +1,6 @@
-from .entity_linker import *
-from .parser import *
-from .tagger import *
-from .tensorizer import *
-from .textcat import *
-from .tok2vec import *
+from .entity_linker import *  # noqa
+from .parser import *  # noqa
+from .tagger import *  # noqa
+from .tensorizer import *  # noqa
+from .textcat import *  # noqa
+from .tok2vec import *  # noqa
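The # noqa markers keep flake8 from flagging these deliberate re-export star imports (F403 for the star import, F401 for the apparently unused names it pulls in). A hypothetical alternative that avoids the linter exemption is an explicit re-export list; the symbol name below is made up for illustration:

    from .tagger import build_tagger_model  # hypothetical symbol name

    __all__ = ["build_tagger_model"]  # explicit re-export, no noqa needed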
@@ -1,9 +1,7 @@
-from pathlib import Path
-
 from thinc.api import chain, clone, list2ragged, reduce_mean, residual
 from thinc.api import Model, Maxout, Linear

-from spacy.util import registry
+from ...util import registry


 @registry.architectures.register("spacy.EntityLinker.v1")
@@ -1,11 +1,10 @@
 from pydantic import StrictInt

-from spacy.util import registry
-from spacy.ml._layers import PrecomputableAffine
-from spacy.syntax._parser_model import ParserModel
-
 from thinc.api import Model, chain, list2array, Linear, zero_init, use_ops

+from ...util import registry
+from .._layers import PrecomputableAffine
+from ...syntax._parser_model import ParserModel

 @registry.architectures.register("spacy.TransitionBasedParser.v1")
 def build_tb_parser_model(
@@ -1,6 +1,6 @@
 from thinc.api import zero_init, with_array, Softmax, chain, Model

-from spacy.util import registry
+from ...util import registry


 @registry.architectures.register("spacy.Tagger.v1")
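This hunk, like the entity-linker and parser hunks above it, swaps absolute self-imports for package-relative ones. Both forms resolve to the same module when the package is installed as spacy; the relative form also keeps working if the package is vendored or imported under another name. The correspondence, as a sketch of the module context (not standalone-runnable):

    # Inside spacy/ml/models/tagger.py:
    from spacy.util import registry  # absolute: needs the package importable as "spacy"
    from ...util import registry     # relative: models -> ml -> spacy, then util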
@@ -1,8 +1,9 @@
-from spacy.attrs import ORTH
-from spacy.util import registry
-from spacy.ml.extract_ngrams import extract_ngrams
+from thinc.api import Model, chain, reduce_mean, Linear, list2ragged, Logistic
+from thinc.api import SparseLinear, Softmax

-from thinc.api import Model, chain, reduce_mean, Linear, list2ragged, Logistic, SparseLinear, Softmax
+from ...attrs import ORTH
+from ...util import registry
+from ..extract_ngrams import extract_ngrams


 @registry.architectures.register("spacy.TextCatCNN.v1")
@@ -21,7 +22,9 @@ def build_simple_cnn_text_classifier(tok2vec, exclusive_classes, nO=None):
     else:
         # TODO: experiment with init_w=zero_init
         linear_layer = Linear(nO=nO, nI=tok2vec.get_dim("nO"))
-        model = tok2vec >> list2ragged() >> reduce_mean() >> linear_layer >> Logistic()
+        model = (
+            tok2vec >> list2ragged() >> reduce_mean() >> linear_layer >> Logistic()
+        )
     model.set_ref("output_layer", linear_layer)
     model.set_ref("tok2vec", tok2vec)
     model.set_dim("nO", nO)
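The added parentheses are pure formatting; the >> operators are thinc's combinator sugar. When operator bindings are in effect (thinc lets code bind them, e.g. with Model.define_operators({">>": chain})), a >> b builds the same pipeline as chain(a, b), so the reformatted line is equivalent to this spelled-out sketch:

    from thinc.api import chain

    model = chain(tok2vec, list2ragged(), reduce_mean(), linear_layer, Logistic())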
@@ -119,7 +119,7 @@ def hash_embed_bilstm_v1(


 @registry.architectures.register("spacy.HashCharEmbedBiLSTM.v1")
-def hash_embed_bilstm_v1(
+def hash_char_embed_bilstm_v1(
     pretrained_vectors, width, depth, embed_size, subword_features, nM=0, nC=0
 ):
     # Allows using character embeddings by setting nC, nM and char_embed=True
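The rename here fixes a duplicate definition: the file declared two functions named hash_embed_bilstm_v1, so the second def silently rebound the module-level name, while both stayed reachable through the registry. The registered string names are untouched, so existing configs keep working. A self-contained illustration of the shadowing behaviour (toy registry, not spaCy code):

    registry = {}

    def register(name):
        def deco(fn):
            registry[name] = fn
            return fn
        return deco

    @register("embed.v1")
    def build():
        return "word embedding"

    @register("char_embed.v1")
    def build():  # rebinds the module-level name "build"
        return "char embedding"

    assert registry["embed.v1"]() == "word embedding"  # first fn: registry only
    assert build() == "char embedding"                 # module name: second fn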
@@ -10,7 +10,6 @@ from ..util import link_vectors_to_models, minibatch, eg2doc

 @component("tok2vec", assigns=["doc.tensor"])
 class Tok2Vec(Pipe):
-
     @classmethod
     def from_nlp(cls, nlp, model, **cfg):
         return cls(nlp.vocab, model, **cfg)
@@ -73,7 +73,8 @@ def test_add_label_deserializes_correctly():


 @pytest.mark.parametrize(
-    "pipe_cls,n_moves,model", [(DependencyParser, 5, default_parser()), (EntityRecognizer, 4, default_ner())]
+    "pipe_cls,n_moves,model",
+    [(DependencyParser, 5, default_parser()), (EntityRecognizer, 4, default_ner())],
 )
 def test_add_label_get_label(pipe_cls, n_moves, model):
     """Test that added labels are returned correctly. This test was added to
@@ -212,7 +212,8 @@ def test_empty_ner():
     nlp.begin_training()
     doc = nlp("John is watching the news about Croatia's elections")
     # if this goes wrong, the initialization of the parser's upper layer is probably broken
-    assert [token.ent_iob_ for token in doc] == ['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']
+    result = ["O", "O", "O", "O", "O", "O", "O", "O", "O"]
+    assert [token.ent_iob_ for token in doc] == result


 def test_ruler_before_ner():
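For reference, token.ent_iob_ is the IOB entity tag: "B" begins an entity, "I" continues one, and "O" means outside any entity. A pipeline whose NER predicts no entities should therefore tag every token "O", which is exactly what the extracted result list above asserts. A minimal probe of the attribute (assuming an nlp pipeline as constructed in the test):

    doc = nlp("John is watching the news")
    print([t.ent_iob_ for t in doc])  # all "O" when no entities are predicted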
@@ -237,7 +237,7 @@ def test_issue1889(word):
     assert is_stop(word, STOP_WORDS) == is_stop(word.upper(), STOP_WORDS)


-@pytest.mark.skip(reason="This test has become obsolete with the config refactor of v.3")
+@pytest.mark.skip(reason="obsolete with the config refactor of v.3")
 def test_issue1915():
     cfg = {"hidden_depth": 2}  # should error out
     nlp = Language()
@@ -58,10 +58,22 @@ subword_features = false


 @registry.architectures.register("my_test_parser")
 def my_parser():
-    tok2vec = build_Tok2Vec_model(width=321, embed_size=5432, pretrained_vectors=None, window_size=3,
-                                  maxout_pieces=4, subword_features=True, char_embed=True, nM=64, nC=8,
-                                  conv_depth=2, bilstm_depth=0)
-    parser = build_tb_parser_model(tok2vec=tok2vec, nr_feature_tokens=7, hidden_width=65, maxout_pieces=5)
+    tok2vec = build_Tok2Vec_model(
+        width=321,
+        embed_size=5432,
+        pretrained_vectors=None,
+        window_size=3,
+        maxout_pieces=4,
+        subword_features=True,
+        char_embed=True,
+        nM=64,
+        nC=8,
+        conv_depth=2,
+        bilstm_depth=0,
+    )
+    parser = build_tb_parser_model(
+        tok2vec=tok2vec, nr_feature_tokens=7, hidden_width=65, maxout_pieces=5
+    )
     return parser

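Configs refer to the registered string ("my_test_parser"), not the Python symbol my_parser, so the reformat changes nothing about lookup. The serialization test in the next hunk exercises exactly this, wiring the registered architecture into a pipe config:

    parser_cfg = {"model": {"@architectures": "my_test_parser"}}
    parser = nlp.create_pipe("parser", parser_cfg)  # resolves the registry entry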
@@ -88,7 +100,7 @@ def test_serialize_custom_nlp():
     """ Create a custom nlp pipeline and ensure it serializes it correctly"""
     nlp = English()
     parser_cfg = dict()
-    parser_cfg["model"] = {'@architectures': "my_test_parser"}
+    parser_cfg["model"] = {"@architectures": "my_test_parser"}
     parser = nlp.create_pipe("parser", parser_cfg)
     nlp.add_pipe(parser)
     nlp.begin_training()
@@ -1,7 +1,8 @@
 import pytest
 from spacy.pipeline import Tagger, DependencyParser, EntityRecognizer
 from spacy.pipeline import Tensorizer, TextCategorizer, SentenceRecognizer
-from spacy.ml.models.defaults import default_parser, default_tensorizer, default_tagger, default_textcat, default_sentrec
+from spacy.ml.models.defaults import default_parser, default_tensorizer, default_tagger
+from spacy.ml.models.defaults import default_textcat, default_sentrec

 from ..util import make_tempdir
@@ -114,7 +115,9 @@ def test_serialize_tensorizer_roundtrip_disk(en_vocab):


 def test_serialize_textcat_empty(en_vocab):
     # See issue #1105
-    textcat = TextCategorizer(en_vocab, default_textcat(), labels=["ENTITY", "ACTION", "MODIFIER"])
+    textcat = TextCategorizer(
+        en_vocab, default_textcat(), labels=["ENTITY", "ACTION", "MODIFIER"]
+    )
     textcat.to_bytes(exclude=["vocab"])
